Commit | Line | Data |
---|---|---|
47f3ce52 AW |
1 | ;;;; (texinfo) -- parsing of texinfo into SXML |
2 | ;;;; | |
31d59769 | 3 | ;;;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. |
47f3ce52 AW |
4 | ;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com> |
5 | ;;;; Copyright (C) 2001,2002 Oleg Kiselyov <oleg at pobox dot com> | |
6 | ;;;; | |
7 | ;;;; This file is based on SSAX's SSAX.scm. | |
8 | ;;;; | |
9 | ;;;; This library is free software; you can redistribute it and/or | |
10 | ;;;; modify it under the terms of the GNU Lesser General Public | |
11 | ;;;; License as published by the Free Software Foundation; either | |
12 | ;;;; version 3 of the License, or (at your option) any later version. | |
13 | ;;;; | |
14 | ;;;; This library is distributed in the hope that it will be useful, | |
15 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | ;;;; Lesser General Public License for more details. | |
18 | ;;;; | |
19 | ;;;; You should have received a copy of the GNU Lesser General Public | |
20 | ;;;; License along with this library; if not, write to the Free Software | |
21 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | \f | |
23 | ;;; Commentary: | |
24 | ;; | |
25 | ;; @subheading Texinfo processing in scheme | |
26 | ;; | |
27 | ;; This module parses texinfo into SXML. TeX will always be the | |
28 | ;; processor of choice for print output, of course. However, although | |
29 | ;; @code{makeinfo} works well for info, its output in other formats is | |
30 | ;; not very customizable, and the program is not extensible as a whole. | |
31 | ;; This module aims to provide an extensible framework for texinfo | |
32 | ;; processing that integrates texinfo into the constellation of SXML | |
33 | ;; processing tools. | |
34 | ;; | |
35 | ;; @subheading Notes on the SXML vocabulary | |
36 | ;; | |
37 | ;; Consider the following texinfo fragment: | |
38 | ;; | |
39 | ;;@example | |
40 | ;; @@deffn Primitive set-car! pair value | |
41 | ;; This function... | |
42 | ;; @@end deffn | |
43 | ;;@end example | |
44 | ;; | |
45 | ;; Logically, the category (Primitive), name (set-car!), and arguments | |
46 | ;; (pair value) are ``attributes'' of the deffn, with the description as | |
47 | ;; the content. However, texinfo allows for @@-commands within the | |
48 | ;; arguments to an environment, like @code{@@deffn}, which means that | |
49 | ;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand, | |
50 | ;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are | |
51 | ;; called ``arguments'', and are grouped under the special element, `%'. | |
52 | ;; | |
53 | ;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In | |
54 | ;; the interests of interoperability, this module provides a conversion | |
55 | ;; function to replace the `%' with `texinfo-arguments'. | |
56 | ;; | |
57 | ;;; Code: | |
58 | ||
59 | ;; Comparison to xml output of texinfo (which is rather undocumented): | |
60 | ;; Doesn't conform to texinfo dtd | |
61 | ;; No DTD at all, in fact :-/ | |
62 | ;; Actually outputs valid xml, after transforming % | |
63 | ;; Slower (although with caching the SXML that problem can go away) | |
64 | ;; Doesn't parse menus (although menus are shite) | |
65 | ;; Args go in a dedicated element, FBOFW | |
66 | ;; Definitions are handled a lot better | |
67 | ;; Does parse comments | |
68 | ;; Outputs only significant line breaks (a biggie!) | |
69 | ;; Nodes are treated as anchors, rather than content organizers (a biggie) | |
70 | ;; (more book-like, less info-like) | |
71 | ||
72 | ;; TODO | |
73 | ;; Integration: help, indexing, plain text | |
74 | ||
75 | (define-module (texinfo) | |
76 | #:use-module (sxml simple) | |
77 | #:use-module (sxml transform) | |
78 | #:use-module (sxml ssax input-parse) | |
79 | #:use-module (srfi srfi-1) | |
0c65f52c | 80 | #:use-module (srfi srfi-11) |
47f3ce52 AW |
81 | #:use-module (srfi srfi-13) |
82 | #:export (call-with-file-and-dir | |
83 | texi-command-specs | |
84 | texi-command-depth | |
85 | texi-fragment->stexi | |
86 | texi->stexi | |
87 | stexi->sxml)) | |
88 | ||
89 | ;; Some utilities | |
90 | ||
91 | (define (parser-error port message . rest) | |
05c29c5a | 92 | (apply throw 'parser-error port message rest)) |
47f3ce52 AW |
93 | |
94 | (define (call-with-file-and-dir filename proc) | |
95 | "Call the one-argument procedure @var{proc} with an input port that | |
96 | reads from @var{filename}. During the dynamic extent of @var{proc}'s | |
97 | execution, the current directory will be @code{(dirname | |
98 | @var{filename})}. This is useful for parsing documents that can include | |
99 | files by relative path name." | |
100 | (let ((current-dir (getcwd))) | |
101 | (dynamic-wind | |
102 | (lambda () (chdir (dirname filename))) | |
103 | (lambda () | |
104 | (call-with-input-file (basename filename) proc)) | |
105 | (lambda () (chdir current-dir))))) | |
106 | ||
47f3ce52 AW |
107 | ;;======================================================================== |
108 | ;; Reflection on the XML vocabulary | |
109 | ||
110 | (define texi-command-specs | |
111 | #; | |
112 | "A list of (@var{name} @var{content-model} . @var{args}) | |
113 | ||
114 | @table @var | |
115 | @item name | |
116 | The name of an @@-command, as a symbol. | |
117 | ||
118 | @item content-model | |
119 | A symbol indicating the syntactic type of the @@-command: | |
120 | @table @code | |
121 | @item EMPTY-COMMAND | |
122 | No content, and no @code{@@end} is coming | |
123 | @item EOL-ARGS | |
124 | Unparsed arguments until end of line | |
125 | @item EOL-TEXT | |
126 | Parsed arguments until end of line | |
127 | @item INLINE-ARGS | |
128 | Unparsed arguments ending with @code{#\\@}} | |
129 | @item INLINE-TEXT | |
130 | Parsed arguments ending with @code{#\\@}} | |
131 | @item ENVIRON | |
132 | The tag is an environment tag, expect @code{@@end foo}. | |
133 | @item TABLE-ENVIRON | |
134 | Like ENVIRON, but with special parsing rules for its arguments. | |
135 | @item FRAGMENT | |
136 | For @code{*fragment*}, the command used for parsing fragments of | |
137 | texinfo documents. | |
138 | @end table | |
139 | ||
140 | @code{INLINE-TEXT} commands will receive their arguments within their | |
141 | bodies, whereas the @code{-ARGS} commands will receive them in their | |
142 | attribute list. | |
143 | ||
144 | @code{EOF-TEXT} receives its arguments in its body. | |
145 | ||
146 | @code{ENVIRON} commands have both: parsed arguments until the end of | |
147 | line, received through their attribute list, and parsed text until the | |
148 | @code{@@end}, received in their bodies. | |
149 | ||
150 | @code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in | |
151 | @code{ENVIRON}. | |
152 | ||
31d59769 AW |
153 | In addition, @code{ALIAS} can alias one command to another. The alias |
154 | will never be seen in parsed stexinfo. | |
155 | ||
47f3ce52 AW |
156 | There are four @@-commands that are treated specially. @code{@@include} |
157 | is a low-level token that will not be seen by higher-level parsers, so | |
158 | it has no content-model. @code{@@para} is the paragraph command, which | |
159 | is only implicit in the texinfo source. @code{@@item} has special | |
160 | syntax, as noted above, and @code{@@entry} is how this parser treats | |
161 | @code{@@item} commands within @code{@@table}, @code{@@ftable}, and | |
162 | @code{@@vtable}. | |
163 | ||
164 | Also, indexing commands (@code{@@cindex}, etc.) are treated specially. | |
165 | Their arguments are parsed, but they are needed before entering the | |
166 | element so that an anchor can be inserted into the text before the index | |
167 | entry. | |
168 | ||
169 | @item args | |
170 | Named arguments to the command, in the same format as the formals for a | |
171 | lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS}, | |
172 | @code{ENVIRON}, @code{TABLE-ENVIRON} commands. | |
173 | @end table" | |
174 | '(;; Special commands | |
175 | (include #f) ;; this is a low-level token | |
176 | (para PARAGRAPH) | |
177 | (item ITEM) | |
178 | (entry ENTRY . heading) | |
179 | (noindent EMPTY-COMMAND) | |
180 | (*fragment* FRAGMENT) | |
181 | ||
182 | ;; Inline text commands | |
183 | (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant | |
184 | (bold INLINE-TEXT) | |
185 | (sample INLINE-TEXT) | |
186 | (samp INLINE-TEXT) | |
187 | (code INLINE-TEXT) | |
188 | (kbd INLINE-TEXT) | |
189 | (key INLINE-TEXT) | |
190 | (var INLINE-TEXT) | |
191 | (env INLINE-TEXT) | |
192 | (file INLINE-TEXT) | |
193 | (command INLINE-TEXT) | |
194 | (option INLINE-TEXT) | |
195 | (dfn INLINE-TEXT) | |
196 | (cite INLINE-TEXT) | |
197 | (acro INLINE-TEXT) | |
47f3ce52 AW |
198 | (email INLINE-TEXT) |
199 | (emph INLINE-TEXT) | |
200 | (strong INLINE-TEXT) | |
201 | (sample INLINE-TEXT) | |
202 | (sc INLINE-TEXT) | |
203 | (titlefont INLINE-TEXT) | |
204 | (asis INLINE-TEXT) | |
205 | (b INLINE-TEXT) | |
206 | (i INLINE-TEXT) | |
207 | (r INLINE-TEXT) | |
208 | (sansserif INLINE-TEXT) | |
209 | (slanted INLINE-TEXT) | |
210 | (t INLINE-TEXT) | |
211 | ||
212 | ;; Inline args commands | |
213 | (value INLINE-ARGS . (key)) | |
214 | (ref INLINE-ARGS . (node #:opt name section info-file manual)) | |
215 | (xref INLINE-ARGS . (node #:opt name section info-file manual)) | |
216 | (pxref INLINE-ARGS . (node #:opt name section info-file manual)) | |
31d59769 | 217 | (url ALIAS . uref) |
47f3ce52 AW |
218 | (uref INLINE-ARGS . (url #:opt title replacement)) |
219 | (anchor INLINE-ARGS . (name)) | |
220 | (dots INLINE-ARGS . ()) | |
221 | (result INLINE-ARGS . ()) | |
222 | (bullet INLINE-ARGS . ()) | |
223 | (copyright INLINE-ARGS . ()) | |
224 | (tie INLINE-ARGS . ()) | |
225 | (image INLINE-ARGS . (file #:opt width height alt-text extension)) | |
226 | ||
227 | ;; EOL args elements | |
228 | (node EOL-ARGS . (name #:opt next previous up)) | |
229 | (c EOL-ARGS . all) | |
230 | (comment EOL-ARGS . all) | |
231 | (setchapternewpage EOL-ARGS . all) | |
232 | (sp EOL-ARGS . all) | |
233 | (page EOL-ARGS . ()) | |
234 | (vskip EOL-ARGS . all) | |
235 | (syncodeindex EOL-ARGS . all) | |
236 | (contents EOL-ARGS . ()) | |
237 | (shortcontents EOL-ARGS . ()) | |
238 | (summarycontents EOL-ARGS . ()) | |
239 | (insertcopying EOL-ARGS . ()) | |
240 | (dircategory EOL-ARGS . (category)) | |
241 | (top EOL-ARGS . (title)) | |
242 | (printindex EOL-ARGS . (type)) | |
406524ea | 243 | (paragraphindent EOL-ARGS . (indent)) |
47f3ce52 AW |
244 | |
245 | ;; EOL text commands | |
246 | (*ENVIRON-ARGS* EOL-TEXT) | |
247 | (itemx EOL-TEXT) | |
248 | (set EOL-TEXT) | |
249 | (center EOL-TEXT) | |
250 | (title EOL-TEXT) | |
251 | (subtitle EOL-TEXT) | |
252 | (author EOL-TEXT) | |
253 | (chapter EOL-TEXT) | |
254 | (section EOL-TEXT) | |
255 | (appendix EOL-TEXT) | |
256 | (appendixsec EOL-TEXT) | |
257 | (unnumbered EOL-TEXT) | |
258 | (unnumberedsec EOL-TEXT) | |
259 | (subsection EOL-TEXT) | |
260 | (subsubsection EOL-TEXT) | |
261 | (appendixsubsec EOL-TEXT) | |
262 | (appendixsubsubsec EOL-TEXT) | |
263 | (unnumberedsubsec EOL-TEXT) | |
264 | (unnumberedsubsubsec EOL-TEXT) | |
265 | (chapheading EOL-TEXT) | |
266 | (majorheading EOL-TEXT) | |
267 | (heading EOL-TEXT) | |
268 | (subheading EOL-TEXT) | |
269 | (subsubheading EOL-TEXT) | |
270 | ||
271 | (deftpx EOL-TEXT-ARGS . (category name . attributes)) | |
272 | (defcvx EOL-TEXT-ARGS . (category class name)) | |
273 | (defivarx EOL-TEXT-ARGS . (class name)) | |
274 | (deftypeivarx EOL-TEXT-ARGS . (class data-type name)) | |
275 | (defopx EOL-TEXT-ARGS . (category class name . arguments)) | |
276 | (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments)) | |
277 | (defmethodx EOL-TEXT-ARGS . (class name . arguments)) | |
278 | (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments)) | |
279 | (defoptx EOL-TEXT-ARGS . (name)) | |
280 | (defvrx EOL-TEXT-ARGS . (category name)) | |
281 | (defvarx EOL-TEXT-ARGS . (name)) | |
282 | (deftypevrx EOL-TEXT-ARGS . (category data-type name)) | |
283 | (deftypevarx EOL-TEXT-ARGS . (data-type name)) | |
284 | (deffnx EOL-TEXT-ARGS . (category name . arguments)) | |
285 | (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments)) | |
286 | (defspecx EOL-TEXT-ARGS . (name . arguments)) | |
287 | (defmacx EOL-TEXT-ARGS . (name . arguments)) | |
288 | (defunx EOL-TEXT-ARGS . (name . arguments)) | |
289 | (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments)) | |
290 | ||
291 | ;; Indexing commands | |
292 | (cindex INDEX . entry) | |
293 | (findex INDEX . entry) | |
294 | (vindex INDEX . entry) | |
295 | (kindex INDEX . entry) | |
296 | (pindex INDEX . entry) | |
297 | (tindex INDEX . entry) | |
298 | ||
299 | ;; Environment commands (those that need @end) | |
300 | (texinfo ENVIRON . title) | |
301 | (ignore ENVIRON . ()) | |
302 | (ifinfo ENVIRON . ()) | |
303 | (iftex ENVIRON . ()) | |
304 | (ifhtml ENVIRON . ()) | |
305 | (ifxml ENVIRON . ()) | |
306 | (ifplaintext ENVIRON . ()) | |
307 | (ifnotinfo ENVIRON . ()) | |
308 | (ifnottex ENVIRON . ()) | |
309 | (ifnothtml ENVIRON . ()) | |
310 | (ifnotxml ENVIRON . ()) | |
311 | (ifnotplaintext ENVIRON . ()) | |
312 | (titlepage ENVIRON . ()) | |
313 | (menu ENVIRON . ()) | |
314 | (direntry ENVIRON . ()) | |
315 | (copying ENVIRON . ()) | |
316 | (example ENVIRON . ()) | |
317 | (smallexample ENVIRON . ()) | |
318 | (display ENVIRON . ()) | |
319 | (smalldisplay ENVIRON . ()) | |
320 | (verbatim ENVIRON . ()) | |
321 | (format ENVIRON . ()) | |
322 | (smallformat ENVIRON . ()) | |
323 | (lisp ENVIRON . ()) | |
324 | (smalllisp ENVIRON . ()) | |
325 | (cartouche ENVIRON . ()) | |
326 | (quotation ENVIRON . ()) | |
327 | ||
328 | (deftp ENVIRON . (category name . attributes)) | |
329 | (defcv ENVIRON . (category class name)) | |
330 | (defivar ENVIRON . (class name)) | |
331 | (deftypeivar ENVIRON . (class data-type name)) | |
332 | (defop ENVIRON . (category class name . arguments)) | |
333 | (deftypeop ENVIRON . (category class data-type name . arguments)) | |
334 | (defmethod ENVIRON . (class name . arguments)) | |
335 | (deftypemethod ENVIRON . (class data-type name . arguments)) | |
336 | (defopt ENVIRON . (name)) | |
337 | (defvr ENVIRON . (category name)) | |
338 | (defvar ENVIRON . (name)) | |
339 | (deftypevr ENVIRON . (category data-type name)) | |
340 | (deftypevar ENVIRON . (data-type name)) | |
341 | (deffn ENVIRON . (category name . arguments)) | |
342 | (deftypefn ENVIRON . (category data-type name . arguments)) | |
343 | (defspec ENVIRON . (name . arguments)) | |
344 | (defmac ENVIRON . (name . arguments)) | |
345 | (defun ENVIRON . (name . arguments)) | |
346 | (deftypefun ENVIRON . (data-type name . arguments)) | |
347 | ||
348 | (table TABLE-ENVIRON . (formatter)) | |
349 | (itemize TABLE-ENVIRON . (formatter)) | |
350 | (enumerate TABLE-ENVIRON . (start)) | |
351 | (ftable TABLE-ENVIRON . (formatter)) | |
352 | (vtable TABLE-ENVIRON . (formatter)))) | |
353 | ||
354 | (define command-depths | |
355 | '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4) | |
356 | (top . 0) (unnumbered . 1) (unnumberedsec . 2) | |
357 | (unnumberedsubsec . 3) (unnumberedsubsubsec . 4) | |
358 | (appendix . 1) (appendixsec . 2) (appendixsection . 2) | |
359 | (appendixsubsec . 3) (appendixsubsubsec . 4))) | |
360 | (define (texi-command-depth command max-depth) | |
361 | "Given the texinfo command @var{command}, return its nesting level, or | |
362 | @code{#f} if it nests too deep for @var{max-depth}. | |
363 | ||
364 | Examples: | |
365 | @example | |
05c29c5a AW |
366 | (texi-command-depth 'chapter 4) @result{} 1 |
367 | (texi-command-depth 'top 4) @result{} 0 | |
368 | (texi-command-depth 'subsection 4) @result{} 3 | |
369 | (texi-command-depth 'appendixsubsec 4) @result{} 3 | |
370 | (texi-command-depth 'subsection 2) @result{} #f | |
47f3ce52 AW |
371 | @end example" |
372 | (let ((depth (and=> (assq command command-depths) cdr))) | |
373 | (and depth (<= depth max-depth) depth))) | |
374 | ||
375 | ;; The % is for arguments | |
376 | (define (space-significant? command) | |
377 | (memq command | |
378 | '(example smallexample verbatim lisp smalllisp menu %))) | |
379 | ||
380 | ;; Like a DTD for texinfo | |
381 | (define (command-spec command) | |
382 | (or (assq command texi-command-specs) | |
383 | (parser-error #f "Unknown command" command))) | |
384 | ||
385 | (define (inline-content? content) | |
386 | (or (eq? content 'INLINE-TEXT) (eq? content 'INLINE-ARGS))) | |
387 | ||
388 | ||
389 | ;;======================================================================== | |
390 | ;; Lower-level parsers and scanners | |
391 | ;; | |
392 | ;; They deal with primitive lexical units (Names, whitespaces, tags) and | |
393 | ;; with pieces of more generic productions. Most of these parsers must | |
394 | ;; be called in appropriate context. For example, complete-start-command | |
395 | ;; must be called only when the @-command start has been detected and | |
396 | ;; its name token has been read. | |
397 | ||
398 | ;; Test if a string is made of only whitespace | |
399 | ;; An empty string is considered made of whitespace as well | |
400 | (define (string-whitespace? str) | |
401 | (or (string-null? str) | |
402 | (string-every char-whitespace? str))) | |
403 | ||
404 | ;; Like read-text-line, but allows EOF. | |
405 | (define read-eof-breaks '(*eof* #\return #\newline)) | |
406 | (define (read-eof-line port) | |
407 | (if (eof-object? (peek-char port)) | |
408 | (peek-char port) | |
409 | (let* ((line (next-token '() read-eof-breaks | |
410 | "reading a line" port)) | |
411 | (c (read-char port))) ; must be either \n or \r or EOF | |
412 | (if (and (eq? c #\return) (eq? (peek-char port) #\newline)) | |
413 | (read-char port)) ; skip \n that follows \r | |
414 | line))) | |
415 | ||
47f3ce52 AW |
416 | (define (skip-whitespace port) |
417 | (skip-while '(#\space #\tab #\return #\newline) port)) | |
418 | ||
419 | (define (skip-horizontal-whitespace port) | |
420 | (skip-while '(#\space #\tab) port)) | |
421 | ||
422 | ;; command ::= Letter+ | |
423 | ||
424 | ;; procedure: read-command PORT | |
425 | ;; | |
426 | ;; Read a command starting from the current position in the PORT and | |
427 | ;; return it as a symbol. | |
428 | (define (read-command port) | |
429 | (let ((first-char (peek-char port))) | |
430 | (or (char-alphabetic? first-char) | |
431 | (parser-error port "Nonalphabetic @-command char: '" first-char "'"))) | |
432 | (string->symbol | |
433 | (next-token-of | |
434 | (lambda (c) | |
435 | (cond | |
436 | ((eof-object? c) #f) | |
437 | ((char-alphabetic? c) c) | |
438 | (else #f))) | |
439 | port))) | |
440 | ||
441 | ;; A token is a primitive lexical unit. It is a record with two fields, | |
442 | ;; token-head and token-kind. | |
443 | ;; | |
444 | ;; Token types: | |
445 | ;; END The end of a texinfo command. If the command is ended by }, | |
446 | ;; token-head will be #f. Otherwise if the command is ended by | |
447 | ;; @end COMMAND, token-head will be COMMAND. As a special case, | |
448 | ;; @bye is the end of a special @texinfo command. | |
449 | ;; START The start of a texinfo command. The token-head will be a | |
450 | ;; symbol of the @-command name. | |
451 | ;; INCLUDE An @include directive. The token-head will be empty -- the | |
452 | ;; caller is responsible for reading the include file name. | |
453 | ;; ITEM @item commands have an irregular syntax. They end at the | |
454 | ;; next @item, or at the end of the environment. For that | |
455 | ;; read-command-token treats them specially. | |
456 | ||
457 | (define (make-token kind head) (cons kind head)) | |
458 | (define token? pair?) | |
459 | (define token-kind car) | |
460 | (define token-head cdr) | |
461 | ||
462 | ;; procedure: read-command-token PORT | |
463 | ;; | |
464 | ;; This procedure starts parsing of a command token. The current | |
465 | ;; position in the stream must be #\@. This procedure scans enough of | |
466 | ;; the input stream to figure out what kind of a command token it is | |
467 | ;; seeing. The procedure returns a token structure describing the token. | |
468 | ||
469 | (define (read-command-token port) | |
470 | (assert-curr-char '(#\@) "start of the command" port) | |
471 | (let ((peeked (peek-char port))) | |
472 | (cond | |
473 | ((memq peeked '(#\! #\. #\? #\@ #\\ #\{ #\})) | |
474 | ;; @-commands that escape characters | |
475 | (make-token 'STRING (string (read-char port)))) | |
476 | (else | |
477 | (let ((name (read-command port))) | |
478 | (case name | |
479 | ((end) | |
480 | ;; got an ending tag | |
481 | (let ((command (string-trim-both | |
482 | (read-eof-line port)))) | |
483 | (or (and (not (string-null? command)) | |
484 | (string-every char-alphabetic? command)) | |
485 | (parser-error port "malformed @end" command)) | |
486 | (make-token 'END (string->symbol command)))) | |
487 | ((bye) | |
488 | ;; the end of the top | |
489 | (make-token 'END 'texinfo)) | |
490 | ((item) | |
491 | (make-token 'ITEM 'item)) | |
492 | ((include) | |
493 | (make-token 'INCLUDE #f)) | |
494 | (else | |
495 | (make-token 'START name)))))))) | |
496 | ||
497 | ;; procedure+: read-verbatim-body PORT STR-HANDLER SEED | |
498 | ;; | |
499 | ;; This procedure must be called after we have read a string | |
500 | ;; "@verbatim\n" that begins a verbatim section. The current position | |
501 | ;; must be the first position of the verbatim body. This function reads | |
502 | ;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a | |
503 | ;; character data consumer. | |
504 | ;; | |
505 | ;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED. | |
506 | ;; The first STRING1 argument to STR-HANDLER never contains a newline. | |
507 | ;; The second STRING2 argument often will. On the first invocation of the | |
508 | ;; STR-HANDLER, the seed is the one passed to read-verbatim-body | |
509 | ;; as the third argument. The result of this first invocation will be | |
510 | ;; passed as the seed argument to the second invocation of the line | |
511 | ;; consumer, and so on. The result of the last invocation of the | |
512 | ;; STR-HANDLER is returned by the read-verbatim-body. Note a | |
513 | ;; similarity to the fundamental 'fold' iterator. | |
514 | ;; | |
515 | ;; Within a verbatim section all characters are taken at their face | |
516 | ;; value. It ends with "\n@end verbatim(\r)?\n". | |
517 | ||
518 | ;; Must be called right after the newline after @verbatim. | |
519 | (define (read-verbatim-body port str-handler seed) | |
520 | (let loop ((seed seed)) | |
521 | (let ((fragment (next-token '() '(#\newline) | |
522 | "reading verbatim" port))) | |
523 | ;; We're reading the char after the 'fragment', which is | |
524 | ;; #\newline. | |
525 | (read-char port) | |
526 | (if (string=? fragment "@end verbatim") | |
527 | seed | |
528 | (loop (str-handler fragment "\n" seed)))))) | |
529 | ||
530 | ;; procedure+: read-arguments PORT | |
531 | ;; | |
532 | ;; This procedure reads and parses a production ArgumentList. | |
533 | ;; ArgumentList ::= S* Argument (S* , S* Argument)* S* | |
534 | ;; Argument ::= ([^@{},])* | |
535 | ;; | |
536 | ;; Arguments are the things in braces, i.e @ref{my node} has one | |
537 | ;; argument, "my node". Most commands taking braces actually don't have | |
538 | ;; arguments, they process text. For example, in | |
539 | ;; @emph{@strong{emphasized}}, the emph takes text, because the parse | |
540 | ;; continues into the braces. | |
541 | ;; | |
542 | ;; Any whitespace within Argument is replaced with a single space. | |
543 | ;; Whitespace around an Argument is trimmed. | |
544 | ;; | |
545 | ;; The procedure returns a list of arguments. Afterwards the current | |
546 | ;; character will be after the final #\}. | |
547 | ||
548 | (define (read-arguments port stop-char) | |
549 | (define (split str) | |
550 | (read-char port) ;; eat the delimiter | |
551 | (let ((ret (map (lambda (x) (if (string-null? x) #f x)) | |
552 | (map string-trim-both (string-split str #\,))))) | |
553 | (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret))) | |
554 | '() | |
555 | ret))) | |
556 | (split (next-token '() (list stop-char) | |
557 | "arguments of @-command" port))) | |
558 | ||
559 | ;; procedure+: complete-start-command COMMAND PORT | |
560 | ;; | |
561 | ;; This procedure is to complete parsing of an @-command. The procedure | |
562 | ;; must be called after the command token has been read. COMMAND is a | |
563 | ;; TAG-NAME. | |
564 | ;; | |
565 | ;; This procedure returns several values: | |
566 | ;; COMMAND: a symbol. | |
567 | ;; ARGUMENTS: command's arguments, as an alist. | |
568 | ;; CONTENT-MODEL: the content model of the command. | |
569 | ;; | |
570 | ;; On exit, the current position in PORT will depend on the CONTENT-MODEL. | |
571 | ;; | |
572 | ;; Content model Port position | |
573 | ;; ============= ============= | |
574 | ;; INLINE-TEXT One character after the #\{. | |
575 | ;; INLINE-ARGS The first character after the #\}. | |
576 | ;; EOL-TEXT The first non-whitespace character after the command. | |
577 | ;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT | |
578 | ;; The first character on the next line. | |
579 | ;; PARAGRAPH, ITEM, EMPTY-COMMAND | |
580 | ;; The first character after the command. | |
581 | ||
582 | (define (arguments->attlist port args arg-names) | |
583 | (let loop ((in args) (names arg-names) (opt? #f) (out '())) | |
584 | (cond | |
585 | ((symbol? names) ;; a rest arg | |
586 | (reverse (if (null? in) out (acons names in out)))) | |
587 | ((and (not (null? names)) (eq? (car names) #:opt)) | |
588 | (loop in (cdr names) #t out)) | |
589 | ((null? in) | |
590 | (if (or (null? names) opt?) | |
591 | (reverse out) | |
592 | (parser-error port "@-command expected more arguments:" | |
593 | args arg-names names))) | |
594 | ((null? names) | |
595 | (parser-error port "@-command didn't expect more arguments:" in)) | |
596 | ((not (car in)) | |
597 | (or (and opt? (loop (cdr in) (cdr names) opt? out)) | |
598 | (parser-error "@-command missing required argument" | |
599 | (car names)))) | |
600 | (else | |
601 | (loop (cdr in) (cdr names) opt? | |
602 | (cons (list (car names) (car in)) out)))))) | |
603 | ||
604 | (define (parse-table-args command port) | |
605 | (let* ((line (string-trim-both (read-text-line port))) | |
606 | (length (string-length line))) | |
607 | (define (get-formatter) | |
608 | (or (and (not (zero? length)) | |
609 | (eq? (string-ref line 0) #\@) | |
610 | (let ((f (string->symbol (substring line 1)))) | |
611 | (or (inline-content? (cadr (command-spec f))) | |
612 | (parser-error | |
613 | port "@item formatter must be INLINE" f)) | |
614 | f)) | |
05c29c5a | 615 | (parser-error port "Invalid @item formatter" line))) |
47f3ce52 AW |
616 | (case command |
617 | ((enumerate) | |
618 | (if (zero? length) | |
619 | '() | |
620 | `((start | |
621 | ,(if (or (and (eq? length 1) | |
622 | (char-alphabetic? (string-ref line 0))) | |
623 | (string-every char-numeric? line)) | |
624 | line | |
625 | (parser-error | |
626 | port "Invalid enumerate start" line)))))) | |
627 | ((itemize) | |
628 | `((bullet | |
629 | ,(or (and (eq? length 1) line) | |
630 | (and (string-null? line) '(bullet)) | |
631 | (list (get-formatter)))))) | |
632 | (else ;; tables of various varieties | |
633 | `((formatter (,(get-formatter)))))))) | |
634 | ||
635 | (define (complete-start-command command port) | |
636 | (define (get-arguments type arg-names stop-char) | |
637 | (arguments->attlist port (read-arguments port stop-char) arg-names)) | |
638 | ||
639 | (let* ((spec (command-spec command)) | |
640 | (type (cadr spec)) | |
641 | (arg-names (cddr spec))) | |
642 | (case type | |
31d59769 AW |
643 | ((ALIAS) |
644 | (complete-start-command arg-names port)) | |
47f3ce52 AW |
645 | ((INLINE-TEXT) |
646 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
647 | (values command '() type)) | |
648 | ((INLINE-ARGS) | |
649 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
650 | (values command (get-arguments type arg-names #\}) type)) | |
651 | ((EOL-ARGS) | |
652 | (values command (get-arguments type arg-names #\newline) type)) | |
653 | ((ENVIRON ENTRY INDEX) | |
654 | (skip-horizontal-whitespace port) | |
655 | (values command (parse-environment-args command port) type)) | |
656 | ((TABLE-ENVIRON) | |
657 | (skip-horizontal-whitespace port) | |
658 | (values command (parse-table-args command port) type)) | |
659 | ((EOL-TEXT) | |
660 | (skip-horizontal-whitespace port) | |
661 | (values command '() type)) | |
662 | ((EOL-TEXT-ARGS) | |
663 | (skip-horizontal-whitespace port) | |
664 | (values command (parse-eol-text-args command port) type)) | |
665 | ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT) | |
666 | (values command '() type)) | |
667 | (else ;; INCLUDE shouldn't get here | |
668 | (parser-error port "can't happen"))))) | |
669 | ||
670 | ;;----------------------------------------------------------------------------- | |
671 | ;; Higher-level parsers and scanners | |
672 | ;; | |
673 | ;; They parse productions corresponding entire @-commands. | |
674 | ||
675 | ;; Only reads @settitle, leaves it to the command parser to finish | |
676 | ;; reading the title. | |
677 | (define (take-until-settitle port) | |
678 | (or (find-string-from-port? "\n@settitle " port) | |
679 | (parser-error port "No \\n@settitle found")) | |
680 | (skip-horizontal-whitespace port) | |
681 | (and (eq? (peek-char port) #\newline) | |
682 | (parser-error port "You have a @settitle, but no title"))) | |
683 | ||
684 | ;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED | |
685 | ;; | |
686 | ;; This procedure is to read the CharData of a texinfo document. | |
687 | ;; | |
688 | ;; text ::= (CharData | Command)* | |
689 | ;; | |
690 | ;; The procedure reads CharData and stops at @-commands (or | |
691 | ;; environments). It also stops at an open or close brace. | |
692 | ;; | |
693 | ;; port | |
694 | ;; a PORT to read | |
695 | ;; expect-eof? | |
696 | ;; a boolean indicating if EOF is normal, i.e., the character | |
697 | ;; data may be terminated by the EOF. EOF is normal | |
698 | ;; while processing the main document. | |
699 | ;; preserve-ws? | |
700 | ;; a boolean indicating if we are within a whitespace-preserving | |
701 | ;; environment. If #t, suppress paragraph detection. | |
702 | ;; str-handler | |
703 | ;; a STR-HANDLER, see read-verbatim-body | |
704 | ;; seed | |
705 | ;; an argument passed to the first invocation of STR-HANDLER. | |
706 | ;; | |
707 | ;; The procedure returns two results: SEED and TOKEN. The SEED is the | |
708 | ;; result of the last invocation of STR-HANDLER, or the original seed if | |
709 | ;; STR-HANDLER was never called. | |
710 | ;; | |
711 | ;; TOKEN can be either an eof-object (this can happen only if expect-eof? | |
712 | ;; was #t), or a texinfo token denoting the start or end of a tag. | |
713 | ||
714 | ;; read-char-data port expect-eof? preserve-ws? str-handler seed | |
715 | (define read-char-data | |
716 | (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline))) | |
717 | (define (handle str-handler str1 str2 seed) | |
718 | (if (and (string-null? str1) (string-null? str2)) | |
719 | seed | |
720 | (str-handler str1 str2 seed))) | |
721 | ||
722 | (lambda (port expect-eof? preserve-ws? str-handler seed) | |
723 | (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof))) | |
724 | (let loop ((seed seed)) | |
725 | (let* ((fragment (next-token '() end-chars "reading char data" port)) | |
726 | (term-char (peek-char port))) ; one of end-chars | |
727 | (cond | |
728 | ((eof-object? term-char) ; only if expect-eof? | |
729 | (values (handle str-handler fragment "" seed) term-char)) | |
730 | ((memq term-char '(#\@ #\{ #\})) | |
731 | (values (handle str-handler fragment "" seed) | |
732 | (case term-char | |
733 | ((#\@) (read-command-token port)) | |
734 | ((#\{) (make-token 'START '*braces*)) | |
735 | ((#\}) (read-char port) (make-token 'END #f))))) | |
736 | ((eq? term-char #\newline) | |
737 | ;; Always significant, unless directly before an end token. | |
738 | (let ((c (peek-next-char port))) | |
739 | (cond | |
740 | ((eof-object? c) | |
741 | (or expect-eof? | |
742 | (parser-error port "EOF while reading char data")) | |
743 | (values (handle str-handler fragment "" seed) c)) | |
744 | ((eq? c #\@) | |
745 | (let* ((token (read-command-token port)) | |
746 | (end? (eq? (token-kind token) 'END))) | |
747 | (values | |
748 | (handle str-handler fragment (if end? "" " ") seed) | |
749 | token))) | |
750 | ((and (not preserve-ws?) (eq? c #\newline)) | |
751 | ;; paragraph-separator ::= #\newline #\newline+ | |
752 | (skip-while '(#\newline) port) | |
753 | (skip-horizontal-whitespace port) | |
754 | (values (handle str-handler fragment "" seed) | |
755 | (make-token 'PARA 'para))) | |
756 | (else | |
757 | (loop (handle str-handler fragment | |
758 | (if preserve-ws? "\n" " ") seed))))))))))))) | |
759 | ||
760 | ; procedure+: assert-token TOKEN KIND NAME | |
761 | ; Make sure that TOKEN is of anticipated KIND and has anticipated NAME | |
762 | (define (assert-token token kind name) | |
763 | (or (and (token? token) | |
764 | (eq? kind (token-kind token)) | |
765 | (equal? name (token-head token))) | |
766 | (parser-error #f "Expecting @end for " name ", got " token))) | |
767 | ||
768 | ;;======================================================================== | |
769 | ;; Highest-level parsers: Texinfo to SXML | |
770 | ||
771 | ;; These parsers are a set of syntactic forms to instantiate a SSAX | |
772 | ;; parser. The user tells what to do with the parsed character and | |
773 | ;; element data. These latter handlers determine if the parsing follows a | |
774 | ;; SAX or a DOM model. | |
775 | ||
776 | ;; syntax: make-command-parser fdown fup str-handler | |
777 | ||
778 | ;; Create a parser to parse and process one element, including its | |
779 | ;; character content or children elements. The parser is typically | |
780 | ;; applied to the root element of a document. | |
781 | ||
782 | ;; fdown | |
783 | ;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED | |
784 | ;; | |
785 | ;; This procedure is to generate the seed to be passed to handlers | |
786 | ;; that process the content of the element. This is the function | |
787 | ;; identified as 'fdown' in the denotational semantics of the XML | |
788 | ;; parser given in the title comments to (sxml ssax). | |
789 | ;; | |
790 | ;; fup | |
791 | ;; procedure COMMAND ARGUMENTS PARENT-SEED SEED | |
792 | ;; | |
793 | ;; This procedure is called when parsing of COMMAND is finished. | |
794 | ;; The SEED is the result from the last content parser (or from | |
795 | ;; fdown if the element has the empty content). PARENT-SEED is the | |
796 | ;; same seed as was passed to fdown. The procedure is to generate a | |
797 | ;; seed that will be the result of the element parser. This is the | |
798 | ;; function identified as 'fup' in the denotational semantics of | |
799 | ;; the XML parser given in the title comments to (sxml ssax). | |
800 | ;; | |
801 | ;; str-handler | |
802 | ;; A STR-HANDLER, see read-verbatim-body | |
803 | ;; | |
804 | ||
805 | ;; The generated parser is a | |
806 | ;; procedure COMMAND PORT SEED | |
807 | ;; | |
808 | ;; The procedure must be called *after* the command token has been read. | |
809 | ||
810 | (define (read-include-file-name port) | |
811 | (let ((x (string-trim-both (read-eof-line port)))) | |
812 | (if (string-null? x) | |
813 | (error "no file listed") | |
814 | x))) ;; fixme: should expand @value{} references | |
815 | ||
816 | (define (sxml->node-name sxml) | |
817 | "Turn some sxml string into a valid node name." | |
818 | (let loop ((in (string->list (sxml->string sxml))) (out '())) | |
819 | (if (null? in) | |
820 | (apply string (reverse out)) | |
821 | (if (memq (car in) '(#\{ #\} #\@ #\,)) | |
822 | (loop (cdr in) out) | |
823 | (loop (cdr in) (cons (car in) out)))))) | |
824 | ||
825 | (define (index command arguments fdown fup parent-seed) | |
826 | (case command | |
827 | ((deftp defcv defivar deftypeivar defop deftypeop defmethod | |
828 | deftypemethod defopt defvr defvar deftypevr deftypevar deffn | |
829 | deftypefn defspec defmac defun deftypefun) | |
830 | (let ((args `((name ,(string-append (symbol->string command) "-" | |
831 | (cadr (assq 'name arguments))))))) | |
832 | (fup 'anchor args parent-seed | |
833 | (fdown 'anchor args 'INLINE-ARGS '())))) | |
834 | ((cindex findex vindex kindex pindex tindex) | |
835 | (let ((args `((name ,(string-append (symbol->string command) "-" | |
836 | (sxml->node-name | |
837 | (assq 'entry arguments))))))) | |
838 | (fup 'anchor args parent-seed | |
839 | (fdown 'anchor args 'INLINE-ARGS '())))) | |
840 | (else parent-seed))) | |
841 | ||
842 | (define (make-command-parser fdown fup str-handler) | |
843 | (lambda (command port seed) | |
844 | (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed)) | |
845 | (let*-values (((command arguments expected-content) | |
846 | (complete-start-command command port))) | |
847 | (let* ((parent-seed (index command arguments fdown fup parent-seed)) | |
848 | (seed (fdown command arguments expected-content parent-seed)) | |
849 | (eof-closes? (or (memq command '(texinfo para *fragment*)) | |
850 | (eq? expected-content 'EOL-TEXT))) | |
851 | (sig-ws? (or sig-ws? (space-significant? command))) | |
852 | (up (lambda (s) (fup command arguments parent-seed s))) | |
853 | (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s))) | |
854 | (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s))))) | |
855 | ||
856 | (define (port-for-content) | |
857 | (if (eq? expected-content 'EOL-TEXT) | |
858 | (call-with-input-string (read-text-line port) identity) | |
859 | port)) | |
860 | ||
861 | (cond | |
862 | ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX | |
863 | EOL-TEXT-ARGS)) | |
864 | ;; empty or finished by complete-start-command | |
865 | (up seed)) | |
866 | ((eq? command 'verbatim) | |
867 | (up (read-verbatim-body port str-handler seed))) | |
868 | (else | |
869 | (let loop ((port (port-for-content)) | |
870 | (expect-eof? eof-closes?) | |
871 | (end-para identity) | |
872 | (need-break? (and (not sig-ws?) | |
873 | (memq expected-content | |
874 | '(ENVIRON TABLE-ENVIRON | |
875 | ENTRY ITEM FRAGMENT)))) | |
876 | (seed seed)) | |
877 | (cond | |
878 | ((and need-break? (or sig-ws? (skip-whitespace port)) | |
879 | (not (memq (peek-char port) '(#\@ #\}))) | |
880 | (not (eof-object? (peek-char port)))) | |
881 | ;; Even if we have an @, it might be inline -- check | |
882 | ;; that later | |
883 | (let ((seed (end-para seed))) | |
884 | (loop port expect-eof? (make-end-para seed) #f | |
885 | (new-para seed)))) | |
886 | (else | |
887 | (let*-values (((seed token) | |
888 | (read-char-data | |
889 | port expect-eof? sig-ws? str-handler seed))) | |
890 | (cond | |
891 | ((eof-object? token) | |
892 | (case expect-eof? | |
893 | ((include #f) (end-para seed)) | |
894 | (else (up (end-para seed))))) | |
895 | (else | |
896 | (case (token-kind token) | |
897 | ((STRING) | |
898 | ;; this is only @-commands that escape | |
899 | ;; characters: @}, @@, @{ -- new para if need-break | |
900 | (let ((seed ((if need-break? end-para identity) seed))) | |
901 | (loop port expect-eof? | |
902 | (if need-break? (make-end-para seed) end-para) #f | |
903 | (str-handler (token-head token) "" | |
904 | ((if need-break? new-para identity) | |
905 | seed))))) | |
906 | ((END) | |
907 | ;; The end will only have a name if it's for an | |
908 | ;; environment | |
909 | (cond | |
910 | ((memq command '(item entry)) | |
911 | (let ((spec (command-spec (token-head token)))) | |
912 | (or (eq? (cadr spec) 'TABLE-ENVIRON) | |
913 | (parser-error | |
914 | port "@item not ended by @end table/enumerate/itemize" | |
915 | token)))) | |
916 | ((eq? expected-content 'ENVIRON) | |
917 | (assert-token token 'END command))) | |
918 | (up (end-para seed))) | |
919 | ((ITEM) | |
920 | (cond | |
921 | ((memq command '(enumerate itemize)) | |
922 | (up (visit 'item port sig-ws? (end-para seed)))) | |
923 | ((eq? expected-content 'TABLE-ENVIRON) | |
924 | (up (visit 'entry port sig-ws? (end-para seed)))) | |
925 | ((memq command '(item entry)) | |
926 | (visit command port sig-ws? (up (end-para seed)))) | |
927 | (else | |
928 | (parser-error | |
929 | port "@item must be within a table environment" | |
930 | command)))) | |
931 | ((PARA) | |
932 | ;; examine valid paragraphs? | |
933 | (loop port expect-eof? end-para (not sig-ws?) seed)) | |
934 | ((INCLUDE) | |
935 | ;; Recurse for include files | |
936 | (let ((seed (call-with-file-and-dir | |
937 | (read-include-file-name port) | |
938 | (lambda (port) | |
939 | (loop port 'include end-para | |
940 | need-break? seed))))) | |
941 | (loop port expect-eof? end-para need-break? seed))) | |
942 | ((START) ; Start of an @-command | |
943 | (let* ((head (token-head token)) | |
944 | (type (cadr (command-spec head))) | |
945 | (inline? (inline-content? type)) | |
946 | (seed ((if (and inline? (not need-break?)) | |
947 | identity end-para) seed)) | |
948 | (end-para (if inline? | |
949 | (if need-break? (make-end-para seed) | |
950 | end-para) | |
951 | identity)) | |
952 | (new-para (if (and inline? need-break?) | |
953 | new-para identity))) | |
954 | (loop port expect-eof? end-para (not inline?) | |
955 | (visit head port sig-ws? (new-para seed))))) | |
956 | (else | |
957 | (parser-error port "Unknown token type" token)))))))))))))))) | |
958 | ||
959 | ;; procedure: reverse-collect-str-drop-ws fragments | |
960 | ;; | |
961 | ;; Given the list of fragments (some of which are text strings), reverse | |
962 | ;; the list and concatenate adjacent text strings. We also drop | |
963 | ;; "unsignificant" whitespace, that is, whitespace in front, behind and | |
964 | ;; between elements. The whitespace that is included in character data | |
965 | ;; is not affected. | |
966 | (define (reverse-collect-str-drop-ws fragments) | |
967 | (cond | |
968 | ((null? fragments) ; a shortcut | |
969 | '()) | |
970 | ((and (string? (car fragments)) ; another shortcut | |
971 | (null? (cdr fragments)) ; remove single ws-only string | |
972 | (string-whitespace? (car fragments))) | |
973 | '()) | |
974 | (else | |
975 | (let loop ((fragments fragments) (result '()) (strs '()) | |
976 | (all-whitespace? #t)) | |
977 | (cond | |
978 | ((null? fragments) | |
979 | (if all-whitespace? | |
980 | result ; remove leading ws | |
981 | (cons (apply string-append strs) result))) | |
982 | ((string? (car fragments)) | |
983 | (loop (cdr fragments) result (cons (car fragments) strs) | |
984 | (and all-whitespace? | |
985 | (string-whitespace? (car fragments))))) | |
986 | (else | |
987 | (loop (cdr fragments) | |
988 | (cons | |
989 | (car fragments) | |
990 | (cond | |
991 | ((null? strs) result) | |
992 | (all-whitespace? | |
993 | (if (null? result) | |
994 | result ; remove trailing whitespace | |
995 | (cons " " result))); replace interstitial ws with | |
996 | ; one space | |
997 | (else | |
998 | (cons (apply string-append strs) result)))) | |
999 | '() #t))))))) | |
1000 | ||
1001 | (define (make-dom-parser) | |
1002 | (make-command-parser | |
1003 | (lambda (command args content seed) ; fdown | |
1004 | '()) | |
1005 | (lambda (command args parent-seed seed) ; fup | |
1006 | (let ((seed (reverse-collect-str-drop-ws seed))) | |
1007 | (acons command | |
1008 | (if (null? args) seed (acons '% args seed)) | |
1009 | parent-seed))) | |
1010 | (lambda (string1 string2 seed) ; str-handler | |
1011 | (if (string-null? string2) | |
1012 | (cons string1 seed) | |
1013 | (cons* string2 string1 seed))))) | |
1014 | ||
1015 | (define parse-environment-args | |
1016 | (let ((parser (make-dom-parser))) | |
1017 | ;; duplicate arguments->attlist to avoid unnecessary splitting | |
1018 | (lambda (command port) | |
1019 | (let ((args (cdar (parser '*ENVIRON-ARGS* port '()))) | |
1020 | (arg-names (cddr (command-spec command)))) | |
1021 | (cond | |
1022 | ((not arg-names) | |
1023 | (if (null? args) '() | |
1024 | (parser-error port "@-command doesn't take args" command))) | |
1025 | ((eq? arg-names #t) | |
1026 | (list (cons 'arguments args))) | |
1027 | (else | |
1028 | (let loop ((args args) (arg-names arg-names) (out '())) | |
1029 | (cond | |
1030 | ((null? arg-names) | |
1031 | (if (null? args) (reverse! out) | |
1032 | (parser-error port "@-command didn't expect more args" | |
1033 | command args))) | |
1034 | ((symbol? arg-names) | |
1035 | (reverse! (acons arg-names args out))) | |
1036 | ((null? args) | |
1037 | (parser-error port "@-command expects more args" | |
1038 | command arg-names)) | |
1039 | ((and (string? (car args)) (string-index (car args) #\space)) | |
1040 | => (lambda (i) | |
1041 | (let ((rest (substring/shared (car args) (1+ i)))) | |
1042 | (if (zero? i) | |
1043 | (loop (cons rest (cdr args)) arg-names out) | |
1044 | (loop (cons rest (cdr args)) (cdr arg-names) | |
1045 | (cons (list (car arg-names) | |
1046 | (substring (car args) 0 i)) | |
1047 | out)))))) | |
1048 | (else | |
1049 | (loop (cdr args) (cdr arg-names) | |
1050 | (if (and (pair? (car args)) (eq? (caar args) '*braces*)) | |
1051 | (acons (car arg-names) (cdar args) out) | |
1052 | (cons (list (car arg-names) (car args)) out)))))))))))) | |
1053 | ||
1054 | (define (parse-eol-text-args command port) | |
1055 | ;; perhaps parse-environment-args should be named more | |
1056 | ;; generically. | |
1057 | (parse-environment-args command port)) | |
1058 | ||
1059 | ;; procedure: texi-fragment->stexi STRING | |
1060 | ;; | |
1061 | ;; A DOM parser for a texinfo fragment STRING. | |
1062 | ;; | |
1063 | ;; The procedure returns an SXML tree headed by the special tag, | |
1064 | ;; *fragment*. | |
1065 | ||
1066 | (define (texi-fragment->stexi string-or-port) | |
1067 | "Parse the texinfo commands in @var{string-or-port}, and return the | |
1068 | resultant stexi tree. The head of the tree will be the special command, | |
1069 | @code{*fragment*}." | |
1070 | (define (parse port) | |
1071 | (postprocess (car ((make-dom-parser) '*fragment* port '())))) | |
1072 | (if (input-port? string-or-port) | |
1073 | (parse string-or-port) | |
1074 | (call-with-input-string string-or-port parse))) | |
1075 | ||
1076 | ;; procedure: texi->stexi PORT | |
1077 | ;; | |
1078 | ;; This is an instance of a SSAX parser above that returns an SXML | |
1079 | ;; representation of the texinfo document ready to be read at PORT. | |
1080 | ;; | |
1081 | ;; The procedure returns an SXML tree. The port points to the | |
1082 | ;; first character after the @bye, or to the end of the file. | |
1083 | ||
1084 | (define (texi->stexi port) | |
1085 | "Read a full texinfo document from @var{port} and return the parsed | |
1086 | stexi tree. The parsing will start at the @code{@@settitle} and end at | |
1087 | @code{@@bye} or EOF." | |
1088 | (let ((parser (make-dom-parser))) | |
1089 | (take-until-settitle port) | |
1090 | (postprocess (car (parser 'texinfo port '()))))) | |
1091 | ||
1092 | (define (car-eq? x y) (and (pair? x) (eq? (car x) y))) | |
1093 | (define (make-contents tree) | |
1094 | (define (lp in out depth) | |
1095 | (cond | |
1096 | ((null? in) (values in (cons 'enumerate (reverse! out)))) | |
1097 | ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4)) | |
1098 | => (lambda (new-depth) | |
1099 | (let ((node-name (and (car-eq? (car in) 'node) | |
1100 | (cadr (assq 'name (cdadar in)))))) | |
1101 | (cond | |
1102 | ((< new-depth depth) | |
1103 | (values in (cons 'enumerate (reverse! out)))) | |
1104 | ((> new-depth depth) | |
1105 | (let ((out-cdr (if (null? out) '() (cdr out))) | |
1106 | (out-car (if (null? out) (list 'item) (car out)))) | |
1107 | (let*-values (((new-in new-out) (lp in '() (1+ depth)))) | |
1108 | (lp new-in | |
1109 | (cons (append out-car (list new-out)) out-cdr) | |
1110 | depth)))) | |
1111 | (else ;; same depth | |
1112 | (lp (cddr in) | |
1113 | (cons | |
1114 | `(item (para | |
1115 | ,@(if node-name | |
1116 | `((ref (% (node ,node-name)))) | |
1117 | (cdadr in)))) | |
1118 | out) | |
1119 | depth)))))) | |
1120 | (else (lp (cdr in) out depth)))) | |
1121 | (let*-values (((_ contents) (lp tree '() 1))) | |
1122 | `((chapheading "Table of Contents") ,contents))) | |
1123 | ||
1124 | (define (trim-whitespace str trim-left? trim-right?) | |
1125 | (let* ((left-space? (and (not trim-left?) | |
1126 | (string-prefix? " " str))) | |
1127 | (right-space? (and (not trim-right?) | |
1128 | (string-suffix? " " str))) | |
1129 | (tail (append! (string-tokenize str) | |
1130 | (if right-space? '("") '())))) | |
1131 | (string-join (if left-space? (cons "" tail) tail)))) | |
1132 | ||
1133 | (define (postprocess tree) | |
1134 | (define (loop in out state first? sig-ws?) | |
1135 | (cond | |
1136 | ((null? in) | |
1137 | (values (reverse! out) state)) | |
1138 | ((string? (car in)) | |
1139 | (loop (cdr in) | |
1140 | (cons (if sig-ws? (car in) | |
1141 | (trim-whitespace (car in) first? (null? (cdr in)))) | |
1142 | out) | |
1143 | state #f sig-ws?)) | |
1144 | ((pair? (car in)) | |
1145 | (case (caar in) | |
1146 | ((set) | |
1147 | (if (null? (cdar in)) (error "@set missing arguments" in)) | |
1148 | (if (string? (cadar in)) | |
1149 | (let ((i (string-index (cadar in) #\space))) | |
1150 | (if i | |
1151 | (loop (cdr in) out | |
1152 | (acons (substring (cadar in) 0 i) | |
1153 | (cons (substring (cadar in) (1+ i)) (cddar in)) | |
1154 | state) | |
1155 | #f sig-ws?) | |
1156 | (loop (cdr in) out (acons (cadar in) (cddar in) state) | |
1157 | #f sig-ws?))) | |
1158 | (error "expected a constant to define for @set" in))) | |
1159 | ((value) | |
1160 | (loop (fold-right cons (cdr in) | |
1161 | (or (and=> | |
1162 | (assoc (cadr (assq 'key (cdadar in))) state) cdr) | |
1163 | (error "unknown value" (cdadar in) state))) | |
1164 | out | |
1165 | state #f sig-ws?)) | |
1166 | ((copying) | |
1167 | (loop (cdr in) out (cons (car in) state) #f sig-ws?)) | |
1168 | ((insertcopying) | |
1169 | (loop (fold-right cons (cdr in) | |
1170 | (or (cdr (assoc 'copying state)) | |
1171 | (error "copying isn't set yet"))) | |
1172 | out | |
1173 | state #f sig-ws?)) | |
1174 | ((contents) | |
1175 | (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?)) | |
1176 | (else | |
1177 | (let*-values (((kid-out state) | |
1178 | (loop (car in) '() state #t | |
1179 | (or sig-ws? (space-significant? (caar in)))))) | |
1180 | (loop (cdr in) (cons kid-out out) state #f sig-ws?))))) | |
1181 | (else ; a symbol | |
1182 | (loop (cdr in) (cons (car in) out) state #t sig-ws?)))) | |
1183 | ||
1184 | (call-with-values | |
1185 | (lambda () (loop tree '() '() #t #f)) | |
1186 | (lambda (out state) out))) | |
1187 | ||
1188 | ;; Replace % with texinfo-arguments. | |
1189 | (define (stexi->sxml tree) | |
1190 | "Transform the stexi tree @var{tree} into sxml. This involves | |
1191 | replacing the @code{%} element that keeps the texinfo arguments with an | |
1192 | element for each argument. | |
1193 | ||
1194 | FIXME: right now it just changes % to @code{texinfo-arguments} -- that | |
1195 | doesn't hang with the idea of making a dtd at some point" | |
1196 | (pre-post-order | |
1197 | tree | |
1198 | `((% . ,(lambda (x . t) (cons 'texinfo-arguments t))) | |
1199 | (*text* . ,(lambda (x t) t)) | |
1200 | (*default* . ,(lambda (x . t) (cons x t)))))) | |
1201 | ||
1202 | ;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5 | |
1203 | ;;; texinfo.scm ends here |