Commit | Line | Data |
---|---|---|
47f3ce52 AW |
1 | ;;;; (texinfo) -- parsing of texinfo into SXML |
2 | ;;;; | |
3 | ;;;; Copyright (C) 2009 Free Software Foundation, Inc. | |
4 | ;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com> | |
5 | ;;;; Copyright (C) 2001,2002 Oleg Kiselyov <oleg at pobox dot com> | |
6 | ;;;; | |
7 | ;;;; This file is based on SSAX's SSAX.scm. | |
8 | ;;;; | |
9 | ;;;; This library is free software; you can redistribute it and/or | |
10 | ;;;; modify it under the terms of the GNU Lesser General Public | |
11 | ;;;; License as published by the Free Software Foundation; either | |
12 | ;;;; version 3 of the License, or (at your option) any later version. | |
13 | ;;;; | |
14 | ;;;; This library is distributed in the hope that it will be useful, | |
15 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | ;;;; Lesser General Public License for more details. | |
18 | ;;;; | |
19 | ;;;; You should have received a copy of the GNU Lesser General Public | |
20 | ;;;; License along with this library; if not, write to the Free Software | |
21 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | \f | |
23 | ;;; Commentary: | |
24 | ;; | |
25 | ;; @subheading Texinfo processing in scheme | |
26 | ;; | |
27 | ;; This module parses texinfo into SXML. TeX will always be the | |
28 | ;; processor of choice for print output, of course. However, although | |
29 | ;; @code{makeinfo} works well for info, its output in other formats is | |
30 | ;; not very customizable, and the program is not extensible as a whole. | |
31 | ;; This module aims to provide an extensible framework for texinfo | |
32 | ;; processing that integrates texinfo into the constellation of SXML | |
33 | ;; processing tools. | |
34 | ;; | |
35 | ;; @subheading Notes on the SXML vocabulary | |
36 | ;; | |
37 | ;; Consider the following texinfo fragment: | |
38 | ;; | |
39 | ;;@example | |
40 | ;; @@deffn Primitive set-car! pair value | |
41 | ;; This function... | |
42 | ;; @@end deffn | |
43 | ;;@end example | |
44 | ;; | |
45 | ;; Logically, the category (Primitive), name (set-car!), and arguments | |
46 | ;; (pair value) are ``attributes'' of the deffn, with the description as | |
47 | ;; the content. However, texinfo allows for @@-commands within the | |
48 | ;; arguments to an environment, like @code{@@deffn}, which means that | |
49 | ;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand, | |
50 | ;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are | |
51 | ;; called ``arguments'', and are grouped under the special element, `%'. | |
52 | ;; | |
53 | ;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In | |
54 | ;; the interests of interoperability, this module provides a conversion | |
55 | ;; function to replace the `%' with `texinfo-arguments'. | |
56 | ;; | |
57 | ;;; Code: | |
58 | ||
59 | ;; Comparison to xml output of texinfo (which is rather undocumented): | |
60 | ;; Doesn't conform to texinfo dtd | |
61 | ;; No DTD at all, in fact :-/ | |
62 | ;; Actually outputs valid xml, after transforming % | |
63 | ;; Slower (although with caching the SXML that problem can go away) | |
64 | ;; Doesn't parse menus (although menus are shite) | |
65 | ;; Args go in a dedicated element, FBOFW | |
66 | ;; Definitions are handled a lot better | |
67 | ;; Does parse comments | |
68 | ;; Outputs only significant line breaks (a biggie!) | |
69 | ;; Nodes are treated as anchors, rather than content organizers (a biggie) | |
70 | ;; (more book-like, less info-like) | |
71 | ||
72 | ;; TODO | |
73 | ;; Integration: help, indexing, plain text | |
74 | ||
75 | (define-module (texinfo) | |
76 | #:use-module (sxml simple) | |
77 | #:use-module (sxml transform) | |
78 | #:use-module (sxml ssax input-parse) | |
79 | #:use-module (srfi srfi-1) | |
80 | #:use-module (srfi srfi-13) | |
81 | #:export (call-with-file-and-dir | |
82 | texi-command-specs | |
83 | texi-command-depth | |
84 | texi-fragment->stexi | |
85 | texi->stexi | |
86 | stexi->sxml)) | |
87 | ||
88 | ;; Some utilities | |
89 | ||
90 | (define (parser-error port message . rest) | |
05c29c5a | 91 | (apply throw 'parser-error port message rest)) |
47f3ce52 AW |
92 | |
93 | (define (call-with-file-and-dir filename proc) | |
94 | "Call the one-argument procedure @var{proc} with an input port that | |
95 | reads from @var{filename}. During the dynamic extent of @var{proc}'s | |
96 | execution, the current directory will be @code{(dirname | |
97 | @var{filename})}. This is useful for parsing documents that can include | |
98 | files by relative path name." | |
99 | (let ((current-dir (getcwd))) | |
100 | (dynamic-wind | |
101 | (lambda () (chdir (dirname filename))) | |
102 | (lambda () | |
103 | (call-with-input-file (basename filename) proc)) | |
104 | (lambda () (chdir current-dir))))) | |
105 | ||
106 | ;; Define this version here, because (srfi srfi-11)'s definition uses | |
107 | ;; syntax-rules, which is really damn slow | |
108 | (define-macro (let*-values bindings . body) | |
109 | (if (null? bindings) (cons 'begin body) | |
110 | (apply | |
111 | (lambda (vars initializer) | |
112 | (let ((cont | |
113 | (cons 'let*-values | |
114 | (cons (cdr bindings) body)))) | |
115 | (cond | |
116 | ((not (pair? vars)) ; regular let case, a single var | |
117 | `(let ((,vars ,initializer)) ,cont)) | |
118 | ((null? (cdr vars)) ; single var, see the prev case | |
119 | `(let ((,(car vars) ,initializer)) ,cont)) | |
120 | (else ; the most generic case | |
121 | `(call-with-values (lambda () ,initializer) | |
122 | (lambda ,vars ,cont)))))) | |
123 | (car bindings)))) | |
124 | ||
125 | ;;======================================================================== | |
126 | ;; Reflection on the XML vocabulary | |
127 | ||
128 | (define texi-command-specs | |
129 | #; | |
130 | "A list of (@var{name} @var{content-model} . @var{args}) | |
131 | ||
132 | @table @var | |
133 | @item name | |
134 | The name of an @@-command, as a symbol. | |
135 | ||
136 | @item content-model | |
137 | A symbol indicating the syntactic type of the @@-command: | |
138 | @table @code | |
139 | @item EMPTY-COMMAND | |
140 | No content, and no @code{@@end} is coming | |
141 | @item EOL-ARGS | |
142 | Unparsed arguments until end of line | |
143 | @item EOL-TEXT | |
144 | Parsed arguments until end of line | |
145 | @item INLINE-ARGS | |
146 | Unparsed arguments ending with @code{#\\@}} | |
147 | @item INLINE-TEXT | |
148 | Parsed arguments ending with @code{#\\@}} | |
149 | @item ENVIRON | |
150 | The tag is an environment tag, expect @code{@@end foo}. | |
151 | @item TABLE-ENVIRON | |
152 | Like ENVIRON, but with special parsing rules for its arguments. | |
153 | @item FRAGMENT | |
154 | For @code{*fragment*}, the command used for parsing fragments of | |
155 | texinfo documents. | |
156 | @end table | |
157 | ||
158 | @code{INLINE-TEXT} commands will receive their arguments within their | |
159 | bodies, whereas the @code{-ARGS} commands will receive them in their | |
160 | attribute list. | |
161 | ||
162 | @code{EOF-TEXT} receives its arguments in its body. | |
163 | ||
164 | @code{ENVIRON} commands have both: parsed arguments until the end of | |
165 | line, received through their attribute list, and parsed text until the | |
166 | @code{@@end}, received in their bodies. | |
167 | ||
168 | @code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in | |
169 | @code{ENVIRON}. | |
170 | ||
171 | There are four @@-commands that are treated specially. @code{@@include} | |
172 | is a low-level token that will not be seen by higher-level parsers, so | |
173 | it has no content-model. @code{@@para} is the paragraph command, which | |
174 | is only implicit in the texinfo source. @code{@@item} has special | |
175 | syntax, as noted above, and @code{@@entry} is how this parser treats | |
176 | @code{@@item} commands within @code{@@table}, @code{@@ftable}, and | |
177 | @code{@@vtable}. | |
178 | ||
179 | Also, indexing commands (@code{@@cindex}, etc.) are treated specially. | |
180 | Their arguments are parsed, but they are needed before entering the | |
181 | element so that an anchor can be inserted into the text before the index | |
182 | entry. | |
183 | ||
184 | @item args | |
185 | Named arguments to the command, in the same format as the formals for a | |
186 | lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS}, | |
187 | @code{ENVIRON}, @code{TABLE-ENVIRON} commands. | |
188 | @end table" | |
189 | '(;; Special commands | |
190 | (include #f) ;; this is a low-level token | |
191 | (para PARAGRAPH) | |
192 | (item ITEM) | |
193 | (entry ENTRY . heading) | |
194 | (noindent EMPTY-COMMAND) | |
195 | (*fragment* FRAGMENT) | |
196 | ||
197 | ;; Inline text commands | |
198 | (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant | |
199 | (bold INLINE-TEXT) | |
200 | (sample INLINE-TEXT) | |
201 | (samp INLINE-TEXT) | |
202 | (code INLINE-TEXT) | |
203 | (kbd INLINE-TEXT) | |
204 | (key INLINE-TEXT) | |
205 | (var INLINE-TEXT) | |
206 | (env INLINE-TEXT) | |
207 | (file INLINE-TEXT) | |
208 | (command INLINE-TEXT) | |
209 | (option INLINE-TEXT) | |
210 | (dfn INLINE-TEXT) | |
211 | (cite INLINE-TEXT) | |
212 | (acro INLINE-TEXT) | |
213 | (url INLINE-TEXT) | |
214 | (email INLINE-TEXT) | |
215 | (emph INLINE-TEXT) | |
216 | (strong INLINE-TEXT) | |
217 | (sample INLINE-TEXT) | |
218 | (sc INLINE-TEXT) | |
219 | (titlefont INLINE-TEXT) | |
220 | (asis INLINE-TEXT) | |
221 | (b INLINE-TEXT) | |
222 | (i INLINE-TEXT) | |
223 | (r INLINE-TEXT) | |
224 | (sansserif INLINE-TEXT) | |
225 | (slanted INLINE-TEXT) | |
226 | (t INLINE-TEXT) | |
227 | ||
228 | ;; Inline args commands | |
229 | (value INLINE-ARGS . (key)) | |
230 | (ref INLINE-ARGS . (node #:opt name section info-file manual)) | |
231 | (xref INLINE-ARGS . (node #:opt name section info-file manual)) | |
232 | (pxref INLINE-ARGS . (node #:opt name section info-file manual)) | |
233 | (uref INLINE-ARGS . (url #:opt title replacement)) | |
234 | (anchor INLINE-ARGS . (name)) | |
235 | (dots INLINE-ARGS . ()) | |
236 | (result INLINE-ARGS . ()) | |
237 | (bullet INLINE-ARGS . ()) | |
238 | (copyright INLINE-ARGS . ()) | |
239 | (tie INLINE-ARGS . ()) | |
240 | (image INLINE-ARGS . (file #:opt width height alt-text extension)) | |
241 | ||
242 | ;; EOL args elements | |
243 | (node EOL-ARGS . (name #:opt next previous up)) | |
244 | (c EOL-ARGS . all) | |
245 | (comment EOL-ARGS . all) | |
246 | (setchapternewpage EOL-ARGS . all) | |
247 | (sp EOL-ARGS . all) | |
248 | (page EOL-ARGS . ()) | |
249 | (vskip EOL-ARGS . all) | |
250 | (syncodeindex EOL-ARGS . all) | |
251 | (contents EOL-ARGS . ()) | |
252 | (shortcontents EOL-ARGS . ()) | |
253 | (summarycontents EOL-ARGS . ()) | |
254 | (insertcopying EOL-ARGS . ()) | |
255 | (dircategory EOL-ARGS . (category)) | |
256 | (top EOL-ARGS . (title)) | |
257 | (printindex EOL-ARGS . (type)) | |
258 | ||
259 | ;; EOL text commands | |
260 | (*ENVIRON-ARGS* EOL-TEXT) | |
261 | (itemx EOL-TEXT) | |
262 | (set EOL-TEXT) | |
263 | (center EOL-TEXT) | |
264 | (title EOL-TEXT) | |
265 | (subtitle EOL-TEXT) | |
266 | (author EOL-TEXT) | |
267 | (chapter EOL-TEXT) | |
268 | (section EOL-TEXT) | |
269 | (appendix EOL-TEXT) | |
270 | (appendixsec EOL-TEXT) | |
271 | (unnumbered EOL-TEXT) | |
272 | (unnumberedsec EOL-TEXT) | |
273 | (subsection EOL-TEXT) | |
274 | (subsubsection EOL-TEXT) | |
275 | (appendixsubsec EOL-TEXT) | |
276 | (appendixsubsubsec EOL-TEXT) | |
277 | (unnumberedsubsec EOL-TEXT) | |
278 | (unnumberedsubsubsec EOL-TEXT) | |
279 | (chapheading EOL-TEXT) | |
280 | (majorheading EOL-TEXT) | |
281 | (heading EOL-TEXT) | |
282 | (subheading EOL-TEXT) | |
283 | (subsubheading EOL-TEXT) | |
284 | ||
285 | (deftpx EOL-TEXT-ARGS . (category name . attributes)) | |
286 | (defcvx EOL-TEXT-ARGS . (category class name)) | |
287 | (defivarx EOL-TEXT-ARGS . (class name)) | |
288 | (deftypeivarx EOL-TEXT-ARGS . (class data-type name)) | |
289 | (defopx EOL-TEXT-ARGS . (category class name . arguments)) | |
290 | (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments)) | |
291 | (defmethodx EOL-TEXT-ARGS . (class name . arguments)) | |
292 | (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments)) | |
293 | (defoptx EOL-TEXT-ARGS . (name)) | |
294 | (defvrx EOL-TEXT-ARGS . (category name)) | |
295 | (defvarx EOL-TEXT-ARGS . (name)) | |
296 | (deftypevrx EOL-TEXT-ARGS . (category data-type name)) | |
297 | (deftypevarx EOL-TEXT-ARGS . (data-type name)) | |
298 | (deffnx EOL-TEXT-ARGS . (category name . arguments)) | |
299 | (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments)) | |
300 | (defspecx EOL-TEXT-ARGS . (name . arguments)) | |
301 | (defmacx EOL-TEXT-ARGS . (name . arguments)) | |
302 | (defunx EOL-TEXT-ARGS . (name . arguments)) | |
303 | (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments)) | |
304 | ||
305 | ;; Indexing commands | |
306 | (cindex INDEX . entry) | |
307 | (findex INDEX . entry) | |
308 | (vindex INDEX . entry) | |
309 | (kindex INDEX . entry) | |
310 | (pindex INDEX . entry) | |
311 | (tindex INDEX . entry) | |
312 | ||
313 | ;; Environment commands (those that need @end) | |
314 | (texinfo ENVIRON . title) | |
315 | (ignore ENVIRON . ()) | |
316 | (ifinfo ENVIRON . ()) | |
317 | (iftex ENVIRON . ()) | |
318 | (ifhtml ENVIRON . ()) | |
319 | (ifxml ENVIRON . ()) | |
320 | (ifplaintext ENVIRON . ()) | |
321 | (ifnotinfo ENVIRON . ()) | |
322 | (ifnottex ENVIRON . ()) | |
323 | (ifnothtml ENVIRON . ()) | |
324 | (ifnotxml ENVIRON . ()) | |
325 | (ifnotplaintext ENVIRON . ()) | |
326 | (titlepage ENVIRON . ()) | |
327 | (menu ENVIRON . ()) | |
328 | (direntry ENVIRON . ()) | |
329 | (copying ENVIRON . ()) | |
330 | (example ENVIRON . ()) | |
331 | (smallexample ENVIRON . ()) | |
332 | (display ENVIRON . ()) | |
333 | (smalldisplay ENVIRON . ()) | |
334 | (verbatim ENVIRON . ()) | |
335 | (format ENVIRON . ()) | |
336 | (smallformat ENVIRON . ()) | |
337 | (lisp ENVIRON . ()) | |
338 | (smalllisp ENVIRON . ()) | |
339 | (cartouche ENVIRON . ()) | |
340 | (quotation ENVIRON . ()) | |
341 | ||
342 | (deftp ENVIRON . (category name . attributes)) | |
343 | (defcv ENVIRON . (category class name)) | |
344 | (defivar ENVIRON . (class name)) | |
345 | (deftypeivar ENVIRON . (class data-type name)) | |
346 | (defop ENVIRON . (category class name . arguments)) | |
347 | (deftypeop ENVIRON . (category class data-type name . arguments)) | |
348 | (defmethod ENVIRON . (class name . arguments)) | |
349 | (deftypemethod ENVIRON . (class data-type name . arguments)) | |
350 | (defopt ENVIRON . (name)) | |
351 | (defvr ENVIRON . (category name)) | |
352 | (defvar ENVIRON . (name)) | |
353 | (deftypevr ENVIRON . (category data-type name)) | |
354 | (deftypevar ENVIRON . (data-type name)) | |
355 | (deffn ENVIRON . (category name . arguments)) | |
356 | (deftypefn ENVIRON . (category data-type name . arguments)) | |
357 | (defspec ENVIRON . (name . arguments)) | |
358 | (defmac ENVIRON . (name . arguments)) | |
359 | (defun ENVIRON . (name . arguments)) | |
360 | (deftypefun ENVIRON . (data-type name . arguments)) | |
361 | ||
362 | (table TABLE-ENVIRON . (formatter)) | |
363 | (itemize TABLE-ENVIRON . (formatter)) | |
364 | (enumerate TABLE-ENVIRON . (start)) | |
365 | (ftable TABLE-ENVIRON . (formatter)) | |
366 | (vtable TABLE-ENVIRON . (formatter)))) | |
367 | ||
368 | (define command-depths | |
369 | '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4) | |
370 | (top . 0) (unnumbered . 1) (unnumberedsec . 2) | |
371 | (unnumberedsubsec . 3) (unnumberedsubsubsec . 4) | |
372 | (appendix . 1) (appendixsec . 2) (appendixsection . 2) | |
373 | (appendixsubsec . 3) (appendixsubsubsec . 4))) | |
374 | (define (texi-command-depth command max-depth) | |
375 | "Given the texinfo command @var{command}, return its nesting level, or | |
376 | @code{#f} if it nests too deep for @var{max-depth}. | |
377 | ||
378 | Examples: | |
379 | @example | |
05c29c5a AW |
380 | (texi-command-depth 'chapter 4) @result{} 1 |
381 | (texi-command-depth 'top 4) @result{} 0 | |
382 | (texi-command-depth 'subsection 4) @result{} 3 | |
383 | (texi-command-depth 'appendixsubsec 4) @result{} 3 | |
384 | (texi-command-depth 'subsection 2) @result{} #f | |
47f3ce52 AW |
385 | @end example" |
386 | (let ((depth (and=> (assq command command-depths) cdr))) | |
387 | (and depth (<= depth max-depth) depth))) | |
388 | ||
389 | ;; The % is for arguments | |
390 | (define (space-significant? command) | |
391 | (memq command | |
392 | '(example smallexample verbatim lisp smalllisp menu %))) | |
393 | ||
394 | ;; Like a DTD for texinfo | |
395 | (define (command-spec command) | |
396 | (or (assq command texi-command-specs) | |
397 | (parser-error #f "Unknown command" command))) | |
398 | ||
399 | (define (inline-content? content) | |
400 | (or (eq? content 'INLINE-TEXT) (eq? content 'INLINE-ARGS))) | |
401 | ||
402 | ||
403 | ;;======================================================================== | |
404 | ;; Lower-level parsers and scanners | |
405 | ;; | |
406 | ;; They deal with primitive lexical units (Names, whitespaces, tags) and | |
407 | ;; with pieces of more generic productions. Most of these parsers must | |
408 | ;; be called in appropriate context. For example, complete-start-command | |
409 | ;; must be called only when the @-command start has been detected and | |
410 | ;; its name token has been read. | |
411 | ||
412 | ;; Test if a string is made of only whitespace | |
413 | ;; An empty string is considered made of whitespace as well | |
414 | (define (string-whitespace? str) | |
415 | (or (string-null? str) | |
416 | (string-every char-whitespace? str))) | |
417 | ||
418 | ;; Like read-text-line, but allows EOF. | |
419 | (define read-eof-breaks '(*eof* #\return #\newline)) | |
420 | (define (read-eof-line port) | |
421 | (if (eof-object? (peek-char port)) | |
422 | (peek-char port) | |
423 | (let* ((line (next-token '() read-eof-breaks | |
424 | "reading a line" port)) | |
425 | (c (read-char port))) ; must be either \n or \r or EOF | |
426 | (if (and (eq? c #\return) (eq? (peek-char port) #\newline)) | |
427 | (read-char port)) ; skip \n that follows \r | |
428 | line))) | |
429 | ||
430 | (define ascii->char integer->char) | |
431 | ||
432 | (define (skip-whitespace port) | |
433 | (skip-while '(#\space #\tab #\return #\newline) port)) | |
434 | ||
435 | (define (skip-horizontal-whitespace port) | |
436 | (skip-while '(#\space #\tab) port)) | |
437 | ||
438 | ;; command ::= Letter+ | |
439 | ||
440 | ;; procedure: read-command PORT | |
441 | ;; | |
442 | ;; Read a command starting from the current position in the PORT and | |
443 | ;; return it as a symbol. | |
444 | (define (read-command port) | |
445 | (let ((first-char (peek-char port))) | |
446 | (or (char-alphabetic? first-char) | |
447 | (parser-error port "Nonalphabetic @-command char: '" first-char "'"))) | |
448 | (string->symbol | |
449 | (next-token-of | |
450 | (lambda (c) | |
451 | (cond | |
452 | ((eof-object? c) #f) | |
453 | ((char-alphabetic? c) c) | |
454 | (else #f))) | |
455 | port))) | |
456 | ||
457 | ;; A token is a primitive lexical unit. It is a record with two fields, | |
458 | ;; token-head and token-kind. | |
459 | ;; | |
460 | ;; Token types: | |
461 | ;; END The end of a texinfo command. If the command is ended by }, | |
462 | ;; token-head will be #f. Otherwise if the command is ended by | |
463 | ;; @end COMMAND, token-head will be COMMAND. As a special case, | |
464 | ;; @bye is the end of a special @texinfo command. | |
465 | ;; START The start of a texinfo command. The token-head will be a | |
466 | ;; symbol of the @-command name. | |
467 | ;; INCLUDE An @include directive. The token-head will be empty -- the | |
468 | ;; caller is responsible for reading the include file name. | |
469 | ;; ITEM @item commands have an irregular syntax. They end at the | |
470 | ;; next @item, or at the end of the environment. For that | |
471 | ;; read-command-token treats them specially. | |
472 | ||
473 | (define (make-token kind head) (cons kind head)) | |
474 | (define token? pair?) | |
475 | (define token-kind car) | |
476 | (define token-head cdr) | |
477 | ||
478 | ;; procedure: read-command-token PORT | |
479 | ;; | |
480 | ;; This procedure starts parsing of a command token. The current | |
481 | ;; position in the stream must be #\@. This procedure scans enough of | |
482 | ;; the input stream to figure out what kind of a command token it is | |
483 | ;; seeing. The procedure returns a token structure describing the token. | |
484 | ||
485 | (define (read-command-token port) | |
486 | (assert-curr-char '(#\@) "start of the command" port) | |
487 | (let ((peeked (peek-char port))) | |
488 | (cond | |
489 | ((memq peeked '(#\! #\. #\? #\@ #\\ #\{ #\})) | |
490 | ;; @-commands that escape characters | |
491 | (make-token 'STRING (string (read-char port)))) | |
492 | (else | |
493 | (let ((name (read-command port))) | |
494 | (case name | |
495 | ((end) | |
496 | ;; got an ending tag | |
497 | (let ((command (string-trim-both | |
498 | (read-eof-line port)))) | |
499 | (or (and (not (string-null? command)) | |
500 | (string-every char-alphabetic? command)) | |
501 | (parser-error port "malformed @end" command)) | |
502 | (make-token 'END (string->symbol command)))) | |
503 | ((bye) | |
504 | ;; the end of the top | |
505 | (make-token 'END 'texinfo)) | |
506 | ((item) | |
507 | (make-token 'ITEM 'item)) | |
508 | ((include) | |
509 | (make-token 'INCLUDE #f)) | |
510 | (else | |
511 | (make-token 'START name)))))))) | |
512 | ||
513 | ;; procedure+: read-verbatim-body PORT STR-HANDLER SEED | |
514 | ;; | |
515 | ;; This procedure must be called after we have read a string | |
516 | ;; "@verbatim\n" that begins a verbatim section. The current position | |
517 | ;; must be the first position of the verbatim body. This function reads | |
518 | ;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a | |
519 | ;; character data consumer. | |
520 | ;; | |
521 | ;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED. | |
522 | ;; The first STRING1 argument to STR-HANDLER never contains a newline. | |
523 | ;; The second STRING2 argument often will. On the first invocation of the | |
524 | ;; STR-HANDLER, the seed is the one passed to read-verbatim-body | |
525 | ;; as the third argument. The result of this first invocation will be | |
526 | ;; passed as the seed argument to the second invocation of the line | |
527 | ;; consumer, and so on. The result of the last invocation of the | |
528 | ;; STR-HANDLER is returned by the read-verbatim-body. Note a | |
529 | ;; similarity to the fundamental 'fold' iterator. | |
530 | ;; | |
531 | ;; Within a verbatim section all characters are taken at their face | |
532 | ;; value. It ends with "\n@end verbatim(\r)?\n". | |
533 | ||
534 | ;; Must be called right after the newline after @verbatim. | |
535 | (define (read-verbatim-body port str-handler seed) | |
536 | (let loop ((seed seed)) | |
537 | (let ((fragment (next-token '() '(#\newline) | |
538 | "reading verbatim" port))) | |
539 | ;; We're reading the char after the 'fragment', which is | |
540 | ;; #\newline. | |
541 | (read-char port) | |
542 | (if (string=? fragment "@end verbatim") | |
543 | seed | |
544 | (loop (str-handler fragment "\n" seed)))))) | |
545 | ||
546 | ;; procedure+: read-arguments PORT | |
547 | ;; | |
548 | ;; This procedure reads and parses a production ArgumentList. | |
549 | ;; ArgumentList ::= S* Argument (S* , S* Argument)* S* | |
550 | ;; Argument ::= ([^@{},])* | |
551 | ;; | |
552 | ;; Arguments are the things in braces, i.e @ref{my node} has one | |
553 | ;; argument, "my node". Most commands taking braces actually don't have | |
554 | ;; arguments, they process text. For example, in | |
555 | ;; @emph{@strong{emphasized}}, the emph takes text, because the parse | |
556 | ;; continues into the braces. | |
557 | ;; | |
558 | ;; Any whitespace within Argument is replaced with a single space. | |
559 | ;; Whitespace around an Argument is trimmed. | |
560 | ;; | |
561 | ;; The procedure returns a list of arguments. Afterwards the current | |
562 | ;; character will be after the final #\}. | |
563 | ||
564 | (define (read-arguments port stop-char) | |
565 | (define (split str) | |
566 | (read-char port) ;; eat the delimiter | |
567 | (let ((ret (map (lambda (x) (if (string-null? x) #f x)) | |
568 | (map string-trim-both (string-split str #\,))))) | |
569 | (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret))) | |
570 | '() | |
571 | ret))) | |
572 | (split (next-token '() (list stop-char) | |
573 | "arguments of @-command" port))) | |
574 | ||
575 | ;; procedure+: complete-start-command COMMAND PORT | |
576 | ;; | |
577 | ;; This procedure is to complete parsing of an @-command. The procedure | |
578 | ;; must be called after the command token has been read. COMMAND is a | |
579 | ;; TAG-NAME. | |
580 | ;; | |
581 | ;; This procedure returns several values: | |
582 | ;; COMMAND: a symbol. | |
583 | ;; ARGUMENTS: command's arguments, as an alist. | |
584 | ;; CONTENT-MODEL: the content model of the command. | |
585 | ;; | |
586 | ;; On exit, the current position in PORT will depend on the CONTENT-MODEL. | |
587 | ;; | |
588 | ;; Content model Port position | |
589 | ;; ============= ============= | |
590 | ;; INLINE-TEXT One character after the #\{. | |
591 | ;; INLINE-ARGS The first character after the #\}. | |
592 | ;; EOL-TEXT The first non-whitespace character after the command. | |
593 | ;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT | |
594 | ;; The first character on the next line. | |
595 | ;; PARAGRAPH, ITEM, EMPTY-COMMAND | |
596 | ;; The first character after the command. | |
597 | ||
598 | (define (arguments->attlist port args arg-names) | |
599 | (let loop ((in args) (names arg-names) (opt? #f) (out '())) | |
600 | (cond | |
601 | ((symbol? names) ;; a rest arg | |
602 | (reverse (if (null? in) out (acons names in out)))) | |
603 | ((and (not (null? names)) (eq? (car names) #:opt)) | |
604 | (loop in (cdr names) #t out)) | |
605 | ((null? in) | |
606 | (if (or (null? names) opt?) | |
607 | (reverse out) | |
608 | (parser-error port "@-command expected more arguments:" | |
609 | args arg-names names))) | |
610 | ((null? names) | |
611 | (parser-error port "@-command didn't expect more arguments:" in)) | |
612 | ((not (car in)) | |
613 | (or (and opt? (loop (cdr in) (cdr names) opt? out)) | |
614 | (parser-error "@-command missing required argument" | |
615 | (car names)))) | |
616 | (else | |
617 | (loop (cdr in) (cdr names) opt? | |
618 | (cons (list (car names) (car in)) out)))))) | |
619 | ||
620 | (define (parse-table-args command port) | |
621 | (let* ((line (string-trim-both (read-text-line port))) | |
622 | (length (string-length line))) | |
623 | (define (get-formatter) | |
624 | (or (and (not (zero? length)) | |
625 | (eq? (string-ref line 0) #\@) | |
626 | (let ((f (string->symbol (substring line 1)))) | |
627 | (or (inline-content? (cadr (command-spec f))) | |
628 | (parser-error | |
629 | port "@item formatter must be INLINE" f)) | |
630 | f)) | |
05c29c5a | 631 | (parser-error port "Invalid @item formatter" line))) |
47f3ce52 AW |
632 | (case command |
633 | ((enumerate) | |
634 | (if (zero? length) | |
635 | '() | |
636 | `((start | |
637 | ,(if (or (and (eq? length 1) | |
638 | (char-alphabetic? (string-ref line 0))) | |
639 | (string-every char-numeric? line)) | |
640 | line | |
641 | (parser-error | |
642 | port "Invalid enumerate start" line)))))) | |
643 | ((itemize) | |
644 | `((bullet | |
645 | ,(or (and (eq? length 1) line) | |
646 | (and (string-null? line) '(bullet)) | |
647 | (list (get-formatter)))))) | |
648 | (else ;; tables of various varieties | |
649 | `((formatter (,(get-formatter)))))))) | |
650 | ||
651 | (define (complete-start-command command port) | |
652 | (define (get-arguments type arg-names stop-char) | |
653 | (arguments->attlist port (read-arguments port stop-char) arg-names)) | |
654 | ||
655 | (let* ((spec (command-spec command)) | |
656 | (type (cadr spec)) | |
657 | (arg-names (cddr spec))) | |
658 | (case type | |
659 | ((INLINE-TEXT) | |
660 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
661 | (values command '() type)) | |
662 | ((INLINE-ARGS) | |
663 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
664 | (values command (get-arguments type arg-names #\}) type)) | |
665 | ((EOL-ARGS) | |
666 | (values command (get-arguments type arg-names #\newline) type)) | |
667 | ((ENVIRON ENTRY INDEX) | |
668 | (skip-horizontal-whitespace port) | |
669 | (values command (parse-environment-args command port) type)) | |
670 | ((TABLE-ENVIRON) | |
671 | (skip-horizontal-whitespace port) | |
672 | (values command (parse-table-args command port) type)) | |
673 | ((EOL-TEXT) | |
674 | (skip-horizontal-whitespace port) | |
675 | (values command '() type)) | |
676 | ((EOL-TEXT-ARGS) | |
677 | (skip-horizontal-whitespace port) | |
678 | (values command (parse-eol-text-args command port) type)) | |
679 | ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT) | |
680 | (values command '() type)) | |
681 | (else ;; INCLUDE shouldn't get here | |
682 | (parser-error port "can't happen"))))) | |
683 | ||
684 | ;;----------------------------------------------------------------------------- | |
685 | ;; Higher-level parsers and scanners | |
686 | ;; | |
687 | ;; They parse productions corresponding entire @-commands. | |
688 | ||
689 | ;; Only reads @settitle, leaves it to the command parser to finish | |
690 | ;; reading the title. | |
691 | (define (take-until-settitle port) | |
692 | (or (find-string-from-port? "\n@settitle " port) | |
693 | (parser-error port "No \\n@settitle found")) | |
694 | (skip-horizontal-whitespace port) | |
695 | (and (eq? (peek-char port) #\newline) | |
696 | (parser-error port "You have a @settitle, but no title"))) | |
697 | ||
698 | ;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED | |
699 | ;; | |
700 | ;; This procedure is to read the CharData of a texinfo document. | |
701 | ;; | |
702 | ;; text ::= (CharData | Command)* | |
703 | ;; | |
704 | ;; The procedure reads CharData and stops at @-commands (or | |
705 | ;; environments). It also stops at an open or close brace. | |
706 | ;; | |
707 | ;; port | |
708 | ;; a PORT to read | |
709 | ;; expect-eof? | |
710 | ;; a boolean indicating if EOF is normal, i.e., the character | |
711 | ;; data may be terminated by the EOF. EOF is normal | |
712 | ;; while processing the main document. | |
713 | ;; preserve-ws? | |
714 | ;; a boolean indicating if we are within a whitespace-preserving | |
715 | ;; environment. If #t, suppress paragraph detection. | |
716 | ;; str-handler | |
717 | ;; a STR-HANDLER, see read-verbatim-body | |
718 | ;; seed | |
719 | ;; an argument passed to the first invocation of STR-HANDLER. | |
720 | ;; | |
721 | ;; The procedure returns two results: SEED and TOKEN. The SEED is the | |
722 | ;; result of the last invocation of STR-HANDLER, or the original seed if | |
723 | ;; STR-HANDLER was never called. | |
724 | ;; | |
725 | ;; TOKEN can be either an eof-object (this can happen only if expect-eof? | |
726 | ;; was #t), or a texinfo token denoting the start or end of a tag. | |
727 | ||
728 | ;; read-char-data port expect-eof? preserve-ws? str-handler seed | |
729 | (define read-char-data | |
730 | (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline))) | |
731 | (define (handle str-handler str1 str2 seed) | |
732 | (if (and (string-null? str1) (string-null? str2)) | |
733 | seed | |
734 | (str-handler str1 str2 seed))) | |
735 | ||
736 | (lambda (port expect-eof? preserve-ws? str-handler seed) | |
737 | (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof))) | |
738 | (let loop ((seed seed)) | |
739 | (let* ((fragment (next-token '() end-chars "reading char data" port)) | |
740 | (term-char (peek-char port))) ; one of end-chars | |
741 | (cond | |
742 | ((eof-object? term-char) ; only if expect-eof? | |
743 | (values (handle str-handler fragment "" seed) term-char)) | |
744 | ((memq term-char '(#\@ #\{ #\})) | |
745 | (values (handle str-handler fragment "" seed) | |
746 | (case term-char | |
747 | ((#\@) (read-command-token port)) | |
748 | ((#\{) (make-token 'START '*braces*)) | |
749 | ((#\}) (read-char port) (make-token 'END #f))))) | |
750 | ((eq? term-char #\newline) | |
751 | ;; Always significant, unless directly before an end token. | |
752 | (let ((c (peek-next-char port))) | |
753 | (cond | |
754 | ((eof-object? c) | |
755 | (or expect-eof? | |
756 | (parser-error port "EOF while reading char data")) | |
757 | (values (handle str-handler fragment "" seed) c)) | |
758 | ((eq? c #\@) | |
759 | (let* ((token (read-command-token port)) | |
760 | (end? (eq? (token-kind token) 'END))) | |
761 | (values | |
762 | (handle str-handler fragment (if end? "" " ") seed) | |
763 | token))) | |
764 | ((and (not preserve-ws?) (eq? c #\newline)) | |
765 | ;; paragraph-separator ::= #\newline #\newline+ | |
766 | (skip-while '(#\newline) port) | |
767 | (skip-horizontal-whitespace port) | |
768 | (values (handle str-handler fragment "" seed) | |
769 | (make-token 'PARA 'para))) | |
770 | (else | |
771 | (loop (handle str-handler fragment | |
772 | (if preserve-ws? "\n" " ") seed))))))))))))) | |
773 | ||
774 | ; procedure+: assert-token TOKEN KIND NAME | |
775 | ; Make sure that TOKEN is of anticipated KIND and has anticipated NAME | |
776 | (define (assert-token token kind name) | |
777 | (or (and (token? token) | |
778 | (eq? kind (token-kind token)) | |
779 | (equal? name (token-head token))) | |
780 | (parser-error #f "Expecting @end for " name ", got " token))) | |
781 | ||
782 | ;;======================================================================== | |
783 | ;; Highest-level parsers: Texinfo to SXML | |
784 | ||
785 | ;; These parsers are a set of syntactic forms to instantiate a SSAX | |
786 | ;; parser. The user tells what to do with the parsed character and | |
787 | ;; element data. These latter handlers determine if the parsing follows a | |
788 | ;; SAX or a DOM model. | |
789 | ||
790 | ;; syntax: make-command-parser fdown fup str-handler | |
791 | ||
792 | ;; Create a parser to parse and process one element, including its | |
793 | ;; character content or children elements. The parser is typically | |
794 | ;; applied to the root element of a document. | |
795 | ||
796 | ;; fdown | |
797 | ;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED | |
798 | ;; | |
799 | ;; This procedure is to generate the seed to be passed to handlers | |
800 | ;; that process the content of the element. This is the function | |
801 | ;; identified as 'fdown' in the denotational semantics of the XML | |
802 | ;; parser given in the title comments to (sxml ssax). | |
803 | ;; | |
804 | ;; fup | |
805 | ;; procedure COMMAND ARGUMENTS PARENT-SEED SEED | |
806 | ;; | |
807 | ;; This procedure is called when parsing of COMMAND is finished. | |
808 | ;; The SEED is the result from the last content parser (or from | |
809 | ;; fdown if the element has the empty content). PARENT-SEED is the | |
810 | ;; same seed as was passed to fdown. The procedure is to generate a | |
811 | ;; seed that will be the result of the element parser. This is the | |
812 | ;; function identified as 'fup' in the denotational semantics of | |
813 | ;; the XML parser given in the title comments to (sxml ssax). | |
814 | ;; | |
815 | ;; str-handler | |
816 | ;; A STR-HANDLER, see read-verbatim-body | |
817 | ;; | |
818 | ||
819 | ;; The generated parser is a | |
820 | ;; procedure COMMAND PORT SEED | |
821 | ;; | |
822 | ;; The procedure must be called *after* the command token has been read. | |
823 | ||
824 | (define (read-include-file-name port) | |
825 | (let ((x (string-trim-both (read-eof-line port)))) | |
826 | (if (string-null? x) | |
827 | (error "no file listed") | |
828 | x))) ;; fixme: should expand @value{} references | |
829 | ||
830 | (define (sxml->node-name sxml) | |
831 | "Turn some sxml string into a valid node name." | |
832 | (let loop ((in (string->list (sxml->string sxml))) (out '())) | |
833 | (if (null? in) | |
834 | (apply string (reverse out)) | |
835 | (if (memq (car in) '(#\{ #\} #\@ #\,)) | |
836 | (loop (cdr in) out) | |
837 | (loop (cdr in) (cons (car in) out)))))) | |
838 | ||
839 | (define (index command arguments fdown fup parent-seed) | |
840 | (case command | |
841 | ((deftp defcv defivar deftypeivar defop deftypeop defmethod | |
842 | deftypemethod defopt defvr defvar deftypevr deftypevar deffn | |
843 | deftypefn defspec defmac defun deftypefun) | |
844 | (let ((args `((name ,(string-append (symbol->string command) "-" | |
845 | (cadr (assq 'name arguments))))))) | |
846 | (fup 'anchor args parent-seed | |
847 | (fdown 'anchor args 'INLINE-ARGS '())))) | |
848 | ((cindex findex vindex kindex pindex tindex) | |
849 | (let ((args `((name ,(string-append (symbol->string command) "-" | |
850 | (sxml->node-name | |
851 | (assq 'entry arguments))))))) | |
852 | (fup 'anchor args parent-seed | |
853 | (fdown 'anchor args 'INLINE-ARGS '())))) | |
854 | (else parent-seed))) | |
855 | ||
856 | (define (make-command-parser fdown fup str-handler) | |
857 | (lambda (command port seed) | |
858 | (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed)) | |
859 | (let*-values (((command arguments expected-content) | |
860 | (complete-start-command command port))) | |
861 | (let* ((parent-seed (index command arguments fdown fup parent-seed)) | |
862 | (seed (fdown command arguments expected-content parent-seed)) | |
863 | (eof-closes? (or (memq command '(texinfo para *fragment*)) | |
864 | (eq? expected-content 'EOL-TEXT))) | |
865 | (sig-ws? (or sig-ws? (space-significant? command))) | |
866 | (up (lambda (s) (fup command arguments parent-seed s))) | |
867 | (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s))) | |
868 | (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s))))) | |
869 | ||
870 | (define (port-for-content) | |
871 | (if (eq? expected-content 'EOL-TEXT) | |
872 | (call-with-input-string (read-text-line port) identity) | |
873 | port)) | |
874 | ||
875 | (cond | |
876 | ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX | |
877 | EOL-TEXT-ARGS)) | |
878 | ;; empty or finished by complete-start-command | |
879 | (up seed)) | |
880 | ((eq? command 'verbatim) | |
881 | (up (read-verbatim-body port str-handler seed))) | |
882 | (else | |
883 | (let loop ((port (port-for-content)) | |
884 | (expect-eof? eof-closes?) | |
885 | (end-para identity) | |
886 | (need-break? (and (not sig-ws?) | |
887 | (memq expected-content | |
888 | '(ENVIRON TABLE-ENVIRON | |
889 | ENTRY ITEM FRAGMENT)))) | |
890 | (seed seed)) | |
891 | (cond | |
892 | ((and need-break? (or sig-ws? (skip-whitespace port)) | |
893 | (not (memq (peek-char port) '(#\@ #\}))) | |
894 | (not (eof-object? (peek-char port)))) | |
895 | ;; Even if we have an @, it might be inline -- check | |
896 | ;; that later | |
897 | (let ((seed (end-para seed))) | |
898 | (loop port expect-eof? (make-end-para seed) #f | |
899 | (new-para seed)))) | |
900 | (else | |
901 | (let*-values (((seed token) | |
902 | (read-char-data | |
903 | port expect-eof? sig-ws? str-handler seed))) | |
904 | (cond | |
905 | ((eof-object? token) | |
906 | (case expect-eof? | |
907 | ((include #f) (end-para seed)) | |
908 | (else (up (end-para seed))))) | |
909 | (else | |
910 | (case (token-kind token) | |
911 | ((STRING) | |
912 | ;; this is only @-commands that escape | |
913 | ;; characters: @}, @@, @{ -- new para if need-break | |
914 | (let ((seed ((if need-break? end-para identity) seed))) | |
915 | (loop port expect-eof? | |
916 | (if need-break? (make-end-para seed) end-para) #f | |
917 | (str-handler (token-head token) "" | |
918 | ((if need-break? new-para identity) | |
919 | seed))))) | |
920 | ((END) | |
921 | ;; The end will only have a name if it's for an | |
922 | ;; environment | |
923 | (cond | |
924 | ((memq command '(item entry)) | |
925 | (let ((spec (command-spec (token-head token)))) | |
926 | (or (eq? (cadr spec) 'TABLE-ENVIRON) | |
927 | (parser-error | |
928 | port "@item not ended by @end table/enumerate/itemize" | |
929 | token)))) | |
930 | ((eq? expected-content 'ENVIRON) | |
931 | (assert-token token 'END command))) | |
932 | (up (end-para seed))) | |
933 | ((ITEM) | |
934 | (cond | |
935 | ((memq command '(enumerate itemize)) | |
936 | (up (visit 'item port sig-ws? (end-para seed)))) | |
937 | ((eq? expected-content 'TABLE-ENVIRON) | |
938 | (up (visit 'entry port sig-ws? (end-para seed)))) | |
939 | ((memq command '(item entry)) | |
940 | (visit command port sig-ws? (up (end-para seed)))) | |
941 | (else | |
942 | (parser-error | |
943 | port "@item must be within a table environment" | |
944 | command)))) | |
945 | ((PARA) | |
946 | ;; examine valid paragraphs? | |
947 | (loop port expect-eof? end-para (not sig-ws?) seed)) | |
948 | ((INCLUDE) | |
949 | ;; Recurse for include files | |
950 | (let ((seed (call-with-file-and-dir | |
951 | (read-include-file-name port) | |
952 | (lambda (port) | |
953 | (loop port 'include end-para | |
954 | need-break? seed))))) | |
955 | (loop port expect-eof? end-para need-break? seed))) | |
956 | ((START) ; Start of an @-command | |
957 | (let* ((head (token-head token)) | |
958 | (type (cadr (command-spec head))) | |
959 | (inline? (inline-content? type)) | |
960 | (seed ((if (and inline? (not need-break?)) | |
961 | identity end-para) seed)) | |
962 | (end-para (if inline? | |
963 | (if need-break? (make-end-para seed) | |
964 | end-para) | |
965 | identity)) | |
966 | (new-para (if (and inline? need-break?) | |
967 | new-para identity))) | |
968 | (loop port expect-eof? end-para (not inline?) | |
969 | (visit head port sig-ws? (new-para seed))))) | |
970 | (else | |
971 | (parser-error port "Unknown token type" token)))))))))))))))) | |
972 | ||
973 | ;; procedure: reverse-collect-str-drop-ws fragments | |
974 | ;; | |
975 | ;; Given the list of fragments (some of which are text strings), reverse | |
976 | ;; the list and concatenate adjacent text strings. We also drop | |
977 | ;; "unsignificant" whitespace, that is, whitespace in front, behind and | |
978 | ;; between elements. The whitespace that is included in character data | |
979 | ;; is not affected. | |
980 | (define (reverse-collect-str-drop-ws fragments) | |
981 | (cond | |
982 | ((null? fragments) ; a shortcut | |
983 | '()) | |
984 | ((and (string? (car fragments)) ; another shortcut | |
985 | (null? (cdr fragments)) ; remove single ws-only string | |
986 | (string-whitespace? (car fragments))) | |
987 | '()) | |
988 | (else | |
989 | (let loop ((fragments fragments) (result '()) (strs '()) | |
990 | (all-whitespace? #t)) | |
991 | (cond | |
992 | ((null? fragments) | |
993 | (if all-whitespace? | |
994 | result ; remove leading ws | |
995 | (cons (apply string-append strs) result))) | |
996 | ((string? (car fragments)) | |
997 | (loop (cdr fragments) result (cons (car fragments) strs) | |
998 | (and all-whitespace? | |
999 | (string-whitespace? (car fragments))))) | |
1000 | (else | |
1001 | (loop (cdr fragments) | |
1002 | (cons | |
1003 | (car fragments) | |
1004 | (cond | |
1005 | ((null? strs) result) | |
1006 | (all-whitespace? | |
1007 | (if (null? result) | |
1008 | result ; remove trailing whitespace | |
1009 | (cons " " result))); replace interstitial ws with | |
1010 | ; one space | |
1011 | (else | |
1012 | (cons (apply string-append strs) result)))) | |
1013 | '() #t))))))) | |
1014 | ||
1015 | (define (make-dom-parser) | |
1016 | (make-command-parser | |
1017 | (lambda (command args content seed) ; fdown | |
1018 | '()) | |
1019 | (lambda (command args parent-seed seed) ; fup | |
1020 | (let ((seed (reverse-collect-str-drop-ws seed))) | |
1021 | (acons command | |
1022 | (if (null? args) seed (acons '% args seed)) | |
1023 | parent-seed))) | |
1024 | (lambda (string1 string2 seed) ; str-handler | |
1025 | (if (string-null? string2) | |
1026 | (cons string1 seed) | |
1027 | (cons* string2 string1 seed))))) | |
1028 | ||
1029 | (define parse-environment-args | |
1030 | (let ((parser (make-dom-parser))) | |
1031 | ;; duplicate arguments->attlist to avoid unnecessary splitting | |
1032 | (lambda (command port) | |
1033 | (let ((args (cdar (parser '*ENVIRON-ARGS* port '()))) | |
1034 | (arg-names (cddr (command-spec command)))) | |
1035 | (cond | |
1036 | ((not arg-names) | |
1037 | (if (null? args) '() | |
1038 | (parser-error port "@-command doesn't take args" command))) | |
1039 | ((eq? arg-names #t) | |
1040 | (list (cons 'arguments args))) | |
1041 | (else | |
1042 | (let loop ((args args) (arg-names arg-names) (out '())) | |
1043 | (cond | |
1044 | ((null? arg-names) | |
1045 | (if (null? args) (reverse! out) | |
1046 | (parser-error port "@-command didn't expect more args" | |
1047 | command args))) | |
1048 | ((symbol? arg-names) | |
1049 | (reverse! (acons arg-names args out))) | |
1050 | ((null? args) | |
1051 | (parser-error port "@-command expects more args" | |
1052 | command arg-names)) | |
1053 | ((and (string? (car args)) (string-index (car args) #\space)) | |
1054 | => (lambda (i) | |
1055 | (let ((rest (substring/shared (car args) (1+ i)))) | |
1056 | (if (zero? i) | |
1057 | (loop (cons rest (cdr args)) arg-names out) | |
1058 | (loop (cons rest (cdr args)) (cdr arg-names) | |
1059 | (cons (list (car arg-names) | |
1060 | (substring (car args) 0 i)) | |
1061 | out)))))) | |
1062 | (else | |
1063 | (loop (cdr args) (cdr arg-names) | |
1064 | (if (and (pair? (car args)) (eq? (caar args) '*braces*)) | |
1065 | (acons (car arg-names) (cdar args) out) | |
1066 | (cons (list (car arg-names) (car args)) out)))))))))))) | |
1067 | ||
1068 | (define (parse-eol-text-args command port) | |
1069 | ;; perhaps parse-environment-args should be named more | |
1070 | ;; generically. | |
1071 | (parse-environment-args command port)) | |
1072 | ||
1073 | ;; procedure: texi-fragment->stexi STRING | |
1074 | ;; | |
1075 | ;; A DOM parser for a texinfo fragment STRING. | |
1076 | ;; | |
1077 | ;; The procedure returns an SXML tree headed by the special tag, | |
1078 | ;; *fragment*. | |
1079 | ||
1080 | (define (texi-fragment->stexi string-or-port) | |
1081 | "Parse the texinfo commands in @var{string-or-port}, and return the | |
1082 | resultant stexi tree. The head of the tree will be the special command, | |
1083 | @code{*fragment*}." | |
1084 | (define (parse port) | |
1085 | (postprocess (car ((make-dom-parser) '*fragment* port '())))) | |
1086 | (if (input-port? string-or-port) | |
1087 | (parse string-or-port) | |
1088 | (call-with-input-string string-or-port parse))) | |
1089 | ||
1090 | ;; procedure: texi->stexi PORT | |
1091 | ;; | |
1092 | ;; This is an instance of a SSAX parser above that returns an SXML | |
1093 | ;; representation of the texinfo document ready to be read at PORT. | |
1094 | ;; | |
1095 | ;; The procedure returns an SXML tree. The port points to the | |
1096 | ;; first character after the @bye, or to the end of the file. | |
1097 | ||
1098 | (define (texi->stexi port) | |
1099 | "Read a full texinfo document from @var{port} and return the parsed | |
1100 | stexi tree. The parsing will start at the @code{@@settitle} and end at | |
1101 | @code{@@bye} or EOF." | |
1102 | (let ((parser (make-dom-parser))) | |
1103 | (take-until-settitle port) | |
1104 | (postprocess (car (parser 'texinfo port '()))))) | |
1105 | ||
1106 | (define (car-eq? x y) (and (pair? x) (eq? (car x) y))) | |
1107 | (define (make-contents tree) | |
1108 | (define (lp in out depth) | |
1109 | (cond | |
1110 | ((null? in) (values in (cons 'enumerate (reverse! out)))) | |
1111 | ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4)) | |
1112 | => (lambda (new-depth) | |
1113 | (let ((node-name (and (car-eq? (car in) 'node) | |
1114 | (cadr (assq 'name (cdadar in)))))) | |
1115 | (cond | |
1116 | ((< new-depth depth) | |
1117 | (values in (cons 'enumerate (reverse! out)))) | |
1118 | ((> new-depth depth) | |
1119 | (let ((out-cdr (if (null? out) '() (cdr out))) | |
1120 | (out-car (if (null? out) (list 'item) (car out)))) | |
1121 | (let*-values (((new-in new-out) (lp in '() (1+ depth)))) | |
1122 | (lp new-in | |
1123 | (cons (append out-car (list new-out)) out-cdr) | |
1124 | depth)))) | |
1125 | (else ;; same depth | |
1126 | (lp (cddr in) | |
1127 | (cons | |
1128 | `(item (para | |
1129 | ,@(if node-name | |
1130 | `((ref (% (node ,node-name)))) | |
1131 | (cdadr in)))) | |
1132 | out) | |
1133 | depth)))))) | |
1134 | (else (lp (cdr in) out depth)))) | |
1135 | (let*-values (((_ contents) (lp tree '() 1))) | |
1136 | `((chapheading "Table of Contents") ,contents))) | |
1137 | ||
1138 | (define (trim-whitespace str trim-left? trim-right?) | |
1139 | (let* ((left-space? (and (not trim-left?) | |
1140 | (string-prefix? " " str))) | |
1141 | (right-space? (and (not trim-right?) | |
1142 | (string-suffix? " " str))) | |
1143 | (tail (append! (string-tokenize str) | |
1144 | (if right-space? '("") '())))) | |
1145 | (string-join (if left-space? (cons "" tail) tail)))) | |
1146 | ||
1147 | (define (postprocess tree) | |
1148 | (define (loop in out state first? sig-ws?) | |
1149 | (cond | |
1150 | ((null? in) | |
1151 | (values (reverse! out) state)) | |
1152 | ((string? (car in)) | |
1153 | (loop (cdr in) | |
1154 | (cons (if sig-ws? (car in) | |
1155 | (trim-whitespace (car in) first? (null? (cdr in)))) | |
1156 | out) | |
1157 | state #f sig-ws?)) | |
1158 | ((pair? (car in)) | |
1159 | (case (caar in) | |
1160 | ((set) | |
1161 | (if (null? (cdar in)) (error "@set missing arguments" in)) | |
1162 | (if (string? (cadar in)) | |
1163 | (let ((i (string-index (cadar in) #\space))) | |
1164 | (if i | |
1165 | (loop (cdr in) out | |
1166 | (acons (substring (cadar in) 0 i) | |
1167 | (cons (substring (cadar in) (1+ i)) (cddar in)) | |
1168 | state) | |
1169 | #f sig-ws?) | |
1170 | (loop (cdr in) out (acons (cadar in) (cddar in) state) | |
1171 | #f sig-ws?))) | |
1172 | (error "expected a constant to define for @set" in))) | |
1173 | ((value) | |
1174 | (loop (fold-right cons (cdr in) | |
1175 | (or (and=> | |
1176 | (assoc (cadr (assq 'key (cdadar in))) state) cdr) | |
1177 | (error "unknown value" (cdadar in) state))) | |
1178 | out | |
1179 | state #f sig-ws?)) | |
1180 | ((copying) | |
1181 | (loop (cdr in) out (cons (car in) state) #f sig-ws?)) | |
1182 | ((insertcopying) | |
1183 | (loop (fold-right cons (cdr in) | |
1184 | (or (cdr (assoc 'copying state)) | |
1185 | (error "copying isn't set yet"))) | |
1186 | out | |
1187 | state #f sig-ws?)) | |
1188 | ((contents) | |
1189 | (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?)) | |
1190 | (else | |
1191 | (let*-values (((kid-out state) | |
1192 | (loop (car in) '() state #t | |
1193 | (or sig-ws? (space-significant? (caar in)))))) | |
1194 | (loop (cdr in) (cons kid-out out) state #f sig-ws?))))) | |
1195 | (else ; a symbol | |
1196 | (loop (cdr in) (cons (car in) out) state #t sig-ws?)))) | |
1197 | ||
1198 | (call-with-values | |
1199 | (lambda () (loop tree '() '() #t #f)) | |
1200 | (lambda (out state) out))) | |
1201 | ||
1202 | ;; Replace % with texinfo-arguments. | |
1203 | (define (stexi->sxml tree) | |
1204 | "Transform the stexi tree @var{tree} into sxml. This involves | |
1205 | replacing the @code{%} element that keeps the texinfo arguments with an | |
1206 | element for each argument. | |
1207 | ||
1208 | FIXME: right now it just changes % to @code{texinfo-arguments} -- that | |
1209 | doesn't hang with the idea of making a dtd at some point" | |
1210 | (pre-post-order | |
1211 | tree | |
1212 | `((% . ,(lambda (x . t) (cons 'texinfo-arguments t))) | |
1213 | (*text* . ,(lambda (x t) t)) | |
1214 | (*default* . ,(lambda (x . t) (cons x t)))))) | |
1215 | ||
1216 | ;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5 | |
1217 | ;;; texinfo.scm ends here |