Commit | Line | Data |
---|---|---|
47f3ce52 AW |
1 | ;;;; (texinfo) -- parsing of texinfo into SXML |
2 | ;;;; | |
fc2b8f6c | 3 | ;;;; Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. |
47f3ce52 AW |
4 | ;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com> |
5 | ;;;; Copyright (C) 2001,2002 Oleg Kiselyov <oleg at pobox dot com> | |
6 | ;;;; | |
7 | ;;;; This file is based on SSAX's SSAX.scm. | |
8 | ;;;; | |
9 | ;;;; This library is free software; you can redistribute it and/or | |
10 | ;;;; modify it under the terms of the GNU Lesser General Public | |
11 | ;;;; License as published by the Free Software Foundation; either | |
12 | ;;;; version 3 of the License, or (at your option) any later version. | |
13 | ;;;; | |
14 | ;;;; This library is distributed in the hope that it will be useful, | |
15 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | ;;;; Lesser General Public License for more details. | |
18 | ;;;; | |
19 | ;;;; You should have received a copy of the GNU Lesser General Public | |
20 | ;;;; License along with this library; if not, write to the Free Software | |
21 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | \f | |
23 | ;;; Commentary: | |
24 | ;; | |
25 | ;; @subheading Texinfo processing in scheme | |
26 | ;; | |
27 | ;; This module parses texinfo into SXML. TeX will always be the | |
28 | ;; processor of choice for print output, of course. However, although | |
29 | ;; @code{makeinfo} works well for info, its output in other formats is | |
30 | ;; not very customizable, and the program is not extensible as a whole. | |
31 | ;; This module aims to provide an extensible framework for texinfo | |
32 | ;; processing that integrates texinfo into the constellation of SXML | |
33 | ;; processing tools. | |
34 | ;; | |
35 | ;; @subheading Notes on the SXML vocabulary | |
36 | ;; | |
37 | ;; Consider the following texinfo fragment: | |
38 | ;; | |
39 | ;;@example | |
40 | ;; @@deffn Primitive set-car! pair value | |
41 | ;; This function... | |
42 | ;; @@end deffn | |
43 | ;;@end example | |
44 | ;; | |
45 | ;; Logically, the category (Primitive), name (set-car!), and arguments | |
46 | ;; (pair value) are ``attributes'' of the deffn, with the description as | |
47 | ;; the content. However, texinfo allows for @@-commands within the | |
48 | ;; arguments to an environment, like @code{@@deffn}, which means that | |
49 | ;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand, | |
50 | ;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are | |
51 | ;; called ``arguments'', and are grouped under the special element, `%'. | |
52 | ;; | |
53 | ;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In | |
54 | ;; the interests of interoperability, this module provides a conversion | |
55 | ;; function to replace the `%' with `texinfo-arguments'. | |
56 | ;; | |
57 | ;;; Code: | |
58 | ||
59 | ;; Comparison to xml output of texinfo (which is rather undocumented): | |
60 | ;; Doesn't conform to texinfo dtd | |
61 | ;; No DTD at all, in fact :-/ | |
62 | ;; Actually outputs valid xml, after transforming % | |
63 | ;; Slower (although with caching the SXML that problem can go away) | |
64 | ;; Doesn't parse menus (although menus are shite) | |
65 | ;; Args go in a dedicated element, FBOFW | |
66 | ;; Definitions are handled a lot better | |
67 | ;; Does parse comments | |
68 | ;; Outputs only significant line breaks (a biggie!) | |
69 | ;; Nodes are treated as anchors, rather than content organizers (a biggie) | |
70 | ;; (more book-like, less info-like) | |
71 | ||
72 | ;; TODO | |
73 | ;; Integration: help, indexing, plain text | |
74 | ||
75 | (define-module (texinfo) | |
76 | #:use-module (sxml simple) | |
77 | #:use-module (sxml transform) | |
78 | #:use-module (sxml ssax input-parse) | |
79 | #:use-module (srfi srfi-1) | |
0c65f52c | 80 | #:use-module (srfi srfi-11) |
47f3ce52 AW |
81 | #:use-module (srfi srfi-13) |
82 | #:export (call-with-file-and-dir | |
83 | texi-command-specs | |
84 | texi-command-depth | |
85 | texi-fragment->stexi | |
86 | texi->stexi | |
87 | stexi->sxml)) | |
88 | ||
89 | ;; Some utilities | |
90 | ||
91 | (define (parser-error port message . rest) | |
05c29c5a | 92 | (apply throw 'parser-error port message rest)) |
47f3ce52 AW |
93 | |
94 | (define (call-with-file-and-dir filename proc) | |
95 | "Call the one-argument procedure @var{proc} with an input port that | |
96 | reads from @var{filename}. During the dynamic extent of @var{proc}'s | |
97 | execution, the current directory will be @code{(dirname | |
98 | @var{filename})}. This is useful for parsing documents that can include | |
99 | files by relative path name." | |
100 | (let ((current-dir (getcwd))) | |
101 | (dynamic-wind | |
102 | (lambda () (chdir (dirname filename))) | |
103 | (lambda () | |
104 | (call-with-input-file (basename filename) proc)) | |
105 | (lambda () (chdir current-dir))))) | |
106 | ||
47f3ce52 AW |
107 | ;;======================================================================== |
108 | ;; Reflection on the XML vocabulary | |
109 | ||
110 | (define texi-command-specs | |
111 | #; | |
112 | "A list of (@var{name} @var{content-model} . @var{args}) | |
113 | ||
114 | @table @var | |
115 | @item name | |
116 | The name of an @@-command, as a symbol. | |
117 | ||
118 | @item content-model | |
119 | A symbol indicating the syntactic type of the @@-command: | |
120 | @table @code | |
121 | @item EMPTY-COMMAND | |
122 | No content, and no @code{@@end} is coming | |
123 | @item EOL-ARGS | |
124 | Unparsed arguments until end of line | |
125 | @item EOL-TEXT | |
126 | Parsed arguments until end of line | |
127 | @item INLINE-ARGS | |
128 | Unparsed arguments ending with @code{#\\@}} | |
129 | @item INLINE-TEXT | |
130 | Parsed arguments ending with @code{#\\@}} | |
be52f329 AW |
131 | @item INLINE-TEXT-ARGS |
132 | Parsed arguments ending with @code{#\\@}} | |
47f3ce52 AW |
133 | @item ENVIRON |
134 | The tag is an environment tag, expect @code{@@end foo}. | |
135 | @item TABLE-ENVIRON | |
136 | Like ENVIRON, but with special parsing rules for its arguments. | |
137 | @item FRAGMENT | |
138 | For @code{*fragment*}, the command used for parsing fragments of | |
139 | texinfo documents. | |
140 | @end table | |
141 | ||
142 | @code{INLINE-TEXT} commands will receive their arguments within their | |
143 | bodies, whereas the @code{-ARGS} commands will receive them in their | |
144 | attribute list. | |
145 | ||
146 | @code{EOF-TEXT} receives its arguments in its body. | |
147 | ||
148 | @code{ENVIRON} commands have both: parsed arguments until the end of | |
149 | line, received through their attribute list, and parsed text until the | |
150 | @code{@@end}, received in their bodies. | |
151 | ||
152 | @code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in | |
153 | @code{ENVIRON}. | |
154 | ||
31d59769 AW |
155 | In addition, @code{ALIAS} can alias one command to another. The alias |
156 | will never be seen in parsed stexinfo. | |
157 | ||
47f3ce52 AW |
158 | There are four @@-commands that are treated specially. @code{@@include} |
159 | is a low-level token that will not be seen by higher-level parsers, so | |
160 | it has no content-model. @code{@@para} is the paragraph command, which | |
161 | is only implicit in the texinfo source. @code{@@item} has special | |
162 | syntax, as noted above, and @code{@@entry} is how this parser treats | |
163 | @code{@@item} commands within @code{@@table}, @code{@@ftable}, and | |
164 | @code{@@vtable}. | |
165 | ||
166 | Also, indexing commands (@code{@@cindex}, etc.) are treated specially. | |
167 | Their arguments are parsed, but they are needed before entering the | |
168 | element so that an anchor can be inserted into the text before the index | |
169 | entry. | |
170 | ||
171 | @item args | |
172 | Named arguments to the command, in the same format as the formals for a | |
173 | lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS}, | |
be52f329 | 174 | @code{INLINE-TEXT-ARGS}, @code{ENVIRON}, @code{TABLE-ENVIRON} commands. |
47f3ce52 AW |
175 | @end table" |
176 | '(;; Special commands | |
177 | (include #f) ;; this is a low-level token | |
178 | (para PARAGRAPH) | |
179 | (item ITEM) | |
180 | (entry ENTRY . heading) | |
181 | (noindent EMPTY-COMMAND) | |
182 | (*fragment* FRAGMENT) | |
183 | ||
184 | ;; Inline text commands | |
185 | (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant | |
186 | (bold INLINE-TEXT) | |
187 | (sample INLINE-TEXT) | |
188 | (samp INLINE-TEXT) | |
189 | (code INLINE-TEXT) | |
1d14478f | 190 | (math INLINE-TEXT) |
47f3ce52 AW |
191 | (kbd INLINE-TEXT) |
192 | (key INLINE-TEXT) | |
193 | (var INLINE-TEXT) | |
194 | (env INLINE-TEXT) | |
195 | (file INLINE-TEXT) | |
196 | (command INLINE-TEXT) | |
197 | (option INLINE-TEXT) | |
198 | (dfn INLINE-TEXT) | |
199 | (cite INLINE-TEXT) | |
200 | (acro INLINE-TEXT) | |
47f3ce52 AW |
201 | (email INLINE-TEXT) |
202 | (emph INLINE-TEXT) | |
203 | (strong INLINE-TEXT) | |
204 | (sample INLINE-TEXT) | |
205 | (sc INLINE-TEXT) | |
206 | (titlefont INLINE-TEXT) | |
207 | (asis INLINE-TEXT) | |
208 | (b INLINE-TEXT) | |
209 | (i INLINE-TEXT) | |
210 | (r INLINE-TEXT) | |
211 | (sansserif INLINE-TEXT) | |
212 | (slanted INLINE-TEXT) | |
213 | (t INLINE-TEXT) | |
214 | ||
215 | ;; Inline args commands | |
216 | (value INLINE-ARGS . (key)) | |
217 | (ref INLINE-ARGS . (node #:opt name section info-file manual)) | |
218 | (xref INLINE-ARGS . (node #:opt name section info-file manual)) | |
797b2aa6 LC |
219 | (pxref INLINE-TEXT-ARGS |
220 | . (node #:opt name section info-file manual)) | |
31d59769 | 221 | (url ALIAS . uref) |
3f826e3c | 222 | (uref INLINE-TEXT-ARGS . (url #:opt title replacement)) |
47f3ce52 AW |
223 | (anchor INLINE-ARGS . (name)) |
224 | (dots INLINE-ARGS . ()) | |
225 | (result INLINE-ARGS . ()) | |
226 | (bullet INLINE-ARGS . ()) | |
227 | (copyright INLINE-ARGS . ()) | |
228 | (tie INLINE-ARGS . ()) | |
229 | (image INLINE-ARGS . (file #:opt width height alt-text extension)) | |
230 | ||
be52f329 AW |
231 | ;; Inline parsed args commands |
232 | (acronym INLINE-TEXT-ARGS . (acronym #:opt meaning)) | |
233 | ||
47f3ce52 AW |
234 | ;; EOL args elements |
235 | (node EOL-ARGS . (name #:opt next previous up)) | |
236 | (c EOL-ARGS . all) | |
237 | (comment EOL-ARGS . all) | |
238 | (setchapternewpage EOL-ARGS . all) | |
239 | (sp EOL-ARGS . all) | |
240 | (page EOL-ARGS . ()) | |
241 | (vskip EOL-ARGS . all) | |
242 | (syncodeindex EOL-ARGS . all) | |
243 | (contents EOL-ARGS . ()) | |
244 | (shortcontents EOL-ARGS . ()) | |
245 | (summarycontents EOL-ARGS . ()) | |
246 | (insertcopying EOL-ARGS . ()) | |
247 | (dircategory EOL-ARGS . (category)) | |
248 | (top EOL-ARGS . (title)) | |
249 | (printindex EOL-ARGS . (type)) | |
406524ea | 250 | (paragraphindent EOL-ARGS . (indent)) |
47f3ce52 AW |
251 | |
252 | ;; EOL text commands | |
253 | (*ENVIRON-ARGS* EOL-TEXT) | |
254 | (itemx EOL-TEXT) | |
255 | (set EOL-TEXT) | |
256 | (center EOL-TEXT) | |
257 | (title EOL-TEXT) | |
258 | (subtitle EOL-TEXT) | |
259 | (author EOL-TEXT) | |
260 | (chapter EOL-TEXT) | |
261 | (section EOL-TEXT) | |
262 | (appendix EOL-TEXT) | |
263 | (appendixsec EOL-TEXT) | |
264 | (unnumbered EOL-TEXT) | |
265 | (unnumberedsec EOL-TEXT) | |
266 | (subsection EOL-TEXT) | |
267 | (subsubsection EOL-TEXT) | |
268 | (appendixsubsec EOL-TEXT) | |
269 | (appendixsubsubsec EOL-TEXT) | |
270 | (unnumberedsubsec EOL-TEXT) | |
271 | (unnumberedsubsubsec EOL-TEXT) | |
272 | (chapheading EOL-TEXT) | |
273 | (majorheading EOL-TEXT) | |
274 | (heading EOL-TEXT) | |
275 | (subheading EOL-TEXT) | |
276 | (subsubheading EOL-TEXT) | |
277 | ||
278 | (deftpx EOL-TEXT-ARGS . (category name . attributes)) | |
279 | (defcvx EOL-TEXT-ARGS . (category class name)) | |
280 | (defivarx EOL-TEXT-ARGS . (class name)) | |
281 | (deftypeivarx EOL-TEXT-ARGS . (class data-type name)) | |
282 | (defopx EOL-TEXT-ARGS . (category class name . arguments)) | |
283 | (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments)) | |
284 | (defmethodx EOL-TEXT-ARGS . (class name . arguments)) | |
285 | (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments)) | |
286 | (defoptx EOL-TEXT-ARGS . (name)) | |
287 | (defvrx EOL-TEXT-ARGS . (category name)) | |
288 | (defvarx EOL-TEXT-ARGS . (name)) | |
289 | (deftypevrx EOL-TEXT-ARGS . (category data-type name)) | |
290 | (deftypevarx EOL-TEXT-ARGS . (data-type name)) | |
291 | (deffnx EOL-TEXT-ARGS . (category name . arguments)) | |
292 | (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments)) | |
293 | (defspecx EOL-TEXT-ARGS . (name . arguments)) | |
294 | (defmacx EOL-TEXT-ARGS . (name . arguments)) | |
295 | (defunx EOL-TEXT-ARGS . (name . arguments)) | |
296 | (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments)) | |
297 | ||
298 | ;; Indexing commands | |
299 | (cindex INDEX . entry) | |
300 | (findex INDEX . entry) | |
301 | (vindex INDEX . entry) | |
302 | (kindex INDEX . entry) | |
303 | (pindex INDEX . entry) | |
304 | (tindex INDEX . entry) | |
305 | ||
306 | ;; Environment commands (those that need @end) | |
307 | (texinfo ENVIRON . title) | |
308 | (ignore ENVIRON . ()) | |
309 | (ifinfo ENVIRON . ()) | |
310 | (iftex ENVIRON . ()) | |
311 | (ifhtml ENVIRON . ()) | |
312 | (ifxml ENVIRON . ()) | |
313 | (ifplaintext ENVIRON . ()) | |
314 | (ifnotinfo ENVIRON . ()) | |
315 | (ifnottex ENVIRON . ()) | |
316 | (ifnothtml ENVIRON . ()) | |
317 | (ifnotxml ENVIRON . ()) | |
318 | (ifnotplaintext ENVIRON . ()) | |
319 | (titlepage ENVIRON . ()) | |
320 | (menu ENVIRON . ()) | |
321 | (direntry ENVIRON . ()) | |
322 | (copying ENVIRON . ()) | |
323 | (example ENVIRON . ()) | |
324 | (smallexample ENVIRON . ()) | |
325 | (display ENVIRON . ()) | |
326 | (smalldisplay ENVIRON . ()) | |
327 | (verbatim ENVIRON . ()) | |
328 | (format ENVIRON . ()) | |
329 | (smallformat ENVIRON . ()) | |
330 | (lisp ENVIRON . ()) | |
331 | (smalllisp ENVIRON . ()) | |
332 | (cartouche ENVIRON . ()) | |
333 | (quotation ENVIRON . ()) | |
334 | ||
335 | (deftp ENVIRON . (category name . attributes)) | |
336 | (defcv ENVIRON . (category class name)) | |
337 | (defivar ENVIRON . (class name)) | |
338 | (deftypeivar ENVIRON . (class data-type name)) | |
339 | (defop ENVIRON . (category class name . arguments)) | |
340 | (deftypeop ENVIRON . (category class data-type name . arguments)) | |
341 | (defmethod ENVIRON . (class name . arguments)) | |
342 | (deftypemethod ENVIRON . (class data-type name . arguments)) | |
343 | (defopt ENVIRON . (name)) | |
344 | (defvr ENVIRON . (category name)) | |
345 | (defvar ENVIRON . (name)) | |
346 | (deftypevr ENVIRON . (category data-type name)) | |
347 | (deftypevar ENVIRON . (data-type name)) | |
348 | (deffn ENVIRON . (category name . arguments)) | |
349 | (deftypefn ENVIRON . (category data-type name . arguments)) | |
350 | (defspec ENVIRON . (name . arguments)) | |
351 | (defmac ENVIRON . (name . arguments)) | |
352 | (defun ENVIRON . (name . arguments)) | |
353 | (deftypefun ENVIRON . (data-type name . arguments)) | |
354 | ||
355 | (table TABLE-ENVIRON . (formatter)) | |
356 | (itemize TABLE-ENVIRON . (formatter)) | |
357 | (enumerate TABLE-ENVIRON . (start)) | |
358 | (ftable TABLE-ENVIRON . (formatter)) | |
359 | (vtable TABLE-ENVIRON . (formatter)))) | |
360 | ||
361 | (define command-depths | |
362 | '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4) | |
363 | (top . 0) (unnumbered . 1) (unnumberedsec . 2) | |
364 | (unnumberedsubsec . 3) (unnumberedsubsubsec . 4) | |
365 | (appendix . 1) (appendixsec . 2) (appendixsection . 2) | |
366 | (appendixsubsec . 3) (appendixsubsubsec . 4))) | |
367 | (define (texi-command-depth command max-depth) | |
368 | "Given the texinfo command @var{command}, return its nesting level, or | |
369 | @code{#f} if it nests too deep for @var{max-depth}. | |
370 | ||
371 | Examples: | |
372 | @example | |
05c29c5a AW |
373 | (texi-command-depth 'chapter 4) @result{} 1 |
374 | (texi-command-depth 'top 4) @result{} 0 | |
375 | (texi-command-depth 'subsection 4) @result{} 3 | |
376 | (texi-command-depth 'appendixsubsec 4) @result{} 3 | |
377 | (texi-command-depth 'subsection 2) @result{} #f | |
47f3ce52 AW |
378 | @end example" |
379 | (let ((depth (and=> (assq command command-depths) cdr))) | |
380 | (and depth (<= depth max-depth) depth))) | |
381 | ||
382 | ;; The % is for arguments | |
383 | (define (space-significant? command) | |
384 | (memq command | |
385 | '(example smallexample verbatim lisp smalllisp menu %))) | |
386 | ||
387 | ;; Like a DTD for texinfo | |
388 | (define (command-spec command) | |
dc7a9cef AW |
389 | (let ((spec (assq command texi-command-specs))) |
390 | (cond | |
391 | ((not spec) | |
392 | (parser-error #f "Unknown command" command)) | |
393 | ((eq? (cadr spec) 'ALIAS) | |
394 | (command-spec (cddr spec))) | |
395 | (else | |
396 | spec)))) | |
47f3ce52 AW |
397 | |
398 | (define (inline-content? content) | |
be52f329 AW |
399 | (case content |
400 | ((INLINE-TEXT INLINE-ARGS INLINE-TEXT-ARGS) #t) | |
401 | (else #f))) | |
47f3ce52 AW |
402 | |
403 | ||
404 | ;;======================================================================== | |
405 | ;; Lower-level parsers and scanners | |
406 | ;; | |
407 | ;; They deal with primitive lexical units (Names, whitespaces, tags) and | |
408 | ;; with pieces of more generic productions. Most of these parsers must | |
409 | ;; be called in appropriate context. For example, complete-start-command | |
410 | ;; must be called only when the @-command start has been detected and | |
411 | ;; its name token has been read. | |
412 | ||
413 | ;; Test if a string is made of only whitespace | |
414 | ;; An empty string is considered made of whitespace as well | |
415 | (define (string-whitespace? str) | |
416 | (or (string-null? str) | |
417 | (string-every char-whitespace? str))) | |
418 | ||
419 | ;; Like read-text-line, but allows EOF. | |
420 | (define read-eof-breaks '(*eof* #\return #\newline)) | |
421 | (define (read-eof-line port) | |
422 | (if (eof-object? (peek-char port)) | |
423 | (peek-char port) | |
424 | (let* ((line (next-token '() read-eof-breaks | |
425 | "reading a line" port)) | |
426 | (c (read-char port))) ; must be either \n or \r or EOF | |
427 | (if (and (eq? c #\return) (eq? (peek-char port) #\newline)) | |
428 | (read-char port)) ; skip \n that follows \r | |
429 | line))) | |
430 | ||
47f3ce52 AW |
431 | (define (skip-whitespace port) |
432 | (skip-while '(#\space #\tab #\return #\newline) port)) | |
433 | ||
434 | (define (skip-horizontal-whitespace port) | |
435 | (skip-while '(#\space #\tab) port)) | |
436 | ||
437 | ;; command ::= Letter+ | |
438 | ||
439 | ;; procedure: read-command PORT | |
440 | ;; | |
441 | ;; Read a command starting from the current position in the PORT and | |
442 | ;; return it as a symbol. | |
443 | (define (read-command port) | |
444 | (let ((first-char (peek-char port))) | |
445 | (or (char-alphabetic? first-char) | |
446 | (parser-error port "Nonalphabetic @-command char: '" first-char "'"))) | |
447 | (string->symbol | |
448 | (next-token-of | |
449 | (lambda (c) | |
450 | (cond | |
451 | ((eof-object? c) #f) | |
452 | ((char-alphabetic? c) c) | |
453 | (else #f))) | |
454 | port))) | |
455 | ||
456 | ;; A token is a primitive lexical unit. It is a record with two fields, | |
457 | ;; token-head and token-kind. | |
458 | ;; | |
459 | ;; Token types: | |
460 | ;; END The end of a texinfo command. If the command is ended by }, | |
461 | ;; token-head will be #f. Otherwise if the command is ended by | |
462 | ;; @end COMMAND, token-head will be COMMAND. As a special case, | |
463 | ;; @bye is the end of a special @texinfo command. | |
464 | ;; START The start of a texinfo command. The token-head will be a | |
465 | ;; symbol of the @-command name. | |
466 | ;; INCLUDE An @include directive. The token-head will be empty -- the | |
467 | ;; caller is responsible for reading the include file name. | |
468 | ;; ITEM @item commands have an irregular syntax. They end at the | |
469 | ;; next @item, or at the end of the environment. For that | |
470 | ;; read-command-token treats them specially. | |
471 | ||
472 | (define (make-token kind head) (cons kind head)) | |
473 | (define token? pair?) | |
474 | (define token-kind car) | |
475 | (define token-head cdr) | |
476 | ||
477 | ;; procedure: read-command-token PORT | |
478 | ;; | |
479 | ;; This procedure starts parsing of a command token. The current | |
480 | ;; position in the stream must be #\@. This procedure scans enough of | |
481 | ;; the input stream to figure out what kind of a command token it is | |
482 | ;; seeing. The procedure returns a token structure describing the token. | |
483 | ||
484 | (define (read-command-token port) | |
485 | (assert-curr-char '(#\@) "start of the command" port) | |
486 | (let ((peeked (peek-char port))) | |
487 | (cond | |
4215ea75 | 488 | ((memq peeked '(#\! #\: #\. #\? #\@ #\\ #\{ #\})) |
47f3ce52 AW |
489 | ;; @-commands that escape characters |
490 | (make-token 'STRING (string (read-char port)))) | |
491 | (else | |
492 | (let ((name (read-command port))) | |
493 | (case name | |
494 | ((end) | |
495 | ;; got an ending tag | |
496 | (let ((command (string-trim-both | |
497 | (read-eof-line port)))) | |
498 | (or (and (not (string-null? command)) | |
499 | (string-every char-alphabetic? command)) | |
500 | (parser-error port "malformed @end" command)) | |
501 | (make-token 'END (string->symbol command)))) | |
502 | ((bye) | |
503 | ;; the end of the top | |
504 | (make-token 'END 'texinfo)) | |
505 | ((item) | |
506 | (make-token 'ITEM 'item)) | |
507 | ((include) | |
508 | (make-token 'INCLUDE #f)) | |
509 | (else | |
510 | (make-token 'START name)))))))) | |
511 | ||
512 | ;; procedure+: read-verbatim-body PORT STR-HANDLER SEED | |
513 | ;; | |
514 | ;; This procedure must be called after we have read a string | |
515 | ;; "@verbatim\n" that begins a verbatim section. The current position | |
516 | ;; must be the first position of the verbatim body. This function reads | |
517 | ;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a | |
518 | ;; character data consumer. | |
519 | ;; | |
520 | ;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED. | |
521 | ;; The first STRING1 argument to STR-HANDLER never contains a newline. | |
522 | ;; The second STRING2 argument often will. On the first invocation of the | |
523 | ;; STR-HANDLER, the seed is the one passed to read-verbatim-body | |
524 | ;; as the third argument. The result of this first invocation will be | |
525 | ;; passed as the seed argument to the second invocation of the line | |
526 | ;; consumer, and so on. The result of the last invocation of the | |
527 | ;; STR-HANDLER is returned by the read-verbatim-body. Note a | |
528 | ;; similarity to the fundamental 'fold' iterator. | |
529 | ;; | |
530 | ;; Within a verbatim section all characters are taken at their face | |
531 | ;; value. It ends with "\n@end verbatim(\r)?\n". | |
532 | ||
533 | ;; Must be called right after the newline after @verbatim. | |
534 | (define (read-verbatim-body port str-handler seed) | |
535 | (let loop ((seed seed)) | |
536 | (let ((fragment (next-token '() '(#\newline) | |
537 | "reading verbatim" port))) | |
538 | ;; We're reading the char after the 'fragment', which is | |
539 | ;; #\newline. | |
540 | (read-char port) | |
541 | (if (string=? fragment "@end verbatim") | |
542 | seed | |
543 | (loop (str-handler fragment "\n" seed)))))) | |
544 | ||
545 | ;; procedure+: read-arguments PORT | |
546 | ;; | |
547 | ;; This procedure reads and parses a production ArgumentList. | |
548 | ;; ArgumentList ::= S* Argument (S* , S* Argument)* S* | |
549 | ;; Argument ::= ([^@{},])* | |
550 | ;; | |
551 | ;; Arguments are the things in braces, i.e @ref{my node} has one | |
552 | ;; argument, "my node". Most commands taking braces actually don't have | |
553 | ;; arguments, they process text. For example, in | |
554 | ;; @emph{@strong{emphasized}}, the emph takes text, because the parse | |
555 | ;; continues into the braces. | |
556 | ;; | |
557 | ;; Any whitespace within Argument is replaced with a single space. | |
558 | ;; Whitespace around an Argument is trimmed. | |
559 | ;; | |
560 | ;; The procedure returns a list of arguments. Afterwards the current | |
561 | ;; character will be after the final #\}. | |
562 | ||
563 | (define (read-arguments port stop-char) | |
564 | (define (split str) | |
565 | (read-char port) ;; eat the delimiter | |
566 | (let ((ret (map (lambda (x) (if (string-null? x) #f x)) | |
567 | (map string-trim-both (string-split str #\,))))) | |
568 | (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret))) | |
569 | '() | |
570 | ret))) | |
571 | (split (next-token '() (list stop-char) | |
572 | "arguments of @-command" port))) | |
573 | ||
574 | ;; procedure+: complete-start-command COMMAND PORT | |
575 | ;; | |
576 | ;; This procedure is to complete parsing of an @-command. The procedure | |
577 | ;; must be called after the command token has been read. COMMAND is a | |
578 | ;; TAG-NAME. | |
579 | ;; | |
580 | ;; This procedure returns several values: | |
581 | ;; COMMAND: a symbol. | |
582 | ;; ARGUMENTS: command's arguments, as an alist. | |
583 | ;; CONTENT-MODEL: the content model of the command. | |
584 | ;; | |
585 | ;; On exit, the current position in PORT will depend on the CONTENT-MODEL. | |
586 | ;; | |
587 | ;; Content model Port position | |
588 | ;; ============= ============= | |
589 | ;; INLINE-TEXT One character after the #\{. | |
be52f329 | 590 | ;; INLINE-TEXT-ARGS One character after the #\{. |
47f3ce52 AW |
591 | ;; INLINE-ARGS The first character after the #\}. |
592 | ;; EOL-TEXT The first non-whitespace character after the command. | |
593 | ;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT | |
594 | ;; The first character on the next line. | |
595 | ;; PARAGRAPH, ITEM, EMPTY-COMMAND | |
596 | ;; The first character after the command. | |
597 | ||
598 | (define (arguments->attlist port args arg-names) | |
599 | (let loop ((in args) (names arg-names) (opt? #f) (out '())) | |
600 | (cond | |
601 | ((symbol? names) ;; a rest arg | |
602 | (reverse (if (null? in) out (acons names in out)))) | |
603 | ((and (not (null? names)) (eq? (car names) #:opt)) | |
604 | (loop in (cdr names) #t out)) | |
605 | ((null? in) | |
606 | (if (or (null? names) opt?) | |
607 | (reverse out) | |
608 | (parser-error port "@-command expected more arguments:" | |
609 | args arg-names names))) | |
610 | ((null? names) | |
611 | (parser-error port "@-command didn't expect more arguments:" in)) | |
612 | ((not (car in)) | |
613 | (or (and opt? (loop (cdr in) (cdr names) opt? out)) | |
614 | (parser-error "@-command missing required argument" | |
615 | (car names)))) | |
616 | (else | |
617 | (loop (cdr in) (cdr names) opt? | |
be52f329 AW |
618 | (acons (car names) |
619 | (if (list? (car in)) (car in) (list (car in))) | |
620 | out)))))) | |
47f3ce52 AW |
621 | |
622 | (define (parse-table-args command port) | |
623 | (let* ((line (string-trim-both (read-text-line port))) | |
624 | (length (string-length line))) | |
625 | (define (get-formatter) | |
626 | (or (and (not (zero? length)) | |
627 | (eq? (string-ref line 0) #\@) | |
628 | (let ((f (string->symbol (substring line 1)))) | |
629 | (or (inline-content? (cadr (command-spec f))) | |
630 | (parser-error | |
631 | port "@item formatter must be INLINE" f)) | |
632 | f)) | |
05c29c5a | 633 | (parser-error port "Invalid @item formatter" line))) |
47f3ce52 AW |
634 | (case command |
635 | ((enumerate) | |
636 | (if (zero? length) | |
637 | '() | |
638 | `((start | |
639 | ,(if (or (and (eq? length 1) | |
640 | (char-alphabetic? (string-ref line 0))) | |
641 | (string-every char-numeric? line)) | |
642 | line | |
643 | (parser-error | |
644 | port "Invalid enumerate start" line)))))) | |
645 | ((itemize) | |
646 | `((bullet | |
647 | ,(or (and (eq? length 1) line) | |
648 | (and (string-null? line) '(bullet)) | |
649 | (list (get-formatter)))))) | |
650 | (else ;; tables of various varieties | |
651 | `((formatter (,(get-formatter)))))))) | |
652 | ||
653 | (define (complete-start-command command port) | |
654 | (define (get-arguments type arg-names stop-char) | |
655 | (arguments->attlist port (read-arguments port stop-char) arg-names)) | |
656 | ||
657 | (let* ((spec (command-spec command)) | |
dc7a9cef | 658 | (command (car spec)) |
47f3ce52 AW |
659 | (type (cadr spec)) |
660 | (arg-names (cddr spec))) | |
661 | (case type | |
662 | ((INLINE-TEXT) | |
663 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
664 | (values command '() type)) | |
665 | ((INLINE-ARGS) | |
666 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
667 | (values command (get-arguments type arg-names #\}) type)) | |
be52f329 AW |
668 | ((INLINE-TEXT-ARGS) |
669 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
670 | (values command '() type)) | |
47f3ce52 AW |
671 | ((EOL-ARGS) |
672 | (values command (get-arguments type arg-names #\newline) type)) | |
673 | ((ENVIRON ENTRY INDEX) | |
674 | (skip-horizontal-whitespace port) | |
675 | (values command (parse-environment-args command port) type)) | |
676 | ((TABLE-ENVIRON) | |
677 | (skip-horizontal-whitespace port) | |
678 | (values command (parse-table-args command port) type)) | |
679 | ((EOL-TEXT) | |
680 | (skip-horizontal-whitespace port) | |
681 | (values command '() type)) | |
682 | ((EOL-TEXT-ARGS) | |
683 | (skip-horizontal-whitespace port) | |
684 | (values command (parse-eol-text-args command port) type)) | |
685 | ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT) | |
686 | (values command '() type)) | |
687 | (else ;; INCLUDE shouldn't get here | |
688 | (parser-error port "can't happen"))))) | |
689 | ||
690 | ;;----------------------------------------------------------------------------- | |
691 | ;; Higher-level parsers and scanners | |
692 | ;; | |
693 | ;; They parse productions corresponding entire @-commands. | |
694 | ||
695 | ;; Only reads @settitle, leaves it to the command parser to finish | |
696 | ;; reading the title. | |
697 | (define (take-until-settitle port) | |
698 | (or (find-string-from-port? "\n@settitle " port) | |
699 | (parser-error port "No \\n@settitle found")) | |
700 | (skip-horizontal-whitespace port) | |
701 | (and (eq? (peek-char port) #\newline) | |
702 | (parser-error port "You have a @settitle, but no title"))) | |
703 | ||
704 | ;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED | |
705 | ;; | |
706 | ;; This procedure is to read the CharData of a texinfo document. | |
707 | ;; | |
708 | ;; text ::= (CharData | Command)* | |
709 | ;; | |
710 | ;; The procedure reads CharData and stops at @-commands (or | |
711 | ;; environments). It also stops at an open or close brace. | |
712 | ;; | |
713 | ;; port | |
714 | ;; a PORT to read | |
715 | ;; expect-eof? | |
716 | ;; a boolean indicating if EOF is normal, i.e., the character | |
717 | ;; data may be terminated by the EOF. EOF is normal | |
718 | ;; while processing the main document. | |
719 | ;; preserve-ws? | |
720 | ;; a boolean indicating if we are within a whitespace-preserving | |
721 | ;; environment. If #t, suppress paragraph detection. | |
722 | ;; str-handler | |
723 | ;; a STR-HANDLER, see read-verbatim-body | |
724 | ;; seed | |
725 | ;; an argument passed to the first invocation of STR-HANDLER. | |
726 | ;; | |
727 | ;; The procedure returns two results: SEED and TOKEN. The SEED is the | |
728 | ;; result of the last invocation of STR-HANDLER, or the original seed if | |
729 | ;; STR-HANDLER was never called. | |
730 | ;; | |
731 | ;; TOKEN can be either an eof-object (this can happen only if expect-eof? | |
732 | ;; was #t), or a texinfo token denoting the start or end of a tag. | |
733 | ||
734 | ;; read-char-data port expect-eof? preserve-ws? str-handler seed | |
735 | (define read-char-data | |
736 | (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline))) | |
737 | (define (handle str-handler str1 str2 seed) | |
738 | (if (and (string-null? str1) (string-null? str2)) | |
739 | seed | |
740 | (str-handler str1 str2 seed))) | |
741 | ||
742 | (lambda (port expect-eof? preserve-ws? str-handler seed) | |
743 | (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof))) | |
744 | (let loop ((seed seed)) | |
745 | (let* ((fragment (next-token '() end-chars "reading char data" port)) | |
746 | (term-char (peek-char port))) ; one of end-chars | |
747 | (cond | |
748 | ((eof-object? term-char) ; only if expect-eof? | |
749 | (values (handle str-handler fragment "" seed) term-char)) | |
750 | ((memq term-char '(#\@ #\{ #\})) | |
751 | (values (handle str-handler fragment "" seed) | |
752 | (case term-char | |
753 | ((#\@) (read-command-token port)) | |
754 | ((#\{) (make-token 'START '*braces*)) | |
755 | ((#\}) (read-char port) (make-token 'END #f))))) | |
756 | ((eq? term-char #\newline) | |
757 | ;; Always significant, unless directly before an end token. | |
758 | (let ((c (peek-next-char port))) | |
759 | (cond | |
760 | ((eof-object? c) | |
761 | (or expect-eof? | |
762 | (parser-error port "EOF while reading char data")) | |
763 | (values (handle str-handler fragment "" seed) c)) | |
764 | ((eq? c #\@) | |
765 | (let* ((token (read-command-token port)) | |
766 | (end? (eq? (token-kind token) 'END))) | |
767 | (values | |
fc2b8f6c AW |
768 | (handle str-handler fragment |
769 | (if end? "" (if preserve-ws? "\n" " ")) | |
770 | seed) | |
47f3ce52 AW |
771 | token))) |
772 | ((and (not preserve-ws?) (eq? c #\newline)) | |
773 | ;; paragraph-separator ::= #\newline #\newline+ | |
774 | (skip-while '(#\newline) port) | |
775 | (skip-horizontal-whitespace port) | |
776 | (values (handle str-handler fragment "" seed) | |
777 | (make-token 'PARA 'para))) | |
778 | (else | |
779 | (loop (handle str-handler fragment | |
780 | (if preserve-ws? "\n" " ") seed))))))))))))) | |
781 | ||
782 | ; procedure+: assert-token TOKEN KIND NAME | |
783 | ; Make sure that TOKEN is of anticipated KIND and has anticipated NAME | |
784 | (define (assert-token token kind name) | |
785 | (or (and (token? token) | |
786 | (eq? kind (token-kind token)) | |
787 | (equal? name (token-head token))) | |
788 | (parser-error #f "Expecting @end for " name ", got " token))) | |
789 | ||
790 | ;;======================================================================== | |
791 | ;; Highest-level parsers: Texinfo to SXML | |
792 | ||
793 | ;; These parsers are a set of syntactic forms to instantiate a SSAX | |
794 | ;; parser. The user tells what to do with the parsed character and | |
795 | ;; element data. These latter handlers determine if the parsing follows a | |
796 | ;; SAX or a DOM model. | |
797 | ||
798 | ;; syntax: make-command-parser fdown fup str-handler | |
799 | ||
800 | ;; Create a parser to parse and process one element, including its | |
801 | ;; character content or children elements. The parser is typically | |
802 | ;; applied to the root element of a document. | |
803 | ||
804 | ;; fdown | |
805 | ;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED | |
806 | ;; | |
807 | ;; This procedure is to generate the seed to be passed to handlers | |
808 | ;; that process the content of the element. This is the function | |
809 | ;; identified as 'fdown' in the denotational semantics of the XML | |
810 | ;; parser given in the title comments to (sxml ssax). | |
811 | ;; | |
812 | ;; fup | |
813 | ;; procedure COMMAND ARGUMENTS PARENT-SEED SEED | |
814 | ;; | |
815 | ;; This procedure is called when parsing of COMMAND is finished. | |
816 | ;; The SEED is the result from the last content parser (or from | |
817 | ;; fdown if the element has the empty content). PARENT-SEED is the | |
818 | ;; same seed as was passed to fdown. The procedure is to generate a | |
819 | ;; seed that will be the result of the element parser. This is the | |
820 | ;; function identified as 'fup' in the denotational semantics of | |
821 | ;; the XML parser given in the title comments to (sxml ssax). | |
822 | ;; | |
823 | ;; str-handler | |
824 | ;; A STR-HANDLER, see read-verbatim-body | |
825 | ;; | |
826 | ||
827 | ;; The generated parser is a | |
828 | ;; procedure COMMAND PORT SEED | |
829 | ;; | |
830 | ;; The procedure must be called *after* the command token has been read. | |
831 | ||
832 | (define (read-include-file-name port) | |
833 | (let ((x (string-trim-both (read-eof-line port)))) | |
834 | (if (string-null? x) | |
835 | (error "no file listed") | |
836 | x))) ;; fixme: should expand @value{} references | |
837 | ||
838 | (define (sxml->node-name sxml) | |
839 | "Turn some sxml string into a valid node name." | |
840 | (let loop ((in (string->list (sxml->string sxml))) (out '())) | |
841 | (if (null? in) | |
842 | (apply string (reverse out)) | |
843 | (if (memq (car in) '(#\{ #\} #\@ #\,)) | |
844 | (loop (cdr in) out) | |
845 | (loop (cdr in) (cons (car in) out)))))) | |
846 | ||
847 | (define (index command arguments fdown fup parent-seed) | |
848 | (case command | |
849 | ((deftp defcv defivar deftypeivar defop deftypeop defmethod | |
850 | deftypemethod defopt defvr defvar deftypevr deftypevar deffn | |
851 | deftypefn defspec defmac defun deftypefun) | |
852 | (let ((args `((name ,(string-append (symbol->string command) "-" | |
853 | (cadr (assq 'name arguments))))))) | |
854 | (fup 'anchor args parent-seed | |
855 | (fdown 'anchor args 'INLINE-ARGS '())))) | |
856 | ((cindex findex vindex kindex pindex tindex) | |
857 | (let ((args `((name ,(string-append (symbol->string command) "-" | |
858 | (sxml->node-name | |
859 | (assq 'entry arguments))))))) | |
860 | (fup 'anchor args parent-seed | |
861 | (fdown 'anchor args 'INLINE-ARGS '())))) | |
862 | (else parent-seed))) | |
863 | ||
864 | (define (make-command-parser fdown fup str-handler) | |
865 | (lambda (command port seed) | |
866 | (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed)) | |
867 | (let*-values (((command arguments expected-content) | |
868 | (complete-start-command command port))) | |
869 | (let* ((parent-seed (index command arguments fdown fup parent-seed)) | |
870 | (seed (fdown command arguments expected-content parent-seed)) | |
871 | (eof-closes? (or (memq command '(texinfo para *fragment*)) | |
872 | (eq? expected-content 'EOL-TEXT))) | |
873 | (sig-ws? (or sig-ws? (space-significant? command))) | |
874 | (up (lambda (s) (fup command arguments parent-seed s))) | |
875 | (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s))) | |
876 | (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s))))) | |
877 | ||
878 | (define (port-for-content) | |
879 | (if (eq? expected-content 'EOL-TEXT) | |
880 | (call-with-input-string (read-text-line port) identity) | |
881 | port)) | |
882 | ||
883 | (cond | |
884 | ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX | |
885 | EOL-TEXT-ARGS)) | |
886 | ;; empty or finished by complete-start-command | |
887 | (up seed)) | |
888 | ((eq? command 'verbatim) | |
889 | (up (read-verbatim-body port str-handler seed))) | |
890 | (else | |
891 | (let loop ((port (port-for-content)) | |
892 | (expect-eof? eof-closes?) | |
893 | (end-para identity) | |
894 | (need-break? (and (not sig-ws?) | |
895 | (memq expected-content | |
896 | '(ENVIRON TABLE-ENVIRON | |
897 | ENTRY ITEM FRAGMENT)))) | |
898 | (seed seed)) | |
899 | (cond | |
900 | ((and need-break? (or sig-ws? (skip-whitespace port)) | |
901 | (not (memq (peek-char port) '(#\@ #\}))) | |
902 | (not (eof-object? (peek-char port)))) | |
903 | ;; Even if we have an @, it might be inline -- check | |
904 | ;; that later | |
905 | (let ((seed (end-para seed))) | |
906 | (loop port expect-eof? (make-end-para seed) #f | |
907 | (new-para seed)))) | |
908 | (else | |
909 | (let*-values (((seed token) | |
910 | (read-char-data | |
911 | port expect-eof? sig-ws? str-handler seed))) | |
912 | (cond | |
913 | ((eof-object? token) | |
914 | (case expect-eof? | |
915 | ((include #f) (end-para seed)) | |
916 | (else (up (end-para seed))))) | |
917 | (else | |
918 | (case (token-kind token) | |
919 | ((STRING) | |
920 | ;; this is only @-commands that escape | |
921 | ;; characters: @}, @@, @{ -- new para if need-break | |
922 | (let ((seed ((if need-break? end-para identity) seed))) | |
923 | (loop port expect-eof? | |
924 | (if need-break? (make-end-para seed) end-para) #f | |
925 | (str-handler (token-head token) "" | |
926 | ((if need-break? new-para identity) | |
927 | seed))))) | |
928 | ((END) | |
929 | ;; The end will only have a name if it's for an | |
930 | ;; environment | |
931 | (cond | |
932 | ((memq command '(item entry)) | |
933 | (let ((spec (command-spec (token-head token)))) | |
934 | (or (eq? (cadr spec) 'TABLE-ENVIRON) | |
935 | (parser-error | |
936 | port "@item not ended by @end table/enumerate/itemize" | |
937 | token)))) | |
938 | ((eq? expected-content 'ENVIRON) | |
939 | (assert-token token 'END command))) | |
940 | (up (end-para seed))) | |
941 | ((ITEM) | |
942 | (cond | |
943 | ((memq command '(enumerate itemize)) | |
944 | (up (visit 'item port sig-ws? (end-para seed)))) | |
945 | ((eq? expected-content 'TABLE-ENVIRON) | |
946 | (up (visit 'entry port sig-ws? (end-para seed)))) | |
947 | ((memq command '(item entry)) | |
948 | (visit command port sig-ws? (up (end-para seed)))) | |
949 | (else | |
950 | (parser-error | |
951 | port "@item must be within a table environment" | |
952 | command)))) | |
953 | ((PARA) | |
954 | ;; examine valid paragraphs? | |
955 | (loop port expect-eof? end-para (not sig-ws?) seed)) | |
956 | ((INCLUDE) | |
957 | ;; Recurse for include files | |
958 | (let ((seed (call-with-file-and-dir | |
959 | (read-include-file-name port) | |
960 | (lambda (port) | |
961 | (loop port 'include end-para | |
962 | need-break? seed))))) | |
963 | (loop port expect-eof? end-para need-break? seed))) | |
964 | ((START) ; Start of an @-command | |
965 | (let* ((head (token-head token)) | |
dc7a9cef AW |
966 | (spec (command-spec head)) |
967 | (head (car spec)) | |
968 | (type (cadr spec)) | |
47f3ce52 AW |
969 | (inline? (inline-content? type)) |
970 | (seed ((if (and inline? (not need-break?)) | |
971 | identity end-para) seed)) | |
972 | (end-para (if inline? | |
973 | (if need-break? (make-end-para seed) | |
974 | end-para) | |
975 | identity)) | |
976 | (new-para (if (and inline? need-break?) | |
977 | new-para identity))) | |
978 | (loop port expect-eof? end-para (not inline?) | |
979 | (visit head port sig-ws? (new-para seed))))) | |
980 | (else | |
981 | (parser-error port "Unknown token type" token)))))))))))))))) | |
982 | ||
983 | ;; procedure: reverse-collect-str-drop-ws fragments | |
984 | ;; | |
985 | ;; Given the list of fragments (some of which are text strings), reverse | |
986 | ;; the list and concatenate adjacent text strings. We also drop | |
987 | ;; "unsignificant" whitespace, that is, whitespace in front, behind and | |
988 | ;; between elements. The whitespace that is included in character data | |
989 | ;; is not affected. | |
990 | (define (reverse-collect-str-drop-ws fragments) | |
991 | (cond | |
992 | ((null? fragments) ; a shortcut | |
993 | '()) | |
994 | ((and (string? (car fragments)) ; another shortcut | |
995 | (null? (cdr fragments)) ; remove single ws-only string | |
996 | (string-whitespace? (car fragments))) | |
997 | '()) | |
998 | (else | |
999 | (let loop ((fragments fragments) (result '()) (strs '()) | |
1000 | (all-whitespace? #t)) | |
1001 | (cond | |
1002 | ((null? fragments) | |
1003 | (if all-whitespace? | |
1004 | result ; remove leading ws | |
1005 | (cons (apply string-append strs) result))) | |
1006 | ((string? (car fragments)) | |
1007 | (loop (cdr fragments) result (cons (car fragments) strs) | |
1008 | (and all-whitespace? | |
1009 | (string-whitespace? (car fragments))))) | |
1010 | (else | |
1011 | (loop (cdr fragments) | |
1012 | (cons | |
1013 | (car fragments) | |
1014 | (cond | |
1015 | ((null? strs) result) | |
1016 | (all-whitespace? | |
1017 | (if (null? result) | |
1018 | result ; remove trailing whitespace | |
1019 | (cons " " result))); replace interstitial ws with | |
1020 | ; one space | |
1021 | (else | |
1022 | (cons (apply string-append strs) result)))) | |
1023 | '() #t))))))) | |
1024 | ||
be52f329 AW |
1025 | (define (parse-inline-text-args port spec text) |
1026 | (let lp ((in text) (cur '()) (out '())) | |
1027 | (cond | |
1028 | ((null? in) | |
1029 | (if (and (pair? cur) | |
1030 | (string? (car cur)) | |
1031 | (string-whitespace? (car cur))) | |
1032 | (lp in (cdr cur) out) | |
1033 | (let ((args (reverse (if (null? cur) | |
1034 | out | |
1035 | (cons (reverse cur) out))))) | |
1036 | (arguments->attlist port args (cddr spec))))) | |
1037 | ((pair? (car in)) | |
1038 | (lp (cdr in) (cons (car in) cur) out)) | |
1039 | ((string-index (car in) #\,) | |
1040 | (let* ((parts (string-split (car in) #\,)) | |
1041 | (head (string-trim-right (car parts))) | |
1042 | (rev-tail (reverse (cdr parts))) | |
1043 | (last (string-trim (car rev-tail)))) | |
1044 | (lp (cdr in) | |
1045 | (if (string-null? last) cur (cons last cur)) | |
1046 | (append (cdr rev-tail) | |
1047 | (cons (reverse (if (string-null? head) cur (cons head cur))) | |
1048 | out))))) | |
1049 | (else | |
1050 | (lp (cdr in) | |
1051 | (cons (if (null? cur) (string-trim (car in)) (car in)) cur) | |
1052 | out))))) | |
1053 | ||
47f3ce52 AW |
1054 | (define (make-dom-parser) |
1055 | (make-command-parser | |
1056 | (lambda (command args content seed) ; fdown | |
1057 | '()) | |
1058 | (lambda (command args parent-seed seed) ; fup | |
dc7a9cef AW |
1059 | (let* ((seed (reverse-collect-str-drop-ws seed)) |
1060 | (spec (command-spec command)) | |
1061 | (command (car spec))) | |
be52f329 AW |
1062 | (if (eq? (cadr spec) 'INLINE-TEXT-ARGS) |
1063 | (cons (list command (cons '% (parse-inline-text-args #f spec seed))) | |
1064 | parent-seed) | |
1065 | (acons command | |
1066 | (if (null? args) seed (acons '% args seed)) | |
1067 | parent-seed)))) | |
47f3ce52 AW |
1068 | (lambda (string1 string2 seed) ; str-handler |
1069 | (if (string-null? string2) | |
1070 | (cons string1 seed) | |
1071 | (cons* string2 string1 seed))))) | |
1072 | ||
1073 | (define parse-environment-args | |
1074 | (let ((parser (make-dom-parser))) | |
1075 | ;; duplicate arguments->attlist to avoid unnecessary splitting | |
1076 | (lambda (command port) | |
dc7a9cef AW |
1077 | (let* ((args (cdar (parser '*ENVIRON-ARGS* port '()))) |
1078 | (spec (command-spec command)) | |
1079 | (command (car spec)) | |
1080 | (arg-names (cddr spec))) | |
47f3ce52 AW |
1081 | (cond |
1082 | ((not arg-names) | |
1083 | (if (null? args) '() | |
1084 | (parser-error port "@-command doesn't take args" command))) | |
1085 | ((eq? arg-names #t) | |
1086 | (list (cons 'arguments args))) | |
1087 | (else | |
1088 | (let loop ((args args) (arg-names arg-names) (out '())) | |
1089 | (cond | |
1090 | ((null? arg-names) | |
1091 | (if (null? args) (reverse! out) | |
1092 | (parser-error port "@-command didn't expect more args" | |
1093 | command args))) | |
1094 | ((symbol? arg-names) | |
1095 | (reverse! (acons arg-names args out))) | |
1096 | ((null? args) | |
1097 | (parser-error port "@-command expects more args" | |
1098 | command arg-names)) | |
1099 | ((and (string? (car args)) (string-index (car args) #\space)) | |
1100 | => (lambda (i) | |
1101 | (let ((rest (substring/shared (car args) (1+ i)))) | |
1102 | (if (zero? i) | |
1103 | (loop (cons rest (cdr args)) arg-names out) | |
1104 | (loop (cons rest (cdr args)) (cdr arg-names) | |
1105 | (cons (list (car arg-names) | |
1106 | (substring (car args) 0 i)) | |
1107 | out)))))) | |
1108 | (else | |
1109 | (loop (cdr args) (cdr arg-names) | |
1110 | (if (and (pair? (car args)) (eq? (caar args) '*braces*)) | |
1111 | (acons (car arg-names) (cdar args) out) | |
1112 | (cons (list (car arg-names) (car args)) out)))))))))))) | |
1113 | ||
1114 | (define (parse-eol-text-args command port) | |
1115 | ;; perhaps parse-environment-args should be named more | |
1116 | ;; generically. | |
1117 | (parse-environment-args command port)) | |
1118 | ||
1119 | ;; procedure: texi-fragment->stexi STRING | |
1120 | ;; | |
1121 | ;; A DOM parser for a texinfo fragment STRING. | |
1122 | ;; | |
1123 | ;; The procedure returns an SXML tree headed by the special tag, | |
1124 | ;; *fragment*. | |
1125 | ||
1126 | (define (texi-fragment->stexi string-or-port) | |
1127 | "Parse the texinfo commands in @var{string-or-port}, and return the | |
1128 | resultant stexi tree. The head of the tree will be the special command, | |
1129 | @code{*fragment*}." | |
1130 | (define (parse port) | |
1131 | (postprocess (car ((make-dom-parser) '*fragment* port '())))) | |
1132 | (if (input-port? string-or-port) | |
1133 | (parse string-or-port) | |
1134 | (call-with-input-string string-or-port parse))) | |
1135 | ||
1136 | ;; procedure: texi->stexi PORT | |
1137 | ;; | |
1138 | ;; This is an instance of a SSAX parser above that returns an SXML | |
1139 | ;; representation of the texinfo document ready to be read at PORT. | |
1140 | ;; | |
1141 | ;; The procedure returns an SXML tree. The port points to the | |
1142 | ;; first character after the @bye, or to the end of the file. | |
1143 | ||
1144 | (define (texi->stexi port) | |
1145 | "Read a full texinfo document from @var{port} and return the parsed | |
1146 | stexi tree. The parsing will start at the @code{@@settitle} and end at | |
1147 | @code{@@bye} or EOF." | |
1148 | (let ((parser (make-dom-parser))) | |
1149 | (take-until-settitle port) | |
1150 | (postprocess (car (parser 'texinfo port '()))))) | |
1151 | ||
1152 | (define (car-eq? x y) (and (pair? x) (eq? (car x) y))) | |
1153 | (define (make-contents tree) | |
1154 | (define (lp in out depth) | |
1155 | (cond | |
1156 | ((null? in) (values in (cons 'enumerate (reverse! out)))) | |
1157 | ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4)) | |
1158 | => (lambda (new-depth) | |
1159 | (let ((node-name (and (car-eq? (car in) 'node) | |
1160 | (cadr (assq 'name (cdadar in)))))) | |
1161 | (cond | |
1162 | ((< new-depth depth) | |
1163 | (values in (cons 'enumerate (reverse! out)))) | |
1164 | ((> new-depth depth) | |
1165 | (let ((out-cdr (if (null? out) '() (cdr out))) | |
1166 | (out-car (if (null? out) (list 'item) (car out)))) | |
1167 | (let*-values (((new-in new-out) (lp in '() (1+ depth)))) | |
1168 | (lp new-in | |
1169 | (cons (append out-car (list new-out)) out-cdr) | |
1170 | depth)))) | |
1171 | (else ;; same depth | |
1172 | (lp (cddr in) | |
1173 | (cons | |
1174 | `(item (para | |
1175 | ,@(if node-name | |
1176 | `((ref (% (node ,node-name)))) | |
1177 | (cdadr in)))) | |
1178 | out) | |
1179 | depth)))))) | |
1180 | (else (lp (cdr in) out depth)))) | |
1181 | (let*-values (((_ contents) (lp tree '() 1))) | |
1182 | `((chapheading "Table of Contents") ,contents))) | |
1183 | ||
1184 | (define (trim-whitespace str trim-left? trim-right?) | |
1185 | (let* ((left-space? (and (not trim-left?) | |
1186 | (string-prefix? " " str))) | |
1187 | (right-space? (and (not trim-right?) | |
1188 | (string-suffix? " " str))) | |
1189 | (tail (append! (string-tokenize str) | |
1190 | (if right-space? '("") '())))) | |
1191 | (string-join (if left-space? (cons "" tail) tail)))) | |
1192 | ||
1193 | (define (postprocess tree) | |
1194 | (define (loop in out state first? sig-ws?) | |
1195 | (cond | |
1196 | ((null? in) | |
1197 | (values (reverse! out) state)) | |
1198 | ((string? (car in)) | |
1199 | (loop (cdr in) | |
1200 | (cons (if sig-ws? (car in) | |
1201 | (trim-whitespace (car in) first? (null? (cdr in)))) | |
1202 | out) | |
1203 | state #f sig-ws?)) | |
1204 | ((pair? (car in)) | |
1205 | (case (caar in) | |
1206 | ((set) | |
1207 | (if (null? (cdar in)) (error "@set missing arguments" in)) | |
1208 | (if (string? (cadar in)) | |
1209 | (let ((i (string-index (cadar in) #\space))) | |
1210 | (if i | |
1211 | (loop (cdr in) out | |
1212 | (acons (substring (cadar in) 0 i) | |
1213 | (cons (substring (cadar in) (1+ i)) (cddar in)) | |
1214 | state) | |
1215 | #f sig-ws?) | |
1216 | (loop (cdr in) out (acons (cadar in) (cddar in) state) | |
1217 | #f sig-ws?))) | |
1218 | (error "expected a constant to define for @set" in))) | |
1219 | ((value) | |
1220 | (loop (fold-right cons (cdr in) | |
1221 | (or (and=> | |
1222 | (assoc (cadr (assq 'key (cdadar in))) state) cdr) | |
1223 | (error "unknown value" (cdadar in) state))) | |
1224 | out | |
1225 | state #f sig-ws?)) | |
1226 | ((copying) | |
1227 | (loop (cdr in) out (cons (car in) state) #f sig-ws?)) | |
1228 | ((insertcopying) | |
1229 | (loop (fold-right cons (cdr in) | |
1230 | (or (cdr (assoc 'copying state)) | |
1231 | (error "copying isn't set yet"))) | |
1232 | out | |
1233 | state #f sig-ws?)) | |
1234 | ((contents) | |
1235 | (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?)) | |
1236 | (else | |
1237 | (let*-values (((kid-out state) | |
1238 | (loop (car in) '() state #t | |
1239 | (or sig-ws? (space-significant? (caar in)))))) | |
1240 | (loop (cdr in) (cons kid-out out) state #f sig-ws?))))) | |
1241 | (else ; a symbol | |
1242 | (loop (cdr in) (cons (car in) out) state #t sig-ws?)))) | |
1243 | ||
1244 | (call-with-values | |
1245 | (lambda () (loop tree '() '() #t #f)) | |
1246 | (lambda (out state) out))) | |
1247 | ||
1248 | ;; Replace % with texinfo-arguments. | |
1249 | (define (stexi->sxml tree) | |
1250 | "Transform the stexi tree @var{tree} into sxml. This involves | |
1251 | replacing the @code{%} element that keeps the texinfo arguments with an | |
1252 | element for each argument. | |
1253 | ||
1254 | FIXME: right now it just changes % to @code{texinfo-arguments} -- that | |
1255 | doesn't hang with the idea of making a dtd at some point" | |
1256 | (pre-post-order | |
1257 | tree | |
1258 | `((% . ,(lambda (x . t) (cons 'texinfo-arguments t))) | |
1259 | (*text* . ,(lambda (x t) t)) | |
1260 | (*default* . ,(lambda (x . t) (cons x t)))))) | |
1261 | ||
1262 | ;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5 | |
1263 | ;;; texinfo.scm ends here |