Commit | Line | Data |
---|---|---|
47f3ce52 AW |
1 | ;;;; (texinfo) -- parsing of texinfo into SXML |
2 | ;;;; | |
31d59769 | 3 | ;;;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. |
47f3ce52 AW |
4 | ;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com> |
5 | ;;;; Copyright (C) 2001,2002 Oleg Kiselyov <oleg at pobox dot com> | |
6 | ;;;; | |
7 | ;;;; This file is based on SSAX's SSAX.scm. | |
8 | ;;;; | |
9 | ;;;; This library is free software; you can redistribute it and/or | |
10 | ;;;; modify it under the terms of the GNU Lesser General Public | |
11 | ;;;; License as published by the Free Software Foundation; either | |
12 | ;;;; version 3 of the License, or (at your option) any later version. | |
13 | ;;;; | |
14 | ;;;; This library is distributed in the hope that it will be useful, | |
15 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | ;;;; Lesser General Public License for more details. | |
18 | ;;;; | |
19 | ;;;; You should have received a copy of the GNU Lesser General Public | |
20 | ;;;; License along with this library; if not, write to the Free Software | |
21 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | \f | |
23 | ;;; Commentary: | |
24 | ;; | |
25 | ;; @subheading Texinfo processing in scheme | |
26 | ;; | |
27 | ;; This module parses texinfo into SXML. TeX will always be the | |
28 | ;; processor of choice for print output, of course. However, although | |
29 | ;; @code{makeinfo} works well for info, its output in other formats is | |
30 | ;; not very customizable, and the program is not extensible as a whole. | |
31 | ;; This module aims to provide an extensible framework for texinfo | |
32 | ;; processing that integrates texinfo into the constellation of SXML | |
33 | ;; processing tools. | |
34 | ;; | |
35 | ;; @subheading Notes on the SXML vocabulary | |
36 | ;; | |
37 | ;; Consider the following texinfo fragment: | |
38 | ;; | |
39 | ;;@example | |
40 | ;; @@deffn Primitive set-car! pair value | |
41 | ;; This function... | |
42 | ;; @@end deffn | |
43 | ;;@end example | |
44 | ;; | |
45 | ;; Logically, the category (Primitive), name (set-car!), and arguments | |
46 | ;; (pair value) are ``attributes'' of the deffn, with the description as | |
47 | ;; the content. However, texinfo allows for @@-commands within the | |
48 | ;; arguments to an environment, like @code{@@deffn}, which means that | |
49 | ;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand, | |
50 | ;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are | |
51 | ;; called ``arguments'', and are grouped under the special element, `%'. | |
52 | ;; | |
53 | ;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In | |
54 | ;; the interests of interoperability, this module provides a conversion | |
55 | ;; function to replace the `%' with `texinfo-arguments'. | |
56 | ;; | |
57 | ;;; Code: | |
58 | ||
59 | ;; Comparison to xml output of texinfo (which is rather undocumented): | |
60 | ;; Doesn't conform to texinfo dtd | |
61 | ;; No DTD at all, in fact :-/ | |
62 | ;; Actually outputs valid xml, after transforming % | |
63 | ;; Slower (although with caching the SXML that problem can go away) | |
64 | ;; Doesn't parse menus (although menus are shite) | |
65 | ;; Args go in a dedicated element, FBOFW | |
66 | ;; Definitions are handled a lot better | |
67 | ;; Does parse comments | |
68 | ;; Outputs only significant line breaks (a biggie!) | |
69 | ;; Nodes are treated as anchors, rather than content organizers (a biggie) | |
70 | ;; (more book-like, less info-like) | |
71 | ||
72 | ;; TODO | |
73 | ;; Integration: help, indexing, plain text | |
74 | ||
75 | (define-module (texinfo) | |
76 | #:use-module (sxml simple) | |
77 | #:use-module (sxml transform) | |
78 | #:use-module (sxml ssax input-parse) | |
79 | #:use-module (srfi srfi-1) | |
80 | #:use-module (srfi srfi-13) | |
81 | #:export (call-with-file-and-dir | |
82 | texi-command-specs | |
83 | texi-command-depth | |
84 | texi-fragment->stexi | |
85 | texi->stexi | |
86 | stexi->sxml)) | |
87 | ||
88 | ;; Some utilities | |
89 | ||
90 | (define (parser-error port message . rest) | |
05c29c5a | 91 | (apply throw 'parser-error port message rest)) |
47f3ce52 AW |
92 | |
93 | (define (call-with-file-and-dir filename proc) | |
94 | "Call the one-argument procedure @var{proc} with an input port that | |
95 | reads from @var{filename}. During the dynamic extent of @var{proc}'s | |
96 | execution, the current directory will be @code{(dirname | |
97 | @var{filename})}. This is useful for parsing documents that can include | |
98 | files by relative path name." | |
99 | (let ((current-dir (getcwd))) | |
100 | (dynamic-wind | |
101 | (lambda () (chdir (dirname filename))) | |
102 | (lambda () | |
103 | (call-with-input-file (basename filename) proc)) | |
104 | (lambda () (chdir current-dir))))) | |
105 | ||
106 | ;; Define this version here, because (srfi srfi-11)'s definition uses | |
107 | ;; syntax-rules, which is really damn slow | |
108 | (define-macro (let*-values bindings . body) | |
109 | (if (null? bindings) (cons 'begin body) | |
110 | (apply | |
111 | (lambda (vars initializer) | |
112 | (let ((cont | |
113 | (cons 'let*-values | |
114 | (cons (cdr bindings) body)))) | |
115 | (cond | |
116 | ((not (pair? vars)) ; regular let case, a single var | |
117 | `(let ((,vars ,initializer)) ,cont)) | |
118 | ((null? (cdr vars)) ; single var, see the prev case | |
119 | `(let ((,(car vars) ,initializer)) ,cont)) | |
120 | (else ; the most generic case | |
121 | `(call-with-values (lambda () ,initializer) | |
122 | (lambda ,vars ,cont)))))) | |
123 | (car bindings)))) | |
124 | ||
125 | ;;======================================================================== | |
126 | ;; Reflection on the XML vocabulary | |
127 | ||
128 | (define texi-command-specs | |
129 | #; | |
130 | "A list of (@var{name} @var{content-model} . @var{args}) | |
131 | ||
132 | @table @var | |
133 | @item name | |
134 | The name of an @@-command, as a symbol. | |
135 | ||
136 | @item content-model | |
137 | A symbol indicating the syntactic type of the @@-command: | |
138 | @table @code | |
139 | @item EMPTY-COMMAND | |
140 | No content, and no @code{@@end} is coming | |
141 | @item EOL-ARGS | |
142 | Unparsed arguments until end of line | |
143 | @item EOL-TEXT | |
144 | Parsed arguments until end of line | |
145 | @item INLINE-ARGS | |
146 | Unparsed arguments ending with @code{#\\@}} | |
147 | @item INLINE-TEXT | |
148 | Parsed arguments ending with @code{#\\@}} | |
149 | @item ENVIRON | |
150 | The tag is an environment tag, expect @code{@@end foo}. | |
151 | @item TABLE-ENVIRON | |
152 | Like ENVIRON, but with special parsing rules for its arguments. | |
153 | @item FRAGMENT | |
154 | For @code{*fragment*}, the command used for parsing fragments of | |
155 | texinfo documents. | |
156 | @end table | |
157 | ||
158 | @code{INLINE-TEXT} commands will receive their arguments within their | |
159 | bodies, whereas the @code{-ARGS} commands will receive them in their | |
160 | attribute list. | |
161 | ||
162 | @code{EOF-TEXT} receives its arguments in its body. | |
163 | ||
164 | @code{ENVIRON} commands have both: parsed arguments until the end of | |
165 | line, received through their attribute list, and parsed text until the | |
166 | @code{@@end}, received in their bodies. | |
167 | ||
168 | @code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in | |
169 | @code{ENVIRON}. | |
170 | ||
31d59769 AW |
171 | In addition, @code{ALIAS} can alias one command to another. The alias |
172 | will never be seen in parsed stexinfo. | |
173 | ||
47f3ce52 AW |
174 | There are four @@-commands that are treated specially. @code{@@include} |
175 | is a low-level token that will not be seen by higher-level parsers, so | |
176 | it has no content-model. @code{@@para} is the paragraph command, which | |
177 | is only implicit in the texinfo source. @code{@@item} has special | |
178 | syntax, as noted above, and @code{@@entry} is how this parser treats | |
179 | @code{@@item} commands within @code{@@table}, @code{@@ftable}, and | |
180 | @code{@@vtable}. | |
181 | ||
182 | Also, indexing commands (@code{@@cindex}, etc.) are treated specially. | |
183 | Their arguments are parsed, but they are needed before entering the | |
184 | element so that an anchor can be inserted into the text before the index | |
185 | entry. | |
186 | ||
187 | @item args | |
188 | Named arguments to the command, in the same format as the formals for a | |
189 | lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS}, | |
190 | @code{ENVIRON}, @code{TABLE-ENVIRON} commands. | |
191 | @end table" | |
192 | '(;; Special commands | |
193 | (include #f) ;; this is a low-level token | |
194 | (para PARAGRAPH) | |
195 | (item ITEM) | |
196 | (entry ENTRY . heading) | |
197 | (noindent EMPTY-COMMAND) | |
198 | (*fragment* FRAGMENT) | |
199 | ||
200 | ;; Inline text commands | |
201 | (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant | |
202 | (bold INLINE-TEXT) | |
203 | (sample INLINE-TEXT) | |
204 | (samp INLINE-TEXT) | |
205 | (code INLINE-TEXT) | |
206 | (kbd INLINE-TEXT) | |
207 | (key INLINE-TEXT) | |
208 | (var INLINE-TEXT) | |
209 | (env INLINE-TEXT) | |
210 | (file INLINE-TEXT) | |
211 | (command INLINE-TEXT) | |
212 | (option INLINE-TEXT) | |
213 | (dfn INLINE-TEXT) | |
214 | (cite INLINE-TEXT) | |
215 | (acro INLINE-TEXT) | |
47f3ce52 AW |
216 | (email INLINE-TEXT) |
217 | (emph INLINE-TEXT) | |
218 | (strong INLINE-TEXT) | |
219 | (sample INLINE-TEXT) | |
220 | (sc INLINE-TEXT) | |
221 | (titlefont INLINE-TEXT) | |
222 | (asis INLINE-TEXT) | |
223 | (b INLINE-TEXT) | |
224 | (i INLINE-TEXT) | |
225 | (r INLINE-TEXT) | |
226 | (sansserif INLINE-TEXT) | |
227 | (slanted INLINE-TEXT) | |
228 | (t INLINE-TEXT) | |
229 | ||
230 | ;; Inline args commands | |
231 | (value INLINE-ARGS . (key)) | |
232 | (ref INLINE-ARGS . (node #:opt name section info-file manual)) | |
233 | (xref INLINE-ARGS . (node #:opt name section info-file manual)) | |
234 | (pxref INLINE-ARGS . (node #:opt name section info-file manual)) | |
31d59769 | 235 | (url ALIAS . uref) |
47f3ce52 AW |
236 | (uref INLINE-ARGS . (url #:opt title replacement)) |
237 | (anchor INLINE-ARGS . (name)) | |
238 | (dots INLINE-ARGS . ()) | |
239 | (result INLINE-ARGS . ()) | |
240 | (bullet INLINE-ARGS . ()) | |
241 | (copyright INLINE-ARGS . ()) | |
242 | (tie INLINE-ARGS . ()) | |
243 | (image INLINE-ARGS . (file #:opt width height alt-text extension)) | |
244 | ||
245 | ;; EOL args elements | |
246 | (node EOL-ARGS . (name #:opt next previous up)) | |
247 | (c EOL-ARGS . all) | |
248 | (comment EOL-ARGS . all) | |
249 | (setchapternewpage EOL-ARGS . all) | |
250 | (sp EOL-ARGS . all) | |
251 | (page EOL-ARGS . ()) | |
252 | (vskip EOL-ARGS . all) | |
253 | (syncodeindex EOL-ARGS . all) | |
254 | (contents EOL-ARGS . ()) | |
255 | (shortcontents EOL-ARGS . ()) | |
256 | (summarycontents EOL-ARGS . ()) | |
257 | (insertcopying EOL-ARGS . ()) | |
258 | (dircategory EOL-ARGS . (category)) | |
259 | (top EOL-ARGS . (title)) | |
260 | (printindex EOL-ARGS . (type)) | |
261 | ||
262 | ;; EOL text commands | |
263 | (*ENVIRON-ARGS* EOL-TEXT) | |
264 | (itemx EOL-TEXT) | |
265 | (set EOL-TEXT) | |
266 | (center EOL-TEXT) | |
267 | (title EOL-TEXT) | |
268 | (subtitle EOL-TEXT) | |
269 | (author EOL-TEXT) | |
270 | (chapter EOL-TEXT) | |
271 | (section EOL-TEXT) | |
272 | (appendix EOL-TEXT) | |
273 | (appendixsec EOL-TEXT) | |
274 | (unnumbered EOL-TEXT) | |
275 | (unnumberedsec EOL-TEXT) | |
276 | (subsection EOL-TEXT) | |
277 | (subsubsection EOL-TEXT) | |
278 | (appendixsubsec EOL-TEXT) | |
279 | (appendixsubsubsec EOL-TEXT) | |
280 | (unnumberedsubsec EOL-TEXT) | |
281 | (unnumberedsubsubsec EOL-TEXT) | |
282 | (chapheading EOL-TEXT) | |
283 | (majorheading EOL-TEXT) | |
284 | (heading EOL-TEXT) | |
285 | (subheading EOL-TEXT) | |
286 | (subsubheading EOL-TEXT) | |
287 | ||
288 | (deftpx EOL-TEXT-ARGS . (category name . attributes)) | |
289 | (defcvx EOL-TEXT-ARGS . (category class name)) | |
290 | (defivarx EOL-TEXT-ARGS . (class name)) | |
291 | (deftypeivarx EOL-TEXT-ARGS . (class data-type name)) | |
292 | (defopx EOL-TEXT-ARGS . (category class name . arguments)) | |
293 | (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments)) | |
294 | (defmethodx EOL-TEXT-ARGS . (class name . arguments)) | |
295 | (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments)) | |
296 | (defoptx EOL-TEXT-ARGS . (name)) | |
297 | (defvrx EOL-TEXT-ARGS . (category name)) | |
298 | (defvarx EOL-TEXT-ARGS . (name)) | |
299 | (deftypevrx EOL-TEXT-ARGS . (category data-type name)) | |
300 | (deftypevarx EOL-TEXT-ARGS . (data-type name)) | |
301 | (deffnx EOL-TEXT-ARGS . (category name . arguments)) | |
302 | (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments)) | |
303 | (defspecx EOL-TEXT-ARGS . (name . arguments)) | |
304 | (defmacx EOL-TEXT-ARGS . (name . arguments)) | |
305 | (defunx EOL-TEXT-ARGS . (name . arguments)) | |
306 | (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments)) | |
307 | ||
308 | ;; Indexing commands | |
309 | (cindex INDEX . entry) | |
310 | (findex INDEX . entry) | |
311 | (vindex INDEX . entry) | |
312 | (kindex INDEX . entry) | |
313 | (pindex INDEX . entry) | |
314 | (tindex INDEX . entry) | |
315 | ||
316 | ;; Environment commands (those that need @end) | |
317 | (texinfo ENVIRON . title) | |
318 | (ignore ENVIRON . ()) | |
319 | (ifinfo ENVIRON . ()) | |
320 | (iftex ENVIRON . ()) | |
321 | (ifhtml ENVIRON . ()) | |
322 | (ifxml ENVIRON . ()) | |
323 | (ifplaintext ENVIRON . ()) | |
324 | (ifnotinfo ENVIRON . ()) | |
325 | (ifnottex ENVIRON . ()) | |
326 | (ifnothtml ENVIRON . ()) | |
327 | (ifnotxml ENVIRON . ()) | |
328 | (ifnotplaintext ENVIRON . ()) | |
329 | (titlepage ENVIRON . ()) | |
330 | (menu ENVIRON . ()) | |
331 | (direntry ENVIRON . ()) | |
332 | (copying ENVIRON . ()) | |
333 | (example ENVIRON . ()) | |
334 | (smallexample ENVIRON . ()) | |
335 | (display ENVIRON . ()) | |
336 | (smalldisplay ENVIRON . ()) | |
337 | (verbatim ENVIRON . ()) | |
338 | (format ENVIRON . ()) | |
339 | (smallformat ENVIRON . ()) | |
340 | (lisp ENVIRON . ()) | |
341 | (smalllisp ENVIRON . ()) | |
342 | (cartouche ENVIRON . ()) | |
343 | (quotation ENVIRON . ()) | |
344 | ||
345 | (deftp ENVIRON . (category name . attributes)) | |
346 | (defcv ENVIRON . (category class name)) | |
347 | (defivar ENVIRON . (class name)) | |
348 | (deftypeivar ENVIRON . (class data-type name)) | |
349 | (defop ENVIRON . (category class name . arguments)) | |
350 | (deftypeop ENVIRON . (category class data-type name . arguments)) | |
351 | (defmethod ENVIRON . (class name . arguments)) | |
352 | (deftypemethod ENVIRON . (class data-type name . arguments)) | |
353 | (defopt ENVIRON . (name)) | |
354 | (defvr ENVIRON . (category name)) | |
355 | (defvar ENVIRON . (name)) | |
356 | (deftypevr ENVIRON . (category data-type name)) | |
357 | (deftypevar ENVIRON . (data-type name)) | |
358 | (deffn ENVIRON . (category name . arguments)) | |
359 | (deftypefn ENVIRON . (category data-type name . arguments)) | |
360 | (defspec ENVIRON . (name . arguments)) | |
361 | (defmac ENVIRON . (name . arguments)) | |
362 | (defun ENVIRON . (name . arguments)) | |
363 | (deftypefun ENVIRON . (data-type name . arguments)) | |
364 | ||
365 | (table TABLE-ENVIRON . (formatter)) | |
366 | (itemize TABLE-ENVIRON . (formatter)) | |
367 | (enumerate TABLE-ENVIRON . (start)) | |
368 | (ftable TABLE-ENVIRON . (formatter)) | |
369 | (vtable TABLE-ENVIRON . (formatter)))) | |
370 | ||
371 | (define command-depths | |
372 | '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4) | |
373 | (top . 0) (unnumbered . 1) (unnumberedsec . 2) | |
374 | (unnumberedsubsec . 3) (unnumberedsubsubsec . 4) | |
375 | (appendix . 1) (appendixsec . 2) (appendixsection . 2) | |
376 | (appendixsubsec . 3) (appendixsubsubsec . 4))) | |
377 | (define (texi-command-depth command max-depth) | |
378 | "Given the texinfo command @var{command}, return its nesting level, or | |
379 | @code{#f} if it nests too deep for @var{max-depth}. | |
380 | ||
381 | Examples: | |
382 | @example | |
05c29c5a AW |
383 | (texi-command-depth 'chapter 4) @result{} 1 |
384 | (texi-command-depth 'top 4) @result{} 0 | |
385 | (texi-command-depth 'subsection 4) @result{} 3 | |
386 | (texi-command-depth 'appendixsubsec 4) @result{} 3 | |
387 | (texi-command-depth 'subsection 2) @result{} #f | |
47f3ce52 AW |
388 | @end example" |
389 | (let ((depth (and=> (assq command command-depths) cdr))) | |
390 | (and depth (<= depth max-depth) depth))) | |
391 | ||
392 | ;; The % is for arguments | |
393 | (define (space-significant? command) | |
394 | (memq command | |
395 | '(example smallexample verbatim lisp smalllisp menu %))) | |
396 | ||
397 | ;; Like a DTD for texinfo | |
398 | (define (command-spec command) | |
399 | (or (assq command texi-command-specs) | |
400 | (parser-error #f "Unknown command" command))) | |
401 | ||
402 | (define (inline-content? content) | |
403 | (or (eq? content 'INLINE-TEXT) (eq? content 'INLINE-ARGS))) | |
404 | ||
405 | ||
406 | ;;======================================================================== | |
407 | ;; Lower-level parsers and scanners | |
408 | ;; | |
409 | ;; They deal with primitive lexical units (Names, whitespaces, tags) and | |
410 | ;; with pieces of more generic productions. Most of these parsers must | |
411 | ;; be called in appropriate context. For example, complete-start-command | |
412 | ;; must be called only when the @-command start has been detected and | |
413 | ;; its name token has been read. | |
414 | ||
415 | ;; Test if a string is made of only whitespace | |
416 | ;; An empty string is considered made of whitespace as well | |
417 | (define (string-whitespace? str) | |
418 | (or (string-null? str) | |
419 | (string-every char-whitespace? str))) | |
420 | ||
421 | ;; Like read-text-line, but allows EOF. | |
422 | (define read-eof-breaks '(*eof* #\return #\newline)) | |
423 | (define (read-eof-line port) | |
424 | (if (eof-object? (peek-char port)) | |
425 | (peek-char port) | |
426 | (let* ((line (next-token '() read-eof-breaks | |
427 | "reading a line" port)) | |
428 | (c (read-char port))) ; must be either \n or \r or EOF | |
429 | (if (and (eq? c #\return) (eq? (peek-char port) #\newline)) | |
430 | (read-char port)) ; skip \n that follows \r | |
431 | line))) | |
432 | ||
47f3ce52 AW |
433 | (define (skip-whitespace port) |
434 | (skip-while '(#\space #\tab #\return #\newline) port)) | |
435 | ||
436 | (define (skip-horizontal-whitespace port) | |
437 | (skip-while '(#\space #\tab) port)) | |
438 | ||
439 | ;; command ::= Letter+ | |
440 | ||
441 | ;; procedure: read-command PORT | |
442 | ;; | |
443 | ;; Read a command starting from the current position in the PORT and | |
444 | ;; return it as a symbol. | |
445 | (define (read-command port) | |
446 | (let ((first-char (peek-char port))) | |
447 | (or (char-alphabetic? first-char) | |
448 | (parser-error port "Nonalphabetic @-command char: '" first-char "'"))) | |
449 | (string->symbol | |
450 | (next-token-of | |
451 | (lambda (c) | |
452 | (cond | |
453 | ((eof-object? c) #f) | |
454 | ((char-alphabetic? c) c) | |
455 | (else #f))) | |
456 | port))) | |
457 | ||
458 | ;; A token is a primitive lexical unit. It is a record with two fields, | |
459 | ;; token-head and token-kind. | |
460 | ;; | |
461 | ;; Token types: | |
462 | ;; END The end of a texinfo command. If the command is ended by }, | |
463 | ;; token-head will be #f. Otherwise if the command is ended by | |
464 | ;; @end COMMAND, token-head will be COMMAND. As a special case, | |
465 | ;; @bye is the end of a special @texinfo command. | |
466 | ;; START The start of a texinfo command. The token-head will be a | |
467 | ;; symbol of the @-command name. | |
468 | ;; INCLUDE An @include directive. The token-head will be empty -- the | |
469 | ;; caller is responsible for reading the include file name. | |
470 | ;; ITEM @item commands have an irregular syntax. They end at the | |
471 | ;; next @item, or at the end of the environment. For that | |
472 | ;; read-command-token treats them specially. | |
473 | ||
474 | (define (make-token kind head) (cons kind head)) | |
475 | (define token? pair?) | |
476 | (define token-kind car) | |
477 | (define token-head cdr) | |
478 | ||
479 | ;; procedure: read-command-token PORT | |
480 | ;; | |
481 | ;; This procedure starts parsing of a command token. The current | |
482 | ;; position in the stream must be #\@. This procedure scans enough of | |
483 | ;; the input stream to figure out what kind of a command token it is | |
484 | ;; seeing. The procedure returns a token structure describing the token. | |
485 | ||
486 | (define (read-command-token port) | |
487 | (assert-curr-char '(#\@) "start of the command" port) | |
488 | (let ((peeked (peek-char port))) | |
489 | (cond | |
490 | ((memq peeked '(#\! #\. #\? #\@ #\\ #\{ #\})) | |
491 | ;; @-commands that escape characters | |
492 | (make-token 'STRING (string (read-char port)))) | |
493 | (else | |
494 | (let ((name (read-command port))) | |
495 | (case name | |
496 | ((end) | |
497 | ;; got an ending tag | |
498 | (let ((command (string-trim-both | |
499 | (read-eof-line port)))) | |
500 | (or (and (not (string-null? command)) | |
501 | (string-every char-alphabetic? command)) | |
502 | (parser-error port "malformed @end" command)) | |
503 | (make-token 'END (string->symbol command)))) | |
504 | ((bye) | |
505 | ;; the end of the top | |
506 | (make-token 'END 'texinfo)) | |
507 | ((item) | |
508 | (make-token 'ITEM 'item)) | |
509 | ((include) | |
510 | (make-token 'INCLUDE #f)) | |
511 | (else | |
512 | (make-token 'START name)))))))) | |
513 | ||
514 | ;; procedure+: read-verbatim-body PORT STR-HANDLER SEED | |
515 | ;; | |
516 | ;; This procedure must be called after we have read a string | |
517 | ;; "@verbatim\n" that begins a verbatim section. The current position | |
518 | ;; must be the first position of the verbatim body. This function reads | |
519 | ;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a | |
520 | ;; character data consumer. | |
521 | ;; | |
522 | ;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED. | |
523 | ;; The first STRING1 argument to STR-HANDLER never contains a newline. | |
524 | ;; The second STRING2 argument often will. On the first invocation of the | |
525 | ;; STR-HANDLER, the seed is the one passed to read-verbatim-body | |
526 | ;; as the third argument. The result of this first invocation will be | |
527 | ;; passed as the seed argument to the second invocation of the line | |
528 | ;; consumer, and so on. The result of the last invocation of the | |
529 | ;; STR-HANDLER is returned by the read-verbatim-body. Note a | |
530 | ;; similarity to the fundamental 'fold' iterator. | |
531 | ;; | |
532 | ;; Within a verbatim section all characters are taken at their face | |
533 | ;; value. It ends with "\n@end verbatim(\r)?\n". | |
534 | ||
535 | ;; Must be called right after the newline after @verbatim. | |
536 | (define (read-verbatim-body port str-handler seed) | |
537 | (let loop ((seed seed)) | |
538 | (let ((fragment (next-token '() '(#\newline) | |
539 | "reading verbatim" port))) | |
540 | ;; We're reading the char after the 'fragment', which is | |
541 | ;; #\newline. | |
542 | (read-char port) | |
543 | (if (string=? fragment "@end verbatim") | |
544 | seed | |
545 | (loop (str-handler fragment "\n" seed)))))) | |
546 | ||
547 | ;; procedure+: read-arguments PORT | |
548 | ;; | |
549 | ;; This procedure reads and parses a production ArgumentList. | |
550 | ;; ArgumentList ::= S* Argument (S* , S* Argument)* S* | |
551 | ;; Argument ::= ([^@{},])* | |
552 | ;; | |
553 | ;; Arguments are the things in braces, i.e @ref{my node} has one | |
554 | ;; argument, "my node". Most commands taking braces actually don't have | |
555 | ;; arguments, they process text. For example, in | |
556 | ;; @emph{@strong{emphasized}}, the emph takes text, because the parse | |
557 | ;; continues into the braces. | |
558 | ;; | |
559 | ;; Any whitespace within Argument is replaced with a single space. | |
560 | ;; Whitespace around an Argument is trimmed. | |
561 | ;; | |
562 | ;; The procedure returns a list of arguments. Afterwards the current | |
563 | ;; character will be after the final #\}. | |
564 | ||
565 | (define (read-arguments port stop-char) | |
566 | (define (split str) | |
567 | (read-char port) ;; eat the delimiter | |
568 | (let ((ret (map (lambda (x) (if (string-null? x) #f x)) | |
569 | (map string-trim-both (string-split str #\,))))) | |
570 | (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret))) | |
571 | '() | |
572 | ret))) | |
573 | (split (next-token '() (list stop-char) | |
574 | "arguments of @-command" port))) | |
575 | ||
576 | ;; procedure+: complete-start-command COMMAND PORT | |
577 | ;; | |
578 | ;; This procedure is to complete parsing of an @-command. The procedure | |
579 | ;; must be called after the command token has been read. COMMAND is a | |
580 | ;; TAG-NAME. | |
581 | ;; | |
582 | ;; This procedure returns several values: | |
583 | ;; COMMAND: a symbol. | |
584 | ;; ARGUMENTS: command's arguments, as an alist. | |
585 | ;; CONTENT-MODEL: the content model of the command. | |
586 | ;; | |
587 | ;; On exit, the current position in PORT will depend on the CONTENT-MODEL. | |
588 | ;; | |
589 | ;; Content model Port position | |
590 | ;; ============= ============= | |
591 | ;; INLINE-TEXT One character after the #\{. | |
592 | ;; INLINE-ARGS The first character after the #\}. | |
593 | ;; EOL-TEXT The first non-whitespace character after the command. | |
594 | ;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT | |
595 | ;; The first character on the next line. | |
596 | ;; PARAGRAPH, ITEM, EMPTY-COMMAND | |
597 | ;; The first character after the command. | |
598 | ||
599 | (define (arguments->attlist port args arg-names) | |
600 | (let loop ((in args) (names arg-names) (opt? #f) (out '())) | |
601 | (cond | |
602 | ((symbol? names) ;; a rest arg | |
603 | (reverse (if (null? in) out (acons names in out)))) | |
604 | ((and (not (null? names)) (eq? (car names) #:opt)) | |
605 | (loop in (cdr names) #t out)) | |
606 | ((null? in) | |
607 | (if (or (null? names) opt?) | |
608 | (reverse out) | |
609 | (parser-error port "@-command expected more arguments:" | |
610 | args arg-names names))) | |
611 | ((null? names) | |
612 | (parser-error port "@-command didn't expect more arguments:" in)) | |
613 | ((not (car in)) | |
614 | (or (and opt? (loop (cdr in) (cdr names) opt? out)) | |
615 | (parser-error "@-command missing required argument" | |
616 | (car names)))) | |
617 | (else | |
618 | (loop (cdr in) (cdr names) opt? | |
619 | (cons (list (car names) (car in)) out)))))) | |
620 | ||
621 | (define (parse-table-args command port) | |
622 | (let* ((line (string-trim-both (read-text-line port))) | |
623 | (length (string-length line))) | |
624 | (define (get-formatter) | |
625 | (or (and (not (zero? length)) | |
626 | (eq? (string-ref line 0) #\@) | |
627 | (let ((f (string->symbol (substring line 1)))) | |
628 | (or (inline-content? (cadr (command-spec f))) | |
629 | (parser-error | |
630 | port "@item formatter must be INLINE" f)) | |
631 | f)) | |
05c29c5a | 632 | (parser-error port "Invalid @item formatter" line))) |
47f3ce52 AW |
633 | (case command |
634 | ((enumerate) | |
635 | (if (zero? length) | |
636 | '() | |
637 | `((start | |
638 | ,(if (or (and (eq? length 1) | |
639 | (char-alphabetic? (string-ref line 0))) | |
640 | (string-every char-numeric? line)) | |
641 | line | |
642 | (parser-error | |
643 | port "Invalid enumerate start" line)))))) | |
644 | ((itemize) | |
645 | `((bullet | |
646 | ,(or (and (eq? length 1) line) | |
647 | (and (string-null? line) '(bullet)) | |
648 | (list (get-formatter)))))) | |
649 | (else ;; tables of various varieties | |
650 | `((formatter (,(get-formatter)))))))) | |
651 | ||
652 | (define (complete-start-command command port) | |
653 | (define (get-arguments type arg-names stop-char) | |
654 | (arguments->attlist port (read-arguments port stop-char) arg-names)) | |
655 | ||
656 | (let* ((spec (command-spec command)) | |
657 | (type (cadr spec)) | |
658 | (arg-names (cddr spec))) | |
659 | (case type | |
31d59769 AW |
660 | ((ALIAS) |
661 | (complete-start-command arg-names port)) | |
47f3ce52 AW |
662 | ((INLINE-TEXT) |
663 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
664 | (values command '() type)) | |
665 | ((INLINE-ARGS) | |
666 | (assert-curr-char '(#\{) "Inline element lacks {" port) | |
667 | (values command (get-arguments type arg-names #\}) type)) | |
668 | ((EOL-ARGS) | |
669 | (values command (get-arguments type arg-names #\newline) type)) | |
670 | ((ENVIRON ENTRY INDEX) | |
671 | (skip-horizontal-whitespace port) | |
672 | (values command (parse-environment-args command port) type)) | |
673 | ((TABLE-ENVIRON) | |
674 | (skip-horizontal-whitespace port) | |
675 | (values command (parse-table-args command port) type)) | |
676 | ((EOL-TEXT) | |
677 | (skip-horizontal-whitespace port) | |
678 | (values command '() type)) | |
679 | ((EOL-TEXT-ARGS) | |
680 | (skip-horizontal-whitespace port) | |
681 | (values command (parse-eol-text-args command port) type)) | |
682 | ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT) | |
683 | (values command '() type)) | |
684 | (else ;; INCLUDE shouldn't get here | |
685 | (parser-error port "can't happen"))))) | |
686 | ||
687 | ;;----------------------------------------------------------------------------- | |
688 | ;; Higher-level parsers and scanners | |
689 | ;; | |
690 | ;; They parse productions corresponding entire @-commands. | |
691 | ||
692 | ;; Only reads @settitle, leaves it to the command parser to finish | |
693 | ;; reading the title. | |
694 | (define (take-until-settitle port) | |
695 | (or (find-string-from-port? "\n@settitle " port) | |
696 | (parser-error port "No \\n@settitle found")) | |
697 | (skip-horizontal-whitespace port) | |
698 | (and (eq? (peek-char port) #\newline) | |
699 | (parser-error port "You have a @settitle, but no title"))) | |
700 | ||
701 | ;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED | |
702 | ;; | |
703 | ;; This procedure is to read the CharData of a texinfo document. | |
704 | ;; | |
705 | ;; text ::= (CharData | Command)* | |
706 | ;; | |
707 | ;; The procedure reads CharData and stops at @-commands (or | |
708 | ;; environments). It also stops at an open or close brace. | |
709 | ;; | |
710 | ;; port | |
711 | ;; a PORT to read | |
712 | ;; expect-eof? | |
713 | ;; a boolean indicating if EOF is normal, i.e., the character | |
714 | ;; data may be terminated by the EOF. EOF is normal | |
715 | ;; while processing the main document. | |
716 | ;; preserve-ws? | |
717 | ;; a boolean indicating if we are within a whitespace-preserving | |
718 | ;; environment. If #t, suppress paragraph detection. | |
719 | ;; str-handler | |
720 | ;; a STR-HANDLER, see read-verbatim-body | |
721 | ;; seed | |
722 | ;; an argument passed to the first invocation of STR-HANDLER. | |
723 | ;; | |
724 | ;; The procedure returns two results: SEED and TOKEN. The SEED is the | |
725 | ;; result of the last invocation of STR-HANDLER, or the original seed if | |
726 | ;; STR-HANDLER was never called. | |
727 | ;; | |
728 | ;; TOKEN can be either an eof-object (this can happen only if expect-eof? | |
729 | ;; was #t), or a texinfo token denoting the start or end of a tag. | |
730 | ||
731 | ;; read-char-data port expect-eof? preserve-ws? str-handler seed | |
732 | (define read-char-data | |
733 | (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline))) | |
734 | (define (handle str-handler str1 str2 seed) | |
735 | (if (and (string-null? str1) (string-null? str2)) | |
736 | seed | |
737 | (str-handler str1 str2 seed))) | |
738 | ||
739 | (lambda (port expect-eof? preserve-ws? str-handler seed) | |
740 | (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof))) | |
741 | (let loop ((seed seed)) | |
742 | (let* ((fragment (next-token '() end-chars "reading char data" port)) | |
743 | (term-char (peek-char port))) ; one of end-chars | |
744 | (cond | |
745 | ((eof-object? term-char) ; only if expect-eof? | |
746 | (values (handle str-handler fragment "" seed) term-char)) | |
747 | ((memq term-char '(#\@ #\{ #\})) | |
748 | (values (handle str-handler fragment "" seed) | |
749 | (case term-char | |
750 | ((#\@) (read-command-token port)) | |
751 | ((#\{) (make-token 'START '*braces*)) | |
752 | ((#\}) (read-char port) (make-token 'END #f))))) | |
753 | ((eq? term-char #\newline) | |
754 | ;; Always significant, unless directly before an end token. | |
755 | (let ((c (peek-next-char port))) | |
756 | (cond | |
757 | ((eof-object? c) | |
758 | (or expect-eof? | |
759 | (parser-error port "EOF while reading char data")) | |
760 | (values (handle str-handler fragment "" seed) c)) | |
761 | ((eq? c #\@) | |
762 | (let* ((token (read-command-token port)) | |
763 | (end? (eq? (token-kind token) 'END))) | |
764 | (values | |
765 | (handle str-handler fragment (if end? "" " ") seed) | |
766 | token))) | |
767 | ((and (not preserve-ws?) (eq? c #\newline)) | |
768 | ;; paragraph-separator ::= #\newline #\newline+ | |
769 | (skip-while '(#\newline) port) | |
770 | (skip-horizontal-whitespace port) | |
771 | (values (handle str-handler fragment "" seed) | |
772 | (make-token 'PARA 'para))) | |
773 | (else | |
774 | (loop (handle str-handler fragment | |
775 | (if preserve-ws? "\n" " ") seed))))))))))))) | |
776 | ||
777 | ; procedure+: assert-token TOKEN KIND NAME | |
778 | ; Make sure that TOKEN is of anticipated KIND and has anticipated NAME | |
779 | (define (assert-token token kind name) | |
780 | (or (and (token? token) | |
781 | (eq? kind (token-kind token)) | |
782 | (equal? name (token-head token))) | |
783 | (parser-error #f "Expecting @end for " name ", got " token))) | |
784 | ||
785 | ;;======================================================================== | |
786 | ;; Highest-level parsers: Texinfo to SXML | |
787 | ||
788 | ;; These parsers are a set of syntactic forms to instantiate a SSAX | |
789 | ;; parser. The user tells what to do with the parsed character and | |
790 | ;; element data. These latter handlers determine if the parsing follows a | |
791 | ;; SAX or a DOM model. | |
792 | ||
793 | ;; syntax: make-command-parser fdown fup str-handler | |
794 | ||
795 | ;; Create a parser to parse and process one element, including its | |
796 | ;; character content or children elements. The parser is typically | |
797 | ;; applied to the root element of a document. | |
798 | ||
799 | ;; fdown | |
800 | ;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED | |
801 | ;; | |
802 | ;; This procedure is to generate the seed to be passed to handlers | |
803 | ;; that process the content of the element. This is the function | |
804 | ;; identified as 'fdown' in the denotational semantics of the XML | |
805 | ;; parser given in the title comments to (sxml ssax). | |
806 | ;; | |
807 | ;; fup | |
808 | ;; procedure COMMAND ARGUMENTS PARENT-SEED SEED | |
809 | ;; | |
810 | ;; This procedure is called when parsing of COMMAND is finished. | |
811 | ;; The SEED is the result from the last content parser (or from | |
812 | ;; fdown if the element has the empty content). PARENT-SEED is the | |
813 | ;; same seed as was passed to fdown. The procedure is to generate a | |
814 | ;; seed that will be the result of the element parser. This is the | |
815 | ;; function identified as 'fup' in the denotational semantics of | |
816 | ;; the XML parser given in the title comments to (sxml ssax). | |
817 | ;; | |
818 | ;; str-handler | |
819 | ;; A STR-HANDLER, see read-verbatim-body | |
820 | ;; | |
821 | ||
822 | ;; The generated parser is a | |
823 | ;; procedure COMMAND PORT SEED | |
824 | ;; | |
825 | ;; The procedure must be called *after* the command token has been read. | |
826 | ||
827 | (define (read-include-file-name port) | |
828 | (let ((x (string-trim-both (read-eof-line port)))) | |
829 | (if (string-null? x) | |
830 | (error "no file listed") | |
831 | x))) ;; fixme: should expand @value{} references | |
832 | ||
833 | (define (sxml->node-name sxml) | |
834 | "Turn some sxml string into a valid node name." | |
835 | (let loop ((in (string->list (sxml->string sxml))) (out '())) | |
836 | (if (null? in) | |
837 | (apply string (reverse out)) | |
838 | (if (memq (car in) '(#\{ #\} #\@ #\,)) | |
839 | (loop (cdr in) out) | |
840 | (loop (cdr in) (cons (car in) out)))))) | |
841 | ||
842 | (define (index command arguments fdown fup parent-seed) | |
843 | (case command | |
844 | ((deftp defcv defivar deftypeivar defop deftypeop defmethod | |
845 | deftypemethod defopt defvr defvar deftypevr deftypevar deffn | |
846 | deftypefn defspec defmac defun deftypefun) | |
847 | (let ((args `((name ,(string-append (symbol->string command) "-" | |
848 | (cadr (assq 'name arguments))))))) | |
849 | (fup 'anchor args parent-seed | |
850 | (fdown 'anchor args 'INLINE-ARGS '())))) | |
851 | ((cindex findex vindex kindex pindex tindex) | |
852 | (let ((args `((name ,(string-append (symbol->string command) "-" | |
853 | (sxml->node-name | |
854 | (assq 'entry arguments))))))) | |
855 | (fup 'anchor args parent-seed | |
856 | (fdown 'anchor args 'INLINE-ARGS '())))) | |
857 | (else parent-seed))) | |
858 | ||
859 | (define (make-command-parser fdown fup str-handler) | |
860 | (lambda (command port seed) | |
861 | (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed)) | |
862 | (let*-values (((command arguments expected-content) | |
863 | (complete-start-command command port))) | |
864 | (let* ((parent-seed (index command arguments fdown fup parent-seed)) | |
865 | (seed (fdown command arguments expected-content parent-seed)) | |
866 | (eof-closes? (or (memq command '(texinfo para *fragment*)) | |
867 | (eq? expected-content 'EOL-TEXT))) | |
868 | (sig-ws? (or sig-ws? (space-significant? command))) | |
869 | (up (lambda (s) (fup command arguments parent-seed s))) | |
870 | (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s))) | |
871 | (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s))))) | |
872 | ||
873 | (define (port-for-content) | |
874 | (if (eq? expected-content 'EOL-TEXT) | |
875 | (call-with-input-string (read-text-line port) identity) | |
876 | port)) | |
877 | ||
878 | (cond | |
879 | ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX | |
880 | EOL-TEXT-ARGS)) | |
881 | ;; empty or finished by complete-start-command | |
882 | (up seed)) | |
883 | ((eq? command 'verbatim) | |
884 | (up (read-verbatim-body port str-handler seed))) | |
885 | (else | |
886 | (let loop ((port (port-for-content)) | |
887 | (expect-eof? eof-closes?) | |
888 | (end-para identity) | |
889 | (need-break? (and (not sig-ws?) | |
890 | (memq expected-content | |
891 | '(ENVIRON TABLE-ENVIRON | |
892 | ENTRY ITEM FRAGMENT)))) | |
893 | (seed seed)) | |
894 | (cond | |
895 | ((and need-break? (or sig-ws? (skip-whitespace port)) | |
896 | (not (memq (peek-char port) '(#\@ #\}))) | |
897 | (not (eof-object? (peek-char port)))) | |
898 | ;; Even if we have an @, it might be inline -- check | |
899 | ;; that later | |
900 | (let ((seed (end-para seed))) | |
901 | (loop port expect-eof? (make-end-para seed) #f | |
902 | (new-para seed)))) | |
903 | (else | |
904 | (let*-values (((seed token) | |
905 | (read-char-data | |
906 | port expect-eof? sig-ws? str-handler seed))) | |
907 | (cond | |
908 | ((eof-object? token) | |
909 | (case expect-eof? | |
910 | ((include #f) (end-para seed)) | |
911 | (else (up (end-para seed))))) | |
912 | (else | |
913 | (case (token-kind token) | |
914 | ((STRING) | |
915 | ;; this is only @-commands that escape | |
916 | ;; characters: @}, @@, @{ -- new para if need-break | |
917 | (let ((seed ((if need-break? end-para identity) seed))) | |
918 | (loop port expect-eof? | |
919 | (if need-break? (make-end-para seed) end-para) #f | |
920 | (str-handler (token-head token) "" | |
921 | ((if need-break? new-para identity) | |
922 | seed))))) | |
923 | ((END) | |
924 | ;; The end will only have a name if it's for an | |
925 | ;; environment | |
926 | (cond | |
927 | ((memq command '(item entry)) | |
928 | (let ((spec (command-spec (token-head token)))) | |
929 | (or (eq? (cadr spec) 'TABLE-ENVIRON) | |
930 | (parser-error | |
931 | port "@item not ended by @end table/enumerate/itemize" | |
932 | token)))) | |
933 | ((eq? expected-content 'ENVIRON) | |
934 | (assert-token token 'END command))) | |
935 | (up (end-para seed))) | |
936 | ((ITEM) | |
937 | (cond | |
938 | ((memq command '(enumerate itemize)) | |
939 | (up (visit 'item port sig-ws? (end-para seed)))) | |
940 | ((eq? expected-content 'TABLE-ENVIRON) | |
941 | (up (visit 'entry port sig-ws? (end-para seed)))) | |
942 | ((memq command '(item entry)) | |
943 | (visit command port sig-ws? (up (end-para seed)))) | |
944 | (else | |
945 | (parser-error | |
946 | port "@item must be within a table environment" | |
947 | command)))) | |
948 | ((PARA) | |
949 | ;; examine valid paragraphs? | |
950 | (loop port expect-eof? end-para (not sig-ws?) seed)) | |
951 | ((INCLUDE) | |
952 | ;; Recurse for include files | |
953 | (let ((seed (call-with-file-and-dir | |
954 | (read-include-file-name port) | |
955 | (lambda (port) | |
956 | (loop port 'include end-para | |
957 | need-break? seed))))) | |
958 | (loop port expect-eof? end-para need-break? seed))) | |
959 | ((START) ; Start of an @-command | |
960 | (let* ((head (token-head token)) | |
961 | (type (cadr (command-spec head))) | |
962 | (inline? (inline-content? type)) | |
963 | (seed ((if (and inline? (not need-break?)) | |
964 | identity end-para) seed)) | |
965 | (end-para (if inline? | |
966 | (if need-break? (make-end-para seed) | |
967 | end-para) | |
968 | identity)) | |
969 | (new-para (if (and inline? need-break?) | |
970 | new-para identity))) | |
971 | (loop port expect-eof? end-para (not inline?) | |
972 | (visit head port sig-ws? (new-para seed))))) | |
973 | (else | |
974 | (parser-error port "Unknown token type" token)))))))))))))))) | |
975 | ||
976 | ;; procedure: reverse-collect-str-drop-ws fragments | |
977 | ;; | |
978 | ;; Given the list of fragments (some of which are text strings), reverse | |
979 | ;; the list and concatenate adjacent text strings. We also drop | |
980 | ;; "unsignificant" whitespace, that is, whitespace in front, behind and | |
981 | ;; between elements. The whitespace that is included in character data | |
982 | ;; is not affected. | |
983 | (define (reverse-collect-str-drop-ws fragments) | |
984 | (cond | |
985 | ((null? fragments) ; a shortcut | |
986 | '()) | |
987 | ((and (string? (car fragments)) ; another shortcut | |
988 | (null? (cdr fragments)) ; remove single ws-only string | |
989 | (string-whitespace? (car fragments))) | |
990 | '()) | |
991 | (else | |
992 | (let loop ((fragments fragments) (result '()) (strs '()) | |
993 | (all-whitespace? #t)) | |
994 | (cond | |
995 | ((null? fragments) | |
996 | (if all-whitespace? | |
997 | result ; remove leading ws | |
998 | (cons (apply string-append strs) result))) | |
999 | ((string? (car fragments)) | |
1000 | (loop (cdr fragments) result (cons (car fragments) strs) | |
1001 | (and all-whitespace? | |
1002 | (string-whitespace? (car fragments))))) | |
1003 | (else | |
1004 | (loop (cdr fragments) | |
1005 | (cons | |
1006 | (car fragments) | |
1007 | (cond | |
1008 | ((null? strs) result) | |
1009 | (all-whitespace? | |
1010 | (if (null? result) | |
1011 | result ; remove trailing whitespace | |
1012 | (cons " " result))); replace interstitial ws with | |
1013 | ; one space | |
1014 | (else | |
1015 | (cons (apply string-append strs) result)))) | |
1016 | '() #t))))))) | |
1017 | ||
1018 | (define (make-dom-parser) | |
1019 | (make-command-parser | |
1020 | (lambda (command args content seed) ; fdown | |
1021 | '()) | |
1022 | (lambda (command args parent-seed seed) ; fup | |
1023 | (let ((seed (reverse-collect-str-drop-ws seed))) | |
1024 | (acons command | |
1025 | (if (null? args) seed (acons '% args seed)) | |
1026 | parent-seed))) | |
1027 | (lambda (string1 string2 seed) ; str-handler | |
1028 | (if (string-null? string2) | |
1029 | (cons string1 seed) | |
1030 | (cons* string2 string1 seed))))) | |
1031 | ||
1032 | (define parse-environment-args | |
1033 | (let ((parser (make-dom-parser))) | |
1034 | ;; duplicate arguments->attlist to avoid unnecessary splitting | |
1035 | (lambda (command port) | |
1036 | (let ((args (cdar (parser '*ENVIRON-ARGS* port '()))) | |
1037 | (arg-names (cddr (command-spec command)))) | |
1038 | (cond | |
1039 | ((not arg-names) | |
1040 | (if (null? args) '() | |
1041 | (parser-error port "@-command doesn't take args" command))) | |
1042 | ((eq? arg-names #t) | |
1043 | (list (cons 'arguments args))) | |
1044 | (else | |
1045 | (let loop ((args args) (arg-names arg-names) (out '())) | |
1046 | (cond | |
1047 | ((null? arg-names) | |
1048 | (if (null? args) (reverse! out) | |
1049 | (parser-error port "@-command didn't expect more args" | |
1050 | command args))) | |
1051 | ((symbol? arg-names) | |
1052 | (reverse! (acons arg-names args out))) | |
1053 | ((null? args) | |
1054 | (parser-error port "@-command expects more args" | |
1055 | command arg-names)) | |
1056 | ((and (string? (car args)) (string-index (car args) #\space)) | |
1057 | => (lambda (i) | |
1058 | (let ((rest (substring/shared (car args) (1+ i)))) | |
1059 | (if (zero? i) | |
1060 | (loop (cons rest (cdr args)) arg-names out) | |
1061 | (loop (cons rest (cdr args)) (cdr arg-names) | |
1062 | (cons (list (car arg-names) | |
1063 | (substring (car args) 0 i)) | |
1064 | out)))))) | |
1065 | (else | |
1066 | (loop (cdr args) (cdr arg-names) | |
1067 | (if (and (pair? (car args)) (eq? (caar args) '*braces*)) | |
1068 | (acons (car arg-names) (cdar args) out) | |
1069 | (cons (list (car arg-names) (car args)) out)))))))))))) | |
1070 | ||
1071 | (define (parse-eol-text-args command port) | |
1072 | ;; perhaps parse-environment-args should be named more | |
1073 | ;; generically. | |
1074 | (parse-environment-args command port)) | |
1075 | ||
1076 | ;; procedure: texi-fragment->stexi STRING | |
1077 | ;; | |
1078 | ;; A DOM parser for a texinfo fragment STRING. | |
1079 | ;; | |
1080 | ;; The procedure returns an SXML tree headed by the special tag, | |
1081 | ;; *fragment*. | |
1082 | ||
1083 | (define (texi-fragment->stexi string-or-port) | |
1084 | "Parse the texinfo commands in @var{string-or-port}, and return the | |
1085 | resultant stexi tree. The head of the tree will be the special command, | |
1086 | @code{*fragment*}." | |
1087 | (define (parse port) | |
1088 | (postprocess (car ((make-dom-parser) '*fragment* port '())))) | |
1089 | (if (input-port? string-or-port) | |
1090 | (parse string-or-port) | |
1091 | (call-with-input-string string-or-port parse))) | |
1092 | ||
1093 | ;; procedure: texi->stexi PORT | |
1094 | ;; | |
1095 | ;; This is an instance of a SSAX parser above that returns an SXML | |
1096 | ;; representation of the texinfo document ready to be read at PORT. | |
1097 | ;; | |
1098 | ;; The procedure returns an SXML tree. The port points to the | |
1099 | ;; first character after the @bye, or to the end of the file. | |
1100 | ||
1101 | (define (texi->stexi port) | |
1102 | "Read a full texinfo document from @var{port} and return the parsed | |
1103 | stexi tree. The parsing will start at the @code{@@settitle} and end at | |
1104 | @code{@@bye} or EOF." | |
1105 | (let ((parser (make-dom-parser))) | |
1106 | (take-until-settitle port) | |
1107 | (postprocess (car (parser 'texinfo port '()))))) | |
1108 | ||
1109 | (define (car-eq? x y) (and (pair? x) (eq? (car x) y))) | |
1110 | (define (make-contents tree) | |
1111 | (define (lp in out depth) | |
1112 | (cond | |
1113 | ((null? in) (values in (cons 'enumerate (reverse! out)))) | |
1114 | ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4)) | |
1115 | => (lambda (new-depth) | |
1116 | (let ((node-name (and (car-eq? (car in) 'node) | |
1117 | (cadr (assq 'name (cdadar in)))))) | |
1118 | (cond | |
1119 | ((< new-depth depth) | |
1120 | (values in (cons 'enumerate (reverse! out)))) | |
1121 | ((> new-depth depth) | |
1122 | (let ((out-cdr (if (null? out) '() (cdr out))) | |
1123 | (out-car (if (null? out) (list 'item) (car out)))) | |
1124 | (let*-values (((new-in new-out) (lp in '() (1+ depth)))) | |
1125 | (lp new-in | |
1126 | (cons (append out-car (list new-out)) out-cdr) | |
1127 | depth)))) | |
1128 | (else ;; same depth | |
1129 | (lp (cddr in) | |
1130 | (cons | |
1131 | `(item (para | |
1132 | ,@(if node-name | |
1133 | `((ref (% (node ,node-name)))) | |
1134 | (cdadr in)))) | |
1135 | out) | |
1136 | depth)))))) | |
1137 | (else (lp (cdr in) out depth)))) | |
1138 | (let*-values (((_ contents) (lp tree '() 1))) | |
1139 | `((chapheading "Table of Contents") ,contents))) | |
1140 | ||
1141 | (define (trim-whitespace str trim-left? trim-right?) | |
1142 | (let* ((left-space? (and (not trim-left?) | |
1143 | (string-prefix? " " str))) | |
1144 | (right-space? (and (not trim-right?) | |
1145 | (string-suffix? " " str))) | |
1146 | (tail (append! (string-tokenize str) | |
1147 | (if right-space? '("") '())))) | |
1148 | (string-join (if left-space? (cons "" tail) tail)))) | |
1149 | ||
1150 | (define (postprocess tree) | |
1151 | (define (loop in out state first? sig-ws?) | |
1152 | (cond | |
1153 | ((null? in) | |
1154 | (values (reverse! out) state)) | |
1155 | ((string? (car in)) | |
1156 | (loop (cdr in) | |
1157 | (cons (if sig-ws? (car in) | |
1158 | (trim-whitespace (car in) first? (null? (cdr in)))) | |
1159 | out) | |
1160 | state #f sig-ws?)) | |
1161 | ((pair? (car in)) | |
1162 | (case (caar in) | |
1163 | ((set) | |
1164 | (if (null? (cdar in)) (error "@set missing arguments" in)) | |
1165 | (if (string? (cadar in)) | |
1166 | (let ((i (string-index (cadar in) #\space))) | |
1167 | (if i | |
1168 | (loop (cdr in) out | |
1169 | (acons (substring (cadar in) 0 i) | |
1170 | (cons (substring (cadar in) (1+ i)) (cddar in)) | |
1171 | state) | |
1172 | #f sig-ws?) | |
1173 | (loop (cdr in) out (acons (cadar in) (cddar in) state) | |
1174 | #f sig-ws?))) | |
1175 | (error "expected a constant to define for @set" in))) | |
1176 | ((value) | |
1177 | (loop (fold-right cons (cdr in) | |
1178 | (or (and=> | |
1179 | (assoc (cadr (assq 'key (cdadar in))) state) cdr) | |
1180 | (error "unknown value" (cdadar in) state))) | |
1181 | out | |
1182 | state #f sig-ws?)) | |
1183 | ((copying) | |
1184 | (loop (cdr in) out (cons (car in) state) #f sig-ws?)) | |
1185 | ((insertcopying) | |
1186 | (loop (fold-right cons (cdr in) | |
1187 | (or (cdr (assoc 'copying state)) | |
1188 | (error "copying isn't set yet"))) | |
1189 | out | |
1190 | state #f sig-ws?)) | |
1191 | ((contents) | |
1192 | (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?)) | |
1193 | (else | |
1194 | (let*-values (((kid-out state) | |
1195 | (loop (car in) '() state #t | |
1196 | (or sig-ws? (space-significant? (caar in)))))) | |
1197 | (loop (cdr in) (cons kid-out out) state #f sig-ws?))))) | |
1198 | (else ; a symbol | |
1199 | (loop (cdr in) (cons (car in) out) state #t sig-ws?)))) | |
1200 | ||
1201 | (call-with-values | |
1202 | (lambda () (loop tree '() '() #t #f)) | |
1203 | (lambda (out state) out))) | |
1204 | ||
1205 | ;; Replace % with texinfo-arguments. | |
1206 | (define (stexi->sxml tree) | |
1207 | "Transform the stexi tree @var{tree} into sxml. This involves | |
1208 | replacing the @code{%} element that keeps the texinfo arguments with an | |
1209 | element for each argument. | |
1210 | ||
1211 | FIXME: right now it just changes % to @code{texinfo-arguments} -- that | |
1212 | doesn't hang with the idea of making a dtd at some point" | |
1213 | (pre-post-order | |
1214 | tree | |
1215 | `((% . ,(lambda (x . t) (cons 'texinfo-arguments t))) | |
1216 | (*text* . ,(lambda (x t) t)) | |
1217 | (*default* . ,(lambda (x . t) (cons x t)))))) | |
1218 | ||
1219 | ;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5 | |
1220 | ;;; texinfo.scm ends here |