1 ;;;; (texinfo) -- parsing of texinfo into SXML
3 ;;;; Copyright (C) 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 ;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com>
5 ;;;; Copyright (C) 2001,2002 Oleg Kiselyov <oleg at pobox dot com>
7 ;;;; This file is based on SSAX's SSAX.scm.
9 ;;;; This library is free software; you can redistribute it and/or
10 ;;;; modify it under the terms of the GNU Lesser General Public
11 ;;;; License as published by the Free Software Foundation; either
12 ;;;; version 3 of the License, or (at your option) any later version.
14 ;;;; This library is distributed in the hope that it will be useful,
15 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;;;; Lesser General Public License for more details.
19 ;;;; You should have received a copy of the GNU Lesser General Public
20 ;;;; License along with this library; if not, write to the Free Software
21 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 ;; @subheading Texinfo processing in scheme
27 ;; This module parses texinfo into SXML. TeX will always be the
28 ;; processor of choice for print output, of course. However, although
29 ;; @code{makeinfo} works well for info, its output in other formats is
30 ;; not very customizable, and the program is not extensible as a whole.
31 ;; This module aims to provide an extensible framework for texinfo
32 ;; processing that integrates texinfo into the constellation of SXML
35 ;; @subheading Notes on the SXML vocabulary
37 ;; Consider the following texinfo fragment:
40 ;; @@deffn Primitive set-car! pair value
45 ;; Logically, the category (Primitive), name (set-car!), and arguments
46 ;; (pair value) are ``attributes'' of the deffn, with the description as
47 ;; the content. However, texinfo allows for @@-commands within the
48 ;; arguments to an environment, like @code{@@deffn}, which means that
49 ;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand,
50 ;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are
51 ;; called ``arguments'', and are grouped under the special element, `%'.
53 ;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In
54 ;; the interests of interoperability, this module provides a conversion
55 ;; function to replace the `%' with `texinfo-arguments'.
59 ;; Comparison to xml output of texinfo (which is rather undocumented):
60 ;; Doesn't conform to texinfo dtd
61 ;; No DTD at all, in fact :-/
62 ;; Actually outputs valid xml, after transforming %
63 ;; Slower (although with caching the SXML that problem can go away)
64 ;; Doesn't parse menus (although menus are shite)
65 ;; Args go in a dedicated element, FBOFW
66 ;; Definitions are handled a lot better
67 ;; Does parse comments
68 ;; Outputs only significant line breaks (a biggie!)
69 ;; Nodes are treated as anchors, rather than content organizers (a biggie)
70 ;; (more book-like, less info-like)
73 ;; Integration: help, indexing, plain text
75 (define-module (texinfo)
76 #:use-module (sxml simple)
77 #:use-module (sxml transform)
78 #:use-module (sxml ssax input-parse)
79 #:use-module (srfi srfi-1)
80 #:use-module (srfi srfi-11)
81 #:use-module (srfi srfi-13)
82 #:export (call-with-file-and-dir
91 (define (parser-error port message . rest)
92 (apply throw 'parser-error port message rest))
94 (define (call-with-file-and-dir filename proc)
95 "Call the one-argument procedure @var{proc} with an input port that
96 reads from @var{filename}. During the dynamic extent of @var{proc}'s
97 execution, the current directory will be @code{(dirname
98 @var{filename})}. This is useful for parsing documents that can include
99 files by relative path name."
100 (let ((current-dir (getcwd)))
102 (lambda () (chdir (dirname filename)))
104 (call-with-input-file (basename filename) proc))
105 (lambda () (chdir current-dir)))))
107 ;;========================================================================
108 ;; Reflection on the XML vocabulary
110 (define texi-command-specs
112 "A list of (@var{name} @var{content-model} . @var{args})
116 The name of an @@-command, as a symbol.
119 A symbol indicating the syntactic type of the @@-command:
122 No content, and no @code{@@end} is coming
124 Unparsed arguments until end of line
126 Parsed arguments until end of line
128 Unparsed arguments ending with @code{#\\@}}
130 Parsed arguments ending with @code{#\\@}}
131 @item INLINE-TEXT-ARGS
132 Parsed arguments ending with @code{#\\@}}
134 The tag is an environment tag, expect @code{@@end foo}.
136 Like ENVIRON, but with special parsing rules for its arguments.
138 For @code{*fragment*}, the command used for parsing fragments of
142 @code{INLINE-TEXT} commands will receive their arguments within their
143 bodies, whereas the @code{-ARGS} commands will receive them in their
146 @code{EOF-TEXT} receives its arguments in its body.
148 @code{ENVIRON} commands have both: parsed arguments until the end of
149 line, received through their attribute list, and parsed text until the
150 @code{@@end}, received in their bodies.
152 @code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in
155 In addition, @code{ALIAS} can alias one command to another. The alias
156 will never be seen in parsed stexinfo.
158 There are four @@-commands that are treated specially. @code{@@include}
159 is a low-level token that will not be seen by higher-level parsers, so
160 it has no content-model. @code{@@para} is the paragraph command, which
161 is only implicit in the texinfo source. @code{@@item} has special
162 syntax, as noted above, and @code{@@entry} is how this parser treats
163 @code{@@item} commands within @code{@@table}, @code{@@ftable}, and
166 Also, indexing commands (@code{@@cindex}, etc.) are treated specially.
167 Their arguments are parsed, but they are needed before entering the
168 element so that an anchor can be inserted into the text before the index
172 Named arguments to the command, in the same format as the formals for a
173 lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS},
174 @code{INLINE-TEXT-ARGS}, @code{ENVIRON}, @code{TABLE-ENVIRON} commands.
176 '(;; Special commands
177 (include #f) ;; this is a low-level token
180 (entry ENTRY . heading)
181 (noindent EMPTY-COMMAND)
182 (*fragment* FRAGMENT)
184 ;; Inline text commands
185 (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant
195 (command INLINE-TEXT)
205 (titlefont INLINE-TEXT)
210 (sansserif INLINE-TEXT)
211 (slanted INLINE-TEXT)
214 ;; Inline args commands
215 (value INLINE-ARGS . (key))
216 (ref INLINE-ARGS . (node #:opt name section info-file manual))
217 (xref INLINE-ARGS . (node #:opt name section info-file manual))
218 (pxref INLINE-ARGS . (node #:opt name section info-file manual))
220 (uref INLINE-ARGS . (url #:opt title replacement))
221 (anchor INLINE-ARGS . (name))
222 (dots INLINE-ARGS . ())
223 (result INLINE-ARGS . ())
224 (bullet INLINE-ARGS . ())
225 (copyright INLINE-ARGS . ())
226 (tie INLINE-ARGS . ())
227 (image INLINE-ARGS . (file #:opt width height alt-text extension))
229 ;; Inline parsed args commands
230 (acronym INLINE-TEXT-ARGS . (acronym #:opt meaning))
233 (node EOL-ARGS . (name #:opt next previous up))
235 (comment EOL-ARGS . all)
236 (setchapternewpage EOL-ARGS . all)
239 (vskip EOL-ARGS . all)
240 (syncodeindex EOL-ARGS . all)
241 (contents EOL-ARGS . ())
242 (shortcontents EOL-ARGS . ())
243 (summarycontents EOL-ARGS . ())
244 (insertcopying EOL-ARGS . ())
245 (dircategory EOL-ARGS . (category))
246 (top EOL-ARGS . (title))
247 (printindex EOL-ARGS . (type))
248 (paragraphindent EOL-ARGS . (indent))
251 (*ENVIRON-ARGS* EOL-TEXT)
261 (appendixsec EOL-TEXT)
262 (unnumbered EOL-TEXT)
263 (unnumberedsec EOL-TEXT)
264 (subsection EOL-TEXT)
265 (subsubsection EOL-TEXT)
266 (appendixsubsec EOL-TEXT)
267 (appendixsubsubsec EOL-TEXT)
268 (unnumberedsubsec EOL-TEXT)
269 (unnumberedsubsubsec EOL-TEXT)
270 (chapheading EOL-TEXT)
271 (majorheading EOL-TEXT)
273 (subheading EOL-TEXT)
274 (subsubheading EOL-TEXT)
276 (deftpx EOL-TEXT-ARGS . (category name . attributes))
277 (defcvx EOL-TEXT-ARGS . (category class name))
278 (defivarx EOL-TEXT-ARGS . (class name))
279 (deftypeivarx EOL-TEXT-ARGS . (class data-type name))
280 (defopx EOL-TEXT-ARGS . (category class name . arguments))
281 (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments))
282 (defmethodx EOL-TEXT-ARGS . (class name . arguments))
283 (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments))
284 (defoptx EOL-TEXT-ARGS . (name))
285 (defvrx EOL-TEXT-ARGS . (category name))
286 (defvarx EOL-TEXT-ARGS . (name))
287 (deftypevrx EOL-TEXT-ARGS . (category data-type name))
288 (deftypevarx EOL-TEXT-ARGS . (data-type name))
289 (deffnx EOL-TEXT-ARGS . (category name . arguments))
290 (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments))
291 (defspecx EOL-TEXT-ARGS . (name . arguments))
292 (defmacx EOL-TEXT-ARGS . (name . arguments))
293 (defunx EOL-TEXT-ARGS . (name . arguments))
294 (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments))
297 (cindex INDEX . entry)
298 (findex INDEX . entry)
299 (vindex INDEX . entry)
300 (kindex INDEX . entry)
301 (pindex INDEX . entry)
302 (tindex INDEX . entry)
304 ;; Environment commands (those that need @end)
305 (texinfo ENVIRON . title)
306 (ignore ENVIRON . ())
307 (ifinfo ENVIRON . ())
309 (ifhtml ENVIRON . ())
311 (ifplaintext ENVIRON . ())
312 (ifnotinfo ENVIRON . ())
313 (ifnottex ENVIRON . ())
314 (ifnothtml ENVIRON . ())
315 (ifnotxml ENVIRON . ())
316 (ifnotplaintext ENVIRON . ())
317 (titlepage ENVIRON . ())
319 (direntry ENVIRON . ())
320 (copying ENVIRON . ())
321 (example ENVIRON . ())
322 (smallexample ENVIRON . ())
323 (display ENVIRON . ())
324 (smalldisplay ENVIRON . ())
325 (verbatim ENVIRON . ())
326 (format ENVIRON . ())
327 (smallformat ENVIRON . ())
329 (smalllisp ENVIRON . ())
330 (cartouche ENVIRON . ())
331 (quotation ENVIRON . ())
333 (deftp ENVIRON . (category name . attributes))
334 (defcv ENVIRON . (category class name))
335 (defivar ENVIRON . (class name))
336 (deftypeivar ENVIRON . (class data-type name))
337 (defop ENVIRON . (category class name . arguments))
338 (deftypeop ENVIRON . (category class data-type name . arguments))
339 (defmethod ENVIRON . (class name . arguments))
340 (deftypemethod ENVIRON . (class data-type name . arguments))
341 (defopt ENVIRON . (name))
342 (defvr ENVIRON . (category name))
343 (defvar ENVIRON . (name))
344 (deftypevr ENVIRON . (category data-type name))
345 (deftypevar ENVIRON . (data-type name))
346 (deffn ENVIRON . (category name . arguments))
347 (deftypefn ENVIRON . (category data-type name . arguments))
348 (defspec ENVIRON . (name . arguments))
349 (defmac ENVIRON . (name . arguments))
350 (defun ENVIRON . (name . arguments))
351 (deftypefun ENVIRON . (data-type name . arguments))
353 (table TABLE-ENVIRON . (formatter))
354 (itemize TABLE-ENVIRON . (formatter))
355 (enumerate TABLE-ENVIRON . (start))
356 (ftable TABLE-ENVIRON . (formatter))
357 (vtable TABLE-ENVIRON . (formatter))))
359 (define command-depths
360 '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4)
361 (top . 0) (unnumbered . 1) (unnumberedsec . 2)
362 (unnumberedsubsec . 3) (unnumberedsubsubsec . 4)
363 (appendix . 1) (appendixsec . 2) (appendixsection . 2)
364 (appendixsubsec . 3) (appendixsubsubsec . 4)))
365 (define (texi-command-depth command max-depth)
366 "Given the texinfo command @var{command}, return its nesting level, or
367 @code{#f} if it nests too deep for @var{max-depth}.
371 (texi-command-depth 'chapter 4) @result{} 1
372 (texi-command-depth 'top 4) @result{} 0
373 (texi-command-depth 'subsection 4) @result{} 3
374 (texi-command-depth 'appendixsubsec 4) @result{} 3
375 (texi-command-depth 'subsection 2) @result{} #f
377 (let ((depth (and=> (assq command command-depths) cdr)))
378 (and depth (<= depth max-depth) depth)))
380 ;; The % is for arguments
381 (define (space-significant? command)
383 '(example smallexample verbatim lisp smalllisp menu %)))
385 ;; Like a DTD for texinfo
386 (define (command-spec command)
387 (or (assq command texi-command-specs)
388 (parser-error #f "Unknown command" command)))
390 (define (inline-content? content)
392 ((INLINE-TEXT INLINE-ARGS INLINE-TEXT-ARGS) #t)
396 ;;========================================================================
397 ;; Lower-level parsers and scanners
399 ;; They deal with primitive lexical units (Names, whitespaces, tags) and
400 ;; with pieces of more generic productions. Most of these parsers must
401 ;; be called in appropriate context. For example, complete-start-command
402 ;; must be called only when the @-command start has been detected and
403 ;; its name token has been read.
405 ;; Test if a string is made of only whitespace
406 ;; An empty string is considered made of whitespace as well
407 (define (string-whitespace? str)
408 (or (string-null? str)
409 (string-every char-whitespace? str)))
411 ;; Like read-text-line, but allows EOF.
412 (define read-eof-breaks '(*eof* #\return #\newline))
413 (define (read-eof-line port)
414 (if (eof-object? (peek-char port))
416 (let* ((line (next-token '() read-eof-breaks
417 "reading a line" port))
418 (c (read-char port))) ; must be either \n or \r or EOF
419 (if (and (eq? c #\return) (eq? (peek-char port) #\newline))
420 (read-char port)) ; skip \n that follows \r
423 (define (skip-whitespace port)
424 (skip-while '(#\space #\tab #\return #\newline) port))
426 (define (skip-horizontal-whitespace port)
427 (skip-while '(#\space #\tab) port))
429 ;; command ::= Letter+
431 ;; procedure: read-command PORT
433 ;; Read a command starting from the current position in the PORT and
434 ;; return it as a symbol.
435 (define (read-command port)
436 (let ((first-char (peek-char port)))
437 (or (char-alphabetic? first-char)
438 (parser-error port "Nonalphabetic @-command char: '" first-char "'")))
444 ((char-alphabetic? c) c)
448 ;; A token is a primitive lexical unit. It is a record with two fields,
449 ;; token-head and token-kind.
452 ;; END The end of a texinfo command. If the command is ended by },
453 ;; token-head will be #f. Otherwise if the command is ended by
454 ;; @end COMMAND, token-head will be COMMAND. As a special case,
455 ;; @bye is the end of a special @texinfo command.
456 ;; START The start of a texinfo command. The token-head will be a
457 ;; symbol of the @-command name.
458 ;; INCLUDE An @include directive. The token-head will be empty -- the
459 ;; caller is responsible for reading the include file name.
460 ;; ITEM @item commands have an irregular syntax. They end at the
461 ;; next @item, or at the end of the environment. For that
462 ;; read-command-token treats them specially.
464 (define (make-token kind head) (cons kind head))
465 (define token? pair?)
466 (define token-kind car)
467 (define token-head cdr)
469 ;; procedure: read-command-token PORT
471 ;; This procedure starts parsing of a command token. The current
472 ;; position in the stream must be #\@. This procedure scans enough of
473 ;; the input stream to figure out what kind of a command token it is
474 ;; seeing. The procedure returns a token structure describing the token.
476 (define (read-command-token port)
477 (assert-curr-char '(#\@) "start of the command" port)
478 (let ((peeked (peek-char port)))
480 ((memq peeked '(#\! #\. #\? #\@ #\\ #\{ #\}))
481 ;; @-commands that escape characters
482 (make-token 'STRING (string (read-char port))))
484 (let ((name (read-command port)))
488 (let ((command (string-trim-both
489 (read-eof-line port))))
490 (or (and (not (string-null? command))
491 (string-every char-alphabetic? command))
492 (parser-error port "malformed @end" command))
493 (make-token 'END (string->symbol command))))
495 ;; the end of the top
496 (make-token 'END 'texinfo))
498 (make-token 'ITEM 'item))
500 (make-token 'INCLUDE #f))
502 (make-token 'START name))))))))
504 ;; procedure+: read-verbatim-body PORT STR-HANDLER SEED
506 ;; This procedure must be called after we have read a string
507 ;; "@verbatim\n" that begins a verbatim section. The current position
508 ;; must be the first position of the verbatim body. This function reads
509 ;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a
510 ;; character data consumer.
512 ;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED.
513 ;; The first STRING1 argument to STR-HANDLER never contains a newline.
514 ;; The second STRING2 argument often will. On the first invocation of the
515 ;; STR-HANDLER, the seed is the one passed to read-verbatim-body
516 ;; as the third argument. The result of this first invocation will be
517 ;; passed as the seed argument to the second invocation of the line
518 ;; consumer, and so on. The result of the last invocation of the
519 ;; STR-HANDLER is returned by the read-verbatim-body. Note a
520 ;; similarity to the fundamental 'fold' iterator.
522 ;; Within a verbatim section all characters are taken at their face
523 ;; value. It ends with "\n@end verbatim(\r)?\n".
525 ;; Must be called right after the newline after @verbatim.
526 (define (read-verbatim-body port str-handler seed)
527 (let loop ((seed seed))
528 (let ((fragment (next-token '() '(#\newline)
529 "reading verbatim" port)))
530 ;; We're reading the char after the 'fragment', which is
533 (if (string=? fragment "@end verbatim")
535 (loop (str-handler fragment "\n" seed))))))
537 ;; procedure+: read-arguments PORT
539 ;; This procedure reads and parses a production ArgumentList.
540 ;; ArgumentList ::= S* Argument (S* , S* Argument)* S*
541 ;; Argument ::= ([^@{},])*
543 ;; Arguments are the things in braces, i.e @ref{my node} has one
544 ;; argument, "my node". Most commands taking braces actually don't have
545 ;; arguments, they process text. For example, in
546 ;; @emph{@strong{emphasized}}, the emph takes text, because the parse
547 ;; continues into the braces.
549 ;; Any whitespace within Argument is replaced with a single space.
550 ;; Whitespace around an Argument is trimmed.
552 ;; The procedure returns a list of arguments. Afterwards the current
553 ;; character will be after the final #\}.
555 (define (read-arguments port stop-char)
557 (read-char port) ;; eat the delimiter
558 (let ((ret (map (lambda (x) (if (string-null? x) #f x))
559 (map string-trim-both (string-split str #\,)))))
560 (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret)))
563 (split (next-token '() (list stop-char)
564 "arguments of @-command" port)))
566 ;; procedure+: complete-start-command COMMAND PORT
568 ;; This procedure is to complete parsing of an @-command. The procedure
569 ;; must be called after the command token has been read. COMMAND is a
572 ;; This procedure returns several values:
573 ;; COMMAND: a symbol.
574 ;; ARGUMENTS: command's arguments, as an alist.
575 ;; CONTENT-MODEL: the content model of the command.
577 ;; On exit, the current position in PORT will depend on the CONTENT-MODEL.
579 ;; Content model Port position
580 ;; ============= =============
581 ;; INLINE-TEXT One character after the #\{.
582 ;; INLINE-TEXT-ARGS One character after the #\{.
583 ;; INLINE-ARGS The first character after the #\}.
584 ;; EOL-TEXT The first non-whitespace character after the command.
585 ;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT
586 ;; The first character on the next line.
587 ;; PARAGRAPH, ITEM, EMPTY-COMMAND
588 ;; The first character after the command.
590 (define (arguments->attlist port args arg-names)
591 (let loop ((in args) (names arg-names) (opt? #f) (out '()))
593 ((symbol? names) ;; a rest arg
594 (reverse (if (null? in) out (acons names in out))))
595 ((and (not (null? names)) (eq? (car names) #:opt))
596 (loop in (cdr names) #t out))
598 (if (or (null? names) opt?)
600 (parser-error port "@-command expected more arguments:"
601 args arg-names names)))
603 (parser-error port "@-command didn't expect more arguments:" in))
605 (or (and opt? (loop (cdr in) (cdr names) opt? out))
606 (parser-error "@-command missing required argument"
609 (loop (cdr in) (cdr names) opt?
611 (if (list? (car in)) (car in) (list (car in)))
614 (define (parse-table-args command port)
615 (let* ((line (string-trim-both (read-text-line port)))
616 (length (string-length line)))
617 (define (get-formatter)
618 (or (and (not (zero? length))
619 (eq? (string-ref line 0) #\@)
620 (let ((f (string->symbol (substring line 1))))
621 (or (inline-content? (cadr (command-spec f)))
623 port "@item formatter must be INLINE" f))
625 (parser-error port "Invalid @item formatter" line)))
631 ,(if (or (and (eq? length 1)
632 (char-alphabetic? (string-ref line 0)))
633 (string-every char-numeric? line))
636 port "Invalid enumerate start" line))))))
639 ,(or (and (eq? length 1) line)
640 (and (string-null? line) '(bullet))
641 (list (get-formatter))))))
642 (else ;; tables of various varieties
643 `((formatter (,(get-formatter))))))))
645 (define (complete-start-command command port)
646 (define (get-arguments type arg-names stop-char)
647 (arguments->attlist port (read-arguments port stop-char) arg-names))
649 (let* ((spec (command-spec command))
651 (arg-names (cddr spec)))
654 (complete-start-command arg-names port))
656 (assert-curr-char '(#\{) "Inline element lacks {" port)
657 (values command '() type))
659 (assert-curr-char '(#\{) "Inline element lacks {" port)
660 (values command (get-arguments type arg-names #\}) type))
662 (assert-curr-char '(#\{) "Inline element lacks {" port)
663 (values command '() type))
665 (values command (get-arguments type arg-names #\newline) type))
666 ((ENVIRON ENTRY INDEX)
667 (skip-horizontal-whitespace port)
668 (values command (parse-environment-args command port) type))
670 (skip-horizontal-whitespace port)
671 (values command (parse-table-args command port) type))
673 (skip-horizontal-whitespace port)
674 (values command '() type))
676 (skip-horizontal-whitespace port)
677 (values command (parse-eol-text-args command port) type))
678 ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT)
679 (values command '() type))
680 (else ;; INCLUDE shouldn't get here
681 (parser-error port "can't happen")))))
683 ;;-----------------------------------------------------------------------------
684 ;; Higher-level parsers and scanners
686 ;; They parse productions corresponding entire @-commands.
688 ;; Only reads @settitle, leaves it to the command parser to finish
689 ;; reading the title.
690 (define (take-until-settitle port)
691 (or (find-string-from-port? "\n@settitle " port)
692 (parser-error port "No \\n@settitle found"))
693 (skip-horizontal-whitespace port)
694 (and (eq? (peek-char port) #\newline)
695 (parser-error port "You have a @settitle, but no title")))
697 ;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED
699 ;; This procedure is to read the CharData of a texinfo document.
701 ;; text ::= (CharData | Command)*
703 ;; The procedure reads CharData and stops at @-commands (or
704 ;; environments). It also stops at an open or close brace.
709 ;; a boolean indicating if EOF is normal, i.e., the character
710 ;; data may be terminated by the EOF. EOF is normal
711 ;; while processing the main document.
713 ;; a boolean indicating if we are within a whitespace-preserving
714 ;; environment. If #t, suppress paragraph detection.
716 ;; a STR-HANDLER, see read-verbatim-body
718 ;; an argument passed to the first invocation of STR-HANDLER.
720 ;; The procedure returns two results: SEED and TOKEN. The SEED is the
721 ;; result of the last invocation of STR-HANDLER, or the original seed if
722 ;; STR-HANDLER was never called.
724 ;; TOKEN can be either an eof-object (this can happen only if expect-eof?
725 ;; was #t), or a texinfo token denoting the start or end of a tag.
727 ;; read-char-data port expect-eof? preserve-ws? str-handler seed
728 (define read-char-data
729 (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline)))
730 (define (handle str-handler str1 str2 seed)
731 (if (and (string-null? str1) (string-null? str2))
733 (str-handler str1 str2 seed)))
735 (lambda (port expect-eof? preserve-ws? str-handler seed)
736 (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof)))
737 (let loop ((seed seed))
738 (let* ((fragment (next-token '() end-chars "reading char data" port))
739 (term-char (peek-char port))) ; one of end-chars
741 ((eof-object? term-char) ; only if expect-eof?
742 (values (handle str-handler fragment "" seed) term-char))
743 ((memq term-char '(#\@ #\{ #\}))
744 (values (handle str-handler fragment "" seed)
746 ((#\@) (read-command-token port))
747 ((#\{) (make-token 'START '*braces*))
748 ((#\}) (read-char port) (make-token 'END #f)))))
749 ((eq? term-char #\newline)
750 ;; Always significant, unless directly before an end token.
751 (let ((c (peek-next-char port)))
755 (parser-error port "EOF while reading char data"))
756 (values (handle str-handler fragment "" seed) c))
758 (let* ((token (read-command-token port))
759 (end? (eq? (token-kind token) 'END)))
761 (handle str-handler fragment (if end? "" " ") seed)
763 ((and (not preserve-ws?) (eq? c #\newline))
764 ;; paragraph-separator ::= #\newline #\newline+
765 (skip-while '(#\newline) port)
766 (skip-horizontal-whitespace port)
767 (values (handle str-handler fragment "" seed)
768 (make-token 'PARA 'para)))
770 (loop (handle str-handler fragment
771 (if preserve-ws? "\n" " ") seed)))))))))))))
773 ; procedure+: assert-token TOKEN KIND NAME
774 ; Make sure that TOKEN is of anticipated KIND and has anticipated NAME
775 (define (assert-token token kind name)
776 (or (and (token? token)
777 (eq? kind (token-kind token))
778 (equal? name (token-head token)))
779 (parser-error #f "Expecting @end for " name ", got " token)))
781 ;;========================================================================
782 ;; Highest-level parsers: Texinfo to SXML
784 ;; These parsers are a set of syntactic forms to instantiate a SSAX
785 ;; parser. The user tells what to do with the parsed character and
786 ;; element data. These latter handlers determine if the parsing follows a
787 ;; SAX or a DOM model.
789 ;; syntax: make-command-parser fdown fup str-handler
791 ;; Create a parser to parse and process one element, including its
792 ;; character content or children elements. The parser is typically
793 ;; applied to the root element of a document.
796 ;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED
798 ;; This procedure is to generate the seed to be passed to handlers
799 ;; that process the content of the element. This is the function
800 ;; identified as 'fdown' in the denotational semantics of the XML
801 ;; parser given in the title comments to (sxml ssax).
804 ;; procedure COMMAND ARGUMENTS PARENT-SEED SEED
806 ;; This procedure is called when parsing of COMMAND is finished.
807 ;; The SEED is the result from the last content parser (or from
808 ;; fdown if the element has the empty content). PARENT-SEED is the
809 ;; same seed as was passed to fdown. The procedure is to generate a
810 ;; seed that will be the result of the element parser. This is the
811 ;; function identified as 'fup' in the denotational semantics of
812 ;; the XML parser given in the title comments to (sxml ssax).
815 ;; A STR-HANDLER, see read-verbatim-body
818 ;; The generated parser is a
819 ;; procedure COMMAND PORT SEED
821 ;; The procedure must be called *after* the command token has been read.
823 (define (read-include-file-name port)
824 (let ((x (string-trim-both (read-eof-line port))))
826 (error "no file listed")
827 x))) ;; fixme: should expand @value{} references
829 (define (sxml->node-name sxml)
830 "Turn some sxml string into a valid node name."
831 (let loop ((in (string->list (sxml->string sxml))) (out '()))
833 (apply string (reverse out))
834 (if (memq (car in) '(#\{ #\} #\@ #\,))
836 (loop (cdr in) (cons (car in) out))))))
838 (define (index command arguments fdown fup parent-seed)
840 ((deftp defcv defivar deftypeivar defop deftypeop defmethod
841 deftypemethod defopt defvr defvar deftypevr deftypevar deffn
842 deftypefn defspec defmac defun deftypefun)
843 (let ((args `((name ,(string-append (symbol->string command) "-"
844 (cadr (assq 'name arguments)))))))
845 (fup 'anchor args parent-seed
846 (fdown 'anchor args 'INLINE-ARGS '()))))
847 ((cindex findex vindex kindex pindex tindex)
848 (let ((args `((name ,(string-append (symbol->string command) "-"
850 (assq 'entry arguments)))))))
851 (fup 'anchor args parent-seed
852 (fdown 'anchor args 'INLINE-ARGS '()))))
855 (define (make-command-parser fdown fup str-handler)
856 (lambda (command port seed)
857 (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed))
858 (let*-values (((command arguments expected-content)
859 (complete-start-command command port)))
860 (let* ((parent-seed (index command arguments fdown fup parent-seed))
861 (seed (fdown command arguments expected-content parent-seed))
862 (eof-closes? (or (memq command '(texinfo para *fragment*))
863 (eq? expected-content 'EOL-TEXT)))
864 (sig-ws? (or sig-ws? (space-significant? command)))
865 (up (lambda (s) (fup command arguments parent-seed s)))
866 (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s)))
867 (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s)))))
869 (define (port-for-content)
870 (if (eq? expected-content 'EOL-TEXT)
871 (call-with-input-string (read-text-line port) identity)
875 ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX
877 ;; empty or finished by complete-start-command
879 ((eq? command 'verbatim)
880 (up (read-verbatim-body port str-handler seed)))
882 (let loop ((port (port-for-content))
883 (expect-eof? eof-closes?)
885 (need-break? (and (not sig-ws?)
886 (memq expected-content
887 '(ENVIRON TABLE-ENVIRON
888 ENTRY ITEM FRAGMENT))))
891 ((and need-break? (or sig-ws? (skip-whitespace port))
892 (not (memq (peek-char port) '(#\@ #\})))
893 (not (eof-object? (peek-char port))))
894 ;; Even if we have an @, it might be inline -- check
896 (let ((seed (end-para seed)))
897 (loop port expect-eof? (make-end-para seed) #f
900 (let*-values (((seed token)
902 port expect-eof? sig-ws? str-handler seed)))
906 ((include #f) (end-para seed))
907 (else (up (end-para seed)))))
909 (case (token-kind token)
911 ;; this is only @-commands that escape
912 ;; characters: @}, @@, @{ -- new para if need-break
913 (let ((seed ((if need-break? end-para identity) seed)))
914 (loop port expect-eof?
915 (if need-break? (make-end-para seed) end-para) #f
916 (str-handler (token-head token) ""
917 ((if need-break? new-para identity)
920 ;; The end will only have a name if it's for an
923 ((memq command '(item entry))
924 (let ((spec (command-spec (token-head token))))
925 (or (eq? (cadr spec) 'TABLE-ENVIRON)
927 port "@item not ended by @end table/enumerate/itemize"
929 ((eq? expected-content 'ENVIRON)
930 (assert-token token 'END command)))
931 (up (end-para seed)))
934 ((memq command '(enumerate itemize))
935 (up (visit 'item port sig-ws? (end-para seed))))
936 ((eq? expected-content 'TABLE-ENVIRON)
937 (up (visit 'entry port sig-ws? (end-para seed))))
938 ((memq command '(item entry))
939 (visit command port sig-ws? (up (end-para seed))))
942 port "@item must be within a table environment"
945 ;; examine valid paragraphs?
946 (loop port expect-eof? end-para (not sig-ws?) seed))
948 ;; Recurse for include files
949 (let ((seed (call-with-file-and-dir
950 (read-include-file-name port)
952 (loop port 'include end-para
953 need-break? seed)))))
954 (loop port expect-eof? end-para need-break? seed)))
955 ((START) ; Start of an @-command
956 (let* ((head (token-head token))
957 (type (cadr (command-spec head)))
958 (inline? (inline-content? type))
959 (seed ((if (and inline? (not need-break?))
960 identity end-para) seed))
961 (end-para (if inline?
962 (if need-break? (make-end-para seed)
965 (new-para (if (and inline? need-break?)
967 (loop port expect-eof? end-para (not inline?)
968 (visit head port sig-ws? (new-para seed)))))
970 (parser-error port "Unknown token type" token))))))))))))))))
972 ;; procedure: reverse-collect-str-drop-ws fragments
974 ;; Given the list of fragments (some of which are text strings), reverse
975 ;; the list and concatenate adjacent text strings. We also drop
976 ;; "unsignificant" whitespace, that is, whitespace in front, behind and
977 ;; between elements. The whitespace that is included in character data
979 (define (reverse-collect-str-drop-ws fragments)
981 ((null? fragments) ; a shortcut
983 ((and (string? (car fragments)) ; another shortcut
984 (null? (cdr fragments)) ; remove single ws-only string
985 (string-whitespace? (car fragments)))
988 (let loop ((fragments fragments) (result '()) (strs '())
989 (all-whitespace? #t))
993 result ; remove leading ws
994 (cons (apply string-append strs) result)))
995 ((string? (car fragments))
996 (loop (cdr fragments) result (cons (car fragments) strs)
998 (string-whitespace? (car fragments)))))
1000 (loop (cdr fragments)
1004 ((null? strs) result)
1007 result ; remove trailing whitespace
1008 (cons " " result))); replace interstitial ws with
1011 (cons (apply string-append strs) result))))
1014 (define (parse-inline-text-args port spec text)
1015 (let lp ((in text) (cur '()) (out '()))
1018 (if (and (pair? cur)
1020 (string-whitespace? (car cur)))
1021 (lp in (cdr cur) out)
1022 (let ((args (reverse (if (null? cur)
1024 (cons (reverse cur) out)))))
1025 (arguments->attlist port args (cddr spec)))))
1027 (lp (cdr in) (cons (car in) cur) out))
1028 ((string-index (car in) #\,)
1029 (let* ((parts (string-split (car in) #\,))
1030 (head (string-trim-right (car parts)))
1031 (rev-tail (reverse (cdr parts)))
1032 (last (string-trim (car rev-tail))))
1034 (if (string-null? last) cur (cons last cur))
1035 (append (cdr rev-tail)
1036 (cons (reverse (if (string-null? head) cur (cons head cur)))
1040 (cons (if (null? cur) (string-trim (car in)) (car in)) cur)
1043 (define (make-dom-parser)
1044 (make-command-parser
1045 (lambda (command args content seed) ; fdown
1047 (lambda (command args parent-seed seed) ; fup
1048 (let ((seed (reverse-collect-str-drop-ws seed))
1049 (spec (command-spec command)))
1050 (if (eq? (cadr spec) 'INLINE-TEXT-ARGS)
1051 (cons (list command (cons '% (parse-inline-text-args #f spec seed)))
1054 (if (null? args) seed (acons '% args seed))
1056 (lambda (string1 string2 seed) ; str-handler
1057 (if (string-null? string2)
1059 (cons* string2 string1 seed)))))
1061 (define parse-environment-args
1062 (let ((parser (make-dom-parser)))
1063 ;; duplicate arguments->attlist to avoid unnecessary splitting
1064 (lambda (command port)
1065 (let ((args (cdar (parser '*ENVIRON-ARGS* port '())))
1066 (arg-names (cddr (command-spec command))))
1069 (if (null? args) '()
1070 (parser-error port "@-command doesn't take args" command)))
1072 (list (cons 'arguments args)))
1074 (let loop ((args args) (arg-names arg-names) (out '()))
1077 (if (null? args) (reverse! out)
1078 (parser-error port "@-command didn't expect more args"
1080 ((symbol? arg-names)
1081 (reverse! (acons arg-names args out)))
1083 (parser-error port "@-command expects more args"
1085 ((and (string? (car args)) (string-index (car args) #\space))
1087 (let ((rest (substring/shared (car args) (1+ i))))
1089 (loop (cons rest (cdr args)) arg-names out)
1090 (loop (cons rest (cdr args)) (cdr arg-names)
1091 (cons (list (car arg-names)
1092 (substring (car args) 0 i))
1095 (loop (cdr args) (cdr arg-names)
1096 (if (and (pair? (car args)) (eq? (caar args) '*braces*))
1097 (acons (car arg-names) (cdar args) out)
1098 (cons (list (car arg-names) (car args)) out))))))))))))
1100 (define (parse-eol-text-args command port)
1101 ;; perhaps parse-environment-args should be named more
1103 (parse-environment-args command port))
1105 ;; procedure: texi-fragment->stexi STRING
1107 ;; A DOM parser for a texinfo fragment STRING.
1109 ;; The procedure returns an SXML tree headed by the special tag,
1112 (define (texi-fragment->stexi string-or-port)
1113 "Parse the texinfo commands in @var{string-or-port}, and return the
1114 resultant stexi tree. The head of the tree will be the special command,
1116 (define (parse port)
1117 (postprocess (car ((make-dom-parser) '*fragment* port '()))))
1118 (if (input-port? string-or-port)
1119 (parse string-or-port)
1120 (call-with-input-string string-or-port parse)))
1122 ;; procedure: texi->stexi PORT
1124 ;; This is an instance of a SSAX parser above that returns an SXML
1125 ;; representation of the texinfo document ready to be read at PORT.
1127 ;; The procedure returns an SXML tree. The port points to the
1128 ;; first character after the @bye, or to the end of the file.
1130 (define (texi->stexi port)
1131 "Read a full texinfo document from @var{port} and return the parsed
1132 stexi tree. The parsing will start at the @code{@@settitle} and end at
1133 @code{@@bye} or EOF."
1134 (let ((parser (make-dom-parser)))
1135 (take-until-settitle port)
1136 (postprocess (car (parser 'texinfo port '())))))
1138 (define (car-eq? x y) (and (pair? x) (eq? (car x) y)))
1139 (define (make-contents tree)
1140 (define (lp in out depth)
1142 ((null? in) (values in (cons 'enumerate (reverse! out))))
1143 ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4))
1144 => (lambda (new-depth)
1145 (let ((node-name (and (car-eq? (car in) 'node)
1146 (cadr (assq 'name (cdadar in))))))
1148 ((< new-depth depth)
1149 (values in (cons 'enumerate (reverse! out))))
1150 ((> new-depth depth)
1151 (let ((out-cdr (if (null? out) '() (cdr out)))
1152 (out-car (if (null? out) (list 'item) (car out))))
1153 (let*-values (((new-in new-out) (lp in '() (1+ depth))))
1155 (cons (append out-car (list new-out)) out-cdr)
1162 `((ref (% (node ,node-name))))
1166 (else (lp (cdr in) out depth))))
1167 (let*-values (((_ contents) (lp tree '() 1)))
1168 `((chapheading "Table of Contents") ,contents)))
1170 (define (trim-whitespace str trim-left? trim-right?)
1171 (let* ((left-space? (and (not trim-left?)
1172 (string-prefix? " " str)))
1173 (right-space? (and (not trim-right?)
1174 (string-suffix? " " str)))
1175 (tail (append! (string-tokenize str)
1176 (if right-space? '("") '()))))
1177 (string-join (if left-space? (cons "" tail) tail))))
1179 (define (postprocess tree)
1180 (define (loop in out state first? sig-ws?)
1183 (values (reverse! out) state))
1186 (cons (if sig-ws? (car in)
1187 (trim-whitespace (car in) first? (null? (cdr in))))
1193 (if (null? (cdar in)) (error "@set missing arguments" in))
1194 (if (string? (cadar in))
1195 (let ((i (string-index (cadar in) #\space)))
1198 (acons (substring (cadar in) 0 i)
1199 (cons (substring (cadar in) (1+ i)) (cddar in))
1202 (loop (cdr in) out (acons (cadar in) (cddar in) state)
1204 (error "expected a constant to define for @set" in)))
1206 (loop (fold-right cons (cdr in)
1208 (assoc (cadr (assq 'key (cdadar in))) state) cdr)
1209 (error "unknown value" (cdadar in) state)))
1213 (loop (cdr in) out (cons (car in) state) #f sig-ws?))
1215 (loop (fold-right cons (cdr in)
1216 (or (cdr (assoc 'copying state))
1217 (error "copying isn't set yet")))
1221 (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?))
1223 (let*-values (((kid-out state)
1224 (loop (car in) '() state #t
1225 (or sig-ws? (space-significant? (caar in))))))
1226 (loop (cdr in) (cons kid-out out) state #f sig-ws?)))))
1228 (loop (cdr in) (cons (car in) out) state #t sig-ws?))))
1231 (lambda () (loop tree '() '() #t #f))
1232 (lambda (out state) out)))
1234 ;; Replace % with texinfo-arguments.
1235 (define (stexi->sxml tree)
1236 "Transform the stexi tree @var{tree} into sxml. This involves
1237 replacing the @code{%} element that keeps the texinfo arguments with an
1238 element for each argument.
1240 FIXME: right now it just changes % to @code{texinfo-arguments} -- that
1241 doesn't hang with the idea of making a dtd at some point"
1244 `((% . ,(lambda (x . t) (cons 'texinfo-arguments t)))
1245 (*text* . ,(lambda (x t) t))
1246 (*default* . ,(lambda (x . t) (cons x t))))))
1248 ;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5
1249 ;;; texinfo.scm ends here