;;;; (texinfo) -- parsing of texinfo into SXML ;;;; ;;;; Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. ;;;; Copyright (C) 2004, 2009 Andy Wingo ;;;; Copyright (C) 2001,2002 Oleg Kiselyov ;;;; ;;;; This file is based on SSAX's SSAX.scm. ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public ;;;; License as published by the Free Software Foundation; either ;;;; version 3 of the License, or (at your option) any later version. ;;;; ;;;; This library is distributed in the hope that it will be useful, ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;;; Lesser General Public License for more details. ;;;; ;;;; You should have received a copy of the GNU Lesser General Public ;;;; License along with this library; if not, write to the Free Software ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;;; Commentary: ;; ;; @subheading Texinfo processing in scheme ;; ;; This module parses texinfo into SXML. TeX will always be the ;; processor of choice for print output, of course. However, although ;; @code{makeinfo} works well for info, its output in other formats is ;; not very customizable, and the program is not extensible as a whole. ;; This module aims to provide an extensible framework for texinfo ;; processing that integrates texinfo into the constellation of SXML ;; processing tools. ;; ;; @subheading Notes on the SXML vocabulary ;; ;; Consider the following texinfo fragment: ;; ;;@example ;; @@deffn Primitive set-car! pair value ;; This function... ;; @@end deffn ;;@end example ;; ;; Logically, the category (Primitive), name (set-car!), and arguments ;; (pair value) are ``attributes'' of the deffn, with the description as ;; the content. However, texinfo allows for @@-commands within the ;; arguments to an environment, like @code{@@deffn}, which means that ;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand, ;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are ;; called ``arguments'', and are grouped under the special element, `%'. ;; ;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In ;; the interests of interoperability, this module provides a conversion ;; function to replace the `%' with `texinfo-arguments'. ;; ;;; Code: ;; Comparison to xml output of texinfo (which is rather undocumented): ;; Doesn't conform to texinfo dtd ;; No DTD at all, in fact :-/ ;; Actually outputs valid xml, after transforming % ;; Slower (although with caching the SXML that problem can go away) ;; Doesn't parse menus (although menus are shite) ;; Args go in a dedicated element, FBOFW ;; Definitions are handled a lot better ;; Does parse comments ;; Outputs only significant line breaks (a biggie!) ;; Nodes are treated as anchors, rather than content organizers (a biggie) ;; (more book-like, less info-like) ;; TODO ;; Integration: help, indexing, plain text (define-module (texinfo) #:use-module (sxml simple) #:use-module (sxml transform) #:use-module (sxml ssax input-parse) #:use-module (srfi srfi-1) #:use-module (srfi srfi-11) #:use-module (srfi srfi-13) #:export (call-with-file-and-dir texi-command-specs texi-command-depth texi-fragment->stexi texi->stexi stexi->sxml)) ;; Some utilities (define (parser-error port message . rest) (apply throw 'parser-error port message rest)) (define (call-with-file-and-dir filename proc) "Call the one-argument procedure @var{proc} with an input port that reads from @var{filename}. During the dynamic extent of @var{proc}'s execution, the current directory will be @code{(dirname @var{filename})}. This is useful for parsing documents that can include files by relative path name." (let ((current-dir (getcwd))) (dynamic-wind (lambda () (chdir (dirname filename))) (lambda () (call-with-input-file (basename filename) proc)) (lambda () (chdir current-dir))))) ;;======================================================================== ;; Reflection on the XML vocabulary (define texi-command-specs #; "A list of (@var{name} @var{content-model} . @var{args}) @table @var @item name The name of an @@-command, as a symbol. @item content-model A symbol indicating the syntactic type of the @@-command: @table @code @item EMPTY-COMMAND No content, and no @code{@@end} is coming @item EOL-ARGS Unparsed arguments until end of line @item EOL-TEXT Parsed arguments until end of line @item INLINE-ARGS Unparsed arguments ending with @code{#\\@}} @item INLINE-TEXT Parsed arguments ending with @code{#\\@}} @item INLINE-TEXT-ARGS Parsed arguments ending with @code{#\\@}} @item ENVIRON The tag is an environment tag, expect @code{@@end foo}. @item TABLE-ENVIRON Like ENVIRON, but with special parsing rules for its arguments. @item FRAGMENT For @code{*fragment*}, the command used for parsing fragments of texinfo documents. @end table @code{INLINE-TEXT} commands will receive their arguments within their bodies, whereas the @code{-ARGS} commands will receive them in their attribute list. @code{EOF-TEXT} receives its arguments in its body. @code{ENVIRON} commands have both: parsed arguments until the end of line, received through their attribute list, and parsed text until the @code{@@end}, received in their bodies. @code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in @code{ENVIRON}. In addition, @code{ALIAS} can alias one command to another. The alias will never be seen in parsed stexinfo. There are four @@-commands that are treated specially. @code{@@include} is a low-level token that will not be seen by higher-level parsers, so it has no content-model. @code{@@para} is the paragraph command, which is only implicit in the texinfo source. @code{@@item} has special syntax, as noted above, and @code{@@entry} is how this parser treats @code{@@item} commands within @code{@@table}, @code{@@ftable}, and @code{@@vtable}. Also, indexing commands (@code{@@cindex}, etc.) are treated specially. Their arguments are parsed, but they are needed before entering the element so that an anchor can be inserted into the text before the index entry. @item args Named arguments to the command, in the same format as the formals for a lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS}, @code{INLINE-TEXT-ARGS}, @code{ENVIRON}, @code{TABLE-ENVIRON} commands. @end table" '(;; Special commands (include #f) ;; this is a low-level token (para PARAGRAPH) (item ITEM) (entry ENTRY . heading) (noindent EMPTY-COMMAND) (*fragment* FRAGMENT) ;; Inline text commands (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant (bold INLINE-TEXT) (sample INLINE-TEXT) (samp INLINE-TEXT) (code INLINE-TEXT) (math INLINE-TEXT) (kbd INLINE-TEXT) (key INLINE-TEXT) (var INLINE-TEXT) (env INLINE-TEXT) (file INLINE-TEXT) (command INLINE-TEXT) (option INLINE-TEXT) (dfn INLINE-TEXT) (cite INLINE-TEXT) (acro INLINE-TEXT) (email INLINE-TEXT) (emph INLINE-TEXT) (strong INLINE-TEXT) (sample INLINE-TEXT) (sc INLINE-TEXT) (titlefont INLINE-TEXT) (asis INLINE-TEXT) (b INLINE-TEXT) (i INLINE-TEXT) (r INLINE-TEXT) (sansserif INLINE-TEXT) (slanted INLINE-TEXT) (t INLINE-TEXT) ;; Inline args commands (value INLINE-ARGS . (key)) (ref INLINE-ARGS . (node #:opt name section info-file manual)) (xref INLINE-ARGS . (node #:opt name section info-file manual)) (pxref INLINE-TEXT-ARGS . (node #:opt name section info-file manual)) (url ALIAS . uref) (uref INLINE-TEXT-ARGS . (url #:opt title replacement)) (anchor INLINE-ARGS . (name)) (dots INLINE-ARGS . ()) (result INLINE-ARGS . ()) (bullet INLINE-ARGS . ()) (copyright INLINE-ARGS . ()) (tie INLINE-ARGS . ()) (image INLINE-ARGS . (file #:opt width height alt-text extension)) ;; Inline parsed args commands (acronym INLINE-TEXT-ARGS . (acronym #:opt meaning)) ;; EOL args elements (node EOL-ARGS . (name #:opt next previous up)) (c EOL-ARGS . all) (comment EOL-ARGS . all) (setchapternewpage EOL-ARGS . all) (sp EOL-ARGS . all) (page EOL-ARGS . ()) (vskip EOL-ARGS . all) (syncodeindex EOL-ARGS . all) (contents EOL-ARGS . ()) (shortcontents EOL-ARGS . ()) (summarycontents EOL-ARGS . ()) (insertcopying EOL-ARGS . ()) (dircategory EOL-ARGS . (category)) (top EOL-ARGS . (title)) (printindex EOL-ARGS . (type)) (paragraphindent EOL-ARGS . (indent)) ;; EOL text commands (*ENVIRON-ARGS* EOL-TEXT) (itemx EOL-TEXT) (set EOL-TEXT) (center EOL-TEXT) (title EOL-TEXT) (subtitle EOL-TEXT) (author EOL-TEXT) (chapter EOL-TEXT) (section EOL-TEXT) (appendix EOL-TEXT) (appendixsec EOL-TEXT) (unnumbered EOL-TEXT) (unnumberedsec EOL-TEXT) (subsection EOL-TEXT) (subsubsection EOL-TEXT) (appendixsubsec EOL-TEXT) (appendixsubsubsec EOL-TEXT) (unnumberedsubsec EOL-TEXT) (unnumberedsubsubsec EOL-TEXT) (chapheading EOL-TEXT) (majorheading EOL-TEXT) (heading EOL-TEXT) (subheading EOL-TEXT) (subsubheading EOL-TEXT) (deftpx EOL-TEXT-ARGS . (category name . attributes)) (defcvx EOL-TEXT-ARGS . (category class name)) (defivarx EOL-TEXT-ARGS . (class name)) (deftypeivarx EOL-TEXT-ARGS . (class data-type name)) (defopx EOL-TEXT-ARGS . (category class name . arguments)) (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments)) (defmethodx EOL-TEXT-ARGS . (class name . arguments)) (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments)) (defoptx EOL-TEXT-ARGS . (name)) (defvrx EOL-TEXT-ARGS . (category name)) (defvarx EOL-TEXT-ARGS . (name)) (deftypevrx EOL-TEXT-ARGS . (category data-type name)) (deftypevarx EOL-TEXT-ARGS . (data-type name)) (deffnx EOL-TEXT-ARGS . (category name . arguments)) (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments)) (defspecx EOL-TEXT-ARGS . (name . arguments)) (defmacx EOL-TEXT-ARGS . (name . arguments)) (defunx EOL-TEXT-ARGS . (name . arguments)) (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments)) ;; Indexing commands (cindex INDEX . entry) (findex INDEX . entry) (vindex INDEX . entry) (kindex INDEX . entry) (pindex INDEX . entry) (tindex INDEX . entry) ;; Environment commands (those that need @end) (texinfo ENVIRON . title) (ignore ENVIRON . ()) (ifinfo ENVIRON . ()) (iftex ENVIRON . ()) (ifhtml ENVIRON . ()) (ifxml ENVIRON . ()) (ifplaintext ENVIRON . ()) (ifnotinfo ENVIRON . ()) (ifnottex ENVIRON . ()) (ifnothtml ENVIRON . ()) (ifnotxml ENVIRON . ()) (ifnotplaintext ENVIRON . ()) (titlepage ENVIRON . ()) (menu ENVIRON . ()) (direntry ENVIRON . ()) (copying ENVIRON . ()) (example ENVIRON . ()) (smallexample ENVIRON . ()) (display ENVIRON . ()) (smalldisplay ENVIRON . ()) (verbatim ENVIRON . ()) (format ENVIRON . ()) (smallformat ENVIRON . ()) (lisp ENVIRON . ()) (smalllisp ENVIRON . ()) (cartouche ENVIRON . ()) (quotation ENVIRON . ()) (deftp ENVIRON . (category name . attributes)) (defcv ENVIRON . (category class name)) (defivar ENVIRON . (class name)) (deftypeivar ENVIRON . (class data-type name)) (defop ENVIRON . (category class name . arguments)) (deftypeop ENVIRON . (category class data-type name . arguments)) (defmethod ENVIRON . (class name . arguments)) (deftypemethod ENVIRON . (class data-type name . arguments)) (defopt ENVIRON . (name)) (defvr ENVIRON . (category name)) (defvar ENVIRON . (name)) (deftypevr ENVIRON . (category data-type name)) (deftypevar ENVIRON . (data-type name)) (deffn ENVIRON . (category name . arguments)) (deftypefn ENVIRON . (category data-type name . arguments)) (defspec ENVIRON . (name . arguments)) (defmac ENVIRON . (name . arguments)) (defun ENVIRON . (name . arguments)) (deftypefun ENVIRON . (data-type name . arguments)) (table TABLE-ENVIRON . (formatter)) (itemize TABLE-ENVIRON . (formatter)) (enumerate TABLE-ENVIRON . (start)) (ftable TABLE-ENVIRON . (formatter)) (vtable TABLE-ENVIRON . (formatter)))) (define command-depths '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4) (top . 0) (unnumbered . 1) (unnumberedsec . 2) (unnumberedsubsec . 3) (unnumberedsubsubsec . 4) (appendix . 1) (appendixsec . 2) (appendixsection . 2) (appendixsubsec . 3) (appendixsubsubsec . 4))) (define (texi-command-depth command max-depth) "Given the texinfo command @var{command}, return its nesting level, or @code{#f} if it nests too deep for @var{max-depth}. Examples: @example (texi-command-depth 'chapter 4) @result{} 1 (texi-command-depth 'top 4) @result{} 0 (texi-command-depth 'subsection 4) @result{} 3 (texi-command-depth 'appendixsubsec 4) @result{} 3 (texi-command-depth 'subsection 2) @result{} #f @end example" (let ((depth (and=> (assq command command-depths) cdr))) (and depth (<= depth max-depth) depth))) ;; The % is for arguments (define (space-significant? command) (memq command '(example smallexample verbatim lisp smalllisp menu %))) ;; Like a DTD for texinfo (define (command-spec command) (let ((spec (assq command texi-command-specs))) (cond ((not spec) (parser-error #f "Unknown command" command)) ((eq? (cadr spec) 'ALIAS) (command-spec (cddr spec))) (else spec)))) (define (inline-content? content) (case content ((INLINE-TEXT INLINE-ARGS INLINE-TEXT-ARGS) #t) (else #f))) ;;======================================================================== ;; Lower-level parsers and scanners ;; ;; They deal with primitive lexical units (Names, whitespaces, tags) and ;; with pieces of more generic productions. Most of these parsers must ;; be called in appropriate context. For example, complete-start-command ;; must be called only when the @-command start has been detected and ;; its name token has been read. ;; Test if a string is made of only whitespace ;; An empty string is considered made of whitespace as well (define (string-whitespace? str) (or (string-null? str) (string-every char-whitespace? str))) ;; Like read-text-line, but allows EOF. (define read-eof-breaks '(*eof* #\return #\newline)) (define (read-eof-line port) (if (eof-object? (peek-char port)) (peek-char port) (let* ((line (next-token '() read-eof-breaks "reading a line" port)) (c (read-char port))) ; must be either \n or \r or EOF (if (and (eq? c #\return) (eq? (peek-char port) #\newline)) (read-char port)) ; skip \n that follows \r line))) (define (skip-whitespace port) (skip-while '(#\space #\tab #\return #\newline) port)) (define (skip-horizontal-whitespace port) (skip-while '(#\space #\tab) port)) ;; command ::= Letter+ ;; procedure: read-command PORT ;; ;; Read a command starting from the current position in the PORT and ;; return it as a symbol. (define (read-command port) (let ((first-char (peek-char port))) (or (char-alphabetic? first-char) (parser-error port "Nonalphabetic @-command char: '" first-char "'"))) (string->symbol (next-token-of (lambda (c) (cond ((eof-object? c) #f) ((char-alphabetic? c) c) (else #f))) port))) ;; A token is a primitive lexical unit. It is a record with two fields, ;; token-head and token-kind. ;; ;; Token types: ;; END The end of a texinfo command. If the command is ended by }, ;; token-head will be #f. Otherwise if the command is ended by ;; @end COMMAND, token-head will be COMMAND. As a special case, ;; @bye is the end of a special @texinfo command. ;; START The start of a texinfo command. The token-head will be a ;; symbol of the @-command name. ;; INCLUDE An @include directive. The token-head will be empty -- the ;; caller is responsible for reading the include file name. ;; ITEM @item commands have an irregular syntax. They end at the ;; next @item, or at the end of the environment. For that ;; read-command-token treats them specially. (define (make-token kind head) (cons kind head)) (define token? pair?) (define token-kind car) (define token-head cdr) ;; procedure: read-command-token PORT ;; ;; This procedure starts parsing of a command token. The current ;; position in the stream must be #\@. This procedure scans enough of ;; the input stream to figure out what kind of a command token it is ;; seeing. The procedure returns a token structure describing the token. (define (read-command-token port) (assert-curr-char '(#\@) "start of the command" port) (let ((peeked (peek-char port))) (cond ((memq peeked '(#\! #\: #\. #\? #\@ #\\ #\{ #\})) ;; @-commands that escape characters (make-token 'STRING (string (read-char port)))) (else (let ((name (read-command port))) (case name ((end) ;; got an ending tag (let ((command (string-trim-both (read-eof-line port)))) (or (and (not (string-null? command)) (string-every char-alphabetic? command)) (parser-error port "malformed @end" command)) (make-token 'END (string->symbol command)))) ((bye) ;; the end of the top (make-token 'END 'texinfo)) ((item) (make-token 'ITEM 'item)) ((include) (make-token 'INCLUDE #f)) (else (make-token 'START name)))))))) ;; procedure+: read-verbatim-body PORT STR-HANDLER SEED ;; ;; This procedure must be called after we have read a string ;; "@verbatim\n" that begins a verbatim section. The current position ;; must be the first position of the verbatim body. This function reads ;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a ;; character data consumer. ;; ;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED. ;; The first STRING1 argument to STR-HANDLER never contains a newline. ;; The second STRING2 argument often will. On the first invocation of the ;; STR-HANDLER, the seed is the one passed to read-verbatim-body ;; as the third argument. The result of this first invocation will be ;; passed as the seed argument to the second invocation of the line ;; consumer, and so on. The result of the last invocation of the ;; STR-HANDLER is returned by the read-verbatim-body. Note a ;; similarity to the fundamental 'fold' iterator. ;; ;; Within a verbatim section all characters are taken at their face ;; value. It ends with "\n@end verbatim(\r)?\n". ;; Must be called right after the newline after @verbatim. (define (read-verbatim-body port str-handler seed) (let loop ((seed seed)) (let ((fragment (next-token '() '(#\newline) "reading verbatim" port))) ;; We're reading the char after the 'fragment', which is ;; #\newline. (read-char port) (if (string=? fragment "@end verbatim") seed (loop (str-handler fragment "\n" seed)))))) ;; procedure+: read-arguments PORT ;; ;; This procedure reads and parses a production ArgumentList. ;; ArgumentList ::= S* Argument (S* , S* Argument)* S* ;; Argument ::= ([^@{},])* ;; ;; Arguments are the things in braces, i.e @ref{my node} has one ;; argument, "my node". Most commands taking braces actually don't have ;; arguments, they process text. For example, in ;; @emph{@strong{emphasized}}, the emph takes text, because the parse ;; continues into the braces. ;; ;; Any whitespace within Argument is replaced with a single space. ;; Whitespace around an Argument is trimmed. ;; ;; The procedure returns a list of arguments. Afterwards the current ;; character will be after the final #\}. (define (read-arguments port stop-char) (define (split str) (read-char port) ;; eat the delimiter (let ((ret (map (lambda (x) (if (string-null? x) #f x)) (map string-trim-both (string-split str #\,))))) (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret))) '() ret))) (split (next-token '() (list stop-char) "arguments of @-command" port))) ;; procedure+: complete-start-command COMMAND PORT ;; ;; This procedure is to complete parsing of an @-command. The procedure ;; must be called after the command token has been read. COMMAND is a ;; TAG-NAME. ;; ;; This procedure returns several values: ;; COMMAND: a symbol. ;; ARGUMENTS: command's arguments, as an alist. ;; CONTENT-MODEL: the content model of the command. ;; ;; On exit, the current position in PORT will depend on the CONTENT-MODEL. ;; ;; Content model Port position ;; ============= ============= ;; INLINE-TEXT One character after the #\{. ;; INLINE-TEXT-ARGS One character after the #\{. ;; INLINE-ARGS The first character after the #\}. ;; EOL-TEXT The first non-whitespace character after the command. ;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT ;; The first character on the next line. ;; PARAGRAPH, ITEM, EMPTY-COMMAND ;; The first character after the command. (define (arguments->attlist port args arg-names) (let loop ((in args) (names arg-names) (opt? #f) (out '())) (cond ((symbol? names) ;; a rest arg (reverse (if (null? in) out (acons names in out)))) ((and (not (null? names)) (eq? (car names) #:opt)) (loop in (cdr names) #t out)) ((null? in) (if (or (null? names) opt?) (reverse out) (parser-error port "@-command expected more arguments:" args arg-names names))) ((null? names) (parser-error port "@-command didn't expect more arguments:" in)) ((not (car in)) (or (and opt? (loop (cdr in) (cdr names) opt? out)) (parser-error "@-command missing required argument" (car names)))) (else (loop (cdr in) (cdr names) opt? (acons (car names) (if (list? (car in)) (car in) (list (car in))) out)))))) (define (parse-table-args command port) (let* ((line (string-trim-both (read-text-line port))) (length (string-length line))) (define (get-formatter) (or (and (not (zero? length)) (eq? (string-ref line 0) #\@) (let ((f (string->symbol (substring line 1)))) (or (inline-content? (cadr (command-spec f))) (parser-error port "@item formatter must be INLINE" f)) f)) (parser-error port "Invalid @item formatter" line))) (case command ((enumerate) (if (zero? length) '() `((start ,(if (or (and (eq? length 1) (char-alphabetic? (string-ref line 0))) (string-every char-numeric? line)) line (parser-error port "Invalid enumerate start" line)))))) ((itemize) `((bullet ,(or (and (eq? length 1) line) (and (string-null? line) '(bullet)) (list (get-formatter)))))) (else ;; tables of various varieties `((formatter (,(get-formatter)))))))) (define (complete-start-command command port) (define (get-arguments type arg-names stop-char) (arguments->attlist port (read-arguments port stop-char) arg-names)) (let* ((spec (command-spec command)) (command (car spec)) (type (cadr spec)) (arg-names (cddr spec))) (case type ((INLINE-TEXT) (assert-curr-char '(#\{) "Inline element lacks {" port) (values command '() type)) ((INLINE-ARGS) (assert-curr-char '(#\{) "Inline element lacks {" port) (values command (get-arguments type arg-names #\}) type)) ((INLINE-TEXT-ARGS) (assert-curr-char '(#\{) "Inline element lacks {" port) (values command '() type)) ((EOL-ARGS) (values command (get-arguments type arg-names #\newline) type)) ((ENVIRON ENTRY INDEX) (skip-horizontal-whitespace port) (values command (parse-environment-args command port) type)) ((TABLE-ENVIRON) (skip-horizontal-whitespace port) (values command (parse-table-args command port) type)) ((EOL-TEXT) (skip-horizontal-whitespace port) (values command '() type)) ((EOL-TEXT-ARGS) (skip-horizontal-whitespace port) (values command (parse-eol-text-args command port) type)) ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT) (values command '() type)) (else ;; INCLUDE shouldn't get here (parser-error port "can't happen"))))) ;;----------------------------------------------------------------------------- ;; Higher-level parsers and scanners ;; ;; They parse productions corresponding entire @-commands. ;; Only reads @settitle, leaves it to the command parser to finish ;; reading the title. (define (take-until-settitle port) (or (find-string-from-port? "\n@settitle " port) (parser-error port "No \\n@settitle found")) (skip-horizontal-whitespace port) (and (eq? (peek-char port) #\newline) (parser-error port "You have a @settitle, but no title"))) ;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED ;; ;; This procedure is to read the CharData of a texinfo document. ;; ;; text ::= (CharData | Command)* ;; ;; The procedure reads CharData and stops at @-commands (or ;; environments). It also stops at an open or close brace. ;; ;; port ;; a PORT to read ;; expect-eof? ;; a boolean indicating if EOF is normal, i.e., the character ;; data may be terminated by the EOF. EOF is normal ;; while processing the main document. ;; preserve-ws? ;; a boolean indicating if we are within a whitespace-preserving ;; environment. If #t, suppress paragraph detection. ;; str-handler ;; a STR-HANDLER, see read-verbatim-body ;; seed ;; an argument passed to the first invocation of STR-HANDLER. ;; ;; The procedure returns two results: SEED and TOKEN. The SEED is the ;; result of the last invocation of STR-HANDLER, or the original seed if ;; STR-HANDLER was never called. ;; ;; TOKEN can be either an eof-object (this can happen only if expect-eof? ;; was #t), or a texinfo token denoting the start or end of a tag. ;; read-char-data port expect-eof? preserve-ws? str-handler seed (define read-char-data (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline))) (define (handle str-handler str1 str2 seed) (if (and (string-null? str1) (string-null? str2)) seed (str-handler str1 str2 seed))) (lambda (port expect-eof? preserve-ws? str-handler seed) (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof))) (let loop ((seed seed)) (let* ((fragment (next-token '() end-chars "reading char data" port)) (term-char (peek-char port))) ; one of end-chars (cond ((eof-object? term-char) ; only if expect-eof? (values (handle str-handler fragment "" seed) term-char)) ((memq term-char '(#\@ #\{ #\})) (values (handle str-handler fragment "" seed) (case term-char ((#\@) (read-command-token port)) ((#\{) (make-token 'START '*braces*)) ((#\}) (read-char port) (make-token 'END #f))))) ((eq? term-char #\newline) ;; Always significant, unless directly before an end token. (let ((c (peek-next-char port))) (cond ((eof-object? c) (or expect-eof? (parser-error port "EOF while reading char data")) (values (handle str-handler fragment "" seed) c)) ((eq? c #\@) (let* ((token (read-command-token port)) (end? (eq? (token-kind token) 'END))) (values (handle str-handler fragment (if end? "" (if preserve-ws? "\n" " ")) seed) token))) ((and (not preserve-ws?) (eq? c #\newline)) ;; paragraph-separator ::= #\newline #\newline+ (skip-while '(#\newline) port) (skip-horizontal-whitespace port) (values (handle str-handler fragment "" seed) (make-token 'PARA 'para))) (else (loop (handle str-handler fragment (if preserve-ws? "\n" " ") seed))))))))))))) ; procedure+: assert-token TOKEN KIND NAME ; Make sure that TOKEN is of anticipated KIND and has anticipated NAME (define (assert-token token kind name) (or (and (token? token) (eq? kind (token-kind token)) (equal? name (token-head token))) (parser-error #f "Expecting @end for " name ", got " token))) ;;======================================================================== ;; Highest-level parsers: Texinfo to SXML ;; These parsers are a set of syntactic forms to instantiate a SSAX ;; parser. The user tells what to do with the parsed character and ;; element data. These latter handlers determine if the parsing follows a ;; SAX or a DOM model. ;; syntax: make-command-parser fdown fup str-handler ;; Create a parser to parse and process one element, including its ;; character content or children elements. The parser is typically ;; applied to the root element of a document. ;; fdown ;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED ;; ;; This procedure is to generate the seed to be passed to handlers ;; that process the content of the element. This is the function ;; identified as 'fdown' in the denotational semantics of the XML ;; parser given in the title comments to (sxml ssax). ;; ;; fup ;; procedure COMMAND ARGUMENTS PARENT-SEED SEED ;; ;; This procedure is called when parsing of COMMAND is finished. ;; The SEED is the result from the last content parser (or from ;; fdown if the element has the empty content). PARENT-SEED is the ;; same seed as was passed to fdown. The procedure is to generate a ;; seed that will be the result of the element parser. This is the ;; function identified as 'fup' in the denotational semantics of ;; the XML parser given in the title comments to (sxml ssax). ;; ;; str-handler ;; A STR-HANDLER, see read-verbatim-body ;; ;; The generated parser is a ;; procedure COMMAND PORT SEED ;; ;; The procedure must be called *after* the command token has been read. (define (read-include-file-name port) (let ((x (string-trim-both (read-eof-line port)))) (if (string-null? x) (error "no file listed") x))) ;; fixme: should expand @value{} references (define (sxml->node-name sxml) "Turn some sxml string into a valid node name." (let loop ((in (string->list (sxml->string sxml))) (out '())) (if (null? in) (apply string (reverse out)) (if (memq (car in) '(#\{ #\} #\@ #\,)) (loop (cdr in) out) (loop (cdr in) (cons (car in) out)))))) (define (index command arguments fdown fup parent-seed) (case command ((deftp defcv defivar deftypeivar defop deftypeop defmethod deftypemethod defopt defvr defvar deftypevr deftypevar deffn deftypefn defspec defmac defun deftypefun) (let ((args `((name ,(string-append (symbol->string command) "-" (cadr (assq 'name arguments))))))) (fup 'anchor args parent-seed (fdown 'anchor args 'INLINE-ARGS '())))) ((cindex findex vindex kindex pindex tindex) (let ((args `((name ,(string-append (symbol->string command) "-" (sxml->node-name (assq 'entry arguments))))))) (fup 'anchor args parent-seed (fdown 'anchor args 'INLINE-ARGS '())))) (else parent-seed))) (define (make-command-parser fdown fup str-handler) (lambda (command port seed) (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed)) (let*-values (((command arguments expected-content) (complete-start-command command port))) (let* ((parent-seed (index command arguments fdown fup parent-seed)) (seed (fdown command arguments expected-content parent-seed)) (eof-closes? (or (memq command '(texinfo para *fragment*)) (eq? expected-content 'EOL-TEXT))) (sig-ws? (or sig-ws? (space-significant? command))) (up (lambda (s) (fup command arguments parent-seed s))) (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s))) (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s))))) (define (port-for-content) (if (eq? expected-content 'EOL-TEXT) (call-with-input-string (read-text-line port) identity) port)) (cond ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX EOL-TEXT-ARGS)) ;; empty or finished by complete-start-command (up seed)) ((eq? command 'verbatim) (up (read-verbatim-body port str-handler seed))) (else (let loop ((port (port-for-content)) (expect-eof? eof-closes?) (end-para identity) (need-break? (and (not sig-ws?) (memq expected-content '(ENVIRON TABLE-ENVIRON ENTRY ITEM FRAGMENT)))) (seed seed)) (cond ((and need-break? (or sig-ws? (skip-whitespace port)) (not (memq (peek-char port) '(#\@ #\}))) (not (eof-object? (peek-char port)))) ;; Even if we have an @, it might be inline -- check ;; that later (let ((seed (end-para seed))) (loop port expect-eof? (make-end-para seed) #f (new-para seed)))) (else (let*-values (((seed token) (read-char-data port expect-eof? sig-ws? str-handler seed))) (cond ((eof-object? token) (case expect-eof? ((include #f) (end-para seed)) (else (up (end-para seed))))) (else (case (token-kind token) ((STRING) ;; this is only @-commands that escape ;; characters: @}, @@, @{ -- new para if need-break (let ((seed ((if need-break? end-para identity) seed))) (loop port expect-eof? (if need-break? (make-end-para seed) end-para) #f (str-handler (token-head token) "" ((if need-break? new-para identity) seed))))) ((END) ;; The end will only have a name if it's for an ;; environment (cond ((memq command '(item entry)) (let ((spec (command-spec (token-head token)))) (or (eq? (cadr spec) 'TABLE-ENVIRON) (parser-error port "@item not ended by @end table/enumerate/itemize" token)))) ((eq? expected-content 'ENVIRON) (assert-token token 'END command))) (up (end-para seed))) ((ITEM) (cond ((memq command '(enumerate itemize)) (up (visit 'item port sig-ws? (end-para seed)))) ((eq? expected-content 'TABLE-ENVIRON) (up (visit 'entry port sig-ws? (end-para seed)))) ((memq command '(item entry)) (visit command port sig-ws? (up (end-para seed)))) (else (parser-error port "@item must be within a table environment" command)))) ((PARA) ;; examine valid paragraphs? (loop port expect-eof? end-para (not sig-ws?) seed)) ((INCLUDE) ;; Recurse for include files (let ((seed (call-with-file-and-dir (read-include-file-name port) (lambda (port) (loop port 'include end-para need-break? seed))))) (loop port expect-eof? end-para need-break? seed))) ((START) ; Start of an @-command (let* ((head (token-head token)) (spec (command-spec head)) (head (car spec)) (type (cadr spec)) (inline? (inline-content? type)) (seed ((if (and inline? (not need-break?)) identity end-para) seed)) (end-para (if inline? (if need-break? (make-end-para seed) end-para) identity)) (new-para (if (and inline? need-break?) new-para identity))) (loop port expect-eof? end-para (not inline?) (visit head port sig-ws? (new-para seed))))) (else (parser-error port "Unknown token type" token)))))))))))))))) ;; procedure: reverse-collect-str-drop-ws fragments ;; ;; Given the list of fragments (some of which are text strings), reverse ;; the list and concatenate adjacent text strings. We also drop ;; "unsignificant" whitespace, that is, whitespace in front, behind and ;; between elements. The whitespace that is included in character data ;; is not affected. (define (reverse-collect-str-drop-ws fragments) (cond ((null? fragments) ; a shortcut '()) ((and (string? (car fragments)) ; another shortcut (null? (cdr fragments)) ; remove single ws-only string (string-whitespace? (car fragments))) '()) (else (let loop ((fragments fragments) (result '()) (strs '()) (all-whitespace? #t)) (cond ((null? fragments) (if all-whitespace? result ; remove leading ws (cons (apply string-append strs) result))) ((string? (car fragments)) (loop (cdr fragments) result (cons (car fragments) strs) (and all-whitespace? (string-whitespace? (car fragments))))) (else (loop (cdr fragments) (cons (car fragments) (cond ((null? strs) result) (all-whitespace? (if (null? result) result ; remove trailing whitespace (cons " " result))); replace interstitial ws with ; one space (else (cons (apply string-append strs) result)))) '() #t))))))) (define (parse-inline-text-args port spec text) (let lp ((in text) (cur '()) (out '())) (cond ((null? in) (if (and (pair? cur) (string? (car cur)) (string-whitespace? (car cur))) (lp in (cdr cur) out) (let ((args (reverse (if (null? cur) out (cons (reverse cur) out))))) (arguments->attlist port args (cddr spec))))) ((pair? (car in)) (lp (cdr in) (cons (car in) cur) out)) ((string-index (car in) #\,) (let* ((parts (string-split (car in) #\,)) (head (string-trim-right (car parts))) (rev-tail (reverse (cdr parts))) (last (string-trim (car rev-tail)))) (lp (cdr in) (if (string-null? last) cur (cons last cur)) (append (cdr rev-tail) (cons (reverse (if (string-null? head) cur (cons head cur))) out))))) (else (lp (cdr in) (cons (if (null? cur) (string-trim (car in)) (car in)) cur) out))))) (define (make-dom-parser) (make-command-parser (lambda (command args content seed) ; fdown '()) (lambda (command args parent-seed seed) ; fup (let* ((seed (reverse-collect-str-drop-ws seed)) (spec (command-spec command)) (command (car spec))) (if (eq? (cadr spec) 'INLINE-TEXT-ARGS) (cons (list command (cons '% (parse-inline-text-args #f spec seed))) parent-seed) (acons command (if (null? args) seed (acons '% args seed)) parent-seed)))) (lambda (string1 string2 seed) ; str-handler (if (string-null? string2) (cons string1 seed) (cons* string2 string1 seed))))) (define parse-environment-args (let ((parser (make-dom-parser))) ;; duplicate arguments->attlist to avoid unnecessary splitting (lambda (command port) (let* ((args (cdar (parser '*ENVIRON-ARGS* port '()))) (spec (command-spec command)) (command (car spec)) (arg-names (cddr spec))) (cond ((not arg-names) (if (null? args) '() (parser-error port "@-command doesn't take args" command))) ((eq? arg-names #t) (list (cons 'arguments args))) (else (let loop ((args args) (arg-names arg-names) (out '())) (cond ((null? arg-names) (if (null? args) (reverse! out) (parser-error port "@-command didn't expect more args" command args))) ((symbol? arg-names) (reverse! (acons arg-names args out))) ((null? args) (parser-error port "@-command expects more args" command arg-names)) ((and (string? (car args)) (string-index (car args) #\space)) => (lambda (i) (let ((rest (substring/shared (car args) (1+ i)))) (if (zero? i) (loop (cons rest (cdr args)) arg-names out) (loop (cons rest (cdr args)) (cdr arg-names) (cons (list (car arg-names) (substring (car args) 0 i)) out)))))) (else (loop (cdr args) (cdr arg-names) (if (and (pair? (car args)) (eq? (caar args) '*braces*)) (acons (car arg-names) (cdar args) out) (cons (list (car arg-names) (car args)) out)))))))))))) (define (parse-eol-text-args command port) ;; perhaps parse-environment-args should be named more ;; generically. (parse-environment-args command port)) ;; procedure: texi-fragment->stexi STRING ;; ;; A DOM parser for a texinfo fragment STRING. ;; ;; The procedure returns an SXML tree headed by the special tag, ;; *fragment*. (define (texi-fragment->stexi string-or-port) "Parse the texinfo commands in @var{string-or-port}, and return the resultant stexi tree. The head of the tree will be the special command, @code{*fragment*}." (define (parse port) (postprocess (car ((make-dom-parser) '*fragment* port '())))) (if (input-port? string-or-port) (parse string-or-port) (call-with-input-string string-or-port parse))) ;; procedure: texi->stexi PORT ;; ;; This is an instance of a SSAX parser above that returns an SXML ;; representation of the texinfo document ready to be read at PORT. ;; ;; The procedure returns an SXML tree. The port points to the ;; first character after the @bye, or to the end of the file. (define (texi->stexi port) "Read a full texinfo document from @var{port} and return the parsed stexi tree. The parsing will start at the @code{@@settitle} and end at @code{@@bye} or EOF." (let ((parser (make-dom-parser))) (take-until-settitle port) (postprocess (car (parser 'texinfo port '()))))) (define (car-eq? x y) (and (pair? x) (eq? (car x) y))) (define (make-contents tree) (define (lp in out depth) (cond ((null? in) (values in (cons 'enumerate (reverse! out)))) ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4)) => (lambda (new-depth) (let ((node-name (and (car-eq? (car in) 'node) (cadr (assq 'name (cdadar in)))))) (cond ((< new-depth depth) (values in (cons 'enumerate (reverse! out)))) ((> new-depth depth) (let ((out-cdr (if (null? out) '() (cdr out))) (out-car (if (null? out) (list 'item) (car out)))) (let*-values (((new-in new-out) (lp in '() (1+ depth)))) (lp new-in (cons (append out-car (list new-out)) out-cdr) depth)))) (else ;; same depth (lp (cddr in) (cons `(item (para ,@(if node-name `((ref (% (node ,node-name)))) (cdadr in)))) out) depth)))))) (else (lp (cdr in) out depth)))) (let*-values (((_ contents) (lp tree '() 1))) `((chapheading "Table of Contents") ,contents))) (define (trim-whitespace str trim-left? trim-right?) (let* ((left-space? (and (not trim-left?) (string-prefix? " " str))) (right-space? (and (not trim-right?) (string-suffix? " " str))) (tail (append! (string-tokenize str) (if right-space? '("") '())))) (string-join (if left-space? (cons "" tail) tail)))) (define (postprocess tree) (define (loop in out state first? sig-ws?) (cond ((null? in) (values (reverse! out) state)) ((string? (car in)) (loop (cdr in) (cons (if sig-ws? (car in) (trim-whitespace (car in) first? (null? (cdr in)))) out) state #f sig-ws?)) ((pair? (car in)) (case (caar in) ((set) (if (null? (cdar in)) (error "@set missing arguments" in)) (if (string? (cadar in)) (let ((i (string-index (cadar in) #\space))) (if i (loop (cdr in) out (acons (substring (cadar in) 0 i) (cons (substring (cadar in) (1+ i)) (cddar in)) state) #f sig-ws?) (loop (cdr in) out (acons (cadar in) (cddar in) state) #f sig-ws?))) (error "expected a constant to define for @set" in))) ((value) (loop (fold-right cons (cdr in) (or (and=> (assoc (cadr (assq 'key (cdadar in))) state) cdr) (error "unknown value" (cdadar in) state))) out state #f sig-ws?)) ((copying) (loop (cdr in) out (cons (car in) state) #f sig-ws?)) ((insertcopying) (loop (fold-right cons (cdr in) (or (cdr (assoc 'copying state)) (error "copying isn't set yet"))) out state #f sig-ws?)) ((contents) (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?)) (else (let*-values (((kid-out state) (loop (car in) '() state #t (or sig-ws? (space-significant? (caar in)))))) (loop (cdr in) (cons kid-out out) state #f sig-ws?))))) (else ; a symbol (loop (cdr in) (cons (car in) out) state #t sig-ws?)))) (call-with-values (lambda () (loop tree '() '() #t #f)) (lambda (out state) out))) ;; Replace % with texinfo-arguments. (define (stexi->sxml tree) "Transform the stexi tree @var{tree} into sxml. This involves replacing the @code{%} element that keeps the texinfo arguments with an element for each argument. FIXME: right now it just changes % to @code{texinfo-arguments} -- that doesn't hang with the idea of making a dtd at some point" (pre-post-order tree `((% . ,(lambda (x . t) (cons 'texinfo-arguments t))) (*text* . ,(lambda (x t) t)) (*default* . ,(lambda (x . t) (cons x t)))))) ;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5 ;;; texinfo.scm ends here