| 1 | ;;;; (texinfo) -- parsing of texinfo into SXML |
| 2 | ;;;; |
| 3 | ;;;; Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. |
| 4 | ;;;; Copyright (C) 2004, 2009 Andy Wingo <wingo at pobox dot com> |
| 5 | ;;;; Copyright (C) 2001,2002 Oleg Kiselyov <oleg at pobox dot com> |
| 6 | ;;;; |
| 7 | ;;;; This file is based on SSAX's SSAX.scm. |
| 8 | ;;;; |
| 9 | ;;;; This library is free software; you can redistribute it and/or |
| 10 | ;;;; modify it under the terms of the GNU Lesser General Public |
| 11 | ;;;; License as published by the Free Software Foundation; either |
| 12 | ;;;; version 3 of the License, or (at your option) any later version. |
| 13 | ;;;; |
| 14 | ;;;; This library is distributed in the hope that it will be useful, |
| 15 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 17 | ;;;; Lesser General Public License for more details. |
| 18 | ;;;; |
| 19 | ;;;; You should have received a copy of the GNU Lesser General Public |
| 20 | ;;;; License along with this library; if not, write to the Free Software |
| 21 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 22 | \f |
| 23 | ;;; Commentary: |
| 24 | ;; |
| 25 | ;; @subheading Texinfo processing in scheme |
| 26 | ;; |
| 27 | ;; This module parses texinfo into SXML. TeX will always be the |
| 28 | ;; processor of choice for print output, of course. However, although |
| 29 | ;; @code{makeinfo} works well for info, its output in other formats is |
| 30 | ;; not very customizable, and the program is not extensible as a whole. |
| 31 | ;; This module aims to provide an extensible framework for texinfo |
| 32 | ;; processing that integrates texinfo into the constellation of SXML |
| 33 | ;; processing tools. |
| 34 | ;; |
| 35 | ;; @subheading Notes on the SXML vocabulary |
| 36 | ;; |
| 37 | ;; Consider the following texinfo fragment: |
| 38 | ;; |
| 39 | ;;@example |
| 40 | ;; @@deffn Primitive set-car! pair value |
| 41 | ;; This function... |
| 42 | ;; @@end deffn |
| 43 | ;;@end example |
| 44 | ;; |
| 45 | ;; Logically, the category (Primitive), name (set-car!), and arguments |
| 46 | ;; (pair value) are ``attributes'' of the deffn, with the description as |
| 47 | ;; the content. However, texinfo allows for @@-commands within the |
| 48 | ;; arguments to an environment, like @code{@@deffn}, which means that |
| 49 | ;; texinfo ``attributes'' are PCDATA. XML attributes, on the other hand, |
| 50 | ;; are CDATA. For this reason, ``attributes'' of texinfo @@-commands are |
| 51 | ;; called ``arguments'', and are grouped under the special element, `%'. |
| 52 | ;; |
| 53 | ;; Because `%' is not a valid NCName, stexinfo is a superset of SXML. In |
| 54 | ;; the interests of interoperability, this module provides a conversion |
| 55 | ;; function to replace the `%' with `texinfo-arguments'. |
| 56 | ;; |
| 57 | ;;; Code: |
| 58 | |
| 59 | ;; Comparison to xml output of texinfo (which is rather undocumented): |
| 60 | ;; Doesn't conform to texinfo dtd |
| 61 | ;; No DTD at all, in fact :-/ |
| 62 | ;; Actually outputs valid xml, after transforming % |
| 63 | ;; Slower (although with caching the SXML that problem can go away) |
| 64 | ;; Doesn't parse menus (although menus are shite) |
| 65 | ;; Args go in a dedicated element, FBOFW |
| 66 | ;; Definitions are handled a lot better |
| 67 | ;; Does parse comments |
| 68 | ;; Outputs only significant line breaks (a biggie!) |
| 69 | ;; Nodes are treated as anchors, rather than content organizers (a biggie) |
| 70 | ;; (more book-like, less info-like) |
| 71 | |
| 72 | ;; TODO |
| 73 | ;; Integration: help, indexing, plain text |
| 74 | |
| 75 | (define-module (texinfo) |
| 76 | #:use-module (sxml simple) |
| 77 | #:use-module (sxml transform) |
| 78 | #:use-module (sxml ssax input-parse) |
| 79 | #:use-module (srfi srfi-1) |
| 80 | #:use-module (srfi srfi-11) |
| 81 | #:use-module (srfi srfi-13) |
| 82 | #:export (call-with-file-and-dir |
| 83 | texi-command-specs |
| 84 | texi-command-depth |
| 85 | texi-fragment->stexi |
| 86 | texi->stexi |
| 87 | stexi->sxml)) |
| 88 | |
| 89 | ;; Some utilities |
| 90 | |
| 91 | (define (parser-error port message . rest) |
| 92 | (apply throw 'parser-error port message rest)) |
| 93 | |
| 94 | (define (call-with-file-and-dir filename proc) |
| 95 | "Call the one-argument procedure @var{proc} with an input port that |
| 96 | reads from @var{filename}. During the dynamic extent of @var{proc}'s |
| 97 | execution, the current directory will be @code{(dirname |
| 98 | @var{filename})}. This is useful for parsing documents that can include |
| 99 | files by relative path name." |
| 100 | (let ((current-dir (getcwd))) |
| 101 | (dynamic-wind |
| 102 | (lambda () (chdir (dirname filename))) |
| 103 | (lambda () |
| 104 | (call-with-input-file (basename filename) proc)) |
| 105 | (lambda () (chdir current-dir))))) |
| 106 | |
| 107 | ;;======================================================================== |
| 108 | ;; Reflection on the XML vocabulary |
| 109 | |
| 110 | (define texi-command-specs |
| 111 | #; |
| 112 | "A list of (@var{name} @var{content-model} . @var{args}) |
| 113 | |
| 114 | @table @var |
| 115 | @item name |
| 116 | The name of an @@-command, as a symbol. |
| 117 | |
| 118 | @item content-model |
| 119 | A symbol indicating the syntactic type of the @@-command: |
| 120 | @table @code |
| 121 | @item EMPTY-COMMAND |
| 122 | No content, and no @code{@@end} is coming |
| 123 | @item EOL-ARGS |
| 124 | Unparsed arguments until end of line |
| 125 | @item EOL-TEXT |
| 126 | Parsed arguments until end of line |
| 127 | @item INLINE-ARGS |
| 128 | Unparsed arguments ending with @code{#\\@}} |
| 129 | @item INLINE-TEXT |
| 130 | Parsed arguments ending with @code{#\\@}} |
| 131 | @item INLINE-TEXT-ARGS |
| 132 | Parsed arguments ending with @code{#\\@}} |
| 133 | @item ENVIRON |
| 134 | The tag is an environment tag, expect @code{@@end foo}. |
| 135 | @item TABLE-ENVIRON |
| 136 | Like ENVIRON, but with special parsing rules for its arguments. |
| 137 | @item FRAGMENT |
| 138 | For @code{*fragment*}, the command used for parsing fragments of |
| 139 | texinfo documents. |
| 140 | @end table |
| 141 | |
| 142 | @code{INLINE-TEXT} commands will receive their arguments within their |
| 143 | bodies, whereas the @code{-ARGS} commands will receive them in their |
| 144 | attribute list. |
| 145 | |
| 146 | @code{EOF-TEXT} receives its arguments in its body. |
| 147 | |
| 148 | @code{ENVIRON} commands have both: parsed arguments until the end of |
| 149 | line, received through their attribute list, and parsed text until the |
| 150 | @code{@@end}, received in their bodies. |
| 151 | |
| 152 | @code{EOF-TEXT-ARGS} receives its arguments in its attribute list, as in |
| 153 | @code{ENVIRON}. |
| 154 | |
| 155 | In addition, @code{ALIAS} can alias one command to another. The alias |
| 156 | will never be seen in parsed stexinfo. |
| 157 | |
| 158 | There are four @@-commands that are treated specially. @code{@@include} |
| 159 | is a low-level token that will not be seen by higher-level parsers, so |
| 160 | it has no content-model. @code{@@para} is the paragraph command, which |
| 161 | is only implicit in the texinfo source. @code{@@item} has special |
| 162 | syntax, as noted above, and @code{@@entry} is how this parser treats |
| 163 | @code{@@item} commands within @code{@@table}, @code{@@ftable}, and |
| 164 | @code{@@vtable}. |
| 165 | |
| 166 | Also, indexing commands (@code{@@cindex}, etc.) are treated specially. |
| 167 | Their arguments are parsed, but they are needed before entering the |
| 168 | element so that an anchor can be inserted into the text before the index |
| 169 | entry. |
| 170 | |
| 171 | @item args |
| 172 | Named arguments to the command, in the same format as the formals for a |
| 173 | lambda. Only present for @code{INLINE-ARGS}, @code{EOL-ARGS}, |
| 174 | @code{INLINE-TEXT-ARGS}, @code{ENVIRON}, @code{TABLE-ENVIRON} commands. |
| 175 | @end table" |
| 176 | '(;; Special commands |
| 177 | (include #f) ;; this is a low-level token |
| 178 | (para PARAGRAPH) |
| 179 | (item ITEM) |
| 180 | (entry ENTRY . heading) |
| 181 | (noindent EMPTY-COMMAND) |
| 182 | (*fragment* FRAGMENT) |
| 183 | |
| 184 | ;; Inline text commands |
| 185 | (*braces* INLINE-TEXT) ;; FIXME: make me irrelevant |
| 186 | (bold INLINE-TEXT) |
| 187 | (sample INLINE-TEXT) |
| 188 | (samp INLINE-TEXT) |
| 189 | (code INLINE-TEXT) |
| 190 | (math INLINE-TEXT) |
| 191 | (kbd INLINE-TEXT) |
| 192 | (key INLINE-TEXT) |
| 193 | (var INLINE-TEXT) |
| 194 | (env INLINE-TEXT) |
| 195 | (file INLINE-TEXT) |
| 196 | (command INLINE-TEXT) |
| 197 | (option INLINE-TEXT) |
| 198 | (dfn INLINE-TEXT) |
| 199 | (cite INLINE-TEXT) |
| 200 | (acro INLINE-TEXT) |
| 201 | (email INLINE-TEXT) |
| 202 | (emph INLINE-TEXT) |
| 203 | (strong INLINE-TEXT) |
| 204 | (sample INLINE-TEXT) |
| 205 | (sc INLINE-TEXT) |
| 206 | (titlefont INLINE-TEXT) |
| 207 | (asis INLINE-TEXT) |
| 208 | (b INLINE-TEXT) |
| 209 | (i INLINE-TEXT) |
| 210 | (r INLINE-TEXT) |
| 211 | (sansserif INLINE-TEXT) |
| 212 | (slanted INLINE-TEXT) |
| 213 | (t INLINE-TEXT) |
| 214 | |
| 215 | ;; Inline args commands |
| 216 | (value INLINE-ARGS . (key)) |
| 217 | (ref INLINE-ARGS . (node #:opt name section info-file manual)) |
| 218 | (xref INLINE-ARGS . (node #:opt name section info-file manual)) |
| 219 | (pxref INLINE-TEXT-ARGS |
| 220 | . (node #:opt name section info-file manual)) |
| 221 | (url ALIAS . uref) |
| 222 | (uref INLINE-TEXT-ARGS . (url #:opt title replacement)) |
| 223 | (anchor INLINE-ARGS . (name)) |
| 224 | (dots INLINE-ARGS . ()) |
| 225 | (result INLINE-ARGS . ()) |
| 226 | (bullet INLINE-ARGS . ()) |
| 227 | (copyright INLINE-ARGS . ()) |
| 228 | (tie INLINE-ARGS . ()) |
| 229 | (image INLINE-ARGS . (file #:opt width height alt-text extension)) |
| 230 | |
| 231 | ;; Inline parsed args commands |
| 232 | (acronym INLINE-TEXT-ARGS . (acronym #:opt meaning)) |
| 233 | |
| 234 | ;; EOL args elements |
| 235 | (node EOL-ARGS . (name #:opt next previous up)) |
| 236 | (c EOL-ARGS . all) |
| 237 | (comment EOL-ARGS . all) |
| 238 | (setchapternewpage EOL-ARGS . all) |
| 239 | (sp EOL-ARGS . all) |
| 240 | (page EOL-ARGS . ()) |
| 241 | (vskip EOL-ARGS . all) |
| 242 | (syncodeindex EOL-ARGS . all) |
| 243 | (contents EOL-ARGS . ()) |
| 244 | (shortcontents EOL-ARGS . ()) |
| 245 | (summarycontents EOL-ARGS . ()) |
| 246 | (insertcopying EOL-ARGS . ()) |
| 247 | (dircategory EOL-ARGS . (category)) |
| 248 | (top EOL-ARGS . (title)) |
| 249 | (printindex EOL-ARGS . (type)) |
| 250 | (paragraphindent EOL-ARGS . (indent)) |
| 251 | |
| 252 | ;; EOL text commands |
| 253 | (*ENVIRON-ARGS* EOL-TEXT) |
| 254 | (itemx EOL-TEXT) |
| 255 | (set EOL-TEXT) |
| 256 | (center EOL-TEXT) |
| 257 | (title EOL-TEXT) |
| 258 | (subtitle EOL-TEXT) |
| 259 | (author EOL-TEXT) |
| 260 | (chapter EOL-TEXT) |
| 261 | (section EOL-TEXT) |
| 262 | (appendix EOL-TEXT) |
| 263 | (appendixsec EOL-TEXT) |
| 264 | (unnumbered EOL-TEXT) |
| 265 | (unnumberedsec EOL-TEXT) |
| 266 | (subsection EOL-TEXT) |
| 267 | (subsubsection EOL-TEXT) |
| 268 | (appendixsubsec EOL-TEXT) |
| 269 | (appendixsubsubsec EOL-TEXT) |
| 270 | (unnumberedsubsec EOL-TEXT) |
| 271 | (unnumberedsubsubsec EOL-TEXT) |
| 272 | (chapheading EOL-TEXT) |
| 273 | (majorheading EOL-TEXT) |
| 274 | (heading EOL-TEXT) |
| 275 | (subheading EOL-TEXT) |
| 276 | (subsubheading EOL-TEXT) |
| 277 | |
| 278 | (deftpx EOL-TEXT-ARGS . (category name . attributes)) |
| 279 | (defcvx EOL-TEXT-ARGS . (category class name)) |
| 280 | (defivarx EOL-TEXT-ARGS . (class name)) |
| 281 | (deftypeivarx EOL-TEXT-ARGS . (class data-type name)) |
| 282 | (defopx EOL-TEXT-ARGS . (category class name . arguments)) |
| 283 | (deftypeopx EOL-TEXT-ARGS . (category class data-type name . arguments)) |
| 284 | (defmethodx EOL-TEXT-ARGS . (class name . arguments)) |
| 285 | (deftypemethodx EOL-TEXT-ARGS . (class data-type name . arguments)) |
| 286 | (defoptx EOL-TEXT-ARGS . (name)) |
| 287 | (defvrx EOL-TEXT-ARGS . (category name)) |
| 288 | (defvarx EOL-TEXT-ARGS . (name)) |
| 289 | (deftypevrx EOL-TEXT-ARGS . (category data-type name)) |
| 290 | (deftypevarx EOL-TEXT-ARGS . (data-type name)) |
| 291 | (deffnx EOL-TEXT-ARGS . (category name . arguments)) |
| 292 | (deftypefnx EOL-TEXT-ARGS . (category data-type name . arguments)) |
| 293 | (defspecx EOL-TEXT-ARGS . (name . arguments)) |
| 294 | (defmacx EOL-TEXT-ARGS . (name . arguments)) |
| 295 | (defunx EOL-TEXT-ARGS . (name . arguments)) |
| 296 | (deftypefunx EOL-TEXT-ARGS . (data-type name . arguments)) |
| 297 | |
| 298 | ;; Indexing commands |
| 299 | (cindex INDEX . entry) |
| 300 | (findex INDEX . entry) |
| 301 | (vindex INDEX . entry) |
| 302 | (kindex INDEX . entry) |
| 303 | (pindex INDEX . entry) |
| 304 | (tindex INDEX . entry) |
| 305 | |
| 306 | ;; Environment commands (those that need @end) |
| 307 | (texinfo ENVIRON . title) |
| 308 | (ignore ENVIRON . ()) |
| 309 | (ifinfo ENVIRON . ()) |
| 310 | (iftex ENVIRON . ()) |
| 311 | (ifhtml ENVIRON . ()) |
| 312 | (ifxml ENVIRON . ()) |
| 313 | (ifplaintext ENVIRON . ()) |
| 314 | (ifnotinfo ENVIRON . ()) |
| 315 | (ifnottex ENVIRON . ()) |
| 316 | (ifnothtml ENVIRON . ()) |
| 317 | (ifnotxml ENVIRON . ()) |
| 318 | (ifnotplaintext ENVIRON . ()) |
| 319 | (titlepage ENVIRON . ()) |
| 320 | (menu ENVIRON . ()) |
| 321 | (direntry ENVIRON . ()) |
| 322 | (copying ENVIRON . ()) |
| 323 | (example ENVIRON . ()) |
| 324 | (smallexample ENVIRON . ()) |
| 325 | (display ENVIRON . ()) |
| 326 | (smalldisplay ENVIRON . ()) |
| 327 | (verbatim ENVIRON . ()) |
| 328 | (format ENVIRON . ()) |
| 329 | (smallformat ENVIRON . ()) |
| 330 | (lisp ENVIRON . ()) |
| 331 | (smalllisp ENVIRON . ()) |
| 332 | (cartouche ENVIRON . ()) |
| 333 | (quotation ENVIRON . ()) |
| 334 | |
| 335 | (deftp ENVIRON . (category name . attributes)) |
| 336 | (defcv ENVIRON . (category class name)) |
| 337 | (defivar ENVIRON . (class name)) |
| 338 | (deftypeivar ENVIRON . (class data-type name)) |
| 339 | (defop ENVIRON . (category class name . arguments)) |
| 340 | (deftypeop ENVIRON . (category class data-type name . arguments)) |
| 341 | (defmethod ENVIRON . (class name . arguments)) |
| 342 | (deftypemethod ENVIRON . (class data-type name . arguments)) |
| 343 | (defopt ENVIRON . (name)) |
| 344 | (defvr ENVIRON . (category name)) |
| 345 | (defvar ENVIRON . (name)) |
| 346 | (deftypevr ENVIRON . (category data-type name)) |
| 347 | (deftypevar ENVIRON . (data-type name)) |
| 348 | (deffn ENVIRON . (category name . arguments)) |
| 349 | (deftypefn ENVIRON . (category data-type name . arguments)) |
| 350 | (defspec ENVIRON . (name . arguments)) |
| 351 | (defmac ENVIRON . (name . arguments)) |
| 352 | (defun ENVIRON . (name . arguments)) |
| 353 | (deftypefun ENVIRON . (data-type name . arguments)) |
| 354 | |
| 355 | (table TABLE-ENVIRON . (formatter)) |
| 356 | (itemize TABLE-ENVIRON . (formatter)) |
| 357 | (enumerate TABLE-ENVIRON . (start)) |
| 358 | (ftable TABLE-ENVIRON . (formatter)) |
| 359 | (vtable TABLE-ENVIRON . (formatter)))) |
| 360 | |
| 361 | (define command-depths |
| 362 | '((chapter . 1) (section . 2) (subsection . 3) (subsubsection . 4) |
| 363 | (top . 0) (unnumbered . 1) (unnumberedsec . 2) |
| 364 | (unnumberedsubsec . 3) (unnumberedsubsubsec . 4) |
| 365 | (appendix . 1) (appendixsec . 2) (appendixsection . 2) |
| 366 | (appendixsubsec . 3) (appendixsubsubsec . 4))) |
| 367 | (define (texi-command-depth command max-depth) |
| 368 | "Given the texinfo command @var{command}, return its nesting level, or |
| 369 | @code{#f} if it nests too deep for @var{max-depth}. |
| 370 | |
| 371 | Examples: |
| 372 | @example |
| 373 | (texi-command-depth 'chapter 4) @result{} 1 |
| 374 | (texi-command-depth 'top 4) @result{} 0 |
| 375 | (texi-command-depth 'subsection 4) @result{} 3 |
| 376 | (texi-command-depth 'appendixsubsec 4) @result{} 3 |
| 377 | (texi-command-depth 'subsection 2) @result{} #f |
| 378 | @end example" |
| 379 | (let ((depth (and=> (assq command command-depths) cdr))) |
| 380 | (and depth (<= depth max-depth) depth))) |
| 381 | |
| 382 | ;; The % is for arguments |
| 383 | (define (space-significant? command) |
| 384 | (memq command |
| 385 | '(example smallexample verbatim lisp smalllisp menu %))) |
| 386 | |
| 387 | ;; Like a DTD for texinfo |
| 388 | (define (command-spec command) |
| 389 | (let ((spec (assq command texi-command-specs))) |
| 390 | (cond |
| 391 | ((not spec) |
| 392 | (parser-error #f "Unknown command" command)) |
| 393 | ((eq? (cadr spec) 'ALIAS) |
| 394 | (command-spec (cddr spec))) |
| 395 | (else |
| 396 | spec)))) |
| 397 | |
| 398 | (define (inline-content? content) |
| 399 | (case content |
| 400 | ((INLINE-TEXT INLINE-ARGS INLINE-TEXT-ARGS) #t) |
| 401 | (else #f))) |
| 402 | |
| 403 | |
| 404 | ;;======================================================================== |
| 405 | ;; Lower-level parsers and scanners |
| 406 | ;; |
| 407 | ;; They deal with primitive lexical units (Names, whitespaces, tags) and |
| 408 | ;; with pieces of more generic productions. Most of these parsers must |
| 409 | ;; be called in appropriate context. For example, complete-start-command |
| 410 | ;; must be called only when the @-command start has been detected and |
| 411 | ;; its name token has been read. |
| 412 | |
| 413 | ;; Test if a string is made of only whitespace |
| 414 | ;; An empty string is considered made of whitespace as well |
| 415 | (define (string-whitespace? str) |
| 416 | (or (string-null? str) |
| 417 | (string-every char-whitespace? str))) |
| 418 | |
| 419 | ;; Like read-text-line, but allows EOF. |
| 420 | (define read-eof-breaks '(*eof* #\return #\newline)) |
| 421 | (define (read-eof-line port) |
| 422 | (if (eof-object? (peek-char port)) |
| 423 | (peek-char port) |
| 424 | (let* ((line (next-token '() read-eof-breaks |
| 425 | "reading a line" port)) |
| 426 | (c (read-char port))) ; must be either \n or \r or EOF |
| 427 | (if (and (eq? c #\return) (eq? (peek-char port) #\newline)) |
| 428 | (read-char port)) ; skip \n that follows \r |
| 429 | line))) |
| 430 | |
| 431 | (define (skip-whitespace port) |
| 432 | (skip-while '(#\space #\tab #\return #\newline) port)) |
| 433 | |
| 434 | (define (skip-horizontal-whitespace port) |
| 435 | (skip-while '(#\space #\tab) port)) |
| 436 | |
| 437 | ;; command ::= Letter+ |
| 438 | |
| 439 | ;; procedure: read-command PORT |
| 440 | ;; |
| 441 | ;; Read a command starting from the current position in the PORT and |
| 442 | ;; return it as a symbol. |
| 443 | (define (read-command port) |
| 444 | (let ((first-char (peek-char port))) |
| 445 | (or (char-alphabetic? first-char) |
| 446 | (parser-error port "Nonalphabetic @-command char: '" first-char "'"))) |
| 447 | (string->symbol |
| 448 | (next-token-of |
| 449 | (lambda (c) |
| 450 | (cond |
| 451 | ((eof-object? c) #f) |
| 452 | ((char-alphabetic? c) c) |
| 453 | (else #f))) |
| 454 | port))) |
| 455 | |
| 456 | ;; A token is a primitive lexical unit. It is a record with two fields, |
| 457 | ;; token-head and token-kind. |
| 458 | ;; |
| 459 | ;; Token types: |
| 460 | ;; END The end of a texinfo command. If the command is ended by }, |
| 461 | ;; token-head will be #f. Otherwise if the command is ended by |
| 462 | ;; @end COMMAND, token-head will be COMMAND. As a special case, |
| 463 | ;; @bye is the end of a special @texinfo command. |
| 464 | ;; START The start of a texinfo command. The token-head will be a |
| 465 | ;; symbol of the @-command name. |
| 466 | ;; INCLUDE An @include directive. The token-head will be empty -- the |
| 467 | ;; caller is responsible for reading the include file name. |
| 468 | ;; ITEM @item commands have an irregular syntax. They end at the |
| 469 | ;; next @item, or at the end of the environment. For that |
| 470 | ;; read-command-token treats them specially. |
| 471 | |
| 472 | (define (make-token kind head) (cons kind head)) |
| 473 | (define token? pair?) |
| 474 | (define token-kind car) |
| 475 | (define token-head cdr) |
| 476 | |
| 477 | ;; procedure: read-command-token PORT |
| 478 | ;; |
| 479 | ;; This procedure starts parsing of a command token. The current |
| 480 | ;; position in the stream must be #\@. This procedure scans enough of |
| 481 | ;; the input stream to figure out what kind of a command token it is |
| 482 | ;; seeing. The procedure returns a token structure describing the token. |
| 483 | |
| 484 | (define (read-command-token port) |
| 485 | (assert-curr-char '(#\@) "start of the command" port) |
| 486 | (let ((peeked (peek-char port))) |
| 487 | (cond |
| 488 | ((memq peeked '(#\! #\: #\. #\? #\@ #\\ #\{ #\})) |
| 489 | ;; @-commands that escape characters |
| 490 | (make-token 'STRING (string (read-char port)))) |
| 491 | (else |
| 492 | (let ((name (read-command port))) |
| 493 | (case name |
| 494 | ((end) |
| 495 | ;; got an ending tag |
| 496 | (let ((command (string-trim-both |
| 497 | (read-eof-line port)))) |
| 498 | (or (and (not (string-null? command)) |
| 499 | (string-every char-alphabetic? command)) |
| 500 | (parser-error port "malformed @end" command)) |
| 501 | (make-token 'END (string->symbol command)))) |
| 502 | ((bye) |
| 503 | ;; the end of the top |
| 504 | (make-token 'END 'texinfo)) |
| 505 | ((item) |
| 506 | (make-token 'ITEM 'item)) |
| 507 | ((include) |
| 508 | (make-token 'INCLUDE #f)) |
| 509 | (else |
| 510 | (make-token 'START name)))))))) |
| 511 | |
| 512 | ;; procedure+: read-verbatim-body PORT STR-HANDLER SEED |
| 513 | ;; |
| 514 | ;; This procedure must be called after we have read a string |
| 515 | ;; "@verbatim\n" that begins a verbatim section. The current position |
| 516 | ;; must be the first position of the verbatim body. This function reads |
| 517 | ;; _lines_ of the verbatim body and passes them to a STR-HANDLER, a |
| 518 | ;; character data consumer. |
| 519 | ;; |
| 520 | ;; The str-handler is a STR-HANDLER, a procedure STRING1 STRING2 SEED. |
| 521 | ;; The first STRING1 argument to STR-HANDLER never contains a newline. |
| 522 | ;; The second STRING2 argument often will. On the first invocation of the |
| 523 | ;; STR-HANDLER, the seed is the one passed to read-verbatim-body |
| 524 | ;; as the third argument. The result of this first invocation will be |
| 525 | ;; passed as the seed argument to the second invocation of the line |
| 526 | ;; consumer, and so on. The result of the last invocation of the |
| 527 | ;; STR-HANDLER is returned by the read-verbatim-body. Note a |
| 528 | ;; similarity to the fundamental 'fold' iterator. |
| 529 | ;; |
| 530 | ;; Within a verbatim section all characters are taken at their face |
| 531 | ;; value. It ends with "\n@end verbatim(\r)?\n". |
| 532 | |
| 533 | ;; Must be called right after the newline after @verbatim. |
| 534 | (define (read-verbatim-body port str-handler seed) |
| 535 | (let loop ((seed seed)) |
| 536 | (let ((fragment (next-token '() '(#\newline) |
| 537 | "reading verbatim" port))) |
| 538 | ;; We're reading the char after the 'fragment', which is |
| 539 | ;; #\newline. |
| 540 | (read-char port) |
| 541 | (if (string=? fragment "@end verbatim") |
| 542 | seed |
| 543 | (loop (str-handler fragment "\n" seed)))))) |
| 544 | |
| 545 | ;; procedure+: read-arguments PORT |
| 546 | ;; |
| 547 | ;; This procedure reads and parses a production ArgumentList. |
| 548 | ;; ArgumentList ::= S* Argument (S* , S* Argument)* S* |
| 549 | ;; Argument ::= ([^@{},])* |
| 550 | ;; |
| 551 | ;; Arguments are the things in braces, i.e @ref{my node} has one |
| 552 | ;; argument, "my node". Most commands taking braces actually don't have |
| 553 | ;; arguments, they process text. For example, in |
| 554 | ;; @emph{@strong{emphasized}}, the emph takes text, because the parse |
| 555 | ;; continues into the braces. |
| 556 | ;; |
| 557 | ;; Any whitespace within Argument is replaced with a single space. |
| 558 | ;; Whitespace around an Argument is trimmed. |
| 559 | ;; |
| 560 | ;; The procedure returns a list of arguments. Afterwards the current |
| 561 | ;; character will be after the final #\}. |
| 562 | |
| 563 | (define (read-arguments port stop-char) |
| 564 | (define (split str) |
| 565 | (read-char port) ;; eat the delimiter |
| 566 | (let ((ret (map (lambda (x) (if (string-null? x) #f x)) |
| 567 | (map string-trim-both (string-split str #\,))))) |
| 568 | (if (and (pair? ret) (eq? (car ret) #f) (null? (cdr ret))) |
| 569 | '() |
| 570 | ret))) |
| 571 | (split (next-token '() (list stop-char) |
| 572 | "arguments of @-command" port))) |
| 573 | |
| 574 | ;; procedure+: complete-start-command COMMAND PORT |
| 575 | ;; |
| 576 | ;; This procedure is to complete parsing of an @-command. The procedure |
| 577 | ;; must be called after the command token has been read. COMMAND is a |
| 578 | ;; TAG-NAME. |
| 579 | ;; |
| 580 | ;; This procedure returns several values: |
| 581 | ;; COMMAND: a symbol. |
| 582 | ;; ARGUMENTS: command's arguments, as an alist. |
| 583 | ;; CONTENT-MODEL: the content model of the command. |
| 584 | ;; |
| 585 | ;; On exit, the current position in PORT will depend on the CONTENT-MODEL. |
| 586 | ;; |
| 587 | ;; Content model Port position |
| 588 | ;; ============= ============= |
| 589 | ;; INLINE-TEXT One character after the #\{. |
| 590 | ;; INLINE-TEXT-ARGS One character after the #\{. |
| 591 | ;; INLINE-ARGS The first character after the #\}. |
| 592 | ;; EOL-TEXT The first non-whitespace character after the command. |
| 593 | ;; ENVIRON, TABLE-ENVIRON, EOL-ARGS, EOL-TEXT |
| 594 | ;; The first character on the next line. |
| 595 | ;; PARAGRAPH, ITEM, EMPTY-COMMAND |
| 596 | ;; The first character after the command. |
| 597 | |
| 598 | (define (arguments->attlist port args arg-names) |
| 599 | (let loop ((in args) (names arg-names) (opt? #f) (out '())) |
| 600 | (cond |
| 601 | ((symbol? names) ;; a rest arg |
| 602 | (reverse (if (null? in) out (acons names in out)))) |
| 603 | ((and (not (null? names)) (eq? (car names) #:opt)) |
| 604 | (loop in (cdr names) #t out)) |
| 605 | ((null? in) |
| 606 | (if (or (null? names) opt?) |
| 607 | (reverse out) |
| 608 | (parser-error port "@-command expected more arguments:" |
| 609 | args arg-names names))) |
| 610 | ((null? names) |
| 611 | (parser-error port "@-command didn't expect more arguments:" in)) |
| 612 | ((not (car in)) |
| 613 | (or (and opt? (loop (cdr in) (cdr names) opt? out)) |
| 614 | (parser-error "@-command missing required argument" |
| 615 | (car names)))) |
| 616 | (else |
| 617 | (loop (cdr in) (cdr names) opt? |
| 618 | (acons (car names) |
| 619 | (if (list? (car in)) (car in) (list (car in))) |
| 620 | out)))))) |
| 621 | |
| 622 | (define (parse-table-args command port) |
| 623 | (let* ((line (string-trim-both (read-text-line port))) |
| 624 | (length (string-length line))) |
| 625 | (define (get-formatter) |
| 626 | (or (and (not (zero? length)) |
| 627 | (eq? (string-ref line 0) #\@) |
| 628 | (let ((f (string->symbol (substring line 1)))) |
| 629 | (or (inline-content? (cadr (command-spec f))) |
| 630 | (parser-error |
| 631 | port "@item formatter must be INLINE" f)) |
| 632 | f)) |
| 633 | (parser-error port "Invalid @item formatter" line))) |
| 634 | (case command |
| 635 | ((enumerate) |
| 636 | (if (zero? length) |
| 637 | '() |
| 638 | `((start |
| 639 | ,(if (or (and (eq? length 1) |
| 640 | (char-alphabetic? (string-ref line 0))) |
| 641 | (string-every char-numeric? line)) |
| 642 | line |
| 643 | (parser-error |
| 644 | port "Invalid enumerate start" line)))))) |
| 645 | ((itemize) |
| 646 | `((bullet |
| 647 | ,(or (and (eq? length 1) line) |
| 648 | (and (string-null? line) '(bullet)) |
| 649 | (list (get-formatter)))))) |
| 650 | (else ;; tables of various varieties |
| 651 | `((formatter (,(get-formatter)))))))) |
| 652 | |
| 653 | (define (complete-start-command command port) |
| 654 | (define (get-arguments type arg-names stop-char) |
| 655 | (arguments->attlist port (read-arguments port stop-char) arg-names)) |
| 656 | |
| 657 | (let* ((spec (command-spec command)) |
| 658 | (command (car spec)) |
| 659 | (type (cadr spec)) |
| 660 | (arg-names (cddr spec))) |
| 661 | (case type |
| 662 | ((INLINE-TEXT) |
| 663 | (assert-curr-char '(#\{) "Inline element lacks {" port) |
| 664 | (values command '() type)) |
| 665 | ((INLINE-ARGS) |
| 666 | (assert-curr-char '(#\{) "Inline element lacks {" port) |
| 667 | (values command (get-arguments type arg-names #\}) type)) |
| 668 | ((INLINE-TEXT-ARGS) |
| 669 | (assert-curr-char '(#\{) "Inline element lacks {" port) |
| 670 | (values command '() type)) |
| 671 | ((EOL-ARGS) |
| 672 | (values command (get-arguments type arg-names #\newline) type)) |
| 673 | ((ENVIRON ENTRY INDEX) |
| 674 | (skip-horizontal-whitespace port) |
| 675 | (values command (parse-environment-args command port) type)) |
| 676 | ((TABLE-ENVIRON) |
| 677 | (skip-horizontal-whitespace port) |
| 678 | (values command (parse-table-args command port) type)) |
| 679 | ((EOL-TEXT) |
| 680 | (skip-horizontal-whitespace port) |
| 681 | (values command '() type)) |
| 682 | ((EOL-TEXT-ARGS) |
| 683 | (skip-horizontal-whitespace port) |
| 684 | (values command (parse-eol-text-args command port) type)) |
| 685 | ((PARAGRAPH EMPTY-COMMAND ITEM FRAGMENT) |
| 686 | (values command '() type)) |
| 687 | (else ;; INCLUDE shouldn't get here |
| 688 | (parser-error port "can't happen"))))) |
| 689 | |
| 690 | ;;----------------------------------------------------------------------------- |
| 691 | ;; Higher-level parsers and scanners |
| 692 | ;; |
| 693 | ;; They parse productions corresponding entire @-commands. |
| 694 | |
| 695 | ;; Only reads @settitle, leaves it to the command parser to finish |
| 696 | ;; reading the title. |
| 697 | (define (take-until-settitle port) |
| 698 | (or (find-string-from-port? "\n@settitle " port) |
| 699 | (parser-error port "No \\n@settitle found")) |
| 700 | (skip-horizontal-whitespace port) |
| 701 | (and (eq? (peek-char port) #\newline) |
| 702 | (parser-error port "You have a @settitle, but no title"))) |
| 703 | |
| 704 | ;; procedure+: read-char-data PORT EXPECT-EOF? STR-HANDLER SEED |
| 705 | ;; |
| 706 | ;; This procedure is to read the CharData of a texinfo document. |
| 707 | ;; |
| 708 | ;; text ::= (CharData | Command)* |
| 709 | ;; |
| 710 | ;; The procedure reads CharData and stops at @-commands (or |
| 711 | ;; environments). It also stops at an open or close brace. |
| 712 | ;; |
| 713 | ;; port |
| 714 | ;; a PORT to read |
| 715 | ;; expect-eof? |
| 716 | ;; a boolean indicating if EOF is normal, i.e., the character |
| 717 | ;; data may be terminated by the EOF. EOF is normal |
| 718 | ;; while processing the main document. |
| 719 | ;; preserve-ws? |
| 720 | ;; a boolean indicating if we are within a whitespace-preserving |
| 721 | ;; environment. If #t, suppress paragraph detection. |
| 722 | ;; str-handler |
| 723 | ;; a STR-HANDLER, see read-verbatim-body |
| 724 | ;; seed |
| 725 | ;; an argument passed to the first invocation of STR-HANDLER. |
| 726 | ;; |
| 727 | ;; The procedure returns two results: SEED and TOKEN. The SEED is the |
| 728 | ;; result of the last invocation of STR-HANDLER, or the original seed if |
| 729 | ;; STR-HANDLER was never called. |
| 730 | ;; |
| 731 | ;; TOKEN can be either an eof-object (this can happen only if expect-eof? |
| 732 | ;; was #t), or a texinfo token denoting the start or end of a tag. |
| 733 | |
| 734 | ;; read-char-data port expect-eof? preserve-ws? str-handler seed |
| 735 | (define read-char-data |
| 736 | (let* ((end-chars-eof '(*eof* #\{ #\} #\@ #\newline))) |
| 737 | (define (handle str-handler str1 str2 seed) |
| 738 | (if (and (string-null? str1) (string-null? str2)) |
| 739 | seed |
| 740 | (str-handler str1 str2 seed))) |
| 741 | |
| 742 | (lambda (port expect-eof? preserve-ws? str-handler seed) |
| 743 | (let ((end-chars ((if expect-eof? identity cdr) end-chars-eof))) |
| 744 | (let loop ((seed seed)) |
| 745 | (let* ((fragment (next-token '() end-chars "reading char data" port)) |
| 746 | (term-char (peek-char port))) ; one of end-chars |
| 747 | (cond |
| 748 | ((eof-object? term-char) ; only if expect-eof? |
| 749 | (values (handle str-handler fragment "" seed) term-char)) |
| 750 | ((memq term-char '(#\@ #\{ #\})) |
| 751 | (values (handle str-handler fragment "" seed) |
| 752 | (case term-char |
| 753 | ((#\@) (read-command-token port)) |
| 754 | ((#\{) (make-token 'START '*braces*)) |
| 755 | ((#\}) (read-char port) (make-token 'END #f))))) |
| 756 | ((eq? term-char #\newline) |
| 757 | ;; Always significant, unless directly before an end token. |
| 758 | (let ((c (peek-next-char port))) |
| 759 | (cond |
| 760 | ((eof-object? c) |
| 761 | (or expect-eof? |
| 762 | (parser-error port "EOF while reading char data")) |
| 763 | (values (handle str-handler fragment "" seed) c)) |
| 764 | ((eq? c #\@) |
| 765 | (let* ((token (read-command-token port)) |
| 766 | (end? (eq? (token-kind token) 'END))) |
| 767 | (values |
| 768 | (handle str-handler fragment |
| 769 | (if end? "" (if preserve-ws? "\n" " ")) |
| 770 | seed) |
| 771 | token))) |
| 772 | ((and (not preserve-ws?) (eq? c #\newline)) |
| 773 | ;; paragraph-separator ::= #\newline #\newline+ |
| 774 | (skip-while '(#\newline) port) |
| 775 | (skip-horizontal-whitespace port) |
| 776 | (values (handle str-handler fragment "" seed) |
| 777 | (make-token 'PARA 'para))) |
| 778 | (else |
| 779 | (loop (handle str-handler fragment |
| 780 | (if preserve-ws? "\n" " ") seed))))))))))))) |
| 781 | |
| 782 | ; procedure+: assert-token TOKEN KIND NAME |
| 783 | ; Make sure that TOKEN is of anticipated KIND and has anticipated NAME |
| 784 | (define (assert-token token kind name) |
| 785 | (or (and (token? token) |
| 786 | (eq? kind (token-kind token)) |
| 787 | (equal? name (token-head token))) |
| 788 | (parser-error #f "Expecting @end for " name ", got " token))) |
| 789 | |
| 790 | ;;======================================================================== |
| 791 | ;; Highest-level parsers: Texinfo to SXML |
| 792 | |
| 793 | ;; These parsers are a set of syntactic forms to instantiate a SSAX |
| 794 | ;; parser. The user tells what to do with the parsed character and |
| 795 | ;; element data. These latter handlers determine if the parsing follows a |
| 796 | ;; SAX or a DOM model. |
| 797 | |
| 798 | ;; syntax: make-command-parser fdown fup str-handler |
| 799 | |
| 800 | ;; Create a parser to parse and process one element, including its |
| 801 | ;; character content or children elements. The parser is typically |
| 802 | ;; applied to the root element of a document. |
| 803 | |
| 804 | ;; fdown |
| 805 | ;; procedure COMMAND ARGUMENTS EXPECTED-CONTENT SEED |
| 806 | ;; |
| 807 | ;; This procedure is to generate the seed to be passed to handlers |
| 808 | ;; that process the content of the element. This is the function |
| 809 | ;; identified as 'fdown' in the denotational semantics of the XML |
| 810 | ;; parser given in the title comments to (sxml ssax). |
| 811 | ;; |
| 812 | ;; fup |
| 813 | ;; procedure COMMAND ARGUMENTS PARENT-SEED SEED |
| 814 | ;; |
| 815 | ;; This procedure is called when parsing of COMMAND is finished. |
| 816 | ;; The SEED is the result from the last content parser (or from |
| 817 | ;; fdown if the element has the empty content). PARENT-SEED is the |
| 818 | ;; same seed as was passed to fdown. The procedure is to generate a |
| 819 | ;; seed that will be the result of the element parser. This is the |
| 820 | ;; function identified as 'fup' in the denotational semantics of |
| 821 | ;; the XML parser given in the title comments to (sxml ssax). |
| 822 | ;; |
| 823 | ;; str-handler |
| 824 | ;; A STR-HANDLER, see read-verbatim-body |
| 825 | ;; |
| 826 | |
| 827 | ;; The generated parser is a |
| 828 | ;; procedure COMMAND PORT SEED |
| 829 | ;; |
| 830 | ;; The procedure must be called *after* the command token has been read. |
| 831 | |
| 832 | (define (read-include-file-name port) |
| 833 | (let ((x (string-trim-both (read-eof-line port)))) |
| 834 | (if (string-null? x) |
| 835 | (error "no file listed") |
| 836 | x))) ;; fixme: should expand @value{} references |
| 837 | |
| 838 | (define (sxml->node-name sxml) |
| 839 | "Turn some sxml string into a valid node name." |
| 840 | (let loop ((in (string->list (sxml->string sxml))) (out '())) |
| 841 | (if (null? in) |
| 842 | (apply string (reverse out)) |
| 843 | (if (memq (car in) '(#\{ #\} #\@ #\,)) |
| 844 | (loop (cdr in) out) |
| 845 | (loop (cdr in) (cons (car in) out)))))) |
| 846 | |
| 847 | (define (index command arguments fdown fup parent-seed) |
| 848 | (case command |
| 849 | ((deftp defcv defivar deftypeivar defop deftypeop defmethod |
| 850 | deftypemethod defopt defvr defvar deftypevr deftypevar deffn |
| 851 | deftypefn defspec defmac defun deftypefun) |
| 852 | (let ((args `((name ,(string-append (symbol->string command) "-" |
| 853 | (cadr (assq 'name arguments))))))) |
| 854 | (fup 'anchor args parent-seed |
| 855 | (fdown 'anchor args 'INLINE-ARGS '())))) |
| 856 | ((cindex findex vindex kindex pindex tindex) |
| 857 | (let ((args `((name ,(string-append (symbol->string command) "-" |
| 858 | (sxml->node-name |
| 859 | (assq 'entry arguments))))))) |
| 860 | (fup 'anchor args parent-seed |
| 861 | (fdown 'anchor args 'INLINE-ARGS '())))) |
| 862 | (else parent-seed))) |
| 863 | |
| 864 | (define (make-command-parser fdown fup str-handler) |
| 865 | (lambda (command port seed) |
| 866 | (let visit ((command command) (port port) (sig-ws? #f) (parent-seed seed)) |
| 867 | (let*-values (((command arguments expected-content) |
| 868 | (complete-start-command command port))) |
| 869 | (let* ((parent-seed (index command arguments fdown fup parent-seed)) |
| 870 | (seed (fdown command arguments expected-content parent-seed)) |
| 871 | (eof-closes? (or (memq command '(texinfo para *fragment*)) |
| 872 | (eq? expected-content 'EOL-TEXT))) |
| 873 | (sig-ws? (or sig-ws? (space-significant? command))) |
| 874 | (up (lambda (s) (fup command arguments parent-seed s))) |
| 875 | (new-para (lambda (s) (fdown 'para '() 'PARAGRAPH s))) |
| 876 | (make-end-para (lambda (p) (lambda (s) (fup 'para '() p s))))) |
| 877 | |
| 878 | (define (port-for-content) |
| 879 | (if (eq? expected-content 'EOL-TEXT) |
| 880 | (call-with-input-string (read-text-line port) identity) |
| 881 | port)) |
| 882 | |
| 883 | (cond |
| 884 | ((memq expected-content '(EMPTY-COMMAND INLINE-ARGS EOL-ARGS INDEX |
| 885 | EOL-TEXT-ARGS)) |
| 886 | ;; empty or finished by complete-start-command |
| 887 | (up seed)) |
| 888 | ((eq? command 'verbatim) |
| 889 | (up (read-verbatim-body port str-handler seed))) |
| 890 | (else |
| 891 | (let loop ((port (port-for-content)) |
| 892 | (expect-eof? eof-closes?) |
| 893 | (end-para identity) |
| 894 | (need-break? (and (not sig-ws?) |
| 895 | (memq expected-content |
| 896 | '(ENVIRON TABLE-ENVIRON |
| 897 | ENTRY ITEM FRAGMENT)))) |
| 898 | (seed seed)) |
| 899 | (cond |
| 900 | ((and need-break? (or sig-ws? (skip-whitespace port)) |
| 901 | (not (memq (peek-char port) '(#\@ #\}))) |
| 902 | (not (eof-object? (peek-char port)))) |
| 903 | ;; Even if we have an @, it might be inline -- check |
| 904 | ;; that later |
| 905 | (let ((seed (end-para seed))) |
| 906 | (loop port expect-eof? (make-end-para seed) #f |
| 907 | (new-para seed)))) |
| 908 | (else |
| 909 | (let*-values (((seed token) |
| 910 | (read-char-data |
| 911 | port expect-eof? sig-ws? str-handler seed))) |
| 912 | (cond |
| 913 | ((eof-object? token) |
| 914 | (case expect-eof? |
| 915 | ((include #f) (end-para seed)) |
| 916 | (else (up (end-para seed))))) |
| 917 | (else |
| 918 | (case (token-kind token) |
| 919 | ((STRING) |
| 920 | ;; this is only @-commands that escape |
| 921 | ;; characters: @}, @@, @{ -- new para if need-break |
| 922 | (let ((seed ((if need-break? end-para identity) seed))) |
| 923 | (loop port expect-eof? |
| 924 | (if need-break? (make-end-para seed) end-para) #f |
| 925 | (str-handler (token-head token) "" |
| 926 | ((if need-break? new-para identity) |
| 927 | seed))))) |
| 928 | ((END) |
| 929 | ;; The end will only have a name if it's for an |
| 930 | ;; environment |
| 931 | (cond |
| 932 | ((memq command '(item entry)) |
| 933 | (let ((spec (command-spec (token-head token)))) |
| 934 | (or (eq? (cadr spec) 'TABLE-ENVIRON) |
| 935 | (parser-error |
| 936 | port "@item not ended by @end table/enumerate/itemize" |
| 937 | token)))) |
| 938 | ((eq? expected-content 'ENVIRON) |
| 939 | (assert-token token 'END command))) |
| 940 | (up (end-para seed))) |
| 941 | ((ITEM) |
| 942 | (cond |
| 943 | ((memq command '(enumerate itemize)) |
| 944 | (up (visit 'item port sig-ws? (end-para seed)))) |
| 945 | ((eq? expected-content 'TABLE-ENVIRON) |
| 946 | (up (visit 'entry port sig-ws? (end-para seed)))) |
| 947 | ((memq command '(item entry)) |
| 948 | (visit command port sig-ws? (up (end-para seed)))) |
| 949 | (else |
| 950 | (parser-error |
| 951 | port "@item must be within a table environment" |
| 952 | command)))) |
| 953 | ((PARA) |
| 954 | ;; examine valid paragraphs? |
| 955 | (loop port expect-eof? end-para (not sig-ws?) seed)) |
| 956 | ((INCLUDE) |
| 957 | ;; Recurse for include files |
| 958 | (let ((seed (call-with-file-and-dir |
| 959 | (read-include-file-name port) |
| 960 | (lambda (port) |
| 961 | (loop port 'include end-para |
| 962 | need-break? seed))))) |
| 963 | (loop port expect-eof? end-para need-break? seed))) |
| 964 | ((START) ; Start of an @-command |
| 965 | (let* ((head (token-head token)) |
| 966 | (spec (command-spec head)) |
| 967 | (head (car spec)) |
| 968 | (type (cadr spec)) |
| 969 | (inline? (inline-content? type)) |
| 970 | (seed ((if (and inline? (not need-break?)) |
| 971 | identity end-para) seed)) |
| 972 | (end-para (if inline? |
| 973 | (if need-break? (make-end-para seed) |
| 974 | end-para) |
| 975 | identity)) |
| 976 | (new-para (if (and inline? need-break?) |
| 977 | new-para identity))) |
| 978 | (loop port expect-eof? end-para (not inline?) |
| 979 | (visit head port sig-ws? (new-para seed))))) |
| 980 | (else |
| 981 | (parser-error port "Unknown token type" token)))))))))))))))) |
| 982 | |
| 983 | ;; procedure: reverse-collect-str-drop-ws fragments |
| 984 | ;; |
| 985 | ;; Given the list of fragments (some of which are text strings), reverse |
| 986 | ;; the list and concatenate adjacent text strings. We also drop |
| 987 | ;; "unsignificant" whitespace, that is, whitespace in front, behind and |
| 988 | ;; between elements. The whitespace that is included in character data |
| 989 | ;; is not affected. |
| 990 | (define (reverse-collect-str-drop-ws fragments) |
| 991 | (cond |
| 992 | ((null? fragments) ; a shortcut |
| 993 | '()) |
| 994 | ((and (string? (car fragments)) ; another shortcut |
| 995 | (null? (cdr fragments)) ; remove single ws-only string |
| 996 | (string-whitespace? (car fragments))) |
| 997 | '()) |
| 998 | (else |
| 999 | (let loop ((fragments fragments) (result '()) (strs '()) |
| 1000 | (all-whitespace? #t)) |
| 1001 | (cond |
| 1002 | ((null? fragments) |
| 1003 | (if all-whitespace? |
| 1004 | result ; remove leading ws |
| 1005 | (cons (apply string-append strs) result))) |
| 1006 | ((string? (car fragments)) |
| 1007 | (loop (cdr fragments) result (cons (car fragments) strs) |
| 1008 | (and all-whitespace? |
| 1009 | (string-whitespace? (car fragments))))) |
| 1010 | (else |
| 1011 | (loop (cdr fragments) |
| 1012 | (cons |
| 1013 | (car fragments) |
| 1014 | (cond |
| 1015 | ((null? strs) result) |
| 1016 | (all-whitespace? |
| 1017 | (if (null? result) |
| 1018 | result ; remove trailing whitespace |
| 1019 | (cons " " result))); replace interstitial ws with |
| 1020 | ; one space |
| 1021 | (else |
| 1022 | (cons (apply string-append strs) result)))) |
| 1023 | '() #t))))))) |
| 1024 | |
| 1025 | (define (parse-inline-text-args port spec text) |
| 1026 | (let lp ((in text) (cur '()) (out '())) |
| 1027 | (cond |
| 1028 | ((null? in) |
| 1029 | (if (and (pair? cur) |
| 1030 | (string? (car cur)) |
| 1031 | (string-whitespace? (car cur))) |
| 1032 | (lp in (cdr cur) out) |
| 1033 | (let ((args (reverse (if (null? cur) |
| 1034 | out |
| 1035 | (cons (reverse cur) out))))) |
| 1036 | (arguments->attlist port args (cddr spec))))) |
| 1037 | ((pair? (car in)) |
| 1038 | (lp (cdr in) (cons (car in) cur) out)) |
| 1039 | ((string-index (car in) #\,) |
| 1040 | (let* ((parts (string-split (car in) #\,)) |
| 1041 | (head (string-trim-right (car parts))) |
| 1042 | (rev-tail (reverse (cdr parts))) |
| 1043 | (last (string-trim (car rev-tail)))) |
| 1044 | (lp (cdr in) |
| 1045 | (if (string-null? last) cur (cons last cur)) |
| 1046 | (append (cdr rev-tail) |
| 1047 | (cons (reverse (if (string-null? head) cur (cons head cur))) |
| 1048 | out))))) |
| 1049 | (else |
| 1050 | (lp (cdr in) |
| 1051 | (cons (if (null? cur) (string-trim (car in)) (car in)) cur) |
| 1052 | out))))) |
| 1053 | |
| 1054 | (define (make-dom-parser) |
| 1055 | (make-command-parser |
| 1056 | (lambda (command args content seed) ; fdown |
| 1057 | '()) |
| 1058 | (lambda (command args parent-seed seed) ; fup |
| 1059 | (let* ((seed (reverse-collect-str-drop-ws seed)) |
| 1060 | (spec (command-spec command)) |
| 1061 | (command (car spec))) |
| 1062 | (if (eq? (cadr spec) 'INLINE-TEXT-ARGS) |
| 1063 | (cons (list command (cons '% (parse-inline-text-args #f spec seed))) |
| 1064 | parent-seed) |
| 1065 | (acons command |
| 1066 | (if (null? args) seed (acons '% args seed)) |
| 1067 | parent-seed)))) |
| 1068 | (lambda (string1 string2 seed) ; str-handler |
| 1069 | (if (string-null? string2) |
| 1070 | (cons string1 seed) |
| 1071 | (cons* string2 string1 seed))))) |
| 1072 | |
| 1073 | (define parse-environment-args |
| 1074 | (let ((parser (make-dom-parser))) |
| 1075 | ;; duplicate arguments->attlist to avoid unnecessary splitting |
| 1076 | (lambda (command port) |
| 1077 | (let* ((args (cdar (parser '*ENVIRON-ARGS* port '()))) |
| 1078 | (spec (command-spec command)) |
| 1079 | (command (car spec)) |
| 1080 | (arg-names (cddr spec))) |
| 1081 | (cond |
| 1082 | ((not arg-names) |
| 1083 | (if (null? args) '() |
| 1084 | (parser-error port "@-command doesn't take args" command))) |
| 1085 | ((eq? arg-names #t) |
| 1086 | (list (cons 'arguments args))) |
| 1087 | (else |
| 1088 | (let loop ((args args) (arg-names arg-names) (out '())) |
| 1089 | (cond |
| 1090 | ((null? arg-names) |
| 1091 | (if (null? args) (reverse! out) |
| 1092 | (parser-error port "@-command didn't expect more args" |
| 1093 | command args))) |
| 1094 | ((symbol? arg-names) |
| 1095 | (reverse! (acons arg-names args out))) |
| 1096 | ((null? args) |
| 1097 | (parser-error port "@-command expects more args" |
| 1098 | command arg-names)) |
| 1099 | ((and (string? (car args)) (string-index (car args) #\space)) |
| 1100 | => (lambda (i) |
| 1101 | (let ((rest (substring/shared (car args) (1+ i)))) |
| 1102 | (if (zero? i) |
| 1103 | (loop (cons rest (cdr args)) arg-names out) |
| 1104 | (loop (cons rest (cdr args)) (cdr arg-names) |
| 1105 | (cons (list (car arg-names) |
| 1106 | (substring (car args) 0 i)) |
| 1107 | out)))))) |
| 1108 | (else |
| 1109 | (loop (cdr args) (cdr arg-names) |
| 1110 | (if (and (pair? (car args)) (eq? (caar args) '*braces*)) |
| 1111 | (acons (car arg-names) (cdar args) out) |
| 1112 | (cons (list (car arg-names) (car args)) out)))))))))))) |
| 1113 | |
| 1114 | (define (parse-eol-text-args command port) |
| 1115 | ;; perhaps parse-environment-args should be named more |
| 1116 | ;; generically. |
| 1117 | (parse-environment-args command port)) |
| 1118 | |
| 1119 | ;; procedure: texi-fragment->stexi STRING |
| 1120 | ;; |
| 1121 | ;; A DOM parser for a texinfo fragment STRING. |
| 1122 | ;; |
| 1123 | ;; The procedure returns an SXML tree headed by the special tag, |
| 1124 | ;; *fragment*. |
| 1125 | |
| 1126 | (define (texi-fragment->stexi string-or-port) |
| 1127 | "Parse the texinfo commands in @var{string-or-port}, and return the |
| 1128 | resultant stexi tree. The head of the tree will be the special command, |
| 1129 | @code{*fragment*}." |
| 1130 | (define (parse port) |
| 1131 | (postprocess (car ((make-dom-parser) '*fragment* port '())))) |
| 1132 | (if (input-port? string-or-port) |
| 1133 | (parse string-or-port) |
| 1134 | (call-with-input-string string-or-port parse))) |
| 1135 | |
| 1136 | ;; procedure: texi->stexi PORT |
| 1137 | ;; |
| 1138 | ;; This is an instance of a SSAX parser above that returns an SXML |
| 1139 | ;; representation of the texinfo document ready to be read at PORT. |
| 1140 | ;; |
| 1141 | ;; The procedure returns an SXML tree. The port points to the |
| 1142 | ;; first character after the @bye, or to the end of the file. |
| 1143 | |
| 1144 | (define (texi->stexi port) |
| 1145 | "Read a full texinfo document from @var{port} and return the parsed |
| 1146 | stexi tree. The parsing will start at the @code{@@settitle} and end at |
| 1147 | @code{@@bye} or EOF." |
| 1148 | (let ((parser (make-dom-parser))) |
| 1149 | (take-until-settitle port) |
| 1150 | (postprocess (car (parser 'texinfo port '()))))) |
| 1151 | |
| 1152 | (define (car-eq? x y) (and (pair? x) (eq? (car x) y))) |
| 1153 | (define (make-contents tree) |
| 1154 | (define (lp in out depth) |
| 1155 | (cond |
| 1156 | ((null? in) (values in (cons 'enumerate (reverse! out)))) |
| 1157 | ((and (pair? (cdr in)) (texi-command-depth (caadr in) 4)) |
| 1158 | => (lambda (new-depth) |
| 1159 | (let ((node-name (and (car-eq? (car in) 'node) |
| 1160 | (cadr (assq 'name (cdadar in)))))) |
| 1161 | (cond |
| 1162 | ((< new-depth depth) |
| 1163 | (values in (cons 'enumerate (reverse! out)))) |
| 1164 | ((> new-depth depth) |
| 1165 | (let ((out-cdr (if (null? out) '() (cdr out))) |
| 1166 | (out-car (if (null? out) (list 'item) (car out)))) |
| 1167 | (let*-values (((new-in new-out) (lp in '() (1+ depth)))) |
| 1168 | (lp new-in |
| 1169 | (cons (append out-car (list new-out)) out-cdr) |
| 1170 | depth)))) |
| 1171 | (else ;; same depth |
| 1172 | (lp (cddr in) |
| 1173 | (cons |
| 1174 | `(item (para |
| 1175 | ,@(if node-name |
| 1176 | `((ref (% (node ,node-name)))) |
| 1177 | (cdadr in)))) |
| 1178 | out) |
| 1179 | depth)))))) |
| 1180 | (else (lp (cdr in) out depth)))) |
| 1181 | (let*-values (((_ contents) (lp tree '() 1))) |
| 1182 | `((chapheading "Table of Contents") ,contents))) |
| 1183 | |
| 1184 | (define (trim-whitespace str trim-left? trim-right?) |
| 1185 | (let* ((left-space? (and (not trim-left?) |
| 1186 | (string-prefix? " " str))) |
| 1187 | (right-space? (and (not trim-right?) |
| 1188 | (string-suffix? " " str))) |
| 1189 | (tail (append! (string-tokenize str) |
| 1190 | (if right-space? '("") '())))) |
| 1191 | (string-join (if left-space? (cons "" tail) tail)))) |
| 1192 | |
| 1193 | (define (postprocess tree) |
| 1194 | (define (loop in out state first? sig-ws?) |
| 1195 | (cond |
| 1196 | ((null? in) |
| 1197 | (values (reverse! out) state)) |
| 1198 | ((string? (car in)) |
| 1199 | (loop (cdr in) |
| 1200 | (cons (if sig-ws? (car in) |
| 1201 | (trim-whitespace (car in) first? (null? (cdr in)))) |
| 1202 | out) |
| 1203 | state #f sig-ws?)) |
| 1204 | ((pair? (car in)) |
| 1205 | (case (caar in) |
| 1206 | ((set) |
| 1207 | (if (null? (cdar in)) (error "@set missing arguments" in)) |
| 1208 | (if (string? (cadar in)) |
| 1209 | (let ((i (string-index (cadar in) #\space))) |
| 1210 | (if i |
| 1211 | (loop (cdr in) out |
| 1212 | (acons (substring (cadar in) 0 i) |
| 1213 | (cons (substring (cadar in) (1+ i)) (cddar in)) |
| 1214 | state) |
| 1215 | #f sig-ws?) |
| 1216 | (loop (cdr in) out (acons (cadar in) (cddar in) state) |
| 1217 | #f sig-ws?))) |
| 1218 | (error "expected a constant to define for @set" in))) |
| 1219 | ((value) |
| 1220 | (loop (fold-right cons (cdr in) |
| 1221 | (or (and=> |
| 1222 | (assoc (cadr (assq 'key (cdadar in))) state) cdr) |
| 1223 | (error "unknown value" (cdadar in) state))) |
| 1224 | out |
| 1225 | state #f sig-ws?)) |
| 1226 | ((copying) |
| 1227 | (loop (cdr in) out (cons (car in) state) #f sig-ws?)) |
| 1228 | ((insertcopying) |
| 1229 | (loop (fold-right cons (cdr in) |
| 1230 | (or (cdr (assoc 'copying state)) |
| 1231 | (error "copying isn't set yet"))) |
| 1232 | out |
| 1233 | state #f sig-ws?)) |
| 1234 | ((contents) |
| 1235 | (loop (cdr in) (fold cons out (make-contents tree)) state #f sig-ws?)) |
| 1236 | (else |
| 1237 | (let*-values (((kid-out state) |
| 1238 | (loop (car in) '() state #t |
| 1239 | (or sig-ws? (space-significant? (caar in)))))) |
| 1240 | (loop (cdr in) (cons kid-out out) state #f sig-ws?))))) |
| 1241 | (else ; a symbol |
| 1242 | (loop (cdr in) (cons (car in) out) state #t sig-ws?)))) |
| 1243 | |
| 1244 | (call-with-values |
| 1245 | (lambda () (loop tree '() '() #t #f)) |
| 1246 | (lambda (out state) out))) |
| 1247 | |
| 1248 | ;; Replace % with texinfo-arguments. |
| 1249 | (define (stexi->sxml tree) |
| 1250 | "Transform the stexi tree @var{tree} into sxml. This involves |
| 1251 | replacing the @code{%} element that keeps the texinfo arguments with an |
| 1252 | element for each argument. |
| 1253 | |
| 1254 | FIXME: right now it just changes % to @code{texinfo-arguments} -- that |
| 1255 | doesn't hang with the idea of making a dtd at some point" |
| 1256 | (pre-post-order |
| 1257 | tree |
| 1258 | `((% . ,(lambda (x . t) (cons 'texinfo-arguments t))) |
| 1259 | (*text* . ,(lambda (x t) t)) |
| 1260 | (*default* . ,(lambda (x . t) (cons x t)))))) |
| 1261 | |
| 1262 | ;;; arch-tag: 73890afa-597c-4264-ae70-46fe7756ffb5 |
| 1263 | ;;; texinfo.scm ends here |