Commit | Line | Data |
---|---|---|
47f3ce52 AW |
1 | ;;;; (sxml ssax) -- the SSAX parser |
2 | ;;;; | |
2b6fcf5b | 3 | ;;;; Copyright (C) 2009, 2010,2012,2013 Free Software Foundation, Inc. |
47f3ce52 AW |
4 | ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. |
5 | ;;;; Written 2001,2002,2003,2004 by Oleg Kiselyov <oleg at pobox dot com> as SSAX.scm. | |
6 | ;;;; | |
7 | ;;;; This library is free software; you can redistribute it and/or | |
8 | ;;;; modify it under the terms of the GNU Lesser General Public | |
9 | ;;;; License as published by the Free Software Foundation; either | |
10 | ;;;; version 3 of the License, or (at your option) any later version. | |
11 | ;;;; | |
12 | ;;;; This library is distributed in the hope that it will be useful, | |
13 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;;;; Lesser General Public License for more details. | |
16 | ;;;; | |
17 | ;;;; You should have received a copy of the GNU Lesser General Public | |
18 | ;;;; License along with this library; if not, write to the Free Software | |
19 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;;;; | |
21 | \f | |
22 | ;;; Commentary: | |
23 | ;; | |
24 | ;@subheading Functional XML parsing framework | |
25 | ;@subsubheading SAX/DOM and SXML parsers with support for XML Namespaces and validation | |
26 | ; | |
27 | ; This is a package of low-to-high level lexing and parsing procedures | |
28 | ; that can be combined to yield a SAX, a DOM, a validating parser, or | |
29 | ; a parser intended for a particular document type. The procedures in | |
30 | ; the package can be used separately to tokenize or parse various | |
31 | ; pieces of XML documents. The package supports XML Namespaces, | |
32 | ; internal and external parsed entities, user-controlled handling of | |
33 | ; whitespace, and validation. This module therefore is intended to be | |
34 | ; a framework, a set of "Lego blocks" you can use to build a parser | |
35 | ; following any discipline and performing validation to any degree. As | |
36 | ; an example of the parser construction, this file includes a | |
37 | ; semi-validating SXML parser. | |
38 | ||
39 | ; The present XML framework has a "sequential" feel of SAX yet a | |
40 | ; "functional style" of DOM. Like a SAX parser, the framework scans the | |
41 | ; document only once and permits incremental processing. An application | |
42 | ; that handles document elements in order can run as efficiently as | |
43 | ; possible. @emph{Unlike} a SAX parser, the framework does not require | |
44 | ; an application register stateful callbacks and surrender control to | |
45 | ; the parser. Rather, it is the application that can drive the framework | |
46 | ; -- calling its functions to get the current lexical or syntax element. | |
47 | ; These functions do not maintain or mutate any state save the input | |
48 | ; port. Therefore, the framework permits parsing of XML in a pure | |
49 | ; functional style, with the input port being a monad (or a linear, | |
50 | ; read-once parameter). | |
51 | ||
52 | ; Besides the @var{port}, there is another monad -- @var{seed}. Most of | |
53 | ; the middle- and high-level parsers are single-threaded through the | |
54 | ; @var{seed}. The functions of this framework do not process or affect | |
55 | ; the @var{seed} in any way: they simply pass it around as an instance | |
56 | ; of an opaque datatype. User functions, on the other hand, can use the | |
57 | ; seed to maintain user's state, to accumulate parsing results, etc. A | |
58 | ; user can freely mix his own functions with those of the framework. On | |
59 | ; the other hand, the user may wish to instantiate a high-level parser: | |
60 | ; @code{SSAX:make-elem-parser} or @code{SSAX:make-parser}. In the latter | |
61 | ; case, the user must provide functions of specific signatures, which | |
62 | ; are called at predictable moments during the parsing: to handle | |
63 | ; character data, element data, or processing instructions (PI). The | |
64 | ; functions are always given the @var{seed}, among other parameters, and | |
65 | ; must return the new @var{seed}. | |
66 | ||
67 | ; From a functional point of view, XML parsing is a combined | |
68 | ; pre-post-order traversal of a "tree" that is the XML document | |
69 | ; itself. This down-and-up traversal tells the user about an element | |
70 | ; when its start tag is encountered. The user is notified about the | |
71 | ; element once more, after all element's children have been | |
72 | ; handled. The process of XML parsing therefore is a fold over the | |
73 | ; raw XML document. Unlike a fold over trees defined in [1], the | |
74 | ; parser is necessarily single-threaded -- obviously as elements | |
75 | ; in a text XML document are laid down sequentially. The parser | |
76 | ; therefore is a tree fold that has been transformed to accept an | |
77 | ; accumulating parameter [1,2]. | |
78 | ||
79 | ; Formally, the denotational semantics of the parser can be expressed | |
80 | ; as | |
81 | ;@smallexample | |
82 | ; parser:: (Start-tag -> Seed -> Seed) -> | |
83 | ; (Start-tag -> Seed -> Seed -> Seed) -> | |
84 | ; (Char-Data -> Seed -> Seed) -> | |
85 | ; XML-text-fragment -> Seed -> Seed | |
86 | ; parser fdown fup fchar "<elem attrs> content </elem>" seed | |
87 | ; = fup "<elem attrs>" seed | |
88 | ; (parser fdown fup fchar "content" (fdown "<elem attrs>" seed)) | |
89 | ; | |
90 | ; parser fdown fup fchar "char-data content" seed | |
91 | ; = parser fdown fup fchar "content" (fchar "char-data" seed) | |
92 | ; | |
93 | ; parser fdown fup fchar "elem-content content" seed | |
94 | ; = parser fdown fup fchar "content" ( | |
95 | ; parser fdown fup fchar "elem-content" seed) | |
96 | ;@end smallexample | |
97 | ||
98 | ; Compare the last two equations with the left fold | |
99 | ;@smallexample | |
100 | ; fold-left kons elem:list seed = fold-left kons list (kons elem seed) | |
101 | ;@end smallexample | |
102 | ||
103 | ; The real parser created by @code{SSAX:make-parser} is slightly more | |
104 | ; complicated, to account for processing instructions, entity | |
105 | ; references, namespaces, processing of document type declaration, etc. | |
106 | ||
107 | ||
108 | ; The XML standard document referred to in this module is | |
109 | ; @uref{http://www.w3.org/TR/1998/REC-xml-19980210.html} | |
110 | ; | |
111 | ; The present file also defines a procedure that parses the text of an | |
112 | ; XML document or of a separate element into SXML, an S-expression-based | |
113 | ; model of an XML Information Set. SXML is also an Abstract Syntax Tree | |
114 | ; of an XML document. SXML is similar but not identical to DOM; SXML is | |
115 | ; particularly suitable for Scheme-based XML/HTML authoring, SXPath | |
116 | ; queries, and tree transformations. See SXML.html for more details. | |
117 | ; SXML is a term implementation of evaluation of the XML document [3]. | |
118 | ; The other implementation is context-passing. | |
119 | ||
120 | ; The present frameworks fully supports the XML Namespaces Recommendation: | |
121 | ; @uref{http://www.w3.org/TR/REC-xml-names/} | |
122 | ; Other links: | |
123 | ;@table @asis | |
124 | ;@item [1] | |
125 | ; Jeremy Gibbons, Geraint Jones, "The Under-appreciated Unfold," | |
126 | ; Proc. ICFP'98, 1998, pp. 273-279. | |
127 | ;@item [2] | |
128 | ; Richard S. Bird, The promotion and accumulation strategies in | |
129 | ; transformational programming, ACM Trans. Progr. Lang. Systems, | |
130 | ; 6(4):487-504, October 1984. | |
131 | ;@item [3] | |
132 | ; Ralf Hinze, "Deriving Backtracking Monad Transformers," | |
133 | ; Functional Pearl. Proc ICFP'00, pp. 186-197. | |
134 | ;@end table | |
135 | ;; | |
136 | ;;; Code: | |
137 | ||
138 | (define-module (sxml ssax) | |
139 | #:use-module (sxml ssax input-parse) | |
140 | #:use-module (srfi srfi-1) | |
141 | #:use-module (srfi srfi-13) | |
142 | ||
143 | #:export (current-ssax-error-port | |
144 | with-ssax-error-to-port | |
145 | xml-token? xml-token-kind xml-token-head | |
146 | make-empty-attlist attlist-add | |
147 | attlist-null? | |
148 | attlist-remove-top | |
149 | attlist->alist attlist-fold | |
4aaa0650 AW |
150 | define-parsed-entity! |
151 | reset-parsed-entity-definitions! | |
47f3ce52 AW |
152 | ssax:uri-string->symbol |
153 | ssax:skip-internal-dtd | |
154 | ssax:read-pi-body-as-string | |
155 | ssax:reverse-collect-str-drop-ws | |
156 | ssax:read-markup-token | |
157 | ssax:read-cdata-body | |
158 | ssax:read-char-ref | |
159 | ssax:read-attributes | |
160 | ssax:complete-start-tag | |
161 | ssax:read-external-id | |
162 | ssax:read-char-data | |
163 | ssax:xml->sxml | |
164 | ssax:make-parser | |
165 | ssax:make-pi-parser | |
166 | ssax:make-elem-parser)) | |
167 | ||
168 | (define (parser-error port message . rest) | |
169 | (apply throw 'parser-error port message rest)) | |
170 | (define ascii->char integer->char) | |
171 | (define char->ascii char->integer) | |
172 | ||
2b6fcf5b AW |
173 | (define current-ssax-error-port |
174 | (make-parameter (current-error-port))) | |
175 | ||
176 | (define *current-ssax-error-port* | |
177 | (parameter-fluid current-ssax-error-port)) | |
47f3ce52 AW |
178 | |
179 | (define (with-ssax-error-to-port port thunk) | |
2b6fcf5b | 180 | (parameterize ((current-ssax-error-port port)) |
47f3ce52 AW |
181 | (thunk))) |
182 | ||
7e0f26eb AW |
183 | (define (ssax:warn port . args) |
184 | (with-output-to-port (current-ssax-error-port) | |
1260fd0b AW |
185 | (lambda () |
186 | (display ";;; SSAX warning: ") | |
187 | (for-each display args) | |
188 | (newline)))) | |
47f3ce52 AW |
189 | |
190 | (define (ucscode->string codepoint) | |
191 | (string (integer->char codepoint))) | |
192 | ||
193 | (define char-newline #\newline) | |
194 | (define char-return #\return) | |
195 | (define char-tab #\tab) | |
196 | (define nl "\n") | |
197 | ||
4aaa0650 AW |
198 | ;; This isn't a great API, but a more proper fix will involve hacking |
199 | ;; SSAX. | |
200 | (define (reset-parsed-entity-definitions!) | |
201 | "Restore the set of parsed entity definitions to its initial state." | |
202 | (set! ssax:predefined-parsed-entities | |
203 | '((amp . "&") | |
204 | (lt . "<") | |
205 | (gt . ">") | |
206 | (apos . "'") | |
207 | (quot . "\"")))) | |
208 | ||
209 | (define (define-parsed-entity! entity str) | |
210 | "Define a new parsed entity. @var{entity} should be a symbol. | |
211 | ||
212 | Instances of &@var{entity}; in XML text will be replaced with the | |
213 | string @var{str}, which will then be parsed." | |
214 | (set! ssax:predefined-parsed-entities | |
215 | (acons entity str ssax:predefined-parsed-entities))) | |
216 | ||
47f3ce52 AW |
217 | ;; Execute a sequence of forms and return the result of the _first_ one. |
218 | ;; Like PROG1 in Lisp. Typically used to evaluate one or more forms with | |
219 | ;; side effects and return a value that must be computed before some or | |
220 | ;; all of the side effects happen. | |
221 | (define-syntax begin0 | |
222 | (syntax-rules () | |
223 | ((begin0 form form1 ... ) | |
224 | (let ((val form)) form1 ... val)))) | |
225 | ||
226 | ; Like let* but allowing for multiple-value bindings | |
227 | (define-syntax let*-values | |
228 | (syntax-rules () | |
229 | ((let*-values () . bodies) (begin . bodies)) | |
230 | ((let*-values (((var) initializer) . rest) . bodies) | |
231 | (let ((var initializer)) ; a single var optimization | |
232 | (let*-values rest . bodies))) | |
233 | ((let*-values ((vars initializer) . rest) . bodies) | |
234 | (call-with-values (lambda () initializer) ; the most generic case | |
235 | (lambda vars (let*-values rest . bodies)))))) | |
236 | ||
237 | ;; needed for some dumb reason | |
238 | (define inc 1+) | |
239 | (define dec 1-) | |
240 | ||
241 | (define-syntax include-from-path/filtered | |
242 | (lambda (x) | |
243 | (define (read-filtered accept-list file) | |
244 | (with-input-from-file (%search-load-path file) | |
245 | (lambda () | |
246 | (let loop ((sexp (read)) (out '())) | |
247 | (cond | |
248 | ((eof-object? sexp) (reverse out)) | |
249 | ((and (pair? sexp) (memq (car sexp) accept-list)) | |
250 | (loop (read) (cons sexp out))) | |
251 | (else | |
252 | (loop (read) out))))))) | |
253 | (syntax-case x () | |
254 | ((_ accept-list file) | |
255 | (with-syntax (((exp ...) (datum->syntax | |
256 | x | |
257 | (read-filtered | |
258 | (syntax->datum #'accept-list) | |
259 | (syntax->datum #'file))))) | |
260 | #'(begin exp ...)))))) | |
261 | ||
262 | (include-from-path "sxml/upstream/assert.scm") | |
263 | (include-from-path/filtered | |
264 | (define define-syntax ssax:define-labeled-arg-macro) | |
265 | "sxml/upstream/SSAX.scm") |