Commit | Line | Data |
---|---|---|
47f3ce52 AW |
1 | ;;;; (sxml simple) -- a simple interface to the SSAX parser |
2 | ;;;; | |
1488753a | 3 | ;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc. |
47f3ce52 AW |
4 | ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>. |
5 | ;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm. | |
6 | ;;;; | |
7 | ;;;; This library is free software; you can redistribute it and/or | |
8 | ;;;; modify it under the terms of the GNU Lesser General Public | |
9 | ;;;; License as published by the Free Software Foundation; either | |
10 | ;;;; version 3 of the License, or (at your option) any later version. | |
11 | ;;;; | |
12 | ;;;; This library is distributed in the hope that it will be useful, | |
13 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;;;; Lesser General Public License for more details. | |
16 | ;;;; | |
17 | ;;;; You should have received a copy of the GNU Lesser General Public | |
18 | ;;;; License along with this library; if not, write to the Free Software | |
19 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;;;; | |
21 | \f | |
22 | ;;; Commentary: | |
23 | ;; | |
24 | ;;A simple interface to XML parsing and serialization. | |
25 | ;; | |
26 | ;;; Code: | |
27 | ||
28 | (define-module (sxml simple) | |
e10c2509 | 29 | #:use-module (sxml ssax input-parse) |
47f3ce52 AW |
30 | #:use-module (sxml ssax) |
31 | #:use-module (sxml transform) | |
1488753a | 32 | #:use-module (ice-9 match) |
47f3ce52 | 33 | #:use-module (srfi srfi-13) |
d9fff48e | 34 | #:export (xml->sxml sxml->xml sxml->string)) |
47f3ce52 | 35 | |
1488753a AW |
36 | ;; Helpers from upstream/SSAX.scm. |
37 | ;; | |
38 | ||
1488753a AW |
39 | ; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS |
40 | ; given the list of fragments (some of which are text strings) | |
41 | ; reverse the list and concatenate adjacent text strings. | |
42 | ; We can prove from the general case below that if LIST-OF-FRAGS | |
43 | ; has zero or one element, the result of the procedure is equal? | |
44 | ; to its argument. This fact justifies the shortcut evaluation below. | |
45 | (define (ssax:reverse-collect-str fragments) | |
46 | (cond | |
47 | ((null? fragments) '()) ; a shortcut | |
48 | ((null? (cdr fragments)) fragments) ; see the comment above | |
49 | (else | |
50 | (let loop ((fragments fragments) (result '()) (strs '())) | |
51 | (cond | |
52 | ((null? fragments) | |
53 | (if (null? strs) result | |
54 | (cons (string-concatenate/shared strs) result))) | |
55 | ((string? (car fragments)) | |
56 | (loop (cdr fragments) result (cons (car fragments) strs))) | |
57 | (else | |
58 | (loop (cdr fragments) | |
59 | (cons | |
60 | (car fragments) | |
61 | (if (null? strs) result | |
62 | (cons (string-concatenate/shared strs) result))) | |
63 | '()))))))) | |
64 | ||
e10c2509 AW |
65 | (define (read-internal-doctype-as-string port) |
66 | (string-concatenate/shared | |
67 | (let loop () | |
68 | (let ((fragment | |
69 | (next-token '() '(#\]) "reading internal DOCTYPE" port))) | |
70 | (if (eqv? #\> (peek-next-char port)) | |
71 | (begin | |
72 | (read-char port) | |
73 | (cons fragment '())) | |
74 | (cons* fragment "]" (loop))))))) | |
75 | ||
1488753a AW |
76 | ;; Ideas for the future for this interface: |
77 | ;; | |
78 | ;; * Allow doctypes to provide parsed entities | |
79 | ;; | |
80 | ;; * Allow validation (the ELEMENTS value from the DOCTYPE handler | |
81 | ;; below) | |
82 | ;; | |
83 | ;; * Parse internal DTDs | |
84 | ;; | |
85 | ;; * Parse external DTDs | |
86 | ;; | |
a14b6e18 | 87 | (define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key |
1488753a AW |
88 | (namespaces '()) |
89 | (declare-namespaces? #t) | |
90 | (trim-whitespace? #f) | |
91 | (entities '()) | |
e10c2509 AW |
92 | (default-entity-handler #f) |
93 | (doctype-handler #f)) | |
47f3ce52 | 94 | "Use SSAX to parse an XML document into SXML. Takes one optional |
a14b6e18 AW |
95 | argument, @var{string-or-port}, which defaults to the current input |
96 | port." | |
1488753a AW |
97 | ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix |
98 | ;; that the user wants on elements of a given namespace in the | |
99 | ;; resulting SXML, regardless of the abbreviated namespaces defined in | |
100 | ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true, | |
101 | ;; these namespaces are treated as if they were declared in the DTD. | |
102 | ||
103 | ;; ENTITIES: alist of SYMBOL -> STRING. | |
104 | ||
105 | ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)). | |
106 | ;; A DOC-PREFIX of #f indicates that it comes from the user. | |
107 | ;; Otherwise, prefixes are symbols. | |
e10c2509 | 108 | (define (munge-namespaces namespaces) |
1488753a AW |
109 | (map (lambda (el) |
110 | (match el | |
111 | ((prefix . uri-string) | |
112 | (cons* (and declare-namespaces? prefix) | |
113 | prefix | |
114 | (ssax:uri-string->symbol uri-string))))) | |
115 | namespaces)) | |
116 | ||
e10c2509 AW |
117 | (define (user-namespaces) |
118 | (munge-namespaces namespaces)) | |
119 | ||
1488753a AW |
120 | (define (user-entities) |
121 | (if (and default-entity-handler | |
122 | (not (assq '*DEFAULT* entities))) | |
123 | (acons '*DEFAULT* default-entity-handler entities) | |
124 | entities)) | |
125 | ||
126 | (define (name->sxml name) | |
127 | (match name | |
128 | ((prefix . local-part) | |
129 | (symbol-append prefix (string->symbol ":") local-part)) | |
130 | (_ name))) | |
131 | ||
e10c2509 AW |
132 | (define (doctype-continuation seed) |
133 | (lambda* (#:key (entities '()) (namespaces '())) | |
134 | (values #f | |
135 | (append entities (user-entities)) | |
136 | (append (munge-namespaces namespaces) (user-namespaces)) | |
137 | seed))) | |
138 | ||
1488753a AW |
139 | ;; The SEED in this parser is the SXML: initialized to '() at each new |
140 | ;; level by the fdown handlers; built in reverse by the fhere parsers; | |
141 | ;; and reverse-collected by the fup handlers. | |
142 | (define parser | |
143 | (ssax:make-parser | |
144 | NEW-LEVEL-SEED ; fdown | |
145 | (lambda (elem-gi attributes namespaces expected-content seed) | |
146 | '()) | |
147 | ||
148 | FINISH-ELEMENT ; fup | |
149 | (lambda (elem-gi attributes namespaces parent-seed seed) | |
150 | (let ((seed (if trim-whitespace? | |
151 | (ssax:reverse-collect-str-drop-ws seed) | |
152 | (ssax:reverse-collect-str seed))) | |
153 | (attrs (attlist-fold | |
154 | (lambda (attr accum) | |
155 | (cons (list (name->sxml (car attr)) (cdr attr)) | |
156 | accum)) | |
157 | '() attributes))) | |
158 | (acons (name->sxml elem-gi) | |
159 | (if (null? attrs) | |
160 | seed | |
161 | (cons (cons '@ attrs) seed)) | |
162 | parent-seed))) | |
163 | ||
164 | CHAR-DATA-HANDLER ; fhere | |
165 | (lambda (string1 string2 seed) | |
166 | (if (string-null? string2) | |
167 | (cons string1 seed) | |
168 | (cons* string2 string1 seed))) | |
169 | ||
170 | DOCTYPE | |
171 | ;; -> ELEMS ENTITIES NAMESPACES SEED | |
172 | ;; | |
173 | ;; ELEMS is for validation and currently unused. | |
174 | ;; | |
175 | ;; ENTITIES is an alist of parsed entities (symbol -> string). | |
176 | ;; | |
177 | ;; NAMESPACES is as above. | |
178 | ;; | |
179 | ;; SEED builds up the content. | |
180 | (lambda (port docname systemid internal-subset? seed) | |
e10c2509 AW |
181 | (call-with-values |
182 | (lambda () | |
183 | (cond | |
184 | (doctype-handler | |
185 | (doctype-handler docname systemid | |
186 | (and internal-subset? | |
187 | (read-internal-doctype-as-string port)))) | |
188 | (else | |
189 | (when internal-subset? | |
190 | (ssax:skip-internal-dtd port)) | |
191 | (values)))) | |
192 | (doctype-continuation seed))) | |
1488753a AW |
193 | |
194 | UNDECL-ROOT | |
195 | ;; This is like the DOCTYPE handler, but for documents that do not | |
196 | ;; have a <!DOCTYPE!> entry. | |
197 | (lambda (elem-gi seed) | |
e10c2509 AW |
198 | (call-with-values |
199 | (lambda () | |
200 | (if doctype-handler | |
201 | (doctype-handler #f #f #f) | |
202 | (values))) | |
203 | (doctype-continuation seed))) | |
1488753a AW |
204 | |
205 | PI | |
206 | ((*DEFAULT* | |
207 | . (lambda (port pi-tag seed) | |
208 | (cons | |
209 | (list '*PI* pi-tag (ssax:read-pi-body-as-string port)) | |
210 | seed)))))) | |
211 | ||
a14b6e18 AW |
212 | (let* ((port (if (string? string-or-port) |
213 | (open-input-string string-or-port) | |
214 | string-or-port)) | |
215 | (elements (reverse (parser port '())))) | |
216 | `(*TOP* ,@elements))) | |
47f3ce52 | 217 | |
d9fff48e AW |
218 | (define check-name |
219 | (let ((*good-cache* (make-hash-table))) | |
220 | (lambda (name) | |
221 | (if (not (hashq-ref *good-cache* name)) | |
222 | (let* ((str (symbol->string name)) | |
223 | (i (string-index str #\:)) | |
224 | (head (or (and i (substring str 0 i)) str)) | |
225 | (tail (and i (substring str (1+ i))))) | |
226 | (and i (string-index (substring str (1+ i)) #\:) | |
227 | (error "Invalid QName: more than one colon" name)) | |
228 | (for-each | |
229 | (lambda (s) | |
230 | (and s | |
231 | (or (char-alphabetic? (string-ref s 0)) | |
232 | (eq? (string-ref s 0) #\_) | |
233 | (error "Invalid name starting character" s name)) | |
234 | (string-for-each | |
235 | (lambda (c) | |
236 | (or (char-alphabetic? c) (string-index "0123456789.-_" c) | |
237 | (error "Invalid name character" c s name))) | |
238 | s))) | |
239 | (list head tail)) | |
240 | (hashq-set! *good-cache* name #t)))))) | |
241 | ||
242 | ;; The following two functions serialize tags and attributes. They are | |
243 | ;; being used in the node handlers for the post-order function, see | |
244 | ;; below. | |
245 | ||
246 | (define (attribute-value->xml value port) | |
247 | (cond | |
248 | ((pair? value) | |
249 | (attribute-value->xml (car value) port) | |
250 | (attribute-value->xml (cdr value) port)) | |
6901bad0 AW |
251 | ((null? value) |
252 | *unspecified*) | |
d9fff48e AW |
253 | ((string? value) |
254 | (string->escaped-xml value port)) | |
255 | ((procedure? value) | |
256 | (with-output-to-port port value)) | |
257 | (else | |
258 | (string->escaped-xml | |
259 | (call-with-output-string (lambda (port) (display value port))) | |
260 | port)))) | |
261 | ||
262 | (define (attribute->xml attr value port) | |
263 | (check-name attr) | |
264 | (display attr port) | |
265 | (display "=\"" port) | |
266 | (attribute-value->xml value port) | |
267 | (display #\" port)) | |
268 | ||
269 | (define (element->xml tag attrs body port) | |
270 | (check-name tag) | |
271 | (display #\< port) | |
272 | (display tag port) | |
273 | (if attrs | |
274 | (let lp ((attrs attrs)) | |
275 | (if (pair? attrs) | |
276 | (let ((attr (car attrs))) | |
277 | (display #\space port) | |
278 | (if (pair? attr) | |
279 | (attribute->xml (car attr) (cdr attr) port) | |
280 | (error "bad attribute" tag attr)) | |
281 | (lp (cdr attrs))) | |
282 | (if (not (null? attrs)) | |
283 | (error "bad attributes" tag attrs))))) | |
284 | (if (pair? body) | |
285 | (begin | |
286 | (display #\> port) | |
287 | (let lp ((body body)) | |
288 | (cond | |
289 | ((pair? body) | |
290 | (sxml->xml (car body) port) | |
291 | (lp (cdr body))) | |
292 | ((null? body) | |
293 | (display "</" port) | |
294 | (display tag port) | |
295 | (display ">" port)) | |
296 | (else | |
297 | (error "bad element body" tag body))))) | |
298 | (display " />" port))) | |
299 | ||
300 | ;; FIXME: ensure name is valid | |
301 | (define (entity->xml name port) | |
302 | (display #\& port) | |
303 | (display name port) | |
304 | (display #\; port)) | |
305 | ||
306 | ;; FIXME: ensure tag and str are valid | |
307 | (define (pi->xml tag str port) | |
308 | (display "<?" port) | |
309 | (display tag port) | |
310 | (display #\space port) | |
311 | (display str port) | |
312 | (display "?>" port)) | |
47f3ce52 AW |
313 | |
314 | (define* (sxml->xml tree #:optional (port (current-output-port))) | |
315 | "Serialize the sxml tree @var{tree} as XML. The output will be written | |
316 | to the current output port, unless the optional argument @var{port} is | |
317 | present." | |
d9fff48e AW |
318 | (cond |
319 | ((pair? tree) | |
320 | (if (symbol? (car tree)) | |
321 | ;; An element. | |
322 | (let ((tag (car tree))) | |
323 | (case tag | |
324 | ((*TOP*) | |
325 | (sxml->xml (cdr tree) port)) | |
326 | ((*ENTITY*) | |
327 | (if (and (list? (cdr tree)) (= (length (cdr tree)) 1)) | |
328 | (entity->xml (cadr tree) port) | |
329 | (error "bad *ENTITY* args" (cdr tree)))) | |
330 | ((*PI*) | |
331 | (if (and (list? (cdr tree)) (= (length (cdr tree)) 2)) | |
332 | (pi->xml (cadr tree) (caddr tree) port) | |
333 | (error "bad *PI* args" (cdr tree)))) | |
334 | (else | |
335 | (let* ((elems (cdr tree)) | |
336 | (attrs (and (pair? elems) (pair? (car elems)) | |
337 | (eq? '@ (caar elems)) | |
338 | (cdar elems)))) | |
339 | (element->xml tag attrs (if attrs (cdr elems) elems) port))))) | |
340 | ;; A nodelist. | |
341 | (for-each (lambda (x) (sxml->xml x port)) tree))) | |
342 | ((string? tree) | |
343 | (string->escaped-xml tree port)) | |
344 | ((null? tree) *unspecified*) | |
345 | ((not tree) *unspecified*) | |
346 | ((eqv? tree #t) *unspecified*) | |
347 | ((procedure? tree) | |
348 | (with-output-to-port port tree)) | |
349 | (else | |
350 | (string->escaped-xml | |
351 | (call-with-output-string (lambda (port) (display tree port))) | |
352 | port)))) | |
47f3ce52 AW |
353 | |
354 | (define (sxml->string sxml) | |
355 | "Detag an sxml tree @var{sxml} into a string. Does not perform any | |
356 | formatting." | |
357 | (string-concatenate-reverse | |
358 | (foldts | |
359 | (lambda (seed tree) ; fdown | |
360 | '()) | |
361 | (lambda (seed kid-seed tree) ; fup | |
362 | (append! kid-seed seed)) | |
363 | (lambda (seed tree) ; fhere | |
364 | (if (string? tree) (cons tree seed) seed)) | |
365 | '() | |
366 | sxml))) | |
367 | ||
47f3ce52 | 368 | (define (make-char-quotator char-encoding) |
d9fff48e | 369 | (let ((bad-chars (list->char-set (map car char-encoding)))) |
47f3ce52 AW |
370 | |
371 | ;; Check to see if str contains one of the characters in charset, | |
372 | ;; from the position i onward. If so, return that character's index. | |
373 | ;; otherwise, return #f | |
374 | (define (index-cset str i charset) | |
d9fff48e AW |
375 | (string-index str charset i)) |
376 | ||
47f3ce52 | 377 | ;; The body of the function |
d9fff48e | 378 | (lambda (str port) |
47f3ce52 | 379 | (let ((bad-pos (index-cset str 0 bad-chars))) |
d9fff48e AW |
380 | (if (not bad-pos) |
381 | (display str port) ; str had all good chars | |
382 | (let loop ((from 0) (to bad-pos)) | |
383 | (cond | |
384 | ((>= from (string-length str)) *unspecified*) | |
385 | ((not to) | |
386 | (display (substring str from (string-length str)) port)) | |
387 | (else | |
388 | (let ((quoted-char | |
389 | (cdr (assv (string-ref str to) char-encoding))) | |
390 | (new-to | |
391 | (index-cset str (+ 1 to) bad-chars))) | |
392 | (if (< from to) | |
393 | (display (substring str from to) port)) | |
394 | (display quoted-char port) | |
395 | (loop (1+ to) new-to)))))))))) | |
47f3ce52 AW |
396 | |
397 | ;; Given a string, check to make sure it does not contain characters | |
398 | ;; such as '<' or '&' that require encoding. Return either the original | |
399 | ;; string, or a list of string fragments with special characters | |
400 | ;; replaced by appropriate character entities. | |
401 | ||
402 | (define string->escaped-xml | |
403 | (make-char-quotator | |
404 | '((#\< . "<") (#\> . ">") (#\& . "&") (#\" . """)))) | |
405 | ||
406 | ;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac | |
407 | ;;; simple.scm ends here | |
408 |