guile-backtrace function
[bpt/guile.git] / module / sxml / simple.scm
1 ;;;; (sxml simple) -- a simple interface to the SSAX parser
2 ;;;;
3 ;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
4 ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
5 ;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm.
6 ;;;;
7 ;;;; This library is free software; you can redistribute it and/or
8 ;;;; modify it under the terms of the GNU Lesser General Public
9 ;;;; License as published by the Free Software Foundation; either
10 ;;;; version 3 of the License, or (at your option) any later version.
11 ;;;;
12 ;;;; This library is distributed in the hope that it will be useful,
13 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;;;; Lesser General Public License for more details.
16 ;;;;
17 ;;;; You should have received a copy of the GNU Lesser General Public
18 ;;;; License along with this library; if not, write to the Free Software
19 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;;;;
21 \f
22 ;;; Commentary:
23 ;;
24 ;;A simple interface to XML parsing and serialization.
25 ;;
26 ;;; Code:
27
28 (define-module (sxml simple)
29 #:use-module (sxml ssax input-parse)
30 #:use-module (sxml ssax)
31 #:use-module (sxml transform)
32 #:use-module (ice-9 match)
33 #:use-module (srfi srfi-13)
34 #:export (xml->sxml sxml->xml sxml->string))
35
36 ;; Helpers from upstream/SSAX.scm.
37 ;;
38
39 ; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
40 ; given the list of fragments (some of which are text strings)
41 ; reverse the list and concatenate adjacent text strings.
42 ; We can prove from the general case below that if LIST-OF-FRAGS
43 ; has zero or one element, the result of the procedure is equal?
44 ; to its argument. This fact justifies the shortcut evaluation below.
45 (define (ssax:reverse-collect-str fragments)
46 (cond
47 ((null? fragments) '()) ; a shortcut
48 ((null? (cdr fragments)) fragments) ; see the comment above
49 (else
50 (let loop ((fragments fragments) (result '()) (strs '()))
51 (cond
52 ((null? fragments)
53 (if (null? strs) result
54 (cons (string-concatenate/shared strs) result)))
55 ((string? (car fragments))
56 (loop (cdr fragments) result (cons (car fragments) strs)))
57 (else
58 (loop (cdr fragments)
59 (cons
60 (car fragments)
61 (if (null? strs) result
62 (cons (string-concatenate/shared strs) result)))
63 '())))))))
64
65 (define (read-internal-doctype-as-string port)
66 (string-concatenate/shared
67 (let loop ()
68 (let ((fragment
69 (next-token '() '(#\]) "reading internal DOCTYPE" port)))
70 (if (eqv? #\> (peek-next-char port))
71 (begin
72 (read-char port)
73 (cons fragment '()))
74 (cons* fragment "]" (loop)))))))
75
76 ;; Ideas for the future for this interface:
77 ;;
78 ;; * Allow doctypes to provide parsed entities
79 ;;
80 ;; * Allow validation (the ELEMENTS value from the DOCTYPE handler
81 ;; below)
82 ;;
83 ;; * Parse internal DTDs
84 ;;
85 ;; * Parse external DTDs
86 ;;
87 (define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key
88 (namespaces '())
89 (declare-namespaces? #t)
90 (trim-whitespace? #f)
91 (entities '())
92 (default-entity-handler #f)
93 (doctype-handler #f))
94 "Use SSAX to parse an XML document into SXML. Takes one optional
95 argument, @var{string-or-port}, which defaults to the current input
96 port."
97 ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix
98 ;; that the user wants on elements of a given namespace in the
99 ;; resulting SXML, regardless of the abbreviated namespaces defined in
100 ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true,
101 ;; these namespaces are treated as if they were declared in the DTD.
102
103 ;; ENTITIES: alist of SYMBOL -> STRING.
104
105 ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)).
106 ;; A DOC-PREFIX of #f indicates that it comes from the user.
107 ;; Otherwise, prefixes are symbols.
108 (define (munge-namespaces namespaces)
109 (map (lambda (el)
110 (match el
111 ((prefix . uri-string)
112 (cons* (and declare-namespaces? prefix)
113 prefix
114 (ssax:uri-string->symbol uri-string)))))
115 namespaces))
116
117 (define (user-namespaces)
118 (munge-namespaces namespaces))
119
120 (define (user-entities)
121 (if (and default-entity-handler
122 (not (assq '*DEFAULT* entities)))
123 (acons '*DEFAULT* default-entity-handler entities)
124 entities))
125
126 (define (name->sxml name)
127 (match name
128 ((prefix . local-part)
129 (symbol-append prefix (string->symbol ":") local-part))
130 (_ name)))
131
132 (define (doctype-continuation seed)
133 (lambda* (#:key (entities '()) (namespaces '()))
134 (values #f
135 (append entities (user-entities))
136 (append (munge-namespaces namespaces) (user-namespaces))
137 seed)))
138
139 ;; The SEED in this parser is the SXML: initialized to '() at each new
140 ;; level by the fdown handlers; built in reverse by the fhere parsers;
141 ;; and reverse-collected by the fup handlers.
142 (define parser
143 (ssax:make-parser
144 NEW-LEVEL-SEED ; fdown
145 (lambda (elem-gi attributes namespaces expected-content seed)
146 '())
147
148 FINISH-ELEMENT ; fup
149 (lambda (elem-gi attributes namespaces parent-seed seed)
150 (let ((seed (if trim-whitespace?
151 (ssax:reverse-collect-str-drop-ws seed)
152 (ssax:reverse-collect-str seed)))
153 (attrs (attlist-fold
154 (lambda (attr accum)
155 (cons (list (name->sxml (car attr)) (cdr attr))
156 accum))
157 '() attributes)))
158 (acons (name->sxml elem-gi)
159 (if (null? attrs)
160 seed
161 (cons (cons '@ attrs) seed))
162 parent-seed)))
163
164 CHAR-DATA-HANDLER ; fhere
165 (lambda (string1 string2 seed)
166 (if (string-null? string2)
167 (cons string1 seed)
168 (cons* string2 string1 seed)))
169
170 DOCTYPE
171 ;; -> ELEMS ENTITIES NAMESPACES SEED
172 ;;
173 ;; ELEMS is for validation and currently unused.
174 ;;
175 ;; ENTITIES is an alist of parsed entities (symbol -> string).
176 ;;
177 ;; NAMESPACES is as above.
178 ;;
179 ;; SEED builds up the content.
180 (lambda (port docname systemid internal-subset? seed)
181 (call-with-values
182 (lambda ()
183 (cond
184 (doctype-handler
185 (doctype-handler docname systemid
186 (and internal-subset?
187 (read-internal-doctype-as-string port))))
188 (else
189 (when internal-subset?
190 (ssax:skip-internal-dtd port))
191 (values))))
192 (doctype-continuation seed)))
193
194 UNDECL-ROOT
195 ;; This is like the DOCTYPE handler, but for documents that do not
196 ;; have a <!DOCTYPE!> entry.
197 (lambda (elem-gi seed)
198 (call-with-values
199 (lambda ()
200 (if doctype-handler
201 (doctype-handler #f #f #f)
202 (values)))
203 (doctype-continuation seed)))
204
205 PI
206 ((*DEFAULT*
207 . (lambda (port pi-tag seed)
208 (cons
209 (list '*PI* pi-tag (ssax:read-pi-body-as-string port))
210 seed))))))
211
212 (let* ((port (if (string? string-or-port)
213 (open-input-string string-or-port)
214 string-or-port))
215 (elements (reverse (parser port '()))))
216 `(*TOP* ,@elements)))
217
218 (define check-name
219 (let ((*good-cache* (make-hash-table)))
220 (lambda (name)
221 (if (not (hashq-ref *good-cache* name))
222 (let* ((str (symbol->string name))
223 (i (string-index str #\:))
224 (head (or (and i (substring str 0 i)) str))
225 (tail (and i (substring str (1+ i)))))
226 (and i (string-index (substring str (1+ i)) #\:)
227 (error "Invalid QName: more than one colon" name))
228 (for-each
229 (lambda (s)
230 (and s
231 (or (char-alphabetic? (string-ref s 0))
232 (eq? (string-ref s 0) #\_)
233 (error "Invalid name starting character" s name))
234 (string-for-each
235 (lambda (c)
236 (or (char-alphabetic? c) (string-index "0123456789.-_" c)
237 (error "Invalid name character" c s name)))
238 s)))
239 (list head tail))
240 (hashq-set! *good-cache* name #t))))))
241
242 ;; The following two functions serialize tags and attributes. They are
243 ;; being used in the node handlers for the post-order function, see
244 ;; below.
245
246 (define (attribute-value->xml value port)
247 (cond
248 ((pair? value)
249 (attribute-value->xml (car value) port)
250 (attribute-value->xml (cdr value) port))
251 ((null? value)
252 *unspecified*)
253 ((string? value)
254 (string->escaped-xml value port))
255 ((procedure? value)
256 (with-output-to-port port value))
257 (else
258 (string->escaped-xml
259 (call-with-output-string (lambda (port) (display value port)))
260 port))))
261
262 (define (attribute->xml attr value port)
263 (check-name attr)
264 (display attr port)
265 (display "=\"" port)
266 (attribute-value->xml value port)
267 (display #\" port))
268
269 (define (element->xml tag attrs body port)
270 (check-name tag)
271 (display #\< port)
272 (display tag port)
273 (if attrs
274 (let lp ((attrs attrs))
275 (if (pair? attrs)
276 (let ((attr (car attrs)))
277 (display #\space port)
278 (if (pair? attr)
279 (attribute->xml (car attr) (cdr attr) port)
280 (error "bad attribute" tag attr))
281 (lp (cdr attrs)))
282 (if (not (null? attrs))
283 (error "bad attributes" tag attrs)))))
284 (if (pair? body)
285 (begin
286 (display #\> port)
287 (let lp ((body body))
288 (cond
289 ((pair? body)
290 (sxml->xml (car body) port)
291 (lp (cdr body)))
292 ((null? body)
293 (display "</" port)
294 (display tag port)
295 (display ">" port))
296 (else
297 (error "bad element body" tag body)))))
298 (display " />" port)))
299
300 ;; FIXME: ensure name is valid
301 (define (entity->xml name port)
302 (display #\& port)
303 (display name port)
304 (display #\; port))
305
306 ;; FIXME: ensure tag and str are valid
307 (define (pi->xml tag str port)
308 (display "<?" port)
309 (display tag port)
310 (display #\space port)
311 (display str port)
312 (display "?>" port))
313
314 (define* (sxml->xml tree #:optional (port (current-output-port)))
315 "Serialize the sxml tree @var{tree} as XML. The output will be written
316 to the current output port, unless the optional argument @var{port} is
317 present."
318 (cond
319 ((pair? tree)
320 (if (symbol? (car tree))
321 ;; An element.
322 (let ((tag (car tree)))
323 (case tag
324 ((*TOP*)
325 (sxml->xml (cdr tree) port))
326 ((*ENTITY*)
327 (if (and (list? (cdr tree)) (= (length (cdr tree)) 1))
328 (entity->xml (cadr tree) port)
329 (error "bad *ENTITY* args" (cdr tree))))
330 ((*PI*)
331 (if (and (list? (cdr tree)) (= (length (cdr tree)) 2))
332 (pi->xml (cadr tree) (caddr tree) port)
333 (error "bad *PI* args" (cdr tree))))
334 (else
335 (let* ((elems (cdr tree))
336 (attrs (and (pair? elems) (pair? (car elems))
337 (eq? '@ (caar elems))
338 (cdar elems))))
339 (element->xml tag attrs (if attrs (cdr elems) elems) port)))))
340 ;; A nodelist.
341 (for-each (lambda (x) (sxml->xml x port)) tree)))
342 ((string? tree)
343 (string->escaped-xml tree port))
344 ((null? tree) *unspecified*)
345 ((not tree) *unspecified*)
346 ((eqv? tree #t) *unspecified*)
347 ((procedure? tree)
348 (with-output-to-port port tree))
349 (else
350 (string->escaped-xml
351 (call-with-output-string (lambda (port) (display tree port)))
352 port))))
353
354 (define (sxml->string sxml)
355 "Detag an sxml tree @var{sxml} into a string. Does not perform any
356 formatting."
357 (string-concatenate-reverse
358 (foldts
359 (lambda (seed tree) ; fdown
360 '())
361 (lambda (seed kid-seed tree) ; fup
362 (append! kid-seed seed))
363 (lambda (seed tree) ; fhere
364 (if (string? tree) (cons tree seed) seed))
365 '()
366 sxml)))
367
368 (define (make-char-quotator char-encoding)
369 (let ((bad-chars (list->char-set (map car char-encoding))))
370
371 ;; Check to see if str contains one of the characters in charset,
372 ;; from the position i onward. If so, return that character's index.
373 ;; otherwise, return #f
374 (define (index-cset str i charset)
375 (string-index str charset i))
376
377 ;; The body of the function
378 (lambda (str port)
379 (let ((bad-pos (index-cset str 0 bad-chars)))
380 (if (not bad-pos)
381 (display str port) ; str had all good chars
382 (let loop ((from 0) (to bad-pos))
383 (cond
384 ((>= from (string-length str)) *unspecified*)
385 ((not to)
386 (display (substring str from (string-length str)) port))
387 (else
388 (let ((quoted-char
389 (cdr (assv (string-ref str to) char-encoding)))
390 (new-to
391 (index-cset str (+ 1 to) bad-chars)))
392 (if (< from to)
393 (display (substring str from to) port))
394 (display quoted-char port)
395 (loop (1+ to) new-to))))))))))
396
397 ;; Given a string, check to make sure it does not contain characters
398 ;; such as '<' or '&' that require encoding. Return either the original
399 ;; string, or a list of string fragments with special characters
400 ;; replaced by appropriate character entities.
401
402 (define string->escaped-xml
403 (make-char-quotator
404 '((#\< . "&lt;") (#\> . "&gt;") (#\& . "&amp;") (#\" . "&quot;"))))
405
406 ;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac
407 ;;; simple.scm ends here
408