make (sxml simple)'s xml->sxml more capable
[bpt/guile.git] / module / sxml / simple.scm
1 ;;;; (sxml simple) -- a simple interface to the SSAX parser
2 ;;;;
3 ;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
4 ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
5 ;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm.
6 ;;;;
7 ;;;; This library is free software; you can redistribute it and/or
8 ;;;; modify it under the terms of the GNU Lesser General Public
9 ;;;; License as published by the Free Software Foundation; either
10 ;;;; version 3 of the License, or (at your option) any later version.
11 ;;;;
12 ;;;; This library is distributed in the hope that it will be useful,
13 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;;;; Lesser General Public License for more details.
16 ;;;;
17 ;;;; You should have received a copy of the GNU Lesser General Public
18 ;;;; License along with this library; if not, write to the Free Software
19 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;;;;
21 \f
22 ;;; Commentary:
23 ;;
24 ;;A simple interface to XML parsing and serialization.
25 ;;
26 ;;; Code:
27
28 (define-module (sxml simple)
29 #:use-module (sxml ssax)
30 #:use-module (sxml transform)
31 #:use-module (ice-9 match)
32 #:use-module (srfi srfi-13)
33 #:export (xml->sxml sxml->xml sxml->string))
34
35 ;; Helpers from upstream/SSAX.scm.
36 ;;
37
38 (define (ssax:warn port msg . args)
39 (format (current-ssax-error-port)
40 ";;; SSAX warning: ~a ~a\n" msg args))
41
42 ; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
43 ; given the list of fragments (some of which are text strings)
44 ; reverse the list and concatenate adjacent text strings.
45 ; We can prove from the general case below that if LIST-OF-FRAGS
46 ; has zero or one element, the result of the procedure is equal?
47 ; to its argument. This fact justifies the shortcut evaluation below.
48 (define (ssax:reverse-collect-str fragments)
49 (cond
50 ((null? fragments) '()) ; a shortcut
51 ((null? (cdr fragments)) fragments) ; see the comment above
52 (else
53 (let loop ((fragments fragments) (result '()) (strs '()))
54 (cond
55 ((null? fragments)
56 (if (null? strs) result
57 (cons (string-concatenate/shared strs) result)))
58 ((string? (car fragments))
59 (loop (cdr fragments) result (cons (car fragments) strs)))
60 (else
61 (loop (cdr fragments)
62 (cons
63 (car fragments)
64 (if (null? strs) result
65 (cons (string-concatenate/shared strs) result)))
66 '())))))))
67
68 ;; Ideas for the future for this interface:
69 ;;
70 ;; * Allow doctypes to provide parsed entities
71 ;;
72 ;; * Allow validation (the ELEMENTS value from the DOCTYPE handler
73 ;; below)
74 ;;
75 ;; * Parse internal DTDs
76 ;;
77 ;; * Parse external DTDs
78 ;;
79 (define* (xml->sxml #:optional (port (current-input-port)) #:key
80 (namespaces '())
81 (declare-namespaces? #t)
82 (trim-whitespace? #f)
83 (entities '())
84 (default-entity-handler #f))
85 "Use SSAX to parse an XML document into SXML. Takes one optional
86 argument, @var{port}, which defaults to the current input port."
87 ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix
88 ;; that the user wants on elements of a given namespace in the
89 ;; resulting SXML, regardless of the abbreviated namespaces defined in
90 ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true,
91 ;; these namespaces are treated as if they were declared in the DTD.
92
93 ;; ENTITIES: alist of SYMBOL -> STRING.
94
95 ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)).
96 ;; A DOC-PREFIX of #f indicates that it comes from the user.
97 ;; Otherwise, prefixes are symbols.
98 (define (user-namespaces)
99 (map (lambda (el)
100 (match el
101 ((prefix . uri-string)
102 (cons* (and declare-namespaces? prefix)
103 prefix
104 (ssax:uri-string->symbol uri-string)))))
105 namespaces))
106
107 (define (user-entities)
108 (if (and default-entity-handler
109 (not (assq '*DEFAULT* entities)))
110 (acons '*DEFAULT* default-entity-handler entities)
111 entities))
112
113 (define (name->sxml name)
114 (match name
115 ((prefix . local-part)
116 (symbol-append prefix (string->symbol ":") local-part))
117 (_ name)))
118
119 ;; The SEED in this parser is the SXML: initialized to '() at each new
120 ;; level by the fdown handlers; built in reverse by the fhere parsers;
121 ;; and reverse-collected by the fup handlers.
122 (define parser
123 (ssax:make-parser
124 NEW-LEVEL-SEED ; fdown
125 (lambda (elem-gi attributes namespaces expected-content seed)
126 '())
127
128 FINISH-ELEMENT ; fup
129 (lambda (elem-gi attributes namespaces parent-seed seed)
130 (let ((seed (if trim-whitespace?
131 (ssax:reverse-collect-str-drop-ws seed)
132 (ssax:reverse-collect-str seed)))
133 (attrs (attlist-fold
134 (lambda (attr accum)
135 (cons (list (name->sxml (car attr)) (cdr attr))
136 accum))
137 '() attributes)))
138 (acons (name->sxml elem-gi)
139 (if (null? attrs)
140 seed
141 (cons (cons '@ attrs) seed))
142 parent-seed)))
143
144 CHAR-DATA-HANDLER ; fhere
145 (lambda (string1 string2 seed)
146 (if (string-null? string2)
147 (cons string1 seed)
148 (cons* string2 string1 seed)))
149
150 DOCTYPE
151 ;; -> ELEMS ENTITIES NAMESPACES SEED
152 ;;
153 ;; ELEMS is for validation and currently unused.
154 ;;
155 ;; ENTITIES is an alist of parsed entities (symbol -> string).
156 ;;
157 ;; NAMESPACES is as above.
158 ;;
159 ;; SEED builds up the content.
160 (lambda (port docname systemid internal-subset? seed)
161 (when internal-subset?
162 (ssax:warn port "Internal DTD subset is not currently handled ")
163 (ssax:skip-internal-dtd port))
164 (ssax:warn port "DOCTYPE DECL " docname " "
165 systemid " found and skipped")
166 (values #f (user-entities) (user-namespaces) seed))
167
168 UNDECL-ROOT
169 ;; This is like the DOCTYPE handler, but for documents that do not
170 ;; have a <!DOCTYPE!> entry.
171 (lambda (elem-gi seed)
172 (values #f (user-entities) (user-namespaces) seed))
173
174 PI
175 ((*DEFAULT*
176 . (lambda (port pi-tag seed)
177 (cons
178 (list '*PI* pi-tag (ssax:read-pi-body-as-string port))
179 seed))))))
180
181 `(*TOP* ,@(reverse (parser port '()))))
182
183 (define check-name
184 (let ((*good-cache* (make-hash-table)))
185 (lambda (name)
186 (if (not (hashq-ref *good-cache* name))
187 (let* ((str (symbol->string name))
188 (i (string-index str #\:))
189 (head (or (and i (substring str 0 i)) str))
190 (tail (and i (substring str (1+ i)))))
191 (and i (string-index (substring str (1+ i)) #\:)
192 (error "Invalid QName: more than one colon" name))
193 (for-each
194 (lambda (s)
195 (and s
196 (or (char-alphabetic? (string-ref s 0))
197 (eq? (string-ref s 0) #\_)
198 (error "Invalid name starting character" s name))
199 (string-for-each
200 (lambda (c)
201 (or (char-alphabetic? c) (string-index "0123456789.-_" c)
202 (error "Invalid name character" c s name)))
203 s)))
204 (list head tail))
205 (hashq-set! *good-cache* name #t))))))
206
207 ;; The following two functions serialize tags and attributes. They are
208 ;; being used in the node handlers for the post-order function, see
209 ;; below.
210
211 (define (attribute-value->xml value port)
212 (cond
213 ((pair? value)
214 (attribute-value->xml (car value) port)
215 (attribute-value->xml (cdr value) port))
216 ((null? value)
217 *unspecified*)
218 ((string? value)
219 (string->escaped-xml value port))
220 ((procedure? value)
221 (with-output-to-port port value))
222 (else
223 (string->escaped-xml
224 (call-with-output-string (lambda (port) (display value port)))
225 port))))
226
227 (define (attribute->xml attr value port)
228 (check-name attr)
229 (display attr port)
230 (display "=\"" port)
231 (attribute-value->xml value port)
232 (display #\" port))
233
234 (define (element->xml tag attrs body port)
235 (check-name tag)
236 (display #\< port)
237 (display tag port)
238 (if attrs
239 (let lp ((attrs attrs))
240 (if (pair? attrs)
241 (let ((attr (car attrs)))
242 (display #\space port)
243 (if (pair? attr)
244 (attribute->xml (car attr) (cdr attr) port)
245 (error "bad attribute" tag attr))
246 (lp (cdr attrs)))
247 (if (not (null? attrs))
248 (error "bad attributes" tag attrs)))))
249 (if (pair? body)
250 (begin
251 (display #\> port)
252 (let lp ((body body))
253 (cond
254 ((pair? body)
255 (sxml->xml (car body) port)
256 (lp (cdr body)))
257 ((null? body)
258 (display "</" port)
259 (display tag port)
260 (display ">" port))
261 (else
262 (error "bad element body" tag body)))))
263 (display " />" port)))
264
265 ;; FIXME: ensure name is valid
266 (define (entity->xml name port)
267 (display #\& port)
268 (display name port)
269 (display #\; port))
270
271 ;; FIXME: ensure tag and str are valid
272 (define (pi->xml tag str port)
273 (display "<?" port)
274 (display tag port)
275 (display #\space port)
276 (display str port)
277 (display "?>" port))
278
279 (define* (sxml->xml tree #:optional (port (current-output-port)))
280 "Serialize the sxml tree @var{tree} as XML. The output will be written
281 to the current output port, unless the optional argument @var{port} is
282 present."
283 (cond
284 ((pair? tree)
285 (if (symbol? (car tree))
286 ;; An element.
287 (let ((tag (car tree)))
288 (case tag
289 ((*TOP*)
290 (sxml->xml (cdr tree) port))
291 ((*ENTITY*)
292 (if (and (list? (cdr tree)) (= (length (cdr tree)) 1))
293 (entity->xml (cadr tree) port)
294 (error "bad *ENTITY* args" (cdr tree))))
295 ((*PI*)
296 (if (and (list? (cdr tree)) (= (length (cdr tree)) 2))
297 (pi->xml (cadr tree) (caddr tree) port)
298 (error "bad *PI* args" (cdr tree))))
299 (else
300 (let* ((elems (cdr tree))
301 (attrs (and (pair? elems) (pair? (car elems))
302 (eq? '@ (caar elems))
303 (cdar elems))))
304 (element->xml tag attrs (if attrs (cdr elems) elems) port)))))
305 ;; A nodelist.
306 (for-each (lambda (x) (sxml->xml x port)) tree)))
307 ((string? tree)
308 (string->escaped-xml tree port))
309 ((null? tree) *unspecified*)
310 ((not tree) *unspecified*)
311 ((eqv? tree #t) *unspecified*)
312 ((procedure? tree)
313 (with-output-to-port port tree))
314 (else
315 (string->escaped-xml
316 (call-with-output-string (lambda (port) (display tree port)))
317 port))))
318
319 (define (sxml->string sxml)
320 "Detag an sxml tree @var{sxml} into a string. Does not perform any
321 formatting."
322 (string-concatenate-reverse
323 (foldts
324 (lambda (seed tree) ; fdown
325 '())
326 (lambda (seed kid-seed tree) ; fup
327 (append! kid-seed seed))
328 (lambda (seed tree) ; fhere
329 (if (string? tree) (cons tree seed) seed))
330 '()
331 sxml)))
332
333 (define (make-char-quotator char-encoding)
334 (let ((bad-chars (list->char-set (map car char-encoding))))
335
336 ;; Check to see if str contains one of the characters in charset,
337 ;; from the position i onward. If so, return that character's index.
338 ;; otherwise, return #f
339 (define (index-cset str i charset)
340 (string-index str charset i))
341
342 ;; The body of the function
343 (lambda (str port)
344 (let ((bad-pos (index-cset str 0 bad-chars)))
345 (if (not bad-pos)
346 (display str port) ; str had all good chars
347 (let loop ((from 0) (to bad-pos))
348 (cond
349 ((>= from (string-length str)) *unspecified*)
350 ((not to)
351 (display (substring str from (string-length str)) port))
352 (else
353 (let ((quoted-char
354 (cdr (assv (string-ref str to) char-encoding)))
355 (new-to
356 (index-cset str (+ 1 to) bad-chars)))
357 (if (< from to)
358 (display (substring str from to) port))
359 (display quoted-char port)
360 (loop (1+ to) new-to))))))))))
361
362 ;; Given a string, check to make sure it does not contain characters
363 ;; such as '<' or '&' that require encoding. Return either the original
364 ;; string, or a list of string fragments with special characters
365 ;; replaced by appropriate character entities.
366
367 (define string->escaped-xml
368 (make-char-quotator
369 '((#\< . "&lt;") (#\> . "&gt;") (#\& . "&amp;") (#\" . "&quot;"))))
370
371 ;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac
372 ;;; simple.scm ends here
373