1 ;;;; (sxml simple) -- a simple interface to the SSAX parser
3 ;;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
4 ;;;; Modified 2004 by Andy Wingo <wingo at pobox dot com>.
5 ;;;; Originally written by Oleg Kiselyov <oleg at pobox dot com> as SXML-to-HTML.scm.
7 ;;;; This library is free software; you can redistribute it and/or
8 ;;;; modify it under the terms of the GNU Lesser General Public
9 ;;;; License as published by the Free Software Foundation; either
10 ;;;; version 3 of the License, or (at your option) any later version.
12 ;;;; This library is distributed in the hope that it will be useful,
13 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;;;; Lesser General Public License for more details.
17 ;;;; You should have received a copy of the GNU Lesser General Public
18 ;;;; License along with this library; if not, write to the Free Software
19 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;;A simple interface to XML parsing and serialization.
28 (define-module (sxml simple)
29 #:use-module (sxml ssax input-parse)
30 #:use-module (sxml ssax)
31 #:use-module (sxml transform)
32 #:use-module (ice-9 match)
33 #:use-module (srfi srfi-13)
34 #:export (xml->sxml sxml->xml sxml->string))
36 ;; Helpers from upstream/SSAX.scm.
39 ; ssax:reverse-collect-str LIST-OF-FRAGS -> LIST-OF-FRAGS
40 ; given the list of fragments (some of which are text strings)
41 ; reverse the list and concatenate adjacent text strings.
42 ; We can prove from the general case below that if LIST-OF-FRAGS
43 ; has zero or one element, the result of the procedure is equal?
44 ; to its argument. This fact justifies the shortcut evaluation below.
45 (define (ssax:reverse-collect-str fragments)
47 ((null? fragments) '()) ; a shortcut
48 ((null? (cdr fragments)) fragments) ; see the comment above
50 (let loop ((fragments fragments) (result '()) (strs '()))
53 (if (null? strs) result
54 (cons (string-concatenate/shared strs) result)))
55 ((string? (car fragments))
56 (loop (cdr fragments) result (cons (car fragments) strs)))
61 (if (null? strs) result
62 (cons (string-concatenate/shared strs) result)))
65 (define (read-internal-doctype-as-string port)
66 (string-concatenate/shared
69 (next-token '() '(#\]) "reading internal DOCTYPE" port)))
70 (if (eqv? #\> (peek-next-char port))
74 (cons* fragment "]" (loop)))))))
76 ;; Ideas for the future for this interface:
78 ;; * Allow doctypes to provide parsed entities
80 ;; * Allow validation (the ELEMENTS value from the DOCTYPE handler
83 ;; * Parse internal DTDs
85 ;; * Parse external DTDs
87 (define* (xml->sxml #:optional (string-or-port (current-input-port)) #:key
89 (declare-namespaces? #t)
92 (default-entity-handler #f)
94 "Use SSAX to parse an XML document into SXML. Takes one optional
95 argument, @var{string-or-port}, which defaults to the current input
97 ;; NAMESPACES: alist of PREFIX -> URI. Specifies the symbol prefix
98 ;; that the user wants on elements of a given namespace in the
99 ;; resulting SXML, regardless of the abbreviated namespaces defined in
100 ;; the document by xmlns attributes. If DECLARE-NAMESPACES? is true,
101 ;; these namespaces are treated as if they were declared in the DTD.
103 ;; ENTITIES: alist of SYMBOL -> STRING.
105 ;; NAMESPACES: list of (DOC-PREFIX . (USER-PREFIX . URI)).
106 ;; A DOC-PREFIX of #f indicates that it comes from the user.
107 ;; Otherwise, prefixes are symbols.
108 (define (munge-namespaces namespaces)
111 ((prefix . uri-string)
112 (cons* (and declare-namespaces? prefix)
114 (ssax:uri-string->symbol uri-string)))))
117 (define (user-namespaces)
118 (munge-namespaces namespaces))
120 (define (user-entities)
121 (if (and default-entity-handler
122 (not (assq '*DEFAULT* entities)))
123 (acons '*DEFAULT* default-entity-handler entities)
126 (define (name->sxml name)
128 ((prefix . local-part)
129 (symbol-append prefix (string->symbol ":") local-part))
132 (define (doctype-continuation seed)
133 (lambda* (#:key (entities '()) (namespaces '()))
135 (append entities (user-entities))
136 (append (munge-namespaces namespaces) (user-namespaces))
139 ;; The SEED in this parser is the SXML: initialized to '() at each new
140 ;; level by the fdown handlers; built in reverse by the fhere parsers;
141 ;; and reverse-collected by the fup handlers.
144 NEW-LEVEL-SEED ; fdown
145 (lambda (elem-gi attributes namespaces expected-content seed)
149 (lambda (elem-gi attributes namespaces parent-seed seed)
150 (let ((seed (if trim-whitespace?
151 (ssax:reverse-collect-str-drop-ws seed)
152 (ssax:reverse-collect-str seed)))
155 (cons (list (name->sxml (car attr)) (cdr attr))
158 (acons (name->sxml elem-gi)
161 (cons (cons '@ attrs) seed))
164 CHAR-DATA-HANDLER ; fhere
165 (lambda (string1 string2 seed)
166 (if (string-null? string2)
168 (cons* string2 string1 seed)))
171 ;; -> ELEMS ENTITIES NAMESPACES SEED
173 ;; ELEMS is for validation and currently unused.
175 ;; ENTITIES is an alist of parsed entities (symbol -> string).
177 ;; NAMESPACES is as above.
179 ;; SEED builds up the content.
180 (lambda (port docname systemid internal-subset? seed)
185 (doctype-handler docname systemid
186 (and internal-subset?
187 (read-internal-doctype-as-string port))))
189 (when internal-subset?
190 (ssax:skip-internal-dtd port))
192 (doctype-continuation seed)))
195 ;; This is like the DOCTYPE handler, but for documents that do not
196 ;; have a <!DOCTYPE!> entry.
197 (lambda (elem-gi seed)
201 (doctype-handler #f #f #f)
203 (doctype-continuation seed)))
207 . (lambda (port pi-tag seed)
209 (list '*PI* pi-tag (ssax:read-pi-body-as-string port))
212 (let* ((port (if (string? string-or-port)
213 (open-input-string string-or-port)
215 (elements (reverse (parser port '()))))
216 `(*TOP* ,@elements)))
219 (let ((*good-cache* (make-hash-table)))
221 (if (not (hashq-ref *good-cache* name))
222 (let* ((str (symbol->string name))
223 (i (string-index str #\:))
224 (head (or (and i (substring str 0 i)) str))
225 (tail (and i (substring str (1+ i)))))
226 (and i (string-index (substring str (1+ i)) #\:)
227 (error "Invalid QName: more than one colon" name))
231 (or (char-alphabetic? (string-ref s 0))
232 (eq? (string-ref s 0) #\_)
233 (error "Invalid name starting character" s name))
236 (or (char-alphabetic? c) (string-index "0123456789.-_" c)
237 (error "Invalid name character" c s name)))
240 (hashq-set! *good-cache* name #t))))))
242 ;; The following two functions serialize tags and attributes. They are
243 ;; being used in the node handlers for the post-order function, see
246 (define (attribute-value->xml value port)
249 (attribute-value->xml (car value) port)
250 (attribute-value->xml (cdr value) port))
254 (string->escaped-xml value port))
256 (with-output-to-port port value))
259 (call-with-output-string (lambda (port) (display value port)))
262 (define (attribute->xml attr value port)
266 (attribute-value->xml value port)
269 (define (element->xml tag attrs body port)
274 (let lp ((attrs attrs))
276 (let ((attr (car attrs)))
277 (display #\space port)
279 (attribute->xml (car attr) (cdr attr) port)
280 (error "bad attribute" tag attr))
282 (if (not (null? attrs))
283 (error "bad attributes" tag attrs)))))
287 (let lp ((body body))
290 (sxml->xml (car body) port)
297 (error "bad element body" tag body)))))
298 (display " />" port)))
300 ;; FIXME: ensure name is valid
301 (define (entity->xml name port)
306 ;; FIXME: ensure tag and str are valid
307 (define (pi->xml tag str port)
310 (display #\space port)
314 (define* (sxml->xml tree #:optional (port (current-output-port)))
315 "Serialize the sxml tree @var{tree} as XML. The output will be written
316 to the current output port, unless the optional argument @var{port} is
320 (if (symbol? (car tree))
322 (let ((tag (car tree)))
325 (sxml->xml (cdr tree) port))
327 (if (and (list? (cdr tree)) (= (length (cdr tree)) 1))
328 (entity->xml (cadr tree) port)
329 (error "bad *ENTITY* args" (cdr tree))))
331 (if (and (list? (cdr tree)) (= (length (cdr tree)) 2))
332 (pi->xml (cadr tree) (caddr tree) port)
333 (error "bad *PI* args" (cdr tree))))
335 (let* ((elems (cdr tree))
336 (attrs (and (pair? elems) (pair? (car elems))
337 (eq? '@ (caar elems))
339 (element->xml tag attrs (if attrs (cdr elems) elems) port)))))
341 (for-each (lambda (x) (sxml->xml x port)) tree)))
343 (string->escaped-xml tree port))
344 ((null? tree) *unspecified*)
345 ((not tree) *unspecified*)
346 ((eqv? tree #t) *unspecified*)
348 (with-output-to-port port tree))
351 (call-with-output-string (lambda (port) (display tree port)))
354 (define (sxml->string sxml)
355 "Detag an sxml tree @var{sxml} into a string. Does not perform any
357 (string-concatenate-reverse
359 (lambda (seed tree) ; fdown
361 (lambda (seed kid-seed tree) ; fup
362 (append! kid-seed seed))
363 (lambda (seed tree) ; fhere
364 (if (string? tree) (cons tree seed) seed))
368 (define (make-char-quotator char-encoding)
369 (let ((bad-chars (list->char-set (map car char-encoding))))
371 ;; Check to see if str contains one of the characters in charset,
372 ;; from the position i onward. If so, return that character's index.
373 ;; otherwise, return #f
374 (define (index-cset str i charset)
375 (string-index str charset i))
377 ;; The body of the function
379 (let ((bad-pos (index-cset str 0 bad-chars)))
381 (display str port) ; str had all good chars
382 (let loop ((from 0) (to bad-pos))
384 ((>= from (string-length str)) *unspecified*)
386 (display (substring str from (string-length str)) port))
389 (cdr (assv (string-ref str to) char-encoding)))
391 (index-cset str (+ 1 to) bad-chars)))
393 (display (substring str from to) port))
394 (display quoted-char port)
395 (loop (1+ to) new-to))))))))))
397 ;; Given a string, check to make sure it does not contain characters
398 ;; such as '<' or '&' that require encoding. Return either the original
399 ;; string, or a list of string fragments with special characters
400 ;; replaced by appropriate character entities.
402 (define string->escaped-xml
404 '((#\< . "<") (#\> . ">") (#\& . "&") (#\" . """))))
406 ;;; arch-tag: 9c853b25-d82f-42ef-a959-ae26fdc7d1ac
407 ;;; simple.scm ends here