(browse-url-browser-function)
[bpt/emacs.git] / lisp / xml.el
CommitLineData
1cd7adc6 1;;; xml.el --- XML parser
47db06aa 2
2e9bdf15 3;; Copyright (C) 2000, 01, 03, 2004 Free Software Foundation, Inc.
47db06aa
GM
4
5;; Author: Emmanuel Briot <briot@gnat.com>
720058f2 6;; Maintainer: Mark A. Hershberger <mah@everybody.org>
a98e819b 7;; Keywords: xml, data
47db06aa
GM
8
9;; This file is part of GNU Emacs.
10
11;; GNU Emacs is free software; you can redistribute it and/or modify
12;; it under the terms of the GNU General Public License as published by
13;; the Free Software Foundation; either version 2, or (at your option)
14;; any later version.
15
16;; GNU Emacs is distributed in the hope that it will be useful,
17;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19;; GNU General Public License for more details.
20
21;; You should have received a copy of the GNU General Public License
22;; along with GNU Emacs; see the file COPYING. If not, write to the
23;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24;; Boston, MA 02111-1307, USA.
25
26;;; Commentary:
27
a98e819b
DL
28;; This file contains a somewhat incomplete non-validating XML parser. It
29;; parses a file, and returns a list that can be used internally by
a1dfa9a3 30;; any other Lisp libraries.
47db06aa
GM
31
32;;; FILE FORMAT
33
a98e819b
DL
34;; The document type declaration may either be ignored or (optionally)
35;; parsed, but currently the parsing will only accept element
a1dfa9a3 36;; declarations. The XML file is assumed to be well-formed. In case
a98e819b
DL
37;; of error, the parsing stops and the XML file is shown where the
38;; parsing stopped.
47db06aa 39;;
a98e819b 40;; It also knows how to ignore comments and processing instructions.
47db06aa
GM
41;;
42;; The XML file should have the following format:
653558a1
GM
43;; <node1 attr1="name1" attr2="name2" ...>value
44;; <node2 attr3="name3" attr4="name4">value2</node2>
45;; <node3 attr5="name5" attr6="name6">value3</node3>
47db06aa 46;; </node1>
a1dfa9a3 47;; Of course, the name of the nodes and attributes can be anything. There can
47db06aa
GM
48;; be any number of attributes (or none), as well as any number of children
49;; below the nodes.
50;;
51;; There can be only top level node, but with any number of children below.
52
53;;; LIST FORMAT
54
c7f8d055
SM
55;; The functions `xml-parse-file', `xml-parse-region' and
56;; `xml-parse-tag' return a list with the following format:
47db06aa
GM
57;;
58;; xml-list ::= (node node ...)
c7f8d055 59;; node ::= (qname attribute-list . child_node_list)
47db06aa
GM
60;; child_node_list ::= child_node child_node ...
61;; child_node ::= node | string
c7f8d055
SM
62;; qname ::= (:namespace-uri . "name") | "name"
63;; attribute_list ::= ((qname . "value") (qname . "value") ...)
47db06aa
GM
64;; | nil
65;; string ::= "..."
66;;
a98e819b
DL
67;; Some macros are provided to ease the parsing of this list.
68;; Whitespace is preserved. Fixme: There should be a tree-walker that
69;; can remove it.
47db06aa 70
c7f8d055
SM
71;; TODO:
72;; * xml:base, xml:space support
73;; * more complete DOCTYPE parsing
74;; * pi support
75
47db06aa
GM
76;;; Code:
77
a98e819b
DL
78;; Note that {buffer-substring,match-string}-no-properties were
79;; formerly used in several places, but that removes composition info.
80
47db06aa
GM
81;;*******************************************************************
82;;**
83;;** Macros to parse the list
84;;**
85;;*******************************************************************
86
f8ab034e
MH
87(defconst xml-undefined-entity "?"
88 "What to substitute for undefined entities")
89
6d12a4df
MH
90(defvar xml-entity-alist
91 '(("lt" . "<")
92 ("gt" . ">")
93 ("apos" . "'")
94 ("quot" . "\"")
95 ("amp" . "&"))
96 "The defined entities. Entities are added to this when the DTD is parsed.")
97
98(defvar xml-sub-parser nil
99 "Dynamically set this to a non-nil value if you want to parse an XML fragment.")
100
101(defvar xml-validating-parser nil
102 "Set to non-nil to get validity checking.")
103
971489ea 104(defsubst xml-node-name (node)
47db06aa 105 "Return the tag associated with NODE.
a1dfa9a3
SM
106Without namespace-aware parsing, the tag is a symbol.
107
108With namespace-aware parsing, the tag is a cons of a string
109representing the uri of the namespace with the local name of the
110tag. For example,
111
112 <foo>
113
114would be represented by
115
116 '(\"\" . \"foo\")."
117
971489ea 118 (car node))
47db06aa 119
971489ea 120(defsubst xml-node-attributes (node)
47db06aa
GM
121 "Return the list of attributes of NODE.
122The list can be nil."
971489ea 123 (nth 1 node))
47db06aa 124
971489ea 125(defsubst xml-node-children (node)
47db06aa
GM
126 "Return the list of children of NODE.
127This is a list of nodes, and it can be nil."
971489ea 128 (cddr node))
47db06aa
GM
129
130(defun xml-get-children (node child-name)
131 "Return the children of NODE whose tag is CHILD-NAME.
a1dfa9a3 132CHILD-NAME should match the value returned by `xml-node-name'."
971489ea
SM
133 (let ((match ()))
134 (dolist (child (xml-node-children node))
a1dfa9a3
SM
135 (if (and (listp child)
136 (equal (xml-node-name child) child-name))
137 (push child match)))
971489ea 138 (nreverse match)))
47db06aa 139
9bcd6a7e 140(defun xml-get-attribute-or-nil (node attribute)
47db06aa 141 "Get from NODE the value of ATTRIBUTE.
a1dfa9a3 142Return nil if the attribute was not found.
9bcd6a7e
EZ
143
144See also `xml-get-attribute'."
2e9bdf15 145 (cdr (assoc attribute (xml-node-attributes node))))
9bcd6a7e
EZ
146
147(defsubst xml-get-attribute (node attribute)
148 "Get from NODE the value of ATTRIBUTE.
149An empty string is returned if the attribute was not found.
150
151See also `xml-get-attribute-or-nil'."
152 (or (xml-get-attribute-or-nil node attribute) ""))
47db06aa
GM
153
154;;*******************************************************************
155;;**
156;;** Creating the list
157;;**
158;;*******************************************************************
159
a98e819b 160;;;###autoload
2d42509a 161(defun xml-parse-file (file &optional parse-dtd parse-ns)
a98e819b
DL
162 "Parse the well-formed XML file FILE.
163If FILE is already visited, use its buffer and don't kill it.
47db06aa 164Returns the top node with all its children.
2d42509a
JB
165If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
166If PARSE-NS is non-nil, then QNAMES are expanded."
653558a1
GM
167 (let ((keep))
168 (if (get-file-buffer file)
169 (progn
170 (set-buffer (get-file-buffer file))
171 (setq keep (point)))
a98e819b
DL
172 (let (auto-mode-alist) ; no need for xml-mode
173 (find-file file)))
524425ae 174
653558a1
GM
175 (let ((xml (xml-parse-region (point-min)
176 (point-max)
177 (current-buffer)
2d42509a 178 parse-dtd parse-ns)))
653558a1
GM
179 (if keep
180 (goto-char keep)
181 (kill-buffer (current-buffer)))
182 xml)))
47db06aa 183
6d12a4df 184
5ed32352
RS
185(defvar xml-name-re)
186(defvar xml-entity-value-re)
63b446bc 187(let* ((start-chars (concat "[:alpha:]:_"))
6d12a4df
MH
188 (name-chars (concat "-[:digit:]." start-chars))
189;;[3] S ::= (#x20 | #x9 | #xD | #xA)+
190 (whitespace "[ \t\n\r]"))
191;;[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
192;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
193;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
194;; | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
195 (defvar xml-name-start-char-re (concat "[" start-chars "]"))
196;;[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
197 (defvar xml-name-char-re (concat "[" name-chars "]"))
198;;[5] Name ::= NameStartChar (NameChar)*
199 (defvar xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
200;;[6] Names ::= Name (#x20 Name)*
201 (defvar xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
202;;[7] Nmtoken ::= (NameChar)+
203 (defvar xml-nmtoken-re (concat xml-name-char-re "+"))
204;;[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
205 (defvar xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
206;;[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
207 (defvar xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
208;;[68] EntityRef ::= '&' Name ';'
209 (defvar xml-entity-ref (concat "&" xml-name-re ";"))
210;;[69] PEReference ::= '%' Name ';'
211 (defvar xml-pe-reference-re (concat "%" xml-name-re ";"))
212;;[67] Reference ::= EntityRef | CharRef
213 (defvar xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
23d519e4
MH
214;;[10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
215 (defvar xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|"
216 "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)"))
217;;[56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID per Element Type] [VC: ID Attribute Default]
218;; | 'IDREF' [VC: IDREF]
219;; | 'IDREFS' [VC: IDREF]
220;; | 'ENTITY' [VC: Entity Name]
221;; | 'ENTITIES' [VC: Entity Name]
222;; | 'NMTOKEN' [VC: Name Token]
223;; | 'NMTOKENS' [VC: Name Token]
224 (defvar xml-tokenized-type-re "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|ENTITIES\\|NMTOKEN\\|NMTOKENS\\)")
225;;[58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
226 (defvar xml-notation-type-re (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re
227 "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*" whitespace "*)\\)"))
228;;[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration] [VC: No Duplicate Tokens]
229 (defvar xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re
230 "\\(?:" whitespace "*|" whitespace "*" xml-nmtoken-re "\\)*"
231 whitespace ")\\)"))
232;;[57] EnumeratedType ::= NotationType | Enumeration
233 (defvar xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re "\\|" xml-enumeration-re "\\)"))
234;;[54] AttType ::= StringType | TokenizedType | EnumeratedType
235;;[55] StringType ::= 'CDATA'
236 (defvar xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re "\\|" xml-notation-type-re"\\|" xml-enumerated-type-re "\\)"))
237;;[60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
238 (defvar xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED" whitespace "\\)*" xml-att-value-re "\\)"))
239;;[53] AttDef ::= S Name S AttType S DefaultDecl
240 (defvar xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re
241 whitespace "*" xml-att-type-re
242 whitespace "*" xml-default-decl-re "\\)"))
6d12a4df
MH
243;;[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
244;; | "'" ([^%&'] | PEReference | Reference)* "'"
245 (defvar xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re
246 "\\|" xml-reference-re "\\)*\"\\|'\\(?:[^%&']\\|"
247 xml-pe-reference-re "\\|" xml-reference-re "\\)*'\\)")))
248;;[75] ExternalID ::= 'SYSTEM' S SystemLiteral
249;; | 'PUBLIC' S PubidLiteral S SystemLiteral
250;;[76] NDataDecl ::= S 'NDATA' S
251;;[73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
252;;[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
253;;[74] PEDef ::= EntityValue | ExternalID
254;;[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
255;;[70] EntityDecl ::= GEDecl | PEDecl
256
a98e819b
DL
257;; Note that this is setup so that we can do whitespace-skipping with
258;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
259;; compared with `re-search-forward', but that has been fixed. Also
260;; note that the standard syntax table contains other characters with
261;; whitespace syntax, like NBSP, but they are invalid in contexts in
262;; which we might skip whitespace -- specifically, they're not
263;; NameChars [XML 4].
264
265(defvar xml-syntax-table
266 (let ((table (make-syntax-table)))
267 ;; Get space syntax correct per XML [3].
268 (dotimes (c 31)
269 (modify-syntax-entry c "." table)) ; all are space in standard table
270 (dolist (c '(?\t ?\n ?\r)) ; these should be space
271 (modify-syntax-entry c " " table))
272 ;; For skipping attributes.
273 (modify-syntax-entry ?\" "\"" table)
274 (modify-syntax-entry ?' "\"" table)
275 ;; Non-alnum name chars should be symbol constituents (`-' and `_'
276 ;; are OK by default).
277 (modify-syntax-entry ?. "_" table)
278 (modify-syntax-entry ?: "_" table)
279 ;; XML [89]
280 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
281 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
282 #x30FD #x30FE))
283 (modify-syntax-entry (decode-char 'ucs c) "w" table))
284 ;; Fixme: rest of [4]
285 table)
286 "Syntax table used by `xml-parse-region'.")
287
288;; XML [5]
289;; Note that [:alpha:] matches all multibyte chars with word syntax.
ab161457
JPW
290(eval-and-compile
291 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
a98e819b
DL
292
293;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
294;; document ::= prolog element Misc*
295;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
296
297;;;###autoload
2d42509a 298(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
47db06aa
GM
299 "Parse the region from BEG to END in BUFFER.
300If BUFFER is nil, it defaults to the current buffer.
301Returns the XML list for the region, or raises an error if the region
2d42509a 302is not well-formed XML.
47db06aa 303If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
2d42509a
JB
304and returned as the first element of the list.
305If PARSE-NS is non-nil, then QNAMES are expanded."
a98e819b
DL
306 (save-restriction
307 (narrow-to-region beg end)
308 ;; Use fixed syntax table to ensure regexp char classes and syntax
309 ;; specs DTRT.
310 (with-syntax-table (standard-syntax-table)
311 (let ((case-fold-search nil) ; XML is case-sensitive.
312 xml result dtd)
313 (save-excursion
314 (if buffer
315 (set-buffer buffer))
316 (goto-char (point-min))
317 (while (not (eobp))
318 (if (search-forward "<" nil t)
319 (progn
320 (forward-char -1)
34638996 321 (setq result (xml-parse-tag parse-dtd parse-ns))
6d12a4df 322 (if (and xml result (not xml-sub-parser))
a98e819b 323 ;; translation of rule [1] of XML specifications
6d12a4df 324 (error "XML: (Not Well-Formed) Only one root tag allowed")
47db06aa 325 (cond
971489ea 326 ((null result))
34638996
EZ
327 ((and (listp (car result))
328 parse-dtd)
971489ea 329 (setq dtd (car result))
a98e819b
DL
330 (if (cdr result) ; possible leading comment
331 (add-to-list 'xml (cdr result))))
47db06aa 332 (t
a98e819b
DL
333 (add-to-list 'xml result)))))
334 (goto-char (point-max))))
335 (if parse-dtd
336 (cons dtd (nreverse xml))
337 (nreverse xml)))))))
47db06aa 338
c7f8d055 339(defun xml-maybe-do-ns (name default xml-ns)
a1dfa9a3
SM
340 "Perform any namespace expansion.
341NAME is the name to perform the expansion on.
c7f8d055
SM
342DEFAULT is the default namespace. XML-NS is a cons of namespace
343names to uris. When namespace-aware parsing is off, then XML-NS
344is nil.
345
346During namespace-aware parsing, any name without a namespace is
347put into the namespace identified by DEFAULT. nil is used to
348specify that the name shouldn't be given a namespace."
349 (if (consp xml-ns)
350 (let* ((nsp (string-match ":" name))
351 (lname (if nsp (substring name (match-end 0)) name))
352 (prefix (if nsp (substring name 0 (match-beginning 0)) default))
353 (special (and (string-equal lname "xmlns") (not prefix)))
354 ;; Setting default to nil will insure that there is not
355 ;; matching cons in xml-ns. In which case we
356 (ns (or (cdr (assoc (if special "xmlns" prefix)
357 xml-ns))
6d12a4df 358 "")))
c7f8d055
SM
359 (cons ns (if special "" lname)))
360 (intern name)))
47db06aa 361
6d12a4df
MH
362(defun xml-parse-fragment (&optional parse-dtd parse-ns)
363 "Parse xml-like fragments."
364 (let ((xml-sub-parser t)
365 children)
366 (while (not (eobp))
367 (let ((bit (xml-parse-tag
368 parse-dtd parse-ns)))
369 (if children
370 (setq children (append (list bit) children))
371 (if (stringp bit)
372 (setq children (list bit))
373 (setq children bit)))))
374 (reverse children)))
375
2d42509a 376(defun xml-parse-tag (&optional parse-dtd parse-ns)
a98e819b 377 "Parse the tag at point.
47db06aa
GM
378If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
379returned as the first element in the list.
2d42509a 380If PARSE-NS is non-nil, then QNAMES are expanded.
47db06aa 381Returns one of:
a98e819b
DL
382 - a list : the matching node
383 - nil : the point is not looking at a tag.
384 - a pair : the first element is the DTD, the second is the node."
6d12a4df
MH
385 (let ((xml-validating-parser (or parse-dtd xml-validating-parser))
386 (xml-ns (if (consp parse-ns)
2d42509a
JB
387 parse-ns
388 (if parse-ns
389 (list
c7f8d055 390 ;; Default for empty prefix is no namespace
6d12a4df 391 (cons "" "")
c7f8d055 392 ;; "xml" namespace
6d12a4df 393 (cons "xml" "http://www.w3.org/XML/1998/namespace")
2d42509a 394 ;; We need to seed the xmlns namespace
6d12a4df 395 (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
2d42509a
JB
396 (cond
397 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
398 ;; beginning of a document).
399 ((looking-at "<\\?")
400 (search-forward "?>")
401 (skip-syntax-forward " ")
402 (xml-parse-tag parse-dtd xml-ns))
403 ;; Character data (CDATA) sections, in which no tag should be interpreted
404 ((looking-at "<!\\[CDATA\\[")
405 (let ((pos (match-end 0)))
406 (unless (search-forward "]]>" nil t)
6d12a4df 407 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
ae026110
MH
408 (concat
409 (buffer-substring pos (match-beginning 0))
410 (xml-parse-string))))
2d42509a
JB
411 ;; DTD for the document
412 ((looking-at "<!DOCTYPE")
6d12a4df
MH
413 (let ((dtd (xml-parse-dtd parse-ns)))
414 (skip-syntax-forward " ")
415 (if xml-validating-parser
416 (cons dtd (xml-parse-tag nil xml-ns))
417 (xml-parse-tag nil xml-ns))))
2d42509a
JB
418 ;; skip comments
419 ((looking-at "<!--")
420 (search-forward "-->")
421 nil)
422 ;; end tag
423 ((looking-at "</")
424 '())
425 ;; opening tag
426 ((looking-at "<\\([^/>[:space:]]+\\)")
427 (goto-char (match-end 1))
34638996
EZ
428
429 ;; Parse this node
2d42509a 430 (let* ((node-name (match-string 1))
c7f8d055
SM
431 ;; Parse the attribute list.
432 (attrs (xml-parse-attlist xml-ns))
433 children pos)
434
435 ;; add the xmlns:* attrs to our cache
436 (when (consp xml-ns)
437 (dolist (attr attrs)
438 (when (and (consp (car attr))
6d12a4df
MH
439 (equal "http://www.w3.org/2000/xmlns/"
440 (caar attr)))
441 (push (cons (cdar attr) (cdr attr))
c7f8d055
SM
442 xml-ns))))
443
43b5fd81 444 (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
c7f8d055 445
2d42509a
JB
446 ;; is this an empty element ?
447 (if (looking-at "/>")
47db06aa 448 (progn
6d12a4df 449 (forward-char 2)
971489ea 450 (nreverse children))
6d12a4df
MH
451
452 ;; is this a valid start tag ?
453 (if (eq (char-after) ?>)
454 (progn
455 (forward-char 1)
456 ;; Now check that we have the right end-tag. Note that this
457 ;; one might contain spaces after the tag name
458 (let ((end (concat "</" node-name "\\s-*>")))
459 (while (not (looking-at end))
460 (cond
461 ((looking-at "</")
462 (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
463 node-name (point)))
464 ((= (char-after) ?<)
465 (let ((tag (xml-parse-tag nil xml-ns)))
466 (when tag
467 (push tag children))))
468 (t
469 (let ((expansion (xml-parse-string)))
470 (setq children
471 (if (stringp expansion)
472 (if (stringp (car children))
473 ;; The two strings were separated by a comment.
474 (setq children (append (concat (car children) expansion)
475 (cdr children)))
476 (setq children (append (list expansion) children)))
477 (setq children (append expansion children))))))))
478
479 (goto-char (match-end 0))
480 (nreverse children)))
481 ;; This was an invalid start tag (Expected ">", but didn't see it.)
482 (error "XML: (Well-Formed) Couldn't parse tag: %s"
483 (buffer-substring (- (point) 10) (+ (point) 1)))))))
484 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
485 (unless xml-sub-parser ; Usually, we error out.
486 (error "XML: (Well-Formed) Invalid character"))
487
488 ;; However, if we're parsing incrementally, then we need to deal
489 ;; with stray CDATA.
490 (xml-parse-string)))))
491
492(defun xml-parse-string ()
493 "Parse the next whatever. Could be a string, or an element."
494 (let* ((pos (point))
495 (string (progn (if (search-forward "<" nil t)
496 (forward-char -1)
497 (goto-char (point-max)))
498 (buffer-substring pos (point)))))
499 ;; Clean up the string. As per XML specifications, the XML
500 ;; processor should always pass the whole string to the
501 ;; application. But \r's should be replaced:
502 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
503 (setq pos 0)
504 (while (string-match "\r\n?" string pos)
505 (setq string (replace-match "\n" t t string))
506 (setq pos (1+ (match-beginning 0))))
507
508 (xml-substitute-special string)))
47db06aa 509
c7f8d055 510(defun xml-parse-attlist (&optional xml-ns)
a1dfa9a3
SM
511 "Return the attribute-list after point.
512Leave point at the first non-blank character after the tag."
971489ea 513 (let ((attlist ())
34638996 514 end-pos name)
a98e819b
DL
515 (skip-syntax-forward " ")
516 (while (looking-at (eval-when-compile
517 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
c7f8d055
SM
518 (setq end-pos (match-end 0))
519 (setq name (xml-maybe-do-ns (match-string 1) nil xml-ns))
520 (goto-char end-pos)
47db06aa 521
a158ff81
JB
522 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
523
47db06aa
GM
524 ;; Do we have a string between quotes (or double-quotes),
525 ;; or a simple word ?
a158ff81 526 (if (looking-at "\"\\([^\"]*\\)\"")
34638996 527 (setq end-pos (match-end 0))
f0ec1711 528 (if (looking-at "'\\([^']*\\)'")
34638996 529 (setq end-pos (match-end 0))
6d12a4df 530 (error "XML: (Not Well-Formed) Attribute values must be given between quotes")))
47db06aa
GM
531
532 ;; Each attribute must be unique within a given element
533 (if (assoc name attlist)
6d12a4df 534 (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
524425ae 535
a158ff81
JB
536 ;; Multiple whitespace characters should be replaced with a single one
537 ;; in the attributes
a98e819b 538 (let ((string (match-string 1))
a158ff81 539 (pos 0))
a98e819b 540 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
6d12a4df
MH
541 (let ((expansion (xml-substitute-special string)))
542 (unless (stringp expansion)
543 ; We say this is the constraint. It is acctually that
544 ; external entities nor "<" can be in an attribute value.
545 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
546 (push (cons name expansion) attlist)))
a158ff81 547
34638996 548 (goto-char end-pos)
a98e819b 549 (skip-syntax-forward " "))
971489ea 550 (nreverse attlist)))
47db06aa
GM
551
552;;*******************************************************************
553;;**
554;;** The DTD (document type declaration)
555;;** The following functions know how to skip or parse the DTD of
556;;** a document
557;;**
558;;*******************************************************************
559
a98e819b
DL
560;; Fixme: This fails at least if the DTD contains conditional sections.
561
562(defun xml-skip-dtd ()
563 "Skip the DTD at point.
47db06aa 564This follows the rule [28] in the XML specifications."
6d12a4df
MH
565 (let ((xml-validating-parser nil))
566 (xml-parse-dtd)))
47db06aa 567
6d12a4df 568(defun xml-parse-dtd (&optional parse-ns)
a98e819b
DL
569 "Parse the DTD at point."
570 (forward-char (eval-when-compile (length "<!DOCTYPE")))
571 (skip-syntax-forward " ")
6d12a4df
MH
572 (if (and (looking-at ">")
573 xml-validating-parser)
574 (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
524425ae 575
971489ea 576 ;; Get the name of the document
a98e819b
DL
577 (looking-at xml-name-regexp)
578 (let ((dtd (list (match-string 0) 'dtd))
971489ea 579 type element end-pos)
47db06aa
GM
580 (goto-char (match-end 0))
581
a98e819b
DL
582 (skip-syntax-forward " ")
583 ;; XML [75]
584 (cond ((looking-at "PUBLIC\\s-+")
585 (goto-char (match-end 0))
586 (unless (or (re-search-forward
587 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\""
588 nil t)
589 (re-search-forward
590 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
591 nil t))
6d12a4df 592 (error "XML: Missing Public ID"))
a98e819b 593 (let ((pubid (match-string 1)))
6d12a4df 594 (skip-syntax-forward " ")
a98e819b
DL
595 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
596 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 597 (error "XML: Missing System ID"))
a98e819b
DL
598 (push (list pubid (match-string 1) 'public) dtd)))
599 ((looking-at "SYSTEM\\s-+")
600 (goto-char (match-end 0))
601 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
602 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 603 (error "XML: Missing System ID"))
a98e819b
DL
604 (push (list (match-string 1) 'system) dtd)))
605 (skip-syntax-forward " ")
606 (if (eq ?> (char-after))
607 (forward-char)
a98e819b 608 (if (not (eq (char-after) ?\[))
6d12a4df 609 (error "XML: Bad DTD")
a98e819b
DL
610 (forward-char)
611 ;; Parse the rest of the DTD
23d519e4 612 ;; Fixme: Deal with NOTATION, PIs.
a98e819b
DL
613 (while (not (looking-at "\\s-*\\]"))
614 (skip-syntax-forward " ")
615 (cond
616
617 ;; Translation of rule [45] of XML specifications
618 ((looking-at
619 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
620
34638996 621 (setq element (match-string 1)
a98e819b
DL
622 type (match-string-no-properties 2))
623 (setq end-pos (match-end 0))
624
625 ;; Translation of rule [46] of XML specifications
626 (cond
627 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
628 (setq type 'empty))
629 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
630 (setq type 'any))
631 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
632 (setq type (xml-parse-elem-type (match-string 1 type))))
633 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
634 nil)
635 (t
6d12a4df 636 (if xml-validating-parser
07f7e9ae 637 (error "XML: (Validity) Invalid element type in the DTD"))))
27720433 638
a98e819b 639 ;; rule [45]: the element declaration must be unique
6d12a4df
MH
640 (if (and (assoc element dtd)
641 xml-validating-parser)
642 (error "XML: (Validity) Element declarations must be unique in a DTD (<%s>)"
461f3ad0 643 element))
a98e819b
DL
644
645 ;; Store the element in the DTD
646 (push (list element type) dtd)
647 (goto-char end-pos))
23d519e4
MH
648
649 ;; Translation of rule [52] of XML specifications
650 ((looking-at (concat "<!ATTLIST[ \t\n\r]*\\(" xml-name-re
651 "\\)[ \t\n\r]*\\(" xml-att-def-re
652 "\\)*[ \t\n\r]*>"))
653
654 ;; We don't do anything with ATTLIST currently
655 (goto-char (match-end 0)))
656
a98e819b
DL
657 ((looking-at "<!--")
658 (search-forward "-->"))
6d12a4df
MH
659 ((looking-at (concat "<!ENTITY[ \t\n\r]*\\(" xml-name-re
660 "\\)[ \t\n\r]*\\(" xml-entity-value-re
661 "\\)[ \t\n\r]*>"))
23d519e4
MH
662 (let ((name (match-string 1))
663 (value (substring (match-string 2) 1
664 (- (length (match-string 2)) 1))))
665 (goto-char (match-end 0))
6d12a4df
MH
666 (setq xml-entity-alist
667 (append xml-entity-alist
668 (list (cons name
669 (with-temp-buffer
670 (insert value)
671 (goto-char (point-min))
672 (xml-parse-fragment
673 xml-validating-parser
674 parse-ns))))))))
675 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
676 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
677 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
678 (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
679 "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
680 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
681 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
682 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
683 "[ \t\n\r]*>")))
23d519e4
MH
684 (let ((name (match-string 1))
685 (file (substring (match-string 2) 1
686 (- (length (match-string 2)) 1))))
687 (goto-char (match-end 0))
6d12a4df
MH
688 (setq xml-entity-alist
689 (append xml-entity-alist
690 (list (cons name (with-temp-buffer
691 (insert-file-contents file)
692 (goto-char (point-min))
693 (xml-parse-fragment
694 xml-validating-parser
695 parse-ns))))))))
63b446bc
MH
696 ;; skip parameter entity declarations
697 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+%[ \t\n\r]+\\(" xml-name-re
698 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
699 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
700 (looking-at (concat "<!ENTITY[ \t\n\r]+"
701 "%[ \t\n\r]+"
702 "\\(" xml-name-re "\\)[ \t\n\r]+"
703 "PUBLIC[ \t\n\r]+"
704 "\\(\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
705 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'\\)[ \t\n\r]+"
706 "\\(\"[^\"]+\"\\|'[^']+'\\)"
707 "[ \t\n\r]*>")))
708 (goto-char (match-end 0)))
709 ;; skip parameter entities
710 ((looking-at (concat "%" xml-name-re ";"))
711 (goto-char (match-end 0)))
a98e819b 712 (t
7e0f1123 713 (when xml-validating-parser
27720433 714 (error "XML: (Validity) Invalid DTD item"))))))
6d12a4df 715 (if (looking-at "\\s-*]>")
23d519e4 716 (goto-char (match-end 0))))
461f3ad0 717 (nreverse dtd)))
47db06aa
GM
718
719(defun xml-parse-elem-type (string)
a98e819b 720 "Convert element type STRING into a Lisp structure."
47db06aa
GM
721
722 (let (elem modifier)
723 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
724 (progn
725 (setq elem (match-string 1 string)
726 modifier (match-string 2 string))
727 (if (string-match "|" elem)
971489ea 728 (setq elem (cons 'choice
47db06aa
GM
729 (mapcar 'xml-parse-elem-type
730 (split-string elem "|"))))
731 (if (string-match "," elem)
971489ea 732 (setq elem (cons 'seq
47db06aa 733 (mapcar 'xml-parse-elem-type
a98e819b 734 (split-string elem ",")))))))
a158ff81
JB
735 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
736 (setq elem (match-string 1 string)
47db06aa
GM
737 modifier (match-string 2 string))))
738
971489ea
SM
739 (if (and (stringp elem) (string= elem "#PCDATA"))
740 (setq elem 'pcdata))
524425ae 741
971489ea
SM
742 (cond
743 ((string= modifier "+")
744 (list '+ elem))
745 ((string= modifier "*")
746 (list '* elem))
747 ((string= modifier "?")
0fa6f70c 748 (list '\? elem))
971489ea
SM
749 (t
750 elem))))
47db06aa 751
47db06aa
GM
752;;*******************************************************************
753;;**
754;;** Substituting special XML sequences
755;;**
756;;*******************************************************************
757
758(defun xml-substitute-special (string)
a98e819b
DL
759 "Return STRING, after subsituting entity references."
760 ;; This originally made repeated passes through the string from the
761 ;; beginning, which isn't correct, since then either "&amp;amp;" or
762 ;; "&#38;amp;" won't DTRT.
47db06aa 763
6d12a4df
MH
764 (let ((point 0)
765 children end-point)
ae026110 766 (while (string-match "&\\([^;]*\\);" string point)
6d12a4df
MH
767 (setq end-point (match-end 0))
768 (let* ((this-part (match-string 1 string))
769 (prev-part (substring string point (match-beginning 0)))
770 (entity (assoc this-part xml-entity-alist))
771 (expansion
772 (cond ((string-match "#\\([0-9]+\\)" this-part)
773 (let ((c (decode-char
774 'ucs
775 (string-to-number (match-string 1 this-part)))))
776 (if c (string c))))
777 ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
778 (let ((c (decode-char
779 'ucs
780 (string-to-number (match-string 1 this-part) 16))))
781 (if c (string c))))
782 (entity
783 (cdr entity))
ae026110 784 ((eq (length this-part) 0)
27720433 785 (error "XML: (Not Well-Formed) No entity given"))
6d12a4df 786 (t
f8ab034e 787 (if xml-validating-parser
6d12a4df 788 (error "XML: (Validity) Undefined entity `%s'"
f8ab034e
MH
789 this-part)
790 xml-undefined-entity)))))
6d12a4df
MH
791
792 (cond ((null children)
f0d49437
MH
793 ;; FIXME: If we have an entity that expands into XML, this won't work.
794 (setq children
795 (concat prev-part expansion)))
6d12a4df
MH
796 ((stringp children)
797 (if (stringp expansion)
798 (setq children (concat children prev-part expansion))
799 (setq children (list expansion (concat prev-part children)))))
800 ((and (stringp expansion)
801 (stringp (car children)))
802 (setcar children (concat prev-part expansion (car children))))
803 ((stringp expansion)
804 (setq children (append (concat prev-part expansion)
805 children)))
806 ((stringp (car children))
807 (setcar children (concat (car children) prev-part))
808 (setq children (append expansion children)))
809 (t
810 (setq children (list expansion
811 prev-part
812 children))))
813 (setq point end-point)))
814 (cond ((stringp children)
815 (concat children (substring string point)))
816 ((stringp (car (last children)))
a3110b5d 817 (concat (car (last children)) (substring string point)))
6d12a4df
MH
818 ((null children)
819 string)
820 (t
a3110b5d
MH
821 (concat (mapconcat 'identity
822 (nreverse children)
823 "")
824 (substring string point))))))
825
47db06aa
GM
826;;*******************************************************************
827;;**
828;;** Printing a tree.
829;;** This function is intended mainly for debugging purposes.
830;;**
831;;*******************************************************************
832
27240aa4
AS
833(defun xml-debug-print (xml &optional indent-string)
834 "Outputs the XML in the current buffer.
835XML can be a tree or a list of nodes.
836The first line is indented with the optional INDENT-STRING."
837 (setq indent-string (or indent-string ""))
971489ea 838 (dolist (node xml)
27240aa4
AS
839 (xml-debug-print-internal node indent-string)))
840
841(defalias 'xml-print 'xml-debug-print)
47db06aa 842
971489ea 843(defun xml-debug-print-internal (xml indent-string)
47db06aa 844 "Outputs the XML tree in the current buffer.
a98e819b 845The first line is indented with INDENT-STRING."
47db06aa
GM
846 (let ((tree xml)
847 attlist)
a98e819b 848 (insert indent-string ?< (symbol-name (xml-node-name tree)))
524425ae 849
47db06aa 850 ;; output the attribute list
971489ea 851 (setq attlist (xml-node-attributes tree))
47db06aa 852 (while attlist
a98e819b 853 (insert ?\ (symbol-name (caar attlist)) "=\"" (cdar attlist) ?\")
971489ea 854 (setq attlist (cdr attlist)))
524425ae 855
971489ea 856 (setq tree (xml-node-children tree))
47db06aa 857
27240aa4
AS
858 (if (null tree)
859 (insert ?/ ?>)
860 (insert ?>)
861
862 ;; output the children
863 (dolist (node tree)
864 (cond
865 ((listp node)
866 (insert ?\n)
867 (xml-debug-print-internal node (concat indent-string " ")))
868 ((stringp node) (insert node))
869 (t
870 (error "Invalid XML tree"))))
871
872 (when (not (and (null (cdr tree))
873 (stringp (car tree))))
874 (insert ?\n indent-string))
875 (insert ?< ?/ (symbol-name (xml-node-name xml)) ?>))))
47db06aa
GM
876
877(provide 'xml)
878
8a02e193 879;; arch-tag: 5864b283-5a68-4b59-a20d-36a72b353b9b
47db06aa 880;;; xml.el ends here