Merge from emacs-23
[bpt/emacs.git] / lisp / xml.el
CommitLineData
1cd7adc6 1;;; xml.el --- XML parser
47db06aa 2
0d30b337 3;; Copyright (C) 2000, 2001, 2002, 2003, 2004,
5df4f04c 4;; 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
47db06aa
GM
5
6;; Author: Emmanuel Briot <briot@gnat.com>
720058f2 7;; Maintainer: Mark A. Hershberger <mah@everybody.org>
a98e819b 8;; Keywords: xml, data
47db06aa
GM
9
10;; This file is part of GNU Emacs.
11
eb3fa2cf 12;; GNU Emacs is free software: you can redistribute it and/or modify
47db06aa 13;; it under the terms of the GNU General Public License as published by
eb3fa2cf
GM
14;; the Free Software Foundation, either version 3 of the License, or
15;; (at your option) any later version.
47db06aa
GM
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
eb3fa2cf 23;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
47db06aa
GM
24
25;;; Commentary:
26
a98e819b
DL
27;; This file contains a somewhat incomplete non-validating XML parser. It
28;; parses a file, and returns a list that can be used internally by
a1dfa9a3 29;; any other Lisp libraries.
47db06aa
GM
30
31;;; FILE FORMAT
32
a98e819b
DL
33;; The document type declaration may either be ignored or (optionally)
34;; parsed, but currently the parsing will only accept element
a1dfa9a3 35;; declarations. The XML file is assumed to be well-formed. In case
a98e819b
DL
36;; of error, the parsing stops and the XML file is shown where the
37;; parsing stopped.
47db06aa 38;;
a98e819b 39;; It also knows how to ignore comments and processing instructions.
47db06aa
GM
40;;
41;; The XML file should have the following format:
653558a1
GM
42;; <node1 attr1="name1" attr2="name2" ...>value
43;; <node2 attr3="name3" attr4="name4">value2</node2>
44;; <node3 attr5="name5" attr6="name6">value3</node3>
47db06aa 45;; </node1>
a1dfa9a3 46;; Of course, the name of the nodes and attributes can be anything. There can
47db06aa
GM
47;; be any number of attributes (or none), as well as any number of children
48;; below the nodes.
49;;
50;; There can be only top level node, but with any number of children below.
51
52;;; LIST FORMAT
53
c7f8d055
SM
54;; The functions `xml-parse-file', `xml-parse-region' and
55;; `xml-parse-tag' return a list with the following format:
47db06aa
GM
56;;
57;; xml-list ::= (node node ...)
c7f8d055 58;; node ::= (qname attribute-list . child_node_list)
47db06aa
GM
59;; child_node_list ::= child_node child_node ...
60;; child_node ::= node | string
c7f8d055
SM
61;; qname ::= (:namespace-uri . "name") | "name"
62;; attribute_list ::= ((qname . "value") (qname . "value") ...)
47db06aa
GM
63;; | nil
64;; string ::= "..."
65;;
a98e819b
DL
66;; Some macros are provided to ease the parsing of this list.
67;; Whitespace is preserved. Fixme: There should be a tree-walker that
68;; can remove it.
47db06aa 69
c7f8d055
SM
70;; TODO:
71;; * xml:base, xml:space support
72;; * more complete DOCTYPE parsing
73;; * pi support
74
47db06aa
GM
75;;; Code:
76
f6fcdfff
CY
77;; Note that buffer-substring and match-string were formerly used in
78;; several places, because the -no-properties variants remove
79;; composition info. However, after some discussion on emacs-devel,
80;; the consensus was that the speed of the -no-properties variants was
81;; a worthwhile tradeoff especially since we're usually parsing files
82;; instead of hand-crafted XML.
a98e819b 83
47db06aa
GM
84;;*******************************************************************
85;;**
86;;** Macros to parse the list
87;;**
88;;*******************************************************************
89
f8ab034e
MH
90(defconst xml-undefined-entity "?"
91 "What to substitute for undefined entities")
92
6d12a4df
MH
93(defvar xml-entity-alist
94 '(("lt" . "<")
95 ("gt" . ">")
96 ("apos" . "'")
97 ("quot" . "\"")
98 ("amp" . "&"))
99 "The defined entities. Entities are added to this when the DTD is parsed.")
100
101(defvar xml-sub-parser nil
102 "Dynamically set this to a non-nil value if you want to parse an XML fragment.")
103
104(defvar xml-validating-parser nil
105 "Set to non-nil to get validity checking.")
106
971489ea 107(defsubst xml-node-name (node)
47db06aa 108 "Return the tag associated with NODE.
a1dfa9a3
SM
109Without namespace-aware parsing, the tag is a symbol.
110
111With namespace-aware parsing, the tag is a cons of a string
112representing the uri of the namespace with the local name of the
113tag. For example,
114
115 <foo>
116
117would be represented by
118
119 '(\"\" . \"foo\")."
120
971489ea 121 (car node))
47db06aa 122
971489ea 123(defsubst xml-node-attributes (node)
47db06aa
GM
124 "Return the list of attributes of NODE.
125The list can be nil."
971489ea 126 (nth 1 node))
47db06aa 127
971489ea 128(defsubst xml-node-children (node)
47db06aa
GM
129 "Return the list of children of NODE.
130This is a list of nodes, and it can be nil."
971489ea 131 (cddr node))
47db06aa
GM
132
133(defun xml-get-children (node child-name)
134 "Return the children of NODE whose tag is CHILD-NAME.
a1dfa9a3 135CHILD-NAME should match the value returned by `xml-node-name'."
971489ea
SM
136 (let ((match ()))
137 (dolist (child (xml-node-children node))
a1dfa9a3
SM
138 (if (and (listp child)
139 (equal (xml-node-name child) child-name))
140 (push child match)))
971489ea 141 (nreverse match)))
47db06aa 142
9bcd6a7e 143(defun xml-get-attribute-or-nil (node attribute)
47db06aa 144 "Get from NODE the value of ATTRIBUTE.
a1dfa9a3 145Return nil if the attribute was not found.
9bcd6a7e
EZ
146
147See also `xml-get-attribute'."
2e9bdf15 148 (cdr (assoc attribute (xml-node-attributes node))))
9bcd6a7e
EZ
149
150(defsubst xml-get-attribute (node attribute)
151 "Get from NODE the value of ATTRIBUTE.
152An empty string is returned if the attribute was not found.
153
154See also `xml-get-attribute-or-nil'."
155 (or (xml-get-attribute-or-nil node attribute) ""))
47db06aa
GM
156
157;;*******************************************************************
158;;**
159;;** Creating the list
160;;**
161;;*******************************************************************
162
a98e819b 163;;;###autoload
2d42509a 164(defun xml-parse-file (file &optional parse-dtd parse-ns)
a98e819b
DL
165 "Parse the well-formed XML file FILE.
166If FILE is already visited, use its buffer and don't kill it.
47db06aa 167Returns the top node with all its children.
2d42509a
JB
168If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
169If PARSE-NS is non-nil, then QNAMES are expanded."
882cb60d
RS
170 (if (get-file-buffer file)
171 (with-current-buffer (get-file-buffer file)
172 (save-excursion
173 (xml-parse-region (point-min)
174 (point-max)
175 (current-buffer)
176 parse-dtd parse-ns)))
177 (with-temp-buffer
178 (insert-file-contents file)
179 (xml-parse-region (point-min)
180 (point-max)
181 (current-buffer)
182 parse-dtd parse-ns))))
47db06aa 183
6d12a4df 184
5ed32352
RS
185(defvar xml-name-re)
186(defvar xml-entity-value-re)
0bf41002 187(defvar xml-att-def-re)
63b446bc 188(let* ((start-chars (concat "[:alpha:]:_"))
6d12a4df 189 (name-chars (concat "-[:digit:]." start-chars))
5178753d 190 ;;[3] S ::= (#x20 | #x9 | #xD | #xA)+
6d12a4df 191 (whitespace "[ \t\n\r]"))
5178753d
MH
192 ;;[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
193 ;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
194 ;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
195 ;; | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
6d12a4df 196 (defvar xml-name-start-char-re (concat "[" start-chars "]"))
5178753d 197 ;;[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
6d12a4df 198 (defvar xml-name-char-re (concat "[" name-chars "]"))
5178753d 199 ;;[5] Name ::= NameStartChar (NameChar)*
6d12a4df 200 (defvar xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
5178753d 201 ;;[6] Names ::= Name (#x20 Name)*
6d12a4df 202 (defvar xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
5178753d 203 ;;[7] Nmtoken ::= (NameChar)+
6d12a4df 204 (defvar xml-nmtoken-re (concat xml-name-char-re "+"))
5178753d 205 ;;[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
6d12a4df 206 (defvar xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
5178753d 207 ;;[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
6d12a4df 208 (defvar xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
5178753d 209 ;;[68] EntityRef ::= '&' Name ';'
6d12a4df 210 (defvar xml-entity-ref (concat "&" xml-name-re ";"))
5178753d 211 ;;[69] PEReference ::= '%' Name ';'
6d12a4df 212 (defvar xml-pe-reference-re (concat "%" xml-name-re ";"))
5178753d 213 ;;[67] Reference ::= EntityRef | CharRef
6d12a4df 214 (defvar xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
5178753d 215 ;;[10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
23d519e4
MH
216 (defvar xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|"
217 "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)"))
5178753d
MH
218 ;;[56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID per Element Type] [VC: ID Attribute Default]
219 ;; | 'IDREF' [VC: IDREF]
220 ;; | 'IDREFS' [VC: IDREF]
221 ;; | 'ENTITY' [VC: Entity Name]
222 ;; | 'ENTITIES' [VC: Entity Name]
223 ;; | 'NMTOKEN' [VC: Name Token]
224 ;; | 'NMTOKENS' [VC: Name Token]
23d519e4 225 (defvar xml-tokenized-type-re "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|ENTITIES\\|NMTOKEN\\|NMTOKENS\\)")
5178753d 226 ;;[58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
23d519e4
MH
227 (defvar xml-notation-type-re (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re
228 "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*" whitespace "*)\\)"))
5178753d 229 ;;[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration] [VC: No Duplicate Tokens]
23d519e4
MH
230 (defvar xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re
231 "\\(?:" whitespace "*|" whitespace "*" xml-nmtoken-re "\\)*"
232 whitespace ")\\)"))
5178753d 233 ;;[57] EnumeratedType ::= NotationType | Enumeration
23d519e4 234 (defvar xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re "\\|" xml-enumeration-re "\\)"))
5178753d
MH
235 ;;[54] AttType ::= StringType | TokenizedType | EnumeratedType
236 ;;[55] StringType ::= 'CDATA'
23d519e4 237 (defvar xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re "\\|" xml-notation-type-re"\\|" xml-enumerated-type-re "\\)"))
5178753d 238 ;;[60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
23d519e4 239 (defvar xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED" whitespace "\\)*" xml-att-value-re "\\)"))
5178753d 240 ;;[53] AttDef ::= S Name S AttType S DefaultDecl
23d519e4
MH
241 (defvar xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re
242 whitespace "*" xml-att-type-re
243 whitespace "*" xml-default-decl-re "\\)"))
5178753d
MH
244 ;;[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
245 ;; | "'" ([^%&'] | PEReference | Reference)* "'"
6d12a4df
MH
246 (defvar xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re
247 "\\|" xml-reference-re "\\)*\"\\|'\\(?:[^%&']\\|"
248 xml-pe-reference-re "\\|" xml-reference-re "\\)*'\\)")))
249;;[75] ExternalID ::= 'SYSTEM' S SystemLiteral
250;; | 'PUBLIC' S PubidLiteral S SystemLiteral
251;;[76] NDataDecl ::= S 'NDATA' S
252;;[73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
253;;[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
254;;[74] PEDef ::= EntityValue | ExternalID
255;;[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
256;;[70] EntityDecl ::= GEDecl | PEDecl
257
a98e819b
DL
258;; Note that this is setup so that we can do whitespace-skipping with
259;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
260;; compared with `re-search-forward', but that has been fixed. Also
261;; note that the standard syntax table contains other characters with
262;; whitespace syntax, like NBSP, but they are invalid in contexts in
263;; which we might skip whitespace -- specifically, they're not
264;; NameChars [XML 4].
265
266(defvar xml-syntax-table
267 (let ((table (make-syntax-table)))
268 ;; Get space syntax correct per XML [3].
269 (dotimes (c 31)
270 (modify-syntax-entry c "." table)) ; all are space in standard table
5178753d 271 (dolist (c '(?\t ?\n ?\r)) ; these should be space
a98e819b
DL
272 (modify-syntax-entry c " " table))
273 ;; For skipping attributes.
274 (modify-syntax-entry ?\" "\"" table)
275 (modify-syntax-entry ?' "\"" table)
276 ;; Non-alnum name chars should be symbol constituents (`-' and `_'
277 ;; are OK by default).
278 (modify-syntax-entry ?. "_" table)
279 (modify-syntax-entry ?: "_" table)
280 ;; XML [89]
aaaa8abb
MH
281 (unless (featurep 'xemacs)
282 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
283 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
284 #x30FD #x30FE))
285 (modify-syntax-entry (decode-char 'ucs c) "w" table)))
a98e819b
DL
286 ;; Fixme: rest of [4]
287 table)
288 "Syntax table used by `xml-parse-region'.")
289
290;; XML [5]
291;; Note that [:alpha:] matches all multibyte chars with word syntax.
ab161457
JPW
292(eval-and-compile
293 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
a98e819b
DL
294
295;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
296;; document ::= prolog element Misc*
297;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
298
299;;;###autoload
2d42509a 300(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
47db06aa
GM
301 "Parse the region from BEG to END in BUFFER.
302If BUFFER is nil, it defaults to the current buffer.
303Returns the XML list for the region, or raises an error if the region
2d42509a 304is not well-formed XML.
47db06aa 305If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
2d42509a
JB
306and returned as the first element of the list.
307If PARSE-NS is non-nil, then QNAMES are expanded."
39d58fc0
MH
308 ;; Use fixed syntax table to ensure regexp char classes and syntax
309 ;; specs DTRT.
310 (with-syntax-table (standard-syntax-table)
311 (let ((case-fold-search nil) ; XML is case-sensitive.
312 xml result dtd)
313 (save-excursion
314 (if buffer
315 (set-buffer buffer))
316 (save-restriction
317 (narrow-to-region beg end)
a98e819b
DL
318 (goto-char (point-min))
319 (while (not (eobp))
320 (if (search-forward "<" nil t)
321 (progn
322 (forward-char -1)
34638996 323 (setq result (xml-parse-tag parse-dtd parse-ns))
01b229d1
CY
324 (cond
325 ((null result)
326 ;; Not looking at an xml start tag.
327 (forward-char 1))
328 ((and xml (not xml-sub-parser))
329 ;; Translation of rule [1] of XML specifications
330 (error "XML: (Not Well-Formed) Only one root tag allowed"))
331 ((and (listp (car result))
332 parse-dtd)
333 (setq dtd (car result))
334 (if (cdr result) ; possible leading comment
335 (add-to-list 'xml (cdr result))))
336 (t
337 (add-to-list 'xml result))))
a98e819b
DL
338 (goto-char (point-max))))
339 (if parse-dtd
340 (cons dtd (nreverse xml))
341 (nreverse xml)))))))
47db06aa 342
c7f8d055 343(defun xml-maybe-do-ns (name default xml-ns)
a1dfa9a3
SM
344 "Perform any namespace expansion.
345NAME is the name to perform the expansion on.
c7f8d055
SM
346DEFAULT is the default namespace. XML-NS is a cons of namespace
347names to uris. When namespace-aware parsing is off, then XML-NS
348is nil.
349
350During namespace-aware parsing, any name without a namespace is
351put into the namespace identified by DEFAULT. nil is used to
352specify that the name shouldn't be given a namespace."
353 (if (consp xml-ns)
354 (let* ((nsp (string-match ":" name))
355 (lname (if nsp (substring name (match-end 0)) name))
356 (prefix (if nsp (substring name 0 (match-beginning 0)) default))
357 (special (and (string-equal lname "xmlns") (not prefix)))
358 ;; Setting default to nil will insure that there is not
359 ;; matching cons in xml-ns. In which case we
360 (ns (or (cdr (assoc (if special "xmlns" prefix)
361 xml-ns))
6d12a4df 362 "")))
c7f8d055
SM
363 (cons ns (if special "" lname)))
364 (intern name)))
47db06aa 365
6d12a4df
MH
366(defun xml-parse-fragment (&optional parse-dtd parse-ns)
367 "Parse xml-like fragments."
368 (let ((xml-sub-parser t)
369 children)
370 (while (not (eobp))
371 (let ((bit (xml-parse-tag
372 parse-dtd parse-ns)))
373 (if children
374 (setq children (append (list bit) children))
375 (if (stringp bit)
376 (setq children (list bit))
377 (setq children bit)))))
378 (reverse children)))
379
2d42509a 380(defun xml-parse-tag (&optional parse-dtd parse-ns)
a98e819b 381 "Parse the tag at point.
47db06aa
GM
382If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
383returned as the first element in the list.
2d42509a 384If PARSE-NS is non-nil, then QNAMES are expanded.
47db06aa 385Returns one of:
a98e819b
DL
386 - a list : the matching node
387 - nil : the point is not looking at a tag.
388 - a pair : the first element is the DTD, the second is the node."
6d12a4df
MH
389 (let ((xml-validating-parser (or parse-dtd xml-validating-parser))
390 (xml-ns (if (consp parse-ns)
2d42509a
JB
391 parse-ns
392 (if parse-ns
393 (list
5178753d 394 ;; Default for empty prefix is no namespace
6d12a4df 395 (cons "" "")
c7f8d055 396 ;; "xml" namespace
6d12a4df 397 (cons "xml" "http://www.w3.org/XML/1998/namespace")
2d42509a 398 ;; We need to seed the xmlns namespace
6d12a4df 399 (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
2d42509a
JB
400 (cond
401 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
402 ;; beginning of a document).
403 ((looking-at "<\\?")
404 (search-forward "?>")
405 (skip-syntax-forward " ")
406 (xml-parse-tag parse-dtd xml-ns))
407 ;; Character data (CDATA) sections, in which no tag should be interpreted
408 ((looking-at "<!\\[CDATA\\[")
409 (let ((pos (match-end 0)))
410 (unless (search-forward "]]>" nil t)
6d12a4df 411 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
ae026110 412 (concat
f6fcdfff 413 (buffer-substring-no-properties pos (match-beginning 0))
ae026110 414 (xml-parse-string))))
2d42509a
JB
415 ;; DTD for the document
416 ((looking-at "<!DOCTYPE")
6d12a4df
MH
417 (let ((dtd (xml-parse-dtd parse-ns)))
418 (skip-syntax-forward " ")
419 (if xml-validating-parser
420 (cons dtd (xml-parse-tag nil xml-ns))
421 (xml-parse-tag nil xml-ns))))
2d42509a
JB
422 ;; skip comments
423 ((looking-at "<!--")
424 (search-forward "-->")
425 nil)
426 ;; end tag
427 ((looking-at "</")
428 '())
429 ;; opening tag
430 ((looking-at "<\\([^/>[:space:]]+\\)")
431 (goto-char (match-end 1))
34638996
EZ
432
433 ;; Parse this node
f6fcdfff 434 (let* ((node-name (match-string-no-properties 1))
5178753d
MH
435 ;; Parse the attribute list.
436 (attrs (xml-parse-attlist xml-ns))
437 children pos)
c7f8d055 438
5178753d
MH
439 ;; add the xmlns:* attrs to our cache
440 (when (consp xml-ns)
c7f8d055
SM
441 (dolist (attr attrs)
442 (when (and (consp (car attr))
6d12a4df
MH
443 (equal "http://www.w3.org/2000/xmlns/"
444 (caar attr)))
445 (push (cons (cdar attr) (cdr attr))
c7f8d055
SM
446 xml-ns))))
447
5178753d 448 (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
c7f8d055 449
2d42509a
JB
450 ;; is this an empty element ?
451 (if (looking-at "/>")
47db06aa 452 (progn
6d12a4df 453 (forward-char 2)
971489ea 454 (nreverse children))
6d12a4df
MH
455
456 ;; is this a valid start tag ?
457 (if (eq (char-after) ?>)
458 (progn
459 (forward-char 1)
460 ;; Now check that we have the right end-tag. Note that this
461 ;; one might contain spaces after the tag name
462 (let ((end (concat "</" node-name "\\s-*>")))
463 (while (not (looking-at end))
464 (cond
465 ((looking-at "</")
466 (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
467 node-name (point)))
468 ((= (char-after) ?<)
469 (let ((tag (xml-parse-tag nil xml-ns)))
470 (when tag
471 (push tag children))))
472 (t
473 (let ((expansion (xml-parse-string)))
474 (setq children
475 (if (stringp expansion)
476 (if (stringp (car children))
477 ;; The two strings were separated by a comment.
aaaa8abb 478 (setq children (append (list (concat (car children) expansion))
6d12a4df
MH
479 (cdr children)))
480 (setq children (append (list expansion) children)))
481 (setq children (append expansion children))))))))
482
483 (goto-char (match-end 0))
484 (nreverse children)))
485 ;; This was an invalid start tag (Expected ">", but didn't see it.)
486 (error "XML: (Well-Formed) Couldn't parse tag: %s"
f6fcdfff 487 (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
6d12a4df
MH
488 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
489 (unless xml-sub-parser ; Usually, we error out.
490 (error "XML: (Well-Formed) Invalid character"))
491
492 ;; However, if we're parsing incrementally, then we need to deal
493 ;; with stray CDATA.
494 (xml-parse-string)))))
495
496(defun xml-parse-string ()
497 "Parse the next whatever. Could be a string, or an element."
5178753d 498 (let* ((pos (point))
98b69232 499 (string (progn (skip-chars-forward "^<")
f6fcdfff 500 (buffer-substring-no-properties pos (point)))))
5178753d
MH
501 ;; Clean up the string. As per XML specifications, the XML
502 ;; processor should always pass the whole string to the
503 ;; application. But \r's should be replaced:
504 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
505 (setq pos 0)
506 (while (string-match "\r\n?" string pos)
507 (setq string (replace-match "\n" t t string))
508 (setq pos (1+ (match-beginning 0))))
509
510 (xml-substitute-special string)))
47db06aa 511
c7f8d055 512(defun xml-parse-attlist (&optional xml-ns)
a1dfa9a3
SM
513 "Return the attribute-list after point.
514Leave point at the first non-blank character after the tag."
971489ea 515 (let ((attlist ())
34638996 516 end-pos name)
a98e819b
DL
517 (skip-syntax-forward " ")
518 (while (looking-at (eval-when-compile
519 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
c7f8d055 520 (setq end-pos (match-end 0))
f6fcdfff 521 (setq name (xml-maybe-do-ns (match-string-no-properties 1) nil xml-ns))
c7f8d055 522 (goto-char end-pos)
47db06aa 523
a158ff81
JB
524 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
525
47db06aa
GM
526 ;; Do we have a string between quotes (or double-quotes),
527 ;; or a simple word ?
a158ff81 528 (if (looking-at "\"\\([^\"]*\\)\"")
34638996 529 (setq end-pos (match-end 0))
f0ec1711 530 (if (looking-at "'\\([^']*\\)'")
34638996 531 (setq end-pos (match-end 0))
6d12a4df 532 (error "XML: (Not Well-Formed) Attribute values must be given between quotes")))
47db06aa
GM
533
534 ;; Each attribute must be unique within a given element
535 (if (assoc name attlist)
6d12a4df 536 (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
524425ae 537
a158ff81
JB
538 ;; Multiple whitespace characters should be replaced with a single one
539 ;; in the attributes
f6fcdfff 540 (let ((string (match-string-no-properties 1))
a158ff81 541 (pos 0))
a98e819b 542 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
6d12a4df
MH
543 (let ((expansion (xml-substitute-special string)))
544 (unless (stringp expansion)
5178753d
MH
545 ; We say this is the constraint. It is acctually that
546 ; external entities nor "<" can be in an attribute value.
6d12a4df
MH
547 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
548 (push (cons name expansion) attlist)))
a158ff81 549
34638996 550 (goto-char end-pos)
a98e819b 551 (skip-syntax-forward " "))
971489ea 552 (nreverse attlist)))
47db06aa
GM
553
554;;*******************************************************************
555;;**
556;;** The DTD (document type declaration)
557;;** The following functions know how to skip or parse the DTD of
558;;** a document
559;;**
560;;*******************************************************************
561
a98e819b
DL
562;; Fixme: This fails at least if the DTD contains conditional sections.
563
564(defun xml-skip-dtd ()
565 "Skip the DTD at point.
47db06aa 566This follows the rule [28] in the XML specifications."
6d12a4df
MH
567 (let ((xml-validating-parser nil))
568 (xml-parse-dtd)))
47db06aa 569
6d12a4df 570(defun xml-parse-dtd (&optional parse-ns)
a98e819b
DL
571 "Parse the DTD at point."
572 (forward-char (eval-when-compile (length "<!DOCTYPE")))
573 (skip-syntax-forward " ")
6d12a4df
MH
574 (if (and (looking-at ">")
575 xml-validating-parser)
576 (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
524425ae 577
971489ea 578 ;; Get the name of the document
a98e819b 579 (looking-at xml-name-regexp)
f6fcdfff 580 (let ((dtd (list (match-string-no-properties 0) 'dtd))
971489ea 581 type element end-pos)
47db06aa
GM
582 (goto-char (match-end 0))
583
a98e819b
DL
584 (skip-syntax-forward " ")
585 ;; XML [75]
586 (cond ((looking-at "PUBLIC\\s-+")
587 (goto-char (match-end 0))
588 (unless (or (re-search-forward
589 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\""
590 nil t)
591 (re-search-forward
592 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
593 nil t))
6d12a4df 594 (error "XML: Missing Public ID"))
f6fcdfff 595 (let ((pubid (match-string-no-properties 1)))
6d12a4df 596 (skip-syntax-forward " ")
a98e819b
DL
597 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
598 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 599 (error "XML: Missing System ID"))
f6fcdfff 600 (push (list pubid (match-string-no-properties 1) 'public) dtd)))
a98e819b
DL
601 ((looking-at "SYSTEM\\s-+")
602 (goto-char (match-end 0))
603 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
604 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 605 (error "XML: Missing System ID"))
f6fcdfff 606 (push (list (match-string-no-properties 1) 'system) dtd)))
a98e819b
DL
607 (skip-syntax-forward " ")
608 (if (eq ?> (char-after))
609 (forward-char)
a98e819b 610 (if (not (eq (char-after) ?\[))
6d12a4df 611 (error "XML: Bad DTD")
a98e819b
DL
612 (forward-char)
613 ;; Parse the rest of the DTD
23d519e4 614 ;; Fixme: Deal with NOTATION, PIs.
a98e819b
DL
615 (while (not (looking-at "\\s-*\\]"))
616 (skip-syntax-forward " ")
617 (cond
618
619 ;; Translation of rule [45] of XML specifications
620 ((looking-at
621 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
622
f6fcdfff 623 (setq element (match-string-no-properties 1)
a98e819b
DL
624 type (match-string-no-properties 2))
625 (setq end-pos (match-end 0))
626
627 ;; Translation of rule [46] of XML specifications
628 (cond
629 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
630 (setq type 'empty))
631 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
632 (setq type 'any))
633 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
f6fcdfff 634 (setq type (xml-parse-elem-type (match-string-no-properties 1 type))))
a98e819b
DL
635 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
636 nil)
637 (t
6d12a4df 638 (if xml-validating-parser
07f7e9ae 639 (error "XML: (Validity) Invalid element type in the DTD"))))
27720433 640
a98e819b 641 ;; rule [45]: the element declaration must be unique
6d12a4df
MH
642 (if (and (assoc element dtd)
643 xml-validating-parser)
644 (error "XML: (Validity) Element declarations must be unique in a DTD (<%s>)"
461f3ad0 645 element))
a98e819b
DL
646
647 ;; Store the element in the DTD
648 (push (list element type) dtd)
649 (goto-char end-pos))
23d519e4
MH
650
651 ;; Translation of rule [52] of XML specifications
652 ((looking-at (concat "<!ATTLIST[ \t\n\r]*\\(" xml-name-re
653 "\\)[ \t\n\r]*\\(" xml-att-def-re
654 "\\)*[ \t\n\r]*>"))
655
656 ;; We don't do anything with ATTLIST currently
657 (goto-char (match-end 0)))
658
a98e819b
DL
659 ((looking-at "<!--")
660 (search-forward "-->"))
6d12a4df
MH
661 ((looking-at (concat "<!ENTITY[ \t\n\r]*\\(" xml-name-re
662 "\\)[ \t\n\r]*\\(" xml-entity-value-re
663 "\\)[ \t\n\r]*>"))
f6fcdfff
CY
664 (let ((name (match-string-no-properties 1))
665 (value (substring (match-string-no-properties 2) 1
666 (- (length (match-string-no-properties 2)) 1))))
23d519e4 667 (goto-char (match-end 0))
6d12a4df
MH
668 (setq xml-entity-alist
669 (append xml-entity-alist
670 (list (cons name
671 (with-temp-buffer
672 (insert value)
673 (goto-char (point-min))
674 (xml-parse-fragment
675 xml-validating-parser
676 parse-ns))))))))
677 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
678 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
679 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
680 (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
681 "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
682 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
683 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
684 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
685 "[ \t\n\r]*>")))
f6fcdfff
CY
686 (let ((name (match-string-no-properties 1))
687 (file (substring (match-string-no-properties 2) 1
688 (- (length (match-string-no-properties 2)) 1))))
23d519e4 689 (goto-char (match-end 0))
6d12a4df
MH
690 (setq xml-entity-alist
691 (append xml-entity-alist
692 (list (cons name (with-temp-buffer
693 (insert-file-contents file)
694 (goto-char (point-min))
695 (xml-parse-fragment
696 xml-validating-parser
697 parse-ns))))))))
63b446bc
MH
698 ;; skip parameter entity declarations
699 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+%[ \t\n\r]+\\(" xml-name-re
700 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
701 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
702 (looking-at (concat "<!ENTITY[ \t\n\r]+"
703 "%[ \t\n\r]+"
704 "\\(" xml-name-re "\\)[ \t\n\r]+"
705 "PUBLIC[ \t\n\r]+"
706 "\\(\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
707 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'\\)[ \t\n\r]+"
708 "\\(\"[^\"]+\"\\|'[^']+'\\)"
709 "[ \t\n\r]*>")))
710 (goto-char (match-end 0)))
711 ;; skip parameter entities
712 ((looking-at (concat "%" xml-name-re ";"))
713 (goto-char (match-end 0)))
a98e819b 714 (t
7e0f1123 715 (when xml-validating-parser
27720433 716 (error "XML: (Validity) Invalid DTD item"))))))
6d12a4df 717 (if (looking-at "\\s-*]>")
23d519e4 718 (goto-char (match-end 0))))
461f3ad0 719 (nreverse dtd)))
47db06aa
GM
720
721(defun xml-parse-elem-type (string)
a98e819b 722 "Convert element type STRING into a Lisp structure."
47db06aa
GM
723
724 (let (elem modifier)
725 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
726 (progn
f6fcdfff
CY
727 (setq elem (match-string-no-properties 1 string)
728 modifier (match-string-no-properties 2 string))
47db06aa 729 (if (string-match "|" elem)
971489ea 730 (setq elem (cons 'choice
47db06aa
GM
731 (mapcar 'xml-parse-elem-type
732 (split-string elem "|"))))
733 (if (string-match "," elem)
971489ea 734 (setq elem (cons 'seq
47db06aa 735 (mapcar 'xml-parse-elem-type
a98e819b 736 (split-string elem ",")))))))
a158ff81 737 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
f6fcdfff
CY
738 (setq elem (match-string-no-properties 1 string)
739 modifier (match-string-no-properties 2 string))))
47db06aa 740
971489ea
SM
741 (if (and (stringp elem) (string= elem "#PCDATA"))
742 (setq elem 'pcdata))
524425ae 743
971489ea
SM
744 (cond
745 ((string= modifier "+")
746 (list '+ elem))
747 ((string= modifier "*")
748 (list '* elem))
749 ((string= modifier "?")
0fa6f70c 750 (list '\? elem))
971489ea
SM
751 (t
752 elem))))
47db06aa 753
47db06aa
GM
754;;*******************************************************************
755;;**
756;;** Substituting special XML sequences
757;;**
758;;*******************************************************************
759
760(defun xml-substitute-special (string)
a98e819b
DL
761 "Return STRING, after subsituting entity references."
762 ;; This originally made repeated passes through the string from the
763 ;; beginning, which isn't correct, since then either "&amp;amp;" or
764 ;; "&#38;amp;" won't DTRT.
47db06aa 765
6d12a4df
MH
766 (let ((point 0)
767 children end-point)
ae026110 768 (while (string-match "&\\([^;]*\\);" string point)
6d12a4df 769 (setq end-point (match-end 0))
f6fcdfff 770 (let* ((this-part (match-string-no-properties 1 string))
6d12a4df
MH
771 (prev-part (substring string point (match-beginning 0)))
772 (entity (assoc this-part xml-entity-alist))
773 (expansion
774 (cond ((string-match "#\\([0-9]+\\)" this-part)
775 (let ((c (decode-char
776 'ucs
f6fcdfff 777 (string-to-number (match-string-no-properties 1 this-part)))))
6d12a4df
MH
778 (if c (string c))))
779 ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
780 (let ((c (decode-char
781 'ucs
f6fcdfff 782 (string-to-number (match-string-no-properties 1 this-part) 16))))
6d12a4df
MH
783 (if c (string c))))
784 (entity
785 (cdr entity))
ae026110 786 ((eq (length this-part) 0)
27720433 787 (error "XML: (Not Well-Formed) No entity given"))
6d12a4df 788 (t
f8ab034e 789 (if xml-validating-parser
6d12a4df 790 (error "XML: (Validity) Undefined entity `%s'"
f8ab034e
MH
791 this-part)
792 xml-undefined-entity)))))
6d12a4df
MH
793
794 (cond ((null children)
f0d49437
MH
795 ;; FIXME: If we have an entity that expands into XML, this won't work.
796 (setq children
797 (concat prev-part expansion)))
6d12a4df
MH
798 ((stringp children)
799 (if (stringp expansion)
800 (setq children (concat children prev-part expansion))
801 (setq children (list expansion (concat prev-part children)))))
802 ((and (stringp expansion)
803 (stringp (car children)))
804 (setcar children (concat prev-part expansion (car children))))
805 ((stringp expansion)
806 (setq children (append (concat prev-part expansion)
807 children)))
808 ((stringp (car children))
809 (setcar children (concat (car children) prev-part))
810 (setq children (append expansion children)))
811 (t
812 (setq children (list expansion
813 prev-part
814 children))))
815 (setq point end-point)))
816 (cond ((stringp children)
817 (concat children (substring string point)))
818 ((stringp (car (last children)))
a3110b5d 819 (concat (car (last children)) (substring string point)))
6d12a4df
MH
820 ((null children)
821 string)
822 (t
a3110b5d
MH
823 (concat (mapconcat 'identity
824 (nreverse children)
825 "")
826 (substring string point))))))
827
571855b6
UJ
828(defun xml-substitute-numeric-entities (string)
829 "Substitute SGML numeric entities by their respective utf characters.
830This function replaces numeric entities in the input STRING and
831returns the modified string. For example \"&#42;\" gets replaced
832by \"*\"."
833 (if (and string (stringp string))
834 (let ((start 0))
835 (while (string-match "&#\\([0-9]+\\);" string start)
836 (condition-case nil
837 (setq string (replace-match
838 (string (read (substring string
839 (match-beginning 1)
840 (match-end 1))))
841 nil nil string))
842 (error nil))
843 (setq start (1+ (match-beginning 0))))
844 string)
845 nil))
846
47db06aa
GM
847;;*******************************************************************
848;;**
849;;** Printing a tree.
850;;** This function is intended mainly for debugging purposes.
851;;**
852;;*******************************************************************
853
27240aa4
AS
854(defun xml-debug-print (xml &optional indent-string)
855 "Outputs the XML in the current buffer.
856XML can be a tree or a list of nodes.
857The first line is indented with the optional INDENT-STRING."
858 (setq indent-string (or indent-string ""))
971489ea 859 (dolist (node xml)
27240aa4
AS
860 (xml-debug-print-internal node indent-string)))
861
862(defalias 'xml-print 'xml-debug-print)
47db06aa 863
7731c9f4 864(defun xml-escape-string (string)
a1884c78
MH
865 "Return the string with entity substitutions made from
866xml-entity-alist."
7731c9f4
MH
867 (mapconcat (lambda (byte)
868 (let ((char (char-to-string byte)))
869 (if (rassoc char xml-entity-alist)
870 (concat "&" (car (rassoc char xml-entity-alist)) ";")
871 char)))
76a6127f
MH
872 ;; This differs from the non-unicode branch. Just
873 ;; grabbing the string works here.
874 string ""))
7731c9f4 875
971489ea 876(defun xml-debug-print-internal (xml indent-string)
47db06aa 877 "Outputs the XML tree in the current buffer.
a98e819b 878The first line is indented with INDENT-STRING."
47db06aa
GM
879 (let ((tree xml)
880 attlist)
a98e819b 881 (insert indent-string ?< (symbol-name (xml-node-name tree)))
524425ae 882
47db06aa 883 ;; output the attribute list
971489ea 884 (setq attlist (xml-node-attributes tree))
47db06aa 885 (while attlist
7731c9f4
MH
886 (insert ?\ (symbol-name (caar attlist)) "=\""
887 (xml-escape-string (cdar attlist)) ?\")
971489ea 888 (setq attlist (cdr attlist)))
524425ae 889
971489ea 890 (setq tree (xml-node-children tree))
47db06aa 891
27240aa4
AS
892 (if (null tree)
893 (insert ?/ ?>)
894 (insert ?>)
895
896 ;; output the children
897 (dolist (node tree)
898 (cond
899 ((listp node)
900 (insert ?\n)
901 (xml-debug-print-internal node (concat indent-string " ")))
7731c9f4
MH
902 ((stringp node)
903 (insert (xml-escape-string node)))
27240aa4
AS
904 (t
905 (error "Invalid XML tree"))))
906
907 (when (not (and (null (cdr tree))
908 (stringp (car tree))))
909 (insert ?\n indent-string))
910 (insert ?< ?/ (symbol-name (xml-node-name xml)) ?>))))
47db06aa
GM
911
912(provide 'xml)
913
8a02e193 914;; arch-tag: 5864b283-5a68-4b59-a20d-36a72b353b9b
47db06aa 915;;; xml.el ends here