* progmodes/js.el (js--js-not): Add null to the list of values.
[bpt/emacs.git] / lisp / xml.el
CommitLineData
1cd7adc6 1;;; xml.el --- XML parser
47db06aa 2
0d30b337 3;; Copyright (C) 2000, 2001, 2002, 2003, 2004,
ae940284 4;; 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
47db06aa
GM
5
6;; Author: Emmanuel Briot <briot@gnat.com>
720058f2 7;; Maintainer: Mark A. Hershberger <mah@everybody.org>
a98e819b 8;; Keywords: xml, data
47db06aa
GM
9
10;; This file is part of GNU Emacs.
11
eb3fa2cf 12;; GNU Emacs is free software: you can redistribute it and/or modify
47db06aa 13;; it under the terms of the GNU General Public License as published by
eb3fa2cf
GM
14;; the Free Software Foundation, either version 3 of the License, or
15;; (at your option) any later version.
47db06aa
GM
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
eb3fa2cf 23;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
47db06aa
GM
24
25;;; Commentary:
26
a98e819b
DL
27;; This file contains a somewhat incomplete non-validating XML parser. It
28;; parses a file, and returns a list that can be used internally by
a1dfa9a3 29;; any other Lisp libraries.
47db06aa
GM
30
31;;; FILE FORMAT
32
a98e819b
DL
33;; The document type declaration may either be ignored or (optionally)
34;; parsed, but currently the parsing will only accept element
a1dfa9a3 35;; declarations. The XML file is assumed to be well-formed. In case
a98e819b
DL
36;; of error, the parsing stops and the XML file is shown where the
37;; parsing stopped.
47db06aa 38;;
a98e819b 39;; It also knows how to ignore comments and processing instructions.
47db06aa
GM
40;;
41;; The XML file should have the following format:
653558a1
GM
42;; <node1 attr1="name1" attr2="name2" ...>value
43;; <node2 attr3="name3" attr4="name4">value2</node2>
44;; <node3 attr5="name5" attr6="name6">value3</node3>
47db06aa 45;; </node1>
a1dfa9a3 46;; Of course, the name of the nodes and attributes can be anything. There can
47db06aa
GM
47;; be any number of attributes (or none), as well as any number of children
48;; below the nodes.
49;;
50;; There can be only top level node, but with any number of children below.
51
52;;; LIST FORMAT
53
c7f8d055
SM
54;; The functions `xml-parse-file', `xml-parse-region' and
55;; `xml-parse-tag' return a list with the following format:
47db06aa
GM
56;;
57;; xml-list ::= (node node ...)
c7f8d055 58;; node ::= (qname attribute-list . child_node_list)
47db06aa
GM
59;; child_node_list ::= child_node child_node ...
60;; child_node ::= node | string
c7f8d055
SM
61;; qname ::= (:namespace-uri . "name") | "name"
62;; attribute_list ::= ((qname . "value") (qname . "value") ...)
47db06aa
GM
63;; | nil
64;; string ::= "..."
65;;
a98e819b
DL
66;; Some macros are provided to ease the parsing of this list.
67;; Whitespace is preserved. Fixme: There should be a tree-walker that
68;; can remove it.
47db06aa 69
c7f8d055
SM
70;; TODO:
71;; * xml:base, xml:space support
72;; * more complete DOCTYPE parsing
73;; * pi support
74
47db06aa
GM
75;;; Code:
76
f6fcdfff
CY
77;; Note that buffer-substring and match-string were formerly used in
78;; several places, because the -no-properties variants remove
79;; composition info. However, after some discussion on emacs-devel,
80;; the consensus was that the speed of the -no-properties variants was
81;; a worthwhile tradeoff especially since we're usually parsing files
82;; instead of hand-crafted XML.
a98e819b 83
47db06aa
GM
84;;*******************************************************************
85;;**
86;;** Macros to parse the list
87;;**
88;;*******************************************************************
89
f8ab034e
MH
90(defconst xml-undefined-entity "?"
91 "What to substitute for undefined entities")
92
6d12a4df
MH
93(defvar xml-entity-alist
94 '(("lt" . "<")
95 ("gt" . ">")
96 ("apos" . "'")
97 ("quot" . "\"")
98 ("amp" . "&"))
99 "The defined entities. Entities are added to this when the DTD is parsed.")
100
101(defvar xml-sub-parser nil
102 "Dynamically set this to a non-nil value if you want to parse an XML fragment.")
103
104(defvar xml-validating-parser nil
105 "Set to non-nil to get validity checking.")
106
971489ea 107(defsubst xml-node-name (node)
47db06aa 108 "Return the tag associated with NODE.
a1dfa9a3
SM
109Without namespace-aware parsing, the tag is a symbol.
110
111With namespace-aware parsing, the tag is a cons of a string
112representing the uri of the namespace with the local name of the
113tag. For example,
114
115 <foo>
116
117would be represented by
118
119 '(\"\" . \"foo\")."
120
971489ea 121 (car node))
47db06aa 122
971489ea 123(defsubst xml-node-attributes (node)
47db06aa
GM
124 "Return the list of attributes of NODE.
125The list can be nil."
971489ea 126 (nth 1 node))
47db06aa 127
971489ea 128(defsubst xml-node-children (node)
47db06aa
GM
129 "Return the list of children of NODE.
130This is a list of nodes, and it can be nil."
971489ea 131 (cddr node))
47db06aa
GM
132
133(defun xml-get-children (node child-name)
134 "Return the children of NODE whose tag is CHILD-NAME.
a1dfa9a3 135CHILD-NAME should match the value returned by `xml-node-name'."
971489ea
SM
136 (let ((match ()))
137 (dolist (child (xml-node-children node))
a1dfa9a3
SM
138 (if (and (listp child)
139 (equal (xml-node-name child) child-name))
140 (push child match)))
971489ea 141 (nreverse match)))
47db06aa 142
9bcd6a7e 143(defun xml-get-attribute-or-nil (node attribute)
47db06aa 144 "Get from NODE the value of ATTRIBUTE.
a1dfa9a3 145Return nil if the attribute was not found.
9bcd6a7e
EZ
146
147See also `xml-get-attribute'."
2e9bdf15 148 (cdr (assoc attribute (xml-node-attributes node))))
9bcd6a7e
EZ
149
150(defsubst xml-get-attribute (node attribute)
151 "Get from NODE the value of ATTRIBUTE.
152An empty string is returned if the attribute was not found.
153
154See also `xml-get-attribute-or-nil'."
155 (or (xml-get-attribute-or-nil node attribute) ""))
47db06aa
GM
156
157;;*******************************************************************
158;;**
159;;** Creating the list
160;;**
161;;*******************************************************************
162
a98e819b 163;;;###autoload
2d42509a 164(defun xml-parse-file (file &optional parse-dtd parse-ns)
a98e819b
DL
165 "Parse the well-formed XML file FILE.
166If FILE is already visited, use its buffer and don't kill it.
47db06aa 167Returns the top node with all its children.
2d42509a
JB
168If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
169If PARSE-NS is non-nil, then QNAMES are expanded."
882cb60d
RS
170 (if (get-file-buffer file)
171 (with-current-buffer (get-file-buffer file)
172 (save-excursion
173 (xml-parse-region (point-min)
174 (point-max)
175 (current-buffer)
176 parse-dtd parse-ns)))
177 (with-temp-buffer
178 (insert-file-contents file)
179 (xml-parse-region (point-min)
180 (point-max)
181 (current-buffer)
182 parse-dtd parse-ns))))
47db06aa 183
6d12a4df 184
5ed32352
RS
185(defvar xml-name-re)
186(defvar xml-entity-value-re)
0bf41002 187(defvar xml-att-def-re)
63b446bc 188(let* ((start-chars (concat "[:alpha:]:_"))
6d12a4df 189 (name-chars (concat "-[:digit:]." start-chars))
5178753d 190 ;;[3] S ::= (#x20 | #x9 | #xD | #xA)+
6d12a4df 191 (whitespace "[ \t\n\r]"))
5178753d
MH
192 ;;[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
193 ;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
194 ;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
195 ;; | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
6d12a4df 196 (defvar xml-name-start-char-re (concat "[" start-chars "]"))
5178753d 197 ;;[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
6d12a4df 198 (defvar xml-name-char-re (concat "[" name-chars "]"))
5178753d 199 ;;[5] Name ::= NameStartChar (NameChar)*
6d12a4df 200 (defvar xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
5178753d 201 ;;[6] Names ::= Name (#x20 Name)*
6d12a4df 202 (defvar xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
5178753d 203 ;;[7] Nmtoken ::= (NameChar)+
6d12a4df 204 (defvar xml-nmtoken-re (concat xml-name-char-re "+"))
5178753d 205 ;;[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
6d12a4df 206 (defvar xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
5178753d 207 ;;[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
6d12a4df 208 (defvar xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
5178753d 209 ;;[68] EntityRef ::= '&' Name ';'
6d12a4df 210 (defvar xml-entity-ref (concat "&" xml-name-re ";"))
5178753d 211 ;;[69] PEReference ::= '%' Name ';'
6d12a4df 212 (defvar xml-pe-reference-re (concat "%" xml-name-re ";"))
5178753d 213 ;;[67] Reference ::= EntityRef | CharRef
6d12a4df 214 (defvar xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
5178753d 215 ;;[10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
23d519e4
MH
216 (defvar xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|"
217 "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)"))
5178753d
MH
218 ;;[56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID per Element Type] [VC: ID Attribute Default]
219 ;; | 'IDREF' [VC: IDREF]
220 ;; | 'IDREFS' [VC: IDREF]
221 ;; | 'ENTITY' [VC: Entity Name]
222 ;; | 'ENTITIES' [VC: Entity Name]
223 ;; | 'NMTOKEN' [VC: Name Token]
224 ;; | 'NMTOKENS' [VC: Name Token]
23d519e4 225 (defvar xml-tokenized-type-re "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|ENTITIES\\|NMTOKEN\\|NMTOKENS\\)")
5178753d 226 ;;[58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
23d519e4
MH
227 (defvar xml-notation-type-re (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re
228 "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*" whitespace "*)\\)"))
5178753d 229 ;;[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration] [VC: No Duplicate Tokens]
23d519e4
MH
230 (defvar xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re
231 "\\(?:" whitespace "*|" whitespace "*" xml-nmtoken-re "\\)*"
232 whitespace ")\\)"))
5178753d 233 ;;[57] EnumeratedType ::= NotationType | Enumeration
23d519e4 234 (defvar xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re "\\|" xml-enumeration-re "\\)"))
5178753d
MH
235 ;;[54] AttType ::= StringType | TokenizedType | EnumeratedType
236 ;;[55] StringType ::= 'CDATA'
23d519e4 237 (defvar xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re "\\|" xml-notation-type-re"\\|" xml-enumerated-type-re "\\)"))
5178753d 238 ;;[60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
23d519e4 239 (defvar xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED" whitespace "\\)*" xml-att-value-re "\\)"))
5178753d 240 ;;[53] AttDef ::= S Name S AttType S DefaultDecl
23d519e4
MH
241 (defvar xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re
242 whitespace "*" xml-att-type-re
243 whitespace "*" xml-default-decl-re "\\)"))
5178753d
MH
244 ;;[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
245 ;; | "'" ([^%&'] | PEReference | Reference)* "'"
6d12a4df
MH
246 (defvar xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re
247 "\\|" xml-reference-re "\\)*\"\\|'\\(?:[^%&']\\|"
248 xml-pe-reference-re "\\|" xml-reference-re "\\)*'\\)")))
249;;[75] ExternalID ::= 'SYSTEM' S SystemLiteral
250;; | 'PUBLIC' S PubidLiteral S SystemLiteral
251;;[76] NDataDecl ::= S 'NDATA' S
252;;[73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
253;;[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
254;;[74] PEDef ::= EntityValue | ExternalID
255;;[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
256;;[70] EntityDecl ::= GEDecl | PEDecl
257
a98e819b
DL
258;; Note that this is setup so that we can do whitespace-skipping with
259;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
260;; compared with `re-search-forward', but that has been fixed. Also
261;; note that the standard syntax table contains other characters with
262;; whitespace syntax, like NBSP, but they are invalid in contexts in
263;; which we might skip whitespace -- specifically, they're not
264;; NameChars [XML 4].
265
266(defvar xml-syntax-table
267 (let ((table (make-syntax-table)))
268 ;; Get space syntax correct per XML [3].
269 (dotimes (c 31)
270 (modify-syntax-entry c "." table)) ; all are space in standard table
5178753d 271 (dolist (c '(?\t ?\n ?\r)) ; these should be space
a98e819b
DL
272 (modify-syntax-entry c " " table))
273 ;; For skipping attributes.
274 (modify-syntax-entry ?\" "\"" table)
275 (modify-syntax-entry ?' "\"" table)
276 ;; Non-alnum name chars should be symbol constituents (`-' and `_'
277 ;; are OK by default).
278 (modify-syntax-entry ?. "_" table)
279 (modify-syntax-entry ?: "_" table)
280 ;; XML [89]
aaaa8abb
MH
281 (unless (featurep 'xemacs)
282 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
283 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
284 #x30FD #x30FE))
285 (modify-syntax-entry (decode-char 'ucs c) "w" table)))
a98e819b
DL
286 ;; Fixme: rest of [4]
287 table)
288 "Syntax table used by `xml-parse-region'.")
289
290;; XML [5]
291;; Note that [:alpha:] matches all multibyte chars with word syntax.
ab161457
JPW
292(eval-and-compile
293 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
a98e819b
DL
294
295;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
296;; document ::= prolog element Misc*
297;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
298
299;;;###autoload
2d42509a 300(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
47db06aa
GM
301 "Parse the region from BEG to END in BUFFER.
302If BUFFER is nil, it defaults to the current buffer.
303Returns the XML list for the region, or raises an error if the region
2d42509a 304is not well-formed XML.
47db06aa 305If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
2d42509a
JB
306and returned as the first element of the list.
307If PARSE-NS is non-nil, then QNAMES are expanded."
39d58fc0
MH
308 ;; Use fixed syntax table to ensure regexp char classes and syntax
309 ;; specs DTRT.
310 (with-syntax-table (standard-syntax-table)
311 (let ((case-fold-search nil) ; XML is case-sensitive.
312 xml result dtd)
313 (save-excursion
314 (if buffer
315 (set-buffer buffer))
316 (save-restriction
317 (narrow-to-region beg end)
a98e819b
DL
318 (goto-char (point-min))
319 (while (not (eobp))
320 (if (search-forward "<" nil t)
321 (progn
322 (forward-char -1)
34638996 323 (setq result (xml-parse-tag parse-dtd parse-ns))
6d12a4df 324 (if (and xml result (not xml-sub-parser))
a98e819b 325 ;; translation of rule [1] of XML specifications
6d12a4df 326 (error "XML: (Not Well-Formed) Only one root tag allowed")
47db06aa 327 (cond
971489ea 328 ((null result))
34638996
EZ
329 ((and (listp (car result))
330 parse-dtd)
971489ea 331 (setq dtd (car result))
a98e819b
DL
332 (if (cdr result) ; possible leading comment
333 (add-to-list 'xml (cdr result))))
47db06aa 334 (t
a98e819b
DL
335 (add-to-list 'xml result)))))
336 (goto-char (point-max))))
337 (if parse-dtd
338 (cons dtd (nreverse xml))
339 (nreverse xml)))))))
47db06aa 340
c7f8d055 341(defun xml-maybe-do-ns (name default xml-ns)
a1dfa9a3
SM
342 "Perform any namespace expansion.
343NAME is the name to perform the expansion on.
c7f8d055
SM
344DEFAULT is the default namespace. XML-NS is a cons of namespace
345names to uris. When namespace-aware parsing is off, then XML-NS
346is nil.
347
348During namespace-aware parsing, any name without a namespace is
349put into the namespace identified by DEFAULT. nil is used to
350specify that the name shouldn't be given a namespace."
351 (if (consp xml-ns)
352 (let* ((nsp (string-match ":" name))
353 (lname (if nsp (substring name (match-end 0)) name))
354 (prefix (if nsp (substring name 0 (match-beginning 0)) default))
355 (special (and (string-equal lname "xmlns") (not prefix)))
356 ;; Setting default to nil will insure that there is not
357 ;; matching cons in xml-ns. In which case we
358 (ns (or (cdr (assoc (if special "xmlns" prefix)
359 xml-ns))
6d12a4df 360 "")))
c7f8d055
SM
361 (cons ns (if special "" lname)))
362 (intern name)))
47db06aa 363
6d12a4df
MH
364(defun xml-parse-fragment (&optional parse-dtd parse-ns)
365 "Parse xml-like fragments."
366 (let ((xml-sub-parser t)
367 children)
368 (while (not (eobp))
369 (let ((bit (xml-parse-tag
370 parse-dtd parse-ns)))
371 (if children
372 (setq children (append (list bit) children))
373 (if (stringp bit)
374 (setq children (list bit))
375 (setq children bit)))))
376 (reverse children)))
377
2d42509a 378(defun xml-parse-tag (&optional parse-dtd parse-ns)
a98e819b 379 "Parse the tag at point.
47db06aa
GM
380If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
381returned as the first element in the list.
2d42509a 382If PARSE-NS is non-nil, then QNAMES are expanded.
47db06aa 383Returns one of:
a98e819b
DL
384 - a list : the matching node
385 - nil : the point is not looking at a tag.
386 - a pair : the first element is the DTD, the second is the node."
6d12a4df
MH
387 (let ((xml-validating-parser (or parse-dtd xml-validating-parser))
388 (xml-ns (if (consp parse-ns)
2d42509a
JB
389 parse-ns
390 (if parse-ns
391 (list
5178753d 392 ;; Default for empty prefix is no namespace
6d12a4df 393 (cons "" "")
c7f8d055 394 ;; "xml" namespace
6d12a4df 395 (cons "xml" "http://www.w3.org/XML/1998/namespace")
2d42509a 396 ;; We need to seed the xmlns namespace
6d12a4df 397 (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
2d42509a
JB
398 (cond
399 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
400 ;; beginning of a document).
401 ((looking-at "<\\?")
402 (search-forward "?>")
403 (skip-syntax-forward " ")
404 (xml-parse-tag parse-dtd xml-ns))
405 ;; Character data (CDATA) sections, in which no tag should be interpreted
406 ((looking-at "<!\\[CDATA\\[")
407 (let ((pos (match-end 0)))
408 (unless (search-forward "]]>" nil t)
6d12a4df 409 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
ae026110 410 (concat
f6fcdfff 411 (buffer-substring-no-properties pos (match-beginning 0))
ae026110 412 (xml-parse-string))))
2d42509a
JB
413 ;; DTD for the document
414 ((looking-at "<!DOCTYPE")
6d12a4df
MH
415 (let ((dtd (xml-parse-dtd parse-ns)))
416 (skip-syntax-forward " ")
417 (if xml-validating-parser
418 (cons dtd (xml-parse-tag nil xml-ns))
419 (xml-parse-tag nil xml-ns))))
2d42509a
JB
420 ;; skip comments
421 ((looking-at "<!--")
422 (search-forward "-->")
423 nil)
424 ;; end tag
425 ((looking-at "</")
426 '())
427 ;; opening tag
428 ((looking-at "<\\([^/>[:space:]]+\\)")
429 (goto-char (match-end 1))
34638996
EZ
430
431 ;; Parse this node
f6fcdfff 432 (let* ((node-name (match-string-no-properties 1))
5178753d
MH
433 ;; Parse the attribute list.
434 (attrs (xml-parse-attlist xml-ns))
435 children pos)
c7f8d055 436
5178753d
MH
437 ;; add the xmlns:* attrs to our cache
438 (when (consp xml-ns)
c7f8d055
SM
439 (dolist (attr attrs)
440 (when (and (consp (car attr))
6d12a4df
MH
441 (equal "http://www.w3.org/2000/xmlns/"
442 (caar attr)))
443 (push (cons (cdar attr) (cdr attr))
c7f8d055
SM
444 xml-ns))))
445
5178753d 446 (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
c7f8d055 447
2d42509a
JB
448 ;; is this an empty element ?
449 (if (looking-at "/>")
47db06aa 450 (progn
6d12a4df 451 (forward-char 2)
971489ea 452 (nreverse children))
6d12a4df
MH
453
454 ;; is this a valid start tag ?
455 (if (eq (char-after) ?>)
456 (progn
457 (forward-char 1)
458 ;; Now check that we have the right end-tag. Note that this
459 ;; one might contain spaces after the tag name
460 (let ((end (concat "</" node-name "\\s-*>")))
461 (while (not (looking-at end))
462 (cond
463 ((looking-at "</")
464 (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
465 node-name (point)))
466 ((= (char-after) ?<)
467 (let ((tag (xml-parse-tag nil xml-ns)))
468 (when tag
469 (push tag children))))
470 (t
471 (let ((expansion (xml-parse-string)))
472 (setq children
473 (if (stringp expansion)
474 (if (stringp (car children))
475 ;; The two strings were separated by a comment.
aaaa8abb 476 (setq children (append (list (concat (car children) expansion))
6d12a4df
MH
477 (cdr children)))
478 (setq children (append (list expansion) children)))
479 (setq children (append expansion children))))))))
480
481 (goto-char (match-end 0))
482 (nreverse children)))
483 ;; This was an invalid start tag (Expected ">", but didn't see it.)
484 (error "XML: (Well-Formed) Couldn't parse tag: %s"
f6fcdfff 485 (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
6d12a4df
MH
486 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
487 (unless xml-sub-parser ; Usually, we error out.
488 (error "XML: (Well-Formed) Invalid character"))
489
490 ;; However, if we're parsing incrementally, then we need to deal
491 ;; with stray CDATA.
492 (xml-parse-string)))))
493
494(defun xml-parse-string ()
495 "Parse the next whatever. Could be a string, or an element."
5178753d 496 (let* ((pos (point))
98b69232 497 (string (progn (skip-chars-forward "^<")
f6fcdfff 498 (buffer-substring-no-properties pos (point)))))
5178753d
MH
499 ;; Clean up the string. As per XML specifications, the XML
500 ;; processor should always pass the whole string to the
501 ;; application. But \r's should be replaced:
502 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
503 (setq pos 0)
504 (while (string-match "\r\n?" string pos)
505 (setq string (replace-match "\n" t t string))
506 (setq pos (1+ (match-beginning 0))))
507
508 (xml-substitute-special string)))
47db06aa 509
c7f8d055 510(defun xml-parse-attlist (&optional xml-ns)
a1dfa9a3
SM
511 "Return the attribute-list after point.
512Leave point at the first non-blank character after the tag."
971489ea 513 (let ((attlist ())
34638996 514 end-pos name)
a98e819b
DL
515 (skip-syntax-forward " ")
516 (while (looking-at (eval-when-compile
517 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
c7f8d055 518 (setq end-pos (match-end 0))
f6fcdfff 519 (setq name (xml-maybe-do-ns (match-string-no-properties 1) nil xml-ns))
c7f8d055 520 (goto-char end-pos)
47db06aa 521
a158ff81
JB
522 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
523
47db06aa
GM
524 ;; Do we have a string between quotes (or double-quotes),
525 ;; or a simple word ?
a158ff81 526 (if (looking-at "\"\\([^\"]*\\)\"")
34638996 527 (setq end-pos (match-end 0))
f0ec1711 528 (if (looking-at "'\\([^']*\\)'")
34638996 529 (setq end-pos (match-end 0))
6d12a4df 530 (error "XML: (Not Well-Formed) Attribute values must be given between quotes")))
47db06aa
GM
531
532 ;; Each attribute must be unique within a given element
533 (if (assoc name attlist)
6d12a4df 534 (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
524425ae 535
a158ff81
JB
536 ;; Multiple whitespace characters should be replaced with a single one
537 ;; in the attributes
f6fcdfff 538 (let ((string (match-string-no-properties 1))
a158ff81 539 (pos 0))
a98e819b 540 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
6d12a4df
MH
541 (let ((expansion (xml-substitute-special string)))
542 (unless (stringp expansion)
5178753d
MH
543 ; We say this is the constraint. It is acctually that
544 ; external entities nor "<" can be in an attribute value.
6d12a4df
MH
545 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
546 (push (cons name expansion) attlist)))
a158ff81 547
34638996 548 (goto-char end-pos)
a98e819b 549 (skip-syntax-forward " "))
971489ea 550 (nreverse attlist)))
47db06aa
GM
551
552;;*******************************************************************
553;;**
554;;** The DTD (document type declaration)
555;;** The following functions know how to skip or parse the DTD of
556;;** a document
557;;**
558;;*******************************************************************
559
a98e819b
DL
560;; Fixme: This fails at least if the DTD contains conditional sections.
561
562(defun xml-skip-dtd ()
563 "Skip the DTD at point.
47db06aa 564This follows the rule [28] in the XML specifications."
6d12a4df
MH
565 (let ((xml-validating-parser nil))
566 (xml-parse-dtd)))
47db06aa 567
6d12a4df 568(defun xml-parse-dtd (&optional parse-ns)
a98e819b
DL
569 "Parse the DTD at point."
570 (forward-char (eval-when-compile (length "<!DOCTYPE")))
571 (skip-syntax-forward " ")
6d12a4df
MH
572 (if (and (looking-at ">")
573 xml-validating-parser)
574 (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
524425ae 575
971489ea 576 ;; Get the name of the document
a98e819b 577 (looking-at xml-name-regexp)
f6fcdfff 578 (let ((dtd (list (match-string-no-properties 0) 'dtd))
971489ea 579 type element end-pos)
47db06aa
GM
580 (goto-char (match-end 0))
581
a98e819b
DL
582 (skip-syntax-forward " ")
583 ;; XML [75]
584 (cond ((looking-at "PUBLIC\\s-+")
585 (goto-char (match-end 0))
586 (unless (or (re-search-forward
587 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\""
588 nil t)
589 (re-search-forward
590 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
591 nil t))
6d12a4df 592 (error "XML: Missing Public ID"))
f6fcdfff 593 (let ((pubid (match-string-no-properties 1)))
6d12a4df 594 (skip-syntax-forward " ")
a98e819b
DL
595 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
596 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 597 (error "XML: Missing System ID"))
f6fcdfff 598 (push (list pubid (match-string-no-properties 1) 'public) dtd)))
a98e819b
DL
599 ((looking-at "SYSTEM\\s-+")
600 (goto-char (match-end 0))
601 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
602 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 603 (error "XML: Missing System ID"))
f6fcdfff 604 (push (list (match-string-no-properties 1) 'system) dtd)))
a98e819b
DL
605 (skip-syntax-forward " ")
606 (if (eq ?> (char-after))
607 (forward-char)
a98e819b 608 (if (not (eq (char-after) ?\[))
6d12a4df 609 (error "XML: Bad DTD")
a98e819b
DL
610 (forward-char)
611 ;; Parse the rest of the DTD
23d519e4 612 ;; Fixme: Deal with NOTATION, PIs.
a98e819b
DL
613 (while (not (looking-at "\\s-*\\]"))
614 (skip-syntax-forward " ")
615 (cond
616
617 ;; Translation of rule [45] of XML specifications
618 ((looking-at
619 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
620
f6fcdfff 621 (setq element (match-string-no-properties 1)
a98e819b
DL
622 type (match-string-no-properties 2))
623 (setq end-pos (match-end 0))
624
625 ;; Translation of rule [46] of XML specifications
626 (cond
627 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
628 (setq type 'empty))
629 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
630 (setq type 'any))
631 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
f6fcdfff 632 (setq type (xml-parse-elem-type (match-string-no-properties 1 type))))
a98e819b
DL
633 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
634 nil)
635 (t
6d12a4df 636 (if xml-validating-parser
07f7e9ae 637 (error "XML: (Validity) Invalid element type in the DTD"))))
27720433 638
a98e819b 639 ;; rule [45]: the element declaration must be unique
6d12a4df
MH
640 (if (and (assoc element dtd)
641 xml-validating-parser)
642 (error "XML: (Validity) Element declarations must be unique in a DTD (<%s>)"
461f3ad0 643 element))
a98e819b
DL
644
645 ;; Store the element in the DTD
646 (push (list element type) dtd)
647 (goto-char end-pos))
23d519e4
MH
648
649 ;; Translation of rule [52] of XML specifications
650 ((looking-at (concat "<!ATTLIST[ \t\n\r]*\\(" xml-name-re
651 "\\)[ \t\n\r]*\\(" xml-att-def-re
652 "\\)*[ \t\n\r]*>"))
653
654 ;; We don't do anything with ATTLIST currently
655 (goto-char (match-end 0)))
656
a98e819b
DL
657 ((looking-at "<!--")
658 (search-forward "-->"))
6d12a4df
MH
659 ((looking-at (concat "<!ENTITY[ \t\n\r]*\\(" xml-name-re
660 "\\)[ \t\n\r]*\\(" xml-entity-value-re
661 "\\)[ \t\n\r]*>"))
f6fcdfff
CY
662 (let ((name (match-string-no-properties 1))
663 (value (substring (match-string-no-properties 2) 1
664 (- (length (match-string-no-properties 2)) 1))))
23d519e4 665 (goto-char (match-end 0))
6d12a4df
MH
666 (setq xml-entity-alist
667 (append xml-entity-alist
668 (list (cons name
669 (with-temp-buffer
670 (insert value)
671 (goto-char (point-min))
672 (xml-parse-fragment
673 xml-validating-parser
674 parse-ns))))))))
675 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
676 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
677 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
678 (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
679 "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
680 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
681 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
682 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
683 "[ \t\n\r]*>")))
f6fcdfff
CY
684 (let ((name (match-string-no-properties 1))
685 (file (substring (match-string-no-properties 2) 1
686 (- (length (match-string-no-properties 2)) 1))))
23d519e4 687 (goto-char (match-end 0))
6d12a4df
MH
688 (setq xml-entity-alist
689 (append xml-entity-alist
690 (list (cons name (with-temp-buffer
691 (insert-file-contents file)
692 (goto-char (point-min))
693 (xml-parse-fragment
694 xml-validating-parser
695 parse-ns))))))))
63b446bc
MH
696 ;; skip parameter entity declarations
697 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+%[ \t\n\r]+\\(" xml-name-re
698 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
699 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
700 (looking-at (concat "<!ENTITY[ \t\n\r]+"
701 "%[ \t\n\r]+"
702 "\\(" xml-name-re "\\)[ \t\n\r]+"
703 "PUBLIC[ \t\n\r]+"
704 "\\(\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
705 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'\\)[ \t\n\r]+"
706 "\\(\"[^\"]+\"\\|'[^']+'\\)"
707 "[ \t\n\r]*>")))
708 (goto-char (match-end 0)))
709 ;; skip parameter entities
710 ((looking-at (concat "%" xml-name-re ";"))
711 (goto-char (match-end 0)))
a98e819b 712 (t
7e0f1123 713 (when xml-validating-parser
27720433 714 (error "XML: (Validity) Invalid DTD item"))))))
6d12a4df 715 (if (looking-at "\\s-*]>")
23d519e4 716 (goto-char (match-end 0))))
461f3ad0 717 (nreverse dtd)))
47db06aa
GM
718
719(defun xml-parse-elem-type (string)
a98e819b 720 "Convert element type STRING into a Lisp structure."
47db06aa
GM
721
722 (let (elem modifier)
723 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
724 (progn
f6fcdfff
CY
725 (setq elem (match-string-no-properties 1 string)
726 modifier (match-string-no-properties 2 string))
47db06aa 727 (if (string-match "|" elem)
971489ea 728 (setq elem (cons 'choice
47db06aa
GM
729 (mapcar 'xml-parse-elem-type
730 (split-string elem "|"))))
731 (if (string-match "," elem)
971489ea 732 (setq elem (cons 'seq
47db06aa 733 (mapcar 'xml-parse-elem-type
a98e819b 734 (split-string elem ",")))))))
a158ff81 735 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
f6fcdfff
CY
736 (setq elem (match-string-no-properties 1 string)
737 modifier (match-string-no-properties 2 string))))
47db06aa 738
971489ea
SM
739 (if (and (stringp elem) (string= elem "#PCDATA"))
740 (setq elem 'pcdata))
524425ae 741
971489ea
SM
742 (cond
743 ((string= modifier "+")
744 (list '+ elem))
745 ((string= modifier "*")
746 (list '* elem))
747 ((string= modifier "?")
0fa6f70c 748 (list '\? elem))
971489ea
SM
749 (t
750 elem))))
47db06aa 751
47db06aa
GM
752;;*******************************************************************
753;;**
754;;** Substituting special XML sequences
755;;**
756;;*******************************************************************
757
758(defun xml-substitute-special (string)
a98e819b
DL
759 "Return STRING, after subsituting entity references."
760 ;; This originally made repeated passes through the string from the
761 ;; beginning, which isn't correct, since then either "&amp;amp;" or
762 ;; "&#38;amp;" won't DTRT.
47db06aa 763
6d12a4df
MH
764 (let ((point 0)
765 children end-point)
ae026110 766 (while (string-match "&\\([^;]*\\);" string point)
6d12a4df 767 (setq end-point (match-end 0))
f6fcdfff 768 (let* ((this-part (match-string-no-properties 1 string))
6d12a4df
MH
769 (prev-part (substring string point (match-beginning 0)))
770 (entity (assoc this-part xml-entity-alist))
771 (expansion
772 (cond ((string-match "#\\([0-9]+\\)" this-part)
773 (let ((c (decode-char
774 'ucs
f6fcdfff 775 (string-to-number (match-string-no-properties 1 this-part)))))
6d12a4df
MH
776 (if c (string c))))
777 ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
778 (let ((c (decode-char
779 'ucs
f6fcdfff 780 (string-to-number (match-string-no-properties 1 this-part) 16))))
6d12a4df
MH
781 (if c (string c))))
782 (entity
783 (cdr entity))
ae026110 784 ((eq (length this-part) 0)
27720433 785 (error "XML: (Not Well-Formed) No entity given"))
6d12a4df 786 (t
f8ab034e 787 (if xml-validating-parser
6d12a4df 788 (error "XML: (Validity) Undefined entity `%s'"
f8ab034e
MH
789 this-part)
790 xml-undefined-entity)))))
6d12a4df
MH
791
792 (cond ((null children)
f0d49437
MH
793 ;; FIXME: If we have an entity that expands into XML, this won't work.
794 (setq children
795 (concat prev-part expansion)))
6d12a4df
MH
796 ((stringp children)
797 (if (stringp expansion)
798 (setq children (concat children prev-part expansion))
799 (setq children (list expansion (concat prev-part children)))))
800 ((and (stringp expansion)
801 (stringp (car children)))
802 (setcar children (concat prev-part expansion (car children))))
803 ((stringp expansion)
804 (setq children (append (concat prev-part expansion)
805 children)))
806 ((stringp (car children))
807 (setcar children (concat (car children) prev-part))
808 (setq children (append expansion children)))
809 (t
810 (setq children (list expansion
811 prev-part
812 children))))
813 (setq point end-point)))
814 (cond ((stringp children)
815 (concat children (substring string point)))
816 ((stringp (car (last children)))
a3110b5d 817 (concat (car (last children)) (substring string point)))
6d12a4df
MH
818 ((null children)
819 string)
820 (t
a3110b5d
MH
821 (concat (mapconcat 'identity
822 (nreverse children)
823 "")
824 (substring string point))))))
825
47db06aa
GM
826;;*******************************************************************
827;;**
828;;** Printing a tree.
829;;** This function is intended mainly for debugging purposes.
830;;**
831;;*******************************************************************
832
27240aa4
AS
833(defun xml-debug-print (xml &optional indent-string)
834 "Outputs the XML in the current buffer.
835XML can be a tree or a list of nodes.
836The first line is indented with the optional INDENT-STRING."
837 (setq indent-string (or indent-string ""))
971489ea 838 (dolist (node xml)
27240aa4
AS
839 (xml-debug-print-internal node indent-string)))
840
841(defalias 'xml-print 'xml-debug-print)
47db06aa 842
7731c9f4 843(defun xml-escape-string (string)
a1884c78
MH
844 "Return the string with entity substitutions made from
845xml-entity-alist."
7731c9f4
MH
846 (mapconcat (lambda (byte)
847 (let ((char (char-to-string byte)))
848 (if (rassoc char xml-entity-alist)
849 (concat "&" (car (rassoc char xml-entity-alist)) ";")
850 char)))
76a6127f
MH
851 ;; This differs from the non-unicode branch. Just
852 ;; grabbing the string works here.
853 string ""))
7731c9f4 854
971489ea 855(defun xml-debug-print-internal (xml indent-string)
47db06aa 856 "Outputs the XML tree in the current buffer.
a98e819b 857The first line is indented with INDENT-STRING."
47db06aa
GM
858 (let ((tree xml)
859 attlist)
a98e819b 860 (insert indent-string ?< (symbol-name (xml-node-name tree)))
524425ae 861
47db06aa 862 ;; output the attribute list
971489ea 863 (setq attlist (xml-node-attributes tree))
47db06aa 864 (while attlist
7731c9f4
MH
865 (insert ?\ (symbol-name (caar attlist)) "=\""
866 (xml-escape-string (cdar attlist)) ?\")
971489ea 867 (setq attlist (cdr attlist)))
524425ae 868
971489ea 869 (setq tree (xml-node-children tree))
47db06aa 870
27240aa4
AS
871 (if (null tree)
872 (insert ?/ ?>)
873 (insert ?>)
874
875 ;; output the children
876 (dolist (node tree)
877 (cond
878 ((listp node)
879 (insert ?\n)
880 (xml-debug-print-internal node (concat indent-string " ")))
7731c9f4
MH
881 ((stringp node)
882 (insert (xml-escape-string node)))
27240aa4
AS
883 (t
884 (error "Invalid XML tree"))))
885
886 (when (not (and (null (cdr tree))
887 (stringp (car tree))))
888 (insert ?\n indent-string))
889 (insert ?< ?/ (symbol-name (xml-node-name xml)) ?>))))
47db06aa
GM
890
891(provide 'xml)
892
8a02e193 893;; arch-tag: 5864b283-5a68-4b59-a20d-36a72b353b9b
47db06aa 894;;; xml.el ends here