Fix formatting typos.
[bpt/emacs.git] / lisp / xml.el
CommitLineData
1cd7adc6 1;;; xml.el --- XML parser
47db06aa 2
0d30b337 3;; Copyright (C) 2000, 2001, 2002, 2003, 2004,
d7a0267c 4;; 2005, 2006, 2007 Free Software Foundation, Inc.
47db06aa
GM
5
6;; Author: Emmanuel Briot <briot@gnat.com>
720058f2 7;; Maintainer: Mark A. Hershberger <mah@everybody.org>
a98e819b 8;; Keywords: xml, data
47db06aa
GM
9
10;; This file is part of GNU Emacs.
11
12;; GNU Emacs is free software; you can redistribute it and/or modify
13;; it under the terms of the GNU General Public License as published by
14;; the Free Software Foundation; either version 2, or (at your option)
15;; any later version.
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
23;; along with GNU Emacs; see the file COPYING. If not, write to the
086add15
LK
24;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25;; Boston, MA 02110-1301, USA.
47db06aa
GM
26
27;;; Commentary:
28
a98e819b
DL
29;; This file contains a somewhat incomplete non-validating XML parser. It
30;; parses a file, and returns a list that can be used internally by
a1dfa9a3 31;; any other Lisp libraries.
47db06aa
GM
32
33;;; FILE FORMAT
34
a98e819b
DL
35;; The document type declaration may either be ignored or (optionally)
36;; parsed, but currently the parsing will only accept element
a1dfa9a3 37;; declarations. The XML file is assumed to be well-formed. In case
a98e819b
DL
38;; of error, the parsing stops and the XML file is shown where the
39;; parsing stopped.
47db06aa 40;;
a98e819b 41;; It also knows how to ignore comments and processing instructions.
47db06aa
GM
42;;
43;; The XML file should have the following format:
653558a1
GM
44;; <node1 attr1="name1" attr2="name2" ...>value
45;; <node2 attr3="name3" attr4="name4">value2</node2>
46;; <node3 attr5="name5" attr6="name6">value3</node3>
47db06aa 47;; </node1>
a1dfa9a3 48;; Of course, the name of the nodes and attributes can be anything. There can
47db06aa
GM
49;; be any number of attributes (or none), as well as any number of children
50;; below the nodes.
51;;
52;; There can be only top level node, but with any number of children below.
53
54;;; LIST FORMAT
55
c7f8d055
SM
56;; The functions `xml-parse-file', `xml-parse-region' and
57;; `xml-parse-tag' return a list with the following format:
47db06aa
GM
58;;
59;; xml-list ::= (node node ...)
c7f8d055 60;; node ::= (qname attribute-list . child_node_list)
47db06aa
GM
61;; child_node_list ::= child_node child_node ...
62;; child_node ::= node | string
c7f8d055
SM
63;; qname ::= (:namespace-uri . "name") | "name"
64;; attribute_list ::= ((qname . "value") (qname . "value") ...)
47db06aa
GM
65;; | nil
66;; string ::= "..."
67;;
a98e819b
DL
68;; Some macros are provided to ease the parsing of this list.
69;; Whitespace is preserved. Fixme: There should be a tree-walker that
70;; can remove it.
47db06aa 71
c7f8d055
SM
72;; TODO:
73;; * xml:base, xml:space support
74;; * more complete DOCTYPE parsing
75;; * pi support
76
47db06aa
GM
77;;; Code:
78
a98e819b
DL
79;; Note that {buffer-substring,match-string}-no-properties were
80;; formerly used in several places, but that removes composition info.
81
47db06aa
GM
82;;*******************************************************************
83;;**
84;;** Macros to parse the list
85;;**
86;;*******************************************************************
87
f8ab034e
MH
88(defconst xml-undefined-entity "?"
89 "What to substitute for undefined entities")
90
6d12a4df
MH
91(defvar xml-entity-alist
92 '(("lt" . "<")
93 ("gt" . ">")
94 ("apos" . "'")
95 ("quot" . "\"")
96 ("amp" . "&"))
97 "The defined entities. Entities are added to this when the DTD is parsed.")
98
99(defvar xml-sub-parser nil
100 "Dynamically set this to a non-nil value if you want to parse an XML fragment.")
101
102(defvar xml-validating-parser nil
103 "Set to non-nil to get validity checking.")
104
971489ea 105(defsubst xml-node-name (node)
47db06aa 106 "Return the tag associated with NODE.
a1dfa9a3
SM
107Without namespace-aware parsing, the tag is a symbol.
108
109With namespace-aware parsing, the tag is a cons of a string
110representing the uri of the namespace with the local name of the
111tag. For example,
112
113 <foo>
114
115would be represented by
116
117 '(\"\" . \"foo\")."
118
971489ea 119 (car node))
47db06aa 120
971489ea 121(defsubst xml-node-attributes (node)
47db06aa
GM
122 "Return the list of attributes of NODE.
123The list can be nil."
971489ea 124 (nth 1 node))
47db06aa 125
971489ea 126(defsubst xml-node-children (node)
47db06aa
GM
127 "Return the list of children of NODE.
128This is a list of nodes, and it can be nil."
971489ea 129 (cddr node))
47db06aa
GM
130
131(defun xml-get-children (node child-name)
132 "Return the children of NODE whose tag is CHILD-NAME.
a1dfa9a3 133CHILD-NAME should match the value returned by `xml-node-name'."
971489ea
SM
134 (let ((match ()))
135 (dolist (child (xml-node-children node))
a1dfa9a3
SM
136 (if (and (listp child)
137 (equal (xml-node-name child) child-name))
138 (push child match)))
971489ea 139 (nreverse match)))
47db06aa 140
9bcd6a7e 141(defun xml-get-attribute-or-nil (node attribute)
47db06aa 142 "Get from NODE the value of ATTRIBUTE.
a1dfa9a3 143Return nil if the attribute was not found.
9bcd6a7e
EZ
144
145See also `xml-get-attribute'."
2e9bdf15 146 (cdr (assoc attribute (xml-node-attributes node))))
9bcd6a7e
EZ
147
148(defsubst xml-get-attribute (node attribute)
149 "Get from NODE the value of ATTRIBUTE.
150An empty string is returned if the attribute was not found.
151
152See also `xml-get-attribute-or-nil'."
153 (or (xml-get-attribute-or-nil node attribute) ""))
47db06aa
GM
154
155;;*******************************************************************
156;;**
157;;** Creating the list
158;;**
159;;*******************************************************************
160
a98e819b 161;;;###autoload
2d42509a 162(defun xml-parse-file (file &optional parse-dtd parse-ns)
a98e819b
DL
163 "Parse the well-formed XML file FILE.
164If FILE is already visited, use its buffer and don't kill it.
47db06aa 165Returns the top node with all its children.
2d42509a
JB
166If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
167If PARSE-NS is non-nil, then QNAMES are expanded."
882cb60d
RS
168 (if (get-file-buffer file)
169 (with-current-buffer (get-file-buffer file)
170 (save-excursion
171 (xml-parse-region (point-min)
172 (point-max)
173 (current-buffer)
174 parse-dtd parse-ns)))
175 (with-temp-buffer
176 (insert-file-contents file)
177 (xml-parse-region (point-min)
178 (point-max)
179 (current-buffer)
180 parse-dtd parse-ns))))
47db06aa 181
6d12a4df 182
5ed32352
RS
183(defvar xml-name-re)
184(defvar xml-entity-value-re)
0bf41002 185(defvar xml-att-def-re)
63b446bc 186(let* ((start-chars (concat "[:alpha:]:_"))
6d12a4df 187 (name-chars (concat "-[:digit:]." start-chars))
5178753d 188 ;;[3] S ::= (#x20 | #x9 | #xD | #xA)+
6d12a4df 189 (whitespace "[ \t\n\r]"))
5178753d
MH
190 ;;[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
191 ;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
192 ;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
193 ;; | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
6d12a4df 194 (defvar xml-name-start-char-re (concat "[" start-chars "]"))
5178753d 195 ;;[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
6d12a4df 196 (defvar xml-name-char-re (concat "[" name-chars "]"))
5178753d 197 ;;[5] Name ::= NameStartChar (NameChar)*
6d12a4df 198 (defvar xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
5178753d 199 ;;[6] Names ::= Name (#x20 Name)*
6d12a4df 200 (defvar xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
5178753d 201 ;;[7] Nmtoken ::= (NameChar)+
6d12a4df 202 (defvar xml-nmtoken-re (concat xml-name-char-re "+"))
5178753d 203 ;;[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
6d12a4df 204 (defvar xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
5178753d 205 ;;[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
6d12a4df 206 (defvar xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
5178753d 207 ;;[68] EntityRef ::= '&' Name ';'
6d12a4df 208 (defvar xml-entity-ref (concat "&" xml-name-re ";"))
5178753d 209 ;;[69] PEReference ::= '%' Name ';'
6d12a4df 210 (defvar xml-pe-reference-re (concat "%" xml-name-re ";"))
5178753d 211 ;;[67] Reference ::= EntityRef | CharRef
6d12a4df 212 (defvar xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
5178753d 213 ;;[10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
23d519e4
MH
214 (defvar xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|"
215 "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)"))
5178753d
MH
216 ;;[56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID per Element Type] [VC: ID Attribute Default]
217 ;; | 'IDREF' [VC: IDREF]
218 ;; | 'IDREFS' [VC: IDREF]
219 ;; | 'ENTITY' [VC: Entity Name]
220 ;; | 'ENTITIES' [VC: Entity Name]
221 ;; | 'NMTOKEN' [VC: Name Token]
222 ;; | 'NMTOKENS' [VC: Name Token]
23d519e4 223 (defvar xml-tokenized-type-re "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|ENTITIES\\|NMTOKEN\\|NMTOKENS\\)")
5178753d 224 ;;[58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
23d519e4
MH
225 (defvar xml-notation-type-re (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re
226 "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*" whitespace "*)\\)"))
5178753d 227 ;;[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration] [VC: No Duplicate Tokens]
23d519e4
MH
228 (defvar xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re
229 "\\(?:" whitespace "*|" whitespace "*" xml-nmtoken-re "\\)*"
230 whitespace ")\\)"))
5178753d 231 ;;[57] EnumeratedType ::= NotationType | Enumeration
23d519e4 232 (defvar xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re "\\|" xml-enumeration-re "\\)"))
5178753d
MH
233 ;;[54] AttType ::= StringType | TokenizedType | EnumeratedType
234 ;;[55] StringType ::= 'CDATA'
23d519e4 235 (defvar xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re "\\|" xml-notation-type-re"\\|" xml-enumerated-type-re "\\)"))
5178753d 236 ;;[60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
23d519e4 237 (defvar xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED" whitespace "\\)*" xml-att-value-re "\\)"))
5178753d 238 ;;[53] AttDef ::= S Name S AttType S DefaultDecl
23d519e4
MH
239 (defvar xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re
240 whitespace "*" xml-att-type-re
241 whitespace "*" xml-default-decl-re "\\)"))
5178753d
MH
242 ;;[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
243 ;; | "'" ([^%&'] | PEReference | Reference)* "'"
6d12a4df
MH
244 (defvar xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re
245 "\\|" xml-reference-re "\\)*\"\\|'\\(?:[^%&']\\|"
246 xml-pe-reference-re "\\|" xml-reference-re "\\)*'\\)")))
247;;[75] ExternalID ::= 'SYSTEM' S SystemLiteral
248;; | 'PUBLIC' S PubidLiteral S SystemLiteral
249;;[76] NDataDecl ::= S 'NDATA' S
250;;[73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
251;;[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
252;;[74] PEDef ::= EntityValue | ExternalID
253;;[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
254;;[70] EntityDecl ::= GEDecl | PEDecl
255
a98e819b
DL
256;; Note that this is setup so that we can do whitespace-skipping with
257;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
258;; compared with `re-search-forward', but that has been fixed. Also
259;; note that the standard syntax table contains other characters with
260;; whitespace syntax, like NBSP, but they are invalid in contexts in
261;; which we might skip whitespace -- specifically, they're not
262;; NameChars [XML 4].
263
264(defvar xml-syntax-table
265 (let ((table (make-syntax-table)))
266 ;; Get space syntax correct per XML [3].
267 (dotimes (c 31)
268 (modify-syntax-entry c "." table)) ; all are space in standard table
5178753d 269 (dolist (c '(?\t ?\n ?\r)) ; these should be space
a98e819b
DL
270 (modify-syntax-entry c " " table))
271 ;; For skipping attributes.
272 (modify-syntax-entry ?\" "\"" table)
273 (modify-syntax-entry ?' "\"" table)
274 ;; Non-alnum name chars should be symbol constituents (`-' and `_'
275 ;; are OK by default).
276 (modify-syntax-entry ?. "_" table)
277 (modify-syntax-entry ?: "_" table)
278 ;; XML [89]
aaaa8abb
MH
279 (unless (featurep 'xemacs)
280 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
281 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
282 #x30FD #x30FE))
283 (modify-syntax-entry (decode-char 'ucs c) "w" table)))
a98e819b
DL
284 ;; Fixme: rest of [4]
285 table)
286 "Syntax table used by `xml-parse-region'.")
287
288;; XML [5]
289;; Note that [:alpha:] matches all multibyte chars with word syntax.
ab161457
JPW
290(eval-and-compile
291 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
a98e819b
DL
292
293;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
294;; document ::= prolog element Misc*
295;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
296
297;;;###autoload
2d42509a 298(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
47db06aa
GM
299 "Parse the region from BEG to END in BUFFER.
300If BUFFER is nil, it defaults to the current buffer.
301Returns the XML list for the region, or raises an error if the region
2d42509a 302is not well-formed XML.
47db06aa 303If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
2d42509a
JB
304and returned as the first element of the list.
305If PARSE-NS is non-nil, then QNAMES are expanded."
39d58fc0
MH
306 ;; Use fixed syntax table to ensure regexp char classes and syntax
307 ;; specs DTRT.
308 (with-syntax-table (standard-syntax-table)
309 (let ((case-fold-search nil) ; XML is case-sensitive.
310 xml result dtd)
311 (save-excursion
312 (if buffer
313 (set-buffer buffer))
314 (save-restriction
315 (narrow-to-region beg end)
a98e819b
DL
316 (goto-char (point-min))
317 (while (not (eobp))
318 (if (search-forward "<" nil t)
319 (progn
320 (forward-char -1)
34638996 321 (setq result (xml-parse-tag parse-dtd parse-ns))
6d12a4df 322 (if (and xml result (not xml-sub-parser))
a98e819b 323 ;; translation of rule [1] of XML specifications
6d12a4df 324 (error "XML: (Not Well-Formed) Only one root tag allowed")
47db06aa 325 (cond
971489ea 326 ((null result))
34638996
EZ
327 ((and (listp (car result))
328 parse-dtd)
971489ea 329 (setq dtd (car result))
a98e819b
DL
330 (if (cdr result) ; possible leading comment
331 (add-to-list 'xml (cdr result))))
47db06aa 332 (t
a98e819b
DL
333 (add-to-list 'xml result)))))
334 (goto-char (point-max))))
335 (if parse-dtd
336 (cons dtd (nreverse xml))
337 (nreverse xml)))))))
47db06aa 338
c7f8d055 339(defun xml-maybe-do-ns (name default xml-ns)
a1dfa9a3
SM
340 "Perform any namespace expansion.
341NAME is the name to perform the expansion on.
c7f8d055
SM
342DEFAULT is the default namespace. XML-NS is a cons of namespace
343names to uris. When namespace-aware parsing is off, then XML-NS
344is nil.
345
346During namespace-aware parsing, any name without a namespace is
347put into the namespace identified by DEFAULT. nil is used to
348specify that the name shouldn't be given a namespace."
349 (if (consp xml-ns)
350 (let* ((nsp (string-match ":" name))
351 (lname (if nsp (substring name (match-end 0)) name))
352 (prefix (if nsp (substring name 0 (match-beginning 0)) default))
353 (special (and (string-equal lname "xmlns") (not prefix)))
354 ;; Setting default to nil will insure that there is not
355 ;; matching cons in xml-ns. In which case we
356 (ns (or (cdr (assoc (if special "xmlns" prefix)
357 xml-ns))
6d12a4df 358 "")))
c7f8d055
SM
359 (cons ns (if special "" lname)))
360 (intern name)))
47db06aa 361
6d12a4df
MH
362(defun xml-parse-fragment (&optional parse-dtd parse-ns)
363 "Parse xml-like fragments."
364 (let ((xml-sub-parser t)
365 children)
366 (while (not (eobp))
367 (let ((bit (xml-parse-tag
368 parse-dtd parse-ns)))
369 (if children
370 (setq children (append (list bit) children))
371 (if (stringp bit)
372 (setq children (list bit))
373 (setq children bit)))))
374 (reverse children)))
375
2d42509a 376(defun xml-parse-tag (&optional parse-dtd parse-ns)
a98e819b 377 "Parse the tag at point.
47db06aa
GM
378If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
379returned as the first element in the list.
2d42509a 380If PARSE-NS is non-nil, then QNAMES are expanded.
47db06aa 381Returns one of:
a98e819b
DL
382 - a list : the matching node
383 - nil : the point is not looking at a tag.
384 - a pair : the first element is the DTD, the second is the node."
6d12a4df
MH
385 (let ((xml-validating-parser (or parse-dtd xml-validating-parser))
386 (xml-ns (if (consp parse-ns)
2d42509a
JB
387 parse-ns
388 (if parse-ns
389 (list
5178753d 390 ;; Default for empty prefix is no namespace
6d12a4df 391 (cons "" "")
c7f8d055 392 ;; "xml" namespace
6d12a4df 393 (cons "xml" "http://www.w3.org/XML/1998/namespace")
2d42509a 394 ;; We need to seed the xmlns namespace
6d12a4df 395 (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
2d42509a
JB
396 (cond
397 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
398 ;; beginning of a document).
399 ((looking-at "<\\?")
400 (search-forward "?>")
401 (skip-syntax-forward " ")
402 (xml-parse-tag parse-dtd xml-ns))
403 ;; Character data (CDATA) sections, in which no tag should be interpreted
404 ((looking-at "<!\\[CDATA\\[")
405 (let ((pos (match-end 0)))
406 (unless (search-forward "]]>" nil t)
6d12a4df 407 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
ae026110
MH
408 (concat
409 (buffer-substring pos (match-beginning 0))
410 (xml-parse-string))))
2d42509a
JB
411 ;; DTD for the document
412 ((looking-at "<!DOCTYPE")
6d12a4df
MH
413 (let ((dtd (xml-parse-dtd parse-ns)))
414 (skip-syntax-forward " ")
415 (if xml-validating-parser
416 (cons dtd (xml-parse-tag nil xml-ns))
417 (xml-parse-tag nil xml-ns))))
2d42509a
JB
418 ;; skip comments
419 ((looking-at "<!--")
420 (search-forward "-->")
421 nil)
422 ;; end tag
423 ((looking-at "</")
424 '())
425 ;; opening tag
426 ((looking-at "<\\([^/>[:space:]]+\\)")
427 (goto-char (match-end 1))
34638996
EZ
428
429 ;; Parse this node
2d42509a 430 (let* ((node-name (match-string 1))
5178753d
MH
431 ;; Parse the attribute list.
432 (attrs (xml-parse-attlist xml-ns))
433 children pos)
c7f8d055 434
5178753d
MH
435 ;; add the xmlns:* attrs to our cache
436 (when (consp xml-ns)
c7f8d055
SM
437 (dolist (attr attrs)
438 (when (and (consp (car attr))
6d12a4df
MH
439 (equal "http://www.w3.org/2000/xmlns/"
440 (caar attr)))
441 (push (cons (cdar attr) (cdr attr))
c7f8d055
SM
442 xml-ns))))
443
5178753d 444 (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
c7f8d055 445
2d42509a
JB
446 ;; is this an empty element ?
447 (if (looking-at "/>")
47db06aa 448 (progn
6d12a4df 449 (forward-char 2)
971489ea 450 (nreverse children))
6d12a4df
MH
451
452 ;; is this a valid start tag ?
453 (if (eq (char-after) ?>)
454 (progn
455 (forward-char 1)
456 ;; Now check that we have the right end-tag. Note that this
457 ;; one might contain spaces after the tag name
458 (let ((end (concat "</" node-name "\\s-*>")))
459 (while (not (looking-at end))
460 (cond
461 ((looking-at "</")
462 (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
463 node-name (point)))
464 ((= (char-after) ?<)
465 (let ((tag (xml-parse-tag nil xml-ns)))
466 (when tag
467 (push tag children))))
468 (t
469 (let ((expansion (xml-parse-string)))
470 (setq children
471 (if (stringp expansion)
472 (if (stringp (car children))
473 ;; The two strings were separated by a comment.
aaaa8abb 474 (setq children (append (list (concat (car children) expansion))
6d12a4df
MH
475 (cdr children)))
476 (setq children (append (list expansion) children)))
477 (setq children (append expansion children))))))))
478
479 (goto-char (match-end 0))
480 (nreverse children)))
481 ;; This was an invalid start tag (Expected ">", but didn't see it.)
482 (error "XML: (Well-Formed) Couldn't parse tag: %s"
483 (buffer-substring (- (point) 10) (+ (point) 1)))))))
484 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
485 (unless xml-sub-parser ; Usually, we error out.
486 (error "XML: (Well-Formed) Invalid character"))
487
488 ;; However, if we're parsing incrementally, then we need to deal
489 ;; with stray CDATA.
490 (xml-parse-string)))))
491
492(defun xml-parse-string ()
493 "Parse the next whatever. Could be a string, or an element."
5178753d
MH
494 (let* ((pos (point))
495 (string (progn (if (search-forward "<" nil t)
496 (forward-char -1)
497 (goto-char (point-max)))
498 (buffer-substring pos (point)))))
499 ;; Clean up the string. As per XML specifications, the XML
500 ;; processor should always pass the whole string to the
501 ;; application. But \r's should be replaced:
502 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
503 (setq pos 0)
504 (while (string-match "\r\n?" string pos)
505 (setq string (replace-match "\n" t t string))
506 (setq pos (1+ (match-beginning 0))))
507
508 (xml-substitute-special string)))
47db06aa 509
c7f8d055 510(defun xml-parse-attlist (&optional xml-ns)
a1dfa9a3
SM
511 "Return the attribute-list after point.
512Leave point at the first non-blank character after the tag."
971489ea 513 (let ((attlist ())
34638996 514 end-pos name)
a98e819b
DL
515 (skip-syntax-forward " ")
516 (while (looking-at (eval-when-compile
517 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
c7f8d055
SM
518 (setq end-pos (match-end 0))
519 (setq name (xml-maybe-do-ns (match-string 1) nil xml-ns))
520 (goto-char end-pos)
47db06aa 521
a158ff81
JB
522 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
523
47db06aa
GM
524 ;; Do we have a string between quotes (or double-quotes),
525 ;; or a simple word ?
a158ff81 526 (if (looking-at "\"\\([^\"]*\\)\"")
34638996 527 (setq end-pos (match-end 0))
f0ec1711 528 (if (looking-at "'\\([^']*\\)'")
34638996 529 (setq end-pos (match-end 0))
6d12a4df 530 (error "XML: (Not Well-Formed) Attribute values must be given between quotes")))
47db06aa
GM
531
532 ;; Each attribute must be unique within a given element
533 (if (assoc name attlist)
6d12a4df 534 (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
524425ae 535
a158ff81
JB
536 ;; Multiple whitespace characters should be replaced with a single one
537 ;; in the attributes
a98e819b 538 (let ((string (match-string 1))
a158ff81 539 (pos 0))
a98e819b 540 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
6d12a4df
MH
541 (let ((expansion (xml-substitute-special string)))
542 (unless (stringp expansion)
5178753d
MH
543 ; We say this is the constraint. It is acctually that
544 ; external entities nor "<" can be in an attribute value.
6d12a4df
MH
545 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
546 (push (cons name expansion) attlist)))
a158ff81 547
34638996 548 (goto-char end-pos)
a98e819b 549 (skip-syntax-forward " "))
971489ea 550 (nreverse attlist)))
47db06aa
GM
551
552;;*******************************************************************
553;;**
554;;** The DTD (document type declaration)
555;;** The following functions know how to skip or parse the DTD of
556;;** a document
557;;**
558;;*******************************************************************
559
a98e819b
DL
560;; Fixme: This fails at least if the DTD contains conditional sections.
561
562(defun xml-skip-dtd ()
563 "Skip the DTD at point.
47db06aa 564This follows the rule [28] in the XML specifications."
6d12a4df
MH
565 (let ((xml-validating-parser nil))
566 (xml-parse-dtd)))
47db06aa 567
6d12a4df 568(defun xml-parse-dtd (&optional parse-ns)
a98e819b
DL
569 "Parse the DTD at point."
570 (forward-char (eval-when-compile (length "<!DOCTYPE")))
571 (skip-syntax-forward " ")
6d12a4df
MH
572 (if (and (looking-at ">")
573 xml-validating-parser)
574 (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
524425ae 575
971489ea 576 ;; Get the name of the document
a98e819b
DL
577 (looking-at xml-name-regexp)
578 (let ((dtd (list (match-string 0) 'dtd))
971489ea 579 type element end-pos)
47db06aa
GM
580 (goto-char (match-end 0))
581
a98e819b
DL
582 (skip-syntax-forward " ")
583 ;; XML [75]
584 (cond ((looking-at "PUBLIC\\s-+")
585 (goto-char (match-end 0))
586 (unless (or (re-search-forward
587 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\""
588 nil t)
589 (re-search-forward
590 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
591 nil t))
6d12a4df 592 (error "XML: Missing Public ID"))
a98e819b 593 (let ((pubid (match-string 1)))
6d12a4df 594 (skip-syntax-forward " ")
a98e819b
DL
595 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
596 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 597 (error "XML: Missing System ID"))
a98e819b
DL
598 (push (list pubid (match-string 1) 'public) dtd)))
599 ((looking-at "SYSTEM\\s-+")
600 (goto-char (match-end 0))
601 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
602 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 603 (error "XML: Missing System ID"))
a98e819b
DL
604 (push (list (match-string 1) 'system) dtd)))
605 (skip-syntax-forward " ")
606 (if (eq ?> (char-after))
607 (forward-char)
a98e819b 608 (if (not (eq (char-after) ?\[))
6d12a4df 609 (error "XML: Bad DTD")
a98e819b
DL
610 (forward-char)
611 ;; Parse the rest of the DTD
23d519e4 612 ;; Fixme: Deal with NOTATION, PIs.
a98e819b
DL
613 (while (not (looking-at "\\s-*\\]"))
614 (skip-syntax-forward " ")
615 (cond
616
617 ;; Translation of rule [45] of XML specifications
618 ((looking-at
619 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
620
34638996 621 (setq element (match-string 1)
a98e819b
DL
622 type (match-string-no-properties 2))
623 (setq end-pos (match-end 0))
624
625 ;; Translation of rule [46] of XML specifications
626 (cond
627 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
628 (setq type 'empty))
629 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
630 (setq type 'any))
631 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
632 (setq type (xml-parse-elem-type (match-string 1 type))))
633 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
634 nil)
635 (t
6d12a4df 636 (if xml-validating-parser
07f7e9ae 637 (error "XML: (Validity) Invalid element type in the DTD"))))
27720433 638
a98e819b 639 ;; rule [45]: the element declaration must be unique
6d12a4df
MH
640 (if (and (assoc element dtd)
641 xml-validating-parser)
642 (error "XML: (Validity) Element declarations must be unique in a DTD (<%s>)"
461f3ad0 643 element))
a98e819b
DL
644
645 ;; Store the element in the DTD
646 (push (list element type) dtd)
647 (goto-char end-pos))
23d519e4
MH
648
649 ;; Translation of rule [52] of XML specifications
650 ((looking-at (concat "<!ATTLIST[ \t\n\r]*\\(" xml-name-re
651 "\\)[ \t\n\r]*\\(" xml-att-def-re
652 "\\)*[ \t\n\r]*>"))
653
654 ;; We don't do anything with ATTLIST currently
655 (goto-char (match-end 0)))
656
a98e819b
DL
657 ((looking-at "<!--")
658 (search-forward "-->"))
6d12a4df
MH
659 ((looking-at (concat "<!ENTITY[ \t\n\r]*\\(" xml-name-re
660 "\\)[ \t\n\r]*\\(" xml-entity-value-re
661 "\\)[ \t\n\r]*>"))
23d519e4
MH
662 (let ((name (match-string 1))
663 (value (substring (match-string 2) 1
664 (- (length (match-string 2)) 1))))
665 (goto-char (match-end 0))
6d12a4df
MH
666 (setq xml-entity-alist
667 (append xml-entity-alist
668 (list (cons name
669 (with-temp-buffer
670 (insert value)
671 (goto-char (point-min))
672 (xml-parse-fragment
673 xml-validating-parser
674 parse-ns))))))))
675 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
676 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
677 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
678 (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
679 "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
680 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
681 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
682 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
683 "[ \t\n\r]*>")))
23d519e4
MH
684 (let ((name (match-string 1))
685 (file (substring (match-string 2) 1
686 (- (length (match-string 2)) 1))))
687 (goto-char (match-end 0))
6d12a4df
MH
688 (setq xml-entity-alist
689 (append xml-entity-alist
690 (list (cons name (with-temp-buffer
691 (insert-file-contents file)
692 (goto-char (point-min))
693 (xml-parse-fragment
694 xml-validating-parser
695 parse-ns))))))))
63b446bc
MH
696 ;; skip parameter entity declarations
697 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+%[ \t\n\r]+\\(" xml-name-re
698 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
699 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
700 (looking-at (concat "<!ENTITY[ \t\n\r]+"
701 "%[ \t\n\r]+"
702 "\\(" xml-name-re "\\)[ \t\n\r]+"
703 "PUBLIC[ \t\n\r]+"
704 "\\(\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
705 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'\\)[ \t\n\r]+"
706 "\\(\"[^\"]+\"\\|'[^']+'\\)"
707 "[ \t\n\r]*>")))
708 (goto-char (match-end 0)))
709 ;; skip parameter entities
710 ((looking-at (concat "%" xml-name-re ";"))
711 (goto-char (match-end 0)))
a98e819b 712 (t
7e0f1123 713 (when xml-validating-parser
27720433 714 (error "XML: (Validity) Invalid DTD item"))))))
6d12a4df 715 (if (looking-at "\\s-*]>")
23d519e4 716 (goto-char (match-end 0))))
461f3ad0 717 (nreverse dtd)))
47db06aa
GM
718
719(defun xml-parse-elem-type (string)
a98e819b 720 "Convert element type STRING into a Lisp structure."
47db06aa
GM
721
722 (let (elem modifier)
723 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
724 (progn
725 (setq elem (match-string 1 string)
726 modifier (match-string 2 string))
727 (if (string-match "|" elem)
971489ea 728 (setq elem (cons 'choice
47db06aa
GM
729 (mapcar 'xml-parse-elem-type
730 (split-string elem "|"))))
731 (if (string-match "," elem)
971489ea 732 (setq elem (cons 'seq
47db06aa 733 (mapcar 'xml-parse-elem-type
a98e819b 734 (split-string elem ",")))))))
a158ff81
JB
735 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
736 (setq elem (match-string 1 string)
47db06aa
GM
737 modifier (match-string 2 string))))
738
971489ea
SM
739 (if (and (stringp elem) (string= elem "#PCDATA"))
740 (setq elem 'pcdata))
524425ae 741
971489ea
SM
742 (cond
743 ((string= modifier "+")
744 (list '+ elem))
745 ((string= modifier "*")
746 (list '* elem))
747 ((string= modifier "?")
0fa6f70c 748 (list '\? elem))
971489ea
SM
749 (t
750 elem))))
47db06aa 751
47db06aa
GM
752;;*******************************************************************
753;;**
754;;** Substituting special XML sequences
755;;**
756;;*******************************************************************
757
758(defun xml-substitute-special (string)
a98e819b
DL
759 "Return STRING, after subsituting entity references."
760 ;; This originally made repeated passes through the string from the
761 ;; beginning, which isn't correct, since then either "&amp;amp;" or
762 ;; "&#38;amp;" won't DTRT.
47db06aa 763
6d12a4df
MH
764 (let ((point 0)
765 children end-point)
ae026110 766 (while (string-match "&\\([^;]*\\);" string point)
6d12a4df
MH
767 (setq end-point (match-end 0))
768 (let* ((this-part (match-string 1 string))
769 (prev-part (substring string point (match-beginning 0)))
770 (entity (assoc this-part xml-entity-alist))
771 (expansion
772 (cond ((string-match "#\\([0-9]+\\)" this-part)
773 (let ((c (decode-char
774 'ucs
775 (string-to-number (match-string 1 this-part)))))
776 (if c (string c))))
777 ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
778 (let ((c (decode-char
779 'ucs
780 (string-to-number (match-string 1 this-part) 16))))
781 (if c (string c))))
782 (entity
783 (cdr entity))
ae026110 784 ((eq (length this-part) 0)
27720433 785 (error "XML: (Not Well-Formed) No entity given"))
6d12a4df 786 (t
f8ab034e 787 (if xml-validating-parser
6d12a4df 788 (error "XML: (Validity) Undefined entity `%s'"
f8ab034e
MH
789 this-part)
790 xml-undefined-entity)))))
6d12a4df
MH
791
792 (cond ((null children)
f0d49437
MH
793 ;; FIXME: If we have an entity that expands into XML, this won't work.
794 (setq children
795 (concat prev-part expansion)))
6d12a4df
MH
796 ((stringp children)
797 (if (stringp expansion)
798 (setq children (concat children prev-part expansion))
799 (setq children (list expansion (concat prev-part children)))))
800 ((and (stringp expansion)
801 (stringp (car children)))
802 (setcar children (concat prev-part expansion (car children))))
803 ((stringp expansion)
804 (setq children (append (concat prev-part expansion)
805 children)))
806 ((stringp (car children))
807 (setcar children (concat (car children) prev-part))
808 (setq children (append expansion children)))
809 (t
810 (setq children (list expansion
811 prev-part
812 children))))
813 (setq point end-point)))
814 (cond ((stringp children)
815 (concat children (substring string point)))
816 ((stringp (car (last children)))
a3110b5d 817 (concat (car (last children)) (substring string point)))
6d12a4df
MH
818 ((null children)
819 string)
820 (t
a3110b5d
MH
821 (concat (mapconcat 'identity
822 (nreverse children)
823 "")
824 (substring string point))))))
825
47db06aa
GM
826;;*******************************************************************
827;;**
828;;** Printing a tree.
829;;** This function is intended mainly for debugging purposes.
830;;**
831;;*******************************************************************
832
27240aa4
AS
833(defun xml-debug-print (xml &optional indent-string)
834 "Outputs the XML in the current buffer.
835XML can be a tree or a list of nodes.
836The first line is indented with the optional INDENT-STRING."
837 (setq indent-string (or indent-string ""))
971489ea 838 (dolist (node xml)
27240aa4
AS
839 (xml-debug-print-internal node indent-string)))
840
841(defalias 'xml-print 'xml-debug-print)
47db06aa 842
971489ea 843(defun xml-debug-print-internal (xml indent-string)
47db06aa 844 "Outputs the XML tree in the current buffer.
a98e819b 845The first line is indented with INDENT-STRING."
47db06aa
GM
846 (let ((tree xml)
847 attlist)
a98e819b 848 (insert indent-string ?< (symbol-name (xml-node-name tree)))
524425ae 849
47db06aa 850 ;; output the attribute list
971489ea 851 (setq attlist (xml-node-attributes tree))
47db06aa 852 (while attlist
a98e819b 853 (insert ?\ (symbol-name (caar attlist)) "=\"" (cdar attlist) ?\")
971489ea 854 (setq attlist (cdr attlist)))
524425ae 855
971489ea 856 (setq tree (xml-node-children tree))
47db06aa 857
27240aa4
AS
858 (if (null tree)
859 (insert ?/ ?>)
860 (insert ?>)
861
862 ;; output the children
863 (dolist (node tree)
864 (cond
865 ((listp node)
866 (insert ?\n)
867 (xml-debug-print-internal node (concat indent-string " ")))
868 ((stringp node) (insert node))
869 (t
870 (error "Invalid XML tree"))))
871
872 (when (not (and (null (cdr tree))
873 (stringp (car tree))))
874 (insert ?\n indent-string))
875 (insert ?< ?/ (symbol-name (xml-node-name xml)) ?>))))
47db06aa
GM
876
877(provide 'xml)
878
8a02e193 879;; arch-tag: 5864b283-5a68-4b59-a20d-36a72b353b9b
47db06aa 880;;; xml.el ends here