Fix use of eval-and-compile in regexps in xml.el.
[bpt/emacs.git] / lisp / xml.el
CommitLineData
1cd7adc6 1;;; xml.el --- XML parser
47db06aa 2
acaf905b 3;; Copyright (C) 2000-2012 Free Software Foundation, Inc.
47db06aa
GM
4
5;; Author: Emmanuel Briot <briot@gnat.com>
720058f2 6;; Maintainer: Mark A. Hershberger <mah@everybody.org>
a98e819b 7;; Keywords: xml, data
47db06aa
GM
8
9;; This file is part of GNU Emacs.
10
eb3fa2cf 11;; GNU Emacs is free software: you can redistribute it and/or modify
47db06aa 12;; it under the terms of the GNU General Public License as published by
eb3fa2cf
GM
13;; the Free Software Foundation, either version 3 of the License, or
14;; (at your option) any later version.
47db06aa
GM
15
16;; GNU Emacs is distributed in the hope that it will be useful,
17;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19;; GNU General Public License for more details.
20
21;; You should have received a copy of the GNU General Public License
eb3fa2cf 22;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
47db06aa
GM
23
24;;; Commentary:
25
a98e819b
DL
26;; This file contains a somewhat incomplete non-validating XML parser. It
27;; parses a file, and returns a list that can be used internally by
a1dfa9a3 28;; any other Lisp libraries.
47db06aa
GM
29
30;;; FILE FORMAT
31
a98e819b
DL
32;; The document type declaration may either be ignored or (optionally)
33;; parsed, but currently the parsing will only accept element
a1dfa9a3 34;; declarations. The XML file is assumed to be well-formed. In case
a98e819b
DL
35;; of error, the parsing stops and the XML file is shown where the
36;; parsing stopped.
47db06aa 37;;
a98e819b 38;; It also knows how to ignore comments and processing instructions.
47db06aa
GM
39;;
40;; The XML file should have the following format:
653558a1
GM
41;; <node1 attr1="name1" attr2="name2" ...>value
42;; <node2 attr3="name3" attr4="name4">value2</node2>
43;; <node3 attr5="name5" attr6="name6">value3</node3>
47db06aa 44;; </node1>
a1dfa9a3 45;; Of course, the name of the nodes and attributes can be anything. There can
47db06aa
GM
46;; be any number of attributes (or none), as well as any number of children
47;; below the nodes.
48;;
49;; There can be only top level node, but with any number of children below.
50
51;;; LIST FORMAT
52
c7f8d055
SM
53;; The functions `xml-parse-file', `xml-parse-region' and
54;; `xml-parse-tag' return a list with the following format:
47db06aa
GM
55;;
56;; xml-list ::= (node node ...)
c7f8d055 57;; node ::= (qname attribute-list . child_node_list)
47db06aa
GM
58;; child_node_list ::= child_node child_node ...
59;; child_node ::= node | string
c7f8d055
SM
60;; qname ::= (:namespace-uri . "name") | "name"
61;; attribute_list ::= ((qname . "value") (qname . "value") ...)
47db06aa
GM
62;; | nil
63;; string ::= "..."
64;;
a98e819b
DL
65;; Some macros are provided to ease the parsing of this list.
66;; Whitespace is preserved. Fixme: There should be a tree-walker that
67;; can remove it.
47db06aa 68
c7f8d055
SM
69;; TODO:
70;; * xml:base, xml:space support
71;; * more complete DOCTYPE parsing
72;; * pi support
73
47db06aa
GM
74;;; Code:
75
f6fcdfff
CY
76;; Note that buffer-substring and match-string were formerly used in
77;; several places, because the -no-properties variants remove
78;; composition info. However, after some discussion on emacs-devel,
79;; the consensus was that the speed of the -no-properties variants was
80;; a worthwhile tradeoff especially since we're usually parsing files
81;; instead of hand-crafted XML.
a98e819b 82
47db06aa
GM
83;;*******************************************************************
84;;**
85;;** Macros to parse the list
86;;**
87;;*******************************************************************
88
f8ab034e
MH
89(defconst xml-undefined-entity "?"
90 "What to substitute for undefined entities")
91
6d12a4df
MH
92(defvar xml-entity-alist
93 '(("lt" . "<")
94 ("gt" . ">")
95 ("apos" . "'")
96 ("quot" . "\"")
97 ("amp" . "&"))
7f3fbd5d
CY
98 "Alist of defined XML entities.")
99
100(defvar xml-parameter-entity-alist nil
101 "Alist of defined XML parametric entities.")
6d12a4df
MH
102
103(defvar xml-sub-parser nil
7f3fbd5d 104 "Non-nil when the XML parser is parsing an XML fragment.")
6d12a4df
MH
105
106(defvar xml-validating-parser nil
107 "Set to non-nil to get validity checking.")
108
971489ea 109(defsubst xml-node-name (node)
47db06aa 110 "Return the tag associated with NODE.
a1dfa9a3
SM
111Without namespace-aware parsing, the tag is a symbol.
112
113With namespace-aware parsing, the tag is a cons of a string
114representing the uri of the namespace with the local name of the
115tag. For example,
116
117 <foo>
118
119would be represented by
120
121 '(\"\" . \"foo\")."
122
971489ea 123 (car node))
47db06aa 124
971489ea 125(defsubst xml-node-attributes (node)
47db06aa
GM
126 "Return the list of attributes of NODE.
127The list can be nil."
971489ea 128 (nth 1 node))
47db06aa 129
971489ea 130(defsubst xml-node-children (node)
47db06aa
GM
131 "Return the list of children of NODE.
132This is a list of nodes, and it can be nil."
971489ea 133 (cddr node))
47db06aa
GM
134
135(defun xml-get-children (node child-name)
136 "Return the children of NODE whose tag is CHILD-NAME.
a1dfa9a3 137CHILD-NAME should match the value returned by `xml-node-name'."
971489ea
SM
138 (let ((match ()))
139 (dolist (child (xml-node-children node))
a1dfa9a3
SM
140 (if (and (listp child)
141 (equal (xml-node-name child) child-name))
142 (push child match)))
971489ea 143 (nreverse match)))
47db06aa 144
9bcd6a7e 145(defun xml-get-attribute-or-nil (node attribute)
47db06aa 146 "Get from NODE the value of ATTRIBUTE.
a1dfa9a3 147Return nil if the attribute was not found.
9bcd6a7e
EZ
148
149See also `xml-get-attribute'."
2e9bdf15 150 (cdr (assoc attribute (xml-node-attributes node))))
9bcd6a7e
EZ
151
152(defsubst xml-get-attribute (node attribute)
153 "Get from NODE the value of ATTRIBUTE.
154An empty string is returned if the attribute was not found.
155
156See also `xml-get-attribute-or-nil'."
157 (or (xml-get-attribute-or-nil node attribute) ""))
47db06aa
GM
158
159;;*******************************************************************
160;;**
161;;** Creating the list
162;;**
163;;*******************************************************************
164
a98e819b 165;;;###autoload
2d42509a 166(defun xml-parse-file (file &optional parse-dtd parse-ns)
a98e819b
DL
167 "Parse the well-formed XML file FILE.
168If FILE is already visited, use its buffer and don't kill it.
47db06aa 169Returns the top node with all its children.
2d42509a
JB
170If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
171If PARSE-NS is non-nil, then QNAMES are expanded."
882cb60d
RS
172 (if (get-file-buffer file)
173 (with-current-buffer (get-file-buffer file)
174 (save-excursion
175 (xml-parse-region (point-min)
176 (point-max)
177 (current-buffer)
178 parse-dtd parse-ns)))
179 (with-temp-buffer
180 (insert-file-contents file)
181 (xml-parse-region (point-min)
182 (point-max)
183 (current-buffer)
184 parse-dtd parse-ns))))
47db06aa 185
b3218de1 186(eval-and-compile
63b446bc 187(let* ((start-chars (concat "[:alpha:]:_"))
6d12a4df 188 (name-chars (concat "-[:digit:]." start-chars))
b3218de1 189 ;;[3] S ::= (#x20 | #x9 | #xD | #xA)+
6d12a4df 190 (whitespace "[ \t\n\r]"))
b3218de1
CY
191 ;; [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
192 ;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
193 ;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
194 ;; | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
195 ;; | [#x10000-#xEFFFF]
196 (defconst xml-name-start-char-re (concat "[" start-chars "]"))
197 ;; [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7
198 ;; | [#x0300-#x036F] | [#x203F-#x2040]
199 (defconst xml-name-char-re (concat "[" name-chars "]"))
200 ;; [5] Name ::= NameStartChar (NameChar)*
201 (defconst xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
202 ;; [6] Names ::= Name (#x20 Name)*
203 (defconst xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
204 ;; [7] Nmtoken ::= (NameChar)+
205 (defconst xml-nmtoken-re (concat xml-name-char-re "+"))
206 ;; [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
207 (defconst xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
208 ;; [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
209 (defconst xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
210 ;; [68] EntityRef ::= '&' Name ';'
211 (defconst xml-entity-ref (concat "&" xml-name-re ";"))
212 ;; [69] PEReference ::= '%' Name ';'
213 (defconst xml-pe-reference-re (concat "%" xml-name-re ";"))
214 ;; [67] Reference ::= EntityRef | CharRef
215 (defconst xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
216 ;; [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
217 (defconst xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|"
218 "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)"))
219 ;; [56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID / Element Type] [VC: ID Attribute Default]
220 ;; | 'IDREF' [VC: IDREF]
221 ;; | 'IDREFS' [VC: IDREF]
222 ;; | 'ENTITY' [VC: Entity Name]
223 ;; | 'ENTITIES' [VC: Entity Name]
224 ;; | 'NMTOKEN' [VC: Name Token]
225 ;; | 'NMTOKENS' [VC: Name Token]
226 (defconst xml-tokenized-type-re (concat "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|"
227 "ENTITIES\\|NMTOKEN\\|NMTOKENS\\)"))
228 ;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
229 (defconst xml-notation-type-re
230 (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re
231 "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*"
232 whitespace "*)\\)"))
233 ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
234 ;; [VC: Enumeration] [VC: No Duplicate Tokens]
235 (defconst xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re
236 "\\(?:" whitespace "*|" whitespace "*"
237 xml-nmtoken-re "\\)*"
23d519e4 238 whitespace ")\\)"))
b3218de1
CY
239 ;; [57] EnumeratedType ::= NotationType | Enumeration
240 (defconst xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re
241 "\\|" xml-enumeration-re "\\)"))
242 ;; [54] AttType ::= StringType | TokenizedType | EnumeratedType
243 ;; [55] StringType ::= 'CDATA'
244 (defconst xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re
245 "\\|" xml-notation-type-re
246 "\\|" xml-enumerated-type-re "\\)"))
247 ;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
248 (defconst xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED"
249 whitespace "\\)*" xml-att-value-re "\\)"))
250 ;; [53] AttDef ::= S Name S AttType S DefaultDecl
251 (defconst xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re
252 whitespace "*" xml-att-type-re
253 whitespace "*" xml-default-decl-re "\\)"))
254 ;; [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
255 ;; | "'" ([^%&'] | PEReference | Reference)* "'"
256 (defconst xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re
257 "\\|" xml-reference-re
258 "\\)*\"\\|'\\(?:[^%&']\\|"
259 xml-pe-reference-re "\\|"
260 xml-reference-re "\\)*'\\)"))))
261
262;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
263;; | 'PUBLIC' S PubidLiteral S SystemLiteral
264;; [76] NDataDecl ::= S 'NDATA' S
265;; [73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
266;; [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
267;; [74] PEDef ::= EntityValue | ExternalID
268;; [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
269;; [70] EntityDecl ::= GEDecl | PEDecl
6d12a4df 270
a98e819b
DL
271;; Note that this is setup so that we can do whitespace-skipping with
272;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
273;; compared with `re-search-forward', but that has been fixed. Also
274;; note that the standard syntax table contains other characters with
275;; whitespace syntax, like NBSP, but they are invalid in contexts in
276;; which we might skip whitespace -- specifically, they're not
277;; NameChars [XML 4].
278
279(defvar xml-syntax-table
280 (let ((table (make-syntax-table)))
281 ;; Get space syntax correct per XML [3].
282 (dotimes (c 31)
283 (modify-syntax-entry c "." table)) ; all are space in standard table
5178753d 284 (dolist (c '(?\t ?\n ?\r)) ; these should be space
a98e819b
DL
285 (modify-syntax-entry c " " table))
286 ;; For skipping attributes.
287 (modify-syntax-entry ?\" "\"" table)
288 (modify-syntax-entry ?' "\"" table)
289 ;; Non-alnum name chars should be symbol constituents (`-' and `_'
290 ;; are OK by default).
291 (modify-syntax-entry ?. "_" table)
292 (modify-syntax-entry ?: "_" table)
293 ;; XML [89]
aaaa8abb
MH
294 (unless (featurep 'xemacs)
295 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
296 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
297 #x30FD #x30FE))
298 (modify-syntax-entry (decode-char 'ucs c) "w" table)))
a98e819b
DL
299 ;; Fixme: rest of [4]
300 table)
301 "Syntax table used by `xml-parse-region'.")
302
303;; XML [5]
304;; Note that [:alpha:] matches all multibyte chars with word syntax.
ab161457
JPW
305(eval-and-compile
306 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
a98e819b
DL
307
308;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
309;; document ::= prolog element Misc*
310;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
311
312;;;###autoload
2d42509a 313(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
47db06aa
GM
314 "Parse the region from BEG to END in BUFFER.
315If BUFFER is nil, it defaults to the current buffer.
316Returns the XML list for the region, or raises an error if the region
2d42509a 317is not well-formed XML.
47db06aa 318If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
2d42509a
JB
319and returned as the first element of the list.
320If PARSE-NS is non-nil, then QNAMES are expanded."
39d58fc0
MH
321 ;; Use fixed syntax table to ensure regexp char classes and syntax
322 ;; specs DTRT.
323 (with-syntax-table (standard-syntax-table)
324 (let ((case-fold-search nil) ; XML is case-sensitive.
7f3fbd5d
CY
325 ;; Prevent entity definitions from changing the defaults
326 (xml-entity-alist xml-entity-alist)
327 (xml-parameter-entity-alist xml-parameter-entity-alist)
39d58fc0
MH
328 xml result dtd)
329 (save-excursion
330 (if buffer
331 (set-buffer buffer))
332 (save-restriction
333 (narrow-to-region beg end)
a98e819b
DL
334 (goto-char (point-min))
335 (while (not (eobp))
336 (if (search-forward "<" nil t)
337 (progn
338 (forward-char -1)
34638996 339 (setq result (xml-parse-tag parse-dtd parse-ns))
01b229d1
CY
340 (cond
341 ((null result)
342 ;; Not looking at an xml start tag.
18edb22d
CY
343 (unless (eobp)
344 (forward-char 1)))
01b229d1
CY
345 ((and xml (not xml-sub-parser))
346 ;; Translation of rule [1] of XML specifications
347 (error "XML: (Not Well-Formed) Only one root tag allowed"))
348 ((and (listp (car result))
349 parse-dtd)
350 (setq dtd (car result))
351 (if (cdr result) ; possible leading comment
352 (add-to-list 'xml (cdr result))))
353 (t
354 (add-to-list 'xml result))))
a98e819b
DL
355 (goto-char (point-max))))
356 (if parse-dtd
357 (cons dtd (nreverse xml))
358 (nreverse xml)))))))
47db06aa 359
c7f8d055 360(defun xml-maybe-do-ns (name default xml-ns)
a1dfa9a3
SM
361 "Perform any namespace expansion.
362NAME is the name to perform the expansion on.
c7f8d055
SM
363DEFAULT is the default namespace. XML-NS is a cons of namespace
364names to uris. When namespace-aware parsing is off, then XML-NS
365is nil.
366
367During namespace-aware parsing, any name without a namespace is
368put into the namespace identified by DEFAULT. nil is used to
369specify that the name shouldn't be given a namespace."
370 (if (consp xml-ns)
371 (let* ((nsp (string-match ":" name))
372 (lname (if nsp (substring name (match-end 0)) name))
373 (prefix (if nsp (substring name 0 (match-beginning 0)) default))
374 (special (and (string-equal lname "xmlns") (not prefix)))
375 ;; Setting default to nil will insure that there is not
376 ;; matching cons in xml-ns. In which case we
377 (ns (or (cdr (assoc (if special "xmlns" prefix)
378 xml-ns))
6d12a4df 379 "")))
c7f8d055
SM
380 (cons ns (if special "" lname)))
381 (intern name)))
47db06aa 382
6d12a4df
MH
383(defun xml-parse-fragment (&optional parse-dtd parse-ns)
384 "Parse xml-like fragments."
385 (let ((xml-sub-parser t)
7f3fbd5d
CY
386 ;; Prevent entity definitions from changing the defaults
387 (xml-entity-alist xml-entity-alist)
388 (xml-parameter-entity-alist xml-parameter-entity-alist)
6d12a4df
MH
389 children)
390 (while (not (eobp))
391 (let ((bit (xml-parse-tag
392 parse-dtd parse-ns)))
393 (if children
394 (setq children (append (list bit) children))
395 (if (stringp bit)
396 (setq children (list bit))
397 (setq children bit)))))
398 (reverse children)))
399
2d42509a 400(defun xml-parse-tag (&optional parse-dtd parse-ns)
a98e819b 401 "Parse the tag at point.
47db06aa
GM
402If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
403returned as the first element in the list.
2d42509a 404If PARSE-NS is non-nil, then QNAMES are expanded.
47db06aa 405Returns one of:
a98e819b
DL
406 - a list : the matching node
407 - nil : the point is not looking at a tag.
408 - a pair : the first element is the DTD, the second is the node."
6d12a4df
MH
409 (let ((xml-validating-parser (or parse-dtd xml-validating-parser))
410 (xml-ns (if (consp parse-ns)
2d42509a
JB
411 parse-ns
412 (if parse-ns
413 (list
5178753d 414 ;; Default for empty prefix is no namespace
6d12a4df 415 (cons "" "")
c7f8d055 416 ;; "xml" namespace
6d12a4df 417 (cons "xml" "http://www.w3.org/XML/1998/namespace")
2d42509a 418 ;; We need to seed the xmlns namespace
6d12a4df 419 (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
2d42509a
JB
420 (cond
421 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
422 ;; beginning of a document).
423 ((looking-at "<\\?")
424 (search-forward "?>")
425 (skip-syntax-forward " ")
426 (xml-parse-tag parse-dtd xml-ns))
427 ;; Character data (CDATA) sections, in which no tag should be interpreted
428 ((looking-at "<!\\[CDATA\\[")
429 (let ((pos (match-end 0)))
430 (unless (search-forward "]]>" nil t)
6d12a4df 431 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
ae026110 432 (concat
f6fcdfff 433 (buffer-substring-no-properties pos (match-beginning 0))
ae026110 434 (xml-parse-string))))
2d42509a 435 ;; DTD for the document
7f3fbd5d 436 ((looking-at "<!DOCTYPE[ \t\n\r]")
6d12a4df
MH
437 (let ((dtd (xml-parse-dtd parse-ns)))
438 (skip-syntax-forward " ")
439 (if xml-validating-parser
440 (cons dtd (xml-parse-tag nil xml-ns))
441 (xml-parse-tag nil xml-ns))))
2d42509a
JB
442 ;; skip comments
443 ((looking-at "<!--")
444 (search-forward "-->")
a268160b 445 (skip-syntax-forward " ")
18edb22d 446 (unless (eobp)
772b2e2c
CY
447 (let ((xml-sub-parser t))
448 (xml-parse-tag parse-dtd xml-ns))))
2d42509a
JB
449 ;; end tag
450 ((looking-at "</")
451 '())
452 ;; opening tag
453 ((looking-at "<\\([^/>[:space:]]+\\)")
454 (goto-char (match-end 1))
34638996
EZ
455
456 ;; Parse this node
f6fcdfff 457 (let* ((node-name (match-string-no-properties 1))
5178753d
MH
458 ;; Parse the attribute list.
459 (attrs (xml-parse-attlist xml-ns))
06b60517 460 children)
c7f8d055 461
5178753d
MH
462 ;; add the xmlns:* attrs to our cache
463 (when (consp xml-ns)
c7f8d055
SM
464 (dolist (attr attrs)
465 (when (and (consp (car attr))
6d12a4df
MH
466 (equal "http://www.w3.org/2000/xmlns/"
467 (caar attr)))
468 (push (cons (cdar attr) (cdr attr))
c7f8d055
SM
469 xml-ns))))
470
5178753d 471 (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
c7f8d055 472
2d42509a
JB
473 ;; is this an empty element ?
474 (if (looking-at "/>")
47db06aa 475 (progn
6d12a4df 476 (forward-char 2)
971489ea 477 (nreverse children))
6d12a4df
MH
478
479 ;; is this a valid start tag ?
480 (if (eq (char-after) ?>)
481 (progn
482 (forward-char 1)
483 ;; Now check that we have the right end-tag. Note that this
484 ;; one might contain spaces after the tag name
485 (let ((end (concat "</" node-name "\\s-*>")))
486 (while (not (looking-at end))
487 (cond
488 ((looking-at "</")
489 (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
490 node-name (point)))
491 ((= (char-after) ?<)
492 (let ((tag (xml-parse-tag nil xml-ns)))
493 (when tag
494 (push tag children))))
495 (t
496 (let ((expansion (xml-parse-string)))
497 (setq children
498 (if (stringp expansion)
499 (if (stringp (car children))
500 ;; The two strings were separated by a comment.
aaaa8abb 501 (setq children (append (list (concat (car children) expansion))
6d12a4df
MH
502 (cdr children)))
503 (setq children (append (list expansion) children)))
504 (setq children (append expansion children))))))))
505
506 (goto-char (match-end 0))
507 (nreverse children)))
508 ;; This was an invalid start tag (Expected ">", but didn't see it.)
509 (error "XML: (Well-Formed) Couldn't parse tag: %s"
f6fcdfff 510 (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
6d12a4df
MH
511 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
512 (unless xml-sub-parser ; Usually, we error out.
513 (error "XML: (Well-Formed) Invalid character"))
514
515 ;; However, if we're parsing incrementally, then we need to deal
516 ;; with stray CDATA.
517 (xml-parse-string)))))
518
519(defun xml-parse-string ()
520 "Parse the next whatever. Could be a string, or an element."
5178753d 521 (let* ((pos (point))
98b69232 522 (string (progn (skip-chars-forward "^<")
f6fcdfff 523 (buffer-substring-no-properties pos (point)))))
5178753d
MH
524 ;; Clean up the string. As per XML specifications, the XML
525 ;; processor should always pass the whole string to the
526 ;; application. But \r's should be replaced:
527 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
528 (setq pos 0)
529 (while (string-match "\r\n?" string pos)
530 (setq string (replace-match "\n" t t string))
531 (setq pos (1+ (match-beginning 0))))
532
533 (xml-substitute-special string)))
47db06aa 534
c7f8d055 535(defun xml-parse-attlist (&optional xml-ns)
a1dfa9a3
SM
536 "Return the attribute-list after point.
537Leave point at the first non-blank character after the tag."
971489ea 538 (let ((attlist ())
34638996 539 end-pos name)
a98e819b
DL
540 (skip-syntax-forward " ")
541 (while (looking-at (eval-when-compile
542 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
c7f8d055 543 (setq end-pos (match-end 0))
f6fcdfff 544 (setq name (xml-maybe-do-ns (match-string-no-properties 1) nil xml-ns))
c7f8d055 545 (goto-char end-pos)
47db06aa 546
a158ff81
JB
547 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
548
47db06aa
GM
549 ;; Do we have a string between quotes (or double-quotes),
550 ;; or a simple word ?
a158ff81 551 (if (looking-at "\"\\([^\"]*\\)\"")
34638996 552 (setq end-pos (match-end 0))
f0ec1711 553 (if (looking-at "'\\([^']*\\)'")
34638996 554 (setq end-pos (match-end 0))
6d12a4df 555 (error "XML: (Not Well-Formed) Attribute values must be given between quotes")))
47db06aa
GM
556
557 ;; Each attribute must be unique within a given element
558 (if (assoc name attlist)
6d12a4df 559 (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
524425ae 560
a158ff81
JB
561 ;; Multiple whitespace characters should be replaced with a single one
562 ;; in the attributes
06b60517 563 (let ((string (match-string-no-properties 1)))
a98e819b 564 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
6d12a4df
MH
565 (let ((expansion (xml-substitute-special string)))
566 (unless (stringp expansion)
91af3942 567 ; We say this is the constraint. It is actually that neither
5178753d 568 ; external entities nor "<" can be in an attribute value.
6d12a4df
MH
569 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
570 (push (cons name expansion) attlist)))
a158ff81 571
34638996 572 (goto-char end-pos)
a98e819b 573 (skip-syntax-forward " "))
971489ea 574 (nreverse attlist)))
47db06aa
GM
575
576;;*******************************************************************
577;;**
578;;** The DTD (document type declaration)
579;;** The following functions know how to skip or parse the DTD of
580;;** a document
581;;**
582;;*******************************************************************
583
a98e819b
DL
584;; Fixme: This fails at least if the DTD contains conditional sections.
585
586(defun xml-skip-dtd ()
587 "Skip the DTD at point.
47db06aa 588This follows the rule [28] in the XML specifications."
6d12a4df
MH
589 (let ((xml-validating-parser nil))
590 (xml-parse-dtd)))
47db06aa 591
6d12a4df 592(defun xml-parse-dtd (&optional parse-ns)
a98e819b
DL
593 "Parse the DTD at point."
594 (forward-char (eval-when-compile (length "<!DOCTYPE")))
595 (skip-syntax-forward " ")
6d12a4df
MH
596 (if (and (looking-at ">")
597 xml-validating-parser)
598 (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
524425ae 599
971489ea 600 ;; Get the name of the document
a98e819b 601 (looking-at xml-name-regexp)
f6fcdfff 602 (let ((dtd (list (match-string-no-properties 0) 'dtd))
7f3fbd5d 603 (xml-parameter-entity-alist xml-parameter-entity-alist))
47db06aa 604 (goto-char (match-end 0))
a98e819b 605 (skip-syntax-forward " ")
7f3fbd5d
CY
606
607 ;; External subset (XML [75])
a98e819b
DL
608 (cond ((looking-at "PUBLIC\\s-+")
609 (goto-char (match-end 0))
610 (unless (or (re-search-forward
611 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\""
612 nil t)
613 (re-search-forward
614 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
615 nil t))
6d12a4df 616 (error "XML: Missing Public ID"))
f6fcdfff 617 (let ((pubid (match-string-no-properties 1)))
6d12a4df 618 (skip-syntax-forward " ")
a98e819b
DL
619 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
620 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 621 (error "XML: Missing System ID"))
f6fcdfff 622 (push (list pubid (match-string-no-properties 1) 'public) dtd)))
a98e819b
DL
623 ((looking-at "SYSTEM\\s-+")
624 (goto-char (match-end 0))
625 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
626 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 627 (error "XML: Missing System ID"))
f6fcdfff 628 (push (list (match-string-no-properties 1) 'system) dtd)))
a98e819b 629 (skip-syntax-forward " ")
7f3fbd5d
CY
630
631 (if (eq (char-after) ?>)
632
633 ;; No internal subset
a98e819b 634 (forward-char)
a98e819b 635
7f3fbd5d
CY
636 ;; Internal subset (XML [28b])
637 (unless (eq (char-after) ?\[)
638 (error "XML: Bad DTD"))
639 (forward-char)
640
641 ;; Parse the rest of the DTD
642 ;; Fixme: Deal with NOTATION, PIs.
643 (while (not (looking-at "\\s-*\\]"))
644 (skip-syntax-forward " ")
645 (cond
646 ;; Element declaration [45]:
647 ((looking-at "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
648 (let ((element (match-string-no-properties 1))
649 (type (match-string-no-properties 2))
650 (end-pos (match-end 0)))
651 ;; Translation of rule [46] of XML specifications
a98e819b 652 (cond
7f3fbd5d 653 ((string-match "^EMPTY[ \t\n\r]*$" type) ; empty declaration
a98e819b 654 (setq type 'empty))
7f3fbd5d 655 ((string-match "^ANY[ \t\n\r]*$" type) ; any type of contents
a98e819b 656 (setq type 'any))
7f3fbd5d 657 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ; children ([47])
f6fcdfff 658 (setq type (xml-parse-elem-type (match-string-no-properties 1 type))))
7f3fbd5d 659 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ; substitution
a98e819b 660 nil)
7f3fbd5d
CY
661 (xml-validating-parser
662 (error "XML: (Validity) Invalid element type in the DTD")))
27720433 663
7f3fbd5d
CY
664 ;; rule [45]: the element declaration must be unique
665 (and (assoc element dtd)
666 xml-validating-parser
667 (error "XML: (Validity) DTD element declarations must be unique (<%s>)"
668 element))
a98e819b
DL
669
670 ;; Store the element in the DTD
671 (push (list element type) dtd)
7f3fbd5d
CY
672 (goto-char end-pos)))
673
674 ;; Attribute-list declaration [52] (currently unsupported):
675 ((looking-at (concat "<!ATTLIST[ \t\n\r]*\\(" xml-name-re
676 "\\)[ \t\n\r]*\\(" xml-att-def-re
677 "\\)*[ \t\n\r]*>"))
678 (goto-char (match-end 0)))
679
680 ;; Comments (skip to end):
681 ((looking-at "<!--")
682 (search-forward "-->"))
683
684 ;; Internal entity declarations:
685 ((looking-at (concat "<!ENTITY[ \t\n\r]+\\(%[ \t\n\r]+\\)?\\("
686 xml-name-re "\\)[ \t\n\r]*\\("
687 xml-entity-value-re "\\)[ \t\n\r]*>"))
688 (let* ((name (prog1 (match-string-no-properties 2)
689 (goto-char (match-end 0))))
690 (alist (if (match-string 1)
691 'xml-parameter-entity-alist
692 'xml-entity-alist))
693 ;; Retrieve the deplacement text:
694 (value (xml--entity-replacement-text
695 ;; Entity value, sans quotation marks:
696 (substring (match-string-no-properties 3) 1 -1))))
697 ;; If the same entity is declared more than once, the
698 ;; first declaration is binding.
699 (unless (assoc name (symbol-value alist))
700 (set alist (cons (cons name value) (symbol-value alist))))))
701
702 ;; External entity declarations (currently unsupported):
703 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+\\(%[ \t\n\r]+\\)?\\("
704 xml-name-re "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
705 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
706 (looking-at (concat "<!ENTITY[ \t\n\r]+\\(%[ \t\n\r]+\\)?\\("
707 xml-name-re "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
708 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
709 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
710 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
711 "[ \t\n\r]*>")))
712 (goto-char (match-end 0)))
713
714 ;; Parameter entity:
715 ((looking-at (concat "%\\(" xml-name-re "\\);"))
716 (goto-char (match-end 0))
717 (let* ((entity (match-string 1))
718 (end (point-marker))
719 (elt (assoc entity xml-parameter-entity-alist)))
720 (when elt
721 (replace-match (cdr elt) t t)
722 (goto-char end))))
723
724 ;; Anything else:
725 (xml-validating-parser
726 (error "XML: (Validity) Invalid DTD item"))))
727
6d12a4df 728 (if (looking-at "\\s-*]>")
23d519e4 729 (goto-char (match-end 0))))
461f3ad0 730 (nreverse dtd)))
47db06aa 731
7f3fbd5d
CY
732(defun xml--entity-replacement-text (string)
733 "Return the replacement text for the entity value STRING.
734The replacement text is obtained by replacing character
735references and parameter-entity references."
b3218de1
CY
736 (let ((ref-re (eval-when-compile
737 (concat "\\(?:&#\\([0-9]+\\)\\|&#x\\([0-9a-fA-F]+\\)\\|%\\("
738 xml-name-re "\\)\\);")))
7f3fbd5d
CY
739 children)
740 (while (string-match ref-re string)
741 (push (substring string 0 (match-beginning 0)) children)
742 (let ((remainder (substring string (match-end 0)))
743 ref val)
744 (cond ((setq ref (match-string 1 string))
745 ;; Decimal character reference
746 (setq val (decode-char 'ucs (string-to-number ref)))
747 (if val (push (string val) children)))
748 ;; Hexadecimal character reference
749 ((setq ref (match-string 2 string))
750 (setq val (decode-char 'ucs (string-to-number ref 16)))
751 (if val (push (string val) children)))
752 ;; Parameter entity reference
753 ((setq ref (match-string 3 string))
754 (setq val (assoc ref xml-parameter-entity-alist))
755 (if val
756 (push (cdr val) children)
757 (push (concat "%" ref ";") children))))
758 (setq string remainder)))
759 (mapconcat 'identity (nreverse (cons string children)) "")))
760
47db06aa 761(defun xml-parse-elem-type (string)
a98e819b 762 "Convert element type STRING into a Lisp structure."
47db06aa
GM
763
764 (let (elem modifier)
765 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
766 (progn
f6fcdfff
CY
767 (setq elem (match-string-no-properties 1 string)
768 modifier (match-string-no-properties 2 string))
47db06aa 769 (if (string-match "|" elem)
971489ea 770 (setq elem (cons 'choice
47db06aa
GM
771 (mapcar 'xml-parse-elem-type
772 (split-string elem "|"))))
773 (if (string-match "," elem)
971489ea 774 (setq elem (cons 'seq
47db06aa 775 (mapcar 'xml-parse-elem-type
a98e819b 776 (split-string elem ",")))))))
a158ff81 777 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
f6fcdfff
CY
778 (setq elem (match-string-no-properties 1 string)
779 modifier (match-string-no-properties 2 string))))
47db06aa 780
971489ea
SM
781 (if (and (stringp elem) (string= elem "#PCDATA"))
782 (setq elem 'pcdata))
524425ae 783
971489ea
SM
784 (cond
785 ((string= modifier "+")
786 (list '+ elem))
787 ((string= modifier "*")
788 (list '* elem))
789 ((string= modifier "?")
0fa6f70c 790 (list '\? elem))
971489ea
SM
791 (t
792 elem))))
47db06aa 793
47db06aa
GM
794;;*******************************************************************
795;;**
796;;** Substituting special XML sequences
797;;**
798;;*******************************************************************
799
800(defun xml-substitute-special (string)
cd1181db 801 "Return STRING, after substituting entity references."
a98e819b
DL
802 ;; This originally made repeated passes through the string from the
803 ;; beginning, which isn't correct, since then either "&amp;amp;" or
804 ;; "&#38;amp;" won't DTRT.
47db06aa 805
6d12a4df
MH
806 (let ((point 0)
807 children end-point)
ae026110 808 (while (string-match "&\\([^;]*\\);" string point)
6d12a4df 809 (setq end-point (match-end 0))
f6fcdfff 810 (let* ((this-part (match-string-no-properties 1 string))
6d12a4df
MH
811 (prev-part (substring string point (match-beginning 0)))
812 (entity (assoc this-part xml-entity-alist))
06b60517 813 (expansion
6d12a4df
MH
814 (cond ((string-match "#\\([0-9]+\\)" this-part)
815 (let ((c (decode-char
816 'ucs
f6fcdfff 817 (string-to-number (match-string-no-properties 1 this-part)))))
6d12a4df
MH
818 (if c (string c))))
819 ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
820 (let ((c (decode-char
821 'ucs
f6fcdfff 822 (string-to-number (match-string-no-properties 1 this-part) 16))))
6d12a4df
MH
823 (if c (string c))))
824 (entity
825 (cdr entity))
ae026110 826 ((eq (length this-part) 0)
27720433 827 (error "XML: (Not Well-Formed) No entity given"))
6d12a4df 828 (t
f8ab034e 829 (if xml-validating-parser
6d12a4df 830 (error "XML: (Validity) Undefined entity `%s'"
f8ab034e
MH
831 this-part)
832 xml-undefined-entity)))))
6d12a4df
MH
833
834 (cond ((null children)
f0d49437
MH
835 ;; FIXME: If we have an entity that expands into XML, this won't work.
836 (setq children
837 (concat prev-part expansion)))
6d12a4df
MH
838 ((stringp children)
839 (if (stringp expansion)
840 (setq children (concat children prev-part expansion))
841 (setq children (list expansion (concat prev-part children)))))
842 ((and (stringp expansion)
843 (stringp (car children)))
844 (setcar children (concat prev-part expansion (car children))))
845 ((stringp expansion)
846 (setq children (append (concat prev-part expansion)
847 children)))
848 ((stringp (car children))
849 (setcar children (concat (car children) prev-part))
850 (setq children (append expansion children)))
851 (t
852 (setq children (list expansion
853 prev-part
854 children))))
855 (setq point end-point)))
856 (cond ((stringp children)
857 (concat children (substring string point)))
858 ((stringp (car (last children)))
a3110b5d 859 (concat (car (last children)) (substring string point)))
6d12a4df
MH
860 ((null children)
861 string)
862 (t
a3110b5d
MH
863 (concat (mapconcat 'identity
864 (nreverse children)
865 "")
866 (substring string point))))))
867
571855b6
UJ
868(defun xml-substitute-numeric-entities (string)
869 "Substitute SGML numeric entities by their respective utf characters.
870This function replaces numeric entities in the input STRING and
871returns the modified string. For example \"&#42;\" gets replaced
872by \"*\"."
873 (if (and string (stringp string))
874 (let ((start 0))
875 (while (string-match "&#\\([0-9]+\\);" string start)
876 (condition-case nil
877 (setq string (replace-match
878 (string (read (substring string
879 (match-beginning 1)
880 (match-end 1))))
881 nil nil string))
882 (error nil))
883 (setq start (1+ (match-beginning 0))))
884 string)
885 nil))
886
47db06aa
GM
887;;*******************************************************************
888;;**
889;;** Printing a tree.
890;;** This function is intended mainly for debugging purposes.
891;;**
892;;*******************************************************************
893
27240aa4
AS
894(defun xml-debug-print (xml &optional indent-string)
895 "Outputs the XML in the current buffer.
896XML can be a tree or a list of nodes.
897The first line is indented with the optional INDENT-STRING."
898 (setq indent-string (or indent-string ""))
971489ea 899 (dolist (node xml)
27240aa4
AS
900 (xml-debug-print-internal node indent-string)))
901
902(defalias 'xml-print 'xml-debug-print)
47db06aa 903
7731c9f4 904(defun xml-escape-string (string)
7f3fbd5d 905 "Return STRING with entity substitutions made from `xml-entity-alist'."
7731c9f4
MH
906 (mapconcat (lambda (byte)
907 (let ((char (char-to-string byte)))
908 (if (rassoc char xml-entity-alist)
909 (concat "&" (car (rassoc char xml-entity-alist)) ";")
910 char)))
76a6127f 911 string ""))
7731c9f4 912
971489ea 913(defun xml-debug-print-internal (xml indent-string)
47db06aa 914 "Outputs the XML tree in the current buffer.
a98e819b 915The first line is indented with INDENT-STRING."
47db06aa
GM
916 (let ((tree xml)
917 attlist)
a98e819b 918 (insert indent-string ?< (symbol-name (xml-node-name tree)))
524425ae 919
47db06aa 920 ;; output the attribute list
971489ea 921 (setq attlist (xml-node-attributes tree))
47db06aa 922 (while attlist
7731c9f4
MH
923 (insert ?\ (symbol-name (caar attlist)) "=\""
924 (xml-escape-string (cdar attlist)) ?\")
971489ea 925 (setq attlist (cdr attlist)))
524425ae 926
971489ea 927 (setq tree (xml-node-children tree))
47db06aa 928
27240aa4
AS
929 (if (null tree)
930 (insert ?/ ?>)
931 (insert ?>)
932
933 ;; output the children
934 (dolist (node tree)
935 (cond
936 ((listp node)
937 (insert ?\n)
938 (xml-debug-print-internal node (concat indent-string " ")))
7731c9f4
MH
939 ((stringp node)
940 (insert (xml-escape-string node)))
27240aa4
AS
941 (t
942 (error "Invalid XML tree"))))
943
944 (when (not (and (null (cdr tree))
945 (stringp (car tree))))
946 (insert ?\n indent-string))
947 (insert ?< ?/ (symbol-name (xml-node-name xml)) ?>))))
47db06aa
GM
948
949(provide 'xml)
950
951;;; xml.el ends here