(Version, mh-version): Add +cvs to version.
[bpt/emacs.git] / lisp / xml.el
CommitLineData
1cd7adc6 1;;; xml.el --- XML parser
47db06aa 2
0d30b337
TTN
3;; Copyright (C) 2000, 2001, 2002, 2003, 2004,
4;; 2005 Free Software Foundation, Inc.
47db06aa
GM
5
6;; Author: Emmanuel Briot <briot@gnat.com>
720058f2 7;; Maintainer: Mark A. Hershberger <mah@everybody.org>
a98e819b 8;; Keywords: xml, data
47db06aa
GM
9
10;; This file is part of GNU Emacs.
11
12;; GNU Emacs is free software; you can redistribute it and/or modify
13;; it under the terms of the GNU General Public License as published by
14;; the Free Software Foundation; either version 2, or (at your option)
15;; any later version.
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
23;; along with GNU Emacs; see the file COPYING. If not, write to the
086add15
LK
24;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25;; Boston, MA 02110-1301, USA.
47db06aa
GM
26
27;;; Commentary:
28
a98e819b
DL
29;; This file contains a somewhat incomplete non-validating XML parser. It
30;; parses a file, and returns a list that can be used internally by
a1dfa9a3 31;; any other Lisp libraries.
47db06aa
GM
32
33;;; FILE FORMAT
34
a98e819b
DL
35;; The document type declaration may either be ignored or (optionally)
36;; parsed, but currently the parsing will only accept element
a1dfa9a3 37;; declarations. The XML file is assumed to be well-formed. In case
a98e819b
DL
38;; of error, the parsing stops and the XML file is shown where the
39;; parsing stopped.
47db06aa 40;;
a98e819b 41;; It also knows how to ignore comments and processing instructions.
47db06aa
GM
42;;
43;; The XML file should have the following format:
653558a1
GM
44;; <node1 attr1="name1" attr2="name2" ...>value
45;; <node2 attr3="name3" attr4="name4">value2</node2>
46;; <node3 attr5="name5" attr6="name6">value3</node3>
47db06aa 47;; </node1>
a1dfa9a3 48;; Of course, the name of the nodes and attributes can be anything. There can
47db06aa
GM
49;; be any number of attributes (or none), as well as any number of children
50;; below the nodes.
51;;
52;; There can be only top level node, but with any number of children below.
53
54;;; LIST FORMAT
55
c7f8d055
SM
56;; The functions `xml-parse-file', `xml-parse-region' and
57;; `xml-parse-tag' return a list with the following format:
47db06aa
GM
58;;
59;; xml-list ::= (node node ...)
c7f8d055 60;; node ::= (qname attribute-list . child_node_list)
47db06aa
GM
61;; child_node_list ::= child_node child_node ...
62;; child_node ::= node | string
c7f8d055
SM
63;; qname ::= (:namespace-uri . "name") | "name"
64;; attribute_list ::= ((qname . "value") (qname . "value") ...)
47db06aa
GM
65;; | nil
66;; string ::= "..."
67;;
a98e819b
DL
68;; Some macros are provided to ease the parsing of this list.
69;; Whitespace is preserved. Fixme: There should be a tree-walker that
70;; can remove it.
47db06aa 71
c7f8d055
SM
72;; TODO:
73;; * xml:base, xml:space support
74;; * more complete DOCTYPE parsing
75;; * pi support
76
47db06aa
GM
77;;; Code:
78
a98e819b
DL
79;; Note that {buffer-substring,match-string}-no-properties were
80;; formerly used in several places, but that removes composition info.
81
47db06aa
GM
82;;*******************************************************************
83;;**
84;;** Macros to parse the list
85;;**
86;;*******************************************************************
87
f8ab034e
MH
88(defconst xml-undefined-entity "?"
89 "What to substitute for undefined entities")
90
6d12a4df
MH
91(defvar xml-entity-alist
92 '(("lt" . "<")
93 ("gt" . ">")
94 ("apos" . "'")
95 ("quot" . "\"")
96 ("amp" . "&"))
97 "The defined entities. Entities are added to this when the DTD is parsed.")
98
99(defvar xml-sub-parser nil
100 "Dynamically set this to a non-nil value if you want to parse an XML fragment.")
101
102(defvar xml-validating-parser nil
103 "Set to non-nil to get validity checking.")
104
971489ea 105(defsubst xml-node-name (node)
47db06aa 106 "Return the tag associated with NODE.
a1dfa9a3
SM
107Without namespace-aware parsing, the tag is a symbol.
108
109With namespace-aware parsing, the tag is a cons of a string
110representing the uri of the namespace with the local name of the
111tag. For example,
112
113 <foo>
114
115would be represented by
116
117 '(\"\" . \"foo\")."
118
971489ea 119 (car node))
47db06aa 120
971489ea 121(defsubst xml-node-attributes (node)
47db06aa
GM
122 "Return the list of attributes of NODE.
123The list can be nil."
971489ea 124 (nth 1 node))
47db06aa 125
971489ea 126(defsubst xml-node-children (node)
47db06aa
GM
127 "Return the list of children of NODE.
128This is a list of nodes, and it can be nil."
971489ea 129 (cddr node))
47db06aa
GM
130
131(defun xml-get-children (node child-name)
132 "Return the children of NODE whose tag is CHILD-NAME.
a1dfa9a3 133CHILD-NAME should match the value returned by `xml-node-name'."
971489ea
SM
134 (let ((match ()))
135 (dolist (child (xml-node-children node))
a1dfa9a3
SM
136 (if (and (listp child)
137 (equal (xml-node-name child) child-name))
138 (push child match)))
971489ea 139 (nreverse match)))
47db06aa 140
9bcd6a7e 141(defun xml-get-attribute-or-nil (node attribute)
47db06aa 142 "Get from NODE the value of ATTRIBUTE.
a1dfa9a3 143Return nil if the attribute was not found.
9bcd6a7e
EZ
144
145See also `xml-get-attribute'."
2e9bdf15 146 (cdr (assoc attribute (xml-node-attributes node))))
9bcd6a7e
EZ
147
148(defsubst xml-get-attribute (node attribute)
149 "Get from NODE the value of ATTRIBUTE.
150An empty string is returned if the attribute was not found.
151
152See also `xml-get-attribute-or-nil'."
153 (or (xml-get-attribute-or-nil node attribute) ""))
47db06aa
GM
154
155;;*******************************************************************
156;;**
157;;** Creating the list
158;;**
159;;*******************************************************************
160
a98e819b 161;;;###autoload
2d42509a 162(defun xml-parse-file (file &optional parse-dtd parse-ns)
a98e819b
DL
163 "Parse the well-formed XML file FILE.
164If FILE is already visited, use its buffer and don't kill it.
47db06aa 165Returns the top node with all its children.
2d42509a
JB
166If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
167If PARSE-NS is non-nil, then QNAMES are expanded."
653558a1
GM
168 (let ((keep))
169 (if (get-file-buffer file)
170 (progn
171 (set-buffer (get-file-buffer file))
172 (setq keep (point)))
a98e819b
DL
173 (let (auto-mode-alist) ; no need for xml-mode
174 (find-file file)))
524425ae 175
653558a1
GM
176 (let ((xml (xml-parse-region (point-min)
177 (point-max)
178 (current-buffer)
2d42509a 179 parse-dtd parse-ns)))
653558a1
GM
180 (if keep
181 (goto-char keep)
182 (kill-buffer (current-buffer)))
183 xml)))
47db06aa 184
6d12a4df 185
5ed32352
RS
186(defvar xml-name-re)
187(defvar xml-entity-value-re)
0bf41002 188(defvar xml-att-def-re)
63b446bc 189(let* ((start-chars (concat "[:alpha:]:_"))
6d12a4df
MH
190 (name-chars (concat "-[:digit:]." start-chars))
191;;[3] S ::= (#x20 | #x9 | #xD | #xA)+
192 (whitespace "[ \t\n\r]"))
193;;[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
194;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
195;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
196;; | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
197 (defvar xml-name-start-char-re (concat "[" start-chars "]"))
198;;[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
199 (defvar xml-name-char-re (concat "[" name-chars "]"))
200;;[5] Name ::= NameStartChar (NameChar)*
201 (defvar xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
202;;[6] Names ::= Name (#x20 Name)*
203 (defvar xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
204;;[7] Nmtoken ::= (NameChar)+
205 (defvar xml-nmtoken-re (concat xml-name-char-re "+"))
206;;[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
207 (defvar xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
208;;[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
209 (defvar xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
210;;[68] EntityRef ::= '&' Name ';'
211 (defvar xml-entity-ref (concat "&" xml-name-re ";"))
212;;[69] PEReference ::= '%' Name ';'
213 (defvar xml-pe-reference-re (concat "%" xml-name-re ";"))
214;;[67] Reference ::= EntityRef | CharRef
215 (defvar xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
23d519e4
MH
216;;[10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
217 (defvar xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|"
218 "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)"))
219;;[56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID per Element Type] [VC: ID Attribute Default]
220;; | 'IDREF' [VC: IDREF]
221;; | 'IDREFS' [VC: IDREF]
222;; | 'ENTITY' [VC: Entity Name]
223;; | 'ENTITIES' [VC: Entity Name]
224;; | 'NMTOKEN' [VC: Name Token]
225;; | 'NMTOKENS' [VC: Name Token]
226 (defvar xml-tokenized-type-re "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|ENTITIES\\|NMTOKEN\\|NMTOKENS\\)")
227;;[58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
228 (defvar xml-notation-type-re (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re
229 "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*" whitespace "*)\\)"))
230;;[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration] [VC: No Duplicate Tokens]
231 (defvar xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re
232 "\\(?:" whitespace "*|" whitespace "*" xml-nmtoken-re "\\)*"
233 whitespace ")\\)"))
234;;[57] EnumeratedType ::= NotationType | Enumeration
235 (defvar xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re "\\|" xml-enumeration-re "\\)"))
236;;[54] AttType ::= StringType | TokenizedType | EnumeratedType
237;;[55] StringType ::= 'CDATA'
238 (defvar xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re "\\|" xml-notation-type-re"\\|" xml-enumerated-type-re "\\)"))
239;;[60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
240 (defvar xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED" whitespace "\\)*" xml-att-value-re "\\)"))
241;;[53] AttDef ::= S Name S AttType S DefaultDecl
242 (defvar xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re
243 whitespace "*" xml-att-type-re
244 whitespace "*" xml-default-decl-re "\\)"))
6d12a4df
MH
245;;[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
246;; | "'" ([^%&'] | PEReference | Reference)* "'"
247 (defvar xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re
248 "\\|" xml-reference-re "\\)*\"\\|'\\(?:[^%&']\\|"
249 xml-pe-reference-re "\\|" xml-reference-re "\\)*'\\)")))
250;;[75] ExternalID ::= 'SYSTEM' S SystemLiteral
251;; | 'PUBLIC' S PubidLiteral S SystemLiteral
252;;[76] NDataDecl ::= S 'NDATA' S
253;;[73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
254;;[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
255;;[74] PEDef ::= EntityValue | ExternalID
256;;[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
257;;[70] EntityDecl ::= GEDecl | PEDecl
258
a98e819b
DL
259;; Note that this is setup so that we can do whitespace-skipping with
260;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
261;; compared with `re-search-forward', but that has been fixed. Also
262;; note that the standard syntax table contains other characters with
263;; whitespace syntax, like NBSP, but they are invalid in contexts in
264;; which we might skip whitespace -- specifically, they're not
265;; NameChars [XML 4].
266
267(defvar xml-syntax-table
268 (let ((table (make-syntax-table)))
269 ;; Get space syntax correct per XML [3].
270 (dotimes (c 31)
271 (modify-syntax-entry c "." table)) ; all are space in standard table
272 (dolist (c '(?\t ?\n ?\r)) ; these should be space
273 (modify-syntax-entry c " " table))
274 ;; For skipping attributes.
275 (modify-syntax-entry ?\" "\"" table)
276 (modify-syntax-entry ?' "\"" table)
277 ;; Non-alnum name chars should be symbol constituents (`-' and `_'
278 ;; are OK by default).
279 (modify-syntax-entry ?. "_" table)
280 (modify-syntax-entry ?: "_" table)
281 ;; XML [89]
aaaa8abb
MH
282 (unless (featurep 'xemacs)
283 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
284 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
285 #x30FD #x30FE))
286 (modify-syntax-entry (decode-char 'ucs c) "w" table)))
a98e819b
DL
287 ;; Fixme: rest of [4]
288 table)
289 "Syntax table used by `xml-parse-region'.")
290
291;; XML [5]
292;; Note that [:alpha:] matches all multibyte chars with word syntax.
ab161457
JPW
293(eval-and-compile
294 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
a98e819b
DL
295
296;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
297;; document ::= prolog element Misc*
298;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
299
300;;;###autoload
2d42509a 301(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
47db06aa
GM
302 "Parse the region from BEG to END in BUFFER.
303If BUFFER is nil, it defaults to the current buffer.
304Returns the XML list for the region, or raises an error if the region
2d42509a 305is not well-formed XML.
47db06aa 306If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
2d42509a
JB
307and returned as the first element of the list.
308If PARSE-NS is non-nil, then QNAMES are expanded."
a98e819b
DL
309 (save-restriction
310 (narrow-to-region beg end)
311 ;; Use fixed syntax table to ensure regexp char classes and syntax
312 ;; specs DTRT.
313 (with-syntax-table (standard-syntax-table)
314 (let ((case-fold-search nil) ; XML is case-sensitive.
315 xml result dtd)
316 (save-excursion
317 (if buffer
318 (set-buffer buffer))
319 (goto-char (point-min))
320 (while (not (eobp))
321 (if (search-forward "<" nil t)
322 (progn
323 (forward-char -1)
34638996 324 (setq result (xml-parse-tag parse-dtd parse-ns))
6d12a4df 325 (if (and xml result (not xml-sub-parser))
a98e819b 326 ;; translation of rule [1] of XML specifications
6d12a4df 327 (error "XML: (Not Well-Formed) Only one root tag allowed")
47db06aa 328 (cond
971489ea 329 ((null result))
34638996
EZ
330 ((and (listp (car result))
331 parse-dtd)
971489ea 332 (setq dtd (car result))
a98e819b
DL
333 (if (cdr result) ; possible leading comment
334 (add-to-list 'xml (cdr result))))
47db06aa 335 (t
a98e819b
DL
336 (add-to-list 'xml result)))))
337 (goto-char (point-max))))
338 (if parse-dtd
339 (cons dtd (nreverse xml))
340 (nreverse xml)))))))
47db06aa 341
c7f8d055 342(defun xml-maybe-do-ns (name default xml-ns)
a1dfa9a3
SM
343 "Perform any namespace expansion.
344NAME is the name to perform the expansion on.
c7f8d055
SM
345DEFAULT is the default namespace. XML-NS is a cons of namespace
346names to uris. When namespace-aware parsing is off, then XML-NS
347is nil.
348
349During namespace-aware parsing, any name without a namespace is
350put into the namespace identified by DEFAULT. nil is used to
351specify that the name shouldn't be given a namespace."
352 (if (consp xml-ns)
353 (let* ((nsp (string-match ":" name))
354 (lname (if nsp (substring name (match-end 0)) name))
355 (prefix (if nsp (substring name 0 (match-beginning 0)) default))
356 (special (and (string-equal lname "xmlns") (not prefix)))
357 ;; Setting default to nil will insure that there is not
358 ;; matching cons in xml-ns. In which case we
359 (ns (or (cdr (assoc (if special "xmlns" prefix)
360 xml-ns))
6d12a4df 361 "")))
c7f8d055
SM
362 (cons ns (if special "" lname)))
363 (intern name)))
47db06aa 364
6d12a4df
MH
365(defun xml-parse-fragment (&optional parse-dtd parse-ns)
366 "Parse xml-like fragments."
367 (let ((xml-sub-parser t)
368 children)
369 (while (not (eobp))
370 (let ((bit (xml-parse-tag
371 parse-dtd parse-ns)))
372 (if children
373 (setq children (append (list bit) children))
374 (if (stringp bit)
375 (setq children (list bit))
376 (setq children bit)))))
377 (reverse children)))
378
2d42509a 379(defun xml-parse-tag (&optional parse-dtd parse-ns)
a98e819b 380 "Parse the tag at point.
47db06aa
GM
381If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
382returned as the first element in the list.
2d42509a 383If PARSE-NS is non-nil, then QNAMES are expanded.
47db06aa 384Returns one of:
a98e819b
DL
385 - a list : the matching node
386 - nil : the point is not looking at a tag.
387 - a pair : the first element is the DTD, the second is the node."
6d12a4df
MH
388 (let ((xml-validating-parser (or parse-dtd xml-validating-parser))
389 (xml-ns (if (consp parse-ns)
2d42509a
JB
390 parse-ns
391 (if parse-ns
392 (list
c7f8d055 393 ;; Default for empty prefix is no namespace
6d12a4df 394 (cons "" "")
c7f8d055 395 ;; "xml" namespace
6d12a4df 396 (cons "xml" "http://www.w3.org/XML/1998/namespace")
2d42509a 397 ;; We need to seed the xmlns namespace
6d12a4df 398 (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
2d42509a
JB
399 (cond
400 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
401 ;; beginning of a document).
402 ((looking-at "<\\?")
403 (search-forward "?>")
404 (skip-syntax-forward " ")
405 (xml-parse-tag parse-dtd xml-ns))
406 ;; Character data (CDATA) sections, in which no tag should be interpreted
407 ((looking-at "<!\\[CDATA\\[")
408 (let ((pos (match-end 0)))
409 (unless (search-forward "]]>" nil t)
6d12a4df 410 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
ae026110
MH
411 (concat
412 (buffer-substring pos (match-beginning 0))
413 (xml-parse-string))))
2d42509a
JB
414 ;; DTD for the document
415 ((looking-at "<!DOCTYPE")
6d12a4df
MH
416 (let ((dtd (xml-parse-dtd parse-ns)))
417 (skip-syntax-forward " ")
418 (if xml-validating-parser
419 (cons dtd (xml-parse-tag nil xml-ns))
420 (xml-parse-tag nil xml-ns))))
2d42509a
JB
421 ;; skip comments
422 ((looking-at "<!--")
423 (search-forward "-->")
424 nil)
425 ;; end tag
426 ((looking-at "</")
427 '())
428 ;; opening tag
429 ((looking-at "<\\([^/>[:space:]]+\\)")
430 (goto-char (match-end 1))
34638996
EZ
431
432 ;; Parse this node
2d42509a 433 (let* ((node-name (match-string 1))
c7f8d055
SM
434 ;; Parse the attribute list.
435 (attrs (xml-parse-attlist xml-ns))
436 children pos)
437
438 ;; add the xmlns:* attrs to our cache
439 (when (consp xml-ns)
440 (dolist (attr attrs)
441 (when (and (consp (car attr))
6d12a4df
MH
442 (equal "http://www.w3.org/2000/xmlns/"
443 (caar attr)))
444 (push (cons (cdar attr) (cdr attr))
c7f8d055
SM
445 xml-ns))))
446
43b5fd81 447 (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
c7f8d055 448
2d42509a
JB
449 ;; is this an empty element ?
450 (if (looking-at "/>")
47db06aa 451 (progn
6d12a4df 452 (forward-char 2)
971489ea 453 (nreverse children))
6d12a4df
MH
454
455 ;; is this a valid start tag ?
456 (if (eq (char-after) ?>)
457 (progn
458 (forward-char 1)
459 ;; Now check that we have the right end-tag. Note that this
460 ;; one might contain spaces after the tag name
461 (let ((end (concat "</" node-name "\\s-*>")))
462 (while (not (looking-at end))
463 (cond
464 ((looking-at "</")
465 (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
466 node-name (point)))
467 ((= (char-after) ?<)
468 (let ((tag (xml-parse-tag nil xml-ns)))
469 (when tag
470 (push tag children))))
471 (t
472 (let ((expansion (xml-parse-string)))
473 (setq children
474 (if (stringp expansion)
475 (if (stringp (car children))
476 ;; The two strings were separated by a comment.
aaaa8abb 477 (setq children (append (list (concat (car children) expansion))
6d12a4df
MH
478 (cdr children)))
479 (setq children (append (list expansion) children)))
480 (setq children (append expansion children))))))))
481
482 (goto-char (match-end 0))
483 (nreverse children)))
484 ;; This was an invalid start tag (Expected ">", but didn't see it.)
485 (error "XML: (Well-Formed) Couldn't parse tag: %s"
486 (buffer-substring (- (point) 10) (+ (point) 1)))))))
487 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
488 (unless xml-sub-parser ; Usually, we error out.
489 (error "XML: (Well-Formed) Invalid character"))
490
491 ;; However, if we're parsing incrementally, then we need to deal
492 ;; with stray CDATA.
493 (xml-parse-string)))))
494
495(defun xml-parse-string ()
496 "Parse the next whatever. Could be a string, or an element."
497 (let* ((pos (point))
498 (string (progn (if (search-forward "<" nil t)
499 (forward-char -1)
500 (goto-char (point-max)))
501 (buffer-substring pos (point)))))
502 ;; Clean up the string. As per XML specifications, the XML
503 ;; processor should always pass the whole string to the
504 ;; application. But \r's should be replaced:
505 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
506 (setq pos 0)
507 (while (string-match "\r\n?" string pos)
508 (setq string (replace-match "\n" t t string))
509 (setq pos (1+ (match-beginning 0))))
510
511 (xml-substitute-special string)))
47db06aa 512
c7f8d055 513(defun xml-parse-attlist (&optional xml-ns)
a1dfa9a3
SM
514 "Return the attribute-list after point.
515Leave point at the first non-blank character after the tag."
971489ea 516 (let ((attlist ())
34638996 517 end-pos name)
a98e819b
DL
518 (skip-syntax-forward " ")
519 (while (looking-at (eval-when-compile
520 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
c7f8d055
SM
521 (setq end-pos (match-end 0))
522 (setq name (xml-maybe-do-ns (match-string 1) nil xml-ns))
523 (goto-char end-pos)
47db06aa 524
a158ff81
JB
525 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
526
47db06aa
GM
527 ;; Do we have a string between quotes (or double-quotes),
528 ;; or a simple word ?
a158ff81 529 (if (looking-at "\"\\([^\"]*\\)\"")
34638996 530 (setq end-pos (match-end 0))
f0ec1711 531 (if (looking-at "'\\([^']*\\)'")
34638996 532 (setq end-pos (match-end 0))
6d12a4df 533 (error "XML: (Not Well-Formed) Attribute values must be given between quotes")))
47db06aa
GM
534
535 ;; Each attribute must be unique within a given element
536 (if (assoc name attlist)
6d12a4df 537 (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
524425ae 538
a158ff81
JB
539 ;; Multiple whitespace characters should be replaced with a single one
540 ;; in the attributes
a98e819b 541 (let ((string (match-string 1))
a158ff81 542 (pos 0))
a98e819b 543 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
6d12a4df
MH
544 (let ((expansion (xml-substitute-special string)))
545 (unless (stringp expansion)
546 ; We say this is the constraint. It is acctually that
547 ; external entities nor "<" can be in an attribute value.
548 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
549 (push (cons name expansion) attlist)))
a158ff81 550
34638996 551 (goto-char end-pos)
a98e819b 552 (skip-syntax-forward " "))
971489ea 553 (nreverse attlist)))
47db06aa
GM
554
555;;*******************************************************************
556;;**
557;;** The DTD (document type declaration)
558;;** The following functions know how to skip or parse the DTD of
559;;** a document
560;;**
561;;*******************************************************************
562
a98e819b
DL
563;; Fixme: This fails at least if the DTD contains conditional sections.
564
565(defun xml-skip-dtd ()
566 "Skip the DTD at point.
47db06aa 567This follows the rule [28] in the XML specifications."
6d12a4df
MH
568 (let ((xml-validating-parser nil))
569 (xml-parse-dtd)))
47db06aa 570
6d12a4df 571(defun xml-parse-dtd (&optional parse-ns)
a98e819b
DL
572 "Parse the DTD at point."
573 (forward-char (eval-when-compile (length "<!DOCTYPE")))
574 (skip-syntax-forward " ")
6d12a4df
MH
575 (if (and (looking-at ">")
576 xml-validating-parser)
577 (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
524425ae 578
971489ea 579 ;; Get the name of the document
a98e819b
DL
580 (looking-at xml-name-regexp)
581 (let ((dtd (list (match-string 0) 'dtd))
971489ea 582 type element end-pos)
47db06aa
GM
583 (goto-char (match-end 0))
584
a98e819b
DL
585 (skip-syntax-forward " ")
586 ;; XML [75]
587 (cond ((looking-at "PUBLIC\\s-+")
588 (goto-char (match-end 0))
589 (unless (or (re-search-forward
590 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\""
591 nil t)
592 (re-search-forward
593 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
594 nil t))
6d12a4df 595 (error "XML: Missing Public ID"))
a98e819b 596 (let ((pubid (match-string 1)))
6d12a4df 597 (skip-syntax-forward " ")
a98e819b
DL
598 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
599 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 600 (error "XML: Missing System ID"))
a98e819b
DL
601 (push (list pubid (match-string 1) 'public) dtd)))
602 ((looking-at "SYSTEM\\s-+")
603 (goto-char (match-end 0))
604 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
605 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 606 (error "XML: Missing System ID"))
a98e819b
DL
607 (push (list (match-string 1) 'system) dtd)))
608 (skip-syntax-forward " ")
609 (if (eq ?> (char-after))
610 (forward-char)
a98e819b 611 (if (not (eq (char-after) ?\[))
6d12a4df 612 (error "XML: Bad DTD")
a98e819b
DL
613 (forward-char)
614 ;; Parse the rest of the DTD
23d519e4 615 ;; Fixme: Deal with NOTATION, PIs.
a98e819b
DL
616 (while (not (looking-at "\\s-*\\]"))
617 (skip-syntax-forward " ")
618 (cond
619
620 ;; Translation of rule [45] of XML specifications
621 ((looking-at
622 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
623
34638996 624 (setq element (match-string 1)
a98e819b
DL
625 type (match-string-no-properties 2))
626 (setq end-pos (match-end 0))
627
628 ;; Translation of rule [46] of XML specifications
629 (cond
630 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
631 (setq type 'empty))
632 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
633 (setq type 'any))
634 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
635 (setq type (xml-parse-elem-type (match-string 1 type))))
636 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
637 nil)
638 (t
6d12a4df 639 (if xml-validating-parser
07f7e9ae 640 (error "XML: (Validity) Invalid element type in the DTD"))))
27720433 641
a98e819b 642 ;; rule [45]: the element declaration must be unique
6d12a4df
MH
643 (if (and (assoc element dtd)
644 xml-validating-parser)
645 (error "XML: (Validity) Element declarations must be unique in a DTD (<%s>)"
461f3ad0 646 element))
a98e819b
DL
647
648 ;; Store the element in the DTD
649 (push (list element type) dtd)
650 (goto-char end-pos))
23d519e4
MH
651
652 ;; Translation of rule [52] of XML specifications
653 ((looking-at (concat "<!ATTLIST[ \t\n\r]*\\(" xml-name-re
654 "\\)[ \t\n\r]*\\(" xml-att-def-re
655 "\\)*[ \t\n\r]*>"))
656
657 ;; We don't do anything with ATTLIST currently
658 (goto-char (match-end 0)))
659
a98e819b
DL
660 ((looking-at "<!--")
661 (search-forward "-->"))
6d12a4df
MH
662 ((looking-at (concat "<!ENTITY[ \t\n\r]*\\(" xml-name-re
663 "\\)[ \t\n\r]*\\(" xml-entity-value-re
664 "\\)[ \t\n\r]*>"))
23d519e4
MH
665 (let ((name (match-string 1))
666 (value (substring (match-string 2) 1
667 (- (length (match-string 2)) 1))))
668 (goto-char (match-end 0))
6d12a4df
MH
669 (setq xml-entity-alist
670 (append xml-entity-alist
671 (list (cons name
672 (with-temp-buffer
673 (insert value)
674 (goto-char (point-min))
675 (xml-parse-fragment
676 xml-validating-parser
677 parse-ns))))))))
678 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
679 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
680 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
681 (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
682 "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
683 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
684 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
685 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
686 "[ \t\n\r]*>")))
23d519e4
MH
687 (let ((name (match-string 1))
688 (file (substring (match-string 2) 1
689 (- (length (match-string 2)) 1))))
690 (goto-char (match-end 0))
6d12a4df
MH
691 (setq xml-entity-alist
692 (append xml-entity-alist
693 (list (cons name (with-temp-buffer
694 (insert-file-contents file)
695 (goto-char (point-min))
696 (xml-parse-fragment
697 xml-validating-parser
698 parse-ns))))))))
63b446bc
MH
699 ;; skip parameter entity declarations
700 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+%[ \t\n\r]+\\(" xml-name-re
701 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
702 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
703 (looking-at (concat "<!ENTITY[ \t\n\r]+"
704 "%[ \t\n\r]+"
705 "\\(" xml-name-re "\\)[ \t\n\r]+"
706 "PUBLIC[ \t\n\r]+"
707 "\\(\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
708 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'\\)[ \t\n\r]+"
709 "\\(\"[^\"]+\"\\|'[^']+'\\)"
710 "[ \t\n\r]*>")))
711 (goto-char (match-end 0)))
712 ;; skip parameter entities
713 ((looking-at (concat "%" xml-name-re ";"))
714 (goto-char (match-end 0)))
a98e819b 715 (t
7e0f1123 716 (when xml-validating-parser
27720433 717 (error "XML: (Validity) Invalid DTD item"))))))
6d12a4df 718 (if (looking-at "\\s-*]>")
23d519e4 719 (goto-char (match-end 0))))
461f3ad0 720 (nreverse dtd)))
47db06aa
GM
721
722(defun xml-parse-elem-type (string)
a98e819b 723 "Convert element type STRING into a Lisp structure."
47db06aa
GM
724
725 (let (elem modifier)
726 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
727 (progn
728 (setq elem (match-string 1 string)
729 modifier (match-string 2 string))
730 (if (string-match "|" elem)
971489ea 731 (setq elem (cons 'choice
47db06aa
GM
732 (mapcar 'xml-parse-elem-type
733 (split-string elem "|"))))
734 (if (string-match "," elem)
971489ea 735 (setq elem (cons 'seq
47db06aa 736 (mapcar 'xml-parse-elem-type
a98e819b 737 (split-string elem ",")))))))
a158ff81
JB
738 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
739 (setq elem (match-string 1 string)
47db06aa
GM
740 modifier (match-string 2 string))))
741
971489ea
SM
742 (if (and (stringp elem) (string= elem "#PCDATA"))
743 (setq elem 'pcdata))
524425ae 744
971489ea
SM
745 (cond
746 ((string= modifier "+")
747 (list '+ elem))
748 ((string= modifier "*")
749 (list '* elem))
750 ((string= modifier "?")
0fa6f70c 751 (list '\? elem))
971489ea
SM
752 (t
753 elem))))
47db06aa 754
47db06aa
GM
755;;*******************************************************************
756;;**
757;;** Substituting special XML sequences
758;;**
759;;*******************************************************************
760
761(defun xml-substitute-special (string)
a98e819b
DL
762 "Return STRING, after subsituting entity references."
763 ;; This originally made repeated passes through the string from the
764 ;; beginning, which isn't correct, since then either "&amp;amp;" or
765 ;; "&#38;amp;" won't DTRT.
47db06aa 766
6d12a4df
MH
767 (let ((point 0)
768 children end-point)
ae026110 769 (while (string-match "&\\([^;]*\\);" string point)
6d12a4df
MH
770 (setq end-point (match-end 0))
771 (let* ((this-part (match-string 1 string))
772 (prev-part (substring string point (match-beginning 0)))
773 (entity (assoc this-part xml-entity-alist))
774 (expansion
775 (cond ((string-match "#\\([0-9]+\\)" this-part)
776 (let ((c (decode-char
777 'ucs
778 (string-to-number (match-string 1 this-part)))))
779 (if c (string c))))
780 ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
781 (let ((c (decode-char
782 'ucs
783 (string-to-number (match-string 1 this-part) 16))))
784 (if c (string c))))
785 (entity
786 (cdr entity))
ae026110 787 ((eq (length this-part) 0)
27720433 788 (error "XML: (Not Well-Formed) No entity given"))
6d12a4df 789 (t
f8ab034e 790 (if xml-validating-parser
6d12a4df 791 (error "XML: (Validity) Undefined entity `%s'"
f8ab034e
MH
792 this-part)
793 xml-undefined-entity)))))
6d12a4df
MH
794
795 (cond ((null children)
f0d49437
MH
796 ;; FIXME: If we have an entity that expands into XML, this won't work.
797 (setq children
798 (concat prev-part expansion)))
6d12a4df
MH
799 ((stringp children)
800 (if (stringp expansion)
801 (setq children (concat children prev-part expansion))
802 (setq children (list expansion (concat prev-part children)))))
803 ((and (stringp expansion)
804 (stringp (car children)))
805 (setcar children (concat prev-part expansion (car children))))
806 ((stringp expansion)
807 (setq children (append (concat prev-part expansion)
808 children)))
809 ((stringp (car children))
810 (setcar children (concat (car children) prev-part))
811 (setq children (append expansion children)))
812 (t
813 (setq children (list expansion
814 prev-part
815 children))))
816 (setq point end-point)))
817 (cond ((stringp children)
818 (concat children (substring string point)))
819 ((stringp (car (last children)))
a3110b5d 820 (concat (car (last children)) (substring string point)))
6d12a4df
MH
821 ((null children)
822 string)
823 (t
a3110b5d
MH
824 (concat (mapconcat 'identity
825 (nreverse children)
826 "")
827 (substring string point))))))
828
47db06aa
GM
829;;*******************************************************************
830;;**
831;;** Printing a tree.
832;;** This function is intended mainly for debugging purposes.
833;;**
834;;*******************************************************************
835
27240aa4
AS
836(defun xml-debug-print (xml &optional indent-string)
837 "Outputs the XML in the current buffer.
838XML can be a tree or a list of nodes.
839The first line is indented with the optional INDENT-STRING."
840 (setq indent-string (or indent-string ""))
971489ea 841 (dolist (node xml)
27240aa4
AS
842 (xml-debug-print-internal node indent-string)))
843
844(defalias 'xml-print 'xml-debug-print)
47db06aa 845
971489ea 846(defun xml-debug-print-internal (xml indent-string)
47db06aa 847 "Outputs the XML tree in the current buffer.
a98e819b 848The first line is indented with INDENT-STRING."
47db06aa
GM
849 (let ((tree xml)
850 attlist)
a98e819b 851 (insert indent-string ?< (symbol-name (xml-node-name tree)))
524425ae 852
47db06aa 853 ;; output the attribute list
971489ea 854 (setq attlist (xml-node-attributes tree))
47db06aa 855 (while attlist
a98e819b 856 (insert ?\ (symbol-name (caar attlist)) "=\"" (cdar attlist) ?\")
971489ea 857 (setq attlist (cdr attlist)))
524425ae 858
971489ea 859 (setq tree (xml-node-children tree))
47db06aa 860
27240aa4
AS
861 (if (null tree)
862 (insert ?/ ?>)
863 (insert ?>)
864
865 ;; output the children
866 (dolist (node tree)
867 (cond
868 ((listp node)
869 (insert ?\n)
870 (xml-debug-print-internal node (concat indent-string " ")))
871 ((stringp node) (insert node))
872 (t
873 (error "Invalid XML tree"))))
874
875 (when (not (and (null (cdr tree))
876 (stringp (car tree))))
877 (insert ?\n indent-string))
878 (insert ?< ?/ (symbol-name (xml-node-name xml)) ?>))))
47db06aa
GM
879
880(provide 'xml)
881
8a02e193 882;; arch-tag: 5864b283-5a68-4b59-a20d-36a72b353b9b
47db06aa 883;;; xml.el ends here