*** empty log message ***
[bpt/emacs.git] / lisp / xml.el
CommitLineData
1cd7adc6 1;;; xml.el --- XML parser
47db06aa 2
2e9bdf15 3;; Copyright (C) 2000, 01, 03, 2004 Free Software Foundation, Inc.
47db06aa
GM
4
5;; Author: Emmanuel Briot <briot@gnat.com>
720058f2 6;; Maintainer: Mark A. Hershberger <mah@everybody.org>
a98e819b 7;; Keywords: xml, data
47db06aa
GM
8
9;; This file is part of GNU Emacs.
10
11;; GNU Emacs is free software; you can redistribute it and/or modify
12;; it under the terms of the GNU General Public License as published by
13;; the Free Software Foundation; either version 2, or (at your option)
14;; any later version.
15
16;; GNU Emacs is distributed in the hope that it will be useful,
17;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19;; GNU General Public License for more details.
20
21;; You should have received a copy of the GNU General Public License
22;; along with GNU Emacs; see the file COPYING. If not, write to the
23;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24;; Boston, MA 02111-1307, USA.
25
26;;; Commentary:
27
a98e819b
DL
28;; This file contains a somewhat incomplete non-validating XML parser. It
29;; parses a file, and returns a list that can be used internally by
a1dfa9a3 30;; any other Lisp libraries.
47db06aa
GM
31
32;;; FILE FORMAT
33
a98e819b
DL
34;; The document type declaration may either be ignored or (optionally)
35;; parsed, but currently the parsing will only accept element
a1dfa9a3 36;; declarations. The XML file is assumed to be well-formed. In case
a98e819b
DL
37;; of error, the parsing stops and the XML file is shown where the
38;; parsing stopped.
47db06aa 39;;
a98e819b 40;; It also knows how to ignore comments and processing instructions.
47db06aa
GM
41;;
42;; The XML file should have the following format:
653558a1
GM
43;; <node1 attr1="name1" attr2="name2" ...>value
44;; <node2 attr3="name3" attr4="name4">value2</node2>
45;; <node3 attr5="name5" attr6="name6">value3</node3>
47db06aa 46;; </node1>
a1dfa9a3 47;; Of course, the name of the nodes and attributes can be anything. There can
47db06aa
GM
48;; be any number of attributes (or none), as well as any number of children
49;; below the nodes.
50;;
51;; There can be only top level node, but with any number of children below.
52
53;;; LIST FORMAT
54
c7f8d055
SM
55;; The functions `xml-parse-file', `xml-parse-region' and
56;; `xml-parse-tag' return a list with the following format:
47db06aa
GM
57;;
58;; xml-list ::= (node node ...)
c7f8d055 59;; node ::= (qname attribute-list . child_node_list)
47db06aa
GM
60;; child_node_list ::= child_node child_node ...
61;; child_node ::= node | string
c7f8d055
SM
62;; qname ::= (:namespace-uri . "name") | "name"
63;; attribute_list ::= ((qname . "value") (qname . "value") ...)
47db06aa
GM
64;; | nil
65;; string ::= "..."
66;;
a98e819b
DL
67;; Some macros are provided to ease the parsing of this list.
68;; Whitespace is preserved. Fixme: There should be a tree-walker that
69;; can remove it.
47db06aa 70
c7f8d055
SM
71;; TODO:
72;; * xml:base, xml:space support
73;; * more complete DOCTYPE parsing
74;; * pi support
75
47db06aa
GM
76;;; Code:
77
a98e819b
DL
78;; Note that {buffer-substring,match-string}-no-properties were
79;; formerly used in several places, but that removes composition info.
80
47db06aa
GM
81;;*******************************************************************
82;;**
83;;** Macros to parse the list
84;;**
85;;*******************************************************************
86
6d12a4df
MH
87(defvar xml-entity-alist
88 '(("lt" . "<")
89 ("gt" . ">")
90 ("apos" . "'")
91 ("quot" . "\"")
92 ("amp" . "&"))
93 "The defined entities. Entities are added to this when the DTD is parsed.")
94
95(defvar xml-sub-parser nil
96 "Dynamically set this to a non-nil value if you want to parse an XML fragment.")
97
98(defvar xml-validating-parser nil
99 "Set to non-nil to get validity checking.")
100
971489ea 101(defsubst xml-node-name (node)
47db06aa 102 "Return the tag associated with NODE.
a1dfa9a3
SM
103Without namespace-aware parsing, the tag is a symbol.
104
105With namespace-aware parsing, the tag is a cons of a string
106representing the uri of the namespace with the local name of the
107tag. For example,
108
109 <foo>
110
111would be represented by
112
113 '(\"\" . \"foo\")."
114
971489ea 115 (car node))
47db06aa 116
971489ea 117(defsubst xml-node-attributes (node)
47db06aa
GM
118 "Return the list of attributes of NODE.
119The list can be nil."
971489ea 120 (nth 1 node))
47db06aa 121
971489ea 122(defsubst xml-node-children (node)
47db06aa
GM
123 "Return the list of children of NODE.
124This is a list of nodes, and it can be nil."
971489ea 125 (cddr node))
47db06aa
GM
126
127(defun xml-get-children (node child-name)
128 "Return the children of NODE whose tag is CHILD-NAME.
a1dfa9a3 129CHILD-NAME should match the value returned by `xml-node-name'."
971489ea
SM
130 (let ((match ()))
131 (dolist (child (xml-node-children node))
a1dfa9a3
SM
132 (if (and (listp child)
133 (equal (xml-node-name child) child-name))
134 (push child match)))
971489ea 135 (nreverse match)))
47db06aa 136
9bcd6a7e 137(defun xml-get-attribute-or-nil (node attribute)
47db06aa 138 "Get from NODE the value of ATTRIBUTE.
a1dfa9a3 139Return nil if the attribute was not found.
9bcd6a7e
EZ
140
141See also `xml-get-attribute'."
2e9bdf15 142 (cdr (assoc attribute (xml-node-attributes node))))
9bcd6a7e
EZ
143
144(defsubst xml-get-attribute (node attribute)
145 "Get from NODE the value of ATTRIBUTE.
146An empty string is returned if the attribute was not found.
147
148See also `xml-get-attribute-or-nil'."
149 (or (xml-get-attribute-or-nil node attribute) ""))
47db06aa
GM
150
151;;*******************************************************************
152;;**
153;;** Creating the list
154;;**
155;;*******************************************************************
156
a98e819b 157;;;###autoload
2d42509a 158(defun xml-parse-file (file &optional parse-dtd parse-ns)
a98e819b
DL
159 "Parse the well-formed XML file FILE.
160If FILE is already visited, use its buffer and don't kill it.
47db06aa 161Returns the top node with all its children.
2d42509a
JB
162If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
163If PARSE-NS is non-nil, then QNAMES are expanded."
653558a1
GM
164 (let ((keep))
165 (if (get-file-buffer file)
166 (progn
167 (set-buffer (get-file-buffer file))
168 (setq keep (point)))
a98e819b
DL
169 (let (auto-mode-alist) ; no need for xml-mode
170 (find-file file)))
524425ae 171
653558a1
GM
172 (let ((xml (xml-parse-region (point-min)
173 (point-max)
174 (current-buffer)
2d42509a 175 parse-dtd parse-ns)))
653558a1
GM
176 (if keep
177 (goto-char keep)
178 (kill-buffer (current-buffer)))
179 xml)))
47db06aa 180
6d12a4df 181
5ed32352
RS
182(defvar xml-name-re)
183(defvar xml-entity-value-re)
63b446bc 184(let* ((start-chars (concat "[:alpha:]:_"))
6d12a4df
MH
185 (name-chars (concat "-[:digit:]." start-chars))
186;;[3] S ::= (#x20 | #x9 | #xD | #xA)+
187 (whitespace "[ \t\n\r]"))
188;;[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
189;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
190;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
191;; | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
192 (defvar xml-name-start-char-re (concat "[" start-chars "]"))
193;;[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
194 (defvar xml-name-char-re (concat "[" name-chars "]"))
195;;[5] Name ::= NameStartChar (NameChar)*
196 (defvar xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
197;;[6] Names ::= Name (#x20 Name)*
198 (defvar xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
199;;[7] Nmtoken ::= (NameChar)+
200 (defvar xml-nmtoken-re (concat xml-name-char-re "+"))
201;;[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
202 (defvar xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
203;;[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
204 (defvar xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
205;;[68] EntityRef ::= '&' Name ';'
206 (defvar xml-entity-ref (concat "&" xml-name-re ";"))
207;;[69] PEReference ::= '%' Name ';'
208 (defvar xml-pe-reference-re (concat "%" xml-name-re ";"))
209;;[67] Reference ::= EntityRef | CharRef
210 (defvar xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
211;;[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
212;; | "'" ([^%&'] | PEReference | Reference)* "'"
213 (defvar xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re
214 "\\|" xml-reference-re "\\)*\"\\|'\\(?:[^%&']\\|"
215 xml-pe-reference-re "\\|" xml-reference-re "\\)*'\\)")))
216;;[75] ExternalID ::= 'SYSTEM' S SystemLiteral
217;; | 'PUBLIC' S PubidLiteral S SystemLiteral
218;;[76] NDataDecl ::= S 'NDATA' S
219;;[73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
220;;[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
221;;[74] PEDef ::= EntityValue | ExternalID
222;;[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
223;;[70] EntityDecl ::= GEDecl | PEDecl
224
a98e819b
DL
225;; Note that this is setup so that we can do whitespace-skipping with
226;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
227;; compared with `re-search-forward', but that has been fixed. Also
228;; note that the standard syntax table contains other characters with
229;; whitespace syntax, like NBSP, but they are invalid in contexts in
230;; which we might skip whitespace -- specifically, they're not
231;; NameChars [XML 4].
232
233(defvar xml-syntax-table
234 (let ((table (make-syntax-table)))
235 ;; Get space syntax correct per XML [3].
236 (dotimes (c 31)
237 (modify-syntax-entry c "." table)) ; all are space in standard table
238 (dolist (c '(?\t ?\n ?\r)) ; these should be space
239 (modify-syntax-entry c " " table))
240 ;; For skipping attributes.
241 (modify-syntax-entry ?\" "\"" table)
242 (modify-syntax-entry ?' "\"" table)
243 ;; Non-alnum name chars should be symbol constituents (`-' and `_'
244 ;; are OK by default).
245 (modify-syntax-entry ?. "_" table)
246 (modify-syntax-entry ?: "_" table)
247 ;; XML [89]
248 (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
249 #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
250 #x30FD #x30FE))
251 (modify-syntax-entry (decode-char 'ucs c) "w" table))
252 ;; Fixme: rest of [4]
253 table)
254 "Syntax table used by `xml-parse-region'.")
255
256;; XML [5]
257;; Note that [:alpha:] matches all multibyte chars with word syntax.
ab161457
JPW
258(eval-and-compile
259 (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
a98e819b
DL
260
261;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
262;; document ::= prolog element Misc*
263;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
264
265;;;###autoload
2d42509a 266(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
47db06aa
GM
267 "Parse the region from BEG to END in BUFFER.
268If BUFFER is nil, it defaults to the current buffer.
269Returns the XML list for the region, or raises an error if the region
2d42509a 270is not well-formed XML.
47db06aa 271If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
2d42509a
JB
272and returned as the first element of the list.
273If PARSE-NS is non-nil, then QNAMES are expanded."
a98e819b
DL
274 (save-restriction
275 (narrow-to-region beg end)
276 ;; Use fixed syntax table to ensure regexp char classes and syntax
277 ;; specs DTRT.
278 (with-syntax-table (standard-syntax-table)
279 (let ((case-fold-search nil) ; XML is case-sensitive.
280 xml result dtd)
281 (save-excursion
282 (if buffer
283 (set-buffer buffer))
284 (goto-char (point-min))
285 (while (not (eobp))
286 (if (search-forward "<" nil t)
287 (progn
288 (forward-char -1)
34638996 289 (setq result (xml-parse-tag parse-dtd parse-ns))
6d12a4df 290 (if (and xml result (not xml-sub-parser))
a98e819b 291 ;; translation of rule [1] of XML specifications
6d12a4df 292 (error "XML: (Not Well-Formed) Only one root tag allowed")
47db06aa 293 (cond
971489ea 294 ((null result))
34638996
EZ
295 ((and (listp (car result))
296 parse-dtd)
971489ea 297 (setq dtd (car result))
a98e819b
DL
298 (if (cdr result) ; possible leading comment
299 (add-to-list 'xml (cdr result))))
47db06aa 300 (t
a98e819b
DL
301 (add-to-list 'xml result)))))
302 (goto-char (point-max))))
303 (if parse-dtd
304 (cons dtd (nreverse xml))
305 (nreverse xml)))))))
47db06aa 306
c7f8d055 307(defun xml-maybe-do-ns (name default xml-ns)
a1dfa9a3
SM
308 "Perform any namespace expansion.
309NAME is the name to perform the expansion on.
c7f8d055
SM
310DEFAULT is the default namespace. XML-NS is a cons of namespace
311names to uris. When namespace-aware parsing is off, then XML-NS
312is nil.
313
314During namespace-aware parsing, any name without a namespace is
315put into the namespace identified by DEFAULT. nil is used to
316specify that the name shouldn't be given a namespace."
317 (if (consp xml-ns)
318 (let* ((nsp (string-match ":" name))
319 (lname (if nsp (substring name (match-end 0)) name))
320 (prefix (if nsp (substring name 0 (match-beginning 0)) default))
321 (special (and (string-equal lname "xmlns") (not prefix)))
322 ;; Setting default to nil will insure that there is not
323 ;; matching cons in xml-ns. In which case we
324 (ns (or (cdr (assoc (if special "xmlns" prefix)
325 xml-ns))
6d12a4df 326 "")))
c7f8d055
SM
327 (cons ns (if special "" lname)))
328 (intern name)))
47db06aa 329
6d12a4df
MH
330(defun xml-parse-fragment (&optional parse-dtd parse-ns)
331 "Parse xml-like fragments."
332 (let ((xml-sub-parser t)
333 children)
334 (while (not (eobp))
335 (let ((bit (xml-parse-tag
336 parse-dtd parse-ns)))
337 (if children
338 (setq children (append (list bit) children))
339 (if (stringp bit)
340 (setq children (list bit))
341 (setq children bit)))))
342 (reverse children)))
343
2d42509a 344(defun xml-parse-tag (&optional parse-dtd parse-ns)
a98e819b 345 "Parse the tag at point.
47db06aa
GM
346If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
347returned as the first element in the list.
2d42509a 348If PARSE-NS is non-nil, then QNAMES are expanded.
47db06aa 349Returns one of:
a98e819b
DL
350 - a list : the matching node
351 - nil : the point is not looking at a tag.
352 - a pair : the first element is the DTD, the second is the node."
6d12a4df
MH
353 (let ((xml-validating-parser (or parse-dtd xml-validating-parser))
354 (xml-ns (if (consp parse-ns)
2d42509a
JB
355 parse-ns
356 (if parse-ns
357 (list
c7f8d055 358 ;; Default for empty prefix is no namespace
6d12a4df 359 (cons "" "")
c7f8d055 360 ;; "xml" namespace
6d12a4df 361 (cons "xml" "http://www.w3.org/XML/1998/namespace")
2d42509a 362 ;; We need to seed the xmlns namespace
6d12a4df 363 (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
2d42509a
JB
364 (cond
365 ;; Processing instructions (like the <?xml version="1.0"?> tag at the
366 ;; beginning of a document).
367 ((looking-at "<\\?")
368 (search-forward "?>")
369 (skip-syntax-forward " ")
370 (xml-parse-tag parse-dtd xml-ns))
371 ;; Character data (CDATA) sections, in which no tag should be interpreted
372 ((looking-at "<!\\[CDATA\\[")
373 (let ((pos (match-end 0)))
374 (unless (search-forward "]]>" nil t)
6d12a4df 375 (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
ae026110
MH
376 (concat
377 (buffer-substring pos (match-beginning 0))
378 (xml-parse-string))))
2d42509a
JB
379 ;; DTD for the document
380 ((looking-at "<!DOCTYPE")
6d12a4df
MH
381 (let ((dtd (xml-parse-dtd parse-ns)))
382 (skip-syntax-forward " ")
383 (if xml-validating-parser
384 (cons dtd (xml-parse-tag nil xml-ns))
385 (xml-parse-tag nil xml-ns))))
2d42509a
JB
386 ;; skip comments
387 ((looking-at "<!--")
388 (search-forward "-->")
389 nil)
390 ;; end tag
391 ((looking-at "</")
392 '())
393 ;; opening tag
394 ((looking-at "<\\([^/>[:space:]]+\\)")
395 (goto-char (match-end 1))
34638996
EZ
396
397 ;; Parse this node
2d42509a 398 (let* ((node-name (match-string 1))
c7f8d055
SM
399 ;; Parse the attribute list.
400 (attrs (xml-parse-attlist xml-ns))
401 children pos)
402
403 ;; add the xmlns:* attrs to our cache
404 (when (consp xml-ns)
405 (dolist (attr attrs)
406 (when (and (consp (car attr))
6d12a4df
MH
407 (equal "http://www.w3.org/2000/xmlns/"
408 (caar attr)))
409 (push (cons (cdar attr) (cdr attr))
c7f8d055
SM
410 xml-ns))))
411
43b5fd81 412 (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
c7f8d055 413
2d42509a
JB
414 ;; is this an empty element ?
415 (if (looking-at "/>")
47db06aa 416 (progn
6d12a4df 417 (forward-char 2)
971489ea 418 (nreverse children))
6d12a4df
MH
419
420 ;; is this a valid start tag ?
421 (if (eq (char-after) ?>)
422 (progn
423 (forward-char 1)
424 ;; Now check that we have the right end-tag. Note that this
425 ;; one might contain spaces after the tag name
426 (let ((end (concat "</" node-name "\\s-*>")))
427 (while (not (looking-at end))
428 (cond
429 ((looking-at "</")
430 (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
431 node-name (point)))
432 ((= (char-after) ?<)
433 (let ((tag (xml-parse-tag nil xml-ns)))
434 (when tag
435 (push tag children))))
436 (t
437 (let ((expansion (xml-parse-string)))
438 (setq children
439 (if (stringp expansion)
440 (if (stringp (car children))
441 ;; The two strings were separated by a comment.
442 (setq children (append (concat (car children) expansion)
443 (cdr children)))
444 (setq children (append (list expansion) children)))
445 (setq children (append expansion children))))))))
446
447 (goto-char (match-end 0))
448 (nreverse children)))
449 ;; This was an invalid start tag (Expected ">", but didn't see it.)
450 (error "XML: (Well-Formed) Couldn't parse tag: %s"
451 (buffer-substring (- (point) 10) (+ (point) 1)))))))
452 (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
453 (unless xml-sub-parser ; Usually, we error out.
454 (error "XML: (Well-Formed) Invalid character"))
455
456 ;; However, if we're parsing incrementally, then we need to deal
457 ;; with stray CDATA.
458 (xml-parse-string)))))
459
460(defun xml-parse-string ()
461 "Parse the next whatever. Could be a string, or an element."
462 (let* ((pos (point))
463 (string (progn (if (search-forward "<" nil t)
464 (forward-char -1)
465 (goto-char (point-max)))
466 (buffer-substring pos (point)))))
467 ;; Clean up the string. As per XML specifications, the XML
468 ;; processor should always pass the whole string to the
469 ;; application. But \r's should be replaced:
470 ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
471 (setq pos 0)
472 (while (string-match "\r\n?" string pos)
473 (setq string (replace-match "\n" t t string))
474 (setq pos (1+ (match-beginning 0))))
475
476 (xml-substitute-special string)))
47db06aa 477
c7f8d055 478(defun xml-parse-attlist (&optional xml-ns)
a1dfa9a3
SM
479 "Return the attribute-list after point.
480Leave point at the first non-blank character after the tag."
971489ea 481 (let ((attlist ())
34638996 482 end-pos name)
a98e819b
DL
483 (skip-syntax-forward " ")
484 (while (looking-at (eval-when-compile
485 (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
c7f8d055
SM
486 (setq end-pos (match-end 0))
487 (setq name (xml-maybe-do-ns (match-string 1) nil xml-ns))
488 (goto-char end-pos)
47db06aa 489
a158ff81
JB
490 ;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
491
47db06aa
GM
492 ;; Do we have a string between quotes (or double-quotes),
493 ;; or a simple word ?
a158ff81 494 (if (looking-at "\"\\([^\"]*\\)\"")
34638996 495 (setq end-pos (match-end 0))
f0ec1711 496 (if (looking-at "'\\([^']*\\)'")
34638996 497 (setq end-pos (match-end 0))
6d12a4df 498 (error "XML: (Not Well-Formed) Attribute values must be given between quotes")))
47db06aa
GM
499
500 ;; Each attribute must be unique within a given element
501 (if (assoc name attlist)
6d12a4df 502 (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
524425ae 503
a158ff81
JB
504 ;; Multiple whitespace characters should be replaced with a single one
505 ;; in the attributes
a98e819b 506 (let ((string (match-string 1))
a158ff81 507 (pos 0))
a98e819b 508 (replace-regexp-in-string "\\s-\\{2,\\}" " " string)
6d12a4df
MH
509 (let ((expansion (xml-substitute-special string)))
510 (unless (stringp expansion)
511 ; We say this is the constraint. It is acctually that
512 ; external entities nor "<" can be in an attribute value.
513 (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
514 (push (cons name expansion) attlist)))
a158ff81 515
34638996 516 (goto-char end-pos)
a98e819b 517 (skip-syntax-forward " "))
971489ea 518 (nreverse attlist)))
47db06aa
GM
519
520;;*******************************************************************
521;;**
522;;** The DTD (document type declaration)
523;;** The following functions know how to skip or parse the DTD of
524;;** a document
525;;**
526;;*******************************************************************
527
a98e819b
DL
528;; Fixme: This fails at least if the DTD contains conditional sections.
529
530(defun xml-skip-dtd ()
531 "Skip the DTD at point.
47db06aa 532This follows the rule [28] in the XML specifications."
6d12a4df
MH
533 (let ((xml-validating-parser nil))
534 (xml-parse-dtd)))
47db06aa 535
6d12a4df 536(defun xml-parse-dtd (&optional parse-ns)
a98e819b
DL
537 "Parse the DTD at point."
538 (forward-char (eval-when-compile (length "<!DOCTYPE")))
539 (skip-syntax-forward " ")
6d12a4df
MH
540 (if (and (looking-at ">")
541 xml-validating-parser)
542 (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
524425ae 543
971489ea 544 ;; Get the name of the document
a98e819b
DL
545 (looking-at xml-name-regexp)
546 (let ((dtd (list (match-string 0) 'dtd))
971489ea 547 type element end-pos)
47db06aa
GM
548 (goto-char (match-end 0))
549
a98e819b
DL
550 (skip-syntax-forward " ")
551 ;; XML [75]
552 (cond ((looking-at "PUBLIC\\s-+")
553 (goto-char (match-end 0))
554 (unless (or (re-search-forward
555 "\\=\"\\([[:space:][:alnum:]-'()+,./:=?;!*#@$_%]*\\)\""
556 nil t)
557 (re-search-forward
558 "\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
559 nil t))
6d12a4df 560 (error "XML: Missing Public ID"))
a98e819b 561 (let ((pubid (match-string 1)))
6d12a4df 562 (skip-syntax-forward " ")
a98e819b
DL
563 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
564 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 565 (error "XML: Missing System ID"))
a98e819b
DL
566 (push (list pubid (match-string 1) 'public) dtd)))
567 ((looking-at "SYSTEM\\s-+")
568 (goto-char (match-end 0))
569 (unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
570 (re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
6d12a4df 571 (error "XML: Missing System ID"))
a98e819b
DL
572 (push (list (match-string 1) 'system) dtd)))
573 (skip-syntax-forward " ")
574 (if (eq ?> (char-after))
575 (forward-char)
a98e819b 576 (if (not (eq (char-after) ?\[))
6d12a4df 577 (error "XML: Bad DTD")
a98e819b
DL
578 (forward-char)
579 ;; Parse the rest of the DTD
6d12a4df 580 ;; Fixme: Deal with ATTLIST, NOTATION, PIs.
a98e819b
DL
581 (while (not (looking-at "\\s-*\\]"))
582 (skip-syntax-forward " ")
583 (cond
584
585 ;; Translation of rule [45] of XML specifications
586 ((looking-at
587 "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
588
34638996 589 (setq element (match-string 1)
a98e819b
DL
590 type (match-string-no-properties 2))
591 (setq end-pos (match-end 0))
592
593 ;; Translation of rule [46] of XML specifications
594 (cond
595 ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
596 (setq type 'empty))
597 ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
598 (setq type 'any))
599 ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
600 (setq type (xml-parse-elem-type (match-string 1 type))))
601 ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
602 nil)
603 (t
6d12a4df 604 (if xml-validating-parser
07f7e9ae 605 (error "XML: (Validity) Invalid element type in the DTD"))))
27720433 606
a98e819b 607 ;; rule [45]: the element declaration must be unique
6d12a4df
MH
608 (if (and (assoc element dtd)
609 xml-validating-parser)
610 (error "XML: (Validity) Element declarations must be unique in a DTD (<%s>)"
461f3ad0 611 element))
a98e819b
DL
612
613 ;; Store the element in the DTD
614 (push (list element type) dtd)
615 (goto-char end-pos))
616 ((looking-at "<!--")
617 (search-forward "-->"))
6d12a4df
MH
618 ((looking-at (concat "<!ENTITY[ \t\n\r]*\\(" xml-name-re
619 "\\)[ \t\n\r]*\\(" xml-entity-value-re
620 "\\)[ \t\n\r]*>"))
621 (let ((name (buffer-substring (nth 2 (match-data))
622 (nth 3 (match-data))))
623 (value (buffer-substring (+ (nth 4 (match-data)) 1)
624 (- (nth 5 (match-data)) 1))))
625 (goto-char (nth 1 (match-data)))
626 (setq xml-entity-alist
627 (append xml-entity-alist
628 (list (cons name
629 (with-temp-buffer
630 (insert value)
631 (goto-char (point-min))
632 (xml-parse-fragment
633 xml-validating-parser
634 parse-ns))))))))
635 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
636 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
637 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
638 (looking-at (concat "<!ENTITY[ \t\n\r]+\\(" xml-name-re
639 "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
640 "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
641 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
642 "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
643 "[ \t\n\r]*>")))
644 (let ((name (buffer-substring (nth 2 (match-data))
645 (nth 3 (match-data))))
646 (file (buffer-substring (+ (nth 4 (match-data)) 1)
647 (- (nth 5 (match-data)) 1))))
648 (goto-char (nth 1 (match-data)))
649 (setq xml-entity-alist
650 (append xml-entity-alist
651 (list (cons name (with-temp-buffer
652 (insert-file-contents file)
653 (goto-char (point-min))
654 (xml-parse-fragment
655 xml-validating-parser
656 parse-ns))))))))
63b446bc
MH
657 ;; skip parameter entity declarations
658 ((or (looking-at (concat "<!ENTITY[ \t\n\r]+%[ \t\n\r]+\\(" xml-name-re
659 "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
660 "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>"))
661 (looking-at (concat "<!ENTITY[ \t\n\r]+"
662 "%[ \t\n\r]+"
663 "\\(" xml-name-re "\\)[ \t\n\r]+"
664 "PUBLIC[ \t\n\r]+"
665 "\\(\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
666 "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'\\)[ \t\n\r]+"
667 "\\(\"[^\"]+\"\\|'[^']+'\\)"
668 "[ \t\n\r]*>")))
669 (goto-char (match-end 0)))
670 ;; skip parameter entities
671 ((looking-at (concat "%" xml-name-re ";"))
672 (goto-char (match-end 0)))
a98e819b 673 (t
7e0f1123 674 (when xml-validating-parser
27720433 675 (error "XML: (Validity) Invalid DTD item"))))))
6d12a4df
MH
676 (if (looking-at "\\s-*]>")
677 (goto-char (nth 1 (match-data)))))
461f3ad0 678 (nreverse dtd)))
47db06aa
GM
679
680(defun xml-parse-elem-type (string)
a98e819b 681 "Convert element type STRING into a Lisp structure."
47db06aa
GM
682
683 (let (elem modifier)
684 (if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
685 (progn
686 (setq elem (match-string 1 string)
687 modifier (match-string 2 string))
688 (if (string-match "|" elem)
971489ea 689 (setq elem (cons 'choice
47db06aa
GM
690 (mapcar 'xml-parse-elem-type
691 (split-string elem "|"))))
692 (if (string-match "," elem)
971489ea 693 (setq elem (cons 'seq
47db06aa 694 (mapcar 'xml-parse-elem-type
a98e819b 695 (split-string elem ",")))))))
a158ff81
JB
696 (if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
697 (setq elem (match-string 1 string)
47db06aa
GM
698 modifier (match-string 2 string))))
699
971489ea
SM
700 (if (and (stringp elem) (string= elem "#PCDATA"))
701 (setq elem 'pcdata))
524425ae 702
971489ea
SM
703 (cond
704 ((string= modifier "+")
705 (list '+ elem))
706 ((string= modifier "*")
707 (list '* elem))
708 ((string= modifier "?")
0fa6f70c 709 (list '\? elem))
971489ea
SM
710 (t
711 elem))))
47db06aa 712
47db06aa
GM
713;;*******************************************************************
714;;**
715;;** Substituting special XML sequences
716;;**
717;;*******************************************************************
718
719(defun xml-substitute-special (string)
a98e819b
DL
720 "Return STRING, after subsituting entity references."
721 ;; This originally made repeated passes through the string from the
722 ;; beginning, which isn't correct, since then either "&amp;amp;" or
723 ;; "&#38;amp;" won't DTRT.
47db06aa 724
6d12a4df
MH
725 (let ((point 0)
726 children end-point)
ae026110 727 (while (string-match "&\\([^;]*\\);" string point)
6d12a4df
MH
728 (setq end-point (match-end 0))
729 (let* ((this-part (match-string 1 string))
730 (prev-part (substring string point (match-beginning 0)))
731 (entity (assoc this-part xml-entity-alist))
732 (expansion
733 (cond ((string-match "#\\([0-9]+\\)" this-part)
734 (let ((c (decode-char
735 'ucs
736 (string-to-number (match-string 1 this-part)))))
737 (if c (string c))))
738 ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
739 (let ((c (decode-char
740 'ucs
741 (string-to-number (match-string 1 this-part) 16))))
742 (if c (string c))))
743 (entity
744 (cdr entity))
ae026110 745 ((eq (length this-part) 0)
27720433 746 (error "XML: (Not Well-Formed) No entity given"))
6d12a4df 747 (t
7e0f1123 748 (when xml-validating-parser
6d12a4df 749 (error "XML: (Validity) Undefined entity `%s'"
ae026110 750 this-part))))))
6d12a4df
MH
751
752 (cond ((null children)
f0d49437
MH
753 ;; FIXME: If we have an entity that expands into XML, this won't work.
754 (setq children
755 (concat prev-part expansion)))
6d12a4df
MH
756 ((stringp children)
757 (if (stringp expansion)
758 (setq children (concat children prev-part expansion))
759 (setq children (list expansion (concat prev-part children)))))
760 ((and (stringp expansion)
761 (stringp (car children)))
762 (setcar children (concat prev-part expansion (car children))))
763 ((stringp expansion)
764 (setq children (append (concat prev-part expansion)
765 children)))
766 ((stringp (car children))
767 (setcar children (concat (car children) prev-part))
768 (setq children (append expansion children)))
769 (t
770 (setq children (list expansion
771 prev-part
772 children))))
773 (setq point end-point)))
774 (cond ((stringp children)
775 (concat children (substring string point)))
776 ((stringp (car (last children)))
a3110b5d 777 (concat (car (last children)) (substring string point)))
6d12a4df
MH
778 ((null children)
779 string)
780 (t
a3110b5d
MH
781 (concat (mapconcat 'identity
782 (nreverse children)
783 "")
784 (substring string point))))))
785
47db06aa
GM
786;;*******************************************************************
787;;**
788;;** Printing a tree.
789;;** This function is intended mainly for debugging purposes.
790;;**
791;;*******************************************************************
792
27240aa4
AS
793(defun xml-debug-print (xml &optional indent-string)
794 "Outputs the XML in the current buffer.
795XML can be a tree or a list of nodes.
796The first line is indented with the optional INDENT-STRING."
797 (setq indent-string (or indent-string ""))
971489ea 798 (dolist (node xml)
27240aa4
AS
799 (xml-debug-print-internal node indent-string)))
800
801(defalias 'xml-print 'xml-debug-print)
47db06aa 802
971489ea 803(defun xml-debug-print-internal (xml indent-string)
47db06aa 804 "Outputs the XML tree in the current buffer.
a98e819b 805The first line is indented with INDENT-STRING."
47db06aa
GM
806 (let ((tree xml)
807 attlist)
a98e819b 808 (insert indent-string ?< (symbol-name (xml-node-name tree)))
524425ae 809
47db06aa 810 ;; output the attribute list
971489ea 811 (setq attlist (xml-node-attributes tree))
47db06aa 812 (while attlist
a98e819b 813 (insert ?\ (symbol-name (caar attlist)) "=\"" (cdar attlist) ?\")
971489ea 814 (setq attlist (cdr attlist)))
524425ae 815
971489ea 816 (setq tree (xml-node-children tree))
47db06aa 817
27240aa4
AS
818 (if (null tree)
819 (insert ?/ ?>)
820 (insert ?>)
821
822 ;; output the children
823 (dolist (node tree)
824 (cond
825 ((listp node)
826 (insert ?\n)
827 (xml-debug-print-internal node (concat indent-string " ")))
828 ((stringp node) (insert node))
829 (t
830 (error "Invalid XML tree"))))
831
832 (when (not (and (null (cdr tree))
833 (stringp (car tree))))
834 (insert ?\n indent-string))
835 (insert ?< ?/ (symbol-name (xml-node-name xml)) ?>))))
47db06aa
GM
836
837(provide 'xml)
838
8a02e193 839;; arch-tag: 5864b283-5a68-4b59-a20d-36a72b353b9b
47db06aa 840;;; xml.el ends here