Switch to recommended form of GPLv3 permissions notice.
[bpt/emacs.git] / lisp / nxml / nxml-parse.el
CommitLineData
8cd39fb3
MH
1;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode
2
dcb8ac09 3;; Copyright (C) 2003, 2007, 2008 Free Software Foundation, Inc.
8cd39fb3
MH
4
5;; Author: James Clark
6;; Keywords: XML
7
a2c2455c 8;; This file is part of GNU Emacs.
8cd39fb3 9
a2c2455c
GM
10;; GNU Emacs is free software; you can redistribute it and/or modify
11;; it under the terms of the GNU General Public License as published by
12;; the Free Software Foundation; either version 3, or (at your option)
13;; any later version.
8cd39fb3 14
a2c2455c
GM
15;; GNU Emacs is distributed in the hope that it will be useful,
16;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18;; GNU General Public License for more details.
19
20;; You should have received a copy of the GNU General Public License
21;; along with GNU Emacs; see the file COPYING. If not, write to the
22;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23;; Boston, MA 02110-1301, USA.
8cd39fb3
MH
24
25;;; Commentary:
26
27;; Entry point is `nxml-parse-file'.
28
29;;; Code:
30
31(require 'nxml-util)
32(require 'xmltok)
33(require 'nxml-enc)
34(require 'nxml-ns)
35
(defvar nxml-parse-file-name nil
  "Name of the file being parsed by `nxml-parse-file', or nil.
`nxml-parse-file' binds this to its FILE argument for the duration of
the parse; `nxml-parse-error' and `nxml-check-xmltok-errors' pass it
on as the file name when they signal a parse error.")
37
(defvar nxml-validate-function nil
  "Nil or a function to be called by `nxml-parse-file' to perform validation.
The function will be called once for each start-tag or end-tag. The
function is passed two arguments TEXT and START-TAG. For a start-tag,
START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in
the same form as returned by `nxml-parse-file'. For an end-tag,
START-TAG is nil. TEXT is a string containing the text immediately
preceding the tag, or nil if there was no such text. An empty element
is treated as a start-tag followed by an end-tag.

For a start-tag, the namespace state will be the state after
processing the namespace declarations in the start-tag. For an
end-tag, the namespace state will be the state before popping the
namespace declarations for the corresponding start-tag.

The function must return nil if no error is detected or a
cons (MESSAGE . LOCATION) where MESSAGE is a string containing
an error message and LOCATION indicates what caused the error
as follows:

- nil indicates the tag as a whole caused it; this is always allowed;

- text indicates the text caused it; this is allowed only if
TEXT is non-nil;

- tag-close indicates the close of the tag caused it; this is
allowed only if START-TAG is non-nil;

- (attribute-name . N) indicates that the name of the Nth attribute
caused it; N counts from 0; this is allowed only if START-TAG is non-nil
and N must be less than the number of attributes;

- (attribute-value . N) indicates that the value of the Nth attribute
caused it; N counts from 0; this is allowed only if START-TAG is non-nil
and N must be less than the number of attributes.")
73
(defun nxml-parse-file (file)
  "Parse the XML document in FILE and return it as a list.
An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN).
NAME is either a string, in the case where the name does not have a
namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a
symbol and LOCAL-NAME is a string, in the case where the name does
have a namespace. NAMESPACE is a keyword whose name is `:URI', where
URI is the namespace name. ATTRIBUTES is an alist of attributes where
each attribute has the form (NAME . VALUE), where NAME has the same
form as an element name, and VALUE is a string. A namespace
declaration is represented as an attribute whose name is
\(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME). CHILDREN is a list
containing strings and child elements; CHILDREN never contains two
consecutive strings and never contains an empty string. Processing
instructions and comments are not represented. The return value is a
list representing the document element.

If the XML document is not well-formed, an error having the condition
`nxml-file-parse-error' will be signaled; the error data will be a
list of the form (FILE POSITION MESSAGE), where POSITION is an integer
specifying the position where the error was detected, and MESSAGE is a
string describing the error.

The current contents of FILE will be parsed even if there is a
modified buffer currently visiting FILE.

If the variable `nxml-validate-function' is non-nil, it will be
called twice for each element, and any reported error will be signaled
in the same way as a well-formedness error."
  (let ((buffer (nxml-parse-find-file file)))
    (unwind-protect
        ;; `with-current-buffer' restores the caller's buffer on exit,
        ;; normal or not, without the point/mark bookkeeping of
        ;; `save-excursion'.
        (with-current-buffer buffer
          (let ((nxml-parse-file-name file))
            (nxml-parse-instance)))
      ;; Kill the scratch buffer by identity rather than "whatever
      ;; buffer is current", so a validation function that leaves
      ;; another buffer current cannot cause the wrong one to be killed.
      (when (buffer-live-p buffer)
        (kill-buffer buffer)))))
109
(defun nxml-parse-find-file (file)
  "Load the contents of FILE into a scratch buffer and return that buffer.
The buffer is named \" *nXML Parse*\"; it is created if necessary and
erased before FILE is inserted.  The caller's current buffer is left
current."
  (with-current-buffer (get-buffer-create " *nXML Parse*")
    (erase-buffer)
    ;; Decode using `nxml-set-xml-coding' instead of the default
    ;; auto-coding function (presumably so that the XML encoding
    ;; declaration decides the coding system -- confirm in nxml-enc).
    (let ((set-auto-coding-function 'nxml-set-xml-coding))
      (insert-file-contents file))
    (current-buffer)))
117
(defun nxml-parse-instance ()
  "Parse the XML document in the current buffer and return its root element.
Scans the prolog with `xmltok-forward-prolog', signals any error the
tokenizer recorded, and then hands over to `nxml-parse-instance-1'.
The work is wrapped in `xmltok-save' and `nxml-ns-save', and
`xmltok-dtd' is rebound to nil for its duration."
  (let ((xmltok-dtd nil))
    (xmltok-save
      (xmltok-forward-prolog)
      (nxml-check-xmltok-errors)
      (nxml-ns-save
        (nxml-parse-instance-1)))))
125
(defun nxml-parse-instance-1 ()
  "Parse the document element and return it as (NAME ATTRIBUTES . CHILDREN).
Tokens are read with `xmltok-forward' until it returns nil.  The
result tree is built destructively by appending to the tail of the
child list of the innermost open element.

A parse error is signaled (via `nxml-parse-error') for multiple
top-level elements, unbalanced or extra end-tags, elements left open
at the end of the input, non-whitespace text at top level, and a
missing document element.  `nxml-validate-tag' is called for every
start-tag and end-tag; an empty element counts as both."
  (let* ((top (cons nil nil))
         ;; tail is a cons cell, whose cdr is nil
         ;; additional elements will be destructively appended to tail
         (tail top)
         ;; stack of tails one for each open element
         tail-stack
         ;; list of QNames of open elements
         open-element-tags
         ;; list of strings buffering a text node, in reverse order
         text
         ;; position of beginning of first (in buffer) string in text
         text-pos)
    (while (xmltok-forward)
      (nxml-check-xmltok-errors)
      (cond ((memq xmltok-type '(start-tag end-tag empty-element))
             ;; Flush the buffered text node in front of this tag.
             (when text
               (setq text (apply #'concat (nreverse text)))
               (if (equal text "")
                   ;; An empty CDATA section or an entity with an
                   ;; empty replacement contributes no characters;
                   ;; CHILDREN must never contain an empty string.
                   (setq text nil)
                 (setcdr tail (list text))
                 (setq tail (cdr tail))))
             (unless (eq xmltok-type 'end-tag)
               ;; Start-tag or empty element: open a new element.
               (when (and (not open-element-tags)
                          (not (eq tail top)))
                 (nxml-parse-error nil "Multiple top-level elements"))
               (setq open-element-tags
                     (cons (xmltok-start-tag-qname)
                           open-element-tags))
               (nxml-ns-push-state)
               (let ((tag (nxml-parse-start-tag)))
                 (nxml-validate-tag text text-pos tag)
                 (setq text nil)
                 (setcdr tail (list tag))
                 (setq tail (cdr tail))
                 ;; Remember where the parent's child list continues,
                 ;; then start appending to the new element itself.
                 (setq tail-stack (cons tail tail-stack))
                 (setq tail (last tag))))
             (unless (eq xmltok-type 'start-tag)
               ;; End-tag or empty element: close the innermost element.
               (or (eq xmltok-type 'empty-element)
                   (equal (car open-element-tags)
                          (xmltok-end-tag-qname))
                   (if open-element-tags
                       (nxml-parse-error nil
                                         "Unbalanced end-tag; expected </%s>"
                                         (car open-element-tags))
                     (nxml-parse-error nil "Extra end-tag")))
               (nxml-validate-tag text text-pos nil)
               (setq text nil)
               (nxml-ns-pop-state)
               (setq open-element-tags (cdr open-element-tags))
               (setq tail (car tail-stack))
               (setq tail-stack (cdr tail-stack)))
             (setq text-pos nil))
            ((memq xmltok-type '(space data entity-ref char-ref cdata-section))
             (cond (open-element-tags
                    (unless text-pos
                      (setq text-pos xmltok-start))
                    (setq text
                          (cons (nxml-current-text-string) text)))
                   ;; Outside the document element only whitespace is
                   ;; allowed, and it is ignored.
                   ((not (eq xmltok-type 'space))
                    (nxml-parse-error
                     nil
                     "%s at top-level"
                     (cdr (assq xmltok-type
                                '((data . "Text characters")
                                  (entity-ref . "Entity reference")
                                  (char-ref . "Character reference")
                                  (cdata-section . "CDATA section"))))))))))
    ;; Running out of tokens while elements are still open means the
    ;; document is not well-formed; do not return a partial tree.
    (when open-element-tags
      (nxml-parse-error (point-max)
                        "Missing end-tag; expected </%s>"
                        (car open-element-tags)))
    (unless (cdr top)
      (nxml-parse-error (point-max) "Missing document element"))
    (cadr top)))
195
(defun nxml-parse-start-tag ()
  "Return the current start-tag token as a list (NAME ATTRIBUTES).
The namespace declarations in `xmltok-namespace-attributes' are
applied to the namespace state first, then the element name and the
attributes in `xmltok-attributes' are resolved against that state.
ATTRIBUTES lists the ordinary attributes in document order followed
by the namespace declarations in document order.  Signals a parse
error for an invalid attribute value, a duplicate namespace
declaration or attribute, an attempt to undeclare a prefix, or an
undeclared prefix."
  (let (plain-attrs ns-attrs seen-prefixes element-name)
    ;; Pass 1: namespace declarations.
    (dolist (decl xmltok-namespace-attributes)
      (let* ((uri (or (xmltok-attribute-value decl)
                      (nxml-parse-error nil "Invalid attribute value")))
             (namespace (nxml-make-namespace uri))
             ;; nil for a default-namespace declaration, otherwise the
             ;; prefix being declared.
             (declared (and (xmltok-attribute-prefix decl)
                            (xmltok-attribute-local-name decl))))
        (cond ((member declared seen-prefixes)
               (nxml-parse-error nil "Duplicate namespace declaration"))
              ((null declared)
               (nxml-ns-set-default namespace))
              (namespace
               (nxml-ns-set-prefix declared namespace))
              (t
               (nxml-parse-error nil "Cannot undeclare namespace prefix")))
        (setq seen-prefixes (cons declared seen-prefixes))
        (setq ns-attrs
              (cons (cons (nxml-make-name nxml-xmlns-namespace-uri
                                          (xmltok-attribute-local-name decl))
                          uri)
                    ns-attrs))))
    ;; Element name, resolved with the declarations just processed.
    (setq element-name
          (let ((tag-prefix (xmltok-start-tag-prefix)))
            (nxml-make-name
             (if tag-prefix
                 (or (nxml-ns-get-prefix tag-prefix)
                     (nxml-parse-error (1+ xmltok-start)
                                       "Prefix `%s' undeclared"
                                       tag-prefix))
               (nxml-ns-get-default))
             (xmltok-start-tag-local-name))))
    ;; Pass 2: ordinary attributes.  An unprefixed attribute gets no
    ;; namespace.
    (dolist (attr xmltok-attributes)
      (let* ((attr-prefix (xmltok-attribute-prefix attr))
             (attr-ns (and attr-prefix
                           (or (nxml-ns-get-prefix attr-prefix)
                               (nxml-parse-error
                                (xmltok-attribute-name-start attr)
                                "Prefix `%s' undeclared"
                                attr-prefix))))
             (attr-name (nxml-make-name attr-ns
                                        (xmltok-attribute-local-name attr))))
        (when (assoc attr-name plain-attrs)
          (nxml-parse-error (xmltok-attribute-name-start attr)
                            "Duplicate attribute"))
        (setq plain-attrs
              (cons (cons attr-name
                          (or (xmltok-attribute-value attr)
                              (nxml-parse-error nil
                                                "Invalid attribute value")))
                    plain-attrs))))
    ;; Both accumulators are in reverse order.  Putting the reversed
    ;; namespace declarations in front and reversing the whole list
    ;; gives the attributes followed by the namespace declarations,
    ;; each group in its original order.
    (list element-name (nreverse (nconc ns-attrs plain-attrs)))))
259
(defun nxml-validate-tag (text text-pos tag)
  "Run `nxml-validate-function', if any, on the current tag.
TEXT is the text preceding the tag or nil, TEXT-POS is the buffer
position where that text starts, and TAG is the parsed start-tag
\(NAME ATTRIBUTES) or nil for an end-tag.  If the validation function
returns a cons (MESSAGE . LOCATION), LOCATION is turned into a buffer
position and a parse error carrying MESSAGE is signaled there.  A
LOCATION that cannot be turned into a position is reported with
`error' as an incorrect return value."
  (when nxml-validate-function
    (let ((err (funcall nxml-validate-function text tag))
          pos)
      (when err
        (setq pos (nxml-validate-error-position (cdr err)
                                                (and text text-pos)
                                                tag))
        (or pos (error "Incorrect return value from %s"
                       nxml-validate-function))
        ;; MESSAGE comes from the validation function and may contain
        ;; `%'; never use it as the format control string itself.
        (nxml-parse-error pos "%s" (car err))))))
271
(defun nxml-validate-error-position (location text-pos tag)
  "Return the buffer position denoted by LOCATION, or nil if it is invalid.
LOCATION is the cdr of the value returned by `nxml-validate-function':

- nil maps to `xmltok-start';
- `text' maps to TEXT-POS (nil if there was no text);
- `tag-close' maps to the start of the tag's closing delimiter, or to
  nil when TAG is nil;
- (attribute-name . N) and (attribute-value . N) map to the start of
  the name or value of the Nth attribute, counting from 0 through
  `xmltok-attributes' and then `xmltok-namespace-attributes'.

Anything else, including an N that is not a non-negative integer or
is out of range, yields nil."
  (cond ((null location) xmltok-start)
        ((eq location 'text) text-pos)
        ((eq location 'tag-close)
         ;; Step back over "/>" (2 characters) for an empty element and
         ;; over ">" (1 character) otherwise; point is presumably just
         ;; after the tag here.
         (and tag (- (point) (if (eq xmltok-type 'empty-element) 2 1))))
        ((consp location)
         (let* ((index (cdr location))
                (att (and (natnump index)
                          (or (nth index xmltok-attributes)
                              (nth (- index (length xmltok-attributes))
                                   xmltok-namespace-attributes)))))
           ;; A `cond' clause without a body returns its test value, so
           ;; the nil result for a missing attribute must be explicit.
           (cond ((not att) nil)
                 ((eq (car location) 'attribute-name)
                  (xmltok-attribute-name-start att))
                 ((eq (car location) 'attribute-value)
                  (xmltok-attribute-value-start att)))))))
287
(defun nxml-make-name (ns local-name)
  "Return the parsed-name representation of LOCAL-NAME in namespace NS.
The result is the cons (NS . LOCAL-NAME) when NS is non-nil and
LOCAL-NAME itself when NS is nil."
  (cond ((null ns) local-name)
        (t (cons ns local-name))))
292
(defun nxml-current-text-string ()
  "Return the text contributed by the current xmltok token.
For `space' and `data' tokens this is the buffer text from
`xmltok-start' to point.  For a `cdata-section' it is that text
without its first 9 and last 3 characters.  For `char-ref' and
`entity-ref' tokens it is `xmltok-replacement'; a parse error is
signaled when there is no replacement.  Any other token type yields
nil."
  (let ((kind xmltok-type))
    (cond ((eq kind 'cdata-section)
           ;; 9 and 3 are the lengths of "<![CDATA[" and "]]>",
           ;; presumably the delimiters enclosing the token.
           (buffer-substring-no-properties (+ xmltok-start 9)
                                           (- (point) 3)))
          ((or (eq kind 'space) (eq kind 'data))
           (buffer-substring-no-properties xmltok-start (point)))
          ((or (eq kind 'char-ref) (eq kind 'entity-ref))
           (when (null xmltok-replacement)
             (nxml-parse-error
              nil
              (if (eq kind 'char-ref)
                  "Reference to unsupported Unicode character"
                "Unresolvable entity reference")))
           xmltok-replacement))))
307
(defun nxml-parse-error (position &rest args)
  "Signal a file parse error at POSITION for `nxml-parse-file-name'.
ARGS are a format control string and its arguments, as for `format'.
When POSITION is nil, `xmltok-start' is used instead."
  (let ((where (or position xmltok-start))
        (description (apply #'format args)))
    (nxml-signal-file-parse-error nxml-parse-file-name
                                  where
                                  description)))
312
(defun nxml-check-xmltok-errors ()
  "Signal a file parse error if the tokenizer has recorded any error.
Does nothing when `xmltok-errors' is nil.  Otherwise the last element
of `xmltok-errors' is reported, using its start position and message
together with `nxml-parse-file-name'."
  (and xmltok-errors
       (let ((reported (car (last xmltok-errors))))
         (nxml-signal-file-parse-error nxml-parse-file-name
                                       (xmltok-error-start reported)
                                       (xmltok-error-message reported)))))
319
320(provide 'nxml-parse)
321
ab4c34c6 322;; arch-tag: fc19639b-1bff-4673-9992-f539da89ba1e
8cd39fb3 323;;; nxml-parse.el ends here