Commit | Line | Data |
---|---|---|
8cd39fb3 MH |
1 | ;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode |
2 | ||
dcb8ac09 | 3 | ;; Copyright (C) 2003, 2007, 2008 Free Software Foundation, Inc. |
8cd39fb3 MH |
4 | |
5 | ;; Author: James Clark | |
6 | ;; Keywords: XML | |
7 | ||
a2c2455c | 8 | ;; This file is part of GNU Emacs. |
8cd39fb3 | 9 | |
a2c2455c GM |
10 | ;; GNU Emacs is free software; you can redistribute it and/or modify |
11 | ;; it under the terms of the GNU General Public License as published by | |
12 | ;; the Free Software Foundation; either version 3, or (at your option) | |
13 | ;; any later version. | |
8cd39fb3 | 14 | |
a2c2455c GM |
15 | ;; GNU Emacs is distributed in the hope that it will be useful, |
16 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | ;; GNU General Public License for more details. | |
19 | ||
20 | ;; You should have received a copy of the GNU General Public License | |
21 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
22 | ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | |
23 | ;; Boston, MA 02110-1301, USA. | |
8cd39fb3 MH |
24 | |
25 | ;;; Commentary: | |
26 | ||
27 | ;; Entry point is `nxml-parse-file'. | |
28 | ||
29 | ;;; Code: | |
30 | ||
31 | (require 'nxml-util) | |
32 | (require 'xmltok) | |
33 | (require 'nxml-enc) | |
34 | (require 'nxml-ns) | |
35 | ||
(defvar nxml-parse-file-name nil
  "Name of the file currently being parsed, or nil.
`nxml-parse-file' binds this to its FILE argument for the duration of
the parse; `nxml-parse-error' and `nxml-check-xmltok-errors' pass it
along when they report an error.")
37 | ||
(defvar nxml-validate-function nil
  "Nil or a function to be called by `nxml-parse-file' to perform validation.
The function will be called once for each start-tag or end-tag.  The
function is passed two arguments TEXT and START-TAG.  For a start-tag,
START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in
the same form as returned by `nxml-parse-file'.  For an end-tag,
START-TAG is nil.  TEXT is a string containing the text immediately
preceding the tag, or nil if there was no such text.  An empty element
is treated as a start-tag followed by an end-tag.

For a start-tag, the namespace state will be the state after
processing the namespace declarations in the start-tag.  For an
end-tag, the namespace state will be the state before popping the
namespace declarations for the corresponding start-tag.

The function must return nil if no error is detected or a
cons (MESSAGE . LOCATION) where MESSAGE is a string containing
an error message and LOCATION indicates what caused the error
as follows:

- nil indicates the tag as a whole caused it; this is always allowed;

- text indicates the text caused it; this is allowed only if
  TEXT is non-nil;

- tag-close indicates the close of the tag caused it; this is
  allowed only if START-TAG is non-nil;

- (attribute-name . N) indicates that the name of the Nth attribute
  caused it; N counts from 0; this is allowed only if START-TAG is non-nil
  and N must be less than the number of attributes;

- (attribute-value . N) indicates that the value of the Nth attribute
  caused it; N counts from 0; this is allowed only if START-TAG is non-nil
  and N must be less than the number of attributes.")
73 | ||
(defun nxml-parse-file (file)
  "Parse the XML document in FILE and return it as a list.
An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN).
NAME is either a string, in the case where the name does not have a
namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a
symbol and LOCAL-NAME is a string, in the case where the name does
have a namespace.  NAMESPACE is a keyword whose name is `:URI', where
URI is the namespace name.  ATTRIBUTES is an alist of attributes where
each attribute has the form (NAME . VALUE), where NAME has the same
form as an element name, and VALUE is a string.  A namespace
declaration is represented as an attribute whose name is
\(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME).  CHILDREN is a list
containing strings and child elements; CHILDREN never contains two
consecutive strings and never contains an empty string.  Processing
instructions and comments are not represented.  The return value is a
list representing the document element.

If the XML document is not well-formed, an error having the condition
`nxml-file-parse-error' will be signaled; the error data will be a
list of the form (FILE POSITION MESSAGE), where POSITION is an integer
specifying the position where the error was detected, and MESSAGE is a
string describing the error.

The current contents of FILE will be parsed even if there is a
modified buffer currently visiting FILE.

If the variable `nxml-validate-function' is non-nil, it will be
called twice for each element, and any reported error will be signaled
in the same way as a well-formedness error."
  (let ((parse-buffer (nxml-parse-find-file file)))
    (with-current-buffer parse-buffer
      (unwind-protect
          (let ((nxml-parse-file-name file))
            (nxml-parse-instance))
        ;; Discard the parse buffer whether or not parsing succeeded.
        ;; Name it explicitly so that the right buffer is killed even
        ;; if something run during the parse left another one current.
        (kill-buffer parse-buffer)))))
109 | ||
(defun nxml-parse-find-file (file)
  "Read FILE into the buffer named \" *nXML Parse*\" and return that buffer.
The buffer is created if necessary and emptied first.  While the file
is inserted, `set-auto-coding-function' is bound to
`nxml-set-xml-coding'.  The caller's current buffer is left current."
  (let ((parse-buffer (get-buffer-create " *nXML Parse*")))
    (with-current-buffer parse-buffer
      (erase-buffer)
      (let ((set-auto-coding-function 'nxml-set-xml-coding))
        (insert-file-contents file))
      (current-buffer))))
117 | ||
(defun nxml-parse-instance ()
  "Parse the XML document in the current buffer.
The prolog is scanned with `xmltok-forward-prolog' and any error
recorded by that step is reported via `nxml-check-xmltok-errors'.
The rest of the document is then handled by `nxml-parse-instance-1',
whose value is returned.  `xmltok-dtd' is bound locally, and the work
is wrapped in `xmltok-save' and `nxml-ns-save'."
  (let ((xmltok-dtd nil))
    (xmltok-save
      (xmltok-forward-prolog)
      (nxml-check-xmltok-errors)
      (nxml-ns-save
        (nxml-parse-instance-1)))))
125 | ||
(defun nxml-parse-instance-1 ()
  "Parse the tokens following point and return the document element.
Tokens are read with `xmltok-forward' until it returns nil.  Each
element becomes the list built by `nxml-parse-start-tag' with its
children destructively appended; runs of text tokens between tags are
concatenated into a single string child.  `nxml-validate-tag' is
called for every start-tag and end-tag, an empty element counting as
both.

An error is reported through `nxml-parse-error' for a second top-level
element, a mismatched or surplus end-tag, non-whitespace text outside
the document element, an element still open at the end of the input,
or a missing document element."
  (let* ((top (cons nil nil))
         ;; tail is a cons cell, whose cdr is nil
         ;; additional elements will be destructively appended to tail
         (tail top)
         ;; stack of tails, one for each open element
         tail-stack
         ;; list of QNames of open elements
         open-element-tags
         ;; list of strings buffering a text node, in reverse order
         text
         ;; position of beginning of first (in buffer) string in text
         text-pos)
    (while (xmltok-forward)
      (nxml-check-xmltok-errors)
      (cond ((memq xmltok-type '(start-tag end-tag empty-element))
             ;; Flush buffered text as a single string child.  From
             ;; here on `text' holds that string (not a list) until it
             ;; is reset below.
             (when text
               (setq text (apply 'concat (nreverse text)))
               (setcdr tail (cons text nil))
               (setq tail (cdr tail)))
             ;; Opening half: start-tag or empty-element.
             (unless (eq xmltok-type 'end-tag)
               (when (and (not open-element-tags)
                          (not (eq tail top)))
                 (nxml-parse-error nil "Multiple top-level elements"))
               (setq open-element-tags
                     (cons (xmltok-start-tag-qname)
                           open-element-tags))
               (nxml-ns-push-state)
               (let ((tag (nxml-parse-start-tag)))
                 (nxml-validate-tag text text-pos tag)
                 (setq text nil)
                 (setcdr tail (cons tag nil))
                 (setq tail (cdr tail))
                 ;; Remember where the next sibling goes, then descend:
                 ;; children are appended after the last cell of TAG.
                 (setq tail-stack (cons tail tail-stack))
                 (setq tail (last tag))))
             ;; Closing half: end-tag or empty-element.
             (unless (eq xmltok-type 'start-tag)
               (or (eq xmltok-type 'empty-element)
                   (equal (car open-element-tags)
                          (xmltok-end-tag-qname))
                   (if open-element-tags
                       (nxml-parse-error nil
                                         "Unbalanced end-tag; expected </%s>"
                                         (car open-element-tags))
                     (nxml-parse-error nil "Extra end-tag")))
               (nxml-validate-tag text text-pos nil)
               (setq text nil)
               (nxml-ns-pop-state)
               (setq open-element-tags (cdr open-element-tags))
               (setq tail (car tail-stack))
               (setq tail-stack (cdr tail-stack)))
             (setq text-pos nil))
            ((memq xmltok-type '(space data entity-ref char-ref cdata-section))
             (cond (open-element-tags
                    (unless text-pos
                      (setq text-pos xmltok-start))
                    (setq text
                          (cons (nxml-current-text-string) text)))
                   ;; Outside the document element only whitespace is
                   ;; tolerated, and it is discarded.
                   ((not (eq xmltok-type 'space))
                    (nxml-parse-error
                     nil
                     "%s at top-level"
                     (cdr (assq xmltok-type
                                '((data . "Text characters")
                                  (entity-ref . "Entity reference")
                                  (char-ref . "Character reference")
                                  (cdata-section . "CDATA section"))))))))))
    ;; The tokenizer does not track nesting, so an element that is never
    ;; closed has to be detected here once the input is exhausted.
    (when open-element-tags
      (nxml-parse-error (point-max)
                        "Missing end-tag; expected </%s>"
                        (car open-element-tags)))
    (unless (cdr top)
      (nxml-parse-error (point-max) "Missing document element"))
    (cadr top)))
195 | ||
(defun nxml-parse-start-tag ()
  "Return a list (NAME ATTRIBUTES) for the start-tag just scanned.
The namespace declarations in `xmltok-namespace-attributes' are
processed first and recorded with `nxml-ns-set-default' or
`nxml-ns-set-prefix'.  The element name and then the names of the
attributes in `xmltok-attributes' are resolved against the resulting
namespace state.  ATTRIBUTES lists the ordinary attributes followed by
the namespace declarations, each group in its original order.

An error is reported through `nxml-parse-error' for an attribute
without a usable value, a repeated namespace declaration, an attempt
to undeclare a prefix, an undeclared prefix or a duplicate attribute."
  (let (ordinary-atts namespace-atts seen-prefixes element-name)
    (dolist (decl xmltok-namespace-attributes)
      (let* ((uri (or (xmltok-attribute-value decl)
                      (nxml-parse-error nil "Invalid attribute value")))
             (namespace (nxml-make-namespace uri))
             ;; nil here stands for the default namespace declaration.
             (declared (and (xmltok-attribute-prefix decl)
                            (xmltok-attribute-local-name decl))))
        (cond ((member declared seen-prefixes)
               (nxml-parse-error nil "Duplicate namespace declaration"))
              ((not declared)
               (nxml-ns-set-default namespace))
              (namespace
               (nxml-ns-set-prefix declared namespace))
              (t (nxml-parse-error nil "Cannot undeclare namespace prefix")))
        (push declared seen-prefixes)
        (push (cons (nxml-make-name nxml-xmlns-namespace-uri
                                    (xmltok-attribute-local-name decl))
                    uri)
              namespace-atts)))
    (setq element-name
          (let ((tag-prefix (xmltok-start-tag-prefix)))
            (nxml-make-name
             (if tag-prefix
                 (or (nxml-ns-get-prefix tag-prefix)
                     (nxml-parse-error (1+ xmltok-start)
                                       "Prefix `%s' undeclared"
                                       tag-prefix))
               (nxml-ns-get-default))
             (xmltok-start-tag-local-name))))
    (dolist (att xmltok-attributes)
      (let* ((att-prefix (xmltok-attribute-prefix att))
             ;; An attribute without a prefix gets no namespace.
             (namespace
              (and att-prefix
                   (or (nxml-ns-get-prefix att-prefix)
                       (nxml-parse-error (xmltok-attribute-name-start att)
                                         "Prefix `%s' undeclared"
                                         att-prefix))))
             (att-name (nxml-make-name namespace
                                       (xmltok-attribute-local-name att))))
        (when (assoc att-name ordinary-atts)
          (nxml-parse-error (xmltok-attribute-name-start att)
                            "Duplicate attribute"))
        (push (cons att-name
                    (or (xmltok-attribute-value att)
                        (nxml-parse-error nil "Invalid attribute value")))
              ordinary-atts)))
    ;; Both accumulators are in reverse order.  Putting the namespace
    ;; declarations in front and reversing the whole list yields the
    ;; attributes followed by the namespace attributes, in the same
    ;; order as xmltok-attributes and xmltok-namespace-attributes
    ;; respectively.
    (list element-name
          (nreverse (if namespace-atts
                        (nconc namespace-atts ordinary-atts)
                      ordinary-atts)))))
259 | ||
(defun nxml-validate-tag (text text-pos tag)
  "Call `nxml-validate-function', if non-nil, for the tag just scanned.
TEXT and TAG are passed to the function unchanged.  TEXT-POS is the
buffer position where TEXT started; it is used only to locate an error
attributed to the text.

If the function returns non-nil, the value must be a
cons (MESSAGE . LOCATION).  LOCATION is converted to a buffer position
by `nxml-validate-error-position' and MESSAGE is reported at that
position through `nxml-parse-error'.  A return value that is not a
cons, or whose LOCATION cannot be converted, signals an ordinary
error naming the validation function."
  (when nxml-validate-function
    (let ((err (funcall nxml-validate-function text tag)))
      (when err
        (let ((pos (and (consp err)
                        (nxml-validate-error-position (cdr err)
                                                      (and text text-pos)
                                                      tag))))
          (unless pos
            (error "Incorrect return value from %s"
                   nxml-validate-function))
          ;; MESSAGE is arbitrary text supplied by the validation
          ;; function, so it must not be used as the format string.
          (nxml-parse-error pos "%s" (car err)))))))
271 | ||
(defun nxml-validate-error-position (location text-pos tag)
  "Return the buffer position designated by LOCATION, or nil.
LOCATION is the cdr of a value returned by `nxml-validate-function':

- nil designates `xmltok-start';
- `text' designates TEXT-POS;
- `tag-close' designates the closing delimiter of the tag, taken to
  be 2 characters before point for an empty element and 1 otherwise;
  this is valid only when TAG is non-nil;
- (attribute-name . N) and (attribute-value . N) designate the start
  of the name or value of the Nth attribute, counting from 0 through
  `xmltok-attributes' and then `xmltok-namespace-attributes'.

The value is nil when LOCATION is not valid, in particular when N is
not a non-negative integer smaller than the number of attributes."
  (cond ((null location) xmltok-start)
        ((eq location 'text) text-pos)
        ((eq location 'tag-close)
         (and tag (- (point) (if (eq xmltok-type 'empty-element) 2 1))))
        ((consp location)
         (let* ((kind (car location))
                (n (cdr location))
                (att (and (integerp n)
                          (>= n 0)
                          (or (nth n xmltok-attributes)
                              (nth (- n (length xmltok-attributes))
                                   xmltok-namespace-attributes)))))
           ;; A clause without a body would return t here and be
           ;; mistaken for a position, so nil is returned explicitly.
           (cond ((null att) nil)
                 ((eq kind 'attribute-name)
                  (xmltok-attribute-name-start att))
                 ((eq kind 'attribute-value)
                  (xmltok-attribute-value-start att)))))))
287 | ||
(defun nxml-make-name (ns local-name)
  "Return the representation of the name LOCAL-NAME in namespace NS.
When NS is non-nil this is the cons (NS . LOCAL-NAME); otherwise it is
LOCAL-NAME itself."
  (cond (ns (cons ns local-name))
        (t local-name)))
292 | ||
(defun nxml-current-text-string ()
  "Return the text contributed by the token just scanned.
The token is taken to extend from `xmltok-start' to point.  For
`space' and `data' tokens that whole region is returned.  For a
`cdata-section' the first 9 and last 3 characters of the region are
left out.  For `char-ref' and `entity-ref' the value is
`xmltok-replacement', and an error is reported through
`nxml-parse-error' when that is nil.  The value is nil for any other
token type."
  (let ((kind xmltok-type))
    (cond ((or (eq kind 'space) (eq kind 'data))
           (buffer-substring-no-properties xmltok-start (point)))
          ((eq kind 'cdata-section)
           ;; Lengths of the "<![CDATA[" and "]]>" delimiters.
           (let ((open-len 9)
                 (close-len 3))
             (buffer-substring-no-properties (+ xmltok-start open-len)
                                             (- (point) close-len))))
          ((or (eq kind 'char-ref) (eq kind 'entity-ref))
           (when (null xmltok-replacement)
             (nxml-parse-error
              nil
              (if (eq kind 'char-ref)
                  "Reference to unsupported Unicode character"
                "Unresolvable entity reference")))
           xmltok-replacement))))
307 | ||
(defun nxml-parse-error (position &rest args)
  "Report a parse error at POSITION in the file being parsed.
When POSITION is nil, `xmltok-start' is used instead.  ARGS are a
format string followed by its arguments, as for `format'; the
formatted text is the error message.  The file name
`nxml-parse-file-name', the position and the message are handed to
`nxml-signal-file-parse-error'."
  (let ((where (or position xmltok-start))
        (msg (apply #'format args)))
    (nxml-signal-file-parse-error nxml-parse-file-name where msg)))
312 | ||
(defun nxml-check-xmltok-errors ()
  "Report a tokenizer error if `xmltok-errors' is non-nil.
The last element of `xmltok-errors' is the one reported: its start
position and message are handed, together with
`nxml-parse-file-name', to `nxml-signal-file-parse-error'.  Nothing
happens when `xmltok-errors' is nil."
  ;; NOTE(review): taking the last element presumably selects the error
  ;; detected first, if xmltok adds new errors at the front -- confirm.
  (when xmltok-errors
    (let* ((final-cell (last xmltok-errors))
           (err (car final-cell)))
      (nxml-signal-file-parse-error nxml-parse-file-name
                                    (xmltok-error-start err)
                                    (xmltok-error-message err)))))
319 | ||
320 | (provide 'nxml-parse) | |
321 | ||
ab4c34c6 | 322 | ;; arch-tag: fc19639b-1bff-4673-9992-f539da89ba1e |
8cd39fb3 | 323 | ;;; nxml-parse.el ends here |