Commit | Line | Data |
---|---|---|
8cd39fb3 MH |
1 | ;;; xmltok.el --- XML tokenization |
2 | ||
ab422c4d | 3 | ;; Copyright (C) 2003, 2007-2013 Free Software Foundation, Inc. |
8cd39fb3 MH |
4 | |
5 | ;; Author: James Clark | |
6 | ;; Keywords: XML | |
7 | ||
9499da38 | 8 | ;; This file is part of GNU Emacs. |
8cd39fb3 | 9 | |
4936186e | 10 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
9499da38 | 11 | ;; it under the terms of the GNU General Public License as published by |
4936186e GM |
12 | ;; the Free Software Foundation, either version 3 of the License, or |
13 | ;; (at your option) any later version. | |
8cd39fb3 | 14 | |
9499da38 GM |
15 | ;; GNU Emacs is distributed in the hope that it will be useful, |
16 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | ;; GNU General Public License for more details. | |
19 | ||
20 | ;; You should have received a copy of the GNU General Public License | |
4936186e | 21 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
8cd39fb3 MH |
22 | |
23 | ;;; Commentary: | |
24 | ||
25 | ;; This implements an XML 1.0 parser. It also implements the XML | |
26 | ;; Namespaces Recommendation. It is designed to be conforming, but it | |
27 | ;; works a bit differently from a normal XML parser. An XML document | |
28 | ;; consists of the prolog and an instance. The prolog is parsed as a | |
29 | ;; single unit using `xmltok-forward-prolog'. The instance is | |
30 | ;; considered as a sequence of tokens, where a token is something like | |
31 | ;; a start-tag, a comment, a chunk of data or a CDATA section. The | |
32 | ;; tokenization of the instance is stateless: the tokenization of one | |
33 | ;; part of the instance does not depend on tokenization of the | |
34 | ;; preceding part of the instance. This allows the instance to be | |
35 | ;; parsed incrementally. The main entry point is `xmltok-forward': | |
36 | ;; this can be called at any point in the instance provided it is | |
37 | ;; between tokens. The other entry point is `xmltok-forward-special' | |
38 | ;; which skips over tokens other comments, processing instructions or | |
39 | ;; CDATA sections (i.e. the constructs in an instance that can contain | |
40 | ;; less than signs that don't start a token). | |
41 | ;; | |
42 | ;; This is a non-validating XML 1.0 processor. It does not resolve | |
43 | ;; parameter entities (including the external DTD subset) and it does | |
44 | ;; not resolve external general entities. | |
45 | ;; | |
46 | ;; It is non-conformant by design in the following respects. | |
47 | ;; | |
48 | ;; 1. It expects the client to detect aspects of well-formedness that | |
49 | ;; are not internal to a single token, specifically checking that | |
50 | ;; end-tags match start-tags and that the instance contains exactly | |
51 | ;; one element. | |
52 | ;; | |
53 | ;; 2. It expects the client to detect duplicate attributes. Detection | |
54 | ;; of duplicate attributes after expansion of namespace prefixes | |
55 | ;; requires the namespace processing state. Detection of duplicate | |
56 | ;; attributes before expansion of namespace prefixes does not, but is | |
57 | ;; redundant given that the client will do detection of duplicate | |
58 | ;; attributes after expansion of namespace prefixes. | |
59 | ;; | |
60 | ;; 3. It allows the client to recover from well-formedness errors. | |
61 | ;; This is essential for use in applications where the document is | |
62 | ;; being parsed during the editing process. | |
63 | ;; | |
64 | ;; 4. It does not support documents that do not conform to the lexical | |
65 | ;; requirements of the XML Namespaces Recommendation (e.g. a document | |
66 | ;; with a colon in an entity name). | |
67 | ;; | |
68 | ;; There are also a number of things that have not yet been | |
69 | ;; implemented that make it non-conformant. | |
70 | ;; | |
71 | ;; 1. It does not implement default attributes. ATTLIST declarations | |
72 | ;; are parsed, but no checking is done on the content of attribute | |
73 | ;; value literals specifying default attribute values, and default | |
74 | ;; attribute values are not reported to the client. | |
75 | ;; | |
76 | ;; 2. It does not implement internal entities containing elements. If | |
77 | ;; an internal entity is referenced and parsing its replacement text | |
78 | ;; yields one or more tags, then it will skip the reference and | |
79 | ;; report this to the client. | |
80 | ;; | |
81 | ;; 3. It does not check the syntax of public identifiers in the DTD. | |
82 | ;; | |
83 | ;; 4. It allows some non-ASCII characters in certain situations where | |
84 | ;; it should not. For example, it only enforces XML 1.0's | |
85 | ;; restrictions on name characters strictly for ASCII characters. The | |
86 | ;; problem here is XML's character model is based squarely on Unicode, | |
87 | ;; whereas Emacs's is not (as of version 21). It is not clear what | |
88 | ;; the right thing to do is. | |
89 | ||
90 | ;;; Code: | |
91 | ||
92 | (defvar xmltok-type nil) | |
93 | (defvar xmltok-start nil) | |
94 | (defvar xmltok-name-colon nil) | |
95 | (defvar xmltok-name-end nil) | |
96 | (defvar xmltok-replacement nil | |
97 | "String containing replacement for a character or entity reference.") | |
98 | ||
99 | (defvar xmltok-attributes nil | |
100 | "List containing attributes of last scanned element. | |
101 | Each member of the list is a vector representing an attribute, which | |
102 | can be accessed using the functions `xmltok-attribute-name-start', | |
103 | `xmltok-attribute-name-colon', `xmltok-attribute-name-end', | |
104 | `xmltok-attribute-value-start', `xmltok-attribute-value-end', | |
105 | `xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.") | |
106 | ||
107 | (defvar xmltok-namespace-attributes nil | |
108 | "List containing namespace declarations of last scanned element. | |
109 | List has same format as `xmltok-attributes'.") | |
110 | ||
111 | (defvar xmltok-dtd nil | |
112 | "Information about the DTD used by `xmltok-forward'. | |
113 | `xmltok-forward-prolog' sets this up. | |
114 | ||
115 | It consists of an alist of general entity names vs definitions. The | |
116 | first member of the alist is t if references to entities not in the | |
117 | alist are well-formed \(e.g. because there's an external subset that | |
118 | wasn't parsed). | |
119 | ||
10545bd8 JB |
120 | Each general entity name is a string. The definition is either nil, |
121 | a symbol, a string, a cons cell. If the definition is nil, then it | |
8cd39fb3 MH |
122 | means that it's an internal entity but the result of parsing it is |
123 | unknown. If it is a symbol, then the symbol is either `unparsed', | |
124 | meaning the entity is an unparsed entity, `external', meaning the | |
125 | entity is or references an external entity, `element', meaning the | |
126 | entity includes one or more elements, or `not-well-formed', meaning | |
127 | the replacement text is not well-formed. If the definition is a | |
128 | string, then the replacement text of the entity is that string; this | |
10545bd8 JB |
129 | happens only during the parsing of the prolog. If the definition is |
130 | a cons cell \(ER . AR), then ER specifies the string that results | |
131 | from referencing the entity in element content and AR is either nil, | |
8cd39fb3 MH |
132 | meaning the replacement text included a <, or a string which is the |
133 | normalized attribute value.") | |
134 | ||
8cd39fb3 MH |
135 | |
136 | (defvar xmltok-errors nil | |
137 | "List of errors detected by `xmltok-forward' and `xmltok-forward-prolog'. | |
138 | When `xmltok-forward' and `xmltok-forward-prolog' detect a | |
139 | well-formedness error, they will add an entry to the beginning of this | |
140 | list. Each entry is a vector [MESSAGE START END], where MESSAGE is a | |
141 | string giving the error message and START and END are integers | |
142 | indicating the position of the error.") | |
143 | ||
144 | (defmacro xmltok-save (&rest body) | |
145 | `(let (xmltok-type | |
146 | xmltok-start | |
147 | xmltok-name-colon | |
148 | xmltok-name-end | |
149 | xmltok-replacement | |
150 | xmltok-attributes | |
151 | xmltok-namespace-attributes | |
8cd39fb3 MH |
152 | xmltok-errors) |
153 | ,@body)) | |
154 | ||
155 | (put 'xmltok-save 'lisp-indent-function 0) | |
156 | (def-edebug-spec xmltok-save t) | |
157 | ||
158 | (defsubst xmltok-attribute-name-start (att) | |
159 | (aref att 0)) | |
160 | ||
161 | (defsubst xmltok-attribute-name-colon (att) | |
162 | (aref att 1)) | |
163 | ||
164 | (defsubst xmltok-attribute-name-end (att) | |
165 | (aref att 2)) | |
166 | ||
167 | (defsubst xmltok-attribute-value-start (att) | |
168 | (aref att 3)) | |
169 | ||
170 | (defsubst xmltok-attribute-value-end (att) | |
171 | (aref att 4)) | |
172 | ||
173 | (defsubst xmltok-attribute-raw-normalized-value (att) | |
174 | "Return an object representing the normalized value of ATT. | |
10545bd8 JB |
175 | This can be t indicating that the normalized value is the same as |
176 | the buffer substring from the start to the end of the value, or nil | |
8cd39fb3 MH |
177 | indicating that the value is not well-formed or a string." |
178 | (aref att 5)) | |
179 | ||
180 | (defsubst xmltok-attribute-refs (att) | |
181 | "Return a list of the entity and character references in ATT. | |
182 | Each member is a vector [TYPE START END] where TYPE is either char-ref | |
10545bd8 JB |
183 | or entity-ref and START and END are integers giving the start and end of |
184 | the reference. Nested entity references are not included in the list." | |
8cd39fb3 MH |
185 | (aref att 6)) |
186 | ||
187 | (defun xmltok-attribute-prefix (att) | |
188 | (let ((colon (xmltok-attribute-name-colon att))) | |
189 | (and colon | |
190 | (buffer-substring-no-properties (xmltok-attribute-name-start att) | |
191 | colon)))) | |
192 | ||
193 | (defun xmltok-attribute-local-name (att) | |
194 | (let ((colon (xmltok-attribute-name-colon att))) | |
195 | (buffer-substring-no-properties (if colon | |
196 | (1+ colon) | |
197 | (xmltok-attribute-name-start att)) | |
198 | (xmltok-attribute-name-end att)))) | |
199 | ||
200 | (defun xmltok-attribute-value (att) | |
201 | (let ((rnv (xmltok-attribute-raw-normalized-value att))) | |
202 | (and rnv | |
203 | (if (stringp rnv) | |
204 | rnv | |
205 | (buffer-substring-no-properties (xmltok-attribute-value-start att) | |
206 | (xmltok-attribute-value-end att)))))) | |
207 | ||
208 | (defun xmltok-start-tag-prefix () | |
209 | (and xmltok-name-colon | |
210 | (buffer-substring-no-properties (1+ xmltok-start) | |
211 | xmltok-name-colon))) | |
212 | ||
213 | (defun xmltok-start-tag-local-name () | |
214 | (buffer-substring-no-properties (1+ (or xmltok-name-colon | |
215 | xmltok-start)) | |
216 | xmltok-name-end)) | |
217 | ||
218 | (defun xmltok-end-tag-prefix () | |
219 | (and xmltok-name-colon | |
220 | (buffer-substring-no-properties (+ 2 xmltok-start) | |
221 | xmltok-name-colon))) | |
222 | ||
223 | (defun xmltok-end-tag-local-name () | |
224 | (buffer-substring-no-properties (if xmltok-name-colon | |
225 | (1+ xmltok-name-colon) | |
226 | (+ 2 xmltok-start)) | |
227 | xmltok-name-end)) | |
228 | ||
229 | (defun xmltok-start-tag-qname () | |
230 | (buffer-substring-no-properties (+ xmltok-start 1) xmltok-name-end)) | |
231 | ||
232 | (defun xmltok-end-tag-qname () | |
233 | (buffer-substring-no-properties (+ xmltok-start 2) xmltok-name-end)) | |
234 | ||
235 | (defsubst xmltok-make-attribute (name-begin | |
236 | name-colon | |
237 | name-end | |
238 | &optional | |
239 | value-begin | |
240 | value-end | |
241 | raw-normalized-value) | |
10545bd8 JB |
242 | "Make an attribute. |
243 | RAW-NORMALIZED-VALUE is nil if the value is not well-formed, | |
244 | t if the normalized value is the string between VALUE-BEGIN | |
245 | and VALUE-END, otherwise a STRING giving the value." | |
8cd39fb3 MH |
246 | (vector name-begin |
247 | name-colon | |
248 | name-end | |
249 | value-begin | |
250 | value-end | |
251 | raw-normalized-value | |
252 | nil)) | |
253 | ||
254 | (defsubst xmltok-error-message (err) | |
255 | (aref err 0)) | |
256 | ||
257 | (defsubst xmltok-error-start (err) | |
258 | (aref err 1)) | |
259 | ||
260 | (defsubst xmltok-error-end (err) | |
261 | (aref err 2)) | |
262 | ||
263 | (defsubst xmltok-make-error (message start end) | |
264 | (vector message start end)) | |
265 | ||
266 | (defun xmltok-add-error (message &optional start end) | |
267 | (setq xmltok-errors | |
268 | (cons (xmltok-make-error message | |
269 | (or start xmltok-start) | |
270 | (or end (point))) | |
271 | xmltok-errors))) | |
272 | ||
8cd39fb3 MH |
273 | (defun xmltok-forward () |
274 | (setq xmltok-start (point)) | |
275 | (let* ((case-fold-search nil) | |
276 | (space-count (skip-chars-forward " \t\r\n")) | |
277 | (ch (char-after))) | |
278 | (cond ((eq ch ?\<) | |
279 | (cond ((> space-count 0) | |
280 | (setq xmltok-type 'space)) | |
281 | (t | |
eb22cb8c | 282 | (forward-char 1) |
8cd39fb3 MH |
283 | (xmltok-scan-after-lt)))) |
284 | ((eq ch ?\&) | |
285 | (cond ((> space-count 0) | |
286 | (setq xmltok-type 'space)) | |
287 | (t | |
eb22cb8c GM |
288 | (forward-char 1) |
289 | (xmltok-scan-after-amp 'xmltok-handle-entity)))) | |
8cd39fb3 MH |
290 | ((re-search-forward "[<&]\\|\\(]]>\\)" nil t) |
291 | (cond ((not (match-beginning 1)) | |
292 | (goto-char (match-beginning 0)) | |
293 | ;; must have got a non-space char | |
294 | (setq xmltok-type 'data)) | |
295 | ((= (match-beginning 1) xmltok-start) | |
296 | (xmltok-add-error "Found `]]>' not closing a CDATA section") | |
297 | (setq xmltok-type 'not-well-formed)) | |
298 | (t | |
299 | (goto-char (match-beginning 0)) | |
300 | (setq xmltok-type | |
301 | (if (= (point) (+ xmltok-start space-count)) | |
302 | 'space | |
303 | 'data))))) | |
304 | ((eq ch nil) | |
305 | (setq xmltok-type | |
306 | (if (> space-count 0) | |
307 | 'space | |
308 | nil))) | |
309 | (t | |
310 | (goto-char (point-max)) | |
311 | (setq xmltok-type 'data))))) | |
312 | ||
313 | (defun xmltok-forward-special (bound) | |
314 | "Scan forward past the first special token starting at or after point. | |
315 | Return nil if there is no special token that starts before BOUND. | |
316 | CDATA sections, processing instructions and comments (and indeed | |
10545bd8 JB |
317 | anything starting with < following by ? or !) count as special. |
318 | Return the type of the token." | |
8cd39fb3 MH |
319 | (when (re-search-forward "<[?!]" (1+ bound) t) |
320 | (setq xmltok-start (match-beginning 0)) | |
321 | (goto-char (1+ xmltok-start)) | |
322 | (let ((case-fold-search nil)) | |
323 | (xmltok-scan-after-lt)))) | |
324 | ||
325 | (eval-when-compile | |
326 | ||
327 | ;; A symbolic regexp is represented by a list whose CAR is the string | |
328 | ;; containing the regexp and whose cdr is a list of symbolic names | |
329 | ;; for the groups in the string. | |
330 | ||
331 | ;; Construct a symbolic regexp from a regexp. | |
332 | (defun xmltok-r (str) | |
333 | (cons str nil)) | |
334 | ||
335 | ;; Concatenate zero of more regexps and symbolic regexps. | |
336 | (defun xmltok+ (&rest args) | |
337 | (let (strs names) | |
338 | (while args | |
339 | (let ((arg (car args))) | |
340 | (if (stringp arg) | |
341 | (setq strs (cons arg strs)) | |
342 | (setq strs (cons (car arg) strs)) | |
343 | (setq names (cons (cdr arg) names))) | |
344 | (setq args (cdr args)))) | |
345 | (cons (apply 'concat (nreverse strs)) | |
346 | (apply 'append (nreverse names)))))) | |
347 | ||
348 | (eval-when-compile | |
349 | ;; Make a symbolic group named NAME from the regexp R. | |
350 | ;; R may be a symbolic regexp or an ordinary regexp. | |
351 | (defmacro xmltok-g (name &rest r) | |
352 | (let ((sym (make-symbol "r"))) | |
353 | `(let ((,sym (xmltok+ ,@r))) | |
354 | (if (stringp ,sym) | |
355 | (cons (concat "\\(" ,sym "\\)") (cons ',name nil)) | |
356 | (cons (concat "\\(" (car ,sym) "\\)") (cons ',name (cdr ,sym))))))) | |
357 | ||
10545bd8 | 358 | (defun xmltok-p (&rest r) (xmltok+ "\\(?:" |
8cd39fb3 MH |
359 | (apply 'xmltok+ r) |
360 | "\\)")) | |
361 | ||
362 | ;; Get the group index of ELEM in a LIST of symbols. | |
363 | (defun xmltok-get-index (elem list) | |
364 | (or elem | |
365 | (error "Missing group name")) | |
366 | (let ((found nil) | |
367 | (i 1)) | |
368 | (while list | |
369 | (cond ((eq elem (car list)) | |
370 | (setq found i) | |
371 | (setq list nil)) | |
372 | (t | |
373 | (setq i (1+ i)) | |
374 | (setq list (cdr list))))) | |
375 | (or found | |
376 | (error "Bad group name %s" elem)))) | |
377 | ||
378 | ;; Define a macro SYM using a symbolic regexp R. | |
379 | ;; SYM can be called in three ways: | |
380 | ;; (SYM regexp) | |
381 | ;; expands to the regexp in R | |
382 | ;; (SYM start G) | |
383 | ;; expands to | |
384 | ;; (match-beginning N) | |
385 | ;; where N is the group index of G in R. | |
386 | ;; (SYM end G) | |
387 | ;; expands to | |
388 | ;; (match-end N) | |
389 | ;; where N is the group index of G in R. | |
390 | (defmacro xmltok-defregexp (sym r) | |
391 | `(defalias ',sym | |
392 | (let ((r ,r)) | |
393 | `(macro lambda (action &optional group-name) | |
394 | (cond ((eq action 'regexp) | |
395 | ,(car r)) | |
396 | ((or (eq action 'start) (eq action 'beginning)) | |
397 | (list 'match-beginning (xmltok-get-index group-name | |
398 | ',(cdr r)))) | |
399 | ((eq action 'end) | |
400 | (list 'match-end (xmltok-get-index group-name | |
401 | ',(cdr r)))) | |
402 | ((eq action 'string) | |
403 | (list 'match-string | |
404 | (xmltok-get-index group-name ',(cdr r)))) | |
405 | ((eq action 'string-no-properties) | |
406 | (list 'match-string-no-properties | |
407 | (xmltok-get-index group-name ',(cdr r)))) | |
408 | (t (error "Invalid action: %s" action)))))))) | |
10545bd8 | 409 | |
8cd39fb3 MH |
410 | |
411 | (eval-when-compile | |
412 | (let* ((or "\\|") | |
413 | (open "\\(?:") | |
414 | (gopen "\\(") | |
415 | (close "\\)") | |
416 | (name-start-char "[_[:alpha:]]") | |
417 | (name-continue-not-start-char "[-.[:digit:]]") | |
418 | (name-continue-char "[-._[:alnum:]]") | |
419 | (* "*") | |
420 | (+ "+") | |
421 | (opt "?") | |
422 | (question "\\?") | |
423 | (s "[ \r\t\n]") | |
424 | (s+ (concat s +)) | |
425 | (s* (concat s *)) | |
426 | (ncname (concat name-start-char name-continue-char *)) | |
427 | (entity-ref | |
428 | (xmltok+ (xmltok-g entity-name ncname) | |
429 | (xmltok-g entity-ref-close ";") opt)) | |
430 | (decimal-ref | |
431 | (xmltok+ (xmltok-g decimal "[0-9]" +) | |
432 | (xmltok-g decimal-ref-close ";") opt)) | |
433 | (hex-ref | |
434 | (xmltok+ "x" open | |
435 | (xmltok-g hex "[0-9a-fA-F]" +) | |
436 | (xmltok-g hex-ref-close ";") opt | |
437 | close opt)) | |
438 | (char-ref | |
439 | (xmltok+ (xmltok-g number-sign "#") | |
440 | open decimal-ref or hex-ref close opt)) | |
441 | (start-tag-close | |
442 | (xmltok+ open (xmltok-g start-tag-close s* ">") | |
443 | or open (xmltok-g empty-tag-slash s* "/") | |
444 | (xmltok-g empty-tag-close ">") opt close | |
445 | or (xmltok-g start-tag-s s+) | |
446 | close)) | |
447 | (start-tag | |
448 | (xmltok+ (xmltok-g start-tag-name | |
449 | ncname (xmltok-g start-tag-colon ":" ncname) opt) | |
450 | start-tag-close opt)) | |
451 | (end-tag | |
452 | (xmltok+ (xmltok-g end-tag-slash "/") | |
453 | open (xmltok-g end-tag-name | |
454 | ncname | |
455 | (xmltok-g end-tag-colon ":" ncname) opt) | |
456 | (xmltok-g end-tag-close s* ">") opt | |
457 | close opt)) | |
458 | (comment | |
459 | (xmltok+ (xmltok-g markup-declaration "!") | |
460 | (xmltok-g comment-first-dash "-" | |
461 | (xmltok-g comment-open "-") opt) opt)) | |
462 | (cdata-section | |
463 | (xmltok+ "!" | |
464 | (xmltok-g marked-section-open "\\[") | |
465 | open "C" | |
466 | open "D" | |
467 | open "A" | |
468 | open "T" | |
469 | open "A" | |
470 | (xmltok-g cdata-section-open "\\[" ) opt | |
471 | close opt ; A | |
472 | close opt ; T | |
473 | close opt ; A | |
474 | close opt ; D | |
475 | close opt)) ; C | |
476 | (processing-instruction | |
477 | (xmltok-g processing-instruction-question question))) | |
478 | ||
479 | (xmltok-defregexp xmltok-ncname (xmltok+ open ncname close)) | |
480 | ||
481 | (xmltok-defregexp xmltok-after-amp | |
482 | (xmltok+ entity-ref or char-ref)) | |
483 | (xmltok-defregexp xmltok-after-lt | |
484 | (xmltok+ start-tag | |
485 | or end-tag | |
486 | ;; cdata-section must come before comment | |
487 | ;; because we treat <! as a comment | |
488 | ;; and Emacs doesn't do fully greedy matching | |
489 | ;; by default | |
490 | or cdata-section | |
491 | or comment | |
492 | or processing-instruction)) | |
493 | (xmltok-defregexp | |
494 | xmltok-attribute | |
495 | (let* ((lit1 | |
496 | (xmltok+ "'" | |
497 | "[^<'&\r\n\t]*" | |
498 | (xmltok-g complex1 "[&\r\n\t][^<']*") opt | |
499 | "'")) | |
500 | (lit2 (cons (replace-regexp-in-string "'" "\"" (car lit1)) | |
501 | '(complex2))) | |
502 | (literal (xmltok-g literal lit1 or lit2)) | |
503 | (name (xmltok+ open (xmltok-g xmlns "xmlns") or ncname close | |
504 | (xmltok-g colon ":" ncname) opt))) | |
505 | (xmltok+ (xmltok-g name name) | |
506 | s* "=" | |
507 | ;; If the literal isn't followed by what it should be, | |
508 | ;; then the closing delimiter is probably really the | |
509 | ;; opening delimiter of another literal, so don't | |
510 | ;; absorb the literal in this case. | |
511 | open s* literal start-tag-close close opt))) | |
512 | (xmltok-defregexp | |
513 | xmltok-xml-declaration | |
514 | (let* ((literal-content "[-._:a-zA-Z0-9]+") | |
515 | (literal | |
516 | (concat open "\"" literal-content "\"" | |
517 | or "'" literal-content "'" close)) | |
518 | (version-att | |
519 | (xmltok+ open | |
520 | s+ (xmltok-g version-name "version") | |
521 | s* "=" | |
522 | s* (xmltok-g version-value literal) | |
523 | close opt)) | |
524 | (encoding-att | |
525 | (xmltok+ open | |
526 | s+ (xmltok-g encoding-name "encoding") | |
527 | s* "=" | |
528 | s* (xmltok-g encoding-value literal) | |
529 | close opt)) | |
530 | (yes-no | |
531 | (concat open "yes" or "no" close)) | |
532 | (standalone-att | |
533 | (xmltok+ open | |
534 | s+ (xmltok-g standalone-name "standalone") | |
535 | s* "=" | |
536 | s* (xmltok-g standalone-value | |
537 | "\"" yes-no "\"" or "'" yes-no "'") | |
538 | close opt))) | |
539 | (xmltok+ "<" question "xml" | |
540 | version-att | |
541 | encoding-att | |
542 | standalone-att | |
543 | s* question ">"))) | |
544 | (xmltok-defregexp | |
545 | xmltok-prolog | |
546 | (let* ((single-char (xmltok-g single-char "[[|,(\"'>]")) | |
547 | (internal-subset-close (xmltok-g internal-subset-close | |
548 | "][ \t\r\n]*>")) | |
549 | (starts-with-close-paren | |
550 | (xmltok-g close-paren | |
551 | ")" | |
552 | (xmltok-p | |
553 | (xmltok-g close-paren-occur "[+?]") | |
554 | or | |
555 | (xmltok-g close-paren-star "\\*")) | |
556 | opt)) | |
557 | (starts-with-percent | |
558 | (xmltok-g percent | |
559 | "%" (xmltok-g param-entity-ref | |
560 | ncname | |
561 | (xmltok-g param-entity-ref-close | |
562 | ";") opt) opt)) | |
563 | (starts-with-nmtoken-not-name | |
564 | (xmltok-g nmtoken | |
565 | (xmltok-p name-continue-not-start-char or ":") | |
566 | (xmltok-p name-continue-char or ":") *)) | |
567 | (nmtoken-after-colon | |
568 | (xmltok+ | |
569 | (xmltok-p name-continue-not-start-char or ":") | |
570 | (xmltok-p name-continue-char or ":") * | |
571 | or | |
572 | name-start-char | |
573 | name-continue-char * | |
574 | ":" | |
575 | (xmltok-p name-continue-char or ":") *)) | |
576 | (after-ncname | |
577 | (xmltok+ (xmltok-g ncname-nmtoken | |
578 | ":" (xmltok-p nmtoken-after-colon)) | |
579 | or (xmltok-p (xmltok-g colon ":" ncname) | |
580 | (xmltok-g colon-name-occur "[?+*]") opt) | |
581 | or (xmltok-g ncname-occur "[?+*]") | |
582 | or (xmltok-g ncname-colon ":"))) | |
583 | (starts-with-name | |
584 | (xmltok-g name ncname (xmltok-p after-ncname) opt)) | |
585 | (starts-with-hash | |
586 | (xmltok-g pound | |
587 | "#" (xmltok-g hash-name ncname))) | |
588 | (markup-declaration | |
589 | (xmltok-g markup-declaration | |
590 | "!" (xmltok-p (xmltok-g comment-first-dash "-" | |
591 | (xmltok-g comment-open "-") opt) | |
592 | or (xmltok-g named-markup-declaration | |
593 | ncname)) opt)) | |
594 | (after-lt | |
595 | (xmltok+ markup-declaration | |
596 | or (xmltok-g processing-instruction-question | |
597 | question) | |
598 | or (xmltok-g instance-start | |
599 | ncname))) | |
600 | (starts-with-lt (xmltok-g less-than "<" (xmltok-p after-lt) opt))) | |
601 | (xmltok+ starts-with-lt | |
602 | or single-char | |
603 | or starts-with-close-paren | |
604 | or starts-with-percent | |
605 | or starts-with-name | |
606 | or starts-with-nmtoken-not-name | |
607 | or starts-with-hash | |
608 | or internal-subset-close))))) | |
609 | ||
610 | (defconst xmltok-ncname-regexp (xmltok-ncname regexp)) | |
611 | ||
612 | (defun xmltok-scan-after-lt () | |
613 | (cond ((not (looking-at (xmltok-after-lt regexp))) | |
614 | (xmltok-add-error "`<' that is not markup must be entered as `<'") | |
615 | (setq xmltok-type 'not-well-formed)) | |
616 | (t | |
617 | (goto-char (match-end 0)) | |
618 | (cond ((xmltok-after-lt start start-tag-close) | |
619 | (setq xmltok-name-end | |
620 | (xmltok-after-lt end start-tag-name)) | |
621 | (setq xmltok-name-colon | |
622 | (xmltok-after-lt start start-tag-colon)) | |
623 | (setq xmltok-attributes nil) | |
624 | (setq xmltok-namespace-attributes nil) | |
625 | (setq xmltok-type 'start-tag)) | |
626 | ((xmltok-after-lt start end-tag-close) | |
627 | (setq xmltok-name-end | |
628 | (xmltok-after-lt end end-tag-name)) | |
629 | (setq xmltok-name-colon | |
630 | (xmltok-after-lt start end-tag-colon)) | |
631 | (setq xmltok-type 'end-tag)) | |
632 | ((xmltok-after-lt start start-tag-s) | |
633 | (setq xmltok-name-end | |
634 | (xmltok-after-lt end start-tag-name)) | |
635 | (setq xmltok-name-colon | |
636 | (xmltok-after-lt start start-tag-colon)) | |
637 | (setq xmltok-namespace-attributes nil) | |
638 | (setq xmltok-attributes nil) | |
639 | (xmltok-scan-attributes) | |
640 | xmltok-type) | |
641 | ((xmltok-after-lt start empty-tag-close) | |
642 | (setq xmltok-name-end | |
643 | (xmltok-after-lt end start-tag-name)) | |
644 | (setq xmltok-name-colon | |
645 | (xmltok-after-lt start start-tag-colon)) | |
646 | (setq xmltok-attributes nil) | |
647 | (setq xmltok-namespace-attributes nil) | |
648 | (setq xmltok-type 'empty-element)) | |
649 | ((xmltok-after-lt start cdata-section-open) | |
650 | (setq xmltok-type | |
c9990474 SM |
651 | (progn (search-forward "]]>" nil 'move) |
652 | 'cdata-section))) | |
8cd39fb3 MH |
653 | ((xmltok-after-lt start processing-instruction-question) |
654 | (xmltok-scan-after-processing-instruction-open)) | |
655 | ((xmltok-after-lt start comment-open) | |
656 | (xmltok-scan-after-comment-open)) | |
657 | ((xmltok-after-lt start empty-tag-slash) | |
658 | (setq xmltok-name-end | |
659 | (xmltok-after-lt end start-tag-name)) | |
660 | (setq xmltok-name-colon | |
661 | (xmltok-after-lt start start-tag-colon)) | |
662 | (setq xmltok-attributes nil) | |
663 | (setq xmltok-namespace-attributes nil) | |
664 | (xmltok-add-error "Expected `/>'" (1- (point))) | |
665 | (setq xmltok-type 'partial-empty-element)) | |
666 | ((xmltok-after-lt start start-tag-name) | |
667 | (xmltok-add-error "Missing `>'" | |
668 | nil | |
669 | (1+ xmltok-start)) | |
670 | (setq xmltok-name-end | |
671 | (xmltok-after-lt end start-tag-name)) | |
672 | (setq xmltok-name-colon | |
673 | (xmltok-after-lt start start-tag-colon)) | |
674 | (setq xmltok-namespace-attributes nil) | |
675 | (setq xmltok-attributes nil) | |
676 | (setq xmltok-type 'partial-start-tag)) | |
677 | ((xmltok-after-lt start end-tag-name) | |
678 | (setq xmltok-name-end (xmltok-after-lt end end-tag-name)) | |
679 | (setq xmltok-name-colon | |
680 | (xmltok-after-lt start end-tag-colon)) | |
681 | (cond ((and (not xmltok-name-colon) | |
682 | (eq (char-after) ?:)) | |
683 | (goto-char (1+ (point))) | |
684 | (xmltok-add-error "Expected name following `:'" | |
685 | (1- (point)))) | |
686 | (t | |
687 | (xmltok-add-error "Missing `>'" | |
688 | nil | |
689 | (1+ xmltok-start)))) | |
690 | (setq xmltok-type 'partial-end-tag)) | |
691 | ((xmltok-after-lt start end-tag-slash) | |
692 | (xmltok-add-error "Expected name following `</'") | |
693 | (setq xmltok-name-end nil) | |
694 | (setq xmltok-name-colon nil) | |
695 | (setq xmltok-type 'partial-end-tag)) | |
696 | ((xmltok-after-lt start marked-section-open) | |
697 | (xmltok-add-error "Expected `CDATA[' after `<!['" | |
698 | xmltok-start | |
699 | (+ 3 xmltok-start)) | |
700 | (setq xmltok-type 'not-well-formed)) | |
701 | ((xmltok-after-lt start comment-first-dash) | |
702 | (xmltok-add-error "Expected `-' after `<!-'" | |
703 | xmltok-start | |
704 | (+ 3 xmltok-start)) | |
705 | (setq xmltok-type 'not-well-formed)) | |
706 | ((xmltok-after-lt start markup-declaration) | |
707 | (xmltok-add-error "Expected `[CDATA[' or `--' after `<!'" | |
708 | xmltok-start | |
709 | (+ 2 xmltok-start)) | |
710 | (setq xmltok-type 'not-well-formed)) | |
711 | (t | |
712 | (xmltok-add-error "Not well-formed") | |
713 | (setq xmltok-type 'not-well-formed)))))) | |
714 | ||
715 | ;; XXX This should be unified with | |
716 | ;; xmltok-scan-prolog-after-processing-instruction-open | |
717 | ;; XXX maybe should include rest of line (up to any <,>) in unclosed PI | |
718 | (defun xmltok-scan-after-processing-instruction-open () | |
c9990474 SM |
719 | (search-forward "?>" nil 'move) |
720 | (cond ((not (save-excursion | |
721 | (goto-char (+ 2 xmltok-start)) | |
722 | (and (looking-at (xmltok-ncname regexp)) | |
723 | (setq xmltok-name-end (match-end 0))))) | |
724 | (setq xmltok-name-end (+ xmltok-start 2)) | |
725 | (xmltok-add-error "<? not followed by name" | |
726 | (+ xmltok-start 2) | |
727 | (+ xmltok-start 3))) | |
728 | ((not (or (memq (char-after xmltok-name-end) | |
729 | '(?\n ?\t ?\r ? )) | |
730 | (= xmltok-name-end (- (point) 2)))) | |
731 | (xmltok-add-error "Target not followed by whitespace" | |
732 | xmltok-name-end | |
733 | (1+ xmltok-name-end))) | |
734 | ((and (= xmltok-name-end (+ xmltok-start 5)) | |
735 | (save-excursion | |
736 | (goto-char (+ xmltok-start 2)) | |
737 | (let ((case-fold-search t)) | |
738 | (looking-at "xml")))) | |
739 | (xmltok-add-error "Processing instruction target is xml" | |
740 | (+ xmltok-start 2) | |
741 | (+ xmltok-start 5)))) | |
742 | (setq xmltok-type 'processing-instruction)) | |
10545bd8 | 743 | |
8cd39fb3 | 744 | (defun xmltok-scan-after-comment-open () |
c9990474 SM |
745 | (let ((found-- (search-forward "--" nil 'move))) |
746 | (setq xmltok-type | |
747 | (cond ((or (eq (char-after) ?>) (not found--)) | |
748 | (goto-char (1+ (point))) | |
749 | 'comment) | |
750 | (t | |
751 | ;; just include the <!-- in the token | |
752 | (goto-char (+ xmltok-start 4)) | |
753 | ;; Need do this after the goto-char because | |
754 | ;; marked error should just apply to <!-- | |
755 | (xmltok-add-error "First following `--' not followed by `>'") | |
756 | 'not-well-formed))))) | |
8cd39fb3 MH |
757 | |
758 | (defun xmltok-scan-attributes () | |
759 | (let ((recovering nil) | |
760 | (atts-needing-normalization nil)) | |
761 | (while (cond ((or (looking-at (xmltok-attribute regexp)) | |
762 | ;; use non-greedy group | |
763 | (when (looking-at (concat "[^<>\n]+?" | |
764 | (xmltok-attribute regexp))) | |
765 | (unless recovering | |
766 | (xmltok-add-error "Malformed attribute" | |
767 | (point) | |
768 | (save-excursion | |
769 | (goto-char (xmltok-attribute start | |
770 | name)) | |
771 | (skip-chars-backward "\r\n\t ") | |
772 | (point)))) | |
773 | t)) | |
774 | (setq recovering nil) | |
775 | (goto-char (match-end 0)) | |
776 | (let ((att (xmltok-add-attribute))) | |
777 | (when att | |
778 | (setq atts-needing-normalization | |
779 | (cons att atts-needing-normalization)))) | |
780 | (cond ((xmltok-attribute start start-tag-s) t) | |
781 | ((xmltok-attribute start start-tag-close) | |
782 | (setq xmltok-type 'start-tag) | |
783 | nil) | |
784 | ((xmltok-attribute start empty-tag-close) | |
785 | (setq xmltok-type 'empty-element) | |
786 | nil) | |
787 | ((xmltok-attribute start empty-tag-slash) | |
788 | (setq xmltok-type 'partial-empty-element) | |
789 | (xmltok-add-error "Expected `/>'" | |
790 | (1- (point))) | |
791 | nil) | |
792 | ((looking-at "[ \t\r\n]*[\"']") | |
793 | (goto-char (match-end 0)) | |
794 | (xmltok-add-error "Missing closing delimiter" | |
795 | (1- (point))) | |
796 | (setq recovering t) | |
797 | t) | |
798 | ((looking-at "[ \t]*\\([^ \t\r\n\"'=<>/]+\\)[ \t\r\n/>]") | |
799 | (goto-char (match-end 1)) | |
800 | (xmltok-add-error "Attribute value not quoted" | |
801 | (match-beginning 1)) | |
802 | (setq recovering t) | |
803 | t) | |
804 | (t | |
805 | (xmltok-add-error "Missing attribute value" | |
806 | (1- (point))) | |
807 | (setq recovering t) | |
808 | t))) | |
809 | ((looking-at "[^<>\n]*/>") | |
810 | (let ((start (point))) | |
811 | (goto-char (match-end 0)) | |
812 | (unless recovering | |
813 | (xmltok-add-error "Malformed empty-element" | |
814 | start | |
815 | (- (point) 2)))) | |
816 | (setq xmltok-type 'empty-element) | |
817 | nil) | |
818 | ((looking-at "[^<>\n]*>") | |
819 | (let ((start (point))) | |
820 | (goto-char (match-end 0)) | |
821 | (unless recovering | |
822 | (xmltok-add-error "Malformed start-tag" | |
823 | start | |
824 | (1- (point))))) | |
825 | (setq xmltok-type 'start-tag) | |
826 | nil) | |
827 | (t | |
828 | (when recovering | |
829 | (skip-chars-forward "^<>\n")) | |
830 | (xmltok-add-error "Missing `>'" | |
831 | xmltok-start | |
832 | (1+ xmltok-start)) | |
833 | (setq xmltok-type 'partial-start-tag) | |
834 | nil))) | |
835 | (while atts-needing-normalization | |
836 | (xmltok-normalize-attribute (car atts-needing-normalization)) | |
837 | (setq atts-needing-normalization (cdr atts-needing-normalization)))) | |
838 | (setq xmltok-attributes | |
839 | (nreverse xmltok-attributes)) | |
840 | (setq xmltok-namespace-attributes | |
841 | (nreverse xmltok-namespace-attributes))) | |
842 | ||
843 | (defun xmltok-add-attribute () | |
844 | "Return the attribute if it needs normalizing, otherwise nil." | |
845 | (let* ((needs-normalizing nil) | |
846 | (att | |
847 | (if (xmltok-attribute start literal) | |
848 | (progn | |
849 | (setq needs-normalizing | |
850 | (or (xmltok-attribute start complex1) | |
851 | (xmltok-attribute start complex2))) | |
852 | (xmltok-make-attribute (xmltok-attribute start name) | |
853 | (xmltok-attribute start colon) | |
854 | (xmltok-attribute end name) | |
855 | (1+ (xmltok-attribute start literal)) | |
856 | (1- (xmltok-attribute end literal)) | |
857 | (not needs-normalizing))) | |
858 | (xmltok-make-attribute (xmltok-attribute start name) | |
859 | (xmltok-attribute start colon) | |
860 | (xmltok-attribute end name))))) | |
861 | (if (xmltok-attribute start xmlns) | |
862 | (setq xmltok-namespace-attributes | |
863 | (cons att xmltok-namespace-attributes)) | |
864 | (setq xmltok-attributes | |
865 | (cons att xmltok-attributes))) | |
866 | (and needs-normalizing | |
867 | att))) | |
10545bd8 | 868 | |
8cd39fb3 MH |
869 | (defun xmltok-normalize-attribute (att) |
870 | (let ((end (xmltok-attribute-value-end att)) | |
871 | (well-formed t) | |
872 | (value-parts nil) | |
873 | (refs nil)) | |
874 | (save-excursion | |
875 | (goto-char (xmltok-attribute-value-start att)) | |
876 | (while (progn | |
877 | (let ((n (skip-chars-forward "^\r\t\n&" end))) | |
878 | (when (> n 0) | |
879 | (setq value-parts | |
880 | (cons (buffer-substring-no-properties (- (point) n) | |
881 | (point)) | |
882 | value-parts)))) | |
883 | (when (< (point) end) | |
884 | (goto-char (1+ (point))) | |
885 | (cond ((eq (char-before) ?\&) | |
886 | (let ((xmltok-start (1- (point))) | |
887 | xmltok-type xmltok-replacement) | |
888 | (xmltok-scan-after-amp | |
889 | (lambda (start end) | |
890 | (xmltok-handle-entity start end t))) | |
891 | (cond ((or (eq xmltok-type 'char-ref) | |
892 | (eq xmltok-type 'entity-ref)) | |
893 | (setq refs | |
894 | (cons (vector xmltok-type | |
895 | xmltok-start | |
896 | (point)) | |
897 | refs)) | |
898 | (if xmltok-replacement | |
899 | (setq value-parts | |
900 | (cons xmltok-replacement | |
901 | value-parts)) | |
902 | (setq well-formed nil))) | |
903 | (t (setq well-formed nil))))) | |
904 | (t (setq value-parts | |
905 | (cons " " value-parts))))) | |
906 | (< (point) end)))) | |
907 | (when well-formed | |
908 | (aset att 5 (apply 'concat (nreverse value-parts)))) | |
909 | (aset att 6 (nreverse refs)))) | |
910 | ||
911 | (defun xmltok-scan-after-amp (entity-handler) | |
912 | (cond ((not (looking-at (xmltok-after-amp regexp))) | |
913 | (xmltok-add-error "`&' that is not markup must be entered as `&'") | |
914 | (setq xmltok-type 'not-well-formed)) | |
915 | (t | |
916 | (goto-char (match-end 0)) | |
917 | (cond ((xmltok-after-amp start entity-ref-close) | |
918 | (funcall entity-handler | |
919 | (xmltok-after-amp start entity-name) | |
920 | (xmltok-after-amp end entity-name)) | |
921 | (setq xmltok-type 'entity-ref)) | |
922 | ((xmltok-after-amp start decimal-ref-close) | |
923 | (xmltok-scan-char-ref (xmltok-after-amp start decimal) | |
924 | (xmltok-after-amp end decimal) | |
925 | 10)) | |
926 | ((xmltok-after-amp start hex-ref-close) | |
927 | (xmltok-scan-char-ref (xmltok-after-amp start hex) | |
928 | (xmltok-after-amp end hex) | |
929 | 16)) | |
930 | ((xmltok-after-amp start number-sign) | |
931 | (xmltok-add-error "Missing character number") | |
932 | (setq xmltok-type 'not-well-formed)) | |
933 | (t | |
934 | (xmltok-add-error "Missing closing `;'") | |
935 | (setq xmltok-type 'not-well-formed)))))) | |
936 | ||
937 | (defconst xmltok-entity-error-messages | |
938 | '((unparsed . "Referenced entity is unparsed") | |
939 | (not-well-formed . "Referenced entity is not well-formed") | |
940 | (external nil . "Referenced entity is external") | |
941 | (element nil . "Referenced entity contains <"))) | |
942 | ||
943 | (defun xmltok-handle-entity (start end &optional attributep) | |
944 | (let* ((name (buffer-substring-no-properties start end)) | |
945 | (name-def (assoc name xmltok-dtd)) | |
946 | (def (cdr name-def))) | |
947 | (cond ((setq xmltok-replacement (and (consp def) | |
948 | (if attributep | |
949 | (cdr def) | |
950 | (car def))))) | |
951 | ((null name-def) | |
952 | (unless (eq (car xmltok-dtd) t) | |
953 | (xmltok-add-error "Referenced entity has not been defined" | |
954 | start | |
955 | end))) | |
956 | ((and attributep (consp def)) | |
957 | (xmltok-add-error "Referenced entity contains <" | |
958 | start | |
959 | end)) | |
960 | (t | |
961 | (let ((err (cdr (assq def xmltok-entity-error-messages)))) | |
962 | (when (consp err) | |
963 | (setq err (if attributep (cdr err) (car err)))) | |
964 | (when err | |
965 | (xmltok-add-error err start end))))))) | |
966 | ||
967 | (defun xmltok-scan-char-ref (start end base) | |
968 | (setq xmltok-replacement | |
9499da38 | 969 | (let ((n (string-to-number (buffer-substring-no-properties start end) |
8cd39fb3 MH |
970 | base))) |
971 | (cond ((and (integerp n) (xmltok-valid-char-p n)) | |
972 | (setq n (xmltok-unicode-to-char n)) | |
973 | (and n (string n))) | |
974 | (t | |
975 | (xmltok-add-error "Invalid character code" start end) | |
976 | nil)))) | |
977 | (setq xmltok-type 'char-ref)) | |
978 | ||
979 | (defun xmltok-char-number (start end) | |
980 | (let* ((base (if (eq (char-after (+ start 2)) ?x) | |
981 | 16 | |
982 | 10)) | |
9499da38 | 983 | (n (string-to-number |
8cd39fb3 MH |
984 | (buffer-substring-no-properties (+ start (if (= base 16) 3 2)) |
985 | (1- end)) | |
986 | base))) | |
987 | (and (integerp n) | |
988 | (xmltok-valid-char-p n) | |
989 | n))) | |
990 | ||
991 | (defun xmltok-unclosed-reparse-p (change-start | |
992 | change-end | |
993 | pre-change-length | |
994 | start | |
995 | end | |
996 | delimiter) | |
997 | (let ((len-1 (1- (length delimiter)))) | |
998 | (goto-char (max start (- change-start len-1))) | |
999 | (search-forward delimiter (min end (+ change-end len-1)) t))) | |
1000 | ||
1001 | ;; Handles a <!-- with the next -- not followed by > | |
1002 | ||
1003 | (defun xmltok-semi-closed-reparse-p (change-start | |
1004 | change-end | |
1005 | pre-change-length | |
1006 | start | |
1007 | end | |
1008 | delimiter | |
1009 | delimiter-length) | |
1010 | (or (<= (- end delimiter-length) change-end) | |
1011 | (xmltok-unclosed-reparse-p change-start | |
1012 | change-end | |
1013 | pre-change-length | |
1014 | start | |
1015 | end | |
1016 | delimiter))) | |
1017 | ||
1018 | (defun xmltok-valid-char-p (n) | |
10545bd8 | 1019 | "Return non-nil if N is the Unicode code of a valid XML character." |
8cd39fb3 MH |
1020 | (cond ((< n #x20) (memq n '(#xA #xD #x9))) |
1021 | ((< n #xD800) t) | |
1022 | ((< n #xE000) nil) | |
1023 | ((< n #xFFFE) t) | |
1024 | (t (and (> n #xFFFF) | |
1025 | (< n #x110000))))) | |
1026 | ||
1027 | (defun xmltok-unicode-to-char (n) | |
1028 | "Return the character corresponding to Unicode scalar value N. | |
1029 | Return nil if unsupported in Emacs." | |
1030 | (decode-char 'ucs n)) | |
1031 | ||
1032 | ;;; Prolog parsing | |
1033 | ||
1034 | (defvar xmltok-contains-doctype nil) | |
1035 | (defvar xmltok-doctype-external-subset-flag nil) | |
1036 | (defvar xmltok-internal-subset-start nil) | |
1037 | (defvar xmltok-had-param-entity-ref nil) | |
1038 | (defvar xmltok-prolog-regions nil) | |
1039 | (defvar xmltok-standalone nil | |
10545bd8 | 1040 | "Non-nil if there was an XML declaration specifying standalone=\"yes\".") |
8cd39fb3 MH |
1041 | (defvar xmltok-markup-declaration-doctype-flag nil) |
1042 | ||
1043 | (defconst xmltok-predefined-entity-alist | |
1044 | '(("lt" "<" . "<") | |
1045 | ("gt" ">" . ">") | |
1046 | ("amp" "&" . "&") | |
1047 | ("apos" "'" . "'") | |
1048 | ("quot" "\"" . "\""))) | |
1049 | ||
1050 | (defun xmltok-forward-prolog () | |
1051 | "Move forward to the end of the XML prolog. | |
1052 | ||
1053 | Returns a list of vectors [TYPE START END] where TYPE is a symbol and | |
1054 | START and END are integers giving the start and end of the region of | |
1055 | that type. TYPE can be one of xml-declaration, | |
1056 | xml-declaration-attribute-name, xml-declaration-attribute-value, | |
1057 | comment, processing-instruction-left, processing-instruction-right, | |
10545bd8 | 1058 | markup-declaration-open, markup-declaration-close, |
8cd39fb3 MH |
1059 | internal-subset-open, internal-subset-close, hash-name, keyword, |
1060 | literal, encoding-name. | |
c9990474 | 1061 | Adds to `xmltok-errors' as appropriate." |
8cd39fb3 MH |
1062 | (let ((case-fold-search nil) |
1063 | xmltok-start | |
1064 | xmltok-type | |
1065 | xmltok-prolog-regions | |
1066 | xmltok-contains-doctype | |
1067 | xmltok-internal-subset-start | |
1068 | xmltok-had-param-entity-ref | |
1069 | xmltok-standalone | |
1070 | xmltok-doctype-external-subset-flag | |
1071 | xmltok-markup-declaration-doctype-flag) | |
1072 | (setq xmltok-dtd xmltok-predefined-entity-alist) | |
1073 | (xmltok-scan-xml-declaration) | |
1074 | (xmltok-next-prolog-token) | |
1075 | (while (condition-case err | |
1076 | (when (xmltok-parse-prolog-item) | |
1077 | (xmltok-next-prolog-token)) | |
1078 | (xmltok-markup-declaration-parse-error | |
1079 | (xmltok-skip-markup-declaration)))) | |
1080 | (when xmltok-internal-subset-start | |
1081 | (xmltok-add-error "No closing ]" | |
1082 | (1- xmltok-internal-subset-start) | |
1083 | xmltok-internal-subset-start)) | |
1084 | (xmltok-parse-entities) | |
8cd39fb3 MH |
1085 | (nreverse xmltok-prolog-regions))) |
1086 | ||
1087 | (defconst xmltok-bad-xml-decl-regexp | |
1088 | "[ \t\r\n]*<\\?xml\\(?:[ \t\r\n]\\|\\?>\\)") | |
1089 | ||
1090 | ;;;###autoload | |
1091 | (defun xmltok-get-declared-encoding-position (&optional limit) | |
1092 | "Return the position of the encoding in the XML declaration at point. | |
1093 | If there is a well-formed XML declaration starting at point and it | |
1094 | contains an encoding declaration, then return (START . END) | |
1095 | where START and END are the positions of the start and the end | |
1096 | of the encoding name; if there is no encoding declaration return | |
1097 | the position where and encoding declaration could be inserted. | |
10545bd8 JB |
1098 | If there is XML that is not well-formed that looks like an XML |
1099 | declaration, return nil. Otherwise, return t. | |
8cd39fb3 MH |
1100 | If LIMIT is non-nil, then do not consider characters beyond LIMIT." |
1101 | (cond ((let ((case-fold-search nil)) | |
1102 | (and (looking-at (xmltok-xml-declaration regexp)) | |
1103 | (or (not limit) (<= (match-end 0) limit)))) | |
1104 | (let ((end (xmltok-xml-declaration end encoding-value))) | |
1105 | (if end | |
1106 | (cons (1+ (xmltok-xml-declaration start encoding-value)) | |
1107 | (1- end)) | |
1108 | (or (xmltok-xml-declaration end version-value) | |
1109 | (+ (point) 5))))) | |
1110 | ((not (let ((case-fold-search t)) | |
1111 | (looking-at xmltok-bad-xml-decl-regexp)))))) | |
10545bd8 | 1112 | |
8cd39fb3 MH |
1113 | (defun xmltok-scan-xml-declaration () |
1114 | (when (looking-at (xmltok-xml-declaration regexp)) | |
1115 | (xmltok-add-prolog-region 'xml-declaration (point) (match-end 0)) | |
1116 | (goto-char (match-end 0)) | |
1117 | (when (xmltok-xml-declaration start version-name) | |
1118 | (xmltok-add-prolog-region 'xml-declaration-attribute-name | |
1119 | (xmltok-xml-declaration start version-name) | |
1120 | (xmltok-xml-declaration end version-name)) | |
1121 | (let ((start (xmltok-xml-declaration start version-value)) | |
1122 | (end (xmltok-xml-declaration end version-value))) | |
1123 | (xmltok-add-prolog-region 'xml-declaration-attribute-value | |
1124 | start | |
1125 | end))) | |
1126 | ;; XXX need to check encoding name | |
1127 | ;; Should start with letter, not contain colon | |
1128 | (when (xmltok-xml-declaration start encoding-name) | |
1129 | (xmltok-add-prolog-region 'xml-declaration-attribute-name | |
1130 | (xmltok-xml-declaration start encoding-name) | |
1131 | (xmltok-xml-declaration end encoding-name)) | |
1132 | (let ((start (xmltok-xml-declaration start encoding-value)) | |
1133 | (end (xmltok-xml-declaration end encoding-value))) | |
1134 | (xmltok-add-prolog-region 'encoding-name | |
1135 | (1+ start) | |
1136 | (1- end)) | |
1137 | (xmltok-add-prolog-region 'xml-declaration-attribute-value | |
1138 | start | |
1139 | end))) | |
1140 | (when (xmltok-xml-declaration start standalone-name) | |
1141 | (xmltok-add-prolog-region 'xml-declaration-attribute-name | |
1142 | (xmltok-xml-declaration start standalone-name) | |
1143 | (xmltok-xml-declaration end standalone-name)) | |
1144 | (let ((start (xmltok-xml-declaration start standalone-value)) | |
1145 | (end (xmltok-xml-declaration end standalone-value))) | |
1146 | (xmltok-add-prolog-region 'xml-declaration-attribute-value | |
1147 | start | |
1148 | end) | |
1149 | (setq xmltok-standalone | |
1150 | (string= (buffer-substring-no-properties (1+ start) (1- end)) | |
1151 | "yes")))) | |
1152 | t)) | |
1153 | ||
1154 | (defconst xmltok-markup-declaration-alist | |
1155 | '(("ELEMENT" . xmltok-parse-element-declaration) | |
1156 | ("ATTLIST" . xmltok-parse-attlist-declaration) | |
1157 | ("ENTITY" . xmltok-parse-entity-declaration) | |
1158 | ("NOTATION" . xmltok-parse-notation-declaration))) | |
1159 | ||
1160 | (defun xmltok-parse-prolog-item () | |
1161 | (cond ((eq xmltok-type 'comment) | |
1162 | (xmltok-add-prolog-region 'comment | |
1163 | xmltok-start | |
1164 | (point)) | |
1165 | t) | |
1166 | ((eq xmltok-type 'processing-instruction)) | |
1167 | ((eq xmltok-type 'named-markup-declaration) | |
1168 | (setq xmltok-markup-declaration-doctype-flag nil) | |
1169 | (xmltok-add-prolog-region 'markup-declaration-open | |
1170 | xmltok-start | |
1171 | (point)) | |
1172 | (let* ((name (buffer-substring-no-properties | |
1173 | (+ xmltok-start 2) | |
1174 | (point))) | |
1175 | (fun (cdr (assoc name xmltok-markup-declaration-alist)))) | |
1176 | (cond (fun | |
1177 | (unless xmltok-internal-subset-start | |
1178 | (xmltok-add-error | |
1179 | "Declaration allowed only in internal subset")) | |
1180 | (funcall fun)) | |
1181 | ((string= name "DOCTYPE") | |
1182 | (xmltok-parse-doctype)) | |
1183 | (t | |
1184 | (xmltok-add-error "Unknown markup declaration" | |
1185 | (+ xmltok-start 2)) | |
1186 | (xmltok-next-prolog-token) | |
1187 | (xmltok-markup-declaration-parse-error)))) | |
1188 | t) | |
1189 | ((or (eq xmltok-type 'end-prolog) | |
1190 | (not xmltok-type)) | |
1191 | nil) | |
1192 | ((eq xmltok-type 'internal-subset-close) | |
1193 | (xmltok-add-prolog-region 'internal-subset-close | |
1194 | xmltok-start | |
1195 | (1+ xmltok-start)) | |
1196 | (xmltok-add-prolog-region 'markup-declaration-close | |
1197 | (1- (point)) | |
1198 | (point)) | |
1199 | (if xmltok-internal-subset-start | |
1200 | (setq xmltok-internal-subset-start nil) | |
1201 | (xmltok-add-error "]> outside internal subset")) | |
1202 | t) | |
1203 | ((eq xmltok-type 'param-entity-ref) | |
1204 | (if xmltok-internal-subset-start | |
1205 | (setq xmltok-had-param-entity-ref t) | |
1206 | (xmltok-add-error "Parameter entity reference outside document type declaration")) | |
1207 | t) | |
1208 | ;; If we don't do this, we can get thousands of errors when | |
1209 | ;; a plain text file is parsed. | |
1210 | ((not xmltok-internal-subset-start) | |
1211 | (when (let ((err (car xmltok-errors))) | |
1212 | (or (not err) | |
1213 | (<= (xmltok-error-end err) xmltok-start))) | |
1214 | (goto-char xmltok-start)) | |
1215 | nil) | |
1216 | ((eq xmltok-type 'not-well-formed) t) | |
1217 | (t | |
1218 | (xmltok-add-error "Token allowed only inside markup declaration") | |
1219 | t))) | |
1220 | ||
1221 | (defun xmltok-parse-doctype () | |
1222 | (setq xmltok-markup-declaration-doctype-flag t) | |
1223 | (xmltok-next-prolog-token) | |
1224 | (when xmltok-internal-subset-start | |
1225 | (xmltok-add-error "DOCTYPE declaration not allowed in internal subset") | |
1226 | (xmltok-markup-declaration-parse-error)) | |
1227 | (when xmltok-contains-doctype | |
1228 | (xmltok-add-error "Duplicate DOCTYPE declaration") | |
1229 | (xmltok-markup-declaration-parse-error)) | |
1230 | (setq xmltok-contains-doctype t) | |
1231 | (xmltok-require-token 'name 'prefixed-name) | |
1232 | (xmltok-require-next-token "SYSTEM" "PUBLIC" ?\[ ?>) | |
1233 | (cond ((eq xmltok-type ?\[) | |
1234 | (setq xmltok-internal-subset-start (point))) | |
1235 | ((eq xmltok-type ?>)) | |
1236 | (t | |
1237 | (setq xmltok-doctype-external-subset-flag t) | |
1238 | (xmltok-parse-external-id) | |
1239 | (xmltok-require-token ?\[ ?>) | |
1240 | (when (eq xmltok-type ?\[) | |
1241 | (setq xmltok-internal-subset-start (point)))))) | |
1242 | ||
1243 | (defun xmltok-parse-attlist-declaration () | |
1244 | (xmltok-require-next-token 'prefixed-name 'name) | |
1245 | (while (progn | |
1246 | (xmltok-require-next-token ?> 'name 'prefixed-name) | |
1247 | (if (eq xmltok-type ?>) | |
1248 | nil | |
1249 | (xmltok-require-next-token ?\( | |
1250 | "CDATA" | |
1251 | "ID" | |
1252 | "IDREF" | |
1253 | "IDREFS" | |
1254 | "ENTITY" | |
1255 | "ENTITIES" | |
1256 | "NMTOKEN" | |
1257 | "NMTOKENS" | |
1258 | "NOTATION") | |
1259 | (cond ((eq xmltok-type ?\() | |
1260 | (xmltok-parse-nmtoken-group)) | |
1261 | ((string= (xmltok-current-token-string) | |
1262 | "NOTATION") | |
1263 | (xmltok-require-next-token ?\() | |
1264 | (xmltok-parse-nmtoken-group))) | |
1265 | (xmltok-require-next-token "#IMPLIED" | |
1266 | "#REQUIRED" | |
1267 | "#FIXED" | |
1268 | 'literal) | |
1269 | (when (string= (xmltok-current-token-string) "#FIXED") | |
1270 | (xmltok-require-next-token 'literal)) | |
1271 | t)))) | |
10545bd8 | 1272 | |
8cd39fb3 MH |
1273 | (defun xmltok-parse-nmtoken-group () |
1274 | (while (progn | |
1275 | (xmltok-require-next-token 'nmtoken 'prefixed-name 'name) | |
1276 | (xmltok-require-next-token ?| ?\)) | |
1277 | (eq xmltok-type ?|)))) | |
1278 | ||
1279 | (defun xmltok-parse-element-declaration () | |
1280 | (xmltok-require-next-token 'name 'prefixed-name) | |
1281 | (xmltok-require-next-token "EMPTY" "ANY" ?\() | |
1282 | (when (eq xmltok-type ?\() | |
1283 | (xmltok-require-next-token "#PCDATA" | |
1284 | 'name | |
1285 | 'prefixed-name | |
1286 | 'name-occur | |
1287 | ?\() | |
1288 | (cond ((eq xmltok-type 'hash-name) | |
1289 | (xmltok-require-next-token ?| ?\) 'close-paren-star) | |
1290 | (while (eq xmltok-type ?|) | |
1291 | (xmltok-require-next-token 'name 'prefixed-name) | |
1292 | (xmltok-require-next-token 'close-paren-star ?|))) | |
1293 | (t (xmltok-parse-model-group)))) | |
1294 | (xmltok-require-next-token ?>)) | |
1295 | ||
1296 | (defun xmltok-parse-model-group () | |
1297 | (xmltok-parse-model-group-member) | |
1298 | (xmltok-require-next-token ?| | |
1299 | ?, | |
1300 | ?\) | |
1301 | 'close-paren-star | |
1302 | 'close-paren-occur) | |
1303 | (when (memq xmltok-type '(?, ?|)) | |
1304 | (let ((connector xmltok-type)) | |
1305 | (while (progn | |
1306 | (xmltok-next-prolog-token) | |
1307 | (xmltok-parse-model-group-member) | |
1308 | (xmltok-require-next-token connector | |
1309 | ?\) | |
1310 | 'close-paren-star | |
1311 | 'close-paren-occur) | |
1312 | (eq xmltok-type connector)))))) | |
10545bd8 | 1313 | |
8cd39fb3 MH |
1314 | (defun xmltok-parse-model-group-member () |
1315 | (xmltok-require-token 'name | |
1316 | 'prefixed-name | |
1317 | 'name-occur | |
1318 | ?\() | |
1319 | (when (eq xmltok-type ?\() | |
1320 | (xmltok-next-prolog-token) | |
1321 | (xmltok-parse-model-group))) | |
10545bd8 | 1322 | |
8cd39fb3 MH |
1323 | (defun xmltok-parse-entity-declaration () |
1324 | (let (paramp name) | |
1325 | (xmltok-require-next-token 'name ?%) | |
1326 | (when (eq xmltok-type ?%) | |
1327 | (setq paramp t) | |
1328 | (xmltok-require-next-token 'name)) | |
1329 | (setq name (xmltok-current-token-string)) | |
1330 | (xmltok-require-next-token 'literal "SYSTEM" "PUBLIC") | |
1331 | (cond ((eq xmltok-type 'literal) | |
1332 | (let ((replacement (xmltok-parse-entity-value))) | |
1333 | (unless paramp | |
1334 | (xmltok-define-entity name replacement))) | |
1335 | (xmltok-require-next-token ?>)) | |
1336 | (t | |
1337 | (xmltok-parse-external-id) | |
1338 | (if paramp | |
1339 | (xmltok-require-token ?>) | |
1340 | (xmltok-require-token ?> "NDATA") | |
1341 | (if (eq xmltok-type ?>) | |
1342 | (xmltok-define-entity name 'external) | |
1343 | (xmltok-require-next-token 'name) | |
1344 | (xmltok-require-next-token ?>) | |
1345 | (xmltok-define-entity name 'unparsed))))))) | |
1346 | ||
1347 | (defun xmltok-define-entity (name value) | |
1348 | (when (and (or (not xmltok-had-param-entity-ref) | |
1349 | xmltok-standalone) | |
1350 | (not (assoc name xmltok-dtd))) | |
1351 | (setq xmltok-dtd | |
1352 | (cons (cons name value) xmltok-dtd)))) | |
10545bd8 | 1353 | |
8cd39fb3 MH |
1354 | (defun xmltok-parse-entity-value () |
1355 | (let ((lim (1- (point))) | |
1356 | (well-formed t) | |
1357 | value-parts | |
1358 | start) | |
1359 | (save-excursion | |
1360 | (goto-char (1+ xmltok-start)) | |
1361 | (setq start (point)) | |
1362 | (while (progn | |
1363 | (skip-chars-forward "^%&" lim) | |
1364 | (when (< (point) lim) | |
1365 | (goto-char (1+ (point))) | |
1366 | (cond ((eq (char-before) ?%) | |
1367 | (xmltok-add-error "Parameter entity references are not allowed in the internal subset" | |
1368 | (1- (point)) | |
1369 | (point)) | |
1370 | (setq well-formed nil)) | |
1371 | (t | |
1372 | (let ((xmltok-start (1- (point))) | |
1373 | xmltok-type xmltok-replacement) | |
1374 | (xmltok-scan-after-amp (lambda (start end))) | |
1375 | (cond ((eq xmltok-type 'char-ref) | |
1376 | (setq value-parts | |
1377 | (cons (buffer-substring-no-properties | |
1378 | start | |
1379 | xmltok-start) | |
1380 | value-parts)) | |
1381 | (setq value-parts | |
1382 | (cons xmltok-replacement | |
1383 | value-parts)) | |
1384 | (setq start (point))) | |
1385 | ((eq xmltok-type 'not-well-formed) | |
1386 | (setq well-formed nil)))))) | |
1387 | t)))) | |
1388 | (if (not well-formed) | |
1389 | nil | |
1390 | (apply 'concat | |
1391 | (nreverse (cons (buffer-substring-no-properties start lim) | |
1392 | value-parts)))))) | |
10545bd8 | 1393 | |
8cd39fb3 MH |
1394 | (defun xmltok-parse-notation-declaration () |
1395 | (xmltok-require-next-token 'name) | |
1396 | (xmltok-require-next-token "SYSTEM" "PUBLIC") | |
1397 | (let ((publicp (string= (xmltok-current-token-string) "PUBLIC"))) | |
1398 | (xmltok-require-next-token 'literal) | |
1399 | (cond (publicp | |
1400 | (xmltok-require-next-token 'literal ?>) | |
1401 | (unless (eq xmltok-type ?>) | |
1402 | (xmltok-require-next-token ?>))) | |
1403 | (t (xmltok-require-next-token ?>))))) | |
1404 | ||
1405 | (defun xmltok-parse-external-id () | |
1406 | (xmltok-require-token "SYSTEM" "PUBLIC") | |
1407 | (let ((publicp (string= (xmltok-current-token-string) "PUBLIC"))) | |
1408 | (xmltok-require-next-token 'literal) | |
1409 | (when publicp | |
1410 | (xmltok-require-next-token 'literal))) | |
1411 | (xmltok-next-prolog-token)) | |
1412 | ||
1413 | (defun xmltok-require-next-token (&rest types) | |
1414 | (xmltok-next-prolog-token) | |
1415 | (apply 'xmltok-require-token types)) | |
1416 | ||
1417 | (defun xmltok-require-token (&rest types) | |
1418 | ;; XXX Generate a more helpful error message | |
1419 | (while (and (not (let ((type (car types))) | |
1420 | (if (stringp (car types)) | |
1421 | (string= (xmltok-current-token-string) type) | |
1422 | (eq type xmltok-type)))) | |
1423 | (setq types (cdr types)))) | |
1424 | (unless types | |
1425 | (when (and xmltok-type | |
1426 | (not (eq xmltok-type 'not-well-formed))) | |
1427 | (xmltok-add-error "Unexpected token")) | |
1428 | (xmltok-markup-declaration-parse-error)) | |
1429 | (let ((region-type (xmltok-prolog-region-type (car types)))) | |
1430 | (when region-type | |
1431 | (xmltok-add-prolog-region region-type | |
1432 | xmltok-start | |
1433 | (point))))) | |
1434 | ||
1435 | (defun xmltok-current-token-string () | |
1436 | (buffer-substring-no-properties xmltok-start (point))) | |
1437 | ||
1438 | (put 'xmltok-markup-declaration-parse-error | |
1439 | 'error-conditions | |
1440 | '(error xmltok-markup-declaration-parse-error)) | |
1441 | ||
1442 | (put 'xmltok-markup-declaration-parse-error | |
1443 | 'error-message | |
1444 | "Syntax error in markup declaration") | |
1445 | ||
1446 | (defun xmltok-markup-declaration-parse-error () | |
1447 | (signal 'xmltok-markup-declaration-parse-error nil)) | |
1448 | ||
1449 | (defun xmltok-skip-markup-declaration () | |
1450 | (while (cond ((eq xmltok-type ?>) | |
1451 | (xmltok-next-prolog-token) | |
1452 | nil) | |
1453 | ((and xmltok-markup-declaration-doctype-flag | |
1454 | (eq xmltok-type ?\[)) | |
1455 | (setq xmltok-internal-subset-start (point)) | |
1456 | (xmltok-next-prolog-token) | |
1457 | nil) | |
1458 | ((memq xmltok-type '(nil | |
1459 | end-prolog | |
1460 | named-markup-declaration | |
1461 | comment | |
1462 | processing-instruction)) | |
1463 | nil) | |
1464 | ((and xmltok-internal-subset-start | |
1465 | (eq xmltok-type 'internal-subset-close)) | |
1466 | nil) | |
1467 | (t (xmltok-next-prolog-token) t))) | |
1468 | xmltok-type) | |
1469 | ||
1470 | (defun xmltok-prolog-region-type (required) | |
1471 | (cond ((cdr (assq xmltok-type | |
1472 | '((literal . literal) | |
1473 | (?> . markup-declaration-close) | |
1474 | (?\[ . internal-subset-open) | |
1475 | (hash-name . hash-name))))) | |
1476 | ((and (stringp required) (eq xmltok-type 'name)) | |
1477 | 'keyword))) | |
10545bd8 | 1478 | |
8cd39fb3 | 1479 | ;; Return new token type. |
10545bd8 | 1480 | |
8cd39fb3 MH |
1481 | (defun xmltok-next-prolog-token () |
1482 | (skip-chars-forward " \t\r\n") | |
1483 | (setq xmltok-start (point)) | |
1484 | (cond ((not (and (looking-at (xmltok-prolog regexp)) | |
1485 | (goto-char (match-end 0)))) | |
1486 | (let ((ch (char-after))) | |
1487 | (cond (ch | |
1488 | (goto-char (1+ (point))) | |
1489 | (xmltok-add-error "Illegal char in prolog") | |
1490 | (setq xmltok-type 'not-well-formed)) | |
1491 | (t (setq xmltok-type nil))))) | |
1492 | ((or (xmltok-prolog start ncname-occur) | |
1493 | (xmltok-prolog start colon-name-occur)) | |
1494 | (setq xmltok-name-end (1- (point))) | |
1495 | (setq xmltok-name-colon (xmltok-prolog start colon)) | |
1496 | (setq xmltok-type 'name-occur)) | |
1497 | ((xmltok-prolog start colon) | |
1498 | (setq xmltok-name-end (point)) | |
1499 | (setq xmltok-name-colon (xmltok-prolog start colon)) | |
1500 | (unless (looking-at "[ \t\r\n>),|[%]") | |
1501 | (xmltok-add-error "Missing space after name")) | |
1502 | (setq xmltok-type 'prefixed-name)) | |
1503 | ((or (xmltok-prolog start ncname-nmtoken) | |
1504 | (xmltok-prolog start ncname-colon)) | |
1505 | (unless (looking-at "[ \t\r\n>),|[%]") | |
1506 | (xmltok-add-error "Missing space after name token")) | |
1507 | (setq xmltok-type 'nmtoken)) | |
10545bd8 | 1508 | ((xmltok-prolog start name) |
8cd39fb3 MH |
1509 | (setq xmltok-name-end (point)) |
1510 | (setq xmltok-name-colon nil) | |
1511 | (unless (looking-at "[ \t\r\n>),|[%]") | |
1512 | (xmltok-add-error "Missing space after name")) | |
1513 | (setq xmltok-type 'name)) | |
10545bd8 | 1514 | ((xmltok-prolog start hash-name) |
8cd39fb3 MH |
1515 | (setq xmltok-name-end (point)) |
1516 | (unless (looking-at "[ \t\r\n>)|%]") | |
1517 | (xmltok-add-error "Missing space after name")) | |
1518 | (setq xmltok-type 'hash-name)) | |
1519 | ((xmltok-prolog start processing-instruction-question) | |
1520 | (xmltok-scan-prolog-after-processing-instruction-open)) | |
1521 | ((xmltok-prolog start comment-open) | |
1522 | ;; XXX if not-well-formed, ignore some stuff | |
1523 | (xmltok-scan-after-comment-open)) | |
1524 | ((xmltok-prolog start named-markup-declaration) | |
1525 | (setq xmltok-type 'named-markup-declaration)) | |
1526 | ((xmltok-prolog start instance-start) | |
1527 | (goto-char xmltok-start) | |
1528 | (setq xmltok-type 'end-prolog)) | |
1529 | ((xmltok-prolog start close-paren-star) | |
1530 | (setq xmltok-type 'close-paren-star)) | |
1531 | ((xmltok-prolog start close-paren-occur) | |
1532 | (setq xmltok-type 'close-paren-occur)) | |
1533 | ((xmltok-prolog start close-paren) | |
1534 | (unless (looking-at "[ \t\r\n>,|)]") | |
1535 | (xmltok-add-error "Missing space after )")) | |
1536 | (setq xmltok-type ?\))) | |
1537 | ((xmltok-prolog start single-char) | |
1538 | (let ((ch (char-before))) | |
1539 | (cond ((memq ch '(?\" ?\')) | |
1540 | (xmltok-scan-prolog-literal)) | |
1541 | (t (setq xmltok-type ch))))) | |
1542 | ((xmltok-prolog start percent) | |
1543 | (cond ((xmltok-prolog start param-entity-ref-close) | |
1544 | (setq xmltok-name-end (1- (point))) | |
1545 | (setq xmltok-type 'param-entity-ref)) | |
1546 | ((xmltok-prolog start param-entity-ref) | |
1547 | (xmltok-add-error "Missing ;") | |
1548 | (setq xmltok-name-end (point)) | |
1549 | (setq xmltok-type 'param-entity-ref)) | |
1550 | ((looking-at "[ \t\r\n%]") | |
1551 | (setq xmltok-type ?%)) | |
1552 | (t | |
1553 | (xmltok-add-error "Expected name after %") | |
1554 | (setq xmltok-type 'not-well-formed)))) | |
1555 | ((xmltok-prolog start nmtoken) | |
1556 | (unless (looking-at "[ \t\r\n>),|[%]") | |
1557 | (xmltok-add-error "Missing space after name token")) | |
1558 | (setq xmltok-type 'nmtoken)) | |
1559 | ((xmltok-prolog start internal-subset-close) | |
1560 | (setq xmltok-type 'internal-subset-close)) | |
1561 | ((xmltok-prolog start pound) | |
1562 | (xmltok-add-error "Expected name after #") | |
1563 | (setq xmltok-type 'not-well-formed)) | |
1564 | ((xmltok-prolog start markup-declaration) | |
1565 | (xmltok-add-error "Expected name or -- after <!") | |
1566 | (setq xmltok-type 'not-well-formed)) | |
1567 | ((xmltok-prolog start comment-first-dash) | |
1568 | (xmltok-add-error "Expected <!--") | |
1569 | (setq xmltok-type 'not-well-formed)) | |
1570 | ((xmltok-prolog start less-than) | |
1571 | (xmltok-add-error "Incomplete markup") | |
1572 | (setq xmltok-type 'not-well-formed)) | |
1573 | (t (error "Unhandled token in prolog %s" | |
1574 | (match-string-no-properties 0))))) | |
1575 | ||
1576 | (defun xmltok-scan-prolog-literal () | |
1577 | (let* ((delim (string (char-before))) | |
1578 | (safe-end (save-excursion | |
1579 | (skip-chars-forward (concat "^<>[]" delim)) | |
1580 | (point))) | |
1581 | (end (save-excursion | |
1582 | (goto-char safe-end) | |
1583 | (search-forward delim nil t)))) | |
c9990474 SM |
1584 | (cond ((or (not end) |
1585 | (save-excursion | |
1586 | (goto-char end) | |
1587 | (looking-at "[ \t\r\n>%[]"))) | |
1588 | (goto-char end)) | |
1589 | ((eq (1+ safe-end) end) | |
1590 | (goto-char end) | |
1591 | (xmltok-add-error (format "Missing space after %s" delim) | |
1592 | safe-end))) | |
1593 | (setq xmltok-type 'literal))) | |
8cd39fb3 MH |
1594 | |
1595 | (defun xmltok-scan-prolog-after-processing-instruction-open () | |
c9990474 SM |
1596 | (search-forward "?>" nil 'move) |
1597 | (let* ((end (point)) | |
1598 | (target | |
1599 | (save-excursion | |
1600 | (goto-char (+ xmltok-start 2)) | |
1601 | (and (looking-at (xmltok-ncname regexp)) | |
1602 | (or (memq (char-after (match-end 0)) | |
1603 | '(?\n ?\t ?\r ? )) | |
1604 | (= (match-end 0) (- end 2))) | |
1605 | (match-string-no-properties 0))))) | |
1606 | (cond ((not target) | |
1607 | (xmltok-add-error "\ | |
8cd39fb3 | 1608 | Processing instruction does not start with a name" |
c9990474 SM |
1609 | (+ xmltok-start 2) |
1610 | (+ xmltok-start 3))) | |
1611 | ((not (and (= (length target) 3) | |
1612 | (let ((case-fold-search t)) | |
1613 | (string-match "xml" target))))) | |
1614 | ((= xmltok-start 1) | |
1615 | (xmltok-add-error "Invalid XML declaration" | |
1616 | xmltok-start | |
1617 | (point))) | |
1618 | ((save-excursion | |
1619 | (goto-char xmltok-start) | |
1620 | (looking-at (xmltok-xml-declaration regexp))) | |
1621 | (xmltok-add-error "XML declaration not at beginning of file" | |
1622 | xmltok-start | |
1623 | (point))) | |
1624 | (t | |
1625 | (xmltok-add-error "Processing instruction has target of xml" | |
1626 | (+ xmltok-start 2) | |
1627 | (+ xmltok-start 5)))) | |
1628 | (xmltok-add-prolog-region 'processing-instruction-left | |
1629 | xmltok-start | |
1630 | (+ xmltok-start | |
1631 | 2 | |
1632 | (if target | |
1633 | (length target) | |
1634 | 0))) | |
1635 | (xmltok-add-prolog-region 'processing-instruction-right | |
1636 | (if target | |
1637 | (save-excursion | |
1638 | (goto-char (+ xmltok-start | |
1639 | (length target) | |
1640 | 2)) | |
1641 | (skip-chars-forward " \t\r\n") | |
1642 | (point)) | |
1643 | (+ xmltok-start 2)) | |
1644 | (point))) | |
1645 | (setq xmltok-type 'processing-instruction)) | |
8cd39fb3 MH |
1646 | |
1647 | (defun xmltok-parse-entities () | |
1648 | (let ((todo xmltok-dtd)) | |
1649 | (when (and (or xmltok-had-param-entity-ref | |
1650 | xmltok-doctype-external-subset-flag) | |
1651 | (not xmltok-standalone)) | |
1652 | (setq xmltok-dtd (cons t xmltok-dtd))) | |
1653 | (while todo | |
1654 | (xmltok-parse-entity (car todo)) | |
1655 | (setq todo (cdr todo))))) | |
10545bd8 | 1656 | |
8cd39fb3 MH |
1657 | (defun xmltok-parse-entity (name-def) |
1658 | (let ((def (cdr name-def)) | |
10545bd8 | 1659 | ;; in case its value is buffer local |
8cd39fb3 MH |
1660 | (xmltok-dtd xmltok-dtd) |
1661 | buf) | |
1662 | (when (stringp def) | |
1663 | (if (string-match "\\`[^&<\t\r\n]*\\'" def) | |
1664 | (setcdr name-def (cons def def)) | |
1665 | (setcdr name-def 'not-well-formed) ; avoid infinite expansion loops | |
1666 | (setq buf (get-buffer-create | |
1667 | (format " *Entity %s*" (car name-def)))) | |
2adaf057 | 1668 | (with-current-buffer buf |
8cd39fb3 MH |
1669 | (erase-buffer) |
1670 | (insert def) | |
1671 | (goto-char (point-min)) | |
1672 | (setcdr name-def | |
1673 | (xmltok-parse-entity-replacement))) | |
1674 | (kill-buffer buf))))) | |
1675 | ||
1676 | (defun xmltok-parse-entity-replacement () | |
1677 | (let ((def (cons "" ""))) | |
1678 | (while (let* ((start (point)) | |
1679 | (found (re-search-forward "[<&\t\r\n]\\|]]>" nil t)) | |
1680 | (ch (and found (char-before))) | |
1681 | (str (buffer-substring-no-properties | |
1682 | start | |
1683 | (if found | |
1684 | (match-beginning 0) | |
1685 | (point-max))))) | |
1686 | (setq def | |
1687 | (xmltok-append-entity-def def | |
1688 | (cons str str))) | |
1689 | (cond ((not found) nil) | |
1690 | ((eq ch ?>) | |
1691 | (setq def 'not-well-formed) | |
1692 | nil) | |
1693 | ((eq ch ?<) | |
1694 | (xmltok-save | |
1695 | (setq xmltok-start (1- (point))) | |
1696 | (xmltok-scan-after-lt) | |
1697 | (setq def | |
1698 | (xmltok-append-entity-def | |
1699 | def | |
1700 | (cond ((memq xmltok-type | |
1701 | '(start-tag | |
1702 | end-tag | |
1703 | empty-element)) | |
1704 | 'element) | |
1705 | ((memq xmltok-type | |
1706 | '(comment | |
1707 | processing-instruction)) | |
1708 | (cons "" nil)) | |
1709 | ((eq xmltok-type | |
1710 | 'cdata-section) | |
1711 | (cons (buffer-substring-no-properties | |
1712 | (+ xmltok-start 9) | |
1713 | (- (point) 3)) | |
1714 | nil)) | |
1715 | (t 'not-well-formed))))) | |
1716 | t) | |
1717 | ((eq ch ?&) | |
1718 | (let ((xmltok-start (1- (point))) | |
1719 | xmltok-type | |
1720 | xmltok-replacement | |
1721 | xmltok-errors) | |
1722 | (xmltok-scan-after-amp 'xmltok-handle-nested-entity) | |
1723 | (cond ((eq xmltok-type 'entity-ref) | |
1724 | (setq def | |
1725 | (xmltok-append-entity-def | |
1726 | def | |
1727 | xmltok-replacement))) | |
1728 | ((eq xmltok-type 'char-ref) | |
1729 | (setq def | |
1730 | (xmltok-append-entity-def | |
1731 | def | |
1732 | (if xmltok-replacement | |
1733 | (cons xmltok-replacement | |
1734 | xmltok-replacement) | |
1735 | (and xmltok-errors 'not-well-formed))))) | |
1736 | (t | |
1737 | (setq def 'not-well-formed)))) | |
1738 | t) | |
1739 | (t | |
1740 | (setq def | |
1741 | (xmltok-append-entity-def | |
1742 | def | |
1743 | (cons (match-string-no-properties 0) | |
1744 | " "))) | |
1745 | t)))) | |
1746 | def)) | |
1747 | ||
1748 | (defun xmltok-handle-nested-entity (start end) | |
1749 | (let* ((name-def (assoc (buffer-substring-no-properties start end) | |
1750 | xmltok-dtd)) | |
1751 | (def (cdr name-def))) | |
1752 | (when (stringp def) | |
1753 | (xmltok-parse-entity name-def) | |
1754 | (setq def (cdr name-def))) | |
1755 | (setq xmltok-replacement | |
1756 | (cond ((null name-def) | |
1757 | (if (eq (car xmltok-dtd) t) | |
1758 | nil | |
1759 | 'not-well-formed)) | |
1760 | ((eq def 'unparsed) 'not-well-formed) | |
1761 | (t def))))) | |
10545bd8 | 1762 | |
8cd39fb3 MH |
1763 | (defun xmltok-append-entity-def (d1 d2) |
1764 | (cond ((consp d1) | |
1765 | (if (consp d2) | |
1766 | (cons (concat (car d1) (car d2)) | |
1767 | (and (cdr d1) | |
1768 | (cdr d2) | |
1769 | (concat (cdr d1) (cdr d2)))) | |
1770 | d2)) | |
1771 | ((consp d2) d1) | |
1772 | (t | |
1773 | (let ((defs '(not-well-formed external element))) | |
1774 | (while (not (or (eq (car defs) d1) | |
1775 | (eq (car defs) d2))) | |
1776 | (setq defs (cdr defs))) | |
1777 | (car defs))))) | |
1778 | ||
1779 | (defun xmltok-add-prolog-region (type start end) | |
1780 | (setq xmltok-prolog-regions | |
1781 | (cons (vector type start end) | |
1782 | xmltok-prolog-regions))) | |
1783 | ||
1784 | (defun xmltok-merge-attributes () | |
10545bd8 | 1785 | "Return a list merging `xmltok-attributes' and `xmltok-namespace-attributes'. |
8cd39fb3 MH |
1786 | The members of the merged list are in order of occurrence in the |
1787 | document. The list may share list structure with `xmltok-attributes' | |
1788 | and `xmltok-namespace-attributes'." | |
1789 | (cond ((not xmltok-namespace-attributes) | |
1790 | xmltok-attributes) | |
1791 | ((not xmltok-attributes) | |
1792 | xmltok-namespace-attributes) | |
1793 | (t | |
1794 | (let ((atts1 xmltok-attributes) | |
1795 | (atts2 xmltok-namespace-attributes) | |
1796 | merged) | |
1797 | (while (and atts1 atts2) | |
1798 | (cond ((< (xmltok-attribute-name-start (car atts1)) | |
1799 | (xmltok-attribute-name-start (car atts2))) | |
1800 | (setq merged (cons (car atts1) merged)) | |
1801 | (setq atts1 (cdr atts1))) | |
1802 | (t | |
1803 | (setq merged (cons (car atts2) merged)) | |
1804 | (setq atts2 (cdr atts2))))) | |
1805 | (setq merged (nreverse merged)) | |
1806 | (cond (atts1 (setq merged (nconc merged atts1))) | |
1807 | (atts2 (setq merged (nconc merged atts2)))) | |
1808 | merged)))) | |
1809 | ||
1810 | ;;; Testing | |
1811 | ||
1812 | (defun xmltok-forward-test () | |
1813 | (interactive) | |
1814 | (if (xmltok-forward) | |
1815 | (message "Scanned %s" xmltok-type) | |
1816 | (message "Scanned nothing"))) | |
1817 | ||
1818 | (defun xmltok-next-prolog-token-test () | |
1819 | (interactive) | |
1820 | (if (xmltok-next-prolog-token) | |
1821 | (message "Scanned %s" | |
1822 | (if (integerp xmltok-type) | |
1823 | (string xmltok-type) | |
1824 | xmltok-type)) | |
1825 | (message "Scanned end of file"))) | |
1826 | ||
1827 | (provide 'xmltok) | |
1828 | ||
1829 | ;;; xmltok.el ends here |