* lisp/nxml/nxml-mode.el: Use lexical-binding and syntax-propertize.
[bpt/emacs.git] / lisp / nxml / xmltok.el
CommitLineData
da3e5ebb 1;;; xmltok.el --- XML tokenization -*- lexical-binding:t -*-
8cd39fb3 2
ab422c4d 3;; Copyright (C) 2003, 2007-2013 Free Software Foundation, Inc.
8cd39fb3
MH
4
5;; Author: James Clark
6;; Keywords: XML
7
9499da38 8;; This file is part of GNU Emacs.
8cd39fb3 9
4936186e 10;; GNU Emacs is free software: you can redistribute it and/or modify
9499da38 11;; it under the terms of the GNU General Public License as published by
4936186e
GM
12;; the Free Software Foundation, either version 3 of the License, or
13;; (at your option) any later version.
8cd39fb3 14
9499da38
GM
15;; GNU Emacs is distributed in the hope that it will be useful,
16;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18;; GNU General Public License for more details.
19
20;; You should have received a copy of the GNU General Public License
4936186e 21;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
8cd39fb3
MH
22
23;;; Commentary:
24
25;; This implements an XML 1.0 parser. It also implements the XML
26;; Namespaces Recommendation. It is designed to be conforming, but it
27;; works a bit differently from a normal XML parser. An XML document
28;; consists of the prolog and an instance. The prolog is parsed as a
29;; single unit using `xmltok-forward-prolog'. The instance is
30;; considered as a sequence of tokens, where a token is something like
31;; a start-tag, a comment, a chunk of data or a CDATA section. The
32;; tokenization of the instance is stateless: the tokenization of one
33;; part of the instance does not depend on tokenization of the
34;; preceding part of the instance. This allows the instance to be
35;; parsed incrementally. The main entry point is `xmltok-forward':
36;; this can be called at any point in the instance provided it is
37;; between tokens. The other entry point is `xmltok-forward-special'
38;; which skips over tokens other comments, processing instructions or
39;; CDATA sections (i.e. the constructs in an instance that can contain
40;; less than signs that don't start a token).
41;;
42;; This is a non-validating XML 1.0 processor. It does not resolve
43;; parameter entities (including the external DTD subset) and it does
44;; not resolve external general entities.
45;;
46;; It is non-conformant by design in the following respects.
47;;
48;; 1. It expects the client to detect aspects of well-formedness that
49;; are not internal to a single token, specifically checking that
50;; end-tags match start-tags and that the instance contains exactly
51;; one element.
52;;
53;; 2. It expects the client to detect duplicate attributes. Detection
54;; of duplicate attributes after expansion of namespace prefixes
55;; requires the namespace processing state. Detection of duplicate
56;; attributes before expansion of namespace prefixes does not, but is
57;; redundant given that the client will do detection of duplicate
58;; attributes after expansion of namespace prefixes.
59;;
60;; 3. It allows the client to recover from well-formedness errors.
61;; This is essential for use in applications where the document is
62;; being parsed during the editing process.
63;;
64;; 4. It does not support documents that do not conform to the lexical
65;; requirements of the XML Namespaces Recommendation (e.g. a document
66;; with a colon in an entity name).
67;;
68;; There are also a number of things that have not yet been
69;; implemented that make it non-conformant.
70;;
71;; 1. It does not implement default attributes. ATTLIST declarations
72;; are parsed, but no checking is done on the content of attribute
73;; value literals specifying default attribute values, and default
74;; attribute values are not reported to the client.
75;;
76;; 2. It does not implement internal entities containing elements. If
77;; an internal entity is referenced and parsing its replacement text
78;; yields one or more tags, then it will skip the reference and
79;; report this to the client.
80;;
81;; 3. It does not check the syntax of public identifiers in the DTD.
82;;
83;; 4. It allows some non-ASCII characters in certain situations where
84;; it should not. For example, it only enforces XML 1.0's
85;; restrictions on name characters strictly for ASCII characters. The
86;; problem here is XML's character model is based squarely on Unicode,
87;; whereas Emacs's is not (as of version 21). It is not clear what
88;; the right thing to do is.
89
90;;; Code:
91
92(defvar xmltok-type nil)
93(defvar xmltok-start nil)
94(defvar xmltok-name-colon nil)
95(defvar xmltok-name-end nil)
96(defvar xmltok-replacement nil
97 "String containing replacement for a character or entity reference.")
98
99(defvar xmltok-attributes nil
100 "List containing attributes of last scanned element.
101Each member of the list is a vector representing an attribute, which
102can be accessed using the functions `xmltok-attribute-name-start',
103`xmltok-attribute-name-colon', `xmltok-attribute-name-end',
104`xmltok-attribute-value-start', `xmltok-attribute-value-end',
105`xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.")
106
107(defvar xmltok-namespace-attributes nil
108 "List containing namespace declarations of last scanned element.
109List has same format as `xmltok-attributes'.")
110
111(defvar xmltok-dtd nil
112 "Information about the DTD used by `xmltok-forward'.
113`xmltok-forward-prolog' sets this up.
114
115It consists of an alist of general entity names vs definitions. The
116first member of the alist is t if references to entities not in the
117alist are well-formed \(e.g. because there's an external subset that
118wasn't parsed).
119
10545bd8
JB
120Each general entity name is a string. The definition is either nil,
121a symbol, a string, a cons cell. If the definition is nil, then it
8cd39fb3
MH
122means that it's an internal entity but the result of parsing it is
123unknown. If it is a symbol, then the symbol is either `unparsed',
124meaning the entity is an unparsed entity, `external', meaning the
125entity is or references an external entity, `element', meaning the
126entity includes one or more elements, or `not-well-formed', meaning
127the replacement text is not well-formed. If the definition is a
128string, then the replacement text of the entity is that string; this
10545bd8
JB
129happens only during the parsing of the prolog. If the definition is
130a cons cell \(ER . AR), then ER specifies the string that results
131from referencing the entity in element content and AR is either nil,
8cd39fb3
MH
132meaning the replacement text included a <, or a string which is the
133normalized attribute value.")
134
8cd39fb3
MH
135
136(defvar xmltok-errors nil
137 "List of errors detected by `xmltok-forward' and `xmltok-forward-prolog'.
138When `xmltok-forward' and `xmltok-forward-prolog' detect a
139well-formedness error, they will add an entry to the beginning of this
140list. Each entry is a vector [MESSAGE START END], where MESSAGE is a
141string giving the error message and START and END are integers
142indicating the position of the error.")
143
144(defmacro xmltok-save (&rest body)
da3e5ebb 145 (declare (indent 0) (debug t))
8cd39fb3
MH
146 `(let (xmltok-type
147 xmltok-start
148 xmltok-name-colon
149 xmltok-name-end
150 xmltok-replacement
151 xmltok-attributes
152 xmltok-namespace-attributes
8cd39fb3
MH
153 xmltok-errors)
154 ,@body))
155
8cd39fb3
MH
156(defsubst xmltok-attribute-name-start (att)
157 (aref att 0))
158
159(defsubst xmltok-attribute-name-colon (att)
160 (aref att 1))
161
162(defsubst xmltok-attribute-name-end (att)
163 (aref att 2))
164
165(defsubst xmltok-attribute-value-start (att)
166 (aref att 3))
167
168(defsubst xmltok-attribute-value-end (att)
169 (aref att 4))
170
171(defsubst xmltok-attribute-raw-normalized-value (att)
172 "Return an object representing the normalized value of ATT.
10545bd8
JB
173This can be t indicating that the normalized value is the same as
174the buffer substring from the start to the end of the value, or nil
8cd39fb3
MH
175indicating that the value is not well-formed or a string."
176 (aref att 5))
177
178(defsubst xmltok-attribute-refs (att)
179 "Return a list of the entity and character references in ATT.
180Each member is a vector [TYPE START END] where TYPE is either char-ref
10545bd8
JB
181or entity-ref and START and END are integers giving the start and end of
182the reference. Nested entity references are not included in the list."
8cd39fb3
MH
183 (aref att 6))
184
185(defun xmltok-attribute-prefix (att)
186 (let ((colon (xmltok-attribute-name-colon att)))
187 (and colon
188 (buffer-substring-no-properties (xmltok-attribute-name-start att)
189 colon))))
190
191(defun xmltok-attribute-local-name (att)
192 (let ((colon (xmltok-attribute-name-colon att)))
193 (buffer-substring-no-properties (if colon
194 (1+ colon)
195 (xmltok-attribute-name-start att))
196 (xmltok-attribute-name-end att))))
197
198(defun xmltok-attribute-value (att)
199 (let ((rnv (xmltok-attribute-raw-normalized-value att)))
200 (and rnv
201 (if (stringp rnv)
202 rnv
203 (buffer-substring-no-properties (xmltok-attribute-value-start att)
204 (xmltok-attribute-value-end att))))))
205
206(defun xmltok-start-tag-prefix ()
207 (and xmltok-name-colon
208 (buffer-substring-no-properties (1+ xmltok-start)
209 xmltok-name-colon)))
210
211(defun xmltok-start-tag-local-name ()
212 (buffer-substring-no-properties (1+ (or xmltok-name-colon
213 xmltok-start))
214 xmltok-name-end))
215
216(defun xmltok-end-tag-prefix ()
217 (and xmltok-name-colon
218 (buffer-substring-no-properties (+ 2 xmltok-start)
219 xmltok-name-colon)))
220
221(defun xmltok-end-tag-local-name ()
222 (buffer-substring-no-properties (if xmltok-name-colon
223 (1+ xmltok-name-colon)
224 (+ 2 xmltok-start))
225 xmltok-name-end))
226
227(defun xmltok-start-tag-qname ()
228 (buffer-substring-no-properties (+ xmltok-start 1) xmltok-name-end))
229
230(defun xmltok-end-tag-qname ()
231 (buffer-substring-no-properties (+ xmltok-start 2) xmltok-name-end))
232
233(defsubst xmltok-make-attribute (name-begin
234 name-colon
235 name-end
236 &optional
237 value-begin
238 value-end
239 raw-normalized-value)
10545bd8
JB
240 "Make an attribute.
241RAW-NORMALIZED-VALUE is nil if the value is not well-formed,
242t if the normalized value is the string between VALUE-BEGIN
243and VALUE-END, otherwise a STRING giving the value."
8cd39fb3
MH
244 (vector name-begin
245 name-colon
246 name-end
247 value-begin
248 value-end
249 raw-normalized-value
250 nil))
251
252(defsubst xmltok-error-message (err)
253 (aref err 0))
254
255(defsubst xmltok-error-start (err)
256 (aref err 1))
257
258(defsubst xmltok-error-end (err)
259 (aref err 2))
260
261(defsubst xmltok-make-error (message start end)
262 (vector message start end))
263
264(defun xmltok-add-error (message &optional start end)
265 (setq xmltok-errors
266 (cons (xmltok-make-error message
267 (or start xmltok-start)
268 (or end (point)))
269 xmltok-errors)))
270
8cd39fb3
MH
271(defun xmltok-forward ()
272 (setq xmltok-start (point))
273 (let* ((case-fold-search nil)
274 (space-count (skip-chars-forward " \t\r\n"))
275 (ch (char-after)))
276 (cond ((eq ch ?\<)
277 (cond ((> space-count 0)
278 (setq xmltok-type 'space))
279 (t
eb22cb8c 280 (forward-char 1)
8cd39fb3
MH
281 (xmltok-scan-after-lt))))
282 ((eq ch ?\&)
283 (cond ((> space-count 0)
284 (setq xmltok-type 'space))
285 (t
eb22cb8c
GM
286 (forward-char 1)
287 (xmltok-scan-after-amp 'xmltok-handle-entity))))
8cd39fb3
MH
288 ((re-search-forward "[<&]\\|\\(]]>\\)" nil t)
289 (cond ((not (match-beginning 1))
290 (goto-char (match-beginning 0))
291 ;; must have got a non-space char
292 (setq xmltok-type 'data))
293 ((= (match-beginning 1) xmltok-start)
294 (xmltok-add-error "Found `]]>' not closing a CDATA section")
295 (setq xmltok-type 'not-well-formed))
296 (t
297 (goto-char (match-beginning 0))
298 (setq xmltok-type
299 (if (= (point) (+ xmltok-start space-count))
300 'space
301 'data)))))
302 ((eq ch nil)
303 (setq xmltok-type
304 (if (> space-count 0)
305 'space
306 nil)))
307 (t
308 (goto-char (point-max))
309 (setq xmltok-type 'data)))))
310
311(defun xmltok-forward-special (bound)
312 "Scan forward past the first special token starting at or after point.
313Return nil if there is no special token that starts before BOUND.
314CDATA sections, processing instructions and comments (and indeed
10545bd8
JB
315anything starting with < following by ? or !) count as special.
316Return the type of the token."
8cd39fb3
MH
317 (when (re-search-forward "<[?!]" (1+ bound) t)
318 (setq xmltok-start (match-beginning 0))
319 (goto-char (1+ xmltok-start))
320 (let ((case-fold-search nil))
321 (xmltok-scan-after-lt))))
322
323(eval-when-compile
324
325 ;; A symbolic regexp is represented by a list whose CAR is the string
326 ;; containing the regexp and whose cdr is a list of symbolic names
327 ;; for the groups in the string.
328
329 ;; Construct a symbolic regexp from a regexp.
330 (defun xmltok-r (str)
331 (cons str nil))
332
333 ;; Concatenate zero of more regexps and symbolic regexps.
334 (defun xmltok+ (&rest args)
335 (let (strs names)
336 (while args
337 (let ((arg (car args)))
338 (if (stringp arg)
339 (setq strs (cons arg strs))
340 (setq strs (cons (car arg) strs))
341 (setq names (cons (cdr arg) names)))
342 (setq args (cdr args))))
343 (cons (apply 'concat (nreverse strs))
344 (apply 'append (nreverse names))))))
345
346(eval-when-compile
347 ;; Make a symbolic group named NAME from the regexp R.
348 ;; R may be a symbolic regexp or an ordinary regexp.
349 (defmacro xmltok-g (name &rest r)
350 (let ((sym (make-symbol "r")))
351 `(let ((,sym (xmltok+ ,@r)))
352 (if (stringp ,sym)
353 (cons (concat "\\(" ,sym "\\)") (cons ',name nil))
354 (cons (concat "\\(" (car ,sym) "\\)") (cons ',name (cdr ,sym)))))))
355
10545bd8 356 (defun xmltok-p (&rest r) (xmltok+ "\\(?:"
8cd39fb3
MH
357 (apply 'xmltok+ r)
358 "\\)"))
359
360 ;; Get the group index of ELEM in a LIST of symbols.
361 (defun xmltok-get-index (elem list)
362 (or elem
363 (error "Missing group name"))
364 (let ((found nil)
365 (i 1))
366 (while list
367 (cond ((eq elem (car list))
368 (setq found i)
369 (setq list nil))
370 (t
371 (setq i (1+ i))
372 (setq list (cdr list)))))
373 (or found
374 (error "Bad group name %s" elem))))
375
376 ;; Define a macro SYM using a symbolic regexp R.
377 ;; SYM can be called in three ways:
378 ;; (SYM regexp)
379 ;; expands to the regexp in R
380 ;; (SYM start G)
381 ;; expands to
382 ;; (match-beginning N)
383 ;; where N is the group index of G in R.
384 ;; (SYM end G)
385 ;; expands to
386 ;; (match-end N)
387 ;; where N is the group index of G in R.
388 (defmacro xmltok-defregexp (sym r)
389 `(defalias ',sym
390 (let ((r ,r))
391 `(macro lambda (action &optional group-name)
392 (cond ((eq action 'regexp)
393 ,(car r))
394 ((or (eq action 'start) (eq action 'beginning))
395 (list 'match-beginning (xmltok-get-index group-name
396 ',(cdr r))))
397 ((eq action 'end)
398 (list 'match-end (xmltok-get-index group-name
399 ',(cdr r))))
400 ((eq action 'string)
401 (list 'match-string
402 (xmltok-get-index group-name ',(cdr r))))
403 ((eq action 'string-no-properties)
404 (list 'match-string-no-properties
405 (xmltok-get-index group-name ',(cdr r))))
406 (t (error "Invalid action: %s" action))))))))
10545bd8 407
8cd39fb3
MH
408
409(eval-when-compile
410 (let* ((or "\\|")
411 (open "\\(?:")
8cd39fb3
MH
412 (close "\\)")
413 (name-start-char "[_[:alpha:]]")
414 (name-continue-not-start-char "[-.[:digit:]]")
415 (name-continue-char "[-._[:alnum:]]")
416 (* "*")
417 (+ "+")
418 (opt "?")
419 (question "\\?")
420 (s "[ \r\t\n]")
421 (s+ (concat s +))
422 (s* (concat s *))
423 (ncname (concat name-start-char name-continue-char *))
424 (entity-ref
425 (xmltok+ (xmltok-g entity-name ncname)
426 (xmltok-g entity-ref-close ";") opt))
427 (decimal-ref
428 (xmltok+ (xmltok-g decimal "[0-9]" +)
429 (xmltok-g decimal-ref-close ";") opt))
430 (hex-ref
431 (xmltok+ "x" open
432 (xmltok-g hex "[0-9a-fA-F]" +)
433 (xmltok-g hex-ref-close ";") opt
434 close opt))
435 (char-ref
436 (xmltok+ (xmltok-g number-sign "#")
437 open decimal-ref or hex-ref close opt))
438 (start-tag-close
439 (xmltok+ open (xmltok-g start-tag-close s* ">")
440 or open (xmltok-g empty-tag-slash s* "/")
441 (xmltok-g empty-tag-close ">") opt close
442 or (xmltok-g start-tag-s s+)
443 close))
444 (start-tag
445 (xmltok+ (xmltok-g start-tag-name
446 ncname (xmltok-g start-tag-colon ":" ncname) opt)
447 start-tag-close opt))
448 (end-tag
449 (xmltok+ (xmltok-g end-tag-slash "/")
450 open (xmltok-g end-tag-name
451 ncname
452 (xmltok-g end-tag-colon ":" ncname) opt)
453 (xmltok-g end-tag-close s* ">") opt
454 close opt))
455 (comment
456 (xmltok+ (xmltok-g markup-declaration "!")
457 (xmltok-g comment-first-dash "-"
458 (xmltok-g comment-open "-") opt) opt))
459 (cdata-section
460 (xmltok+ "!"
461 (xmltok-g marked-section-open "\\[")
462 open "C"
463 open "D"
464 open "A"
465 open "T"
466 open "A"
467 (xmltok-g cdata-section-open "\\[" ) opt
468 close opt ; A
469 close opt ; T
470 close opt ; A
471 close opt ; D
472 close opt)) ; C
473 (processing-instruction
474 (xmltok-g processing-instruction-question question)))
475
476 (xmltok-defregexp xmltok-ncname (xmltok+ open ncname close))
477
478 (xmltok-defregexp xmltok-after-amp
479 (xmltok+ entity-ref or char-ref))
480 (xmltok-defregexp xmltok-after-lt
481 (xmltok+ start-tag
482 or end-tag
483 ;; cdata-section must come before comment
484 ;; because we treat <! as a comment
485 ;; and Emacs doesn't do fully greedy matching
486 ;; by default
487 or cdata-section
488 or comment
489 or processing-instruction))
490 (xmltok-defregexp
491 xmltok-attribute
492 (let* ((lit1
493 (xmltok+ "'"
494 "[^<'&\r\n\t]*"
495 (xmltok-g complex1 "[&\r\n\t][^<']*") opt
496 "'"))
497 (lit2 (cons (replace-regexp-in-string "'" "\"" (car lit1))
498 '(complex2)))
499 (literal (xmltok-g literal lit1 or lit2))
500 (name (xmltok+ open (xmltok-g xmlns "xmlns") or ncname close
501 (xmltok-g colon ":" ncname) opt)))
502 (xmltok+ (xmltok-g name name)
503 s* "="
504 ;; If the literal isn't followed by what it should be,
505 ;; then the closing delimiter is probably really the
506 ;; opening delimiter of another literal, so don't
507 ;; absorb the literal in this case.
508 open s* literal start-tag-close close opt)))
509 (xmltok-defregexp
510 xmltok-xml-declaration
511 (let* ((literal-content "[-._:a-zA-Z0-9]+")
512 (literal
513 (concat open "\"" literal-content "\""
514 or "'" literal-content "'" close))
515 (version-att
516 (xmltok+ open
517 s+ (xmltok-g version-name "version")
518 s* "="
519 s* (xmltok-g version-value literal)
520 close opt))
521 (encoding-att
522 (xmltok+ open
523 s+ (xmltok-g encoding-name "encoding")
524 s* "="
525 s* (xmltok-g encoding-value literal)
526 close opt))
527 (yes-no
528 (concat open "yes" or "no" close))
529 (standalone-att
530 (xmltok+ open
531 s+ (xmltok-g standalone-name "standalone")
532 s* "="
533 s* (xmltok-g standalone-value
534 "\"" yes-no "\"" or "'" yes-no "'")
535 close opt)))
536 (xmltok+ "<" question "xml"
537 version-att
538 encoding-att
539 standalone-att
540 s* question ">")))
541 (xmltok-defregexp
542 xmltok-prolog
543 (let* ((single-char (xmltok-g single-char "[[|,(\"'>]"))
544 (internal-subset-close (xmltok-g internal-subset-close
545 "][ \t\r\n]*>"))
546 (starts-with-close-paren
547 (xmltok-g close-paren
548 ")"
549 (xmltok-p
550 (xmltok-g close-paren-occur "[+?]")
551 or
552 (xmltok-g close-paren-star "\\*"))
553 opt))
554 (starts-with-percent
555 (xmltok-g percent
556 "%" (xmltok-g param-entity-ref
557 ncname
558 (xmltok-g param-entity-ref-close
559 ";") opt) opt))
560 (starts-with-nmtoken-not-name
561 (xmltok-g nmtoken
562 (xmltok-p name-continue-not-start-char or ":")
563 (xmltok-p name-continue-char or ":") *))
564 (nmtoken-after-colon
565 (xmltok+
566 (xmltok-p name-continue-not-start-char or ":")
567 (xmltok-p name-continue-char or ":") *
568 or
569 name-start-char
570 name-continue-char *
571 ":"
572 (xmltok-p name-continue-char or ":") *))
573 (after-ncname
574 (xmltok+ (xmltok-g ncname-nmtoken
575 ":" (xmltok-p nmtoken-after-colon))
576 or (xmltok-p (xmltok-g colon ":" ncname)
577 (xmltok-g colon-name-occur "[?+*]") opt)
578 or (xmltok-g ncname-occur "[?+*]")
579 or (xmltok-g ncname-colon ":")))
580 (starts-with-name
581 (xmltok-g name ncname (xmltok-p after-ncname) opt))
582 (starts-with-hash
583 (xmltok-g pound
584 "#" (xmltok-g hash-name ncname)))
585 (markup-declaration
586 (xmltok-g markup-declaration
587 "!" (xmltok-p (xmltok-g comment-first-dash "-"
588 (xmltok-g comment-open "-") opt)
589 or (xmltok-g named-markup-declaration
590 ncname)) opt))
591 (after-lt
592 (xmltok+ markup-declaration
593 or (xmltok-g processing-instruction-question
594 question)
595 or (xmltok-g instance-start
596 ncname)))
597 (starts-with-lt (xmltok-g less-than "<" (xmltok-p after-lt) opt)))
598 (xmltok+ starts-with-lt
599 or single-char
600 or starts-with-close-paren
601 or starts-with-percent
602 or starts-with-name
603 or starts-with-nmtoken-not-name
604 or starts-with-hash
605 or internal-subset-close)))))
606
607(defconst xmltok-ncname-regexp (xmltok-ncname regexp))
608
609(defun xmltok-scan-after-lt ()
610 (cond ((not (looking-at (xmltok-after-lt regexp)))
611 (xmltok-add-error "`<' that is not markup must be entered as `&lt;'")
612 (setq xmltok-type 'not-well-formed))
613 (t
614 (goto-char (match-end 0))
615 (cond ((xmltok-after-lt start start-tag-close)
616 (setq xmltok-name-end
617 (xmltok-after-lt end start-tag-name))
618 (setq xmltok-name-colon
619 (xmltok-after-lt start start-tag-colon))
620 (setq xmltok-attributes nil)
621 (setq xmltok-namespace-attributes nil)
622 (setq xmltok-type 'start-tag))
623 ((xmltok-after-lt start end-tag-close)
624 (setq xmltok-name-end
625 (xmltok-after-lt end end-tag-name))
626 (setq xmltok-name-colon
627 (xmltok-after-lt start end-tag-colon))
628 (setq xmltok-type 'end-tag))
629 ((xmltok-after-lt start start-tag-s)
630 (setq xmltok-name-end
631 (xmltok-after-lt end start-tag-name))
632 (setq xmltok-name-colon
633 (xmltok-after-lt start start-tag-colon))
634 (setq xmltok-namespace-attributes nil)
635 (setq xmltok-attributes nil)
636 (xmltok-scan-attributes)
637 xmltok-type)
638 ((xmltok-after-lt start empty-tag-close)
639 (setq xmltok-name-end
640 (xmltok-after-lt end start-tag-name))
641 (setq xmltok-name-colon
642 (xmltok-after-lt start start-tag-colon))
643 (setq xmltok-attributes nil)
644 (setq xmltok-namespace-attributes nil)
645 (setq xmltok-type 'empty-element))
646 ((xmltok-after-lt start cdata-section-open)
647 (setq xmltok-type
c9990474
SM
648 (progn (search-forward "]]>" nil 'move)
649 'cdata-section)))
8cd39fb3
MH
650 ((xmltok-after-lt start processing-instruction-question)
651 (xmltok-scan-after-processing-instruction-open))
652 ((xmltok-after-lt start comment-open)
653 (xmltok-scan-after-comment-open))
654 ((xmltok-after-lt start empty-tag-slash)
655 (setq xmltok-name-end
656 (xmltok-after-lt end start-tag-name))
657 (setq xmltok-name-colon
658 (xmltok-after-lt start start-tag-colon))
659 (setq xmltok-attributes nil)
660 (setq xmltok-namespace-attributes nil)
661 (xmltok-add-error "Expected `/>'" (1- (point)))
662 (setq xmltok-type 'partial-empty-element))
663 ((xmltok-after-lt start start-tag-name)
664 (xmltok-add-error "Missing `>'"
665 nil
666 (1+ xmltok-start))
667 (setq xmltok-name-end
668 (xmltok-after-lt end start-tag-name))
669 (setq xmltok-name-colon
670 (xmltok-after-lt start start-tag-colon))
671 (setq xmltok-namespace-attributes nil)
672 (setq xmltok-attributes nil)
673 (setq xmltok-type 'partial-start-tag))
674 ((xmltok-after-lt start end-tag-name)
675 (setq xmltok-name-end (xmltok-after-lt end end-tag-name))
676 (setq xmltok-name-colon
677 (xmltok-after-lt start end-tag-colon))
678 (cond ((and (not xmltok-name-colon)
679 (eq (char-after) ?:))
680 (goto-char (1+ (point)))
681 (xmltok-add-error "Expected name following `:'"
682 (1- (point))))
683 (t
684 (xmltok-add-error "Missing `>'"
685 nil
686 (1+ xmltok-start))))
687 (setq xmltok-type 'partial-end-tag))
688 ((xmltok-after-lt start end-tag-slash)
689 (xmltok-add-error "Expected name following `</'")
690 (setq xmltok-name-end nil)
691 (setq xmltok-name-colon nil)
692 (setq xmltok-type 'partial-end-tag))
693 ((xmltok-after-lt start marked-section-open)
694 (xmltok-add-error "Expected `CDATA[' after `<!['"
695 xmltok-start
696 (+ 3 xmltok-start))
697 (setq xmltok-type 'not-well-formed))
698 ((xmltok-after-lt start comment-first-dash)
699 (xmltok-add-error "Expected `-' after `<!-'"
700 xmltok-start
701 (+ 3 xmltok-start))
702 (setq xmltok-type 'not-well-formed))
703 ((xmltok-after-lt start markup-declaration)
704 (xmltok-add-error "Expected `[CDATA[' or `--' after `<!'"
705 xmltok-start
706 (+ 2 xmltok-start))
707 (setq xmltok-type 'not-well-formed))
708 (t
709 (xmltok-add-error "Not well-formed")
710 (setq xmltok-type 'not-well-formed))))))
711
712;; XXX This should be unified with
713;; xmltok-scan-prolog-after-processing-instruction-open
714;; XXX maybe should include rest of line (up to any <,>) in unclosed PI
715(defun xmltok-scan-after-processing-instruction-open ()
c9990474
SM
716 (search-forward "?>" nil 'move)
717 (cond ((not (save-excursion
718 (goto-char (+ 2 xmltok-start))
719 (and (looking-at (xmltok-ncname regexp))
720 (setq xmltok-name-end (match-end 0)))))
721 (setq xmltok-name-end (+ xmltok-start 2))
722 (xmltok-add-error "<? not followed by name"
723 (+ xmltok-start 2)
724 (+ xmltok-start 3)))
725 ((not (or (memq (char-after xmltok-name-end)
726 '(?\n ?\t ?\r ? ))
727 (= xmltok-name-end (- (point) 2))))
728 (xmltok-add-error "Target not followed by whitespace"
729 xmltok-name-end
730 (1+ xmltok-name-end)))
731 ((and (= xmltok-name-end (+ xmltok-start 5))
732 (save-excursion
733 (goto-char (+ xmltok-start 2))
734 (let ((case-fold-search t))
735 (looking-at "xml"))))
736 (xmltok-add-error "Processing instruction target is xml"
737 (+ xmltok-start 2)
738 (+ xmltok-start 5))))
739 (setq xmltok-type 'processing-instruction))
10545bd8 740
8cd39fb3 741(defun xmltok-scan-after-comment-open ()
c9990474
SM
742 (let ((found-- (search-forward "--" nil 'move)))
743 (setq xmltok-type
744 (cond ((or (eq (char-after) ?>) (not found--))
745 (goto-char (1+ (point)))
746 'comment)
747 (t
748 ;; just include the <!-- in the token
749 (goto-char (+ xmltok-start 4))
750 ;; Need do this after the goto-char because
751 ;; marked error should just apply to <!--
752 (xmltok-add-error "First following `--' not followed by `>'")
753 'not-well-formed)))))
8cd39fb3
MH
754
755(defun xmltok-scan-attributes ()
756 (let ((recovering nil)
757 (atts-needing-normalization nil))
758 (while (cond ((or (looking-at (xmltok-attribute regexp))
759 ;; use non-greedy group
760 (when (looking-at (concat "[^<>\n]+?"
761 (xmltok-attribute regexp)))
762 (unless recovering
763 (xmltok-add-error "Malformed attribute"
764 (point)
765 (save-excursion
766 (goto-char (xmltok-attribute start
767 name))
768 (skip-chars-backward "\r\n\t ")
769 (point))))
770 t))
771 (setq recovering nil)
772 (goto-char (match-end 0))
773 (let ((att (xmltok-add-attribute)))
774 (when att
775 (setq atts-needing-normalization
776 (cons att atts-needing-normalization))))
777 (cond ((xmltok-attribute start start-tag-s) t)
778 ((xmltok-attribute start start-tag-close)
779 (setq xmltok-type 'start-tag)
780 nil)
781 ((xmltok-attribute start empty-tag-close)
782 (setq xmltok-type 'empty-element)
783 nil)
784 ((xmltok-attribute start empty-tag-slash)
785 (setq xmltok-type 'partial-empty-element)
786 (xmltok-add-error "Expected `/>'"
787 (1- (point)))
788 nil)
789 ((looking-at "[ \t\r\n]*[\"']")
790 (goto-char (match-end 0))
791 (xmltok-add-error "Missing closing delimiter"
792 (1- (point)))
793 (setq recovering t)
794 t)
795 ((looking-at "[ \t]*\\([^ \t\r\n\"'=<>/]+\\)[ \t\r\n/>]")
796 (goto-char (match-end 1))
797 (xmltok-add-error "Attribute value not quoted"
798 (match-beginning 1))
799 (setq recovering t)
800 t)
801 (t
802 (xmltok-add-error "Missing attribute value"
803 (1- (point)))
804 (setq recovering t)
805 t)))
806 ((looking-at "[^<>\n]*/>")
807 (let ((start (point)))
808 (goto-char (match-end 0))
809 (unless recovering
810 (xmltok-add-error "Malformed empty-element"
811 start
812 (- (point) 2))))
813 (setq xmltok-type 'empty-element)
814 nil)
815 ((looking-at "[^<>\n]*>")
816 (let ((start (point)))
817 (goto-char (match-end 0))
818 (unless recovering
819 (xmltok-add-error "Malformed start-tag"
820 start
821 (1- (point)))))
822 (setq xmltok-type 'start-tag)
823 nil)
824 (t
825 (when recovering
826 (skip-chars-forward "^<>\n"))
827 (xmltok-add-error "Missing `>'"
828 xmltok-start
829 (1+ xmltok-start))
830 (setq xmltok-type 'partial-start-tag)
831 nil)))
832 (while atts-needing-normalization
833 (xmltok-normalize-attribute (car atts-needing-normalization))
834 (setq atts-needing-normalization (cdr atts-needing-normalization))))
835 (setq xmltok-attributes
836 (nreverse xmltok-attributes))
837 (setq xmltok-namespace-attributes
838 (nreverse xmltok-namespace-attributes)))
839
840(defun xmltok-add-attribute ()
841 "Return the attribute if it needs normalizing, otherwise nil."
842 (let* ((needs-normalizing nil)
843 (att
844 (if (xmltok-attribute start literal)
845 (progn
846 (setq needs-normalizing
847 (or (xmltok-attribute start complex1)
848 (xmltok-attribute start complex2)))
849 (xmltok-make-attribute (xmltok-attribute start name)
850 (xmltok-attribute start colon)
851 (xmltok-attribute end name)
852 (1+ (xmltok-attribute start literal))
853 (1- (xmltok-attribute end literal))
854 (not needs-normalizing)))
855 (xmltok-make-attribute (xmltok-attribute start name)
856 (xmltok-attribute start colon)
857 (xmltok-attribute end name)))))
858 (if (xmltok-attribute start xmlns)
859 (setq xmltok-namespace-attributes
860 (cons att xmltok-namespace-attributes))
861 (setq xmltok-attributes
862 (cons att xmltok-attributes)))
863 (and needs-normalizing
864 att)))
10545bd8 865
8cd39fb3
MH
866(defun xmltok-normalize-attribute (att)
867 (let ((end (xmltok-attribute-value-end att))
868 (well-formed t)
869 (value-parts nil)
870 (refs nil))
871 (save-excursion
872 (goto-char (xmltok-attribute-value-start att))
873 (while (progn
874 (let ((n (skip-chars-forward "^\r\t\n&" end)))
875 (when (> n 0)
876 (setq value-parts
877 (cons (buffer-substring-no-properties (- (point) n)
878 (point))
879 value-parts))))
880 (when (< (point) end)
881 (goto-char (1+ (point)))
882 (cond ((eq (char-before) ?\&)
883 (let ((xmltok-start (1- (point)))
884 xmltok-type xmltok-replacement)
885 (xmltok-scan-after-amp
886 (lambda (start end)
887 (xmltok-handle-entity start end t)))
888 (cond ((or (eq xmltok-type 'char-ref)
889 (eq xmltok-type 'entity-ref))
890 (setq refs
891 (cons (vector xmltok-type
892 xmltok-start
893 (point))
894 refs))
895 (if xmltok-replacement
896 (setq value-parts
897 (cons xmltok-replacement
898 value-parts))
899 (setq well-formed nil)))
900 (t (setq well-formed nil)))))
901 (t (setq value-parts
902 (cons " " value-parts)))))
903 (< (point) end))))
904 (when well-formed
905 (aset att 5 (apply 'concat (nreverse value-parts))))
906 (aset att 6 (nreverse refs))))
907
908(defun xmltok-scan-after-amp (entity-handler)
909 (cond ((not (looking-at (xmltok-after-amp regexp)))
910 (xmltok-add-error "`&' that is not markup must be entered as `&amp;'")
911 (setq xmltok-type 'not-well-formed))
912 (t
913 (goto-char (match-end 0))
914 (cond ((xmltok-after-amp start entity-ref-close)
915 (funcall entity-handler
916 (xmltok-after-amp start entity-name)
917 (xmltok-after-amp end entity-name))
918 (setq xmltok-type 'entity-ref))
919 ((xmltok-after-amp start decimal-ref-close)
920 (xmltok-scan-char-ref (xmltok-after-amp start decimal)
921 (xmltok-after-amp end decimal)
922 10))
923 ((xmltok-after-amp start hex-ref-close)
924 (xmltok-scan-char-ref (xmltok-after-amp start hex)
925 (xmltok-after-amp end hex)
926 16))
927 ((xmltok-after-amp start number-sign)
928 (xmltok-add-error "Missing character number")
929 (setq xmltok-type 'not-well-formed))
930 (t
931 (xmltok-add-error "Missing closing `;'")
932 (setq xmltok-type 'not-well-formed))))))
933
934(defconst xmltok-entity-error-messages
935 '((unparsed . "Referenced entity is unparsed")
936 (not-well-formed . "Referenced entity is not well-formed")
937 (external nil . "Referenced entity is external")
938 (element nil . "Referenced entity contains <")))
939
940(defun xmltok-handle-entity (start end &optional attributep)
941 (let* ((name (buffer-substring-no-properties start end))
942 (name-def (assoc name xmltok-dtd))
943 (def (cdr name-def)))
944 (cond ((setq xmltok-replacement (and (consp def)
945 (if attributep
946 (cdr def)
947 (car def)))))
948 ((null name-def)
949 (unless (eq (car xmltok-dtd) t)
950 (xmltok-add-error "Referenced entity has not been defined"
951 start
952 end)))
953 ((and attributep (consp def))
954 (xmltok-add-error "Referenced entity contains <"
955 start
956 end))
957 (t
958 (let ((err (cdr (assq def xmltok-entity-error-messages))))
959 (when (consp err)
960 (setq err (if attributep (cdr err) (car err))))
961 (when err
962 (xmltok-add-error err start end)))))))
963
964(defun xmltok-scan-char-ref (start end base)
965 (setq xmltok-replacement
9499da38 966 (let ((n (string-to-number (buffer-substring-no-properties start end)
8cd39fb3
MH
967 base)))
968 (cond ((and (integerp n) (xmltok-valid-char-p n))
969 (setq n (xmltok-unicode-to-char n))
970 (and n (string n)))
971 (t
972 (xmltok-add-error "Invalid character code" start end)
973 nil))))
974 (setq xmltok-type 'char-ref))
975
976(defun xmltok-char-number (start end)
977 (let* ((base (if (eq (char-after (+ start 2)) ?x)
978 16
979 10))
9499da38 980 (n (string-to-number
8cd39fb3
MH
981 (buffer-substring-no-properties (+ start (if (= base 16) 3 2))
982 (1- end))
983 base)))
984 (and (integerp n)
985 (xmltok-valid-char-p n)
986 n)))
987
8cd39fb3 988(defun xmltok-valid-char-p (n)
10545bd8 989 "Return non-nil if N is the Unicode code of a valid XML character."
8cd39fb3
MH
990 (cond ((< n #x20) (memq n '(#xA #xD #x9)))
991 ((< n #xD800) t)
992 ((< n #xE000) nil)
993 ((< n #xFFFE) t)
994 (t (and (> n #xFFFF)
995 (< n #x110000)))))
996
997(defun xmltok-unicode-to-char (n)
998 "Return the character corresponding to Unicode scalar value N.
999Return nil if unsupported in Emacs."
1000 (decode-char 'ucs n))
1001
1002;;; Prolog parsing
1003
1004(defvar xmltok-contains-doctype nil)
1005(defvar xmltok-doctype-external-subset-flag nil)
1006(defvar xmltok-internal-subset-start nil)
1007(defvar xmltok-had-param-entity-ref nil)
1008(defvar xmltok-prolog-regions nil)
1009(defvar xmltok-standalone nil
10545bd8 1010 "Non-nil if there was an XML declaration specifying standalone=\"yes\".")
8cd39fb3
MH
1011(defvar xmltok-markup-declaration-doctype-flag nil)
1012
1013(defconst xmltok-predefined-entity-alist
1014 '(("lt" "<" . "<")
1015 ("gt" ">" . ">")
1016 ("amp" "&" . "&")
1017 ("apos" "'" . "'")
1018 ("quot" "\"" . "\"")))
1019
1020(defun xmltok-forward-prolog ()
1021 "Move forward to the end of the XML prolog.
1022
1023Returns a list of vectors [TYPE START END] where TYPE is a symbol and
1024START and END are integers giving the start and end of the region of
1025that type. TYPE can be one of xml-declaration,
1026xml-declaration-attribute-name, xml-declaration-attribute-value,
1027comment, processing-instruction-left, processing-instruction-right,
10545bd8 1028markup-declaration-open, markup-declaration-close,
8cd39fb3
MH
1029internal-subset-open, internal-subset-close, hash-name, keyword,
1030literal, encoding-name.
c9990474 1031Adds to `xmltok-errors' as appropriate."
8cd39fb3
MH
1032 (let ((case-fold-search nil)
1033 xmltok-start
1034 xmltok-type
1035 xmltok-prolog-regions
1036 xmltok-contains-doctype
1037 xmltok-internal-subset-start
1038 xmltok-had-param-entity-ref
1039 xmltok-standalone
1040 xmltok-doctype-external-subset-flag
1041 xmltok-markup-declaration-doctype-flag)
1042 (setq xmltok-dtd xmltok-predefined-entity-alist)
1043 (xmltok-scan-xml-declaration)
1044 (xmltok-next-prolog-token)
da3e5ebb 1045 (while (condition-case nil
8cd39fb3
MH
1046 (when (xmltok-parse-prolog-item)
1047 (xmltok-next-prolog-token))
1048 (xmltok-markup-declaration-parse-error
1049 (xmltok-skip-markup-declaration))))
1050 (when xmltok-internal-subset-start
1051 (xmltok-add-error "No closing ]"
1052 (1- xmltok-internal-subset-start)
1053 xmltok-internal-subset-start))
1054 (xmltok-parse-entities)
8cd39fb3
MH
1055 (nreverse xmltok-prolog-regions)))
1056
1057(defconst xmltok-bad-xml-decl-regexp
1058 "[ \t\r\n]*<\\?xml\\(?:[ \t\r\n]\\|\\?>\\)")
1059
1060;;;###autoload
1061(defun xmltok-get-declared-encoding-position (&optional limit)
1062 "Return the position of the encoding in the XML declaration at point.
1063If there is a well-formed XML declaration starting at point and it
1064contains an encoding declaration, then return (START . END)
1065where START and END are the positions of the start and the end
1066of the encoding name; if there is no encoding declaration return
1067the position where and encoding declaration could be inserted.
10545bd8
JB
1068If there is XML that is not well-formed that looks like an XML
1069declaration, return nil. Otherwise, return t.
8cd39fb3
MH
1070If LIMIT is non-nil, then do not consider characters beyond LIMIT."
1071 (cond ((let ((case-fold-search nil))
1072 (and (looking-at (xmltok-xml-declaration regexp))
1073 (or (not limit) (<= (match-end 0) limit))))
1074 (let ((end (xmltok-xml-declaration end encoding-value)))
1075 (if end
1076 (cons (1+ (xmltok-xml-declaration start encoding-value))
1077 (1- end))
1078 (or (xmltok-xml-declaration end version-value)
1079 (+ (point) 5)))))
1080 ((not (let ((case-fold-search t))
1081 (looking-at xmltok-bad-xml-decl-regexp))))))
10545bd8 1082
8cd39fb3
MH
1083(defun xmltok-scan-xml-declaration ()
1084 (when (looking-at (xmltok-xml-declaration regexp))
1085 (xmltok-add-prolog-region 'xml-declaration (point) (match-end 0))
1086 (goto-char (match-end 0))
1087 (when (xmltok-xml-declaration start version-name)
1088 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1089 (xmltok-xml-declaration start version-name)
1090 (xmltok-xml-declaration end version-name))
1091 (let ((start (xmltok-xml-declaration start version-value))
1092 (end (xmltok-xml-declaration end version-value)))
1093 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1094 start
1095 end)))
1096 ;; XXX need to check encoding name
1097 ;; Should start with letter, not contain colon
1098 (when (xmltok-xml-declaration start encoding-name)
1099 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1100 (xmltok-xml-declaration start encoding-name)
1101 (xmltok-xml-declaration end encoding-name))
1102 (let ((start (xmltok-xml-declaration start encoding-value))
1103 (end (xmltok-xml-declaration end encoding-value)))
1104 (xmltok-add-prolog-region 'encoding-name
1105 (1+ start)
1106 (1- end))
1107 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1108 start
1109 end)))
1110 (when (xmltok-xml-declaration start standalone-name)
1111 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1112 (xmltok-xml-declaration start standalone-name)
1113 (xmltok-xml-declaration end standalone-name))
1114 (let ((start (xmltok-xml-declaration start standalone-value))
1115 (end (xmltok-xml-declaration end standalone-value)))
1116 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1117 start
1118 end)
1119 (setq xmltok-standalone
1120 (string= (buffer-substring-no-properties (1+ start) (1- end))
1121 "yes"))))
1122 t))
1123
1124(defconst xmltok-markup-declaration-alist
1125 '(("ELEMENT" . xmltok-parse-element-declaration)
1126 ("ATTLIST" . xmltok-parse-attlist-declaration)
1127 ("ENTITY" . xmltok-parse-entity-declaration)
1128 ("NOTATION" . xmltok-parse-notation-declaration)))
1129
1130(defun xmltok-parse-prolog-item ()
1131 (cond ((eq xmltok-type 'comment)
1132 (xmltok-add-prolog-region 'comment
1133 xmltok-start
1134 (point))
1135 t)
1136 ((eq xmltok-type 'processing-instruction))
1137 ((eq xmltok-type 'named-markup-declaration)
1138 (setq xmltok-markup-declaration-doctype-flag nil)
1139 (xmltok-add-prolog-region 'markup-declaration-open
1140 xmltok-start
1141 (point))
1142 (let* ((name (buffer-substring-no-properties
1143 (+ xmltok-start 2)
1144 (point)))
1145 (fun (cdr (assoc name xmltok-markup-declaration-alist))))
1146 (cond (fun
1147 (unless xmltok-internal-subset-start
1148 (xmltok-add-error
1149 "Declaration allowed only in internal subset"))
1150 (funcall fun))
1151 ((string= name "DOCTYPE")
1152 (xmltok-parse-doctype))
1153 (t
1154 (xmltok-add-error "Unknown markup declaration"
1155 (+ xmltok-start 2))
1156 (xmltok-next-prolog-token)
1157 (xmltok-markup-declaration-parse-error))))
1158 t)
1159 ((or (eq xmltok-type 'end-prolog)
1160 (not xmltok-type))
1161 nil)
1162 ((eq xmltok-type 'internal-subset-close)
1163 (xmltok-add-prolog-region 'internal-subset-close
1164 xmltok-start
1165 (1+ xmltok-start))
1166 (xmltok-add-prolog-region 'markup-declaration-close
1167 (1- (point))
1168 (point))
1169 (if xmltok-internal-subset-start
1170 (setq xmltok-internal-subset-start nil)
1171 (xmltok-add-error "]> outside internal subset"))
1172 t)
1173 ((eq xmltok-type 'param-entity-ref)
1174 (if xmltok-internal-subset-start
1175 (setq xmltok-had-param-entity-ref t)
1176 (xmltok-add-error "Parameter entity reference outside document type declaration"))
1177 t)
1178 ;; If we don't do this, we can get thousands of errors when
1179 ;; a plain text file is parsed.
1180 ((not xmltok-internal-subset-start)
1181 (when (let ((err (car xmltok-errors)))
1182 (or (not err)
1183 (<= (xmltok-error-end err) xmltok-start)))
1184 (goto-char xmltok-start))
1185 nil)
1186 ((eq xmltok-type 'not-well-formed) t)
1187 (t
1188 (xmltok-add-error "Token allowed only inside markup declaration")
1189 t)))
1190
1191(defun xmltok-parse-doctype ()
1192 (setq xmltok-markup-declaration-doctype-flag t)
1193 (xmltok-next-prolog-token)
1194 (when xmltok-internal-subset-start
1195 (xmltok-add-error "DOCTYPE declaration not allowed in internal subset")
1196 (xmltok-markup-declaration-parse-error))
1197 (when xmltok-contains-doctype
1198 (xmltok-add-error "Duplicate DOCTYPE declaration")
1199 (xmltok-markup-declaration-parse-error))
1200 (setq xmltok-contains-doctype t)
1201 (xmltok-require-token 'name 'prefixed-name)
1202 (xmltok-require-next-token "SYSTEM" "PUBLIC" ?\[ ?>)
1203 (cond ((eq xmltok-type ?\[)
1204 (setq xmltok-internal-subset-start (point)))
1205 ((eq xmltok-type ?>))
1206 (t
1207 (setq xmltok-doctype-external-subset-flag t)
1208 (xmltok-parse-external-id)
1209 (xmltok-require-token ?\[ ?>)
1210 (when (eq xmltok-type ?\[)
1211 (setq xmltok-internal-subset-start (point))))))
1212
1213(defun xmltok-parse-attlist-declaration ()
1214 (xmltok-require-next-token 'prefixed-name 'name)
1215 (while (progn
1216 (xmltok-require-next-token ?> 'name 'prefixed-name)
1217 (if (eq xmltok-type ?>)
1218 nil
1219 (xmltok-require-next-token ?\(
1220 "CDATA"
1221 "ID"
1222 "IDREF"
1223 "IDREFS"
1224 "ENTITY"
1225 "ENTITIES"
1226 "NMTOKEN"
1227 "NMTOKENS"
1228 "NOTATION")
1229 (cond ((eq xmltok-type ?\()
1230 (xmltok-parse-nmtoken-group))
1231 ((string= (xmltok-current-token-string)
1232 "NOTATION")
1233 (xmltok-require-next-token ?\()
1234 (xmltok-parse-nmtoken-group)))
1235 (xmltok-require-next-token "#IMPLIED"
1236 "#REQUIRED"
1237 "#FIXED"
1238 'literal)
1239 (when (string= (xmltok-current-token-string) "#FIXED")
1240 (xmltok-require-next-token 'literal))
1241 t))))
10545bd8 1242
8cd39fb3
MH
1243(defun xmltok-parse-nmtoken-group ()
1244 (while (progn
1245 (xmltok-require-next-token 'nmtoken 'prefixed-name 'name)
1246 (xmltok-require-next-token ?| ?\))
1247 (eq xmltok-type ?|))))
1248
1249(defun xmltok-parse-element-declaration ()
1250 (xmltok-require-next-token 'name 'prefixed-name)
1251 (xmltok-require-next-token "EMPTY" "ANY" ?\()
1252 (when (eq xmltok-type ?\()
1253 (xmltok-require-next-token "#PCDATA"
1254 'name
1255 'prefixed-name
1256 'name-occur
1257 ?\()
1258 (cond ((eq xmltok-type 'hash-name)
1259 (xmltok-require-next-token ?| ?\) 'close-paren-star)
1260 (while (eq xmltok-type ?|)
1261 (xmltok-require-next-token 'name 'prefixed-name)
1262 (xmltok-require-next-token 'close-paren-star ?|)))
1263 (t (xmltok-parse-model-group))))
1264 (xmltok-require-next-token ?>))
1265
1266(defun xmltok-parse-model-group ()
1267 (xmltok-parse-model-group-member)
1268 (xmltok-require-next-token ?|
1269 ?,
1270 ?\)
1271 'close-paren-star
1272 'close-paren-occur)
1273 (when (memq xmltok-type '(?, ?|))
1274 (let ((connector xmltok-type))
1275 (while (progn
1276 (xmltok-next-prolog-token)
1277 (xmltok-parse-model-group-member)
1278 (xmltok-require-next-token connector
1279 ?\)
1280 'close-paren-star
1281 'close-paren-occur)
1282 (eq xmltok-type connector))))))
10545bd8 1283
8cd39fb3
MH
1284(defun xmltok-parse-model-group-member ()
1285 (xmltok-require-token 'name
1286 'prefixed-name
1287 'name-occur
1288 ?\()
1289 (when (eq xmltok-type ?\()
1290 (xmltok-next-prolog-token)
1291 (xmltok-parse-model-group)))
10545bd8 1292
8cd39fb3
MH
1293(defun xmltok-parse-entity-declaration ()
1294 (let (paramp name)
1295 (xmltok-require-next-token 'name ?%)
1296 (when (eq xmltok-type ?%)
1297 (setq paramp t)
1298 (xmltok-require-next-token 'name))
1299 (setq name (xmltok-current-token-string))
1300 (xmltok-require-next-token 'literal "SYSTEM" "PUBLIC")
1301 (cond ((eq xmltok-type 'literal)
1302 (let ((replacement (xmltok-parse-entity-value)))
1303 (unless paramp
1304 (xmltok-define-entity name replacement)))
1305 (xmltok-require-next-token ?>))
1306 (t
1307 (xmltok-parse-external-id)
1308 (if paramp
1309 (xmltok-require-token ?>)
1310 (xmltok-require-token ?> "NDATA")
1311 (if (eq xmltok-type ?>)
1312 (xmltok-define-entity name 'external)
1313 (xmltok-require-next-token 'name)
1314 (xmltok-require-next-token ?>)
1315 (xmltok-define-entity name 'unparsed)))))))
1316
1317(defun xmltok-define-entity (name value)
1318 (when (and (or (not xmltok-had-param-entity-ref)
1319 xmltok-standalone)
1320 (not (assoc name xmltok-dtd)))
1321 (setq xmltok-dtd
1322 (cons (cons name value) xmltok-dtd))))
10545bd8 1323
8cd39fb3
MH
1324(defun xmltok-parse-entity-value ()
1325 (let ((lim (1- (point)))
1326 (well-formed t)
1327 value-parts
1328 start)
1329 (save-excursion
1330 (goto-char (1+ xmltok-start))
1331 (setq start (point))
1332 (while (progn
1333 (skip-chars-forward "^%&" lim)
1334 (when (< (point) lim)
1335 (goto-char (1+ (point)))
1336 (cond ((eq (char-before) ?%)
1337 (xmltok-add-error "Parameter entity references are not allowed in the internal subset"
1338 (1- (point))
1339 (point))
1340 (setq well-formed nil))
1341 (t
1342 (let ((xmltok-start (1- (point)))
1343 xmltok-type xmltok-replacement)
da3e5ebb 1344 (xmltok-scan-after-amp (lambda (_start _end)))
8cd39fb3
MH
1345 (cond ((eq xmltok-type 'char-ref)
1346 (setq value-parts
1347 (cons (buffer-substring-no-properties
1348 start
1349 xmltok-start)
1350 value-parts))
1351 (setq value-parts
1352 (cons xmltok-replacement
1353 value-parts))
1354 (setq start (point)))
1355 ((eq xmltok-type 'not-well-formed)
1356 (setq well-formed nil))))))
1357 t))))
1358 (if (not well-formed)
1359 nil
1360 (apply 'concat
1361 (nreverse (cons (buffer-substring-no-properties start lim)
1362 value-parts))))))
10545bd8 1363
8cd39fb3
MH
1364(defun xmltok-parse-notation-declaration ()
1365 (xmltok-require-next-token 'name)
1366 (xmltok-require-next-token "SYSTEM" "PUBLIC")
1367 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
1368 (xmltok-require-next-token 'literal)
1369 (cond (publicp
1370 (xmltok-require-next-token 'literal ?>)
1371 (unless (eq xmltok-type ?>)
1372 (xmltok-require-next-token ?>)))
1373 (t (xmltok-require-next-token ?>)))))
1374
1375(defun xmltok-parse-external-id ()
1376 (xmltok-require-token "SYSTEM" "PUBLIC")
1377 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
1378 (xmltok-require-next-token 'literal)
1379 (when publicp
1380 (xmltok-require-next-token 'literal)))
1381 (xmltok-next-prolog-token))
1382
1383(defun xmltok-require-next-token (&rest types)
1384 (xmltok-next-prolog-token)
1385 (apply 'xmltok-require-token types))
1386
1387(defun xmltok-require-token (&rest types)
1388 ;; XXX Generate a more helpful error message
1389 (while (and (not (let ((type (car types)))
1390 (if (stringp (car types))
1391 (string= (xmltok-current-token-string) type)
1392 (eq type xmltok-type))))
1393 (setq types (cdr types))))
1394 (unless types
1395 (when (and xmltok-type
1396 (not (eq xmltok-type 'not-well-formed)))
1397 (xmltok-add-error "Unexpected token"))
1398 (xmltok-markup-declaration-parse-error))
1399 (let ((region-type (xmltok-prolog-region-type (car types))))
1400 (when region-type
1401 (xmltok-add-prolog-region region-type
1402 xmltok-start
1403 (point)))))
1404
1405(defun xmltok-current-token-string ()
1406 (buffer-substring-no-properties xmltok-start (point)))
1407
54bd972f
SM
1408(define-error 'xmltok-markup-declaration-parse-error
1409 "Syntax error in markup declaration")
8cd39fb3
MH
1410
1411(defun xmltok-markup-declaration-parse-error ()
1412 (signal 'xmltok-markup-declaration-parse-error nil))
1413
1414(defun xmltok-skip-markup-declaration ()
1415 (while (cond ((eq xmltok-type ?>)
1416 (xmltok-next-prolog-token)
1417 nil)
1418 ((and xmltok-markup-declaration-doctype-flag
1419 (eq xmltok-type ?\[))
1420 (setq xmltok-internal-subset-start (point))
1421 (xmltok-next-prolog-token)
1422 nil)
1423 ((memq xmltok-type '(nil
1424 end-prolog
1425 named-markup-declaration
1426 comment
1427 processing-instruction))
1428 nil)
1429 ((and xmltok-internal-subset-start
1430 (eq xmltok-type 'internal-subset-close))
1431 nil)
1432 (t (xmltok-next-prolog-token) t)))
1433 xmltok-type)
1434
1435(defun xmltok-prolog-region-type (required)
1436 (cond ((cdr (assq xmltok-type
1437 '((literal . literal)
1438 (?> . markup-declaration-close)
1439 (?\[ . internal-subset-open)
1440 (hash-name . hash-name)))))
1441 ((and (stringp required) (eq xmltok-type 'name))
1442 'keyword)))
10545bd8 1443
8cd39fb3 1444;; Return new token type.
10545bd8 1445
8cd39fb3
MH
1446(defun xmltok-next-prolog-token ()
1447 (skip-chars-forward " \t\r\n")
1448 (setq xmltok-start (point))
1449 (cond ((not (and (looking-at (xmltok-prolog regexp))
1450 (goto-char (match-end 0))))
1451 (let ((ch (char-after)))
1452 (cond (ch
1453 (goto-char (1+ (point)))
1454 (xmltok-add-error "Illegal char in prolog")
1455 (setq xmltok-type 'not-well-formed))
1456 (t (setq xmltok-type nil)))))
1457 ((or (xmltok-prolog start ncname-occur)
1458 (xmltok-prolog start colon-name-occur))
1459 (setq xmltok-name-end (1- (point)))
1460 (setq xmltok-name-colon (xmltok-prolog start colon))
1461 (setq xmltok-type 'name-occur))
1462 ((xmltok-prolog start colon)
1463 (setq xmltok-name-end (point))
1464 (setq xmltok-name-colon (xmltok-prolog start colon))
1465 (unless (looking-at "[ \t\r\n>),|[%]")
1466 (xmltok-add-error "Missing space after name"))
1467 (setq xmltok-type 'prefixed-name))
1468 ((or (xmltok-prolog start ncname-nmtoken)
1469 (xmltok-prolog start ncname-colon))
1470 (unless (looking-at "[ \t\r\n>),|[%]")
1471 (xmltok-add-error "Missing space after name token"))
1472 (setq xmltok-type 'nmtoken))
10545bd8 1473 ((xmltok-prolog start name)
8cd39fb3
MH
1474 (setq xmltok-name-end (point))
1475 (setq xmltok-name-colon nil)
1476 (unless (looking-at "[ \t\r\n>),|[%]")
1477 (xmltok-add-error "Missing space after name"))
1478 (setq xmltok-type 'name))
10545bd8 1479 ((xmltok-prolog start hash-name)
8cd39fb3
MH
1480 (setq xmltok-name-end (point))
1481 (unless (looking-at "[ \t\r\n>)|%]")
1482 (xmltok-add-error "Missing space after name"))
1483 (setq xmltok-type 'hash-name))
1484 ((xmltok-prolog start processing-instruction-question)
1485 (xmltok-scan-prolog-after-processing-instruction-open))
1486 ((xmltok-prolog start comment-open)
1487 ;; XXX if not-well-formed, ignore some stuff
1488 (xmltok-scan-after-comment-open))
1489 ((xmltok-prolog start named-markup-declaration)
1490 (setq xmltok-type 'named-markup-declaration))
1491 ((xmltok-prolog start instance-start)
1492 (goto-char xmltok-start)
1493 (setq xmltok-type 'end-prolog))
1494 ((xmltok-prolog start close-paren-star)
1495 (setq xmltok-type 'close-paren-star))
1496 ((xmltok-prolog start close-paren-occur)
1497 (setq xmltok-type 'close-paren-occur))
1498 ((xmltok-prolog start close-paren)
1499 (unless (looking-at "[ \t\r\n>,|)]")
1500 (xmltok-add-error "Missing space after )"))
1501 (setq xmltok-type ?\)))
1502 ((xmltok-prolog start single-char)
1503 (let ((ch (char-before)))
1504 (cond ((memq ch '(?\" ?\'))
1505 (xmltok-scan-prolog-literal))
1506 (t (setq xmltok-type ch)))))
1507 ((xmltok-prolog start percent)
1508 (cond ((xmltok-prolog start param-entity-ref-close)
1509 (setq xmltok-name-end (1- (point)))
1510 (setq xmltok-type 'param-entity-ref))
1511 ((xmltok-prolog start param-entity-ref)
1512 (xmltok-add-error "Missing ;")
1513 (setq xmltok-name-end (point))
1514 (setq xmltok-type 'param-entity-ref))
1515 ((looking-at "[ \t\r\n%]")
1516 (setq xmltok-type ?%))
1517 (t
1518 (xmltok-add-error "Expected name after %")
1519 (setq xmltok-type 'not-well-formed))))
1520 ((xmltok-prolog start nmtoken)
1521 (unless (looking-at "[ \t\r\n>),|[%]")
1522 (xmltok-add-error "Missing space after name token"))
1523 (setq xmltok-type 'nmtoken))
1524 ((xmltok-prolog start internal-subset-close)
1525 (setq xmltok-type 'internal-subset-close))
1526 ((xmltok-prolog start pound)
1527 (xmltok-add-error "Expected name after #")
1528 (setq xmltok-type 'not-well-formed))
1529 ((xmltok-prolog start markup-declaration)
1530 (xmltok-add-error "Expected name or -- after <!")
1531 (setq xmltok-type 'not-well-formed))
1532 ((xmltok-prolog start comment-first-dash)
1533 (xmltok-add-error "Expected <!--")
1534 (setq xmltok-type 'not-well-formed))
1535 ((xmltok-prolog start less-than)
1536 (xmltok-add-error "Incomplete markup")
1537 (setq xmltok-type 'not-well-formed))
1538 (t (error "Unhandled token in prolog %s"
1539 (match-string-no-properties 0)))))
1540
1541(defun xmltok-scan-prolog-literal ()
1542 (let* ((delim (string (char-before)))
1543 (safe-end (save-excursion
1544 (skip-chars-forward (concat "^<>[]" delim))
1545 (point)))
1546 (end (save-excursion
1547 (goto-char safe-end)
1548 (search-forward delim nil t))))
c9990474
SM
1549 (cond ((or (not end)
1550 (save-excursion
1551 (goto-char end)
1552 (looking-at "[ \t\r\n>%[]")))
1553 (goto-char end))
1554 ((eq (1+ safe-end) end)
1555 (goto-char end)
1556 (xmltok-add-error (format "Missing space after %s" delim)
1557 safe-end)))
1558 (setq xmltok-type 'literal)))
8cd39fb3
MH
1559
1560(defun xmltok-scan-prolog-after-processing-instruction-open ()
c9990474
SM
1561 (search-forward "?>" nil 'move)
1562 (let* ((end (point))
1563 (target
1564 (save-excursion
1565 (goto-char (+ xmltok-start 2))
1566 (and (looking-at (xmltok-ncname regexp))
1567 (or (memq (char-after (match-end 0))
1568 '(?\n ?\t ?\r ? ))
1569 (= (match-end 0) (- end 2)))
1570 (match-string-no-properties 0)))))
1571 (cond ((not target)
1572 (xmltok-add-error "\
8cd39fb3 1573Processing instruction does not start with a name"
c9990474
SM
1574 (+ xmltok-start 2)
1575 (+ xmltok-start 3)))
1576 ((not (and (= (length target) 3)
1577 (let ((case-fold-search t))
1578 (string-match "xml" target)))))
1579 ((= xmltok-start 1)
1580 (xmltok-add-error "Invalid XML declaration"
1581 xmltok-start
1582 (point)))
1583 ((save-excursion
1584 (goto-char xmltok-start)
1585 (looking-at (xmltok-xml-declaration regexp)))
1586 (xmltok-add-error "XML declaration not at beginning of file"
1587 xmltok-start
1588 (point)))
1589 (t
1590 (xmltok-add-error "Processing instruction has target of xml"
1591 (+ xmltok-start 2)
1592 (+ xmltok-start 5))))
1593 (xmltok-add-prolog-region 'processing-instruction-left
1594 xmltok-start
1595 (+ xmltok-start
1596 2
1597 (if target
1598 (length target)
1599 0)))
1600 (xmltok-add-prolog-region 'processing-instruction-right
1601 (if target
1602 (save-excursion
1603 (goto-char (+ xmltok-start
1604 (length target)
1605 2))
1606 (skip-chars-forward " \t\r\n")
1607 (point))
1608 (+ xmltok-start 2))
1609 (point)))
1610 (setq xmltok-type 'processing-instruction))
8cd39fb3
MH
1611
1612(defun xmltok-parse-entities ()
1613 (let ((todo xmltok-dtd))
1614 (when (and (or xmltok-had-param-entity-ref
1615 xmltok-doctype-external-subset-flag)
1616 (not xmltok-standalone))
1617 (setq xmltok-dtd (cons t xmltok-dtd)))
1618 (while todo
1619 (xmltok-parse-entity (car todo))
1620 (setq todo (cdr todo)))))
10545bd8 1621
8cd39fb3
MH
1622(defun xmltok-parse-entity (name-def)
1623 (let ((def (cdr name-def))
10545bd8 1624 ;; in case its value is buffer local
8cd39fb3
MH
1625 (xmltok-dtd xmltok-dtd)
1626 buf)
1627 (when (stringp def)
1628 (if (string-match "\\`[^&<\t\r\n]*\\'" def)
1629 (setcdr name-def (cons def def))
1630 (setcdr name-def 'not-well-formed) ; avoid infinite expansion loops
1631 (setq buf (get-buffer-create
1632 (format " *Entity %s*" (car name-def))))
2adaf057 1633 (with-current-buffer buf
8cd39fb3
MH
1634 (erase-buffer)
1635 (insert def)
1636 (goto-char (point-min))
1637 (setcdr name-def
1638 (xmltok-parse-entity-replacement)))
1639 (kill-buffer buf)))))
1640
1641(defun xmltok-parse-entity-replacement ()
1642 (let ((def (cons "" "")))
1643 (while (let* ((start (point))
1644 (found (re-search-forward "[<&\t\r\n]\\|]]>" nil t))
1645 (ch (and found (char-before)))
1646 (str (buffer-substring-no-properties
1647 start
1648 (if found
1649 (match-beginning 0)
1650 (point-max)))))
1651 (setq def
1652 (xmltok-append-entity-def def
1653 (cons str str)))
1654 (cond ((not found) nil)
1655 ((eq ch ?>)
1656 (setq def 'not-well-formed)
1657 nil)
1658 ((eq ch ?<)
1659 (xmltok-save
1660 (setq xmltok-start (1- (point)))
1661 (xmltok-scan-after-lt)
1662 (setq def
1663 (xmltok-append-entity-def
1664 def
1665 (cond ((memq xmltok-type
1666 '(start-tag
1667 end-tag
1668 empty-element))
1669 'element)
1670 ((memq xmltok-type
1671 '(comment
1672 processing-instruction))
1673 (cons "" nil))
1674 ((eq xmltok-type
1675 'cdata-section)
1676 (cons (buffer-substring-no-properties
1677 (+ xmltok-start 9)
1678 (- (point) 3))
1679 nil))
1680 (t 'not-well-formed)))))
1681 t)
1682 ((eq ch ?&)
1683 (let ((xmltok-start (1- (point)))
1684 xmltok-type
1685 xmltok-replacement
1686 xmltok-errors)
1687 (xmltok-scan-after-amp 'xmltok-handle-nested-entity)
1688 (cond ((eq xmltok-type 'entity-ref)
1689 (setq def
1690 (xmltok-append-entity-def
1691 def
1692 xmltok-replacement)))
1693 ((eq xmltok-type 'char-ref)
1694 (setq def
1695 (xmltok-append-entity-def
1696 def
1697 (if xmltok-replacement
1698 (cons xmltok-replacement
1699 xmltok-replacement)
1700 (and xmltok-errors 'not-well-formed)))))
1701 (t
1702 (setq def 'not-well-formed))))
1703 t)
1704 (t
1705 (setq def
1706 (xmltok-append-entity-def
1707 def
1708 (cons (match-string-no-properties 0)
1709 " ")))
1710 t))))
1711 def))
1712
1713(defun xmltok-handle-nested-entity (start end)
1714 (let* ((name-def (assoc (buffer-substring-no-properties start end)
1715 xmltok-dtd))
1716 (def (cdr name-def)))
1717 (when (stringp def)
1718 (xmltok-parse-entity name-def)
1719 (setq def (cdr name-def)))
1720 (setq xmltok-replacement
1721 (cond ((null name-def)
1722 (if (eq (car xmltok-dtd) t)
1723 nil
1724 'not-well-formed))
1725 ((eq def 'unparsed) 'not-well-formed)
1726 (t def)))))
10545bd8 1727
8cd39fb3
MH
1728(defun xmltok-append-entity-def (d1 d2)
1729 (cond ((consp d1)
1730 (if (consp d2)
1731 (cons (concat (car d1) (car d2))
1732 (and (cdr d1)
1733 (cdr d2)
1734 (concat (cdr d1) (cdr d2))))
1735 d2))
1736 ((consp d2) d1)
1737 (t
1738 (let ((defs '(not-well-formed external element)))
1739 (while (not (or (eq (car defs) d1)
1740 (eq (car defs) d2)))
1741 (setq defs (cdr defs)))
1742 (car defs)))))
1743
1744(defun xmltok-add-prolog-region (type start end)
1745 (setq xmltok-prolog-regions
1746 (cons (vector type start end)
1747 xmltok-prolog-regions)))
1748
1749(defun xmltok-merge-attributes ()
10545bd8 1750 "Return a list merging `xmltok-attributes' and `xmltok-namespace-attributes'.
8cd39fb3
MH
1751The members of the merged list are in order of occurrence in the
1752document. The list may share list structure with `xmltok-attributes'
1753and `xmltok-namespace-attributes'."
1754 (cond ((not xmltok-namespace-attributes)
1755 xmltok-attributes)
1756 ((not xmltok-attributes)
1757 xmltok-namespace-attributes)
1758 (t
1759 (let ((atts1 xmltok-attributes)
1760 (atts2 xmltok-namespace-attributes)
1761 merged)
1762 (while (and atts1 atts2)
1763 (cond ((< (xmltok-attribute-name-start (car atts1))
1764 (xmltok-attribute-name-start (car atts2)))
1765 (setq merged (cons (car atts1) merged))
1766 (setq atts1 (cdr atts1)))
1767 (t
1768 (setq merged (cons (car atts2) merged))
1769 (setq atts2 (cdr atts2)))))
1770 (setq merged (nreverse merged))
1771 (cond (atts1 (setq merged (nconc merged atts1)))
1772 (atts2 (setq merged (nconc merged atts2))))
1773 merged))))
1774
1775;;; Testing
1776
1777(defun xmltok-forward-test ()
1778 (interactive)
1779 (if (xmltok-forward)
1780 (message "Scanned %s" xmltok-type)
1781 (message "Scanned nothing")))
1782
1783(defun xmltok-next-prolog-token-test ()
1784 (interactive)
1785 (if (xmltok-next-prolog-token)
1786 (message "Scanned %s"
1787 (if (integerp xmltok-type)
1788 (string xmltok-type)
1789 xmltok-type))
1790 (message "Scanned end of file")))
1791
1792(provide 'xmltok)
1793
1794;;; xmltok.el ends here