;;; xmltok.el --- XML tokenization -*- lexical-binding:t -*- ;; Copyright (C) 2003, 2007-2014 Free Software Foundation, Inc. ;; Author: James Clark ;; Keywords: wp, hypermedia, languages, XML ;; This file is part of GNU Emacs. ;; GNU Emacs is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or ;; (at your option) any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs. If not, see . ;;; Commentary: ;; This implements an XML 1.0 parser. It also implements the XML ;; Namespaces Recommendation. It is designed to be conforming, but it ;; works a bit differently from a normal XML parser. An XML document ;; consists of the prolog and an instance. The prolog is parsed as a ;; single unit using `xmltok-forward-prolog'. The instance is ;; considered as a sequence of tokens, where a token is something like ;; a start-tag, a comment, a chunk of data or a CDATA section. The ;; tokenization of the instance is stateless: the tokenization of one ;; part of the instance does not depend on tokenization of the ;; preceding part of the instance. This allows the instance to be ;; parsed incrementally. The main entry point is `xmltok-forward': ;; this can be called at any point in the instance provided it is ;; between tokens. The other entry point is `xmltok-forward-special' ;; which skips over tokens other comments, processing instructions or ;; CDATA sections (i.e. the constructs in an instance that can contain ;; less than signs that don't start a token). ;; ;; This is a non-validating XML 1.0 processor. It does not resolve ;; parameter entities (including the external DTD subset) and it does ;; not resolve external general entities. ;; ;; It is non-conformant by design in the following respects. ;; ;; 1. It expects the client to detect aspects of well-formedness that ;; are not internal to a single token, specifically checking that ;; end-tags match start-tags and that the instance contains exactly ;; one element. ;; ;; 2. It expects the client to detect duplicate attributes. Detection ;; of duplicate attributes after expansion of namespace prefixes ;; requires the namespace processing state. Detection of duplicate ;; attributes before expansion of namespace prefixes does not, but is ;; redundant given that the client will do detection of duplicate ;; attributes after expansion of namespace prefixes. ;; ;; 3. It allows the client to recover from well-formedness errors. ;; This is essential for use in applications where the document is ;; being parsed during the editing process. ;; ;; 4. It does not support documents that do not conform to the lexical ;; requirements of the XML Namespaces Recommendation (e.g. a document ;; with a colon in an entity name). ;; ;; There are also a number of things that have not yet been ;; implemented that make it non-conformant. ;; ;; 1. It does not implement default attributes. ATTLIST declarations ;; are parsed, but no checking is done on the content of attribute ;; value literals specifying default attribute values, and default ;; attribute values are not reported to the client. ;; ;; 2. It does not implement internal entities containing elements. If ;; an internal entity is referenced and parsing its replacement text ;; yields one or more tags, then it will skip the reference and ;; report this to the client. ;; ;; 3. It does not check the syntax of public identifiers in the DTD. ;; ;; 4. It allows some non-ASCII characters in certain situations where ;; it should not. For example, it only enforces XML 1.0's ;; restrictions on name characters strictly for ASCII characters. The ;; problem here is XML's character model is based squarely on Unicode, ;; whereas Emacs's is not (as of version 21). It is not clear what ;; the right thing to do is. ;;; Code: (defvar xmltok-type nil) (defvar xmltok-start nil) (defvar xmltok-name-colon nil) (defvar xmltok-name-end nil) (defvar xmltok-replacement nil "String containing replacement for a character or entity reference.") (defvar xmltok-attributes nil "List containing attributes of last scanned element. Each member of the list is a vector representing an attribute, which can be accessed using the functions `xmltok-attribute-name-start', `xmltok-attribute-name-colon', `xmltok-attribute-name-end', `xmltok-attribute-value-start', `xmltok-attribute-value-end', `xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.") (defvar xmltok-namespace-attributes nil "List containing namespace declarations of last scanned element. List has same format as `xmltok-attributes'.") (defvar xmltok-dtd nil "Information about the DTD used by `xmltok-forward'. `xmltok-forward-prolog' sets this up. It consists of an alist of general entity names vs definitions. The first member of the alist is t if references to entities not in the alist are well-formed \(e.g. because there's an external subset that wasn't parsed). Each general entity name is a string. The definition is either nil, a symbol, a string, a cons cell. If the definition is nil, then it means that it's an internal entity but the result of parsing it is unknown. If it is a symbol, then the symbol is either `unparsed', meaning the entity is an unparsed entity, `external', meaning the entity is or references an external entity, `element', meaning the entity includes one or more elements, or `not-well-formed', meaning the replacement text is not well-formed. If the definition is a string, then the replacement text of the entity is that string; this happens only during the parsing of the prolog. If the definition is a cons cell \(ER . AR), then ER specifies the string that results from referencing the entity in element content and AR is either nil, meaning the replacement text included a <, or a string which is the normalized attribute value.") (defvar xmltok-errors nil "List of errors detected by `xmltok-forward' and `xmltok-forward-prolog'. When `xmltok-forward' and `xmltok-forward-prolog' detect a well-formedness error, they will add an entry to the beginning of this list. Each entry is a vector [MESSAGE START END], where MESSAGE is a string giving the error message and START and END are integers indicating the position of the error.") (defmacro xmltok-save (&rest body) (declare (indent 0) (debug t)) `(let (xmltok-type xmltok-start xmltok-name-colon xmltok-name-end xmltok-replacement xmltok-attributes xmltok-namespace-attributes xmltok-errors) ,@body)) (defsubst xmltok-attribute-name-start (att) (aref att 0)) (defsubst xmltok-attribute-name-colon (att) (aref att 1)) (defsubst xmltok-attribute-name-end (att) (aref att 2)) (defsubst xmltok-attribute-value-start (att) (aref att 3)) (defsubst xmltok-attribute-value-end (att) (aref att 4)) (defsubst xmltok-attribute-raw-normalized-value (att) "Return an object representing the normalized value of ATT. This can be t indicating that the normalized value is the same as the buffer substring from the start to the end of the value, or nil indicating that the value is not well-formed or a string." (aref att 5)) (defsubst xmltok-attribute-refs (att) "Return a list of the entity and character references in ATT. Each member is a vector [TYPE START END] where TYPE is either char-ref or entity-ref and START and END are integers giving the start and end of the reference. Nested entity references are not included in the list." (aref att 6)) (defun xmltok-attribute-prefix (att) (let ((colon (xmltok-attribute-name-colon att))) (and colon (buffer-substring-no-properties (xmltok-attribute-name-start att) colon)))) (defun xmltok-attribute-local-name (att) (let ((colon (xmltok-attribute-name-colon att))) (buffer-substring-no-properties (if colon (1+ colon) (xmltok-attribute-name-start att)) (xmltok-attribute-name-end att)))) (defun xmltok-attribute-value (att) (let ((rnv (xmltok-attribute-raw-normalized-value att))) (and rnv (if (stringp rnv) rnv (buffer-substring-no-properties (xmltok-attribute-value-start att) (xmltok-attribute-value-end att)))))) (defun xmltok-start-tag-prefix () (and xmltok-name-colon (buffer-substring-no-properties (1+ xmltok-start) xmltok-name-colon))) (defun xmltok-start-tag-local-name () (buffer-substring-no-properties (1+ (or xmltok-name-colon xmltok-start)) xmltok-name-end)) (defun xmltok-end-tag-prefix () (and xmltok-name-colon (buffer-substring-no-properties (+ 2 xmltok-start) xmltok-name-colon))) (defun xmltok-end-tag-local-name () (buffer-substring-no-properties (if xmltok-name-colon (1+ xmltok-name-colon) (+ 2 xmltok-start)) xmltok-name-end)) (defun xmltok-start-tag-qname () (buffer-substring-no-properties (+ xmltok-start 1) xmltok-name-end)) (defun xmltok-end-tag-qname () (buffer-substring-no-properties (+ xmltok-start 2) xmltok-name-end)) (defsubst xmltok-make-attribute (name-begin name-colon name-end &optional value-begin value-end raw-normalized-value) "Make an attribute. RAW-NORMALIZED-VALUE is nil if the value is not well-formed, t if the normalized value is the string between VALUE-BEGIN and VALUE-END, otherwise a STRING giving the value." (vector name-begin name-colon name-end value-begin value-end raw-normalized-value nil)) (defsubst xmltok-error-message (err) (aref err 0)) (defsubst xmltok-error-start (err) (aref err 1)) (defsubst xmltok-error-end (err) (aref err 2)) (defsubst xmltok-make-error (message start end) (vector message start end)) (defun xmltok-add-error (message &optional start end) (setq xmltok-errors (cons (xmltok-make-error message (or start xmltok-start) (or end (point))) xmltok-errors))) (defun xmltok-forward () (setq xmltok-start (point)) (let* ((case-fold-search nil) (space-count (skip-chars-forward " \t\r\n")) (ch (char-after))) (cond ((eq ch ?\<) (cond ((> space-count 0) (setq xmltok-type 'space)) (t (forward-char 1) (xmltok-scan-after-lt)))) ((eq ch ?\&) (cond ((> space-count 0) (setq xmltok-type 'space)) (t (forward-char 1) (xmltok-scan-after-amp 'xmltok-handle-entity)))) ((re-search-forward "[<&]\\|\\(]]>\\)" nil t) (cond ((not (match-beginning 1)) (goto-char (match-beginning 0)) ;; must have got a non-space char (setq xmltok-type 'data)) ((= (match-beginning 1) xmltok-start) (xmltok-add-error "Found `]]>' not closing a CDATA section") (setq xmltok-type 'not-well-formed)) (t (goto-char (match-beginning 0)) (setq xmltok-type (if (= (point) (+ xmltok-start space-count)) 'space 'data))))) ((eq ch nil) (setq xmltok-type (if (> space-count 0) 'space nil))) (t (goto-char (point-max)) (setq xmltok-type 'data))))) (defun xmltok-forward-special (bound) "Scan forward past the first special token starting at or after point. Return nil if there is no special token that starts before BOUND. CDATA sections, processing instructions and comments (and indeed anything starting with < following by ? or !) count as special. Return the type of the token." (when (re-search-forward "<[?!]" (1+ bound) t) (setq xmltok-start (match-beginning 0)) (goto-char (1+ xmltok-start)) (let ((case-fold-search nil)) (xmltok-scan-after-lt)))) (eval-when-compile ;; A symbolic regexp is represented by a list whose CAR is the string ;; containing the regexp and whose cdr is a list of symbolic names ;; for the groups in the string. ;; Construct a symbolic regexp from a regexp. (defun xmltok-r (str) (cons str nil)) ;; Concatenate zero of more regexps and symbolic regexps. (defun xmltok+ (&rest args) (let (strs names) (while args (let ((arg (car args))) (if (stringp arg) (setq strs (cons arg strs)) (setq strs (cons (car arg) strs)) (setq names (cons (cdr arg) names))) (setq args (cdr args)))) (cons (apply 'concat (nreverse strs)) (apply 'append (nreverse names)))))) (eval-when-compile ;; Make a symbolic group named NAME from the regexp R. ;; R may be a symbolic regexp or an ordinary regexp. (defmacro xmltok-g (name &rest r) (let ((sym (make-symbol "r"))) `(let ((,sym (xmltok+ ,@r))) (if (stringp ,sym) (cons (concat "\\(" ,sym "\\)") (cons ',name nil)) (cons (concat "\\(" (car ,sym) "\\)") (cons ',name (cdr ,sym))))))) (defun xmltok-p (&rest r) (xmltok+ "\\(?:" (apply 'xmltok+ r) "\\)")) ;; Get the group index of ELEM in a LIST of symbols. (defun xmltok-get-index (elem list) (or elem (error "Missing group name")) (let ((found nil) (i 1)) (while list (cond ((eq elem (car list)) (setq found i) (setq list nil)) (t (setq i (1+ i)) (setq list (cdr list))))) (or found (error "Bad group name %s" elem)))) ;; Define a macro SYM using a symbolic regexp R. ;; SYM can be called in three ways: ;; (SYM regexp) ;; expands to the regexp in R ;; (SYM start G) ;; expands to ;; (match-beginning N) ;; where N is the group index of G in R. ;; (SYM end G) ;; expands to ;; (match-end N) ;; where N is the group index of G in R. (defmacro xmltok-defregexp (sym r) `(defalias ',sym (let ((r ,r)) `(macro lambda (action &optional group-name) (cond ((eq action 'regexp) ,(car r)) ((or (eq action 'start) (eq action 'beginning)) (list 'match-beginning (xmltok-get-index group-name ',(cdr r)))) ((eq action 'end) (list 'match-end (xmltok-get-index group-name ',(cdr r)))) ((eq action 'string) (list 'match-string (xmltok-get-index group-name ',(cdr r)))) ((eq action 'string-no-properties) (list 'match-string-no-properties (xmltok-get-index group-name ',(cdr r)))) (t (error "Invalid action: %s" action)))))))) (eval-when-compile (let* ((or "\\|") (open "\\(?:") (close "\\)") (name-start-char "[_[:alpha:]]") (name-continue-not-start-char "[-.[:digit:]]") (name-continue-char "[-._[:alnum:]]") (* "*") (+ "+") (opt "?") (question "\\?") (s "[ \r\t\n]") (s+ (concat s +)) (s* (concat s *)) (ncname (concat name-start-char name-continue-char *)) (entity-ref (xmltok+ (xmltok-g entity-name ncname) (xmltok-g entity-ref-close ";") opt)) (decimal-ref (xmltok+ (xmltok-g decimal "[0-9]" +) (xmltok-g decimal-ref-close ";") opt)) (hex-ref (xmltok+ "x" open (xmltok-g hex "[0-9a-fA-F]" +) (xmltok-g hex-ref-close ";") opt close opt)) (char-ref (xmltok+ (xmltok-g number-sign "#") open decimal-ref or hex-ref close opt)) (start-tag-close (xmltok+ open (xmltok-g start-tag-close s* ">") or open (xmltok-g empty-tag-slash s* "/") (xmltok-g empty-tag-close ">") opt close or (xmltok-g start-tag-s s+) close)) (start-tag (xmltok+ (xmltok-g start-tag-name ncname (xmltok-g start-tag-colon ":" ncname) opt) start-tag-close opt)) (end-tag (xmltok+ (xmltok-g end-tag-slash "/") open (xmltok-g end-tag-name ncname (xmltok-g end-tag-colon ":" ncname) opt) (xmltok-g end-tag-close s* ">") opt close opt)) (comment (xmltok+ (xmltok-g markup-declaration "!") (xmltok-g comment-first-dash "-" (xmltok-g comment-open "-") opt) opt)) (cdata-section (xmltok+ "!" (xmltok-g marked-section-open "\\[") open "C" open "D" open "A" open "T" open "A" (xmltok-g cdata-section-open "\\[" ) opt close opt ; A close opt ; T close opt ; A close opt ; D close opt)) ; C (processing-instruction (xmltok-g processing-instruction-question question))) (xmltok-defregexp xmltok-ncname (xmltok+ open ncname close)) (xmltok-defregexp xmltok-after-amp (xmltok+ entity-ref or char-ref)) (xmltok-defregexp xmltok-after-lt (xmltok+ start-tag or end-tag ;; cdata-section must come before comment ;; because we treat "))) (xmltok-defregexp xmltok-prolog (let* ((single-char (xmltok-g single-char "[[|,(\"'>]")) (internal-subset-close (xmltok-g internal-subset-close "][ \t\r\n]*>")) (starts-with-close-paren (xmltok-g close-paren ")" (xmltok-p (xmltok-g close-paren-occur "[+?]") or (xmltok-g close-paren-star "\\*")) opt)) (starts-with-percent (xmltok-g percent "%" (xmltok-g param-entity-ref ncname (xmltok-g param-entity-ref-close ";") opt) opt)) (starts-with-nmtoken-not-name (xmltok-g nmtoken (xmltok-p name-continue-not-start-char or ":") (xmltok-p name-continue-char or ":") *)) (nmtoken-after-colon (xmltok+ (xmltok-p name-continue-not-start-char or ":") (xmltok-p name-continue-char or ":") * or name-start-char name-continue-char * ":" (xmltok-p name-continue-char or ":") *)) (after-ncname (xmltok+ (xmltok-g ncname-nmtoken ":" (xmltok-p nmtoken-after-colon)) or (xmltok-p (xmltok-g colon ":" ncname) (xmltok-g colon-name-occur "[?+*]") opt) or (xmltok-g ncname-occur "[?+*]") or (xmltok-g ncname-colon ":"))) (starts-with-name (xmltok-g name ncname (xmltok-p after-ncname) opt)) (starts-with-hash (xmltok-g pound "#" (xmltok-g hash-name ncname))) (markup-declaration (xmltok-g markup-declaration "!" (xmltok-p (xmltok-g comment-first-dash "-" (xmltok-g comment-open "-") opt) or (xmltok-g named-markup-declaration ncname)) opt)) (after-lt (xmltok+ markup-declaration or (xmltok-g processing-instruction-question question) or (xmltok-g instance-start ncname))) (starts-with-lt (xmltok-g less-than "<" (xmltok-p after-lt) opt))) (xmltok+ starts-with-lt or single-char or starts-with-close-paren or starts-with-percent or starts-with-name or starts-with-nmtoken-not-name or starts-with-hash or internal-subset-close))))) (defconst xmltok-ncname-regexp (xmltok-ncname regexp)) (defun xmltok-scan-after-lt () (cond ((not (looking-at (xmltok-after-lt regexp))) (xmltok-add-error "`<' that is not markup must be entered as `<'") (setq xmltok-type 'not-well-formed)) (t (goto-char (match-end 0)) (cond ((xmltok-after-lt start start-tag-close) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-attributes nil) (setq xmltok-namespace-attributes nil) (setq xmltok-type 'start-tag)) ((xmltok-after-lt start end-tag-close) (setq xmltok-name-end (xmltok-after-lt end end-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start end-tag-colon)) (setq xmltok-type 'end-tag)) ((xmltok-after-lt start start-tag-s) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-namespace-attributes nil) (setq xmltok-attributes nil) (xmltok-scan-attributes) xmltok-type) ((xmltok-after-lt start empty-tag-close) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-attributes nil) (setq xmltok-namespace-attributes nil) (setq xmltok-type 'empty-element)) ((xmltok-after-lt start cdata-section-open) (setq xmltok-type (progn (search-forward "]]>" nil 'move) 'cdata-section))) ((xmltok-after-lt start processing-instruction-question) (xmltok-scan-after-processing-instruction-open)) ((xmltok-after-lt start comment-open) (xmltok-scan-after-comment-open)) ((xmltok-after-lt start empty-tag-slash) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-attributes nil) (setq xmltok-namespace-attributes nil) (xmltok-add-error "Expected `/>'" (1- (point))) (setq xmltok-type 'partial-empty-element)) ((xmltok-after-lt start start-tag-name) (xmltok-add-error "Missing `>'" nil (1+ xmltok-start)) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-namespace-attributes nil) (setq xmltok-attributes nil) (setq xmltok-type 'partial-start-tag)) ((xmltok-after-lt start end-tag-name) (setq xmltok-name-end (xmltok-after-lt end end-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start end-tag-colon)) (cond ((and (not xmltok-name-colon) (eq (char-after) ?:)) (goto-char (1+ (point))) (xmltok-add-error "Expected name following `:'" (1- (point)))) (t (xmltok-add-error "Missing `>'" nil (1+ xmltok-start)))) (setq xmltok-type 'partial-end-tag)) ((xmltok-after-lt start end-tag-slash) (xmltok-add-error "Expected name following `) in unclosed PI (defun xmltok-scan-after-processing-instruction-open () (search-forward "?>" nil 'move) (cond ((not (save-excursion (goto-char (+ 2 xmltok-start)) (and (looking-at (xmltok-ncname regexp)) (setq xmltok-name-end (match-end 0))))) (setq xmltok-name-end (+ xmltok-start 2)) (xmltok-add-error ") (not found--)) (goto-char (1+ (point))) 'comment) (t ;; just include the