| 1 | ;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode |
| 2 | |
| 3 | ;; Copyright (C) 2003, 2007-2012 Free Software Foundation, Inc. |
| 4 | |
| 5 | ;; Author: James Clark |
| 6 | ;; Keywords: XML |
| 7 | |
| 8 | ;; This file is part of GNU Emacs. |
| 9 | |
| 10 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
| 11 | ;; it under the terms of the GNU General Public License as published by |
| 12 | ;; the Free Software Foundation, either version 3 of the License, or |
| 13 | ;; (at your option) any later version. |
| 14 | |
| 15 | ;; GNU Emacs is distributed in the hope that it will be useful, |
| 16 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 17 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 18 | ;; GNU General Public License for more details. |
| 19 | |
| 20 | ;; You should have received a copy of the GNU General Public License |
| 21 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
| 22 | |
| 23 | ;;; Commentary: |
| 24 | |
| 25 | ;; Entry point is `nxml-parse-file'. |
| 26 | |
| 27 | ;;; Code: |
| 28 | |
| 29 | (require 'nxml-util) |
| 30 | (require 'xmltok) |
| 31 | (require 'nxml-enc) |
| 32 | (require 'nxml-ns) |
| 33 | |
| 34 | (defvar nxml-parse-file-name nil) |
| 35 | |
| 36 | (defvar nxml-validate-function nil |
| 37 | "Either nil or a function called by `nxml-parse-file' to perform validation. |
| 38 | The function will be called once for each start-tag or end-tag. The |
| 39 | function is passed two arguments TEXT and START-TAG. For a start-tag, |
| 40 | START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in |
| 41 | the same form as returned by `nxml-parse-file'. For an end-tag, |
| 42 | START-TAG is nil. TEXT is a string containing the text immediately |
| 43 | preceding the tag, or nil if there was no such text. An empty element |
| 44 | is treated as a start-tag followed by an end-tag. |
| 45 | |
| 46 | For a start-tag, the namespace state will be the state after |
| 47 | processing the namespace declarations in the start-tag. For an |
| 48 | end-tag, the namespace state will be the state before popping the |
| 49 | namespace declarations for the corresponding start-tag. |
| 50 | |
| 51 | The function must return nil if no error is detected or a |
| 52 | cons (MESSAGE . LOCATION) where MESSAGE is a string containing |
| 53 | an error message and LOCATION indicates what caused the error |
| 54 | as follows: |
| 55 | |
| 56 | - nil indicates the tag as whole caused it; this is always allowed; |
| 57 | |
| 58 | - text indicates the text caused it; this is allowed only if |
| 59 | TEXT is non-nil; |
| 60 | |
| 61 | - tag-close indicates the close of the tag caused it; this is |
| 62 | allowed only if START-TAG is non-nil; |
| 63 | |
| 64 | - (attribute-name . N) indicates that the name of the Nth attribute |
| 65 | caused it; N counts from 0; this is allowed only if START-TAG is non-nil |
| 66 | and N must be less than the number of attributes; |
| 67 | |
| 68 | - (attribute-value . N) indicates that the value of the Nth attribute |
| 69 | caused it; N counts from 0; this is allowed only if START-TAG is non-nil |
| 70 | and N must be less than the number of attributes.") |
| 71 | |
| 72 | (defun nxml-parse-file (file) |
| 73 | "Parse the XML document in FILE and return it as a list. |
| 74 | An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN). |
| 75 | NAME is either a string, in the case where the name does not have a |
| 76 | namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a |
| 77 | symbol and LOCAL-NAME is a string, in the case where the name does |
| 78 | have a namespace. NAMESPACE is a keyword whose name is `:URI', where |
| 79 | URI is the namespace name. ATTRIBUTES is an alist of attributes where |
| 80 | each attribute has the form (NAME . VALUE), where NAME has the same |
| 81 | form as an element name, and VALUE is a string. A namespace |
| 82 | declaration is represented as an attribute whose name is |
| 83 | \(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME). CHILDREN is a list |
| 84 | containing strings and child elements; CHILDREN never contains two |
| 85 | consecutive strings and never contains an empty string. Processing |
| 86 | instructions and comments are not represented. The return value is a |
| 87 | list representing the document element. |
| 88 | |
| 89 | If the XML document is not well-formed, an error having the condition |
| 90 | `nxml-file-parse-error' will be signaled; the error data will be a |
| 91 | list of the form \(FILE POSITION MESSAGE), where POSITION is an |
| 92 | integer specifying the position where the error was detected, and |
| 93 | MESSAGE is a string describing the error. |
| 94 | |
| 95 | The current contents of FILE will be parsed even if there is a |
| 96 | modified buffer currently visiting FILE. |
| 97 | |
| 98 | If the variable `nxml-validate-function' is non-nil, it will be called |
| 99 | twice for each element, and any reported error will be signaled in the |
| 100 | same way as well-formedness error." |
| 101 | (with-current-buffer (nxml-parse-find-file file) |
| 102 | (unwind-protect |
| 103 | (let ((nxml-parse-file-name file)) |
| 104 | (nxml-parse-instance)) |
| 105 | (kill-buffer nil)))) |
| 106 | |
| 107 | (defun nxml-parse-find-file (file) |
| 108 | (with-current-buffer (get-buffer-create " *nXML Parse*") |
| 109 | (erase-buffer) |
| 110 | (let ((set-auto-coding-function 'nxml-set-xml-coding)) |
| 111 | (insert-file-contents file)) |
| 112 | (current-buffer))) |
| 113 | |
| 114 | (defun nxml-parse-instance () |
| 115 | (let (xmltok-dtd) |
| 116 | (xmltok-save |
| 117 | (xmltok-forward-prolog) |
| 118 | (nxml-check-xmltok-errors) |
| 119 | (nxml-ns-save |
| 120 | (nxml-parse-instance-1))))) |
| 121 | |
| 122 | (defun nxml-parse-instance-1 () |
| 123 | (let* ((top (cons nil nil)) |
| 124 | ;; tail is a cons cell, whose cdr is nil |
| 125 | ;; additional elements will destructively appended to tail |
| 126 | (tail top) |
| 127 | ;; stack of tails one for each open element |
| 128 | tail-stack |
| 129 | ;; list of QNames of open elements |
| 130 | open-element-tags |
| 131 | ;; list of strings buffering a text node, in reverse order |
| 132 | text |
| 133 | ;; position of beginning of first (in buffer) string in text |
| 134 | text-pos) |
| 135 | (while (xmltok-forward) |
| 136 | (nxml-check-xmltok-errors) |
| 137 | (cond ((memq xmltok-type '(start-tag end-tag empty-element)) |
| 138 | (when text |
| 139 | (setq text (apply 'concat (nreverse text))) |
| 140 | (setcdr tail (cons text nil)) |
| 141 | (setq tail (cdr tail))) |
| 142 | (when (not (eq xmltok-type 'end-tag)) |
| 143 | (when (and (not open-element-tags) |
| 144 | (not (eq tail top))) |
| 145 | (nxml-parse-error nil "Multiple top-level elements")) |
| 146 | (setq open-element-tags |
| 147 | (cons (xmltok-start-tag-qname) |
| 148 | open-element-tags)) |
| 149 | (nxml-ns-push-state) |
| 150 | (let ((tag (nxml-parse-start-tag))) |
| 151 | (nxml-validate-tag text text-pos tag) |
| 152 | (setq text nil) |
| 153 | (setcdr tail (cons tag nil)) |
| 154 | (setq tail (cdr tail)) |
| 155 | (setq tail-stack (cons tail tail-stack)) |
| 156 | (setq tail (last tag)))) |
| 157 | (when (not (eq xmltok-type 'start-tag)) |
| 158 | (or (eq xmltok-type 'empty-element) |
| 159 | (equal (car open-element-tags) |
| 160 | (xmltok-end-tag-qname)) |
| 161 | (if open-element-tags |
| 162 | (nxml-parse-error nil |
| 163 | "Unbalanced end-tag; expected </%s>" |
| 164 | (car open-element-tags)) |
| 165 | (nxml-parse-error nil "Extra end-tag"))) |
| 166 | (nxml-validate-tag text text-pos nil) |
| 167 | (setq text nil) |
| 168 | (nxml-ns-pop-state) |
| 169 | (setq open-element-tags (cdr open-element-tags)) |
| 170 | (setq tail (car tail-stack)) |
| 171 | (setq tail-stack (cdr tail-stack))) |
| 172 | (setq text-pos nil)) |
| 173 | ((memq xmltok-type '(space data entity-ref char-ref cdata-section)) |
| 174 | (cond (open-element-tags |
| 175 | (unless text-pos |
| 176 | (setq text-pos xmltok-start)) |
| 177 | (setq text |
| 178 | (cons (nxml-current-text-string) text))) |
| 179 | ((not (eq xmltok-type 'space)) |
| 180 | (nxml-parse-error |
| 181 | nil |
| 182 | "%s at top-level" |
| 183 | (cdr (assq xmltok-type |
| 184 | '((data . "Text characters") |
| 185 | (entity-ref . "Entity reference") |
| 186 | (char-ref . "Character reference") |
| 187 | (cdata-section . "CDATA section")))))))))) |
| 188 | (unless (cdr top) |
| 189 | (nxml-parse-error (point-max) "Missing document element")) |
| 190 | (cadr top))) |
| 191 | |
| 192 | (defun nxml-parse-start-tag () |
| 193 | (let (parsed-attributes |
| 194 | parsed-namespace-attributes |
| 195 | atts att prefixes prefix ns value name) |
| 196 | (setq atts xmltok-namespace-attributes) |
| 197 | (while atts |
| 198 | (setq att (car atts)) |
| 199 | (setq value (or (xmltok-attribute-value att) |
| 200 | (nxml-parse-error nil "Invalid attribute value"))) |
| 201 | (setq ns (nxml-make-namespace value)) |
| 202 | (setq prefix (and (xmltok-attribute-prefix att) |
| 203 | (xmltok-attribute-local-name att))) |
| 204 | (cond ((member prefix prefixes) |
| 205 | (nxml-parse-error nil "Duplicate namespace declaration")) |
| 206 | ((not prefix) |
| 207 | (nxml-ns-set-default ns)) |
| 208 | (ns |
| 209 | (nxml-ns-set-prefix prefix ns)) |
| 210 | (t (nxml-parse-error nil "Cannot undeclare namespace prefix"))) |
| 211 | (setq prefixes (cons prefix prefixes)) |
| 212 | (setq parsed-namespace-attributes |
| 213 | (cons (cons (nxml-make-name nxml-xmlns-namespace-uri |
| 214 | (xmltok-attribute-local-name att)) |
| 215 | value) |
| 216 | parsed-namespace-attributes)) |
| 217 | (setq atts (cdr atts))) |
| 218 | (setq name |
| 219 | (nxml-make-name |
| 220 | (let ((prefix (xmltok-start-tag-prefix))) |
| 221 | (if prefix |
| 222 | (or (nxml-ns-get-prefix prefix) |
| 223 | (nxml-parse-error (1+ xmltok-start) |
| 224 | "Prefix `%s' undeclared" |
| 225 | prefix)) |
| 226 | (nxml-ns-get-default))) |
| 227 | (xmltok-start-tag-local-name))) |
| 228 | (setq atts xmltok-attributes) |
| 229 | (while atts |
| 230 | (setq att (car atts)) |
| 231 | (setq ns |
| 232 | (let ((prefix (xmltok-attribute-prefix att))) |
| 233 | (and prefix |
| 234 | (or (nxml-ns-get-prefix prefix) |
| 235 | (nxml-parse-error (xmltok-attribute-name-start att) |
| 236 | "Prefix `%s' undeclared" |
| 237 | prefix))))) |
| 238 | (setq parsed-attributes |
| 239 | (let ((nm (nxml-make-name ns |
| 240 | (xmltok-attribute-local-name att)))) |
| 241 | (when (assoc nm parsed-attributes) |
| 242 | (nxml-parse-error (xmltok-attribute-name-start att) |
| 243 | "Duplicate attribute")) |
| 244 | (cons (cons nm (or (xmltok-attribute-value att) |
| 245 | (nxml-parse-error nil "Invalid attribute value"))) |
| 246 | parsed-attributes))) |
| 247 | (setq atts (cdr atts))) |
| 248 | ;; We want to end up with the attributes followed by the |
| 249 | ;; the namespace attributes in the same order as |
| 250 | ;; xmltok-attributes and xmltok-namespace-attributes respectively. |
| 251 | (when parsed-namespace-attributes |
| 252 | (setq parsed-attributes |
| 253 | (nconc parsed-namespace-attributes parsed-attributes))) |
| 254 | (list name (nreverse parsed-attributes)))) |
| 255 | |
| 256 | (defun nxml-validate-tag (text text-pos tag) |
| 257 | (when nxml-validate-function |
| 258 | (let ((err (funcall nxml-validate-function text tag)) |
| 259 | pos) |
| 260 | (when err |
| 261 | (setq pos (nxml-validate-error-position (cdr err) |
| 262 | (and text text-pos) |
| 263 | tag)) |
| 264 | (or pos (error "Incorrect return value from %s" |
| 265 | nxml-validate-function)) |
| 266 | (nxml-parse-error pos (car err)))))) |
| 267 | |
| 268 | (defun nxml-validate-error-position (location text-pos tag) |
| 269 | (cond ((null location) xmltok-start) |
| 270 | ((eq location 'text) text-pos) |
| 271 | ((eq location 'tag-close) |
| 272 | (and tag (- (point) (if (eq xmltok-type 'empty-element ) 2 1)))) |
| 273 | ((consp location) |
| 274 | (let ((att (nth (cdr location) xmltok-attributes))) |
| 275 | (when (not att) |
| 276 | (setq att (nth (- (cdr location) (length xmltok-attributes)) |
| 277 | xmltok-namespace-attributes))) |
| 278 | (cond ((not att)) |
| 279 | ((eq (car location) 'attribute-name) |
| 280 | (xmltok-attribute-name-start att)) |
| 281 | ((eq (car location) 'attribute-value) |
| 282 | (xmltok-attribute-value-start att))))))) |
| 283 | |
| 284 | (defun nxml-make-name (ns local-name) |
| 285 | (if ns |
| 286 | (cons ns local-name) |
| 287 | local-name)) |
| 288 | |
| 289 | (defun nxml-current-text-string () |
| 290 | (cond ((memq xmltok-type '(space data)) |
| 291 | (buffer-substring-no-properties xmltok-start |
| 292 | (point))) |
| 293 | ((eq xmltok-type 'cdata-section) |
| 294 | (buffer-substring-no-properties (+ xmltok-start 9) |
| 295 | (- (point) 3))) |
| 296 | ((memq xmltok-type '(char-ref entity-ref)) |
| 297 | (unless xmltok-replacement |
| 298 | (nxml-parse-error nil |
| 299 | (if (eq xmltok-type 'char-ref) |
| 300 | "Reference to unsupported Unicode character" |
| 301 | "Unresolvable entity reference"))) |
| 302 | xmltok-replacement))) |
| 303 | |
| 304 | (defun nxml-parse-error (position &rest args) |
| 305 | (nxml-signal-file-parse-error nxml-parse-file-name |
| 306 | (or position xmltok-start) |
| 307 | (apply 'format args))) |
| 308 | |
| 309 | (defun nxml-check-xmltok-errors () |
| 310 | (when xmltok-errors |
| 311 | (let ((err (car (last xmltok-errors)))) |
| 312 | (nxml-signal-file-parse-error nxml-parse-file-name |
| 313 | (xmltok-error-start err) |
| 314 | (xmltok-error-message err))))) |
| 315 | |
| 316 | (provide 'nxml-parse) |
| 317 | |
| 318 | ;;; nxml-parse.el ends here |