lisp/nxml/nxml-rap.el

   1 ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode
   2
   3 ;; Copyright (C) 2003, 2004 Free Software Foundation, Inc.
   4
   5 ;; Author: James Clark
   6 ;; Keywords: XML
   7
   8 ;; This program is free software; you can redistribute it and/or
   9 ;; modify it under the terms of the GNU General Public License as
  10 ;; published by the Free Software Foundation; either version 2 of
  11 ;; the License, or (at your option) any later version.
  12
  13 ;; This program is distributed in the hope that it will be
  14 ;; useful, but WITHOUT ANY WARRANTY; without even the implied
  15 ;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  16 ;; PURPOSE.  See the GNU General Public License for more details.
  17
  18 ;; You should have received a copy of the GNU General Public
  19 ;; License along with this program; if not, write to the Free
  20 ;; Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
  21 ;; MA 02111-1307 USA
  22
  23 ;;; Commentary:
  24
  25 ;; This uses xmltok.el to do XML parsing. The fundamental problem is
  26 ;; how to handle changes. We don't want to maintain a complete parse
  27 ;; tree.  We also don't want to reparse from the start of the document
  28 ;; on every keystroke.  However, it is not possible in general to
  29 ;; parse an XML document correctly starting at a random point in the
  30 ;; middle.  The main problems are comments, CDATA sections and
  31 ;; processing instructions: these can all contain things that are
  32 ;; indistinguishable from elements. Literals in the prolog are also a
  33 ;; problem.  Attribute value literals are not a problem because
  34 ;; attribute value literals cannot contain less-than signs.
  35 ;;
  36 ;; Our strategy is to keep track of just the problematic things.
  37 ;; Specifically, we keep track of all comments, CDATA sections and
  38 ;; processing instructions in the instance.  We do this by marking all
  39 ;; except the first character of these with a non-nil nxml-inside text
  40 ;; property. The value of the nxml-inside property is comment,
  41 ;; cdata-section or processing-instruction.  The first character does
  42 ;; not have the nxml-inside property so we can find the beginning of
  43 ;; the construct by looking for a change in a text property value
  44 ;; (Emacs provides primitives for this).  We use text properties
  45 ;; rather than overlays, since the implementation of overlays doesn't
  46 ;; look like it scales to large numbers of overlays in a buffer.
  47 ;;
  48 ;; We don't in fact track all these constructs, but only track them in
  49 ;; some initial part of the instance. The variable `nxml-scan-end'
  50 ;; contains the limit of where we have scanned up to for them.
  51 ;;
  52 ;; Thus to parse some random point in the file we first ensure that we
  53 ;; have scanned up to that point.  Then we search backwards for a
  54 ;; <. Then we check whether the < has an nxml-inside property. If it
  55 ;; does we go backwards to the first character that does not have an
  56 ;; nxml-inside property (this character must be a <).  Then we start
  57 ;; parsing forward from the < we have found.
  58 ;;
  59 ;; The prolog has to be parsed specially, so we also keep track of the
  60 ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
  61 ;; every change to the prolog.  This won't work well if people try to
  62 ;; edit huge internal subsets. Hopefully that will be rare.
  63 ;;
  64 ;; We keep track of the changes by adding to the buffer's
  65 ;; after-change-functions hook.  Scanning is also done as a
  66 ;; prerequisite to fontification by adding to fontification-functions
  67 ;; (in the same way as jit-lock).  This means that scanning for these
  68 ;; constructs had better be quick.  Fortunately it is. Firstly, the
  69 ;; typical proportion of comments, CDATA sections and processing
  70 ;; instructions is small relative to other things.  Secondly, to scan
  71 ;; we just search for the regexp <[!?].
  72 ;;
  73 ;; One problem is unclosed comments, processing instructions and CDATA
  74 ;; sections.  Suppose, for example, we encounter a <!-- but there's no
  75 ;; matching -->.  This is not an unexpected situation if the user is
  76 ;; creating a comment. It is not helpful to treat the whole of the
  77 ;; file starting from the <!-- onwards as a single unclosed comment
  78 ;; token. Instead we treat just the <!-- as a piece of not well-formed
  79 ;; markup and continue.  The problem is that if at some later stage a
  80 ;; --> gets added to the buffer after the unclosed <!--, we will need
  81 ;; to reparse the buffer starting from the <!--.  We need to keep
  82 ;; track of these reparse dependencies; they are called dependent
  83 ;; regions in the code.
  84
  85 ;;; Code:
  86
  87 (require 'xmltok)
  88 (require 'nxml-util)
  89
  90 (defvar nxml-prolog-end nil
  91   "Integer giving position following end of the prolog.")
  92 (make-variable-buffer-local 'nxml-prolog-end)
  93
  94 (defvar nxml-scan-end nil
  95   "Marker giving position up to which we have scanned.
  96 nxml-scan-end must be >= nxml-prolog-end.  Furthermore, nxml-scan-end
  97 must not an inside position in the following sense. A position is
  98 inside if the following character is a part of, but not the first
  99 character of, a CDATA section, comment or processing instruction.
 100 Furthermore all positions >= nxml-prolog-end and < nxml-scan-end that
 101 are inside positions must have a non-nil nxml-inside property whose
 102 value is a symbol specifying what it is inside. Any characters with a
 103 non-nil fontified property must have position < nxml-scan-end and the
 104 correct face. Dependent regions must also be established for any
 105 unclosed constructs starting before nxml-scan-end.
 106 There must be no nxml-inside properties after nxml-scan-end.")
 107 (make-variable-buffer-local 'nxml-scan-end)
 108
 109 (defsubst nxml-get-inside (pos)
 110   (get-text-property pos 'nxml-inside))
 111
 112 (defsubst nxml-clear-inside (start end)
 113   (remove-text-properties start end '(nxml-inside nil)))
 114
 115 (defsubst nxml-set-inside (start end type)
 116   (put-text-property start end 'nxml-inside type))
 117
 118 (defun nxml-inside-end (pos)
 119   "Return the end of the inside region containing POS.
 120 Return nil if the character at POS is not inside."
 121   (if (nxml-get-inside pos)
 122       (or (next-single-property-change pos 'nxml-inside)
 123           (point-max))
 124     nil))
 125
 126 (defun nxml-inside-start (pos)
 127   "Return the start of the inside region containing POS.
 128 Return nil if the character at POS is not inside."
 129   (if (nxml-get-inside pos)
 130       (or (previous-single-property-change (1+ pos) 'nxml-inside)
 131           (point-min))
 132     nil))
 133
 134 ;;; Change management
 135
 136 (defun nxml-scan-after-change (start end)
 137   "Restore `nxml-scan-end' invariants after a change.
 138 The change happened between START and END.
 139 Return position after which lexical state is unchanged.
 140 END must be > nxml-prolog-end."
 141   (if (>= start nxml-scan-end)
 142       nxml-scan-end
 143     (goto-char start)
 144     (nxml-move-outside-backwards)
 145     (setq start (point))
 146     (let ((inside-remove-start start)
 147           xmltok-errors
 148           xmltok-dependent-regions)
 149       (while (or (when (xmltok-forward-special (min end nxml-scan-end))
 150                    (when (memq xmltok-type
 151                                '(comment
 152                                  cdata-section
 153                                  processing-instruction))
 154                      (nxml-clear-inside inside-remove-start
 155                                         (1+ xmltok-start))
 156                      (nxml-set-inside (1+ xmltok-start)
 157                                       (point)
 158                                       xmltok-type)
 159                      (setq inside-remove-start (point)))
 160                    (if (< (point) (min end nxml-scan-end))
 161                        t
 162                      (setq end (point))
 163                      nil))
 164                  ;; The end of the change was inside but is now outside.
 165                  ;; Imagine something really weird like
 166                  ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> -->
 167                  ;; and suppose we deleted "<![CDATA[f"
 168                  (let ((inside-end (nxml-inside-end end)))
 169                    (when inside-end
 170                      (setq end inside-end)
 171                      t))))
 172       (nxml-clear-inside inside-remove-start end)
 173       (nxml-clear-dependent-regions start end)
 174       (nxml-mark-parse-dependent-regions))
 175     (when (> end nxml-scan-end)
 176       (set-marker nxml-scan-end end))
 177     end))
 178
 179 (defun nxml-scan-prolog ()
 180   (goto-char (point-min))
 181   (let (xmltok-dtd
 182         xmltok-errors
 183         xmltok-dependent-regions)
 184     (setq nxml-prolog-regions (xmltok-forward-prolog))
 185     (setq nxml-prolog-end (point))
 186     (nxml-clear-inside (point-min) nxml-prolog-end)
 187     (nxml-clear-dependent-regions (point-min) nxml-prolog-end)
 188     (nxml-mark-parse-dependent-regions))
 189   (when (< nxml-scan-end nxml-prolog-end)
 190     (set-marker nxml-scan-end nxml-prolog-end)))
 191
 192
 193 ;;; Dependent regions
 194
 195 (defun nxml-adjust-start-for-dependent-regions (start end pre-change-length)
 196   (let ((overlays (overlays-in (1- start) start))
 197         (adjusted-start start))
 198     (while overlays
 199       (let* ((overlay (car overlays))
 200              (ostart (overlay-start overlay)))
 201         (when (and (eq (overlay-get overlay 'category) 'nxml-dependent)
 202                    (< ostart adjusted-start))
 203           (let ((funargs (overlay-get overlay 'nxml-funargs)))
 204             (when (apply (car funargs)
 205                          (append (list start
 206                                        end
 207                                        pre-change-length
 208                                        ostart
 209                                        (overlay-end overlay))
 210                                  (cdr funargs)))
 211               (setq adjusted-start ostart)))))
 212       (setq overlays (cdr overlays)))
 213     adjusted-start))
 214
 215 (defun nxml-mark-parse-dependent-regions ()
 216   (while xmltok-dependent-regions
 217     (apply 'nxml-mark-parse-dependent-region
 218            (car xmltok-dependent-regions))
 219     (setq xmltok-dependent-regions
 220           (cdr xmltok-dependent-regions))))
 221
 222 (defun nxml-mark-parse-dependent-region (fun start end &rest args)
 223   (let ((overlay (make-overlay start end nil t t)))
 224     (overlay-put overlay 'category 'nxml-dependent)
 225     (overlay-put overlay 'nxml-funargs (cons fun args))))
 226
 227 (put 'nxml-dependent 'evaporate t)
 228
 229 (defun nxml-clear-dependent-regions (start end)
 230   (let ((overlays (overlays-in start end)))
 231     (while overlays
 232       (let* ((overlay (car overlays))
 233              (category (overlay-get overlay 'category)))
 234         (when (and (eq category 'nxml-dependent)
 235                    (<= start (overlay-start overlay)))
 236           (delete-overlay overlay)))
 237       (setq overlays (cdr overlays)))))
 238
 239 ;;; Random access parsing
 240
 241 (defun nxml-token-after ()
 242   "Return the position after the token containing the char after point.
 243 Sets up the variables `xmltok-type', `xmltok-start',
 244 `xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
 245 `xmltok-namespace-attributes' in the same was as does
 246 `xmltok-forward'.  The prolog will be treated as a single token with
 247 type `prolog'."
 248   (let ((pos (point)))
 249     (if (< pos nxml-prolog-end)
 250         (progn
 251           (setq xmltok-type 'prolog
 252                 xmltok-start (point-min))
 253           (min nxml-prolog-end (point-max)))
 254       (nxml-ensure-scan-up-to-date)
 255       (if (nxml-get-inside pos)
 256           (save-excursion
 257             (nxml-move-outside-backwards)
 258             (xmltok-forward)
 259             (point))
 260         (save-excursion
 261           (if (or (eq (char-after) ?<)
 262                       (search-backward "<"
 263                                        (max (point-min) nxml-prolog-end)
 264                                        t))
 265               (nxml-move-outside-backwards)
 266             (goto-char (if (<= (point-min) nxml-prolog-end)
 267                            nxml-prolog-end
 268                          (or (nxml-inside-end (point-min))
 269                              (point-min)))))
 270           (while (and (nxml-tokenize-forward)
 271                       (<= (point) pos)))
 272           (point))))))
 273
 274 (defun nxml-token-before ()
 275   "Return the position after the token containing the char before point.
 276 Sets variables like `nxml-token-after'."
 277   (if (/= (point-min) (point))
 278       (save-excursion
 279         (goto-char (1- (point)))
 280         (nxml-token-after))
 281     (setq xmltok-start (point))
 282     (setq xmltok-type nil)
 283     (point)))
 284
 285 (defun nxml-tokenize-forward ()
 286   (let (xmltok-dependent-regions
 287         xmltok-errors)
 288     (when (and (xmltok-forward)
 289                (> (point) nxml-scan-end))
 290       (cond ((memq xmltok-type '(comment
 291                                  cdata-section
 292                                  processing-instruction))
 293              (nxml-with-unmodifying-text-property-changes
 294                (nxml-set-inside (1+ xmltok-start) (point) xmltok-type)))
 295             (xmltok-dependent-regions
 296              (nxml-mark-parse-dependent-regions)))
 297       (set-marker nxml-scan-end (point)))
 298     xmltok-type))
 299
 300 (defun nxml-move-outside-backwards ()
 301   "Move point to first character of the containing special thing.
 302 Leave point unmoved if it is not inside anything special."
 303   (let ((start (nxml-inside-start (point))))
 304     (when start
 305       (goto-char (1- start))
 306       (when (nxml-get-inside (point))
 307         (error "Char before inside-start at %s had nxml-inside property %s"
 308                (point)
 309                (nxml-get-inside (point)))))))
 310
 311 (defun nxml-ensure-scan-up-to-date ()
 312   (let ((pos (point)))
 313     (when (< nxml-scan-end pos)
 314       (save-excursion
 315         (goto-char nxml-scan-end)
 316         (let (xmltok-errors
 317               xmltok-dependent-regions)
 318           (while (when (xmltok-forward-special pos)
 319                    (when (memq xmltok-type
 320                                '(comment
 321                                  processing-instruction
 322                                  cdata-section))
 323                      (nxml-with-unmodifying-text-property-changes
 324                        (nxml-set-inside (1+ xmltok-start)
 325                                         (point)
 326                                         xmltok-type)))
 327                    (if (< (point) pos)
 328                        t
 329                      (setq pos (point))
 330                      nil)))
 331           (nxml-clear-dependent-regions nxml-scan-end pos)
 332           (nxml-mark-parse-dependent-regions)
 333           (set-marker nxml-scan-end pos))))))
 334
 335 ;;; Element scanning
 336
 337 (defun nxml-scan-element-forward (from &optional up)
 338   "Scan forward from FROM over a single balanced element.
 339 Point must between tokens.  Return the position of the end of the tag
 340 that ends the element. `xmltok-start' will contain the position of the
 341 start of the tag. If UP is non-nil, then scan past end-tag of element
 342 containing point.  If no element is found, return nil.  If a
 343 well-formedness error prevents scanning, signal an nxml-scan-error.
 344 Point is not moved."
 345   (let ((open-tags (and up t))
 346         found)
 347     (save-excursion
 348       (goto-char from)
 349       (while (cond ((not (nxml-tokenize-forward))
 350                     (when (consp open-tags)
 351                       (nxml-scan-error (cadr open-tags)
 352                                        "Start-tag has no end-tag"))
 353                     nil)
 354                    ((eq xmltok-type 'start-tag)
 355                     (setq open-tags
 356                           (cons (xmltok-start-tag-qname)
 357                                 (cons xmltok-start
 358                                       open-tags)))
 359                     t)
 360                    ((eq xmltok-type 'end-tag)
 361                     (cond ((not open-tags) nil)
 362                           ((not (consp open-tags)) (setq found (point)) nil)
 363                           ((not (string= (car open-tags)
 364                                          (xmltok-end-tag-qname)))
 365                            (nxml-scan-error (+ 2 xmltok-start)
 366                                             "Mismatched end-tag; \
 367 expected `%s'"
 368                                             (car open-tags)))
 369                           ((setq open-tags (cddr open-tags)) t)
 370                           (t (setq found (point)) nil)))
 371                    ((memq xmltok-type '(empty-element
 372                                         partial-empty-element))
 373                     (if open-tags
 374                         t
 375                       (setq found (point))
 376                       nil))
 377                    ((eq xmltok-type 'partial-end-tag)
 378                     (cond ((not open-tags) nil)
 379                           ((not (consp open-tags)) (setq found (point)) nil)
 380                           ((setq open-tags (cddr open-tags)) t)
 381                           (t (setq found (point)) nil)))
 382                    ((eq xmltok-type 'partial-start-tag)
 383                     (nxml-scan-error xmltok-start
 384                                      "Missing `>'"))
 385                    (t t))))
 386     found))
 387
 388 (defun nxml-scan-element-backward (from &optional up bound)
 389   "Scan backward from FROM over a single balanced element.
 390 Point must between tokens.  Return the position of the end of the tag
 391 that starts the element. `xmltok-start' will contain the position of
 392 the start of the tag.  If UP is non-nil, then scan past start-tag of
 393 element containing point.  If BOUND is non-nil, then don't scan back
 394 past BOUND.  If no element is found, return nil.  If a well-formedness
 395 error prevents scanning, signal an nxml-scan-error.  Point is not
 396 moved."
 397   (let ((open-tags (and up t))
 398         token-end found)
 399     (save-excursion
 400       (goto-char from)
 401       (while (cond ((or (< (point) nxml-prolog-end)
 402                         (not (search-backward "<"
 403                                               (max (or bound 0)
 404                                                    nxml-prolog-end)
 405                                               t)))
 406                     (when (and (consp open-tags) (not bound))
 407                       (nxml-scan-error (cadr open-tags)
 408                                        "End-tag has no start-tag"))
 409                     nil)
 410                    ((progn
 411                       (nxml-move-outside-backwards)
 412                       (save-excursion
 413                         (nxml-tokenize-forward)
 414                         (setq token-end (point)))
 415                       (eq xmltok-type 'end-tag))
 416                     (setq open-tags
 417                           (cons (xmltok-end-tag-qname)
 418                                 (cons xmltok-start open-tags)))
 419                     t)
 420                    ((eq xmltok-type 'start-tag)
 421                     (cond ((not open-tags) nil)
 422                           ((not (consp open-tags))
 423                            (setq found token-end)
 424                            nil)
 425                           ((and (car open-tags)
 426                                 (not (string= (car open-tags)
 427                                               (xmltok-start-tag-qname))))
 428                            (nxml-scan-error (1+ xmltok-start)
 429                                             "Mismatched start-tag; \
 430 expected `%s'"
 431                                             (car open-tags)))
 432                           ((setq open-tags (cddr open-tags)) t)
 433                           (t (setq found token-end) nil)))
 434                    ((memq xmltok-type '(empty-element
 435                                         partial-empty-element))
 436                     (if open-tags
 437                         t
 438                       (setq found token-end)
 439                       nil))
 440                    ((eq xmltok-type 'partial-end-tag)
 441                     (setq open-tags
 442                           (cons nil (cons xmltok-start open-tags)))
 443                     t)
 444                    ((eq xmltok-type 'partial-start-tag)
 445                     ;; if we have only a partial-start-tag
 446                     ;; then it's unlikely that there's a matching
 447                     ;; end-tag, so it's probably not helpful
 448                     ;; to treat it as a complete start-tag
 449                     (nxml-scan-error xmltok-start
 450                                      "Missing `>'"))
 451                    (t t))))
 452     found))
 453
 454 (defun nxml-scan-error (&rest args)
 455   (signal 'nxml-scan-error args))
 456
 457 (put 'nxml-scan-error
 458      'error-conditions
 459      '(error nxml-error nxml-scan-error))
 460
 461 (put 'nxml-scan-error
 462      'error-message
 463      "Scan over element that is not well-formed")
 464
 465 (provide 'nxml-rap)
 466
 467 ;;; nxml-rap.el ends here