Update copyright notices for 2013.
[bpt/emacs.git] / lisp / nxml / nxml-rap.el
1 ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode
2
3 ;; Copyright (C) 2003-2004, 2007-2013 Free Software Foundation, Inc.
4
5 ;; Author: James Clark
6 ;; Keywords: XML
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22
23 ;;; Commentary:
24
25 ;; This uses xmltok.el to do XML parsing. The fundamental problem is
26 ;; how to handle changes. We don't want to maintain a complete parse
27 ;; tree. We also don't want to reparse from the start of the document
28 ;; on every keystroke. However, it is not possible in general to
29 ;; parse an XML document correctly starting at a random point in the
30 ;; middle. The main problems are comments, CDATA sections and
31 ;; processing instructions: these can all contain things that are
32 ;; indistinguishable from elements. Literals in the prolog are also a
33 ;; problem. Attribute value literals are not a problem because
34 ;; attribute value literals cannot contain less-than signs.
35 ;;
36 ;; Our strategy is to keep track of just the problematic things.
37 ;; Specifically, we keep track of all comments, CDATA sections and
38 ;; processing instructions in the instance. We do this by marking all
39 ;; except the first character of these with a non-nil nxml-inside text
40 ;; property. The value of the nxml-inside property is comment,
41 ;; cdata-section or processing-instruction. The first character does
42 ;; not have the nxml-inside property so we can find the beginning of
43 ;; the construct by looking for a change in a text property value
44 ;; (Emacs provides primitives for this). We use text properties
45 ;; rather than overlays, since the implementation of overlays doesn't
46 ;; look like it scales to large numbers of overlays in a buffer.
47 ;;
48 ;; We don't in fact track all these constructs, but only track them in
49 ;; some initial part of the instance. The variable `nxml-scan-end'
50 ;; contains the limit of where we have scanned up to for them.
51 ;;
52 ;; Thus to parse some random point in the file we first ensure that we
53 ;; have scanned up to that point. Then we search backwards for a
54 ;; <. Then we check whether the < has an nxml-inside property. If it
55 ;; does we go backwards to first character that does not have an
56 ;; nxml-inside property (this character must be a <). Then we start
57 ;; parsing forward from the < we have found.
58 ;;
59 ;; The prolog has to be parsed specially, so we also keep track of the
60 ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
61 ;; every change to the prolog. This won't work well if people try to
62 ;; edit huge internal subsets. Hopefully that will be rare.
63 ;;
64 ;; We keep track of the changes by adding to the buffer's
65 ;; after-change-functions hook. Scanning is also done as a
66 ;; prerequisite to fontification by adding to fontification-functions
67 ;; (in the same way as jit-lock). This means that scanning for these
68 ;; constructs had better be quick. Fortunately it is. Firstly, the
69 ;; typical proportion of comments, CDATA sections and processing
70 ;; instructions is small relative to other things. Secondly, to scan
71 ;; we just search for the regexp <[!?].
72 ;;
73 ;; One problem is unclosed comments, processing instructions and CDATA
74 ;; sections. Suppose, for example, we encounter a <!-- but there's no
75 ;; matching -->. This is not an unexpected situation if the user is
76 ;; creating a comment. It is not helpful to treat the whole of the
77 ;; file starting from the <!-- onwards as a single unclosed comment
78 ;; token. Instead we treat just the <!-- as a piece of not well-formed
79 ;; markup and continue. The problem is that if at some later stage a
80 ;; --> gets added to the buffer after the unclosed <!--, we will need
81 ;; to reparse the buffer starting from the <!--. We need to keep
82 ;; track of these reparse dependencies; they are called dependent
83 ;; regions in the code.
84
85 ;;; Code:
86
87 (require 'xmltok)
88 (require 'nxml-util)
89
90 (defvar nxml-prolog-end nil
91 "Integer giving position following end of the prolog.")
92 (make-variable-buffer-local 'nxml-prolog-end)
93
94 (defvar nxml-scan-end nil
95 "Marker giving position up to which we have scanned.
96 nxml-scan-end must be >= nxml-prolog-end. Furthermore, nxml-scan-end
97 must not be an inside position in the following sense. A position is
98 inside if the following character is a part of, but not the first
99 character of, a CDATA section, comment or processing instruction.
100 Furthermore all positions >= nxml-prolog-end and < nxml-scan-end that
101 are inside positions must have a non-nil `nxml-inside' property whose
102 value is a symbol specifying what it is inside. Any characters with a
103 non-nil `fontified' property must have position < nxml-scan-end and
104 the correct face. Dependent regions must also be established for any
105 unclosed constructs starting before nxml-scan-end.
106 There must be no `nxml-inside' properties after nxml-scan-end.")
107 (make-variable-buffer-local 'nxml-scan-end)
108
109 (defsubst nxml-get-inside (pos)
110 (get-text-property pos 'nxml-inside))
111
112 (defsubst nxml-clear-inside (start end)
113 (nxml-debug-clear-inside start end)
114 (remove-text-properties start end '(nxml-inside nil)))
115
116 (defsubst nxml-set-inside (start end type)
117 (nxml-debug-set-inside start end)
118 (put-text-property start end 'nxml-inside type))
119
120 (defun nxml-inside-end (pos)
121 "Return the end of the inside region containing POS.
122 Return nil if the character at POS is not inside."
123 (if (nxml-get-inside pos)
124 (or (next-single-property-change pos 'nxml-inside)
125 (point-max))
126 nil))
127
128 (defun nxml-inside-start (pos)
129 "Return the start of the inside region containing POS.
130 Return nil if the character at POS is not inside."
131 (if (nxml-get-inside pos)
132 (or (previous-single-property-change (1+ pos) 'nxml-inside)
133 (point-min))
134 nil))
135
136 ;;; Change management
137
138 (defun nxml-scan-after-change (start end)
139 "Restore `nxml-scan-end' invariants after a change.
140 The change happened between START and END.
141 Return position after which lexical state is unchanged.
142 END must be > `nxml-prolog-end'. START must be outside
143 any 'inside' regions and at the beginning of a token."
144 (if (>= start nxml-scan-end)
145 nxml-scan-end
146 (let ((inside-remove-start start)
147 xmltok-errors
148 xmltok-dependent-regions)
149 (while (or (when (xmltok-forward-special (min end nxml-scan-end))
150 (when (memq xmltok-type
151 '(comment
152 cdata-section
153 processing-instruction))
154 (nxml-clear-inside inside-remove-start
155 (1+ xmltok-start))
156 (nxml-set-inside (1+ xmltok-start)
157 (point)
158 xmltok-type)
159 (setq inside-remove-start (point)))
160 (if (< (point) (min end nxml-scan-end))
161 t
162 (setq end (point))
163 nil))
164 ;; The end of the change was inside but is now outside.
165 ;; Imagine something really weird like
166 ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> -->
167 ;; and suppose we deleted "<![CDATA[f"
168 (let ((inside-end (nxml-inside-end end)))
169 (when inside-end
170 (setq end inside-end)
171 t))))
172 (nxml-clear-inside inside-remove-start end)
173 (nxml-clear-dependent-regions start end)
174 (nxml-mark-parse-dependent-regions))
175 (when (> end nxml-scan-end)
176 (set-marker nxml-scan-end end))
177 end))
178
179 ;; n-s-p only called from nxml-mode.el, where this variable is defined.
180 (defvar nxml-prolog-regions)
181
182 (defun nxml-scan-prolog ()
183 (goto-char (point-min))
184 (let (xmltok-dtd
185 xmltok-errors
186 xmltok-dependent-regions)
187 (setq nxml-prolog-regions (xmltok-forward-prolog))
188 (setq nxml-prolog-end (point))
189 (nxml-clear-inside (point-min) nxml-prolog-end)
190 (nxml-clear-dependent-regions (point-min) nxml-prolog-end)
191 (nxml-mark-parse-dependent-regions))
192 (when (< nxml-scan-end nxml-prolog-end)
193 (set-marker nxml-scan-end nxml-prolog-end)))
194
195
196 ;;; Dependent regions
197
198 (defun nxml-adjust-start-for-dependent-regions (start end pre-change-length)
199 (let ((overlays (overlays-in (1- start) start))
200 (adjusted-start start))
201 (while overlays
202 (let* ((overlay (car overlays))
203 (ostart (overlay-start overlay)))
204 (when (and (eq (overlay-get overlay 'category) 'nxml-dependent)
205 (< ostart adjusted-start))
206 (let ((funargs (overlay-get overlay 'nxml-funargs)))
207 (when (apply (car funargs)
208 (append (list start
209 end
210 pre-change-length
211 ostart
212 (overlay-end overlay))
213 (cdr funargs)))
214 (setq adjusted-start ostart)))))
215 (setq overlays (cdr overlays)))
216 adjusted-start))
217
218 (defun nxml-mark-parse-dependent-regions ()
219 (while xmltok-dependent-regions
220 (apply 'nxml-mark-parse-dependent-region
221 (car xmltok-dependent-regions))
222 (setq xmltok-dependent-regions
223 (cdr xmltok-dependent-regions))))
224
225 (defun nxml-mark-parse-dependent-region (fun start end &rest args)
226 (let ((overlay (make-overlay start end nil t t)))
227 (overlay-put overlay 'category 'nxml-dependent)
228 (overlay-put overlay 'nxml-funargs (cons fun args))))
229
230 (put 'nxml-dependent 'evaporate t)
231
232 (defun nxml-clear-dependent-regions (start end)
233 (let ((overlays (overlays-in start end)))
234 (while overlays
235 (let* ((overlay (car overlays))
236 (category (overlay-get overlay 'category)))
237 (when (and (eq category 'nxml-dependent)
238 (<= start (overlay-start overlay)))
239 (delete-overlay overlay)))
240 (setq overlays (cdr overlays)))))
241
242 ;;; Random access parsing
243
244 (defun nxml-token-after ()
245 "Return the position after the token containing the char after point.
246 Sets up the variables `xmltok-type', `xmltok-start',
247 `xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
248 `xmltok-namespace-attributes' in the same was as does
249 `xmltok-forward'. The prolog will be treated as a single token with
250 type `prolog'."
251 (let ((pos (point)))
252 (if (< pos nxml-prolog-end)
253 (progn
254 (setq xmltok-type 'prolog
255 xmltok-start (point-min))
256 (min nxml-prolog-end (point-max)))
257 (nxml-ensure-scan-up-to-date)
258 (if (nxml-get-inside pos)
259 (save-excursion
260 (nxml-move-outside-backwards)
261 (xmltok-forward)
262 (point))
263 (save-excursion
264 (if (or (eq (char-after) ?<)
265 (search-backward "<"
266 (max (point-min) nxml-prolog-end)
267 t))
268 (nxml-move-outside-backwards)
269 (goto-char (if (<= (point-min) nxml-prolog-end)
270 nxml-prolog-end
271 (or (nxml-inside-end (point-min))
272 (point-min)))))
273 (while (and (nxml-tokenize-forward)
274 (<= (point) pos)))
275 (point))))))
276
277 (defun nxml-token-before ()
278 "Return the position after the token containing the char before point.
279 Sets variables like `nxml-token-after'."
280 (if (/= (point-min) (point))
281 (save-excursion
282 (goto-char (1- (point)))
283 (nxml-token-after))
284 (setq xmltok-start (point))
285 (setq xmltok-type nil)
286 (point)))
287
288 (defun nxml-tokenize-forward ()
289 (let (xmltok-dependent-regions
290 xmltok-errors)
291 (when (and (xmltok-forward)
292 (> (point) nxml-scan-end))
293 (cond ((memq xmltok-type '(comment
294 cdata-section
295 processing-instruction))
296 (nxml-with-unmodifying-text-property-changes
297 (nxml-set-inside (1+ xmltok-start) (point) xmltok-type)))
298 (xmltok-dependent-regions
299 (nxml-mark-parse-dependent-regions)))
300 (set-marker nxml-scan-end (point)))
301 xmltok-type))
302
303 (defun nxml-move-tag-backwards (bound)
304 "Move point backwards outside any 'inside' regions or tags.
305 Point will not move past `nxml-prolog-end'.
306 Point will either be at BOUND or a '<' character starting a tag
307 outside any 'inside' regions. Ignores dependent regions.
308 As a precondition, point must be >= BOUND."
309 (nxml-move-outside-backwards)
310 (when (not (equal (char-after) ?<))
311 (if (search-backward "<" bound t)
312 (progn
313 (nxml-move-outside-backwards)
314 (when (not (equal (char-after) ?<))
315 (search-backward "<" bound t)))
316 (goto-char bound))))
317
318 (defun nxml-move-outside-backwards ()
319 "Move point to first character of the containing special thing.
320 Leave point unmoved if it is not inside anything special."
321 (let ((start (nxml-inside-start (point))))
322 (when start
323 (goto-char (1- start))
324 (when (nxml-get-inside (point))
325 (error "Char before inside-start at %s had nxml-inside property %s"
326 (point)
327 (nxml-get-inside (point)))))))
328
329 (defun nxml-ensure-scan-up-to-date ()
330 (let ((pos (point)))
331 (when (< nxml-scan-end pos)
332 (save-excursion
333 (goto-char nxml-scan-end)
334 (let (xmltok-errors
335 xmltok-dependent-regions)
336 (while (when (xmltok-forward-special pos)
337 (when (memq xmltok-type
338 '(comment
339 processing-instruction
340 cdata-section))
341 (nxml-with-unmodifying-text-property-changes
342 (nxml-set-inside (1+ xmltok-start)
343 (point)
344 xmltok-type)))
345 (if (< (point) pos)
346 t
347 (setq pos (point))
348 nil)))
349 (nxml-clear-dependent-regions nxml-scan-end pos)
350 (nxml-mark-parse-dependent-regions)
351 (set-marker nxml-scan-end pos))))))
352
353 ;;; Element scanning
354
355 (defun nxml-scan-element-forward (from &optional up)
356 "Scan forward from FROM over a single balanced element.
357 Point must be between tokens. Return the position of the end of
358 the tag that ends the element. `xmltok-start' will contain the
359 position of the start of the tag. If UP is non-nil, then scan
360 past end-tag of element containing point. If no element is
361 found, return nil. If a well-formedness error prevents scanning,
362 signal an `nxml-scan-error'. Point is not moved."
363 (let ((open-tags (and up t))
364 found)
365 (save-excursion
366 (goto-char from)
367 (while (cond ((not (nxml-tokenize-forward))
368 (when (consp open-tags)
369 (nxml-scan-error (cadr open-tags)
370 "Start-tag has no end-tag"))
371 nil)
372 ((eq xmltok-type 'start-tag)
373 (setq open-tags
374 (cons (xmltok-start-tag-qname)
375 (cons xmltok-start
376 open-tags)))
377 t)
378 ((eq xmltok-type 'end-tag)
379 (cond ((not open-tags) nil)
380 ((not (consp open-tags)) (setq found (point)) nil)
381 ((not (string= (car open-tags)
382 (xmltok-end-tag-qname)))
383 (nxml-scan-error (+ 2 xmltok-start)
384 "Mismatched end-tag; \
385 expected `%s'"
386 (car open-tags)))
387 ((setq open-tags (cddr open-tags)) t)
388 (t (setq found (point)) nil)))
389 ((memq xmltok-type '(empty-element
390 partial-empty-element))
391 (if open-tags
392 t
393 (setq found (point))
394 nil))
395 ((eq xmltok-type 'partial-end-tag)
396 (cond ((not open-tags) nil)
397 ((not (consp open-tags)) (setq found (point)) nil)
398 ((setq open-tags (cddr open-tags)) t)
399 (t (setq found (point)) nil)))
400 ((eq xmltok-type 'partial-start-tag)
401 (nxml-scan-error xmltok-start
402 "Missing `>'"))
403 (t t))))
404 found))
405
406 (defun nxml-scan-element-backward (from &optional up bound)
407 "Scan backward from FROM over a single balanced element.
408 Point must be between tokens. Return the position of the end of
409 the tag that starts the element. `xmltok-start' will contain the
410 position of the start of the tag. If UP is non-nil, then scan
411 past start-tag of element containing point. If BOUND is non-nil,
412 then don't scan back past BOUND. If no element is found, return
413 nil. If a well-formedness error prevents scanning, signal an
414 `nxml-scan-error'. Point is not moved."
415 (let ((open-tags (and up t))
416 token-end found)
417 (save-excursion
418 (goto-char from)
419 (while (cond ((or (< (point) nxml-prolog-end)
420 (not (search-backward "<"
421 (max (or bound 0)
422 nxml-prolog-end)
423 t)))
424 (when (and (consp open-tags) (not bound))
425 (nxml-scan-error (cadr open-tags)
426 "End-tag has no start-tag"))
427 nil)
428 ((progn
429 (nxml-move-outside-backwards)
430 (save-excursion
431 (nxml-tokenize-forward)
432 (setq token-end (point)))
433 (eq xmltok-type 'end-tag))
434 (setq open-tags
435 (cons (xmltok-end-tag-qname)
436 (cons xmltok-start open-tags)))
437 t)
438 ((eq xmltok-type 'start-tag)
439 (cond ((not open-tags) nil)
440 ((not (consp open-tags))
441 (setq found token-end)
442 nil)
443 ((and (car open-tags)
444 (not (string= (car open-tags)
445 (xmltok-start-tag-qname))))
446 (nxml-scan-error (1+ xmltok-start)
447 "Mismatched start-tag; \
448 expected `%s'"
449 (car open-tags)))
450 ((setq open-tags (cddr open-tags)) t)
451 (t (setq found token-end) nil)))
452 ((memq xmltok-type '(empty-element
453 partial-empty-element))
454 (if open-tags
455 t
456 (setq found token-end)
457 nil))
458 ((eq xmltok-type 'partial-end-tag)
459 (setq open-tags
460 (cons nil (cons xmltok-start open-tags)))
461 t)
462 ((eq xmltok-type 'partial-start-tag)
463 ;; if we have only a partial-start-tag
464 ;; then it's unlikely that there's a matching
465 ;; end-tag, so it's probably not helpful
466 ;; to treat it as a complete start-tag
467 (nxml-scan-error xmltok-start
468 "Missing `>'"))
469 (t t))))
470 found))
471
472 (defun nxml-scan-error (&rest args)
473 (signal 'nxml-scan-error args))
474
475 (put 'nxml-scan-error
476 'error-conditions
477 '(error nxml-error nxml-scan-error))
478
479 (put 'nxml-scan-error
480 'error-message
481 "Scan over element that is not well-formed")
482
483 (provide 'nxml-rap)
484
485 ;;; nxml-rap.el ends here