* lisp/nxml/nxml-mode.el: Treat unclosed <[[, <?, comment, and other
[bpt/emacs.git] / lisp / nxml / nxml-rap.el
CommitLineData
8cd39fb3
MH
1;;; nxml-rap.el --- low-level support for random access parsing for nXML mode
2
ab422c4d 3;; Copyright (C) 2003-2004, 2007-2013 Free Software Foundation, Inc.
8cd39fb3
MH
4
5;; Author: James Clark
6;; Keywords: XML
7
1159a31a 8;; This file is part of GNU Emacs.
8cd39fb3 9
4936186e 10;; GNU Emacs is free software: you can redistribute it and/or modify
1159a31a 11;; it under the terms of the GNU General Public License as published by
4936186e
GM
12;; the Free Software Foundation, either version 3 of the License, or
13;; (at your option) any later version.
8cd39fb3 14
1159a31a
GM
15;; GNU Emacs is distributed in the hope that it will be useful,
16;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18;; GNU General Public License for more details.
19
20;; You should have received a copy of the GNU General Public License
4936186e 21;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
8cd39fb3
MH
22
23;;; Commentary:
24
25;; This uses xmltok.el to do XML parsing. The fundamental problem is
26;; how to handle changes. We don't want to maintain a complete parse
27;; tree. We also don't want to reparse from the start of the document
28;; on every keystroke. However, it is not possible in general to
29;; parse an XML document correctly starting at a random point in the
30;; middle. The main problems are comments, CDATA sections and
31;; processing instructions: these can all contain things that are
32;; indistinguishable from elements. Literals in the prolog are also a
33;; problem. Attribute value literals are not a problem because
34;; attribute value literals cannot contain less-than signs.
35;;
36;; Our strategy is to keep track of just the problematic things.
37;; Specifically, we keep track of all comments, CDATA sections and
38;; processing instructions in the instance. We do this by marking all
39;; except the first character of these with a non-nil nxml-inside text
40;; property. The value of the nxml-inside property is comment,
41;; cdata-section or processing-instruction. The first character does
42;; not have the nxml-inside property so we can find the beginning of
43;; the construct by looking for a change in a text property value
44;; (Emacs provides primitives for this). We use text properties
45;; rather than overlays, since the implementation of overlays doesn't
46;; look like it scales to large numbers of overlays in a buffer.
47;;
48;; We don't in fact track all these constructs, but only track them in
49;; some initial part of the instance. The variable `nxml-scan-end'
50;; contains the limit of where we have scanned up to for them.
51;;
;; Thus to parse some random point in the file we first ensure that we
;; have scanned up to that point.  Then we search backwards for a
;; <.  Then we check whether the < has an nxml-inside property.  If it
;; does, we go backwards to the first character that does not have an
;; nxml-inside property (this character must be a <).  Then we start
;; parsing forward from the < we have found.
58;;
59;; The prolog has to be parsed specially, so we also keep track of the
60;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
61;; every change to the prolog. This won't work well if people try to
62;; edit huge internal subsets. Hopefully that will be rare.
63;;
64;; We keep track of the changes by adding to the buffer's
65;; after-change-functions hook. Scanning is also done as a
66;; prerequisite to fontification by adding to fontification-functions
67;; (in the same way as jit-lock). This means that scanning for these
68;; constructs had better be quick. Fortunately it is. Firstly, the
69;; typical proportion of comments, CDATA sections and processing
70;; instructions is small relative to other things. Secondly, to scan
71;; we just search for the regexp <[!?].
8cd39fb3
MH
72
73;;; Code:
74
75(require 'xmltok)
76(require 'nxml-util)
77
;; Declared buffer-local below, so each buffer records its own value.
(defvar nxml-prolog-end nil
  "Buffer position (an integer) immediately following the end of the prolog.")
(make-variable-buffer-local 'nxml-prolog-end)
81
(defvar nxml-scan-end nil
  "Marker giving the position up to which the buffer has been scanned.
The following invariants must hold:

- `nxml-scan-end' is >= `nxml-prolog-end'.
- `nxml-scan-end' is not an inside position.  A position is
  \"inside\" if the character following it is part of, but not the
  first character of, a CDATA section, comment or processing
  instruction.
- Every inside position that is >= `nxml-prolog-end' and
  < `nxml-scan-end' has a non-nil `nxml-inside' property whose
  value is a symbol specifying what it is inside.
- Every character with a non-nil `fontified' property has a
  position < `nxml-scan-end' and the correct face.
- Dependent regions are established for any unclosed constructs
  starting before `nxml-scan-end'.
- There are no `nxml-inside' properties after `nxml-scan-end'.")
(make-variable-buffer-local 'nxml-scan-end)
96
(defsubst nxml-get-inside (pos)
  "Return the value of the `nxml-inside' text property at POS.
The value is nil when the character at POS carries no such property."
  ;; A nil OBJECT argument means the current buffer.
  (get-text-property pos 'nxml-inside nil))
99
(defsubst nxml-clear-inside (start end)
  "Remove the `nxml-inside' text property from the text between START and END."
  ;; `nxml-debug-clear-inside' is defined outside this file; presumably
  ;; a debugging aid from nxml-util -- confirm there.
  (nxml-debug-clear-inside start end)
  ;; Only the property name in the plist matters; its value is ignored.
  (remove-text-properties start end '(nxml-inside nil)))
103
(defsubst nxml-set-inside (start end type)
  "Give the text between START and END an `nxml-inside' property of TYPE."
  ;; `nxml-debug-set-inside' is defined outside this file; presumably
  ;; a debugging aid from nxml-util -- confirm there.
  (nxml-debug-set-inside start end)
  (put-text-property start end 'nxml-inside type))
107
(defun nxml-inside-end (pos)
  "Return the end of the inside region containing POS.
The value is nil if the character at POS is not inside."
  (and (nxml-get-inside pos)
       ;; No further change of the property means the region extends
       ;; to the end of the accessible portion of the buffer.
       (or (next-single-property-change pos 'nxml-inside)
           (point-max))))
115
(defun nxml-inside-start (pos)
  "Return the start of the inside region containing POS.
The value is nil if the character at POS is not inside."
  (and (nxml-get-inside pos)
       ;; Search backwards from just after POS so that the character
       ;; at POS itself takes part in the search.
       (or (previous-single-property-change (1+ pos) 'nxml-inside)
           (point-min))))
123
124;;; Change management
125
(defun nxml-scan-after-change (start end)
  "Restore `nxml-scan-end' invariants after a change.
The change happened between START and END.
Return position after which lexical state is unchanged.
END must be > `nxml-prolog-end'.  START must be outside
any \"inside\" regions and at the beginning of a token."
  (cond
   ;; The change lies entirely in text that has not been scanned yet.
   ((>= start nxml-scan-end)
    nxml-scan-end)
   (t
    (let ((stale-from start)            ; start of properties still to clear
          xmltok-errors)
      (while
          (or
           (and (xmltok-forward-special (min end nxml-scan-end))
                (progn
                  (when (memq xmltok-type '(comment
                                            cdata-section
                                            processing-instruction))
                    ;; Clear what precedes this construct, then mark
                    ;; everything but its first character.
                    (nxml-clear-inside stale-from (1+ xmltok-start))
                    (nxml-set-inside (1+ xmltok-start) (point) xmltok-type)
                    (setq stale-from (point)))
                  (cond ((< (point) (min end nxml-scan-end)) t)
                        (t (setq end (point))
                           nil))))
           ;; The end of the change was inside but is now outside.
           ;; Imagine something really weird like
           ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> -->
           ;; and suppose we deleted "<![CDATA[f"
           (let ((region-end (nxml-inside-end end)))
             (and region-end
                  (progn (setq end region-end)
                         t)))))
      (nxml-clear-inside stale-from end))
    (when (> end nxml-scan-end)
      (set-marker nxml-scan-end end))
    end)))
163
1159a31a
GM
;; Forward declaration only: `nxml-prolog-regions' is defined in
;; nxml-mode.el, which is also the only place `nxml-scan-prolog' (the
;; function that assigns it) is called from.
(defvar nxml-prolog-regions)
166
8cd39fb3
MH
(defun nxml-scan-prolog ()
  "Parse the prolog and record where it ends.
Move point to `point-min' and call `xmltok-forward-prolog', storing
its value in `nxml-prolog-regions' and the resulting value of point
in `nxml-prolog-end'.  Remove `nxml-inside' properties from the text
before that position, and move `nxml-scan-end' up to
`nxml-prolog-end' if it was before it.  Point is left where
`xmltok-forward-prolog' put it."
  (goto-char (point-min))
  ;; These tokenizer variables are let-bound so that whatever the
  ;; prolog parse stores in them does not outlive this call.
  (let (xmltok-dtd
        xmltok-errors)
    (setq nxml-prolog-regions (xmltok-forward-prolog)
          nxml-prolog-end (point))
    (nxml-clear-inside (point-min) nxml-prolog-end))
  (and (< nxml-scan-end nxml-prolog-end)
       (set-marker nxml-scan-end nxml-prolog-end)))
176
177
8cd39fb3
MH
178;;; Random access parsing
179
(defun nxml-token-after ()
  "Return the position after the token containing the char after point.
Sets up the variables `xmltok-type', `xmltok-start',
`xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
`xmltok-namespace-attributes' in the same way as does
`xmltok-forward'.  The prolog will be treated as a single token with
type `prolog'."
  (let ((origin (point)))
    (cond
     ;; Anywhere inside the prolog: report the whole prolog as one token.
     ((< origin nxml-prolog-end)
      (setq xmltok-type 'prolog)
      (setq xmltok-start (point-min))
      (min nxml-prolog-end (point-max)))
     (t
      (nxml-ensure-scan-up-to-date)
      (save-excursion
        (cond
         ;; Inside a comment, CDATA section or processing instruction:
         ;; back up to its first character and tokenize it.
         ((nxml-get-inside origin)
          (nxml-move-outside-backwards)
          (xmltok-forward))
         (t
          ;; Pick a safe place to start tokenizing from.
          (cond
           ((or (eq (char-after) ?<)
                (search-backward "<"
                                 (max (point-min) nxml-prolog-end)
                                 t))
            (nxml-move-outside-backwards))
           ((<= (point-min) nxml-prolog-end)
            (goto-char nxml-prolog-end))
           (t
            (goto-char (or (nxml-inside-end (point-min))
                           (point-min)))))
          ;; Tokenize forward until a token ends beyond the original point.
          (while (and (nxml-tokenize-forward)
                      (<= (point) origin)))))
        (point))))))
212
(defun nxml-token-before ()
  "Return the position after the token containing the char before point.
Sets variables like `nxml-token-after'."
  (cond
   ;; No character precedes point: report an empty token of type nil.
   ((= (point) (point-min))
    (setq xmltok-start (point)
          xmltok-type nil)
    (point))
   (t
    (save-excursion
      (goto-char (1- (point)))
      (nxml-token-after)))))
223
(defun nxml-tokenize-forward ()
  "Call `xmltok-forward' at point and return the resulting `xmltok-type'.
If `xmltok-forward' returns non-nil and leaves point beyond
`nxml-scan-end', then a comment, CDATA section or processing
instruction token gets the `nxml-inside' property on all but its
first character, and `nxml-scan-end' is moved to point.
`xmltok-errors' is let-bound around the call, so nothing stored
there is visible to the caller."
  (let (xmltok-errors)
    (when (and (xmltok-forward)
               (> (point) nxml-scan-end))
      (when (memq xmltok-type '(comment
                                cdata-section
                                processing-instruction))
        ;; Setting the property must not mark the buffer as modified.
        (with-silent-modifications
          (nxml-set-inside (1+ xmltok-start) (point) xmltok-type)))
      (set-marker nxml-scan-end (point)))
    xmltok-type))
235
(defun nxml-move-tag-backwards (bound)
  "Move point backwards outside any \"inside\" regions or tags.
Point will not move past `nxml-prolog-end'.
Point will either be at BOUND or a `<' character starting a tag
outside any \"inside\" regions.
As a precondition, point must be >= BOUND.

The return value is unspecified; call this function for its effect
on point only."
  (nxml-move-outside-backwards)
  (unless (eq (char-after) ?<)
    (if (search-backward "<" bound t)
        (progn
          ;; The `<' just found may itself sit inside a comment, CDATA
          ;; section or processing instruction; step out of it.
          (nxml-move-outside-backwards)
          (unless (eq (char-after) ?<)
            ;; If this second search fails too, fall back to BOUND, as
            ;; the outer branch does, so that point always ends up at
            ;; BOUND or at a `<'.
            (or (search-backward "<" bound t)
                (goto-char bound))))
      (goto-char bound))))
250
8cd39fb3
MH
(defun nxml-move-outside-backwards ()
  "Move point to first character of the containing special thing.
Leave point unmoved if it is not inside anything special.
Signal an error if the character moved to still has an `nxml-inside'
property."
  (let ((region-start (nxml-inside-start (point))))
    (when region-start
      ;; The first character of a construct is the one just before the
      ;; start of its `nxml-inside' region.
      (goto-char (1- region-start))
      (let ((leftover (nxml-get-inside (point))))
        (when leftover
          (error "Char before inside-start at %s had nxml-inside property %s"
                 (point)
                 leftover))))))
261
(defun nxml-ensure-scan-up-to-date ()
  "Make sure the buffer has been scanned at least as far as point.
Do nothing if `nxml-scan-end' is already at or after point.
Otherwise call `xmltok-forward-special' repeatedly, starting at
`nxml-scan-end', giving each comment, processing instruction or
CDATA section found the `nxml-inside' property on all but its
first character.  Finally move `nxml-scan-end' to the original
point, or to where the scan stopped if that is at or beyond it.
Point is preserved."
  (let ((target (point)))
    (when (< nxml-scan-end target)
      (save-excursion
        (goto-char nxml-scan-end)
        (let (xmltok-errors)
          (while
              (and (xmltok-forward-special target)
                   (progn
                     (when (memq xmltok-type '(comment
                                               processing-instruction
                                               cdata-section))
                       ;; Setting the property must not mark the
                       ;; buffer as modified.
                       (with-silent-modifications
                         (nxml-set-inside (1+ xmltok-start)
                                          (point)
                                          xmltok-type)))
                     (cond ((< (point) target) t)
                           ;; Reached or passed the target: remember
                           ;; where the scan stopped and finish.
                           (t (setq target (point))
                              nil)))))
          (set-marker nxml-scan-end target))))))
282
283;;; Element scanning
284
(defun nxml-scan-element-forward (from &optional up)
  "Scan forward from FROM over a single balanced element.
The starting position must be between tokens.  Return the position
of the end of the tag that ends the element; `xmltok-start' will
contain the position of the start of that tag.  If UP is non-nil,
scan past the end-tag of the enclosing element instead.  Return nil
if no element is found.  Signal an `nxml-scan-error' if a
well-formedness error prevents scanning.  Point is not moved."
  ;; PENDING describes the start-tags still waiting for an end-tag.  It
  ;; is nil, or t (UP was given and nothing else is open), or a list of
  ;; alternating qname / start-position entries whose final tail is nil
  ;; or t.
  (let ((pending (and up t))
        result)
    (save-excursion
      (goto-char from)
      (while
          (cond
           ((not (nxml-tokenize-forward))
            (when (consp pending)
              (nxml-scan-error (cadr pending)
                               "Start-tag has no end-tag"))
            nil)
           ((eq xmltok-type 'start-tag)
            (setq pending
                  (append (list (xmltok-start-tag-qname) xmltok-start)
                          pending))
            t)
           ((eq xmltok-type 'end-tag)
            (cond ((null pending) nil)
                  ;; Only the implicit enclosing element is open.
                  ((atom pending)
                   (setq result (point))
                   nil)
                  ((not (string= (car pending)
                                 (xmltok-end-tag-qname)))
                   (nxml-scan-error (+ 2 xmltok-start)
                                    "Mismatched end-tag; expected `%s'"
                                    (car pending)))
                  ((setq pending (cddr pending)) t)
                  (t (setq result (point))
                     nil)))
           ((memq xmltok-type '(empty-element
                                partial-empty-element))
            (cond (pending t)
                  (t (setq result (point))
                     nil)))
           ((eq xmltok-type 'partial-end-tag)
            ;; Same as a complete end-tag, but there is no name to check.
            (cond ((null pending) nil)
                  ((atom pending)
                   (setq result (point))
                   nil)
                  ((setq pending (cddr pending)) t)
                  (t (setq result (point))
                     nil)))
           ((eq xmltok-type 'partial-start-tag)
            (nxml-scan-error xmltok-start
                             "Missing `>'"))
           ;; Any other kind of token is simply skipped.
           (t t))))
    result))
335
(defun nxml-scan-element-backward (from &optional up bound)
  "Scan backward from FROM over a single balanced element.
The starting position must be between tokens.  Return the position
of the end of the tag that starts the element; `xmltok-start' will
contain the position of the start of that tag.  If UP is non-nil,
scan past the start-tag of the enclosing element instead.  If BOUND
is non-nil, do not scan back past BOUND.  Return nil if no element
is found.  Signal an `nxml-scan-error' if a well-formedness error
prevents scanning.  Point is not moved."
  ;; PENDING describes the end-tags still waiting for a start-tag.  It
  ;; is nil, or t (UP was given and nothing else is open), or a list of
  ;; alternating qname / start-position entries whose final tail is nil
  ;; or t.  A nil qname stands for a partial end-tag.
  (let ((pending (and up t))
        tag-end result)
    (save-excursion
      (goto-char from)
      (while
          (if (or (< (point) nxml-prolog-end)
                  (not (search-backward "<"
                                        (max (or bound 0)
                                             nxml-prolog-end)
                                        t)))
              ;; Nothing left to scan.
              (progn
                (when (and (consp pending) (not bound))
                  (nxml-scan-error (cadr pending)
                                   "End-tag has no start-tag"))
                nil)
            ;; Tokenize the construct starting at the `<' just found,
            ;; remembering where it ends.
            (nxml-move-outside-backwards)
            (save-excursion
              (nxml-tokenize-forward)
              (setq tag-end (point)))
            (cond
             ((eq xmltok-type 'end-tag)
              (setq pending
                    (append (list (xmltok-end-tag-qname) xmltok-start)
                            pending))
              t)
             ((eq xmltok-type 'start-tag)
              (cond ((null pending) nil)
                    ;; Only the implicit enclosing element is open.
                    ((atom pending)
                     (setq result tag-end)
                     nil)
                    ((and (car pending)
                          (not (string= (car pending)
                                        (xmltok-start-tag-qname))))
                     (nxml-scan-error (1+ xmltok-start)
                                      "Mismatched start-tag; expected `%s'"
                                      (car pending)))
                    ((setq pending (cddr pending)) t)
                    (t (setq result tag-end)
                       nil)))
             ((memq xmltok-type '(empty-element
                                  partial-empty-element))
              (cond (pending t)
                    (t (setq result tag-end)
                       nil)))
             ((eq xmltok-type 'partial-end-tag)
              (setq pending
                    (append (list nil xmltok-start) pending))
              t)
             ((eq xmltok-type 'partial-start-tag)
              ;; if we have only a partial-start-tag
              ;; then it's unlikely that there's a matching
              ;; end-tag, so it's probably not helpful
              ;; to treat it as a complete start-tag
              (nxml-scan-error xmltok-start
                               "Missing `>'"))
             ;; Any other kind of token is simply skipped.
             (t t)))))
    result))
401
(defun nxml-scan-error (&rest args)
  "Signal an `nxml-scan-error' with ARGS as the error data.
This function does not return."
  (signal 'nxml-scan-error args))
404
;; Register `nxml-scan-error' as an error symbol: the message shown
;; for it, and the conditions under which a handler catches it.
(put 'nxml-scan-error
     'error-message
     "Scan over element that is not well-formed")

(put 'nxml-scan-error
     'error-conditions
     '(error nxml-error nxml-scan-error))

(provide 'nxml-rap)
414
415;;; nxml-rap.el ends here