Commit | Line | Data |
---|---|---|
8cd39fb3 MH |
1 | ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode |
2 | ||
ab422c4d | 3 | ;; Copyright (C) 2003-2004, 2007-2013 Free Software Foundation, Inc. |
8cd39fb3 MH |
4 | |
5 | ;; Author: James Clark | |
6 | ;; Keywords: XML | |
7 | ||
1159a31a | 8 | ;; This file is part of GNU Emacs. |
8cd39fb3 | 9 | |
4936186e | 10 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
1159a31a | 11 | ;; it under the terms of the GNU General Public License as published by |
4936186e GM |
12 | ;; the Free Software Foundation, either version 3 of the License, or |
13 | ;; (at your option) any later version. | |
8cd39fb3 | 14 | |
1159a31a GM |
15 | ;; GNU Emacs is distributed in the hope that it will be useful, |
16 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | ;; GNU General Public License for more details. | |
19 | ||
20 | ;; You should have received a copy of the GNU General Public License | |
4936186e | 21 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
8cd39fb3 MH |
22 | |
23 | ;;; Commentary: | |
24 | ||
25 | ;; This uses xmltok.el to do XML parsing. The fundamental problem is | |
26 | ;; how to handle changes. We don't want to maintain a complete parse | |
27 | ;; tree. We also don't want to reparse from the start of the document | |
28 | ;; on every keystroke. However, it is not possible in general to | |
29 | ;; parse an XML document correctly starting at a random point in the | |
30 | ;; middle. The main problems are comments, CDATA sections and | |
31 | ;; processing instructions: these can all contain things that are | |
32 | ;; indistinguishable from elements. Literals in the prolog are also a | |
33 | ;; problem. Attribute value literals are not a problem because | |
34 | ;; attribute value literals cannot contain less-than signs. | |
35 | ;; | |
36 | ;; Our strategy is to keep track of just the problematic things. | |
37 | ;; Specifically, we keep track of all comments, CDATA sections and | |
38 | ;; processing instructions in the instance. We do this by marking all | |
39 | ;; except the first character of these with a non-nil nxml-inside text | |
40 | ;; property. The value of the nxml-inside property is comment, | |
41 | ;; cdata-section or processing-instruction. The first character does | |
42 | ;; not have the nxml-inside property so we can find the beginning of | |
43 | ;; the construct by looking for a change in a text property value | |
44 | ;; (Emacs provides primitives for this). We use text properties | |
45 | ;; rather than overlays, since the implementation of overlays doesn't | |
46 | ;; look like it scales to large numbers of overlays in a buffer. | |
47 | ;; | |
48 | ;; We don't in fact track all these constructs, but only track them in | |
49 | ;; some initial part of the instance. The variable `nxml-scan-end' | |
50 | ;; contains the limit of where we have scanned up to for them. | |
51 | ;; | |
52 | ;; Thus to parse some random point in the file we first ensure that we | |
53 | ;; have scanned up to that point. Then we search backwards for a | |
54 | ;; <. Then we check whether the < has an nxml-inside property. If it | |
55 | ;; does we go backwards to the first character that does not have an | |
56 | ;; nxml-inside property (this character must be a <). Then we start | |
57 | ;; parsing forward from the < we have found. | |
58 | ;; | |
59 | ;; The prolog has to be parsed specially, so we also keep track of the | |
60 | ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on | |
61 | ;; every change to the prolog. This won't work well if people try to | |
62 | ;; edit huge internal subsets. Hopefully that will be rare. | |
63 | ;; | |
64 | ;; We keep track of the changes by adding to the buffer's | |
65 | ;; after-change-functions hook. Scanning is also done as a | |
66 | ;; prerequisite to fontification by adding to fontification-functions | |
67 | ;; (in the same way as jit-lock). This means that scanning for these | |
68 | ;; constructs had better be quick. Fortunately it is. Firstly, the | |
69 | ;; typical proportion of comments, CDATA sections and processing | |
70 | ;; instructions is small relative to other things. Secondly, to scan | |
71 | ;; we just search for the regexp <[!?]. | |
8cd39fb3 MH |
72 | |
73 | ;;; Code: | |
74 | ||
75 | (require 'xmltok) | |
76 | (require 'nxml-util) | |
77 | ||
(defvar nxml-prolog-end nil
  "Buffer position immediately following the end of the prolog.
The value is an integer.")
(make-variable-buffer-local 'nxml-prolog-end)
81 | ||
(defvar nxml-scan-end nil
  "Marker for the position up to which the buffer has been scanned.
The following invariants must hold:
- `nxml-scan-end' is >= `nxml-prolog-end'.
- `nxml-scan-end' is not an inside position.  A position is inside
  when the character following it is part of a CDATA section,
  comment or processing instruction without being the first
  character of that construct.
- Every inside position >= `nxml-prolog-end' and < `nxml-scan-end'
  has a non-nil `nxml-inside' property whose value is a symbol
  specifying what it is inside.
- Every character with a non-nil `fontified' property has a
  position < `nxml-scan-end' and the correct face.
- Dependent regions are established for any unclosed constructs
  starting before `nxml-scan-end'.
- There are no `nxml-inside' properties after `nxml-scan-end'.")
(make-variable-buffer-local 'nxml-scan-end)
96 | ||
(defsubst nxml-get-inside (pos)
  "Return the value of the `nxml-inside' text property at POS.
The value is nil when the character at POS has no such property."
  (get-text-property pos 'nxml-inside))
99 | ||
(defsubst nxml-clear-inside (start end)
  "Remove the `nxml-inside' text property between START and END.
`nxml-debug-clear-inside' is invoked on the same range first."
  (nxml-debug-clear-inside start end)
  (remove-text-properties start end '(nxml-inside nil)))
103 | ||
(defsubst nxml-set-inside (start end type)
  "Set the `nxml-inside' text property to TYPE between START and END.
`nxml-debug-set-inside' is invoked on the same range first."
  (nxml-debug-set-inside start end)
  (put-text-property start end 'nxml-inside type))
107 | ||
(defun nxml-inside-end (pos)
  "Return the position at which the inside region containing POS ends.
That is the next position where the `nxml-inside' property changes,
or `point-max' if it does not change again.
Return nil if the character at POS is not inside."
  (and (nxml-get-inside pos)
       (or (next-single-property-change pos 'nxml-inside)
           (point-max))))
115 | ||
(defun nxml-inside-start (pos)
  "Return the position at which the inside region containing POS starts.
That is the nearest position before (1+ POS) where the `nxml-inside'
property changes, or `point-min' if there is no such change.
Return nil if the character at POS is not inside."
  (and (nxml-get-inside pos)
       (or (previous-single-property-change (1+ pos) 'nxml-inside)
           (point-min))))
123 | ||
124 | ;;; Change management | |
125 | ||
(defun nxml-scan-after-change (start end)
  "Re-establish the invariants of `nxml-scan-end' after a buffer change.
START and END delimit the changed text.  END must be greater than
`nxml-prolog-end'; START must be at the beginning of a token and
outside every inside region.
Return the position after which the lexical state is unchanged."
  (if (>= start nxml-scan-end)
      ;; Nothing at or after START has been scanned yet.
      nxml-scan-end
    (let ((stale-from start)
          xmltok-errors)
      (while (or (and (xmltok-forward-special (min end nxml-scan-end))
                      (progn
                        (when (memq xmltok-type
                                    '(comment
                                      cdata-section
                                      processing-instruction))
                          ;; Drop outdated marks in front of this construct,
                          ;; then mark everything but its first character.
                          (nxml-clear-inside stale-from (1+ xmltok-start))
                          (nxml-set-inside (1+ xmltok-start)
                                           (point)
                                           xmltok-type)
                          (setq stale-from (point)))
                        ;; Keep going while still short of the limit;
                        ;; otherwise remember how far we got.
                        (or (< (point) (min end nxml-scan-end))
                            (progn
                              (setq end (point))
                              nil))))
                 ;; The end of the change was inside but is now outside.
                 ;; Imagine something really weird like
                 ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> -->
                 ;; and suppose we deleted "<![CDATA[f"
                 (let ((region-end (nxml-inside-end end)))
                   (when region-end
                     (setq end region-end)
                     t))))
      (nxml-clear-inside stale-from end))
    (when (> end nxml-scan-end)
      (set-marker nxml-scan-end end))
    end))
163 | ||
1159a31a GM |
;; `nxml-scan-prolog' is only called from nxml-mode.el, where this
;; variable is defined.
(defvar nxml-prolog-regions)

(defun nxml-scan-prolog ()
  "Parse the prolog, starting from `point-min'.
Point is moved to `point-min' and then advanced by
`xmltok-forward-prolog', whose value is stored in
`nxml-prolog-regions'.  The resulting point is stored in
`nxml-prolog-end', and `nxml-inside' properties are cleared between
`point-min' and that position.  If `nxml-scan-end' is before
`nxml-prolog-end', it is moved up to it."
  (goto-char (point-min))
  (let (xmltok-dtd
        xmltok-errors)
    (setq nxml-prolog-regions (xmltok-forward-prolog)
          nxml-prolog-end (point))
    (nxml-clear-inside (point-min) nxml-prolog-end))
  (when (< nxml-scan-end nxml-prolog-end)
    (set-marker nxml-scan-end nxml-prolog-end)))
176 | ||
177 | ||
8cd39fb3 MH |
178 | ;;; Random access parsing |
179 | ||
(defun nxml-token-after ()
  "Return the position after the token containing the char after point.
Set the variables `xmltok-type', `xmltok-start', `xmltok-name-end',
`xmltok-name-colon', `xmltok-attributes' and
`xmltok-namespace-attributes' in the same way as `xmltok-forward'
does.  The prolog is treated as a single token with type `prolog'."
  (let ((origin (point)))
    (cond
     ((< origin nxml-prolog-end)
      ;; Point is in the prolog: report the whole prolog as one token.
      (setq xmltok-type 'prolog
            xmltok-start (point-min))
      (min nxml-prolog-end (point-max)))
     (t
      (nxml-ensure-scan-up-to-date)
      (save-excursion
        (cond
         ((nxml-get-inside origin)
          ;; Point is within a marked construct: tokenize that
          ;; construct from its first character.
          (nxml-move-outside-backwards)
          (xmltok-forward))
         (t
          ;; Restart from the nearest `<' at or before point (stepping
          ;; out of any inside region it belongs to); failing that,
          ;; restart from the end of the prolog or the start of the
          ;; accessible text.
          (if (or (eq (char-after) ?<)
                  (search-backward "<"
                                   (max (point-min) nxml-prolog-end)
                                   t))
              (nxml-move-outside-backwards)
            (goto-char (if (<= (point-min) nxml-prolog-end)
                           nxml-prolog-end
                         (or (nxml-inside-end (point-min))
                             (point-min)))))
          ;; Tokenize forward until a token ends beyond ORIGIN.
          (while (and (nxml-tokenize-forward)
                      (<= (point) origin)))))
        (point))))))
212 | ||
(defun nxml-token-before ()
  "Return the position after the token containing the char before point.
Set the same variables as `nxml-token-after'.  When point is at
`point-min' there is no preceding char: `xmltok-start' is set to
point, `xmltok-type' to nil, and point is returned."
  (cond
   ((= (point) (point-min))
    (setq xmltok-start (point)
          xmltok-type nil)
    (point))
   (t
    (save-excursion
      (goto-char (1- (point)))
      (nxml-token-after)))))
223 | ||
(defun nxml-tokenize-forward ()
  "Scan one token forward with `xmltok-forward' and return `xmltok-type'.
If `xmltok-forward' returns non-nil and point is then beyond
`nxml-scan-end', record the progress: a comment, CDATA section or
processing instruction gets the `nxml-inside' property on all but
its first character, and `nxml-scan-end' is moved to point."
  (let (xmltok-errors)
    (when (and (xmltok-forward)
               (> (point) nxml-scan-end))
      (when (memq xmltok-type '(comment
                                cdata-section
                                processing-instruction))
        (with-silent-modifications
          (nxml-set-inside (1+ xmltok-start) (point) xmltok-type)))
      (set-marker nxml-scan-end (point)))
    xmltok-type))
235 | ||
(defun nxml-move-tag-backwards (bound)
  "Move point backwards outside any inside regions or tags.
Point will not move past `nxml-prolog-end'.
Point will either be at BOUND or at a `<' character starting a tag
outside any inside regions.
As a precondition, point must be >= BOUND."
  (nxml-move-outside-backwards)
  (unless (eq (char-after) ?<)
    (cond
     ((search-backward "<" bound t)
      ;; The `<' just found may itself lie in an inside region.
      (nxml-move-outside-backwards)
      (unless (eq (char-after) ?<)
        (search-backward "<" bound t)))
     (t
      (goto-char bound)))))
250 | ||
8cd39fb3 MH |
(defun nxml-move-outside-backwards ()
  "Move point to the first character of the containing special thing.
Leave point unmoved if it is not inside anything special.
Signal an error if the character moved to itself carries an
`nxml-inside' property."
  (let ((region-start (nxml-inside-start (point))))
    (when region-start
      ;; The first character of a construct is not marked, so it sits
      ;; just before the start of the marked region.
      (goto-char (1- region-start))
      (let ((stray (nxml-get-inside (point))))
        (when stray
          (error "Char before inside-start at %s had nxml-inside property %s"
                 (point)
                 stray))))))
261 | ||
(defun nxml-ensure-scan-up-to-date ()
  "Extend the scanned region so that it covers point.
Do nothing unless `nxml-scan-end' is before point.  Otherwise call
`xmltok-forward-special' repeatedly starting from `nxml-scan-end',
giving each comment, processing instruction or CDATA section found
the `nxml-inside' property on all but its first character.  Finally
move `nxml-scan-end' to point, or to where scanning stopped if that
is at or beyond point.  Point itself is preserved."
  (let ((target (point)))
    (when (< nxml-scan-end target)
      (save-excursion
        (goto-char nxml-scan-end)
        (let (xmltok-errors)
          (while (and (xmltok-forward-special target)
                      (progn
                        (when (memq xmltok-type
                                    '(comment
                                      processing-instruction
                                      cdata-section))
                          (with-silent-modifications
                            (nxml-set-inside (1+ xmltok-start)
                                             (point)
                                             xmltok-type)))
                        ;; Continue while short of TARGET; otherwise
                        ;; remember where scanning stopped.
                        (or (< (point) target)
                            (progn
                              (setq target (point))
                              nil)))))
          (set-marker nxml-scan-end target))))))
282 | ||
283 | ;;; Element scanning | |
284 | ||
(defun nxml-scan-element-forward (from &optional up)
  "Scan forward from FROM over a single balanced element.
Point must be between tokens.  Return the position of the end of
the tag that ends the element; `xmltok-start' will then contain the
position of the start of that tag.  If UP is non-nil, scan past the
end-tag of the element containing point instead.  Return nil if no
element is found.  Signal an `nxml-scan-error' if a well-formedness
error prevents scanning.  Point is not moved."
  ;; PENDING is nil, or t (UP: one enclosing element is implicitly
  ;; open), or a list alternating the qname and start position of each
  ;; open start-tag, innermost first, whose tail is that nil or t.
  (let ((pending (and up t))
        result)
    (save-excursion
      (goto-char from)
      (while
          (if (not (nxml-tokenize-forward))
              (progn
                ;; No further token, but explicit start-tags remain open.
                (when (consp pending)
                  (nxml-scan-error (cadr pending)
                                   "Start-tag has no end-tag"))
                nil)
            (cond
             ((eq xmltok-type 'start-tag)
              (setq pending
                    (cons (xmltok-start-tag-qname)
                          (cons xmltok-start pending)))
              t)
             ((memq xmltok-type '(end-tag partial-end-tag))
              (cond ((not pending) nil)
                    ((not (consp pending))
                     (setq result (point))
                     nil)
                    ;; Only a complete end-tag has a name to compare.
                    ((and (eq xmltok-type 'end-tag)
                          (not (string= (car pending)
                                        (xmltok-end-tag-qname))))
                     (nxml-scan-error (+ 2 xmltok-start)
                                      "Mismatched end-tag; expected `%s'"
                                      (car pending)))
                    ((setq pending (cddr pending)) t)
                    (t (setq result (point)) nil)))
             ((memq xmltok-type '(empty-element
                                  partial-empty-element))
              (cond (pending t)
                    (t (setq result (point)) nil)))
             ((eq xmltok-type 'partial-start-tag)
              (nxml-scan-error xmltok-start
                               "Missing `>'"))
             (t t)))))
    result))
335 | ||
(defun nxml-scan-element-backward (from &optional up bound)
  "Scan backward from FROM over a single balanced element.
Point must be between tokens.  Return the position of the end of
the tag that starts the element; `xmltok-start' will then contain
the position of the start of that tag.  If UP is non-nil, scan past
the start-tag of the element containing point instead.  If BOUND is
non-nil, do not scan back past BOUND.  Return nil if no element is
found.  Signal an `nxml-scan-error' if a well-formedness error
prevents scanning.  Point is not moved."
  ;; PENDING is nil, or t (UP: one enclosing element is implicitly
  ;; open), or a list alternating the qname (nil for a partial
  ;; end-tag) and start position of each end-tag still awaiting its
  ;; start-tag, whose tail is that nil or t.
  (let ((pending (and up t))
        token-end result)
    (save-excursion
      (goto-char from)
      (while
          (if (or (< (point) nxml-prolog-end)
                  (not (search-backward "<"
                                        (max (or bound 0)
                                             nxml-prolog-end)
                                        t)))
              (progn
                ;; Nothing further back to look at.  Unmatched end-tags
                ;; are reported only when the search was unbounded.
                (when (and (consp pending) (not bound))
                  (nxml-scan-error (cadr pending)
                                   "End-tag has no start-tag"))
                nil)
            ;; Tokenize the construct starting at the `<' just found,
            ;; remembering where it ends but leaving point at its start.
            (nxml-move-outside-backwards)
            (save-excursion
              (nxml-tokenize-forward)
              (setq token-end (point)))
            (cond
             ((memq xmltok-type '(end-tag partial-end-tag))
              (setq pending
                    (cons (and (eq xmltok-type 'end-tag)
                               (xmltok-end-tag-qname))
                          (cons xmltok-start pending)))
              t)
             ((eq xmltok-type 'start-tag)
              (cond ((not pending) nil)
                    ((not (consp pending))
                     (setq result token-end)
                     nil)
                    ;; A nil name came from a partial end-tag and
                    ;; matches any start-tag.
                    ((and (car pending)
                          (not (string= (car pending)
                                        (xmltok-start-tag-qname))))
                     (nxml-scan-error (1+ xmltok-start)
                                      "Mismatched start-tag; expected `%s'"
                                      (car pending)))
                    ((setq pending (cddr pending)) t)
                    (t (setq result token-end) nil)))
             ((memq xmltok-type '(empty-element
                                  partial-empty-element))
              (cond (pending t)
                    (t (setq result token-end) nil)))
             ((eq xmltok-type 'partial-start-tag)
              ;; if we have only a partial-start-tag
              ;; then it's unlikely that there's a matching
              ;; end-tag, so it's probably not helpful
              ;; to treat it as a complete start-tag
              (nxml-scan-error xmltok-start
                               "Missing `>'"))
             (t t)))))
    result))
401 | ||
;; Register `nxml-scan-error' as an error symbol whose parent
;; conditions are `nxml-error' and `error'.
(put 'nxml-scan-error
     'error-conditions
     '(error nxml-error nxml-scan-error))

(put 'nxml-scan-error
     'error-message
     "Scan over element that is not well-formed")

(defun nxml-scan-error (&rest args)
  "Signal an `nxml-scan-error' whose error data is the list ARGS."
  (signal 'nxml-scan-error args))
412 | ||
413 | (provide 'nxml-rap) | |
414 | ||
415 | ;;; nxml-rap.el ends here |