Mercurial > emacs
annotate lisp/nxml/nxml-rap.el @ 96166:b6a2f2a3a2c0
Fix rst.el attribution.
author | Glenn Morris <rgm@gnu.org> |
---|---|
date | Sat, 21 Jun 2008 19:39:01 +0000 |
parents | 8c4c0ca00399 |
children | e374c747704b |
rev | line source |
---|---|
86361 | 1 ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode |
2 | |
87665 | 3 ;; Copyright (C) 2003, 2004, 2007, 2008 Free Software Foundation, Inc. |
86361 | 4 |
5 ;; Author: James Clark | |
6 ;; Keywords: XML | |
7 | |
86542 | 8 ;; This file is part of GNU Emacs. |
9 | |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
10 ;; GNU Emacs is free software: you can redistribute it and/or modify |
86542 | 11 ;; it under the terms of the GNU General Public License as published by |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
12 ;; the Free Software Foundation, either version 3 of the License, or |
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
13 ;; (at your option) any later version. |
86361 | 14 |
86542 | 15 ;; GNU Emacs is distributed in the hope that it will be useful, |
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 ;; GNU General Public License for more details. | |
86361 | 19 |
86542 | 20 ;; You should have received a copy of the GNU General Public License |
94666
d495d4d5452f
Switch to recommended form of GPLv3 permissions notice.
Glenn Morris <rgm@gnu.org>
parents:
87665
diff
changeset
|
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
86361 | 22 |
23 ;;; Commentary: | |
24 | |
25 ;; This uses xmltok.el to do XML parsing. The fundamental problem is | |
26 ;; how to handle changes. We don't want to maintain a complete parse | |
27 ;; tree. We also don't want to reparse from the start of the document | |
28 ;; on every keystroke. However, it is not possible in general to | |
29 ;; parse an XML document correctly starting at a random point in the | |
30 ;; middle. The main problems are comments, CDATA sections and | |
31 ;; processing instructions: these can all contain things that are | |
32 ;; indistinguishable from elements. Literals in the prolog are also a | |
33 ;; problem. Attribute value literals are not a problem because | |
34 ;; attribute value literals cannot contain less-than signs. | |
35 ;; | |
36 ;; Our strategy is to keep track of just the problematic things. | |
37 ;; Specifically, we keep track of all comments, CDATA sections and | |
38 ;; processing instructions in the instance. We do this by marking all | |
39 ;; except the first character of these with a non-nil nxml-inside text | |
40 ;; property. The value of the nxml-inside property is comment, | |
41 ;; cdata-section or processing-instruction. The first character does | |
42 ;; not have the nxml-inside property so we can find the beginning of | |
43 ;; the construct by looking for a change in a text property value | |
44 ;; (Emacs provides primitives for this). We use text properties | |
45 ;; rather than overlays, since the implementation of overlays doesn't | |
46 ;; look like it scales to large numbers of overlays in a buffer. | |
47 ;; | |
48 ;; We don't in fact track all these constructs, but only track them in | |
49 ;; some initial part of the instance. The variable `nxml-scan-end' | |
50 ;; contains the limit of where we have scanned up to for them. | |
51 ;; | |
52 ;; Thus to parse some random point in the file we first ensure that we | |
53 ;; have scanned up to that point. Then we search backwards for a | |
54 ;; <. Then we check whether the < has an nxml-inside property. If it | |
55 ;; does we go backwards to first character that does not have an | |
56 ;; nxml-inside property (this character must be a <). Then we start | |
57 ;; parsing forward from the < we have found. | |
58 ;; | |
59 ;; The prolog has to be parsed specially, so we also keep track of the | |
60 ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on | |
61 ;; every change to the prolog. This won't work well if people try to | |
62 ;; edit huge internal subsets. Hopefully that will be rare. | |
63 ;; | |
64 ;; We keep track of the changes by adding to the buffer's | |
65 ;; after-change-functions hook. Scanning is also done as a | |
66 ;; prerequisite to fontification by adding to fontification-functions | |
67 ;; (in the same way as jit-lock). This means that scanning for these | |
68 ;; constructs had better be quick. Fortunately it is. Firstly, the | |
69 ;; typical proportion of comments, CDATA sections and processing | |
70 ;; instructions is small relative to other things. Secondly, to scan | |
71 ;; we just search for the regexp <[!?]. | |
72 ;; | |
73 ;; One problem is unclosed comments, processing instructions and CDATA | |
74 ;; sections. Suppose, for example, we encounter a <!-- but there's no | |
75 ;; matching -->. This is not an unexpected situation if the user is | |
76 ;; creating a comment. It is not helpful to treat the whole of the | |
77 ;; file starting from the <!-- onwards as a single unclosed comment | |
78 ;; token. Instead we treat just the <!-- as a piece of not well-formed | |
79 ;; markup and continue. The problem is that if at some later stage a | |
80 ;; --> gets added to the buffer after the unclosed <!--, we will need | |
81 ;; to reparse the buffer starting from the <!--. We need to keep | |
82 ;; track of these reparse dependencies; they are called dependent | |
83 ;; regions in the code. | |
84 | |
85 ;;; Code: | |
86 | |
87 (require 'xmltok) | |
88 (require 'nxml-util) | |
89 | |
90 (defvar nxml-prolog-end nil | |
91 "Integer giving position following end of the prolog.") | |
92 (make-variable-buffer-local 'nxml-prolog-end) | |
93 | |
94 (defvar nxml-scan-end nil | |
95 "Marker giving position up to which we have scanned. | |
96 nxml-scan-end must be >= nxml-prolog-end. Furthermore, nxml-scan-end | |
97 must not an inside position in the following sense. A position is | |
98 inside if the following character is a part of, but not the first | |
99 character of, a CDATA section, comment or processing instruction. | |
100 Furthermore all positions >= nxml-prolog-end and < nxml-scan-end that | |
101 are inside positions must have a non-nil nxml-inside property whose | |
102 value is a symbol specifying what it is inside. Any characters with a | |
103 non-nil fontified property must have position < nxml-scan-end and the | |
104 correct face. Dependent regions must also be established for any | |
105 unclosed constructs starting before nxml-scan-end. | |
106 There must be no nxml-inside properties after nxml-scan-end.") | |
107 (make-variable-buffer-local 'nxml-scan-end) | |
108 | |
109 (defsubst nxml-get-inside (pos) | |
110 (get-text-property pos 'nxml-inside)) | |
111 | |
112 (defsubst nxml-clear-inside (start end) | |
95598 | 113 (nxml-debug-clear-inside start end) |
86361 | 114 (remove-text-properties start end '(nxml-inside nil))) |
115 | |
116 (defsubst nxml-set-inside (start end type) | |
95598 | 117 (nxml-debug-set-inside start end) |
86361 | 118 (put-text-property start end 'nxml-inside type)) |
119 | |
120 (defun nxml-inside-end (pos) | |
121 "Return the end of the inside region containing POS. | |
122 Return nil if the character at POS is not inside." | |
123 (if (nxml-get-inside pos) | |
124 (or (next-single-property-change pos 'nxml-inside) | |
125 (point-max)) | |
126 nil)) | |
127 | |
128 (defun nxml-inside-start (pos) | |
129 "Return the start of the inside region containing POS. | |
130 Return nil if the character at POS is not inside." | |
131 (if (nxml-get-inside pos) | |
132 (or (previous-single-property-change (1+ pos) 'nxml-inside) | |
133 (point-min)) | |
134 nil)) | |
135 | |
136 ;;; Change management | |
137 | |
138 (defun nxml-scan-after-change (start end) | |
139 "Restore `nxml-scan-end' invariants after a change. | |
140 The change happened between START and END. | |
141 Return position after which lexical state is unchanged. | |
95598 | 142 END must be > nxml-prolog-end. START must be outside |
143 any 'inside' regions and at the beginning of a token." | |
86361 | 144 (if (>= start nxml-scan-end) |
145 nxml-scan-end | |
146 (let ((inside-remove-start start) | |
147 xmltok-errors | |
148 xmltok-dependent-regions) | |
149 (while (or (when (xmltok-forward-special (min end nxml-scan-end)) | |
150 (when (memq xmltok-type | |
151 '(comment | |
152 cdata-section | |
153 processing-instruction)) | |
154 (nxml-clear-inside inside-remove-start | |
155 (1+ xmltok-start)) | |
156 (nxml-set-inside (1+ xmltok-start) | |
157 (point) | |
158 xmltok-type) | |
159 (setq inside-remove-start (point))) | |
160 (if (< (point) (min end nxml-scan-end)) | |
161 t | |
162 (setq end (point)) | |
163 nil)) | |
164 ;; The end of the change was inside but is now outside. | |
165 ;; Imagine something really weird like | |
166 ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> --> | |
167 ;; and suppose we deleted "<![CDATA[f" | |
168 (let ((inside-end (nxml-inside-end end))) | |
169 (when inside-end | |
170 (setq end inside-end) | |
171 t)))) | |
172 (nxml-clear-inside inside-remove-start end) | |
173 (nxml-clear-dependent-regions start end) | |
174 (nxml-mark-parse-dependent-regions)) | |
175 (when (> end nxml-scan-end) | |
176 (set-marker nxml-scan-end end)) | |
177 end)) | |
178 | |
86542 | 179 ;; n-s-p only called from nxml-mode.el, where this variable is defined. |
180 (defvar nxml-prolog-regions) | |
181 | |
86361 | 182 (defun nxml-scan-prolog () |
183 (goto-char (point-min)) | |
184 (let (xmltok-dtd | |
185 xmltok-errors | |
186 xmltok-dependent-regions) | |
187 (setq nxml-prolog-regions (xmltok-forward-prolog)) | |
188 (setq nxml-prolog-end (point)) | |
189 (nxml-clear-inside (point-min) nxml-prolog-end) | |
190 (nxml-clear-dependent-regions (point-min) nxml-prolog-end) | |
191 (nxml-mark-parse-dependent-regions)) | |
192 (when (< nxml-scan-end nxml-prolog-end) | |
193 (set-marker nxml-scan-end nxml-prolog-end))) | |
194 | |
195 | |
196 ;;; Dependent regions | |
197 | |
198 (defun nxml-adjust-start-for-dependent-regions (start end pre-change-length) | |
199 (let ((overlays (overlays-in (1- start) start)) | |
200 (adjusted-start start)) | |
201 (while overlays | |
202 (let* ((overlay (car overlays)) | |
203 (ostart (overlay-start overlay))) | |
204 (when (and (eq (overlay-get overlay 'category) 'nxml-dependent) | |
205 (< ostart adjusted-start)) | |
206 (let ((funargs (overlay-get overlay 'nxml-funargs))) | |
207 (when (apply (car funargs) | |
208 (append (list start | |
209 end | |
210 pre-change-length | |
211 ostart | |
212 (overlay-end overlay)) | |
213 (cdr funargs))) | |
214 (setq adjusted-start ostart))))) | |
215 (setq overlays (cdr overlays))) | |
216 adjusted-start)) | |
95598 | 217 |
86361 | 218 (defun nxml-mark-parse-dependent-regions () |
219 (while xmltok-dependent-regions | |
220 (apply 'nxml-mark-parse-dependent-region | |
221 (car xmltok-dependent-regions)) | |
222 (setq xmltok-dependent-regions | |
223 (cdr xmltok-dependent-regions)))) | |
224 | |
225 (defun nxml-mark-parse-dependent-region (fun start end &rest args) | |
226 (let ((overlay (make-overlay start end nil t t))) | |
227 (overlay-put overlay 'category 'nxml-dependent) | |
228 (overlay-put overlay 'nxml-funargs (cons fun args)))) | |
229 | |
230 (put 'nxml-dependent 'evaporate t) | |
231 | |
232 (defun nxml-clear-dependent-regions (start end) | |
233 (let ((overlays (overlays-in start end))) | |
234 (while overlays | |
235 (let* ((overlay (car overlays)) | |
236 (category (overlay-get overlay 'category))) | |
237 (when (and (eq category 'nxml-dependent) | |
238 (<= start (overlay-start overlay))) | |
239 (delete-overlay overlay))) | |
240 (setq overlays (cdr overlays))))) | |
241 | |
242 ;;; Random access parsing | |
243 | |
244 (defun nxml-token-after () | |
245 "Return the position after the token containing the char after point. | |
246 Sets up the variables `xmltok-type', `xmltok-start', | |
247 `xmltok-name-end', `xmltok-name-colon', `xmltok-attributes', | |
248 `xmltok-namespace-attributes' in the same was as does | |
249 `xmltok-forward'. The prolog will be treated as a single token with | |
250 type `prolog'." | |
251 (let ((pos (point))) | |
252 (if (< pos nxml-prolog-end) | |
253 (progn | |
254 (setq xmltok-type 'prolog | |
255 xmltok-start (point-min)) | |
256 (min nxml-prolog-end (point-max))) | |
257 (nxml-ensure-scan-up-to-date) | |
258 (if (nxml-get-inside pos) | |
259 (save-excursion | |
260 (nxml-move-outside-backwards) | |
261 (xmltok-forward) | |
262 (point)) | |
263 (save-excursion | |
264 (if (or (eq (char-after) ?<) | |
265 (search-backward "<" | |
266 (max (point-min) nxml-prolog-end) | |
267 t)) | |
268 (nxml-move-outside-backwards) | |
269 (goto-char (if (<= (point-min) nxml-prolog-end) | |
270 nxml-prolog-end | |
271 (or (nxml-inside-end (point-min)) | |
272 (point-min))))) | |
273 (while (and (nxml-tokenize-forward) | |
274 (<= (point) pos))) | |
275 (point)))))) | |
276 | |
277 (defun nxml-token-before () | |
278 "Return the position after the token containing the char before point. | |
279 Sets variables like `nxml-token-after'." | |
280 (if (/= (point-min) (point)) | |
281 (save-excursion | |
282 (goto-char (1- (point))) | |
283 (nxml-token-after)) | |
284 (setq xmltok-start (point)) | |
285 (setq xmltok-type nil) | |
286 (point))) | |
287 | |
288 (defun nxml-tokenize-forward () | |
289 (let (xmltok-dependent-regions | |
290 xmltok-errors) | |
291 (when (and (xmltok-forward) | |
292 (> (point) nxml-scan-end)) | |
293 (cond ((memq xmltok-type '(comment | |
294 cdata-section | |
295 processing-instruction)) | |
296 (nxml-with-unmodifying-text-property-changes | |
297 (nxml-set-inside (1+ xmltok-start) (point) xmltok-type))) | |
298 (xmltok-dependent-regions | |
299 (nxml-mark-parse-dependent-regions))) | |
300 (set-marker nxml-scan-end (point))) | |
301 xmltok-type)) | |
302 | |
95598 | 303 (defun nxml-move-tag-backwards (bound) |
304 "Move point backwards outside any 'inside' regions or tags, up | |
305 to nxml-prolog-end. Point will either be at bound or a '<' | |
306 character starting a tag outside any 'inside' regions. Ignores | |
307 dependent regions. As a precondition, point must be >= bound." | |
308 (nxml-move-outside-backwards) | |
309 (when (not (equal (char-after) ?<)) | |
310 (if (search-backward "<" bound t) | |
311 (progn | |
312 (nxml-move-outside-backwards) | |
313 (when (not (equal (char-after) ?<)) | |
314 (search-backward "<" bound t))) | |
315 (goto-char bound)))) | |
316 | |
86361 | 317 (defun nxml-move-outside-backwards () |
318 "Move point to first character of the containing special thing. | |
319 Leave point unmoved if it is not inside anything special." | |
320 (let ((start (nxml-inside-start (point)))) | |
321 (when start | |
322 (goto-char (1- start)) | |
323 (when (nxml-get-inside (point)) | |
324 (error "Char before inside-start at %s had nxml-inside property %s" | |
325 (point) | |
326 (nxml-get-inside (point))))))) | |
327 | |
328 (defun nxml-ensure-scan-up-to-date () | |
329 (let ((pos (point))) | |
330 (when (< nxml-scan-end pos) | |
331 (save-excursion | |
332 (goto-char nxml-scan-end) | |
333 (let (xmltok-errors | |
334 xmltok-dependent-regions) | |
335 (while (when (xmltok-forward-special pos) | |
336 (when (memq xmltok-type | |
337 '(comment | |
338 processing-instruction | |
339 cdata-section)) | |
340 (nxml-with-unmodifying-text-property-changes | |
341 (nxml-set-inside (1+ xmltok-start) | |
342 (point) | |
343 xmltok-type))) | |
344 (if (< (point) pos) | |
345 t | |
346 (setq pos (point)) | |
347 nil))) | |
348 (nxml-clear-dependent-regions nxml-scan-end pos) | |
349 (nxml-mark-parse-dependent-regions) | |
350 (set-marker nxml-scan-end pos)))))) | |
351 | |
352 ;;; Element scanning | |
353 | |
354 (defun nxml-scan-element-forward (from &optional up) | |
355 "Scan forward from FROM over a single balanced element. | |
356 Point must between tokens. Return the position of the end of the tag | |
357 that ends the element. `xmltok-start' will contain the position of the | |
358 start of the tag. If UP is non-nil, then scan past end-tag of element | |
359 containing point. If no element is found, return nil. If a | |
360 well-formedness error prevents scanning, signal an nxml-scan-error. | |
361 Point is not moved." | |
362 (let ((open-tags (and up t)) | |
363 found) | |
364 (save-excursion | |
365 (goto-char from) | |
366 (while (cond ((not (nxml-tokenize-forward)) | |
367 (when (consp open-tags) | |
368 (nxml-scan-error (cadr open-tags) | |
369 "Start-tag has no end-tag")) | |
370 nil) | |
371 ((eq xmltok-type 'start-tag) | |
372 (setq open-tags | |
373 (cons (xmltok-start-tag-qname) | |
374 (cons xmltok-start | |
375 open-tags))) | |
376 t) | |
377 ((eq xmltok-type 'end-tag) | |
378 (cond ((not open-tags) nil) | |
379 ((not (consp open-tags)) (setq found (point)) nil) | |
380 ((not (string= (car open-tags) | |
381 (xmltok-end-tag-qname))) | |
382 (nxml-scan-error (+ 2 xmltok-start) | |
383 "Mismatched end-tag; \ | |
384 expected `%s'" | |
385 (car open-tags))) | |
386 ((setq open-tags (cddr open-tags)) t) | |
387 (t (setq found (point)) nil))) | |
388 ((memq xmltok-type '(empty-element | |
389 partial-empty-element)) | |
390 (if open-tags | |
391 t | |
392 (setq found (point)) | |
393 nil)) | |
394 ((eq xmltok-type 'partial-end-tag) | |
395 (cond ((not open-tags) nil) | |
396 ((not (consp open-tags)) (setq found (point)) nil) | |
397 ((setq open-tags (cddr open-tags)) t) | |
398 (t (setq found (point)) nil))) | |
399 ((eq xmltok-type 'partial-start-tag) | |
400 (nxml-scan-error xmltok-start | |
401 "Missing `>'")) | |
402 (t t)))) | |
403 found)) | |
404 | |
405 (defun nxml-scan-element-backward (from &optional up bound) | |
406 "Scan backward from FROM over a single balanced element. | |
407 Point must between tokens. Return the position of the end of the tag | |
408 that starts the element. `xmltok-start' will contain the position of | |
409 the start of the tag. If UP is non-nil, then scan past start-tag of | |
410 element containing point. If BOUND is non-nil, then don't scan back | |
411 past BOUND. If no element is found, return nil. If a well-formedness | |
412 error prevents scanning, signal an nxml-scan-error. Point is not | |
413 moved." | |
414 (let ((open-tags (and up t)) | |
415 token-end found) | |
416 (save-excursion | |
417 (goto-char from) | |
418 (while (cond ((or (< (point) nxml-prolog-end) | |
419 (not (search-backward "<" | |
420 (max (or bound 0) | |
421 nxml-prolog-end) | |
422 t))) | |
423 (when (and (consp open-tags) (not bound)) | |
424 (nxml-scan-error (cadr open-tags) | |
425 "End-tag has no start-tag")) | |
426 nil) | |
427 ((progn | |
428 (nxml-move-outside-backwards) | |
429 (save-excursion | |
430 (nxml-tokenize-forward) | |
431 (setq token-end (point))) | |
432 (eq xmltok-type 'end-tag)) | |
433 (setq open-tags | |
434 (cons (xmltok-end-tag-qname) | |
435 (cons xmltok-start open-tags))) | |
436 t) | |
437 ((eq xmltok-type 'start-tag) | |
438 (cond ((not open-tags) nil) | |
439 ((not (consp open-tags)) | |
440 (setq found token-end) | |
441 nil) | |
442 ((and (car open-tags) | |
443 (not (string= (car open-tags) | |
444 (xmltok-start-tag-qname)))) | |
445 (nxml-scan-error (1+ xmltok-start) | |
446 "Mismatched start-tag; \ | |
447 expected `%s'" | |
448 (car open-tags))) | |
449 ((setq open-tags (cddr open-tags)) t) | |
450 (t (setq found token-end) nil))) | |
451 ((memq xmltok-type '(empty-element | |
452 partial-empty-element)) | |
453 (if open-tags | |
454 t | |
455 (setq found token-end) | |
456 nil)) | |
457 ((eq xmltok-type 'partial-end-tag) | |
458 (setq open-tags | |
459 (cons nil (cons xmltok-start open-tags))) | |
460 t) | |
461 ((eq xmltok-type 'partial-start-tag) | |
462 ;; if we have only a partial-start-tag | |
463 ;; then it's unlikely that there's a matching | |
464 ;; end-tag, so it's probably not helpful | |
465 ;; to treat it as a complete start-tag | |
466 (nxml-scan-error xmltok-start | |
467 "Missing `>'")) | |
468 (t t)))) | |
469 found)) | |
470 | |
471 (defun nxml-scan-error (&rest args) | |
472 (signal 'nxml-scan-error args)) | |
473 | |
474 (put 'nxml-scan-error | |
475 'error-conditions | |
476 '(error nxml-error nxml-scan-error)) | |
477 | |
478 (put 'nxml-scan-error | |
479 'error-message | |
480 "Scan over element that is not well-formed") | |
481 | |
482 (provide 'nxml-rap) | |
483 | |
86379 | 484 ;; arch-tag: cba241ec-4c59-4ef3-aa51-2cf92b3dd24f |
86361 | 485 ;;; nxml-rap.el ends here |