86361
|
1 ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode
|
|
2
|
87665
|
3 ;; Copyright (C) 2003, 2004, 2007, 2008 Free Software Foundation, Inc.
|
86361
|
4
|
|
5 ;; Author: James Clark
|
|
6 ;; Keywords: XML
|
|
7
|
86542
|
8 ;; This file is part of GNU Emacs.
|
|
9
|
|
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
|
|
11 ;; it under the terms of the GNU General Public License as published by
|
|
12 ;; the Free Software Foundation; either version 3, or (at your option)
|
|
13 ;; any later version.
|
86361
|
14
|
86542
|
15 ;; GNU Emacs is distributed in the hope that it will be useful,
|
|
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
18 ;; GNU General Public License for more details.
|
86361
|
19
|
86542
|
20 ;; You should have received a copy of the GNU General Public License
|
|
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
22 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
23 ;; Boston, MA 02110-1301, USA.
|
86361
|
24
|
|
25 ;;; Commentary:
|
|
26
|
|
27 ;; This uses xmltok.el to do XML parsing. The fundamental problem is
|
|
28 ;; how to handle changes. We don't want to maintain a complete parse
|
|
29 ;; tree. We also don't want to reparse from the start of the document
|
|
30 ;; on every keystroke. However, it is not possible in general to
|
|
31 ;; parse an XML document correctly starting at a random point in the
|
|
32 ;; middle. The main problems are comments, CDATA sections and
|
|
33 ;; processing instructions: these can all contain things that are
|
|
34 ;; indistinguishable from elements. Literals in the prolog are also a
|
|
35 ;; problem. Attribute value literals are not a problem because
|
|
36 ;; attribute value literals cannot contain less-than signs.
|
|
37 ;;
|
|
38 ;; Our strategy is to keep track of just the problematic things.
|
|
39 ;; Specifically, we keep track of all comments, CDATA sections and
|
|
40 ;; processing instructions in the instance. We do this by marking all
|
|
41 ;; except the first character of these with a non-nil nxml-inside text
|
|
42 ;; property. The value of the nxml-inside property is comment,
|
|
43 ;; cdata-section or processing-instruction. The first character does
|
|
44 ;; not have the nxml-inside property so we can find the beginning of
|
|
45 ;; the construct by looking for a change in a text property value
|
|
46 ;; (Emacs provides primitives for this). We use text properties
|
|
47 ;; rather than overlays, since the implementation of overlays doesn't
|
|
48 ;; look like it scales to large numbers of overlays in a buffer.
|
|
49 ;;
|
|
50 ;; We don't in fact track all these constructs, but only track them in
|
|
51 ;; some initial part of the instance. The variable `nxml-scan-end'
|
|
52 ;; contains the limit of where we have scanned up to for them.
|
|
53 ;;
|
|
54 ;; Thus to parse some random point in the file we first ensure that we
|
|
55 ;; have scanned up to that point. Then we search backwards for a
|
|
56 ;; <. Then we check whether the < has an nxml-inside property. If it
|
|
57 ;; does we go backwards to the first character that does not have an
|
|
58 ;; nxml-inside property (this character must be a <). Then we start
|
|
59 ;; parsing forward from the < we have found.
|
|
60 ;;
|
|
61 ;; The prolog has to be parsed specially, so we also keep track of the
|
|
62 ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
|
|
63 ;; every change to the prolog. This won't work well if people try to
|
|
64 ;; edit huge internal subsets. Hopefully that will be rare.
|
|
65 ;;
|
|
66 ;; We keep track of the changes by adding to the buffer's
|
|
67 ;; after-change-functions hook. Scanning is also done as a
|
|
68 ;; prerequisite to fontification by adding to fontification-functions
|
|
69 ;; (in the same way as jit-lock). This means that scanning for these
|
|
70 ;; constructs had better be quick. Fortunately it is. Firstly, the
|
|
71 ;; typical proportion of comments, CDATA sections and processing
|
|
72 ;; instructions is small relative to other things. Secondly, to scan
|
|
73 ;; we just search for the regexp <[!?].
|
|
74 ;;
|
|
75 ;; One problem is unclosed comments, processing instructions and CDATA
|
|
76 ;; sections. Suppose, for example, we encounter a <!-- but there's no
|
|
77 ;; matching -->. This is not an unexpected situation if the user is
|
|
78 ;; creating a comment. It is not helpful to treat the whole of the
|
|
79 ;; file starting from the <!-- onwards as a single unclosed comment
|
|
80 ;; token. Instead we treat just the <!-- as a piece of not well-formed
|
|
81 ;; markup and continue. The problem is that if at some later stage a
|
|
82 ;; --> gets added to the buffer after the unclosed <!--, we will need
|
|
83 ;; to reparse the buffer starting from the <!--. We need to keep
|
|
84 ;; track of these reparse dependencies; they are called dependent
|
|
85 ;; regions in the code.
|
|
86
|
|
87 ;;; Code:
|
|
88
|
|
89 (require 'xmltok)
|
|
90 (require 'nxml-util)
|
|
91
|
|
;; Every buffer records the end of its own prolog, so the variable is
;; made automatically buffer-local right after it is declared.
(defvar nxml-prolog-end nil
  "Integer giving position following end of the prolog.")
(make-variable-buffer-local 'nxml-prolog-end)
|
|
95
|
|
(defvar nxml-scan-end nil
  "Marker giving position up to which we have scanned.
nxml-scan-end must be >= nxml-prolog-end.  Furthermore, nxml-scan-end
must not be an inside position in the following sense.  A position is
inside if the following character is a part of, but not the first
character of, a CDATA section, comment or processing instruction.
Furthermore all positions >= nxml-prolog-end and < nxml-scan-end that
are inside positions must have a non-nil nxml-inside property whose
value is a symbol specifying what it is inside.  Any characters with a
non-nil fontified property must have position < nxml-scan-end and the
correct face.  Dependent regions must also be established for any
unclosed constructs starting before nxml-scan-end.
There must be no nxml-inside properties after nxml-scan-end.")
;; The scan limit is per-buffer state.
(make-variable-buffer-local 'nxml-scan-end)
|
|
110
|
|
(defsubst nxml-get-inside (pos)
  "Return the value of the `nxml-inside' text property at POS.
The result is nil when the character at POS carries no such property."
  (get-text-property pos 'nxml-inside))
|
|
113
|
|
(defsubst nxml-clear-inside (start end)
  "Remove the `nxml-inside' text property from the text between START and END.
The return value is that of `remove-text-properties'."
  (remove-text-properties start end '(nxml-inside nil)))
|
|
116
|
|
(defsubst nxml-set-inside (start end type)
  "Set the `nxml-inside' text property to TYPE on the text from START to END."
  (put-text-property start end 'nxml-inside type))
|
|
119
|
|
(defun nxml-inside-end (pos)
  "Return the end of the inside region containing POS.
Return nil if the character at POS is not inside."
  ;; When no further change of the property exists, the region runs to
  ;; the end of the accessible buffer.
  (and (nxml-get-inside pos)
       (or (next-single-property-change pos 'nxml-inside)
           (point-max))))
|
|
127
|
|
(defun nxml-inside-start (pos)
  "Return the start of the inside region containing POS.
Return nil if the character at POS is not inside."
  ;; The backward search starts just after POS so that the character at
  ;; POS itself is taken into account; with no earlier change of the
  ;; property the region begins at the start of the accessible buffer.
  (and (nxml-get-inside pos)
       (or (previous-single-property-change (1+ pos) 'nxml-inside)
           (point-min))))
|
|
135
|
|
136 ;;; Change management
|
|
137
|
|
(defun nxml-scan-after-change (start end)
  "Restore `nxml-scan-end' invariants after a change.
The change happened between START and END.
Return position after which lexical state is unchanged.
END must be > nxml-prolog-end."
  (if (>= start nxml-scan-end)
      ;; Nothing at or after the scan limit has been marked yet.
      nxml-scan-end
    (goto-char start)
    (nxml-move-outside-backwards)
    (setq start (point))
    (let ((clear-from start)
          xmltok-errors
          xmltok-dependent-regions)
      (while (or
              ;; Re-scan the special constructs up to the end of the
              ;; change (but never past the scan limit), refreshing the
              ;; nxml-inside property as we go.
              (and (xmltok-forward-special (min end nxml-scan-end))
                   (progn
                     (if (memq xmltok-type
                               '(comment cdata-section processing-instruction))
                         (progn
                           (nxml-clear-inside clear-from (1+ xmltok-start))
                           (nxml-set-inside (1+ xmltok-start)
                                            (point)
                                            xmltok-type)
                           (setq clear-from (point))))
                     (cond ((< (point) (min end nxml-scan-end)) t)
                           (t (setq end (point))
                              nil))))
              ;; The end of the change was inside but is now outside.
              ;; Imagine something really weird like
              ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> -->
              ;; and suppose we deleted "<![CDATA[f"
              (let ((region-end (nxml-inside-end end)))
                (and region-end
                     (progn (setq end region-end)
                            t)))))
      (nxml-clear-inside clear-from end)
      (nxml-clear-dependent-regions start end)
      (nxml-mark-parse-dependent-regions))
    (if (> end nxml-scan-end)
        (set-marker nxml-scan-end end))
    end))
|
|
180
|
86542
|
;; `nxml-scan-prolog' is only called from nxml-mode.el, which is where
;; this variable is defined; declare it here without giving it a value.
(defvar nxml-prolog-regions)
|
|
183
|
86361
|
(defun nxml-scan-prolog ()
  "Scan the prolog from the start of the buffer and record where it ends.
Store the result of `xmltok-forward-prolog' in `nxml-prolog-regions'
and the position it stops at in `nxml-prolog-end'.  Remove the
nxml-inside properties and the dependent regions covering the prolog,
then mark the dependent regions collected during the scan.  Finally
move `nxml-scan-end' forward to the end of the prolog if it is
behind it.  Point is moved."
  (goto-char (point-min))
  (let (xmltok-dtd
        xmltok-errors
        xmltok-dependent-regions)
    (setq nxml-prolog-regions (xmltok-forward-prolog)
          nxml-prolog-end (point))
    (let ((prolog-start (point-min)))
      (nxml-clear-inside prolog-start nxml-prolog-end)
      (nxml-clear-dependent-regions prolog-start nxml-prolog-end))
    (nxml-mark-parse-dependent-regions))
  (if (< nxml-scan-end nxml-prolog-end)
      (set-marker nxml-scan-end nxml-prolog-end)))
|
|
196
|
|
197
|
|
198 ;;; Dependent regions
|
|
199
|
|
(defun nxml-adjust-start-for-dependent-regions (start end pre-change-length)
  "Return START, possibly moved back to the start of a dependent region.
Look at every overlay of category `nxml-dependent' found by
`overlays-in' between the position before START and START.  For each
one that begins before the current candidate position, call the
function stored in its `nxml-funargs' property with START, END,
PRE-CHANGE-LENGTH, the overlay's start and end, followed by the extra
arguments stored with it.  When that call returns non-nil, the
overlay's start becomes the new candidate.  Return the final candidate."
  (let ((candidate start))
    (dolist (region (overlays-in (1- start) start))
      (let ((region-start (overlay-start region)))
        (if (and (eq (overlay-get region 'category) 'nxml-dependent)
                 (< region-start candidate))
            (let ((funargs (overlay-get region 'nxml-funargs)))
              (if (apply (car funargs)
                         start
                         end
                         pre-change-length
                         region-start
                         (overlay-end region)
                         (cdr funargs))
                  (setq candidate region-start))))))
    candidate))
|
|
219
|
|
(defun nxml-mark-parse-dependent-regions ()
  "Mark every region listed in `xmltok-dependent-regions'.
Each element is passed as the argument list of
`nxml-mark-parse-dependent-region'.  The list is consumed, so
`xmltok-dependent-regions' is nil afterwards."
  (while xmltok-dependent-regions
    (let ((region-spec (car xmltok-dependent-regions)))
      (apply #'nxml-mark-parse-dependent-region region-spec))
    ;; Drop the element only after it has been marked.
    (setq xmltok-dependent-regions (cdr xmltok-dependent-regions))))
|
|
226
|
|
(defun nxml-mark-parse-dependent-region (fun start end &rest args)
  "Create an overlay from START to END recording a reparse dependency.
The overlay gets the category `nxml-dependent', and its `nxml-funargs'
property is set to a list whose first element is FUN and whose
remaining elements are ARGS.  Return that list."
  (let ((region (make-overlay start end nil t t))
        (funargs (cons fun args)))
    (overlay-put region 'category 'nxml-dependent)
    (overlay-put region 'nxml-funargs funargs)))
|
|
231
|
|
;; Overlays of category `nxml-dependent' inherit the `evaporate'
;; property from the category symbol.
(put 'nxml-dependent 'evaporate t)
|
|
233
|
|
(defun nxml-clear-dependent-regions (start end)
  "Delete the dependent-region overlays that begin between START and END.
Only overlays of category `nxml-dependent' found by `overlays-in' for
START and END are considered, and only those whose start is not
before START are deleted.  Return nil."
  (dolist (region (overlays-in start end))
    (if (and (eq (overlay-get region 'category) 'nxml-dependent)
             (<= start (overlay-start region)))
        (delete-overlay region))))
|
|
243
|
|
244 ;;; Random access parsing
|
|
245
|
|
(defun nxml-token-after ()
  "Return the position after the token containing the char after point.
Sets up the variables `xmltok-type', `xmltok-start',
`xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
`xmltok-namespace-attributes' in the same way as does
`xmltok-forward'.  The prolog will be treated as a single token with
type `prolog'."
  (let ((pos (point)))
    (if (< pos nxml-prolog-end)
        ;; Anywhere in the prolog: report the whole prolog as one token.
        (progn
          (setq xmltok-type 'prolog
                xmltok-start (point-min))
          (min nxml-prolog-end (point-max)))
      (nxml-ensure-scan-up-to-date)
      (if (nxml-get-inside pos)
          ;; Inside a comment, CDATA section or processing instruction:
          ;; go back to its first character and tokenize it as a whole.
          (save-excursion
            (nxml-move-outside-backwards)
            (xmltok-forward)
            (point))
        (save-excursion
          ;; Find a place to start tokenizing from: the nearest `<' at
          ;; or before point (moved out of any special construct that
          ;; contains it), otherwise the end of the prolog.
          (if (or (eq (char-after) ?<)
                  (search-backward "<"
                                   (max (point-min) nxml-prolog-end)
                                   t))
              (nxml-move-outside-backwards)
            (goto-char (if (<= (point-min) nxml-prolog-end)
                           nxml-prolog-end
                         (or (nxml-inside-end (point-min))
                             (point-min)))))
          ;; Tokenize forward until the token that extends past POS.
          (while (and (nxml-tokenize-forward)
                      (<= (point) pos)))
          (point))))))
|
|
278
|
|
(defun nxml-token-before ()
  "Return the position after the token containing the char before point.
Sets variables like `nxml-token-after'."
  (cond ((= (point-min) (point))
         ;; No character precedes point: report an empty token here.
         (setq xmltok-start (point)
               xmltok-type nil)
         (point))
        (t
         (save-excursion
           (goto-char (1- (point)))
           (nxml-token-after)))))
|
|
289
|
|
(defun nxml-tokenize-forward ()
  "Move over one token with `xmltok-forward' and return `xmltok-type'.
When `xmltok-forward' returns non-nil and point ends up beyond
`nxml-scan-end', record what was scanned: a comment, CDATA section or
processing instruction gets the nxml-inside property on all but its
first character; otherwise any dependent regions reported by the
tokenizer are marked.  `nxml-scan-end' is then moved to point."
  (let (xmltok-dependent-regions
        xmltok-errors)
    (if (and (xmltok-forward)
             (> (point) nxml-scan-end))
        (progn
          (if (memq xmltok-type
                    '(comment cdata-section processing-instruction))
              (nxml-with-unmodifying-text-property-changes
                (nxml-set-inside (1+ xmltok-start) (point) xmltok-type))
            (if xmltok-dependent-regions
                (nxml-mark-parse-dependent-regions)))
          (set-marker nxml-scan-end (point))))
    xmltok-type))
|
|
304
|
|
(defun nxml-move-outside-backwards ()
  "Move point to first character of the containing special thing.
Leave point unmoved if it is not inside anything special."
  (let ((region-start (nxml-inside-start (point))))
    (if region-start
        (progn
          ;; The first character of the construct is the one just
          ;; before the marked region.
          (goto-char (1- region-start))
          ;; That character must not itself be marked.
          (let ((stray (nxml-get-inside (point))))
            (if stray
                (error "Char before inside-start at %s had nxml-inside property %s"
                       (point)
                       stray)))))))
|
|
315
|
|
(defun nxml-ensure-scan-up-to-date ()
  "Make sure comments, CDATA sections and PIs are marked up to point.
Do nothing if `nxml-scan-end' is already at or beyond point.
Otherwise scan forward from `nxml-scan-end' with
`xmltok-forward-special', putting the nxml-inside property on all but
the first character of each such construct, until the scan reaches or
passes point.  Then replace the dependent regions over the newly
scanned text and move `nxml-scan-end' to where the scan stopped.
Point is not moved."
  (let ((target (point)))
    (if (< nxml-scan-end target)
        (save-excursion
          (goto-char nxml-scan-end)
          (let (xmltok-errors
                xmltok-dependent-regions)
            (while (and (xmltok-forward-special target)
                        (progn
                          (if (memq xmltok-type
                                    '(comment processing-instruction cdata-section))
                              (nxml-with-unmodifying-text-property-changes
                                (nxml-set-inside (1+ xmltok-start)
                                                 (point)
                                                 xmltok-type)))
                          ;; Keep going while short of the target; once
                          ;; the scan has reached or passed it, the scan
                          ;; limit becomes the end of the last token.
                          (cond ((< (point) target) t)
                                (t (setq target (point))
                                   nil)))))
            (nxml-clear-dependent-regions nxml-scan-end target)
            (nxml-mark-parse-dependent-regions)
            (set-marker nxml-scan-end target))))))
|
|
339
|
|
340 ;;; Element scanning
|
|
341
|
|
(defun nxml-scan-element-forward (from &optional up)
  "Scan forward from FROM over a single balanced element.
Point must be between tokens.  Return the position of the end of the tag
that ends the element.  `xmltok-start' will contain the position of the
start of the tag.  If UP is non-nil, then scan past end-tag of element
containing point.  If no element is found, return nil.  If a
well-formedness error prevents scanning, signal an nxml-scan-error.
Point is not moved."
  ;; OPEN-TAGS is a stack of the start-tags seen so far, stored as
  ;; alternating qname and start position.  With UP non-nil the stack
  ;; ends in t instead of nil, standing for the enclosing element whose
  ;; start-tag lies before FROM.
  (let ((open-tags (and up t))
        found)
    (save-excursion
      (goto-char from)
      ;; Each clause returns non-nil to keep scanning and nil to stop.
      (while (cond ((not (nxml-tokenize-forward))
                    ;; Ran out of tokens with a start-tag still open.
                    (when (consp open-tags)
                      (nxml-scan-error (cadr open-tags)
                                       "Start-tag has no end-tag"))
                    nil)
                   ((eq xmltok-type 'start-tag)
                    (setq open-tags
                          (cons (xmltok-start-tag-qname)
                                (cons xmltok-start
                                      open-tags)))
                    t)
                   ((eq xmltok-type 'end-tag)
                    (cond ((not open-tags) nil)
                          ;; Only the enclosing element is open: this is
                          ;; the end-tag being looked for.
                          ((not (consp open-tags)) (setq found (point)) nil)
                          ((not (string= (car open-tags)
                                         (xmltok-end-tag-qname)))
                           (nxml-scan-error (+ 2 xmltok-start)
                                            "Mismatched end-tag; \
expected `%s'"
                                            (car open-tags)))
                          ;; Pop the matched tag; continue while
                          ;; something is still open.
                          ((setq open-tags (cddr open-tags)) t)
                          (t (setq found (point)) nil)))
                   ((memq xmltok-type '(empty-element
                                        partial-empty-element))
                    (if open-tags
                        t
                      (setq found (point))
                      nil))
                   ((eq xmltok-type 'partial-end-tag)
                    ;; Handled like an end-tag, without the name check.
                    (cond ((not open-tags) nil)
                          ((not (consp open-tags)) (setq found (point)) nil)
                          ((setq open-tags (cddr open-tags)) t)
                          (t (setq found (point)) nil)))
                   ((eq xmltok-type 'partial-start-tag)
                    (nxml-scan-error xmltok-start
                                     "Missing `>'"))
                   (t t))))
    found))
|
|
392
|
|
(defun nxml-scan-element-backward (from &optional up bound)
  "Scan backward from FROM over a single balanced element.
Point must be between tokens.  Return the position of the end of the tag
that starts the element.  `xmltok-start' will contain the position of
the start of the tag.  If UP is non-nil, then scan past start-tag of
element containing point.  If BOUND is non-nil, then don't scan back
past BOUND.  If no element is found, return nil.  If a well-formedness
error prevents scanning, signal an nxml-scan-error.  Point is not
moved."
  ;; OPEN-TAGS is a stack of the end-tags seen so far, stored as
  ;; alternating qname (nil for a partial end-tag) and start position.
  ;; With UP non-nil the stack ends in t instead of nil, standing for
  ;; the enclosing element whose end-tag lies after FROM.
  (let ((open-tags (and up t))
        token-end found)
    (save-excursion
      (goto-char from)
      ;; Each clause returns non-nil to keep scanning and nil to stop.
      (while (cond ((or (< (point) nxml-prolog-end)
                        (not (search-backward "<"
                                              (max (or bound 0)
                                                   nxml-prolog-end)
                                              t)))
                    ;; No earlier `<' within the limit.  An unmatched
                    ;; end-tag is only reported when the search was not
                    ;; cut short by BOUND.
                    (when (and (consp open-tags) (not bound))
                      (nxml-scan-error (cadr open-tags)
                                       "End-tag has no start-tag"))
                    nil)
                   ((progn
                      ;; Tokenize the construct starting at the `<' just
                      ;; found, remembering where it ends; point stays
                      ;; at its start for the next backward search.
                      (nxml-move-outside-backwards)
                      (save-excursion
                        (nxml-tokenize-forward)
                        (setq token-end (point)))
                      (eq xmltok-type 'end-tag))
                    (setq open-tags
                          (cons (xmltok-end-tag-qname)
                                (cons xmltok-start open-tags)))
                    t)
                   ((eq xmltok-type 'start-tag)
                    (cond ((not open-tags) nil)
                          ;; Only the enclosing element is open: this is
                          ;; the start-tag being looked for.
                          ((not (consp open-tags))
                           (setq found token-end)
                           nil)
                          ;; A nil name came from a partial end-tag and
                          ;; matches any start-tag.
                          ((and (car open-tags)
                                (not (string= (car open-tags)
                                              (xmltok-start-tag-qname))))
                           (nxml-scan-error (1+ xmltok-start)
                                            "Mismatched start-tag; \
expected `%s'"
                                            (car open-tags)))
                          ((setq open-tags (cddr open-tags)) t)
                          (t (setq found token-end) nil)))
                   ((memq xmltok-type '(empty-element
                                        partial-empty-element))
                    (if open-tags
                        t
                      (setq found token-end)
                      nil))
                   ((eq xmltok-type 'partial-end-tag)
                    (setq open-tags
                          (cons nil (cons xmltok-start open-tags)))
                    t)
                   ((eq xmltok-type 'partial-start-tag)
                    ;; if we have only a partial-start-tag
                    ;; then it's unlikely that there's a matching
                    ;; end-tag, so it's probably not helpful
                    ;; to treat it as a complete start-tag
                    (nxml-scan-error xmltok-start
                                     "Missing `>'"))
                   (t t))))
    found))
|
|
458
|
|
;; `nxml-scan-error' is an error condition that can also be caught as
;; `nxml-error' or as a plain `error'.
(put 'nxml-scan-error
     'error-conditions
     '(error nxml-error nxml-scan-error))

(put 'nxml-scan-error
     'error-message
     "Scan over element that is not well-formed")

(defun nxml-scan-error (&rest args)
  "Signal an `nxml-scan-error' with ARGS as the error data."
  (signal 'nxml-scan-error args))
|
|
469
|
|
470 (provide 'nxml-rap)
|
|
471
|
86379
|
472 ;; arch-tag: cba241ec-4c59-4ef3-aa51-2cf92b3dd24f
|
86361
|
473 ;;; nxml-rap.el ends here
|