Parser for EBNF used to specify XML (EBNFX)
[bpt/emacs.git] / lisp / progmodes / ebnf-ebx.el
CommitLineData
f3c3dee6
VJL
1;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
2
3;; Copyright (C) 2004 Free Sofware Foundation, Inc.
4
5;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
6;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7;; Time-stamp: <2004/03/22 08:53:21 vinicius>
8;; Keywords: wp, ebnf, PostScript
9;; Version: 1.0
10
11;; This file is part of GNU Emacs.
12
13;; GNU Emacs is free software; you can redistribute it and/or modify
14;; it under the terms of the GNU General Public License as published by
15;; the Free Software Foundation; either version 2, or (at your option)
16;; any later version.
17
18;; GNU Emacs is distributed in the hope that it will be useful,
19;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21;; GNU General Public License for more details.
22
23;; You should have received a copy of the GNU General Public License
24;; along with GNU Emacs; see the file COPYING. If not, write to the
25;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26;; Boston, MA 02111-1307, USA.
27
28;;; Commentary:
29
30;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31;;
32;;
33;; This is part of ebnf2ps package.
34;;
35;; This package defines a parser for EBNF used to specify XML (EBNFX).
36;;
37;; See ebnf2ps.el for documentation.
38;;
39;;
40;; EBNFX Syntax
41;; ------------
42;;
43;; See the URL:
44;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
45;; (Extensible Markup Language (XML) 1.0 (Third Edition))
46;;
47;;
48;; rule ::= symbol '::=' expression
49;; /* rules are separated by at least one blank line. */
50;;
51;; expression ::= concatenation ('|' concatenation)*
52;;
53;; concatenation ::= exception*
54;;
55;; exception ::= term ('-' term)?
56;;
57;; term ::= factor ('*' | '+' | '?')?
58;;
59;; factor ::= hex-char+
60;; | '[' '^'? ( char ( '-' char )? )+ ']'
61;; | '"' 'string' '"'
62;; | "'" "string" "'"
63;; | '(' expression ')'
64;; | symbol
65;;
66;; symbol ::= 'upper or lower case letter'
67;; ('upper or lower case letter' | '-' | '_')*
68;; /* upper and lower 8-bit accentuated characters are included */
69;;
70;; hex-char ::= '#x' [0-9A-Fa-f]+
71;;
72;; char ::= hex-char | 'any character except control characters'
73;; /* 8-bit accentuated characters are included */
74;;
75;; any-char ::= char | 'newline' | 'tab'
76;;
77;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
78;;
79;; comment ::= '/*' ( any-char - '*/' ) '*/'
80;;
81;;
82;; Below is the Notation section extracted from the URL cited above.
83;;
84;; 6 Notation
85;;
86;; The formal grammar of XML is given in this specification using a simple
87;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
88;; one symbol, in the form
89;;
90;; symbol ::= expression
91;;
92;; Symbols are written with an initial capital letter if they are the start
93;; symbol of a regular language, otherwise with an initial lowercase letter.
94;; Literal strings are quoted.
95;;
96;; Within the expression on the right-hand side of a rule, the following
97;; expressions are used to match strings of one or more characters:
98;;
99;; #xN
100;;
101;; where N is a hexadecimal integer, the expression matches the character
102;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
103;; zeros in the #xN form is insignificant.
104;;
105;; [a-zA-Z], [#xN-#xN]
106;;
107;; matches any Char with a value in the range(s) indicated (inclusive).
108;;
109;; [abc], [#xN#xN#xN]
110;;
111;; matches any Char with a value among the characters enumerated.
112;; Enumerations and ranges can be mixed in one set of brackets.
113;;
114;; [^a-z], [^#xN-#xN]
115;;
116;; matches any Char with a value outside the range indicated.
117;;
118;; [^abc], [^#xN#xN#xN]
119;;
120;; matches any Char with a value not among the characters given.
121;; Enumerations and ranges of forbidden values can be mixed in one set of
122;; brackets.
123;;
124;; "string"
125;;
126;; matches a literal string matching that given inside the double quotes.
127;;
128;; 'string'
129;;
130;; matches a literal string matching that given inside the single quotes.
131;;
132;; These symbols may be combined to match more complex patterns as follows,
133;; where A and B represent simple expressions:
134;;
135;; (expression)
136;;
137;; expression is treated as a unit and may be combined as described in this
138;; list.
139;;
140;; A?
141;;
142;; matches A or nothing; optional A.
143;;
144;; A B
145;;
146;; matches A followed by B. This operator has higher precedence than
147;; alternation; thus A B | C D is identical to (A B) | (C D).
148;;
149;; A | B
150;;
151;; matches A or B.
152;;
153;; A - B
154;;
155;; matches any string that matches A but does not match B.
156;;
157;; A+
158;;
159;; matches one or more occurrences of A. Concatenation has higher
160;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
161;;
162;; A*
163;;
164;; matches zero or more occurrences of A. Concatenation has higher
165;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
166;;
167;; Other notations used in the productions are:
168;;
169;; /* ... */
170;;
171;; comment.
172;;
173;; [ wfc: ... ]
174;;
175;; well-formedness constraint; this identifies by name a constraint on
176;; well-formed documents associated with a production.
177;;
178;; [ vc: ... ]
179;;
180;; validity constraint; this identifies by name a constraint on valid
181;; documents associated with a production.
182;;
183;;
184;; Differences Between EBNFX And ebnf2ps EBNFX
185;; -------------------------------------------
186;;
187;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX accepts also the
188;; underscore (_) and minus (-) for rule name and european 8-bit accentuated
189;; characters (from \240 to \377) for rule name, string and comment. Also
190;; rule name can start with upper case letter.
191;;
192;;
193;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
194
195;;; Code:
196
197
198(require 'ebnf-otz)
199
200
201(defvar ebnf-ebx-lex nil
202 "Value returned by `ebnf-ebx-lex' function.")
203
204\f
205;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
206;; Syntactic analyzer
207
208
209;;; rulelist ::= rule+
210
211(defun ebnf-ebx-parser (start)
212 "EBNFX parser."
213 (let ((total (+ (- ebnf-limit start) 1))
214 (bias (1- start))
215 (origin (point))
216 rule-list token rule)
217 (goto-char start)
218 (setq token (ebnf-ebx-lex))
219 (and (eq token 'end-of-input)
220 (error "Invalid EBNFX file format"))
221 (and (eq token 'end-of-rule)
222 (setq token (ebnf-ebx-lex)))
223 (while (not (eq token 'end-of-input))
224 (ebnf-message-float
225 "Parsing...%s%%"
226 (/ (* (- (point) bias) 100.0) total))
227 (setq token (ebnf-ebx-rule token)
228 rule (cdr token)
229 token (car token))
230 (or (ebnf-add-empty-rule-list rule)
231 (setq rule-list (cons rule rule-list))))
232 (goto-char origin)
233 rule-list))
234
235
236;;; rule ::= symbol '::=' expression
237
238
239(defun ebnf-ebx-rule (token)
240 (let ((name ebnf-ebx-lex)
241 (action ebnf-action)
242 elements)
243 (setq ebnf-action nil)
244 (or (eq token 'non-terminal)
245 (error "Invalid rule name"))
246 (setq token (ebnf-ebx-lex))
247 (or (eq token 'production)
248 (error "Invalid rule: missing `::='"))
249 (setq elements (ebnf-ebx-expression))
250 (or (memq (car elements) '(end-of-rule end-of-input))
251 (error "Invalid rule: there is no end of rule"))
252 (setq elements (cdr elements))
253 (ebnf-eps-add-production name)
254 (cons (ebnf-ebx-lex)
255 (ebnf-make-production name elements action))))
256
257
258;; expression ::= concatenation ('|' concatenation)*
259
260
261(defun ebnf-ebx-expression ()
262 (let (body concatenation)
263 (while (eq (car (setq concatenation
264 (ebnf-ebx-concatenation (ebnf-ebx-lex))))
265 'alternative)
266 (setq body (cons (cdr concatenation) body)))
267 (ebnf-token-alternative body concatenation)))
268
269
270;; concatenation ::= exception*
271
272
273(defun ebnf-ebx-concatenation (token)
274 (let ((term (ebnf-ebx-exception token))
275 seq)
276 (or (setq token (car term)
277 term (cdr term))
278 (error "Empty element"))
279 (setq seq (cons term seq))
280 (while (setq term (ebnf-ebx-exception token)
281 token (car term)
282 term (cdr term))
283 (setq seq (cons term seq)))
284 (cons token
285 (if (= (length seq) 1)
286 ;; sequence with only one element
287 (car seq)
288 ;; a real sequence
289 (ebnf-make-sequence (nreverse seq))))))
290
291
292;;; exception ::= term ('-' term)?
293
294
295(defun ebnf-ebx-exception (token)
296 (let ((term (ebnf-ebx-term token)))
297 (if (eq (car term) 'exception)
298 (let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
299 (cons (car except)
300 (ebnf-make-except (cdr term) (cdr except))))
301 term)))
302
303
304
305;;; term ::= factor ('*' | '+' | '?')?
306
307
308(defun ebnf-ebx-term (token)
309 (let ((factor (ebnf-ebx-factor token)))
310 (when factor
311 (setq token (ebnf-ebx-lex))
312 (cond ((eq token 'zero-or-more)
313 (setq factor (ebnf-make-zero-or-more factor)
314 token (ebnf-ebx-lex)))
315 ((eq token 'one-or-more)
316 (setq factor (ebnf-make-one-or-more factor)
317 token (ebnf-ebx-lex)))
318 ((eq token 'optional)
319 (setq factor (ebnf-token-optional factor)
320 token (ebnf-ebx-lex)))))
321 (cons token factor)))
322
323
324;;; factor ::= hex-char+
325;;; | '[' '^'? ( char ( '-' char )? )+ ']'
326;;; | '"' 'string' '"'
327;;; | "'" "string" "'"
328;;; | '(' expression ')'
329;;; | symbol
330;;;
331;;; symbol ::= 'upper or lower case letter'
332;;; ('upper or lower case letter' | '-' | '_')*
333;;; /* upper and lower 8-bit accentuated characters are included */
334;;;
335;;; hex-char ::= '#x' [0-9A-Fa-f]+
336;;;
337;;; char ::= hex-char | 'any character except control characters'
338;;; /* 8-bit accentuated characters are included */
339;;;
340;;; any-char ::= char | 'newline' | 'tab'
341
342
343(defun ebnf-ebx-factor (token)
344 (cond
345 ;; terminal
346 ((eq token 'terminal)
347 (ebnf-make-terminal ebnf-ebx-lex))
348 ;; non-terminal
349 ((eq token 'non-terminal)
350 (ebnf-make-non-terminal ebnf-ebx-lex))
351 ;; group
352 ((eq token 'begin-group)
353 (let ((body (ebnf-ebx-expression)))
354 (or (eq (car body) 'end-group)
355 (error "Missing `)'"))
356 (cdr body)))
357 ;; no element
358 (t
359 nil)
360 ))
361
362\f
363;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
364;; Lexical analyzer
365
366
367(defconst ebnf-ebx-token-table (make-vector 256 'error)
368 "Vector used to map characters to a lexical token.")
369
370
371(defun ebnf-ebx-initialize ()
372 "Initialize EBNFX token table."
373 ;; control character & control 8-bit character are set to `error'
374 (let ((char ?\101))
375 ;; printable character: A-Z
376 (while (< char ?\133)
377 (aset ebnf-ebx-token-table char 'non-terminal)
378 (setq char (1+ char)))
379 ;; printable character: a-z
380 (setq char ?\141)
381 (while (< char ?\173)
382 (aset ebnf-ebx-token-table char 'non-terminal)
383 (setq char (1+ char)))
384 ;; European 8-bit accentuated characters:
385 (setq char ?\240)
386 (while (< char ?\400)
387 (aset ebnf-ebx-token-table char 'non-terminal)
388 (setq char (1+ char)))
389 ;; Override end of line characters:
390 (aset ebnf-ebx-token-table ?\n 'end-of-rule) ; [NL] linefeed
391 (aset ebnf-ebx-token-table ?\r 'end-of-rule) ; [CR] carriage return
392 ;; Override space characters:
393 (aset ebnf-ebx-token-table ?\013 'space) ; [VT] vertical tab
394 (aset ebnf-ebx-token-table ?\t 'space) ; [HT] horizontal tab
395 (aset ebnf-ebx-token-table ?\ 'space) ; [SP] space
396 ;; Override form feed character:
397 (aset ebnf-ebx-token-table ?\f 'form-feed) ; [FF] form feed
398 ;; Override other lexical characters:
399 (aset ebnf-ebx-token-table ?# 'hash)
400 (aset ebnf-ebx-token-table ?\" 'double-quote)
401 (aset ebnf-ebx-token-table ?\' 'single-quote)
402 (aset ebnf-ebx-token-table ?\( 'begin-group)
403 (aset ebnf-ebx-token-table ?\) 'end-group)
404 (aset ebnf-ebx-token-table ?- 'exception)
405 (aset ebnf-ebx-token-table ?: 'colon)
406 (aset ebnf-ebx-token-table ?\[ 'begin-square)
407 (aset ebnf-ebx-token-table ?| 'alternative)
408 (aset ebnf-ebx-token-table ?* 'zero-or-more)
409 (aset ebnf-ebx-token-table ?+ 'one-or-more)
410 (aset ebnf-ebx-token-table ?\? 'optional)
411 ;; Override comment character:
412 (aset ebnf-ebx-token-table ?/ 'comment)))
413
414
415;; replace the range "\240-\377" (see `ebnf-range-regexp').
416(defconst ebnf-ebx-non-terminal-chars
417 (ebnf-range-regexp "-_A-Za-z" ?\240 ?\377))
418(defconst ebnf-ebx-non-terminal-letter-chars
419 (ebnf-range-regexp "A-Za-z" ?\240 ?\377))
420
421
422(defun ebnf-ebx-lex ()
423 "Lexical analyser for EBNFX.
424
425Return a lexical token.
426
427See documentation for variable `ebnf-ebx-lex'."
428 (if (>= (point) ebnf-limit)
429 'end-of-input
430 (let (token)
431 ;; skip spaces and comments
432 (while (if (> (following-char) 255)
433 (progn
434 (setq token 'error)
435 nil)
436 (setq token (aref ebnf-ebx-token-table (following-char)))
437 (cond
438 ((eq token 'space)
439 (skip-chars-forward " \013\t" ebnf-limit)
440 (< (point) ebnf-limit))
441 ((eq token 'comment)
442 (ebnf-ebx-skip-comment))
443 ((eq token 'form-feed)
444 (forward-char)
445 (setq ebnf-action 'form-feed))
446 ((eq token 'end-of-rule)
447 (ebnf-ebx-skip-end-of-rule))
448 ((and (eq token 'begin-square)
449 (let ((case-fold-search t))
450 (looking-at "\\[\\(wfc\\|vc\\):")))
451 (ebnf-ebx-skip-constraint))
452 (t nil)
453 )))
454 (cond
455 ;; end of input
456 ((>= (point) ebnf-limit)
457 'end-of-input)
458 ;; error
459 ((eq token 'error)
460 (error "Illegal character"))
461 ;; end of rule
462 ((eq token 'end-of-rule)
463 'end-of-rule)
464 ;; terminal: #x [0-9A-Fa-f]+
465 ((eq token 'hash)
466 (setq ebnf-ebx-lex (ebnf-ebx-character))
467 'terminal)
468 ;; terminal: "string"
469 ((eq token 'double-quote)
470 (setq ebnf-ebx-lex (ebnf-ebx-string ?\"))
471 'terminal)
472 ;; terminal: 'string'
473 ((eq token 'single-quote)
474 (setq ebnf-ebx-lex (ebnf-ebx-string ?\'))
475 'terminal)
476 ;; terminal: [ ^? ( char ( - char )? )+ ]
477 ((eq token 'begin-square)
478 (setq ebnf-ebx-lex (ebnf-ebx-range))
479 'terminal)
480 ;; non-terminal: NAME
481 ((eq token 'non-terminal)
482 (setq ebnf-ebx-lex
483 (ebnf-buffer-substring ebnf-ebx-non-terminal-chars))
484 'non-terminal)
485 ;; colon: ::=
486 ((eq token 'colon)
487 (or (looking-at "::=")
488 (error "Missing `::=' token"))
489 (forward-char 3)
490 'production)
491 ;; miscellaneous: (, ), *, +, ?, |, -
492 (t
493 (forward-char)
494 token)
495 ))))
496
497
498;; replace the range "\177-\237" (see `ebnf-range-regexp').
499(defconst ebnf-ebx-constraint-chars
500 (ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237))
501
502
503(defun ebnf-ebx-skip-constraint ()
504 (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0)
505 (error "Invalid character"))
506 (or (= (following-char) ?\])
507 (error "Missing end of constraint `]'"))
508 (forward-char)
509 t)
510
511
512
513(defun ebnf-ebx-skip-end-of-rule ()
514 (let (eor-p)
515 (while (progn
516 ;; end of rule ==> 2 or more consecutive end of lines
517 (setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1)
518 eor-p))
519 ;; skip spaces
520 (skip-chars-forward " \013\t" ebnf-limit)
521 ;; skip comments
522 (and (= (following-char) ?/)
523 (ebnf-ebx-skip-comment))))
524 (not eor-p)))
525
526
527;; replace the range "\177-\237" (see `ebnf-range-regexp').
528(defconst ebnf-ebx-comment-chars
529 (ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237))
530(defconst ebnf-ebx-filename-chars
531 (ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237))
532
533
534(defun ebnf-ebx-skip-comment ()
535 (forward-char)
536 (or (= (following-char) ?*)
537 (error "Invalid beginning of comment"))
538 (forward-char)
539 (cond
540 ;; open EPS file
541 ((and ebnf-eps-executing (= (following-char) ?\[))
542 (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
543 ;; close EPS file
544 ((and ebnf-eps-executing (= (following-char) ?\]))
545 (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
546 ;; any other action in comment
547 (t
548 (setq ebnf-action (aref ebnf-comment-table (following-char))))
549 )
550 (while (progn
551 (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit)
552 (or (= (following-char) ?*)
553 (error "Missing end of comment"))
554 (forward-char)
555 (and (/= (following-char) ?/)
556 (< (point) ebnf-limit))))
557 ;; check for a valid end of comment
558 (and (>= (point) ebnf-limit)
559 (error "Missing end of comment"))
560 (forward-char)
561 t)
562
563
564(defun ebnf-ebx-eps-filename ()
565 (forward-char)
566 (let (fname nchar)
567 (while (progn
568 (setq fname
569 (concat fname
570 (ebnf-buffer-substring ebnf-ebx-filename-chars)))
571 (and (< (point) ebnf-limit)
572 (> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0)
573 (< (point) ebnf-limit)
574 (/= (following-char) ?/)))
575 (setq fname (concat fname (make-string nchar ?*))
576 nchar nil))
577 (if (or (not nchar) (= nchar 0))
578 fname
579 (and (< (point) ebnf-limit)
580 (= (following-char) ?/)
581 (setq nchar (1- nchar)))
582 (concat fname (make-string nchar ?*)))))
583
584
585;; replace the range "\240-\377" (see `ebnf-range-regexp').
586(defconst ebnf-ebx-double-string-chars
587 (ebnf-range-regexp "\t -!#-~" ?\240 ?\377))
588(defconst ebnf-ebx-single-string-chars
589 (ebnf-range-regexp "\t -&(-~" ?\240 ?\377))
590
591
592(defun ebnf-ebx-string (delim)
593 (buffer-substring-no-properties
594 (progn
595 (forward-char)
596 (point))
597 (progn
598 (skip-chars-forward (if (= delim ?\")
599 ebnf-ebx-double-string-chars
600 ebnf-ebx-single-string-chars)
601 ebnf-limit)
602 (or (= (following-char) delim)
603 (error "Missing string delimiter `%c'" delim))
604 (prog1
605 (point)
606 (forward-char)))))
607
608
609(defun ebnf-ebx-character ()
610 ;; #x [0-9A-Fa-f]+
611 (buffer-substring-no-properties
612 (point)
613 (progn
614 (ebnf-ebx-hex-character)
615 (point))))
616
617
618(defun ebnf-ebx-range ()
619 ;; [ ^? ( char ( - char )? )+ ]
620 (buffer-substring-no-properties
621 (point)
622 (progn
623 (forward-char)
624 (and (= (following-char) ?^)
625 (forward-char))
626 (and (= (following-char) ?-)
627 (forward-char))
628 (while (progn
629 (ebnf-ebx-any-character)
630 (when (= (following-char) ?-)
631 (forward-char)
632 (ebnf-ebx-any-character))
633 (and (/= (following-char) ?\])
634 (< (point) ebnf-limit))))
635 (and (>= (point) ebnf-limit)
636 (error "Missing end of character range `]'"))
637 (forward-char)
638 (point))))
639
640
641(defun ebnf-ebx-any-character ()
642 (let ((char (following-char)))
643 (cond ((= char ?#)
644 (ebnf-ebx-hex-character t))
645 ((or (and (<= ?\ char) (<= char ?\")) ; #
646 (and (<= ?$ char) (<= char ?,)) ; -
647 (and (<= ?. char) (<= char ?\\)) ; ]
648 (and (<= ?^ char) (<= char ?~))
649 (and (<= ?\240 char) (<= char ?\377)))
650 (forward-char))
651 (t
652 (error "Invalid character `%c'" char)))))
653
654
655(defun ebnf-ebx-hex-character (&optional no-error)
656 ;; #x [0-9A-Fa-f]+
657 (forward-char)
658 (if (/= (following-char) ?x)
659 (or no-error
660 (error "Invalid hexadecimal character"))
661 (forward-char)
662 (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit) 0)
663 (error "Invalid hexadecimal character"))))
664
665\f
666;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
667
668
669(provide 'ebnf-ebx)
670
671
672;;; arch-tag:
673;;; ebnf-ebx.el ends here