Fix renamed filenames in file headers.
[bpt/emacs.git] / lisp / cedet / semantic / lex.el
1 ;;; lex.el --- Lexical Analyzer builder
2
3 ;;; Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 ;;; 2007, 2008, 2009 Free Software Foundation, Inc.
5
6 ;; Author: Eric M. Ludlam <zappo@gnu.org>
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22
23 ;;; Commentary:
24 ;;
25 ;; This file handles the creation of lexical analyzers for different
26 ;; languages in Emacs Lisp. The purpose of a lexical analyzer is to
27 ;; convert a buffer into a list of lexical tokens. Each token
28 ;; contains the token class (such as 'number, 'symbol, 'IF, etc) and
29 ;; the location in the buffer it was found. Optionally, a token also
30 ;; contains a string representing what is at the designated buffer
31 ;; location.
32 ;;
33 ;; Tokens are pushed onto a token stream, which is basically a list of
34 ;; all the lexical tokens from the analyzed region. The token stream
35 ;; is then handed to the grammar which parses the file.
36 ;;
37 ;;; How it works
38 ;;
39 ;; Each analyzer specifies a condition and forms. These conditions
40 ;; and forms are assembled into a function by `define-lex' that does
41 ;; the lexical analysis.
42 ;;
43 ;; In the lexical analyzer created with `define-lex', each condition
44 ;; is tested for a given point. When the condition is true, the forms
45 ;; run.
46 ;;
47 ;; The forms can push a lexical token onto the token stream. The
48 ;; analyzer forms also must move the current analyzer point. If the
49 ;; analyzer point is moved without pushing a token, then the matched
50 ;; syntax is effectively ignored, or skipped.
51 ;;
52 ;; Thus, starting at the beginning of a region to be analyzed, each
53 ;; condition is tested. One will match, and a lexical token might be
54 ;; pushed, and the point is moved to the end of the lexical token
55 ;; identified. At the new position, the process occurs again until
56 ;; the end of the specified region is reached.
57 ;;
58 ;;; How to use semantic-lex
59 ;;
60 ;; To create a lexer for a language, use the `define-lex' macro.
61 ;;
62 ;; The `define-lex' macro accepts a list of lexical analyzers. Each
63 ;; analyzer is created with `define-lex-analyzer', or one of the
64 ;; derivative macros. A single analyzer defines a regular expression
65 ;; to match text in a buffer, and a short segment of code to create
66 ;; one lexical token.
67 ;;
68 ;; Each analyzer has a NAME, DOC, a CONDITION, and possibly some
69 ;; FORMS. The NAME is the name used in `define-lex'. The DOC
70 ;; describes what the analyzer should do.
71 ;;
72 ;; The CONDITION evaluates the text at the current point in the
73 ;; current buffer. If CONDITION is true, then the FORMS will be
74 ;; executed.
75 ;;
76 ;; The purpose of the FORMS is to push new lexical tokens onto the
77 ;; list of tokens for the current buffer, and to move point after the
78 ;; matched text.
79 ;;
80 ;; Some macros for creating one analyzer are:
81 ;;
82 ;; define-lex-analyzer - A generic analyzer associating any style of
83 ;; condition to forms.
84 ;; define-lex-regex-analyzer - Matches a regular expression.
85 ;; define-lex-simple-regex-analyzer - Matches a regular expressions,
86 ;; and pushes the match.
87 ;; define-lex-block-analyzer - Matches list syntax, and defines
88 ;; handles open/close delimiters.
89 ;;
90 ;; These macros are used by the grammar compiler when lexical
91 ;; information is specified in a grammar:
92 ;; define-lex- * -type-analyzer - Matches syntax specified in
93 ;; a grammar, and pushes one token for it. The * would
94 ;; be `sexp' for things like lists or strings, and
95 ;; `string' for things that need to match some special
96 ;; string, such as "\\." where a literal match is needed.
97 ;;
98 ;;; Lexical Tables
99 ;;
100 ;; There are tables of different symbols managed in semantic-lex.el.
101 ;; They are:
102 ;;
103 ;; Lexical keyword table - A Table of symbols declared in a grammar
104 ;; file with the %keyword declaration.
105 ;; Keywords are used by `semantic-lex-symbol-or-keyword'
106 ;; to create lexical tokens based on the keyword.
107 ;;
108 ;; Lexical type table - A table of symbols declared in a grammar
109 ;; file with the %type declaration.
110 ;; The grammar compiler uses the type table to create new
111 ;; lexical analyzers. These analyzers are then used to when
112 ;; a new lexical analyzer is made for a language.
113 ;;
114 ;;; Lexical Types
115 ;;
116 ;; A lexical type defines a kind of lexical analyzer that will be
117 ;; automatically generated from a grammar file based on some
118 ;; predetermined attributes. For now these two attributes are
119 ;; recognized :
120 ;;
121 ;; * matchdatatype : define the kind of lexical analyzer. That is :
122 ;;
123 ;; - regexp : define a regexp analyzer (see
124 ;; `define-lex-regex-type-analyzer')
125 ;;
126 ;; - string : define a string analyzer (see
127 ;; `define-lex-string-type-analyzer')
128 ;;
129 ;; - block : define a block type analyzer (see
130 ;; `define-lex-block-type-analyzer')
131 ;;
132 ;; - sexp : define a sexp analyzer (see
133 ;; `define-lex-sexp-type-analyzer')
134 ;;
135 ;; - keyword : define a keyword analyzer (see
136 ;; `define-lex-keyword-type-analyzer')
137 ;;
138 ;; * syntax : define the syntax that matches a syntactic
139 ;; expression. When syntax is matched the corresponding type
140 ;; analyzer is entered and the resulting match data will be
141 ;; interpreted based on the kind of analyzer (see matchdatatype
142 ;; above).
143 ;;
144 ;; The following lexical types are predefined :
145 ;;
146 ;; +-------------+---------------+--------------------------------+
147 ;; | type | matchdatatype | syntax |
148 ;; +-------------+---------------+--------------------------------+
149 ;; | punctuation | string | "\\(\\s.\\|\\s$\\|\\s'\\)+" |
150 ;; | keyword | keyword | "\\(\\sw\\|\\s_\\)+" |
151 ;; | symbol | regexp | "\\(\\sw\\|\\s_\\)+" |
152 ;; | string | sexp | "\\s\"" |
153 ;; | number | regexp | semantic-lex-number-expression |
154 ;; | block | block | "\\s(\\|\\s)" |
155 ;; +-------------+---------------+--------------------------------+
156 ;;
157 ;; In a grammar you must use a %type expression to automatically generate
158 ;; the corresponding analyzers of that type.
159 ;;
160 ;; Here is an example to auto-generate punctuation analyzers
161 ;; with 'matchdatatype and 'syntax predefined (see table above)
162 ;;
163 ;; %type <punctuation> ;; will auto-generate this kind of analyzers
164 ;;
165 ;; It is equivalent to write :
166 ;;
167 ;; %type <punctuation> syntax "\\(\\s.\\|\\s$\\|\\s'\\)+" matchdatatype string
168 ;;
169 ;; ;; Some punctuations based on the type defines above
170 ;;
171 ;; %token <punctuation> NOT "!"
172 ;; %token <punctuation> NOTEQ "!="
173 ;; %token <punctuation> MOD "%"
174 ;; %token <punctuation> MODEQ "%="
175 ;;
176
177 ;;; On the Semantic 1.x lexer
178 ;;
179 ;; In semantic 1.x, the lexical analyzer was an all purpose routine.
180 ;; To boost efficiency, the analyzer is now a series of routines that
181 ;; are constructed at build time into a single routine. This will
182 ;; eliminate unneeded if statements to speed the lexer.
183
184 (require 'semantic/fw)
185 ;;; Code:
186
187 ;;; Compatibility
188 ;;
;; Define `with-syntax-table' ourselves only on Emacsen too old to
;; provide it (the `fboundp' test makes this a no-op on modern Emacs).
(eval-and-compile
  (if (not (fboundp 'with-syntax-table))

      ;; Copied from Emacs 21 for compatibility with released Emacses.
      (defmacro with-syntax-table (table &rest body)
        "With syntax table of current buffer set to a copy of TABLE, evaluate BODY.
The syntax table of the current buffer is saved, BODY is evaluated, and the
saved table is restored, even in case of an abnormal exit.
Value is what BODY returns."
        ;; Uninterned symbols keep the expansion from capturing
        ;; variables of the same name inside BODY.
        (let ((old-table (make-symbol "table"))
              (old-buffer (make-symbol "buffer")))
          `(let ((,old-table (syntax-table))
                 (,old-buffer (current-buffer)))
             (unwind-protect
                 (progn
                   (set-syntax-table (copy-syntax-table ,table))
                   ,@body)
               ;; Restore in the buffer that was current at entry, in
               ;; case BODY switched buffers.
               (save-current-buffer
                 (set-buffer ,old-buffer)
                 (set-syntax-table ,old-table))))))

    ))
211 \f
212 ;;; Semantic 2.x lexical analysis
213 ;;
(defun semantic-lex-map-symbols (fun table &optional property)
  "Call function FUN on every symbol in TABLE.
If optional PROPERTY is non-nil, call FUN only on every symbol which
has a non-nil PROPERTY value.  FUN receives a symbol as argument."
  (when (arrayp table)
    (mapatoms (lambda (sym)
                (when (or (null property) (get sym property))
                  (funcall fun sym)))
              table)))
224
225 ;;; Lexical keyword table handling.
226 ;;
227 ;; These keywords are keywords defined for using in a grammar with the
228 ;; %keyword declaration, and are not keywords used in Emacs Lisp.
229
;; NOTE(review): the `flex' prefix looks like a Semantic 1.x holdover
;; (cf. `semantic-lex-analyzer' defaulting to `semantic-flex') -- confirm.
(defvar semantic-flex-keywords-obarray nil
  "Buffer local keyword obarray for the lexical analyzer.
These keywords are matched explicitly, and converted into special symbols.")
(make-variable-buffer-local 'semantic-flex-keywords-obarray)
234
(defmacro semantic-lex-keyword-invalid (name)
  "Signal that NAME is an invalid keyword name.
Raises a `wrong-type-argument' error naming `semantic-lex-keyword-p'
as the failed predicate; this never returns normally."
  `(signal 'wrong-type-argument '(semantic-lex-keyword-p ,name)))
238
(defsubst semantic-lex-keyword-symbol (name)
  "Return keyword symbol with NAME or nil if not found.
Looks NAME up in the buffer-local `semantic-flex-keywords-obarray'."
  (when (and (arrayp semantic-flex-keywords-obarray)
             (stringp name))
    (intern-soft name semantic-flex-keywords-obarray)))
244
(defsubst semantic-lex-keyword-p (name)
  "Return non-nil if a keyword with NAME exists in the keyword table.
Return nil otherwise."
  (let ((sym (semantic-lex-keyword-symbol name)))
    (and sym (symbol-value sym))))
250
(defsubst semantic-lex-keyword-set (name value)
  "Set value of keyword with NAME to VALUE and return VALUE.
Interns NAME into `semantic-flex-keywords-obarray', creating the
keyword symbol if it does not yet exist."
  (set (intern name semantic-flex-keywords-obarray) value))
254
(defsubst semantic-lex-keyword-value (name)
  "Return value of keyword with NAME.
Signal an error if a keyword with NAME does not exist."
  (let ((keyword (semantic-lex-keyword-symbol name)))
    (unless keyword
      ;; Signals `wrong-type-argument'; does not return.
      (semantic-lex-keyword-invalid name))
    (symbol-value keyword)))
262
(defsubst semantic-lex-keyword-put (name property value)
  "For keyword with NAME, set its PROPERTY to VALUE."
  (let ((keyword (semantic-lex-keyword-symbol name)))
    (unless keyword
      ;; Signals `wrong-type-argument'; does not return.
      (semantic-lex-keyword-invalid name))
    (put keyword property value)))
269
(defsubst semantic-lex-keyword-get (name property)
  "For keyword with NAME, return its PROPERTY value.
Signal an error if a keyword with NAME does not exist."
  (let ((keyword (semantic-lex-keyword-symbol name)))
    (if keyword
        (get keyword property)
      ;; No such keyword: signal `wrong-type-argument'.
      (semantic-lex-keyword-invalid name))))
276
(defun semantic-lex-make-keyword-table (specs &optional propspecs)
  "Convert keyword SPECS into an obarray and return it.
SPECS must be a list of (NAME . TOKSYM) elements, where:

  NAME is the name of the keyword symbol to define.
  TOKSYM is the lexical token symbol of that keyword.

If optional argument PROPSPECS is non-nil, then interpret it, and
apply those properties.
PROPSPECS must be a list of (NAME PROPERTY VALUE) elements."
  ;; Dynamically rebind the obarray so the keyword helpers below
  ;; populate this fresh table rather than the buffer-local one.
  (let ((semantic-flex-keywords-obarray (make-vector 13 0)))
    ;; Define one keyword symbol per spec.
    (dolist (spec specs)
      (semantic-lex-keyword-set (car spec) (cdr spec)))
    ;; Attach any requested extra properties.
    (dolist (spec propspecs)
      (semantic-lex-keyword-put (car spec) (nth 1 spec) (nth 2 spec)))
    semantic-flex-keywords-obarray))
301
(defsubst semantic-lex-map-keywords (fun &optional property)
  "Call function FUN on every lexical keyword.
If optional PROPERTY is non-nil, call FUN only on every keyword which
has a PROPERTY value.  FUN receives a lexical keyword as argument."
  ;; Delegates to the generic obarray walker over the keyword table.
  (semantic-lex-map-symbols
   fun semantic-flex-keywords-obarray property))
308
(defun semantic-lex-keywords (&optional property)
  "Return a list of lexical keywords.
If optional PROPERTY is non-nil, return only keywords which have a
PROPERTY set."
  (let ((keywords nil))
    (semantic-lex-map-keywords
     (lambda (symbol) (push symbol keywords))
     property)
    keywords))
318 \f
319 ;;; Type table handling.
320 ;;
321 ;; The lexical type table manages types that occur in a grammar file
322 ;; with the %type declaration. Types represent different syntaxes.
323 ;; See code for `semantic-lex-preset-default-types' for the classic
324 ;; types of syntax.
(defvar semantic-lex-types-obarray nil
  "Buffer local types obarray for the lexical analyzer.")
(make-variable-buffer-local 'semantic-lex-types-obarray)
328
(defmacro semantic-lex-type-invalid (type)
  "Signal that TYPE is an invalid lexical type name.
Raises a `wrong-type-argument' error naming `semantic-lex-type-p' as
the failed predicate; this never returns normally."
  `(signal 'wrong-type-argument '(semantic-lex-type-p ,type)))
332
(defsubst semantic-lex-type-symbol (type)
  "Return symbol with TYPE or nil if not found.
Looks TYPE up in the buffer-local `semantic-lex-types-obarray'."
  (when (and (arrayp semantic-lex-types-obarray)
             (stringp type))
    (intern-soft type semantic-lex-types-obarray)))
338
(defsubst semantic-lex-type-p (type)
  "Return non-nil if a symbol with TYPE name exists."
  (let ((sym (semantic-lex-type-symbol type)))
    (and sym (symbol-value sym))))
343
(defsubst semantic-lex-type-set (type value)
  "Set value of symbol with TYPE name to VALUE and return VALUE.
Interns TYPE into `semantic-lex-types-obarray', creating the type
symbol if it does not yet exist."
  (set (intern type semantic-lex-types-obarray) value))
347
(defsubst semantic-lex-type-value (type &optional noerror)
  "Return value of symbol with TYPE name.
If optional argument NOERROR is non-nil return nil if a symbol with
TYPE name does not exist.  Otherwise signal an error."
  (let ((sym (semantic-lex-type-symbol type)))
    (cond (sym     (symbol-value sym))
          (noerror nil)
          (t       (semantic-lex-type-invalid type)))))
357
(defsubst semantic-lex-type-put (type property value &optional add)
  "For symbol with TYPE name, set its PROPERTY to VALUE.
If optional argument ADD is non-nil, create a new symbol with TYPE
name if it does not already exist.  Otherwise signal an error."
  (let ((sym (semantic-lex-type-symbol type)))
    (unless sym
      ;; Either error out, or create the type with a nil value first.
      (or add (semantic-lex-type-invalid type))
      (semantic-lex-type-set type nil)
      (setq sym (semantic-lex-type-symbol type)))
    (put sym property value)))
368
(defsubst semantic-lex-type-get (type property &optional noerror)
  "For symbol with TYPE name, return its PROPERTY value.
If optional argument NOERROR is non-nil return nil if a symbol with
TYPE name does not exist.  Otherwise signal an error."
  (let ((sym (semantic-lex-type-symbol type)))
    (cond (sym     (get sym property))
          (noerror nil)
          (t       (semantic-lex-type-invalid type)))))
378
(defun semantic-lex-preset-default-types ()
  "Install useful default properties for well known types.
The (type matchdatatype syntax) triples installed here correspond to
the predefined-types table in the Commentary section of this file."
  (semantic-lex-type-put "punctuation" 'matchdatatype 'string t)
  (semantic-lex-type-put "punctuation" 'syntax "\\(\\s.\\|\\s$\\|\\s'\\)+")
  (semantic-lex-type-put "keyword" 'matchdatatype 'keyword t)
  (semantic-lex-type-put "keyword" 'syntax "\\(\\sw\\|\\s_\\)+")
  (semantic-lex-type-put "symbol" 'matchdatatype 'regexp t)
  (semantic-lex-type-put "symbol" 'syntax "\\(\\sw\\|\\s_\\)+")
  (semantic-lex-type-put "string" 'matchdatatype 'sexp t)
  (semantic-lex-type-put "string" 'syntax "\\s\"")
  (semantic-lex-type-put "number" 'matchdatatype 'regexp t)
  ;; The number syntax is a variable, not a literal regexp.
  (semantic-lex-type-put "number" 'syntax 'semantic-lex-number-expression)
  (semantic-lex-type-put "block" 'matchdatatype 'block t)
  (semantic-lex-type-put "block" 'syntax "\\s(\\|\\s)")
  )
394
(defun semantic-lex-make-type-table (specs &optional propspecs)
  "Convert type SPECS into an obarray and return it.
SPECS must be a list of (TYPE . TOKENS) elements, where:

  TYPE is the name of the type symbol to define.
  TOKENS is a list of (TOKSYM . MATCHER) elements, where:

    TOKSYM is any lexical token symbol.
    MATCHER is a string or regexp a text must match to be such a
    lexical token.

If optional argument PROPSPECS is non-nil, then interpret it, and
apply those properties.
PROPSPECS must be a list of (TYPE PROPERTY VALUE)."
  ;; Create the symbol hash table
  (let* ((semantic-lex-types-obarray (make-vector 13 0))
         spec type tokens token alist default)
    ;; fill it with stuff
    (while specs
      (setq spec    (car specs)
            specs   (cdr specs)
            type    (car spec)
            tokens  (cdr spec)
            default nil
            alist   nil)
      (while tokens
        (setq token  (car tokens)
              tokens (cdr tokens))
        (if (cdr token)
            ;; (TOKSYM . MATCHER) pair: a regular matching spec.
            (setq alist (cons token alist))
          ;; (TOKSYM) with no matcher: this token becomes the default.
          (setq token (car token))
          (if default
              (message
               "*Warning* default value of <%s> tokens changed to %S, was %S"
               ;; Fixed argument order: TOKEN is the incoming new
               ;; default, DEFAULT is the value being replaced.  The
               ;; original passed (default token), printing old as new.
               type token default))
          (setq default token)))
      ;; Ensure the default matching spec is the first one.
      (semantic-lex-type-set type (cons default (nreverse alist))))
    ;; Install useful default types & properties
    (semantic-lex-preset-default-types)
    ;; Apply all properties
    (while propspecs
      (setq spec (car propspecs)
            propspecs (cdr propspecs))
      ;; Create the type if necessary.
      (semantic-lex-type-put (car spec) (nth 1 spec) (nth 2 spec) t))
    semantic-lex-types-obarray))
442
(defsubst semantic-lex-map-types (fun &optional property)
  "Call function FUN on every lexical type.
If optional PROPERTY is non-nil, call FUN only on every type symbol
which has a PROPERTY value.  FUN receives a type symbol as argument."
  ;; Delegates to the generic obarray walker over the type table.
  (semantic-lex-map-symbols
   fun semantic-lex-types-obarray property))
449
(defun semantic-lex-types (&optional property)
  "Return a list of lexical type symbols.
If optional PROPERTY is non-nil, return only type symbols which have
PROPERTY set."
  (let ((types nil))
    (semantic-lex-map-types
     (lambda (symbol) (push symbol types))
     property)
    types))
459 \f
460 ;;; Lexical Analyzer framework settings
461 ;;
462
(defvar semantic-lex-analyzer 'semantic-flex
  "The lexical analyzer used for a given buffer.
See `semantic-lex' for documentation.
For compatibility with Semantic 1.x it defaults to `semantic-flex'.")
(make-variable-buffer-local 'semantic-lex-analyzer)
468
(defvar semantic-lex-tokens
  '(
    (bol)
    (charquote)
    (close-paren)
    (comment)
    (newline)
    (open-paren)
    (punctuation)
    (semantic-list)
    (string)
    (symbol)
    (whitespace)
    )
  "An alist of semantic token types.
As of December 2001 (semantic 1.4beta13), this variable is not used in
any code.  The only use is to refer to the doc-string from elsewhere.

The key to this alist is the symbol representing token type that
\\[semantic-flex] returns.  These are

  - bol:           Empty string matching a beginning of line.
                   This token is produced with
                   `semantic-lex-beginning-of-line'.

  - charquote:     String sequences that match `\\s\\+' regexp.
                   This token is produced with `semantic-lex-charquote'.

  - close-paren:   Characters that match `\\s)' regexp.
                   These are typically `)', `}', `]', etc.
                   This token is produced with
                   `semantic-lex-close-paren'.

  - comment:       A comment chunk.  These token types are not
                   produced by default.
                   This token is produced with `semantic-lex-comments'.
                   Comments are ignored with `semantic-lex-ignore-comments'.
                   Comments are treated as whitespace with
                   `semantic-lex-comments-as-whitespace'.

  - newline:       Characters matching `\\s-*\\(\n\\|\\s>\\)' regexp.
                   This token is produced with `semantic-lex-newline'.

  - open-paren:    Characters that match `\\s(' regexp.
                   These are typically `(', `{', `[', etc.
                   If `semantic-lex-paren-or-list' is used,
                   then `open-paren' is not usually generated unless
                   the `depth' argument to \\[semantic-lex] is
                   greater than 0.
                   This token is always produced if the analyzer
                   `semantic-lex-open-paren' is used.

  - punctuation:   Characters matching `{\\(\\s.\\|\\s$\\|\\s'\\)'
                   regexp.
                   This token is produced with `semantic-lex-punctuation'.
                   Always specify this analyzer after the comment
                   analyzer.

  - semantic-list: String delimited by matching parenthesis, braces,
                   etc.  that the lexer skipped over, because the
                   `depth' parameter to \\[semantic-flex] was not high
                   enough.
                   This token is produced with `semantic-lex-paren-or-list'.

  - string:        Quoted strings, i.e., string sequences that start
                   and end with characters matching `\\s\"'
                   regexp.  The lexer relies on @code{forward-sexp} to
                   find the matching end.
                   This token is produced with `semantic-lex-string'.

  - symbol:        String sequences that match `\\(\\sw\\|\\s_\\)+'
                   regexp.
                   This token is produced with
                   `semantic-lex-symbol-or-keyword'.  Always add this analyzer
                   after `semantic-lex-number', or other analyzers that
                   match its regular expression.

  - whitespace:    Characters that match `\\s-+' regexp.
                   This token is produced with `semantic-lex-whitespace'.")
548
(defvar semantic-lex-syntax-modifications nil
  "Changes to the syntax table for this buffer.
These changes are active only while the buffer is being flexed.
This is a list where each element has the form:
  (CHAR CLASS)
CHAR is the char passed to `modify-syntax-entry',
and CLASS is the string also passed to `modify-syntax-entry' to define
what syntax class CHAR has.")
(make-variable-buffer-local 'semantic-lex-syntax-modifications)
558
(defvar semantic-lex-syntax-table nil
  "Syntax table used by lexical analysis.
See also `semantic-lex-syntax-modifications'.")
(make-variable-buffer-local 'semantic-lex-syntax-table)
563
(defvar semantic-lex-comment-regex nil
  "Regular expression for identifying comment start during lexical analysis.
This may be automatically set when semantic initializes in a mode, but
may need to be overridden for some special languages.")
(make-variable-buffer-local 'semantic-lex-comment-regex)
569
(defvar semantic-lex-number-expression
  ;; This expression was written by David Ponce for Java, and copied
  ;; here for C and any other similar language.
  (eval-when-compile
    (concat "\\("
            "\\<[0-9]+[.][0-9]+\\([eE][-+]?[0-9]+\\)?[fFdD]?\\>"
            "\\|"
            "\\<[0-9]+[.][eE][-+]?[0-9]+[fFdD]?\\>"
            "\\|"
            "\\<[0-9]+[.][fFdD]\\>"
            "\\|"
            "\\<[0-9]+[.]"
            "\\|"
            "[.][0-9]+\\([eE][-+]?[0-9]+\\)?[fFdD]?\\>"
            "\\|"
            "\\<[0-9]+[eE][-+]?[0-9]+[fFdD]?\\>"
            "\\|"
            "\\<0[xX][0-9a-fA-F]+[lL]?\\>"
            "\\|"
            "\\<[0-9]+[lLfFdD]?\\>"
            "\\)"
            ))
  "Regular expression for matching a number.
If this value is nil, no number extraction is done during lex.
This expression tries to match C and Java like numbers.

DECIMAL_LITERAL:
    [1-9][0-9]*
  ;
HEX_LITERAL:
    0[xX][0-9a-fA-F]+
  ;
OCTAL_LITERAL:
    0[0-7]*
  ;
INTEGER_LITERAL:
    <DECIMAL_LITERAL>[lL]?
  | <HEX_LITERAL>[lL]?
  | <OCTAL_LITERAL>[lL]?
  ;
EXPONENT:
    [eE][+-]?[0-9]+
  ;
FLOATING_POINT_LITERAL:
    [0-9]+[.][0-9]*<EXPONENT>?[fFdD]?
  | [.][0-9]+<EXPONENT>?[fFdD]?
  | [0-9]+<EXPONENT>[fFdD]?
  | [0-9]+<EXPONENT>?[fFdD]
  ;")
(make-variable-buffer-local 'semantic-lex-number-expression)
620
(defvar semantic-lex-depth 0
  "Default lexing depth.
This specifies how many lists to create tokens in.")
(make-variable-buffer-local 'semantic-lex-depth)
625
(defvar semantic-lex-unterminated-syntax-end-function
  ;; Default: give up and treat the analysis end as the syntax end.
  (lambda (syntax syntax-start lex-end) lex-end)
  "Function called when unterminated syntax is encountered.
This should be set to one function.  That function should take three
parameters.  The SYNTAX, or type of syntax which is unterminated.
SYNTAX-START where the broken syntax begins.
LEX-END is where the lexical analysis was asked to end.
This function can be used for languages that can intelligently fix up
broken syntax, or to exit lexical analysis via `throw' or `signal'
when finding unterminated syntax.")
636
637 ;;; Interactive testing commands
638
(defun semantic-lex-test (arg)
  "Test the semantic lexer in the current buffer.
If universal argument ARG, then try the whole buffer."
  (interactive "P")
  (let ((from (if arg (point-min) (point)))
        (started (current-time)))
    (let ((stream (semantic-lex from (point-max))))
      (message "Elapsed Time: %.2f seconds."
               (semantic-elapsed-time started (current-time)))
      ;; Pretty-print the token stream into a dedicated buffer.
      (pop-to-buffer "*Lexer Output*")
      (require 'pp)
      (erase-buffer)
      (insert (pp-to-string stream))
      (goto-char (point-min)))))
656
(defun semantic-lex-test-full-depth (arg)
  "Test the semantic lexer in the current buffer parsing through lists.
Like `semantic-lex-test', but passes a depth of 100 so that tokens
inside nested lists are analyzed as well.
If universal argument ARG, then try the whole buffer."
  (interactive "P")
  (let ((from (if arg (point-min) (point)))
        (started (current-time)))
    (let ((stream (semantic-lex from (point-max) 100)))
      (message "Elapsed Time: %.2f seconds."
               (semantic-elapsed-time started (current-time)))
      ;; Pretty-print the token stream into a dedicated buffer.
      (pop-to-buffer "*Lexer Output*")
      (require 'pp)
      (erase-buffer)
      (insert (pp-to-string stream))
      (goto-char (point-min)))))
676
(defun semantic-lex-test-region (beg end)
  "Test the semantic lexer in the current buffer.
Analyze the area between BEG and END."
  (interactive "r")
  (let ((stream (semantic-lex beg end)))
    ;; Pretty-print the token stream into a dedicated buffer.
    (pop-to-buffer "*Lexer Output*")
    (require 'pp)
    (erase-buffer)
    (insert (pp-to-string stream))
    (goto-char (point-min))))
688
(defvar semantic-lex-debug nil
  "When non-nil, debug the local lexical analyzer.")

;; Note: the command below deliberately shares its name with the
;; variable above; Lisp function and variable namespaces are separate.
(defun semantic-lex-debug (arg)
  "Debug the semantic lexer in the current buffer.
Argument ARG specifies whether to analyze the whole buffer, or start
at point.  While engaged, each token identified by the lexer will be
highlighted in the target buffer.  A description of the current token
will be displayed in the minibuffer.  Press SPC to move to the next
lexical token."
  (interactive "P")
  (require 'semantic/debug)
  (let ((semantic-lex-debug t))
    (semantic-lex-test arg)))
702
(defun semantic-lex-highlight-token (token)
  "Highlight the lexical TOKEN.
TOKEN is a lexical token with a START and END position.
Return the overlay."
  (let ((o (semantic-make-overlay (semantic-lex-token-start token)
                                  (semantic-lex-token-end token))))
    (semantic-overlay-put o 'face 'highlight)
    o))
711
(defsubst semantic-lex-debug-break (token)
  "Break during lexical analysis at TOKEN.
Does nothing unless `semantic-lex-debug' is non-nil.  Highlights
TOKEN, waits for a keypress, and always removes the highlight overlay
again, even on quit or error."
  (when semantic-lex-debug
    (let ((o nil))
      (unwind-protect
          (progn
            (when token
              (setq o (semantic-lex-highlight-token token)))
            (semantic-read-event
             (format "%S :: SPC - continue" token))
            )
        ;; Clean up the overlay no matter how we exit.
        (when o
          (semantic-overlay-delete o))))))
725
726 ;;; Lexical analyzer creation
727 ;;
728 ;; Code for creating a lex function from lists of analyzers.
729 ;;
730 ;; A lexical analyzer is created from a list of individual analyzers.
731 ;; Each individual analyzer specifies a single match, and code that
732 ;; goes with it.
733 ;;
734 ;; Creation of an analyzer assembles these analyzers into a new function
735 ;; with the behaviors of all the individual analyzers.
736 ;;
(defmacro semantic-lex-one-token (analyzers)
  "Calculate one token from the current buffer at point.
Uses locally bound variables from `define-lex'.
Argument ANALYZERS is the list of analyzers being used."
  ;; Each analyzer symbol's value is a (CONDITION FORMS...) clause;
  ;; `symbol-value' is consulted at macro-expansion time to splice
  ;; them all into one big `cond'.
  (cons 'cond (mapcar #'symbol-value analyzers)))
742
;; The variables below hold per-run lexer state; `define-lex' rebinds
;; most of them with `let*' around each lexical pass.
(defvar semantic-lex-end-point nil
  "The end point as tracked through lexical functions.")

(defvar semantic-lex-current-depth nil
  "The current depth as tracked through lexical functions.")

(defvar semantic-lex-maximum-depth nil
  "The maximum depth of parenthesis as tracked through lexical functions.")

(defvar semantic-lex-token-stream nil
  "The current token stream we are collecting.")

(defvar semantic-lex-analysis-bounds nil
  "The bounds of the current analysis.")

(defvar semantic-lex-block-streams nil
  "Streams of tokens inside collapsed blocks.
This is an alist of (ANCHOR . STREAM) elements where ANCHOR is the
start position of the block, and STREAM is the list of tokens in that
block.")

(defvar semantic-lex-reset-hooks nil
  "List of hooks major-modes use to reset lexical analyzers.
Hooks are called with START and END values for the current lexical pass.
Should be set with `add-hook' specifying a LOCAL option.")

;; Stack of nested blocks.
(defvar semantic-lex-block-stack nil)
;;(defvar semantic-lex-timeout 5
;;  "*Number of sections of lexing before giving up.")
773
(defmacro define-lex (name doc &rest analyzers)
  "Create a new lexical analyzer with NAME.
DOC is a documentation string describing this analyzer.
ANALYZERS are small code snippets of analyzers to use when
building the new NAMED analyzer.  Only use analyzers which
are written to be used in `define-lex'.
Each analyzer should be an analyzer created with `define-lex-analyzer'.
Note: The order in which analyzers are listed is important.
If two analyzers can match the same text, it is important to order the
analyzers so that the one you want to match first occurs first.  For
example, it is good to put a number analyzer in front of a symbol
analyzer which might mistake a number for a symbol."
  `(defun ,name  (start end &optional depth length)
     ,(concat doc "\nSee `semantic-lex' for more information.")
     ;; Make sure the state of block parsing starts over.
     (setq semantic-lex-block-streams nil)
     ;; Allow specialty reset items.
     (run-hook-with-args 'semantic-lex-reset-hooks start end)
     ;; Lexing state.
     (let* (;(starttime (current-time))
            (starting-position (point))
            (semantic-lex-token-stream nil)
            (semantic-lex-block-stack nil)
            ;; tmp-start tracks the previous end point so an analyzer
            ;; that fails to advance can be detected below.
            (tmp-start start)
            (semantic-lex-end-point start)
            (semantic-lex-current-depth 0)
            ;; Use the default depth when not specified.
            (semantic-lex-maximum-depth
             (or depth semantic-lex-depth))
            ;; Bounds needed for unterminated syntax
            (semantic-lex-analysis-bounds (cons start end))
            ;; This entry prevents text properties from
            ;; confusing our lexical analysis.  See Emacs 22 (CVS)
            ;; version of C++ mode with template hack text properties.
            (parse-sexp-lookup-properties nil)
            )
       ;; Maybe REMOVE THIS LATER.
       ;; Trying to find incremental parser bug.
       (when (> end (point-max))
         (error ,(format "%s: end (%%d) > point-max (%%d)" name)
                end (point-max)))
       (with-syntax-table semantic-lex-syntax-table
         (goto-char start)
         (while (and (< (point) end)
                     (or (not length)
                         (<= (length semantic-lex-token-stream) length)))
           ;; Expands to one big `cond' over ANALYZERS.
           (semantic-lex-one-token ,analyzers)
           ;; If no analyzer moved the end point, we would loop forever.
           (when (eq semantic-lex-end-point tmp-start)
             (error ,(format "%s: endless loop at %%d, after %%S" name)
                    tmp-start (car semantic-lex-token-stream)))
           (setq tmp-start semantic-lex-end-point)
           (goto-char semantic-lex-end-point)
           ;;(when (> (semantic-elapsed-time starttime (current-time))
           ;;         semantic-lex-timeout)
           ;;  (error "Timeout during lex at char %d" (point)))
           (semantic-throw-on-input 'lex)
           (semantic-lex-debug-break (car semantic-lex-token-stream))
           ))
       ;; Check that there is no unterminated block.
       (when semantic-lex-block-stack
         (let* ((last (pop semantic-lex-block-stack))
                (blk last))
           (while blk
             (message
              ,(format "%s: `%%s' block from %%S is unterminated" name)
              (car blk) (cadr blk))
             (setq blk (pop semantic-lex-block-stack)))
           (semantic-lex-unterminated-syntax-detected (car last))))
       ;; Return to where we started.
       ;; Do not wrap in protective stuff so that if there is an error
       ;; thrown, the user knows where.
       (goto-char starting-position)
       ;; Return the token stream
       (nreverse semantic-lex-token-stream))))
848 \f
849 ;;; Collapsed block tokens delimited by any tokens.
850 ;;
(defun semantic-lex-start-block (syntax)
  "Mark the last read token as the beginning of a SYNTAX block.
While the lexer is below `semantic-lex-maximum-depth', only the
nesting depth counter is advanced.  Once that depth is reached, the
block's SYNTAX class and its starting token are recorded on
`semantic-lex-block-stack' so `semantic-lex-end-block' can later
collapse the block into a single token."
  (cond
   ;; Depth limit reached: remember where this block starts.
   ((and semantic-lex-maximum-depth
         (>= semantic-lex-current-depth semantic-lex-maximum-depth))
    (push (list syntax (car semantic-lex-token-stream))
          semantic-lex-block-stack))
   ;; Still shallow enough: just descend one level.
   (t
    (setq semantic-lex-current-depth (1+ semantic-lex-current-depth)))))
858
(defun semantic-lex-end-block (syntax)
  "Process the end of a previously marked SYNTAX block.
That is, collapse the tokens inside that block, including the
beginning and end of block tokens, into a high level block token of
class SYNTAX.
The token at beginning of block is the one marked by a previous call
to `semantic-lex-start-block'.  The current token is the end of block.
The collapsed tokens are saved in `semantic-lex-block-streams'."
  (if (null semantic-lex-block-stack)
      ;; No block was started at this depth; simply pop one
      ;; nesting level.
      (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
    (let* ((stream semantic-lex-token-stream)
           (blk (pop semantic-lex-block-stack))
           (bstream (cdr blk))      ;; tokens collected so far for this block
           (first (car bstream))    ;; token that opened the block
           (last (pop stream))      ;; The current token marks the end of block
           tok)
      (if (not (eq (car blk) syntax))
          ;; SYNTAX doesn't match the syntax of the current block in
          ;; the stack.  So we encountered the end of the SYNTAX block
          ;; before the end of the current one in the stack which is
          ;; signaled unterminated.
          (semantic-lex-unterminated-syntax-detected (car blk))
        ;; Move tokens found inside the block from the main stream
        ;; into a separate block stream.
        (while (and stream (not (eq (setq tok (pop stream)) first)))
          (push tok bstream))
        ;; The token marked as beginning of block was not encountered.
        ;; This should not happen!
        (or (eq tok first)
            (error "Token %S not found at beginning of block `%s'"
                   first syntax))
        ;; Save the block stream for future reuse, to avoid to redo
        ;; the lexical analysis of the block content!
        ;; Anchor the block stream with its start position, so we can
        ;; use: (cdr (assq start semantic-lex-block-streams)) to
        ;; quickly retrieve the lexical stream associated to a block.
        (setcar blk (semantic-lex-token-start first))
        (setcdr blk (nreverse bstream))
        (push blk semantic-lex-block-streams)
        ;; In the main stream, replace the tokens inside the block by
        ;; a high level block token of class SYNTAX.
        (setq semantic-lex-token-stream stream)
        (semantic-lex-push-token
         (semantic-lex-token
          syntax (car blk) (semantic-lex-token-end last)))
        ))))
905 \f
906 ;;; Lexical token API
907 ;;
908 ;; Functions for accessing parts of a token. Use these functions
909 ;; instead of accessing the list structure directly because the
910 ;; contents of the lexical may change.
911 ;;
(defmacro semantic-lex-token (symbol start end &optional str)
  "Create a lexical token.
SYMBOL is a symbol representing the class of syntax found.
START and END define the bounds of the token in the current buffer.
Optional STR is the string for the token iff the bounds
in the buffer do not cover the string they represent.  (As from
macro expansion.)
The produced structure is (SYMBOL START . END), or, with STR,
\(SYMBOL STR START . END)."
  ;; This if statement checks the existence of a STR argument at
  ;; compile time, where STR is some symbol or constant.  If the
  ;; variable STR (runtime) is nil, this will make an incorrect decision.
  ;;
  ;; It is like this to maintain the original speed of the compiled
  ;; code.
  (if str
      `(cons ,symbol (cons ,str (cons ,start ,end)))
    `(cons ,symbol (cons ,start ,end))))
928
(defun semantic-lex-token-p (thing)
  "Return non-nil if THING is a semantic lex token.
This is an exhaustively robust check.
Accepts both the textless form (SYMBOL START . END) and the
text-carrying form (SYMBOL STRING START . END)."
  (when (and (consp thing)
             (symbolp (car thing)))
    (let ((payload (cdr thing)))
      (or
       ;; (SYMBOL START . END)
       (and (numberp (car payload))
            (numberp (cdr payload)))
       ;; (SYMBOL STRING START . END)
       (and (stringp (car payload))
            (numberp (car (cdr payload)))
            (numberp (cdr (cdr payload))))))))
941
(defun semantic-lex-token-with-text-p (thing)
  "Return non-nil if THING is a semantic lex token that carries its text.
Such tokens have the form (SYMBOL STRING START . END), as created by
`semantic-lex-token' when a string argument is supplied.
This is an exhaustively robust check: malformed input, including the
textless token form (SYMBOL START . END), returns nil rather than
signaling an error."
  ;; The previous `length'-based check signaled wrong-type-argument on
  ;; the dotted lists that `semantic-lex-token' actually builds; walk
  ;; the structure with `consp' guards instead, mirroring the shape
  ;; accepted by `semantic-lex-token-p'.
  (and (consp thing)
       (symbolp (car thing))
       (consp (cdr thing))
       (stringp (nth 1 thing))
       (consp (nthcdr 2 thing))
       (numberp (nth 2 thing))
       (numberp (nthcdr 3 thing))))
952
(defun semantic-lex-token-without-text-p (thing)
  "Return non-nil if THING is a semantic lex token without inline text.
Such tokens have the form (SYMBOL START . END), as created by
`semantic-lex-token' when no string argument is supplied.
This is an exhaustively robust check: malformed input, including the
text-carrying token form (SYMBOL STRING START . END), returns nil
rather than signaling an error."
  ;; The previous `length'-based check signaled wrong-type-argument on
  ;; the dotted (SYMBOL START . END) lists that `semantic-lex-token'
  ;; actually builds; walk the structure with `consp' guards instead,
  ;; mirroring the shape accepted by `semantic-lex-token-p'.
  (and (consp thing)
       (symbolp (car thing))
       (consp (cdr thing))
       (numberp (nth 1 thing))
       (numberp (nthcdr 2 thing))))
962
(defun semantic-lex-expand-block-specs (specs)
  "Expand block specifications SPECS into a Lisp form.
SPECS is a list of (BLOCK BEGIN END) elements where BLOCK, BEGIN, and
END are token class symbols that indicate to produce one collapsed
BLOCK token from tokens found between BEGIN and END ones.
BLOCK must be a non-nil symbol, and at least one of the BEGIN or END
symbols must be non-nil too.
When BEGIN is non-nil, generate a call to `semantic-lex-start-block'
when a BEGIN token class is encountered.
When END is non-nil, generate a call to `semantic-lex-end-block' when
an END token class is encountered.
Return nil when SPECS produces no clauses."
  ;; CLASS is an uninterned symbol so the generated `let' binding can
  ;; never capture a variable used by the caller's forms.
  (let ((class (make-symbol "class"))
        (form nil))
    (dolist (spec specs)
      (when (car spec)
        ;; One `cond' clause per BEGIN class...
        (when (nth 1 spec)
          (push `((eq ',(nth 1 spec) ,class)
                  (semantic-lex-start-block ',(car spec)))
                form))
        ;; ...and one per END class.
        (when (nth 2 spec)
          (push `((eq ',(nth 2 spec) ,class)
                  (semantic-lex-end-block ',(car spec)))
                form))))
    (when form
      ;; Dispatch on the class of the token most recently pushed.
      `((let ((,class (semantic-lex-token-class
                       (car semantic-lex-token-stream))))
          (cond ,@(nreverse form))))
      )))
991
(defmacro semantic-lex-push-token (token &rest blockspecs)
  "Push TOKEN in the lexical analyzer token stream.
Return the lexical analysis current end point.
If optional arguments BLOCKSPECS is non-nil, it specifies to process
collapsed block tokens.  See `semantic-lex-expand-block-specs' for
more details.
This macro should only be called within the bounds of
`define-lex-analyzer'.  It changes the values of the lexical analyzer
variables `token-stream' and `semantic-lex-end-point'.  If you need to
move `semantic-lex-end-point' somewhere else, just modify this
variable after calling `semantic-lex-push-token'."
  `(progn
     (push ,token semantic-lex-token-stream)
     ;; Splice in any block start/end dispatch generated at expansion time.
     ,@(semantic-lex-expand-block-specs blockspecs)
     ;; Advance the end point past whatever is now on top of the stream.
     (setq semantic-lex-end-point
           (semantic-lex-token-end (car semantic-lex-token-stream)))
     ))
1009
(defsubst semantic-lex-token-class (token)
  "Return the class symbol of the lexical token TOKEN.
The class is the first element of the token structure.
See also the function `semantic-lex-token'."
  (nth 0 token))
1014
(defsubst semantic-lex-token-bounds (token)
  "Fetch the start and end locations of the lexical token TOKEN.
Return a pair (START . END).
Handles both token forms: (CLASS START . END) and, when the token
carries its text, (CLASS STRING START . END)."
  (if (numberp (car (cdr token)))
      (cdr token)
    (cdr (cdr token))))
1021
(defsubst semantic-lex-token-start (token)
  "Fetch the start position of the lexical token TOKEN.
See also the function `semantic-lex-token'."
  (let ((bounds (semantic-lex-token-bounds token)))
    (car bounds)))
1026
(defsubst semantic-lex-token-end (token)
  "Fetch the end position of the lexical token TOKEN.
See also the function `semantic-lex-token'."
  (let ((bounds (semantic-lex-token-bounds token)))
    (cdr bounds)))
1031
(defsubst semantic-lex-token-text (token)
  "Fetch the text associated with the lexical token TOKEN.
If the token carries its text inline, return that string; otherwise
extract the text from the current buffer using the token bounds.
See also the function `semantic-lex-token'."
  (let ((inline-text (car (cdr token))))
    (if (stringp inline-text)
        inline-text
      (buffer-substring-no-properties
       (semantic-lex-token-start token)
       (semantic-lex-token-end token)))))
1040
(defun semantic-lex-init ()
  "Initialize any lexical state for this buffer."
  ;; Derive a comment-matching regexp from the major mode's
  ;; `comment-start-skip', falling back to the comment-start syntax
  ;; class when the mode does not define one.
  (unless semantic-lex-comment-regex
    (setq semantic-lex-comment-regex
          (if comment-start-skip
              (concat "\\(\\s<\\|" comment-start-skip "\\)")
            "\\(\\s<\\)")))
  ;; Setup the lexer syntax-table: copy the mode's table, then apply
  ;; any per-language modifications.
  (setq semantic-lex-syntax-table (copy-syntax-table (syntax-table)))
  (dolist (mod semantic-lex-syntax-modifications)
    (modify-syntax-entry
     (car mod) (nth 1 mod) semantic-lex-syntax-table)))
1053
(define-overloadable-function semantic-lex (start end &optional depth length)
  "Lexically analyze text in the current buffer between START and END.
Optional argument DEPTH indicates at what level to scan over entire
lists.  The last argument, LENGTH specifies that `semantic-lex'
should only return LENGTH tokens.  The return value is a token stream.
Each element is a list of the form
  (symbol start-expression . end-expression)
where SYMBOL denotes the token type.
See `semantic-lex-tokens' variable for details on token types.  END
does not mark the end of the text scanned, only the end of the
beginning of text scanned.  Thus, if a string extends past END, the
end of the return token will be larger than END.  To truly restrict
scanning, use `narrow-to-region'."
  ;; Dispatch to whichever lexer the current mode installed.
  (funcall semantic-lex-analyzer start end depth length))
1068
(defsubst semantic-lex-buffer (&optional depth)
  "Lex the current buffer.
Optional argument DEPTH is the depth to scan into lists.
Convenience wrapper over `semantic-lex' covering the whole buffer."
  (semantic-lex (point-min) (point-max) depth))
1073
(defsubst semantic-lex-list (semlist depth)
  "Lex the body of SEMLIST to DEPTH.
SEMLIST is a lexical token; its start and end bounds delimit the
region that is re-analyzed with `semantic-lex'."
  (semantic-lex (semantic-lex-token-start semlist)
                (semantic-lex-token-end semlist)
                depth))
1079 \f
1080 ;;; Analyzer creation macros
1081 ;;
1082 ;; An individual analyzer is a condition and code that goes with it.
1083 ;;
1084 ;; Created analyzers become variables with the code associated with them
1085 ;; as the symbol value. These analyzers are assembled into a lexer
1086 ;; to create new lexical analyzers.
1087 ;;
(defsubst semantic-lex-unterminated-syntax-detected (syntax)
  "Inside a lexical analyzer, use this when unterminated syntax was found.
Argument SYNTAX indicates the type of syntax that is unterminated.
The job of this function is to move (point) to a new logical location
so that analysis can continue, if possible."
  ;; Ask the installed recovery function where analysis should resume
  ;; within the current analysis bounds, move there, and record that
  ;; position as the new end point.
  (let ((bounds semantic-lex-analysis-bounds))
    (goto-char
     (funcall semantic-lex-unterminated-syntax-end-function
              syntax
              (car bounds)
              (cdr bounds))))
  (setq semantic-lex-end-point (point)))
1100
(defcustom semantic-lex-debug-analyzers nil
  "Non-nil means to debug analyzers with syntax protection.
Only in effect if `debug-on-error' is also non-nil."
  :group 'semantic
  :type 'boolean)
1106
(defmacro semantic-lex-unterminated-syntax-protection (syntax &rest forms)
  "For SYNTAX, execute FORMS with protection for unterminated syntax.
If FORMS throws an error, treat this as a syntax problem, and
execute the unterminated syntax code.  FORMS should return a position.
Regardless of an error, the cursor should be moved to the end of
the desired syntax, and a position returned.
If `debug-on-error' is set, errors are not caught, so that you can
debug them.
Avoid using a large FORMS since it is duplicated."
  ;; FORMS is spliced twice: once unprotected (taken when debugging is
  ;; enabled) and once wrapped in `condition-case' — hence the advice
  ;; above to keep FORMS small.
  `(if (and debug-on-error semantic-lex-debug-analyzers)
       (progn ,@forms)
     (condition-case nil
         (progn ,@forms)
       (error
        (semantic-lex-unterminated-syntax-detected ,syntax)))))
(put 'semantic-lex-unterminated-syntax-protection
     'lisp-indent-function 1)
1124
(defmacro define-lex-analyzer (name doc condition &rest forms)
  "Create a single lexical analyzer NAME with DOC.
When an analyzer is called, the current buffer and point are
positioned in a buffer at the location to be analyzed.
CONDITION is an expression which returns t if FORMS should be run.
Within the bounds of CONDITION and FORMS, the use of backquote
can be used to evaluate expressions at compile time.
While forms are running, the following variables will be locally bound:
  `semantic-lex-analysis-bounds' - The bounds of the current analysis.
                  of the form (START . END)
  `semantic-lex-maximum-depth' - The maximum depth of semantic-list
                  for the current analysis.
  `semantic-lex-current-depth' - The current depth of `semantic-list' that has
                  been descended.
  `semantic-lex-end-point' - End Point after match.
                   Analyzers should set this to a buffer location if their
                   match string does not represent the end of the matched text.
  `semantic-lex-token-stream' - The token list being collected.
                   Add new lexical tokens to this list.
Proper action in FORMS is to move the value of `semantic-lex-end-point' to
after the location of the analyzed entry, and to add any discovered tokens
at the beginning of `semantic-lex-token-stream'.
This can be done by using `semantic-lex-push-token'."
  `(eval-and-compile
     (defvar ,name nil ,doc)
     (defun ,name nil)
     ;; Do this part separately so that re-evaluation rebuilds this code.
     ;; The variable value (CONDITION FORMS...) is what `define-lex'
     ;; splices into a complete lexer.
     (setq ,name '(,condition ,@forms))
     ;; Build a single lexical analyzer function, so the doc for
     ;; function help is automatically provided, and perhaps the
     ;; function could be useful for testing and debugging one
     ;; analyzer.
     (fset ',name (lambda () ,doc
                    (let ((semantic-lex-token-stream nil)
                          (semantic-lex-end-point (point))
                          (semantic-lex-analysis-bounds
                           (cons (point) (point-max)))
                          (semantic-lex-current-depth 0)
                          (semantic-lex-maximum-depth
                           semantic-lex-depth)
                          )
                      (when ,condition ,@forms)
                      semantic-lex-token-stream)))
     ))
1169
(defmacro define-lex-regex-analyzer (name doc regexp &rest forms)
  "Create a lexical analyzer with NAME and DOC that will match REGEXP.
FORMS are evaluated upon a successful match.
See `define-lex-analyzer' for more about analyzers."
  ;; The condition is simply a regexp match at point.
  `(define-lex-analyzer ,name
     ,doc
     (looking-at ,regexp)
     ,@forms
     ))
1179
(defmacro define-lex-simple-regex-analyzer (name doc regexp toksym
                                                 &optional index
                                                 &rest forms)
  "Create a lexical analyzer with NAME and DOC that match REGEXP.
TOKSYM is the symbol to use when creating a semantic lexical token.
INDEX is the index into the match that defines the bounds of the token.
Index should be a plain integer, and not specified in the macro as an
expression.
FORMS are evaluated upon a successful match BEFORE the new token is
created.  It is valid to ignore FORMS.
See `define-lex-analyzer' for more about analyzers."
  ;; When INDEX is nil the whole match (group 0) bounds the token.
  `(define-lex-analyzer ,name
     ,doc
     (looking-at ,regexp)
     ,@forms
     (semantic-lex-push-token
      (semantic-lex-token ,toksym
                          (match-beginning ,(or index 0))
                          (match-end ,(or index 0))))
     ))
1200
(defmacro define-lex-block-analyzer (name doc spec1 &rest specs)
  "Create a lexical analyzer NAME for paired delimiters blocks.
It detects a paired delimiters block or the corresponding open or
close delimiter depending on the value of the variable
`semantic-lex-current-depth'.  DOC is the documentation string of the lexical
analyzer.  SPEC1 and SPECS specify the token symbols and open, close
delimiters used.  Each SPEC has the form:

\(BLOCK-SYM (OPEN-DELIM OPEN-SYM) (CLOSE-DELIM CLOSE-SYM))

where BLOCK-SYM is the symbol returned in a block token.  OPEN-DELIM
and CLOSE-DELIM are respectively the open and close delimiters
identifying a block.  OPEN-SYM and CLOSE-SYM are respectively the
symbols returned in open and close tokens."
  (let ((specs (cons spec1 specs))
        spec open olist clist)
    ;; Convert the SPECS into lookup alists consulted at runtime.
    (while specs
      (setq spec (car specs)
            specs (cdr specs)
            open (nth 1 spec)
            ;; build alist ((OPEN-DELIM OPEN-SYM BLOCK-SYM) ...)
            olist (cons (list (car open) (cadr open) (car spec)) olist)
            ;; build alist ((CLOSE-DELIM CLOSE-SYM) ...)
            clist (cons (nth 2 spec) clist)))
    `(define-lex-analyzer ,name
       ,doc
       (and
        (looking-at "\\(\\s(\\|\\s)\\)")
        (let ((text (match-string 0)) match)
          (cond
           ((setq match (assoc text ',olist))
            ;; An open delimiter: descend while below the maximum
            ;; depth, otherwise collapse the whole block.
            (if (or (not semantic-lex-maximum-depth)
                    (< semantic-lex-current-depth semantic-lex-maximum-depth))
                (progn
                  (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
                  (semantic-lex-push-token
                   (semantic-lex-token
                    (nth 1 match)
                    (match-beginning 0) (match-end 0))))
              (semantic-lex-push-token
               (semantic-lex-token
                (nth 2 match)
                (match-beginning 0)
                (save-excursion
                  (semantic-lex-unterminated-syntax-protection (nth 2 match)
                    (forward-list 1)
                    (point)))
                ))
              ))
           ((setq match (assoc text ',clist))
            ;; A close delimiter: ascend one level.
            (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
            (semantic-lex-push-token
             (semantic-lex-token
              (nth 1 match)
              (match-beginning 0) (match-end 0)))))))
       )))
1257 \f
1258 ;;; Analyzers
1259 ;;
1260 ;; Pre-defined common analyzers.
1261 ;;
(define-lex-analyzer semantic-lex-default-action
  "The default action when no other lexical actions match text.
This action will just throw an error.
It should always be the last analyzer in a `define-lex' list."
  t  ;; condition: always matches
  (error "Unmatched Text during Lexical Analysis"))
1267
(define-lex-analyzer semantic-lex-beginning-of-line
  "Detect and create a beginning of line token (BOL)."
  (and (bolp)
       ;; Just insert a (bol N . N) token in the token stream,
       ;; without moving the point.  N is the point at the
       ;; beginning of line.
       (semantic-lex-push-token (semantic-lex-token 'bol (point) (point)))
       nil) ;; CONTINUE so other analyzers get a chance to match here
  ;; We identify and add the BOL token onto the stream, but since
  ;; semantic-lex-end-point doesn't move, we always fail CONDITION, and have no
  ;; FORMS body.
  nil)
1280
(define-lex-simple-regex-analyzer semantic-lex-newline
  "Detect and create newline tokens."
  ;; Optional leading whitespace, then a newline or comment-end char;
  ;; only match group 1 (the newline itself) bounds the token.
  "\\s-*\\(\n\\|\\s>\\)" 'newline 1)
1284
(define-lex-regex-analyzer semantic-lex-newline-as-whitespace
  "Detect and create newline tokens.
Use this ONLY if newlines are not whitespace characters (such as when
they are comment end characters) AND when you want whitespace tokens."
  "\\s-*\\(\n\\|\\s>\\)"
  ;; Language wants whitespaces.  Create a token for it.
  (if (eq (semantic-lex-token-class (car semantic-lex-token-stream))
          'whitespace)
      ;; Merge whitespace tokens together if they are adjacent.  Two
      ;; whitespace tokens may be separated by a comment which is not in
      ;; the token stream.
      (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
              (match-end 0))
    (semantic-lex-push-token
     (semantic-lex-token
      'whitespace (match-beginning 0) (match-end 0)))))
1301
(define-lex-regex-analyzer semantic-lex-ignore-newline
  "Detect and ignore newline tokens.
Use this ONLY if newlines are not whitespace characters (such as when
they are comment end characters)."
  "\\s-*\\(\n\\|\\s>\\)"
  ;; Skip past the newline without generating a token.
  (setq semantic-lex-end-point (match-end 0)))
1308
(define-lex-regex-analyzer semantic-lex-whitespace
  "Detect and create whitespace tokens."
  ;; catch whitespace when needed
  "\\s-+"
  ;; Language wants whitespaces.  Create a token for it.
  (if (eq (semantic-lex-token-class (car semantic-lex-token-stream))
          'whitespace)
      ;; Merge whitespace tokens together if they are adjacent.  Two
      ;; whitespace tokens may be separated by a comment which is not in
      ;; the token stream.
      (progn
        (setq semantic-lex-end-point (match-end 0))
        (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
                semantic-lex-end-point))
    (semantic-lex-push-token
     (semantic-lex-token
      'whitespace (match-beginning 0) (match-end 0)))))
1326
(define-lex-regex-analyzer semantic-lex-ignore-whitespace
  "Detect and skip over whitespace tokens."
  ;; catch whitespace when needed
  "\\s-+"
  ;; Skip over the detected whitespace, do not create a token for it.
  (setq semantic-lex-end-point (match-end 0)))
1333
(define-lex-simple-regex-analyzer semantic-lex-number
  "Detect and create number tokens.
See `semantic-lex-number-expression' for details on matching numbers,
and number formats."
  ;; The regexp is language-configurable via this variable.
  semantic-lex-number-expression 'number)
1339
(define-lex-regex-analyzer semantic-lex-symbol-or-keyword
  "Detect and create symbol and keyword tokens."
  "\\(\\sw\\|\\s_\\)+"
  ;; A word found in the keyword table becomes a token of that keyword
  ;; class; anything else becomes a generic `symbol' token.
  (semantic-lex-push-token
   (semantic-lex-token
    (or (semantic-lex-keyword-p (match-string 0)) 'symbol)
    (match-beginning 0) (match-end 0))))
1347
(define-lex-simple-regex-analyzer semantic-lex-charquote
  "Detect and create charquote tokens."
  ;; Character quoting characters (i.e., \n as newline)
  "\\s\\+" 'charquote)
1352
(define-lex-simple-regex-analyzer semantic-lex-punctuation
  "Detect and create punctuation tokens."
  ;; Matches one character of punctuation, paired-delimiter, or
  ;; expression-prefix syntax class.
  "\\(\\s.\\|\\s$\\|\\s'\\)" 'punctuation)
1356
(define-lex-analyzer semantic-lex-punctuation-type
  "Detect and create a punctuation type token.
Recognized punctuations are defined in the current table of lexical
types, as the value of the `punctuation' token type."
  (and (looking-at "\\(\\s.\\|\\s$\\|\\s'\\)+")
       (let* ((key (match-string 0))
              (pos (match-beginning 0))
              (end (match-end 0))
              (len (- end pos))
              (lst (semantic-lex-type-value "punctuation" t))
              (def (car lst)) ;; default lexical symbol or nil
              (lst (cdr lst)) ;; alist of (LEX-SYM . PUNCT-STRING)
              (elt nil))
         (if lst
             ;; Starting with the longest one, search if the
             ;; punctuation string is defined for this language.
             (while (and (> len 0) (not (setq elt (rassoc key lst))))
               (setq len (1- len)
                     key (substring key 0 len))))
         (if elt ;; Return the punctuation token found
             (semantic-lex-push-token
              (semantic-lex-token (car elt) pos (+ pos len)))
           (if def ;; Return a default generic token
               (semantic-lex-push-token
                (semantic-lex-token def pos end))
             ;; Nothing matched; push no token at all.
             )))))
1384
(define-lex-regex-analyzer semantic-lex-paren-or-list
  "Detect open parenthesis.
Return either a paren token or a semantic list token depending on
`semantic-lex-current-depth'."
  "\\s("
  (if (or (not semantic-lex-maximum-depth)
          (< semantic-lex-current-depth semantic-lex-maximum-depth))
      ;; Shallow enough: emit an open-paren token and descend a level.
      (progn
        (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
        (semantic-lex-push-token
         (semantic-lex-token
          'open-paren (match-beginning 0) (match-end 0))))
    ;; Maximum depth reached: collapse the whole list into one
    ;; `semantic-list' token spanning to the matching close paren.
    (semantic-lex-push-token
     (semantic-lex-token
      'semantic-list (match-beginning 0)
      (save-excursion
        (semantic-lex-unterminated-syntax-protection 'semantic-list
          (forward-list 1)
          (point))
        )))
    ))
1406
(define-lex-simple-regex-analyzer semantic-lex-open-paren
  "Detect and create an open parenthesis token."
  ;; The extra form increments the nesting depth before pushing.
  "\\s(" 'open-paren 0  (setq semantic-lex-current-depth (1+ semantic-lex-current-depth)))
1410
(define-lex-simple-regex-analyzer semantic-lex-close-paren
  "Detect and create a close paren token."
  ;; The extra form decrements the nesting depth before pushing.
  "\\s)" 'close-paren 0  (setq semantic-lex-current-depth (1- semantic-lex-current-depth)))
1414
(define-lex-regex-analyzer semantic-lex-string
  "Detect and create a string token."
  "\\s\""
  ;; Zing to the end of this string.  An unterminated string falls
  ;; back to the syntax-protection recovery position.
  (semantic-lex-push-token
   (semantic-lex-token
    'string (point)
    (save-excursion
      (semantic-lex-unterminated-syntax-protection 'string
        (forward-sexp 1)
        (point))
      ))))
1427
(define-lex-regex-analyzer semantic-lex-comments
  "Detect and create a comment token."
  semantic-lex-comment-regex
  (save-excursion
    (forward-comment 1)
    ;; Generate newline token if enabled: back up so a trailing
    ;; newline can still be matched by a newline analyzer.
    (if (bolp) (backward-char 1))
    (setq semantic-lex-end-point (point))
    ;; Language wants comments or want them as whitespaces,
    ;; link them together: extend the previous comment token
    ;; instead of pushing a new one when they are adjacent.
    (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) 'comment)
        (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
                semantic-lex-end-point)
      (semantic-lex-push-token
       (semantic-lex-token
        'comment (match-beginning 0) semantic-lex-end-point)))))
1444
(define-lex-regex-analyzer semantic-lex-comments-as-whitespace
  "Detect comments and create a whitespace token."
  semantic-lex-comment-regex
  (save-excursion
    (forward-comment 1)
    ;; Generate newline token if enabled: back up so a trailing
    ;; newline can still be matched by a newline analyzer.
    (if (bolp) (backward-char 1))
    (setq semantic-lex-end-point (point))
    ;; Language wants comments or want them as whitespaces,
    ;; link them together: extend the previous whitespace token
    ;; instead of pushing a new one when they are adjacent.
    (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) 'whitespace)
        (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
                semantic-lex-end-point)
      (semantic-lex-push-token
       (semantic-lex-token
        'whitespace (match-beginning 0) semantic-lex-end-point)))))
1461
(define-lex-regex-analyzer semantic-lex-ignore-comments
  "Detect comments and skip over them without creating tokens."
  semantic-lex-comment-regex
  (let ((comment-start-point (point)))
    (forward-comment 1)
    (if (eq (point) comment-start-point)
        ;; In this case our start-skip string failed
        ;; to work properly.  Lets try and move over
        ;; whatever white space we matched to begin
        ;; with.
        (skip-syntax-forward "-.'"
                             (save-excursion
                               (end-of-line)
                               (point)))
      ;; We may need to back up so newlines or whitespace is generated.
      (if (bolp)
          (backward-char 1)))
    ;; If point still has not moved, the lexer would loop forever on
    ;; this spot; signal rather than hang.
    (if (eq (point) comment-start-point)
        (error "Strange comment syntax prevents lexical analysis"))
    (setq semantic-lex-end-point (point))))
1482 \f
1483 ;;; Comment lexer
1484 ;;
1485 ;; Predefined lexers that could be used instead of creating new
;; analyzers.
1487
(define-lex semantic-comment-lexer
  "A simple lexical analyzer that handles comments.
This lexer will only return comment tokens.  It is the default lexer
used by `semantic-find-doc-snarf-comment' to snarf up the comment at
point."
  ;; Analyzers are tried in order; the catch-all default must be last.
  semantic-lex-ignore-whitespace
  semantic-lex-ignore-newline
  semantic-lex-comments
  semantic-lex-default-action)
1497
1498 ;;; Test Lexer
1499 ;;
(define-lex semantic-simple-lexer
  "A simple lexical analyzer that handles simple buffers.
This lexer ignores comments and whitespace, and will return
syntax as specified by the syntax table."
  ;; Analyzers are tried in order; the catch-all default must be last.
  semantic-lex-ignore-whitespace
  semantic-lex-ignore-newline
  semantic-lex-number
  semantic-lex-symbol-or-keyword
  semantic-lex-charquote
  semantic-lex-paren-or-list
  semantic-lex-close-paren
  semantic-lex-string
  semantic-lex-ignore-comments
  semantic-lex-punctuation
  semantic-lex-default-action)
1515 \f
1516 ;;; Analyzers generated from grammar.
1517 ;;
1518 ;; Some analyzers are hand written. Analyzers created with these
1519 ;; functions are generated from the grammar files.
1520
(defmacro define-lex-keyword-type-analyzer (name doc syntax)
  "Define a keyword type analyzer NAME with DOC string.
SYNTAX is the regexp that matches a keyword syntactic expression.
The analyzer only produces a token when the matched text is found in
the keyword table by `semantic-lex-keyword-p'."
  ;; KEY is uninterned to avoid capturing callers' variables.
  (let ((key (make-symbol "key")))
    `(define-lex-analyzer ,name
       ,doc
       (and (looking-at ,syntax)
            (let ((,key (semantic-lex-keyword-p (match-string 0))))
              (when ,key
                (semantic-lex-push-token
                 (semantic-lex-token
                  ,key (match-beginning 0) (match-end 0)))))))
    ))
1534
(defmacro define-lex-sexp-type-analyzer (name doc syntax token)
  "Define a sexp type analyzer NAME with DOC string.
SYNTAX is the regexp that matches the beginning of the s-expression.
TOKEN is the lexical token returned when SYNTAX matches.
The token spans from point to the end of the s-expression, with
unterminated-syntax recovery."
  `(define-lex-regex-analyzer ,name
     ,doc
     ,syntax
     (semantic-lex-push-token
      (semantic-lex-token
       ,token (point)
       (save-excursion
         (semantic-lex-unterminated-syntax-protection ,token
           (forward-sexp 1)
           (point))))))
  )
1550
(defmacro define-lex-regex-type-analyzer (name doc syntax matches default)
  "Define a regexp type analyzer NAME with DOC string.
SYNTAX is the regexp that matches a syntactic expression.
MATCHES is an alist of lexical elements used to refine the syntactic
expression.
DEFAULT is the default lexical token returned when no MATCHES."
  (if matches
      ;; All bindings in the generated code use uninterned symbols so
      ;; they cannot capture variables from MATCHES or DEFAULT forms.
      (let* ((val (make-symbol "val"))
             (lst (make-symbol "lst"))
             (elt (make-symbol "elt"))
             (pos (make-symbol "pos"))
             (end (make-symbol "end")))
        `(define-lex-analyzer ,name
           ,doc
           (and (looking-at ,syntax)
                (let* ((,val (match-string 0))
                       (,pos (match-beginning 0))
                       (,end (match-end 0))
                       (,lst ,matches)
                       ,elt)
                  ;; Scan MATCHES for the first refining regexp that
                  ;; matches the captured text.
                  (while (and ,lst (not ,elt))
                    (if (string-match (cdar ,lst) ,val)
                        (setq ,elt (caar ,lst))
                      (setq ,lst (cdr ,lst))))
                  (semantic-lex-push-token
                   (semantic-lex-token (or ,elt ,default) ,pos ,end))))
           ))
    ;; Without MATCHES, a plain regex analyzer suffices.
    `(define-lex-simple-regex-analyzer ,name
       ,doc
       ,syntax ,default)
    ))
1582
(defmacro define-lex-string-type-analyzer (name doc syntax matches default)
  "Define a string type analyzer NAME with DOC string.
SYNTAX is the regexp that matches a syntactic expression.
MATCHES is an alist of lexical elements used to refine the syntactic
expression.
DEFAULT is the default lexical token returned when no MATCHES."
  (if matches
      ;; All bindings in the generated code use uninterned symbols so
      ;; they cannot capture variables from MATCHES or DEFAULT forms.
      (let* ((val (make-symbol "val"))
             (lst (make-symbol "lst"))
             (elt (make-symbol "elt"))
             (pos (make-symbol "pos"))
             (end (make-symbol "end"))
             (len (make-symbol "len")))
        `(define-lex-analyzer ,name
           ,doc
           (and (looking-at ,syntax)
                (let* ((,val (match-string 0))
                       (,pos (match-beginning 0))
                       (,end (match-end 0))
                       (,len (- ,end ,pos))
                       (,lst ,matches)
                       ,elt)
                  ;; Starting with the longest one, search if a lexical
                  ;; value match a token defined for this language.
                  (while (and (> ,len 0) (not (setq ,elt (rassoc ,val ,lst))))
                    (setq ,len (1- ,len)
                          ,val (substring ,val 0 ,len)))
                  (when ,elt ;; Adjust token end position.
                    (setq ,elt (car ,elt)
                          ,end (+ ,pos ,len)))
                  (semantic-lex-push-token
                   (semantic-lex-token (or ,elt ,default) ,pos ,end))))
           ))
    ;; Without MATCHES, a plain regex analyzer suffices.
    `(define-lex-simple-regex-analyzer ,name
       ,doc
       ,syntax ,default)
    ))
1620
(defmacro define-lex-block-type-analyzer (name doc syntax matches)
  "Define a block type analyzer NAME with DOC string.

SYNTAX is the regexp that matches block delimiters, typically the
open (`\\\\s(') and close (`\\\\s)') parenthesis syntax classes.

MATCHES is a pair (OPEN-SPECS . CLOSE-SPECS) that defines blocks.

OPEN-SPECS is a list of (OPEN-DELIM OPEN-TOKEN BLOCK-TOKEN) elements
where:

  OPEN-DELIM is a string: the block open delimiter character.

  OPEN-TOKEN is the lexical token class associated to the OPEN-DELIM
  delimiter.

  BLOCK-TOKEN is the lexical token class associated to the block
  that starts at the OPEN-DELIM delimiter.

CLOSE-SPECS is a list of (CLOSE-DELIM CLOSE-TOKEN) elements where:

  CLOSE-DELIM is a string: the block end delimiter character.

  CLOSE-TOKEN is the lexical token class associated to the
  CLOSE-DELIM delimiter.

Each element in OPEN-SPECS must have a corresponding element in
CLOSE-SPECS.

The lexer will return a BLOCK-TOKEN token when the value of
`semantic-lex-current-depth' is greater than or equal to the maximum
depth of parenthesis tracking (see also the function `semantic-lex').
Otherwise it will return OPEN-TOKEN and CLOSE-TOKEN tokens.

TO DO: Put the following in the developer's guide and just put a
reference here.

In the grammar:

The value of a block token must be a string that contains a readable
sexp of the form:

  \"(OPEN-TOKEN CLOSE-TOKEN)\"

OPEN-TOKEN and CLOSE-TOKEN represent the block delimiters, and must be
lexical tokens of respectively `open-paren' and `close-paren' types.
Their value is the corresponding delimiter character as a string.

Here is a small example to analyze a parenthesis block:

  %token <block>       PAREN_BLOCK \"(LPAREN RPAREN)\"
  %token <open-paren>  LPAREN      \"(\"
  %token <close-paren> RPAREN      \")\"

When the lexer encounters the open-paren delimiter \"(\":

- If the maximum depth of parenthesis tracking is not reached (that
  is, current depth < max depth), it returns a (LPAREN start .  end)
  token, then continue analysis inside the block.  Later, when the
  corresponding close-paren delimiter \")\" will be encountered, it
  will return a (RPAREN start .  end) token.

- If the maximum depth of parenthesis tracking is reached (current
  depth >= max depth), it returns the whole parenthesis block as
  a (PAREN_BLOCK start .  end) token."
  ;; Uninterned symbols avoid variable capture in the expansion.
  (let* ((val (make-symbol "val"))
	 (lst (make-symbol "lst"))
	 (elt (make-symbol "elt")))
    `(define-lex-analyzer ,name
       ,doc
       (and
	(looking-at ,syntax) ;; "\\(\\s(\\|\\s)\\)"
	(let ((,val (match-string 0))
	      (,lst ,matches)
	      ,elt)
	  (cond
	   ;; Matched text is an open delimiter (found in OPEN-SPECS,
	   ;; the car of MATCHES).
	   ((setq ,elt (assoc ,val (car ,lst)))
	    (if (or (not semantic-lex-maximum-depth)
		    (< semantic-lex-current-depth semantic-lex-maximum-depth))
		;; Below max depth: descend into the block and emit
		;; the OPEN-TOKEN (nth 1) for the delimiter alone.
		(progn
		  (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
		  (semantic-lex-push-token
		   (semantic-lex-token
		    (nth 1 ,elt)
		    (match-beginning 0) (match-end 0))))
	      ;; At/over max depth: emit one BLOCK-TOKEN (nth 2)
	      ;; covering the whole balanced expression.
	      (semantic-lex-push-token
	       (semantic-lex-token
		(nth 2 ,elt)
		(match-beginning 0)
		(save-excursion
		  (semantic-lex-unterminated-syntax-protection (nth 2 ,elt)
		    (forward-list 1)
		    (point)))))))
	   ;; Matched text is a close delimiter (found in CLOSE-SPECS,
	   ;; the cdr of MATCHES): pop one depth level.
	   ((setq ,elt (assoc ,val (cdr ,lst)))
	    (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
	    (semantic-lex-push-token
	     (semantic-lex-token
	      (nth 1 ,elt)
	      (match-beginning 0) (match-end 0))))
	   ))))
    ))
1722 \f
1723 ;;; Lexical Safety
1724 ;;
;; The semantic lexers, unlike other lexers, can throw errors on
;; unbalanced syntax.  Since editing is all about changing text,
;; we need to provide a convenient way to protect against syntactic
;; inequalities.
1729
(defmacro semantic-lex-catch-errors (symbol &rest forms)
  "Using SYMBOL, execute FORMS catching lexical errors.
If FORMS results in a call to the parser that throws a lexical error,
the error will be caught here without the buffer's cache being thrown
out of date.
If there is an error, the syntax that failed is returned.
If there is no error, then the last value of FORMS is returned."
  ;; Uninterned symbols prevent capture of caller bindings.
  (let ((ret (make-symbol "ret"))
	(syntax (make-symbol "syntax"))
	(start (make-symbol "start"))
	(end (make-symbol "end")))
    ;; Dynamically rebind the unterminated-syntax hook so that any
    ;; unterminated syntax found while FORMS run throws SYMBOL,
    ;; which is caught just below.
    `(let* ((semantic-lex-unterminated-syntax-end-function
	     (lambda (,syntax ,start ,end)
	       (throw ',symbol ,syntax)))
	    ;; Delete the below when semantic-flex is fully retired.
	    (semantic-flex-unterminated-syntax-end-function
	     semantic-lex-unterminated-syntax-end-function)
	    ;; Non-nil result means the throw fired, i.e. a lexical
	    ;; error occurred; nil means FORMS completed normally.
	    (,ret (catch ',symbol
		    (save-excursion
		      ,@forms
		      nil))))
       ;; Great Sadness.  Assume that FORMS execute within the
       ;; confines of the current buffer only!  Mark this thing
       ;; unparseable iff the special symbol was thrown.  This
       ;; will prevent future calls from parsing, but will allow
       ;; them to still return the cache.
       (when ,ret
	 ;; Leave this message off.  If an APP using this fcn wants
	 ;; a message, they can do it themselves.  This cleans up
	 ;; problems with the idle scheduler obscuring useful data.
	 ;;(message "Buffer not currently parsable (%S)." ,ret)
	 (semantic-parse-tree-unparseable))
       ,ret)))
;; Indent the body of `semantic-lex-catch-errors' like a `let' body.
(put 'semantic-lex-catch-errors 'lisp-indent-function 1)
1764
1765 \f
1766 ;;; Interfacing with edebug
1767 ;;
;; Teach edebug how to instrument forms built with the macros
;; defined in this library.  `#'` is unnecessary on a literal
;; lambda, which is self-quoting.
(add-hook
 'edebug-setup-hook
 (lambda ()
   ;; A lexer definition: name, doc string, then analyzer symbols.
   (def-edebug-spec define-lex
     (&define name stringp (&rest symbolp)))
   ;; Analyzer definitions: name, doc string, condition form, body.
   (def-edebug-spec define-lex-analyzer
     (&define name stringp form def-body))
   (def-edebug-spec define-lex-regex-analyzer
     (&define name stringp form def-body))
   (def-edebug-spec define-lex-simple-regex-analyzer
     (&define name stringp form symbolp [ &optional form ] def-body))
   (def-edebug-spec define-lex-block-analyzer
     (&define name stringp form (&rest form)))
   ;; Error-catching wrapper: tag symbol, then protected body.
   (def-edebug-spec semantic-lex-catch-errors
     (symbolp def-body))))
1792 \f
1793 ;;; Compatibility with Semantic 1.x lexical analysis
1794 ;;
1795 ;; NOTE: DELETE THIS SOMEDAY SOON
1796
;; Map each old `semantic-flex-*' name onto its `semantic-lex-*'
;; replacement, so existing callers keep working with a warning.
(semantic-alias-obsolete 'semantic-flex-start 'semantic-lex-token-start)
(semantic-alias-obsolete 'semantic-flex-end 'semantic-lex-token-end)
(semantic-alias-obsolete 'semantic-flex-text 'semantic-lex-token-text)
(semantic-alias-obsolete 'semantic-flex-make-keyword-table 'semantic-lex-make-keyword-table)
(semantic-alias-obsolete 'semantic-flex-keyword-p 'semantic-lex-keyword-p)
(semantic-alias-obsolete 'semantic-flex-keyword-put 'semantic-lex-keyword-put)
(semantic-alias-obsolete 'semantic-flex-keyword-get 'semantic-lex-keyword-get)
(semantic-alias-obsolete 'semantic-flex-map-keywords 'semantic-lex-map-keywords)
(semantic-alias-obsolete 'semantic-flex-keywords 'semantic-lex-keywords)
(semantic-alias-obsolete 'semantic-flex-buffer 'semantic-lex-buffer)
(semantic-alias-obsolete 'semantic-flex-list 'semantic-lex-list)
1808
1809 ;; This simple scanner uses the syntax table to generate a stream of
1810 ;; simple tokens of the form:
1811 ;;
1812 ;; (SYMBOL START . END)
1813 ;;
1814 ;; Where symbol is the type of thing it is. START and END mark that
1815 ;; objects boundary.
1816
(defvar semantic-flex-tokens semantic-lex-tokens
  "An alist of semantic token types.
See variable `semantic-lex-tokens'.")

(defvar semantic-flex-unterminated-syntax-end-function
  ;; Default: simply report the requested end of analysis.
  (lambda (syntax syntax-start flex-end) flex-end)
  "Function called when unterminated syntax is encountered.
This should be set to one function.  That function should take three
parameters.  The SYNTAX, or type of syntax which is unterminated.
SYNTAX-START where the broken syntax begins.
FLEX-END is where the lexical analysis was asked to end.
This function can be used for languages that can intelligently fix up
broken syntax, or to exit lexical analysis via `throw' or `signal'
when finding unterminated syntax.")
1831
;; Per-buffer alist of (REGEX . FUNCTION) lexer extensions.
(defvar semantic-flex-extensions nil
  "Buffer local extensions to the lexical analyzer.
This should contain an alist with a key of a regex and a data element of
a function.  The function should both move point, and return a lexical
token of the form:
  ( TYPE START .  END)
nil is also a valid return value.
TYPE can be any type of symbol, as long as it doesn't occur as a
nonterminal in the language definition.")
(make-variable-buffer-local 'semantic-flex-extensions)

;; Per-buffer list of (CHAR CLASS) syntax-table tweaks applied
;; only while `semantic-flex' runs.
(defvar semantic-flex-syntax-modifications nil
  "Changes to the syntax table for this buffer.
These changes are active only while the buffer is being flexed.
This is a list where each element has the form:
  (CHAR CLASS)
CHAR is the char passed to `modify-syntax-entry',
and CLASS is the string also passed to `modify-syntax-entry' to define
what syntax class CHAR has.")
(make-variable-buffer-local 'semantic-flex-syntax-modifications)
1852
(defvar semantic-ignore-comments t
  "Default comment handling.
t means to strip comments when flexing.  nil means to keep comments
as part of the token stream.")
(make-variable-buffer-local 'semantic-ignore-comments)

(defvar semantic-flex-enable-newlines nil
  "When flexing, report 'newlines as syntactic elements.
Useful for languages where the newline is a special case terminator.
Only set this on a per mode basis, not globally.")
(make-variable-buffer-local 'semantic-flex-enable-newlines)

(defvar semantic-flex-enable-whitespace nil
  "When flexing, report 'whitespace as syntactic elements.
Useful for languages where the syntax is whitespace dependent.
Only set this on a per mode basis, not globally.")
(make-variable-buffer-local 'semantic-flex-enable-whitespace)

(defvar semantic-flex-enable-bol nil
  "When flexing, report beginning of lines as syntactic elements.
Useful for languages like python which are indentation sensitive.
Only set this on a per mode basis, not globally.")
(make-variable-buffer-local 'semantic-flex-enable-bol)
1876
;; Obsolete alias of `semantic-lex-number-expression'.
(defvar semantic-number-expression semantic-lex-number-expression
  "See variable `semantic-lex-number-expression'.")
(make-variable-buffer-local 'semantic-number-expression)

(defvar semantic-flex-depth 0
  "Default flexing depth.
This specifies how many lists to create tokens in.")
(make-variable-buffer-local 'semantic-flex-depth)
1885
(defun semantic-flex (start end &optional depth length)
  "Using the syntax table, do something roughly equivalent to flex.
Semantically check between START and END.  Optional argument DEPTH
indicates at what level to scan over entire lists.
The return value is a token stream.  Each element is a list, such of
the form (symbol start-expression .  end-expression) where SYMBOL
denotes the token type.
See `semantic-flex-tokens' variable for details on token types.
END does not mark the end of the text scanned, only the end of the
beginning of text scanned.  Thus, if a string extends past END, the
end of the return token will be larger than END.  To truly restrict
scanning, use `narrow-to-region'.
The last argument, LENGTH specifies that `semantic-flex' should only
return LENGTH tokens."
  (message "`semantic-flex' is an obsolete function.  Use `define-lex' to create lexers.")
  (if (not semantic-flex-keywords-obarray)
      (setq semantic-flex-keywords-obarray [ nil ]))
  (let ((ts nil)                        ; token stream, built in reverse
	(pos (point))                   ; original point, restored at the end
	(ep nil)                        ; explicit end point for current token
	(curdepth 0)                    ; current list (paren) nesting depth
	;; Regexp matching a comment start; combines the syntax-table
	;; comment class with `comment-start-skip' when available.
	(cs (if comment-start-skip
		(concat "\\(\\s<\\|" comment-start-skip "\\)")
	      (concat "\\(\\s<\\)")))
	(newsyntax (copy-syntax-table (syntax-table)))
	(mods semantic-flex-syntax-modifications)
	;; Use the default depth if it is not specified.
	(depth (or depth semantic-flex-depth)))
    ;; Update the syntax table
    (while mods
      (modify-syntax-entry (car (car mods)) (car (cdr (car mods))) newsyntax)
      (setq mods (cdr mods)))
    (with-syntax-table newsyntax
      (goto-char start)
      ;; Main scan loop: each iteration classifies the text at point
      ;; into one token, then moves point past it.
      (while (and (< (point) end) (or (not length) (<= (length ts) length)))
	(cond
	 ;; catch beginning of lines when needed.
	 ;; Must be done before catching any other tokens!
	 ((and semantic-flex-enable-bol
	       (bolp)
	       ;; Just insert a (bol N . N) token in the token stream,
	       ;; without moving the point.  N is the point at the
	       ;; beginning of line.
	       (setq ts (cons (cons 'bol (cons (point) (point))) ts))
	       nil)) ;; CONTINUE
	 ;; special extensions, includes whitespace, nl, etc.
	 ((and semantic-flex-extensions
	       (let ((fe semantic-flex-extensions)
		     (r nil))
		 (while fe
		   (if (looking-at (car (car fe)))
		       ;; The extension function moves point; record
		       ;; where it stopped as the token end.
		       (setq ts (cons (funcall (cdr (car fe))) ts)
			     r t
			     fe nil
			     ep (point)))
		   (setq fe (cdr fe)))
		 ;; The extension may return nil; drop such tokens.
		 (if (and r (not (car ts))) (setq ts (cdr ts)))
		 r)))
	 ;; catch newlines when needed
	 ((looking-at "\\s-*\\(\n\\|\\s>\\)")
	  (if semantic-flex-enable-newlines
	      (setq ep (match-end 1)
		    ts (cons (cons 'newline
				   (cons (match-beginning 1) ep))
			     ts))))
	 ;; catch whitespace when needed
	 ((looking-at "\\s-+")
	  (if semantic-flex-enable-whitespace
	      ;; Language wants whitespaces, link them together.
	      (if (eq (car (car ts)) 'whitespace)
		  (setcdr (cdr (car ts)) (match-end 0))
		(setq ts (cons (cons 'whitespace
				     (cons (match-beginning 0)
					   (match-end 0)))
			       ts)))))
	 ;; numbers
	 ((and semantic-number-expression
	       (looking-at semantic-number-expression))
	  (setq ts (cons (cons 'number
			       (cons (match-beginning 0)
				     (match-end 0)))
			 ts)))
	 ;; symbols
	 ((looking-at "\\(\\sw\\|\\s_\\)+")
	  (setq ts (cons (cons
			  ;; Get info on if this is a keyword or not
			  (or (semantic-flex-keyword-p (match-string 0))
			      'symbol)
			  (cons (match-beginning 0) (match-end 0)))
			 ts)))
	 ;; Character quoting characters (ie, \n as newline)
	 ((looking-at "\\s\\+")
	  (setq ts (cons (cons 'charquote
			       (cons (match-beginning 0) (match-end 0)))
			 ts)))
	 ;; Open parens, or semantic-lists.
	 ((looking-at "\\s(")
	  (if (or (not depth) (< curdepth depth))
	      ;; Below DEPTH: descend into the list, reporting the
	      ;; delimiter itself.
	      (progn
		(setq curdepth (1+ curdepth))
		(setq ts (cons (cons 'open-paren
				     (cons (match-beginning 0) (match-end 0)))
			       ts)))
	    ;; At DEPTH: report the whole balanced list as one token.
	    (setq ts (cons
		      (cons 'semantic-list
			    (cons (match-beginning 0)
				  (save-excursion
				    (condition-case nil
					(forward-list 1)
				      ;; This case makes flex robust
				      ;; to broken lists.
				      (error
				       (goto-char
					(funcall
					 semantic-flex-unterminated-syntax-end-function
					 'semantic-list
					 start end))))
				    (setq ep (point)))))
		      ts))))
	 ;; Close parens
	 ((looking-at "\\s)")
	  (setq ts (cons (cons 'close-paren
			       (cons (match-beginning 0) (match-end 0)))
			 ts))
	  (setq curdepth (1- curdepth)))
	 ;; String initiators
	 ((looking-at "\\s\"")
	  ;; Zing to the end of this string.
	  (setq ts (cons (cons 'string
			       (cons (match-beginning 0)
				     (save-excursion
				       (condition-case nil
					   (forward-sexp 1)
					 ;; This case makes flex
					 ;; robust to broken strings.
					 (error
					  (goto-char
					   (funcall
					    semantic-flex-unterminated-syntax-end-function
					    'string
					    start end))))
				       (setq ep (point)))))
			 ts)))
	 ;; comments
	 ((looking-at cs)
	  (if (and semantic-ignore-comments
		   (not semantic-flex-enable-whitespace))
	      ;; If the language doesn't deal with comments nor
	      ;; whitespaces, ignore them here.
	      (let ((comment-start-point (point)))
		(forward-comment 1)
		(if (eq (point) comment-start-point)
		    ;; In this case our start-skip string failed
		    ;; to work properly.  Lets try and move over
		    ;; whatever white space we matched to begin
		    ;; with.
		    (skip-syntax-forward "-.'"
					 (save-excursion
					   (end-of-line)
					   (point)))
		  ;;(forward-comment 1)
		  ;; Generate newline token if enabled
		  (if (and semantic-flex-enable-newlines
			   (bolp))
		      (backward-char 1)))
		(if (eq (point) comment-start-point)
		    (error "Strange comment syntax prevents lexical analysis"))
		(setq ep (point)))
	    ;; Otherwise the comment is kept, either as a comment
	    ;; token or, when ignored, folded into whitespace.
	    (let ((tk (if semantic-ignore-comments 'whitespace 'comment)))
	      (save-excursion
		(forward-comment 1)
		;; Generate newline token if enabled
		(if (and semantic-flex-enable-newlines
			 (bolp))
		    (backward-char 1))
		(setq ep (point)))
	      ;; Language wants comments or want them as whitespaces,
	      ;; link them together.
	      (if (eq (car (car ts)) tk)
		  (setcdr (cdr (car ts)) ep)
		(setq ts (cons (cons tk (cons (match-beginning 0) ep))
			       ts))))))
	 ;; punctuation
	 ((looking-at "\\(\\s.\\|\\s$\\|\\s'\\)")
	  (setq ts (cons (cons 'punctuation
			       (cons (match-beginning 0) (match-end 0)))
			 ts)))
	 ;; unknown token
	 (t
	  (error "What is that?")))
	;; Advance: either to the explicit end point set above, or
	;; past the last regexp match.
	(goto-char (or ep (match-end 0)))
	(setq ep nil)))
    ;; maybe catch the last beginning of line when needed
    (and semantic-flex-enable-bol
	 (= (point) end)
	 (bolp)
	 (setq ts (cons (cons 'bol (cons (point) (point))) ts)))
    (goto-char pos)
    ;;(message "Flexing muscles...done")
    ;; Tokens were consed on in reverse; restore buffer order.
    (nreverse ts)))
2086
;; Make this library loadable via (require 'semantic/lex).
(provide 'semantic/lex)
2088
;;; semantic/lex.el ends here