* emacs-lisp/autoload.el (generated-autoload-load-name): New var.
[bpt/emacs.git] / lisp / cedet / semantic / lex.el
1 ;;; lex.el --- Lexical Analyzer builder
2
3 ;;; Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 ;;; 2007, 2008, 2009 Free Software Foundation, Inc.
5
6 ;; Author: Eric M. Ludlam <zappo@gnu.org>
7
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22
23 ;;; Commentary:
24 ;;
25 ;; This file handles the creation of lexical analyzers for different
26 ;; languages in Emacs Lisp. The purpose of a lexical analyzer is to
27 ;; convert a buffer into a list of lexical tokens. Each token
28 ;; contains the token class (such as 'number, 'symbol, 'IF, etc) and
29 ;; the location in the buffer it was found. Optionally, a token also
30 ;; contains a string representing what is at the designated buffer
31 ;; location.
32 ;;
33 ;; Tokens are pushed onto a token stream, which is basically a list of
34 ;; all the lexical tokens from the analyzed region. The token stream
35 ;; is then handed to the grammar which parsers the file.
36 ;;
37 ;;; How it works
38 ;;
39 ;; Each analyzer specifies a condition and forms. These conditions
40 ;; and forms are assembled into a function by `define-lex' that does
41 ;; the lexical analysis.
42 ;;
43 ;; In the lexical analyzer created with `define-lex', each condition
44 ;; is tested for a given point. When the conditin is true, the forms
45 ;; run.
46 ;;
47 ;; The forms can push a lexical token onto the token stream. The
48 ;; analyzer forms also must move the current analyzer point. If the
49 ;; analyzer point is moved without pushing a token, then tne matched
50 ;; syntax is effectively ignored, or skipped.
51 ;;
52 ;; Thus, starting at the beginning of a region to be analyzed, each
53 ;; condition is tested. One will match, and a lexical token might be
54 ;; pushed, and the point is moved to the end of the lexical token
55 ;; identified. At the new position, the process occurs again until
56 ;; the end of the specified region is reached.
57 ;;
58 ;;; How to use semantic-lex
59 ;;
60 ;; To create a lexer for a language, use the `define-lex' macro.
61 ;;
62 ;; The `define-lex' macro accepts a list of lexical analyzers. Each
63 ;; analyzer is created with `define-lex-analyzer', or one of the
64 ;; derivitive macros. A single analyzer defines a regular expression
65 ;; to match text in a buffer, and a short segment of code to create
66 ;; one lexical token.
67 ;;
68 ;; Each analyzer has a NAME, DOC, a CONDITION, and possibly some
69 ;; FORMS. The NAME is the name used in `define-lex'. The DOC
70 ;; describes what the analyzer should do.
71 ;;
72 ;; The CONDITION evaluates the text at the current point in the
73 ;; current buffer. If CONDITION is true, then the FORMS will be
74 ;; executed.
75 ;;
76 ;; The purpose of the FORMS is to push new lexical tokens onto the
77 ;; list of tokens for the current buffer, and to move point after the
78 ;; matched text.
79 ;;
80 ;; Some macros for creating one analyzer are:
81 ;;
82 ;; define-lex-analyzer - A generic analyzer associating any style of
83 ;; condition to forms.
84 ;; define-lex-regex-analyzer - Matches a regular expression.
85 ;; define-lex-simple-regex-analyzer - Matches a regular expressions,
86 ;; and pushes the match.
87 ;; define-lex-block-analyzer - Matches list syntax, and defines
88 ;; handles open/close delimiters.
89 ;;
90 ;; These macros are used by the grammar compiler when lexical
91 ;; information is specified in a grammar:
92 ;; define-lex- * -type-analyzer - Matches syntax specified in
93 ;; a grammar, and pushes one token for it. The * would
94 ;; be `sexp' for things like lists or strings, and
95 ;; `string' for things that need to match some special
96 ;; string, such as "\\." where a literal match is needed.
97 ;;
98 ;;; Lexical Tables
99 ;;
100 ;; There are tables of different symbols managed in semantic-lex.el.
101 ;; They are:
102 ;;
103 ;; Lexical keyword table - A Table of symbols declared in a grammar
104 ;; file with the %keyword declaration.
105 ;; Keywords are used by `semantic-lex-symbol-or-keyword'
106 ;; to create lexical tokens based on the keyword.
107 ;;
108 ;; Lexical type table - A table of symbols declared in a grammer
109 ;; file with the %type declaration.
110 ;; The grammar compiler uses the type table to create new
111 ;; lexical analyzers. These analyzers are then used to when
112 ;; a new lexical analyzer is made for a language.
113 ;;
114 ;;; Lexical Types
115 ;;
116 ;; A lexical type defines a kind of lexical analyzer that will be
117 ;; automatically generated from a grammar file based on some
118 ;; predetermined attributes. For now these two attributes are
119 ;; recognized :
120 ;;
121 ;; * matchdatatype : define the kind of lexical analyzer. That is :
122 ;;
123 ;; - regexp : define a regexp analyzer (see
124 ;; `define-lex-regex-type-analyzer')
125 ;;
126 ;; - string : define a string analyzer (see
127 ;; `define-lex-string-type-analyzer')
128 ;;
129 ;; - block : define a block type analyzer (see
130 ;; `define-lex-block-type-analyzer')
131 ;;
132 ;; - sexp : define a sexp analyzer (see
133 ;; `define-lex-sexp-type-analyzer')
134 ;;
135 ;; - keyword : define a keyword analyzer (see
136 ;; `define-lex-keyword-type-analyzer')
137 ;;
138 ;; * syntax : define the syntax that matches a syntactic
139 ;; expression. When syntax is matched the corresponding type
140 ;; analyzer is entered and the resulting match data will be
141 ;; interpreted based on the kind of analyzer (see matchdatatype
142 ;; above).
143 ;;
144 ;; The following lexical types are predefined :
145 ;;
146 ;; +-------------+---------------+--------------------------------+
147 ;; | type | matchdatatype | syntax |
148 ;; +-------------+---------------+--------------------------------+
149 ;; | punctuation | string | "\\(\\s.\\|\\s$\\|\\s'\\)+" |
150 ;; | keyword | keyword | "\\(\\sw\\|\\s_\\)+" |
151 ;; | symbol | regexp | "\\(\\sw\\|\\s_\\)+" |
152 ;; | string | sexp | "\\s\"" |
153 ;; | number | regexp | semantic-lex-number-expression |
154 ;; | block | block | "\\s(\\|\\s)" |
155 ;; +-------------+---------------+--------------------------------+
156 ;;
157 ;; In a grammar you must use a %type expression to automatically generate
158 ;; the corresponding analyzers of that type.
159 ;;
160 ;; Here is an example to auto-generate punctuation analyzers
161 ;; with 'matchdatatype and 'syntax predefined (see table above)
162 ;;
163 ;; %type <punctuation> ;; will auto-generate this kind of analyzers
164 ;;
165 ;; It is equivalent to write :
166 ;;
167 ;; %type <punctuation> syntax "\\(\\s.\\|\\s$\\|\\s'\\)+" matchdatatype string
168 ;;
169 ;; ;; Some punctuations based on the type defines above
170 ;;
171 ;; %token <punctuation> NOT "!"
172 ;; %token <punctuation> NOTEQ "!="
173 ;; %token <punctuation> MOD "%"
174 ;; %token <punctuation> MODEQ "%="
175 ;;
176
177 ;;; On the Semantic 1.x lexer
178 ;;
179 ;; In semantic 1.x, the lexical analyzer was an all purpose routine.
180 ;; To boost efficiency, the analyzer is now a series of routines that
181 ;; are constructed at build time into a single routine. This will
182 ;; eliminate unneeded if statements to speed the lexer.
183
184 (require 'semantic/fw)
185 ;;; Code:
186
187 ;;; Compatibility
188 ;;
189 (eval-and-compile
190 (if (not (fboundp 'with-syntax-table))
191
192 ;; Copied from Emacs 21 for compatibility with released Emacses.
193 (defmacro with-syntax-table (table &rest body)
194 "With syntax table of current buffer set to a copy of TABLE, evaluate BODY.
195 The syntax table of the current buffer is saved, BODY is evaluated, and the
196 saved table is restored, even in case of an abnormal exit.
197 Value is what BODY returns."
198 (let ((old-table (make-symbol "table"))
199 (old-buffer (make-symbol "buffer")))
200 `(let ((,old-table (syntax-table))
201 (,old-buffer (current-buffer)))
202 (unwind-protect
203 (progn
204 (set-syntax-table (copy-syntax-table ,table))
205 ,@body)
206 (save-current-buffer
207 (set-buffer ,old-buffer)
208 (set-syntax-table ,old-table))))))
209
210 ))
211 \f
212 ;;; Semantic 2.x lexical analysis
213 ;;
214 (defun semantic-lex-map-symbols (fun table &optional property)
215 "Call function FUN on every symbol in TABLE.
216 If optional PROPERTY is non-nil, call FUN only on every symbol which
217 as a PROPERTY value. FUN receives a symbol as argument."
218 (if (arrayp table)
219 (mapatoms
220 #'(lambda (symbol)
221 (if (or (null property) (get symbol property))
222 (funcall fun symbol)))
223 table)))
224
225 ;;; Lexical keyword table handling.
226 ;;
227 ;; These keywords are keywords defined for using in a grammar with the
228 ;; %keyword declaration, and are not keywords used in Emacs Lisp.
229
230 (defvar semantic-flex-keywords-obarray nil
231 "Buffer local keyword obarray for the lexical analyzer.
232 These keywords are matched explicitly, and converted into special symbols.")
233 (make-variable-buffer-local 'semantic-flex-keywords-obarray)
234
235 (defmacro semantic-lex-keyword-invalid (name)
236 "Signal that NAME is an invalid keyword name."
237 `(signal 'wrong-type-argument '(semantic-lex-keyword-p ,name)))
238
239 (defsubst semantic-lex-keyword-symbol (name)
240 "Return keyword symbol with NAME or nil if not found."
241 (and (arrayp semantic-flex-keywords-obarray)
242 (stringp name)
243 (intern-soft name semantic-flex-keywords-obarray)))
244
245 (defsubst semantic-lex-keyword-p (name)
246 "Return non-nil if a keyword with NAME exists in the keyword table.
247 Return nil otherwise."
248 (and (setq name (semantic-lex-keyword-symbol name))
249 (symbol-value name)))
250
251 (defsubst semantic-lex-keyword-set (name value)
252 "Set value of keyword with NAME to VALUE and return VALUE."
253 (set (intern name semantic-flex-keywords-obarray) value))
254
255 (defsubst semantic-lex-keyword-value (name)
256 "Return value of keyword with NAME.
257 Signal an error if a keyword with NAME does not exist."
258 (let ((keyword (semantic-lex-keyword-symbol name)))
259 (if keyword
260 (symbol-value keyword)
261 (semantic-lex-keyword-invalid name))))
262
263 (defsubst semantic-lex-keyword-put (name property value)
264 "For keyword with NAME, set its PROPERTY to VALUE."
265 (let ((keyword (semantic-lex-keyword-symbol name)))
266 (if keyword
267 (put keyword property value)
268 (semantic-lex-keyword-invalid name))))
269
270 (defsubst semantic-lex-keyword-get (name property)
271 "For keyword with NAME, return its PROPERTY value."
272 (let ((keyword (semantic-lex-keyword-symbol name)))
273 (if keyword
274 (get keyword property)
275 (semantic-lex-keyword-invalid name))))
276
277 (defun semantic-lex-make-keyword-table (specs &optional propspecs)
278 "Convert keyword SPECS into an obarray and return it.
279 SPECS must be a list of (NAME . TOKSYM) elements, where:
280
281 NAME is the name of the keyword symbol to define.
282 TOKSYM is the lexical token symbol of that keyword.
283
284 If optional argument PROPSPECS is non nil, then interpret it, and
285 apply those properties.
286 PROPSPECS must be a list of (NAME PROPERTY VALUE) elements."
287 ;; Create the symbol hash table
288 (let ((semantic-flex-keywords-obarray (make-vector 13 0))
289 spec)
290 ;; fill it with stuff
291 (while specs
292 (setq spec (car specs)
293 specs (cdr specs))
294 (semantic-lex-keyword-set (car spec) (cdr spec)))
295 ;; Apply all properties
296 (while propspecs
297 (setq spec (car propspecs)
298 propspecs (cdr propspecs))
299 (semantic-lex-keyword-put (car spec) (nth 1 spec) (nth 2 spec)))
300 semantic-flex-keywords-obarray))
301
302 (defsubst semantic-lex-map-keywords (fun &optional property)
303 "Call function FUN on every lexical keyword.
304 If optional PROPERTY is non-nil, call FUN only on every keyword which
305 as a PROPERTY value. FUN receives a lexical keyword as argument."
306 (semantic-lex-map-symbols
307 fun semantic-flex-keywords-obarray property))
308
309 (defun semantic-lex-keywords (&optional property)
310 "Return a list of lexical keywords.
311 If optional PROPERTY is non-nil, return only keywords which have a
312 PROPERTY set."
313 (let (keywords)
314 (semantic-lex-map-keywords
315 #'(lambda (symbol) (setq keywords (cons symbol keywords)))
316 property)
317 keywords))
318
319 ;;; Inline functions:
320
321 (defvar semantic-lex-unterminated-syntax-end-function)
322 (defvar semantic-lex-analysis-bounds)
323 (defvar semantic-lex-end-point)
324
325 (defsubst semantic-lex-token-bounds (token)
326 "Fetch the start and end locations of the lexical token TOKEN.
327 Return a pair (START . END)."
328 (if (not (numberp (car (cdr token))))
329 (cdr (cdr token))
330 (cdr token)))
331
332 (defsubst semantic-lex-token-start (token)
333 "Fetch the start position of the lexical token TOKEN.
334 See also the function `semantic-lex-token'."
335 (car (semantic-lex-token-bounds token)))
336
337 (defsubst semantic-lex-token-end (token)
338 "Fetch the end position of the lexical token TOKEN.
339 See also the function `semantic-lex-token'."
340 (cdr (semantic-lex-token-bounds token)))
341
342 (defsubst semantic-lex-unterminated-syntax-detected (syntax)
343 "Inside a lexical analyzer, use this when unterminated syntax was found.
344 Argument SYNTAX indicates the type of syntax that is unterminated.
345 The job of this function is to move (point) to a new logical location
346 so that analysis can continue, if possible."
347 (goto-char
348 (funcall semantic-lex-unterminated-syntax-end-function
349 syntax
350 (car semantic-lex-analysis-bounds)
351 (cdr semantic-lex-analysis-bounds)
352 ))
353 (setq semantic-lex-end-point (point)))
354 \f
355 ;;; Type table handling.
356 ;;
357 ;; The lexical type table manages types that occur in a grammar file
358 ;; with the %type declaration. Types represent different syntaxes.
359 ;; See code for `semantic-lex-preset-default-types' for the classic
360 ;; types of syntax.
361 (defvar semantic-lex-types-obarray nil
362 "Buffer local types obarray for the lexical analyzer.")
363 (make-variable-buffer-local 'semantic-lex-types-obarray)
364
365 (defmacro semantic-lex-type-invalid (type)
366 "Signal that TYPE is an invalid lexical type name."
367 `(signal 'wrong-type-argument '(semantic-lex-type-p ,type)))
368
369 (defsubst semantic-lex-type-symbol (type)
370 "Return symbol with TYPE or nil if not found."
371 (and (arrayp semantic-lex-types-obarray)
372 (stringp type)
373 (intern-soft type semantic-lex-types-obarray)))
374
375 (defsubst semantic-lex-type-p (type)
376 "Return non-nil if a symbol with TYPE name exists."
377 (and (setq type (semantic-lex-type-symbol type))
378 (symbol-value type)))
379
380 (defsubst semantic-lex-type-set (type value)
381 "Set value of symbol with TYPE name to VALUE and return VALUE."
382 (set (intern type semantic-lex-types-obarray) value))
383
384 (defsubst semantic-lex-type-value (type &optional noerror)
385 "Return value of symbol with TYPE name.
386 If optional argument NOERROR is non-nil return nil if a symbol with
387 TYPE name does not exist. Otherwise signal an error."
388 (let ((sym (semantic-lex-type-symbol type)))
389 (if sym
390 (symbol-value sym)
391 (unless noerror
392 (semantic-lex-type-invalid type)))))
393
394 (defsubst semantic-lex-type-put (type property value &optional add)
395 "For symbol with TYPE name, set its PROPERTY to VALUE.
396 If optional argument ADD is non-nil, create a new symbol with TYPE
397 name if it does not already exist. Otherwise signal an error."
398 (let ((sym (semantic-lex-type-symbol type)))
399 (unless sym
400 (or add (semantic-lex-type-invalid type))
401 (semantic-lex-type-set type nil)
402 (setq sym (semantic-lex-type-symbol type)))
403 (put sym property value)))
404
405 (defsubst semantic-lex-type-get (type property &optional noerror)
406 "For symbol with TYPE name, return its PROPERTY value.
407 If optional argument NOERROR is non-nil return nil if a symbol with
408 TYPE name does not exist. Otherwise signal an error."
409 (let ((sym (semantic-lex-type-symbol type)))
410 (if sym
411 (get sym property)
412 (unless noerror
413 (semantic-lex-type-invalid type)))))
414
415 (defun semantic-lex-preset-default-types ()
416 "Install useful default properties for well known types."
417 (semantic-lex-type-put "punctuation" 'matchdatatype 'string t)
418 (semantic-lex-type-put "punctuation" 'syntax "\\(\\s.\\|\\s$\\|\\s'\\)+")
419 (semantic-lex-type-put "keyword" 'matchdatatype 'keyword t)
420 (semantic-lex-type-put "keyword" 'syntax "\\(\\sw\\|\\s_\\)+")
421 (semantic-lex-type-put "symbol" 'matchdatatype 'regexp t)
422 (semantic-lex-type-put "symbol" 'syntax "\\(\\sw\\|\\s_\\)+")
423 (semantic-lex-type-put "string" 'matchdatatype 'sexp t)
424 (semantic-lex-type-put "string" 'syntax "\\s\"")
425 (semantic-lex-type-put "number" 'matchdatatype 'regexp t)
426 (semantic-lex-type-put "number" 'syntax 'semantic-lex-number-expression)
427 (semantic-lex-type-put "block" 'matchdatatype 'block t)
428 (semantic-lex-type-put "block" 'syntax "\\s(\\|\\s)")
429 )
430
431 (defun semantic-lex-make-type-table (specs &optional propspecs)
432 "Convert type SPECS into an obarray and return it.
433 SPECS must be a list of (TYPE . TOKENS) elements, where:
434
435 TYPE is the name of the type symbol to define.
436 TOKENS is an list of (TOKSYM . MATCHER) elements, where:
437
438 TOKSYM is any lexical token symbol.
439 MATCHER is a string or regexp a text must match to be a such
440 lexical token.
441
442 If optional argument PROPSPECS is non nil, then interpret it, and
443 apply those properties.
444 PROPSPECS must be a list of (TYPE PROPERTY VALUE)."
445 ;; Create the symbol hash table
446 (let* ((semantic-lex-types-obarray (make-vector 13 0))
447 spec type tokens token alist default)
448 ;; fill it with stuff
449 (while specs
450 (setq spec (car specs)
451 specs (cdr specs)
452 type (car spec)
453 tokens (cdr spec)
454 default nil
455 alist nil)
456 (while tokens
457 (setq token (car tokens)
458 tokens (cdr tokens))
459 (if (cdr token)
460 (setq alist (cons token alist))
461 (setq token (car token))
462 (if default
463 (message
464 "*Warning* default value of <%s> tokens changed to %S, was %S"
465 type default token))
466 (setq default token)))
467 ;; Ensure the default matching spec is the first one.
468 (semantic-lex-type-set type (cons default (nreverse alist))))
469 ;; Install useful default types & properties
470 (semantic-lex-preset-default-types)
471 ;; Apply all properties
472 (while propspecs
473 (setq spec (car propspecs)
474 propspecs (cdr propspecs))
475 ;; Create the type if necessary.
476 (semantic-lex-type-put (car spec) (nth 1 spec) (nth 2 spec) t))
477 semantic-lex-types-obarray))
478
479 (defsubst semantic-lex-map-types (fun &optional property)
480 "Call function FUN on every lexical type.
481 If optional PROPERTY is non-nil, call FUN only on every type symbol
482 which as a PROPERTY value. FUN receives a type symbol as argument."
483 (semantic-lex-map-symbols
484 fun semantic-lex-types-obarray property))
485
486 (defun semantic-lex-types (&optional property)
487 "Return a list of lexical type symbols.
488 If optional PROPERTY is non-nil, return only type symbols which have
489 PROPERTY set."
490 (let (types)
491 (semantic-lex-map-types
492 #'(lambda (symbol) (setq types (cons symbol types)))
493 property)
494 types))
495 \f
496 ;;; Lexical Analyzer framework settings
497 ;;
498
499 (defvar semantic-lex-analyzer 'semantic-flex
500 "The lexical analyzer used for a given buffer.
501 See `semantic-lex' for documentation.
502 For compatibility with Semantic 1.x it defaults to `semantic-flex'.")
503 (make-variable-buffer-local 'semantic-lex-analyzer)
504
505 (defvar semantic-lex-tokens
506 '(
507 (bol)
508 (charquote)
509 (close-paren)
510 (comment)
511 (newline)
512 (open-paren)
513 (punctuation)
514 (semantic-list)
515 (string)
516 (symbol)
517 (whitespace)
518 )
519 "An alist of of semantic token types.
520 As of December 2001 (semantic 1.4beta13), this variable is not used in
521 any code. The only use is to refer to the doc-string from elsewhere.
522
523 The key to this alist is the symbol representing token type that
524 \\[semantic-flex] returns. These are
525
526 - bol: Empty string matching a beginning of line.
527 This token is produced with
528 `semantic-lex-beginning-of-line'.
529
530 - charquote: String sequences that match `\\s\\+' regexp.
531 This token is produced with `semantic-lex-charquote'.
532
533 - close-paren: Characters that match `\\s)' regexp.
534 These are typically `)', `}', `]', etc.
535 This token is produced with
536 `semantic-lex-close-paren'.
537
538 - comment: A comment chunk. These token types are not
539 produced by default.
540 This token is produced with `semantic-lex-comments'.
541 Comments are ignored with `semantic-lex-ignore-comments'.
542 Comments are treated as whitespace with
543 `semantic-lex-comments-as-whitespace'.
544
545 - newline Characters matching `\\s-*\\(\n\\|\\s>\\)' regexp.
546 This token is produced with `semantic-lex-newline'.
547
548 - open-paren: Characters that match `\\s(' regexp.
549 These are typically `(', `{', `[', etc.
550 If `semantic-lex-paren-or-list' is used,
551 then `open-paren' is not usually generated unless
552 the `depth' argument to \\[semantic-lex] is
553 greater than 0.
554 This token is always produced if the analyzer
555 `semantic-lex-open-paren' is used.
556
557 - punctuation: Characters matching `{\\(\\s.\\|\\s$\\|\\s'\\)'
558 regexp.
559 This token is produced with `semantic-lex-punctuation'.
560 Always specify this analyzer after the comment
561 analyzer.
562
563 - semantic-list: String delimited by matching parenthesis, braces,
564 etc. that the lexer skipped over, because the
565 `depth' parameter to \\[semantic-flex] was not high
566 enough.
567 This token is produced with `semantic-lex-paren-or-list'.
568
569 - string: Quoted strings, i.e., string sequences that start
570 and end with characters matching `\\s\"'
571 regexp. The lexer relies on @code{forward-sexp} to
572 find the matching end.
573 This token is produced with `semantic-lex-string'.
574
575 - symbol: String sequences that match `\\(\\sw\\|\\s_\\)+'
576 regexp.
577 This token is produced with
578 `semantic-lex-symbol-or-keyword'. Always add this analyzer
579 after `semantic-lex-number', or other analyzers that
580 match its regular expression.
581
582 - whitespace: Characters that match `\\s-+' regexp.
583 This token is produced with `semantic-lex-whitespace'.")
584
585 (defvar semantic-lex-syntax-modifications nil
586 "Changes to the syntax table for this buffer.
587 These changes are active only while the buffer is being flexed.
588 This is a list where each element has the form:
589 (CHAR CLASS)
590 CHAR is the char passed to `modify-syntax-entry',
591 and CLASS is the string also passed to `modify-syntax-entry' to define
592 what syntax class CHAR has.")
593 (make-variable-buffer-local 'semantic-lex-syntax-modifications)
594
595 (defvar semantic-lex-syntax-table nil
596 "Syntax table used by lexical analysis.
597 See also `semantic-lex-syntax-modifications'.")
598 (make-variable-buffer-local 'semantic-lex-syntax-table)
599
600 (defvar semantic-lex-comment-regex nil
601 "Regular expression for identifying comment start during lexical analysis.
602 This may be automatically set when semantic initializes in a mode, but
603 may need to be overriden for some special languages.")
604 (make-variable-buffer-local 'semantic-lex-comment-regex)
605
606 (defvar semantic-lex-number-expression
607 ;; This expression was written by David Ponce for Java, and copied
608 ;; here for C and any other similar language.
609 (eval-when-compile
610 (concat "\\("
611 "\\<[0-9]+[.][0-9]+\\([eE][-+]?[0-9]+\\)?[fFdD]?\\>"
612 "\\|"
613 "\\<[0-9]+[.][eE][-+]?[0-9]+[fFdD]?\\>"
614 "\\|"
615 "\\<[0-9]+[.][fFdD]\\>"
616 "\\|"
617 "\\<[0-9]+[.]"
618 "\\|"
619 "[.][0-9]+\\([eE][-+]?[0-9]+\\)?[fFdD]?\\>"
620 "\\|"
621 "\\<[0-9]+[eE][-+]?[0-9]+[fFdD]?\\>"
622 "\\|"
623 "\\<0[xX][0-9a-fA-F]+[lL]?\\>"
624 "\\|"
625 "\\<[0-9]+[lLfFdD]?\\>"
626 "\\)"
627 ))
628 "Regular expression for matching a number.
629 If this value is nil, no number extraction is done during lex.
630 This expression tries to match C and Java like numbers.
631
632 DECIMAL_LITERAL:
633 [1-9][0-9]*
634 ;
635 HEX_LITERAL:
636 0[xX][0-9a-fA-F]+
637 ;
638 OCTAL_LITERAL:
639 0[0-7]*
640 ;
641 INTEGER_LITERAL:
642 <DECIMAL_LITERAL>[lL]?
643 | <HEX_LITERAL>[lL]?
644 | <OCTAL_LITERAL>[lL]?
645 ;
646 EXPONENT:
647 [eE][+-]?[09]+
648 ;
649 FLOATING_POINT_LITERAL:
650 [0-9]+[.][0-9]*<EXPONENT>?[fFdD]?
651 | [.][0-9]+<EXPONENT>?[fFdD]?
652 | [0-9]+<EXPONENT>[fFdD]?
653 | [0-9]+<EXPONENT>?[fFdD]
654 ;")
655 (make-variable-buffer-local 'semantic-lex-number-expression)
656
657 (defvar semantic-lex-depth 0
658 "Default lexing depth.
659 This specifies how many lists to create tokens in.")
660 (make-variable-buffer-local 'semantic-lex-depth)
661
662 (defvar semantic-lex-unterminated-syntax-end-function
663 (lambda (syntax syntax-start lex-end) lex-end)
664 "Function called when unterminated syntax is encountered.
665 This should be set to one function. That function should take three
666 parameters. The SYNTAX, or type of syntax which is unterminated.
667 SYNTAX-START where the broken syntax begins.
668 LEX-END is where the lexical analysis was asked to end.
669 This function can be used for languages that can intelligently fix up
670 broken syntax, or the exit lexical analysis via `throw' or `signal'
671 when finding unterminated syntax.")
672
673 ;;; Interactive testing commands
674
675 (declare-function semantic-elapsed-time "semantic")
676
677 (defun semantic-lex-test (arg)
678 "Test the semantic lexer in the current buffer.
679 If universal argument ARG, then try the whole buffer."
680 (interactive "P")
681 (require 'semantic)
682 (let* ((start (current-time))
683 (result (semantic-lex
684 (if arg (point-min) (point))
685 (point-max)))
686 (end (current-time)))
687 (message "Elapsed Time: %.2f seconds."
688 (semantic-elapsed-time start end))
689 (pop-to-buffer "*Lexer Output*")
690 (require 'pp)
691 (erase-buffer)
692 (insert (pp-to-string result))
693 (goto-char (point-min))
694 ))
695
696 (defun semantic-lex-test-full-depth (arg)
697 "Test the semantic lexer in the current buffer parsing through lists.
698 Usually the lexer parses
699 If universal argument ARG, then try the whole buffer."
700 (interactive "P")
701 (let* ((start (current-time))
702 (result (semantic-lex
703 (if arg (point-min) (point))
704 (point-max)
705 100))
706 (end (current-time)))
707 (message "Elapsed Time: %.2f seconds."
708 (semantic-elapsed-time start end))
709 (pop-to-buffer "*Lexer Output*")
710 (require 'pp)
711 (erase-buffer)
712 (insert (pp-to-string result))
713 (goto-char (point-min))
714 ))
715
716 (defun semantic-lex-test-region (beg end)
717 "Test the semantic lexer in the current buffer.
718 Analyze the area between BEG and END."
719 (interactive "r")
720 (let ((result (semantic-lex beg end)))
721 (pop-to-buffer "*Lexer Output*")
722 (require 'pp)
723 (erase-buffer)
724 (insert (pp-to-string result))
725 (goto-char (point-min))
726 ))
727
728 (defvar semantic-lex-debug nil
729 "When non-nil, debug the local lexical analyzer.")
730
731 (defun semantic-lex-debug (arg)
732 "Debug the semantic lexer in the current buffer.
733 Argument ARG specifies of the analyze the whole buffer, or start at point.
734 While engaged, each token identified by the lexer will be highlighted
735 in the target buffer A description of the current token will be
736 displayed in the minibuffer. Press SPC to move to the next lexical token."
737 (interactive "P")
738 (require 'semantic/debug)
739 (let ((semantic-lex-debug t))
740 (semantic-lex-test arg)))
741
742 (defun semantic-lex-highlight-token (token)
743 "Highlight the lexical TOKEN.
744 TOKEN is a lexical token with a START And END position.
745 Return the overlay."
746 (let ((o (semantic-make-overlay (semantic-lex-token-start token)
747 (semantic-lex-token-end token))))
748 (semantic-overlay-put o 'face 'highlight)
749 o))
750
751 (defsubst semantic-lex-debug-break (token)
752 "Break during lexical analysis at TOKEN."
753 (when semantic-lex-debug
754 (let ((o nil))
755 (unwind-protect
756 (progn
757 (when token
758 (setq o (semantic-lex-highlight-token token)))
759 (semantic-read-event
760 (format "%S :: SPC - continue" token))
761 )
762 (when o
763 (semantic-overlay-delete o))))))
764
765 ;;; Lexical analyzer creation
766 ;;
767 ;; Code for creating a lex function from lists of analyzers.
768 ;;
769 ;; A lexical analyzer is created from a list of individual analyzers.
770 ;; Each individual analyzer specifies a single match, and code that
771 ;; goes with it.
772 ;;
773 ;; Creation of an analyzer assembles these analyzers into a new function
774 ;; with the behaviors of all the individual analyzers.
775 ;;
776 (defmacro semantic-lex-one-token (analyzers)
777 "Calculate one token from the current buffer at point.
778 Uses locally bound variables from `define-lex'.
779 Argument ANALYZERS is the list of analyzers being used."
780 (cons 'cond (mapcar #'symbol-value analyzers)))
781
782 (defvar semantic-lex-end-point nil
783 "The end point as tracked through lexical functions.")
784
785 (defvar semantic-lex-current-depth nil
786 "The current depth as tracked through lexical functions.")
787
788 (defvar semantic-lex-maximum-depth nil
789 "The maximum depth of parenthisis as tracked through lexical functions.")
790
791 (defvar semantic-lex-token-stream nil
792 "The current token stream we are collecting.")
793
794 (defvar semantic-lex-analysis-bounds nil
795 "The bounds of the current analysis.")
796
797 (defvar semantic-lex-block-streams nil
798 "Streams of tokens inside collapsed blocks.
799 This is an alist of (ANCHOR . STREAM) elements where ANCHOR is the
800 start position of the block, and STREAM is the list of tokens in that
801 block.")
802
803 (defvar semantic-lex-reset-hooks nil
804 "List of hooks major-modes use to reset lexical analyzers.
805 Hooks are called with START and END values for the current lexical pass.
806 Should be set with `add-hook'specifying a LOCAL option.")
807
808 ;; Stack of nested blocks.
809 (defvar semantic-lex-block-stack nil)
810 ;;(defvar semantic-lex-timeout 5
811 ;; "*Number of sections of lexing before giving up.")
812
813 (defmacro define-lex (name doc &rest analyzers)
814 "Create a new lexical analyzer with NAME.
815 DOC is a documentation string describing this analyzer.
816 ANALYZERS are small code snippets of analyzers to use when
817 building the new NAMED analyzer. Only use analyzers which
818 are written to be used in `define-lex'.
819 Each analyzer should be an analyzer created with `define-lex-analyzer'.
820 Note: The order in which analyzers are listed is important.
821 If two analyzers can match the same text, it is important to order the
822 analyzers so that the one you want to match first occurs first. For
823 example, it is good to put a numbe analyzer in front of a symbol
824 analyzer which might mistake a number for as a symbol."
825 `(defun ,name (start end &optional depth length)
826 ,(concat doc "\nSee `semantic-lex' for more information.")
827 ;; Make sure the state of block parsing starts over.
828 (setq semantic-lex-block-streams nil)
829 ;; Allow specialty reset items.
830 (run-hook-with-args 'semantic-lex-reset-hooks start end)
831 ;; Lexing state.
832 (let* (;(starttime (current-time))
833 (starting-position (point))
834 (semantic-lex-token-stream nil)
835 (semantic-lex-block-stack nil)
836 (tmp-start start)
837 (semantic-lex-end-point start)
838 (semantic-lex-current-depth 0)
839 ;; Use the default depth when not specified.
840 (semantic-lex-maximum-depth
841 (or depth semantic-lex-depth))
842 ;; Bounds needed for unterminated syntax
843 (semantic-lex-analysis-bounds (cons start end))
844 ;; This entry prevents text properties from
845 ;; confusing our lexical analysis. See Emacs 22 (CVS)
846 ;; version of C++ mode with template hack text properties.
847 (parse-sexp-lookup-properties nil)
848 )
849 ;; Maybe REMOVE THIS LATER.
850 ;; Trying to find incremental parser bug.
851 (when (> end (point-max))
852 (error ,(format "%s: end (%%d) > point-max (%%d)" name)
853 end (point-max)))
854 (with-syntax-table semantic-lex-syntax-table
855 (goto-char start)
856 (while (and (< (point) end)
857 (or (not length)
858 (<= (length semantic-lex-token-stream) length)))
859 (semantic-lex-one-token ,analyzers)
860 (when (eq semantic-lex-end-point tmp-start)
861 (error ,(format "%s: endless loop at %%d, after %%S" name)
862 tmp-start (car semantic-lex-token-stream)))
863 (setq tmp-start semantic-lex-end-point)
864 (goto-char semantic-lex-end-point)
865 ;;(when (> (semantic-elapsed-time starttime (current-time))
866 ;; semantic-lex-timeout)
867 ;; (error "Timeout during lex at char %d" (point)))
868 (semantic-throw-on-input 'lex)
869 (semantic-lex-debug-break (car semantic-lex-token-stream))
870 ))
871 ;; Check that there is no unterminated block.
872 (when semantic-lex-block-stack
873 (let* ((last (pop semantic-lex-block-stack))
874 (blk last))
875 (while blk
876 (message
877 ,(format "%s: `%%s' block from %%S is unterminated" name)
878 (car blk) (cadr blk))
879 (setq blk (pop semantic-lex-block-stack)))
880 (semantic-lex-unterminated-syntax-detected (car last))))
881 ;; Return to where we started.
882 ;; Do not wrap in protective stuff so that if there is an error
883 ;; thrown, the user knows where.
884 (goto-char starting-position)
885 ;; Return the token stream
886 (nreverse semantic-lex-token-stream))))
887 \f
888 ;;; Collapsed block tokens delimited by any tokens.
889 ;;
890 (defun semantic-lex-start-block (syntax)
891 "Mark the last read token as the beginning of a SYNTAX block."
892 (if (or (not semantic-lex-maximum-depth)
893 (< semantic-lex-current-depth semantic-lex-maximum-depth))
894 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
895 (push (list syntax (car semantic-lex-token-stream))
896 semantic-lex-block-stack)))
897
898 (defun semantic-lex-end-block (syntax)
899 "Process the end of a previously marked SYNTAX block.
900 That is, collapse the tokens inside that block, including the
901 beginning and end of block tokens, into a high level block token of
902 class SYNTAX.
903 The token at beginning of block is the one marked by a previous call
904 to `semantic-lex-start-block'. The current token is the end of block.
905 The collapsed tokens are saved in `semantic-lex-block-streams'."
906 (if (null semantic-lex-block-stack)
907 (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
908 (let* ((stream semantic-lex-token-stream)
909 (blk (pop semantic-lex-block-stack))
910 (bstream (cdr blk))
911 (first (car bstream))
912 (last (pop stream)) ;; The current token mark the EOBLK
913 tok)
914 (if (not (eq (car blk) syntax))
915 ;; SYNTAX doesn't match the syntax of the current block in
916 ;; the stack. So we encountered the end of the SYNTAX block
917 ;; before the end of the current one in the stack which is
918 ;; signaled unterminated.
919 (semantic-lex-unterminated-syntax-detected (car blk))
920 ;; Move tokens found inside the block from the main stream
921 ;; into a separate block stream.
922 (while (and stream (not (eq (setq tok (pop stream)) first)))
923 (push tok bstream))
924 ;; The token marked as beginning of block was not encountered.
925 ;; This should not happen!
926 (or (eq tok first)
927 (error "Token %S not found at beginning of block `%s'"
928 first syntax))
929 ;; Save the block stream for future reuse, to avoid to redo
930 ;; the lexical analysis of the block content!
931 ;; Anchor the block stream with its start position, so we can
932 ;; use: (cdr (assq start semantic-lex-block-streams)) to
933 ;; quickly retrieve the lexical stream associated to a block.
934 (setcar blk (semantic-lex-token-start first))
935 (setcdr blk (nreverse bstream))
936 (push blk semantic-lex-block-streams)
937 ;; In the main stream, replace the tokens inside the block by
938 ;; a high level block token of class SYNTAX.
939 (setq semantic-lex-token-stream stream)
940 (semantic-lex-push-token
941 (semantic-lex-token
942 syntax (car blk) (semantic-lex-token-end last)))
943 ))))
944 \f
945 ;;; Lexical token API
946 ;;
947 ;; Functions for accessing parts of a token. Use these functions
948 ;; instead of accessing the list structure directly because the
949 ;; contents of the lexical may change.
950 ;;
951 (defmacro semantic-lex-token (symbol start end &optional str)
952 "Create a lexical token.
953 SYMBOL is a symbol representing the class of syntax found.
954 START and END define the bounds of the token in the current buffer.
955 Optional STR is the string for the token iff the the bounds
956 in the buffer do not cover the string they represent. (As from
957 macro expansion.)"
958 ;; This if statement checks the existance of a STR argument at
959 ;; compile time, where STR is some symbol or constant. If the
960 ;; variable STr (runtime) is nil, this will make an incorrect decision.
961 ;;
962 ;; It is like this to maintain the original speed of the compiled
963 ;; code.
964 (if str
965 `(cons ,symbol (cons ,str (cons ,start ,end)))
966 `(cons ,symbol (cons ,start ,end))))
967
968 (defun semantic-lex-token-p (thing)
969 "Return non-nil if THING is a semantic lex token.
970 This is an exhaustively robust check."
971 (and (consp thing)
972 (symbolp (car thing))
973 (or (and (numberp (nth 1 thing))
974 (numberp (nthcdr 2 thing)))
975 (and (stringp (nth 1 thing))
976 (numberp (nth 2 thing))
977 (numberp (nthcdr 3 thing)))
978 ))
979 )
980
981 (defun semantic-lex-token-with-text-p (thing)
982 "Return non-nil if THING is a semantic lex token.
983 This is an exhaustively robust check."
984 (and (consp thing)
985 (symbolp (car thing))
986 (= (length thing) 4)
987 (stringp (nth 1 thing))
988 (numberp (nth 2 thing))
989 (numberp (nth 3 thing)))
990 )
991
992 (defun semantic-lex-token-without-text-p (thing)
993 "Return non-nil if THING is a semantic lex token.
994 This is an exhaustively robust check."
995 (and (consp thing)
996 (symbolp (car thing))
997 (= (length thing) 3)
998 (numberp (nth 1 thing))
999 (numberp (nth 2 thing)))
1000 )
1001
1002 (eval-and-compile
1003
1004 (defun semantic-lex-expand-block-specs (specs)
1005 "Expand block specifications SPECS into a Lisp form.
1006 SPECS is a list of (BLOCK BEGIN END) elements where BLOCK, BEGIN, and
1007 END are token class symbols that indicate to produce one collapsed
1008 BLOCK token from tokens found between BEGIN and END ones.
1009 BLOCK must be a non-nil symbol, and at least one of the BEGIN or END
1010 symbols must be non-nil too.
1011 When BEGIN is non-nil, generate a call to `semantic-lex-start-block'
1012 when a BEGIN token class is encountered.
1013 When END is non-nil, generate a call to `semantic-lex-end-block' when
1014 an END token class is encountered."
1015 (let ((class (make-symbol "class"))
1016 (form nil))
1017 (dolist (spec specs)
1018 (when (car spec)
1019 (when (nth 1 spec)
1020 (push `((eq ',(nth 1 spec) ,class)
1021 (semantic-lex-start-block ',(car spec)))
1022 form))
1023 (when (nth 2 spec)
1024 (push `((eq ',(nth 2 spec) ,class)
1025 (semantic-lex-end-block ',(car spec)))
1026 form))))
1027 (when form
1028 `((let ((,class (semantic-lex-token-class
1029 (car semantic-lex-token-stream))))
1030 (cond ,@(nreverse form))))
1031 )))
1032 )
1033
1034 (defmacro semantic-lex-push-token (token &rest blockspecs)
1035 "Push TOKEN in the lexical analyzer token stream.
1036 Return the lexical analysis current end point.
1037 If optional arguments BLOCKSPECS is non-nil, it specifies to process
1038 collapsed block tokens. See `semantic-lex-expand-block-specs' for
1039 more details.
1040 This macro should only be called within the bounds of
1041 `define-lex-analyzer'. It changes the values of the lexical analyzer
1042 variables `token-stream' and `semantic-lex-end-point'. If you need to
1043 move `semantic-lex-end-point' somewhere else, just modify this
1044 variable after calling `semantic-lex-push-token'."
1045 `(progn
1046 (push ,token semantic-lex-token-stream)
1047 ,@(semantic-lex-expand-block-specs blockspecs)
1048 (setq semantic-lex-end-point
1049 (semantic-lex-token-end (car semantic-lex-token-stream)))
1050 ))
1051
1052 (defsubst semantic-lex-token-class (token)
1053 "Fetch the class of the lexical token TOKEN.
1054 See also the function `semantic-lex-token'."
1055 (car token))
1056
1057 (defsubst semantic-lex-token-text (token)
1058 "Fetch the text associated with the lexical token TOKEN.
1059 See also the function `semantic-lex-token'."
1060 (if (stringp (car (cdr token)))
1061 (car (cdr token))
1062 (buffer-substring-no-properties
1063 (semantic-lex-token-start token)
1064 (semantic-lex-token-end token))))
1065
1066 (defun semantic-lex-init ()
1067 "Initialize any lexical state for this buffer."
1068 (unless semantic-lex-comment-regex
1069 (setq semantic-lex-comment-regex
1070 (if comment-start-skip
1071 (concat "\\(\\s<\\|" comment-start-skip "\\)")
1072 "\\(\\s<\\)")))
1073 ;; Setup the lexer syntax-table
1074 (setq semantic-lex-syntax-table (copy-syntax-table (syntax-table)))
1075 (dolist (mod semantic-lex-syntax-modifications)
1076 (modify-syntax-entry
1077 (car mod) (nth 1 mod) semantic-lex-syntax-table)))
1078
1079 ;;;###autoload
1080 (define-overloadable-function semantic-lex (start end &optional depth length)
1081 "Lexically analyze text in the current buffer between START and END.
1082 Optional argument DEPTH indicates at what level to scan over entire
1083 lists. The last argument, LENGTH specifies that `semantic-lex'
1084 should only return LENGTH tokens. The return value is a token stream.
1085 Each element is a list, such of the form
1086 (symbol start-expression . end-expression)
1087 where SYMBOL denotes the token type.
1088 See `semantic-lex-tokens' variable for details on token types. END
1089 does not mark the end of the text scanned, only the end of the
1090 beginning of text scanned. Thus, if a string extends past END, the
1091 end of the return token will be larger than END. To truly restrict
1092 scanning, use `narrow-to-region'."
1093 (funcall semantic-lex-analyzer start end depth length))
1094
1095 (defsubst semantic-lex-buffer (&optional depth)
1096 "Lex the current buffer.
1097 Optional argument DEPTH is the depth to scan into lists."
1098 (semantic-lex (point-min) (point-max) depth))
1099
1100 (defsubst semantic-lex-list (semlist depth)
1101 "Lex the body of SEMLIST to DEPTH."
1102 (semantic-lex (semantic-lex-token-start semlist)
1103 (semantic-lex-token-end semlist)
1104 depth))
1105 \f
1106 ;;; Analyzer creation macros
1107 ;;
1108 ;; An individual analyzer is a condition and code that goes with it.
1109 ;;
1110 ;; Created analyzers become variables with the code associated with them
1111 ;; as the symbol value. These analyzers are assembled into a lexer
1112 ;; to create new lexical analyzers.
1113
1114 (defcustom semantic-lex-debug-analyzers nil
1115 "Non nil means to debug analyzers with syntax protection.
1116 Only in effect if `debug-on-error' is also non-nil."
1117 :group 'semantic
1118 :type 'boolean)
1119
1120 (defmacro semantic-lex-unterminated-syntax-protection (syntax &rest forms)
1121 "For SYNTAX, execute FORMS with protection for unterminated syntax.
1122 If FORMS throws an error, treat this as a syntax problem, and
1123 execute the unterminated syntax code. FORMS should return a position.
1124 Irreguardless of an error, the cursor should be moved to the end of
1125 the desired syntax, and a position returned.
1126 If `debug-on-error' is set, errors are not caught, so that you can
1127 debug them.
1128 Avoid using a large FORMS since it is duplicated."
1129 `(if (and debug-on-error semantic-lex-debug-analyzers)
1130 (progn ,@forms)
1131 (condition-case nil
1132 (progn ,@forms)
1133 (error
1134 (semantic-lex-unterminated-syntax-detected ,syntax)))))
1135 (put 'semantic-lex-unterminated-syntax-protection
1136 'lisp-indent-function 1)
1137
1138 (defmacro define-lex-analyzer (name doc condition &rest forms)
1139 "Create a single lexical analyzer NAME with DOC.
1140 When an analyzer is called, the current buffer and point are
1141 positioned in a buffer at the location to be analyzed.
1142 CONDITION is an expression which returns t if FORMS should be run.
1143 Within the bounds of CONDITION and FORMS, the use of backquote
1144 can be used to evaluate expressions at compile time.
1145 While forms are running, the following variables will be locally bound:
1146 `semantic-lex-analysis-bounds' - The bounds of the current analysis.
1147 of the form (START . END)
1148 `semantic-lex-maximum-depth' - The maximum depth of semantic-list
1149 for the current analysis.
1150 `semantic-lex-current-depth' - The current depth of `semantic-list' that has
1151 been decended.
1152 `semantic-lex-end-point' - End Point after match.
1153 Analyzers should set this to a buffer location if their
1154 match string does not represent the end of the matched text.
1155 `semantic-lex-token-stream' - The token list being collected.
1156 Add new lexical tokens to this list.
1157 Proper action in FORMS is to move the value of `semantic-lex-end-point' to
1158 after the location of the analyzed entry, and to add any discovered tokens
1159 at the beginning of `semantic-lex-token-stream'.
1160 This can be done by using `semantic-lex-push-token'."
1161 `(eval-and-compile
1162 (defvar ,name nil ,doc)
1163 (defun ,name nil)
1164 ;; Do this part separately so that re-evaluation rebuilds this code.
1165 (setq ,name '(,condition ,@forms))
1166 ;; Build a single lexical analyzer function, so the doc for
1167 ;; function help is automatically provided, and perhaps the
1168 ;; function could be useful for testing and debugging one
1169 ;; analyzer.
1170 (fset ',name (lambda () ,doc
1171 (let ((semantic-lex-token-stream nil)
1172 (semantic-lex-end-point (point))
1173 (semantic-lex-analysis-bounds
1174 (cons (point) (point-max)))
1175 (semantic-lex-current-depth 0)
1176 (semantic-lex-maximum-depth
1177 semantic-lex-depth)
1178 )
1179 (when ,condition ,@forms)
1180 semantic-lex-token-stream)))
1181 ))
1182
1183 (defmacro define-lex-regex-analyzer (name doc regexp &rest forms)
1184 "Create a lexical analyzer with NAME and DOC that will match REGEXP.
1185 FORMS are evaluated upon a successful match.
1186 See `define-lex-analyzer' for more about analyzers."
1187 `(define-lex-analyzer ,name
1188 ,doc
1189 (looking-at ,regexp)
1190 ,@forms
1191 ))
1192
1193 (defmacro define-lex-simple-regex-analyzer (name doc regexp toksym
1194 &optional index
1195 &rest forms)
1196 "Create a lexical analyzer with NAME and DOC that match REGEXP.
1197 TOKSYM is the symbol to use when creating a semantic lexical token.
1198 INDEX is the index into the match that defines the bounds of the token.
1199 Index should be a plain integer, and not specified in the macro as an
1200 expression.
1201 FORMS are evaluated upon a successful match BEFORE the new token is
1202 created. It is valid to ignore FORMS.
1203 See `define-lex-analyzer' for more about analyzers."
1204 `(define-lex-analyzer ,name
1205 ,doc
1206 (looking-at ,regexp)
1207 ,@forms
1208 (semantic-lex-push-token
1209 (semantic-lex-token ,toksym
1210 (match-beginning ,(or index 0))
1211 (match-end ,(or index 0))))
1212 ))
1213
1214 (defmacro define-lex-block-analyzer (name doc spec1 &rest specs)
1215 "Create a lexical analyzer NAME for paired delimiters blocks.
1216 It detects a paired delimiters block or the corresponding open or
1217 close delimiter depending on the value of the variable
1218 `semantic-lex-current-depth'. DOC is the documentation string of the lexical
1219 analyzer. SPEC1 and SPECS specify the token symbols and open, close
1220 delimiters used. Each SPEC has the form:
1221
1222 \(BLOCK-SYM (OPEN-DELIM OPEN-SYM) (CLOSE-DELIM CLOSE-SYM))
1223
1224 where BLOCK-SYM is the symbol returned in a block token. OPEN-DELIM
1225 and CLOSE-DELIM are respectively the open and close delimiters
1226 identifying a block. OPEN-SYM and CLOSE-SYM are respectively the
1227 symbols returned in open and close tokens."
1228 (let ((specs (cons spec1 specs))
1229 spec open olist clist)
1230 (while specs
1231 (setq spec (car specs)
1232 specs (cdr specs)
1233 open (nth 1 spec)
1234 ;; build alist ((OPEN-DELIM OPEN-SYM BLOCK-SYM) ...)
1235 olist (cons (list (car open) (cadr open) (car spec)) olist)
1236 ;; build alist ((CLOSE-DELIM CLOSE-SYM) ...)
1237 clist (cons (nth 2 spec) clist)))
1238 `(define-lex-analyzer ,name
1239 ,doc
1240 (and
1241 (looking-at "\\(\\s(\\|\\s)\\)")
1242 (let ((text (match-string 0)) match)
1243 (cond
1244 ((setq match (assoc text ',olist))
1245 (if (or (not semantic-lex-maximum-depth)
1246 (< semantic-lex-current-depth semantic-lex-maximum-depth))
1247 (progn
1248 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
1249 (semantic-lex-push-token
1250 (semantic-lex-token
1251 (nth 1 match)
1252 (match-beginning 0) (match-end 0))))
1253 (semantic-lex-push-token
1254 (semantic-lex-token
1255 (nth 2 match)
1256 (match-beginning 0)
1257 (save-excursion
1258 (semantic-lex-unterminated-syntax-protection (nth 2 match)
1259 (forward-list 1)
1260 (point)))
1261 ))
1262 ))
1263 ((setq match (assoc text ',clist))
1264 (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
1265 (semantic-lex-push-token
1266 (semantic-lex-token
1267 (nth 1 match)
1268 (match-beginning 0) (match-end 0)))))))
1269 )))
1270 \f
1271 ;;; Analyzers
1272 ;;
1273 ;; Pre-defined common analyzers.
1274 ;;
1275 (define-lex-analyzer semantic-lex-default-action
1276 "The default action when no other lexical actions match text.
1277 This action will just throw an error."
1278 t
1279 (error "Unmatched Text during Lexical Analysis"))
1280
1281 (define-lex-analyzer semantic-lex-beginning-of-line
1282 "Detect and create a beginning of line token (BOL)."
1283 (and (bolp)
1284 ;; Just insert a (bol N . N) token in the token stream,
1285 ;; without moving the point. N is the point at the
1286 ;; beginning of line.
1287 (semantic-lex-push-token (semantic-lex-token 'bol (point) (point)))
1288 nil) ;; CONTINUE
1289 ;; We identify and add the BOL token onto the stream, but since
1290 ;; semantic-lex-end-point doesn't move, we always fail CONDITION, and have no
1291 ;; FORMS body.
1292 nil)
1293
1294 (define-lex-simple-regex-analyzer semantic-lex-newline
1295 "Detect and create newline tokens."
1296 "\\s-*\\(\n\\|\\s>\\)" 'newline 1)
1297
1298 (define-lex-regex-analyzer semantic-lex-newline-as-whitespace
1299 "Detect and create newline tokens.
1300 Use this ONLY if newlines are not whitespace characters (such as when
1301 they are comment end characters) AND when you want whitespace tokens."
1302 "\\s-*\\(\n\\|\\s>\\)"
1303 ;; Language wants whitespaces. Create a token for it.
1304 (if (eq (semantic-lex-token-class (car semantic-lex-token-stream))
1305 'whitespace)
1306 ;; Merge whitespace tokens together if they are adjacent. Two
1307 ;; whitespace tokens may be sperated by a comment which is not in
1308 ;; the token stream.
1309 (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
1310 (match-end 0))
1311 (semantic-lex-push-token
1312 (semantic-lex-token
1313 'whitespace (match-beginning 0) (match-end 0)))))
1314
1315 (define-lex-regex-analyzer semantic-lex-ignore-newline
1316 "Detect and ignore newline tokens.
1317 Use this ONLY if newlines are not whitespace characters (such as when
1318 they are comment end characters)."
1319 "\\s-*\\(\n\\|\\s>\\)"
1320 (setq semantic-lex-end-point (match-end 0)))
1321
1322 (define-lex-regex-analyzer semantic-lex-whitespace
1323 "Detect and create whitespace tokens."
1324 ;; catch whitespace when needed
1325 "\\s-+"
1326 ;; Language wants whitespaces. Create a token for it.
1327 (if (eq (semantic-lex-token-class (car semantic-lex-token-stream))
1328 'whitespace)
1329 ;; Merge whitespace tokens together if they are adjacent. Two
1330 ;; whitespace tokens may be sperated by a comment which is not in
1331 ;; the token stream.
1332 (progn
1333 (setq semantic-lex-end-point (match-end 0))
1334 (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
1335 semantic-lex-end-point))
1336 (semantic-lex-push-token
1337 (semantic-lex-token
1338 'whitespace (match-beginning 0) (match-end 0)))))
1339
1340 (define-lex-regex-analyzer semantic-lex-ignore-whitespace
1341 "Detect and skip over whitespace tokens."
1342 ;; catch whitespace when needed
1343 "\\s-+"
1344 ;; Skip over the detected whitespace, do not create a token for it.
1345 (setq semantic-lex-end-point (match-end 0)))
1346
1347 (define-lex-simple-regex-analyzer semantic-lex-number
1348 "Detect and create number tokens.
1349 See `semantic-lex-number-expression' for details on matching numbers,
1350 and number formats."
1351 semantic-lex-number-expression 'number)
1352
1353 (define-lex-regex-analyzer semantic-lex-symbol-or-keyword
1354 "Detect and create symbol and keyword tokens."
1355 "\\(\\sw\\|\\s_\\)+"
1356 (semantic-lex-push-token
1357 (semantic-lex-token
1358 (or (semantic-lex-keyword-p (match-string 0)) 'symbol)
1359 (match-beginning 0) (match-end 0))))
1360
1361 (define-lex-simple-regex-analyzer semantic-lex-charquote
1362 "Detect and create charquote tokens."
1363 ;; Character quoting characters (ie, \n as newline)
1364 "\\s\\+" 'charquote)
1365
1366 (define-lex-simple-regex-analyzer semantic-lex-punctuation
1367 "Detect and create punctuation tokens."
1368 "\\(\\s.\\|\\s$\\|\\s'\\)" 'punctuation)
1369
1370 (define-lex-analyzer semantic-lex-punctuation-type
1371 "Detect and create a punctuation type token.
1372 Recognized punctuations are defined in the current table of lexical
1373 types, as the value of the `punctuation' token type."
1374 (and (looking-at "\\(\\s.\\|\\s$\\|\\s'\\)+")
1375 (let* ((key (match-string 0))
1376 (pos (match-beginning 0))
1377 (end (match-end 0))
1378 (len (- end pos))
1379 (lst (semantic-lex-type-value "punctuation" t))
1380 (def (car lst)) ;; default lexical symbol or nil
1381 (lst (cdr lst)) ;; alist of (LEX-SYM . PUNCT-STRING)
1382 (elt nil))
1383 (if lst
1384 ;; Starting with the longest one, search if the
1385 ;; punctuation string is defined for this language.
1386 (while (and (> len 0) (not (setq elt (rassoc key lst))))
1387 (setq len (1- len)
1388 key (substring key 0 len))))
1389 (if elt ;; Return the punctuation token found
1390 (semantic-lex-push-token
1391 (semantic-lex-token (car elt) pos (+ pos len)))
1392 (if def ;; Return a default generic token
1393 (semantic-lex-push-token
1394 (semantic-lex-token def pos end))
1395 ;; Nothing match
1396 )))))
1397
1398 (define-lex-regex-analyzer semantic-lex-paren-or-list
1399 "Detect open parenthesis.
1400 Return either a paren token or a semantic list token depending on
1401 `semantic-lex-current-depth'."
1402 "\\s("
1403 (if (or (not semantic-lex-maximum-depth)
1404 (< semantic-lex-current-depth semantic-lex-maximum-depth))
1405 (progn
1406 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
1407 (semantic-lex-push-token
1408 (semantic-lex-token
1409 'open-paren (match-beginning 0) (match-end 0))))
1410 (semantic-lex-push-token
1411 (semantic-lex-token
1412 'semantic-list (match-beginning 0)
1413 (save-excursion
1414 (semantic-lex-unterminated-syntax-protection 'semantic-list
1415 (forward-list 1)
1416 (point))
1417 )))
1418 ))
1419
1420 (define-lex-simple-regex-analyzer semantic-lex-open-paren
1421 "Detect and create an open parenthisis token."
1422 "\\s(" 'open-paren 0 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth)))
1423
1424 (define-lex-simple-regex-analyzer semantic-lex-close-paren
1425 "Detect and create a close paren token."
1426 "\\s)" 'close-paren 0 (setq semantic-lex-current-depth (1- semantic-lex-current-depth)))
1427
1428 (define-lex-regex-analyzer semantic-lex-string
1429 "Detect and create a string token."
1430 "\\s\""
1431 ;; Zing to the end of this string.
1432 (semantic-lex-push-token
1433 (semantic-lex-token
1434 'string (point)
1435 (save-excursion
1436 (semantic-lex-unterminated-syntax-protection 'string
1437 (forward-sexp 1)
1438 (point))
1439 ))))
1440
1441 (define-lex-regex-analyzer semantic-lex-comments
1442 "Detect and create a comment token."
1443 semantic-lex-comment-regex
1444 (save-excursion
1445 (forward-comment 1)
1446 ;; Generate newline token if enabled
1447 (if (bolp) (backward-char 1))
1448 (setq semantic-lex-end-point (point))
1449 ;; Language wants comments or want them as whitespaces,
1450 ;; link them together.
1451 (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) 'comment)
1452 (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
1453 semantic-lex-end-point)
1454 (semantic-lex-push-token
1455 (semantic-lex-token
1456 'comment (match-beginning 0) semantic-lex-end-point)))))
1457
1458 (define-lex-regex-analyzer semantic-lex-comments-as-whitespace
1459 "Detect comments and create a whitespace token."
1460 semantic-lex-comment-regex
1461 (save-excursion
1462 (forward-comment 1)
1463 ;; Generate newline token if enabled
1464 (if (bolp) (backward-char 1))
1465 (setq semantic-lex-end-point (point))
1466 ;; Language wants comments or want them as whitespaces,
1467 ;; link them together.
1468 (if (eq (semantic-lex-token-class (car semantic-lex-token-stream)) 'whitespace)
1469 (setcdr (semantic-lex-token-bounds (car semantic-lex-token-stream))
1470 semantic-lex-end-point)
1471 (semantic-lex-push-token
1472 (semantic-lex-token
1473 'whitespace (match-beginning 0) semantic-lex-end-point)))))
1474
1475 (define-lex-regex-analyzer semantic-lex-ignore-comments
1476 "Detect and create a comment token."
1477 semantic-lex-comment-regex
1478 (let ((comment-start-point (point)))
1479 (forward-comment 1)
1480 (if (eq (point) comment-start-point)
1481 ;; In this case our start-skip string failed
1482 ;; to work properly. Lets try and move over
1483 ;; whatever white space we matched to begin
1484 ;; with.
1485 (skip-syntax-forward "-.'"
1486 (save-excursion
1487 (end-of-line)
1488 (point)))
1489 ;; We may need to back up so newlines or whitespace is generated.
1490 (if (bolp)
1491 (backward-char 1)))
1492 (if (eq (point) comment-start-point)
1493 (error "Strange comment syntax prevents lexical analysis"))
1494 (setq semantic-lex-end-point (point))))
1495 \f
1496 ;;; Comment lexer
1497 ;;
1498 ;; Predefined lexers that could be used instead of creating new
1499 ;; analyers.
1500
1501 (define-lex semantic-comment-lexer
1502 "A simple lexical analyzer that handles comments.
1503 This lexer will only return comment tokens. It is the default lexer
1504 used by `semantic-find-doc-snarf-comment' to snarf up the comment at
1505 point."
1506 semantic-lex-ignore-whitespace
1507 semantic-lex-ignore-newline
1508 semantic-lex-comments
1509 semantic-lex-default-action)
1510
1511 ;;; Test Lexer
1512 ;;
1513 (define-lex semantic-simple-lexer
1514 "A simple lexical analyzer that handles simple buffers.
1515 This lexer ignores comments and whitespace, and will return
1516 syntax as specified by the syntax table."
1517 semantic-lex-ignore-whitespace
1518 semantic-lex-ignore-newline
1519 semantic-lex-number
1520 semantic-lex-symbol-or-keyword
1521 semantic-lex-charquote
1522 semantic-lex-paren-or-list
1523 semantic-lex-close-paren
1524 semantic-lex-string
1525 semantic-lex-ignore-comments
1526 semantic-lex-punctuation
1527 semantic-lex-default-action)
1528 \f
1529 ;;; Analyzers generated from grammar.
1530 ;;
1531 ;; Some analyzers are hand written. Analyzers created with these
1532 ;; functions are generated from the grammar files.
1533
1534 (defmacro define-lex-keyword-type-analyzer (name doc syntax)
1535 "Define a keyword type analyzer NAME with DOC string.
1536 SYNTAX is the regexp that matches a keyword syntactic expression."
1537 (let ((key (make-symbol "key")))
1538 `(define-lex-analyzer ,name
1539 ,doc
1540 (and (looking-at ,syntax)
1541 (let ((,key (semantic-lex-keyword-p (match-string 0))))
1542 (when ,key
1543 (semantic-lex-push-token
1544 (semantic-lex-token
1545 ,key (match-beginning 0) (match-end 0)))))))
1546 ))
1547
1548 (defmacro define-lex-sexp-type-analyzer (name doc syntax token)
1549 "Define a sexp type analyzer NAME with DOC string.
1550 SYNTAX is the regexp that matches the beginning of the s-expression.
1551 TOKEN is the lexical token returned when SYNTAX matches."
1552 `(define-lex-regex-analyzer ,name
1553 ,doc
1554 ,syntax
1555 (semantic-lex-push-token
1556 (semantic-lex-token
1557 ,token (point)
1558 (save-excursion
1559 (semantic-lex-unterminated-syntax-protection ,token
1560 (forward-sexp 1)
1561 (point))))))
1562 )
1563
1564 (defmacro define-lex-regex-type-analyzer (name doc syntax matches default)
1565 "Define a regexp type analyzer NAME with DOC string.
1566 SYNTAX is the regexp that matches a syntactic expression.
1567 MATCHES is an alist of lexical elements used to refine the syntactic
1568 expression.
1569 DEFAULT is the default lexical token returned when no MATCHES."
1570 (if matches
1571 (let* ((val (make-symbol "val"))
1572 (lst (make-symbol "lst"))
1573 (elt (make-symbol "elt"))
1574 (pos (make-symbol "pos"))
1575 (end (make-symbol "end")))
1576 `(define-lex-analyzer ,name
1577 ,doc
1578 (and (looking-at ,syntax)
1579 (let* ((,val (match-string 0))
1580 (,pos (match-beginning 0))
1581 (,end (match-end 0))
1582 (,lst ,matches)
1583 ,elt)
1584 (while (and ,lst (not ,elt))
1585 (if (string-match (cdar ,lst) ,val)
1586 (setq ,elt (caar ,lst))
1587 (setq ,lst (cdr ,lst))))
1588 (semantic-lex-push-token
1589 (semantic-lex-token (or ,elt ,default) ,pos ,end))))
1590 ))
1591 `(define-lex-simple-regex-analyzer ,name
1592 ,doc
1593 ,syntax ,default)
1594 ))
1595
1596 (defmacro define-lex-string-type-analyzer (name doc syntax matches default)
1597 "Define a string type analyzer NAME with DOC string.
1598 SYNTAX is the regexp that matches a syntactic expression.
1599 MATCHES is an alist of lexical elements used to refine the syntactic
1600 expression.
1601 DEFAULT is the default lexical token returned when no MATCHES."
1602 (if matches
1603 (let* ((val (make-symbol "val"))
1604 (lst (make-symbol "lst"))
1605 (elt (make-symbol "elt"))
1606 (pos (make-symbol "pos"))
1607 (end (make-symbol "end"))
1608 (len (make-symbol "len")))
1609 `(define-lex-analyzer ,name
1610 ,doc
1611 (and (looking-at ,syntax)
1612 (let* ((,val (match-string 0))
1613 (,pos (match-beginning 0))
1614 (,end (match-end 0))
1615 (,len (- ,end ,pos))
1616 (,lst ,matches)
1617 ,elt)
1618 ;; Starting with the longest one, search if a lexical
1619 ;; value match a token defined for this language.
1620 (while (and (> ,len 0) (not (setq ,elt (rassoc ,val ,lst))))
1621 (setq ,len (1- ,len)
1622 ,val (substring ,val 0 ,len)))
1623 (when ,elt ;; Adjust token end position.
1624 (setq ,elt (car ,elt)
1625 ,end (+ ,pos ,len)))
1626 (semantic-lex-push-token
1627 (semantic-lex-token (or ,elt ,default) ,pos ,end))))
1628 ))
1629 `(define-lex-simple-regex-analyzer ,name
1630 ,doc
1631 ,syntax ,default)
1632 ))
1633
1634 (defmacro define-lex-block-type-analyzer (name doc syntax matches)
1635 "Define a block type analyzer NAME with DOC string.
1636
1637 SYNTAX is the regexp that matches block delimiters, typically the
1638 open (`\\\\s(') and close (`\\\\s)') parenthesis syntax classes.
1639
1640 MATCHES is a pair (OPEN-SPECS . CLOSE-SPECS) that defines blocks.
1641
1642 OPEN-SPECS is a list of (OPEN-DELIM OPEN-TOKEN BLOCK-TOKEN) elements
1643 where:
1644
1645 OPEN-DELIM is a string: the block open delimiter character.
1646
1647 OPEN-TOKEN is the lexical token class associated to the OPEN-DELIM
1648 delimiter.
1649
1650 BLOCK-TOKEN is the lexical token class associated to the block
1651 that starts at the OPEN-DELIM delimiter.
1652
1653 CLOSE-SPECS is a list of (CLOSE-DELIM CLOSE-TOKEN) elements where:
1654
1655 CLOSE-DELIM is a string: the block end delimiter character.
1656
1657 CLOSE-TOKEN is the lexical token class associated to the
1658 CLOSE-DELIM delimiter.
1659
1660 Each element in OPEN-SPECS must have a corresponding element in
1661 CLOSE-SPECS.
1662
1663 The lexer will return a BLOCK-TOKEN token when the value of
1664 `semantic-lex-current-depth' is greater than or equal to the maximum
1665 depth of parenthesis tracking (see also the function `semantic-lex').
1666 Otherwise it will return OPEN-TOKEN and CLOSE-TOKEN tokens.
1667
1668 TO DO: Put the following in the developer's guide and just put a
1669 reference here.
1670
1671 In the grammar:
1672
1673 The value of a block token must be a string that contains a readable
1674 sexp of the form:
1675
1676 \"(OPEN-TOKEN CLOSE-TOKEN)\"
1677
1678 OPEN-TOKEN and CLOSE-TOKEN represent the block delimiters, and must be
1679 lexical tokens of respectively `open-paren' and `close-paren' types.
1680 Their value is the corresponding delimiter character as a string.
1681
1682 Here is a small example to analyze a parenthesis block:
1683
1684 %token <block> PAREN_BLOCK \"(LPAREN RPAREN)\"
1685 %token <open-paren> LPAREN \"(\"
1686 %token <close-paren> RPAREN \")\"
1687
1688 When the lexer encounters the open-paren delimiter \"(\":
1689
1690 - If the maximum depth of parenthesis tracking is not reached (that
1691 is, current depth < max depth), it returns a (LPAREN start . end)
1692 token, then continue analysis inside the block. Later, when the
1693 corresponding close-paren delimiter \")\" will be encountered, it
1694 will return a (RPAREN start . end) token.
1695
1696 - If the maximum depth of parenthesis tracking is reached (current
1697 depth >= max depth), it returns the whole parenthesis block as
1698 a (PAREN_BLOCK start . end) token."
1699 (let* ((val (make-symbol "val"))
1700 (lst (make-symbol "lst"))
1701 (elt (make-symbol "elt")))
1702 `(define-lex-analyzer ,name
1703 ,doc
1704 (and
1705 (looking-at ,syntax) ;; "\\(\\s(\\|\\s)\\)"
1706 (let ((,val (match-string 0))
1707 (,lst ,matches)
1708 ,elt)
1709 (cond
1710 ((setq ,elt (assoc ,val (car ,lst)))
1711 (if (or (not semantic-lex-maximum-depth)
1712 (< semantic-lex-current-depth semantic-lex-maximum-depth))
1713 (progn
1714 (setq semantic-lex-current-depth (1+ semantic-lex-current-depth))
1715 (semantic-lex-push-token
1716 (semantic-lex-token
1717 (nth 1 ,elt)
1718 (match-beginning 0) (match-end 0))))
1719 (semantic-lex-push-token
1720 (semantic-lex-token
1721 (nth 2 ,elt)
1722 (match-beginning 0)
1723 (save-excursion
1724 (semantic-lex-unterminated-syntax-protection (nth 2 ,elt)
1725 (forward-list 1)
1726 (point)))))))
1727 ((setq ,elt (assoc ,val (cdr ,lst)))
1728 (setq semantic-lex-current-depth (1- semantic-lex-current-depth))
1729 (semantic-lex-push-token
1730 (semantic-lex-token
1731 (nth 1 ,elt)
1732 (match-beginning 0) (match-end 0))))
1733 ))))
1734 ))
1735 \f
1736 ;;; Lexical Safety
1737 ;;
1738 ;; The semantic lexers, unlike other lexers, can throw errors on
1739 ;; unbalanced syntax. Since editing is all about changeging test
1740 ;; we need to provide a convenient way to protect against syntactic
1741 ;; inequalities.
1742
1743 (defmacro semantic-lex-catch-errors (symbol &rest forms)
1744 "Using SYMBOL, execute FORMS catching lexical errors.
1745 If FORMS results in a call to the parser that throws a lexical error,
1746 the error will be caught here without the buffer's cache being thrown
1747 out of date.
1748 If there is an error, the syntax that failed is returned.
1749 If there is no error, then the last value of FORMS is returned."
1750 (let ((ret (make-symbol "ret"))
1751 (syntax (make-symbol "syntax"))
1752 (start (make-symbol "start"))
1753 (end (make-symbol "end")))
1754 `(let* ((semantic-lex-unterminated-syntax-end-function
1755 (lambda (,syntax ,start ,end)
1756 (throw ',symbol ,syntax)))
1757 ;; Delete the below when semantic-flex is fully retired.
1758 (semantic-flex-unterminated-syntax-end-function
1759 semantic-lex-unterminated-syntax-end-function)
1760 (,ret (catch ',symbol
1761 (save-excursion
1762 ,@forms
1763 nil))))
1764 ;; Great Sadness. Assume that FORMS execute within the
1765 ;; confines of the current buffer only! Mark this thing
1766 ;; unparseable iff the special symbol was thrown. This
1767 ;; will prevent future calls from parsing, but will allow
1768 ;; then to still return the cache.
1769 (when ,ret
1770 ;; Leave this message off. If an APP using this fcn wants
1771 ;; a message, they can do it themselves. This cleans up
1772 ;; problems with the idle scheduler obscuring useful data.
1773 ;;(message "Buffer not currently parsable (%S)." ,ret)
1774 (semantic-parse-tree-unparseable))
1775 ,ret)))
1776 (put 'semantic-lex-catch-errors 'lisp-indent-function 1)
1777
1778 \f
1779 ;;; Interfacing with edebug
1780 ;;
1781 (add-hook
1782 'edebug-setup-hook
1783 #'(lambda ()
1784
1785 (def-edebug-spec define-lex
1786 (&define name stringp (&rest symbolp))
1787 )
1788 (def-edebug-spec define-lex-analyzer
1789 (&define name stringp form def-body)
1790 )
1791 (def-edebug-spec define-lex-regex-analyzer
1792 (&define name stringp form def-body)
1793 )
1794 (def-edebug-spec define-lex-simple-regex-analyzer
1795 (&define name stringp form symbolp [ &optional form ] def-body)
1796 )
1797 (def-edebug-spec define-lex-block-analyzer
1798 (&define name stringp form (&rest form))
1799 )
1800 (def-edebug-spec semantic-lex-catch-errors
1801 (symbolp def-body)
1802 )
1803
1804 ))
1805 \f
1806 ;;; Compatibility with Semantic 1.x lexical analysis
1807 ;;
1808 ;; NOTE: DELETE THIS SOMEDAY SOON
1809
1810 (semantic-alias-obsolete 'semantic-flex-start 'semantic-lex-token-start)
1811 (semantic-alias-obsolete 'semantic-flex-end 'semantic-lex-token-end)
1812 (semantic-alias-obsolete 'semantic-flex-text 'semantic-lex-token-text)
1813 (semantic-alias-obsolete 'semantic-flex-make-keyword-table 'semantic-lex-make-keyword-table)
1814 (semantic-alias-obsolete 'semantic-flex-keyword-p 'semantic-lex-keyword-p)
1815 (semantic-alias-obsolete 'semantic-flex-keyword-put 'semantic-lex-keyword-put)
1816 (semantic-alias-obsolete 'semantic-flex-keyword-get 'semantic-lex-keyword-get)
1817 (semantic-alias-obsolete 'semantic-flex-map-keywords 'semantic-lex-map-keywords)
1818 (semantic-alias-obsolete 'semantic-flex-keywords 'semantic-lex-keywords)
1819 (semantic-alias-obsolete 'semantic-flex-buffer 'semantic-lex-buffer)
1820 (semantic-alias-obsolete 'semantic-flex-list 'semantic-lex-list)
1821
1822 ;; This simple scanner uses the syntax table to generate a stream of
1823 ;; simple tokens of the form:
1824 ;;
1825 ;; (SYMBOL START . END)
1826 ;;
1827 ;; Where symbol is the type of thing it is. START and END mark that
1828 ;; objects boundary.
1829
1830 (defvar semantic-flex-tokens semantic-lex-tokens
1831 "An alist of of semantic token types.
1832 See variable `semantic-lex-tokens'.")
1833
1834 (defvar semantic-flex-unterminated-syntax-end-function
1835 (lambda (syntax syntax-start flex-end) flex-end)
1836 "Function called when unterminated syntax is encountered.
1837 This should be set to one function. That function should take three
1838 parameters. The SYNTAX, or type of syntax which is unterminated.
1839 SYNTAX-START where the broken syntax begins.
1840 FLEX-END is where the lexical analysis was asked to end.
1841 This function can be used for languages that can intelligently fix up
1842 broken syntax, or the exit lexical analysis via `throw' or `signal'
1843 when finding unterminated syntax.")
1844
1845 (defvar semantic-flex-extensions nil
1846 "Buffer local extensions to the lexical analyzer.
1847 This should contain an alist with a key of a regex and a data element of
1848 a function. The function should both move point, and return a lexical
1849 token of the form:
1850 ( TYPE START . END)
1851 nil is also a valid return value.
1852 TYPE can be any type of symbol, as long as it doesn't occur as a
1853 nonterminal in the language definition.")
1854 (make-variable-buffer-local 'semantic-flex-extensions)
1855
1856 (defvar semantic-flex-syntax-modifications nil
1857 "Changes to the syntax table for this buffer.
1858 These changes are active only while the buffer is being flexed.
1859 This is a list where each element has the form:
1860 (CHAR CLASS)
1861 CHAR is the char passed to `modify-syntax-entry',
1862 and CLASS is the string also passed to `modify-syntax-entry' to define
1863 what syntax class CHAR has.")
1864 (make-variable-buffer-local 'semantic-flex-syntax-modifications)
1865
1866 (defvar semantic-ignore-comments t
1867 "Default comment handling.
1868 t means to strip comments when flexing. Nil means to keep comments
1869 as part of the token stream.")
1870 (make-variable-buffer-local 'semantic-ignore-comments)
1871
1872 (defvar semantic-flex-enable-newlines nil
1873 "When flexing, report 'newlines as syntactic elements.
1874 Useful for languages where the newline is a special case terminator.
1875 Only set this on a per mode basis, not globally.")
1876 (make-variable-buffer-local 'semantic-flex-enable-newlines)
1877
1878 (defvar semantic-flex-enable-whitespace nil
1879 "When flexing, report 'whitespace as syntactic elements.
1880 Useful for languages where the syntax is whitespace dependent.
1881 Only set this on a per mode basis, not globally.")
1882 (make-variable-buffer-local 'semantic-flex-enable-whitespace)
1883
1884 (defvar semantic-flex-enable-bol nil
1885 "When flexing, report beginning of lines as syntactic elements.
1886 Useful for languages like python which are indentation sensitive.
1887 Only set this on a per mode basis, not globally.")
1888 (make-variable-buffer-local 'semantic-flex-enable-bol)
1889
1890 (defvar semantic-number-expression semantic-lex-number-expression
1891 "See variable `semantic-lex-number-expression'.")
1892 (make-variable-buffer-local 'semantic-number-expression)
1893
1894 (defvar semantic-flex-depth 0
1895 "Default flexing depth.
1896 This specifies how many lists to create tokens in.")
1897 (make-variable-buffer-local 'semantic-flex-depth)
1898
1899 (defun semantic-flex (start end &optional depth length)
1900 "Using the syntax table, do something roughly equivalent to flex.
1901 Semantically check between START and END. Optional argument DEPTH
1902 indicates at what level to scan over entire lists.
1903 The return value is a token stream. Each element is a list, such of
1904 the form (symbol start-expression . end-expression) where SYMBOL
1905 denotes the token type.
1906 See `semantic-flex-tokens' variable for details on token types.
1907 END does not mark the end of the text scanned, only the end of the
1908 beginning of text scanned. Thus, if a string extends past END, the
1909 end of the return token will be larger than END. To truly restrict
1910 scanning, use `narrow-to-region'.
1911 The last argument, LENGTH specifies that `semantic-flex' should only
1912 return LENGTH tokens."
1913 (message "`semantic-flex' is an obsolete function. Use `define-lex' to create lexers.")
1914 (if (not semantic-flex-keywords-obarray)
1915 (setq semantic-flex-keywords-obarray [ nil ]))
1916 (let ((ts nil)
1917 (pos (point))
1918 (ep nil)
1919 (curdepth 0)
1920 (cs (if comment-start-skip
1921 (concat "\\(\\s<\\|" comment-start-skip "\\)")
1922 (concat "\\(\\s<\\)")))
1923 (newsyntax (copy-syntax-table (syntax-table)))
1924 (mods semantic-flex-syntax-modifications)
1925 ;; Use the default depth if it is not specified.
1926 (depth (or depth semantic-flex-depth)))
1927 ;; Update the syntax table
1928 (while mods
1929 (modify-syntax-entry (car (car mods)) (car (cdr (car mods))) newsyntax)
1930 (setq mods (cdr mods)))
1931 (with-syntax-table newsyntax
1932 (goto-char start)
1933 (while (and (< (point) end) (or (not length) (<= (length ts) length)))
1934 (cond
1935 ;; catch beginning of lines when needed.
1936 ;; Must be done before catching any other tokens!
1937 ((and semantic-flex-enable-bol
1938 (bolp)
1939 ;; Just insert a (bol N . N) token in the token stream,
1940 ;; without moving the point. N is the point at the
1941 ;; beginning of line.
1942 (setq ts (cons (cons 'bol (cons (point) (point))) ts))
1943 nil)) ;; CONTINUE
1944 ;; special extensions, includes whitespace, nl, etc.
1945 ((and semantic-flex-extensions
1946 (let ((fe semantic-flex-extensions)
1947 (r nil))
1948 (while fe
1949 (if (looking-at (car (car fe)))
1950 (setq ts (cons (funcall (cdr (car fe))) ts)
1951 r t
1952 fe nil
1953 ep (point)))
1954 (setq fe (cdr fe)))
1955 (if (and r (not (car ts))) (setq ts (cdr ts)))
1956 r)))
1957 ;; catch newlines when needed
1958 ((looking-at "\\s-*\\(\n\\|\\s>\\)")
1959 (if semantic-flex-enable-newlines
1960 (setq ep (match-end 1)
1961 ts (cons (cons 'newline
1962 (cons (match-beginning 1) ep))
1963 ts))))
1964 ;; catch whitespace when needed
1965 ((looking-at "\\s-+")
1966 (if semantic-flex-enable-whitespace
1967 ;; Language wants whitespaces, link them together.
1968 (if (eq (car (car ts)) 'whitespace)
1969 (setcdr (cdr (car ts)) (match-end 0))
1970 (setq ts (cons (cons 'whitespace
1971 (cons (match-beginning 0)
1972 (match-end 0)))
1973 ts)))))
1974 ;; numbers
1975 ((and semantic-number-expression
1976 (looking-at semantic-number-expression))
1977 (setq ts (cons (cons 'number
1978 (cons (match-beginning 0)
1979 (match-end 0)))
1980 ts)))
1981 ;; symbols
1982 ((looking-at "\\(\\sw\\|\\s_\\)+")
1983 (setq ts (cons (cons
1984 ;; Get info on if this is a keyword or not
1985 (or (semantic-lex-keyword-p (match-string 0))
1986 'symbol)
1987 (cons (match-beginning 0) (match-end 0)))
1988 ts)))
1989 ;; Character quoting characters (ie, \n as newline)
1990 ((looking-at "\\s\\+")
1991 (setq ts (cons (cons 'charquote
1992 (cons (match-beginning 0) (match-end 0)))
1993 ts)))
1994 ;; Open parens, or semantic-lists.
1995 ((looking-at "\\s(")
1996 (if (or (not depth) (< curdepth depth))
1997 (progn
1998 (setq curdepth (1+ curdepth))
1999 (setq ts (cons (cons 'open-paren
2000 (cons (match-beginning 0) (match-end 0)))
2001 ts)))
2002 (setq ts (cons
2003 (cons 'semantic-list
2004 (cons (match-beginning 0)
2005 (save-excursion
2006 (condition-case nil
2007 (forward-list 1)
2008 ;; This case makes flex robust
2009 ;; to broken lists.
2010 (error
2011 (goto-char
2012 (funcall
2013 semantic-flex-unterminated-syntax-end-function
2014 'semantic-list
2015 start end))))
2016 (setq ep (point)))))
2017 ts))))
2018 ;; Close parens
2019 ((looking-at "\\s)")
2020 (setq ts (cons (cons 'close-paren
2021 (cons (match-beginning 0) (match-end 0)))
2022 ts))
2023 (setq curdepth (1- curdepth)))
2024 ;; String initiators
2025 ((looking-at "\\s\"")
2026 ;; Zing to the end of this string.
2027 (setq ts (cons (cons 'string
2028 (cons (match-beginning 0)
2029 (save-excursion
2030 (condition-case nil
2031 (forward-sexp 1)
2032 ;; This case makes flex
2033 ;; robust to broken strings.
2034 (error
2035 (goto-char
2036 (funcall
2037 semantic-flex-unterminated-syntax-end-function
2038 'string
2039 start end))))
2040 (setq ep (point)))))
2041 ts)))
2042 ;; comments
2043 ((looking-at cs)
2044 (if (and semantic-ignore-comments
2045 (not semantic-flex-enable-whitespace))
2046 ;; If the language doesn't deal with comments nor
2047 ;; whitespaces, ignore them here.
2048 (let ((comment-start-point (point)))
2049 (forward-comment 1)
2050 (if (eq (point) comment-start-point)
2051 ;; In this case our start-skip string failed
2052 ;; to work properly. Lets try and move over
2053 ;; whatever white space we matched to begin
2054 ;; with.
2055 (skip-syntax-forward "-.'"
2056 (save-excursion
2057 (end-of-line)
2058 (point)))
2059 ;;(forward-comment 1)
2060 ;; Generate newline token if enabled
2061 (if (and semantic-flex-enable-newlines
2062 (bolp))
2063 (backward-char 1)))
2064 (if (eq (point) comment-start-point)
2065 (error "Strange comment syntax prevents lexical analysis"))
2066 (setq ep (point)))
2067 (let ((tk (if semantic-ignore-comments 'whitespace 'comment)))
2068 (save-excursion
2069 (forward-comment 1)
2070 ;; Generate newline token if enabled
2071 (if (and semantic-flex-enable-newlines
2072 (bolp))
2073 (backward-char 1))
2074 (setq ep (point)))
2075 ;; Language wants comments or want them as whitespaces,
2076 ;; link them together.
2077 (if (eq (car (car ts)) tk)
2078 (setcdr (cdr (car ts)) ep)
2079 (setq ts (cons (cons tk (cons (match-beginning 0) ep))
2080 ts))))))
2081 ;; punctuation
2082 ((looking-at "\\(\\s.\\|\\s$\\|\\s'\\)")
2083 (setq ts (cons (cons 'punctuation
2084 (cons (match-beginning 0) (match-end 0)))
2085 ts)))
2086 ;; unknown token
2087 (t
2088 (error "What is that?")))
2089 (goto-char (or ep (match-end 0)))
2090 (setq ep nil)))
2091 ;; maybe catch the last beginning of line when needed
2092 (and semantic-flex-enable-bol
2093 (= (point) end)
2094 (bolp)
2095 (setq ts (cons (cons 'bol (cons (point) (point))) ts)))
2096 (goto-char pos)
2097 ;;(message "Flexing muscles...done")
2098 (nreverse ts)))
2099
2100 (provide 'semantic/lex)
2101
2102 ;; Local variables:
2103 ;; generated-autoload-file: "loaddefs.el"
2104 ;; generated-autoload-feature: semantic/loaddefs
2105 ;; generated-autoload-load-name: "semantic/lex"
2106 ;; End:
2107
2108 ;;; semantic-lex.el ends here