defconst, defvar: proclaim special at compile-time
[bpt/guile.git] / module / language / elisp / lexer.scm
CommitLineData
eb80072d
LC
1;;; Guile Emacs Lisp
2
14b9aa95 3;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
eb80072d
LC
4;;;
5;;; This library is free software; you can redistribute it and/or
6;;; modify it under the terms of the GNU Lesser General Public
7;;; License as published by the Free Software Foundation; either
8;;; version 3 of the License, or (at your option) any later version.
9;;;
10;;; This library is distributed in the hope that it will be useful,
11;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13;;; Lesser General Public License for more details.
14;;;
15;;; You should have received a copy of the GNU Lesser General Public
16;;; License along with this library; if not, write to the Free Software
17;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25512a94
DK
18
19;;; Code:
20
21(define-module (language elisp lexer)
22 #:use-module (ice-9 regex)
0faf3965 23 #:use-module (language elisp runtime)
ddb4364b 24 #:export (get-lexer get-lexer/1))
25512a94 25
c983a199
BT
26;;; This is the lexical analyzer for the elisp reader. It is
27;;; hand-written instead of using some generator. I think this is the
28;;; best solution because of all that fancy escape sequence handling and
29;;; the like.
30;;;
31;;; Characters are handled internally as integers representing their
32;;; code value. This is necessary because elisp allows a lot of fancy
33;;; modifiers that set certain high-range bits and the resulting values
34;;; would not fit into a real Scheme character range. Additionally,
35;;; elisp wants characters as integers, so we just do the right thing...
36;;;
37;;; TODO: #@count comments
5b1ee3be 38
c983a199 39;;; Report an error from the lexer (that is, invalid input given).
5b1ee3be
DK
40
41(define (lexer-error port msg . args)
42 (apply error msg args))
43
c983a199
BT
44;;; In a character, set a given bit. This is just some bit-wise or'ing
45;;; on the characters integer code and converting back to character.
5b1ee3be
DK
46
47(define (set-char-bit chr bit)
48 (logior chr (ash 1 bit)))
49
c983a199
BT
50;;; Check if a character equals some other. This is just like char=?
51;;; except that the tested one could be EOF in which case it simply
52;;; isn't equal.
5b1ee3be
DK
53
54(define (is-char? tested should-be)
55 (and (not (eof-object? tested))
56 (char=? tested should-be)))
57
c983a199
BT
58;;; For a character (as integer code), find the real character it
59;;; represents or #\nul if out of range. This is used to work with
60;;; Scheme character functions like char-numeric?.
5b1ee3be
DK
61
62(define (real-character chr)
63 (if (< chr 256)
f4e5e411
BT
64 (integer->char chr)
65 #\nul))
5b1ee3be 66
c983a199
BT
67;;; Return the control modified version of a character. This is not
68;;; just setting a modifier bit, because ASCII conrol characters must be
69;;; handled as such, and in elisp C-? is the delete character for
70;;; historical reasons. Otherwise, we set bit 26.
5b1ee3be
DK
71
72(define (add-control chr)
73 (let ((real (real-character chr)))
74 (if (char-alphabetic? real)
f4e5e411
BT
75 (- (char->integer (char-upcase real)) (char->integer #\@))
76 (case real
77 ((#\?) 127)
78 ((#\@) 0)
79 (else (set-char-bit chr 26))))))
5b1ee3be 80
c983a199
BT
81;;; Parse a charcode given in some base, basically octal or hexadecimal
82;;; are needed. A requested number of digits can be given (#f means it
83;;; does not matter and arbitrary many are allowed), and additionally
84;;; early return allowed (if fewer valid digits are found). These
85;;; options are all we need to handle the \u, \U, \x and \ddd (octal
86;;; digits) escape sequences.
5b1ee3be
DK
87
88(define (charcode-escape port base digits early-return)
89 (let iterate ((result 0)
90 (procdigs 0))
91 (if (and digits (>= procdigs digits))
f4e5e411
BT
92 result
93 (let* ((cur (read-char port))
94 (value (cond
95 ((char-numeric? cur)
96 (- (char->integer cur) (char->integer #\0)))
97 ((char-alphabetic? cur)
98 (let ((code (- (char->integer (char-upcase cur))
99 (char->integer #\A))))
100 (if (< code 0)
101 #f
102 (+ code 10))))
103 (else #f)))
104 (valid (and value (< value base))))
105 (if (not valid)
106 (if (or (not digits) early-return)
107 (begin
108 (unread-char cur port)
109 result)
110 (lexer-error port
111 "invalid digit in escape-code"
112 base
113 cur))
114 (iterate (+ (* result base) value) (1+ procdigs)))))))
5b1ee3be 115
c983a199
BT
116;;; Read a character and process escape-sequences when necessary. The
117;;; special in-string argument defines if this character is part of a
118;;; string literal or a single character literal, the difference being
119;;; that in strings the meta modifier sets bit 7, while it is bit 27 for
120;;; characters.
5b1ee3be
DK
121
122(define basic-escape-codes
f4e5e411
BT
123 '((#\a . 7)
124 (#\b . 8)
125 (#\t . 9)
126 (#\n . 10)
127 (#\v . 11)
128 (#\f . 12)
129 (#\r . 13)
130 (#\e . 27)
131 (#\s . 32)
132 (#\d . 127)))
5b1ee3be
DK
133
134(define (get-character port in-string)
f4e5e411
BT
135 (let ((meta-bits `((#\A . 22)
136 (#\s . 23)
137 (#\H . 24)
138 (#\S . 25)
139 (#\M . ,(if in-string 7 27))))
5b1ee3be
DK
140 (cur (read-char port)))
141 (if (char=? cur #\\)
f4e5e411
BT
142 ;; Handle an escape-sequence.
143 (let* ((escaped (read-char port))
144 (esc-code (assq-ref basic-escape-codes escaped))
145 (meta (assq-ref meta-bits escaped)))
146 (cond
147 ;; Meta-check must be before esc-code check because \s- must
148 ;; be recognized as the super-meta modifier if a - follows.
149 ;; If not, it will be caught as \s -> space escape code.
150 ((and meta (is-char? (peek-char port) #\-))
151 (if (not (char=? (read-char port) #\-))
152 (error "expected - after control sequence"))
153 (set-char-bit (get-character port in-string) meta))
154 ;; One of the basic control character escape names?
155 (esc-code esc-code)
156 ;; Handle \ddd octal code if it is one.
157 ((and (char>=? escaped #\0) (char<? escaped #\8))
158 (begin
159 (unread-char escaped port)
160 (charcode-escape port 8 3 #t)))
161 ;; Check for some escape-codes directly or otherwise use the
162 ;; escaped character literally.
163 (else
5b1ee3be
DK
164 (case escaped
165 ((#\^) (add-control (get-character port in-string)))
166 ((#\C)
167 (if (is-char? (peek-char port) #\-)
f4e5e411
BT
168 (begin
169 (if (not (char=? (read-char port) #\-))
170 (error "expected - after control sequence"))
171 (add-control (get-character port in-string)))
172 escaped))
5b1ee3be
DK
173 ((#\x) (charcode-escape port 16 #f #t))
174 ((#\u) (charcode-escape port 16 4 #f))
175 ((#\U) (charcode-escape port 16 8 #f))
176 (else (char->integer escaped))))))
f4e5e411
BT
177 ;; No escape-sequence, just the literal character. But remember
178 ;; to get the code instead!
179 (char->integer cur))))
5b1ee3be 180
c983a199
BT
181;;; Read a symbol or number from a port until something follows that
182;;; marks the start of a new token (like whitespace or parentheses).
183;;; The data read is returned as a string for further conversion to the
184;;; correct type, but we also return what this is
185;;; (integer/float/symbol). If any escaped character is found, it must
186;;; be a symbol. Otherwise we at the end check the result-string
187;;; against regular expressions to determine if it is possibly an
188;;; integer or a float.
25512a94
DK
189
190(define integer-regex (make-regexp "^[+-]?[0-9]+\\.?$"))
abcf4a9e 191
25512a94 192(define float-regex
f4e5e411
BT
193 (make-regexp
194 "^[+-]?([0-9]+\\.?[0-9]*|[0-9]*\\.?[0-9]+)(e[+-]?[0-9]+)?$"))
25512a94 195
c983a199
BT
196;;; A dot is also allowed literally, only a single dort alone is parsed
197;;; as the 'dot' terminal for dotted lists.
abcf4a9e 198
25512a94
DK
199(define no-escape-punctuation (string->char-set "-+=*/_~!@$%^&:<>{}?."))
200
201(define (get-symbol-or-number port)
202 (let iterate ((result-chars '())
203 (had-escape #f))
204 (let* ((c (read-char port))
205 (finish (lambda ()
f4e5e411
BT
206 (let ((result (list->string
207 (reverse result-chars))))
25512a94 208 (values
f4e5e411
BT
209 (cond
210 ((and (not had-escape)
211 (regexp-exec integer-regex result))
212 'integer)
213 ((and (not had-escape)
214 (regexp-exec float-regex result))
215 'float)
216 (else 'symbol))
217 result))))
25512a94
DK
218 (need-no-escape? (lambda (c)
219 (or (char-numeric? c)
220 (char-alphabetic? c)
f4e5e411
BT
221 (char-set-contains?
222 no-escape-punctuation
223 c)))))
25512a94 224 (cond
f4e5e411
BT
225 ((eof-object? c) (finish))
226 ((need-no-escape? c) (iterate (cons c result-chars) had-escape))
227 ((char=? c #\\) (iterate (cons (read-char port) result-chars) #t))
228 (else
229 (unread-char c port)
230 (finish))))))
9a9f1231 231
c983a199
BT
232;;; Parse a circular structure marker without the leading # (which was
233;;; already read and recognized), that is, a number as identifier and
234;;; then either = or #.
9a9f1231
DK
235
236(define (get-circular-marker port)
237 (call-with-values
f4e5e411
BT
238 (lambda ()
239 (let iterate ((result 0))
240 (let ((cur (read-char port)))
241 (if (char-numeric? cur)
242 (let ((val (- (char->integer cur) (char->integer #\0))))
243 (iterate (+ (* result 10) val)))
244 (values result cur)))))
9a9f1231
DK
245 (lambda (id type)
246 (case type
247 ((#\#) `(circular-ref . ,id))
248 ((#\=) `(circular-def . ,id))
f4e5e411
BT
249 (else (lexer-error port
250 "invalid circular marker character"
251 type))))))
25512a94 252
c983a199
BT
253;;; Main lexer routine, which is given a port and does look for the next
254;;; token.
25512a94 255
03e00c5c
BT
256(define lexical-binding-regexp
257 (make-regexp
258 "-\\*-(|.*;)[ \t]*lexical-binding:[ \t]*([^;]*[^ \t;]).*-\\*-"))
259
25512a94 260(define (lex port)
03e00c5c
BT
261 (define (lexical-binding-value string)
262 (and=> (regexp-exec lexical-binding-regexp string)
263 (lambda (match)
264 (not (member (match:substring match 2) '("nil" "()"))))))
14b9aa95
AW
265 (let* ((return (let ((file (if (file-port? port)
266 (port-filename port)
267 #f))
268 (line (1+ (port-line port)))
269 (column (1+ (port-column port))))
270 (lambda (token value)
271 (let ((obj (cons token value)))
272 (set-source-property! obj 'filename file)
273 (set-source-property! obj 'line line)
274 (set-source-property! obj 'column column)
275 obj))))
276 ;; Read afterwards so the source-properties are correct above
277 ;; and actually point to the very character to be read.
278 (c (read-char port)))
25512a94 279 (cond
f4e5e411 280 ;; End of input must be specially marked to the parser.
1dfe5939 281 ((eof-object? c) (return 'eof c))
f4e5e411
BT
282 ;; Whitespace, just skip it.
283 ((char-whitespace? c) (lex port))
284 ;; The dot is only the one for dotted lists if followed by
285 ;; whitespace. Otherwise it is considered part of a number of
286 ;; symbol.
287 ((and (char=? c #\.)
288 (char-whitespace? (peek-char port)))
289 (return 'dot #f))
290 ;; Continue checking for literal character values.
291 (else
292 (case c
293 ;; A line comment, skip until end-of-line is found.
294 ((#\;)
03e00c5c
BT
295 (if (= (port-line port) 0)
296 (let iterate ((chars '()))
297 (let ((cur (read-char port)))
298 (if (or (eof-object? cur) (char=? cur #\newline))
299 (let ((string (list->string (reverse chars))))
300 (return 'set-lexical-binding-mode!
301 (lexical-binding-value string)))
302 (iterate (cons cur chars)))))
303 (let iterate ()
304 (let ((cur (read-char port)))
305 (if (or (eof-object? cur) (char=? cur #\newline))
306 (lex port)
307 (iterate))))))
f4e5e411
BT
308 ;; A character literal.
309 ((#\?)
310 (return 'character (get-character port #f)))
311 ;; A literal string. This is mainly a sequence of characters
312 ;; just as in the character literals, the only difference is
313 ;; that escaped newline and space are to be completely ignored
314 ;; and that meta-escapes set bit 7 rather than bit 27.
315 ((#\")
316 (let iterate ((result-chars '()))
317 (let ((cur (read-char port)))
318 (case cur
319 ((#\")
0faf3965
RT
320 (return 'string
321 (make-lisp-string
322 (list->string (reverse result-chars)))))
f4e5e411
BT
323 ((#\\)
324 (let ((escaped (read-char port)))
325 (case escaped
326 ((#\newline #\space)
327 (iterate result-chars))
328 (else
329 (unread-char escaped port)
330 (unread-char cur port)
331 (iterate
332 (cons (integer->char (get-character port #t))
333 result-chars))))))
334 (else (iterate (cons cur result-chars)))))))
f4e5e411 335 ((#\#)
b7966c10
BT
336 (let ((c (read-char port)))
337 (case c
338 ((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
339 (unread-char c port)
340 (let ((mark (get-circular-marker port)))
341 (return (car mark) (cdr mark))))
342 ((#\')
5199c059
BT
343 (return 'function #f))
344 ((#\:)
345 (call-with-values
346 (lambda () (get-symbol-or-number port))
347 (lambda (type str)
348 (return 'symbol (make-symbol str))))))))
f4e5e411
BT
349 ;; Parentheses and other special-meaning single characters.
350 ((#\() (return 'paren-open #f))
351 ((#\)) (return 'paren-close #f))
352 ((#\[) (return 'square-open #f))
353 ((#\]) (return 'square-close #f))
354 ((#\') (return 'quote #f))
355 ((#\`) (return 'backquote #f))
356 ;; Unquote and unquote-splicing.
357 ((#\,)
358 (if (is-char? (peek-char port) #\@)
e840cc65 359 (if (not (char=? (read-char port) #\@))
f4e5e411
BT
360 (error "expected @ in unquote-splicing")
361 (return 'unquote-splicing #f))
e840cc65 362 (return 'unquote #f)))
f4e5e411
BT
363 ;; Remaining are numbers and symbols. Process input until next
364 ;; whitespace is found, and see if it looks like a number
365 ;; (float/integer) or symbol and return accordingly.
366 (else
367 (unread-char c port)
368 (call-with-values
369 (lambda () (get-symbol-or-number port))
370 (lambda (type str)
371 (case type
372 ((symbol)
c7622319
RT
373 (cond
374 ((equal? str "nil")
375 (return 'symbol #nil))
376 ((equal? str "t")
377 (return 'symbol #t))
378 (else
379 ;; str could be empty if the first character is already
380 ;; something not allowed in a symbol (and not escaped)!
381 ;; Take care about that, it is an error because that
382 ;; character should have been handled elsewhere or is
383 ;; invalid in the input.
384 (if (zero? (string-length str))
385 (begin
386 ;; Take it out so the REPL might not get into an
387 ;; infinite loop with further reading attempts.
388 (read-char port)
389 (error "invalid character in input" c))
390 (return 'symbol (string->symbol str))))))
f4e5e411
BT
391 ((integer)
392 ;; In elisp, something like "1." is an integer, while
393 ;; string->number returns an inexact real. Thus we need
394 ;; a conversion here, but it should always result in an
395 ;; integer!
396 (return
397 'integer
398 (let ((num (inexact->exact (string->number str))))
399 (if (not (integer? num))
400 (error "expected integer" str num))
401 num)))
402 ((float)
403 (return 'float (let ((num (string->number str)))
404 (if (exact? num)
405 (error "expected inexact float"
406 str
407 num))
408 num)))
409 (else (error "wrong number/symbol type" type)))))))))))
25512a94 410
c983a199
BT
411;;; Build a lexer thunk for a port. This is the exported routine which
412;;; can be used to create a lexer for the parser to use.
25512a94
DK
413
414(define (get-lexer port)
f4e5e411 415 (lambda () (lex port)))
ddb4364b 416
c983a199
BT
417;;; Build a special lexer that will only read enough for one expression
418;;; and then always return end-of-input. If we find one of the quotation
419;;; stuff, one more expression is needed in any case.
ddb4364b
DK
420
421(define (get-lexer/1 port)
422 (let ((lex (get-lexer port))
423 (finished #f)
424 (paren-level 0))
425 (lambda ()
426 (if finished
6854c324 427 (cons 'eof ((@ (ice-9 binary-ports) eof-object)))
f4e5e411
BT
428 (let ((next (lex))
429 (quotation #f))
430 (case (car next)
431 ((paren-open square-open)
432 (set! paren-level (1+ paren-level)))
433 ((paren-close square-close)
434 (set! paren-level (1- paren-level)))
435 ((quote backquote unquote unquote-splicing circular-def)
436 (set! quotation #t)))
437 (if (and (not quotation) (<= paren-level 0))
438 (set! finished #t))
439 next)))))