5a0e6b3ffeb6fc58c009f4708a5082784d90b041
[bpt/guile.git] / module / language / elisp / lexer.scm
1 ;;; Guile Emacs Lisp
2
3 ;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc.
4 ;;;
5 ;;; This library is free software; you can redistribute it and/or
6 ;;; modify it under the terms of the GNU Lesser General Public
7 ;;; License as published by the Free Software Foundation; either
8 ;;; version 3 of the License, or (at your option) any later version.
9 ;;;
10 ;;; This library is distributed in the hope that it will be useful,
11 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;; Lesser General Public License for more details.
14 ;;;
15 ;;; You should have received a copy of the GNU Lesser General Public
16 ;;; License along with this library; if not, write to the Free Software
17 ;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
19 ;;; Code:
20
21 (define-module (language elisp lexer)
22 #:use-module (ice-9 regex)
23 #:export (get-lexer get-lexer/1))
24
25 ;;; This is the lexical analyzer for the elisp reader. It is
26 ;;; hand-written instead of using some generator. I think this is the
27 ;;; best solution because of all that fancy escape sequence handling and
28 ;;; the like.
29 ;;;
30 ;;; Characters are handled internally as integers representing their
31 ;;; code value. This is necessary because elisp allows a lot of fancy
32 ;;; modifiers that set certain high-range bits and the resulting values
33 ;;; would not fit into a real Scheme character range. Additionally,
34 ;;; elisp wants characters as integers, so we just do the right thing...
35 ;;;
36 ;;; TODO: #@count comments
37
38 ;;; Report an error from the lexer (that is, invalid input given).
39
40 (define (lexer-error port msg . args)
41 (apply error msg args))
42
43 ;;; In a character, set a given bit. This is just some bit-wise or'ing
44 ;;; on the characters integer code and converting back to character.
45
46 (define (set-char-bit chr bit)
47 (logior chr (ash 1 bit)))
48
49 ;;; Check if a character equals some other. This is just like char=?
50 ;;; except that the tested one could be EOF in which case it simply
51 ;;; isn't equal.
52
53 (define (is-char? tested should-be)
54 (and (not (eof-object? tested))
55 (char=? tested should-be)))
56
57 ;;; For a character (as integer code), find the real character it
58 ;;; represents or #\nul if out of range. This is used to work with
59 ;;; Scheme character functions like char-numeric?.
60
61 (define (real-character chr)
62 (if (< chr 256)
63 (integer->char chr)
64 #\nul))
65
66 ;;; Return the control modified version of a character. This is not
67 ;;; just setting a modifier bit, because ASCII conrol characters must be
68 ;;; handled as such, and in elisp C-? is the delete character for
69 ;;; historical reasons. Otherwise, we set bit 26.
70
71 (define (add-control chr)
72 (let ((real (real-character chr)))
73 (if (char-alphabetic? real)
74 (- (char->integer (char-upcase real)) (char->integer #\@))
75 (case real
76 ((#\?) 127)
77 ((#\@) 0)
78 (else (set-char-bit chr 26))))))
79
80 ;;; Parse a charcode given in some base, basically octal or hexadecimal
81 ;;; are needed. A requested number of digits can be given (#f means it
82 ;;; does not matter and arbitrary many are allowed), and additionally
83 ;;; early return allowed (if fewer valid digits are found). These
84 ;;; options are all we need to handle the \u, \U, \x and \ddd (octal
85 ;;; digits) escape sequences.
86
87 (define (charcode-escape port base digits early-return)
88 (let iterate ((result 0)
89 (procdigs 0))
90 (if (and digits (>= procdigs digits))
91 result
92 (let* ((cur (read-char port))
93 (value (cond
94 ((char-numeric? cur)
95 (- (char->integer cur) (char->integer #\0)))
96 ((char-alphabetic? cur)
97 (let ((code (- (char->integer (char-upcase cur))
98 (char->integer #\A))))
99 (if (< code 0)
100 #f
101 (+ code 10))))
102 (else #f)))
103 (valid (and value (< value base))))
104 (if (not valid)
105 (if (or (not digits) early-return)
106 (begin
107 (unread-char cur port)
108 result)
109 (lexer-error port
110 "invalid digit in escape-code"
111 base
112 cur))
113 (iterate (+ (* result base) value) (1+ procdigs)))))))
114
115 ;;; Read a character and process escape-sequences when necessary. The
116 ;;; special in-string argument defines if this character is part of a
117 ;;; string literal or a single character literal, the difference being
118 ;;; that in strings the meta modifier sets bit 7, while it is bit 27 for
119 ;;; characters.
120
121 (define basic-escape-codes
122 '((#\a . 7)
123 (#\b . 8)
124 (#\t . 9)
125 (#\n . 10)
126 (#\v . 11)
127 (#\f . 12)
128 (#\r . 13)
129 (#\e . 27)
130 (#\s . 32)
131 (#\d . 127)))
132
133 (define (get-character port in-string)
134 (let ((meta-bits `((#\A . 22)
135 (#\s . 23)
136 (#\H . 24)
137 (#\S . 25)
138 (#\M . ,(if in-string 7 27))))
139 (cur (read-char port)))
140 (if (char=? cur #\\)
141 ;; Handle an escape-sequence.
142 (let* ((escaped (read-char port))
143 (esc-code (assq-ref basic-escape-codes escaped))
144 (meta (assq-ref meta-bits escaped)))
145 (cond
146 ;; Meta-check must be before esc-code check because \s- must
147 ;; be recognized as the super-meta modifier if a - follows.
148 ;; If not, it will be caught as \s -> space escape code.
149 ((and meta (is-char? (peek-char port) #\-))
150 (if (not (char=? (read-char port) #\-))
151 (error "expected - after control sequence"))
152 (set-char-bit (get-character port in-string) meta))
153 ;; One of the basic control character escape names?
154 (esc-code esc-code)
155 ;; Handle \ddd octal code if it is one.
156 ((and (char>=? escaped #\0) (char<? escaped #\8))
157 (begin
158 (unread-char escaped port)
159 (charcode-escape port 8 3 #t)))
160 ;; Check for some escape-codes directly or otherwise use the
161 ;; escaped character literally.
162 (else
163 (case escaped
164 ((#\^) (add-control (get-character port in-string)))
165 ((#\C)
166 (if (is-char? (peek-char port) #\-)
167 (begin
168 (if (not (char=? (read-char port) #\-))
169 (error "expected - after control sequence"))
170 (add-control (get-character port in-string)))
171 escaped))
172 ((#\x) (charcode-escape port 16 #f #t))
173 ((#\u) (charcode-escape port 16 4 #f))
174 ((#\U) (charcode-escape port 16 8 #f))
175 (else (char->integer escaped))))))
176 ;; No escape-sequence, just the literal character. But remember
177 ;; to get the code instead!
178 (char->integer cur))))
179
180 ;;; Read a symbol or number from a port until something follows that
181 ;;; marks the start of a new token (like whitespace or parentheses).
182 ;;; The data read is returned as a string for further conversion to the
183 ;;; correct type, but we also return what this is
184 ;;; (integer/float/symbol). If any escaped character is found, it must
185 ;;; be a symbol. Otherwise we at the end check the result-string
186 ;;; against regular expressions to determine if it is possibly an
187 ;;; integer or a float.
188
189 (define integer-regex (make-regexp "^[+-]?[0-9]+\\.?$"))
190
191 (define float-regex
192 (make-regexp
193 "^[+-]?([0-9]+\\.?[0-9]*|[0-9]*\\.?[0-9]+)(e[+-]?[0-9]+)?$"))
194
195 ;;; A dot is also allowed literally, only a single dort alone is parsed
196 ;;; as the 'dot' terminal for dotted lists.
197
198 (define no-escape-punctuation (string->char-set "-+=*/_~!@$%^&:<>{}?."))
199
200 (define (get-symbol-or-number port)
201 (let iterate ((result-chars '())
202 (had-escape #f))
203 (let* ((c (read-char port))
204 (finish (lambda ()
205 (let ((result (list->string
206 (reverse result-chars))))
207 (values
208 (cond
209 ((and (not had-escape)
210 (regexp-exec integer-regex result))
211 'integer)
212 ((and (not had-escape)
213 (regexp-exec float-regex result))
214 'float)
215 (else 'symbol))
216 result))))
217 (need-no-escape? (lambda (c)
218 (or (char-numeric? c)
219 (char-alphabetic? c)
220 (char-set-contains?
221 no-escape-punctuation
222 c)))))
223 (cond
224 ((eof-object? c) (finish))
225 ((need-no-escape? c) (iterate (cons c result-chars) had-escape))
226 ((char=? c #\\) (iterate (cons (read-char port) result-chars) #t))
227 (else
228 (unread-char c port)
229 (finish))))))
230
231 ;;; Parse a circular structure marker without the leading # (which was
232 ;;; already read and recognized), that is, a number as identifier and
233 ;;; then either = or #.
234
235 (define (get-circular-marker port)
236 (call-with-values
237 (lambda ()
238 (let iterate ((result 0))
239 (let ((cur (read-char port)))
240 (if (char-numeric? cur)
241 (let ((val (- (char->integer cur) (char->integer #\0))))
242 (iterate (+ (* result 10) val)))
243 (values result cur)))))
244 (lambda (id type)
245 (case type
246 ((#\#) `(circular-ref . ,id))
247 ((#\=) `(circular-def . ,id))
248 (else (lexer-error port
249 "invalid circular marker character"
250 type))))))
251
252 ;;; Main lexer routine, which is given a port and does look for the next
253 ;;; token.
254
255 (define lexical-binding-regexp
256 (make-regexp
257 "-\\*-(|.*;)[ \t]*lexical-binding:[ \t]*([^;]*[^ \t;]).*-\\*-"))
258
259 (define (lex port)
260 (define (lexical-binding-value string)
261 (and=> (regexp-exec lexical-binding-regexp string)
262 (lambda (match)
263 (not (member (match:substring match 2) '("nil" "()"))))))
264 (let* ((return (let ((file (if (file-port? port)
265 (port-filename port)
266 #f))
267 (line (1+ (port-line port)))
268 (column (1+ (port-column port))))
269 (lambda (token value)
270 (let ((obj (cons token value)))
271 (set-source-property! obj 'filename file)
272 (set-source-property! obj 'line line)
273 (set-source-property! obj 'column column)
274 obj))))
275 ;; Read afterwards so the source-properties are correct above
276 ;; and actually point to the very character to be read.
277 (c (read-char port)))
278 (cond
279 ;; End of input must be specially marked to the parser.
280 ((eof-object? c) (return 'eof c))
281 ;; Whitespace, just skip it.
282 ((char-whitespace? c) (lex port))
283 ;; The dot is only the one for dotted lists if followed by
284 ;; whitespace. Otherwise it is considered part of a number of
285 ;; symbol.
286 ((and (char=? c #\.)
287 (char-whitespace? (peek-char port)))
288 (return 'dot #f))
289 ;; Continue checking for literal character values.
290 (else
291 (case c
292 ;; A line comment, skip until end-of-line is found.
293 ((#\;)
294 (if (= (port-line port) 0)
295 (let iterate ((chars '()))
296 (let ((cur (read-char port)))
297 (if (or (eof-object? cur) (char=? cur #\newline))
298 (let ((string (list->string (reverse chars))))
299 (return 'set-lexical-binding-mode!
300 (lexical-binding-value string)))
301 (iterate (cons cur chars)))))
302 (let iterate ()
303 (let ((cur (read-char port)))
304 (if (or (eof-object? cur) (char=? cur #\newline))
305 (lex port)
306 (iterate))))))
307 ;; A character literal.
308 ((#\?)
309 (return 'character (get-character port #f)))
310 ;; A literal string. This is mainly a sequence of characters
311 ;; just as in the character literals, the only difference is
312 ;; that escaped newline and space are to be completely ignored
313 ;; and that meta-escapes set bit 7 rather than bit 27.
314 ((#\")
315 (let iterate ((result-chars '()))
316 (let ((cur (read-char port)))
317 (case cur
318 ((#\")
319 (return 'string (list->string (reverse result-chars))))
320 ((#\\)
321 (let ((escaped (read-char port)))
322 (case escaped
323 ((#\newline #\space)
324 (iterate result-chars))
325 (else
326 (unread-char escaped port)
327 (unread-char cur port)
328 (iterate
329 (cons (integer->char (get-character port #t))
330 result-chars))))))
331 (else (iterate (cons cur result-chars)))))))
332 ((#\#)
333 (let ((c (read-char port)))
334 (case c
335 ((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
336 (unread-char c port)
337 (let ((mark (get-circular-marker port)))
338 (return (car mark) (cdr mark))))
339 ((#\')
340 (return 'function #f))
341 ((#\:)
342 (call-with-values
343 (lambda () (get-symbol-or-number port))
344 (lambda (type str)
345 (return 'symbol (make-symbol str))))))))
346 ;; Parentheses and other special-meaning single characters.
347 ((#\() (return 'paren-open #f))
348 ((#\)) (return 'paren-close #f))
349 ((#\[) (return 'square-open #f))
350 ((#\]) (return 'square-close #f))
351 ((#\') (return 'quote #f))
352 ((#\`) (return 'backquote #f))
353 ;; Unquote and unquote-splicing.
354 ((#\,)
355 (if (is-char? (peek-char port) #\@)
356 (if (not (char=? (read-char port) #\@))
357 (error "expected @ in unquote-splicing")
358 (return 'unquote-splicing #f))
359 (return 'unquote #f)))
360 ;; Remaining are numbers and symbols. Process input until next
361 ;; whitespace is found, and see if it looks like a number
362 ;; (float/integer) or symbol and return accordingly.
363 (else
364 (unread-char c port)
365 (call-with-values
366 (lambda () (get-symbol-or-number port))
367 (lambda (type str)
368 (case type
369 ((symbol)
370 ;; str could be empty if the first character is already
371 ;; something not allowed in a symbol (and not escaped)!
372 ;; Take care about that, it is an error because that
373 ;; character should have been handled elsewhere or is
374 ;; invalid in the input.
375 (if (zero? (string-length str))
376 (begin
377 ;; Take it out so the REPL might not get into an
378 ;; infinite loop with further reading attempts.
379 (read-char port)
380 (error "invalid character in input" c))
381 (return 'symbol (string->symbol str))))
382 ((integer)
383 ;; In elisp, something like "1." is an integer, while
384 ;; string->number returns an inexact real. Thus we need
385 ;; a conversion here, but it should always result in an
386 ;; integer!
387 (return
388 'integer
389 (let ((num (inexact->exact (string->number str))))
390 (if (not (integer? num))
391 (error "expected integer" str num))
392 num)))
393 ((float)
394 (return 'float (let ((num (string->number str)))
395 (if (exact? num)
396 (error "expected inexact float"
397 str
398 num))
399 num)))
400 (else (error "wrong number/symbol type" type)))))))))))
401
402 ;;; Build a lexer thunk for a port. This is the exported routine which
403 ;;; can be used to create a lexer for the parser to use.
404
405 (define (get-lexer port)
406 (lambda () (lex port)))
407
408 ;;; Build a special lexer that will only read enough for one expression
409 ;;; and then always return end-of-input. If we find one of the quotation
410 ;;; stuff, one more expression is needed in any case.
411
412 (define (get-lexer/1 port)
413 (let ((lex (get-lexer port))
414 (finished #f)
415 (paren-level 0))
416 (lambda ()
417 (if finished
418 (cons 'eof ((@ (ice-9 binary-ports) eof-object)))
419 (let ((next (lex))
420 (quotation #f))
421 (case (car next)
422 ((paren-open square-open)
423 (set! paren-level (1+ paren-level)))
424 ((paren-close square-close)
425 (set! paren-level (1- paren-level)))
426 ((quote backquote unquote unquote-splicing circular-def)
427 (set! quotation #t)))
428 (if (and (not quotation) (<= paren-level 0))
429 (set! finished #t))
430 next)))))