Commit | Line | Data |
---|---|---|
eb80072d LC |
1 | ;;; Guile Emacs Lisp |
2 | ||
14b9aa95 | 3 | ;;; Copyright (C) 2009, 2010, 2013 Free Software Foundation, Inc. |
eb80072d LC |
4 | ;;; |
5 | ;;; This library is free software; you can redistribute it and/or | |
6 | ;;; modify it under the terms of the GNU Lesser General Public | |
7 | ;;; License as published by the Free Software Foundation; either | |
8 | ;;; version 3 of the License, or (at your option) any later version. | |
9 | ;;; | |
10 | ;;; This library is distributed in the hope that it will be useful, | |
11 | ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | ;;; Lesser General Public License for more details. | |
14 | ;;; | |
15 | ;;; You should have received a copy of the GNU Lesser General Public | |
16 | ;;; License along with this library; if not, write to the Free Software | |
17 | ;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25512a94 DK |
18 | |
19 | ;;; Code: | |
20 | ||
21 | (define-module (language elisp lexer) | |
22 | #:use-module (ice-9 regex) | |
0faf3965 | 23 | #:use-module (language elisp runtime) |
ddb4364b | 24 | #:export (get-lexer get-lexer/1)) |
25512a94 | 25 | |
c983a199 BT |
26 | ;;; This is the lexical analyzer for the elisp reader. It is |
27 | ;;; hand-written instead of using some generator. I think this is the | |
28 | ;;; best solution because of all that fancy escape sequence handling and | |
29 | ;;; the like. | |
30 | ;;; | |
31 | ;;; Characters are handled internally as integers representing their | |
32 | ;;; code value. This is necessary because elisp allows a lot of fancy | |
33 | ;;; modifiers that set certain high-range bits and the resulting values | |
34 | ;;; would not fit into a real Scheme character range. Additionally, | |
35 | ;;; elisp wants characters as integers, so we just do the right thing... | |
36 | ;;; | |
37 | ;;; TODO: #@count comments | |
5b1ee3be | 38 | |
c983a199 | 39 | ;;; Report an error from the lexer (that is, invalid input given). |
5b1ee3be DK |
40 | |
41 | (define (lexer-error port msg . args) | |
42 | (apply error msg args)) | |
43 | ||
c983a199 BT |
44 | ;;; In a character, set a given bit. This is just some bit-wise or'ing |
45 | ;;; on the characters integer code and converting back to character. | |
5b1ee3be DK |
46 | |
47 | (define (set-char-bit chr bit) | |
48 | (logior chr (ash 1 bit))) | |
49 | ||
c983a199 BT |
50 | ;;; Check if a character equals some other. This is just like char=? |
51 | ;;; except that the tested one could be EOF in which case it simply | |
52 | ;;; isn't equal. | |
5b1ee3be DK |
53 | |
54 | (define (is-char? tested should-be) | |
55 | (and (not (eof-object? tested)) | |
56 | (char=? tested should-be))) | |
57 | ||
c983a199 BT |
58 | ;;; For a character (as integer code), find the real character it |
59 | ;;; represents or #\nul if out of range. This is used to work with | |
60 | ;;; Scheme character functions like char-numeric?. | |
5b1ee3be DK |
61 | |
62 | (define (real-character chr) | |
63 | (if (< chr 256) | |
f4e5e411 BT |
64 | (integer->char chr) |
65 | #\nul)) | |
5b1ee3be | 66 | |
c983a199 BT |
67 | ;;; Return the control modified version of a character. This is not |
68 | ;;; just setting a modifier bit, because ASCII conrol characters must be | |
69 | ;;; handled as such, and in elisp C-? is the delete character for | |
70 | ;;; historical reasons. Otherwise, we set bit 26. | |
5b1ee3be DK |
71 | |
72 | (define (add-control chr) | |
73 | (let ((real (real-character chr))) | |
74 | (if (char-alphabetic? real) | |
f4e5e411 BT |
75 | (- (char->integer (char-upcase real)) (char->integer #\@)) |
76 | (case real | |
77 | ((#\?) 127) | |
78 | ((#\@) 0) | |
79 | (else (set-char-bit chr 26)))))) | |
5b1ee3be | 80 | |
c983a199 BT |
81 | ;;; Parse a charcode given in some base, basically octal or hexadecimal |
82 | ;;; are needed. A requested number of digits can be given (#f means it | |
83 | ;;; does not matter and arbitrary many are allowed), and additionally | |
84 | ;;; early return allowed (if fewer valid digits are found). These | |
85 | ;;; options are all we need to handle the \u, \U, \x and \ddd (octal | |
86 | ;;; digits) escape sequences. | |
5b1ee3be DK |
87 | |
88 | (define (charcode-escape port base digits early-return) | |
89 | (let iterate ((result 0) | |
90 | (procdigs 0)) | |
91 | (if (and digits (>= procdigs digits)) | |
f4e5e411 BT |
92 | result |
93 | (let* ((cur (read-char port)) | |
94 | (value (cond | |
95 | ((char-numeric? cur) | |
96 | (- (char->integer cur) (char->integer #\0))) | |
97 | ((char-alphabetic? cur) | |
98 | (let ((code (- (char->integer (char-upcase cur)) | |
99 | (char->integer #\A)))) | |
100 | (if (< code 0) | |
101 | #f | |
102 | (+ code 10)))) | |
103 | (else #f))) | |
104 | (valid (and value (< value base)))) | |
105 | (if (not valid) | |
106 | (if (or (not digits) early-return) | |
107 | (begin | |
108 | (unread-char cur port) | |
109 | result) | |
110 | (lexer-error port | |
111 | "invalid digit in escape-code" | |
112 | base | |
113 | cur)) | |
114 | (iterate (+ (* result base) value) (1+ procdigs))))))) | |
5b1ee3be | 115 | |
c983a199 BT |
116 | ;;; Read a character and process escape-sequences when necessary. The |
117 | ;;; special in-string argument defines if this character is part of a | |
118 | ;;; string literal or a single character literal, the difference being | |
119 | ;;; that in strings the meta modifier sets bit 7, while it is bit 27 for | |
120 | ;;; characters. | |
5b1ee3be DK |
121 | |
122 | (define basic-escape-codes | |
f4e5e411 BT |
123 | '((#\a . 7) |
124 | (#\b . 8) | |
125 | (#\t . 9) | |
126 | (#\n . 10) | |
127 | (#\v . 11) | |
128 | (#\f . 12) | |
129 | (#\r . 13) | |
130 | (#\e . 27) | |
131 | (#\s . 32) | |
132 | (#\d . 127))) | |
5b1ee3be DK |
133 | |
134 | (define (get-character port in-string) | |
f4e5e411 BT |
135 | (let ((meta-bits `((#\A . 22) |
136 | (#\s . 23) | |
137 | (#\H . 24) | |
138 | (#\S . 25) | |
139 | (#\M . ,(if in-string 7 27)))) | |
5b1ee3be DK |
140 | (cur (read-char port))) |
141 | (if (char=? cur #\\) | |
f4e5e411 BT |
142 | ;; Handle an escape-sequence. |
143 | (let* ((escaped (read-char port)) | |
144 | (esc-code (assq-ref basic-escape-codes escaped)) | |
145 | (meta (assq-ref meta-bits escaped))) | |
146 | (cond | |
147 | ;; Meta-check must be before esc-code check because \s- must | |
148 | ;; be recognized as the super-meta modifier if a - follows. | |
149 | ;; If not, it will be caught as \s -> space escape code. | |
150 | ((and meta (is-char? (peek-char port) #\-)) | |
151 | (if (not (char=? (read-char port) #\-)) | |
152 | (error "expected - after control sequence")) | |
153 | (set-char-bit (get-character port in-string) meta)) | |
154 | ;; One of the basic control character escape names? | |
155 | (esc-code esc-code) | |
156 | ;; Handle \ddd octal code if it is one. | |
157 | ((and (char>=? escaped #\0) (char<? escaped #\8)) | |
158 | (begin | |
159 | (unread-char escaped port) | |
160 | (charcode-escape port 8 3 #t))) | |
161 | ;; Check for some escape-codes directly or otherwise use the | |
162 | ;; escaped character literally. | |
163 | (else | |
5b1ee3be DK |
164 | (case escaped |
165 | ((#\^) (add-control (get-character port in-string))) | |
166 | ((#\C) | |
167 | (if (is-char? (peek-char port) #\-) | |
f4e5e411 BT |
168 | (begin |
169 | (if (not (char=? (read-char port) #\-)) | |
170 | (error "expected - after control sequence")) | |
171 | (add-control (get-character port in-string))) | |
172 | escaped)) | |
5b1ee3be DK |
173 | ((#\x) (charcode-escape port 16 #f #t)) |
174 | ((#\u) (charcode-escape port 16 4 #f)) | |
175 | ((#\U) (charcode-escape port 16 8 #f)) | |
176 | (else (char->integer escaped)))))) | |
f4e5e411 BT |
177 | ;; No escape-sequence, just the literal character. But remember |
178 | ;; to get the code instead! | |
179 | (char->integer cur)))) | |
5b1ee3be | 180 | |
c983a199 BT |
181 | ;;; Read a symbol or number from a port until something follows that |
182 | ;;; marks the start of a new token (like whitespace or parentheses). | |
183 | ;;; The data read is returned as a string for further conversion to the | |
184 | ;;; correct type, but we also return what this is | |
185 | ;;; (integer/float/symbol). If any escaped character is found, it must | |
186 | ;;; be a symbol. Otherwise we at the end check the result-string | |
187 | ;;; against regular expressions to determine if it is possibly an | |
188 | ;;; integer or a float. | |
25512a94 DK |
189 | |
190 | (define integer-regex (make-regexp "^[+-]?[0-9]+\\.?$")) | |
abcf4a9e | 191 | |
25512a94 | 192 | (define float-regex |
f4e5e411 BT |
193 | (make-regexp |
194 | "^[+-]?([0-9]+\\.?[0-9]*|[0-9]*\\.?[0-9]+)(e[+-]?[0-9]+)?$")) | |
25512a94 | 195 | |
c983a199 BT |
196 | ;;; A dot is also allowed literally, only a single dort alone is parsed |
197 | ;;; as the 'dot' terminal for dotted lists. | |
abcf4a9e | 198 | |
25512a94 DK |
199 | (define no-escape-punctuation (string->char-set "-+=*/_~!@$%^&:<>{}?.")) |
200 | ||
201 | (define (get-symbol-or-number port) | |
202 | (let iterate ((result-chars '()) | |
203 | (had-escape #f)) | |
204 | (let* ((c (read-char port)) | |
205 | (finish (lambda () | |
f4e5e411 BT |
206 | (let ((result (list->string |
207 | (reverse result-chars)))) | |
25512a94 | 208 | (values |
f4e5e411 BT |
209 | (cond |
210 | ((and (not had-escape) | |
211 | (regexp-exec integer-regex result)) | |
212 | 'integer) | |
213 | ((and (not had-escape) | |
214 | (regexp-exec float-regex result)) | |
215 | 'float) | |
216 | (else 'symbol)) | |
217 | result)))) | |
25512a94 DK |
218 | (need-no-escape? (lambda (c) |
219 | (or (char-numeric? c) | |
220 | (char-alphabetic? c) | |
f4e5e411 BT |
221 | (char-set-contains? |
222 | no-escape-punctuation | |
223 | c))))) | |
25512a94 | 224 | (cond |
f4e5e411 BT |
225 | ((eof-object? c) (finish)) |
226 | ((need-no-escape? c) (iterate (cons c result-chars) had-escape)) | |
227 | ((char=? c #\\) (iterate (cons (read-char port) result-chars) #t)) | |
228 | (else | |
229 | (unread-char c port) | |
230 | (finish)))))) | |
9a9f1231 | 231 | |
c983a199 BT |
232 | ;;; Parse a circular structure marker without the leading # (which was |
233 | ;;; already read and recognized), that is, a number as identifier and | |
234 | ;;; then either = or #. | |
9a9f1231 DK |
235 | |
236 | (define (get-circular-marker port) | |
237 | (call-with-values | |
f4e5e411 BT |
238 | (lambda () |
239 | (let iterate ((result 0)) | |
240 | (let ((cur (read-char port))) | |
241 | (if (char-numeric? cur) | |
242 | (let ((val (- (char->integer cur) (char->integer #\0)))) | |
243 | (iterate (+ (* result 10) val))) | |
244 | (values result cur))))) | |
9a9f1231 DK |
245 | (lambda (id type) |
246 | (case type | |
247 | ((#\#) `(circular-ref . ,id)) | |
248 | ((#\=) `(circular-def . ,id)) | |
f4e5e411 BT |
249 | (else (lexer-error port |
250 | "invalid circular marker character" | |
251 | type)))))) | |
25512a94 | 252 | |
c983a199 BT |
253 | ;;; Main lexer routine, which is given a port and does look for the next |
254 | ;;; token. | |
25512a94 | 255 | |
03e00c5c BT |
256 | (define lexical-binding-regexp |
257 | (make-regexp | |
258 | "-\\*-(|.*;)[ \t]*lexical-binding:[ \t]*([^;]*[^ \t;]).*-\\*-")) | |
259 | ||
25512a94 | 260 | (define (lex port) |
03e00c5c BT |
261 | (define (lexical-binding-value string) |
262 | (and=> (regexp-exec lexical-binding-regexp string) | |
263 | (lambda (match) | |
264 | (not (member (match:substring match 2) '("nil" "()")))))) | |
14b9aa95 AW |
265 | (let* ((return (let ((file (if (file-port? port) |
266 | (port-filename port) | |
267 | #f)) | |
268 | (line (1+ (port-line port))) | |
269 | (column (1+ (port-column port)))) | |
270 | (lambda (token value) | |
271 | (let ((obj (cons token value))) | |
272 | (set-source-property! obj 'filename file) | |
273 | (set-source-property! obj 'line line) | |
274 | (set-source-property! obj 'column column) | |
275 | obj)))) | |
276 | ;; Read afterwards so the source-properties are correct above | |
277 | ;; and actually point to the very character to be read. | |
278 | (c (read-char port))) | |
25512a94 | 279 | (cond |
f4e5e411 | 280 | ;; End of input must be specially marked to the parser. |
1dfe5939 | 281 | ((eof-object? c) (return 'eof c)) |
f4e5e411 BT |
282 | ;; Whitespace, just skip it. |
283 | ((char-whitespace? c) (lex port)) | |
284 | ;; The dot is only the one for dotted lists if followed by | |
285 | ;; whitespace. Otherwise it is considered part of a number of | |
286 | ;; symbol. | |
287 | ((and (char=? c #\.) | |
288 | (char-whitespace? (peek-char port))) | |
289 | (return 'dot #f)) | |
290 | ;; Continue checking for literal character values. | |
291 | (else | |
292 | (case c | |
293 | ;; A line comment, skip until end-of-line is found. | |
294 | ((#\;) | |
03e00c5c BT |
295 | (if (= (port-line port) 0) |
296 | (let iterate ((chars '())) | |
297 | (let ((cur (read-char port))) | |
298 | (if (or (eof-object? cur) (char=? cur #\newline)) | |
299 | (let ((string (list->string (reverse chars)))) | |
300 | (return 'set-lexical-binding-mode! | |
301 | (lexical-binding-value string))) | |
302 | (iterate (cons cur chars))))) | |
303 | (let iterate () | |
304 | (let ((cur (read-char port))) | |
305 | (if (or (eof-object? cur) (char=? cur #\newline)) | |
306 | (lex port) | |
307 | (iterate)))))) | |
f4e5e411 BT |
308 | ;; A character literal. |
309 | ((#\?) | |
310 | (return 'character (get-character port #f))) | |
311 | ;; A literal string. This is mainly a sequence of characters | |
312 | ;; just as in the character literals, the only difference is | |
313 | ;; that escaped newline and space are to be completely ignored | |
314 | ;; and that meta-escapes set bit 7 rather than bit 27. | |
315 | ((#\") | |
316 | (let iterate ((result-chars '())) | |
317 | (let ((cur (read-char port))) | |
318 | (case cur | |
319 | ((#\") | |
0faf3965 RT |
320 | (return 'string |
321 | (make-lisp-string | |
322 | (list->string (reverse result-chars))))) | |
f4e5e411 BT |
323 | ((#\\) |
324 | (let ((escaped (read-char port))) | |
325 | (case escaped | |
326 | ((#\newline #\space) | |
327 | (iterate result-chars)) | |
328 | (else | |
329 | (unread-char escaped port) | |
330 | (unread-char cur port) | |
331 | (iterate | |
332 | (cons (integer->char (get-character port #t)) | |
333 | result-chars)))))) | |
334 | (else (iterate (cons cur result-chars))))))) | |
f4e5e411 | 335 | ((#\#) |
b7966c10 BT |
336 | (let ((c (read-char port))) |
337 | (case c | |
338 | ((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) | |
339 | (unread-char c port) | |
340 | (let ((mark (get-circular-marker port))) | |
341 | (return (car mark) (cdr mark)))) | |
342 | ((#\') | |
5199c059 BT |
343 | (return 'function #f)) |
344 | ((#\:) | |
345 | (call-with-values | |
346 | (lambda () (get-symbol-or-number port)) | |
347 | (lambda (type str) | |
348 | (return 'symbol (make-symbol str)))))))) | |
f4e5e411 BT |
349 | ;; Parentheses and other special-meaning single characters. |
350 | ((#\() (return 'paren-open #f)) | |
351 | ((#\)) (return 'paren-close #f)) | |
352 | ((#\[) (return 'square-open #f)) | |
353 | ((#\]) (return 'square-close #f)) | |
354 | ((#\') (return 'quote #f)) | |
355 | ((#\`) (return 'backquote #f)) | |
356 | ;; Unquote and unquote-splicing. | |
357 | ((#\,) | |
358 | (if (is-char? (peek-char port) #\@) | |
e840cc65 | 359 | (if (not (char=? (read-char port) #\@)) |
f4e5e411 BT |
360 | (error "expected @ in unquote-splicing") |
361 | (return 'unquote-splicing #f)) | |
e840cc65 | 362 | (return 'unquote #f))) |
f4e5e411 BT |
363 | ;; Remaining are numbers and symbols. Process input until next |
364 | ;; whitespace is found, and see if it looks like a number | |
365 | ;; (float/integer) or symbol and return accordingly. | |
366 | (else | |
367 | (unread-char c port) | |
368 | (call-with-values | |
369 | (lambda () (get-symbol-or-number port)) | |
370 | (lambda (type str) | |
371 | (case type | |
372 | ((symbol) | |
c7622319 RT |
373 | (cond |
374 | ((equal? str "nil") | |
375 | (return 'symbol #nil)) | |
376 | ((equal? str "t") | |
377 | (return 'symbol #t)) | |
378 | (else | |
379 | ;; str could be empty if the first character is already | |
380 | ;; something not allowed in a symbol (and not escaped)! | |
381 | ;; Take care about that, it is an error because that | |
382 | ;; character should have been handled elsewhere or is | |
383 | ;; invalid in the input. | |
384 | (if (zero? (string-length str)) | |
385 | (begin | |
386 | ;; Take it out so the REPL might not get into an | |
387 | ;; infinite loop with further reading attempts. | |
388 | (read-char port) | |
389 | (error "invalid character in input" c)) | |
390 | (return 'symbol (string->symbol str)))))) | |
f4e5e411 BT |
391 | ((integer) |
392 | ;; In elisp, something like "1." is an integer, while | |
393 | ;; string->number returns an inexact real. Thus we need | |
394 | ;; a conversion here, but it should always result in an | |
395 | ;; integer! | |
396 | (return | |
397 | 'integer | |
398 | (let ((num (inexact->exact (string->number str)))) | |
399 | (if (not (integer? num)) | |
400 | (error "expected integer" str num)) | |
401 | num))) | |
402 | ((float) | |
403 | (return 'float (let ((num (string->number str))) | |
404 | (if (exact? num) | |
405 | (error "expected inexact float" | |
406 | str | |
407 | num)) | |
408 | num))) | |
409 | (else (error "wrong number/symbol type" type))))))))))) | |
25512a94 | 410 | |
c983a199 BT |
411 | ;;; Build a lexer thunk for a port. This is the exported routine which |
412 | ;;; can be used to create a lexer for the parser to use. | |
25512a94 DK |
413 | |
414 | (define (get-lexer port) | |
f4e5e411 | 415 | (lambda () (lex port))) |
ddb4364b | 416 | |
c983a199 BT |
417 | ;;; Build a special lexer that will only read enough for one expression |
418 | ;;; and then always return end-of-input. If we find one of the quotation | |
419 | ;;; stuff, one more expression is needed in any case. | |
ddb4364b DK |
420 | |
421 | (define (get-lexer/1 port) | |
422 | (let ((lex (get-lexer port)) | |
423 | (finished #f) | |
424 | (paren-level 0)) | |
425 | (lambda () | |
426 | (if finished | |
6854c324 | 427 | (cons 'eof ((@ (ice-9 binary-ports) eof-object))) |
f4e5e411 BT |
428 | (let ((next (lex)) |
429 | (quotation #f)) | |
430 | (case (car next) | |
431 | ((paren-open square-open) | |
432 | (set! paren-level (1+ paren-level))) | |
433 | ((paren-close square-close) | |
434 | (set! paren-level (1- paren-level))) | |
435 | ((quote backquote unquote unquote-splicing circular-def) | |
436 | (set! quotation #t))) | |
437 | (if (and (not quotation) (<= paren-level 0)) | |
438 | (set! finished #t)) | |
439 | next))))) |