Commit | Line | Data |
---|---|---|
eb80072d LC |
1 | ;;; Guile Emacs Lisp |
2 | ||
c983a199 | 3 | ;;; Copyright (C) 2009, 2010 Free Software Foundation, Inc. |
eb80072d LC |
4 | ;;; |
5 | ;;; This library is free software; you can redistribute it and/or | |
6 | ;;; modify it under the terms of the GNU Lesser General Public | |
7 | ;;; License as published by the Free Software Foundation; either | |
8 | ;;; version 3 of the License, or (at your option) any later version. | |
9 | ;;; | |
10 | ;;; This library is distributed in the hope that it will be useful, | |
11 | ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | ;;; Lesser General Public License for more details. | |
14 | ;;; | |
15 | ;;; You should have received a copy of the GNU Lesser General Public | |
16 | ;;; License along with this library; if not, write to the Free Software | |
17 | ;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25512a94 DK |
18 | |
19 | ;;; Code: | |
20 | ||
21 | (define-module (language elisp lexer) | |
22 | #:use-module (ice-9 regex) | |
ddb4364b | 23 | #:export (get-lexer get-lexer/1)) |
25512a94 | 24 | |
c983a199 BT |
25 | ;;; This is the lexical analyzer for the elisp reader. It is |
26 | ;;; hand-written instead of using some generator. I think this is the | |
27 | ;;; best solution because of all that fancy escape sequence handling and | |
28 | ;;; the like. | |
29 | ;;; | |
30 | ;;; Characters are handled internally as integers representing their | |
31 | ;;; code value. This is necessary because elisp allows a lot of fancy | |
32 | ;;; modifiers that set certain high-range bits and the resulting values | |
33 | ;;; would not fit into a real Scheme character range. Additionally, | |
34 | ;;; elisp wants characters as integers, so we just do the right thing... | |
35 | ;;; | |
36 | ;;; TODO: #@count comments | |
5b1ee3be | 37 | |
c983a199 | 38 | ;;; Report an error from the lexer (that is, invalid input given). |
5b1ee3be DK |
39 | |
40 | (define (lexer-error port msg . args) | |
41 | (apply error msg args)) | |
42 | ||
c983a199 BT |
43 | ;;; In a character, set a given bit. This is just some bit-wise or'ing |
44 | ;;; on the characters integer code and converting back to character. | |
5b1ee3be DK |
45 | |
46 | (define (set-char-bit chr bit) | |
47 | (logior chr (ash 1 bit))) | |
48 | ||
c983a199 BT |
49 | ;;; Check if a character equals some other. This is just like char=? |
50 | ;;; except that the tested one could be EOF in which case it simply | |
51 | ;;; isn't equal. | |
5b1ee3be DK |
52 | |
53 | (define (is-char? tested should-be) | |
54 | (and (not (eof-object? tested)) | |
55 | (char=? tested should-be))) | |
56 | ||
c983a199 BT |
57 | ;;; For a character (as integer code), find the real character it |
58 | ;;; represents or #\nul if out of range. This is used to work with | |
59 | ;;; Scheme character functions like char-numeric?. | |
5b1ee3be DK |
60 | |
61 | (define (real-character chr) | |
62 | (if (< chr 256) | |
f4e5e411 BT |
63 | (integer->char chr) |
64 | #\nul)) | |
5b1ee3be | 65 | |
c983a199 BT |
66 | ;;; Return the control modified version of a character. This is not |
67 | ;;; just setting a modifier bit, because ASCII conrol characters must be | |
68 | ;;; handled as such, and in elisp C-? is the delete character for | |
69 | ;;; historical reasons. Otherwise, we set bit 26. | |
5b1ee3be DK |
70 | |
71 | (define (add-control chr) | |
72 | (let ((real (real-character chr))) | |
73 | (if (char-alphabetic? real) | |
f4e5e411 BT |
74 | (- (char->integer (char-upcase real)) (char->integer #\@)) |
75 | (case real | |
76 | ((#\?) 127) | |
77 | ((#\@) 0) | |
78 | (else (set-char-bit chr 26)))))) | |
5b1ee3be | 79 | |
c983a199 BT |
80 | ;;; Parse a charcode given in some base, basically octal or hexadecimal |
81 | ;;; are needed. A requested number of digits can be given (#f means it | |
82 | ;;; does not matter and arbitrary many are allowed), and additionally | |
83 | ;;; early return allowed (if fewer valid digits are found). These | |
84 | ;;; options are all we need to handle the \u, \U, \x and \ddd (octal | |
85 | ;;; digits) escape sequences. | |
5b1ee3be DK |
86 | |
87 | (define (charcode-escape port base digits early-return) | |
88 | (let iterate ((result 0) | |
89 | (procdigs 0)) | |
90 | (if (and digits (>= procdigs digits)) | |
f4e5e411 BT |
91 | result |
92 | (let* ((cur (read-char port)) | |
93 | (value (cond | |
94 | ((char-numeric? cur) | |
95 | (- (char->integer cur) (char->integer #\0))) | |
96 | ((char-alphabetic? cur) | |
97 | (let ((code (- (char->integer (char-upcase cur)) | |
98 | (char->integer #\A)))) | |
99 | (if (< code 0) | |
100 | #f | |
101 | (+ code 10)))) | |
102 | (else #f))) | |
103 | (valid (and value (< value base)))) | |
104 | (if (not valid) | |
105 | (if (or (not digits) early-return) | |
106 | (begin | |
107 | (unread-char cur port) | |
108 | result) | |
109 | (lexer-error port | |
110 | "invalid digit in escape-code" | |
111 | base | |
112 | cur)) | |
113 | (iterate (+ (* result base) value) (1+ procdigs))))))) | |
5b1ee3be | 114 | |
c983a199 BT |
115 | ;;; Read a character and process escape-sequences when necessary. The |
116 | ;;; special in-string argument defines if this character is part of a | |
117 | ;;; string literal or a single character literal, the difference being | |
118 | ;;; that in strings the meta modifier sets bit 7, while it is bit 27 for | |
119 | ;;; characters. | |
5b1ee3be DK |
120 | |
121 | (define basic-escape-codes | |
f4e5e411 BT |
122 | '((#\a . 7) |
123 | (#\b . 8) | |
124 | (#\t . 9) | |
125 | (#\n . 10) | |
126 | (#\v . 11) | |
127 | (#\f . 12) | |
128 | (#\r . 13) | |
129 | (#\e . 27) | |
130 | (#\s . 32) | |
131 | (#\d . 127))) | |
5b1ee3be DK |
132 | |
133 | (define (get-character port in-string) | |
f4e5e411 BT |
134 | (let ((meta-bits `((#\A . 22) |
135 | (#\s . 23) | |
136 | (#\H . 24) | |
137 | (#\S . 25) | |
138 | (#\M . ,(if in-string 7 27)))) | |
5b1ee3be DK |
139 | (cur (read-char port))) |
140 | (if (char=? cur #\\) | |
f4e5e411 BT |
141 | ;; Handle an escape-sequence. |
142 | (let* ((escaped (read-char port)) | |
143 | (esc-code (assq-ref basic-escape-codes escaped)) | |
144 | (meta (assq-ref meta-bits escaped))) | |
145 | (cond | |
146 | ;; Meta-check must be before esc-code check because \s- must | |
147 | ;; be recognized as the super-meta modifier if a - follows. | |
148 | ;; If not, it will be caught as \s -> space escape code. | |
149 | ((and meta (is-char? (peek-char port) #\-)) | |
150 | (if (not (char=? (read-char port) #\-)) | |
151 | (error "expected - after control sequence")) | |
152 | (set-char-bit (get-character port in-string) meta)) | |
153 | ;; One of the basic control character escape names? | |
154 | (esc-code esc-code) | |
155 | ;; Handle \ddd octal code if it is one. | |
156 | ((and (char>=? escaped #\0) (char<? escaped #\8)) | |
157 | (begin | |
158 | (unread-char escaped port) | |
159 | (charcode-escape port 8 3 #t))) | |
160 | ;; Check for some escape-codes directly or otherwise use the | |
161 | ;; escaped character literally. | |
162 | (else | |
5b1ee3be DK |
163 | (case escaped |
164 | ((#\^) (add-control (get-character port in-string))) | |
165 | ((#\C) | |
166 | (if (is-char? (peek-char port) #\-) | |
f4e5e411 BT |
167 | (begin |
168 | (if (not (char=? (read-char port) #\-)) | |
169 | (error "expected - after control sequence")) | |
170 | (add-control (get-character port in-string))) | |
171 | escaped)) | |
5b1ee3be DK |
172 | ((#\x) (charcode-escape port 16 #f #t)) |
173 | ((#\u) (charcode-escape port 16 4 #f)) | |
174 | ((#\U) (charcode-escape port 16 8 #f)) | |
175 | (else (char->integer escaped)))))) | |
f4e5e411 BT |
176 | ;; No escape-sequence, just the literal character. But remember |
177 | ;; to get the code instead! | |
178 | (char->integer cur)))) | |
5b1ee3be | 179 | |
c983a199 BT |
180 | ;;; Read a symbol or number from a port until something follows that |
181 | ;;; marks the start of a new token (like whitespace or parentheses). | |
182 | ;;; The data read is returned as a string for further conversion to the | |
183 | ;;; correct type, but we also return what this is | |
184 | ;;; (integer/float/symbol). If any escaped character is found, it must | |
185 | ;;; be a symbol. Otherwise we at the end check the result-string | |
186 | ;;; against regular expressions to determine if it is possibly an | |
187 | ;;; integer or a float. | |
25512a94 DK |
188 | |
189 | (define integer-regex (make-regexp "^[+-]?[0-9]+\\.?$")) | |
abcf4a9e | 190 | |
25512a94 | 191 | (define float-regex |
f4e5e411 BT |
192 | (make-regexp |
193 | "^[+-]?([0-9]+\\.?[0-9]*|[0-9]*\\.?[0-9]+)(e[+-]?[0-9]+)?$")) | |
25512a94 | 194 | |
c983a199 BT |
195 | ;;; A dot is also allowed literally, only a single dort alone is parsed |
196 | ;;; as the 'dot' terminal for dotted lists. | |
abcf4a9e | 197 | |
25512a94 DK |
198 | (define no-escape-punctuation (string->char-set "-+=*/_~!@$%^&:<>{}?.")) |
199 | ||
200 | (define (get-symbol-or-number port) | |
201 | (let iterate ((result-chars '()) | |
202 | (had-escape #f)) | |
203 | (let* ((c (read-char port)) | |
204 | (finish (lambda () | |
f4e5e411 BT |
205 | (let ((result (list->string |
206 | (reverse result-chars)))) | |
25512a94 | 207 | (values |
f4e5e411 BT |
208 | (cond |
209 | ((and (not had-escape) | |
210 | (regexp-exec integer-regex result)) | |
211 | 'integer) | |
212 | ((and (not had-escape) | |
213 | (regexp-exec float-regex result)) | |
214 | 'float) | |
215 | (else 'symbol)) | |
216 | result)))) | |
25512a94 DK |
217 | (need-no-escape? (lambda (c) |
218 | (or (char-numeric? c) | |
219 | (char-alphabetic? c) | |
f4e5e411 BT |
220 | (char-set-contains? |
221 | no-escape-punctuation | |
222 | c))))) | |
25512a94 | 223 | (cond |
f4e5e411 BT |
224 | ((eof-object? c) (finish)) |
225 | ((need-no-escape? c) (iterate (cons c result-chars) had-escape)) | |
226 | ((char=? c #\\) (iterate (cons (read-char port) result-chars) #t)) | |
227 | (else | |
228 | (unread-char c port) | |
229 | (finish)))))) | |
9a9f1231 | 230 | |
c983a199 BT |
231 | ;;; Parse a circular structure marker without the leading # (which was |
232 | ;;; already read and recognized), that is, a number as identifier and | |
233 | ;;; then either = or #. | |
9a9f1231 DK |
234 | |
235 | (define (get-circular-marker port) | |
236 | (call-with-values | |
f4e5e411 BT |
237 | (lambda () |
238 | (let iterate ((result 0)) | |
239 | (let ((cur (read-char port))) | |
240 | (if (char-numeric? cur) | |
241 | (let ((val (- (char->integer cur) (char->integer #\0)))) | |
242 | (iterate (+ (* result 10) val))) | |
243 | (values result cur))))) | |
9a9f1231 DK |
244 | (lambda (id type) |
245 | (case type | |
246 | ((#\#) `(circular-ref . ,id)) | |
247 | ((#\=) `(circular-def . ,id)) | |
f4e5e411 BT |
248 | (else (lexer-error port |
249 | "invalid circular marker character" | |
250 | type)))))) | |
25512a94 | 251 | |
c983a199 BT |
252 | ;;; Main lexer routine, which is given a port and does look for the next |
253 | ;;; token. | |
25512a94 | 254 | |
03e00c5c BT |
255 | (define lexical-binding-regexp |
256 | (make-regexp | |
257 | "-\\*-(|.*;)[ \t]*lexical-binding:[ \t]*([^;]*[^ \t;]).*-\\*-")) | |
258 | ||
25512a94 | 259 | (define (lex port) |
03e00c5c BT |
260 | (define (lexical-binding-value string) |
261 | (and=> (regexp-exec lexical-binding-regexp string) | |
262 | (lambda (match) | |
263 | (not (member (match:substring match 2) '("nil" "()")))))) | |
f4e5e411 BT |
264 | (let ((return (let ((file (if (file-port? port) |
265 | (port-filename port) | |
266 | #f)) | |
25512a94 DK |
267 | (line (1+ (port-line port))) |
268 | (column (1+ (port-column port)))) | |
269 | (lambda (token value) | |
270 | (let ((obj (cons token value))) | |
271 | (set-source-property! obj 'filename file) | |
272 | (set-source-property! obj 'line line) | |
273 | (set-source-property! obj 'column column) | |
274 | obj)))) | |
c983a199 BT |
275 | ;; Read afterwards so the source-properties are correct above |
276 | ;; and actually point to the very character to be read. | |
25512a94 DK |
277 | (c (read-char port))) |
278 | (cond | |
f4e5e411 | 279 | ;; End of input must be specially marked to the parser. |
1dfe5939 | 280 | ((eof-object? c) (return 'eof c)) |
f4e5e411 BT |
281 | ;; Whitespace, just skip it. |
282 | ((char-whitespace? c) (lex port)) | |
283 | ;; The dot is only the one for dotted lists if followed by | |
284 | ;; whitespace. Otherwise it is considered part of a number of | |
285 | ;; symbol. | |
286 | ((and (char=? c #\.) | |
287 | (char-whitespace? (peek-char port))) | |
288 | (return 'dot #f)) | |
289 | ;; Continue checking for literal character values. | |
290 | (else | |
291 | (case c | |
292 | ;; A line comment, skip until end-of-line is found. | |
293 | ((#\;) | |
03e00c5c BT |
294 | (if (= (port-line port) 0) |
295 | (let iterate ((chars '())) | |
296 | (let ((cur (read-char port))) | |
297 | (if (or (eof-object? cur) (char=? cur #\newline)) | |
298 | (let ((string (list->string (reverse chars)))) | |
299 | (return 'set-lexical-binding-mode! | |
300 | (lexical-binding-value string))) | |
301 | (iterate (cons cur chars))))) | |
302 | (let iterate () | |
303 | (let ((cur (read-char port))) | |
304 | (if (or (eof-object? cur) (char=? cur #\newline)) | |
305 | (lex port) | |
306 | (iterate)))))) | |
f4e5e411 BT |
307 | ;; A character literal. |
308 | ((#\?) | |
309 | (return 'character (get-character port #f))) | |
310 | ;; A literal string. This is mainly a sequence of characters | |
311 | ;; just as in the character literals, the only difference is | |
312 | ;; that escaped newline and space are to be completely ignored | |
313 | ;; and that meta-escapes set bit 7 rather than bit 27. | |
314 | ((#\") | |
315 | (let iterate ((result-chars '())) | |
316 | (let ((cur (read-char port))) | |
317 | (case cur | |
318 | ((#\") | |
319 | (return 'string (list->string (reverse result-chars)))) | |
320 | ((#\\) | |
321 | (let ((escaped (read-char port))) | |
322 | (case escaped | |
323 | ((#\newline #\space) | |
324 | (iterate result-chars)) | |
325 | (else | |
326 | (unread-char escaped port) | |
327 | (unread-char cur port) | |
328 | (iterate | |
329 | (cons (integer->char (get-character port #t)) | |
330 | result-chars)))))) | |
331 | (else (iterate (cons cur result-chars))))))) | |
f4e5e411 | 332 | ((#\#) |
b7966c10 BT |
333 | (let ((c (read-char port))) |
334 | (case c | |
335 | ((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) | |
336 | (unread-char c port) | |
337 | (let ((mark (get-circular-marker port))) | |
338 | (return (car mark) (cdr mark)))) | |
339 | ((#\') | |
340 | (return 'function #f))))) | |
f4e5e411 BT |
341 | ;; Parentheses and other special-meaning single characters. |
342 | ((#\() (return 'paren-open #f)) | |
343 | ((#\)) (return 'paren-close #f)) | |
344 | ((#\[) (return 'square-open #f)) | |
345 | ((#\]) (return 'square-close #f)) | |
346 | ((#\') (return 'quote #f)) | |
347 | ((#\`) (return 'backquote #f)) | |
348 | ;; Unquote and unquote-splicing. | |
349 | ((#\,) | |
350 | (if (is-char? (peek-char port) #\@) | |
e840cc65 | 351 | (if (not (char=? (read-char port) #\@)) |
f4e5e411 BT |
352 | (error "expected @ in unquote-splicing") |
353 | (return 'unquote-splicing #f)) | |
e840cc65 | 354 | (return 'unquote #f))) |
f4e5e411 BT |
355 | ;; Remaining are numbers and symbols. Process input until next |
356 | ;; whitespace is found, and see if it looks like a number | |
357 | ;; (float/integer) or symbol and return accordingly. | |
358 | (else | |
359 | (unread-char c port) | |
360 | (call-with-values | |
361 | (lambda () (get-symbol-or-number port)) | |
362 | (lambda (type str) | |
363 | (case type | |
364 | ((symbol) | |
365 | ;; str could be empty if the first character is already | |
366 | ;; something not allowed in a symbol (and not escaped)! | |
367 | ;; Take care about that, it is an error because that | |
368 | ;; character should have been handled elsewhere or is | |
369 | ;; invalid in the input. | |
370 | (if (zero? (string-length str)) | |
371 | (begin | |
372 | ;; Take it out so the REPL might not get into an | |
373 | ;; infinite loop with further reading attempts. | |
374 | (read-char port) | |
375 | (error "invalid character in input" c)) | |
376 | (return 'symbol (string->symbol str)))) | |
377 | ((integer) | |
378 | ;; In elisp, something like "1." is an integer, while | |
379 | ;; string->number returns an inexact real. Thus we need | |
380 | ;; a conversion here, but it should always result in an | |
381 | ;; integer! | |
382 | (return | |
383 | 'integer | |
384 | (let ((num (inexact->exact (string->number str)))) | |
385 | (if (not (integer? num)) | |
386 | (error "expected integer" str num)) | |
387 | num))) | |
388 | ((float) | |
389 | (return 'float (let ((num (string->number str))) | |
390 | (if (exact? num) | |
391 | (error "expected inexact float" | |
392 | str | |
393 | num)) | |
394 | num))) | |
395 | (else (error "wrong number/symbol type" type))))))))))) | |
25512a94 | 396 | |
c983a199 BT |
397 | ;;; Build a lexer thunk for a port. This is the exported routine which |
398 | ;;; can be used to create a lexer for the parser to use. | |
25512a94 DK |
399 | |
400 | (define (get-lexer port) | |
f4e5e411 | 401 | (lambda () (lex port))) |
ddb4364b | 402 | |
c983a199 BT |
403 | ;;; Build a special lexer that will only read enough for one expression |
404 | ;;; and then always return end-of-input. If we find one of the quotation | |
405 | ;;; stuff, one more expression is needed in any case. | |
ddb4364b DK |
406 | |
407 | (define (get-lexer/1 port) | |
408 | (let ((lex (get-lexer port)) | |
409 | (finished #f) | |
410 | (paren-level 0)) | |
411 | (lambda () | |
412 | (if finished | |
6854c324 | 413 | (cons 'eof ((@ (ice-9 binary-ports) eof-object))) |
f4e5e411 BT |
414 | (let ((next (lex)) |
415 | (quotation #f)) | |
416 | (case (car next) | |
417 | ((paren-open square-open) | |
418 | (set! paren-level (1+ paren-level))) | |
419 | ((paren-close square-close) | |
420 | (set! paren-level (1- paren-level))) | |
421 | ((quote backquote unquote unquote-splicing circular-def) | |
422 | (set! quotation #t))) | |
423 | (if (and (not quotation) (<= paren-level 0)) | |
424 | (set! finished #t)) | |
425 | next))))) |