Commit | Line | Data |
---|---|---|
5dcd3e48 AW |
1 | ;;; ECMAScript for Guile |
2 | ||
0ecd70a2 | 3 | ;; Copyright (C) 2009, 2010 Free Software Foundation, Inc. |
5dcd3e48 | 4 | |
53befeb7 NJ |
5 | ;;;; This library is free software; you can redistribute it and/or |
6 | ;;;; modify it under the terms of the GNU Lesser General Public | |
7 | ;;;; License as published by the Free Software Foundation; either | |
8 | ;;;; version 3 of the License, or (at your option) any later version. | |
9 | ;;;; | |
10 | ;;;; This library is distributed in the hope that it will be useful, | |
11 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | ;;;; Lesser General Public License for more details. | |
14 | ;;;; | |
15 | ;;;; You should have received a copy of the GNU Lesser General Public | |
16 | ;;;; License along with this library; if not, write to the Free Software | |
17 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
5dcd3e48 AW |
18 | |
19 | ;;; Code: | |
20 | ||
21 | (define-module (language ecmascript tokenize) | |
22 | #:use-module (ice-9 rdelim) | |
23 | #:use-module ((srfi srfi-1) #:select (unfold-right)) | |
0ecd70a2 | 24 | #:use-module (system base lalr) |
5dcd3e48 AW |
25 | #:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1)) |
26 | ||
0b229e81 AW |
27 | (define (syntax-error message . args) |
28 | (apply throw 'SyntaxError message args)) | |
29 | ||
5dcd3e48 AW |
30 | ;; taken from SSAX, sorta |
31 | (define (read-until delims port) | |
32 | (if (eof-object? (peek-char port)) | |
0b229e81 | 33 | (syntax-error "EOF while reading a token") |
5dcd3e48 AW |
34 | (let ((token (read-delimited delims port 'peek))) |
35 | (if (eof-object? (peek-char port)) | |
0b229e81 | 36 | (syntax-error "EOF while reading a token") |
5dcd3e48 AW |
37 | token)))) |
38 | ||
39 | (define (char-hex? c) | |
40 | (and (not (eof-object? c)) | |
41 | (or (char-numeric? c) | |
42 | (memv c '(#\a #\b #\c #\d #\e #\f)) | |
43 | (memv c '(#\A #\B #\C #\D #\E #\F))))) | |
44 | ||
45 | (define (digit->number c) | |
46 | (- (char->integer c) (char->integer #\0))) | |
47 | ||
48 | (define (hex->number c) | |
49 | (if (char-numeric? c) | |
50 | (digit->number c) | |
51 | (+ 10 (- (char->integer (char-downcase c)) (char->integer #\a))))) | |
52 | ||
53 | (define (read-slash port div?) | |
e5f5113c LC |
54 | (let ((c1 (begin |
55 | (read-char port) | |
56 | (peek-char port)))) | |
5dcd3e48 AW |
57 | (cond |
58 | ((eof-object? c1) | |
59 | ;; hmm. error if we're not looking for a div? ? | |
60 | '(/ . #f)) | |
61 | ((char=? c1 #\/) | |
62 | (read-line port) | |
63 | (next-token port div?)) | |
64 | ((char=? c1 #\*) | |
65 | (read-char port) | |
66 | (let lp ((c (read-char port))) | |
67 | (cond | |
0b229e81 | 68 | ((eof-object? c) (syntax-error "EOF while in multi-line comment")) |
5dcd3e48 AW |
69 | ((char=? c #\*) |
70 | (if (eqv? (peek-char port) #\/) | |
71 | (begin | |
72 | (read-char port) | |
73 | (next-token port div?)) | |
74 | (lp (read-char port)))) | |
75 | (else | |
76 | (lp (read-char port)))))) | |
77 | (div? | |
78 | (case c1 | |
0ecd70a2 LC |
79 | ((#\=) (read-char port) (make-lexical-token '/= #f #f)) |
80 | (else (make-lexical-token '/ #f #f)))) | |
5dcd3e48 AW |
81 | (else |
82 | (read-regexp port))))) | |
83 | ||
84 | (define (read-regexp port) | |
85 | ;; first slash already read | |
86 | (let ((terms (string #\/ #\\ #\nl #\cr))) | |
87 | (let lp ((str (read-until terms port)) (head "")) | |
88 | (let ((terminator (peek-char port))) | |
89 | (cond | |
90 | ((char=? terminator #\/) | |
91 | (read-char port) | |
92 | ;; flags | |
93 | (let lp ((c (peek-char port)) (flags '())) | |
94 | (if (or (eof-object? c) | |
95 | (not (or (char-alphabetic? c) | |
96 | (char-numeric? c) | |
97 | (char=? c #\$) | |
98 | (char=? c #\_)))) | |
0ecd70a2 LC |
99 | (make-lexical-token 'RegexpLiteral #f |
100 | (cons (string-append head str) | |
101 | (reverse flags))) | |
5dcd3e48 AW |
102 | (begin (read-char port) |
103 | (lp (peek-char port) (cons c flags)))))) | |
104 | ((char=? terminator #\\) | |
105 | (read-char port) | |
106 | (let ((echar (read-char port))) | |
107 | (lp (read-until terms port) | |
108 | (string-append head str (string #\\ echar))))) | |
109 | (else | |
0b229e81 | 110 | (syntax-error "regexp literals may not contain newlines" str))))))) |
5dcd3e48 AW |
111 | |
112 | (define (read-string port) | |
113 | (let ((c (read-char port))) | |
114 | (let ((terms (string c #\\ #\nl #\cr))) | |
115 | (define (read-escape port) | |
116 | (let ((c (read-char port))) | |
117 | (case c | |
118 | ((#\' #\" #\\) c) | |
119 | ((#\b) #\bs) | |
120 | ((#\f) #\np) | |
121 | ((#\n) #\nl) | |
122 | ((#\r) #\cr) | |
123 | ((#\t) #\tab) | |
124 | ((#\v) #\vt) | |
125 | ((#\0) | |
126 | (let ((next (peek-char port))) | |
127 | (cond ((eof-object? next) #\nul) | |
128 | ((char-numeric? next) | |
0b229e81 | 129 | (syntax-error "octal escape sequences are not supported")) |
5dcd3e48 AW |
130 | (else #\nul)))) |
131 | ((#\x) | |
132 | (let* ((a (read-char port)) | |
133 | (b (read-char port))) | |
134 | (cond | |
135 | ((and (char-hex? a) (char-hex? b)) | |
136 | (integer->char (+ (* 16 (hex->number a)) (hex->number b)))) | |
137 | (else | |
0b229e81 | 138 | (syntax-error "bad hex character escape" a b))))) |
5dcd3e48 | 139 | ((#\u) |
0b229e81 | 140 | (syntax-error "unicode not supported")) |
5dcd3e48 AW |
141 | (else |
142 | c)))) | |
143 | (let lp ((str (read-until terms port))) | |
144 | (let ((terminator (peek-char port))) | |
145 | (cond | |
146 | ((char=? terminator c) | |
147 | (read-char port) | |
148 | str) | |
149 | ((char=? terminator #\\) | |
150 | (read-char port) | |
151 | (let ((echar (read-escape port))) | |
152 | (lp (string-append str (string echar) | |
153 | (read-until terms port))))) | |
154 | (else | |
0b229e81 | 155 | (syntax-error "string literals may not contain newlines" str)))))))) |
5dcd3e48 AW |
156 | |
157 | (define *keywords* | |
158 | '(("break" . break) | |
159 | ("else" . else) | |
160 | ("new" . new) | |
161 | ("var" . var) | |
162 | ("case" . case) | |
163 | ("finally" . finally) | |
164 | ("return" . return) | |
165 | ("void" . void) | |
166 | ("catch" . catch) | |
167 | ("for" . for) | |
168 | ("switch" . switch) | |
169 | ("while" . while) | |
170 | ("continue" . continue) | |
171 | ("function" . function) | |
172 | ("this" . this) | |
173 | ("with" . with) | |
174 | ("default" . default) | |
175 | ("if" . if) | |
176 | ("throw" . throw) | |
177 | ("delete" . delete) | |
178 | ("in" . in) | |
179 | ("try" . try) | |
180 | ("do" . do) | |
181 | ("instanceof" . instanceof) | |
182 | ("typeof" . typeof) | |
183 | ||
184 | ;; these aren't exactly keywords, but hey | |
185 | ("null" . null) | |
186 | ("true" . true) | |
187 | ("false" . false))) | |
188 | ||
189 | (define *future-reserved-words* | |
190 | '(("abstract" . abstract) | |
191 | ("enum" . enum) | |
192 | ("int" . int) | |
193 | ("short" . short) | |
194 | ("boolean" . boolean) | |
195 | ("export" . export) | |
196 | ("interface" . interface) | |
197 | ("static" . static) | |
198 | ("byte" . byte) | |
199 | ("extends" . extends) | |
200 | ("long" . long) | |
201 | ("super" . super) | |
202 | ("char" . char) | |
203 | ("final" . final) | |
204 | ("native" . native) | |
205 | ("synchronized" . synchronized) | |
206 | ("class" . class) | |
207 | ("float" . float) | |
208 | ("package" . package) | |
209 | ("throws" . throws) | |
210 | ("const" . const) | |
211 | ("goto" . goto) | |
212 | ("private" . private) | |
213 | ("transient" . transient) | |
214 | ("debugger" . debugger) | |
215 | ("implements" . implements) | |
216 | ("protected" . protected) | |
217 | ("volatile" . volatile) | |
218 | ("double" . double) | |
219 | ("import" . import) | |
220 | ("public" . public))) | |
221 | ||
0ecd70a2 | 222 | (define (read-identifier port loc) |
5dcd3e48 AW |
223 | (let lp ((c (peek-char port)) (chars '())) |
224 | (if (or (eof-object? c) | |
225 | (not (or (char-alphabetic? c) | |
226 | (char-numeric? c) | |
227 | (char=? c #\$) | |
228 | (char=? c #\_)))) | |
229 | (let ((word (list->string (reverse chars)))) | |
230 | (cond ((assoc-ref *keywords* word) | |
0ecd70a2 | 231 | => (lambda (x) (make-lexical-token x loc #f))) |
5dcd3e48 | 232 | ((assoc-ref *future-reserved-words* word) |
0b229e81 | 233 | (syntax-error "word is reserved for the future, dude." word)) |
0ecd70a2 LC |
234 | (else (make-lexical-token 'Identifier loc |
235 | (string->symbol word))))) | |
5dcd3e48 AW |
236 | (begin (read-char port) |
237 | (lp (peek-char port) (cons c chars)))))) | |
238 | ||
239 | (define (read-numeric port) | |
240 | (let* ((c0 (if (char=? (peek-char port) #\.) | |
241 | #\0 | |
242 | (read-char port))) | |
243 | (c1 (peek-char port))) | |
244 | (cond | |
245 | ((eof-object? c1) (digit->number c0)) | |
246 | ((and (char=? c0 #\0) (char=? c1 #\x)) | |
247 | (read-char port) | |
248 | (let ((c (peek-char port))) | |
249 | (if (not (char-hex? c)) | |
0b229e81 | 250 | (syntax-error "bad digit reading hexadecimal number" c)) |
5dcd3e48 AW |
251 | (let lp ((c c) (acc 0)) |
252 | (cond ((char-hex? c) | |
253 | (read-char port) | |
254 | (lp (peek-char port) | |
255 | (+ (* 16 acc) (hex->number c)))) | |
256 | (else | |
257 | acc))))) | |
258 | ((and (char=? c0 #\0) (char-numeric? c1)) | |
259 | (let lp ((c c1) (acc 0)) | |
260 | (cond ((eof-object? c) acc) | |
261 | ((char-numeric? c) | |
262 | (if (or (char=? c #\8) (char=? c #\9)) | |
0b229e81 | 263 | (syntax-error "invalid digit in octal sequence" c)) |
5dcd3e48 AW |
264 | (read-char port) |
265 | (lp (peek-char port) | |
266 | (+ (* 8 acc) (digit->number c)))) | |
267 | (else | |
268 | acc)))) | |
269 | (else | |
270 | (let lp ((c1 c1) (acc (digit->number c0))) | |
271 | (cond | |
272 | ((eof-object? c1) acc) | |
273 | ((char-numeric? c1) | |
274 | (read-char port) | |
275 | (lp (peek-char port) | |
276 | (+ (* 10 acc) (digit->number c1)))) | |
277 | ((or (char=? c1 #\e) (char=? c1 #\E)) | |
278 | (read-char port) | |
279 | (let ((add (let ((c (peek-char port))) | |
0b229e81 | 280 | (cond ((eof-object? c) (syntax-error "error reading exponent: EOF")) |
5dcd3e48 AW |
281 | ((char=? c #\+) (read-char port) +) |
282 | ((char=? c #\-) (read-char port) -) | |
283 | ((char-numeric? c) +) | |
0b229e81 AW |
284 | (else (syntax-error "error reading exponent: non-digit" |
285 | c)))))) | |
5dcd3e48 AW |
286 | (let lp ((c (peek-char port)) (e 0)) |
287 | (cond ((and (not (eof-object? c)) (char-numeric? c)) | |
288 | (read-char port) | |
289 | (lp (peek-char port) (add (* 10 e) (digit->number c)))) | |
290 | (else | |
291 | (* (if (negative? e) (* acc 1.0) acc) (expt 10 e))))))) | |
292 | ((char=? c1 #\.) | |
293 | (read-char port) | |
294 | (let lp2 ((c (peek-char port)) (dec 0.0) (n -1)) | |
295 | (cond ((and (not (eof-object? c)) (char-numeric? c)) | |
296 | (read-char port) | |
297 | (lp2 (peek-char port) | |
298 | (+ dec (* (digit->number c) (expt 10 n))) | |
299 | (1- n))) | |
300 | (else | |
301 | ;; loop back to catch an exponential part | |
302 | (lp c (+ acc dec)))))) | |
303 | (else | |
304 | acc))))))) | |
305 | ||
306 | (define *punctuation* | |
307 | '(("{" . lbrace) | |
308 | ("}" . rbrace) | |
309 | ("(" . lparen) | |
310 | (")" . rparen) | |
311 | ("[" . lbracket) | |
312 | ("]" . rbracket) | |
313 | ("." . dot) | |
314 | (";" . semicolon) | |
315 | ("," . comma) | |
316 | ("<" . <) | |
317 | (">" . >) | |
318 | ("<=" . <=) | |
319 | (">=" . >=) | |
320 | ("==" . ==) | |
321 | ("!=" . !=) | |
322 | ("===" . ===) | |
323 | ("!==" . !==) | |
324 | ("+" . +) | |
325 | ("-" . -) | |
326 | ("*" . *) | |
327 | ("%" . %) | |
328 | ("++" . ++) | |
329 | ("--" . --) | |
330 | ("<<" . <<) | |
331 | (">>" . >>) | |
332 | (">>>" . >>>) | |
333 | ("&" . &) | |
334 | ("|" . bor) | |
335 | ("^" . ^) | |
336 | ("!" . !) | |
337 | ("~" . ~) | |
338 | ("&&" . &&) | |
339 | ("||" . or) | |
340 | ("?" . ?) | |
341 | (":" . colon) | |
342 | ("=" . =) | |
343 | ("+=" . +=) | |
344 | ("-=" . -=) | |
345 | ("*=" . *=) | |
346 | ("%=" . %=) | |
347 | ("<<=" . <<=) | |
348 | (">>=" . >>=) | |
349 | (">>>=" . >>>=) | |
350 | ("&=" . &=) | |
351 | ("|=" . bor=) | |
352 | ("^=" . ^=))) | |
353 | ||
354 | (define *div-punctuation* | |
355 | '(("/" . /) | |
356 | ("/=" . /=))) | |
357 | ||
358 | ;; node ::= (char (symbol | #f) node*) | |
359 | (define read-punctuation | |
360 | (let ((punc-tree (let lp ((nodes '()) (puncs *punctuation*)) | |
361 | (cond ((null? puncs) | |
362 | nodes) | |
363 | ((assv-ref nodes (string-ref (caar puncs) 0)) | |
364 | => (lambda (node-tail) | |
365 | (if (= (string-length (caar puncs)) 1) | |
366 | (set-car! node-tail (cdar puncs)) | |
367 | (set-cdr! node-tail | |
368 | (lp (cdr node-tail) | |
369 | `((,(substring (caar puncs) 1) | |
370 | . ,(cdar puncs)))))) | |
371 | (lp nodes (cdr puncs)))) | |
372 | (else | |
eb1482ac | 373 | (lp (cons (list (string-ref (caar puncs) 0) #f) nodes) |
5dcd3e48 | 374 | puncs)))))) |
0ecd70a2 | 375 | (lambda (port loc) |
5dcd3e48 AW |
376 | (let lp ((c (peek-char port)) (tree punc-tree) (candidate #f)) |
377 | (cond | |
378 | ((assv-ref tree c) | |
379 | => (lambda (node-tail) | |
380 | (read-char port) | |
381 | (lp (peek-char port) (cdr node-tail) (car node-tail)))) | |
382 | (candidate | |
0ecd70a2 | 383 | (make-lexical-token candidate loc #f)) |
5dcd3e48 | 384 | (else |
0b229e81 | 385 | (syntax-error "bad syntax: character not allowed" c))))))) |
5dcd3e48 AW |
386 | |
387 | (define (next-token port div?) | |
0ecd70a2 LC |
388 | (let ((c (peek-char port)) |
389 | (loc (make-source-location (port-filename port) | |
390 | (port-line port) | |
391 | (port-column port) | |
392 | (false-if-exception (seek port 0 SEEK_CUR)) | |
393 | #f))) | |
a3e34104 AW |
394 | (let ((tok |
395 | (case c | |
396 | ((#\ht #\vt #\np #\space) | |
397 | ; whitespace | |
398 | (read-char port) | |
399 | (next-token port div?)) | |
400 | ((#\newline #\cr) | |
401 | ; line break | |
402 | (read-char port) | |
403 | (next-token port div?)) | |
404 | ((#\/) | |
405 | ;; division, single comment, double comment, or regexp | |
406 | (read-slash port div?)) | |
407 | ((#\" #\') | |
408 | ; string literal | |
0ecd70a2 | 409 | (make-lexical-token 'StringLiteral loc (read-string port))) |
a3e34104 AW |
410 | (else |
411 | (cond | |
412 | ((eof-object? c) | |
413 | '*eoi*) | |
414 | ((or (char-alphabetic? c) | |
415 | (char=? c #\$) | |
416 | (char=? c #\_)) | |
417 | ;; reserved word or identifier | |
0ecd70a2 | 418 | (read-identifier port loc)) |
a3e34104 AW |
419 | ((char-numeric? c) |
420 | ;; numeric -- also accept . FIXME, requires lookahead | |
0ecd70a2 | 421 | (make-lexical-token 'NumericLiteral loc (read-numeric port))) |
a3e34104 AW |
422 | (else |
423 | ;; punctuation | |
0ecd70a2 LC |
424 | (read-punctuation port loc))))))) |
425 | ||
a3e34104 | 426 | tok))) |
5dcd3e48 AW |
427 | |
428 | (define (make-tokenizer port) | |
429 | (let ((div? #f)) | |
430 | (lambda () | |
431 | (let ((tok (next-token port div?))) | |
432 | (set! div? (and (pair? tok) (eq? (car tok) 'identifier))) | |
433 | tok)))) | |
434 | ||
435 | (define (make-tokenizer/1 port) | |
436 | (let ((div? #f) | |
437 | (eoi? #f) | |
438 | (stack '())) | |
439 | (lambda () | |
440 | (if eoi? | |
441 | '*eoi* | |
442 | (let ((tok (next-token port div?))) | |
0ecd70a2 | 443 | (case (if (lexical-token? tok) (lexical-token-category tok) tok) |
5dcd3e48 | 444 | ((lparen) |
0ecd70a2 | 445 | (set! stack (make-lexical-token 'lparen #f stack))) |
5dcd3e48 AW |
446 | ((rparen) |
447 | (if (and (pair? stack) (eq? (car stack) 'lparen)) | |
448 | (set! stack (cdr stack)) | |
0b229e81 | 449 | (syntax-error "unexpected right parenthesis"))) |
5dcd3e48 | 450 | ((lbracket) |
0ecd70a2 | 451 | (set! stack (make-lexical-token 'lbracket #f stack))) |
5dcd3e48 AW |
452 | ((rbracket) |
453 | (if (and (pair? stack) (eq? (car stack) 'lbracket)) | |
454 | (set! stack (cdr stack)) | |
0b229e81 | 455 | (syntax-error "unexpected right bracket" stack))) |
5dcd3e48 | 456 | ((lbrace) |
0ecd70a2 | 457 | (set! stack (make-lexical-token 'lbrace #f stack))) |
5dcd3e48 AW |
458 | ((rbrace) |
459 | (if (and (pair? stack) (eq? (car stack) 'lbrace)) | |
460 | (set! stack (cdr stack)) | |
0b229e81 | 461 | (syntax-error "unexpected right brace" stack))) |
5dcd3e48 AW |
462 | ((semicolon) |
463 | (set! eoi? (null? stack)))) | |
0ecd70a2 LC |
464 | (set! div? (and (lexical-token? tok) |
465 | (let ((cat (lexical-token-category tok))) | |
466 | (or (eq? cat 'Identifier) | |
467 | (eq? cat 'NumericLiteral) | |
468 | (eq? cat 'StringLiteral))))) | |
5dcd3e48 AW |
469 | tok))))) |
470 | ||
471 | (define (tokenize port) | |
472 | (let ((next (make-tokenizer port))) | |
473 | (let lp ((out '())) | |
474 | (let ((tok (next))) | |
475 | (if (eq? tok '*eoi*) | |
476 | (reverse! out) | |
477 | (lp (cons tok out))))))) | |
478 | ||
479 | (define (tokenize/1 port) | |
480 | (let ((next (make-tokenizer/1 port))) | |
481 | (let lp ((out '())) | |
482 | (let ((tok (next))) | |
483 | (if (eq? tok '*eoi*) | |
484 | (reverse! out) | |
485 | (lp (cons tok out))))))) | |
486 |