Commit | Line | Data |
---|---|---|
5dcd3e48 AW |
1 | ;;; ECMAScript for Guile |
2 | ||
b1846b7f | 3 | ;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. |
5dcd3e48 | 4 | |
53befeb7 NJ |
5 | ;;;; This library is free software; you can redistribute it and/or |
6 | ;;;; modify it under the terms of the GNU Lesser General Public | |
7 | ;;;; License as published by the Free Software Foundation; either | |
8 | ;;;; version 3 of the License, or (at your option) any later version. | |
9 | ;;;; | |
10 | ;;;; This library is distributed in the hope that it will be useful, | |
11 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | ;;;; Lesser General Public License for more details. | |
14 | ;;;; | |
15 | ;;;; You should have received a copy of the GNU Lesser General Public | |
16 | ;;;; License along with this library; if not, write to the Free Software | |
17 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
5dcd3e48 AW |
18 | |
19 | ;;; Code: | |
20 | ||
21 | (define-module (language ecmascript tokenize) | |
22 | #:use-module (ice-9 rdelim) | |
23 | #:use-module ((srfi srfi-1) #:select (unfold-right)) | |
0ecd70a2 | 24 | #:use-module (system base lalr) |
5dcd3e48 AW |
25 | #:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1)) |
26 | ||
b8b63433 AW |
27 | (define (syntax-error what loc form . args) |
28 | (throw 'syntax-error #f what | |
29 | (and=> loc source-location->source-properties) | |
30 | form #f args)) | |
31 | ||
32 | (define (port-source-location port) | |
33 | (make-source-location (port-filename port) | |
34 | (port-line port) | |
35 | (port-column port) | |
36 | (false-if-exception (ftell port)) | |
37 | #f)) | |
0b229e81 | 38 | |
5dcd3e48 | 39 | ;; taken from SSAX, sorta |
b8b63433 | 40 | (define (read-until delims port loc) |
5dcd3e48 | 41 | (if (eof-object? (peek-char port)) |
b8b63433 | 42 | (syntax-error "EOF while reading a token" loc #f) |
5dcd3e48 AW |
43 | (let ((token (read-delimited delims port 'peek))) |
44 | (if (eof-object? (peek-char port)) | |
b8b63433 | 45 | (syntax-error "EOF while reading a token" loc token) |
5dcd3e48 AW |
46 | token)))) |
47 | ||
48 | (define (char-hex? c) | |
49 | (and (not (eof-object? c)) | |
50 | (or (char-numeric? c) | |
51 | (memv c '(#\a #\b #\c #\d #\e #\f)) | |
52 | (memv c '(#\A #\B #\C #\D #\E #\F))))) | |
53 | ||
54 | (define (digit->number c) | |
55 | (- (char->integer c) (char->integer #\0))) | |
56 | ||
57 | (define (hex->number c) | |
58 | (if (char-numeric? c) | |
59 | (digit->number c) | |
60 | (+ 10 (- (char->integer (char-downcase c)) (char->integer #\a))))) | |
61 | ||
b8b63433 | 62 | (define (read-slash port loc div?) |
e5f5113c LC |
63 | (let ((c1 (begin |
64 | (read-char port) | |
65 | (peek-char port)))) | |
5dcd3e48 AW |
66 | (cond |
67 | ((eof-object? c1) | |
68 | ;; hmm. error if we're not looking for a div? ? | |
b8b63433 | 69 | (make-lexical-token '/ loc #f)) |
5dcd3e48 AW |
70 | ((char=? c1 #\/) |
71 | (read-line port) | |
72 | (next-token port div?)) | |
73 | ((char=? c1 #\*) | |
74 | (read-char port) | |
75 | (let lp ((c (read-char port))) | |
76 | (cond | |
b8b63433 AW |
77 | ((eof-object? c) |
78 | (syntax-error "EOF while in multi-line comment" loc #f)) | |
5dcd3e48 AW |
79 | ((char=? c #\*) |
80 | (if (eqv? (peek-char port) #\/) | |
81 | (begin | |
82 | (read-char port) | |
83 | (next-token port div?)) | |
84 | (lp (read-char port)))) | |
85 | (else | |
86 | (lp (read-char port)))))) | |
87 | (div? | |
88 | (case c1 | |
b8b63433 AW |
89 | ((#\=) (read-char port) (make-lexical-token '/= loc #f)) |
90 | (else (make-lexical-token '/ loc #f)))) | |
5dcd3e48 | 91 | (else |
b8b63433 | 92 | (read-regexp port loc))))) |
5dcd3e48 | 93 | |
b8b63433 | 94 | (define (read-regexp port loc) |
5dcd3e48 AW |
95 | ;; first slash already read |
96 | (let ((terms (string #\/ #\\ #\nl #\cr))) | |
b8b63433 | 97 | (let lp ((str (read-until terms port loc)) (head "")) |
5dcd3e48 AW |
98 | (let ((terminator (peek-char port))) |
99 | (cond | |
100 | ((char=? terminator #\/) | |
101 | (read-char port) | |
102 | ;; flags | |
103 | (let lp ((c (peek-char port)) (flags '())) | |
104 | (if (or (eof-object? c) | |
105 | (not (or (char-alphabetic? c) | |
106 | (char-numeric? c) | |
107 | (char=? c #\$) | |
108 | (char=? c #\_)))) | |
b8b63433 | 109 | (make-lexical-token 'RegexpLiteral loc |
0ecd70a2 LC |
110 | (cons (string-append head str) |
111 | (reverse flags))) | |
5dcd3e48 AW |
112 | (begin (read-char port) |
113 | (lp (peek-char port) (cons c flags)))))) | |
114 | ((char=? terminator #\\) | |
115 | (read-char port) | |
116 | (let ((echar (read-char port))) | |
b8b63433 | 117 | (lp (read-until terms port loc) |
5dcd3e48 AW |
118 | (string-append head str (string #\\ echar))))) |
119 | (else | |
b8b63433 AW |
120 | (syntax-error "regexp literals may not contain newlines" |
121 | loc str))))))) | |
5dcd3e48 | 122 | |
b8b63433 | 123 | (define (read-string port loc) |
5dcd3e48 AW |
124 | (let ((c (read-char port))) |
125 | (let ((terms (string c #\\ #\nl #\cr))) | |
126 | (define (read-escape port) | |
127 | (let ((c (read-char port))) | |
128 | (case c | |
129 | ((#\' #\" #\\) c) | |
130 | ((#\b) #\bs) | |
131 | ((#\f) #\np) | |
132 | ((#\n) #\nl) | |
133 | ((#\r) #\cr) | |
134 | ((#\t) #\tab) | |
135 | ((#\v) #\vt) | |
136 | ((#\0) | |
137 | (let ((next (peek-char port))) | |
b8b63433 AW |
138 | (cond |
139 | ((eof-object? next) #\nul) | |
140 | ((char-numeric? next) | |
141 | (syntax-error "octal escape sequences are not supported" | |
142 | loc #f)) | |
143 | (else #\nul)))) | |
5dcd3e48 AW |
144 | ((#\x) |
145 | (let* ((a (read-char port)) | |
146 | (b (read-char port))) | |
147 | (cond | |
148 | ((and (char-hex? a) (char-hex? b)) | |
149 | (integer->char (+ (* 16 (hex->number a)) (hex->number b)))) | |
150 | (else | |
b8b63433 | 151 | (syntax-error "bad hex character escape" loc (string a b)))))) |
5dcd3e48 | 152 | ((#\u) |
b1846b7f NL |
153 | (let* ((a (read-char port)) |
154 | (b (read-char port)) | |
155 | (c (read-char port)) | |
156 | (d (read-char port))) | |
157 | (integer->char (string->number (string a b c d) 16)))) | |
5dcd3e48 AW |
158 | (else |
159 | c)))) | |
b8b63433 | 160 | (let lp ((str (read-until terms port loc))) |
5dcd3e48 AW |
161 | (let ((terminator (peek-char port))) |
162 | (cond | |
163 | ((char=? terminator c) | |
164 | (read-char port) | |
b8b63433 | 165 | (make-lexical-token 'StringLiteral loc str)) |
5dcd3e48 AW |
166 | ((char=? terminator #\\) |
167 | (read-char port) | |
168 | (let ((echar (read-escape port))) | |
169 | (lp (string-append str (string echar) | |
b8b63433 | 170 | (read-until terms port loc))))) |
5dcd3e48 | 171 | (else |
b8b63433 AW |
172 | (syntax-error "string literals may not contain newlines" |
173 | loc str)))))))) | |
5dcd3e48 AW |
174 | |
175 | (define *keywords* | |
176 | '(("break" . break) | |
177 | ("else" . else) | |
178 | ("new" . new) | |
179 | ("var" . var) | |
180 | ("case" . case) | |
181 | ("finally" . finally) | |
182 | ("return" . return) | |
183 | ("void" . void) | |
184 | ("catch" . catch) | |
185 | ("for" . for) | |
186 | ("switch" . switch) | |
187 | ("while" . while) | |
188 | ("continue" . continue) | |
189 | ("function" . function) | |
190 | ("this" . this) | |
191 | ("with" . with) | |
192 | ("default" . default) | |
193 | ("if" . if) | |
194 | ("throw" . throw) | |
195 | ("delete" . delete) | |
196 | ("in" . in) | |
197 | ("try" . try) | |
198 | ("do" . do) | |
199 | ("instanceof" . instanceof) | |
200 | ("typeof" . typeof) | |
201 | ||
202 | ;; these aren't exactly keywords, but hey | |
203 | ("null" . null) | |
204 | ("true" . true) | |
205 | ("false" . false))) | |
206 | ||
207 | (define *future-reserved-words* | |
208 | '(("abstract" . abstract) | |
209 | ("enum" . enum) | |
210 | ("int" . int) | |
211 | ("short" . short) | |
212 | ("boolean" . boolean) | |
213 | ("export" . export) | |
214 | ("interface" . interface) | |
215 | ("static" . static) | |
216 | ("byte" . byte) | |
217 | ("extends" . extends) | |
218 | ("long" . long) | |
219 | ("super" . super) | |
220 | ("char" . char) | |
221 | ("final" . final) | |
222 | ("native" . native) | |
223 | ("synchronized" . synchronized) | |
224 | ("class" . class) | |
225 | ("float" . float) | |
226 | ("package" . package) | |
227 | ("throws" . throws) | |
228 | ("const" . const) | |
229 | ("goto" . goto) | |
230 | ("private" . private) | |
231 | ("transient" . transient) | |
232 | ("debugger" . debugger) | |
233 | ("implements" . implements) | |
234 | ("protected" . protected) | |
235 | ("volatile" . volatile) | |
236 | ("double" . double) | |
237 | ("import" . import) | |
238 | ("public" . public))) | |
239 | ||
0ecd70a2 | 240 | (define (read-identifier port loc) |
5dcd3e48 AW |
241 | (let lp ((c (peek-char port)) (chars '())) |
242 | (if (or (eof-object? c) | |
243 | (not (or (char-alphabetic? c) | |
244 | (char-numeric? c) | |
245 | (char=? c #\$) | |
246 | (char=? c #\_)))) | |
247 | (let ((word (list->string (reverse chars)))) | |
248 | (cond ((assoc-ref *keywords* word) | |
0ecd70a2 | 249 | => (lambda (x) (make-lexical-token x loc #f))) |
5dcd3e48 | 250 | ((assoc-ref *future-reserved-words* word) |
b8b63433 AW |
251 | (syntax-error "word is reserved for the future, dude." |
252 | loc word)) | |
0ecd70a2 LC |
253 | (else (make-lexical-token 'Identifier loc |
254 | (string->symbol word))))) | |
5dcd3e48 AW |
255 | (begin (read-char port) |
256 | (lp (peek-char port) (cons c chars)))))) | |
257 | ||
b8b63433 | 258 | (define (read-numeric port loc) |
5dcd3e48 AW |
259 | (let* ((c0 (if (char=? (peek-char port) #\.) |
260 | #\0 | |
261 | (read-char port))) | |
262 | (c1 (peek-char port))) | |
263 | (cond | |
264 | ((eof-object? c1) (digit->number c0)) | |
32629820 | 265 | ((and (char=? c0 #\0) (or (char=? c1 #\x) (char=? c1 #\X))) |
5dcd3e48 AW |
266 | (read-char port) |
267 | (let ((c (peek-char port))) | |
268 | (if (not (char-hex? c)) | |
b8b63433 AW |
269 | (syntax-error "bad digit reading hexadecimal number" |
270 | loc c)) | |
5dcd3e48 AW |
271 | (let lp ((c c) (acc 0)) |
272 | (cond ((char-hex? c) | |
273 | (read-char port) | |
274 | (lp (peek-char port) | |
275 | (+ (* 16 acc) (hex->number c)))) | |
276 | (else | |
277 | acc))))) | |
278 | ((and (char=? c0 #\0) (char-numeric? c1)) | |
279 | (let lp ((c c1) (acc 0)) | |
280 | (cond ((eof-object? c) acc) | |
281 | ((char-numeric? c) | |
282 | (if (or (char=? c #\8) (char=? c #\9)) | |
b8b63433 AW |
283 | (syntax-error "invalid digit in octal sequence" |
284 | loc c)) | |
5dcd3e48 AW |
285 | (read-char port) |
286 | (lp (peek-char port) | |
287 | (+ (* 8 acc) (digit->number c)))) | |
288 | (else | |
289 | acc)))) | |
290 | (else | |
291 | (let lp ((c1 c1) (acc (digit->number c0))) | |
292 | (cond | |
293 | ((eof-object? c1) acc) | |
294 | ((char-numeric? c1) | |
295 | (read-char port) | |
296 | (lp (peek-char port) | |
297 | (+ (* 10 acc) (digit->number c1)))) | |
298 | ((or (char=? c1 #\e) (char=? c1 #\E)) | |
299 | (read-char port) | |
300 | (let ((add (let ((c (peek-char port))) | |
b8b63433 AW |
301 | (cond ((eof-object? c) |
302 | (syntax-error "error reading exponent: EOF" | |
303 | loc #f)) | |
5dcd3e48 AW |
304 | ((char=? c #\+) (read-char port) +) |
305 | ((char=? c #\-) (read-char port) -) | |
306 | ((char-numeric? c) +) | |
b8b63433 AW |
307 | (else |
308 | (syntax-error "error reading exponent: non-digit" | |
309 | loc c)))))) | |
5dcd3e48 AW |
310 | (let lp ((c (peek-char port)) (e 0)) |
311 | (cond ((and (not (eof-object? c)) (char-numeric? c)) | |
312 | (read-char port) | |
313 | (lp (peek-char port) (add (* 10 e) (digit->number c)))) | |
314 | (else | |
315 | (* (if (negative? e) (* acc 1.0) acc) (expt 10 e))))))) | |
316 | ((char=? c1 #\.) | |
317 | (read-char port) | |
318 | (let lp2 ((c (peek-char port)) (dec 0.0) (n -1)) | |
319 | (cond ((and (not (eof-object? c)) (char-numeric? c)) | |
320 | (read-char port) | |
321 | (lp2 (peek-char port) | |
322 | (+ dec (* (digit->number c) (expt 10 n))) | |
323 | (1- n))) | |
324 | (else | |
325 | ;; loop back to catch an exponential part | |
326 | (lp c (+ acc dec)))))) | |
327 | (else | |
328 | acc))))))) | |
329 | ||
330 | (define *punctuation* | |
331 | '(("{" . lbrace) | |
332 | ("}" . rbrace) | |
333 | ("(" . lparen) | |
334 | (")" . rparen) | |
335 | ("[" . lbracket) | |
336 | ("]" . rbracket) | |
337 | ("." . dot) | |
338 | (";" . semicolon) | |
339 | ("," . comma) | |
340 | ("<" . <) | |
341 | (">" . >) | |
342 | ("<=" . <=) | |
343 | (">=" . >=) | |
344 | ("==" . ==) | |
345 | ("!=" . !=) | |
346 | ("===" . ===) | |
347 | ("!==" . !==) | |
348 | ("+" . +) | |
349 | ("-" . -) | |
350 | ("*" . *) | |
351 | ("%" . %) | |
352 | ("++" . ++) | |
353 | ("--" . --) | |
354 | ("<<" . <<) | |
355 | (">>" . >>) | |
356 | (">>>" . >>>) | |
357 | ("&" . &) | |
358 | ("|" . bor) | |
359 | ("^" . ^) | |
360 | ("!" . !) | |
361 | ("~" . ~) | |
362 | ("&&" . &&) | |
363 | ("||" . or) | |
364 | ("?" . ?) | |
365 | (":" . colon) | |
366 | ("=" . =) | |
367 | ("+=" . +=) | |
368 | ("-=" . -=) | |
369 | ("*=" . *=) | |
370 | ("%=" . %=) | |
371 | ("<<=" . <<=) | |
372 | (">>=" . >>=) | |
373 | (">>>=" . >>>=) | |
374 | ("&=" . &=) | |
375 | ("|=" . bor=) | |
376 | ("^=" . ^=))) | |
377 | ||
378 | (define *div-punctuation* | |
379 | '(("/" . /) | |
380 | ("/=" . /=))) | |
381 | ||
382 | ;; node ::= (char (symbol | #f) node*) | |
383 | (define read-punctuation | |
384 | (let ((punc-tree (let lp ((nodes '()) (puncs *punctuation*)) | |
385 | (cond ((null? puncs) | |
386 | nodes) | |
387 | ((assv-ref nodes (string-ref (caar puncs) 0)) | |
388 | => (lambda (node-tail) | |
389 | (if (= (string-length (caar puncs)) 1) | |
390 | (set-car! node-tail (cdar puncs)) | |
391 | (set-cdr! node-tail | |
392 | (lp (cdr node-tail) | |
393 | `((,(substring (caar puncs) 1) | |
394 | . ,(cdar puncs)))))) | |
395 | (lp nodes (cdr puncs)))) | |
396 | (else | |
eb1482ac | 397 | (lp (cons (list (string-ref (caar puncs) 0) #f) nodes) |
5dcd3e48 | 398 | puncs)))))) |
0ecd70a2 | 399 | (lambda (port loc) |
5dcd3e48 AW |
400 | (let lp ((c (peek-char port)) (tree punc-tree) (candidate #f)) |
401 | (cond | |
402 | ((assv-ref tree c) | |
403 | => (lambda (node-tail) | |
404 | (read-char port) | |
405 | (lp (peek-char port) (cdr node-tail) (car node-tail)))) | |
406 | (candidate | |
0ecd70a2 | 407 | (make-lexical-token candidate loc #f)) |
5dcd3e48 | 408 | (else |
b8b63433 | 409 | (syntax-error "bad syntax: character not allowed" loc c))))))) |
5dcd3e48 AW |
410 | |
411 | (define (next-token port div?) | |
0ecd70a2 | 412 | (let ((c (peek-char port)) |
b8b63433 AW |
413 | (loc (port-source-location port))) |
414 | (case c | |
110f6521 | 415 | ((#\ht #\vt #\np #\space #\x00A0) ; whitespace |
b8b63433 AW |
416 | (read-char port) |
417 | (next-token port div?)) | |
418 | ((#\newline #\cr) ; line break | |
419 | (read-char port) | |
420 | (next-token port div?)) | |
421 | ((#\/) | |
422 | ;; division, single comment, double comment, or regexp | |
423 | (read-slash port loc div?)) | |
424 | ((#\" #\') ; string literal | |
425 | (read-string port loc)) | |
426 | (else | |
427 | (cond | |
428 | ((eof-object? c) | |
429 | '*eoi*) | |
430 | ((or (char-alphabetic? c) | |
431 | (char=? c #\$) | |
432 | (char=? c #\_)) | |
433 | ;; reserved word or identifier | |
434 | (read-identifier port loc)) | |
435 | ((char-numeric? c) | |
436 | ;; numeric -- also accept . FIXME, requires lookahead | |
437 | (make-lexical-token 'NumericLiteral loc (read-numeric port loc))) | |
438 | (else | |
439 | ;; punctuation | |
440 | (read-punctuation port loc))))))) | |
5dcd3e48 AW |
441 | |
442 | (define (make-tokenizer port) | |
443 | (let ((div? #f)) | |
444 | (lambda () | |
445 | (let ((tok (next-token port div?))) | |
b8b63433 AW |
446 | (set! div? (and (lexical-token? tok) |
447 | (let ((cat (lexical-token-category tok))) | |
448 | (or (eq? cat 'Identifier) | |
449 | (eq? cat 'NumericLiteral) | |
450 | (eq? cat 'StringLiteral))))) | |
5dcd3e48 AW |
451 | tok)))) |
452 | ||
453 | (define (make-tokenizer/1 port) | |
454 | (let ((div? #f) | |
455 | (eoi? #f) | |
456 | (stack '())) | |
457 | (lambda () | |
458 | (if eoi? | |
459 | '*eoi* | |
460 | (let ((tok (next-token port div?))) | |
0ecd70a2 | 461 | (case (if (lexical-token? tok) (lexical-token-category tok) tok) |
5dcd3e48 | 462 | ((lparen) |
a608cad2 | 463 | (set! stack (cons tok stack))) |
5dcd3e48 | 464 | ((rparen) |
a608cad2 AW |
465 | (if (and (pair? stack) |
466 | (eq? (lexical-token-category (car stack)) 'lparen)) | |
5dcd3e48 | 467 | (set! stack (cdr stack)) |
b8b63433 AW |
468 | (syntax-error "unexpected right parenthesis" |
469 | (lexical-token-source tok) | |
470 | #f))) | |
5dcd3e48 | 471 | ((lbracket) |
a608cad2 | 472 | (set! stack (cons tok stack))) |
5dcd3e48 | 473 | ((rbracket) |
a608cad2 AW |
474 | (if (and (pair? stack) |
475 | (eq? (lexical-token-category (car stack)) 'lbracket)) | |
5dcd3e48 | 476 | (set! stack (cdr stack)) |
b8b63433 AW |
477 | (syntax-error "unexpected right bracket" |
478 | (lexical-token-source tok) | |
479 | #f))) | |
5dcd3e48 | 480 | ((lbrace) |
a608cad2 | 481 | (set! stack (cons tok stack))) |
5dcd3e48 | 482 | ((rbrace) |
a608cad2 AW |
483 | (if (and (pair? stack) |
484 | (eq? (lexical-token-category (car stack)) 'lbrace)) | |
5dcd3e48 | 485 | (set! stack (cdr stack)) |
b8b63433 AW |
486 | (syntax-error "unexpected right brace" |
487 | (lexical-token-source tok) | |
488 | #f))) | |
5dcd3e48 AW |
489 | ((semicolon) |
490 | (set! eoi? (null? stack)))) | |
0ecd70a2 LC |
491 | (set! div? (and (lexical-token? tok) |
492 | (let ((cat (lexical-token-category tok))) | |
493 | (or (eq? cat 'Identifier) | |
494 | (eq? cat 'NumericLiteral) | |
495 | (eq? cat 'StringLiteral))))) | |
5dcd3e48 AW |
496 | tok))))) |
497 | ||
498 | (define (tokenize port) | |
499 | (let ((next (make-tokenizer port))) | |
500 | (let lp ((out '())) | |
501 | (let ((tok (next))) | |
502 | (if (eq? tok '*eoi*) | |
503 | (reverse! out) | |
504 | (lp (cons tok out))))))) | |
505 | ||
506 | (define (tokenize/1 port) | |
507 | (let ((next (make-tokenizer/1 port))) | |
508 | (let lp ((out '())) | |
509 | (let ((tok (next))) | |
510 | (if (eq? tok '*eoi*) | |
511 | (reverse! out) | |
512 | (lp (cons tok out))))))) | |
513 |