Merge commit 'de1eb420a5a95b17e85b19c4d98c869036e9ecb0'
[bpt/guile.git] / module / language / ecmascript / tokenize.scm
1 ;;; ECMAScript for Guile
2
3 ;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
4
5 ;;;; This library is free software; you can redistribute it and/or
6 ;;;; modify it under the terms of the GNU Lesser General Public
7 ;;;; License as published by the Free Software Foundation; either
8 ;;;; version 3 of the License, or (at your option) any later version.
9 ;;;;
10 ;;;; This library is distributed in the hope that it will be useful,
11 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 ;;;; Lesser General Public License for more details.
14 ;;;;
15 ;;;; You should have received a copy of the GNU Lesser General Public
16 ;;;; License along with this library; if not, write to the Free Software
17 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
19 ;;; Code:
20
21 (define-module (language ecmascript tokenize)
22 #:use-module (ice-9 rdelim)
23 #:use-module ((srfi srfi-1) #:select (unfold-right))
24 #:use-module (system base lalr)
25 #:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1))
26
27 (define (syntax-error what loc form . args)
28 (throw 'syntax-error #f what
29 (and=> loc source-location->source-properties)
30 form #f args))
31
32 (define (port-source-location port)
33 (make-source-location (port-filename port)
34 (port-line port)
35 (port-column port)
36 (false-if-exception (ftell port))
37 #f))
38
39 ;; taken from SSAX, sorta
40 (define (read-until delims port loc)
41 (if (eof-object? (peek-char port))
42 (syntax-error "EOF while reading a token" loc #f)
43 (let ((token (read-delimited delims port 'peek)))
44 (if (eof-object? (peek-char port))
45 (syntax-error "EOF while reading a token" loc token)
46 token))))
47
48 (define (char-hex? c)
49 (and (not (eof-object? c))
50 (or (char-numeric? c)
51 (memv c '(#\a #\b #\c #\d #\e #\f))
52 (memv c '(#\A #\B #\C #\D #\E #\F)))))
53
54 (define (digit->number c)
55 (- (char->integer c) (char->integer #\0)))
56
57 (define (hex->number c)
58 (if (char-numeric? c)
59 (digit->number c)
60 (+ 10 (- (char->integer (char-downcase c)) (char->integer #\a)))))
61
62 (define (read-slash port loc div?)
63 (let ((c1 (begin
64 (read-char port)
65 (peek-char port))))
66 (cond
67 ((eof-object? c1)
68 ;; hmm. error if we're not looking for a div? ?
69 (make-lexical-token '/ loc #f))
70 ((char=? c1 #\/)
71 (read-line port)
72 (next-token port div?))
73 ((char=? c1 #\*)
74 (read-char port)
75 (let lp ((c (read-char port)))
76 (cond
77 ((eof-object? c)
78 (syntax-error "EOF while in multi-line comment" loc #f))
79 ((char=? c #\*)
80 (if (eqv? (peek-char port) #\/)
81 (begin
82 (read-char port)
83 (next-token port div?))
84 (lp (read-char port))))
85 (else
86 (lp (read-char port))))))
87 (div?
88 (case c1
89 ((#\=) (read-char port) (make-lexical-token '/= loc #f))
90 (else (make-lexical-token '/ loc #f))))
91 (else
92 (read-regexp port loc)))))
93
94 (define (read-regexp port loc)
95 ;; first slash already read
96 (let ((terms (string #\/ #\\ #\nl #\cr)))
97 (let lp ((str (read-until terms port loc)) (head ""))
98 (let ((terminator (peek-char port)))
99 (cond
100 ((char=? terminator #\/)
101 (read-char port)
102 ;; flags
103 (let lp ((c (peek-char port)) (flags '()))
104 (if (or (eof-object? c)
105 (not (or (char-alphabetic? c)
106 (char-numeric? c)
107 (char=? c #\$)
108 (char=? c #\_))))
109 (make-lexical-token 'RegexpLiteral loc
110 (cons (string-append head str)
111 (reverse flags)))
112 (begin (read-char port)
113 (lp (peek-char port) (cons c flags))))))
114 ((char=? terminator #\\)
115 (read-char port)
116 (let ((echar (read-char port)))
117 (lp (read-until terms port loc)
118 (string-append head str (string #\\ echar)))))
119 (else
120 (syntax-error "regexp literals may not contain newlines"
121 loc str)))))))
122
123 (define (read-string port loc)
124 (let ((c (read-char port)))
125 (let ((terms (string c #\\ #\nl #\cr)))
126 (define (read-escape port)
127 (let ((c (read-char port)))
128 (case c
129 ((#\' #\" #\\) c)
130 ((#\b) #\bs)
131 ((#\f) #\np)
132 ((#\n) #\nl)
133 ((#\r) #\cr)
134 ((#\t) #\tab)
135 ((#\v) #\vt)
136 ((#\0)
137 (let ((next (peek-char port)))
138 (cond
139 ((eof-object? next) #\nul)
140 ((char-numeric? next)
141 (syntax-error "octal escape sequences are not supported"
142 loc #f))
143 (else #\nul))))
144 ((#\x)
145 (let* ((a (read-char port))
146 (b (read-char port)))
147 (cond
148 ((and (char-hex? a) (char-hex? b))
149 (integer->char (+ (* 16 (hex->number a)) (hex->number b))))
150 (else
151 (syntax-error "bad hex character escape" loc (string a b))))))
152 ((#\u)
153 (let* ((a (read-char port))
154 (b (read-char port))
155 (c (read-char port))
156 (d (read-char port)))
157 (integer->char (string->number (string a b c d) 16))))
158 (else
159 c))))
160 (let lp ((str (read-until terms port loc)))
161 (let ((terminator (peek-char port)))
162 (cond
163 ((char=? terminator c)
164 (read-char port)
165 (make-lexical-token 'StringLiteral loc str))
166 ((char=? terminator #\\)
167 (read-char port)
168 (let ((echar (read-escape port)))
169 (lp (string-append str (string echar)
170 (read-until terms port loc)))))
171 (else
172 (syntax-error "string literals may not contain newlines"
173 loc str))))))))
174
175 (define *keywords*
176 '(("break" . break)
177 ("else" . else)
178 ("new" . new)
179 ("var" . var)
180 ("case" . case)
181 ("finally" . finally)
182 ("return" . return)
183 ("void" . void)
184 ("catch" . catch)
185 ("for" . for)
186 ("switch" . switch)
187 ("while" . while)
188 ("continue" . continue)
189 ("function" . function)
190 ("this" . this)
191 ("with" . with)
192 ("default" . default)
193 ("if" . if)
194 ("throw" . throw)
195 ("delete" . delete)
196 ("in" . in)
197 ("try" . try)
198 ("do" . do)
199 ("instanceof" . instanceof)
200 ("typeof" . typeof)
201
202 ;; these aren't exactly keywords, but hey
203 ("null" . null)
204 ("true" . true)
205 ("false" . false)))
206
207 (define *future-reserved-words*
208 '(("abstract" . abstract)
209 ("enum" . enum)
210 ("int" . int)
211 ("short" . short)
212 ("boolean" . boolean)
213 ("export" . export)
214 ("interface" . interface)
215 ("static" . static)
216 ("byte" . byte)
217 ("extends" . extends)
218 ("long" . long)
219 ("super" . super)
220 ("char" . char)
221 ("final" . final)
222 ("native" . native)
223 ("synchronized" . synchronized)
224 ("class" . class)
225 ("float" . float)
226 ("package" . package)
227 ("throws" . throws)
228 ("const" . const)
229 ("goto" . goto)
230 ("private" . private)
231 ("transient" . transient)
232 ("debugger" . debugger)
233 ("implements" . implements)
234 ("protected" . protected)
235 ("volatile" . volatile)
236 ("double" . double)
237 ("import" . import)
238 ("public" . public)))
239
240 (define (read-identifier port loc)
241 (let lp ((c (peek-char port)) (chars '()))
242 (if (or (eof-object? c)
243 (not (or (char-alphabetic? c)
244 (char-numeric? c)
245 (char=? c #\$)
246 (char=? c #\_))))
247 (let ((word (list->string (reverse chars))))
248 (cond ((assoc-ref *keywords* word)
249 => (lambda (x) (make-lexical-token x loc #f)))
250 ((assoc-ref *future-reserved-words* word)
251 (syntax-error "word is reserved for the future, dude."
252 loc word))
253 (else (make-lexical-token 'Identifier loc
254 (string->symbol word)))))
255 (begin (read-char port)
256 (lp (peek-char port) (cons c chars))))))
257
258 (define (read-numeric port loc)
259 (let* ((c0 (if (char=? (peek-char port) #\.)
260 #\0
261 (read-char port)))
262 (c1 (peek-char port)))
263 (cond
264 ((eof-object? c1) (digit->number c0))
265 ((and (char=? c0 #\0) (or (char=? c1 #\x) (char=? c1 #\X)))
266 (read-char port)
267 (let ((c (peek-char port)))
268 (if (not (char-hex? c))
269 (syntax-error "bad digit reading hexadecimal number"
270 loc c))
271 (let lp ((c c) (acc 0))
272 (cond ((char-hex? c)
273 (read-char port)
274 (lp (peek-char port)
275 (+ (* 16 acc) (hex->number c))))
276 (else
277 acc)))))
278 ((and (char=? c0 #\0) (char-numeric? c1))
279 (let lp ((c c1) (acc 0))
280 (cond ((eof-object? c) acc)
281 ((char-numeric? c)
282 (if (or (char=? c #\8) (char=? c #\9))
283 (syntax-error "invalid digit in octal sequence"
284 loc c))
285 (read-char port)
286 (lp (peek-char port)
287 (+ (* 8 acc) (digit->number c))))
288 (else
289 acc))))
290 (else
291 (let lp ((c1 c1) (acc (digit->number c0)))
292 (cond
293 ((eof-object? c1) acc)
294 ((char-numeric? c1)
295 (read-char port)
296 (lp (peek-char port)
297 (+ (* 10 acc) (digit->number c1))))
298 ((or (char=? c1 #\e) (char=? c1 #\E))
299 (read-char port)
300 (let ((add (let ((c (peek-char port)))
301 (cond ((eof-object? c)
302 (syntax-error "error reading exponent: EOF"
303 loc #f))
304 ((char=? c #\+) (read-char port) +)
305 ((char=? c #\-) (read-char port) -)
306 ((char-numeric? c) +)
307 (else
308 (syntax-error "error reading exponent: non-digit"
309 loc c))))))
310 (let lp ((c (peek-char port)) (e 0))
311 (cond ((and (not (eof-object? c)) (char-numeric? c))
312 (read-char port)
313 (lp (peek-char port) (add (* 10 e) (digit->number c))))
314 (else
315 (* (if (negative? e) (* acc 1.0) acc) (expt 10 e)))))))
316 ((char=? c1 #\.)
317 (read-char port)
318 (let lp2 ((c (peek-char port)) (dec 0.0) (n -1))
319 (cond ((and (not (eof-object? c)) (char-numeric? c))
320 (read-char port)
321 (lp2 (peek-char port)
322 (+ dec (* (digit->number c) (expt 10 n)))
323 (1- n)))
324 (else
325 ;; loop back to catch an exponential part
326 (lp c (+ acc dec))))))
327 (else
328 acc)))))))
329
330 (define *punctuation*
331 '(("{" . lbrace)
332 ("}" . rbrace)
333 ("(" . lparen)
334 (")" . rparen)
335 ("[" . lbracket)
336 ("]" . rbracket)
337 ("." . dot)
338 (";" . semicolon)
339 ("," . comma)
340 ("<" . <)
341 (">" . >)
342 ("<=" . <=)
343 (">=" . >=)
344 ("==" . ==)
345 ("!=" . !=)
346 ("===" . ===)
347 ("!==" . !==)
348 ("+" . +)
349 ("-" . -)
350 ("*" . *)
351 ("%" . %)
352 ("++" . ++)
353 ("--" . --)
354 ("<<" . <<)
355 (">>" . >>)
356 (">>>" . >>>)
357 ("&" . &)
358 ("|" . bor)
359 ("^" . ^)
360 ("!" . !)
361 ("~" . ~)
362 ("&&" . &&)
363 ("||" . or)
364 ("?" . ?)
365 (":" . colon)
366 ("=" . =)
367 ("+=" . +=)
368 ("-=" . -=)
369 ("*=" . *=)
370 ("%=" . %=)
371 ("<<=" . <<=)
372 (">>=" . >>=)
373 (">>>=" . >>>=)
374 ("&=" . &=)
375 ("|=" . bor=)
376 ("^=" . ^=)))
377
378 (define *div-punctuation*
379 '(("/" . /)
380 ("/=" . /=)))
381
382 ;; node ::= (char (symbol | #f) node*)
383 (define read-punctuation
384 (let ((punc-tree (let lp ((nodes '()) (puncs *punctuation*))
385 (cond ((null? puncs)
386 nodes)
387 ((assv-ref nodes (string-ref (caar puncs) 0))
388 => (lambda (node-tail)
389 (if (= (string-length (caar puncs)) 1)
390 (set-car! node-tail (cdar puncs))
391 (set-cdr! node-tail
392 (lp (cdr node-tail)
393 `((,(substring (caar puncs) 1)
394 . ,(cdar puncs))))))
395 (lp nodes (cdr puncs))))
396 (else
397 (lp (cons (list (string-ref (caar puncs) 0) #f) nodes)
398 puncs))))))
399 (lambda (port loc)
400 (let lp ((c (peek-char port)) (tree punc-tree) (candidate #f))
401 (cond
402 ((assv-ref tree c)
403 => (lambda (node-tail)
404 (read-char port)
405 (lp (peek-char port) (cdr node-tail) (car node-tail))))
406 (candidate
407 (make-lexical-token candidate loc #f))
408 (else
409 (syntax-error "bad syntax: character not allowed" loc c)))))))
410
411 (define (next-token port div?)
412 (let ((c (peek-char port))
413 (loc (port-source-location port)))
414 (case c
415 ((#\ht #\vt #\np #\space #\x00A0) ; whitespace
416 (read-char port)
417 (next-token port div?))
418 ((#\newline #\cr) ; line break
419 (read-char port)
420 (next-token port div?))
421 ((#\/)
422 ;; division, single comment, double comment, or regexp
423 (read-slash port loc div?))
424 ((#\" #\') ; string literal
425 (read-string port loc))
426 (else
427 (cond
428 ((eof-object? c)
429 '*eoi*)
430 ((or (char-alphabetic? c)
431 (char=? c #\$)
432 (char=? c #\_))
433 ;; reserved word or identifier
434 (read-identifier port loc))
435 ((char-numeric? c)
436 ;; numeric -- also accept . FIXME, requires lookahead
437 (make-lexical-token 'NumericLiteral loc (read-numeric port loc)))
438 (else
439 ;; punctuation
440 (read-punctuation port loc)))))))
441
442 (define (make-tokenizer port)
443 (let ((div? #f))
444 (lambda ()
445 (let ((tok (next-token port div?)))
446 (set! div? (and (lexical-token? tok)
447 (let ((cat (lexical-token-category tok)))
448 (or (eq? cat 'Identifier)
449 (eq? cat 'NumericLiteral)
450 (eq? cat 'StringLiteral)))))
451 tok))))
452
453 (define (make-tokenizer/1 port)
454 (let ((div? #f)
455 (eoi? #f)
456 (stack '()))
457 (lambda ()
458 (if eoi?
459 '*eoi*
460 (let ((tok (next-token port div?)))
461 (case (if (lexical-token? tok) (lexical-token-category tok) tok)
462 ((lparen)
463 (set! stack (cons tok stack)))
464 ((rparen)
465 (if (and (pair? stack)
466 (eq? (lexical-token-category (car stack)) 'lparen))
467 (set! stack (cdr stack))
468 (syntax-error "unexpected right parenthesis"
469 (lexical-token-source tok)
470 #f)))
471 ((lbracket)
472 (set! stack (cons tok stack)))
473 ((rbracket)
474 (if (and (pair? stack)
475 (eq? (lexical-token-category (car stack)) 'lbracket))
476 (set! stack (cdr stack))
477 (syntax-error "unexpected right bracket"
478 (lexical-token-source tok)
479 #f)))
480 ((lbrace)
481 (set! stack (cons tok stack)))
482 ((rbrace)
483 (if (and (pair? stack)
484 (eq? (lexical-token-category (car stack)) 'lbrace))
485 (set! stack (cdr stack))
486 (syntax-error "unexpected right brace"
487 (lexical-token-source tok)
488 #f)))
489 ((semicolon)
490 (set! eoi? (null? stack))))
491 (set! div? (and (lexical-token? tok)
492 (let ((cat (lexical-token-category tok)))
493 (or (eq? cat 'Identifier)
494 (eq? cat 'NumericLiteral)
495 (eq? cat 'StringLiteral)))))
496 tok)))))
497
498 (define (tokenize port)
499 (let ((next (make-tokenizer port)))
500 (let lp ((out '()))
501 (let ((tok (next)))
502 (if (eq? tok '*eoi*)
503 (reverse! out)
504 (lp (cons tok out)))))))
505
506 (define (tokenize/1 port)
507 (let ((next (make-tokenizer/1 port)))
508 (let lp ((out '()))
509 (let ((tok (next)))
510 (if (eq? tok '*eoi*)
511 (reverse! out)
512 (lp (cons tok out)))))))
513