Adapt ECMAScript parser and lexer to `(system base lalr)'.
[bpt/guile.git] / module / language / ecmascript / tokenize.scm
CommitLineData
5dcd3e48
AW
1;;; ECMAScript for Guile
2
0ecd70a2 3;; Copyright (C) 2009, 2010 Free Software Foundation, Inc.
5dcd3e48 4
53befeb7
NJ
5;;;; This library is free software; you can redistribute it and/or
6;;;; modify it under the terms of the GNU Lesser General Public
7;;;; License as published by the Free Software Foundation; either
8;;;; version 3 of the License, or (at your option) any later version.
9;;;;
10;;;; This library is distributed in the hope that it will be useful,
11;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13;;;; Lesser General Public License for more details.
14;;;;
15;;;; You should have received a copy of the GNU Lesser General Public
16;;;; License along with this library; if not, write to the Free Software
17;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5dcd3e48
AW
18
19;;; Code:
20
21(define-module (language ecmascript tokenize)
22 #:use-module (ice-9 rdelim)
23 #:use-module ((srfi srfi-1) #:select (unfold-right))
0ecd70a2 24 #:use-module (system base lalr)
5dcd3e48
AW
25 #:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1))
26
0b229e81
AW
27(define (syntax-error message . args)
28 (apply throw 'SyntaxError message args))
29
5dcd3e48
AW
30;; taken from SSAX, sorta
31(define (read-until delims port)
32 (if (eof-object? (peek-char port))
0b229e81 33 (syntax-error "EOF while reading a token")
5dcd3e48
AW
34 (let ((token (read-delimited delims port 'peek)))
35 (if (eof-object? (peek-char port))
0b229e81 36 (syntax-error "EOF while reading a token")
5dcd3e48
AW
37 token))))
38
39(define (char-hex? c)
40 (and (not (eof-object? c))
41 (or (char-numeric? c)
42 (memv c '(#\a #\b #\c #\d #\e #\f))
43 (memv c '(#\A #\B #\C #\D #\E #\F)))))
44
45(define (digit->number c)
46 (- (char->integer c) (char->integer #\0)))
47
48(define (hex->number c)
49 (if (char-numeric? c)
50 (digit->number c)
51 (+ 10 (- (char->integer (char-downcase c)) (char->integer #\a)))))
52
53(define (read-slash port div?)
e5f5113c
LC
54 (let ((c1 (begin
55 (read-char port)
56 (peek-char port))))
5dcd3e48
AW
57 (cond
58 ((eof-object? c1)
59 ;; hmm. error if we're not looking for a div? ?
60 '(/ . #f))
61 ((char=? c1 #\/)
62 (read-line port)
63 (next-token port div?))
64 ((char=? c1 #\*)
65 (read-char port)
66 (let lp ((c (read-char port)))
67 (cond
0b229e81 68 ((eof-object? c) (syntax-error "EOF while in multi-line comment"))
5dcd3e48
AW
69 ((char=? c #\*)
70 (if (eqv? (peek-char port) #\/)
71 (begin
72 (read-char port)
73 (next-token port div?))
74 (lp (read-char port))))
75 (else
76 (lp (read-char port))))))
77 (div?
78 (case c1
0ecd70a2
LC
79 ((#\=) (read-char port) (make-lexical-token '/= #f #f))
80 (else (make-lexical-token '/ #f #f))))
5dcd3e48
AW
81 (else
82 (read-regexp port)))))
83
84(define (read-regexp port)
85 ;; first slash already read
86 (let ((terms (string #\/ #\\ #\nl #\cr)))
87 (let lp ((str (read-until terms port)) (head ""))
88 (let ((terminator (peek-char port)))
89 (cond
90 ((char=? terminator #\/)
91 (read-char port)
92 ;; flags
93 (let lp ((c (peek-char port)) (flags '()))
94 (if (or (eof-object? c)
95 (not (or (char-alphabetic? c)
96 (char-numeric? c)
97 (char=? c #\$)
98 (char=? c #\_))))
0ecd70a2
LC
99 (make-lexical-token 'RegexpLiteral #f
100 (cons (string-append head str)
101 (reverse flags)))
5dcd3e48
AW
102 (begin (read-char port)
103 (lp (peek-char port) (cons c flags))))))
104 ((char=? terminator #\\)
105 (read-char port)
106 (let ((echar (read-char port)))
107 (lp (read-until terms port)
108 (string-append head str (string #\\ echar)))))
109 (else
0b229e81 110 (syntax-error "regexp literals may not contain newlines" str)))))))
5dcd3e48
AW
111
112(define (read-string port)
113 (let ((c (read-char port)))
114 (let ((terms (string c #\\ #\nl #\cr)))
115 (define (read-escape port)
116 (let ((c (read-char port)))
117 (case c
118 ((#\' #\" #\\) c)
119 ((#\b) #\bs)
120 ((#\f) #\np)
121 ((#\n) #\nl)
122 ((#\r) #\cr)
123 ((#\t) #\tab)
124 ((#\v) #\vt)
125 ((#\0)
126 (let ((next (peek-char port)))
127 (cond ((eof-object? next) #\nul)
128 ((char-numeric? next)
0b229e81 129 (syntax-error "octal escape sequences are not supported"))
5dcd3e48
AW
130 (else #\nul))))
131 ((#\x)
132 (let* ((a (read-char port))
133 (b (read-char port)))
134 (cond
135 ((and (char-hex? a) (char-hex? b))
136 (integer->char (+ (* 16 (hex->number a)) (hex->number b))))
137 (else
0b229e81 138 (syntax-error "bad hex character escape" a b)))))
5dcd3e48 139 ((#\u)
0b229e81 140 (syntax-error "unicode not supported"))
5dcd3e48
AW
141 (else
142 c))))
143 (let lp ((str (read-until terms port)))
144 (let ((terminator (peek-char port)))
145 (cond
146 ((char=? terminator c)
147 (read-char port)
148 str)
149 ((char=? terminator #\\)
150 (read-char port)
151 (let ((echar (read-escape port)))
152 (lp (string-append str (string echar)
153 (read-until terms port)))))
154 (else
0b229e81 155 (syntax-error "string literals may not contain newlines" str))))))))
5dcd3e48
AW
156
157(define *keywords*
158 '(("break" . break)
159 ("else" . else)
160 ("new" . new)
161 ("var" . var)
162 ("case" . case)
163 ("finally" . finally)
164 ("return" . return)
165 ("void" . void)
166 ("catch" . catch)
167 ("for" . for)
168 ("switch" . switch)
169 ("while" . while)
170 ("continue" . continue)
171 ("function" . function)
172 ("this" . this)
173 ("with" . with)
174 ("default" . default)
175 ("if" . if)
176 ("throw" . throw)
177 ("delete" . delete)
178 ("in" . in)
179 ("try" . try)
180 ("do" . do)
181 ("instanceof" . instanceof)
182 ("typeof" . typeof)
183
184 ;; these aren't exactly keywords, but hey
185 ("null" . null)
186 ("true" . true)
187 ("false" . false)))
188
189(define *future-reserved-words*
190 '(("abstract" . abstract)
191 ("enum" . enum)
192 ("int" . int)
193 ("short" . short)
194 ("boolean" . boolean)
195 ("export" . export)
196 ("interface" . interface)
197 ("static" . static)
198 ("byte" . byte)
199 ("extends" . extends)
200 ("long" . long)
201 ("super" . super)
202 ("char" . char)
203 ("final" . final)
204 ("native" . native)
205 ("synchronized" . synchronized)
206 ("class" . class)
207 ("float" . float)
208 ("package" . package)
209 ("throws" . throws)
210 ("const" . const)
211 ("goto" . goto)
212 ("private" . private)
213 ("transient" . transient)
214 ("debugger" . debugger)
215 ("implements" . implements)
216 ("protected" . protected)
217 ("volatile" . volatile)
218 ("double" . double)
219 ("import" . import)
220 ("public" . public)))
221
0ecd70a2 222(define (read-identifier port loc)
5dcd3e48
AW
223 (let lp ((c (peek-char port)) (chars '()))
224 (if (or (eof-object? c)
225 (not (or (char-alphabetic? c)
226 (char-numeric? c)
227 (char=? c #\$)
228 (char=? c #\_))))
229 (let ((word (list->string (reverse chars))))
230 (cond ((assoc-ref *keywords* word)
0ecd70a2 231 => (lambda (x) (make-lexical-token x loc #f)))
5dcd3e48 232 ((assoc-ref *future-reserved-words* word)
0b229e81 233 (syntax-error "word is reserved for the future, dude." word))
0ecd70a2
LC
234 (else (make-lexical-token 'Identifier loc
235 (string->symbol word)))))
5dcd3e48
AW
236 (begin (read-char port)
237 (lp (peek-char port) (cons c chars))))))
238
239(define (read-numeric port)
240 (let* ((c0 (if (char=? (peek-char port) #\.)
241 #\0
242 (read-char port)))
243 (c1 (peek-char port)))
244 (cond
245 ((eof-object? c1) (digit->number c0))
246 ((and (char=? c0 #\0) (char=? c1 #\x))
247 (read-char port)
248 (let ((c (peek-char port)))
249 (if (not (char-hex? c))
0b229e81 250 (syntax-error "bad digit reading hexadecimal number" c))
5dcd3e48
AW
251 (let lp ((c c) (acc 0))
252 (cond ((char-hex? c)
253 (read-char port)
254 (lp (peek-char port)
255 (+ (* 16 acc) (hex->number c))))
256 (else
257 acc)))))
258 ((and (char=? c0 #\0) (char-numeric? c1))
259 (let lp ((c c1) (acc 0))
260 (cond ((eof-object? c) acc)
261 ((char-numeric? c)
262 (if (or (char=? c #\8) (char=? c #\9))
0b229e81 263 (syntax-error "invalid digit in octal sequence" c))
5dcd3e48
AW
264 (read-char port)
265 (lp (peek-char port)
266 (+ (* 8 acc) (digit->number c))))
267 (else
268 acc))))
269 (else
270 (let lp ((c1 c1) (acc (digit->number c0)))
271 (cond
272 ((eof-object? c1) acc)
273 ((char-numeric? c1)
274 (read-char port)
275 (lp (peek-char port)
276 (+ (* 10 acc) (digit->number c1))))
277 ((or (char=? c1 #\e) (char=? c1 #\E))
278 (read-char port)
279 (let ((add (let ((c (peek-char port)))
0b229e81 280 (cond ((eof-object? c) (syntax-error "error reading exponent: EOF"))
5dcd3e48
AW
281 ((char=? c #\+) (read-char port) +)
282 ((char=? c #\-) (read-char port) -)
283 ((char-numeric? c) +)
0b229e81
AW
284 (else (syntax-error "error reading exponent: non-digit"
285 c))))))
5dcd3e48
AW
286 (let lp ((c (peek-char port)) (e 0))
287 (cond ((and (not (eof-object? c)) (char-numeric? c))
288 (read-char port)
289 (lp (peek-char port) (add (* 10 e) (digit->number c))))
290 (else
291 (* (if (negative? e) (* acc 1.0) acc) (expt 10 e)))))))
292 ((char=? c1 #\.)
293 (read-char port)
294 (let lp2 ((c (peek-char port)) (dec 0.0) (n -1))
295 (cond ((and (not (eof-object? c)) (char-numeric? c))
296 (read-char port)
297 (lp2 (peek-char port)
298 (+ dec (* (digit->number c) (expt 10 n)))
299 (1- n)))
300 (else
301 ;; loop back to catch an exponential part
302 (lp c (+ acc dec))))))
303 (else
304 acc)))))))
305
306(define *punctuation*
307 '(("{" . lbrace)
308 ("}" . rbrace)
309 ("(" . lparen)
310 (")" . rparen)
311 ("[" . lbracket)
312 ("]" . rbracket)
313 ("." . dot)
314 (";" . semicolon)
315 ("," . comma)
316 ("<" . <)
317 (">" . >)
318 ("<=" . <=)
319 (">=" . >=)
320 ("==" . ==)
321 ("!=" . !=)
322 ("===" . ===)
323 ("!==" . !==)
324 ("+" . +)
325 ("-" . -)
326 ("*" . *)
327 ("%" . %)
328 ("++" . ++)
329 ("--" . --)
330 ("<<" . <<)
331 (">>" . >>)
332 (">>>" . >>>)
333 ("&" . &)
334 ("|" . bor)
335 ("^" . ^)
336 ("!" . !)
337 ("~" . ~)
338 ("&&" . &&)
339 ("||" . or)
340 ("?" . ?)
341 (":" . colon)
342 ("=" . =)
343 ("+=" . +=)
344 ("-=" . -=)
345 ("*=" . *=)
346 ("%=" . %=)
347 ("<<=" . <<=)
348 (">>=" . >>=)
349 (">>>=" . >>>=)
350 ("&=" . &=)
351 ("|=" . bor=)
352 ("^=" . ^=)))
353
354(define *div-punctuation*
355 '(("/" . /)
356 ("/=" . /=)))
357
358;; node ::= (char (symbol | #f) node*)
359(define read-punctuation
360 (let ((punc-tree (let lp ((nodes '()) (puncs *punctuation*))
361 (cond ((null? puncs)
362 nodes)
363 ((assv-ref nodes (string-ref (caar puncs) 0))
364 => (lambda (node-tail)
365 (if (= (string-length (caar puncs)) 1)
366 (set-car! node-tail (cdar puncs))
367 (set-cdr! node-tail
368 (lp (cdr node-tail)
369 `((,(substring (caar puncs) 1)
370 . ,(cdar puncs))))))
371 (lp nodes (cdr puncs))))
372 (else
eb1482ac 373 (lp (cons (list (string-ref (caar puncs) 0) #f) nodes)
5dcd3e48 374 puncs))))))
0ecd70a2 375 (lambda (port loc)
5dcd3e48
AW
376 (let lp ((c (peek-char port)) (tree punc-tree) (candidate #f))
377 (cond
378 ((assv-ref tree c)
379 => (lambda (node-tail)
380 (read-char port)
381 (lp (peek-char port) (cdr node-tail) (car node-tail))))
382 (candidate
0ecd70a2 383 (make-lexical-token candidate loc #f))
5dcd3e48 384 (else
0b229e81 385 (syntax-error "bad syntax: character not allowed" c)))))))
5dcd3e48
AW
386
387(define (next-token port div?)
0ecd70a2
LC
388 (let ((c (peek-char port))
389 (loc (make-source-location (port-filename port)
390 (port-line port)
391 (port-column port)
392 (false-if-exception (seek port 0 SEEK_CUR))
393 #f)))
a3e34104
AW
394 (let ((tok
395 (case c
396 ((#\ht #\vt #\np #\space)
397 ; whitespace
398 (read-char port)
399 (next-token port div?))
400 ((#\newline #\cr)
401 ; line break
402 (read-char port)
403 (next-token port div?))
404 ((#\/)
405 ;; division, single comment, double comment, or regexp
406 (read-slash port div?))
407 ((#\" #\')
408 ; string literal
0ecd70a2 409 (make-lexical-token 'StringLiteral loc (read-string port)))
a3e34104
AW
410 (else
411 (cond
412 ((eof-object? c)
413 '*eoi*)
414 ((or (char-alphabetic? c)
415 (char=? c #\$)
416 (char=? c #\_))
417 ;; reserved word or identifier
0ecd70a2 418 (read-identifier port loc))
a3e34104
AW
419 ((char-numeric? c)
420 ;; numeric -- also accept . FIXME, requires lookahead
0ecd70a2 421 (make-lexical-token 'NumericLiteral loc (read-numeric port)))
a3e34104
AW
422 (else
423 ;; punctuation
0ecd70a2
LC
424 (read-punctuation port loc)))))))
425
a3e34104 426 tok)))
5dcd3e48
AW
427
428(define (make-tokenizer port)
429 (let ((div? #f))
430 (lambda ()
431 (let ((tok (next-token port div?)))
432 (set! div? (and (pair? tok) (eq? (car tok) 'identifier)))
433 tok))))
434
435(define (make-tokenizer/1 port)
436 (let ((div? #f)
437 (eoi? #f)
438 (stack '()))
439 (lambda ()
440 (if eoi?
441 '*eoi*
442 (let ((tok (next-token port div?)))
0ecd70a2 443 (case (if (lexical-token? tok) (lexical-token-category tok) tok)
5dcd3e48 444 ((lparen)
0ecd70a2 445 (set! stack (make-lexical-token 'lparen #f stack)))
5dcd3e48
AW
446 ((rparen)
447 (if (and (pair? stack) (eq? (car stack) 'lparen))
448 (set! stack (cdr stack))
0b229e81 449 (syntax-error "unexpected right parenthesis")))
5dcd3e48 450 ((lbracket)
0ecd70a2 451 (set! stack (make-lexical-token 'lbracket #f stack)))
5dcd3e48
AW
452 ((rbracket)
453 (if (and (pair? stack) (eq? (car stack) 'lbracket))
454 (set! stack (cdr stack))
0b229e81 455 (syntax-error "unexpected right bracket" stack)))
5dcd3e48 456 ((lbrace)
0ecd70a2 457 (set! stack (make-lexical-token 'lbrace #f stack)))
5dcd3e48
AW
458 ((rbrace)
459 (if (and (pair? stack) (eq? (car stack) 'lbrace))
460 (set! stack (cdr stack))
0b229e81 461 (syntax-error "unexpected right brace" stack)))
5dcd3e48
AW
462 ((semicolon)
463 (set! eoi? (null? stack))))
0ecd70a2
LC
464 (set! div? (and (lexical-token? tok)
465 (let ((cat (lexical-token-category tok)))
466 (or (eq? cat 'Identifier)
467 (eq? cat 'NumericLiteral)
468 (eq? cat 'StringLiteral)))))
5dcd3e48
AW
469 tok)))))
470
471(define (tokenize port)
472 (let ((next (make-tokenizer port)))
473 (let lp ((out '()))
474 (let ((tok (next)))
475 (if (eq? tok '*eoi*)
476 (reverse! out)
477 (lp (cons tok out)))))))
478
479(define (tokenize/1 port)
480 (let ((next (make-tokenizer/1 port)))
481 (let lp ((out '()))
482 (let ((tok (next)))
483 (if (eq? tok '*eoi*)
484 (reverse! out)
485 (lp (cons tok out)))))))
486