| 1 | ;;; ECMAScript for Guile |
| 2 | |
| 3 | ;; Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc. |
| 4 | |
| 5 | ;;;; This library is free software; you can redistribute it and/or |
| 6 | ;;;; modify it under the terms of the GNU Lesser General Public |
| 7 | ;;;; License as published by the Free Software Foundation; either |
| 8 | ;;;; version 3 of the License, or (at your option) any later version. |
| 9 | ;;;; |
| 10 | ;;;; This library is distributed in the hope that it will be useful, |
| 11 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | ;;;; Lesser General Public License for more details. |
| 14 | ;;;; |
| 15 | ;;;; You should have received a copy of the GNU Lesser General Public |
| 16 | ;;;; License along with this library; if not, write to the Free Software |
| 17 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 18 | |
| 19 | ;;; Code: |
| 20 | |
| 21 | (define-module (language ecmascript tokenize) |
| 22 | #:use-module (ice-9 rdelim) |
| 23 | #:use-module ((srfi srfi-1) #:select (unfold-right)) |
| 24 | #:use-module (system base lalr) |
| 25 | #:export (next-token make-tokenizer make-tokenizer/1 tokenize tokenize/1)) |
| 26 | |
| 27 | (define (syntax-error what loc form . args) |
| 28 | (throw 'syntax-error #f what |
| 29 | (and=> loc source-location->source-properties) |
| 30 | form #f args)) |
| 31 | |
| 32 | (define (port-source-location port) |
| 33 | (make-source-location (port-filename port) |
| 34 | (port-line port) |
| 35 | (port-column port) |
| 36 | (false-if-exception (ftell port)) |
| 37 | #f)) |
| 38 | |
| 39 | ;; taken from SSAX, sorta |
| 40 | (define (read-until delims port loc) |
| 41 | (if (eof-object? (peek-char port)) |
| 42 | (syntax-error "EOF while reading a token" loc #f) |
| 43 | (let ((token (read-delimited delims port 'peek))) |
| 44 | (if (eof-object? (peek-char port)) |
| 45 | (syntax-error "EOF while reading a token" loc token) |
| 46 | token)))) |
| 47 | |
| 48 | (define (char-hex? c) |
| 49 | (and (not (eof-object? c)) |
| 50 | (or (char-numeric? c) |
| 51 | (memv c '(#\a #\b #\c #\d #\e #\f)) |
| 52 | (memv c '(#\A #\B #\C #\D #\E #\F))))) |
| 53 | |
| 54 | (define (digit->number c) |
| 55 | (- (char->integer c) (char->integer #\0))) |
| 56 | |
| 57 | (define (hex->number c) |
| 58 | (if (char-numeric? c) |
| 59 | (digit->number c) |
| 60 | (+ 10 (- (char->integer (char-downcase c)) (char->integer #\a))))) |
| 61 | |
| 62 | (define (read-slash port loc div?) |
| 63 | (let ((c1 (begin |
| 64 | (read-char port) |
| 65 | (peek-char port)))) |
| 66 | (cond |
| 67 | ((eof-object? c1) |
| 68 | ;; hmm. error if we're not looking for a div? ? |
| 69 | (make-lexical-token '/ loc #f)) |
| 70 | ((char=? c1 #\/) |
| 71 | (read-line port) |
| 72 | (next-token port div?)) |
| 73 | ((char=? c1 #\*) |
| 74 | (read-char port) |
| 75 | (let lp ((c (read-char port))) |
| 76 | (cond |
| 77 | ((eof-object? c) |
| 78 | (syntax-error "EOF while in multi-line comment" loc #f)) |
| 79 | ((char=? c #\*) |
| 80 | (if (eqv? (peek-char port) #\/) |
| 81 | (begin |
| 82 | (read-char port) |
| 83 | (next-token port div?)) |
| 84 | (lp (read-char port)))) |
| 85 | (else |
| 86 | (lp (read-char port)))))) |
| 87 | (div? |
| 88 | (case c1 |
| 89 | ((#\=) (read-char port) (make-lexical-token '/= loc #f)) |
| 90 | (else (make-lexical-token '/ loc #f)))) |
| 91 | (else |
| 92 | (read-regexp port loc))))) |
| 93 | |
| 94 | (define (read-regexp port loc) |
| 95 | ;; first slash already read |
| 96 | (let ((terms (string #\/ #\\ #\nl #\cr))) |
| 97 | (let lp ((str (read-until terms port loc)) (head "")) |
| 98 | (let ((terminator (peek-char port))) |
| 99 | (cond |
| 100 | ((char=? terminator #\/) |
| 101 | (read-char port) |
| 102 | ;; flags |
| 103 | (let lp ((c (peek-char port)) (flags '())) |
| 104 | (if (or (eof-object? c) |
| 105 | (not (or (char-alphabetic? c) |
| 106 | (char-numeric? c) |
| 107 | (char=? c #\$) |
| 108 | (char=? c #\_)))) |
| 109 | (make-lexical-token 'RegexpLiteral loc |
| 110 | (cons (string-append head str) |
| 111 | (reverse flags))) |
| 112 | (begin (read-char port) |
| 113 | (lp (peek-char port) (cons c flags)))))) |
| 114 | ((char=? terminator #\\) |
| 115 | (read-char port) |
| 116 | (let ((echar (read-char port))) |
| 117 | (lp (read-until terms port loc) |
| 118 | (string-append head str (string #\\ echar))))) |
| 119 | (else |
| 120 | (syntax-error "regexp literals may not contain newlines" |
| 121 | loc str))))))) |
| 122 | |
| 123 | (define (read-string port loc) |
| 124 | (let ((c (read-char port))) |
| 125 | (let ((terms (string c #\\ #\nl #\cr))) |
| 126 | (define (read-escape port) |
| 127 | (let ((c (read-char port))) |
| 128 | (case c |
| 129 | ((#\' #\" #\\) c) |
| 130 | ((#\b) #\bs) |
| 131 | ((#\f) #\np) |
| 132 | ((#\n) #\nl) |
| 133 | ((#\r) #\cr) |
| 134 | ((#\t) #\tab) |
| 135 | ((#\v) #\vt) |
| 136 | ((#\0) |
| 137 | (let ((next (peek-char port))) |
| 138 | (cond |
| 139 | ((eof-object? next) #\nul) |
| 140 | ((char-numeric? next) |
| 141 | (syntax-error "octal escape sequences are not supported" |
| 142 | loc #f)) |
| 143 | (else #\nul)))) |
| 144 | ((#\x) |
| 145 | (let* ((a (read-char port)) |
| 146 | (b (read-char port))) |
| 147 | (cond |
| 148 | ((and (char-hex? a) (char-hex? b)) |
| 149 | (integer->char (+ (* 16 (hex->number a)) (hex->number b)))) |
| 150 | (else |
| 151 | (syntax-error "bad hex character escape" loc (string a b)))))) |
| 152 | ((#\u) |
| 153 | (let* ((a (read-char port)) |
| 154 | (b (read-char port)) |
| 155 | (c (read-char port)) |
| 156 | (d (read-char port))) |
| 157 | (integer->char (string->number (string a b c d) 16)))) |
| 158 | (else |
| 159 | c)))) |
| 160 | (let lp ((str (read-until terms port loc))) |
| 161 | (let ((terminator (peek-char port))) |
| 162 | (cond |
| 163 | ((char=? terminator c) |
| 164 | (read-char port) |
| 165 | (make-lexical-token 'StringLiteral loc str)) |
| 166 | ((char=? terminator #\\) |
| 167 | (read-char port) |
| 168 | (let ((echar (read-escape port))) |
| 169 | (lp (string-append str (string echar) |
| 170 | (read-until terms port loc))))) |
| 171 | (else |
| 172 | (syntax-error "string literals may not contain newlines" |
| 173 | loc str)))))))) |
| 174 | |
| 175 | (define *keywords* |
| 176 | '(("break" . break) |
| 177 | ("else" . else) |
| 178 | ("new" . new) |
| 179 | ("var" . var) |
| 180 | ("case" . case) |
| 181 | ("finally" . finally) |
| 182 | ("return" . return) |
| 183 | ("void" . void) |
| 184 | ("catch" . catch) |
| 185 | ("for" . for) |
| 186 | ("switch" . switch) |
| 187 | ("while" . while) |
| 188 | ("continue" . continue) |
| 189 | ("function" . function) |
| 190 | ("this" . this) |
| 191 | ("with" . with) |
| 192 | ("default" . default) |
| 193 | ("if" . if) |
| 194 | ("throw" . throw) |
| 195 | ("delete" . delete) |
| 196 | ("in" . in) |
| 197 | ("try" . try) |
| 198 | ("do" . do) |
| 199 | ("instanceof" . instanceof) |
| 200 | ("typeof" . typeof) |
| 201 | |
| 202 | ;; these aren't exactly keywords, but hey |
| 203 | ("null" . null) |
| 204 | ("true" . true) |
| 205 | ("false" . false))) |
| 206 | |
| 207 | (define *future-reserved-words* |
| 208 | '(("abstract" . abstract) |
| 209 | ("enum" . enum) |
| 210 | ("int" . int) |
| 211 | ("short" . short) |
| 212 | ("boolean" . boolean) |
| 213 | ("export" . export) |
| 214 | ("interface" . interface) |
| 215 | ("static" . static) |
| 216 | ("byte" . byte) |
| 217 | ("extends" . extends) |
| 218 | ("long" . long) |
| 219 | ("super" . super) |
| 220 | ("char" . char) |
| 221 | ("final" . final) |
| 222 | ("native" . native) |
| 223 | ("synchronized" . synchronized) |
| 224 | ("class" . class) |
| 225 | ("float" . float) |
| 226 | ("package" . package) |
| 227 | ("throws" . throws) |
| 228 | ("const" . const) |
| 229 | ("goto" . goto) |
| 230 | ("private" . private) |
| 231 | ("transient" . transient) |
| 232 | ("debugger" . debugger) |
| 233 | ("implements" . implements) |
| 234 | ("protected" . protected) |
| 235 | ("volatile" . volatile) |
| 236 | ("double" . double) |
| 237 | ("import" . import) |
| 238 | ("public" . public))) |
| 239 | |
| 240 | (define (read-identifier port loc) |
| 241 | (let lp ((c (peek-char port)) (chars '())) |
| 242 | (if (or (eof-object? c) |
| 243 | (not (or (char-alphabetic? c) |
| 244 | (char-numeric? c) |
| 245 | (char=? c #\$) |
| 246 | (char=? c #\_)))) |
| 247 | (let ((word (list->string (reverse chars)))) |
| 248 | (cond ((assoc-ref *keywords* word) |
| 249 | => (lambda (x) (make-lexical-token x loc #f))) |
| 250 | ((assoc-ref *future-reserved-words* word) |
| 251 | (syntax-error "word is reserved for the future, dude." |
| 252 | loc word)) |
| 253 | (else (make-lexical-token 'Identifier loc |
| 254 | (string->symbol word))))) |
| 255 | (begin (read-char port) |
| 256 | (lp (peek-char port) (cons c chars)))))) |
| 257 | |
| 258 | (define (read-numeric port loc) |
| 259 | (let* ((c0 (if (char=? (peek-char port) #\.) |
| 260 | #\0 |
| 261 | (read-char port))) |
| 262 | (c1 (peek-char port))) |
| 263 | (cond |
| 264 | ((eof-object? c1) (digit->number c0)) |
| 265 | ((and (char=? c0 #\0) (or (char=? c1 #\x) (char=? c1 #\X))) |
| 266 | (read-char port) |
| 267 | (let ((c (peek-char port))) |
| 268 | (if (not (char-hex? c)) |
| 269 | (syntax-error "bad digit reading hexadecimal number" |
| 270 | loc c)) |
| 271 | (let lp ((c c) (acc 0)) |
| 272 | (cond ((char-hex? c) |
| 273 | (read-char port) |
| 274 | (lp (peek-char port) |
| 275 | (+ (* 16 acc) (hex->number c)))) |
| 276 | (else |
| 277 | acc))))) |
| 278 | ((and (char=? c0 #\0) (char-numeric? c1)) |
| 279 | (let lp ((c c1) (acc 0)) |
| 280 | (cond ((eof-object? c) acc) |
| 281 | ((char-numeric? c) |
| 282 | (if (or (char=? c #\8) (char=? c #\9)) |
| 283 | (syntax-error "invalid digit in octal sequence" |
| 284 | loc c)) |
| 285 | (read-char port) |
| 286 | (lp (peek-char port) |
| 287 | (+ (* 8 acc) (digit->number c)))) |
| 288 | (else |
| 289 | acc)))) |
| 290 | (else |
| 291 | (let lp ((c1 c1) (acc (digit->number c0))) |
| 292 | (cond |
| 293 | ((eof-object? c1) acc) |
| 294 | ((char-numeric? c1) |
| 295 | (read-char port) |
| 296 | (lp (peek-char port) |
| 297 | (+ (* 10 acc) (digit->number c1)))) |
| 298 | ((or (char=? c1 #\e) (char=? c1 #\E)) |
| 299 | (read-char port) |
| 300 | (let ((add (let ((c (peek-char port))) |
| 301 | (cond ((eof-object? c) |
| 302 | (syntax-error "error reading exponent: EOF" |
| 303 | loc #f)) |
| 304 | ((char=? c #\+) (read-char port) +) |
| 305 | ((char=? c #\-) (read-char port) -) |
| 306 | ((char-numeric? c) +) |
| 307 | (else |
| 308 | (syntax-error "error reading exponent: non-digit" |
| 309 | loc c)))))) |
| 310 | (let lp ((c (peek-char port)) (e 0)) |
| 311 | (cond ((and (not (eof-object? c)) (char-numeric? c)) |
| 312 | (read-char port) |
| 313 | (lp (peek-char port) (add (* 10 e) (digit->number c)))) |
| 314 | (else |
| 315 | (* (if (negative? e) (* acc 1.0) acc) (expt 10 e))))))) |
| 316 | ((char=? c1 #\.) |
| 317 | (read-char port) |
| 318 | (let lp2 ((c (peek-char port)) (dec 0.0) (n -1)) |
| 319 | (cond ((and (not (eof-object? c)) (char-numeric? c)) |
| 320 | (read-char port) |
| 321 | (lp2 (peek-char port) |
| 322 | (+ dec (* (digit->number c) (expt 10 n))) |
| 323 | (1- n))) |
| 324 | (else |
| 325 | ;; loop back to catch an exponential part |
| 326 | (lp c (+ acc dec)))))) |
| 327 | (else |
| 328 | acc))))))) |
| 329 | |
| 330 | (define *punctuation* |
| 331 | '(("{" . lbrace) |
| 332 | ("}" . rbrace) |
| 333 | ("(" . lparen) |
| 334 | (")" . rparen) |
| 335 | ("[" . lbracket) |
| 336 | ("]" . rbracket) |
| 337 | ("." . dot) |
| 338 | (";" . semicolon) |
| 339 | ("," . comma) |
| 340 | ("<" . <) |
| 341 | (">" . >) |
| 342 | ("<=" . <=) |
| 343 | (">=" . >=) |
| 344 | ("==" . ==) |
| 345 | ("!=" . !=) |
| 346 | ("===" . ===) |
| 347 | ("!==" . !==) |
| 348 | ("+" . +) |
| 349 | ("-" . -) |
| 350 | ("*" . *) |
| 351 | ("%" . %) |
| 352 | ("++" . ++) |
| 353 | ("--" . --) |
| 354 | ("<<" . <<) |
| 355 | (">>" . >>) |
| 356 | (">>>" . >>>) |
| 357 | ("&" . &) |
| 358 | ("|" . bor) |
| 359 | ("^" . ^) |
| 360 | ("!" . !) |
| 361 | ("~" . ~) |
| 362 | ("&&" . &&) |
| 363 | ("||" . or) |
| 364 | ("?" . ?) |
| 365 | (":" . colon) |
| 366 | ("=" . =) |
| 367 | ("+=" . +=) |
| 368 | ("-=" . -=) |
| 369 | ("*=" . *=) |
| 370 | ("%=" . %=) |
| 371 | ("<<=" . <<=) |
| 372 | (">>=" . >>=) |
| 373 | (">>>=" . >>>=) |
| 374 | ("&=" . &=) |
| 375 | ("|=" . bor=) |
| 376 | ("^=" . ^=))) |
| 377 | |
| 378 | (define *div-punctuation* |
| 379 | '(("/" . /) |
| 380 | ("/=" . /=))) |
| 381 | |
| 382 | ;; node ::= (char (symbol | #f) node*) |
| 383 | (define read-punctuation |
| 384 | (let ((punc-tree (let lp ((nodes '()) (puncs *punctuation*)) |
| 385 | (cond ((null? puncs) |
| 386 | nodes) |
| 387 | ((assv-ref nodes (string-ref (caar puncs) 0)) |
| 388 | => (lambda (node-tail) |
| 389 | (if (= (string-length (caar puncs)) 1) |
| 390 | (set-car! node-tail (cdar puncs)) |
| 391 | (set-cdr! node-tail |
| 392 | (lp (cdr node-tail) |
| 393 | `((,(substring (caar puncs) 1) |
| 394 | . ,(cdar puncs)))))) |
| 395 | (lp nodes (cdr puncs)))) |
| 396 | (else |
| 397 | (lp (cons (list (string-ref (caar puncs) 0) #f) nodes) |
| 398 | puncs)))))) |
| 399 | (lambda (port loc) |
| 400 | (let lp ((c (peek-char port)) (tree punc-tree) (candidate #f)) |
| 401 | (cond |
| 402 | ((assv-ref tree c) |
| 403 | => (lambda (node-tail) |
| 404 | (read-char port) |
| 405 | (lp (peek-char port) (cdr node-tail) (car node-tail)))) |
| 406 | (candidate |
| 407 | (make-lexical-token candidate loc #f)) |
| 408 | (else |
| 409 | (syntax-error "bad syntax: character not allowed" loc c))))))) |
| 410 | |
| 411 | (define (next-token port div?) |
| 412 | (let ((c (peek-char port)) |
| 413 | (loc (port-source-location port))) |
| 414 | (case c |
| 415 | ((#\ht #\vt #\np #\space #\x00A0) ; whitespace |
| 416 | (read-char port) |
| 417 | (next-token port div?)) |
| 418 | ((#\newline #\cr) ; line break |
| 419 | (read-char port) |
| 420 | (next-token port div?)) |
| 421 | ((#\/) |
| 422 | ;; division, single comment, double comment, or regexp |
| 423 | (read-slash port loc div?)) |
| 424 | ((#\" #\') ; string literal |
| 425 | (read-string port loc)) |
| 426 | (else |
| 427 | (cond |
| 428 | ((eof-object? c) |
| 429 | '*eoi*) |
| 430 | ((or (char-alphabetic? c) |
| 431 | (char=? c #\$) |
| 432 | (char=? c #\_)) |
| 433 | ;; reserved word or identifier |
| 434 | (read-identifier port loc)) |
| 435 | ((char-numeric? c) |
| 436 | ;; numeric -- also accept . FIXME, requires lookahead |
| 437 | (make-lexical-token 'NumericLiteral loc (read-numeric port loc))) |
| 438 | (else |
| 439 | ;; punctuation |
| 440 | (read-punctuation port loc))))))) |
| 441 | |
| 442 | (define (make-tokenizer port) |
| 443 | (let ((div? #f)) |
| 444 | (lambda () |
| 445 | (let ((tok (next-token port div?))) |
| 446 | (set! div? (and (lexical-token? tok) |
| 447 | (let ((cat (lexical-token-category tok))) |
| 448 | (or (eq? cat 'Identifier) |
| 449 | (eq? cat 'NumericLiteral) |
| 450 | (eq? cat 'StringLiteral))))) |
| 451 | tok)))) |
| 452 | |
| 453 | (define (make-tokenizer/1 port) |
| 454 | (let ((div? #f) |
| 455 | (eoi? #f) |
| 456 | (stack '())) |
| 457 | (lambda () |
| 458 | (if eoi? |
| 459 | '*eoi* |
| 460 | (let ((tok (next-token port div?))) |
| 461 | (case (if (lexical-token? tok) (lexical-token-category tok) tok) |
| 462 | ((lparen) |
| 463 | (set! stack (cons tok stack))) |
| 464 | ((rparen) |
| 465 | (if (and (pair? stack) |
| 466 | (eq? (lexical-token-category (car stack)) 'lparen)) |
| 467 | (set! stack (cdr stack)) |
| 468 | (syntax-error "unexpected right parenthesis" |
| 469 | (lexical-token-source tok) |
| 470 | #f))) |
| 471 | ((lbracket) |
| 472 | (set! stack (cons tok stack))) |
| 473 | ((rbracket) |
| 474 | (if (and (pair? stack) |
| 475 | (eq? (lexical-token-category (car stack)) 'lbracket)) |
| 476 | (set! stack (cdr stack)) |
| 477 | (syntax-error "unexpected right bracket" |
| 478 | (lexical-token-source tok) |
| 479 | #f))) |
| 480 | ((lbrace) |
| 481 | (set! stack (cons tok stack))) |
| 482 | ((rbrace) |
| 483 | (if (and (pair? stack) |
| 484 | (eq? (lexical-token-category (car stack)) 'lbrace)) |
| 485 | (set! stack (cdr stack)) |
| 486 | (syntax-error "unexpected right brace" |
| 487 | (lexical-token-source tok) |
| 488 | #f))) |
| 489 | ((semicolon) |
| 490 | (set! eoi? (null? stack)))) |
| 491 | (set! div? (and (lexical-token? tok) |
| 492 | (let ((cat (lexical-token-category tok))) |
| 493 | (or (eq? cat 'Identifier) |
| 494 | (eq? cat 'NumericLiteral) |
| 495 | (eq? cat 'StringLiteral))))) |
| 496 | tok))))) |
| 497 | |
| 498 | (define (tokenize port) |
| 499 | (let ((next (make-tokenizer port))) |
| 500 | (let lp ((out '())) |
| 501 | (let ((tok (next))) |
| 502 | (if (eq? tok '*eoi*) |
| 503 | (reverse! out) |
| 504 | (lp (cons tok out))))))) |
| 505 | |
| 506 | (define (tokenize/1 port) |
| 507 | (let ((next (make-tokenizer/1 port))) |
| 508 | (let lp ((out '())) |
| 509 | (let ((tok (next))) |
| 510 | (if (eq? tok '*eoi*) |
| 511 | (reverse! out) |
| 512 | (lp (cons tok out))))))) |
| 513 | |