| 1 | (* Modified by Matthew Fluet on 2011-06-17. |
| 2 | * Use simple file name (rather than absolute paths) in line directives in output. |
| 3 | *) |
| 4 | (* Modified by Vesa Karvonen on 2007-12-19. |
| 5 | * Create line directives in output. |
| 6 | *) |
| 7 | (* Modified by Matthew Fluet on 2007-11-07. |
| 8 | * Add %posint command. |
| 9 | *) |
| 10 | (* Modified by StephenWeeks on 2005-08-18. |
| 11 | * Fix file starting position |
| 12 | *) |
| 13 | (* Modified by Stephen Weeks on 2004-10-19. |
| 14 | * Do not create references to Unsafe structure. |
| 15 | *) |
| 16 | (* Lexical analyzer generator for Standard ML. |
| 17 | Version 1.7.0, June 1998 |
| 18 | |
| 19 | Copyright (c) 1989-1992 by Andrew W. Appel, |
| 20 | David R. Tarditi, James S. Mattson |
| 21 | |
| 22 | This software comes with ABSOLUTELY NO WARRANTY. |
| 23 | This software is subject only to the PRINCETON STANDARD ML SOFTWARE LIBRARY |
| 24 | COPYRIGHT NOTICE, LICENSE AND DISCLAIMER, (in the file "COPYRIGHT", |
| 25 | distributed with this software). You may copy and distribute this software; |
| 26 | see the COPYRIGHT NOTICE for details and restrictions. |
| 27 | |
| 28 | Changes: |
| 29 | 07/25/89 (drt): added %header declaration, code to place |
| 30 | user declarations at same level as makeLexer, etc. |
| 31 | This is needed for the parser generator. |
| 32 | /10/89 (appel): added %arg declaration (see lexgen.doc). |
| 33 | /04/90 (drt): fixed following bug: couldn't use the lexer after an |
| 34 | error occurred -- NextTok and inquote weren't being reset |
| 35 | 10/22/91 (drt): disabled use of lookahead |
| 36 | 10/23/92 (drt): disabled use of $ operator (which involves lookahead), |
| 37 | added handlers for dictionary lookup routine |
| 38 | 11/02/92 (drt): changed handler for exception Reject in generated lexer |
| 39 | to Internal.Reject |
| 40 | 02/01/94 (appel): Moved the exception handler for Reject in such |
| 41 | a way as to allow tail-recursion (improves performance |
| 42 | wonderfully!). |
| 43 | 02/01/94 (appel): Fixed a bug in parsing of state names. |
| 44 | 05/19/94 (Mikael Pettersson, mpe@ida.liu.se): |
| 45 | Transition tables are usually represented as strings, but |
| 46 | when the range is too large, int vectors constructed by |
| 47 | code like "Vector.vector[1,2,3,...]" are used instead. |
| 48 | The problem with this isn't that the vector itself takes |
| 49 | a lot of space, but that the code generated by SML/NJ to |
| 50 | construct the intermediate list at run-time is *HUGE*. My |
| 51 | fix is to encode an int vector as a string literal (using |
| 52 | two bytes per int) and emit code to decode the string to |
| 53 | a vector at run-time. SML/NJ compiles string literals into |
| 54 | substrings in the code, so this uses much less space. |
| 55 | 06/02/94 (jhr): Modified export-lex.sml to conform to new installation |
| 56 | scheme. Also removed tab characters from string literals. |
| 57 | 10/05/94 (jhr): Changed generator to produce code that uses the new |
| 58 | basis style strings and characters. |
| 59 | 10/06/94 (jhr) Modified code to compile under new basis style strings |
| 60 | and characters. |
| 61 | 02/08/95 (jhr) Modified to use new List module interface. |
| 62 | 05/18/95 (jhr) changed Vector.vector to Vector.fromList |
| 63 | |
| 64 | * Revision 1.9 1998/01/06 19:23:53 appel |
| 65 | * added %posarg feature to permit position-within-file to be passed |
| 66 | * as a parameter to makeLexer |
| 67 | * |
| 68 | # Revision 1.8 1998/01/06 19:01:48 appel |
| 69 | # repaired error messages like "cannot have both %structure and %header" |
| 70 | # |
| 71 | # Revision 1.7 1998/01/06 18:55:49 appel |
| 72 | # permit %% to be unescaped within regular expressions |
| 73 | # |
| 74 | # Revision 1.6 1998/01/06 18:46:13 appel |
| 75 | # removed undocumented feature that permitted extra %% at end of rules |
| 76 | # |
| 77 | # Revision 1.5 1998/01/06 18:29:23 appel |
| 78 | # put yylineno variable inside makeLexer function |
| 79 | # |
| 80 | # Revision 1.4 1998/01/06 18:19:59 appel |
| 81 | # check for newline inside quoted string |
| 82 | # |
| 83 | # Revision 1.3 1997/10/04 03:52:13 dbm |
| 84 | # Fix to remove output file if ml-lex fails. |
| 85 | # |
| 86 | 10/17/02 (jhr) changed bad character error message to properly |
| 87 | print the bad character. |
| 88 | 10/17/02 (jhr) fixed skipws to use Char.isSpace test. |
| 89 | 07/27/05 (jhr) add \r as a recognized escape sequence. |
| 90 | *) |
| 91 | |
| 92 | (* Subject: lookahead in sml-lex |
| 93 | Reply-to: david.tarditi@CS.CMU.EDU |
| 94 | Date: Mon, 21 Oct 91 14:13:26 -0400 |
| 95 | |
| 96 | There is a serious bug in the implementation of lookahead, |
| 97 | as done in sml-lex, and described in Aho, Sethi, and Ullman, |
| 98 | p. 134 "Implementing the Lookahead Operator" |
| 99 | |
| 100 | We have disallowed the use of lookahead for now because |
| 101 | of this bug. |
| 102 | |
| 103 | As a counter-example to the implementation described in |
| 104 | ASU, consider the following specification with the |
| 105 | input string "aba" (this example is taken from |
| 106 | a comp.compilers message from Dec. 1989, I think): |
| 107 | |
| 108 | type lexresult=unit |
| 109 | val linenum = ref 1 |
| 110 | fun error x = TextIO.output(TextIO.stdErr, x ^ "\n") |
| 111 | val eof = fn () => () |
| 112 | %% |
| 113 | %structure Lex |
| 114 | %% |
| 115 | (a|ab)/ba => (print yytext; print "\n"; ()); |
| 116 | |
| 117 | The ASU proposal works as follows. Suppose that we are |
| 118 | using NFA's to represent our regular expressions. Then to |
| 119 | build an NFA for e1 / e2, we build an NFA n1 for e1 |
| 120 | and an NFA n2 for e2, and add an epsilon transition |
| 121 | from e1 to e2. |
| 122 | |
| 123 | When lexing, when we encounter the end state of e1e2, |
| 124 | we take as the end of the string the position in |
| 125 | the string that was the last occurrence of the state of |
| 126 | the NFA having a transition on the epsilon introduced |
| 127 | for /. |
| 128 | |
| 129 | Using the example we have above, we'll have an NFA |
| 130 | with the following states: |
| 131 | |
| 132 | |
| 133 | 1 -- a --> 2 -- b --> 3 |
| 134 | | | |
| 135 | | epsilon | epsilon |
| 136 | | | |
| 137 | |------------> 4 -- b --> 5 -- a --> 6 |
| 138 | |
| 139 | On our example, we get the following list of transitions: |
| 140 | |
| 141 | a : 2, 4 (make an epsilon transition from 2 to 4) |
| 142 | ab : 3, 4, 5 (make an epsilon transition from 3 to 4) |
| 143 | aba : 6 |
| 144 | |
| 145 | If we chose the last state in which we made an epsilon transition, |
| 146 | we'll chose the transition from 3 to 4, and end up with "ab" |
| 147 | as our token, when we should have "a" as our token. |
| 148 | |
| 149 | *) |
| 150 | |
| 151 | functor RedBlack(B : sig type key |
| 152 | val > : key*key->bool |
| 153 | end): |
| 154 | sig type tree |
| 155 | type key |
| 156 | val empty : tree |
| 157 | val insert : key * tree -> tree |
| 158 | val lookup : key * tree -> key |
| 159 | exception notfound of key |
| 160 | end = |
| 161 | struct |
| 162 | open B |
| 163 | datatype color = RED | BLACK |
| 164 | datatype tree = empty | tree of key * color * tree * tree |
| 165 | exception notfound of key |
| 166 | |
| 167 | fun insert (key,t) = |
| 168 | let fun f empty = tree(key,RED,empty,empty) |
| 169 | | f (tree(k,BLACK,l,r)) = |
| 170 | if key>k |
| 171 | then case f r |
| 172 | of r as tree(rk,RED, rl as tree(rlk,RED,rll,rlr),rr) => |
| 173 | (case l |
| 174 | of tree(lk,RED,ll,lr) => |
| 175 | tree(k,RED,tree(lk,BLACK,ll,lr), |
| 176 | tree(rk,BLACK,rl,rr)) |
| 177 | | _ => tree(rlk,BLACK,tree(k,RED,l,rll), |
| 178 | tree(rk,RED,rlr,rr))) |
| 179 | | r as tree(rk,RED,rl, rr as tree(rrk,RED,rrl,rrr)) => |
| 180 | (case l |
| 181 | of tree(lk,RED,ll,lr) => |
| 182 | tree(k,RED,tree(lk,BLACK,ll,lr), |
| 183 | tree(rk,BLACK,rl,rr)) |
| 184 | | _ => tree(rk,BLACK,tree(k,RED,l,rl),rr)) |
| 185 | | r => tree(k,BLACK,l,r) |
| 186 | else if k>key |
| 187 | then case f l |
| 188 | of l as tree(lk,RED,ll, lr as tree(lrk,RED,lrl,lrr)) => |
| 189 | (case r |
| 190 | of tree(rk,RED,rl,rr) => |
| 191 | tree(k,RED,tree(lk,BLACK,ll,lr), |
| 192 | tree(rk,BLACK,rl,rr)) |
| 193 | | _ => tree(lrk,BLACK,tree(lk,RED,ll,lrl), |
| 194 | tree(k,RED,lrr,r))) |
| 195 | | l as tree(lk,RED, ll as tree(llk,RED,lll,llr), lr) => |
| 196 | (case r |
| 197 | of tree(rk,RED,rl,rr) => |
| 198 | tree(k,RED,tree(lk,BLACK,ll,lr), |
| 199 | tree(rk,BLACK,rl,rr)) |
| 200 | | _ => tree(lk,BLACK,ll,tree(k,RED,lr,r))) |
| 201 | | l => tree(k,BLACK,l,r) |
| 202 | else tree(key,BLACK,l,r) |
| 203 | | f (tree(k,RED,l,r)) = |
| 204 | if key>k then tree(k,RED,l, f r) |
| 205 | else if k>key then tree(k,RED, f l, r) |
| 206 | else tree(key,RED,l,r) |
| 207 | in case f t |
| 208 | of tree(k,RED, l as tree(_,RED,_,_), r) => tree(k,BLACK,l,r) |
| 209 | | tree(k,RED, l, r as tree(_,RED,_,_)) => tree(k,BLACK,l,r) |
| 210 | | t => t |
| 211 | end |
| 212 | |
| 213 | |
| 214 | fun lookup (key,t) = |
| 215 | let fun look empty = raise (notfound key) |
| 216 | | look (tree(k,_,l,r)) = |
| 217 | if k>key then look l |
| 218 | else if key>k then look r |
| 219 | else k |
| 220 | in look t |
| 221 | end |
| 222 | |
| 223 | end |
| 224 | |
| 225 | signature LEXGEN = |
| 226 | sig |
| 227 | val lexGen: string -> unit |
| 228 | end |
| 229 | |
| 230 | structure LexGen: LEXGEN = |
| 231 | struct |
| 232 | val sub = Array.sub |
| 233 | infix 9 sub |
| 234 | |
| 235 | type pos = {line : int, col : int} |
| 236 | |
| 237 | datatype token = CHARS of bool array | QMARK | STAR | PLUS | BAR |
| 238 | | LP | RP | CARAT | DOLLAR | SLASH | STATE of string list |
| 239 | | REPS of int * int | ID of string | ACTION of pos * string |
| 240 | | BOF | EOF | ASSIGN | SEMI | ARROW | LEXMARK | LEXSTATES |
| 241 | | COUNT | REJECT | FULLCHARSET | STRUCT | HEADER | ARG | POSARG |
| 242 | | POSINT |
| 243 | |
| 244 | datatype exp = EPS | CLASS of bool array * int | CLOSURE of exp |
| 245 | | ALT of exp * exp | CAT of exp * exp | TRAIL of int |
| 246 | | END of int |
| 247 | |
| 248 | (* flags describing input Lex spec. - unnecessary code is omitted *) |
| 249 | (* if possible *) |
| 250 | |
| 251 | val CharFormat = ref false; |
| 252 | val UsesTrailingContext = ref false; |
| 253 | val UsesPrevNewLine = ref false; |
| 254 | |
| 255 | (* flags for various bells & whistles that Lex has. These slow the |
| 256 | lexer down and should be omitted from production lexers (if you |
| 257 | really want speed) *) |
| 258 | |
| 259 | val CountNewLines = ref false; |
| 260 | val PosArg = ref false; |
| 261 | val HaveReject = ref false; |
| 262 | |
| 263 | (* Can increase size of character set *) |
| 264 | |
| 265 | val CharSetSize = ref 129; |
| 266 | |
| 267 | (* Can name structure or declare header code *) |
| 268 | |
| 269 | val StrName = ref "Mlex" |
| 270 | val HeaderCode = ref "" |
| 271 | val HeaderPos = ref {line = 0, col = 0} |
| 272 | val HeaderDecl = ref false |
| 273 | val ArgCode = ref (NONE: (pos * string) option) |
| 274 | val StrDecl = ref false |
| 275 | |
| 276 | (* Can define INTEGER structure for yypos variable. *) |
| 277 | val PosIntName = ref "Int" |
| 278 | val PosIntDecl = ref false |
| 279 | |
| 280 | val ResetFlags = fn () => (CountNewLines := false; HaveReject := false; |
| 281 | PosArg := false; |
| 282 | UsesTrailingContext := false; |
| 283 | CharSetSize := 129; StrName := "Mlex"; |
| 284 | HeaderCode := ""; HeaderDecl:= false; |
| 285 | ArgCode := NONE; |
| 286 | StrDecl := false; |
| 287 | PosIntName := "Int"; PosIntDecl := false) |
| 288 | |
| 289 | val LexOut = ref(TextIO.stdOut) |
| 290 | val LexOutLine = ref 1 |
| 291 | fun setLexOut s = (LexOut := s; LexOutLine := 1) |
| 292 | fun say x = |
| 293 | (TextIO.output (!LexOut, x) |
| 294 | ; CharVector.app |
| 295 | (fn #"\n" => LexOutLine := !LexOutLine + 1 | _ => ()) |
| 296 | x) |
| 297 | val InFile = ref "" |
| 298 | val OutFile = ref "" |
| 299 | fun fmtLineDir {line, col} file = |
| 300 | String.concat ["(*#line ", Int.toString line, ".", Int.toString (col+1), |
| 301 | " \"", file, "\"*)"] |
| 302 | val sayPos = |
| 303 | fn SOME pos => say (fmtLineDir pos (!InFile)) |
| 304 | | NONE => (say (fmtLineDir {line = !LexOutLine, col = 0} (!OutFile)); |
| 305 | say "\n") |
| 306 | |
| 307 | (* Union: merge two sorted lists of integers *) |
| 308 | |
| 309 | fun union(a,b) = let val rec merge = fn |
| 310 | (nil,nil,z) => z |
| 311 | | (nil,el::more,z) => merge(nil,more,el::z) |
| 312 | | (el::more,nil,z) => merge(more,nil,el::z) |
| 313 | | (x::morex,y::morey,z) => if (x:int)=(y:int) |
| 314 | then merge(morex,morey,x::z) |
| 315 | else if x>y then merge(morex,y::morey,x::z) |
| 316 | else merge(x::morex,morey,y::z) |
| 317 | in merge(rev a,rev b,nil) |
| 318 | end |
| 319 | |
| 320 | (* Nullable: compute if a important expression parse tree node is nullable *) |
| 321 | |
| 322 | val rec nullable = fn |
| 323 | EPS => true |
| 324 | | CLASS(_) => false |
| 325 | | CLOSURE(_) => true |
| 326 | | ALT(n1,n2) => nullable(n1) orelse nullable(n2) |
| 327 | | CAT(n1,n2) => nullable(n1) andalso nullable(n2) |
| 328 | | TRAIL(_) => true |
| 329 | | END(_) => false |
| 330 | |
| 331 | (* FIRSTPOS: firstpos function for parse tree expressions *) |
| 332 | |
| 333 | and firstpos = fn |
| 334 | EPS => nil |
| 335 | | CLASS(_,i) => [i] |
| 336 | | CLOSURE(n) => firstpos(n) |
| 337 | | ALT(n1,n2) => union(firstpos(n1),firstpos(n2)) |
| 338 | | CAT(n1,n2) => if nullable(n1) then union(firstpos(n1),firstpos(n2)) |
| 339 | else firstpos(n1) |
| 340 | | TRAIL(i) => [i] |
| 341 | | END(i) => [i] |
| 342 | |
| 343 | (* LASTPOS: Lastpos function for parse tree expressions *) |
| 344 | |
| 345 | and lastpos = fn |
| 346 | EPS => nil |
| 347 | | CLASS(_,i) => [i] |
| 348 | | CLOSURE(n) => lastpos(n) |
| 349 | | ALT(n1,n2) => union(lastpos(n1),lastpos(n2)) |
| 350 | | CAT(n1,n2) => if nullable(n2) then union(lastpos(n1),lastpos(n2)) |
| 351 | else lastpos(n2) |
| 352 | | TRAIL(i) => [i] |
| 353 | | END(i) => [i] |
| 354 | ; |
| 355 | |
| 356 | (* ++: Increment an integer reference *) |
| 357 | |
| 358 | fun ++(x) : int = (x := !x + 1; !x); |
| 359 | |
| 360 | structure dict = |
| 361 | struct |
| 362 | type 'a relation = 'a * 'a -> bool |
| 363 | abstype ('b,'a) dictionary = DATA of { Table : ('b * 'a) list, |
| 364 | Leq : 'b * 'b -> bool } |
| 365 | with |
| 366 | exception LOOKUP |
| 367 | fun create Leqfunc = DATA { Table = nil, Leq = Leqfunc } |
| 368 | fun lookup (DATA { Table = entrylist, Leq = leq }) key = |
| 369 | let fun search [] = raise LOOKUP |
| 370 | | search((k,item)::entries) = |
| 371 | if leq(key,k) |
| 372 | then if leq(k,key) then item else raise LOOKUP |
| 373 | else search entries |
| 374 | in search entrylist |
| 375 | end |
| 376 | fun enter (DATA { Table = entrylist, Leq = leq }) |
| 377 | (newentry as (key : 'b,item :'a)) : ('b,'a) dictionary = |
| 378 | let val gt = fn a => fn b => not (leq(a,b)) |
| 379 | val eq = fn k => fn k' => (leq(k,k')) andalso (leq(k',k)) |
| 380 | fun update nil = [ newentry ] |
| 381 | | update ((entry as (k,_))::entries) = |
| 382 | if (eq key k) then newentry::entries |
| 383 | else if gt k key then newentry::(entry::entries) |
| 384 | else entry::(update entries) |
| 385 | in DATA { Table = update entrylist, Leq = leq } |
| 386 | end |
| 387 | fun listofdict (DATA { Table = entrylist,Leq = leq}) = |
| 388 | let fun f (nil,r) = rev r |
| 389 | | f (a::b,r) = f (b,a::r) |
| 390 | in f(entrylist,nil) |
| 391 | end |
| 392 | end |
| 393 | end |
| 394 | |
| 395 | open dict; |
| 396 | |
| 397 | (* INPUT.ML : Input w/ one character push back capability *) |
| 398 | |
| 399 | val LineNum = ref 1; |
| 400 | |
| 401 | abstype ibuf = |
| 402 | BUF of TextIO.instream * {b : string ref, p : int ref} |
| 403 | with |
| 404 | local |
| 405 | val pos = ref 0 |
| 406 | val linePos = ref 0 (* incorrect after ungetch newline, non fatal *) |
| 407 | in |
| 408 | fun resetLexPos () = (LineNum := 1; pos := 0; linePos :=0) |
| 409 | fun getLexPos () = {line = !LineNum, col = !pos - !linePos} |
| 410 | fun make_ibuf(s) = BUF (s, {b=ref"", p = ref 0}) |
| 411 | fun close_ibuf (BUF (s,_)) = TextIO.closeIn(s) |
| 412 | exception eof |
| 413 | fun getch (a as (BUF(s,{b,p}))) = |
| 414 | if (!p = (size (!b))) |
| 415 | then (b := TextIO.inputN(s, 1024); |
| 416 | p := 0; |
| 417 | if (size (!b))=0 |
| 418 | then raise eof |
| 419 | else getch a) |
| 420 | else (let val ch = String.sub(!b,!p) |
| 421 | in (pos := !pos + 1; |
| 422 | if ch = #"\n" |
| 423 | then (LineNum := !LineNum + 1; |
| 424 | linePos := !pos) |
| 425 | else (); |
| 426 | p := !p + 1; |
| 427 | ch) |
| 428 | end) |
| 429 | fun ungetch(BUF(s,{b,p})) = ( |
| 430 | pos := !pos - 1; |
| 431 | p := !p - 1; |
| 432 | if String.sub(!b,!p) = #"\n" |
| 433 | then LineNum := !LineNum - 1 |
| 434 | else ()) |
| 435 | end |
| 436 | end; |
| 437 | |
| 438 | exception Error |
| 439 | |
| 440 | fun prErr x = ( |
| 441 | TextIO.output (TextIO.stdErr, String.concat [ |
| 442 | "ml-lex: error, line ", (Int.toString (!LineNum)), ": ", x, "\n" |
| 443 | ]); |
| 444 | raise Error) |
| 445 | fun prSynErr x = ( |
| 446 | TextIO.output (TextIO.stdErr, String.concat [ |
| 447 | "ml-lex: syntax error, line ", (Int.toString (!LineNum)), ": ", x, "\n" |
| 448 | ]); |
| 449 | raise Error) |
| 450 | |
| 451 | exception SyntaxError; (* error in user's input file *) |
| 452 | |
| 453 | exception LexError; (* unexpected error in lexer *) |
| 454 | |
| 455 | val LexBuf = ref(make_ibuf(TextIO.stdIn)); |
| 456 | val LexState = ref 0; |
| 457 | val NextTok = ref BOF; |
| 458 | val inquote = ref false; |
| 459 | |
| 460 | fun AdvanceTok () : unit = let |
| 461 | fun isLetter c = |
| 462 | ((c >= #"a") andalso (c <= #"z")) orelse |
| 463 | ((c >= #"A") andalso (c <= #"Z")) |
| 464 | fun isDigit c = (c >= #"0") andalso (c <= #"9") |
| 465 | (* check for valid (non-leading) identifier character (added by JHR) *) |
| 466 | fun isIdentChr c = |
| 467 | ((isLetter c) orelse (isDigit c) orelse (c = #"_") orelse (c = #"'")) |
| 468 | fun atoi s = let |
| 469 | fun num (c::r, n) = if isDigit c |
| 470 | then num (r, 10*n + (Char.ord c - Char.ord #"0")) |
| 471 | else n |
| 472 | | num ([], n) = n |
| 473 | in |
| 474 | num (explode s, 0) |
| 475 | end |
| 476 | |
| 477 | fun skipws () = let val ch = nextch() |
| 478 | in |
| 479 | if Char.isSpace ch |
| 480 | then skipws() |
| 481 | else ch |
| 482 | end |
| 483 | |
| 484 | and nextch () = getch(!LexBuf) |
| 485 | |
| 486 | and escaped () = (case nextch() |
| 487 | of #"b" => #"\008" |
| 488 | | #"n" => #"\n" |
| 489 | | #"r" => #"\r" |
| 490 | | #"t" => #"\t" |
| 491 | | #"h" => #"\128" |
| 492 | | x => let |
| 493 | fun err t = prErr("illegal ascii escape '"^(implode(rev t))^"'") |
| 494 | fun cvt c = (Char.ord c - Char.ord #"0") |
| 495 | fun f (n, c, t) = if c=3 |
| 496 | then if n >= (!CharSetSize) |
| 497 | then err t |
| 498 | else Char.chr n |
| 499 | else let val ch=nextch() |
| 500 | in |
| 501 | if isDigit ch |
| 502 | then f(n*10+(cvt ch), c+1, ch::t) |
| 503 | else err t |
| 504 | end |
| 505 | in |
| 506 | if isDigit x then f(cvt x, 1, [x]) else x |
| 507 | end |
| 508 | (* end case *)) |
| 509 | |
| 510 | and onechar x = let val c = Array.array(!CharSetSize, false) |
| 511 | in |
| 512 | Array.update(c, Char.ord(x), true); CHARS(c) |
| 513 | end |
| 514 | |
| 515 | in case !LexState of 0 => let val makeTok = fn () => |
| 516 | case skipws() |
| 517 | (* Lex % operators *) |
| 518 | of #"%" => (case nextch() of |
| 519 | #"%" => LEXMARK |
| 520 | | a => let fun f s = |
| 521 | let val a = nextch() |
| 522 | in if isLetter a then f(a::s) |
| 523 | else (ungetch(!LexBuf); |
| 524 | implode(rev s)) |
| 525 | end |
| 526 | in case f [a] |
| 527 | of "reject" => REJECT |
| 528 | | "count" => COUNT |
| 529 | | "full" => FULLCHARSET |
| 530 | | "s" => LEXSTATES |
| 531 | | "S" => LEXSTATES |
| 532 | | "structure" => STRUCT |
| 533 | | "header" => HEADER |
| 534 | | "arg" => ARG |
| 535 | | "posarg" => POSARG |
| 536 | | "posint" => POSINT |
| 537 | | _ => prErr "unknown % operator " |
| 538 | end |
| 539 | ) |
| 540 | (* semicolon (for end of LEXSTATES) *) |
| 541 | | #";" => SEMI |
| 542 | (* anything else *) |
| 543 | | ch => if isLetter(ch) then |
| 544 | let fun getID matched = |
| 545 | let val x = nextch() |
| 546 | (**** fix by JHR |
| 547 | in if isLetter(x) orelse isDigit(x) orelse |
| 548 | x = "_" orelse x = "'" |
| 549 | ****) |
| 550 | in if (isIdentChr x) |
| 551 | then getID (x::matched) |
| 552 | else (ungetch(!LexBuf); implode(rev matched)) |
| 553 | end |
| 554 | in ID(getID [ch]) |
| 555 | end |
| 556 | else prSynErr (String.concat[ |
| 557 | "bad character: \"", Char.toString ch, "\"" |
| 558 | ]) |
| 559 | in NextTok := makeTok() |
| 560 | end |
| 561 | | 1 => let val rec makeTok = fn () => |
| 562 | if !inquote then case nextch() of |
| 563 | (* inside quoted string *) |
| 564 | #"\\" => onechar(escaped()) |
| 565 | | #"\"" => (inquote := false; makeTok()) |
| 566 | | #"\n" => (prSynErr "end-of-line inside quoted string"; |
| 567 | inquote := false; makeTok()) |
| 568 | | x => onechar(x) |
| 569 | else case skipws() of |
| 570 | (* single character operators *) |
| 571 | #"?" => QMARK |
| 572 | | #"*" => STAR |
| 573 | | #"+" => PLUS |
| 574 | | #"|" => BAR |
| 575 | | #"(" => LP |
| 576 | | #")" => RP |
| 577 | | #"^" => CARAT |
| 578 | | #"$" => DOLLAR |
| 579 | | #"/" => SLASH |
| 580 | | #";" => SEMI |
| 581 | | #"." => let val c = Array.array(!CharSetSize,true) in |
| 582 | Array.update(c,10,false); CHARS(c) |
| 583 | end |
| 584 | (* assign and arrow *) |
| 585 | | #"=" => let val c = nextch() in |
| 586 | if c = #">" then ARROW else (ungetch(!LexBuf); ASSIGN) |
| 587 | end |
| 588 | (* character set *) |
| 589 | | #"[" => let val rec classch = fn () => let val x = skipws() |
| 590 | in if x = #"\\" then escaped() else x |
| 591 | end; |
| 592 | val first = classch(); |
| 593 | val flag = (first <> #"^"); |
| 594 | val c = Array.array(!CharSetSize,not flag); |
| 595 | fun add NONE = () |
| 596 | | add (SOME x) = Array.update(c, Char.ord(x), flag) |
| 597 | and range (x, y) = if x>y |
| 598 | then (prErr "bad char. range") |
| 599 | else let |
| 600 | val i = ref(Char.ord(x)) and j = Char.ord(y) |
| 601 | in while !i<=j do ( |
| 602 | add (SOME(Char.chr(!i))); |
| 603 | i := !i + 1) |
| 604 | end |
| 605 | and getClass last = (case classch() |
| 606 | of #"]" => (add(last); c) |
| 607 | | #"-" => (case last |
| 608 | of NONE => getClass(SOME #"-") |
| 609 | | (SOME last') => let val x = classch() |
| 610 | in |
| 611 | if x = #"]" |
| 612 | then (add(last); add(SOME #"-"); c) |
| 613 | else (range(last',x); getClass(NONE)) |
| 614 | end |
| 615 | (* end case *)) |
| 616 | | x => (add(last); getClass(SOME x)) |
| 617 | (* end case *)) |
| 618 | in CHARS(getClass(if first = #"^" then NONE else SOME first)) |
| 619 | end |
| 620 | (* Start States specification *) |
| 621 | | #"<" => let val rec get_state = fn (prev,matched) => |
| 622 | case nextch() of |
| 623 | #">" => matched::prev |
| 624 | | #"," => get_state(matched::prev,"") |
| 625 | | x => if isIdentChr(x) |
| 626 | then get_state(prev,matched ^ String.str x) |
| 627 | else (prSynErr "bad start state list") |
| 628 | in STATE(get_state(nil,"")) |
| 629 | end |
| 630 | (* {id} or repititions *) |
| 631 | | #"{" => let val ch = nextch() in if isLetter(ch) then |
| 632 | let fun getID matched = (case nextch() |
| 633 | of #"}" => matched |
| 634 | | x => if (isIdentChr x) then |
| 635 | getID(matched ^ String.str x) |
| 636 | else (prErr "invalid char. class name") |
| 637 | (* end case *)) |
| 638 | in ID(getID(String.str ch)) |
| 639 | end |
| 640 | else if isDigit(ch) then |
| 641 | let fun get_r (matched, r1) = (case nextch() |
| 642 | of #"}" => let val n = atoi(matched) in |
| 643 | if r1 = ~1 then (n,n) else (r1,n) |
| 644 | end |
| 645 | | #"," => if r1 = ~1 then get_r("",atoi(matched)) |
| 646 | else (prErr "invalid repetitions spec.") |
| 647 | | x => if isDigit(x) |
| 648 | then get_r(matched ^ String.str x,r1) |
| 649 | else (prErr "invalid char in repetitions spec") |
| 650 | (* end case *)) |
| 651 | in REPS(get_r(String.str ch,~1)) |
| 652 | end |
| 653 | else (prErr "bad repetitions spec") |
| 654 | end |
| 655 | (* Lex % operators *) |
| 656 | | #"\\" => onechar(escaped()) |
| 657 | (* start quoted string *) |
| 658 | | #"\"" => (inquote := true; makeTok()) |
| 659 | (* anything else *) |
| 660 | | ch => onechar(ch) |
| 661 | in NextTok := makeTok() |
| 662 | end |
| 663 | | 2 => NextTok := |
| 664 | (case skipws() of |
| 665 | #"(" => |
| 666 | let |
| 667 | fun loop_to_end (backslash, x) = |
| 668 | let |
| 669 | val c = getch (! LexBuf) |
| 670 | val notb = not backslash |
| 671 | val nstr = c :: x |
| 672 | in |
| 673 | case c of |
| 674 | #"\"" => if notb then nstr |
| 675 | else loop_to_end (false, nstr) |
| 676 | | _ => loop_to_end (c = #"\\" andalso notb, nstr) |
| 677 | end |
| 678 | fun GetAct (lpct, x) = |
| 679 | let |
| 680 | val c = getch (! LexBuf) |
| 681 | val nstr = c :: x |
| 682 | in |
| 683 | case c of |
| 684 | #"\"" => GetAct (lpct, loop_to_end (false, nstr)) |
| 685 | | #"(" => GetAct (lpct + 1, nstr) |
| 686 | | #")" => if lpct = 0 then implode (rev x) |
| 687 | else GetAct(lpct - 1, nstr) |
| 688 | | _ => GetAct(lpct, nstr) |
| 689 | end |
| 690 | in |
| 691 | ACTION (getLexPos (), GetAct (0,nil)) |
| 692 | end |
| 693 | | #";" => SEMI |
| 694 | | c => (prSynErr ("invalid character " ^ String.str c))) |
| 695 | | _ => raise LexError |
| 696 | end |
| 697 | handle eof => NextTok := EOF ; |
| 698 | |
| 699 | fun GetTok (_:unit) : token = |
| 700 | let val t = !NextTok in AdvanceTok(); t |
| 701 | end; |
| 702 | val SymTab = ref (create String.<=) : (string,exp) dictionary ref |
| 703 | |
| 704 | fun GetExp () : exp = |
| 705 | |
| 706 | let val rec optional = fn e => ALT(EPS,e) |
| 707 | |
| 708 | and lookup' = fn name => |
| 709 | lookup(!SymTab) name |
| 710 | handle LOOKUP => prErr ("bad regular expression name: "^ |
| 711 | name) |
| 712 | |
| 713 | and newline = fn () => let val c = Array.array(!CharSetSize,false) in |
| 714 | Array.update(c,10,true); c |
| 715 | end |
| 716 | |
| 717 | and endline = fn e => trail(e,CLASS(newline(),0)) |
| 718 | |
| 719 | and trail = fn (e1,e2) => CAT(CAT(e1,TRAIL(0)),e2) |
| 720 | |
| 721 | and closure1 = fn e => CAT(e,CLOSURE(e)) |
| 722 | |
| 723 | and repeat = fn (min,max,e) => let val rec rep = fn |
| 724 | (0,0) => EPS |
| 725 | | (0,1) => ALT(e,EPS) |
| 726 | | (0,i) => CAT(rep(0,1),rep(0,i-1)) |
| 727 | | (i,j) => CAT(e,rep(i-1,j-1)) |
| 728 | in rep(min,max) |
| 729 | end |
| 730 | |
| 731 | and exp0 = fn () => case GetTok() of |
| 732 | CHARS(c) => exp1(CLASS(c,0)) |
| 733 | | LP => let val e = exp0() in |
| 734 | if !NextTok = RP then |
| 735 | (AdvanceTok(); exp1(e)) |
| 736 | else (prSynErr "missing ')'") end |
| 737 | | ID(name) => exp1(lookup' name) |
| 738 | | _ => raise SyntaxError |
| 739 | |
| 740 | and exp1 = fn (e) => case !NextTok of |
| 741 | SEMI => e |
| 742 | | ARROW => e |
| 743 | | EOF => e |
| 744 | | LP => exp2(e,exp0()) |
| 745 | | RP => e |
| 746 | | t => (AdvanceTok(); case t of |
| 747 | QMARK => exp1(optional(e)) |
| 748 | | STAR => exp1(CLOSURE(e)) |
| 749 | | PLUS => exp1(closure1(e)) |
| 750 | | CHARS(c) => exp2(e,CLASS(c,0)) |
| 751 | | BAR => ALT(e,exp0()) |
| 752 | | DOLLAR => (UsesTrailingContext := true; endline(e)) |
| 753 | | SLASH => (UsesTrailingContext := true; |
| 754 | trail(e,exp0())) |
| 755 | | REPS(i,j) => exp1(repeat(i,j,e)) |
| 756 | | ID(name) => exp2(e,lookup' name) |
| 757 | | _ => raise SyntaxError) |
| 758 | |
| 759 | and exp2 = fn (e1,e2) => case !NextTok of |
| 760 | SEMI => CAT(e1,e2) |
| 761 | | ARROW => CAT(e1,e2) |
| 762 | | EOF => CAT(e1,e2) |
| 763 | | LP => exp2(CAT(e1,e2),exp0()) |
| 764 | | RP => CAT(e1,e2) |
| 765 | | t => (AdvanceTok(); case t of |
| 766 | QMARK => exp1(CAT(e1,optional(e2))) |
| 767 | | STAR => exp1(CAT(e1,CLOSURE(e2))) |
| 768 | | PLUS => exp1(CAT(e1,closure1(e2))) |
| 769 | | CHARS(c) => exp2(CAT(e1,e2),CLASS(c,0)) |
| 770 | | BAR => ALT(CAT(e1,e2),exp0()) |
| 771 | | DOLLAR => (UsesTrailingContext := true; |
| 772 | endline(CAT(e1,e2))) |
| 773 | | SLASH => (UsesTrailingContext := true; |
| 774 | trail(CAT(e1,e2),exp0())) |
| 775 | | REPS(i,j) => exp1(CAT(e1,repeat(i,j,e2))) |
| 776 | | ID(name) => exp2(CAT(e1,e2),lookup' name) |
| 777 | | _ => raise SyntaxError) |
| 778 | in exp0() |
| 779 | end; |
| 780 | val StateTab = ref(create(String.<=)) : (string,int) dictionary ref |
| 781 | |
| 782 | val StateNum = ref 0; |
| 783 | |
| 784 | fun GetStates () : int list = |
| 785 | |
| 786 | let fun add nil sl = sl |
| 787 | | add (x::y) sl = add y (union ([lookup (!StateTab)(x) |
| 788 | handle LOOKUP => |
| 789 | prErr ("bad state name: "^x) |
| 790 | ],sl)) |
| 791 | |
| 792 | fun addall i sl = |
| 793 | if i <= !StateNum then addall (i+2) (union ([i],sl)) |
| 794 | else sl |
| 795 | |
| 796 | fun incall (x::y) = (x+1)::incall y |
| 797 | | incall nil = nil |
| 798 | |
| 799 | fun addincs nil = nil |
| 800 | | addincs (x::y) = x::(x+1)::addincs y |
| 801 | |
| 802 | val state_list = |
| 803 | case !NextTok of |
| 804 | STATE s => (AdvanceTok(); LexState := 1; add s nil) |
| 805 | | _ => addall 1 nil |
| 806 | |
| 807 | in case !NextTok |
| 808 | of CARAT => (LexState := 1; AdvanceTok(); UsesPrevNewLine := true; |
| 809 | incall state_list) |
| 810 | | _ => addincs state_list |
| 811 | end |
| 812 | |
| 813 | val LeafNum = ref ~1; |
| 814 | |
| 815 | fun renum(e : exp) : exp = |
| 816 | let val rec label = fn |
| 817 | EPS => EPS |
| 818 | | CLASS(x,_) => CLASS(x,++LeafNum) |
| 819 | | CLOSURE(e) => CLOSURE(label(e)) |
| 820 | | ALT(e1,e2) => ALT(label(e1),label(e2)) |
| 821 | | CAT(e1,e2) => CAT(label(e1),label(e2)) |
| 822 | | TRAIL(i) => TRAIL(++LeafNum) |
| 823 | | END(i) => END(++LeafNum) |
| 824 | in label(e) |
| 825 | end; |
| 826 | |
| 827 | exception ParseError; |
| 828 | |
| 829 | fun parse() : (string * (int list * exp) list * ((string,pos*string) dictionary)) = |
| 830 | let val Accept = ref (create String.<=) : (string,pos*string) dictionary ref |
| 831 | val rec ParseRtns = fn l => case getch(!LexBuf) of |
| 832 | #"%" => let val c = getch(!LexBuf) in |
| 833 | if c = #"%" then (implode (rev l)) |
| 834 | else ParseRtns(c :: #"%" :: l) |
| 835 | end |
| 836 | | c => ParseRtns(c::l) |
| 837 | and ParseDefs = fn () => |
| 838 | (LexState:=0; AdvanceTok(); case !NextTok of |
| 839 | LEXMARK => () |
| 840 | | LEXSTATES => |
| 841 | let fun f () = (case !NextTok of (ID i) => |
| 842 | (StateTab := enter(!StateTab)(i,++StateNum); |
| 843 | ++StateNum; AdvanceTok(); f()) |
| 844 | | _ => ()) |
| 845 | in AdvanceTok(); f (); |
| 846 | if !NextTok=SEMI then ParseDefs() else |
| 847 | (prSynErr "expected ';'") |
| 848 | end |
| 849 | | ID x => (LexState:=1; AdvanceTok(); if GetTok() = ASSIGN |
| 850 | then (SymTab := enter(!SymTab)(x,GetExp()); |
| 851 | if !NextTok = SEMI then ParseDefs() |
| 852 | else (prSynErr "expected ';'")) |
| 853 | else raise SyntaxError) |
| 854 | | REJECT => (HaveReject := true; ParseDefs()) |
| 855 | | COUNT => (CountNewLines := true; ParseDefs()) |
| 856 | | FULLCHARSET => (CharSetSize := 256; ParseDefs()) |
| 857 | | HEADER => (LexState := 2; AdvanceTok(); |
| 858 | case GetTok() |
| 859 | of ACTION (p, s) => |
| 860 | if (!StrDecl) then |
| 861 | (prErr "cannot have both %structure and %header \ |
| 862 | \declarations") |
| 863 | else if (!HeaderDecl) then |
| 864 | (prErr "duplicate %header declarations") |
| 865 | else |
| 866 | (HeaderCode := s; LexState := 0; |
| 867 | HeaderPos := p; |
| 868 | HeaderDecl := true; ParseDefs()) |
| 869 | | _ => raise SyntaxError) |
| 870 | | POSARG => (PosArg := true; ParseDefs()) |
| 871 | | POSINT => (AdvanceTok(); |
| 872 | case !NextTok of |
| 873 | (ID i) => |
| 874 | if (!PosIntDecl) then |
| 875 | (prErr "duplicate %posint declarations") |
| 876 | else (PosIntName := i; PosIntDecl := true) |
| 877 | | _ => (prErr "expected ID"); |
| 878 | ParseDefs()) |
| 879 | | ARG => (LexState := 2; AdvanceTok(); |
| 880 | case GetTok() |
| 881 | of ACTION s => |
| 882 | (case !ArgCode |
| 883 | of SOME _ => prErr "duplicate %arg declarations" |
| 884 | | NONE => ArgCode := SOME s; |
| 885 | LexState := 0; |
| 886 | ParseDefs()) |
| 887 | | _ => raise SyntaxError) |
| 888 | | STRUCT => (AdvanceTok(); |
| 889 | case !NextTok of |
| 890 | (ID i) => |
| 891 | if (!HeaderDecl) then |
| 892 | (prErr "cannot have both %structure and %header \ |
| 893 | \declarations") |
| 894 | else if (!StrDecl) then |
| 895 | (prErr "duplicate %structure declarations") |
| 896 | else (StrName := i; StrDecl := true) |
| 897 | | _ => (prErr "expected ID"); |
| 898 | ParseDefs()) |
| 899 | | _ => raise SyntaxError) |
| 900 | and ParseRules = |
| 901 | fn rules => (LexState:=1; AdvanceTok(); case !NextTok of |
| 902 | EOF => rules |
| 903 | | _ => |
| 904 | let val s = GetStates() |
| 905 | val e = renum(CAT(GetExp(),END(0))) |
| 906 | in |
| 907 | if !NextTok = ARROW then |
| 908 | (LexState:=2; AdvanceTok(); |
| 909 | case GetTok() of ACTION(act) => |
| 910 | if !NextTok=SEMI then |
| 911 | (Accept:=enter(!Accept) (Int.toString (!LeafNum),act); |
| 912 | ParseRules((s,e)::rules)) |
| 913 | else (prSynErr "expected ';'") |
| 914 | | _ => raise SyntaxError) |
| 915 | else (prSynErr "expected '=>'") |
| 916 | end) |
| 917 | in let val usercode = ParseRtns nil |
| 918 | in (ParseDefs(); (usercode,ParseRules(nil),!Accept)) |
| 919 | end |
| 920 | end handle SyntaxError => (prSynErr "") |
| 921 | |
| 922 | fun makebegin () : unit = |
| 923 | let fun make nil = () |
| 924 | | make ((x,n:int)::y)=(say "val "; say x; say " = " ; |
| 925 | say "STARTSTATE "; |
| 926 | say (Int.toString n); say ";\n"; make y) |
| 927 | in say "\n(* start state definitions *)\n\n"; make(listofdict(!StateTab)) |
| 928 | end |
| 929 | |
| 930 | structure L = |
| 931 | struct |
| 932 | nonfix > |
| 933 | type key = int list * string |
| 934 | fun > ((key,item:string),(key',item')) = |
| 935 | let fun f ((a:int)::a') (b::b') = if Int.> (a,b) then true |
| 936 | else if a=b then f a' b' |
| 937 | else false |
| 938 | | f _ _ = false |
| 939 | in f key key' |
| 940 | end |
| 941 | end |
| 942 | |
| 943 | structure RB = RedBlack(L) |
| 944 | |
| 945 | fun maketable (fins:(int * (int list)) list, |
| 946 | tcs :(int * (int list)) list, |
| 947 | tcpairs: (int * int) list, |
| 948 | trans : (int*(int list)) list) : unit = |
| 949 | |
| 950 | (* Fins = (state #, list of final leaves for the state) list |
| 951 | tcs = (state #, list of trailing context leaves which begin in this state) |
| 952 | list |
| 953 | tcpairs = (trailing context leaf, end leaf) list |
| 954 | trans = (state #,list of transitions for state) list *) |
| 955 | |
| 956 | let datatype elem = N of int | T of int | D of int |
| 957 | val count = ref 0 |
| 958 | val _ = (if length(trans)<256 then CharFormat := true |
| 959 | else CharFormat := false; |
| 960 | if !UsesTrailingContext then |
| 961 | (say "\ndatatype yyfinstate = N of int | \ |
| 962 | \ T of int | D of int\n") |
| 963 | else say "\ndatatype yyfinstate = N of int"; |
| 964 | say "\ntype statedata = {fin : yyfinstate list, trans: "; |
| 965 | case !CharFormat of |
| 966 | true => say "string}" |
| 967 | | false => say "int Vector.vector}"; |
| 968 | say "\n(* transition & final state table *)\nval tab = let\n"; |
| 969 | case !CharFormat of |
| 970 | true => () |
| 971 | | false => |
| 972 | (say "fun decode s k =\n"; |
| 973 | say " let val k' = k + k\n"; |
| 974 | say " val hi = Char.ord(String.sub(s, k'))\n"; |
| 975 | say " val lo = Char.ord(String.sub(s, k' + 1))\n"; |
| 976 | say " in hi * 256 + lo end\n")) |
| 977 | |
| 978 | val newfins = |
| 979 | let fun IsEndLeaf t = |
| 980 | let fun f ((l,e)::r) = if (e=t) then true else f r |
| 981 | | f nil = false in f tcpairs end |
| 982 | |
| 983 | fun GetEndLeaf t = |
| 984 | let fun f ((tl,el)::r) = if (tl=t) then el else f r |
| 985 | | f _ = raise Match |
| 986 | in f tcpairs |
| 987 | end |
| 988 | fun GetTrConLeaves s = |
| 989 | let fun f ((s',l)::r) = if (s = s') then l else f r |
| 990 | | f nil = nil |
| 991 | in f tcs |
| 992 | end |
| 993 | fun sort_leaves s = |
| 994 | let fun insert (x:int) (a::b) = |
| 995 | if (x <= a) then x::(a::b) |
| 996 | else a::(insert x b) |
| 997 | | insert x nil = [x] |
| 998 | in List.foldr (fn (x,r) => insert x r) [] s |
| 999 | end |
| 1000 | fun conv a = if (IsEndLeaf a) then (D a) else (N a) |
| 1001 | fun merge (a::a',b::b') = |
| 1002 | if (a <= b) then (conv a)::merge(a',b::b') |
| 1003 | else (T b)::(merge(a::a',b')) |
| 1004 | | merge (a::a',nil) = (conv a)::(merge (a',nil)) |
| 1005 | | merge (nil,b::b') = (T b)::(merge (b',nil)) |
| 1006 | | merge (nil,nil) = nil |
| 1007 | |
| 1008 | in map (fn (x,l) => |
| 1009 | rev (merge (l, |
| 1010 | sort_leaves (map (fn x => GetEndLeaf x) (GetTrConLeaves x))))) |
| 1011 | fins |
| 1012 | end |
| 1013 | |
| 1014 | val rs = |
| 1015 | let open RB |
| 1016 | fun makeItems x = |
| 1017 | let fun emit8(x, pos) = |
| 1018 | let val s = StringCvt.padLeft #"0" 3 (Int.toString x) |
| 1019 | in |
| 1020 | case pos |
| 1021 | of 16 => (say "\\\n\\\\"; say s; 1) |
| 1022 | | _ => (say "\\"; say s; pos+1) |
| 1023 | end |
| 1024 | fun emit16(x, pos) = |
| 1025 | let val hi8 = x div 256 |
| 1026 | val lo8 = x - hi8 * 256 (* x rem 256 *) |
| 1027 | in |
| 1028 | emit8(lo8, emit8(hi8, pos)) |
| 1029 | end |
| 1030 | fun MakeString([], _, _) = () |
| 1031 | | MakeString(x::xs, emitter, pos) = |
| 1032 | MakeString(xs, emitter, emitter(x, pos)) |
| 1033 | in case !CharFormat of |
| 1034 | true => (say " \n\""; MakeString(x,emit8,0); say "\"\n") |
| 1035 | | false => (say (Int.toString(length x)); |
| 1036 | say ", \n\""; MakeString(x,emit16,0); say "\"\n") |
| 1037 | end |
| 1038 | |
| 1039 | fun makeEntry(nil,rs,t) = rev rs |
| 1040 | | makeEntry(((l:int,x)::y),rs,t) = |
| 1041 | let val name = (Int.toString l) |
| 1042 | in let val (r,n) = lookup ((x,name),t) |
| 1043 | in makeEntry(y,(n::rs),t) |
| 1044 | end handle notfound _ => |
| 1045 | (count := !count+1; |
| 1046 | say " ("; say name; say ","; |
| 1047 | makeItems x; say "),\n"; |
| 1048 | makeEntry(y,(name::rs),(insert ((x,name),t)))) |
| 1049 | end |
| 1050 | |
| 1051 | val _ = say "val s = [ \n" |
| 1052 | val res = makeEntry(trans,nil,empty) |
| 1053 | val _ = |
| 1054 | case !CharFormat |
| 1055 | of true => (say "(0, \"\")]\n"; say "fun f x = x \n") |
| 1056 | | false => (say "(0, 0, \"\")]\n"; |
| 1057 | say "fun f(n, i, x) = (n, Vector.tabulate(i, decode x)) \n") |
| 1058 | |
| 1059 | val _ = say "val s = List.map f (List.rev (tl (List.rev s))) \n" |
| 1060 | val _ = say "exception LexHackingError \n" |
| 1061 | val _ = say "fun look ((j,x)::r, i: int) = if i = j then x else look(r, i) \n" |
| 1062 | val _ = say " | look ([], i) = raise LexHackingError\n" |
| 1063 | |
| 1064 | val _ = say "fun g {fin=x, trans=i} = {fin=x, trans=look(s,i)} \n" |
| 1065 | in res |
| 1066 | end |
| 1067 | |
| 1068 | fun makeTable args = let |
| 1069 | fun makeOne (a, b) = let |
| 1070 | fun item (N i) = ("N", i) |
| 1071 | | item (T i) = ("T", i) |
| 1072 | | item (D i) = ("D", i) |
| 1073 | fun makeItem x = let |
| 1074 | val (t, n) = item x |
| 1075 | in |
| 1076 | app say ["(", t, " ", Int.toString n, ")"] |
| 1077 | end |
| 1078 | fun makeItems [] = () |
| 1079 | | makeItems [x] = makeItem x |
| 1080 | | makeItems (hd :: tl) = |
| 1081 | (makeItem hd; say ","; makeItems tl) |
| 1082 | in |
| 1083 | say "{fin = ["; |
| 1084 | makeItems b; |
| 1085 | app say ["], trans = ", a, "}"] |
| 1086 | end |
| 1087 | fun mt ([], []) = () |
| 1088 | | mt ([a], [b]) = makeOne (a, b) |
| 1089 | | mt (a :: a', b :: b') = |
| 1090 | (makeOne (a, b); say ",\n"; mt (a', b')) |
| 1091 | | mt _ = raise Match |
| 1092 | in |
| 1093 | mt args |
| 1094 | end |
| 1095 | |
| 1096 | (* |
| 1097 | fun makeTable(nil,nil) = () |
| 1098 | | makeTable(a::a',b::b') = |
| 1099 | let fun makeItems nil = () |
| 1100 | | makeItems (hd::tl) = |
| 1101 | let val (t,n) = |
| 1102 | case hd of |
| 1103 | (N i) => ("(N ",i) |
| 1104 | | (T i) => ("(T ",i) |
| 1105 | | (D i) => ("(D ",i) |
| 1106 | in (say t; say (Int.toString n); say ")"; |
| 1107 | if null tl |
| 1108 | then () |
| 1109 | else (say ","; makeItems tl)) |
| 1110 | end |
| 1111 | in (say "{fin = ["; makeItems b; |
| 1112 | say "], trans = "; say a; say "}"; |
| 1113 | if null a' |
| 1114 | then () |
| 1115 | else (say ",\n"; makeTable(a',b'))) |
| 1116 | end |
| 1117 | *) |
| 1118 | |
| 1119 | fun msg x = TextIO.output(TextIO.stdOut, x) |
| 1120 | |
| 1121 | in (say "in Vector.fromList(List.map g \n["; makeTable(rs,newfins); |
| 1122 | say "])\nend\n"; |
| 1123 | msg ("\nNumber of states = " ^ (Int.toString (length trans))); |
| 1124 | msg ("\nNumber of distinct rows = " ^ (Int.toString (!count))); |
| 1125 | msg ("\nApprox. memory size of trans. table = " ^ |
| 1126 | (Int.toString (!count*(!CharSetSize)*(if !CharFormat then 1 else 8)))); |
| 1127 | msg " bytes\n") |
| 1128 | end |
| 1129 | |
| 1130 | (* makeaccept: Takes a (string,string) dictionary, prints case statement for |
| 1131 | accepting leaf actions. The key strings are the leaf #'s, the data strings |
| 1132 | are the actions *) |
| 1133 | |
| 1134 | fun makeaccept ends = |
| 1135 | let fun startline f = if f then say " " else say "| " |
| 1136 | fun make(nil,f) = (startline f; say "_ => raise Internal.LexerError\n") |
| 1137 | | make((x,(p,a))::y,f) = (startline f; say x; say " => "; |
| 1138 | if Substring.size(#2 (Substring.position "yytext" (Substring.full a))) = 0 |
| 1139 | then |
| 1140 | (say "("; sayPos (SOME p); say a; sayPos NONE; say ")") |
| 1141 | else (say "let val yytext=yymktext() in "; |
| 1142 | sayPos (SOME p); say a; sayPos NONE; say " end"); |
| 1143 | say "\n"; make(y,false)) |
| 1144 | in make (listofdict(ends),true) |
| 1145 | end |
| 1146 | |
| 1147 | fun leafdata(e:(int list * exp) list) = |
| 1148 | let val fp = Array.array(!LeafNum + 1,nil) |
| 1149 | and leaf = Array.array(!LeafNum + 1,EPS) |
| 1150 | and tcpairs = ref nil |
| 1151 | and trailmark = ref ~1; |
| 1152 | val rec add = fn |
| 1153 | (nil,x) => () |
| 1154 | | (hd::tl,x) => (Array.update(fp,hd,union(fp sub hd,x)); |
| 1155 | add(tl,x)) |
| 1156 | and moredata = fn |
| 1157 | CLOSURE(e1) => |
| 1158 | (moredata(e1); add(lastpos(e1),firstpos(e1))) |
| 1159 | | ALT(e1,e2) => (moredata(e1); moredata(e2)) |
| 1160 | | CAT(e1,e2) => (moredata(e1); moredata(e2); |
| 1161 | add(lastpos(e1),firstpos(e2))) |
| 1162 | | CLASS(x,i) => Array.update(leaf,i,CLASS(x,i)) |
| 1163 | | TRAIL(i) => (Array.update(leaf,i,TRAIL(i)); if !trailmark = ~1 |
| 1164 | then trailmark := i else ()) |
| 1165 | | END(i) => (Array.update(leaf,i,END(i)); if !trailmark <> ~1 |
| 1166 | then (tcpairs := (!trailmark,i)::(!tcpairs); |
| 1167 | trailmark := ~1) else ()) |
| 1168 | | _ => () |
| 1169 | and makedata = fn |
| 1170 | nil => () |
| 1171 | | (_,x)::tl => (moredata(x);makedata(tl)) |
| 1172 | in trailmark := ~1; makedata(e); (fp,leaf,!tcpairs) |
| 1173 | end; |
| 1174 | |
| 1175 | fun makedfa(rules) = |
| 1176 | let val StateTab = ref (create(String.<=)) : (string,int) dictionary ref |
| 1177 | val fintab = ref (create(Int.<=)) : (int,(int list)) dictionary ref |
| 1178 | val transtab = ref (create(Int.<=)) : (int,int list) dictionary ref |
| 1179 | val tctab = ref (create(Int.<=)) : (int,(int list)) dictionary ref |
| 1180 | val (fp, leaf, tcpairs) = leafdata(rules); |
| 1181 | |
| 1182 | fun visit (state,statenum) = |
| 1183 | let val transitions = gettrans(state) in |
| 1184 | fintab := enter(!fintab)(statenum,getfin(state)); |
| 1185 | tctab := enter(!tctab)(statenum,gettc(state)); |
| 1186 | transtab := enter(!transtab)(statenum,transitions) |
| 1187 | end |
| 1188 | |
| 1189 | and visitstarts (states) = |
| 1190 | let fun vs nil i = () |
| 1191 | | vs (hd::tl) i = (visit (hd,i); vs tl (i+1)) |
| 1192 | in vs states 0 |
| 1193 | end |
| 1194 | |
| 1195 | and hashstate(s: int list) = |
| 1196 | let val rec hs = |
| 1197 | fn (nil,z) => z |
| 1198 | | ((x:int)::y,z) => hs(y,z ^ " " ^ (Int.toString x)) |
| 1199 | in hs(s,"") |
| 1200 | end |
| 1201 | |
| 1202 | and find(s) = lookup(!StateTab)(hashstate(s)) |
| 1203 | |
| 1204 | and add(s,n) = StateTab := enter(!StateTab)(hashstate(s),n) |
| 1205 | |
| 1206 | and getstate (state) = |
| 1207 | find(state) |
| 1208 | handle LOOKUP => let val n = ++StateNum in |
| 1209 | add(state,n); visit(state,n); n |
| 1210 | end |
| 1211 | |
| 1212 | and getfin state = |
| 1213 | let fun f nil fins = fins |
| 1214 | | f (hd::tl) fins = |
| 1215 | case (leaf sub hd) |
| 1216 | of END _ => f tl (hd::fins) |
| 1217 | | _ => f tl fins |
| 1218 | in f state nil |
| 1219 | end |
| 1220 | |
| 1221 | and gettc state = |
| 1222 | let fun f nil fins = fins |
| 1223 | | f (hd::tl) fins = |
| 1224 | case (leaf sub hd) |
| 1225 | of TRAIL _ => f tl (hd::fins) |
| 1226 | | _ => f tl fins |
| 1227 | in f state nil |
| 1228 | end |
| 1229 | |
| 1230 | and gettrans (state) = |
| 1231 | let fun loop c tlist = |
| 1232 | let fun cktrans nil r = r |
| 1233 | | cktrans (hd::tl) r = |
| 1234 | case (leaf sub hd) of |
| 1235 | CLASS(i,_)=> |
| 1236 | (if (i sub c) then cktrans tl (union(r,fp sub hd)) |
| 1237 | else cktrans tl r handle Subscript => |
| 1238 | cktrans tl r |
| 1239 | ) |
| 1240 | | _ => cktrans tl r |
| 1241 | in if c >= 0 then |
| 1242 | let val v=cktrans state nil |
| 1243 | in loop (c-1) (if v=nil then 0::tlist else (getstate v)::tlist) |
| 1244 | end |
| 1245 | else tlist |
| 1246 | end |
| 1247 | in loop ((!CharSetSize) - 1) nil |
| 1248 | end |
| 1249 | |
| 1250 | and startstates() = |
| 1251 | let val startarray = Array.array(!StateNum + 1, nil); |
| 1252 | fun listofarray(a,n) = |
| 1253 | let fun f i l = if i >= 0 then f (i-1) ((a sub i)::l) else l |
| 1254 | in f (n-1) nil end |
| 1255 | val rec makess = fn |
| 1256 | nil => () |
| 1257 | | (startlist,e)::tl => (fix(startlist,firstpos(e));makess(tl)) |
| 1258 | and fix = fn |
| 1259 | (nil,_) => () |
| 1260 | | (s::tl,firsts) => (Array.update(startarray,s, |
| 1261 | union(firsts,startarray sub s)); |
| 1262 | fix(tl,firsts)) |
| 1263 | in makess(rules);listofarray(startarray, !StateNum + 1) |
| 1264 | end |
| 1265 | |
| 1266 | in visitstarts(startstates()); |
| 1267 | (listofdict(!fintab),listofdict(!transtab),listofdict(!tctab),tcpairs) |
| 1268 | end |
| 1269 | |
| 1270 | val skel_hd = |
| 1271 | " struct\n\ |
| 1272 | \ structure UserDeclarations =\n\ |
| 1273 | \ struct\n\ |
| 1274 | \" |
| 1275 | |
| 1276 | val skel_mid2 = |
| 1277 | " | Internal.D k => action (i,(acts::l),k::rs)\n\ |
| 1278 | \ | Internal.T k =>\n\ |
| 1279 | \ let fun f (a::b,r) =\n\ |
| 1280 | \ if a=k\n\ |
| 1281 | \ then action(i,(((Internal.N a)::acts)::l),(b@r))\n\ |
| 1282 | \ else f (b,a::r)\n\ |
| 1283 | \ | f (nil,r) = action(i,(acts::l),rs)\n\ |
| 1284 | \ in f (rs,nil)\n\ |
| 1285 | \ end\n\ |
| 1286 | \" |
| 1287 | |
| 1288 | fun lexGen(infile) = |
| 1289 | let val outfile = infile ^ ".sml" |
| 1290 | val () = (InFile := OS.Path.file infile; OutFile := OS.Path.file outfile) |
| 1291 | fun PrintLexer (ends) = |
| 1292 | let val sayln = fn x => (say x; say "\n") |
| 1293 | in case !ArgCode |
| 1294 | of NONE => (sayln "fun lex () : Internal.result ="; |
| 1295 | sayln "let fun continue() = lex() in") |
| 1296 | | SOME (p,s) => |
| 1297 | (say "fun lex "; say "(yyarg as ("; |
| 1298 | sayPos (SOME p); say s; sayPos NONE; sayln ")) ="; |
| 1299 | sayln "let fun continue() : Internal.result = "); |
| 1300 | say " let fun scan (s,AcceptingLeaves : Internal.yyfinstate"; |
| 1301 | sayln " list list,l,i0) ="; |
| 1302 | if !UsesTrailingContext |
| 1303 | then say "\tlet fun action (i,nil,rs)" |
| 1304 | else say "\tlet fun action (i,nil)"; |
| 1305 | sayln " = raise LexError"; |
| 1306 | if !UsesTrailingContext |
| 1307 | then sayln "\t| action (i,nil::l,rs) = action(i-1,l,rs)" |
| 1308 | else sayln "\t| action (i,nil::l) = action (i-1,l)"; |
| 1309 | if !UsesTrailingContext |
| 1310 | then sayln "\t| action (i,(node::acts)::l,rs) =" |
| 1311 | else sayln "\t| action (i,(node::acts)::l) ="; |
| 1312 | sayln "\t\tcase node of"; |
| 1313 | sayln "\t\t Internal.N yyk => "; |
| 1314 | sayln "\t\t\t(let fun yymktext() = String.substring(!yyb,i0,i-i0)\n\ |
| 1315 | \\t\t\t val yypos = YYPosInt.+(YYPosInt.fromInt i0, !yygone)"; |
| 1316 | if !CountNewLines |
| 1317 | then (sayln "\t\t\tval _ = yylineno := CharVectorSlice.foldli"; |
| 1318 | sayln "\t\t\t\t(fn (_,#\"\\n\", n) => n+1 | (_,_, n) => n) (!yylineno) (CharVectorSlice.slice (!yyb,i0,SOME(i-i0)))") |
| 1319 | else (); |
| 1320 | if !HaveReject |
| 1321 | then (say "\t\t\tfun REJECT() = action(i,acts::l"; |
| 1322 | if !UsesTrailingContext |
| 1323 | then sayln ",rs)" else sayln ")") |
| 1324 | else (); |
| 1325 | sayln "\t\t\topen UserDeclarations Internal.StartStates"; |
| 1326 | sayln " in (yybufpos := i; case yyk of "; |
| 1327 | sayln ""; |
| 1328 | sayln "\t\t\t(* Application actions *)\n"; |
| 1329 | makeaccept(ends); |
| 1330 | say "\n\t\t) end "; |
| 1331 | say ")\n\n"; |
| 1332 | if (!UsesTrailingContext) then say skel_mid2 else (); |
| 1333 | sayln "\tval {fin,trans} = Vector.sub(Internal.tab, s)"; |
| 1334 | sayln "\tval NewAcceptingLeaves = fin::AcceptingLeaves"; |
| 1335 | sayln "\tin if l = !yybl then"; |
| 1336 | sayln "\t if trans = #trans(Vector.sub(Internal.tab,0))"; |
| 1337 | sayln "\t then action(l,NewAcceptingLeaves"; |
| 1338 | if !UsesTrailingContext then say ",nil" else (); |
| 1339 | say ") else"; |
| 1340 | sayln "\t let val newchars= if !yydone then \"\" else yyinput 1024"; |
| 1341 | sayln "\t in if (String.size newchars)=0"; |
| 1342 | sayln "\t\t then (yydone := true;"; |
| 1343 | say "\t\t if (l=i0) then UserDeclarations.eof "; |
| 1344 | sayln (case !ArgCode of NONE => "()" | SOME _ => "yyarg"); |
| 1345 | say "\t\t else action(l,NewAcceptingLeaves"; |
| 1346 | if !UsesTrailingContext then |
| 1347 | sayln ",nil))" else sayln "))"; |
| 1348 | sayln "\t\t else (if i0=l then yyb := newchars"; |
| 1349 | sayln "\t\t else yyb := String.substring(!yyb,i0,l-i0)^newchars;"; |
| 1350 | sayln "\t\t yygone := YYPosInt.+(!yygone, YYPosInt.fromInt i0);"; |
| 1351 | sayln "\t\t yybl := String.size (!yyb);"; |
| 1352 | sayln "\t\t scan (s,AcceptingLeaves,l-i0,0))"; |
| 1353 | sayln "\t end"; |
| 1354 | sayln "\t else let val NewChar = Char.ord(CharVector.sub(!yyb,l))"; |
| 1355 | if !CharSetSize=129 |
| 1356 | then sayln "\t\tval NewChar = if NewChar<128 then NewChar else 128" |
| 1357 | else (); |
| 1358 | say "\t\tval NewState = "; |
| 1359 | sayln (if !CharFormat |
| 1360 | then "Char.ord(CharVector.sub(trans,NewChar))" |
| 1361 | else "Vector.sub(trans, NewChar)"); |
| 1362 | say "\t\tin if NewState=0 then action(l,NewAcceptingLeaves"; |
| 1363 | if !UsesTrailingContext then sayln ",nil)" else sayln ")"; |
| 1364 | sayln "\t\telse scan(NewState,NewAcceptingLeaves,l+1,i0)"; |
| 1365 | sayln "\tend"; |
| 1366 | sayln "\tend"; |
| 1367 | if !UsesPrevNewLine then () else sayln "(*"; |
| 1368 | sayln "\tval start= if String.substring(!yyb,!yybufpos-1,1)=\"\\n\""; |
| 1369 | sayln "then !yybegin+1 else !yybegin"; |
| 1370 | if !UsesPrevNewLine then () else sayln "*)"; |
| 1371 | say "\tin scan("; |
| 1372 | if !UsesPrevNewLine then say "start" |
| 1373 | else say "!yybegin (* start *)"; |
| 1374 | sayln ",nil,!yybufpos,!yybufpos)"; |
| 1375 | sayln " end"; |
| 1376 | sayln (case !ArgCode of NONE => "end" | SOME _ => "in continue end"); |
| 1377 | sayln " in lex"; |
| 1378 | sayln " end"; |
| 1379 | sayln "end" |
| 1380 | end |
| 1381 | |
| 1382 | in (UsesPrevNewLine := false; |
| 1383 | ResetFlags(); |
| 1384 | LexBuf := make_ibuf(TextIO.openIn infile); |
| 1385 | NextTok := BOF; |
| 1386 | inquote := false; |
| 1387 | setLexOut (TextIO.openOut(outfile)); |
| 1388 | StateNum := 2; |
| 1389 | resetLexPos (); |
| 1390 | StateTab := enter(create(String.<=))("INITIAL",1); |
| 1391 | LeafNum := ~1; |
| 1392 | let |
| 1393 | val (user_code,rules,ends) = |
| 1394 | parse() handle x => |
| 1395 | (close_ibuf(!LexBuf); |
| 1396 | TextIO.closeOut(!LexOut); |
| 1397 | OS.FileSys.remove outfile; |
| 1398 | raise x) |
| 1399 | val (fins,trans,tctab,tcpairs) = makedfa(rules) |
| 1400 | val _ = if !UsesTrailingContext then |
| 1401 | (close_ibuf(!LexBuf); |
| 1402 | TextIO.closeOut(!LexOut); |
| 1403 | OS.FileSys.remove outfile; |
| 1404 | prErr "lookahead is unimplemented") |
| 1405 | else () |
| 1406 | in |
| 1407 | if (!HeaderDecl) |
| 1408 | then (sayPos (SOME (!HeaderPos)) |
| 1409 | ; say (!HeaderCode) |
| 1410 | ; sayPos NONE) |
| 1411 | else say ("structure " ^ (!StrName)); |
| 1412 | say "=\n"; |
| 1413 | say skel_hd; |
| 1414 | sayPos (SOME {line = 1, col = 0}); |
| 1415 | say user_code; |
| 1416 | sayPos NONE; |
| 1417 | say "end (* end of user routines *)\n"; |
| 1418 | say "exception LexError (* raised if illegal leaf "; |
| 1419 | say "action tried *)\n"; |
| 1420 | say "structure Internal =\n\tstruct\n"; |
| 1421 | maketable(fins,tctab,tcpairs,trans); |
| 1422 | say "structure StartStates =\n\tstruct\n"; |
| 1423 | say "\tdatatype yystartstate = STARTSTATE of int\n"; |
| 1424 | makebegin(); |
| 1425 | say "\nend\n"; |
| 1426 | say "type result = UserDeclarations.lexresult\n"; |
| 1427 | say "\texception LexerError (* raised if illegal leaf "; |
| 1428 | say "action tried *)\n"; |
| 1429 | say "end\n\n"; |
| 1430 | say ("structure YYPosInt : INTEGER = " ^ (!PosIntName) ^ "\n"); |
| 1431 | say (if (!PosArg) then "fun makeLexer (yyinput,yygone0:YYPosInt.int) =\nlet\n" |
| 1432 | else "fun makeLexer yyinput =\nlet\tval yygone0= YYPosInt.fromInt ~1\n"); |
| 1433 | if !CountNewLines then say "\tval yylineno = ref 0\n\n" else (); |
| 1434 | say "\tval yyb = ref \"\\n\" \t\t(* buffer *)\n\ |
| 1435 | \\tval yybl = ref 1\t\t(*buffer length *)\n\ |
| 1436 | \\tval yybufpos = ref 1\t\t(* location of next character to use *)\n\ |
| 1437 | \\tval yygone = ref yygone0\t(* position in file of beginning of buffer *)\n\ |
| 1438 | \\tval yydone = ref false\t\t(* eof found yet? *)\n\ |
| 1439 | \\tval yybegin = ref 1\t\t(*Current 'start state' for lexer *)\n\ |
| 1440 | \\n\tval YYBEGIN = fn (Internal.StartStates.STARTSTATE x) =>\n\ |
| 1441 | \\t\t yybegin := x\n\n"; |
| 1442 | PrintLexer(ends); |
| 1443 | close_ibuf(!LexBuf); |
| 1444 | TextIO.closeOut(!LexOut) |
| 1445 | end) |
| 1446 | end |
| 1447 | end |