1 (* From the SML
/NJ benchmark suite
. *)
5 val testit
: TextIO.outstream
-> unit
7 (* Lexical analyzer generator for Standard ML
.
8 Version
1.6.0, October
1994
10 Copyright (c
) 1989-1992 by Andrew W
. Appel
,
11 David R
. Tarditi
, James S
. Mattson
13 This software comes
with ABSOLUTELY NO WARRANTY
.
14 This software is subject only to the PRINCETON STANDARD ML SOFTWARE LIBRARY
15 COPYRIGHT NOTICE
, LICENSE AND DISCLAIMER
, (in the file
"COPYRIGHT",
16 distributed
with this software
). You may copy
and distribute this software
;
17 see the COPYRIGHT NOTICE for details
and restrictions
.
20 07/25/89 (drt
): added
%header declaration
, code to place
21 user declarations at same level
as makeLexer
, etc
.
22 This is needed for the parser generator
.
23 /10/89 (appel
): added
%arg
declaration (see lexgen
.doc
).
24 /04/90 (drt
): fixed following bug
: couldn
't use the lexer after an
25 error occurred
-- NextTok
and inquote weren
't being reset
26 10/22/91 (drt
): disabled use
of lookahead
27 10/23/92 (drt
): disabled use
of $
operator (which involves lookahead
),
28 added handlers for dictionary lookup routine
29 11/02/92 (drt
): changed handler for
exception Reject
in generated lexer
31 02/01/94 (appel
): Moved the
exception handler for Reject
in such
32 a way
as to allow tail
-recursion (improves performance
34 02/01/94 (appel
): Fixed a bug
in parsing
of state names
.
35 05/19/94 (Mikael Pettersson
, mpe@ida
.liu
.se
):
36 Transition tables are usually represented
as strings
, but
37 when the range is too large
, int vectors constructed by
38 code like
"Vector.vector[1,2,3,...]" are used instead
.
39 The problem
with this isn
't that the vector itself takes
40 a lot
of space
, but that the code generated by SML
/NJ to
41 construct the intermediate list at run
-time is
*HUGE
*. My
42 fix is to encode an
int vector
as a
string literal (using
43 two bytes per
int) and emit code to decode the
string to
44 a vector at run
-time
. SML
/NJ compiles
string literals into
45 substrings
in the code
, so this uses much less space
.
46 06/02/94 (jhr
): Modified export
-lex
.sml to conform to new installation
47 scheme
. Also removed tab characters from
string literals
.
48 10/05/94 (jhr
): Changed generator to produce code that uses the new
49 basis style strings
and characters
.
50 10/06/94 (jhr
) Modified code to compile under new basis style strings
52 02/08/95 (jhr
) Modified to use new
List module interface
.
53 05/18/95 (jhr
) changed
Vector.vector to
Vector.fromList
55 * $Log
: lexgen
.sml
,v $
56 * Revision
1.6 1996/10/03 14:57:30 jhr
57 * Qualified use
of Int.quot
, since it is no longer available at top
-level
; improved
58 * the code that prints the tables
.
60 * Revision
1.5 1996/09/16 12:25:14 george
61 * here is a bug
in ml
-lex (109.17) when using the
%count flag
. The yylineno
62 * variable should get reinitialized to zero on each call to makeLexer
, but
63 * instead is globally allocated
and never reset
.
65 * Revision
1.4 1996/08/13 13:50:36 george
66 * Fixed bugs
in counting
lines (from jhr
)
68 * Revision
1.3 1996/07/25 20:38:52 jhr
69 * Fixed bug
in ungetch that caused Subscript exceptions
.
71 * Revision
1.2 1996/02/26 15:02:27 george
72 * print no longer overloaded
.
73 * use
of makestring has been removed
and replaced
with Int.toString
..
74 * use
of IO replaced
with TextIO
76 * Revision
1.1.1.1 1996/01/31 16:01:15 george
81 (* Subject
: lookahead
in sml
-lex
82 Reply
-to
: david
.tarditi@CS
.CMU
.EDU
83 Date
: Mon
, 21 Oct
91 14:13:26 -0400
85 There is a serious bug
in the implementation
of lookahead
,
86 as done
in sml
-lex
, and described
in Aho
, Sethi
, and Ullman
,
87 p
. 134 "Implementing the Lookahead Operator"
89 We have disallowed the use
of lookahead for now because
92 As a counter
-example to the implementation described
in
93 ASU
, consider the following specification
with the
94 input
string "aba" (this example is taken from
95 a comp
.compilers message from Dec
. 1989, I think
):
99 fun error x
= TextIO.output(TextIO.stdErr
, x ^
"\n")
100 val eof
= fn () => ()
104 (a|ab
)/ba
=> (print yytext
; print
"\n"; ());
106 The ASU proposal works
as follows
. Suppose that we are
107 using NFA
's to represent our regular expressions
. Then to
108 build an NFA for e1
/ e2
, we build an NFA n1 for e1
109 and an NFA n2 for e2
, and add an epsilon transition
112 When lexing
, when we encounter the
end state
of e1e2
,
113 we take
as the
end of the
string the position
in
114 the
string that was the last occurrence
of the state
of
115 the NFA having a transition on the epsilon introduced
118 Using the example we have above
, we
'll have an NFA
119 with the following states
:
122 1 -- a
--> 2 -- b
--> 3
126 |
------------> 4 -- b
--> 5 -- a
--> 6
128 On our example
, we get the following list
of transitions
:
130 a
: 2, 4 (make an epsilon transition from
2 to
4)
131 ab
: 3, 4, 5 (make an epsilon transition from
3 to
4)
134 If we chose the last state
in which we made an epsilon transition
,
135 we
'll chose the transition from
3 to
4, and end up
with "ab"
136 as our token
, when we should have
"a" as our token
.
140 functor RedBlack(B
: sig type key
141 val > : key
*key
->bool
146 val insert
: key
* tree
-> tree
147 val lookup
: key
* tree
-> key
148 exception notfound
of key
152 datatype color
= RED | BLACK
153 datatype tree
= empty | tree
of key
* color
* tree
* tree
154 exception notfound
of key
157 let fun f empty
= tree(key
,RED
,empty
,empty
)
158 |
f (tree(k
,BLACK
,l
,r
)) =
161 of r
as tree(rk
,RED
, rl
as tree(rlk
,RED
,rll
,rlr
),rr
) =>
163 of tree(lk
,RED
,ll
,lr
) =>
164 tree(k
,RED
,tree(lk
,BLACK
,ll
,lr
),
165 tree(rk
,BLACK
,rl
,rr
))
166 | _
=> tree(rlk
,BLACK
,tree(k
,RED
,l
,rll
),
167 tree(rk
,RED
,rlr
,rr
)))
168 | r
as tree(rk
,RED
,rl
, rr
as tree(rrk
,RED
,rrl
,rrr
)) =>
170 of tree(lk
,RED
,ll
,lr
) =>
171 tree(k
,RED
,tree(lk
,BLACK
,ll
,lr
),
172 tree(rk
,BLACK
,rl
,rr
))
173 | _
=> tree(rk
,BLACK
,tree(k
,RED
,l
,rl
),rr
))
174 | r
=> tree(k
,BLACK
,l
,r
)
177 of l
as tree(lk
,RED
,ll
, lr
as tree(lrk
,RED
,lrl
,lrr
)) =>
179 of tree(rk
,RED
,rl
,rr
) =>
180 tree(k
,RED
,tree(lk
,BLACK
,ll
,lr
),
181 tree(rk
,BLACK
,rl
,rr
))
182 | _
=> tree(lrk
,BLACK
,tree(lk
,RED
,ll
,lrl
),
184 | l
as tree(lk
,RED
, ll
as tree(llk
,RED
,lll
,llr
), lr
) =>
186 of tree(rk
,RED
,rl
,rr
) =>
187 tree(k
,RED
,tree(lk
,BLACK
,ll
,lr
),
188 tree(rk
,BLACK
,rl
,rr
))
189 | _
=> tree(lk
,BLACK
,ll
,tree(k
,RED
,lr
,r
)))
190 | l
=> tree(k
,BLACK
,l
,r
)
191 else tree(key
,BLACK
,l
,r
)
192 |
f (tree(k
,RED
,l
,r
)) =
193 if key
>k
then tree(k
,RED
,l
, f r
)
194 else if k
>key
then tree(k
,RED
, f l
, r
)
195 else tree(key
,RED
,l
,r
)
197 of tree(k
,RED
, l
as tree(_
,RED
,_
,_
), r
) => tree(k
,BLACK
,l
,r
)
198 |
tree(k
,RED
, l
, r
as tree(_
,RED
,_
,_
)) => tree(k
,BLACK
,l
,r
)
204 let fun look empty
= raise (notfound key
)
205 |
look (tree(k
,_
,l
,r
)) =
207 else if key
>k
then look r
216 val lexGen
: string -> unit
219 structure LexGen
: LEXGEN
=
224 datatype token
= CHARS
of bool array | QMARK | STAR | PLUS | BAR
225 | LP | RP | CARAT | DOLLAR | SLASH | STATE
of string list
226 | REPS
of int * int | ID
of string | ACTION
of string
227 | BOF | EOF | ASSIGN | SEMI | ARROW | LEXMARK | LEXSTATES |
228 COUNT | REJECT | FULLCHARSET | STRUCT | HEADER | ARG
230 datatype exp
= EPS | CLASS
of bool array
* int | CLOSURE
of exp
231 | ALT
of exp
* exp | CAT
of exp
* exp | TRAIL
of int
234 (* flags describing input Lex spec
. - unnecessary code is omitted
*)
237 val CharFormat
= ref
false;
238 val UsesTrailingContext
= ref
false;
239 val UsesPrevNewLine
= ref
false;
241 (* flags for various bells
& whistles that Lex has
. These slow the
242 lexer down
and should be omitted from production
lexers (if you
243 really want speed
) *)
245 val CountNewLines
= ref
false;
246 val HaveReject
= ref
false;
248 (* Can increase size
of character set
*)
250 val CharSetSize
= ref
129;
252 (* Can name
structure or declare header code
*)
254 val StrName
= ref
"Mlex"
255 val HeaderCode
= ref
""
256 val HeaderDecl
= ref
false
257 val ArgCode
= ref (NONE
: string option
)
258 val StrDecl
= ref
false
260 val ResetFlags
= fn () => (CountNewLines
:= false; HaveReject
:= false;
261 UsesTrailingContext
:= false;
262 CharSetSize
:= 129; StrName
:= "Mlex";
263 HeaderCode
:= ""; HeaderDecl
:= false;
267 val LexOut
= ref(TextIO.stdOut
)
268 fun say x
= TextIO.output(!LexOut
, x
)
270 (* Union
: merge two sorted lists
of integers
*)
272 fun union(a
,b
) = let val rec merge
= fn
274 |
(nil
,el
::more
,z
) => merge(nil
,more
,el
::z
)
275 |
(el
::more
,nil
,z
) => merge(more
,nil
,el
::z
)
276 |
(x
::morex
,y
::morey
,z
) => if (x
:int)=(y
:int)
277 then merge(morex
,morey
,x
::z
)
278 else if x
>y
then merge(morex
,y
::morey
,x
::z
)
279 else merge(x
::morex
,morey
,y
::z
)
280 in merge(rev a
,rev b
,nil
)
283 (* Nullable
: compute
if a important expression parse tree node is nullable
*)
285 val rec nullable
= fn
289 |
ALT(n1
,n2
) => nullable(n1
) orelse nullable(n2
)
290 |
CAT(n1
,n2
) => nullable(n1
) andalso nullable(n2
)
294 (* FIRSTPOS
: firstpos function for parse tree expressions
*)
299 |
CLOSURE(n
) => firstpos(n
)
300 |
ALT(n1
,n2
) => union(firstpos(n1
),firstpos(n2
))
301 |
CAT(n1
,n2
) => if nullable(n1
) then union(firstpos(n1
),firstpos(n2
))
306 (* LASTPOS
: Lastpos function for parse tree expressions
*)
311 |
CLOSURE(n
) => lastpos(n
)
312 |
ALT(n1
,n2
) => union(lastpos(n1
),lastpos(n2
))
313 |
CAT(n1
,n2
) => if nullable(n2
) then union(lastpos(n1
),lastpos(n2
))
319 (* ++: Increment an integer reference
*)
321 fun ++(x
) : int = (x
:= !x
+ 1; !x
);
325 type 'a relation
= 'a
* 'a
-> bool
326 abstype ('b
,'a
) dictionary
= DATA
of {Table
: ('b
* 'a
) list
,
327 Leq
: 'b
* 'b
-> bool }
330 fun create Leqfunc
= DATA
{ Table
= nil
, Leq
= Leqfunc
}
332 fun lookup (DATA
{ Table
= entrylist
, Leq
= leq
}) key
=
333 let fun search
[] = raise LOOKUP
334 |
search((k
,item
)::entries
) =
336 then if leq(k
,key
) then item
else raise LOOKUP
341 fun enter (DATA
{ Table
= entrylist
, Leq
= leq
})
342 (newentry
as (key
: 'b
,item
:'a
)) : ('b
,'a
) dictionary
=
343 let val gt
= fn a
=> fn b
=> not (leq(a
,b
))
344 val eq
= fn k
=> fn k
' => (leq(k
,k
')) andalso (leq(k
',k
))
345 fun update nil
= [ newentry
]
346 |
update ((entry
as (k
,_
))::entries
) =
347 if (eq key k
) then newentry
::entries
348 else if gt k key
then newentry
::(entry
::entries
)
349 else entry
::(update entries
)
350 in DATA
{ Table
= update entrylist
, Leq
= leq
}
353 fun listofdict (DATA
{ Table
= entrylist
,Leq
= leq
}) =
354 let fun f (nil
,r
) = rev r
355 |
f (a
::b
,r
) = f (b
,a
::r
)
359 end (* structure Dict
*)
363 (* INPUT
.ML
: Input w
/ one character push back capability
*)
368 BUF
of TextIO.instream
* {b
: string ref
, p
: int ref
}
370 fun make_ibuf(s
) = BUF (s
, {b
=ref
"", p
= ref
0})
371 fun close_ibuf (BUF (s
,_
)) = TextIO.closeIn(s
)
373 fun getch (a
as (BUF(s
,{b
,p
}))) =
374 if (!p
= (size (!b
)))
375 then (b
:= TextIO.inputN(s
, 1024);
380 else (let val ch
= String.sub(!b
,!p
)
382 then LineNum
:= !LineNum
+ 1
388 fun ungetch(BUF(s
,{b
,p
})) = (
390 if String.sub(!b
,!p
) = #
"\n"
391 then LineNum
:= !LineNum
- 1
398 TextIO.output (TextIO.stdErr
, String.concat
[
399 "ml-lex: error, line ", (Int.toString (!LineNum
)), ": ", x
, "\n"
403 TextIO.output (TextIO.stdErr
, String.concat
[
404 "ml-lex: syntax error, line ", (Int.toString (!LineNum
)), ": ", x
, "\n"
408 exception SyntaxError
; (* error
in user
's input file
*)
410 exception LexError
; (* unexpected error
in lexer
*)
412 val LexBuf
= ref(make_ibuf(TextIO.stdIn
));
413 val LexState
= ref
0;
414 val NextTok
= ref BOF
;
415 val inquote
= ref
false;
417 fun AdvanceTok () : unit
= let
419 ((c
>= #
"a") andalso (c
<= #
"z")) orelse
420 ((c
>= #
"A") andalso (c
<= #
"Z"))
421 fun isDigit c
= (c
>= #
"0") andalso (c
<= #
"9")
422 (* check for
valid (non
-leading
) identifier
character (added by JHR
) *)
424 ((isLetter c
) orelse (isDigit c
) orelse (c
= #
"_") orelse (c
= #
"'"))
426 fun num (c
::r
, n
) = if isDigit c
427 then num (r
, 10*n
+ (Char.ord c
- Char.ord #
"0"))
434 fun skipws () = (case nextch()
441 and nextch () = getch(!LexBuf
)
443 and escaped () = (case nextch()
449 fun err t
= prErr("illegal ascii escape '"^
(implode(rev t
))^
"'")
450 fun cvt c
= (Char.ord c
- Char.ord #
"0")
451 fun f (n
, c
, t
) = if c
=3
452 then if n
>= (!CharSetSize
)
455 else let val ch
=nextch()
458 then f(n
*10+(cvt ch
), c
+1, ch
::t
)
462 if isDigit x
then f(cvt x
, 1, [x
]) else x
467 let val c
= array(!CharSetSize
, false)
469 update(c
, Char.ord(x
), true);
473 in case !LexState
of 0 => let val makeTok
= fn () =>
475 (* Lex
% operators
*)
476 of #
"%" => (case nextch() of
480 in if isLetter a
then f(a
::s
)
481 else (ungetch(!LexBuf
);
485 in if command
= "reject" then REJECT
486 else if command
= "count" then COUNT
487 else if command
= "full" then FULLCHARSET
488 else if command
= "s" then LEXSTATES
489 else if command
= "S" then LEXSTATES
490 else if command
= "structure" then STRUCT
491 else if command
= "header" then HEADER
492 else if command
= "arg" then ARG
493 else prErr
"unknown % operator "
496 (* semicolon (for
end of LEXSTATES
) *)
499 | ch
=> if isLetter(ch
) then
500 let fun getID matched
=
503 in if isLetter(x
) orelse isDigit(x
) orelse
504 x
= "_" orelse x
= "'"
507 then getID (x
::matched
)
508 else (ungetch(!LexBuf
); implode(rev matched
))
512 else (prSynErr ("bad character: " ^
String.str ch
))
513 in NextTok
:= makeTok()
515 |
1 => let val rec makeTok
= fn () =>
516 if !inquote
then case nextch() of
517 (* inside quoted
string *)
518 #
"\\" => onechar(escaped())
519 | #
"\"" => (inquote
:= false; makeTok())
521 else case skipws() of
522 (* single character operators
*)
533 | #
"." => let val c
= array(!CharSetSize
,true) in
534 update(c
,10,false); CHARS(c
)
536 (* assign
and arrow
*)
537 | #
"=" => let val c
= nextch() in
538 if c
= #
">" then ARROW
else (ungetch(!LexBuf
); ASSIGN
)
541 | #
"[" => let val rec classch
= fn () => let val x
= skipws()
542 in if x
= #
"\\" then escaped() else x
544 val first
= classch();
545 val flag
= (first
<> #
"^");
546 val c
= array(!CharSetSize
,not flag
);
548 |
add (SOME x
) = update(c
, Char.ord(x
), flag
)
549 and range (x
, y
) = if x
>y
550 then (prErr
"bad char. range")
552 val i
= ref(Char.ord(x
)) and j
= Char.ord(y
)
554 add (SOME(Char.chr(!i
)));
557 and getClass last
= (case classch()
558 of #
"]" => (add(last
); c
)
560 of NONE
=> getClass(SOME #
"-")
561 |
(SOME last
') => let val x
= classch()
564 then (add(last
); add(SOME #
"-"); c
)
565 else (range(last
',x
); getClass(NONE
))
568 | x
=> (add(last
); getClass(SOME x
))
570 in CHARS(getClass(if first
= #
"^" then NONE
else SOME first
))
572 (* Start States specification
*)
573 | #
"<" => let val rec get_state
= fn (prev
,matched
) =>
575 #
">" => matched
::prev
576 | #
"," => get_state(matched
::prev
,"")
577 | x
=> if isIdentChr(x
)
578 then get_state(prev
,matched ^
String.str x
)
579 else (prSynErr
"bad start state list")
580 in STATE(get_state(nil
,""))
582 (* {id
} or repititions
*)
583 | #
"{" => let val ch
= nextch() in if isLetter(ch
) then
584 let fun getID matched
= (case nextch()
586 | x
=> if (isIdentChr x
) then
587 getID(matched ^
String.str x
)
588 else (prErr
"invalid char. class name")
590 in ID(getID(String.str ch
))
592 else if isDigit(ch
) then
593 let fun get_r (matched
, r1
) = (case nextch()
594 of #
"}" => let val n
= atoi(matched
) in
595 if r1
= ~
1 then (n
,n
) else (r1
,n
)
597 | #
"," => if r1
= ~
1 then get_r("",atoi(matched
))
598 else (prErr
"invalid repetitions spec.")
600 then get_r(matched ^
String.str x
,r1
)
601 else (prErr
"invalid char in repetitions spec")
603 in REPS(get_r(String.str ch
,~
1))
605 else (prErr
"bad repetitions spec")
607 (* Lex
% operators
*)
608 | #
"%" => if nextch() = #
"%" then LEXMARK
else
609 (ungetch(!LexBuf
); onechar (#
"%"))
610 (* backslash escape
*)
611 | #
"\\" => onechar(escaped())
612 (* start quoted
string *)
613 | #
"\"" => (inquote
:= true; makeTok())
616 in NextTok
:= makeTok()
621 fun GetAct (lpct
,x
) = (case getch(!LexBuf
)
622 of #
"(" => GetAct (lpct
+1, #
"("::x
)
623 | #
")" => if lpct
= 0 then (implode (rev x
))
624 else GetAct(lpct
-1, #
")"::x
)
625 | y
=> GetAct(lpct
,y
::x
)
627 in ACTION (GetAct (0,nil
))
630 | c
=> (prSynErr ("invalid character " ^
String.str c
)))
631 | _
=> raise LexError
633 handle eof
=> NextTok
:= EOF
;
635 fun GetTok (_
:unit
) : token
=
636 let val t
= !NextTok
in AdvanceTok(); t
638 val SymTab
= ref (create
String.<=) : (string,exp
) dictionary ref
640 fun GetExp () : exp
=
642 let val rec optional
= fn e
=> ALT(EPS
,e
)
644 and lookup
' = fn name
=>
646 handle LOOKUP
=> prErr ("bad regular expression name: "^
649 and newline
= fn () => let val c
= array(!CharSetSize
,false) in
653 and endline
= fn e
=> trail(e
,CLASS(newline(),0))
655 and trail
= fn (e1
,e2
) => CAT(CAT(e1
,TRAIL(0)),e2
)
657 and closure1
= fn e
=> CAT(e
,CLOSURE(e
))
659 and repeat
= fn (min
,max
,e
) => let val rec rep
= fn
661 |
(0,1) => ALT(e
,EPS
)
662 |
(0,i
) => CAT(rep(0,1),rep(0,i
-1))
663 |
(i
,j
) => CAT(e
,rep(i
-1,j
-1))
667 and exp0
= fn () => case GetTok() of
668 CHARS(c
) => exp1(CLASS(c
,0))
669 | LP
=> let val e
= exp0() in
670 if !NextTok
= RP
then
671 (AdvanceTok(); exp1(e
))
672 else (prSynErr
"missing '('") end
673 |
ID(name
) => exp1(lookup
' name
)
674 | _
=> raise SyntaxError
676 and exp1
= fn (e
) => case !NextTok
of
680 | LP
=> exp2(e
,exp0())
682 | t
=> (AdvanceTok(); case t
of
683 QMARK
=> exp1(optional(e
))
684 | STAR
=> exp1(CLOSURE(e
))
685 | PLUS
=> exp1(closure1(e
))
686 |
CHARS(c
) => exp2(e
,CLASS(c
,0))
687 | BAR
=> ALT(e
,exp0())
688 | DOLLAR
=> (UsesTrailingContext
:= true; endline(e
))
689 | SLASH
=> (UsesTrailingContext
:= true;
691 |
REPS(i
,j
) => exp1(repeat(i
,j
,e
))
692 |
ID(name
) => exp2(e
,lookup
' name
)
693 | _
=> raise SyntaxError
)
695 and exp2
= fn (e1
,e2
) => case !NextTok
of
697 | ARROW
=> CAT(e1
,e2
)
699 | LP
=> exp2(CAT(e1
,e2
),exp0())
701 | t
=> (AdvanceTok(); case t
of
702 QMARK
=> exp1(CAT(e1
,optional(e2
)))
703 | STAR
=> exp1(CAT(e1
,CLOSURE(e2
)))
704 | PLUS
=> exp1(CAT(e1
,closure1(e2
)))
705 |
CHARS(c
) => exp2(CAT(e1
,e2
),CLASS(c
,0))
706 | BAR
=> ALT(CAT(e1
,e2
),exp0())
707 | DOLLAR
=> (UsesTrailingContext
:= true;
709 | SLASH
=> (UsesTrailingContext
:= true;
710 trail(CAT(e1
,e2
),exp0()))
711 |
REPS(i
,j
) => exp1(CAT(e1
,repeat(i
,j
,e2
)))
712 |
ID(name
) => exp2(CAT(e1
,e2
),lookup
' name
)
713 | _
=> raise SyntaxError
)
716 val StateTab
= ref(create(String.<=)) : (string,int) dictionary ref
718 val StateNum
= ref
0;
720 fun GetStates () : int list
=
722 let fun add nil sl
= sl
723 |
add (x
::y
) sl
= add
y (union ([lookup (!StateTab
)(x
)
725 prErr ("bad state name: "^x
)
729 if i
<= !StateNum
then addall (i
+2) (union ([i
],sl
))
732 fun incall (x
::y
) = (x
+1)::incall y
735 fun addincs nil
= nil
736 |
addincs (x
::y
) = x
::(x
+1)::addincs y
740 STATE s
=> (AdvanceTok(); LexState
:= 1; add s nil
)
744 of CARAT
=> (LexState
:= 1; AdvanceTok(); UsesPrevNewLine
:= true;
746 | _
=> addincs state_list
749 val LeafNum
= ref ~
1;
751 fun renum(e
: exp
) : exp
=
752 let val rec label
= fn
754 |
CLASS(x
,_
) => CLASS(x
,++LeafNum
)
755 |
CLOSURE(e
) => CLOSURE(label(e
))
756 |
ALT(e1
,e2
) => ALT(label(e1
),label(e2
))
757 |
CAT(e1
,e2
) => CAT(label(e1
),label(e2
))
758 |
TRAIL(i
) => TRAIL(++LeafNum
)
759 |
END(i
) => END(++LeafNum
)
763 exception ParseError
;
765 fun parse() : (string * (int list
* exp
) list
* ((string,string) dictionary
)) =
766 let val Accept
= ref (create
String.<=) : (string,string) dictionary ref
767 val rec ParseRtns
= fn l
=> case getch(!LexBuf
) of
768 #
"%" => let val c
= getch(!LexBuf
) in
769 if c
= #
"%" then (implode (rev l
))
770 else ParseRtns(c
:: #
"%" :: l
)
772 | c
=> ParseRtns(c
::l
)
773 and ParseDefs
= fn () =>
774 (LexState
:=0; AdvanceTok(); case !NextTok
of
777 let fun f () = (case !NextTok
of (ID i
) =>
778 (StateTab
:= enter(!StateTab
)(i
,++StateNum
);
779 ++StateNum
; AdvanceTok(); f())
781 in AdvanceTok(); f ();
782 if !NextTok
=SEMI
then ParseDefs() else
783 (prSynErr
"expected ';'")
785 | ID x
=> (LexState
:=1; AdvanceTok(); if GetTok() = ASSIGN
786 then (SymTab
:= enter(!SymTab
)(x
,GetExp());
787 if !NextTok
= SEMI
then ParseDefs()
788 else (prSynErr
"expected ';'"))
789 else raise SyntaxError
)
790 | REJECT
=> (HaveReject
:= true; ParseDefs())
791 | COUNT
=> (CountNewLines
:= true; ParseDefs())
792 | FULLCHARSET
=> (CharSetSize
:= 256; ParseDefs())
793 | HEADER
=> (LexState
:= 2; AdvanceTok();
797 (prErr
"cannot have both %s and %header \
799 else if (!HeaderDecl
) then
800 (prErr
"duplicate %header declarations")
802 (HeaderCode
:= s
; LexState
:= 0;
803 HeaderDecl
:= true; ParseDefs())
804 | _
=> raise SyntaxError
)
805 | ARG
=> (LexState
:= 2; AdvanceTok();
809 of SOME _
=> prErr
"duplicate %arg declarations"
810 | NONE
=> ArgCode
:= SOME s
;
813 | _
=> raise SyntaxError
)
814 | STRUCT
=> (AdvanceTok();
817 if (!HeaderDecl
) then
818 (prErr
"cannot have both %s and %header \
820 else if (!StrDecl
) then
821 (prErr
"duplicate %s declarations")
823 | _
=> (prErr
"expected ID");
825 | _
=> raise SyntaxError
)
827 fn rules
=> (LexState
:=1; AdvanceTok(); case !NextTok
of
831 let val s
= GetStates()
832 val e
= renum(CAT(GetExp(),END(0)))
834 if !NextTok
= ARROW
then
835 (LexState
:=2; AdvanceTok();
836 case GetTok() of ACTION(act
) =>
837 if !NextTok
=SEMI
then
838 (Accept
:=enter(!Accept
) (Int.toString (!LeafNum
),act
);
839 ParseRules((s
,e
)::rules
))
840 else (prSynErr
"expected ';'")
841 | _
=> raise SyntaxError
)
842 else (prSynErr
"expected '=>'")
844 in let val usercode
= ParseRtns nil
845 in (ParseDefs(); (usercode
,ParseRules(nil
),!Accept
))
847 end handle SyntaxError
=> (prSynErr
"")
849 fun makebegin () : unit
=
850 let fun make nil
= ()
851 |
make ((x
,n
:int)::y
)=(say
"val "; say x
; say
" = " ;
853 say (Int.toString n
); say
";\n"; make y
)
854 in say
"\n(* start state definitions *)\n\n"; make(listofdict(!StateTab
))
860 type key
= int list
* string
861 fun > ((key
,item
:string),(key
',item
')) =
862 let fun f ((a
:int)::a
') (b
::b
') = if Int.> (a
,b
) then true
863 else if a
=b
then f a
' b
'
870 structure RB
= RedBlack(L
)
872 fun maketable (fins
:(int * (int list
)) list
,
873 tcs
:(int * (int list
)) list
,
874 tcpairs
: (int * int) list
,
875 trans
: (int*(int list
)) list
) : unit
=
877 (* Fins
= (state #
, list
of final leaves for the state
) list
878 tcs
= (state #
, list
of trailing context leaves which begin
in this state
)
880 tcpairs
= (trailing context leaf
, end leaf
) list
881 trans
= (state #
,list
of transitions for state
) list
*)
883 let datatype elem
= N
of int | T
of int | D
of int
885 val _
= (if length(trans
)<256 then CharFormat
:= true
886 else CharFormat
:= false;
887 if !UsesTrailingContext
then
888 (say
"\ndatatype yyfinstate = N of int | \
889 \ T of int | D of int\n")
890 else say
"\ndatatype yyfinstate = N of int";
891 say
"\ntype statedata = {fin : yyfinstate list, trans: ";
893 true => say
"string}"
894 |
false => say
"int Vector.vector}";
895 say
"\n(* transition & final state table *)\nval tab = let\n";
899 (say
"fun decode s k =\n";
900 say
" let val k' = k + k\n";
901 say
" val hi = Char.ord(String.sub(s, k'))\n";
902 say
" val lo = Char.ord(String.sub(s, k' + 1))\n";
903 say
" in hi * 256 + lo end\n"))
905 let fun IsEndLeaf t
=
906 let fun f ((l
,e
)::r
) = if (e
=t
) then true else f r
907 | f nil
= false in f tcpairs
end
910 let fun f ((tl
,el
)::r
) = if (tl
=t
) then el
else f r
913 fun GetTrConLeaves s
=
914 let fun f ((s
',l
)::r
) = if (s
= s
') then l
else f r
919 let fun insert (x
:int) (a
::b
) =
920 if (x
<= a
) then x
::(a
::b
)
923 in List.foldr (fn (x
,r
) => insert x r
) [] s
925 fun conv a
= if (IsEndLeaf a
) then (D a
) else (N a
)
926 fun merge (a
::a
',b
::b
') =
927 if (a
<= b
) then (conv a
)::merge(a
',b
::b
')
928 else (T b
)::(merge(a
::a
',b
'))
929 |
merge (a
::a
',nil
) = (conv a
)::(merge (a
',nil
))
930 |
merge (nil
,b
::b
') = (T b
)::(merge (b
',nil
))
931 |
merge (nil
,nil
) = nil
935 sort_leaves (map (fn x
=> GetEndLeaf x
) (GetTrConLeaves x
)))))
942 let fun emit8(x
, pos
) = let
943 val s
= StringCvt.padLeft #
"0" 3 (Int.toString x
)
946 of 16 => (say
"\\\n\\\\"; say s
; 1)
947 | _
=> (say
"\\"; say s
; pos
+1)
950 let val hi8
= x
div 256
951 val lo8
= x
- hi8
* 256 (* x rem
256 *)
953 emit8(lo8
, emit8(hi8
, pos
))
955 fun MakeString([], _
, _
) = ()
956 |
MakeString(x
::xs
, emitter
, pos
) =
957 MakeString(xs
, emitter
, emitter(x
, pos
))
958 in case !CharFormat
of
959 true => (say
" =\n\""; MakeString(x
,emit8
,0); say
"\"\n")
960 |
false => (say
" = Vector.tabulate("; say (Int.toString(length x
));
961 say
", decode\n\""; MakeString(x
,emit16
,0); say
"\")\n")
963 fun makeEntry(nil
,rs
,t
) = rev rs
964 |
makeEntry(((l
:int,x
)::y
),rs
,t
) =
965 let val name
= "s" ^
(Int.toString l
)
966 in let val (r
,n
) = lookup ((x
,name
),t
)
967 in makeEntry(y
,(n
::rs
),t
)
968 end handle notfound _
=> (count
:= !count
+1;
969 say
"val "; say name
; makeItems x
;
970 makeEntry(y
,(name
::rs
),(insert ((x
,name
),t
))))
972 in (makeEntry(trans
,nil
,empty
))
975 fun makeTable(nil
,nil
) = ()
976 |
makeTable(a
::a
',b
::b
') =
977 let fun makeItems nil
= ()
978 |
makeItems (hd
::tl
) =
984 in (say t
; say (Int.toString n
); say
")";
987 else (say
","; makeItems tl
))
989 in (say
"{fin = ["; makeItems b
;
990 say
"], trans = "; say a
; say
"}";
993 else (say
",\n"; makeTable(a
',b
')))
996 fun msg x
= () (*TextIO.output(TextIO.stdOut
, x
)*)
998 in (say
"in Vector.fromList\n["; makeTable(rs
,newfins
); say
"]\nend\n";
999 msg ("\nNumber of states = " ^
(Int.toString (length trans
)));
1000 msg ("\nNumber of distinct rows = " ^
(Int.toString (!count
)));
1001 msg ("\nApprox. memory size of trans. table = " ^
1002 (Int.toString (!count
*(!CharSetSize
)*(if !CharFormat
then 1 else 8))));
1006 (* makeaccept
: Takes
a (string,string) dictionary
, prints
case statement for
1007 accepting leaf actions
. The key strings are the leaf #
's
, the data strings
1010 fun makeaccept ends
=
1011 let fun startline f
= if f
then say
" " else say
"| "
1012 fun make(nil
,f
) = (startline f
; say
"_ => raise Internal.LexerError\n")
1013 |
make((x
,a
)::y
,f
) = (startline f
; say x
; say
" => (";
1014 say a
; say
")\n"; make(y
,false))
1015 in make (listofdict(ends
),true)
1018 fun leafdata(e
:(int list
* exp
) list
) =
1019 let val fp
= array(!LeafNum
+ 1,nil
)
1020 and leaf
= array(!LeafNum
+ 1,EPS
)
1021 and tcpairs
= ref nil
1022 and trailmark
= ref ~
1;
1025 |
(hd
::tl
,x
) => (update(fp
,hd
,union(fp sub hd
,x
));
1029 (moredata(e1
); add(lastpos(e1
),firstpos(e1
)))
1030 |
ALT(e1
,e2
) => (moredata(e1
); moredata(e2
))
1031 |
CAT(e1
,e2
) => (moredata(e1
); moredata(e2
);
1032 add(lastpos(e1
),firstpos(e2
)))
1033 |
CLASS(x
,i
) => update(leaf
,i
,CLASS(x
,i
))
1034 |
TRAIL(i
) => (update(leaf
,i
,TRAIL(i
)); if !trailmark
= ~
1
1035 then trailmark
:= i
else ())
1036 |
END(i
) => (update(leaf
,i
,END(i
)); if !trailmark
<> ~
1
1037 then (tcpairs
:= (!trailmark
,i
)::(!tcpairs
);
1038 trailmark
:= ~
1) else ())
1042 |
(_
,x
)::tl
=> (moredata(x
);makedata(tl
))
1043 in trailmark
:= ~
1; makedata(e
); (fp
,leaf
,!tcpairs
)
1046 fun makedfa(rules
) =
1047 let val StateTab
= ref (create(String.<=)) : (string,int) dictionary ref
1048 val fintab
= ref (create(Int.<=)) : (int,(int list
)) dictionary ref
1049 val transtab
= ref (create(Int.<=)) : (int,int list
) dictionary ref
1050 val tctab
= ref (create(Int.<=)) : (int,(int list
)) dictionary ref
1051 val (fp
, leaf
, tcpairs
) = leafdata(rules
);
1053 fun visit (state
,statenum
) =
1054 let val transitions
= gettrans(state
) in
1055 fintab
:= enter(!fintab
)(statenum
,getfin(state
));
1056 tctab
:= enter(!tctab
)(statenum
,gettc(state
));
1057 transtab
:= enter(!transtab
)(statenum
,transitions
)
1060 and visitstarts (states
) =
1061 let fun vs nil i
= ()
1062 |
vs (hd
::tl
) i
= (visit (hd
,i
); vs
tl (i
+1))
1066 and hashstate(s
: int list
) =
1069 |
((x
:int)::y
,z
) => hs(y
,z ^
" " ^
(Int.toString x
))
1073 and find(s
) = lookup(!StateTab
)(hashstate(s
))
1075 and add(s
,n
) = StateTab
:= enter(!StateTab
)(hashstate(s
),n
)
1077 and getstate (state
) =
1079 handle LOOKUP
=> let val n
= ++StateNum
in
1080 add(state
,n
); visit(state
,n
); n
1084 let fun f nil fins
= fins
1087 of END _
=> f
tl (hd
::fins
)
1093 let fun f nil fins
= fins
1096 of TRAIL _
=> f
tl (hd
::fins
)
1101 and gettrans (state
) =
1102 let fun loop c tlist
=
1103 let fun cktrans nil r
= r
1104 |
cktrans (hd
::tl
) r
=
1105 case (leaf sub hd
) of
1107 (if (i sub c
) then cktrans
tl (union(r
,fp sub hd
))
1108 else cktrans tl r
handle Subscript
=>
1113 let val v
=cktrans state nil
1114 in loop (c
-1) (if v
=nil
then 0::tlist
else (getstate v
)::tlist
)
1118 in loop ((!CharSetSize
) - 1) nil
1122 let val startarray
= array(!StateNum
+ 1, nil
);
1123 fun listofarray(a
,n
) =
1124 let fun f i l
= if i
>= 0 then f (i
-1) ((a sub i
)::l
) else l
1128 |
(startlist
,e
)::tl
=> (fix(startlist
,firstpos(e
));makess(tl
))
1131 |
(s
::tl
,firsts
) => (update(startarray
,s
,
1132 union(firsts
,startarray sub s
));
1134 in makess(rules
);listofarray(startarray
, !StateNum
+ 1)
1137 in visitstarts(startstates());
1138 (listofdict(!fintab
),listofdict(!transtab
),listofdict(!tctab
),tcpairs
)
1143 \ structure UserDeclarations =\n\
1148 " | Internal
.D k
=> action (i
,(acts
::l
),k
::rs
)\n\
1149 \ | Internal
.T k
=>\n\
1150 \
let fun f (a
::b
,r
) =\n\
1152 \
then action(i
,(((Internal
.N a
)::acts
)::l
),(b@r
))\n\
1153 \
else f (b
,a
::r
)\n\
1154 \ |
f (nil
,r
) = action(i
,(acts
::l
),rs
)\n\
1159 fun lexGen(infile
) =
1160 let val outfile
= infile ^
".sml"
1161 fun PrintLexer (ends
) =
1162 let val sayln
= fn x
=> (say x
; say
"\n")
1164 of NONE
=> (sayln
"fun lex () : Internal.result =";
1165 sayln
"let fun continue() = lex() in")
1166 | SOME s
=> (say
"fun lex "; say
"(yyarg as ("; say s
; sayln
")) =";
1167 sayln
"let fun continue() : Internal.result = ");
1168 say
" let fun scan (s,AcceptingLeaves : Internal.yyfinstate";
1169 sayln
" list list,l,i0) =";
1170 if !UsesTrailingContext
1171 then say
"\tlet fun action (i,nil,rs)"
1172 else say
"\tlet fun action (i,nil)";
1173 sayln
" = raise LexError";
1174 if !UsesTrailingContext
1175 then sayln
"\t| action (i,nil::l,rs) = action(i-1,l,rs)"
1176 else sayln
"\t| action (i,nil::l) = action (i-1,l)";
1177 if !UsesTrailingContext
1178 then sayln
"\t| action (i,(node::acts)::l,rs) ="
1179 else sayln
"\t| action (i,(node::acts)::l) =";
1180 sayln
"\t\tcase node of";
1181 sayln
"\t\t Internal.N yyk => ";
1182 sayln
"\t\t\t(let val yytext = substring(!yyb,i0,i-i0)\n\
1183 \\t\t\t val yypos = i0+ !yygone";
1185 then (sayln
"\t\t\tval _ = yylineno := CharVector.foldl";
1186 sayln
"\t\t\t\t(fn (#\"\\n\", n) => n+1 | (_, n) => n) (!yylineno) yytext")
1189 then (say
"\t\t\tfun REJECT() = action(i,acts::l";
1190 if !UsesTrailingContext
1191 then sayln
",rs)" else sayln
")")
1193 sayln
"\t\t\topen UserDeclarations Internal.StartStates";
1194 sayln
" in (yybufpos := i; case yyk of ";
1196 sayln
"\t\t\t(* Application actions *)\n";
1200 if (!UsesTrailingContext
) then say skel_mid2
else ();
1201 sayln
"\tval {fin,trans} = Vector.sub(Internal.tab, s)";
1202 sayln
"\tval NewAcceptingLeaves = fin::AcceptingLeaves";
1203 sayln
"\tin if l = !yybl then";
1204 sayln
"\t if trans = #trans(Vector.sub(Internal.tab,0))";
1205 sayln
"\t then action(l,NewAcceptingLeaves";
1206 if !UsesTrailingContext
then say
",nil" else ();
1208 sayln
"\t let val newchars= if !yydone then \"\" else yyinput 1024";
1209 sayln
"\t in if (size newchars)=0";
1210 sayln
"\t\t then (yydone := true;";
1211 say
"\t\t if (l=i0) then UserDeclarations.eof ";
1212 sayln (case !ArgCode
of NONE
=> "()" | SOME _
=> "yyarg");
1213 say
"\t\t else action(l,NewAcceptingLeaves";
1214 if !UsesTrailingContext
then
1215 sayln
",nil))" else sayln
"))";
1216 sayln
"\t\t else (if i0=l then yyb := newchars";
1217 sayln
"\t\t else yyb := substring(!yyb,i0,l-i0)^newchars;";
1218 sayln
"\t\t yygone := !yygone+i0;";
1219 sayln
"\t\t yybl := size (!yyb);";
1220 sayln
"\t\t scan (s,AcceptingLeaves,l-i0,0))";
1222 sayln
"\t else let val NewChar = Char.ord(String.sub(!yyb,l))";
1223 say
"\t\tval NewState = ";
1224 case (!CharFormat
,!CharSetSize
)
1225 of (true,129) => sayln
"if NewChar<128 then Char.ord(String.sub(trans,NewChar)) else Char.ord(String.sub(trans,128))"
1226 |
(true,256) => sayln
"Char.ord(String.sub(trans,NewChar))"
1227 |
(false,129) => sayln
"if NewChar<128 then Vector.sub(trans, NewChar) else Vector.sub(trans, 128)"
1228 |
(false,256) => sayln
"Vector.sub(trans, NewChar)";
1229 say
"\t\tin if NewState=0 then action(l,NewAcceptingLeaves";
1230 if !UsesTrailingContext
then sayln
",nil)" else sayln
")";
1231 sayln
"\t\telse scan(NewState,NewAcceptingLeaves,l+1,i0)";
1234 if !UsesPrevNewLine
then () else sayln
"(*";
1235 sayln
"\tval start= if substring(!yyb,!yybufpos-1,1)=\"\\n\"";
1236 sayln
"then !yybegin+1 else !yybegin";
1237 if !UsesPrevNewLine
then () else sayln
"*)";
1239 if !UsesPrevNewLine
then say
"start"
1240 else say
"!yybegin (* start *)";
1241 sayln
",nil,!yybufpos,!yybufpos)";
1243 sayln (case !ArgCode
of NONE
=> "end" | SOME _
=> "in continue end");
1249 in (UsesPrevNewLine
:= false;
1251 LexBuf
:= make_ibuf(TextIO.openIn infile
);
1254 LexOut
:= TextIO.openOut(outfile
);
1257 StateTab
:= enter(create(String.<=))("INITIAL",1);
1260 val (user_code
,rules
,ends
) =
1262 (close_ibuf(!LexBuf
);
1263 TextIO.closeOut(!LexOut
);
1265 val (fins
,trans
,tctab
,tcpairs
) = makedfa(rules
)
1266 val _
= if !UsesTrailingContext
then
1267 (close_ibuf(!LexBuf
);
1268 TextIO.closeOut(!LexOut
);
1269 prErr
"lookahead is unimplemented")
1273 then say (!HeaderCode
)
1274 else say ("structure " ^
(!StrName
));
1278 say
"end (* end of user routines *)\n";
1279 say
"exception LexError (* raised if illegal leaf ";
1280 say
"action tried *)\n";
1281 say
"structure Internal =\n\tstruct\n";
1282 maketable(fins
,tctab
,tcpairs
,trans
);
1283 say
"structure StartStates =\n\tstruct\n";
1284 say
"\tdatatype yystartstate = STARTSTATE of int\n";
1287 say
"type result = UserDeclarations.lexresult\n";
1288 say
"\texception LexerError (* raised if illegal leaf ";
1289 say
"action tried *)\n";
1291 say
"fun makeLexer yyinput = \n";
1293 if !CountNewLines
then say
"\tval yylineno = ref 0\n\n" else ();
1294 say
"\tval yyb = ref \"\\n\" \t\t(* buffer *)\n\
1295 \\tval yybl = ref 1\t\t(*buffer length *)\n\
1296 \\tval yybufpos = ref 1\t\t(* location of next character to use *)\n\
1297 \\tval yygone = ref 1\t\t(* position in file of beginning of buffer *)\n\
1298 \\tval yydone = ref false\t\t(* eof found yet? *)\n\
1299 \\tval yybegin = ref 1\t\t(*Current 'start state' for lexer *)\n\
1300 \\n\tval YYBEGIN = fn (Internal.StartStates.STARTSTATE x) =>\n\
1301 \\t\t yybegin := x\n\n";
1303 close_ibuf(!LexBuf
);
1304 TextIO.closeOut(!LexOut
)
1309 structure Main
: BMARK
=
1311 val s
= OS
.FileSys
.getDir()
1312 fun doit () = LexGen
.lexGen (s^
"/DATA/ml.lex");
1324 fun testit _
= LexGen
.lexGen (s^
"DATA/ml.lex")