Import Upstream version 20180207
[hcoop/debian/mlton.git] / lib / mlton / basic / regexp.sml
CommitLineData
7f918cf1
CE
1(* Copyright (C) 2009 Matthew Fluet.
2 * Copyright (C) 1999-2006 Henry Cejtin, Matthew Fluet, Suresh
3 * Jagannathan, and Stephen Weeks.
4 *
5 * MLton is released under a BSD-style license.
6 * See the file MLton-LICENSE for details.
7 *)
8
9(* Many of the algorithms in this module are based on
10 * Compilers: Principles, Techniques, and Tools by Aho, Sethi, and Ullman,
11 * which I will refer to in comments as the Dragon Book.
12 *)
13local
14 fun ++ (r: int ref): int =
15 let
16 val n = 1 + !r
17 val _ = r := n
18 in n
19 end
20
21 val numChars: int = Char.maxOrd + 1
22
23 local
24 val validCharsString =
25 "\n\t@abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ()[]<>!?-&#;'/=\"$.\\"
26 in
27 val validChars =
28 Vector.tabulate (numChars, fn i =>
29 String.contains (validCharsString, Char.fromInt i))
30
31 fun edgeLabel (cs: char list): string =
32 let
33 val chars = implode cs
34 val n = String.size chars
35 val numValidChars = String.size validCharsString
36 in
37 if n = numChars
38 then "."
39 else
40 (if n >= Int.quot (numValidChars, 2)
41 then (* Character complement. *)
42 concat ["[^",
43 String.alphabetize
44 (String.keepAll
45 (validCharsString, fn c =>
46 not (String.contains
47 (chars, c)))),
48 "]"]
49 else if (1 = String.size chars
50 andalso not (String.contains
51 (". ", String.sub (chars, 0))))
52 then chars
53 else concat ["[", chars, "]"])
54 end
55 val edgeLabel =
56 Trace.trace ("Regexp.edgeLabel", List.layout Char.layout, String.layout)
57 edgeLabel
58 end
59
60 structure Save =
61 struct
62 datatype t = T of {index: int ref}
63
64 fun layout (T {index, ...}) =
65 let
66 open Layout
67 in
68 seq [str "Save ", Int.layout (!index)]
69 end
70
71 fun new () = T {index = ref ~1}
72
73 fun equals (T {index = i, ...}, T {index = i', ...}) = i = i'
74
75 fun assign (T {index, ...}, i) = index := i
76
77 fun index (T {index, ...}) = !index
78
79 val index = Trace.trace ("Regexp.Save.index", layout, Int.layout) index
80 end
81
82 structure Regexp =
83 struct
84 datatype t =
85 AnchorFinish
86 | AnchorStart
87 | CharSet of char -> bool
88 | Or of t list
89 | Seq of t list
90 | Save of t * Save.t
91 | Star of t
92
93 fun layout (r: t): Layout.t =
94 let
95 open Layout
96 in
97 case r of
98 AnchorFinish => str "AnchorFinish"
99 | AnchorStart => str "AnchorStart"
100 | CharSet f =>
101 seq [str "[",
102 str (edgeLabel (Int.foldDown
103 (0, numChars, [], fn (i, ac) =>
104 let
105 val c = Char.fromInt i
106 in
107 if f c
108 then c :: ac
109 else ac
110 end))),
111 str "]"]
112 | Or rs => seq [str "Or ", List.layout layout rs]
113 | Seq rs => seq [str "Seq ", List.layout layout rs]
114 | Save (r, s) => seq [str "Save ",
115 Layout.tuple [layout r, Save.layout s]]
116 | Star r => seq [str "Star ", layout r]
117 end
118
119 val toString = Layout.toString o layout
120 end
121
122 structure Stack:
123 sig
124 type 'a t
125
126 val clear: 'a t -> unit
127 val foreach: 'a t * ('a -> unit) -> unit
128 val new: int * 'a -> 'a t
129 val peekMap: 'a t * ('a ->'b option) -> 'b option
130 val push: 'a t * 'a -> unit
131 end =
132 struct
133 datatype 'a t = T of {elts: 'a array,
134 size: int ref}
135
136 fun new (size: int, dummy: 'a): 'a t =
137 T {elts = Array.new (size, dummy),
138 size = ref 0}
139
140 fun push (T {elts, size}, x) =
141 let
142 val n = !size
143 val _ = Array.update (elts, n, x)
144 val _ = size := n + 1
145 in ()
146 end
147
148 fun clear (T {size, ...}) = size := 0
149
150 fun foreach (T {elts, size, ...}, f) =
151 Int.for (0, !size, fn i => f (Array.sub (elts, i)))
152
153 fun peekMap (T {elts, size, ...}, f) =
154 let
155 val n = !size
156 fun loop i =
157 if i = n
158 then NONE
159 else (case f (Array.sub (elts, i)) of
160 NONE => loop (i + 1)
161 | SOME z => SOME z)
162 in
163 loop 0
164 end
165 end
166
167 (* NFA state. *)
168 (* The states in an NFA are indexed from 0 to n-1, where n is the number
169 * of states.
170 *)
171 structure State =
172 struct
173 type t = int
174
175 val layout = Int.layout
176 end
177
178 structure MatchAction =
179 struct
180 datatype t =
181 Finish of Save.t
182 | Start of Save.t
183
184 val equals =
185 fn (Finish s, Finish s') => Save.equals (s, s')
186 | (Start s, Start s') => Save.equals (s, s')
187 | _ => false
188
189 fun layout a =
190 let
191 open Layout
192 in
193 case a of
194 Finish s => seq [str "Finish ", Save.layout s]
195 | Start s => seq [str "Start ", Save.layout s]
196 end
197 end
198
199 structure Match =
200 struct
201 datatype t = T of {all: Substring.t,
202 matches: (Save.t * Substring.t) array}
203
204 fun all (T {all, ...}) = all
205
206 val startLength = #2 o Substring.base o all
207
208 val endOf = Substring.endOf o all
209
210 val length = Substring.length o all
211
212 fun layout (T {all, matches}) =
213 let open Layout
214 in tuple [Substring.layout all,
215 Array.layout (Layout.tuple2
216 (Save.layout, Substring.layout)) matches]
217 end
218
219 fun funs (T {matches, ...}) =
220 let
221 fun peek (s: Save.t): Substring.t option =
222 Option.map (Array.peek (matches, fn (s', _) =>
223 Save.equals (s, s')),
224 #2)
225 in {exists = Option.isSome o peek,
226 lookup = valOf o peek,
227 peek = peek}
228 end
229
230 fun stringFuns m =
231 let
232 val {peek, lookup, exists} = funs m
233 in
234 {exists = exists,
235 lookup = Substring.toString o lookup,
236 peek = fn s => Option.map (peek s, Substring.toString)}
237 end
238
239 local
240 fun make sel (m, s) = sel (funs m) s
241 in
242 val peek = make #peek
243 val lookup = make #lookup
244 val exists = make #exists
245 end
246
247 fun peekString (m, s) = Option.map (peek (m, s), Substring.toString)
248 val lookupString = Substring.toString o lookup
249 end
250
251 structure Actions =
252 struct
253 datatype t = T of (int * MatchAction.t vector) list
254
255 fun layout (T l) =
256 List.layout (Layout.tuple2 (Int.layout,
257 Vector.layout MatchAction.layout))
258 l
259
260 val empty = T []
261
262 fun add (a as T l, i, v: MatchAction.t vector) =
263 if Vector.isEmpty v
264 then a
265 else T ((i, v) :: l)
266 end
267
268 structure NFA =
269 struct
270 structure State = State
271 (* State i is final iff isSome (Array.sub (final, i)).
272 * Characters are grouped into equivalence classes, represented by
273 * integers in [0, numCharClasses).
274 * The equivalence class of c is Array.sub (charClass, Char.toInt c).
275 * The dimensions of next is numStates x numCharClasses.
276 * The outgoing states from state i on input char c are given by
277 * Array2.sub (next, i, Array.sub (charClass, Char.to Int c)).
278 * seen, stack1, and stack2 are used in the two stack simulation of
279 * the NFA (see fun match). We preallocate them as part of the NFA
280 * so they don't have to be allocated on each call to match.
281 *)
282 datatype t =
283 T of {anchorStarts: (State.t * MatchAction.t vector) vector,
284 charClass: int array, (* of length numChars *)
285 final: {actions: MatchAction.t vector,
286 requireFinish: bool} option array,
287 next: (State.t * MatchAction.t vector) array Array2.t,
288 saves: Save.t vector,
289 seen: bool array,
290 stack1: (State.t * Actions.t) Stack.t,
291 stack2: (State.t * Actions.t) Stack.t,
292 start: State.t}
293 end
294
295 (* Non-deterministic Finite Automaton. *)
296 structure NFA:
297 sig
298 structure State:
299 sig
300 type t = int
301
302 val layout: t -> Layout.t
303 end
304
305 datatype t = datatype NFA.t
306
307 val fromRegexp: Regexp.t -> t
308 val layoutDot: t * string (* title *) -> Layout.t
309 val match: {nfa: t,
310 short: bool,
311 string: string,
312 startPos: int} -> (int * Actions.t) option
313 val numCharClasses: t -> int
314 val numStates: t -> int
315 val saves: t -> Save.t vector
316 end =
317 struct
318 open NFA
319
320 fun numStates (T {next, ...}) = Array2.nRows next
321 fun numCharClasses (T {next, ...}) = Array2.nCols next
322 fun saves (T {saves, ...}) = saves
323
324 (* Simulating an NFA with two stacks and a bit vector, as in Algorithm
325 * 3.4 (page 126) of the Dragon Book.
326 *)
327 fun match {nfa as T {anchorStarts, charClass, final,
328 next, stack1, stack2, start, ...},
329 short,
330 string = s,
331 startPos}: (int * Actions.t) option =
332 let
333 val numStates = numStates nfa
334 val n = String.size s
335 val seen = Array.array (numStates, false)
336 fun loop (current, nextStates, i: int,
337 last: (int * Actions.t) option)
338 : (int * Actions.t) option =
339 let
340 val last =
341 case (Stack.peekMap
342 (current, fn (s, a) =>
343 case Array.sub (final, s) of
344 NONE => NONE
345 | SOME {actions, requireFinish} =>
346 if requireFinish andalso i < n
347 then NONE
348 else SOME (i, Actions.add (a, i, actions)))) of
349 NONE => last
350 | s => s
351 in
352 if numStates = 0
353 orelse i = n
354 orelse (short andalso isSome last)
355 then (Stack.clear current
356 ; last)
357 else
358 let
359 val _ = Array.modify (seen, fn _ => false)
360 val c = Array.sub (charClass,
361 Char.toInt (String.sub (s, i)))
362 val _ =
363 Stack.foreach
364 (current, fn (s, a) =>
365 Array.foreach
366 (Array2.sub (next, s, c),
367 fn (s', v) =>
368 if Array.sub (seen, s')
369 then ()
370 else (Array.update (seen, s', true)
371 ; (Stack.push
372 (nextStates,
373 (s', Actions.add (a, i, v)))))))
374 val _ = Stack.clear current
375 in loop (nextStates, current, i + 1, last)
376 end
377 end
378 val _ = Stack.push (stack1, (start, Actions.empty))
379 val _ =
380 if startPos = 0
381 then (Vector.foreach
382 (anchorStarts, fn (s, v) =>
383 Stack.push
384 (stack1,
385 (s, Actions.add (Actions.empty, startPos, v)))))
386 else ()
387 in
388 loop (stack1, stack2, startPos, NONE)
389 end
390
391 (* This conversion from a regular expression to an NFA is based on
392 * Section 3.9 (pages 134 -- 140) of the Dragon Book.
393 *
394 * It creates one NFA state for each CharSet (called a "position") that
395 * is in the regexp. There is also one extra state for the start state.
396 * It adds edges as in rules 1 and 2 (page 138) for the followpos
397 * function.
398 *)
399 fun fromRegexp (r: Regexp.t): t =
400 let
401 fun loop (r, ac as (saves, numPos)) =
402 let
403 open Regexp
404 in
405 case r of
406 AnchorFinish => (saves, numPos + 1)
407 | AnchorStart => (saves, numPos + 1)
408 | CharSet _ => (saves, numPos + 1)
409 | Or rs => List.fold (rs, ac, loop)
410 | Save (r, s) => loop (r, (s :: saves, numPos))
411 | Seq rs => List.fold (rs, ac, loop)
412 | Star r => loop (r, ac)
413 end
414 val (saves, numPos) = loop (r, ([], 0))
415 val saves = Vector.fromList saves
416 val numStates = numPos + 1
417 val start = numPos
418 val posCounter = ref ~1
419 val follow: MatchAction.t vector option Array2.t =
420 Array2.new (numStates, numStates, NONE)
421 val posChars = Array2.tabulate (numPos, numChars, fn _ => false)
422 local
423 (* Sets of positions represented as vectors of length numPos.
424 *)
425 datatype t = T of MatchAction.t vector option vector
426 in
427 type set = t
428 fun lookup (T v, s) = Vector.sub (v, s)
429 val empty: t = T (Vector.new (numPos, NONE))
430 fun addActions (T v, a) =
431 T (Vector.map
432 (v, fn opt =>
433 Option.map (opt, fn a' => Vector.concat [a, a'])))
434 fun addAction (s, a) = addActions (s, Vector.new1 a)
435 fun union (T v, T v'): t =
436 T (Vector.tabulate
437 (numPos, fn i =>
438 case (Vector.sub (v, i), Vector.sub (v', i)) of
439 (NONE, a) => a
440 | (a, NONE) => a
441 | _ => Error.bug "Regexp.NFA.fromRegexp.union"))
442 fun singleton (i: int): t =
443 T (Vector.tabulate (numPos, fn j =>
444 if i = j
445 then SOME (Vector.new0 ())
446 else NONE))
447 fun foreach (T v, f) =
448 Vector.foreachi (v, fn (i, opt) =>
449 case opt of
450 NONE => ()
451 | SOME a => f (i, a))
452 end
453 fun connect (v, v') =
454 foreach
455 (v, fn (s, a) =>
456 foreach
457 (v', fn (s', a') =>
458 Array2.update (follow, s, s',
459 SOME (Vector.concat [a, a']))))
460 val anchorFinishes = ref []
461 val anchorStarts = ref []
462 fun anchor r =
463 let
464 val i = ++ posCounter
465 val _ = List.push (r, i)
466 val first = singleton i
467 in
468 {first = first,
469 last = first,
470 nullable = NONE}
471 end
472 (* The following loop fills in follow and posChars.
473 * first set of positions that
474 * nullable is SOME v iff the regexp is nullable, where v is the
475 * sequence of actions to perform if the expression is null.
476 *)
477 fun loop (r: Regexp.t): {first: set,
478 last: set,
479 nullable: MatchAction.t vector option} =
480 case r of
481 Regexp.AnchorFinish => anchor anchorFinishes
482 | Regexp.AnchorStart => anchor anchorStarts
483 | Regexp.CharSet f =>
484 let
485 val i = ++ posCounter
486 val _ =
487 Int.for
488 (0, numChars, fn c =>
489 if f (Char.chr c)
490 then Array2.update (posChars, i, c, true)
491 else ())
492 val first = singleton i
493 in {first = first,
494 last = first,
495 nullable = NONE}
496 end
497 | Regexp.Or rs =>
498 List.fold
499 (rs, {first = empty,
500 last = empty,
501 nullable = NONE},
502 fn (r, {first = f, last = l, nullable = n}) =>
503 let
504 val {first = f', last = l', nullable = n'} =
505 loop r
506 in
507 {first = union (f, f'),
508 last = union (l, l'),
509 nullable = if isSome n then n else n'}
510 end)
511 | Regexp.Save (r, s) =>
512 let
513 val {first = f, last = l, nullable = n} = loop r
514 val start = MatchAction.Start s
515 val finish = MatchAction.Finish s
516 in
517 {first = addAction (f, start),
518 last = addAction (l, finish),
519 nullable = Option.map (n, fn v =>
520 Vector.concat
521 [Vector.new1 start,
522 v,
523 Vector.new1 finish])}
524 end
525 | Regexp.Seq rs =>
526 List.fold
527 (rs, {first = empty,
528 last = empty,
529 nullable = SOME (Vector.new0 ())},
530 fn (r, {first = f, last = l, nullable = n}) =>
531 let
532 val {first = f', last = l', nullable = n'} =
533 loop r
534 val _ = connect (l, f')
535 val first =
536 case n of
537 NONE => f
538 | SOME v => union (f, addActions (f', v))
539 val last =
540 case n' of
541 NONE => l'
542 | SOME v => union (l', addActions (l, v))
543 in
544 {first = first,
545 last = last,
546 nullable = (case (n, n') of
547 (SOME v, SOME v') =>
548 SOME (Vector.concat [v, v'])
549 | _ => NONE)}
550 end)
551 | Regexp.Star r =>
552 let
553 val {first = f, last = l, ...} = loop r
554 val _ = connect (l, f)
555 in
556 {first = f, last = l,
557 nullable = SOME (Vector.new0 ())}
558 end
559 val {first, last, nullable} = loop r
560 local
561 fun extract (anchors, positions) =
562 Vector.keepAllMap
563 (Vector.fromListMap
564 (!anchors, fn s =>
565 Option.map (lookup (positions, s), fn v => (s, v))),
566 fn x => x)
567 in
568 (* Any anchor starts in first should be anchor starts. *)
569 val anchorStarts = extract (anchorStarts, first)
570 (* Any anchor finishes in last should be anchor finishes *)
571 val anchorFinishes = extract (anchorFinishes, last)
572 end
573 (* The positions in first are reachable from the start state. *)
574 val _ = foreach (first, fn (i, a) =>
575 Array2.update (follow, start, i, SOME a))
576 val final = Array.array (numStates, NONE)
577 (* The positions that are followed by an anchorFinish are final,
578 * with requireFinish = true.
579 *)
580 val _ =
581 Vector.foreach
582 (anchorFinishes, fn (j, _) =>
583 Int.for
584 (0, numStates, fn i =>
585 case Array2.sub (follow, i, j) of
586 NONE => ()
587 | SOME a =>
588 Array.update (final, i, SOME {actions = a,
589 requireFinish = true})))
590 (* The positions in last are all final. *)
591 val _ =
592 foreach (last, fn (i, a) =>
593 Array.update (final, i, SOME {actions = a,
594 requireFinish = false}))
595 (* The start state is final iff the whole regexp is nullable. *)
596 val _ =
597 case nullable of
598 NONE => ()
599 | SOME v =>
600 Array.update (final, start,
601 SOME {actions = v,
602 requireFinish = false})
603 (* Compute the transition table, "next". *)
604 val tmp: MatchAction.t vector option Array.t =
605 Array.new (numStates, NONE)
606 val next =
607 Array2.tabulate
608 (numStates, numChars, fn (i, c) =>
609 let
610 val _ =
611 Int.for
612 (0, numPos, fn j =>
613 case Array2.sub (follow, i, j) of
614 NONE => ()
615 | SOME a =>
616 if Array2.sub (posChars, j, c)
617 then Array.update (tmp, j, SOME a)
618 else ())
619 val res =
620 Array.keepAllMapi (tmp, fn (i, opt) =>
621 Option.map (opt, fn v => (i, v)))
622 val _ = Int.for (0, numStates, fn j =>
623 Array.update (tmp, j, NONE))
624 in
625 res
626 end)
627 (* Two characters are equivalent if all states treat them the
628 * same.
629 *)
630 fun charEquiv (c: int, c': int) =
631 Int.forall
632 (0, numStates, fn i =>
633 Array.equals
634 (Array2.sub (next, i, c),
635 Array2.sub (next, i, c'),
636 fn ((j, v), (j', v')) =>
637 j = j' andalso Vector.equals (v, v', MatchAction.equals)))
638 (* Compute charClass. *)
639 val repCounter = ref ~1
640 val reps = ref [] (* representative of each char class *)
641 val charClass = Array.new (numChars, ~1)
642 val _ =
643 Int.for (0, numChars, fn c =>
644 let
645 val rep =
646 case List.peek (!reps, fn {char, ...} =>
647 charEquiv (c, char)) of
648 NONE =>
649 let
650 val rep = ++ repCounter
651 in List.push (reps, {char = c, rep = rep})
652 ; rep
653 end
654 | SOME {rep, ...} => rep
655 in Array.update (charClass, c, rep)
656 end)
657 val numClasses = 1 + !repCounter
658 (* Compute "next" for the charClasses. *)
659 val next' =
660 Array2.new (numStates, numClasses, Array.fromList [])
661 val _ =
662 List.foreach
663 (!reps, fn {char, rep} =>
664 Int.for (0, numStates, fn state =>
665 Array2.update (next', state, rep,
666 Array2.sub (next, state, char))))
667 in
668 T {anchorStarts = anchorStarts,
669 charClass = charClass,
670 final = final,
671 next = next',
672 saves = saves,
673 seen = Array.new (numStates, false),
674 stack1 = Stack.new (numStates, (~1, Actions.empty)),
675 stack2 = Stack.new (numStates, (~1, Actions.empty)),
676 start = start}
677 end
678
679 structure Graph = DirectedGraph
680 fun layoutDot (T {anchorStarts, charClass, final, next, start, ...},
681 title: string): Layout.t =
682 let
683 val numStates = Array2.nRows next
684 open Dot
685 val g = Graph.new ()
686 val nodes = Vector.tabulate (numStates, fn _ => Graph.newNode g)
687 fun node i = Vector.sub (nodes, i)
688 val {get = nodeOptions, ...} =
689 Property.get (Graph.Node.plist,
690 Property.initFun
691 (fn _ => let open NodeOption
692 in ref []
693 end))
694 val {get = edgeOptions, ...} =
695 Property.get (Graph.Edge.plist,
696 Property.initFun
697 (fn _ => let open EdgeOption
698 in ref []
699 end))
700 fun addNodeOption (i, opts) =
701 let val r = nodeOptions (node i)
702 in r := opts @ !r
703 end
704 val _ = addNodeOption (start, [NodeOption.label "start"])
705 val _ =
706 Int.for
707 (0, numStates, fn src =>
708 let
709 val shape =
710 case (isSome (Array.sub (final, src)),
711 Vector.exists (anchorStarts, fn (s, _) =>
712 s = src)) of
713 (false, false) => Ellipse
714 | (true, false) => Box
715 | (false, true) => Diamond
716 | (true, true) => Polygon {sides = 5, options = []}
717 val _ =
718 addNodeOption (src, let open NodeOption
719 in [Shape shape]
720 end)
721 val dsts = Array.new (numStates, [])
722 val _ =
723 Int.forDown
724 (0, numChars, fn c =>
725 if Vector.sub (validChars, c)
726 then
727 let
728 val char = Char.fromInt c
729 val class = Array.sub (charClass, c)
730 in Array.foreach
731 (Array2.sub (next, src, class), fn (dst, _) =>
732 (Array.update (dsts, dst,
733 char :: Array.sub (dsts, dst))))
734 end
735 else ())
736 in
737 Array.foreachi
738 (dsts, fn (dst, cs) =>
739 case cs of
740 [] => ()
741 | _ =>
742 let
743 val edge = Graph.addEdge (g, {from = node src,
744 to = node dst})
745 in List.push (edgeOptions edge,
746 EdgeOption.label (edgeLabel cs))
747 end)
748 end)
749 in
750 Graph.layoutDot (g, fn {nodeName} =>
751 {title = title,
752 options =
753 let open GraphOption
754 in [
755 RankDir LeftToRight,
756 Rank (Min, [{nodeName = nodeName (node start)}])
757 ]
758 end,
759 edgeOptions = ! o edgeOptions,
760 nodeOptions = ! o nodeOptions})
761 end
762 end
763
764 structure DFA:
765 sig
766 type t
767
768 val fromNFA: NFA.t -> t
769 val layoutDot: {dfa: t,
770 showDead: bool,
771 title: string} -> Layout.t
772 val match: {dfa: t,
773 short: bool,
774 string: string,
775 startPos: int,
776 anchorStart: bool} -> (int * Actions.t) option
777 val minimize: t -> t
778 val saves: t -> Save.t vector
779 end =
780 struct
781 (* The states in a DFA are indexed from 0 to n-1, where n is the number
782 * of states.
783 *)
784 structure State =
785 struct
786 type t = int
787
788 val layout = Int.layout
789 end
790
791 type slot = int
792
793 structure EdgeAction =
794 struct
795 datatype t =
796 Add of {from: slot,
797 to: slot,
798 actions: MatchAction.t vector}
799 | Init of {to: slot,
800 actions: MatchAction.t vector}
801
802 val equals =
803 fn (Add {from = f, to = t, actions = a},
804 Add {from = f', to = t', actions = a'}) =>
805 f = f' andalso t = t'
806 andalso Vector.equals (a, a', MatchAction.equals)
807 | (Init {to = t, actions = a},
808 Init {to = t', actions = a'}) =>
809 t = t' andalso Vector.equals (a, a', MatchAction.equals)
810 | _ => false
811
812 val toString =
813 fn Add {from, to, actions} =>
814 concat ["(",
815 Int.toString from, ", ",
816 Int.toString to, ", ",
817 Layout.toString
818 (Vector.layout MatchAction.layout actions),
819 ")"]
820 | Init {to, actions} =>
821 concat ["(",
822 Int.toString to, ", ",
823 Layout.toString
824 (Vector.layout MatchAction.layout actions),
825 ")"]
826
827 val layout =
828 let open Layout
829 in
830 fn Add {from, to, actions} =>
831 Layout.record
832 [("from", Int.layout from),
833 ("to", Int.layout to),
834 ("actions",
835 Vector.layout MatchAction.layout actions)]
836 | Init {actions, to} =>
837 Layout.record
838 [("to", Int.layout to),
839 ("actions",
840 Vector.layout MatchAction.layout actions)]
841 end
842 end
843
844 (* State i is final iff Array.sub (final, i).
845 * Characters are grouped into equivalence classes, represented by
846 * integers in [0, numCharClasses).
847 * The equivalence class of c is Array.sub (charClass, Char.toInt c).
848 * The dimensions of next are numStates x numCharClasses
849 * The outgoing state from state i on input char c is
850 * Array2.sub (next, i, Array.sub (charClass, Char.toInt c)).
851 * actions1 and actions2 are used only during matching. They
852 * represent the actions associated with each NFA state. They are of
853 * the same length as the number of states in the NFA.
854 *)
855 datatype t =
856 T of {anchorStart: State.t,
857 anchorStartStack: MatchAction.t vector vector,
858 charClass: int array, (* of length numChars *)
859 dead: bool array,
860 final: {actions: MatchAction.t vector,
861 requireFinish: bool,
862 slot: int} option array,
863 next: (State.t * EdgeAction.t vector) Array2.t,
864 saves: Save.t vector,
865 stack1: Actions.t array, (* of size maxNumNFAStates *)
866 stack2: Actions.t array, (* of size maxNumNFAStates *)
867 start: State.t,
868 startStack: MatchAction.t vector vector}
869
870 fun numStates (T {next, ...}): int = Array2.nRows next
871 fun saves (T {saves, ...}) = saves
872
873 fun dead (numStates, numCharClasses, final, next) =
874 Array.tabulate
875 (numStates, fn i =>
876 not (isSome (Array.sub (final, i)))
877 andalso Int.forall (0, numCharClasses, fn c =>
878 let val (j, v) = Array2.sub (next, i, c)
879 in i = j andalso Vector.isEmpty v
880 end))
881
882 (* To build a DFA from an NFA, I use the usual "subset construction",
883 * as in algorithm 3.2 (page 118) of the Dragon Book.
884 *
885 * It associates each (reachable) set of states in the NFA with a single
886 * state in the DFA.
887 *)
888 fun fromNFA (nfa as NFA.T {anchorStarts, charClass,
889 final, next, saves, start, ...}) =
890 let
891 val numNFAStates = NFA.numStates nfa
892 val numCharClasses = NFA.numCharClasses nfa
893 (* Determine the NFA states that have save info.
894 *)
895 val nfaStateSave = Array.array (numNFAStates, false)
896 fun visit (s: NFA.State.t): unit =
897 if Array.sub (nfaStateSave, s)
898 then ()
899 else (Array.update (nfaStateSave, s, true)
900 ; Int.for (0, numCharClasses, fn c =>
901 Array.foreach
902 (Array2.sub (next, s, c), fn (s', _) =>
903 visit s')))
904 val _ =
905 Vector.foreach
906 (anchorStarts, fn (s, v) =>
907 if Vector.isEmpty v
908 then ()
909 else visit s)
910 val _ =
911 Int.for (0, numNFAStates, fn s =>
912 if Array.sub (nfaStateSave, s)
913 then ()
914 else
915 Int.for (0, numCharClasses, fn c =>
916 Array.foreach
917 (Array2.sub (next, s, c), fn (s', v) =>
918 if Vector.isEmpty v
919 then ()
920 else visit s')))
921 (* Sets of states are represented as arrays, sorted in increasing
922 * order of state index.
923 *)
924 type states = NFA.State.t array
925 val counter = ref ~1
926 type work =
927 {states: states,
928 state: int,
929 out: (State.t * EdgeAction.t vector) vector option ref}
930 val cache: work list ref = ref []
931 val todo: work list ref = ref []
932 val maxNumStates: int ref = ref 0
933 fun statesToState (ss: states): State.t =
934 let
935 val n = Array.length ss
936 val _ = if n > !maxNumStates
937 then maxNumStates := n
938 else ()
939 in
940 case List.peek (!cache, fn {states, ...} =>
941 Array.equals (ss, states, op =)) of
942 NONE =>
943 let
944 val state = ++ counter
945 val work = {out = ref NONE,
946 state = state,
947 states = ss}
948 val _ = List.push (cache, work)
949 val _ = List.push (todo, work)
950 in
951 state
952 end
953 | SOME {state, ...} => state
954 end
955 val statesToState =
956 Trace.trace ("Regexp.DFA.fromNFA.statesToState",
957 Array.layout NFA.State.layout,
958 State.layout)
959 statesToState
960 local
961 val seen = Array.array (NFA.numStates nfa, NONE)
962 in
963 fun computeOut states =
964 Vector.tabulate
965 (numCharClasses, fn c =>
966 let
967 val _ = Array.modify (seen, fn _ => NONE)
968 val _ =
969 Array.foreachi
970 (states, fn (fromSlot: slot,
971 fromState: NFA.State.t) =>
972 Array.foreach
973 (Array2.sub (next, fromState, c),
974 fn (toState: NFA.State.t, v) =>
975 case Array.sub (seen, toState) of
976 NONE =>
977 Array.update
978 (seen, toState,
979 SOME {fromSlot = fromSlot,
980 fromState = fromState,
981 toState = toState,
982 actions = v})
983 | SOME _ => ()))
984 val toStates = Array.keepAllMap (seen, fn opt => opt)
985 val edgeActions = ref []
986 val toStates =
987 Array.mapi
988 (toStates, fn (toSlot: slot,
989 {fromSlot, fromState, toState,
990 actions}) =>
991 (if Array.sub (nfaStateSave, toState)
992 then
993 List.push
994 (edgeActions,
995 if Array.sub (nfaStateSave, fromState)
996 then
997 EdgeAction.Add
998 {from = fromSlot,
999 to = toSlot,
1000 actions = actions}
1001 else (EdgeAction.Init
1002 {to = toSlot,
1003 actions = actions}))
1004 else ()
1005 ; toState))
1006 in (statesToState toStates,
1007 Vector.fromList (!edgeActions))
1008 end)
1009 end
1010 fun loop () =
1011 case !todo of
1012 [] => ()
1013 | {states, out, ...} :: rest =>
1014 (todo := rest
1015 ; out := SOME (computeOut states)
1016 ; loop ())
1017 (* These calls to statesToState initialize the worklist. *)
1018 val start' = statesToState (Array.fromList [start])
1019 val startStack = Vector.new1 (Vector.new0 ())
1020 val anchorStartStates =
1021 Array.fromList
1022 (List.insert
1023 (Vector.toListMap (anchorStarts, #1), start, op <=))
1024 val anchorStart' = statesToState anchorStartStates
1025 val anchorStartStack =
1026 Vector.tabulate
1027 (Array.length anchorStartStates,
1028 fn i =>
1029 let
1030 val s = Array.sub (anchorStartStates, i)
1031 in
1032 case Vector.peek (anchorStarts, fn (s', _) => s = s') of
1033 NONE => Vector.new0 ()
1034 | SOME (_, v) => v
1035 end)
1036 val _ = loop ()
1037 (* The worklist is empty. Compute the transition table. *)
1038 val numStates = 1 + !counter
1039 val next' = Array2.new (numStates, numCharClasses,
1040 (~1, Vector.new0 ()))
1041 val final' = Array.new (numStates, NONE)
1042 val _ =
1043 List.foreach
1044 (!cache, fn {states, state = i, out, ...}: work =>
1045 let
1046 val _ =
1047 Vector.foreachi
1048 (valOf (! out), fn (c, j) =>
1049 Array2.update (next', i, c, j))
1050 val _ =
1051 case Array.sub (final', i) of
1052 SOME {requireFinish = false, ...} => ()
1053 | _ =>
1054 case Array.peekMapi (states, fn s =>
1055 Array.sub (final, s)) of
1056 NONE => ()
1057 | SOME (slot, {actions, requireFinish}) =>
1058 Array.update
1059 (final', i,
1060 SOME {actions = actions,
1061 requireFinish = requireFinish,
1062 slot = slot})
1063 in
1064 ()
1065 end)
1066 fun newStack () = Array.new (!maxNumStates, Actions.empty)
1067 in T {anchorStart = anchorStart',
1068 anchorStartStack = anchorStartStack,
1069 charClass = charClass,
1070 dead = dead (numStates, numCharClasses, final', next'),
1071 final = final',
1072 next = next',
1073 saves = saves,
1074 stack1 = newStack (),
1075 stack2 = newStack (),
1076 start = start',
1077 startStack = startStack}
1078 end
1079
1080 (*
1081 * match could be sped up some by doing the match in two passes.
1082 * The first pass just determines if the match will succeed.
1083 * The second pass computes all the edge actions.
1084 *)
1085 fun match {dfa = T {anchorStart = ancSt, anchorStartStack,
1086 charClass, dead, final, next, stack1, stack2,
1087 start, startStack, ...},
1088 short: bool,
1089 string = s,
1090 startPos: int,
1091 anchorStart: bool}: (int * Actions.t) option =
1092 let
1093 val n = String.size s
1094 fun loop (i: int,
1095 state: int,
1096 stack1, stack2,
1097 last: (int * Actions.t) option)
1098 : (int * Actions.t) option =
1099 let
1100 val last =
1101 case Array.sub (final, state) of
1102 NONE => last
1103 | SOME {actions, requireFinish, slot} =>
1104 if requireFinish andalso i < n
1105 then NONE
1106 else
1107 SOME (i, Actions.add (Array.sub (stack1, slot),
1108 i, actions))
1109 in
1110 if Array.sub (dead, state)
1111 orelse i = n
1112 orelse (short andalso isSome last)
1113 then last
1114 else
1115 let
1116 val (state, edgeActions) =
1117 Array2.sub (next, state,
1118 Array.sub
1119 (charClass,
1120 Char.toInt (String.sub (s, i))))
1121 val _ =
1122 Vector.foreach
1123 (edgeActions,
1124 fn EdgeAction.Add {from, to, actions} =>
1125 Array.update
1126 (stack2, to,
1127 Actions.add (Array.sub (stack1, from),
1128 i, actions))
1129 | EdgeAction.Init {to, actions} =>
1130 Array.update
1131 (stack2, to,
1132 Actions.add (Actions.empty, i, actions)))
1133 in
1134 loop (i + 1, state, stack2, stack1, last)
1135 end
1136 end
1137 val (state, initStack) =
1138 if anchorStart
1139 then (ancSt, anchorStartStack)
1140 else (start, startStack)
1141 val _ =
1142 Vector.foreachi
1143 (initStack, fn (slot, v) =>
1144 Array.update (stack1, slot,
1145 Actions.add (Actions.empty, startPos, v)))
1146 val res = loop (startPos, state, stack1, stack2, NONE)
1147 in
1148 res
1149 end
1150
1151 val match =
1152 Trace.trace ("Regexp.DFA.match",
1153 fn {string, startPos, ...} =>
1154 Layout.tuple [String.layout string,
1155 Int.layout startPos],
1156 Option.layout (Layout.tuple2
1157 (Int.layout, Actions.layout)))
1158 match
1159
1160 structure Graph = DirectedGraph
1161 structure Env = Env (structure Domain = MonoVector (EdgeAction))
1162 fun layoutDot {dfa as T {anchorStart, charClass, dead, final,
1163 next, start, ...},
1164 title: string,
1165 showDead: bool}: Layout.t =
1166 let
1167 val numStates = numStates dfa
1168 open Dot
1169 val g = Graph.new ()
1170 val nodes = Vector.tabulate (numStates, fn _ => Graph.newNode g)
1171 fun node i = Vector.sub (nodes, i)
1172 val {get = nodeOptions, ...} =
1173 Property.get (Graph.Node.plist,
1174 Property.initFun
1175 (fn _ => let open NodeOption
1176 in ref []
1177 end))
1178 val {get = edgeOptions, ...} =
1179 Property.get (Graph.Edge.plist,
1180 Property.initFun
1181 (fn _ => let open EdgeOption
1182 in ref []
1183 end))
1184 fun addNodeOption (i, opts) =
1185 let val r = nodeOptions (node i)
1186 in r := opts @ !r
1187 end
1188 val _ = addNodeOption (start, [NodeOption.label "start"])
1189 val _ =
1190 Int.for
1191 (0, numStates, fn src =>
1192 let
1193 val shape =
1194 case (isSome (Array.sub (final, src)),
1195 src = anchorStart) of
1196 (false, false) => Ellipse
1197 | (true, false) => Box
1198 | (false, true) => Diamond
1199 | (true, true) => Polygon {sides = 5, options = []}
1200 val _ =
1201 addNodeOption (src, let open NodeOption
1202 in [Shape shape]
1203 end)
1204 val dsts = Array.new (numStates, Env.empty ())
1205 val _ =
1206 Int.forDown
1207 (0, numChars, fn c =>
1208 if Vector.sub (validChars, c)
1209 then
1210 let
1211 val (dst, v) =
1212 Array2.sub (next, src,
1213 Array.sub (charClass, c))
1214 val e = Array.sub (dsts, dst)
1215 val c = Char.fromInt c
1216 val cs =
1217 case Env.peek (e, v) of
1218 NONE => [c]
1219 | SOME cs => c :: cs
1220 in Array.update
1221 (dsts, dst, Env.extend (e, v, cs))
1222 end
1223 else ())
1224 val src = node src
1225 in
1226 Array.foreachi
1227 (dsts, fn (dst, e) =>
1228 if not showDead andalso Array.sub (dead, dst)
1229 then ()
1230 else
1231 Env.foreachi
1232 (e, fn (v, cs) =>
1233 let
1234 val edge = Graph.addEdge (g, {from = src,
1235 to = node dst})
1236 val label =
1237 concat [edgeLabel cs,
1238 " -- ",
1239 Layout.toString
1240 (Vector.layout (Layout.str o
1241 EdgeAction.toString)
1242 v)]
1243 in List.push (edgeOptions edge,
1244 EdgeOption.label label)
1245 end))
1246 end)
1247 in
1248 Graph.layoutDot (g, fn {nodeName} =>
1249 {title = title,
1250 options =
1251 let open GraphOption
1252 in [
1253 RankDir LeftToRight,
1254 Rank (Min, [{nodeName = nodeName (node start)}])
1255 ]
1256 end,
1257 edgeOptions = ! o edgeOptions,
1258 nodeOptions = ! o nodeOptions})
1259 end
1260
1261 fun minimize d = d
1262 (* This DFA minimization algorithm is based on algorithm 3.6 (page 142)
1263 * of the Dragon Book.
1264 *
1265 * It maintains an array, r, that stores for each state s the
1266 * representative of the class to which s belongs.
1267 * It repeatedly refines an equivalence relation, represented by a list
1268 * of classes, where each class is a list of states.
1269 *)
1270(* fun minimize (dfa as T {anchorStart, charClass, final,
1271 * start, next, ...}): t =
1272 * let
1273 * val numStates = numStates dfa
1274 * val numCharClasses = numCharClasses dfa
1275 * type class = int list
1276 * type classes = class list
1277 * val repCounter = ref ~1
1278 * val change = ref false
1279 * fun newRep () = (change := true; ++ repCounter)
1280 * val finRep = newRep ()
1281 * val nonfinRep = newRep ()
1282 * val r = Array.tabulate (numStates, fn i =>
1283 * if Array.sub (final, i)
1284 * then finRep
1285 * else nonfinRep)
1286 * fun rep s = Array.sub (r, s)
1287 * fun trans (s, c) = rep (Array2.sub (next, s, c))
1288 * fun refine (class: class, ac: classes): classes =
1289 * let
1290 * val r =
1291 * List.fold
1292 * (class, [], fn (state, classes) =>
1293 * let
1294 * fun loop (classes, ac) =
1295 * case classes of
1296 * [] =>
1297 * (case ac of
1298 * [] => [{class = [state],
1299 * old = state}]
1300 * | _ =>
1301 * let
1302 * val s = newRep ()
1303 * val _ = Array.update (r, state, s)
1304 * in {class = [state],
1305 * old = state} :: ac
1306 * end)
1307 * | (z as {class, old}) :: classes =>
1308 * if Int.forall
1309 * (0, numCharClasses, fn c =>
1310 * trans (old, c) = trans (state, c))
1311 * then
1312 * (Array.update (r, state, rep old)
1313 * ; {class = state :: class,
1314 * old = old} :: (List.appendRev
1315 * (classes, ac)))
1316 * else loop (classes, z :: ac)
1317 * in loop (classes, [])
1318 * end)
1319 * in List.fold (r, ac, fn ({class, ...}, ac) =>
1320 * case class of
1321 * [_] => ac
1322 * | _ => class :: ac)
1323 * end
1324 * val refine =
1325 * Trace.trace ("refine",
1326 * (List.layout Int.layout o #1),
1327 * Layout.ignore)
1328 * refine
1329 * fun refineAll (classes: classes): unit =
1330 * case classes of
1331 * [] => ()
1332 * | _ =>
1333 * let
1334 * val _ = change := false
1335 * val classes =
1336 * List.fold (classes, [], fn (class, ac) =>
1337 * case class of
1338 * [_] => ac
1339 * | _ => refine (class, ac))
1340 * in if !change
1341 * then refineAll classes
1342 * else ()
1343 * end
1344 * val (fin, nonfin) =
1345 * Int.fold (0, numStates, ([], []), fn (i, (f, n)) =>
1346 * if Array.sub (final, i)
1347 * then (i :: f, n)
1348 * else (f, i :: n))
1349 * val _ = refineAll [fin, nonfin]
1350 * val numStates' = 1 + !repCounter
1351 * (* Compute reachable states. *)
1352 * val reached = Array.new (numStates', false)
1353 * fun visit (s: int (* an old state *)): unit =
1354 * let
1355 * val s' = rep s
1356 * in
1357 * if Array.sub (reached, s')
1358 * then ()
1359 * else (Array.update (reached, s', true)
1360 * ; Int.for (0, numCharClasses, fn c =>
1361 * visit (Array2.sub (next, s, c))))
1362 * end
1363 * val _ = visit start
1364 * val _ = visit anchorStart
1365 * (* Compute new representatives. *)
1366 * val c = ref ~1
1367 * val newR = Array.tabulate (numStates', fn s =>
1368 * if Array.sub (reached, s)
1369 * then ++ c
1370 * else ~1)
1371 * val numStates' = 1 + !c
1372 * val _ = Array.modify (r, fn s => Array.sub (newR, s))
1373 * val next' = Array2.new (numStates', numCharClasses, ~1)
1374 * val _ =
1375 * Array2.foreachi
1376 * (next, fn (s, c, s') =>
1377 * Array2.update (next', rep s, c, rep s'))
1378 * val final' = Array.array (numStates', false)
1379 * val _ =
1380 * Array.foreachi
1381 * (final, fn (i, b) =>
1382 * if b then Array.update (final', rep i, true) else ())
1383 * in T {anchorStart = rep anchorStart,
1384 * charClass = charClass,
1385 * dead = dead (numStates', numCharClasses, final', next'),
1386 * final = final',
1387 * start = rep start,
1388 * next = next'}
1389 * end
1390 *)
1391 end
1392in
1393 structure Regexp: REGEXP =
1394 struct
1395 structure Save = Save
1396 structure Match = Match
1397
1398 open Regexp
1399
1400 val anchorFinish = AnchorFinish
1401 val anchorStart = AnchorStart
1402 val isChar = CharSet
1403 fun isNotChar f = isChar (not o f)
1404 fun char c = isChar (fn c' => c = c')
1405 fun notChar c = isChar (fn c' => c <> c')
1406 val or = Or
1407 val save = Save
1408 val seq = Seq
1409 val star = Star
1410 val zeroOrMore = star
1411
1412 val dquote = char #"\""
1413
1414 val any = isChar (fn _ => true)
1415 val anys = star any
1416 val ascii = isChar (fn c => ord c <= 127)
1417 val asciis = star ascii
1418
1419 val none = isChar (fn _ => false)
1420 fun oneOf s = isChar (fn c => String.contains (s, c))
1421 fun notOneOf s = isNotChar (fn c => String.contains (s, c))
1422 val digit = isChar Char.isDigit
1423 val digits = star digit
1424 val nonDigit = isNotChar Char.isDigit
1425 val space = isChar Char.isSpace
1426 val spaces = star space
1427
1428 fun string (s: string): t =
1429 seq (Int.foldDown (0, String.size s, [], fn (i, ac) =>
1430 char (String.sub (s, i)) :: ac))
1431
1432 fun stringIgnoreCase (s: string): t =
1433 seq (Int.foldDown
1434 (0, String.size s, [], fn (i, ac) =>
1435 let
1436 val c = Char.toLower (String.sub (s, i))
1437 in
1438 isChar (fn c' => c = Char.toLower c')
1439 end :: ac))
1440
1441 val null = seq [] (* Language containing the empty string only. *)
1442 fun oneOrMore r = seq [r, star r]
1443 fun optional r = or [null, r]
1444 fun repeat (r, n: int) = seq (List.tabulate (n, fn _ => r))
1445 fun lower (r, n: int) = seq [repeat (r, n), star r]
1446 fun upper (r, n: int) =
1447 if n <= 0
1448 then null
1449 else or [null, seq [r, upper (r, n - 1)]]
1450 fun range (r, n: int, m: int) =
1451 seq [repeat (r, n), upper (r, m - n)]
1452
1453 structure Compiled =
1454 struct
1455 datatype machine =
1456 DFA of DFA.t
1457 | NFA of NFA.t
1458
1459 datatype t = T of {regexp: Regexp.t,
1460 machine: machine}
1461
1462 fun layoutDot (T {machine, ...}) =
1463 case machine of
1464 DFA m => DFA.layoutDot {dfa = m, showDead = false,
1465 title = "dfa"}
1466 | NFA m => NFA.layoutDot (m, "nfa")
1467
1468 fun layoutDotToFile (c: t, f: File.t) =
1469 File.withOut (f, fn out => Layout.output (layoutDot c, out))
1470
1471 fun layout (T {regexp, ...}) =
1472 let
1473 open Layout
1474 in
1475 Regexp.layout regexp
1476(*
1477 align [case machine of
1478 DFA dfa => DFA.layout dfa
1479 | NFA nfa => NFA.layout nfa
1480 (* str "implementing", Regexp.layout regexp *)
1481 ]
1482*)
1483 end
1484
1485 fun match {compiled = T {machine, ...},
1486 short, startPos, string} =
1487 let
1488 val anchorStart = startPos = 0
1489 val (saves, opt) =
1490 case machine of
1491 DFA dfa =>
1492 (DFA.saves dfa,
1493 DFA.match {anchorStart = anchorStart,
1494 dfa = dfa,
1495 short = short,
1496 string = string,
1497 startPos = startPos})
1498
1499 | NFA nfa =>
1500 (NFA.saves nfa,
1501 NFA.match {nfa = nfa,
1502 short = short,
1503 string = string,
1504 startPos = startPos})
1505 exception No
1506 in
1507 Option.map
1508 (opt, fn (stop, Actions.T actions) =>
1509 let
1510 val _ = Vector.foreachi (saves, fn (i, s) =>
1511 Save.assign (s, i))
1512 val n = Vector.length saves
1513 val starts = Array.array (n, ~1)
1514 val matches = Array.array (n, NONE)
1515 val _ =
1516 List.foreach
1517 (rev actions, fn (i, v) =>
1518 Vector.foreach
1519 (v, fn ma =>
1520 case ma of
1521 MatchAction.Finish s =>
1522 let
1523 val index = Save.index s
1524 val start = Array.sub (starts, index)
1525 in
1526 Array.update
1527 (matches, index,
1528 SOME (Substring.substring
1529 (string, {start = start,
1530 length = i - start})))
1531 end
1532 | MatchAction.Start s =>
1533 Array.update (starts, Save.index s, i)))
1534 val matches =
1535 Array.keepAllMapi
1536 (matches, fn (i, sso) =>
1537 case sso of
1538 NONE => NONE
1539 | SOME ss => SOME (Vector.sub (saves, i), ss))
1540 val all =
1541 Substring.substring
1542 (string, {start = startPos,
1543 length = stop - startPos})
1544 in
1545 Match.T {all = all,
1546 matches = matches}
1547 end) handle No => NONE
1548 end
1549
1550 val match =
1551 Trace.trace
1552 ("Regexp.Compiled.match",
1553 fn {compiled, short, startPos, string} =>
1554 Layout.record
1555 [("short", Bool.layout short),
1556 ("startPos", Int.layout startPos),
1557 ("string", String.layout string),
1558 ("compiled", layout compiled)],
1559 Option.layout Match.layout)
1560 match
1561
1562 fun matchLong (c, s, i) =
1563 match {compiled = c,
1564 short = false,
1565 startPos = i,
1566 string = s}
1567
1568 fun matchShort (c, s, i) =
1569 match {compiled = c,
1570 short = true,
1571 startPos = i,
1572 string = s}
1573
1574 fun matchAll (r, s) =
1575 case matchLong (r, s, 0) of
1576 NONE => NONE
1577 | SOME m => if String.size s = Substring.length (Match.all m)
1578 then SOME m
1579 else NONE
1580
1581 val matchesAll = isSome o matchAll
1582
1583 fun matchPrefix (r, s) = matchShort (r, s, 0)
1584
1585 val matchesPrefix = isSome o matchPrefix
1586
1587 fun find (c: t, s: string, startPos, short: bool) =
1588 let
1589 val n = String.size s
1590 fun loop (i: int) =
1591 if i >= n
1592 then NONE
1593 else
1594 case match {compiled = c,
1595 short = short,
1596 startPos = i,
1597 string = s} of
1598 NONE => loop (i + 1)
1599 | SOME m => SOME m
1600 in loop startPos
1601 end
1602
1603 fun findLong (c, s, i) = find (c, s, i, false)
1604 fun findShort (c, s, i) = find (c, s, i, true)
1605
1606 fun foreachMatchShort (c, s, f: Match.t -> unit) =
1607 let
1608 fun loop i =
1609 case findShort (c, s, i) of
1610 NONE => ()
1611 | SOME m => (f m; loop (Match.endOf m))
1612 in
1613 loop 0
1614 end
1615 end
1616
1617 fun compileDFA r =
1618 let
1619 val nfa = NFA.fromRegexp r
1620 in
1621 Compiled.T
1622 {regexp = r,
1623 machine = Compiled.DFA (DFA.minimize (DFA.fromNFA nfa))}
1624 end
1625
1626 val compileDFA =
1627 Trace.trace ("Regexp.compileDFA", layout, Compiled.layout) compileDFA
1628
1629 fun compileNFA r =
1630 Compiled.T
1631 {regexp = r,
1632 machine = Compiled.NFA (NFA.fromRegexp r)}
1633
1634 val compileNFA =
1635 Trace.trace ("Regexp.compileNFA", layout, Compiled.layout) compileNFA
1636
1637 (* POSIX 1003.2 regular expressions
1638 * caveats: does not support back references '\N'
1639 * does not support unmatched ')'
1640 * does not support '[=' style coallating elements
1641 * does not support coallating elements as range endpoints
1642 *
1643 * grammar:
1644 * S ::= Re
1645 * Re ::= Br Re0
1646 * Re0 ::= e | '|' Br Re0
1647 * Br ::= P Br0
1648 * Br0 ::= e | P Br0
1649 * P ::= A P0
1650 * P0 ::= e | '*' | '+' | '?' | Bnd
1651 * Bnd ::= '{' N Bnd0 '}'
1652 * Bnd0 ::= e | ',' Bnd1
1653 * Bnd1 ::= e | N
1654 * A ::= '(' Re ')'
1655 * | '()'
1656 * | '[' Be ']'
1657 * | '.'
1658 * | '^'
1659 * | '$'
1660 * | '\' C
1661 * | C
1662 * Be ::= Be0
1663 * Be0 ::= '^' Be1 | Be1
1664 * Be1 ::= ']' Be2 | Be2
1665 * Be2 ::= '-' Be3 | Be3
1666 * Be3 ::= e
1667 * | '-'
1668 * | C '-' C Be3
1669 * | '[' '.' Ce '.' ']' Be3
1670 * | '[' ':' Cl ':' ']' Be3
1671 * | C Be3
1672 * Ce ::= e | C Ce
1673 * Cl ::= 'alnum' | ... | 'xdigit'
1674 *)
1675 local
1676 exception X of string
1677 type res = t * Save.t vector
1678
1679 fun S (s: char list) : res =
1680 Re (s, fn (s, re, saves) =>
1681 case s of
1682 [] => (re, saves)
1683 | _ => raise (X "S"))
1684 and Re (s: char list,
1685 k: char list * t * Save.t vector -> res) =
1686 Br (s, fn (s, re, saves) =>
1687 Re0 (s, [re], [saves], k))
1688 and Re0 (s: char list, res: t list, savess: Save.t vector list,
1689 k: char list * t * Save.t vector -> res) =
1690 let
1691 fun finish s =
1692 k (s, or (List.rev res), Vector.concat (List.rev savess))
1693 in
1694 case s of
1695 [] => finish s
1696 | #")"::_ => finish s
1697 | #"|"::s => Br (s, fn (s, re, saves) =>
1698 Re0 (s, re::res, saves::savess, k))
1699 | _ => raise (X "Re0")
1700 end
1701 and Br (s: char list,
1702 k: char list * t * Save.t vector -> res) =
1703 P (s, fn (s, re, saves) =>
1704 Br0 (s, [re], [saves], k))
1705 and Br0 (s: char list, res: t list, savess: Save.t vector list,
1706 k: char list * t * Save.t vector -> res) =
1707 let
1708 fun finish s =
1709 k (s, seq (List.rev res), Vector.concat (List.rev savess))
1710 in
1711 case s of
1712 [] => finish s
1713 | #")"::_ => finish s
1714 | #"|"::_ => finish s
1715 | _ => P (s, fn (s, re, saves) =>
1716 Br0 (s, re::res, saves::savess, k))
1717 end
1718 and P (s: char list,
1719 k: char list * t * Save.t vector -> res) =
1720 A (s, fn (s, re, saves) => P0 (s, re, saves, [], [], k))
1721 and P0 (s: char list,
1722 re: t, saves: Save.t vector,
1723 res: t list, savess: Save.t vector list,
1724 k: char list * t * Save.t vector -> res) =
1725 let
1726 fun finish (s, re) =
1727 k (s, seq (List.rev (re::res)),
1728 Vector.concat (List.rev (saves::savess)))
1729 fun default () =
1730 let
1731 val res = re::res
1732 val savess = saves::savess
1733 in
1734 A (s, fn (s, re, saves) =>
1735 P0 (s, re, saves, res, savess, k))
1736 end
1737 in
1738 case s of
1739 [] => finish (s, re)
1740 | #")"::_ => finish (s, re)
1741 | #"|"::_ => finish (s, re)
1742 | #"*"::s => finish (s, star re)
1743 | #"+"::s => finish (s, oneOrMore re)
1744 | #"?"::s => finish (s, optional re)
1745 | #"{"::(c::s) => if Char.isDigit c
1746 then Bnd (c::s, fn (s, f) =>
1747 finish (s, f re))
1748 else default ()
1749 | _ => default ()
1750 end
1751 and Bnd (s: char list,
1752 k: char list * (t -> t) -> res) =
1753 N (s, fn (s, n) =>
1754 Bnd0 (s, n, fn (s, f) =>
1755 case s of
1756 #"}"::s => k (s, f)
1757 | _ => raise (X "Bnd")))
1758 and Bnd0 (s: char list, n: int,
1759 k: char list * (t -> t) -> res) =
1760 let
1761 fun finish (s, f) = k (s, f)
1762 in
1763 case s of
1764 #"}"::_ => finish (s, fn re => repeat (re, n))
1765 | #","::s => Bnd1 (s, n, k)
1766 | _ => raise (X "Bnd0")
1767 end
1768 and Bnd1 (s: char list, n: int,
1769 k: char list * (t -> t) -> res) =
1770 let
1771 fun finish (s, f) = k (s, f)
1772 in
1773 case s of
1774 #"}"::_ => finish (s, fn re => lower (re, n))
1775 | _ => N (s, fn (s, m) =>
1776 if m < n
1777 then raise (X "Bnd1")
1778 else finish (s, fn re => range (re, n, m)))
1779 end
1780 and N (s: char list,
1781 k: char list * int -> res) =
1782 let
1783 in
1784 case s of
1785 d::s' => (case Char.digitToInt d of
1786 SOME d => N1 (s', d, k)
1787 | NONE => raise (X "N"))
1788 | _ => raise (X "N")
1789 end
1790 and N1 (s: char list, n: int,
1791 k: char list * int -> res) =
1792 let
1793 fun finish s =
1794 k (s, n)
1795 in
1796 case s of
1797 [] => finish s
1798 | d::s' => (case Char.digitToInt d of
1799 SOME d => N1 (s', n * 10 + d, k)
1800 | NONE => finish s)
1801 end
1802 and A (s: char list,
1803 k: char list * t * Save.t vector -> res) =
1804 let
1805 fun finish (s, re, saves) =
1806 k (s, re, saves)
1807 fun finishR (s, re) =
1808 finish (s, re, Vector.new0 ())
1809 fun finishN s =
1810 finishR (s, null)
1811 fun finishC (s, c) =
1812 finishR (s, char c)
1813 in
1814 case s of
1815 #"("::(#")"::s) => finishN s
1816 | #"("::s => let
1817 val save' = Save.new ()
1818 in
1819 Re (s, fn (s, re, saves) =>
1820 case s of
1821 #")"::s => k (s, save (re, save'),
1822 Vector.concat
1823 [Vector.new1 save', saves])
1824 | _ => raise (X "A"))
1825 end
1826 | #"["::s => let
1827 in
1828 Be (s, fn (s, re) =>
1829 case s of
1830 #"]"::s => finishR (s, re)
1831 | _ => raise (X "A"))
1832 end
1833 | #"."::s => finishR (s, any)
1834 | #"^"::s => finishR (s, anchorStart)
1835 | #"$"::s => finishR (s, anchorFinish)
1836 | #"\\"::(c::s) => finishC (s, c)
1837 | c::s => if String.contains (")|*+?{", c)
1838 then raise (X "A")
1839 else finishC (s, c)
1840 | _ => raise (X "A")
1841 end
1842 and Be (s: char list,
1843 k: char list * t -> res) =
1844 Be0 (s, k)
1845 and Be0 (s: char list,
1846 k: char list * t -> res) =
1847 let
1848 in
1849 case s of
1850 #"^"::s => Be1 (s, true, k)
1851 | _ => Be1 (s, false, k)
1852 end
1853 and Be1 (s: char list, inv: bool,
1854 k: char list * t -> res) =
1855 let
1856 in
1857 case s of
1858 #"]"::s => Be2 (s, inv, [#"]"], k)
1859 | _ => Be2 (s, inv, [], k)
1860 end
1861 and Be2 (s: char list, inv: bool, cs: char list,
1862 k: char list * t -> res) =
1863 let
1864 in
1865 case s of
1866 #"-"::s => Be3 (s, inv, #"-"::cs, [], [], k)
1867 | _ => Be3 (s, inv, cs, [], [], k)
1868 end
1869 and Be3 (s: char list, inv: bool,
1870 cs: char list, cps: (char -> bool) list, ces: string list,
1871 k: char list * t -> res) =
1872 let
1873 fun finish (s: char list,
1874 cs: char list,
1875 cps: (char -> bool) list,
1876 ces: string list) =
1877 let
1878 fun finish' re = k (s, re)
1879 val s = implode cs
1880 val cp = fn c => List.exists (cps, fn cp => cp c)
1881 in
1882 if inv
1883 then
1884 (case ces of
1885 [] =>
1886 finish'
1887 (isNotChar
1888 (fn c =>
1889 cp c orelse String.contains (s, c)))
1890 | _ => Error.bug "Regexp.fromString: can't handle collating elements in negated bracket expressions")
1891 else finish' (List.fold
1892 (ces, or [isChar cp,
1893 oneOf s],
1894 fn (ce, re) =>
1895 or [string ce, re]))
1896 end
1897 in
1898 case s of
1899 #"]"::_ => finish (s, cs, cps, ces)
1900 | #"-"::s => (case s of
1901 #"]"::_ => finish (s, #"-"::cs, cps, ces)
1902 | _ => raise (X "Be3"))
1903 | c1::(#"-"::(c2::s)) =>
1904 let
1905 val r1 = Char.ord c1
1906 val r2 = Char.ord c2
1907 val cp = fn c =>
1908 let val r = Char.ord c
1909 in r1 <= r andalso r <= r2
1910 end
1911 in
1912 Be3 (s, inv, cs, cp::cps, ces, k)
1913 end
1914 | #"["::(#"."::s) =>
1915 Ce (s, [], fn (s, ce) =>
1916 case s of
1917 #"."::(#"]"::s) => Be3 (s, inv, cs, cps, ce::ces, k)
1918 | _ => raise (X "Be3"))
1919 | #"["::(#":"::s) =>
1920 Cl (s, fn (s, cp) =>
1921 case s of
1922 #":"::(#"]"::s) => Be3 (s, inv, cs, cp::cps, ces, k)
1923 | _ => raise (X "Be3"))
1924 | c::s => Be3 (s, inv, c::cs, cps, ces, k)
1925 | _ => raise (X "Be3")
1926 end
1927 and Ce (s: char list, ce: char list,
1928 k: char list * string -> res) =
1929 let
1930 fun finish s =
1931 k (s, implode (List.rev ce))
1932 in
1933 case s of
1934 #"."::_ => finish s
1935 | c::s => Ce (s, c::ce, k)
1936 | _ => raise (X "Ce")
1937 end
1938 and Cl (s: char list,
1939 k: char list * (char -> bool) -> res) =
1940 let
1941 in
1942 case s of
1943 #"a"::(#"l"::(#"n"::(#"u"::(#"m"::s)))) =>
1944 k (s, Char.isAlphaNum)
1945 | #"a"::(#"l"::(#"p"::(#"h"::(#"a"::s)))) =>
1946 k (s, Char.isAlpha)
1947 | #"b"::(#"l"::(#"a"::(#"n"::(#"k"::_)))) =>
1948 raise (X "Cl:blank")
1949 | #"c"::(#"n"::(#"t"::(#"r"::(#"l"::s)))) =>
1950 k (s, Char.isCntrl)
1951 | #"d"::(#"i"::(#"g"::(#"i"::(#"t"::s)))) =>
1952 k (s, Char.isDigit)
1953 | #"g"::(#"r"::(#"a"::(#"p"::(#"h"::s)))) =>
1954 k (s, Char.isGraph)
1955 | #"l"::(#"o"::(#"w"::(#"e"::(#"r"::s)))) =>
1956 k (s, Char.isLower)
1957 | #"p"::(#"r"::(#"i"::(#"n"::(#"t"::s)))) =>
1958 k (s, Char.isPrint)
1959 | #"p"::(#"u"::(#"n"::(#"c"::(#"t"::_)))) =>
1960 raise (X "Cl:punct")
1961 | #"s"::(#"p"::(#"a"::(#"c"::(#"e"::s)))) =>
1962 k (s, Char.isSpace)
1963 | #"u"::(#"p"::(#"p"::(#"e"::(#"r"::s)))) =>
1964 k (s, Char.isUpper)
1965 | #"x"::(#"d"::(#"i"::(#"g"::(#"i"::(#"t"::s))))) =>
1966 k (s, Char.isHexDigit)
1967 | _ => raise (X "Cl")
1968 end
1969 in
1970 val fromString: string -> (t * Save.t vector) option =
1971 fn s => (SOME (S (explode s))) handle X _ => NONE
1972 val fromString =
1973 Trace.trace ("Regexp.fromString",
1974 String.layout,
1975 Option.layout (layout o #1))
1976 fromString
1977 end
1978 end
1979
1980(* local
1981 * val _ =
1982 * let open Trace.Immediate
1983 * in
1984 * flagged()
1985 * ; debug := Out Out.error
1986 * ; on []
1987 * end
1988 * open Regexp
1989 * val a = char #"a"
1990 * val b = char #"b"
1991 * val c = char #"c"
1992 * val d = char #"d"
1993 * val r = a
1994 * val r = star a
1995 * val r = or []
1996 * val r = star any
1997 * val r = seq []
1998 * val r = or [a, b]
1999 * val r = seq [a, b, c, d]
2000 * val r = or [seq [a, b, c],
2001 * seq [a, b, d]]
2002 * val r =
2003 * seq [star (or [a, b]),
2004 * a, b, b]
2005 * val d = digit
2006 * val eol = char #"#"
2007 * val space = oneOf " \t"
2008 * val r =
2009 * seq [or [anchorStart, notOneOf "0123456789("],
2010 * or [seq [char #"(", d, d, d, char #")"],
2011 * seq [d, d, d]],
2012 * char #" ",
2013 * d, d, d,
2014 * oneOf " -",
2015 * d, d, d, d,
2016 * or [eol, nonDigit]]
2017 *
2018 * fun doit (name, lay) =
2019 * let
2020 * val dot = concat ["/tmp/", name, ".dot"]
2021 * val ps = concat ["/tmp/", name, ".ps"]
2022 * val _ = File.withOut (dot, fn out => Layout.output (lay, out))
2023 * val _ = OS.Process.system (concat ["dot ", dot, " >", ps])
2024 * in ()
2025 * end
2026 * val nfa = NFA.fromRegexp r
2027 * val _ = doit ("nfa", NFA.layoutDot (nfa, "nfa"))
2028 * val _ = Out.output (Out.error,
2029 * concat ["numCharClasses = ",
2030 * Int.toString (NFA.numCharClasses nfa),
2031 * "\n"])
2032 * val dfa = DFA.fromNFA nfa
2033 * val _ = doit ("dfa",
2034 * DFA.layoutDot {dfa = dfa, title = "dfa", showDead = false})
2035 * val min = DFA.minimize dfa
2036 * val _ = doit ("min",
2037 * DFA.layoutDot {dfa = min, title = "min", showDead = false})
2038 * in
2039 * end
2040 *)
2041end
2042
2043(* local
2044 * val _ =
2045 * let
2046 * open Trace.Immediate
2047 * in
2048 * debug := Out Out.error
2049 * ; flagged()
2050 * ; on ["Regexp.match"]
2051 * end
2052 * structure Z = TestRegexp (Regexp)
2053 * in
2054 * end
2055 *)