Commit | Line | Data |
---|---|---|
7f918cf1 CE |
1 | (* Copyright (C) 2009 Matthew Fluet. |
2 | * Copyright (C) 1999-2006 Henry Cejtin, Matthew Fluet, Suresh | |
3 | * Jagannathan, and Stephen Weeks. | |
4 | * | |
5 | * MLton is released under a BSD-style license. | |
6 | * See the file MLton-LICENSE for details. | |
7 | *) | |
8 | ||
9 | (* Many of the algorithms in this module are based on | |
10 | * Compilers: Principles, Techniques, and Tools by Aho, Sethi, and Ullman, | |
11 | * which I will refer to in comments as the Dragon Book. | |
12 | *) | |
13 | local | |
14 | fun ++ (r: int ref): int = | |
15 | let | |
16 | val n = 1 + !r | |
17 | val _ = r := n | |
18 | in n | |
19 | end | |
20 | ||
21 | val numChars: int = Char.maxOrd + 1 | |
22 | ||
23 | local | |
24 | val validCharsString = | |
25 | "\n\t@abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ()[]<>!?-&#;'/=\"$.\\" | |
26 | in | |
27 | val validChars = | |
28 | Vector.tabulate (numChars, fn i => | |
29 | String.contains (validCharsString, Char.fromInt i)) | |
30 | ||
31 | fun edgeLabel (cs: char list): string = | |
32 | let | |
33 | val chars = implode cs | |
34 | val n = String.size chars | |
35 | val numValidChars = String.size validCharsString | |
36 | in | |
37 | if n = numChars | |
38 | then "." | |
39 | else | |
40 | (if n >= Int.quot (numValidChars, 2) | |
41 | then (* Character complement. *) | |
42 | concat ["[^", | |
43 | String.alphabetize | |
44 | (String.keepAll | |
45 | (validCharsString, fn c => | |
46 | not (String.contains | |
47 | (chars, c)))), | |
48 | "]"] | |
49 | else if (1 = String.size chars | |
50 | andalso not (String.contains | |
51 | (". ", String.sub (chars, 0)))) | |
52 | then chars | |
53 | else concat ["[", chars, "]"]) | |
54 | end | |
55 | val edgeLabel = | |
56 | Trace.trace ("Regexp.edgeLabel", List.layout Char.layout, String.layout) | |
57 | edgeLabel | |
58 | end | |
59 | ||
60 | structure Save = | |
61 | struct | |
62 | datatype t = T of {index: int ref} | |
63 | ||
64 | fun layout (T {index, ...}) = | |
65 | let | |
66 | open Layout | |
67 | in | |
68 | seq [str "Save ", Int.layout (!index)] | |
69 | end | |
70 | ||
71 | fun new () = T {index = ref ~1} | |
72 | ||
73 | fun equals (T {index = i, ...}, T {index = i', ...}) = i = i' | |
74 | ||
75 | fun assign (T {index, ...}, i) = index := i | |
76 | ||
77 | fun index (T {index, ...}) = !index | |
78 | ||
79 | val index = Trace.trace ("Regexp.Save.index", layout, Int.layout) index | |
80 | end | |
81 | ||
82 | structure Regexp = | |
83 | struct | |
84 | datatype t = | |
85 | AnchorFinish | |
86 | | AnchorStart | |
87 | | CharSet of char -> bool | |
88 | | Or of t list | |
89 | | Seq of t list | |
90 | | Save of t * Save.t | |
91 | | Star of t | |
92 | ||
93 | fun layout (r: t): Layout.t = | |
94 | let | |
95 | open Layout | |
96 | in | |
97 | case r of | |
98 | AnchorFinish => str "AnchorFinish" | |
99 | | AnchorStart => str "AnchorStart" | |
100 | | CharSet f => | |
101 | seq [str "[", | |
102 | str (edgeLabel (Int.foldDown | |
103 | (0, numChars, [], fn (i, ac) => | |
104 | let | |
105 | val c = Char.fromInt i | |
106 | in | |
107 | if f c | |
108 | then c :: ac | |
109 | else ac | |
110 | end))), | |
111 | str "]"] | |
112 | | Or rs => seq [str "Or ", List.layout layout rs] | |
113 | | Seq rs => seq [str "Seq ", List.layout layout rs] | |
114 | | Save (r, s) => seq [str "Save ", | |
115 | Layout.tuple [layout r, Save.layout s]] | |
116 | | Star r => seq [str "Star ", layout r] | |
117 | end | |
118 | ||
119 | val toString = Layout.toString o layout | |
120 | end | |
121 | ||
122 | structure Stack: | |
123 | sig | |
124 | type 'a t | |
125 | ||
126 | val clear: 'a t -> unit | |
127 | val foreach: 'a t * ('a -> unit) -> unit | |
128 | val new: int * 'a -> 'a t | |
129 | val peekMap: 'a t * ('a ->'b option) -> 'b option | |
130 | val push: 'a t * 'a -> unit | |
131 | end = | |
132 | struct | |
133 | datatype 'a t = T of {elts: 'a array, | |
134 | size: int ref} | |
135 | ||
136 | fun new (size: int, dummy: 'a): 'a t = | |
137 | T {elts = Array.new (size, dummy), | |
138 | size = ref 0} | |
139 | ||
140 | fun push (T {elts, size}, x) = | |
141 | let | |
142 | val n = !size | |
143 | val _ = Array.update (elts, n, x) | |
144 | val _ = size := n + 1 | |
145 | in () | |
146 | end | |
147 | ||
148 | fun clear (T {size, ...}) = size := 0 | |
149 | ||
150 | fun foreach (T {elts, size, ...}, f) = | |
151 | Int.for (0, !size, fn i => f (Array.sub (elts, i))) | |
152 | ||
153 | fun peekMap (T {elts, size, ...}, f) = | |
154 | let | |
155 | val n = !size | |
156 | fun loop i = | |
157 | if i = n | |
158 | then NONE | |
159 | else (case f (Array.sub (elts, i)) of | |
160 | NONE => loop (i + 1) | |
161 | | SOME z => SOME z) | |
162 | in | |
163 | loop 0 | |
164 | end | |
165 | end | |
166 | ||
167 | (* NFA state. *) | |
168 | (* The states in an NFA are indexed from 0 to n-1, where n is the number | |
169 | * of states. | |
170 | *) | |
171 | structure State = | |
172 | struct | |
173 | type t = int | |
174 | ||
175 | val layout = Int.layout | |
176 | end | |
177 | ||
178 | structure MatchAction = | |
179 | struct | |
180 | datatype t = | |
181 | Finish of Save.t | |
182 | | Start of Save.t | |
183 | ||
184 | val equals = | |
185 | fn (Finish s, Finish s') => Save.equals (s, s') | |
186 | | (Start s, Start s') => Save.equals (s, s') | |
187 | | _ => false | |
188 | ||
189 | fun layout a = | |
190 | let | |
191 | open Layout | |
192 | in | |
193 | case a of | |
194 | Finish s => seq [str "Finish ", Save.layout s] | |
195 | | Start s => seq [str "Start ", Save.layout s] | |
196 | end | |
197 | end | |
198 | ||
199 | structure Match = | |
200 | struct | |
201 | datatype t = T of {all: Substring.t, | |
202 | matches: (Save.t * Substring.t) array} | |
203 | ||
204 | fun all (T {all, ...}) = all | |
205 | ||
206 | val startLength = #2 o Substring.base o all | |
207 | ||
208 | val endOf = Substring.endOf o all | |
209 | ||
210 | val length = Substring.length o all | |
211 | ||
212 | fun layout (T {all, matches}) = | |
213 | let open Layout | |
214 | in tuple [Substring.layout all, | |
215 | Array.layout (Layout.tuple2 | |
216 | (Save.layout, Substring.layout)) matches] | |
217 | end | |
218 | ||
219 | fun funs (T {matches, ...}) = | |
220 | let | |
221 | fun peek (s: Save.t): Substring.t option = | |
222 | Option.map (Array.peek (matches, fn (s', _) => | |
223 | Save.equals (s, s')), | |
224 | #2) | |
225 | in {exists = Option.isSome o peek, | |
226 | lookup = valOf o peek, | |
227 | peek = peek} | |
228 | end | |
229 | ||
230 | fun stringFuns m = | |
231 | let | |
232 | val {peek, lookup, exists} = funs m | |
233 | in | |
234 | {exists = exists, | |
235 | lookup = Substring.toString o lookup, | |
236 | peek = fn s => Option.map (peek s, Substring.toString)} | |
237 | end | |
238 | ||
239 | local | |
240 | fun make sel (m, s) = sel (funs m) s | |
241 | in | |
242 | val peek = make #peek | |
243 | val lookup = make #lookup | |
244 | val exists = make #exists | |
245 | end | |
246 | ||
247 | fun peekString (m, s) = Option.map (peek (m, s), Substring.toString) | |
248 | val lookupString = Substring.toString o lookup | |
249 | end | |
250 | ||
251 | structure Actions = | |
252 | struct | |
253 | datatype t = T of (int * MatchAction.t vector) list | |
254 | ||
255 | fun layout (T l) = | |
256 | List.layout (Layout.tuple2 (Int.layout, | |
257 | Vector.layout MatchAction.layout)) | |
258 | l | |
259 | ||
260 | val empty = T [] | |
261 | ||
262 | fun add (a as T l, i, v: MatchAction.t vector) = | |
263 | if Vector.isEmpty v | |
264 | then a | |
265 | else T ((i, v) :: l) | |
266 | end | |
267 | ||
268 | structure NFA = | |
269 | struct | |
270 | structure State = State | |
271 | (* State i is final iff isSome (Array.sub (final, i)). | |
272 | * Characters are grouped into equivalence classes, represented by | |
273 | * integers in [0, numCharClasses). | |
274 | * The equivalence class of c is Array.sub (charClass, Char.toInt c). | |
275 | * The dimensions of next is numStates x numCharClasses. | |
276 | * The outgoing states from state i on input char c are given by | |
277 | * Array2.sub (next, i, Array.sub (charClass, Char.to Int c)). | |
278 | * seen, stack1, and stack2 are used in the two stack simulation of | |
279 | * the NFA (see fun match). We preallocate them as part of the NFA | |
280 | * so they don't have to be allocated on each call to match. | |
281 | *) | |
282 | datatype t = | |
283 | T of {anchorStarts: (State.t * MatchAction.t vector) vector, | |
284 | charClass: int array, (* of length numChars *) | |
285 | final: {actions: MatchAction.t vector, | |
286 | requireFinish: bool} option array, | |
287 | next: (State.t * MatchAction.t vector) array Array2.t, | |
288 | saves: Save.t vector, | |
289 | seen: bool array, | |
290 | stack1: (State.t * Actions.t) Stack.t, | |
291 | stack2: (State.t * Actions.t) Stack.t, | |
292 | start: State.t} | |
293 | end | |
294 | ||
295 | (* Non-deterministic Finite Automaton. *) | |
296 | structure NFA: | |
297 | sig | |
298 | structure State: | |
299 | sig | |
300 | type t = int | |
301 | ||
302 | val layout: t -> Layout.t | |
303 | end | |
304 | ||
305 | datatype t = datatype NFA.t | |
306 | ||
307 | val fromRegexp: Regexp.t -> t | |
308 | val layoutDot: t * string (* title *) -> Layout.t | |
309 | val match: {nfa: t, | |
310 | short: bool, | |
311 | string: string, | |
312 | startPos: int} -> (int * Actions.t) option | |
313 | val numCharClasses: t -> int | |
314 | val numStates: t -> int | |
315 | val saves: t -> Save.t vector | |
316 | end = | |
317 | struct | |
318 | open NFA | |
319 | ||
320 | fun numStates (T {next, ...}) = Array2.nRows next | |
321 | fun numCharClasses (T {next, ...}) = Array2.nCols next | |
322 | fun saves (T {saves, ...}) = saves | |
323 | ||
324 | (* Simulating an NFA with two stacks and a bit vector, as in Algorithm | |
325 | * 3.4 (page 126) of the Dragon Book. | |
326 | *) | |
327 | fun match {nfa as T {anchorStarts, charClass, final, | |
328 | next, stack1, stack2, start, ...}, | |
329 | short, | |
330 | string = s, | |
331 | startPos}: (int * Actions.t) option = | |
332 | let | |
333 | val numStates = numStates nfa | |
334 | val n = String.size s | |
335 | val seen = Array.array (numStates, false) | |
336 | fun loop (current, nextStates, i: int, | |
337 | last: (int * Actions.t) option) | |
338 | : (int * Actions.t) option = | |
339 | let | |
340 | val last = | |
341 | case (Stack.peekMap | |
342 | (current, fn (s, a) => | |
343 | case Array.sub (final, s) of | |
344 | NONE => NONE | |
345 | | SOME {actions, requireFinish} => | |
346 | if requireFinish andalso i < n | |
347 | then NONE | |
348 | else SOME (i, Actions.add (a, i, actions)))) of | |
349 | NONE => last | |
350 | | s => s | |
351 | in | |
352 | if numStates = 0 | |
353 | orelse i = n | |
354 | orelse (short andalso isSome last) | |
355 | then (Stack.clear current | |
356 | ; last) | |
357 | else | |
358 | let | |
359 | val _ = Array.modify (seen, fn _ => false) | |
360 | val c = Array.sub (charClass, | |
361 | Char.toInt (String.sub (s, i))) | |
362 | val _ = | |
363 | Stack.foreach | |
364 | (current, fn (s, a) => | |
365 | Array.foreach | |
366 | (Array2.sub (next, s, c), | |
367 | fn (s', v) => | |
368 | if Array.sub (seen, s') | |
369 | then () | |
370 | else (Array.update (seen, s', true) | |
371 | ; (Stack.push | |
372 | (nextStates, | |
373 | (s', Actions.add (a, i, v))))))) | |
374 | val _ = Stack.clear current | |
375 | in loop (nextStates, current, i + 1, last) | |
376 | end | |
377 | end | |
378 | val _ = Stack.push (stack1, (start, Actions.empty)) | |
379 | val _ = | |
380 | if startPos = 0 | |
381 | then (Vector.foreach | |
382 | (anchorStarts, fn (s, v) => | |
383 | Stack.push | |
384 | (stack1, | |
385 | (s, Actions.add (Actions.empty, startPos, v))))) | |
386 | else () | |
387 | in | |
388 | loop (stack1, stack2, startPos, NONE) | |
389 | end | |
390 | ||
391 | (* This conversion from a regular expression to an NFA is based on | |
392 | * Section 3.9 (pages 134 -- 140) of the Dragon Book. | |
393 | * | |
394 | * It creates one NFA state for each CharSet (called a "position") that | |
395 | * is in the regexp. There is also one extra state for the start state. | |
396 | * It adds edges as in rules 1 and 2 (page 138) for the followpos | |
397 | * function. | |
398 | *) | |
399 | fun fromRegexp (r: Regexp.t): t = | |
400 | let | |
401 | fun loop (r, ac as (saves, numPos)) = | |
402 | let | |
403 | open Regexp | |
404 | in | |
405 | case r of | |
406 | AnchorFinish => (saves, numPos + 1) | |
407 | | AnchorStart => (saves, numPos + 1) | |
408 | | CharSet _ => (saves, numPos + 1) | |
409 | | Or rs => List.fold (rs, ac, loop) | |
410 | | Save (r, s) => loop (r, (s :: saves, numPos)) | |
411 | | Seq rs => List.fold (rs, ac, loop) | |
412 | | Star r => loop (r, ac) | |
413 | end | |
414 | val (saves, numPos) = loop (r, ([], 0)) | |
415 | val saves = Vector.fromList saves | |
416 | val numStates = numPos + 1 | |
417 | val start = numPos | |
418 | val posCounter = ref ~1 | |
419 | val follow: MatchAction.t vector option Array2.t = | |
420 | Array2.new (numStates, numStates, NONE) | |
421 | val posChars = Array2.tabulate (numPos, numChars, fn _ => false) | |
422 | local | |
423 | (* Sets of positions represented as vectors of length numPos. | |
424 | *) | |
425 | datatype t = T of MatchAction.t vector option vector | |
426 | in | |
427 | type set = t | |
428 | fun lookup (T v, s) = Vector.sub (v, s) | |
429 | val empty: t = T (Vector.new (numPos, NONE)) | |
430 | fun addActions (T v, a) = | |
431 | T (Vector.map | |
432 | (v, fn opt => | |
433 | Option.map (opt, fn a' => Vector.concat [a, a']))) | |
434 | fun addAction (s, a) = addActions (s, Vector.new1 a) | |
435 | fun union (T v, T v'): t = | |
436 | T (Vector.tabulate | |
437 | (numPos, fn i => | |
438 | case (Vector.sub (v, i), Vector.sub (v', i)) of | |
439 | (NONE, a) => a | |
440 | | (a, NONE) => a | |
441 | | _ => Error.bug "Regexp.NFA.fromRegexp.union")) | |
442 | fun singleton (i: int): t = | |
443 | T (Vector.tabulate (numPos, fn j => | |
444 | if i = j | |
445 | then SOME (Vector.new0 ()) | |
446 | else NONE)) | |
447 | fun foreach (T v, f) = | |
448 | Vector.foreachi (v, fn (i, opt) => | |
449 | case opt of | |
450 | NONE => () | |
451 | | SOME a => f (i, a)) | |
452 | end | |
453 | fun connect (v, v') = | |
454 | foreach | |
455 | (v, fn (s, a) => | |
456 | foreach | |
457 | (v', fn (s', a') => | |
458 | Array2.update (follow, s, s', | |
459 | SOME (Vector.concat [a, a'])))) | |
460 | val anchorFinishes = ref [] | |
461 | val anchorStarts = ref [] | |
462 | fun anchor r = | |
463 | let | |
464 | val i = ++ posCounter | |
465 | val _ = List.push (r, i) | |
466 | val first = singleton i | |
467 | in | |
468 | {first = first, | |
469 | last = first, | |
470 | nullable = NONE} | |
471 | end | |
472 | (* The following loop fills in follow and posChars. | |
473 | * first set of positions that | |
474 | * nullable is SOME v iff the regexp is nullable, where v is the | |
475 | * sequence of actions to perform if the expression is null. | |
476 | *) | |
477 | fun loop (r: Regexp.t): {first: set, | |
478 | last: set, | |
479 | nullable: MatchAction.t vector option} = | |
480 | case r of | |
481 | Regexp.AnchorFinish => anchor anchorFinishes | |
482 | | Regexp.AnchorStart => anchor anchorStarts | |
483 | | Regexp.CharSet f => | |
484 | let | |
485 | val i = ++ posCounter | |
486 | val _ = | |
487 | Int.for | |
488 | (0, numChars, fn c => | |
489 | if f (Char.chr c) | |
490 | then Array2.update (posChars, i, c, true) | |
491 | else ()) | |
492 | val first = singleton i | |
493 | in {first = first, | |
494 | last = first, | |
495 | nullable = NONE} | |
496 | end | |
497 | | Regexp.Or rs => | |
498 | List.fold | |
499 | (rs, {first = empty, | |
500 | last = empty, | |
501 | nullable = NONE}, | |
502 | fn (r, {first = f, last = l, nullable = n}) => | |
503 | let | |
504 | val {first = f', last = l', nullable = n'} = | |
505 | loop r | |
506 | in | |
507 | {first = union (f, f'), | |
508 | last = union (l, l'), | |
509 | nullable = if isSome n then n else n'} | |
510 | end) | |
511 | | Regexp.Save (r, s) => | |
512 | let | |
513 | val {first = f, last = l, nullable = n} = loop r | |
514 | val start = MatchAction.Start s | |
515 | val finish = MatchAction.Finish s | |
516 | in | |
517 | {first = addAction (f, start), | |
518 | last = addAction (l, finish), | |
519 | nullable = Option.map (n, fn v => | |
520 | Vector.concat | |
521 | [Vector.new1 start, | |
522 | v, | |
523 | Vector.new1 finish])} | |
524 | end | |
525 | | Regexp.Seq rs => | |
526 | List.fold | |
527 | (rs, {first = empty, | |
528 | last = empty, | |
529 | nullable = SOME (Vector.new0 ())}, | |
530 | fn (r, {first = f, last = l, nullable = n}) => | |
531 | let | |
532 | val {first = f', last = l', nullable = n'} = | |
533 | loop r | |
534 | val _ = connect (l, f') | |
535 | val first = | |
536 | case n of | |
537 | NONE => f | |
538 | | SOME v => union (f, addActions (f', v)) | |
539 | val last = | |
540 | case n' of | |
541 | NONE => l' | |
542 | | SOME v => union (l', addActions (l, v)) | |
543 | in | |
544 | {first = first, | |
545 | last = last, | |
546 | nullable = (case (n, n') of | |
547 | (SOME v, SOME v') => | |
548 | SOME (Vector.concat [v, v']) | |
549 | | _ => NONE)} | |
550 | end) | |
551 | | Regexp.Star r => | |
552 | let | |
553 | val {first = f, last = l, ...} = loop r | |
554 | val _ = connect (l, f) | |
555 | in | |
556 | {first = f, last = l, | |
557 | nullable = SOME (Vector.new0 ())} | |
558 | end | |
559 | val {first, last, nullable} = loop r | |
560 | local | |
561 | fun extract (anchors, positions) = | |
562 | Vector.keepAllMap | |
563 | (Vector.fromListMap | |
564 | (!anchors, fn s => | |
565 | Option.map (lookup (positions, s), fn v => (s, v))), | |
566 | fn x => x) | |
567 | in | |
568 | (* Any anchor starts in first should be anchor starts. *) | |
569 | val anchorStarts = extract (anchorStarts, first) | |
570 | (* Any anchor finishes in last should be anchor finishes *) | |
571 | val anchorFinishes = extract (anchorFinishes, last) | |
572 | end | |
573 | (* The positions in first are reachable from the start state. *) | |
574 | val _ = foreach (first, fn (i, a) => | |
575 | Array2.update (follow, start, i, SOME a)) | |
576 | val final = Array.array (numStates, NONE) | |
577 | (* The positions that are followed by an anchorFinish are final, | |
578 | * with requireFinish = true. | |
579 | *) | |
580 | val _ = | |
581 | Vector.foreach | |
582 | (anchorFinishes, fn (j, _) => | |
583 | Int.for | |
584 | (0, numStates, fn i => | |
585 | case Array2.sub (follow, i, j) of | |
586 | NONE => () | |
587 | | SOME a => | |
588 | Array.update (final, i, SOME {actions = a, | |
589 | requireFinish = true}))) | |
590 | (* The positions in last are all final. *) | |
591 | val _ = | |
592 | foreach (last, fn (i, a) => | |
593 | Array.update (final, i, SOME {actions = a, | |
594 | requireFinish = false})) | |
595 | (* The start state is final iff the whole regexp is nullable. *) | |
596 | val _ = | |
597 | case nullable of | |
598 | NONE => () | |
599 | | SOME v => | |
600 | Array.update (final, start, | |
601 | SOME {actions = v, | |
602 | requireFinish = false}) | |
603 | (* Compute the transition table, "next". *) | |
604 | val tmp: MatchAction.t vector option Array.t = | |
605 | Array.new (numStates, NONE) | |
606 | val next = | |
607 | Array2.tabulate | |
608 | (numStates, numChars, fn (i, c) => | |
609 | let | |
610 | val _ = | |
611 | Int.for | |
612 | (0, numPos, fn j => | |
613 | case Array2.sub (follow, i, j) of | |
614 | NONE => () | |
615 | | SOME a => | |
616 | if Array2.sub (posChars, j, c) | |
617 | then Array.update (tmp, j, SOME a) | |
618 | else ()) | |
619 | val res = | |
620 | Array.keepAllMapi (tmp, fn (i, opt) => | |
621 | Option.map (opt, fn v => (i, v))) | |
622 | val _ = Int.for (0, numStates, fn j => | |
623 | Array.update (tmp, j, NONE)) | |
624 | in | |
625 | res | |
626 | end) | |
627 | (* Two characters are equivalent if all states treat them the | |
628 | * same. | |
629 | *) | |
630 | fun charEquiv (c: int, c': int) = | |
631 | Int.forall | |
632 | (0, numStates, fn i => | |
633 | Array.equals | |
634 | (Array2.sub (next, i, c), | |
635 | Array2.sub (next, i, c'), | |
636 | fn ((j, v), (j', v')) => | |
637 | j = j' andalso Vector.equals (v, v', MatchAction.equals))) | |
638 | (* Compute charClass. *) | |
639 | val repCounter = ref ~1 | |
640 | val reps = ref [] (* representative of each char class *) | |
641 | val charClass = Array.new (numChars, ~1) | |
642 | val _ = | |
643 | Int.for (0, numChars, fn c => | |
644 | let | |
645 | val rep = | |
646 | case List.peek (!reps, fn {char, ...} => | |
647 | charEquiv (c, char)) of | |
648 | NONE => | |
649 | let | |
650 | val rep = ++ repCounter | |
651 | in List.push (reps, {char = c, rep = rep}) | |
652 | ; rep | |
653 | end | |
654 | | SOME {rep, ...} => rep | |
655 | in Array.update (charClass, c, rep) | |
656 | end) | |
657 | val numClasses = 1 + !repCounter | |
658 | (* Compute "next" for the charClasses. *) | |
659 | val next' = | |
660 | Array2.new (numStates, numClasses, Array.fromList []) | |
661 | val _ = | |
662 | List.foreach | |
663 | (!reps, fn {char, rep} => | |
664 | Int.for (0, numStates, fn state => | |
665 | Array2.update (next', state, rep, | |
666 | Array2.sub (next, state, char)))) | |
667 | in | |
668 | T {anchorStarts = anchorStarts, | |
669 | charClass = charClass, | |
670 | final = final, | |
671 | next = next', | |
672 | saves = saves, | |
673 | seen = Array.new (numStates, false), | |
674 | stack1 = Stack.new (numStates, (~1, Actions.empty)), | |
675 | stack2 = Stack.new (numStates, (~1, Actions.empty)), | |
676 | start = start} | |
677 | end | |
678 | ||
679 | structure Graph = DirectedGraph | |
680 | fun layoutDot (T {anchorStarts, charClass, final, next, start, ...}, | |
681 | title: string): Layout.t = | |
682 | let | |
683 | val numStates = Array2.nRows next | |
684 | open Dot | |
685 | val g = Graph.new () | |
686 | val nodes = Vector.tabulate (numStates, fn _ => Graph.newNode g) | |
687 | fun node i = Vector.sub (nodes, i) | |
688 | val {get = nodeOptions, ...} = | |
689 | Property.get (Graph.Node.plist, | |
690 | Property.initFun | |
691 | (fn _ => let open NodeOption | |
692 | in ref [] | |
693 | end)) | |
694 | val {get = edgeOptions, ...} = | |
695 | Property.get (Graph.Edge.plist, | |
696 | Property.initFun | |
697 | (fn _ => let open EdgeOption | |
698 | in ref [] | |
699 | end)) | |
700 | fun addNodeOption (i, opts) = | |
701 | let val r = nodeOptions (node i) | |
702 | in r := opts @ !r | |
703 | end | |
704 | val _ = addNodeOption (start, [NodeOption.label "start"]) | |
705 | val _ = | |
706 | Int.for | |
707 | (0, numStates, fn src => | |
708 | let | |
709 | val shape = | |
710 | case (isSome (Array.sub (final, src)), | |
711 | Vector.exists (anchorStarts, fn (s, _) => | |
712 | s = src)) of | |
713 | (false, false) => Ellipse | |
714 | | (true, false) => Box | |
715 | | (false, true) => Diamond | |
716 | | (true, true) => Polygon {sides = 5, options = []} | |
717 | val _ = | |
718 | addNodeOption (src, let open NodeOption | |
719 | in [Shape shape] | |
720 | end) | |
721 | val dsts = Array.new (numStates, []) | |
722 | val _ = | |
723 | Int.forDown | |
724 | (0, numChars, fn c => | |
725 | if Vector.sub (validChars, c) | |
726 | then | |
727 | let | |
728 | val char = Char.fromInt c | |
729 | val class = Array.sub (charClass, c) | |
730 | in Array.foreach | |
731 | (Array2.sub (next, src, class), fn (dst, _) => | |
732 | (Array.update (dsts, dst, | |
733 | char :: Array.sub (dsts, dst)))) | |
734 | end | |
735 | else ()) | |
736 | in | |
737 | Array.foreachi | |
738 | (dsts, fn (dst, cs) => | |
739 | case cs of | |
740 | [] => () | |
741 | | _ => | |
742 | let | |
743 | val edge = Graph.addEdge (g, {from = node src, | |
744 | to = node dst}) | |
745 | in List.push (edgeOptions edge, | |
746 | EdgeOption.label (edgeLabel cs)) | |
747 | end) | |
748 | end) | |
749 | in | |
750 | Graph.layoutDot (g, fn {nodeName} => | |
751 | {title = title, | |
752 | options = | |
753 | let open GraphOption | |
754 | in [ | |
755 | RankDir LeftToRight, | |
756 | Rank (Min, [{nodeName = nodeName (node start)}]) | |
757 | ] | |
758 | end, | |
759 | edgeOptions = ! o edgeOptions, | |
760 | nodeOptions = ! o nodeOptions}) | |
761 | end | |
762 | end | |
763 | ||
764 | structure DFA: | |
765 | sig | |
766 | type t | |
767 | ||
768 | val fromNFA: NFA.t -> t | |
769 | val layoutDot: {dfa: t, | |
770 | showDead: bool, | |
771 | title: string} -> Layout.t | |
772 | val match: {dfa: t, | |
773 | short: bool, | |
774 | string: string, | |
775 | startPos: int, | |
776 | anchorStart: bool} -> (int * Actions.t) option | |
777 | val minimize: t -> t | |
778 | val saves: t -> Save.t vector | |
779 | end = | |
780 | struct | |
781 | (* The states in a DFA are indexed from 0 to n-1, where n is the number | |
782 | * of states. | |
783 | *) | |
784 | structure State = | |
785 | struct | |
786 | type t = int | |
787 | ||
788 | val layout = Int.layout | |
789 | end | |
790 | ||
791 | type slot = int | |
792 | ||
793 | structure EdgeAction = | |
794 | struct | |
795 | datatype t = | |
796 | Add of {from: slot, | |
797 | to: slot, | |
798 | actions: MatchAction.t vector} | |
799 | | Init of {to: slot, | |
800 | actions: MatchAction.t vector} | |
801 | ||
802 | val equals = | |
803 | fn (Add {from = f, to = t, actions = a}, | |
804 | Add {from = f', to = t', actions = a'}) => | |
805 | f = f' andalso t = t' | |
806 | andalso Vector.equals (a, a', MatchAction.equals) | |
807 | | (Init {to = t, actions = a}, | |
808 | Init {to = t', actions = a'}) => | |
809 | t = t' andalso Vector.equals (a, a', MatchAction.equals) | |
810 | | _ => false | |
811 | ||
812 | val toString = | |
813 | fn Add {from, to, actions} => | |
814 | concat ["(", | |
815 | Int.toString from, ", ", | |
816 | Int.toString to, ", ", | |
817 | Layout.toString | |
818 | (Vector.layout MatchAction.layout actions), | |
819 | ")"] | |
820 | | Init {to, actions} => | |
821 | concat ["(", | |
822 | Int.toString to, ", ", | |
823 | Layout.toString | |
824 | (Vector.layout MatchAction.layout actions), | |
825 | ")"] | |
826 | ||
827 | val layout = | |
828 | let open Layout | |
829 | in | |
830 | fn Add {from, to, actions} => | |
831 | Layout.record | |
832 | [("from", Int.layout from), | |
833 | ("to", Int.layout to), | |
834 | ("actions", | |
835 | Vector.layout MatchAction.layout actions)] | |
836 | | Init {actions, to} => | |
837 | Layout.record | |
838 | [("to", Int.layout to), | |
839 | ("actions", | |
840 | Vector.layout MatchAction.layout actions)] | |
841 | end | |
842 | end | |
843 | ||
844 | (* State i is final iff Array.sub (final, i). | |
845 | * Characters are grouped into equivalence classes, represented by | |
846 | * integers in [0, numCharClasses). | |
847 | * The equivalence class of c is Array.sub (charClass, Char.toInt c). | |
848 | * The dimensions of next are numStates x numCharClasses | |
849 | * The outgoing state from state i on input char c is | |
850 | * Array2.sub (next, i, Array.sub (charClass, Char.toInt c)). | |
851 | * actions1 and actions2 are used only during matching. They | |
852 | * represent the actions associated with each NFA state. They are of | |
853 | * the same length as the number of states in the NFA. | |
854 | *) | |
855 | datatype t = | |
856 | T of {anchorStart: State.t, | |
857 | anchorStartStack: MatchAction.t vector vector, | |
858 | charClass: int array, (* of length numChars *) | |
859 | dead: bool array, | |
860 | final: {actions: MatchAction.t vector, | |
861 | requireFinish: bool, | |
862 | slot: int} option array, | |
863 | next: (State.t * EdgeAction.t vector) Array2.t, | |
864 | saves: Save.t vector, | |
865 | stack1: Actions.t array, (* of size maxNumNFAStates *) | |
866 | stack2: Actions.t array, (* of size maxNumNFAStates *) | |
867 | start: State.t, | |
868 | startStack: MatchAction.t vector vector} | |
869 | ||
870 | fun numStates (T {next, ...}): int = Array2.nRows next | |
871 | fun saves (T {saves, ...}) = saves | |
872 | ||
873 | fun dead (numStates, numCharClasses, final, next) = | |
874 | Array.tabulate | |
875 | (numStates, fn i => | |
876 | not (isSome (Array.sub (final, i))) | |
877 | andalso Int.forall (0, numCharClasses, fn c => | |
878 | let val (j, v) = Array2.sub (next, i, c) | |
879 | in i = j andalso Vector.isEmpty v | |
880 | end)) | |
881 | ||
882 | (* To build a DFA from an NFA, I use the usual "subset construction", | |
883 | * as in algorithm 3.2 (page 118) of the Dragon Book. | |
884 | * | |
885 | * It associates each (reachable) set of states in the NFA with a single | |
886 | * state in the DFA. | |
887 | *) | |
888 | fun fromNFA (nfa as NFA.T {anchorStarts, charClass, | |
889 | final, next, saves, start, ...}) = | |
890 | let | |
891 | val numNFAStates = NFA.numStates nfa | |
892 | val numCharClasses = NFA.numCharClasses nfa | |
893 | (* Determine the NFA states that have save info. | |
894 | *) | |
895 | val nfaStateSave = Array.array (numNFAStates, false) | |
896 | fun visit (s: NFA.State.t): unit = | |
897 | if Array.sub (nfaStateSave, s) | |
898 | then () | |
899 | else (Array.update (nfaStateSave, s, true) | |
900 | ; Int.for (0, numCharClasses, fn c => | |
901 | Array.foreach | |
902 | (Array2.sub (next, s, c), fn (s', _) => | |
903 | visit s'))) | |
904 | val _ = | |
905 | Vector.foreach | |
906 | (anchorStarts, fn (s, v) => | |
907 | if Vector.isEmpty v | |
908 | then () | |
909 | else visit s) | |
910 | val _ = | |
911 | Int.for (0, numNFAStates, fn s => | |
912 | if Array.sub (nfaStateSave, s) | |
913 | then () | |
914 | else | |
915 | Int.for (0, numCharClasses, fn c => | |
916 | Array.foreach | |
917 | (Array2.sub (next, s, c), fn (s', v) => | |
918 | if Vector.isEmpty v | |
919 | then () | |
920 | else visit s'))) | |
921 | (* Sets of states are represented as arrays, sorted in increasing | |
922 | * order of state index. | |
923 | *) | |
924 | type states = NFA.State.t array | |
925 | val counter = ref ~1 | |
926 | type work = | |
927 | {states: states, | |
928 | state: int, | |
929 | out: (State.t * EdgeAction.t vector) vector option ref} | |
930 | val cache: work list ref = ref [] | |
931 | val todo: work list ref = ref [] | |
932 | val maxNumStates: int ref = ref 0 | |
933 | fun statesToState (ss: states): State.t = | |
934 | let | |
935 | val n = Array.length ss | |
936 | val _ = if n > !maxNumStates | |
937 | then maxNumStates := n | |
938 | else () | |
939 | in | |
940 | case List.peek (!cache, fn {states, ...} => | |
941 | Array.equals (ss, states, op =)) of | |
942 | NONE => | |
943 | let | |
944 | val state = ++ counter | |
945 | val work = {out = ref NONE, | |
946 | state = state, | |
947 | states = ss} | |
948 | val _ = List.push (cache, work) | |
949 | val _ = List.push (todo, work) | |
950 | in | |
951 | state | |
952 | end | |
953 | | SOME {state, ...} => state | |
954 | end | |
955 | val statesToState = | |
956 | Trace.trace ("Regexp.DFA.fromNFA.statesToState", | |
957 | Array.layout NFA.State.layout, | |
958 | State.layout) | |
959 | statesToState | |
960 | local | |
961 | val seen = Array.array (NFA.numStates nfa, NONE) | |
962 | in | |
963 | fun computeOut states = | |
964 | Vector.tabulate | |
965 | (numCharClasses, fn c => | |
966 | let | |
967 | val _ = Array.modify (seen, fn _ => NONE) | |
968 | val _ = | |
969 | Array.foreachi | |
970 | (states, fn (fromSlot: slot, | |
971 | fromState: NFA.State.t) => | |
972 | Array.foreach | |
973 | (Array2.sub (next, fromState, c), | |
974 | fn (toState: NFA.State.t, v) => | |
975 | case Array.sub (seen, toState) of | |
976 | NONE => | |
977 | Array.update | |
978 | (seen, toState, | |
979 | SOME {fromSlot = fromSlot, | |
980 | fromState = fromState, | |
981 | toState = toState, | |
982 | actions = v}) | |
983 | | SOME _ => ())) | |
984 | val toStates = Array.keepAllMap (seen, fn opt => opt) | |
985 | val edgeActions = ref [] | |
986 | val toStates = | |
987 | Array.mapi | |
988 | (toStates, fn (toSlot: slot, | |
989 | {fromSlot, fromState, toState, | |
990 | actions}) => | |
991 | (if Array.sub (nfaStateSave, toState) | |
992 | then | |
993 | List.push | |
994 | (edgeActions, | |
995 | if Array.sub (nfaStateSave, fromState) | |
996 | then | |
997 | EdgeAction.Add | |
998 | {from = fromSlot, | |
999 | to = toSlot, | |
1000 | actions = actions} | |
1001 | else (EdgeAction.Init | |
1002 | {to = toSlot, | |
1003 | actions = actions})) | |
1004 | else () | |
1005 | ; toState)) | |
1006 | in (statesToState toStates, | |
1007 | Vector.fromList (!edgeActions)) | |
1008 | end) | |
1009 | end | |
1010 | fun loop () = | |
1011 | case !todo of | |
1012 | [] => () | |
1013 | | {states, out, ...} :: rest => | |
1014 | (todo := rest | |
1015 | ; out := SOME (computeOut states) | |
1016 | ; loop ()) | |
1017 | (* These calls to statesToState initialize the worklist. *) | |
1018 | val start' = statesToState (Array.fromList [start]) | |
1019 | val startStack = Vector.new1 (Vector.new0 ()) | |
1020 | val anchorStartStates = | |
1021 | Array.fromList | |
1022 | (List.insert | |
1023 | (Vector.toListMap (anchorStarts, #1), start, op <=)) | |
1024 | val anchorStart' = statesToState anchorStartStates | |
1025 | val anchorStartStack = | |
1026 | Vector.tabulate | |
1027 | (Array.length anchorStartStates, | |
1028 | fn i => | |
1029 | let | |
1030 | val s = Array.sub (anchorStartStates, i) | |
1031 | in | |
1032 | case Vector.peek (anchorStarts, fn (s', _) => s = s') of | |
1033 | NONE => Vector.new0 () | |
1034 | | SOME (_, v) => v | |
1035 | end) | |
1036 | val _ = loop () | |
1037 | (* The worklist is empty. Compute the transition table. *) | |
1038 | val numStates = 1 + !counter | |
1039 | val next' = Array2.new (numStates, numCharClasses, | |
1040 | (~1, Vector.new0 ())) | |
1041 | val final' = Array.new (numStates, NONE) | |
1042 | val _ = | |
1043 | List.foreach | |
1044 | (!cache, fn {states, state = i, out, ...}: work => | |
1045 | let | |
1046 | val _ = | |
1047 | Vector.foreachi | |
1048 | (valOf (! out), fn (c, j) => | |
1049 | Array2.update (next', i, c, j)) | |
1050 | val _ = | |
1051 | case Array.sub (final', i) of | |
1052 | SOME {requireFinish = false, ...} => () | |
1053 | | _ => | |
1054 | case Array.peekMapi (states, fn s => | |
1055 | Array.sub (final, s)) of | |
1056 | NONE => () | |
1057 | | SOME (slot, {actions, requireFinish}) => | |
1058 | Array.update | |
1059 | (final', i, | |
1060 | SOME {actions = actions, | |
1061 | requireFinish = requireFinish, | |
1062 | slot = slot}) | |
1063 | in | |
1064 | () | |
1065 | end) | |
1066 | fun newStack () = Array.new (!maxNumStates, Actions.empty) | |
1067 | in T {anchorStart = anchorStart', | |
1068 | anchorStartStack = anchorStartStack, | |
1069 | charClass = charClass, | |
1070 | dead = dead (numStates, numCharClasses, final', next'), | |
1071 | final = final', | |
1072 | next = next', | |
1073 | saves = saves, | |
1074 | stack1 = newStack (), | |
1075 | stack2 = newStack (), | |
1076 | start = start', | |
1077 | startStack = startStack} | |
1078 | end | |
1079 | ||
1080 | (* | |
1081 | * match could be sped up some by doing the match in two passes. | |
1082 | * The first pass just determines if the match will succeed. | |
1083 | * The second pass computes all the edge actions. | |
1084 | *) | |
1085 | fun match {dfa = T {anchorStart = ancSt, anchorStartStack, | |
1086 | charClass, dead, final, next, stack1, stack2, | |
1087 | start, startStack, ...}, | |
1088 | short: bool, | |
1089 | string = s, | |
1090 | startPos: int, | |
1091 | anchorStart: bool}: (int * Actions.t) option = | |
1092 | let | |
1093 | val n = String.size s | |
1094 | fun loop (i: int, | |
1095 | state: int, | |
1096 | stack1, stack2, | |
1097 | last: (int * Actions.t) option) | |
1098 | : (int * Actions.t) option = | |
1099 | let | |
1100 | val last = | |
1101 | case Array.sub (final, state) of | |
1102 | NONE => last | |
1103 | | SOME {actions, requireFinish, slot} => | |
1104 | if requireFinish andalso i < n | |
1105 | then NONE | |
1106 | else | |
1107 | SOME (i, Actions.add (Array.sub (stack1, slot), | |
1108 | i, actions)) | |
1109 | in | |
1110 | if Array.sub (dead, state) | |
1111 | orelse i = n | |
1112 | orelse (short andalso isSome last) | |
1113 | then last | |
1114 | else | |
1115 | let | |
1116 | val (state, edgeActions) = | |
1117 | Array2.sub (next, state, | |
1118 | Array.sub | |
1119 | (charClass, | |
1120 | Char.toInt (String.sub (s, i)))) | |
1121 | val _ = | |
1122 | Vector.foreach | |
1123 | (edgeActions, | |
1124 | fn EdgeAction.Add {from, to, actions} => | |
1125 | Array.update | |
1126 | (stack2, to, | |
1127 | Actions.add (Array.sub (stack1, from), | |
1128 | i, actions)) | |
1129 | | EdgeAction.Init {to, actions} => | |
1130 | Array.update | |
1131 | (stack2, to, | |
1132 | Actions.add (Actions.empty, i, actions))) | |
1133 | in | |
1134 | loop (i + 1, state, stack2, stack1, last) | |
1135 | end | |
1136 | end | |
1137 | val (state, initStack) = | |
1138 | if anchorStart | |
1139 | then (ancSt, anchorStartStack) | |
1140 | else (start, startStack) | |
1141 | val _ = | |
1142 | Vector.foreachi | |
1143 | (initStack, fn (slot, v) => | |
1144 | Array.update (stack1, slot, | |
1145 | Actions.add (Actions.empty, startPos, v))) | |
1146 | val res = loop (startPos, state, stack1, stack2, NONE) | |
1147 | in | |
1148 | res | |
1149 | end | |
1150 | ||
1151 | val match = | |
1152 | Trace.trace ("Regexp.DFA.match", | |
1153 | fn {string, startPos, ...} => | |
1154 | Layout.tuple [String.layout string, | |
1155 | Int.layout startPos], | |
1156 | Option.layout (Layout.tuple2 | |
1157 | (Int.layout, Actions.layout))) | |
1158 | match | |
1159 | ||
1160 | structure Graph = DirectedGraph | |
1161 | structure Env = Env (structure Domain = MonoVector (EdgeAction)) | |
1162 | fun layoutDot {dfa as T {anchorStart, charClass, dead, final, | |
1163 | next, start, ...}, | |
1164 | title: string, | |
1165 | showDead: bool}: Layout.t = | |
1166 | let | |
1167 | val numStates = numStates dfa | |
1168 | open Dot | |
1169 | val g = Graph.new () | |
1170 | val nodes = Vector.tabulate (numStates, fn _ => Graph.newNode g) | |
1171 | fun node i = Vector.sub (nodes, i) | |
1172 | val {get = nodeOptions, ...} = | |
1173 | Property.get (Graph.Node.plist, | |
1174 | Property.initFun | |
1175 | (fn _ => let open NodeOption | |
1176 | in ref [] | |
1177 | end)) | |
1178 | val {get = edgeOptions, ...} = | |
1179 | Property.get (Graph.Edge.plist, | |
1180 | Property.initFun | |
1181 | (fn _ => let open EdgeOption | |
1182 | in ref [] | |
1183 | end)) | |
1184 | fun addNodeOption (i, opts) = | |
1185 | let val r = nodeOptions (node i) | |
1186 | in r := opts @ !r | |
1187 | end | |
1188 | val _ = addNodeOption (start, [NodeOption.label "start"]) | |
1189 | val _ = | |
1190 | Int.for | |
1191 | (0, numStates, fn src => | |
1192 | let | |
1193 | val shape = | |
1194 | case (isSome (Array.sub (final, src)), | |
1195 | src = anchorStart) of | |
1196 | (false, false) => Ellipse | |
1197 | | (true, false) => Box | |
1198 | | (false, true) => Diamond | |
1199 | | (true, true) => Polygon {sides = 5, options = []} | |
1200 | val _ = | |
1201 | addNodeOption (src, let open NodeOption | |
1202 | in [Shape shape] | |
1203 | end) | |
1204 | val dsts = Array.new (numStates, Env.empty ()) | |
1205 | val _ = | |
1206 | Int.forDown | |
1207 | (0, numChars, fn c => | |
1208 | if Vector.sub (validChars, c) | |
1209 | then | |
1210 | let | |
1211 | val (dst, v) = | |
1212 | Array2.sub (next, src, | |
1213 | Array.sub (charClass, c)) | |
1214 | val e = Array.sub (dsts, dst) | |
1215 | val c = Char.fromInt c | |
1216 | val cs = | |
1217 | case Env.peek (e, v) of | |
1218 | NONE => [c] | |
1219 | | SOME cs => c :: cs | |
1220 | in Array.update | |
1221 | (dsts, dst, Env.extend (e, v, cs)) | |
1222 | end | |
1223 | else ()) | |
1224 | val src = node src | |
1225 | in | |
1226 | Array.foreachi | |
1227 | (dsts, fn (dst, e) => | |
1228 | if not showDead andalso Array.sub (dead, dst) | |
1229 | then () | |
1230 | else | |
1231 | Env.foreachi | |
1232 | (e, fn (v, cs) => | |
1233 | let | |
1234 | val edge = Graph.addEdge (g, {from = src, | |
1235 | to = node dst}) | |
1236 | val label = | |
1237 | concat [edgeLabel cs, | |
1238 | " -- ", | |
1239 | Layout.toString | |
1240 | (Vector.layout (Layout.str o | |
1241 | EdgeAction.toString) | |
1242 | v)] | |
1243 | in List.push (edgeOptions edge, | |
1244 | EdgeOption.label label) | |
1245 | end)) | |
1246 | end) | |
1247 | in | |
1248 | Graph.layoutDot (g, fn {nodeName} => | |
1249 | {title = title, | |
1250 | options = | |
1251 | let open GraphOption | |
1252 | in [ | |
1253 | RankDir LeftToRight, | |
1254 | Rank (Min, [{nodeName = nodeName (node start)}]) | |
1255 | ] | |
1256 | end, | |
1257 | edgeOptions = ! o edgeOptions, | |
1258 | nodeOptions = ! o nodeOptions}) | |
1259 | end | |
1260 | ||
1261 | fun minimize d = d | |
1262 | (* This DFA minimization algorithm is based on algorithm 3.6 (page 142) | |
1263 | * of the Dragon Book. | |
1264 | * | |
1265 | * It maintains an array, r, that stores for each state s the | |
1266 | * representative of the class to which s belongs. | |
1267 | * It repeatedly refines an equivalence relation, represented by a list | |
1268 | * of classes, where each class is a list of states. | |
1269 | *) | |
1270 | (* fun minimize (dfa as T {anchorStart, charClass, final, | |
1271 | * start, next, ...}): t = | |
1272 | * let | |
1273 | * val numStates = numStates dfa | |
1274 | * val numCharClasses = numCharClasses dfa | |
1275 | * type class = int list | |
1276 | * type classes = class list | |
1277 | * val repCounter = ref ~1 | |
1278 | * val change = ref false | |
1279 | * fun newRep () = (change := true; ++ repCounter) | |
1280 | * val finRep = newRep () | |
1281 | * val nonfinRep = newRep () | |
1282 | * val r = Array.tabulate (numStates, fn i => | |
1283 | * if Array.sub (final, i) | |
1284 | * then finRep | |
1285 | * else nonfinRep) | |
1286 | * fun rep s = Array.sub (r, s) | |
1287 | * fun trans (s, c) = rep (Array2.sub (next, s, c)) | |
1288 | * fun refine (class: class, ac: classes): classes = | |
1289 | * let | |
1290 | * val r = | |
1291 | * List.fold | |
1292 | * (class, [], fn (state, classes) => | |
1293 | * let | |
1294 | * fun loop (classes, ac) = | |
1295 | * case classes of | |
1296 | * [] => | |
1297 | * (case ac of | |
1298 | * [] => [{class = [state], | |
1299 | * old = state}] | |
1300 | * | _ => | |
1301 | * let | |
1302 | * val s = newRep () | |
1303 | * val _ = Array.update (r, state, s) | |
1304 | * in {class = [state], | |
1305 | * old = state} :: ac | |
1306 | * end) | |
1307 | * | (z as {class, old}) :: classes => | |
1308 | * if Int.forall | |
1309 | * (0, numCharClasses, fn c => | |
1310 | * trans (old, c) = trans (state, c)) | |
1311 | * then | |
1312 | * (Array.update (r, state, rep old) | |
1313 | * ; {class = state :: class, | |
1314 | * old = old} :: (List.appendRev | |
1315 | * (classes, ac))) | |
1316 | * else loop (classes, z :: ac) | |
1317 | * in loop (classes, []) | |
1318 | * end) | |
1319 | * in List.fold (r, ac, fn ({class, ...}, ac) => | |
1320 | * case class of | |
1321 | * [_] => ac | |
1322 | * | _ => class :: ac) | |
1323 | * end | |
1324 | * val refine = | |
1325 | * Trace.trace ("refine", | |
1326 | * (List.layout Int.layout o #1), | |
1327 | * Layout.ignore) | |
1328 | * refine | |
1329 | * fun refineAll (classes: classes): unit = | |
1330 | * case classes of | |
1331 | * [] => () | |
1332 | * | _ => | |
1333 | * let | |
1334 | * val _ = change := false | |
1335 | * val classes = | |
1336 | * List.fold (classes, [], fn (class, ac) => | |
1337 | * case class of | |
1338 | * [_] => ac | |
1339 | * | _ => refine (class, ac)) | |
1340 | * in if !change | |
1341 | * then refineAll classes | |
1342 | * else () | |
1343 | * end | |
1344 | * val (fin, nonfin) = | |
1345 | * Int.fold (0, numStates, ([], []), fn (i, (f, n)) => | |
1346 | * if Array.sub (final, i) | |
1347 | * then (i :: f, n) | |
1348 | * else (f, i :: n)) | |
1349 | * val _ = refineAll [fin, nonfin] | |
1350 | * val numStates' = 1 + !repCounter | |
1351 | * (* Compute reachable states. *) | |
1352 | * val reached = Array.new (numStates', false) | |
1353 | * fun visit (s: int (* an old state *)): unit = | |
1354 | * let | |
1355 | * val s' = rep s | |
1356 | * in | |
1357 | * if Array.sub (reached, s') | |
1358 | * then () | |
1359 | * else (Array.update (reached, s', true) | |
1360 | * ; Int.for (0, numCharClasses, fn c => | |
1361 | * visit (Array2.sub (next, s, c)))) | |
1362 | * end | |
1363 | * val _ = visit start | |
1364 | * val _ = visit anchorStart | |
1365 | * (* Compute new representatives. *) | |
1366 | * val c = ref ~1 | |
1367 | * val newR = Array.tabulate (numStates', fn s => | |
1368 | * if Array.sub (reached, s) | |
1369 | * then ++ c | |
1370 | * else ~1) | |
1371 | * val numStates' = 1 + !c | |
1372 | * val _ = Array.modify (r, fn s => Array.sub (newR, s)) | |
1373 | * val next' = Array2.new (numStates', numCharClasses, ~1) | |
1374 | * val _ = | |
1375 | * Array2.foreachi | |
1376 | * (next, fn (s, c, s') => | |
1377 | * Array2.update (next', rep s, c, rep s')) | |
1378 | * val final' = Array.array (numStates', false) | |
1379 | * val _ = | |
1380 | * Array.foreachi | |
1381 | * (final, fn (i, b) => | |
1382 | * if b then Array.update (final', rep i, true) else ()) | |
1383 | * in T {anchorStart = rep anchorStart, | |
1384 | * charClass = charClass, | |
1385 | * dead = dead (numStates', numCharClasses, final', next'), | |
1386 | * final = final', | |
1387 | * start = rep start, | |
1388 | * next = next'} | |
1389 | * end | |
1390 | *) | |
1391 | end | |
1392 | in | |
1393 | structure Regexp: REGEXP = | |
1394 | struct | |
1395 | structure Save = Save | |
1396 | structure Match = Match | |
1397 | ||
1398 | open Regexp | |
1399 | ||
1400 | val anchorFinish = AnchorFinish | |
1401 | val anchorStart = AnchorStart | |
1402 | val isChar = CharSet | |
1403 | fun isNotChar f = isChar (not o f) | |
1404 | fun char c = isChar (fn c' => c = c') | |
1405 | fun notChar c = isChar (fn c' => c <> c') | |
1406 | val or = Or | |
1407 | val save = Save | |
1408 | val seq = Seq | |
1409 | val star = Star | |
1410 | val zeroOrMore = star | |
1411 | ||
1412 | val dquote = char #"\"" | |
1413 | ||
1414 | val any = isChar (fn _ => true) | |
1415 | val anys = star any | |
1416 | val ascii = isChar (fn c => ord c <= 127) | |
1417 | val asciis = star ascii | |
1418 | ||
1419 | val none = isChar (fn _ => false) | |
1420 | fun oneOf s = isChar (fn c => String.contains (s, c)) | |
1421 | fun notOneOf s = isNotChar (fn c => String.contains (s, c)) | |
1422 | val digit = isChar Char.isDigit | |
1423 | val digits = star digit | |
1424 | val nonDigit = isNotChar Char.isDigit | |
1425 | val space = isChar Char.isSpace | |
1426 | val spaces = star space | |
1427 | ||
1428 | fun string (s: string): t = | |
1429 | seq (Int.foldDown (0, String.size s, [], fn (i, ac) => | |
1430 | char (String.sub (s, i)) :: ac)) | |
1431 | ||
1432 | fun stringIgnoreCase (s: string): t = | |
1433 | seq (Int.foldDown | |
1434 | (0, String.size s, [], fn (i, ac) => | |
1435 | let | |
1436 | val c = Char.toLower (String.sub (s, i)) | |
1437 | in | |
1438 | isChar (fn c' => c = Char.toLower c') | |
1439 | end :: ac)) | |
1440 | ||
1441 | val null = seq [] (* Language containing the empty string only. *) | |
1442 | fun oneOrMore r = seq [r, star r] | |
1443 | fun optional r = or [null, r] | |
1444 | fun repeat (r, n: int) = seq (List.tabulate (n, fn _ => r)) | |
1445 | fun lower (r, n: int) = seq [repeat (r, n), star r] | |
1446 | fun upper (r, n: int) = | |
1447 | if n <= 0 | |
1448 | then null | |
1449 | else or [null, seq [r, upper (r, n - 1)]] | |
1450 | fun range (r, n: int, m: int) = | |
1451 | seq [repeat (r, n), upper (r, m - n)] | |
1452 | ||
1453 | structure Compiled = | |
1454 | struct | |
1455 | datatype machine = | |
1456 | DFA of DFA.t | |
1457 | | NFA of NFA.t | |
1458 | ||
1459 | datatype t = T of {regexp: Regexp.t, | |
1460 | machine: machine} | |
1461 | ||
1462 | fun layoutDot (T {machine, ...}) = | |
1463 | case machine of | |
1464 | DFA m => DFA.layoutDot {dfa = m, showDead = false, | |
1465 | title = "dfa"} | |
1466 | | NFA m => NFA.layoutDot (m, "nfa") | |
1467 | ||
1468 | fun layoutDotToFile (c: t, f: File.t) = | |
1469 | File.withOut (f, fn out => Layout.output (layoutDot c, out)) | |
1470 | ||
1471 | fun layout (T {regexp, ...}) = | |
1472 | let | |
1473 | open Layout | |
1474 | in | |
1475 | Regexp.layout regexp | |
1476 | (* | |
1477 | align [case machine of | |
1478 | DFA dfa => DFA.layout dfa | |
1479 | | NFA nfa => NFA.layout nfa | |
1480 | (* str "implementing", Regexp.layout regexp *) | |
1481 | ] | |
1482 | *) | |
1483 | end | |
1484 | ||
1485 | fun match {compiled = T {machine, ...}, | |
1486 | short, startPos, string} = | |
1487 | let | |
1488 | val anchorStart = startPos = 0 | |
1489 | val (saves, opt) = | |
1490 | case machine of | |
1491 | DFA dfa => | |
1492 | (DFA.saves dfa, | |
1493 | DFA.match {anchorStart = anchorStart, | |
1494 | dfa = dfa, | |
1495 | short = short, | |
1496 | string = string, | |
1497 | startPos = startPos}) | |
1498 | ||
1499 | | NFA nfa => | |
1500 | (NFA.saves nfa, | |
1501 | NFA.match {nfa = nfa, | |
1502 | short = short, | |
1503 | string = string, | |
1504 | startPos = startPos}) | |
1505 | exception No | |
1506 | in | |
1507 | Option.map | |
1508 | (opt, fn (stop, Actions.T actions) => | |
1509 | let | |
1510 | val _ = Vector.foreachi (saves, fn (i, s) => | |
1511 | Save.assign (s, i)) | |
1512 | val n = Vector.length saves | |
1513 | val starts = Array.array (n, ~1) | |
1514 | val matches = Array.array (n, NONE) | |
1515 | val _ = | |
1516 | List.foreach | |
1517 | (rev actions, fn (i, v) => | |
1518 | Vector.foreach | |
1519 | (v, fn ma => | |
1520 | case ma of | |
1521 | MatchAction.Finish s => | |
1522 | let | |
1523 | val index = Save.index s | |
1524 | val start = Array.sub (starts, index) | |
1525 | in | |
1526 | Array.update | |
1527 | (matches, index, | |
1528 | SOME (Substring.substring | |
1529 | (string, {start = start, | |
1530 | length = i - start}))) | |
1531 | end | |
1532 | | MatchAction.Start s => | |
1533 | Array.update (starts, Save.index s, i))) | |
1534 | val matches = | |
1535 | Array.keepAllMapi | |
1536 | (matches, fn (i, sso) => | |
1537 | case sso of | |
1538 | NONE => NONE | |
1539 | | SOME ss => SOME (Vector.sub (saves, i), ss)) | |
1540 | val all = | |
1541 | Substring.substring | |
1542 | (string, {start = startPos, | |
1543 | length = stop - startPos}) | |
1544 | in | |
1545 | Match.T {all = all, | |
1546 | matches = matches} | |
1547 | end) handle No => NONE | |
1548 | end | |
1549 | ||
1550 | val match = | |
1551 | Trace.trace | |
1552 | ("Regexp.Compiled.match", | |
1553 | fn {compiled, short, startPos, string} => | |
1554 | Layout.record | |
1555 | [("short", Bool.layout short), | |
1556 | ("startPos", Int.layout startPos), | |
1557 | ("string", String.layout string), | |
1558 | ("compiled", layout compiled)], | |
1559 | Option.layout Match.layout) | |
1560 | match | |
1561 | ||
1562 | fun matchLong (c, s, i) = | |
1563 | match {compiled = c, | |
1564 | short = false, | |
1565 | startPos = i, | |
1566 | string = s} | |
1567 | ||
1568 | fun matchShort (c, s, i) = | |
1569 | match {compiled = c, | |
1570 | short = true, | |
1571 | startPos = i, | |
1572 | string = s} | |
1573 | ||
1574 | fun matchAll (r, s) = | |
1575 | case matchLong (r, s, 0) of | |
1576 | NONE => NONE | |
1577 | | SOME m => if String.size s = Substring.length (Match.all m) | |
1578 | then SOME m | |
1579 | else NONE | |
1580 | ||
1581 | val matchesAll = isSome o matchAll | |
1582 | ||
1583 | fun matchPrefix (r, s) = matchShort (r, s, 0) | |
1584 | ||
1585 | val matchesPrefix = isSome o matchPrefix | |
1586 | ||
1587 | fun find (c: t, s: string, startPos, short: bool) = | |
1588 | let | |
1589 | val n = String.size s | |
1590 | fun loop (i: int) = | |
1591 | if i >= n | |
1592 | then NONE | |
1593 | else | |
1594 | case match {compiled = c, | |
1595 | short = short, | |
1596 | startPos = i, | |
1597 | string = s} of | |
1598 | NONE => loop (i + 1) | |
1599 | | SOME m => SOME m | |
1600 | in loop startPos | |
1601 | end | |
1602 | ||
1603 | fun findLong (c, s, i) = find (c, s, i, false) | |
1604 | fun findShort (c, s, i) = find (c, s, i, true) | |
1605 | ||
1606 | fun foreachMatchShort (c, s, f: Match.t -> unit) = | |
1607 | let | |
1608 | fun loop i = | |
1609 | case findShort (c, s, i) of | |
1610 | NONE => () | |
1611 | | SOME m => (f m; loop (Match.endOf m)) | |
1612 | in | |
1613 | loop 0 | |
1614 | end | |
1615 | end | |
1616 | ||
1617 | fun compileDFA r = | |
1618 | let | |
1619 | val nfa = NFA.fromRegexp r | |
1620 | in | |
1621 | Compiled.T | |
1622 | {regexp = r, | |
1623 | machine = Compiled.DFA (DFA.minimize (DFA.fromNFA nfa))} | |
1624 | end | |
1625 | ||
1626 | val compileDFA = | |
1627 | Trace.trace ("Regexp.compileDFA", layout, Compiled.layout) compileDFA | |
1628 | ||
1629 | fun compileNFA r = | |
1630 | Compiled.T | |
1631 | {regexp = r, | |
1632 | machine = Compiled.NFA (NFA.fromRegexp r)} | |
1633 | ||
1634 | val compileNFA = | |
1635 | Trace.trace ("Regexp.compileNFA", layout, Compiled.layout) compileNFA | |
1636 | ||
1637 | (* POSIX 1003.2 regular expressions | |
1638 | * caveats: does not support back references '\N' | |
1639 | * does not support unmatched ')' | |
1640 | * does not support '[=' style coallating elements | |
1641 | * does not support coallating elements as range endpoints | |
1642 | * | |
1643 | * grammar: | |
1644 | * S ::= Re | |
1645 | * Re ::= Br Re0 | |
1646 | * Re0 ::= e | '|' Br Re0 | |
1647 | * Br ::= P Br0 | |
1648 | * Br0 ::= e | P Br0 | |
1649 | * P ::= A P0 | |
1650 | * P0 ::= e | '*' | '+' | '?' | Bnd | |
1651 | * Bnd ::= '{' N Bnd0 '}' | |
1652 | * Bnd0 ::= e | ',' Bnd1 | |
1653 | * Bnd1 ::= e | N | |
1654 | * A ::= '(' Re ')' | |
1655 | * | '()' | |
1656 | * | '[' Be ']' | |
1657 | * | '.' | |
1658 | * | '^' | |
1659 | * | '$' | |
1660 | * | '\' C | |
1661 | * | C | |
1662 | * Be ::= Be0 | |
1663 | * Be0 ::= '^' Be1 | Be1 | |
1664 | * Be1 ::= ']' Be2 | Be2 | |
1665 | * Be2 ::= '-' Be3 | Be3 | |
1666 | * Be3 ::= e | |
1667 | * | '-' | |
1668 | * | C '-' C Be3 | |
1669 | * | '[' '.' Ce '.' ']' Be3 | |
1670 | * | '[' ':' Cl ':' ']' Be3 | |
1671 | * | C Be3 | |
1672 | * Ce ::= e | C Ce | |
1673 | * Cl ::= 'alnum' | ... | 'xdigit' | |
1674 | *) | |
1675 | local | |
1676 | exception X of string | |
1677 | type res = t * Save.t vector | |
1678 | ||
1679 | fun S (s: char list) : res = | |
1680 | Re (s, fn (s, re, saves) => | |
1681 | case s of | |
1682 | [] => (re, saves) | |
1683 | | _ => raise (X "S")) | |
1684 | and Re (s: char list, | |
1685 | k: char list * t * Save.t vector -> res) = | |
1686 | Br (s, fn (s, re, saves) => | |
1687 | Re0 (s, [re], [saves], k)) | |
1688 | and Re0 (s: char list, res: t list, savess: Save.t vector list, | |
1689 | k: char list * t * Save.t vector -> res) = | |
1690 | let | |
1691 | fun finish s = | |
1692 | k (s, or (List.rev res), Vector.concat (List.rev savess)) | |
1693 | in | |
1694 | case s of | |
1695 | [] => finish s | |
1696 | | #")"::_ => finish s | |
1697 | | #"|"::s => Br (s, fn (s, re, saves) => | |
1698 | Re0 (s, re::res, saves::savess, k)) | |
1699 | | _ => raise (X "Re0") | |
1700 | end | |
1701 | and Br (s: char list, | |
1702 | k: char list * t * Save.t vector -> res) = | |
1703 | P (s, fn (s, re, saves) => | |
1704 | Br0 (s, [re], [saves], k)) | |
1705 | and Br0 (s: char list, res: t list, savess: Save.t vector list, | |
1706 | k: char list * t * Save.t vector -> res) = | |
1707 | let | |
1708 | fun finish s = | |
1709 | k (s, seq (List.rev res), Vector.concat (List.rev savess)) | |
1710 | in | |
1711 | case s of | |
1712 | [] => finish s | |
1713 | | #")"::_ => finish s | |
1714 | | #"|"::_ => finish s | |
1715 | | _ => P (s, fn (s, re, saves) => | |
1716 | Br0 (s, re::res, saves::savess, k)) | |
1717 | end | |
1718 | and P (s: char list, | |
1719 | k: char list * t * Save.t vector -> res) = | |
1720 | A (s, fn (s, re, saves) => P0 (s, re, saves, [], [], k)) | |
1721 | and P0 (s: char list, | |
1722 | re: t, saves: Save.t vector, | |
1723 | res: t list, savess: Save.t vector list, | |
1724 | k: char list * t * Save.t vector -> res) = | |
1725 | let | |
1726 | fun finish (s, re) = | |
1727 | k (s, seq (List.rev (re::res)), | |
1728 | Vector.concat (List.rev (saves::savess))) | |
1729 | fun default () = | |
1730 | let | |
1731 | val res = re::res | |
1732 | val savess = saves::savess | |
1733 | in | |
1734 | A (s, fn (s, re, saves) => | |
1735 | P0 (s, re, saves, res, savess, k)) | |
1736 | end | |
1737 | in | |
1738 | case s of | |
1739 | [] => finish (s, re) | |
1740 | | #")"::_ => finish (s, re) | |
1741 | | #"|"::_ => finish (s, re) | |
1742 | | #"*"::s => finish (s, star re) | |
1743 | | #"+"::s => finish (s, oneOrMore re) | |
1744 | | #"?"::s => finish (s, optional re) | |
1745 | | #"{"::(c::s) => if Char.isDigit c | |
1746 | then Bnd (c::s, fn (s, f) => | |
1747 | finish (s, f re)) | |
1748 | else default () | |
1749 | | _ => default () | |
1750 | end | |
1751 | and Bnd (s: char list, | |
1752 | k: char list * (t -> t) -> res) = | |
1753 | N (s, fn (s, n) => | |
1754 | Bnd0 (s, n, fn (s, f) => | |
1755 | case s of | |
1756 | #"}"::s => k (s, f) | |
1757 | | _ => raise (X "Bnd"))) | |
1758 | and Bnd0 (s: char list, n: int, | |
1759 | k: char list * (t -> t) -> res) = | |
1760 | let | |
1761 | fun finish (s, f) = k (s, f) | |
1762 | in | |
1763 | case s of | |
1764 | #"}"::_ => finish (s, fn re => repeat (re, n)) | |
1765 | | #","::s => Bnd1 (s, n, k) | |
1766 | | _ => raise (X "Bnd0") | |
1767 | end | |
1768 | and Bnd1 (s: char list, n: int, | |
1769 | k: char list * (t -> t) -> res) = | |
1770 | let | |
1771 | fun finish (s, f) = k (s, f) | |
1772 | in | |
1773 | case s of | |
1774 | #"}"::_ => finish (s, fn re => lower (re, n)) | |
1775 | | _ => N (s, fn (s, m) => | |
1776 | if m < n | |
1777 | then raise (X "Bnd1") | |
1778 | else finish (s, fn re => range (re, n, m))) | |
1779 | end | |
1780 | and N (s: char list, | |
1781 | k: char list * int -> res) = | |
1782 | let | |
1783 | in | |
1784 | case s of | |
1785 | d::s' => (case Char.digitToInt d of | |
1786 | SOME d => N1 (s', d, k) | |
1787 | | NONE => raise (X "N")) | |
1788 | | _ => raise (X "N") | |
1789 | end | |
1790 | and N1 (s: char list, n: int, | |
1791 | k: char list * int -> res) = | |
1792 | let | |
1793 | fun finish s = | |
1794 | k (s, n) | |
1795 | in | |
1796 | case s of | |
1797 | [] => finish s | |
1798 | | d::s' => (case Char.digitToInt d of | |
1799 | SOME d => N1 (s', n * 10 + d, k) | |
1800 | | NONE => finish s) | |
1801 | end | |
1802 | and A (s: char list, | |
1803 | k: char list * t * Save.t vector -> res) = | |
1804 | let | |
1805 | fun finish (s, re, saves) = | |
1806 | k (s, re, saves) | |
1807 | fun finishR (s, re) = | |
1808 | finish (s, re, Vector.new0 ()) | |
1809 | fun finishN s = | |
1810 | finishR (s, null) | |
1811 | fun finishC (s, c) = | |
1812 | finishR (s, char c) | |
1813 | in | |
1814 | case s of | |
1815 | #"("::(#")"::s) => finishN s | |
1816 | | #"("::s => let | |
1817 | val save' = Save.new () | |
1818 | in | |
1819 | Re (s, fn (s, re, saves) => | |
1820 | case s of | |
1821 | #")"::s => k (s, save (re, save'), | |
1822 | Vector.concat | |
1823 | [Vector.new1 save', saves]) | |
1824 | | _ => raise (X "A")) | |
1825 | end | |
1826 | | #"["::s => let | |
1827 | in | |
1828 | Be (s, fn (s, re) => | |
1829 | case s of | |
1830 | #"]"::s => finishR (s, re) | |
1831 | | _ => raise (X "A")) | |
1832 | end | |
1833 | | #"."::s => finishR (s, any) | |
1834 | | #"^"::s => finishR (s, anchorStart) | |
1835 | | #"$"::s => finishR (s, anchorFinish) | |
1836 | | #"\\"::(c::s) => finishC (s, c) | |
1837 | | c::s => if String.contains (")|*+?{", c) | |
1838 | then raise (X "A") | |
1839 | else finishC (s, c) | |
1840 | | _ => raise (X "A") | |
1841 | end | |
1842 | and Be (s: char list, | |
1843 | k: char list * t -> res) = | |
1844 | Be0 (s, k) | |
1845 | and Be0 (s: char list, | |
1846 | k: char list * t -> res) = | |
1847 | let | |
1848 | in | |
1849 | case s of | |
1850 | #"^"::s => Be1 (s, true, k) | |
1851 | | _ => Be1 (s, false, k) | |
1852 | end | |
1853 | and Be1 (s: char list, inv: bool, | |
1854 | k: char list * t -> res) = | |
1855 | let | |
1856 | in | |
1857 | case s of | |
1858 | #"]"::s => Be2 (s, inv, [#"]"], k) | |
1859 | | _ => Be2 (s, inv, [], k) | |
1860 | end | |
1861 | and Be2 (s: char list, inv: bool, cs: char list, | |
1862 | k: char list * t -> res) = | |
1863 | let | |
1864 | in | |
1865 | case s of | |
1866 | #"-"::s => Be3 (s, inv, #"-"::cs, [], [], k) | |
1867 | | _ => Be3 (s, inv, cs, [], [], k) | |
1868 | end | |
1869 | and Be3 (s: char list, inv: bool, | |
1870 | cs: char list, cps: (char -> bool) list, ces: string list, | |
1871 | k: char list * t -> res) = | |
1872 | let | |
1873 | fun finish (s: char list, | |
1874 | cs: char list, | |
1875 | cps: (char -> bool) list, | |
1876 | ces: string list) = | |
1877 | let | |
1878 | fun finish' re = k (s, re) | |
1879 | val s = implode cs | |
1880 | val cp = fn c => List.exists (cps, fn cp => cp c) | |
1881 | in | |
1882 | if inv | |
1883 | then | |
1884 | (case ces of | |
1885 | [] => | |
1886 | finish' | |
1887 | (isNotChar | |
1888 | (fn c => | |
1889 | cp c orelse String.contains (s, c))) | |
1890 | | _ => Error.bug "Regexp.fromString: can't handle collating elements in negated bracket expressions") | |
1891 | else finish' (List.fold | |
1892 | (ces, or [isChar cp, | |
1893 | oneOf s], | |
1894 | fn (ce, re) => | |
1895 | or [string ce, re])) | |
1896 | end | |
1897 | in | |
1898 | case s of | |
1899 | #"]"::_ => finish (s, cs, cps, ces) | |
1900 | | #"-"::s => (case s of | |
1901 | #"]"::_ => finish (s, #"-"::cs, cps, ces) | |
1902 | | _ => raise (X "Be3")) | |
1903 | | c1::(#"-"::(c2::s)) => | |
1904 | let | |
1905 | val r1 = Char.ord c1 | |
1906 | val r2 = Char.ord c2 | |
1907 | val cp = fn c => | |
1908 | let val r = Char.ord c | |
1909 | in r1 <= r andalso r <= r2 | |
1910 | end | |
1911 | in | |
1912 | Be3 (s, inv, cs, cp::cps, ces, k) | |
1913 | end | |
1914 | | #"["::(#"."::s) => | |
1915 | Ce (s, [], fn (s, ce) => | |
1916 | case s of | |
1917 | #"."::(#"]"::s) => Be3 (s, inv, cs, cps, ce::ces, k) | |
1918 | | _ => raise (X "Be3")) | |
1919 | | #"["::(#":"::s) => | |
1920 | Cl (s, fn (s, cp) => | |
1921 | case s of | |
1922 | #":"::(#"]"::s) => Be3 (s, inv, cs, cp::cps, ces, k) | |
1923 | | _ => raise (X "Be3")) | |
1924 | | c::s => Be3 (s, inv, c::cs, cps, ces, k) | |
1925 | | _ => raise (X "Be3") | |
1926 | end | |
1927 | and Ce (s: char list, ce: char list, | |
1928 | k: char list * string -> res) = | |
1929 | let | |
1930 | fun finish s = | |
1931 | k (s, implode (List.rev ce)) | |
1932 | in | |
1933 | case s of | |
1934 | #"."::_ => finish s | |
1935 | | c::s => Ce (s, c::ce, k) | |
1936 | | _ => raise (X "Ce") | |
1937 | end | |
1938 | and Cl (s: char list, | |
1939 | k: char list * (char -> bool) -> res) = | |
1940 | let | |
1941 | in | |
1942 | case s of | |
1943 | #"a"::(#"l"::(#"n"::(#"u"::(#"m"::s)))) => | |
1944 | k (s, Char.isAlphaNum) | |
1945 | | #"a"::(#"l"::(#"p"::(#"h"::(#"a"::s)))) => | |
1946 | k (s, Char.isAlpha) | |
1947 | | #"b"::(#"l"::(#"a"::(#"n"::(#"k"::_)))) => | |
1948 | raise (X "Cl:blank") | |
1949 | | #"c"::(#"n"::(#"t"::(#"r"::(#"l"::s)))) => | |
1950 | k (s, Char.isCntrl) | |
1951 | | #"d"::(#"i"::(#"g"::(#"i"::(#"t"::s)))) => | |
1952 | k (s, Char.isDigit) | |
1953 | | #"g"::(#"r"::(#"a"::(#"p"::(#"h"::s)))) => | |
1954 | k (s, Char.isGraph) | |
1955 | | #"l"::(#"o"::(#"w"::(#"e"::(#"r"::s)))) => | |
1956 | k (s, Char.isLower) | |
1957 | | #"p"::(#"r"::(#"i"::(#"n"::(#"t"::s)))) => | |
1958 | k (s, Char.isPrint) | |
1959 | | #"p"::(#"u"::(#"n"::(#"c"::(#"t"::_)))) => | |
1960 | raise (X "Cl:punct") | |
1961 | | #"s"::(#"p"::(#"a"::(#"c"::(#"e"::s)))) => | |
1962 | k (s, Char.isSpace) | |
1963 | | #"u"::(#"p"::(#"p"::(#"e"::(#"r"::s)))) => | |
1964 | k (s, Char.isUpper) | |
1965 | | #"x"::(#"d"::(#"i"::(#"g"::(#"i"::(#"t"::s))))) => | |
1966 | k (s, Char.isHexDigit) | |
1967 | | _ => raise (X "Cl") | |
1968 | end | |
1969 | in | |
1970 | val fromString: string -> (t * Save.t vector) option = | |
1971 | fn s => (SOME (S (explode s))) handle X _ => NONE | |
1972 | val fromString = | |
1973 | Trace.trace ("Regexp.fromString", | |
1974 | String.layout, | |
1975 | Option.layout (layout o #1)) | |
1976 | fromString | |
1977 | end | |
1978 | end | |
1979 | ||
1980 | (* local | |
1981 | * val _ = | |
1982 | * let open Trace.Immediate | |
1983 | * in | |
1984 | * flagged() | |
1985 | * ; debug := Out Out.error | |
1986 | * ; on [] | |
1987 | * end | |
1988 | * open Regexp | |
1989 | * val a = char #"a" | |
1990 | * val b = char #"b" | |
1991 | * val c = char #"c" | |
1992 | * val d = char #"d" | |
1993 | * val r = a | |
1994 | * val r = star a | |
1995 | * val r = or [] | |
1996 | * val r = star any | |
1997 | * val r = seq [] | |
1998 | * val r = or [a, b] | |
1999 | * val r = seq [a, b, c, d] | |
2000 | * val r = or [seq [a, b, c], | |
2001 | * seq [a, b, d]] | |
2002 | * val r = | |
2003 | * seq [star (or [a, b]), | |
2004 | * a, b, b] | |
2005 | * val d = digit | |
2006 | * val eol = char #"#" | |
2007 | * val space = oneOf " \t" | |
2008 | * val r = | |
2009 | * seq [or [anchorStart, notOneOf "0123456789("], | |
2010 | * or [seq [char #"(", d, d, d, char #")"], | |
2011 | * seq [d, d, d]], | |
2012 | * char #" ", | |
2013 | * d, d, d, | |
2014 | * oneOf " -", | |
2015 | * d, d, d, d, | |
2016 | * or [eol, nonDigit]] | |
2017 | * | |
2018 | * fun doit (name, lay) = | |
2019 | * let | |
2020 | * val dot = concat ["/tmp/", name, ".dot"] | |
2021 | * val ps = concat ["/tmp/", name, ".ps"] | |
2022 | * val _ = File.withOut (dot, fn out => Layout.output (lay, out)) | |
2023 | * val _ = OS.Process.system (concat ["dot ", dot, " >", ps]) | |
2024 | * in () | |
2025 | * end | |
2026 | * val nfa = NFA.fromRegexp r | |
2027 | * val _ = doit ("nfa", NFA.layoutDot (nfa, "nfa")) | |
2028 | * val _ = Out.output (Out.error, | |
2029 | * concat ["numCharClasses = ", | |
2030 | * Int.toString (NFA.numCharClasses nfa), | |
2031 | * "\n"]) | |
2032 | * val dfa = DFA.fromNFA nfa | |
2033 | * val _ = doit ("dfa", | |
2034 | * DFA.layoutDot {dfa = dfa, title = "dfa", showDead = false}) | |
2035 | * val min = DFA.minimize dfa | |
2036 | * val _ = doit ("min", | |
2037 | * DFA.layoutDot {dfa = min, title = "min", showDead = false}) | |
2038 | * in | |
2039 | * end | |
2040 | *) | |
2041 | end | |
2042 | ||
2043 | (* local | |
2044 | * val _ = | |
2045 | * let | |
2046 | * open Trace.Immediate | |
2047 | * in | |
2048 | * debug := Out Out.error | |
2049 | * ; flagged() | |
2050 | * ; on ["Regexp.match"] | |
2051 | * end | |
2052 | * structure Z = TestRegexp (Regexp) | |
2053 | * in | |
2054 | * end | |
2055 | *) |