Commit | Line | Data |
---|---|---|
7f918cf1 CE |
1 | (* Copyright (C) 2009 Matthew Fluet. |
2 | * Copyright (C) 1999-2006 Henry Cejtin, Matthew Fluet, Suresh | |
3 | * Jagannathan, and Stephen Weeks. | |
4 | * | |
5 | * MLton is released under a BSD-style license. | |
6 | * See the file MLton-LICENSE for details. | |
7 | *) | |
8 | ||
9 | signature REGEXP_STRUCTS = | |
10 | sig | |
11 | end | |
12 | ||
13 | signature REGEXP = | |
14 | sig | |
15 | include REGEXP_STRUCTS | |
16 | ||
17 | structure Save: | |
18 | sig | |
19 | type t | |
20 | ||
21 | val new: unit -> t | |
22 | end | |
23 | ||
24 | structure Match: | |
25 | sig | |
26 | type t | |
27 | ||
28 | val all: t -> Substring.t | |
29 | val startLength: t -> {start: int, length: int} | |
30 | val exists: t * Save.t -> bool | |
31 | val funs: t -> {exists: Save.t -> bool, | |
32 | lookup: Save.t -> Substring.t, | |
33 | peek: Save.t -> Substring.t option} | |
34 | val length: t -> int | |
35 | val lookup: t * Save.t -> Substring.t | |
36 | val lookupString: t * Save.t -> String.t | |
37 | val peek: t * Save.t -> Substring.t option | |
38 | val peekString: t * Save.t -> String.t option | |
39 | val stringFuns: t -> {exists: Save.t -> bool, | |
40 | lookup: Save.t -> String.t, | |
41 | peek: Save.t -> String.t option} | |
42 | end | |
43 | ||
44 | structure Compiled: | |
45 | sig | |
46 | type t | |
47 | ||
48 | (* Find the first substring of s starting at or after index i that | |
49 | * matches r. Return the first character and the character just | |
50 | * past the end of the substring. | |
51 | *) | |
52 | val findShort: t * string * int -> Match.t option | |
53 | val findLong: t * string * int -> Match.t option | |
54 | val foreachMatchShort: t * string * (Match.t -> unit) -> unit | |
55 | val layout: t -> Layout.t | |
56 | val layoutDot: t -> Layout.t | |
57 | val layoutDotToFile: t * File.t -> unit | |
58 | (* match (r, s, i) | |
59 | * Return the (shortest or longest) substring of s starting at index | |
60 | * i that matches r. | |
61 | * The substring is represented by the index of the character just | |
62 | * past its end. | |
63 | * Return NONE if there is NO match. | |
64 | * All of the saves in the match will be set. | |
65 | *) | |
66 | val matchAll: t * string -> Match.t option | |
67 | val matchLong: t * string * int -> Match.t option | |
68 | val matchShort: t * string * int -> Match.t option | |
69 | val matchesAll: t * string -> bool | |
70 | val matchesPrefix: t * string -> bool | |
71 | end | |
72 | ||
73 | type t | |
74 | ||
75 | val anchorFinish: t | |
76 | val anchorStart: t | |
77 | val any: t (* arbitrary character *) | |
78 | val anys: t (* arbitrary number of characters *) | |
79 | val ascii: t (* arbitrary ascii character *) | |
80 | val asciis: t (* arbitrary ascii characters *) | |
81 | val char: char -> t | |
82 | val compileDFA: t -> Compiled.t | |
83 | val compileNFA: t -> Compiled.t | |
84 | val digit: t | |
85 | val digits: t | |
86 | val dquote: t (* char #"\"" *) | |
87 | val fromString: string -> (t * Save.t vector) option | |
88 | val isChar: (char -> bool) -> t | |
89 | val isNotChar: (char -> bool) -> t | |
90 | val layout: t -> Layout.t | |
91 | val nonDigit: t | |
92 | val none: t | |
93 | val notChar: char -> t | |
94 | val notOneOf: string -> t | |
95 | val null: t (* empty string *) | |
96 | val oneOf: string -> t | |
97 | val oneOrMore: t -> t | |
98 | val optional: t -> t | |
99 | val or: t list -> t | |
100 | val save: t * Save.t -> t | |
101 | val seq: t list -> t | |
102 | val spaces: t (* star (isChar Char.isSpace) *) | |
103 | val star: t -> t | |
104 | val string: string -> t (* case matters *) | |
105 | val stringIgnoreCase: string -> t (* case doesn't matter *) | |
106 | val toString: t -> string | |
107 | val zeroOrMore: t -> t (* same as star *) | |
108 | end | |
109 | ||
110 | ||
111 | functor TestRegexp (S: REGEXP): sig end = | |
112 | struct | |
113 | ||
114 | val _ = print "TestRegexp\n" | |
115 | ||
116 | open S | |
117 | open Compiled | |
118 | val compile = if true then compileNFA else compileDFA | |
119 | ||
120 | val _ = | |
121 | Assert.assert | |
122 | ("TestRegexp.save", fn () => | |
123 | let | |
124 | val s = Save.new () | |
125 | in | |
126 | List.forall | |
127 | ([(save (seq [], s), "", ""), | |
128 | (save (star (oneOf "a"), s), "", ""), | |
129 | (seq [save (seq [], s), seq []], "", ""), | |
130 | (seq [oneOf "a", save (seq [], s)], "a", "")], | |
131 | fn (r, s1, s2) => | |
132 | let | |
133 | val c = compile r | |
134 | in | |
135 | case matchAll (c, s1) of | |
136 | NONE => false | |
137 | | SOME m => Match.lookupString (m, s) = s2 | |
138 | end) | |
139 | end) | |
140 | ||
141 | val _ = | |
142 | Assert.assert | |
143 | ("TestRegexp.doesMatchAll", fn () => | |
144 | List.forall ([(any, "a"), | |
145 | (anys, "abc")], | |
146 | fn (r, s) => matchesAll (compile r, s))) | |
147 | val tests = | |
148 | List.map ([ | |
149 | ("\\a", "a"), | |
150 | ("^$", ""), | |
151 | ("abc", "abc"), | |
152 | (".", "a"), | |
153 | ("^foo$", "foo"), | |
154 | ("^...$", "foo"), | |
155 | ("^.*$", "foo"), | |
156 | ("^.*foo@bar\\.com$", "foo@bar.com"), | |
157 | ("(abc)","abc"), | |
158 | ("\\(abc\\)","(abc)"), | |
159 | ("(abc){2,4}$", "abcabc"), | |
160 | ("(abc){2,4}$", "abcabcabc"), | |
161 | ("(abc){2,4}$", "abcabcabcabc") | |
162 | ], | |
163 | fn (r, s) => | |
164 | let | |
165 | val opt = SOME (String.size s) | |
166 | in | |
167 | (#1 (valOf (fromString r)), s, opt, opt) | |
168 | end) | |
169 | @ | |
170 | [ | |
171 | (#1 (valOf (fromString "a")), "a", SOME 1, SOME 1), | |
172 | (#1 (valOf (fromString "a*")), "a", SOME 0, SOME 1), | |
173 | (#1 (valOf (fromString "a+")), "a", SOME 1, SOME 1), | |
174 | (#1 (valOf (fromString "a+")), "aa", SOME 1, SOME 2), | |
175 | (#1 (valOf (fromString "[^a]")), "a", NONE, NONE), | |
176 | (#1 (valOf (fromString "[^a]")), "b", SOME 1, SOME 1), | |
177 | (stringIgnoreCase "abc", "abc", SOME (3: int), SOME (3: int)), | |
178 | (stringIgnoreCase "abc", "aBC", SOME 3, SOME 3), | |
179 | (stringIgnoreCase "ab", "abab", SOME 2, SOME 2), | |
180 | (string "abc", "abc", SOME 3, SOME 3), | |
181 | (string "Abc", "abc", NONE, NONE), | |
182 | (seq [anchorStart, anchorFinish], "", SOME 0, SOME 0), | |
183 | (seq [anchorStart, string "abc", anchorFinish], "abc", SOME 3, SOME 3), | |
184 | (seq [or [null, anchorFinish], string "a"], "a", SOME 1, SOME 1), | |
185 | (seq [or [anchorFinish, null], string "a"], "a", SOME 1, SOME 1), | |
186 | (seq [], "abc", SOME 0, SOME 0), | |
187 | (seq [string "ab"], "ab", SOME 2, SOME 2), | |
188 | (seq [char #"a", char #"b", char #"c"], "abc", SOME 3, SOME 3), | |
189 | (seq [string "ab", null], "abc", SOME 2, SOME 2), | |
190 | (or [string "a", string "ab", string "abc"], "abc", SOME 1, SOME 3), | |
191 | (seq [or [string "ab", null], | |
192 | or [string "abcde", string "cd"]], "abcde", | |
193 | SOME 4, SOME 5), | |
194 | (star (or [null, char #"a"]), "aaa", SOME 0, SOME 3), | |
195 | (star (string "ab"), "ababab", SOME 0, SOME 6), | |
196 | let val r = Save.new () | |
197 | in (save (string "ab", r), "ab", SOME 2, SOME 2) | |
198 | end, | |
199 | let val r = Save.new () | |
200 | in (seq [string "a", save (string "bc", r), string "d"], | |
201 | "abcd", SOME 4, SOME 4) | |
202 | end, | |
203 | let val s1 = Save.new () | |
204 | val s2 = Save.new () | |
205 | in (seq [save (string "a", s1), | |
206 | save (string "b", s2)], | |
207 | "ab", SOME 2, SOME 2) | |
208 | end, | |
209 | let val s1 = Save.new () | |
210 | val s2 = Save.new () | |
211 | in (seq [save (string "a", s1), | |
212 | string "b", | |
213 | save (string "c", s2), | |
214 | string "d"], | |
215 | "abcd", | |
216 | SOME 4, SOME 4) | |
217 | end, | |
218 | let val s1 = Save.new () | |
219 | in (seq [string "a", | |
220 | save (string "b", s1), | |
221 | string "c"], | |
222 | "abc", | |
223 | SOME 3, SOME 3) | |
224 | end, | |
225 | let val s1 = Save.new () | |
226 | in (seq [string "abc", | |
227 | save (string "d", s1), | |
228 | string "e"], | |
229 | "abcde", | |
230 | SOME 5, SOME 5) | |
231 | end, | |
232 | let val s1 = Save.new () | |
233 | val s2 = Save.new () | |
234 | in (seq [string "abc", | |
235 | save (string "d", s1), | |
236 | string "e", | |
237 | save (string "f", s2)], | |
238 | "abcdef", | |
239 | SOME 6, SOME 6) | |
240 | end, | |
241 | let val s1 = Save.new () | |
242 | val s2 = Save.new () | |
243 | in (seq [string "abc", | |
244 | save (string "d", s1), | |
245 | string "e", | |
246 | save (string "fgh", s2)], | |
247 | "abcdefgh", | |
248 | SOME 8, SOME 8) | |
249 | end | |
250 | ] | |
251 | ||
252 | val _ = | |
253 | Assert.assert | |
254 | ("Test.Regexp.match", fn () => | |
255 | List.forall (tests, | |
256 | fn (r, s: string, i1, i2) => | |
257 | let | |
258 | val r = compile r | |
259 | val _ = Compiled.layoutDotToFile (r, "/tmp/z.dot") | |
260 | fun doit m = Option.map (m (r, s, 0), Match.length) | |
261 | in | |
262 | i1 = doit matchShort | |
263 | andalso i2 = doit matchLong | |
264 | end)) | |
265 | ||
266 | val tests = | |
267 | [(string "abc", "123abc", SOME (3: int, 6: int)), | |
268 | (string "abc", "123abcde", SOME (3, 6)), | |
269 | (string "abd", "123abcde", NONE), | |
270 | (seq [string "a", star (string "ab"), string "c"], | |
271 | "1234aabababcdef", SOME (4, 12))] | |
272 | ||
273 | val _ = | |
274 | Assert.assert | |
275 | ("Regexp.findShort", fn () => | |
276 | List.forall | |
277 | (tests, fn (r, s, opt) => | |
278 | opt = (Option.map | |
279 | (findShort (compile r, s, 0), fn m => | |
280 | let val (_, {start, length}) = Substring.base (Match.all m) | |
281 | in (start, start + length) | |
282 | end)))) | |
283 | ||
284 | val _ = | |
285 | Assert.assert | |
286 | ("Regexp.findShort2", fn () => | |
287 | List.forall | |
288 | ([(SOME (2, 4), (string "cd", "abcdef", 0)), | |
289 | (SOME (2, 4), (seq [char #"c", star (isNotChar Char.isSpace)], | |
290 | "abcd fg", 0))], | |
291 | fn (res, (r, s, i)) => | |
292 | res = | |
293 | Option.map (findLong (compile r, s, i), fn m => | |
294 | let val (_, {start, length}) = Substring.base (Match.all m) | |
295 | in (start, start + length) | |
296 | end))) | |
297 | ||
298 | end |