Commit | Line | Data |
---|---|---|
805e021f CE |
1 | /* |
2 | * Copyright 2000, International Business Machines Corporation and others. | |
3 | * All Rights Reserved. | |
4 | * | |
5 | * This software has been released under the terms of the IBM Public | |
6 | * License. For details, see the LICENSE file in the top-level source | |
7 | * directory or online at http://www.openafs.org/dl/license10.html | |
8 | */ | |
9 | ||
10 | /* | |
11 | * routines to do regular expression matching | |
12 | * | |
13 | * Entry points: | |
14 | * | |
15 | * re_comp(s) | |
16 | * char *s; | |
17 | * ... returns 0 if the string s was compiled successfully, | |
18 | * a pointer to an error message otherwise. | |
19 | * If passed 0 or a null string returns without changing | |
20 | * the currently compiled re (see note 11 below). | |
21 | * | |
22 | * re_exec(s) | |
23 | * char *s; | |
24 | * ... returns 1 if the string s matches the last compiled regular | |
25 | * expression, | |
26 | * 0 if the string s failed to match the last compiled | |
27 | * regular expression, and | |
28 | * -1 if the compiled regular expression was invalid | |
29 | * (indicating an internal error). | |
30 | * | |
31 | * The strings passed to both re_comp and re_exec may have trailing or | |
32 | * embedded newline characters; they are terminated by nulls. | |
33 | * | |
34 | * The identity of the author of these routines is lost in antiquity; | |
35 | * this is essentially the same as the re code in the original V6 ed. | |
36 | * | |
37 | * The regular expressions recognized are described below. This description | |
38 | * is essentially the same as that for ed. | |
39 | * | |
40 | * A regular expression specifies a set of strings of characters. | |
41 | * A member of this set of strings is said to be matched by | |
42 | * the regular expression. In the following specification for | |
43 | * regular expressions the word `character' means any character but NUL. | |
44 | * | |
45 | * 1. Any character except a special character matches itself. | |
46 | * Special characters are the regular expression delimiter plus | |
47 | * \ [ . and sometimes ^ * $. | |
48 | * 2. A . matches any character. | |
49 | * 3. A \ followed by any character except a digit or ( ) | |
50 | * matches that character. | |
51 | * 4. A nonempty string s bracketed [s] (or [^s]) matches any | |
52 | * character in (or not in) s. In s, \ has no special meaning, | |
53 | * and ] may only appear as the first letter. A substring | |
54 | * a-b, with a and b in ascending ASCII order, stands for | |
55 | * the inclusive range of ASCII characters. | |
56 | * 5. A regular expression of form 1-4 followed by * matches a | |
57 | * sequence of 0 or more matches of the regular expression. | |
58 | * 6. A regular expression, x, of form 1-8, bracketed \(x\) | |
59 | * matches what x matches. | |
60 | * 7. A \ followed by a digit n matches a copy of the string that the | |
61 | * bracketed regular expression beginning with the nth \( matched. | |
62 | * 8. A regular expression of form 1-8, x, followed by a regular | |
63 | * expression of form 1-7, y matches a match for x followed by | |
64 | * a match for y, with the x match being as long as possible | |
65 | * while still permitting a y match. | |
66 | * 9. A regular expression of form 1-8 preceded by ^ (or followed | |
67 | * by $), is constrained to matches that begin at the left | |
68 | * (or end at the right) end of a line. | |
69 | * 10. A regular expression of form 1-9 picks out the longest among | |
70 | * the leftmost matches in a line. | |
71 | * 11. An empty regular expression stands for a copy of the last | |
72 | * regular expression encountered. | |
73 | */ | |
74 | ||
75 | #include <afsconfig.h> | |
76 | #include <afs/param.h> | |
77 | ||
78 | /* | |
79 | * constants for re's | |
80 | */ | |
81 | #define CBRA 1 | |
82 | #define CCHR 2 | |
83 | #define CDOT 4 | |
84 | #define CCL 6 | |
85 | #define NCCL 8 | |
86 | #define CDOL 10 | |
87 | #define CEOF 11 | |
88 | #define CKET 12 | |
89 | #define CBACK 18 | |
90 | ||
91 | #define CSTAR 01 | |
92 | ||
93 | #define ESIZE 512 | |
94 | #define NBRA 9 | |
95 | ||
96 | static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; | |
97 | static char circf; | |
98 | ||
99 | static int advance(char *, char *); | |
100 | static int backref(int i, char *); | |
101 | static int cclass(char *, char, int); | |
102 | ||
103 | /* | |
104 | * compile the regular expression argument into a dfa | |
105 | */ | |
106 | char * | |
107 | re_comp(const char *sp) | |
108 | { | |
109 | int c; | |
110 | char *ep = expbuf; | |
111 | int cclcnt, numbra = 0; | |
112 | char *lastep = 0; | |
113 | char bracket[NBRA]; | |
114 | char *bracketp = &bracket[0]; | |
115 | static char *retoolong = "Regular expression too long"; | |
116 | ||
117 | #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } | |
118 | ||
119 | if (sp == 0 || *sp == '\0') { | |
120 | if (*ep == 0) | |
121 | return ("No previous regular expression"); | |
122 | return (0); | |
123 | } | |
124 | if (*sp == '^') { | |
125 | circf = 1; | |
126 | sp++; | |
127 | } else | |
128 | circf = 0; | |
129 | for (;;) { | |
130 | if (ep >= &expbuf[ESIZE - 10 /* fudge factor */]) | |
131 | comerr(retoolong); | |
132 | if ((c = *sp++) == '\0') { | |
133 | if (bracketp != bracket) | |
134 | comerr("unmatched \\("); | |
135 | *ep++ = CEOF; | |
136 | *ep++ = 0; | |
137 | return (0); | |
138 | } | |
139 | if (c != '*') | |
140 | lastep = ep; | |
141 | switch (c) { | |
142 | ||
143 | case '.': | |
144 | *ep++ = CDOT; | |
145 | continue; | |
146 | ||
147 | case '*': | |
148 | if (lastep == 0 || *lastep == CBRA || *lastep == CKET) | |
149 | goto defchar; | |
150 | *lastep |= CSTAR; | |
151 | continue; | |
152 | ||
153 | case '$': | |
154 | if (*sp != '\0') | |
155 | goto defchar; | |
156 | *ep++ = CDOL; | |
157 | continue; | |
158 | ||
159 | case '[': | |
160 | *ep++ = CCL; | |
161 | *ep++ = 0; | |
162 | cclcnt = 1; | |
163 | if ((c = *sp++) == '^') { | |
164 | c = *sp++; | |
165 | ep[-2] = NCCL; | |
166 | } | |
167 | do { | |
168 | if (c == '\0') | |
169 | comerr("missing ]"); | |
170 | if (c == '-' && ep[-1] != 0) { | |
171 | if ((c = *sp++) == ']') { | |
172 | *ep++ = '-'; | |
173 | cclcnt++; | |
174 | break; | |
175 | } | |
176 | while (ep[-1] < c) { | |
177 | *ep = ep[-1] + 1; | |
178 | ep++; | |
179 | cclcnt++; | |
180 | if (ep >= &expbuf[ESIZE - 10 /* fudge factor */]) | |
181 | comerr(retoolong); | |
182 | } | |
183 | } | |
184 | *ep++ = c; | |
185 | cclcnt++; | |
186 | if (ep >= &expbuf[ESIZE - 10 /* fudge factor */]) | |
187 | comerr(retoolong); | |
188 | } while ((c = *sp++) != ']'); | |
189 | lastep[1] = cclcnt; | |
190 | continue; | |
191 | ||
192 | case '\\': | |
193 | if ((c = *sp++) == '(') { | |
194 | if (numbra >= NBRA) | |
195 | comerr("too many \\(\\) pairs"); | |
196 | *bracketp++ = numbra; | |
197 | *ep++ = CBRA; | |
198 | *ep++ = numbra++; | |
199 | continue; | |
200 | } | |
201 | if (c == ')') { | |
202 | if (bracketp <= bracket) | |
203 | comerr("unmatched \\)"); | |
204 | *ep++ = CKET; | |
205 | *ep++ = *--bracketp; | |
206 | continue; | |
207 | } | |
208 | if (c >= '1' && c < ('1' + NBRA)) { | |
209 | *ep++ = CBACK; | |
210 | *ep++ = c - '1'; | |
211 | continue; | |
212 | } | |
213 | *ep++ = CCHR; | |
214 | *ep++ = c; | |
215 | continue; | |
216 | ||
217 | defchar: | |
218 | default: | |
219 | *ep++ = CCHR; | |
220 | *ep++ = c; | |
221 | } | |
222 | } | |
223 | } | |
224 | ||
225 | static int | |
226 | cclass(char *set, char c, int af) | |
227 | { | |
228 | int n; | |
229 | ||
230 | if (c == 0) | |
231 | return (0); | |
232 | n = *set++; | |
233 | while (--n) | |
234 | if (*set++ == c) | |
235 | return (af); | |
236 | return (!af); | |
237 | } | |
238 | ||
239 | static int | |
240 | backref(int i, char *lp) | |
241 | { | |
242 | char *bp; | |
243 | ||
244 | bp = braslist[i]; | |
245 | while (*bp++ == *lp++) | |
246 | if (bp >= braelist[i]) | |
247 | return (1); | |
248 | return (0); | |
249 | } | |
250 | ||
251 | /* | |
252 | * try to match the next thing in the dfa | |
253 | */ | |
254 | static int | |
255 | advance(char *lp, char *ep) | |
256 | { | |
257 | char *curlp; | |
258 | int ct, i; | |
259 | int rv; | |
260 | ||
261 | for (;;) | |
262 | switch (*ep++) { | |
263 | ||
264 | case CCHR: | |
265 | if (*ep++ == *lp++) | |
266 | continue; | |
267 | return (0); | |
268 | ||
269 | case CDOT: | |
270 | if (*lp++) | |
271 | continue; | |
272 | return (0); | |
273 | ||
274 | case CDOL: | |
275 | if (*lp == '\0') | |
276 | continue; | |
277 | return (0); | |
278 | ||
279 | case CEOF: | |
280 | return (1); | |
281 | ||
282 | case CCL: | |
283 | if (cclass(ep, *lp++, 1)) { | |
284 | ep += *ep; | |
285 | continue; | |
286 | } | |
287 | return (0); | |
288 | ||
289 | case NCCL: | |
290 | if (cclass(ep, *lp++, 0)) { | |
291 | ep += *ep; | |
292 | continue; | |
293 | } | |
294 | return (0); | |
295 | ||
296 | case CBRA: | |
297 | braslist[(int) *ep++] = lp; | |
298 | continue; | |
299 | ||
300 | case CKET: | |
301 | braelist[(int) *ep++] = lp; | |
302 | continue; | |
303 | ||
304 | case CBACK: | |
305 | if (braelist[i = *ep++] == 0) | |
306 | return (-1); | |
307 | if (backref(i, lp)) { | |
308 | lp += braelist[i] - braslist[i]; | |
309 | continue; | |
310 | } | |
311 | return (0); | |
312 | ||
313 | case CBACK | CSTAR: | |
314 | if (braelist[i = *ep++] == 0) | |
315 | return (-1); | |
316 | curlp = lp; | |
317 | ct = braelist[i] - braslist[i]; | |
318 | while (backref(i, lp)) | |
319 | lp += ct; | |
320 | while (lp >= curlp) { | |
321 | if ((rv = advance(lp, ep))) | |
322 | return (rv); | |
323 | lp -= ct; | |
324 | } | |
325 | continue; | |
326 | ||
327 | case CDOT | CSTAR: | |
328 | curlp = lp; | |
329 | while (*lp++); | |
330 | goto star; | |
331 | ||
332 | case CCHR | CSTAR: | |
333 | curlp = lp; | |
334 | while (*lp++ == *ep); | |
335 | ep++; | |
336 | goto star; | |
337 | ||
338 | case CCL | CSTAR: | |
339 | case NCCL | CSTAR: | |
340 | curlp = lp; | |
341 | while (cclass(ep, *lp++, ep[-1] == (CCL | CSTAR))); | |
342 | ep += *ep; | |
343 | goto star; | |
344 | ||
345 | star: | |
346 | do { | |
347 | lp--; | |
348 | if ((rv = advance(lp, ep))) | |
349 | return (rv); | |
350 | } while (lp > curlp); | |
351 | return (0); | |
352 | ||
353 | default: | |
354 | return (-1); | |
355 | } | |
356 | } | |
357 | ||
358 | /* | |
359 | * match the argument string against the compiled re | |
360 | */ | |
361 | int | |
362 | re_exec(const char *p1) | |
363 | { | |
364 | char *p2 = expbuf; | |
365 | int c; | |
366 | int rv; | |
367 | ||
368 | for (c = 0; c < NBRA; c++) { | |
369 | braslist[c] = 0; | |
370 | braelist[c] = 0; | |
371 | } | |
372 | if (circf) | |
373 | return ((advance((char *)p1, p2))); | |
374 | /* | |
375 | * fast check for first character | |
376 | */ | |
377 | if (*p2 == CCHR) { | |
378 | c = p2[1]; | |
379 | do { | |
380 | if (*p1 != c) | |
381 | continue; | |
382 | if ((rv = advance((char *)p1, p2))) | |
383 | return (rv); | |
384 | } while (*p1++); | |
385 | return (0); | |
386 | } | |
387 | /* | |
388 | * regular algorithm | |
389 | */ | |
390 | do | |
391 | if ((rv = advance((char *)p1, p2))) | |
392 | return (rv); | |
393 | while (*p1++); | |
394 | return (0); | |
395 | } | |
396 |