backport to buster
[hcoop/debian/openafs.git] / src / util / regex.c
1 /*
2 * Copyright 2000, International Business Machines Corporation and others.
3 * All Rights Reserved.
4 *
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
8 */
9
10 /*
11 * routines to do regular expression matching
12 *
13 * Entry points:
14 *
15 * re_comp(s)
16 * char *s;
17 * ... returns 0 if the string s was compiled successfully,
18 * a pointer to an error message otherwise.
19 * If passed 0 or a null string returns without changing
20 * the currently compiled re (see note 11 below).
21 *
22 * re_exec(s)
23 * char *s;
24 * ... returns 1 if the string s matches the last compiled regular
25 * expression,
26 * 0 if the string s failed to match the last compiled
27 * regular expression, and
28 * -1 if the compiled regular expression was invalid
29 * (indicating an internal error).
30 *
31 * The strings passed to both re_comp and re_exec may have trailing or
32 * embedded newline characters; they are terminated by nulls.
33 *
34 * The identity of the author of these routines is lost in antiquity;
35 * this is essentially the same as the re code in the original V6 ed.
36 *
37 * The regular expressions recognized are described below. This description
38 * is essentially the same as that for ed.
39 *
40 * A regular expression specifies a set of strings of characters.
41 * A member of this set of strings is said to be matched by
42 * the regular expression. In the following specification for
43 * regular expressions the word `character' means any character but NUL.
44 *
45 * 1. Any character except a special character matches itself.
46 * Special characters are the regular expression delimiter plus
47 * \ [ . and sometimes ^ * $.
48 * 2. A . matches any character.
49 * 3. A \ followed by any character except a digit or ( )
50 * matches that character.
51 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
52 * character in (or not in) s. In s, \ has no special meaning,
53 * and ] may only appear as the first letter. A substring
54 * a-b, with a and b in ascending ASCII order, stands for
55 * the inclusive range of ASCII characters.
56 * 5. A regular expression of form 1-4 followed by * matches a
57 * sequence of 0 or more matches of the regular expression.
58 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
59 * matches what x matches.
60 * 7. A \ followed by a digit n matches a copy of the string that the
61 * bracketed regular expression beginning with the nth \( matched.
62 * 8. A regular expression of form 1-8, x, followed by a regular
63 * expression of form 1-7, y matches a match for x followed by
64 * a match for y, with the x match being as long as possible
65 * while still permitting a y match.
66 * 9. A regular expression of form 1-8 preceded by ^ (or followed
67 * by $), is constrained to matches that begin at the left
68 * (or end at the right) end of a line.
69 * 10. A regular expression of form 1-9 picks out the longest among
70 * the leftmost matches in a line.
71 * 11. An empty regular expression stands for a copy of the last
72 * regular expression encountered.
73 */
74
75 #include <afsconfig.h>
76 #include <afs/param.h>
77
78 /*
79 * constants for re's
80 */
81 #define CBRA 1
82 #define CCHR 2
83 #define CDOT 4
84 #define CCL 6
85 #define NCCL 8
86 #define CDOL 10
87 #define CEOF 11
88 #define CKET 12
89 #define CBACK 18
90
91 #define CSTAR 01
92
93 #define ESIZE 512
94 #define NBRA 9
95
96 static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
97 static char circf;
98
99 static int advance(char *, char *);
100 static int backref(int i, char *);
101 static int cclass(char *, char, int);
102
103 /*
104 * compile the regular expression argument into a dfa
105 */
106 char *
107 re_comp(const char *sp)
108 {
109 int c;
110 char *ep = expbuf;
111 int cclcnt, numbra = 0;
112 char *lastep = 0;
113 char bracket[NBRA];
114 char *bracketp = &bracket[0];
115 static char *retoolong = "Regular expression too long";
116
117 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
118
119 if (sp == 0 || *sp == '\0') {
120 if (*ep == 0)
121 return ("No previous regular expression");
122 return (0);
123 }
124 if (*sp == '^') {
125 circf = 1;
126 sp++;
127 } else
128 circf = 0;
129 for (;;) {
130 if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
131 comerr(retoolong);
132 if ((c = *sp++) == '\0') {
133 if (bracketp != bracket)
134 comerr("unmatched \\(");
135 *ep++ = CEOF;
136 *ep++ = 0;
137 return (0);
138 }
139 if (c != '*')
140 lastep = ep;
141 switch (c) {
142
143 case '.':
144 *ep++ = CDOT;
145 continue;
146
147 case '*':
148 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
149 goto defchar;
150 *lastep |= CSTAR;
151 continue;
152
153 case '$':
154 if (*sp != '\0')
155 goto defchar;
156 *ep++ = CDOL;
157 continue;
158
159 case '[':
160 *ep++ = CCL;
161 *ep++ = 0;
162 cclcnt = 1;
163 if ((c = *sp++) == '^') {
164 c = *sp++;
165 ep[-2] = NCCL;
166 }
167 do {
168 if (c == '\0')
169 comerr("missing ]");
170 if (c == '-' && ep[-1] != 0) {
171 if ((c = *sp++) == ']') {
172 *ep++ = '-';
173 cclcnt++;
174 break;
175 }
176 while (ep[-1] < c) {
177 *ep = ep[-1] + 1;
178 ep++;
179 cclcnt++;
180 if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
181 comerr(retoolong);
182 }
183 }
184 *ep++ = c;
185 cclcnt++;
186 if (ep >= &expbuf[ESIZE - 10 /* fudge factor */])
187 comerr(retoolong);
188 } while ((c = *sp++) != ']');
189 lastep[1] = cclcnt;
190 continue;
191
192 case '\\':
193 if ((c = *sp++) == '(') {
194 if (numbra >= NBRA)
195 comerr("too many \\(\\) pairs");
196 *bracketp++ = numbra;
197 *ep++ = CBRA;
198 *ep++ = numbra++;
199 continue;
200 }
201 if (c == ')') {
202 if (bracketp <= bracket)
203 comerr("unmatched \\)");
204 *ep++ = CKET;
205 *ep++ = *--bracketp;
206 continue;
207 }
208 if (c >= '1' && c < ('1' + NBRA)) {
209 *ep++ = CBACK;
210 *ep++ = c - '1';
211 continue;
212 }
213 *ep++ = CCHR;
214 *ep++ = c;
215 continue;
216
217 defchar:
218 default:
219 *ep++ = CCHR;
220 *ep++ = c;
221 }
222 }
223 }
224
225 static int
226 cclass(char *set, char c, int af)
227 {
228 int n;
229
230 if (c == 0)
231 return (0);
232 n = *set++;
233 while (--n)
234 if (*set++ == c)
235 return (af);
236 return (!af);
237 }
238
239 static int
240 backref(int i, char *lp)
241 {
242 char *bp;
243
244 bp = braslist[i];
245 while (*bp++ == *lp++)
246 if (bp >= braelist[i])
247 return (1);
248 return (0);
249 }
250
251 /*
252 * try to match the next thing in the dfa
253 */
254 static int
255 advance(char *lp, char *ep)
256 {
257 char *curlp;
258 int ct, i;
259 int rv;
260
261 for (;;)
262 switch (*ep++) {
263
264 case CCHR:
265 if (*ep++ == *lp++)
266 continue;
267 return (0);
268
269 case CDOT:
270 if (*lp++)
271 continue;
272 return (0);
273
274 case CDOL:
275 if (*lp == '\0')
276 continue;
277 return (0);
278
279 case CEOF:
280 return (1);
281
282 case CCL:
283 if (cclass(ep, *lp++, 1)) {
284 ep += *ep;
285 continue;
286 }
287 return (0);
288
289 case NCCL:
290 if (cclass(ep, *lp++, 0)) {
291 ep += *ep;
292 continue;
293 }
294 return (0);
295
296 case CBRA:
297 braslist[(int) *ep++] = lp;
298 continue;
299
300 case CKET:
301 braelist[(int) *ep++] = lp;
302 continue;
303
304 case CBACK:
305 if (braelist[i = *ep++] == 0)
306 return (-1);
307 if (backref(i, lp)) {
308 lp += braelist[i] - braslist[i];
309 continue;
310 }
311 return (0);
312
313 case CBACK | CSTAR:
314 if (braelist[i = *ep++] == 0)
315 return (-1);
316 curlp = lp;
317 ct = braelist[i] - braslist[i];
318 while (backref(i, lp))
319 lp += ct;
320 while (lp >= curlp) {
321 if ((rv = advance(lp, ep)))
322 return (rv);
323 lp -= ct;
324 }
325 continue;
326
327 case CDOT | CSTAR:
328 curlp = lp;
329 while (*lp++);
330 goto star;
331
332 case CCHR | CSTAR:
333 curlp = lp;
334 while (*lp++ == *ep);
335 ep++;
336 goto star;
337
338 case CCL | CSTAR:
339 case NCCL | CSTAR:
340 curlp = lp;
341 while (cclass(ep, *lp++, ep[-1] == (CCL | CSTAR)));
342 ep += *ep;
343 goto star;
344
345 star:
346 do {
347 lp--;
348 if ((rv = advance(lp, ep)))
349 return (rv);
350 } while (lp > curlp);
351 return (0);
352
353 default:
354 return (-1);
355 }
356 }
357
358 /*
359 * match the argument string against the compiled re
360 */
361 int
362 re_exec(const char *p1)
363 {
364 char *p2 = expbuf;
365 int c;
366 int rv;
367
368 for (c = 0; c < NBRA; c++) {
369 braslist[c] = 0;
370 braelist[c] = 0;
371 }
372 if (circf)
373 return ((advance((char *)p1, p2)));
374 /*
375 * fast check for first character
376 */
377 if (*p2 == CCHR) {
378 c = p2[1];
379 do {
380 if (*p1 != c)
381 continue;
382 if ((rv = advance((char *)p1, p2)))
383 return (rv);
384 } while (*p1++);
385 return (0);
386 }
387 /*
388 * regular algorithm
389 */
390 do
391 if ((rv = advance((char *)p1, p2)))
392 return (rv);
393 while (*p1++);
394 return (0);
395 }
396