Give subprocess creation a way to find a valid current directory
[bpt/emacs.git] / src / regex.c
1 /* Extended regular expression matching and search library,
2 version 0.11.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1985, 89, 90, 91, 92 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
21
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25 #endif
26
27 #define _GNU_SOURCE
28
29 /* We need this for `regex.h', and perhaps for the Emacs include files. */
30 #include <sys/types.h>
31
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35
36 /* The `emacs' switch turns on certain matching commands
37 that make sense only in Emacs. */
38 #ifdef emacs
39
40 #include "lisp.h"
41 #include "buffer.h"
42 #include "syntax.h"
43
44 /* Emacs uses `NULL' as a predicate. */
45 #undef NULL
46
47 #else /* not emacs */
48
49 /* We used to test for `BSTRING' here, but only GCC and Emacs define
50 `BSTRING', as far as I know, and neither of them use this code. */
51 #if HAVE_STRING_H || STDC_HEADERS
52 #include <string.h>
53 #ifndef bcmp
54 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
55 #endif
56 #ifndef bcopy
57 #define bcopy(s, d, n) memcpy ((d), (s), (n))
58 #endif
59 #ifndef bzero
60 #define bzero(s, n) memset ((s), 0, (n))
61 #endif
62 #else
63 #include <strings.h>
64 #endif
65
66 #ifdef STDC_HEADERS
67 #include <stdlib.h>
68 #else
69 char *malloc ();
70 char *realloc ();
71 #endif
72
73
74 /* Define the syntax stuff for \<, \>, etc. */
75
76 /* This must be nonzero for the wordchar and notwordchar pattern
77 commands in re_match_2. */
78 #ifndef Sword
79 #define Sword 1
80 #endif
81
82 #ifdef SYNTAX_TABLE
83
84 extern char *re_syntax_table;
85
86 #else /* not SYNTAX_TABLE */
87
88 /* How many characters in the character set. */
89 #define CHAR_SET_SIZE 256
90
91 static char re_syntax_table[CHAR_SET_SIZE];
92
93 static void
94 init_syntax_once ()
95 {
96 register int c;
97 static int done = 0;
98
99 if (done)
100 return;
101
102 bzero (re_syntax_table, sizeof re_syntax_table);
103
104 for (c = 'a'; c <= 'z'; c++)
105 re_syntax_table[c] = Sword;
106
107 for (c = 'A'; c <= 'Z'; c++)
108 re_syntax_table[c] = Sword;
109
110 for (c = '0'; c <= '9'; c++)
111 re_syntax_table[c] = Sword;
112
113 re_syntax_table['_'] = Sword;
114
115 done = 1;
116 }
117
118 #endif /* not SYNTAX_TABLE */
119
120 #define SYNTAX(c) re_syntax_table[c]
121
122 #endif /* not emacs */
123 \f
124 /* Get the interface, including the syntax bits. */
125 #include "regex.h"
126
127 /* isalpha etc. are used for the character classes. */
128 #include <ctype.h>
129
130 #ifndef isascii
131 #define isascii(c) 1
132 #endif
133
134 #ifdef isblank
135 #define ISBLANK(c) (isascii (c) && isblank (c))
136 #else
137 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
138 #endif
139 #ifdef isgraph
140 #define ISGRAPH(c) (isascii (c) && isgraph (c))
141 #else
142 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
143 #endif
144
145 #define ISPRINT(c) (isascii (c) && isprint (c))
146 #define ISDIGIT(c) (isascii (c) && isdigit (c))
147 #define ISALNUM(c) (isascii (c) && isalnum (c))
148 #define ISALPHA(c) (isascii (c) && isalpha (c))
149 #define ISCNTRL(c) (isascii (c) && iscntrl (c))
150 #define ISLOWER(c) (isascii (c) && islower (c))
151 #define ISPUNCT(c) (isascii (c) && ispunct (c))
152 #define ISSPACE(c) (isascii (c) && isspace (c))
153 #define ISUPPER(c) (isascii (c) && isupper (c))
154 #define ISXDIGIT(c) (isascii (c) && isxdigit (c))
155
156 #ifndef NULL
157 #define NULL 0
158 #endif
159
160 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
161 since ours (we hope) works properly with all combinations of
162 machines, compilers, `char' and `unsigned char' argument types.
163 (Per Bothner suggested the basic approach.) */
164 #undef SIGN_EXTEND_CHAR
165 #if __STDC__
166 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
167 #else /* not __STDC__ */
168 /* As in Harbison and Steele. */
169 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
170 #endif
171 \f
172 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
173 use `alloca' instead of `malloc'. This is because using malloc in
174 re_search* or re_match* could cause memory leaks when C-g is used in
175 Emacs; also, malloc is slower and causes storage fragmentation. On
176 the other hand, malloc is more portable, and easier to debug.
177
178 Because we sometimes use alloca, some routines have to be macros,
179 not functions -- `alloca'-allocated space disappears at the end of the
180 function it is called in. */
181
182 #ifdef REGEX_MALLOC
183
184 #define REGEX_ALLOCATE malloc
185 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
186
187 #else /* not REGEX_MALLOC */
188
189 /* Emacs already defines alloca, sometimes. */
190 #ifndef alloca
191
192 /* Make alloca work the best possible way. */
193 #ifdef __GNUC__
194 #define alloca __builtin_alloca
195 #else /* not __GNUC__ */
196 #if HAVE_ALLOCA_H
197 #include <alloca.h>
198 #else /* not __GNUC__ or HAVE_ALLOCA_H */
199 #ifndef _AIX /* Already did AIX, up at the top. */
200 char *alloca ();
201 #endif /* not _AIX */
202 #endif /* not HAVE_ALLOCA_H */
203 #endif /* not __GNUC__ */
204
205 #endif /* not alloca */
206
207 #define REGEX_ALLOCATE alloca
208
209 /* Assumes a `char *destination' variable. */
210 #define REGEX_REALLOCATE(source, osize, nsize) \
211 (destination = (char *) alloca (nsize), \
212 bcopy (source, destination, osize), \
213 destination)
214
215 #endif /* not REGEX_MALLOC */
216
217
218 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
219 `string1' or just past its end. This works if PTR is NULL, which is
220 a good thing. */
221 #define FIRST_STRING_P(ptr) \
222 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
223
224 /* (Re)Allocate N items of type T using malloc, or fail. */
225 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
226 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
227 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
228
229 #define BYTEWIDTH 8 /* In bits. */
230
231 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
232
233 #define MAX(a, b) ((a) > (b) ? (a) : (b))
234 #define MIN(a, b) ((a) < (b) ? (a) : (b))
235
236 typedef char boolean;
237 #define false 0
238 #define true 1
239 \f
240 /* These are the command codes that appear in compiled regular
241 expressions. Some opcodes are followed by argument bytes. A
242 command code can specify any interpretation whatsoever for its
243 arguments. Zero bytes may appear in the compiled regular expression.
244
245 The value of `exactn' is needed in search.c (search_buffer) in Emacs.
246 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
247 `exactn' we use here must also be 1. */
248
249 typedef enum
250 {
251 no_op = 0,
252
253 /* Followed by one byte giving n, then by n literal bytes. */
254 exactn = 1,
255
256 /* Matches any (more or less) character. */
257 anychar,
258
259 /* Matches any one char belonging to specified set. First
260 following byte is number of bitmap bytes. Then come bytes
261 for a bitmap saying which chars are in. Bits in each byte
262 are ordered low-bit-first. A character is in the set if its
263 bit is 1. A character too large to have a bit in the map is
264 automatically not in the set. */
265 charset,
266
267 /* Same parameters as charset, but match any character that is
268 not one of those specified. */
269 charset_not,
270
271 /* Start remembering the text that is matched, for storing in a
272 register. Followed by one byte with the register number, in
273 the range 0 to one less than the pattern buffer's re_nsub
274 field. Then followed by one byte with the number of groups
275 inner to this one. (This last has to be part of the
276 start_memory only because we need it in the on_failure_jump
277 of re_match_2.) */
278 start_memory,
279
280 /* Stop remembering the text that is matched and store it in a
281 memory register. Followed by one byte with the register
282 number, in the range 0 to one less than `re_nsub' in the
283 pattern buffer, and one byte with the number of inner groups,
284 just like `start_memory'. (We need the number of inner
285 groups here because we don't have any easy way of finding the
286 corresponding start_memory when we're at a stop_memory.) */
287 stop_memory,
288
289 /* Match a duplicate of something remembered. Followed by one
290 byte containing the register number. */
291 duplicate,
292
293 /* Fail unless at beginning of line. */
294 begline,
295
296 /* Fail unless at end of line. */
297 endline,
298
299 /* Succeeds if at beginning of buffer (if emacs) or at beginning
300 of string to be matched (if not). */
301 begbuf,
302
303 /* Analogously, for end of buffer/string. */
304 endbuf,
305
306 /* Followed by two byte relative address to which to jump. */
307 jump,
308
309 /* Same as jump, but marks the end of an alternative. */
310 jump_past_alt,
311
312 /* Followed by two-byte relative address of place to resume at
313 in case of failure. */
314 on_failure_jump,
315
316 /* Like on_failure_jump, but pushes a placeholder instead of the
317 current string position when executed. */
318 on_failure_keep_string_jump,
319
320 /* Throw away latest failure point and then jump to following
321 two-byte relative address. */
322 pop_failure_jump,
323
324 /* Change to pop_failure_jump if know won't have to backtrack to
325 match; otherwise change to jump. This is used to jump
326 back to the beginning of a repeat. If what follows this jump
327 clearly won't match what the repeat does, such that we can be
328 sure that there is no use backtracking out of repetitions
329 already matched, then we change it to a pop_failure_jump.
330 Followed by two-byte address. */
331 maybe_pop_jump,
332
333 /* Jump to following two-byte address, and push a dummy failure
334 point. This failure point will be thrown away if an attempt
335 is made to use it for a failure. A `+' construct makes this
336 before the first repeat. Also used as an intermediary kind
337 of jump when compiling an alternative. */
338 dummy_failure_jump,
339
340 /* Push a dummy failure point and continue. Used at the end of
341 alternatives. */
342 push_dummy_failure,
343
344 /* Followed by two-byte relative address and two-byte number n.
345 After matching N times, jump to the address upon failure. */
346 succeed_n,
347
348 /* Followed by two-byte relative address, and two-byte number n.
349 Jump to the address N times, then fail. */
350 jump_n,
351
352 /* Set the following two-byte relative address to the
353 subsequent two-byte number. The address *includes* the two
354 bytes of number. */
355 set_number_at,
356
357 wordchar, /* Matches any word-constituent character. */
358 notwordchar, /* Matches any char that is not a word-constituent. */
359
360 wordbeg, /* Succeeds if at word beginning. */
361 wordend, /* Succeeds if at word end. */
362
363 wordbound, /* Succeeds if at a word boundary. */
364 notwordbound /* Succeeds if not at a word boundary. */
365
366 #ifdef emacs
367 ,before_dot, /* Succeeds if before point. */
368 at_dot, /* Succeeds if at point. */
369 after_dot, /* Succeeds if after point. */
370
371 /* Matches any character whose syntax is specified. Followed by
372 a byte which contains a syntax code, e.g., Sword. */
373 syntaxspec,
374
375 /* Matches any character whose syntax is not that specified. */
376 notsyntaxspec
377 #endif /* emacs */
378 } re_opcode_t;
379 \f
380 /* Common operations on the compiled pattern. */
381
382 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
383
384 #define STORE_NUMBER(destination, number) \
385 do { \
386 (destination)[0] = (number) & 0377; \
387 (destination)[1] = (number) >> 8; \
388 } while (0)
389
390 /* Same as STORE_NUMBER, except increment DESTINATION to
391 the byte after where the number is stored. Therefore, DESTINATION
392 must be an lvalue. */
393
394 #define STORE_NUMBER_AND_INCR(destination, number) \
395 do { \
396 STORE_NUMBER (destination, number); \
397 (destination) += 2; \
398 } while (0)
399
400 /* Put into DESTINATION a number stored in two contiguous bytes starting
401 at SOURCE. */
402
403 #define EXTRACT_NUMBER(destination, source) \
404 do { \
405 (destination) = *(source) & 0377; \
406 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
407 } while (0)
408
409 #ifdef DEBUG
410 static void
411 extract_number (dest, source)
412 int *dest;
413 unsigned char *source;
414 {
415 int temp = SIGN_EXTEND_CHAR (*(source + 1));
416 *dest = *source & 0377;
417 *dest += temp << 8;
418 }
419
420 #ifndef EXTRACT_MACROS /* To debug the macros. */
421 #undef EXTRACT_NUMBER
422 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
423 #endif /* not EXTRACT_MACROS */
424
425 #endif /* DEBUG */
426
427 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
428 SOURCE must be an lvalue. */
429
430 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
431 do { \
432 EXTRACT_NUMBER (destination, source); \
433 (source) += 2; \
434 } while (0)
435
436 #ifdef DEBUG
437 static void
438 extract_number_and_incr (destination, source)
439 int *destination;
440 unsigned char **source;
441 {
442 extract_number (destination, *source);
443 *source += 2;
444 }
445
446 #ifndef EXTRACT_MACROS
447 #undef EXTRACT_NUMBER_AND_INCR
448 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
449 extract_number_and_incr (&dest, &src)
450 #endif /* not EXTRACT_MACROS */
451
452 #endif /* DEBUG */
453 \f
454 /* If DEBUG is defined, Regex prints many voluminous messages about what
455 it is doing (if the variable `debug' is nonzero). If linked with the
456 main program in `iregex.c', you can enter patterns and strings
457 interactively. And if linked with the main program in `main.c' and
458 the other test files, you can run the already-written tests. */
459
460 #ifdef DEBUG
461
462 /* We use standard I/O for debugging. */
463 #include <stdio.h>
464
465 /* It is useful to test things that ``must'' be true when debugging. */
466 #include <assert.h>
467
468 static int debug = 0;
469
470 #define DEBUG_STATEMENT(e) e
471 #define DEBUG_PRINT1(x) if (debug) printf (x)
472 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
473 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
474 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
475 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
476 if (debug) print_partial_compiled_pattern (s, e)
477 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
478 if (debug) print_double_string (w, s1, sz1, s2, sz2)
479
480
481 extern void printchar ();
482
483 /* Print the fastmap in human-readable form. */
484
485 void
486 print_fastmap (fastmap)
487 char *fastmap;
488 {
489 unsigned was_a_range = 0;
490 unsigned i = 0;
491
492 while (i < (1 << BYTEWIDTH))
493 {
494 if (fastmap[i++])
495 {
496 was_a_range = 0;
497 printchar (i - 1);
498 while (i < (1 << BYTEWIDTH) && fastmap[i])
499 {
500 was_a_range = 1;
501 i++;
502 }
503 if (was_a_range)
504 {
505 printf ("-");
506 printchar (i - 1);
507 }
508 }
509 }
510 putchar ('\n');
511 }
512
513
514 /* Print a compiled pattern string in human-readable form, starting at
515 the START pointer into it and ending just before the pointer END. */
516
517 void
518 print_partial_compiled_pattern (start, end)
519 unsigned char *start;
520 unsigned char *end;
521 {
522 int mcnt, mcnt2;
523 unsigned char *p = start;
524 unsigned char *pend = end;
525
526 if (start == NULL)
527 {
528 printf ("(null)\n");
529 return;
530 }
531
532 /* Loop over pattern commands. */
533 while (p < pend)
534 {
535 switch ((re_opcode_t) *p++)
536 {
537 case no_op:
538 printf ("/no_op");
539 break;
540
541 case exactn:
542 mcnt = *p++;
543 printf ("/exactn/%d", mcnt);
544 do
545 {
546 putchar ('/');
547 printchar (*p++);
548 }
549 while (--mcnt);
550 break;
551
552 case start_memory:
553 mcnt = *p++;
554 printf ("/start_memory/%d/%d", mcnt, *p++);
555 break;
556
557 case stop_memory:
558 mcnt = *p++;
559 printf ("/stop_memory/%d/%d", mcnt, *p++);
560 break;
561
562 case duplicate:
563 printf ("/duplicate/%d", *p++);
564 break;
565
566 case anychar:
567 printf ("/anychar");
568 break;
569
570 case charset:
571 case charset_not:
572 {
573 register int c;
574
575 printf ("/charset%s",
576 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
577
578 assert (p + *p < pend);
579
580 for (c = 0; c < *p; c++)
581 {
582 unsigned bit;
583 unsigned char map_byte = p[1 + c];
584
585 putchar ('/');
586
587 for (bit = 0; bit < BYTEWIDTH; bit++)
588 if (map_byte & (1 << bit))
589 printchar (c * BYTEWIDTH + bit);
590 }
591 p += 1 + *p;
592 break;
593 }
594
595 case begline:
596 printf ("/begline");
597 break;
598
599 case endline:
600 printf ("/endline");
601 break;
602
603 case on_failure_jump:
604 extract_number_and_incr (&mcnt, &p);
605 printf ("/on_failure_jump/0/%d", mcnt);
606 break;
607
608 case on_failure_keep_string_jump:
609 extract_number_and_incr (&mcnt, &p);
610 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
611 break;
612
613 case dummy_failure_jump:
614 extract_number_and_incr (&mcnt, &p);
615 printf ("/dummy_failure_jump/0/%d", mcnt);
616 break;
617
618 case push_dummy_failure:
619 printf ("/push_dummy_failure");
620 break;
621
622 case maybe_pop_jump:
623 extract_number_and_incr (&mcnt, &p);
624 printf ("/maybe_pop_jump/0/%d", mcnt);
625 break;
626
627 case pop_failure_jump:
628 extract_number_and_incr (&mcnt, &p);
629 printf ("/pop_failure_jump/0/%d", mcnt);
630 break;
631
632 case jump_past_alt:
633 extract_number_and_incr (&mcnt, &p);
634 printf ("/jump_past_alt/0/%d", mcnt);
635 break;
636
637 case jump:
638 extract_number_and_incr (&mcnt, &p);
639 printf ("/jump/0/%d", mcnt);
640 break;
641
642 case succeed_n:
643 extract_number_and_incr (&mcnt, &p);
644 extract_number_and_incr (&mcnt2, &p);
645 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
646 break;
647
648 case jump_n:
649 extract_number_and_incr (&mcnt, &p);
650 extract_number_and_incr (&mcnt2, &p);
651 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
652 break;
653
654 case set_number_at:
655 extract_number_and_incr (&mcnt, &p);
656 extract_number_and_incr (&mcnt2, &p);
657 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
658 break;
659
660 case wordbound:
661 printf ("/wordbound");
662 break;
663
664 case notwordbound:
665 printf ("/notwordbound");
666 break;
667
668 case wordbeg:
669 printf ("/wordbeg");
670 break;
671
672 case wordend:
673 printf ("/wordend");
674
675 #ifdef emacs
676 case before_dot:
677 printf ("/before_dot");
678 break;
679
680 case at_dot:
681 printf ("/at_dot");
682 break;
683
684 case after_dot:
685 printf ("/after_dot");
686 break;
687
688 case syntaxspec:
689 printf ("/syntaxspec");
690 mcnt = *p++;
691 printf ("/%d", mcnt);
692 break;
693
694 case notsyntaxspec:
695 printf ("/notsyntaxspec");
696 mcnt = *p++;
697 printf ("/%d", mcnt);
698 break;
699 #endif /* emacs */
700
701 case wordchar:
702 printf ("/wordchar");
703 break;
704
705 case notwordchar:
706 printf ("/notwordchar");
707 break;
708
709 case begbuf:
710 printf ("/begbuf");
711 break;
712
713 case endbuf:
714 printf ("/endbuf");
715 break;
716
717 default:
718 printf ("?%d", *(p-1));
719 }
720 }
721 printf ("/\n");
722 }
723
724
725 void
726 print_compiled_pattern (bufp)
727 struct re_pattern_buffer *bufp;
728 {
729 unsigned char *buffer = bufp->buffer;
730
731 print_partial_compiled_pattern (buffer, buffer + bufp->used);
732 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
733
734 if (bufp->fastmap_accurate && bufp->fastmap)
735 {
736 printf ("fastmap: ");
737 print_fastmap (bufp->fastmap);
738 }
739
740 printf ("re_nsub: %d\t", bufp->re_nsub);
741 printf ("regs_alloc: %d\t", bufp->regs_allocated);
742 printf ("can_be_null: %d\t", bufp->can_be_null);
743 printf ("newline_anchor: %d\n", bufp->newline_anchor);
744 printf ("no_sub: %d\t", bufp->no_sub);
745 printf ("not_bol: %d\t", bufp->not_bol);
746 printf ("not_eol: %d\t", bufp->not_eol);
747 printf ("syntax: %d\n", bufp->syntax);
748 /* Perhaps we should print the translate table? */
749 }
750
751
752 void
753 print_double_string (where, string1, size1, string2, size2)
754 const char *where;
755 const char *string1;
756 const char *string2;
757 int size1;
758 int size2;
759 {
760 unsigned this_char;
761
762 if (where == NULL)
763 printf ("(null)");
764 else
765 {
766 if (FIRST_STRING_P (where))
767 {
768 for (this_char = where - string1; this_char < size1; this_char++)
769 printchar (string1[this_char]);
770
771 where = string2;
772 }
773
774 for (this_char = where - string2; this_char < size2; this_char++)
775 printchar (string2[this_char]);
776 }
777 }
778
779 #else /* not DEBUG */
780
781 #undef assert
782 #define assert(e)
783
784 #define DEBUG_STATEMENT(e)
785 #define DEBUG_PRINT1(x)
786 #define DEBUG_PRINT2(x1, x2)
787 #define DEBUG_PRINT3(x1, x2, x3)
788 #define DEBUG_PRINT4(x1, x2, x3, x4)
789 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
790 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
791
792 #endif /* not DEBUG */
793 \f
794 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
795 also be assigned to arbitrarily: each pattern buffer stores its own
796 syntax, so it can be changed between regex compilations. */
797 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
798
799
800 /* Specify the precise syntax of regexps for compilation. This provides
801 for compatibility for various utilities which historically have
802 different, incompatible syntaxes.
803
804 The argument SYNTAX is a bit mask comprised of the various bits
805 defined in regex.h. We return the old syntax. */
806
807 reg_syntax_t
808 re_set_syntax (syntax)
809 reg_syntax_t syntax;
810 {
811 reg_syntax_t ret = re_syntax_options;
812
813 re_syntax_options = syntax;
814 return ret;
815 }
816 \f
817 /* This table gives an error message for each of the error codes listed
818 in regex.h. Obviously the order here has to be same as there. */
819
820 static const char *re_error_msg[] =
821 { NULL, /* REG_NOERROR */
822 "No match", /* REG_NOMATCH */
823 "Invalid regular expression", /* REG_BADPAT */
824 "Invalid collation character", /* REG_ECOLLATE */
825 "Invalid character class name", /* REG_ECTYPE */
826 "Trailing backslash", /* REG_EESCAPE */
827 "Invalid back reference", /* REG_ESUBREG */
828 "Unmatched [ or [^", /* REG_EBRACK */
829 "Unmatched ( or \\(", /* REG_EPAREN */
830 "Unmatched \\{", /* REG_EBRACE */
831 "Invalid content of \\{\\}", /* REG_BADBR */
832 "Invalid range end", /* REG_ERANGE */
833 "Memory exhausted", /* REG_ESPACE */
834 "Invalid preceding regular expression", /* REG_BADRPT */
835 "Premature end of regular expression", /* REG_EEND */
836 "Regular expression too big", /* REG_ESIZE */
837 "Unmatched ) or \\)", /* REG_ERPAREN */
838 };
839 \f
840 /* Subroutine declarations and macros for regex_compile. */
841
842 static void store_op1 (), store_op2 ();
843 static void insert_op1 (), insert_op2 ();
844 static boolean at_begline_loc_p (), at_endline_loc_p ();
845 static boolean group_in_compile_stack ();
846 static reg_errcode_t compile_range ();
847
848 /* Fetch the next character in the uncompiled pattern---translating it
849 if necessary. Also cast from a signed character in the constant
850 string passed to us by the user to an unsigned char that we can use
851 as an array index (in, e.g., `translate'). */
852 #define PATFETCH(c) \
853 do {if (p == pend) return REG_EEND; \
854 c = (unsigned char) *p++; \
855 if (translate) c = translate[c]; \
856 } while (0)
857
858 /* Fetch the next character in the uncompiled pattern, with no
859 translation. */
860 #define PATFETCH_RAW(c) \
861 do {if (p == pend) return REG_EEND; \
862 c = (unsigned char) *p++; \
863 } while (0)
864
865 /* Go backwards one character in the pattern. */
866 #define PATUNFETCH p--
867
868
869 /* If `translate' is non-null, return translate[D], else just D. We
870 cast the subscript to translate because some data is declared as
871 `char *', to avoid warnings when a string constant is passed. But
872 when we use a character as a subscript we must make it unsigned. */
873 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
874
875
876 /* Macros for outputting the compiled pattern into `buffer'. */
877
878 /* If the buffer isn't allocated when it comes in, use this. */
879 #define INIT_BUF_SIZE 32
880
881 /* Make sure we have at least N more bytes of space in buffer. */
882 #define GET_BUFFER_SPACE(n) \
883 while (b - bufp->buffer + (n) > bufp->allocated) \
884 EXTEND_BUFFER ()
885
886 /* Make sure we have one more byte of buffer space and then add C to it. */
887 #define BUF_PUSH(c) \
888 do { \
889 GET_BUFFER_SPACE (1); \
890 *b++ = (unsigned char) (c); \
891 } while (0)
892
893
894 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
895 #define BUF_PUSH_2(c1, c2) \
896 do { \
897 GET_BUFFER_SPACE (2); \
898 *b++ = (unsigned char) (c1); \
899 *b++ = (unsigned char) (c2); \
900 } while (0)
901
902
903 /* As with BUF_PUSH_2, except for three bytes. */
904 #define BUF_PUSH_3(c1, c2, c3) \
905 do { \
906 GET_BUFFER_SPACE (3); \
907 *b++ = (unsigned char) (c1); \
908 *b++ = (unsigned char) (c2); \
909 *b++ = (unsigned char) (c3); \
910 } while (0)
911
912
913 /* Store a jump with opcode OP at LOC to location TO. We store a
914 relative address offset by the three bytes the jump itself occupies. */
915 #define STORE_JUMP(op, loc, to) \
916 store_op1 (op, loc, (to) - (loc) - 3)
917
918 /* Likewise, for a two-argument jump. */
919 #define STORE_JUMP2(op, loc, to, arg) \
920 store_op2 (op, loc, (to) - (loc) - 3, arg)
921
922 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
923 #define INSERT_JUMP(op, loc, to) \
924 insert_op1 (op, loc, (to) - (loc) - 3, b)
925
926 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
927 #define INSERT_JUMP2(op, loc, to, arg) \
928 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
929
930
931 /* This is not an arbitrary limit: the arguments which represent offsets
932 into the pattern are two bytes long. So if 2^16 bytes turns out to
933 be too small, many things would have to change. */
934 #define MAX_BUF_SIZE (1L << 16)
935
936
937 /* Extend the buffer by twice its current size via realloc and
938 reset the pointers that pointed into the old block to point to the
939 correct places in the new one. If extending the buffer results in it
940 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
941 #define EXTEND_BUFFER() \
942 do { \
943 unsigned char *old_buffer = bufp->buffer; \
944 if (bufp->allocated == MAX_BUF_SIZE) \
945 return REG_ESIZE; \
946 bufp->allocated <<= 1; \
947 if (bufp->allocated > MAX_BUF_SIZE) \
948 bufp->allocated = MAX_BUF_SIZE; \
949 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
950 if (bufp->buffer == NULL) \
951 return REG_ESPACE; \
952 /* If the buffer moved, move all the pointers into it. */ \
953 if (old_buffer != bufp->buffer) \
954 { \
955 b = (b - old_buffer) + bufp->buffer; \
956 begalt = (begalt - old_buffer) + bufp->buffer; \
957 if (fixup_alt_jump) \
958 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
959 if (laststart) \
960 laststart = (laststart - old_buffer) + bufp->buffer; \
961 if (pending_exact) \
962 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
963 } \
964 } while (0)
965
966
967 /* Since we have one byte reserved for the register number argument to
968 {start,stop}_memory, the maximum number of groups we can report
969 things about is what fits in that byte. */
970 #define MAX_REGNUM 255
971
972 /* But patterns can have more than `MAX_REGNUM' registers. We just
973 ignore the excess. */
974 typedef unsigned regnum_t;
975
976
977 /* Macros for the compile stack. */
978
979 /* Since offsets can go either forwards or backwards, this type needs to
980 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
981 typedef int pattern_offset_t;
982
983 typedef struct
984 {
985 pattern_offset_t begalt_offset;
986 pattern_offset_t fixup_alt_jump;
987 pattern_offset_t inner_group_offset;
988 pattern_offset_t laststart_offset;
989 regnum_t regnum;
990 } compile_stack_elt_t;
991
992
993 typedef struct
994 {
995 compile_stack_elt_t *stack;
996 unsigned size;
997 unsigned avail; /* Offset of next open position. */
998 } compile_stack_type;
999
1000
1001 #define INIT_COMPILE_STACK_SIZE 32
1002
1003 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1004 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1005
1006 /* The next available element. */
1007 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1008
1009
1010 /* Set the bit for character C in a list. */
1011 #define SET_LIST_BIT(c) \
1012 (b[((unsigned char) (c)) / BYTEWIDTH] \
1013 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1014
1015
1016 /* Get the next unsigned number in the uncompiled pattern. */
1017 #define GET_UNSIGNED_NUMBER(num) \
1018 { if (p != pend) \
1019 { \
1020 PATFETCH (c); \
1021 while (ISDIGIT (c)) \
1022 { \
1023 if (num < 0) \
1024 num = 0; \
1025 num = num * 10 + c - '0'; \
1026 if (p == pend) \
1027 break; \
1028 PATFETCH (c); \
1029 } \
1030 } \
1031 }
1032
1033 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1034
1035 #define IS_CHAR_CLASS(string) \
1036 (STREQ (string, "alpha") || STREQ (string, "upper") \
1037 || STREQ (string, "lower") || STREQ (string, "digit") \
1038 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1039 || STREQ (string, "space") || STREQ (string, "print") \
1040 || STREQ (string, "punct") || STREQ (string, "graph") \
1041 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1042 \f
1043 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1044 Returns one of error codes defined in `regex.h', or zero for success.
1045
1046 Assumes the `allocated' (and perhaps `buffer') and `translate'
1047 fields are set in BUFP on entry.
1048
1049 If it succeeds, results are put in BUFP (if it returns an error, the
1050 contents of BUFP are undefined):
1051 `buffer' is the compiled pattern;
1052 `syntax' is set to SYNTAX;
1053 `used' is set to the length of the compiled pattern;
1054 `fastmap_accurate' is zero;
1055 `re_nsub' is the number of subexpressions in PATTERN;
1056 `not_bol' and `not_eol' are zero;
1057
1058 The `fastmap' and `newline_anchor' fields are neither
1059 examined nor set. */
1060
1061 static reg_errcode_t
1062 regex_compile (pattern, size, syntax, bufp)
1063 const char *pattern;
1064 int size;
1065 reg_syntax_t syntax;
1066 struct re_pattern_buffer *bufp;
1067 {
1068 /* We fetch characters from PATTERN here. Even though PATTERN is
1069 `char *' (i.e., signed), we declare these variables as unsigned, so
1070 they can be reliably used as array indices. */
1071 register unsigned char c, c1;
1072
1073 /* A random tempory spot in PATTERN. */
1074 const char *p1;
1075
1076 /* Points to the end of the buffer, where we should append. */
1077 register unsigned char *b;
1078
1079 /* Keeps track of unclosed groups. */
1080 compile_stack_type compile_stack;
1081
1082 /* Points to the current (ending) position in the pattern. */
1083 const char *p = pattern;
1084 const char *pend = pattern + size;
1085
1086 /* How to translate the characters in the pattern. */
1087 char *translate = bufp->translate;
1088
1089 /* Address of the count-byte of the most recently inserted `exactn'
1090 command. This makes it possible to tell if a new exact-match
1091 character can be added to that command or if the character requires
1092 a new `exactn' command. */
1093 unsigned char *pending_exact = 0;
1094
1095 /* Address of start of the most recently finished expression.
1096 This tells, e.g., postfix * where to find the start of its
1097 operand. Reset at the beginning of groups and alternatives. */
1098 unsigned char *laststart = 0;
1099
1100 /* Address of beginning of regexp, or inside of last group. */
1101 unsigned char *begalt;
1102
1103 /* Place in the uncompiled pattern (i.e., the {) to
1104 which to go back if the interval is invalid. */
1105 const char *beg_interval;
1106
1107 /* Address of the place where a forward jump should go to the end of
1108 the containing expression. Each alternative of an `or' -- except the
1109 last -- ends with a forward jump of this sort. */
1110 unsigned char *fixup_alt_jump = 0;
1111
1112 /* Counts open-groups as they are encountered. Remembered for the
1113 matching close-group on the compile stack, so the same register
1114 number is put in the stop_memory as the start_memory. */
1115 regnum_t regnum = 0;
1116
1117 #ifdef DEBUG
1118 DEBUG_PRINT1 ("\nCompiling pattern: ");
1119 if (debug)
1120 {
1121 unsigned debug_count;
1122
1123 for (debug_count = 0; debug_count < size; debug_count++)
1124 printchar (pattern[debug_count]);
1125 putchar ('\n');
1126 }
1127 #endif /* DEBUG */
1128
1129 /* Initialize the compile stack. */
1130 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1131 if (compile_stack.stack == NULL)
1132 return REG_ESPACE;
1133
1134 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1135 compile_stack.avail = 0;
1136
1137 /* Initialize the pattern buffer. */
1138 bufp->syntax = syntax;
1139 bufp->fastmap_accurate = 0;
1140 bufp->not_bol = bufp->not_eol = 0;
1141
1142 /* Set `used' to zero, so that if we return an error, the pattern
1143 printer (for debugging) will think there's no pattern. We reset it
1144 at the end. */
1145 bufp->used = 0;
1146
1147 /* Always count groups, whether or not bufp->no_sub is set. */
1148 bufp->re_nsub = 0;
1149
1150 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1151 /* Initialize the syntax table. */
1152 init_syntax_once ();
1153 #endif
1154
1155 if (bufp->allocated == 0)
1156 {
1157 if (bufp->buffer)
1158 { /* If zero allocated, but buffer is non-null, try to realloc
1159 enough space. This loses if buffer's address is bogus, but
1160 that is the user's responsibility. */
1161 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1162 }
1163 else
1164 { /* Caller did not allocate a buffer. Do it for them. */
1165 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1166 }
1167 if (!bufp->buffer) return REG_ESPACE;
1168
1169 bufp->allocated = INIT_BUF_SIZE;
1170 }
1171
1172 begalt = b = bufp->buffer;
1173
1174 /* Loop through the uncompiled pattern until we're at the end. */
1175 while (p != pend)
1176 {
1177 PATFETCH (c);
1178
1179 switch (c)
1180 {
1181 case '^':
1182 {
1183 if ( /* If at start of pattern, it's an operator. */
1184 p == pattern + 1
1185 /* If context independent, it's an operator. */
1186 || syntax & RE_CONTEXT_INDEP_ANCHORS
1187 /* Otherwise, depends on what's come before. */
1188 || at_begline_loc_p (pattern, p, syntax))
1189 BUF_PUSH (begline);
1190 else
1191 goto normal_char;
1192 }
1193 break;
1194
1195
1196 case '$':
1197 {
1198 if ( /* If at end of pattern, it's an operator. */
1199 p == pend
1200 /* If context independent, it's an operator. */
1201 || syntax & RE_CONTEXT_INDEP_ANCHORS
1202 /* Otherwise, depends on what's next. */
1203 || at_endline_loc_p (p, pend, syntax))
1204 BUF_PUSH (endline);
1205 else
1206 goto normal_char;
1207 }
1208 break;
1209
1210
1211 case '+':
1212 case '?':
1213 if ((syntax & RE_BK_PLUS_QM)
1214 || (syntax & RE_LIMITED_OPS))
1215 goto normal_char;
1216 handle_plus:
1217 case '*':
1218 /* If there is no previous pattern... */
1219 if (!laststart)
1220 {
1221 if (syntax & RE_CONTEXT_INVALID_OPS)
1222 return REG_BADRPT;
1223 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1224 goto normal_char;
1225 }
1226
1227 {
1228 /* Are we optimizing this jump? */
1229 boolean keep_string_p = false;
1230
1231 /* 1 means zero (many) matches is allowed. */
1232 char zero_times_ok = 0, many_times_ok = 0;
1233
1234 /* If there is a sequence of repetition chars, collapse it
1235 down to just one (the right one). We can't combine
1236 interval operators with these because of, e.g., `a{2}*',
1237 which should only match an even number of `a's. */
1238
1239 for (;;)
1240 {
1241 zero_times_ok |= c != '+';
1242 many_times_ok |= c != '?';
1243
1244 if (p == pend)
1245 break;
1246
1247 PATFETCH (c);
1248
1249 if (c == '*'
1250 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1251 ;
1252
1253 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1254 {
1255 if (p == pend) return REG_EESCAPE;
1256
1257 PATFETCH (c1);
1258 if (!(c1 == '+' || c1 == '?'))
1259 {
1260 PATUNFETCH;
1261 PATUNFETCH;
1262 break;
1263 }
1264
1265 c = c1;
1266 }
1267 else
1268 {
1269 PATUNFETCH;
1270 break;
1271 }
1272
1273 /* If we get here, we found another repeat character. */
1274 }
1275
1276 /* Star, etc. applied to an empty pattern is equivalent
1277 to an empty pattern. */
1278 if (!laststart)
1279 break;
1280
1281 /* Now we know whether or not zero matches is allowed
1282 and also whether or not two or more matches is allowed. */
1283 if (many_times_ok)
1284 { /* More than one repetition is allowed, so put in at the
1285 end a backward relative jump from `b' to before the next
1286 jump we're going to put in below (which jumps from
1287 laststart to after this jump).
1288
1289 But if we are at the `*' in the exact sequence `.*\n',
1290 insert an unconditional jump backwards to the .,
1291 instead of the beginning of the loop. This way we only
1292 push a failure point once, instead of every time
1293 through the loop. */
1294 assert (p - 1 > pattern);
1295
1296 /* Allocate the space for the jump. */
1297 GET_BUFFER_SPACE (3);
1298
1299 /* We know we are not at the first character of the pattern,
1300 because laststart was nonzero. And we've already
1301 incremented `p', by the way, to be the character after
1302 the `*'. Do we have to do something analogous here
1303 for null bytes, because of RE_DOT_NOT_NULL? */
1304 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1305 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1306 && !(syntax & RE_DOT_NEWLINE))
1307 { /* We have .*\n. */
1308 STORE_JUMP (jump, b, laststart);
1309 keep_string_p = true;
1310 }
1311 else
1312 /* Anything else. */
1313 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1314
1315 /* We've added more stuff to the buffer. */
1316 b += 3;
1317 }
1318
1319 /* On failure, jump from laststart to b + 3, which will be the
1320 end of the buffer after this jump is inserted. */
1321 GET_BUFFER_SPACE (3);
1322 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1323 : on_failure_jump,
1324 laststart, b + 3);
1325 pending_exact = 0;
1326 b += 3;
1327
1328 if (!zero_times_ok)
1329 {
1330 /* At least one repetition is required, so insert a
1331 `dummy_failure_jump' before the initial
1332 `on_failure_jump' instruction of the loop. This
1333 effects a skip over that instruction the first time
1334 we hit that loop. */
1335 GET_BUFFER_SPACE (3);
1336 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1337 b += 3;
1338 }
1339 }
1340 break;
1341
1342
1343 case '.':
1344 laststart = b;
1345 BUF_PUSH (anychar);
1346 break;
1347
1348
1349 case '[':
1350 {
1351 boolean had_char_class = false;
1352
1353 if (p == pend) return REG_EBRACK;
1354
1355 /* Ensure that we have enough space to push a charset: the
1356 opcode, the length count, and the bitset; 34 bytes in all. */
1357 GET_BUFFER_SPACE (34);
1358
1359 laststart = b;
1360
1361 /* We test `*p == '^' twice, instead of using an if
1362 statement, so we only need one BUF_PUSH. */
1363 BUF_PUSH (*p == '^' ? charset_not : charset);
1364 if (*p == '^')
1365 p++;
1366
1367 /* Remember the first position in the bracket expression. */
1368 p1 = p;
1369
1370 /* Push the number of bytes in the bitmap. */
1371 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1372
1373 /* Clear the whole map. */
1374 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1375
1376 /* charset_not matches newline according to a syntax bit. */
1377 if ((re_opcode_t) b[-2] == charset_not
1378 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1379 SET_LIST_BIT ('\n');
1380
1381 /* Read in characters and ranges, setting map bits. */
1382 for (;;)
1383 {
1384 if (p == pend) return REG_EBRACK;
1385
1386 PATFETCH (c);
1387
1388 /* \ might escape characters inside [...] and [^...]. */
1389 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1390 {
1391 if (p == pend) return REG_EESCAPE;
1392
1393 PATFETCH (c1);
1394 SET_LIST_BIT (c1);
1395 continue;
1396 }
1397
1398 /* Could be the end of the bracket expression. If it's
1399 not (i.e., when the bracket expression is `[]' so
1400 far), the ']' character bit gets set way below. */
1401 if (c == ']' && p != p1 + 1)
1402 break;
1403
1404 /* Look ahead to see if it's a range when the last thing
1405 was a character class. */
1406 if (had_char_class && c == '-' && *p != ']')
1407 return REG_ERANGE;
1408
1409 /* Look ahead to see if it's a range when the last thing
1410 was a character: if this is a hyphen not at the
1411 beginning or the end of a list, then it's the range
1412 operator. */
1413 if (c == '-'
1414 && !(p - 2 >= pattern && p[-2] == '[')
1415 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1416 && *p != ']')
1417 {
1418 reg_errcode_t ret
1419 = compile_range (&p, pend, translate, syntax, b);
1420 if (ret != REG_NOERROR) return ret;
1421 }
1422
1423 else if (p[0] == '-' && p[1] != ']')
1424 { /* This handles ranges made up of characters only. */
1425 reg_errcode_t ret;
1426
1427 /* Move past the `-'. */
1428 PATFETCH (c1);
1429
1430 ret = compile_range (&p, pend, translate, syntax, b);
1431 if (ret != REG_NOERROR) return ret;
1432 }
1433
1434 /* See if we're at the beginning of a possible character
1435 class. */
1436
1437 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1438 { /* Leave room for the null. */
1439 char str[CHAR_CLASS_MAX_LENGTH + 1];
1440
1441 PATFETCH (c);
1442 c1 = 0;
1443
1444 /* If pattern is `[[:'. */
1445 if (p == pend) return REG_EBRACK;
1446
1447 for (;;)
1448 {
1449 PATFETCH (c);
1450 if (c == ':' || c == ']' || p == pend
1451 || c1 == CHAR_CLASS_MAX_LENGTH)
1452 break;
1453 str[c1++] = c;
1454 }
1455 str[c1] = '\0';
1456
1457 /* If isn't a word bracketed by `[:' and:`]':
1458 undo the ending character, the letters, and leave
1459 the leading `:' and `[' (but set bits for them). */
1460 if (c == ':' && *p == ']')
1461 {
1462 int ch;
1463 boolean is_alnum = STREQ (str, "alnum");
1464 boolean is_alpha = STREQ (str, "alpha");
1465 boolean is_blank = STREQ (str, "blank");
1466 boolean is_cntrl = STREQ (str, "cntrl");
1467 boolean is_digit = STREQ (str, "digit");
1468 boolean is_graph = STREQ (str, "graph");
1469 boolean is_lower = STREQ (str, "lower");
1470 boolean is_print = STREQ (str, "print");
1471 boolean is_punct = STREQ (str, "punct");
1472 boolean is_space = STREQ (str, "space");
1473 boolean is_upper = STREQ (str, "upper");
1474 boolean is_xdigit = STREQ (str, "xdigit");
1475
1476 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
1477
1478 /* Throw away the ] at the end of the character
1479 class. */
1480 PATFETCH (c);
1481
1482 if (p == pend) return REG_EBRACK;
1483
1484 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1485 {
1486 if ( (is_alnum && ISALNUM (ch))
1487 || (is_alpha && ISALPHA (ch))
1488 || (is_blank && ISBLANK (ch))
1489 || (is_cntrl && ISCNTRL (ch))
1490 || (is_digit && ISDIGIT (ch))
1491 || (is_graph && ISGRAPH (ch))
1492 || (is_lower && ISLOWER (ch))
1493 || (is_print && ISPRINT (ch))
1494 || (is_punct && ISPUNCT (ch))
1495 || (is_space && ISSPACE (ch))
1496 || (is_upper && ISUPPER (ch))
1497 || (is_xdigit && ISXDIGIT (ch)))
1498 SET_LIST_BIT (ch);
1499 }
1500 had_char_class = true;
1501 }
1502 else
1503 {
1504 c1++;
1505 while (c1--)
1506 PATUNFETCH;
1507 SET_LIST_BIT ('[');
1508 SET_LIST_BIT (':');
1509 had_char_class = false;
1510 }
1511 }
1512 else
1513 {
1514 had_char_class = false;
1515 SET_LIST_BIT (c);
1516 }
1517 }
1518
1519 /* Discard any (non)matching list bytes that are all 0 at the
1520 end of the map. Decrease the map-length byte too. */
1521 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1522 b[-1]--;
1523 b += b[-1];
1524 }
1525 break;
1526
1527
1528 case '(':
1529 if (syntax & RE_NO_BK_PARENS)
1530 goto handle_open;
1531 else
1532 goto normal_char;
1533
1534
1535 case ')':
1536 if (syntax & RE_NO_BK_PARENS)
1537 goto handle_close;
1538 else
1539 goto normal_char;
1540
1541
1542 case '\n':
1543 if (syntax & RE_NEWLINE_ALT)
1544 goto handle_alt;
1545 else
1546 goto normal_char;
1547
1548
1549 case '|':
1550 if (syntax & RE_NO_BK_VBAR)
1551 goto handle_alt;
1552 else
1553 goto normal_char;
1554
1555
1556 case '{':
1557 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1558 goto handle_interval;
1559 else
1560 goto normal_char;
1561
1562
1563 case '\\':
1564 if (p == pend) return REG_EESCAPE;
1565
1566 /* Do not translate the character after the \, so that we can
1567 distinguish, e.g., \B from \b, even if we normally would
1568 translate, e.g., B to b. */
1569 PATFETCH_RAW (c);
1570
1571 switch (c)
1572 {
1573 case '(':
1574 if (syntax & RE_NO_BK_PARENS)
1575 goto normal_backslash;
1576
1577 handle_open:
1578 bufp->re_nsub++;
1579 regnum++;
1580
1581 if (COMPILE_STACK_FULL)
1582 {
1583 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1584 compile_stack_elt_t);
1585 if (compile_stack.stack == NULL) return REG_ESPACE;
1586
1587 compile_stack.size <<= 1;
1588 }
1589
1590 /* These are the values to restore when we hit end of this
1591 group. They are all relative offsets, so that if the
1592 whole pattern moves because of realloc, they will still
1593 be valid. */
1594 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1595 COMPILE_STACK_TOP.fixup_alt_jump
1596 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1597 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1598 COMPILE_STACK_TOP.regnum = regnum;
1599
1600 /* We will eventually replace the 0 with the number of
1601 groups inner to this one. But do not push a
1602 start_memory for groups beyond the last one we can
1603 represent in the compiled pattern. */
1604 if (regnum <= MAX_REGNUM)
1605 {
1606 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1607 BUF_PUSH_3 (start_memory, regnum, 0);
1608 }
1609
1610 compile_stack.avail++;
1611
1612 fixup_alt_jump = 0;
1613 laststart = 0;
1614 begalt = b;
1615 break;
1616
1617
1618 case ')':
1619 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1620
1621 if (COMPILE_STACK_EMPTY)
1622 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1623 goto normal_backslash;
1624 else
1625 return REG_ERPAREN;
1626
1627 handle_close:
1628 if (fixup_alt_jump)
1629 { /* Push a dummy failure point at the end of the
1630 alternative for a possible future
1631 `pop_failure_jump' to pop. See comments at
1632 `push_dummy_failure' in `re_match_2'. */
1633 BUF_PUSH (push_dummy_failure);
1634
1635 /* We allocated space for this jump when we assigned
1636 to `fixup_alt_jump', in the `handle_alt' case below. */
1637 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1638 }
1639
1640 /* See similar code for backslashed left paren above. */
1641 if (COMPILE_STACK_EMPTY)
1642 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1643 goto normal_char;
1644 else
1645 return REG_ERPAREN;
1646
1647 /* Since we just checked for an empty stack above, this
1648 ``can't happen''. */
1649 assert (compile_stack.avail != 0);
1650 {
1651 /* We don't just want to restore into `regnum', because
1652 later groups should continue to be numbered higher,
1653 as in `(ab)c(de)' -- the second group is #2. */
1654 regnum_t this_group_regnum;
1655
1656 compile_stack.avail--;
1657 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1658 fixup_alt_jump
1659 = COMPILE_STACK_TOP.fixup_alt_jump
1660 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1661 : 0;
1662 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1663 this_group_regnum = COMPILE_STACK_TOP.regnum;
1664
1665 /* We're at the end of the group, so now we know how many
1666 groups were inside this one. */
1667 if (this_group_regnum <= MAX_REGNUM)
1668 {
1669 unsigned char *inner_group_loc
1670 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1671
1672 *inner_group_loc = regnum - this_group_regnum;
1673 BUF_PUSH_3 (stop_memory, this_group_regnum,
1674 regnum - this_group_regnum);
1675 }
1676 }
1677 break;
1678
1679
1680 case '|': /* `\|'. */
1681 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1682 goto normal_backslash;
1683 handle_alt:
1684 if (syntax & RE_LIMITED_OPS)
1685 goto normal_char;
1686
1687 /* Insert before the previous alternative a jump which
1688 jumps to this alternative if the former fails. */
1689 GET_BUFFER_SPACE (3);
1690 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1691 pending_exact = 0;
1692 b += 3;
1693
1694 /* The alternative before this one has a jump after it
1695 which gets executed if it gets matched. Adjust that
1696 jump so it will jump to this alternative's analogous
1697 jump (put in below, which in turn will jump to the next
1698 (if any) alternative's such jump, etc.). The last such
1699 jump jumps to the correct final destination. A picture:
1700 _____ _____
1701 | | | |
1702 | v | v
1703 a | b | c
1704
1705 If we are at `b', then fixup_alt_jump right now points to a
1706 three-byte space after `a'. We'll put in the jump, set
1707 fixup_alt_jump to right after `b', and leave behind three
1708 bytes which we'll fill in when we get to after `c'. */
1709
1710 if (fixup_alt_jump)
1711 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1712
1713 /* Mark and leave space for a jump after this alternative,
1714 to be filled in later either by next alternative or
1715 when know we're at the end of a series of alternatives. */
1716 fixup_alt_jump = b;
1717 GET_BUFFER_SPACE (3);
1718 b += 3;
1719
1720 laststart = 0;
1721 begalt = b;
1722 break;
1723
1724
1725 case '{':
1726 /* If \{ is a literal. */
1727 if (!(syntax & RE_INTERVALS)
1728 /* If we're at `\{' and it's not the open-interval
1729 operator. */
1730 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1731 || (p - 2 == pattern && p == pend))
1732 goto normal_backslash;
1733
1734 handle_interval:
1735 {
1736 /* If got here, then the syntax allows intervals. */
1737
1738 /* At least (most) this many matches must be made. */
1739 int lower_bound = -1, upper_bound = -1;
1740
1741 beg_interval = p - 1;
1742
1743 if (p == pend)
1744 {
1745 if (syntax & RE_NO_BK_BRACES)
1746 goto unfetch_interval;
1747 else
1748 return REG_EBRACE;
1749 }
1750
1751 GET_UNSIGNED_NUMBER (lower_bound);
1752
1753 if (c == ',')
1754 {
1755 GET_UNSIGNED_NUMBER (upper_bound);
1756 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1757 }
1758 else
1759 /* Interval such as `{1}' => match exactly once. */
1760 upper_bound = lower_bound;
1761
1762 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1763 || lower_bound > upper_bound)
1764 {
1765 if (syntax & RE_NO_BK_BRACES)
1766 goto unfetch_interval;
1767 else
1768 return REG_BADBR;
1769 }
1770
1771 if (!(syntax & RE_NO_BK_BRACES))
1772 {
1773 if (c != '\\') return REG_EBRACE;
1774
1775 PATFETCH (c);
1776 }
1777
1778 if (c != '}')
1779 {
1780 if (syntax & RE_NO_BK_BRACES)
1781 goto unfetch_interval;
1782 else
1783 return REG_BADBR;
1784 }
1785
1786 /* We just parsed a valid interval. */
1787
1788 /* If it's invalid to have no preceding re. */
1789 if (!laststart)
1790 {
1791 if (syntax & RE_CONTEXT_INVALID_OPS)
1792 return REG_BADRPT;
1793 else if (syntax & RE_CONTEXT_INDEP_OPS)
1794 laststart = b;
1795 else
1796 goto unfetch_interval;
1797 }
1798
1799 /* If the upper bound is zero, don't want to succeed at
1800 all; jump from `laststart' to `b + 3', which will be
1801 the end of the buffer after we insert the jump. */
1802 if (upper_bound == 0)
1803 {
1804 GET_BUFFER_SPACE (3);
1805 INSERT_JUMP (jump, laststart, b + 3);
1806 b += 3;
1807 }
1808
1809 /* Otherwise, we have a nontrivial interval. When
1810 we're all done, the pattern will look like:
1811 set_number_at <jump count> <upper bound>
1812 set_number_at <succeed_n count> <lower bound>
1813 succeed_n <after jump addr> <succed_n count>
1814 <body of loop>
1815 jump_n <succeed_n addr> <jump count>
1816 (The upper bound and `jump_n' are omitted if
1817 `upper_bound' is 1, though.) */
1818 else
1819 { /* If the upper bound is > 1, we need to insert
1820 more at the end of the loop. */
1821 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1822
1823 GET_BUFFER_SPACE (nbytes);
1824
1825 /* Initialize lower bound of the `succeed_n', even
1826 though it will be set during matching by its
1827 attendant `set_number_at' (inserted next),
1828 because `re_compile_fastmap' needs to know.
1829 Jump to the `jump_n' we might insert below. */
1830 INSERT_JUMP2 (succeed_n, laststart,
1831 b + 5 + (upper_bound > 1) * 5,
1832 lower_bound);
1833 b += 5;
1834
1835 /* Code to initialize the lower bound. Insert
1836 before the `succeed_n'. The `5' is the last two
1837 bytes of this `set_number_at', plus 3 bytes of
1838 the following `succeed_n'. */
1839 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1840 b += 5;
1841
1842 if (upper_bound > 1)
1843 { /* More than one repetition is allowed, so
1844 append a backward jump to the `succeed_n'
1845 that starts this interval.
1846
1847 When we've reached this during matching,
1848 we'll have matched the interval once, so
1849 jump back only `upper_bound - 1' times. */
1850 STORE_JUMP2 (jump_n, b, laststart + 5,
1851 upper_bound - 1);
1852 b += 5;
1853
1854 /* The location we want to set is the second
1855 parameter of the `jump_n'; that is `b-2' as
1856 an absolute address. `laststart' will be
1857 the `set_number_at' we're about to insert;
1858 `laststart+3' the number to set, the source
1859 for the relative address. But we are
1860 inserting into the middle of the pattern --
1861 so everything is getting moved up by 5.
1862 Conclusion: (b - 2) - (laststart + 3) + 5,
1863 i.e., b - laststart.
1864
1865 We insert this at the beginning of the loop
1866 so that if we fail during matching, we'll
1867 reinitialize the bounds. */
1868 insert_op2 (set_number_at, laststart, b - laststart,
1869 upper_bound - 1, b);
1870 b += 5;
1871 }
1872 }
1873 pending_exact = 0;
1874 beg_interval = NULL;
1875 }
1876 break;
1877
1878 unfetch_interval:
1879 /* If an invalid interval, match the characters as literals. */
1880 assert (beg_interval);
1881 p = beg_interval;
1882 beg_interval = NULL;
1883
1884 /* normal_char and normal_backslash need `c'. */
1885 PATFETCH (c);
1886
1887 if (!(syntax & RE_NO_BK_BRACES))
1888 {
1889 if (p > pattern && p[-1] == '\\')
1890 goto normal_backslash;
1891 }
1892 goto normal_char;
1893
1894 #ifdef emacs
1895 /* There is no way to specify the before_dot and after_dot
1896 operators. rms says this is ok. --karl */
1897 case '=':
1898 BUF_PUSH (at_dot);
1899 break;
1900
1901 case 's':
1902 laststart = b;
1903 PATFETCH (c);
1904 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
1905 break;
1906
1907 case 'S':
1908 laststart = b;
1909 PATFETCH (c);
1910 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
1911 break;
1912 #endif /* emacs */
1913
1914
1915 case 'w':
1916 laststart = b;
1917 BUF_PUSH (wordchar);
1918 break;
1919
1920
1921 case 'W':
1922 laststart = b;
1923 BUF_PUSH (notwordchar);
1924 break;
1925
1926
1927 case '<':
1928 BUF_PUSH (wordbeg);
1929 break;
1930
1931 case '>':
1932 BUF_PUSH (wordend);
1933 break;
1934
1935 case 'b':
1936 BUF_PUSH (wordbound);
1937 break;
1938
1939 case 'B':
1940 BUF_PUSH (notwordbound);
1941 break;
1942
1943 case '`':
1944 BUF_PUSH (begbuf);
1945 break;
1946
1947 case '\'':
1948 BUF_PUSH (endbuf);
1949 break;
1950
1951 case '1': case '2': case '3': case '4': case '5':
1952 case '6': case '7': case '8': case '9':
1953 if (syntax & RE_NO_BK_REFS)
1954 goto normal_char;
1955
1956 c1 = c - '0';
1957
1958 if (c1 > regnum)
1959 return REG_ESUBREG;
1960
1961 /* Can't back reference to a subexpression if inside of it. */
1962 if (group_in_compile_stack (compile_stack, c1))
1963 goto normal_char;
1964
1965 laststart = b;
1966 BUF_PUSH_2 (duplicate, c1);
1967 break;
1968
1969
1970 case '+':
1971 case '?':
1972 if (syntax & RE_BK_PLUS_QM)
1973 goto handle_plus;
1974 else
1975 goto normal_backslash;
1976
1977 default:
1978 normal_backslash:
1979 /* You might think it would be useful for \ to mean
1980 not to translate; but if we don't translate it
1981 it will never match anything. */
1982 c = TRANSLATE (c);
1983 goto normal_char;
1984 }
1985 break;
1986
1987
1988 default:
1989 /* Expects the character in `c'. */
1990 normal_char:
1991 /* If no exactn currently being built. */
1992 if (!pending_exact
1993
1994 /* If last exactn not at current position. */
1995 || pending_exact + *pending_exact + 1 != b
1996
1997 /* We have only one byte following the exactn for the count. */
1998 || *pending_exact == (1 << BYTEWIDTH) - 1
1999
2000 /* If followed by a repetition operator. */
2001 || *p == '*' || *p == '^'
2002 || ((syntax & RE_BK_PLUS_QM)
2003 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2004 : (*p == '+' || *p == '?'))
2005 || ((syntax & RE_INTERVALS)
2006 && ((syntax & RE_NO_BK_BRACES)
2007 ? *p == '{'
2008 : (p[0] == '\\' && p[1] == '{'))))
2009 {
2010 /* Start building a new exactn. */
2011
2012 laststart = b;
2013
2014 BUF_PUSH_2 (exactn, 0);
2015 pending_exact = b - 1;
2016 }
2017
2018 BUF_PUSH (c);
2019 (*pending_exact)++;
2020 break;
2021 } /* switch (c) */
2022 } /* while p != pend */
2023
2024
2025 /* Through the pattern now. */
2026
2027 if (fixup_alt_jump)
2028 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2029
2030 if (!COMPILE_STACK_EMPTY)
2031 return REG_EPAREN;
2032
2033 free (compile_stack.stack);
2034
2035 /* We have succeeded; set the length of the buffer. */
2036 bufp->used = b - bufp->buffer;
2037
2038 #ifdef DEBUG
2039 if (debug)
2040 {
2041 DEBUG_PRINT1 ("\nCompiled pattern: ");
2042 print_compiled_pattern (bufp);
2043 }
2044 #endif /* DEBUG */
2045
2046 return REG_NOERROR;
2047 } /* regex_compile */
2048 \f
2049 /* Subroutines for `regex_compile'. */
2050
2051 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2052
2053 static void
2054 store_op1 (op, loc, arg)
2055 re_opcode_t op;
2056 unsigned char *loc;
2057 int arg;
2058 {
2059 *loc = (unsigned char) op;
2060 STORE_NUMBER (loc + 1, arg);
2061 }
2062
2063
2064 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2065
2066 static void
2067 store_op2 (op, loc, arg1, arg2)
2068 re_opcode_t op;
2069 unsigned char *loc;
2070 int arg1, arg2;
2071 {
2072 *loc = (unsigned char) op;
2073 STORE_NUMBER (loc + 1, arg1);
2074 STORE_NUMBER (loc + 3, arg2);
2075 }
2076
2077
2078 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2079 for OP followed by two-byte integer parameter ARG. */
2080
2081 static void
2082 insert_op1 (op, loc, arg, end)
2083 re_opcode_t op;
2084 unsigned char *loc;
2085 int arg;
2086 unsigned char *end;
2087 {
2088 register unsigned char *pfrom = end;
2089 register unsigned char *pto = end + 3;
2090
2091 while (pfrom != loc)
2092 *--pto = *--pfrom;
2093
2094 store_op1 (op, loc, arg);
2095 }
2096
2097
2098 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2099
2100 static void
2101 insert_op2 (op, loc, arg1, arg2, end)
2102 re_opcode_t op;
2103 unsigned char *loc;
2104 int arg1, arg2;
2105 unsigned char *end;
2106 {
2107 register unsigned char *pfrom = end;
2108 register unsigned char *pto = end + 5;
2109
2110 while (pfrom != loc)
2111 *--pto = *--pfrom;
2112
2113 store_op2 (op, loc, arg1, arg2);
2114 }
2115
2116
2117 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2118 after an alternative or a begin-subexpression. We assume there is at
2119 least one character before the ^. */
2120
2121 static boolean
2122 at_begline_loc_p (pattern, p, syntax)
2123 const char *pattern, *p;
2124 reg_syntax_t syntax;
2125 {
2126 const char *prev = p - 2;
2127 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2128
2129 return
2130 /* After a subexpression? */
2131 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2132 /* After an alternative? */
2133 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2134 }
2135
2136
2137 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2138 at least one character after the $, i.e., `P < PEND'. */
2139
2140 static boolean
2141 at_endline_loc_p (p, pend, syntax)
2142 const char *p, *pend;
2143 int syntax;
2144 {
2145 const char *next = p;
2146 boolean next_backslash = *next == '\\';
2147 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2148
2149 return
2150 /* Before a subexpression? */
2151 (syntax & RE_NO_BK_PARENS ? *next == ')'
2152 : next_backslash && next_next && *next_next == ')')
2153 /* Before an alternative? */
2154 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2155 : next_backslash && next_next && *next_next == '|');
2156 }
2157
2158
2159 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2160 false if it's not. */
2161
2162 static boolean
2163 group_in_compile_stack (compile_stack, regnum)
2164 compile_stack_type compile_stack;
2165 regnum_t regnum;
2166 {
2167 int this_element;
2168
2169 for (this_element = compile_stack.avail - 1;
2170 this_element >= 0;
2171 this_element--)
2172 if (compile_stack.stack[this_element].regnum == regnum)
2173 return true;
2174
2175 return false;
2176 }
2177
2178
2179 /* Read the ending character of a range (in a bracket expression) from the
2180 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2181 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2182 Then we set the translation of all bits between the starting and
2183 ending characters (inclusive) in the compiled pattern B.
2184
2185 Return an error code.
2186
2187 We use these short variable names so we can use the same macros as
2188 `regex_compile' itself. */
2189
2190 static reg_errcode_t
2191 compile_range (p_ptr, pend, translate, syntax, b)
2192 const char **p_ptr, *pend;
2193 char *translate;
2194 reg_syntax_t syntax;
2195 unsigned char *b;
2196 {
2197 unsigned this_char;
2198
2199 const char *p = *p_ptr;
2200
2201 /* Even though the pattern is a signed `char *', we need to fetch into
2202 `unsigned char's. Reason: if the high bit of the pattern character
2203 is set, the range endpoints will be negative if we fetch into a
2204 signed `char *'. */
2205 unsigned char range_end;
2206 unsigned char range_start = p[-2];
2207
2208 if (p == pend)
2209 return REG_ERANGE;
2210
2211 PATFETCH (range_end);
2212
2213 /* Have to increment the pointer into the pattern string, so the
2214 caller isn't still at the ending character. */
2215 (*p_ptr)++;
2216
2217 /* If the start is after the end, the range is empty. */
2218 if (range_start > range_end)
2219 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2220
2221 /* Here we see why `this_char' has to be larger than an `unsigned
2222 char' -- the range is inclusive, so if `range_end' == 0xff
2223 (assuming 8-bit characters), we would otherwise go into an infinite
2224 loop, since all characters <= 0xff. */
2225 for (this_char = range_start; this_char <= range_end; this_char++)
2226 {
2227 SET_LIST_BIT (TRANSLATE (this_char));
2228 }
2229
2230 return REG_NOERROR;
2231 }
2232 \f
2233 /* Failure stack declarations and macros; both re_compile_fastmap and
2234 re_match_2 use a failure stack. These have to be macros because of
2235 REGEX_ALLOCATE. */
2236
2237
2238 /* Number of failure points for which to initially allocate space
2239 when matching. If this number is exceeded, we allocate more
2240 space, so it is not a hard limit. */
2241 #ifndef INIT_FAILURE_ALLOC
2242 #define INIT_FAILURE_ALLOC 5
2243 #endif
2244
2245 /* Roughly the maximum number of failure points on the stack. Would be
2246 exactly that if always used MAX_FAILURE_SPACE each time we failed.
2247 This is a variable only so users of regex can assign to it; we never
2248 change it ourselves. */
2249 int re_max_failures = 2000;
2250
2251 typedef const unsigned char *fail_stack_elt_t;
2252
2253 typedef struct
2254 {
2255 fail_stack_elt_t *stack;
2256 unsigned size;
2257 unsigned avail; /* Offset of next open position. */
2258 } fail_stack_type;
2259
2260 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2261 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2262 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
2263 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2264
2265
2266 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */
2267
2268 #define INIT_FAIL_STACK() \
2269 do { \
2270 fail_stack.stack = (fail_stack_elt_t *) \
2271 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2272 \
2273 if (fail_stack.stack == NULL) \
2274 return -2; \
2275 \
2276 fail_stack.size = INIT_FAILURE_ALLOC; \
2277 fail_stack.avail = 0; \
2278 } while (0)
2279
2280
2281 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2282
2283 Return 1 if succeeds, and 0 if either ran out of memory
2284 allocating space for it or it was already too large.
2285
2286 REGEX_REALLOCATE requires `destination' be declared. */
2287
2288 #define DOUBLE_FAIL_STACK(fail_stack) \
2289 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
2290 ? 0 \
2291 : ((fail_stack).stack = (fail_stack_elt_t *) \
2292 REGEX_REALLOCATE ((fail_stack).stack, \
2293 (fail_stack).size * sizeof (fail_stack_elt_t), \
2294 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2295 \
2296 (fail_stack).stack == NULL \
2297 ? 0 \
2298 : ((fail_stack).size <<= 1, \
2299 1)))
2300
2301
2302 /* Push PATTERN_OP on FAIL_STACK.
2303
2304 Return 1 if was able to do so and 0 if ran out of memory allocating
2305 space to do so. */
2306 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2307 ((FAIL_STACK_FULL () \
2308 && !DOUBLE_FAIL_STACK (fail_stack)) \
2309 ? 0 \
2310 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2311 1))
2312
2313 /* This pushes an item onto the failure stack. Must be a four-byte
2314 value. Assumes the variable `fail_stack'. Probably should only
2315 be called from within `PUSH_FAILURE_POINT'. */
2316 #define PUSH_FAILURE_ITEM(item) \
2317 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2318
2319 /* The complement operation. Assumes `fail_stack' is nonempty. */
2320 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2321
2322 /* Used to omit pushing failure point id's when we're not debugging. */
2323 #ifdef DEBUG
2324 #define DEBUG_PUSH PUSH_FAILURE_ITEM
2325 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2326 #else
2327 #define DEBUG_PUSH(item)
2328 #define DEBUG_POP(item_addr)
2329 #endif
2330
2331
2332 /* Push the information about the state we will need
2333 if we ever fail back to it.
2334
2335 Requires variables fail_stack, regstart, regend, reg_info, and
2336 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
2337 declared.
2338
2339 Does `return FAILURE_CODE' if runs out of memory. */
2340
2341 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2342 do { \
2343 char *destination; \
2344 /* Must be int, so when we don't save any registers, the arithmetic \
2345 of 0 + -1 isn't done as unsigned. */ \
2346 int this_reg; \
2347 \
2348 DEBUG_STATEMENT (failure_id++); \
2349 DEBUG_STATEMENT (nfailure_points_pushed++); \
2350 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2351 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2352 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2353 \
2354 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2355 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2356 \
2357 /* Ensure we have enough space allocated for what we will push. */ \
2358 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2359 { \
2360 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2361 return failure_code; \
2362 \
2363 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2364 (fail_stack).size); \
2365 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2366 } \
2367 \
2368 /* Push the info, starting with the registers. */ \
2369 DEBUG_PRINT1 ("\n"); \
2370 \
2371 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
2372 this_reg++) \
2373 { \
2374 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2375 DEBUG_STATEMENT (num_regs_pushed++); \
2376 \
2377 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2378 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2379 \
2380 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2381 PUSH_FAILURE_ITEM (regend[this_reg]); \
2382 \
2383 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2384 DEBUG_PRINT2 (" match_null=%d", \
2385 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2386 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2387 DEBUG_PRINT2 (" matched_something=%d", \
2388 MATCHED_SOMETHING (reg_info[this_reg])); \
2389 DEBUG_PRINT2 (" ever_matched=%d", \
2390 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2391 DEBUG_PRINT1 ("\n"); \
2392 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2393 } \
2394 \
2395 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2396 PUSH_FAILURE_ITEM (lowest_active_reg); \
2397 \
2398 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2399 PUSH_FAILURE_ITEM (highest_active_reg); \
2400 \
2401 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2402 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2403 PUSH_FAILURE_ITEM (pattern_place); \
2404 \
2405 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2406 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2407 size2); \
2408 DEBUG_PRINT1 ("'\n"); \
2409 PUSH_FAILURE_ITEM (string_place); \
2410 \
2411 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2412 DEBUG_PUSH (failure_id); \
2413 } while (0)
2414
2415 /* This is the number of items that are pushed and popped on the stack
2416 for each register. */
2417 #define NUM_REG_ITEMS 3
2418
2419 /* Individual items aside from the registers. */
2420 #ifdef DEBUG
2421 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2422 #else
2423 #define NUM_NONREG_ITEMS 4
2424 #endif
2425
2426 /* We push at most this many items on the stack. */
2427 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2428
2429 /* We actually push this many items. */
2430 #define NUM_FAILURE_ITEMS \
2431 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2432 + NUM_NONREG_ITEMS)
2433
2434 /* How many items can still be added to the stack without overflowing it. */
2435 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2436
2437
2438 /* Pops what PUSH_FAIL_STACK pushes.
2439
2440 We restore into the parameters, all of which should be lvalues:
2441 STR -- the saved data position.
2442 PAT -- the saved pattern position.
2443 LOW_REG, HIGH_REG -- the highest and lowest active registers.
2444 REGSTART, REGEND -- arrays of string positions.
2445 REG_INFO -- array of information about each subexpression.
2446
2447 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2448 `pend', `string1', `size1', `string2', and `size2'. */
2449
2450 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2451 { \
2452 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2453 int this_reg; \
2454 const unsigned char *string_temp; \
2455 \
2456 assert (!FAIL_STACK_EMPTY ()); \
2457 \
2458 /* Remove failure points and point to how many regs pushed. */ \
2459 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2460 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2461 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2462 \
2463 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2464 \
2465 DEBUG_POP (&failure_id); \
2466 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2467 \
2468 /* If the saved string location is NULL, it came from an \
2469 on_failure_keep_string_jump opcode, and we want to throw away the \
2470 saved NULL, thus retaining our current position in the string. */ \
2471 string_temp = POP_FAILURE_ITEM (); \
2472 if (string_temp != NULL) \
2473 str = (const char *) string_temp; \
2474 \
2475 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2476 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2477 DEBUG_PRINT1 ("'\n"); \
2478 \
2479 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2480 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2481 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2482 \
2483 /* Restore register info. */ \
2484 high_reg = (unsigned) POP_FAILURE_ITEM (); \
2485 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2486 \
2487 low_reg = (unsigned) POP_FAILURE_ITEM (); \
2488 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2489 \
2490 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
2491 { \
2492 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2493 \
2494 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2495 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2496 \
2497 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2498 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2499 \
2500 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2501 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2502 } \
2503 \
2504 DEBUG_STATEMENT (nfailure_points_popped++); \
2505 } /* POP_FAILURE_POINT */
2506 \f
2507 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2508 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2509 characters can start a string that matches the pattern. This fastmap
2510 is used by re_search to skip quickly over impossible starting points.
2511
2512 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2513 area as BUFP->fastmap.
2514
2515 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2516 the pattern buffer.
2517
2518 Returns 0 if we succeed, -2 if an internal error. */
2519
2520 int
2521 re_compile_fastmap (bufp)
2522 struct re_pattern_buffer *bufp;
2523 {
2524 int j, k;
2525 fail_stack_type fail_stack;
2526 #ifndef REGEX_MALLOC
2527 char *destination;
2528 #endif
2529 /* We don't push any register information onto the failure stack. */
2530 unsigned num_regs = 0;
2531
2532 register char *fastmap = bufp->fastmap;
2533 unsigned char *pattern = bufp->buffer;
2534 unsigned long size = bufp->used;
2535 const unsigned char *p = pattern;
2536 register unsigned char *pend = pattern + size;
2537
2538 /* Assume that each path through the pattern can be null until
2539 proven otherwise. We set this false at the bottom of switch
2540 statement, to which we get only if a particular path doesn't
2541 match the empty string. */
2542 boolean path_can_be_null = true;
2543
2544 /* We aren't doing a `succeed_n' to begin with. */
2545 boolean succeed_n_p = false;
2546
2547 assert (fastmap != NULL && p != NULL);
2548
2549 INIT_FAIL_STACK ();
2550 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2551 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2552 bufp->can_be_null = 0;
2553
2554 while (p != pend || !FAIL_STACK_EMPTY ())
2555 {
2556 if (p == pend)
2557 {
2558 bufp->can_be_null |= path_can_be_null;
2559
2560 /* Reset for next path. */
2561 path_can_be_null = true;
2562
2563 p = fail_stack.stack[--fail_stack.avail];
2564 }
2565
2566 /* We should never be about to go beyond the end of the pattern. */
2567 assert (p < pend);
2568
2569 #ifdef SWITCH_ENUM_BUG
2570 switch ((int) ((re_opcode_t) *p++))
2571 #else
2572 switch ((re_opcode_t) *p++)
2573 #endif
2574 {
2575
2576 /* I guess the idea here is to simply not bother with a fastmap
2577 if a backreference is used, since it's too hard to figure out
2578 the fastmap for the corresponding group. Setting
2579 `can_be_null' stops `re_search_2' from using the fastmap, so
2580 that is all we do. */
2581 case duplicate:
2582 bufp->can_be_null = 1;
2583 return 0;
2584
2585
2586 /* Following are the cases which match a character. These end
2587 with `break'. */
2588
2589 case exactn:
2590 fastmap[p[1]] = 1;
2591 break;
2592
2593
2594 case charset:
2595 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2596 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2597 fastmap[j] = 1;
2598 break;
2599
2600
2601 case charset_not:
2602 /* Chars beyond end of map must be allowed. */
2603 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2604 fastmap[j] = 1;
2605
2606 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2607 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2608 fastmap[j] = 1;
2609 break;
2610
2611
2612 case wordchar:
2613 for (j = 0; j < (1 << BYTEWIDTH); j++)
2614 if (SYNTAX (j) == Sword)
2615 fastmap[j] = 1;
2616 break;
2617
2618
2619 case notwordchar:
2620 for (j = 0; j < (1 << BYTEWIDTH); j++)
2621 if (SYNTAX (j) != Sword)
2622 fastmap[j] = 1;
2623 break;
2624
2625
2626 case anychar:
2627 /* `.' matches anything ... */
2628 for (j = 0; j < (1 << BYTEWIDTH); j++)
2629 fastmap[j] = 1;
2630
2631 /* ... except perhaps newline. */
2632 if (!(bufp->syntax & RE_DOT_NEWLINE))
2633 fastmap['\n'] = 0;
2634
2635 /* Return if we have already set `can_be_null'; if we have,
2636 then the fastmap is irrelevant. Something's wrong here. */
2637 else if (bufp->can_be_null)
2638 return 0;
2639
2640 /* Otherwise, have to check alternative paths. */
2641 break;
2642
2643
2644 #ifdef emacs
2645 case syntaxspec:
2646 k = *p++;
2647 for (j = 0; j < (1 << BYTEWIDTH); j++)
2648 if (SYNTAX (j) == (enum syntaxcode) k)
2649 fastmap[j] = 1;
2650 break;
2651
2652
2653 case notsyntaxspec:
2654 k = *p++;
2655 for (j = 0; j < (1 << BYTEWIDTH); j++)
2656 if (SYNTAX (j) != (enum syntaxcode) k)
2657 fastmap[j] = 1;
2658 break;
2659
2660
2661 /* All cases after this match the empty string. These end with
2662 `continue'. */
2663
2664
2665 case before_dot:
2666 case at_dot:
2667 case after_dot:
2668 continue;
2669 #endif /* not emacs */
2670
2671
2672 case no_op:
2673 case begline:
2674 case endline:
2675 case begbuf:
2676 case endbuf:
2677 case wordbound:
2678 case notwordbound:
2679 case wordbeg:
2680 case wordend:
2681 case push_dummy_failure:
2682 continue;
2683
2684
2685 case jump_n:
2686 case pop_failure_jump:
2687 case maybe_pop_jump:
2688 case jump:
2689 case jump_past_alt:
2690 case dummy_failure_jump:
2691 EXTRACT_NUMBER_AND_INCR (j, p);
2692 p += j;
2693 if (j > 0)
2694 continue;
2695
2696 /* Jump backward implies we just went through the body of a
2697 loop and matched nothing. Opcode jumped to should be
2698 `on_failure_jump' or `succeed_n'. Just treat it like an
2699 ordinary jump. For a * loop, it has pushed its failure
2700 point already; if so, discard that as redundant. */
2701 if ((re_opcode_t) *p != on_failure_jump
2702 && (re_opcode_t) *p != succeed_n)
2703 continue;
2704
2705 p++;
2706 EXTRACT_NUMBER_AND_INCR (j, p);
2707 p += j;
2708
2709 /* If what's on the stack is where we are now, pop it. */
2710 if (!FAIL_STACK_EMPTY ()
2711 && fail_stack.stack[fail_stack.avail - 1] == p)
2712 fail_stack.avail--;
2713
2714 continue;
2715
2716
2717 case on_failure_jump:
2718 case on_failure_keep_string_jump:
2719 handle_on_failure_jump:
2720 EXTRACT_NUMBER_AND_INCR (j, p);
2721
2722 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2723 end of the pattern. We don't want to push such a point,
2724 since when we restore it above, entering the switch will
2725 increment `p' past the end of the pattern. We don't need
2726 to push such a point since we obviously won't find any more
2727 fastmap entries beyond `pend'. Such a pattern can match
2728 the null string, though. */
2729 if (p + j < pend)
2730 {
2731 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2732 return -2;
2733 }
2734 else
2735 bufp->can_be_null = 1;
2736
2737 if (succeed_n_p)
2738 {
2739 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2740 succeed_n_p = false;
2741 }
2742
2743 continue;
2744
2745
2746 case succeed_n:
2747 /* Get to the number of times to succeed. */
2748 p += 2;
2749
2750 /* Increment p past the n for when k != 0. */
2751 EXTRACT_NUMBER_AND_INCR (k, p);
2752 if (k == 0)
2753 {
2754 p -= 4;
2755 succeed_n_p = true; /* Spaghetti code alert. */
2756 goto handle_on_failure_jump;
2757 }
2758 continue;
2759
2760
2761 case set_number_at:
2762 p += 4;
2763 continue;
2764
2765
2766 case start_memory:
2767 case stop_memory:
2768 p += 2;
2769 continue;
2770
2771
2772 default:
2773 abort (); /* We have listed all the cases. */
2774 } /* switch *p++ */
2775
2776 /* Getting here means we have found the possible starting
2777 characters for one path of the pattern -- and that the empty
2778 string does not match. We need not follow this path further.
2779 Instead, look at the next alternative (remembered on the
2780 stack), or quit if no more. The test at the top of the loop
2781 does these things. */
2782 path_can_be_null = false;
2783 p = pend;
2784 } /* while p */
2785
2786 /* Set `can_be_null' for the last path (also the first path, if the
2787 pattern is empty). */
2788 bufp->can_be_null |= path_can_be_null;
2789 return 0;
2790 } /* re_compile_fastmap */
2791 \f
2792 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2793 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2794 this memory for recording register information. STARTS and ENDS
2795 must be allocated using the malloc library routine, and must each
2796 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2797
2798 If NUM_REGS == 0, then subsequent matches should allocate their own
2799 register data.
2800
2801 Unless this function is called, the first search or match using
2802 PATTERN_BUFFER will allocate its own register data, without
2803 freeing the old data. */
2804
2805 void
2806 re_set_registers (bufp, regs, num_regs, starts, ends)
2807 struct re_pattern_buffer *bufp;
2808 struct re_registers *regs;
2809 unsigned num_regs;
2810 regoff_t *starts, *ends;
2811 {
2812 if (num_regs)
2813 {
2814 bufp->regs_allocated = REGS_REALLOCATE;
2815 regs->num_regs = num_regs;
2816 regs->start = starts;
2817 regs->end = ends;
2818 }
2819 else
2820 {
2821 bufp->regs_allocated = REGS_UNALLOCATED;
2822 regs->num_regs = 0;
2823 regs->start = regs->end = (regoff_t) 0;
2824 }
2825 }
2826 \f
2827 /* Searching routines. */
2828
2829 /* Like re_search_2, below, but only one string is specified, and
2830 doesn't let you say where to stop matching. */
2831
2832 int
2833 re_search (bufp, string, size, startpos, range, regs)
2834 struct re_pattern_buffer *bufp;
2835 const char *string;
2836 int size, startpos, range;
2837 struct re_registers *regs;
2838 {
2839 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
2840 regs, size);
2841 }
2842
2843
2844 /* Using the compiled pattern in BUFP->buffer, first tries to match the
2845 virtual concatenation of STRING1 and STRING2, starting first at index
2846 STARTPOS, then at STARTPOS + 1, and so on.
2847
2848 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2849
2850 RANGE is how far to scan while trying to match. RANGE = 0 means try
2851 only at STARTPOS; in general, the last start tried is STARTPOS +
2852 RANGE.
2853
2854 In REGS, return the indices of the virtual concatenation of STRING1
2855 and STRING2 that matched the entire BUFP->buffer and its contained
2856 subexpressions.
2857
2858 Do not consider matching one past the index STOP in the virtual
2859 concatenation of STRING1 and STRING2.
2860
2861 We return either the position in the strings at which the match was
2862 found, -1 if no match, or -2 if error (such as failure
2863 stack overflow). */
2864
2865 int
2866 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
2867 struct re_pattern_buffer *bufp;
2868 const char *string1, *string2;
2869 int size1, size2;
2870 int startpos;
2871 int range;
2872 struct re_registers *regs;
2873 int stop;
2874 {
2875 int val;
2876 register char *fastmap = bufp->fastmap;
2877 register char *translate = bufp->translate;
2878 int total_size = size1 + size2;
2879 int endpos = startpos + range;
2880
2881 /* Check for out-of-range STARTPOS. */
2882 if (startpos < 0 || startpos > total_size)
2883 return -1;
2884
2885 /* Fix up RANGE if it might eventually take us outside
2886 the virtual concatenation of STRING1 and STRING2. */
2887 if (endpos < -1)
2888 range = -1 - startpos;
2889 else if (endpos > total_size)
2890 range = total_size - startpos;
2891
2892 /* If the search isn't to be a backwards one, don't waste time in a
2893 search for a pattern that must be anchored. */
2894 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2895 {
2896 if (startpos > 0)
2897 return -1;
2898 else
2899 range = 1;
2900 }
2901
2902 /* Update the fastmap now if not correct already. */
2903 if (fastmap && !bufp->fastmap_accurate)
2904 if (re_compile_fastmap (bufp) == -2)
2905 return -2;
2906
2907 /* Loop through the string, looking for a place to start matching. */
2908 for (;;)
2909 {
2910 /* If a fastmap is supplied, skip quickly over characters that
2911 cannot be the start of a match. If the pattern can match the
2912 null string, however, we don't need to skip characters; we want
2913 the first null string. */
2914 if (fastmap && startpos < total_size && !bufp->can_be_null)
2915 {
2916 if (range > 0) /* Searching forwards. */
2917 {
2918 register const char *d;
2919 register int lim = 0;
2920 int irange = range;
2921
2922 if (startpos < size1 && startpos + range >= size1)
2923 lim = range - (size1 - startpos);
2924
2925 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
2926
2927 /* Written out as an if-else to avoid testing `translate'
2928 inside the loop. */
2929 if (translate)
2930 while (range > lim
2931 && !fastmap[(unsigned char) translate[*d++]])
2932 range--;
2933 else
2934 while (range > lim && !fastmap[(unsigned char) *d++])
2935 range--;
2936
2937 startpos += irange - range;
2938 }
2939 else /* Searching backwards. */
2940 {
2941 register char c = (size1 == 0 || startpos >= size1
2942 ? string2[startpos - size1]
2943 : string1[startpos]);
2944
2945 if (!fastmap[(unsigned char) TRANSLATE (c)])
2946 goto advance;
2947 }
2948 }
2949
2950 /* If can't match the null string, and that's all we have left, fail. */
2951 if (range >= 0 && startpos == total_size && fastmap
2952 && !bufp->can_be_null)
2953 return -1;
2954
2955 val = re_match_2 (bufp, string1, size1, string2, size2,
2956 startpos, regs, stop);
2957 if (val >= 0)
2958 return startpos;
2959
2960 if (val == -2)
2961 return -2;
2962
2963 advance:
2964 if (!range)
2965 break;
2966 else if (range > 0)
2967 {
2968 range--;
2969 startpos++;
2970 }
2971 else
2972 {
2973 range++;
2974 startpos--;
2975 }
2976 }
2977 return -1;
2978 } /* re_search_2 */
2979 \f
2980 /* Declarations and macros for re_match_2. */
2981
2982 static int bcmp_translate ();
2983 static boolean alt_match_null_string_p (),
2984 common_op_match_null_string_p (),
2985 group_match_null_string_p ();
2986
2987 /* Structure for per-register (a.k.a. per-group) information.
2988 This must not be longer than one word, because we push this value
2989 onto the failure stack. Other register information, such as the
2990 starting and ending positions (which are addresses), and the list of
2991 inner groups (which is a bits list) are maintained in separate
2992 variables.
2993
2994 We are making a (strictly speaking) nonportable assumption here: that
2995 the compiler will pack our bit fields into something that fits into
2996 the type of `word', i.e., is something that fits into one item on the
2997 failure stack. */
2998 typedef union
2999 {
3000 fail_stack_elt_t word;
3001 struct
3002 {
3003 /* This field is one if this group can match the empty string,
3004 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
3005 #define MATCH_NULL_UNSET_VALUE 3
3006 unsigned match_null_string_p : 2;
3007 unsigned is_active : 1;
3008 unsigned matched_something : 1;
3009 unsigned ever_matched_something : 1;
3010 } bits;
3011 } register_info_type;
3012
3013 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
3014 #define IS_ACTIVE(R) ((R).bits.is_active)
3015 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3016 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
3017
3018
3019 /* Call this when have matched a real character; it sets `matched' flags
3020 for the subexpressions which we are currently inside. Also records
3021 that those subexprs have matched. */
3022 #define SET_REGS_MATCHED() \
3023 do \
3024 { \
3025 unsigned r; \
3026 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3027 { \
3028 MATCHED_SOMETHING (reg_info[r]) \
3029 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3030 = 1; \
3031 } \
3032 } \
3033 while (0)
3034
3035
3036 /* This converts PTR, a pointer into one of the search strings `string1'
3037 and `string2' into an offset from the beginning of that string. */
3038 #define POINTER_TO_OFFSET(ptr) \
3039 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3040
3041 /* Registers are set to a sentinel when they haven't yet matched. */
3042 #define REG_UNSET_VALUE ((char *) -1)
3043 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3044
3045
3046 /* Macros for dealing with the split strings in re_match_2. */
3047
3048 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3049
3050 /* Call before fetching a character with *d. This switches over to
3051 string2 if necessary. */
3052 #define PREFETCH() \
3053 while (d == dend) \
3054 { \
3055 /* End of string2 => fail. */ \
3056 if (dend == end_match_2) \
3057 goto fail; \
3058 /* End of string1 => advance to string2. */ \
3059 d = string2; \
3060 dend = end_match_2; \
3061 }
3062
3063
3064 /* Test if at very beginning or at very end of the virtual concatenation
3065 of `string1' and `string2'. If only one string, it's `string2'. */
3066 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3067 #define AT_STRINGS_END(d) ((d) == end2)
3068
3069
3070 /* Test if D points to a character which is word-constituent. We have
3071 two special cases to check for: if past the end of string1, look at
3072 the first character in string2; and if before the beginning of
3073 string2, look at the last character in string1. */
3074 #define WORDCHAR_P(d) \
3075 (SYNTAX ((d) == end1 ? *string2 \
3076 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3077 == Sword)
3078
3079 /* Test if the character before D and the one at D differ with respect
3080 to being word-constituent. */
3081 #define AT_WORD_BOUNDARY(d) \
3082 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3083 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3084
3085
3086 /* Free everything we malloc. */
3087 #ifdef REGEX_MALLOC
3088 #define FREE_VAR(var) if (var) free (var); var = NULL
3089 #define FREE_VARIABLES() \
3090 do { \
3091 FREE_VAR (fail_stack.stack); \
3092 FREE_VAR (regstart); \
3093 FREE_VAR (regend); \
3094 FREE_VAR (old_regstart); \
3095 FREE_VAR (old_regend); \
3096 FREE_VAR (best_regstart); \
3097 FREE_VAR (best_regend); \
3098 FREE_VAR (reg_info); \
3099 FREE_VAR (reg_dummy); \
3100 FREE_VAR (reg_info_dummy); \
3101 } while (0)
3102 #else /* not REGEX_MALLOC */
3103 /* Some MIPS systems (at least) want this to free alloca'd storage. */
3104 #define FREE_VARIABLES() alloca (0)
3105 #endif /* not REGEX_MALLOC */
3106
3107
3108 /* These values must meet several constraints. They must not be valid
3109 register values; since we have a limit of 255 registers (because
3110 we use only one byte in the pattern for the register number), we can
3111 use numbers larger than 255. They must differ by 1, because of
3112 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3113 be larger than the value for the highest register, so we do not try
3114 to actually save any registers when none are active. */
3115 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3116 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3117 \f
3118 /* Matching routines. */
3119
3120 #ifndef emacs /* Emacs never uses this. */
3121 /* re_match is like re_match_2 except it takes only a single string. */
3122
3123 int
3124 re_match (bufp, string, size, pos, regs)
3125 struct re_pattern_buffer *bufp;
3126 const char *string;
3127 int size, pos;
3128 struct re_registers *regs;
3129 {
3130 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
3131 }
3132 #endif /* not emacs */
3133
3134
3135 /* re_match_2 matches the compiled pattern in BUFP against the
3136 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3137 and SIZE2, respectively). We start matching at POS, and stop
3138 matching at STOP.
3139
3140 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3141 store offsets for the substring each group matched in REGS. See the
3142 documentation for exactly how many groups we fill.
3143
3144 We return -1 if no match, -2 if an internal error (such as the
3145 failure stack overflowing). Otherwise, we return the length of the
3146 matched substring. */
3147
3148 int
3149 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3150 struct re_pattern_buffer *bufp;
3151 const char *string1, *string2;
3152 int size1, size2;
3153 int pos;
3154 struct re_registers *regs;
3155 int stop;
3156 {
3157 /* General temporaries. */
3158 int mcnt;
3159 unsigned char *p1;
3160
3161 /* Just past the end of the corresponding string. */
3162 const char *end1, *end2;
3163
3164 /* Pointers into string1 and string2, just past the last characters in
3165 each to consider matching. */
3166 const char *end_match_1, *end_match_2;
3167
3168 /* Where we are in the data, and the end of the current string. */
3169 const char *d, *dend;
3170
3171 /* Where we are in the pattern, and the end of the pattern. */
3172 unsigned char *p = bufp->buffer;
3173 register unsigned char *pend = p + bufp->used;
3174
3175 /* We use this to map every character in the string. */
3176 char *translate = bufp->translate;
3177
3178 /* Failure point stack. Each place that can handle a failure further
3179 down the line pushes a failure point on this stack. It consists of
3180 restart, regend, and reg_info for all registers corresponding to
3181 the subexpressions we're currently inside, plus the number of such
3182 registers, and, finally, two char *'s. The first char * is where
3183 to resume scanning the pattern; the second one is where to resume
3184 scanning the strings. If the latter is zero, the failure point is
3185 a ``dummy''; if a failure happens and the failure point is a dummy,
3186 it gets discarded and the next next one is tried. */
3187 fail_stack_type fail_stack;
3188 #ifdef DEBUG
3189 static unsigned failure_id = 0;
3190 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3191 #endif
3192
3193 /* We fill all the registers internally, independent of what we
3194 return, for use in backreferences. The number here includes
3195 an element for register zero. */
3196 unsigned num_regs = bufp->re_nsub + 1;
3197
3198 /* The currently active registers. */
3199 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3200 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3201
3202 /* Information on the contents of registers. These are pointers into
3203 the input strings; they record just what was matched (on this
3204 attempt) by a subexpression part of the pattern, that is, the
3205 regnum-th regstart pointer points to where in the pattern we began
3206 matching and the regnum-th regend points to right after where we
3207 stopped matching the regnum-th subexpression. (The zeroth register
3208 keeps track of what the whole pattern matches.) */
3209 const char **regstart, **regend;
3210
3211 /* If a group that's operated upon by a repetition operator fails to
3212 match anything, then the register for its start will need to be
3213 restored because it will have been set to wherever in the string we
3214 are when we last see its open-group operator. Similarly for a
3215 register's end. */
3216 const char **old_regstart, **old_regend;
3217
3218 /* The is_active field of reg_info helps us keep track of which (possibly
3219 nested) subexpressions we are currently in. The matched_something
3220 field of reg_info[reg_num] helps us tell whether or not we have
3221 matched any of the pattern so far this time through the reg_num-th
3222 subexpression. These two fields get reset each time through any
3223 loop their register is in. */
3224 register_info_type *reg_info;
3225
3226 /* The following record the register info as found in the above
3227 variables when we find a match better than any we've seen before.
3228 This happens as we backtrack through the failure points, which in
3229 turn happens only if we have not yet matched the entire string. */
3230 unsigned best_regs_set = false;
3231 const char **best_regstart, **best_regend;
3232
3233 /* Logically, this is `best_regend[0]'. But we don't want to have to
3234 allocate space for that if we're not allocating space for anything
3235 else (see below). Also, we never need info about register 0 for
3236 any of the other register vectors, and it seems rather a kludge to
3237 treat `best_regend' differently than the rest. So we keep track of
3238 the end of the best match so far in a separate variable. We
3239 initialize this to NULL so that when we backtrack the first time
3240 and need to test it, it's not garbage. */
3241 const char *match_end = NULL;
3242
3243 /* Used when we pop values we don't care about. */
3244 const char **reg_dummy;
3245 register_info_type *reg_info_dummy;
3246
3247 #ifdef DEBUG
3248 /* Counts the total number of registers pushed. */
3249 unsigned num_regs_pushed = 0;
3250 #endif
3251
3252 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3253
3254 INIT_FAIL_STACK ();
3255
3256 /* Do not bother to initialize all the register variables if there are
3257 no groups in the pattern, as it takes a fair amount of time. If
3258 there are groups, we include space for register 0 (the whole
3259 pattern), even though we never use it, since it simplifies the
3260 array indexing. We should fix this. */
3261 if (bufp->re_nsub)
3262 {
3263 regstart = REGEX_TALLOC (num_regs, const char *);
3264 regend = REGEX_TALLOC (num_regs, const char *);
3265 old_regstart = REGEX_TALLOC (num_regs, const char *);
3266 old_regend = REGEX_TALLOC (num_regs, const char *);
3267 best_regstart = REGEX_TALLOC (num_regs, const char *);
3268 best_regend = REGEX_TALLOC (num_regs, const char *);
3269 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3270 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3271 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3272
3273 if (!(regstart && regend && old_regstart && old_regend && reg_info
3274 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3275 {
3276 FREE_VARIABLES ();
3277 return -2;
3278 }
3279 }
3280 #ifdef REGEX_MALLOC
3281 else
3282 {
3283 /* We must initialize all our variables to NULL, so that
3284 `FREE_VARIABLES' doesn't try to free them. */
3285 regstart = regend = old_regstart = old_regend = best_regstart
3286 = best_regend = reg_dummy = NULL;
3287 reg_info = reg_info_dummy = (register_info_type *) NULL;
3288 }
3289 #endif /* REGEX_MALLOC */
3290
3291 /* The starting position is bogus. */
3292 if (pos < 0 || pos > size1 + size2)
3293 {
3294 FREE_VARIABLES ();
3295 return -1;
3296 }
3297
3298 /* Initialize subexpression text positions to -1 to mark ones that no
3299 start_memory/stop_memory has been seen for. Also initialize the
3300 register information struct. */
3301 for (mcnt = 1; mcnt < num_regs; mcnt++)
3302 {
3303 regstart[mcnt] = regend[mcnt]
3304 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3305
3306 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3307 IS_ACTIVE (reg_info[mcnt]) = 0;
3308 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3309 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3310 }
3311
3312 /* We move `string1' into `string2' if the latter's empty -- but not if
3313 `string1' is null. */
3314 if (size2 == 0 && string1 != NULL)
3315 {
3316 string2 = string1;
3317 size2 = size1;
3318 string1 = 0;
3319 size1 = 0;
3320 }
3321 end1 = string1 + size1;
3322 end2 = string2 + size2;
3323
3324 /* Compute where to stop matching, within the two strings. */
3325 if (stop <= size1)
3326 {
3327 end_match_1 = string1 + stop;
3328 end_match_2 = string2;
3329 }
3330 else
3331 {
3332 end_match_1 = end1;
3333 end_match_2 = string2 + stop - size1;
3334 }
3335
3336 /* `p' scans through the pattern as `d' scans through the data.
3337 `dend' is the end of the input string that `d' points within. `d'
3338 is advanced into the following input string whenever necessary, but
3339 this happens before fetching; therefore, at the beginning of the
3340 loop, `d' can be pointing at the end of a string, but it cannot
3341 equal `string2'. */
3342 if (size1 > 0 && pos <= size1)
3343 {
3344 d = string1 + pos;
3345 dend = end_match_1;
3346 }
3347 else
3348 {
3349 d = string2 + pos - size1;
3350 dend = end_match_2;
3351 }
3352
3353 DEBUG_PRINT1 ("The compiled pattern is: ");
3354 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3355 DEBUG_PRINT1 ("The string to match is: `");
3356 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3357 DEBUG_PRINT1 ("'\n");
3358
3359 /* This loops over pattern commands. It exits by returning from the
3360 function if the match is complete, or it drops through if the match
3361 fails at this starting point in the input data. */
3362 for (;;)
3363 {
3364 DEBUG_PRINT2 ("\n0x%x: ", p);
3365
3366 if (p == pend)
3367 { /* End of pattern means we might have succeeded. */
3368 DEBUG_PRINT1 ("end of pattern ... ");
3369
3370 /* If we haven't matched the entire string, and we want the
3371 longest match, try backtracking. */
3372 if (d != end_match_2)
3373 {
3374 DEBUG_PRINT1 ("backtracking.\n");
3375
3376 if (!FAIL_STACK_EMPTY ())
3377 { /* More failure points to try. */
3378 boolean same_str_p = (FIRST_STRING_P (match_end)
3379 == MATCHING_IN_FIRST_STRING);
3380
3381 /* If exceeds best match so far, save it. */
3382 if (!best_regs_set
3383 || (same_str_p && d > match_end)
3384 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3385 {
3386 best_regs_set = true;
3387 match_end = d;
3388
3389 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3390
3391 for (mcnt = 1; mcnt < num_regs; mcnt++)
3392 {
3393 best_regstart[mcnt] = regstart[mcnt];
3394 best_regend[mcnt] = regend[mcnt];
3395 }
3396 }
3397 goto fail;
3398 }
3399
3400 /* If no failure points, don't restore garbage. */
3401 else if (best_regs_set)
3402 {
3403 restore_best_regs:
3404 /* Restore best match. It may happen that `dend ==
3405 end_match_1' while the restored d is in string2.
3406 For example, the pattern `x.*y.*z' against the
3407 strings `x-' and `y-z-', if the two strings are
3408 not consecutive in memory. */
3409 DEBUG_PRINT1 ("Restoring best registers.\n");
3410
3411 d = match_end;
3412 dend = ((d >= string1 && d <= end1)
3413 ? end_match_1 : end_match_2);
3414
3415 for (mcnt = 1; mcnt < num_regs; mcnt++)
3416 {
3417 regstart[mcnt] = best_regstart[mcnt];
3418 regend[mcnt] = best_regend[mcnt];
3419 }
3420 }
3421 } /* d != end_match_2 */
3422
3423 DEBUG_PRINT1 ("Accepting match.\n");
3424
3425 /* If caller wants register contents data back, do it. */
3426 if (regs && !bufp->no_sub)
3427 {
3428 /* Have the register data arrays been allocated? */
3429 if (bufp->regs_allocated == REGS_UNALLOCATED)
3430 { /* No. So allocate them with malloc. We need one
3431 extra element beyond `num_regs' for the `-1' marker
3432 GNU code uses. */
3433 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3434 regs->start = TALLOC (regs->num_regs, regoff_t);
3435 regs->end = TALLOC (regs->num_regs, regoff_t);
3436 if (regs->start == NULL || regs->end == NULL)
3437 return -2;
3438 bufp->regs_allocated = REGS_REALLOCATE;
3439 }
3440 else if (bufp->regs_allocated == REGS_REALLOCATE)
3441 { /* Yes. If we need more elements than were already
3442 allocated, reallocate them. If we need fewer, just
3443 leave it alone. */
3444 if (regs->num_regs < num_regs + 1)
3445 {
3446 regs->num_regs = num_regs + 1;
3447 RETALLOC (regs->start, regs->num_regs, regoff_t);
3448 RETALLOC (regs->end, regs->num_regs, regoff_t);
3449 if (regs->start == NULL || regs->end == NULL)
3450 return -2;
3451 }
3452 }
3453 else
3454 assert (bufp->regs_allocated == REGS_FIXED);
3455
3456 /* Convert the pointer data in `regstart' and `regend' to
3457 indices. Register zero has to be set differently,
3458 since we haven't kept track of any info for it. */
3459 if (regs->num_regs > 0)
3460 {
3461 regs->start[0] = pos;
3462 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1
3463 : d - string2 + size1);
3464 }
3465
3466 /* Go through the first `min (num_regs, regs->num_regs)'
3467 registers, since that is all we initialized. */
3468 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3469 {
3470 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3471 regs->start[mcnt] = regs->end[mcnt] = -1;
3472 else
3473 {
3474 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]);
3475 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]);
3476 }
3477 }
3478
3479 /* If the regs structure we return has more elements than
3480 were in the pattern, set the extra elements to -1. If
3481 we (re)allocated the registers, this is the case,
3482 because we always allocate enough to have at least one
3483 -1 at the end. */
3484 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3485 regs->start[mcnt] = regs->end[mcnt] = -1;
3486 } /* regs && !bufp->no_sub */
3487
3488 FREE_VARIABLES ();
3489 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3490 nfailure_points_pushed, nfailure_points_popped,
3491 nfailure_points_pushed - nfailure_points_popped);
3492 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3493
3494 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3495 ? string1
3496 : string2 - size1);
3497
3498 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3499
3500 return mcnt;
3501 }
3502
3503 /* Otherwise match next pattern command. */
3504 #ifdef SWITCH_ENUM_BUG
3505 switch ((int) ((re_opcode_t) *p++))
3506 #else
3507 switch ((re_opcode_t) *p++)
3508 #endif
3509 {
3510 /* Ignore these. Used to ignore the n of succeed_n's which
3511 currently have n == 0. */
3512 case no_op:
3513 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3514 break;
3515
3516
3517 /* Match the next n pattern characters exactly. The following
3518 byte in the pattern defines n, and the n bytes after that
3519 are the characters to match. */
3520 case exactn:
3521 mcnt = *p++;
3522 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3523
3524 /* This is written out as an if-else so we don't waste time
3525 testing `translate' inside the loop. */
3526 if (translate)
3527 {
3528 do
3529 {
3530 PREFETCH ();
3531 if (translate[(unsigned char) *d++] != (char) *p++)
3532 goto fail;
3533 }
3534 while (--mcnt);
3535 }
3536 else
3537 {
3538 do
3539 {
3540 PREFETCH ();
3541 if (*d++ != (char) *p++) goto fail;
3542 }
3543 while (--mcnt);
3544 }
3545 SET_REGS_MATCHED ();
3546 break;
3547
3548
3549 /* Match any character except possibly a newline or a null. */
3550 case anychar:
3551 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3552
3553 PREFETCH ();
3554
3555 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3556 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3557 goto fail;
3558
3559 SET_REGS_MATCHED ();
3560 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3561 d++;
3562 break;
3563
3564
3565 case charset:
3566 case charset_not:
3567 {
3568 register unsigned char c;
3569 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3570
3571 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3572
3573 PREFETCH ();
3574 c = TRANSLATE (*d); /* The character to match. */
3575
3576 /* Cast to `unsigned' instead of `unsigned char' in case the
3577 bit list is a full 32 bytes long. */
3578 if (c < (unsigned) (*p * BYTEWIDTH)
3579 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3580 not = !not;
3581
3582 p += 1 + *p;
3583
3584 if (!not) goto fail;
3585
3586 SET_REGS_MATCHED ();
3587 d++;
3588 break;
3589 }
3590
3591
3592 /* The beginning of a group is represented by start_memory.
3593 The arguments are the register number in the next byte, and the
3594 number of groups inner to this one in the next. The text
3595 matched within the group is recorded (in the internal
3596 registers data structure) under the register number. */
3597 case start_memory:
3598 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3599
3600 /* Find out if this group can match the empty string. */
3601 p1 = p; /* To send to group_match_null_string_p. */
3602
3603 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3604 REG_MATCH_NULL_STRING_P (reg_info[*p])
3605 = group_match_null_string_p (&p1, pend, reg_info);
3606
3607 /* Save the position in the string where we were the last time
3608 we were at this open-group operator in case the group is
3609 operated upon by a repetition operator, e.g., with `(a*)*b'
3610 against `ab'; then we want to ignore where we are now in
3611 the string in case this attempt to match fails. */
3612 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3613 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3614 : regstart[*p];
3615 DEBUG_PRINT2 (" old_regstart: %d\n",
3616 POINTER_TO_OFFSET (old_regstart[*p]));
3617
3618 regstart[*p] = d;
3619 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3620
3621 IS_ACTIVE (reg_info[*p]) = 1;
3622 MATCHED_SOMETHING (reg_info[*p]) = 0;
3623
3624 /* This is the new highest active register. */
3625 highest_active_reg = *p;
3626
3627 /* If nothing was active before, this is the new lowest active
3628 register. */
3629 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3630 lowest_active_reg = *p;
3631
3632 /* Move past the register number and inner group count. */
3633 p += 2;
3634 break;
3635
3636
3637 /* The stop_memory opcode represents the end of a group. Its
3638 arguments are the same as start_memory's: the register
3639 number, and the number of inner groups. */
3640 case stop_memory:
3641 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3642
3643 /* We need to save the string position the last time we were at
3644 this close-group operator in case the group is operated
3645 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3646 against `aba'; then we want to ignore where we are now in
3647 the string in case this attempt to match fails. */
3648 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3649 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3650 : regend[*p];
3651 DEBUG_PRINT2 (" old_regend: %d\n",
3652 POINTER_TO_OFFSET (old_regend[*p]));
3653
3654 regend[*p] = d;
3655 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3656
3657 /* This register isn't active anymore. */
3658 IS_ACTIVE (reg_info[*p]) = 0;
3659
3660 /* If this was the only register active, nothing is active
3661 anymore. */
3662 if (lowest_active_reg == highest_active_reg)
3663 {
3664 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3665 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3666 }
3667 else
3668 { /* We must scan for the new highest active register, since
3669 it isn't necessarily one less than now: consider
3670 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3671 new highest active register is 1. */
3672 unsigned char r = *p - 1;
3673 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3674 r--;
3675
3676 /* If we end up at register zero, that means that we saved
3677 the registers as the result of an `on_failure_jump', not
3678 a `start_memory', and we jumped to past the innermost
3679 `stop_memory'. For example, in ((.)*) we save
3680 registers 1 and 2 as a result of the *, but when we pop
3681 back to the second ), we are at the stop_memory 1.
3682 Thus, nothing is active. */
3683 if (r == 0)
3684 {
3685 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3686 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3687 }
3688 else
3689 highest_active_reg = r;
3690 }
3691
3692 /* If just failed to match something this time around with a
3693 group that's operated on by a repetition operator, try to
3694 force exit from the ``loop'', and restore the register
3695 information for this group that we had before trying this
3696 last match. */
3697 if ((!MATCHED_SOMETHING (reg_info[*p])
3698 || (re_opcode_t) p[-3] == start_memory)
3699 && (p + 2) < pend)
3700 {
3701 boolean is_a_jump_n = false;
3702
3703 p1 = p + 2;
3704 mcnt = 0;
3705 switch ((re_opcode_t) *p1++)
3706 {
3707 case jump_n:
3708 is_a_jump_n = true;
3709 case pop_failure_jump:
3710 case maybe_pop_jump:
3711 case jump:
3712 case dummy_failure_jump:
3713 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3714 if (is_a_jump_n)
3715 p1 += 2;
3716 break;
3717
3718 default:
3719 /* do nothing */ ;
3720 }
3721 p1 += mcnt;
3722
3723 /* If the next operation is a jump backwards in the pattern
3724 to an on_failure_jump right before the start_memory
3725 corresponding to this stop_memory, exit from the loop
3726 by forcing a failure after pushing on the stack the
3727 on_failure_jump's jump in the pattern, and d. */
3728 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3729 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3730 {
3731 /* If this group ever matched anything, then restore
3732 what its registers were before trying this last
3733 failed match, e.g., with `(a*)*b' against `ab' for
3734 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3735 against `aba' for regend[3].
3736
3737 Also restore the registers for inner groups for,
3738 e.g., `((a*)(b*))*' against `aba' (register 3 would
3739 otherwise get trashed). */
3740
3741 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3742 {
3743 unsigned r;
3744
3745 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3746
3747 /* Restore this and inner groups' (if any) registers. */
3748 for (r = *p; r < *p + *(p + 1); r++)
3749 {
3750 regstart[r] = old_regstart[r];
3751
3752 /* xx why this test? */
3753 if ((int) old_regend[r] >= (int) regstart[r])
3754 regend[r] = old_regend[r];
3755 }
3756 }
3757 p1++;
3758 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3759 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3760
3761 goto fail;
3762 }
3763 }
3764
3765 /* Move past the register number and the inner group count. */
3766 p += 2;
3767 break;
3768
3769
3770 /* \<digit> has been turned into a `duplicate' command which is
3771 followed by the numeric value of <digit> as the register number. */
3772 case duplicate:
3773 {
3774 register const char *d2, *dend2;
3775 int regno = *p++; /* Get which register to match against. */
3776 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3777
3778 /* Can't back reference a group which we've never matched. */
3779 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3780 goto fail;
3781
3782 /* Where in input to try to start matching. */
3783 d2 = regstart[regno];
3784
3785 /* Where to stop matching; if both the place to start and
3786 the place to stop matching are in the same string, then
3787 set to the place to stop, otherwise, for now have to use
3788 the end of the first string. */
3789
3790 dend2 = ((FIRST_STRING_P (regstart[regno])
3791 == FIRST_STRING_P (regend[regno]))
3792 ? regend[regno] : end_match_1);
3793 for (;;)
3794 {
3795 /* If necessary, advance to next segment in register
3796 contents. */
3797 while (d2 == dend2)
3798 {
3799 if (dend2 == end_match_2) break;
3800 if (dend2 == regend[regno]) break;
3801
3802 /* End of string1 => advance to string2. */
3803 d2 = string2;
3804 dend2 = regend[regno];
3805 }
3806 /* At end of register contents => success */
3807 if (d2 == dend2) break;
3808
3809 /* If necessary, advance to next segment in data. */
3810 PREFETCH ();
3811
3812 /* How many characters left in this segment to match. */
3813 mcnt = dend - d;
3814
3815 /* Want how many consecutive characters we can match in
3816 one shot, so, if necessary, adjust the count. */
3817 if (mcnt > dend2 - d2)
3818 mcnt = dend2 - d2;
3819
3820 /* Compare that many; failure if mismatch, else move
3821 past them. */
3822 if (translate
3823 ? bcmp_translate (d, d2, mcnt, translate)
3824 : bcmp (d, d2, mcnt))
3825 goto fail;
3826 d += mcnt, d2 += mcnt;
3827 }
3828 }
3829 break;
3830
3831
3832 /* begline matches the empty string at the beginning of the string
3833 (unless `not_bol' is set in `bufp'), and, if
3834 `newline_anchor' is set, after newlines. */
3835 case begline:
3836 DEBUG_PRINT1 ("EXECUTING begline.\n");
3837
3838 if (AT_STRINGS_BEG (d))
3839 {
3840 if (!bufp->not_bol) break;
3841 }
3842 else if (d[-1] == '\n' && bufp->newline_anchor)
3843 {
3844 break;
3845 }
3846 /* In all other cases, we fail. */
3847 goto fail;
3848
3849
3850 /* endline is the dual of begline. */
3851 case endline:
3852 DEBUG_PRINT1 ("EXECUTING endline.\n");
3853
3854 if (AT_STRINGS_END (d))
3855 {
3856 if (!bufp->not_eol) break;
3857 }
3858
3859 /* We have to ``prefetch'' the next character. */
3860 else if ((d == end1 ? *string2 : *d) == '\n'
3861 && bufp->newline_anchor)
3862 {
3863 break;
3864 }
3865 goto fail;
3866
3867
3868 /* Match at the very beginning of the data. */
3869 case begbuf:
3870 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3871 if (AT_STRINGS_BEG (d))
3872 break;
3873 goto fail;
3874
3875
3876 /* Match at the very end of the data. */
3877 case endbuf:
3878 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3879 if (AT_STRINGS_END (d))
3880 break;
3881 goto fail;
3882
3883
3884 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3885 pushes NULL as the value for the string on the stack. Then
3886 `pop_failure_point' will keep the current value for the
3887 string, instead of restoring it. To see why, consider
3888 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3889 then the . fails against the \n. But the next thing we want
3890 to do is match the \n against the \n; if we restored the
3891 string value, we would be back at the foo.
3892
3893 Because this is used only in specific cases, we don't need to
3894 check all the things that `on_failure_jump' does, to make
3895 sure the right things get saved on the stack. Hence we don't
3896 share its code. The only reason to push anything on the
3897 stack at all is that otherwise we would have to change
3898 `anychar's code to do something besides goto fail in this
3899 case; that seems worse than this. */
3900 case on_failure_keep_string_jump:
3901 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
3902
3903 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3904 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
3905
3906 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
3907 break;
3908
3909
3910 /* Uses of on_failure_jump:
3911
3912 Each alternative starts with an on_failure_jump that points
3913 to the beginning of the next alternative. Each alternative
3914 except the last ends with a jump that in effect jumps past
3915 the rest of the alternatives. (They really jump to the
3916 ending jump of the following alternative, because tensioning
3917 these jumps is a hassle.)
3918
3919 Repeats start with an on_failure_jump that points past both
3920 the repetition text and either the following jump or
3921 pop_failure_jump back to this on_failure_jump. */
3922 case on_failure_jump:
3923 on_failure:
3924 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
3925
3926 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3927 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
3928
3929 /* If this on_failure_jump comes right before a group (i.e.,
3930 the original * applied to a group), save the information
3931 for that group and all inner ones, so that if we fail back
3932 to this point, the group's information will be correct.
3933 For example, in \(a*\)*\1, we need the preceding group,
3934 and in \(\(a*\)b*\)\2, we need the inner group. */
3935
3936 /* We can't use `p' to check ahead because we push
3937 a failure point to `p + mcnt' after we do this. */
3938 p1 = p;
3939
3940 /* We need to skip no_op's before we look for the
3941 start_memory in case this on_failure_jump is happening as
3942 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
3943 against aba. */
3944 while (p1 < pend && (re_opcode_t) *p1 == no_op)
3945 p1++;
3946
3947 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
3948 {
3949 /* We have a new highest active register now. This will
3950 get reset at the start_memory we are about to get to,
3951 but we will have saved all the registers relevant to
3952 this repetition op, as described above. */
3953 highest_active_reg = *(p1 + 1) + *(p1 + 2);
3954 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3955 lowest_active_reg = *(p1 + 1);
3956 }
3957
3958 DEBUG_PRINT1 (":\n");
3959 PUSH_FAILURE_POINT (p + mcnt, d, -2);
3960 break;
3961
3962
3963 /* A smart repeat ends with `maybe_pop_jump'.
3964 We change it to either `pop_failure_jump' or `jump'. */
3965 case maybe_pop_jump:
3966 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3967 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
3968 {
3969 register unsigned char *p2 = p;
3970
3971 /* Compare the beginning of the repeat with what in the
3972 pattern follows its end. If we can establish that there
3973 is nothing that they would both match, i.e., that we
3974 would have to backtrack because of (as in, e.g., `a*a')
3975 then we can change to pop_failure_jump, because we'll
3976 never have to backtrack.
3977
3978 This is not true in the case of alternatives: in
3979 `(a|ab)*' we do need to backtrack to the `ab' alternative
3980 (e.g., if the string was `ab'). But instead of trying to
3981 detect that here, the alternative has put on a dummy
3982 failure point which is what we will end up popping. */
3983
3984 /* Skip over open/close-group commands. */
3985 while (p2 + 2 < pend
3986 && ((re_opcode_t) *p2 == stop_memory
3987 || (re_opcode_t) *p2 == start_memory))
3988 p2 += 3; /* Skip over args, too. */
3989
3990 /* If we're at the end of the pattern, we can change. */
3991 if (p2 == pend)
3992 {
3993 /* Consider what happens when matching ":\(.*\)"
3994 against ":/". I don't really understand this code
3995 yet. */
3996 p[-3] = (unsigned char) pop_failure_jump;
3997 DEBUG_PRINT1
3998 (" End of pattern: change to `pop_failure_jump'.\n");
3999 }
4000
4001 else if ((re_opcode_t) *p2 == exactn
4002 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4003 {
4004 register unsigned char c
4005 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4006 p1 = p + mcnt;
4007
4008 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4009 to the `maybe_finalize_jump' of this case. Examine what
4010 follows. */
4011 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4012 {
4013 p[-3] = (unsigned char) pop_failure_jump;
4014 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4015 c, p1[5]);
4016 }
4017
4018 else if ((re_opcode_t) p1[3] == charset
4019 || (re_opcode_t) p1[3] == charset_not)
4020 {
4021 int not = (re_opcode_t) p1[3] == charset_not;
4022
4023 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4024 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4025 not = !not;
4026
4027 /* `not' is equal to 1 if c would match, which means
4028 that we can't change to pop_failure_jump. */
4029 if (!not)
4030 {
4031 p[-3] = (unsigned char) pop_failure_jump;
4032 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4033 }
4034 }
4035 }
4036 }
4037 p -= 2; /* Point at relative address again. */
4038 if ((re_opcode_t) p[-1] != pop_failure_jump)
4039 {
4040 p[-1] = (unsigned char) jump;
4041 DEBUG_PRINT1 (" Match => jump.\n");
4042 goto unconditional_jump;
4043 }
4044 /* Note fall through. */
4045
4046
4047 /* The end of a simple repeat has a pop_failure_jump back to
4048 its matching on_failure_jump, where the latter will push a
4049 failure point. The pop_failure_jump takes off failure
4050 points put on by this pop_failure_jump's matching
4051 on_failure_jump; we got through the pattern to here from the
4052 matching on_failure_jump, so didn't fail. */
4053 case pop_failure_jump:
4054 {
4055 /* We need to pass separate storage for the lowest and
4056 highest registers, even though we don't care about the
4057 actual values. Otherwise, we will restore only one
4058 register from the stack, since lowest will == highest in
4059 `pop_failure_point'. */
4060 unsigned dummy_low_reg, dummy_high_reg;
4061 unsigned char *pdummy;
4062 const char *sdummy;
4063
4064 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4065 POP_FAILURE_POINT (sdummy, pdummy,
4066 dummy_low_reg, dummy_high_reg,
4067 reg_dummy, reg_dummy, reg_info_dummy);
4068 }
4069 /* Note fall through. */
4070
4071
4072 /* Unconditionally jump (without popping any failure points). */
4073 case jump:
4074 unconditional_jump:
4075 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4076 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4077 p += mcnt; /* Do the jump. */
4078 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4079 break;
4080
4081
4082 /* We need this opcode so we can detect where alternatives end
4083 in `group_match_null_string_p' et al. */
4084 case jump_past_alt:
4085 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4086 goto unconditional_jump;
4087
4088
4089 /* Normally, the on_failure_jump pushes a failure point, which
4090 then gets popped at pop_failure_jump. We will end up at
4091 pop_failure_jump, also, and with a pattern of, say, `a+', we
4092 are skipping over the on_failure_jump, so we have to push
4093 something meaningless for pop_failure_jump to pop. */
4094 case dummy_failure_jump:
4095 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4096 /* It doesn't matter what we push for the string here. What
4097 the code at `fail' tests is the value for the pattern. */
4098 PUSH_FAILURE_POINT (0, 0, -2);
4099 goto unconditional_jump;
4100
4101
4102 /* At the end of an alternative, we need to push a dummy failure
4103 point in case we are followed by a `pop_failure_jump', because
4104 we don't want the failure point for the alternative to be
4105 popped. For example, matching `(a|ab)*' against `aab'
4106 requires that we match the `ab' alternative. */
4107 case push_dummy_failure:
4108 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4109 /* See comments just above at `dummy_failure_jump' about the
4110 two zeroes. */
4111 PUSH_FAILURE_POINT (0, 0, -2);
4112 break;
4113
4114 /* Have to succeed matching what follows at least n times.
4115 After that, handle like `on_failure_jump'. */
4116 case succeed_n:
4117 EXTRACT_NUMBER (mcnt, p + 2);
4118 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4119
4120 assert (mcnt >= 0);
4121 /* Originally, this is how many times we HAVE to succeed. */
4122 if (mcnt > 0)
4123 {
4124 mcnt--;
4125 p += 2;
4126 STORE_NUMBER_AND_INCR (p, mcnt);
4127 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4128 }
4129 else if (mcnt == 0)
4130 {
4131 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4132 p[2] = (unsigned char) no_op;
4133 p[3] = (unsigned char) no_op;
4134 goto on_failure;
4135 }
4136 break;
4137
4138 case jump_n:
4139 EXTRACT_NUMBER (mcnt, p + 2);
4140 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4141
4142 /* Originally, this is how many times we CAN jump. */
4143 if (mcnt)
4144 {
4145 mcnt--;
4146 STORE_NUMBER (p + 2, mcnt);
4147 goto unconditional_jump;
4148 }
4149 /* If don't have to jump any more, skip over the rest of command. */
4150 else
4151 p += 4;
4152 break;
4153
4154 case set_number_at:
4155 {
4156 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4157
4158 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4159 p1 = p + mcnt;
4160 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4161 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4162 STORE_NUMBER (p1, mcnt);
4163 break;
4164 }
4165
4166 case wordbound:
4167 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4168 if (AT_WORD_BOUNDARY (d))
4169 break;
4170 goto fail;
4171
4172 case notwordbound:
4173 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4174 if (AT_WORD_BOUNDARY (d))
4175 goto fail;
4176 break;
4177
4178 case wordbeg:
4179 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4180 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4181 break;
4182 goto fail;
4183
4184 case wordend:
4185 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4186 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4187 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4188 break;
4189 goto fail;
4190
4191 #ifdef emacs
4192 #ifdef emacs19
4193 case before_dot:
4194 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4195 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4196 goto fail;
4197 break;
4198
4199 case at_dot:
4200 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4201 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4202 goto fail;
4203 break;
4204
4205 case after_dot:
4206 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4207 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4208 goto fail;
4209 break;
4210 #else /* not emacs19 */
4211 case at_dot:
4212 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4213 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4214 goto fail;
4215 break;
4216 #endif /* not emacs19 */
4217
4218 case syntaxspec:
4219 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4220 mcnt = *p++;
4221 goto matchsyntax;
4222
4223 case wordchar:
4224 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4225 mcnt = (int) Sword;
4226 matchsyntax:
4227 PREFETCH ();
4228 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4229 goto fail;
4230 SET_REGS_MATCHED ();
4231 break;
4232
4233 case notsyntaxspec:
4234 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4235 mcnt = *p++;
4236 goto matchnotsyntax;
4237
4238 case notwordchar:
4239 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4240 mcnt = (int) Sword;
4241 matchnotsyntax:
4242 PREFETCH ();
4243 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4244 goto fail;
4245 SET_REGS_MATCHED ();
4246 break;
4247
4248 #else /* not emacs */
4249 case wordchar:
4250 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4251 PREFETCH ();
4252 if (!WORDCHAR_P (d))
4253 goto fail;
4254 SET_REGS_MATCHED ();
4255 d++;
4256 break;
4257
4258 case notwordchar:
4259 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4260 PREFETCH ();
4261 if (WORDCHAR_P (d))
4262 goto fail;
4263 SET_REGS_MATCHED ();
4264 d++;
4265 break;
4266 #endif /* not emacs */
4267
4268 default:
4269 abort ();
4270 }
4271 continue; /* Successfully executed one pattern command; keep going. */
4272
4273
4274 /* We goto here if a matching operation fails. */
4275 fail:
4276 if (!FAIL_STACK_EMPTY ())
4277 { /* A restart point is known. Restore to that state. */
4278 DEBUG_PRINT1 ("\nFAIL:\n");
4279 POP_FAILURE_POINT (d, p,
4280 lowest_active_reg, highest_active_reg,
4281 regstart, regend, reg_info);
4282
4283 /* If this failure point is a dummy, try the next one. */
4284 if (!p)
4285 goto fail;
4286
4287 /* If we failed to the end of the pattern, don't examine *p. */
4288 assert (p <= pend);
4289 if (p < pend)
4290 {
4291 boolean is_a_jump_n = false;
4292
4293 /* If failed to a backwards jump that's part of a repetition
4294 loop, need to pop this failure point and use the next one. */
4295 switch ((re_opcode_t) *p)
4296 {
4297 case jump_n:
4298 is_a_jump_n = true;
4299 case maybe_pop_jump:
4300 case pop_failure_jump:
4301 case jump:
4302 p1 = p + 1;
4303 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4304 p1 += mcnt;
4305
4306 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4307 || (!is_a_jump_n
4308 && (re_opcode_t) *p1 == on_failure_jump))
4309 goto fail;
4310 break;
4311 default:
4312 /* do nothing */ ;
4313 }
4314 }
4315
4316 if (d >= string1 && d <= end1)
4317 dend = end_match_1;
4318 }
4319 else
4320 break; /* Matching at this starting point really fails. */
4321 } /* for (;;) */
4322
4323 if (best_regs_set)
4324 goto restore_best_regs;
4325
4326 FREE_VARIABLES ();
4327
4328 return -1; /* Failure to match. */
4329 } /* re_match_2 */
4330 \f
4331 /* Subroutine definitions for re_match_2. */
4332
4333
4334 /* We are passed P pointing to a register number after a start_memory.
4335
4336 Return true if the pattern up to the corresponding stop_memory can
4337 match the empty string, and false otherwise.
4338
4339 If we find the matching stop_memory, sets P to point to one past its number.
4340 Otherwise, sets P to an undefined byte less than or equal to END.
4341
4342 We don't handle duplicates properly (yet). */
4343
4344 static boolean
4345 group_match_null_string_p (p, end, reg_info)
4346 unsigned char **p, *end;
4347 register_info_type *reg_info;
4348 {
4349 int mcnt;
4350 /* Point to after the args to the start_memory. */
4351 unsigned char *p1 = *p + 2;
4352
4353 while (p1 < end)
4354 {
4355 /* Skip over opcodes that can match nothing, and return true or
4356 false, as appropriate, when we get to one that can't, or to the
4357 matching stop_memory. */
4358
4359 switch ((re_opcode_t) *p1)
4360 {
4361 /* Could be either a loop or a series of alternatives. */
4362 case on_failure_jump:
4363 p1++;
4364 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4365
4366 /* If the next operation is not a jump backwards in the
4367 pattern. */
4368
4369 if (mcnt >= 0)
4370 {
4371 /* Go through the on_failure_jumps of the alternatives,
4372 seeing if any of the alternatives cannot match nothing.
4373 The last alternative starts with only a jump,
4374 whereas the rest start with on_failure_jump and end
4375 with a jump, e.g., here is the pattern for `a|b|c':
4376
4377 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4378 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4379 /exactn/1/c
4380
4381 So, we have to first go through the first (n-1)
4382 alternatives and then deal with the last one separately. */
4383
4384
4385 /* Deal with the first (n-1) alternatives, which start
4386 with an on_failure_jump (see above) that jumps to right
4387 past a jump_past_alt. */
4388
4389 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4390 {
4391 /* `mcnt' holds how many bytes long the alternative
4392 is, including the ending `jump_past_alt' and
4393 its number. */
4394
4395 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4396 reg_info))
4397 return false;
4398
4399 /* Move to right after this alternative, including the
4400 jump_past_alt. */
4401 p1 += mcnt;
4402
4403 /* Break if it's the beginning of an n-th alternative
4404 that doesn't begin with an on_failure_jump. */
4405 if ((re_opcode_t) *p1 != on_failure_jump)
4406 break;
4407
4408 /* Still have to check that it's not an n-th
4409 alternative that starts with an on_failure_jump. */
4410 p1++;
4411 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4412 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4413 {
4414 /* Get to the beginning of the n-th alternative. */
4415 p1 -= 3;
4416 break;
4417 }
4418 }
4419
4420 /* Deal with the last alternative: go back and get number
4421 of the `jump_past_alt' just before it. `mcnt' contains
4422 the length of the alternative. */
4423 EXTRACT_NUMBER (mcnt, p1 - 2);
4424
4425 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4426 return false;
4427
4428 p1 += mcnt; /* Get past the n-th alternative. */
4429 } /* if mcnt > 0 */
4430 break;
4431
4432
4433 case stop_memory:
4434 assert (p1[1] == **p);
4435 *p = p1 + 2;
4436 return true;
4437
4438
4439 default:
4440 if (!common_op_match_null_string_p (&p1, end, reg_info))
4441 return false;
4442 }
4443 } /* while p1 < end */
4444
4445 return false;
4446 } /* group_match_null_string_p */
4447
4448
4449 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4450 It expects P to be the first byte of a single alternative and END one
4451 byte past the last. The alternative can contain groups. */
4452
4453 static boolean
4454 alt_match_null_string_p (p, end, reg_info)
4455 unsigned char *p, *end;
4456 register_info_type *reg_info;
4457 {
4458 int mcnt;
4459 unsigned char *p1 = p;
4460
4461 while (p1 < end)
4462 {
4463 /* Skip over opcodes that can match nothing, and break when we get
4464 to one that can't. */
4465
4466 switch ((re_opcode_t) *p1)
4467 {
4468 /* It's a loop. */
4469 case on_failure_jump:
4470 p1++;
4471 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4472 p1 += mcnt;
4473 break;
4474
4475 default:
4476 if (!common_op_match_null_string_p (&p1, end, reg_info))
4477 return false;
4478 }
4479 } /* while p1 < end */
4480
4481 return true;
4482 } /* alt_match_null_string_p */
4483
4484
4485 /* Deals with the ops common to group_match_null_string_p and
4486 alt_match_null_string_p.
4487
4488 Sets P to one after the op and its arguments, if any. */
4489
4490 static boolean
4491 common_op_match_null_string_p (p, end, reg_info)
4492 unsigned char **p, *end;
4493 register_info_type *reg_info;
4494 {
4495 int mcnt;
4496 boolean ret;
4497 int reg_no;
4498 unsigned char *p1 = *p;
4499
4500 switch ((re_opcode_t) *p1++)
4501 {
4502 case no_op:
4503 case begline:
4504 case endline:
4505 case begbuf:
4506 case endbuf:
4507 case wordbeg:
4508 case wordend:
4509 case wordbound:
4510 case notwordbound:
4511 #ifdef emacs
4512 case before_dot:
4513 case at_dot:
4514 case after_dot:
4515 #endif
4516 break;
4517
4518 case start_memory:
4519 reg_no = *p1;
4520 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4521 ret = group_match_null_string_p (&p1, end, reg_info);
4522
4523 /* Have to set this here in case we're checking a group which
4524 contains a group and a back reference to it. */
4525
4526 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4527 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4528
4529 if (!ret)
4530 return false;
4531 break;
4532
4533 /* If this is an optimized succeed_n for zero times, make the jump. */
4534 case jump:
4535 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4536 if (mcnt >= 0)
4537 p1 += mcnt;
4538 else
4539 return false;
4540 break;
4541
4542 case succeed_n:
4543 /* Get to the number of times to succeed. */
4544 p1 += 2;
4545 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4546
4547 if (mcnt == 0)
4548 {
4549 p1 -= 4;
4550 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4551 p1 += mcnt;
4552 }
4553 else
4554 return false;
4555 break;
4556
4557 case duplicate:
4558 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4559 return false;
4560 break;
4561
4562 case set_number_at:
4563 p1 += 4;
4564
4565 default:
4566 /* All other opcodes mean we cannot match the empty string. */
4567 return false;
4568 }
4569
4570 *p = p1;
4571 return true;
4572 } /* common_op_match_null_string_p */
4573
4574
4575 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
4576 bytes; nonzero otherwise. */
4577
4578 static int
4579 bcmp_translate (s1, s2, len, translate)
4580 unsigned char *s1, *s2;
4581 register int len;
4582 char *translate;
4583 {
4584 register unsigned char *p1 = s1, *p2 = s2;
4585 while (len)
4586 {
4587 if (translate[*p1++] != translate[*p2++]) return 1;
4588 len--;
4589 }
4590 return 0;
4591 }
4592 \f
4593 /* Entry points for GNU code. */
4594
4595 /* re_compile_pattern is the GNU regular expression compiler: it
4596 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4597 Returns 0 if the pattern was valid, otherwise an error string.
4598
4599 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4600 are set in BUFP on entry.
4601
4602 We call regex_compile to do the actual compilation. */
4603
4604 const char *
4605 re_compile_pattern (pattern, length, bufp)
4606 const char *pattern;
4607 int length;
4608 struct re_pattern_buffer *bufp;
4609 {
4610 reg_errcode_t ret;
4611
4612 /* GNU code is written to assume at least RE_NREGS registers will be set
4613 (and at least one extra will be -1). */
4614 bufp->regs_allocated = REGS_UNALLOCATED;
4615
4616 /* And GNU code determines whether or not to get register information
4617 by passing null for the REGS argument to re_match, etc., not by
4618 setting no_sub. */
4619 bufp->no_sub = 0;
4620
4621 /* Match anchors at newline. */
4622 bufp->newline_anchor = 1;
4623
4624 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4625
4626 return re_error_msg[(int) ret];
4627 }
4628 \f
4629 /* Entry points compatible with 4.2 BSD regex library. We don't define
4630 them if this is an Emacs or POSIX compilation. */
4631
4632 #if !defined (emacs) && !defined (_POSIX_SOURCE)
4633
4634 /* BSD has one and only one pattern buffer. */
4635 static struct re_pattern_buffer re_comp_buf;
4636
4637 char *
4638 re_comp (s)
4639 const char *s;
4640 {
4641 reg_errcode_t ret;
4642
4643 if (!s)
4644 {
4645 if (!re_comp_buf.buffer)
4646 return "No previous regular expression";
4647 return 0;
4648 }
4649
4650 if (!re_comp_buf.buffer)
4651 {
4652 re_comp_buf.buffer = (unsigned char *) malloc (200);
4653 if (re_comp_buf.buffer == NULL)
4654 return "Memory exhausted";
4655 re_comp_buf.allocated = 200;
4656
4657 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4658 if (re_comp_buf.fastmap == NULL)
4659 return "Memory exhausted";
4660 }
4661
4662 /* Since `re_exec' always passes NULL for the `regs' argument, we
4663 don't need to initialize the pattern buffer fields which affect it. */
4664
4665 /* Match anchors at newlines. */
4666 re_comp_buf.newline_anchor = 1;
4667
4668 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4669
4670 /* Yes, we're discarding `const' here. */
4671 return (char *) re_error_msg[(int) ret];
4672 }
4673
4674
4675 int
4676 re_exec (s)
4677 const char *s;
4678 {
4679 const int len = strlen (s);
4680 return
4681 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4682 }
4683 #endif /* not emacs and not _POSIX_SOURCE */
4684 \f
4685 /* POSIX.2 functions. Don't define these for Emacs. */
4686
4687 #ifndef emacs
4688
4689 /* regcomp takes a regular expression as a string and compiles it.
4690
4691 PREG is a regex_t *. We do not expect any fields to be initialized,
4692 since POSIX says we shouldn't. Thus, we set
4693
4694 `buffer' to the compiled pattern;
4695 `used' to the length of the compiled pattern;
4696 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4697 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4698 RE_SYNTAX_POSIX_BASIC;
4699 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4700 `fastmap' and `fastmap_accurate' to zero;
4701 `re_nsub' to the number of subexpressions in PATTERN.
4702
4703 PATTERN is the address of the pattern string.
4704
4705 CFLAGS is a series of bits which affect compilation.
4706
4707 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4708 use POSIX basic syntax.
4709
4710 If REG_NEWLINE is set, then . and [^...] don't match newline.
4711 Also, regexec will try a match beginning after every newline.
4712
4713 If REG_ICASE is set, then we considers upper- and lowercase
4714 versions of letters to be equivalent when matching.
4715
4716 If REG_NOSUB is set, then when PREG is passed to regexec, that
4717 routine will report only success or failure, and nothing about the
4718 registers.
4719
4720 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
4721 the return codes and their meanings.) */
4722
4723 int
4724 regcomp (preg, pattern, cflags)
4725 regex_t *preg;
4726 const char *pattern;
4727 int cflags;
4728 {
4729 reg_errcode_t ret;
4730 unsigned syntax
4731 = (cflags & REG_EXTENDED) ?
4732 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4733
4734 /* regex_compile will allocate the space for the compiled pattern. */
4735 preg->buffer = 0;
4736 preg->allocated = 0;
4737
4738 /* Don't bother to use a fastmap when searching. This simplifies the
4739 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4740 characters after newlines into the fastmap. This way, we just try
4741 every character. */
4742 preg->fastmap = 0;
4743
4744 if (cflags & REG_ICASE)
4745 {
4746 unsigned i;
4747
4748 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4749 if (preg->translate == NULL)
4750 return (int) REG_ESPACE;
4751
4752 /* Map uppercase characters to corresponding lowercase ones. */
4753 for (i = 0; i < CHAR_SET_SIZE; i++)
4754 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
4755 }
4756 else
4757 preg->translate = NULL;
4758
4759 /* If REG_NEWLINE is set, newlines are treated differently. */
4760 if (cflags & REG_NEWLINE)
4761 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4762 syntax &= ~RE_DOT_NEWLINE;
4763 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4764 /* It also changes the matching behavior. */
4765 preg->newline_anchor = 1;
4766 }
4767 else
4768 preg->newline_anchor = 0;
4769
4770 preg->no_sub = !!(cflags & REG_NOSUB);
4771
4772 /* POSIX says a null character in the pattern terminates it, so we
4773 can use strlen here in compiling the pattern. */
4774 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4775
4776 /* POSIX doesn't distinguish between an unmatched open-group and an
4777 unmatched close-group: both are REG_EPAREN. */
4778 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4779
4780 return (int) ret;
4781 }
4782
4783
4784 /* regexec searches for a given pattern, specified by PREG, in the
4785 string STRING.
4786
4787 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4788 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4789 least NMATCH elements, and we set them to the offsets of the
4790 corresponding matched substrings.
4791
4792 EFLAGS specifies `execution flags' which affect matching: if
4793 REG_NOTBOL is set, then ^ does not match at the beginning of the
4794 string; if REG_NOTEOL is set, then $ does not match at the end.
4795
4796 We return 0 if we find a match and REG_NOMATCH if not. */
4797
4798 int
4799 regexec (preg, string, nmatch, pmatch, eflags)
4800 const regex_t *preg;
4801 const char *string;
4802 size_t nmatch;
4803 regmatch_t pmatch[];
4804 int eflags;
4805 {
4806 int ret;
4807 struct re_registers regs;
4808 regex_t private_preg;
4809 int len = strlen (string);
4810 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4811
4812 private_preg = *preg;
4813
4814 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4815 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4816
4817 /* The user has told us exactly how many registers to return
4818 information about, via `nmatch'. We have to pass that on to the
4819 matching routines. */
4820 private_preg.regs_allocated = REGS_FIXED;
4821
4822 if (want_reg_info)
4823 {
4824 regs.num_regs = nmatch;
4825 regs.start = TALLOC (nmatch, regoff_t);
4826 regs.end = TALLOC (nmatch, regoff_t);
4827 if (regs.start == NULL || regs.end == NULL)
4828 return (int) REG_NOMATCH;
4829 }
4830
4831 /* Perform the searching operation. */
4832 ret = re_search (&private_preg, string, len,
4833 /* start: */ 0, /* range: */ len,
4834 want_reg_info ? &regs : (struct re_registers *) 0);
4835
4836 /* Copy the register information to the POSIX structure. */
4837 if (want_reg_info)
4838 {
4839 if (ret >= 0)
4840 {
4841 unsigned r;
4842
4843 for (r = 0; r < nmatch; r++)
4844 {
4845 pmatch[r].rm_so = regs.start[r];
4846 pmatch[r].rm_eo = regs.end[r];
4847 }
4848 }
4849
4850 /* If we needed the temporary register info, free the space now. */
4851 free (regs.start);
4852 free (regs.end);
4853 }
4854
4855 /* We want zero return to mean success, unlike `re_search'. */
4856 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4857 }
4858
4859
4860 /* Returns a message corresponding to an error code, ERRCODE, returned
4861 from either regcomp or regexec. We don't use PREG here. */
4862
4863 size_t
4864 regerror (errcode, preg, errbuf, errbuf_size)
4865 int errcode;
4866 const regex_t *preg;
4867 char *errbuf;
4868 size_t errbuf_size;
4869 {
4870 const char *msg
4871 = re_error_msg[errcode] == NULL ? "Success" : re_error_msg[errcode];
4872 size_t msg_size = strlen (msg) + 1; /* Includes the null. */
4873
4874 if (errbuf_size != 0)
4875 {
4876 if (msg_size > errbuf_size)
4877 {
4878 strncpy (errbuf, msg, errbuf_size - 1);
4879 errbuf[errbuf_size - 1] = 0;
4880 }
4881 else
4882 strcpy (errbuf, msg);
4883 }
4884
4885 return msg_size;
4886 }
4887
4888
4889 /* Free dynamically allocated space used by PREG. */
4890
4891 void
4892 regfree (preg)
4893 regex_t *preg;
4894 {
4895 if (preg->buffer != NULL)
4896 free (preg->buffer);
4897 preg->buffer = NULL;
4898
4899 preg->allocated = 0;
4900 preg->used = 0;
4901
4902 if (preg->fastmap != NULL)
4903 free (preg->fastmap);
4904 preg->fastmap = NULL;
4905 preg->fastmap_accurate = 0;
4906
4907 if (preg->translate != NULL)
4908 free (preg->translate);
4909 preg->translate = NULL;
4910 }
4911
4912 #endif /* not emacs */
4913 \f
4914 /*
4915 Local variables:
4916 make-backup-files: t
4917 version-control: t
4918 trim-versions-without-asking: nil
4919 End:
4920 */