(GC_SETJMP_WORKS): Define for i386, sparc, m68k, alpha.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a
RS
1/* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
bc78d348
KB
3 internationalization features.)
4
505bde11 5 Copyright (C) 1993,94,95,96,97,98,2000 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
ba4a8e51 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25fe55af 20 USA. */
fa9a63c5 21
505bde11 22/* TODO:
1fb352e0 23 - use analyze_first to optimize non-empty loops
1fb352e0 24 - clean up multibyte issues
505bde11
SM
25 - structure the opcode space into opcode+flag.
26 - merge with glic's regex.[ch]
27
28 That's it for now -sm */
29
fa9a63c5
RM
30/* AIX requires this to be the first thing in the file. */
31#if defined (_AIX) && !defined (REGEX_MALLOC)
32 #pragma alloca
33#endif
34
68d96f02 35#undef _GNU_SOURCE
fa9a63c5
RM
36#define _GNU_SOURCE
37
51352796 38#ifdef emacs
25fe55af 39/* Converts the pointer to the char to BEG-based offset from the start. */
99633e97 40#define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
7e95234e 41#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
51352796
RS
42#else
43#define PTR_TO_OFFSET(d) 0
44#endif
b18215fc 45
fa9a63c5
RM
46#ifdef HAVE_CONFIG_H
47#include <config.h>
48#endif
49
25fe55af 50/* We need this for `regex.h', and perhaps for the Emacs include files. */
fa9a63c5
RM
51#include <sys/types.h>
52
25fe55af 53/* This is for other GNU distributions with internationalized messages. */
fa9a63c5
RM
54#if HAVE_LIBINTL_H || defined (_LIBC)
55# include <libintl.h>
56#else
57# define gettext(msgid) (msgid)
58#endif
59
5e69f11e
RM
60#ifndef gettext_noop
61/* This define is so xgettext can find the internationalizable
62 strings. */
63#define gettext_noop(String) String
64#endif
65
fa9a63c5
RM
66/* The `emacs' switch turns on certain matching commands
67 that make sense only in Emacs. */
68#ifdef emacs
69
70#include "lisp.h"
71#include "buffer.h"
b18215fc
RS
72
73/* Make syntax table lookup grant data in gl_state. */
74#define SYNTAX_ENTRY_VIA_PROPERTY
75
fa9a63c5 76#include "syntax.h"
b18215fc
RS
77#include "charset.h"
78#include "category.h"
fa9a63c5 79
9abbd165 80#define malloc xmalloc
64e3c718 81#define realloc xrealloc
9abbd165
RS
82#define free xfree
83
4e8a9132
SM
84#define RE_STRING_CHAR(p, s) \
85 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
86
fa9a63c5
RM
87#else /* not emacs */
88
89/* If we are not linking with Emacs proper,
90 we can't use the relocating allocator
91 even if config.h says that we can. */
92#undef REL_ALLOC
93
94#if defined (STDC_HEADERS) || defined (_LIBC)
95#include <stdlib.h>
96#else
97char *malloc ();
98char *realloc ();
99#endif
100
9e4ecb26 101/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
25fe55af 102 If nothing else has been done, use the method below. */
9e4ecb26
KH
103#ifdef INHIBIT_STRING_HEADER
104#if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
105#if !defined (bzero) && !defined (bcopy)
106#undef INHIBIT_STRING_HEADER
107#endif
108#endif
109#endif
110
111/* This is the normal way of making sure we have a bcopy and a bzero.
112 This is used in most programs--a few other programs avoid this
113 by defining INHIBIT_STRING_HEADER. */
fa9a63c5 114#ifndef INHIBIT_STRING_HEADER
7f998252 115#if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
fa9a63c5
RM
116#include <string.h>
117#ifndef bcmp
118#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
119#endif
120#ifndef bcopy
121#define bcopy(s, d, n) memcpy ((d), (s), (n))
122#endif
123#ifndef bzero
124#define bzero(s, n) memset ((s), 0, (n))
125#endif
126#else
127#include <strings.h>
128#endif
129#endif
130
131/* Define the syntax stuff for \<, \>, etc. */
132
990b2375
SM
133/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
134enum syntaxcode { Swhitespace = 0, Sword = 1 };
fa9a63c5
RM
135
136#ifdef SWITCH_ENUM_BUG
137#define SWITCH_ENUM_CAST(x) ((int)(x))
138#else
139#define SWITCH_ENUM_CAST(x) (x)
140#endif
141
142#ifdef SYNTAX_TABLE
143
144extern char *re_syntax_table;
145
146#else /* not SYNTAX_TABLE */
147
148/* How many characters in the character set. */
149#define CHAR_SET_SIZE 256
150
151static char re_syntax_table[CHAR_SET_SIZE];
152
153static void
154init_syntax_once ()
155{
156 register int c;
157 static int done = 0;
158
159 if (done)
160 return;
161
162 bzero (re_syntax_table, sizeof re_syntax_table);
163
164 for (c = 'a'; c <= 'z'; c++)
165 re_syntax_table[c] = Sword;
166
167 for (c = 'A'; c <= 'Z'; c++)
168 re_syntax_table[c] = Sword;
169
170 for (c = '0'; c <= '9'; c++)
171 re_syntax_table[c] = Sword;
172
173 re_syntax_table['_'] = Sword;
174
175 done = 1;
176}
177
178#endif /* not SYNTAX_TABLE */
179
180#define SYNTAX(c) re_syntax_table[c]
181
e934739e 182/* Dummy macros for non-Emacs environments. */
b18215fc 183#define BASE_LEADING_CODE_P(c) (0)
1fb352e0
SM
184#define CHAR_CHARSET(c) 0
185#define CHARSET_LEADING_CODE_BASE(c) 0
b18215fc
RS
186#define WORD_BOUNDARY_P(c1, c2) (0)
187#define CHAR_HEAD_P(p) (1)
188#define SINGLE_BYTE_CHAR_P(c) (1)
189#define SAME_CHARSET_P(c1, c2) (1)
190#define MULTIBYTE_FORM_LENGTH(p, s) (1)
191#define STRING_CHAR(p, s) (*(p))
4e8a9132 192#define RE_STRING_CHAR STRING_CHAR
b18215fc 193#define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
b18215fc
RS
194#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
195 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
fa9a63c5 196#endif /* not emacs */
4e8a9132
SM
197
198#ifndef RE_TRANSLATE
199#define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
200#define RE_TRANSLATE_P(TBL) (TBL)
201#endif
fa9a63c5
RM
202\f
203/* Get the interface, including the syntax bits. */
204#include "regex.h"
205
f71b19b6
DL
206/* isalpha etc. are used for the character classes. */
207#include <ctype.h>
fa9a63c5 208
f71b19b6 209#ifdef emacs
fa9a63c5 210
f71b19b6
DL
211/* 1 if C is an ASCII character. */
212#define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 213
f71b19b6
DL
214/* 1 if C is a unibyte character. */
215#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 216
f71b19b6 217/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 218
f71b19b6
DL
219/* In Emacs, these are only used for single-byte characters. */
220#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
221#define ISCNTRL(c) ((c) < ' ')
222#define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
223 || ((c) >= 'a' && (c) <= 'f') \
224 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
225
226/* This is only used for single-byte characters. */
227#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
228
229/* The rest must handle multibyte characters. */
230
231#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 232 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
233 : 1)
234
235#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 236 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
237 : 1)
238
f71b19b6
DL
239#define ISALNUM(c) (IS_REAL_ASCII (c) \
240 ? (((c) >= 'a' && (c) <= 'z') \
241 || ((c) >= 'A' && (c) <= 'Z') \
242 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
243 : SYNTAX (c) == Sword)
244
f71b19b6
DL
245#define ISALPHA(c) (IS_REAL_ASCII (c) \
246 ? (((c) >= 'a' && (c) <= 'z') \
247 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
248 : SYNTAX (c) == Sword)
249
250#define ISLOWER(c) (LOWERCASEP (c))
251
f71b19b6
DL
252#define ISPUNCT(c) (IS_REAL_ASCII (c) \
253 ? ((c) > ' ' && (c) < 0177 \
254 && !(((c) >= 'a' && (c) <= 'z') \
255 || ((c) >= 'A' && (c) <= 'Z') \
256 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
257 : SYNTAX (c) != Sword)
258
259#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
260
261#define ISUPPER(c) (UPPERCASEP (c))
262
263#define ISWORD(c) (SYNTAX (c) == Sword)
264
265#else /* not emacs */
266
f71b19b6
DL
267/* Jim Meyering writes:
268
269 "... Some ctype macros are valid only for character codes that
270 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
271 using /bin/cc or gcc but without giving an ansi option). So, all
272 ctype uses should be through macros like ISPRINT... If
273 STDC_HEADERS is defined, then autoconf has verified that the ctype
274 macros don't need to be guarded with references to isascii. ...
275 Defining isascii to 1 should let any compiler worth its salt
276 eliminate the && through constant folding." */
277
278#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
279#define ISASCII(c) 1
280#else
281#define ISASCII(c) isascii(c)
282#endif
283
284/* 1 if C is an ASCII character. */
285#define IS_REAL_ASCII(c) ((c) < 0200)
286
287/* This distinction is not meaningful, except in Emacs. */
288#define ISUNIBYTE(c) 1
289
290#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
291#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
292#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
293
fa9a63c5
RM
294#ifdef isblank
295#define ISBLANK(c) (ISASCII (c) && isblank (c))
296#else
297#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
298#endif
299#ifdef isgraph
300#define ISGRAPH(c) (ISASCII (c) && isgraph (c))
301#else
302#define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
303#endif
304
305#define ISPRINT(c) (ISASCII (c) && isprint (c))
306#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
307#define ISALNUM(c) (ISASCII (c) && isalnum (c))
308#define ISALPHA(c) (ISASCII (c) && isalpha (c))
309#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
310#define ISLOWER(c) (ISASCII (c) && islower (c))
311#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
312#define ISSPACE(c) (ISASCII (c) && isspace (c))
313#define ISUPPER(c) (ISASCII (c) && isupper (c))
314#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
315
96cc36cc
RS
316#define ISWORD(c) ISALPHA(c)
317
318#endif /* not emacs */
319\f
fa9a63c5 320#ifndef NULL
075f06ec 321#define NULL (void *)0
fa9a63c5
RM
322#endif
323
324/* We remove any previous definition of `SIGN_EXTEND_CHAR',
325 since ours (we hope) works properly with all combinations of
326 machines, compilers, `char' and `unsigned char' argument types.
25fe55af 327 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
328#undef SIGN_EXTEND_CHAR
329#if __STDC__
330#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
331#else /* not __STDC__ */
332/* As in Harbison and Steele. */
333#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
334#endif
335\f
336/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
337 use `alloca' instead of `malloc'. This is because using malloc in
338 re_search* or re_match* could cause memory leaks when C-g is used in
339 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
340 the other hand, malloc is more portable, and easier to debug.
341
fa9a63c5
RM
342 Because we sometimes use alloca, some routines have to be macros,
343 not functions -- `alloca'-allocated space disappears at the end of the
344 function it is called in. */
345
346#ifdef REGEX_MALLOC
347
348#define REGEX_ALLOCATE malloc
349#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
350#define REGEX_FREE free
351
352#else /* not REGEX_MALLOC */
353
354/* Emacs already defines alloca, sometimes. */
355#ifndef alloca
356
357/* Make alloca work the best possible way. */
358#ifdef __GNUC__
359#define alloca __builtin_alloca
360#else /* not __GNUC__ */
361#if HAVE_ALLOCA_H
362#include <alloca.h>
363#else /* not __GNUC__ or HAVE_ALLOCA_H */
f3c4387f 364#if 0 /* It is a bad idea to declare alloca. We always cast the result. */
25fe55af 365#ifndef _AIX /* Already did AIX, up at the top. */
fa9a63c5
RM
366char *alloca ();
367#endif /* not _AIX */
f3c4387f 368#endif
5e69f11e 369#endif /* not HAVE_ALLOCA_H */
fa9a63c5
RM
370#endif /* not __GNUC__ */
371
372#endif /* not alloca */
373
374#define REGEX_ALLOCATE alloca
375
376/* Assumes a `char *destination' variable. */
377#define REGEX_REALLOCATE(source, osize, nsize) \
378 (destination = (char *) alloca (nsize), \
379 bcopy (source, destination, osize), \
380 destination)
381
382/* No need to do anything to free, after alloca. */
c2e1680a 383#define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
384
385#endif /* not REGEX_MALLOC */
386
387/* Define how to allocate the failure stack. */
388
33487cc8 389#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
4297555e 390
fa9a63c5
RM
391#define REGEX_ALLOCATE_STACK(size) \
392 r_alloc (&failure_stack_ptr, (size))
393#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
394 r_re_alloc (&failure_stack_ptr, (nsize))
395#define REGEX_FREE_STACK(ptr) \
396 r_alloc_free (&failure_stack_ptr)
397
4297555e 398#else /* not using relocating allocator */
fa9a63c5
RM
399
400#ifdef REGEX_MALLOC
401
402#define REGEX_ALLOCATE_STACK malloc
403#define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
404#define REGEX_FREE_STACK free
405
406#else /* not REGEX_MALLOC */
407
408#define REGEX_ALLOCATE_STACK alloca
409
410#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
411 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 412/* No need to explicitly free anything. */
ed0767d8 413#define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5
RM
414
415#endif /* not REGEX_MALLOC */
4297555e 416#endif /* not using relocating allocator */
fa9a63c5
RM
417
418
419/* True if `size1' is non-NULL and PTR is pointing anywhere inside
420 `string1' or just past its end. This works if PTR is NULL, which is
421 a good thing. */
25fe55af 422#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
423 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
424
425/* (Re)Allocate N items of type T using malloc, or fail. */
426#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
427#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
428#define RETALLOC_IF(addr, n, t) \
429 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
430#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
431
25fe55af 432#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
433
434#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
435
436#undef MAX
437#undef MIN
438#define MAX(a, b) ((a) > (b) ? (a) : (b))
439#define MIN(a, b) ((a) < (b) ? (a) : (b))
440
66f0296e
SM
441/* Type of source-pattern and string chars. */
442typedef const unsigned char re_char;
443
fa9a63c5
RM
444typedef char boolean;
445#define false 0
446#define true 1
447
448static int re_match_2_internal ();
449\f
450/* These are the command codes that appear in compiled regular
25fe55af 451 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
452 command code can specify any interpretation whatsoever for its
453 arguments. Zero bytes may appear in the compiled regular expression. */
454
455typedef enum
456{
457 no_op = 0,
458
25fe55af 459 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
460 succeed,
461
25fe55af 462 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
463 exactn,
464
25fe55af 465 /* Matches any (more or less) character. */
fa9a63c5
RM
466 anychar,
467
25fe55af
RS
468 /* Matches any one char belonging to specified set. First
469 following byte is number of bitmap bytes. Then come bytes
470 for a bitmap saying which chars are in. Bits in each byte
471 are ordered low-bit-first. A character is in the set if its
472 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
473 automatically not in the set.
474
475 If the length byte has the 0x80 bit set, then that stuff
476 is followed by a range table:
477 2 bytes of flags for character sets (low 8 bits, high 8 bits)
478 See RANGE_TABLE_WORK_BITS below.
479 2 bytes, the number of pairs that follow
480 pairs, each 2 multibyte characters,
481 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
482 charset,
483
25fe55af
RS
484 /* Same parameters as charset, but match any character that is
485 not one of those specified. */
fa9a63c5
RM
486 charset_not,
487
25fe55af
RS
488 /* Start remembering the text that is matched, for storing in a
489 register. Followed by one byte with the register number, in
490 the range 0 to one less than the pattern buffer's re_nsub
505bde11 491 field. */
fa9a63c5
RM
492 start_memory,
493
25fe55af
RS
494 /* Stop remembering the text that is matched and store it in a
495 memory register. Followed by one byte with the register
496 number, in the range 0 to one less than `re_nsub' in the
505bde11 497 pattern buffer. */
fa9a63c5
RM
498 stop_memory,
499
25fe55af
RS
500 /* Match a duplicate of something remembered. Followed by one
501 byte containing the register number. */
fa9a63c5
RM
502 duplicate,
503
25fe55af 504 /* Fail unless at beginning of line. */
fa9a63c5
RM
505 begline,
506
25fe55af 507 /* Fail unless at end of line. */
fa9a63c5
RM
508 endline,
509
25fe55af
RS
510 /* Succeeds if at beginning of buffer (if emacs) or at beginning
511 of string to be matched (if not). */
fa9a63c5
RM
512 begbuf,
513
25fe55af 514 /* Analogously, for end of buffer/string. */
fa9a63c5 515 endbuf,
5e69f11e 516
25fe55af 517 /* Followed by two byte relative address to which to jump. */
5e69f11e 518 jump,
fa9a63c5 519
25fe55af
RS
520 /* Followed by two-byte relative address of place to resume at
521 in case of failure. */
fa9a63c5 522 on_failure_jump,
5e69f11e 523
25fe55af
RS
524 /* Like on_failure_jump, but pushes a placeholder instead of the
525 current string position when executed. */
fa9a63c5 526 on_failure_keep_string_jump,
5e69f11e 527
505bde11
SM
528 /* Just like `on_failure_jump', except that it checks that we
529 don't get stuck in an infinite loop (matching an empty string
530 indefinitely). */
531 on_failure_jump_loop,
532
0683b6fa
SM
533 /* Just like `on_failure_jump_loop', except that it checks for
534 a different kind of loop (the kind that shows up with non-greedy
535 operators). This operation has to be immediately preceded
536 by a `no_op'. */
537 on_failure_jump_nastyloop,
538
505bde11
SM
539 /* A smart `on_failure_jump' used for greedy * and + operators.
540 It analyses the loop before which it is put and if the
541 loop does not require backtracking, it changes itself to
4e8a9132
SM
542 `on_failure_keep_string_jump' and short-circuits the loop,
543 else it just defaults to changing itself into `on_failure_jump'.
544 It assumes that it is pointing to just past a `jump'. */
505bde11 545 on_failure_jump_smart,
fa9a63c5 546
25fe55af 547 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
548 After matching N times, jump to the address upon failure.
549 Does not work if N starts at 0: use on_failure_jump_loop
550 instead. */
fa9a63c5
RM
551 succeed_n,
552
25fe55af
RS
553 /* Followed by two-byte relative address, and two-byte number n.
554 Jump to the address N times, then fail. */
fa9a63c5
RM
555 jump_n,
556
25fe55af
RS
557 /* Set the following two-byte relative address to the
558 subsequent two-byte number. The address *includes* the two
559 bytes of number. */
fa9a63c5
RM
560 set_number_at,
561
fa9a63c5
RM
562 wordbeg, /* Succeeds if at word beginning. */
563 wordend, /* Succeeds if at word end. */
564
565 wordbound, /* Succeeds if at a word boundary. */
1fb352e0 566 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5
RM
567
568 /* Matches any character whose syntax is specified. Followed by
25fe55af 569 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
570 syntaxspec,
571
572 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
573 notsyntaxspec
574
575#ifdef emacs
576 ,before_dot, /* Succeeds if before point. */
577 at_dot, /* Succeeds if at point. */
578 after_dot, /* Succeeds if after point. */
b18215fc
RS
579
580 /* Matches any character whose category-set contains the specified
25fe55af
RS
581 category. The operator is followed by a byte which contains a
582 category code (mnemonic ASCII character). */
b18215fc
RS
583 categoryspec,
584
585 /* Matches any character whose category-set does not contain the
586 specified category. The operator is followed by a byte which
587 contains the category code (mnemonic ASCII character). */
588 notcategoryspec
fa9a63c5
RM
589#endif /* emacs */
590} re_opcode_t;
591\f
592/* Common operations on the compiled pattern. */
593
594/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
595
596#define STORE_NUMBER(destination, number) \
597 do { \
598 (destination)[0] = (number) & 0377; \
599 (destination)[1] = (number) >> 8; \
600 } while (0)
601
602/* Same as STORE_NUMBER, except increment DESTINATION to
603 the byte after where the number is stored. Therefore, DESTINATION
604 must be an lvalue. */
605
606#define STORE_NUMBER_AND_INCR(destination, number) \
607 do { \
608 STORE_NUMBER (destination, number); \
609 (destination) += 2; \
610 } while (0)
611
612/* Put into DESTINATION a number stored in two contiguous bytes starting
613 at SOURCE. */
614
615#define EXTRACT_NUMBER(destination, source) \
616 do { \
617 (destination) = *(source) & 0377; \
618 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
619 } while (0)
620
621#ifdef DEBUG
622static void
623extract_number (dest, source)
624 int *dest;
625 unsigned char *source;
626{
5e69f11e 627 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
628 *dest = *source & 0377;
629 *dest += temp << 8;
630}
631
25fe55af 632#ifndef EXTRACT_MACROS /* To debug the macros. */
fa9a63c5
RM
633#undef EXTRACT_NUMBER
634#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
635#endif /* not EXTRACT_MACROS */
636
637#endif /* DEBUG */
638
639/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
640 SOURCE must be an lvalue. */
641
642#define EXTRACT_NUMBER_AND_INCR(destination, source) \
643 do { \
644 EXTRACT_NUMBER (destination, source); \
25fe55af 645 (source) += 2; \
fa9a63c5
RM
646 } while (0)
647
648#ifdef DEBUG
649static void
650extract_number_and_incr (destination, source)
651 int *destination;
652 unsigned char **source;
5e69f11e 653{
fa9a63c5
RM
654 extract_number (destination, *source);
655 *source += 2;
656}
657
658#ifndef EXTRACT_MACROS
659#undef EXTRACT_NUMBER_AND_INCR
660#define EXTRACT_NUMBER_AND_INCR(dest, src) \
661 extract_number_and_incr (&dest, &src)
662#endif /* not EXTRACT_MACROS */
663
664#endif /* DEBUG */
665\f
b18215fc
RS
666/* Store a multibyte character in three contiguous bytes starting
667 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 668 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
669
670#define STORE_CHARACTER_AND_INCR(destination, character) \
671 do { \
672 (destination)[0] = (character) & 0377; \
673 (destination)[1] = ((character) >> 8) & 0377; \
674 (destination)[2] = (character) >> 16; \
675 (destination) += 3; \
676 } while (0)
677
678/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 679 starting at SOURCE. */
b18215fc
RS
680
681#define EXTRACT_CHARACTER(destination, source) \
682 do { \
683 (destination) = ((source)[0] \
684 | ((source)[1] << 8) \
685 | ((source)[2] << 16)); \
686 } while (0)
687
688
689/* Macros for charset. */
690
691/* Size of bitmap of charset P in bytes. P is a start of charset,
692 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
693#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
694
695/* Nonzero if charset P has range table. */
25fe55af 696#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
697
698/* Return the address of range table of charset P. But not the start
699 of table itself, but the before where the number of ranges is
96cc36cc
RS
700 stored. `2 +' means to skip re_opcode_t and size of bitmap,
701 and the 2 bytes of flags at the start of the range table. */
702#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
703
704/* Extract the bit flags that start a range table. */
705#define CHARSET_RANGE_TABLE_BITS(p) \
706 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
707 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
708
709/* Test if C is listed in the bitmap of charset P. */
710#define CHARSET_LOOKUP_BITMAP(p, c) \
711 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
712 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
713
714/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
715 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
716 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
717 and end. */
718#define CHARSET_RANGE_TABLE_END(range_table, count) \
719 ((range_table) + (count) * 2 * 3)
720
25fe55af 721/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
722 COUNT is number of ranges in RANGE_TABLE. */
723#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
724 do \
725 { \
726 int range_start, range_end; \
727 unsigned char *p; \
728 unsigned char *range_table_end \
729 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
730 \
731 for (p = (range_table); p < range_table_end; p += 2 * 3) \
732 { \
733 EXTRACT_CHARACTER (range_start, p); \
734 EXTRACT_CHARACTER (range_end, p + 3); \
735 \
736 if (range_start <= (c) && (c) <= range_end) \
737 { \
738 (not) = !(not); \
739 break; \
740 } \
741 } \
742 } \
743 while (0)
744
745/* Test if C is in range table of CHARSET. The flag NOT is negated if
746 C is listed in it. */
747#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
748 do \
749 { \
750 /* Number of ranges in range table. */ \
751 int count; \
752 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
753 \
754 EXTRACT_NUMBER_AND_INCR (count, range_table); \
755 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
756 } \
757 while (0)
758\f
fa9a63c5
RM
759/* If DEBUG is defined, Regex prints many voluminous messages about what
760 it is doing (if the variable `debug' is nonzero). If linked with the
761 main program in `iregex.c', you can enter patterns and strings
762 interactively. And if linked with the main program in `main.c' and
25fe55af 763 the other test files, you can run the already-written tests. */
fa9a63c5
RM
764
765#ifdef DEBUG
766
767/* We use standard I/O for debugging. */
768#include <stdio.h>
769
770/* It is useful to test things that ``must'' be true when debugging. */
771#include <assert.h>
772
99633e97 773static int debug = -100000;
fa9a63c5
RM
774
775#define DEBUG_STATEMENT(e) e
99633e97
SM
776#define DEBUG_PRINT1(x) if (debug > 0) printf (x)
777#define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
778#define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
779#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
25fe55af 780#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 781 if (debug > 0) print_partial_compiled_pattern (s, e)
fa9a63c5 782#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 783 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
784
785
786/* Print the fastmap in human-readable form. */
787
788void
789print_fastmap (fastmap)
790 char *fastmap;
791{
792 unsigned was_a_range = 0;
5e69f11e
RM
793 unsigned i = 0;
794
fa9a63c5
RM
795 while (i < (1 << BYTEWIDTH))
796 {
797 if (fastmap[i++])
798 {
799 was_a_range = 0;
25fe55af
RS
800 putchar (i - 1);
801 while (i < (1 << BYTEWIDTH) && fastmap[i])
802 {
803 was_a_range = 1;
804 i++;
805 }
fa9a63c5 806 if (was_a_range)
25fe55af
RS
807 {
808 printf ("-");
809 putchar (i - 1);
810 }
811 }
fa9a63c5 812 }
5e69f11e 813 putchar ('\n');
fa9a63c5
RM
814}
815
816
817/* Print a compiled pattern string in human-readable form, starting at
818 the START pointer into it and ending just before the pointer END. */
819
820void
821print_partial_compiled_pattern (start, end)
822 unsigned char *start;
823 unsigned char *end;
824{
825 int mcnt, mcnt2;
826 unsigned char *p = start;
827 unsigned char *pend = end;
828
829 if (start == NULL)
830 {
831 printf ("(null)\n");
832 return;
833 }
5e69f11e 834
fa9a63c5
RM
835 /* Loop over pattern commands. */
836 while (p < pend)
837 {
838 printf ("%d:\t", p - start);
839
840 switch ((re_opcode_t) *p++)
841 {
25fe55af
RS
842 case no_op:
843 printf ("/no_op");
844 break;
fa9a63c5 845
99633e97
SM
846 case succeed:
847 printf ("/succeed");
848 break;
849
fa9a63c5
RM
850 case exactn:
851 mcnt = *p++;
25fe55af
RS
852 printf ("/exactn/%d", mcnt);
853 do
fa9a63c5 854 {
25fe55af 855 putchar ('/');
fa9a63c5 856 putchar (*p++);
25fe55af
RS
857 }
858 while (--mcnt);
859 break;
fa9a63c5
RM
860
861 case start_memory:
505bde11 862 printf ("/start_memory/%d", *p++);
25fe55af 863 break;
fa9a63c5
RM
864
865 case stop_memory:
505bde11 866 printf ("/stop_memory/%d", *p++);
25fe55af 867 break;
fa9a63c5
RM
868
869 case duplicate:
870 printf ("/duplicate/%d", *p++);
871 break;
872
873 case anychar:
874 printf ("/anychar");
875 break;
876
877 case charset:
25fe55af
RS
878 case charset_not:
879 {
880 register int c, last = -100;
fa9a63c5 881 register int in_range = 0;
99633e97
SM
882 int length = CHARSET_BITMAP_SIZE (p - 1);
883 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5
RM
884
885 printf ("/charset [%s",
25fe55af 886 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 887
25fe55af 888 assert (p + *p < pend);
fa9a63c5 889
25fe55af 890 for (c = 0; c < 256; c++)
96cc36cc 891 if (c / 8 < length
fa9a63c5
RM
892 && (p[1 + (c/8)] & (1 << (c % 8))))
893 {
894 /* Are we starting a range? */
895 if (last + 1 == c && ! in_range)
896 {
897 putchar ('-');
898 in_range = 1;
899 }
900 /* Have we broken a range? */
901 else if (last + 1 != c && in_range)
96cc36cc 902 {
fa9a63c5
RM
903 putchar (last);
904 in_range = 0;
905 }
5e69f11e 906
fa9a63c5
RM
907 if (! in_range)
908 putchar (c);
909
910 last = c;
25fe55af 911 }
fa9a63c5
RM
912
913 if (in_range)
914 putchar (last);
915
916 putchar (']');
917
99633e97 918 p += 1 + length;
96cc36cc 919
96cc36cc 920 if (has_range_table)
99633e97
SM
921 {
922 int count;
923 printf ("has-range-table");
924
925 /* ??? Should print the range table; for now, just skip it. */
926 p += 2; /* skip range table bits */
927 EXTRACT_NUMBER_AND_INCR (count, p);
928 p = CHARSET_RANGE_TABLE_END (p, count);
929 }
fa9a63c5
RM
930 }
931 break;
932
933 case begline:
934 printf ("/begline");
25fe55af 935 break;
fa9a63c5
RM
936
937 case endline:
25fe55af
RS
938 printf ("/endline");
939 break;
fa9a63c5
RM
940
941 case on_failure_jump:
25fe55af
RS
942 extract_number_and_incr (&mcnt, &p);
943 printf ("/on_failure_jump to %d", p + mcnt - start);
944 break;
fa9a63c5
RM
945
946 case on_failure_keep_string_jump:
25fe55af
RS
947 extract_number_and_incr (&mcnt, &p);
948 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
949 break;
fa9a63c5 950
0683b6fa
SM
951 case on_failure_jump_nastyloop:
952 extract_number_and_incr (&mcnt, &p);
953 printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
954 break;
955
505bde11 956 case on_failure_jump_loop:
fa9a63c5 957 extract_number_and_incr (&mcnt, &p);
505bde11 958 printf ("/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
959 break;
960
505bde11 961 case on_failure_jump_smart:
fa9a63c5 962 extract_number_and_incr (&mcnt, &p);
505bde11 963 printf ("/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
964 break;
965
25fe55af 966 case jump:
fa9a63c5 967 extract_number_and_incr (&mcnt, &p);
25fe55af 968 printf ("/jump to %d", p + mcnt - start);
fa9a63c5
RM
969 break;
970
25fe55af
RS
971 case succeed_n:
972 extract_number_and_incr (&mcnt, &p);
973 extract_number_and_incr (&mcnt2, &p);
99633e97 974 printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 975 break;
5e69f11e 976
25fe55af
RS
977 case jump_n:
978 extract_number_and_incr (&mcnt, &p);
979 extract_number_and_incr (&mcnt2, &p);
99633e97 980 printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 981 break;
5e69f11e 982
25fe55af
RS
983 case set_number_at:
984 extract_number_and_incr (&mcnt, &p);
985 extract_number_and_incr (&mcnt2, &p);
99633e97 986 printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 987 break;
5e69f11e 988
25fe55af 989 case wordbound:
fa9a63c5
RM
990 printf ("/wordbound");
991 break;
992
993 case notwordbound:
994 printf ("/notwordbound");
25fe55af 995 break;
fa9a63c5
RM
996
997 case wordbeg:
998 printf ("/wordbeg");
999 break;
5e69f11e 1000
fa9a63c5
RM
1001 case wordend:
1002 printf ("/wordend");
5e69f11e 1003
1fb352e0
SM
1004 case syntaxspec:
1005 printf ("/syntaxspec");
1006 mcnt = *p++;
1007 printf ("/%d", mcnt);
1008 break;
1009
1010 case notsyntaxspec:
1011 printf ("/notsyntaxspec");
1012 mcnt = *p++;
1013 printf ("/%d", mcnt);
1014 break;
1015
fa9a63c5
RM
1016#ifdef emacs
1017 case before_dot:
1018 printf ("/before_dot");
25fe55af 1019 break;
fa9a63c5
RM
1020
1021 case at_dot:
1022 printf ("/at_dot");
25fe55af 1023 break;
fa9a63c5
RM
1024
1025 case after_dot:
1026 printf ("/after_dot");
25fe55af 1027 break;
fa9a63c5 1028
1fb352e0
SM
1029 case categoryspec:
1030 printf ("/categoryspec");
fa9a63c5
RM
1031 mcnt = *p++;
1032 printf ("/%d", mcnt);
25fe55af 1033 break;
5e69f11e 1034
1fb352e0
SM
1035 case notcategoryspec:
1036 printf ("/notcategoryspec");
fa9a63c5
RM
1037 mcnt = *p++;
1038 printf ("/%d", mcnt);
1039 break;
1040#endif /* emacs */
1041
fa9a63c5
RM
1042 case begbuf:
1043 printf ("/begbuf");
25fe55af 1044 break;
fa9a63c5
RM
1045
1046 case endbuf:
1047 printf ("/endbuf");
25fe55af 1048 break;
fa9a63c5 1049
25fe55af
RS
1050 default:
1051 printf ("?%d", *(p-1));
fa9a63c5
RM
1052 }
1053
1054 putchar ('\n');
1055 }
1056
1057 printf ("%d:\tend of pattern.\n", p - start);
1058}
1059
1060
1061void
1062print_compiled_pattern (bufp)
1063 struct re_pattern_buffer *bufp;
1064{
1065 unsigned char *buffer = bufp->buffer;
1066
1067 print_partial_compiled_pattern (buffer, buffer + bufp->used);
505bde11 1068 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, bufp->allocated);
fa9a63c5
RM
1069
1070 if (bufp->fastmap_accurate && bufp->fastmap)
1071 {
1072 printf ("fastmap: ");
1073 print_fastmap (bufp->fastmap);
1074 }
1075
1076 printf ("re_nsub: %d\t", bufp->re_nsub);
1077 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1078 printf ("can_be_null: %d\t", bufp->can_be_null);
1079 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1080 printf ("no_sub: %d\t", bufp->no_sub);
1081 printf ("not_bol: %d\t", bufp->not_bol);
1082 printf ("not_eol: %d\t", bufp->not_eol);
1083 printf ("syntax: %d\n", bufp->syntax);
505bde11 1084 fflush (stdout);
fa9a63c5
RM
1085 /* Perhaps we should print the translate table? */
1086}
1087
1088
1089void
1090print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1091 re_char *where;
1092 re_char *string1;
1093 re_char *string2;
fa9a63c5
RM
1094 int size1;
1095 int size2;
1096{
1097 unsigned this_char;
5e69f11e 1098
fa9a63c5
RM
1099 if (where == NULL)
1100 printf ("(null)");
1101 else
1102 {
1103 if (FIRST_STRING_P (where))
25fe55af
RS
1104 {
1105 for (this_char = where - string1; this_char < size1; this_char++)
1106 putchar (string1[this_char]);
fa9a63c5 1107
25fe55af
RS
1108 where = string2;
1109 }
fa9a63c5
RM
1110
1111 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1112 putchar (string2[this_char]);
fa9a63c5
RM
1113 }
1114}
1115
1116#else /* not DEBUG */
1117
1118#undef assert
1119#define assert(e)
1120
1121#define DEBUG_STATEMENT(e)
1122#define DEBUG_PRINT1(x)
1123#define DEBUG_PRINT2(x1, x2)
1124#define DEBUG_PRINT3(x1, x2, x3)
1125#define DEBUG_PRINT4(x1, x2, x3, x4)
1126#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1127#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1128
1129#endif /* not DEBUG */
1130\f
1131/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1132 also be assigned to arbitrarily: each pattern buffer stores its own
1133 syntax, so it can be changed between regex compilations. */
1134/* This has no initializer because initialized variables in Emacs
1135 become read-only after dumping. */
1136reg_syntax_t re_syntax_options;
1137
1138
1139/* Specify the precise syntax of regexps for compilation. This provides
1140 for compatibility for various utilities which historically have
1141 different, incompatible syntaxes.
1142
1143 The argument SYNTAX is a bit mask comprised of the various bits
25fe55af 1144 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1145
1146reg_syntax_t
1147re_set_syntax (syntax)
1148 reg_syntax_t syntax;
1149{
1150 reg_syntax_t ret = re_syntax_options;
5e69f11e 1151
fa9a63c5
RM
1152 re_syntax_options = syntax;
1153 return ret;
1154}
1155\f
1156/* This table gives an error message for each of the error codes listed
25fe55af 1157 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1158 POSIX doesn't require that we do anything for REG_NOERROR,
25fe55af 1159 but why not be nice? */
fa9a63c5
RM
1160
1161static const char *re_error_msgid[] =
5e69f11e
RM
1162 {
1163 gettext_noop ("Success"), /* REG_NOERROR */
1164 gettext_noop ("No match"), /* REG_NOMATCH */
1165 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1166 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1167 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1168 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1169 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1170 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1171 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1172 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1173 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1174 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1175 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1176 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1177 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1178 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1179 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
fa9a63c5
RM
1180 };
1181\f
25fe55af 1182/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1183
1184/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1185 searching and matching functions should not call alloca. On some
1186 systems, alloca is implemented in terms of malloc, and if we're
1187 using the relocating allocator routines, then malloc could cause a
1188 relocation, which might (if the strings being searched are in the
1189 ralloc heap) shift the data out from underneath the regexp
1190 routines.
1191
5e69f11e 1192 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1193 processes input from X in a signal handler; processing X input may
1194 call malloc; if input arrives while a matching routine is calling
1195 malloc, then we're scrod. But Emacs can't just block input while
1196 calling matching routines; then we don't notice interrupts when
1197 they come in. So, Emacs blocks input around all regexp calls
1198 except the matching calls, which it leaves unprotected, in the
1199 faith that they will not malloc. */
1200
1201/* Normally, this is fine. */
1202#define MATCH_MAY_ALLOCATE
1203
1204/* When using GNU C, we are not REALLY using the C alloca, no matter
1205 what config.h may say. So don't take precautions for it. */
1206#ifdef __GNUC__
1207#undef C_ALLOCA
1208#endif
1209
1210/* The match routines may not allocate if (1) they would do it with malloc
1211 and (2) it's not safe for them to use malloc.
1212 Note that if REL_ALLOC is defined, matching would not use malloc for the
1213 failure stack, but we would still use it for the register vectors;
25fe55af 1214 so REL_ALLOC should not affect this. */
fa9a63c5
RM
1215#if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1216#undef MATCH_MAY_ALLOCATE
1217#endif
1218
1219\f
1220/* Failure stack declarations and macros; both re_compile_fastmap and
1221 re_match_2 use a failure stack. These have to be macros because of
1222 REGEX_ALLOCATE_STACK. */
5e69f11e 1223
fa9a63c5 1224
320a2a73 1225/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1226 when matching. If this number is exceeded, we allocate more
1227 space, so it is not a hard limit. */
1228#ifndef INIT_FAILURE_ALLOC
320a2a73 1229#define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1230#endif
1231
1232/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1233 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1234 This is a variable only so users of regex can assign to it; we never
25fe55af 1235 change it ourselves. */
fa9a63c5 1236#if defined (MATCH_MAY_ALLOCATE)
320a2a73
KH
1237/* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1238 whose default stack limit is 2mb. In order for a larger
1239 value to work reliably, you have to try to make it accord
1240 with the process stack limit. */
1241int re_max_failures = 40000;
fa9a63c5 1242#else
320a2a73 1243int re_max_failures = 4000;
fa9a63c5
RM
1244#endif
1245
1246union fail_stack_elt
1247{
66f0296e 1248 const unsigned char *pointer;
505bde11 1249 unsigned int integer;
fa9a63c5
RM
1250};
1251
1252typedef union fail_stack_elt fail_stack_elt_t;
1253
1254typedef struct
1255{
1256 fail_stack_elt_t *stack;
1257 unsigned size;
505bde11
SM
1258 unsigned avail; /* Offset of next open position. */
1259 unsigned frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1260} fail_stack_type;
1261
505bde11
SM
1262#define PATTERN_STACK_EMPTY() (fail_stack.avail == 0)
1263#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1264#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1265
1266
1267/* Define macros to initialize and free the failure stack.
1268 Do `return -2' if the alloc fails. */
1269
1270#ifdef MATCH_MAY_ALLOCATE
1271#define INIT_FAIL_STACK() \
1272 do { \
1273 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1274 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1275 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1276 \
1277 if (fail_stack.stack == NULL) \
1278 return -2; \
1279 \
1280 fail_stack.size = INIT_FAILURE_ALLOC; \
1281 fail_stack.avail = 0; \
505bde11 1282 fail_stack.frame = 0; \
fa9a63c5
RM
1283 } while (0)
1284
1285#define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1286#else
1287#define INIT_FAIL_STACK() \
1288 do { \
1289 fail_stack.avail = 0; \
505bde11 1290 fail_stack.frame = 0; \
fa9a63c5
RM
1291 } while (0)
1292
ed0767d8 1293#define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1294#endif
1295
1296
320a2a73
KH
1297/* Double the size of FAIL_STACK, up to a limit
1298 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1299
1300 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1301 allocating space for it or it was already too large.
1302
25fe55af 1303 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1304
320a2a73
KH
1305/* Factor to increase the failure stack size by
1306 when we increase it.
1307 This used to be 2, but 2 was too wasteful
1308 because the old discarded stacks added up to as much space
1309 were as ultimate, maximum-size stack. */
1310#define FAIL_STACK_GROWTH_FACTOR 4
1311
1312#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1313 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1314 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1315 ? 0 \
320a2a73
KH
1316 : ((fail_stack).stack \
1317 = (fail_stack_elt_t *) \
25fe55af
RS
1318 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1319 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1320 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1321 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1322 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1323 \
1324 (fail_stack).stack == NULL \
1325 ? 0 \
6453db45
KH
1326 : ((fail_stack).size \
1327 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1328 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1329 * FAIL_STACK_GROWTH_FACTOR)) \
1330 / sizeof (fail_stack_elt_t)), \
25fe55af 1331 1)))
fa9a63c5
RM
1332
1333
5e69f11e 1334/* Push pointer POINTER on FAIL_STACK.
fa9a63c5
RM
1335 Return 1 if was able to do so and 0 if ran out of memory allocating
1336 space to do so. */
1337#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1338 ((FAIL_STACK_FULL () \
320a2a73 1339 && !GROW_FAIL_STACK (FAIL_STACK)) \
fa9a63c5
RM
1340 ? 0 \
1341 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1342 1))
505bde11 1343#define POP_PATTERN_OP() POP_FAILURE_POINTER ()
fa9a63c5
RM
1344
1345/* Push a pointer value onto the failure stack.
1346 Assumes the variable `fail_stack'. Probably should only
25fe55af 1347 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1348#define PUSH_FAILURE_POINTER(item) \
1349 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1350
1351/* This pushes an integer-valued item onto the failure stack.
1352 Assumes the variable `fail_stack'. Probably should only
25fe55af 1353 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1354#define PUSH_FAILURE_INT(item) \
1355 fail_stack.stack[fail_stack.avail++].integer = (item)
1356
1357/* Push a fail_stack_elt_t value onto the failure stack.
1358 Assumes the variable `fail_stack'. Probably should only
25fe55af 1359 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1360#define PUSH_FAILURE_ELT(item) \
1361 fail_stack.stack[fail_stack.avail++] = (item)
1362
1363/* These three POP... operations complement the three PUSH... operations.
1364 All assume that `fail_stack' is nonempty. */
1365#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1366#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1367#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1368
505bde11
SM
1369/* Individual items aside from the registers. */
1370#define NUM_NONREG_ITEMS 3
1371
1372/* Used to examine the stack (to detect infinite loops). */
1373#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1374#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1375#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1376#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1377
1378
505bde11
SM
1379#define ENSURE_FAIL_STACK(space) \
1380while (REMAINING_AVAIL_SLOTS <= space) { \
1381 if (!GROW_FAIL_STACK (fail_stack)) \
1382 return -2; \
1383 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1384 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1385}
1386
1387/* Push register NUM onto the stack. */
1388#define PUSH_FAILURE_REG(num) \
1389do { \
1390 char *destination; \
1391 ENSURE_FAIL_STACK(3); \
1392 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1393 num, regstart[num], regend[num]); \
1394 PUSH_FAILURE_POINTER (regstart[num]); \
1395 PUSH_FAILURE_POINTER (regend[num]); \
1396 PUSH_FAILURE_INT (num); \
1397} while (0)
1398
1399/* Pop a saved register off the stack. */
1400#define POP_FAILURE_REG() \
1401do { \
1402 int reg = POP_FAILURE_INT (); \
1403 regend[reg] = POP_FAILURE_POINTER (); \
1404 regstart[reg] = POP_FAILURE_POINTER (); \
1405 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1406 reg, regstart[reg], regend[reg]); \
1407} while (0)
1408
1409/* Check that we are not stuck in an infinite loop. */
1410#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1411do { \
1412 int failure = TOP_FAILURE_HANDLE(); \
1413 /* Check for infinite matching loops */ \
1414 while (failure > 0 && \
1415 (FAILURE_STR (failure) == string_place \
1416 || FAILURE_STR (failure) == NULL)) \
1417 { \
1418 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1419 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11
SM
1420 if (FAILURE_PAT (failure) == pat_cur) \
1421 goto fail; \
66f0296e 1422 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1423 failure = NEXT_FAILURE_HANDLE(failure); \
1424 } \
1425 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1426} while (0)
1427
fa9a63c5 1428/* Push the information about the state we will need
5e69f11e
RM
1429 if we ever fail back to it.
1430
505bde11 1431 Requires variables fail_stack, regstart, regend and
320a2a73 1432 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1433 declared.
5e69f11e 1434
fa9a63c5
RM
1435 Does `return FAILURE_CODE' if runs out of memory. */
1436
505bde11
SM
1437#define PUSH_FAILURE_POINT(pattern, string_place) \
1438do { \
1439 char *destination; \
1440 /* Must be int, so when we don't save any registers, the arithmetic \
1441 of 0 + -1 isn't done as unsigned. */ \
1442 \
1443 DEBUG_STATEMENT (failure_id++); \
1444 DEBUG_STATEMENT (nfailure_points_pushed++); \
1445 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1446 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1447 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1448 \
1449 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1450 \
1451 DEBUG_PRINT1 ("\n"); \
1452 \
1453 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1454 PUSH_FAILURE_INT (fail_stack.frame); \
1455 \
1456 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1457 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1458 DEBUG_PRINT1 ("'\n"); \
1459 PUSH_FAILURE_POINTER (string_place); \
1460 \
1461 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1462 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1463 PUSH_FAILURE_POINTER (pattern); \
1464 \
1465 /* Close the frame by moving the frame pointer past it. */ \
1466 fail_stack.frame = fail_stack.avail; \
1467} while (0)
fa9a63c5 1468
320a2a73
KH
1469/* Estimate the size of data pushed by a typical failure stack entry.
1470 An estimate is all we need, because all we use this for
1471 is to choose a limit for how big to make the failure stack. */
1472
1473#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1474
fa9a63c5
RM
1475/* How many items can still be added to the stack without overflowing it. */
1476#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1477
1478
1479/* Pops what PUSH_FAIL_STACK pushes.
1480
1481 We restore into the parameters, all of which should be lvalues:
1482 STR -- the saved data position.
1483 PAT -- the saved pattern position.
fa9a63c5 1484 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1485
fa9a63c5 1486 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1487 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1488
505bde11
SM
1489#define POP_FAILURE_POINT(str, pat) \
1490do { \
fa9a63c5
RM
1491 assert (!FAIL_STACK_EMPTY ()); \
1492 \
1493 /* Remove failure points and point to how many regs pushed. */ \
1494 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1495 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1496 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1497 \
505bde11
SM
1498 /* Pop the saved registers. */ \
1499 while (fail_stack.frame < fail_stack.avail) \
1500 POP_FAILURE_REG (); \
fa9a63c5 1501 \
505bde11
SM
1502 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1503 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1504 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1505 \
1506 /* If the saved string location is NULL, it came from an \
1507 on_failure_keep_string_jump opcode, and we want to throw away the \
1508 saved NULL, thus retaining our current position in the string. */ \
66f0296e 1509 str = (re_char *) POP_FAILURE_POINTER (); \
505bde11 1510 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1511 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1512 DEBUG_PRINT1 ("'\n"); \
1513 \
505bde11
SM
1514 fail_stack.frame = POP_FAILURE_INT (); \
1515 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1516 \
505bde11
SM
1517 assert (fail_stack.avail >= 0); \
1518 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1519 \
fa9a63c5 1520 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1521} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1522
1523
1524\f
fa9a63c5 1525/* Registers are set to a sentinel when they haven't yet matched. */
505bde11 1526#define REG_UNSET_VALUE NULL
fa9a63c5
RM
1527#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1528\f
1529/* Subroutine declarations and macros for regex_compile. */
1530
99633e97
SM
1531static void store_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc, int arg));
1532static void store_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1533 int arg1, int arg2));
1534static void insert_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1535 int arg, unsigned char *end));
1536static void insert_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1537 int arg1, int arg2, unsigned char *end));
1538static boolean at_begline_loc_p _RE_ARGS((const unsigned char *pattern,
1539 const unsigned char *p,
1540 reg_syntax_t syntax));
1541static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p,
1542 const unsigned char *pend,
1543 reg_syntax_t syntax));
4e8a9132 1544static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
fa9a63c5 1545
5e69f11e 1546/* Fetch the next character in the uncompiled pattern---translating it
fa9a63c5
RM
1547 if necessary. Also cast from a signed character in the constant
1548 string passed to us by the user to an unsigned char that we can use
1549 as an array index (in, e.g., `translate'). */
1550#define PATFETCH(c) \
99633e97
SM
1551 do { \
1552 PATFETCH_RAW (c); \
28703c16 1553 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
fa9a63c5
RM
1554 } while (0)
1555
1556/* Fetch the next character in the uncompiled pattern, with no
25fe55af 1557 translation. */
fa9a63c5
RM
1558#define PATFETCH_RAW(c) \
1559 do {if (p == pend) return REG_EEND; \
66f0296e 1560 c = *p++; \
fa9a63c5
RM
1561 } while (0)
1562
1563/* Go backwards one character in the pattern. */
1564#define PATUNFETCH p--
1565
1566
1567/* If `translate' is non-null, return translate[D], else just D. We
1568 cast the subscript to translate because some data is declared as
1569 `char *', to avoid warnings when a string constant is passed. But
1570 when we use a character as a subscript we must make it unsigned. */
6676cb1c
RS
1571#ifndef TRANSLATE
1572#define TRANSLATE(d) \
66f0296e 1573 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1574#endif
fa9a63c5
RM
1575
1576
1577/* Macros for outputting the compiled pattern into `buffer'. */
1578
1579/* If the buffer isn't allocated when it comes in, use this. */
1580#define INIT_BUF_SIZE 32
1581
25fe55af 1582/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5
RM
1583#define GET_BUFFER_SPACE(n) \
1584 while (b - bufp->buffer + (n) > bufp->allocated) \
1585 EXTEND_BUFFER ()
1586
1587/* Make sure we have one more byte of buffer space and then add C to it. */
1588#define BUF_PUSH(c) \
1589 do { \
1590 GET_BUFFER_SPACE (1); \
1591 *b++ = (unsigned char) (c); \
1592 } while (0)
1593
1594
1595/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1596#define BUF_PUSH_2(c1, c2) \
1597 do { \
1598 GET_BUFFER_SPACE (2); \
1599 *b++ = (unsigned char) (c1); \
1600 *b++ = (unsigned char) (c2); \
1601 } while (0)
1602
1603
25fe55af 1604/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1605#define BUF_PUSH_3(c1, c2, c3) \
1606 do { \
1607 GET_BUFFER_SPACE (3); \
1608 *b++ = (unsigned char) (c1); \
1609 *b++ = (unsigned char) (c2); \
1610 *b++ = (unsigned char) (c3); \
1611 } while (0)
1612
1613
1614/* Store a jump with opcode OP at LOC to location TO. We store a
25fe55af 1615 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1616#define STORE_JUMP(op, loc, to) \
1617 store_op1 (op, loc, (to) - (loc) - 3)
1618
1619/* Likewise, for a two-argument jump. */
1620#define STORE_JUMP2(op, loc, to, arg) \
1621 store_op2 (op, loc, (to) - (loc) - 3, arg)
1622
25fe55af 1623/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1624#define INSERT_JUMP(op, loc, to) \
1625 insert_op1 (op, loc, (to) - (loc) - 3, b)
1626
1627/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1628#define INSERT_JUMP2(op, loc, to, arg) \
1629 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1630
1631
1632/* This is not an arbitrary limit: the arguments which represent offsets
25fe55af 1633 into the pattern are two bytes long. So if 2^16 bytes turns out to
fa9a63c5
RM
1634 be too small, many things would have to change. */
1635#define MAX_BUF_SIZE (1L << 16)
1636
1637
1638/* Extend the buffer by twice its current size via realloc and
1639 reset the pointers that pointed into the old block to point to the
1640 correct places in the new one. If extending the buffer results in it
25fe55af 1641 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
fa9a63c5 1642#define EXTEND_BUFFER() \
25fe55af 1643 do { \
fa9a63c5 1644 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1645 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1646 return REG_ESIZE; \
1647 bufp->allocated <<= 1; \
1648 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1649 bufp->allocated = MAX_BUF_SIZE; \
fa9a63c5
RM
1650 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1651 if (bufp->buffer == NULL) \
1652 return REG_ESPACE; \
1653 /* If the buffer moved, move all the pointers into it. */ \
1654 if (old_buffer != bufp->buffer) \
1655 { \
25fe55af
RS
1656 b = (b - old_buffer) + bufp->buffer; \
1657 begalt = (begalt - old_buffer) + bufp->buffer; \
1658 if (fixup_alt_jump) \
1659 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1660 if (laststart) \
1661 laststart = (laststart - old_buffer) + bufp->buffer; \
1662 if (pending_exact) \
1663 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
fa9a63c5
RM
1664 } \
1665 } while (0)
1666
1667
1668/* Since we have one byte reserved for the register number argument to
1669 {start,stop}_memory, the maximum number of groups we can report
1670 things about is what fits in that byte. */
1671#define MAX_REGNUM 255
1672
1673/* But patterns can have more than `MAX_REGNUM' registers. We just
1674 ignore the excess. */
1675typedef unsigned regnum_t;
1676
1677
1678/* Macros for the compile stack. */
1679
1680/* Since offsets can go either forwards or backwards, this type needs to
25fe55af 1681 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
fa9a63c5
RM
1682typedef int pattern_offset_t;
1683
1684typedef struct
1685{
1686 pattern_offset_t begalt_offset;
1687 pattern_offset_t fixup_alt_jump;
5e69f11e 1688 pattern_offset_t laststart_offset;
fa9a63c5
RM
1689 regnum_t regnum;
1690} compile_stack_elt_t;
1691
1692
1693typedef struct
1694{
1695 compile_stack_elt_t *stack;
1696 unsigned size;
1697 unsigned avail; /* Offset of next open position. */
1698} compile_stack_type;
1699
1700
1701#define INIT_COMPILE_STACK_SIZE 32
1702
1703#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1704#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1705
25fe55af 1706/* The next available element. */
fa9a63c5
RM
1707#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1708
1709
b18215fc
RS
1710/* Structure to manage work area for range table. */
1711struct range_table_work_area
1712{
1713 int *table; /* actual work area. */
1714 int allocated; /* allocated size for work area in bytes. */
25fe55af 1715 int used; /* actually used size in words. */
96cc36cc 1716 int bits; /* flag to record character classes */
b18215fc
RS
1717};
1718
1719/* Make sure that WORK_AREA can hold more N multibyte characters. */
1720#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1721 do { \
1722 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1723 { \
1724 (work_area).allocated += 16 * sizeof (int); \
1725 if ((work_area).table) \
1726 (work_area).table \
1727 = (int *) realloc ((work_area).table, (work_area).allocated); \
1728 else \
1729 (work_area).table \
1730 = (int *) malloc ((work_area).allocated); \
1731 if ((work_area).table == 0) \
1732 FREE_STACK_RETURN (REG_ESPACE); \
1733 } \
1734 } while (0)
1735
96cc36cc
RS
1736#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1737 (work_area).bits |= (bit)
1738
1739/* These bits represent the various character classes such as [:alnum:]
1740 in a charset's range table. */
1741#define BIT_ALNUM 0x1
1742#define BIT_ALPHA 0x2
1743#define BIT_WORD 0x4
f71b19b6
DL
1744#define BIT_ASCII 0x8
1745#define BIT_NONASCII 0x10
96cc36cc
RS
1746#define BIT_GRAPH 0x20
1747#define BIT_LOWER 0x40
1748#define BIT_PRINT 0x80
1749#define BIT_PUNCT 0x100
1750#define BIT_SPACE 0x200
1751#define BIT_UPPER 0x400
f71b19b6
DL
1752#define BIT_UNIBYTE 0x800
1753#define BIT_MULTIBYTE 0x1000
96cc36cc 1754
b18215fc
RS
1755/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1756#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1757 do { \
1758 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1759 (work_area).table[(work_area).used++] = (range_start); \
1760 (work_area).table[(work_area).used++] = (range_end); \
1761 } while (0)
1762
25fe55af 1763/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1764#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1765 do { \
1766 if ((work_area).table) \
1767 free ((work_area).table); \
1768 } while (0)
1769
96cc36cc 1770#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1771#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1772#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc
RS
1773#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1774
1775
fa9a63c5 1776/* Set the bit for character C in a list. */
25fe55af
RS
1777#define SET_LIST_BIT(c) \
1778 (b[((unsigned char) (c)) / BYTEWIDTH] \
fa9a63c5
RM
1779 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1780
1781
1782/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1783#define GET_UNSIGNED_NUMBER(num) \
99633e97 1784 do { if (p != pend) \
fa9a63c5 1785 { \
25fe55af
RS
1786 PATFETCH (c); \
1787 while (ISDIGIT (c)) \
1788 { \
1789 if (num < 0) \
1790 num = 0; \
1791 num = num * 10 + c - '0'; \
1792 if (p == pend) \
1793 break; \
1794 PATFETCH (c); \
1795 } \
1796 } \
99633e97 1797 } while (0)
fa9a63c5
RM
1798
1799#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1800
1801#define IS_CHAR_CLASS(string) \
1802 (STREQ (string, "alpha") || STREQ (string, "upper") \
1803 || STREQ (string, "lower") || STREQ (string, "digit") \
1804 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1805 || STREQ (string, "space") || STREQ (string, "print") \
1806 || STREQ (string, "punct") || STREQ (string, "graph") \
96cc36cc 1807 || STREQ (string, "cntrl") || STREQ (string, "blank") \
f71b19b6
DL
1808 || STREQ (string, "word") \
1809 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1810 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
99633e97
SM
1811
1812/* QUIT is only used on NTemacs. */
1813#if !defined (WINDOWSNT) || !defined (emacs)
1814#undef QUIT
1815#define QUIT
1816#endif
fa9a63c5
RM
1817\f
1818#ifndef MATCH_MAY_ALLOCATE
1819
1820/* If we cannot allocate large objects within re_match_2_internal,
1821 we make the fail stack and register vectors global.
1822 The fail stack, we grow to the maximum size when a regexp
1823 is compiled.
1824 The register vectors, we adjust in size each time we
1825 compile a regexp, according to the number of registers it needs. */
1826
1827static fail_stack_type fail_stack;
1828
1829/* Size with which the following vectors are currently allocated.
1830 That is so we can make them bigger as needed,
25fe55af 1831 but never make them smaller. */
fa9a63c5
RM
1832static int regs_allocated_size;
1833
66f0296e
SM
1834static re_char ** regstart, ** regend;
1835static re_char **best_regstart, **best_regend;
fa9a63c5
RM
1836
1837/* Make the register vectors big enough for NUM_REGS registers,
25fe55af 1838 but don't make them smaller. */
fa9a63c5
RM
1839
1840static
1841regex_grow_registers (num_regs)
1842 int num_regs;
1843{
1844 if (num_regs > regs_allocated_size)
1845 {
66f0296e
SM
1846 RETALLOC_IF (regstart, num_regs, re_char *);
1847 RETALLOC_IF (regend, num_regs, re_char *);
1848 RETALLOC_IF (best_regstart, num_regs, re_char *);
1849 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
1850
1851 regs_allocated_size = num_regs;
1852 }
1853}
1854
1855#endif /* not MATCH_MAY_ALLOCATE */
1856\f
99633e97
SM
1857static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1858 compile_stack,
1859 regnum_t regnum));
1860
fa9a63c5
RM
1861/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1862 Returns one of error codes defined in `regex.h', or zero for success.
1863
1864 Assumes the `allocated' (and perhaps `buffer') and `translate'
1865 fields are set in BUFP on entry.
1866
1867 If it succeeds, results are put in BUFP (if it returns an error, the
1868 contents of BUFP are undefined):
1869 `buffer' is the compiled pattern;
1870 `syntax' is set to SYNTAX;
1871 `used' is set to the length of the compiled pattern;
1872 `fastmap_accurate' is zero;
1873 `re_nsub' is the number of subexpressions in PATTERN;
1874 `not_bol' and `not_eol' are zero;
5e69f11e 1875
fa9a63c5
RM
1876 The `fastmap' and `newline_anchor' fields are neither
1877 examined nor set. */
1878
505bde11
SM
1879/* Insert the `jump' from the end of last alternative to "here".
1880 The space for the jump has already been allocated. */
1881#define FIXUP_ALT_JUMP() \
1882do { \
1883 if (fixup_alt_jump) \
1884 STORE_JUMP (jump, fixup_alt_jump, b); \
1885} while (0)
1886
1887
fa9a63c5
RM
1888/* Return, freeing storage we allocated. */
1889#define FREE_STACK_RETURN(value) \
b18215fc
RS
1890 do { \
1891 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1892 free (compile_stack.stack); \
1893 return value; \
1894 } while (0)
fa9a63c5
RM
1895
1896static reg_errcode_t
1897regex_compile (pattern, size, syntax, bufp)
66f0296e 1898 re_char *pattern;
fa9a63c5
RM
1899 int size;
1900 reg_syntax_t syntax;
1901 struct re_pattern_buffer *bufp;
1902{
1903 /* We fetch characters from PATTERN here. Even though PATTERN is
1904 `char *' (i.e., signed), we declare these variables as unsigned, so
1905 they can be reliably used as array indices. */
b18215fc 1906 register unsigned int c, c1;
5e69f11e 1907
fa9a63c5 1908 /* A random temporary spot in PATTERN. */
66f0296e 1909 re_char *p1;
fa9a63c5
RM
1910
1911 /* Points to the end of the buffer, where we should append. */
1912 register unsigned char *b;
5e69f11e 1913
fa9a63c5
RM
1914 /* Keeps track of unclosed groups. */
1915 compile_stack_type compile_stack;
1916
1917 /* Points to the current (ending) position in the pattern. */
22336245
RS
1918#ifdef AIX
1919 /* `const' makes AIX compiler fail. */
66f0296e 1920 unsigned char *p = pattern;
22336245 1921#else
66f0296e 1922 re_char *p = pattern;
22336245 1923#endif
66f0296e 1924 re_char *pend = pattern + size;
5e69f11e 1925
fa9a63c5 1926 /* How to translate the characters in the pattern. */
6676cb1c 1927 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
1928
1929 /* Address of the count-byte of the most recently inserted `exactn'
1930 command. This makes it possible to tell if a new exact-match
1931 character can be added to that command or if the character requires
1932 a new `exactn' command. */
1933 unsigned char *pending_exact = 0;
1934
1935 /* Address of start of the most recently finished expression.
1936 This tells, e.g., postfix * where to find the start of its
1937 operand. Reset at the beginning of groups and alternatives. */
1938 unsigned char *laststart = 0;
1939
1940 /* Address of beginning of regexp, or inside of last group. */
1941 unsigned char *begalt;
1942
1943 /* Place in the uncompiled pattern (i.e., the {) to
1944 which to go back if the interval is invalid. */
66f0296e 1945 re_char *beg_interval;
5e69f11e 1946
fa9a63c5 1947 /* Address of the place where a forward jump should go to the end of
25fe55af 1948 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
1949 last -- ends with a forward jump of this sort. */
1950 unsigned char *fixup_alt_jump = 0;
1951
1952 /* Counts open-groups as they are encountered. Remembered for the
1953 matching close-group on the compile stack, so the same register
1954 number is put in the stop_memory as the start_memory. */
1955 regnum_t regnum = 0;
1956
b18215fc
RS
1957 /* Work area for range table of charset. */
1958 struct range_table_work_area range_table_work;
1959
fa9a63c5 1960#ifdef DEBUG
99633e97 1961 debug++;
fa9a63c5 1962 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 1963 if (debug > 0)
fa9a63c5
RM
1964 {
1965 unsigned debug_count;
5e69f11e 1966
fa9a63c5 1967 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 1968 putchar (pattern[debug_count]);
fa9a63c5
RM
1969 putchar ('\n');
1970 }
1971#endif /* DEBUG */
1972
1973 /* Initialize the compile stack. */
1974 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1975 if (compile_stack.stack == NULL)
1976 return REG_ESPACE;
1977
1978 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1979 compile_stack.avail = 0;
1980
b18215fc
RS
1981 range_table_work.table = 0;
1982 range_table_work.allocated = 0;
1983
fa9a63c5
RM
1984 /* Initialize the pattern buffer. */
1985 bufp->syntax = syntax;
1986 bufp->fastmap_accurate = 0;
1987 bufp->not_bol = bufp->not_eol = 0;
1988
1989 /* Set `used' to zero, so that if we return an error, the pattern
1990 printer (for debugging) will think there's no pattern. We reset it
1991 at the end. */
1992 bufp->used = 0;
5e69f11e 1993
fa9a63c5 1994 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 1995 bufp->re_nsub = 0;
fa9a63c5 1996
b18215fc
RS
1997#ifdef emacs
1998 /* bufp->multibyte is set before regex_compile is called, so don't alter
1999 it. */
2000#else /* not emacs */
2001 /* Nothing is recognized as a multibyte character. */
2002 bufp->multibyte = 0;
2003#endif
2004
fa9a63c5
RM
2005#if !defined (emacs) && !defined (SYNTAX_TABLE)
2006 /* Initialize the syntax table. */
2007 init_syntax_once ();
2008#endif
2009
2010 if (bufp->allocated == 0)
2011 {
2012 if (bufp->buffer)
2013 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2014 enough space. This loses if buffer's address is bogus, but
2015 that is the user's responsibility. */
2016 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2017 }
fa9a63c5 2018 else
25fe55af
RS
2019 { /* Caller did not allocate a buffer. Do it for them. */
2020 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2021 }
fa9a63c5
RM
2022 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2023
2024 bufp->allocated = INIT_BUF_SIZE;
2025 }
2026
2027 begalt = b = bufp->buffer;
2028
2029 /* Loop through the uncompiled pattern until we're at the end. */
2030 while (p != pend)
2031 {
2032 PATFETCH (c);
2033
2034 switch (c)
25fe55af
RS
2035 {
2036 case '^':
2037 {
2038 if ( /* If at start of pattern, it's an operator. */
2039 p == pattern + 1
2040 /* If context independent, it's an operator. */
2041 || syntax & RE_CONTEXT_INDEP_ANCHORS
2042 /* Otherwise, depends on what's come before. */
2043 || at_begline_loc_p (pattern, p, syntax))
2044 BUF_PUSH (begline);
2045 else
2046 goto normal_char;
2047 }
2048 break;
2049
2050
2051 case '$':
2052 {
2053 if ( /* If at end of pattern, it's an operator. */
2054 p == pend
2055 /* If context independent, it's an operator. */
2056 || syntax & RE_CONTEXT_INDEP_ANCHORS
2057 /* Otherwise, depends on what's next. */
2058 || at_endline_loc_p (p, pend, syntax))
2059 BUF_PUSH (endline);
2060 else
2061 goto normal_char;
2062 }
2063 break;
fa9a63c5
RM
2064
2065
2066 case '+':
25fe55af
RS
2067 case '?':
2068 if ((syntax & RE_BK_PLUS_QM)
2069 || (syntax & RE_LIMITED_OPS))
2070 goto normal_char;
2071 handle_plus:
2072 case '*':
2073 /* If there is no previous pattern... */
2074 if (!laststart)
2075 {
2076 if (syntax & RE_CONTEXT_INVALID_OPS)
2077 FREE_STACK_RETURN (REG_BADRPT);
2078 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2079 goto normal_char;
2080 }
2081
2082 {
25fe55af 2083 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2084 boolean zero_times_ok = 0, many_times_ok = 0;
2085 boolean greedy = 1;
25fe55af
RS
2086
2087 /* If there is a sequence of repetition chars, collapse it
2088 down to just one (the right one). We can't combine
2089 interval operators with these because of, e.g., `a{2}*',
2090 which should only match an even number of `a's. */
2091
2092 for (;;)
2093 {
1c8c6d39
DL
2094 if (!(syntax & RE_ALL_GREEDY)
2095 && c == '?' && (zero_times_ok || many_times_ok))
2096 greedy = 0;
2097 else
2098 {
2099 zero_times_ok |= c != '+';
2100 many_times_ok |= c != '?';
2101 }
25fe55af
RS
2102
2103 if (p == pend)
2104 break;
ed0767d8
SM
2105 else if (*p == '*'
2106 || (!(syntax & RE_BK_PLUS_QM)
2107 && (*p == '+' || *p == '?')))
25fe55af 2108 ;
ed0767d8 2109 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2110 {
ed0767d8
SM
2111 if (p+1 == pend)
2112 FREE_STACK_RETURN (REG_EESCAPE);
2113 if (p[1] == '+' || p[1] == '?')
2114 PATFETCH (c); /* Gobble up the backslash. */
2115 else
2116 break;
25fe55af
RS
2117 }
2118 else
ed0767d8 2119 break;
25fe55af 2120 /* If we get here, we found another repeat character. */
ed0767d8
SM
2121 PATFETCH (c);
2122 }
25fe55af
RS
2123
2124 /* Star, etc. applied to an empty pattern is equivalent
2125 to an empty pattern. */
4e8a9132 2126 if (!laststart || laststart == b)
25fe55af
RS
2127 break;
2128
2129 /* Now we know whether or not zero matches is allowed
2130 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2131 if (greedy)
2132 {
99633e97 2133 if (many_times_ok)
4e8a9132
SM
2134 {
2135 boolean simple = skip_one_char (laststart) == b;
2136 unsigned int startoffset = 0;
2137 assert (skip_one_char (laststart) <= b);
2138
2139 if (!zero_times_ok && simple)
2140 { /* Since simple * loops can be made faster by using
2141 on_failure_keep_string_jump, we turn simple P+
2142 into PP* if P is simple. */
2143 unsigned char *p1, *p2;
2144 startoffset = b - laststart;
2145 GET_BUFFER_SPACE (startoffset);
2146 p1 = b; p2 = laststart;
2147 while (p2 < p1)
2148 *b++ = *p2++;
2149 zero_times_ok = 1;
99633e97 2150 }
4e8a9132
SM
2151
2152 GET_BUFFER_SPACE (6);
2153 if (!zero_times_ok)
2154 /* A + loop. */
2155 STORE_JUMP (on_failure_jump_loop, b, b + 6);
99633e97 2156 else
4e8a9132
SM
2157 /* Simple * loops can use on_failure_keep_string_jump
2158 depending on what follows. But since we don't know
2159 that yet, we leave the decision up to
2160 on_failure_jump_smart. */
2161 INSERT_JUMP (simple ? on_failure_jump_smart
2162 : on_failure_jump_loop,
2163 laststart + startoffset, b + 6);
99633e97 2164 b += 3;
4e8a9132 2165 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2166 b += 3;
2167 }
2168 else
2169 {
4e8a9132
SM
2170 /* A simple ? pattern. */
2171 assert (zero_times_ok);
2172 GET_BUFFER_SPACE (3);
2173 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2174 b += 3;
2175 }
1c8c6d39
DL
2176 }
2177 else /* not greedy */
2178 { /* I wish the greedy and non-greedy cases could be merged. */
2179
0683b6fa 2180 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2181 if (many_times_ok)
2182 {
505bde11 2183 /* The non-greedy multiple match looks like a repeat..until:
1c8c6d39 2184 we only need a conditional jump at the end of the loop */
0683b6fa
SM
2185 BUF_PUSH (no_op);
2186 STORE_JUMP (on_failure_jump_nastyloop, b, laststart);
1c8c6d39
DL
2187 b += 3;
2188 if (zero_times_ok)
2189 {
2190 /* The repeat...until naturally matches one or more.
2191 To also match zero times, we need to first jump to
2192 the end of the loop (its conditional jump). */
1c8c6d39
DL
2193 INSERT_JUMP (jump, laststart, b);
2194 b += 3;
2195 }
2196 }
2197 else
2198 {
2199 /* non-greedy a?? */
1c8c6d39
DL
2200 INSERT_JUMP (jump, laststart, b + 3);
2201 b += 3;
2202 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2203 b += 3;
2204 }
2205 }
2206 }
4e8a9132 2207 pending_exact = 0;
fa9a63c5
RM
2208 break;
2209
2210
2211 case '.':
25fe55af
RS
2212 laststart = b;
2213 BUF_PUSH (anychar);
2214 break;
fa9a63c5
RM
2215
2216
25fe55af
RS
2217 case '[':
2218 {
b18215fc 2219 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2220
25fe55af 2221 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2222
25fe55af
RS
2223 /* Ensure that we have enough space to push a charset: the
2224 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2225 GET_BUFFER_SPACE (34);
2226
25fe55af 2227 laststart = b;
e318085a 2228
25fe55af
RS
2229 /* We test `*p == '^' twice, instead of using an if
2230 statement, so we only need one BUF_PUSH. */
2231 BUF_PUSH (*p == '^' ? charset_not : charset);
2232 if (*p == '^')
2233 p++;
e318085a 2234
25fe55af
RS
2235 /* Remember the first position in the bracket expression. */
2236 p1 = p;
e318085a 2237
25fe55af
RS
2238 /* Push the number of bytes in the bitmap. */
2239 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2240
25fe55af
RS
2241 /* Clear the whole map. */
2242 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2243
25fe55af
RS
2244 /* charset_not matches newline according to a syntax bit. */
2245 if ((re_opcode_t) b[-2] == charset_not
2246 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2247 SET_LIST_BIT ('\n');
fa9a63c5 2248
25fe55af
RS
2249 /* Read in characters and ranges, setting map bits. */
2250 for (;;)
2251 {
b18215fc
RS
2252 int len;
2253 boolean escaped_char = false;
e318085a 2254
25fe55af 2255 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2256
25fe55af 2257 PATFETCH (c);
e318085a 2258
25fe55af
RS
2259 /* \ might escape characters inside [...] and [^...]. */
2260 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2261 {
2262 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2263
2264 PATFETCH (c);
b18215fc 2265 escaped_char = true;
25fe55af 2266 }
b18215fc
RS
2267 else
2268 {
657fcfbd
RS
2269 /* Could be the end of the bracket expression. If it's
2270 not (i.e., when the bracket expression is `[]' so
2271 far), the ']' character bit gets set way below. */
2272 if (c == ']' && p != p1 + 1)
2273 break;
25fe55af 2274 }
b18215fc
RS
2275
2276 /* If C indicates start of multibyte char, get the
2277 actual character code in C, and set the pattern
2278 pointer P to the next character boundary. */
2279 if (bufp->multibyte && BASE_LEADING_CODE_P (c))
2280 {
2281 PATUNFETCH;
2282 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2283 p += len;
25fe55af 2284 }
b18215fc
RS
2285 /* What should we do for the character which is
2286 greater than 0x7F, but not BASE_LEADING_CODE_P?
2287 XXX */
2288
25fe55af
RS
2289 /* See if we're at the beginning of a possible character
2290 class. */
b18215fc
RS
2291
2292 else if (!escaped_char &&
2293 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2294 {
2295 /* Leave room for the null. */
25fe55af 2296 char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2297 const unsigned char *class_beg;
b18215fc 2298
25fe55af
RS
2299 PATFETCH (c);
2300 c1 = 0;
ed0767d8 2301 class_beg = p;
b18215fc 2302
25fe55af
RS
2303 /* If pattern is `[[:'. */
2304 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2305
25fe55af
RS
2306 for (;;)
2307 {
2308 PATFETCH (c);
2309 if (c == ':' || c == ']' || p == pend
2310 || c1 == CHAR_CLASS_MAX_LENGTH)
2311 break;
2312 str[c1++] = c;
2313 }
2314 str[c1] = '\0';
b18215fc
RS
2315
2316 /* If isn't a word bracketed by `[:' and `:]':
2317 undo the ending character, the letters, and
2318 leave the leading `:' and `[' (but set bits for
2319 them). */
25fe55af
RS
2320 if (c == ':' && *p == ']')
2321 {
2322 int ch;
2323 boolean is_alnum = STREQ (str, "alnum");
2324 boolean is_alpha = STREQ (str, "alpha");
f71b19b6 2325 boolean is_ascii = STREQ (str, "ascii");
25fe55af
RS
2326 boolean is_blank = STREQ (str, "blank");
2327 boolean is_cntrl = STREQ (str, "cntrl");
2328 boolean is_digit = STREQ (str, "digit");
2329 boolean is_graph = STREQ (str, "graph");
2330 boolean is_lower = STREQ (str, "lower");
f71b19b6
DL
2331 boolean is_multibyte = STREQ (str, "multibyte");
2332 boolean is_nonascii = STREQ (str, "nonascii");
25fe55af
RS
2333 boolean is_print = STREQ (str, "print");
2334 boolean is_punct = STREQ (str, "punct");
2335 boolean is_space = STREQ (str, "space");
f71b19b6 2336 boolean is_unibyte = STREQ (str, "unibyte");
25fe55af 2337 boolean is_upper = STREQ (str, "upper");
96cc36cc 2338 boolean is_word = STREQ (str, "word");
f71b19b6 2339 boolean is_xdigit = STREQ (str, "xdigit");
25fe55af
RS
2340
2341 if (!IS_CHAR_CLASS (str))
fa9a63c5
RM
2342 FREE_STACK_RETURN (REG_ECTYPE);
2343
25fe55af
RS
2344 /* Throw away the ] at the end of the character
2345 class. */
2346 PATFETCH (c);
fa9a63c5 2347
25fe55af 2348 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2349
96cc36cc
RS
2350 /* Most character classes in a multibyte match
2351 just set a flag. Exceptions are is_blank,
2352 is_digit, is_cntrl, and is_xdigit, since
2353 they can only match ASCII characters. We
2354 don't need to handle them for multibyte. */
2355
2356 if (bufp->multibyte)
2357 {
2358 int bit = 0;
2359
2360 if (is_alnum) bit = BIT_ALNUM;
2361 if (is_alpha) bit = BIT_ALPHA;
f71b19b6 2362 if (is_ascii) bit = BIT_ASCII;
96cc36cc
RS
2363 if (is_graph) bit = BIT_GRAPH;
2364 if (is_lower) bit = BIT_LOWER;
f71b19b6
DL
2365 if (is_multibyte) bit = BIT_MULTIBYTE;
2366 if (is_nonascii) bit = BIT_NONASCII;
96cc36cc
RS
2367 if (is_print) bit = BIT_PRINT;
2368 if (is_punct) bit = BIT_PUNCT;
2369 if (is_space) bit = BIT_SPACE;
f71b19b6 2370 if (is_unibyte) bit = BIT_UNIBYTE;
96cc36cc
RS
2371 if (is_upper) bit = BIT_UPPER;
2372 if (is_word) bit = BIT_WORD;
2373 if (bit)
2374 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2375 bit);
2376 }
2377
2378 /* Handle character classes for ASCII characters. */
25fe55af
RS
2379 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2380 {
7ae68633 2381 int translated = TRANSLATE (ch);
fa9a63c5
RM
2382 /* This was split into 3 if's to
2383 avoid an arbitrary limit in some compiler. */
25fe55af
RS
2384 if ( (is_alnum && ISALNUM (ch))
2385 || (is_alpha && ISALPHA (ch))
2386 || (is_blank && ISBLANK (ch))
2387 || (is_cntrl && ISCNTRL (ch)))
7ae68633 2388 SET_LIST_BIT (translated);
fa9a63c5 2389 if ( (is_digit && ISDIGIT (ch))
25fe55af
RS
2390 || (is_graph && ISGRAPH (ch))
2391 || (is_lower && ISLOWER (ch))
2392 || (is_print && ISPRINT (ch)))
7ae68633 2393 SET_LIST_BIT (translated);
fa9a63c5 2394 if ( (is_punct && ISPUNCT (ch))
25fe55af
RS
2395 || (is_space && ISSPACE (ch))
2396 || (is_upper && ISUPPER (ch))
2397 || (is_xdigit && ISXDIGIT (ch)))
7ae68633 2398 SET_LIST_BIT (translated);
f71b19b6
DL
2399 if ( (is_ascii && IS_REAL_ASCII (ch))
2400 || (is_nonascii && !IS_REAL_ASCII (ch))
2401 || (is_unibyte && ISUNIBYTE (ch))
2402 || (is_multibyte && !ISUNIBYTE (ch)))
2403 SET_LIST_BIT (translated);
2404
96cc36cc
RS
2405 if ( (is_word && ISWORD (ch)))
2406 SET_LIST_BIT (translated);
25fe55af 2407 }
b18215fc
RS
2408
2409 /* Repeat the loop. */
2410 continue;
25fe55af
RS
2411 }
2412 else
2413 {
ed0767d8
SM
2414 /* Go back to right after the "[:". */
2415 p = class_beg;
25fe55af 2416 SET_LIST_BIT ('[');
b18215fc
RS
2417
2418 /* Because the `:' may starts the range, we
2419 can't simply set bit and repeat the loop.
25fe55af 2420 Instead, just set it to C and handle below. */
b18215fc 2421 c = ':';
25fe55af
RS
2422 }
2423 }
b18215fc
RS
2424
2425 if (p < pend && p[0] == '-' && p[1] != ']')
2426 {
2427
2428 /* Discard the `-'. */
2429 PATFETCH (c1);
2430
2431 /* Fetch the character which ends the range. */
2432 PATFETCH (c1);
2433 if (bufp->multibyte && BASE_LEADING_CODE_P (c1))
e318085a 2434 {
b18215fc
RS
2435 PATUNFETCH;
2436 c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2437 p += len;
e318085a 2438 }
b18215fc 2439
e934739e
RS
2440 if (SINGLE_BYTE_CHAR_P (c)
2441 && ! SINGLE_BYTE_CHAR_P (c1))
2442 {
2443 /* Handle a range such as \177-\377 in multibyte mode.
2444 Split that into two ranges,,
2445 the low one ending at 0237, and the high one
2446 starting at ...040. */
505bde11
SM
2447 /* Unless I'm missing something,
2448 this line is useless. -sm
2449 int c1_base = (c1 & ~0177) | 040; */
e934739e
RS
2450 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2451 c1 = 0237;
2452 }
2453 else if (!SAME_CHARSET_P (c, c1))
b18215fc 2454 FREE_STACK_RETURN (REG_ERANGE);
e318085a 2455 }
25fe55af 2456 else
b18215fc
RS
2457 /* Range from C to C. */
2458 c1 = c;
2459
2460 /* Set the range ... */
2461 if (SINGLE_BYTE_CHAR_P (c))
2462 /* ... into bitmap. */
25fe55af 2463 {
b18215fc
RS
2464 unsigned this_char;
2465 int range_start = c, range_end = c1;
2466
2467 /* If the start is after the end, the range is empty. */
2468 if (range_start > range_end)
2469 {
2470 if (syntax & RE_NO_EMPTY_RANGES)
2471 FREE_STACK_RETURN (REG_ERANGE);
2472 /* Else, repeat the loop. */
2473 }
2474 else
2475 {
2476 for (this_char = range_start; this_char <= range_end;
2477 this_char++)
2478 SET_LIST_BIT (TRANSLATE (this_char));
e934739e 2479 }
25fe55af 2480 }
e318085a 2481 else
b18215fc
RS
2482 /* ... into range table. */
2483 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
2484 }
2485
25fe55af
RS
2486 /* Discard any (non)matching list bytes that are all 0 at the
2487 end of the map. Decrease the map-length byte too. */
2488 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2489 b[-1]--;
2490 b += b[-1];
fa9a63c5 2491
96cc36cc
RS
2492 /* Build real range table from work area. */
2493 if (RANGE_TABLE_WORK_USED (range_table_work)
2494 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
2495 {
2496 int i;
2497 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2498
b18215fc 2499 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
2500 bytes for flags, two for COUNT, and three bytes for
2501 each character. */
2502 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 2503
b18215fc
RS
2504 /* Indicate the existence of range table. */
2505 laststart[1] |= 0x80;
fa9a63c5 2506
96cc36cc
RS
2507 /* Store the character class flag bits into the range table.
2508 If not in emacs, these flag bits are always 0. */
2509 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2510 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2511
b18215fc
RS
2512 STORE_NUMBER_AND_INCR (b, used / 2);
2513 for (i = 0; i < used; i++)
2514 STORE_CHARACTER_AND_INCR
2515 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2516 }
25fe55af
RS
2517 }
2518 break;
fa9a63c5
RM
2519
2520
b18215fc 2521 case '(':
25fe55af
RS
2522 if (syntax & RE_NO_BK_PARENS)
2523 goto handle_open;
2524 else
2525 goto normal_char;
fa9a63c5
RM
2526
2527
25fe55af
RS
2528 case ')':
2529 if (syntax & RE_NO_BK_PARENS)
2530 goto handle_close;
2531 else
2532 goto normal_char;
e318085a
RS
2533
2534
25fe55af
RS
2535 case '\n':
2536 if (syntax & RE_NEWLINE_ALT)
2537 goto handle_alt;
2538 else
2539 goto normal_char;
e318085a
RS
2540
2541
b18215fc 2542 case '|':
25fe55af
RS
2543 if (syntax & RE_NO_BK_VBAR)
2544 goto handle_alt;
2545 else
2546 goto normal_char;
2547
2548
2549 case '{':
2550 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2551 goto handle_interval;
2552 else
2553 goto normal_char;
2554
2555
2556 case '\\':
2557 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2558
2559 /* Do not translate the character after the \, so that we can
2560 distinguish, e.g., \B from \b, even if we normally would
2561 translate, e.g., B to b. */
2562 PATFETCH_RAW (c);
2563
2564 switch (c)
2565 {
2566 case '(':
2567 if (syntax & RE_NO_BK_PARENS)
2568 goto normal_backslash;
2569
2570 handle_open:
505bde11
SM
2571 {
2572 int shy = 0;
2573 if (p+1 < pend)
2574 {
2575 /* Look for a special (?...) construct */
ed0767d8 2576 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 2577 {
ed0767d8 2578 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
2579 PATFETCH (c);
2580 switch (c)
2581 {
2582 case ':': shy = 1; break;
2583 default:
2584 /* Only (?:...) is supported right now. */
2585 FREE_STACK_RETURN (REG_BADPAT);
2586 }
2587 }
505bde11
SM
2588 }
2589
2590 if (!shy)
2591 {
2592 bufp->re_nsub++;
2593 regnum++;
2594 }
25fe55af 2595
99633e97
SM
2596 if (COMPILE_STACK_FULL)
2597 {
2598 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2599 compile_stack_elt_t);
2600 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 2601
99633e97
SM
2602 compile_stack.size <<= 1;
2603 }
25fe55af 2604
99633e97
SM
2605 /* These are the values to restore when we hit end of this
2606 group. They are all relative offsets, so that if the
2607 whole pattern moves because of realloc, they will still
2608 be valid. */
2609 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2610 COMPILE_STACK_TOP.fixup_alt_jump
2611 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2612 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2613 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
2614
2615 /* Do not push a
2616 start_memory for groups beyond the last one we can
2617 represent in the compiled pattern. */
2618 if (regnum <= MAX_REGNUM && !shy)
2619 BUF_PUSH_2 (start_memory, regnum);
2620
2621 compile_stack.avail++;
2622
2623 fixup_alt_jump = 0;
2624 laststart = 0;
2625 begalt = b;
2626 /* If we've reached MAX_REGNUM groups, then this open
2627 won't actually generate any code, so we'll have to
2628 clear pending_exact explicitly. */
2629 pending_exact = 0;
2630 break;
505bde11 2631 }
25fe55af
RS
2632
2633 case ')':
2634 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2635
2636 if (COMPILE_STACK_EMPTY)
505bde11
SM
2637 {
2638 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2639 goto normal_backslash;
2640 else
2641 FREE_STACK_RETURN (REG_ERPAREN);
2642 }
25fe55af
RS
2643
2644 handle_close:
505bde11 2645 FIXUP_ALT_JUMP ();
25fe55af
RS
2646
2647 /* See similar code for backslashed left paren above. */
2648 if (COMPILE_STACK_EMPTY)
505bde11
SM
2649 {
2650 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2651 goto normal_char;
2652 else
2653 FREE_STACK_RETURN (REG_ERPAREN);
2654 }
25fe55af
RS
2655
2656 /* Since we just checked for an empty stack above, this
2657 ``can't happen''. */
2658 assert (compile_stack.avail != 0);
2659 {
2660 /* We don't just want to restore into `regnum', because
2661 later groups should continue to be numbered higher,
2662 as in `(ab)c(de)' -- the second group is #2. */
2663 regnum_t this_group_regnum;
2664
2665 compile_stack.avail--;
2666 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2667 fixup_alt_jump
2668 = COMPILE_STACK_TOP.fixup_alt_jump
2669 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2670 : 0;
2671 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2672 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
2673 /* If we've reached MAX_REGNUM groups, then this open
2674 won't actually generate any code, so we'll have to
2675 clear pending_exact explicitly. */
2676 pending_exact = 0;
e318085a 2677
25fe55af
RS
2678 /* We're at the end of the group, so now we know how many
2679 groups were inside this one. */
505bde11
SM
2680 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
2681 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
2682 }
2683 break;
2684
2685
2686 case '|': /* `\|'. */
2687 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2688 goto normal_backslash;
2689 handle_alt:
2690 if (syntax & RE_LIMITED_OPS)
2691 goto normal_char;
2692
2693 /* Insert before the previous alternative a jump which
2694 jumps to this alternative if the former fails. */
2695 GET_BUFFER_SPACE (3);
2696 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2697 pending_exact = 0;
2698 b += 3;
2699
2700 /* The alternative before this one has a jump after it
2701 which gets executed if it gets matched. Adjust that
2702 jump so it will jump to this alternative's analogous
2703 jump (put in below, which in turn will jump to the next
2704 (if any) alternative's such jump, etc.). The last such
2705 jump jumps to the correct final destination. A picture:
2706 _____ _____
2707 | | | |
2708 | v | v
2709 a | b | c
2710
2711 If we are at `b', then fixup_alt_jump right now points to a
2712 three-byte space after `a'. We'll put in the jump, set
2713 fixup_alt_jump to right after `b', and leave behind three
2714 bytes which we'll fill in when we get to after `c'. */
2715
505bde11 2716 FIXUP_ALT_JUMP ();
25fe55af
RS
2717
2718 /* Mark and leave space for a jump after this alternative,
2719 to be filled in later either by next alternative or
2720 when know we're at the end of a series of alternatives. */
2721 fixup_alt_jump = b;
2722 GET_BUFFER_SPACE (3);
2723 b += 3;
2724
2725 laststart = 0;
2726 begalt = b;
2727 break;
2728
2729
2730 case '{':
2731 /* If \{ is a literal. */
2732 if (!(syntax & RE_INTERVALS)
2733 /* If we're at `\{' and it's not the open-interval
2734 operator. */
ed0767d8
SM
2735 || (syntax & RE_NO_BK_BRACES)
2736 /* What is that? -sm */
2737 /* || (p - 2 == pattern && p == pend) */)
25fe55af
RS
2738 goto normal_backslash;
2739
2740 handle_interval:
2741 {
2742 /* If got here, then the syntax allows intervals. */
2743
2744 /* At least (most) this many matches must be made. */
99633e97 2745 int lower_bound = 0, upper_bound = -1;
25fe55af 2746
ed0767d8 2747 beg_interval = p;
25fe55af
RS
2748
2749 if (p == pend)
2750 {
2751 if (syntax & RE_NO_BK_BRACES)
2752 goto unfetch_interval;
2753 else
2754 FREE_STACK_RETURN (REG_EBRACE);
2755 }
2756
2757 GET_UNSIGNED_NUMBER (lower_bound);
2758
2759 if (c == ',')
ed0767d8 2760 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
2761 else
2762 /* Interval such as `{1}' => match exactly once. */
2763 upper_bound = lower_bound;
2764
2765 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 2766 || (upper_bound >= 0 && lower_bound > upper_bound))
25fe55af
RS
2767 {
2768 if (syntax & RE_NO_BK_BRACES)
2769 goto unfetch_interval;
2770 else
2771 FREE_STACK_RETURN (REG_BADBR);
2772 }
2773
2774 if (!(syntax & RE_NO_BK_BRACES))
2775 {
2776 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2777
2778 PATFETCH (c);
2779 }
2780
2781 if (c != '}')
2782 {
2783 if (syntax & RE_NO_BK_BRACES)
2784 goto unfetch_interval;
2785 else
2786 FREE_STACK_RETURN (REG_BADBR);
2787 }
2788
2789 /* We just parsed a valid interval. */
2790
2791 /* If it's invalid to have no preceding re. */
2792 if (!laststart)
2793 {
2794 if (syntax & RE_CONTEXT_INVALID_OPS)
2795 FREE_STACK_RETURN (REG_BADRPT);
2796 else if (syntax & RE_CONTEXT_INDEP_OPS)
2797 laststart = b;
2798 else
2799 goto unfetch_interval;
2800 }
2801
25fe55af 2802 if (upper_bound == 0)
ed0767d8
SM
2803 /* If the upper bound is zero, just drop the sub pattern
2804 altogether. */
2805 b = laststart;
2806 else if (lower_bound == 1 && upper_bound == 1)
2807 /* Just match it once: nothing to do here. */
2808 ;
25fe55af
RS
2809
2810 /* Otherwise, we have a nontrivial interval. When
2811 we're all done, the pattern will look like:
2812 set_number_at <jump count> <upper bound>
2813 set_number_at <succeed_n count> <lower bound>
2814 succeed_n <after jump addr> <succeed_n count>
2815 <body of loop>
2816 jump_n <succeed_n addr> <jump count>
2817 (The upper bound and `jump_n' are omitted if
2818 `upper_bound' is 1, though.) */
2819 else
2820 { /* If the upper bound is > 1, we need to insert
2821 more at the end of the loop. */
ed0767d8
SM
2822 unsigned int nbytes = (upper_bound < 0 ? 3
2823 : upper_bound > 1 ? 5 : 0);
2824 unsigned int startoffset = 0;
2825
2826 GET_BUFFER_SPACE (20); /* We might use less. */
2827
2828 if (lower_bound == 0)
2829 {
2830 /* A succeed_n that starts with 0 is really a
2831 a simple on_failure_jump_loop. */
2832 INSERT_JUMP (on_failure_jump_loop, laststart,
2833 b + 3 + nbytes);
2834 b += 3;
2835 }
2836 else
2837 {
2838 /* Initialize lower bound of the `succeed_n', even
2839 though it will be set during matching by its
2840 attendant `set_number_at' (inserted next),
2841 because `re_compile_fastmap' needs to know.
2842 Jump to the `jump_n' we might insert below. */
2843 INSERT_JUMP2 (succeed_n, laststart,
2844 b + 5 + nbytes,
2845 lower_bound);
2846 b += 5;
2847
2848 /* Code to initialize the lower bound. Insert
2849 before the `succeed_n'. The `5' is the last two
2850 bytes of this `set_number_at', plus 3 bytes of
2851 the following `succeed_n'. */
2852 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2853 b += 5;
2854 startoffset += 5;
2855 }
2856
2857 if (upper_bound < 0)
2858 {
2859 /* A negative upper bound stands for infinity,
2860 in which case it degenerates to a plain jump. */
2861 STORE_JUMP (jump, b, laststart + startoffset);
2862 b += 3;
2863 }
2864 else if (upper_bound > 1)
25fe55af
RS
2865 { /* More than one repetition is allowed, so
2866 append a backward jump to the `succeed_n'
2867 that starts this interval.
2868
2869 When we've reached this during matching,
2870 we'll have matched the interval once, so
2871 jump back only `upper_bound - 1' times. */
ed0767d8 2872 STORE_JUMP2 (jump_n, b, laststart + startoffset,
25fe55af
RS
2873 upper_bound - 1);
2874 b += 5;
2875
2876 /* The location we want to set is the second
2877 parameter of the `jump_n'; that is `b-2' as
2878 an absolute address. `laststart' will be
2879 the `set_number_at' we're about to insert;
2880 `laststart+3' the number to set, the source
2881 for the relative address. But we are
2882 inserting into the middle of the pattern --
2883 so everything is getting moved up by 5.
2884 Conclusion: (b - 2) - (laststart + 3) + 5,
2885 i.e., b - laststart.
2886
2887 We insert this at the beginning of the loop
2888 so that if we fail during matching, we'll
2889 reinitialize the bounds. */
2890 insert_op2 (set_number_at, laststart, b - laststart,
2891 upper_bound - 1, b);
2892 b += 5;
2893 }
2894 }
2895 pending_exact = 0;
2896 beg_interval = NULL;
2897 }
2898 break;
2899
2900 unfetch_interval:
2901 /* If an invalid interval, match the characters as literals. */
2902 assert (beg_interval);
2903 p = beg_interval;
2904 beg_interval = NULL;
2905
2906 /* normal_char and normal_backslash need `c'. */
ed0767d8 2907 c = '{';
25fe55af
RS
2908
2909 if (!(syntax & RE_NO_BK_BRACES))
2910 {
ed0767d8
SM
2911 assert (p > pattern && p[-1] == '\\');
2912 goto normal_backslash;
25fe55af 2913 }
ed0767d8
SM
2914 else
2915 goto normal_char;
e318085a 2916
b18215fc 2917#ifdef emacs
25fe55af
RS
2918 /* There is no way to specify the before_dot and after_dot
2919 operators. rms says this is ok. --karl */
2920 case '=':
2921 BUF_PUSH (at_dot);
2922 break;
2923
2924 case 's':
2925 laststart = b;
2926 PATFETCH (c);
2927 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2928 break;
2929
2930 case 'S':
2931 laststart = b;
2932 PATFETCH (c);
2933 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2934 break;
b18215fc
RS
2935
2936 case 'c':
2937 laststart = b;
2938 PATFETCH_RAW (c);
2939 BUF_PUSH_2 (categoryspec, c);
2940 break;
e318085a 2941
b18215fc
RS
2942 case 'C':
2943 laststart = b;
2944 PATFETCH_RAW (c);
2945 BUF_PUSH_2 (notcategoryspec, c);
2946 break;
2947#endif /* emacs */
e318085a 2948
e318085a 2949
25fe55af
RS
2950 case 'w':
2951 laststart = b;
1fb352e0 2952 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 2953 break;
e318085a 2954
e318085a 2955
25fe55af
RS
2956 case 'W':
2957 laststart = b;
1fb352e0 2958 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 2959 break;
e318085a
RS
2960
2961
25fe55af
RS
2962 case '<':
2963 BUF_PUSH (wordbeg);
2964 break;
e318085a 2965
25fe55af
RS
2966 case '>':
2967 BUF_PUSH (wordend);
2968 break;
e318085a 2969
25fe55af
RS
2970 case 'b':
2971 BUF_PUSH (wordbound);
2972 break;
e318085a 2973
25fe55af
RS
2974 case 'B':
2975 BUF_PUSH (notwordbound);
2976 break;
fa9a63c5 2977
25fe55af
RS
2978 case '`':
2979 BUF_PUSH (begbuf);
2980 break;
e318085a 2981
25fe55af
RS
2982 case '\'':
2983 BUF_PUSH (endbuf);
2984 break;
e318085a 2985
25fe55af
RS
2986 case '1': case '2': case '3': case '4': case '5':
2987 case '6': case '7': case '8': case '9':
2988 if (syntax & RE_NO_BK_REFS)
2989 goto normal_char;
e318085a 2990
25fe55af 2991 c1 = c - '0';
e318085a 2992
25fe55af
RS
2993 if (c1 > regnum)
2994 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 2995
25fe55af
RS
2996 /* Can't back reference to a subexpression if inside of it. */
2997 if (group_in_compile_stack (compile_stack, c1))
2998 goto normal_char;
e318085a 2999
25fe55af
RS
3000 laststart = b;
3001 BUF_PUSH_2 (duplicate, c1);
3002 break;
e318085a 3003
e318085a 3004
25fe55af
RS
3005 case '+':
3006 case '?':
3007 if (syntax & RE_BK_PLUS_QM)
3008 goto handle_plus;
3009 else
3010 goto normal_backslash;
3011
3012 default:
3013 normal_backslash:
3014 /* You might think it would be useful for \ to mean
3015 not to translate; but if we don't translate it
3016 it will never match anything. */
3017 c = TRANSLATE (c);
3018 goto normal_char;
3019 }
3020 break;
fa9a63c5
RM
3021
3022
3023 default:
25fe55af 3024 /* Expects the character in `c'. */
fa9a63c5 3025 normal_char:
b18215fc
RS
3026 p1 = p - 1; /* P1 points the head of C. */
3027#ifdef emacs
3028 if (bufp->multibyte)
3583e969
KH
3029 {
3030 c = STRING_CHAR (p1, pend - p1);
3031 c = TRANSLATE (c);
3032 /* Set P to the next character boundary. */
3033 p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
3034 }
b18215fc 3035#endif
fa9a63c5 3036 /* If no exactn currently being built. */
25fe55af 3037 if (!pending_exact
fa9a63c5 3038
25fe55af
RS
3039 /* If last exactn not at current position. */
3040 || pending_exact + *pending_exact + 1 != b
5e69f11e 3041
25fe55af 3042 /* We have only one byte following the exactn for the count. */
b18215fc 3043 || *pending_exact >= (1 << BYTEWIDTH) - (p - p1)
fa9a63c5 3044
25fe55af 3045 /* If followed by a repetition operator. */
9d99031f 3046 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3047 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3048 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3049 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3050 || ((syntax & RE_INTERVALS)
25fe55af 3051 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3052 ? p != pend && *p == '{'
3053 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3054 {
3055 /* Start building a new exactn. */
5e69f11e 3056
25fe55af 3057 laststart = b;
fa9a63c5
RM
3058
3059 BUF_PUSH_2 (exactn, 0);
3060 pending_exact = b - 1;
25fe55af 3061 }
5e69f11e 3062
3583e969
KH
3063#ifdef emacs
3064 if (! SINGLE_BYTE_CHAR_P (c))
3065 {
1c8c6d39
DL
3066 unsigned char str[MAX_MULTIBYTE_LENGTH];
3067 int i = CHAR_STRING (c, str);
3583e969
KH
3068 int j;
3069 for (j = 0; j < i; j++)
3070 {
3071 BUF_PUSH (str[j]);
3072 (*pending_exact)++;
3073 }
3074 }
3075 else
3076#endif
b18215fc 3077 {
e934739e
RS
3078 BUF_PUSH (c);
3079 (*pending_exact)++;
b18215fc 3080 }
fa9a63c5 3081 break;
25fe55af 3082 } /* switch (c) */
fa9a63c5
RM
3083 } /* while p != pend */
3084
5e69f11e 3085
fa9a63c5 3086 /* Through the pattern now. */
5e69f11e 3087
505bde11 3088 FIXUP_ALT_JUMP ();
fa9a63c5 3089
5e69f11e 3090 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3091 FREE_STACK_RETURN (REG_EPAREN);
3092
3093 /* If we don't want backtracking, force success
3094 the first time we reach the end of the compiled pattern. */
3095 if (syntax & RE_NO_POSIX_BACKTRACKING)
3096 BUF_PUSH (succeed);
3097
3098 free (compile_stack.stack);
3099
3100 /* We have succeeded; set the length of the buffer. */
3101 bufp->used = b - bufp->buffer;
3102
3103#ifdef DEBUG
99633e97 3104 if (debug > 0)
fa9a63c5 3105 {
505bde11 3106 re_compile_fastmap (bufp);
fa9a63c5
RM
3107 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3108 print_compiled_pattern (bufp);
3109 }
99633e97 3110 debug--;
fa9a63c5
RM
3111#endif /* DEBUG */
3112
3113#ifndef MATCH_MAY_ALLOCATE
3114 /* Initialize the failure stack to the largest possible stack. This
3115 isn't necessary unless we're trying to avoid calling alloca in
3116 the search and match routines. */
3117 {
3118 int num_regs = bufp->re_nsub + 1;
3119
320a2a73 3120 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3121 {
a26f4ccd 3122 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3123
fa9a63c5
RM
3124 if (! fail_stack.stack)
3125 fail_stack.stack
5e69f11e 3126 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3127 * sizeof (fail_stack_elt_t));
3128 else
3129 fail_stack.stack
3130 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3131 (fail_stack.size
3132 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3133 }
3134
3135 regex_grow_registers (num_regs);
3136 }
3137#endif /* not MATCH_MAY_ALLOCATE */
3138
3139 return REG_NOERROR;
3140} /* regex_compile */
3141\f
3142/* Subroutines for `regex_compile'. */
3143
25fe55af 3144/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3145
3146static void
3147store_op1 (op, loc, arg)
3148 re_opcode_t op;
3149 unsigned char *loc;
3150 int arg;
3151{
3152 *loc = (unsigned char) op;
3153 STORE_NUMBER (loc + 1, arg);
3154}
3155
3156
3157/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3158
3159static void
3160store_op2 (op, loc, arg1, arg2)
3161 re_opcode_t op;
3162 unsigned char *loc;
3163 int arg1, arg2;
3164{
3165 *loc = (unsigned char) op;
3166 STORE_NUMBER (loc + 1, arg1);
3167 STORE_NUMBER (loc + 3, arg2);
3168}
3169
3170
3171/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3172 for OP followed by two-byte integer parameter ARG. */
3173
3174static void
3175insert_op1 (op, loc, arg, end)
3176 re_opcode_t op;
3177 unsigned char *loc;
3178 int arg;
5e69f11e 3179 unsigned char *end;
fa9a63c5
RM
3180{
3181 register unsigned char *pfrom = end;
3182 register unsigned char *pto = end + 3;
3183
3184 while (pfrom != loc)
3185 *--pto = *--pfrom;
5e69f11e 3186
fa9a63c5
RM
3187 store_op1 (op, loc, arg);
3188}
3189
3190
3191/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3192
3193static void
3194insert_op2 (op, loc, arg1, arg2, end)
3195 re_opcode_t op;
3196 unsigned char *loc;
3197 int arg1, arg2;
5e69f11e 3198 unsigned char *end;
fa9a63c5
RM
3199{
3200 register unsigned char *pfrom = end;
3201 register unsigned char *pto = end + 5;
3202
3203 while (pfrom != loc)
3204 *--pto = *--pfrom;
5e69f11e 3205
fa9a63c5
RM
3206 store_op2 (op, loc, arg1, arg2);
3207}
3208
3209
3210/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3211 after an alternative or a begin-subexpression. We assume there is at
3212 least one character before the ^. */
3213
3214static boolean
3215at_begline_loc_p (pattern, p, syntax)
99633e97 3216 const unsigned char *pattern, *p;
fa9a63c5
RM
3217 reg_syntax_t syntax;
3218{
4e8a9132 3219 const unsigned char *prev = p - 2;
fa9a63c5 3220 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3221
fa9a63c5
RM
3222 return
3223 /* After a subexpression? */
3224 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3225 /* After an alternative? */
fa9a63c5
RM
3226 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3227}
3228
3229
3230/* The dual of at_begline_loc_p. This one is for $. We assume there is
3231 at least one character after the $, i.e., `P < PEND'. */
3232
3233static boolean
3234at_endline_loc_p (p, pend, syntax)
99633e97
SM
3235 const unsigned char *p, *pend;
3236 reg_syntax_t syntax;
fa9a63c5 3237{
4e8a9132 3238 const unsigned char *next = p;
fa9a63c5 3239 boolean next_backslash = *next == '\\';
4e8a9132 3240 const unsigned char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3241
fa9a63c5
RM
3242 return
3243 /* Before a subexpression? */
3244 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3245 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3246 /* Before an alternative? */
3247 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3248 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3249}
3250
3251
5e69f11e 3252/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3253 false if it's not. */
3254
3255static boolean
3256group_in_compile_stack (compile_stack, regnum)
3257 compile_stack_type compile_stack;
3258 regnum_t regnum;
3259{
3260 int this_element;
3261
5e69f11e
RM
3262 for (this_element = compile_stack.avail - 1;
3263 this_element >= 0;
fa9a63c5
RM
3264 this_element--)
3265 if (compile_stack.stack[this_element].regnum == regnum)
3266 return true;
3267
3268 return false;
3269}
fa9a63c5
RM
3270\f
3271/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3272 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3273 characters can start a string that matches the pattern. This fastmap
3274 is used by re_search to skip quickly over impossible starting points.
3275
96cc36cc
RS
3276 Character codes above (1 << BYTEWIDTH) are not represented in the
3277 fastmap, but the leading codes are represented. Thus, the fastmap
3278 indicates which character sets could start a match.
3279
fa9a63c5
RM
3280 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3281 area as BUFP->fastmap.
5e69f11e 3282
fa9a63c5
RM
3283 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3284 the pattern buffer.
3285
3286 Returns 0 if we succeed, -2 if an internal error. */
3287
3288int
3289re_compile_fastmap (bufp)
3290 struct re_pattern_buffer *bufp;
3291{
505bde11 3292 int j, k;
1fb352e0 3293 boolean not;
fa9a63c5
RM
3294#ifdef MATCH_MAY_ALLOCATE
3295 fail_stack_type fail_stack;
3296#endif
3297#ifndef REGEX_MALLOC
3298 char *destination;
3299#endif
5e69f11e 3300
fa9a63c5
RM
3301 register char *fastmap = bufp->fastmap;
3302 unsigned char *pattern = bufp->buffer;
3303 unsigned long size = bufp->used;
3304 unsigned char *p = pattern;
3305 register unsigned char *pend = pattern + size;
1fb352e0 3306 const boolean multibyte = bufp->multibyte;
fa9a63c5 3307
99633e97 3308#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
fa9a63c5
RM
3309 /* This holds the pointer to the failure stack, when
3310 it is allocated relocatably. */
3311 fail_stack_elt_t *failure_stack_ptr;
99633e97 3312#endif
fa9a63c5
RM
3313
3314 /* Assume that each path through the pattern can be null until
25fe55af 3315 proven otherwise. We set this false at the bottom of switch
fa9a63c5
RM
3316 statement, to which we get only if a particular path doesn't
3317 match the empty string. */
3318 boolean path_can_be_null = true;
3319
b18215fc 3320 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3321 flag is set true. */
b18215fc
RS
3322 boolean match_any_multibyte_characters = false;
3323
fa9a63c5 3324 assert (fastmap != NULL && p != NULL);
5e69f11e 3325
fa9a63c5 3326 INIT_FAIL_STACK ();
25fe55af 3327 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
fa9a63c5
RM
3328 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3329 bufp->can_be_null = 0;
5e69f11e 3330
505bde11
SM
3331 /* The loop below works as follows:
3332 - It has a working-list kept in the PATTERN_STACK and which basically
3333 starts by only containing a pointer to the first operation.
3334 - If the opcode we're looking at is a match against some set of
3335 chars, then we add those chars to the fastmap and go on to the
3336 next work element from the worklist (done via `break').
3337 - If the opcode is a control operator on the other hand, we either
3338 ignore it (if it's meaningless at this point, such as `start_memory')
3339 or execute it (if it's a jump). If the jump has several destinations
3340 (i.e. `on_failure_jump'), then we push the other destination onto the
3341 worklist.
3342 We guarantee termination by ignoring backward jumps (more or less),
3343 so that `p' is monotonically increasing. More to the point, we
3344 never set `p' (or push) anything `<= p1'. */
3345
3346 /* If can_be_null is set, then the fastmap will not be used anyway. */
3347 while (!bufp->can_be_null)
fa9a63c5 3348 {
505bde11
SM
3349 /* `p1' is used as a marker of how far back a `on_failure_jump'
3350 can go without being ignored. It is normally equal to `p'
3351 (which prevents any backward `on_failure_jump') except right
3352 after a plain `jump', to allow patterns such as:
3353 0: jump 10
3354 3..9: <body>
3355 10: on_failure_jump 3
3356 as used for the *? operator. */
3357 unsigned char *p1 = p;
3358
ed0767d8 3359 if (p >= pend || *p == succeed)
fa9a63c5
RM
3360 {
3361 /* We have reached the (effective) end of pattern. */
505bde11 3362 if (!PATTERN_STACK_EMPTY ())
fa9a63c5
RM
3363 {
3364 bufp->can_be_null |= path_can_be_null;
3365
3366 /* Reset for next path. */
3367 path_can_be_null = true;
3368
66f0296e 3369 p = (unsigned char*) POP_PATTERN_OP ();
fa9a63c5
RM
3370
3371 continue;
3372 }
3373 else
3374 break;
3375 }
3376
25fe55af 3377 /* We should never be about to go beyond the end of the pattern. */
fa9a63c5 3378 assert (p < pend);
5e69f11e 3379
fa9a63c5
RM
3380 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3381 {
3382
fa9a63c5 3383 case duplicate:
505bde11
SM
3384 /* If the first character has to match a backreference, that means
3385 that the group was empty (since it already matched). Since this
3386 is the only case that interests us here, we can assume that the
3387 backreference must match the empty string. */
3388 p++;
3389 continue;
fa9a63c5
RM
3390
3391
3392 /* Following are the cases which match a character. These end
25fe55af 3393 with `break'. */
fa9a63c5
RM
3394
3395 case exactn:
25fe55af 3396 fastmap[p[1]] = 1;
fa9a63c5
RM
3397 break;
3398
3399
1fb352e0
SM
3400 case anychar:
3401 /* We could put all the chars except for \n (and maybe \0)
3402 but we don't bother since it is generally not worth it. */
3403 bufp->can_be_null = 1;
3404 continue;
fa9a63c5
RM
3405
3406
b18215fc 3407 case charset_not:
ba5c004d
RS
3408 /* Chars beyond end of bitmap are possible matches.
3409 All the single-byte codes can occur in multibyte buffers.
3410 So any that are not listed in the charset
3411 are possible matches, even in multibyte buffers. */
1fb352e0 3412 if (!fastmap) break;
b18215fc 3413 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3414 j < (1 << BYTEWIDTH); j++)
b18215fc 3415 fastmap[j] = 1;
1fb352e0
SM
3416 /* Fallthrough */
3417 case charset:
3418 if (!fastmap) break;
3419 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3420 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3421 j >= 0; j--)
1fb352e0 3422 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
b18215fc
RS
3423 fastmap[j] = 1;
3424
1fb352e0
SM
3425 if ((not && multibyte)
3426 /* Any character set can possibly contain a character
3427 which doesn't match the specified set of characters. */
3428 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3429 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3430 /* If we can match a character class, we can match
3431 any character set. */
b18215fc
RS
3432 {
3433 set_fastmap_for_multibyte_characters:
3434 if (match_any_multibyte_characters == false)
3435 {
3436 for (j = 0x80; j < 0xA0; j++) /* XXX */
3437 if (BASE_LEADING_CODE_P (j))
3438 fastmap[j] = 1;
3439 match_any_multibyte_characters = true;
3440 }
3441 }
b18215fc 3442
1fb352e0
SM
3443 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3444 && match_any_multibyte_characters == false)
3445 {
3446 /* Set fastmap[I] 1 where I is a base leading code of each
3447 multibyte character in the range table. */
3448 int c, count;
b18215fc 3449
1fb352e0
SM
3450 /* Make P points the range table. `+ 2' is to skip flag
3451 bits for a character class. */
3452 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3453
1fb352e0
SM
3454 /* Extract the number of ranges in range table into COUNT. */
3455 EXTRACT_NUMBER_AND_INCR (count, p);
3456 for (; count > 0; count--, p += 2 * 3) /* XXX */
3457 {
3458 /* Extract the start of each range. */
3459 EXTRACT_CHARACTER (c, p);
3460 j = CHAR_CHARSET (c);
3461 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3462 }
3463 }
b18215fc
RS
3464 break;
3465
1fb352e0
SM
3466 case syntaxspec:
3467 case notsyntaxspec:
3468 if (!fastmap) break;
3469#ifndef emacs
3470 not = (re_opcode_t)p[-1] == notsyntaxspec;
3471 k = *p++;
3472 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3473 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3474 fastmap[j] = 1;
b18215fc 3475 break;
1fb352e0 3476#else /* emacs */
b18215fc
RS
3477 /* This match depends on text properties. These end with
3478 aborting optimizations. */
3479 bufp->can_be_null = 1;
505bde11 3480 continue;
b18215fc
RS
3481
3482 case categoryspec:
b18215fc 3483 case notcategoryspec:
1fb352e0
SM
3484 if (!fastmap) break;
3485 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 3486 k = *p++;
1fb352e0
SM
3487 for (j = 0; j < (1 << BYTEWIDTH); j++)
3488 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
3489 fastmap[j] = 1;
3490
1fb352e0 3491 if (multibyte)
b18215fc 3492 /* Any character set can possibly contain a character
1fb352e0 3493 whose category is K (or not). */
b18215fc
RS
3494 goto set_fastmap_for_multibyte_characters;
3495 break;
3496
fa9a63c5 3497 /* All cases after this match the empty string. These end with
25fe55af 3498 `continue'. */
fa9a63c5 3499
fa9a63c5
RM
3500 case before_dot:
3501 case at_dot:
3502 case after_dot:
1fb352e0 3503#endif /* !emacs */
25fe55af
RS
3504 case no_op:
3505 case begline:
3506 case endline:
fa9a63c5
RM
3507 case begbuf:
3508 case endbuf:
3509 case wordbound:
3510 case notwordbound:
3511 case wordbeg:
3512 case wordend:
25fe55af 3513 continue;
fa9a63c5
RM
3514
3515
fa9a63c5 3516 case jump:
25fe55af 3517 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
3518 if (j < 0)
3519 /* Backward jumps can only go back to code that we've already
3520 visited. `re_compile' should make sure this is true. */
3521 break;
25fe55af 3522 p += j;
505bde11
SM
3523 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
3524 {
3525 case on_failure_jump:
3526 case on_failure_keep_string_jump:
505bde11 3527 case on_failure_jump_loop:
0683b6fa 3528 case on_failure_jump_nastyloop:
505bde11
SM
3529 case on_failure_jump_smart:
3530 p++;
3531 break;
3532 default:
3533 continue;
3534 };
3535 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3536 to jump back to "just after here". */
3537 /* Fallthrough */
fa9a63c5 3538
25fe55af
RS
3539 case on_failure_jump:
3540 case on_failure_keep_string_jump:
0683b6fa 3541 case on_failure_jump_nastyloop:
505bde11
SM
3542 case on_failure_jump_loop:
3543 case on_failure_jump_smart:
25fe55af 3544 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 3545 if (p + j <= p1)
ed0767d8
SM
3546 ; /* Backward jump to be ignored. */
3547 else if (!PUSH_PATTERN_OP (p + j, fail_stack))
3548 return (RESET_FAIL_STACK (), -2);
25fe55af 3549 continue;
fa9a63c5
RM
3550
3551
ed0767d8
SM
3552 case jump_n:
3553 /* This code simply does not properly handle forward jump_n. */
3554 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
3555 p += 4;
3556 /* jump_n can either jump or fall through. The (backward) jump
3557 case has already been handled, so we only need to look at the
3558 fallthrough case. */
3559 continue;
3560
fa9a63c5 3561 case succeed_n:
ed0767d8
SM
3562 /* If N == 0, it should be an on_failure_jump_loop instead. */
3563 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
3564 p += 4;
3565 /* We only care about one iteration of the loop, so we don't
3566 need to consider the case where this behaves like an
3567 on_failure_jump. */
25fe55af 3568 continue;
fa9a63c5
RM
3569
3570
3571 case set_number_at:
25fe55af
RS
3572 p += 4;
3573 continue;
fa9a63c5
RM
3574
3575
3576 case start_memory:
25fe55af 3577 case stop_memory:
505bde11 3578 p += 1;
fa9a63c5
RM
3579 continue;
3580
3581
3582 default:
25fe55af
RS
3583 abort (); /* We have listed all the cases. */
3584 } /* switch *p++ */
fa9a63c5
RM
3585
3586 /* Getting here means we have found the possible starting
25fe55af
RS
3587 characters for one path of the pattern -- and that the empty
3588 string does not match. We need not follow this path further.
3589 Instead, look at the next alternative (remembered on the
3590 stack), or quit if no more. The test at the top of the loop
3591 does these things. */
fa9a63c5
RM
3592 path_can_be_null = false;
3593 p = pend;
3594 } /* while p */
3595
3596 /* Set `can_be_null' for the last path (also the first path, if the
25fe55af 3597 pattern is empty). */
fa9a63c5 3598 bufp->can_be_null |= path_can_be_null;
fa9a63c5
RM
3599 RESET_FAIL_STACK ();
3600 return 0;
3601} /* re_compile_fastmap */
3602\f
3603/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3604 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3605 this memory for recording register information. STARTS and ENDS
3606 must be allocated using the malloc library routine, and must each
3607 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3608
3609 If NUM_REGS == 0, then subsequent matches should allocate their own
3610 register data.
3611
3612 Unless this function is called, the first search or match using
3613 PATTERN_BUFFER will allocate its own register data, without
3614 freeing the old data. */
3615
3616void
3617re_set_registers (bufp, regs, num_regs, starts, ends)
3618 struct re_pattern_buffer *bufp;
3619 struct re_registers *regs;
3620 unsigned num_regs;
3621 regoff_t *starts, *ends;
3622{
3623 if (num_regs)
3624 {
3625 bufp->regs_allocated = REGS_REALLOCATE;
3626 regs->num_regs = num_regs;
3627 regs->start = starts;
3628 regs->end = ends;
3629 }
3630 else
3631 {
3632 bufp->regs_allocated = REGS_UNALLOCATED;
3633 regs->num_regs = 0;
3634 regs->start = regs->end = (regoff_t *) 0;
3635 }
3636}
3637\f
25fe55af 3638/* Searching routines. */
fa9a63c5
RM
3639
3640/* Like re_search_2, below, but only one string is specified, and
3641 doesn't let you say where to stop matching. */
3642
3643int
3644re_search (bufp, string, size, startpos, range, regs)
3645 struct re_pattern_buffer *bufp;
3646 const char *string;
3647 int size, startpos, range;
3648 struct re_registers *regs;
3649{
5e69f11e 3650 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
3651 regs, size);
3652}
3653
b18215fc
RS
3654/* End address of virtual concatenation of string. */
3655#define STOP_ADDR_VSTRING(P) \
3656 (((P) >= size1 ? string2 + size2 : string1 + size1))
3657
3658/* Address of POS in the concatenation of virtual string. */
3659#define POS_ADDR_VSTRING(POS) \
3660 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
3661
3662/* Using the compiled pattern in BUFP->buffer, first tries to match the
3663 virtual concatenation of STRING1 and STRING2, starting first at index
3664 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 3665
fa9a63c5 3666 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 3667
fa9a63c5
RM
3668 RANGE is how far to scan while trying to match. RANGE = 0 means try
3669 only at STARTPOS; in general, the last start tried is STARTPOS +
3670 RANGE.
5e69f11e 3671
fa9a63c5
RM
3672 In REGS, return the indices of the virtual concatenation of STRING1
3673 and STRING2 that matched the entire BUFP->buffer and its contained
3674 subexpressions.
5e69f11e 3675
fa9a63c5
RM
3676 Do not consider matching one past the index STOP in the virtual
3677 concatenation of STRING1 and STRING2.
3678
3679 We return either the position in the strings at which the match was
3680 found, -1 if no match, or -2 if error (such as failure
3681 stack overflow). */
3682
3683int
66f0296e 3684re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 3685 struct re_pattern_buffer *bufp;
66f0296e 3686 const char *str1, *str2;
fa9a63c5
RM
3687 int size1, size2;
3688 int startpos;
3689 int range;
3690 struct re_registers *regs;
3691 int stop;
3692{
3693 int val;
66f0296e
SM
3694 re_char *string1 = (re_char*) str1;
3695 re_char *string2 = (re_char*) str2;
fa9a63c5 3696 register char *fastmap = bufp->fastmap;
6676cb1c 3697 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
3698 int total_size = size1 + size2;
3699 int endpos = startpos + range;
c8499ba5 3700 int anchored_start = 0;
fa9a63c5 3701
25fe55af 3702 /* Nonzero if we have to concern multibyte character. */
0683b6fa 3703 const boolean multibyte = bufp->multibyte;
b18215fc 3704
fa9a63c5
RM
3705 /* Check for out-of-range STARTPOS. */
3706 if (startpos < 0 || startpos > total_size)
3707 return -1;
5e69f11e 3708
fa9a63c5 3709 /* Fix up RANGE if it might eventually take us outside
34597fa9 3710 the virtual concatenation of STRING1 and STRING2.
5e69f11e 3711 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
3712 if (endpos < 0)
3713 range = 0 - startpos;
fa9a63c5
RM
3714 else if (endpos > total_size)
3715 range = total_size - startpos;
3716
3717 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 3718 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
3719 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3720 {
3721 if (startpos > 0)
3722 return -1;
3723 else
7b140fd7 3724 range = 0;
fa9a63c5
RM
3725 }
3726
ae4788a8
RS
3727#ifdef emacs
3728 /* In a forward search for something that starts with \=.
3729 don't keep searching past point. */
3730 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3731 {
7b140fd7
RS
3732 range = PT_BYTE - BEGV_BYTE - startpos;
3733 if (range < 0)
ae4788a8
RS
3734 return -1;
3735 }
3736#endif /* emacs */
3737
fa9a63c5
RM
3738 /* Update the fastmap now if not correct already. */
3739 if (fastmap && !bufp->fastmap_accurate)
3740 if (re_compile_fastmap (bufp) == -2)
3741 return -2;
5e69f11e 3742
c8499ba5
RS
3743 /* See whether the pattern is anchored. */
3744 if (bufp->buffer[0] == begline)
3745 anchored_start = 1;
3746
b18215fc 3747#ifdef emacs
cc9b4df2
KH
3748 gl_state.object = re_match_object;
3749 {
99633e97 3750 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
3751
3752 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3753 }
b18215fc
RS
3754#endif
3755
fa9a63c5
RM
3756 /* Loop through the string, looking for a place to start matching. */
3757 for (;;)
5e69f11e 3758 {
c8499ba5
RS
3759 /* If the pattern is anchored,
3760 skip quickly past places we cannot match.
3761 We don't bother to treat startpos == 0 specially
3762 because that case doesn't repeat. */
3763 if (anchored_start && startpos > 0)
3764 {
3765 if (! (bufp->newline_anchor
3766 && ((startpos <= size1 ? string1[startpos - 1]
3767 : string2[startpos - size1 - 1])
3768 == '\n')))
3769 goto advance;
3770 }
3771
fa9a63c5 3772 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
3773 cannot be the start of a match. If the pattern can match the
3774 null string, however, we don't need to skip characters; we want
3775 the first null string. */
fa9a63c5
RM
3776 if (fastmap && startpos < total_size && !bufp->can_be_null)
3777 {
66f0296e 3778 register re_char *d;
e934739e
RS
3779 register unsigned int buf_ch;
3780
3781 d = POS_ADDR_VSTRING (startpos);
3782
25fe55af 3783 if (range > 0) /* Searching forwards. */
fa9a63c5 3784 {
fa9a63c5
RM
3785 register int lim = 0;
3786 int irange = range;
3787
25fe55af
RS
3788 if (startpos < size1 && startpos + range >= size1)
3789 lim = range - (size1 - startpos);
fa9a63c5 3790
25fe55af
RS
3791 /* Written out as an if-else to avoid testing `translate'
3792 inside the loop. */
28ae27ae
AS
3793 if (RE_TRANSLATE_P (translate))
3794 {
e934739e
RS
3795 if (multibyte)
3796 while (range > lim)
3797 {
3798 int buf_charlen;
3799
3800 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
3801 buf_charlen);
3802
3803 buf_ch = RE_TRANSLATE (translate, buf_ch);
3804 if (buf_ch >= 0400
3805 || fastmap[buf_ch])
3806 break;
3807
3808 range -= buf_charlen;
3809 d += buf_charlen;
3810 }
3811 else
3812 while (range > lim
66f0296e 3813 && !fastmap[RE_TRANSLATE (translate, *d)])
33c46939
RS
3814 {
3815 d++;
3816 range--;
3817 }
e934739e 3818 }
fa9a63c5 3819 else
66f0296e 3820 while (range > lim && !fastmap[*d])
33c46939
RS
3821 {
3822 d++;
3823 range--;
3824 }
fa9a63c5
RM
3825
3826 startpos += irange - range;
3827 }
25fe55af 3828 else /* Searching backwards. */
fa9a63c5 3829 {
4e8a9132
SM
3830 buf_ch = STRING_CHAR (d, (startpos >= size1
3831 ? size2 + size1 - startpos
3832 : size1 - startpos));
28703c16 3833 if (RE_TRANSLATE_P (translate))
e934739e 3834 buf_ch = RE_TRANSLATE (translate, buf_ch);
fa9a63c5 3835
e934739e
RS
3836 if (! (buf_ch >= 0400
3837 || fastmap[buf_ch]))
fa9a63c5
RM
3838 goto advance;
3839 }
3840 }
3841
3842 /* If can't match the null string, and that's all we have left, fail. */
3843 if (range >= 0 && startpos == total_size && fastmap
25fe55af 3844 && !bufp->can_be_null)
fa9a63c5
RM
3845 return -1;
3846
3847 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3848 startpos, regs, stop);
3849#ifndef REGEX_MALLOC
3850#ifdef C_ALLOCA
3851 alloca (0);
3852#endif
3853#endif
3854
3855 if (val >= 0)
3856 return startpos;
5e69f11e 3857
fa9a63c5
RM
3858 if (val == -2)
3859 return -2;
3860
3861 advance:
5e69f11e 3862 if (!range)
25fe55af 3863 break;
5e69f11e 3864 else if (range > 0)
25fe55af 3865 {
b18215fc
RS
3866 /* Update STARTPOS to the next character boundary. */
3867 if (multibyte)
3868 {
66f0296e
SM
3869 re_char *p = POS_ADDR_VSTRING (startpos);
3870 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
3871 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
3872
3873 range -= len;
3874 if (range < 0)
3875 break;
3876 startpos += len;
3877 }
3878 else
3879 {
b560c397
RS
3880 range--;
3881 startpos++;
3882 }
e318085a 3883 }
fa9a63c5 3884 else
25fe55af
RS
3885 {
3886 range++;
3887 startpos--;
b18215fc
RS
3888
3889 /* Update STARTPOS to the previous character boundary. */
3890 if (multibyte)
3891 {
66f0296e 3892 re_char *p = POS_ADDR_VSTRING (startpos);
b18215fc
RS
3893 int len = 0;
3894
3895 /* Find the head of multibyte form. */
5d967c7a 3896 while (!CHAR_HEAD_P (*p))
b18215fc
RS
3897 p--, len++;
3898
3899 /* Adjust it. */
3900#if 0 /* XXX */
3901 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
3902 ;
3903 else
3904#endif
3905 {
3906 range += len;
3907 if (range > 0)
3908 break;
3909
3910 startpos -= len;
3911 }
3912 }
25fe55af 3913 }
fa9a63c5
RM
3914 }
3915 return -1;
3916} /* re_search_2 */
3917\f
3918/* Declarations and macros for re_match_2. */
3919
3920static int bcmp_translate ();
fa9a63c5
RM
3921
3922/* This converts PTR, a pointer into one of the search strings `string1'
3923 and `string2' into an offset from the beginning of that string. */
3924#define POINTER_TO_OFFSET(ptr) \
3925 (FIRST_STRING_P (ptr) \
3926 ? ((regoff_t) ((ptr) - string1)) \
3927 : ((regoff_t) ((ptr) - string2 + size1)))
3928
fa9a63c5
RM
3929/* Call before fetching a character with *d. This switches over to
3930 string2 if necessary. */
3931#define PREFETCH() \
25fe55af 3932 while (d == dend) \
fa9a63c5
RM
3933 { \
3934 /* End of string2 => fail. */ \
25fe55af
RS
3935 if (dend == end_match_2) \
3936 goto fail; \
3937 /* End of string1 => advance to string2. */ \
3938 d = string2; \
fa9a63c5
RM
3939 dend = end_match_2; \
3940 }
3941
3942
3943/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 3944 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 3945#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 3946#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
3947
3948
3949/* Test if D points to a character which is word-constituent. We have
3950 two special cases to check for: if past the end of string1, look at
3951 the first character in string2; and if before the beginning of
3952 string2, look at the last character in string1. */
3953#define WORDCHAR_P(d) \
3954 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 3955 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
3956 == Sword)
3957
9121ca40 3958/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
3959
3960/* The comment at case wordbound is following one, but we don't use
3961 AT_WORD_BOUNDARY anymore to support multibyte form.
3962
3963 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
3964 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
3965 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
3966 macro and introducing temporary variables works around the bug. */
3967
9121ca40 3968#if 0
fa9a63c5
RM
3969/* Test if the character before D and the one at D differ with respect
3970 to being word-constituent. */
3971#define AT_WORD_BOUNDARY(d) \
3972 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3973 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 3974#endif
fa9a63c5
RM
3975
3976/* Free everything we malloc. */
3977#ifdef MATCH_MAY_ALLOCATE
00049484 3978#define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
fa9a63c5
RM
3979#define FREE_VARIABLES() \
3980 do { \
3981 REGEX_FREE_STACK (fail_stack.stack); \
3982 FREE_VAR (regstart); \
3983 FREE_VAR (regend); \
fa9a63c5
RM
3984 FREE_VAR (best_regstart); \
3985 FREE_VAR (best_regend); \
fa9a63c5
RM
3986 } while (0)
3987#else
3988#define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
3989#endif /* not MATCH_MAY_ALLOCATE */
3990
505bde11
SM
3991\f
3992/* Optimization routines. */
3993
4e8a9132
SM
3994/* If the operation is a match against one or more chars,
3995 return a pointer to the next operation, else return NULL. */
3996static unsigned char *
3997skip_one_char (p)
3998 unsigned char *p;
3999{
4000 switch (SWITCH_ENUM_CAST (*p++))
4001 {
4002 case anychar:
4003 break;
4004
4005 case exactn:
4006 p += *p + 1;
4007 break;
4008
4009 case charset_not:
4010 case charset:
4011 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4012 {
4013 int mcnt;
4014 p = CHARSET_RANGE_TABLE (p - 1);
4015 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4016 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4017 }
4018 else
4019 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4020 break;
4021
4e8a9132
SM
4022 case syntaxspec:
4023 case notsyntaxspec:
1fb352e0 4024#ifdef emacs
4e8a9132
SM
4025 case categoryspec:
4026 case notcategoryspec:
4027#endif /* emacs */
4028 p++;
4029 break;
4030
4031 default:
4032 p = NULL;
4033 }
4034 return p;
4035}
4036
4037
505bde11
SM
4038/* Jump over non-matching operations. */
4039static unsigned char *
4e8a9132 4040skip_noops (p, pend)
505bde11 4041 unsigned char *p, *pend;
505bde11
SM
4042{
4043 int mcnt;
4044 while (p < pend)
4045 {
4046 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4047 {
4048 case start_memory:
505bde11
SM
4049 case stop_memory:
4050 p += 2; break;
4051 case no_op:
4052 p += 1; break;
4053 case jump:
4054 p += 1;
4055 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4056 p += mcnt;
4057 break;
4058 default:
4059 return p;
4060 }
4061 }
4062 assert (p == pend);
4063 return p;
4064}
4065
4066/* Non-zero if "p1 matches something" implies "p2 fails". */
4067static int
4068mutually_exclusive_p (bufp, p1, p2)
4069 struct re_pattern_buffer *bufp;
4070 unsigned char *p1, *p2;
4071{
4e8a9132
SM
4072 re_opcode_t op2;
4073 const boolean multibyte = bufp->multibyte;
505bde11
SM
4074 unsigned char *pend = bufp->buffer + bufp->used;
4075
4e8a9132 4076 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4077 && p2 >= bufp->buffer && p2 <= pend);
4078
4079 /* Skip over open/close-group commands.
4080 If what follows this loop is a ...+ construct,
4081 look at what begins its body, since we will have to
4082 match at least one of that. */
4e8a9132
SM
4083 p2 = skip_noops (p2, pend);
4084 /* The same skip can be done for p1, except that this function
4085 is only used in the case where p1 is a simple match operator. */
4086 /* p1 = skip_noops (p1, pend); */
4087
4088 assert (p1 >= bufp->buffer && p1 < pend
4089 && p2 >= bufp->buffer && p2 <= pend);
4090
4091 op2 = p2 == pend ? succeed : *p2;
4092
4093 switch (SWITCH_ENUM_CAST (op2))
505bde11 4094 {
4e8a9132
SM
4095 case succeed:
4096 case endbuf:
4097 /* If we're at the end of the pattern, we can change. */
4098 if (skip_one_char (p1))
505bde11 4099 {
505bde11
SM
4100 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4101 return 1;
505bde11 4102 }
4e8a9132
SM
4103 break;
4104
4105 case endline:
4106 if (!bufp->newline_anchor)
4107 break;
4108 /* Fallthrough */
4109 case exactn:
4110 {
4111 register unsigned int c
4112 = (re_opcode_t) *p2 == endline ? '\n'
4113 : RE_STRING_CHAR(p2 + 2, pend - p2 - 2);
505bde11 4114
4e8a9132
SM
4115 if ((re_opcode_t) *p1 == exactn)
4116 {
4117 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4118 {
4119 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4120 return 1;
4121 }
4122 }
505bde11 4123
4e8a9132
SM
4124 else if ((re_opcode_t) *p1 == charset
4125 || (re_opcode_t) *p1 == charset_not)
4126 {
4127 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4128
4e8a9132
SM
4129 /* Test if C is listed in charset (or charset_not)
4130 at `p1'. */
4131 if (SINGLE_BYTE_CHAR_P (c))
4132 {
4133 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4134 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4135 not = !not;
4136 }
4137 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4138 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4139
4e8a9132
SM
4140 /* `not' is equal to 1 if c would match, which means
4141 that we can't change to pop_failure_jump. */
4142 if (!not)
4143 {
4144 DEBUG_PRINT1 (" No match => fast loop.\n");
4145 return 1;
4146 }
4147 }
4148 else if ((re_opcode_t) *p1 == anychar
4149 && c == '\n')
4150 {
4151 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4152 return 1;
4153 }
4154 }
4155 break;
505bde11 4156
4e8a9132
SM
4157 case charset:
4158 case charset_not:
4159 {
4160 if ((re_opcode_t) *p1 == exactn)
4161 /* Reuse the code above. */
4162 return mutually_exclusive_p (bufp, p2, p1);
505bde11
SM
4163
4164
4165 /* It is hard to list up all the character in charset
4166 P2 if it includes multibyte character. Give up in
4167 such case. */
4168 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4169 {
4170 /* Now, we are sure that P2 has no range table.
4171 So, for the size of bitmap in P2, `p2[1]' is
4172 enough. But P1 may have range table, so the
4173 size of bitmap table of P1 is extracted by
4174 using macro `CHARSET_BITMAP_SIZE'.
4175
4176 Since we know that all the character listed in
4177 P2 is ASCII, it is enough to test only bitmap
4178 table of P1. */
4179
4180 if (*p1 == *p2)
4181 {
4182 int idx;
4183 /* We win if the charset inside the loop
4184 has no overlap with the one after the loop. */
4185 for (idx = 0;
4186 (idx < (int) p2[1]
4187 && idx < CHARSET_BITMAP_SIZE (p1));
4188 idx++)
4189 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4190 break;
4191
4192 if (idx == p2[1]
4193 || idx == CHARSET_BITMAP_SIZE (p1))
4194 {
4195 DEBUG_PRINT1 (" No match => fast loop.\n");
4196 return 1;
4197 }
4198 }
4199 else if ((re_opcode_t) *p1 == charset
4200 || (re_opcode_t) *p1 == charset_not)
4201 {
4202 int idx;
4203 /* We win if the charset_not inside the loop lists
4204 every character listed in the charset after. */
4205 for (idx = 0; idx < (int) p2[1]; idx++)
4206 if (! (p2[2 + idx] == 0
4207 || (idx < CHARSET_BITMAP_SIZE (p1)
4208 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4209 break;
4210
4e8a9132
SM
4211 if (idx == p2[1])
4212 {
4213 DEBUG_PRINT1 (" No match => fast loop.\n");
4214 return 1;
4215 }
4216 }
4217 }
4218 }
4219
4e8a9132
SM
4220 case wordend:
4221 case notsyntaxspec:
4222 return ((re_opcode_t) *p1 == syntaxspec
4223 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4224
4225 case wordbeg:
4226 case syntaxspec:
4227 return ((re_opcode_t) *p1 == notsyntaxspec
4228 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4229
4230 case wordbound:
4231 return (((re_opcode_t) *p1 == notsyntaxspec
4232 || (re_opcode_t) *p1 == syntaxspec)
4233 && p1[1] == Sword);
4234
1fb352e0 4235#ifdef emacs
4e8a9132
SM
4236 case categoryspec:
4237 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4238 case notcategoryspec:
4239 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4240#endif /* emacs */
4241
4242 default:
4243 ;
505bde11
SM
4244 }
4245
4246 /* Safe default. */
4247 return 0;
4248}
4249
fa9a63c5
RM
4250\f
4251/* Matching routines. */
4252
25fe55af 4253#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4254/* re_match is like re_match_2 except it takes only a single string. */
4255
4256int
4257re_match (bufp, string, size, pos, regs)
4258 struct re_pattern_buffer *bufp;
4259 const char *string;
4260 int size, pos;
4261 struct re_registers *regs;
4262{
4263 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4264 pos, regs, size);
4265 alloca (0);
4266 return result;
4267}
4268#endif /* not emacs */
4269
b18215fc
RS
4270#ifdef emacs
4271/* In Emacs, this is the string or buffer in which we
25fe55af 4272 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4273Lisp_Object re_match_object;
4274#endif
fa9a63c5
RM
4275
4276/* re_match_2 matches the compiled pattern in BUFP against the
4277 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4278 and SIZE2, respectively). We start matching at POS, and stop
4279 matching at STOP.
5e69f11e 4280
fa9a63c5 4281 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4282 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4283 documentation for exactly how many groups we fill.
4284
4285 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4286 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4287 matched substring. */
4288
4289int
4290re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4291 struct re_pattern_buffer *bufp;
4292 const char *string1, *string2;
4293 int size1, size2;
4294 int pos;
4295 struct re_registers *regs;
4296 int stop;
4297{
b18215fc 4298 int result;
25fe55af 4299
b18215fc 4300#ifdef emacs
cc9b4df2
KH
4301 int charpos;
4302 gl_state.object = re_match_object;
99633e97 4303 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4304 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4305#endif
4306
4307 result = re_match_2_internal (bufp, string1, size1, string2, size2,
cc9b4df2 4308 pos, regs, stop);
fa9a63c5
RM
4309 alloca (0);
4310 return result;
4311}
4312
4313/* This is a separate function so that we can force an alloca cleanup
25fe55af 4314 afterwards. */
fa9a63c5
RM
4315static int
4316re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4317 struct re_pattern_buffer *bufp;
66f0296e 4318 re_char *string1, *string2;
fa9a63c5
RM
4319 int size1, size2;
4320 int pos;
4321 struct re_registers *regs;
4322 int stop;
4323{
4324 /* General temporaries. */
4325 int mcnt;
66f0296e 4326 boolean not;
fa9a63c5
RM
4327 unsigned char *p1;
4328
4329 /* Just past the end of the corresponding string. */
66f0296e 4330 re_char *end1, *end2;
fa9a63c5
RM
4331
4332 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4333 each to consider matching. */
66f0296e 4334 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4335
4336 /* Where we are in the data, and the end of the current string. */
66f0296e 4337 re_char *d, *dend;
5e69f11e 4338
99633e97
SM
4339 /* Used sometimes to remember where we were before starting matching
4340 an operator so that we can go back in case of failure. This "atomic"
4341 behavior of matching opcodes is indispensable to the correctness
4342 of the on_failure_keep_string_jump optimization. */
4343 re_char *dfail;
4344
fa9a63c5
RM
4345 /* Where we are in the pattern, and the end of the pattern. */
4346 unsigned char *p = bufp->buffer;
4347 register unsigned char *pend = p + bufp->used;
4348
25fe55af 4349 /* We use this to map every character in the string. */
6676cb1c 4350 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4351
25fe55af 4352 /* Nonzero if we have to concern multibyte character. */
1fb352e0 4353 const boolean multibyte = bufp->multibyte;
b18215fc 4354
fa9a63c5
RM
4355 /* Failure point stack. Each place that can handle a failure further
4356 down the line pushes a failure point on this stack. It consists of
505bde11 4357 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4358 the subexpressions we're currently inside, plus the number of such
4359 registers, and, finally, two char *'s. The first char * is where
4360 to resume scanning the pattern; the second one is where to resume
505bde11 4361 scanning the strings. */
25fe55af 4362#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4363 fail_stack_type fail_stack;
4364#endif
4365#ifdef DEBUG
4366 static unsigned failure_id = 0;
4367 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4368#endif
4369
99633e97 4370#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
fa9a63c5
RM
4371 /* This holds the pointer to the failure stack, when
4372 it is allocated relocatably. */
4373 fail_stack_elt_t *failure_stack_ptr;
99633e97 4374#endif
fa9a63c5
RM
4375
4376 /* We fill all the registers internally, independent of what we
25fe55af 4377 return, for use in backreferences. The number here includes
fa9a63c5
RM
4378 an element for register zero. */
4379 unsigned num_regs = bufp->re_nsub + 1;
5e69f11e 4380
fa9a63c5
RM
4381 /* Information on the contents of registers. These are pointers into
4382 the input strings; they record just what was matched (on this
4383 attempt) by a subexpression part of the pattern, that is, the
4384 regnum-th regstart pointer points to where in the pattern we began
4385 matching and the regnum-th regend points to right after where we
4386 stopped matching the regnum-th subexpression. (The zeroth register
4387 keeps track of what the whole pattern matches.) */
4388#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4389 re_char **regstart, **regend;
fa9a63c5
RM
4390#endif
4391
fa9a63c5 4392 /* The following record the register info as found in the above
5e69f11e 4393 variables when we find a match better than any we've seen before.
fa9a63c5
RM
4394 This happens as we backtrack through the failure points, which in
4395 turn happens only if we have not yet matched the entire string. */
4396 unsigned best_regs_set = false;
4397#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4398 re_char **best_regstart, **best_regend;
fa9a63c5 4399#endif
5e69f11e 4400
fa9a63c5
RM
4401 /* Logically, this is `best_regend[0]'. But we don't want to have to
4402 allocate space for that if we're not allocating space for anything
25fe55af 4403 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
4404 any of the other register vectors, and it seems rather a kludge to
4405 treat `best_regend' differently than the rest. So we keep track of
4406 the end of the best match so far in a separate variable. We
4407 initialize this to NULL so that when we backtrack the first time
4408 and need to test it, it's not garbage. */
66f0296e 4409 re_char *match_end = NULL;
fa9a63c5 4410
fa9a63c5
RM
4411#ifdef DEBUG
4412 /* Counts the total number of registers pushed. */
5e69f11e 4413 unsigned num_regs_pushed = 0;
fa9a63c5
RM
4414#endif
4415
4416 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 4417
fa9a63c5 4418 INIT_FAIL_STACK ();
5e69f11e 4419
fa9a63c5
RM
4420#ifdef MATCH_MAY_ALLOCATE
4421 /* Do not bother to initialize all the register variables if there are
4422 no groups in the pattern, as it takes a fair amount of time. If
4423 there are groups, we include space for register 0 (the whole
4424 pattern), even though we never use it, since it simplifies the
4425 array indexing. We should fix this. */
4426 if (bufp->re_nsub)
4427 {
66f0296e
SM
4428 regstart = REGEX_TALLOC (num_regs, re_char *);
4429 regend = REGEX_TALLOC (num_regs, re_char *);
4430 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4431 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 4432
505bde11 4433 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
4434 {
4435 FREE_VARIABLES ();
4436 return -2;
4437 }
fa9a63c5
RM
4438 }
4439 else
4440 {
4441 /* We must initialize all our variables to NULL, so that
25fe55af 4442 `FREE_VARIABLES' doesn't try to free them. */
505bde11 4443 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
4444 }
4445#endif /* MATCH_MAY_ALLOCATE */
4446
4447 /* The starting position is bogus. */
4448 if (pos < 0 || pos > size1 + size2)
4449 {
4450 FREE_VARIABLES ();
4451 return -1;
4452 }
5e69f11e 4453
fa9a63c5
RM
4454 /* Initialize subexpression text positions to -1 to mark ones that no
4455 start_memory/stop_memory has been seen for. Also initialize the
4456 register information struct. */
4457 for (mcnt = 1; mcnt < num_regs; mcnt++)
99633e97
SM
4458 regstart[mcnt] = regend[mcnt] = REG_UNSET_VALUE;
4459
4460 /* Shorten strings to `stop'. */
4461 if (stop <= size1)
fa9a63c5 4462 {
99633e97
SM
4463 size1 = stop;
4464 size2 = 0;
fa9a63c5 4465 }
99633e97
SM
4466 else if (stop <= size1 + size2)
4467 size2 = stop - size1;
5e69f11e 4468
fa9a63c5 4469 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 4470 `string1' is null. */
fa9a63c5
RM
4471 if (size2 == 0 && string1 != NULL)
4472 {
4473 string2 = string1;
4474 size2 = size1;
4475 string1 = 0;
4476 size1 = 0;
4477 }
4478 end1 = string1 + size1;
4479 end2 = string2 + size2;
4480
4481 /* Compute where to stop matching, within the two strings. */
99633e97
SM
4482 end_match_1 = end1;
4483 end_match_2 = end2;
fa9a63c5 4484
5e69f11e 4485 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
4486 `dend' is the end of the input string that `d' points within. `d'
4487 is advanced into the following input string whenever necessary, but
4488 this happens before fetching; therefore, at the beginning of the
4489 loop, `d' can be pointing at the end of a string, but it cannot
4490 equal `string2'. */
4491 if (size1 > 0 && pos <= size1)
4492 {
4493 d = string1 + pos;
4494 dend = end_match_1;
4495 }
4496 else
4497 {
4498 d = string2 + pos - size1;
4499 dend = end_match_2;
4500 }
4501
4502 DEBUG_PRINT1 ("The compiled pattern is: ");
4503 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4504 DEBUG_PRINT1 ("The string to match is: `");
4505 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4506 DEBUG_PRINT1 ("'\n");
5e69f11e 4507
25fe55af 4508 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
4509 function if the match is complete, or it drops through if the match
4510 fails at this starting point in the input data. */
4511 for (;;)
4512 {
505bde11 4513 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
4514
4515 if (p == pend)
4516 { /* End of pattern means we might have succeeded. */
25fe55af 4517 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 4518
fa9a63c5 4519 /* If we haven't matched the entire string, and we want the
25fe55af
RS
4520 longest match, try backtracking. */
4521 if (d != end_match_2)
fa9a63c5
RM
4522 {
4523 /* 1 if this match ends in the same string (string1 or string2)
4524 as the best previous match. */
5e69f11e 4525 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 4526 == FIRST_STRING_P (d));
fa9a63c5
RM
4527 /* 1 if this match is the best seen so far. */
4528 boolean best_match_p;
4529
4530 /* AIX compiler got confused when this was combined
25fe55af 4531 with the previous declaration. */
fa9a63c5
RM
4532 if (same_str_p)
4533 best_match_p = d > match_end;
4534 else
99633e97 4535 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 4536
25fe55af
RS
4537 DEBUG_PRINT1 ("backtracking.\n");
4538
4539 if (!FAIL_STACK_EMPTY ())
4540 { /* More failure points to try. */
4541
4542 /* If exceeds best match so far, save it. */
4543 if (!best_regs_set || best_match_p)
4544 {
4545 best_regs_set = true;
4546 match_end = d;
4547
4548 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4549
4550 for (mcnt = 1; mcnt < num_regs; mcnt++)
4551 {
4552 best_regstart[mcnt] = regstart[mcnt];
4553 best_regend[mcnt] = regend[mcnt];
4554 }
4555 }
4556 goto fail;
4557 }
4558
4559 /* If no failure points, don't restore garbage. And if
4560 last match is real best match, don't restore second
4561 best one. */
4562 else if (best_regs_set && !best_match_p)
4563 {
4564 restore_best_regs:
4565 /* Restore best match. It may happen that `dend ==
4566 end_match_1' while the restored d is in string2.
4567 For example, the pattern `x.*y.*z' against the
4568 strings `x-' and `y-z-', if the two strings are
4569 not consecutive in memory. */
4570 DEBUG_PRINT1 ("Restoring best registers.\n");
4571
4572 d = match_end;
4573 dend = ((d >= string1 && d <= end1)
4574 ? end_match_1 : end_match_2);
fa9a63c5
RM
4575
4576 for (mcnt = 1; mcnt < num_regs; mcnt++)
4577 {
4578 regstart[mcnt] = best_regstart[mcnt];
4579 regend[mcnt] = best_regend[mcnt];
4580 }
25fe55af
RS
4581 }
4582 } /* d != end_match_2 */
fa9a63c5
RM
4583
4584 succeed_label:
25fe55af 4585 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 4586
25fe55af
RS
4587 /* If caller wants register contents data back, do it. */
4588 if (regs && !bufp->no_sub)
fa9a63c5 4589 {
25fe55af
RS
4590 /* Have the register data arrays been allocated? */
4591 if (bufp->regs_allocated == REGS_UNALLOCATED)
4592 { /* No. So allocate them with malloc. We need one
4593 extra element beyond `num_regs' for the `-1' marker
4594 GNU code uses. */
4595 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4596 regs->start = TALLOC (regs->num_regs, regoff_t);
4597 regs->end = TALLOC (regs->num_regs, regoff_t);
4598 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
4599 {
4600 FREE_VARIABLES ();
4601 return -2;
4602 }
25fe55af
RS
4603 bufp->regs_allocated = REGS_REALLOCATE;
4604 }
4605 else if (bufp->regs_allocated == REGS_REALLOCATE)
4606 { /* Yes. If we need more elements than were already
4607 allocated, reallocate them. If we need fewer, just
4608 leave it alone. */
4609 if (regs->num_regs < num_regs + 1)
4610 {
4611 regs->num_regs = num_regs + 1;
4612 RETALLOC (regs->start, regs->num_regs, regoff_t);
4613 RETALLOC (regs->end, regs->num_regs, regoff_t);
4614 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
4615 {
4616 FREE_VARIABLES ();
4617 return -2;
4618 }
25fe55af
RS
4619 }
4620 }
4621 else
fa9a63c5
RM
4622 {
4623 /* These braces fend off a "empty body in an else-statement"
25fe55af 4624 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
4625 assert (bufp->regs_allocated == REGS_FIXED);
4626 }
4627
25fe55af
RS
4628 /* Convert the pointer data in `regstart' and `regend' to
4629 indices. Register zero has to be set differently,
4630 since we haven't kept track of any info for it. */
4631 if (regs->num_regs > 0)
4632 {
4633 regs->start[0] = pos;
99633e97 4634 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 4635 }
5e69f11e 4636
25fe55af
RS
4637 /* Go through the first `min (num_regs, regs->num_regs)'
4638 registers, since that is all we initialized. */
fa9a63c5
RM
4639 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4640 {
25fe55af
RS
4641 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4642 regs->start[mcnt] = regs->end[mcnt] = -1;
4643 else
4644 {
fa9a63c5
RM
4645 regs->start[mcnt]
4646 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
25fe55af 4647 regs->end[mcnt]
fa9a63c5 4648 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
25fe55af 4649 }
fa9a63c5 4650 }
5e69f11e 4651
25fe55af
RS
4652 /* If the regs structure we return has more elements than
4653 were in the pattern, set the extra elements to -1. If
4654 we (re)allocated the registers, this is the case,
4655 because we always allocate enough to have at least one
4656 -1 at the end. */
4657 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4658 regs->start[mcnt] = regs->end[mcnt] = -1;
fa9a63c5
RM
4659 } /* regs && !bufp->no_sub */
4660
25fe55af
RS
4661 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4662 nfailure_points_pushed, nfailure_points_popped,
4663 nfailure_points_pushed - nfailure_points_popped);
4664 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 4665
99633e97 4666 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 4667
25fe55af 4668 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 4669
25fe55af
RS
4670 FREE_VARIABLES ();
4671 return mcnt;
4672 }
fa9a63c5 4673
25fe55af 4674 /* Otherwise match next pattern command. */
fa9a63c5
RM
4675 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4676 {
25fe55af
RS
4677 /* Ignore these. Used to ignore the n of succeed_n's which
4678 currently have n == 0. */
4679 case no_op:
4680 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4681 break;
fa9a63c5
RM
4682
4683 case succeed:
25fe55af 4684 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
4685 goto succeed_label;
4686
25fe55af
RS
4687 /* Match the next n pattern characters exactly. The following
4688 byte in the pattern defines n, and the n bytes after that
4689 are the characters to match. */
fa9a63c5
RM
4690 case exactn:
4691 mcnt = *p++;
25fe55af 4692 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 4693
99633e97
SM
4694 /* Remember the start point to rollback upon failure. */
4695 dfail = d;
4696
25fe55af
RS
4697 /* This is written out as an if-else so we don't waste time
4698 testing `translate' inside the loop. */
28703c16 4699 if (RE_TRANSLATE_P (translate))
fa9a63c5 4700 {
e934739e
RS
4701#ifdef emacs
4702 if (multibyte)
4703 do
4704 {
4705 int pat_charlen, buf_charlen;
e71b1971 4706 unsigned int pat_ch, buf_ch;
e934739e
RS
4707
4708 PREFETCH ();
4709 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4710 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4711
4712 if (RE_TRANSLATE (translate, buf_ch)
4713 != pat_ch)
99633e97
SM
4714 {
4715 d = dfail;
4716 goto fail;
4717 }
e934739e
RS
4718
4719 p += pat_charlen;
4720 d += buf_charlen;
4721 mcnt -= pat_charlen;
4722 }
4723 while (mcnt > 0);
4724 else
4725#endif /* not emacs */
4726 do
4727 {
4728 PREFETCH ();
66f0296e 4729 if (RE_TRANSLATE (translate, *d) != *p++)
99633e97
SM
4730 {
4731 d = dfail;
4732 goto fail;
4733 }
33c46939 4734 d++;
e934739e
RS
4735 }
4736 while (--mcnt);
fa9a63c5
RM
4737 }
4738 else
4739 {
4740 do
4741 {
4742 PREFETCH ();
99633e97
SM
4743 if (*d++ != *p++)
4744 {
4745 d = dfail;
4746 goto fail;
4747 }
fa9a63c5
RM
4748 }
4749 while (--mcnt);
4750 }
25fe55af 4751 break;
fa9a63c5
RM
4752
4753
25fe55af 4754 /* Match any character except possibly a newline or a null. */
fa9a63c5 4755 case anychar:
e934739e
RS
4756 {
4757 int buf_charlen;
e71b1971 4758 unsigned int buf_ch;
fa9a63c5 4759
e934739e 4760 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 4761
e934739e 4762 PREFETCH ();
fa9a63c5 4763
e934739e
RS
4764#ifdef emacs
4765 if (multibyte)
4766 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4767 else
4768#endif /* not emacs */
4769 {
66f0296e 4770 buf_ch = *d;
e934739e
RS
4771 buf_charlen = 1;
4772 }
4773
4774 buf_ch = TRANSLATE (buf_ch);
4775
4776 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4777 && buf_ch == '\n')
4778 || ((bufp->syntax & RE_DOT_NOT_NULL)
4779 && buf_ch == '\000'))
4780 goto fail;
4781
e934739e
RS
4782 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4783 d += buf_charlen;
4784 }
fa9a63c5
RM
4785 break;
4786
4787
4788 case charset:
4789 case charset_not:
4790 {
b18215fc 4791 register unsigned int c;
fa9a63c5 4792 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4793 int len;
4794
4795 /* Start of actual range_table, or end of bitmap if there is no
4796 range table. */
4797 unsigned char *range_table;
4798
96cc36cc 4799 /* Nonzero if there is a range table. */
b18215fc
RS
4800 int range_table_exists;
4801
96cc36cc
RS
4802 /* Number of ranges of range table. This is not included
4803 in the initial byte-length of the command. */
4804 int count = 0;
fa9a63c5 4805
25fe55af 4806 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5
RM
4807
4808 PREFETCH ();
66f0296e 4809 c = *d;
fa9a63c5 4810
b18215fc 4811 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc
RS
4812
4813#ifdef emacs
b18215fc 4814 if (range_table_exists)
96cc36cc
RS
4815 {
4816 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4817 EXTRACT_NUMBER_AND_INCR (count, range_table);
4818 }
b18215fc
RS
4819
4820 if (multibyte && BASE_LEADING_CODE_P (c))
4821 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
96cc36cc 4822#endif /* emacs */
b18215fc
RS
4823
4824 if (SINGLE_BYTE_CHAR_P (c))
4825 { /* Lookup bitmap. */
4826 c = TRANSLATE (c); /* The character to match. */
4827 len = 1;
4828
4829 /* Cast to `unsigned' instead of `unsigned char' in
4830 case the bit list is a full 32 bytes long. */
4831 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
4832 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4833 not = !not;
b18215fc 4834 }
96cc36cc 4835#ifdef emacs
b18215fc 4836 else if (range_table_exists)
96cc36cc
RS
4837 {
4838 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
4839
4840 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4841 | (class_bits & BIT_ALPHA && ISALPHA (c))
f71b19b6 4842 | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
96cc36cc
RS
4843 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4844 | (class_bits & BIT_LOWER && ISLOWER (c))
f71b19b6
DL
4845 | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
4846 | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
96cc36cc
RS
4847 | (class_bits & BIT_PRINT && ISPRINT (c))
4848 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4849 | (class_bits & BIT_SPACE && ISSPACE (c))
f71b19b6 4850 | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
96cc36cc
RS
4851 | (class_bits & BIT_UPPER && ISUPPER (c))
4852 | (class_bits & BIT_WORD && ISWORD (c)))
4853 not = !not;
4854 else
4855 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4856 }
4857#endif /* emacs */
fa9a63c5 4858
96cc36cc
RS
4859 if (range_table_exists)
4860 p = CHARSET_RANGE_TABLE_END (range_table, count);
4861 else
4862 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
4863
4864 if (!not) goto fail;
5e69f11e 4865
b18215fc 4866 d += len;
fa9a63c5
RM
4867 break;
4868 }
4869
4870
25fe55af 4871 /* The beginning of a group is represented by start_memory.
505bde11 4872 The argument is the register number. The text
25fe55af
RS
4873 matched within the group is recorded (in the internal
4874 registers data structure) under the register number. */
4875 case start_memory:
505bde11
SM
4876 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
4877
4878 /* In case we need to undo this operation (via backtracking). */
4879 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 4880
25fe55af 4881 regstart[*p] = d;
505bde11 4882 regend[*p] = REG_UNSET_VALUE; /* probably unnecessary. -sm */
fa9a63c5
RM
4883 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4884
25fe55af 4885 /* Move past the register number and inner group count. */
505bde11 4886 p += 1;
25fe55af 4887 break;
fa9a63c5
RM
4888
4889
25fe55af 4890 /* The stop_memory opcode represents the end of a group. Its
505bde11 4891 argument is the same as start_memory's: the register number. */
fa9a63c5 4892 case stop_memory:
505bde11
SM
4893 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
4894
4895 assert (!REG_UNSET (regstart[*p]));
4896 /* Strictly speaking, there should be code such as:
4897
4898 assert (REG_UNSET (regend[*p]));
4899 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
4900
4901 But the only info to be pushed is regend[*p] and it is known to
4902 be UNSET, so there really isn't anything to push.
4903 Not pushing anything, on the other hand deprives us from the
4904 guarantee that regend[*p] is UNSET since undoing this operation
4905 will not reset its value properly. This is not important since
4906 the value will only be read on the next start_memory or at
4907 the very end and both events can only happen if this stop_memory
4908 is *not* undone. */
fa9a63c5 4909
25fe55af 4910 regend[*p] = d;
fa9a63c5
RM
4911 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4912
25fe55af 4913 /* Move past the register number and the inner group count. */
505bde11 4914 p += 1;
25fe55af 4915 break;
fa9a63c5
RM
4916
4917
4918 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
4919 followed by the numeric value of <digit> as the register number. */
4920 case duplicate:
fa9a63c5 4921 {
66f0296e 4922 register re_char *d2, *dend2;
25fe55af 4923 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
4924 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4925
25fe55af
RS
4926 /* Can't back reference a group which we've never matched. */
4927 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4928 goto fail;
5e69f11e 4929
25fe55af
RS
4930 /* Where in input to try to start matching. */
4931 d2 = regstart[regno];
5e69f11e 4932
99633e97
SM
4933 /* Remember the start point to rollback upon failure. */
4934 dfail = d;
4935
25fe55af
RS
4936 /* Where to stop matching; if both the place to start and
4937 the place to stop matching are in the same string, then
4938 set to the place to stop, otherwise, for now have to use
4939 the end of the first string. */
fa9a63c5 4940
25fe55af 4941 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
4942 == FIRST_STRING_P (regend[regno]))
4943 ? regend[regno] : end_match_1);
4944 for (;;)
4945 {
4946 /* If necessary, advance to next segment in register
25fe55af 4947 contents. */
fa9a63c5
RM
4948 while (d2 == dend2)
4949 {
4950 if (dend2 == end_match_2) break;
4951 if (dend2 == regend[regno]) break;
4952
25fe55af
RS
4953 /* End of string1 => advance to string2. */
4954 d2 = string2;
4955 dend2 = regend[regno];
fa9a63c5
RM
4956 }
4957 /* At end of register contents => success */
4958 if (d2 == dend2) break;
4959
4960 /* If necessary, advance to next segment in data. */
4961 PREFETCH ();
4962
4963 /* How many characters left in this segment to match. */
4964 mcnt = dend - d;
5e69f11e 4965
fa9a63c5 4966 /* Want how many consecutive characters we can match in
25fe55af
RS
4967 one shot, so, if necessary, adjust the count. */
4968 if (mcnt > dend2 - d2)
fa9a63c5 4969 mcnt = dend2 - d2;
5e69f11e 4970
fa9a63c5 4971 /* Compare that many; failure if mismatch, else move
25fe55af 4972 past them. */
28703c16 4973 if (RE_TRANSLATE_P (translate)
25fe55af
RS
4974 ? bcmp_translate (d, d2, mcnt, translate)
4975 : bcmp (d, d2, mcnt))
99633e97
SM
4976 {
4977 d = dfail;
4978 goto fail;
4979 }
fa9a63c5 4980 d += mcnt, d2 += mcnt;
fa9a63c5
RM
4981 }
4982 }
4983 break;
4984
4985
25fe55af
RS
4986 /* begline matches the empty string at the beginning of the string
4987 (unless `not_bol' is set in `bufp'), and, if
4988 `newline_anchor' is set, after newlines. */
fa9a63c5 4989 case begline:
25fe55af 4990 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 4991
25fe55af
RS
4992 if (AT_STRINGS_BEG (d))
4993 {
4994 if (!bufp->not_bol) break;
4995 }
4996 else if (d[-1] == '\n' && bufp->newline_anchor)
4997 {
4998 break;
4999 }
5000 /* In all other cases, we fail. */
5001 goto fail;
fa9a63c5
RM
5002
5003
25fe55af 5004 /* endline is the dual of begline. */
fa9a63c5 5005 case endline:
25fe55af 5006 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5007
25fe55af
RS
5008 if (AT_STRINGS_END (d))
5009 {
5010 if (!bufp->not_eol) break;
5011 }
5e69f11e 5012
25fe55af
RS
5013 /* We have to ``prefetch'' the next character. */
5014 else if ((d == end1 ? *string2 : *d) == '\n'
5015 && bufp->newline_anchor)
5016 {
5017 break;
5018 }
5019 goto fail;
fa9a63c5
RM
5020
5021
5022 /* Match at the very beginning of the data. */
25fe55af
RS
5023 case begbuf:
5024 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5025 if (AT_STRINGS_BEG (d))
5026 break;
5027 goto fail;
fa9a63c5
RM
5028
5029
5030 /* Match at the very end of the data. */
25fe55af
RS
5031 case endbuf:
5032 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5033 if (AT_STRINGS_END (d))
5034 break;
25fe55af 5035 goto fail;
5e69f11e 5036
5e69f11e 5037
25fe55af
RS
5038 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5039 pushes NULL as the value for the string on the stack. Then
505bde11 5040 `POP_FAILURE_POINT' will keep the current value for the
25fe55af
RS
5041 string, instead of restoring it. To see why, consider
5042 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5043 then the . fails against the \n. But the next thing we want
5044 to do is match the \n against the \n; if we restored the
5045 string value, we would be back at the foo.
5046
5047 Because this is used only in specific cases, we don't need to
5048 check all the things that `on_failure_jump' does, to make
5049 sure the right things get saved on the stack. Hence we don't
5050 share its code. The only reason to push anything on the
5051 stack at all is that otherwise we would have to change
5052 `anychar's code to do something besides goto fail in this
5053 case; that seems worse than this. */
5054 case on_failure_keep_string_jump:
505bde11
SM
5055 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5056 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5057 mcnt, p + mcnt);
fa9a63c5 5058
505bde11
SM
5059 PUSH_FAILURE_POINT (p - 3, NULL);
5060 break;
5061
0683b6fa
SM
5062 /* A nasty loop is introduced by the non-greedy *? and +?.
5063 With such loops, the stack only ever contains one failure point
5064 at a time, so that a plain on_failure_jump_loop kind of
5065 cycle detection cannot work. Worse yet, such a detection
5066 can not only fail to detect a cycle, but it can also wrongly
5067 detect a cycle (between different instantiations of the same
5068 loop.
5069 So the method used for those nasty loops is a little different:
5070 We use a special cycle-detection-stack-frame which is pushed
5071 when the on_failure_jump_nastyloop failure-point is *popped*.
5072 This special frame thus marks the beginning of one iteration
5073 through the loop and we can hence easily check right here
5074 whether something matched between the beginning and the end of
5075 the loop. */
5076 case on_failure_jump_nastyloop:
5077 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5078 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5079 mcnt, p + mcnt);
5080
5081 assert ((re_opcode_t)p[-4] == no_op);
5082 CHECK_INFINITE_LOOP (p - 4, d);
5083 PUSH_FAILURE_POINT (p - 3, d);
5084 break;
5085
505bde11 5086
4e8a9132
SM
5087 /* Simple loop detecting on_failure_jump: just check on the
5088 failure stack if the same spot was already hit earlier. */
505bde11
SM
5089 case on_failure_jump_loop:
5090 on_failure:
5091 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5092 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5093 mcnt, p + mcnt);
5094
5095 CHECK_INFINITE_LOOP (p - 3, d);
5096 PUSH_FAILURE_POINT (p - 3, d);
25fe55af 5097 break;
fa9a63c5
RM
5098
5099
5100 /* Uses of on_failure_jump:
5e69f11e 5101
25fe55af
RS
5102 Each alternative starts with an on_failure_jump that points
5103 to the beginning of the next alternative. Each alternative
5104 except the last ends with a jump that in effect jumps past
5105 the rest of the alternatives. (They really jump to the
5106 ending jump of the following alternative, because tensioning
5107 these jumps is a hassle.)
fa9a63c5 5108
25fe55af
RS
5109 Repeats start with an on_failure_jump that points past both
5110 the repetition text and either the following jump or
5111 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5112 case on_failure_jump:
27c3b45d 5113 QUIT;
25fe55af 5114 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5115 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5116 mcnt, p + mcnt);
25fe55af 5117
505bde11 5118 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5119 break;
5120
4e8a9132 5121 /* This operation is used for greedy *.
505bde11
SM
5122 Compare the beginning of the repeat with what in the
5123 pattern follows its end. If we can establish that there
5124 is nothing that they would both match, i.e., that we
5125 would have to backtrack because of (as in, e.g., `a*a')
5126 then we can use a non-backtracking loop based on
4e8a9132 5127 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5128 case on_failure_jump_smart:
27c3b45d 5129 QUIT;
25fe55af 5130 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5131 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5132 mcnt, p + mcnt);
25fe55af 5133 {
505bde11
SM
5134 unsigned char *p1 = p; /* Next operation. */
5135 unsigned char *p2 = p + mcnt; /* Destination of the jump. */
fa9a63c5 5136
505bde11
SM
5137 p -= 3; /* Reset so that we will re-execute the
5138 instruction once it's been changed. */
fa9a63c5 5139
4e8a9132
SM
5140 EXTRACT_NUMBER (mcnt, p2 - 2);
5141
5142 /* Ensure this is a indeed the trivial kind of loop
5143 we are expecting. */
5144 assert (skip_one_char (p1) == p2 - 3);
5145 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5146 DEBUG_STATEMENT (debug += 2);
505bde11 5147 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5148 {
505bde11 5149 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132
SM
5150 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
5151 *p = (unsigned char) on_failure_keep_string_jump;
5152 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5153 }
505bde11 5154 else
fa9a63c5 5155 {
505bde11
SM
5156 /* Default to a safe `on_failure_jump' loop. */
5157 DEBUG_PRINT1 (" smart default => slow loop.\n");
4e8a9132 5158 *p = (unsigned char) on_failure_jump;
fa9a63c5 5159 }
99633e97 5160 DEBUG_STATEMENT (debug -= 2);
25fe55af 5161 }
505bde11 5162 break;
25fe55af
RS
5163
5164 /* Unconditionally jump (without popping any failure points). */
5165 case jump:
fa9a63c5 5166 unconditional_jump:
27c3b45d 5167 QUIT;
fa9a63c5 5168 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5169 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5170 p += mcnt; /* Do the jump. */
505bde11 5171 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5172 break;
5173
5174
25fe55af
RS
5175 /* Have to succeed matching what follows at least n times.
5176 After that, handle like `on_failure_jump'. */
5177 case succeed_n:
5178 EXTRACT_NUMBER (mcnt, p + 2);
5179 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5180
25fe55af
RS
5181 assert (mcnt >= 0);
5182 /* Originally, this is how many times we HAVE to succeed. */
5183 if (mcnt > 0)
5184 {
5185 mcnt--;
fa9a63c5 5186 p += 2;
25fe55af 5187 STORE_NUMBER_AND_INCR (p, mcnt);
505bde11 5188 DEBUG_PRINT3 (" Setting %p to %d.\n", p, mcnt);
25fe55af 5189 }
fa9a63c5 5190 else if (mcnt == 0)
25fe55af 5191 {
505bde11 5192 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
fa9a63c5 5193 p[2] = (unsigned char) no_op;
25fe55af
RS
5194 p[3] = (unsigned char) no_op;
5195 goto on_failure;
5196 }
5197 break;
5198
5199 case jump_n:
5200 EXTRACT_NUMBER (mcnt, p + 2);
5201 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5202
5203 /* Originally, this is how many times we CAN jump. */
5204 if (mcnt)
5205 {
5206 mcnt--;
5207 STORE_NUMBER (p + 2, mcnt);
5e69f11e 5208 goto unconditional_jump;
25fe55af
RS
5209 }
5210 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5211 else
5212 p += 4;
25fe55af 5213 break;
5e69f11e 5214
fa9a63c5
RM
5215 case set_number_at:
5216 {
25fe55af 5217 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5218
25fe55af
RS
5219 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5220 p1 = p + mcnt;
5221 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11 5222 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
fa9a63c5 5223 STORE_NUMBER (p1, mcnt);
25fe55af
RS
5224 break;
5225 }
9121ca40
KH
5226
5227 case wordbound:
66f0296e
SM
5228 case notwordbound:
5229 not = (re_opcode_t) *(p - 1) == notwordbound;
5230 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5231
99633e97 5232 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5233
b18215fc 5234 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5235 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5236 not = !not;
b18215fc
RS
5237 else
5238 {
5239 /* C1 is the character before D, S1 is the syntax of C1, C2
5240 is the character at D, and S2 is the syntax of C2. */
5241 int c1, c2, s1, s2;
b18215fc 5242#ifdef emacs
66f0296e 5243 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d - 1));
5d967c7a 5244 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5245#endif
66f0296e
SM
5246 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5247 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5248 s1 = SYNTAX (c1);
5249#ifdef emacs
5d967c7a 5250 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5251#endif
66f0296e
SM
5252 PREFETCH ();
5253 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5254 c2 = STRING_CHAR (d, dend - d);
b18215fc
RS
5255 s2 = SYNTAX (c2);
5256
5257 if (/* Case 2: Only one of S1 and S2 is Sword. */
5258 ((s1 == Sword) != (s2 == Sword))
5259 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5260 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5261 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5262 not = !not;
5263 }
5264 if (not)
9121ca40 5265 break;
b18215fc 5266 else
9121ca40 5267 goto fail;
fa9a63c5
RM
5268
5269 case wordbeg:
25fe55af 5270 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5271
b18215fc
RS
5272 /* We FAIL in one of the following cases: */
5273
25fe55af 5274 /* Case 1: D is at the end of string. */
b18215fc 5275 if (AT_STRINGS_END (d))
99633e97 5276 goto fail;
b18215fc
RS
5277 else
5278 {
5279 /* C1 is the character before D, S1 is the syntax of C1, C2
5280 is the character at D, and S2 is the syntax of C2. */
5281 int c1, c2, s1, s2;
fa9a63c5 5282#ifdef emacs
99633e97 5283 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
92432794 5284 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5285#endif
99633e97
SM
5286 PREFETCH ();
5287 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
5288 c2 = STRING_CHAR (d, dend - d);
b18215fc 5289 s2 = SYNTAX (c2);
25fe55af 5290
b18215fc
RS
5291 /* Case 2: S2 is not Sword. */
5292 if (s2 != Sword)
5293 goto fail;
5294
5295 /* Case 3: D is not at the beginning of string ... */
5296 if (!AT_STRINGS_BEG (d))
5297 {
5298 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5299#ifdef emacs
5d967c7a 5300 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5301#endif
b18215fc
RS
5302 s1 = SYNTAX (c1);
5303
5304 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5305 returns 0. */
b18215fc
RS
5306 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5307 goto fail;
5308 }
5309 }
e318085a
RS
5310 break;
5311
b18215fc 5312 case wordend:
25fe55af 5313 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5314
5315 /* We FAIL in one of the following cases: */
5316
5317 /* Case 1: D is at the beginning of string. */
5318 if (AT_STRINGS_BEG (d))
e318085a 5319 goto fail;
b18215fc
RS
5320 else
5321 {
5322 /* C1 is the character before D, S1 is the syntax of C1, C2
5323 is the character at D, and S2 is the syntax of C2. */
5324 int c1, c2, s1, s2;
5d967c7a 5325#ifdef emacs
99633e97 5326 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d) - 1);
92432794 5327 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5328#endif
99633e97 5329 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5330 s1 = SYNTAX (c1);
5331
5332 /* Case 2: S1 is not Sword. */
5333 if (s1 != Sword)
5334 goto fail;
5335
5336 /* Case 3: D is not at the end of string ... */
5337 if (!AT_STRINGS_END (d))
5338 {
66f0296e 5339 PREFETCH ();
99633e97 5340 /* FIXME: This does a STRING_CHAR even for unibyte buffers. */
66f0296e 5341 c2 = STRING_CHAR (d, dend - d);
5d967c7a
RS
5342#ifdef emacs
5343 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5344#endif
b18215fc
RS
5345 s2 = SYNTAX (c2);
5346
5347 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5348 returns 0. */
b18215fc 5349 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5350 goto fail;
b18215fc
RS
5351 }
5352 }
e318085a
RS
5353 break;
5354
fa9a63c5 5355 case syntaxspec:
1fb352e0
SM
5356 case notsyntaxspec:
5357 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 5358 mcnt = *p++;
1fb352e0 5359 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 5360 PREFETCH ();
b18215fc
RS
5361#ifdef emacs
5362 {
92432794 5363 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
b18215fc
RS
5364 UPDATE_SYNTAX_TABLE (pos1);
5365 }
25fe55af 5366#endif
b18215fc
RS
5367 {
5368 int c, len;
5369
5370 if (multibyte)
5371 /* we must concern about multibyte form, ... */
5372 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5373 else
5374 /* everything should be handled as ASCII, even though it
5375 looks like multibyte form. */
5376 c = *d, len = 1;
5377
990b2375 5378 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 5379 goto fail;
b18215fc
RS
5380 d += len;
5381 }
fa9a63c5
RM
5382 break;
5383
b18215fc 5384#ifdef emacs
1fb352e0
SM
5385 case before_dot:
5386 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5387 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 5388 goto fail;
b18215fc
RS
5389 break;
5390
1fb352e0
SM
5391 case at_dot:
5392 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5393 if (PTR_BYTE_POS (d) != PT_BYTE)
5394 goto fail;
5395 break;
b18215fc 5396
1fb352e0
SM
5397 case after_dot:
5398 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5399 if (PTR_BYTE_POS (d) <= PT_BYTE)
5400 goto fail;
e318085a 5401 break;
fa9a63c5 5402
1fb352e0 5403 case categoryspec:
b18215fc 5404 case notcategoryspec:
1fb352e0 5405 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 5406 mcnt = *p++;
1fb352e0 5407 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
5408 PREFETCH ();
5409 {
5410 int c, len;
5411
5412 if (multibyte)
5413 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5414 else
5415 c = *d, len = 1;
5416
1fb352e0 5417 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
5418 goto fail;
5419 d += len;
5420 }
fa9a63c5 5421 break;
5e69f11e 5422
1fb352e0 5423#endif /* emacs */
5e69f11e 5424
b18215fc
RS
5425 default:
5426 abort ();
fa9a63c5 5427 }
b18215fc 5428 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
5429
5430
5431 /* We goto here if a matching operation fails. */
5432 fail:
27c3b45d 5433 QUIT;
fa9a63c5 5434 if (!FAIL_STACK_EMPTY ())
505bde11 5435 {
66f0296e 5436 re_char *str;
505bde11
SM
5437 unsigned char *pat;
5438 /* A restart point is known. Restore to that state. */
b18215fc 5439 DEBUG_PRINT1 ("\nFAIL:\n");
505bde11
SM
5440 POP_FAILURE_POINT (str, pat);
5441 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
5442 {
5443 case on_failure_keep_string_jump:
5444 assert (str == NULL);
5445 goto continue_failure_jump;
5446
0683b6fa
SM
5447 case on_failure_jump_nastyloop:
5448 assert ((re_opcode_t)pat[-2] == no_op);
5449 PUSH_FAILURE_POINT (pat - 2, str);
5450 /* Fallthrough */
5451
505bde11
SM
5452 case on_failure_jump_loop:
5453 case on_failure_jump:
5454 case succeed_n:
5455 d = str;
5456 continue_failure_jump:
5457 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5458 p = pat + mcnt;
5459 break;
b18215fc 5460
0683b6fa
SM
5461 case no_op:
5462 /* A special frame used for nastyloops. */
5463 goto fail;
5464
505bde11
SM
5465 default:
5466 abort();
5467 }
fa9a63c5 5468
505bde11 5469 assert (p >= bufp->buffer && p <= pend);
b18215fc
RS
5470
5471 if (d >= string1 && d <= end1)
fa9a63c5 5472 dend = end_match_1;
b18215fc 5473 }
fa9a63c5 5474 else
b18215fc 5475 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
5476 } /* for (;;) */
5477
5478 if (best_regs_set)
5479 goto restore_best_regs;
5480
5481 FREE_VARIABLES ();
5482
b18215fc 5483 return -1; /* Failure to match. */
fa9a63c5
RM
5484} /* re_match_2 */
5485\f
5486/* Subroutine definitions for re_match_2. */
5487
fa9a63c5
RM
5488/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5489 bytes; nonzero otherwise. */
5e69f11e 5490
fa9a63c5
RM
5491static int
5492bcmp_translate (s1, s2, len, translate)
5493 unsigned char *s1, *s2;
5494 register int len;
6676cb1c 5495 RE_TRANSLATE_TYPE translate;
fa9a63c5
RM
5496{
5497 register unsigned char *p1 = s1, *p2 = s2;
e934739e
RS
5498 unsigned char *p1_end = s1 + len;
5499 unsigned char *p2_end = s2 + len;
5500
5501 while (p1 != p1_end && p2 != p2_end)
fa9a63c5 5502 {
e934739e
RS
5503 int p1_charlen, p2_charlen;
5504 int p1_ch, p2_ch;
5505
99633e97 5506 /* FIXME: This assumes `multibyte = true'. */
e934739e
RS
5507 p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
5508 p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
5509
5510 if (RE_TRANSLATE (translate, p1_ch)
5511 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 5512 return 1;
e934739e
RS
5513
5514 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 5515 }
e934739e
RS
5516
5517 if (p1 != p1_end || p2 != p2_end)
5518 return 1;
5519
fa9a63c5
RM
5520 return 0;
5521}
5522\f
5523/* Entry points for GNU code. */
5524
5525/* re_compile_pattern is the GNU regular expression compiler: it
5526 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5527 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 5528
fa9a63c5
RM
5529 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5530 are set in BUFP on entry.
5e69f11e 5531
b18215fc 5532 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
5533
5534const char *
5535re_compile_pattern (pattern, length, bufp)
5536 const char *pattern;
5537 int length;
5538 struct re_pattern_buffer *bufp;
5539{
5540 reg_errcode_t ret;
5e69f11e 5541
fa9a63c5
RM
5542 /* GNU code is written to assume at least RE_NREGS registers will be set
5543 (and at least one extra will be -1). */
5544 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 5545
fa9a63c5
RM
5546 /* And GNU code determines whether or not to get register information
5547 by passing null for the REGS argument to re_match, etc., not by
5548 setting no_sub. */
5549 bufp->no_sub = 0;
5e69f11e 5550
b18215fc 5551 /* Match anchors at newline. */
fa9a63c5 5552 bufp->newline_anchor = 1;
5e69f11e 5553
fa9a63c5
RM
5554 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5555
5556 if (!ret)
5557 return NULL;
5558 return gettext (re_error_msgid[(int) ret]);
5e69f11e 5559}
fa9a63c5 5560\f
b18215fc
RS
5561/* Entry points compatible with 4.2 BSD regex library. We don't define
5562 them unless specifically requested. */
fa9a63c5 5563
0c085854 5564#if defined (_REGEX_RE_COMP) || defined (_LIBC)
fa9a63c5
RM
5565
5566/* BSD has one and only one pattern buffer. */
5567static struct re_pattern_buffer re_comp_buf;
5568
5569char *
48afdd44
RM
5570#ifdef _LIBC
5571/* Make these definitions weak in libc, so POSIX programs can redefine
5572 these names if they don't use our functions, and still use
5573 regcomp/regexec below without link errors. */
5574weak_function
5575#endif
fa9a63c5
RM
5576re_comp (s)
5577 const char *s;
5578{
5579 reg_errcode_t ret;
5e69f11e 5580
fa9a63c5
RM
5581 if (!s)
5582 {
5583 if (!re_comp_buf.buffer)
5584 return gettext ("No previous regular expression");
5585 return 0;
5586 }
5587
5588 if (!re_comp_buf.buffer)
5589 {
5590 re_comp_buf.buffer = (unsigned char *) malloc (200);
5591 if (re_comp_buf.buffer == NULL)
b18215fc 5592 return gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
5593 re_comp_buf.allocated = 200;
5594
5595 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5596 if (re_comp_buf.fastmap == NULL)
5597 return gettext (re_error_msgid[(int) REG_ESPACE]);
5598 }
5599
5600 /* Since `re_exec' always passes NULL for the `regs' argument, we
5601 don't need to initialize the pattern buffer fields which affect it. */
5602
b18215fc 5603 /* Match anchors at newlines. */
fa9a63c5
RM
5604 re_comp_buf.newline_anchor = 1;
5605
5606 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 5607
fa9a63c5
RM
5608 if (!ret)
5609 return NULL;
5610
5611 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5612 return (char *) gettext (re_error_msgid[(int) ret]);
5613}
5614
5615
5616int
48afdd44
RM
5617#ifdef _LIBC
5618weak_function
5619#endif
fa9a63c5
RM
5620re_exec (s)
5621 const char *s;
5622{
5623 const int len = strlen (s);
5624 return
5625 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5626}
5627#endif /* _REGEX_RE_COMP */
5628\f
5629/* POSIX.2 functions. Don't define these for Emacs. */
5630
5631#ifndef emacs
5632
5633/* regcomp takes a regular expression as a string and compiles it.
5634
b18215fc 5635 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
5636 since POSIX says we shouldn't. Thus, we set
5637
5638 `buffer' to the compiled pattern;
5639 `used' to the length of the compiled pattern;
5640 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5641 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5642 RE_SYNTAX_POSIX_BASIC;
5643 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5644 `fastmap' and `fastmap_accurate' to zero;
5645 `re_nsub' to the number of subexpressions in PATTERN.
5646
5647 PATTERN is the address of the pattern string.
5648
5649 CFLAGS is a series of bits which affect compilation.
5650
5651 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5652 use POSIX basic syntax.
5653
5654 If REG_NEWLINE is set, then . and [^...] don't match newline.
5655 Also, regexec will try a match beginning after every newline.
5656
5657 If REG_ICASE is set, then we considers upper- and lowercase
5658 versions of letters to be equivalent when matching.
5659
5660 If REG_NOSUB is set, then when PREG is passed to regexec, that
5661 routine will report only success or failure, and nothing about the
5662 registers.
5663
b18215fc 5664 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
5665 the return codes and their meanings.) */
5666
5667int
5668regcomp (preg, pattern, cflags)
5669 regex_t *preg;
5e69f11e 5670 const char *pattern;
fa9a63c5
RM
5671 int cflags;
5672{
5673 reg_errcode_t ret;
5674 unsigned syntax
5675 = (cflags & REG_EXTENDED) ?
5676 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5677
5678 /* regex_compile will allocate the space for the compiled pattern. */
5679 preg->buffer = 0;
5680 preg->allocated = 0;
5681 preg->used = 0;
5e69f11e 5682
fa9a63c5
RM
5683 /* Don't bother to use a fastmap when searching. This simplifies the
5684 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5685 characters after newlines into the fastmap. This way, we just try
5686 every character. */
5687 preg->fastmap = 0;
5e69f11e 5688
fa9a63c5
RM
5689 if (cflags & REG_ICASE)
5690 {
5691 unsigned i;
5e69f11e 5692
6676cb1c
RS
5693 preg->translate
5694 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5695 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 5696 if (preg->translate == NULL)
b18215fc 5697 return (int) REG_ESPACE;
fa9a63c5
RM
5698
5699 /* Map uppercase characters to corresponding lowercase ones. */
5700 for (i = 0; i < CHAR_SET_SIZE; i++)
b18215fc 5701 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
fa9a63c5
RM
5702 }
5703 else
5704 preg->translate = NULL;
5705
5706 /* If REG_NEWLINE is set, newlines are treated differently. */
5707 if (cflags & REG_NEWLINE)
5708 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5709 syntax &= ~RE_DOT_NEWLINE;
5710 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
b18215fc 5711 /* It also changes the matching behavior. */
fa9a63c5
RM
5712 preg->newline_anchor = 1;
5713 }
5714 else
5715 preg->newline_anchor = 0;
5716
5717 preg->no_sub = !!(cflags & REG_NOSUB);
5718
5e69f11e 5719 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5
RM
5720 can use strlen here in compiling the pattern. */
5721 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5e69f11e 5722
fa9a63c5
RM
5723 /* POSIX doesn't distinguish between an unmatched open-group and an
5724 unmatched close-group: both are REG_EPAREN. */
5725 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5e69f11e 5726
fa9a63c5
RM
5727 return (int) ret;
5728}
5729
5730
5731/* regexec searches for a given pattern, specified by PREG, in the
5732 string STRING.
5e69f11e 5733
fa9a63c5 5734 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 5735 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
5736 least NMATCH elements, and we set them to the offsets of the
5737 corresponding matched substrings.
5e69f11e 5738
fa9a63c5
RM
5739 EFLAGS specifies `execution flags' which affect matching: if
5740 REG_NOTBOL is set, then ^ does not match at the beginning of the
5741 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 5742
fa9a63c5
RM
5743 We return 0 if we find a match and REG_NOMATCH if not. */
5744
5745int
5746regexec (preg, string, nmatch, pmatch, eflags)
5747 const regex_t *preg;
5e69f11e
RM
5748 const char *string;
5749 size_t nmatch;
5750 regmatch_t pmatch[];
fa9a63c5
RM
5751 int eflags;
5752{
5753 int ret;
5754 struct re_registers regs;
5755 regex_t private_preg;
5756 int len = strlen (string);
5757 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5758
5759 private_preg = *preg;
5e69f11e 5760
fa9a63c5
RM
5761 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5762 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 5763
fa9a63c5
RM
5764 /* The user has told us exactly how many registers to return
5765 information about, via `nmatch'. We have to pass that on to the
b18215fc 5766 matching routines. */
fa9a63c5 5767 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 5768
fa9a63c5
RM
5769 if (want_reg_info)
5770 {
5771 regs.num_regs = nmatch;
5772 regs.start = TALLOC (nmatch, regoff_t);
5773 regs.end = TALLOC (nmatch, regoff_t);
5774 if (regs.start == NULL || regs.end == NULL)
b18215fc 5775 return (int) REG_NOMATCH;
fa9a63c5
RM
5776 }
5777
5778 /* Perform the searching operation. */
5779 ret = re_search (&private_preg, string, len,
b18215fc
RS
5780 /* start: */ 0, /* range: */ len,
5781 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 5782
fa9a63c5
RM
5783 /* Copy the register information to the POSIX structure. */
5784 if (want_reg_info)
5785 {
5786 if (ret >= 0)
b18215fc
RS
5787 {
5788 unsigned r;
fa9a63c5 5789
b18215fc
RS
5790 for (r = 0; r < nmatch; r++)
5791 {
5792 pmatch[r].rm_so = regs.start[r];
5793 pmatch[r].rm_eo = regs.end[r];
5794 }
5795 }
fa9a63c5 5796
b18215fc 5797 /* If we needed the temporary register info, free the space now. */
fa9a63c5
RM
5798 free (regs.start);
5799 free (regs.end);
5800 }
5801
5802 /* We want zero return to mean success, unlike `re_search'. */
5803 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5804}
5805
5806
5807/* Returns a message corresponding to an error code, ERRCODE, returned
5808 from either regcomp or regexec. We don't use PREG here. */
5809
5810size_t
5811regerror (errcode, preg, errbuf, errbuf_size)
5812 int errcode;
5813 const regex_t *preg;
5814 char *errbuf;
5815 size_t errbuf_size;
5816{
5817 const char *msg;
5818 size_t msg_size;
5819
5820 if (errcode < 0
5821 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 5822 /* Only error codes returned by the rest of the code should be passed
b18215fc 5823 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
5824 code generates an invalid error code, then the program has a bug.
5825 Dump core so we can fix it. */
5826 abort ();
5827
5828 msg = gettext (re_error_msgid[errcode]);
5829
5830 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 5831
fa9a63c5
RM
5832 if (errbuf_size != 0)
5833 {
5834 if (msg_size > errbuf_size)
b18215fc
RS
5835 {
5836 strncpy (errbuf, msg, errbuf_size - 1);
5837 errbuf[errbuf_size - 1] = 0;
5838 }
fa9a63c5 5839 else
b18215fc 5840 strcpy (errbuf, msg);
fa9a63c5
RM
5841 }
5842
5843 return msg_size;
5844}
5845
5846
5847/* Free dynamically allocated space used by PREG. */
5848
5849void
5850regfree (preg)
5851 regex_t *preg;
5852{
5853 if (preg->buffer != NULL)
5854 free (preg->buffer);
5855 preg->buffer = NULL;
5e69f11e 5856
fa9a63c5
RM
5857 preg->allocated = 0;
5858 preg->used = 0;
5859
5860 if (preg->fastmap != NULL)
5861 free (preg->fastmap);
5862 preg->fastmap = NULL;
5863 preg->fastmap_accurate = 0;
5864
5865 if (preg->translate != NULL)
5866 free (preg->translate);
5867 preg->translate = NULL;
5868}
5869
5870#endif /* not emacs */