Fixes: debbugs:17865
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
ba318903 5 Copyright (C) 1993-2014 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
e468b87f 9 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
fee0bd5f 18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
fa9a63c5 19
6df42991 20/* TODO:
505bde11 21 - structure the opcode space into opcode+flag.
dc1e502d 22 - merge with glibc's regex.[ch].
01618498 23 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
24 need to modify the compiled regexp so that re_match can be reentrant.
25 - get rid of on_failure_jump_smart by doing the optimization in re_comp
26 rather than at run-time, so that re_match can be reentrant.
01618498 27*/
505bde11 28
b7432bb2 29/* AIX requires this to be the first thing in the file. */
0b32bf0e 30#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
31 #pragma alloca
32#endif
33
b8df54ff
PE
34/* Ignore some GCC warnings for now. This section should go away
35 once the Emacs and Gnulib regex code is merged. */
31ff141c 36#if 4 < __GNUC__ + (5 <= __GNUC_MINOR__) || defined __clang__
b8df54ff
PE
37# pragma GCC diagnostic ignored "-Wstrict-overflow"
38# ifndef emacs
b8df54ff
PE
39# pragma GCC diagnostic ignored "-Wunused-function"
40# pragma GCC diagnostic ignored "-Wunused-macros"
41# pragma GCC diagnostic ignored "-Wunused-result"
42# pragma GCC diagnostic ignored "-Wunused-variable"
43# endif
44#endif
45
a5d376b0 46#if 4 < __GNUC__ + (6 <= __GNUC_MINOR__) && ! defined __clang__
31ff141c
PE
47# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
48#endif
49
cf38a720 50#include <config.h>
fa9a63c5 51
0e926e56
PE
52#include <stddef.h>
53
54#ifdef emacs
4bb91c68
SM
55/* We need this for `regex.h', and perhaps for the Emacs include files. */
56# include <sys/types.h>
57#endif
fa9a63c5 58
14473664
SM
59/* Whether to use ISO C Amendment 1 wide char functions.
60 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
61#if defined _LIBC
62#define WIDE_CHAR_SUPPORT 1
63#else
14473664 64#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
65 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
66#endif
14473664 67
fa463103 68/* For platforms which support the ISO C amendment 1 functionality we
14473664 69 support user defined character classes. */
a0ad02f7 70#if WIDE_CHAR_SUPPORT
14473664
SM
71/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
72# include <wchar.h>
73# include <wctype.h>
74#endif
75
c0f9ea08
SM
76#ifdef _LIBC
77/* We have to keep the namespace clean. */
78# define regfree(preg) __regfree (preg)
79# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
80# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672 81# define regerror(err_code, preg, errbuf, errbuf_size) \
5e617bc2 82 __regerror (err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
83# define re_set_registers(bu, re, nu, st, en) \
84 __re_set_registers (bu, re, nu, st, en)
85# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
86 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
87# define re_match(bufp, string, size, pos, regs) \
88 __re_match (bufp, string, size, pos, regs)
89# define re_search(bufp, string, size, startpos, range, regs) \
90 __re_search (bufp, string, size, startpos, range, regs)
91# define re_compile_pattern(pattern, length, bufp) \
92 __re_compile_pattern (pattern, length, bufp)
93# define re_set_syntax(syntax) __re_set_syntax (syntax)
94# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
95 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
96# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
97
14473664
SM
98/* Make sure we call libc's function even if the user overrides them. */
99# define btowc __btowc
100# define iswctype __iswctype
101# define wctype __wctype
102
c0f9ea08
SM
103# define WEAK_ALIAS(a,b) weak_alias (a, b)
104
105/* We are also using some library internals. */
106# include <locale/localeinfo.h>
107# include <locale/elem-hash.h>
108# include <langinfo.h>
109#else
110# define WEAK_ALIAS(a,b)
111#endif
112
4bb91c68 113/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 114#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
115# include <libintl.h>
116#else
117# define gettext(msgid) (msgid)
118#endif
119
5e69f11e
RM
120#ifndef gettext_noop
121/* This define is so xgettext can find the internationalizable
122 strings. */
0b32bf0e 123# define gettext_noop(String) String
5e69f11e
RM
124#endif
125
fa9a63c5
RM
126/* The `emacs' switch turns on certain matching commands
127 that make sense only in Emacs. */
128#ifdef emacs
129
0b32bf0e 130# include "lisp.h"
e5560ff7 131# include "character.h"
0b32bf0e 132# include "buffer.h"
b18215fc 133
0b32bf0e 134# include "syntax.h"
0b32bf0e 135# include "category.h"
fa9a63c5 136
c6cfd910
PE
137/* Make syntax table lookup grant data in gl_state. */
138# define SYNTAX(c) syntax_property (c, 1)
139
7689ef0b
EZ
140# ifdef malloc
141# undef malloc
142# endif
0b32bf0e 143# define malloc xmalloc
7689ef0b
EZ
144# ifdef realloc
145# undef realloc
146# endif
0b32bf0e 147# define realloc xrealloc
7689ef0b
EZ
148# ifdef free
149# undef free
150# endif
0b32bf0e 151# define free xfree
9abbd165 152
7814e705 153/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
154# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
155# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
156
157# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 158# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
159# define RE_STRING_CHAR(p, multibyte) \
160 (multibyte ? (STRING_CHAR (p)) : (*(p)))
161# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
162 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 163
4c0354d7 164# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 165
2afc21f5 166# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 167
6fdd04b0
KH
168/* Set C to a (possibly converted to multibyte) character before P. P
169 points into a string which is the virtual concatenation of STR1
170 (which ends at END1) and STR2 (which ends at END2). */
bf216479
KH
171# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
172 do { \
02cb78b5 173 if (target_multibyte) \
bf216479
KH
174 { \
175 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
176 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
177 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 178 c = STRING_CHAR (dtemp); \
bf216479
KH
179 } \
180 else \
181 { \
182 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 183 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 184 } \
2d1675e4
SM
185 } while (0)
186
6fdd04b0
KH
187/* Set C to a (possibly converted to multibyte) character at P, and set
188 LEN to the byte length of that character. */
189# define GET_CHAR_AFTER(c, p, len) \
190 do { \
02cb78b5 191 if (target_multibyte) \
62a6e103 192 (c) = STRING_CHAR_AND_LENGTH (p, len); \
6fdd04b0
KH
193 else \
194 { \
cf9c99bc 195 (c) = *p; \
6fdd04b0 196 len = 1; \
cf9c99bc 197 (c) = RE_CHAR_TO_MULTIBYTE (c); \
6fdd04b0 198 } \
8f924df7 199 } while (0)
4e8a9132 200
fa9a63c5
RM
201#else /* not emacs */
202
203/* If we are not linking with Emacs proper,
204 we can't use the relocating allocator
205 even if config.h says that we can. */
0b32bf0e 206# undef REL_ALLOC
fa9a63c5 207
4004364e 208# include <unistd.h>
fa9a63c5 209
a77f947b
CY
210/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
211
/* Allocate SIZE bytes with malloc and return the resulting block.

   If malloc fails for a nonzero SIZE, print "virtual memory exhausted"
   on standard error and terminate the process with exit status 1, so
   this function never returns null for a nonzero request.  When SIZE
   is zero, whatever malloc returned (possibly null) is passed back
   unchanged.  */
static void *
xmalloc (size_t size)
{
  /* Keep the text and its length together so they cannot drift apart.  */
  static const char oom_message[] = "virtual memory exhausted\n";
  void *val = malloc (size);

  if (!val && size)
    {
      /* The result of write is not checked: we are exiting regardless.  */
      write (STDERR_FILENO, oom_message, sizeof oom_message - 1);
      exit (1);
    }
  return val;
}
223
/* Resize BLOCK to SIZE bytes and return the (possibly moved) block.

   A null BLOCK is handled by calling malloc directly, because some
   realloc implementations do not accept a null pointer.  If the
   allocation fails for a nonzero SIZE, print "virtual memory exhausted"
   on standard error and terminate the process with exit status 1.
   When SIZE is zero, the allocator's result (possibly null) is returned
   unchanged.  */
static void *
xrealloc (void *block, size_t size)
{
  /* Keep the text and its length together so they cannot drift apart.  */
  static const char oom_message[] = "virtual memory exhausted\n";
  void *val = block ? realloc (block, size) : malloc (size);

  if (!val && size)
    {
      /* The result of write is not checked: we are exiting regardless.  */
      write (STDERR_FILENO, oom_message, sizeof oom_message - 1);
      exit (1);
    }
  return val;
}
241
a073faa6
CY
242# ifdef malloc
243# undef malloc
244# endif
245# define malloc xmalloc
246# ifdef realloc
247# undef realloc
248# endif
249# define realloc xrealloc
250
f5d9e83a 251# include <stdbool.h>
9cfdb3ec 252# include <string.h>
fa9a63c5
RM
253
254/* Define the syntax stuff for \<, \>, etc. */
255
990b2375 256/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 257enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 258
e934739e 259/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
260# define MAX_MULTIBYTE_LENGTH 1
261# define RE_MULTIBYTE_P(x) 0
bf216479 262# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e 263# define WORD_BOUNDARY_P(c1, c2) (0)
aa3830c4 264# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 265# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
266# define STRING_CHAR(p) (*(p))
267# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 268# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
269# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
270# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
271# define RE_CHAR_TO_MULTIBYTE(c) (c)
272# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 273# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 274 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
275# define GET_CHAR_AFTER(c, p, len) \
276 (c = *p, len = 1)
9117d724 277# define CHAR_BYTE8_P(c) (0)
bf216479 278# define CHAR_LEADING_CODE(c) (c)
8f924df7 279
fa9a63c5 280#endif /* not emacs */
4e8a9132
SM
281
282#ifndef RE_TRANSLATE
0b32bf0e
SM
283# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
284# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 285#endif
fa9a63c5
RM
286\f
287/* Get the interface, including the syntax bits. */
288#include "regex.h"
289
f71b19b6
DL
290/* isalpha etc. are used for the character classes. */
291#include <ctype.h>
fa9a63c5 292
f71b19b6 293#ifdef emacs
fa9a63c5 294
f71b19b6 295/* 1 if C is an ASCII character. */
0b32bf0e 296# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 297
f71b19b6 298/* 1 if C is a unibyte character. */
0b32bf0e 299# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 300
f71b19b6 301/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 302
f71b19b6 303/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
304# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
305# define ISCNTRL(c) ((c) < ' ')
306# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
307 || ((c) >= 'a' && (c) <= 'f') \
308 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
309
310/* This is only used for single-byte characters. */
0b32bf0e 311# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
312
313/* The rest must handle multibyte characters. */
314
0b32bf0e 315# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 316 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
317 : 1)
318
14473664 319# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 320 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
321 : 1)
322
0b32bf0e 323# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
324 ? (((c) >= 'a' && (c) <= 'z') \
325 || ((c) >= 'A' && (c) <= 'Z') \
326 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
327 : SYNTAX (c) == Sword)
328
0b32bf0e 329# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
330 ? (((c) >= 'a' && (c) <= 'z') \
331 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
332 : SYNTAX (c) == Sword)
333
5da9919f 334# define ISLOWER(c) lowercasep (c)
96cc36cc 335
0b32bf0e 336# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
337 ? ((c) > ' ' && (c) < 0177 \
338 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
339 || ((c) >= 'A' && (c) <= 'Z') \
340 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
341 : SYNTAX (c) != Sword)
342
0b32bf0e 343# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 344
5da9919f 345# define ISUPPER(c) uppercasep (c)
96cc36cc 346
0b32bf0e 347# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
348
349#else /* not emacs */
350
f71b19b6 351/* 1 if C is an ASCII character. */
0b32bf0e 352# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
353
354/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
355# define ISUNIBYTE(c) 1
356
357# ifdef isblank
0e926e56 358# define ISBLANK(c) isblank (c)
0b32bf0e
SM
359# else
360# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
361# endif
362# ifdef isgraph
0e926e56 363# define ISGRAPH(c) isgraph (c)
0b32bf0e 364# else
0e926e56 365# define ISGRAPH(c) (isprint (c) && !isspace (c))
0b32bf0e
SM
366# endif
367
0e926e56 368/* Solaris defines ISPRINT so we must undefine it first. */
4bb91c68 369# undef ISPRINT
0e926e56
PE
370# define ISPRINT(c) isprint (c)
371# define ISDIGIT(c) isdigit (c)
372# define ISALNUM(c) isalnum (c)
373# define ISALPHA(c) isalpha (c)
374# define ISCNTRL(c) iscntrl (c)
375# define ISLOWER(c) islower (c)
376# define ISPUNCT(c) ispunct (c)
377# define ISSPACE(c) isspace (c)
378# define ISUPPER(c) isupper (c)
379# define ISXDIGIT(c) isxdigit (c)
0b32bf0e 380
5e617bc2 381# define ISWORD(c) ISALPHA (c)
0b32bf0e 382
4bb91c68 383# ifdef _tolower
5e617bc2 384# define TOLOWER(c) _tolower (c)
4bb91c68 385# else
5e617bc2 386# define TOLOWER(c) tolower (c)
4bb91c68
SM
387# endif
388
389/* How many characters in the character set. */
390# define CHAR_SET_SIZE 256
391
0b32bf0e 392# ifdef SYNTAX_TABLE
f71b19b6 393
0b32bf0e 394extern char *re_syntax_table;
f71b19b6 395
0b32bf0e
SM
396# else /* not SYNTAX_TABLE */
397
0b32bf0e
SM
398static char re_syntax_table[CHAR_SET_SIZE];
399
400static void
d2762c86 401init_syntax_once (void)
0b32bf0e
SM
402{
403 register int c;
404 static int done = 0;
405
406 if (done)
407 return;
408
72af86bd 409 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 410
4bb91c68
SM
411 for (c = 0; c < CHAR_SET_SIZE; ++c)
412 if (ISALNUM (c))
413 re_syntax_table[c] = Sword;
fa9a63c5 414
669fa600 415 re_syntax_table['_'] = Ssymbol;
fa9a63c5 416
0b32bf0e
SM
417 done = 1;
418}
419
420# endif /* not SYNTAX_TABLE */
96cc36cc 421
4bb91c68
SM
422# define SYNTAX(c) re_syntax_table[(c)]
423
96cc36cc
RS
424#endif /* not emacs */
425\f
261cb4bb 426#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
427\f
428/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
429 use `alloca' instead of `malloc'. This is because using malloc in
430 re_search* or re_match* could cause memory leaks when C-g is used in
431 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
432 the other hand, malloc is more portable, and easier to debug.
433
fa9a63c5
RM
434 Because we sometimes use alloca, some routines have to be macros,
435 not functions -- `alloca'-allocated space disappears at the end of the
436 function it is called in. */
437
438#ifdef REGEX_MALLOC
439
0b32bf0e
SM
440# define REGEX_ALLOCATE malloc
441# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
442# define REGEX_FREE free
fa9a63c5
RM
443
444#else /* not REGEX_MALLOC */
445
446/* Emacs already defines alloca, sometimes. */
0b32bf0e 447# ifndef alloca
fa9a63c5
RM
448
449/* Make alloca work the best possible way. */
0b32bf0e
SM
450# ifdef __GNUC__
451# define alloca __builtin_alloca
452# else /* not __GNUC__ */
7f585e7a 453# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
454# include <alloca.h>
455# endif /* HAVE_ALLOCA_H */
456# endif /* not __GNUC__ */
fa9a63c5 457
0b32bf0e 458# endif /* not alloca */
fa9a63c5 459
0b32bf0e 460# define REGEX_ALLOCATE alloca
fa9a63c5
RM
461
462/* Assumes a `char *destination' variable. */
0b32bf0e 463# define REGEX_REALLOCATE(source, osize, nsize) \
7d652d97 464 (destination = alloca (nsize), \
4bb91c68 465 memcpy (destination, source, osize))
fa9a63c5
RM
466
467/* No need to do anything to free, after alloca. */
0b32bf0e 468# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
469
470#endif /* not REGEX_MALLOC */
471
472/* Define how to allocate the failure stack. */
473
0b32bf0e 474#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 475
0b32bf0e 476# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 477 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 478# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 479 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 480# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
481 r_alloc_free (&failure_stack_ptr)
482
4297555e 483#else /* not using relocating allocator */
fa9a63c5 484
0b32bf0e 485# ifdef REGEX_MALLOC
fa9a63c5 486
0b32bf0e
SM
487# define REGEX_ALLOCATE_STACK malloc
488# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
489# define REGEX_FREE_STACK free
fa9a63c5 490
0b32bf0e 491# else /* not REGEX_MALLOC */
fa9a63c5 492
0b32bf0e 493# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 494
0b32bf0e 495# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 496 REGEX_REALLOCATE (source, osize, nsize)
7814e705 497/* No need to explicitly free anything. */
0b32bf0e 498# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 499
0b32bf0e 500# endif /* not REGEX_MALLOC */
4297555e 501#endif /* not using relocating allocator */
fa9a63c5
RM
502
503
504/* True if `size1' is nonzero and PTR is pointing anywhere inside
505 `string1' or just past its end. This works if PTR is NULL, which is
506 a good thing. */
25fe55af 507#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
508 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
509
510/* (Re)Allocate N items of type T using malloc, or fail. */
511#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
512#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
fa9a63c5
RM
513#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
514
4bb91c68 515#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
516
517#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
518
519#undef MAX
520#undef MIN
521#define MAX(a, b) ((a) > (b) ? (a) : (b))
522#define MIN(a, b) ((a) < (b) ? (a) : (b))
523
66f0296e 524/* Type of source-pattern and string chars. */
a6fc3b5c
EZ
525#ifdef _MSC_VER
526typedef unsigned char re_char;
29abe551 527typedef const re_char const_re_char;
a6fc3b5c 528#else
66f0296e 529typedef const unsigned char re_char;
29abe551 530typedef re_char const_re_char;
a6fc3b5c 531#endif
66f0296e 532
fa9a63c5 533typedef char boolean;
fa9a63c5 534
261cb4bb
PE
535static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
536 re_char *string1, size_t size1,
537 re_char *string2, size_t size2,
538 ssize_t pos,
539 struct re_registers *regs,
540 ssize_t stop);
fa9a63c5
RM
541\f
542/* These are the command codes that appear in compiled regular
4bb91c68 543 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
544 command code can specify any interpretation whatsoever for its
545 arguments. Zero bytes may appear in the compiled regular expression. */
546
547typedef enum
548{
549 no_op = 0,
550
4bb91c68 551 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
552 succeed,
553
25fe55af 554 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
555 exactn,
556
25fe55af 557 /* Matches any (more or less) character. */
fa9a63c5
RM
558 anychar,
559
25fe55af
RS
560 /* Matches any one char belonging to specified set. First
561 following byte is number of bitmap bytes. Then come bytes
562 for a bitmap saying which chars are in. Bits in each byte
563 are ordered low-bit-first. A character is in the set if its
564 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
565 automatically not in the set.
566
567 If the length byte has the 0x80 bit set, then that stuff
568 is followed by a range table:
569 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 570 See RANGE_TABLE_WORK_BITS below.
01618498 571 2 bytes, the number of pairs that follow (up to 32767)
96cc36cc 572 pairs, each 2 multibyte characters,
0b32bf0e 573 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
574 charset,
575
25fe55af 576 /* Same parameters as charset, but match any character that is
4bb91c68 577 not one of those specified. */
fa9a63c5
RM
578 charset_not,
579
25fe55af
RS
580 /* Start remembering the text that is matched, for storing in a
581 register. Followed by one byte with the register number, in
582 the range 0 to one less than the pattern buffer's re_nsub
505bde11 583 field. */
fa9a63c5
RM
584 start_memory,
585
25fe55af
RS
586 /* Stop remembering the text that is matched and store it in a
587 memory register. Followed by one byte with the register
588 number, in the range 0 to one less than `re_nsub' in the
505bde11 589 pattern buffer. */
fa9a63c5
RM
590 stop_memory,
591
25fe55af 592 /* Match a duplicate of something remembered. Followed by one
4bb91c68 593 byte containing the register number. */
fa9a63c5
RM
594 duplicate,
595
25fe55af 596 /* Fail unless at beginning of line. */
fa9a63c5
RM
597 begline,
598
4bb91c68 599 /* Fail unless at end of line. */
fa9a63c5
RM
600 endline,
601
25fe55af
RS
602 /* Succeeds if at beginning of buffer (if emacs) or at beginning
603 of string to be matched (if not). */
fa9a63c5
RM
604 begbuf,
605
25fe55af 606 /* Analogously, for end of buffer/string. */
fa9a63c5 607 endbuf,
5e69f11e 608
25fe55af 609 /* Followed by two byte relative address to which to jump. */
5e69f11e 610 jump,
fa9a63c5 611
25fe55af 612 /* Followed by two-byte relative address of place to resume at
7814e705 613 in case of failure. */
fa9a63c5 614 on_failure_jump,
5e69f11e 615
25fe55af
RS
616 /* Like on_failure_jump, but pushes a placeholder instead of the
617 current string position when executed. */
fa9a63c5 618 on_failure_keep_string_jump,
5e69f11e 619
505bde11
SM
620 /* Just like `on_failure_jump', except that it checks that we
621 don't get stuck in an infinite loop (matching an empty string
622 indefinitely). */
623 on_failure_jump_loop,
624
0683b6fa
SM
625 /* Just like `on_failure_jump_loop', except that it checks for
626 a different kind of loop (the kind that shows up with non-greedy
627 operators). This operation has to be immediately preceded
628 by a `no_op'. */
629 on_failure_jump_nastyloop,
630
0b32bf0e 631 /* A smart `on_failure_jump' used for greedy * and + operators.
c7015153 632 It analyzes the loop before which it is put and if the
505bde11 633 loop does not require backtracking, it changes itself to
4e8a9132
SM
634 `on_failure_keep_string_jump' and short-circuits the loop,
635 else it just defaults to changing itself into `on_failure_jump'.
636 It assumes that it is pointing to just past a `jump'. */
505bde11 637 on_failure_jump_smart,
fa9a63c5 638
25fe55af 639 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
640 After matching N times, jump to the address upon failure.
641 Does not work if N starts at 0: use on_failure_jump_loop
642 instead. */
fa9a63c5
RM
643 succeed_n,
644
25fe55af
RS
645 /* Followed by two-byte relative address, and two-byte number n.
646 Jump to the address N times, then fail. */
fa9a63c5
RM
647 jump_n,
648
25fe55af 649 /* Set the following two-byte relative address to the
7814e705 650 subsequent two-byte number. The address *includes* the two
25fe55af 651 bytes of number. */
fa9a63c5
RM
652 set_number_at,
653
fa9a63c5
RM
654 wordbeg, /* Succeeds if at word beginning. */
655 wordend, /* Succeeds if at word end. */
656
657 wordbound, /* Succeeds if at a word boundary. */
7814e705 658 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 659
669fa600
SM
660 symbeg, /* Succeeds if at symbol beginning. */
661 symend, /* Succeeds if at symbol end. */
662
fa9a63c5 663 /* Matches any character whose syntax is specified. Followed by
25fe55af 664 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
665 syntaxspec,
666
667 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
668 notsyntaxspec
669
670#ifdef emacs
671 ,before_dot, /* Succeeds if before point. */
672 at_dot, /* Succeeds if at point. */
673 after_dot, /* Succeeds if after point. */
b18215fc
RS
674
675 /* Matches any character whose category-set contains the specified
7814e705
JB
676 category. The operator is followed by a byte which contains a
677 category code (mnemonic ASCII character). */
b18215fc
RS
678 categoryspec,
679
680 /* Matches any character whose category-set does not contain the
681 specified category. The operator is followed by a byte which
682 contains the category code (mnemonic ASCII character). */
683 notcategoryspec
fa9a63c5
RM
684#endif /* emacs */
685} re_opcode_t;
686\f
687/* Common operations on the compiled pattern. */
688
689/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
690
691#define STORE_NUMBER(destination, number) \
692 do { \
693 (destination)[0] = (number) & 0377; \
694 (destination)[1] = (number) >> 8; \
695 } while (0)
696
697/* Same as STORE_NUMBER, except increment DESTINATION to
698 the byte after where the number is stored. Therefore, DESTINATION
699 must be an lvalue. */
700
701#define STORE_NUMBER_AND_INCR(destination, number) \
702 do { \
703 STORE_NUMBER (destination, number); \
704 (destination) += 2; \
705 } while (0)
706
707/* Put into DESTINATION a number stored in two contiguous bytes starting
708 at SOURCE. */
709
710#define EXTRACT_NUMBER(destination, source) \
dc4a2ee0 711 ((destination) = extract_number (source))
fa9a63c5 712
dc4a2ee0
PE
713static int
714extract_number (re_char *source)
fa9a63c5 715{
dc4a2ee0 716 return (SIGN_EXTEND_CHAR (source[1]) << 8) + source[0];
fa9a63c5
RM
717}
718
fa9a63c5
RM
719/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
720 SOURCE must be an lvalue. */
721
722#define EXTRACT_NUMBER_AND_INCR(destination, source) \
dc4a2ee0 723 ((destination) = extract_number_and_incr (&source))
fa9a63c5 724
dc4a2ee0
PE
725static int
726extract_number_and_incr (re_char **source)
5e69f11e 727{
dc4a2ee0 728 int num = extract_number (*source);
fa9a63c5 729 *source += 2;
dc4a2ee0 730 return num;
fa9a63c5 731}
fa9a63c5 732\f
b18215fc
RS
733/* Store a multibyte character in three contiguous bytes starting at
734 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 735 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
736
737#define STORE_CHARACTER_AND_INCR(destination, character) \
738 do { \
739 (destination)[0] = (character) & 0377; \
740 (destination)[1] = ((character) >> 8) & 0377; \
741 (destination)[2] = (character) >> 16; \
742 (destination) += 3; \
743 } while (0)
744
745/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 746 starting at SOURCE. */
b18215fc
RS
747
748#define EXTRACT_CHARACTER(destination, source) \
749 do { \
750 (destination) = ((source)[0] \
751 | ((source)[1] << 8) \
752 | ((source)[2] << 16)); \
753 } while (0)
754
755
756/* Macros for charset. */
757
758/* Size of bitmap of charset P in bytes. P is a start of charset,
759 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
760#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
761
762/* Nonzero if charset P has range table. */
25fe55af 763#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
764
765/* Return the address of the range table of charset P. This is not the
766 start of the table itself, but the place just before it where the number of ranges is
96cc36cc
RS
767 stored. `4 +' means to skip re_opcode_t and size of bitmap,
768 and the 2 bytes of flags at the start of the range table. */
769#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
770
78779650 771#ifdef emacs
96cc36cc
RS
772/* Extract the bit flags that start a range table. */
773#define CHARSET_RANGE_TABLE_BITS(p) \
774 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
775 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
78779650 776#endif
b18215fc 777
b18215fc 778/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
779 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
780 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
781 and end. */
782#define CHARSET_RANGE_TABLE_END(range_table, count) \
783 ((range_table) + (count) * 2 * 3)
784
7814e705 785/* Test if C is in RANGE_TABLE. The flag NOT is negated if C is in it.
b18215fc
RS
786 COUNT is number of ranges in RANGE_TABLE. */
787#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
788 do \
789 { \
01618498 790 re_wchar_t range_start, range_end; \
19ed5445 791 re_char *rtp; \
01618498 792 re_char *range_table_end \
b18215fc
RS
793 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
794 \
19ed5445 795 for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
b18215fc 796 { \
19ed5445
PE
797 EXTRACT_CHARACTER (range_start, rtp); \
798 EXTRACT_CHARACTER (range_end, rtp + 3); \
b18215fc
RS
799 \
800 if (range_start <= (c) && (c) <= range_end) \
801 { \
802 (not) = !(not); \
803 break; \
804 } \
805 } \
806 } \
807 while (0)
808
809/* Test if C is in range table of CHARSET. The flag NOT is negated if
810 C is listed in it. */
811#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
812 do \
813 { \
814 /* Number of ranges in range table. */ \
815 int count; \
01618498
SM
816 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
817 \
b18215fc
RS
818 EXTRACT_NUMBER_AND_INCR (count, range_table); \
819 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
820 } \
821 while (0)
822\f
fa9a63c5
RM
823/* If DEBUG is defined, Regex prints many voluminous messages about what
824 it is doing (if the variable `debug' is nonzero). If linked with the
825 main program in `iregex.c', you can enter patterns and strings
826 interactively. And if linked with the main program in `main.c' and
4bb91c68 827 the other test files, you can run the already-written tests. */
fa9a63c5
RM
828
829#ifdef DEBUG
830
831/* We use standard I/O for debugging. */
0b32bf0e 832# include <stdio.h>
fa9a63c5
RM
833
834/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 835# include <assert.h>
fa9a63c5 836
99633e97 837static int debug = -100000;
fa9a63c5 838
0b32bf0e 839# define DEBUG_STATEMENT(e) e
dc4a2ee0
PE
840# define DEBUG_PRINT(...) if (debug > 0) printf (__VA_ARGS__)
841# define DEBUG_COMPILES_ARGUMENTS
0b32bf0e 842# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 843 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 844# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 845 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
846
847
848/* Print the fastmap in human-readable form. */
849
dc4a2ee0
PE
850static void
851print_fastmap (char *fastmap)
fa9a63c5
RM
852{
853 unsigned was_a_range = 0;
5e69f11e
RM
854 unsigned i = 0;
855
fa9a63c5
RM
856 while (i < (1 << BYTEWIDTH))
857 {
858 if (fastmap[i++])
859 {
860 was_a_range = 0;
25fe55af
RS
861 putchar (i - 1);
862 while (i < (1 << BYTEWIDTH) && fastmap[i])
863 {
864 was_a_range = 1;
865 i++;
866 }
fa9a63c5 867 if (was_a_range)
25fe55af
RS
868 {
869 printf ("-");
870 putchar (i - 1);
871 }
872 }
fa9a63c5 873 }
5e69f11e 874 putchar ('\n');
fa9a63c5
RM
875}
876
877
878/* Print a compiled pattern string in human-readable form, starting at
879 the START pointer into it and ending just before the pointer END. */
880
dc4a2ee0
PE
881static void
882print_partial_compiled_pattern (re_char *start, re_char *end)
fa9a63c5
RM
883{
884 int mcnt, mcnt2;
01618498
SM
885 re_char *p = start;
886 re_char *pend = end;
fa9a63c5
RM
887
888 if (start == NULL)
889 {
a1a052df 890 fprintf (stderr, "(null)\n");
fa9a63c5
RM
891 return;
892 }
5e69f11e 893
fa9a63c5
RM
894 /* Loop over pattern commands. */
895 while (p < pend)
896 {
dc4a2ee0 897 fprintf (stderr, "%td:\t", p - start);
fa9a63c5
RM
898
899 switch ((re_opcode_t) *p++)
900 {
25fe55af 901 case no_op:
a1a052df 902 fprintf (stderr, "/no_op");
25fe55af 903 break;
fa9a63c5 904
99633e97 905 case succeed:
a1a052df 906 fprintf (stderr, "/succeed");
99633e97
SM
907 break;
908
fa9a63c5
RM
909 case exactn:
910 mcnt = *p++;
a1a052df 911 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 912 do
fa9a63c5 913 {
a1a052df 914 fprintf (stderr, "/%c", *p++);
25fe55af
RS
915 }
916 while (--mcnt);
917 break;
fa9a63c5
RM
918
919 case start_memory:
a1a052df 920 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 921 break;
fa9a63c5
RM
922
923 case stop_memory:
a1a052df 924 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 925 break;
fa9a63c5
RM
926
927 case duplicate:
a1a052df 928 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
929 break;
930
931 case anychar:
a1a052df 932 fprintf (stderr, "/anychar");
fa9a63c5
RM
933 break;
934
935 case charset:
25fe55af
RS
936 case charset_not:
937 {
938 register int c, last = -100;
fa9a63c5 939 register int in_range = 0;
99633e97
SM
940 int length = CHARSET_BITMAP_SIZE (p - 1);
941 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 942
a1a052df 943 fprintf (stderr, "/charset [%s",
839966f3 944 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 945
839966f3
KH
946 if (p + *p >= pend)
947 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 948
25fe55af 949 for (c = 0; c < 256; c++)
96cc36cc 950 if (c / 8 < length
fa9a63c5
RM
951 && (p[1 + (c/8)] & (1 << (c % 8))))
952 {
953 /* Are we starting a range? */
954 if (last + 1 == c && ! in_range)
955 {
a1a052df 956 fprintf (stderr, "-");
fa9a63c5
RM
957 in_range = 1;
958 }
959 /* Have we broken a range? */
960 else if (last + 1 != c && in_range)
96cc36cc 961 {
a1a052df 962 fprintf (stderr, "%c", last);
fa9a63c5
RM
963 in_range = 0;
964 }
5e69f11e 965
fa9a63c5 966 if (! in_range)
a1a052df 967 fprintf (stderr, "%c", c);
fa9a63c5
RM
968
969 last = c;
25fe55af 970 }
fa9a63c5
RM
971
972 if (in_range)
a1a052df 973 fprintf (stderr, "%c", last);
fa9a63c5 974
a1a052df 975 fprintf (stderr, "]");
fa9a63c5 976
99633e97 977 p += 1 + length;
96cc36cc 978
96cc36cc 979 if (has_range_table)
99633e97
SM
980 {
981 int count;
a1a052df 982 fprintf (stderr, "has-range-table");
99633e97
SM
983
984 /* ??? Should print the range table; for now, just skip it. */
985 p += 2; /* skip range table bits */
986 EXTRACT_NUMBER_AND_INCR (count, p);
987 p = CHARSET_RANGE_TABLE_END (p, count);
988 }
fa9a63c5
RM
989 }
990 break;
991
992 case begline:
a1a052df 993 fprintf (stderr, "/begline");
25fe55af 994 break;
fa9a63c5
RM
995
996 case endline:
a1a052df 997 fprintf (stderr, "/endline");
25fe55af 998 break;
fa9a63c5
RM
999
1000 case on_failure_jump:
dc4a2ee0
PE
1001 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1002 fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
25fe55af 1003 break;
fa9a63c5
RM
1004
1005 case on_failure_keep_string_jump:
dc4a2ee0
PE
1006 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1007 fprintf (stderr, "/on_failure_keep_string_jump to %td",
1008 p + mcnt - start);
25fe55af 1009 break;
fa9a63c5 1010
0683b6fa 1011 case on_failure_jump_nastyloop:
dc4a2ee0
PE
1012 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1013 fprintf (stderr, "/on_failure_jump_nastyloop to %td",
1014 p + mcnt - start);
0683b6fa
SM
1015 break;
1016
505bde11 1017 case on_failure_jump_loop:
dc4a2ee0
PE
1018 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1019 fprintf (stderr, "/on_failure_jump_loop to %td",
1020 p + mcnt - start);
5e69f11e
RM
1021 break;
1022
505bde11 1023 case on_failure_jump_smart:
dc4a2ee0
PE
1024 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1025 fprintf (stderr, "/on_failure_jump_smart to %td",
1026 p + mcnt - start);
5e69f11e
RM
1027 break;
1028
25fe55af 1029 case jump:
dc4a2ee0
PE
1030 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1031 fprintf (stderr, "/jump to %td", p + mcnt - start);
fa9a63c5
RM
1032 break;
1033
25fe55af 1034 case succeed_n:
dc4a2ee0
PE
1035 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1036 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1037 fprintf (stderr, "/succeed_n to %td, %d times",
1038 p - 2 + mcnt - start, mcnt2);
25fe55af 1039 break;
5e69f11e 1040
25fe55af 1041 case jump_n:
dc4a2ee0
PE
1042 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1043 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1044 fprintf (stderr, "/jump_n to %td, %d times",
1045 p - 2 + mcnt - start, mcnt2);
25fe55af 1046 break;
5e69f11e 1047
25fe55af 1048 case set_number_at:
dc4a2ee0
PE
1049 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1050 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1051 fprintf (stderr, "/set_number_at location %td to %d",
1052 p - 2 + mcnt - start, mcnt2);
25fe55af 1053 break;
5e69f11e 1054
25fe55af 1055 case wordbound:
a1a052df 1056 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1057 break;
1058
1059 case notwordbound:
a1a052df 1060 fprintf (stderr, "/notwordbound");
25fe55af 1061 break;
fa9a63c5
RM
1062
1063 case wordbeg:
a1a052df 1064 fprintf (stderr, "/wordbeg");
fa9a63c5 1065 break;
5e69f11e 1066
fa9a63c5 1067 case wordend:
a1a052df 1068 fprintf (stderr, "/wordend");
e2543b02 1069 break;
5e69f11e 1070
669fa600 1071 case symbeg:
e2543b02 1072 fprintf (stderr, "/symbeg");
669fa600
SM
1073 break;
1074
1075 case symend:
e2543b02 1076 fprintf (stderr, "/symend");
669fa600 1077 break;
5e69f11e 1078
1fb352e0 1079 case syntaxspec:
a1a052df 1080 fprintf (stderr, "/syntaxspec");
1fb352e0 1081 mcnt = *p++;
a1a052df 1082 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1083 break;
1084
1085 case notsyntaxspec:
a1a052df 1086 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1087 mcnt = *p++;
a1a052df 1088 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1089 break;
1090
0b32bf0e 1091# ifdef emacs
fa9a63c5 1092 case before_dot:
a1a052df 1093 fprintf (stderr, "/before_dot");
25fe55af 1094 break;
fa9a63c5
RM
1095
1096 case at_dot:
a1a052df 1097 fprintf (stderr, "/at_dot");
25fe55af 1098 break;
fa9a63c5
RM
1099
1100 case after_dot:
a1a052df 1101 fprintf (stderr, "/after_dot");
25fe55af 1102 break;
fa9a63c5 1103
1fb352e0 1104 case categoryspec:
a1a052df 1105 fprintf (stderr, "/categoryspec");
fa9a63c5 1106 mcnt = *p++;
a1a052df 1107 fprintf (stderr, "/%d", mcnt);
25fe55af 1108 break;
5e69f11e 1109
1fb352e0 1110 case notcategoryspec:
a1a052df 1111 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1112 mcnt = *p++;
a1a052df 1113 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1114 break;
0b32bf0e 1115# endif /* emacs */
fa9a63c5 1116
fa9a63c5 1117 case begbuf:
a1a052df 1118 fprintf (stderr, "/begbuf");
25fe55af 1119 break;
fa9a63c5
RM
1120
1121 case endbuf:
a1a052df 1122 fprintf (stderr, "/endbuf");
25fe55af 1123 break;
fa9a63c5 1124
25fe55af 1125 default:
a1a052df 1126 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1127 }
1128
a1a052df 1129 fprintf (stderr, "\n");
fa9a63c5
RM
1130 }
1131
dc4a2ee0 1132 fprintf (stderr, "%td:\tend of pattern.\n", p - start);
fa9a63c5
RM
1133}
1134
1135
dc4a2ee0
PE
1136static void
1137print_compiled_pattern (struct re_pattern_buffer *bufp)
fa9a63c5 1138{
01618498 1139 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1140
1141 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1142 printf ("%ld bytes used/%ld bytes allocated.\n",
1143 bufp->used, bufp->allocated);
fa9a63c5
RM
1144
1145 if (bufp->fastmap_accurate && bufp->fastmap)
1146 {
1147 printf ("fastmap: ");
1148 print_fastmap (bufp->fastmap);
1149 }
1150
dc4a2ee0 1151 printf ("re_nsub: %zu\t", bufp->re_nsub);
fa9a63c5
RM
1152 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1153 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1154 printf ("no_sub: %d\t", bufp->no_sub);
1155 printf ("not_bol: %d\t", bufp->not_bol);
1156 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1157 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1158 fflush (stdout);
fa9a63c5
RM
1159 /* Perhaps we should print the translate table? */
1160}
1161
1162
dc4a2ee0
PE
1163static void
1164print_double_string (re_char *where, re_char *string1, ssize_t size1,
1165 re_char *string2, ssize_t size2)
fa9a63c5 1166{
d1dfb56c 1167 ssize_t this_char;
5e69f11e 1168
fa9a63c5
RM
1169 if (where == NULL)
1170 printf ("(null)");
1171 else
1172 {
1173 if (FIRST_STRING_P (where))
25fe55af
RS
1174 {
1175 for (this_char = where - string1; this_char < size1; this_char++)
1176 putchar (string1[this_char]);
fa9a63c5 1177
25fe55af
RS
1178 where = string2;
1179 }
fa9a63c5
RM
1180
1181 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1182 putchar (string2[this_char]);
fa9a63c5
RM
1183 }
1184}
1185
1186#else /* not DEBUG */
1187
0b32bf0e
SM
1188# undef assert
1189# define assert(e)
fa9a63c5 1190
0b32bf0e 1191# define DEBUG_STATEMENT(e)
dc4a2ee0
PE
1192# if __STDC_VERSION__ < 199901L
1193# define DEBUG_COMPILES_ARGUMENTS
1194# define DEBUG_PRINT /* 'DEBUG_PRINT (x, y)' discards X and Y. */ (void)
1195# else
1196# define DEBUG_PRINT(...)
1197# endif
0b32bf0e
SM
1198# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1199# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1200
1201#endif /* not DEBUG */
1202\f
4da60324
PE
1203/* Use this to suppress gcc's `...may be used before initialized' warnings. */
1204#ifdef lint
1205# define IF_LINT(Code) Code
1206#else
1207# define IF_LINT(Code) /* empty */
1208#endif
1209\f
fa9a63c5
RM
1210/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1211 also be assigned to arbitrarily: each pattern buffer stores its own
1212 syntax, so it can be changed between regex compilations. */
1213/* This has no initializer because initialized variables in Emacs
1214 become read-only after dumping. */
1215reg_syntax_t re_syntax_options;
1216
1217
1218/* Specify the precise syntax of regexps for compilation. This provides
1219 for compatibility for various utilities which historically have
1220 different, incompatible syntaxes.
1221
1222 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1223 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1224
1225reg_syntax_t
971de7fb 1226re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1227{
1228 reg_syntax_t ret = re_syntax_options;
5e69f11e 1229
fa9a63c5
RM
1230 re_syntax_options = syntax;
1231 return ret;
1232}
c0f9ea08 1233WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1234
1235/* Regexp to use to replace spaces, or NULL meaning don't. */
f462f075 1236static const_re_char *whitespace_regexp;
f9b0fd99
RS
1237
1238void
971de7fb 1239re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1240{
f462f075 1241 whitespace_regexp = (const_re_char *) regexp;
f9b0fd99
RS
1242}
1243WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1244\f
1245/* This table gives an error message for each of the error codes listed
4bb91c68 1246 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1247 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1248 but why not be nice? */
fa9a63c5
RM
1249
1250static const char *re_error_msgid[] =
5e69f11e
RM
1251 {
1252 gettext_noop ("Success"), /* REG_NOERROR */
1253 gettext_noop ("No match"), /* REG_NOMATCH */
1254 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1255 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1256 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1257 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1258 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1259 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1260 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1261 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1262 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1263 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1264 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1265 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1266 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1267 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1268 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1269 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1270 };
1271\f
4bb91c68 1272/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1273
1274/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1275 searching and matching functions should not call alloca. On some
1276 systems, alloca is implemented in terms of malloc, and if we're
1277 using the relocating allocator routines, then malloc could cause a
1278 relocation, which might (if the strings being searched are in the
1279 ralloc heap) shift the data out from underneath the regexp
1280 routines.
1281
5e69f11e 1282 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1283 processes input from X in a signal handler; processing X input may
1284 call malloc; if input arrives while a matching routine is calling
1285 malloc, then we're scrod. But Emacs can't just block input while
1286 calling matching routines; then we don't notice interrupts when
1287 they come in. So, Emacs blocks input around all regexp calls
1288 except the matching calls, which it leaves unprotected, in the
1289 faith that they will not malloc. */
1290
1291/* Normally, this is fine. */
1292#define MATCH_MAY_ALLOCATE
1293
fa9a63c5
RM
1294/* The match routines may not allocate if (1) they would do it with malloc
1295 and (2) it's not safe for them to use malloc.
1296 Note that if REL_ALLOC is defined, matching would not use malloc for the
1297 failure stack, but we would still use it for the register vectors;
4bb91c68 1298 so REL_ALLOC should not affect this. */
b588157e 1299#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1300# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1301#endif
1302
1303\f
1304/* Failure stack declarations and macros; both re_compile_fastmap and
1305 re_match_2 use a failure stack. These have to be macros because of
1306 REGEX_ALLOCATE_STACK. */
5e69f11e 1307
fa9a63c5 1308
320a2a73 1309/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1310 when matching. If this number is exceeded, we allocate more
1311 space, so it is not a hard limit. */
1312#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1313# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1314#endif
1315
1316/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1317 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1318 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1319 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1320 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1321# if defined MATCH_MAY_ALLOCATE
1322/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1323 whose default stack limit is 2mb. In order for a larger
1324 value to work reliably, you have to try to make it accord
1325 with the process stack limit. */
c0f9ea08
SM
1326size_t re_max_failures = 40000;
1327# else
1328size_t re_max_failures = 4000;
1329# endif
fa9a63c5
RM
1330
1331union fail_stack_elt
1332{
01618498 1333 re_char *pointer;
c0f9ea08
SM
1334 /* This should be the biggest `int' that's no bigger than a pointer. */
1335 long integer;
fa9a63c5
RM
1336};
1337
1338typedef union fail_stack_elt fail_stack_elt_t;
1339
1340typedef struct
1341{
1342 fail_stack_elt_t *stack;
c0f9ea08
SM
1343 size_t size;
1344 size_t avail; /* Offset of next open position. */
1345 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1346} fail_stack_type;
1347
505bde11 1348#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1349
1350
1351/* Define macros to initialize and free the failure stack.
1352 Do `return -2' if the alloc fails. */
1353
1354#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1355# define INIT_FAIL_STACK() \
fa9a63c5 1356 do { \
38182d90 1357 fail_stack.stack = \
320a2a73
KH
1358 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1359 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1360 \
1361 if (fail_stack.stack == NULL) \
1362 return -2; \
1363 \
1364 fail_stack.size = INIT_FAILURE_ALLOC; \
1365 fail_stack.avail = 0; \
505bde11 1366 fail_stack.frame = 0; \
fa9a63c5 1367 } while (0)
fa9a63c5 1368#else
0b32bf0e 1369# define INIT_FAIL_STACK() \
fa9a63c5
RM
1370 do { \
1371 fail_stack.avail = 0; \
505bde11 1372 fail_stack.frame = 0; \
fa9a63c5
RM
1373 } while (0)
1374
b313f9d8
PE
1375# define RETALLOC_IF(addr, n, t) \
1376 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
fa9a63c5
RM
1377#endif
1378
1379
320a2a73
KH
/* Grow FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a limit
   which allows approximately `re_max_failures' items.
fa9a63c5
RM
1382
1383 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1384 allocating space for it or it was already too large.
1385
4bb91c68 1386 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1387
320a2a73
KH
/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
1393#define FAIL_STACK_GROWTH_FACTOR 4
1394
1395#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1396 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1397 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1398 ? 0 \
320a2a73 1399 : ((fail_stack).stack \
38182d90 1400 = REGEX_REALLOCATE_STACK ((fail_stack).stack, \
25fe55af 1401 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1402 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1403 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1404 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1405 \
1406 (fail_stack).stack == NULL \
1407 ? 0 \
6453db45
KH
1408 : ((fail_stack).size \
1409 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1410 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1411 * FAIL_STACK_GROWTH_FACTOR)) \
1412 / sizeof (fail_stack_elt_t)), \
25fe55af 1413 1)))
fa9a63c5
RM
1414
1415
fa9a63c5
RM
1416/* Push a pointer value onto the failure stack.
1417 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1418 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1419#define PUSH_FAILURE_POINTER(item) \
01618498 1420 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1421
1422/* This pushes an integer-valued item onto the failure stack.
1423 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1424 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1425#define PUSH_FAILURE_INT(item) \
1426 fail_stack.stack[fail_stack.avail++].integer = (item)
1427
b313f9d8 1428/* These POP... operations complement the PUSH... operations.
fa9a63c5
RM
1429 All assume that `fail_stack' is nonempty. */
1430#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1431#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
fa9a63c5 1432
505bde11
SM
1433/* Individual items aside from the registers. */
1434#define NUM_NONREG_ITEMS 3
1435
1436/* Used to examine the stack (to detect infinite loops). */
1437#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1438#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1439#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1440#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1441
1442
505bde11
SM
/* Grow the failure stack until more than SPACE slots are free.
   Does `return -2' from the enclosing function if GROW_FAIL_STACK fails.
   Assumes the variable `fail_stack'.  The size and slot counts are
   size_t values, hence the %zu conversions.  */
#define ENSURE_FAIL_STACK(space)                                        \
while (REMAINING_AVAIL_SLOTS <= (space)) {                              \
  if (!GROW_FAIL_STACK (fail_stack))                                    \
    return -2;                                                          \
  DEBUG_PRINT ("\n Grew stack; size now: %zu\n", (fail_stack).size);    \
  DEBUG_PRINT (" slots available: %zu\n", REMAINING_AVAIL_SLOTS);       \
}
1450
1451/* Push register NUM onto the stack. */
1452#define PUSH_FAILURE_REG(num) \
1453do { \
1454 char *destination; \
dc4a2ee0 1455 long n = num; \
505bde11 1456 ENSURE_FAIL_STACK(3); \
dc4a2ee0
PE
1457 DEBUG_PRINT (" Push reg %ld (spanning %p -> %p)\n", \
1458 n, regstart[n], regend[n]); \
1459 PUSH_FAILURE_POINTER (regstart[n]); \
1460 PUSH_FAILURE_POINTER (regend[n]); \
1461 PUSH_FAILURE_INT (n); \
505bde11
SM
1462} while (0)
1463
01618498
SM
1464/* Change the counter's value to VAL, but make sure that it will
1465 be reset when backtracking. */
1466#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1467do { \
1468 char *destination; \
1469 int c; \
1470 ENSURE_FAIL_STACK(3); \
1471 EXTRACT_NUMBER (c, ptr); \
dc4a2ee0 1472 DEBUG_PRINT (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1473 PUSH_FAILURE_INT (c); \
1474 PUSH_FAILURE_POINTER (ptr); \
1475 PUSH_FAILURE_INT (-1); \
01618498 1476 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1477} while (0)
1478
505bde11 1479/* Pop a saved register off the stack. */
dc1e502d 1480#define POP_FAILURE_REG_OR_COUNT() \
505bde11 1481do { \
d1dfb56c 1482 long pfreg = POP_FAILURE_INT (); \
19ed5445 1483 if (pfreg == -1) \
dc1e502d
SM
1484 { \
1485 /* It's a counter. */ \
6dcf2d0e
SM
1486 /* Here, we discard `const', making re_match non-reentrant. */ \
1487 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
19ed5445
PE
1488 pfreg = POP_FAILURE_INT (); \
1489 STORE_NUMBER (ptr, pfreg); \
dc4a2ee0 1490 DEBUG_PRINT (" Pop counter %p = %ld\n", ptr, pfreg); \
dc1e502d
SM
1491 } \
1492 else \
1493 { \
19ed5445
PE
1494 regend[pfreg] = POP_FAILURE_POINTER (); \
1495 regstart[pfreg] = POP_FAILURE_POINTER (); \
dc4a2ee0
PE
1496 DEBUG_PRINT (" Pop reg %ld (spanning %p -> %p)\n", \
1497 pfreg, regstart[pfreg], regend[pfreg]); \
dc1e502d 1498 } \
505bde11
SM
1499} while (0)
1500
1501/* Check that we are not stuck in an infinite loop. */
1502#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1503do { \
d1dfb56c 1504 ssize_t failure = TOP_FAILURE_HANDLE (); \
505bde11 1505 /* Check for infinite matching loops */ \
f6df485f
RS
1506 while (failure > 0 \
1507 && (FAILURE_STR (failure) == string_place \
1508 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1509 { \
1510 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1511 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1512 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1513 { \
6df42991
SM
1514 cycle = 1; \
1515 break; \
f6df485f 1516 } \
dc4a2ee0 1517 DEBUG_PRINT (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1518 failure = NEXT_FAILURE_HANDLE(failure); \
1519 } \
dc4a2ee0 1520 DEBUG_PRINT (" Other string: %p\n", FAILURE_STR (failure)); \
505bde11 1521} while (0)
6df42991 1522
fa9a63c5 1523/* Push the information about the state we will need
5e69f11e
RM
1524 if we ever fail back to it.
1525
505bde11 1526 Requires variables fail_stack, regstart, regend and
320a2a73 1527 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1528 declared.
5e69f11e 1529
fa9a63c5
RM
   Does `return -2' if it runs out of memory.  */
1531
505bde11
SM
#define PUSH_FAILURE_POINT(pattern, string_place)                       \
do {                                                                    \
  /* Declared for the stack-growing code reached through                \
     ENSURE_FAIL_STACK.  */                                             \
  char *destination;                                                    \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_pushed++);                           \
  DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n");                              \
  DEBUG_PRINT (" Before push, next avail: %zu\n", (fail_stack).avail);  \
  DEBUG_PRINT (" size: %zu\n", (fail_stack).size);                      \
                                                                        \
  /* Make room for the fixed part of the frame.  */                     \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);                                 \
                                                                        \
  DEBUG_PRINT ("\n");                                                   \
                                                                        \
  /* Pushed in this order: previous frame index, string position,       \
     pattern position.  */                                              \
  DEBUG_PRINT (" Push frame index: %zu\n", fail_stack.frame);           \
  PUSH_FAILURE_INT (fail_stack.frame);                                  \
                                                                        \
  DEBUG_PRINT (" Push string %p: `", string_place);                     \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT ("'\n");                                                  \
  PUSH_FAILURE_POINTER (string_place);                                  \
                                                                        \
  DEBUG_PRINT (" Push pattern %p: ", pattern);                          \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);                   \
  PUSH_FAILURE_POINTER (pattern);                                       \
                                                                        \
  /* Close the frame by moving the frame pointer past it.  */           \
  fail_stack.frame = fail_stack.avail;                                  \
} while (0)
fa9a63c5 1562
320a2a73
KH
1563/* Estimate the size of data pushed by a typical failure stack entry.
1564 An estimate is all we need, because all we use this for
1565 is to choose a limit for how big to make the failure stack. */
ada30c0e 1566/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1567#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1568
fa9a63c5
RM
1569/* How many items can still be added to the stack without overflowing it. */
1570#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1571
1572
/* Pops what PUSH_FAILURE_POINT pushes.
1574
1575 We restore into the parameters, all of which should be lvalues:
1576 STR -- the saved data position.
1577 PAT -- the saved pattern position.
fa9a63c5 1578 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1579
fa9a63c5 1580 Also assumes the variables `fail_stack' and (if debugging), `bufp',
7814e705 1581 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1582
505bde11
SM
#define POP_FAILURE_POINT(str, pat)                                     \
do {                                                                    \
  assert (!FAIL_STACK_EMPTY ());                                        \
                                                                        \
  /* Remove failure points and point to how many regs pushed.  */       \
  DEBUG_PRINT ("POP_FAILURE_POINT:\n");                                 \
  DEBUG_PRINT (" Before pop, next avail: %zu\n", fail_stack.avail);     \
  DEBUG_PRINT (" size: %zu\n", fail_stack.size);                        \
                                                                        \
  /* Pop the saved registers and counters, i.e. everything that was     \
     pushed after the current frame was closed.  */                     \
  while (fail_stack.frame < fail_stack.avail)                           \
    POP_FAILURE_REG_OR_COUNT ();                                        \
                                                                        \
  /* The fixed part of the frame comes off in reverse push order:       \
     pattern position, string position, previous frame index.  */       \
  pat = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT (" Popping pattern %p: ", pat);                           \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
                                                                        \
  /* If the saved string location is NULL, it came from an              \
     on_failure_keep_string_jump opcode.  This macro stores the value   \
     in STR unconditionally; handling a NULL is left to the caller.  */ \
  str = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT (" Popping string %p: `", str);                           \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
  DEBUG_PRINT ("'\n");                                                  \
                                                                        \
  fail_stack.frame = POP_FAILURE_INT ();                                \
  DEBUG_PRINT (" Popping frame index: %zu\n", fail_stack.frame);        \
                                                                        \
  /* `avail' is unsigned, so only the frame/avail relation is worth     \
     checking.  */                                                      \
  assert (fail_stack.frame <= fail_stack.avail);                        \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_popped++);                           \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1616
1617
1618\f
fa9a63c5 1619/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1620#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1621\f
1622/* Subroutine declarations and macros for regex_compile. */
1623
261cb4bb
PE
1624static reg_errcode_t regex_compile (re_char *pattern, size_t size,
1625 reg_syntax_t syntax,
1626 struct re_pattern_buffer *bufp);
1627static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1628static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1629static void insert_op1 (re_opcode_t op, unsigned char *loc,
1630 int arg, unsigned char *end);
1631static void insert_op2 (re_opcode_t op, unsigned char *loc,
1632 int arg1, int arg2, unsigned char *end);
1633static boolean at_begline_loc_p (re_char *pattern, re_char *p,
1634 reg_syntax_t syntax);
1635static boolean at_endline_loc_p (re_char *p, re_char *pend,
1636 reg_syntax_t syntax);
1637static re_char *skip_one_char (re_char *p);
1638static int analyse_first (re_char *p, re_char *pend,
1639 char *fastmap, const int multibyte);
fa9a63c5 1640
fa9a63c5 1641/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1642 translation. */
36595814 1643#define PATFETCH(c) \
2d1675e4
SM
1644 do { \
1645 int len; \
1646 if (p == pend) return REG_EEND; \
62a6e103 1647 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
2d1675e4 1648 p += len; \
fa9a63c5
RM
1649 } while (0)
1650
fa9a63c5
RM
1651
1652/* If `translate' is non-null, return translate[D], else just D. We
1653 cast the subscript to translate because some data is declared as
1654 `char *', to avoid warnings when a string constant is passed. But
1655 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1656#ifndef TRANSLATE
0b32bf0e 1657# define TRANSLATE(d) \
66f0296e 1658 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1659#endif
fa9a63c5
RM
1660
1661
1662/* Macros for outputting the compiled pattern into `buffer'. */
1663
1664/* If the buffer isn't allocated when it comes in, use this. */
1665#define INIT_BUF_SIZE 32
1666
4bb91c68 1667/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1668#define GET_BUFFER_SPACE(n) \
01618498 1669 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1670 EXTEND_BUFFER ()
1671
1672/* Make sure we have one more byte of buffer space and then add C to it. */
1673#define BUF_PUSH(c) \
1674 do { \
1675 GET_BUFFER_SPACE (1); \
1676 *b++ = (unsigned char) (c); \
1677 } while (0)
1678
1679
1680/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1681#define BUF_PUSH_2(c1, c2) \
1682 do { \
1683 GET_BUFFER_SPACE (2); \
1684 *b++ = (unsigned char) (c1); \
1685 *b++ = (unsigned char) (c2); \
1686 } while (0)
1687
1688
fa9a63c5 1689/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1690 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1691#define STORE_JUMP(op, loc, to) \
1692 store_op1 (op, loc, (to) - (loc) - 3)
1693
1694/* Likewise, for a two-argument jump. */
1695#define STORE_JUMP2(op, loc, to, arg) \
1696 store_op2 (op, loc, (to) - (loc) - 3, arg)
1697
4bb91c68 1698/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1699#define INSERT_JUMP(op, loc, to) \
1700 insert_op1 (op, loc, (to) - (loc) - 3, b)
1701
1702/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1703#define INSERT_JUMP2(op, loc, to, arg) \
1704 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1705
1706
1707/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1708 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1709 be too small, many things would have to change. */
839966f3
KH
1710# define MAX_BUF_SIZE (1L << 15)
1711
fa9a63c5
RM
1712/* Extend the buffer by twice its current size via realloc and
1713 reset the pointers that pointed into the old block to point to the
1714 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1715 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1716#if __BOUNDED_POINTERS__
1717# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
381880b0
CY
1718# define MOVE_BUFFER_POINTER(P) \
1719 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1720 SET_HIGH_BOUND (P), \
1721 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
4bb91c68
SM
1722# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1723 else \
1724 { \
1725 SET_HIGH_BOUND (b); \
1726 SET_HIGH_BOUND (begalt); \
1727 if (fixup_alt_jump) \
1728 SET_HIGH_BOUND (fixup_alt_jump); \
1729 if (laststart) \
1730 SET_HIGH_BOUND (laststart); \
1731 if (pending_exact) \
1732 SET_HIGH_BOUND (pending_exact); \
1733 }
1734#else
381880b0 1735# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
4bb91c68
SM
1736# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1737#endif
/* EXTEND_BUFFER: double the compiled-pattern buffer.

   Doubles bufp->allocated, clamps it to MAX_BUF_SIZE, and resizes
   bufp->buffer with RETALLOC.  If the buffer is already MAX_BUF_SIZE
   bytes, the enclosing function returns REG_ESIZE; if bufp->buffer is
   NULL after RETALLOC, it returns REG_ESPACE.

   When the block moved, every pointer into it is rebased onto the new
   block: `b' and `begalt' always, and `fixup_alt_jump', `laststart'
   and `pending_exact' when they are non-null.  The macro therefore
   requires all of those names, plus `bufp', to be in scope where it is
   expanded.

   NOTE(review): both error paths use a plain `return', not
   FREE_STACK_RETURN, so anything the enclosing function allocated
   (e.g. a compile stack) does not appear to be released on these
   paths -- confirm against the caller.  */
fa9a63c5 1738#define EXTEND_BUFFER() \
25fe55af 1739 do { \
381880b0 1740 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1741 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1742 return REG_ESIZE; \
1743 bufp->allocated <<= 1; \
1744 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1745 bufp->allocated = MAX_BUF_SIZE; \
01618498 1746 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1747 if (bufp->buffer == NULL) \
1748 return REG_ESPACE; \
1749 /* If the buffer moved, move all the pointers into it. */ \
1750 if (old_buffer != bufp->buffer) \
1751 { \
381880b0 1752 unsigned char *new_buffer = bufp->buffer; \
4bb91c68
SM
1753 MOVE_BUFFER_POINTER (b); \
1754 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1755 if (fixup_alt_jump) \
4bb91c68 1756 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1757 if (laststart) \
4bb91c68 1758 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1759 if (pending_exact) \
4bb91c68 1760 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1761 } \
4bb91c68 1762 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1763 } while (0)
1764
1765
1766/* Since we have one byte reserved for the register number argument to
1767 {start,stop}_memory, the maximum number of groups we can report
1768 things about is what fits in that byte. */
1769#define MAX_REGNUM 255
1770
1771/* But patterns can have more than `MAX_REGNUM' registers. We just
1772 ignore the excess. */
098d42af 1773typedef int regnum_t;
fa9a63c5
RM
1774
1775
1776/* Macros for the compile stack. */
1777
1778/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1779 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1780/* int may be not enough when sizeof(int) == 2. */
1781typedef long pattern_offset_t;
fa9a63c5
RM
1782
1783typedef struct
1784{
1785 pattern_offset_t begalt_offset;
1786 pattern_offset_t fixup_alt_jump;
5e69f11e 1787 pattern_offset_t laststart_offset;
fa9a63c5
RM
1788 regnum_t regnum;
1789} compile_stack_elt_t;
1790
1791
1792typedef struct
1793{
1794 compile_stack_elt_t *stack;
d1dfb56c
EZ
1795 size_t size;
1796 size_t avail; /* Offset of next open position. */
fa9a63c5
RM
1797} compile_stack_type;
1798
1799
1800#define INIT_COMPILE_STACK_SIZE 32
1801
1802#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1803#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1804
4bb91c68 1805/* The next available element. */
fa9a63c5
RM
1806#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1807
0caaedb1
PE
1808/* Explicit quit checking is needed for Emacs, which uses polling to
1809 process input events. */
1810#ifdef emacs
77d11aec
RS
1811# define IMMEDIATE_QUIT_CHECK \
1812 do { \
1813 if (immediate_quit) QUIT; \
1814 } while (0)
1815#else
1816# define IMMEDIATE_QUIT_CHECK ((void)0)
1817#endif
1818\f
b18215fc
RS
1819/* Scratch storage used while building the range table of a charset:
   a growable array of ints that is filled in (start, end) pairs, plus
   a word of character-class flag bits.  */
1820struct range_table_work_area
1821{
1822 int *table; /* Array of ints holding the recorded ranges; null until first grown. */
1823 int allocated; /* Capacity of TABLE in bytes (not elements). */
7814e705 1824 int used; /* Number of ints of TABLE in use; two per recorded range. */
96cc36cc 1825 int bits; /* OR of the BIT_* flags recording character classes. */
b18215fc
RS
1826};
1827
78779650
AS
1828#ifdef emacs
1829
77d11aec
RS
1830/* Make sure that WORK_AREA has room for N more ints (table entries;
   each recorded range takes two).

   WORK_AREA is accessed with `.', so it must be a struct lvalue, not
   a pointer; it is evaluated several times.

   If the table is too small it is grown once with
   extend_range_table_work_area, which adds a fixed amount of space
   rather than looping until N entries fit.  If the table is null
   afterwards, this macro returns REG_ESPACE from the surrounding
   function with a plain `return'.  */
1834
1835#define EXTEND_RANGE_TABLE(work_area, n) \
1836 do { \
8f924df7 1837 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
77d11aec 1838 { \
8f924df7
KH
1839 extend_range_table_work_area (&work_area); \
1840 if ((work_area).table == 0) \
77d11aec
RS
1841 return (REG_ESPACE); \
1842 } \
b18215fc
RS
1843 } while (0)
1844
96cc36cc
RS
/* OR the class flag(s) BIT into WORK_AREA's `bits' field.
   WORK_AREA must be a struct lvalue.  The whole expansion is
   parenthesized so that the macro behaves as a single expression when
   it is used as an operand, e.g. `M (wa, x) & y'.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)	\
  ((work_area).bits |= (bit))
1847
b18215fc
RS
1848/* Append the pair (RANGE_START, RANGE_END) to WORK_AREA's table,
   first making sure there is room for two more ints.  WORK_AREA must
   be a struct lvalue and is evaluated several times.  If the table
   cannot be grown, EXTEND_RANGE_TABLE returns REG_ESPACE from the
   surrounding function.  */
1849#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
77d11aec 1850 do { \
8f924df7 1851 EXTEND_RANGE_TABLE ((work_area), 2); \
b18215fc
RS
1852 (work_area).table[(work_area).used++] = (range_start); \
1853 (work_area).table[(work_area).used++] = (range_end); \
1854 } while (0)
1855
78779650
AS
1856#endif /* emacs */
1857
/* Free allocated memory for WORK_AREA.
   The table pointer is handed to `free' unconditionally: freeing a
   null pointer is a no-op, so a work area whose table was never
   allocated needs no special case.  The `table' field itself is left
   unchanged.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)	\
  do {						\
    free ((work_area).table);			\
  } while (0)
1864
96cc36cc 1865#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1866#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1867#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1868#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
78779650
AS
1869
1870/* Bits used to implement the multibyte-part of the various character classes
1871 such as [:alnum:] in a charset's range table. */
1872#define BIT_WORD 0x1
1873#define BIT_LOWER 0x2
1874#define BIT_PUNCT 0x4
1875#define BIT_SPACE 0x8
1876#define BIT_UPPER 0x10
1877#define BIT_MULTIBYTE 0x20
77d11aec 1878\f
b18215fc 1879
fa9a63c5 1880/* Set the bit for character C in a list. */
01618498 1881#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1882
1883
bf216479
KH
1884#ifdef emacs
1885
cf9c99bc
KH
1886/* Store characters in the range FROM to TO in the bitmap at B (for
1887 ASCII and unibyte characters) and WORK_AREA (for multibyte
1888 characters) while translating them and paying attention to the
1889 continuity of translated characters.
8f924df7 1890
cf9c99bc
KH
1891 Implementation note: It is better to implement these fairly big
1892 macros by a function, but it's not that easy because macros called
8f924df7 1893 in this macro assume various local variables already declared. */
bf216479 1894
cf9c99bc
KH
1895/* Both FROM and TO are ASCII characters.

   For every C0 in FROM..TO, compute C1 = TRANSLATE (C0).  If C1 is not
   ASCII, record it in WORK_AREA as the one-character range (C1, C1)
   and convert it with RE_CHAR_TO_UNIBYTE, falling back to C0 when the
   conversion yields a negative value.  Finally set the bit for the
   resulting character in the bitmap at `b' with SET_LIST_BIT.

   FROM and TO are evaluated more than once (TO on every iteration).
   Recording a range can return REG_ESPACE from the surrounding
   function via SET_RANGE_TABLE_WORK_AREA.  */
1896
1897#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1898 do { \
1899 int C0, C1; \
1900 \
1901 for (C0 = (FROM); C0 <= (TO); C0++) \
1902 { \
1903 C1 = TRANSLATE (C0); \
1904 if (! ASCII_CHAR_P (C1)) \
1905 { \
1906 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1907 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
1908 C1 = C0; \
1909 } \
1910 SET_LIST_BIT (C1); \
1911 } \
1912 } while (0)
1913
1914
1915/* Both FROM and TO are unibyte characters (0x80..0xFF).

   For every C0 in FROM..TO, let C1 = RE_CHAR_TO_MULTIBYTE (C0).

   - If CHAR_BYTE8_P (C1), only the bitmap bit for C0 is set.

   - Otherwise C2 = TRANSLATE (C1).  The bitmap bit that is set is the
     unibyte form of C2, or C0 itself when translation changed nothing
     or C2 has no unibyte form (RE_CHAR_TO_UNIBYTE < 0).  C2 is then
     recorded in WORK_AREA: the ranges appended since this macro was
     entered (table indices >= USED) are scanned from the newest
     backwards; if C2 lies inside one of them or directly next to it,
     that range is widened by one when needed and the scan stops.  If
     no such range exists, a new range (C2, C2) is appended.

   FROM and TO are evaluated more than once (TO on every iteration).
   Appending a range can return REG_ESPACE from the surrounding
   function via SET_RANGE_TABLE_WORK_AREA.  */
1916
1917#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
1918 do { \
1919 int C0, C1, C2, I; \
1920 int USED = RANGE_TABLE_WORK_USED (work_area); \
1921 \
1922 for (C0 = (FROM); C0 <= (TO); C0++) \
1923 { \
1924 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
1925 if (CHAR_BYTE8_P (C1)) \
1926 SET_LIST_BIT (C0); \
1927 else \
1928 { \
1929 C2 = TRANSLATE (C1); \
1930 if (C2 == C1 \
1931 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
1932 C1 = C0; \
1933 SET_LIST_BIT (C1); \
1934 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1935 { \
1936 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1937 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1938 \
1939 if (C2 >= from - 1 && C2 <= to + 1) \
1940 { \
1941 if (C2 == from - 1) \
1942 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1943 else if (C2 == to + 1) \
1944 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1945 break; \
1946 } \
1947 } \
1948 if (I < USED) \
1949 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
1950 } \
1951 } \
1952 } while (0)
1953
1954
78edd3b7 1955/* Both FROM and TO are multibyte characters.

   The range (FROM, TO) itself is appended to WORK_AREA first.  Then,
   for every C0 in FROM..TO, with C1 = TRANSLATE (C0):

   - If C1 has a unibyte form (RE_CHAR_TO_UNIBYTE >= 0), or C1 differs
     from C0 and C0 has one, the bit for that unibyte character is set
     in the bitmap at `b'.

   - If C1 already lies within FROM..TO nothing more is recorded.
     Otherwise the ranges appended since this macro was entered (table
     indices >= USED, which includes the FROM..TO range just added)
     are scanned from the newest backwards; if C1 lies inside one of
     them or directly next to it, that range is widened by one when
     needed and the scan stops.  If no such range exists, a new range
     (C1, C1) is appended.

   FROM and TO are evaluated many times.  Appending a range can return
   REG_ESPACE from the surrounding function via
   SET_RANGE_TABLE_WORK_AREA.  */
cf9c99bc
KH
1956
1957#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
1958 do { \
1959 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
1960 \
1961 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
1962 for (C0 = (FROM); C0 <= (TO); C0++) \
1963 { \
1964 C1 = TRANSLATE (C0); \
1965 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
1966 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
1967 SET_LIST_BIT (C2); \
1968 if (C1 >= (FROM) && C1 <= (TO)) \
1969 continue; \
1970 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1971 { \
1972 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1973 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1974 \
1975 if (C1 >= from - 1 && C1 <= to + 1) \
1976 { \
1977 if (C1 == from - 1) \
1978 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1979 else if (C1 == to + 1) \
1980 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1981 break; \
1982 } \
1983 } \
1984 if (I < USED) \
1985 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1986 } \
bf216479
KH
1987 } while (0)
1988
1989#endif /* emacs */
1990
fa9a63c5 1991/* Get the next unsigned number in the uncompiled pattern.

   Fetches characters into `c' with PATFETCH and, while they are
   decimal digits, accumulates them into NUM.  A negative NUM is reset
   to 0 before the first digit is added; if no digit is seen NUM is
   left untouched.  On normal exit `c' holds the first non-digit
   character that was fetched.

   Error exits (all through FREE_STACK_RETURN, i.e. they leave the
   surrounding function):
     REG_EBRACE  the pattern ends before a non-digit is found;
     REG_BADBR   appending the next digit would make NUM exceed
                 RE_DUP_MAX.  The test compares NUM against
                 RE_DUP_MAX / 10, lowered by one when the new digit is
                 larger than RE_DUP_MAX % 10, so it is evaluated
                 before the multiplication and cannot itself overflow.

   NUM must be a modifiable lvalue; it is evaluated several times.
   The macro relies on `p', `pend' and `c' being in scope.  */
4618713a 1992#define GET_INTERVAL_COUNT(num) \
c72b0edd
SM
1993 do { \
1994 if (p == pend) \
1995 FREE_STACK_RETURN (REG_EBRACE); \
1996 else \
1997 { \
1998 PATFETCH (c); \
1999 while ('0' <= c && c <= '9') \
2000 { \
c72b0edd
SM
2001 if (num < 0) \
2002 num = 0; \
4618713a 2003 if (RE_DUP_MAX / 10 - (RE_DUP_MAX % 10 < c - '0') < num) \
c72b0edd 2004 FREE_STACK_RETURN (REG_BADBR); \
4618713a 2005 num = num * 10 + c - '0'; \
c72b0edd
SM
2006 if (p == pend) \
2007 FREE_STACK_RETURN (REG_EBRACE); \
2008 PATFETCH (c); \
2009 } \
2010 } \
2011 } while (0)
77d11aec 2012\f
1fdab503 2013#if ! WIDE_CHAR_SUPPORT
01618498 2014
14473664 2015/* Map a string to the char class it names (if any). */
1fdab503 2016re_wctype_t
29abe551 2017re_wctype (const_re_char *str)
14473664 2018{
5b0534c8 2019 const char *string = (const char *) str;
14473664
SM
2020 if (STREQ (string, "alnum")) return RECC_ALNUM;
2021 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2022 else if (STREQ (string, "word")) return RECC_WORD;
2023 else if (STREQ (string, "ascii")) return RECC_ASCII;
2024 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2025 else if (STREQ (string, "graph")) return RECC_GRAPH;
2026 else if (STREQ (string, "lower")) return RECC_LOWER;
2027 else if (STREQ (string, "print")) return RECC_PRINT;
2028 else if (STREQ (string, "punct")) return RECC_PUNCT;
2029 else if (STREQ (string, "space")) return RECC_SPACE;
2030 else if (STREQ (string, "upper")) return RECC_UPPER;
2031 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2032 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2033 else if (STREQ (string, "digit")) return RECC_DIGIT;
2034 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2035 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2036 else if (STREQ (string, "blank")) return RECC_BLANK;
2037 else return 0;
2038}
2039
e0f24100 2040/* True if CH is in the char class CC. */
1fdab503 2041boolean
971de7fb 2042re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2043{
2044 switch (cc)
2045 {
f3fcc40d
AS
2046 case RECC_ALNUM: return ISALNUM (ch) != 0;
2047 case RECC_ALPHA: return ISALPHA (ch) != 0;
2048 case RECC_BLANK: return ISBLANK (ch) != 0;
2049 case RECC_CNTRL: return ISCNTRL (ch) != 0;
2050 case RECC_DIGIT: return ISDIGIT (ch) != 0;
2051 case RECC_GRAPH: return ISGRAPH (ch) != 0;
2052 case RECC_LOWER: return ISLOWER (ch) != 0;
2053 case RECC_PRINT: return ISPRINT (ch) != 0;
2054 case RECC_PUNCT: return ISPUNCT (ch) != 0;
2055 case RECC_SPACE: return ISSPACE (ch) != 0;
2056 case RECC_UPPER: return ISUPPER (ch) != 0;
2057 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2058 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
213bd7f2 2059 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
f3fcc40d 2060 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
213bd7f2 2061 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
f3fcc40d 2062 case RECC_WORD: return ISWORD (ch) != 0;
0cdd06f8
SM
2063 case RECC_ERROR: return false;
2064 default:
5e617bc2 2065 abort ();
14473664
SM
2066 }
2067}
fa9a63c5 2068
14473664
SM
2069/* Return a bit-pattern to use in the range-table bits to match multibyte
2070 chars of class CC. */
2071static int
971de7fb 2072re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2073{
2074 switch (cc)
2075 {
2076 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2077 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2078 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2079 case RECC_LOWER: return BIT_LOWER;
2080 case RECC_UPPER: return BIT_UPPER;
2081 case RECC_PUNCT: return BIT_PUNCT;
2082 case RECC_SPACE: return BIT_SPACE;
14473664 2083 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2084 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2085 default:
5e617bc2 2086 abort ();
14473664
SM
2087 }
2088}
2089#endif
77d11aec
RS
2090\f
2091/* Filling in the work area of a range. */
2092
2093/* Actually extend the space in WORK_AREA. */
2094
2095static void
971de7fb 2096extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2097{
77d11aec 2098 work_area->allocated += 16 * sizeof (int);
38182d90 2099 work_area->table = realloc (work_area->table, work_area->allocated);
77d11aec
RS
2100}
2101
8f924df7 2102#if 0
77d11aec
RS
2103#ifdef emacs
2104
2105/* Carefully find the ranges of codes that are equivalent
2106 under case conversion to the range start..end when passed through
2107 TRANSLATE. Handle the case where non-letters can come in between
2108 two upper-case letters (which happens in Latin-1).
2109 Also handle the case of groups of more than 2 case-equivalent chars.
2110
2111 The basic method is to look at consecutive characters and see
2112 if they can form a run that can be handled as one.
2113
2114 Returns -1 if successful, REG_ESPACE if ran out of space. */
2115
2116static int
1dae0f0a
AS
2117set_image_of_range_1 (struct range_table_work_area *work_area,
2118 re_wchar_t start, re_wchar_t end,
2119 RE_TRANSLATE_TYPE translate)
77d11aec
RS
2120{
2121 /* `one_case' indicates a character, or a run of characters,
2122 each of which is an isolate (no case-equivalents).
2123 This includes all ASCII non-letters.
2124
2125 `two_case' indicates a character, or a run of characters,
2126 each of which has two case-equivalent forms.
2127 This includes all ASCII letters.
2128
2129 `strange' indicates a character that has more than one
2130 case-equivalent. */
177c0ea7 2131
77d11aec
RS
2132 enum case_type {one_case, two_case, strange};
2133
2134 /* Describe the run that is in progress,
2135 which the next character can try to extend.
2136 If run_type is strange, that means there really is no run.
2137 If run_type is one_case, then run_start...run_end is the run.
2138 If run_type is two_case, then the run is run_start...run_end,
2139 and the case-equivalents end at run_eqv_end. */
2140
2141 enum case_type run_type = strange;
2142 int run_start, run_end, run_eqv_end;
2143
2144 Lisp_Object eqv_table;
2145
2146 if (!RE_TRANSLATE_P (translate))
2147 {
b7c12565 2148 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2149 work_area->table[work_area->used++] = (start);
2150 work_area->table[work_area->used++] = (end);
b7c12565 2151 return -1;
77d11aec
RS
2152 }
2153
2154 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2155
77d11aec
RS
2156 for (; start <= end; start++)
2157 {
2158 enum case_type this_type;
2159 int eqv = RE_TRANSLATE (eqv_table, start);
2160 int minchar, maxchar;
2161
2162 /* Classify this character */
2163 if (eqv == start)
2164 this_type = one_case;
2165 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2166 this_type = two_case;
2167 else
2168 this_type = strange;
2169
2170 if (start < eqv)
2171 minchar = start, maxchar = eqv;
2172 else
2173 minchar = eqv, maxchar = start;
2174
2175 /* Can this character extend the run in progress? */
2176 if (this_type == strange || this_type != run_type
2177 || !(minchar == run_end + 1
2178 && (run_type == two_case
2179 ? maxchar == run_eqv_end + 1 : 1)))
2180 {
2181 /* No, end the run.
2182 Record each of its equivalent ranges. */
2183 if (run_type == one_case)
2184 {
2185 EXTEND_RANGE_TABLE (work_area, 2);
2186 work_area->table[work_area->used++] = run_start;
2187 work_area->table[work_area->used++] = run_end;
2188 }
2189 else if (run_type == two_case)
2190 {
2191 EXTEND_RANGE_TABLE (work_area, 4);
2192 work_area->table[work_area->used++] = run_start;
2193 work_area->table[work_area->used++] = run_end;
2194 work_area->table[work_area->used++]
2195 = RE_TRANSLATE (eqv_table, run_start);
2196 work_area->table[work_area->used++]
2197 = RE_TRANSLATE (eqv_table, run_end);
2198 }
2199 run_type = strange;
2200 }
177c0ea7 2201
77d11aec
RS
2202 if (this_type == strange)
2203 {
2204 /* For a strange character, add each of its equivalents, one
2205 by one. Don't start a range. */
2206 do
2207 {
2208 EXTEND_RANGE_TABLE (work_area, 2);
2209 work_area->table[work_area->used++] = eqv;
2210 work_area->table[work_area->used++] = eqv;
2211 eqv = RE_TRANSLATE (eqv_table, eqv);
2212 }
2213 while (eqv != start);
2214 }
2215
2216 /* Add this char to the run, or start a new run. */
2217 else if (run_type == strange)
2218 {
2219 /* Initialize a new range. */
2220 run_type = this_type;
2221 run_start = start;
2222 run_end = start;
2223 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2224 }
2225 else
2226 {
2227 /* Extend a running range. */
2228 run_end = minchar;
2229 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2230 }
2231 }
2232
2233 /* If a run is still in progress at the end, finish it now
2234 by recording its equivalent ranges. */
2235 if (run_type == one_case)
2236 {
2237 EXTEND_RANGE_TABLE (work_area, 2);
2238 work_area->table[work_area->used++] = run_start;
2239 work_area->table[work_area->used++] = run_end;
2240 }
2241 else if (run_type == two_case)
2242 {
2243 EXTEND_RANGE_TABLE (work_area, 4);
2244 work_area->table[work_area->used++] = run_start;
2245 work_area->table[work_area->used++] = run_end;
2246 work_area->table[work_area->used++]
2247 = RE_TRANSLATE (eqv_table, run_start);
2248 work_area->table[work_area->used++]
2249 = RE_TRANSLATE (eqv_table, run_end);
2250 }
2251
2252 return -1;
2253}
36595814 2254
77d11aec 2255#endif /* emacs */
36595814 2256
2b34df4e 2257/* Record the image of the range start..end when passed through
36595814
SM
2258 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2259 and is not even necessarily contiguous.
b7c12565
RS
2260 Normally we approximate it with the smallest contiguous range that contains
2261 all the chars we need. However, for Latin-1 we go to extra effort
2262 to do a better job.
2263
2264 This function is not called for ASCII ranges.
77d11aec
RS
2265
2266 Returns -1 if successful, REG_ESPACE if ran out of space. */
2267
2268static int
1dae0f0a
AS
2269set_image_of_range (struct range_table_work_area *work_area,
2270 re_wchar_t start, re_wchar_t end,
2271 RE_TRANSLATE_TYPE translate)
36595814 2272{
77d11aec
RS
2273 re_wchar_t cmin, cmax;
2274
2275#ifdef emacs
2276 /* For Latin-1 ranges, use set_image_of_range_1
2277 to get proper handling of ranges that include letters and nonletters.
b7c12565 2278 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2279 For other character sets, we don't bother to get this right. */
b7c12565
RS
2280 if (RE_TRANSLATE_P (translate) && start < 04400
2281 && !(start < 04200 && end >= 04377))
77d11aec 2282 {
b7c12565 2283 int newend;
77d11aec 2284 int tem;
b7c12565
RS
2285 newend = end;
2286 if (newend > 04377)
2287 newend = 04377;
2288 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2289 if (tem > 0)
2290 return tem;
2291
2292 start = 04400;
2293 if (end < 04400)
2294 return -1;
2295 }
2296#endif
2297
b7c12565
RS
2298 EXTEND_RANGE_TABLE (work_area, 2);
2299 work_area->table[work_area->used++] = (start);
2300 work_area->table[work_area->used++] = (end);
2301
2302 cmin = -1, cmax = -1;
77d11aec 2303
36595814 2304 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2305 {
2306 int ch;
77d11aec 2307
b7c12565
RS
2308 for (ch = start; ch <= end; ch++)
2309 {
2310 re_wchar_t c = TRANSLATE (ch);
2311 if (! (start <= c && c <= end))
2312 {
2313 if (cmin == -1)
2314 cmin = c, cmax = c;
2315 else
2316 {
2317 cmin = MIN (cmin, c);
2318 cmax = MAX (cmax, c);
2319 }
2320 }
2321 }
2322
2323 if (cmin != -1)
2324 {
2325 EXTEND_RANGE_TABLE (work_area, 2);
2326 work_area->table[work_area->used++] = (cmin);
2327 work_area->table[work_area->used++] = (cmax);
2328 }
2329 }
36595814 2330
77d11aec
RS
2331 return -1;
2332}
8f924df7 2333#endif /* 0 */
fa9a63c5
RM
2334\f
2335#ifndef MATCH_MAY_ALLOCATE
2336
2337/* If we cannot allocate large objects within re_match_2_internal,
2338 we make the fail stack and register vectors global.
2339 The fail stack, we grow to the maximum size when a regexp
2340 is compiled.
2341 The register vectors, we adjust in size each time we
2342 compile a regexp, according to the number of registers it needs. */
2343
2344static fail_stack_type fail_stack;
2345
2346/* Size with which the following vectors are currently allocated.
2347 That is so we can make them bigger as needed,
4bb91c68 2348 but never make them smaller. */
fa9a63c5
RM
2349static int regs_allocated_size;
2350
66f0296e
SM
2351static re_char ** regstart, ** regend;
2352static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2353
2354/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2355 but don't make them smaller. */
fa9a63c5
RM
2356
2357static
1dae0f0a 2358regex_grow_registers (int num_regs)
fa9a63c5
RM
2359{
2360 if (num_regs > regs_allocated_size)
2361 {
66f0296e
SM
2362 RETALLOC_IF (regstart, num_regs, re_char *);
2363 RETALLOC_IF (regend, num_regs, re_char *);
2364 RETALLOC_IF (best_regstart, num_regs, re_char *);
2365 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2366
2367 regs_allocated_size = num_regs;
2368 }
2369}
2370
2371#endif /* not MATCH_MAY_ALLOCATE */
2372\f
261cb4bb
PE
2373static boolean group_in_compile_stack (compile_stack_type compile_stack,
2374 regnum_t regnum);
99633e97 2375
fa9a63c5
RM
2376/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2377 Returns one of error codes defined in `regex.h', or zero for success.
2378
2379 Assumes the `allocated' (and perhaps `buffer') and `translate'
2380 fields are set in BUFP on entry.
2381
2382 If it succeeds, results are put in BUFP (if it returns an error, the
2383 contents of BUFP are undefined):
2384 `buffer' is the compiled pattern;
2385 `syntax' is set to SYNTAX;
2386 `used' is set to the length of the compiled pattern;
2387 `fastmap_accurate' is zero;
2388 `re_nsub' is the number of subexpressions in PATTERN;
2389 `not_bol' and `not_eol' are zero;
5e69f11e 2390
c0f9ea08 2391 The `fastmap' field is neither examined nor set. */
fa9a63c5 2392
505bde11
SM
2393/* Insert the `jump' from the end of last alternative to "here".
2394 The space for the jump has already been allocated. */
2395#define FIXUP_ALT_JUMP() \
2396do { \
2397 if (fixup_alt_jump) \
2398 STORE_JUMP (jump, fixup_alt_jump, b); \
2399} while (0)
2400
2401
fa9a63c5
RM
2402/* Return, freeing storage we allocated. */
2403#define FREE_STACK_RETURN(value) \
b18215fc
RS
2404 do { \
2405 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2406 free (compile_stack.stack); \
2407 return value; \
2408 } while (0)
fa9a63c5
RM
2409
2410static reg_errcode_t
29abe551
PE
2411regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
2412 struct re_pattern_buffer *bufp)
fa9a63c5 2413{
01618498
SM
2414 /* We fetch characters from PATTERN here. */
2415 register re_wchar_t c, c1;
5e69f11e 2416
fa9a63c5
RM
2417 /* Points to the end of the buffer, where we should append. */
2418 register unsigned char *b;
5e69f11e 2419
fa9a63c5
RM
2420 /* Keeps track of unclosed groups. */
2421 compile_stack_type compile_stack;
2422
2423 /* Points to the current (ending) position in the pattern. */
22336245
RS
2424#ifdef AIX
2425 /* `const' makes AIX compiler fail. */
66f0296e 2426 unsigned char *p = pattern;
22336245 2427#else
66f0296e 2428 re_char *p = pattern;
22336245 2429#endif
66f0296e 2430 re_char *pend = pattern + size;
5e69f11e 2431
fa9a63c5 2432 /* How to translate the characters in the pattern. */
6676cb1c 2433 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2434
2435 /* Address of the count-byte of the most recently inserted `exactn'
2436 command. This makes it possible to tell if a new exact-match
2437 character can be added to that command or if the character requires
2438 a new `exactn' command. */
2439 unsigned char *pending_exact = 0;
2440
2441 /* Address of start of the most recently finished expression.
2442 This tells, e.g., postfix * where to find the start of its
2443 operand. Reset at the beginning of groups and alternatives. */
2444 unsigned char *laststart = 0;
2445
2446 /* Address of beginning of regexp, or inside of last group. */
2447 unsigned char *begalt;
2448
2449 /* Place in the uncompiled pattern (i.e., the {) to
2450 which to go back if the interval is invalid. */
66f0296e 2451 re_char *beg_interval;
5e69f11e 2452
fa9a63c5 2453 /* Address of the place where a forward jump should go to the end of
7814e705 2454 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2455 last -- ends with a forward jump of this sort. */
2456 unsigned char *fixup_alt_jump = 0;
2457
b18215fc
RS
2458 /* Work area for range table of charset. */
2459 struct range_table_work_area range_table_work;
2460
2d1675e4
SM
2461 /* If the object matched can contain multibyte characters. */
2462 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2463
f9b0fd99
RS
2464 /* Nonzero if we have pushed down into a subpattern. */
2465 int in_subpattern = 0;
2466
2467 /* These hold the values of p, pattern, and pend from the main
2468 pattern when we have pushed into a subpattern. */
da053e48
PE
2469 re_char *main_p IF_LINT (= NULL);
2470 re_char *main_pattern IF_LINT (= NULL);
2471 re_char *main_pend IF_LINT (= NULL);
f9b0fd99 2472
fa9a63c5 2473#ifdef DEBUG
99633e97 2474 debug++;
dc4a2ee0 2475 DEBUG_PRINT ("\nCompiling pattern: ");
99633e97 2476 if (debug > 0)
fa9a63c5
RM
2477 {
2478 unsigned debug_count;
5e69f11e 2479
fa9a63c5 2480 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2481 putchar (pattern[debug_count]);
fa9a63c5
RM
2482 putchar ('\n');
2483 }
2484#endif /* DEBUG */
2485
2486 /* Initialize the compile stack. */
2487 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2488 if (compile_stack.stack == NULL)
2489 return REG_ESPACE;
2490
2491 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2492 compile_stack.avail = 0;
2493
b18215fc
RS
2494 range_table_work.table = 0;
2495 range_table_work.allocated = 0;
2496
fa9a63c5
RM
2497 /* Initialize the pattern buffer. */
2498 bufp->syntax = syntax;
2499 bufp->fastmap_accurate = 0;
2500 bufp->not_bol = bufp->not_eol = 0;
6224b623 2501 bufp->used_syntax = 0;
fa9a63c5
RM
2502
2503 /* Set `used' to zero, so that if we return an error, the pattern
2504 printer (for debugging) will think there's no pattern. We reset it
2505 at the end. */
2506 bufp->used = 0;
5e69f11e 2507
fa9a63c5 2508 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2509 bufp->re_nsub = 0;
fa9a63c5 2510
0b32bf0e 2511#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2512 /* Initialize the syntax table. */
2513 init_syntax_once ();
2514#endif
2515
2516 if (bufp->allocated == 0)
2517 {
2518 if (bufp->buffer)
2519 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2520 enough space. This loses if buffer's address is bogus, but
7814e705 2521 that is the user's responsibility. */
25fe55af
RS
2522 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2523 }
fa9a63c5 2524 else
7814e705 2525 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2526 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2527 }
fa9a63c5
RM
2528 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2529
2530 bufp->allocated = INIT_BUF_SIZE;
2531 }
2532
2533 begalt = b = bufp->buffer;
2534
2535 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2536 while (1)
fa9a63c5 2537 {
f9b0fd99
RS
2538 if (p == pend)
2539 {
2540 /* If this is the end of an included regexp,
2541 pop back to the main regexp and try again. */
2542 if (in_subpattern)
2543 {
2544 in_subpattern = 0;
2545 pattern = main_pattern;
2546 p = main_p;
2547 pend = main_pend;
2548 continue;
2549 }
2550 /* If this is the end of the main regexp, we are done. */
2551 break;
2552 }
2553
fa9a63c5
RM
2554 PATFETCH (c);
2555
2556 switch (c)
25fe55af 2557 {
f9b0fd99
RS
2558 case ' ':
2559 {
2560 re_char *p1 = p;
2561
2562 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2563 spaces normally. And don't try to do this recursively. */
2564 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2565 goto normal_char;
2566
2567 /* Peek past following spaces. */
2568 while (p1 != pend)
2569 {
2570 if (*p1 != ' ')
2571 break;
2572 p1++;
2573 }
2574 /* If the spaces are followed by a repetition op,
2575 treat them normally. */
c721eee5
RS
2576 if (p1 != pend
2577 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2578 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2579 goto normal_char;
2580
2581 /* Replace the spaces with the whitespace regexp. */
2582 in_subpattern = 1;
2583 main_p = p1;
2584 main_pend = pend;
2585 main_pattern = pattern;
2586 p = pattern = whitespace_regexp;
5b0534c8 2587 pend = p + strlen ((const char *) p);
f9b0fd99 2588 break;
7814e705 2589 }
f9b0fd99 2590
25fe55af
RS
2591 case '^':
2592 {
7814e705 2593 if ( /* If at start of pattern, it's an operator. */
25fe55af 2594 p == pattern + 1
7814e705 2595 /* If context independent, it's an operator. */
25fe55af 2596 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2597 /* Otherwise, depends on what's come before. */
25fe55af 2598 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2599 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2600 else
2601 goto normal_char;
2602 }
2603 break;
2604
2605
2606 case '$':
2607 {
2608 if ( /* If at end of pattern, it's an operator. */
2609 p == pend
7814e705 2610 /* If context independent, it's an operator. */
25fe55af
RS
2611 || syntax & RE_CONTEXT_INDEP_ANCHORS
2612 /* Otherwise, depends on what's next. */
2613 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2614 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2615 else
2616 goto normal_char;
2617 }
2618 break;
fa9a63c5
RM
2619
2620
2621 case '+':
25fe55af
RS
2622 case '?':
2623 if ((syntax & RE_BK_PLUS_QM)
2624 || (syntax & RE_LIMITED_OPS))
2625 goto normal_char;
2626 handle_plus:
2627 case '*':
5ac2eb34 2628 /* If there is no previous pattern... */
25fe55af
RS
2629 if (!laststart)
2630 {
2631 if (syntax & RE_CONTEXT_INVALID_OPS)
2632 FREE_STACK_RETURN (REG_BADRPT);
2633 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2634 goto normal_char;
2635 }
2636
2637 {
7814e705 2638 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2639 boolean zero_times_ok = 0, many_times_ok = 0;
2640 boolean greedy = 1;
25fe55af
RS
2641
2642 /* If there is a sequence of repetition chars, collapse it
2643 down to just one (the right one). We can't combine
2644 interval operators with these because of, e.g., `a{2}*',
7814e705 2645 which should only match an even number of `a's. */
25fe55af
RS
2646
2647 for (;;)
2648 {
0b32bf0e 2649 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2650 && c == '?' && (zero_times_ok || many_times_ok))
2651 greedy = 0;
2652 else
2653 {
2654 zero_times_ok |= c != '+';
2655 many_times_ok |= c != '?';
2656 }
25fe55af
RS
2657
2658 if (p == pend)
2659 break;
ed0767d8
SM
2660 else if (*p == '*'
2661 || (!(syntax & RE_BK_PLUS_QM)
2662 && (*p == '+' || *p == '?')))
25fe55af 2663 ;
ed0767d8 2664 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2665 {
ed0767d8
SM
2666 if (p+1 == pend)
2667 FREE_STACK_RETURN (REG_EESCAPE);
2668 if (p[1] == '+' || p[1] == '?')
2669 PATFETCH (c); /* Gobble up the backslash. */
2670 else
2671 break;
25fe55af
RS
2672 }
2673 else
ed0767d8 2674 break;
25fe55af 2675 /* If we get here, we found another repeat character. */
ed0767d8
SM
2676 PATFETCH (c);
2677 }
25fe55af
RS
2678
2679 /* Star, etc. applied to an empty pattern is equivalent
2680 to an empty pattern. */
4e8a9132 2681 if (!laststart || laststart == b)
25fe55af
RS
2682 break;
2683
2684 /* Now we know whether or not zero matches is allowed
7814e705 2685 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2686 if (greedy)
2687 {
99633e97 2688 if (many_times_ok)
4e8a9132
SM
2689 {
2690 boolean simple = skip_one_char (laststart) == b;
d1dfb56c 2691 size_t startoffset = 0;
f6a3f532 2692 re_opcode_t ofj =
01618498 2693 /* Check if the loop can match the empty string. */
6df42991
SM
2694 (simple || !analyse_first (laststart, b, NULL, 0))
2695 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2696 assert (skip_one_char (laststart) <= b);
177c0ea7 2697
4e8a9132
SM
2698 if (!zero_times_ok && simple)
2699 { /* Since simple * loops can be made faster by using
2700 on_failure_keep_string_jump, we turn simple P+
2701 into PP* if P is simple. */
2702 unsigned char *p1, *p2;
2703 startoffset = b - laststart;
2704 GET_BUFFER_SPACE (startoffset);
2705 p1 = b; p2 = laststart;
2706 while (p2 < p1)
2707 *b++ = *p2++;
2708 zero_times_ok = 1;
99633e97 2709 }
4e8a9132
SM
2710
2711 GET_BUFFER_SPACE (6);
2712 if (!zero_times_ok)
2713 /* A + loop. */
f6a3f532 2714 STORE_JUMP (ofj, b, b + 6);
99633e97 2715 else
4e8a9132
SM
2716 /* Simple * loops can use on_failure_keep_string_jump
2717 depending on what follows. But since we don't know
2718 that yet, we leave the decision up to
2719 on_failure_jump_smart. */
f6a3f532 2720 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2721 laststart + startoffset, b + 6);
99633e97 2722 b += 3;
4e8a9132 2723 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2724 b += 3;
2725 }
2726 else
2727 {
4e8a9132
SM
2728 /* A simple ? pattern. */
2729 assert (zero_times_ok);
2730 GET_BUFFER_SPACE (3);
2731 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2732 b += 3;
2733 }
1c8c6d39
DL
2734 }
2735 else /* not greedy */
5ac2eb34 2736 { /* I wish the greedy and non-greedy cases could be merged. */
1c8c6d39 2737
0683b6fa 2738 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2739 if (many_times_ok)
2740 {
f6a3f532
SM
2741 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2742
6df42991
SM
2743 /* The non-greedy multiple match looks like
2744 a repeat..until: we only need a conditional jump
2745 at the end of the loop. */
f6a3f532
SM
2746 if (emptyp) BUF_PUSH (no_op);
2747 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2748 : on_failure_jump, b, laststart);
1c8c6d39
DL
2749 b += 3;
2750 if (zero_times_ok)
2751 {
2752 /* The repeat...until naturally matches one or more.
2753 To also match zero times, we need to first jump to
6df42991 2754 the end of the loop (its conditional jump). */
1c8c6d39
DL
2755 INSERT_JUMP (jump, laststart, b);
2756 b += 3;
2757 }
2758 }
2759 else
2760 {
2761 /* non-greedy a?? */
1c8c6d39
DL
2762 INSERT_JUMP (jump, laststart, b + 3);
2763 b += 3;
2764 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2765 b += 3;
2766 }
2767 }
2768 }
4e8a9132 2769 pending_exact = 0;
fa9a63c5
RM
2770 break;
2771
2772
2773 case '.':
25fe55af
RS
2774 laststart = b;
2775 BUF_PUSH (anychar);
2776 break;
fa9a63c5
RM
2777
2778
25fe55af
RS
2779 case '[':
2780 {
19ed5445
PE
2781 re_char *p1;
2782
b18215fc 2783 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2784
25fe55af 2785 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2786
25fe55af
RS
2787 /* Ensure that we have enough space to push a charset: the
2788 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2789 GET_BUFFER_SPACE (34);
2790
25fe55af 2791 laststart = b;
e318085a 2792
25fe55af 2793 /* We test `*p == '^' twice, instead of using an if
7814e705 2794 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2795 BUF_PUSH (*p == '^' ? charset_not : charset);
2796 if (*p == '^')
2797 p++;
e318085a 2798
25fe55af
RS
2799 /* Remember the first position in the bracket expression. */
2800 p1 = p;
e318085a 2801
7814e705 2802 /* Push the number of bytes in the bitmap. */
25fe55af 2803 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2804
25fe55af 2805 /* Clear the whole map. */
72af86bd 2806 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2807
25fe55af
RS
2808 /* charset_not matches newline according to a syntax bit. */
2809 if ((re_opcode_t) b[-2] == charset_not
2810 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2811 SET_LIST_BIT ('\n');
fa9a63c5 2812
7814e705 2813 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2814 for (;;)
2815 {
b18215fc 2816 boolean escaped_char = false;
2d1675e4 2817 const unsigned char *p2 = p;
abbd1bcf 2818 re_wchar_t ch;
e318085a 2819
25fe55af 2820 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2821
36595814
SM
2822 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2823 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2824 So the translation is done later in a loop. Example:
2825 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2826 PATFETCH (c);
e318085a 2827
25fe55af
RS
2828 /* \ might escape characters inside [...] and [^...]. */
2829 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2830 {
2831 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2832
2833 PATFETCH (c);
b18215fc 2834 escaped_char = true;
25fe55af 2835 }
b18215fc
RS
2836 else
2837 {
7814e705 2838 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2839 not (i.e., when the bracket expression is `[]' so
2840 far), the ']' character bit gets set way below. */
2d1675e4 2841 if (c == ']' && p2 != p1)
657fcfbd 2842 break;
25fe55af 2843 }
b18215fc 2844
25fe55af
RS
2845 /* See if we're at the beginning of a possible character
2846 class. */
b18215fc 2847
2d1675e4
SM
2848 if (!escaped_char &&
2849 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2850 {
7814e705 2851 /* Leave room for the null. */
14473664 2852 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2853 const unsigned char *class_beg;
b18215fc 2854
25fe55af
RS
2855 PATFETCH (c);
2856 c1 = 0;
ed0767d8 2857 class_beg = p;
b18215fc 2858
25fe55af
RS
2859 /* If pattern is `[[:'. */
2860 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2861
25fe55af
RS
2862 for (;;)
2863 {
14473664
SM
2864 PATFETCH (c);
2865 if ((c == ':' && *p == ']') || p == pend)
2866 break;
2867 if (c1 < CHAR_CLASS_MAX_LENGTH)
2868 str[c1++] = c;
2869 else
2870 /* This is in any case an invalid class name. */
2871 str[0] = '\0';
25fe55af
RS
2872 }
2873 str[c1] = '\0';
b18215fc
RS
2874
2875 /* If isn't a word bracketed by `[:' and `:]':
2876 undo the ending character, the letters, and
2877 leave the leading `:' and `[' (but set bits for
2878 them). */
25fe55af
RS
2879 if (c == ':' && *p == ']')
2880 {
abbd1bcf 2881 re_wctype_t cc = re_wctype (str);
14473664
SM
2882
2883 if (cc == 0)
fa9a63c5
RM
2884 FREE_STACK_RETURN (REG_ECTYPE);
2885
14473664
SM
2886 /* Throw away the ] at the end of the character
2887 class. */
2888 PATFETCH (c);
fa9a63c5 2889
14473664 2890 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2891
cf9c99bc
KH
2892#ifndef emacs
2893 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
2894 if (re_iswctype (btowc (ch), cc))
2895 {
2896 c = TRANSLATE (ch);
ed00c2ac
KH
2897 if (c < (1 << BYTEWIDTH))
2898 SET_LIST_BIT (c);
8f924df7 2899 }
cf9c99bc
KH
2900#else /* emacs */
2901 /* Most character classes in a multibyte match
2902 just set a flag. Exceptions are is_blank,
2903 is_digit, is_cntrl, and is_xdigit, since
2904 they can only match ASCII characters. We
2905 don't need to handle them for multibyte.
2906 They are distinguished by a negative wctype. */
96cc36cc 2907
254c06a8
SM
2908 /* Setup the gl_state object to its buffer-defined
2909 value. This hardcodes the buffer-global
2910 syntax-table for ASCII chars, while the other chars
2911 will obey syntax-table properties. It's not ideal,
2912 but it's the way it's been done until now. */
d48cd3f4 2913 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 2914
cf9c99bc 2915 for (ch = 0; ch < 256; ++ch)
25fe55af 2916 {
cf9c99bc
KH
2917 c = RE_CHAR_TO_MULTIBYTE (ch);
2918 if (! CHAR_BYTE8_P (c)
2919 && re_iswctype (c, cc))
8f924df7 2920 {
cf9c99bc
KH
2921 SET_LIST_BIT (ch);
2922 c1 = TRANSLATE (c);
2923 if (c1 == c)
2924 continue;
2925 if (ASCII_CHAR_P (c1))
2926 SET_LIST_BIT (c1);
2927 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2928 SET_LIST_BIT (c1);
8f924df7 2929 }
25fe55af 2930 }
cf9c99bc
KH
2931 SET_RANGE_TABLE_WORK_AREA_BIT
2932 (range_table_work, re_wctype_to_bit (cc));
2933#endif /* emacs */
6224b623
SM
2934 /* In most cases the matching rule for char classes
2935 only uses the syntax table for multibyte chars,
2936 so that the content of the syntax-table it is not
2937 hardcoded in the range_table. SPACE and WORD are
2938 the two exceptions. */
2939 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2940 bufp->used_syntax = 1;
2941
b18215fc
RS
2942 /* Repeat the loop. */
2943 continue;
25fe55af
RS
2944 }
2945 else
2946 {
ed0767d8
SM
2947 /* Go back to right after the "[:". */
2948 p = class_beg;
25fe55af 2949 SET_LIST_BIT ('[');
b18215fc
RS
2950
2951 /* Because the `:' may starts the range, we
2952 can't simply set bit and repeat the loop.
7814e705 2953 Instead, just set it to C and handle below. */
b18215fc 2954 c = ':';
25fe55af
RS
2955 }
2956 }
b18215fc
RS
2957
2958 if (p < pend && p[0] == '-' && p[1] != ']')
2959 {
2960
2961 /* Discard the `-'. */
2962 PATFETCH (c1);
2963
2964 /* Fetch the character which ends the range. */
2965 PATFETCH (c1);
cf9c99bc
KH
2966#ifdef emacs
2967 if (CHAR_BYTE8_P (c1)
2968 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2969 /* Treat the range from a multibyte character to
2970 raw-byte character as empty. */
2971 c = c1 + 1;
2972#endif /* emacs */
e318085a 2973 }
25fe55af 2974 else
b18215fc
RS
2975 /* Range from C to C. */
2976 c1 = c;
2977
cf9c99bc 2978 if (c > c1)
25fe55af 2979 {
cf9c99bc
KH
2980 if (syntax & RE_NO_EMPTY_RANGES)
2981 FREE_STACK_RETURN (REG_ERANGEX);
2982 /* Else, repeat the loop. */
bf216479 2983 }
6fdd04b0 2984 else
25fe55af 2985 {
cf9c99bc
KH
2986#ifndef emacs
2987 /* Set the range into bitmap */
8f924df7 2988 for (; c <= c1; c++)
b18215fc 2989 {
cf9c99bc
KH
2990 ch = TRANSLATE (c);
2991 if (ch < (1 << BYTEWIDTH))
2992 SET_LIST_BIT (ch);
2993 }
2994#else /* emacs */
2995 if (c < 128)
2996 {
2997 ch = MIN (127, c1);
2998 SETUP_ASCII_RANGE (range_table_work, c, ch);
2999 c = ch + 1;
3000 if (CHAR_BYTE8_P (c1))
3001 c = BYTE8_TO_CHAR (128);
3002 }
3003 if (c <= c1)
3004 {
3005 if (CHAR_BYTE8_P (c))
3006 {
3007 c = CHAR_TO_BYTE8 (c);
3008 c1 = CHAR_TO_BYTE8 (c1);
3009 for (; c <= c1; c++)
3010 SET_LIST_BIT (c);
3011 }
3012 else if (multibyte)
3013 {
3014 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3015 }
3016 else
3017 {
3018 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3019 }
e934739e 3020 }
cf9c99bc 3021#endif /* emacs */
25fe55af 3022 }
e318085a
RS
3023 }
3024
25fe55af 3025 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3026 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3027 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3028 b[-1]--;
3029 b += b[-1];
fa9a63c5 3030
96cc36cc
RS
3031 /* Build real range table from work area. */
3032 if (RANGE_TABLE_WORK_USED (range_table_work)
3033 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3034 {
3035 int i;
3036 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3037
b18215fc 3038 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc 3039 bytes for flags, two for COUNT, and three bytes for
5ac2eb34 3040 each character. */
96cc36cc 3041 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3042
b18215fc
RS
3043 /* Indicate the existence of range table. */
3044 laststart[1] |= 0x80;
fa9a63c5 3045
96cc36cc
RS
3046 /* Store the character class flag bits into the range table.
3047 If not in emacs, these flag bits are always 0. */
3048 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3049 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3050
b18215fc
RS
3051 STORE_NUMBER_AND_INCR (b, used / 2);
3052 for (i = 0; i < used; i++)
3053 STORE_CHARACTER_AND_INCR
3054 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3055 }
25fe55af
RS
3056 }
3057 break;
fa9a63c5
RM
3058
3059
b18215fc 3060 case '(':
25fe55af
RS
3061 if (syntax & RE_NO_BK_PARENS)
3062 goto handle_open;
3063 else
3064 goto normal_char;
fa9a63c5
RM
3065
3066
25fe55af
RS
3067 case ')':
3068 if (syntax & RE_NO_BK_PARENS)
3069 goto handle_close;
3070 else
3071 goto normal_char;
e318085a
RS
3072
3073
25fe55af
RS
3074 case '\n':
3075 if (syntax & RE_NEWLINE_ALT)
3076 goto handle_alt;
3077 else
3078 goto normal_char;
e318085a
RS
3079
3080
b18215fc 3081 case '|':
25fe55af
RS
3082 if (syntax & RE_NO_BK_VBAR)
3083 goto handle_alt;
3084 else
3085 goto normal_char;
3086
3087
3088 case '{':
3089 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3090 goto handle_interval;
3091 else
3092 goto normal_char;
3093
3094
3095 case '\\':
3096 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3097
3098 /* Do not translate the character after the \, so that we can
3099 distinguish, e.g., \B from \b, even if we normally would
3100 translate, e.g., B to b. */
36595814 3101 PATFETCH (c);
25fe55af
RS
3102
3103 switch (c)
3104 {
3105 case '(':
3106 if (syntax & RE_NO_BK_PARENS)
3107 goto normal_backslash;
3108
3109 handle_open:
505bde11
SM
3110 {
3111 int shy = 0;
c69b0314 3112 regnum_t regnum = 0;
505bde11
SM
3113 if (p+1 < pend)
3114 {
3115 /* Look for a special (?...) construct */
ed0767d8 3116 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3117 {
ed0767d8 3118 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3119 while (!shy)
505bde11 3120 {
c69b0314
SM
3121 PATFETCH (c);
3122 switch (c)
3123 {
3124 case ':': shy = 1; break;
3125 case '0':
3126 /* An explicitly specified regnum must start
3127 with non-0. */
3128 if (regnum == 0)
3129 FREE_STACK_RETURN (REG_BADPAT);
3130 case '1': case '2': case '3': case '4':
3131 case '5': case '6': case '7': case '8': case '9':
3132 regnum = 10*regnum + (c - '0'); break;
3133 default:
3134 /* Only (?:...) is supported right now. */
3135 FREE_STACK_RETURN (REG_BADPAT);
3136 }
505bde11
SM
3137 }
3138 }
505bde11
SM
3139 }
3140
3141 if (!shy)
c69b0314
SM
3142 regnum = ++bufp->re_nsub;
3143 else if (regnum)
3144 { /* It's actually not shy, but explicitly numbered. */
3145 shy = 0;
3146 if (regnum > bufp->re_nsub)
3147 bufp->re_nsub = regnum;
3148 else if (regnum > bufp->re_nsub
3149 /* Ideally, we'd want to check that the specified
3150 group can't have matched (i.e. all subgroups
3151 using the same regnum are in other branches of
3152 OR patterns), but we don't currently keep track
3153 of enough info to do that easily. */
3154 || group_in_compile_stack (compile_stack, regnum))
3155 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3156 }
c69b0314
SM
3157 else
3158 /* It's really shy. */
3159 regnum = - bufp->re_nsub;
25fe55af 3160
99633e97
SM
3161 if (COMPILE_STACK_FULL)
3162 {
3163 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3164 compile_stack_elt_t);
3165 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3166
99633e97
SM
3167 compile_stack.size <<= 1;
3168 }
25fe55af 3169
99633e97 3170 /* These are the values to restore when we hit end of this
7814e705 3171 group. They are all relative offsets, so that if the
99633e97
SM
3172 whole pattern moves because of realloc, they will still
3173 be valid. */
3174 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3175 COMPILE_STACK_TOP.fixup_alt_jump
3176 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3177 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3178 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3179
c69b0314
SM
3180 /* Do not push a start_memory for groups beyond the last one
3181 we can represent in the compiled pattern. */
3182 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3183 BUF_PUSH_2 (start_memory, regnum);
3184
3185 compile_stack.avail++;
3186
3187 fixup_alt_jump = 0;
3188 laststart = 0;
3189 begalt = b;
3190 /* If we've reached MAX_REGNUM groups, then this open
3191 won't actually generate any code, so we'll have to
3192 clear pending_exact explicitly. */
3193 pending_exact = 0;
3194 break;
505bde11 3195 }
25fe55af
RS
3196
3197 case ')':
3198 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3199
3200 if (COMPILE_STACK_EMPTY)
505bde11
SM
3201 {
3202 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3203 goto normal_backslash;
3204 else
3205 FREE_STACK_RETURN (REG_ERPAREN);
3206 }
25fe55af
RS
3207
3208 handle_close:
505bde11 3209 FIXUP_ALT_JUMP ();
25fe55af
RS
3210
3211 /* See similar code for backslashed left paren above. */
3212 if (COMPILE_STACK_EMPTY)
505bde11
SM
3213 {
3214 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3215 goto normal_char;
3216 else
3217 FREE_STACK_RETURN (REG_ERPAREN);
3218 }
25fe55af
RS
3219
3220 /* Since we just checked for an empty stack above, this
3221 ``can't happen''. */
3222 assert (compile_stack.avail != 0);
3223 {
3224 /* We don't just want to restore into `regnum', because
3225 later groups should continue to be numbered higher,
7814e705 3226 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3227 regnum_t regnum;
25fe55af
RS
3228
3229 compile_stack.avail--;
3230 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3231 fixup_alt_jump
3232 = COMPILE_STACK_TOP.fixup_alt_jump
3233 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3234 : 0;
3235 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3236 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3237 /* If we've reached MAX_REGNUM groups, then this open
3238 won't actually generate any code, so we'll have to
3239 clear pending_exact explicitly. */
3240 pending_exact = 0;
e318085a 3241
25fe55af 3242 /* We're at the end of the group, so now we know how many
7814e705 3243 groups were inside this one. */
c69b0314
SM
3244 if (regnum <= MAX_REGNUM && regnum > 0)
3245 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3246 }
3247 break;
3248
3249
3250 case '|': /* `\|'. */
3251 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3252 goto normal_backslash;
3253 handle_alt:
3254 if (syntax & RE_LIMITED_OPS)
3255 goto normal_char;
3256
3257 /* Insert before the previous alternative a jump which
7814e705 3258 jumps to this alternative if the former fails. */
25fe55af
RS
3259 GET_BUFFER_SPACE (3);
3260 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3261 pending_exact = 0;
3262 b += 3;
3263
3264 /* The alternative before this one has a jump after it
3265 which gets executed if it gets matched. Adjust that
3266 jump so it will jump to this alternative's analogous
3267 jump (put in below, which in turn will jump to the next
3268 (if any) alternative's such jump, etc.). The last such
3269 jump jumps to the correct final destination. A picture:
3270 _____ _____
3271 | | | |
3272 | v | v
d1dfb56c 3273 a | b | c
25fe55af
RS
3274
3275 If we are at `b', then fixup_alt_jump right now points to a
3276 three-byte space after `a'. We'll put in the jump, set
3277 fixup_alt_jump to right after `b', and leave behind three
3278 bytes which we'll fill in when we get to after `c'. */
3279
505bde11 3280 FIXUP_ALT_JUMP ();
25fe55af
RS
3281
3282 /* Mark and leave space for a jump after this alternative,
3283 to be filled in later either by next alternative or
3284 when know we're at the end of a series of alternatives. */
3285 fixup_alt_jump = b;
3286 GET_BUFFER_SPACE (3);
3287 b += 3;
3288
3289 laststart = 0;
3290 begalt = b;
3291 break;
3292
3293
3294 case '{':
3295 /* If \{ is a literal. */
3296 if (!(syntax & RE_INTERVALS)
3297 /* If we're at `\{' and it's not the open-interval
3298 operator. */
4bb91c68 3299 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3300 goto normal_backslash;
3301
3302 handle_interval:
3303 {
3304 /* If got here, then the syntax allows intervals. */
3305
3306 /* At least (most) this many matches must be made. */
99633e97 3307 int lower_bound = 0, upper_bound = -1;
25fe55af 3308
ed0767d8 3309 beg_interval = p;
25fe55af 3310
4618713a 3311 GET_INTERVAL_COUNT (lower_bound);
25fe55af
RS
3312
3313 if (c == ',')
a3fab1ec 3314 GET_INTERVAL_COUNT (upper_bound);
25fe55af
RS
3315 else
3316 /* Interval such as `{1}' => match exactly once. */
3317 upper_bound = lower_bound;
3318
a3fab1ec
PE
3319 if (lower_bound < 0
3320 || (0 <= upper_bound && upper_bound < lower_bound))
3321 FREE_STACK_RETURN (REG_BADBR);
3322
25fe55af
RS
3323 if (!(syntax & RE_NO_BK_BRACES))
3324 {
4bb91c68
SM
3325 if (c != '\\')
3326 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3327 if (p == pend)
3328 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3329 PATFETCH (c);
3330 }
3331
3332 if (c != '}')
4bb91c68 3333 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3334
3335 /* We just parsed a valid interval. */
3336
3337 /* If it's invalid to have no preceding re. */
3338 if (!laststart)
3339 {
3340 if (syntax & RE_CONTEXT_INVALID_OPS)
3341 FREE_STACK_RETURN (REG_BADRPT);
3342 else if (syntax & RE_CONTEXT_INDEP_OPS)
3343 laststart = b;
3344 else
3345 goto unfetch_interval;
3346 }
3347
6df42991
SM
3348 if (upper_bound == 0)
3349 /* If the upper bound is zero, just drop the sub pattern
3350 altogether. */
3351 b = laststart;
3352 else if (lower_bound == 1 && upper_bound == 1)
3353 /* Just match it once: nothing to do here. */
3354 ;
3355
3356 /* Otherwise, we have a nontrivial interval. When
3357 we're all done, the pattern will look like:
3358 set_number_at <jump count> <upper bound>
3359 set_number_at <succeed_n count> <lower bound>
3360 succeed_n <after jump addr> <succeed_n count>
3361 <body of loop>
3362 jump_n <succeed_n addr> <jump count>
3363 (The upper bound and `jump_n' are omitted if
3364 `upper_bound' is 1, though.) */
3365 else
3366 { /* If the upper bound is > 1, we need to insert
3367 more at the end of the loop. */
3368 unsigned int nbytes = (upper_bound < 0 ? 3
3369 : upper_bound > 1 ? 5 : 0);
3370 unsigned int startoffset = 0;
3371
3372 GET_BUFFER_SPACE (20); /* We might use less. */
3373
3374 if (lower_bound == 0)
3375 {
3376 /* A succeed_n that starts with 0 is really a
3377 a simple on_failure_jump_loop. */
3378 INSERT_JUMP (on_failure_jump_loop, laststart,
3379 b + 3 + nbytes);
3380 b += 3;
3381 }
3382 else
3383 {
3384 /* Initialize lower bound of the `succeed_n', even
3385 though it will be set during matching by its
3386 attendant `set_number_at' (inserted next),
3387 because `re_compile_fastmap' needs to know.
3388 Jump to the `jump_n' we might insert below. */
3389 INSERT_JUMP2 (succeed_n, laststart,
3390 b + 5 + nbytes,
3391 lower_bound);
3392 b += 5;
3393
3394 /* Code to initialize the lower bound. Insert
7814e705 3395 before the `succeed_n'. The `5' is the last two
6df42991
SM
3396 bytes of this `set_number_at', plus 3 bytes of
3397 the following `succeed_n'. */
3398 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3399 b += 5;
3400 startoffset += 5;
3401 }
3402
3403 if (upper_bound < 0)
3404 {
3405 /* A negative upper bound stands for infinity,
3406 in which case it degenerates to a plain jump. */
3407 STORE_JUMP (jump, b, laststart + startoffset);
3408 b += 3;
3409 }
3410 else if (upper_bound > 1)
3411 { /* More than one repetition is allowed, so
3412 append a backward jump to the `succeed_n'
3413 that starts this interval.
3414
3415 When we've reached this during matching,
3416 we'll have matched the interval once, so
3417 jump back only `upper_bound - 1' times. */
3418 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3419 upper_bound - 1);
3420 b += 5;
3421
3422 /* The location we want to set is the second
3423 parameter of the `jump_n'; that is `b-2' as
3424 an absolute address. `laststart' will be
3425 the `set_number_at' we're about to insert;
3426 `laststart+3' the number to set, the source
3427 for the relative address. But we are
3428 inserting into the middle of the pattern --
3429 so everything is getting moved up by 5.
3430 Conclusion: (b - 2) - (laststart + 3) + 5,
3431 i.e., b - laststart.
3432
3433 We insert this at the beginning of the loop
3434 so that if we fail during matching, we'll
3435 reinitialize the bounds. */
3436 insert_op2 (set_number_at, laststart, b - laststart,
3437 upper_bound - 1, b);
3438 b += 5;
3439 }
3440 }
25fe55af
RS
3441 pending_exact = 0;
3442 beg_interval = NULL;
3443 }
3444 break;
3445
3446 unfetch_interval:
3447 /* If an invalid interval, match the characters as literals. */
3448 assert (beg_interval);
3449 p = beg_interval;
3450 beg_interval = NULL;
3451
3452 /* normal_char and normal_backslash need `c'. */
ed0767d8 3453 c = '{';
25fe55af
RS
3454
3455 if (!(syntax & RE_NO_BK_BRACES))
3456 {
ed0767d8
SM
3457 assert (p > pattern && p[-1] == '\\');
3458 goto normal_backslash;
25fe55af 3459 }
ed0767d8
SM
3460 else
3461 goto normal_char;
e318085a 3462
b18215fc 3463#ifdef emacs
25fe55af 3464 /* There is no way to specify the before_dot and after_dot
7814e705 3465 operators. rms says this is ok. --karl */
25fe55af 3466 case '=':
5ac2eb34 3467 laststart = b;
25fe55af
RS
3468 BUF_PUSH (at_dot);
3469 break;
3470
3471 case 's':
3472 laststart = b;
3473 PATFETCH (c);
3474 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3475 break;
3476
3477 case 'S':
3478 laststart = b;
3479 PATFETCH (c);
3480 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3481 break;
b18215fc
RS
3482
3483 case 'c':
3484 laststart = b;
36595814 3485 PATFETCH (c);
b18215fc
RS
3486 BUF_PUSH_2 (categoryspec, c);
3487 break;
e318085a 3488
b18215fc
RS
3489 case 'C':
3490 laststart = b;
36595814 3491 PATFETCH (c);
b18215fc
RS
3492 BUF_PUSH_2 (notcategoryspec, c);
3493 break;
3494#endif /* emacs */
e318085a 3495
e318085a 3496
25fe55af 3497 case 'w':
4bb91c68
SM
3498 if (syntax & RE_NO_GNU_OPS)
3499 goto normal_char;
25fe55af 3500 laststart = b;
1fb352e0 3501 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3502 break;
e318085a 3503
e318085a 3504
25fe55af 3505 case 'W':
4bb91c68
SM
3506 if (syntax & RE_NO_GNU_OPS)
3507 goto normal_char;
25fe55af 3508 laststart = b;
1fb352e0 3509 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3510 break;
e318085a
RS
3511
3512
25fe55af 3513 case '<':
4bb91c68
SM
3514 if (syntax & RE_NO_GNU_OPS)
3515 goto normal_char;
5ac2eb34 3516 laststart = b;
25fe55af
RS
3517 BUF_PUSH (wordbeg);
3518 break;
e318085a 3519
25fe55af 3520 case '>':
4bb91c68
SM
3521 if (syntax & RE_NO_GNU_OPS)
3522 goto normal_char;
5ac2eb34 3523 laststart = b;
25fe55af
RS
3524 BUF_PUSH (wordend);
3525 break;
e318085a 3526
669fa600
SM
3527 case '_':
3528 if (syntax & RE_NO_GNU_OPS)
3529 goto normal_char;
3530 laststart = b;
3531 PATFETCH (c);
3532 if (c == '<')
3533 BUF_PUSH (symbeg);
3534 else if (c == '>')
3535 BUF_PUSH (symend);
3536 else
3537 FREE_STACK_RETURN (REG_BADPAT);
3538 break;
3539
25fe55af 3540 case 'b':
4bb91c68
SM
3541 if (syntax & RE_NO_GNU_OPS)
3542 goto normal_char;
25fe55af
RS
3543 BUF_PUSH (wordbound);
3544 break;
e318085a 3545
25fe55af 3546 case 'B':
4bb91c68
SM
3547 if (syntax & RE_NO_GNU_OPS)
3548 goto normal_char;
25fe55af
RS
3549 BUF_PUSH (notwordbound);
3550 break;
fa9a63c5 3551
25fe55af 3552 case '`':
4bb91c68
SM
3553 if (syntax & RE_NO_GNU_OPS)
3554 goto normal_char;
25fe55af
RS
3555 BUF_PUSH (begbuf);
3556 break;
e318085a 3557
25fe55af 3558 case '\'':
4bb91c68
SM
3559 if (syntax & RE_NO_GNU_OPS)
3560 goto normal_char;
25fe55af
RS
3561 BUF_PUSH (endbuf);
3562 break;
e318085a 3563
25fe55af
RS
3564 case '1': case '2': case '3': case '4': case '5':
3565 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3566 {
3567 regnum_t reg;
e318085a 3568
0cdd06f8
SM
3569 if (syntax & RE_NO_BK_REFS)
3570 goto normal_backslash;
e318085a 3571
0cdd06f8 3572 reg = c - '0';
e318085a 3573
c69b0314
SM
3574 if (reg > bufp->re_nsub || reg < 1
3575 /* Can't back reference to a subexp before its end. */
3576 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3577 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3578
0cdd06f8
SM
3579 laststart = b;
3580 BUF_PUSH_2 (duplicate, reg);
3581 }
25fe55af 3582 break;
e318085a 3583
e318085a 3584
25fe55af
RS
3585 case '+':
3586 case '?':
3587 if (syntax & RE_BK_PLUS_QM)
3588 goto handle_plus;
3589 else
3590 goto normal_backslash;
3591
3592 default:
3593 normal_backslash:
3594 /* You might think it would be useful for \ to mean
3595 not to translate; but if we don't translate it
4bb91c68 3596 it will never match anything. */
25fe55af
RS
3597 goto normal_char;
3598 }
3599 break;
fa9a63c5
RM
3600
3601
3602 default:
25fe55af 3603 /* Expects the character in `c'. */
fa9a63c5 3604 normal_char:
36595814 3605 /* If no exactn currently being built. */
25fe55af 3606 if (!pending_exact
fa9a63c5 3607
25fe55af
RS
3608 /* If last exactn not at current position. */
3609 || pending_exact + *pending_exact + 1 != b
5e69f11e 3610
25fe55af 3611 /* We have only one byte following the exactn for the count. */
2d1675e4 3612 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3613
7814e705 3614 /* If followed by a repetition operator. */
9d99031f 3615 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3616 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3617 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3618 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3619 || ((syntax & RE_INTERVALS)
25fe55af 3620 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3621 ? p != pend && *p == '{'
3622 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3623 {
3624 /* Start building a new exactn. */
5e69f11e 3625
25fe55af 3626 laststart = b;
fa9a63c5
RM
3627
3628 BUF_PUSH_2 (exactn, 0);
3629 pending_exact = b - 1;
25fe55af 3630 }
5e69f11e 3631
2d1675e4
SM
3632 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3633 {
e0277a47
KH
3634 int len;
3635
cf9c99bc 3636 if (multibyte)
6fdd04b0 3637 {
cf9c99bc 3638 c = TRANSLATE (c);
6fdd04b0
KH
3639 len = CHAR_STRING (c, b);
3640 b += len;
3641 }
e0277a47 3642 else
6fdd04b0 3643 {
cf9c99bc
KH
3644 c1 = RE_CHAR_TO_MULTIBYTE (c);
3645 if (! CHAR_BYTE8_P (c1))
3646 {
3647 re_wchar_t c2 = TRANSLATE (c1);
3648
3649 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3650 c = c1;
409f2919 3651 }
6fdd04b0
KH
3652 *b++ = c;
3653 len = 1;
3654 }
2d1675e4
SM
3655 (*pending_exact) += len;
3656 }
3657
fa9a63c5 3658 break;
25fe55af 3659 } /* switch (c) */
fa9a63c5
RM
3660 } /* while p != pend */
3661
5e69f11e 3662
fa9a63c5 3663 /* Through the pattern now. */
5e69f11e 3664
505bde11 3665 FIXUP_ALT_JUMP ();
fa9a63c5 3666
5e69f11e 3667 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3668 FREE_STACK_RETURN (REG_EPAREN);
3669
3670 /* If we don't want backtracking, force success
3671 the first time we reach the end of the compiled pattern. */
3672 if (syntax & RE_NO_POSIX_BACKTRACKING)
3673 BUF_PUSH (succeed);
3674
fa9a63c5
RM
3675 /* We have succeeded; set the length of the buffer. */
3676 bufp->used = b - bufp->buffer;
3677
3678#ifdef DEBUG
99633e97 3679 if (debug > 0)
fa9a63c5 3680 {
505bde11 3681 re_compile_fastmap (bufp);
dc4a2ee0 3682 DEBUG_PRINT ("\nCompiled pattern: \n");
fa9a63c5
RM
3683 print_compiled_pattern (bufp);
3684 }
99633e97 3685 debug--;
fa9a63c5
RM
3686#endif /* DEBUG */
3687
3688#ifndef MATCH_MAY_ALLOCATE
3689 /* Initialize the failure stack to the largest possible stack. This
3690 isn't necessary unless we're trying to avoid calling alloca in
3691 the search and match routines. */
3692 {
3693 int num_regs = bufp->re_nsub + 1;
3694
320a2a73 3695 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3696 {
a26f4ccd 3697 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
38182d90
PE
3698 falk_stack.stack = realloc (fail_stack.stack,
3699 fail_stack.size * sizeof *falk_stack.stack);
fa9a63c5
RM
3700 }
3701
3702 regex_grow_registers (num_regs);
3703 }
3704#endif /* not MATCH_MAY_ALLOCATE */
3705
839966f3 3706 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3707} /* regex_compile */
3708\f
3709/* Subroutines for `regex_compile'. */
3710
7814e705 3711/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3712
3713static void
971de7fb 3714store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3715{
3716 *loc = (unsigned char) op;
3717 STORE_NUMBER (loc + 1, arg);
3718}
3719
3720
3721/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3722
3723static void
971de7fb 3724store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3725{
3726 *loc = (unsigned char) op;
3727 STORE_NUMBER (loc + 1, arg1);
3728 STORE_NUMBER (loc + 3, arg2);
3729}
3730
3731
3732/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3733 for OP followed by two-byte integer parameter ARG. */
3734
3735static void
971de7fb 3736insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3737{
3738 register unsigned char *pfrom = end;
3739 register unsigned char *pto = end + 3;
3740
3741 while (pfrom != loc)
3742 *--pto = *--pfrom;
5e69f11e 3743
fa9a63c5
RM
3744 store_op1 (op, loc, arg);
3745}
3746
3747
3748/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3749
3750static void
971de7fb 3751insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3752{
3753 register unsigned char *pfrom = end;
3754 register unsigned char *pto = end + 5;
3755
3756 while (pfrom != loc)
3757 *--pto = *--pfrom;
5e69f11e 3758
fa9a63c5
RM
3759 store_op2 (op, loc, arg1, arg2);
3760}
3761
3762
3763/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3764 after an alternative or a begin-subexpression. We assume there is at
3765 least one character before the ^. */
3766
3767static boolean
29abe551 3768at_begline_loc_p (const_re_char *pattern, const_re_char *p, reg_syntax_t syntax)
fa9a63c5 3769{
01618498 3770 re_char *prev = p - 2;
95988fcf 3771 boolean odd_backslashes;
5e69f11e 3772
95988fcf
AS
3773 /* After a subexpression? */
3774 if (*prev == '(')
3775 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3776
3777 /* After an alternative? */
3778 else if (*prev == '|')
3779 odd_backslashes = (syntax & RE_NO_BK_VBAR) == 0;
3780
3781 /* After a shy subexpression? */
3782 else if (*prev == ':' && (syntax & RE_SHY_GROUPS))
3783 {
3784 /* Skip over optional regnum. */
3785 while (prev - 1 >= pattern && prev[-1] >= '0' && prev[-1] <= '9')
3786 --prev;
3787
3788 if (!(prev - 2 >= pattern
3789 && prev[-1] == '?' && prev[-2] == '('))
3790 return false;
3791 prev -= 2;
3792 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3793 }
3794 else
3795 return false;
3796
3797 /* Count the number of preceding backslashes. */
3798 p = prev;
3799 while (prev - 1 >= pattern && prev[-1] == '\\')
3800 --prev;
3801 return (p - prev) & odd_backslashes;
fa9a63c5
RM
3802}
3803
3804
3805/* The dual of at_begline_loc_p. This one is for $. We assume there is
3806 at least one character after the $, i.e., `P < PEND'. */
3807
3808static boolean
29abe551 3809at_endline_loc_p (const_re_char *p, const_re_char *pend, reg_syntax_t syntax)
fa9a63c5 3810{
01618498 3811 re_char *next = p;
fa9a63c5 3812 boolean next_backslash = *next == '\\';
01618498 3813 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3814
fa9a63c5
RM
3815 return
3816 /* Before a subexpression? */
3817 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3818 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3819 /* Before an alternative? */
3820 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3821 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3822}
3823
3824
5e69f11e 3825/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3826 false if it's not. */
3827
3828static boolean
971de7fb 3829group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5 3830{
d1dfb56c 3831 ssize_t this_element;
fa9a63c5 3832
5e69f11e
RM
3833 for (this_element = compile_stack.avail - 1;
3834 this_element >= 0;
fa9a63c5
RM
3835 this_element--)
3836 if (compile_stack.stack[this_element].regnum == regnum)
3837 return true;
3838
3839 return false;
3840}
fa9a63c5 3841\f
f6a3f532
SM
3842/* analyse_first.
3843 If fastmap is non-NULL, go through the pattern and fill fastmap
3844 with all the possible leading chars. If fastmap is NULL, don't
3845 bother filling it up (obviously) and only return whether the
3846 pattern could potentially match the empty string.
3847
3848 Return 1 if p..pend might match the empty string.
3849 Return 0 if p..pend matches at least one char.
01618498 3850 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3851
3852static int
29abe551
PE
3853analyse_first (const_re_char *p, const_re_char *pend, char *fastmap,
3854 const int multibyte)
fa9a63c5 3855{
505bde11 3856 int j, k;
1fb352e0 3857 boolean not;
fa9a63c5 3858
b18215fc 3859 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3860 flag is set true. */
b18215fc
RS
3861 boolean match_any_multibyte_characters = false;
3862
f6a3f532 3863 assert (p);
5e69f11e 3864
505bde11
SM
3865 /* The loop below works as follows:
3866 - It has a working-list kept in the PATTERN_STACK and which basically
3867 starts by only containing a pointer to the first operation.
3868 - If the opcode we're looking at is a match against some set of
3869 chars, then we add those chars to the fastmap and go on to the
3870 next work element from the worklist (done via `break').
3871 - If the opcode is a control operator on the other hand, we either
3872 ignore it (if it's meaningless at this point, such as `start_memory')
3873 or execute it (if it's a jump). If the jump has several destinations
3874 (i.e. `on_failure_jump'), then we push the other destination onto the
3875 worklist.
3876 We guarantee termination by ignoring backward jumps (more or less),
3877 so that `p' is monotonically increasing. More to the point, we
3878 never set `p' (or push) anything `<= p1'. */
3879
01618498 3880 while (p < pend)
fa9a63c5 3881 {
505bde11
SM
3882 /* `p1' is used as a marker of how far back a `on_failure_jump'
3883 can go without being ignored. It is normally equal to `p'
3884 (which prevents any backward `on_failure_jump') except right
3885 after a plain `jump', to allow patterns such as:
3886 0: jump 10
3887 3..9: <body>
3888 10: on_failure_jump 3
3889 as used for the *? operator. */
01618498 3890 re_char *p1 = p;
5e69f11e 3891
7393bcbb 3892 switch (*p++)
fa9a63c5 3893 {
f6a3f532 3894 case succeed:
01618498 3895 return 1;
fa9a63c5 3896
fa9a63c5 3897 case duplicate:
505bde11
SM
3898 /* If the first character has to match a backreference, that means
3899 that the group was empty (since it already matched). Since this
3900 is the only case that interests us here, we can assume that the
3901 backreference must match the empty string. */
3902 p++;
3903 continue;
fa9a63c5
RM
3904
3905
3906 /* Following are the cases which match a character. These end
7814e705 3907 with `break'. */
fa9a63c5
RM
3908
3909 case exactn:
e0277a47 3910 if (fastmap)
cf9c99bc
KH
3911 {
3912 /* If multibyte is nonzero, the first byte of each
3913 character is an ASCII or a leading code. Otherwise,
3914 each byte is a character. Thus, this works in both
3915 cases. */
3916 fastmap[p[1]] = 1;
3917 if (! multibyte)
3918 {
3919 /* For the case of matching this unibyte regex
3920 against multibyte, we must set a leading code of
3921 the corresponding multibyte character. */
3922 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3923
86e893e3 3924 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
3925 }
3926 }
fa9a63c5
RM
3927 break;
3928
3929
1fb352e0
SM
3930 case anychar:
3931 /* We could put all the chars except for \n (and maybe \0)
3932 but we don't bother since it is generally not worth it. */
f6a3f532 3933 if (!fastmap) break;
01618498 3934 return -1;
fa9a63c5
RM
3935
3936
b18215fc 3937 case charset_not:
1fb352e0 3938 if (!fastmap) break;
bf216479
KH
3939 {
3940 /* Chars beyond end of bitmap are possible matches. */
bf216479 3941 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 3942 j < (1 << BYTEWIDTH); j++)
bf216479
KH
3943 fastmap[j] = 1;
3944 }
3945
1fb352e0
SM
3946 /* Fallthrough */
3947 case charset:
3948 if (!fastmap) break;
3949 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3950 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3951 j >= 0; j--)
1fb352e0 3952 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 3953 fastmap[j] = 1;
b18215fc 3954
6482db2e
KH
3955#ifdef emacs
3956 if (/* Any leading code can possibly start a character
1fb352e0 3957 which doesn't match the specified set of characters. */
6482db2e 3958 not
409f2919 3959 ||
6482db2e
KH
3960 /* If we can match a character class, we can match any
3961 multibyte characters. */
3962 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3963 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3964
b18215fc 3965 {
b18215fc
RS
3966 if (match_any_multibyte_characters == false)
3967 {
6482db2e
KH
3968 for (j = MIN_MULTIBYTE_LEADING_CODE;
3969 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 3970 fastmap[j] = 1;
b18215fc
RS
3971 match_any_multibyte_characters = true;
3972 }
3973 }
b18215fc 3974
1fb352e0
SM
3975 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3976 && match_any_multibyte_characters == false)
3977 {
bf216479 3978 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 3979 multibyte character in the range table. */
1fb352e0 3980 int c, count;
bf216479 3981 unsigned char lc1, lc2;
b18215fc 3982
1fb352e0 3983 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3984 bits for a character class. */
1fb352e0 3985 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3986
1fb352e0
SM
3987 /* Extract the number of ranges in range table into COUNT. */
3988 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 3989 for (; count > 0; count--, p += 3)
1fb352e0 3990 {
9117d724
KH
3991 /* Extract the start and end of each range. */
3992 EXTRACT_CHARACTER (c, p);
bf216479 3993 lc1 = CHAR_LEADING_CODE (c);
9117d724 3994 p += 3;
1fb352e0 3995 EXTRACT_CHARACTER (c, p);
bf216479
KH
3996 lc2 = CHAR_LEADING_CODE (c);
3997 for (j = lc1; j <= lc2; j++)
9117d724 3998 fastmap[j] = 1;
1fb352e0
SM
3999 }
4000 }
6482db2e 4001#endif
b18215fc
RS
4002 break;
4003
1fb352e0
SM
4004 case syntaxspec:
4005 case notsyntaxspec:
4006 if (!fastmap) break;
4007#ifndef emacs
4008 not = (re_opcode_t)p[-1] == notsyntaxspec;
4009 k = *p++;
4010 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4011 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4012 fastmap[j] = 1;
b18215fc 4013 break;
1fb352e0 4014#else /* emacs */
b18215fc
RS
4015 /* This match depends on text properties. These end with
4016 aborting optimizations. */
01618498 4017 return -1;
b18215fc
RS
4018
4019 case categoryspec:
b18215fc 4020 case notcategoryspec:
1fb352e0
SM
4021 if (!fastmap) break;
4022 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4023 k = *p++;
6482db2e 4024 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4025 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4026 fastmap[j] = 1;
4027
6482db2e
KH
4028 /* Any leading code can possibly start a character which
4029 has or doesn't has the specified category. */
4030 if (match_any_multibyte_characters == false)
6fdd04b0 4031 {
6482db2e
KH
4032 for (j = MIN_MULTIBYTE_LEADING_CODE;
4033 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4034 fastmap[j] = 1;
4035 match_any_multibyte_characters = true;
6fdd04b0 4036 }
b18215fc
RS
4037 break;
4038
fa9a63c5 4039 /* All cases after this match the empty string. These end with
25fe55af 4040 `continue'. */
fa9a63c5 4041
fa9a63c5
RM
4042 case before_dot:
4043 case at_dot:
4044 case after_dot:
1fb352e0 4045#endif /* !emacs */
25fe55af
RS
4046 case no_op:
4047 case begline:
4048 case endline:
fa9a63c5
RM
4049 case begbuf:
4050 case endbuf:
4051 case wordbound:
4052 case notwordbound:
4053 case wordbeg:
4054 case wordend:
669fa600
SM
4055 case symbeg:
4056 case symend:
25fe55af 4057 continue;
fa9a63c5
RM
4058
4059
fa9a63c5 4060 case jump:
25fe55af 4061 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4062 if (j < 0)
4063 /* Backward jumps can only go back to code that we've already
4064 visited. `re_compile' should make sure this is true. */
4065 break;
25fe55af 4066 p += j;
7393bcbb 4067 switch (*p)
505bde11
SM
4068 {
4069 case on_failure_jump:
4070 case on_failure_keep_string_jump:
505bde11 4071 case on_failure_jump_loop:
0683b6fa 4072 case on_failure_jump_nastyloop:
505bde11
SM
4073 case on_failure_jump_smart:
4074 p++;
4075 break;
4076 default:
4077 continue;
4078 };
4079 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4080 to jump back to "just after here". */
4081 /* Fallthrough */
fa9a63c5 4082
25fe55af
RS
4083 case on_failure_jump:
4084 case on_failure_keep_string_jump:
0683b6fa 4085 case on_failure_jump_nastyloop:
505bde11
SM
4086 case on_failure_jump_loop:
4087 case on_failure_jump_smart:
25fe55af 4088 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4089 if (p + j <= p1)
ed0767d8 4090 ; /* Backward jump to be ignored. */
01618498
SM
4091 else
4092 { /* We have to look down both arms.
4093 We first go down the "straight" path so as to minimize
4094 stack usage when going through alternatives. */
4095 int r = analyse_first (p, pend, fastmap, multibyte);
4096 if (r) return r;
4097 p += j;
4098 }
25fe55af 4099 continue;
fa9a63c5
RM
4100
4101
ed0767d8
SM
4102 case jump_n:
4103 /* This code simply does not properly handle forward jump_n. */
4104 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4105 p += 4;
4106 /* jump_n can either jump or fall through. The (backward) jump
4107 case has already been handled, so we only need to look at the
4108 fallthrough case. */
4109 continue;
177c0ea7 4110
fa9a63c5 4111 case succeed_n:
ed0767d8
SM
4112 /* If N == 0, it should be an on_failure_jump_loop instead. */
4113 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4114 p += 4;
4115 /* We only care about one iteration of the loop, so we don't
4116 need to consider the case where this behaves like an
4117 on_failure_jump. */
25fe55af 4118 continue;
fa9a63c5
RM
4119
4120
4121 case set_number_at:
25fe55af
RS
4122 p += 4;
4123 continue;
fa9a63c5
RM
4124
4125
4126 case start_memory:
25fe55af 4127 case stop_memory:
505bde11 4128 p += 1;
fa9a63c5
RM
4129 continue;
4130
4131
4132 default:
25fe55af
RS
4133 abort (); /* We have listed all the cases. */
4134 } /* switch *p++ */
fa9a63c5
RM
4135
4136 /* Getting here means we have found the possible starting
25fe55af 4137 characters for one path of the pattern -- and that the empty
7814e705 4138 string does not match. We need not follow this path further. */
01618498 4139 return 0;
fa9a63c5
RM
4140 } /* while p */
4141
01618498
SM
4142 /* We reached the end without matching anything. */
4143 return 1;
4144
f6a3f532
SM
4145} /* analyse_first */
4146\f
4147/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4148 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4149 characters can start a string that matches the pattern. This fastmap
4150 is used by re_search to skip quickly over impossible starting points.
4151
4152 Character codes above (1 << BYTEWIDTH) are not represented in the
4153 fastmap, but the leading codes are represented. Thus, the fastmap
4154 indicates which character sets could start a match.
4155
4156 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4157 area as BUFP->fastmap.
4158
4159 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4160 the pattern buffer.
4161
4162 Returns 0 if we succeed, -2 if an internal error. */
4163
4164int
971de7fb 4165re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4166{
4167 char *fastmap = bufp->fastmap;
4168 int analysis;
4169
4170 assert (fastmap && bufp->buffer);
4171
72af86bd 4172 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4173 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4174
4175 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4176 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4177 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4178 return 0;
4179} /* re_compile_fastmap */
4180\f
4181/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4182 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4183 this memory for recording register information. STARTS and ENDS
4184 must be allocated using the malloc library routine, and must each
4185 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4186
4187 If NUM_REGS == 0, then subsequent matches should allocate their own
4188 register data.
4189
4190 Unless this function is called, the first search or match using
4191 PATTERN_BUFFER will allocate its own register data, without
4192 freeing the old data. */
4193
4194void
971de7fb 4195re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4196{
4197 if (num_regs)
4198 {
4199 bufp->regs_allocated = REGS_REALLOCATE;
4200 regs->num_regs = num_regs;
4201 regs->start = starts;
4202 regs->end = ends;
4203 }
4204 else
4205 {
4206 bufp->regs_allocated = REGS_UNALLOCATED;
4207 regs->num_regs = 0;
7d652d97 4208 regs->start = regs->end = 0;
fa9a63c5
RM
4209 }
4210}
c0f9ea08 4211WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4212\f
7814e705 4213/* Searching routines. */
fa9a63c5
RM
4214
4215/* Like re_search_2, below, but only one string is specified, and
4216 doesn't let you say where to stop matching. */
4217
d1dfb56c
EZ
4218regoff_t
4219re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4220 ssize_t startpos, ssize_t range, struct re_registers *regs)
fa9a63c5 4221{
5e69f11e 4222 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4223 regs, size);
4224}
c0f9ea08 4225WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4226
70806df6
KH
/* Start of whichever string holds position P of the virtual
   concatenation: string1 while P is below size1, string2 otherwise.  */
#define HEAD_ADDR_VSTRING(P)		\
  (((P) < size1 ? string1 : string2))

/* Address of the byte at position POS of the virtual concatenation.
   For positions in string2 the base is biased by -size1 so that adding
   POS lands on the right byte.  */
#define POS_ADDR_VSTRING(POS)					\
  (((POS) < size1 ? string1 : string2 - size1) + (POS))
fa9a63c5
RM
4234
4235/* Using the compiled pattern in BUFP->buffer, first tries to match the
4236 virtual concatenation of STRING1 and STRING2, starting first at index
4237 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4238
fa9a63c5 4239 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4240
fa9a63c5
RM
4241 RANGE is how far to scan while trying to match. RANGE = 0 means try
4242 only at STARTPOS; in general, the last start tried is STARTPOS +
4243 RANGE.
5e69f11e 4244
fa9a63c5
RM
4245 In REGS, return the indices of the virtual concatenation of STRING1
4246 and STRING2 that matched the entire BUFP->buffer and its contained
4247 subexpressions.
5e69f11e 4248
fa9a63c5
RM
4249 Do not consider matching one past the index STOP in the virtual
4250 concatenation of STRING1 and STRING2.
4251
4252 We return either the position in the strings at which the match was
4253 found, -1 if no match, or -2 if error (such as failure
4254 stack overflow). */
4255
d1dfb56c
EZ
4256regoff_t
4257re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4258 const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4259 struct re_registers *regs, ssize_t stop)
fa9a63c5 4260{
d1dfb56c 4261 regoff_t val;
66f0296e
SM
4262 re_char *string1 = (re_char*) str1;
4263 re_char *string2 = (re_char*) str2;
fa9a63c5 4264 register char *fastmap = bufp->fastmap;
6676cb1c 4265 register RE_TRANSLATE_TYPE translate = bufp->translate;
d1dfb56c
EZ
4266 size_t total_size = size1 + size2;
4267 ssize_t endpos = startpos + range;
c0f9ea08 4268 boolean anchored_start;
cf9c99bc
KH
4269 /* Nonzero if we are searching multibyte string. */
4270 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4271
fa9a63c5
RM
4272 /* Check for out-of-range STARTPOS. */
4273 if (startpos < 0 || startpos > total_size)
4274 return -1;
5e69f11e 4275
fa9a63c5 4276 /* Fix up RANGE if it might eventually take us outside
34597fa9 4277 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4278 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4279 if (endpos < 0)
4280 range = 0 - startpos;
fa9a63c5
RM
4281 else if (endpos > total_size)
4282 range = total_size - startpos;
4283
4284 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4285 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4286 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4287 {
4288 if (startpos > 0)
4289 return -1;
4290 else
7b140fd7 4291 range = 0;
fa9a63c5
RM
4292 }
4293
ae4788a8
RS
4294#ifdef emacs
4295 /* In a forward search for something that starts with \=.
4296 don't keep searching past point. */
4297 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4298 {
7b140fd7
RS
4299 range = PT_BYTE - BEGV_BYTE - startpos;
4300 if (range < 0)
ae4788a8
RS
4301 return -1;
4302 }
4303#endif /* emacs */
4304
fa9a63c5
RM
4305 /* Update the fastmap now if not correct already. */
4306 if (fastmap && !bufp->fastmap_accurate)
01618498 4307 re_compile_fastmap (bufp);
5e69f11e 4308
c8499ba5 4309 /* See whether the pattern is anchored. */
c0f9ea08 4310 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4311
b18215fc 4312#ifdef emacs
d48cd3f4 4313 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4314 {
d1dfb56c 4315 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4316
4317 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4318 }
b18215fc
RS
4319#endif
4320
fa9a63c5
RM
4321 /* Loop through the string, looking for a place to start matching. */
4322 for (;;)
5e69f11e 4323 {
c8499ba5
RS
4324 /* If the pattern is anchored,
4325 skip quickly past places we cannot match.
4326 We don't bother to treat startpos == 0 specially
4327 because that case doesn't repeat. */
4328 if (anchored_start && startpos > 0)
4329 {
c0f9ea08
SM
4330 if (! ((startpos <= size1 ? string1[startpos - 1]
4331 : string2[startpos - size1 - 1])
4332 == '\n'))
c8499ba5
RS
4333 goto advance;
4334 }
4335
fa9a63c5 4336 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4337 cannot be the start of a match. If the pattern can match the
4338 null string, however, we don't need to skip characters; we want
7814e705 4339 the first null string. */
fa9a63c5
RM
4340 if (fastmap && startpos < total_size && !bufp->can_be_null)
4341 {
66f0296e 4342 register re_char *d;
01618498 4343 register re_wchar_t buf_ch;
e934739e
RS
4344
4345 d = POS_ADDR_VSTRING (startpos);
4346
7814e705 4347 if (range > 0) /* Searching forwards. */
fa9a63c5 4348 {
fa9a63c5 4349 register int lim = 0;
d1dfb56c 4350 ssize_t irange = range;
fa9a63c5 4351
25fe55af
RS
4352 if (startpos < size1 && startpos + range >= size1)
4353 lim = range - (size1 - startpos);
fa9a63c5 4354
25fe55af
RS
4355 /* Written out as an if-else to avoid testing `translate'
4356 inside the loop. */
28ae27ae
AS
4357 if (RE_TRANSLATE_P (translate))
4358 {
e934739e
RS
4359 if (multibyte)
4360 while (range > lim)
4361 {
4362 int buf_charlen;
4363
62a6e103 4364 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4365 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4366 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4367 break;
4368
4369 range -= buf_charlen;
4370 d += buf_charlen;
4371 }
4372 else
bf216479 4373 while (range > lim)
33c46939 4374 {
cf9c99bc
KH
4375 register re_wchar_t ch, translated;
4376
bf216479 4377 buf_ch = *d;
cf9c99bc
KH
4378 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4379 translated = RE_TRANSLATE (translate, ch);
4380 if (translated != ch
4381 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4382 buf_ch = ch;
6fdd04b0 4383 if (fastmap[buf_ch])
bf216479 4384 break;
33c46939
RS
4385 d++;
4386 range--;
4387 }
e934739e 4388 }
fa9a63c5 4389 else
6fdd04b0
KH
4390 {
4391 if (multibyte)
4392 while (range > lim)
4393 {
4394 int buf_charlen;
fa9a63c5 4395
62a6e103 4396 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4397 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4398 break;
4399 range -= buf_charlen;
4400 d += buf_charlen;
4401 }
e934739e 4402 else
6fdd04b0 4403 while (range > lim && !fastmap[*d])
33c46939
RS
4404 {
4405 d++;
4406 range--;
4407 }
e934739e 4408 }
fa9a63c5
RM
4409 startpos += irange - range;
4410 }
7814e705 4411 else /* Searching backwards. */
fa9a63c5 4412 {
ba5e343c
KH
4413 if (multibyte)
4414 {
62a6e103 4415 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4416 buf_ch = TRANSLATE (buf_ch);
4417 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4418 goto advance;
4419 }
4420 else
4421 {
cf9c99bc
KH
4422 register re_wchar_t ch, translated;
4423
4424 buf_ch = *d;
4425 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4426 translated = TRANSLATE (ch);
4427 if (translated != ch
4428 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4429 buf_ch = ch;
4430 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4431 goto advance;
4432 }
fa9a63c5
RM
4433 }
4434 }
4435
4436 /* If can't match the null string, and that's all we have left, fail. */
4437 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4438 && !bufp->can_be_null)
fa9a63c5
RM
4439 return -1;
4440
4441 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4442 startpos, regs, stop);
fa9a63c5
RM
4443
4444 if (val >= 0)
4445 return startpos;
5e69f11e 4446
fa9a63c5
RM
4447 if (val == -2)
4448 return -2;
4449
4450 advance:
5e69f11e 4451 if (!range)
25fe55af 4452 break;
5e69f11e 4453 else if (range > 0)
25fe55af 4454 {
b18215fc
RS
4455 /* Update STARTPOS to the next character boundary. */
4456 if (multibyte)
4457 {
66f0296e 4458 re_char *p = POS_ADDR_VSTRING (startpos);
aa3830c4 4459 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4460
4461 range -= len;
4462 if (range < 0)
4463 break;
4464 startpos += len;
4465 }
4466 else
4467 {
b560c397
RS
4468 range--;
4469 startpos++;
4470 }
e318085a 4471 }
fa9a63c5 4472 else
25fe55af
RS
4473 {
4474 range++;
4475 startpos--;
b18215fc
RS
4476
4477 /* Update STARTPOS to the previous character boundary. */
4478 if (multibyte)
4479 {
70806df6
KH
4480 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4481 re_char *p0 = p;
4482 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4483
4484 /* Find the head of multibyte form. */
70806df6
KH
4485 PREV_CHAR_BOUNDARY (p, phead);
4486 range += p0 - 1 - p;
4487 if (range > 0)
4488 break;
b18215fc 4489
70806df6 4490 startpos -= p0 - 1 - p;
b18215fc 4491 }
25fe55af 4492 }
fa9a63c5
RM
4493 }
4494 return -1;
4495} /* re_search_2 */
c0f9ea08 4496WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4497\f
4498/* Declarations and macros for re_match_2. */
4499
261cb4bb
PE
4500static int bcmp_translate (re_char *s1, re_char *s2,
4501 register ssize_t len,
4502 RE_TRANSLATE_TYPE translate,
4503 const int multibyte);
fa9a63c5
RM
4504
/* Convert PTR, which points into one of the search strings `string1'
   or `string2', into an offset: relative to string1 when PTR lies in
   the first string, otherwise relative to string2 plus size1.  */
#define POINTER_TO_OFFSET(ptr)				\
  (!FIRST_STRING_P (ptr)				\
   ? (ptr) - string2 + (ptrdiff_t) size1		\
   : (ptr) - string1)
fa9a63c5 4511
/* Invoke before reading a character through *d.  When d has reached
   dend, either fail (dend is already end_match_2) or continue in
   string2.
   See re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH()							\
  while (d == dend)							\
    {									\
      /* Nothing left in string2 either: the match fails.  */		\
      if (dend == end_match_2)						\
	goto fail;							\
      /* string1 is exhausted: carry on at the start of string2.  */	\
      dend = end_match_2;						\
      d = string2;							\
    }
4526
f1ad044f
SM
/* Invoke before reading a char through *d when the other limits have
   already been checked.  Intended for lookahead operations such as
   wordend, which may need to inspect parts of the string lying
   outside of the LIMITs (i.e. past `stop').  */
#define PREFETCH_NOLIMIT()						\
  if (d == end1)							\
    {									\
      dend = end_match_2;						\
      d = string2;							\
    }
fa9a63c5
RM
4537
/* Tests for the very beginning and the very end of the virtual
   concatenation of `string1' and `string2'.  When there is only one
   string, it is `string2'.  */
#define AT_STRINGS_BEG(d)					\
  ((d) == (size1 != 0 ? string1 : string2) || size2 == 0)
#define AT_STRINGS_END(d)					\
  (end2 == (d))
fa9a63c5 4542
9121ca40 4543/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4544
/* The note below is the comment that accompanies case wordbound.
   AT_WORD_BOUNDARY itself is no longer used, because the matcher now
   has to support the multibyte form.

   The DEC Alpha C compiler 3.x generates incorrect code for the
   test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
   AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
   macro and introducing temporary variables works around the bug.  */
4552
9121ca40 4553#if 0
b313f9d8
PE
4554/* Test if D points to a character which is word-constituent. We have
4555 two special cases to check for: if past the end of string1, look at
4556 the first character in string2; and if before the beginning of
4557 string2, look at the last character in string1. */
4558#define WORDCHAR_P(d) \
4559 (SYNTAX ((d) == end1 ? *string2 \
4560 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4561 == Sword)
4562
fa9a63c5
RM
4563/* Test if the character before D and the one at D differ with respect
4564 to being word-constituent. */
4565#define AT_WORD_BOUNDARY(d) \
4566 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4567 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4568#endif
fa9a63c5
RM
4569
/* Release everything the matcher allocated.  FREE_VAR frees VAR with
   REGEX_FREE if it is non-null and then clears it; FREE_VARIABLES
   releases the failure stack and the register arrays.  When
   MATCH_MAY_ALLOCATE is not defined there is nothing to release.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)							\
  do									\
    {									\
      if (var)								\
	{								\
	  REGEX_FREE (var);						\
	  var = NULL;							\
	}								\
    }									\
  while (0)
# define FREE_VARIABLES()						\
  do									\
    {									\
      REGEX_FREE_STACK (fail_stack.stack);				\
      FREE_VAR (regstart);						\
      FREE_VAR (regend);						\
      FREE_VAR (best_regstart);						\
      FREE_VAR (best_regend);						\
    }									\
  while (0)
#else
/* Expands to a no-op expression, which also inhibits a gcc warning.  */
# define FREE_VARIABLES() ((void)0)
#endif /* not MATCH_MAY_ALLOCATE */
4591
505bde11
SM
4592\f
4593/* Optimization routines. */
4594
4e8a9132
SM
4595/* If the operation is a match against one or more chars,
4596 return a pointer to the next operation, else return NULL. */
01618498 4597static re_char *
29abe551 4598skip_one_char (const_re_char *p)
4e8a9132 4599{
7393bcbb 4600 switch (*p++)
4e8a9132
SM
4601 {
4602 case anychar:
4603 break;
177c0ea7 4604
4e8a9132
SM
4605 case exactn:
4606 p += *p + 1;
4607 break;
4608
4609 case charset_not:
4610 case charset:
4611 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4612 {
4613 int mcnt;
4614 p = CHARSET_RANGE_TABLE (p - 1);
4615 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4616 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4617 }
4618 else
4619 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4620 break;
177c0ea7 4621
4e8a9132
SM
4622 case syntaxspec:
4623 case notsyntaxspec:
1fb352e0 4624#ifdef emacs
4e8a9132
SM
4625 case categoryspec:
4626 case notcategoryspec:
4627#endif /* emacs */
4628 p++;
4629 break;
4630
4631 default:
4632 p = NULL;
4633 }
4634 return p;
4635}
4636
4637
505bde11 4638/* Jump over non-matching operations. */
839966f3 4639static re_char *
29abe551 4640skip_noops (const_re_char *p, const_re_char *pend)
505bde11
SM
4641{
4642 int mcnt;
4643 while (p < pend)
4644 {
7393bcbb 4645 switch (*p)
505bde11
SM
4646 {
4647 case start_memory:
505bde11
SM
4648 case stop_memory:
4649 p += 2; break;
4650 case no_op:
4651 p += 1; break;
4652 case jump:
4653 p += 1;
4654 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4655 p += mcnt;
4656 break;
4657 default:
4658 return p;
4659 }
4660 }
4661 assert (p == pend);
4662 return p;
4663}
4664
4665/* Non-zero if "p1 matches something" implies "p2 fails". */
4666static int
29abe551
PE
4667mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
4668 const_re_char *p2)
505bde11 4669{
4e8a9132 4670 re_opcode_t op2;
2d1675e4 4671 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4672 unsigned char *pend = bufp->buffer + bufp->used;
4673
4e8a9132 4674 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4675 && p2 >= bufp->buffer && p2 <= pend);
4676
4677 /* Skip over open/close-group commands.
4678 If what follows this loop is a ...+ construct,
4679 look at what begins its body, since we will have to
4680 match at least one of that. */
4e8a9132
SM
4681 p2 = skip_noops (p2, pend);
4682 /* The same skip can be done for p1, except that this function
4683 is only used in the case where p1 is a simple match operator. */
4684 /* p1 = skip_noops (p1, pend); */
4685
4686 assert (p1 >= bufp->buffer && p1 < pend
4687 && p2 >= bufp->buffer && p2 <= pend);
4688
4689 op2 = p2 == pend ? succeed : *p2;
4690
7393bcbb 4691 switch (op2)
505bde11 4692 {
4e8a9132
SM
4693 case succeed:
4694 case endbuf:
4695 /* If we're at the end of the pattern, we can change. */
4696 if (skip_one_char (p1))
505bde11 4697 {
dc4a2ee0 4698 DEBUG_PRINT (" End of pattern: fast loop.\n");
505bde11 4699 return 1;
505bde11 4700 }
4e8a9132 4701 break;
177c0ea7 4702
4e8a9132 4703 case endline:
4e8a9132
SM
4704 case exactn:
4705 {
01618498 4706 register re_wchar_t c
4e8a9132 4707 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4708 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4709
4e8a9132
SM
4710 if ((re_opcode_t) *p1 == exactn)
4711 {
62a6e103 4712 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132 4713 {
dc4a2ee0 4714 DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4e8a9132
SM
4715 return 1;
4716 }
4717 }
505bde11 4718
4e8a9132
SM
4719 else if ((re_opcode_t) *p1 == charset
4720 || (re_opcode_t) *p1 == charset_not)
4721 {
4722 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4723
4e8a9132
SM
4724 /* Test if C is listed in charset (or charset_not)
4725 at `p1'. */
6fdd04b0 4726 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4727 {
4728 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4729 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4730 not = !not;
4731 }
4732 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4733 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4734
4e8a9132
SM
4735 /* `not' is equal to 1 if c would match, which means
4736 that we can't change to pop_failure_jump. */
4737 if (!not)
4738 {
dc4a2ee0 4739 DEBUG_PRINT (" No match => fast loop.\n");
4e8a9132
SM
4740 return 1;
4741 }
4742 }
4743 else if ((re_opcode_t) *p1 == anychar
4744 && c == '\n')
4745 {
dc4a2ee0 4746 DEBUG_PRINT (" . != \\n => fast loop.\n");
4e8a9132
SM
4747 return 1;
4748 }
4749 }
4750 break;
505bde11 4751
4e8a9132 4752 case charset:
4e8a9132
SM
4753 {
4754 if ((re_opcode_t) *p1 == exactn)
4755 /* Reuse the code above. */
4756 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4757
505bde11
SM
4758 /* It is hard to list up all the character in charset
4759 P2 if it includes multibyte character. Give up in
4760 such case. */
4761 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4762 {
4763 /* Now, we are sure that P2 has no range table.
4764 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4765 enough. But P1 may have range table, so the
505bde11
SM
4766 size of bitmap table of P1 is extracted by
4767 using macro `CHARSET_BITMAP_SIZE'.
4768
6fdd04b0
KH
4769 In a multibyte case, we know that all the character
4770 listed in P2 is ASCII. In a unibyte case, P1 has only a
4771 bitmap table. So, in both cases, it is enough to test
4772 only the bitmap table of P1. */
505bde11 4773
411e4203 4774 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4775 {
4776 int idx;
4777 /* We win if the charset inside the loop
4778 has no overlap with the one after the loop. */
4779 for (idx = 0;
4780 (idx < (int) p2[1]
4781 && idx < CHARSET_BITMAP_SIZE (p1));
4782 idx++)
4783 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4784 break;
4785
4786 if (idx == p2[1]
4787 || idx == CHARSET_BITMAP_SIZE (p1))
4788 {
dc4a2ee0 4789 DEBUG_PRINT (" No match => fast loop.\n");
505bde11
SM
4790 return 1;
4791 }
4792 }
411e4203 4793 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4794 {
4795 int idx;
4796 /* We win if the charset_not inside the loop lists
7814e705 4797 every character listed in the charset after. */
505bde11
SM
4798 for (idx = 0; idx < (int) p2[1]; idx++)
4799 if (! (p2[2 + idx] == 0
4800 || (idx < CHARSET_BITMAP_SIZE (p1)
4801 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4802 break;
4803
d1dfb56c
EZ
4804 if (idx == p2[1])
4805 {
dc4a2ee0 4806 DEBUG_PRINT (" No match => fast loop.\n");
d1dfb56c
EZ
4807 return 1;
4808 }
4e8a9132
SM
4809 }
4810 }
4811 }
609b757a 4812 break;
177c0ea7 4813
411e4203 4814 case charset_not:
7393bcbb 4815 switch (*p1)
411e4203
SM
4816 {
4817 case exactn:
4818 case charset:
4819 /* Reuse the code above. */
4820 return mutually_exclusive_p (bufp, p2, p1);
4821 case charset_not:
4822 /* When we have two charset_not, it's very unlikely that
4823 they don't overlap. The union of the two sets of excluded
4824 chars should cover all possible chars, which, as a matter of
4825 fact, is virtually impossible in multibyte buffers. */
36595814 4826 break;
411e4203
SM
4827 }
4828 break;
4829
4e8a9132 4830 case wordend:
669fa600
SM
4831 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4832 case symend:
4e8a9132 4833 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4834 && (p1[1] == Ssymbol || p1[1] == Sword));
4835 case notsyntaxspec:
4836 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4837
4838 case wordbeg:
669fa600
SM
4839 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4840 case symbeg:
4e8a9132 4841 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4842 && (p1[1] == Ssymbol || p1[1] == Sword));
4843 case syntaxspec:
4844 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4845
4846 case wordbound:
4847 return (((re_opcode_t) *p1 == notsyntaxspec
4848 || (re_opcode_t) *p1 == syntaxspec)
4849 && p1[1] == Sword);
4850
1fb352e0 4851#ifdef emacs
4e8a9132
SM
4852 case categoryspec:
4853 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4854 case notcategoryspec:
4855 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4856#endif /* emacs */
4857
4858 default:
4859 ;
505bde11
SM
4860 }
4861
4862 /* Safe default. */
4863 return 0;
4864}
4865
fa9a63c5
RM
4866\f
4867/* Matching routines. */
4868
25fe55af 4869#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4870/* re_match is like re_match_2 except it takes only a single string. */
4871
d1dfb56c 4872regoff_t
d2762c86 4873re_match (struct re_pattern_buffer *bufp, const char *string,
d1dfb56c 4874 size_t size, ssize_t pos, struct re_registers *regs)
fa9a63c5 4875{
d1dfb56c
EZ
4876 regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4877 size, pos, regs, size);
fa9a63c5
RM
4878 return result;
4879}
c0f9ea08 4880WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4881#endif /* not emacs */
4882
b18215fc
RS
4883#ifdef emacs
4884/* In Emacs, this is the string or buffer in which we
7814e705 4885 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4886Lisp_Object re_match_object;
4887#endif
fa9a63c5
RM
4888
4889/* re_match_2 matches the compiled pattern in BUFP against the
4890 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4891 and SIZE2, respectively). We start matching at POS, and stop
4892 matching at STOP.
5e69f11e 4893
fa9a63c5 4894 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 4895 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4896 documentation for exactly how many groups we fill.
4897
4898 We return -1 if no match, -2 if an internal error (such as the
7814e705 4899 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4900 matched substring. */
4901
d1dfb56c
EZ
4902regoff_t
4903re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4904 size_t size1, const char *string2, size_t size2, ssize_t pos,
4905 struct re_registers *regs, ssize_t stop)
fa9a63c5 4906{
d1dfb56c 4907 regoff_t result;
25fe55af 4908
b18215fc 4909#ifdef emacs
d1dfb56c 4910 ssize_t charpos;
d48cd3f4 4911 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 4912 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4913 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4914#endif
4915
4bb91c68
SM
4916 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4917 (re_char*) string2, size2,
cc9b4df2 4918 pos, regs, stop);
fa9a63c5
RM
4919 return result;
4920}
c0f9ea08 4921WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 4922
bf216479 4923
fa9a63c5 4924/* This is a separate function so that we can force an alloca cleanup
7814e705 4925 afterwards. */
d1dfb56c 4926static regoff_t
29abe551
PE
4927re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
4928 size_t size1, const_re_char *string2, size_t size2,
d1dfb56c 4929 ssize_t pos, struct re_registers *regs, ssize_t stop)
fa9a63c5
RM
4930{
4931 /* General temporaries. */
dc4a2ee0 4932 int mcnt;
01618498 4933 size_t reg;
fa9a63c5
RM
4934
4935 /* Just past the end of the corresponding string. */
66f0296e 4936 re_char *end1, *end2;
fa9a63c5
RM
4937
4938 /* Pointers into string1 and string2, just past the last characters in
7814e705 4939 each to consider matching. */
66f0296e 4940 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4941
4942 /* Where we are in the data, and the end of the current string. */
66f0296e 4943 re_char *d, *dend;
5e69f11e 4944
99633e97
SM
4945 /* Used sometimes to remember where we were before starting matching
4946 an operator so that we can go back in case of failure. This "atomic"
4947 behavior of matching opcodes is indispensable to the correctness
4948 of the on_failure_keep_string_jump optimization. */
4949 re_char *dfail;
4950
fa9a63c5 4951 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4952 re_char *p = bufp->buffer;
4953 re_char *pend = p + bufp->used;
fa9a63c5 4954
25fe55af 4955 /* We use this to map every character in the string. */
6676cb1c 4956 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4957
cf9c99bc 4958 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 4959 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4960
cf9c99bc
KH
4961 /* Nonzero if STRING1/STRING2 are multibyte. */
4962 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4963
fa9a63c5
RM
4964 /* Failure point stack. Each place that can handle a failure further
4965 down the line pushes a failure point on this stack. It consists of
505bde11 4966 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4967 the subexpressions we're currently inside, plus the number of such
4968 registers, and, finally, two char *'s. The first char * is where
4969 to resume scanning the pattern; the second one is where to resume
7814e705
JB
4970 scanning the strings. */
4971#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4972 fail_stack_type fail_stack;
4973#endif
dc4a2ee0 4974#ifdef DEBUG_COMPILES_ARGUMENTS
fa9a63c5
RM
4975 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4976#endif
4977
0b32bf0e 4978#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4979 /* This holds the pointer to the failure stack, when
4980 it is allocated relocatably. */
4981 fail_stack_elt_t *failure_stack_ptr;
99633e97 4982#endif
fa9a63c5
RM
4983
4984 /* We fill all the registers internally, independent of what we
7814e705 4985 return, for use in backreferences. The number here includes
fa9a63c5 4986 an element for register zero. */
4bb91c68 4987 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4988
fa9a63c5
RM
4989 /* Information on the contents of registers. These are pointers into
4990 the input strings; they record just what was matched (on this
4991 attempt) by a subexpression part of the pattern, that is, the
4992 regnum-th regstart pointer points to where in the pattern we began
4993 matching and the regnum-th regend points to right after where we
4994 stopped matching the regnum-th subexpression. (The zeroth register
4995 keeps track of what the whole pattern matches.) */
4996#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4997 re_char **regstart, **regend;
fa9a63c5
RM
4998#endif
4999
fa9a63c5 5000 /* The following record the register info as found in the above
5e69f11e 5001 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5002 This happens as we backtrack through the failure points, which in
5003 turn happens only if we have not yet matched the entire string. */
5004 unsigned best_regs_set = false;
5005#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5006 re_char **best_regstart, **best_regend;
fa9a63c5 5007#endif
5e69f11e 5008
fa9a63c5
RM
5009 /* Logically, this is `best_regend[0]'. But we don't want to have to
5010 allocate space for that if we're not allocating space for anything
7814e705 5011 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5012 any of the other register vectors, and it seems rather a kludge to
5013 treat `best_regend' differently than the rest. So we keep track of
5014 the end of the best match so far in a separate variable. We
5015 initialize this to NULL so that when we backtrack the first time
5016 and need to test it, it's not garbage. */
66f0296e 5017 re_char *match_end = NULL;
fa9a63c5 5018
dc4a2ee0 5019#ifdef DEBUG_COMPILES_ARGUMENTS
fa9a63c5 5020 /* Counts the total number of registers pushed. */
5e69f11e 5021 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5022#endif
5023
dc4a2ee0 5024 DEBUG_PRINT ("\n\nEntering re_match_2.\n");
5e69f11e 5025
fa9a63c5 5026 INIT_FAIL_STACK ();
5e69f11e 5027
fa9a63c5
RM
5028#ifdef MATCH_MAY_ALLOCATE
5029 /* Do not bother to initialize all the register variables if there are
5030 no groups in the pattern, as it takes a fair amount of time. If
5031 there are groups, we include space for register 0 (the whole
5032 pattern), even though we never use it, since it simplifies the
5033 array indexing. We should fix this. */
5034 if (bufp->re_nsub)
5035 {
66f0296e
SM
5036 regstart = REGEX_TALLOC (num_regs, re_char *);
5037 regend = REGEX_TALLOC (num_regs, re_char *);
5038 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5039 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5040
505bde11 5041 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5042 {
5043 FREE_VARIABLES ();
5044 return -2;
5045 }
fa9a63c5
RM
5046 }
5047 else
5048 {
5049 /* We must initialize all our variables to NULL, so that
25fe55af 5050 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5051 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5052 }
5053#endif /* MATCH_MAY_ALLOCATE */
5054
5055 /* The starting position is bogus. */
5056 if (pos < 0 || pos > size1 + size2)
5057 {
5058 FREE_VARIABLES ();
5059 return -1;
5060 }
5e69f11e 5061
fa9a63c5
RM
5062 /* Initialize subexpression text positions to -1 to mark ones that no
5063 start_memory/stop_memory has been seen for. Also initialize the
5064 register information struct. */
01618498
SM
5065 for (reg = 1; reg < num_regs; reg++)
5066 regstart[reg] = regend[reg] = NULL;
99633e97 5067
fa9a63c5 5068 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5069 `string1' is null. */
fa9a63c5
RM
5070 if (size2 == 0 && string1 != NULL)
5071 {
5072 string2 = string1;
5073 size2 = size1;
5074 string1 = 0;
5075 size1 = 0;
5076 }
5077 end1 = string1 + size1;
5078 end2 = string2 + size2;
5079
5e69f11e 5080 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5081 `dend' is the end of the input string that `d' points within. `d'
5082 is advanced into the following input string whenever necessary, but
5083 this happens before fetching; therefore, at the beginning of the
5084 loop, `d' can be pointing at the end of a string, but it cannot
5085 equal `string2'. */
419d1c74 5086 if (pos >= size1)
fa9a63c5 5087 {
419d1c74
SM
5088 /* Only match within string2. */
5089 d = string2 + pos - size1;
5090 dend = end_match_2 = string2 + stop - size1;
5091 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5092 }
5093 else
5094 {
f1ad044f 5095 if (stop < size1)
419d1c74
SM
5096 {
5097 /* Only match within string1. */
5098 end_match_1 = string1 + stop;
5099 /* BEWARE!
5100 When we reach end_match_1, PREFETCH normally switches to string2.
5101 But in the present case, this means that just doing a PREFETCH
5102 makes us jump from `stop' to `gap' within the string.
5103 What we really want here is for the search to stop as
5104 soon as we hit end_match_1. That's why we set end_match_2
5105 to end_match_1 (since PREFETCH fails as soon as we hit
5106 end_match_2). */
5107 end_match_2 = end_match_1;
5108 }
5109 else
f1ad044f
SM
5110 { /* It's important to use this code when stop == size so that
5111 moving `d' from end1 to string2 will not prevent the d == dend
5112 check from catching the end of string. */
419d1c74
SM
5113 end_match_1 = end1;
5114 end_match_2 = string2 + stop - size1;
5115 }
5116 d = string1 + pos;
5117 dend = end_match_1;
fa9a63c5
RM
5118 }
5119
dc4a2ee0 5120 DEBUG_PRINT ("The compiled pattern is: ");
fa9a63c5 5121 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
dc4a2ee0 5122 DEBUG_PRINT ("The string to match is: `");
fa9a63c5 5123 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
dc4a2ee0 5124 DEBUG_PRINT ("'\n");
5e69f11e 5125
7814e705 5126 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5127 function if the match is complete, or it drops through if the match
5128 fails at this starting point in the input data. */
5129 for (;;)
5130 {
dc4a2ee0 5131 DEBUG_PRINT ("\n%p: ", p);
fa9a63c5
RM
5132
5133 if (p == pend)
dc4a2ee0
PE
5134 {
5135 ptrdiff_t dcnt;
5136
5137 /* End of pattern means we might have succeeded. */
5138 DEBUG_PRINT ("end of pattern ... ");
5e69f11e 5139
fa9a63c5 5140 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5141 longest match, try backtracking. */
5142 if (d != end_match_2)
fa9a63c5
RM
5143 {
5144 /* 1 if this match ends in the same string (string1 or string2)
5145 as the best previous match. */
d42f4f0f
PE
5146 boolean same_str_p = (FIRST_STRING_P (match_end)
5147 == FIRST_STRING_P (d));
fa9a63c5
RM
5148 /* 1 if this match is the best seen so far. */
5149 boolean best_match_p;
5150
5151 /* AIX compiler got confused when this was combined
7814e705 5152 with the previous declaration. */
fa9a63c5
RM
5153 if (same_str_p)
5154 best_match_p = d > match_end;
5155 else
99633e97 5156 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5157
dc4a2ee0 5158 DEBUG_PRINT ("backtracking.\n");
25fe55af
RS
5159
5160 if (!FAIL_STACK_EMPTY ())
5161 { /* More failure points to try. */
5162
5163 /* If exceeds best match so far, save it. */
5164 if (!best_regs_set || best_match_p)
5165 {
5166 best_regs_set = true;
5167 match_end = d;
5168
dc4a2ee0 5169 DEBUG_PRINT ("\nSAVING match as best so far.\n");
25fe55af 5170
01618498 5171 for (reg = 1; reg < num_regs; reg++)
25fe55af 5172 {
01618498
SM
5173 best_regstart[reg] = regstart[reg];
5174 best_regend[reg] = regend[reg];
25fe55af
RS
5175 }
5176 }
5177 goto fail;
5178 }
5179
5180 /* If no failure points, don't restore garbage. And if
5181 last match is real best match, don't restore second
5182 best one. */
5183 else if (best_regs_set && !best_match_p)
5184 {
5185 restore_best_regs:
5186 /* Restore best match. It may happen that `dend ==
5187 end_match_1' while the restored d is in string2.
5188 For example, the pattern `x.*y.*z' against the
5189 strings `x-' and `y-z-', if the two strings are
7814e705 5190 not consecutive in memory. */
dc4a2ee0 5191 DEBUG_PRINT ("Restoring best registers.\n");
25fe55af
RS
5192
5193 d = match_end;
5194 dend = ((d >= string1 && d <= end1)
5195 ? end_match_1 : end_match_2);
fa9a63c5 5196
01618498 5197 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5198 {
01618498
SM
5199 regstart[reg] = best_regstart[reg];
5200 regend[reg] = best_regend[reg];
fa9a63c5 5201 }
25fe55af
RS
5202 }
5203 } /* d != end_match_2 */
fa9a63c5
RM
5204
5205 succeed_label:
dc4a2ee0 5206 DEBUG_PRINT ("Accepting match.\n");
fa9a63c5 5207
25fe55af
RS
5208 /* If caller wants register contents data back, do it. */
5209 if (regs && !bufp->no_sub)
fa9a63c5 5210 {
25fe55af
RS
5211 /* Have the register data arrays been allocated? */
5212 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5213 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5214 extra element beyond `num_regs' for the `-1' marker
5215 GNU code uses. */
5216 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5217 regs->start = TALLOC (regs->num_regs, regoff_t);
5218 regs->end = TALLOC (regs->num_regs, regoff_t);
5219 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5220 {
5221 FREE_VARIABLES ();
5222 return -2;
5223 }
25fe55af
RS
5224 bufp->regs_allocated = REGS_REALLOCATE;
5225 }
5226 else if (bufp->regs_allocated == REGS_REALLOCATE)
5227 { /* Yes. If we need more elements than were already
5228 allocated, reallocate them. If we need fewer, just
5229 leave it alone. */
5230 if (regs->num_regs < num_regs + 1)
5231 {
5232 regs->num_regs = num_regs + 1;
5233 RETALLOC (regs->start, regs->num_regs, regoff_t);
5234 RETALLOC (regs->end, regs->num_regs, regoff_t);
5235 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5236 {
5237 FREE_VARIABLES ();
5238 return -2;
5239 }
25fe55af
RS
5240 }
5241 }
5242 else
fa9a63c5
RM
5243 {
5244 /* These braces fend off a "empty body in an else-statement"
7814e705 5245 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5246 assert (bufp->regs_allocated == REGS_FIXED);
5247 }
5248
25fe55af
RS
5249 /* Convert the pointer data in `regstart' and `regend' to
5250 indices. Register zero has to be set differently,
5251 since we haven't kept track of any info for it. */
5252 if (regs->num_regs > 0)
5253 {
5254 regs->start[0] = pos;
99633e97 5255 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5256 }
5e69f11e 5257
25fe55af
RS
5258 /* Go through the first `min (num_regs, regs->num_regs)'
5259 registers, since that is all we initialized. */
01618498 5260 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5261 {
01618498
SM
5262 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5263 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5264 else
5265 {
dc4a2ee0
PE
5266 regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
5267 regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
25fe55af 5268 }
fa9a63c5 5269 }
5e69f11e 5270
25fe55af
RS
5271 /* If the regs structure we return has more elements than
5272 were in the pattern, set the extra elements to -1. If
5273 we (re)allocated the registers, this is the case,
5274 because we always allocate enough to have at least one
7814e705 5275 -1 at the end. */
01618498
SM
5276 for (reg = num_regs; reg < regs->num_regs; reg++)
5277 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5278 } /* regs && !bufp->no_sub */
5279
dc4a2ee0
PE
5280 DEBUG_PRINT ("%u failure points pushed, %u popped (%u remain).\n",
5281 nfailure_points_pushed, nfailure_points_popped,
5282 nfailure_points_pushed - nfailure_points_popped);
5283 DEBUG_PRINT ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5284
dc4a2ee0 5285 dcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5286
dc4a2ee0 5287 DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
fa9a63c5 5288
25fe55af 5289 FREE_VARIABLES ();
dc4a2ee0 5290 return dcnt;
25fe55af 5291 }
fa9a63c5 5292
7814e705 5293 /* Otherwise match next pattern command. */
7393bcbb 5294 switch (*p++)
fa9a63c5 5295 {
25fe55af
RS
5296 /* Ignore these. Used to ignore the n of succeed_n's which
5297 currently have n == 0. */
5298 case no_op:
dc4a2ee0 5299 DEBUG_PRINT ("EXECUTING no_op.\n");
25fe55af 5300 break;
fa9a63c5
RM
5301
5302 case succeed:
dc4a2ee0 5303 DEBUG_PRINT ("EXECUTING succeed.\n");
fa9a63c5
RM
5304 goto succeed_label;
5305
7814e705 5306 /* Match the next n pattern characters exactly. The following
25fe55af 5307 byte in the pattern defines n, and the n bytes after that
7814e705 5308 are the characters to match. */
fa9a63c5
RM
5309 case exactn:
5310 mcnt = *p++;
dc4a2ee0 5311 DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5312
99633e97
SM
5313 /* Remember the start point to rollback upon failure. */
5314 dfail = d;
5315
6fdd04b0 5316#ifndef emacs
25fe55af
RS
5317 /* This is written out as an if-else so we don't waste time
5318 testing `translate' inside the loop. */
28703c16 5319 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5320 do
5321 {
5322 PREFETCH ();
5323 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5324 {
6fdd04b0
KH
5325 d = dfail;
5326 goto fail;
e934739e 5327 }
6fdd04b0
KH
5328 d++;
5329 }
5330 while (--mcnt);
fa9a63c5 5331 else
6fdd04b0
KH
5332 do
5333 {
5334 PREFETCH ();
5335 if (*d++ != *p++)
bf216479 5336 {
6fdd04b0
KH
5337 d = dfail;
5338 goto fail;
bf216479 5339 }
6fdd04b0
KH
5340 }
5341 while (--mcnt);
5342#else /* emacs */
5343 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5344 if (target_multibyte)
6fdd04b0
KH
5345 do
5346 {
5347 int pat_charlen, buf_charlen;
cf9c99bc 5348 int pat_ch, buf_ch;
e934739e 5349
6fdd04b0 5350 PREFETCH ();
cf9c99bc 5351 if (multibyte)
62a6e103 5352 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5353 else
5354 {
5355 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5356 pat_charlen = 1;
5357 }
62a6e103 5358 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5359
6fdd04b0 5360 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5361 {
6fdd04b0
KH
5362 d = dfail;
5363 goto fail;
e934739e 5364 }
bf216479 5365
6fdd04b0
KH
5366 p += pat_charlen;
5367 d += buf_charlen;
5368 mcnt -= pat_charlen;
5369 }
5370 while (mcnt > 0);
fa9a63c5 5371 else
6fdd04b0
KH
5372 do
5373 {
abbd1bcf 5374 int pat_charlen;
cf9c99bc 5375 int pat_ch, buf_ch;
bf216479 5376
6fdd04b0 5377 PREFETCH ();
cf9c99bc
KH
5378 if (multibyte)
5379 {
62a6e103 5380 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5381 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5382 }
5383 else
5384 {
5385 pat_ch = *p;
5386 pat_charlen = 1;
5387 }
5388 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5389 if (! CHAR_BYTE8_P (buf_ch))
5390 {
5391 buf_ch = TRANSLATE (buf_ch);
5392 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5393 if (buf_ch < 0)
5394 buf_ch = *d;
5395 }
0e2501ed
AS
5396 else
5397 buf_ch = *d;
cf9c99bc 5398 if (buf_ch != pat_ch)
6fdd04b0
KH
5399 {
5400 d = dfail;
5401 goto fail;
bf216479 5402 }
cf9c99bc
KH
5403 p += pat_charlen;
5404 d++;
6fdd04b0
KH
5405 }
5406 while (--mcnt);
5407#endif
25fe55af 5408 break;
fa9a63c5
RM
5409
5410
25fe55af 5411 /* Match any character except possibly a newline or a null. */
fa9a63c5 5412 case anychar:
e934739e
RS
5413 {
5414 int buf_charlen;
01618498 5415 re_wchar_t buf_ch;
fa9a63c5 5416
dc4a2ee0 5417 DEBUG_PRINT ("EXECUTING anychar.\n");
fa9a63c5 5418
e934739e 5419 PREFETCH ();
62a6e103 5420 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5421 target_multibyte);
e934739e
RS
5422 buf_ch = TRANSLATE (buf_ch);
5423
5424 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5425 && buf_ch == '\n')
5426 || ((bufp->syntax & RE_DOT_NOT_NULL)
5427 && buf_ch == '\000'))
5428 goto fail;
5429
dc4a2ee0 5430 DEBUG_PRINT (" Matched `%d'.\n", *d);
e934739e
RS
5431 d += buf_charlen;
5432 }
fa9a63c5
RM
5433 break;
5434
5435
5436 case charset:
5437 case charset_not:
5438 {
b18215fc 5439 register unsigned int c;
fa9a63c5 5440 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5441 int len;
5442
5443 /* Start of actual range_table, or end of bitmap if there is no
5444 range table. */
da053e48 5445 re_char *range_table IF_LINT (= NULL);
b18215fc 5446
96cc36cc 5447 /* Nonzero if there is a range table. */
b18215fc
RS
5448 int range_table_exists;
5449
96cc36cc
RS
5450 /* Number of ranges of range table. This is not included
5451 in the initial byte-length of the command. */
5452 int count = 0;
fa9a63c5 5453
f5020181
AS
5454 /* Whether matching against a unibyte character. */
5455 boolean unibyte_char = false;
5456
dc4a2ee0 5457 DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5458
b18215fc 5459 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5460
b18215fc 5461 if (range_table_exists)
96cc36cc
RS
5462 {
5463 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5464 EXTRACT_NUMBER_AND_INCR (count, range_table);
5465 }
b18215fc 5466
2d1675e4 5467 PREFETCH ();
62a6e103 5468 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5469 if (target_multibyte)
5470 {
5471 int c1;
b18215fc 5472
cf9c99bc
KH
5473 c = TRANSLATE (c);
5474 c1 = RE_CHAR_TO_UNIBYTE (c);
5475 if (c1 >= 0)
f5020181
AS
5476 {
5477 unibyte_char = true;
5478 c = c1;
5479 }
cf9c99bc
KH
5480 }
5481 else
5482 {
5483 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5484
5485 if (! CHAR_BYTE8_P (c1))
5486 {
5487 c1 = TRANSLATE (c1);
5488 c1 = RE_CHAR_TO_UNIBYTE (c1);
5489 if (c1 >= 0)
f5020181
AS
5490 {
5491 unibyte_char = true;
5492 c = c1;
5493 }
cf9c99bc 5494 }
0b8be006
AS
5495 else
5496 unibyte_char = true;
cf9c99bc
KH
5497 }
5498
f5020181 5499 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5500 { /* Lookup bitmap. */
b18215fc
RS
5501 /* Cast to `unsigned' instead of `unsigned char' in
5502 case the bit list is a full 32 bytes long. */
5503 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5504 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5505 not = !not;
b18215fc 5506 }
96cc36cc 5507#ifdef emacs
b18215fc 5508 else if (range_table_exists)
96cc36cc
RS
5509 {
5510 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5511
14473664
SM
5512 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5513 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5514 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5515 | (class_bits & BIT_SPACE && ISSPACE (c))
5516 | (class_bits & BIT_UPPER && ISUPPER (c))
5517 | (class_bits & BIT_WORD && ISWORD (c)))
5518 not = !not;
5519 else
5520 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5521 }
5522#endif /* emacs */
fa9a63c5 5523
96cc36cc
RS
5524 if (range_table_exists)
5525 p = CHARSET_RANGE_TABLE_END (range_table, count);
5526 else
5527 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5528
5529 if (!not) goto fail;
5e69f11e 5530
b18215fc 5531 d += len;
fa9a63c5 5532 }
8fb31792 5533 break;
fa9a63c5
RM
5534
5535
25fe55af 5536 /* The beginning of a group is represented by start_memory.
505bde11 5537 The argument is the register number. The text
25fe55af 5538 matched within the group is recorded (in the internal
7814e705 5539 registers data structure) under the register number. */
25fe55af 5540 case start_memory:
dc4a2ee0 5541 DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
505bde11
SM
5542
5543 /* In case we need to undo this operation (via backtracking). */
dc4a2ee0 5544 PUSH_FAILURE_REG (*p);
fa9a63c5 5545
25fe55af 5546 regstart[*p] = d;
4bb91c68 5547 regend[*p] = NULL; /* probably unnecessary. -sm */
dc4a2ee0 5548 DEBUG_PRINT (" regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
fa9a63c5 5549
25fe55af 5550 /* Move past the register number and inner group count. */
505bde11 5551 p += 1;
25fe55af 5552 break;
fa9a63c5
RM
5553
5554
25fe55af 5555 /* The stop_memory opcode represents the end of a group. Its
505bde11 5556 argument is the same as start_memory's: the register number. */
fa9a63c5 5557 case stop_memory:
dc4a2ee0 5558 DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
505bde11
SM
5559
5560 assert (!REG_UNSET (regstart[*p]));
5561 /* Strictly speaking, there should be code such as:
177c0ea7 5562
0b32bf0e 5563 assert (REG_UNSET (regend[*p]));
505bde11
SM
5564 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5565
5566 But the only info to be pushed is regend[*p] and it is known to
5567 be UNSET, so there really isn't anything to push.
5568 Not pushing anything, on the other hand deprives us from the
5569 guarantee that regend[*p] is UNSET since undoing this operation
5570 will not reset its value properly. This is not important since
5571 the value will only be read on the next start_memory or at
5572 the very end and both events can only happen if this stop_memory
5573 is *not* undone. */
fa9a63c5 5574
25fe55af 5575 regend[*p] = d;
dc4a2ee0 5576 DEBUG_PRINT (" regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
fa9a63c5 5577
25fe55af 5578 /* Move past the register number and the inner group count. */
505bde11 5579 p += 1;
25fe55af 5580 break;
fa9a63c5
RM
5581
5582
5583 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5584 followed by the numeric value of <digit> as the register number. */
5585 case duplicate:
fa9a63c5 5586 {
66f0296e 5587 register re_char *d2, *dend2;
7814e705 5588 int regno = *p++; /* Get which register to match against. */
dc4a2ee0 5589 DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
fa9a63c5 5590
7814e705 5591 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5592 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5593 goto fail;
5e69f11e 5594
7814e705 5595 /* Where in input to try to start matching. */
25fe55af 5596 d2 = regstart[regno];
5e69f11e 5597
99633e97
SM
5598 /* Remember the start point to rollback upon failure. */
5599 dfail = d;
5600
25fe55af
RS
5601 /* Where to stop matching; if both the place to start and
5602 the place to stop matching are in the same string, then
5603 set to the place to stop, otherwise, for now have to use
5604 the end of the first string. */
fa9a63c5 5605
25fe55af 5606 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5607 == FIRST_STRING_P (regend[regno]))
5608 ? regend[regno] : end_match_1);
5609 for (;;)
5610 {
dc4a2ee0
PE
5611 ptrdiff_t dcnt;
5612
fa9a63c5 5613 /* If necessary, advance to next segment in register
25fe55af 5614 contents. */
fa9a63c5
RM
5615 while (d2 == dend2)
5616 {
5617 if (dend2 == end_match_2) break;
5618 if (dend2 == regend[regno]) break;
5619
25fe55af
RS
5620 /* End of string1 => advance to string2. */
5621 d2 = string2;
5622 dend2 = regend[regno];
fa9a63c5
RM
5623 }
5624 /* At end of register contents => success */
5625 if (d2 == dend2) break;
5626
5627 /* If necessary, advance to next segment in data. */
5628 PREFETCH ();
5629
5630 /* How many characters left in this segment to match. */
dc4a2ee0 5631 dcnt = dend - d;
5e69f11e 5632
fa9a63c5 5633 /* Want how many consecutive characters we can match in
25fe55af 5634 one shot, so, if necessary, adjust the count. */
dc4a2ee0
PE
5635 if (dcnt > dend2 - d2)
5636 dcnt = dend2 - d2;
5e69f11e 5637
fa9a63c5 5638 /* Compare that many; failure if mismatch, else move
25fe55af 5639 past them. */
28703c16 5640 if (RE_TRANSLATE_P (translate)
dc4a2ee0
PE
5641 ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
5642 : memcmp (d, d2, dcnt))
99633e97
SM
5643 {
5644 d = dfail;
5645 goto fail;
5646 }
dc4a2ee0 5647 d += dcnt, d2 += dcnt;
fa9a63c5
RM
5648 }
5649 }
5650 break;
5651
5652
25fe55af 5653 /* begline matches the empty string at the beginning of the string
c0f9ea08 5654 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5655 case begline:
dc4a2ee0 5656 DEBUG_PRINT ("EXECUTING begline.\n");
5e69f11e 5657
25fe55af
RS
5658 if (AT_STRINGS_BEG (d))
5659 {
5660 if (!bufp->not_bol) break;
5661 }
419d1c74 5662 else
25fe55af 5663 {
bf216479 5664 unsigned c;
419d1c74 5665 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5666 if (c == '\n')
419d1c74 5667 break;
25fe55af
RS
5668 }
5669 /* In all other cases, we fail. */
5670 goto fail;
fa9a63c5
RM
5671
5672
25fe55af 5673 /* endline is the dual of begline. */
fa9a63c5 5674 case endline:
dc4a2ee0 5675 DEBUG_PRINT ("EXECUTING endline.\n");
fa9a63c5 5676
25fe55af
RS
5677 if (AT_STRINGS_END (d))
5678 {
5679 if (!bufp->not_eol) break;
5680 }
f1ad044f 5681 else
25fe55af 5682 {
f1ad044f 5683 PREFETCH_NOLIMIT ();
c0f9ea08 5684 if (*d == '\n')
f1ad044f 5685 break;
25fe55af
RS
5686 }
5687 goto fail;
fa9a63c5
RM
5688
5689
5690 /* Match at the very beginning of the data. */
25fe55af 5691 case begbuf:
dc4a2ee0 5692 DEBUG_PRINT ("EXECUTING begbuf.\n");
25fe55af
RS
5693 if (AT_STRINGS_BEG (d))
5694 break;
5695 goto fail;
fa9a63c5
RM
5696
5697
5698 /* Match at the very end of the data. */
25fe55af 5699 case endbuf:
dc4a2ee0 5700 DEBUG_PRINT ("EXECUTING endbuf.\n");
fa9a63c5
RM
5701 if (AT_STRINGS_END (d))
5702 break;
25fe55af 5703 goto fail;
5e69f11e 5704
5e69f11e 5705
25fe55af
RS
5706 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5707 pushes NULL as the value for the string on the stack. Then
505bde11 5708 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5709 string, instead of restoring it. To see why, consider
7814e705 5710 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5711 then the . fails against the \n. But the next thing we want
5712 to do is match the \n against the \n; if we restored the
5713 string value, we would be back at the foo.
5714
5715 Because this is used only in specific cases, we don't need to
5716 check all the things that `on_failure_jump' does, to make
5717 sure the right things get saved on the stack. Hence we don't
5718 share its code. The only reason to push anything on the
5719 stack at all is that otherwise we would have to change
5720 `anychar's code to do something besides goto fail in this
5721 case; that seems worse than this. */
5722 case on_failure_keep_string_jump:
505bde11 5723 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5724 DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5725 mcnt, p + mcnt);
fa9a63c5 5726
505bde11
SM
5727 PUSH_FAILURE_POINT (p - 3, NULL);
5728 break;
5729
0683b6fa
SM
5730 /* A nasty loop is introduced by the non-greedy *? and +?.
5731 With such loops, the stack only ever contains one failure point
5732 at a time, so that a plain on_failure_jump_loop kind of
5733 cycle detection cannot work. Worse yet, such a detection
5734 can not only fail to detect a cycle, but it can also wrongly
5735 detect a cycle (between different instantiations of the same
6df42991 5736 loop).
0683b6fa
SM
5737 So the method used for those nasty loops is a little different:
5738 We use a special cycle-detection-stack-frame which is pushed
5739 when the on_failure_jump_nastyloop failure-point is *popped*.
5740 This special frame thus marks the beginning of one iteration
5741 through the loop and we can hence easily check right here
5742 whether something matched between the beginning and the end of
5743 the loop. */
5744 case on_failure_jump_nastyloop:
5745 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5746 DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5747 mcnt, p + mcnt);
0683b6fa
SM
5748
5749 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5750 {
5751 int cycle = 0;
5752 CHECK_INFINITE_LOOP (p - 4, d);
5753 if (!cycle)
5754 /* If there's a cycle, just continue without pushing
5755 this failure point. The failure point is the "try again"
5756 option, which shouldn't be tried.
5757 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5758 PUSH_FAILURE_POINT (p - 3, d);
5759 }
0683b6fa
SM
5760 break;
5761
4e8a9132
SM
5762 /* Simple loop detecting on_failure_jump: just check on the
5763 failure stack if the same spot was already hit earlier. */
505bde11
SM
5764 case on_failure_jump_loop:
5765 on_failure:
5766 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5767 DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5768 mcnt, p + mcnt);
6df42991
SM
5769 {
5770 int cycle = 0;
5771 CHECK_INFINITE_LOOP (p - 3, d);
5772 if (cycle)
5773 /* If there's a cycle, get out of the loop, as if the matching
5774 had failed. We used to just `goto fail' here, but that was
5775 aborting the search a bit too early: we want to keep the
5776 empty-loop-match and keep matching after the loop.
5777 We want (x?)*y\1z to match both xxyz and xxyxz. */
5778 p += mcnt;
5779 else
5780 PUSH_FAILURE_POINT (p - 3, d);
5781 }
25fe55af 5782 break;
fa9a63c5
RM
5783
5784
5785 /* Uses of on_failure_jump:
5e69f11e 5786
25fe55af
RS
5787 Each alternative starts with an on_failure_jump that points
5788 to the beginning of the next alternative. Each alternative
5789 except the last ends with a jump that in effect jumps past
5790 the rest of the alternatives. (They really jump to the
5791 ending jump of the following alternative, because tensioning
5792 these jumps is a hassle.)
fa9a63c5 5793
25fe55af
RS
5794 Repeats start with an on_failure_jump that points past both
5795 the repetition text and either the following jump or
5796 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5797 case on_failure_jump:
25fe55af 5798 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5799 DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
5800 mcnt, p + mcnt);
25fe55af 5801
505bde11 5802 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5803 break;
5804
4e8a9132 5805 /* This operation is used for greedy *.
505bde11
SM
5806 Compare the beginning of the repeat with what in the
5807 pattern follows its end. If we can establish that there
5808 is nothing that they would both match, i.e., that we
5809 would have to backtrack because of (as in, e.g., `a*a')
5810 then we can use a non-backtracking loop based on
4e8a9132 5811 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5812 case on_failure_jump_smart:
25fe55af 5813 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5814 DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5815 mcnt, p + mcnt);
25fe55af 5816 {
01618498 5817 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5818 /* Here, we discard `const', making re_match non-reentrant. */
5819 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5820 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5821
505bde11
SM
5822 p -= 3; /* Reset so that we will re-execute the
5823 instruction once it's been changed. */
fa9a63c5 5824
4e8a9132
SM
5825 EXTRACT_NUMBER (mcnt, p2 - 2);
5826
5827 /* Ensure this is a indeed the trivial kind of loop
5828 we are expecting. */
5829 assert (skip_one_char (p1) == p2 - 3);
5830 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5831 DEBUG_STATEMENT (debug += 2);
505bde11 5832 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5833 {
505bde11 5834 /* Use a fast `on_failure_keep_string_jump' loop. */
dc4a2ee0 5835 DEBUG_PRINT (" smart exclusive => fast loop.\n");
01618498 5836 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5837 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5838 }
505bde11 5839 else
fa9a63c5 5840 {
505bde11 5841 /* Default to a safe `on_failure_jump' loop. */
dc4a2ee0 5842 DEBUG_PRINT (" smart default => slow loop.\n");
01618498 5843 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5844 }
99633e97 5845 DEBUG_STATEMENT (debug -= 2);
25fe55af 5846 }
505bde11 5847 break;
25fe55af
RS
5848
5849 /* Unconditionally jump (without popping any failure points). */
5850 case jump:
fa9a63c5 5851 unconditional_jump:
5b370c2b 5852 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5853 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
dc4a2ee0 5854 DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
7814e705 5855 p += mcnt; /* Do the jump. */
dc4a2ee0 5856 DEBUG_PRINT ("(to %p).\n", p);
25fe55af
RS
5857 break;
5858
5859
25fe55af
RS
5860 /* Have to succeed matching what follows at least n times.
5861 After that, handle like `on_failure_jump'. */
5862 case succeed_n:
01618498 5863 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af 5864 EXTRACT_NUMBER (mcnt, p + 2);
dc4a2ee0 5865 DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5866
dc1e502d
SM
5867 /* Originally, mcnt is how many times we HAVE to succeed. */
5868 if (mcnt != 0)
25fe55af 5869 {
6dcf2d0e
SM
5870 /* Here, we discard `const', making re_match non-reentrant. */
5871 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5872 mcnt--;
01618498
SM
5873 p += 4;
5874 PUSH_NUMBER (p2, mcnt);
25fe55af 5875 }
dc1e502d
SM
5876 else
5877 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5878 goto on_failure;
25fe55af
RS
5879 break;
5880
5881 case jump_n:
01618498 5882 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af 5883 EXTRACT_NUMBER (mcnt, p + 2);
dc4a2ee0 5884 DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
25fe55af
RS
5885
5886 /* Originally, this is how many times we CAN jump. */
dc1e502d 5887 if (mcnt != 0)
25fe55af 5888 {
6dcf2d0e
SM
5889 /* Here, we discard `const', making re_match non-reentrant. */
5890 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5891 mcnt--;
01618498 5892 PUSH_NUMBER (p2, mcnt);
dc1e502d 5893 goto unconditional_jump;
25fe55af
RS
5894 }
5895 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5896 else
5897 p += 4;
25fe55af 5898 break;
5e69f11e 5899
fa9a63c5
RM
5900 case set_number_at:
5901 {
01618498 5902 unsigned char *p2; /* Location of the counter. */
dc4a2ee0 5903 DEBUG_PRINT ("EXECUTING set_number_at.\n");
fa9a63c5 5904
25fe55af 5905 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5906 /* Here, we discard `const', making re_match non-reentrant. */
5907 p2 = (unsigned char*) p + mcnt;
f224e500 5908 /* Signedness doesn't matter since we only copy MCNT's bits. */
25fe55af 5909 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0 5910 DEBUG_PRINT (" Setting %p to %d.\n", p2, mcnt);
01618498 5911 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5912 break;
5913 }
9121ca40
KH
5914
5915 case wordbound:
66f0296e 5916 case notwordbound:
19ed5445
PE
5917 {
5918 boolean not = (re_opcode_t) *(p - 1) == notwordbound;
dc4a2ee0 5919 DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
fa9a63c5 5920
19ed5445 5921 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5922
19ed5445
PE
5923 /* Case 1: D is at the beginning or the end of string. */
5924 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5925 not = !not;
5926 else
5927 {
5928 /* C1 is the character before D, S1 is the syntax of C1, C2
5929 is the character at D, and S2 is the syntax of C2. */
5930 re_wchar_t c1, c2;
5931 int s1, s2;
5932 int dummy;
b18215fc 5933#ifdef emacs
d1dfb56c
EZ
5934 ssize_t offset = PTR_TO_OFFSET (d - 1);
5935 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445 5936 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5937#endif
19ed5445
PE
5938 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5939 s1 = SYNTAX (c1);
b18215fc 5940#ifdef emacs
19ed5445 5941 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5942#endif
19ed5445
PE
5943 PREFETCH_NOLIMIT ();
5944 GET_CHAR_AFTER (c2, d, dummy);
5945 s2 = SYNTAX (c2);
5946
5947 if (/* Case 2: Only one of S1 and S2 is Sword. */
5948 ((s1 == Sword) != (s2 == Sword))
5949 /* Case 3: Both of S1 and S2 are Sword, and macro
5950 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5951 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5952 not = !not;
5953 }
5954 if (not)
5955 break;
5956 else
5957 goto fail;
5958 }
fa9a63c5
RM
5959
5960 case wordbeg:
dc4a2ee0 5961 DEBUG_PRINT ("EXECUTING wordbeg.\n");
fa9a63c5 5962
b18215fc
RS
5963 /* We FAIL in one of the following cases: */
5964
7814e705 5965 /* Case 1: D is at the end of string. */
b18215fc 5966 if (AT_STRINGS_END (d))
99633e97 5967 goto fail;
b18215fc
RS
5968 else
5969 {
5970 /* C1 is the character before D, S1 is the syntax of C1, C2
5971 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5972 re_wchar_t c1, c2;
5973 int s1, s2;
bf216479 5974 int dummy;
fa9a63c5 5975#ifdef emacs
d1dfb56c
EZ
5976 ssize_t offset = PTR_TO_OFFSET (d);
5977 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5978 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5979#endif
99633e97 5980 PREFETCH ();
6fdd04b0 5981 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 5982 s2 = SYNTAX (c2);
177c0ea7 5983
b18215fc
RS
5984 /* Case 2: S2 is not Sword. */
5985 if (s2 != Sword)
5986 goto fail;
5987
5988 /* Case 3: D is not at the beginning of string ... */
5989 if (!AT_STRINGS_BEG (d))
5990 {
5991 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5992#ifdef emacs
5d967c7a 5993 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5994#endif
b18215fc
RS
5995 s1 = SYNTAX (c1);
5996
5997 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 5998 returns 0. */
b18215fc
RS
5999 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6000 goto fail;
6001 }
6002 }
e318085a
RS
6003 break;
6004
b18215fc 6005 case wordend:
dc4a2ee0 6006 DEBUG_PRINT ("EXECUTING wordend.\n");
b18215fc
RS
6007
6008 /* We FAIL in one of the following cases: */
6009
6010 /* Case 1: D is at the beginning of string. */
6011 if (AT_STRINGS_BEG (d))
e318085a 6012 goto fail;
b18215fc
RS
6013 else
6014 {
6015 /* C1 is the character before D, S1 is the syntax of C1, C2
6016 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6017 re_wchar_t c1, c2;
6018 int s1, s2;
bf216479 6019 int dummy;
5d967c7a 6020#ifdef emacs
d1dfb56c
EZ
6021 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6022 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6023 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6024#endif
99633e97 6025 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6026 s1 = SYNTAX (c1);
6027
6028 /* Case 2: S1 is not Sword. */
6029 if (s1 != Sword)
6030 goto fail;
6031
6032 /* Case 3: D is not at the end of string ... */
6033 if (!AT_STRINGS_END (d))
6034 {
f1ad044f 6035 PREFETCH_NOLIMIT ();
6fdd04b0 6036 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6037#ifdef emacs
6038 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6039#endif
b18215fc
RS
6040 s2 = SYNTAX (c2);
6041
6042 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6043 returns 0. */
b18215fc 6044 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6045 goto fail;
b18215fc
RS
6046 }
6047 }
e318085a
RS
6048 break;
6049
669fa600 6050 case symbeg:
dc4a2ee0 6051 DEBUG_PRINT ("EXECUTING symbeg.\n");
669fa600
SM
6052
6053 /* We FAIL in one of the following cases: */
6054
7814e705 6055 /* Case 1: D is at the end of string. */
669fa600
SM
6056 if (AT_STRINGS_END (d))
6057 goto fail;
6058 else
6059 {
6060 /* C1 is the character before D, S1 is the syntax of C1, C2
6061 is the character at D, and S2 is the syntax of C2. */
6062 re_wchar_t c1, c2;
6063 int s1, s2;
6064#ifdef emacs
d1dfb56c
EZ
6065 ssize_t offset = PTR_TO_OFFSET (d);
6066 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6067 UPDATE_SYNTAX_TABLE (charpos);
6068#endif
6069 PREFETCH ();
62a6e103 6070 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6071 s2 = SYNTAX (c2);
7814e705 6072
669fa600
SM
6073 /* Case 2: S2 is neither Sword nor Ssymbol. */
6074 if (s2 != Sword && s2 != Ssymbol)
6075 goto fail;
6076
6077 /* Case 3: D is not at the beginning of string ... */
6078 if (!AT_STRINGS_BEG (d))
6079 {
6080 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6081#ifdef emacs
6082 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6083#endif
6084 s1 = SYNTAX (c1);
6085
6086 /* ... and S1 is Sword or Ssymbol. */
6087 if (s1 == Sword || s1 == Ssymbol)
6088 goto fail;
6089 }
6090 }
6091 break;
6092
6093 case symend:
dc4a2ee0 6094 DEBUG_PRINT ("EXECUTING symend.\n");
669fa600
SM
6095
6096 /* We FAIL in one of the following cases: */
6097
6098 /* Case 1: D is at the beginning of string. */
6099 if (AT_STRINGS_BEG (d))
6100 goto fail;
6101 else
6102 {
6103 /* C1 is the character before D, S1 is the syntax of C1, C2
6104 is the character at D, and S2 is the syntax of C2. */
6105 re_wchar_t c1, c2;
6106 int s1, s2;
6107#ifdef emacs
d1dfb56c
EZ
6108 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6109 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6110 UPDATE_SYNTAX_TABLE (charpos);
6111#endif
6112 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6113 s1 = SYNTAX (c1);
6114
6115 /* Case 2: S1 is neither Ssymbol nor Sword. */
6116 if (s1 != Sword && s1 != Ssymbol)
6117 goto fail;
6118
6119 /* Case 3: D is not at the end of string ... */
6120 if (!AT_STRINGS_END (d))
6121 {
6122 PREFETCH_NOLIMIT ();
62a6e103 6123 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6124#ifdef emacs
134579f2 6125 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6126#endif
6127 s2 = SYNTAX (c2);
6128
6129 /* ... and S2 is Sword or Ssymbol. */
6130 if (s2 == Sword || s2 == Ssymbol)
6131 goto fail;
b18215fc
RS
6132 }
6133 }
e318085a
RS
6134 break;
6135
fa9a63c5 6136 case syntaxspec:
1fb352e0 6137 case notsyntaxspec:
b18215fc 6138 {
19ed5445
PE
6139 boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6140 mcnt = *p++;
dc4a2ee0
PE
6141 DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
6142 mcnt);
19ed5445
PE
6143 PREFETCH ();
6144#ifdef emacs
6145 {
d1dfb56c
EZ
6146 ssize_t offset = PTR_TO_OFFSET (d);
6147 ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445
PE
6148 UPDATE_SYNTAX_TABLE (pos1);
6149 }
25fe55af 6150#endif
19ed5445
PE
6151 {
6152 int len;
6153 re_wchar_t c;
b18215fc 6154
19ed5445
PE
6155 GET_CHAR_AFTER (c, d, len);
6156 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6157 goto fail;
6158 d += len;
6159 }
b18215fc 6160 }
8fb31792 6161 break;
fa9a63c5 6162
b18215fc 6163#ifdef emacs
1fb352e0 6164 case before_dot:
dc4a2ee0 6165 DEBUG_PRINT ("EXECUTING before_dot.\n");
1fb352e0 6166 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6167 goto fail;
b18215fc
RS
6168 break;
6169
1fb352e0 6170 case at_dot:
dc4a2ee0 6171 DEBUG_PRINT ("EXECUTING at_dot.\n");
1fb352e0
SM
6172 if (PTR_BYTE_POS (d) != PT_BYTE)
6173 goto fail;
6174 break;
b18215fc 6175
1fb352e0 6176 case after_dot:
dc4a2ee0 6177 DEBUG_PRINT ("EXECUTING after_dot.\n");
1fb352e0
SM
6178 if (PTR_BYTE_POS (d) <= PT_BYTE)
6179 goto fail;
e318085a 6180 break;
fa9a63c5 6181
1fb352e0 6182 case categoryspec:
b18215fc 6183 case notcategoryspec:
b18215fc 6184 {
8fb31792
PE
6185 boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6186 mcnt = *p++;
dc4a2ee0
PE
6187 DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
6188 not ? "not" : "", mcnt);
8fb31792 6189 PREFETCH ();
01618498 6190
8fb31792
PE
6191 {
6192 int len;
6193 re_wchar_t c;
6194 GET_CHAR_AFTER (c, d, len);
6195 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6196 goto fail;
6197 d += len;
6198 }
b18215fc 6199 }
fa9a63c5 6200 break;
5e69f11e 6201
1fb352e0 6202#endif /* emacs */
5e69f11e 6203
0b32bf0e
SM
6204 default:
6205 abort ();
fa9a63c5 6206 }
b18215fc 6207 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6208
6209
6210 /* We goto here if a matching operation fails. */
6211 fail:
5b370c2b 6212 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6213 if (!FAIL_STACK_EMPTY ())
505bde11 6214 {
01618498 6215 re_char *str, *pat;
505bde11 6216 /* A restart point is known. Restore to that state. */
dc4a2ee0 6217 DEBUG_PRINT ("\nFAIL:\n");
0b32bf0e 6218 POP_FAILURE_POINT (str, pat);
7393bcbb 6219 switch (*pat++)
505bde11
SM
6220 {
6221 case on_failure_keep_string_jump:
6222 assert (str == NULL);
6223 goto continue_failure_jump;
6224
0683b6fa
SM
6225 case on_failure_jump_nastyloop:
6226 assert ((re_opcode_t)pat[-2] == no_op);
6227 PUSH_FAILURE_POINT (pat - 2, str);
6228 /* Fallthrough */
6229
505bde11
SM
6230 case on_failure_jump_loop:
6231 case on_failure_jump:
6232 case succeed_n:
6233 d = str;
6234 continue_failure_jump:
6235 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6236 p = pat + mcnt;
6237 break;
b18215fc 6238
0683b6fa
SM
6239 case no_op:
6240 /* A special frame used for nastyloops. */
6241 goto fail;
6242
505bde11 6243 default:
5e617bc2 6244 abort ();
505bde11 6245 }
fa9a63c5 6246
505bde11 6247 assert (p >= bufp->buffer && p <= pend);
b18215fc 6248
0b32bf0e 6249 if (d >= string1 && d <= end1)
fa9a63c5 6250 dend = end_match_1;
0b32bf0e 6251 }
fa9a63c5 6252 else
0b32bf0e 6253 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6254 } /* for (;;) */
6255
6256 if (best_regs_set)
6257 goto restore_best_regs;
6258
6259 FREE_VARIABLES ();
6260
b18215fc 6261 return -1; /* Failure to match. */
dc4a2ee0 6262}
fa9a63c5
RM
6263\f
6264/* Subroutine definitions for re_match_2. */
6265
fa9a63c5
RM
6266/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6267 bytes; nonzero otherwise. */
5e69f11e 6268
fa9a63c5 6269static int
29abe551 6270bcmp_translate (const_re_char *s1, const_re_char *s2, register ssize_t len,
438105ed 6271 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6272{
2d1675e4
SM
6273 register re_char *p1 = s1, *p2 = s2;
6274 re_char *p1_end = s1 + len;
6275 re_char *p2_end = s2 + len;
e934739e 6276
4bb91c68
SM
6277 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6278 different lengths, but relying on a single `len' would break this. -sm */
6279 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6280 {
e934739e 6281 int p1_charlen, p2_charlen;
01618498 6282 re_wchar_t p1_ch, p2_ch;
e934739e 6283
6fdd04b0
KH
6284 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6285 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6286
6287 if (RE_TRANSLATE (translate, p1_ch)
6288 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6289 return 1;
e934739e
RS
6290
6291 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6292 }
e934739e
RS
6293
6294 if (p1 != p1_end || p2 != p2_end)
6295 return 1;
6296
fa9a63c5
RM
6297 return 0;
6298}
6299\f
6300/* Entry points for GNU code. */
6301
6302/* re_compile_pattern is the GNU regular expression compiler: it
6303 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6304 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6305
fa9a63c5
RM
6306 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6307 are set in BUFP on entry.
5e69f11e 6308
b18215fc 6309 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6310
6311const char *
d1dfb56c
EZ
6312re_compile_pattern (const char *pattern, size_t length,
6313 struct re_pattern_buffer *bufp)
fa9a63c5
RM
6314{
6315 reg_errcode_t ret;
5e69f11e 6316
fa9a63c5
RM
6317 /* GNU code is written to assume at least RE_NREGS registers will be set
6318 (and at least one extra will be -1). */
6319 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6320
fa9a63c5
RM
6321 /* And GNU code determines whether or not to get register information
6322 by passing null for the REGS argument to re_match, etc., not by
6323 setting no_sub. */
6324 bufp->no_sub = 0;
5e69f11e 6325
4bb91c68 6326 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6327
6328 if (!ret)
6329 return NULL;
6330 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6331}
c0f9ea08 6332WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6333\f
b18215fc
RS
6334/* Entry points compatible with 4.2 BSD regex library. We don't define
6335 them unless specifically requested. */
fa9a63c5 6336
0b32bf0e 6337#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6338
6339/* BSD has one and only one pattern buffer. */
6340static struct re_pattern_buffer re_comp_buf;
6341
6342char *
0b32bf0e 6343# ifdef _LIBC
48afdd44
RM
6344/* Make these definitions weak in libc, so POSIX programs can redefine
6345 these names if they don't use our functions, and still use
6346 regcomp/regexec below without link errors. */
6347weak_function
0b32bf0e 6348# endif
31011111 6349re_comp (const char *s)
fa9a63c5
RM
6350{
6351 reg_errcode_t ret;
5e69f11e 6352
fa9a63c5
RM
6353 if (!s)
6354 {
6355 if (!re_comp_buf.buffer)
0b32bf0e 6356 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6357 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6358 return 0;
6359 }
6360
6361 if (!re_comp_buf.buffer)
6362 {
38182d90 6363 re_comp_buf.buffer = malloc (200);
fa9a63c5 6364 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6365 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6366 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6367 re_comp_buf.allocated = 200;
6368
38182d90 6369 re_comp_buf.fastmap = malloc (1 << BYTEWIDTH);
fa9a63c5 6370 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6371 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6372 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6373 }
6374
6375 /* Since `re_exec' always passes NULL for the `regs' argument, we
6376 don't need to initialize the pattern buffer fields which affect it. */
6377
fa9a63c5 6378 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6379
fa9a63c5
RM
6380 if (!ret)
6381 return NULL;
6382
6383 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6384 return (char *) gettext (re_error_msgid[(int) ret]);
6385}
6386
6387
31011111 6388int
0b32bf0e 6389# ifdef _LIBC
48afdd44 6390weak_function
0b32bf0e 6391# endif
d1dfb56c 6392re_exec (const char *s)
fa9a63c5 6393{
d1dfb56c 6394 const size_t len = strlen (s);
7d652d97 6395 return re_search (&re_comp_buf, s, len, 0, len, 0) >= 0;
fa9a63c5
RM
6396}
6397#endif /* _REGEX_RE_COMP */
6398\f
6399/* POSIX.2 functions. Don't define these for Emacs. */
6400
6401#ifndef emacs
6402
6403/* regcomp takes a regular expression as a string and compiles it.
6404
b18215fc 6405 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6406 since POSIX says we shouldn't. Thus, we set
6407
6408 `buffer' to the compiled pattern;
6409 `used' to the length of the compiled pattern;
6410 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6411 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6412 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6413 `fastmap' to an allocated space for the fastmap;
6414 `fastmap_accurate' to zero;
fa9a63c5
RM
6415 `re_nsub' to the number of subexpressions in PATTERN.
6416
6417 PATTERN is the address of the pattern string.
6418
6419 CFLAGS is a series of bits which affect compilation.
6420
6421 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6422 use POSIX basic syntax.
6423
6424 If REG_NEWLINE is set, then . and [^...] don't match newline.
6425 Also, regexec will try a match beginning after every newline.
6426
6427 If REG_ICASE is set, then we considers upper- and lowercase
6428 versions of letters to be equivalent when matching.
6429
6430 If REG_NOSUB is set, then when PREG is passed to regexec, that
6431 routine will report only success or failure, and nothing about the
6432 registers.
6433
b18215fc 6434 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6435 the return codes and their meanings.) */
6436
d1dfb56c 6437reg_errcode_t
29abe551 6438regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern,
d2762c86 6439 int cflags)
fa9a63c5
RM
6440{
6441 reg_errcode_t ret;
4bb91c68 6442 reg_syntax_t syntax
fa9a63c5
RM
6443 = (cflags & REG_EXTENDED) ?
6444 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6445
6446 /* regex_compile will allocate the space for the compiled pattern. */
6447 preg->buffer = 0;
6448 preg->allocated = 0;
6449 preg->used = 0;
5e69f11e 6450
c0f9ea08 6451 /* Try to allocate space for the fastmap. */
38182d90 6452 preg->fastmap = malloc (1 << BYTEWIDTH);
5e69f11e 6453
fa9a63c5
RM
6454 if (cflags & REG_ICASE)
6455 {
6456 unsigned i;
5e69f11e 6457
38182d90 6458 preg->translate = malloc (CHAR_SET_SIZE * sizeof *preg->translate);
fa9a63c5 6459 if (preg->translate == NULL)
0b32bf0e 6460 return (int) REG_ESPACE;
fa9a63c5
RM
6461
6462 /* Map uppercase characters to corresponding lowercase ones. */
6463 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6464 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6465 }
6466 else
6467 preg->translate = NULL;
6468
6469 /* If REG_NEWLINE is set, newlines are treated differently. */
6470 if (cflags & REG_NEWLINE)
6471 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6472 syntax &= ~RE_DOT_NEWLINE;
6473 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6474 }
6475 else
c0f9ea08 6476 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6477
6478 preg->no_sub = !!(cflags & REG_NOSUB);
6479
5e69f11e 6480 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6481 can use strlen here in compiling the pattern. */
4bb91c68 6482 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6483
fa9a63c5
RM
6484 /* POSIX doesn't distinguish between an unmatched open-group and an
6485 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6486 if (ret == REG_ERPAREN)
6487 ret = REG_EPAREN;
6488
6489 if (ret == REG_NOERROR && preg->fastmap)
6490 { /* Compute the fastmap now, since regexec cannot modify the pattern
6491 buffer. */
6492 re_compile_fastmap (preg);
6493 if (preg->can_be_null)
6494 { /* The fastmap can't be used anyway. */
6495 free (preg->fastmap);
6496 preg->fastmap = NULL;
6497 }
6498 }
d1dfb56c 6499 return ret;
fa9a63c5 6500}
c0f9ea08 6501WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6502
6503
6504/* regexec searches for a given pattern, specified by PREG, in the
6505 string STRING.
5e69f11e 6506
fa9a63c5 6507 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6508 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6509 least NMATCH elements, and we set them to the offsets of the
6510 corresponding matched substrings.
5e69f11e 6511
fa9a63c5
RM
6512 EFLAGS specifies `execution flags' which affect matching: if
6513 REG_NOTBOL is set, then ^ does not match at the beginning of the
6514 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6515
fa9a63c5
RM
6516 We return 0 if we find a match and REG_NOMATCH if not. */
6517
d1dfb56c 6518reg_errcode_t
29abe551
PE
6519regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
6520 size_t nmatch, regmatch_t pmatch[_Restrict_arr_], int eflags)
fa9a63c5 6521{
31011111 6522 regoff_t ret;
fa9a63c5
RM
6523 struct re_registers regs;
6524 regex_t private_preg;
d1dfb56c 6525 size_t len = strlen (string);
c0f9ea08 6526 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6527
6528 private_preg = *preg;
5e69f11e 6529
fa9a63c5
RM
6530 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6531 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6532
fa9a63c5
RM
6533 /* The user has told us exactly how many registers to return
6534 information about, via `nmatch'. We have to pass that on to the
b18215fc 6535 matching routines. */
fa9a63c5 6536 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6537
fa9a63c5
RM
6538 if (want_reg_info)
6539 {
6540 regs.num_regs = nmatch;
4bb91c68
SM
6541 regs.start = TALLOC (nmatch * 2, regoff_t);
6542 if (regs.start == NULL)
d1dfb56c 6543 return REG_NOMATCH;
4bb91c68 6544 regs.end = regs.start + nmatch;
fa9a63c5
RM
6545 }
6546
c0f9ea08
SM
6547 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6548 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6549 was a little bit longer but still only matching the real part.
6550 This works because the `endline' will check for a '\n' and will find a
6551 '\0', correctly deciding that this is not the end of a line.
6552 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6553 a convenient '\0' there. For all we know, the string could be preceded
6554 by '\n' which would throw things off. */
6555
fa9a63c5
RM
6556 /* Perform the searching operation. */
6557 ret = re_search (&private_preg, string, len,
0b32bf0e 6558 /* start: */ 0, /* range: */ len,
7d652d97 6559 want_reg_info ? &regs : 0);
5e69f11e 6560
fa9a63c5
RM
6561 /* Copy the register information to the POSIX structure. */
6562 if (want_reg_info)
6563 {
6564 if (ret >= 0)
0b32bf0e
SM
6565 {
6566 unsigned r;
fa9a63c5 6567
0b32bf0e
SM
6568 for (r = 0; r < nmatch; r++)
6569 {
6570 pmatch[r].rm_so = regs.start[r];
6571 pmatch[r].rm_eo = regs.end[r];
6572 }
6573 }
fa9a63c5 6574
b18215fc 6575 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6576 free (regs.start);
fa9a63c5
RM
6577 }
6578
6579 /* We want zero return to mean success, unlike `re_search'. */
d1dfb56c 6580 return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
fa9a63c5 6581}
c0f9ea08 6582WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6583
6584
ec869672
JR
6585/* Returns a message corresponding to an error code, ERR_CODE, returned
6586 from either regcomp or regexec. We don't use PREG here.
6587
6588 ERR_CODE was previously called ERRCODE, but that name causes an
6589 error with msvc8 compiler. */
fa9a63c5
RM
6590
6591size_t
d2762c86 6592regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6593{
6594 const char *msg;
6595 size_t msg_size;
6596
ec869672
JR
6597 if (err_code < 0
6598 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6599 /* Only error codes returned by the rest of the code should be passed
b18215fc 6600 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6601 code generates an invalid error code, then the program has a bug.
6602 Dump core so we can fix it. */
6603 abort ();
6604
ec869672 6605 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6606
6607 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6608
fa9a63c5
RM
6609 if (errbuf_size != 0)
6610 {
6611 if (msg_size > errbuf_size)
0b32bf0e 6612 {
e99a530f 6613 memcpy (errbuf, msg, errbuf_size - 1);
0b32bf0e
SM
6614 errbuf[errbuf_size - 1] = 0;
6615 }
fa9a63c5 6616 else
0b32bf0e 6617 strcpy (errbuf, msg);
fa9a63c5
RM
6618 }
6619
6620 return msg_size;
6621}
c0f9ea08 6622WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6623
6624
6625/* Free dynamically allocated space used by PREG. */
6626
6627void
d2762c86 6628regfree (regex_t *preg)
fa9a63c5 6629{
c2cd06e6 6630 free (preg->buffer);
fa9a63c5 6631 preg->buffer = NULL;
5e69f11e 6632
fa9a63c5
RM
6633 preg->allocated = 0;
6634 preg->used = 0;
6635
c2cd06e6 6636 free (preg->fastmap);
fa9a63c5
RM
6637 preg->fastmap = NULL;
6638 preg->fastmap_accurate = 0;
6639
c2cd06e6 6640 free (preg->translate);
fa9a63c5
RM
6641 preg->translate = NULL;
6642}
c0f9ea08 6643WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6644
6645#endif /* not emacs */