Prefer enum glyph_row_area to int where appropriate.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
ab422c4d 5 Copyright (C) 1993-2013 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
e468b87f 9 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
fee0bd5f 18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
fa9a63c5 19
6df42991 20/* TODO:
505bde11 21 - structure the opcode space into opcode+flag.
dc1e502d 22 - merge with glibc's regex.[ch].
01618498 23 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
24 need to modify the compiled regexp so that re_match can be reentrant.
25 - get rid of on_failure_jump_smart by doing the optimization in re_comp
26 rather than at run-time, so that re_match can be reentrant.
01618498 27*/
505bde11 28
b7432bb2 29/* AIX requires this to be the first thing in the file. */
0b32bf0e 30#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
31 #pragma alloca
32#endif
33
b8df54ff
PE
34/* Ignore some GCC warnings for now. This section should go away
35 once the Emacs and Gnulib regex code is merged. */
31ff141c 36#if 4 < __GNUC__ + (5 <= __GNUC_MINOR__) || defined __clang__
b8df54ff
PE
37# pragma GCC diagnostic ignored "-Wstrict-overflow"
38# ifndef emacs
b8df54ff
PE
39# pragma GCC diagnostic ignored "-Wunused-function"
40# pragma GCC diagnostic ignored "-Wunused-macros"
41# pragma GCC diagnostic ignored "-Wunused-result"
42# pragma GCC diagnostic ignored "-Wunused-variable"
43# endif
44#endif
45
31ff141c
PE
46#if 4 < __GNUC__ + (5 <= __GNUC_MINOR__) && ! defined __clang__
47# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
48#endif
49
cf38a720 50#include <config.h>
fa9a63c5 51
0e926e56
PE
52#include <stddef.h>
53
54#ifdef emacs
4bb91c68
SM
55/* We need this for `regex.h', and perhaps for the Emacs include files. */
56# include <sys/types.h>
57#endif
fa9a63c5 58
14473664
SM
59/* Whether to use ISO C Amendment 1 wide char functions.
60 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
61#if defined _LIBC
62#define WIDE_CHAR_SUPPORT 1
63#else
14473664 64#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
65 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
66#endif
14473664 67
fa463103 68/* For platform which support the ISO C amendment 1 functionality we
14473664 69 support user defined character classes. */
a0ad02f7 70#if WIDE_CHAR_SUPPORT
14473664
SM
71/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
72# include <wchar.h>
73# include <wctype.h>
74#endif
75
c0f9ea08
SM
76#ifdef _LIBC
77/* We have to keep the namespace clean. */
78# define regfree(preg) __regfree (preg)
79# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
80# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672 81# define regerror(err_code, preg, errbuf, errbuf_size) \
5e617bc2 82 __regerror (err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
83# define re_set_registers(bu, re, nu, st, en) \
84 __re_set_registers (bu, re, nu, st, en)
85# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
86 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
87# define re_match(bufp, string, size, pos, regs) \
88 __re_match (bufp, string, size, pos, regs)
89# define re_search(bufp, string, size, startpos, range, regs) \
90 __re_search (bufp, string, size, startpos, range, regs)
91# define re_compile_pattern(pattern, length, bufp) \
92 __re_compile_pattern (pattern, length, bufp)
93# define re_set_syntax(syntax) __re_set_syntax (syntax)
94# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
95 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
96# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
97
14473664
SM
98/* Make sure we call libc's function even if the user overrides them. */
99# define btowc __btowc
100# define iswctype __iswctype
101# define wctype __wctype
102
c0f9ea08
SM
103# define WEAK_ALIAS(a,b) weak_alias (a, b)
104
105/* We are also using some library internals. */
106# include <locale/localeinfo.h>
107# include <locale/elem-hash.h>
108# include <langinfo.h>
109#else
110# define WEAK_ALIAS(a,b)
111#endif
112
4bb91c68 113/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 114#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
115# include <libintl.h>
116#else
117# define gettext(msgid) (msgid)
118#endif
119
5e69f11e
RM
120#ifndef gettext_noop
121/* This define is so xgettext can find the internationalizable
122 strings. */
0b32bf0e 123# define gettext_noop(String) String
5e69f11e
RM
124#endif
125
fa9a63c5
RM
126/* The `emacs' switch turns on certain matching commands
127 that make sense only in Emacs. */
128#ifdef emacs
129
0b32bf0e 130# include "lisp.h"
e5560ff7 131# include "character.h"
0b32bf0e 132# include "buffer.h"
b18215fc
RS
133
134/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 135# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 136
0b32bf0e 137# include "syntax.h"
0b32bf0e 138# include "category.h"
fa9a63c5 139
7689ef0b
EZ
140# ifdef malloc
141# undef malloc
142# endif
0b32bf0e 143# define malloc xmalloc
7689ef0b
EZ
144# ifdef realloc
145# undef realloc
146# endif
0b32bf0e 147# define realloc xrealloc
7689ef0b
EZ
148# ifdef free
149# undef free
150# endif
0b32bf0e 151# define free xfree
9abbd165 152
7814e705 153/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
154# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
155# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
156
157# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 158# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
159# define RE_STRING_CHAR(p, multibyte) \
160 (multibyte ? (STRING_CHAR (p)) : (*(p)))
161# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
162 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 163
4c0354d7 164# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 165
2afc21f5 166# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 167
6fdd04b0
KH
168/* Set C a (possibly converted to multibyte) character before P. P
169 points into a string which is the virtual concatenation of STR1
170 (which ends at END1) or STR2 (which ends at END2). */
bf216479
KH
171# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
172 do { \
02cb78b5 173 if (target_multibyte) \
bf216479
KH
174 { \
175 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
176 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
177 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 178 c = STRING_CHAR (dtemp); \
bf216479
KH
179 } \
180 else \
181 { \
182 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 183 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 184 } \
2d1675e4
SM
185 } while (0)
186
/* Set C to the (possibly converted to multibyte) character at P, and
   set LEN to the byte length of that character.  When
   `target_multibyte' is nonzero the character is decoded with
   STRING_CHAR_AND_LENGTH; otherwise the single byte at P is read, LEN
   becomes 1, and the byte is converted with RE_CHAR_TO_MULTIBYTE.
   P and LEN are parenthesized so that expression arguments expand
   as intended.  */
# define GET_CHAR_AFTER(c, p, len)		\
  do {						\
    if (target_multibyte)			\
      (c) = STRING_CHAR_AND_LENGTH (p, len);	\
    else					\
      {						\
	(c) = *(p);				\
	(len) = 1;				\
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
      }						\
  } while (0)
4e8a9132 200
fa9a63c5
RM
201#else /* not emacs */
202
203/* If we are not linking with Emacs proper,
204 we can't use the relocating allocator
205 even if config.h says that we can. */
0b32bf0e 206# undef REL_ALLOC
fa9a63c5 207
4004364e 208# include <unistd.h>
fa9a63c5 209
a77f947b
CY
210/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
211
/* Allocate SIZE bytes with malloc and return the new block.  If malloc
   yields a null pointer for a nonzero SIZE, write a diagnostic to file
   descriptor 2 and terminate the process with exit status 1.  A null
   result for a zero SIZE is returned to the caller unchanged.  */
static void *
xmalloc (size_t size)
{
  void *val = malloc (size);
  if (!val && size)
    {
      /* 25 is the length of the message, excluding the terminating NUL.  */
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}
223
/* Resize BLOCK to SIZE bytes and return the resulting block.  A null
   BLOCK is handled by calling malloc directly.  If the allocation
   yields a null pointer for a nonzero SIZE, write a diagnostic to file
   descriptor 2 and terminate the process with exit status 1.  */
static void *
xrealloc (void *block, size_t size)
{
  void *val;
  /* We must call malloc explicitly when BLOCK is 0, since some
     reallocs don't do this.  */
  if (! block)
    val = malloc (size);
  else
    val = realloc (block, size);
  if (!val && size)
    {
      /* 25 is the length of the message, excluding the terminating NUL.  */
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}
241
a073faa6
CY
242# ifdef malloc
243# undef malloc
244# endif
245# define malloc xmalloc
246# ifdef realloc
247# undef realloc
248# endif
249# define realloc xrealloc
250
f5d9e83a 251# include <stdbool.h>
9cfdb3ec 252# include <string.h>
fa9a63c5
RM
253
254/* Define the syntax stuff for \<, \>, etc. */
255
990b2375 256/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 257enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 258
e934739e 259/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
260# define CHAR_CHARSET(c) 0
261# define CHARSET_LEADING_CODE_BASE(c) 0
262# define MAX_MULTIBYTE_LENGTH 1
263# define RE_MULTIBYTE_P(x) 0
bf216479 264# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
265# define WORD_BOUNDARY_P(c1, c2) (0)
266# define CHAR_HEAD_P(p) (1)
267# define SINGLE_BYTE_CHAR_P(c) (1)
268# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 269# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 270# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
271# define STRING_CHAR(p) (*(p))
272# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 273# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
274# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
275# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
276# define RE_CHAR_TO_MULTIBYTE(c) (c)
277# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 278# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 279 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
/* Set C to the byte at P and LEN to 1.  P and LEN are parenthesized so
   that an argument such as `d + 1' is dereferenced as a whole rather
   than expanding to `*d + 1'.  */
# define GET_CHAR_AFTER(c, p, len)					\
  ((c) = *(p), (len) = 1)
0b32bf0e 282# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
283# define BYTE8_TO_CHAR(c) (c)
284# define CHAR_BYTE8_P(c) (0)
bf216479 285# define CHAR_LEADING_CODE(c) (c)
8f924df7 286
fa9a63c5 287#endif /* not emacs */
4e8a9132
SM
288
289#ifndef RE_TRANSLATE
0b32bf0e
SM
290# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
291# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 292#endif
fa9a63c5
RM
293\f
294/* Get the interface, including the syntax bits. */
295#include "regex.h"
296
f71b19b6
DL
297/* isalpha etc. are used for the character classes. */
298#include <ctype.h>
fa9a63c5 299
f71b19b6 300#ifdef emacs
fa9a63c5 301
f71b19b6 302/* 1 if C is an ASCII character. */
0b32bf0e 303# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 304
f71b19b6 305/* 1 if C is a unibyte character. */
0b32bf0e 306# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 307
f71b19b6 308/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 309
f71b19b6 310/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
311# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
312# define ISCNTRL(c) ((c) < ' ')
313# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
314 || ((c) >= 'a' && (c) <= 'f') \
315 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
316
317/* This is only used for single-byte characters. */
0b32bf0e 318# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
319
320/* The rest must handle multibyte characters. */
321
0b32bf0e 322# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 323 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
324 : 1)
325
14473664 326# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 327 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
328 : 1)
329
0b32bf0e 330# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
331 ? (((c) >= 'a' && (c) <= 'z') \
332 || ((c) >= 'A' && (c) <= 'Z') \
333 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
334 : SYNTAX (c) == Sword)
335
0b32bf0e 336# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
337 ? (((c) >= 'a' && (c) <= 'z') \
338 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
339 : SYNTAX (c) == Sword)
340
5da9919f 341# define ISLOWER(c) lowercasep (c)
96cc36cc 342
0b32bf0e 343# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
344 ? ((c) > ' ' && (c) < 0177 \
345 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
346 || ((c) >= 'A' && (c) <= 'Z') \
347 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
348 : SYNTAX (c) != Sword)
349
0b32bf0e 350# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 351
5da9919f 352# define ISUPPER(c) uppercasep (c)
96cc36cc 353
0b32bf0e 354# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
355
356#else /* not emacs */
357
f71b19b6 358/* 1 if C is an ASCII character. */
0b32bf0e 359# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
360
361/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
362# define ISUNIBYTE(c) 1
363
364# ifdef isblank
0e926e56 365# define ISBLANK(c) isblank (c)
0b32bf0e
SM
366# else
367# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
368# endif
369# ifdef isgraph
0e926e56 370# define ISGRAPH(c) isgraph (c)
0b32bf0e 371# else
0e926e56 372# define ISGRAPH(c) (isprint (c) && !isspace (c))
0b32bf0e
SM
373# endif
374
0e926e56 375/* Solaris defines ISPRINT so we must undefine it first. */
4bb91c68 376# undef ISPRINT
0e926e56
PE
377# define ISPRINT(c) isprint (c)
378# define ISDIGIT(c) isdigit (c)
379# define ISALNUM(c) isalnum (c)
380# define ISALPHA(c) isalpha (c)
381# define ISCNTRL(c) iscntrl (c)
382# define ISLOWER(c) islower (c)
383# define ISPUNCT(c) ispunct (c)
384# define ISSPACE(c) isspace (c)
385# define ISUPPER(c) isupper (c)
386# define ISXDIGIT(c) isxdigit (c)
0b32bf0e 387
5e617bc2 388# define ISWORD(c) ISALPHA (c)
0b32bf0e 389
4bb91c68 390# ifdef _tolower
5e617bc2 391# define TOLOWER(c) _tolower (c)
4bb91c68 392# else
5e617bc2 393# define TOLOWER(c) tolower (c)
4bb91c68
SM
394# endif
395
396/* How many characters in the character set. */
397# define CHAR_SET_SIZE 256
398
0b32bf0e 399# ifdef SYNTAX_TABLE
f71b19b6 400
0b32bf0e 401extern char *re_syntax_table;
f71b19b6 402
0b32bf0e
SM
403# else /* not SYNTAX_TABLE */
404
0b32bf0e
SM
405static char re_syntax_table[CHAR_SET_SIZE];
406
407static void
d2762c86 408init_syntax_once (void)
0b32bf0e
SM
409{
410 register int c;
411 static int done = 0;
412
413 if (done)
414 return;
415
72af86bd 416 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 417
4bb91c68
SM
418 for (c = 0; c < CHAR_SET_SIZE; ++c)
419 if (ISALNUM (c))
420 re_syntax_table[c] = Sword;
fa9a63c5 421
669fa600 422 re_syntax_table['_'] = Ssymbol;
fa9a63c5 423
0b32bf0e
SM
424 done = 1;
425}
426
427# endif /* not SYNTAX_TABLE */
96cc36cc 428
4bb91c68
SM
429# define SYNTAX(c) re_syntax_table[(c)]
430
96cc36cc
RS
431#endif /* not emacs */
432\f
261cb4bb 433#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
434\f
435/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
436 use `alloca' instead of `malloc'. This is because using malloc in
437 re_search* or re_match* could cause memory leaks when C-g is used in
438 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
439 the other hand, malloc is more portable, and easier to debug.
440
fa9a63c5
RM
441 Because we sometimes use alloca, some routines have to be macros,
442 not functions -- `alloca'-allocated space disappears at the end of the
443 function it is called in. */
444
445#ifdef REGEX_MALLOC
446
0b32bf0e
SM
447# define REGEX_ALLOCATE malloc
448# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
449# define REGEX_FREE free
fa9a63c5
RM
450
451#else /* not REGEX_MALLOC */
452
453/* Emacs already defines alloca, sometimes. */
0b32bf0e 454# ifndef alloca
fa9a63c5
RM
455
456/* Make alloca work the best possible way. */
0b32bf0e
SM
457# ifdef __GNUC__
458# define alloca __builtin_alloca
459# else /* not __GNUC__ */
7f585e7a 460# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
461# include <alloca.h>
462# endif /* HAVE_ALLOCA_H */
463# endif /* not __GNUC__ */
fa9a63c5 464
0b32bf0e 465# endif /* not alloca */
fa9a63c5 466
0b32bf0e 467# define REGEX_ALLOCATE alloca
fa9a63c5
RM
468
469/* Assumes a `char *destination' variable. */
0b32bf0e 470# define REGEX_REALLOCATE(source, osize, nsize) \
7d652d97 471 (destination = alloca (nsize), \
4bb91c68 472 memcpy (destination, source, osize))
fa9a63c5
RM
473
474/* No need to do anything to free, after alloca. */
0b32bf0e 475# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
476
477#endif /* not REGEX_MALLOC */
478
479/* Define how to allocate the failure stack. */
480
0b32bf0e 481#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 482
0b32bf0e 483# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 484 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 485# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 486 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 487# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
488 r_alloc_free (&failure_stack_ptr)
489
4297555e 490#else /* not using relocating allocator */
fa9a63c5 491
0b32bf0e 492# ifdef REGEX_MALLOC
fa9a63c5 493
0b32bf0e
SM
494# define REGEX_ALLOCATE_STACK malloc
495# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
496# define REGEX_FREE_STACK free
fa9a63c5 497
0b32bf0e 498# else /* not REGEX_MALLOC */
fa9a63c5 499
0b32bf0e 500# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 501
0b32bf0e 502# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 503 REGEX_REALLOCATE (source, osize, nsize)
7814e705 504/* No need to explicitly free anything. */
0b32bf0e 505# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 506
0b32bf0e 507# endif /* not REGEX_MALLOC */
4297555e 508#endif /* not using relocating allocator */
fa9a63c5
RM
509
510
511/* True if `size1' is non-NULL and PTR is pointing anywhere inside
512 `string1' or just past its end. This works if PTR is NULL, which is
513 a good thing. */
25fe55af 514#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
515 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
516
517/* (Re)Allocate N items of type T using malloc, or fail. */
518#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
519#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
fa9a63c5
RM
520#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
521
4bb91c68 522#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
523
524#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
525
526#undef MAX
527#undef MIN
528#define MAX(a, b) ((a) > (b) ? (a) : (b))
529#define MIN(a, b) ((a) < (b) ? (a) : (b))
530
66f0296e 531/* Type of source-pattern and string chars. */
a6fc3b5c
EZ
532#ifdef _MSC_VER
533typedef unsigned char re_char;
29abe551 534typedef const re_char const_re_char;
a6fc3b5c 535#else
66f0296e 536typedef const unsigned char re_char;
29abe551 537typedef re_char const_re_char;
a6fc3b5c 538#endif
66f0296e 539
fa9a63c5 540typedef char boolean;
fa9a63c5 541
261cb4bb
PE
542static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
543 re_char *string1, size_t size1,
544 re_char *string2, size_t size2,
545 ssize_t pos,
546 struct re_registers *regs,
547 ssize_t stop);
fa9a63c5
RM
548\f
549/* These are the command codes that appear in compiled regular
4bb91c68 550 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
551 command code can specify any interpretation whatsoever for its
552 arguments. Zero bytes may appear in the compiled regular expression. */
553
typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

	/* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

	/* Matches any (more or less) character.  */
  anychar,

	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
		   See RANGE_TABLE_WORK_BITS below.
	       2 bytes, the number of pairs that follow (up to 32767)
	       pairs, each 2 multibyte characters,
		   each multibyte character represented as 3 bytes.  */
  charset,

	/* Same parameters as charset, but match any character that is
	   not one of those specified.  */
  charset_not,

	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
	   field.  */
  start_memory,

	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
	   number, in the range 0 to one less than `re_nsub' in the
	   pattern buffer.  */
  stop_memory,

	/* Match a duplicate of something remembered.  Followed by one
	   byte containing the register number.  */
  duplicate,

	/* Fail unless at beginning of line.  */
  begline,

	/* Fail unless at end of line.  */
  endline,

	/* Succeeds if at beginning of buffer (if emacs) or at beginning
	   of string to be matched (if not).  */
  begbuf,

	/* Analogously, for end of buffer/string.  */
  endbuf,

	/* Followed by two byte relative address to which to jump.  */
  jump,

	/* Followed by two-byte relative address of place to resume at
	   in case of failure.  */
  on_failure_jump,

	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
  on_failure_keep_string_jump,

	/* Just like `on_failure_jump', except that it checks that we
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

	/* Just like `on_failure_jump_loop', except that it checks for
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
	   by a `no_op'.  */
  on_failure_jump_nastyloop,

	/* A smart `on_failure_jump' used for greedy * and + operators.
	   It analyzes the loop before which it is put and if the
	   loop does not require backtracking, it changes itself to
	   `on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into `on_failure_jump'.
	   It assumes that it is pointing to just past a `jump'.  */
  on_failure_jump_smart,

	/* Followed by two-byte relative address and two-byte number n.
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
  succeed_n,

	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
  jump_n,

	/* Set the following two-byte relative address to the
	   subsequent two-byte number.  The address *includes* the two
	   bytes of number.  */
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound,	/* Succeeds if not at a word boundary.  */

  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

	/* Matches any character whose syntax is specified.  Followed by
	   a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Matches any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
693\f
694/* Common operations on the compiled pattern. */
695
696/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
697
698#define STORE_NUMBER(destination, number) \
699 do { \
700 (destination)[0] = (number) & 0377; \
701 (destination)[1] = (number) >> 8; \
702 } while (0)
703
704/* Same as STORE_NUMBER, except increment DESTINATION to
705 the byte after where the number is stored. Therefore, DESTINATION
706 must be an lvalue. */
707
708#define STORE_NUMBER_AND_INCR(destination, number) \
709 do { \
710 STORE_NUMBER (destination, number); \
711 (destination) += 2; \
712 } while (0)
713
714/* Put into DESTINATION a number stored in two contiguous bytes starting
715 at SOURCE. */
716
717#define EXTRACT_NUMBER(destination, source) \
dc4a2ee0 718 ((destination) = extract_number (source))
fa9a63c5 719
dc4a2ee0
PE
720static int
721extract_number (re_char *source)
fa9a63c5 722{
dc4a2ee0 723 return (SIGN_EXTEND_CHAR (source[1]) << 8) + source[0];
fa9a63c5
RM
724}
725
fa9a63c5
RM
726/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
727 SOURCE must be an lvalue. */
728
729#define EXTRACT_NUMBER_AND_INCR(destination, source) \
dc4a2ee0 730 ((destination) = extract_number_and_incr (&source))
fa9a63c5 731
dc4a2ee0
PE
732static int
733extract_number_and_incr (re_char **source)
5e69f11e 734{
dc4a2ee0 735 int num = extract_number (*source);
fa9a63c5 736 *source += 2;
dc4a2ee0 737 return num;
fa9a63c5 738}
fa9a63c5 739\f
b18215fc
RS
740/* Store a multibyte character in three contiguous bytes starting
741 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 742 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
743
744#define STORE_CHARACTER_AND_INCR(destination, character) \
745 do { \
746 (destination)[0] = (character) & 0377; \
747 (destination)[1] = ((character) >> 8) & 0377; \
748 (destination)[2] = (character) >> 16; \
749 (destination) += 3; \
750 } while (0)
751
752/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 753 starting at SOURCE. */
b18215fc
RS
754
755#define EXTRACT_CHARACTER(destination, source) \
756 do { \
757 (destination) = ((source)[0] \
758 | ((source)[1] << 8) \
759 | ((source)[2] << 16)); \
760 } while (0)
761
762
763/* Macros for charset. */
764
765/* Size of bitmap of charset P in bytes. P is a start of charset,
766 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
767#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
768
769/* Nonzero if charset P has range table. */
25fe55af 770#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
771
/* Return the address of the range table of charset P.  This is not the
   start of the table's entries, but the position just before them,
   where the number of ranges is stored.  `4 +' skips the re_opcode_t
   byte, the bitmap-size byte, and the 2 bytes of flags at the start of
   the range table; CHARSET_BITMAP_SIZE (P) skips the bitmap itself.  */
776#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
777
778/* Extract the bit flags that start a range table. */
779#define CHARSET_RANGE_TABLE_BITS(p) \
780 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
781 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc 782
b18215fc 783/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
784 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
785 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
786 and end. */
787#define CHARSET_RANGE_TABLE_END(range_table, count) \
788 ((range_table) + (count) * 2 * 3)
789
7814e705 790/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
791 COUNT is number of ranges in RANGE_TABLE. */
792#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
793 do \
794 { \
01618498 795 re_wchar_t range_start, range_end; \
19ed5445 796 re_char *rtp; \
01618498 797 re_char *range_table_end \
b18215fc
RS
798 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
799 \
19ed5445 800 for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
b18215fc 801 { \
19ed5445
PE
802 EXTRACT_CHARACTER (range_start, rtp); \
803 EXTRACT_CHARACTER (range_end, rtp + 3); \
b18215fc
RS
804 \
805 if (range_start <= (c) && (c) <= range_end) \
806 { \
807 (not) = !(not); \
808 break; \
809 } \
810 } \
811 } \
812 while (0)
813
814/* Test if C is in range table of CHARSET. The flag NOT is negated if
815 C is listed in it. */
816#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
817 do \
818 { \
819 /* Number of ranges in range table. */ \
820 int count; \
01618498
SM
821 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
822 \
b18215fc
RS
823 EXTRACT_NUMBER_AND_INCR (count, range_table); \
824 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
825 } \
826 while (0)
827\f
fa9a63c5
RM
828/* If DEBUG is defined, Regex prints many voluminous messages about what
829 it is doing (if the variable `debug' is nonzero). If linked with the
830 main program in `iregex.c', you can enter patterns and strings
831 interactively. And if linked with the main program in `main.c' and
4bb91c68 832 the other test files, you can run the already-written tests. */
fa9a63c5
RM
833
834#ifdef DEBUG
835
836/* We use standard I/O for debugging. */
0b32bf0e 837# include <stdio.h>
fa9a63c5
RM
838
839/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 840# include <assert.h>
fa9a63c5 841
99633e97 842static int debug = -100000;
fa9a63c5 843
0b32bf0e 844# define DEBUG_STATEMENT(e) e
dc4a2ee0
PE
845# define DEBUG_PRINT(...) if (debug > 0) printf (__VA_ARGS__)
846# define DEBUG_COMPILES_ARGUMENTS
0b32bf0e 847# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 848 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 849# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 850 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
851
852
853/* Print the fastmap in human-readable form. */
854
dc4a2ee0
PE
855static void
856print_fastmap (char *fastmap)
fa9a63c5
RM
857{
858 unsigned was_a_range = 0;
5e69f11e
RM
859 unsigned i = 0;
860
fa9a63c5
RM
861 while (i < (1 << BYTEWIDTH))
862 {
863 if (fastmap[i++])
864 {
865 was_a_range = 0;
25fe55af
RS
866 putchar (i - 1);
867 while (i < (1 << BYTEWIDTH) && fastmap[i])
868 {
869 was_a_range = 1;
870 i++;
871 }
fa9a63c5 872 if (was_a_range)
25fe55af
RS
873 {
874 printf ("-");
875 putchar (i - 1);
876 }
877 }
fa9a63c5 878 }
5e69f11e 879 putchar ('\n');
fa9a63c5
RM
880}
881
882
883/* Print a compiled pattern string in human-readable form, starting at
884 the START pointer into it and ending just before the pointer END. */
885
dc4a2ee0
PE
886static void
887print_partial_compiled_pattern (re_char *start, re_char *end)
fa9a63c5
RM
888{
889 int mcnt, mcnt2;
01618498
SM
890 re_char *p = start;
891 re_char *pend = end;
fa9a63c5
RM
892
893 if (start == NULL)
894 {
a1a052df 895 fprintf (stderr, "(null)\n");
fa9a63c5
RM
896 return;
897 }
5e69f11e 898
fa9a63c5
RM
899 /* Loop over pattern commands. */
900 while (p < pend)
901 {
dc4a2ee0 902 fprintf (stderr, "%td:\t", p - start);
fa9a63c5
RM
903
904 switch ((re_opcode_t) *p++)
905 {
25fe55af 906 case no_op:
a1a052df 907 fprintf (stderr, "/no_op");
25fe55af 908 break;
fa9a63c5 909
99633e97 910 case succeed:
a1a052df 911 fprintf (stderr, "/succeed");
99633e97
SM
912 break;
913
fa9a63c5
RM
914 case exactn:
915 mcnt = *p++;
a1a052df 916 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 917 do
fa9a63c5 918 {
a1a052df 919 fprintf (stderr, "/%c", *p++);
25fe55af
RS
920 }
921 while (--mcnt);
922 break;
fa9a63c5
RM
923
924 case start_memory:
a1a052df 925 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 926 break;
fa9a63c5
RM
927
928 case stop_memory:
a1a052df 929 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 930 break;
fa9a63c5
RM
931
932 case duplicate:
a1a052df 933 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
934 break;
935
936 case anychar:
a1a052df 937 fprintf (stderr, "/anychar");
fa9a63c5
RM
938 break;
939
940 case charset:
25fe55af
RS
941 case charset_not:
942 {
943 register int c, last = -100;
fa9a63c5 944 register int in_range = 0;
99633e97
SM
945 int length = CHARSET_BITMAP_SIZE (p - 1);
946 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 947
a1a052df 948 fprintf (stderr, "/charset [%s",
839966f3 949 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 950
839966f3
KH
951 if (p + *p >= pend)
952 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 953
25fe55af 954 for (c = 0; c < 256; c++)
96cc36cc 955 if (c / 8 < length
fa9a63c5
RM
956 && (p[1 + (c/8)] & (1 << (c % 8))))
957 {
958 /* Are we starting a range? */
959 if (last + 1 == c && ! in_range)
960 {
a1a052df 961 fprintf (stderr, "-");
fa9a63c5
RM
962 in_range = 1;
963 }
964 /* Have we broken a range? */
965 else if (last + 1 != c && in_range)
96cc36cc 966 {
a1a052df 967 fprintf (stderr, "%c", last);
fa9a63c5
RM
968 in_range = 0;
969 }
5e69f11e 970
fa9a63c5 971 if (! in_range)
a1a052df 972 fprintf (stderr, "%c", c);
fa9a63c5
RM
973
974 last = c;
25fe55af 975 }
fa9a63c5
RM
976
977 if (in_range)
a1a052df 978 fprintf (stderr, "%c", last);
fa9a63c5 979
a1a052df 980 fprintf (stderr, "]");
fa9a63c5 981
99633e97 982 p += 1 + length;
96cc36cc 983
96cc36cc 984 if (has_range_table)
99633e97
SM
985 {
986 int count;
a1a052df 987 fprintf (stderr, "has-range-table");
99633e97
SM
988
989 /* ??? Should print the range table; for now, just skip it. */
990 p += 2; /* skip range table bits */
991 EXTRACT_NUMBER_AND_INCR (count, p);
992 p = CHARSET_RANGE_TABLE_END (p, count);
993 }
fa9a63c5
RM
994 }
995 break;
996
997 case begline:
a1a052df 998 fprintf (stderr, "/begline");
25fe55af 999 break;
fa9a63c5
RM
1000
1001 case endline:
a1a052df 1002 fprintf (stderr, "/endline");
25fe55af 1003 break;
fa9a63c5
RM
1004
1005 case on_failure_jump:
dc4a2ee0
PE
1006 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1007 fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
25fe55af 1008 break;
fa9a63c5
RM
1009
1010 case on_failure_keep_string_jump:
dc4a2ee0
PE
1011 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1012 fprintf (stderr, "/on_failure_keep_string_jump to %td",
1013 p + mcnt - start);
25fe55af 1014 break;
fa9a63c5 1015
0683b6fa 1016 case on_failure_jump_nastyloop:
dc4a2ee0
PE
1017 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1018 fprintf (stderr, "/on_failure_jump_nastyloop to %td",
1019 p + mcnt - start);
0683b6fa
SM
1020 break;
1021
505bde11 1022 case on_failure_jump_loop:
dc4a2ee0
PE
1023 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1024 fprintf (stderr, "/on_failure_jump_loop to %td",
1025 p + mcnt - start);
5e69f11e
RM
1026 break;
1027
505bde11 1028 case on_failure_jump_smart:
dc4a2ee0
PE
1029 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1030 fprintf (stderr, "/on_failure_jump_smart to %td",
1031 p + mcnt - start);
5e69f11e
RM
1032 break;
1033
25fe55af 1034 case jump:
dc4a2ee0
PE
1035 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1036 fprintf (stderr, "/jump to %td", p + mcnt - start);
fa9a63c5
RM
1037 break;
1038
25fe55af 1039 case succeed_n:
dc4a2ee0
PE
1040 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1041 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1042 fprintf (stderr, "/succeed_n to %td, %d times",
1043 p - 2 + mcnt - start, mcnt2);
25fe55af 1044 break;
5e69f11e 1045
25fe55af 1046 case jump_n:
dc4a2ee0
PE
1047 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1048 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1049 fprintf (stderr, "/jump_n to %td, %d times",
1050 p - 2 + mcnt - start, mcnt2);
25fe55af 1051 break;
5e69f11e 1052
25fe55af 1053 case set_number_at:
dc4a2ee0
PE
1054 EXTRACT_NUMBER_AND_INCR (mcnt, p);
1055 EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1056 fprintf (stderr, "/set_number_at location %td to %d",
1057 p - 2 + mcnt - start, mcnt2);
25fe55af 1058 break;
5e69f11e 1059
25fe55af 1060 case wordbound:
a1a052df 1061 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1062 break;
1063
1064 case notwordbound:
a1a052df 1065 fprintf (stderr, "/notwordbound");
25fe55af 1066 break;
fa9a63c5
RM
1067
1068 case wordbeg:
a1a052df 1069 fprintf (stderr, "/wordbeg");
fa9a63c5 1070 break;
5e69f11e 1071
fa9a63c5 1072 case wordend:
a1a052df 1073 fprintf (stderr, "/wordend");
e2543b02 1074 break;
5e69f11e 1075
669fa600 1076 case symbeg:
e2543b02 1077 fprintf (stderr, "/symbeg");
669fa600
SM
1078 break;
1079
1080 case symend:
e2543b02 1081 fprintf (stderr, "/symend");
669fa600 1082 break;
5e69f11e 1083
1fb352e0 1084 case syntaxspec:
a1a052df 1085 fprintf (stderr, "/syntaxspec");
1fb352e0 1086 mcnt = *p++;
a1a052df 1087 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1088 break;
1089
1090 case notsyntaxspec:
a1a052df 1091 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1092 mcnt = *p++;
a1a052df 1093 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1094 break;
1095
0b32bf0e 1096# ifdef emacs
fa9a63c5 1097 case before_dot:
a1a052df 1098 fprintf (stderr, "/before_dot");
25fe55af 1099 break;
fa9a63c5
RM
1100
1101 case at_dot:
a1a052df 1102 fprintf (stderr, "/at_dot");
25fe55af 1103 break;
fa9a63c5
RM
1104
1105 case after_dot:
a1a052df 1106 fprintf (stderr, "/after_dot");
25fe55af 1107 break;
fa9a63c5 1108
1fb352e0 1109 case categoryspec:
a1a052df 1110 fprintf (stderr, "/categoryspec");
fa9a63c5 1111 mcnt = *p++;
a1a052df 1112 fprintf (stderr, "/%d", mcnt);
25fe55af 1113 break;
5e69f11e 1114
1fb352e0 1115 case notcategoryspec:
a1a052df 1116 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1117 mcnt = *p++;
a1a052df 1118 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1119 break;
0b32bf0e 1120# endif /* emacs */
fa9a63c5 1121
fa9a63c5 1122 case begbuf:
a1a052df 1123 fprintf (stderr, "/begbuf");
25fe55af 1124 break;
fa9a63c5
RM
1125
1126 case endbuf:
a1a052df 1127 fprintf (stderr, "/endbuf");
25fe55af 1128 break;
fa9a63c5 1129
25fe55af 1130 default:
a1a052df 1131 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1132 }
1133
a1a052df 1134 fprintf (stderr, "\n");
fa9a63c5
RM
1135 }
1136
dc4a2ee0 1137 fprintf (stderr, "%td:\tend of pattern.\n", p - start);
fa9a63c5
RM
1138}
1139
1140
dc4a2ee0
PE
1141static void
1142print_compiled_pattern (struct re_pattern_buffer *bufp)
fa9a63c5 1143{
01618498 1144 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1145
1146 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1147 printf ("%ld bytes used/%ld bytes allocated.\n",
1148 bufp->used, bufp->allocated);
fa9a63c5
RM
1149
1150 if (bufp->fastmap_accurate && bufp->fastmap)
1151 {
1152 printf ("fastmap: ");
1153 print_fastmap (bufp->fastmap);
1154 }
1155
dc4a2ee0 1156 printf ("re_nsub: %zu\t", bufp->re_nsub);
fa9a63c5
RM
1157 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1158 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1159 printf ("no_sub: %d\t", bufp->no_sub);
1160 printf ("not_bol: %d\t", bufp->not_bol);
1161 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1162 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1163 fflush (stdout);
fa9a63c5
RM
1164 /* Perhaps we should print the translate table? */
1165}
1166
1167
dc4a2ee0
PE
1168static void
1169print_double_string (re_char *where, re_char *string1, ssize_t size1,
1170 re_char *string2, ssize_t size2)
fa9a63c5 1171{
d1dfb56c 1172 ssize_t this_char;
5e69f11e 1173
fa9a63c5
RM
1174 if (where == NULL)
1175 printf ("(null)");
1176 else
1177 {
1178 if (FIRST_STRING_P (where))
25fe55af
RS
1179 {
1180 for (this_char = where - string1; this_char < size1; this_char++)
1181 putchar (string1[this_char]);
fa9a63c5 1182
25fe55af
RS
1183 where = string2;
1184 }
fa9a63c5
RM
1185
1186 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1187 putchar (string2[this_char]);
fa9a63c5
RM
1188 }
1189}
1190
1191#else /* not DEBUG */
1192
0b32bf0e
SM
1193# undef assert
1194# define assert(e)
fa9a63c5 1195
0b32bf0e 1196# define DEBUG_STATEMENT(e)
dc4a2ee0
PE
1197# if __STDC_VERSION__ < 199901L
1198# define DEBUG_COMPILES_ARGUMENTS
1199# define DEBUG_PRINT /* 'DEBUG_PRINT (x, y)' discards X and Y. */ (void)
1200# else
1201# define DEBUG_PRINT(...)
1202# endif
0b32bf0e
SM
1203# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1204# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1205
1206#endif /* not DEBUG */
1207\f
4da60324
PE
1208/* Use this to suppress gcc's `...may be used before initialized' warnings. */
1209#ifdef lint
1210# define IF_LINT(Code) Code
1211#else
1212# define IF_LINT(Code) /* empty */
1213#endif
1214\f
fa9a63c5
RM
1215/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1216 also be assigned to arbitrarily: each pattern buffer stores its own
1217 syntax, so it can be changed between regex compilations. */
1218/* This has no initializer because initialized variables in Emacs
1219 become read-only after dumping. */
1220reg_syntax_t re_syntax_options;
1221
1222
1223/* Specify the precise syntax of regexps for compilation. This provides
1224 for compatibility for various utilities which historically have
1225 different, incompatible syntaxes.
1226
1227 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1228 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1229
1230reg_syntax_t
971de7fb 1231re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1232{
1233 reg_syntax_t ret = re_syntax_options;
5e69f11e 1234
fa9a63c5
RM
1235 re_syntax_options = syntax;
1236 return ret;
1237}
c0f9ea08 1238WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1239
1240/* Regexp to use to replace spaces, or NULL meaning don't. */
f462f075 1241static const_re_char *whitespace_regexp;
f9b0fd99
RS
1242
1243void
971de7fb 1244re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1245{
f462f075 1246 whitespace_regexp = (const_re_char *) regexp;
f9b0fd99
RS
1247}
1248WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1249\f
1250/* This table gives an error message for each of the error codes listed
4bb91c68 1251 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1252 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1253 but why not be nice? */
fa9a63c5
RM
1254
1255static const char *re_error_msgid[] =
5e69f11e
RM
1256 {
1257 gettext_noop ("Success"), /* REG_NOERROR */
1258 gettext_noop ("No match"), /* REG_NOMATCH */
1259 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1260 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1261 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1262 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1263 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1264 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1265 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1266 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1267 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1268 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1269 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1270 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1271 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1272 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1273 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1274 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1275 };
1276\f
4bb91c68 1277/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1278
1279/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1280 searching and matching functions should not call alloca. On some
1281 systems, alloca is implemented in terms of malloc, and if we're
1282 using the relocating allocator routines, then malloc could cause a
1283 relocation, which might (if the strings being searched are in the
1284 ralloc heap) shift the data out from underneath the regexp
1285 routines.
1286
5e69f11e 1287 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1288 processes input from X in a signal handler; processing X input may
1289 call malloc; if input arrives while a matching routine is calling
1290 malloc, then we're scrod. But Emacs can't just block input while
1291 calling matching routines; then we don't notice interrupts when
1292 they come in. So, Emacs blocks input around all regexp calls
1293 except the matching calls, which it leaves unprotected, in the
1294 faith that they will not malloc. */
1295
1296/* Normally, this is fine. */
1297#define MATCH_MAY_ALLOCATE
1298
fa9a63c5
RM
1299/* The match routines may not allocate if (1) they would do it with malloc
1300 and (2) it's not safe for them to use malloc.
1301 Note that if REL_ALLOC is defined, matching would not use malloc for the
1302 failure stack, but we would still use it for the register vectors;
4bb91c68 1303 so REL_ALLOC should not affect this. */
b588157e 1304#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1305# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1306#endif
1307
1308\f
1309/* Failure stack declarations and macros; both re_compile_fastmap and
1310 re_match_2 use a failure stack. These have to be macros because of
1311 REGEX_ALLOCATE_STACK. */
5e69f11e 1312
fa9a63c5 1313
320a2a73 1314/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1315 when matching. If this number is exceeded, we allocate more
1316 space, so it is not a hard limit. */
1317#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1318# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1319#endif
1320
1321/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1322 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1323 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1324 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1325 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1326# if defined MATCH_MAY_ALLOCATE
1327/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1328 whose default stack limit is 2mb. In order for a larger
1329 value to work reliably, you have to try to make it accord
1330 with the process stack limit. */
c0f9ea08
SM
1331size_t re_max_failures = 40000;
1332# else
1333size_t re_max_failures = 4000;
1334# endif
fa9a63c5
RM
1335
1336union fail_stack_elt
1337{
01618498 1338 re_char *pointer;
c0f9ea08
SM
1339 /* This should be the biggest `int' that's no bigger than a pointer. */
1340 long integer;
fa9a63c5
RM
1341};
1342
1343typedef union fail_stack_elt fail_stack_elt_t;
1344
1345typedef struct
1346{
1347 fail_stack_elt_t *stack;
c0f9ea08
SM
1348 size_t size;
1349 size_t avail; /* Offset of next open position. */
1350 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1351} fail_stack_type;
1352
505bde11 1353#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1354
1355
1356/* Define macros to initialize and free the failure stack.
1357 Do `return -2' if the alloc fails. */
1358
1359#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1360# define INIT_FAIL_STACK() \
fa9a63c5 1361 do { \
38182d90 1362 fail_stack.stack = \
320a2a73
KH
1363 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1364 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1365 \
1366 if (fail_stack.stack == NULL) \
1367 return -2; \
1368 \
1369 fail_stack.size = INIT_FAILURE_ALLOC; \
1370 fail_stack.avail = 0; \
505bde11 1371 fail_stack.frame = 0; \
fa9a63c5 1372 } while (0)
fa9a63c5 1373#else
0b32bf0e 1374# define INIT_FAIL_STACK() \
fa9a63c5
RM
1375 do { \
1376 fail_stack.avail = 0; \
505bde11 1377 fail_stack.frame = 0; \
fa9a63c5
RM
1378 } while (0)
1379
b313f9d8
PE
1380# define RETALLOC_IF(addr, n, t) \
1381 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
fa9a63c5
RM
1382#endif
1383
1384
320a2a73
KH
/* Grow FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a
   limit which allows approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */
fa9a63c5 1392
320a2a73
KH
/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
1398#define FAIL_STACK_GROWTH_FACTOR 4
1399
1400#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1401 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1402 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1403 ? 0 \
320a2a73 1404 : ((fail_stack).stack \
38182d90 1405 = REGEX_REALLOCATE_STACK ((fail_stack).stack, \
25fe55af 1406 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1407 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1408 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1409 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1410 \
1411 (fail_stack).stack == NULL \
1412 ? 0 \
6453db45
KH
1413 : ((fail_stack).size \
1414 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1415 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1416 * FAIL_STACK_GROWTH_FACTOR)) \
1417 / sizeof (fail_stack_elt_t)), \
25fe55af 1418 1)))
fa9a63c5
RM
1419
1420
fa9a63c5
RM
1421/* Push a pointer value onto the failure stack.
1422 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1423 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1424#define PUSH_FAILURE_POINTER(item) \
01618498 1425 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1426
1427/* This pushes an integer-valued item onto the failure stack.
1428 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1429 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1430#define PUSH_FAILURE_INT(item) \
1431 fail_stack.stack[fail_stack.avail++].integer = (item)
1432
b313f9d8 1433/* These POP... operations complement the PUSH... operations.
fa9a63c5
RM
1434 All assume that `fail_stack' is nonempty. */
1435#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1436#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
fa9a63c5 1437
505bde11
SM
1438/* Individual items aside from the registers. */
1439#define NUM_NONREG_ITEMS 3
1440
1441/* Used to examine the stack (to detect infinite loops). */
1442#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1443#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1444#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1445#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1446
1447
505bde11
SM
/* Grow the failure stack until more than SPACE slots are free.
   Does `return -2' from the enclosing function if GROW_FAIL_STACK
   fails.  Assumes the variable `fail_stack'.

   SPACE is parenthesized so that an expression argument compares as a
   whole.  The sizes printed are size_t values, hence `%zu'.  */
#define ENSURE_FAIL_STACK(space)                                        \
while (REMAINING_AVAIL_SLOTS <= (space)) {                              \
  if (!GROW_FAIL_STACK (fail_stack))                                    \
    return -2;                                                          \
  DEBUG_PRINT ("\n Doubled stack; size now: %zu\n", (fail_stack).size);\
  DEBUG_PRINT (" slots available: %zu\n", REMAINING_AVAIL_SLOTS);\
}
1455
1456/* Push register NUM onto the stack. */
1457#define PUSH_FAILURE_REG(num) \
1458do { \
1459 char *destination; \
dc4a2ee0 1460 long n = num; \
505bde11 1461 ENSURE_FAIL_STACK(3); \
dc4a2ee0
PE
1462 DEBUG_PRINT (" Push reg %ld (spanning %p -> %p)\n", \
1463 n, regstart[n], regend[n]); \
1464 PUSH_FAILURE_POINTER (regstart[n]); \
1465 PUSH_FAILURE_POINTER (regend[n]); \
1466 PUSH_FAILURE_INT (n); \
505bde11
SM
1467} while (0)
1468
01618498
SM
1469/* Change the counter's value to VAL, but make sure that it will
1470 be reset when backtracking. */
1471#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1472do { \
1473 char *destination; \
1474 int c; \
1475 ENSURE_FAIL_STACK(3); \
1476 EXTRACT_NUMBER (c, ptr); \
dc4a2ee0 1477 DEBUG_PRINT (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1478 PUSH_FAILURE_INT (c); \
1479 PUSH_FAILURE_POINTER (ptr); \
1480 PUSH_FAILURE_INT (-1); \
01618498 1481 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1482} while (0)
1483
505bde11 1484/* Pop a saved register off the stack. */
dc1e502d 1485#define POP_FAILURE_REG_OR_COUNT() \
505bde11 1486do { \
d1dfb56c 1487 long pfreg = POP_FAILURE_INT (); \
19ed5445 1488 if (pfreg == -1) \
dc1e502d
SM
1489 { \
1490 /* It's a counter. */ \
6dcf2d0e
SM
1491 /* Here, we discard `const', making re_match non-reentrant. */ \
1492 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
19ed5445
PE
1493 pfreg = POP_FAILURE_INT (); \
1494 STORE_NUMBER (ptr, pfreg); \
dc4a2ee0 1495 DEBUG_PRINT (" Pop counter %p = %ld\n", ptr, pfreg); \
dc1e502d
SM
1496 } \
1497 else \
1498 { \
19ed5445
PE
1499 regend[pfreg] = POP_FAILURE_POINTER (); \
1500 regstart[pfreg] = POP_FAILURE_POINTER (); \
dc4a2ee0
PE
1501 DEBUG_PRINT (" Pop reg %ld (spanning %p -> %p)\n", \
1502 pfreg, regstart[pfreg], regend[pfreg]); \
dc1e502d 1503 } \
505bde11
SM
1504} while (0)
1505
1506/* Check that we are not stuck in an infinite loop. */
1507#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1508do { \
d1dfb56c 1509 ssize_t failure = TOP_FAILURE_HANDLE (); \
505bde11 1510 /* Check for infinite matching loops */ \
f6df485f
RS
1511 while (failure > 0 \
1512 && (FAILURE_STR (failure) == string_place \
1513 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1514 { \
1515 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1516 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1517 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1518 { \
6df42991
SM
1519 cycle = 1; \
1520 break; \
f6df485f 1521 } \
dc4a2ee0 1522 DEBUG_PRINT (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1523 failure = NEXT_FAILURE_HANDLE(failure); \
1524 } \
dc4a2ee0 1525 DEBUG_PRINT (" Other string: %p\n", FAILURE_STR (failure)); \
505bde11 1526} while (0)
6df42991 1527
/* Push the information about the state we will need
   if we ever fail back to it.

   A frame consists of NUM_NONREG_ITEMS entries pushed in this order:
   the previous frame index, STRING_PLACE, and PATTERN.  Afterwards
   `fail_stack.frame' is set to the new top of the stack.

   Requires the variable `fail_stack'; when debugging also uses
   `bufp', `pend', `string1', `size1', `string2', `size2' and
   `nfailure_points_pushed'.  `destination' is declared here because
   GROW_FAIL_STACK requires it.

   Does `return -2' (via ENSURE_FAIL_STACK) if the stack cannot grow.  */

#define PUSH_FAILURE_POINT(pattern, string_place)                       \
do {                                                                    \
  char *destination;                                                    \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_pushed++);                           \
  DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n");                              \
  /* avail, size and frame are size_t, hence %zu.  */                   \
  DEBUG_PRINT (" Before push, next avail: %zu\n", (fail_stack).avail);  \
  DEBUG_PRINT (" size: %zu\n", (fail_stack).size);\
                                                                        \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);                                 \
                                                                        \
  DEBUG_PRINT ("\n");                                                   \
                                                                        \
  DEBUG_PRINT (" Push frame index: %zu\n", fail_stack.frame);           \
  PUSH_FAILURE_INT (fail_stack.frame);                                  \
                                                                        \
  DEBUG_PRINT (" Push string %p: `", string_place);                     \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT ("'\n");                                                  \
  PUSH_FAILURE_POINTER (string_place);                                  \
                                                                        \
  DEBUG_PRINT (" Push pattern %p: ", pattern);                          \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);                   \
  PUSH_FAILURE_POINTER (pattern);                                       \
                                                                        \
  /* Close the frame by moving the frame pointer past it.  */           \
  fail_stack.frame = fail_stack.avail;                                  \
} while (0)
fa9a63c5 1567
320a2a73
KH
1568/* Estimate the size of data pushed by a typical failure stack entry.
1569 An estimate is all we need, because all we use this for
1570 is to choose a limit for how big to make the failure stack. */
ada30c0e 1571/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1572#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1573
fa9a63c5
RM
1574/* How many items can still be added to the stack without overflowing it. */
1575#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1576
1577
/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, both of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.

   Any registers or counters pushed above the frame are restored first
   by POP_FAILURE_REG_OR_COUNT.

   Also assumes the variable `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */

#define POP_FAILURE_POINT(str, pat)                                     \
do {                                                                    \
  assert (!FAIL_STACK_EMPTY ());                                        \
                                                                        \
  /* Remove failure points and point to how many regs pushed.  */      \
  DEBUG_PRINT ("POP_FAILURE_POINT:\n");                                 \
  /* avail, size and frame are size_t, hence %zu.  */                   \
  DEBUG_PRINT (" Before pop, next avail: %zu\n", fail_stack.avail);     \
  DEBUG_PRINT (" size: %zu\n", fail_stack.size);                        \
                                                                        \
  /* Pop the saved registers.  */                                       \
  while (fail_stack.frame < fail_stack.avail)                           \
    POP_FAILURE_REG_OR_COUNT ();                                        \
                                                                        \
  pat = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT (" Popping pattern %p: ", pat);                           \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
                                                                        \
  /* If the saved string location is NULL, it came from an              \
     on_failure_keep_string_jump opcode, and we want to throw away the  \
     saved NULL, thus retaining our current position in the string.  */ \
  str = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT (" Popping string %p: `", str);                           \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
  DEBUG_PRINT ("'\n");                                                  \
                                                                        \
  fail_stack.frame = POP_FAILURE_INT ();                                \
  DEBUG_PRINT (" Popping frame index: %zu\n", fail_stack.frame);        \
                                                                        \
  /* `avail' is unsigned, so only the frame/avail ordering is worth     \
     checking here.  */                                                 \
  assert (fail_stack.frame <= fail_stack.avail);                        \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_popped++);                           \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1621
1622
1623\f
fa9a63c5 1624/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1625#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1626\f
1627/* Subroutine declarations and macros for regex_compile. */
1628
261cb4bb
PE
1629static reg_errcode_t regex_compile (re_char *pattern, size_t size,
1630 reg_syntax_t syntax,
1631 struct re_pattern_buffer *bufp);
1632static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1633static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1634static void insert_op1 (re_opcode_t op, unsigned char *loc,
1635 int arg, unsigned char *end);
1636static void insert_op2 (re_opcode_t op, unsigned char *loc,
1637 int arg1, int arg2, unsigned char *end);
1638static boolean at_begline_loc_p (re_char *pattern, re_char *p,
1639 reg_syntax_t syntax);
1640static boolean at_endline_loc_p (re_char *p, re_char *pend,
1641 reg_syntax_t syntax);
1642static re_char *skip_one_char (re_char *p);
1643static int analyse_first (re_char *p, re_char *pend,
1644 char *fastmap, const int multibyte);
fa9a63c5 1645
fa9a63c5 1646/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1647 translation. */
36595814 1648#define PATFETCH(c) \
2d1675e4
SM
1649 do { \
1650 int len; \
1651 if (p == pend) return REG_EEND; \
62a6e103 1652 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
2d1675e4 1653 p += len; \
fa9a63c5
RM
1654 } while (0)
1655
fa9a63c5
RM
1656
1657/* If `translate' is non-null, return translate[D], else just D. We
1658 cast the subscript to translate because some data is declared as
1659 `char *', to avoid warnings when a string constant is passed. But
1660 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1661#ifndef TRANSLATE
0b32bf0e 1662# define TRANSLATE(d) \
66f0296e 1663 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1664#endif
fa9a63c5
RM
1665
1666
1667/* Macros for outputting the compiled pattern into `buffer'. */
1668
1669/* If the buffer isn't allocated when it comes in, use this. */
1670#define INIT_BUF_SIZE 32
1671
4bb91c68 1672/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1673#define GET_BUFFER_SPACE(n) \
01618498 1674 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1675 EXTEND_BUFFER ()
1676
1677/* Make sure we have one more byte of buffer space and then add C to it. */
1678#define BUF_PUSH(c) \
1679 do { \
1680 GET_BUFFER_SPACE (1); \
1681 *b++ = (unsigned char) (c); \
1682 } while (0)
1683
1684
1685/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1686#define BUF_PUSH_2(c1, c2) \
1687 do { \
1688 GET_BUFFER_SPACE (2); \
1689 *b++ = (unsigned char) (c1); \
1690 *b++ = (unsigned char) (c2); \
1691 } while (0)
1692
1693
fa9a63c5 1694/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1695 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1696#define STORE_JUMP(op, loc, to) \
1697 store_op1 (op, loc, (to) - (loc) - 3)
1698
1699/* Likewise, for a two-argument jump. */
1700#define STORE_JUMP2(op, loc, to, arg) \
1701 store_op2 (op, loc, (to) - (loc) - 3, arg)
1702
4bb91c68 1703/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1704#define INSERT_JUMP(op, loc, to) \
1705 insert_op1 (op, loc, (to) - (loc) - 3, b)
1706
1707/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1708#define INSERT_JUMP2(op, loc, to, arg) \
1709 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1710
1711
1712/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1713 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1714 be too small, many things would have to change. */
839966f3
KH
1715# define MAX_BUF_SIZE (1L << 15)
1716
fa9a63c5
RM
/* Double the size of the buffer via realloc and
   reset the pointers that pointed into the old block to point to the
   correct places in the new one.  If doubling would make the buffer
   larger than MAX_BUF_SIZE, it is clamped to MAX_BUF_SIZE; if it is
   already at MAX_BUF_SIZE, return REG_ESIZE.  */
1721#if __BOUNDED_POINTERS__
1722# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
381880b0
CY
1723# define MOVE_BUFFER_POINTER(P) \
1724 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1725 SET_HIGH_BOUND (P), \
1726 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
4bb91c68
SM
1727# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1728 else \
1729 { \
1730 SET_HIGH_BOUND (b); \
1731 SET_HIGH_BOUND (begalt); \
1732 if (fixup_alt_jump) \
1733 SET_HIGH_BOUND (fixup_alt_jump); \
1734 if (laststart) \
1735 SET_HIGH_BOUND (laststart); \
1736 if (pending_exact) \
1737 SET_HIGH_BOUND (pending_exact); \
1738 }
1739#else
381880b0 1740# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
4bb91c68
SM
1741# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1742#endif
/* Double the size of the compiled-pattern buffer (capped at
   MAX_BUF_SIZE) and relocate every pointer into it.

   Returns REG_ESIZE from the enclosing function when the buffer is
   already MAX_BUF_SIZE bytes, and REG_ESPACE when realloc fails.
   The new block is taken into a temporary first, so that on failure
   bufp->buffer and bufp->allocated still describe the old, valid
   block instead of leaking it and leaving a null buffer behind.

   NOTE(review): both error paths use a bare `return', so they do not
   go through FREE_STACK_RETURN; confirm whether the caller's compile
   stack needs releasing on these paths.  */
#define EXTEND_BUFFER()							\
  do {									\
    unsigned char *old_buffer = bufp->buffer;				\
    unsigned char *new_buffer;						\
    size_t new_allocated;						\
    if (bufp->allocated == MAX_BUF_SIZE)				\
      return REG_ESIZE;							\
    new_allocated = bufp->allocated;					\
    new_allocated <<= 1;						\
    if (new_allocated > MAX_BUF_SIZE)					\
      new_allocated = MAX_BUF_SIZE;					\
    new_buffer = realloc (old_buffer, new_allocated);			\
    if (new_buffer == NULL)						\
      return REG_ESPACE;						\
    bufp->buffer = new_buffer;						\
    bufp->allocated = new_allocated;					\
    /* If the buffer moved, move all the pointers into it.  */		\
    if (old_buffer != new_buffer)					\
      {									\
	MOVE_BUFFER_POINTER (b);					\
	MOVE_BUFFER_POINTER (begalt);					\
	if (fixup_alt_jump)						\
	  MOVE_BUFFER_POINTER (fixup_alt_jump);				\
	if (laststart)							\
	  MOVE_BUFFER_POINTER (laststart);				\
	if (pending_exact)						\
	  MOVE_BUFFER_POINTER (pending_exact);				\
      }									\
    ELSE_EXTEND_BUFFER_HIGH_BOUND					\
  } while (0)
1769
1770
1771/* Since we have one byte reserved for the register number argument to
1772 {start,stop}_memory, the maximum number of groups we can report
1773 things about is what fits in that byte. */
1774#define MAX_REGNUM 255
1775
1776/* But patterns can have more than `MAX_REGNUM' registers. We just
1777 ignore the excess. */
098d42af 1778typedef int regnum_t;
fa9a63c5
RM
1779
1780
1781/* Macros for the compile stack. */
1782
1783/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1784 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1785/* int may be not enough when sizeof(int) == 2. */
1786typedef long pattern_offset_t;
fa9a63c5
RM
1787
1788typedef struct
1789{
1790 pattern_offset_t begalt_offset;
1791 pattern_offset_t fixup_alt_jump;
5e69f11e 1792 pattern_offset_t laststart_offset;
fa9a63c5
RM
1793 regnum_t regnum;
1794} compile_stack_elt_t;
1795
1796
1797typedef struct
1798{
1799 compile_stack_elt_t *stack;
d1dfb56c
EZ
1800 size_t size;
1801 size_t avail; /* Offset of next open position. */
fa9a63c5
RM
1802} compile_stack_type;
1803
1804
1805#define INIT_COMPILE_STACK_SIZE 32
1806
1807#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1808#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1809
4bb91c68 1810/* The next available element. */
fa9a63c5
RM
1811#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1812
0caaedb1
PE
1813/* Explicit quit checking is needed for Emacs, which uses polling to
1814 process input events. */
1815#ifdef emacs
77d11aec
RS
1816# define IMMEDIATE_QUIT_CHECK \
1817 do { \
1818 if (immediate_quit) QUIT; \
1819 } while (0)
1820#else
1821# define IMMEDIATE_QUIT_CHECK ((void)0)
1822#endif
1823\f
b18215fc
RS
1824/* Structure to manage work area for range table. */
1825struct range_table_work_area
1826{
1827 int *table; /* actual work area. */
1828 int allocated; /* allocated size for work area in bytes. */
7814e705 1829 int used; /* actually used size in words. */
96cc36cc 1830 int bits; /* flag to record character classes */
b18215fc
RS
1831};
1832
77d11aec
RS
/* Make sure that WORK_AREA can hold N more table entries (ints).
   WORK_AREA is a `struct range_table_work_area' lvalue (not a
   pointer); its address is passed to extend_range_table_work_area.
   The table is grown repeatedly until the request fits.
   If it can't get the space, it returns REG_ESPACE from the
   surrounding function.  */

#define EXTEND_RANGE_TABLE(work_area, n)				\
  do {									\
    while (((work_area).used + (n)) * sizeof (int)			\
	   > (work_area).allocated)					\
      {									\
	extend_range_table_work_area (&(work_area));			\
	if ((work_area).table == 0)					\
	  return (REG_ESPACE);						\
      }									\
  } while (0)
1847
96cc36cc
RS
/* OR BIT into the `bits' flag word of WORK_AREA.  The expansion is
   fully parenthesized so it can be used safely inside a larger
   expression.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)	\
  ((work_area).bits |= (bit))
1850
14473664
SM
1851/* Bits used to implement the multibyte-part of the various character classes
1852 such as [:alnum:] in a charset's range table. */
1853#define BIT_WORD 0x1
1854#define BIT_LOWER 0x2
1855#define BIT_PUNCT 0x4
1856#define BIT_SPACE 0x8
1857#define BIT_UPPER 0x10
1858#define BIT_MULTIBYTE 0x20
96cc36cc 1859
b18215fc
RS
1860/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1861#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
77d11aec 1862 do { \
8f924df7 1863 EXTEND_RANGE_TABLE ((work_area), 2); \
b18215fc
RS
1864 (work_area).table[(work_area).used++] = (range_start); \
1865 (work_area).table[(work_area).used++] = (range_end); \
1866 } while (0)
1867
/* Free allocated memory for WORK_AREA.  Freeing a null pointer is a
   no-op, so a work area whose table was never allocated needs no
   special case.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)	\
  do {						\
    free ((work_area).table);			\
  } while (0)
1874
96cc36cc 1875#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1876#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1877#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1878#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1879\f
b18215fc 1880
fa9a63c5 1881/* Set the bit for character C in a list. */
01618498 1882#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1883
1884
bf216479
KH
1885#ifdef emacs
1886
cf9c99bc
KH
1887/* Store characters in the range FROM to TO in the bitmap at B (for
1888 ASCII and unibyte characters) and WORK_AREA (for multibyte
1889 characters) while translating them and paying attention to the
1890 continuity of translated characters.
8f924df7 1891
cf9c99bc
KH
1892 Implementation note: It is better to implement these fairly big
1893 macros by a function, but it's not that easy because macros called
8f924df7 1894 in this macro assume various local variables already declared. */
bf216479 1895
cf9c99bc
KH
1896/* Both FROM and TO are ASCII characters.
   For each character C0 in FROM..TO, translate it and set the bit of
   the result in the bitmap at B (via SET_LIST_BIT).  When the
   translation is not an ASCII character it is also recorded in
   WORK_AREA as a one-character range, and the bitmap bit used is that
   of RE_CHAR_TO_UNIBYTE of the translation, or of C0 itself when that
   conversion yields a negative value.  May return REG_ESPACE from the
   surrounding function through SET_RANGE_TABLE_WORK_AREA.  */
1897
1898#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1899 do { \
1900 int C0, C1; \
1901 \
1902 for (C0 = (FROM); C0 <= (TO); C0++) \
1903 { \
1904 C1 = TRANSLATE (C0); \
1905 if (! ASCII_CHAR_P (C1)) \
1906 { \
1907 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1908 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
1909 C1 = C0; \
1910 } \
1911 SET_LIST_BIT (C1); \
1912 } \
1913 } while (0)
1914
1915
1916/* Both FROM and TO are unibyte characters (0x80..0xFF).
   For each C0 in FROM..TO, let C1 be RE_CHAR_TO_MULTIBYTE (C0).
   If C1 satisfies CHAR_BYTE8_P, only the bitmap bit of C0 is set.
   Otherwise C2 = TRANSLATE (C1) and:
     - the bitmap bit set is that of RE_CHAR_TO_UNIBYTE (C2), or of C0
       when the translation is the identity or that conversion yields
       a negative value;
     - C2 is added to WORK_AREA: the ranges appended since this macro
       started (index >= USED) are scanned from the newest; the first
       one that contains C2 or is adjacent to it is kept or widened by
       one in place, and if none qualifies a new range C2..C2 is
       appended.  */
1917
1918#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
1919 do { \
1920 int C0, C1, C2, I; \
1921 int USED = RANGE_TABLE_WORK_USED (work_area); \
1922 \
1923 for (C0 = (FROM); C0 <= (TO); C0++) \
1924 { \
1925 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
1926 if (CHAR_BYTE8_P (C1)) \
1927 SET_LIST_BIT (C0); \
1928 else \
1929 { \
1930 C2 = TRANSLATE (C1); \
1931 if (C2 == C1 \
1932 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
1933 C1 = C0; \
1934 SET_LIST_BIT (C1); \
1935 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1936 { \
1937 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1938 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1939 \
1940 if (C2 >= from - 1 && C2 <= to + 1) \
1941 { \
1942 if (C2 == from - 1) \
1943 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1944 else if (C2 == to + 1) \
1945 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1946 break; \
1947 } \
1948 } \
1949 if (I < USED) \
1950 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
1951 } \
1952 } \
1953 } while (0)
1954
1955
78edd3b7 1956/* Both FROM and TO are multibyte characters.
   The range FROM..TO itself is appended to WORK_AREA first.  Then, for
   each C0 in FROM..TO with C1 = TRANSLATE (C0):
     - if RE_CHAR_TO_UNIBYTE (C1) is non-negative, or C1 differs from C0
       and RE_CHAR_TO_UNIBYTE (C0) is non-negative, the bit of that
       unibyte value is set in the bitmap at B;
     - if C1 already lies inside FROM..TO nothing more is needed;
     - otherwise the ranges appended since this macro started
       (index >= USED, which includes FROM..TO) are scanned from the
       newest; the first one that contains C1 or is adjacent to it is
       kept or widened by one in place, and if none qualifies a new
       range C1..C1 is appended.  */
cf9c99bc
KH
1957
1958#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
1959 do { \
1960 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
1961 \
1962 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
1963 for (C0 = (FROM); C0 <= (TO); C0++) \
1964 { \
1965 C1 = TRANSLATE (C0); \
1966 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
1967 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
1968 SET_LIST_BIT (C2); \
1969 if (C1 >= (FROM) && C1 <= (TO)) \
1970 continue; \
1971 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1972 { \
1973 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1974 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1975 \
1976 if (C1 >= from - 1 && C1 <= to + 1) \
1977 { \
1978 if (C1 == from - 1) \
1979 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1980 else if (C1 == to + 1) \
1981 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1982 break; \
1983 } \
1984 } \
1985 if (I < USED) \
1986 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1987 } \
bf216479
KH
1988 } while (0)
1989
1990#endif /* emacs */
1991
/* Read a run of decimal digits from the uncompiled pattern into NUM.

   The first non-digit fetched is left in `c'.  NUM is only touched
   once a digit has been seen; a negative NUM is then reset to 0
   before accumulating.  Returns (through FREE_STACK_RETURN) with
   REG_EBRACE if the pattern ends before a non-digit is found, and
   with REG_BADBR if dividing the new value by 10 does not give back
   the previous one.

   NOTE(review): that check runs after the multiplication; if NUM has
   a signed type the overflow itself is undefined behavior in C, so
   the test is not guaranteed to catch it.  */
#define GET_UNSIGNED_NUMBER(num)					\
  do {									\
    if (p == pend)							\
      FREE_STACK_RETURN (REG_EBRACE);					\
    PATFETCH (c);							\
    while (c >= '0' && c <= '9')					\
      {									\
	int before;							\
	if (num < 0)							\
	  num = 0;							\
	before = num;							\
	num = num * 10 + c - '0';					\
	if (before != num / 10)						\
	  FREE_STACK_RETURN (REG_BADBR);				\
	if (p == pend)							\
	  FREE_STACK_RETURN (REG_EBRACE);				\
	PATFETCH (c);							\
      }									\
  } while (0)
77d11aec 2015\f
1fdab503 2016#if ! WIDE_CHAR_SUPPORT
01618498 2017
14473664 2018/* Map a string to the char class it names (if any). */
1fdab503 2019re_wctype_t
29abe551 2020re_wctype (const_re_char *str)
14473664 2021{
5b0534c8 2022 const char *string = (const char *) str;
14473664
SM
2023 if (STREQ (string, "alnum")) return RECC_ALNUM;
2024 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2025 else if (STREQ (string, "word")) return RECC_WORD;
2026 else if (STREQ (string, "ascii")) return RECC_ASCII;
2027 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2028 else if (STREQ (string, "graph")) return RECC_GRAPH;
2029 else if (STREQ (string, "lower")) return RECC_LOWER;
2030 else if (STREQ (string, "print")) return RECC_PRINT;
2031 else if (STREQ (string, "punct")) return RECC_PUNCT;
2032 else if (STREQ (string, "space")) return RECC_SPACE;
2033 else if (STREQ (string, "upper")) return RECC_UPPER;
2034 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2035 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2036 else if (STREQ (string, "digit")) return RECC_DIGIT;
2037 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2038 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2039 else if (STREQ (string, "blank")) return RECC_BLANK;
2040 else return 0;
2041}
2042
e0f24100 2043/* True if CH is in the char class CC. */
1fdab503 2044boolean
971de7fb 2045re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2046{
2047 switch (cc)
2048 {
f3fcc40d
AS
2049 case RECC_ALNUM: return ISALNUM (ch) != 0;
2050 case RECC_ALPHA: return ISALPHA (ch) != 0;
2051 case RECC_BLANK: return ISBLANK (ch) != 0;
2052 case RECC_CNTRL: return ISCNTRL (ch) != 0;
2053 case RECC_DIGIT: return ISDIGIT (ch) != 0;
2054 case RECC_GRAPH: return ISGRAPH (ch) != 0;
2055 case RECC_LOWER: return ISLOWER (ch) != 0;
2056 case RECC_PRINT: return ISPRINT (ch) != 0;
2057 case RECC_PUNCT: return ISPUNCT (ch) != 0;
2058 case RECC_SPACE: return ISSPACE (ch) != 0;
2059 case RECC_UPPER: return ISUPPER (ch) != 0;
2060 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2061 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
213bd7f2 2062 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
f3fcc40d 2063 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
213bd7f2 2064 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
f3fcc40d 2065 case RECC_WORD: return ISWORD (ch) != 0;
0cdd06f8
SM
2066 case RECC_ERROR: return false;
2067 default:
5e617bc2 2068 abort ();
14473664
SM
2069 }
2070}
fa9a63c5 2071
14473664
SM
2072/* Return a bit-pattern to use in the range-table bits to match multibyte
2073 chars of class CC. */
2074static int
971de7fb 2075re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2076{
2077 switch (cc)
2078 {
2079 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2080 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2081 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2082 case RECC_LOWER: return BIT_LOWER;
2083 case RECC_UPPER: return BIT_UPPER;
2084 case RECC_PUNCT: return BIT_PUNCT;
2085 case RECC_SPACE: return BIT_SPACE;
14473664 2086 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2087 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2088 default:
5e617bc2 2089 abort ();
14473664
SM
2090 }
2091}
2092#endif
77d11aec
RS
2093\f
2094/* Filling in the work area of a range. */
2095
2096/* Actually extend the space in WORK_AREA. */
2097
2098static void
971de7fb 2099extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2100{
77d11aec 2101 work_area->allocated += 16 * sizeof (int);
38182d90 2102 work_area->table = realloc (work_area->table, work_area->allocated);
77d11aec
RS
2103}
2104
8f924df7 2105#if 0
77d11aec
RS
2106#ifdef emacs
2107
2108/* Carefully find the ranges of codes that are equivalent
2109 under case conversion to the range start..end when passed through
2110 TRANSLATE. Handle the case where non-letters can come in between
2111 two upper-case letters (which happens in Latin-1).
2112 Also handle the case of groups of more than 2 case-equivalent chars.
2113
2114 The basic method is to look at consecutive characters and see
2115 if they can form a run that can be handled as one.
2116
2117 Returns -1 if successful, REG_ESPACE if ran out of space. */
2118
2119static int
1dae0f0a
AS
2120set_image_of_range_1 (struct range_table_work_area *work_area,
2121 re_wchar_t start, re_wchar_t end,
2122 RE_TRANSLATE_TYPE translate)
77d11aec
RS
2123{
2124 /* `one_case' indicates a character, or a run of characters,
2125 each of which is an isolate (no case-equivalents).
2126 This includes all ASCII non-letters.
2127
2128 `two_case' indicates a character, or a run of characters,
2129 each of which has two case-equivalent forms.
2130 This includes all ASCII letters.
2131
2132 `strange' indicates a character that has more than one
2133 case-equivalent. */
177c0ea7 2134
77d11aec
RS
2135 enum case_type {one_case, two_case, strange};
2136
2137 /* Describe the run that is in progress,
2138 which the next character can try to extend.
2139 If run_type is strange, that means there really is no run.
2140 If run_type is one_case, then run_start...run_end is the run.
2141 If run_type is two_case, then the run is run_start...run_end,
2142 and the case-equivalents end at run_eqv_end. */
2143
2144 enum case_type run_type = strange;
2145 int run_start, run_end, run_eqv_end;
2146
2147 Lisp_Object eqv_table;
2148
2149 if (!RE_TRANSLATE_P (translate))
2150 {
b7c12565 2151 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2152 work_area->table[work_area->used++] = (start);
2153 work_area->table[work_area->used++] = (end);
b7c12565 2154 return -1;
77d11aec
RS
2155 }
2156
2157 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2158
77d11aec
RS
2159 for (; start <= end; start++)
2160 {
2161 enum case_type this_type;
2162 int eqv = RE_TRANSLATE (eqv_table, start);
2163 int minchar, maxchar;
2164
2165 /* Classify this character */
2166 if (eqv == start)
2167 this_type = one_case;
2168 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2169 this_type = two_case;
2170 else
2171 this_type = strange;
2172
2173 if (start < eqv)
2174 minchar = start, maxchar = eqv;
2175 else
2176 minchar = eqv, maxchar = start;
2177
2178 /* Can this character extend the run in progress? */
2179 if (this_type == strange || this_type != run_type
2180 || !(minchar == run_end + 1
2181 && (run_type == two_case
2182 ? maxchar == run_eqv_end + 1 : 1)))
2183 {
2184 /* No, end the run.
2185 Record each of its equivalent ranges. */
2186 if (run_type == one_case)
2187 {
2188 EXTEND_RANGE_TABLE (work_area, 2);
2189 work_area->table[work_area->used++] = run_start;
2190 work_area->table[work_area->used++] = run_end;
2191 }
2192 else if (run_type == two_case)
2193 {
2194 EXTEND_RANGE_TABLE (work_area, 4);
2195 work_area->table[work_area->used++] = run_start;
2196 work_area->table[work_area->used++] = run_end;
2197 work_area->table[work_area->used++]
2198 = RE_TRANSLATE (eqv_table, run_start);
2199 work_area->table[work_area->used++]
2200 = RE_TRANSLATE (eqv_table, run_end);
2201 }
2202 run_type = strange;
2203 }
177c0ea7 2204
77d11aec
RS
2205 if (this_type == strange)
2206 {
2207 /* For a strange character, add each of its equivalents, one
2208 by one. Don't start a range. */
2209 do
2210 {
2211 EXTEND_RANGE_TABLE (work_area, 2);
2212 work_area->table[work_area->used++] = eqv;
2213 work_area->table[work_area->used++] = eqv;
2214 eqv = RE_TRANSLATE (eqv_table, eqv);
2215 }
2216 while (eqv != start);
2217 }
2218
2219 /* Add this char to the run, or start a new run. */
2220 else if (run_type == strange)
2221 {
2222 /* Initialize a new range. */
2223 run_type = this_type;
2224 run_start = start;
2225 run_end = start;
2226 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2227 }
2228 else
2229 {
2230 /* Extend a running range. */
2231 run_end = minchar;
2232 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2233 }
2234 }
2235
2236 /* If a run is still in progress at the end, finish it now
2237 by recording its equivalent ranges. */
2238 if (run_type == one_case)
2239 {
2240 EXTEND_RANGE_TABLE (work_area, 2);
2241 work_area->table[work_area->used++] = run_start;
2242 work_area->table[work_area->used++] = run_end;
2243 }
2244 else if (run_type == two_case)
2245 {
2246 EXTEND_RANGE_TABLE (work_area, 4);
2247 work_area->table[work_area->used++] = run_start;
2248 work_area->table[work_area->used++] = run_end;
2249 work_area->table[work_area->used++]
2250 = RE_TRANSLATE (eqv_table, run_start);
2251 work_area->table[work_area->used++]
2252 = RE_TRANSLATE (eqv_table, run_end);
2253 }
2254
2255 return -1;
2256}
36595814 2257
77d11aec 2258#endif /* emacs */
36595814 2259
2b34df4e 2260/* Record the image of the range start..end when passed through
36595814
SM
2261 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2262 and is not even necessarily contiguous.
b7c12565
RS
2263 Normally we approximate it with the smallest contiguous range that contains
2264 all the chars we need. However, for Latin-1 we go to extra effort
2265 to do a better job.
2266
2267 This function is not called for ASCII ranges.
77d11aec
RS
2268
2269 Returns -1 if successful, REG_ESPACE if ran out of space. */
2270
2271static int
1dae0f0a
AS
2272set_image_of_range (struct range_table_work_area *work_area,
2273 re_wchar_t start, re_wchar_t end,
2274 RE_TRANSLATE_TYPE translate)
36595814 2275{
77d11aec
RS
2276 re_wchar_t cmin, cmax;
2277
2278#ifdef emacs
2279 /* For Latin-1 ranges, use set_image_of_range_1
2280 to get proper handling of ranges that include letters and nonletters.
b7c12565 2281 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2282 For other character sets, we don't bother to get this right. */
b7c12565
RS
2283 if (RE_TRANSLATE_P (translate) && start < 04400
2284 && !(start < 04200 && end >= 04377))
77d11aec 2285 {
b7c12565 2286 int newend;
77d11aec 2287 int tem;
b7c12565
RS
2288 newend = end;
2289 if (newend > 04377)
2290 newend = 04377;
2291 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2292 if (tem > 0)
2293 return tem;
2294
2295 start = 04400;
2296 if (end < 04400)
2297 return -1;
2298 }
2299#endif
2300
b7c12565
RS
2301 EXTEND_RANGE_TABLE (work_area, 2);
2302 work_area->table[work_area->used++] = (start);
2303 work_area->table[work_area->used++] = (end);
2304
2305 cmin = -1, cmax = -1;
77d11aec 2306
36595814 2307 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2308 {
2309 int ch;
77d11aec 2310
b7c12565
RS
2311 for (ch = start; ch <= end; ch++)
2312 {
2313 re_wchar_t c = TRANSLATE (ch);
2314 if (! (start <= c && c <= end))
2315 {
2316 if (cmin == -1)
2317 cmin = c, cmax = c;
2318 else
2319 {
2320 cmin = MIN (cmin, c);
2321 cmax = MAX (cmax, c);
2322 }
2323 }
2324 }
2325
2326 if (cmin != -1)
2327 {
2328 EXTEND_RANGE_TABLE (work_area, 2);
2329 work_area->table[work_area->used++] = (cmin);
2330 work_area->table[work_area->used++] = (cmax);
2331 }
2332 }
36595814 2333
77d11aec
RS
2334 return -1;
2335}
8f924df7 2336#endif /* 0 */
fa9a63c5
RM
2337\f
2338#ifndef MATCH_MAY_ALLOCATE
2339
2340/* If we cannot allocate large objects within re_match_2_internal,
2341 we make the fail stack and register vectors global.
2342 The fail stack, we grow to the maximum size when a regexp
2343 is compiled.
2344 The register vectors, we adjust in size each time we
2345 compile a regexp, according to the number of registers it needs. */
2346
2347static fail_stack_type fail_stack;
2348
2349/* Size with which the following vectors are currently allocated.
2350 That is so we can make them bigger as needed,
4bb91c68 2351 but never make them smaller. */
fa9a63c5
RM
2352static int regs_allocated_size;
2353
66f0296e
SM
2354static re_char ** regstart, ** regend;
2355static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2356
2357/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2358 but don't make them smaller. */
fa9a63c5
RM
2359
2360static
1dae0f0a 2361regex_grow_registers (int num_regs)
fa9a63c5
RM
2362{
2363 if (num_regs > regs_allocated_size)
2364 {
66f0296e
SM
2365 RETALLOC_IF (regstart, num_regs, re_char *);
2366 RETALLOC_IF (regend, num_regs, re_char *);
2367 RETALLOC_IF (best_regstart, num_regs, re_char *);
2368 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2369
2370 regs_allocated_size = num_regs;
2371 }
2372}
2373
2374#endif /* not MATCH_MAY_ALLOCATE */
2375\f
261cb4bb
PE
2376static boolean group_in_compile_stack (compile_stack_type compile_stack,
2377 regnum_t regnum);
99633e97 2378
fa9a63c5
RM
2379/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2380 Returns one of error codes defined in `regex.h', or zero for success.
2381
2382 Assumes the `allocated' (and perhaps `buffer') and `translate'
2383 fields are set in BUFP on entry.
2384
2385 If it succeeds, results are put in BUFP (if it returns an error, the
2386 contents of BUFP are undefined):
2387 `buffer' is the compiled pattern;
2388 `syntax' is set to SYNTAX;
2389 `used' is set to the length of the compiled pattern;
2390 `fastmap_accurate' is zero;
2391 `re_nsub' is the number of subexpressions in PATTERN;
2392 `not_bol' and `not_eol' are zero;
5e69f11e 2393
c0f9ea08 2394 The `fastmap' field is neither examined nor set. */
fa9a63c5 2395
505bde11
SM
2396/* Insert the `jump' from the end of last alternative to "here".
2397 The space for the jump has already been allocated. */
2398#define FIXUP_ALT_JUMP() \
2399do { \
2400 if (fixup_alt_jump) \
2401 STORE_JUMP (jump, fixup_alt_jump, b); \
2402} while (0)
2403
2404
fa9a63c5
RM
2405/* Return, freeing storage we allocated. */
2406#define FREE_STACK_RETURN(value) \
b18215fc
RS
2407 do { \
2408 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2409 free (compile_stack.stack); \
2410 return value; \
2411 } while (0)
fa9a63c5
RM
2412
2413static reg_errcode_t
29abe551
PE
2414regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
2415 struct re_pattern_buffer *bufp)
fa9a63c5 2416{
01618498
SM
2417 /* We fetch characters from PATTERN here. */
2418 register re_wchar_t c, c1;
5e69f11e 2419
fa9a63c5
RM
2420 /* Points to the end of the buffer, where we should append. */
2421 register unsigned char *b;
5e69f11e 2422
fa9a63c5
RM
2423 /* Keeps track of unclosed groups. */
2424 compile_stack_type compile_stack;
2425
2426 /* Points to the current (ending) position in the pattern. */
22336245
RS
2427#ifdef AIX
2428 /* `const' makes AIX compiler fail. */
66f0296e 2429 unsigned char *p = pattern;
22336245 2430#else
66f0296e 2431 re_char *p = pattern;
22336245 2432#endif
66f0296e 2433 re_char *pend = pattern + size;
5e69f11e 2434
fa9a63c5 2435 /* How to translate the characters in the pattern. */
6676cb1c 2436 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2437
2438 /* Address of the count-byte of the most recently inserted `exactn'
2439 command. This makes it possible to tell if a new exact-match
2440 character can be added to that command or if the character requires
2441 a new `exactn' command. */
2442 unsigned char *pending_exact = 0;
2443
2444 /* Address of start of the most recently finished expression.
2445 This tells, e.g., postfix * where to find the start of its
2446 operand. Reset at the beginning of groups and alternatives. */
2447 unsigned char *laststart = 0;
2448
2449 /* Address of beginning of regexp, or inside of last group. */
2450 unsigned char *begalt;
2451
2452 /* Place in the uncompiled pattern (i.e., the {) to
2453 which to go back if the interval is invalid. */
66f0296e 2454 re_char *beg_interval;
5e69f11e 2455
fa9a63c5 2456 /* Address of the place where a forward jump should go to the end of
7814e705 2457 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2458 last -- ends with a forward jump of this sort. */
2459 unsigned char *fixup_alt_jump = 0;
2460
b18215fc
RS
2461 /* Work area for range table of charset. */
2462 struct range_table_work_area range_table_work;
2463
2d1675e4
SM
2464 /* If the object matched can contain multibyte characters. */
2465 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2466
f9b0fd99
RS
2467 /* Nonzero if we have pushed down into a subpattern. */
2468 int in_subpattern = 0;
2469
2470 /* These hold the values of p, pattern, and pend from the main
2471 pattern when we have pushed into a subpattern. */
da053e48
PE
2472 re_char *main_p IF_LINT (= NULL);
2473 re_char *main_pattern IF_LINT (= NULL);
2474 re_char *main_pend IF_LINT (= NULL);
f9b0fd99 2475
fa9a63c5 2476#ifdef DEBUG
99633e97 2477 debug++;
dc4a2ee0 2478 DEBUG_PRINT ("\nCompiling pattern: ");
99633e97 2479 if (debug > 0)
fa9a63c5
RM
2480 {
2481 unsigned debug_count;
5e69f11e 2482
fa9a63c5 2483 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2484 putchar (pattern[debug_count]);
fa9a63c5
RM
2485 putchar ('\n');
2486 }
2487#endif /* DEBUG */
2488
2489 /* Initialize the compile stack. */
2490 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2491 if (compile_stack.stack == NULL)
2492 return REG_ESPACE;
2493
2494 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2495 compile_stack.avail = 0;
2496
b18215fc
RS
2497 range_table_work.table = 0;
2498 range_table_work.allocated = 0;
2499
fa9a63c5
RM
2500 /* Initialize the pattern buffer. */
2501 bufp->syntax = syntax;
2502 bufp->fastmap_accurate = 0;
2503 bufp->not_bol = bufp->not_eol = 0;
6224b623 2504 bufp->used_syntax = 0;
fa9a63c5
RM
2505
2506 /* Set `used' to zero, so that if we return an error, the pattern
2507 printer (for debugging) will think there's no pattern. We reset it
2508 at the end. */
2509 bufp->used = 0;
5e69f11e 2510
fa9a63c5 2511 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2512 bufp->re_nsub = 0;
fa9a63c5 2513
0b32bf0e 2514#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2515 /* Initialize the syntax table. */
2516 init_syntax_once ();
2517#endif
2518
2519 if (bufp->allocated == 0)
2520 {
2521 if (bufp->buffer)
2522 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2523 enough space. This loses if buffer's address is bogus, but
7814e705 2524 that is the user's responsibility. */
25fe55af
RS
2525 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2526 }
fa9a63c5 2527 else
7814e705 2528 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2529 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2530 }
fa9a63c5
RM
2531 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2532
2533 bufp->allocated = INIT_BUF_SIZE;
2534 }
2535
2536 begalt = b = bufp->buffer;
2537
2538 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2539 while (1)
fa9a63c5 2540 {
f9b0fd99
RS
2541 if (p == pend)
2542 {
2543 /* If this is the end of an included regexp,
2544 pop back to the main regexp and try again. */
2545 if (in_subpattern)
2546 {
2547 in_subpattern = 0;
2548 pattern = main_pattern;
2549 p = main_p;
2550 pend = main_pend;
2551 continue;
2552 }
2553 /* If this is the end of the main regexp, we are done. */
2554 break;
2555 }
2556
fa9a63c5
RM
2557 PATFETCH (c);
2558
2559 switch (c)
25fe55af 2560 {
f9b0fd99
RS
2561 case ' ':
2562 {
2563 re_char *p1 = p;
2564
2565 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2566 spaces normally. And don't try to do this recursively. */
2567 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2568 goto normal_char;
2569
2570 /* Peek past following spaces. */
2571 while (p1 != pend)
2572 {
2573 if (*p1 != ' ')
2574 break;
2575 p1++;
2576 }
2577 /* If the spaces are followed by a repetition op,
2578 treat them normally. */
c721eee5
RS
2579 if (p1 != pend
2580 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2581 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2582 goto normal_char;
2583
2584 /* Replace the spaces with the whitespace regexp. */
2585 in_subpattern = 1;
2586 main_p = p1;
2587 main_pend = pend;
2588 main_pattern = pattern;
2589 p = pattern = whitespace_regexp;
5b0534c8 2590 pend = p + strlen ((const char *) p);
f9b0fd99 2591 break;
7814e705 2592 }
f9b0fd99 2593
25fe55af
RS
2594 case '^':
2595 {
7814e705 2596 if ( /* If at start of pattern, it's an operator. */
25fe55af 2597 p == pattern + 1
7814e705 2598 /* If context independent, it's an operator. */
25fe55af 2599 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2600 /* Otherwise, depends on what's come before. */
25fe55af 2601 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2602 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2603 else
2604 goto normal_char;
2605 }
2606 break;
2607
2608
2609 case '$':
2610 {
2611 if ( /* If at end of pattern, it's an operator. */
2612 p == pend
7814e705 2613 /* If context independent, it's an operator. */
25fe55af
RS
2614 || syntax & RE_CONTEXT_INDEP_ANCHORS
2615 /* Otherwise, depends on what's next. */
2616 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2617 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2618 else
2619 goto normal_char;
2620 }
2621 break;
fa9a63c5
RM
2622
2623
2624 case '+':
25fe55af
RS
2625 case '?':
2626 if ((syntax & RE_BK_PLUS_QM)
2627 || (syntax & RE_LIMITED_OPS))
2628 goto normal_char;
2629 handle_plus:
2630 case '*':
5ac2eb34 2631 /* If there is no previous pattern... */
25fe55af
RS
2632 if (!laststart)
2633 {
2634 if (syntax & RE_CONTEXT_INVALID_OPS)
2635 FREE_STACK_RETURN (REG_BADRPT);
2636 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2637 goto normal_char;
2638 }
2639
2640 {
7814e705 2641 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2642 boolean zero_times_ok = 0, many_times_ok = 0;
2643 boolean greedy = 1;
25fe55af
RS
2644
2645 /* If there is a sequence of repetition chars, collapse it
2646 down to just one (the right one). We can't combine
2647 interval operators with these because of, e.g., `a{2}*',
7814e705 2648 which should only match an even number of `a's. */
25fe55af
RS
2649
2650 for (;;)
2651 {
0b32bf0e 2652 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2653 && c == '?' && (zero_times_ok || many_times_ok))
2654 greedy = 0;
2655 else
2656 {
2657 zero_times_ok |= c != '+';
2658 many_times_ok |= c != '?';
2659 }
25fe55af
RS
2660
2661 if (p == pend)
2662 break;
ed0767d8
SM
2663 else if (*p == '*'
2664 || (!(syntax & RE_BK_PLUS_QM)
2665 && (*p == '+' || *p == '?')))
25fe55af 2666 ;
ed0767d8 2667 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2668 {
ed0767d8
SM
2669 if (p+1 == pend)
2670 FREE_STACK_RETURN (REG_EESCAPE);
2671 if (p[1] == '+' || p[1] == '?')
2672 PATFETCH (c); /* Gobble up the backslash. */
2673 else
2674 break;
25fe55af
RS
2675 }
2676 else
ed0767d8 2677 break;
25fe55af 2678 /* If we get here, we found another repeat character. */
ed0767d8
SM
2679 PATFETCH (c);
2680 }
25fe55af
RS
2681
2682 /* Star, etc. applied to an empty pattern is equivalent
2683 to an empty pattern. */
4e8a9132 2684 if (!laststart || laststart == b)
25fe55af
RS
2685 break;
2686
2687 /* Now we know whether or not zero matches is allowed
7814e705 2688 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2689 if (greedy)
2690 {
99633e97 2691 if (many_times_ok)
4e8a9132
SM
2692 {
2693 boolean simple = skip_one_char (laststart) == b;
d1dfb56c 2694 size_t startoffset = 0;
f6a3f532 2695 re_opcode_t ofj =
01618498 2696 /* Check if the loop can match the empty string. */
6df42991
SM
2697 (simple || !analyse_first (laststart, b, NULL, 0))
2698 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2699 assert (skip_one_char (laststart) <= b);
177c0ea7 2700
4e8a9132
SM
2701 if (!zero_times_ok && simple)
2702 { /* Since simple * loops can be made faster by using
2703 on_failure_keep_string_jump, we turn simple P+
2704 into PP* if P is simple. */
2705 unsigned char *p1, *p2;
2706 startoffset = b - laststart;
2707 GET_BUFFER_SPACE (startoffset);
2708 p1 = b; p2 = laststart;
2709 while (p2 < p1)
2710 *b++ = *p2++;
2711 zero_times_ok = 1;
99633e97 2712 }
4e8a9132
SM
2713
2714 GET_BUFFER_SPACE (6);
2715 if (!zero_times_ok)
2716 /* A + loop. */
f6a3f532 2717 STORE_JUMP (ofj, b, b + 6);
99633e97 2718 else
4e8a9132
SM
2719 /* Simple * loops can use on_failure_keep_string_jump
2720 depending on what follows. But since we don't know
2721 that yet, we leave the decision up to
2722 on_failure_jump_smart. */
f6a3f532 2723 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2724 laststart + startoffset, b + 6);
99633e97 2725 b += 3;
4e8a9132 2726 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2727 b += 3;
2728 }
2729 else
2730 {
4e8a9132
SM
2731 /* A simple ? pattern. */
2732 assert (zero_times_ok);
2733 GET_BUFFER_SPACE (3);
2734 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2735 b += 3;
2736 }
1c8c6d39
DL
2737 }
2738 else /* not greedy */
5ac2eb34 2739 { /* I wish the greedy and non-greedy cases could be merged. */
1c8c6d39 2740
0683b6fa 2741 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2742 if (many_times_ok)
2743 {
f6a3f532
SM
2744 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2745
6df42991
SM
2746 /* The non-greedy multiple match looks like
2747 a repeat..until: we only need a conditional jump
2748 at the end of the loop. */
f6a3f532
SM
2749 if (emptyp) BUF_PUSH (no_op);
2750 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2751 : on_failure_jump, b, laststart);
1c8c6d39
DL
2752 b += 3;
2753 if (zero_times_ok)
2754 {
2755 /* The repeat...until naturally matches one or more.
2756 To also match zero times, we need to first jump to
6df42991 2757 the end of the loop (its conditional jump). */
1c8c6d39
DL
2758 INSERT_JUMP (jump, laststart, b);
2759 b += 3;
2760 }
2761 }
2762 else
2763 {
2764 /* non-greedy a?? */
1c8c6d39
DL
2765 INSERT_JUMP (jump, laststart, b + 3);
2766 b += 3;
2767 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2768 b += 3;
2769 }
2770 }
2771 }
4e8a9132 2772 pending_exact = 0;
fa9a63c5
RM
2773 break;
2774
2775
2776 case '.':
25fe55af
RS
2777 laststart = b;
2778 BUF_PUSH (anychar);
2779 break;
fa9a63c5
RM
2780
2781
25fe55af
RS
2782 case '[':
2783 {
19ed5445
PE
2784 re_char *p1;
2785
b18215fc 2786 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2787
25fe55af 2788 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2789
25fe55af
RS
2790 /* Ensure that we have enough space to push a charset: the
2791 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2792 GET_BUFFER_SPACE (34);
2793
25fe55af 2794 laststart = b;
e318085a 2795
25fe55af 2796 /* We test `*p == '^' twice, instead of using an if
7814e705 2797 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2798 BUF_PUSH (*p == '^' ? charset_not : charset);
2799 if (*p == '^')
2800 p++;
e318085a 2801
25fe55af
RS
2802 /* Remember the first position in the bracket expression. */
2803 p1 = p;
e318085a 2804
7814e705 2805 /* Push the number of bytes in the bitmap. */
25fe55af 2806 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2807
25fe55af 2808 /* Clear the whole map. */
72af86bd 2809 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2810
25fe55af
RS
2811 /* charset_not matches newline according to a syntax bit. */
2812 if ((re_opcode_t) b[-2] == charset_not
2813 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2814 SET_LIST_BIT ('\n');
fa9a63c5 2815
7814e705 2816 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2817 for (;;)
2818 {
b18215fc 2819 boolean escaped_char = false;
2d1675e4 2820 const unsigned char *p2 = p;
abbd1bcf 2821 re_wchar_t ch;
e318085a 2822
25fe55af 2823 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2824
36595814
SM
2825 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2826 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2827 So the translation is done later in a loop. Example:
2828 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2829 PATFETCH (c);
e318085a 2830
25fe55af
RS
2831 /* \ might escape characters inside [...] and [^...]. */
2832 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2833 {
2834 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2835
2836 PATFETCH (c);
b18215fc 2837 escaped_char = true;
25fe55af 2838 }
b18215fc
RS
2839 else
2840 {
7814e705 2841 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2842 not (i.e., when the bracket expression is `[]' so
2843 far), the ']' character bit gets set way below. */
2d1675e4 2844 if (c == ']' && p2 != p1)
657fcfbd 2845 break;
25fe55af 2846 }
b18215fc 2847
25fe55af
RS
2848 /* See if we're at the beginning of a possible character
2849 class. */
b18215fc 2850
2d1675e4
SM
2851 if (!escaped_char &&
2852 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2853 {
7814e705 2854 /* Leave room for the null. */
14473664 2855 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2856 const unsigned char *class_beg;
b18215fc 2857
25fe55af
RS
2858 PATFETCH (c);
2859 c1 = 0;
ed0767d8 2860 class_beg = p;
b18215fc 2861
25fe55af
RS
2862 /* If pattern is `[[:'. */
2863 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2864
25fe55af
RS
2865 for (;;)
2866 {
14473664
SM
2867 PATFETCH (c);
2868 if ((c == ':' && *p == ']') || p == pend)
2869 break;
2870 if (c1 < CHAR_CLASS_MAX_LENGTH)
2871 str[c1++] = c;
2872 else
2873 /* This is in any case an invalid class name. */
2874 str[0] = '\0';
25fe55af
RS
2875 }
2876 str[c1] = '\0';
b18215fc
RS
2877
2878 /* If it isn't a word bracketed by `[:' and `:]':
2879 undo the ending character, the letters, and
2880 leave the leading `:' and `[' (but set bits for
2881 them). */
25fe55af
RS
2882 if (c == ':' && *p == ']')
2883 {
abbd1bcf 2884 re_wctype_t cc = re_wctype (str);
14473664
SM
2885
2886 if (cc == 0)
fa9a63c5
RM
2887 FREE_STACK_RETURN (REG_ECTYPE);
2888
14473664
SM
2889 /* Throw away the ] at the end of the character
2890 class. */
2891 PATFETCH (c);
fa9a63c5 2892
14473664 2893 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2894
cf9c99bc
KH
2895#ifndef emacs
2896 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
2897 if (re_iswctype (btowc (ch), cc))
2898 {
2899 c = TRANSLATE (ch);
ed00c2ac
KH
2900 if (c < (1 << BYTEWIDTH))
2901 SET_LIST_BIT (c);
8f924df7 2902 }
cf9c99bc
KH
2903#else /* emacs */
2904 /* Most character classes in a multibyte match
2905 just set a flag. Exceptions are is_blank,
2906 is_digit, is_cntrl, and is_xdigit, since
2907 they can only match ASCII characters. We
2908 don't need to handle them for multibyte.
2909 They are distinguished by a negative wctype. */
96cc36cc 2910
254c06a8
SM
2911 /* Setup the gl_state object to its buffer-defined
2912 value. This hardcodes the buffer-global
2913 syntax-table for ASCII chars, while the other chars
2914 will obey syntax-table properties. It's not ideal,
2915 but it's the way it's been done until now. */
d48cd3f4 2916 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 2917
cf9c99bc 2918 for (ch = 0; ch < 256; ++ch)
25fe55af 2919 {
cf9c99bc
KH
2920 c = RE_CHAR_TO_MULTIBYTE (ch);
2921 if (! CHAR_BYTE8_P (c)
2922 && re_iswctype (c, cc))
8f924df7 2923 {
cf9c99bc
KH
2924 SET_LIST_BIT (ch);
2925 c1 = TRANSLATE (c);
2926 if (c1 == c)
2927 continue;
2928 if (ASCII_CHAR_P (c1))
2929 SET_LIST_BIT (c1);
2930 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2931 SET_LIST_BIT (c1);
8f924df7 2932 }
25fe55af 2933 }
cf9c99bc
KH
2934 SET_RANGE_TABLE_WORK_AREA_BIT
2935 (range_table_work, re_wctype_to_bit (cc));
2936#endif /* emacs */
6224b623
SM
2937 /* In most cases the matching rule for char classes
2938 only uses the syntax table for multibyte chars,
2939 so that the content of the syntax-table it is not
2940 hardcoded in the range_table. SPACE and WORD are
2941 the two exceptions. */
2942 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2943 bufp->used_syntax = 1;
2944
b18215fc
RS
2945 /* Repeat the loop. */
2946 continue;
25fe55af
RS
2947 }
2948 else
2949 {
ed0767d8
SM
2950 /* Go back to right after the "[:". */
2951 p = class_beg;
25fe55af 2952 SET_LIST_BIT ('[');
b18215fc
RS
2953
2954 Because the `:' may start a range, we
2955 can't simply set bit and repeat the loop.
7814e705 2956 Instead, just set it to C and handle below. */
b18215fc 2957 c = ':';
25fe55af
RS
2958 }
2959 }
b18215fc
RS
2960
2961 if (p < pend && p[0] == '-' && p[1] != ']')
2962 {
2963
2964 /* Discard the `-'. */
2965 PATFETCH (c1);
2966
2967 /* Fetch the character which ends the range. */
2968 PATFETCH (c1);
cf9c99bc
KH
2969#ifdef emacs
2970 if (CHAR_BYTE8_P (c1)
2971 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2972 /* Treat the range from a multibyte character to
2973 raw-byte character as empty. */
2974 c = c1 + 1;
2975#endif /* emacs */
e318085a 2976 }
25fe55af 2977 else
b18215fc
RS
2978 /* Range from C to C. */
2979 c1 = c;
2980
cf9c99bc 2981 if (c > c1)
25fe55af 2982 {
cf9c99bc
KH
2983 if (syntax & RE_NO_EMPTY_RANGES)
2984 FREE_STACK_RETURN (REG_ERANGEX);
2985 /* Else, repeat the loop. */
bf216479 2986 }
6fdd04b0 2987 else
25fe55af 2988 {
cf9c99bc
KH
2989#ifndef emacs
2990 /* Set the range into bitmap */
8f924df7 2991 for (; c <= c1; c++)
b18215fc 2992 {
cf9c99bc
KH
2993 ch = TRANSLATE (c);
2994 if (ch < (1 << BYTEWIDTH))
2995 SET_LIST_BIT (ch);
2996 }
2997#else /* emacs */
2998 if (c < 128)
2999 {
3000 ch = MIN (127, c1);
3001 SETUP_ASCII_RANGE (range_table_work, c, ch);
3002 c = ch + 1;
3003 if (CHAR_BYTE8_P (c1))
3004 c = BYTE8_TO_CHAR (128);
3005 }
3006 if (c <= c1)
3007 {
3008 if (CHAR_BYTE8_P (c))
3009 {
3010 c = CHAR_TO_BYTE8 (c);
3011 c1 = CHAR_TO_BYTE8 (c1);
3012 for (; c <= c1; c++)
3013 SET_LIST_BIT (c);
3014 }
3015 else if (multibyte)
3016 {
3017 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3018 }
3019 else
3020 {
3021 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3022 }
e934739e 3023 }
cf9c99bc 3024#endif /* emacs */
25fe55af 3025 }
e318085a
RS
3026 }
3027
25fe55af 3028 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3029 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3030 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3031 b[-1]--;
3032 b += b[-1];
fa9a63c5 3033
96cc36cc
RS
3034 /* Build real range table from work area. */
3035 if (RANGE_TABLE_WORK_USED (range_table_work)
3036 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3037 {
3038 int i;
3039 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3040
b18215fc 3041 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc 3042 bytes for flags, two for COUNT, and three bytes for
5ac2eb34 3043 each character. */
96cc36cc 3044 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3045
b18215fc
RS
3046 /* Indicate the existence of range table. */
3047 laststart[1] |= 0x80;
fa9a63c5 3048
96cc36cc
RS
3049 /* Store the character class flag bits into the range table.
3050 If not in emacs, these flag bits are always 0. */
3051 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3052 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3053
b18215fc
RS
3054 STORE_NUMBER_AND_INCR (b, used / 2);
3055 for (i = 0; i < used; i++)
3056 STORE_CHARACTER_AND_INCR
3057 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3058 }
25fe55af
RS
3059 }
3060 break;
fa9a63c5
RM
3061
3062
b18215fc 3063 case '(':
25fe55af
RS
3064 if (syntax & RE_NO_BK_PARENS)
3065 goto handle_open;
3066 else
3067 goto normal_char;
fa9a63c5
RM
3068
3069
25fe55af
RS
3070 case ')':
3071 if (syntax & RE_NO_BK_PARENS)
3072 goto handle_close;
3073 else
3074 goto normal_char;
e318085a
RS
3075
3076
25fe55af
RS
3077 case '\n':
3078 if (syntax & RE_NEWLINE_ALT)
3079 goto handle_alt;
3080 else
3081 goto normal_char;
e318085a
RS
3082
3083
b18215fc 3084 case '|':
25fe55af
RS
3085 if (syntax & RE_NO_BK_VBAR)
3086 goto handle_alt;
3087 else
3088 goto normal_char;
3089
3090
3091 case '{':
3092 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3093 goto handle_interval;
3094 else
3095 goto normal_char;
3096
3097
3098 case '\\':
3099 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3100
3101 /* Do not translate the character after the \, so that we can
3102 distinguish, e.g., \B from \b, even if we normally would
3103 translate, e.g., B to b. */
36595814 3104 PATFETCH (c);
25fe55af
RS
3105
3106 switch (c)
3107 {
3108 case '(':
3109 if (syntax & RE_NO_BK_PARENS)
3110 goto normal_backslash;
3111
3112 handle_open:
505bde11
SM
3113 {
3114 int shy = 0;
c69b0314 3115 regnum_t regnum = 0;
505bde11
SM
3116 if (p+1 < pend)
3117 {
3118 /* Look for a special (?...) construct */
ed0767d8 3119 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3120 {
ed0767d8 3121 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3122 while (!shy)
505bde11 3123 {
c69b0314
SM
3124 PATFETCH (c);
3125 switch (c)
3126 {
3127 case ':': shy = 1; break;
3128 case '0':
3129 /* An explicitly specified regnum must start
3130 with non-0. */
3131 if (regnum == 0)
3132 FREE_STACK_RETURN (REG_BADPAT);
3133 case '1': case '2': case '3': case '4':
3134 case '5': case '6': case '7': case '8': case '9':
3135 regnum = 10*regnum + (c - '0'); break;
3136 default:
3137 /* Only (?:...) is supported right now. */
3138 FREE_STACK_RETURN (REG_BADPAT);
3139 }
505bde11
SM
3140 }
3141 }
505bde11
SM
3142 }
3143
3144 if (!shy)
c69b0314
SM
3145 regnum = ++bufp->re_nsub;
3146 else if (regnum)
3147 { /* It's actually not shy, but explicitly numbered. */
3148 shy = 0;
3149 if (regnum > bufp->re_nsub)
3150 bufp->re_nsub = regnum;
3151 else if (regnum > bufp->re_nsub
3152 /* Ideally, we'd want to check that the specified
3153 group can't have matched (i.e. all subgroups
3154 using the same regnum are in other branches of
3155 OR patterns), but we don't currently keep track
3156 of enough info to do that easily. */
3157 || group_in_compile_stack (compile_stack, regnum))
3158 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3159 }
c69b0314
SM
3160 else
3161 /* It's really shy. */
3162 regnum = - bufp->re_nsub;
25fe55af 3163
99633e97
SM
3164 if (COMPILE_STACK_FULL)
3165 {
3166 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3167 compile_stack_elt_t);
3168 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3169
99633e97
SM
3170 compile_stack.size <<= 1;
3171 }
25fe55af 3172
99633e97 3173 /* These are the values to restore when we hit end of this
7814e705 3174 group. They are all relative offsets, so that if the
99633e97
SM
3175 whole pattern moves because of realloc, they will still
3176 be valid. */
3177 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3178 COMPILE_STACK_TOP.fixup_alt_jump
3179 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3180 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3181 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3182
c69b0314
SM
3183 /* Do not push a start_memory for groups beyond the last one
3184 we can represent in the compiled pattern. */
3185 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3186 BUF_PUSH_2 (start_memory, regnum);
3187
3188 compile_stack.avail++;
3189
3190 fixup_alt_jump = 0;
3191 laststart = 0;
3192 begalt = b;
3193 /* If we've reached MAX_REGNUM groups, then this open
3194 won't actually generate any code, so we'll have to
3195 clear pending_exact explicitly. */
3196 pending_exact = 0;
3197 break;
505bde11 3198 }
25fe55af
RS
3199
3200 case ')':
3201 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3202
3203 if (COMPILE_STACK_EMPTY)
505bde11
SM
3204 {
3205 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3206 goto normal_backslash;
3207 else
3208 FREE_STACK_RETURN (REG_ERPAREN);
3209 }
25fe55af
RS
3210
3211 handle_close:
505bde11 3212 FIXUP_ALT_JUMP ();
25fe55af
RS
3213
3214 /* See similar code for backslashed left paren above. */
3215 if (COMPILE_STACK_EMPTY)
505bde11
SM
3216 {
3217 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3218 goto normal_char;
3219 else
3220 FREE_STACK_RETURN (REG_ERPAREN);
3221 }
25fe55af
RS
3222
3223 /* Since we just checked for an empty stack above, this
3224 ``can't happen''. */
3225 assert (compile_stack.avail != 0);
3226 {
3227 /* We don't just want to restore into `regnum', because
3228 later groups should continue to be numbered higher,
7814e705 3229 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3230 regnum_t regnum;
25fe55af
RS
3231
3232 compile_stack.avail--;
3233 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3234 fixup_alt_jump
3235 = COMPILE_STACK_TOP.fixup_alt_jump
3236 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3237 : 0;
3238 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3239 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3240 /* If we've reached MAX_REGNUM groups, then this open
3241 won't actually generate any code, so we'll have to
3242 clear pending_exact explicitly. */
3243 pending_exact = 0;
e318085a 3244
25fe55af 3245 /* We're at the end of the group, so now we know how many
7814e705 3246 groups were inside this one. */
c69b0314
SM
3247 if (regnum <= MAX_REGNUM && regnum > 0)
3248 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3249 }
3250 break;
3251
3252
3253 case '|': /* `\|'. */
3254 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3255 goto normal_backslash;
3256 handle_alt:
3257 if (syntax & RE_LIMITED_OPS)
3258 goto normal_char;
3259
3260 /* Insert before the previous alternative a jump which
7814e705 3261 jumps to this alternative if the former fails. */
25fe55af
RS
3262 GET_BUFFER_SPACE (3);
3263 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3264 pending_exact = 0;
3265 b += 3;
3266
3267 /* The alternative before this one has a jump after it
3268 which gets executed if it gets matched. Adjust that
3269 jump so it will jump to this alternative's analogous
3270 jump (put in below, which in turn will jump to the next
3271 (if any) alternative's such jump, etc.). The last such
3272 jump jumps to the correct final destination. A picture:
3273 _____ _____
3274 | | | |
3275 | v | v
d1dfb56c 3276 a | b | c
25fe55af
RS
3277
3278 If we are at `b', then fixup_alt_jump right now points to a
3279 three-byte space after `a'. We'll put in the jump, set
3280 fixup_alt_jump to right after `b', and leave behind three
3281 bytes which we'll fill in when we get to after `c'. */
3282
505bde11 3283 FIXUP_ALT_JUMP ();
25fe55af
RS
3284
3285 /* Mark and leave space for a jump after this alternative,
3286 to be filled in later either by next alternative or
3287 when we know we're at the end of a series of alternatives. */
3288 fixup_alt_jump = b;
3289 GET_BUFFER_SPACE (3);
3290 b += 3;
3291
3292 laststart = 0;
3293 begalt = b;
3294 break;
3295
3296
3297 case '{':
3298 /* If \{ is a literal. */
3299 if (!(syntax & RE_INTERVALS)
3300 /* If we're at `\{' and it's not the open-interval
3301 operator. */
4bb91c68 3302 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3303 goto normal_backslash;
3304
3305 handle_interval:
3306 {
3307 /* If got here, then the syntax allows intervals. */
3308
3309 /* At least (most) this many matches must be made. */
99633e97 3310 int lower_bound = 0, upper_bound = -1;
25fe55af 3311
ed0767d8 3312 beg_interval = p;
25fe55af 3313
25fe55af
RS
3314 GET_UNSIGNED_NUMBER (lower_bound);
3315
3316 if (c == ',')
ed0767d8 3317 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3318 else
3319 /* Interval such as `{1}' => match exactly once. */
3320 upper_bound = lower_bound;
3321
3322 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3323 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3324 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3325
3326 if (!(syntax & RE_NO_BK_BRACES))
3327 {
4bb91c68
SM
3328 if (c != '\\')
3329 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3330 if (p == pend)
3331 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3332 PATFETCH (c);
3333 }
3334
3335 if (c != '}')
4bb91c68 3336 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3337
3338 /* We just parsed a valid interval. */
3339
3340 /* If it's invalid to have no preceding re. */
3341 if (!laststart)
3342 {
3343 if (syntax & RE_CONTEXT_INVALID_OPS)
3344 FREE_STACK_RETURN (REG_BADRPT);
3345 else if (syntax & RE_CONTEXT_INDEP_OPS)
3346 laststart = b;
3347 else
3348 goto unfetch_interval;
3349 }
3350
6df42991
SM
3351 if (upper_bound == 0)
3352 /* If the upper bound is zero, just drop the sub pattern
3353 altogether. */
3354 b = laststart;
3355 else if (lower_bound == 1 && upper_bound == 1)
3356 /* Just match it once: nothing to do here. */
3357 ;
3358
3359 /* Otherwise, we have a nontrivial interval. When
3360 we're all done, the pattern will look like:
3361 set_number_at <jump count> <upper bound>
3362 set_number_at <succeed_n count> <lower bound>
3363 succeed_n <after jump addr> <succeed_n count>
3364 <body of loop>
3365 jump_n <succeed_n addr> <jump count>
3366 (The upper bound and `jump_n' are omitted if
3367 `upper_bound' is 1, though.) */
3368 else
3369 { /* If the upper bound is > 1, we need to insert
3370 more at the end of the loop. */
3371 unsigned int nbytes = (upper_bound < 0 ? 3
3372 : upper_bound > 1 ? 5 : 0);
3373 unsigned int startoffset = 0;
3374
3375 GET_BUFFER_SPACE (20); /* We might use less. */
3376
3377 if (lower_bound == 0)
3378 {
3379 /* A succeed_n that starts with 0 is really a
3380 simple on_failure_jump_loop. */
3381 INSERT_JUMP (on_failure_jump_loop, laststart,
3382 b + 3 + nbytes);
3383 b += 3;
3384 }
3385 else
3386 {
3387 /* Initialize lower bound of the `succeed_n', even
3388 though it will be set during matching by its
3389 attendant `set_number_at' (inserted next),
3390 because `re_compile_fastmap' needs to know.
3391 Jump to the `jump_n' we might insert below. */
3392 INSERT_JUMP2 (succeed_n, laststart,
3393 b + 5 + nbytes,
3394 lower_bound);
3395 b += 5;
3396
3397 /* Code to initialize the lower bound. Insert
7814e705 3398 before the `succeed_n'. The `5' is the last two
6df42991
SM
3399 bytes of this `set_number_at', plus 3 bytes of
3400 the following `succeed_n'. */
3401 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3402 b += 5;
3403 startoffset += 5;
3404 }
3405
3406 if (upper_bound < 0)
3407 {
3408 /* A negative upper bound stands for infinity,
3409 in which case it degenerates to a plain jump. */
3410 STORE_JUMP (jump, b, laststart + startoffset);
3411 b += 3;
3412 }
3413 else if (upper_bound > 1)
3414 { /* More than one repetition is allowed, so
3415 append a backward jump to the `succeed_n'
3416 that starts this interval.
3417
3418 When we've reached this during matching,
3419 we'll have matched the interval once, so
3420 jump back only `upper_bound - 1' times. */
3421 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3422 upper_bound - 1);
3423 b += 5;
3424
3425 /* The location we want to set is the second
3426 parameter of the `jump_n'; that is `b-2' as
3427 an absolute address. `laststart' will be
3428 the `set_number_at' we're about to insert;
3429 `laststart+3' the number to set, the source
3430 for the relative address. But we are
3431 inserting into the middle of the pattern --
3432 so everything is getting moved up by 5.
3433 Conclusion: (b - 2) - (laststart + 3) + 5,
3434 i.e., b - laststart.
3435
3436 We insert this at the beginning of the loop
3437 so that if we fail during matching, we'll
3438 reinitialize the bounds. */
3439 insert_op2 (set_number_at, laststart, b - laststart,
3440 upper_bound - 1, b);
3441 b += 5;
3442 }
3443 }
25fe55af
RS
3444 pending_exact = 0;
3445 beg_interval = NULL;
3446 }
3447 break;
3448
3449 unfetch_interval:
3450 /* If an invalid interval, match the characters as literals. */
3451 assert (beg_interval);
3452 p = beg_interval;
3453 beg_interval = NULL;
3454
3455 /* normal_char and normal_backslash need `c'. */
ed0767d8 3456 c = '{';
25fe55af
RS
3457
3458 if (!(syntax & RE_NO_BK_BRACES))
3459 {
ed0767d8
SM
3460 assert (p > pattern && p[-1] == '\\');
3461 goto normal_backslash;
25fe55af 3462 }
ed0767d8
SM
3463 else
3464 goto normal_char;
e318085a 3465
b18215fc 3466#ifdef emacs
25fe55af 3467 /* There is no way to specify the before_dot and after_dot
7814e705 3468 operators. rms says this is ok. --karl */
25fe55af 3469 case '=':
5ac2eb34 3470 laststart = b;
25fe55af
RS
3471 BUF_PUSH (at_dot);
3472 break;
3473
3474 case 's':
3475 laststart = b;
3476 PATFETCH (c);
3477 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3478 break;
3479
3480 case 'S':
3481 laststart = b;
3482 PATFETCH (c);
3483 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3484 break;
b18215fc
RS
3485
3486 case 'c':
3487 laststart = b;
36595814 3488 PATFETCH (c);
b18215fc
RS
3489 BUF_PUSH_2 (categoryspec, c);
3490 break;
e318085a 3491
b18215fc
RS
3492 case 'C':
3493 laststart = b;
36595814 3494 PATFETCH (c);
b18215fc
RS
3495 BUF_PUSH_2 (notcategoryspec, c);
3496 break;
3497#endif /* emacs */
e318085a 3498
e318085a 3499
25fe55af 3500 case 'w':
4bb91c68
SM
3501 if (syntax & RE_NO_GNU_OPS)
3502 goto normal_char;
25fe55af 3503 laststart = b;
1fb352e0 3504 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3505 break;
e318085a 3506
e318085a 3507
25fe55af 3508 case 'W':
4bb91c68
SM
3509 if (syntax & RE_NO_GNU_OPS)
3510 goto normal_char;
25fe55af 3511 laststart = b;
1fb352e0 3512 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3513 break;
e318085a
RS
3514
3515
25fe55af 3516 case '<':
4bb91c68
SM
3517 if (syntax & RE_NO_GNU_OPS)
3518 goto normal_char;
5ac2eb34 3519 laststart = b;
25fe55af
RS
3520 BUF_PUSH (wordbeg);
3521 break;
e318085a 3522
25fe55af 3523 case '>':
4bb91c68
SM
3524 if (syntax & RE_NO_GNU_OPS)
3525 goto normal_char;
5ac2eb34 3526 laststart = b;
25fe55af
RS
3527 BUF_PUSH (wordend);
3528 break;
e318085a 3529
669fa600
SM
3530 case '_':
3531 if (syntax & RE_NO_GNU_OPS)
3532 goto normal_char;
3533 laststart = b;
3534 PATFETCH (c);
3535 if (c == '<')
3536 BUF_PUSH (symbeg);
3537 else if (c == '>')
3538 BUF_PUSH (symend);
3539 else
3540 FREE_STACK_RETURN (REG_BADPAT);
3541 break;
3542
25fe55af 3543 case 'b':
4bb91c68
SM
3544 if (syntax & RE_NO_GNU_OPS)
3545 goto normal_char;
25fe55af
RS
3546 BUF_PUSH (wordbound);
3547 break;
e318085a 3548
25fe55af 3549 case 'B':
4bb91c68
SM
3550 if (syntax & RE_NO_GNU_OPS)
3551 goto normal_char;
25fe55af
RS
3552 BUF_PUSH (notwordbound);
3553 break;
fa9a63c5 3554
25fe55af 3555 case '`':
4bb91c68
SM
3556 if (syntax & RE_NO_GNU_OPS)
3557 goto normal_char;
25fe55af
RS
3558 BUF_PUSH (begbuf);
3559 break;
e318085a 3560
25fe55af 3561 case '\'':
4bb91c68
SM
3562 if (syntax & RE_NO_GNU_OPS)
3563 goto normal_char;
25fe55af
RS
3564 BUF_PUSH (endbuf);
3565 break;
e318085a 3566
25fe55af
RS
3567 case '1': case '2': case '3': case '4': case '5':
3568 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3569 {
3570 regnum_t reg;
e318085a 3571
0cdd06f8
SM
3572 if (syntax & RE_NO_BK_REFS)
3573 goto normal_backslash;
e318085a 3574
0cdd06f8 3575 reg = c - '0';
e318085a 3576
c69b0314
SM
3577 if (reg > bufp->re_nsub || reg < 1
3578 /* Can't back reference to a subexp before its end. */
3579 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3580 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3581
0cdd06f8
SM
3582 laststart = b;
3583 BUF_PUSH_2 (duplicate, reg);
3584 }
25fe55af 3585 break;
e318085a 3586
e318085a 3587
25fe55af
RS
3588 case '+':
3589 case '?':
3590 if (syntax & RE_BK_PLUS_QM)
3591 goto handle_plus;
3592 else
3593 goto normal_backslash;
3594
3595 default:
3596 normal_backslash:
3597 /* You might think it would be useful for \ to mean
3598 not to translate; but if we don't translate it
4bb91c68 3599 it will never match anything. */
25fe55af
RS
3600 goto normal_char;
3601 }
3602 break;
fa9a63c5
RM
3603
3604
3605 default:
25fe55af 3606 /* Expects the character in `c'. */
fa9a63c5 3607 normal_char:
36595814 3608 /* If no exactn currently being built. */
25fe55af 3609 if (!pending_exact
fa9a63c5 3610
25fe55af
RS
3611 /* If last exactn not at current position. */
3612 || pending_exact + *pending_exact + 1 != b
5e69f11e 3613
25fe55af 3614 /* We have only one byte following the exactn for the count. */
2d1675e4 3615 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3616
7814e705 3617 /* If followed by a repetition operator. */
9d99031f 3618 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3619 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3620 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3621 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3622 || ((syntax & RE_INTERVALS)
25fe55af 3623 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3624 ? p != pend && *p == '{'
3625 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3626 {
3627 /* Start building a new exactn. */
5e69f11e 3628
25fe55af 3629 laststart = b;
fa9a63c5
RM
3630
3631 BUF_PUSH_2 (exactn, 0);
3632 pending_exact = b - 1;
25fe55af 3633 }
5e69f11e 3634
2d1675e4
SM
3635 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3636 {
e0277a47
KH
3637 int len;
3638
cf9c99bc 3639 if (multibyte)
6fdd04b0 3640 {
cf9c99bc 3641 c = TRANSLATE (c);
6fdd04b0
KH
3642 len = CHAR_STRING (c, b);
3643 b += len;
3644 }
e0277a47 3645 else
6fdd04b0 3646 {
cf9c99bc
KH
3647 c1 = RE_CHAR_TO_MULTIBYTE (c);
3648 if (! CHAR_BYTE8_P (c1))
3649 {
3650 re_wchar_t c2 = TRANSLATE (c1);
3651
3652 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3653 c = c1;
409f2919 3654 }
6fdd04b0
KH
3655 *b++ = c;
3656 len = 1;
3657 }
2d1675e4
SM
3658 (*pending_exact) += len;
3659 }
3660
fa9a63c5 3661 break;
25fe55af 3662 } /* switch (c) */
fa9a63c5
RM
3663 } /* while p != pend */
3664
5e69f11e 3665
fa9a63c5 3666 /* Through the pattern now. */
5e69f11e 3667
505bde11 3668 FIXUP_ALT_JUMP ();
fa9a63c5 3669
5e69f11e 3670 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3671 FREE_STACK_RETURN (REG_EPAREN);
3672
3673 /* If we don't want backtracking, force success
3674 the first time we reach the end of the compiled pattern. */
3675 if (syntax & RE_NO_POSIX_BACKTRACKING)
3676 BUF_PUSH (succeed);
3677
fa9a63c5
RM
3678 /* We have succeeded; set the length of the buffer. */
3679 bufp->used = b - bufp->buffer;
3680
3681#ifdef DEBUG
99633e97 3682 if (debug > 0)
fa9a63c5 3683 {
505bde11 3684 re_compile_fastmap (bufp);
dc4a2ee0 3685 DEBUG_PRINT ("\nCompiled pattern: \n");
fa9a63c5
RM
3686 print_compiled_pattern (bufp);
3687 }
99633e97 3688 debug--;
fa9a63c5
RM
3689#endif /* DEBUG */
3690
3691#ifndef MATCH_MAY_ALLOCATE
3692 /* Initialize the failure stack to the largest possible stack. This
3693 isn't necessary unless we're trying to avoid calling alloca in
3694 the search and match routines. */
3695 {
3696 int num_regs = bufp->re_nsub + 1;
3697
320a2a73 3698 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3699 {
a26f4ccd 3700 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
38182d90
PE
3701 fail_stack.stack = realloc (fail_stack.stack,
3702 fail_stack.size * sizeof *fail_stack.stack);
fa9a63c5
RM
3703 }
3704
3705 regex_grow_registers (num_regs);
3706 }
3707#endif /* not MATCH_MAY_ALLOCATE */
3708
839966f3 3709 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3710} /* regex_compile */
3711\f
3712/* Subroutines for `regex_compile'. */
3713
7814e705 3714/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3715
3716static void
971de7fb 3717store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3718{
3719 *loc = (unsigned char) op;
3720 STORE_NUMBER (loc + 1, arg);
3721}
3722
3723
3724/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3725
3726static void
971de7fb 3727store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3728{
3729 *loc = (unsigned char) op;
3730 STORE_NUMBER (loc + 1, arg1);
3731 STORE_NUMBER (loc + 3, arg2);
3732}
3733
3734
3735/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3736 for OP followed by two-byte integer parameter ARG. */
3737
3738static void
971de7fb 3739insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3740{
3741 register unsigned char *pfrom = end;
3742 register unsigned char *pto = end + 3;
3743
3744 while (pfrom != loc)
3745 *--pto = *--pfrom;
5e69f11e 3746
fa9a63c5
RM
3747 store_op1 (op, loc, arg);
3748}
3749
3750
3751/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3752
3753static void
971de7fb 3754insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3755{
3756 register unsigned char *pfrom = end;
3757 register unsigned char *pto = end + 5;
3758
3759 while (pfrom != loc)
3760 *--pto = *--pfrom;
5e69f11e 3761
fa9a63c5
RM
3762 store_op2 (op, loc, arg1, arg2);
3763}
3764
3765
3766/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3767 after an alternative or a begin-subexpression. We assume there is at
3768 least one character before the ^. */
3769
3770static boolean
29abe551 3771at_begline_loc_p (const_re_char *pattern, const_re_char *p, reg_syntax_t syntax)
fa9a63c5 3772{
01618498 3773 re_char *prev = p - 2;
95988fcf 3774 boolean odd_backslashes;
5e69f11e 3775
95988fcf
AS
3776 /* After a subexpression? */
3777 if (*prev == '(')
3778 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3779
3780 /* After an alternative? */
3781 else if (*prev == '|')
3782 odd_backslashes = (syntax & RE_NO_BK_VBAR) == 0;
3783
3784 /* After a shy subexpression? */
3785 else if (*prev == ':' && (syntax & RE_SHY_GROUPS))
3786 {
3787 /* Skip over optional regnum. */
3788 while (prev - 1 >= pattern && prev[-1] >= '0' && prev[-1] <= '9')
3789 --prev;
3790
3791 if (!(prev - 2 >= pattern
3792 && prev[-1] == '?' && prev[-2] == '('))
3793 return false;
3794 prev -= 2;
3795 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3796 }
3797 else
3798 return false;
3799
3800 /* Count the number of preceding backslashes. */
3801 p = prev;
3802 while (prev - 1 >= pattern && prev[-1] == '\\')
3803 --prev;
3804 return (p - prev) & odd_backslashes;
fa9a63c5
RM
3805}
3806
3807
3808/* The dual of at_begline_loc_p. This one is for $. We assume there is
3809 at least one character after the $, i.e., `P < PEND'. */
3810
3811static boolean
29abe551 3812at_endline_loc_p (const_re_char *p, const_re_char *pend, reg_syntax_t syntax)
fa9a63c5 3813{
01618498 3814 re_char *next = p;
fa9a63c5 3815 boolean next_backslash = *next == '\\';
01618498 3816 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3817
fa9a63c5
RM
3818 return
3819 /* Before a subexpression? */
3820 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3821 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3822 /* Before an alternative? */
3823 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3824 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3825}
3826
3827
5e69f11e 3828/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3829 false if it's not. */
3830
3831static boolean
971de7fb 3832group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5 3833{
d1dfb56c 3834 ssize_t this_element;
fa9a63c5 3835
5e69f11e
RM
3836 for (this_element = compile_stack.avail - 1;
3837 this_element >= 0;
fa9a63c5
RM
3838 this_element--)
3839 if (compile_stack.stack[this_element].regnum == regnum)
3840 return true;
3841
3842 return false;
3843}
fa9a63c5 3844\f
f6a3f532
SM
3845/* analyse_first.
3846 If fastmap is non-NULL, go through the pattern and fill fastmap
3847 with all the possible leading chars. If fastmap is NULL, don't
3848 bother filling it up (obviously) and only return whether the
3849 pattern could potentially match the empty string.
3850
3851 Return 1 if p..pend might match the empty string.
3852 Return 0 if p..pend matches at least one char.
01618498 3853 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3854
3855static int
29abe551
PE
3856analyse_first (const_re_char *p, const_re_char *pend, char *fastmap,
3857 const int multibyte)
fa9a63c5 3858{
505bde11 3859 int j, k;
1fb352e0 3860 boolean not;
fa9a63c5 3861
b18215fc 3862 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3863 flag is set true. */
b18215fc
RS
3864 boolean match_any_multibyte_characters = false;
3865
f6a3f532 3866 assert (p);
5e69f11e 3867
505bde11
SM
3868 /* The loop below works as follows:
3869 - It has a working-list kept in the PATTERN_STACK and which basically
3870 starts by only containing a pointer to the first operation.
3871 - If the opcode we're looking at is a match against some set of
3872 chars, then we add those chars to the fastmap and go on to the
3873 next work element from the worklist (done via `break').
3874 - If the opcode is a control operator on the other hand, we either
3875 ignore it (if it's meaningless at this point, such as `start_memory')
3876 or execute it (if it's a jump). If the jump has several destinations
3877 (i.e. `on_failure_jump'), then we push the other destination onto the
3878 worklist.
3879 We guarantee termination by ignoring backward jumps (more or less),
3880 so that `p' is monotonically increasing. More to the point, we
3881 never set `p' (or push) anything `<= p1'. */
3882
01618498 3883 while (p < pend)
fa9a63c5 3884 {
505bde11
SM
3885 /* `p1' is used as a marker of how far back a `on_failure_jump'
3886 can go without being ignored. It is normally equal to `p'
3887 (which prevents any backward `on_failure_jump') except right
3888 after a plain `jump', to allow patterns such as:
3889 0: jump 10
3890 3..9: <body>
3891 10: on_failure_jump 3
3892 as used for the *? operator. */
01618498 3893 re_char *p1 = p;
5e69f11e 3894
7393bcbb 3895 switch (*p++)
fa9a63c5 3896 {
f6a3f532 3897 case succeed:
01618498 3898 return 1;
fa9a63c5 3899
fa9a63c5 3900 case duplicate:
505bde11
SM
3901 /* If the first character has to match a backreference, that means
3902 that the group was empty (since it already matched). Since this
3903 is the only case that interests us here, we can assume that the
3904 backreference must match the empty string. */
3905 p++;
3906 continue;
fa9a63c5
RM
3907
3908
3909 /* Following are the cases which match a character. These end
7814e705 3910 with `break'. */
fa9a63c5
RM
3911
3912 case exactn:
e0277a47 3913 if (fastmap)
cf9c99bc
KH
3914 {
3915 /* If multibyte is nonzero, the first byte of each
3916 character is an ASCII or a leading code. Otherwise,
3917 each byte is a character. Thus, this works in both
3918 cases. */
3919 fastmap[p[1]] = 1;
3920 if (! multibyte)
3921 {
3922 /* For the case of matching this unibyte regex
3923 against multibyte, we must set a leading code of
3924 the corresponding multibyte character. */
3925 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3926
86e893e3 3927 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
3928 }
3929 }
fa9a63c5
RM
3930 break;
3931
3932
1fb352e0
SM
3933 case anychar:
3934 /* We could put all the chars except for \n (and maybe \0)
3935 but we don't bother since it is generally not worth it. */
f6a3f532 3936 if (!fastmap) break;
01618498 3937 return -1;
fa9a63c5
RM
3938
3939
b18215fc 3940 case charset_not:
1fb352e0 3941 if (!fastmap) break;
bf216479
KH
3942 {
3943 /* Chars beyond end of bitmap are possible matches. */
bf216479 3944 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 3945 j < (1 << BYTEWIDTH); j++)
bf216479
KH
3946 fastmap[j] = 1;
3947 }
3948
1fb352e0
SM
3949 /* Fallthrough */
3950 case charset:
3951 if (!fastmap) break;
3952 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3953 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3954 j >= 0; j--)
1fb352e0 3955 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 3956 fastmap[j] = 1;
b18215fc 3957
6482db2e
KH
3958#ifdef emacs
3959 if (/* Any leading code can possibly start a character
1fb352e0 3960 which doesn't match the specified set of characters. */
6482db2e 3961 not
409f2919 3962 ||
6482db2e
KH
3963 /* If we can match a character class, we can match any
3964 multibyte characters. */
3965 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3966 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3967
b18215fc 3968 {
b18215fc
RS
3969 if (match_any_multibyte_characters == false)
3970 {
6482db2e
KH
3971 for (j = MIN_MULTIBYTE_LEADING_CODE;
3972 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 3973 fastmap[j] = 1;
b18215fc
RS
3974 match_any_multibyte_characters = true;
3975 }
3976 }
b18215fc 3977
1fb352e0
SM
3978 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3979 && match_any_multibyte_characters == false)
3980 {
bf216479 3981 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 3982 multibyte character in the range table. */
1fb352e0 3983 int c, count;
bf216479 3984 unsigned char lc1, lc2;
b18215fc 3985
1fb352e0 3986 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3987 bits for a character class. */
1fb352e0 3988 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3989
1fb352e0
SM
3990 /* Extract the number of ranges in range table into COUNT. */
3991 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 3992 for (; count > 0; count--, p += 3)
1fb352e0 3993 {
9117d724
KH
3994 /* Extract the start and end of each range. */
3995 EXTRACT_CHARACTER (c, p);
bf216479 3996 lc1 = CHAR_LEADING_CODE (c);
9117d724 3997 p += 3;
1fb352e0 3998 EXTRACT_CHARACTER (c, p);
bf216479
KH
3999 lc2 = CHAR_LEADING_CODE (c);
4000 for (j = lc1; j <= lc2; j++)
9117d724 4001 fastmap[j] = 1;
1fb352e0
SM
4002 }
4003 }
6482db2e 4004#endif
b18215fc
RS
4005 break;
4006
1fb352e0
SM
4007 case syntaxspec:
4008 case notsyntaxspec:
4009 if (!fastmap) break;
4010#ifndef emacs
4011 not = (re_opcode_t)p[-1] == notsyntaxspec;
4012 k = *p++;
4013 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4014 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4015 fastmap[j] = 1;
b18215fc 4016 break;
1fb352e0 4017#else /* emacs */
b18215fc
RS
4018 /* This match depends on text properties. These end with
4019 aborting optimizations. */
01618498 4020 return -1;
b18215fc
RS
4021
4022 case categoryspec:
b18215fc 4023 case notcategoryspec:
1fb352e0
SM
4024 if (!fastmap) break;
4025 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4026 k = *p++;
6482db2e 4027 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4028 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4029 fastmap[j] = 1;
4030
6482db2e
KH
4031 /* Any leading code can possibly start a character which
4032 has or doesn't has the specified category. */
4033 if (match_any_multibyte_characters == false)
6fdd04b0 4034 {
6482db2e
KH
4035 for (j = MIN_MULTIBYTE_LEADING_CODE;
4036 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4037 fastmap[j] = 1;
4038 match_any_multibyte_characters = true;
6fdd04b0 4039 }
b18215fc
RS
4040 break;
4041
fa9a63c5 4042 /* All cases after this match the empty string. These end with
25fe55af 4043 `continue'. */
fa9a63c5 4044
fa9a63c5
RM
4045 case before_dot:
4046 case at_dot:
4047 case after_dot:
1fb352e0 4048#endif /* !emacs */
25fe55af
RS
4049 case no_op:
4050 case begline:
4051 case endline:
fa9a63c5
RM
4052 case begbuf:
4053 case endbuf:
4054 case wordbound:
4055 case notwordbound:
4056 case wordbeg:
4057 case wordend:
669fa600
SM
4058 case symbeg:
4059 case symend:
25fe55af 4060 continue;
fa9a63c5
RM
4061
4062
fa9a63c5 4063 case jump:
25fe55af 4064 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4065 if (j < 0)
4066 /* Backward jumps can only go back to code that we've already
4067 visited. `re_compile' should make sure this is true. */
4068 break;
25fe55af 4069 p += j;
7393bcbb 4070 switch (*p)
505bde11
SM
4071 {
4072 case on_failure_jump:
4073 case on_failure_keep_string_jump:
505bde11 4074 case on_failure_jump_loop:
0683b6fa 4075 case on_failure_jump_nastyloop:
505bde11
SM
4076 case on_failure_jump_smart:
4077 p++;
4078 break;
4079 default:
4080 continue;
4081 };
4082 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4083 to jump back to "just after here". */
4084 /* Fallthrough */
fa9a63c5 4085
25fe55af
RS
4086 case on_failure_jump:
4087 case on_failure_keep_string_jump:
0683b6fa 4088 case on_failure_jump_nastyloop:
505bde11
SM
4089 case on_failure_jump_loop:
4090 case on_failure_jump_smart:
25fe55af 4091 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4092 if (p + j <= p1)
ed0767d8 4093 ; /* Backward jump to be ignored. */
01618498
SM
4094 else
4095 { /* We have to look down both arms.
4096 We first go down the "straight" path so as to minimize
4097 stack usage when going through alternatives. */
4098 int r = analyse_first (p, pend, fastmap, multibyte);
4099 if (r) return r;
4100 p += j;
4101 }
25fe55af 4102 continue;
fa9a63c5
RM
4103
4104
ed0767d8
SM
4105 case jump_n:
4106 /* This code simply does not properly handle forward jump_n. */
4107 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4108 p += 4;
4109 /* jump_n can either jump or fall through. The (backward) jump
4110 case has already been handled, so we only need to look at the
4111 fallthrough case. */
4112 continue;
177c0ea7 4113
fa9a63c5 4114 case succeed_n:
ed0767d8
SM
4115 /* If N == 0, it should be an on_failure_jump_loop instead. */
4116 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4117 p += 4;
4118 /* We only care about one iteration of the loop, so we don't
4119 need to consider the case where this behaves like an
4120 on_failure_jump. */
25fe55af 4121 continue;
fa9a63c5
RM
4122
4123
4124 case set_number_at:
25fe55af
RS
4125 p += 4;
4126 continue;
fa9a63c5
RM
4127
4128
4129 case start_memory:
25fe55af 4130 case stop_memory:
505bde11 4131 p += 1;
fa9a63c5
RM
4132 continue;
4133
4134
4135 default:
25fe55af
RS
4136 abort (); /* We have listed all the cases. */
4137 } /* switch *p++ */
fa9a63c5
RM
4138
4139 /* Getting here means we have found the possible starting
25fe55af 4140 characters for one path of the pattern -- and that the empty
7814e705 4141 string does not match. We need not follow this path further. */
01618498 4142 return 0;
fa9a63c5
RM
4143 } /* while p */
4144
01618498
SM
4145 /* We reached the end without matching anything. */
4146 return 1;
4147
f6a3f532
SM
4148} /* analyse_first */
4149\f
4150/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4151 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4152 characters can start a string that matches the pattern. This fastmap
4153 is used by re_search to skip quickly over impossible starting points.
4154
4155 Character codes above (1 << BYTEWIDTH) are not represented in the
4156 fastmap, but the leading codes are represented. Thus, the fastmap
4157 indicates which character sets could start a match.
4158
4159 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4160 area as BUFP->fastmap.
4161
4162 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4163 the pattern buffer.
4164
4165 Returns 0 if we succeed, -2 if an internal error. */
4166
4167int
971de7fb 4168re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4169{
4170 char *fastmap = bufp->fastmap;
4171 int analysis;
4172
4173 assert (fastmap && bufp->buffer);
4174
72af86bd 4175 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4176 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4177
4178 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4179 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4180 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4181 return 0;
4182} /* re_compile_fastmap */
4183\f
4184/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4185 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4186 this memory for recording register information. STARTS and ENDS
4187 must be allocated using the malloc library routine, and must each
4188 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4189
4190 If NUM_REGS == 0, then subsequent matches should allocate their own
4191 register data.
4192
4193 Unless this function is called, the first search or match using
4194 PATTERN_BUFFER will allocate its own register data, without
4195 freeing the old data. */
4196
4197void
971de7fb 4198re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4199{
4200 if (num_regs)
4201 {
4202 bufp->regs_allocated = REGS_REALLOCATE;
4203 regs->num_regs = num_regs;
4204 regs->start = starts;
4205 regs->end = ends;
4206 }
4207 else
4208 {
4209 bufp->regs_allocated = REGS_UNALLOCATED;
4210 regs->num_regs = 0;
7d652d97 4211 regs->start = regs->end = 0;
fa9a63c5
RM
4212 }
4213}
c0f9ea08 4214WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4215\f
7814e705 4216/* Searching routines. */
fa9a63c5
RM
4217
4218/* Like re_search_2, below, but only one string is specified, and
4219 doesn't let you say where to stop matching. */
4220
d1dfb56c
EZ
4221regoff_t
4222re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4223 ssize_t startpos, ssize_t range, struct re_registers *regs)
fa9a63c5 4224{
5e69f11e 4225 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4226 regs, size);
4227}
c0f9ea08 4228WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4229
70806df6
KH
/* Start of the string that holds position P of the virtual
   concatenation: string2 when P >= size1, string1 otherwise.  Expands
   to references to `size1', `string1' and `string2' at the point of
   use.  */
#define HEAD_ADDR_VSTRING(P) \
  ((P) >= size1 ? string2 : string1)
4233
b18215fc
RS
/* Address of position POS in the virtual concatenation of string1 and
   string2.  Positions at or beyond size1 are taken from string2.
   Expands to references to `size1', `string1' and `string2' at the
   point of use.  */
#define POS_ADDR_VSTRING(POS) \
  (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4237
4238/* Using the compiled pattern in BUFP->buffer, first tries to match the
4239 virtual concatenation of STRING1 and STRING2, starting first at index
4240 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4241
fa9a63c5 4242 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4243
fa9a63c5
RM
4244 RANGE is how far to scan while trying to match. RANGE = 0 means try
4245 only at STARTPOS; in general, the last start tried is STARTPOS +
4246 RANGE.
5e69f11e 4247
fa9a63c5
RM
4248 In REGS, return the indices of the virtual concatenation of STRING1
4249 and STRING2 that matched the entire BUFP->buffer and its contained
4250 subexpressions.
5e69f11e 4251
fa9a63c5
RM
4252 Do not consider matching one past the index STOP in the virtual
4253 concatenation of STRING1 and STRING2.
4254
4255 We return either the position in the strings at which the match was
4256 found, -1 if no match, or -2 if error (such as failure
4257 stack overflow). */
4258
d1dfb56c
EZ
4259regoff_t
4260re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4261 const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4262 struct re_registers *regs, ssize_t stop)
fa9a63c5 4263{
d1dfb56c 4264 regoff_t val;
66f0296e
SM
4265 re_char *string1 = (re_char*) str1;
4266 re_char *string2 = (re_char*) str2;
fa9a63c5 4267 register char *fastmap = bufp->fastmap;
6676cb1c 4268 register RE_TRANSLATE_TYPE translate = bufp->translate;
d1dfb56c
EZ
4269 size_t total_size = size1 + size2;
4270 ssize_t endpos = startpos + range;
c0f9ea08 4271 boolean anchored_start;
cf9c99bc
KH
4272 /* Nonzero if we are searching multibyte string. */
4273 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4274
fa9a63c5
RM
4275 /* Check for out-of-range STARTPOS. */
4276 if (startpos < 0 || startpos > total_size)
4277 return -1;
5e69f11e 4278
fa9a63c5 4279 /* Fix up RANGE if it might eventually take us outside
34597fa9 4280 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4281 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4282 if (endpos < 0)
4283 range = 0 - startpos;
fa9a63c5
RM
4284 else if (endpos > total_size)
4285 range = total_size - startpos;
4286
4287 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4288 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4289 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4290 {
4291 if (startpos > 0)
4292 return -1;
4293 else
7b140fd7 4294 range = 0;
fa9a63c5
RM
4295 }
4296
ae4788a8
RS
4297#ifdef emacs
4298 /* In a forward search for something that starts with \=.
4299 don't keep searching past point. */
4300 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4301 {
7b140fd7
RS
4302 range = PT_BYTE - BEGV_BYTE - startpos;
4303 if (range < 0)
ae4788a8
RS
4304 return -1;
4305 }
4306#endif /* emacs */
4307
fa9a63c5
RM
4308 /* Update the fastmap now if not correct already. */
4309 if (fastmap && !bufp->fastmap_accurate)
01618498 4310 re_compile_fastmap (bufp);
5e69f11e 4311
c8499ba5 4312 /* See whether the pattern is anchored. */
c0f9ea08 4313 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4314
b18215fc 4315#ifdef emacs
d48cd3f4 4316 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4317 {
d1dfb56c 4318 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4319
4320 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4321 }
b18215fc
RS
4322#endif
4323
fa9a63c5
RM
4324 /* Loop through the string, looking for a place to start matching. */
4325 for (;;)
5e69f11e 4326 {
c8499ba5
RS
4327 /* If the pattern is anchored,
4328 skip quickly past places we cannot match.
4329 We don't bother to treat startpos == 0 specially
4330 because that case doesn't repeat. */
4331 if (anchored_start && startpos > 0)
4332 {
c0f9ea08
SM
4333 if (! ((startpos <= size1 ? string1[startpos - 1]
4334 : string2[startpos - size1 - 1])
4335 == '\n'))
c8499ba5
RS
4336 goto advance;
4337 }
4338
fa9a63c5 4339 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4340 cannot be the start of a match. If the pattern can match the
4341 null string, however, we don't need to skip characters; we want
7814e705 4342 the first null string. */
fa9a63c5
RM
4343 if (fastmap && startpos < total_size && !bufp->can_be_null)
4344 {
66f0296e 4345 register re_char *d;
01618498 4346 register re_wchar_t buf_ch;
e934739e
RS
4347
4348 d = POS_ADDR_VSTRING (startpos);
4349
7814e705 4350 if (range > 0) /* Searching forwards. */
fa9a63c5 4351 {
fa9a63c5 4352 register int lim = 0;
d1dfb56c 4353 ssize_t irange = range;
fa9a63c5 4354
25fe55af
RS
4355 if (startpos < size1 && startpos + range >= size1)
4356 lim = range - (size1 - startpos);
fa9a63c5 4357
25fe55af
RS
4358 /* Written out as an if-else to avoid testing `translate'
4359 inside the loop. */
28ae27ae
AS
4360 if (RE_TRANSLATE_P (translate))
4361 {
e934739e
RS
4362 if (multibyte)
4363 while (range > lim)
4364 {
4365 int buf_charlen;
4366
62a6e103 4367 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4368 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4369 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4370 break;
4371
4372 range -= buf_charlen;
4373 d += buf_charlen;
4374 }
4375 else
bf216479 4376 while (range > lim)
33c46939 4377 {
cf9c99bc
KH
4378 register re_wchar_t ch, translated;
4379
bf216479 4380 buf_ch = *d;
cf9c99bc
KH
4381 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4382 translated = RE_TRANSLATE (translate, ch);
4383 if (translated != ch
4384 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4385 buf_ch = ch;
6fdd04b0 4386 if (fastmap[buf_ch])
bf216479 4387 break;
33c46939
RS
4388 d++;
4389 range--;
4390 }
e934739e 4391 }
fa9a63c5 4392 else
6fdd04b0
KH
4393 {
4394 if (multibyte)
4395 while (range > lim)
4396 {
4397 int buf_charlen;
fa9a63c5 4398
62a6e103 4399 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4400 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4401 break;
4402 range -= buf_charlen;
4403 d += buf_charlen;
4404 }
e934739e 4405 else
6fdd04b0 4406 while (range > lim && !fastmap[*d])
33c46939
RS
4407 {
4408 d++;
4409 range--;
4410 }
e934739e 4411 }
fa9a63c5
RM
4412 startpos += irange - range;
4413 }
7814e705 4414 else /* Searching backwards. */
fa9a63c5 4415 {
ba5e343c
KH
4416 if (multibyte)
4417 {
62a6e103 4418 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4419 buf_ch = TRANSLATE (buf_ch);
4420 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4421 goto advance;
4422 }
4423 else
4424 {
cf9c99bc
KH
4425 register re_wchar_t ch, translated;
4426
4427 buf_ch = *d;
4428 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4429 translated = TRANSLATE (ch);
4430 if (translated != ch
4431 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4432 buf_ch = ch;
4433 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4434 goto advance;
4435 }
fa9a63c5
RM
4436 }
4437 }
4438
4439 /* If can't match the null string, and that's all we have left, fail. */
4440 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4441 && !bufp->can_be_null)
fa9a63c5
RM
4442 return -1;
4443
4444 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4445 startpos, regs, stop);
fa9a63c5
RM
4446
4447 if (val >= 0)
4448 return startpos;
5e69f11e 4449
fa9a63c5
RM
4450 if (val == -2)
4451 return -2;
4452
4453 advance:
5e69f11e 4454 if (!range)
25fe55af 4455 break;
5e69f11e 4456 else if (range > 0)
25fe55af 4457 {
b18215fc
RS
4458 /* Update STARTPOS to the next character boundary. */
4459 if (multibyte)
4460 {
66f0296e 4461 re_char *p = POS_ADDR_VSTRING (startpos);
aa3830c4 4462 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4463
4464 range -= len;
4465 if (range < 0)
4466 break;
4467 startpos += len;
4468 }
4469 else
4470 {
b560c397
RS
4471 range--;
4472 startpos++;
4473 }
e318085a 4474 }
fa9a63c5 4475 else
25fe55af
RS
4476 {
4477 range++;
4478 startpos--;
b18215fc
RS
4479
4480 /* Update STARTPOS to the previous character boundary. */
4481 if (multibyte)
4482 {
70806df6
KH
4483 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4484 re_char *p0 = p;
4485 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4486
4487 /* Find the head of multibyte form. */
70806df6
KH
4488 PREV_CHAR_BOUNDARY (p, phead);
4489 range += p0 - 1 - p;
4490 if (range > 0)
4491 break;
b18215fc 4492
70806df6 4493 startpos -= p0 - 1 - p;
b18215fc 4494 }
25fe55af 4495 }
fa9a63c5
RM
4496 }
4497 return -1;
4498} /* re_search_2 */
c0f9ea08 4499WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4500\f
4501/* Declarations and macros for re_match_2. */
4502
261cb4bb
PE
4503static int bcmp_translate (re_char *s1, re_char *s2,
4504 register ssize_t len,
4505 RE_TRANSLATE_TYPE translate,
4506 const int multibyte);
fa9a63c5
RM
4507
/* Convert PTR, a pointer into one of the search strings `string1' and
   `string2', into an offset: pointers for which FIRST_STRING_P holds
   are measured from string1, all others from string2 with size1 added.
   Expands to references to `string1', `string2' and `size1' at the
   point of use.  */
#define POINTER_TO_OFFSET(ptr)                  \
  (FIRST_STRING_P (ptr)                         \
   ? (ptr) - string1                            \
   : (ptr) - string2 + (ptrdiff_t) size1)
fa9a63c5 4514
/* Call before fetching a character with *d.  When d has reached dend,
   this either jumps to `fail' (dend is already end_match_2) or switches
   d over to string2 with dend = end_match_2.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH()                                                      \
  while (d == dend)                                                     \
    {                                                                   \
      /* End of string2 => fail.  */                                    \
      if (dend == end_match_2)                                          \
        goto fail;                                                      \
      /* End of string1 => advance to string2.  */                      \
      d = string2;                                                      \
      dend = end_match_2;                                               \
    }
4529
f1ad044f
SM
/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc..
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e past `stop').  Unlike PREFETCH, this never
   jumps to `fail': it only moves d to string2 when d is at end1.  */
#define PREFETCH_NOLIMIT()                                              \
  if (d == end1)                                                        \
    {                                                                   \
      d = string2;                                                      \
      dend = end_match_2;                                               \
    }

/* Test whether D is at the very beginning or at the very end of the
   virtual concatenation of `string1' and `string2'.  If only one
   string, it's `string2'.  AT_STRINGS_BEG is also true whenever size2
   is zero.  */
#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5 4545
9121ca40 4546/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4547
4548/* The comment at case wordbound is following one, but we don't use
4549 AT_WORD_BOUNDARY anymore to support multibyte form.
4550
4551 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4552 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4553 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4554 macro and introducing temporary variables works around the bug. */
4555
9121ca40 4556#if 0
b313f9d8
PE
4557/* Test if D points to a character which is word-constituent. We have
4558 two special cases to check for: if past the end of string1, look at
4559 the first character in string2; and if before the beginning of
4560 string2, look at the last character in string1. */
4561#define WORDCHAR_P(d) \
4562 (SYNTAX ((d) == end1 ? *string2 \
4563 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4564 == Sword)
4565
fa9a63c5
RM
4566/* Test if the character before D and the one at D differ with respect
4567 to being word-constituent. */
4568#define AT_WORD_BOUNDARY(d) \
4569 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4570 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4571#endif
fa9a63c5
RM
4572
/* Release everything the matcher allocated.  With MATCH_MAY_ALLOCATE,
   FREE_VAR frees VAR via REGEX_FREE if it is non-null and resets it to
   NULL, and FREE_VARIABLES frees the failure stack and the register
   arrays.  Without it, FREE_VARIABLES is a no-op expression.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)                                                  \
  do {                                                                  \
    if (var)                                                            \
      {                                                                 \
        REGEX_FREE (var);                                               \
        var = NULL;                                                     \
      }                                                                 \
  } while (0)
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
#endif /* not MATCH_MAY_ALLOCATE */
4594
505bde11
SM
4595\f
4596/* Optimization routines. */
4597
4e8a9132
SM
4598/* If the operation is a match against one or more chars,
4599 return a pointer to the next operation, else return NULL. */
01618498 4600static re_char *
29abe551 4601skip_one_char (const_re_char *p)
4e8a9132 4602{
7393bcbb 4603 switch (*p++)
4e8a9132
SM
4604 {
4605 case anychar:
4606 break;
177c0ea7 4607
4e8a9132
SM
4608 case exactn:
4609 p += *p + 1;
4610 break;
4611
4612 case charset_not:
4613 case charset:
4614 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4615 {
4616 int mcnt;
4617 p = CHARSET_RANGE_TABLE (p - 1);
4618 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4619 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4620 }
4621 else
4622 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4623 break;
177c0ea7 4624
4e8a9132
SM
4625 case syntaxspec:
4626 case notsyntaxspec:
1fb352e0 4627#ifdef emacs
4e8a9132
SM
4628 case categoryspec:
4629 case notcategoryspec:
4630#endif /* emacs */
4631 p++;
4632 break;
4633
4634 default:
4635 p = NULL;
4636 }
4637 return p;
4638}
4639
4640
505bde11 4641/* Jump over non-matching operations. */
839966f3 4642static re_char *
29abe551 4643skip_noops (const_re_char *p, const_re_char *pend)
505bde11
SM
4644{
4645 int mcnt;
4646 while (p < pend)
4647 {
7393bcbb 4648 switch (*p)
505bde11
SM
4649 {
4650 case start_memory:
505bde11
SM
4651 case stop_memory:
4652 p += 2; break;
4653 case no_op:
4654 p += 1; break;
4655 case jump:
4656 p += 1;
4657 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4658 p += mcnt;
4659 break;
4660 default:
4661 return p;
4662 }
4663 }
4664 assert (p == pend);
4665 return p;
4666}
4667
4668/* Non-zero if "p1 matches something" implies "p2 fails". */
4669static int
29abe551
PE
4670mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
4671 const_re_char *p2)
505bde11 4672{
4e8a9132 4673 re_opcode_t op2;
2d1675e4 4674 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4675 unsigned char *pend = bufp->buffer + bufp->used;
4676
4e8a9132 4677 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4678 && p2 >= bufp->buffer && p2 <= pend);
4679
4680 /* Skip over open/close-group commands.
4681 If what follows this loop is a ...+ construct,
4682 look at what begins its body, since we will have to
4683 match at least one of that. */
4e8a9132
SM
4684 p2 = skip_noops (p2, pend);
4685 /* The same skip can be done for p1, except that this function
4686 is only used in the case where p1 is a simple match operator. */
4687 /* p1 = skip_noops (p1, pend); */
4688
4689 assert (p1 >= bufp->buffer && p1 < pend
4690 && p2 >= bufp->buffer && p2 <= pend);
4691
4692 op2 = p2 == pend ? succeed : *p2;
4693
7393bcbb 4694 switch (op2)
505bde11 4695 {
4e8a9132
SM
4696 case succeed:
4697 case endbuf:
4698 /* If we're at the end of the pattern, we can change. */
4699 if (skip_one_char (p1))
505bde11 4700 {
dc4a2ee0 4701 DEBUG_PRINT (" End of pattern: fast loop.\n");
505bde11 4702 return 1;
505bde11 4703 }
4e8a9132 4704 break;
177c0ea7 4705
4e8a9132 4706 case endline:
4e8a9132
SM
4707 case exactn:
4708 {
01618498 4709 register re_wchar_t c
4e8a9132 4710 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4711 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4712
4e8a9132
SM
4713 if ((re_opcode_t) *p1 == exactn)
4714 {
62a6e103 4715 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132 4716 {
dc4a2ee0 4717 DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4e8a9132
SM
4718 return 1;
4719 }
4720 }
505bde11 4721
4e8a9132
SM
4722 else if ((re_opcode_t) *p1 == charset
4723 || (re_opcode_t) *p1 == charset_not)
4724 {
4725 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4726
4e8a9132
SM
4727 /* Test if C is listed in charset (or charset_not)
4728 at `p1'. */
6fdd04b0 4729 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4730 {
4731 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4732 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4733 not = !not;
4734 }
4735 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4736 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4737
4e8a9132
SM
4738 /* `not' is equal to 1 if c would match, which means
4739 that we can't change to pop_failure_jump. */
4740 if (!not)
4741 {
dc4a2ee0 4742 DEBUG_PRINT (" No match => fast loop.\n");
4e8a9132
SM
4743 return 1;
4744 }
4745 }
4746 else if ((re_opcode_t) *p1 == anychar
4747 && c == '\n')
4748 {
dc4a2ee0 4749 DEBUG_PRINT (" . != \\n => fast loop.\n");
4e8a9132
SM
4750 return 1;
4751 }
4752 }
4753 break;
505bde11 4754
4e8a9132 4755 case charset:
4e8a9132
SM
4756 {
4757 if ((re_opcode_t) *p1 == exactn)
4758 /* Reuse the code above. */
4759 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4760
505bde11
SM
4761 /* It is hard to list up all the character in charset
4762 P2 if it includes multibyte character. Give up in
4763 such case. */
4764 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4765 {
4766 /* Now, we are sure that P2 has no range table.
4767 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4768 enough. But P1 may have range table, so the
505bde11
SM
4769 size of bitmap table of P1 is extracted by
4770 using macro `CHARSET_BITMAP_SIZE'.
4771
6fdd04b0
KH
4772 In a multibyte case, we know that all the character
4773 listed in P2 is ASCII. In a unibyte case, P1 has only a
4774 bitmap table. So, in both cases, it is enough to test
4775 only the bitmap table of P1. */
505bde11 4776
411e4203 4777 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4778 {
4779 int idx;
4780 /* We win if the charset inside the loop
4781 has no overlap with the one after the loop. */
4782 for (idx = 0;
4783 (idx < (int) p2[1]
4784 && idx < CHARSET_BITMAP_SIZE (p1));
4785 idx++)
4786 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4787 break;
4788
4789 if (idx == p2[1]
4790 || idx == CHARSET_BITMAP_SIZE (p1))
4791 {
dc4a2ee0 4792 DEBUG_PRINT (" No match => fast loop.\n");
505bde11
SM
4793 return 1;
4794 }
4795 }
411e4203 4796 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4797 {
4798 int idx;
4799 /* We win if the charset_not inside the loop lists
7814e705 4800 every character listed in the charset after. */
505bde11
SM
4801 for (idx = 0; idx < (int) p2[1]; idx++)
4802 if (! (p2[2 + idx] == 0
4803 || (idx < CHARSET_BITMAP_SIZE (p1)
4804 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4805 break;
4806
d1dfb56c
EZ
4807 if (idx == p2[1])
4808 {
dc4a2ee0 4809 DEBUG_PRINT (" No match => fast loop.\n");
d1dfb56c
EZ
4810 return 1;
4811 }
4e8a9132
SM
4812 }
4813 }
4814 }
609b757a 4815 break;
177c0ea7 4816
411e4203 4817 case charset_not:
7393bcbb 4818 switch (*p1)
411e4203
SM
4819 {
4820 case exactn:
4821 case charset:
4822 /* Reuse the code above. */
4823 return mutually_exclusive_p (bufp, p2, p1);
4824 case charset_not:
4825 /* When we have two charset_not, it's very unlikely that
4826 they don't overlap. The union of the two sets of excluded
4827 chars should cover all possible chars, which, as a matter of
4828 fact, is virtually impossible in multibyte buffers. */
36595814 4829 break;
411e4203
SM
4830 }
4831 break;
4832
4e8a9132 4833 case wordend:
669fa600
SM
4834 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4835 case symend:
4e8a9132 4836 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4837 && (p1[1] == Ssymbol || p1[1] == Sword));
4838 case notsyntaxspec:
4839 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4840
4841 case wordbeg:
669fa600
SM
4842 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4843 case symbeg:
4e8a9132 4844 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4845 && (p1[1] == Ssymbol || p1[1] == Sword));
4846 case syntaxspec:
4847 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4848
4849 case wordbound:
4850 return (((re_opcode_t) *p1 == notsyntaxspec
4851 || (re_opcode_t) *p1 == syntaxspec)
4852 && p1[1] == Sword);
4853
1fb352e0 4854#ifdef emacs
4e8a9132
SM
4855 case categoryspec:
4856 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4857 case notcategoryspec:
4858 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4859#endif /* emacs */
4860
4861 default:
4862 ;
505bde11
SM
4863 }
4864
4865 /* Safe default. */
4866 return 0;
4867}
4868
fa9a63c5
RM
4869\f
4870/* Matching routines. */
4871
25fe55af 4872#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4873/* re_match is like re_match_2 except it takes only a single string. */
4874
d1dfb56c 4875regoff_t
d2762c86 4876re_match (struct re_pattern_buffer *bufp, const char *string,
d1dfb56c 4877 size_t size, ssize_t pos, struct re_registers *regs)
fa9a63c5 4878{
d1dfb56c
EZ
4879 regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4880 size, pos, regs, size);
fa9a63c5
RM
4881 return result;
4882}
c0f9ea08 4883WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4884#endif /* not emacs */
4885
b18215fc
RS
4886#ifdef emacs
4887/* In Emacs, this is the string or buffer in which we are matching.
   It is used for looking up syntax properties: re_match_2 copies it
   into gl_state.object and passes it to SETUP_SYNTAX_TABLE_FOR_OBJECT
   before the match starts.  */
b18215fc
RS
4889Lisp_Object re_match_object;
4890#endif
fa9a63c5
RM
4891
4892/* re_match_2 matches the compiled pattern in BUFP against the
4893 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4894 and SIZE2, respectively). We start matching at POS, and stop
4895 matching at STOP.
5e69f11e 4896
fa9a63c5 4897 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 4898 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4899 documentation for exactly how many groups we fill.
4900
4901 We return -1 if no match, -2 if an internal error (such as the
7814e705 4902 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4903 matched substring. */
4904
d1dfb56c
EZ
4905regoff_t
4906re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4907 size_t size1, const char *string2, size_t size2, ssize_t pos,
4908 struct re_registers *regs, ssize_t stop)
fa9a63c5 4909{
d1dfb56c 4910 regoff_t result;
25fe55af 4911
b18215fc 4912#ifdef emacs
d1dfb56c 4913 ssize_t charpos;
d48cd3f4 4914 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 4915 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4916 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4917#endif
4918
4bb91c68
SM
4919 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4920 (re_char*) string2, size2,
cc9b4df2 4921 pos, regs, stop);
fa9a63c5
RM
4922 return result;
4923}
c0f9ea08 4924WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 4925
bf216479 4926
fa9a63c5 4927/* This is a separate function so that we can force an alloca cleanup
7814e705 4928 afterwards. */
d1dfb56c 4929static regoff_t
29abe551
PE
4930re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
4931 size_t size1, const_re_char *string2, size_t size2,
d1dfb56c 4932 ssize_t pos, struct re_registers *regs, ssize_t stop)
fa9a63c5
RM
4933{
4934 /* General temporaries. */
dc4a2ee0 4935 int mcnt;
01618498 4936 size_t reg;
fa9a63c5
RM
4937
4938 /* Just past the end of the corresponding string. */
66f0296e 4939 re_char *end1, *end2;
fa9a63c5
RM
4940
4941 /* Pointers into string1 and string2, just past the last characters in
7814e705 4942 each to consider matching. */
66f0296e 4943 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4944
4945 /* Where we are in the data, and the end of the current string. */
66f0296e 4946 re_char *d, *dend;
5e69f11e 4947
99633e97
SM
4948 /* Used sometimes to remember where we were before starting matching
4949 an operator so that we can go back in case of failure. This "atomic"
4950 behavior of matching opcodes is indispensable to the correctness
4951 of the on_failure_keep_string_jump optimization. */
4952 re_char *dfail;
4953
fa9a63c5 4954 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4955 re_char *p = bufp->buffer;
4956 re_char *pend = p + bufp->used;
fa9a63c5 4957
25fe55af 4958 /* We use this to map every character in the string. */
6676cb1c 4959 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4960
cf9c99bc 4961 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 4962 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4963
cf9c99bc
KH
4964 /* Nonzero if STRING1/STRING2 are multibyte. */
4965 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4966
fa9a63c5
RM
4967 /* Failure point stack. Each place that can handle a failure further
4968 down the line pushes a failure point on this stack. It consists of
505bde11 4969 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4970 the subexpressions we're currently inside, plus the number of such
4971 registers, and, finally, two char *'s. The first char * is where
4972 to resume scanning the pattern; the second one is where to resume
7814e705
JB
4973 scanning the strings. */
4974#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4975 fail_stack_type fail_stack;
4976#endif
dc4a2ee0 4977#ifdef DEBUG_COMPILES_ARGUMENTS
fa9a63c5
RM
4978 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4979#endif
4980
0b32bf0e 4981#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4982 /* This holds the pointer to the failure stack, when
4983 it is allocated relocatably. */
4984 fail_stack_elt_t *failure_stack_ptr;
99633e97 4985#endif
fa9a63c5
RM
4986
4987 /* We fill all the registers internally, independent of what we
7814e705 4988 return, for use in backreferences. The number here includes
fa9a63c5 4989 an element for register zero. */
4bb91c68 4990 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4991
fa9a63c5
RM
4992 /* Information on the contents of registers. These are pointers into
4993 the input strings; they record just what was matched (on this
4994 attempt) by a subexpression part of the pattern, that is, the
4995 regnum-th regstart pointer points to where in the pattern we began
4996 matching and the regnum-th regend points to right after where we
4997 stopped matching the regnum-th subexpression. (The zeroth register
4998 keeps track of what the whole pattern matches.) */
4999#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5000 re_char **regstart, **regend;
fa9a63c5
RM
5001#endif
5002
fa9a63c5 5003 /* The following record the register info as found in the above
5e69f11e 5004 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5005 This happens as we backtrack through the failure points, which in
5006 turn happens only if we have not yet matched the entire string. */
5007 unsigned best_regs_set = false;
5008#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5009 re_char **best_regstart, **best_regend;
fa9a63c5 5010#endif
5e69f11e 5011
fa9a63c5
RM
5012 /* Logically, this is `best_regend[0]'. But we don't want to have to
5013 allocate space for that if we're not allocating space for anything
7814e705 5014 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5015 any of the other register vectors, and it seems rather a kludge to
5016 treat `best_regend' differently than the rest. So we keep track of
5017 the end of the best match so far in a separate variable. We
5018 initialize this to NULL so that when we backtrack the first time
5019 and need to test it, it's not garbage. */
66f0296e 5020 re_char *match_end = NULL;
fa9a63c5 5021
dc4a2ee0 5022#ifdef DEBUG_COMPILES_ARGUMENTS
fa9a63c5 5023 /* Counts the total number of registers pushed. */
5e69f11e 5024 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5025#endif
5026
dc4a2ee0 5027 DEBUG_PRINT ("\n\nEntering re_match_2.\n");
5e69f11e 5028
fa9a63c5 5029 INIT_FAIL_STACK ();
5e69f11e 5030
fa9a63c5
RM
5031#ifdef MATCH_MAY_ALLOCATE
5032 /* Do not bother to initialize all the register variables if there are
5033 no groups in the pattern, as it takes a fair amount of time. If
5034 there are groups, we include space for register 0 (the whole
5035 pattern), even though we never use it, since it simplifies the
5036 array indexing. We should fix this. */
5037 if (bufp->re_nsub)
5038 {
66f0296e
SM
5039 regstart = REGEX_TALLOC (num_regs, re_char *);
5040 regend = REGEX_TALLOC (num_regs, re_char *);
5041 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5042 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5043
505bde11 5044 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5045 {
5046 FREE_VARIABLES ();
5047 return -2;
5048 }
fa9a63c5
RM
5049 }
5050 else
5051 {
5052 /* We must initialize all our variables to NULL, so that
25fe55af 5053 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5054 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5055 }
5056#endif /* MATCH_MAY_ALLOCATE */
5057
5058 /* The starting position is bogus. */
5059 if (pos < 0 || pos > size1 + size2)
5060 {
5061 FREE_VARIABLES ();
5062 return -1;
5063 }
5e69f11e 5064
fa9a63c5
RM
5065 /* Initialize subexpression text positions to -1 to mark ones that no
5066 start_memory/stop_memory has been seen for. Also initialize the
5067 register information struct. */
01618498
SM
5068 for (reg = 1; reg < num_regs; reg++)
5069 regstart[reg] = regend[reg] = NULL;
99633e97 5070
fa9a63c5 5071 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5072 `string1' is null. */
fa9a63c5
RM
5073 if (size2 == 0 && string1 != NULL)
5074 {
5075 string2 = string1;
5076 size2 = size1;
5077 string1 = 0;
5078 size1 = 0;
5079 }
5080 end1 = string1 + size1;
5081 end2 = string2 + size2;
5082
5e69f11e 5083 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5084 `dend' is the end of the input string that `d' points within. `d'
5085 is advanced into the following input string whenever necessary, but
5086 this happens before fetching; therefore, at the beginning of the
5087 loop, `d' can be pointing at the end of a string, but it cannot
5088 equal `string2'. */
419d1c74 5089 if (pos >= size1)
fa9a63c5 5090 {
419d1c74
SM
5091 /* Only match within string2. */
5092 d = string2 + pos - size1;
5093 dend = end_match_2 = string2 + stop - size1;
5094 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5095 }
5096 else
5097 {
f1ad044f 5098 if (stop < size1)
419d1c74
SM
5099 {
5100 /* Only match within string1. */
5101 end_match_1 = string1 + stop;
5102 /* BEWARE!
5103 When we reach end_match_1, PREFETCH normally switches to string2.
5104 But in the present case, this means that just doing a PREFETCH
5105 makes us jump from `stop' to `gap' within the string.
5106 What we really want here is for the search to stop as
5107 soon as we hit end_match_1. That's why we set end_match_2
5108 to end_match_1 (since PREFETCH fails as soon as we hit
5109 end_match_2). */
5110 end_match_2 = end_match_1;
5111 }
5112 else
f1ad044f
SM
5113 { /* It's important to use this code when stop == size so that
5114 moving `d' from end1 to string2 will not prevent the d == dend
5115 check from catching the end of string. */
419d1c74
SM
5116 end_match_1 = end1;
5117 end_match_2 = string2 + stop - size1;
5118 }
5119 d = string1 + pos;
5120 dend = end_match_1;
fa9a63c5
RM
5121 }
5122
dc4a2ee0 5123 DEBUG_PRINT ("The compiled pattern is: ");
fa9a63c5 5124 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
dc4a2ee0 5125 DEBUG_PRINT ("The string to match is: `");
fa9a63c5 5126 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
dc4a2ee0 5127 DEBUG_PRINT ("'\n");
5e69f11e 5128
7814e705 5129 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5130 function if the match is complete, or it drops through if the match
5131 fails at this starting point in the input data. */
5132 for (;;)
5133 {
dc4a2ee0 5134 DEBUG_PRINT ("\n%p: ", p);
fa9a63c5
RM
5135
5136 if (p == pend)
dc4a2ee0
PE
5137 {
5138 ptrdiff_t dcnt;
5139
5140 /* End of pattern means we might have succeeded. */
5141 DEBUG_PRINT ("end of pattern ... ");
5e69f11e 5142
fa9a63c5 5143 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5144 longest match, try backtracking. */
5145 if (d != end_match_2)
fa9a63c5
RM
5146 {
5147 /* 1 if this match ends in the same string (string1 or string2)
5148 as the best previous match. */
d42f4f0f
PE
5149 boolean same_str_p = (FIRST_STRING_P (match_end)
5150 == FIRST_STRING_P (d));
fa9a63c5
RM
5151 /* 1 if this match is the best seen so far. */
5152 boolean best_match_p;
5153
5154 /* AIX compiler got confused when this was combined
7814e705 5155 with the previous declaration. */
fa9a63c5
RM
5156 if (same_str_p)
5157 best_match_p = d > match_end;
5158 else
99633e97 5159 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5160
dc4a2ee0 5161 DEBUG_PRINT ("backtracking.\n");
25fe55af
RS
5162
5163 if (!FAIL_STACK_EMPTY ())
5164 { /* More failure points to try. */
5165
5166 /* If exceeds best match so far, save it. */
5167 if (!best_regs_set || best_match_p)
5168 {
5169 best_regs_set = true;
5170 match_end = d;
5171
dc4a2ee0 5172 DEBUG_PRINT ("\nSAVING match as best so far.\n");
25fe55af 5173
01618498 5174 for (reg = 1; reg < num_regs; reg++)
25fe55af 5175 {
01618498
SM
5176 best_regstart[reg] = regstart[reg];
5177 best_regend[reg] = regend[reg];
25fe55af
RS
5178 }
5179 }
5180 goto fail;
5181 }
5182
5183 /* If no failure points, don't restore garbage. And if
5184 last match is real best match, don't restore second
5185 best one. */
5186 else if (best_regs_set && !best_match_p)
5187 {
5188 restore_best_regs:
5189 /* Restore best match. It may happen that `dend ==
5190 end_match_1' while the restored d is in string2.
5191 For example, the pattern `x.*y.*z' against the
5192 strings `x-' and `y-z-', if the two strings are
7814e705 5193 not consecutive in memory. */
dc4a2ee0 5194 DEBUG_PRINT ("Restoring best registers.\n");
25fe55af
RS
5195
5196 d = match_end;
5197 dend = ((d >= string1 && d <= end1)
5198 ? end_match_1 : end_match_2);
fa9a63c5 5199
01618498 5200 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5201 {
01618498
SM
5202 regstart[reg] = best_regstart[reg];
5203 regend[reg] = best_regend[reg];
fa9a63c5 5204 }
25fe55af
RS
5205 }
5206 } /* d != end_match_2 */
fa9a63c5
RM
5207
5208 succeed_label:
dc4a2ee0 5209 DEBUG_PRINT ("Accepting match.\n");
fa9a63c5 5210
25fe55af
RS
5211 /* If caller wants register contents data back, do it. */
5212 if (regs && !bufp->no_sub)
fa9a63c5 5213 {
25fe55af
RS
5214 /* Have the register data arrays been allocated? */
5215 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5216 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5217 extra element beyond `num_regs' for the `-1' marker
5218 GNU code uses. */
5219 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5220 regs->start = TALLOC (regs->num_regs, regoff_t);
5221 regs->end = TALLOC (regs->num_regs, regoff_t);
5222 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5223 {
5224 FREE_VARIABLES ();
5225 return -2;
5226 }
25fe55af
RS
5227 bufp->regs_allocated = REGS_REALLOCATE;
5228 }
5229 else if (bufp->regs_allocated == REGS_REALLOCATE)
5230 { /* Yes. If we need more elements than were already
5231 allocated, reallocate them. If we need fewer, just
5232 leave it alone. */
5233 if (regs->num_regs < num_regs + 1)
5234 {
5235 regs->num_regs = num_regs + 1;
5236 RETALLOC (regs->start, regs->num_regs, regoff_t);
5237 RETALLOC (regs->end, regs->num_regs, regoff_t);
5238 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5239 {
5240 FREE_VARIABLES ();
5241 return -2;
5242 }
25fe55af
RS
5243 }
5244 }
5245 else
fa9a63c5
RM
5246 {
5247 /* These braces fend off a "empty body in an else-statement"
7814e705 5248 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5249 assert (bufp->regs_allocated == REGS_FIXED);
5250 }
5251
25fe55af
RS
5252 /* Convert the pointer data in `regstart' and `regend' to
5253 indices. Register zero has to be set differently,
5254 since we haven't kept track of any info for it. */
5255 if (regs->num_regs > 0)
5256 {
5257 regs->start[0] = pos;
99633e97 5258 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5259 }
5e69f11e 5260
25fe55af
RS
5261 /* Go through the first `min (num_regs, regs->num_regs)'
5262 registers, since that is all we initialized. */
01618498 5263 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5264 {
01618498
SM
5265 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5266 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5267 else
5268 {
dc4a2ee0
PE
5269 regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
5270 regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
25fe55af 5271 }
fa9a63c5 5272 }
5e69f11e 5273
25fe55af
RS
5274 /* If the regs structure we return has more elements than
5275 were in the pattern, set the extra elements to -1. If
5276 we (re)allocated the registers, this is the case,
5277 because we always allocate enough to have at least one
7814e705 5278 -1 at the end. */
01618498
SM
5279 for (reg = num_regs; reg < regs->num_regs; reg++)
5280 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5281 } /* regs && !bufp->no_sub */
5282
dc4a2ee0
PE
5283 DEBUG_PRINT ("%u failure points pushed, %u popped (%u remain).\n",
5284 nfailure_points_pushed, nfailure_points_popped,
5285 nfailure_points_pushed - nfailure_points_popped);
5286 DEBUG_PRINT ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5287
dc4a2ee0 5288 dcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5289
dc4a2ee0 5290 DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
fa9a63c5 5291
25fe55af 5292 FREE_VARIABLES ();
dc4a2ee0 5293 return dcnt;
25fe55af 5294 }
fa9a63c5 5295
7814e705 5296 /* Otherwise match next pattern command. */
7393bcbb 5297 switch (*p++)
fa9a63c5 5298 {
25fe55af
RS
5299 /* Ignore these. Used to ignore the n of succeed_n's which
5300 currently have n == 0. */
5301 case no_op:
dc4a2ee0 5302 DEBUG_PRINT ("EXECUTING no_op.\n");
25fe55af 5303 break;
fa9a63c5
RM
5304
5305 case succeed:
dc4a2ee0 5306 DEBUG_PRINT ("EXECUTING succeed.\n");
fa9a63c5
RM
5307 goto succeed_label;
5308
7814e705 5309 /* Match the next n pattern characters exactly. The following
25fe55af 5310 byte in the pattern defines n, and the n bytes after that
7814e705 5311 are the characters to match. */
fa9a63c5
RM
5312 case exactn:
5313 mcnt = *p++;
dc4a2ee0 5314 DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5315
99633e97
SM
5316 /* Remember the start point to rollback upon failure. */
5317 dfail = d;
5318
6fdd04b0 5319#ifndef emacs
25fe55af
RS
5320 /* This is written out as an if-else so we don't waste time
5321 testing `translate' inside the loop. */
28703c16 5322 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5323 do
5324 {
5325 PREFETCH ();
5326 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5327 {
6fdd04b0
KH
5328 d = dfail;
5329 goto fail;
e934739e 5330 }
6fdd04b0
KH
5331 d++;
5332 }
5333 while (--mcnt);
fa9a63c5 5334 else
6fdd04b0
KH
5335 do
5336 {
5337 PREFETCH ();
5338 if (*d++ != *p++)
bf216479 5339 {
6fdd04b0
KH
5340 d = dfail;
5341 goto fail;
bf216479 5342 }
6fdd04b0
KH
5343 }
5344 while (--mcnt);
5345#else /* emacs */
5346 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5347 if (target_multibyte)
6fdd04b0
KH
5348 do
5349 {
5350 int pat_charlen, buf_charlen;
cf9c99bc 5351 int pat_ch, buf_ch;
e934739e 5352
6fdd04b0 5353 PREFETCH ();
cf9c99bc 5354 if (multibyte)
62a6e103 5355 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5356 else
5357 {
5358 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5359 pat_charlen = 1;
5360 }
62a6e103 5361 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5362
6fdd04b0 5363 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5364 {
6fdd04b0
KH
5365 d = dfail;
5366 goto fail;
e934739e 5367 }
bf216479 5368
6fdd04b0
KH
5369 p += pat_charlen;
5370 d += buf_charlen;
5371 mcnt -= pat_charlen;
5372 }
5373 while (mcnt > 0);
fa9a63c5 5374 else
6fdd04b0
KH
5375 do
5376 {
abbd1bcf 5377 int pat_charlen;
cf9c99bc 5378 int pat_ch, buf_ch;
bf216479 5379
6fdd04b0 5380 PREFETCH ();
cf9c99bc
KH
5381 if (multibyte)
5382 {
62a6e103 5383 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5384 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5385 }
5386 else
5387 {
5388 pat_ch = *p;
5389 pat_charlen = 1;
5390 }
5391 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5392 if (! CHAR_BYTE8_P (buf_ch))
5393 {
5394 buf_ch = TRANSLATE (buf_ch);
5395 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5396 if (buf_ch < 0)
5397 buf_ch = *d;
5398 }
0e2501ed
AS
5399 else
5400 buf_ch = *d;
cf9c99bc 5401 if (buf_ch != pat_ch)
6fdd04b0
KH
5402 {
5403 d = dfail;
5404 goto fail;
bf216479 5405 }
cf9c99bc
KH
5406 p += pat_charlen;
5407 d++;
6fdd04b0
KH
5408 }
5409 while (--mcnt);
5410#endif
25fe55af 5411 break;
fa9a63c5
RM
5412
5413
25fe55af 5414 /* Match any character except possibly a newline or a null. */
fa9a63c5 5415 case anychar:
e934739e
RS
5416 {
5417 int buf_charlen;
01618498 5418 re_wchar_t buf_ch;
fa9a63c5 5419
dc4a2ee0 5420 DEBUG_PRINT ("EXECUTING anychar.\n");
fa9a63c5 5421
e934739e 5422 PREFETCH ();
62a6e103 5423 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5424 target_multibyte);
e934739e
RS
5425 buf_ch = TRANSLATE (buf_ch);
5426
5427 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5428 && buf_ch == '\n')
5429 || ((bufp->syntax & RE_DOT_NOT_NULL)
5430 && buf_ch == '\000'))
5431 goto fail;
5432
dc4a2ee0 5433 DEBUG_PRINT (" Matched `%d'.\n", *d);
e934739e
RS
5434 d += buf_charlen;
5435 }
fa9a63c5
RM
5436 break;
5437
5438
5439 case charset:
5440 case charset_not:
5441 {
b18215fc 5442 register unsigned int c;
fa9a63c5 5443 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5444 int len;
5445
5446 /* Start of actual range_table, or end of bitmap if there is no
5447 range table. */
da053e48 5448 re_char *range_table IF_LINT (= NULL);
b18215fc 5449
96cc36cc 5450 /* Nonzero if there is a range table. */
b18215fc
RS
5451 int range_table_exists;
5452
96cc36cc
RS
5453 /* Number of ranges of range table. This is not included
5454 in the initial byte-length of the command. */
5455 int count = 0;
fa9a63c5 5456
f5020181
AS
5457 /* Whether matching against a unibyte character. */
5458 boolean unibyte_char = false;
5459
dc4a2ee0 5460 DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5461
b18215fc 5462 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5463
b18215fc 5464 if (range_table_exists)
96cc36cc
RS
5465 {
5466 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5467 EXTRACT_NUMBER_AND_INCR (count, range_table);
5468 }
b18215fc 5469
2d1675e4 5470 PREFETCH ();
62a6e103 5471 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5472 if (target_multibyte)
5473 {
5474 int c1;
b18215fc 5475
cf9c99bc
KH
5476 c = TRANSLATE (c);
5477 c1 = RE_CHAR_TO_UNIBYTE (c);
5478 if (c1 >= 0)
f5020181
AS
5479 {
5480 unibyte_char = true;
5481 c = c1;
5482 }
cf9c99bc
KH
5483 }
5484 else
5485 {
5486 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5487
5488 if (! CHAR_BYTE8_P (c1))
5489 {
5490 c1 = TRANSLATE (c1);
5491 c1 = RE_CHAR_TO_UNIBYTE (c1);
5492 if (c1 >= 0)
f5020181
AS
5493 {
5494 unibyte_char = true;
5495 c = c1;
5496 }
cf9c99bc 5497 }
0b8be006
AS
5498 else
5499 unibyte_char = true;
cf9c99bc
KH
5500 }
5501
f5020181 5502 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5503 { /* Lookup bitmap. */
b18215fc
RS
5504 /* Cast to `unsigned' instead of `unsigned char' in
5505 case the bit list is a full 32 bytes long. */
5506 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5507 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5508 not = !not;
b18215fc 5509 }
96cc36cc 5510#ifdef emacs
b18215fc 5511 else if (range_table_exists)
96cc36cc
RS
5512 {
5513 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5514
14473664
SM
5515 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5516 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5517 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5518 | (class_bits & BIT_SPACE && ISSPACE (c))
5519 | (class_bits & BIT_UPPER && ISUPPER (c))
5520 | (class_bits & BIT_WORD && ISWORD (c)))
5521 not = !not;
5522 else
5523 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5524 }
5525#endif /* emacs */
fa9a63c5 5526
96cc36cc
RS
5527 if (range_table_exists)
5528 p = CHARSET_RANGE_TABLE_END (range_table, count);
5529 else
5530 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5531
5532 if (!not) goto fail;
5e69f11e 5533
b18215fc 5534 d += len;
fa9a63c5 5535 }
8fb31792 5536 break;
fa9a63c5
RM
5537
5538
25fe55af 5539 /* The beginning of a group is represented by start_memory.
505bde11 5540 The argument is the register number. The text
25fe55af 5541 matched within the group is recorded (in the internal
7814e705 5542 registers data structure) under the register number. */
25fe55af 5543 case start_memory:
dc4a2ee0 5544 DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
505bde11
SM
5545
5546 /* In case we need to undo this operation (via backtracking). */
dc4a2ee0 5547 PUSH_FAILURE_REG (*p);
fa9a63c5 5548
25fe55af 5549 regstart[*p] = d;
4bb91c68 5550 regend[*p] = NULL; /* probably unnecessary. -sm */
dc4a2ee0 5551 DEBUG_PRINT (" regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
fa9a63c5 5552
25fe55af 5553 /* Move past the register number and inner group count. */
505bde11 5554 p += 1;
25fe55af 5555 break;
fa9a63c5
RM
5556
5557
25fe55af 5558 /* The stop_memory opcode represents the end of a group. Its
505bde11 5559 argument is the same as start_memory's: the register number. */
fa9a63c5 5560 case stop_memory:
dc4a2ee0 5561 DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
505bde11
SM
5562
5563 assert (!REG_UNSET (regstart[*p]));
5564 /* Strictly speaking, there should be code such as:
177c0ea7 5565
0b32bf0e 5566 assert (REG_UNSET (regend[*p]));
505bde11
SM
5567 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5568
5569 But the only info to be pushed is regend[*p] and it is known to
5570 be UNSET, so there really isn't anything to push.
5571 Not pushing anything, on the other hand, deprives us of the
5572 guarantee that regend[*p] is UNSET since undoing this operation
5573 will not reset its value properly. This is not important since
5574 the value will only be read on the next start_memory or at
5575 the very end and both events can only happen if this stop_memory
5576 is *not* undone. */
fa9a63c5 5577
25fe55af 5578 regend[*p] = d;
dc4a2ee0 5579 DEBUG_PRINT (" regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
fa9a63c5 5580
25fe55af 5581 /* Move past the register number and the inner group count. */
505bde11 5582 p += 1;
25fe55af 5583 break;
fa9a63c5
RM
5584
5585
5586 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5587 followed by the numeric value of <digit> as the register number. */
5588 case duplicate:
fa9a63c5 5589 {
66f0296e 5590 register re_char *d2, *dend2;
7814e705 5591 int regno = *p++; /* Get which register to match against. */
dc4a2ee0 5592 DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
fa9a63c5 5593
7814e705 5594 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5595 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5596 goto fail;
5e69f11e 5597
7814e705 5598 /* Where in input to try to start matching. */
25fe55af 5599 d2 = regstart[regno];
5e69f11e 5600
99633e97
SM
5601 /* Remember the start point to rollback upon failure. */
5602 dfail = d;
5603
25fe55af
RS
5604 /* Where to stop matching; if both the place to start and
5605 the place to stop matching are in the same string, then
5606 set to the place to stop, otherwise, for now have to use
5607 the end of the first string. */
fa9a63c5 5608
25fe55af 5609 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5610 == FIRST_STRING_P (regend[regno]))
5611 ? regend[regno] : end_match_1);
5612 for (;;)
5613 {
dc4a2ee0
PE
5614 ptrdiff_t dcnt;
5615
fa9a63c5 5616 /* If necessary, advance to next segment in register
25fe55af 5617 contents. */
fa9a63c5
RM
5618 while (d2 == dend2)
5619 {
5620 if (dend2 == end_match_2) break;
5621 if (dend2 == regend[regno]) break;
5622
25fe55af
RS
5623 /* End of string1 => advance to string2. */
5624 d2 = string2;
5625 dend2 = regend[regno];
fa9a63c5
RM
5626 }
5627 /* At end of register contents => success */
5628 if (d2 == dend2) break;
5629
5630 /* If necessary, advance to next segment in data. */
5631 PREFETCH ();
5632
5633 /* How many characters left in this segment to match. */
dc4a2ee0 5634 dcnt = dend - d;
5e69f11e 5635
fa9a63c5 5636 /* Want how many consecutive characters we can match in
25fe55af 5637 one shot, so, if necessary, adjust the count. */
dc4a2ee0
PE
5638 if (dcnt > dend2 - d2)
5639 dcnt = dend2 - d2;
5e69f11e 5640
fa9a63c5 5641 /* Compare that many; failure if mismatch, else move
25fe55af 5642 past them. */
28703c16 5643 if (RE_TRANSLATE_P (translate)
dc4a2ee0
PE
5644 ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
5645 : memcmp (d, d2, dcnt))
99633e97
SM
5646 {
5647 d = dfail;
5648 goto fail;
5649 }
dc4a2ee0 5650 d += dcnt, d2 += dcnt;
fa9a63c5
RM
5651 }
5652 }
5653 break;
5654
5655
25fe55af 5656 /* begline matches the empty string at the beginning of the string
c0f9ea08 5657 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5658 case begline:
dc4a2ee0 5659 DEBUG_PRINT ("EXECUTING begline.\n");
5e69f11e 5660
25fe55af
RS
5661 if (AT_STRINGS_BEG (d))
5662 {
5663 if (!bufp->not_bol) break;
5664 }
419d1c74 5665 else
25fe55af 5666 {
bf216479 5667 unsigned c;
419d1c74 5668 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5669 if (c == '\n')
419d1c74 5670 break;
25fe55af
RS
5671 }
5672 /* In all other cases, we fail. */
5673 goto fail;
fa9a63c5
RM
5674
5675
25fe55af 5676 /* endline is the dual of begline. */
fa9a63c5 5677 case endline:
dc4a2ee0 5678 DEBUG_PRINT ("EXECUTING endline.\n");
fa9a63c5 5679
25fe55af
RS
5680 if (AT_STRINGS_END (d))
5681 {
5682 if (!bufp->not_eol) break;
5683 }
f1ad044f 5684 else
25fe55af 5685 {
f1ad044f 5686 PREFETCH_NOLIMIT ();
c0f9ea08 5687 if (*d == '\n')
f1ad044f 5688 break;
25fe55af
RS
5689 }
5690 goto fail;
fa9a63c5
RM
5691
5692
5693 /* Match at the very beginning of the data. */
25fe55af 5694 case begbuf:
dc4a2ee0 5695 DEBUG_PRINT ("EXECUTING begbuf.\n");
25fe55af
RS
5696 if (AT_STRINGS_BEG (d))
5697 break;
5698 goto fail;
fa9a63c5
RM
5699
5700
5701 /* Match at the very end of the data. */
25fe55af 5702 case endbuf:
dc4a2ee0 5703 DEBUG_PRINT ("EXECUTING endbuf.\n");
fa9a63c5
RM
5704 if (AT_STRINGS_END (d))
5705 break;
25fe55af 5706 goto fail;
5e69f11e 5707
5e69f11e 5708
25fe55af
RS
5709 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5710 pushes NULL as the value for the string on the stack. Then
505bde11 5711 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5712 string, instead of restoring it. To see why, consider
7814e705 5713 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5714 then the . fails against the \n. But the next thing we want
5715 to do is match the \n against the \n; if we restored the
5716 string value, we would be back at the foo.
5717
5718 Because this is used only in specific cases, we don't need to
5719 check all the things that `on_failure_jump' does, to make
5720 sure the right things get saved on the stack. Hence we don't
5721 share its code. The only reason to push anything on the
5722 stack at all is that otherwise we would have to change
5723 `anychar's code to do something besides goto fail in this
5724 case; that seems worse than this. */
5725 case on_failure_keep_string_jump:
505bde11 5726 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5727 DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5728 mcnt, p + mcnt);
fa9a63c5 5729
505bde11
SM
5730 PUSH_FAILURE_POINT (p - 3, NULL);
5731 break;
5732
0683b6fa
SM
5733 /* A nasty loop is introduced by the non-greedy *? and +?.
5734 With such loops, the stack only ever contains one failure point
5735 at a time, so that a plain on_failure_jump_loop kind of
5736 cycle detection cannot work. Worse yet, such a detection
5737 can not only fail to detect a cycle, but it can also wrongly
5738 detect a cycle (between different instantiations of the same
6df42991 5739 loop).
0683b6fa
SM
5740 So the method used for those nasty loops is a little different:
5741 We use a special cycle-detection-stack-frame which is pushed
5742 when the on_failure_jump_nastyloop failure-point is *popped*.
5743 This special frame thus marks the beginning of one iteration
5744 through the loop and we can hence easily check right here
5745 whether something matched between the beginning and the end of
5746 the loop. */
5747 case on_failure_jump_nastyloop:
5748 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5749 DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5750 mcnt, p + mcnt);
0683b6fa
SM
5751
5752 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5753 {
5754 int cycle = 0;
5755 CHECK_INFINITE_LOOP (p - 4, d);
5756 if (!cycle)
5757 /* If there's a cycle, just continue without pushing
5758 this failure point. The failure point is the "try again"
5759 option, which shouldn't be tried.
5760 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5761 PUSH_FAILURE_POINT (p - 3, d);
5762 }
0683b6fa
SM
5763 break;
5764
4e8a9132
SM
5765 /* Simple loop detecting on_failure_jump: just check on the
5766 failure stack if the same spot was already hit earlier. */
505bde11
SM
5767 case on_failure_jump_loop:
5768 on_failure:
5769 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5770 DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5771 mcnt, p + mcnt);
6df42991
SM
5772 {
5773 int cycle = 0;
5774 CHECK_INFINITE_LOOP (p - 3, d);
5775 if (cycle)
5776 /* If there's a cycle, get out of the loop, as if the matching
5777 had failed. We used to just `goto fail' here, but that was
5778 aborting the search a bit too early: we want to keep the
5779 empty-loop-match and keep matching after the loop.
5780 We want (x?)*y\1z to match both xxyz and xxyxz. */
5781 p += mcnt;
5782 else
5783 PUSH_FAILURE_POINT (p - 3, d);
5784 }
25fe55af 5785 break;
fa9a63c5
RM
5786
5787
5788 /* Uses of on_failure_jump:
5e69f11e 5789
25fe55af
RS
5790 Each alternative starts with an on_failure_jump that points
5791 to the beginning of the next alternative. Each alternative
5792 except the last ends with a jump that in effect jumps past
5793 the rest of the alternatives. (They really jump to the
5794 ending jump of the following alternative, because tensioning
5795 these jumps is a hassle.)
fa9a63c5 5796
25fe55af
RS
5797 Repeats start with an on_failure_jump that points past both
5798 the repetition text and either the following jump or
5799 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5800 case on_failure_jump:
25fe55af 5801 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5802 DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
5803 mcnt, p + mcnt);
25fe55af 5804
505bde11 5805 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5806 break;
5807
4e8a9132 5808 /* This operation is used for greedy *.
505bde11
SM
5809 Compare the beginning of the repeat with what in the
5810 pattern follows its end. If we can establish that there
5811 is nothing that they would both match, i.e., that we
5812 would have to backtrack because of (as in, e.g., `a*a')
5813 then we can use a non-backtracking loop based on
4e8a9132 5814 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5815 case on_failure_jump_smart:
25fe55af 5816 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0
PE
5817 DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5818 mcnt, p + mcnt);
25fe55af 5819 {
01618498 5820 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5821 /* Here, we discard `const', making re_match non-reentrant. */
5822 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5823 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5824
505bde11
SM
5825 p -= 3; /* Reset so that we will re-execute the
5826 instruction once it's been changed. */
fa9a63c5 5827
4e8a9132
SM
5828 EXTRACT_NUMBER (mcnt, p2 - 2);
5829
5830 /* Ensure this is indeed the trivial kind of loop
5831 we are expecting. */
5832 assert (skip_one_char (p1) == p2 - 3);
5833 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5834 DEBUG_STATEMENT (debug += 2);
505bde11 5835 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5836 {
505bde11 5837 /* Use a fast `on_failure_keep_string_jump' loop. */
dc4a2ee0 5838 DEBUG_PRINT (" smart exclusive => fast loop.\n");
01618498 5839 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5840 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5841 }
505bde11 5842 else
fa9a63c5 5843 {
505bde11 5844 /* Default to a safe `on_failure_jump' loop. */
dc4a2ee0 5845 DEBUG_PRINT (" smart default => slow loop.\n");
01618498 5846 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5847 }
99633e97 5848 DEBUG_STATEMENT (debug -= 2);
25fe55af 5849 }
505bde11 5850 break;
25fe55af
RS
5851
5852 /* Unconditionally jump (without popping any failure points). */
5853 case jump:
fa9a63c5 5854 unconditional_jump:
5b370c2b 5855 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5856 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
dc4a2ee0 5857 DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
7814e705 5858 p += mcnt; /* Do the jump. */
dc4a2ee0 5859 DEBUG_PRINT ("(to %p).\n", p);
25fe55af
RS
5860 break;
5861
5862
25fe55af
RS
5863 /* Have to succeed matching what follows at least n times.
5864 After that, handle like `on_failure_jump'. */
5865 case succeed_n:
01618498 5866 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af 5867 EXTRACT_NUMBER (mcnt, p + 2);
dc4a2ee0 5868 DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5869
dc1e502d
SM
5870 /* Originally, mcnt is how many times we HAVE to succeed. */
5871 if (mcnt != 0)
25fe55af 5872 {
6dcf2d0e
SM
5873 /* Here, we discard `const', making re_match non-reentrant. */
5874 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5875 mcnt--;
01618498
SM
5876 p += 4;
5877 PUSH_NUMBER (p2, mcnt);
25fe55af 5878 }
dc1e502d
SM
5879 else
5880 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5881 goto on_failure;
25fe55af
RS
5882 break;
5883
5884 case jump_n:
01618498 5885 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af 5886 EXTRACT_NUMBER (mcnt, p + 2);
dc4a2ee0 5887 DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
25fe55af
RS
5888
5889 /* Originally, this is how many times we CAN jump. */
dc1e502d 5890 if (mcnt != 0)
25fe55af 5891 {
6dcf2d0e
SM
5892 /* Here, we discard `const', making re_match non-reentrant. */
5893 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5894 mcnt--;
01618498 5895 PUSH_NUMBER (p2, mcnt);
dc1e502d 5896 goto unconditional_jump;
25fe55af
RS
5897 }
5898 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5899 else
5900 p += 4;
25fe55af 5901 break;
5e69f11e 5902
fa9a63c5
RM
5903 case set_number_at:
5904 {
01618498 5905 unsigned char *p2; /* Location of the counter. */
dc4a2ee0 5906 DEBUG_PRINT ("EXECUTING set_number_at.\n");
fa9a63c5 5907
25fe55af 5908 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5909 /* Here, we discard `const', making re_match non-reentrant. */
5910 p2 = (unsigned char*) p + mcnt;
01618498 5911 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5912 EXTRACT_NUMBER_AND_INCR (mcnt, p);
dc4a2ee0 5913 DEBUG_PRINT (" Setting %p to %d.\n", p2, mcnt);
01618498 5914 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5915 break;
5916 }
9121ca40
KH
5917
5918 case wordbound:
66f0296e 5919 case notwordbound:
19ed5445
PE
5920 {
5921 boolean not = (re_opcode_t) *(p - 1) == notwordbound;
dc4a2ee0 5922 DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
fa9a63c5 5923
19ed5445 5924 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5925
19ed5445
PE
5926 /* Case 1: D is at the beginning or the end of string. */
5927 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5928 not = !not;
5929 else
5930 {
5931 /* C1 is the character before D, S1 is the syntax of C1, C2
5932 is the character at D, and S2 is the syntax of C2. */
5933 re_wchar_t c1, c2;
5934 int s1, s2;
5935 int dummy;
b18215fc 5936#ifdef emacs
d1dfb56c
EZ
5937 ssize_t offset = PTR_TO_OFFSET (d - 1);
5938 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445 5939 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5940#endif
19ed5445
PE
5941 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5942 s1 = SYNTAX (c1);
b18215fc 5943#ifdef emacs
19ed5445 5944 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5945#endif
19ed5445
PE
5946 PREFETCH_NOLIMIT ();
5947 GET_CHAR_AFTER (c2, d, dummy);
5948 s2 = SYNTAX (c2);
5949
5950 if (/* Case 2: Only one of S1 and S2 is Sword. */
5951 ((s1 == Sword) != (s2 == Sword))
5952 /* Case 3: Both of S1 and S2 are Sword, and macro
5953 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5954 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5955 not = !not;
5956 }
5957 if (not)
5958 break;
5959 else
5960 goto fail;
5961 }
fa9a63c5
RM
5962
5963 case wordbeg:
dc4a2ee0 5964 DEBUG_PRINT ("EXECUTING wordbeg.\n");
fa9a63c5 5965
b18215fc
RS
5966 /* We FAIL in one of the following cases: */
5967
7814e705 5968 /* Case 1: D is at the end of string. */
b18215fc 5969 if (AT_STRINGS_END (d))
99633e97 5970 goto fail;
b18215fc
RS
5971 else
5972 {
5973 /* C1 is the character before D, S1 is the syntax of C1, C2
5974 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5975 re_wchar_t c1, c2;
5976 int s1, s2;
bf216479 5977 int dummy;
fa9a63c5 5978#ifdef emacs
d1dfb56c
EZ
5979 ssize_t offset = PTR_TO_OFFSET (d);
5980 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5981 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5982#endif
99633e97 5983 PREFETCH ();
6fdd04b0 5984 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 5985 s2 = SYNTAX (c2);
177c0ea7 5986
b18215fc
RS
5987 /* Case 2: S2 is not Sword. */
5988 if (s2 != Sword)
5989 goto fail;
5990
5991 /* Case 3: D is not at the beginning of string ... */
5992 if (!AT_STRINGS_BEG (d))
5993 {
5994 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5995#ifdef emacs
5d967c7a 5996 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5997#endif
b18215fc
RS
5998 s1 = SYNTAX (c1);
5999
6000 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6001 returns 0. */
b18215fc
RS
6002 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6003 goto fail;
6004 }
6005 }
e318085a
RS
6006 break;
6007
b18215fc 6008 case wordend:
dc4a2ee0 6009 DEBUG_PRINT ("EXECUTING wordend.\n");
b18215fc
RS
6010
6011 /* We FAIL in one of the following cases: */
6012
6013 /* Case 1: D is at the beginning of string. */
6014 if (AT_STRINGS_BEG (d))
e318085a 6015 goto fail;
b18215fc
RS
6016 else
6017 {
6018 /* C1 is the character before D, S1 is the syntax of C1, C2
6019 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6020 re_wchar_t c1, c2;
6021 int s1, s2;
bf216479 6022 int dummy;
5d967c7a 6023#ifdef emacs
d1dfb56c
EZ
6024 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6025 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6026 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6027#endif
99633e97 6028 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6029 s1 = SYNTAX (c1);
6030
6031 /* Case 2: S1 is not Sword. */
6032 if (s1 != Sword)
6033 goto fail;
6034
6035 /* Case 3: D is not at the end of string ... */
6036 if (!AT_STRINGS_END (d))
6037 {
f1ad044f 6038 PREFETCH_NOLIMIT ();
6fdd04b0 6039 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6040#ifdef emacs
6041 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6042#endif
b18215fc
RS
6043 s2 = SYNTAX (c2);
6044
6045 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6046 returns 0. */
b18215fc 6047 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6048 goto fail;
b18215fc
RS
6049 }
6050 }
e318085a
RS
6051 break;
6052
669fa600 6053 case symbeg:
dc4a2ee0 6054 DEBUG_PRINT ("EXECUTING symbeg.\n");
669fa600
SM
6055
6056 /* We FAIL in one of the following cases: */
6057
7814e705 6058 /* Case 1: D is at the end of string. */
669fa600
SM
6059 if (AT_STRINGS_END (d))
6060 goto fail;
6061 else
6062 {
6063 /* C1 is the character before D, S1 is the syntax of C1, C2
6064 is the character at D, and S2 is the syntax of C2. */
6065 re_wchar_t c1, c2;
6066 int s1, s2;
6067#ifdef emacs
d1dfb56c
EZ
6068 ssize_t offset = PTR_TO_OFFSET (d);
6069 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6070 UPDATE_SYNTAX_TABLE (charpos);
6071#endif
6072 PREFETCH ();
62a6e103 6073 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6074 s2 = SYNTAX (c2);
7814e705 6075
669fa600
SM
6076 /* Case 2: S2 is neither Sword nor Ssymbol. */
6077 if (s2 != Sword && s2 != Ssymbol)
6078 goto fail;
6079
6080 /* Case 3: D is not at the beginning of string ... */
6081 if (!AT_STRINGS_BEG (d))
6082 {
6083 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6084#ifdef emacs
6085 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6086#endif
6087 s1 = SYNTAX (c1);
6088
6089 /* ... and S1 is Sword or Ssymbol. */
6090 if (s1 == Sword || s1 == Ssymbol)
6091 goto fail;
6092 }
6093 }
6094 break;
6095
6096 case symend:
dc4a2ee0 6097 DEBUG_PRINT ("EXECUTING symend.\n");
669fa600
SM
6098
6099 /* We FAIL in one of the following cases: */
6100
6101 /* Case 1: D is at the beginning of string. */
6102 if (AT_STRINGS_BEG (d))
6103 goto fail;
6104 else
6105 {
6106 /* C1 is the character before D, S1 is the syntax of C1, C2
6107 is the character at D, and S2 is the syntax of C2. */
6108 re_wchar_t c1, c2;
6109 int s1, s2;
6110#ifdef emacs
d1dfb56c
EZ
6111 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6112 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6113 UPDATE_SYNTAX_TABLE (charpos);
6114#endif
6115 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6116 s1 = SYNTAX (c1);
6117
6118 /* Case 2: S1 is neither Ssymbol nor Sword. */
6119 if (s1 != Sword && s1 != Ssymbol)
6120 goto fail;
6121
6122 /* Case 3: D is not at the end of string ... */
6123 if (!AT_STRINGS_END (d))
6124 {
6125 PREFETCH_NOLIMIT ();
62a6e103 6126 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6127#ifdef emacs
134579f2 6128 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6129#endif
6130 s2 = SYNTAX (c2);
6131
6132 /* ... and S2 is Sword or Ssymbol. */
6133 if (s2 == Sword || s2 == Ssymbol)
6134 goto fail;
b18215fc
RS
6135 }
6136 }
e318085a
RS
6137 break;
6138
fa9a63c5 6139 case syntaxspec:
1fb352e0 6140 case notsyntaxspec:
b18215fc 6141 {
19ed5445
PE
6142 boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6143 mcnt = *p++;
dc4a2ee0
PE
6144 DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
6145 mcnt);
19ed5445
PE
6146 PREFETCH ();
6147#ifdef emacs
6148 {
d1dfb56c
EZ
6149 ssize_t offset = PTR_TO_OFFSET (d);
6150 ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445
PE
6151 UPDATE_SYNTAX_TABLE (pos1);
6152 }
25fe55af 6153#endif
19ed5445
PE
6154 {
6155 int len;
6156 re_wchar_t c;
b18215fc 6157
19ed5445
PE
6158 GET_CHAR_AFTER (c, d, len);
6159 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6160 goto fail;
6161 d += len;
6162 }
b18215fc 6163 }
8fb31792 6164 break;
fa9a63c5 6165
b18215fc 6166#ifdef emacs
1fb352e0 6167 case before_dot:
dc4a2ee0 6168 DEBUG_PRINT ("EXECUTING before_dot.\n");
1fb352e0 6169 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6170 goto fail;
b18215fc
RS
6171 break;
6172
1fb352e0 6173 case at_dot:
dc4a2ee0 6174 DEBUG_PRINT ("EXECUTING at_dot.\n");
1fb352e0
SM
6175 if (PTR_BYTE_POS (d) != PT_BYTE)
6176 goto fail;
6177 break;
b18215fc 6178
1fb352e0 6179 case after_dot:
dc4a2ee0 6180 DEBUG_PRINT ("EXECUTING after_dot.\n");
1fb352e0
SM
6181 if (PTR_BYTE_POS (d) <= PT_BYTE)
6182 goto fail;
e318085a 6183 break;
fa9a63c5 6184
1fb352e0 6185 case categoryspec:
b18215fc 6186 case notcategoryspec:
b18215fc 6187 {
8fb31792
PE
6188 boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6189 mcnt = *p++;
dc4a2ee0
PE
6190 DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
6191 not ? "not" : "", mcnt);
8fb31792 6192 PREFETCH ();
01618498 6193
8fb31792
PE
6194 {
6195 int len;
6196 re_wchar_t c;
6197 GET_CHAR_AFTER (c, d, len);
6198 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6199 goto fail;
6200 d += len;
6201 }
b18215fc 6202 }
fa9a63c5 6203 break;
5e69f11e 6204
1fb352e0 6205#endif /* emacs */
5e69f11e 6206
0b32bf0e
SM
6207 default:
6208 abort ();
fa9a63c5 6209 }
b18215fc 6210 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6211
6212
6213 /* We goto here if a matching operation fails. */
6214 fail:
5b370c2b 6215 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6216 if (!FAIL_STACK_EMPTY ())
505bde11 6217 {
01618498 6218 re_char *str, *pat;
505bde11 6219 /* A restart point is known. Restore to that state. */
dc4a2ee0 6220 DEBUG_PRINT ("\nFAIL:\n");
0b32bf0e 6221 POP_FAILURE_POINT (str, pat);
7393bcbb 6222 switch (*pat++)
505bde11
SM
6223 {
6224 case on_failure_keep_string_jump:
6225 assert (str == NULL);
6226 goto continue_failure_jump;
6227
0683b6fa
SM
6228 case on_failure_jump_nastyloop:
6229 assert ((re_opcode_t)pat[-2] == no_op);
6230 PUSH_FAILURE_POINT (pat - 2, str);
6231 /* Fallthrough */
6232
505bde11
SM
6233 case on_failure_jump_loop:
6234 case on_failure_jump:
6235 case succeed_n:
6236 d = str;
6237 continue_failure_jump:
6238 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6239 p = pat + mcnt;
6240 break;
b18215fc 6241
0683b6fa
SM
6242 case no_op:
6243 /* A special frame used for nastyloops. */
6244 goto fail;
6245
505bde11 6246 default:
5e617bc2 6247 abort ();
505bde11 6248 }
fa9a63c5 6249
505bde11 6250 assert (p >= bufp->buffer && p <= pend);
b18215fc 6251
0b32bf0e 6252 if (d >= string1 && d <= end1)
fa9a63c5 6253 dend = end_match_1;
0b32bf0e 6254 }
fa9a63c5 6255 else
0b32bf0e 6256 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6257 } /* for (;;) */
6258
6259 if (best_regs_set)
6260 goto restore_best_regs;
6261
6262 FREE_VARIABLES ();
6263
b18215fc 6264 return -1; /* Failure to match. */
dc4a2ee0 6265}
fa9a63c5
RM
6266\f
6267/* Subroutine definitions for re_match_2. */
6268
fa9a63c5
RM
6269/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6270 bytes; nonzero otherwise. */
5e69f11e 6271
fa9a63c5 6272static int
29abe551 6273bcmp_translate (const_re_char *s1, const_re_char *s2, register ssize_t len,
438105ed 6274 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6275{
2d1675e4
SM
6276 register re_char *p1 = s1, *p2 = s2;
6277 re_char *p1_end = s1 + len;
6278 re_char *p2_end = s2 + len;
e934739e 6279
4bb91c68
SM
6280 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6281 different lengths, but relying on a single `len' would break this. -sm */
6282 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6283 {
e934739e 6284 int p1_charlen, p2_charlen;
01618498 6285 re_wchar_t p1_ch, p2_ch;
e934739e 6286
6fdd04b0
KH
6287 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6288 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6289
6290 if (RE_TRANSLATE (translate, p1_ch)
6291 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6292 return 1;
e934739e
RS
6293
6294 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6295 }
e934739e
RS
6296
6297 if (p1 != p1_end || p2 != p2_end)
6298 return 1;
6299
fa9a63c5
RM
6300 return 0;
6301}
6302\f
6303/* Entry points for GNU code. */
6304
6305/* re_compile_pattern is the GNU regular expression compiler: it
6306 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6307 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6308
fa9a63c5
RM
6309 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6310 are set in BUFP on entry.
5e69f11e 6311
b18215fc 6312 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6313
6314const char *
d1dfb56c
EZ
6315re_compile_pattern (const char *pattern, size_t length,
6316 struct re_pattern_buffer *bufp)
fa9a63c5
RM
6317{
6318 reg_errcode_t ret;
5e69f11e 6319
fa9a63c5
RM
6320 /* GNU code is written to assume at least RE_NREGS registers will be set
6321 (and at least one extra will be -1). */
6322 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6323
fa9a63c5
RM
6324 /* And GNU code determines whether or not to get register information
6325 by passing null for the REGS argument to re_match, etc., not by
6326 setting no_sub. */
6327 bufp->no_sub = 0;
5e69f11e 6328
4bb91c68 6329 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6330
6331 if (!ret)
6332 return NULL;
6333 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6334}
c0f9ea08 6335WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6336\f
b18215fc
RS
6337/* Entry points compatible with 4.2 BSD regex library. We don't define
6338 them unless specifically requested. */
fa9a63c5 6339
0b32bf0e 6340#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6341
6342/* BSD has one and only one pattern buffer. */
6343static struct re_pattern_buffer re_comp_buf;
6344
6345char *
0b32bf0e 6346# ifdef _LIBC
48afdd44
RM
6347/* Make these definitions weak in libc, so POSIX programs can redefine
6348 these names if they don't use our functions, and still use
6349 regcomp/regexec below without link errors. */
6350weak_function
0b32bf0e 6351# endif
31011111 6352re_comp (const char *s)
fa9a63c5
RM
6353{
6354 reg_errcode_t ret;
5e69f11e 6355
fa9a63c5
RM
6356 if (!s)
6357 {
6358 if (!re_comp_buf.buffer)
0b32bf0e 6359 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6360 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6361 return 0;
6362 }
6363
6364 if (!re_comp_buf.buffer)
6365 {
38182d90 6366 re_comp_buf.buffer = malloc (200);
fa9a63c5 6367 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6368 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6369 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6370 re_comp_buf.allocated = 200;
6371
38182d90 6372 re_comp_buf.fastmap = malloc (1 << BYTEWIDTH);
fa9a63c5 6373 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6374 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6375 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6376 }
6377
6378 /* Since `re_exec' always passes NULL for the `regs' argument, we
6379 don't need to initialize the pattern buffer fields which affect it. */
6380
fa9a63c5 6381 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6382
fa9a63c5
RM
6383 if (!ret)
6384 return NULL;
6385
6386 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6387 return (char *) gettext (re_error_msgid[(int) ret]);
6388}
6389
6390
31011111 6391int
0b32bf0e 6392# ifdef _LIBC
48afdd44 6393weak_function
0b32bf0e 6394# endif
d1dfb56c 6395re_exec (const char *s)
fa9a63c5 6396{
d1dfb56c 6397 const size_t len = strlen (s);
7d652d97 6398 return re_search (&re_comp_buf, s, len, 0, len, 0) >= 0;
fa9a63c5
RM
6399}
6400#endif /* _REGEX_RE_COMP */
6401\f
6402/* POSIX.2 functions. Don't define these for Emacs. */
6403
6404#ifndef emacs
6405
6406/* regcomp takes a regular expression as a string and compiles it.
6407
b18215fc 6408 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6409 since POSIX says we shouldn't. Thus, we set
6410
6411 `buffer' to the compiled pattern;
6412 `used' to the length of the compiled pattern;
6413 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6414 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6415 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6416 `fastmap' to an allocated space for the fastmap;
6417 `fastmap_accurate' to zero;
fa9a63c5
RM
6418 `re_nsub' to the number of subexpressions in PATTERN.
6419
6420 PATTERN is the address of the pattern string.
6421
6422 CFLAGS is a series of bits which affect compilation.
6423
6424 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6425 use POSIX basic syntax.
6426
6427 If REG_NEWLINE is set, then . and [^...] don't match newline.
6428 Also, regexec will try a match beginning after every newline.
6429
6430 If REG_ICASE is set, then we considers upper- and lowercase
6431 versions of letters to be equivalent when matching.
6432
6433 If REG_NOSUB is set, then when PREG is passed to regexec, that
6434 routine will report only success or failure, and nothing about the
6435 registers.
6436
b18215fc 6437 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6438 the return codes and their meanings.) */
6439
d1dfb56c 6440reg_errcode_t
29abe551 6441regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern,
d2762c86 6442 int cflags)
fa9a63c5
RM
6443{
6444 reg_errcode_t ret;
4bb91c68 6445 reg_syntax_t syntax
fa9a63c5
RM
6446 = (cflags & REG_EXTENDED) ?
6447 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6448
6449 /* regex_compile will allocate the space for the compiled pattern. */
6450 preg->buffer = 0;
6451 preg->allocated = 0;
6452 preg->used = 0;
5e69f11e 6453
c0f9ea08 6454 /* Try to allocate space for the fastmap. */
38182d90 6455 preg->fastmap = malloc (1 << BYTEWIDTH);
5e69f11e 6456
fa9a63c5
RM
6457 if (cflags & REG_ICASE)
6458 {
6459 unsigned i;
5e69f11e 6460
38182d90 6461 preg->translate = malloc (CHAR_SET_SIZE * sizeof *preg->translate);
fa9a63c5 6462 if (preg->translate == NULL)
0b32bf0e 6463 return (int) REG_ESPACE;
fa9a63c5
RM
6464
6465 /* Map uppercase characters to corresponding lowercase ones. */
6466 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6467 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6468 }
6469 else
6470 preg->translate = NULL;
6471
6472 /* If REG_NEWLINE is set, newlines are treated differently. */
6473 if (cflags & REG_NEWLINE)
6474 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6475 syntax &= ~RE_DOT_NEWLINE;
6476 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6477 }
6478 else
c0f9ea08 6479 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6480
6481 preg->no_sub = !!(cflags & REG_NOSUB);
6482
5e69f11e 6483 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6484 can use strlen here in compiling the pattern. */
4bb91c68 6485 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6486
fa9a63c5
RM
6487 /* POSIX doesn't distinguish between an unmatched open-group and an
6488 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6489 if (ret == REG_ERPAREN)
6490 ret = REG_EPAREN;
6491
6492 if (ret == REG_NOERROR && preg->fastmap)
6493 { /* Compute the fastmap now, since regexec cannot modify the pattern
6494 buffer. */
6495 re_compile_fastmap (preg);
6496 if (preg->can_be_null)
6497 { /* The fastmap can't be used anyway. */
6498 free (preg->fastmap);
6499 preg->fastmap = NULL;
6500 }
6501 }
d1dfb56c 6502 return ret;
fa9a63c5 6503}
c0f9ea08 6504WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6505
6506
6507/* regexec searches for a given pattern, specified by PREG, in the
6508 string STRING.
5e69f11e 6509
fa9a63c5 6510 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6511 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6512 least NMATCH elements, and we set them to the offsets of the
6513 corresponding matched substrings.
5e69f11e 6514
fa9a63c5
RM
6515 EFLAGS specifies `execution flags' which affect matching: if
6516 REG_NOTBOL is set, then ^ does not match at the beginning of the
6517 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6518
fa9a63c5
RM
6519 We return 0 if we find a match and REG_NOMATCH if not. */
6520
d1dfb56c 6521reg_errcode_t
29abe551
PE
6522regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
6523 size_t nmatch, regmatch_t pmatch[_Restrict_arr_], int eflags)
fa9a63c5 6524{
31011111 6525 regoff_t ret;
fa9a63c5
RM
6526 struct re_registers regs;
6527 regex_t private_preg;
d1dfb56c 6528 size_t len = strlen (string);
c0f9ea08 6529 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6530
6531 private_preg = *preg;
5e69f11e 6532
fa9a63c5
RM
6533 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6534 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6535
fa9a63c5
RM
6536 /* The user has told us exactly how many registers to return
6537 information about, via `nmatch'. We have to pass that on to the
b18215fc 6538 matching routines. */
fa9a63c5 6539 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6540
fa9a63c5
RM
6541 if (want_reg_info)
6542 {
6543 regs.num_regs = nmatch;
4bb91c68
SM
6544 regs.start = TALLOC (nmatch * 2, regoff_t);
6545 if (regs.start == NULL)
d1dfb56c 6546 return REG_NOMATCH;
4bb91c68 6547 regs.end = regs.start + nmatch;
fa9a63c5
RM
6548 }
6549
c0f9ea08
SM
6550 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6551 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6552 was a little bit longer but still only matching the real part.
6553 This works because the `endline' will check for a '\n' and will find a
6554 '\0', correctly deciding that this is not the end of a line.
6555 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6556 a convenient '\0' there. For all we know, the string could be preceded
6557 by '\n' which would throw things off. */
6558
fa9a63c5
RM
6559 /* Perform the searching operation. */
6560 ret = re_search (&private_preg, string, len,
0b32bf0e 6561 /* start: */ 0, /* range: */ len,
7d652d97 6562 want_reg_info ? &regs : 0);
5e69f11e 6563
fa9a63c5
RM
6564 /* Copy the register information to the POSIX structure. */
6565 if (want_reg_info)
6566 {
6567 if (ret >= 0)
0b32bf0e
SM
6568 {
6569 unsigned r;
fa9a63c5 6570
0b32bf0e
SM
6571 for (r = 0; r < nmatch; r++)
6572 {
6573 pmatch[r].rm_so = regs.start[r];
6574 pmatch[r].rm_eo = regs.end[r];
6575 }
6576 }
fa9a63c5 6577
b18215fc 6578 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6579 free (regs.start);
fa9a63c5
RM
6580 }
6581
6582 /* We want zero return to mean success, unlike `re_search'. */
d1dfb56c 6583 return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
fa9a63c5 6584}
c0f9ea08 6585WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6586
6587
ec869672
JR
6588/* Returns a message corresponding to an error code, ERR_CODE, returned
6589 from either regcomp or regexec. We don't use PREG here.
6590
6591 ERR_CODE was previously called ERRCODE, but that name causes an
6592 error with msvc8 compiler. */
fa9a63c5
RM
6593
6594size_t
d2762c86 6595regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6596{
6597 const char *msg;
6598 size_t msg_size;
6599
ec869672
JR
6600 if (err_code < 0
6601 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6602 /* Only error codes returned by the rest of the code should be passed
b18215fc 6603 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6604 code generates an invalid error code, then the program has a bug.
6605 Dump core so we can fix it. */
6606 abort ();
6607
ec869672 6608 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6609
6610 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6611
fa9a63c5
RM
6612 if (errbuf_size != 0)
6613 {
6614 if (msg_size > errbuf_size)
0b32bf0e 6615 {
e99a530f 6616 memcpy (errbuf, msg, errbuf_size - 1);
0b32bf0e
SM
6617 errbuf[errbuf_size - 1] = 0;
6618 }
fa9a63c5 6619 else
0b32bf0e 6620 strcpy (errbuf, msg);
fa9a63c5
RM
6621 }
6622
6623 return msg_size;
6624}
c0f9ea08 6625WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6626
6627
6628/* Free dynamically allocated space used by PREG. */
6629
6630void
d2762c86 6631regfree (regex_t *preg)
fa9a63c5 6632{
c2cd06e6 6633 free (preg->buffer);
fa9a63c5 6634 preg->buffer = NULL;
5e69f11e 6635
fa9a63c5
RM
6636 preg->allocated = 0;
6637 preg->used = 0;
6638
c2cd06e6 6639 free (preg->fastmap);
fa9a63c5
RM
6640 preg->fastmap = NULL;
6641 preg->fastmap_accurate = 0;
6642
c2cd06e6 6643 free (preg->translate);
fa9a63c5
RM
6644 preg->translate = NULL;
6645}
c0f9ea08 6646WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6647
6648#endif /* not emacs */