More fixes for bug #12806.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
acaf905b 5 Copyright (C) 1993-2012 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
e468b87f 9 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
4fc5845f 19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 20 USA. */
fa9a63c5 21
6df42991 22/* TODO:
505bde11 23 - structure the opcode space into opcode+flag.
dc1e502d 24 - merge with glibc's regex.[ch].
01618498 25 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
26 need to modify the compiled regexp so that re_match can be reentrant.
27 - get rid of on_failure_jump_smart by doing the optimization in re_comp
28 rather than at run-time, so that re_match can be reentrant.
01618498 29*/
505bde11 30
fa9a63c5 31/* AIX requires this to be the first thing in the file. */
0b32bf0e 32#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
33 #pragma alloca
34#endif
35
b8df54ff
PE
36/* Ignore some GCC warnings for now. This section should go away
37 once the Emacs and Gnulib regex code is merged. */
63807d47 38#if (__GNUC__ == 4 && 5 <= __GNUC_MINOR__) || 4 < __GNUC__
b8df54ff
PE
39# pragma GCC diagnostic ignored "-Wstrict-overflow"
40# ifndef emacs
41# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
42# pragma GCC diagnostic ignored "-Wunused-function"
43# pragma GCC diagnostic ignored "-Wunused-macros"
44# pragma GCC diagnostic ignored "-Wunused-result"
45# pragma GCC diagnostic ignored "-Wunused-variable"
46# endif
47#endif
48
cf38a720 49#include <config.h>
fa9a63c5 50
0e926e56
PE
51#include <stddef.h>
52
53#ifdef emacs
4bb91c68
SM
54/* We need this for `regex.h', and perhaps for the Emacs include files. */
55# include <sys/types.h>
56#endif
fa9a63c5 57
14473664
SM
58/* Whether to use ISO C Amendment 1 wide char functions.
59 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
60#if defined _LIBC
61#define WIDE_CHAR_SUPPORT 1
62#else
14473664 63#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
64 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
65#endif
14473664 66
/* For platforms which support the ISO C Amendment 1 functionality we
   support user-defined character classes.  */
a0ad02f7 69#if WIDE_CHAR_SUPPORT
14473664
SM
70/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
71# include <wchar.h>
72# include <wctype.h>
73#endif
74
c0f9ea08
SM
75#ifdef _LIBC
76/* We have to keep the namespace clean. */
77# define regfree(preg) __regfree (preg)
78# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
79# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672 80# define regerror(err_code, preg, errbuf, errbuf_size) \
5e617bc2 81 __regerror (err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
82# define re_set_registers(bu, re, nu, st, en) \
83 __re_set_registers (bu, re, nu, st, en)
84# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
85 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
86# define re_match(bufp, string, size, pos, regs) \
87 __re_match (bufp, string, size, pos, regs)
88# define re_search(bufp, string, size, startpos, range, regs) \
89 __re_search (bufp, string, size, startpos, range, regs)
90# define re_compile_pattern(pattern, length, bufp) \
91 __re_compile_pattern (pattern, length, bufp)
92# define re_set_syntax(syntax) __re_set_syntax (syntax)
93# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
94 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
95# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
96
14473664
SM
97/* Make sure we call libc's function even if the user overrides them. */
98# define btowc __btowc
99# define iswctype __iswctype
100# define wctype __wctype
101
c0f9ea08
SM
102# define WEAK_ALIAS(a,b) weak_alias (a, b)
103
104/* We are also using some library internals. */
105# include <locale/localeinfo.h>
106# include <locale/elem-hash.h>
107# include <langinfo.h>
108#else
109# define WEAK_ALIAS(a,b)
110#endif
111
4bb91c68 112/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 113#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
114# include <libintl.h>
115#else
116# define gettext(msgid) (msgid)
117#endif
118
5e69f11e
RM
119#ifndef gettext_noop
120/* This define is so xgettext can find the internationalizable
121 strings. */
0b32bf0e 122# define gettext_noop(String) String
5e69f11e
RM
123#endif
124
fa9a63c5
RM
125/* The `emacs' switch turns on certain matching commands
126 that make sense only in Emacs. */
127#ifdef emacs
128
0b32bf0e 129# include "lisp.h"
e5560ff7 130# include "character.h"
0b32bf0e 131# include "buffer.h"
b18215fc
RS
132
133/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 134# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 135
0b32bf0e 136# include "syntax.h"
0b32bf0e 137# include "category.h"
fa9a63c5 138
7689ef0b
EZ
139# ifdef malloc
140# undef malloc
141# endif
0b32bf0e 142# define malloc xmalloc
7689ef0b
EZ
143# ifdef realloc
144# undef realloc
145# endif
0b32bf0e 146# define realloc xrealloc
7689ef0b
EZ
147# ifdef free
148# undef free
149# endif
0b32bf0e 150# define free xfree
9abbd165 151
7814e705 152/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
153# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
154# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
155
156# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 157# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
158# define RE_STRING_CHAR(p, multibyte) \
159 (multibyte ? (STRING_CHAR (p)) : (*(p)))
160# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
161 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 162
4c0354d7 163# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 164
2afc21f5 165# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 166
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) and STR2 (which ends at END2).  */
bf216479
KH
170# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
171 do { \
02cb78b5 172 if (target_multibyte) \
bf216479
KH
173 { \
174 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
175 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
176 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 177 c = STRING_CHAR (dtemp); \
bf216479
KH
178 } \
179 else \
180 { \
181 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 182 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 183 } \
2d1675e4
SM
184 } while (0)
185
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
188# define GET_CHAR_AFTER(c, p, len) \
189 do { \
02cb78b5 190 if (target_multibyte) \
62a6e103 191 (c) = STRING_CHAR_AND_LENGTH (p, len); \
6fdd04b0
KH
192 else \
193 { \
cf9c99bc 194 (c) = *p; \
6fdd04b0 195 len = 1; \
cf9c99bc 196 (c) = RE_CHAR_TO_MULTIBYTE (c); \
6fdd04b0 197 } \
8f924df7 198 } while (0)
4e8a9132 199
fa9a63c5
RM
200#else /* not emacs */
201
202/* If we are not linking with Emacs proper,
203 we can't use the relocating allocator
204 even if config.h says that we can. */
0b32bf0e 205# undef REL_ALLOC
fa9a63c5 206
4004364e 207# include <unistd.h>
fa9a63c5 208
a77f947b
CY
209/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
210
/* Allocate SIZE bytes with malloc and return the result.  If the
   allocation fails for a nonzero SIZE, write a message to file
   descriptor 2 and exit with status 1 instead of returning.  A null
   result for a zero-byte request is passed back unchanged.  */
static void *
xmalloc (size_t size)
{
  void *val = malloc (size);

  /* Either we got memory, or nothing was asked for: not an error.  */
  if (val != NULL || size == 0)
    return val;

  write (2, "virtual memory exhausted\n", 25);
  exit (1);
}
222
/* Resize BLOCK to SIZE bytes and return the new address.  A null BLOCK
   is handled by calling malloc directly, since some reallocs do not
   accept a null pointer.  If the allocation fails for a nonzero SIZE,
   write a message to file descriptor 2 and exit with status 1.  */
static void *
xrealloc (void *block, size_t size)
{
  void *val = block ? realloc (block, size) : malloc (size);

  /* Either we got memory, or nothing was asked for: not an error.  */
  if (val != NULL || size == 0)
    return val;

  write (2, "virtual memory exhausted\n", 25);
  exit (1);
}
240
a073faa6
CY
241# ifdef malloc
242# undef malloc
243# endif
244# define malloc xmalloc
245# ifdef realloc
246# undef realloc
247# endif
248# define realloc xrealloc
249
f5d9e83a 250# include <stdbool.h>
9cfdb3ec 251# include <string.h>
fa9a63c5
RM
252
253/* Define the syntax stuff for \<, \>, etc. */
254
990b2375 255/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 256enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 257
e934739e 258/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
259# define CHAR_CHARSET(c) 0
260# define CHARSET_LEADING_CODE_BASE(c) 0
261# define MAX_MULTIBYTE_LENGTH 1
262# define RE_MULTIBYTE_P(x) 0
bf216479 263# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
264# define WORD_BOUNDARY_P(c1, c2) (0)
265# define CHAR_HEAD_P(p) (1)
266# define SINGLE_BYTE_CHAR_P(c) (1)
267# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 268# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 269# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
270# define STRING_CHAR(p) (*(p))
271# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 272# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
273# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
274# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
275# define RE_CHAR_TO_MULTIBYTE(c) (c)
276# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 277# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 278 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
279# define GET_CHAR_AFTER(c, p, len) \
280 (c = *p, len = 1)
0b32bf0e 281# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
282# define BYTE8_TO_CHAR(c) (c)
283# define CHAR_BYTE8_P(c) (0)
bf216479 284# define CHAR_LEADING_CODE(c) (c)
8f924df7 285
fa9a63c5 286#endif /* not emacs */
4e8a9132
SM
287
288#ifndef RE_TRANSLATE
0b32bf0e
SM
289# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
290# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 291#endif
fa9a63c5
RM
292\f
293/* Get the interface, including the syntax bits. */
294#include "regex.h"
295
f71b19b6
DL
296/* isalpha etc. are used for the character classes. */
297#include <ctype.h>
fa9a63c5 298
f71b19b6 299#ifdef emacs
fa9a63c5 300
f71b19b6 301/* 1 if C is an ASCII character. */
0b32bf0e 302# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 303
f71b19b6 304/* 1 if C is a unibyte character. */
0b32bf0e 305# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 306
f71b19b6 307/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 308
f71b19b6 309/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
310# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
311# define ISCNTRL(c) ((c) < ' ')
312# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
313 || ((c) >= 'a' && (c) <= 'f') \
314 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
315
316/* This is only used for single-byte characters. */
0b32bf0e 317# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
318
319/* The rest must handle multibyte characters. */
320
0b32bf0e 321# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 322 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
323 : 1)
324
14473664 325# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 326 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
327 : 1)
328
0b32bf0e 329# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
330 ? (((c) >= 'a' && (c) <= 'z') \
331 || ((c) >= 'A' && (c) <= 'Z') \
332 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
333 : SYNTAX (c) == Sword)
334
0b32bf0e 335# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
336 ? (((c) >= 'a' && (c) <= 'z') \
337 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
338 : SYNTAX (c) == Sword)
339
5da9919f 340# define ISLOWER(c) lowercasep (c)
96cc36cc 341
0b32bf0e 342# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
343 ? ((c) > ' ' && (c) < 0177 \
344 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
345 || ((c) >= 'A' && (c) <= 'Z') \
346 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
347 : SYNTAX (c) != Sword)
348
0b32bf0e 349# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 350
5da9919f 351# define ISUPPER(c) uppercasep (c)
96cc36cc 352
0b32bf0e 353# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
354
355#else /* not emacs */
356
f71b19b6 357/* 1 if C is an ASCII character. */
0b32bf0e 358# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
359
360/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
361# define ISUNIBYTE(c) 1
362
363# ifdef isblank
0e926e56 364# define ISBLANK(c) isblank (c)
0b32bf0e
SM
365# else
366# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
367# endif
368# ifdef isgraph
0e926e56 369# define ISGRAPH(c) isgraph (c)
0b32bf0e 370# else
0e926e56 371# define ISGRAPH(c) (isprint (c) && !isspace (c))
0b32bf0e
SM
372# endif
373
0e926e56 374/* Solaris defines ISPRINT so we must undefine it first. */
4bb91c68 375# undef ISPRINT
0e926e56
PE
376# define ISPRINT(c) isprint (c)
377# define ISDIGIT(c) isdigit (c)
378# define ISALNUM(c) isalnum (c)
379# define ISALPHA(c) isalpha (c)
380# define ISCNTRL(c) iscntrl (c)
381# define ISLOWER(c) islower (c)
382# define ISPUNCT(c) ispunct (c)
383# define ISSPACE(c) isspace (c)
384# define ISUPPER(c) isupper (c)
385# define ISXDIGIT(c) isxdigit (c)
0b32bf0e 386
5e617bc2 387# define ISWORD(c) ISALPHA (c)
0b32bf0e 388
4bb91c68 389# ifdef _tolower
5e617bc2 390# define TOLOWER(c) _tolower (c)
4bb91c68 391# else
5e617bc2 392# define TOLOWER(c) tolower (c)
4bb91c68
SM
393# endif
394
395/* How many characters in the character set. */
396# define CHAR_SET_SIZE 256
397
0b32bf0e 398# ifdef SYNTAX_TABLE
f71b19b6 399
0b32bf0e 400extern char *re_syntax_table;
f71b19b6 401
0b32bf0e
SM
402# else /* not SYNTAX_TABLE */
403
0b32bf0e
SM
404static char re_syntax_table[CHAR_SET_SIZE];
405
406static void
d2762c86 407init_syntax_once (void)
0b32bf0e
SM
408{
409 register int c;
410 static int done = 0;
411
412 if (done)
413 return;
414
72af86bd 415 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 416
4bb91c68
SM
417 for (c = 0; c < CHAR_SET_SIZE; ++c)
418 if (ISALNUM (c))
419 re_syntax_table[c] = Sword;
fa9a63c5 420
669fa600 421 re_syntax_table['_'] = Ssymbol;
fa9a63c5 422
0b32bf0e
SM
423 done = 1;
424}
425
426# endif /* not SYNTAX_TABLE */
96cc36cc 427
4bb91c68
SM
428# define SYNTAX(c) re_syntax_table[(c)]
429
96cc36cc
RS
430#endif /* not emacs */
431\f
261cb4bb 432#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
433\f
434/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
435 use `alloca' instead of `malloc'. This is because using malloc in
436 re_search* or re_match* could cause memory leaks when C-g is used in
437 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
438 the other hand, malloc is more portable, and easier to debug.
439
fa9a63c5
RM
440 Because we sometimes use alloca, some routines have to be macros,
441 not functions -- `alloca'-allocated space disappears at the end of the
442 function it is called in. */
443
444#ifdef REGEX_MALLOC
445
0b32bf0e
SM
446# define REGEX_ALLOCATE malloc
447# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
448# define REGEX_FREE free
fa9a63c5
RM
449
450#else /* not REGEX_MALLOC */
451
452/* Emacs already defines alloca, sometimes. */
0b32bf0e 453# ifndef alloca
fa9a63c5
RM
454
455/* Make alloca work the best possible way. */
0b32bf0e
SM
456# ifdef __GNUC__
457# define alloca __builtin_alloca
458# else /* not __GNUC__ */
7f585e7a 459# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
460# include <alloca.h>
461# endif /* HAVE_ALLOCA_H */
462# endif /* not __GNUC__ */
fa9a63c5 463
0b32bf0e 464# endif /* not alloca */
fa9a63c5 465
0b32bf0e 466# define REGEX_ALLOCATE alloca
fa9a63c5
RM
467
468/* Assumes a `char *destination' variable. */
0b32bf0e 469# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 470 (destination = (char *) alloca (nsize), \
4bb91c68 471 memcpy (destination, source, osize))
fa9a63c5
RM
472
473/* No need to do anything to free, after alloca. */
0b32bf0e 474# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
475
476#endif /* not REGEX_MALLOC */
477
478/* Define how to allocate the failure stack. */
479
0b32bf0e 480#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 481
0b32bf0e 482# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 483 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 484# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 485 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 486# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
487 r_alloc_free (&failure_stack_ptr)
488
4297555e 489#else /* not using relocating allocator */
fa9a63c5 490
0b32bf0e 491# ifdef REGEX_MALLOC
fa9a63c5 492
0b32bf0e
SM
493# define REGEX_ALLOCATE_STACK malloc
494# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
495# define REGEX_FREE_STACK free
fa9a63c5 496
0b32bf0e 497# else /* not REGEX_MALLOC */
fa9a63c5 498
0b32bf0e 499# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 500
0b32bf0e 501# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 502 REGEX_REALLOCATE (source, osize, nsize)
7814e705 503/* No need to explicitly free anything. */
0b32bf0e 504# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 505
0b32bf0e 506# endif /* not REGEX_MALLOC */
4297555e 507#endif /* not using relocating allocator */
fa9a63c5
RM
508
509
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
25fe55af 513#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
514 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
515
516/* (Re)Allocate N items of type T using malloc, or fail. */
517#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
518#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
fa9a63c5
RM
519#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
520
4bb91c68 521#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
522
523#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
524
525#undef MAX
526#undef MIN
527#define MAX(a, b) ((a) > (b) ? (a) : (b))
528#define MIN(a, b) ((a) < (b) ? (a) : (b))
529
66f0296e 530/* Type of source-pattern and string chars. */
a6fc3b5c
EZ
531#ifdef _MSC_VER
532typedef unsigned char re_char;
533#else
66f0296e 534typedef const unsigned char re_char;
a6fc3b5c 535#endif
66f0296e 536
fa9a63c5 537typedef char boolean;
fa9a63c5 538
261cb4bb
PE
539static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
540 re_char *string1, size_t size1,
541 re_char *string2, size_t size2,
542 ssize_t pos,
543 struct re_registers *regs,
544 ssize_t stop);
fa9a63c5
RM
545\f
546/* These are the command codes that appear in compiled regular
4bb91c68 547 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
548 command code can specify any interpretation whatsoever for its
549 arguments. Zero bytes may appear in the compiled regular expression. */
550
551typedef enum
552{
553 no_op = 0,
554
4bb91c68 555 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
556 succeed,
557
25fe55af 558 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
559 exactn,
560
25fe55af 561 /* Matches any (more or less) character. */
fa9a63c5
RM
562 anychar,
563
25fe55af
RS
564 /* Matches any one char belonging to specified set. First
565 following byte is number of bitmap bytes. Then come bytes
566 for a bitmap saying which chars are in. Bits in each byte
567 are ordered low-bit-first. A character is in the set if its
568 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
569 automatically not in the set.
570
571 If the length byte has the 0x80 bit set, then that stuff
572 is followed by a range table:
573 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 574 See RANGE_TABLE_WORK_BITS below.
	   2 bytes, the number of pairs that follow (up to 32767)
96cc36cc 576 pairs, each 2 multibyte characters,
0b32bf0e 577 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
578 charset,
579
25fe55af 580 /* Same parameters as charset, but match any character that is
4bb91c68 581 not one of those specified. */
fa9a63c5
RM
582 charset_not,
583
25fe55af
RS
584 /* Start remembering the text that is matched, for storing in a
585 register. Followed by one byte with the register number, in
586 the range 0 to one less than the pattern buffer's re_nsub
505bde11 587 field. */
fa9a63c5
RM
588 start_memory,
589
25fe55af
RS
590 /* Stop remembering the text that is matched and store it in a
591 memory register. Followed by one byte with the register
592 number, in the range 0 to one less than `re_nsub' in the
505bde11 593 pattern buffer. */
fa9a63c5
RM
594 stop_memory,
595
25fe55af 596 /* Match a duplicate of something remembered. Followed by one
4bb91c68 597 byte containing the register number. */
fa9a63c5
RM
598 duplicate,
599
25fe55af 600 /* Fail unless at beginning of line. */
fa9a63c5
RM
601 begline,
602
4bb91c68 603 /* Fail unless at end of line. */
fa9a63c5
RM
604 endline,
605
25fe55af
RS
606 /* Succeeds if at beginning of buffer (if emacs) or at beginning
607 of string to be matched (if not). */
fa9a63c5
RM
608 begbuf,
609
25fe55af 610 /* Analogously, for end of buffer/string. */
fa9a63c5 611 endbuf,
5e69f11e 612
25fe55af 613 /* Followed by two byte relative address to which to jump. */
5e69f11e 614 jump,
fa9a63c5 615
25fe55af 616 /* Followed by two-byte relative address of place to resume at
7814e705 617 in case of failure. */
fa9a63c5 618 on_failure_jump,
5e69f11e 619
25fe55af
RS
620 /* Like on_failure_jump, but pushes a placeholder instead of the
621 current string position when executed. */
fa9a63c5 622 on_failure_keep_string_jump,
5e69f11e 623
505bde11
SM
624 /* Just like `on_failure_jump', except that it checks that we
625 don't get stuck in an infinite loop (matching an empty string
626 indefinitely). */
627 on_failure_jump_loop,
628
0683b6fa
SM
629 /* Just like `on_failure_jump_loop', except that it checks for
630 a different kind of loop (the kind that shows up with non-greedy
631 operators). This operation has to be immediately preceded
632 by a `no_op'. */
633 on_failure_jump_nastyloop,
634
0b32bf0e 635 /* A smart `on_failure_jump' used for greedy * and + operators.
c7015153 636 It analyzes the loop before which it is put and if the
505bde11 637 loop does not require backtracking, it changes itself to
4e8a9132
SM
638 `on_failure_keep_string_jump' and short-circuits the loop,
639 else it just defaults to changing itself into `on_failure_jump'.
640 It assumes that it is pointing to just past a `jump'. */
505bde11 641 on_failure_jump_smart,
fa9a63c5 642
25fe55af 643 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
644 After matching N times, jump to the address upon failure.
645 Does not work if N starts at 0: use on_failure_jump_loop
646 instead. */
fa9a63c5
RM
647 succeed_n,
648
25fe55af
RS
649 /* Followed by two-byte relative address, and two-byte number n.
650 Jump to the address N times, then fail. */
fa9a63c5
RM
651 jump_n,
652
25fe55af 653 /* Set the following two-byte relative address to the
7814e705 654 subsequent two-byte number. The address *includes* the two
25fe55af 655 bytes of number. */
fa9a63c5
RM
656 set_number_at,
657
fa9a63c5
RM
658 wordbeg, /* Succeeds if at word beginning. */
659 wordend, /* Succeeds if at word end. */
660
661 wordbound, /* Succeeds if at a word boundary. */
7814e705 662 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 663
669fa600
SM
664 symbeg, /* Succeeds if at symbol beginning. */
665 symend, /* Succeeds if at symbol end. */
666
fa9a63c5 667 /* Matches any character whose syntax is specified. Followed by
25fe55af 668 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
669 syntaxspec,
670
671 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
672 notsyntaxspec
673
674#ifdef emacs
675 ,before_dot, /* Succeeds if before point. */
676 at_dot, /* Succeeds if at point. */
677 after_dot, /* Succeeds if after point. */
b18215fc
RS
678
679 /* Matches any character whose category-set contains the specified
7814e705
JB
680 category. The operator is followed by a byte which contains a
681 category code (mnemonic ASCII character). */
b18215fc
RS
682 categoryspec,
683
684 /* Matches any character whose category-set does not contain the
685 specified category. The operator is followed by a byte which
686 contains the category code (mnemonic ASCII character). */
687 notcategoryspec
fa9a63c5
RM
688#endif /* emacs */
689} re_opcode_t;
690\f
691/* Common operations on the compiled pattern. */
692
693/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
694
695#define STORE_NUMBER(destination, number) \
696 do { \
697 (destination)[0] = (number) & 0377; \
698 (destination)[1] = (number) >> 8; \
699 } while (0)
700
701/* Same as STORE_NUMBER, except increment DESTINATION to
702 the byte after where the number is stored. Therefore, DESTINATION
703 must be an lvalue. */
704
705#define STORE_NUMBER_AND_INCR(destination, number) \
706 do { \
707 STORE_NUMBER (destination, number); \
708 (destination) += 2; \
709 } while (0)
710
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte holds the low 8 bits; the second byte is
   sign-extended and supplies the high bits, so the result may be
   negative.  The high byte is scaled by multiplying by 256 rather than
   by shifting, because left-shifting a negative value is undefined.
   SOURCE is evaluated twice.  */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
719
720#ifdef DEBUG
721static void
261cb4bb 722extract_number (int *dest, re_char *source)
fa9a63c5 723{
5e69f11e 724 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
725 *dest = *source & 0377;
726 *dest += temp << 8;
727}
728
4bb91c68 729# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
730# undef EXTRACT_NUMBER
731# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
732# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
733
734#endif /* DEBUG */
735
736/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
737 SOURCE must be an lvalue. */
738
739#define EXTRACT_NUMBER_AND_INCR(destination, source) \
740 do { \
741 EXTRACT_NUMBER (destination, source); \
25fe55af 742 (source) += 2; \
fa9a63c5
RM
743 } while (0)
744
745#ifdef DEBUG
746static void
261cb4bb 747extract_number_and_incr (int *destination, re_char **source)
5e69f11e 748{
fa9a63c5
RM
749 extract_number (destination, *source);
750 *source += 2;
751}
752
0b32bf0e
SM
753# ifndef EXTRACT_MACROS
754# undef EXTRACT_NUMBER_AND_INCR
755# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 756 extract_number_and_incr (&dest, &src)
0b32bf0e 757# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
758
759#endif /* DEBUG */
760\f
b18215fc
RS
/* Store a multibyte character in three contiguous bytes starting at
   DESTINATION, and increment DESTINATION to the byte after where the
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
b18215fc
RS
764
765#define STORE_CHARACTER_AND_INCR(destination, character) \
766 do { \
767 (destination)[0] = (character) & 0377; \
768 (destination)[1] = ((character) >> 8) & 0377; \
769 (destination)[2] = (character) >> 16; \
770 (destination) += 3; \
771 } while (0)
772
773/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 774 starting at SOURCE. */
b18215fc
RS
775
776#define EXTRACT_CHARACTER(destination, source) \
777 do { \
778 (destination) = ((source)[0] \
779 | ((source)[1] << 8) \
780 | ((source)[2] << 16)); \
781 } while (0)
782
783
784/* Macros for charset. */
785
786/* Size of bitmap of charset P in bytes. P is a start of charset,
787 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
788#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
789
790/* Nonzero if charset P has range table. */
25fe55af 791#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
792
/* Return the address of the range table of charset P.  This is not the
   start of the range entries themselves, but the point just before
   them where the number of ranges is stored.  `4 +' means to skip the
   re_opcode_t, the size of bitmap, and the 2 bytes of flags at the
   start of the range table.  */
797#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
798
799/* Extract the bit flags that start a range table. */
800#define CHARSET_RANGE_TABLE_BITS(p) \
801 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
802 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc 803
b18215fc 804/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
805 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
806 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
807 and end. */
808#define CHARSET_RANGE_TABLE_END(range_table, count) \
809 ((range_table) + (count) * 2 * 3)
810
7814e705 811/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
812 COUNT is number of ranges in RANGE_TABLE. */
813#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
814 do \
815 { \
01618498 816 re_wchar_t range_start, range_end; \
19ed5445 817 re_char *rtp; \
01618498 818 re_char *range_table_end \
b18215fc
RS
819 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
820 \
19ed5445 821 for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
b18215fc 822 { \
19ed5445
PE
823 EXTRACT_CHARACTER (range_start, rtp); \
824 EXTRACT_CHARACTER (range_end, rtp + 3); \
b18215fc
RS
825 \
826 if (range_start <= (c) && (c) <= range_end) \
827 { \
828 (not) = !(not); \
829 break; \
830 } \
831 } \
832 } \
833 while (0)
834
835/* Test if C is in range table of CHARSET. The flag NOT is negated if
836 C is listed in it. */
837#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
838 do \
839 { \
840 /* Number of ranges in range table. */ \
841 int count; \
01618498
SM
842 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
843 \
b18215fc
RS
844 EXTRACT_NUMBER_AND_INCR (count, range_table); \
845 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
846 } \
847 while (0)
848\f
fa9a63c5
RM
849/* If DEBUG is defined, Regex prints many voluminous messages about what
850 it is doing (if the variable `debug' is nonzero). If linked with the
851 main program in `iregex.c', you can enter patterns and strings
852 interactively. And if linked with the main program in `main.c' and
4bb91c68 853 the other test files, you can run the already-written tests. */
fa9a63c5
RM
854
855#ifdef DEBUG
856
/* We use standard I/O for debugging.  */
# include <stdio.h>

/* It is useful to test things that ``must'' be true when debugging.  */
# include <assert.h>

/* The printing macros below produce output only while this is
   positive.  */
static int debug = -100000;

/* Each printing macro is wrapped in `do { ... } while (0)' so that it
   acts as a single statement.  A bare `if (debug > 0) ...' would
   silently capture an `else' that follows the macro call.  */
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x)						\
  do { if (debug > 0) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2)						\
  do { if (debug > 0) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3)					\
  do { if (debug > 0) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4)					\
  do { if (debug > 0) printf (x1, x2, x3, x4); } while (0)
/* The first argument P is accepted but not used.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
  do { if (debug > 0) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
  do { if (debug > 0) print_double_string (w, s1, sz1, s2, sz2); } while (0)
fa9a63c5
RM
874
875
876/* Print the fastmap in human-readable form. */
877
878void
879print_fastmap (fastmap)
880 char *fastmap;
881{
882 unsigned was_a_range = 0;
5e69f11e
RM
883 unsigned i = 0;
884
fa9a63c5
RM
885 while (i < (1 << BYTEWIDTH))
886 {
887 if (fastmap[i++])
888 {
889 was_a_range = 0;
25fe55af
RS
890 putchar (i - 1);
891 while (i < (1 << BYTEWIDTH) && fastmap[i])
892 {
893 was_a_range = 1;
894 i++;
895 }
fa9a63c5 896 if (was_a_range)
25fe55af
RS
897 {
898 printf ("-");
899 putchar (i - 1);
900 }
901 }
fa9a63c5 902 }
5e69f11e 903 putchar ('\n');
fa9a63c5
RM
904}
905
906
907/* Print a compiled pattern string in human-readable form, starting at
908 the START pointer into it and ending just before the pointer END. */
909
910void
911print_partial_compiled_pattern (start, end)
01618498
SM
912 re_char *start;
913 re_char *end;
fa9a63c5
RM
914{
915 int mcnt, mcnt2;
01618498
SM
916 re_char *p = start;
917 re_char *pend = end;
fa9a63c5
RM
918
919 if (start == NULL)
920 {
a1a052df 921 fprintf (stderr, "(null)\n");
fa9a63c5
RM
922 return;
923 }
5e69f11e 924
fa9a63c5
RM
925 /* Loop over pattern commands. */
926 while (p < pend)
927 {
a1a052df 928 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
929
930 switch ((re_opcode_t) *p++)
931 {
25fe55af 932 case no_op:
a1a052df 933 fprintf (stderr, "/no_op");
25fe55af 934 break;
fa9a63c5 935
99633e97 936 case succeed:
a1a052df 937 fprintf (stderr, "/succeed");
99633e97
SM
938 break;
939
fa9a63c5
RM
940 case exactn:
941 mcnt = *p++;
a1a052df 942 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 943 do
fa9a63c5 944 {
a1a052df 945 fprintf (stderr, "/%c", *p++);
25fe55af
RS
946 }
947 while (--mcnt);
948 break;
fa9a63c5
RM
949
950 case start_memory:
a1a052df 951 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 952 break;
fa9a63c5
RM
953
954 case stop_memory:
a1a052df 955 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 956 break;
fa9a63c5
RM
957
958 case duplicate:
a1a052df 959 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
960 break;
961
962 case anychar:
a1a052df 963 fprintf (stderr, "/anychar");
fa9a63c5
RM
964 break;
965
966 case charset:
25fe55af
RS
967 case charset_not:
968 {
969 register int c, last = -100;
fa9a63c5 970 register int in_range = 0;
99633e97
SM
971 int length = CHARSET_BITMAP_SIZE (p - 1);
972 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 973
a1a052df 974 fprintf (stderr, "/charset [%s",
839966f3 975 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 976
839966f3
KH
977 if (p + *p >= pend)
978 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 979
25fe55af 980 for (c = 0; c < 256; c++)
96cc36cc 981 if (c / 8 < length
fa9a63c5
RM
982 && (p[1 + (c/8)] & (1 << (c % 8))))
983 {
984 /* Are we starting a range? */
985 if (last + 1 == c && ! in_range)
986 {
a1a052df 987 fprintf (stderr, "-");
fa9a63c5
RM
988 in_range = 1;
989 }
990 /* Have we broken a range? */
991 else if (last + 1 != c && in_range)
96cc36cc 992 {
a1a052df 993 fprintf (stderr, "%c", last);
fa9a63c5
RM
994 in_range = 0;
995 }
5e69f11e 996
fa9a63c5 997 if (! in_range)
a1a052df 998 fprintf (stderr, "%c", c);
fa9a63c5
RM
999
1000 last = c;
25fe55af 1001 }
fa9a63c5
RM
1002
1003 if (in_range)
a1a052df 1004 fprintf (stderr, "%c", last);
fa9a63c5 1005
a1a052df 1006 fprintf (stderr, "]");
fa9a63c5 1007
99633e97 1008 p += 1 + length;
96cc36cc 1009
96cc36cc 1010 if (has_range_table)
99633e97
SM
1011 {
1012 int count;
a1a052df 1013 fprintf (stderr, "has-range-table");
99633e97
SM
1014
1015 /* ??? Should print the range table; for now, just skip it. */
1016 p += 2; /* skip range table bits */
1017 EXTRACT_NUMBER_AND_INCR (count, p);
1018 p = CHARSET_RANGE_TABLE_END (p, count);
1019 }
fa9a63c5
RM
1020 }
1021 break;
1022
1023 case begline:
a1a052df 1024 fprintf (stderr, "/begline");
25fe55af 1025 break;
fa9a63c5
RM
1026
1027 case endline:
a1a052df 1028 fprintf (stderr, "/endline");
25fe55af 1029 break;
fa9a63c5
RM
1030
1031 case on_failure_jump:
25fe55af 1032 extract_number_and_incr (&mcnt, &p);
a1a052df 1033 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1034 break;
fa9a63c5
RM
1035
1036 case on_failure_keep_string_jump:
25fe55af 1037 extract_number_and_incr (&mcnt, &p);
a1a052df 1038 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1039 break;
fa9a63c5 1040
0683b6fa
SM
1041 case on_failure_jump_nastyloop:
1042 extract_number_and_incr (&mcnt, &p);
a1a052df 1043 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1044 break;
1045
505bde11 1046 case on_failure_jump_loop:
fa9a63c5 1047 extract_number_and_incr (&mcnt, &p);
a1a052df 1048 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1049 break;
1050
505bde11 1051 case on_failure_jump_smart:
fa9a63c5 1052 extract_number_and_incr (&mcnt, &p);
a1a052df 1053 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1054 break;
1055
25fe55af 1056 case jump:
fa9a63c5 1057 extract_number_and_incr (&mcnt, &p);
a1a052df 1058 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1059 break;
1060
25fe55af
RS
1061 case succeed_n:
1062 extract_number_and_incr (&mcnt, &p);
1063 extract_number_and_incr (&mcnt2, &p);
a1a052df 1064 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1065 break;
5e69f11e 1066
25fe55af
RS
1067 case jump_n:
1068 extract_number_and_incr (&mcnt, &p);
1069 extract_number_and_incr (&mcnt2, &p);
a1a052df 1070 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1071 break;
5e69f11e 1072
25fe55af
RS
1073 case set_number_at:
1074 extract_number_and_incr (&mcnt, &p);
1075 extract_number_and_incr (&mcnt2, &p);
a1a052df 1076 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1077 break;
5e69f11e 1078
25fe55af 1079 case wordbound:
a1a052df 1080 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1081 break;
1082
1083 case notwordbound:
a1a052df 1084 fprintf (stderr, "/notwordbound");
25fe55af 1085 break;
fa9a63c5
RM
1086
1087 case wordbeg:
a1a052df 1088 fprintf (stderr, "/wordbeg");
fa9a63c5 1089 break;
5e69f11e 1090
fa9a63c5 1091 case wordend:
a1a052df 1092 fprintf (stderr, "/wordend");
e2543b02 1093 break;
5e69f11e 1094
669fa600 1095 case symbeg:
e2543b02 1096 fprintf (stderr, "/symbeg");
669fa600
SM
1097 break;
1098
1099 case symend:
e2543b02 1100 fprintf (stderr, "/symend");
669fa600 1101 break;
5e69f11e 1102
1fb352e0 1103 case syntaxspec:
a1a052df 1104 fprintf (stderr, "/syntaxspec");
1fb352e0 1105 mcnt = *p++;
a1a052df 1106 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1107 break;
1108
1109 case notsyntaxspec:
a1a052df 1110 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1111 mcnt = *p++;
a1a052df 1112 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1113 break;
1114
0b32bf0e 1115# ifdef emacs
fa9a63c5 1116 case before_dot:
a1a052df 1117 fprintf (stderr, "/before_dot");
25fe55af 1118 break;
fa9a63c5
RM
1119
1120 case at_dot:
a1a052df 1121 fprintf (stderr, "/at_dot");
25fe55af 1122 break;
fa9a63c5
RM
1123
1124 case after_dot:
a1a052df 1125 fprintf (stderr, "/after_dot");
25fe55af 1126 break;
fa9a63c5 1127
1fb352e0 1128 case categoryspec:
a1a052df 1129 fprintf (stderr, "/categoryspec");
fa9a63c5 1130 mcnt = *p++;
a1a052df 1131 fprintf (stderr, "/%d", mcnt);
25fe55af 1132 break;
5e69f11e 1133
1fb352e0 1134 case notcategoryspec:
a1a052df 1135 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1136 mcnt = *p++;
a1a052df 1137 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1138 break;
0b32bf0e 1139# endif /* emacs */
fa9a63c5 1140
fa9a63c5 1141 case begbuf:
a1a052df 1142 fprintf (stderr, "/begbuf");
25fe55af 1143 break;
fa9a63c5
RM
1144
1145 case endbuf:
a1a052df 1146 fprintf (stderr, "/endbuf");
25fe55af 1147 break;
fa9a63c5 1148
25fe55af 1149 default:
a1a052df 1150 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1151 }
1152
a1a052df 1153 fprintf (stderr, "\n");
fa9a63c5
RM
1154 }
1155
a1a052df 1156 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1157}
1158
1159
1160void
1161print_compiled_pattern (bufp)
1162 struct re_pattern_buffer *bufp;
1163{
01618498 1164 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1165
1166 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1167 printf ("%ld bytes used/%ld bytes allocated.\n",
1168 bufp->used, bufp->allocated);
fa9a63c5
RM
1169
1170 if (bufp->fastmap_accurate && bufp->fastmap)
1171 {
1172 printf ("fastmap: ");
1173 print_fastmap (bufp->fastmap);
1174 }
1175
1176 printf ("re_nsub: %d\t", bufp->re_nsub);
1177 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1178 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1179 printf ("no_sub: %d\t", bufp->no_sub);
1180 printf ("not_bol: %d\t", bufp->not_bol);
1181 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1182 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1183 fflush (stdout);
fa9a63c5
RM
1184 /* Perhaps we should print the translate table? */
1185}
1186
1187
1188void
1189print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1190 re_char *where;
1191 re_char *string1;
1192 re_char *string2;
d1dfb56c
EZ
1193 ssize_t size1;
1194 ssize_t size2;
fa9a63c5 1195{
d1dfb56c 1196 ssize_t this_char;
5e69f11e 1197
fa9a63c5
RM
1198 if (where == NULL)
1199 printf ("(null)");
1200 else
1201 {
1202 if (FIRST_STRING_P (where))
25fe55af
RS
1203 {
1204 for (this_char = where - string1; this_char < size1; this_char++)
1205 putchar (string1[this_char]);
fa9a63c5 1206
25fe55af
RS
1207 where = string2;
1208 }
fa9a63c5
RM
1209
1210 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1211 putchar (string2[this_char]);
fa9a63c5
RM
1212 }
1213}
1214
1215#else /* not DEBUG */
1216
0b32bf0e
SM
/* Without DEBUG, assertions and all debugging hooks expand to
   nothing.  */
# undef assert
# define assert(expr)

# define DEBUG_PRINT_DOUBLE_STRING(where, str1, len1, str2, len2)
# define DEBUG_PRINT_COMPILED_PATTERN(buf, from, to)
# define DEBUG_PRINT4(fmt, a1, a2, a3)
# define DEBUG_PRINT3(fmt, a1, a2)
# define DEBUG_PRINT2(fmt, a1)
# define DEBUG_PRINT1(fmt)
# define DEBUG_STATEMENT(stmt)
fa9a63c5
RM
1227
1228#endif /* not DEBUG */
1229\f
4da60324
PE
/* IF_LINT (Code) expands to Code only when `lint' is defined; use it
   to suppress gcc's `...may be used before initialized' warnings.  */
#ifndef lint
# define IF_LINT(Code) /* empty */
#else
# define IF_LINT(Code) Code
#endif
1236\f
fa9a63c5
RM
1237/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1238 also be assigned to arbitrarily: each pattern buffer stores its own
1239 syntax, so it can be changed between regex compilations. */
1240/* This has no initializer because initialized variables in Emacs
1241 become read-only after dumping. */
1242reg_syntax_t re_syntax_options;
1243
1244
1245/* Specify the precise syntax of regexps for compilation. This provides
1246 for compatibility for various utilities which historically have
1247 different, incompatible syntaxes.
1248
1249 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1250 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1251
1252reg_syntax_t
971de7fb 1253re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1254{
1255 reg_syntax_t ret = re_syntax_options;
5e69f11e 1256
fa9a63c5
RM
1257 re_syntax_options = syntax;
1258 return ret;
1259}
c0f9ea08 1260WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1261
1262/* Regexp to use to replace spaces, or NULL meaning don't. */
1263static re_char *whitespace_regexp;
1264
1265void
971de7fb 1266re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1267{
6470ea05 1268 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1269}
1270WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1271\f
1272/* This table gives an error message for each of the error codes listed
4bb91c68 1273 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1274 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1275 but why not be nice? */
fa9a63c5
RM
1276
1277static const char *re_error_msgid[] =
5e69f11e
RM
1278 {
1279 gettext_noop ("Success"), /* REG_NOERROR */
1280 gettext_noop ("No match"), /* REG_NOMATCH */
1281 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1282 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1283 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1284 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1285 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1286 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1287 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1288 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1289 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1290 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1291 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1292 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1293 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1294 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1295 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1296 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1297 };
1298\f
4bb91c68 1299/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1300
1301/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1302 searching and matching functions should not call alloca. On some
1303 systems, alloca is implemented in terms of malloc, and if we're
1304 using the relocating allocator routines, then malloc could cause a
1305 relocation, which might (if the strings being searched are in the
1306 ralloc heap) shift the data out from underneath the regexp
1307 routines.
1308
5e69f11e 1309 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1310 processes input from X in a signal handler; processing X input may
1311 call malloc; if input arrives while a matching routine is calling
1312 malloc, then we're scrod. But Emacs can't just block input while
1313 calling matching routines; then we don't notice interrupts when
1314 they come in. So, Emacs blocks input around all regexp calls
1315 except the matching calls, which it leaves unprotected, in the
1316 faith that they will not malloc. */
1317
/* MATCH_MAY_ALLOCATE is normally defined.  It is left undefined when
   the match routines (1) would allocate with malloc and (2) cannot
   safely use malloc, that is, when both REGEX_MALLOC and emacs are
   defined.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
#undef MATCH_MAY_ALLOCATE
#if !(defined REGEX_MALLOC && defined emacs)
# define MATCH_MAY_ALLOCATE
#endif
1329
1330\f
1331/* Failure stack declarations and macros; both re_compile_fastmap and
1332 re_match_2 use a failure stack. These have to be macros because of
1333 REGEX_ALLOCATE_STACK. */
5e69f11e 1334
fa9a63c5 1335
320a2a73 1336/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1337 when matching. If this number is exceeded, we allocate more
1338 space, so it is not a hard limit. */
1339#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1340# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1341#endif
1342
1343/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1344 exactly that if we always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1345 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1346 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1347 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
# ifndef MATCH_MAY_ALLOCATE
size_t re_max_failures = 4000;
# else
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
size_t re_max_failures = 40000;
# endif
RM
1357
1358union fail_stack_elt
1359{
01618498 1360 re_char *pointer;
c0f9ea08
SM
1361 /* This should be the biggest `int' that's no bigger than a pointer. */
1362 long integer;
fa9a63c5
RM
1363};
1364
1365typedef union fail_stack_elt fail_stack_elt_t;
1366
1367typedef struct
1368{
1369 fail_stack_elt_t *stack;
c0f9ea08
SM
1370 size_t size;
1371 size_t avail; /* Offset of next open position. */
1372 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1373} fail_stack_type;
1374
505bde11 1375#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1376
1377
1378/* Define macros to initialize and free the failure stack.
1379 Do `return -2' if the alloc fails. */
1380
#ifdef MATCH_MAY_ALLOCATE
/* Allocate the initial failure stack and mark it empty.  Does
   `return -2' from the enclosing function if the allocation fails.  */
# define INIT_FAIL_STACK()						\
  do									\
    {									\
      fail_stack.stack =						\
	REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE	\
			      * sizeof (fail_stack_elt_t));		\
      if (!fail_stack.stack)						\
	return -2;							\
									\
      fail_stack.frame = 0;						\
      fail_stack.avail = 0;						\
      fail_stack.size = INIT_FAILURE_ALLOC;				\
    }									\
  while (0)
#else
/* No allocation happens here; only mark the stack empty.  */
# define INIT_FAIL_STACK()						\
  do									\
    {									\
      fail_stack.frame = 0;						\
      fail_stack.avail = 0;						\
    }									\
  while (0)

/* Make ADDR point to room for N objects of type T: allocate afresh
   when ADDR is null, otherwise resize what it points to.  */
# define RETALLOC_IF(addr, n, t) \
  if (!(addr)) (addr) = TALLOC ((n), t); else RETALLOC ((addr), (n), t)
#endif
1405
1406
320a2a73
KH
1407/* Double the size of FAIL_STACK, up to a limit
1408 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1409
1410 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1411 allocating space for it or it was already too large.
1412
4bb91c68 1413 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1414
320a2a73
KH
1415/* Factor to increase the failure stack size by
1416 when we increase it.
1417 This used to be 2, but 2 was too wasteful
1418 because the old discarded stacks added up to as much space
1419 as the ultimate, maximum-size stack. */
1420#define FAIL_STACK_GROWTH_FACTOR 4
1421
/* Enlarge FAIL_STACK by FAIL_STACK_GROWTH_FACTOR, capped at
   re_max_failures * TYPICAL_FAILURE_SIZE bytes.  The value is 1 when
   the stack grew and 0 when it had already reached the cap or the
   reallocation yielded NULL.  */
#define GROW_FAIL_STACK(fail_stack)					\
  (!((fail_stack).size * sizeof (fail_stack_elt_t)			\
     >= re_max_failures * TYPICAL_FAILURE_SIZE)				\
   /* Still below the cap: reallocate to the new size in bytes.  */	\
   && ((fail_stack).stack						\
       = REGEX_REALLOCATE_STACK ((fail_stack).stack,			\
	   (fail_stack).size * sizeof (fail_stack_elt_t),		\
	   MIN (re_max_failures * TYPICAL_FAILURE_SIZE,			\
		((fail_stack).size * sizeof (fail_stack_elt_t)		\
		 * FAIL_STACK_GROWTH_FACTOR))),				\
       (fail_stack).stack != NULL)					\
   /* Reallocation worked: record the new capacity in slots.  */	\
   && ((fail_stack).size						\
       = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE,			\
	       ((fail_stack).size * sizeof (fail_stack_elt_t)		\
		* FAIL_STACK_GROWTH_FACTOR))				\
	  / sizeof (fail_stack_elt_t)),					\
       1))
fa9a63c5
RM
1441
1442
fa9a63c5
RM
/* Every expansion below is fully parenthesized, so each macro can be
   used as an operand of a larger expression.  */

/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_POINTER(item)					\
  (fail_stack.stack[fail_stack.avail++].pointer = (item))

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_INT(item)					\
  (fail_stack.stack[fail_stack.avail++].integer = (item))

/* These POP... operations complement the PUSH... operations.
   All assume that `fail_stack' is nonempty.  */
#define POP_FAILURE_POINTER() (fail_stack.stack[--fail_stack.avail].pointer)
#define POP_FAILURE_INT() (fail_stack.stack[--fail_stack.avail].integer)

/* Individual items aside from the registers.  */
#define NUM_NONREG_ITEMS 3

/* Used to examine the stack (to detect infinite loops).  H is a frame
   handle: the stack offset just past the frame's three fixed items.  */
#define FAILURE_PAT(h) (fail_stack.stack[(h) - 1].pointer)
#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
#define NEXT_FAILURE_HANDLE(h) (fail_stack.stack[(h) - 3].integer)
#define TOP_FAILURE_HANDLE() (fail_stack.frame)
fa9a63c5
RM
1468
1469
505bde11
SM
/* Grow the failure stack until more than SPACE slots are free.  Does
   `return -2' from the enclosing function when the stack cannot grow.
   The loop sits inside `do { ... } while (0)' so that the macro plus
   its trailing semicolon forms exactly one statement.  */
#define ENSURE_FAIL_STACK(space)					\
do {									\
  while (REMAINING_AVAIL_SLOTS <= (space))				\
    {									\
      if (!GROW_FAIL_STACK (fail_stack))				\
	return -2;							\
      /* The sizes are size_t, so convert them for `%d'.  */		\
      DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n",		\
		    (int) (fail_stack).size);				\
      DEBUG_PRINT2 (" slots available: %d\n",				\
		    (int) REMAINING_AVAIL_SLOTS);			\
    }									\
} while (0)
1477
/* Save register NUM on the failure stack: first its start pointer,
   then its end pointer, then NUM itself as the topmost item.  */
#define PUSH_FAILURE_REG(num)						\
  do									\
    {									\
      /* The stack-growing macros require `destination'.  */		\
      char *destination;						\
									\
      ENSURE_FAIL_STACK (3);						\
      DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n",		\
		    num, regstart[num], regend[num]);			\
      PUSH_FAILURE_POINTER (regstart[num]);				\
      PUSH_FAILURE_POINTER (regend[num]);				\
      PUSH_FAILURE_INT (num);						\
    }									\
  while (0)
1489
01618498
SM
/* Store VAL in the counter at PTR after saving the counter's current
   value on the failure stack, so that backtracking puts it back.  The
   saved record is: old value, PTR, then the marker -1.  */
#define PUSH_NUMBER(ptr,val)						\
  do									\
    {									\
      /* The stack-growing macros require `destination'.  */		\
      char *destination;						\
      int c;								\
									\
      ENSURE_FAIL_STACK (3);						\
      EXTRACT_NUMBER (c, ptr);						\
      DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val);	\
      PUSH_FAILURE_INT (c);						\
      PUSH_FAILURE_POINTER (ptr);					\
      PUSH_FAILURE_INT (-1);						\
      STORE_NUMBER (ptr, val);						\
    }									\
  while (0)
1504
/* Pop one saved record off the failure stack.  A record whose top
   item is -1 is a saved counter: its address and old value follow,
   and the old value is stored back.  Otherwise the top item is a
   register number, and that register's end and start pointers are
   restored.  */
#define POP_FAILURE_REG_OR_COUNT()					\
do {									\
  long pfreg = POP_FAILURE_INT ();					\
  if (pfreg == -1)							\
    {									\
      /* It's a counter.  */						\
      /* Here, we discard `const', making re_match non-reentrant.  */	\
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER ();	\
      pfreg = POP_FAILURE_INT ();					\
      STORE_NUMBER (ptr, pfreg);					\
      /* PFREG is a long, hence `%ld'; `%p' takes a void pointer.  */	\
      DEBUG_PRINT3 (" Pop counter %p = %ld\n", (void *) ptr, pfreg);	\
    }									\
  else									\
    {									\
      regend[pfreg] = POP_FAILURE_POINTER ();				\
      regstart[pfreg] = POP_FAILURE_POINTER ();				\
      DEBUG_PRINT4 (" Pop reg %ld (spanning %p -> %p)\n",		\
		    pfreg, (const void *) regstart[pfreg],		\
		    (const void *) regend[pfreg]);			\
    }									\
} while (0)
1526
/* Check that we are not stuck in an infinite loop.  Walk the chain of
   failure frames, starting at the top one, for as long as a frame's
   saved string position is STRING_PLACE or NULL.  If one of those
   frames saved the pattern position PAT_CUR, set the variable `cycle'
   to 1.  Assumes the variables `fail_stack', `cycle' and (when
   assertions are enabled) `bufp'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)			\
do {									\
  ssize_t failure = TOP_FAILURE_HANDLE ();				\
  /* Check for infinite matching loops */				\
  while (failure > 0							\
	 && (FAILURE_STR (failure) == (string_place)			\
	     || FAILURE_STR (failure) == NULL))				\
    {									\
      assert (FAILURE_PAT (failure) >= bufp->buffer			\
	      && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);	\
      if (FAILURE_PAT (failure) == (pat_cur))				\
	{								\
	  cycle = 1;							\
	  break;							\
	}								\
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure));	\
      failure = NEXT_FAILURE_HANDLE(failure);				\
    }									\
  /* Handle 0 means the chain ran out; there is no frame to show.  */	\
  if (failure > 0)							\
    {									\
      DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure));	\
    }									\
} while (0)
6df42991 1548
fa9a63c5 1549/* Push the information about the state we will need
5e69f11e
RM
1550 if we ever fail back to it.
1551
505bde11 1552 Requires variables fail_stack, regstart, regend and
320a2a73 1553 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1554 declared.
5e69f11e 1555
fa9a63c5
RM
1556 Does `return -2' if it runs out of memory. */
1557
505bde11
SM
/* Push a failure frame: the current frame index, then STRING_PLACE,
   then PATTERN, and finally make `fail_stack.frame' point just past
   the new frame.  */
#define PUSH_FAILURE_POINT(pattern, string_place)			\
do {									\
  /* The stack-growing macros require `destination'.  */		\
  char *destination;							\
									\
  DEBUG_STATEMENT (nfailure_points_pushed++);				\
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n");				\
  /* The stack offsets are size_t, so convert them for `%d'.  */	\
  DEBUG_PRINT2 (" Before push, next avail: %d\n",			\
		(int) (fail_stack).avail);				\
  DEBUG_PRINT2 (" size: %d\n", (int) (fail_stack).size);		\
									\
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);					\
									\
  DEBUG_PRINT1 ("\n");							\
									\
  DEBUG_PRINT2 (" Push frame index: %d\n", (int) fail_stack.frame);	\
  PUSH_FAILURE_INT (fail_stack.frame);					\
									\
  DEBUG_PRINT2 (" Push string %p: `", (const void *) (string_place));	\
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT1 ("'\n");							\
  PUSH_FAILURE_POINTER (string_place);					\
									\
  DEBUG_PRINT2 (" Push pattern %p: ", (const void *) (pattern));	\
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);			\
  PUSH_FAILURE_POINTER (pattern);					\
									\
  /* Close the frame by moving the frame pointer past it.  */		\
  fail_stack.frame = fail_stack.avail;					\
} while (0)
fa9a63c5 1588
320a2a73
KH
1589/* Estimate the size of data pushed by a typical failure stack entry.
1590 An estimate is all we need, because all we use this for
1591 is to choose a limit for how big to make the failure stack. */
ada30c0e 1592/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1593#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1594
fa9a63c5
RM
1595/* How many items can still be added to the stack without overflowing it. */
1596#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1597
1598
1599/* Pops what PUSH_FAILURE_POINT pushes.
1600
1601 We restore into the parameters, all of which should be lvalues:
1602 STR -- the saved data position.
1603 PAT -- the saved pattern position.
fa9a63c5 1604 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1605
fa9a63c5 1606 Also assumes the variables `fail_stack' and (if debugging), `bufp',
7814e705 1607 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1608
505bde11
SM
/* Undo the topmost failure frame: first pop every register or counter
   record pushed above it, then restore PAT, STR and the previous
   frame index, in that order.  */
#define POP_FAILURE_POINT(str, pat)                                     \
do {									\
  assert (!FAIL_STACK_EMPTY ());					\
									\
  /* Remove failure points and point to how many regs pushed.  */	\
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");				\
  /* The stack offsets are size_t, so convert them for `%d'.  */	\
  DEBUG_PRINT2 (" Before pop, next avail: %d\n",			\
		(int) fail_stack.avail);				\
  DEBUG_PRINT2 (" size: %d\n", (int) fail_stack.size);			\
									\
  /* Pop the saved registers.  */					\
  while (fail_stack.frame < fail_stack.avail)				\
    POP_FAILURE_REG_OR_COUNT ();					\
									\
  /* `avail' is unsigned, so testing it against zero after popping	\
     proves nothing; check before popping that the frame's fixed	\
     items are really there.  */					\
  assert (fail_stack.avail >= NUM_NONREG_ITEMS);			\
									\
  (pat) = POP_FAILURE_POINTER ();					\
  DEBUG_PRINT2 (" Popping pattern %p: ", (const void *) (pat));	\
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);			\
									\
  /* If the saved string location is NULL, it came from an		\
     on_failure_keep_string_jump opcode, and we want to throw away the	\
     saved NULL, thus retaining our current position in the string.  */	\
  (str) = POP_FAILURE_POINTER ();					\
  DEBUG_PRINT2 (" Popping string %p: `", (const void *) (str));	\
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);	\
  DEBUG_PRINT1 ("'\n");							\
									\
  fail_stack.frame = POP_FAILURE_INT ();				\
  DEBUG_PRINT2 (" Popping frame index: %d\n", (int) fail_stack.frame);	\
									\
  assert (fail_stack.frame <= fail_stack.avail);			\
									\
  DEBUG_STATEMENT (nfailure_points_popped++);				\
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1642
1643
1644\f
fa9a63c5 1645/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1646#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1647\f
1648/* Subroutine declarations and macros for regex_compile. */
1649
261cb4bb
PE
1650static reg_errcode_t regex_compile (re_char *pattern, size_t size,
1651 reg_syntax_t syntax,
1652 struct re_pattern_buffer *bufp);
1653static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1654static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1655static void insert_op1 (re_opcode_t op, unsigned char *loc,
1656 int arg, unsigned char *end);
1657static void insert_op2 (re_opcode_t op, unsigned char *loc,
1658 int arg1, int arg2, unsigned char *end);
1659static boolean at_begline_loc_p (re_char *pattern, re_char *p,
1660 reg_syntax_t syntax);
1661static boolean at_endline_loc_p (re_char *p, re_char *pend,
1662 reg_syntax_t syntax);
1663static re_char *skip_one_char (re_char *p);
1664static int analyse_first (re_char *p, re_char *pend,
1665 char *fastmap, const int multibyte);
fa9a63c5 1666
/* Fetch the next character in the uncompiled pattern into C, with no
   translation, and step P past it.  Does `return REG_EEND' when P has
   already reached PEND.  */
#define PATFETCH(c)							\
  do									\
    {									\
      int len;								\
									\
      if (p == pend)							\
	return REG_EEND;						\
      (c) = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte);		\
      p += len;								\
    }									\
  while (0)
1676
fa9a63c5
RM
1677
1678/* If `translate' is non-null, return translate[D], else just D. We
1679 cast the subscript to translate because some data is declared as
1680 `char *', to avoid warnings when a string constant is passed. But
1681 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1682#ifndef TRANSLATE
0b32bf0e 1683# define TRANSLATE(d) \
66f0296e 1684 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1685#endif
fa9a63c5
RM
1686
1687
1688/* Macros for outputting the compiled pattern into `buffer'. */
1689
1690/* If the buffer isn't allocated when it comes in, use this. */
1691#define INIT_BUF_SIZE 32
1692
/* Extend the buffer until it has room for at least N more bytes
   beyond `b'.  */
#define GET_BUFFER_SPACE(n)						\
    while (bufp->allocated < (size_t) (b - bufp->buffer + (n)))	\
      EXTEND_BUFFER ()

/* Append the single byte C, making room for it first.  */
#define BUF_PUSH(c)							\
  do									\
    {									\
      GET_BUFFER_SPACE (1);						\
      b[0] = (unsigned char) (c);					\
      b += 1;								\
    }									\
  while (0)


/* Append the two bytes C1 and C2, making room for both first.  */
#define BUF_PUSH_2(c1, c2)						\
  do									\
    {									\
      GET_BUFFER_SPACE (2);						\
      b[0] = (unsigned char) (c1);					\
      b[1] = (unsigned char) (c2);					\
      b += 2;								\
    }									\
  while (0)
1713
1714
/* Write at LOC a jump with opcode OP whose target is TO.  The stored
   value is the distance from LOC to TO, less the three bytes taken up
   by the jump itself.  */
#define STORE_JUMP(op, loc, to) \
  store_op1 ((op), (loc), (to) - (loc) - 3)

/* Same, for a jump that carries the extra argument ARG.  */
#define STORE_JUMP2(op, loc, to, arg) \
  store_op2 ((op), (loc), (to) - (loc) - 3, (arg))

/* Inserting counterpart of `STORE_JUMP'; `b' is taken to be the
   buffer end.  */
#define INSERT_JUMP(op, loc, to) \
  insert_op1 ((op), (loc), (to) - (loc) - 3, b)

/* Inserting counterpart of `STORE_JUMP2'; `b' is taken to be the
   buffer end.  */
#define INSERT_JUMP2(op, loc, to, arg) \
  insert_op2 ((op), (loc), (to) - (loc) - 3, (arg), b)
1731
1732
1733/* Upper bound, in bytes, on the size of a compiled pattern buffer.
   This is not an arbitrary limit: the arguments which represent offsets
   into the pattern are two bytes long.  So if 2^15 bytes turns out to
   be too small, many things would have to change.  */
839966f3
KH
1736# define MAX_BUF_SIZE (1L << 15)
1737
fa9a63c5
RM
1738/* Double the size of the pattern buffer (capping it at MAX_BUF_SIZE)
   via RETALLOC and, if the block moved, reset the pointers that pointed
   into the old block (`b', `begalt' and, when non-null,
   `fixup_alt_jump', `laststart' and `pending_exact') so that they point
   to the corresponding places in the new one.

   If the buffer is already exactly MAX_BUF_SIZE bytes, EXTEND_BUFFER
   returns REG_ESIZE from the enclosing function; if `bufp->buffer' is
   null after RETALLOC it returns REG_ESPACE.

   NOTE(review): both exits are plain `return' statements, so nothing
   else owned by the enclosing function is released on those paths.
   RETALLOC's result lands directly in `bufp->buffer'; presumably the
   old block becomes unreachable if the reallocation fails -- confirm
   against RETALLOC's definition.  */
1742#if __BOUNDED_POINTERS__
/* Bounded-pointer builds also have to refresh the low/high bounds kept
   alongside each pointer into the buffer.  */
1743# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
381880b0
CY
1744# define MOVE_BUFFER_POINTER(P) \
1745 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1746 SET_HIGH_BOUND (P), \
1747 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
4bb91c68
SM
1748# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1749 else \
1750 { \
1751 SET_HIGH_BOUND (b); \
1752 SET_HIGH_BOUND (begalt); \
1753 if (fixup_alt_jump) \
1754 SET_HIGH_BOUND (fixup_alt_jump); \
1755 if (laststart) \
1756 SET_HIGH_BOUND (laststart); \
1757 if (pending_exact) \
1758 SET_HIGH_BOUND (pending_exact); \
1759 }
1760#else
/* Plain pointers: rebase P by its offset from the old block.  */
381880b0 1761# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
4bb91c68
SM
1762# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1763#endif
fa9a63c5 1764#define EXTEND_BUFFER() \
25fe55af 1765 do { \
381880b0 1766 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1767 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1768 return REG_ESIZE; \
1769 bufp->allocated <<= 1; \
1770 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1771 bufp->allocated = MAX_BUF_SIZE; \
01618498 1772 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1773 if (bufp->buffer == NULL) \
1774 return REG_ESPACE; \
1775 /* If the buffer moved, move all the pointers into it. */ \
1776 if (old_buffer != bufp->buffer) \
1777 { \
381880b0 1778 unsigned char *new_buffer = bufp->buffer; \
4bb91c68
SM
1779 MOVE_BUFFER_POINTER (b); \
1780 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1781 if (fixup_alt_jump) \
4bb91c68 1782 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1783 if (laststart) \
4bb91c68 1784 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1785 if (pending_exact) \
4bb91c68 1786 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1787 } \
4bb91c68 1788 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1789 } while (0)
1790
1791
1792/* Since we have one byte reserved for the register number argument to
   {start,stop}_memory, the maximum number of groups we can report
   things about is what fits in that byte.  */
1795#define MAX_REGNUM 255
1796
1797/* But patterns can have more than `MAX_REGNUM' registers.  We just
   ignore the excess.  That is why the group counter type is a full
   `int' rather than a single byte.  */
098d42af 1799typedef int regnum_t;
fa9a63c5
RM
1800
1801
1802/* Macros for the compile stack. */
1803
1804/* Since offsets can go either forwards or backwards, this type needs to
   be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.  */
1806/* int may be not enough when sizeof(int) == 2. */
1807typedef long pattern_offset_t;
fa9a63c5
RM
1808
/* One entry of the compile stack.  Three buffer offsets plus a group
   number.  NOTE(review): the field names mirror the `begalt',
   `fixup_alt_jump' and `laststart' pointers of regex_compile;
   presumably they hold those pointers as offsets from the buffer
   start -- confirm at the push/pop sites.  */
1809typedef struct
1810{
1811 pattern_offset_t begalt_offset;
1812 pattern_offset_t fixup_alt_jump;
5e69f11e 1813 pattern_offset_t laststart_offset;
fa9a63c5
RM
1814 regnum_t regnum;
1815} compile_stack_elt_t;
1816
1817
/* The compile stack itself: an array of SIZE entries of which the
   first AVAIL are in use.  */
1818typedef struct
1819{
1820 compile_stack_elt_t *stack;
d1dfb56c
EZ
1821 size_t size;
1822 size_t avail; /* Offset of next open position. */
fa9a63c5
RM
1823} compile_stack_type;
1824
1825
/* Number of entries the compile stack starts out with.  */
1826#define INIT_COMPILE_STACK_SIZE 32
1827
/* These macros refer to a variable named `compile_stack' at the point
   of use.  */
1828#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1829#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1830
4bb91c68 1831/* The next available element, i.e. the slot at index `avail' (one past
   the entries currently in use).  */
fa9a63c5
RM
1832#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1833
0caaedb1
PE
1834/* Explicit quit checking is needed for Emacs, which uses polling to
   process input events.  In Emacs builds IMMEDIATE_QUIT_CHECK invokes
   QUIT when `immediate_quit' is nonzero; otherwise it expands to a
   no-op expression.  */
1836#ifdef emacs
77d11aec
RS
1837# define IMMEDIATE_QUIT_CHECK \
1838 do { \
1839 if (immediate_quit) QUIT; \
1840 } while (0)
1841#else
1842# define IMMEDIATE_QUIT_CHECK ((void)0)
1843#endif
1844\f
b18215fc
RS
1845/* Structure to manage work area for range table.  The table is a flat
   array of ints to which ranges are appended as (start, end) pairs.  */
1846struct range_table_work_area
1847{
1848 int *table; /* The work area itself; null until first extended.  */
1849 int allocated; /* Allocated size of TABLE, in bytes.  */
7814e705 1850 int used; /* Number of ints of TABLE actually in use.  */
96cc36cc 1851 int bits; /* OR of the BIT_* flags recording character classes.  */
b18215fc
RS
1852};
1853
77d11aec
RS
/* Make sure that WORK_AREA can hold N more ints (each range stored in
   the table takes two of them).

   WORK_AREA must be a `struct range_table_work_area' lvalue, not a
   pointer: its fields are accessed with `.' and its address is handed
   to extend_range_table_work_area.  The table is grown once per
   invocation, which is enough for the small values of N (2 or 4) used
   in this file.

   If it can't get the space (the table pointer is null after the
   extension attempt), it returns REG_ESPACE from the surrounding
   function.  */

#define EXTEND_RANGE_TABLE(work_area, n)				\
  do {									\
    if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
      {									\
	extend_range_table_work_area (&(work_area));			\
	if ((work_area).table == 0)					\
	  return (REG_ESPACE);						\
      }									\
  } while (0)
1868
96cc36cc
RS
/* Record the character-class flag(s) BIT (an OR of the BIT_* values
   below) in WORK_AREA.  The whole expansion is parenthesized so that
   the macro behaves as a single expression wherever it is used.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)	\
  ((work_area).bits |= (bit))

/* Bits used to implement the multibyte-part of the various character classes
   such as [:alnum:] in a charset's range table.  */
#define BIT_WORD	0x1
#define BIT_LOWER	0x2
#define BIT_PUNCT	0x4
#define BIT_SPACE	0x8
#define BIT_UPPER	0x10
#define BIT_MULTIBYTE	0x20
96cc36cc 1880
b18215fc
RS
/* Append the range (RANGE_START, RANGE_END) to WORK_AREA: make room for
   two more ints, then store the two bounds in consecutive slots and
   advance the `used' count past them.  EXTEND_RANGE_TABLE may return
   from the surrounding function if the table cannot be grown.  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\
  do {									\
    EXTEND_RANGE_TABLE ((work_area), 2);				\
    (work_area).table[(work_area).used] = (range_start);		\
    (work_area).used += 1;						\
    (work_area).table[(work_area).used] = (range_end);			\
    (work_area).used += 1;						\
  } while (0)
1888
/* Free allocated memory for WORK_AREA.  A work area whose table was
   never allocated (null pointer) is fine: freeing a null pointer is a
   no-op, so no explicit guard is needed.  The `table' field is left
   unchanged.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)	\
  do {						\
    free ((work_area).table);			\
  } while (0)
1895
/* Forget every range and class flag recorded in WORK_AREA; the table
   storage itself is kept.  */
96cc36cc 1896#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
/* Number of ints currently stored in WORK_AREA's table.  */
b18215fc 1897#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
/* The BIT_* class flags recorded in WORK_AREA.  */
96cc36cc 1898#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
/* The I-th int of WORK_AREA's table, as an lvalue.  */
b18215fc 1899#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1900\f
b18215fc 1901
fa9a63c5 1902/* Set the bit for character C in a list.  The list is the bitmap that
   `b' points at in the expansion context: byte C / BYTEWIDTH, bit
   C % BYTEWIDTH.  C is evaluated twice.  */
01618498 1903#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1904
1905
bf216479
KH
1906#ifdef emacs
1907
cf9c99bc
KH
1908/* Store characters in the range FROM to TO in the bitmap at B (for
   ASCII and unibyte characters) and WORK_AREA (for multibyte
   characters) while translating them and paying attention to the
   continuity of translated characters.

   Implementation note: it would be better to implement these fairly
   big macros as functions, but that is not easy because the macros
   they call assume that various local variables are already
   declared.  */
bf216479 1916
cf9c99bc
KH
1917/* Both FROM and TO are ASCII characters.

   For each character C0 in FROM..TO: translate it; if the translation
   is not ASCII, record it in WORK_AREA as a one-character range and
   convert it to unibyte, falling back to C0 itself when that conversion
   yields a negative value.  Then set the bitmap bit for the result.

   FROM and TO are evaluated more than once.  SET_RANGE_TABLE_WORK_AREA
   may return from the surrounding function if the table cannot grow.  */
1918
1919#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1920 do { \
1921 int C0, C1; \
1922 \
1923 for (C0 = (FROM); C0 <= (TO); C0++) \
1924 { \
1925 C1 = TRANSLATE (C0); \
1926 if (! ASCII_CHAR_P (C1)) \
1927 { \
1928 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1929 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
1930 C1 = C0; \
1931 } \
1932 SET_LIST_BIT (C1); \
1933 } \
1934 } while (0)
1935
1936
1937/* Both FROM and TO are unibyte characters (0x80..0xFF).

   For each byte C0 in FROM..TO: convert it to multibyte (C1).  If the
   result satisfies CHAR_BYTE8_P, just set the bitmap bit for C0.
   Otherwise translate C1 to C2, set the bitmap bit for the unibyte form
   of C2 (or for C0 when C2 equals C1 or has no unibyte form), and
   record C2 in WORK_AREA: the ranges appended since this macro started
   (table index >= USED) are scanned from the newest backwards; a range
   that contains C2 is left alone, a range adjacent to C2 is widened by
   one, and if none is found a new range (C2, C2) is appended.

   FROM and TO are evaluated more than once.  SET_RANGE_TABLE_WORK_AREA
   may return from the surrounding function if the table cannot grow.  */
1938
1939#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
1940 do { \
1941 int C0, C1, C2, I; \
1942 int USED = RANGE_TABLE_WORK_USED (work_area); \
1943 \
1944 for (C0 = (FROM); C0 <= (TO); C0++) \
1945 { \
1946 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
1947 if (CHAR_BYTE8_P (C1)) \
1948 SET_LIST_BIT (C0); \
1949 else \
1950 { \
1951 C2 = TRANSLATE (C1); \
1952 if (C2 == C1 \
1953 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
1954 C1 = C0; \
1955 SET_LIST_BIT (C1); \
1956 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1957 { \
1958 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1959 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1960 \
1961 if (C2 >= from - 1 && C2 <= to + 1) \
1962 { \
1963 if (C2 == from - 1) \
1964 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1965 else if (C2 == to + 1) \
1966 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1967 break; \
1968 } \
1969 } \
1970 if (I < USED) \
1971 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
1972 } \
1973 } \
1974 } while (0)
1975
1976
78edd3b7 1977/* Both FROM and TO are multibyte characters.

   First the range (FROM, TO) itself is appended to WORK_AREA.  Then,
   for each character C0 in FROM..TO: translate it to C1; if C1 (or,
   failing that, C0 when it differs from C1) has a unibyte form, set the
   bitmap bit for that form.  If C1 falls outside FROM..TO it is also
   recorded in WORK_AREA: the ranges appended since this macro started
   (table index >= USED, which includes the FROM..TO range) are scanned
   from the newest backwards; a range that contains C1 is left alone, a
   range adjacent to C1 is widened by one, and if none is found a new
   range (C1, C1) is appended.

   FROM and TO are evaluated more than once.  SET_RANGE_TABLE_WORK_AREA
   may return from the surrounding function if the table cannot grow.  */
cf9c99bc
KH
1978
1979#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
1980 do { \
1981 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
1982 \
1983 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
1984 for (C0 = (FROM); C0 <= (TO); C0++) \
1985 { \
1986 C1 = TRANSLATE (C0); \
1987 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
1988 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
1989 SET_LIST_BIT (C2); \
1990 if (C1 >= (FROM) && C1 <= (TO)) \
1991 continue; \
1992 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1993 { \
1994 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1995 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1996 \
1997 if (C1 >= from - 1 && C1 <= to + 1) \
1998 { \
1999 if (C1 == from - 1) \
2000 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2001 else if (C1 == to + 1) \
2002 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2003 break; \
2004 } \
2005 } \
2006 if (I < USED) \
2007 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2008 } \
bf216479
KH
2009 } while (0)
2010
2011#endif /* emacs */
2012
/* Get the next unsigned number in the uncompiled pattern.

   Decimal digits are fetched through PATFETCH into `c' and accumulated
   in NUM, which must be an `int' lvalue; a negative NUM on entry is
   reset to 0 when the first digit is seen and is left untouched if no
   digit is seen at all.  On normal completion `c' holds the first
   non-digit character fetched.

   Leaves through FREE_STACK_RETURN with REG_EBRACE if the pattern ends
   before a non-digit has been fetched, and with REG_BADBR if the number
   does not fit in an `int'.  The range check is carried out in unsigned
   arithmetic *before* NUM is updated, so no signed overflow (undefined
   behavior) is ever evaluated.  */
#define GET_UNSIGNED_NUMBER(num)					\
  do {									\
    if (p == pend)							\
      FREE_STACK_RETURN (REG_EBRACE);					\
    else								\
      {									\
	PATFETCH (c);							\
	while ('0' <= c && c <= '9')					\
	  {								\
	    unsigned int next_value;					\
	    if ((num) < 0)						\
	      (num) = 0;						\
	    next_value = ((unsigned int) (num) * 10u			\
			  + (unsigned int) (c - '0'));			\
	    /* Reject unsigned wrap-around, and anything above		\
	       UINT_MAX / 2 (INT_MAX on the usual targets).  */	\
	    if (next_value / 10u != (unsigned int) (num)		\
		|| next_value > (unsigned int) -1 / 2)			\
	      FREE_STACK_RETURN (REG_BADBR);				\
	    (num) = (int) next_value;					\
	    if (p == pend)						\
	      FREE_STACK_RETURN (REG_EBRACE);				\
	    PATFETCH (c);						\
	  }								\
      }									\
  } while (0)
77d11aec 2036\f
1fdab503 2037#if ! WIDE_CHAR_SUPPORT
01618498 2038
14473664 2039/* Map a string to the char class it names (if any). */
1fdab503 2040re_wctype_t
971de7fb 2041re_wctype (const re_char *str)
14473664 2042{
5b0534c8 2043 const char *string = (const char *) str;
14473664
SM
2044 if (STREQ (string, "alnum")) return RECC_ALNUM;
2045 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2046 else if (STREQ (string, "word")) return RECC_WORD;
2047 else if (STREQ (string, "ascii")) return RECC_ASCII;
2048 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2049 else if (STREQ (string, "graph")) return RECC_GRAPH;
2050 else if (STREQ (string, "lower")) return RECC_LOWER;
2051 else if (STREQ (string, "print")) return RECC_PRINT;
2052 else if (STREQ (string, "punct")) return RECC_PUNCT;
2053 else if (STREQ (string, "space")) return RECC_SPACE;
2054 else if (STREQ (string, "upper")) return RECC_UPPER;
2055 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2056 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2057 else if (STREQ (string, "digit")) return RECC_DIGIT;
2058 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2059 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2060 else if (STREQ (string, "blank")) return RECC_BLANK;
2061 else return 0;
2062}
2063
e0f24100 2064/* True if CH is in the char class CC. */
1fdab503 2065boolean
971de7fb 2066re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2067{
2068 switch (cc)
2069 {
f3fcc40d
AS
2070 case RECC_ALNUM: return ISALNUM (ch) != 0;
2071 case RECC_ALPHA: return ISALPHA (ch) != 0;
2072 case RECC_BLANK: return ISBLANK (ch) != 0;
2073 case RECC_CNTRL: return ISCNTRL (ch) != 0;
2074 case RECC_DIGIT: return ISDIGIT (ch) != 0;
2075 case RECC_GRAPH: return ISGRAPH (ch) != 0;
2076 case RECC_LOWER: return ISLOWER (ch) != 0;
2077 case RECC_PRINT: return ISPRINT (ch) != 0;
2078 case RECC_PUNCT: return ISPUNCT (ch) != 0;
2079 case RECC_SPACE: return ISSPACE (ch) != 0;
2080 case RECC_UPPER: return ISUPPER (ch) != 0;
2081 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2082 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
213bd7f2 2083 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
f3fcc40d 2084 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
213bd7f2 2085 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
f3fcc40d 2086 case RECC_WORD: return ISWORD (ch) != 0;
0cdd06f8
SM
2087 case RECC_ERROR: return false;
2088 default:
5e617bc2 2089 abort ();
14473664
SM
2090 }
2091}
fa9a63c5 2092
14473664
SM
2093/* Return a bit-pattern to use in the range-table bits to match multibyte
2094 chars of class CC. */
2095static int
971de7fb 2096re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2097{
2098 switch (cc)
2099 {
2100 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2101 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2102 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2103 case RECC_LOWER: return BIT_LOWER;
2104 case RECC_UPPER: return BIT_UPPER;
2105 case RECC_PUNCT: return BIT_PUNCT;
2106 case RECC_SPACE: return BIT_SPACE;
14473664 2107 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2108 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2109 default:
5e617bc2 2110 abort ();
14473664
SM
2111 }
2112}
2113#endif
77d11aec
RS
2114\f
2115/* Filling in the work area of a range. */
2116
2117/* Actually extend the space in WORK_AREA. */
2118
2119static void
971de7fb 2120extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2121{
77d11aec 2122 work_area->allocated += 16 * sizeof (int);
38182d90 2123 work_area->table = realloc (work_area->table, work_area->allocated);
77d11aec
RS
2124}
2125
8f924df7 2126#if 0
77d11aec
RS
2127#ifdef emacs
2128
2129/* Carefully find the ranges of codes that are equivalent
   under case conversion to the range start..end when passed through
   TRANSLATE.  Handle the case where non-letters can come in between
   two upper-case letters (which happens in Latin-1).
   Also handle the case of groups of more than 2 case-equivalent chars.

   The basic method is to look at consecutive characters and see
   if they can form a run that can be handled as one.

   Returns -1 if successful, REG_ESPACE if ran out of space.

   NOTE(review): this function sits inside an `#if 0' region, so it is
   not compiled.  It passes a pointer to EXTEND_RANGE_TABLE, whereas
   that macro accesses its argument's fields with `.'; the two would
   need reconciling before this code could be re-enabled.  */
2139
2140static int
1dae0f0a
AS
2141set_image_of_range_1 (struct range_table_work_area *work_area,
2142 re_wchar_t start, re_wchar_t end,
2143 RE_TRANSLATE_TYPE translate)
77d11aec
RS
2144{
2145 /* `one_case' indicates a character, or a run of characters,
2146 each of which is an isolate (no case-equivalents).
2147 This includes all ASCII non-letters.
2148
2149 `two_case' indicates a character, or a run of characters,
2150 each of which has two case-equivalent forms.
2151 This includes all ASCII letters.
2152
2153 `strange' indicates a character that has more than one
2154 case-equivalent. */
177c0ea7 2155
77d11aec
RS
2156 enum case_type {one_case, two_case, strange};
2157
2158 /* Describe the run that is in progress,
2159 which the next character can try to extend.
2160 If run_type is strange, that means there really is no run.
2161 If run_type is one_case, then run_start...run_end is the run.
2162 If run_type is two_case, then the run is run_start...run_end,
2163 and the case-equivalents end at run_eqv_end. */
2164
2165 enum case_type run_type = strange;
2166 int run_start, run_end, run_eqv_end;
2167
2168 Lisp_Object eqv_table;
2169
/* Without a translation table the range is recorded as given.  */
2170 if (!RE_TRANSLATE_P (translate))
2171 {
b7c12565 2172 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2173 work_area->table[work_area->used++] = (start);
2174 work_area->table[work_area->used++] = (end);
b7c12565 2175 return -1;
77d11aec
RS
2176 }
2177
2178 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2179
77d11aec
RS
2180 for (; start <= end; start++)
2181 {
2182 enum case_type this_type;
2183 int eqv = RE_TRANSLATE (eqv_table, start);
2184 int minchar, maxchar;
2185
2186 /* Classify this character */
2187 if (eqv == start)
2188 this_type = one_case;
2189 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2190 this_type = two_case;
2191 else
2192 this_type = strange;
2193
2194 if (start < eqv)
2195 minchar = start, maxchar = eqv;
2196 else
2197 minchar = eqv, maxchar = start;
2198
2199 /* Can this character extend the run in progress? */
2200 if (this_type == strange || this_type != run_type
2201 || !(minchar == run_end + 1
2202 && (run_type == two_case
2203 ? maxchar == run_eqv_end + 1 : 1)))
2204 {
2205 /* No, end the run.
2206 Record each of its equivalent ranges. */
2207 if (run_type == one_case)
2208 {
2209 EXTEND_RANGE_TABLE (work_area, 2);
2210 work_area->table[work_area->used++] = run_start;
2211 work_area->table[work_area->used++] = run_end;
2212 }
2213 else if (run_type == two_case)
2214 {
2215 EXTEND_RANGE_TABLE (work_area, 4);
2216 work_area->table[work_area->used++] = run_start;
2217 work_area->table[work_area->used++] = run_end;
2218 work_area->table[work_area->used++]
2219 = RE_TRANSLATE (eqv_table, run_start);
2220 work_area->table[work_area->used++]
2221 = RE_TRANSLATE (eqv_table, run_end);
2222 }
2223 run_type = strange;
2224 }
177c0ea7 2225
77d11aec
RS
2226 if (this_type == strange)
2227 {
2228 /* For a strange character, add each of its equivalents, one
2229 by one. Don't start a range. */
2230 do
2231 {
2232 EXTEND_RANGE_TABLE (work_area, 2);
2233 work_area->table[work_area->used++] = eqv;
2234 work_area->table[work_area->used++] = eqv;
2235 eqv = RE_TRANSLATE (eqv_table, eqv);
2236 }
2237 while (eqv != start);
2238 }
2239
2240 /* Add this char to the run, or start a new run. */
2241 else if (run_type == strange)
2242 {
2243 /* Initialize a new range. */
2244 run_type = this_type;
2245 run_start = start;
2246 run_end = start;
2247 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2248 }
2249 else
2250 {
2251 /* Extend a running range. */
2252 run_end = minchar;
2253 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2254 }
2255 }
2256
2257 /* If a run is still in progress at the end, finish it now
2258 by recording its equivalent ranges. */
2259 if (run_type == one_case)
2260 {
2261 EXTEND_RANGE_TABLE (work_area, 2);
2262 work_area->table[work_area->used++] = run_start;
2263 work_area->table[work_area->used++] = run_end;
2264 }
2265 else if (run_type == two_case)
2266 {
2267 EXTEND_RANGE_TABLE (work_area, 4);
2268 work_area->table[work_area->used++] = run_start;
2269 work_area->table[work_area->used++] = run_end;
2270 work_area->table[work_area->used++]
2271 = RE_TRANSLATE (eqv_table, run_start);
2272 work_area->table[work_area->used++]
2273 = RE_TRANSLATE (eqv_table, run_end);
2274 }
2275
2276 return -1;
2277}
36595814 2278
77d11aec 2279#endif /* emacs */
36595814 2280
2b34df4e 2281/* Record the image of the range start..end when passed through
   TRANSLATE.  This is not necessarily TRANSLATE(start)..TRANSLATE(end)
   and is not even necessarily contiguous.
   Normally we approximate it with the smallest contiguous range that contains
   all the chars we need.  However, for Latin-1 we go to extra effort
   to do a better job.

   This function is not called for ASCII ranges.

   Returns -1 if successful, REG_ESPACE if ran out of space.

   NOTE(review): this function sits inside an `#if 0' region, so it is
   not compiled.  It passes a pointer to EXTEND_RANGE_TABLE, whereas
   that macro accesses its argument's fields with `.'; the two would
   need reconciling before this code could be re-enabled.  */
2291
2292static int
1dae0f0a
AS
2293set_image_of_range (struct range_table_work_area *work_area,
2294 re_wchar_t start, re_wchar_t end,
2295 RE_TRANSLATE_TYPE translate)
36595814 2296{
77d11aec
RS
2297 re_wchar_t cmin, cmax;
2298
2299#ifdef emacs
2300 /* For Latin-1 ranges, use set_image_of_range_1
2301 to get proper handling of ranges that include letters and nonletters.
b7c12565 2302 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2303 For other character sets, we don't bother to get this right. */
b7c12565
RS
2304 if (RE_TRANSLATE_P (translate) && start < 04400
2305 && !(start < 04200 && end >= 04377))
77d11aec 2306 {
b7c12565 2307 int newend;
77d11aec 2308 int tem;
b7c12565
RS
2309 newend = end;
2310 if (newend > 04377)
2311 newend = 04377;
2312 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
/* set_image_of_range_1 returns -1 on success, so a positive value
   is an error code to pass on.  */
2313 if (tem > 0)
2314 return tem;
2315
2316 start = 04400;
2317 if (end < 04400)
2318 return -1;
2319 }
2320#endif
2321
b7c12565
RS
2322 EXTEND_RANGE_TABLE (work_area, 2);
2323 work_area->table[work_area->used++] = (start);
2324 work_area->table[work_area->used++] = (end);
2325
/* cmin..cmax will span the translations that fall outside
   start..end; -1 means none has been seen yet.  */
2326 cmin = -1, cmax = -1;
77d11aec 2327
36595814 2328 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2329 {
2330 int ch;
77d11aec 2331
b7c12565
RS
2332 for (ch = start; ch <= end; ch++)
2333 {
2334 re_wchar_t c = TRANSLATE (ch);
2335 if (! (start <= c && c <= end))
2336 {
2337 if (cmin == -1)
2338 cmin = c, cmax = c;
2339 else
2340 {
2341 cmin = MIN (cmin, c);
2342 cmax = MAX (cmax, c);
2343 }
2344 }
2345 }
2346
2347 if (cmin != -1)
2348 {
2349 EXTEND_RANGE_TABLE (work_area, 2);
2350 work_area->table[work_area->used++] = (cmin);
2351 work_area->table[work_area->used++] = (cmax);
2352 }
2353 }
36595814 2354
77d11aec
RS
2355 return -1;
2356}
8f924df7 2357#endif /* 0 */
fa9a63c5
RM
2358\f
2359#ifndef MATCH_MAY_ALLOCATE
2360
2361/* If we cannot allocate large objects within re_match_2_internal,
   we make the fail stack and register vectors global.
   The fail stack, we grow to the maximum size when a regexp
   is compiled.
   The register vectors, we adjust in size each time we
   compile a regexp, according to the number of registers it needs.  */
2367
2368static fail_stack_type fail_stack;
2369
2370/* Size with which the following vectors are currently allocated.
   That is so we can make them bigger as needed,
   but never make them smaller.  */
fa9a63c5
RM
2373static int regs_allocated_size;
2374
/* The register vectors themselves; regex_grow_registers resizes all
   four together.  */
66f0296e
SM
2375static re_char ** regstart, ** regend;
2376static re_char **best_regstart, **best_regend;
RM
2377
2378/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2379 but don't make them smaller. */
fa9a63c5
RM
2380
2381static
1dae0f0a 2382regex_grow_registers (int num_regs)
fa9a63c5
RM
2383{
2384 if (num_regs > regs_allocated_size)
2385 {
66f0296e
SM
2386 RETALLOC_IF (regstart, num_regs, re_char *);
2387 RETALLOC_IF (regend, num_regs, re_char *);
2388 RETALLOC_IF (best_regstart, num_regs, re_char *);
2389 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2390
2391 regs_allocated_size = num_regs;
2392 }
2393}
2394
2395#endif /* not MATCH_MAY_ALLOCATE */
2396\f
261cb4bb
PE
2397static boolean group_in_compile_stack (compile_stack_type compile_stack,
2398 regnum_t regnum);
99633e97 2399
fa9a63c5
RM
2400/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2401 Returns one of error codes defined in `regex.h', or zero for success.
2402
2403 Assumes the `allocated' (and perhaps `buffer') and `translate'
2404 fields are set in BUFP on entry.
2405
2406 If it succeeds, results are put in BUFP (if it returns an error, the
2407 contents of BUFP are undefined):
2408 `buffer' is the compiled pattern;
2409 `syntax' is set to SYNTAX;
2410 `used' is set to the length of the compiled pattern;
2411 `fastmap_accurate' is zero;
2412 `re_nsub' is the number of subexpressions in PATTERN;
2413 `not_bol' and `not_eol' are zero;
5e69f11e 2414
c0f9ea08 2415 The `fastmap' field is neither examined nor set. */
fa9a63c5 2416
505bde11
SM
2417/* Insert the `jump' from the end of last alternative to "here".
   The space for the jump has already been allocated.  Does nothing
   when `fixup_alt_jump' is null; otherwise stores a `jump' at
   `fixup_alt_jump' that targets the current end of the buffer, `b'.  */
2419#define FIXUP_ALT_JUMP() \
2420do { \
2421 if (fixup_alt_jump) \
2422 STORE_JUMP (jump, fixup_alt_jump, b); \
2423} while (0)
2424
2425
fa9a63c5
RM
2426/* Return, freeing storage we allocated: the range-table work area
   `range_table_work' and the compile stack `compile_stack.stack', both
   taken from the expansion context.  VALUE becomes the return value of
   the enclosing function.  */
2427#define FREE_STACK_RETURN(value) \
b18215fc
RS
2428 do { \
2429 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2430 free (compile_stack.stack); \
2431 return value; \
2432 } while (0)
fa9a63c5
RM
2433
2434static reg_errcode_t
971de7fb 2435regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2436{
01618498
SM
2437 /* We fetch characters from PATTERN here. */
2438 register re_wchar_t c, c1;
5e69f11e 2439
fa9a63c5
RM
2440 /* Points to the end of the buffer, where we should append. */
2441 register unsigned char *b;
5e69f11e 2442
fa9a63c5
RM
2443 /* Keeps track of unclosed groups. */
2444 compile_stack_type compile_stack;
2445
2446 /* Points to the current (ending) position in the pattern. */
22336245
RS
2447#ifdef AIX
2448 /* `const' makes AIX compiler fail. */
66f0296e 2449 unsigned char *p = pattern;
22336245 2450#else
66f0296e 2451 re_char *p = pattern;
22336245 2452#endif
66f0296e 2453 re_char *pend = pattern + size;
5e69f11e 2454
fa9a63c5 2455 /* How to translate the characters in the pattern. */
6676cb1c 2456 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2457
2458 /* Address of the count-byte of the most recently inserted `exactn'
2459 command. This makes it possible to tell if a new exact-match
2460 character can be added to that command or if the character requires
2461 a new `exactn' command. */
2462 unsigned char *pending_exact = 0;
2463
2464 /* Address of start of the most recently finished expression.
2465 This tells, e.g., postfix * where to find the start of its
2466 operand. Reset at the beginning of groups and alternatives. */
2467 unsigned char *laststart = 0;
2468
2469 /* Address of beginning of regexp, or inside of last group. */
2470 unsigned char *begalt;
2471
2472 /* Place in the uncompiled pattern (i.e., the {) to
2473 which to go back if the interval is invalid. */
66f0296e 2474 re_char *beg_interval;
5e69f11e 2475
fa9a63c5 2476 /* Address of the place where a forward jump should go to the end of
7814e705 2477 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2478 last -- ends with a forward jump of this sort. */
2479 unsigned char *fixup_alt_jump = 0;
2480
b18215fc
RS
2481 /* Work area for range table of charset. */
2482 struct range_table_work_area range_table_work;
2483
2d1675e4
SM
2484 /* If the object matched can contain multibyte characters. */
2485 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2486
f9b0fd99
RS
2487 /* Nonzero if we have pushed down into a subpattern. */
2488 int in_subpattern = 0;
2489
2490 /* These hold the values of p, pattern, and pend from the main
2491 pattern when we have pushed into a subpattern. */
da053e48
PE
2492 re_char *main_p IF_LINT (= NULL);
2493 re_char *main_pattern IF_LINT (= NULL);
2494 re_char *main_pend IF_LINT (= NULL);
f9b0fd99 2495
fa9a63c5 2496#ifdef DEBUG
99633e97 2497 debug++;
fa9a63c5 2498 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2499 if (debug > 0)
fa9a63c5
RM
2500 {
2501 unsigned debug_count;
5e69f11e 2502
fa9a63c5 2503 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2504 putchar (pattern[debug_count]);
fa9a63c5
RM
2505 putchar ('\n');
2506 }
2507#endif /* DEBUG */
2508
2509 /* Initialize the compile stack. */
2510 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2511 if (compile_stack.stack == NULL)
2512 return REG_ESPACE;
2513
2514 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2515 compile_stack.avail = 0;
2516
b18215fc
RS
2517 range_table_work.table = 0;
2518 range_table_work.allocated = 0;
2519
fa9a63c5
RM
2520 /* Initialize the pattern buffer. */
2521 bufp->syntax = syntax;
2522 bufp->fastmap_accurate = 0;
2523 bufp->not_bol = bufp->not_eol = 0;
6224b623 2524 bufp->used_syntax = 0;
fa9a63c5
RM
2525
2526 /* Set `used' to zero, so that if we return an error, the pattern
2527 printer (for debugging) will think there's no pattern. We reset it
2528 at the end. */
2529 bufp->used = 0;
5e69f11e 2530
fa9a63c5 2531 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2532 bufp->re_nsub = 0;
fa9a63c5 2533
0b32bf0e 2534#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2535 /* Initialize the syntax table. */
2536 init_syntax_once ();
2537#endif
2538
2539 if (bufp->allocated == 0)
2540 {
2541 if (bufp->buffer)
2542 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2543 enough space. This loses if buffer's address is bogus, but
7814e705 2544 that is the user's responsibility. */
25fe55af
RS
2545 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2546 }
fa9a63c5 2547 else
7814e705 2548 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2549 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2550 }
fa9a63c5
RM
2551 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2552
2553 bufp->allocated = INIT_BUF_SIZE;
2554 }
2555
2556 begalt = b = bufp->buffer;
2557
2558 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2559 while (1)
fa9a63c5 2560 {
f9b0fd99
RS
2561 if (p == pend)
2562 {
2563 /* If this is the end of an included regexp,
2564 pop back to the main regexp and try again. */
2565 if (in_subpattern)
2566 {
2567 in_subpattern = 0;
2568 pattern = main_pattern;
2569 p = main_p;
2570 pend = main_pend;
2571 continue;
2572 }
2573 /* If this is the end of the main regexp, we are done. */
2574 break;
2575 }
2576
fa9a63c5
RM
2577 PATFETCH (c);
2578
2579 switch (c)
25fe55af 2580 {
f9b0fd99
RS
2581 case ' ':
2582 {
2583 re_char *p1 = p;
2584
2585 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2586 spaces normally. And don't try to do this recursively. */
2587 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2588 goto normal_char;
2589
2590 /* Peek past following spaces. */
2591 while (p1 != pend)
2592 {
2593 if (*p1 != ' ')
2594 break;
2595 p1++;
2596 }
2597 /* If the spaces are followed by a repetition op,
2598 treat them normally. */
c721eee5
RS
2599 if (p1 != pend
2600 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2601 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2602 goto normal_char;
2603
2604 /* Replace the spaces with the whitespace regexp. */
2605 in_subpattern = 1;
2606 main_p = p1;
2607 main_pend = pend;
2608 main_pattern = pattern;
2609 p = pattern = whitespace_regexp;
5b0534c8 2610 pend = p + strlen ((const char *) p);
f9b0fd99 2611 break;
7814e705 2612 }
f9b0fd99 2613
25fe55af
RS
2614 case '^':
2615 {
7814e705 2616 if ( /* If at start of pattern, it's an operator. */
25fe55af 2617 p == pattern + 1
7814e705 2618 /* If context independent, it's an operator. */
25fe55af 2619 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2620 /* Otherwise, depends on what's come before. */
25fe55af 2621 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2622 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2623 else
2624 goto normal_char;
2625 }
2626 break;
2627
2628
2629 case '$':
2630 {
2631 if ( /* If at end of pattern, it's an operator. */
2632 p == pend
7814e705 2633 /* If context independent, it's an operator. */
25fe55af
RS
2634 || syntax & RE_CONTEXT_INDEP_ANCHORS
2635 /* Otherwise, depends on what's next. */
2636 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2637 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2638 else
2639 goto normal_char;
2640 }
2641 break;
fa9a63c5
RM
2642
2643
2644 case '+':
25fe55af
RS
2645 case '?':
2646 if ((syntax & RE_BK_PLUS_QM)
2647 || (syntax & RE_LIMITED_OPS))
2648 goto normal_char;
2649 handle_plus:
2650 case '*':
2651 /* If there is no previous pattern... */
2652 if (!laststart)
2653 {
2654 if (syntax & RE_CONTEXT_INVALID_OPS)
2655 FREE_STACK_RETURN (REG_BADRPT);
2656 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2657 goto normal_char;
2658 }
2659
2660 {
7814e705 2661 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2662 boolean zero_times_ok = 0, many_times_ok = 0;
2663 boolean greedy = 1;
25fe55af
RS
2664
2665 /* If there is a sequence of repetition chars, collapse it
2666 down to just one (the right one). We can't combine
2667 interval operators with these because of, e.g., `a{2}*',
7814e705 2668 which should only match an even number of `a's. */
25fe55af
RS
2669
2670 for (;;)
2671 {
0b32bf0e 2672 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2673 && c == '?' && (zero_times_ok || many_times_ok))
2674 greedy = 0;
2675 else
2676 {
2677 zero_times_ok |= c != '+';
2678 many_times_ok |= c != '?';
2679 }
25fe55af
RS
2680
2681 if (p == pend)
2682 break;
ed0767d8
SM
2683 else if (*p == '*'
2684 || (!(syntax & RE_BK_PLUS_QM)
2685 && (*p == '+' || *p == '?')))
25fe55af 2686 ;
ed0767d8 2687 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2688 {
ed0767d8
SM
2689 if (p+1 == pend)
2690 FREE_STACK_RETURN (REG_EESCAPE);
2691 if (p[1] == '+' || p[1] == '?')
2692 PATFETCH (c); /* Gobble up the backslash. */
2693 else
2694 break;
25fe55af
RS
2695 }
2696 else
ed0767d8 2697 break;
25fe55af 2698 /* If we get here, we found another repeat character. */
ed0767d8
SM
2699 PATFETCH (c);
2700 }
25fe55af
RS
2701
2702 /* Star, etc. applied to an empty pattern is equivalent
2703 to an empty pattern. */
4e8a9132 2704 if (!laststart || laststart == b)
25fe55af
RS
2705 break;
2706
2707 /* Now we know whether or not zero matches is allowed
7814e705 2708 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2709 if (greedy)
2710 {
99633e97 2711 if (many_times_ok)
4e8a9132
SM
2712 {
2713 boolean simple = skip_one_char (laststart) == b;
d1dfb56c 2714 size_t startoffset = 0;
f6a3f532 2715 re_opcode_t ofj =
01618498 2716 /* Check if the loop can match the empty string. */
6df42991
SM
2717 (simple || !analyse_first (laststart, b, NULL, 0))
2718 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2719 assert (skip_one_char (laststart) <= b);
177c0ea7 2720
4e8a9132
SM
2721 if (!zero_times_ok && simple)
2722 { /* Since simple * loops can be made faster by using
2723 on_failure_keep_string_jump, we turn simple P+
2724 into PP* if P is simple. */
2725 unsigned char *p1, *p2;
2726 startoffset = b - laststart;
2727 GET_BUFFER_SPACE (startoffset);
2728 p1 = b; p2 = laststart;
2729 while (p2 < p1)
2730 *b++ = *p2++;
2731 zero_times_ok = 1;
99633e97 2732 }
4e8a9132
SM
2733
2734 GET_BUFFER_SPACE (6);
2735 if (!zero_times_ok)
2736 /* A + loop. */
f6a3f532 2737 STORE_JUMP (ofj, b, b + 6);
99633e97 2738 else
4e8a9132
SM
2739 /* Simple * loops can use on_failure_keep_string_jump
2740 depending on what follows. But since we don't know
2741 that yet, we leave the decision up to
2742 on_failure_jump_smart. */
f6a3f532 2743 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2744 laststart + startoffset, b + 6);
99633e97 2745 b += 3;
4e8a9132 2746 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2747 b += 3;
2748 }
2749 else
2750 {
4e8a9132
SM
2751 /* A simple ? pattern. */
2752 assert (zero_times_ok);
2753 GET_BUFFER_SPACE (3);
2754 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2755 b += 3;
2756 }
1c8c6d39
DL
2757 }
2758 else /* not greedy */
2759 { /* I wish the greedy and non-greedy cases could be merged. */
2760
0683b6fa 2761 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2762 if (many_times_ok)
2763 {
f6a3f532
SM
2764 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2765
6df42991
SM
2766 /* The non-greedy multiple match looks like
2767 a repeat..until: we only need a conditional jump
2768 at the end of the loop. */
f6a3f532
SM
2769 if (emptyp) BUF_PUSH (no_op);
2770 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2771 : on_failure_jump, b, laststart);
1c8c6d39
DL
2772 b += 3;
2773 if (zero_times_ok)
2774 {
2775 /* The repeat...until naturally matches one or more.
2776 To also match zero times, we need to first jump to
6df42991 2777 the end of the loop (its conditional jump). */
1c8c6d39
DL
2778 INSERT_JUMP (jump, laststart, b);
2779 b += 3;
2780 }
2781 }
2782 else
2783 {
2784 /* non-greedy a?? */
1c8c6d39
DL
2785 INSERT_JUMP (jump, laststart, b + 3);
2786 b += 3;
2787 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2788 b += 3;
2789 }
2790 }
2791 }
4e8a9132 2792 pending_exact = 0;
fa9a63c5
RM
2793 break;
2794
2795
2796 case '.':
25fe55af
RS
2797 laststart = b;
2798 BUF_PUSH (anychar);
2799 break;
fa9a63c5
RM
2800
2801
25fe55af
RS
2802 case '[':
2803 {
19ed5445
PE
2804 re_char *p1;
2805
b18215fc 2806 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2807
25fe55af 2808 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2809
25fe55af
RS
2810 /* Ensure that we have enough space to push a charset: the
2811 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2812 GET_BUFFER_SPACE (34);
2813
25fe55af 2814 laststart = b;
e318085a 2815
25fe55af 2816 /* We test `*p == '^' twice, instead of using an if
7814e705 2817 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2818 BUF_PUSH (*p == '^' ? charset_not : charset);
2819 if (*p == '^')
2820 p++;
e318085a 2821
25fe55af
RS
2822 /* Remember the first position in the bracket expression. */
2823 p1 = p;
e318085a 2824
7814e705 2825 /* Push the number of bytes in the bitmap. */
25fe55af 2826 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2827
25fe55af 2828 /* Clear the whole map. */
72af86bd 2829 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2830
25fe55af
RS
2831 /* charset_not matches newline according to a syntax bit. */
2832 if ((re_opcode_t) b[-2] == charset_not
2833 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2834 SET_LIST_BIT ('\n');
fa9a63c5 2835
7814e705 2836 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2837 for (;;)
2838 {
b18215fc 2839 boolean escaped_char = false;
2d1675e4 2840 const unsigned char *p2 = p;
abbd1bcf 2841 re_wchar_t ch;
e318085a 2842
25fe55af 2843 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2844
36595814
SM
2845 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2846 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2847 So the translation is done later in a loop. Example:
2848 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2849 PATFETCH (c);
e318085a 2850
25fe55af
RS
2851 /* \ might escape characters inside [...] and [^...]. */
2852 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2853 {
2854 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2855
2856 PATFETCH (c);
b18215fc 2857 escaped_char = true;
25fe55af 2858 }
b18215fc
RS
2859 else
2860 {
7814e705 2861 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2862 not (i.e., when the bracket expression is `[]' so
2863 far), the ']' character bit gets set way below. */
2d1675e4 2864 if (c == ']' && p2 != p1)
657fcfbd 2865 break;
25fe55af 2866 }
b18215fc 2867
25fe55af
RS
2868 /* See if we're at the beginning of a possible character
2869 class. */
b18215fc 2870
2d1675e4
SM
2871 if (!escaped_char &&
2872 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2873 {
7814e705 2874 /* Leave room for the null. */
14473664 2875 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2876 const unsigned char *class_beg;
b18215fc 2877
25fe55af
RS
2878 PATFETCH (c);
2879 c1 = 0;
ed0767d8 2880 class_beg = p;
b18215fc 2881
25fe55af
RS
2882 /* If pattern is `[[:'. */
2883 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2884
25fe55af
RS
2885 for (;;)
2886 {
14473664
SM
2887 PATFETCH (c);
2888 if ((c == ':' && *p == ']') || p == pend)
2889 break;
2890 if (c1 < CHAR_CLASS_MAX_LENGTH)
2891 str[c1++] = c;
2892 else
2893 /* This is in any case an invalid class name. */
2894 str[0] = '\0';
25fe55af
RS
2895 }
2896 str[c1] = '\0';
b18215fc
RS
2897
2898 /* If it isn't a word bracketed by `[:' and `:]':
2899 undo the ending character, the letters, and
2900 leave the leading `:' and `[' (but set bits for
2901 them). */
25fe55af
RS
2902 if (c == ':' && *p == ']')
2903 {
abbd1bcf 2904 re_wctype_t cc = re_wctype (str);
14473664
SM
2905
2906 if (cc == 0)
fa9a63c5
RM
2907 FREE_STACK_RETURN (REG_ECTYPE);
2908
14473664
SM
2909 /* Throw away the ] at the end of the character
2910 class. */
2911 PATFETCH (c);
fa9a63c5 2912
14473664 2913 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2914
cf9c99bc
KH
2915#ifndef emacs
2916 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
2917 if (re_iswctype (btowc (ch), cc))
2918 {
2919 c = TRANSLATE (ch);
ed00c2ac
KH
2920 if (c < (1 << BYTEWIDTH))
2921 SET_LIST_BIT (c);
8f924df7 2922 }
cf9c99bc
KH
2923#else /* emacs */
2924 /* Most character classes in a multibyte match
2925 just set a flag. Exceptions are is_blank,
2926 is_digit, is_cntrl, and is_xdigit, since
2927 they can only match ASCII characters. We
2928 don't need to handle them for multibyte.
2929 They are distinguished by a negative wctype. */
96cc36cc 2930
254c06a8
SM
2931 /* Setup the gl_state object to its buffer-defined
2932 value. This hardcodes the buffer-global
2933 syntax-table for ASCII chars, while the other chars
2934 will obey syntax-table properties. It's not ideal,
2935 but it's the way it's been done until now. */
d48cd3f4 2936 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 2937
cf9c99bc 2938 for (ch = 0; ch < 256; ++ch)
25fe55af 2939 {
cf9c99bc
KH
2940 c = RE_CHAR_TO_MULTIBYTE (ch);
2941 if (! CHAR_BYTE8_P (c)
2942 && re_iswctype (c, cc))
8f924df7 2943 {
cf9c99bc
KH
2944 SET_LIST_BIT (ch);
2945 c1 = TRANSLATE (c);
2946 if (c1 == c)
2947 continue;
2948 if (ASCII_CHAR_P (c1))
2949 SET_LIST_BIT (c1);
2950 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2951 SET_LIST_BIT (c1);
8f924df7 2952 }
25fe55af 2953 }
cf9c99bc
KH
2954 SET_RANGE_TABLE_WORK_AREA_BIT
2955 (range_table_work, re_wctype_to_bit (cc));
2956#endif /* emacs */
6224b623
SM
2957 /* In most cases the matching rule for char classes
2958 only uses the syntax table for multibyte chars,
2959 so that the content of the syntax-table is not
2960 hardcoded in the range_table. SPACE and WORD are
2961 the two exceptions. */
2962 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2963 bufp->used_syntax = 1;
2964
b18215fc
RS
2965 /* Repeat the loop. */
2966 continue;
25fe55af
RS
2967 }
2968 else
2969 {
ed0767d8
SM
2970 /* Go back to right after the "[:". */
2971 p = class_beg;
25fe55af 2972 SET_LIST_BIT ('[');
b18215fc
RS
2973
2974 /* Because the `:' may start a range, we
2975 can't simply set bit and repeat the loop.
7814e705 2976 Instead, just set it to C and handle below. */
b18215fc 2977 c = ':';
25fe55af
RS
2978 }
2979 }
b18215fc
RS
2980
2981 if (p < pend && p[0] == '-' && p[1] != ']')
2982 {
2983
2984 /* Discard the `-'. */
2985 PATFETCH (c1);
2986
2987 /* Fetch the character which ends the range. */
2988 PATFETCH (c1);
cf9c99bc
KH
2989#ifdef emacs
2990 if (CHAR_BYTE8_P (c1)
2991 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2992 /* Treat the range from a multibyte character to
2993 raw-byte character as empty. */
2994 c = c1 + 1;
2995#endif /* emacs */
e318085a 2996 }
25fe55af 2997 else
b18215fc
RS
2998 /* Range from C to C. */
2999 c1 = c;
3000
cf9c99bc 3001 if (c > c1)
25fe55af 3002 {
cf9c99bc
KH
3003 if (syntax & RE_NO_EMPTY_RANGES)
3004 FREE_STACK_RETURN (REG_ERANGEX);
3005 /* Else, repeat the loop. */
bf216479 3006 }
6fdd04b0 3007 else
25fe55af 3008 {
cf9c99bc
KH
3009#ifndef emacs
3010 /* Set the range into bitmap */
8f924df7 3011 for (; c <= c1; c++)
b18215fc 3012 {
cf9c99bc
KH
3013 ch = TRANSLATE (c);
3014 if (ch < (1 << BYTEWIDTH))
3015 SET_LIST_BIT (ch);
3016 }
3017#else /* emacs */
3018 if (c < 128)
3019 {
3020 ch = MIN (127, c1);
3021 SETUP_ASCII_RANGE (range_table_work, c, ch);
3022 c = ch + 1;
3023 if (CHAR_BYTE8_P (c1))
3024 c = BYTE8_TO_CHAR (128);
3025 }
3026 if (c <= c1)
3027 {
3028 if (CHAR_BYTE8_P (c))
3029 {
3030 c = CHAR_TO_BYTE8 (c);
3031 c1 = CHAR_TO_BYTE8 (c1);
3032 for (; c <= c1; c++)
3033 SET_LIST_BIT (c);
3034 }
3035 else if (multibyte)
3036 {
3037 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3038 }
3039 else
3040 {
3041 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3042 }
e934739e 3043 }
cf9c99bc 3044#endif /* emacs */
25fe55af 3045 }
e318085a
RS
3046 }
3047
25fe55af 3048 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3049 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3050 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3051 b[-1]--;
3052 b += b[-1];
fa9a63c5 3053
96cc36cc
RS
3054 /* Build real range table from work area. */
3055 if (RANGE_TABLE_WORK_USED (range_table_work)
3056 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3057 {
3058 int i;
3059 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3060
b18215fc 3061 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3062 bytes for flags, two for COUNT, and three bytes for
3063 each character. */
3064 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3065
b18215fc
RS
3066 /* Indicate the existence of range table. */
3067 laststart[1] |= 0x80;
fa9a63c5 3068
96cc36cc
RS
3069 /* Store the character class flag bits into the range table.
3070 If not in emacs, these flag bits are always 0. */
3071 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3072 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3073
b18215fc
RS
3074 STORE_NUMBER_AND_INCR (b, used / 2);
3075 for (i = 0; i < used; i++)
3076 STORE_CHARACTER_AND_INCR
3077 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3078 }
25fe55af
RS
3079 }
3080 break;
fa9a63c5
RM
3081
3082
b18215fc 3083 case '(':
25fe55af
RS
3084 if (syntax & RE_NO_BK_PARENS)
3085 goto handle_open;
3086 else
3087 goto normal_char;
fa9a63c5
RM
3088
3089
25fe55af
RS
3090 case ')':
3091 if (syntax & RE_NO_BK_PARENS)
3092 goto handle_close;
3093 else
3094 goto normal_char;
e318085a
RS
3095
3096
25fe55af
RS
3097 case '\n':
3098 if (syntax & RE_NEWLINE_ALT)
3099 goto handle_alt;
3100 else
3101 goto normal_char;
e318085a
RS
3102
3103
b18215fc 3104 case '|':
25fe55af
RS
3105 if (syntax & RE_NO_BK_VBAR)
3106 goto handle_alt;
3107 else
3108 goto normal_char;
3109
3110
3111 case '{':
3112 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3113 goto handle_interval;
3114 else
3115 goto normal_char;
3116
3117
3118 case '\\':
3119 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3120
3121 /* Do not translate the character after the \, so that we can
3122 distinguish, e.g., \B from \b, even if we normally would
3123 translate, e.g., B to b. */
36595814 3124 PATFETCH (c);
25fe55af
RS
3125
3126 switch (c)
3127 {
3128 case '(':
3129 if (syntax & RE_NO_BK_PARENS)
3130 goto normal_backslash;
3131
3132 handle_open:
505bde11
SM
3133 {
3134 int shy = 0;
c69b0314 3135 regnum_t regnum = 0;
505bde11
SM
3136 if (p+1 < pend)
3137 {
3138 /* Look for a special (?...) construct */
ed0767d8 3139 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3140 {
ed0767d8 3141 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3142 while (!shy)
505bde11 3143 {
c69b0314
SM
3144 PATFETCH (c);
3145 switch (c)
3146 {
3147 case ':': shy = 1; break;
3148 case '0':
3149 /* An explicitly specified regnum must start
3150 with non-0. */
3151 if (regnum == 0)
3152 FREE_STACK_RETURN (REG_BADPAT);
3153 case '1': case '2': case '3': case '4':
3154 case '5': case '6': case '7': case '8': case '9':
3155 regnum = 10*regnum + (c - '0'); break;
3156 default:
3157 /* Only (?:...) and (?N:...) are supported right now. */
3158 FREE_STACK_RETURN (REG_BADPAT);
3159 }
505bde11
SM
3160 }
3161 }
505bde11
SM
3162 }
3163
3164 if (!shy)
c69b0314
SM
3165 regnum = ++bufp->re_nsub;
3166 else if (regnum)
3167 { /* It's actually not shy, but explicitly numbered. */
3168 shy = 0;
3169 if (regnum > bufp->re_nsub)
3170 bufp->re_nsub = regnum;
3171 else if (regnum > bufp->re_nsub
3172 /* Ideally, we'd want to check that the specified
3173 group can't have matched (i.e. all subgroups
3174 using the same regnum are in other branches of
3175 OR patterns), but we don't currently keep track
3176 of enough info to do that easily. */
3177 || group_in_compile_stack (compile_stack, regnum))
3178 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3179 }
c69b0314
SM
3180 else
3181 /* It's really shy. */
3182 regnum = - bufp->re_nsub;
25fe55af 3183
99633e97
SM
3184 if (COMPILE_STACK_FULL)
3185 {
3186 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3187 compile_stack_elt_t);
3188 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3189
99633e97
SM
3190 compile_stack.size <<= 1;
3191 }
25fe55af 3192
99633e97 3193 /* These are the values to restore when we hit end of this
7814e705 3194 group. They are all relative offsets, so that if the
99633e97
SM
3195 whole pattern moves because of realloc, they will still
3196 be valid. */
3197 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3198 COMPILE_STACK_TOP.fixup_alt_jump
3199 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3200 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3201 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3202
c69b0314
SM
3203 /* Do not push a start_memory for groups beyond the last one
3204 we can represent in the compiled pattern. */
3205 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3206 BUF_PUSH_2 (start_memory, regnum);
3207
3208 compile_stack.avail++;
3209
3210 fixup_alt_jump = 0;
3211 laststart = 0;
3212 begalt = b;
3213 /* If we've reached MAX_REGNUM groups, then this open
3214 won't actually generate any code, so we'll have to
3215 clear pending_exact explicitly. */
3216 pending_exact = 0;
3217 break;
505bde11 3218 }
25fe55af
RS
3219
3220 case ')':
3221 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3222
3223 if (COMPILE_STACK_EMPTY)
505bde11
SM
3224 {
3225 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3226 goto normal_backslash;
3227 else
3228 FREE_STACK_RETURN (REG_ERPAREN);
3229 }
25fe55af
RS
3230
3231 handle_close:
505bde11 3232 FIXUP_ALT_JUMP ();
25fe55af
RS
3233
3234 /* See similar code for backslashed left paren above. */
3235 if (COMPILE_STACK_EMPTY)
505bde11
SM
3236 {
3237 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3238 goto normal_char;
3239 else
3240 FREE_STACK_RETURN (REG_ERPAREN);
3241 }
25fe55af
RS
3242
3243 /* Since we just checked for an empty stack above, this
3244 ``can't happen''. */
3245 assert (compile_stack.avail != 0);
3246 {
3247 /* We don't just want to restore into `regnum', because
3248 later groups should continue to be numbered higher,
7814e705 3249 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3250 regnum_t regnum;
25fe55af
RS
3251
3252 compile_stack.avail--;
3253 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3254 fixup_alt_jump
3255 = COMPILE_STACK_TOP.fixup_alt_jump
3256 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3257 : 0;
3258 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3259 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3260 /* If we've reached MAX_REGNUM groups, then this close
3261 won't actually generate any code, so we'll have to
3262 clear pending_exact explicitly. */
3263 pending_exact = 0;
e318085a 3264
25fe55af 3265 /* We're at the end of the group, so now we know how many
7814e705 3266 groups were inside this one. */
c69b0314
SM
3267 if (regnum <= MAX_REGNUM && regnum > 0)
3268 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3269 }
3270 break;
3271
3272
3273 case '|': /* `\|'. */
3274 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3275 goto normal_backslash;
3276 handle_alt:
3277 if (syntax & RE_LIMITED_OPS)
3278 goto normal_char;
3279
3280 /* Insert before the previous alternative a jump which
7814e705 3281 jumps to this alternative if the former fails. */
25fe55af
RS
3282 GET_BUFFER_SPACE (3);
3283 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3284 pending_exact = 0;
3285 b += 3;
3286
3287 /* The alternative before this one has a jump after it
3288 which gets executed if it gets matched. Adjust that
3289 jump so it will jump to this alternative's analogous
3290 jump (put in below, which in turn will jump to the next
3291 (if any) alternative's such jump, etc.). The last such
3292 jump jumps to the correct final destination. A picture:
3293 _____ _____
3294 | | | |
3295 | v | v
d1dfb56c 3296 a | b | c
25fe55af
RS
3297
3298 If we are at `b', then fixup_alt_jump right now points to a
3299 three-byte space after `a'. We'll put in the jump, set
3300 fixup_alt_jump to right after `b', and leave behind three
3301 bytes which we'll fill in when we get to after `c'. */
3302
505bde11 3303 FIXUP_ALT_JUMP ();
25fe55af
RS
3304
3305 /* Mark and leave space for a jump after this alternative,
3306 to be filled in later either by next alternative or
3307 when we know we're at the end of a series of alternatives. */
3308 fixup_alt_jump = b;
3309 GET_BUFFER_SPACE (3);
3310 b += 3;
3311
3312 laststart = 0;
3313 begalt = b;
3314 break;
3315
3316
3317 case '{':
3318 /* If \{ is a literal. */
3319 if (!(syntax & RE_INTERVALS)
3320 /* If we're at `\{' and it's not the open-interval
3321 operator. */
4bb91c68 3322 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3323 goto normal_backslash;
3324
3325 handle_interval:
3326 {
3327 /* If got here, then the syntax allows intervals. */
3328
3329 /* At least (most) this many matches must be made. */
99633e97 3330 int lower_bound = 0, upper_bound = -1;
25fe55af 3331
ed0767d8 3332 beg_interval = p;
25fe55af 3333
25fe55af
RS
3334 GET_UNSIGNED_NUMBER (lower_bound);
3335
3336 if (c == ',')
ed0767d8 3337 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3338 else
3339 /* Interval such as `{1}' => match exactly once. */
3340 upper_bound = lower_bound;
3341
3342 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3343 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3344 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3345
3346 if (!(syntax & RE_NO_BK_BRACES))
3347 {
4bb91c68
SM
3348 if (c != '\\')
3349 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3350 if (p == pend)
3351 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3352 PATFETCH (c);
3353 }
3354
3355 if (c != '}')
4bb91c68 3356 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3357
3358 /* We just parsed a valid interval. */
3359
3360 /* If it's invalid to have no preceding re. */
3361 if (!laststart)
3362 {
3363 if (syntax & RE_CONTEXT_INVALID_OPS)
3364 FREE_STACK_RETURN (REG_BADRPT);
3365 else if (syntax & RE_CONTEXT_INDEP_OPS)
3366 laststart = b;
3367 else
3368 goto unfetch_interval;
3369 }
3370
6df42991
SM
3371 if (upper_bound == 0)
3372 /* If the upper bound is zero, just drop the sub pattern
3373 altogether. */
3374 b = laststart;
3375 else if (lower_bound == 1 && upper_bound == 1)
3376 /* Just match it once: nothing to do here. */
3377 ;
3378
3379 /* Otherwise, we have a nontrivial interval. When
3380 we're all done, the pattern will look like:
3381 set_number_at <jump count> <upper bound>
3382 set_number_at <succeed_n count> <lower bound>
3383 succeed_n <after jump addr> <succeed_n count>
3384 <body of loop>
3385 jump_n <succeed_n addr> <jump count>
3386 (The upper bound and `jump_n' are omitted if
3387 `upper_bound' is 1, though.) */
3388 else
3389 { /* If the upper bound is > 1, we need to insert
3390 more at the end of the loop. */
3391 unsigned int nbytes = (upper_bound < 0 ? 3
3392 : upper_bound > 1 ? 5 : 0);
3393 unsigned int startoffset = 0;
3394
3395 GET_BUFFER_SPACE (20); /* We might use less. */
3396
3397 if (lower_bound == 0)
3398 {
3399 /* A succeed_n that starts with 0 is really a
3400 simple on_failure_jump_loop. */
3401 INSERT_JUMP (on_failure_jump_loop, laststart,
3402 b + 3 + nbytes);
3403 b += 3;
3404 }
3405 else
3406 {
3407 /* Initialize lower bound of the `succeed_n', even
3408 though it will be set during matching by its
3409 attendant `set_number_at' (inserted next),
3410 because `re_compile_fastmap' needs to know.
3411 Jump to the `jump_n' we might insert below. */
3412 INSERT_JUMP2 (succeed_n, laststart,
3413 b + 5 + nbytes,
3414 lower_bound);
3415 b += 5;
3416
3417 /* Code to initialize the lower bound. Insert
7814e705 3418 before the `succeed_n'. The `5' is the last two
6df42991
SM
3419 bytes of this `set_number_at', plus 3 bytes of
3420 the following `succeed_n'. */
3421 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3422 b += 5;
3423 startoffset += 5;
3424 }
3425
3426 if (upper_bound < 0)
3427 {
3428 /* A negative upper bound stands for infinity,
3429 in which case it degenerates to a plain jump. */
3430 STORE_JUMP (jump, b, laststart + startoffset);
3431 b += 3;
3432 }
3433 else if (upper_bound > 1)
3434 { /* More than one repetition is allowed, so
3435 append a backward jump to the `succeed_n'
3436 that starts this interval.
3437
3438 When we've reached this during matching,
3439 we'll have matched the interval once, so
3440 jump back only `upper_bound - 1' times. */
3441 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3442 upper_bound - 1);
3443 b += 5;
3444
3445 /* The location we want to set is the second
3446 parameter of the `jump_n'; that is `b-2' as
3447 an absolute address. `laststart' will be
3448 the `set_number_at' we're about to insert;
3449 `laststart+3' the number to set, the source
3450 for the relative address. But we are
3451 inserting into the middle of the pattern --
3452 so everything is getting moved up by 5.
3453 Conclusion: (b - 2) - (laststart + 3) + 5,
3454 i.e., b - laststart.
3455
3456 We insert this at the beginning of the loop
3457 so that if we fail during matching, we'll
3458 reinitialize the bounds. */
3459 insert_op2 (set_number_at, laststart, b - laststart,
3460 upper_bound - 1, b);
3461 b += 5;
3462 }
3463 }
25fe55af
RS
3464 pending_exact = 0;
3465 beg_interval = NULL;
3466 }
3467 break;
3468
3469 unfetch_interval:
3470 /* If an invalid interval, match the characters as literals. */
3471 assert (beg_interval);
3472 p = beg_interval;
3473 beg_interval = NULL;
3474
3475 /* normal_char and normal_backslash need `c'. */
ed0767d8 3476 c = '{';
25fe55af
RS
3477
3478 if (!(syntax & RE_NO_BK_BRACES))
3479 {
ed0767d8
SM
3480 assert (p > pattern && p[-1] == '\\');
3481 goto normal_backslash;
25fe55af 3482 }
ed0767d8
SM
3483 else
3484 goto normal_char;
e318085a 3485
b18215fc 3486#ifdef emacs
25fe55af 3487 /* There is no way to specify the before_dot and after_dot
7814e705 3488 operators. rms says this is ok. --karl */
25fe55af
RS
3489 case '=':
3490 BUF_PUSH (at_dot);
3491 break;
3492
3493 case 's':
3494 laststart = b;
3495 PATFETCH (c);
3496 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3497 break;
3498
3499 case 'S':
3500 laststart = b;
3501 PATFETCH (c);
3502 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3503 break;
b18215fc
RS
3504
3505 case 'c':
3506 laststart = b;
36595814 3507 PATFETCH (c);
b18215fc
RS
3508 BUF_PUSH_2 (categoryspec, c);
3509 break;
e318085a 3510
b18215fc
RS
3511 case 'C':
3512 laststart = b;
36595814 3513 PATFETCH (c);
b18215fc
RS
3514 BUF_PUSH_2 (notcategoryspec, c);
3515 break;
3516#endif /* emacs */
e318085a 3517
e318085a 3518
25fe55af 3519 case 'w':
4bb91c68
SM
3520 if (syntax & RE_NO_GNU_OPS)
3521 goto normal_char;
25fe55af 3522 laststart = b;
1fb352e0 3523 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3524 break;
e318085a 3525
e318085a 3526
25fe55af 3527 case 'W':
4bb91c68
SM
3528 if (syntax & RE_NO_GNU_OPS)
3529 goto normal_char;
25fe55af 3530 laststart = b;
1fb352e0 3531 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3532 break;
e318085a
RS
3533
3534
25fe55af 3535 case '<':
4bb91c68
SM
3536 if (syntax & RE_NO_GNU_OPS)
3537 goto normal_char;
25fe55af
RS
3538 BUF_PUSH (wordbeg);
3539 break;
e318085a 3540
25fe55af 3541 case '>':
4bb91c68
SM
3542 if (syntax & RE_NO_GNU_OPS)
3543 goto normal_char;
25fe55af
RS
3544 BUF_PUSH (wordend);
3545 break;
e318085a 3546
669fa600
SM
3547 case '_':
3548 if (syntax & RE_NO_GNU_OPS)
3549 goto normal_char;
3550 laststart = b;
3551 PATFETCH (c);
3552 if (c == '<')
3553 BUF_PUSH (symbeg);
3554 else if (c == '>')
3555 BUF_PUSH (symend);
3556 else
3557 FREE_STACK_RETURN (REG_BADPAT);
3558 break;
3559
25fe55af 3560 case 'b':
4bb91c68
SM
3561 if (syntax & RE_NO_GNU_OPS)
3562 goto normal_char;
25fe55af
RS
3563 BUF_PUSH (wordbound);
3564 break;
e318085a 3565
25fe55af 3566 case 'B':
4bb91c68
SM
3567 if (syntax & RE_NO_GNU_OPS)
3568 goto normal_char;
25fe55af
RS
3569 BUF_PUSH (notwordbound);
3570 break;
fa9a63c5 3571
25fe55af 3572 case '`':
4bb91c68
SM
3573 if (syntax & RE_NO_GNU_OPS)
3574 goto normal_char;
25fe55af
RS
3575 BUF_PUSH (begbuf);
3576 break;
e318085a 3577
25fe55af 3578 case '\'':
4bb91c68
SM
3579 if (syntax & RE_NO_GNU_OPS)
3580 goto normal_char;
25fe55af
RS
3581 BUF_PUSH (endbuf);
3582 break;
e318085a 3583
25fe55af
RS
3584 case '1': case '2': case '3': case '4': case '5':
3585 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3586 {
3587 regnum_t reg;
e318085a 3588
0cdd06f8
SM
3589 if (syntax & RE_NO_BK_REFS)
3590 goto normal_backslash;
e318085a 3591
0cdd06f8 3592 reg = c - '0';
e318085a 3593
c69b0314
SM
3594 if (reg > bufp->re_nsub || reg < 1
3595 /* Can't back reference to a subexp before its end. */
3596 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3597 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3598
0cdd06f8
SM
3599 laststart = b;
3600 BUF_PUSH_2 (duplicate, reg);
3601 }
25fe55af 3602 break;
e318085a 3603
e318085a 3604
25fe55af
RS
3605 case '+':
3606 case '?':
3607 if (syntax & RE_BK_PLUS_QM)
3608 goto handle_plus;
3609 else
3610 goto normal_backslash;
3611
3612 default:
3613 normal_backslash:
3614 /* You might think it would be useful for \ to mean
3615 not to translate; but if we don't translate it
4bb91c68 3616 it will never match anything. */
25fe55af
RS
3617 goto normal_char;
3618 }
3619 break;
fa9a63c5
RM
3620
3621
3622 default:
25fe55af 3623 /* Expects the character in `c'. */
fa9a63c5 3624 normal_char:
36595814 3625 /* If no exactn currently being built. */
25fe55af 3626 if (!pending_exact
fa9a63c5 3627
25fe55af
RS
3628 /* If last exactn not at current position. */
3629 || pending_exact + *pending_exact + 1 != b
5e69f11e 3630
25fe55af 3631 /* We have only one byte following the exactn for the count. */
2d1675e4 3632 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3633
7814e705 3634 /* If followed by a repetition operator. */
9d99031f 3635 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3636 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3637 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3638 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3639 || ((syntax & RE_INTERVALS)
25fe55af 3640 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3641 ? p != pend && *p == '{'
3642 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3643 {
3644 /* Start building a new exactn. */
5e69f11e 3645
25fe55af 3646 laststart = b;
fa9a63c5
RM
3647
3648 BUF_PUSH_2 (exactn, 0);
3649 pending_exact = b - 1;
25fe55af 3650 }
5e69f11e 3651
2d1675e4
SM
3652 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3653 {
e0277a47
KH
3654 int len;
3655
cf9c99bc 3656 if (multibyte)
6fdd04b0 3657 {
cf9c99bc 3658 c = TRANSLATE (c);
6fdd04b0
KH
3659 len = CHAR_STRING (c, b);
3660 b += len;
3661 }
e0277a47 3662 else
6fdd04b0 3663 {
cf9c99bc
KH
3664 c1 = RE_CHAR_TO_MULTIBYTE (c);
3665 if (! CHAR_BYTE8_P (c1))
3666 {
3667 re_wchar_t c2 = TRANSLATE (c1);
3668
3669 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3670 c = c1;
409f2919 3671 }
6fdd04b0
KH
3672 *b++ = c;
3673 len = 1;
3674 }
2d1675e4
SM
3675 (*pending_exact) += len;
3676 }
3677
fa9a63c5 3678 break;
25fe55af 3679 } /* switch (c) */
fa9a63c5
RM
3680 } /* while p != pend */
3681
5e69f11e 3682
fa9a63c5 3683 /* Through the pattern now. */
5e69f11e 3684
505bde11 3685 FIXUP_ALT_JUMP ();
fa9a63c5 3686
5e69f11e 3687 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3688 FREE_STACK_RETURN (REG_EPAREN);
3689
3690 /* If we don't want backtracking, force success
3691 the first time we reach the end of the compiled pattern. */
3692 if (syntax & RE_NO_POSIX_BACKTRACKING)
3693 BUF_PUSH (succeed);
3694
fa9a63c5
RM
3695 /* We have succeeded; set the length of the buffer. */
3696 bufp->used = b - bufp->buffer;
3697
3698#ifdef DEBUG
99633e97 3699 if (debug > 0)
fa9a63c5 3700 {
505bde11 3701 re_compile_fastmap (bufp);
fa9a63c5
RM
3702 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3703 print_compiled_pattern (bufp);
3704 }
99633e97 3705 debug--;
fa9a63c5
RM
3706#endif /* DEBUG */
3707
3708#ifndef MATCH_MAY_ALLOCATE
3709 /* Initialize the failure stack to the largest possible stack. This
3710 isn't necessary unless we're trying to avoid calling alloca in
3711 the search and match routines. */
3712 {
3713 int num_regs = bufp->re_nsub + 1;
3714
320a2a73 3715 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3716 {
a26f4ccd 3717 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
38182d90
PE
3718 falk_stack.stack = realloc (fail_stack.stack,
3719 fail_stack.size * sizeof *falk_stack.stack);
fa9a63c5
RM
3720 }
3721
3722 regex_grow_registers (num_regs);
3723 }
3724#endif /* not MATCH_MAY_ALLOCATE */
3725
839966f3 3726 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3727} /* regex_compile */
3728\f
3729/* Subroutines for `regex_compile'. */
3730
7814e705 3731/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3732
3733static void
971de7fb 3734store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3735{
3736 *loc = (unsigned char) op;
3737 STORE_NUMBER (loc + 1, arg);
3738}
3739
3740
3741/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3742
3743static void
971de7fb 3744store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3745{
3746 *loc = (unsigned char) op;
3747 STORE_NUMBER (loc + 1, arg1);
3748 STORE_NUMBER (loc + 3, arg2);
3749}
3750
3751
3752/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3753 for OP followed by two-byte integer parameter ARG. */
3754
3755static void
971de7fb 3756insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3757{
3758 register unsigned char *pfrom = end;
3759 register unsigned char *pto = end + 3;
3760
3761 while (pfrom != loc)
3762 *--pto = *--pfrom;
5e69f11e 3763
fa9a63c5
RM
3764 store_op1 (op, loc, arg);
3765}
3766
3767
3768/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3769
3770static void
971de7fb 3771insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3772{
3773 register unsigned char *pfrom = end;
3774 register unsigned char *pto = end + 5;
3775
3776 while (pfrom != loc)
3777 *--pto = *--pfrom;
5e69f11e 3778
fa9a63c5
RM
3779 store_op2 (op, loc, arg1, arg2);
3780}
3781
3782
3783/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3784 after an alternative or a begin-subexpression. We assume there is at
3785 least one character before the ^. */
3786
3787static boolean
971de7fb 3788at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3789{
01618498 3790 re_char *prev = p - 2;
95988fcf 3791 boolean odd_backslashes;
5e69f11e 3792
95988fcf
AS
3793 /* After a subexpression? */
3794 if (*prev == '(')
3795 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3796
3797 /* After an alternative? */
3798 else if (*prev == '|')
3799 odd_backslashes = (syntax & RE_NO_BK_VBAR) == 0;
3800
3801 /* After a shy subexpression? */
3802 else if (*prev == ':' && (syntax & RE_SHY_GROUPS))
3803 {
3804 /* Skip over optional regnum. */
3805 while (prev - 1 >= pattern && prev[-1] >= '0' && prev[-1] <= '9')
3806 --prev;
3807
3808 if (!(prev - 2 >= pattern
3809 && prev[-1] == '?' && prev[-2] == '('))
3810 return false;
3811 prev -= 2;
3812 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3813 }
3814 else
3815 return false;
3816
3817 /* Count the number of preceding backslashes. */
3818 p = prev;
3819 while (prev - 1 >= pattern && prev[-1] == '\\')
3820 --prev;
3821 return (p - prev) & odd_backslashes;
fa9a63c5
RM
3822}
3823
3824
3825/* The dual of at_begline_loc_p. This one is for $. We assume there is
3826 at least one character after the $, i.e., `P < PEND'. */
3827
3828static boolean
971de7fb 3829at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3830{
01618498 3831 re_char *next = p;
fa9a63c5 3832 boolean next_backslash = *next == '\\';
01618498 3833 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3834
fa9a63c5
RM
3835 return
3836 /* Before a subexpression? */
3837 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3838 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3839 /* Before an alternative? */
3840 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3841 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3842}
3843
3844
5e69f11e 3845/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3846 false if it's not. */
3847
3848static boolean
971de7fb 3849group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5 3850{
d1dfb56c 3851 ssize_t this_element;
fa9a63c5 3852
5e69f11e
RM
3853 for (this_element = compile_stack.avail - 1;
3854 this_element >= 0;
fa9a63c5
RM
3855 this_element--)
3856 if (compile_stack.stack[this_element].regnum == regnum)
3857 return true;
3858
3859 return false;
3860}
fa9a63c5 3861\f
f6a3f532
SM
3862/* analyse_first.
3863 If fastmap is non-NULL, go through the pattern and fill fastmap
3864 with all the possible leading chars. If fastmap is NULL, don't
3865 bother filling it up (obviously) and only return whether the
3866 pattern could potentially match the empty string.
3867
3868 Return 1 if p..pend might match the empty string.
3869 Return 0 if p..pend matches at least one char.
01618498 3870 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3871
3872static int
438105ed 3873analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
fa9a63c5 3874{
505bde11 3875 int j, k;
1fb352e0 3876 boolean not;
fa9a63c5 3877
b18215fc 3878 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3879 flag is set true. */
b18215fc
RS
3880 boolean match_any_multibyte_characters = false;
3881
f6a3f532 3882 assert (p);
5e69f11e 3883
505bde11
SM
3884 /* The loop below works as follows:
3885 - It has a working-list kept in the PATTERN_STACK and which basically
3886 starts by only containing a pointer to the first operation.
3887 - If the opcode we're looking at is a match against some set of
3888 chars, then we add those chars to the fastmap and go on to the
3889 next work element from the worklist (done via `break').
3890 - If the opcode is a control operator on the other hand, we either
3891 ignore it (if it's meaningless at this point, such as `start_memory')
3892 or execute it (if it's a jump). If the jump has several destinations
3893 (i.e. `on_failure_jump'), then we push the other destination onto the
3894 worklist.
3895 We guarantee termination by ignoring backward jumps (more or less),
3896 so that `p' is monotonically increasing. More to the point, we
3897 never set `p' (or push) anything `<= p1'. */
3898
01618498 3899 while (p < pend)
fa9a63c5 3900 {
505bde11
SM
3901 /* `p1' is used as a marker of how far back a `on_failure_jump'
3902 can go without being ignored. It is normally equal to `p'
3903 (which prevents any backward `on_failure_jump') except right
3904 after a plain `jump', to allow patterns such as:
3905 0: jump 10
3906 3..9: <body>
3907 10: on_failure_jump 3
3908 as used for the *? operator. */
01618498 3909 re_char *p1 = p;
5e69f11e 3910
7393bcbb 3911 switch (*p++)
fa9a63c5 3912 {
f6a3f532 3913 case succeed:
01618498 3914 return 1;
fa9a63c5 3915
fa9a63c5 3916 case duplicate:
505bde11
SM
3917 /* If the first character has to match a backreference, that means
3918 that the group was empty (since it already matched). Since this
3919 is the only case that interests us here, we can assume that the
3920 backreference must match the empty string. */
3921 p++;
3922 continue;
fa9a63c5
RM
3923
3924
3925 /* Following are the cases which match a character. These end
7814e705 3926 with `break'. */
fa9a63c5
RM
3927
3928 case exactn:
e0277a47 3929 if (fastmap)
cf9c99bc
KH
3930 {
3931 /* If multibyte is nonzero, the first byte of each
3932 character is an ASCII or a leading code. Otherwise,
3933 each byte is a character. Thus, this works in both
3934 cases. */
3935 fastmap[p[1]] = 1;
3936 if (! multibyte)
3937 {
3938 /* For the case of matching this unibyte regex
3939 against multibyte, we must set a leading code of
3940 the corresponding multibyte character. */
3941 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3942
86e893e3 3943 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
3944 }
3945 }
fa9a63c5
RM
3946 break;
3947
3948
1fb352e0
SM
3949 case anychar:
3950 /* We could put all the chars except for \n (and maybe \0)
3951 but we don't bother since it is generally not worth it. */
f6a3f532 3952 if (!fastmap) break;
01618498 3953 return -1;
fa9a63c5
RM
3954
3955
b18215fc 3956 case charset_not:
1fb352e0 3957 if (!fastmap) break;
bf216479
KH
3958 {
3959 /* Chars beyond end of bitmap are possible matches. */
bf216479 3960 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 3961 j < (1 << BYTEWIDTH); j++)
bf216479
KH
3962 fastmap[j] = 1;
3963 }
3964
1fb352e0
SM
3965 /* Fallthrough */
3966 case charset:
3967 if (!fastmap) break;
3968 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3969 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3970 j >= 0; j--)
1fb352e0 3971 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 3972 fastmap[j] = 1;
b18215fc 3973
6482db2e
KH
3974#ifdef emacs
3975 if (/* Any leading code can possibly start a character
1fb352e0 3976 which doesn't match the specified set of characters. */
6482db2e 3977 not
409f2919 3978 ||
6482db2e
KH
3979 /* If we can match a character class, we can match any
3980 multibyte characters. */
3981 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3982 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3983
b18215fc 3984 {
b18215fc
RS
3985 if (match_any_multibyte_characters == false)
3986 {
6482db2e
KH
3987 for (j = MIN_MULTIBYTE_LEADING_CODE;
3988 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 3989 fastmap[j] = 1;
b18215fc
RS
3990 match_any_multibyte_characters = true;
3991 }
3992 }
b18215fc 3993
1fb352e0
SM
3994 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3995 && match_any_multibyte_characters == false)
3996 {
bf216479 3997 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 3998 multibyte character in the range table. */
1fb352e0 3999 int c, count;
bf216479 4000 unsigned char lc1, lc2;
b18215fc 4001
1fb352e0 4002 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4003 bits for a character class. */
1fb352e0 4004 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4005
1fb352e0
SM
4006 /* Extract the number of ranges in range table into COUNT. */
4007 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4008 for (; count > 0; count--, p += 3)
1fb352e0 4009 {
9117d724
KH
4010 /* Extract the start and end of each range. */
4011 EXTRACT_CHARACTER (c, p);
bf216479 4012 lc1 = CHAR_LEADING_CODE (c);
9117d724 4013 p += 3;
1fb352e0 4014 EXTRACT_CHARACTER (c, p);
bf216479
KH
4015 lc2 = CHAR_LEADING_CODE (c);
4016 for (j = lc1; j <= lc2; j++)
9117d724 4017 fastmap[j] = 1;
1fb352e0
SM
4018 }
4019 }
6482db2e 4020#endif
b18215fc
RS
4021 break;
4022
1fb352e0
SM
4023 case syntaxspec:
4024 case notsyntaxspec:
4025 if (!fastmap) break;
4026#ifndef emacs
4027 not = (re_opcode_t)p[-1] == notsyntaxspec;
4028 k = *p++;
4029 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4030 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4031 fastmap[j] = 1;
b18215fc 4032 break;
1fb352e0 4033#else /* emacs */
b18215fc
RS
4034 /* This match depends on text properties. These end with
4035 aborting optimizations. */
01618498 4036 return -1;
b18215fc
RS
4037
4038 case categoryspec:
b18215fc 4039 case notcategoryspec:
1fb352e0
SM
4040 if (!fastmap) break;
4041 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4042 k = *p++;
6482db2e 4043 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4044 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4045 fastmap[j] = 1;
4046
6482db2e
KH
4047 /* Any leading code can possibly start a character which
4048 has or doesn't has the specified category. */
4049 if (match_any_multibyte_characters == false)
6fdd04b0 4050 {
6482db2e
KH
4051 for (j = MIN_MULTIBYTE_LEADING_CODE;
4052 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4053 fastmap[j] = 1;
4054 match_any_multibyte_characters = true;
6fdd04b0 4055 }
b18215fc
RS
4056 break;
4057
fa9a63c5 4058 /* All cases after this match the empty string. These end with
25fe55af 4059 `continue'. */
fa9a63c5 4060
fa9a63c5
RM
4061 case before_dot:
4062 case at_dot:
4063 case after_dot:
1fb352e0 4064#endif /* !emacs */
25fe55af
RS
4065 case no_op:
4066 case begline:
4067 case endline:
fa9a63c5
RM
4068 case begbuf:
4069 case endbuf:
4070 case wordbound:
4071 case notwordbound:
4072 case wordbeg:
4073 case wordend:
669fa600
SM
4074 case symbeg:
4075 case symend:
25fe55af 4076 continue;
fa9a63c5
RM
4077
4078
fa9a63c5 4079 case jump:
25fe55af 4080 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4081 if (j < 0)
4082 /* Backward jumps can only go back to code that we've already
4083 visited. `re_compile' should make sure this is true. */
4084 break;
25fe55af 4085 p += j;
7393bcbb 4086 switch (*p)
505bde11
SM
4087 {
4088 case on_failure_jump:
4089 case on_failure_keep_string_jump:
505bde11 4090 case on_failure_jump_loop:
0683b6fa 4091 case on_failure_jump_nastyloop:
505bde11
SM
4092 case on_failure_jump_smart:
4093 p++;
4094 break;
4095 default:
4096 continue;
4097 };
4098 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4099 to jump back to "just after here". */
4100 /* Fallthrough */
fa9a63c5 4101
25fe55af
RS
4102 case on_failure_jump:
4103 case on_failure_keep_string_jump:
0683b6fa 4104 case on_failure_jump_nastyloop:
505bde11
SM
4105 case on_failure_jump_loop:
4106 case on_failure_jump_smart:
25fe55af 4107 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4108 if (p + j <= p1)
ed0767d8 4109 ; /* Backward jump to be ignored. */
01618498
SM
4110 else
4111 { /* We have to look down both arms.
4112 We first go down the "straight" path so as to minimize
4113 stack usage when going through alternatives. */
4114 int r = analyse_first (p, pend, fastmap, multibyte);
4115 if (r) return r;
4116 p += j;
4117 }
25fe55af 4118 continue;
fa9a63c5
RM
4119
4120
ed0767d8
SM
4121 case jump_n:
4122 /* This code simply does not properly handle forward jump_n. */
4123 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4124 p += 4;
4125 /* jump_n can either jump or fall through. The (backward) jump
4126 case has already been handled, so we only need to look at the
4127 fallthrough case. */
4128 continue;
177c0ea7 4129
fa9a63c5 4130 case succeed_n:
ed0767d8
SM
4131 /* If N == 0, it should be an on_failure_jump_loop instead. */
4132 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4133 p += 4;
4134 /* We only care about one iteration of the loop, so we don't
4135 need to consider the case where this behaves like an
4136 on_failure_jump. */
25fe55af 4137 continue;
fa9a63c5
RM
4138
4139
4140 case set_number_at:
25fe55af
RS
4141 p += 4;
4142 continue;
fa9a63c5
RM
4143
4144
4145 case start_memory:
25fe55af 4146 case stop_memory:
505bde11 4147 p += 1;
fa9a63c5
RM
4148 continue;
4149
4150
4151 default:
25fe55af
RS
4152 abort (); /* We have listed all the cases. */
4153 } /* switch *p++ */
fa9a63c5
RM
4154
4155 /* Getting here means we have found the possible starting
25fe55af 4156 characters for one path of the pattern -- and that the empty
7814e705 4157 string does not match. We need not follow this path further. */
01618498 4158 return 0;
fa9a63c5
RM
4159 } /* while p */
4160
01618498
SM
4161 /* We reached the end without matching anything. */
4162 return 1;
4163
f6a3f532
SM
4164} /* analyse_first */
4165\f
4166/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4167 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4168 characters can start a string that matches the pattern. This fastmap
4169 is used by re_search to skip quickly over impossible starting points.
4170
4171 Character codes above (1 << BYTEWIDTH) are not represented in the
4172 fastmap, but the leading codes are represented. Thus, the fastmap
4173 indicates which character sets could start a match.
4174
4175 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4176 area as BUFP->fastmap.
4177
4178 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4179 the pattern buffer.
4180
4181 Returns 0 if we succeed, -2 if an internal error. */
4182
4183int
971de7fb 4184re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4185{
4186 char *fastmap = bufp->fastmap;
4187 int analysis;
4188
4189 assert (fastmap && bufp->buffer);
4190
72af86bd 4191 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4192 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4193
4194 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4195 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4196 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4197 return 0;
4198} /* re_compile_fastmap */
4199\f
4200/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4201 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4202 this memory for recording register information. STARTS and ENDS
4203 must be allocated using the malloc library routine, and must each
4204 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4205
4206 If NUM_REGS == 0, then subsequent matches should allocate their own
4207 register data.
4208
4209 Unless this function is called, the first search or match using
4210 PATTERN_BUFFER will allocate its own register data, without
4211 freeing the old data. */
4212
4213void
971de7fb 4214re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4215{
4216 if (num_regs)
4217 {
4218 bufp->regs_allocated = REGS_REALLOCATE;
4219 regs->num_regs = num_regs;
4220 regs->start = starts;
4221 regs->end = ends;
4222 }
4223 else
4224 {
4225 bufp->regs_allocated = REGS_UNALLOCATED;
4226 regs->num_regs = 0;
4227 regs->start = regs->end = (regoff_t *) 0;
4228 }
4229}
c0f9ea08 4230WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4231\f
7814e705 4232/* Searching routines. */
fa9a63c5
RM
4233
4234/* Like re_search_2, below, but only one string is specified, and
4235 doesn't let you say where to stop matching. */
4236
d1dfb56c
EZ
4237regoff_t
4238re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4239 ssize_t startpos, ssize_t range, struct re_registers *regs)
fa9a63c5 4240{
5e69f11e 4241 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4242 regs, size);
4243}
c0f9ea08 4244WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4245
70806df6
KH
/* Start address of whichever string (string1 or string2) holds
   position P of the virtual concatenation.  Uses `size1', `string1'
   and `string2' from the enclosing scope.  */
#define HEAD_ADDR_VSTRING(P)		\
  ((P) >= size1 ? string2 : string1)

/* Address of position POS in the virtual concatenation of string1 and
   string2.  The offset into string2 is computed before it is added, so
   that no intermediate pointer before the start of string2 is ever
   formed.  POS is evaluated more than once.  */
#define POS_ADDR_VSTRING(POS)					\
  ((POS) >= size1 ? string2 + ((POS) - size1) : string1 + (POS))
fa9a63c5
RM
4253
4254/* Using the compiled pattern in BUFP->buffer, first tries to match the
4255 virtual concatenation of STRING1 and STRING2, starting first at index
4256 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4257
fa9a63c5 4258 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4259
fa9a63c5
RM
4260 RANGE is how far to scan while trying to match. RANGE = 0 means try
4261 only at STARTPOS; in general, the last start tried is STARTPOS +
4262 RANGE.
5e69f11e 4263
fa9a63c5
RM
4264 In REGS, return the indices of the virtual concatenation of STRING1
4265 and STRING2 that matched the entire BUFP->buffer and its contained
4266 subexpressions.
5e69f11e 4267
fa9a63c5
RM
4268 Do not consider matching one past the index STOP in the virtual
4269 concatenation of STRING1 and STRING2.
4270
4271 We return either the position in the strings at which the match was
4272 found, -1 if no match, or -2 if error (such as failure
4273 stack overflow). */
4274
d1dfb56c
EZ
4275regoff_t
4276re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4277 const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4278 struct re_registers *regs, ssize_t stop)
fa9a63c5 4279{
d1dfb56c 4280 regoff_t val;
66f0296e
SM
4281 re_char *string1 = (re_char*) str1;
4282 re_char *string2 = (re_char*) str2;
fa9a63c5 4283 register char *fastmap = bufp->fastmap;
6676cb1c 4284 register RE_TRANSLATE_TYPE translate = bufp->translate;
d1dfb56c
EZ
4285 size_t total_size = size1 + size2;
4286 ssize_t endpos = startpos + range;
c0f9ea08 4287 boolean anchored_start;
cf9c99bc
KH
4288 /* Nonzero if we are searching multibyte string. */
4289 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4290
fa9a63c5
RM
4291 /* Check for out-of-range STARTPOS. */
4292 if (startpos < 0 || startpos > total_size)
4293 return -1;
5e69f11e 4294
fa9a63c5 4295 /* Fix up RANGE if it might eventually take us outside
34597fa9 4296 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4297 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4298 if (endpos < 0)
4299 range = 0 - startpos;
fa9a63c5
RM
4300 else if (endpos > total_size)
4301 range = total_size - startpos;
4302
4303 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4304 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4305 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4306 {
4307 if (startpos > 0)
4308 return -1;
4309 else
7b140fd7 4310 range = 0;
fa9a63c5
RM
4311 }
4312
ae4788a8
RS
4313#ifdef emacs
4314 /* In a forward search for something that starts with \=.
4315 don't keep searching past point. */
4316 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4317 {
7b140fd7
RS
4318 range = PT_BYTE - BEGV_BYTE - startpos;
4319 if (range < 0)
ae4788a8
RS
4320 return -1;
4321 }
4322#endif /* emacs */
4323
fa9a63c5
RM
4324 /* Update the fastmap now if not correct already. */
4325 if (fastmap && !bufp->fastmap_accurate)
01618498 4326 re_compile_fastmap (bufp);
5e69f11e 4327
c8499ba5 4328 /* See whether the pattern is anchored. */
c0f9ea08 4329 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4330
b18215fc 4331#ifdef emacs
d48cd3f4 4332 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4333 {
d1dfb56c 4334 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4335
4336 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4337 }
b18215fc
RS
4338#endif
4339
fa9a63c5
RM
4340 /* Loop through the string, looking for a place to start matching. */
4341 for (;;)
5e69f11e 4342 {
c8499ba5
RS
4343 /* If the pattern is anchored,
4344 skip quickly past places we cannot match.
4345 We don't bother to treat startpos == 0 specially
4346 because that case doesn't repeat. */
4347 if (anchored_start && startpos > 0)
4348 {
c0f9ea08
SM
4349 if (! ((startpos <= size1 ? string1[startpos - 1]
4350 : string2[startpos - size1 - 1])
4351 == '\n'))
c8499ba5
RS
4352 goto advance;
4353 }
4354
fa9a63c5 4355 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4356 cannot be the start of a match. If the pattern can match the
4357 null string, however, we don't need to skip characters; we want
7814e705 4358 the first null string. */
fa9a63c5
RM
4359 if (fastmap && startpos < total_size && !bufp->can_be_null)
4360 {
66f0296e 4361 register re_char *d;
01618498 4362 register re_wchar_t buf_ch;
e934739e
RS
4363
4364 d = POS_ADDR_VSTRING (startpos);
4365
7814e705 4366 if (range > 0) /* Searching forwards. */
fa9a63c5 4367 {
fa9a63c5 4368 register int lim = 0;
d1dfb56c 4369 ssize_t irange = range;
fa9a63c5 4370
25fe55af
RS
4371 if (startpos < size1 && startpos + range >= size1)
4372 lim = range - (size1 - startpos);
fa9a63c5 4373
25fe55af
RS
4374 /* Written out as an if-else to avoid testing `translate'
4375 inside the loop. */
28ae27ae
AS
4376 if (RE_TRANSLATE_P (translate))
4377 {
e934739e
RS
4378 if (multibyte)
4379 while (range > lim)
4380 {
4381 int buf_charlen;
4382
62a6e103 4383 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4384 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4385 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4386 break;
4387
4388 range -= buf_charlen;
4389 d += buf_charlen;
4390 }
4391 else
bf216479 4392 while (range > lim)
33c46939 4393 {
cf9c99bc
KH
4394 register re_wchar_t ch, translated;
4395
bf216479 4396 buf_ch = *d;
cf9c99bc
KH
4397 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4398 translated = RE_TRANSLATE (translate, ch);
4399 if (translated != ch
4400 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4401 buf_ch = ch;
6fdd04b0 4402 if (fastmap[buf_ch])
bf216479 4403 break;
33c46939
RS
4404 d++;
4405 range--;
4406 }
e934739e 4407 }
fa9a63c5 4408 else
6fdd04b0
KH
4409 {
4410 if (multibyte)
4411 while (range > lim)
4412 {
4413 int buf_charlen;
fa9a63c5 4414
62a6e103 4415 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4416 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4417 break;
4418 range -= buf_charlen;
4419 d += buf_charlen;
4420 }
e934739e 4421 else
6fdd04b0 4422 while (range > lim && !fastmap[*d])
33c46939
RS
4423 {
4424 d++;
4425 range--;
4426 }
e934739e 4427 }
fa9a63c5
RM
4428 startpos += irange - range;
4429 }
7814e705 4430 else /* Searching backwards. */
fa9a63c5 4431 {
ba5e343c
KH
4432 if (multibyte)
4433 {
62a6e103 4434 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4435 buf_ch = TRANSLATE (buf_ch);
4436 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4437 goto advance;
4438 }
4439 else
4440 {
cf9c99bc
KH
4441 register re_wchar_t ch, translated;
4442
4443 buf_ch = *d;
4444 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4445 translated = TRANSLATE (ch);
4446 if (translated != ch
4447 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4448 buf_ch = ch;
4449 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4450 goto advance;
4451 }
fa9a63c5
RM
4452 }
4453 }
4454
4455 /* If can't match the null string, and that's all we have left, fail. */
4456 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4457 && !bufp->can_be_null)
fa9a63c5
RM
4458 return -1;
4459
4460 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4461 startpos, regs, stop);
fa9a63c5
RM
4462
4463 if (val >= 0)
4464 return startpos;
5e69f11e 4465
fa9a63c5
RM
4466 if (val == -2)
4467 return -2;
4468
4469 advance:
5e69f11e 4470 if (!range)
25fe55af 4471 break;
5e69f11e 4472 else if (range > 0)
25fe55af 4473 {
b18215fc
RS
4474 /* Update STARTPOS to the next character boundary. */
4475 if (multibyte)
4476 {
66f0296e 4477 re_char *p = POS_ADDR_VSTRING (startpos);
aa3830c4 4478 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4479
4480 range -= len;
4481 if (range < 0)
4482 break;
4483 startpos += len;
4484 }
4485 else
4486 {
b560c397
RS
4487 range--;
4488 startpos++;
4489 }
e318085a 4490 }
fa9a63c5 4491 else
25fe55af
RS
4492 {
4493 range++;
4494 startpos--;
b18215fc
RS
4495
4496 /* Update STARTPOS to the previous character boundary. */
4497 if (multibyte)
4498 {
70806df6
KH
4499 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4500 re_char *p0 = p;
4501 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4502
4503 /* Find the head of multibyte form. */
70806df6
KH
4504 PREV_CHAR_BOUNDARY (p, phead);
4505 range += p0 - 1 - p;
4506 if (range > 0)
4507 break;
b18215fc 4508
70806df6 4509 startpos -= p0 - 1 - p;
b18215fc 4510 }
25fe55af 4511 }
fa9a63c5
RM
4512 }
4513 return -1;
4514} /* re_search_2 */
c0f9ea08 4515WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4516\f
4517/* Declarations and macros for re_match_2. */
4518
261cb4bb
PE
4519static int bcmp_translate (re_char *s1, re_char *s2,
4520 register ssize_t len,
4521 RE_TRANSLATE_TYPE translate,
4522 const int multibyte);
fa9a63c5
RM
4523
/* Convert PTR, a pointer into one of the search strings `string1'
   and `string2', into an offset: relative to string1 when
   FIRST_STRING_P (PTR) holds, otherwise relative to string2 with
   `size1' added.  */
#define POINTER_TO_OFFSET(ptr)				\
  (FIRST_STRING_P (ptr)					\
   ? ((regoff_t) ((ptr) - string1))			\
   : ((regoff_t) ((ptr) - string2 + size1)))
4530
/* Call before fetching a character with *d.  When D has reached DEND
   this either fails (DEND is already `end_match_2') or moves D to the
   start of string2 and DEND to `end_match_2'.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH()							\
  while (d == dend)							\
    {									\
      /* End of string2 => fail.  */					\
      if (dend == end_match_2)						\
	goto fail;							\
      /* End of string1 => advance to string2.  */			\
      d = string2;							\
      dend = end_match_2;						\
    }
4545
f1ad044f
SM
/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc..
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e past `stop').
   When D sits at `end1' it is moved to the start of string2 and DEND
   is set to `end_match_2'.  */
#define PREFETCH_NOLIMIT()						\
  if (d == end1)							\
     {									\
       d = string2;							\
       dend = end_match_2;						\
     }
fa9a63c5
RM
4556
/* Test whether D is at the very beginning or at the very end of the
   virtual concatenation of `string1' and `string2'.  If only one
   string, it's `string2'.  */
#define AT_STRINGS_BEG(d)	\
  ((d) == (size1 ? string1 : string2) || !size2)
#define AT_STRINGS_END(d)	\
  ((d) == end2)
fa9a63c5 4561
9121ca40 4562/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4563
4564/* The comment at case wordbound is following one, but we don't use
4565 AT_WORD_BOUNDARY anymore to support multibyte form.
4566
4567 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4568 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4569 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4570 macro and introducing temporary variables works around the bug. */
4571
9121ca40 4572#if 0
b313f9d8
PE
4573/* Test if D points to a character which is word-constituent. We have
4574 two special cases to check for: if past the end of string1, look at
4575 the first character in string2; and if before the beginning of
4576 string2, look at the last character in string1. */
4577#define WORDCHAR_P(d) \
4578 (SYNTAX ((d) == end1 ? *string2 \
4579 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4580 == Sword)
4581
fa9a63c5
RM
4582/* Test if the character before D and the one at D differ with respect
4583 to being word-constituent. */
4584#define AT_WORD_BOUNDARY(d) \
4585 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4586 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4587#endif
fa9a63c5
RM
4588
/* Release everything the matcher allocated.  FREE_VAR frees one
   variable if it is non-null and resets it to NULL; FREE_VARIABLES
   releases the failure stack and the register arrays.  Without
   MATCH_MAY_ALLOCATE, FREE_VARIABLES expands to a no-op.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)							\
  do {									\
    if (var)								\
      {									\
	REGEX_FREE (var);						\
	var = NULL;							\
      }									\
  } while (0)
# define FREE_VARIABLES()						\
  do {									\
    REGEX_FREE_STACK (fail_stack.stack);				\
    FREE_VAR (regstart);						\
    FREE_VAR (regend);							\
    FREE_VAR (best_regstart);						\
    FREE_VAR (best_regend);						\
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
#endif /* not MATCH_MAY_ALLOCATE */
4610
505bde11
SM
4611\f
4612/* Optimization routines. */
4613
4e8a9132
SM
4614/* If the operation is a match against one or more chars,
4615 return a pointer to the next operation, else return NULL. */
01618498 4616static re_char *
971de7fb 4617skip_one_char (const re_char *p)
4e8a9132 4618{
7393bcbb 4619 switch (*p++)
4e8a9132
SM
4620 {
4621 case anychar:
4622 break;
177c0ea7 4623
4e8a9132
SM
4624 case exactn:
4625 p += *p + 1;
4626 break;
4627
4628 case charset_not:
4629 case charset:
4630 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4631 {
4632 int mcnt;
4633 p = CHARSET_RANGE_TABLE (p - 1);
4634 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4635 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4636 }
4637 else
4638 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4639 break;
177c0ea7 4640
4e8a9132
SM
4641 case syntaxspec:
4642 case notsyntaxspec:
1fb352e0 4643#ifdef emacs
4e8a9132
SM
4644 case categoryspec:
4645 case notcategoryspec:
4646#endif /* emacs */
4647 p++;
4648 break;
4649
4650 default:
4651 p = NULL;
4652 }
4653 return p;
4654}
4655
4656
505bde11 4657/* Jump over non-matching operations. */
839966f3 4658static re_char *
971de7fb 4659skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4660{
4661 int mcnt;
4662 while (p < pend)
4663 {
7393bcbb 4664 switch (*p)
505bde11
SM
4665 {
4666 case start_memory:
505bde11
SM
4667 case stop_memory:
4668 p += 2; break;
4669 case no_op:
4670 p += 1; break;
4671 case jump:
4672 p += 1;
4673 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4674 p += mcnt;
4675 break;
4676 default:
4677 return p;
4678 }
4679 }
4680 assert (p == pend);
4681 return p;
4682}
4683
4684/* Non-zero if "p1 matches something" implies "p2 fails".

   BUFP is the compiled pattern that both P1 and P2 point into; it
   supplies the buffer bounds and the multibyte flag.  P1 is examined
   as-is (the caller is expected to pass a simple match operator, see
   below), while P2 is first advanced past no-op operations with
   skip_noops.

   Returns 1 only for the operator combinations handled in the switch
   below when the two can be shown not to match the same input, and
   0 (the safe default) in every other case.  */
4685static int
971de7fb 4686mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
505bde11 4687{
4e8a9132 4688 re_opcode_t op2;
2d1675e4 4689 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4690 unsigned char *pend = bufp->buffer + bufp->used;
4691
4e8a9132 4692 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4693 && p2 >= bufp->buffer && p2 <= pend);
4694
4695 /* Skip over open/close-group commands.
4696 If what follows this loop is a ...+ construct,
4697 look at what begins its body, since we will have to
4698 match at least one of that. */
4e8a9132
SM
4699 p2 = skip_noops (p2, pend);
4700 /* The same skip can be done for p1, except that this function
4701 is only used in the case where p1 is a simple match operator. */
4702 /* p1 = skip_noops (p1, pend); */
4703
4704 assert (p1 >= bufp->buffer && p1 < pend
4705 && p2 >= bufp->buffer && p2 <= pend);
4706
/* Running off the end of the pattern is handled like an explicit
   `succeed' opcode. */
4707 op2 = p2 == pend ? succeed : *p2;
4708
7393bcbb 4709 switch (op2)
505bde11 4710 {
4e8a9132
SM
4711 case succeed:
4712 case endbuf:
4713 /* If we're at the end of the pattern, we can change. */
4714 if (skip_one_char (p1))
505bde11 4715 {
505bde11
SM
4716 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4717 return 1;
505bde11 4718 }
4e8a9132 4719 break;
177c0ea7 4720
4e8a9132 4721 case endline:
4e8a9132
SM
4722 case exactn:
4723 {
/* C is the character P2 requires next: a newline for `endline',
   otherwise the first character of the exactn string. */
01618498 4724 register re_wchar_t c
4e8a9132 4725 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4726 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4727
4e8a9132
SM
4728 if ((re_opcode_t) *p1 == exactn)
4729 {
62a6e103 4730 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4731 {
4732 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4733 return 1;
4734 }
4735 }
505bde11 4736
4e8a9132
SM
4737 else if ((re_opcode_t) *p1 == charset
4738 || (re_opcode_t) *p1 == charset_not)
4739 {
4740 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4741
4e8a9132
SM
4742 /* Test if C is listed in charset (or charset_not)
4743 at `p1'. */
6fdd04b0 4744 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4745 {
4746 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4747 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4748 not = !not;
4749 }
4750 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4751 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4752
4e8a9132
SM
4753 /* `not' is equal to 1 if c would match, which means
4754 that we can't change to pop_failure_jump. */
4755 if (!not)
4756 {
4757 DEBUG_PRINT1 (" No match => fast loop.\n");
4758 return 1;
4759 }
4760 }
4761 else if ((re_opcode_t) *p1 == anychar
4762 && c == '\n')
4763 {
4764 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4765 return 1;
4766 }
4767 }
4768 break;
505bde11 4769
4e8a9132 4770 case charset:
4e8a9132
SM
4771 {
4772 if ((re_opcode_t) *p1 == exactn)
4773 /* Reuse the code above. */
/* The arguments are swapped, so the recursive call lands in the
   `exactn' case with the charset in the P1 role. */
4774 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4775
505bde11
SM
4776 /* It is hard to list up all the character in charset
4777 P2 if it includes multibyte character. Give up in
4778 such case. */
4779 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4780 {
4781 /* Now, we are sure that P2 has no range table.
4782 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4783 enough. But P1 may have range table, so the
505bde11
SM
4784 size of bitmap table of P1 is extracted by
4785 using macro `CHARSET_BITMAP_SIZE'.
4786
6fdd04b0
KH
4787 In a multibyte case, we know that all the character
4788 listed in P2 is ASCII. In a unibyte case, P1 has only a
4789 bitmap table. So, in both cases, it is enough to test
4790 only the bitmap table of P1. */
505bde11 4791
411e4203 4792 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4793 {
4794 int idx;
4795 /* We win if the charset inside the loop
4796 has no overlap with the one after the loop. */
4797 for (idx = 0;
4798 (idx < (int) p2[1]
4799 && idx < CHARSET_BITMAP_SIZE (p1));
4800 idx++)
4801 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4802 break;
4803
/* The loop ran to the end of the shorter bitmap without finding a
   common bit. */
4804 if (idx == p2[1]
4805 || idx == CHARSET_BITMAP_SIZE (p1))
4806 {
4807 DEBUG_PRINT1 (" No match => fast loop.\n");
4808 return 1;
4809 }
4810 }
411e4203 4811 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4812 {
4813 int idx;
4814 /* We win if the charset_not inside the loop lists
7814e705 4815 every character listed in the charset after. */
505bde11
SM
4816 for (idx = 0; idx < (int) p2[1]; idx++)
4817 if (! (p2[2 + idx] == 0
4818 || (idx < CHARSET_BITMAP_SIZE (p1)
4819 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4820 break;
4821
d1dfb56c
EZ
4822 if (idx == p2[1])
4823 {
4824 DEBUG_PRINT1 (" No match => fast loop.\n");
4825 return 1;
4826 }
4e8a9132
SM
4827 }
4828 }
4829 }
609b757a 4830 break;
177c0ea7 4831
411e4203 4832 case charset_not:
7393bcbb 4833 switch (*p1)
411e4203
SM
4834 {
4835 case exactn:
4836 case charset:
4837 /* Reuse the code above. */
4838 return mutually_exclusive_p (bufp, p2, p1);
4839 case charset_not:
4840 /* When we have two charset_not, it's very unlikely that
4841 they don't overlap. The union of the two sets of excluded
4842 chars should cover all possible chars, which, as a matter of
4843 fact, is virtually impossible in multibyte buffers. */
36595814 4844 break;
411e4203
SM
4845 }
4846 break;
4847
/* The remaining cases compare only the opcode of P1 and its one-byte
   operand against P2. */
4e8a9132 4848 case wordend:
669fa600
SM
4849 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4850 case symend:
4e8a9132 4851 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4852 && (p1[1] == Ssymbol || p1[1] == Sword));
4853 case notsyntaxspec:
4854 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4855
4856 case wordbeg:
669fa600
SM
4857 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4858 case symbeg:
4e8a9132 4859 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4860 && (p1[1] == Ssymbol || p1[1] == Sword));
4861 case syntaxspec:
4862 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4863
4864 case wordbound:
4865 return (((re_opcode_t) *p1 == notsyntaxspec
4866 || (re_opcode_t) *p1 == syntaxspec)
4867 && p1[1] == Sword);
4868
1fb352e0 4869#ifdef emacs
4e8a9132
SM
4870 case categoryspec:
4871 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4872 case notcategoryspec:
4873 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4874#endif /* emacs */
4875
4876 default:
4877 ;
505bde11
SM
4878 }
4879
4880 /* Safe default. */
4881 return 0;
4882}
4883
fa9a63c5
RM
4884\f
4885/* Matching routines. */
4886
25fe55af 4887#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4888/* re_match is like re_match_2 except it takes only a single string. */
4889
d1dfb56c 4890regoff_t
d2762c86 4891re_match (struct re_pattern_buffer *bufp, const char *string,
d1dfb56c 4892 size_t size, ssize_t pos, struct re_registers *regs)
fa9a63c5 4893{
d1dfb56c
EZ
4894 regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4895 size, pos, regs, size);
fa9a63c5
RM
4896 return result;
4897}
c0f9ea08 4898WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4899#endif /* not emacs */
4900
b18215fc
RS
4901#ifdef emacs
4902/* In Emacs, this is the string or buffer in which we
7814e705 4903 are matching. It is used for looking up syntax properties.
   re_match_2 copies it into gl_state.object and passes it to
   SETUP_SYNTAX_TABLE_FOR_OBJECT before matching starts. */
b18215fc
RS
4904Lisp_Object re_match_object;
4905#endif
fa9a63c5
RM
4906
4907/* re_match_2 matches the compiled pattern in BUFP against the
4908 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4909 and SIZE2, respectively). We start matching at POS, and stop
4910 matching at STOP.
5e69f11e 4911
fa9a63c5 4912 If REGS is non-null and the `no_sub' field of BUFP is zero, we
7814e705 4913 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4914 documentation for exactly how many groups we fill.
4915
4916 We return -1 if no match, -2 if an internal error (such as the
7814e705 4917 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4918 matched substring. */
4919
d1dfb56c
EZ
4920regoff_t
4921re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4922 size_t size1, const char *string2, size_t size2, ssize_t pos,
4923 struct re_registers *regs, ssize_t stop)
fa9a63c5 4924{
d1dfb56c 4925 regoff_t result;
25fe55af 4926
b18215fc 4927#ifdef emacs
d1dfb56c 4928 ssize_t charpos;
d48cd3f4 4929 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 4930 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4931 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4932#endif
4933
4bb91c68
SM
4934 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4935 (re_char*) string2, size2,
cc9b4df2 4936 pos, regs, stop);
fa9a63c5
RM
4937 return result;
4938}
c0f9ea08 4939WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 4940
bf216479 4941
fa9a63c5 4942/* This is a separate function so that we can force an alloca cleanup
7814e705 4943 afterwards. */
d1dfb56c
EZ
4944static regoff_t
4945re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1,
4946 size_t size1, const re_char *string2, size_t size2,
4947 ssize_t pos, struct re_registers *regs, ssize_t stop)
fa9a63c5
RM
4948{
4949 /* General temporaries. */
d1dfb56c 4950 ssize_t mcnt;
01618498 4951 size_t reg;
fa9a63c5
RM
4952
4953 /* Just past the end of the corresponding string. */
66f0296e 4954 re_char *end1, *end2;
fa9a63c5
RM
4955
4956 /* Pointers into string1 and string2, just past the last characters in
7814e705 4957 each to consider matching. */
66f0296e 4958 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4959
4960 /* Where we are in the data, and the end of the current string. */
66f0296e 4961 re_char *d, *dend;
5e69f11e 4962
99633e97
SM
4963 /* Used sometimes to remember where we were before starting matching
4964 an operator so that we can go back in case of failure. This "atomic"
4965 behavior of matching opcodes is indispensable to the correctness
4966 of the on_failure_keep_string_jump optimization. */
4967 re_char *dfail;
4968
fa9a63c5 4969 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4970 re_char *p = bufp->buffer;
4971 re_char *pend = p + bufp->used;
fa9a63c5 4972
25fe55af 4973 /* We use this to map every character in the string. */
6676cb1c 4974 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4975
cf9c99bc 4976 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 4977 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4978
cf9c99bc
KH
4979 /* Nonzero if STRING1/STRING2 are multibyte. */
4980 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4981
fa9a63c5
RM
4982 /* Failure point stack. Each place that can handle a failure further
4983 down the line pushes a failure point on this stack. It consists of
505bde11 4984 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4985 the subexpressions we're currently inside, plus the number of such
4986 registers, and, finally, two char *'s. The first char * is where
4987 to resume scanning the pattern; the second one is where to resume
7814e705
JB
4988 scanning the strings. */
4989#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4990 fail_stack_type fail_stack;
4991#endif
4992#ifdef DEBUG
fa9a63c5
RM
4993 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4994#endif
4995
0b32bf0e 4996#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4997 /* This holds the pointer to the failure stack, when
4998 it is allocated relocatably. */
4999 fail_stack_elt_t *failure_stack_ptr;
99633e97 5000#endif
fa9a63c5
RM
5001
5002 /* We fill all the registers internally, independent of what we
7814e705 5003 return, for use in backreferences. The number here includes
fa9a63c5 5004 an element for register zero. */
4bb91c68 5005 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5006
fa9a63c5
RM
5007 /* Information on the contents of registers. These are pointers into
5008 the input strings; they record just what was matched (on this
5009 attempt) by a subexpression part of the pattern, that is, the
5010 regnum-th regstart pointer points to where in the pattern we began
5011 matching and the regnum-th regend points to right after where we
5012 stopped matching the regnum-th subexpression. (The zeroth register
5013 keeps track of what the whole pattern matches.) */
5014#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5015 re_char **regstart, **regend;
fa9a63c5
RM
5016#endif
5017
fa9a63c5 5018 /* The following record the register info as found in the above
5e69f11e 5019 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5020 This happens as we backtrack through the failure points, which in
5021 turn happens only if we have not yet matched the entire string. */
5022 unsigned best_regs_set = false;
5023#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5024 re_char **best_regstart, **best_regend;
fa9a63c5 5025#endif
5e69f11e 5026
fa9a63c5
RM
5027 /* Logically, this is `best_regend[0]'. But we don't want to have to
5028 allocate space for that if we're not allocating space for anything
7814e705 5029 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5030 any of the other register vectors, and it seems rather a kludge to
5031 treat `best_regend' differently than the rest. So we keep track of
5032 the end of the best match so far in a separate variable. We
5033 initialize this to NULL so that when we backtrack the first time
5034 and need to test it, it's not garbage. */
66f0296e 5035 re_char *match_end = NULL;
fa9a63c5 5036
fa9a63c5
RM
5037#ifdef DEBUG
5038 /* Counts the total number of registers pushed. */
5e69f11e 5039 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5040#endif
5041
5042 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5043
fa9a63c5 5044 INIT_FAIL_STACK ();
5e69f11e 5045
fa9a63c5
RM
5046#ifdef MATCH_MAY_ALLOCATE
5047 /* Do not bother to initialize all the register variables if there are
5048 no groups in the pattern, as it takes a fair amount of time. If
5049 there are groups, we include space for register 0 (the whole
5050 pattern), even though we never use it, since it simplifies the
5051 array indexing. We should fix this. */
5052 if (bufp->re_nsub)
5053 {
66f0296e
SM
5054 regstart = REGEX_TALLOC (num_regs, re_char *);
5055 regend = REGEX_TALLOC (num_regs, re_char *);
5056 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5057 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5058
505bde11 5059 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5060 {
5061 FREE_VARIABLES ();
5062 return -2;
5063 }
fa9a63c5
RM
5064 }
5065 else
5066 {
5067 /* We must initialize all our variables to NULL, so that
25fe55af 5068 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5069 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5070 }
5071#endif /* MATCH_MAY_ALLOCATE */
5072
5073 /* The starting position is bogus. */
5074 if (pos < 0 || pos > size1 + size2)
5075 {
5076 FREE_VARIABLES ();
5077 return -1;
5078 }
5e69f11e 5079
fa9a63c5
RM
5080 /* Initialize subexpression text positions to -1 to mark ones that no
5081 start_memory/stop_memory has been seen for. Also initialize the
5082 register information struct. */
01618498
SM
5083 for (reg = 1; reg < num_regs; reg++)
5084 regstart[reg] = regend[reg] = NULL;
99633e97 5085
fa9a63c5 5086 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5087 `string1' is null. */
fa9a63c5
RM
5088 if (size2 == 0 && string1 != NULL)
5089 {
5090 string2 = string1;
5091 size2 = size1;
5092 string1 = 0;
5093 size1 = 0;
5094 }
5095 end1 = string1 + size1;
5096 end2 = string2 + size2;
5097
5e69f11e 5098 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5099 `dend' is the end of the input string that `d' points within. `d'
5100 is advanced into the following input string whenever necessary, but
5101 this happens before fetching; therefore, at the beginning of the
5102 loop, `d' can be pointing at the end of a string, but it cannot
5103 equal `string2'. */
419d1c74 5104 if (pos >= size1)
fa9a63c5 5105 {
419d1c74
SM
5106 /* Only match within string2. */
5107 d = string2 + pos - size1;
5108 dend = end_match_2 = string2 + stop - size1;
5109 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5110 }
5111 else
5112 {
f1ad044f 5113 if (stop < size1)
419d1c74
SM
5114 {
5115 /* Only match within string1. */
5116 end_match_1 = string1 + stop;
5117 /* BEWARE!
5118 When we reach end_match_1, PREFETCH normally switches to string2.
5119 But in the present case, this means that just doing a PREFETCH
5120 makes us jump from `stop' to `gap' within the string.
5121 What we really want here is for the search to stop as
5122 soon as we hit end_match_1. That's why we set end_match_2
5123 to end_match_1 (since PREFETCH fails as soon as we hit
5124 end_match_2). */
5125 end_match_2 = end_match_1;
5126 }
5127 else
f1ad044f
SM
5128 { /* It's important to use this code when stop == size so that
5129 moving `d' from end1 to string2 will not prevent the d == dend
5130 check from catching the end of string. */
419d1c74
SM
5131 end_match_1 = end1;
5132 end_match_2 = string2 + stop - size1;
5133 }
5134 d = string1 + pos;
5135 dend = end_match_1;
fa9a63c5
RM
5136 }
5137
5138 DEBUG_PRINT1 ("The compiled pattern is: ");
5139 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5140 DEBUG_PRINT1 ("The string to match is: `");
5141 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5142 DEBUG_PRINT1 ("'\n");
5e69f11e 5143
7814e705 5144 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5145 function if the match is complete, or it drops through if the match
5146 fails at this starting point in the input data. */
5147 for (;;)
5148 {
505bde11 5149 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5150
5151 if (p == pend)
5152 { /* End of pattern means we might have succeeded. */
25fe55af 5153 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5154
fa9a63c5 5155 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5156 longest match, try backtracking. */
5157 if (d != end_match_2)
fa9a63c5
RM
5158 {
5159 /* 1 if this match ends in the same string (string1 or string2)
5160 as the best previous match. */
d42f4f0f
PE
5161 boolean same_str_p = (FIRST_STRING_P (match_end)
5162 == FIRST_STRING_P (d));
fa9a63c5
RM
5163 /* 1 if this match is the best seen so far. */
5164 boolean best_match_p;
5165
5166 /* AIX compiler got confused when this was combined
7814e705 5167 with the previous declaration. */
fa9a63c5
RM
5168 if (same_str_p)
5169 best_match_p = d > match_end;
5170 else
99633e97 5171 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5172
25fe55af
RS
5173 DEBUG_PRINT1 ("backtracking.\n");
5174
5175 if (!FAIL_STACK_EMPTY ())
5176 { /* More failure points to try. */
5177
5178 /* If exceeds best match so far, save it. */
5179 if (!best_regs_set || best_match_p)
5180 {
5181 best_regs_set = true;
5182 match_end = d;
5183
5184 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5185
01618498 5186 for (reg = 1; reg < num_regs; reg++)
25fe55af 5187 {
01618498
SM
5188 best_regstart[reg] = regstart[reg];
5189 best_regend[reg] = regend[reg];
25fe55af
RS
5190 }
5191 }
5192 goto fail;
5193 }
5194
5195 /* If no failure points, don't restore garbage. And if
5196 last match is real best match, don't restore second
5197 best one. */
5198 else if (best_regs_set && !best_match_p)
5199 {
5200 restore_best_regs:
5201 /* Restore best match. It may happen that `dend ==
5202 end_match_1' while the restored d is in string2.
5203 For example, the pattern `x.*y.*z' against the
5204 strings `x-' and `y-z-', if the two strings are
7814e705 5205 not consecutive in memory. */
25fe55af
RS
5206 DEBUG_PRINT1 ("Restoring best registers.\n");
5207
5208 d = match_end;
5209 dend = ((d >= string1 && d <= end1)
5210 ? end_match_1 : end_match_2);
fa9a63c5 5211
01618498 5212 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5213 {
01618498
SM
5214 regstart[reg] = best_regstart[reg];
5215 regend[reg] = best_regend[reg];
fa9a63c5 5216 }
25fe55af
RS
5217 }
5218 } /* d != end_match_2 */
fa9a63c5
RM
5219
5220 succeed_label:
25fe55af 5221 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5222
25fe55af
RS
5223 /* If caller wants register contents data back, do it. */
5224 if (regs && !bufp->no_sub)
fa9a63c5 5225 {
25fe55af
RS
5226 /* Have the register data arrays been allocated? */
5227 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5228 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5229 extra element beyond `num_regs' for the `-1' marker
5230 GNU code uses. */
5231 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5232 regs->start = TALLOC (regs->num_regs, regoff_t);
5233 regs->end = TALLOC (regs->num_regs, regoff_t);
5234 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5235 {
5236 FREE_VARIABLES ();
5237 return -2;
5238 }
25fe55af
RS
5239 bufp->regs_allocated = REGS_REALLOCATE;
5240 }
5241 else if (bufp->regs_allocated == REGS_REALLOCATE)
5242 { /* Yes. If we need more elements than were already
5243 allocated, reallocate them. If we need fewer, just
5244 leave it alone. */
5245 if (regs->num_regs < num_regs + 1)
5246 {
5247 regs->num_regs = num_regs + 1;
5248 RETALLOC (regs->start, regs->num_regs, regoff_t);
5249 RETALLOC (regs->end, regs->num_regs, regoff_t);
5250 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5251 {
5252 FREE_VARIABLES ();
5253 return -2;
5254 }
25fe55af
RS
5255 }
5256 }
5257 else
fa9a63c5
RM
5258 {
5259 /* These braces fend off a "empty body in an else-statement"
7814e705 5260 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5261 assert (bufp->regs_allocated == REGS_FIXED);
5262 }
5263
25fe55af
RS
5264 /* Convert the pointer data in `regstart' and `regend' to
5265 indices. Register zero has to be set differently,
5266 since we haven't kept track of any info for it. */
5267 if (regs->num_regs > 0)
5268 {
5269 regs->start[0] = pos;
99633e97 5270 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5271 }
5e69f11e 5272
25fe55af
RS
5273 /* Go through the first `min (num_regs, regs->num_regs)'
5274 registers, since that is all we initialized. */
01618498 5275 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5276 {
01618498
SM
5277 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5278 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5279 else
5280 {
01618498
SM
5281 regs->start[reg]
5282 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5283 regs->end[reg]
5284 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5285 }
fa9a63c5 5286 }
5e69f11e 5287
25fe55af
RS
5288 /* If the regs structure we return has more elements than
5289 were in the pattern, set the extra elements to -1. If
5290 we (re)allocated the registers, this is the case,
5291 because we always allocate enough to have at least one
7814e705 5292 -1 at the end. */
01618498
SM
5293 for (reg = num_regs; reg < regs->num_regs; reg++)
5294 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5295 } /* regs && !bufp->no_sub */
5296
25fe55af
RS
5297 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5298 nfailure_points_pushed, nfailure_points_popped,
5299 nfailure_points_pushed - nfailure_points_popped);
5300 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5301
99633e97 5302 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5303
25fe55af 5304 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5305
25fe55af
RS
5306 FREE_VARIABLES ();
5307 return mcnt;
5308 }
fa9a63c5 5309
7814e705 5310 /* Otherwise match next pattern command. */
7393bcbb 5311 switch (*p++)
fa9a63c5 5312 {
25fe55af
RS
5313 /* Ignore these. Used to ignore the n of succeed_n's which
5314 currently have n == 0. */
5315 case no_op:
5316 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5317 break;
fa9a63c5
RM
5318
5319 case succeed:
25fe55af 5320 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5321 goto succeed_label;
5322
7814e705 5323 /* Match the next n pattern characters exactly. The following
25fe55af 5324 byte in the pattern defines n, and the n bytes after that
7814e705 5325 are the characters to match. */
fa9a63c5
RM
5326 case exactn:
5327 mcnt = *p++;
25fe55af 5328 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5329
99633e97
SM
5330 /* Remember the start point to rollback upon failure. */
5331 dfail = d;
5332
6fdd04b0 5333#ifndef emacs
25fe55af
RS
5334 /* This is written out as an if-else so we don't waste time
5335 testing `translate' inside the loop. */
28703c16 5336 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5337 do
5338 {
5339 PREFETCH ();
5340 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5341 {
6fdd04b0
KH
5342 d = dfail;
5343 goto fail;
e934739e 5344 }
6fdd04b0
KH
5345 d++;
5346 }
5347 while (--mcnt);
fa9a63c5 5348 else
6fdd04b0
KH
5349 do
5350 {
5351 PREFETCH ();
5352 if (*d++ != *p++)
bf216479 5353 {
6fdd04b0
KH
5354 d = dfail;
5355 goto fail;
bf216479 5356 }
6fdd04b0
KH
5357 }
5358 while (--mcnt);
5359#else /* emacs */
5360 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5361 if (target_multibyte)
6fdd04b0
KH
5362 do
5363 {
5364 int pat_charlen, buf_charlen;
cf9c99bc 5365 int pat_ch, buf_ch;
e934739e 5366
6fdd04b0 5367 PREFETCH ();
cf9c99bc 5368 if (multibyte)
62a6e103 5369 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5370 else
5371 {
5372 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5373 pat_charlen = 1;
5374 }
62a6e103 5375 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5376
6fdd04b0 5377 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5378 {
6fdd04b0
KH
5379 d = dfail;
5380 goto fail;
e934739e 5381 }
bf216479 5382
6fdd04b0
KH
5383 p += pat_charlen;
5384 d += buf_charlen;
5385 mcnt -= pat_charlen;
5386 }
5387 while (mcnt > 0);
fa9a63c5 5388 else
6fdd04b0
KH
5389 do
5390 {
abbd1bcf 5391 int pat_charlen;
cf9c99bc 5392 int pat_ch, buf_ch;
bf216479 5393
6fdd04b0 5394 PREFETCH ();
cf9c99bc
KH
5395 if (multibyte)
5396 {
62a6e103 5397 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5398 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5399 }
5400 else
5401 {
5402 pat_ch = *p;
5403 pat_charlen = 1;
5404 }
5405 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5406 if (! CHAR_BYTE8_P (buf_ch))
5407 {
5408 buf_ch = TRANSLATE (buf_ch);
5409 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5410 if (buf_ch < 0)
5411 buf_ch = *d;
5412 }
0e2501ed
AS
5413 else
5414 buf_ch = *d;
cf9c99bc 5415 if (buf_ch != pat_ch)
6fdd04b0
KH
5416 {
5417 d = dfail;
5418 goto fail;
bf216479 5419 }
cf9c99bc
KH
5420 p += pat_charlen;
5421 d++;
6fdd04b0
KH
5422 }
5423 while (--mcnt);
5424#endif
25fe55af 5425 break;
fa9a63c5
RM
5426
5427
25fe55af 5428 /* Match any character except possibly a newline or a null. */
fa9a63c5 5429 case anychar:
e934739e
RS
5430 {
5431 int buf_charlen;
01618498 5432 re_wchar_t buf_ch;
fa9a63c5 5433
e934739e 5434 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5435
e934739e 5436 PREFETCH ();
62a6e103 5437 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5438 target_multibyte);
e934739e
RS
5439 buf_ch = TRANSLATE (buf_ch);
5440
5441 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5442 && buf_ch == '\n')
5443 || ((bufp->syntax & RE_DOT_NOT_NULL)
5444 && buf_ch == '\000'))
5445 goto fail;
5446
e934739e
RS
5447 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5448 d += buf_charlen;
5449 }
fa9a63c5
RM
5450 break;
5451
5452
5453 case charset:
5454 case charset_not:
5455 {
b18215fc 5456 register unsigned int c;
fa9a63c5 5457 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5458 int len;
5459
5460 /* Start of actual range_table, or end of bitmap if there is no
5461 range table. */
da053e48 5462 re_char *range_table IF_LINT (= NULL);
b18215fc 5463
96cc36cc 5464 /* Nonzero if there is a range table. */
b18215fc
RS
5465 int range_table_exists;
5466
96cc36cc
RS
5467 /* Number of ranges of range table. This is not included
5468 in the initial byte-length of the command. */
5469 int count = 0;
fa9a63c5 5470
f5020181
AS
5471 /* Whether matching against a unibyte character. */
5472 boolean unibyte_char = false;
5473
25fe55af 5474 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5475
b18215fc 5476 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5477
b18215fc 5478 if (range_table_exists)
96cc36cc
RS
5479 {
5480 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5481 EXTRACT_NUMBER_AND_INCR (count, range_table);
5482 }
b18215fc 5483
2d1675e4 5484 PREFETCH ();
62a6e103 5485 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5486 if (target_multibyte)
5487 {
5488 int c1;
b18215fc 5489
cf9c99bc
KH
5490 c = TRANSLATE (c);
5491 c1 = RE_CHAR_TO_UNIBYTE (c);
5492 if (c1 >= 0)
f5020181
AS
5493 {
5494 unibyte_char = true;
5495 c = c1;
5496 }
cf9c99bc
KH
5497 }
5498 else
5499 {
5500 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5501
5502 if (! CHAR_BYTE8_P (c1))
5503 {
5504 c1 = TRANSLATE (c1);
5505 c1 = RE_CHAR_TO_UNIBYTE (c1);
5506 if (c1 >= 0)
f5020181
AS
5507 {
5508 unibyte_char = true;
5509 c = c1;
5510 }
cf9c99bc 5511 }
0b8be006
AS
5512 else
5513 unibyte_char = true;
cf9c99bc
KH
5514 }
5515
f5020181 5516 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5517 { /* Lookup bitmap. */
b18215fc
RS
5518 /* Cast to `unsigned' instead of `unsigned char' in
5519 case the bit list is a full 32 bytes long. */
5520 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5521 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5522 not = !not;
b18215fc 5523 }
96cc36cc 5524#ifdef emacs
b18215fc 5525 else if (range_table_exists)
96cc36cc
RS
5526 {
5527 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5528
14473664
SM
5529 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5530 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5531 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5532 | (class_bits & BIT_SPACE && ISSPACE (c))
5533 | (class_bits & BIT_UPPER && ISUPPER (c))
5534 | (class_bits & BIT_WORD && ISWORD (c)))
5535 not = !not;
5536 else
5537 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5538 }
5539#endif /* emacs */
fa9a63c5 5540
96cc36cc
RS
5541 if (range_table_exists)
5542 p = CHARSET_RANGE_TABLE_END (range_table, count);
5543 else
5544 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5545
5546 if (!not) goto fail;
5e69f11e 5547
b18215fc 5548 d += len;
fa9a63c5 5549 }
8fb31792 5550 break;
fa9a63c5
RM
5551
5552
25fe55af 5553 /* The beginning of a group is represented by start_memory.
505bde11 5554 The argument is the register number. The text
25fe55af 5555 matched within the group is recorded (in the internal
7814e705 5556 registers data structure) under the register number. */
25fe55af 5557 case start_memory:
505bde11
SM
5558 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5559
5560 /* In case we need to undo this operation (via backtracking). */
5561 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5562
25fe55af 5563 regstart[*p] = d;
4bb91c68 5564 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5565 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5566
25fe55af 5567 /* Move past the register number and inner group count. */
505bde11 5568 p += 1;
25fe55af 5569 break;
fa9a63c5
RM
5570
5571
25fe55af 5572 /* The stop_memory opcode represents the end of a group. Its
505bde11 5573 argument is the same as start_memory's: the register number. */
fa9a63c5 5574 case stop_memory:
505bde11
SM
5575 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5576
5577 assert (!REG_UNSET (regstart[*p]));
5578 /* Strictly speaking, there should be code such as:
177c0ea7 5579
0b32bf0e 5580 assert (REG_UNSET (regend[*p]));
505bde11
SM
5581 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5582
5583 But the only info to be pushed is regend[*p] and it is known to
5584 be UNSET, so there really isn't anything to push.
5585 Not pushing anything, on the other hand deprives us from the
5586 guarantee that regend[*p] is UNSET since undoing this operation
5587 will not reset its value properly. This is not important since
5588 the value will only be read on the next start_memory or at
5589 the very end and both events can only happen if this stop_memory
5590 is *not* undone. */
fa9a63c5 5591
25fe55af 5592 regend[*p] = d;
fa9a63c5
RM
5593 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5594
25fe55af 5595 /* Move past the register number and the inner group count. */
505bde11 5596 p += 1;
25fe55af 5597 break;
fa9a63c5
RM
5598
5599
5600 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5601 followed by the numeric value of <digit> as the register number. */
5602 case duplicate:
fa9a63c5 5603 {
66f0296e 5604 register re_char *d2, *dend2;
7814e705 5605 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5606 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5607
7814e705 5608 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5609 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5610 goto fail;
5e69f11e 5611
7814e705 5612 /* Where in input to try to start matching. */
25fe55af 5613 d2 = regstart[regno];
5e69f11e 5614
99633e97
SM
5615 /* Remember the start point to rollback upon failure. */
5616 dfail = d;
5617
25fe55af
RS
5618 /* Where to stop matching; if both the place to start and
5619 the place to stop matching are in the same string, then
5620 set to the place to stop, otherwise, for now have to use
5621 the end of the first string. */
fa9a63c5 5622
25fe55af 5623 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5624 == FIRST_STRING_P (regend[regno]))
5625 ? regend[regno] : end_match_1);
5626 for (;;)
5627 {
5628 /* If necessary, advance to next segment in register
25fe55af 5629 contents. */
fa9a63c5
RM
5630 while (d2 == dend2)
5631 {
5632 if (dend2 == end_match_2) break;
5633 if (dend2 == regend[regno]) break;
5634
25fe55af
RS
5635 /* End of string1 => advance to string2. */
5636 d2 = string2;
5637 dend2 = regend[regno];
fa9a63c5
RM
5638 }
5639 /* At end of register contents => success */
5640 if (d2 == dend2) break;
5641
5642 /* If necessary, advance to next segment in data. */
5643 PREFETCH ();
5644
5645 /* How many characters left in this segment to match. */
5646 mcnt = dend - d;
5e69f11e 5647
fa9a63c5 5648 /* Want how many consecutive characters we can match in
25fe55af
RS
5649 one shot, so, if necessary, adjust the count. */
5650 if (mcnt > dend2 - d2)
fa9a63c5 5651 mcnt = dend2 - d2;
5e69f11e 5652
fa9a63c5 5653 /* Compare that many; failure if mismatch, else move
25fe55af 5654 past them. */
28703c16 5655 if (RE_TRANSLATE_P (translate)
02cb78b5 5656 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5657 : memcmp (d, d2, mcnt))
99633e97
SM
5658 {
5659 d = dfail;
5660 goto fail;
5661 }
fa9a63c5 5662 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5663 }
5664 }
5665 break;
5666
5667
25fe55af 5668 /* begline matches the empty string at the beginning of the string
c0f9ea08 5669 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5670 case begline:
25fe55af 5671 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5672
25fe55af
RS
5673 if (AT_STRINGS_BEG (d))
5674 {
5675 if (!bufp->not_bol) break;
5676 }
419d1c74 5677 else
25fe55af 5678 {
bf216479 5679 unsigned c;
419d1c74 5680 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5681 if (c == '\n')
419d1c74 5682 break;
25fe55af
RS
5683 }
5684 /* In all other cases, we fail. */
5685 goto fail;
fa9a63c5
RM
5686
5687
25fe55af 5688 /* endline is the dual of begline. */
fa9a63c5 5689 case endline:
25fe55af 5690 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5691
25fe55af
RS
5692 if (AT_STRINGS_END (d))
5693 {
5694 if (!bufp->not_eol) break;
5695 }
f1ad044f 5696 else
25fe55af 5697 {
f1ad044f 5698 PREFETCH_NOLIMIT ();
c0f9ea08 5699 if (*d == '\n')
f1ad044f 5700 break;
25fe55af
RS
5701 }
5702 goto fail;
fa9a63c5
RM
5703
5704
5705 /* Match at the very beginning of the data. */
25fe55af
RS
5706 case begbuf:
5707 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5708 if (AT_STRINGS_BEG (d))
5709 break;
5710 goto fail;
fa9a63c5
RM
5711
5712
5713 /* Match at the very end of the data. */
25fe55af
RS
5714 case endbuf:
5715 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5716 if (AT_STRINGS_END (d))
5717 break;
25fe55af 5718 goto fail;
5e69f11e 5719
5e69f11e 5720
25fe55af
RS
5721 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5722 pushes NULL as the value for the string on the stack. Then
505bde11 5723 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5724 string, instead of restoring it. To see why, consider
7814e705 5725 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5726 then the . fails against the \n. But the next thing we want
5727 to do is match the \n against the \n; if we restored the
5728 string value, we would be back at the foo.
5729
5730 Because this is used only in specific cases, we don't need to
5731 check all the things that `on_failure_jump' does, to make
5732 sure the right things get saved on the stack. Hence we don't
5733 share its code. The only reason to push anything on the
5734 stack at all is that otherwise we would have to change
5735 `anychar's code to do something besides goto fail in this
5736 case; that seems worse than this. */
5737 case on_failure_keep_string_jump:
505bde11
SM
5738 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5739 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5740 mcnt, p + mcnt);
fa9a63c5 5741
505bde11
SM
5742 PUSH_FAILURE_POINT (p - 3, NULL);
5743 break;
5744
0683b6fa
SM
5745 /* A nasty loop is introduced by the non-greedy *? and +?.
5746 With such loops, the stack only ever contains one failure point
5747 at a time, so that a plain on_failure_jump_loop kind of
5748 cycle detection cannot work. Worse yet, such a detection
5749 can not only fail to detect a cycle, but it can also wrongly
5750 detect a cycle (between different instantiations of the same
6df42991 5751 loop).
0683b6fa
SM
5752 So the method used for those nasty loops is a little different:
5753 We use a special cycle-detection-stack-frame which is pushed
5754 when the on_failure_jump_nastyloop failure-point is *popped*.
5755 This special frame thus marks the beginning of one iteration
5756 through the loop and we can hence easily check right here
5757 whether something matched between the beginning and the end of
5758 the loop. */
5759 case on_failure_jump_nastyloop:
5760 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5761 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5762 mcnt, p + mcnt);
5763
5764 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5765 {
5766 int cycle = 0;
5767 CHECK_INFINITE_LOOP (p - 4, d);
5768 if (!cycle)
5769 /* If there's a cycle, just continue without pushing
5770 this failure point. The failure point is the "try again"
5771 option, which shouldn't be tried.
5772 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5773 PUSH_FAILURE_POINT (p - 3, d);
5774 }
0683b6fa
SM
5775 break;
5776
4e8a9132
SM
5777 /* Simple loop detecting on_failure_jump: just check on the
5778 failure stack if the same spot was already hit earlier. */
505bde11
SM
5779 case on_failure_jump_loop:
5780 on_failure:
5781 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5782 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5783 mcnt, p + mcnt);
6df42991
SM
5784 {
5785 int cycle = 0;
5786 CHECK_INFINITE_LOOP (p - 3, d);
5787 if (cycle)
5788 /* If there's a cycle, get out of the loop, as if the matching
5789 had failed. We used to just `goto fail' here, but that was
5790 aborting the search a bit too early: we want to keep the
5791 empty-loop-match and keep matching after the loop.
5792 We want (x?)*y\1z to match both xxyz and xxyxz. */
5793 p += mcnt;
5794 else
5795 PUSH_FAILURE_POINT (p - 3, d);
5796 }
25fe55af 5797 break;
fa9a63c5
RM
5798
5799
5800 /* Uses of on_failure_jump:
5e69f11e 5801
25fe55af
RS
5802 Each alternative starts with an on_failure_jump that points
5803 to the beginning of the next alternative. Each alternative
5804 except the last ends with a jump that in effect jumps past
5805 the rest of the alternatives. (They really jump to the
5806 ending jump of the following alternative, because tensioning
5807 these jumps is a hassle.)
fa9a63c5 5808
25fe55af
RS
5809 Repeats start with an on_failure_jump that points past both
5810 the repetition text and either the following jump or
5811 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5812 case on_failure_jump:
25fe55af 5813 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5814 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5815 mcnt, p + mcnt);
25fe55af 5816
505bde11 5817 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5818 break;
5819
4e8a9132 5820 /* This operation is used for greedy *.
505bde11
SM
5821 Compare the beginning of the repeat with what in the
5822 pattern follows its end. If we can establish that there
5823 is nothing that they would both match, i.e., that we
5824 would have to backtrack because of (as in, e.g., `a*a')
5825 then we can use a non-backtracking loop based on
4e8a9132 5826 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5827 case on_failure_jump_smart:
25fe55af 5828 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5829 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5830 mcnt, p + mcnt);
25fe55af 5831 {
01618498 5832 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5833 /* Here, we discard `const', making re_match non-reentrant. */
5834 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5835 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5836
505bde11
SM
5837 p -= 3; /* Reset so that we will re-execute the
5838 instruction once it's been changed. */
fa9a63c5 5839
4e8a9132
SM
5840 EXTRACT_NUMBER (mcnt, p2 - 2);
5841
5842 /* Ensure this is indeed the trivial kind of loop
5843 we are expecting. */
5844 assert (skip_one_char (p1) == p2 - 3);
5845 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5846 DEBUG_STATEMENT (debug += 2);
505bde11 5847 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5848 {
505bde11 5849 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5850 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5851 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5852 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5853 }
505bde11 5854 else
fa9a63c5 5855 {
505bde11
SM
5856 /* Default to a safe `on_failure_jump' loop. */
5857 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5858 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5859 }
99633e97 5860 DEBUG_STATEMENT (debug -= 2);
25fe55af 5861 }
505bde11 5862 break;
25fe55af
RS
5863
5864 /* Unconditionally jump (without popping any failure points). */
5865 case jump:
fa9a63c5 5866 unconditional_jump:
5b370c2b 5867 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5868 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5869 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5870 p += mcnt; /* Do the jump. */
505bde11 5871 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5872 break;
5873
5874
25fe55af
RS
5875 /* Have to succeed matching what follows at least n times.
5876 After that, handle like `on_failure_jump'. */
5877 case succeed_n:
01618498 5878 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5879 EXTRACT_NUMBER (mcnt, p + 2);
5880 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5881
dc1e502d
SM
5882 /* Originally, mcnt is how many times we HAVE to succeed. */
5883 if (mcnt != 0)
25fe55af 5884 {
6dcf2d0e
SM
5885 /* Here, we discard `const', making re_match non-reentrant. */
5886 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5887 mcnt--;
01618498
SM
5888 p += 4;
5889 PUSH_NUMBER (p2, mcnt);
25fe55af 5890 }
dc1e502d
SM
5891 else
5892 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5893 goto on_failure;
25fe55af
RS
5894 break;
5895
5896 case jump_n:
01618498 5897 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5898 EXTRACT_NUMBER (mcnt, p + 2);
5899 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5900
5901 /* Originally, this is how many times we CAN jump. */
dc1e502d 5902 if (mcnt != 0)
25fe55af 5903 {
6dcf2d0e
SM
5904 /* Here, we discard `const', making re_match non-reentrant. */
5905 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5906 mcnt--;
01618498 5907 PUSH_NUMBER (p2, mcnt);
dc1e502d 5908 goto unconditional_jump;
25fe55af
RS
5909 }
5910 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5911 else
5912 p += 4;
25fe55af 5913 break;
5e69f11e 5914
fa9a63c5
RM
5915 case set_number_at:
5916 {
01618498 5917 unsigned char *p2; /* Location of the counter. */
25fe55af 5918 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5919
25fe55af 5920 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5921 /* Here, we discard `const', making re_match non-reentrant. */
5922 p2 = (unsigned char*) p + mcnt;
01618498 5923 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5924 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5925 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5926 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5927 break;
5928 }
9121ca40
KH
5929
5930 case wordbound:
66f0296e 5931 case notwordbound:
19ed5445
PE
5932 {
5933 boolean not = (re_opcode_t) *(p - 1) == notwordbound;
5934 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5935
19ed5445 5936 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5937
19ed5445
PE
5938 /* Case 1: D is at the beginning or the end of string. */
5939 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5940 not = !not;
5941 else
5942 {
5943 /* C1 is the character before D, S1 is the syntax of C1, C2
5944 is the character at D, and S2 is the syntax of C2. */
5945 re_wchar_t c1, c2;
5946 int s1, s2;
5947 int dummy;
b18215fc 5948#ifdef emacs
d1dfb56c
EZ
5949 ssize_t offset = PTR_TO_OFFSET (d - 1);
5950 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445 5951 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5952#endif
19ed5445
PE
5953 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5954 s1 = SYNTAX (c1);
b18215fc 5955#ifdef emacs
19ed5445 5956 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5957#endif
19ed5445
PE
5958 PREFETCH_NOLIMIT ();
5959 GET_CHAR_AFTER (c2, d, dummy);
5960 s2 = SYNTAX (c2);
5961
5962 if (/* Case 2: Only one of S1 and S2 is Sword. */
5963 ((s1 == Sword) != (s2 == Sword))
5964 /* Case 3: Both of S1 and S2 are Sword, and macro
5965 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5966 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5967 not = !not;
5968 }
5969 if (not)
5970 break;
5971 else
5972 goto fail;
5973 }
fa9a63c5
RM
5974
5975 case wordbeg:
25fe55af 5976 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5977
b18215fc
RS
5978 /* We FAIL in one of the following cases: */
5979
7814e705 5980 /* Case 1: D is at the end of string. */
b18215fc 5981 if (AT_STRINGS_END (d))
99633e97 5982 goto fail;
b18215fc
RS
5983 else
5984 {
5985 /* C1 is the character before D, S1 is the syntax of C1, C2
5986 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5987 re_wchar_t c1, c2;
5988 int s1, s2;
bf216479 5989 int dummy;
fa9a63c5 5990#ifdef emacs
d1dfb56c
EZ
5991 ssize_t offset = PTR_TO_OFFSET (d);
5992 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5993 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5994#endif
99633e97 5995 PREFETCH ();
6fdd04b0 5996 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 5997 s2 = SYNTAX (c2);
177c0ea7 5998
b18215fc
RS
5999 /* Case 2: S2 is not Sword. */
6000 if (s2 != Sword)
6001 goto fail;
6002
6003 /* Case 3: D is not at the beginning of string ... */
6004 if (!AT_STRINGS_BEG (d))
6005 {
6006 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6007#ifdef emacs
5d967c7a 6008 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6009#endif
b18215fc
RS
6010 s1 = SYNTAX (c1);
6011
6012 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6013 returns 0. */
b18215fc
RS
6014 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6015 goto fail;
6016 }
6017 }
e318085a
RS
6018 break;
6019
b18215fc 6020 case wordend:
25fe55af 6021 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6022
6023 /* We FAIL in one of the following cases: */
6024
6025 /* Case 1: D is at the beginning of string. */
6026 if (AT_STRINGS_BEG (d))
e318085a 6027 goto fail;
b18215fc
RS
6028 else
6029 {
6030 /* C1 is the character before D, S1 is the syntax of C1, C2
6031 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6032 re_wchar_t c1, c2;
6033 int s1, s2;
bf216479 6034 int dummy;
5d967c7a 6035#ifdef emacs
d1dfb56c
EZ
6036 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6037 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6038 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6039#endif
99633e97 6040 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6041 s1 = SYNTAX (c1);
6042
6043 /* Case 2: S1 is not Sword. */
6044 if (s1 != Sword)
6045 goto fail;
6046
6047 /* Case 3: D is not at the end of string ... */
6048 if (!AT_STRINGS_END (d))
6049 {
f1ad044f 6050 PREFETCH_NOLIMIT ();
6fdd04b0 6051 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6052#ifdef emacs
6053 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6054#endif
b18215fc
RS
6055 s2 = SYNTAX (c2);
6056
6057 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6058 returns 0. */
b18215fc 6059 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6060 goto fail;
b18215fc
RS
6061 }
6062 }
e318085a
RS
6063 break;
6064
669fa600
SM
6065 case symbeg:
6066 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6067
6068 /* We FAIL in one of the following cases: */
6069
7814e705 6070 /* Case 1: D is at the end of string. */
669fa600
SM
6071 if (AT_STRINGS_END (d))
6072 goto fail;
6073 else
6074 {
6075 /* C1 is the character before D, S1 is the syntax of C1, C2
6076 is the character at D, and S2 is the syntax of C2. */
6077 re_wchar_t c1, c2;
6078 int s1, s2;
6079#ifdef emacs
d1dfb56c
EZ
6080 ssize_t offset = PTR_TO_OFFSET (d);
6081 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6082 UPDATE_SYNTAX_TABLE (charpos);
6083#endif
6084 PREFETCH ();
62a6e103 6085 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6086 s2 = SYNTAX (c2);
7814e705 6087
669fa600
SM
6088 /* Case 2: S2 is neither Sword nor Ssymbol. */
6089 if (s2 != Sword && s2 != Ssymbol)
6090 goto fail;
6091
6092 /* Case 3: D is not at the beginning of string ... */
6093 if (!AT_STRINGS_BEG (d))
6094 {
6095 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6096#ifdef emacs
6097 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6098#endif
6099 s1 = SYNTAX (c1);
6100
6101 /* ... and S1 is Sword or Ssymbol. */
6102 if (s1 == Sword || s1 == Ssymbol)
6103 goto fail;
6104 }
6105 }
6106 break;
6107
6108 case symend:
6109 DEBUG_PRINT1 ("EXECUTING symend.\n");
6110
6111 /* We FAIL in one of the following cases: */
6112
6113 /* Case 1: D is at the beginning of string. */
6114 if (AT_STRINGS_BEG (d))
6115 goto fail;
6116 else
6117 {
6118 /* C1 is the character before D, S1 is the syntax of C1, C2
6119 is the character at D, and S2 is the syntax of C2. */
6120 re_wchar_t c1, c2;
6121 int s1, s2;
6122#ifdef emacs
d1dfb56c
EZ
6123 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6124 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6125 UPDATE_SYNTAX_TABLE (charpos);
6126#endif
6127 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6128 s1 = SYNTAX (c1);
6129
6130 /* Case 2: S1 is neither Ssymbol nor Sword. */
6131 if (s1 != Sword && s1 != Ssymbol)
6132 goto fail;
6133
6134 /* Case 3: D is not at the end of string ... */
6135 if (!AT_STRINGS_END (d))
6136 {
6137 PREFETCH_NOLIMIT ();
62a6e103 6138 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6139#ifdef emacs
134579f2 6140 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6141#endif
6142 s2 = SYNTAX (c2);
6143
6144 /* ... and S2 is Sword or Ssymbol. */
6145 if (s2 == Sword || s2 == Ssymbol)
6146 goto fail;
b18215fc
RS
6147 }
6148 }
e318085a
RS
6149 break;
6150
fa9a63c5 6151 case syntaxspec:
1fb352e0 6152 case notsyntaxspec:
b18215fc 6153 {
19ed5445
PE
6154 boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6155 mcnt = *p++;
6156 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
6157 PREFETCH ();
6158#ifdef emacs
6159 {
d1dfb56c
EZ
6160 ssize_t offset = PTR_TO_OFFSET (d);
6161 ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445
PE
6162 UPDATE_SYNTAX_TABLE (pos1);
6163 }
25fe55af 6164#endif
19ed5445
PE
6165 {
6166 int len;
6167 re_wchar_t c;
b18215fc 6168
19ed5445
PE
6169 GET_CHAR_AFTER (c, d, len);
6170 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6171 goto fail;
6172 d += len;
6173 }
b18215fc 6174 }
8fb31792 6175 break;
fa9a63c5 6176
b18215fc 6177#ifdef emacs
1fb352e0
SM
6178 case before_dot:
6179 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6180 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6181 goto fail;
b18215fc
RS
6182 break;
6183
1fb352e0
SM
6184 case at_dot:
6185 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6186 if (PTR_BYTE_POS (d) != PT_BYTE)
6187 goto fail;
6188 break;
b18215fc 6189
1fb352e0
SM
6190 case after_dot:
6191 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6192 if (PTR_BYTE_POS (d) <= PT_BYTE)
6193 goto fail;
e318085a 6194 break;
fa9a63c5 6195
1fb352e0 6196 case categoryspec:
b18215fc 6197 case notcategoryspec:
b18215fc 6198 {
8fb31792
PE
6199 boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6200 mcnt = *p++;
6201 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n",
6202 not?"not":"", mcnt);
6203 PREFETCH ();
01618498 6204
8fb31792
PE
6205 {
6206 int len;
6207 re_wchar_t c;
6208 GET_CHAR_AFTER (c, d, len);
6209 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6210 goto fail;
6211 d += len;
6212 }
b18215fc 6213 }
fa9a63c5 6214 break;
5e69f11e 6215
1fb352e0 6216#endif /* emacs */
5e69f11e 6217
0b32bf0e
SM
6218 default:
6219 abort ();
fa9a63c5 6220 }
b18215fc 6221 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6222
6223
6224 /* We goto here if a matching operation fails. */
6225 fail:
5b370c2b 6226 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6227 if (!FAIL_STACK_EMPTY ())
505bde11 6228 {
01618498 6229 re_char *str, *pat;
505bde11 6230 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6231 DEBUG_PRINT1 ("\nFAIL:\n");
6232 POP_FAILURE_POINT (str, pat);
7393bcbb 6233 switch (*pat++)
505bde11
SM
6234 {
6235 case on_failure_keep_string_jump:
6236 assert (str == NULL);
6237 goto continue_failure_jump;
6238
0683b6fa
SM
6239 case on_failure_jump_nastyloop:
6240 assert ((re_opcode_t)pat[-2] == no_op);
6241 PUSH_FAILURE_POINT (pat - 2, str);
6242 /* Fallthrough */
6243
505bde11
SM
6244 case on_failure_jump_loop:
6245 case on_failure_jump:
6246 case succeed_n:
6247 d = str;
6248 continue_failure_jump:
6249 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6250 p = pat + mcnt;
6251 break;
b18215fc 6252
0683b6fa
SM
6253 case no_op:
6254 /* A special frame used for nastyloops. */
6255 goto fail;
6256
505bde11 6257 default:
5e617bc2 6258 abort ();
505bde11 6259 }
fa9a63c5 6260
505bde11 6261 assert (p >= bufp->buffer && p <= pend);
b18215fc 6262
0b32bf0e 6263 if (d >= string1 && d <= end1)
fa9a63c5 6264 dend = end_match_1;
0b32bf0e 6265 }
fa9a63c5 6266 else
0b32bf0e 6267 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6268 } /* for (;;) */
6269
6270 if (best_regs_set)
6271 goto restore_best_regs;
6272
6273 FREE_VARIABLES ();
6274
b18215fc 6275 return -1; /* Failure to match. */
fa9a63c5
RM
6276} /* re_match_2 */
6277\f
6278/* Subroutine definitions for re_match_2. */
6279
fa9a63c5
RM
6280/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6281 bytes; nonzero otherwise. */
5e69f11e 6282
fa9a63c5 6283static int
d1dfb56c 6284bcmp_translate (const re_char *s1, const re_char *s2, register ssize_t len,
438105ed 6285 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6286{
2d1675e4
SM
6287 register re_char *p1 = s1, *p2 = s2;
6288 re_char *p1_end = s1 + len;
6289 re_char *p2_end = s2 + len;
e934739e 6290
4bb91c68
SM
6291 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6292 different lengths, but relying on a single `len' would break this. -sm */
6293 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6294 {
e934739e 6295 int p1_charlen, p2_charlen;
01618498 6296 re_wchar_t p1_ch, p2_ch;
e934739e 6297
6fdd04b0
KH
6298 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6299 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6300
6301 if (RE_TRANSLATE (translate, p1_ch)
6302 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6303 return 1;
e934739e
RS
6304
6305 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6306 }
e934739e
RS
6307
6308 if (p1 != p1_end || p2 != p2_end)
6309 return 1;
6310
fa9a63c5
RM
6311 return 0;
6312}
6313\f
6314/* Entry points for GNU code. */
6315
6316/* re_compile_pattern is the GNU regular expression compiler: it
6317 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6318 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6319
fa9a63c5
RM
6320 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6321 are set in BUFP on entry.
5e69f11e 6322
b18215fc 6323 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6324
6325const char *
d1dfb56c
EZ
6326re_compile_pattern (const char *pattern, size_t length,
6327 struct re_pattern_buffer *bufp)
fa9a63c5
RM
6328{
6329 reg_errcode_t ret;
5e69f11e 6330
fa9a63c5
RM
6331 /* GNU code is written to assume at least RE_NREGS registers will be set
6332 (and at least one extra will be -1). */
6333 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6334
fa9a63c5
RM
6335 /* And GNU code determines whether or not to get register information
6336 by passing null for the REGS argument to re_match, etc., not by
6337 setting no_sub. */
6338 bufp->no_sub = 0;
5e69f11e 6339
4bb91c68 6340 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6341
6342 if (!ret)
6343 return NULL;
6344 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6345}
c0f9ea08 6346WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6347\f
b18215fc
RS
6348/* Entry points compatible with 4.2 BSD regex library. We don't define
6349 them unless specifically requested. */
fa9a63c5 6350
0b32bf0e 6351#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6352
6353/* BSD has one and only one pattern buffer. */
6354static struct re_pattern_buffer re_comp_buf;
6355
6356char *
0b32bf0e 6357# ifdef _LIBC
48afdd44
RM
6358/* Make these definitions weak in libc, so POSIX programs can redefine
6359 these names if they don't use our functions, and still use
6360 regcomp/regexec below without link errors. */
6361weak_function
0b32bf0e 6362# endif
31011111 6363re_comp (const char *s)
fa9a63c5
RM
6364{
6365 reg_errcode_t ret;
5e69f11e 6366
fa9a63c5
RM
6367 if (!s)
6368 {
6369 if (!re_comp_buf.buffer)
0b32bf0e 6370 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6371 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6372 return 0;
6373 }
6374
6375 if (!re_comp_buf.buffer)
6376 {
38182d90 6377 re_comp_buf.buffer = malloc (200);
fa9a63c5 6378 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6379 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6380 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6381 re_comp_buf.allocated = 200;
6382
38182d90 6383 re_comp_buf.fastmap = malloc (1 << BYTEWIDTH);
fa9a63c5 6384 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6385 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6386 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6387 }
6388
6389 /* Since `re_exec' always passes NULL for the `regs' argument, we
6390 don't need to initialize the pattern buffer fields which affect it. */
6391
fa9a63c5 6392 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6393
fa9a63c5
RM
6394 if (!ret)
6395 return NULL;
6396
6397 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6398 return (char *) gettext (re_error_msgid[(int) ret]);
6399}
6400
6401
31011111 6402int
0b32bf0e 6403# ifdef _LIBC
48afdd44 6404weak_function
0b32bf0e 6405# endif
d1dfb56c 6406re_exec (const char *s)
fa9a63c5 6407{
d1dfb56c 6408 const size_t len = strlen (s);
fa9a63c5
RM
6409 return
6410 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6411}
6412#endif /* _REGEX_RE_COMP */
6413\f
6414/* POSIX.2 functions. Don't define these for Emacs. */
6415
6416#ifndef emacs
6417
6418/* regcomp takes a regular expression as a string and compiles it.
6419
b18215fc 6420 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6421 since POSIX says we shouldn't. Thus, we set
6422
6423 `buffer' to the compiled pattern;
6424 `used' to the length of the compiled pattern;
6425 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6426 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6427 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6428 `fastmap' to an allocated space for the fastmap;
6429 `fastmap_accurate' to zero;
fa9a63c5
RM
6430 `re_nsub' to the number of subexpressions in PATTERN.
6431
6432 PATTERN is the address of the pattern string.
6433
6434 CFLAGS is a series of bits which affect compilation.
6435
6436 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6437 use POSIX basic syntax.
6438
6439 If REG_NEWLINE is set, then . and [^...] don't match newline.
6440 Also, regexec will try a match beginning after every newline.
6441
6442 If REG_ICASE is set, then we consider upper- and lowercase
6443 versions of letters to be equivalent when matching.
6444
6445 If REG_NOSUB is set, then when PREG is passed to regexec, that
6446 routine will report only success or failure, and nothing about the
6447 registers.
6448
b18215fc 6449 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6450 the return codes and their meanings.) */
6451
d1dfb56c 6452reg_errcode_t
d2762c86
DN
6453regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6454 int cflags)
fa9a63c5
RM
6455{
6456 reg_errcode_t ret;
4bb91c68 6457 reg_syntax_t syntax
fa9a63c5
RM
6458 = (cflags & REG_EXTENDED) ?
6459 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6460
6461 /* regex_compile will allocate the space for the compiled pattern. */
6462 preg->buffer = 0;
6463 preg->allocated = 0;
6464 preg->used = 0;
5e69f11e 6465
c0f9ea08 6466 /* Try to allocate space for the fastmap. */
38182d90 6467 preg->fastmap = malloc (1 << BYTEWIDTH);
5e69f11e 6468
fa9a63c5
RM
6469 if (cflags & REG_ICASE)
6470 {
6471 unsigned i;
5e69f11e 6472
38182d90 6473 preg->translate = malloc (CHAR_SET_SIZE * sizeof *preg->translate);
fa9a63c5 6474 if (preg->translate == NULL)
0b32bf0e 6475 return (int) REG_ESPACE;
fa9a63c5
RM
6476
6477 /* Map uppercase characters to corresponding lowercase ones. */
6478 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6479 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6480 }
6481 else
6482 preg->translate = NULL;
6483
6484 /* If REG_NEWLINE is set, newlines are treated differently. */
6485 if (cflags & REG_NEWLINE)
6486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6487 syntax &= ~RE_DOT_NEWLINE;
6488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6489 }
6490 else
c0f9ea08 6491 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6492
6493 preg->no_sub = !!(cflags & REG_NOSUB);
6494
5e69f11e 6495 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6496 can use strlen here in compiling the pattern. */
4bb91c68 6497 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6498
fa9a63c5
RM
6499 /* POSIX doesn't distinguish between an unmatched open-group and an
6500 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6501 if (ret == REG_ERPAREN)
6502 ret = REG_EPAREN;
6503
6504 if (ret == REG_NOERROR && preg->fastmap)
6505 { /* Compute the fastmap now, since regexec cannot modify the pattern
6506 buffer. */
6507 re_compile_fastmap (preg);
6508 if (preg->can_be_null)
6509 { /* The fastmap can't be used anyway. */
6510 free (preg->fastmap);
6511 preg->fastmap = NULL;
6512 }
6513 }
d1dfb56c 6514 return ret;
fa9a63c5 6515}
c0f9ea08 6516WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6517
6518
6519/* regexec searches for a given pattern, specified by PREG, in the
6520 string STRING.
5e69f11e 6521
fa9a63c5 6522 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6523 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6524 least NMATCH elements, and we set them to the offsets of the
6525 corresponding matched substrings.
5e69f11e 6526
fa9a63c5
RM
6527 EFLAGS specifies `execution flags' which affect matching: if
6528 REG_NOTBOL is set, then ^ does not match at the beginning of the
6529 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6530
fa9a63c5
RM
6531 We return 0 if we find a match and REG_NOMATCH if not. */
6532
d1dfb56c 6533reg_errcode_t
d2762c86
DN
6534regexec (const regex_t *__restrict preg, const char *__restrict string,
6535 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
fa9a63c5 6536{
31011111 6537 regoff_t ret;
fa9a63c5
RM
6538 struct re_registers regs;
6539 regex_t private_preg;
d1dfb56c 6540 size_t len = strlen (string);
c0f9ea08 6541 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6542
6543 private_preg = *preg;
5e69f11e 6544
fa9a63c5
RM
6545 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6546 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6547
fa9a63c5
RM
6548 /* The user has told us exactly how many registers to return
6549 information about, via `nmatch'. We have to pass that on to the
b18215fc 6550 matching routines. */
fa9a63c5 6551 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6552
fa9a63c5
RM
6553 if (want_reg_info)
6554 {
6555 regs.num_regs = nmatch;
4bb91c68
SM
6556 regs.start = TALLOC (nmatch * 2, regoff_t);
6557 if (regs.start == NULL)
d1dfb56c 6558 return REG_NOMATCH;
4bb91c68 6559 regs.end = regs.start + nmatch;
fa9a63c5
RM
6560 }
6561
c0f9ea08
SM
6562 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6563 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6564 was a little bit longer but still only matching the real part.
6565 This works because the `endline' will check for a '\n' and will find a
6566 '\0', correctly deciding that this is not the end of a line.
6567 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6568 a convenient '\0' there. For all we know, the string could be preceded
6569 by '\n' which would throw things off. */
6570
fa9a63c5
RM
6571 /* Perform the searching operation. */
6572 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6573 /* start: */ 0, /* range: */ len,
6574 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6575
fa9a63c5
RM
6576 /* Copy the register information to the POSIX structure. */
6577 if (want_reg_info)
6578 {
6579 if (ret >= 0)
0b32bf0e
SM
6580 {
6581 unsigned r;
fa9a63c5 6582
0b32bf0e
SM
6583 for (r = 0; r < nmatch; r++)
6584 {
6585 pmatch[r].rm_so = regs.start[r];
6586 pmatch[r].rm_eo = regs.end[r];
6587 }
6588 }
fa9a63c5 6589
b18215fc 6590 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6591 free (regs.start);
fa9a63c5
RM
6592 }
6593
6594 /* We want zero return to mean success, unlike `re_search'. */
d1dfb56c 6595 return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
fa9a63c5 6596}
c0f9ea08 6597WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6598
6599
ec869672
JR
6600/* Returns a message corresponding to an error code, ERR_CODE, returned
6601 from either regcomp or regexec. We don't use PREG here.
6602
6603 ERR_CODE was previously called ERRCODE, but that name causes an
6604 error with msvc8 compiler. */
fa9a63c5
RM
6605
6606size_t
d2762c86 6607regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6608{
6609 const char *msg;
6610 size_t msg_size;
6611
ec869672
JR
6612 if (err_code < 0
6613 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6614 /* Only error codes returned by the rest of the code should be passed
b18215fc 6615 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6616 code generates an invalid error code, then the program has a bug.
6617 Dump core so we can fix it. */
6618 abort ();
6619
ec869672 6620 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6621
6622 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6623
fa9a63c5
RM
6624 if (errbuf_size != 0)
6625 {
6626 if (msg_size > errbuf_size)
0b32bf0e 6627 {
e99a530f 6628 memcpy (errbuf, msg, errbuf_size - 1);
0b32bf0e
SM
6629 errbuf[errbuf_size - 1] = 0;
6630 }
fa9a63c5 6631 else
0b32bf0e 6632 strcpy (errbuf, msg);
fa9a63c5
RM
6633 }
6634
6635 return msg_size;
6636}
c0f9ea08 6637WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6638
6639
6640/* Free dynamically allocated space used by PREG. */
6641
6642void
d2762c86 6643regfree (regex_t *preg)
fa9a63c5 6644{
c2cd06e6 6645 free (preg->buffer);
fa9a63c5 6646 preg->buffer = NULL;
5e69f11e 6647
fa9a63c5
RM
6648 preg->allocated = 0;
6649 preg->used = 0;
6650
c2cd06e6 6651 free (preg->fastmap);
fa9a63c5
RM
6652 preg->fastmap = NULL;
6653 preg->fastmap_accurate = 0;
6654
c2cd06e6 6655 free (preg->translate);
fa9a63c5
RM
6656 preg->translate = NULL;
6657}
c0f9ea08 6658WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6659
6660#endif /* not emacs */