Assume C89 or later.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
acaf905b 5 Copyright (C) 1993-2012 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
e468b87f 9 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
4fc5845f 19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 20 USA. */
fa9a63c5 21
6df42991 22/* TODO:
505bde11 23 - structure the opcode space into opcode+flag.
dc1e502d 24 - merge with glibc's regex.[ch].
01618498 25 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
26 need to modify the compiled regexp so that re_match can be reentrant.
27 - get rid of on_failure_jump_smart by doing the optimization in re_comp
28 rather than at run-time, so that re_match can be reentrant.
01618498 29*/
505bde11 30
fa9a63c5 31/* AIX requires this to be the first thing in the file. */
0b32bf0e 32#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
33 #pragma alloca
34#endif
35
b8df54ff
PE
36/* Ignore some GCC warnings for now. This section should go away
37 once the Emacs and Gnulib regex code is merged. */
38#if (__GNUC__ == 4 && 3 <= __GNUC_MINOR__) || 4 < __GNUC__
39# pragma GCC diagnostic ignored "-Wstrict-overflow"
40# ifndef emacs
41# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
42# pragma GCC diagnostic ignored "-Wunused-function"
43# pragma GCC diagnostic ignored "-Wunused-macros"
44# pragma GCC diagnostic ignored "-Wunused-result"
45# pragma GCC diagnostic ignored "-Wunused-variable"
46# endif
47#endif
48
fa9a63c5 49#ifdef HAVE_CONFIG_H
0b32bf0e 50# include <config.h>
fa9a63c5
RM
51#endif
52
0e926e56
PE
53#include <stddef.h>
54
55#ifdef emacs
4bb91c68
SM
56/* We need this for `regex.h', and perhaps for the Emacs include files. */
57# include <sys/types.h>
58#endif
fa9a63c5 59
14473664
SM
60/* Whether to use ISO C Amendment 1 wide char functions.
61 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
62#if defined _LIBC
63#define WIDE_CHAR_SUPPORT 1
64#else
14473664 65#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
66 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
67#endif
14473664 68
/* For platforms which support the ISO C amendment 1 functionality we
   support user defined character classes.  */
a0ad02f7 71#if WIDE_CHAR_SUPPORT
14473664
SM
72/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
73# include <wchar.h>
74# include <wctype.h>
75#endif
76
c0f9ea08
SM
77#ifdef _LIBC
78/* We have to keep the namespace clean. */
79# define regfree(preg) __regfree (preg)
80# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
81# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672 82# define regerror(err_code, preg, errbuf, errbuf_size) \
5e617bc2 83 __regerror (err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
84# define re_set_registers(bu, re, nu, st, en) \
85 __re_set_registers (bu, re, nu, st, en)
86# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
87 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
88# define re_match(bufp, string, size, pos, regs) \
89 __re_match (bufp, string, size, pos, regs)
90# define re_search(bufp, string, size, startpos, range, regs) \
91 __re_search (bufp, string, size, startpos, range, regs)
92# define re_compile_pattern(pattern, length, bufp) \
93 __re_compile_pattern (pattern, length, bufp)
94# define re_set_syntax(syntax) __re_set_syntax (syntax)
95# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
96 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
97# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
98
14473664
SM
99/* Make sure we call libc's function even if the user overrides them. */
100# define btowc __btowc
101# define iswctype __iswctype
102# define wctype __wctype
103
c0f9ea08
SM
104# define WEAK_ALIAS(a,b) weak_alias (a, b)
105
106/* We are also using some library internals. */
107# include <locale/localeinfo.h>
108# include <locale/elem-hash.h>
109# include <langinfo.h>
110#else
111# define WEAK_ALIAS(a,b)
112#endif
113
4bb91c68 114/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 115#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
116# include <libintl.h>
117#else
118# define gettext(msgid) (msgid)
119#endif
120
5e69f11e
RM
121#ifndef gettext_noop
122/* This define is so xgettext can find the internationalizable
123 strings. */
0b32bf0e 124# define gettext_noop(String) String
5e69f11e
RM
125#endif
126
fa9a63c5
RM
127/* The `emacs' switch turns on certain matching commands
128 that make sense only in Emacs. */
129#ifdef emacs
130
d7306fe6 131# include <setjmp.h>
0b32bf0e
SM
132# include "lisp.h"
133# include "buffer.h"
b18215fc
RS
134
135/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 136# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 137
0b32bf0e 138# include "syntax.h"
9117d724 139# include "character.h"
0b32bf0e 140# include "category.h"
fa9a63c5 141
7689ef0b
EZ
142# ifdef malloc
143# undef malloc
144# endif
0b32bf0e 145# define malloc xmalloc
7689ef0b
EZ
146# ifdef realloc
147# undef realloc
148# endif
0b32bf0e 149# define realloc xrealloc
7689ef0b
EZ
150# ifdef free
151# undef free
152# endif
0b32bf0e 153# define free xfree
9abbd165 154
7814e705 155/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
156# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
157# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
158
159# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 160# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
161# define RE_STRING_CHAR(p, multibyte) \
162 (multibyte ? (STRING_CHAR (p)) : (*(p)))
163# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
164 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 165
4c0354d7 166# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 167
2afc21f5 168# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 169
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) and STR2 (which ends at END2).  */
bf216479
KH
173# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
174 do { \
02cb78b5 175 if (target_multibyte) \
bf216479
KH
176 { \
177 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
178 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
179 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 180 c = STRING_CHAR (dtemp); \
bf216479
KH
181 } \
182 else \
183 { \
184 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 185 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 186 } \
2d1675e4
SM
187 } while (0)
188
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
/* Every use of the P and LEN arguments is parenthesized so that an
   expression argument such as `d + 1' is dereferenced as a whole.  */
# define GET_CHAR_AFTER(c, p, len)		\
  do {						\
    if (target_multibyte)			\
      (c) = STRING_CHAR_AND_LENGTH (p, len);	\
    else					\
      {						\
	(c) = *(p);				\
	(len) = 1;				\
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
      }						\
  } while (0)
4e8a9132 202
fa9a63c5
RM
203#else /* not emacs */
204
205/* If we are not linking with Emacs proper,
206 we can't use the relocating allocator
207 even if config.h says that we can. */
0b32bf0e 208# undef REL_ALLOC
fa9a63c5 209
4004364e 210# include <unistd.h>
fa9a63c5 211
a77f947b
CY
212/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
213
/* Allocate SIZE bytes with malloc and return the result.  If malloc
   returns a null pointer for a nonzero SIZE, write "virtual memory
   exhausted" to file descriptor 2 and exit with status 1.  */
static void *
xmalloc (size_t size)
{
  void *mem = malloc (size);

  /* A null result only counts as failure when bytes were requested.  */
  if (mem == NULL && size != 0)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }

  return mem;
}
226
/* Resize BLOCK to SIZE bytes and return the new block.  A null BLOCK is
   handled by calling malloc, since some reallocs don't accept it.  If
   the allocator returns a null pointer for a nonzero SIZE, write
   "virtual memory exhausted" to file descriptor 2 and exit with
   status 1.  */
static void *
xrealloc (void *block, size_t size)
{
  void *mem = block ? realloc (block, size) : malloc (size);

  if (mem == NULL && size != 0)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }

  return mem;
}
244
a073faa6
CY
245# ifdef malloc
246# undef malloc
247# endif
248# define malloc xmalloc
249# ifdef realloc
250# undef realloc
251# endif
252# define realloc xrealloc
253
9cfdb3ec 254# include <string.h>
fa9a63c5
RM
255
256/* Define the syntax stuff for \<, \>, etc. */
257
990b2375 258/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 259enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 260
0b32bf0e 261# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 262
e934739e 263/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
264# define CHAR_CHARSET(c) 0
265# define CHARSET_LEADING_CODE_BASE(c) 0
266# define MAX_MULTIBYTE_LENGTH 1
267# define RE_MULTIBYTE_P(x) 0
bf216479 268# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
269# define WORD_BOUNDARY_P(c1, c2) (0)
270# define CHAR_HEAD_P(p) (1)
271# define SINGLE_BYTE_CHAR_P(c) (1)
272# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 273# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 274# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
275# define STRING_CHAR(p) (*(p))
276# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 277# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
278# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
279# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
280# define RE_CHAR_TO_MULTIBYTE(c) (c)
281# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 282# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 283 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
/* Arguments are parenthesized so that an expression argument such as
   `s + 1' is dereferenced as a whole rather than as `*s + 1'.  */
# define GET_CHAR_AFTER(c, p, len)	\
  ((c) = *(p), (len) = 1)
0b32bf0e 286# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
287# define BYTE8_TO_CHAR(c) (c)
288# define CHAR_BYTE8_P(c) (0)
bf216479 289# define CHAR_LEADING_CODE(c) (c)
8f924df7 290
fa9a63c5 291#endif /* not emacs */
4e8a9132
SM
292
293#ifndef RE_TRANSLATE
0b32bf0e
SM
294# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
295# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 296#endif
fa9a63c5
RM
297\f
298/* Get the interface, including the syntax bits. */
299#include "regex.h"
300
f71b19b6
DL
301/* isalpha etc. are used for the character classes. */
302#include <ctype.h>
fa9a63c5 303
f71b19b6 304#ifdef emacs
fa9a63c5 305
f71b19b6 306/* 1 if C is an ASCII character. */
0b32bf0e 307# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 308
f71b19b6 309/* 1 if C is a unibyte character. */
0b32bf0e 310# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 311
f71b19b6 312/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 313
f71b19b6 314/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
315# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
316# define ISCNTRL(c) ((c) < ' ')
317# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
318 || ((c) >= 'a' && (c) <= 'f') \
319 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
320
321/* This is only used for single-byte characters. */
0b32bf0e 322# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
323
324/* The rest must handle multibyte characters. */
325
0b32bf0e 326# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 327 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
328 : 1)
329
14473664 330# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 331 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
332 : 1)
333
0b32bf0e 334# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
335 ? (((c) >= 'a' && (c) <= 'z') \
336 || ((c) >= 'A' && (c) <= 'Z') \
337 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
338 : SYNTAX (c) == Sword)
339
0b32bf0e 340# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
341 ? (((c) >= 'a' && (c) <= 'z') \
342 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
343 : SYNTAX (c) == Sword)
344
5da9919f 345# define ISLOWER(c) lowercasep (c)
96cc36cc 346
0b32bf0e 347# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
348 ? ((c) > ' ' && (c) < 0177 \
349 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
350 || ((c) >= 'A' && (c) <= 'Z') \
351 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
352 : SYNTAX (c) != Sword)
353
0b32bf0e 354# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 355
5da9919f 356# define ISUPPER(c) uppercasep (c)
96cc36cc 357
0b32bf0e 358# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
359
360#else /* not emacs */
361
f71b19b6 362/* 1 if C is an ASCII character. */
0b32bf0e 363# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
364
365/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
366# define ISUNIBYTE(c) 1
367
368# ifdef isblank
0e926e56 369# define ISBLANK(c) isblank (c)
0b32bf0e
SM
370# else
371# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
372# endif
373# ifdef isgraph
0e926e56 374# define ISGRAPH(c) isgraph (c)
0b32bf0e 375# else
0e926e56 376# define ISGRAPH(c) (isprint (c) && !isspace (c))
0b32bf0e
SM
377# endif
378
0e926e56 379/* Solaris defines ISPRINT so we must undefine it first. */
4bb91c68 380# undef ISPRINT
0e926e56
PE
381# define ISPRINT(c) isprint (c)
382# define ISDIGIT(c) isdigit (c)
383# define ISALNUM(c) isalnum (c)
384# define ISALPHA(c) isalpha (c)
385# define ISCNTRL(c) iscntrl (c)
386# define ISLOWER(c) islower (c)
387# define ISPUNCT(c) ispunct (c)
388# define ISSPACE(c) isspace (c)
389# define ISUPPER(c) isupper (c)
390# define ISXDIGIT(c) isxdigit (c)
0b32bf0e 391
5e617bc2 392# define ISWORD(c) ISALPHA (c)
0b32bf0e 393
4bb91c68 394# ifdef _tolower
5e617bc2 395# define TOLOWER(c) _tolower (c)
4bb91c68 396# else
5e617bc2 397# define TOLOWER(c) tolower (c)
4bb91c68
SM
398# endif
399
400/* How many characters in the character set. */
401# define CHAR_SET_SIZE 256
402
0b32bf0e 403# ifdef SYNTAX_TABLE
f71b19b6 404
0b32bf0e 405extern char *re_syntax_table;
f71b19b6 406
0b32bf0e
SM
407# else /* not SYNTAX_TABLE */
408
0b32bf0e
SM
409static char re_syntax_table[CHAR_SET_SIZE];
410
411static void
d2762c86 412init_syntax_once (void)
0b32bf0e
SM
413{
414 register int c;
415 static int done = 0;
416
417 if (done)
418 return;
419
72af86bd 420 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 421
4bb91c68
SM
422 for (c = 0; c < CHAR_SET_SIZE; ++c)
423 if (ISALNUM (c))
424 re_syntax_table[c] = Sword;
fa9a63c5 425
669fa600 426 re_syntax_table['_'] = Ssymbol;
fa9a63c5 427
0b32bf0e
SM
428 done = 1;
429}
430
431# endif /* not SYNTAX_TABLE */
96cc36cc 432
4bb91c68
SM
433# define SYNTAX(c) re_syntax_table[(c)]
434
96cc36cc
RS
435#endif /* not emacs */
436\f
261cb4bb 437#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
438\f
439/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
440 use `alloca' instead of `malloc'. This is because using malloc in
441 re_search* or re_match* could cause memory leaks when C-g is used in
442 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
443 the other hand, malloc is more portable, and easier to debug.
444
fa9a63c5
RM
445 Because we sometimes use alloca, some routines have to be macros,
446 not functions -- `alloca'-allocated space disappears at the end of the
447 function it is called in. */
448
449#ifdef REGEX_MALLOC
450
0b32bf0e
SM
451# define REGEX_ALLOCATE malloc
452# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
453# define REGEX_FREE free
fa9a63c5
RM
454
455#else /* not REGEX_MALLOC */
456
457/* Emacs already defines alloca, sometimes. */
0b32bf0e 458# ifndef alloca
fa9a63c5
RM
459
460/* Make alloca work the best possible way. */
0b32bf0e
SM
461# ifdef __GNUC__
462# define alloca __builtin_alloca
463# else /* not __GNUC__ */
7f585e7a 464# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
465# include <alloca.h>
466# endif /* HAVE_ALLOCA_H */
467# endif /* not __GNUC__ */
fa9a63c5 468
0b32bf0e 469# endif /* not alloca */
fa9a63c5 470
0b32bf0e 471# define REGEX_ALLOCATE alloca
fa9a63c5
RM
472
473/* Assumes a `char *destination' variable. */
0b32bf0e 474# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 475 (destination = (char *) alloca (nsize), \
4bb91c68 476 memcpy (destination, source, osize))
fa9a63c5
RM
477
478/* No need to do anything to free, after alloca. */
0b32bf0e 479# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
480
481#endif /* not REGEX_MALLOC */
482
483/* Define how to allocate the failure stack. */
484
0b32bf0e 485#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 486
0b32bf0e 487# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 488 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 489# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 490 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 491# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
492 r_alloc_free (&failure_stack_ptr)
493
4297555e 494#else /* not using relocating allocator */
fa9a63c5 495
0b32bf0e 496# ifdef REGEX_MALLOC
fa9a63c5 497
0b32bf0e
SM
498# define REGEX_ALLOCATE_STACK malloc
499# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
500# define REGEX_FREE_STACK free
fa9a63c5 501
0b32bf0e 502# else /* not REGEX_MALLOC */
fa9a63c5 503
0b32bf0e 504# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 505
0b32bf0e 506# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 507 REGEX_REALLOCATE (source, osize, nsize)
7814e705 508/* No need to explicitly free anything. */
0b32bf0e 509# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 510
0b32bf0e 511# endif /* not REGEX_MALLOC */
4297555e 512#endif /* not using relocating allocator */
fa9a63c5
RM
513
514
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
25fe55af 518#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
519 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
520
521/* (Re)Allocate N items of type T using malloc, or fail. */
522#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
523#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
fa9a63c5
RM
524#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
525
4bb91c68 526#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
527
528#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
529
530#undef MAX
531#undef MIN
532#define MAX(a, b) ((a) > (b) ? (a) : (b))
533#define MIN(a, b) ((a) < (b) ? (a) : (b))
534
66f0296e 535/* Type of source-pattern and string chars. */
a6fc3b5c
EZ
536#ifdef _MSC_VER
537typedef unsigned char re_char;
538#else
66f0296e 539typedef const unsigned char re_char;
a6fc3b5c 540#endif
66f0296e 541
fa9a63c5
RM
542typedef char boolean;
543#define false 0
544#define true 1
545
261cb4bb
PE
546static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
547 re_char *string1, size_t size1,
548 re_char *string2, size_t size2,
549 ssize_t pos,
550 struct re_registers *regs,
551 ssize_t stop);
fa9a63c5
RM
552\f
553/* These are the command codes that appear in compiled regular
4bb91c68 554 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
555 command code can specify any interpretation whatsoever for its
556 arguments. Zero bytes may appear in the compiled regular expression. */
557
/* Opcodes of the compiled pattern.  Enumerator values are implied by
   declaration order, starting from no_op = 0.  */
typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

	/* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

	/* Matches any (more or less) character.  */
  anychar,

	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
		   See RANGE_TABLE_WORK_BITS below.
	       2 bytes, the number of pairs that follow (up to 32767)
	       pairs, each 2 multibyte characters,
		   each multibyte character represented as 3 bytes.  */
  charset,

	/* Same parameters as charset, but match any character that is
	   not one of those specified.  */
  charset_not,

	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
	   field.  */
  start_memory,

	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
	   number, in the range 0 to one less than `re_nsub' in the
	   pattern buffer.  */
  stop_memory,

	/* Match a duplicate of something remembered.  Followed by one
	   byte containing the register number.  */
  duplicate,

	/* Fail unless at beginning of line.  */
  begline,

	/* Fail unless at end of line.  */
  endline,

	/* Succeeds if at beginning of buffer (if emacs) or at beginning
	   of string to be matched (if not).  */
  begbuf,

	/* Analogously, for end of buffer/string.  */
  endbuf,

	/* Followed by two-byte relative address to which to jump.  */
  jump,

	/* Followed by two-byte relative address of place to resume at
	   in case of failure.  */
  on_failure_jump,

	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
  on_failure_keep_string_jump,

	/* Just like `on_failure_jump', except that it checks that we
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

	/* Just like `on_failure_jump_loop', except that it checks for
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
	   by a `no_op'.  */
  on_failure_jump_nastyloop,

	/* A smart `on_failure_jump' used for greedy * and + operators.
	   It analyzes the loop before which it is put and if the
	   loop does not require backtracking, it changes itself to
	   `on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into `on_failure_jump'.
	   It assumes that it is pointing to just past a `jump'.  */
  on_failure_jump_smart,

	/* Followed by two-byte relative address and two-byte number n.
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
  succeed_n,

	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
  jump_n,

	/* Set the following two-byte relative address to the
	   subsequent two-byte number.  The address *includes* the two
	   bytes of number.  */
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound,	/* Succeeds if not at a word boundary.  */

  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

	/* Matches any character whose syntax is specified.  Followed by
	   a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
  notsyntaxspec

  /* The Emacs-only opcodes begin with a leading comma so that the
     enumerator list has no trailing comma when `emacs' is not
     defined.  */
#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Matches any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
697\f
698/* Common operations on the compiled pattern. */
699
700/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
701
702#define STORE_NUMBER(destination, number) \
703 do { \
704 (destination)[0] = (number) & 0377; \
705 (destination)[1] = (number) >> 8; \
706 } while (0)
707
708/* Same as STORE_NUMBER, except increment DESTINATION to
709 the byte after where the number is stored. Therefore, DESTINATION
710 must be an lvalue. */
711
712#define STORE_NUMBER_AND_INCR(destination, number) \
713 do { \
714 STORE_NUMBER (destination, number); \
715 (destination) += 2; \
716 } while (0)
717
718/* Put into DESTINATION a number stored in two contiguous bytes starting
719 at SOURCE. */
720
/* SOURCE[0] is the low-order byte; SOURCE[1] is the sign-extended
   high-order byte.  The high byte is scaled by multiplication rather
   than `<< 8' because left-shifting a negative value is undefined
   behavior.  */
#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
726
727#ifdef DEBUG
728static void
261cb4bb 729extract_number (int *dest, re_char *source)
fa9a63c5 730{
5e69f11e 731 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
732 *dest = *source & 0377;
733 *dest += temp << 8;
734}
735
4bb91c68 736# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
737# undef EXTRACT_NUMBER
738# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
739# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
740
741#endif /* DEBUG */
742
743/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
744 SOURCE must be an lvalue. */
745
746#define EXTRACT_NUMBER_AND_INCR(destination, source) \
747 do { \
748 EXTRACT_NUMBER (destination, source); \
25fe55af 749 (source) += 2; \
fa9a63c5
RM
750 } while (0)
751
752#ifdef DEBUG
753static void
261cb4bb 754extract_number_and_incr (int *destination, re_char **source)
5e69f11e 755{
fa9a63c5
RM
756 extract_number (destination, *source);
757 *source += 2;
758}
759
0b32bf0e
SM
760# ifndef EXTRACT_MACROS
761# undef EXTRACT_NUMBER_AND_INCR
762# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 763 extract_number_and_incr (&dest, &src)
0b32bf0e 764# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
765
766#endif /* DEBUG */
767\f
b18215fc
RS
768/* Store a multibyte character in three contiguous bytes starting
769 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 770 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
771
772#define STORE_CHARACTER_AND_INCR(destination, character) \
773 do { \
774 (destination)[0] = (character) & 0377; \
775 (destination)[1] = ((character) >> 8) & 0377; \
776 (destination)[2] = (character) >> 16; \
777 (destination) += 3; \
778 } while (0)
779
780/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 781 starting at SOURCE. */
b18215fc
RS
782
783#define EXTRACT_CHARACTER(destination, source) \
784 do { \
785 (destination) = ((source)[0] \
786 | ((source)[1] << 8) \
787 | ((source)[2] << 16)); \
788 } while (0)
789
790
791/* Macros for charset. */
792
793/* Size of bitmap of charset P in bytes. P is a start of charset,
794 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
795#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
796
797/* Nonzero if charset P has range table. */
25fe55af 798#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
799
/* Return the address of the range table of charset P -- not the start
   of the table's data itself, but the place just before it, where the
   number of ranges is stored.  `4 +' skips the re_opcode_t, the
   bitmap-size byte, and the 2 bytes of flags at the start of the
   range table.  */
804#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
805
806/* Extract the bit flags that start a range table. */
807#define CHARSET_RANGE_TABLE_BITS(p) \
808 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
809 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc 810
b18215fc 811/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
812 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
813 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
814 and end. */
815#define CHARSET_RANGE_TABLE_END(range_table, count) \
816 ((range_table) + (count) * 2 * 3)
817
7814e705 818/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
819 COUNT is number of ranges in RANGE_TABLE. */
820#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
821 do \
822 { \
01618498 823 re_wchar_t range_start, range_end; \
19ed5445 824 re_char *rtp; \
01618498 825 re_char *range_table_end \
b18215fc
RS
826 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
827 \
19ed5445 828 for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
b18215fc 829 { \
19ed5445
PE
830 EXTRACT_CHARACTER (range_start, rtp); \
831 EXTRACT_CHARACTER (range_end, rtp + 3); \
b18215fc
RS
832 \
833 if (range_start <= (c) && (c) <= range_end) \
834 { \
835 (not) = !(not); \
836 break; \
837 } \
838 } \
839 } \
840 while (0)
841
842/* Test if C is in range table of CHARSET. The flag NOT is negated if
843 C is listed in it. */
844#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
845 do \
846 { \
847 /* Number of ranges in range table. */ \
848 int count; \
01618498
SM
849 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
850 \
b18215fc
RS
851 EXTRACT_NUMBER_AND_INCR (count, range_table); \
852 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
853 } \
854 while (0)
855\f
fa9a63c5
RM
856/* If DEBUG is defined, Regex prints many voluminous messages about what
857 it is doing (if the variable `debug' is nonzero). If linked with the
858 main program in `iregex.c', you can enter patterns and strings
859 interactively. And if linked with the main program in `main.c' and
4bb91c68 860 the other test files, you can run the already-written tests. */
fa9a63c5
RM
861
862#ifdef DEBUG
863
864/* We use standard I/O for debugging. */
0b32bf0e 865# include <stdio.h>
fa9a63c5
RM
866
867/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 868# include <assert.h>
fa9a63c5 869
/* Debug switch: the DEBUG_PRINT* macros below produce output only
   while this is greater than zero.  */
static int debug = -100000;

/* Every printing macro is wrapped in `do ... while (0)' so that it
   expands to exactly one statement.  A bare `if (debug > 0) ...' would
   capture the `else' of an enclosing
   `if (cond) DEBUG_PRINT1 (...); else ...'.  */
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) \
  do { if (debug > 0) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2) \
  do { if (debug > 0) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug > 0) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug > 0) printf (x1, x2, x3, x4); } while (0)
/* The P argument is accepted but not used.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug > 0) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug > 0) print_double_string (w, s1, sz1, s2, sz2); } while (0)
fa9a63c5
RM
881
882
883/* Print the fastmap in human-readable form. */
884
885void
886print_fastmap (fastmap)
887 char *fastmap;
888{
889 unsigned was_a_range = 0;
5e69f11e
RM
890 unsigned i = 0;
891
fa9a63c5
RM
892 while (i < (1 << BYTEWIDTH))
893 {
894 if (fastmap[i++])
895 {
896 was_a_range = 0;
25fe55af
RS
897 putchar (i - 1);
898 while (i < (1 << BYTEWIDTH) && fastmap[i])
899 {
900 was_a_range = 1;
901 i++;
902 }
fa9a63c5 903 if (was_a_range)
25fe55af
RS
904 {
905 printf ("-");
906 putchar (i - 1);
907 }
908 }
fa9a63c5 909 }
5e69f11e 910 putchar ('\n');
fa9a63c5
RM
911}
912
913
914/* Print a compiled pattern string in human-readable form, starting at
915 the START pointer into it and ending just before the pointer END. */
916
917void
918print_partial_compiled_pattern (start, end)
01618498
SM
919 re_char *start;
920 re_char *end;
fa9a63c5
RM
921{
922 int mcnt, mcnt2;
01618498
SM
923 re_char *p = start;
924 re_char *pend = end;
fa9a63c5
RM
925
926 if (start == NULL)
927 {
a1a052df 928 fprintf (stderr, "(null)\n");
fa9a63c5
RM
929 return;
930 }
5e69f11e 931
fa9a63c5
RM
932 /* Loop over pattern commands. */
933 while (p < pend)
934 {
a1a052df 935 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
936
937 switch ((re_opcode_t) *p++)
938 {
25fe55af 939 case no_op:
a1a052df 940 fprintf (stderr, "/no_op");
25fe55af 941 break;
fa9a63c5 942
99633e97 943 case succeed:
a1a052df 944 fprintf (stderr, "/succeed");
99633e97
SM
945 break;
946
fa9a63c5
RM
947 case exactn:
948 mcnt = *p++;
a1a052df 949 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 950 do
fa9a63c5 951 {
a1a052df 952 fprintf (stderr, "/%c", *p++);
25fe55af
RS
953 }
954 while (--mcnt);
955 break;
fa9a63c5
RM
956
957 case start_memory:
a1a052df 958 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 959 break;
fa9a63c5
RM
960
961 case stop_memory:
a1a052df 962 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 963 break;
fa9a63c5
RM
964
965 case duplicate:
a1a052df 966 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
967 break;
968
969 case anychar:
a1a052df 970 fprintf (stderr, "/anychar");
fa9a63c5
RM
971 break;
972
973 case charset:
25fe55af
RS
974 case charset_not:
975 {
976 register int c, last = -100;
fa9a63c5 977 register int in_range = 0;
99633e97
SM
978 int length = CHARSET_BITMAP_SIZE (p - 1);
979 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 980
a1a052df 981 fprintf (stderr, "/charset [%s",
839966f3 982 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 983
839966f3
KH
984 if (p + *p >= pend)
985 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 986
25fe55af 987 for (c = 0; c < 256; c++)
96cc36cc 988 if (c / 8 < length
fa9a63c5
RM
989 && (p[1 + (c/8)] & (1 << (c % 8))))
990 {
991 /* Are we starting a range? */
992 if (last + 1 == c && ! in_range)
993 {
a1a052df 994 fprintf (stderr, "-");
fa9a63c5
RM
995 in_range = 1;
996 }
997 /* Have we broken a range? */
998 else if (last + 1 != c && in_range)
96cc36cc 999 {
a1a052df 1000 fprintf (stderr, "%c", last);
fa9a63c5
RM
1001 in_range = 0;
1002 }
5e69f11e 1003
fa9a63c5 1004 if (! in_range)
a1a052df 1005 fprintf (stderr, "%c", c);
fa9a63c5
RM
1006
1007 last = c;
25fe55af 1008 }
fa9a63c5
RM
1009
1010 if (in_range)
a1a052df 1011 fprintf (stderr, "%c", last);
fa9a63c5 1012
a1a052df 1013 fprintf (stderr, "]");
fa9a63c5 1014
99633e97 1015 p += 1 + length;
96cc36cc 1016
96cc36cc 1017 if (has_range_table)
99633e97
SM
1018 {
1019 int count;
a1a052df 1020 fprintf (stderr, "has-range-table");
99633e97
SM
1021
1022 /* ??? Should print the range table; for now, just skip it. */
1023 p += 2; /* skip range table bits */
1024 EXTRACT_NUMBER_AND_INCR (count, p);
1025 p = CHARSET_RANGE_TABLE_END (p, count);
1026 }
fa9a63c5
RM
1027 }
1028 break;
1029
1030 case begline:
a1a052df 1031 fprintf (stderr, "/begline");
25fe55af 1032 break;
fa9a63c5
RM
1033
1034 case endline:
a1a052df 1035 fprintf (stderr, "/endline");
25fe55af 1036 break;
fa9a63c5
RM
1037
1038 case on_failure_jump:
25fe55af 1039 extract_number_and_incr (&mcnt, &p);
a1a052df 1040 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1041 break;
fa9a63c5
RM
1042
1043 case on_failure_keep_string_jump:
25fe55af 1044 extract_number_and_incr (&mcnt, &p);
a1a052df 1045 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1046 break;
fa9a63c5 1047
0683b6fa
SM
1048 case on_failure_jump_nastyloop:
1049 extract_number_and_incr (&mcnt, &p);
a1a052df 1050 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1051 break;
1052
505bde11 1053 case on_failure_jump_loop:
fa9a63c5 1054 extract_number_and_incr (&mcnt, &p);
a1a052df 1055 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1056 break;
1057
505bde11 1058 case on_failure_jump_smart:
fa9a63c5 1059 extract_number_and_incr (&mcnt, &p);
a1a052df 1060 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1061 break;
1062
25fe55af 1063 case jump:
fa9a63c5 1064 extract_number_and_incr (&mcnt, &p);
a1a052df 1065 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1066 break;
1067
25fe55af
RS
1068 case succeed_n:
1069 extract_number_and_incr (&mcnt, &p);
1070 extract_number_and_incr (&mcnt2, &p);
a1a052df 1071 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1072 break;
5e69f11e 1073
25fe55af
RS
1074 case jump_n:
1075 extract_number_and_incr (&mcnt, &p);
1076 extract_number_and_incr (&mcnt2, &p);
a1a052df 1077 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1078 break;
5e69f11e 1079
25fe55af
RS
1080 case set_number_at:
1081 extract_number_and_incr (&mcnt, &p);
1082 extract_number_and_incr (&mcnt2, &p);
a1a052df 1083 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1084 break;
5e69f11e 1085
25fe55af 1086 case wordbound:
a1a052df 1087 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1088 break;
1089
1090 case notwordbound:
a1a052df 1091 fprintf (stderr, "/notwordbound");
25fe55af 1092 break;
fa9a63c5
RM
1093
1094 case wordbeg:
a1a052df 1095 fprintf (stderr, "/wordbeg");
fa9a63c5 1096 break;
5e69f11e 1097
fa9a63c5 1098 case wordend:
a1a052df 1099 fprintf (stderr, "/wordend");
e2543b02 1100 break;
5e69f11e 1101
669fa600 1102 case symbeg:
e2543b02 1103 fprintf (stderr, "/symbeg");
669fa600
SM
1104 break;
1105
1106 case symend:
e2543b02 1107 fprintf (stderr, "/symend");
669fa600 1108 break;
5e69f11e 1109
1fb352e0 1110 case syntaxspec:
a1a052df 1111 fprintf (stderr, "/syntaxspec");
1fb352e0 1112 mcnt = *p++;
a1a052df 1113 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1114 break;
1115
1116 case notsyntaxspec:
a1a052df 1117 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1118 mcnt = *p++;
a1a052df 1119 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1120 break;
1121
0b32bf0e 1122# ifdef emacs
fa9a63c5 1123 case before_dot:
a1a052df 1124 fprintf (stderr, "/before_dot");
25fe55af 1125 break;
fa9a63c5
RM
1126
1127 case at_dot:
a1a052df 1128 fprintf (stderr, "/at_dot");
25fe55af 1129 break;
fa9a63c5
RM
1130
1131 case after_dot:
a1a052df 1132 fprintf (stderr, "/after_dot");
25fe55af 1133 break;
fa9a63c5 1134
1fb352e0 1135 case categoryspec:
a1a052df 1136 fprintf (stderr, "/categoryspec");
fa9a63c5 1137 mcnt = *p++;
a1a052df 1138 fprintf (stderr, "/%d", mcnt);
25fe55af 1139 break;
5e69f11e 1140
1fb352e0 1141 case notcategoryspec:
a1a052df 1142 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1143 mcnt = *p++;
a1a052df 1144 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1145 break;
0b32bf0e 1146# endif /* emacs */
fa9a63c5 1147
fa9a63c5 1148 case begbuf:
a1a052df 1149 fprintf (stderr, "/begbuf");
25fe55af 1150 break;
fa9a63c5
RM
1151
1152 case endbuf:
a1a052df 1153 fprintf (stderr, "/endbuf");
25fe55af 1154 break;
fa9a63c5 1155
25fe55af 1156 default:
a1a052df 1157 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1158 }
1159
a1a052df 1160 fprintf (stderr, "\n");
fa9a63c5
RM
1161 }
1162
a1a052df 1163 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1164}
1165
1166
1167void
1168print_compiled_pattern (bufp)
1169 struct re_pattern_buffer *bufp;
1170{
01618498 1171 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1172
1173 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1174 printf ("%ld bytes used/%ld bytes allocated.\n",
1175 bufp->used, bufp->allocated);
fa9a63c5
RM
1176
1177 if (bufp->fastmap_accurate && bufp->fastmap)
1178 {
1179 printf ("fastmap: ");
1180 print_fastmap (bufp->fastmap);
1181 }
1182
1183 printf ("re_nsub: %d\t", bufp->re_nsub);
1184 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1185 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1186 printf ("no_sub: %d\t", bufp->no_sub);
1187 printf ("not_bol: %d\t", bufp->not_bol);
1188 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1189 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1190 fflush (stdout);
fa9a63c5
RM
1191 /* Perhaps we should print the translate table? */
1192}
1193
1194
1195void
1196print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1197 re_char *where;
1198 re_char *string1;
1199 re_char *string2;
d1dfb56c
EZ
1200 ssize_t size1;
1201 ssize_t size2;
fa9a63c5 1202{
d1dfb56c 1203 ssize_t this_char;
5e69f11e 1204
fa9a63c5
RM
1205 if (where == NULL)
1206 printf ("(null)");
1207 else
1208 {
1209 if (FIRST_STRING_P (where))
25fe55af
RS
1210 {
1211 for (this_char = where - string1; this_char < size1; this_char++)
1212 putchar (string1[this_char]);
fa9a63c5 1213
25fe55af
RS
1214 where = string2;
1215 }
fa9a63c5
RM
1216
1217 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1218 putchar (string2[this_char]);
fa9a63c5
RM
1219 }
1220}
1221
1222#else /* not DEBUG */
1223
0b32bf0e
SM
1224# undef assert
1225# define assert(e)
fa9a63c5 1226
0b32bf0e
SM
1227# define DEBUG_STATEMENT(e)
1228# define DEBUG_PRINT1(x)
1229# define DEBUG_PRINT2(x1, x2)
1230# define DEBUG_PRINT3(x1, x2, x3)
1231# define DEBUG_PRINT4(x1, x2, x3, x4)
1232# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1233# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1234
1235#endif /* not DEBUG */
1236\f
4da60324
PE
1237/* Use this to suppress gcc's `...may be used before initialized' warnings. */
1238#ifdef lint
1239# define IF_LINT(Code) Code
1240#else
1241# define IF_LINT(Code) /* empty */
1242#endif
1243\f
fa9a63c5
RM
1244/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1245 also be assigned to arbitrarily: each pattern buffer stores its own
1246 syntax, so it can be changed between regex compilations. */
1247/* This has no initializer because initialized variables in Emacs
1248 become read-only after dumping. */
1249reg_syntax_t re_syntax_options;
1250
1251
1252/* Specify the precise syntax of regexps for compilation. This provides
1253 for compatibility for various utilities which historically have
1254 different, incompatible syntaxes.
1255
1256 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1257 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1258
1259reg_syntax_t
971de7fb 1260re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1261{
1262 reg_syntax_t ret = re_syntax_options;
5e69f11e 1263
fa9a63c5
RM
1264 re_syntax_options = syntax;
1265 return ret;
1266}
c0f9ea08 1267WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1268
1269/* Regexp to use to replace spaces, or NULL meaning don't. */
1270static re_char *whitespace_regexp;
1271
1272void
971de7fb 1273re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1274{
6470ea05 1275 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1276}
1277WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1278\f
1279/* This table gives an error message for each of the error codes listed
4bb91c68 1280 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1281 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1282 but why not be nice? */
fa9a63c5
RM
1283
1284static const char *re_error_msgid[] =
5e69f11e
RM
1285 {
1286 gettext_noop ("Success"), /* REG_NOERROR */
1287 gettext_noop ("No match"), /* REG_NOMATCH */
1288 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1289 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1290 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1291 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1292 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1293 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1294 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1295 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1296 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1297 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1298 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1299 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1300 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1301 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1302 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1303 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1304 };
1305\f
4bb91c68 1306/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1307
1308/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1309 searching and matching functions should not call alloca. On some
1310 systems, alloca is implemented in terms of malloc, and if we're
1311 using the relocating allocator routines, then malloc could cause a
1312 relocation, which might (if the strings being searched are in the
1313 ralloc heap) shift the data out from underneath the regexp
1314 routines.
1315
5e69f11e 1316 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1317 processes input from X in a signal handler; processing X input may
1318 call malloc; if input arrives while a matching routine is calling
1319 malloc, then we're scrod. But Emacs can't just block input while
1320 calling matching routines; then we don't notice interrupts when
1321 they come in. So, Emacs blocks input around all regexp calls
1322 except the matching calls, which it leaves unprotected, in the
1323 faith that they will not malloc. */
1324
1325/* Normally, this is fine. */
1326#define MATCH_MAY_ALLOCATE
1327
fa9a63c5
RM
1328/* The match routines may not allocate if (1) they would do it with malloc
1329 and (2) it's not safe for them to use malloc.
1330 Note that if REL_ALLOC is defined, matching would not use malloc for the
1331 failure stack, but we would still use it for the register vectors;
4bb91c68 1332 so REL_ALLOC should not affect this. */
b588157e 1333#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1334# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1335#endif
1336
1337\f
1338/* Failure stack declarations and macros; both re_compile_fastmap and
1339 re_match_2 use a failure stack. These have to be macros because of
1340 REGEX_ALLOCATE_STACK. */
5e69f11e 1341
fa9a63c5 1342
320a2a73 1343/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1344 when matching. If this number is exceeded, we allocate more
1345 space, so it is not a hard limit. */
1346#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1347# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1348#endif
1349
/* Roughly the maximum number of failure points on the stack.  It would
   be exactly that if we always used TYPICAL_FAILURE_SIZE items each time
   we failed.
   This is a variable only so users of regex can assign to it; we never
   change it ourselves.  We always multiply it by TYPICAL_FAILURE_SIZE
   before using it, so it should probably be a byte-count instead.  */
c0f9ea08
SM
1355# if defined MATCH_MAY_ALLOCATE
1356/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1357 whose default stack limit is 2mb. In order for a larger
1358 value to work reliably, you have to try to make it accord
1359 with the process stack limit. */
c0f9ea08
SM
1360size_t re_max_failures = 40000;
1361# else
1362size_t re_max_failures = 4000;
1363# endif
fa9a63c5
RM
1364
/* One slot of the failure stack.  A slot holds either a pointer or an
   integer; the PUSH_FAILURE_* and POP_FAILURE_* macros pick the member.  */
union fail_stack_elt
{
  re_char *pointer;
  /* This should be the biggest `int' that's no bigger than a pointer.  */
  long integer;
};

typedef union fail_stack_elt fail_stack_elt_t;

/* The failure stack itself.  AVAIL and FRAME are offsets into STACK.  */
typedef struct
{
  fail_stack_elt_t *stack;
  size_t size;
  size_t avail; /* Offset of next open position.  */
  size_t frame; /* Offset of the cur constructed frame.  */
} fail_stack_type;
1381
505bde11 1382#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1383
1384
/* Define macros to initialize and free the failure stack.
   Do `return -2' if the alloc fails.

   Both variants assume a variable named `fail_stack' in the expanding
   function and reset its `avail' and `frame' offsets to 0.  Only the
   MATCH_MAY_ALLOCATE variant allocates (and may therefore return).  */

#ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.stack = (fail_stack_elt_t *) \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
                            * sizeof (fail_stack_elt_t)); \
 \
    if (fail_stack.stack == NULL) \
      return -2; \
 \
    /* NOTE(review): the request above is for INIT_FAILURE_ALLOC \
       * TYPICAL_FAILURE_SIZE elements, but only INIT_FAILURE_ALLOC is \
       recorded as the size -- confirm this is intended.  */ \
    fail_stack.size = INIT_FAILURE_ALLOC; \
    fail_stack.avail = 0; \
    fail_stack.frame = 0; \
  } while (0)
#else
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.avail = 0; \
    fail_stack.frame = 0; \
  } while (0)

/* Resize ADDR to N elements of type T with RETALLOC if ADDR is nonnull,
   otherwise allocate it afresh with TALLOC.  This expands to a bare
   if/else, so do not use it as the body of an unbraced `if'.  */
# define RETALLOC_IF(addr, n, t) \
  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
#endif
1412
1413
320a2a73
KH
1414/* Double the size of FAIL_STACK, up to a limit
1415 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1416
1417 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1418 allocating space for it or it was already too large.
1419
4bb91c68 1420 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1421
320a2a73
KH
1422/* Factor to increase the failure stack size by
1423 when we increase it.
1424 This used to be 2, but 2 was too wasteful
1425 because the old discarded stacks added up to as much space
1426 were as ultimate, maximum-size stack. */
1427#define FAIL_STACK_GROWTH_FACTOR 4
1428
1429#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1430 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1431 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1432 ? 0 \
320a2a73
KH
1433 : ((fail_stack).stack \
1434 = (fail_stack_elt_t *) \
25fe55af
RS
1435 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1436 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1437 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1438 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1439 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1440 \
1441 (fail_stack).stack == NULL \
1442 ? 0 \
6453db45
KH
1443 : ((fail_stack).size \
1444 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1445 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1446 * FAIL_STACK_GROWTH_FACTOR)) \
1447 / sizeof (fail_stack_elt_t)), \
25fe55af 1448 1)))
fa9a63c5
RM
1449
1450
fa9a63c5
RM
1451/* Push a pointer value onto the failure stack.
1452 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1453 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1454#define PUSH_FAILURE_POINTER(item) \
01618498 1455 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1456
1457/* This pushes an integer-valued item onto the failure stack.
1458 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1459 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1460#define PUSH_FAILURE_INT(item) \
1461 fail_stack.stack[fail_stack.avail++].integer = (item)
1462
b313f9d8 1463/* These POP... operations complement the PUSH... operations.
fa9a63c5
RM
1464 All assume that `fail_stack' is nonempty. */
1465#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1466#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
fa9a63c5 1467
505bde11
SM
1468/* Individual items aside from the registers. */
1469#define NUM_NONREG_ITEMS 3
1470
1471/* Used to examine the stack (to detect infinite loops). */
1472#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1473#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1474#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1475#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1476
1477
505bde11
SM
/* Grow the failure stack until more than SPACE slots are free.
   Does `return -2' from the expanding function if GROW_FAIL_STACK
   fails.  Assumes the variable `fail_stack'.

   SPACE is parenthesized so that any expression may be passed, and the
   size_t quantities are converted to int for the `%d' debug output.  */
#define ENSURE_FAIL_STACK(space) \
while (REMAINING_AVAIL_SLOTS <= (space)) { \
  if (!GROW_FAIL_STACK (fail_stack)) \
    return -2; \
  DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (int) (fail_stack).size);\
  DEBUG_PRINT2 (" slots available: %d\n", (int) REMAINING_AVAIL_SLOTS);\
}
1485
/* Push register NUM onto the stack.

   Three slots are pushed, in this order: regstart[NUM], regend[NUM],
   then NUM itself; POP_FAILURE_REG_OR_COUNT pops them in reverse.
   Assumes the variables `fail_stack', `regstart' and `regend'.  May
   `return -2' through ENSURE_FAIL_STACK.  */
#define PUSH_FAILURE_REG(num) \
do { \
  char *destination; \
  ENSURE_FAIL_STACK(3); \
  DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
                num, regstart[num], regend[num]); \
  PUSH_FAILURE_POINTER (regstart[num]); \
  PUSH_FAILURE_POINTER (regend[num]); \
  PUSH_FAILURE_INT (num); \
} while (0)
1497
01618498
SM
/* Change the counter's value to VAL, but make sure that it will
   be reset when backtracking.

   The counter's current value (read from PTR with EXTRACT_NUMBER) is
   pushed, followed by PTR and the marker -1; only then is VAL stored at
   PTR.  The -1 marker is what POP_FAILURE_REG_OR_COUNT tests to tell a
   saved counter from a saved register.  May `return -2' through
   ENSURE_FAIL_STACK.  */
#define PUSH_NUMBER(ptr,val) \
do { \
  char *destination; \
  int c; \
  ENSURE_FAIL_STACK(3); \
  EXTRACT_NUMBER (c, ptr); \
  DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
  PUSH_FAILURE_INT (c); \
  PUSH_FAILURE_POINTER (ptr); \
  PUSH_FAILURE_INT (-1); \
  STORE_NUMBER (ptr, val); \
} while (0)
1512
/* Pop a saved register or a saved counter off the stack.

   The top slot is an integer tag.  A tag of -1 marks a counter: the
   next two slots are the counter's address and its saved value, which
   is stored back.  Any other tag is a register number: the next two
   slots are that register's end and start pointers.

   The tag is a long, so the debug output prints it with `%ld'.  */
#define POP_FAILURE_REG_OR_COUNT() \
do { \
  long pfreg = POP_FAILURE_INT (); \
  if (pfreg == -1) \
    { \
      /* It's a counter. */ \
      /* Here, we discard `const', making re_match non-reentrant. */ \
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
      pfreg = POP_FAILURE_INT (); \
      STORE_NUMBER (ptr, pfreg); \
      DEBUG_PRINT3 (" Pop counter %p = %ld\n", ptr, pfreg); \
    } \
  else \
    { \
      regend[pfreg] = POP_FAILURE_POINTER (); \
      regstart[pfreg] = POP_FAILURE_POINTER (); \
      DEBUG_PRINT4 (" Pop reg %ld (spanning %p -> %p)\n", \
                    pfreg, regstart[pfreg], regend[pfreg]); \
    } \
} while (0)
1534
/* Check that we are not stuck in an infinite loop.

   Starting from the top failure frame, follow the chain of frames for
   as long as the saved string position is STRING_PLACE or NULL.  If one
   of those frames saved the pattern position PAT_CUR, set the variable
   `cycle' to 1.  Assumes the variables `fail_stack', `cycle' and (for
   the assertion) `bufp'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
do { \
  ssize_t failure = TOP_FAILURE_HANDLE (); \
  /* Check for infinite matching loops */ \
  while (failure > 0 \
         && (FAILURE_STR (failure) == string_place \
             || FAILURE_STR (failure) == NULL)) \
    { \
      assert (FAILURE_PAT (failure) >= bufp->buffer \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
      if (FAILURE_PAT (failure) == pat_cur) \
        { \
          cycle = 1; \
          break; \
        } \
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
      failure = NEXT_FAILURE_HANDLE(failure); \
    } \
  /* NOTE(review): when the loop ends with failure == 0, FAILURE_STR \
     indexes the stack at -2 in DEBUG builds -- confirm.  */ \
  DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
} while (0)
6df42991 1556
/* Push the information about the state we will need
   if we ever fail back to it.

   NUM_NONREG_ITEMS slots are pushed, in this order: the current frame
   index, STRING_PLACE, and PATTERN.  The frame index is then moved to
   the new top of the stack.

   Requires the variable fail_stack be declared; DEBUG builds also use
   nfailure_points_pushed, bufp, pend, string1, size1, string2 and
   size2.  GROW_FAIL_STACK requires `destination' be declared, which
   this macro does itself.

   Does `return -2' (through ENSURE_FAIL_STACK) if it runs out of
   memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place) \
do { \
  char *destination; \
 \
  DEBUG_STATEMENT (nfailure_points_pushed++); \
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
  /* The stack offsets are size_t; convert them for `%d'.  */ \
  DEBUG_PRINT2 (" Before push, next avail: %d\n", (int) (fail_stack).avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) (fail_stack).size);\
 \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
 \
  DEBUG_PRINT1 ("\n"); \
 \
  DEBUG_PRINT2 (" Push frame index: %d\n", (int) fail_stack.frame); \
  PUSH_FAILURE_INT (fail_stack.frame); \
 \
  DEBUG_PRINT2 (" Push string %p: `", string_place); \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT1 ("'\n"); \
  PUSH_FAILURE_POINTER (string_place); \
 \
  DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
  PUSH_FAILURE_POINTER (pattern); \
 \
  /* Close the frame by moving the frame pointer past it. */ \
  fail_stack.frame = fail_stack.avail; \
} while (0)
fa9a63c5 1596
320a2a73
KH
1597/* Estimate the size of data pushed by a typical failure stack entry.
1598 An estimate is all we need, because all we use this for
1599 is to choose a limit for how big to make the failure stack. */
ada30c0e 1600/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1601#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1602
fa9a63c5
RM
1603/* How many items can still be added to the stack without overflowing it. */
1604#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1605
1606
/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
   Registers and counters saved above the frame are restored first, by
   POP_FAILURE_REG_OR_COUNT, into the arrays `regstart' and `regend' or
   into the compiled pattern.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */

#define POP_FAILURE_POINT(str, pat) \
do { \
  assert (!FAIL_STACK_EMPTY ()); \
 \
  /* Remove failure points and point to how many regs pushed. */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  /* The stack offsets are size_t; convert them for `%d'.  */ \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", (int) fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) fail_stack.size); \
 \
  /* Pop the saved registers. */ \
  while (fail_stack.frame < fail_stack.avail) \
    POP_FAILURE_REG_OR_COUNT (); \
 \
  pat = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
 \
  /* If the saved string location is NULL, it came from an \
     on_failure_keep_string_jump opcode, and we want to throw away the \
     saved NULL, thus retaining our current position in the string. */ \
  str = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
 \
  fail_stack.frame = POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping frame index: %d\n", (int) fail_stack.frame); \
 \
  /* NOTE(review): this is always true if `avail' is unsigned.  */ \
  assert (fail_stack.avail >= 0); \
  assert (fail_stack.frame <= fail_stack.avail); \
 \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1650
1651
1652\f
fa9a63c5 1653/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1654#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1655\f
1656/* Subroutine declarations and macros for regex_compile. */
1657
261cb4bb
PE
1658static reg_errcode_t regex_compile (re_char *pattern, size_t size,
1659 reg_syntax_t syntax,
1660 struct re_pattern_buffer *bufp);
1661static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1662static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1663static void insert_op1 (re_opcode_t op, unsigned char *loc,
1664 int arg, unsigned char *end);
1665static void insert_op2 (re_opcode_t op, unsigned char *loc,
1666 int arg1, int arg2, unsigned char *end);
1667static boolean at_begline_loc_p (re_char *pattern, re_char *p,
1668 reg_syntax_t syntax);
1669static boolean at_endline_loc_p (re_char *p, re_char *pend,
1670 reg_syntax_t syntax);
1671static re_char *skip_one_char (re_char *p);
1672static int analyse_first (re_char *p, re_char *pend,
1673 char *fastmap, const int multibyte);
fa9a63c5 1674
/* Fetch the next character in the uncompiled pattern, with no
   translation.

   Stores the character in C and advances `p' by the character's length
   as reported by RE_STRING_CHAR_AND_LENGTH.  Does `return REG_EEND'
   from the expanding function when `p' has reached `pend'.  Assumes the
   variables `p', `pend' and `multibyte'.  */
#define PATFETCH(c) \
  do { \
    int len; \
    if (p == pend) return REG_EEND; \
    c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
    p += len; \
  } while (0)
1684
fa9a63c5
RM
1685
1686/* If `translate' is non-null, return translate[D], else just D. We
1687 cast the subscript to translate because some data is declared as
1688 `char *', to avoid warnings when a string constant is passed. But
1689 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1690#ifndef TRANSLATE
0b32bf0e 1691# define TRANSLATE(d) \
66f0296e 1692 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1693#endif
fa9a63c5
RM
1694
1695
1696/* Macros for outputting the compiled pattern into `buffer'. */
1697
1698/* If the buffer isn't allocated when it comes in, use this. */
1699#define INIT_BUF_SIZE 32
1700
/* Make sure we have at least N more bytes of space in buffer.
   Assumes the variables `b' (the current write position) and `bufp'.
   This expands to a bare `while' statement that calls EXTEND_BUFFER
   until the space is available.  */
#define GET_BUFFER_SPACE(n) \
    while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
      EXTEND_BUFFER ()

/* Make sure we have one more byte of buffer space and then add C to it.
   C is truncated to unsigned char.  */
#define BUF_PUSH(c) \
  do { \
    GET_BUFFER_SPACE (1); \
    *b++ = (unsigned char) (c); \
  } while (0)


/* Ensure we have two more bytes of buffer space and then append C1 and C2.
   Space for both bytes is reserved before either is stored.  */
#define BUF_PUSH_2(c1, c2) \
  do { \
    GET_BUFFER_SPACE (2); \
    *b++ = (unsigned char) (c1); \
    *b++ = (unsigned char) (c2); \
  } while (0)
1721
1722
fa9a63c5 1723/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1724 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1725#define STORE_JUMP(op, loc, to) \
1726 store_op1 (op, loc, (to) - (loc) - 3)
1727
1728/* Likewise, for a two-argument jump. */
1729#define STORE_JUMP2(op, loc, to, arg) \
1730 store_op2 (op, loc, (to) - (loc) - 3, arg)
1731
4bb91c68 1732/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1733#define INSERT_JUMP(op, loc, to) \
1734 insert_op1 (op, loc, (to) - (loc) - 3, b)
1735
1736/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1737#define INSERT_JUMP2(op, loc, to, arg) \
1738 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1739
1740
1741/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1742 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1743 be too small, many things would have to change. */
839966f3
KH
1744# define MAX_BUF_SIZE (1L << 15)
1745
1746#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1747/* Any other compiler which, like MSC, has allocation limit below 2^16
1748 bytes will have to use approach similar to what was done below for
1749 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1750 reallocating to 0 bytes. Such thing is not going to work too well.
1751 You have been warned!! */
1752#if defined _MSC_VER && !defined WIN32
1753/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1754# define MAX_BUF_SIZE 65500L
1755#else
1756# define MAX_BUF_SIZE (1L << 16)
1757#endif
839966f3 1758#endif /* 0 */
fa9a63c5
RM
1759
/* Double the size of the pattern buffer via realloc, capping it at
   MAX_BUF_SIZE, and reset the pointers that pointed into the old block
   to point to the correct places in the new one.  If the buffer is
   already MAX_BUF_SIZE bytes long, return REG_ESIZE from the enclosing
   function; if the reallocation fails, return REG_ESPACE.  */
/* Helpers for EXTEND_BUFFER.  MOVE_BUFFER_POINTER rebases pointer P from
   `old_buffer' onto `new_buffer' (both must be in scope at the point of
   use).  With bounded pointers the low/high bounds are maintained too,
   and ELSE_EXTEND_BUFFER_HIGH_BOUND refreshes the high bounds when the
   block was resized without moving.  */
#if __BOUNDED_POINTERS__
# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
# define MOVE_BUFFER_POINTER(P)                                         \
  (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer),             \
   SET_HIGH_BOUND (P),                                                  \
   __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND                                  \
  else                                                                  \
    {                                                                   \
      SET_HIGH_BOUND (b);                                               \
      SET_HIGH_BOUND (begalt);                                          \
      if (fixup_alt_jump)                                               \
        SET_HIGH_BOUND (fixup_alt_jump);                                \
      if (laststart)                                                    \
        SET_HIGH_BOUND (laststart);                                     \
      if (pending_exact)                                                \
        SET_HIGH_BOUND (pending_exact);                                 \
    }
#else
# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif

/* Double bufp->allocated (capped at MAX_BUF_SIZE), reallocate
   bufp->buffer to that size and rebase `b', `begalt' and the non-null
   ones among `fixup_alt_jump', `laststart' and `pending_exact'.
   Returns REG_ESIZE from the enclosing function when the buffer is
   already at MAX_BUF_SIZE, and REG_ESPACE when bufp->buffer is null
   after RETALLOC.  */
#define EXTEND_BUFFER()                                                 \
  do {                                                                  \
    unsigned char *old_buffer = bufp->buffer;                           \
    /* Already at the hard limit: nothing more can be added.  */        \
    if (bufp->allocated == MAX_BUF_SIZE)                                \
      return REG_ESIZE;                                                 \
    bufp->allocated <<= 1;                                              \
    if (bufp->allocated > MAX_BUF_SIZE)                                 \
      bufp->allocated = MAX_BUF_SIZE;                                   \
    RETALLOC (bufp->buffer, bufp->allocated, unsigned char);            \
    if (!bufp->buffer)                                                  \
      return REG_ESPACE;                                                \
    /* If the buffer moved, move all the pointers into it.  */          \
    if (old_buffer != bufp->buffer)                                     \
      {                                                                 \
        unsigned char *new_buffer = bufp->buffer;                       \
        MOVE_BUFFER_POINTER (b);                                        \
        MOVE_BUFFER_POINTER (begalt);                                   \
        if (fixup_alt_jump != NULL)                                     \
          MOVE_BUFFER_POINTER (fixup_alt_jump);                         \
        if (laststart != NULL)                                          \
          MOVE_BUFFER_POINTER (laststart);                              \
        if (pending_exact != NULL)                                      \
          MOVE_BUFFER_POINTER (pending_exact);                          \
      }                                                                 \
    ELSE_EXTEND_BUFFER_HIGH_BOUND                                       \
  } while (0)
1812
1813
1814/* Since we have one byte reserved for the register number argument to
1815 {start,stop}_memory, the maximum number of groups we can report
1816 things about is what fits in that byte. */
1817#define MAX_REGNUM 255
1818
1819/* But patterns can have more than `MAX_REGNUM' registers. We just
1820 ignore the excess. */
098d42af 1821typedef int regnum_t;
fa9a63c5
RM
1822
1823
1824/* Macros for the compile stack. */
1825
1826/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1827 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1828/* int may be not enough when sizeof(int) == 2. */
1829typedef long pattern_offset_t;
fa9a63c5
RM
1830
1831typedef struct
1832{
1833 pattern_offset_t begalt_offset;
1834 pattern_offset_t fixup_alt_jump;
5e69f11e 1835 pattern_offset_t laststart_offset;
fa9a63c5
RM
1836 regnum_t regnum;
1837} compile_stack_elt_t;
1838
1839
1840typedef struct
1841{
1842 compile_stack_elt_t *stack;
d1dfb56c
EZ
1843 size_t size;
1844 size_t avail; /* Offset of next open position. */
fa9a63c5
RM
1845} compile_stack_type;
1846
1847
/* Number of entries the compile stack starts out with.  */
#define INIT_COMPILE_STACK_SIZE 32

/* Predicates on the local variable `compile_stack'.  */
#define COMPILE_STACK_EMPTY (0 == compile_stack.avail)
#define COMPILE_STACK_FULL  (compile_stack.size == compile_stack.avail)

/* The next available element.  */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1855
1cee1e27
SM
/* Explicit quit checking is only used on NTemacs and whenever we
   use polling to process input events.  In every other configuration
   IMMEDIATE_QUIT_CHECK expands to a no-op expression.  */
#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
extern int immediate_quit;
# define IMMEDIATE_QUIT_CHECK                   \
    do {                                        \
      if (immediate_quit)                       \
        QUIT;                                   \
    } while (0)
#else
# define IMMEDIATE_QUIT_CHECK ((void) 0)
#endif
1867\f
b18215fc
RS
/* Work area in which the range table of a charset is built up.  */
struct range_table_work_area
{
  int *table;                   /* The work area proper.  */
  int allocated;                /* Size of TABLE, in bytes.  */
  int used;                     /* Number of ints of TABLE in use.  */
  int bits;                     /* Flags recording character classes.  */
};
1876
77d11aec
RS
/* Make sure that WORK_AREA has room for N more table entries (ints).
   WORK_AREA must be a `struct range_table_work_area' lvalue, not a
   pointer: the macro reads its fields with `.' and takes its address
   itself.  The area is extended repeatedly until N entries fit, so N
   may exceed the amount added by a single extension.
   If it can't get the space, it returns REG_ESPACE from the
   surrounding function.  */

#define EXTEND_RANGE_TABLE(work_area, n)                                \
  do {                                                                  \
    while (((work_area).used + (n)) * sizeof (int)                      \
           > (size_t) (work_area).allocated)                            \
      {                                                                 \
        extend_range_table_work_area (&(work_area));                    \
        /* A null table is how the helper reports failure.  */          \
        if ((work_area).table == 0)                                     \
          return (REG_ESPACE);                                          \
      }                                                                 \
  } while (0)
1891
96cc36cc
RS
/* Turn on BIT in the character-class flags of WORK_AREA.  The whole
   expansion is parenthesized so the macro can be used safely inside a
   larger expression.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)           \
  ((work_area).bits |= (bit))
1894
14473664
SM
1895/* Bits used to implement the multibyte-part of the various character classes
1896 such as [:alnum:] in a charset's range table. */
1897#define BIT_WORD 0x1
1898#define BIT_LOWER 0x2
1899#define BIT_PUNCT 0x4
1900#define BIT_SPACE 0x8
1901#define BIT_UPPER 0x10
1902#define BIT_MULTIBYTE 0x20
96cc36cc 1903
b18215fc
RS
/* Append the pair (RANGE_START, RANGE_END) to the table of WORK_AREA,
   growing it first if necessary.  EXTEND_RANGE_TABLE may return from
   the surrounding function.  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)    \
  do {                                                                  \
    EXTEND_RANGE_TABLE ((work_area), 2);                                \
    (work_area).table[(work_area).used] = (range_start);                \
    (work_area).used++;                                                 \
    (work_area).table[(work_area).used] = (range_end);                  \
    (work_area).used++;                                                 \
  } while (0)
1911
/* Free the memory allocated for the table of WORK_AREA.  A null table
   needs no guard: free (NULL) is a no-op in C89 and later.  The table
   pointer itself is left unchanged.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)   \
  do {                                          \
    free ((work_area).table);                   \
  } while (0)
1918
96cc36cc 1919#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1920#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1921#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1922#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1923\f
b18215fc 1924
fa9a63c5 1925/* Set the bit for character C in a list. */
01618498 1926#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1927
1928
bf216479
KH
1929#ifdef emacs
1930
cf9c99bc
KH
1931/* Store characters in the range FROM to TO in the bitmap at B (for
1932 ASCII and unibyte characters) and WORK_AREA (for multibyte
1933 characters) while translating them and paying attention to the
1934 continuity of translated characters.
8f924df7 1935
cf9c99bc
KH
1936 Implementation note: It is better to implement these fairly big
1937 macros by a function, but it's not that easy because macros called
8f924df7 1938 in this macro assume various local variables already declared. */
bf216479 1939
cf9c99bc
KH
/* Both FROM and TO are ASCII characters.  Each character of the range
   is translated; a non-ASCII translation is recorded in WORK_AREA and
   then converted back to unibyte for the bitmap, falling back to the
   untranslated character when that conversion yields a negative
   value.  */

#define SETUP_ASCII_RANGE(work_area, FROM, TO)                          \
  do {                                                                  \
    int C0, C1;                                                         \
                                                                        \
    C0 = (FROM);                                                        \
    while (C0 <= (TO))                                                  \
      {                                                                 \
        C1 = TRANSLATE (C0);                                            \
        if (! ASCII_CHAR_P (C1))                                        \
          {                                                             \
            SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);            \
            C1 = RE_CHAR_TO_UNIBYTE (C1);                               \
            if (C1 < 0)                                                 \
              C1 = C0;                                                  \
          }                                                             \
        SET_LIST_BIT (C1);                                              \
        C0++;                                                           \
      }                                                                 \
  } while (0)
1958
1959
/* Both FROM and TO are unibyte characters (0x80..0xFF).  Characters
   whose multibyte form satisfies CHAR_BYTE8_P only get their bitmap
   bit set.  For the others, the translated character C2 is also merged
   into the ranges this invocation has added to WORK_AREA (entries from
   index USED onwards): a range containing or adjacent to C2 absorbs
   it, otherwise the new range C2..C2 is appended.  */

#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)                        \
  do {                                                                  \
    int C0, C1, C2, I;                                                  \
    int USED = RANGE_TABLE_WORK_USED (work_area);                       \
                                                                        \
    for (C0 = (FROM); C0 <= (TO); C0++)                                 \
      {                                                                 \
        C1 = RE_CHAR_TO_MULTIBYTE (C0);                                 \
        if (CHAR_BYTE8_P (C1))                                          \
          {                                                             \
            SET_LIST_BIT (C0);                                          \
            continue;                                                   \
          }                                                             \
        C2 = TRANSLATE (C1);                                            \
        if (C2 == C1                                                    \
            || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0)                      \
          C1 = C0;                                                      \
        SET_LIST_BIT (C1);                                              \
        /* Scan this invocation's ranges, newest first.  */             \
        I = RANGE_TABLE_WORK_USED (work_area) - 2;                      \
        while (I >= USED)                                               \
          {                                                             \
            int from = RANGE_TABLE_WORK_ELT (work_area, I);             \
            int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);           \
                                                                        \
            if (C2 >= from - 1 && C2 <= to + 1)                         \
              {                                                         \
                if (C2 == from - 1)                                     \
                  RANGE_TABLE_WORK_ELT (work_area, I)--;                \
                else if (C2 == to + 1)                                  \
                  RANGE_TABLE_WORK_ELT (work_area, I + 1)++;            \
                break;                                                  \
              }                                                         \
            I -= 2;                                                     \
          }                                                             \
        /* No range absorbed C2: record it on its own.  */              \
        if (I < USED)                                                   \
          SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);              \
      }                                                                 \
  } while (0)
1998
1999
/* Both FROM and TO are multibyte characters.  The range FROM..TO is
   recorded in WORK_AREA first.  Then, for every character in it, the
   bitmap bit of its unibyte equivalent (if any) is set, and a
   translation falling outside FROM..TO is merged into the ranges this
   invocation has added (entries from index USED onwards) or appended
   as a new single-character range.  */

#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)                      \
  do {                                                                  \
    int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area);        \
                                                                        \
    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));              \
    for (C0 = (FROM); C0 <= (TO); C0++)                                 \
      {                                                                 \
        C1 = TRANSLATE (C0);                                            \
        if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0                         \
            || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))       \
          SET_LIST_BIT (C2);                                            \
        /* Translations inside FROM..TO are already covered.  */        \
        if (C1 >= (FROM) && C1 <= (TO))                                 \
          continue;                                                     \
        I = RANGE_TABLE_WORK_USED (work_area) - 2;                      \
        while (I >= USED)                                               \
          {                                                             \
            int from = RANGE_TABLE_WORK_ELT (work_area, I);             \
            int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);           \
                                                                        \
            if (C1 >= from - 1 && C1 <= to + 1)                         \
              {                                                         \
                if (C1 == from - 1)                                     \
                  RANGE_TABLE_WORK_ELT (work_area, I)--;                \
                else if (C1 == to + 1)                                  \
                  RANGE_TABLE_WORK_ELT (work_area, I + 1)++;            \
                break;                                                  \
              }                                                         \
            I -= 2;                                                     \
          }                                                             \
        if (I < USED)                                                   \
          SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);              \
      }                                                                 \
  } while (0)
2033
2034#endif /* emacs */
2035
/* Get the next unsigned number in the uncompiled pattern.

   Reads decimal digits through PATFETCH into the local `c' and
   accumulates them in NUM, an int lvalue; a negative NUM is reset to 0
   when the first digit is seen.  On normal exit `c' holds the first
   non-digit character fetched.

   Leaves the enclosing function through FREE_STACK_RETURN with
   REG_EBRACE if the pattern ends before a non-digit is read, and with
   REG_BADBR if the number does not fit in an int.  The range test is
   made before the multiplication, so no signed overflow (undefined
   behavior) can occur.  The limit is computed as ~0U >> 1, which equals
   INT_MAX wherever unsigned has exactly one more value bit than int.  */
#define GET_UNSIGNED_NUMBER(num)                                        \
  do {                                                                  \
    if (p == pend)                                                      \
      FREE_STACK_RETURN (REG_EBRACE);                                   \
    else                                                                \
      {                                                                 \
        PATFETCH (c);                                                   \
        while ('0' <= c && c <= '9')                                    \
          {                                                             \
            const int num_limit_ = (int) (~0U >> 1);                    \
            const int digit_ = c - '0';                                 \
            if ((num) < 0)                                              \
              (num) = 0;                                                \
            /* Would NUM * 10 + DIGIT exceed the limit?  */             \
            if ((num) > (num_limit_ - digit_) / 10)                     \
              FREE_STACK_RETURN (REG_BADBR);                            \
            (num) = (num) * 10 + digit_;                                \
            if (p == pend)                                              \
              FREE_STACK_RETURN (REG_EBRACE);                           \
            PATFETCH (c);                                               \
          }                                                             \
      }                                                                 \
  } while (0)
77d11aec 2059\f
1fdab503 2060#if ! WIDE_CHAR_SUPPORT
01618498 2061
14473664 2062/* Map a string to the char class it names (if any). */
1fdab503 2063re_wctype_t
971de7fb 2064re_wctype (const re_char *str)
14473664 2065{
5b0534c8 2066 const char *string = (const char *) str;
14473664
SM
2067 if (STREQ (string, "alnum")) return RECC_ALNUM;
2068 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2069 else if (STREQ (string, "word")) return RECC_WORD;
2070 else if (STREQ (string, "ascii")) return RECC_ASCII;
2071 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2072 else if (STREQ (string, "graph")) return RECC_GRAPH;
2073 else if (STREQ (string, "lower")) return RECC_LOWER;
2074 else if (STREQ (string, "print")) return RECC_PRINT;
2075 else if (STREQ (string, "punct")) return RECC_PUNCT;
2076 else if (STREQ (string, "space")) return RECC_SPACE;
2077 else if (STREQ (string, "upper")) return RECC_UPPER;
2078 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2079 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2080 else if (STREQ (string, "digit")) return RECC_DIGIT;
2081 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2082 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2083 else if (STREQ (string, "blank")) return RECC_BLANK;
2084 else return 0;
2085}
2086
e0f24100 2087/* True if CH is in the char class CC. */
1fdab503 2088boolean
971de7fb 2089re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2090{
2091 switch (cc)
2092 {
f3fcc40d
AS
2093 case RECC_ALNUM: return ISALNUM (ch) != 0;
2094 case RECC_ALPHA: return ISALPHA (ch) != 0;
2095 case RECC_BLANK: return ISBLANK (ch) != 0;
2096 case RECC_CNTRL: return ISCNTRL (ch) != 0;
2097 case RECC_DIGIT: return ISDIGIT (ch) != 0;
2098 case RECC_GRAPH: return ISGRAPH (ch) != 0;
2099 case RECC_LOWER: return ISLOWER (ch) != 0;
2100 case RECC_PRINT: return ISPRINT (ch) != 0;
2101 case RECC_PUNCT: return ISPUNCT (ch) != 0;
2102 case RECC_SPACE: return ISSPACE (ch) != 0;
2103 case RECC_UPPER: return ISUPPER (ch) != 0;
2104 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2105 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
213bd7f2 2106 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
f3fcc40d 2107 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
213bd7f2 2108 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
f3fcc40d 2109 case RECC_WORD: return ISWORD (ch) != 0;
0cdd06f8
SM
2110 case RECC_ERROR: return false;
2111 default:
5e617bc2 2112 abort ();
14473664
SM
2113 }
2114}
fa9a63c5 2115
14473664
SM
2116/* Return a bit-pattern to use in the range-table bits to match multibyte
2117 chars of class CC. */
2118static int
971de7fb 2119re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2120{
2121 switch (cc)
2122 {
2123 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2124 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2125 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2126 case RECC_LOWER: return BIT_LOWER;
2127 case RECC_UPPER: return BIT_UPPER;
2128 case RECC_PUNCT: return BIT_PUNCT;
2129 case RECC_SPACE: return BIT_SPACE;
14473664 2130 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2131 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2132 default:
5e617bc2 2133 abort ();
14473664
SM
2134 }
2135}
2136#endif
77d11aec
RS
2137\f
2138/* Filling in the work area of a range. */
2139
2140/* Actually extend the space in WORK_AREA. */
2141
2142static void
971de7fb 2143extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2144{
77d11aec
RS
2145 work_area->allocated += 16 * sizeof (int);
2146 if (work_area->table)
2147 work_area->table
2148 = (int *) realloc (work_area->table, work_area->allocated);
2149 else
2150 work_area->table
2151 = (int *) malloc (work_area->allocated);
2152}
2153
8f924df7 2154#if 0
77d11aec
RS
2155#ifdef emacs
2156
2157/* Carefully find the ranges of codes that are equivalent
2158 under case conversion to the range start..end when passed through
2159 TRANSLATE. Handle the case where non-letters can come in between
2160 two upper-case letters (which happens in Latin-1).
2161 Also handle the case of groups of more than 2 case-equivalent chars.
2162
2163 The basic method is to look at consecutive characters and see
2164 if they can form a run that can be handled as one.
2165
2166 Returns -1 if successful, REG_ESPACE if ran out of space. */
2167
2168static int
1dae0f0a
AS
2169set_image_of_range_1 (struct range_table_work_area *work_area,
2170 re_wchar_t start, re_wchar_t end,
2171 RE_TRANSLATE_TYPE translate)
77d11aec
RS
2172{
2173 /* `one_case' indicates a character, or a run of characters,
2174 each of which is an isolate (no case-equivalents).
2175 This includes all ASCII non-letters.
2176
2177 `two_case' indicates a character, or a run of characters,
2178 each of which has two case-equivalent forms.
2179 This includes all ASCII letters.
2180
2181 `strange' indicates a character that has more than one
2182 case-equivalent. */
177c0ea7 2183
77d11aec
RS
2184 enum case_type {one_case, two_case, strange};
2185
2186 /* Describe the run that is in progress,
2187 which the next character can try to extend.
2188 If run_type is strange, that means there really is no run.
2189 If run_type is one_case, then run_start...run_end is the run.
2190 If run_type is two_case, then the run is run_start...run_end,
2191 and the case-equivalents end at run_eqv_end. */
2192
2193 enum case_type run_type = strange;
2194 int run_start, run_end, run_eqv_end;
2195
2196 Lisp_Object eqv_table;
2197
2198 if (!RE_TRANSLATE_P (translate))
2199 {
b7c12565 2200 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2201 work_area->table[work_area->used++] = (start);
2202 work_area->table[work_area->used++] = (end);
b7c12565 2203 return -1;
77d11aec
RS
2204 }
2205
2206 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2207
77d11aec
RS
2208 for (; start <= end; start++)
2209 {
2210 enum case_type this_type;
2211 int eqv = RE_TRANSLATE (eqv_table, start);
2212 int minchar, maxchar;
2213
2214 /* Classify this character */
2215 if (eqv == start)
2216 this_type = one_case;
2217 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2218 this_type = two_case;
2219 else
2220 this_type = strange;
2221
2222 if (start < eqv)
2223 minchar = start, maxchar = eqv;
2224 else
2225 minchar = eqv, maxchar = start;
2226
2227 /* Can this character extend the run in progress? */
2228 if (this_type == strange || this_type != run_type
2229 || !(minchar == run_end + 1
2230 && (run_type == two_case
2231 ? maxchar == run_eqv_end + 1 : 1)))
2232 {
2233 /* No, end the run.
2234 Record each of its equivalent ranges. */
2235 if (run_type == one_case)
2236 {
2237 EXTEND_RANGE_TABLE (work_area, 2);
2238 work_area->table[work_area->used++] = run_start;
2239 work_area->table[work_area->used++] = run_end;
2240 }
2241 else if (run_type == two_case)
2242 {
2243 EXTEND_RANGE_TABLE (work_area, 4);
2244 work_area->table[work_area->used++] = run_start;
2245 work_area->table[work_area->used++] = run_end;
2246 work_area->table[work_area->used++]
2247 = RE_TRANSLATE (eqv_table, run_start);
2248 work_area->table[work_area->used++]
2249 = RE_TRANSLATE (eqv_table, run_end);
2250 }
2251 run_type = strange;
2252 }
177c0ea7 2253
77d11aec
RS
2254 if (this_type == strange)
2255 {
2256 /* For a strange character, add each of its equivalents, one
2257 by one. Don't start a range. */
2258 do
2259 {
2260 EXTEND_RANGE_TABLE (work_area, 2);
2261 work_area->table[work_area->used++] = eqv;
2262 work_area->table[work_area->used++] = eqv;
2263 eqv = RE_TRANSLATE (eqv_table, eqv);
2264 }
2265 while (eqv != start);
2266 }
2267
2268 /* Add this char to the run, or start a new run. */
2269 else if (run_type == strange)
2270 {
2271 /* Initialize a new range. */
2272 run_type = this_type;
2273 run_start = start;
2274 run_end = start;
2275 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2276 }
2277 else
2278 {
2279 /* Extend a running range. */
2280 run_end = minchar;
2281 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2282 }
2283 }
2284
2285 /* If a run is still in progress at the end, finish it now
2286 by recording its equivalent ranges. */
2287 if (run_type == one_case)
2288 {
2289 EXTEND_RANGE_TABLE (work_area, 2);
2290 work_area->table[work_area->used++] = run_start;
2291 work_area->table[work_area->used++] = run_end;
2292 }
2293 else if (run_type == two_case)
2294 {
2295 EXTEND_RANGE_TABLE (work_area, 4);
2296 work_area->table[work_area->used++] = run_start;
2297 work_area->table[work_area->used++] = run_end;
2298 work_area->table[work_area->used++]
2299 = RE_TRANSLATE (eqv_table, run_start);
2300 work_area->table[work_area->used++]
2301 = RE_TRANSLATE (eqv_table, run_end);
2302 }
2303
2304 return -1;
2305}
36595814 2306
77d11aec 2307#endif /* emacs */
36595814 2308
2b34df4e 2309/* Record the image of the range start..end when passed through
36595814
SM
2310 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2311 and is not even necessarily contiguous.
b7c12565
RS
2312 Normally we approximate it with the smallest contiguous range that contains
2313 all the chars we need. However, for Latin-1 we go to extra effort
2314 to do a better job.
2315
2316 This function is not called for ASCII ranges.
77d11aec
RS
2317
2318 Returns -1 if successful, REG_ESPACE if ran out of space. */
2319
2320static int
1dae0f0a
AS
2321set_image_of_range (struct range_table_work_area *work_area,
2322 re_wchar_t start, re_wchar_t end,
2323 RE_TRANSLATE_TYPE translate)
36595814 2324{
77d11aec
RS
2325 re_wchar_t cmin, cmax;
2326
2327#ifdef emacs
2328 /* For Latin-1 ranges, use set_image_of_range_1
2329 to get proper handling of ranges that include letters and nonletters.
b7c12565 2330 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2331 For other character sets, we don't bother to get this right. */
b7c12565
RS
2332 if (RE_TRANSLATE_P (translate) && start < 04400
2333 && !(start < 04200 && end >= 04377))
77d11aec 2334 {
b7c12565 2335 int newend;
77d11aec 2336 int tem;
b7c12565
RS
2337 newend = end;
2338 if (newend > 04377)
2339 newend = 04377;
2340 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2341 if (tem > 0)
2342 return tem;
2343
2344 start = 04400;
2345 if (end < 04400)
2346 return -1;
2347 }
2348#endif
2349
b7c12565
RS
2350 EXTEND_RANGE_TABLE (work_area, 2);
2351 work_area->table[work_area->used++] = (start);
2352 work_area->table[work_area->used++] = (end);
2353
2354 cmin = -1, cmax = -1;
77d11aec 2355
36595814 2356 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2357 {
2358 int ch;
77d11aec 2359
b7c12565
RS
2360 for (ch = start; ch <= end; ch++)
2361 {
2362 re_wchar_t c = TRANSLATE (ch);
2363 if (! (start <= c && c <= end))
2364 {
2365 if (cmin == -1)
2366 cmin = c, cmax = c;
2367 else
2368 {
2369 cmin = MIN (cmin, c);
2370 cmax = MAX (cmax, c);
2371 }
2372 }
2373 }
2374
2375 if (cmin != -1)
2376 {
2377 EXTEND_RANGE_TABLE (work_area, 2);
2378 work_area->table[work_area->used++] = (cmin);
2379 work_area->table[work_area->used++] = (cmax);
2380 }
2381 }
36595814 2382
77d11aec
RS
2383 return -1;
2384}
8f924df7 2385#endif /* 0 */
fa9a63c5
RM
2386\f
2387#ifndef MATCH_MAY_ALLOCATE
2388
2389/* If we cannot allocate large objects within re_match_2_internal,
2390 we make the fail stack and register vectors global.
2391 The fail stack, we grow to the maximum size when a regexp
2392 is compiled.
2393 The register vectors, we adjust in size each time we
2394 compile a regexp, according to the number of registers it needs. */
2395
2396static fail_stack_type fail_stack;
2397
2398/* Size with which the following vectors are currently allocated.
2399 That is so we can make them bigger as needed,
4bb91c68 2400 but never make them smaller. */
fa9a63c5
RM
2401static int regs_allocated_size;
2402
66f0296e
SM
2403static re_char ** regstart, ** regend;
2404static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2405
2406/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2407 but don't make them smaller. */
fa9a63c5
RM
2408
2409static
1dae0f0a 2410regex_grow_registers (int num_regs)
fa9a63c5
RM
2411{
2412 if (num_regs > regs_allocated_size)
2413 {
66f0296e
SM
2414 RETALLOC_IF (regstart, num_regs, re_char *);
2415 RETALLOC_IF (regend, num_regs, re_char *);
2416 RETALLOC_IF (best_regstart, num_regs, re_char *);
2417 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2418
2419 regs_allocated_size = num_regs;
2420 }
2421}
2422
2423#endif /* not MATCH_MAY_ALLOCATE */
2424\f
261cb4bb
PE
2425static boolean group_in_compile_stack (compile_stack_type compile_stack,
2426 regnum_t regnum);
99633e97 2427
fa9a63c5
RM
2428/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2429 Returns one of error codes defined in `regex.h', or zero for success.
2430
2431 Assumes the `allocated' (and perhaps `buffer') and `translate'
2432 fields are set in BUFP on entry.
2433
2434 If it succeeds, results are put in BUFP (if it returns an error, the
2435 contents of BUFP are undefined):
2436 `buffer' is the compiled pattern;
2437 `syntax' is set to SYNTAX;
2438 `used' is set to the length of the compiled pattern;
2439 `fastmap_accurate' is zero;
2440 `re_nsub' is the number of subexpressions in PATTERN;
2441 `not_bol' and `not_eol' are zero;
5e69f11e 2442
c0f9ea08 2443 The `fastmap' field is neither examined nor set. */
fa9a63c5 2444
505bde11
SM
/* If a forward jump from the end of the last alternative is pending
   (`fixup_alt_jump' is non-null), store a `jump' there that targets the
   current buffer end `b'.  The space for the jump has already been
   allocated.  */
#define FIXUP_ALT_JUMP()                                                \
  do {                                                                  \
    if (fixup_alt_jump != NULL)                                         \
      STORE_JUMP (jump, fixup_alt_jump, b);                             \
  } while (0)
2452
2453
fa9a63c5
RM
/* Return VALUE from the enclosing function after releasing the storage
   it allocated: the range-table work area `range_table_work' and the
   compile stack `compile_stack'.  */
#define FREE_STACK_RETURN(value)                        \
  do {                                                  \
    FREE_RANGE_TABLE_WORK_AREA (range_table_work);      \
    free (compile_stack.stack);                         \
    return (value);                                     \
  } while (0)
fa9a63c5
RM
2461
2462static reg_errcode_t
971de7fb 2463regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2464{
01618498
SM
2465 /* We fetch characters from PATTERN here. */
2466 register re_wchar_t c, c1;
5e69f11e 2467
fa9a63c5
RM
2468 /* Points to the end of the buffer, where we should append. */
2469 register unsigned char *b;
5e69f11e 2470
fa9a63c5
RM
2471 /* Keeps track of unclosed groups. */
2472 compile_stack_type compile_stack;
2473
2474 /* Points to the current (ending) position in the pattern. */
22336245
RS
2475#ifdef AIX
2476 /* `const' makes AIX compiler fail. */
66f0296e 2477 unsigned char *p = pattern;
22336245 2478#else
66f0296e 2479 re_char *p = pattern;
22336245 2480#endif
66f0296e 2481 re_char *pend = pattern + size;
5e69f11e 2482
fa9a63c5 2483 /* How to translate the characters in the pattern. */
6676cb1c 2484 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2485
2486 /* Address of the count-byte of the most recently inserted `exactn'
2487 command. This makes it possible to tell if a new exact-match
2488 character can be added to that command or if the character requires
2489 a new `exactn' command. */
2490 unsigned char *pending_exact = 0;
2491
2492 /* Address of start of the most recently finished expression.
2493 This tells, e.g., postfix * where to find the start of its
2494 operand. Reset at the beginning of groups and alternatives. */
2495 unsigned char *laststart = 0;
2496
2497 /* Address of beginning of regexp, or inside of last group. */
2498 unsigned char *begalt;
2499
2500 /* Place in the uncompiled pattern (i.e., the {) to
2501 which to go back if the interval is invalid. */
66f0296e 2502 re_char *beg_interval;
5e69f11e 2503
fa9a63c5 2504 /* Address of the place where a forward jump should go to the end of
7814e705 2505 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2506 last -- ends with a forward jump of this sort. */
2507 unsigned char *fixup_alt_jump = 0;
2508
b18215fc
RS
2509 /* Work area for range table of charset. */
2510 struct range_table_work_area range_table_work;
2511
2d1675e4
SM
2512 /* If the object matched can contain multibyte characters. */
2513 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2514
f9b0fd99
RS
2515 /* Nonzero if we have pushed down into a subpattern. */
2516 int in_subpattern = 0;
2517
2518 /* These hold the values of p, pattern, and pend from the main
2519 pattern when we have pushed into a subpattern. */
da053e48
PE
2520 re_char *main_p IF_LINT (= NULL);
2521 re_char *main_pattern IF_LINT (= NULL);
2522 re_char *main_pend IF_LINT (= NULL);
f9b0fd99 2523
fa9a63c5 2524#ifdef DEBUG
99633e97 2525 debug++;
fa9a63c5 2526 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2527 if (debug > 0)
fa9a63c5
RM
2528 {
2529 unsigned debug_count;
5e69f11e 2530
fa9a63c5 2531 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2532 putchar (pattern[debug_count]);
fa9a63c5
RM
2533 putchar ('\n');
2534 }
2535#endif /* DEBUG */
2536
2537 /* Initialize the compile stack. */
2538 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2539 if (compile_stack.stack == NULL)
2540 return REG_ESPACE;
2541
2542 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2543 compile_stack.avail = 0;
2544
b18215fc
RS
2545 range_table_work.table = 0;
2546 range_table_work.allocated = 0;
2547
fa9a63c5
RM
2548 /* Initialize the pattern buffer. */
2549 bufp->syntax = syntax;
2550 bufp->fastmap_accurate = 0;
2551 bufp->not_bol = bufp->not_eol = 0;
6224b623 2552 bufp->used_syntax = 0;
fa9a63c5
RM
2553
2554 /* Set `used' to zero, so that if we return an error, the pattern
2555 printer (for debugging) will think there's no pattern. We reset it
2556 at the end. */
2557 bufp->used = 0;
5e69f11e 2558
fa9a63c5 2559 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2560 bufp->re_nsub = 0;
fa9a63c5 2561
0b32bf0e 2562#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2563 /* Initialize the syntax table. */
2564 init_syntax_once ();
2565#endif
2566
2567 if (bufp->allocated == 0)
2568 {
2569 if (bufp->buffer)
2570 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2571 enough space. This loses if buffer's address is bogus, but
7814e705 2572 that is the user's responsibility. */
25fe55af
RS
2573 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2574 }
fa9a63c5 2575 else
7814e705 2576 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2577 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2578 }
fa9a63c5
RM
2579 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2580
2581 bufp->allocated = INIT_BUF_SIZE;
2582 }
2583
2584 begalt = b = bufp->buffer;
2585
2586 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2587 while (1)
fa9a63c5 2588 {
f9b0fd99
RS
2589 if (p == pend)
2590 {
2591 /* If this is the end of an included regexp,
2592 pop back to the main regexp and try again. */
2593 if (in_subpattern)
2594 {
2595 in_subpattern = 0;
2596 pattern = main_pattern;
2597 p = main_p;
2598 pend = main_pend;
2599 continue;
2600 }
2601 /* If this is the end of the main regexp, we are done. */
2602 break;
2603 }
2604
fa9a63c5
RM
2605 PATFETCH (c);
2606
2607 switch (c)
25fe55af 2608 {
f9b0fd99
RS
2609 case ' ':
2610 {
2611 re_char *p1 = p;
2612
2613 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2614 spaces normally. And don't try to do this recursively. */
2615 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2616 goto normal_char;
2617
2618 /* Peek past following spaces. */
2619 while (p1 != pend)
2620 {
2621 if (*p1 != ' ')
2622 break;
2623 p1++;
2624 }
2625 /* If the spaces are followed by a repetition op,
2626 treat them normally. */
c721eee5
RS
2627 if (p1 != pend
2628 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2629 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2630 goto normal_char;
2631
2632 /* Replace the spaces with the whitespace regexp. */
2633 in_subpattern = 1;
2634 main_p = p1;
2635 main_pend = pend;
2636 main_pattern = pattern;
2637 p = pattern = whitespace_regexp;
5b0534c8 2638 pend = p + strlen ((const char *) p);
f9b0fd99 2639 break;
7814e705 2640 }
f9b0fd99 2641
25fe55af
RS
2642 case '^':
2643 {
7814e705 2644 if ( /* If at start of pattern, it's an operator. */
25fe55af 2645 p == pattern + 1
7814e705 2646 /* If context independent, it's an operator. */
25fe55af 2647 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2648 /* Otherwise, depends on what's come before. */
25fe55af 2649 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2650 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2651 else
2652 goto normal_char;
2653 }
2654 break;
2655
2656
2657 case '$':
2658 {
2659 if ( /* If at end of pattern, it's an operator. */
2660 p == pend
7814e705 2661 /* If context independent, it's an operator. */
25fe55af
RS
2662 || syntax & RE_CONTEXT_INDEP_ANCHORS
2663 /* Otherwise, depends on what's next. */
2664 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2665 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2666 else
2667 goto normal_char;
2668 }
2669 break;
fa9a63c5
RM
2670
2671
2672 case '+':
25fe55af
RS
2673 case '?':
2674 if ((syntax & RE_BK_PLUS_QM)
2675 || (syntax & RE_LIMITED_OPS))
2676 goto normal_char;
2677 handle_plus:
2678 case '*':
2679 /* If there is no previous pattern... */
2680 if (!laststart)
2681 {
2682 if (syntax & RE_CONTEXT_INVALID_OPS)
2683 FREE_STACK_RETURN (REG_BADRPT);
2684 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2685 goto normal_char;
2686 }
2687
2688 {
7814e705 2689 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2690 boolean zero_times_ok = 0, many_times_ok = 0;
2691 boolean greedy = 1;
25fe55af
RS
2692
2693 /* If there is a sequence of repetition chars, collapse it
2694 down to just one (the right one). We can't combine
2695 interval operators with these because of, e.g., `a{2}*',
7814e705 2696 which should only match an even number of `a's. */
25fe55af
RS
2697
2698 for (;;)
2699 {
0b32bf0e 2700 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2701 && c == '?' && (zero_times_ok || many_times_ok))
2702 greedy = 0;
2703 else
2704 {
2705 zero_times_ok |= c != '+';
2706 many_times_ok |= c != '?';
2707 }
25fe55af
RS
2708
2709 if (p == pend)
2710 break;
ed0767d8
SM
2711 else if (*p == '*'
2712 || (!(syntax & RE_BK_PLUS_QM)
2713 && (*p == '+' || *p == '?')))
25fe55af 2714 ;
ed0767d8 2715 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2716 {
ed0767d8
SM
2717 if (p+1 == pend)
2718 FREE_STACK_RETURN (REG_EESCAPE);
2719 if (p[1] == '+' || p[1] == '?')
2720 PATFETCH (c); /* Gobble up the backslash. */
2721 else
2722 break;
25fe55af
RS
2723 }
2724 else
ed0767d8 2725 break;
25fe55af 2726 /* If we get here, we found another repeat character. */
ed0767d8
SM
2727 PATFETCH (c);
2728 }
25fe55af
RS
2729
2730 /* Star, etc. applied to an empty pattern is equivalent
2731 to an empty pattern. */
4e8a9132 2732 if (!laststart || laststart == b)
25fe55af
RS
2733 break;
2734
2735 /* Now we know whether or not zero matches is allowed
7814e705 2736 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2737 if (greedy)
2738 {
99633e97 2739 if (many_times_ok)
4e8a9132
SM
2740 {
2741 boolean simple = skip_one_char (laststart) == b;
d1dfb56c 2742 size_t startoffset = 0;
f6a3f532 2743 re_opcode_t ofj =
01618498 2744 /* Check if the loop can match the empty string. */
6df42991
SM
2745 (simple || !analyse_first (laststart, b, NULL, 0))
2746 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2747 assert (skip_one_char (laststart) <= b);
177c0ea7 2748
4e8a9132
SM
2749 if (!zero_times_ok && simple)
2750 { /* Since simple * loops can be made faster by using
2751 on_failure_keep_string_jump, we turn simple P+
2752 into PP* if P is simple. */
2753 unsigned char *p1, *p2;
2754 startoffset = b - laststart;
2755 GET_BUFFER_SPACE (startoffset);
2756 p1 = b; p2 = laststart;
2757 while (p2 < p1)
2758 *b++ = *p2++;
2759 zero_times_ok = 1;
99633e97 2760 }
4e8a9132
SM
2761
2762 GET_BUFFER_SPACE (6);
2763 if (!zero_times_ok)
2764 /* A + loop. */
f6a3f532 2765 STORE_JUMP (ofj, b, b + 6);
99633e97 2766 else
4e8a9132
SM
2767 /* Simple * loops can use on_failure_keep_string_jump
2768 depending on what follows. But since we don't know
2769 that yet, we leave the decision up to
2770 on_failure_jump_smart. */
f6a3f532 2771 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2772 laststart + startoffset, b + 6);
99633e97 2773 b += 3;
4e8a9132 2774 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2775 b += 3;
2776 }
2777 else
2778 {
4e8a9132
SM
2779 /* A simple ? pattern. */
2780 assert (zero_times_ok);
2781 GET_BUFFER_SPACE (3);
2782 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2783 b += 3;
2784 }
1c8c6d39
DL
2785 }
2786 else /* not greedy */
2787 { /* I wish the greedy and non-greedy cases could be merged. */
2788
0683b6fa 2789 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2790 if (many_times_ok)
2791 {
f6a3f532
SM
2792 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2793
6df42991
SM
2794 /* The non-greedy multiple match looks like
2795 a repeat..until: we only need a conditional jump
2796 at the end of the loop. */
f6a3f532
SM
2797 if (emptyp) BUF_PUSH (no_op);
2798 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2799 : on_failure_jump, b, laststart);
1c8c6d39
DL
2800 b += 3;
2801 if (zero_times_ok)
2802 {
2803 /* The repeat...until naturally matches one or more.
2804 To also match zero times, we need to first jump to
6df42991 2805 the end of the loop (its conditional jump). */
1c8c6d39
DL
2806 INSERT_JUMP (jump, laststart, b);
2807 b += 3;
2808 }
2809 }
2810 else
2811 {
2812 /* non-greedy a?? */
1c8c6d39
DL
2813 INSERT_JUMP (jump, laststart, b + 3);
2814 b += 3;
2815 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2816 b += 3;
2817 }
2818 }
2819 }
4e8a9132 2820 pending_exact = 0;
fa9a63c5
RM
2821 break;
2822
2823
2824 case '.':
25fe55af
RS
2825 laststart = b;
2826 BUF_PUSH (anychar);
2827 break;
fa9a63c5
RM
2828
2829
25fe55af
RS
2830 case '[':
2831 {
19ed5445
PE
2832 re_char *p1;
2833
b18215fc 2834 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2835
25fe55af 2836 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2837
25fe55af
RS
2838 /* Ensure that we have enough space to push a charset: the
2839 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2840 GET_BUFFER_SPACE (34);
2841
25fe55af 2842 laststart = b;
e318085a 2843
25fe55af 2844 /* We test `*p == '^' twice, instead of using an if
7814e705 2845 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2846 BUF_PUSH (*p == '^' ? charset_not : charset);
2847 if (*p == '^')
2848 p++;
e318085a 2849
25fe55af
RS
2850 /* Remember the first position in the bracket expression. */
2851 p1 = p;
e318085a 2852
7814e705 2853 /* Push the number of bytes in the bitmap. */
25fe55af 2854 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2855
25fe55af 2856 /* Clear the whole map. */
72af86bd 2857 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2858
25fe55af
RS
2859 /* charset_not matches newline according to a syntax bit. */
2860 if ((re_opcode_t) b[-2] == charset_not
2861 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2862 SET_LIST_BIT ('\n');
fa9a63c5 2863
7814e705 2864 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2865 for (;;)
2866 {
b18215fc 2867 boolean escaped_char = false;
2d1675e4 2868 const unsigned char *p2 = p;
abbd1bcf 2869 re_wchar_t ch;
e318085a 2870
25fe55af 2871 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2872
36595814
SM
2873 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2874 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2875 So the translation is done later in a loop. Example:
2876 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2877 PATFETCH (c);
e318085a 2878
25fe55af
RS
2879 /* \ might escape characters inside [...] and [^...]. */
2880 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2881 {
2882 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2883
2884 PATFETCH (c);
b18215fc 2885 escaped_char = true;
25fe55af 2886 }
b18215fc
RS
2887 else
2888 {
7814e705 2889 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2890 not (i.e., when the bracket expression is `[]' so
2891 far), the ']' character bit gets set way below. */
2d1675e4 2892 if (c == ']' && p2 != p1)
657fcfbd 2893 break;
25fe55af 2894 }
b18215fc 2895
25fe55af
RS
2896 /* See if we're at the beginning of a possible character
2897 class. */
b18215fc 2898
2d1675e4
SM
2899 if (!escaped_char &&
2900 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2901 {
7814e705 2902 /* Leave room for the null. */
14473664 2903 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2904 const unsigned char *class_beg;
b18215fc 2905
25fe55af
RS
2906 PATFETCH (c);
2907 c1 = 0;
ed0767d8 2908 class_beg = p;
b18215fc 2909
25fe55af
RS
2910 /* If pattern is `[[:'. */
2911 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2912
25fe55af
RS
2913 for (;;)
2914 {
14473664
SM
2915 PATFETCH (c);
2916 if ((c == ':' && *p == ']') || p == pend)
2917 break;
2918 if (c1 < CHAR_CLASS_MAX_LENGTH)
2919 str[c1++] = c;
2920 else
2921 /* This is in any case an invalid class name. */
2922 str[0] = '\0';
25fe55af
RS
2923 }
2924 str[c1] = '\0';
b18215fc
RS
2925
2926 /* If it isn't a word bracketed by `[:' and `:]':
2927 undo the ending character, the letters, and
2928 leave the leading `:' and `[' (but set bits for
2929 them). */
25fe55af
RS
2930 if (c == ':' && *p == ']')
2931 {
abbd1bcf 2932 re_wctype_t cc = re_wctype (str);
14473664
SM
2933
2934 if (cc == 0)
fa9a63c5
RM
2935 FREE_STACK_RETURN (REG_ECTYPE);
2936
14473664
SM
2937 /* Throw away the ] at the end of the character
2938 class. */
2939 PATFETCH (c);
fa9a63c5 2940
14473664 2941 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2942
cf9c99bc
KH
2943#ifndef emacs
2944 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
2945 if (re_iswctype (btowc (ch), cc))
2946 {
2947 c = TRANSLATE (ch);
ed00c2ac
KH
2948 if (c < (1 << BYTEWIDTH))
2949 SET_LIST_BIT (c);
8f924df7 2950 }
cf9c99bc
KH
2951#else /* emacs */
2952 /* Most character classes in a multibyte match
2953 just set a flag. Exceptions are is_blank,
2954 is_digit, is_cntrl, and is_xdigit, since
2955 they can only match ASCII characters. We
2956 don't need to handle them for multibyte.
2957 They are distinguished by a negative wctype. */
96cc36cc 2958
254c06a8
SM
2959 /* Setup the gl_state object to its buffer-defined
2960 value. This hardcodes the buffer-global
2961 syntax-table for ASCII chars, while the other chars
2962 will obey syntax-table properties. It's not ideal,
2963 but it's the way it's been done until now. */
d48cd3f4 2964 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 2965
cf9c99bc 2966 for (ch = 0; ch < 256; ++ch)
25fe55af 2967 {
cf9c99bc
KH
2968 c = RE_CHAR_TO_MULTIBYTE (ch);
2969 if (! CHAR_BYTE8_P (c)
2970 && re_iswctype (c, cc))
8f924df7 2971 {
cf9c99bc
KH
2972 SET_LIST_BIT (ch);
2973 c1 = TRANSLATE (c);
2974 if (c1 == c)
2975 continue;
2976 if (ASCII_CHAR_P (c1))
2977 SET_LIST_BIT (c1);
2978 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2979 SET_LIST_BIT (c1);
8f924df7 2980 }
25fe55af 2981 }
cf9c99bc
KH
2982 SET_RANGE_TABLE_WORK_AREA_BIT
2983 (range_table_work, re_wctype_to_bit (cc));
2984#endif /* emacs */
6224b623
SM
2985 /* In most cases the matching rule for char classes
2986 only uses the syntax table for multibyte chars,
2987 so that the content of the syntax-table is not
2988 hardcoded in the range_table. SPACE and WORD are
2989 the two exceptions. */
2990 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2991 bufp->used_syntax = 1;
2992
b18215fc
RS
2993 /* Repeat the loop. */
2994 continue;
25fe55af
RS
2995 }
2996 else
2997 {
ed0767d8
SM
2998 /* Go back to right after the "[:". */
2999 p = class_beg;
25fe55af 3000 SET_LIST_BIT ('[');
b18215fc
RS
3001
3002 /* Because the `:' may start the range, we
3003 can't simply set bit and repeat the loop.
7814e705 3004 Instead, just set it to C and handle below. */
b18215fc 3005 c = ':';
25fe55af
RS
3006 }
3007 }
b18215fc
RS
3008
3009 if (p < pend && p[0] == '-' && p[1] != ']')
3010 {
3011
3012 /* Discard the `-'. */
3013 PATFETCH (c1);
3014
3015 /* Fetch the character which ends the range. */
3016 PATFETCH (c1);
cf9c99bc
KH
3017#ifdef emacs
3018 if (CHAR_BYTE8_P (c1)
3019 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3020 /* Treat the range from a multibyte character to
3021 raw-byte character as empty. */
3022 c = c1 + 1;
3023#endif /* emacs */
e318085a 3024 }
25fe55af 3025 else
b18215fc
RS
3026 /* Range from C to C. */
3027 c1 = c;
3028
cf9c99bc 3029 if (c > c1)
25fe55af 3030 {
cf9c99bc
KH
3031 if (syntax & RE_NO_EMPTY_RANGES)
3032 FREE_STACK_RETURN (REG_ERANGEX);
3033 /* Else, repeat the loop. */
bf216479 3034 }
6fdd04b0 3035 else
25fe55af 3036 {
cf9c99bc
KH
3037#ifndef emacs
3038 /* Set the range into bitmap */
8f924df7 3039 for (; c <= c1; c++)
b18215fc 3040 {
cf9c99bc
KH
3041 ch = TRANSLATE (c);
3042 if (ch < (1 << BYTEWIDTH))
3043 SET_LIST_BIT (ch);
3044 }
3045#else /* emacs */
3046 if (c < 128)
3047 {
3048 ch = MIN (127, c1);
3049 SETUP_ASCII_RANGE (range_table_work, c, ch);
3050 c = ch + 1;
3051 if (CHAR_BYTE8_P (c1))
3052 c = BYTE8_TO_CHAR (128);
3053 }
3054 if (c <= c1)
3055 {
3056 if (CHAR_BYTE8_P (c))
3057 {
3058 c = CHAR_TO_BYTE8 (c);
3059 c1 = CHAR_TO_BYTE8 (c1);
3060 for (; c <= c1; c++)
3061 SET_LIST_BIT (c);
3062 }
3063 else if (multibyte)
3064 {
3065 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3066 }
3067 else
3068 {
3069 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3070 }
e934739e 3071 }
cf9c99bc 3072#endif /* emacs */
25fe55af 3073 }
e318085a
RS
3074 }
3075
25fe55af 3076 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3077 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3078 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3079 b[-1]--;
3080 b += b[-1];
fa9a63c5 3081
96cc36cc
RS
3082 /* Build real range table from work area. */
3083 if (RANGE_TABLE_WORK_USED (range_table_work)
3084 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3085 {
3086 int i;
3087 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3088
b18215fc 3089 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3090 bytes for flags, two for COUNT, and three bytes for
3091 each character. */
3092 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3093
b18215fc
RS
3094 /* Indicate the existence of range table. */
3095 laststart[1] |= 0x80;
fa9a63c5 3096
96cc36cc
RS
3097 /* Store the character class flag bits into the range table.
3098 If not in emacs, these flag bits are always 0. */
3099 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3100 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3101
b18215fc
RS
3102 STORE_NUMBER_AND_INCR (b, used / 2);
3103 for (i = 0; i < used; i++)
3104 STORE_CHARACTER_AND_INCR
3105 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3106 }
25fe55af
RS
3107 }
3108 break;
fa9a63c5
RM
3109
3110
b18215fc 3111 case '(':
25fe55af
RS
3112 if (syntax & RE_NO_BK_PARENS)
3113 goto handle_open;
3114 else
3115 goto normal_char;
fa9a63c5
RM
3116
3117
25fe55af
RS
3118 case ')':
3119 if (syntax & RE_NO_BK_PARENS)
3120 goto handle_close;
3121 else
3122 goto normal_char;
e318085a
RS
3123
3124
25fe55af
RS
3125 case '\n':
3126 if (syntax & RE_NEWLINE_ALT)
3127 goto handle_alt;
3128 else
3129 goto normal_char;
e318085a
RS
3130
3131
b18215fc 3132 case '|':
25fe55af
RS
3133 if (syntax & RE_NO_BK_VBAR)
3134 goto handle_alt;
3135 else
3136 goto normal_char;
3137
3138
3139 case '{':
3140 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3141 goto handle_interval;
3142 else
3143 goto normal_char;
3144
3145
3146 case '\\':
3147 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3148
3149 /* Do not translate the character after the \, so that we can
3150 distinguish, e.g., \B from \b, even if we normally would
3151 translate, e.g., B to b. */
36595814 3152 PATFETCH (c);
25fe55af
RS
3153
3154 switch (c)
3155 {
3156 case '(':
3157 if (syntax & RE_NO_BK_PARENS)
3158 goto normal_backslash;
3159
3160 handle_open:
505bde11
SM
3161 {
3162 int shy = 0;
c69b0314 3163 regnum_t regnum = 0;
505bde11
SM
3164 if (p+1 < pend)
3165 {
3166 /* Look for a special (?...) construct */
ed0767d8 3167 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3168 {
ed0767d8 3169 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3170 while (!shy)
505bde11 3171 {
c69b0314
SM
3172 PATFETCH (c);
3173 switch (c)
3174 {
3175 case ':': shy = 1; break;
3176 case '0':
3177 /* An explicitly specified regnum must start
3178 with non-0. */
3179 if (regnum == 0)
3180 FREE_STACK_RETURN (REG_BADPAT);
3181 case '1': case '2': case '3': case '4':
3182 case '5': case '6': case '7': case '8': case '9':
3183 regnum = 10*regnum + (c - '0'); break;
3184 default:
3185 /* Only (?:...) and (?N:...) are supported right now. */
3186 FREE_STACK_RETURN (REG_BADPAT);
3187 }
505bde11
SM
3188 }
3189 }
505bde11
SM
3190 }
3191
3192 if (!shy)
c69b0314
SM
3193 regnum = ++bufp->re_nsub;
3194 else if (regnum)
3195 { /* It's actually not shy, but explicitly numbered. */
3196 shy = 0;
3197 if (regnum > bufp->re_nsub)
3198 bufp->re_nsub = regnum;
3199 else if (regnum > bufp->re_nsub
3200 /* Ideally, we'd want to check that the specified
3201 group can't have matched (i.e. all subgroups
3202 using the same regnum are in other branches of
3203 OR patterns), but we don't currently keep track
3204 of enough info to do that easily. */
3205 || group_in_compile_stack (compile_stack, regnum))
3206 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3207 }
c69b0314
SM
3208 else
3209 /* It's really shy. */
3210 regnum = - bufp->re_nsub;
25fe55af 3211
99633e97
SM
3212 if (COMPILE_STACK_FULL)
3213 {
3214 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3215 compile_stack_elt_t);
3216 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3217
99633e97
SM
3218 compile_stack.size <<= 1;
3219 }
25fe55af 3220
99633e97 3221 /* These are the values to restore when we hit end of this
7814e705 3222 group. They are all relative offsets, so that if the
99633e97
SM
3223 whole pattern moves because of realloc, they will still
3224 be valid. */
3225 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3226 COMPILE_STACK_TOP.fixup_alt_jump
3227 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3228 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3229 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3230
c69b0314
SM
3231 /* Do not push a start_memory for groups beyond the last one
3232 we can represent in the compiled pattern. */
3233 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3234 BUF_PUSH_2 (start_memory, regnum);
3235
3236 compile_stack.avail++;
3237
3238 fixup_alt_jump = 0;
3239 laststart = 0;
3240 begalt = b;
3241 /* If we've reached MAX_REGNUM groups, then this open
3242 won't actually generate any code, so we'll have to
3243 clear pending_exact explicitly. */
3244 pending_exact = 0;
3245 break;
505bde11 3246 }
25fe55af
RS
3247
3248 case ')':
3249 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3250
3251 if (COMPILE_STACK_EMPTY)
505bde11
SM
3252 {
3253 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3254 goto normal_backslash;
3255 else
3256 FREE_STACK_RETURN (REG_ERPAREN);
3257 }
25fe55af
RS
3258
3259 handle_close:
505bde11 3260 FIXUP_ALT_JUMP ();
25fe55af
RS
3261
3262 /* See similar code for backslashed left paren above. */
3263 if (COMPILE_STACK_EMPTY)
505bde11
SM
3264 {
3265 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3266 goto normal_char;
3267 else
3268 FREE_STACK_RETURN (REG_ERPAREN);
3269 }
25fe55af
RS
3270
3271 /* Since we just checked for an empty stack above, this
3272 ``can't happen''. */
3273 assert (compile_stack.avail != 0);
3274 {
3275 /* We don't just want to restore into `regnum', because
3276 later groups should continue to be numbered higher,
7814e705 3277 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3278 regnum_t regnum;
25fe55af
RS
3279
3280 compile_stack.avail--;
3281 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3282 fixup_alt_jump
3283 = COMPILE_STACK_TOP.fixup_alt_jump
3284 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3285 : 0;
3286 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3287 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3288 /* If we've reached MAX_REGNUM groups, then this close
3289 won't actually generate any code, so we'll have to
3290 clear pending_exact explicitly. */
3291 pending_exact = 0;
e318085a 3292
25fe55af 3293 /* We're at the end of the group, so now we know how many
7814e705 3294 groups were inside this one. */
c69b0314
SM
3295 if (regnum <= MAX_REGNUM && regnum > 0)
3296 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3297 }
3298 break;
3299
3300
3301 case '|': /* `\|'. */
3302 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3303 goto normal_backslash;
3304 handle_alt:
3305 if (syntax & RE_LIMITED_OPS)
3306 goto normal_char;
3307
3308 /* Insert before the previous alternative a jump which
7814e705 3309 jumps to this alternative if the former fails. */
25fe55af
RS
3310 GET_BUFFER_SPACE (3);
3311 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3312 pending_exact = 0;
3313 b += 3;
3314
3315 /* The alternative before this one has a jump after it
3316 which gets executed if it gets matched. Adjust that
3317 jump so it will jump to this alternative's analogous
3318 jump (put in below, which in turn will jump to the next
3319 (if any) alternative's such jump, etc.). The last such
3320 jump jumps to the correct final destination. A picture:
3321 _____ _____
3322 | | | |
3323 | v | v
d1dfb56c 3324 a | b | c
25fe55af
RS
3325
3326 If we are at `b', then fixup_alt_jump right now points to a
3327 three-byte space after `a'. We'll put in the jump, set
3328 fixup_alt_jump to right after `b', and leave behind three
3329 bytes which we'll fill in when we get to after `c'. */
3330
505bde11 3331 FIXUP_ALT_JUMP ();
25fe55af
RS
3332
3333 /* Mark and leave space for a jump after this alternative,
3334 to be filled in later either by next alternative or
3335 when we know we're at the end of a series of alternatives. */
3336 fixup_alt_jump = b;
3337 GET_BUFFER_SPACE (3);
3338 b += 3;
3339
3340 laststart = 0;
3341 begalt = b;
3342 break;
3343
3344
3345 case '{':
3346 /* If \{ is a literal. */
3347 if (!(syntax & RE_INTERVALS)
3348 /* If we're at `\{' and it's not the open-interval
3349 operator. */
4bb91c68 3350 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3351 goto normal_backslash;
3352
3353 handle_interval:
3354 {
3355 /* If got here, then the syntax allows intervals. */
3356
3357 /* At least (most) this many matches must be made. */
99633e97 3358 int lower_bound = 0, upper_bound = -1;
25fe55af 3359
ed0767d8 3360 beg_interval = p;
25fe55af 3361
25fe55af
RS
3362 GET_UNSIGNED_NUMBER (lower_bound);
3363
3364 if (c == ',')
ed0767d8 3365 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3366 else
3367 /* Interval such as `{1}' => match exactly once. */
3368 upper_bound = lower_bound;
3369
3370 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3371 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3372 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3373
3374 if (!(syntax & RE_NO_BK_BRACES))
3375 {
4bb91c68
SM
3376 if (c != '\\')
3377 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3378 if (p == pend)
3379 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3380 PATFETCH (c);
3381 }
3382
3383 if (c != '}')
4bb91c68 3384 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3385
3386 /* We just parsed a valid interval. */
3387
3388 /* If it's invalid to have no preceding re. */
3389 if (!laststart)
3390 {
3391 if (syntax & RE_CONTEXT_INVALID_OPS)
3392 FREE_STACK_RETURN (REG_BADRPT);
3393 else if (syntax & RE_CONTEXT_INDEP_OPS)
3394 laststart = b;
3395 else
3396 goto unfetch_interval;
3397 }
3398
6df42991
SM
3399 if (upper_bound == 0)
3400 /* If the upper bound is zero, just drop the sub pattern
3401 altogether. */
3402 b = laststart;
3403 else if (lower_bound == 1 && upper_bound == 1)
3404 /* Just match it once: nothing to do here. */
3405 ;
3406
3407 /* Otherwise, we have a nontrivial interval. When
3408 we're all done, the pattern will look like:
3409 set_number_at <jump count> <upper bound>
3410 set_number_at <succeed_n count> <lower bound>
3411 succeed_n <after jump addr> <succeed_n count>
3412 <body of loop>
3413 jump_n <succeed_n addr> <jump count>
3414 (The upper bound and `jump_n' are omitted if
3415 `upper_bound' is 1, though.) */
3416 else
3417 { /* If the upper bound is > 1, we need to insert
3418 more at the end of the loop. */
3419 unsigned int nbytes = (upper_bound < 0 ? 3
3420 : upper_bound > 1 ? 5 : 0);
3421 unsigned int startoffset = 0;
3422
3423 GET_BUFFER_SPACE (20); /* We might use less. */
3424
3425 if (lower_bound == 0)
3426 {
3427 /* A succeed_n that starts with 0 is really a
3428 a simple on_failure_jump_loop. */
3429 INSERT_JUMP (on_failure_jump_loop, laststart,
3430 b + 3 + nbytes);
3431 b += 3;
3432 }
3433 else
3434 {
3435 /* Initialize lower bound of the `succeed_n', even
3436 though it will be set during matching by its
3437 attendant `set_number_at' (inserted next),
3438 because `re_compile_fastmap' needs to know.
3439 Jump to the `jump_n' we might insert below. */
3440 INSERT_JUMP2 (succeed_n, laststart,
3441 b + 5 + nbytes,
3442 lower_bound);
3443 b += 5;
3444
3445 /* Code to initialize the lower bound. Insert
7814e705 3446 before the `succeed_n'. The `5' is the last two
6df42991
SM
3447 bytes of this `set_number_at', plus 3 bytes of
3448 the following `succeed_n'. */
3449 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3450 b += 5;
3451 startoffset += 5;
3452 }
3453
3454 if (upper_bound < 0)
3455 {
3456 /* A negative upper bound stands for infinity,
3457 in which case it degenerates to a plain jump. */
3458 STORE_JUMP (jump, b, laststart + startoffset);
3459 b += 3;
3460 }
3461 else if (upper_bound > 1)
3462 { /* More than one repetition is allowed, so
3463 append a backward jump to the `succeed_n'
3464 that starts this interval.
3465
3466 When we've reached this during matching,
3467 we'll have matched the interval once, so
3468 jump back only `upper_bound - 1' times. */
3469 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3470 upper_bound - 1);
3471 b += 5;
3472
3473 /* The location we want to set is the second
3474 parameter of the `jump_n'; that is `b-2' as
3475 an absolute address. `laststart' will be
3476 the `set_number_at' we're about to insert;
3477 `laststart+3' the number to set, the source
3478 for the relative address. But we are
3479 inserting into the middle of the pattern --
3480 so everything is getting moved up by 5.
3481 Conclusion: (b - 2) - (laststart + 3) + 5,
3482 i.e., b - laststart.
3483
3484 We insert this at the beginning of the loop
3485 so that if we fail during matching, we'll
3486 reinitialize the bounds. */
3487 insert_op2 (set_number_at, laststart, b - laststart,
3488 upper_bound - 1, b);
3489 b += 5;
3490 }
3491 }
25fe55af
RS
3492 pending_exact = 0;
3493 beg_interval = NULL;
3494 }
3495 break;
3496
3497 unfetch_interval:
3498 /* If an invalid interval, match the characters as literals. */
3499 assert (beg_interval);
3500 p = beg_interval;
3501 beg_interval = NULL;
3502
3503 /* normal_char and normal_backslash need `c'. */
ed0767d8 3504 c = '{';
25fe55af
RS
3505
3506 if (!(syntax & RE_NO_BK_BRACES))
3507 {
ed0767d8
SM
3508 assert (p > pattern && p[-1] == '\\');
3509 goto normal_backslash;
25fe55af 3510 }
ed0767d8
SM
3511 else
3512 goto normal_char;
e318085a 3513
b18215fc 3514#ifdef emacs
25fe55af 3515 /* There is no way to specify the before_dot and after_dot
7814e705 3516 operators. rms says this is ok. --karl */
25fe55af
RS
3517 case '=':
3518 BUF_PUSH (at_dot);
3519 break;
3520
3521 case 's':
3522 laststart = b;
3523 PATFETCH (c);
3524 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3525 break;
3526
3527 case 'S':
3528 laststart = b;
3529 PATFETCH (c);
3530 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3531 break;
b18215fc
RS
3532
3533 case 'c':
3534 laststart = b;
36595814 3535 PATFETCH (c);
b18215fc
RS
3536 BUF_PUSH_2 (categoryspec, c);
3537 break;
e318085a 3538
b18215fc
RS
3539 case 'C':
3540 laststart = b;
36595814 3541 PATFETCH (c);
b18215fc
RS
3542 BUF_PUSH_2 (notcategoryspec, c);
3543 break;
3544#endif /* emacs */
e318085a 3545
e318085a 3546
25fe55af 3547 case 'w':
4bb91c68
SM
3548 if (syntax & RE_NO_GNU_OPS)
3549 goto normal_char;
25fe55af 3550 laststart = b;
1fb352e0 3551 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3552 break;
e318085a 3553
e318085a 3554
25fe55af 3555 case 'W':
4bb91c68
SM
3556 if (syntax & RE_NO_GNU_OPS)
3557 goto normal_char;
25fe55af 3558 laststart = b;
1fb352e0 3559 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3560 break;
e318085a
RS
3561
3562
25fe55af 3563 case '<':
4bb91c68
SM
3564 if (syntax & RE_NO_GNU_OPS)
3565 goto normal_char;
25fe55af
RS
3566 BUF_PUSH (wordbeg);
3567 break;
e318085a 3568
25fe55af 3569 case '>':
4bb91c68
SM
3570 if (syntax & RE_NO_GNU_OPS)
3571 goto normal_char;
25fe55af
RS
3572 BUF_PUSH (wordend);
3573 break;
e318085a 3574
669fa600
SM
3575 case '_':
3576 if (syntax & RE_NO_GNU_OPS)
3577 goto normal_char;
3578 laststart = b;
3579 PATFETCH (c);
3580 if (c == '<')
3581 BUF_PUSH (symbeg);
3582 else if (c == '>')
3583 BUF_PUSH (symend);
3584 else
3585 FREE_STACK_RETURN (REG_BADPAT);
3586 break;
3587
25fe55af 3588 case 'b':
4bb91c68
SM
3589 if (syntax & RE_NO_GNU_OPS)
3590 goto normal_char;
25fe55af
RS
3591 BUF_PUSH (wordbound);
3592 break;
e318085a 3593
25fe55af 3594 case 'B':
4bb91c68
SM
3595 if (syntax & RE_NO_GNU_OPS)
3596 goto normal_char;
25fe55af
RS
3597 BUF_PUSH (notwordbound);
3598 break;
fa9a63c5 3599
25fe55af 3600 case '`':
4bb91c68
SM
3601 if (syntax & RE_NO_GNU_OPS)
3602 goto normal_char;
25fe55af
RS
3603 BUF_PUSH (begbuf);
3604 break;
e318085a 3605
25fe55af 3606 case '\'':
4bb91c68
SM
3607 if (syntax & RE_NO_GNU_OPS)
3608 goto normal_char;
25fe55af
RS
3609 BUF_PUSH (endbuf);
3610 break;
e318085a 3611
25fe55af
RS
3612 case '1': case '2': case '3': case '4': case '5':
3613 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3614 {
3615 regnum_t reg;
e318085a 3616
0cdd06f8
SM
3617 if (syntax & RE_NO_BK_REFS)
3618 goto normal_backslash;
e318085a 3619
0cdd06f8 3620 reg = c - '0';
e318085a 3621
c69b0314
SM
3622 if (reg > bufp->re_nsub || reg < 1
3623 /* Can't back reference to a subexp before its end. */
3624 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3625 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3626
0cdd06f8
SM
3627 laststart = b;
3628 BUF_PUSH_2 (duplicate, reg);
3629 }
25fe55af 3630 break;
e318085a 3631
e318085a 3632
25fe55af
RS
3633 case '+':
3634 case '?':
3635 if (syntax & RE_BK_PLUS_QM)
3636 goto handle_plus;
3637 else
3638 goto normal_backslash;
3639
3640 default:
3641 normal_backslash:
3642 /* You might think it would be useful for \ to mean
3643 not to translate; but if we don't translate it
4bb91c68 3644 it will never match anything. */
25fe55af
RS
3645 goto normal_char;
3646 }
3647 break;
fa9a63c5
RM
3648
3649
3650 default:
25fe55af 3651 /* Expects the character in `c'. */
fa9a63c5 3652 normal_char:
36595814 3653 /* If no exactn currently being built. */
25fe55af 3654 if (!pending_exact
fa9a63c5 3655
25fe55af
RS
3656 /* If last exactn not at current position. */
3657 || pending_exact + *pending_exact + 1 != b
5e69f11e 3658
25fe55af 3659 /* We have only one byte following the exactn for the count. */
2d1675e4 3660 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3661
7814e705 3662 /* If followed by a repetition operator. */
9d99031f 3663 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3664 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3665 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3666 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3667 || ((syntax & RE_INTERVALS)
25fe55af 3668 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3669 ? p != pend && *p == '{'
3670 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3671 {
3672 /* Start building a new exactn. */
5e69f11e 3673
25fe55af 3674 laststart = b;
fa9a63c5
RM
3675
3676 BUF_PUSH_2 (exactn, 0);
3677 pending_exact = b - 1;
25fe55af 3678 }
5e69f11e 3679
2d1675e4
SM
3680 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3681 {
e0277a47
KH
3682 int len;
3683
cf9c99bc 3684 if (multibyte)
6fdd04b0 3685 {
cf9c99bc 3686 c = TRANSLATE (c);
6fdd04b0
KH
3687 len = CHAR_STRING (c, b);
3688 b += len;
3689 }
e0277a47 3690 else
6fdd04b0 3691 {
cf9c99bc
KH
3692 c1 = RE_CHAR_TO_MULTIBYTE (c);
3693 if (! CHAR_BYTE8_P (c1))
3694 {
3695 re_wchar_t c2 = TRANSLATE (c1);
3696
3697 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3698 c = c1;
409f2919 3699 }
6fdd04b0
KH
3700 *b++ = c;
3701 len = 1;
3702 }
2d1675e4
SM
3703 (*pending_exact) += len;
3704 }
3705
fa9a63c5 3706 break;
25fe55af 3707 } /* switch (c) */
fa9a63c5
RM
3708 } /* while p != pend */
3709
5e69f11e 3710
fa9a63c5 3711 /* Through the pattern now. */
5e69f11e 3712
505bde11 3713 FIXUP_ALT_JUMP ();
fa9a63c5 3714
5e69f11e 3715 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3716 FREE_STACK_RETURN (REG_EPAREN);
3717
3718 /* If we don't want backtracking, force success
3719 the first time we reach the end of the compiled pattern. */
3720 if (syntax & RE_NO_POSIX_BACKTRACKING)
3721 BUF_PUSH (succeed);
3722
fa9a63c5
RM
3723 /* We have succeeded; set the length of the buffer. */
3724 bufp->used = b - bufp->buffer;
3725
3726#ifdef DEBUG
99633e97 3727 if (debug > 0)
fa9a63c5 3728 {
505bde11 3729 re_compile_fastmap (bufp);
fa9a63c5
RM
3730 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3731 print_compiled_pattern (bufp);
3732 }
99633e97 3733 debug--;
fa9a63c5
RM
3734#endif /* DEBUG */
3735
3736#ifndef MATCH_MAY_ALLOCATE
3737 /* Initialize the failure stack to the largest possible stack. This
3738 isn't necessary unless we're trying to avoid calling alloca in
3739 the search and match routines. */
3740 {
3741 int num_regs = bufp->re_nsub + 1;
3742
320a2a73 3743 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3744 {
a26f4ccd 3745 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3746
fa9a63c5
RM
3747 if (! fail_stack.stack)
3748 fail_stack.stack
5e69f11e 3749 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3750 * sizeof (fail_stack_elt_t));
3751 else
3752 fail_stack.stack
3753 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3754 (fail_stack.size
3755 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3756 }
3757
3758 regex_grow_registers (num_regs);
3759 }
3760#endif /* not MATCH_MAY_ALLOCATE */
3761
839966f3 3762 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3763} /* regex_compile */
3764\f
3765/* Subroutines for `regex_compile'. */
3766
7814e705 3767/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3768
3769static void
971de7fb 3770store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3771{
3772 *loc = (unsigned char) op;
3773 STORE_NUMBER (loc + 1, arg);
3774}
3775
3776
3777/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3778
3779static void
971de7fb 3780store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3781{
3782 *loc = (unsigned char) op;
3783 STORE_NUMBER (loc + 1, arg1);
3784 STORE_NUMBER (loc + 3, arg2);
3785}
3786
3787
3788/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3789 for OP followed by two-byte integer parameter ARG. */
3790
3791static void
971de7fb 3792insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3793{
3794 register unsigned char *pfrom = end;
3795 register unsigned char *pto = end + 3;
3796
3797 while (pfrom != loc)
3798 *--pto = *--pfrom;
5e69f11e 3799
fa9a63c5
RM
3800 store_op1 (op, loc, arg);
3801}
3802
3803
3804/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3805
3806static void
971de7fb 3807insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3808{
3809 register unsigned char *pfrom = end;
3810 register unsigned char *pto = end + 5;
3811
3812 while (pfrom != loc)
3813 *--pto = *--pfrom;
5e69f11e 3814
fa9a63c5
RM
3815 store_op2 (op, loc, arg1, arg2);
3816}
3817
3818
3819/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3820 after an alternative or a begin-subexpression. We assume there is at
3821 least one character before the ^. */
3822
3823static boolean
971de7fb 3824at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3825{
01618498 3826 re_char *prev = p - 2;
fa9a63c5 3827 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3828
fa9a63c5
RM
3829 return
3830 /* After a subexpression? */
3831 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3832 /* After an alternative? */
d2af47df
SM
3833 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3834 /* After a shy subexpression? */
3835 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3836 && prev[-1] == '?' && prev[-2] == '('
3837 && (syntax & RE_NO_BK_PARENS
3838 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3839}
3840
3841
3842/* The dual of at_begline_loc_p. This one is for $. We assume there is
3843 at least one character after the $, i.e., `P < PEND'. */
3844
3845static boolean
971de7fb 3846at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3847{
01618498 3848 re_char *next = p;
fa9a63c5 3849 boolean next_backslash = *next == '\\';
01618498 3850 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3851
fa9a63c5
RM
3852 return
3853 /* Before a subexpression? */
3854 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3855 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3856 /* Before an alternative? */
3857 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3858 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3859}
3860
3861
5e69f11e 3862/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3863 false if it's not. */
3864
3865static boolean
971de7fb 3866group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5 3867{
d1dfb56c 3868 ssize_t this_element;
fa9a63c5 3869
5e69f11e
RM
3870 for (this_element = compile_stack.avail - 1;
3871 this_element >= 0;
fa9a63c5
RM
3872 this_element--)
3873 if (compile_stack.stack[this_element].regnum == regnum)
3874 return true;
3875
3876 return false;
3877}
fa9a63c5 3878\f
f6a3f532
SM
3879/* analyse_first.
3880 If fastmap is non-NULL, go through the pattern and fill fastmap
3881 with all the possible leading chars. If fastmap is NULL, don't
3882 bother filling it up (obviously) and only return whether the
3883 pattern could potentially match the empty string.
3884
3885 Return 1 if p..pend might match the empty string.
3886 Return 0 if p..pend matches at least one char.
01618498 3887 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3888
3889static int
438105ed 3890analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
fa9a63c5 3891{
505bde11 3892 int j, k;
1fb352e0 3893 boolean not;
fa9a63c5 3894
b18215fc 3895 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3896 flag is set true. */
b18215fc
RS
3897 boolean match_any_multibyte_characters = false;
3898
f6a3f532 3899 assert (p);
5e69f11e 3900
505bde11
SM
3901 /* The loop below works as follows:
3902 - It has a working-list kept in the PATTERN_STACK and which basically
3903 starts by only containing a pointer to the first operation.
3904 - If the opcode we're looking at is a match against some set of
3905 chars, then we add those chars to the fastmap and go on to the
3906 next work element from the worklist (done via `break').
3907 - If the opcode is a control operator on the other hand, we either
3908 ignore it (if it's meaningless at this point, such as `start_memory')
3909 or execute it (if it's a jump). If the jump has several destinations
3910 (i.e. `on_failure_jump'), then we push the other destination onto the
3911 worklist.
3912 We guarantee termination by ignoring backward jumps (more or less),
3913 so that `p' is monotonically increasing. More to the point, we
3914 never set `p' (or push) anything `<= p1'. */
3915
01618498 3916 while (p < pend)
fa9a63c5 3917 {
505bde11
SM
3918 /* `p1' is used as a marker of how far back a `on_failure_jump'
3919 can go without being ignored. It is normally equal to `p'
3920 (which prevents any backward `on_failure_jump') except right
3921 after a plain `jump', to allow patterns such as:
3922 0: jump 10
3923 3..9: <body>
3924 10: on_failure_jump 3
3925 as used for the *? operator. */
01618498 3926 re_char *p1 = p;
5e69f11e 3927
fa9a63c5
RM
3928 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3929 {
f6a3f532 3930 case succeed:
01618498 3931 return 1;
fa9a63c5 3932
fa9a63c5 3933 case duplicate:
505bde11
SM
3934 /* If the first character has to match a backreference, that means
3935 that the group was empty (since it already matched). Since this
3936 is the only case that interests us here, we can assume that the
3937 backreference must match the empty string. */
3938 p++;
3939 continue;
fa9a63c5
RM
3940
3941
3942 /* Following are the cases which match a character. These end
7814e705 3943 with `break'. */
fa9a63c5
RM
3944
3945 case exactn:
e0277a47 3946 if (fastmap)
cf9c99bc
KH
3947 {
3948 /* If multibyte is nonzero, the first byte of each
3949 character is an ASCII or a leading code. Otherwise,
3950 each byte is a character. Thus, this works in both
3951 cases. */
3952 fastmap[p[1]] = 1;
3953 if (! multibyte)
3954 {
3955 /* For the case of matching this unibyte regex
3956 against multibyte, we must set a leading code of
3957 the corresponding multibyte character. */
3958 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3959
86e893e3 3960 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
3961 }
3962 }
fa9a63c5
RM
3963 break;
3964
3965
1fb352e0
SM
3966 case anychar:
3967 /* We could put all the chars except for \n (and maybe \0)
3968 but we don't bother since it is generally not worth it. */
f6a3f532 3969 if (!fastmap) break;
01618498 3970 return -1;
fa9a63c5
RM
3971
3972
b18215fc 3973 case charset_not:
1fb352e0 3974 if (!fastmap) break;
bf216479
KH
3975 {
3976 /* Chars beyond end of bitmap are possible matches. */
bf216479 3977 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 3978 j < (1 << BYTEWIDTH); j++)
bf216479
KH
3979 fastmap[j] = 1;
3980 }
3981
1fb352e0
SM
3982 /* Fallthrough */
3983 case charset:
3984 if (!fastmap) break;
3985 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3986 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3987 j >= 0; j--)
1fb352e0 3988 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 3989 fastmap[j] = 1;
b18215fc 3990
6482db2e
KH
3991#ifdef emacs
3992 if (/* Any leading code can possibly start a character
1fb352e0 3993 which doesn't match the specified set of characters. */
6482db2e 3994 not
409f2919 3995 ||
6482db2e
KH
3996 /* If we can match a character class, we can match any
3997 multibyte characters. */
3998 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3999 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4000
b18215fc 4001 {
b18215fc
RS
4002 if (match_any_multibyte_characters == false)
4003 {
6482db2e
KH
4004 for (j = MIN_MULTIBYTE_LEADING_CODE;
4005 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4006 fastmap[j] = 1;
b18215fc
RS
4007 match_any_multibyte_characters = true;
4008 }
4009 }
b18215fc 4010
1fb352e0
SM
4011 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4012 && match_any_multibyte_characters == false)
4013 {
bf216479 4014 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 4015 multibyte character in the range table. */
1fb352e0 4016 int c, count;
bf216479 4017 unsigned char lc1, lc2;
b18215fc 4018
1fb352e0 4019 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4020 bits for a character class. */
1fb352e0 4021 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4022
1fb352e0
SM
4023 /* Extract the number of ranges in range table into COUNT. */
4024 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4025 for (; count > 0; count--, p += 3)
1fb352e0 4026 {
9117d724
KH
4027 /* Extract the start and end of each range. */
4028 EXTRACT_CHARACTER (c, p);
bf216479 4029 lc1 = CHAR_LEADING_CODE (c);
9117d724 4030 p += 3;
1fb352e0 4031 EXTRACT_CHARACTER (c, p);
bf216479
KH
4032 lc2 = CHAR_LEADING_CODE (c);
4033 for (j = lc1; j <= lc2; j++)
9117d724 4034 fastmap[j] = 1;
1fb352e0
SM
4035 }
4036 }
6482db2e 4037#endif
b18215fc
RS
4038 break;
4039
1fb352e0
SM
4040 case syntaxspec:
4041 case notsyntaxspec:
4042 if (!fastmap) break;
4043#ifndef emacs
4044 not = (re_opcode_t)p[-1] == notsyntaxspec;
4045 k = *p++;
4046 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4047 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4048 fastmap[j] = 1;
b18215fc 4049 break;
1fb352e0 4050#else /* emacs */
b18215fc
RS
4051 /* This match depends on text properties. These end with
4052 aborting optimizations. */
01618498 4053 return -1;
b18215fc
RS
4054
4055 case categoryspec:
b18215fc 4056 case notcategoryspec:
1fb352e0
SM
4057 if (!fastmap) break;
4058 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4059 k = *p++;
6482db2e 4060 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4061 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4062 fastmap[j] = 1;
4063
6482db2e
KH
4064 /* Any leading code can possibly start a character which
4065 has or doesn't has the specified category. */
4066 if (match_any_multibyte_characters == false)
6fdd04b0 4067 {
6482db2e
KH
4068 for (j = MIN_MULTIBYTE_LEADING_CODE;
4069 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4070 fastmap[j] = 1;
4071 match_any_multibyte_characters = true;
6fdd04b0 4072 }
b18215fc
RS
4073 break;
4074
fa9a63c5 4075 /* All cases after this match the empty string. These end with
25fe55af 4076 `continue'. */
fa9a63c5 4077
fa9a63c5
RM
4078 case before_dot:
4079 case at_dot:
4080 case after_dot:
1fb352e0 4081#endif /* !emacs */
25fe55af
RS
4082 case no_op:
4083 case begline:
4084 case endline:
fa9a63c5
RM
4085 case begbuf:
4086 case endbuf:
4087 case wordbound:
4088 case notwordbound:
4089 case wordbeg:
4090 case wordend:
669fa600
SM
4091 case symbeg:
4092 case symend:
25fe55af 4093 continue;
fa9a63c5
RM
4094
4095
fa9a63c5 4096 case jump:
25fe55af 4097 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4098 if (j < 0)
4099 /* Backward jumps can only go back to code that we've already
4100 visited. `re_compile' should make sure this is true. */
4101 break;
25fe55af 4102 p += j;
505bde11
SM
4103 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4104 {
4105 case on_failure_jump:
4106 case on_failure_keep_string_jump:
505bde11 4107 case on_failure_jump_loop:
0683b6fa 4108 case on_failure_jump_nastyloop:
505bde11
SM
4109 case on_failure_jump_smart:
4110 p++;
4111 break;
4112 default:
4113 continue;
4114 };
4115 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4116 to jump back to "just after here". */
4117 /* Fallthrough */
fa9a63c5 4118
25fe55af
RS
4119 case on_failure_jump:
4120 case on_failure_keep_string_jump:
0683b6fa 4121 case on_failure_jump_nastyloop:
505bde11
SM
4122 case on_failure_jump_loop:
4123 case on_failure_jump_smart:
25fe55af 4124 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4125 if (p + j <= p1)
ed0767d8 4126 ; /* Backward jump to be ignored. */
01618498
SM
4127 else
4128 { /* We have to look down both arms.
4129 We first go down the "straight" path so as to minimize
4130 stack usage when going through alternatives. */
4131 int r = analyse_first (p, pend, fastmap, multibyte);
4132 if (r) return r;
4133 p += j;
4134 }
25fe55af 4135 continue;
fa9a63c5
RM
4136
4137
ed0767d8
SM
4138 case jump_n:
4139 /* This code simply does not properly handle forward jump_n. */
4140 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4141 p += 4;
4142 /* jump_n can either jump or fall through. The (backward) jump
4143 case has already been handled, so we only need to look at the
4144 fallthrough case. */
4145 continue;
177c0ea7 4146
fa9a63c5 4147 case succeed_n:
ed0767d8
SM
4148 /* If N == 0, it should be an on_failure_jump_loop instead. */
4149 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4150 p += 4;
4151 /* We only care about one iteration of the loop, so we don't
4152 need to consider the case where this behaves like an
4153 on_failure_jump. */
25fe55af 4154 continue;
fa9a63c5
RM
4155
4156
4157 case set_number_at:
25fe55af
RS
4158 p += 4;
4159 continue;
fa9a63c5
RM
4160
4161
4162 case start_memory:
25fe55af 4163 case stop_memory:
505bde11 4164 p += 1;
fa9a63c5
RM
4165 continue;
4166
4167
4168 default:
25fe55af
RS
4169 abort (); /* We have listed all the cases. */
4170 } /* switch *p++ */
fa9a63c5
RM
4171
4172 /* Getting here means we have found the possible starting
25fe55af 4173 characters for one path of the pattern -- and that the empty
7814e705 4174 string does not match. We need not follow this path further. */
01618498 4175 return 0;
fa9a63c5
RM
4176 } /* while p */
4177
01618498
SM
4178 /* We reached the end without matching anything. */
4179 return 1;
4180
f6a3f532
SM
4181} /* analyse_first */
4182\f
4183/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4184 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4185 characters can start a string that matches the pattern. This fastmap
4186 is used by re_search to skip quickly over impossible starting points.
4187
4188 Character codes above (1 << BYTEWIDTH) are not represented in the
4189 fastmap, but the leading codes are represented. Thus, the fastmap
4190 indicates which character sets could start a match.
4191
4192 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4193 area as BUFP->fastmap.
4194
4195 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4196 the pattern buffer.
4197
4198 Returns 0 if we succeed, -2 if an internal error. */
4199
4200int
971de7fb 4201re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4202{
4203 char *fastmap = bufp->fastmap;
4204 int analysis;
4205
4206 assert (fastmap && bufp->buffer);
4207
72af86bd 4208 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4209 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4210
4211 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4212 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4213 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4214 return 0;
4215} /* re_compile_fastmap */
4216\f
4217/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4218 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4219 this memory for recording register information. STARTS and ENDS
4220 must be allocated using the malloc library routine, and must each
4221 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4222
4223 If NUM_REGS == 0, then subsequent matches should allocate their own
4224 register data.
4225
4226 Unless this function is called, the first search or match using
4227 PATTERN_BUFFER will allocate its own register data, without
4228 freeing the old data. */
4229
4230void
971de7fb 4231re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4232{
4233 if (num_regs)
4234 {
4235 bufp->regs_allocated = REGS_REALLOCATE;
4236 regs->num_regs = num_regs;
4237 regs->start = starts;
4238 regs->end = ends;
4239 }
4240 else
4241 {
4242 bufp->regs_allocated = REGS_UNALLOCATED;
4243 regs->num_regs = 0;
4244 regs->start = regs->end = (regoff_t *) 0;
4245 }
4246}
c0f9ea08 4247WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4248\f
7814e705 4249/* Searching routines. */
fa9a63c5
RM
4250
4251/* Like re_search_2, below, but only one string is specified, and
4252 doesn't let you say where to stop matching. */
4253
d1dfb56c
EZ
4254regoff_t
4255re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4256 ssize_t startpos, ssize_t range, struct re_registers *regs)
fa9a63c5 4257{
5e69f11e 4258 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4259 regs, size);
4260}
c0f9ea08 4261WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4262
70806df6
KH
/* Start address of whichever of STRING1 / STRING2 contains offset P of
   their virtual concatenation.  Uses `size1', `string1' and `string2'
   from the expansion context.  */
#define HEAD_ADDR_VSTRING(P)            \
  ((P) < size1 ? string1 : string2)

/* Address of the byte at offset POS of the virtual concatenation of
   STRING1 and STRING2.  */
#define POS_ADDR_VSTRING(POS)                                   \
  (((POS) < size1 ? string1 : string2 - size1) + (POS))
fa9a63c5
RM
4270
4271/* Using the compiled pattern in BUFP->buffer, first tries to match the
4272 virtual concatenation of STRING1 and STRING2, starting first at index
4273 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4274
fa9a63c5 4275 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4276
fa9a63c5
RM
4277 RANGE is how far to scan while trying to match. RANGE = 0 means try
4278 only at STARTPOS; in general, the last start tried is STARTPOS +
4279 RANGE.
5e69f11e 4280
fa9a63c5
RM
4281 In REGS, return the indices of the virtual concatenation of STRING1
4282 and STRING2 that matched the entire BUFP->buffer and its contained
4283 subexpressions.
5e69f11e 4284
fa9a63c5
RM
4285 Do not consider matching one past the index STOP in the virtual
4286 concatenation of STRING1 and STRING2.
4287
4288 We return either the position in the strings at which the match was
4289 found, -1 if no match, or -2 if error (such as failure
4290 stack overflow). */
4291
d1dfb56c
EZ
4292regoff_t
4293re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4294 const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4295 struct re_registers *regs, ssize_t stop)
fa9a63c5 4296{
d1dfb56c 4297 regoff_t val;
66f0296e
SM
4298 re_char *string1 = (re_char*) str1;
4299 re_char *string2 = (re_char*) str2;
fa9a63c5 4300 register char *fastmap = bufp->fastmap;
6676cb1c 4301 register RE_TRANSLATE_TYPE translate = bufp->translate;
d1dfb56c
EZ
4302 size_t total_size = size1 + size2;
4303 ssize_t endpos = startpos + range;
c0f9ea08 4304 boolean anchored_start;
cf9c99bc
KH
4305 /* Nonzero if we are searching multibyte string. */
4306 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4307
fa9a63c5
RM
4308 /* Check for out-of-range STARTPOS. */
4309 if (startpos < 0 || startpos > total_size)
4310 return -1;
5e69f11e 4311
fa9a63c5 4312 /* Fix up RANGE if it might eventually take us outside
34597fa9 4313 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4314 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4315 if (endpos < 0)
4316 range = 0 - startpos;
fa9a63c5
RM
4317 else if (endpos > total_size)
4318 range = total_size - startpos;
4319
4320 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4321 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4322 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4323 {
4324 if (startpos > 0)
4325 return -1;
4326 else
7b140fd7 4327 range = 0;
fa9a63c5
RM
4328 }
4329
ae4788a8
RS
4330#ifdef emacs
4331 /* In a forward search for something that starts with \=.
4332 don't keep searching past point. */
4333 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4334 {
7b140fd7
RS
4335 range = PT_BYTE - BEGV_BYTE - startpos;
4336 if (range < 0)
ae4788a8
RS
4337 return -1;
4338 }
4339#endif /* emacs */
4340
fa9a63c5
RM
4341 /* Update the fastmap now if not correct already. */
4342 if (fastmap && !bufp->fastmap_accurate)
01618498 4343 re_compile_fastmap (bufp);
5e69f11e 4344
c8499ba5 4345 /* See whether the pattern is anchored. */
c0f9ea08 4346 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4347
b18215fc 4348#ifdef emacs
d48cd3f4 4349 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4350 {
d1dfb56c 4351 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4352
4353 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4354 }
b18215fc
RS
4355#endif
4356
fa9a63c5
RM
4357 /* Loop through the string, looking for a place to start matching. */
4358 for (;;)
5e69f11e 4359 {
c8499ba5
RS
4360 /* If the pattern is anchored,
4361 skip quickly past places we cannot match.
4362 We don't bother to treat startpos == 0 specially
4363 because that case doesn't repeat. */
4364 if (anchored_start && startpos > 0)
4365 {
c0f9ea08
SM
4366 if (! ((startpos <= size1 ? string1[startpos - 1]
4367 : string2[startpos - size1 - 1])
4368 == '\n'))
c8499ba5
RS
4369 goto advance;
4370 }
4371
fa9a63c5 4372 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4373 cannot be the start of a match. If the pattern can match the
4374 null string, however, we don't need to skip characters; we want
7814e705 4375 the first null string. */
fa9a63c5
RM
4376 if (fastmap && startpos < total_size && !bufp->can_be_null)
4377 {
66f0296e 4378 register re_char *d;
01618498 4379 register re_wchar_t buf_ch;
e934739e
RS
4380
4381 d = POS_ADDR_VSTRING (startpos);
4382
7814e705 4383 if (range > 0) /* Searching forwards. */
fa9a63c5 4384 {
fa9a63c5 4385 register int lim = 0;
d1dfb56c 4386 ssize_t irange = range;
fa9a63c5 4387
25fe55af
RS
4388 if (startpos < size1 && startpos + range >= size1)
4389 lim = range - (size1 - startpos);
fa9a63c5 4390
25fe55af
RS
4391 /* Written out as an if-else to avoid testing `translate'
4392 inside the loop. */
28ae27ae
AS
4393 if (RE_TRANSLATE_P (translate))
4394 {
e934739e
RS
4395 if (multibyte)
4396 while (range > lim)
4397 {
4398 int buf_charlen;
4399
62a6e103 4400 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4401 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4402 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4403 break;
4404
4405 range -= buf_charlen;
4406 d += buf_charlen;
4407 }
4408 else
bf216479 4409 while (range > lim)
33c46939 4410 {
cf9c99bc
KH
4411 register re_wchar_t ch, translated;
4412
bf216479 4413 buf_ch = *d;
cf9c99bc
KH
4414 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4415 translated = RE_TRANSLATE (translate, ch);
4416 if (translated != ch
4417 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4418 buf_ch = ch;
6fdd04b0 4419 if (fastmap[buf_ch])
bf216479 4420 break;
33c46939
RS
4421 d++;
4422 range--;
4423 }
e934739e 4424 }
fa9a63c5 4425 else
6fdd04b0
KH
4426 {
4427 if (multibyte)
4428 while (range > lim)
4429 {
4430 int buf_charlen;
fa9a63c5 4431
62a6e103 4432 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4433 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4434 break;
4435 range -= buf_charlen;
4436 d += buf_charlen;
4437 }
e934739e 4438 else
6fdd04b0 4439 while (range > lim && !fastmap[*d])
33c46939
RS
4440 {
4441 d++;
4442 range--;
4443 }
e934739e 4444 }
fa9a63c5
RM
4445 startpos += irange - range;
4446 }
7814e705 4447 else /* Searching backwards. */
fa9a63c5 4448 {
ba5e343c
KH
4449 if (multibyte)
4450 {
62a6e103 4451 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4452 buf_ch = TRANSLATE (buf_ch);
4453 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4454 goto advance;
4455 }
4456 else
4457 {
cf9c99bc
KH
4458 register re_wchar_t ch, translated;
4459
4460 buf_ch = *d;
4461 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4462 translated = TRANSLATE (ch);
4463 if (translated != ch
4464 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4465 buf_ch = ch;
4466 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4467 goto advance;
4468 }
fa9a63c5
RM
4469 }
4470 }
4471
4472 /* If can't match the null string, and that's all we have left, fail. */
4473 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4474 && !bufp->can_be_null)
fa9a63c5
RM
4475 return -1;
4476
4477 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4478 startpos, regs, stop);
fa9a63c5
RM
4479
4480 if (val >= 0)
4481 return startpos;
5e69f11e 4482
fa9a63c5
RM
4483 if (val == -2)
4484 return -2;
4485
4486 advance:
5e69f11e 4487 if (!range)
25fe55af 4488 break;
5e69f11e 4489 else if (range > 0)
25fe55af 4490 {
b18215fc
RS
4491 /* Update STARTPOS to the next character boundary. */
4492 if (multibyte)
4493 {
66f0296e 4494 re_char *p = POS_ADDR_VSTRING (startpos);
aa3830c4 4495 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4496
4497 range -= len;
4498 if (range < 0)
4499 break;
4500 startpos += len;
4501 }
4502 else
4503 {
b560c397
RS
4504 range--;
4505 startpos++;
4506 }
e318085a 4507 }
fa9a63c5 4508 else
25fe55af
RS
4509 {
4510 range++;
4511 startpos--;
b18215fc
RS
4512
4513 /* Update STARTPOS to the previous character boundary. */
4514 if (multibyte)
4515 {
70806df6
KH
4516 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4517 re_char *p0 = p;
4518 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4519
4520 /* Find the head of multibyte form. */
70806df6
KH
4521 PREV_CHAR_BOUNDARY (p, phead);
4522 range += p0 - 1 - p;
4523 if (range > 0)
4524 break;
b18215fc 4525
70806df6 4526 startpos -= p0 - 1 - p;
b18215fc 4527 }
25fe55af 4528 }
fa9a63c5
RM
4529 }
4530 return -1;
4531} /* re_search_2 */
c0f9ea08 4532WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4533\f
4534/* Declarations and macros for re_match_2. */
4535
261cb4bb
PE
4536static int bcmp_translate (re_char *s1, re_char *s2,
4537 register ssize_t len,
4538 RE_TRANSLATE_TYPE translate,
4539 const int multibyte);
fa9a63c5
RM
4540
/* Convert PTR, a pointer into one of the search strings `string1' or
   `string2', into an offset: relative to `string1' when PTR lies in
   the first string, otherwise relative to `string2' with `size1'
   added.  */
#define POINTER_TO_OFFSET(ptr)                          \
  (FIRST_STRING_P (ptr)                                 \
   ? ((regoff_t) ((ptr) - string1))                     \
   : ((regoff_t) ((ptr) - string2 + size1)))
4547
/* Call before fetching a character with *d.  When `d' has reached
   `dend' this either switches over to string2 or, if the end of the
   match area was reached, jumps to `fail'.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH()                                                      \
  while (d == dend)                                                     \
    {                                                                   \
      /* End of string2 => fail.  */                                    \
      if (dend == end_match_2)                                          \
        goto fail;                                                      \
      /* End of string1 => advance to string2.  */                      \
      d = string2;                                                      \
      dend = end_match_2;                                               \
    }
4562
f1ad044f
SM
/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc.,
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e. past `stop').  */
#define PREFETCH_NOLIMIT()                                              \
  if (d == end1)                                                        \
    {                                                                   \
      d = string2;                                                      \
      dend = end_match_2;                                               \
    }
RM
4573
/* Test whether D is at the very beginning or at the very end of the
   virtual concatenation of `string1' and `string2'.  If there is only
   one string, it is `string2'.  */
#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5 4578
/* Disabled due to a compiler bug -- see comment at case wordbound */

/* The comment at case wordbound is the following one, but we no longer
   use AT_WORD_BOUNDARY, in order to support multibyte form.

   The DEC Alpha C compiler 3.x generates incorrect code for the
   test  WORDCHAR_P (d - 1) != WORDCHAR_P (d)  in the expansion of
   AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
   macro and introducing temporary variables works around the bug.  */

#if 0
/* Test if D points to a character which is word-constituent.  We have
   two special cases to check for: if past the end of string1, look at
   the first character in string2; and if before the beginning of
   string2, look at the last character in string1.  */
#define WORDCHAR_P(d)                                                   \
  (SYNTAX ((d) == end1 ? *string2                                       \
           : (d) == string2 - 1 ? *(end1 - 1) : *(d))                   \
   == Sword)

/* Test if the character before D and the one at D differ with respect
   to being word-constituent.  */
#define AT_WORD_BOUNDARY(d)                                             \
  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)                             \
   || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
#endif
fa9a63c5
RM
4605
/* Release everything the matcher allocated.  When MATCH_MAY_ALLOCATE
   is not defined, FREE_VARIABLES expands to a no-op expression.  */
#ifdef MATCH_MAY_ALLOCATE
/* Free VAR if it is non-null and reset it to NULL.  */
# define FREE_VAR(var)                                                  \
  do {                                                                  \
    if (var)                                                            \
      {                                                                 \
        REGEX_FREE (var);                                               \
        var = NULL;                                                     \
      }                                                                 \
  } while (0)
/* Free the failure stack and the register arrays.  */
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
#endif /* not MATCH_MAY_ALLOCATE */
4627
505bde11
SM
4628\f
4629/* Optimization routines. */
4630
4e8a9132
SM
4631/* If the operation is a match against one or more chars,
4632 return a pointer to the next operation, else return NULL. */
01618498 4633static re_char *
971de7fb 4634skip_one_char (const re_char *p)
4e8a9132
SM
4635{
4636 switch (SWITCH_ENUM_CAST (*p++))
4637 {
4638 case anychar:
4639 break;
177c0ea7 4640
4e8a9132
SM
4641 case exactn:
4642 p += *p + 1;
4643 break;
4644
4645 case charset_not:
4646 case charset:
4647 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4648 {
4649 int mcnt;
4650 p = CHARSET_RANGE_TABLE (p - 1);
4651 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4652 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4653 }
4654 else
4655 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4656 break;
177c0ea7 4657
4e8a9132
SM
4658 case syntaxspec:
4659 case notsyntaxspec:
1fb352e0 4660#ifdef emacs
4e8a9132
SM
4661 case categoryspec:
4662 case notcategoryspec:
4663#endif /* emacs */
4664 p++;
4665 break;
4666
4667 default:
4668 p = NULL;
4669 }
4670 return p;
4671}
4672
4673
505bde11 4674/* Jump over non-matching operations. */
839966f3 4675static re_char *
971de7fb 4676skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4677{
4678 int mcnt;
4679 while (p < pend)
4680 {
4681 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4682 {
4683 case start_memory:
505bde11
SM
4684 case stop_memory:
4685 p += 2; break;
4686 case no_op:
4687 p += 1; break;
4688 case jump:
4689 p += 1;
4690 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4691 p += mcnt;
4692 break;
4693 default:
4694 return p;
4695 }
4696 }
4697 assert (p == pend);
4698 return p;
4699}
4700
4701/* Non-zero if "p1 matches something" implies "p2 fails". */
4702static int
971de7fb 4703mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
505bde11 4704{
4e8a9132 4705 re_opcode_t op2;
2d1675e4 4706 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4707 unsigned char *pend = bufp->buffer + bufp->used;
4708
4e8a9132 4709 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4710 && p2 >= bufp->buffer && p2 <= pend);
4711
4712 /* Skip over open/close-group commands.
4713 If what follows this loop is a ...+ construct,
4714 look at what begins its body, since we will have to
4715 match at least one of that. */
4e8a9132
SM
4716 p2 = skip_noops (p2, pend);
4717 /* The same skip can be done for p1, except that this function
4718 is only used in the case where p1 is a simple match operator. */
4719 /* p1 = skip_noops (p1, pend); */
4720
4721 assert (p1 >= bufp->buffer && p1 < pend
4722 && p2 >= bufp->buffer && p2 <= pend);
4723
4724 op2 = p2 == pend ? succeed : *p2;
4725
4726 switch (SWITCH_ENUM_CAST (op2))
505bde11 4727 {
4e8a9132
SM
4728 case succeed:
4729 case endbuf:
4730 /* If we're at the end of the pattern, we can change. */
4731 if (skip_one_char (p1))
505bde11 4732 {
505bde11
SM
4733 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4734 return 1;
505bde11 4735 }
4e8a9132 4736 break;
177c0ea7 4737
4e8a9132 4738 case endline:
4e8a9132
SM
4739 case exactn:
4740 {
01618498 4741 register re_wchar_t c
4e8a9132 4742 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4743 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4744
4e8a9132
SM
4745 if ((re_opcode_t) *p1 == exactn)
4746 {
62a6e103 4747 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4748 {
4749 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4750 return 1;
4751 }
4752 }
505bde11 4753
4e8a9132
SM
4754 else if ((re_opcode_t) *p1 == charset
4755 || (re_opcode_t) *p1 == charset_not)
4756 {
4757 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4758
4e8a9132
SM
4759 /* Test if C is listed in charset (or charset_not)
4760 at `p1'. */
6fdd04b0 4761 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4762 {
4763 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4764 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4765 not = !not;
4766 }
4767 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4768 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4769
4e8a9132
SM
4770 /* `not' is equal to 1 if c would match, which means
4771 that we can't change to pop_failure_jump. */
4772 if (!not)
4773 {
4774 DEBUG_PRINT1 (" No match => fast loop.\n");
4775 return 1;
4776 }
4777 }
4778 else if ((re_opcode_t) *p1 == anychar
4779 && c == '\n')
4780 {
4781 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4782 return 1;
4783 }
4784 }
4785 break;
505bde11 4786
4e8a9132 4787 case charset:
4e8a9132
SM
4788 {
4789 if ((re_opcode_t) *p1 == exactn)
4790 /* Reuse the code above. */
4791 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4792
505bde11
SM
4793 /* It is hard to list up all the character in charset
4794 P2 if it includes multibyte character. Give up in
4795 such case. */
4796 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4797 {
4798 /* Now, we are sure that P2 has no range table.
4799 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4800 enough. But P1 may have range table, so the
505bde11
SM
4801 size of bitmap table of P1 is extracted by
4802 using macro `CHARSET_BITMAP_SIZE'.
4803
6fdd04b0
KH
4804 In a multibyte case, we know that all the character
4805 listed in P2 is ASCII. In a unibyte case, P1 has only a
4806 bitmap table. So, in both cases, it is enough to test
4807 only the bitmap table of P1. */
505bde11 4808
411e4203 4809 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4810 {
4811 int idx;
4812 /* We win if the charset inside the loop
4813 has no overlap with the one after the loop. */
4814 for (idx = 0;
4815 (idx < (int) p2[1]
4816 && idx < CHARSET_BITMAP_SIZE (p1));
4817 idx++)
4818 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4819 break;
4820
4821 if (idx == p2[1]
4822 || idx == CHARSET_BITMAP_SIZE (p1))
4823 {
4824 DEBUG_PRINT1 (" No match => fast loop.\n");
4825 return 1;
4826 }
4827 }
411e4203 4828 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4829 {
4830 int idx;
4831 /* We win if the charset_not inside the loop lists
7814e705 4832 every character listed in the charset after. */
505bde11
SM
4833 for (idx = 0; idx < (int) p2[1]; idx++)
4834 if (! (p2[2 + idx] == 0
4835 || (idx < CHARSET_BITMAP_SIZE (p1)
4836 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4837 break;
4838
d1dfb56c
EZ
4839 if (idx == p2[1])
4840 {
4841 DEBUG_PRINT1 (" No match => fast loop.\n");
4842 return 1;
4843 }
4e8a9132
SM
4844 }
4845 }
4846 }
609b757a 4847 break;
177c0ea7 4848
411e4203
SM
4849 case charset_not:
4850 switch (SWITCH_ENUM_CAST (*p1))
4851 {
4852 case exactn:
4853 case charset:
4854 /* Reuse the code above. */
4855 return mutually_exclusive_p (bufp, p2, p1);
4856 case charset_not:
4857 /* When we have two charset_not, it's very unlikely that
4858 they don't overlap. The union of the two sets of excluded
4859 chars should cover all possible chars, which, as a matter of
4860 fact, is virtually impossible in multibyte buffers. */
36595814 4861 break;
411e4203
SM
4862 }
4863 break;
4864
4e8a9132 4865 case wordend:
669fa600
SM
4866 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4867 case symend:
4e8a9132 4868 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4869 && (p1[1] == Ssymbol || p1[1] == Sword));
4870 case notsyntaxspec:
4871 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4872
4873 case wordbeg:
669fa600
SM
4874 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4875 case symbeg:
4e8a9132 4876 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4877 && (p1[1] == Ssymbol || p1[1] == Sword));
4878 case syntaxspec:
4879 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4880
4881 case wordbound:
4882 return (((re_opcode_t) *p1 == notsyntaxspec
4883 || (re_opcode_t) *p1 == syntaxspec)
4884 && p1[1] == Sword);
4885
1fb352e0 4886#ifdef emacs
4e8a9132
SM
4887 case categoryspec:
4888 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4889 case notcategoryspec:
4890 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4891#endif /* emacs */
4892
4893 default:
4894 ;
505bde11
SM
4895 }
4896
4897 /* Safe default. */
4898 return 0;
4899}
4900
fa9a63c5
RM
4901\f
4902/* Matching routines. */
4903
25fe55af 4904#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4905/* re_match is like re_match_2 except it takes only a single string. */
4906
d1dfb56c 4907regoff_t
d2762c86 4908re_match (struct re_pattern_buffer *bufp, const char *string,
d1dfb56c 4909 size_t size, ssize_t pos, struct re_registers *regs)
fa9a63c5 4910{
d1dfb56c
EZ
4911 regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4912 size, pos, regs, size);
fa9a63c5
RM
4913 return result;
4914}
c0f9ea08 4915WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4916#endif /* not emacs */
4917
b18215fc
RS
4918#ifdef emacs
4919/* In Emacs, this is the string or buffer in which we
7814e705 4920 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4921Lisp_Object re_match_object;
4922#endif
fa9a63c5
RM
4923
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.
5e69f11e 4928
   If REGS is non-null and the `no_sub' field of BUFP is zero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.
4932
4933 We return -1 if no match, -2 if an internal error (such as the
7814e705 4934 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4935 matched substring. */
4936
d1dfb56c
EZ
4937regoff_t
4938re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4939 size_t size1, const char *string2, size_t size2, ssize_t pos,
4940 struct re_registers *regs, ssize_t stop)
fa9a63c5 4941{
d1dfb56c 4942 regoff_t result;
25fe55af 4943
b18215fc 4944#ifdef emacs
d1dfb56c 4945 ssize_t charpos;
d48cd3f4 4946 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 4947 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4948 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4949#endif
4950
4bb91c68
SM
4951 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4952 (re_char*) string2, size2,
cc9b4df2 4953 pos, regs, stop);
fa9a63c5
RM
4954 return result;
4955}
c0f9ea08 4956WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 4957
bf216479 4958
fa9a63c5 4959/* This is a separate function so that we can force an alloca cleanup
7814e705 4960 afterwards. */
d1dfb56c
EZ
4961static regoff_t
4962re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1,
4963 size_t size1, const re_char *string2, size_t size2,
4964 ssize_t pos, struct re_registers *regs, ssize_t stop)
fa9a63c5
RM
4965{
4966 /* General temporaries. */
d1dfb56c 4967 ssize_t mcnt;
01618498 4968 size_t reg;
fa9a63c5
RM
4969
4970 /* Just past the end of the corresponding string. */
66f0296e 4971 re_char *end1, *end2;
fa9a63c5
RM
4972
4973 /* Pointers into string1 and string2, just past the last characters in
7814e705 4974 each to consider matching. */
66f0296e 4975 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4976
4977 /* Where we are in the data, and the end of the current string. */
66f0296e 4978 re_char *d, *dend;
5e69f11e 4979
99633e97
SM
4980 /* Used sometimes to remember where we were before starting matching
4981 an operator so that we can go back in case of failure. This "atomic"
4982 behavior of matching opcodes is indispensable to the correctness
4983 of the on_failure_keep_string_jump optimization. */
4984 re_char *dfail;
4985
fa9a63c5 4986 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4987 re_char *p = bufp->buffer;
4988 re_char *pend = p + bufp->used;
fa9a63c5 4989
25fe55af 4990 /* We use this to map every character in the string. */
6676cb1c 4991 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4992
cf9c99bc 4993 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 4994 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4995
cf9c99bc
KH
4996 /* Nonzero if STRING1/STRING2 are multibyte. */
4997 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4998
fa9a63c5
RM
4999 /* Failure point stack. Each place that can handle a failure further
5000 down the line pushes a failure point on this stack. It consists of
505bde11 5001 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5002 the subexpressions we're currently inside, plus the number of such
5003 registers, and, finally, two char *'s. The first char * is where
5004 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5005 scanning the strings. */
5006#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5007 fail_stack_type fail_stack;
5008#endif
5009#ifdef DEBUG
fa9a63c5
RM
5010 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5011#endif
5012
0b32bf0e 5013#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5014 /* This holds the pointer to the failure stack, when
5015 it is allocated relocatably. */
5016 fail_stack_elt_t *failure_stack_ptr;
99633e97 5017#endif
fa9a63c5
RM
5018
5019 /* We fill all the registers internally, independent of what we
7814e705 5020 return, for use in backreferences. The number here includes
fa9a63c5 5021 an element for register zero. */
4bb91c68 5022 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5023
fa9a63c5
RM
5024 /* Information on the contents of registers. These are pointers into
5025 the input strings; they record just what was matched (on this
5026 attempt) by a subexpression part of the pattern, that is, the
5027 regnum-th regstart pointer points to where in the pattern we began
5028 matching and the regnum-th regend points to right after where we
5029 stopped matching the regnum-th subexpression. (The zeroth register
5030 keeps track of what the whole pattern matches.) */
5031#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5032 re_char **regstart, **regend;
fa9a63c5
RM
5033#endif
5034
fa9a63c5 5035 /* The following record the register info as found in the above
5e69f11e 5036 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5037 This happens as we backtrack through the failure points, which in
5038 turn happens only if we have not yet matched the entire string. */
5039 unsigned best_regs_set = false;
5040#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5041 re_char **best_regstart, **best_regend;
fa9a63c5 5042#endif
5e69f11e 5043
fa9a63c5
RM
5044 /* Logically, this is `best_regend[0]'. But we don't want to have to
5045 allocate space for that if we're not allocating space for anything
7814e705 5046 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5047 any of the other register vectors, and it seems rather a kludge to
5048 treat `best_regend' differently than the rest. So we keep track of
5049 the end of the best match so far in a separate variable. We
5050 initialize this to NULL so that when we backtrack the first time
5051 and need to test it, it's not garbage. */
66f0296e 5052 re_char *match_end = NULL;
fa9a63c5 5053
fa9a63c5
RM
5054#ifdef DEBUG
5055 /* Counts the total number of registers pushed. */
5e69f11e 5056 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5057#endif
5058
5059 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5060
fa9a63c5 5061 INIT_FAIL_STACK ();
5e69f11e 5062
fa9a63c5
RM
5063#ifdef MATCH_MAY_ALLOCATE
5064 /* Do not bother to initialize all the register variables if there are
5065 no groups in the pattern, as it takes a fair amount of time. If
5066 there are groups, we include space for register 0 (the whole
5067 pattern), even though we never use it, since it simplifies the
5068 array indexing. We should fix this. */
5069 if (bufp->re_nsub)
5070 {
66f0296e
SM
5071 regstart = REGEX_TALLOC (num_regs, re_char *);
5072 regend = REGEX_TALLOC (num_regs, re_char *);
5073 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5074 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5075
505bde11 5076 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5077 {
5078 FREE_VARIABLES ();
5079 return -2;
5080 }
fa9a63c5
RM
5081 }
5082 else
5083 {
5084 /* We must initialize all our variables to NULL, so that
25fe55af 5085 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5086 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5087 }
5088#endif /* MATCH_MAY_ALLOCATE */
5089
5090 /* The starting position is bogus. */
5091 if (pos < 0 || pos > size1 + size2)
5092 {
5093 FREE_VARIABLES ();
5094 return -1;
5095 }
5e69f11e 5096
fa9a63c5
RM
5097 /* Initialize subexpression text positions to -1 to mark ones that no
5098 start_memory/stop_memory has been seen for. Also initialize the
5099 register information struct. */
01618498
SM
5100 for (reg = 1; reg < num_regs; reg++)
5101 regstart[reg] = regend[reg] = NULL;
99633e97 5102
fa9a63c5 5103 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5104 `string1' is null. */
fa9a63c5
RM
5105 if (size2 == 0 && string1 != NULL)
5106 {
5107 string2 = string1;
5108 size2 = size1;
5109 string1 = 0;
5110 size1 = 0;
5111 }
5112 end1 = string1 + size1;
5113 end2 = string2 + size2;
5114
5e69f11e 5115 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5116 `dend' is the end of the input string that `d' points within. `d'
5117 is advanced into the following input string whenever necessary, but
5118 this happens before fetching; therefore, at the beginning of the
5119 loop, `d' can be pointing at the end of a string, but it cannot
5120 equal `string2'. */
419d1c74 5121 if (pos >= size1)
fa9a63c5 5122 {
419d1c74
SM
5123 /* Only match within string2. */
5124 d = string2 + pos - size1;
5125 dend = end_match_2 = string2 + stop - size1;
5126 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5127 }
5128 else
5129 {
f1ad044f 5130 if (stop < size1)
419d1c74
SM
5131 {
5132 /* Only match within string1. */
5133 end_match_1 = string1 + stop;
5134 /* BEWARE!
5135 When we reach end_match_1, PREFETCH normally switches to string2.
5136 But in the present case, this means that just doing a PREFETCH
5137 makes us jump from `stop' to `gap' within the string.
5138 What we really want here is for the search to stop as
5139 soon as we hit end_match_1. That's why we set end_match_2
5140 to end_match_1 (since PREFETCH fails as soon as we hit
5141 end_match_2). */
5142 end_match_2 = end_match_1;
5143 }
5144 else
f1ad044f
SM
5145 { /* It's important to use this code when stop == size so that
5146 moving `d' from end1 to string2 will not prevent the d == dend
5147 check from catching the end of string. */
419d1c74
SM
5148 end_match_1 = end1;
5149 end_match_2 = string2 + stop - size1;
5150 }
5151 d = string1 + pos;
5152 dend = end_match_1;
fa9a63c5
RM
5153 }
5154
5155 DEBUG_PRINT1 ("The compiled pattern is: ");
5156 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5157 DEBUG_PRINT1 ("The string to match is: `");
5158 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5159 DEBUG_PRINT1 ("'\n");
5e69f11e 5160
7814e705 5161 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5162 function if the match is complete, or it drops through if the match
5163 fails at this starting point in the input data. */
5164 for (;;)
5165 {
505bde11 5166 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5167
5168 if (p == pend)
5169 { /* End of pattern means we might have succeeded. */
25fe55af 5170 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5171
fa9a63c5 5172 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5173 longest match, try backtracking. */
5174 if (d != end_match_2)
fa9a63c5
RM
5175 {
5176 /* 1 if this match ends in the same string (string1 or string2)
5177 as the best previous match. */
5e69f11e 5178 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5179 == FIRST_STRING_P (d));
fa9a63c5
RM
5180 /* 1 if this match is the best seen so far. */
5181 boolean best_match_p;
5182
5183 /* AIX compiler got confused when this was combined
7814e705 5184 with the previous declaration. */
fa9a63c5
RM
5185 if (same_str_p)
5186 best_match_p = d > match_end;
5187 else
99633e97 5188 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5189
25fe55af
RS
5190 DEBUG_PRINT1 ("backtracking.\n");
5191
5192 if (!FAIL_STACK_EMPTY ())
5193 { /* More failure points to try. */
5194
5195 /* If exceeds best match so far, save it. */
5196 if (!best_regs_set || best_match_p)
5197 {
5198 best_regs_set = true;
5199 match_end = d;
5200
5201 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5202
01618498 5203 for (reg = 1; reg < num_regs; reg++)
25fe55af 5204 {
01618498
SM
5205 best_regstart[reg] = regstart[reg];
5206 best_regend[reg] = regend[reg];
25fe55af
RS
5207 }
5208 }
5209 goto fail;
5210 }
5211
5212 /* If no failure points, don't restore garbage. And if
5213 last match is real best match, don't restore second
5214 best one. */
5215 else if (best_regs_set && !best_match_p)
5216 {
5217 restore_best_regs:
5218 /* Restore best match. It may happen that `dend ==
5219 end_match_1' while the restored d is in string2.
5220 For example, the pattern `x.*y.*z' against the
5221 strings `x-' and `y-z-', if the two strings are
7814e705 5222 not consecutive in memory. */
25fe55af
RS
5223 DEBUG_PRINT1 ("Restoring best registers.\n");
5224
5225 d = match_end;
5226 dend = ((d >= string1 && d <= end1)
5227 ? end_match_1 : end_match_2);
fa9a63c5 5228
01618498 5229 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5230 {
01618498
SM
5231 regstart[reg] = best_regstart[reg];
5232 regend[reg] = best_regend[reg];
fa9a63c5 5233 }
25fe55af
RS
5234 }
5235 } /* d != end_match_2 */
fa9a63c5
RM
5236
5237 succeed_label:
25fe55af 5238 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5239
25fe55af
RS
5240 /* If caller wants register contents data back, do it. */
5241 if (regs && !bufp->no_sub)
fa9a63c5 5242 {
25fe55af
RS
5243 /* Have the register data arrays been allocated? */
5244 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5245 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5246 extra element beyond `num_regs' for the `-1' marker
5247 GNU code uses. */
5248 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5249 regs->start = TALLOC (regs->num_regs, regoff_t);
5250 regs->end = TALLOC (regs->num_regs, regoff_t);
5251 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5252 {
5253 FREE_VARIABLES ();
5254 return -2;
5255 }
25fe55af
RS
5256 bufp->regs_allocated = REGS_REALLOCATE;
5257 }
5258 else if (bufp->regs_allocated == REGS_REALLOCATE)
5259 { /* Yes. If we need more elements than were already
5260 allocated, reallocate them. If we need fewer, just
5261 leave it alone. */
5262 if (regs->num_regs < num_regs + 1)
5263 {
5264 regs->num_regs = num_regs + 1;
5265 RETALLOC (regs->start, regs->num_regs, regoff_t);
5266 RETALLOC (regs->end, regs->num_regs, regoff_t);
5267 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5268 {
5269 FREE_VARIABLES ();
5270 return -2;
5271 }
25fe55af
RS
5272 }
5273 }
5274 else
fa9a63c5
RM
5275 {
5276 /* These braces fend off a "empty body in an else-statement"
7814e705 5277 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5278 assert (bufp->regs_allocated == REGS_FIXED);
5279 }
5280
25fe55af
RS
5281 /* Convert the pointer data in `regstart' and `regend' to
5282 indices. Register zero has to be set differently,
5283 since we haven't kept track of any info for it. */
5284 if (regs->num_regs > 0)
5285 {
5286 regs->start[0] = pos;
99633e97 5287 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5288 }
5e69f11e 5289
25fe55af
RS
5290 /* Go through the first `min (num_regs, regs->num_regs)'
5291 registers, since that is all we initialized. */
01618498 5292 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5293 {
01618498
SM
5294 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5295 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5296 else
5297 {
01618498
SM
5298 regs->start[reg]
5299 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5300 regs->end[reg]
5301 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5302 }
fa9a63c5 5303 }
5e69f11e 5304
25fe55af
RS
5305 /* If the regs structure we return has more elements than
5306 were in the pattern, set the extra elements to -1. If
5307 we (re)allocated the registers, this is the case,
5308 because we always allocate enough to have at least one
7814e705 5309 -1 at the end. */
01618498
SM
5310 for (reg = num_regs; reg < regs->num_regs; reg++)
5311 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5312 } /* regs && !bufp->no_sub */
5313
25fe55af
RS
5314 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5315 nfailure_points_pushed, nfailure_points_popped,
5316 nfailure_points_pushed - nfailure_points_popped);
5317 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5318
99633e97 5319 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5320
25fe55af 5321 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5322
25fe55af
RS
5323 FREE_VARIABLES ();
5324 return mcnt;
5325 }
fa9a63c5 5326
7814e705 5327 /* Otherwise match next pattern command. */
fa9a63c5
RM
5328 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5329 {
25fe55af
RS
5330 /* Ignore these. Used to ignore the n of succeed_n's which
5331 currently have n == 0. */
5332 case no_op:
5333 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5334 break;
fa9a63c5
RM
5335
5336 case succeed:
25fe55af 5337 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5338 goto succeed_label;
5339
7814e705 5340 /* Match the next n pattern characters exactly. The following
25fe55af 5341 byte in the pattern defines n, and the n bytes after that
7814e705 5342 are the characters to match. */
fa9a63c5
RM
5343 case exactn:
5344 mcnt = *p++;
25fe55af 5345 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5346
99633e97
SM
5347 /* Remember the start point to rollback upon failure. */
5348 dfail = d;
5349
6fdd04b0 5350#ifndef emacs
25fe55af
RS
5351 /* This is written out as an if-else so we don't waste time
5352 testing `translate' inside the loop. */
28703c16 5353 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5354 do
5355 {
5356 PREFETCH ();
5357 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5358 {
6fdd04b0
KH
5359 d = dfail;
5360 goto fail;
e934739e 5361 }
6fdd04b0
KH
5362 d++;
5363 }
5364 while (--mcnt);
fa9a63c5 5365 else
6fdd04b0
KH
5366 do
5367 {
5368 PREFETCH ();
5369 if (*d++ != *p++)
bf216479 5370 {
6fdd04b0
KH
5371 d = dfail;
5372 goto fail;
bf216479 5373 }
6fdd04b0
KH
5374 }
5375 while (--mcnt);
5376#else /* emacs */
5377 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5378 if (target_multibyte)
6fdd04b0
KH
5379 do
5380 {
5381 int pat_charlen, buf_charlen;
cf9c99bc 5382 int pat_ch, buf_ch;
e934739e 5383
6fdd04b0 5384 PREFETCH ();
cf9c99bc 5385 if (multibyte)
62a6e103 5386 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5387 else
5388 {
5389 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5390 pat_charlen = 1;
5391 }
62a6e103 5392 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5393
6fdd04b0 5394 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5395 {
6fdd04b0
KH
5396 d = dfail;
5397 goto fail;
e934739e 5398 }
bf216479 5399
6fdd04b0
KH
5400 p += pat_charlen;
5401 d += buf_charlen;
5402 mcnt -= pat_charlen;
5403 }
5404 while (mcnt > 0);
fa9a63c5 5405 else
6fdd04b0
KH
5406 do
5407 {
abbd1bcf 5408 int pat_charlen;
cf9c99bc 5409 int pat_ch, buf_ch;
bf216479 5410
6fdd04b0 5411 PREFETCH ();
cf9c99bc
KH
5412 if (multibyte)
5413 {
62a6e103 5414 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5415 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5416 }
5417 else
5418 {
5419 pat_ch = *p;
5420 pat_charlen = 1;
5421 }
5422 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5423 if (! CHAR_BYTE8_P (buf_ch))
5424 {
5425 buf_ch = TRANSLATE (buf_ch);
5426 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5427 if (buf_ch < 0)
5428 buf_ch = *d;
5429 }
0e2501ed
AS
5430 else
5431 buf_ch = *d;
cf9c99bc 5432 if (buf_ch != pat_ch)
6fdd04b0
KH
5433 {
5434 d = dfail;
5435 goto fail;
bf216479 5436 }
cf9c99bc
KH
5437 p += pat_charlen;
5438 d++;
6fdd04b0
KH
5439 }
5440 while (--mcnt);
5441#endif
25fe55af 5442 break;
fa9a63c5
RM
5443
5444
25fe55af 5445 /* Match any character except possibly a newline or a null. */
fa9a63c5 5446 case anychar:
e934739e
RS
5447 {
5448 int buf_charlen;
01618498 5449 re_wchar_t buf_ch;
fa9a63c5 5450
e934739e 5451 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5452
e934739e 5453 PREFETCH ();
62a6e103 5454 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5455 target_multibyte);
e934739e
RS
5456 buf_ch = TRANSLATE (buf_ch);
5457
5458 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5459 && buf_ch == '\n')
5460 || ((bufp->syntax & RE_DOT_NOT_NULL)
5461 && buf_ch == '\000'))
5462 goto fail;
5463
e934739e
RS
5464 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5465 d += buf_charlen;
5466 }
fa9a63c5
RM
5467 break;
5468
5469
5470 case charset:
5471 case charset_not:
5472 {
b18215fc 5473 register unsigned int c;
fa9a63c5 5474 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5475 int len;
5476
5477 /* Start of actual range_table, or end of bitmap if there is no
5478 range table. */
da053e48 5479 re_char *range_table IF_LINT (= NULL);
b18215fc 5480
96cc36cc 5481 /* Nonzero if there is a range table. */
b18215fc
RS
5482 int range_table_exists;
5483
96cc36cc
RS
5484 /* Number of ranges of range table. This is not included
5485 in the initial byte-length of the command. */
5486 int count = 0;
fa9a63c5 5487
f5020181
AS
5488 /* Whether matching against a unibyte character. */
5489 boolean unibyte_char = false;
5490
25fe55af 5491 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5492
b18215fc 5493 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5494
b18215fc 5495 if (range_table_exists)
96cc36cc
RS
5496 {
5497 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5498 EXTRACT_NUMBER_AND_INCR (count, range_table);
5499 }
b18215fc 5500
2d1675e4 5501 PREFETCH ();
62a6e103 5502 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5503 if (target_multibyte)
5504 {
5505 int c1;
b18215fc 5506
cf9c99bc
KH
5507 c = TRANSLATE (c);
5508 c1 = RE_CHAR_TO_UNIBYTE (c);
5509 if (c1 >= 0)
f5020181
AS
5510 {
5511 unibyte_char = true;
5512 c = c1;
5513 }
cf9c99bc
KH
5514 }
5515 else
5516 {
5517 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5518
5519 if (! CHAR_BYTE8_P (c1))
5520 {
5521 c1 = TRANSLATE (c1);
5522 c1 = RE_CHAR_TO_UNIBYTE (c1);
5523 if (c1 >= 0)
f5020181
AS
5524 {
5525 unibyte_char = true;
5526 c = c1;
5527 }
cf9c99bc 5528 }
0b8be006
AS
5529 else
5530 unibyte_char = true;
cf9c99bc
KH
5531 }
5532
f5020181 5533 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5534 { /* Lookup bitmap. */
b18215fc
RS
5535 /* Cast to `unsigned' instead of `unsigned char' in
5536 case the bit list is a full 32 bytes long. */
5537 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5538 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5539 not = !not;
b18215fc 5540 }
96cc36cc 5541#ifdef emacs
b18215fc 5542 else if (range_table_exists)
96cc36cc
RS
5543 {
5544 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5545
14473664
SM
5546 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5547 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5548 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5549 | (class_bits & BIT_SPACE && ISSPACE (c))
5550 | (class_bits & BIT_UPPER && ISUPPER (c))
5551 | (class_bits & BIT_WORD && ISWORD (c)))
5552 not = !not;
5553 else
5554 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5555 }
5556#endif /* emacs */
fa9a63c5 5557
96cc36cc
RS
5558 if (range_table_exists)
5559 p = CHARSET_RANGE_TABLE_END (range_table, count);
5560 else
5561 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5562
5563 if (!not) goto fail;
5e69f11e 5564
b18215fc 5565 d += len;
fa9a63c5 5566 }
8fb31792 5567 break;
fa9a63c5
RM
5568
5569
25fe55af 5570 /* The beginning of a group is represented by start_memory.
505bde11 5571 The argument is the register number. The text
25fe55af 5572 matched within the group is recorded (in the internal
7814e705 5573 registers data structure) under the register number. */
25fe55af 5574 case start_memory:
505bde11
SM
5575 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5576
5577 /* In case we need to undo this operation (via backtracking). */
5578 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5579
25fe55af 5580 regstart[*p] = d;
4bb91c68 5581 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5582 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5583
25fe55af 5584 /* Move past the register number. */
505bde11 5585 p += 1;
25fe55af 5586 break;
fa9a63c5
RM
5587
5588
25fe55af 5589 /* The stop_memory opcode represents the end of a group. Its
505bde11 5590 argument is the same as start_memory's: the register number. */
fa9a63c5 5591 case stop_memory:
505bde11
SM
5592 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5593
5594 assert (!REG_UNSET (regstart[*p]));
5595 /* Strictly speaking, there should be code such as:
177c0ea7 5596
0b32bf0e 5597 assert (REG_UNSET (regend[*p]));
505bde11
SM
5598 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5599
5600 But the only info to be pushed is regend[*p] and it is known to
5601 be UNSET, so there really isn't anything to push.
5602 Not pushing anything, on the other hand deprives us from the
5603 guarantee that regend[*p] is UNSET since undoing this operation
5604 will not reset its value properly. This is not important since
5605 the value will only be read on the next start_memory or at
5606 the very end and both events can only happen if this stop_memory
5607 is *not* undone. */
fa9a63c5 5608
25fe55af 5609 regend[*p] = d;
fa9a63c5
RM
5610 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5611
25fe55af 5612 /* Move past the register number. */
505bde11 5613 p += 1;
25fe55af 5614 break;
fa9a63c5
RM
5615
5616
5617 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5618 followed by the numeric value of <digit> as the register number. */
5619 case duplicate:
fa9a63c5 5620 {
66f0296e 5621 register re_char *d2, *dend2;
7814e705 5622 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5623 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5624
7814e705 5625 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5626 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5627 goto fail;
5e69f11e 5628
7814e705 5629 /* Where in input to try to start matching. */
25fe55af 5630 d2 = regstart[regno];
5e69f11e 5631
99633e97
SM
5632 /* Remember the start point to rollback upon failure. */
5633 dfail = d;
5634
25fe55af
RS
5635 /* Where to stop matching; if both the place to start and
5636 the place to stop matching are in the same string, then
5637 set to the place to stop, otherwise, for now have to use
5638 the end of the first string. */
fa9a63c5 5639
25fe55af 5640 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5641 == FIRST_STRING_P (regend[regno]))
5642 ? regend[regno] : end_match_1);
5643 for (;;)
5644 {
5645 /* If necessary, advance to next segment in register
25fe55af 5646 contents. */
fa9a63c5
RM
5647 while (d2 == dend2)
5648 {
5649 if (dend2 == end_match_2) break;
5650 if (dend2 == regend[regno]) break;
5651
25fe55af
RS
5652 /* End of string1 => advance to string2. */
5653 d2 = string2;
5654 dend2 = regend[regno];
fa9a63c5
RM
5655 }
5656 /* At end of register contents => success */
5657 if (d2 == dend2) break;
5658
5659 /* If necessary, advance to next segment in data. */
5660 PREFETCH ();
5661
5662 /* How many characters left in this segment to match. */
5663 mcnt = dend - d;
5e69f11e 5664
fa9a63c5 5665 /* Want how many consecutive characters we can match in
25fe55af
RS
5666 one shot, so, if necessary, adjust the count. */
5667 if (mcnt > dend2 - d2)
fa9a63c5 5668 mcnt = dend2 - d2;
5e69f11e 5669
fa9a63c5 5670 /* Compare that many; failure if mismatch, else move
25fe55af 5671 past them. */
28703c16 5672 if (RE_TRANSLATE_P (translate)
02cb78b5 5673 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5674 : memcmp (d, d2, mcnt))
99633e97
SM
5675 {
5676 d = dfail;
5677 goto fail;
5678 }
fa9a63c5 5679 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5680 }
5681 }
5682 break;
5683
5684
25fe55af 5685 /* begline matches the empty string at the beginning of the string
c0f9ea08 5686 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5687 case begline:
25fe55af 5688 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5689
25fe55af
RS
5690 if (AT_STRINGS_BEG (d))
5691 {
5692 if (!bufp->not_bol) break;
5693 }
419d1c74 5694 else
25fe55af 5695 {
bf216479 5696 unsigned c;
419d1c74 5697 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5698 if (c == '\n')
419d1c74 5699 break;
25fe55af
RS
5700 }
5701 /* In all other cases, we fail. */
5702 goto fail;
fa9a63c5
RM
5703
5704
25fe55af 5705 /* endline is the dual of begline. */
fa9a63c5 5706 case endline:
25fe55af 5707 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5708
25fe55af
RS
5709 if (AT_STRINGS_END (d))
5710 {
5711 if (!bufp->not_eol) break;
5712 }
f1ad044f 5713 else
25fe55af 5714 {
f1ad044f 5715 PREFETCH_NOLIMIT ();
c0f9ea08 5716 if (*d == '\n')
f1ad044f 5717 break;
25fe55af
RS
5718 }
5719 goto fail;
fa9a63c5
RM
5720
5721
5722 /* Match at the very beginning of the data. */
25fe55af
RS
5723 case begbuf:
5724 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5725 if (AT_STRINGS_BEG (d))
5726 break;
5727 goto fail;
fa9a63c5
RM
5728
5729
5730 /* Match at the very end of the data. */
25fe55af
RS
5731 case endbuf:
5732 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5733 if (AT_STRINGS_END (d))
5734 break;
25fe55af 5735 goto fail;
5e69f11e 5736
5e69f11e 5737
25fe55af
RS
5738 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5739 pushes NULL as the value for the string on the stack. Then
505bde11 5740 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5741 string, instead of restoring it. To see why, consider
7814e705 5742 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5743 then the . fails against the \n. But the next thing we want
5744 to do is match the \n against the \n; if we restored the
5745 string value, we would be back at the foo.
5746
5747 Because this is used only in specific cases, we don't need to
5748 check all the things that `on_failure_jump' does, to make
5749 sure the right things get saved on the stack. Hence we don't
5750 share its code. The only reason to push anything on the
5751 stack at all is that otherwise we would have to change
5752 `anychar's code to do something besides goto fail in this
5753 case; that seems worse than this. */
5754 case on_failure_keep_string_jump:
505bde11
SM
5755 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5756 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5757 mcnt, p + mcnt);
fa9a63c5 5758
505bde11
SM
5759 PUSH_FAILURE_POINT (p - 3, NULL);
5760 break;
5761
0683b6fa
SM
5762 /* A nasty loop is introduced by the non-greedy *? and +?.
5763 With such loops, the stack only ever contains one failure point
5764 at a time, so that a plain on_failure_jump_loop kind of
5765 cycle detection cannot work. Worse yet, such a detection
5766 can not only fail to detect a cycle, but it can also wrongly
5767 detect a cycle (between different instantiations of the same
6df42991 5768 loop).
0683b6fa
SM
5769 So the method used for those nasty loops is a little different:
5770 We use a special cycle-detection-stack-frame which is pushed
5771 when the on_failure_jump_nastyloop failure-point is *popped*.
5772 This special frame thus marks the beginning of one iteration
5773 through the loop and we can hence easily check right here
5774 whether something matched between the beginning and the end of
5775 the loop. */
5776 case on_failure_jump_nastyloop:
5777 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5778 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5779 mcnt, p + mcnt);
5780
5781 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5782 {
5783 int cycle = 0;
5784 CHECK_INFINITE_LOOP (p - 4, d);
5785 if (!cycle)
5786 /* If there's a cycle, just continue without pushing
5787 this failure point. The failure point is the "try again"
5788 option, which shouldn't be tried.
5789 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5790 PUSH_FAILURE_POINT (p - 3, d);
5791 }
0683b6fa
SM
5792 break;
5793
4e8a9132
SM
5794 /* Simple loop detecting on_failure_jump: just check on the
5795 failure stack if the same spot was already hit earlier. */
505bde11
SM
5796 case on_failure_jump_loop:
5797 on_failure:
5798 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5799 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5800 mcnt, p + mcnt);
6df42991
SM
5801 {
5802 int cycle = 0;
5803 CHECK_INFINITE_LOOP (p - 3, d);
5804 if (cycle)
5805 /* If there's a cycle, get out of the loop, as if the matching
5806 had failed. We used to just `goto fail' here, but that was
5807 aborting the search a bit too early: we want to keep the
5808 empty-loop-match and keep matching after the loop.
5809 We want (x?)*y\1z to match both xxyz and xxyxz. */
5810 p += mcnt;
5811 else
5812 PUSH_FAILURE_POINT (p - 3, d);
5813 }
25fe55af 5814 break;
fa9a63c5
RM
5815
5816
5817 /* Uses of on_failure_jump:
5e69f11e 5818
25fe55af
RS
5819 Each alternative starts with an on_failure_jump that points
5820 to the beginning of the next alternative. Each alternative
5821 except the last ends with a jump that in effect jumps past
5822 the rest of the alternatives. (They really jump to the
5823 ending jump of the following alternative, because tensioning
5824 these jumps is a hassle.)
fa9a63c5 5825
25fe55af
RS
5826 Repeats start with an on_failure_jump that points past both
5827 the repetition text and either the following jump or
5828 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5829 case on_failure_jump:
25fe55af 5830 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5831 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5832 mcnt, p + mcnt);
25fe55af 5833
505bde11 5834 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5835 break;
5836
4e8a9132 5837 /* This operation is used for greedy *.
505bde11
SM
5838 Compare the beginning of the repeat with what in the
5839 pattern follows its end. If we can establish that there
5840 is nothing that they would both match, i.e., that we
5841 would have to backtrack because of (as in, e.g., `a*a')
5842 then we can use a non-backtracking loop based on
4e8a9132 5843 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5844 case on_failure_jump_smart:
25fe55af 5845 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5846 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5847 mcnt, p + mcnt);
25fe55af 5848 {
01618498 5849 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5850 /* Here, we discard `const', making re_match non-reentrant. */
5851 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5852 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5853
505bde11
SM
5854 p -= 3; /* Reset so that we will re-execute the
5855 instruction once it's been changed. */
fa9a63c5 5856
4e8a9132
SM
5857 EXTRACT_NUMBER (mcnt, p2 - 2);
5858
5859 /* Ensure this is a indeed the trivial kind of loop
5860 we are expecting. */
5861 assert (skip_one_char (p1) == p2 - 3);
5862 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5863 DEBUG_STATEMENT (debug += 2);
505bde11 5864 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5865 {
505bde11 5866 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5867 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5868 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5869 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5870 }
505bde11 5871 else
fa9a63c5 5872 {
505bde11
SM
5873 /* Default to a safe `on_failure_jump' loop. */
5874 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5875 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5876 }
99633e97 5877 DEBUG_STATEMENT (debug -= 2);
25fe55af 5878 }
505bde11 5879 break;
25fe55af
RS
5880
5881 /* Unconditionally jump (without popping any failure points). */
5882 case jump:
fa9a63c5 5883 unconditional_jump:
5b370c2b 5884 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5885 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5886 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5887 p += mcnt; /* Do the jump. */
505bde11 5888 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5889 break;
5890
5891
25fe55af
RS
5892 /* Have to succeed matching what follows at least n times.
5893 After that, handle like `on_failure_jump'. */
5894 case succeed_n:
01618498 5895 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5896 EXTRACT_NUMBER (mcnt, p + 2);
5897 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5898
dc1e502d
SM
5899 /* Originally, mcnt is how many times we HAVE to succeed. */
5900 if (mcnt != 0)
25fe55af 5901 {
6dcf2d0e
SM
5902 /* Here, we discard `const', making re_match non-reentrant. */
5903 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5904 mcnt--;
01618498
SM
5905 p += 4;
5906 PUSH_NUMBER (p2, mcnt);
25fe55af 5907 }
dc1e502d
SM
5908 else
5909 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5910 goto on_failure;
25fe55af
RS
5911 break;
5912
5913 case jump_n:
01618498 5914 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5915 EXTRACT_NUMBER (mcnt, p + 2);
5916 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5917
5918 /* Originally, this is how many times we CAN jump. */
dc1e502d 5919 if (mcnt != 0)
25fe55af 5920 {
6dcf2d0e
SM
5921 /* Here, we discard `const', making re_match non-reentrant. */
5922 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5923 mcnt--;
01618498 5924 PUSH_NUMBER (p2, mcnt);
dc1e502d 5925 goto unconditional_jump;
25fe55af
RS
5926 }
5927 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5928 else
5929 p += 4;
25fe55af 5930 break;
5e69f11e 5931
fa9a63c5
RM
5932 case set_number_at:
5933 {
01618498 5934 unsigned char *p2; /* Location of the counter. */
25fe55af 5935 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5936
25fe55af 5937 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5938 /* Here, we discard `const', making re_match non-reentrant. */
5939 p2 = (unsigned char*) p + mcnt;
01618498 5940 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5941 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5942 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5943 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5944 break;
5945 }
9121ca40
KH
5946
5947 case wordbound:
66f0296e 5948 case notwordbound:
19ed5445
PE
5949 {
5950 boolean not = (re_opcode_t) *(p - 1) == notwordbound;
5951 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5952
19ed5445 5953 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5954
19ed5445
PE
5955 /* Case 1: D is at the beginning or the end of string. */
5956 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5957 not = !not;
5958 else
5959 {
5960 /* C1 is the character before D, S1 is the syntax of C1, C2
5961 is the character at D, and S2 is the syntax of C2. */
5962 re_wchar_t c1, c2;
5963 int s1, s2;
5964 int dummy;
b18215fc 5965#ifdef emacs
d1dfb56c
EZ
5966 ssize_t offset = PTR_TO_OFFSET (d - 1);
5967 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445 5968 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5969#endif
19ed5445
PE
5970 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5971 s1 = SYNTAX (c1);
b18215fc 5972#ifdef emacs
19ed5445 5973 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5974#endif
19ed5445
PE
5975 PREFETCH_NOLIMIT ();
5976 GET_CHAR_AFTER (c2, d, dummy);
5977 s2 = SYNTAX (c2);
5978
5979 if (/* Case 2: Only one of S1 and S2 is Sword. */
5980 ((s1 == Sword) != (s2 == Sword))
5981 /* Case 3: Both of S1 and S2 are Sword, and macro
5982 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5983 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5984 not = !not;
5985 }
5986 if (not)
5987 break;
5988 else
5989 goto fail;
5990 }
fa9a63c5
RM
5991
5992 case wordbeg:
25fe55af 5993 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5994
b18215fc
RS
5995 /* We FAIL in one of the following cases: */
5996
7814e705 5997 /* Case 1: D is at the end of string. */
b18215fc 5998 if (AT_STRINGS_END (d))
99633e97 5999 goto fail;
b18215fc
RS
6000 else
6001 {
6002 /* C1 is the character before D, S1 is the syntax of C1, C2
6003 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6004 re_wchar_t c1, c2;
6005 int s1, s2;
bf216479 6006 int dummy;
fa9a63c5 6007#ifdef emacs
d1dfb56c
EZ
6008 ssize_t offset = PTR_TO_OFFSET (d);
6009 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6010 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6011#endif
99633e97 6012 PREFETCH ();
6fdd04b0 6013 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6014 s2 = SYNTAX (c2);
177c0ea7 6015
b18215fc
RS
6016 /* Case 2: S2 is not Sword. */
6017 if (s2 != Sword)
6018 goto fail;
6019
6020 /* Case 3: D is not at the beginning of string ... */
6021 if (!AT_STRINGS_BEG (d))
6022 {
6023 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6024#ifdef emacs
5d967c7a 6025 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6026#endif
b18215fc
RS
6027 s1 = SYNTAX (c1);
6028
6029 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6030 returns 0. */
b18215fc
RS
6031 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6032 goto fail;
6033 }
6034 }
e318085a
RS
6035 break;
6036
b18215fc 6037 case wordend:
25fe55af 6038 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6039
6040 /* We FAIL in one of the following cases: */
6041
6042 /* Case 1: D is at the beginning of string. */
6043 if (AT_STRINGS_BEG (d))
e318085a 6044 goto fail;
b18215fc
RS
6045 else
6046 {
6047 /* C1 is the character before D, S1 is the syntax of C1, C2
6048 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6049 re_wchar_t c1, c2;
6050 int s1, s2;
bf216479 6051 int dummy;
5d967c7a 6052#ifdef emacs
d1dfb56c
EZ
6053 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6054 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6055 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6056#endif
99633e97 6057 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6058 s1 = SYNTAX (c1);
6059
6060 /* Case 2: S1 is not Sword. */
6061 if (s1 != Sword)
6062 goto fail;
6063
6064 /* Case 3: D is not at the end of string ... */
6065 if (!AT_STRINGS_END (d))
6066 {
f1ad044f 6067 PREFETCH_NOLIMIT ();
6fdd04b0 6068 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6069#ifdef emacs
6070 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6071#endif
b18215fc
RS
6072 s2 = SYNTAX (c2);
6073
6074 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6075 returns 0. */
b18215fc 6076 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6077 goto fail;
b18215fc
RS
6078 }
6079 }
e318085a
RS
6080 break;
6081
669fa600
SM
6082 case symbeg:
6083 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6084
6085 /* We FAIL in one of the following cases: */
6086
7814e705 6087 /* Case 1: D is at the end of string. */
669fa600
SM
6088 if (AT_STRINGS_END (d))
6089 goto fail;
6090 else
6091 {
6092 /* C1 is the character before D, S1 is the syntax of C1, C2
6093 is the character at D, and S2 is the syntax of C2. */
6094 re_wchar_t c1, c2;
6095 int s1, s2;
6096#ifdef emacs
d1dfb56c
EZ
6097 ssize_t offset = PTR_TO_OFFSET (d);
6098 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6099 UPDATE_SYNTAX_TABLE (charpos);
6100#endif
6101 PREFETCH ();
62a6e103 6102 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6103 s2 = SYNTAX (c2);
7814e705 6104
669fa600
SM
6105 /* Case 2: S2 is neither Sword nor Ssymbol. */
6106 if (s2 != Sword && s2 != Ssymbol)
6107 goto fail;
6108
6109 /* Case 3: D is not at the beginning of string ... */
6110 if (!AT_STRINGS_BEG (d))
6111 {
6112 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6113#ifdef emacs
6114 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6115#endif
6116 s1 = SYNTAX (c1);
6117
6118 /* ... and S1 is Sword or Ssymbol. */
6119 if (s1 == Sword || s1 == Ssymbol)
6120 goto fail;
6121 }
6122 }
6123 break;
6124
6125 case symend:
6126 DEBUG_PRINT1 ("EXECUTING symend.\n");
6127
6128 /* We FAIL in one of the following cases: */
6129
6130 /* Case 1: D is at the beginning of string. */
6131 if (AT_STRINGS_BEG (d))
6132 goto fail;
6133 else
6134 {
6135 /* C1 is the character before D, S1 is the syntax of C1, C2
6136 is the character at D, and S2 is the syntax of C2. */
6137 re_wchar_t c1, c2;
6138 int s1, s2;
6139#ifdef emacs
d1dfb56c
EZ
6140 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6141 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6142 UPDATE_SYNTAX_TABLE (charpos);
6143#endif
6144 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6145 s1 = SYNTAX (c1);
6146
6147 /* Case 2: S1 is neither Ssymbol nor Sword. */
6148 if (s1 != Sword && s1 != Ssymbol)
6149 goto fail;
6150
6151 /* Case 3: D is not at the end of string ... */
6152 if (!AT_STRINGS_END (d))
6153 {
6154 PREFETCH_NOLIMIT ();
62a6e103 6155 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6156#ifdef emacs
134579f2 6157 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6158#endif
6159 s2 = SYNTAX (c2);
6160
6161 /* ... and S2 is Sword or Ssymbol. */
6162 if (s2 == Sword || s2 == Ssymbol)
6163 goto fail;
b18215fc
RS
6164 }
6165 }
e318085a
RS
6166 break;
6167
fa9a63c5 6168 case syntaxspec:
1fb352e0 6169 case notsyntaxspec:
b18215fc 6170 {
19ed5445
PE
6171 boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6172 mcnt = *p++;
6173 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
6174 PREFETCH ();
6175#ifdef emacs
6176 {
d1dfb56c
EZ
6177 ssize_t offset = PTR_TO_OFFSET (d);
6178 ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445
PE
6179 UPDATE_SYNTAX_TABLE (pos1);
6180 }
25fe55af 6181#endif
19ed5445
PE
6182 {
6183 int len;
6184 re_wchar_t c;
b18215fc 6185
19ed5445
PE
6186 GET_CHAR_AFTER (c, d, len);
6187 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6188 goto fail;
6189 d += len;
6190 }
b18215fc 6191 }
8fb31792 6192 break;
fa9a63c5 6193
b18215fc 6194#ifdef emacs
1fb352e0
SM
6195 case before_dot:
6196 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6197 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6198 goto fail;
b18215fc
RS
6199 break;
6200
1fb352e0
SM
6201 case at_dot:
6202 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6203 if (PTR_BYTE_POS (d) != PT_BYTE)
6204 goto fail;
6205 break;
b18215fc 6206
1fb352e0
SM
6207 case after_dot:
6208 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6209 if (PTR_BYTE_POS (d) <= PT_BYTE)
6210 goto fail;
e318085a 6211 break;
fa9a63c5 6212
1fb352e0 6213 case categoryspec:
b18215fc 6214 case notcategoryspec:
b18215fc 6215 {
8fb31792
PE
6216 boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6217 mcnt = *p++;
6218 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n",
6219 not?"not":"", mcnt);
6220 PREFETCH ();
01618498 6221
8fb31792
PE
6222 {
6223 int len;
6224 re_wchar_t c;
6225 GET_CHAR_AFTER (c, d, len);
6226 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6227 goto fail;
6228 d += len;
6229 }
b18215fc 6230 }
fa9a63c5 6231 break;
5e69f11e 6232
1fb352e0 6233#endif /* emacs */
5e69f11e 6234
0b32bf0e
SM
6235 default:
6236 abort ();
fa9a63c5 6237 }
b18215fc 6238 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6239
6240
6241 /* We goto here if a matching operation fails. */
6242 fail:
5b370c2b 6243 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6244 if (!FAIL_STACK_EMPTY ())
505bde11 6245 {
01618498 6246 re_char *str, *pat;
505bde11 6247 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6248 DEBUG_PRINT1 ("\nFAIL:\n");
6249 POP_FAILURE_POINT (str, pat);
505bde11
SM
6250 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6251 {
6252 case on_failure_keep_string_jump:
6253 assert (str == NULL);
6254 goto continue_failure_jump;
6255
0683b6fa
SM
6256 case on_failure_jump_nastyloop:
6257 assert ((re_opcode_t)pat[-2] == no_op);
6258 PUSH_FAILURE_POINT (pat - 2, str);
6259 /* Fallthrough */
6260
505bde11
SM
6261 case on_failure_jump_loop:
6262 case on_failure_jump:
6263 case succeed_n:
6264 d = str;
6265 continue_failure_jump:
6266 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6267 p = pat + mcnt;
6268 break;
b18215fc 6269
0683b6fa
SM
6270 case no_op:
6271 /* A special frame used for nastyloops. */
6272 goto fail;
6273
505bde11 6274 default:
5e617bc2 6275 abort ();
505bde11 6276 }
fa9a63c5 6277
505bde11 6278 assert (p >= bufp->buffer && p <= pend);
b18215fc 6279
0b32bf0e 6280 if (d >= string1 && d <= end1)
fa9a63c5 6281 dend = end_match_1;
0b32bf0e 6282 }
fa9a63c5 6283 else
0b32bf0e 6284 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6285 } /* for (;;) */
6286
6287 if (best_regs_set)
6288 goto restore_best_regs;
6289
6290 FREE_VARIABLES ();
6291
b18215fc 6292 return -1; /* Failure to match. */
fa9a63c5
RM
6293} /* re_match_2 */
6294\f
6295/* Subroutine definitions for re_match_2. */
6296
fa9a63c5
RM
6297/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6298 bytes; nonzero otherwise. */
5e69f11e 6299
fa9a63c5 6300static int
d1dfb56c 6301bcmp_translate (const re_char *s1, const re_char *s2, register ssize_t len,
438105ed 6302 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6303{
2d1675e4
SM
6304 register re_char *p1 = s1, *p2 = s2;
6305 re_char *p1_end = s1 + len;
6306 re_char *p2_end = s2 + len;
e934739e 6307
4bb91c68
SM
6308 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6309 different lengths, but relying on a single `len' would break this. -sm */
6310 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6311 {
e934739e 6312 int p1_charlen, p2_charlen;
01618498 6313 re_wchar_t p1_ch, p2_ch;
e934739e 6314
6fdd04b0
KH
6315 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6316 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6317
6318 if (RE_TRANSLATE (translate, p1_ch)
6319 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6320 return 1;
e934739e
RS
6321
6322 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6323 }
e934739e
RS
6324
6325 if (p1 != p1_end || p2 != p2_end)
6326 return 1;
6327
fa9a63c5
RM
6328 return 0;
6329}
6330\f
6331/* Entry points for GNU code. */
6332
6333/* re_compile_pattern is the GNU regular expression compiler: it
6334 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6335 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6336
fa9a63c5
RM
6337 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6338 are set in BUFP on entry.
5e69f11e 6339
b18215fc 6340 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6341
6342const char *
d1dfb56c
EZ
6343re_compile_pattern (const char *pattern, size_t length,
6344 struct re_pattern_buffer *bufp)
fa9a63c5
RM
6345{
6346 reg_errcode_t ret;
5e69f11e 6347
fa9a63c5
RM
6348 /* GNU code is written to assume at least RE_NREGS registers will be set
6349 (and at least one extra will be -1). */
6350 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6351
fa9a63c5
RM
6352 /* And GNU code determines whether or not to get register information
6353 by passing null for the REGS argument to re_match, etc., not by
6354 setting no_sub. */
6355 bufp->no_sub = 0;
5e69f11e 6356
4bb91c68 6357 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6358
6359 if (!ret)
6360 return NULL;
6361 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6362}
c0f9ea08 6363WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6364\f
b18215fc
RS
6365/* Entry points compatible with 4.2 BSD regex library. We don't define
6366 them unless specifically requested. */
fa9a63c5 6367
0b32bf0e 6368#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6369
6370/* BSD has one and only one pattern buffer. */
6371static struct re_pattern_buffer re_comp_buf;
6372
6373char *
0b32bf0e 6374# ifdef _LIBC
48afdd44
RM
6375/* Make these definitions weak in libc, so POSIX programs can redefine
6376 these names if they don't use our functions, and still use
6377 regcomp/regexec below without link errors. */
6378weak_function
0b32bf0e 6379# endif
31011111 6380re_comp (const char *s)
fa9a63c5
RM
6381{
6382 reg_errcode_t ret;
5e69f11e 6383
fa9a63c5
RM
6384 if (!s)
6385 {
6386 if (!re_comp_buf.buffer)
0b32bf0e 6387 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6388 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6389 return 0;
6390 }
6391
6392 if (!re_comp_buf.buffer)
6393 {
6394 re_comp_buf.buffer = (unsigned char *) malloc (200);
6395 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6396 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6397 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6398 re_comp_buf.allocated = 200;
6399
6400 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6401 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6402 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6403 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6404 }
6405
6406 /* Since `re_exec' always passes NULL for the `regs' argument, we
6407 don't need to initialize the pattern buffer fields which affect it. */
6408
fa9a63c5 6409 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6410
fa9a63c5
RM
6411 if (!ret)
6412 return NULL;
6413
6414 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6415 return (char *) gettext (re_error_msgid[(int) ret]);
6416}
6417
6418
31011111 6419int
0b32bf0e 6420# ifdef _LIBC
48afdd44 6421weak_function
0b32bf0e 6422# endif
d1dfb56c 6423re_exec (const char *s)
fa9a63c5 6424{
d1dfb56c 6425 const size_t len = strlen (s);
fa9a63c5
RM
6426 return
6427 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6428}
6429#endif /* _REGEX_RE_COMP */
6430\f
6431/* POSIX.2 functions. Don't define these for Emacs. */
6432
6433#ifndef emacs
6434
6435/* regcomp takes a regular expression as a string and compiles it.
6436
b18215fc 6437 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6438 since POSIX says we shouldn't. Thus, we set
6439
6440 `buffer' to the compiled pattern;
6441 `used' to the length of the compiled pattern;
6442 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6443 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6444 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6445 `fastmap' to an allocated space for the fastmap;
6446 `fastmap_accurate' to zero;
fa9a63c5
RM
6447 `re_nsub' to the number of subexpressions in PATTERN.
6448
6449 PATTERN is the address of the pattern string.
6450
6451 CFLAGS is a series of bits which affect compilation.
6452
6453 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6454 use POSIX basic syntax.
6455
6456 If REG_NEWLINE is set, then . and [^...] don't match newline.
6457 Also, regexec will try a match beginning after every newline.
6458
6459 If REG_ICASE is set, then we consider upper- and lowercase
6460 versions of letters to be equivalent when matching.
6461
6462 If REG_NOSUB is set, then when PREG is passed to regexec, that
6463 routine will report only success or failure, and nothing about the
6464 registers.
6465
b18215fc 6466 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6467 the return codes and their meanings.) */
6468
d1dfb56c 6469reg_errcode_t
d2762c86
DN
6470regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6471 int cflags)
fa9a63c5
RM
6472{
6473 reg_errcode_t ret;
4bb91c68 6474 reg_syntax_t syntax
fa9a63c5
RM
6475 = (cflags & REG_EXTENDED) ?
6476 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6477
6478 /* regex_compile will allocate the space for the compiled pattern. */
6479 preg->buffer = 0;
6480 preg->allocated = 0;
6481 preg->used = 0;
5e69f11e 6482
c0f9ea08
SM
6483 /* Try to allocate space for the fastmap. */
6484 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6485
fa9a63c5
RM
6486 if (cflags & REG_ICASE)
6487 {
6488 unsigned i;
5e69f11e 6489
6676cb1c
RS
6490 preg->translate
6491 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6492 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6493 if (preg->translate == NULL)
0b32bf0e 6494 return (int) REG_ESPACE;
fa9a63c5
RM
6495
6496 /* Map uppercase characters to corresponding lowercase ones. */
6497 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6498 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6499 }
6500 else
6501 preg->translate = NULL;
6502
6503 /* If REG_NEWLINE is set, newlines are treated differently. */
6504 if (cflags & REG_NEWLINE)
6505 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6506 syntax &= ~RE_DOT_NEWLINE;
6507 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6508 }
6509 else
c0f9ea08 6510 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6511
6512 preg->no_sub = !!(cflags & REG_NOSUB);
6513
5e69f11e 6514 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6515 can use strlen here in compiling the pattern. */
4bb91c68 6516 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6517
fa9a63c5
RM
6518 /* POSIX doesn't distinguish between an unmatched open-group and an
6519 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6520 if (ret == REG_ERPAREN)
6521 ret = REG_EPAREN;
6522
6523 if (ret == REG_NOERROR && preg->fastmap)
6524 { /* Compute the fastmap now, since regexec cannot modify the pattern
6525 buffer. */
6526 re_compile_fastmap (preg);
6527 if (preg->can_be_null)
6528 { /* The fastmap can't be used anyway. */
6529 free (preg->fastmap);
6530 preg->fastmap = NULL;
6531 }
6532 }
d1dfb56c 6533 return ret;
fa9a63c5 6534}
c0f9ea08 6535WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6536
6537
6538/* regexec searches for a given pattern, specified by PREG, in the
6539 string STRING.
5e69f11e 6540
fa9a63c5 6541 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6542 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6543 least NMATCH elements, and we set them to the offsets of the
6544 corresponding matched substrings.
5e69f11e 6545
fa9a63c5
RM
6546 EFLAGS specifies `execution flags' which affect matching: if
6547 REG_NOTBOL is set, then ^ does not match at the beginning of the
6548 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6549
fa9a63c5
RM
6550 We return 0 if we find a match and REG_NOMATCH if not. */
6551
d1dfb56c 6552reg_errcode_t
d2762c86
DN
6553regexec (const regex_t *__restrict preg, const char *__restrict string,
6554 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
fa9a63c5 6555{
31011111 6556 regoff_t ret;
fa9a63c5
RM
6557 struct re_registers regs;
6558 regex_t private_preg;
d1dfb56c 6559 size_t len = strlen (string);
c0f9ea08 6560 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6561
6562 private_preg = *preg;
5e69f11e 6563
fa9a63c5
RM
6564 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6565 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6566
fa9a63c5
RM
6567 /* The user has told us exactly how many registers to return
6568 information about, via `nmatch'. We have to pass that on to the
b18215fc 6569 matching routines. */
fa9a63c5 6570 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6571
fa9a63c5
RM
6572 if (want_reg_info)
6573 {
6574 regs.num_regs = nmatch;
4bb91c68
SM
6575 regs.start = TALLOC (nmatch * 2, regoff_t);
6576 if (regs.start == NULL)
d1dfb56c 6577 return REG_NOMATCH;
4bb91c68 6578 regs.end = regs.start + nmatch;
fa9a63c5
RM
6579 }
6580
c0f9ea08
SM
6581 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6582 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6583 was a little bit longer but still only matching the real part.
6584 This works because the `endline' will check for a '\n' and will find a
6585 '\0', correctly deciding that this is not the end of a line.
6586 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6587 a convenient '\0' there. For all we know, the string could be preceded
6588 by '\n' which would throw things off. */
6589
fa9a63c5
RM
6590 /* Perform the searching operation. */
6591 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6592 /* start: */ 0, /* range: */ len,
6593 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6594
fa9a63c5
RM
6595 /* Copy the register information to the POSIX structure. */
6596 if (want_reg_info)
6597 {
6598 if (ret >= 0)
0b32bf0e
SM
6599 {
6600 unsigned r;
fa9a63c5 6601
0b32bf0e
SM
6602 for (r = 0; r < nmatch; r++)
6603 {
6604 pmatch[r].rm_so = regs.start[r];
6605 pmatch[r].rm_eo = regs.end[r];
6606 }
6607 }
fa9a63c5 6608
b18215fc 6609 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6610 free (regs.start);
fa9a63c5
RM
6611 }
6612
6613 /* We want zero return to mean success, unlike `re_search'. */
d1dfb56c 6614 return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
fa9a63c5 6615}
c0f9ea08 6616WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6617
6618
ec869672
JR
6619/* Returns a message corresponding to an error code, ERR_CODE, returned
6620 from either regcomp or regexec. We don't use PREG here.
6621
6622 ERR_CODE was previously called ERRCODE, but that name causes an
6623 error with msvc8 compiler. */
fa9a63c5
RM
6624
6625size_t
d2762c86 6626regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6627{
6628 const char *msg;
6629 size_t msg_size;
6630
ec869672
JR
6631 if (err_code < 0
6632 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6633 /* Only error codes returned by the rest of the code should be passed
b18215fc 6634 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6635 code generates an invalid error code, then the program has a bug.
6636 Dump core so we can fix it. */
6637 abort ();
6638
ec869672 6639 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6640
6641 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6642
fa9a63c5
RM
6643 if (errbuf_size != 0)
6644 {
6645 if (msg_size > errbuf_size)
0b32bf0e
SM
6646 {
6647 strncpy (errbuf, msg, errbuf_size - 1);
6648 errbuf[errbuf_size - 1] = 0;
6649 }
fa9a63c5 6650 else
0b32bf0e 6651 strcpy (errbuf, msg);
fa9a63c5
RM
6652 }
6653
6654 return msg_size;
6655}
c0f9ea08 6656WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6657
6658
6659/* Free dynamically allocated space used by PREG. */
6660
6661void
d2762c86 6662regfree (regex_t *preg)
fa9a63c5 6663{
c2cd06e6 6664 free (preg->buffer);
fa9a63c5 6665 preg->buffer = NULL;
5e69f11e 6666
fa9a63c5
RM
6667 preg->allocated = 0;
6668 preg->used = 0;
6669
c2cd06e6 6670 free (preg->fastmap);
fa9a63c5
RM
6671 preg->fastmap = NULL;
6672 preg->fastmap_accurate = 0;
6673
c2cd06e6 6674 free (preg->translate);
fa9a63c5
RM
6675 preg->translate = NULL;
6676}
c0f9ea08 6677WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6678
6679#endif /* not emacs */