* Makefile.in (ALL_CFLAGS): Add -I../lib -I${srcdir}/../lib.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
acaf905b 5 Copyright (C) 1993-2012 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
e468b87f 9 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
4fc5845f 19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 20 USA. */
fa9a63c5 21
6df42991 22/* TODO:
505bde11 23 - structure the opcode space into opcode+flag.
dc1e502d 24 - merge with glibc's regex.[ch].
01618498 25 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
26 need to modify the compiled regexp so that re_match can be reentrant.
27 - get rid of on_failure_jump_smart by doing the optimization in re_comp
28 rather than at run-time, so that re_match can be reentrant.
01618498 29*/
505bde11 30
fa9a63c5 31/* AIX requires this to be the first thing in the file. */
0b32bf0e 32#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
33 #pragma alloca
34#endif
35
b8df54ff
PE
36/* Ignore some GCC warnings for now. This section should go away
37 once the Emacs and Gnulib regex code is merged. */
38#if (__GNUC__ == 4 && 3 <= __GNUC_MINOR__) || 4 < __GNUC__
39# pragma GCC diagnostic ignored "-Wstrict-overflow"
40# ifndef emacs
41# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
42# pragma GCC diagnostic ignored "-Wunused-function"
43# pragma GCC diagnostic ignored "-Wunused-macros"
44# pragma GCC diagnostic ignored "-Wunused-result"
45# pragma GCC diagnostic ignored "-Wunused-variable"
46# endif
47#endif
48
fa9a63c5 49#ifdef HAVE_CONFIG_H
0b32bf0e 50# include <config.h>
fa9a63c5
RM
51#endif
52
0e926e56
PE
53#include <stddef.h>
54
55#ifdef emacs
4bb91c68
SM
56/* We need this for `regex.h', and perhaps for the Emacs include files. */
57# include <sys/types.h>
58#endif
fa9a63c5 59
14473664
SM
60/* Whether to use ISO C Amendment 1 wide char functions.
61 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
62#if defined _LIBC
63#define WIDE_CHAR_SUPPORT 1
64#else
14473664 65#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
66 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
67#endif
14473664 68
/* For platforms that support the ISO C Amendment 1 functionality, we
   support user-defined character classes.  */
a0ad02f7 71#if WIDE_CHAR_SUPPORT
14473664
SM
72/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
73# include <wchar.h>
74# include <wctype.h>
75#endif
76
c0f9ea08
SM
77#ifdef _LIBC
78/* We have to keep the namespace clean. */
79# define regfree(preg) __regfree (preg)
80# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
81# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672 82# define regerror(err_code, preg, errbuf, errbuf_size) \
5e617bc2 83 __regerror (err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
84# define re_set_registers(bu, re, nu, st, en) \
85 __re_set_registers (bu, re, nu, st, en)
86# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
87 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
88# define re_match(bufp, string, size, pos, regs) \
89 __re_match (bufp, string, size, pos, regs)
90# define re_search(bufp, string, size, startpos, range, regs) \
91 __re_search (bufp, string, size, startpos, range, regs)
92# define re_compile_pattern(pattern, length, bufp) \
93 __re_compile_pattern (pattern, length, bufp)
94# define re_set_syntax(syntax) __re_set_syntax (syntax)
95# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
96 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
97# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
98
14473664
SM
99/* Make sure we call libc's function even if the user overrides them. */
100# define btowc __btowc
101# define iswctype __iswctype
102# define wctype __wctype
103
c0f9ea08
SM
104# define WEAK_ALIAS(a,b) weak_alias (a, b)
105
106/* We are also using some library internals. */
107# include <locale/localeinfo.h>
108# include <locale/elem-hash.h>
109# include <langinfo.h>
110#else
111# define WEAK_ALIAS(a,b)
112#endif
113
4bb91c68 114/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 115#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
116# include <libintl.h>
117#else
118# define gettext(msgid) (msgid)
119#endif
120
5e69f11e
RM
121#ifndef gettext_noop
122/* This define is so xgettext can find the internationalizable
123 strings. */
0b32bf0e 124# define gettext_noop(String) String
5e69f11e
RM
125#endif
126
fa9a63c5
RM
127/* The `emacs' switch turns on certain matching commands
128 that make sense only in Emacs. */
129#ifdef emacs
130
d7306fe6 131# include <setjmp.h>
0b32bf0e 132# include "lisp.h"
e5560ff7 133# include "character.h"
0b32bf0e 134# include "buffer.h"
b18215fc
RS
135
136/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 137# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 138
0b32bf0e 139# include "syntax.h"
0b32bf0e 140# include "category.h"
fa9a63c5 141
7689ef0b
EZ
142# ifdef malloc
143# undef malloc
144# endif
0b32bf0e 145# define malloc xmalloc
7689ef0b
EZ
146# ifdef realloc
147# undef realloc
148# endif
0b32bf0e 149# define realloc xrealloc
7689ef0b
EZ
150# ifdef free
151# undef free
152# endif
0b32bf0e 153# define free xfree
9abbd165 154
7814e705 155/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
156# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
157# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
158
159# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 160# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
161# define RE_STRING_CHAR(p, multibyte) \
162 (multibyte ? (STRING_CHAR (p)) : (*(p)))
163# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
164 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 165
4c0354d7 166# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 167
2afc21f5 168# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 169
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) and STR2 (which ends at END2).  */
bf216479
KH
173# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
174 do { \
02cb78b5 175 if (target_multibyte) \
bf216479
KH
176 { \
177 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
178 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
179 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 180 c = STRING_CHAR (dtemp); \
bf216479
KH
181 } \
182 else \
183 { \
184 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 185 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 186 } \
2d1675e4
SM
187 } while (0)
188
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character at P, and
   set LEN to the byte length of that character.

   Reads the variable `target_multibyte' from the expansion site.  When
   it is nonzero the character is decoded with STRING_CHAR_AND_LENGTH;
   otherwise the single byte at P is taken and passed through
   RE_CHAR_TO_MULTIBYTE.

   P and LEN are parenthesized in the expansion so that arguments such
   as `buf + 1' are evaluated as a unit.  C and LEN must be lvalues.  */
# define GET_CHAR_AFTER(c, p, len)					\
  do {									\
    if (target_multibyte)						\
      (c) = STRING_CHAR_AND_LENGTH (p, len);				\
    else								\
      {									\
	(c) = *(p);							\
	(len) = 1;							\
	(c) = RE_CHAR_TO_MULTIBYTE (c);					\
      }									\
  } while (0)
4e8a9132 202
fa9a63c5
RM
203#else /* not emacs */
204
205/* If we are not linking with Emacs proper,
206 we can't use the relocating allocator
207 even if config.h says that we can. */
0b32bf0e 208# undef REL_ALLOC
fa9a63c5 209
4004364e 210# include <unistd.h>
fa9a63c5 211
a77f947b
CY
212/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
213
/* Allocate SIZE bytes with malloc and return the block.
   If malloc returns a null pointer for a nonzero SIZE, write a
   diagnostic to file descriptor 2 and terminate with exit status 1.
   A null result for SIZE == 0 is passed back to the caller as is.  */
static void *
xmalloc (size_t size)
{
  /* The length is derived from the array so that it cannot drift out
     of sync with the message text.  */
  static const char oom_msg[] = "virtual memory exhausted\n";
  void *val = malloc (size);

  if (!val && size)
    {
      /* Best effort only: we are about to exit, so a failed or short
	 write cannot be handled in any useful way.  */
      write (2, oom_msg, sizeof oom_msg - 1);
      exit (1);
    }
  return val;
}
226
/* Resize BLOCK to SIZE bytes and return the (possibly moved) block.
   A null BLOCK is handled by calling malloc directly, since some
   realloc implementations do not accept a null pointer.
   If the allocator returns a null pointer for a nonzero SIZE, write a
   diagnostic to file descriptor 2 and terminate with exit status 1.  */
static void *
xrealloc (void *block, size_t size)
{
  /* The length is derived from the array so that it cannot drift out
     of sync with the message text.  */
  static const char oom_msg[] = "virtual memory exhausted\n";
  void *val;

  if (!block)
    val = malloc (size);
  else
    val = realloc (block, size);

  if (!val && size)
    {
      /* Best effort only: we are about to exit, so a failed or short
	 write cannot be handled in any useful way.  */
      write (2, oom_msg, sizeof oom_msg - 1);
      exit (1);
    }
  return val;
}
244
a073faa6
CY
245# ifdef malloc
246# undef malloc
247# endif
248# define malloc xmalloc
249# ifdef realloc
250# undef realloc
251# endif
252# define realloc xrealloc
253
9cfdb3ec 254# include <string.h>
fa9a63c5
RM
255
256/* Define the syntax stuff for \<, \>, etc. */
257
990b2375 258/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 259enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 260
0b32bf0e 261# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 262
e934739e 263/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
264# define CHAR_CHARSET(c) 0
265# define CHARSET_LEADING_CODE_BASE(c) 0
266# define MAX_MULTIBYTE_LENGTH 1
267# define RE_MULTIBYTE_P(x) 0
bf216479 268# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
269# define WORD_BOUNDARY_P(c1, c2) (0)
270# define CHAR_HEAD_P(p) (1)
271# define SINGLE_BYTE_CHAR_P(c) (1)
272# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 273# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 274# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
275# define STRING_CHAR(p) (*(p))
276# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 277# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
278# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
279# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
280# define RE_CHAR_TO_MULTIBYTE(c) (c)
281# define RE_CHAR_TO_UNIBYTE(c) (c)
/* Non-Emacs versions: every character is a single byte.

   GET_CHAR_BEFORE_2 sets C to the byte just before P; when P equals
   STR2 the byte just before END1 is used instead.  STR1 and END2 are
   accepted but not used by this variant.

   GET_CHAR_AFTER sets C to the byte at P and LEN to 1.

   All arguments are parenthesized in the expansions so that
   expressions such as `s + 1' are evaluated as a unit.  */
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)			\
  ((c) = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
# define GET_CHAR_AFTER(c, p, len)					\
  ((c) = *(p), (len) = 1)
0b32bf0e 286# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
287# define BYTE8_TO_CHAR(c) (c)
288# define CHAR_BYTE8_P(c) (0)
bf216479 289# define CHAR_LEADING_CODE(c) (c)
8f924df7 290
fa9a63c5 291#endif /* not emacs */
4e8a9132
SM
292
293#ifndef RE_TRANSLATE
0b32bf0e
SM
294# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
295# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 296#endif
fa9a63c5
RM
297\f
298/* Get the interface, including the syntax bits. */
299#include "regex.h"
300
f71b19b6
DL
301/* isalpha etc. are used for the character classes. */
302#include <ctype.h>
fa9a63c5 303
f71b19b6 304#ifdef emacs
fa9a63c5 305
f71b19b6 306/* 1 if C is an ASCII character. */
0b32bf0e 307# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 308
f71b19b6 309/* 1 if C is a unibyte character. */
0b32bf0e 310# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 311
f71b19b6 312/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 313
f71b19b6 314/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
315# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
316# define ISCNTRL(c) ((c) < ' ')
317# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
318 || ((c) >= 'a' && (c) <= 'f') \
319 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
320
321/* This is only used for single-byte characters. */
0b32bf0e 322# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
323
324/* The rest must handle multibyte characters. */
325
0b32bf0e 326# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 327 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
328 : 1)
329
14473664 330# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 331 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
332 : 1)
333
0b32bf0e 334# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
335 ? (((c) >= 'a' && (c) <= 'z') \
336 || ((c) >= 'A' && (c) <= 'Z') \
337 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
338 : SYNTAX (c) == Sword)
339
0b32bf0e 340# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
341 ? (((c) >= 'a' && (c) <= 'z') \
342 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
343 : SYNTAX (c) == Sword)
344
5da9919f 345# define ISLOWER(c) lowercasep (c)
96cc36cc 346
0b32bf0e 347# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
348 ? ((c) > ' ' && (c) < 0177 \
349 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
350 || ((c) >= 'A' && (c) <= 'Z') \
351 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
352 : SYNTAX (c) != Sword)
353
0b32bf0e 354# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 355
5da9919f 356# define ISUPPER(c) uppercasep (c)
96cc36cc 357
0b32bf0e 358# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
359
360#else /* not emacs */
361
f71b19b6 362/* 1 if C is an ASCII character. */
0b32bf0e 363# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
364
365/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
366# define ISUNIBYTE(c) 1
367
368# ifdef isblank
0e926e56 369# define ISBLANK(c) isblank (c)
0b32bf0e
SM
370# else
371# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
372# endif
373# ifdef isgraph
0e926e56 374# define ISGRAPH(c) isgraph (c)
0b32bf0e 375# else
0e926e56 376# define ISGRAPH(c) (isprint (c) && !isspace (c))
0b32bf0e
SM
377# endif
378
0e926e56 379/* Solaris defines ISPRINT so we must undefine it first. */
4bb91c68 380# undef ISPRINT
0e926e56
PE
381# define ISPRINT(c) isprint (c)
382# define ISDIGIT(c) isdigit (c)
383# define ISALNUM(c) isalnum (c)
384# define ISALPHA(c) isalpha (c)
385# define ISCNTRL(c) iscntrl (c)
386# define ISLOWER(c) islower (c)
387# define ISPUNCT(c) ispunct (c)
388# define ISSPACE(c) isspace (c)
389# define ISUPPER(c) isupper (c)
390# define ISXDIGIT(c) isxdigit (c)
0b32bf0e 391
5e617bc2 392# define ISWORD(c) ISALPHA (c)
0b32bf0e 393
4bb91c68 394# ifdef _tolower
5e617bc2 395# define TOLOWER(c) _tolower (c)
4bb91c68 396# else
5e617bc2 397# define TOLOWER(c) tolower (c)
4bb91c68
SM
398# endif
399
400/* How many characters in the character set. */
401# define CHAR_SET_SIZE 256
402
0b32bf0e 403# ifdef SYNTAX_TABLE
f71b19b6 404
0b32bf0e 405extern char *re_syntax_table;
f71b19b6 406
0b32bf0e
SM
407# else /* not SYNTAX_TABLE */
408
0b32bf0e
SM
409static char re_syntax_table[CHAR_SET_SIZE];
410
411static void
d2762c86 412init_syntax_once (void)
0b32bf0e
SM
413{
414 register int c;
415 static int done = 0;
416
417 if (done)
418 return;
419
72af86bd 420 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 421
4bb91c68
SM
422 for (c = 0; c < CHAR_SET_SIZE; ++c)
423 if (ISALNUM (c))
424 re_syntax_table[c] = Sword;
fa9a63c5 425
669fa600 426 re_syntax_table['_'] = Ssymbol;
fa9a63c5 427
0b32bf0e
SM
428 done = 1;
429}
430
431# endif /* not SYNTAX_TABLE */
96cc36cc 432
4bb91c68
SM
433# define SYNTAX(c) re_syntax_table[(c)]
434
96cc36cc
RS
435#endif /* not emacs */
436\f
261cb4bb 437#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
438\f
439/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
440 use `alloca' instead of `malloc'. This is because using malloc in
441 re_search* or re_match* could cause memory leaks when C-g is used in
442 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
443 the other hand, malloc is more portable, and easier to debug.
444
fa9a63c5
RM
445 Because we sometimes use alloca, some routines have to be macros,
446 not functions -- `alloca'-allocated space disappears at the end of the
447 function it is called in. */
448
449#ifdef REGEX_MALLOC
450
0b32bf0e
SM
451# define REGEX_ALLOCATE malloc
452# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
453# define REGEX_FREE free
fa9a63c5
RM
454
455#else /* not REGEX_MALLOC */
456
457/* Emacs already defines alloca, sometimes. */
0b32bf0e 458# ifndef alloca
fa9a63c5
RM
459
460/* Make alloca work the best possible way. */
0b32bf0e
SM
461# ifdef __GNUC__
462# define alloca __builtin_alloca
463# else /* not __GNUC__ */
7f585e7a 464# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
465# include <alloca.h>
466# endif /* HAVE_ALLOCA_H */
467# endif /* not __GNUC__ */
fa9a63c5 468
0b32bf0e 469# endif /* not alloca */
fa9a63c5 470
0b32bf0e 471# define REGEX_ALLOCATE alloca
fa9a63c5
RM
472
473/* Assumes a `char *destination' variable. */
0b32bf0e 474# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 475 (destination = (char *) alloca (nsize), \
4bb91c68 476 memcpy (destination, source, osize))
fa9a63c5
RM
477
478/* No need to do anything to free, after alloca. */
0b32bf0e 479# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
480
481#endif /* not REGEX_MALLOC */
482
483/* Define how to allocate the failure stack. */
484
0b32bf0e 485#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 486
0b32bf0e 487# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 488 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 489# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 490 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 491# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
492 r_alloc_free (&failure_stack_ptr)
493
4297555e 494#else /* not using relocating allocator */
fa9a63c5 495
0b32bf0e 496# ifdef REGEX_MALLOC
fa9a63c5 497
0b32bf0e
SM
498# define REGEX_ALLOCATE_STACK malloc
499# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
500# define REGEX_FREE_STACK free
fa9a63c5 501
0b32bf0e 502# else /* not REGEX_MALLOC */
fa9a63c5 503
0b32bf0e 504# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 505
0b32bf0e 506# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 507 REGEX_REALLOCATE (source, osize, nsize)
7814e705 508/* No need to explicitly free anything. */
0b32bf0e 509# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 510
0b32bf0e 511# endif /* not REGEX_MALLOC */
4297555e 512#endif /* not using relocating allocator */
fa9a63c5
RM
513
514
515/* True if `size1' is non-NULL and PTR is pointing anywhere inside
516 `string1' or just past its end. This works if PTR is NULL, which is
517 a good thing. */
25fe55af 518#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
519 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
520
521/* (Re)Allocate N items of type T using malloc, or fail. */
522#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
523#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
fa9a63c5
RM
524#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
525
4bb91c68 526#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
527
528#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
529
530#undef MAX
531#undef MIN
532#define MAX(a, b) ((a) > (b) ? (a) : (b))
533#define MIN(a, b) ((a) < (b) ? (a) : (b))
534
66f0296e 535/* Type of source-pattern and string chars. */
a6fc3b5c
EZ
536#ifdef _MSC_VER
537typedef unsigned char re_char;
538#else
66f0296e 539typedef const unsigned char re_char;
a6fc3b5c 540#endif
66f0296e 541
fa9a63c5
RM
542typedef char boolean;
543#define false 0
544#define true 1
545
261cb4bb
PE
546static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
547 re_char *string1, size_t size1,
548 re_char *string2, size_t size2,
549 ssize_t pos,
550 struct re_registers *regs,
551 ssize_t stop);
fa9a63c5
RM
552\f
553/* These are the command codes that appear in compiled regular
4bb91c68 554 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
555 command code can specify any interpretation whatsoever for its
556 arguments. Zero bytes may appear in the compiled regular expression. */
557
typedef enum
{
  no_op = 0,

  /* Succeed immediately, with no further backtracking.  */
  succeed,

  /* One byte N follows, then N literal bytes.  */
  exactn,

  /* Match (more or less) any character.  */
  anychar,

  /* Match any one character that belongs to the given set.  The byte
     after the opcode is the number of bitmap bytes; the bitmap itself
     follows, saying which characters are members.  Bits within each
     byte are ordered low-bit-first, and a character is a member when
     its bit is 1.  A character too large to have a bit in the map is
     automatically not a member.

     When the length byte has its 0x80 bit set, the bitmap is followed
     by a range table:
	 2 bytes of flags for character sets (low 8 bits, high 8 bits);
	   see RANGE_TABLE_WORK_BITS below.
	 2 bytes, the number of pairs that follow (up to 32767)
	 the pairs, each made of 2 multibyte characters,
	   each multibyte character represented as 3 bytes.  */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is NOT one of those specified.  */
  charset_not,

  /* Begin remembering the matched text so it can be stored in a
     register.  One byte follows with the register number, in the
     range 0 to one less than the pattern buffer's re_nsub field.  */
  start_memory,

  /* Stop remembering the matched text and store it in a memory
     register.  One byte follows with the register number, in the
     range 0 to one less than `re_nsub' in the pattern buffer.  */
  stop_memory,

  /* Match a duplicate of something remembered earlier.  One byte
     follows containing the register number.  */
  duplicate,

  /* Fail unless at the beginning of a line.  */
  begline,

  /* Fail unless at the end of a line.  */
  endline,

  /* Succeed if at the beginning of the buffer (if emacs) or at the
     beginning of the string to be matched (if not).  */
  begbuf,

  /* The same, for the end of the buffer/string.  */
  endbuf,

  /* A two-byte relative address to jump to follows.  */
  jump,

  /* A two-byte relative address follows, giving the place to resume
     at in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but when executed it pushes a placeholder
     instead of the current string position.  */
  on_failure_keep_string_jump,

  /* Just like `on_failure_jump', except that it checks that we do not
     get stuck in an infinite loop (matching an empty string
     indefinitely).  */
  on_failure_jump_loop,

  /* Just like `on_failure_jump_loop', except that it checks for a
     different kind of loop (the kind that shows up with non-greedy
     operators).  This operation has to be immediately preceded by a
     `no_op'.  */
  on_failure_jump_nastyloop,

  /* A smart `on_failure_jump' used for the greedy * and + operators.
     It analyzes the loop it is placed before; if that loop does not
     require backtracking, it changes itself to
     `on_failure_keep_string_jump' and short-circuits the loop,
     otherwise it defaults to changing itself into `on_failure_jump'.
     It assumes that it is pointing to just past a `jump'.  */
  on_failure_jump_smart,

  /* A two-byte relative address and a two-byte number N follow.
     After matching N times, jump to the address upon failure.
     Does not work if N starts at 0: use on_failure_jump_loop
     instead.  */
  succeed_n,

  /* A two-byte relative address and a two-byte number N follow.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the subsequent
     two-byte number.  The address *includes* the two bytes of
     number.  */
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound,	/* Succeeds if not at a word boundary.  */

  symbeg,	/* Succeeds if at symbol beginning.  */
  symend,	/* Succeeds if at symbol end.  */

  /* Match any character whose syntax is the one specified.  A byte
     follows containing a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Match any character whose syntax is not the one specified.  */
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Match any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Match any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
697\f
698/* Common operations on the compiled pattern. */
699
700/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
701
702#define STORE_NUMBER(destination, number) \
703 do { \
704 (destination)[0] = (number) & 0377; \
705 (destination)[1] = (number) >> 8; \
706 } while (0)
707
708/* Same as STORE_NUMBER, except increment DESTINATION to
709 the byte after where the number is stored. Therefore, DESTINATION
710 must be an lvalue. */
711
712#define STORE_NUMBER_AND_INCR(destination, number) \
713 do { \
714 STORE_NUMBER (destination, number); \
715 (destination) += 2; \
716 } while (0)
717
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte is the low-order byte; the second byte is
   sign-extended and forms the high-order part, so the result may be
   negative.

   The high byte is scaled by multiplication rather than by `<< 8':
   left-shifting a negative value is undefined behavior in C, whereas
   the product of a signed char value and 256 always fits in an int.  */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
726
727#ifdef DEBUG
728static void
261cb4bb 729extract_number (int *dest, re_char *source)
fa9a63c5 730{
5e69f11e 731 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
732 *dest = *source & 0377;
733 *dest += temp << 8;
734}
735
4bb91c68 736# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
737# undef EXTRACT_NUMBER
738# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
739# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
740
741#endif /* DEBUG */
742
743/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
744 SOURCE must be an lvalue. */
745
746#define EXTRACT_NUMBER_AND_INCR(destination, source) \
747 do { \
748 EXTRACT_NUMBER (destination, source); \
25fe55af 749 (source) += 2; \
fa9a63c5
RM
750 } while (0)
751
752#ifdef DEBUG
753static void
261cb4bb 754extract_number_and_incr (int *destination, re_char **source)
5e69f11e 755{
fa9a63c5
RM
756 extract_number (destination, *source);
757 *source += 2;
758}
759
0b32bf0e
SM
760# ifndef EXTRACT_MACROS
761# undef EXTRACT_NUMBER_AND_INCR
762# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 763 extract_number_and_incr (&dest, &src)
0b32bf0e 764# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
765
766#endif /* DEBUG */
767\f
b18215fc
RS
768/* Store a multibyte character in three contiguous bytes starting
769 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 770 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
771
772#define STORE_CHARACTER_AND_INCR(destination, character) \
773 do { \
774 (destination)[0] = (character) & 0377; \
775 (destination)[1] = ((character) >> 8) & 0377; \
776 (destination)[2] = (character) >> 16; \
777 (destination) += 3; \
778 } while (0)
779
780/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 781 starting at SOURCE. */
b18215fc
RS
782
783#define EXTRACT_CHARACTER(destination, source) \
784 do { \
785 (destination) = ((source)[0] \
786 | ((source)[1] << 8) \
787 | ((source)[2] << 16)); \
788 } while (0)
789
790
791/* Macros for charset. */
792
793/* Size of bitmap of charset P in bytes. P is a start of charset,
794 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
795#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
796
797/* Nonzero if charset P has range table. */
25fe55af 798#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
799
/* Return the address of the range table of charset P.  This is not the
   start of the table proper but the position just before it, where the
   number of ranges is stored.  The `4 +' skips the re_opcode_t byte, the
   bitmap-size byte, and the 2 bytes of flags at the start of the range
   table.  */
804#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
805
806/* Extract the bit flags that start a range table. */
807#define CHARSET_RANGE_TABLE_BITS(p) \
808 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
809 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc 810
b18215fc 811/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
812 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
813 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
814 and end. */
815#define CHARSET_RANGE_TABLE_END(range_table, count) \
816 ((range_table) + (count) * 2 * 3)
817
7814e705 818/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
819 COUNT is number of ranges in RANGE_TABLE. */
820#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
821 do \
822 { \
01618498 823 re_wchar_t range_start, range_end; \
19ed5445 824 re_char *rtp; \
01618498 825 re_char *range_table_end \
b18215fc
RS
826 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
827 \
19ed5445 828 for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
b18215fc 829 { \
19ed5445
PE
830 EXTRACT_CHARACTER (range_start, rtp); \
831 EXTRACT_CHARACTER (range_end, rtp + 3); \
b18215fc
RS
832 \
833 if (range_start <= (c) && (c) <= range_end) \
834 { \
835 (not) = !(not); \
836 break; \
837 } \
838 } \
839 } \
840 while (0)
841
842/* Test if C is in range table of CHARSET. The flag NOT is negated if
843 C is listed in it. */
844#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
845 do \
846 { \
847 /* Number of ranges in range table. */ \
848 int count; \
01618498
SM
849 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
850 \
b18215fc
RS
851 EXTRACT_NUMBER_AND_INCR (count, range_table); \
852 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
853 } \
854 while (0)
855\f
fa9a63c5
RM
856/* If DEBUG is defined, Regex prints many voluminous messages about what
857 it is doing (if the variable `debug' is nonzero). If linked with the
858 main program in `iregex.c', you can enter patterns and strings
859 interactively. And if linked with the main program in `main.c' and
4bb91c68 860 the other test files, you can run the already-written tests. */
fa9a63c5
RM
861
862#ifdef DEBUG
863
864/* We use standard I/O for debugging. */
0b32bf0e 865# include <stdio.h>
fa9a63c5
RM
866
867/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 868# include <assert.h>
fa9a63c5 869
99633e97 870static int debug = -100000;
fa9a63c5 871
0b32bf0e
SM
# define DEBUG_STATEMENT(e) e
/* Debug output helpers; they print only while `debug' is positive.
   Each one expands to a single `do ... while (0)' statement, so that
   writing

       if (cond) DEBUG_PRINT1 ("..."); else other ();

   keeps the `else' attached to the outer `if'.  A bare
   `if (debug > 0) ...' expansion would capture that `else'.  */
# define DEBUG_PRINT1(x)                                                \
  do { if (debug > 0) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2)                                           \
  do { if (debug > 0) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3)                                       \
  do { if (debug > 0) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4)                                   \
  do { if (debug > 0) printf (x1, x2, x3, x4); } while (0)
/* P is accepted but not used; only the range S .. E is printed.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                          \
  do { if (debug > 0) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                 \
  do { if (debug > 0) print_double_string (w, s1, sz1, s2, sz2); } while (0)
fa9a63c5
RM
881
882
883/* Print the fastmap in human-readable form. */
884
885void
886print_fastmap (fastmap)
887 char *fastmap;
888{
889 unsigned was_a_range = 0;
5e69f11e
RM
890 unsigned i = 0;
891
fa9a63c5
RM
892 while (i < (1 << BYTEWIDTH))
893 {
894 if (fastmap[i++])
895 {
896 was_a_range = 0;
25fe55af
RS
897 putchar (i - 1);
898 while (i < (1 << BYTEWIDTH) && fastmap[i])
899 {
900 was_a_range = 1;
901 i++;
902 }
fa9a63c5 903 if (was_a_range)
25fe55af
RS
904 {
905 printf ("-");
906 putchar (i - 1);
907 }
908 }
fa9a63c5 909 }
5e69f11e 910 putchar ('\n');
fa9a63c5
RM
911}
912
913
914/* Print a compiled pattern string in human-readable form, starting at
915 the START pointer into it and ending just before the pointer END. */
916
917void
918print_partial_compiled_pattern (start, end)
01618498
SM
919 re_char *start;
920 re_char *end;
fa9a63c5
RM
921{
922 int mcnt, mcnt2;
01618498
SM
923 re_char *p = start;
924 re_char *pend = end;
fa9a63c5
RM
925
926 if (start == NULL)
927 {
a1a052df 928 fprintf (stderr, "(null)\n");
fa9a63c5
RM
929 return;
930 }
5e69f11e 931
fa9a63c5
RM
932 /* Loop over pattern commands. */
933 while (p < pend)
934 {
a1a052df 935 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
936
937 switch ((re_opcode_t) *p++)
938 {
25fe55af 939 case no_op:
a1a052df 940 fprintf (stderr, "/no_op");
25fe55af 941 break;
fa9a63c5 942
99633e97 943 case succeed:
a1a052df 944 fprintf (stderr, "/succeed");
99633e97
SM
945 break;
946
fa9a63c5
RM
947 case exactn:
948 mcnt = *p++;
a1a052df 949 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 950 do
fa9a63c5 951 {
a1a052df 952 fprintf (stderr, "/%c", *p++);
25fe55af
RS
953 }
954 while (--mcnt);
955 break;
fa9a63c5
RM
956
957 case start_memory:
a1a052df 958 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 959 break;
fa9a63c5
RM
960
961 case stop_memory:
a1a052df 962 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 963 break;
fa9a63c5
RM
964
965 case duplicate:
a1a052df 966 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
967 break;
968
969 case anychar:
a1a052df 970 fprintf (stderr, "/anychar");
fa9a63c5
RM
971 break;
972
973 case charset:
25fe55af
RS
974 case charset_not:
975 {
976 register int c, last = -100;
fa9a63c5 977 register int in_range = 0;
99633e97
SM
978 int length = CHARSET_BITMAP_SIZE (p - 1);
979 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 980
a1a052df 981 fprintf (stderr, "/charset [%s",
839966f3 982 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 983
839966f3
KH
984 if (p + *p >= pend)
985 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 986
25fe55af 987 for (c = 0; c < 256; c++)
96cc36cc 988 if (c / 8 < length
fa9a63c5
RM
989 && (p[1 + (c/8)] & (1 << (c % 8))))
990 {
991 /* Are we starting a range? */
992 if (last + 1 == c && ! in_range)
993 {
a1a052df 994 fprintf (stderr, "-");
fa9a63c5
RM
995 in_range = 1;
996 }
997 /* Have we broken a range? */
998 else if (last + 1 != c && in_range)
96cc36cc 999 {
a1a052df 1000 fprintf (stderr, "%c", last);
fa9a63c5
RM
1001 in_range = 0;
1002 }
5e69f11e 1003
fa9a63c5 1004 if (! in_range)
a1a052df 1005 fprintf (stderr, "%c", c);
fa9a63c5
RM
1006
1007 last = c;
25fe55af 1008 }
fa9a63c5
RM
1009
1010 if (in_range)
a1a052df 1011 fprintf (stderr, "%c", last);
fa9a63c5 1012
a1a052df 1013 fprintf (stderr, "]");
fa9a63c5 1014
99633e97 1015 p += 1 + length;
96cc36cc 1016
96cc36cc 1017 if (has_range_table)
99633e97
SM
1018 {
1019 int count;
a1a052df 1020 fprintf (stderr, "has-range-table");
99633e97
SM
1021
1022 /* ??? Should print the range table; for now, just skip it. */
1023 p += 2; /* skip range table bits */
1024 EXTRACT_NUMBER_AND_INCR (count, p);
1025 p = CHARSET_RANGE_TABLE_END (p, count);
1026 }
fa9a63c5
RM
1027 }
1028 break;
1029
1030 case begline:
a1a052df 1031 fprintf (stderr, "/begline");
25fe55af 1032 break;
fa9a63c5
RM
1033
1034 case endline:
a1a052df 1035 fprintf (stderr, "/endline");
25fe55af 1036 break;
fa9a63c5
RM
1037
1038 case on_failure_jump:
25fe55af 1039 extract_number_and_incr (&mcnt, &p);
a1a052df 1040 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1041 break;
fa9a63c5
RM
1042
1043 case on_failure_keep_string_jump:
25fe55af 1044 extract_number_and_incr (&mcnt, &p);
a1a052df 1045 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1046 break;
fa9a63c5 1047
0683b6fa
SM
1048 case on_failure_jump_nastyloop:
1049 extract_number_and_incr (&mcnt, &p);
a1a052df 1050 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1051 break;
1052
505bde11 1053 case on_failure_jump_loop:
fa9a63c5 1054 extract_number_and_incr (&mcnt, &p);
a1a052df 1055 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1056 break;
1057
505bde11 1058 case on_failure_jump_smart:
fa9a63c5 1059 extract_number_and_incr (&mcnt, &p);
a1a052df 1060 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1061 break;
1062
25fe55af 1063 case jump:
fa9a63c5 1064 extract_number_and_incr (&mcnt, &p);
a1a052df 1065 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1066 break;
1067
25fe55af
RS
1068 case succeed_n:
1069 extract_number_and_incr (&mcnt, &p);
1070 extract_number_and_incr (&mcnt2, &p);
a1a052df 1071 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1072 break;
5e69f11e 1073
25fe55af
RS
1074 case jump_n:
1075 extract_number_and_incr (&mcnt, &p);
1076 extract_number_and_incr (&mcnt2, &p);
a1a052df 1077 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1078 break;
5e69f11e 1079
25fe55af
RS
1080 case set_number_at:
1081 extract_number_and_incr (&mcnt, &p);
1082 extract_number_and_incr (&mcnt2, &p);
a1a052df 1083 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1084 break;
5e69f11e 1085
25fe55af 1086 case wordbound:
a1a052df 1087 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1088 break;
1089
1090 case notwordbound:
a1a052df 1091 fprintf (stderr, "/notwordbound");
25fe55af 1092 break;
fa9a63c5
RM
1093
1094 case wordbeg:
a1a052df 1095 fprintf (stderr, "/wordbeg");
fa9a63c5 1096 break;
5e69f11e 1097
fa9a63c5 1098 case wordend:
a1a052df 1099 fprintf (stderr, "/wordend");
e2543b02 1100 break;
5e69f11e 1101
669fa600 1102 case symbeg:
e2543b02 1103 fprintf (stderr, "/symbeg");
669fa600
SM
1104 break;
1105
1106 case symend:
e2543b02 1107 fprintf (stderr, "/symend");
669fa600 1108 break;
5e69f11e 1109
1fb352e0 1110 case syntaxspec:
a1a052df 1111 fprintf (stderr, "/syntaxspec");
1fb352e0 1112 mcnt = *p++;
a1a052df 1113 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1114 break;
1115
1116 case notsyntaxspec:
a1a052df 1117 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1118 mcnt = *p++;
a1a052df 1119 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1120 break;
1121
0b32bf0e 1122# ifdef emacs
fa9a63c5 1123 case before_dot:
a1a052df 1124 fprintf (stderr, "/before_dot");
25fe55af 1125 break;
fa9a63c5
RM
1126
1127 case at_dot:
a1a052df 1128 fprintf (stderr, "/at_dot");
25fe55af 1129 break;
fa9a63c5
RM
1130
1131 case after_dot:
a1a052df 1132 fprintf (stderr, "/after_dot");
25fe55af 1133 break;
fa9a63c5 1134
1fb352e0 1135 case categoryspec:
a1a052df 1136 fprintf (stderr, "/categoryspec");
fa9a63c5 1137 mcnt = *p++;
a1a052df 1138 fprintf (stderr, "/%d", mcnt);
25fe55af 1139 break;
5e69f11e 1140
1fb352e0 1141 case notcategoryspec:
a1a052df 1142 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1143 mcnt = *p++;
a1a052df 1144 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1145 break;
0b32bf0e 1146# endif /* emacs */
fa9a63c5 1147
fa9a63c5 1148 case begbuf:
a1a052df 1149 fprintf (stderr, "/begbuf");
25fe55af 1150 break;
fa9a63c5
RM
1151
1152 case endbuf:
a1a052df 1153 fprintf (stderr, "/endbuf");
25fe55af 1154 break;
fa9a63c5 1155
25fe55af 1156 default:
a1a052df 1157 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1158 }
1159
a1a052df 1160 fprintf (stderr, "\n");
fa9a63c5
RM
1161 }
1162
a1a052df 1163 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1164}
1165
1166
1167void
1168print_compiled_pattern (bufp)
1169 struct re_pattern_buffer *bufp;
1170{
01618498 1171 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1172
1173 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1174 printf ("%ld bytes used/%ld bytes allocated.\n",
1175 bufp->used, bufp->allocated);
fa9a63c5
RM
1176
1177 if (bufp->fastmap_accurate && bufp->fastmap)
1178 {
1179 printf ("fastmap: ");
1180 print_fastmap (bufp->fastmap);
1181 }
1182
1183 printf ("re_nsub: %d\t", bufp->re_nsub);
1184 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1185 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1186 printf ("no_sub: %d\t", bufp->no_sub);
1187 printf ("not_bol: %d\t", bufp->not_bol);
1188 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1189 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1190 fflush (stdout);
fa9a63c5
RM
1191 /* Perhaps we should print the translate table? */
1192}
1193
1194
1195void
1196print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1197 re_char *where;
1198 re_char *string1;
1199 re_char *string2;
d1dfb56c
EZ
1200 ssize_t size1;
1201 ssize_t size2;
fa9a63c5 1202{
d1dfb56c 1203 ssize_t this_char;
5e69f11e 1204
fa9a63c5
RM
1205 if (where == NULL)
1206 printf ("(null)");
1207 else
1208 {
1209 if (FIRST_STRING_P (where))
25fe55af
RS
1210 {
1211 for (this_char = where - string1; this_char < size1; this_char++)
1212 putchar (string1[this_char]);
fa9a63c5 1213
25fe55af
RS
1214 where = string2;
1215 }
fa9a63c5
RM
1216
1217 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1218 putchar (string2[this_char]);
fa9a63c5
RM
1219 }
1220}
1221
1222#else /* not DEBUG */
1223
0b32bf0e
SM
1224# undef assert
1225# define assert(e)
fa9a63c5 1226
0b32bf0e
SM
1227# define DEBUG_STATEMENT(e)
1228# define DEBUG_PRINT1(x)
1229# define DEBUG_PRINT2(x1, x2)
1230# define DEBUG_PRINT3(x1, x2, x3)
1231# define DEBUG_PRINT4(x1, x2, x3, x4)
1232# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1233# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1234
1235#endif /* not DEBUG */
1236\f
4da60324
PE
1237/* Use this to suppress gcc's `...may be used before initialized' warnings. */
1238#ifdef lint
1239# define IF_LINT(Code) Code
1240#else
1241# define IF_LINT(Code) /* empty */
1242#endif
1243\f
fa9a63c5
RM
1244/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1245 also be assigned to arbitrarily: each pattern buffer stores its own
1246 syntax, so it can be changed between regex compilations. */
1247/* This has no initializer because initialized variables in Emacs
1248 become read-only after dumping. */
1249reg_syntax_t re_syntax_options;
1250
1251
1252/* Specify the precise syntax of regexps for compilation. This provides
1253 for compatibility for various utilities which historically have
1254 different, incompatible syntaxes.
1255
1256 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1257 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1258
1259reg_syntax_t
971de7fb 1260re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1261{
1262 reg_syntax_t ret = re_syntax_options;
5e69f11e 1263
fa9a63c5
RM
1264 re_syntax_options = syntax;
1265 return ret;
1266}
c0f9ea08 1267WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1268
1269/* Regexp to use to replace spaces, or NULL meaning don't. */
1270static re_char *whitespace_regexp;
1271
1272void
971de7fb 1273re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1274{
6470ea05 1275 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1276}
1277WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1278\f
1279/* This table gives an error message for each of the error codes listed
4bb91c68 1280 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1281 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1282 but why not be nice? */
fa9a63c5
RM
1283
1284static const char *re_error_msgid[] =
5e69f11e
RM
1285 {
1286 gettext_noop ("Success"), /* REG_NOERROR */
1287 gettext_noop ("No match"), /* REG_NOMATCH */
1288 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1289 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1290 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1291 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1292 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1293 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1294 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1295 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1296 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1297 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1298 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1299 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1300 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1301 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1302 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1303 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1304 };
1305\f
4bb91c68 1306/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1307
1308/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1309 searching and matching functions should not call alloca. On some
1310 systems, alloca is implemented in terms of malloc, and if we're
1311 using the relocating allocator routines, then malloc could cause a
1312 relocation, which might (if the strings being searched are in the
1313 ralloc heap) shift the data out from underneath the regexp
1314 routines.
1315
5e69f11e 1316 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1317 processes input from X in a signal handler; processing X input may
1318 call malloc; if input arrives while a matching routine is calling
1319 malloc, then we're scrod. But Emacs can't just block input while
1320 calling matching routines; then we don't notice interrupts when
1321 they come in. So, Emacs blocks input around all regexp calls
1322 except the matching calls, which it leaves unprotected, in the
1323 faith that they will not malloc. */
1324
1325/* Normally, this is fine. */
1326#define MATCH_MAY_ALLOCATE
1327
fa9a63c5
RM
1328/* The match routines may not allocate if (1) they would do it with malloc
1329 and (2) it's not safe for them to use malloc.
1330 Note that if REL_ALLOC is defined, matching would not use malloc for the
1331 failure stack, but we would still use it for the register vectors;
4bb91c68 1332 so REL_ALLOC should not affect this. */
b588157e 1333#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1334# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1335#endif
1336
1337\f
1338/* Failure stack declarations and macros; both re_compile_fastmap and
1339 re_match_2 use a failure stack. These have to be macros because of
1340 REGEX_ALLOCATE_STACK. */
5e69f11e 1341
fa9a63c5 1342
320a2a73 1343/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1344 when matching. If this number is exceeded, we allocate more
1345 space, so it is not a hard limit. */
1346#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1347# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1348#endif
1349
1350/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1351 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1352 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1353 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1354 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1355# if defined MATCH_MAY_ALLOCATE
1356/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1357 whose default stack limit is 2mb. In order for a larger
1358 value to work reliably, you have to try to make it accord
1359 with the process stack limit. */
c0f9ea08
SM
1360size_t re_max_failures = 40000;
1361# else
1362size_t re_max_failures = 4000;
1363# endif
fa9a63c5
RM
1364
1365union fail_stack_elt
1366{
01618498 1367 re_char *pointer;
c0f9ea08
SM
1368 /* This should be the biggest `int' that's no bigger than a pointer. */
1369 long integer;
fa9a63c5
RM
1370};
1371
1372typedef union fail_stack_elt fail_stack_elt_t;
1373
1374typedef struct
1375{
1376 fail_stack_elt_t *stack;
c0f9ea08
SM
1377 size_t size;
1378 size_t avail; /* Offset of next open position. */
1379 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1380} fail_stack_type;
1381
505bde11 1382#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1383
1384
1385/* Define macros to initialize and free the failure stack.
1386 Do `return -2' if the alloc fails. */
1387
1388#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1389# define INIT_FAIL_STACK() \
fa9a63c5
RM
1390 do { \
1391 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1392 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1393 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1394 \
1395 if (fail_stack.stack == NULL) \
1396 return -2; \
1397 \
1398 fail_stack.size = INIT_FAILURE_ALLOC; \
1399 fail_stack.avail = 0; \
505bde11 1400 fail_stack.frame = 0; \
fa9a63c5 1401 } while (0)
fa9a63c5 1402#else
0b32bf0e 1403# define INIT_FAIL_STACK() \
fa9a63c5
RM
1404 do { \
1405 fail_stack.avail = 0; \
505bde11 1406 fail_stack.frame = 0; \
fa9a63c5
RM
1407 } while (0)
1408
b313f9d8
PE
1409# define RETALLOC_IF(addr, n, t) \
1410 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
fa9a63c5
RM
1411#endif
1412
1413
320a2a73
KH
1414/* Grow FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a limit
1415 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1416
1417 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1418 allocating space for it or it was already too large.
1419
4bb91c68 1420 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1421
320a2a73
KH
1422/* Factor to increase the failure stack size by
1423 when we increase it.
1424 This used to be 2, but 2 was too wasteful
1425 because the old discarded stacks added up to as much space
1426 as the ultimate, maximum-size stack. */
1427#define FAIL_STACK_GROWTH_FACTOR 4
1428
1429#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1430 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1431 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1432 ? 0 \
320a2a73
KH
1433 : ((fail_stack).stack \
1434 = (fail_stack_elt_t *) \
25fe55af
RS
1435 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1436 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1437 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1438 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1439 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1440 \
1441 (fail_stack).stack == NULL \
1442 ? 0 \
6453db45
KH
1443 : ((fail_stack).size \
1444 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1445 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1446 * FAIL_STACK_GROWTH_FACTOR)) \
1447 / sizeof (fail_stack_elt_t)), \
25fe55af 1448 1)))
fa9a63c5
RM
1449
1450
fa9a63c5
RM
1451/* Push a pointer value onto the failure stack.
1452 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1453 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1454#define PUSH_FAILURE_POINTER(item) \
01618498 1455 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1456
1457/* This pushes an integer-valued item onto the failure stack.
1458 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1459 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1460#define PUSH_FAILURE_INT(item) \
1461 fail_stack.stack[fail_stack.avail++].integer = (item)
1462
b313f9d8 1463/* These POP... operations complement the PUSH... operations.
fa9a63c5
RM
1464 All assume that `fail_stack' is nonempty. */
1465#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1466#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
fa9a63c5 1467
505bde11
SM
1468/* Individual items aside from the registers. */
1469#define NUM_NONREG_ITEMS 3
1470
1471/* Used to examine the stack (to detect infinite loops). */
1472#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1473#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1474#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1475#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1476
1477
505bde11
SM
/* Grow the failure stack until more than SPACE slots are free.
   Does `return -2' from the enclosing function if GROW_FAIL_STACK
   fails.  Assumes the variable `fail_stack'.

   The sizes are size_t values; the debugging output converts them to
   unsigned long and prints them with %lu.  */
#define ENSURE_FAIL_STACK(space)                                        \
while (REMAINING_AVAIL_SLOTS <= (space)) {                              \
  if (!GROW_FAIL_STACK (fail_stack))                                    \
    return -2;                                                          \
  DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n",                    \
                (unsigned long) (fail_stack).size);                     \
  DEBUG_PRINT2 (" slots available: %lu\n",                              \
                (unsigned long) REMAINING_AVAIL_SLOTS);                 \
}
1485
1486/* Push register NUM onto the stack. */
1487#define PUSH_FAILURE_REG(num) \
1488do { \
1489 char *destination; \
1490 ENSURE_FAIL_STACK(3); \
1491 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1492 num, regstart[num], regend[num]); \
1493 PUSH_FAILURE_POINTER (regstart[num]); \
1494 PUSH_FAILURE_POINTER (regend[num]); \
1495 PUSH_FAILURE_INT (num); \
1496} while (0)
1497
01618498
SM
1498/* Change the counter's value to VAL, but make sure that it will
1499 be reset when backtracking. */
1500#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1501do { \
1502 char *destination; \
1503 int c; \
1504 ENSURE_FAIL_STACK(3); \
1505 EXTRACT_NUMBER (c, ptr); \
01618498 1506 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1507 PUSH_FAILURE_INT (c); \
1508 PUSH_FAILURE_POINTER (ptr); \
1509 PUSH_FAILURE_INT (-1); \
01618498 1510 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1511} while (0)
1512
/* Pop one saved item off the failure stack.

   The integer on top of the stack tells what the item is.  -1 marks
   a saved counter: the pointer and the old value are popped and the
   old value is stored back through the pointer with STORE_NUMBER.
   Any other value is a register number: regend and regstart for
   that register are restored, in that order.

   PFREG is a long, so the debugging output prints it with %ld.  */
#define POP_FAILURE_REG_OR_COUNT()                                      \
do {                                                                    \
  long pfreg = POP_FAILURE_INT ();                                      \
  if (pfreg == -1)                                                      \
    {                                                                   \
      /* It's a counter.  */                                            \
      /* Here, we discard `const', making re_match non-reentrant.  */   \
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER ();     \
      pfreg = POP_FAILURE_INT ();                                       \
      STORE_NUMBER (ptr, pfreg);                                        \
      DEBUG_PRINT3 (" Pop counter %p = %ld\n", ptr, pfreg);             \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      regend[pfreg] = POP_FAILURE_POINTER ();                           \
      regstart[pfreg] = POP_FAILURE_POINTER ();                         \
      DEBUG_PRINT4 (" Pop reg %ld (spanning %p -> %p)\n",               \
                    pfreg, regstart[pfreg], regend[pfreg]);             \
    }                                                                   \
} while (0)
1534
/* Check that we are not stuck in an infinite loop.

   Starting from the top failure frame, follow the chain of frames
   whose saved string position is STRING_PLACE or NULL.  If one of
   them has PAT_CUR as its saved pattern position, set `cycle' to 1.
   Assumes the variables `fail_stack', `bufp' and `cycle'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)                      \
do {                                                                    \
  ssize_t failure = TOP_FAILURE_HANDLE ();                              \
  /* Check for infinite matching loops */                               \
  while (failure > 0                                                    \
         && (FAILURE_STR (failure) == string_place                      \
             || FAILURE_STR (failure) == NULL))                         \
    {                                                                   \
      assert (FAILURE_PAT (failure) >= bufp->buffer                     \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
      if (FAILURE_PAT (failure) == pat_cur)                             \
        {                                                               \
          cycle = 1;                                                    \
          break;                                                        \
        }                                                               \
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure));     \
      failure = NEXT_FAILURE_HANDLE(failure);                           \
    }                                                                   \
  /* When the chain is exhausted FAILURE is 0 and names no frame;       \
     FAILURE_STR (0) would index the stack at -2, so print the string   \
     only while FAILURE still refers to a frame.  */                    \
  if (failure > 0)                                                      \
    {                                                                   \
      DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure));      \
    }                                                                   \
} while (0)
6df42991 1556
/* Push the information about the state we will need
   if we ever fail back to it.

   Three items are pushed, in this order: the current frame index,
   STRING_PLACE and PATTERN.  The frame index is then moved to the
   new top of the stack.

   Requires the variable `fail_stack'; the debugging output also uses
   `string1', `size1', `string2', `size2', `pend' and
   `nfailure_points_pushed'.  `destination' is declared here because
   growing the stack requires it.

   Does `return -2' (through ENSURE_FAIL_STACK) if it runs out of
   memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place)                       \
do {                                                                    \
  char *destination;                                                    \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_pushed++);                           \
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n");                             \
  DEBUG_PRINT2 (" Before push, next avail: %lu\n",                      \
                (unsigned long) (fail_stack).avail);                    \
  DEBUG_PRINT2 (" size: %lu\n",                                         \
                (unsigned long) (fail_stack).size);                     \
                                                                        \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);                                 \
                                                                        \
  DEBUG_PRINT1 ("\n");                                                  \
                                                                        \
  DEBUG_PRINT2 (" Push frame index: %lu\n",                             \
                (unsigned long) fail_stack.frame);                      \
  PUSH_FAILURE_INT (fail_stack.frame);                                  \
                                                                        \
  DEBUG_PRINT2 (" Push string %p: `", string_place);                    \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT1 ("'\n");                                                 \
  PUSH_FAILURE_POINTER (string_place);                                  \
                                                                        \
  DEBUG_PRINT2 (" Push pattern %p: ", pattern);                         \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);                   \
  PUSH_FAILURE_POINTER (pattern);                                       \
                                                                        \
  /* Close the frame by moving the frame pointer past it.  */           \
  fail_stack.frame = fail_stack.avail;                                  \
} while (0)
fa9a63c5 1596
320a2a73
KH
1597/* Estimate the size of data pushed by a typical failure stack entry.
1598 An estimate is all we need, because all we use this for
1599 is to choose a limit for how big to make the failure stack. */
ada30c0e 1600/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1601#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1602
fa9a63c5
RM
1603/* How many items can still be added to the stack without overflowing it. */
1604#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1605
1606
/* Pops what PUSH_FAILURE_POINT pushes.

   Every register or counter saved above the current frame is
   restored first (see POP_FAILURE_REG_OR_COUNT).  Then we restore
   into the parameters, both of which should be lvalues:
     PAT -- the saved pattern position.
     STR -- the saved data position.
   Finally the previous frame index is restored.

   Also assumes the variables `fail_stack', `regstart', `regend' and
   (if debugging) `pend', `string1', `size1', `string2', `size2' and
   `nfailure_points_popped'.  */

#define POP_FAILURE_POINT(str, pat)                                     \
do {                                                                    \
  assert (!FAIL_STACK_EMPTY ());                                        \
                                                                        \
  /* Remove failure points and point to how many regs pushed.  */       \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");                                \
  DEBUG_PRINT2 (" Before pop, next avail: %lu\n",                       \
                (unsigned long) fail_stack.avail);                      \
  DEBUG_PRINT2 (" size: %lu\n",                                         \
                (unsigned long) fail_stack.size);                       \
                                                                        \
  /* Pop the saved registers.  */                                       \
  while (fail_stack.frame < fail_stack.avail)                           \
    POP_FAILURE_REG_OR_COUNT ();                                        \
                                                                        \
  pat = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT2 (" Popping pattern %p: ", pat);                          \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
                                                                        \
  /* A NULL saved string location comes from an                         \
     on_failure_keep_string_jump opcode.  It is assigned to STR like    \
     any other value; NOTE(review): the caller has to decide what to    \
     do with a NULL here -- confirm at the call sites.  */              \
  str = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT2 (" Popping string %p: `", str);                          \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
  DEBUG_PRINT1 ("'\n");                                                 \
                                                                        \
  fail_stack.frame = POP_FAILURE_INT ();                                \
  DEBUG_PRINT2 (" Popping frame index: %lu\n",                          \
                (unsigned long) fail_stack.frame);                      \
                                                                        \
  /* `avail' is a size_t and cannot be negative, so only the frame      \
     ordering is worth asserting.  */                                   \
  assert (fail_stack.frame <= fail_stack.avail);                        \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_popped++);                           \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1650
1651
1652\f
fa9a63c5 1653/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1654#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1655\f
1656/* Subroutine declarations and macros for regex_compile. */
1657
261cb4bb
PE
1658static reg_errcode_t regex_compile (re_char *pattern, size_t size,
1659 reg_syntax_t syntax,
1660 struct re_pattern_buffer *bufp);
1661static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1662static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1663static void insert_op1 (re_opcode_t op, unsigned char *loc,
1664 int arg, unsigned char *end);
1665static void insert_op2 (re_opcode_t op, unsigned char *loc,
1666 int arg1, int arg2, unsigned char *end);
1667static boolean at_begline_loc_p (re_char *pattern, re_char *p,
1668 reg_syntax_t syntax);
1669static boolean at_endline_loc_p (re_char *p, re_char *pend,
1670 reg_syntax_t syntax);
1671static re_char *skip_one_char (re_char *p);
1672static int analyse_first (re_char *p, re_char *pend,
1673 char *fastmap, const int multibyte);
fa9a63c5 1674
fa9a63c5 1675/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1676 translation. */
36595814 1677#define PATFETCH(c) \
2d1675e4
SM
1678 do { \
1679 int len; \
1680 if (p == pend) return REG_EEND; \
62a6e103 1681 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
2d1675e4 1682 p += len; \
fa9a63c5
RM
1683 } while (0)
1684
fa9a63c5
RM
1685
1686/* If `translate' is non-null, return translate[D], else just D. We
1687 cast the subscript to translate because some data is declared as
1688 `char *', to avoid warnings when a string constant is passed. But
1689 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1690#ifndef TRANSLATE
0b32bf0e 1691# define TRANSLATE(d) \
66f0296e 1692 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1693#endif
fa9a63c5
RM
1694
1695
1696/* Macros for outputting the compiled pattern into `buffer'. */
1697
1698/* If the buffer isn't allocated when it comes in, use this. */
1699#define INIT_BUF_SIZE 32
1700
4bb91c68 1701/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1702#define GET_BUFFER_SPACE(n) \
01618498 1703 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1704 EXTEND_BUFFER ()
1705
1706/* Make sure we have one more byte of buffer space and then add C to it. */
1707#define BUF_PUSH(c) \
1708 do { \
1709 GET_BUFFER_SPACE (1); \
1710 *b++ = (unsigned char) (c); \
1711 } while (0)
1712
1713
1714/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1715#define BUF_PUSH_2(c1, c2) \
1716 do { \
1717 GET_BUFFER_SPACE (2); \
1718 *b++ = (unsigned char) (c1); \
1719 *b++ = (unsigned char) (c2); \
1720 } while (0)
1721
1722
fa9a63c5 1723/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1724 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1725#define STORE_JUMP(op, loc, to) \
1726 store_op1 (op, loc, (to) - (loc) - 3)
1727
1728/* Likewise, for a two-argument jump. */
1729#define STORE_JUMP2(op, loc, to, arg) \
1730 store_op2 (op, loc, (to) - (loc) - 3, arg)
1731
4bb91c68 1732/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1733#define INSERT_JUMP(op, loc, to) \
1734 insert_op1 (op, loc, (to) - (loc) - 3, b)
1735
1736/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1737#define INSERT_JUMP2(op, loc, to, arg) \
1738 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1739
1740
/* Upper bound on the size of a compiled pattern.  The limit is not
   arbitrary: arguments that represent offsets into the pattern are
   two bytes long, so if 2^15 bytes turns out to be too small, many
   things would have to change.  */
#define MAX_BUF_SIZE (1L << 15)

#if 0  /* This is when we thought it could be 2^16 bytes.  */
/* Any other compiler which, like MSC, has allocation limit below 2^16
   bytes will have to use approach similar to what was done below for
   MSC and drop MAX_BUF_SIZE a bit.  Otherwise you may end up
   reallocating to 0 bytes.  Such thing is not going to work too well.
   You have been warned!!  */
#if defined _MSC_VER  && !defined WIN32
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.  */
# define MAX_BUF_SIZE  65500L
#else
# define MAX_BUF_SIZE (1L << 16)
#endif
#endif /* 0 */
fa9a63c5
RM
1759
/* Double the allocated size of the pattern buffer (capped at
   MAX_BUF_SIZE) via RETALLOC, then re-point every local pointer that
   pointed into the old block at the same offset in the new one.

   If the buffer already has MAX_BUF_SIZE bytes, this returns REG_ESIZE
   from the surrounding function; if RETALLOC leaves `bufp->buffer'
   null, it returns REG_ESPACE from the surrounding function.

   The macro relies on these names being in scope at the point of use:
   bufp, b, begalt, fixup_alt_jump, laststart and pending_exact.

   NOTE(review): both early returns are plain `return' statements, so
   they do not run the cleanup that FREE_STACK_RETURN performs --
   confirm whether callers can leak their compile stack on this path.  */
#if __BOUNDED_POINTERS__
# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
# define MOVE_BUFFER_POINTER(P)						\
  (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer),		\
   SET_HIGH_BOUND (P),							\
   __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND					\
  else									\
    {									\
      SET_HIGH_BOUND (b);						\
      SET_HIGH_BOUND (begalt);						\
      if (fixup_alt_jump)						\
	SET_HIGH_BOUND (fixup_alt_jump);				\
      if (laststart)							\
	SET_HIGH_BOUND (laststart);					\
      if (pending_exact)						\
	SET_HIGH_BOUND (pending_exact);					\
    }
#else
# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
#define EXTEND_BUFFER()							\
  do {									\
    unsigned char *old_buffer = bufp->buffer;				\
    if (bufp->allocated == MAX_BUF_SIZE)				\
      return REG_ESIZE;							\
    bufp->allocated <<= 1;						\
    if (bufp->allocated > MAX_BUF_SIZE)					\
      bufp->allocated = MAX_BUF_SIZE;					\
    RETALLOC (bufp->buffer, bufp->allocated, unsigned char);		\
    if (!bufp->buffer)							\
      return REG_ESPACE;						\
    /* Only a buffer that moved needs its pointers relocated.  */	\
    if (bufp->buffer != old_buffer)					\
      {									\
	unsigned char *new_buffer = bufp->buffer;			\
	MOVE_BUFFER_POINTER (b);					\
	MOVE_BUFFER_POINTER (begalt);					\
	if (fixup_alt_jump)						\
	  MOVE_BUFFER_POINTER (fixup_alt_jump);				\
	if (laststart)							\
	  MOVE_BUFFER_POINTER (laststart);				\
	if (pending_exact)						\
	  MOVE_BUFFER_POINTER (pending_exact);				\
      }									\
    ELSE_EXTEND_BUFFER_HIGH_BOUND					\
  } while (0)
1812
1813
/* The register-number argument of {start,stop}_memory is a single
   byte, so the largest group number we can report things about is the
   largest value that fits in that byte.  */
#define MAX_REGNUM 255

/* Patterns may still contain more than `MAX_REGNUM' registers; the
   excess ones are simply ignored.  */
typedef int regnum_t;


/* Types and macros for the compile stack.  */

/* Offsets can point forwards or backwards, so this type must be able
   to hold every value from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
   `int' may not be enough when sizeof (int) == 2.  */
typedef long pattern_offset_t;

/* One element of the compile stack.
   NOTE(review): the three offset fields presumably correspond to the
   `begalt', `fixup_alt_jump' and `laststart' pointers of the compiler,
   stored as offsets -- confirm against the code that pushes them.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack itself: an array of elements together with its
   capacity and fill level.  */
typedef struct
{
  compile_stack_elt_t *stack;
  size_t size;
  size_t avail;			/* Offset of next open position.  */
} compile_stack_type;


#define INIT_COMPILE_STACK_SIZE 32

/* Both tests refer to a variable named `compile_stack' in scope.  */
#define COMPILE_STACK_EMPTY  (!compile_stack.avail)
#define COMPILE_STACK_FULL  (compile_stack.size == compile_stack.avail)

/* The next available element.  */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1855
1cee1e27
SM
/* An explicit quit check is used only on NTemacs and whenever input
   events are processed by polling; in every other configuration
   IMMEDIATE_QUIT_CHECK expands to a no-op expression.  */
#if defined emacs && defined QUIT && (defined WINDOWSNT || defined SYNC_INPUT)
extern int immediate_quit;
# define IMMEDIATE_QUIT_CHECK			\
    do {					\
      if (immediate_quit)			\
	QUIT;					\
    } while (0)
#else
# define IMMEDIATE_QUIT_CHECK    ((void) 0)
#endif
1867\f
b18215fc
RS
/* Structure to manage work area for range table.  */
struct range_table_work_area
{
  int *table;			/* Actual work area.  */
  int allocated;		/* Allocated size of TABLE, in bytes.  */
  int used;			/* Number of `int' slots of TABLE in use.  */
  int bits;			/* BIT_* flags recording character classes.  */
};

/* Make sure that WORK_AREA has room for N more `int' entries (a range
   takes two entries).  WORK_AREA must be an lvalue of type
   `struct range_table_work_area', not a pointer to one: its members
   are accessed with `.' and its address is passed to
   extend_range_table_work_area.
   If the space cannot be obtained, this returns REG_ESPACE from the
   surrounding function.
   NOTE(review): that is a plain `return', so it does not run the
   cleanup done by FREE_STACK_RETURN -- confirm whether callers can
   leak on this path.  */

#define EXTEND_RANGE_TABLE(work_area, n)				\
  do {									\
    if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
      {									\
	extend_range_table_work_area (&(work_area));			\
	if ((work_area).table == 0)					\
	  return (REG_ESPACE);						\
      }									\
  } while (0)

/* Turn on the flag(s) BIT in WORK_AREA.  The expansion is fully
   parenthesized so it can be used inside a larger expression.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)		\
  ((work_area).bits |= (bit))

/* Bits used to implement the multibyte-part of the various character classes
   such as [:alnum:] in a charset's range table.  */
#define BIT_WORD	0x1
#define BIT_LOWER	0x2
#define BIT_PUNCT	0x4
#define BIT_SPACE	0x8
#define BIT_UPPER	0x10
#define BIT_MULTIBYTE	0x20

/* Append the range (RANGE_START, RANGE_END) to WORK_AREA, growing the
   table first if needed (see EXTEND_RANGE_TABLE for the failure case).  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\
  do {									\
    EXTEND_RANGE_TABLE ((work_area), 2);				\
    (work_area).table[(work_area).used++] = (range_start);		\
    (work_area).table[(work_area).used++] = (range_end);		\
  } while (0)

/* Free allocated memory for WORK_AREA.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)	\
  do {						\
    if ((work_area).table)			\
      free ((work_area).table);			\
  } while (0)

/* Forget the recorded ranges and class flags; the table memory is kept.  */
#define CLEAR_RANGE_TABLE_WORK_USED(work_area) \
  ((work_area).used = 0, (work_area).bits = 0)
/* Accessors for the fill level, the class flags and the I-th entry.  */
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[(i)])
77d11aec 1923\f
b18215fc 1924
/* Turn on the bit for character C in the bitmap that `b' points at:
   byte C / BYTEWIDTH, bit C % BYTEWIDTH.  C is evaluated twice.  */
#define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1927
1928
bf216479
KH
1929#ifdef emacs
1930
cf9c99bc
KH
1931/* Store characters in the range FROM to TO in the bitmap at B (for
1932 ASCII and unibyte characters) and WORK_AREA (for multibyte
1933 characters) while translating them and paying attention to the
1934 continuity of translated characters.
8f924df7 1935
cf9c99bc
KH
1936 Implementation note: It is better to implement these fairly big
1937 macros by a function, but it's not that easy because macros called
8f924df7 1938 in this macro assume various local variables already declared. */
bf216479 1939
cf9c99bc
KH
/* Both FROM and TO are ASCII characters.

   Every character in FROM..TO is passed through TRANSLATE.  When the
   result is not ASCII it is recorded in WORK_AREA as a one-character
   range and then converted with RE_CHAR_TO_UNIBYTE; if that conversion
   yields a negative value, the untranslated character is used instead.
   The bit of the resulting character is then set with SET_LIST_BIT.  */

#define SETUP_ASCII_RANGE(work_area, FROM, TO)				\
  do {									\
    int C0, C1;								\
									\
    for (C0 = (FROM); C0 <= (TO); C0++)					\
      {									\
	C1 = TRANSLATE (C0);						\
	if (! ASCII_CHAR_P (C1))					\
	  {								\
	    SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);		\
	    C1 = RE_CHAR_TO_UNIBYTE (C1);				\
	    if (C1 < 0)							\
	      C1 = C0;							\
	  }								\
	SET_LIST_BIT (C1);						\
      }									\
  } while (0)
1958
1959
/* Both FROM and TO are unibyte characters (0x80..0xFF).

   Each character C0 in FROM..TO is converted with RE_CHAR_TO_MULTIBYTE.
   If the result satisfies CHAR_BYTE8_P, only the bit for C0 is set.
   Otherwise the multibyte form is translated; the bit that gets set is
   the unibyte form of the translation when it differs and converts
   (RE_CHAR_TO_UNIBYTE >= 0), else C0.  The translated character C2 is
   also merged into the ranges this invocation has added to WORK_AREA:
   a range that contains it or is adjacent to it is reused (and widened
   by one when adjacent); otherwise a new one-character range is added.  */

#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)			\
  do {									\
    int C0, C1, C2, I;							\
    int USED = RANGE_TABLE_WORK_USED (work_area);			\
									\
    for (C0 = (FROM); C0 <= (TO); C0++)					\
      {									\
	C1 = RE_CHAR_TO_MULTIBYTE (C0);					\
	if (CHAR_BYTE8_P (C1))						\
	  {								\
	    SET_LIST_BIT (C0);						\
	    continue;							\
	  }								\
									\
	C2 = TRANSLATE (C1);						\
	if (C2 == C1							\
	    || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0)			\
	  C1 = C0;							\
	SET_LIST_BIT (C1);						\
	for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
	  {								\
	    int from = RANGE_TABLE_WORK_ELT (work_area, I);		\
	    int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);		\
									\
	    if (from - 1 <= C2 && C2 <= to + 1)				\
	      {								\
		if (C2 == from - 1)					\
		  RANGE_TABLE_WORK_ELT (work_area, I)--;		\
		else if (C2 == to + 1)					\
		  RANGE_TABLE_WORK_ELT (work_area, I + 1)++;		\
		break;							\
	      }								\
	  }								\
	if (I < USED)							\
	  SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);		\
      }									\
  } while (0)
1998
1999
/* Both FROM and TO are multibyte characters.

   The range FROM..TO itself is recorded in WORK_AREA first.  Then each
   character C0 in it is translated to C1.  If C1 has a unibyte form
   (RE_CHAR_TO_UNIBYTE >= 0), or failing that C1 differs from C0 and C0
   has one, that unibyte character's bit is set.  A translation lying
   outside FROM..TO is merged into the ranges this invocation has added
   to WORK_AREA: a range that contains it or is adjacent to it is reused
   (and widened by one when adjacent); otherwise a new one-character
   range is added.  */

#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)			\
  do {									\
    int C0, C1, C2, I;							\
    int USED = RANGE_TABLE_WORK_USED (work_area);			\
									\
    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));		\
    for (C0 = (FROM); C0 <= (TO); C0++)					\
      {									\
	C1 = TRANSLATE (C0);						\
	if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0				\
	    || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))	\
	  SET_LIST_BIT (C2);						\
	if ((FROM) <= C1 && C1 <= (TO))					\
	  continue;							\
	for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
	  {								\
	    int from = RANGE_TABLE_WORK_ELT (work_area, I);		\
	    int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);		\
									\
	    if (from - 1 <= C1 && C1 <= to + 1)				\
	      {								\
		if (C1 == from - 1)					\
		  RANGE_TABLE_WORK_ELT (work_area, I)--;		\
		else if (C1 == to + 1)					\
		  RANGE_TABLE_WORK_ELT (work_area, I + 1)++;		\
		break;							\
	      }								\
	  }								\
	if (I < USED)							\
	  SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);		\
      }									\
  } while (0)
2033
2034#endif /* emacs */
2035
/* Get the next unsigned number in the uncompiled pattern.

   Reads decimal digits at `p' with PATFETCH and accumulates them into
   NUM, which must be an `int' lvalue; a negative NUM on entry is
   treated as 0 once the first digit is seen.  On normal exit `c' holds
   the first non-digit character that was fetched.

   Leaves the surrounding function through FREE_STACK_RETURN with
   REG_EBRACE if the pattern ends before a non-digit is found, and with
   REG_BADBR if the number does not fit in an `int'.

   The accumulation is done in `unsigned int' and checked before it is
   stored, because overflowing a signed `int' is undefined behavior and
   a check made after the fact may be optimized away.  */
#define GET_UNSIGNED_NUMBER(num)					\
  do {									\
    if (p == pend)							\
      FREE_STACK_RETURN (REG_EBRACE);					\
    else								\
      {									\
	PATFETCH (c);							\
	while ('0' <= c && c <= '9')					\
	  {								\
	    unsigned int GUN_NEXT;					\
	    if ((num) < 0)						\
	      (num) = 0;						\
	    GUN_NEXT = (unsigned int) (num) * 10u			\
		       + (unsigned int) (c - '0');			\
	    /* The first test catches unsigned wrap-around, the	\
	       second a value above the largest `int'.  */		\
	    if (GUN_NEXT / 10u != (unsigned int) (num)			\
		|| GUN_NEXT > (unsigned int) -1 / 2)			\
	      FREE_STACK_RETURN (REG_BADBR);				\
	    (num) = (int) GUN_NEXT;					\
	    if (p == pend)						\
	      FREE_STACK_RETURN (REG_EBRACE);				\
	    PATFETCH (c);						\
	  }								\
      }									\
  } while (0)
77d11aec 2059\f
1fdab503 2060#if ! WIDE_CHAR_SUPPORT
01618498 2061
14473664 2062/* Map a string to the char class it names (if any). */
1fdab503 2063re_wctype_t
971de7fb 2064re_wctype (const re_char *str)
14473664 2065{
5b0534c8 2066 const char *string = (const char *) str;
14473664
SM
2067 if (STREQ (string, "alnum")) return RECC_ALNUM;
2068 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2069 else if (STREQ (string, "word")) return RECC_WORD;
2070 else if (STREQ (string, "ascii")) return RECC_ASCII;
2071 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2072 else if (STREQ (string, "graph")) return RECC_GRAPH;
2073 else if (STREQ (string, "lower")) return RECC_LOWER;
2074 else if (STREQ (string, "print")) return RECC_PRINT;
2075 else if (STREQ (string, "punct")) return RECC_PUNCT;
2076 else if (STREQ (string, "space")) return RECC_SPACE;
2077 else if (STREQ (string, "upper")) return RECC_UPPER;
2078 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2079 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2080 else if (STREQ (string, "digit")) return RECC_DIGIT;
2081 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2082 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2083 else if (STREQ (string, "blank")) return RECC_BLANK;
2084 else return 0;
2085}
2086
e0f24100 2087/* True if CH is in the char class CC. */
1fdab503 2088boolean
971de7fb 2089re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2090{
2091 switch (cc)
2092 {
f3fcc40d
AS
2093 case RECC_ALNUM: return ISALNUM (ch) != 0;
2094 case RECC_ALPHA: return ISALPHA (ch) != 0;
2095 case RECC_BLANK: return ISBLANK (ch) != 0;
2096 case RECC_CNTRL: return ISCNTRL (ch) != 0;
2097 case RECC_DIGIT: return ISDIGIT (ch) != 0;
2098 case RECC_GRAPH: return ISGRAPH (ch) != 0;
2099 case RECC_LOWER: return ISLOWER (ch) != 0;
2100 case RECC_PRINT: return ISPRINT (ch) != 0;
2101 case RECC_PUNCT: return ISPUNCT (ch) != 0;
2102 case RECC_SPACE: return ISSPACE (ch) != 0;
2103 case RECC_UPPER: return ISUPPER (ch) != 0;
2104 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2105 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
213bd7f2 2106 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
f3fcc40d 2107 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
213bd7f2 2108 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
f3fcc40d 2109 case RECC_WORD: return ISWORD (ch) != 0;
0cdd06f8
SM
2110 case RECC_ERROR: return false;
2111 default:
5e617bc2 2112 abort ();
14473664
SM
2113 }
2114}
fa9a63c5 2115
14473664
SM
2116/* Return a bit-pattern to use in the range-table bits to match multibyte
2117 chars of class CC. */
2118static int
971de7fb 2119re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2120{
2121 switch (cc)
2122 {
2123 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2124 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2125 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2126 case RECC_LOWER: return BIT_LOWER;
2127 case RECC_UPPER: return BIT_UPPER;
2128 case RECC_PUNCT: return BIT_PUNCT;
2129 case RECC_SPACE: return BIT_SPACE;
14473664 2130 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2131 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2132 default:
5e617bc2 2133 abort ();
14473664
SM
2134 }
2135}
2136#endif
77d11aec
RS
2137\f
2138/* Filling in the work area of a range. */
2139
2140/* Actually extend the space in WORK_AREA. */
2141
2142static void
971de7fb 2143extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2144{
77d11aec
RS
2145 work_area->allocated += 16 * sizeof (int);
2146 if (work_area->table)
2147 work_area->table
2148 = (int *) realloc (work_area->table, work_area->allocated);
2149 else
2150 work_area->table
2151 = (int *) malloc (work_area->allocated);
2152}
2153
8f924df7 2154#if 0
77d11aec
RS
2155#ifdef emacs
2156
2157/* Carefully find the ranges of codes that are equivalent
2158 under case conversion to the range start..end when passed through
2159 TRANSLATE. Handle the case where non-letters can come in between
2160 two upper-case letters (which happens in Latin-1).
2161 Also handle the case of groups of more than 2 case-equivalent chars.
2162
2163 The basic method is to look at consecutive characters and see
2164 if they can form a run that can be handled as one.
2165
2166 Returns -1 if successful, REG_ESPACE if ran out of space. */
2167
2168static int
1dae0f0a
AS
2169set_image_of_range_1 (struct range_table_work_area *work_area,
2170 re_wchar_t start, re_wchar_t end,
2171 RE_TRANSLATE_TYPE translate)
77d11aec
RS
2172{
2173 /* `one_case' indicates a character, or a run of characters,
2174 each of which is an isolate (no case-equivalents).
2175 This includes all ASCII non-letters.
2176
2177 `two_case' indicates a character, or a run of characters,
2178 each of which has two case-equivalent forms.
2179 This includes all ASCII letters.
2180
2181 `strange' indicates a character that has more than one
2182 case-equivalent. */
177c0ea7 2183
77d11aec
RS
2184 enum case_type {one_case, two_case, strange};
2185
2186 /* Describe the run that is in progress,
2187 which the next character can try to extend.
2188 If run_type is strange, that means there really is no run.
2189 If run_type is one_case, then run_start...run_end is the run.
2190 If run_type is two_case, then the run is run_start...run_end,
2191 and the case-equivalents end at run_eqv_end. */
2192
2193 enum case_type run_type = strange;
2194 int run_start, run_end, run_eqv_end;
2195
2196 Lisp_Object eqv_table;
2197
2198 if (!RE_TRANSLATE_P (translate))
2199 {
b7c12565 2200 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2201 work_area->table[work_area->used++] = (start);
2202 work_area->table[work_area->used++] = (end);
b7c12565 2203 return -1;
77d11aec
RS
2204 }
2205
2206 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2207
77d11aec
RS
2208 for (; start <= end; start++)
2209 {
2210 enum case_type this_type;
2211 int eqv = RE_TRANSLATE (eqv_table, start);
2212 int minchar, maxchar;
2213
2214 /* Classify this character */
2215 if (eqv == start)
2216 this_type = one_case;
2217 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2218 this_type = two_case;
2219 else
2220 this_type = strange;
2221
2222 if (start < eqv)
2223 minchar = start, maxchar = eqv;
2224 else
2225 minchar = eqv, maxchar = start;
2226
2227 /* Can this character extend the run in progress? */
2228 if (this_type == strange || this_type != run_type
2229 || !(minchar == run_end + 1
2230 && (run_type == two_case
2231 ? maxchar == run_eqv_end + 1 : 1)))
2232 {
2233 /* No, end the run.
2234 Record each of its equivalent ranges. */
2235 if (run_type == one_case)
2236 {
2237 EXTEND_RANGE_TABLE (work_area, 2);
2238 work_area->table[work_area->used++] = run_start;
2239 work_area->table[work_area->used++] = run_end;
2240 }
2241 else if (run_type == two_case)
2242 {
2243 EXTEND_RANGE_TABLE (work_area, 4);
2244 work_area->table[work_area->used++] = run_start;
2245 work_area->table[work_area->used++] = run_end;
2246 work_area->table[work_area->used++]
2247 = RE_TRANSLATE (eqv_table, run_start);
2248 work_area->table[work_area->used++]
2249 = RE_TRANSLATE (eqv_table, run_end);
2250 }
2251 run_type = strange;
2252 }
177c0ea7 2253
77d11aec
RS
2254 if (this_type == strange)
2255 {
2256 /* For a strange character, add each of its equivalents, one
2257 by one. Don't start a range. */
2258 do
2259 {
2260 EXTEND_RANGE_TABLE (work_area, 2);
2261 work_area->table[work_area->used++] = eqv;
2262 work_area->table[work_area->used++] = eqv;
2263 eqv = RE_TRANSLATE (eqv_table, eqv);
2264 }
2265 while (eqv != start);
2266 }
2267
2268 /* Add this char to the run, or start a new run. */
2269 else if (run_type == strange)
2270 {
2271 /* Initialize a new range. */
2272 run_type = this_type;
2273 run_start = start;
2274 run_end = start;
2275 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2276 }
2277 else
2278 {
2279 /* Extend a running range. */
2280 run_end = minchar;
2281 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2282 }
2283 }
2284
2285 /* If a run is still in progress at the end, finish it now
2286 by recording its equivalent ranges. */
2287 if (run_type == one_case)
2288 {
2289 EXTEND_RANGE_TABLE (work_area, 2);
2290 work_area->table[work_area->used++] = run_start;
2291 work_area->table[work_area->used++] = run_end;
2292 }
2293 else if (run_type == two_case)
2294 {
2295 EXTEND_RANGE_TABLE (work_area, 4);
2296 work_area->table[work_area->used++] = run_start;
2297 work_area->table[work_area->used++] = run_end;
2298 work_area->table[work_area->used++]
2299 = RE_TRANSLATE (eqv_table, run_start);
2300 work_area->table[work_area->used++]
2301 = RE_TRANSLATE (eqv_table, run_end);
2302 }
2303
2304 return -1;
2305}
36595814 2306
77d11aec 2307#endif /* emacs */
36595814 2308
2b34df4e 2309/* Record the image of the range start..end when passed through
36595814
SM
2310 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2311 and is not even necessarily contiguous.
b7c12565
RS
2312 Normally we approximate it with the smallest contiguous range that contains
2313 all the chars we need. However, for Latin-1 we go to extra effort
2314 to do a better job.
2315
2316 This function is not called for ASCII ranges.
77d11aec
RS
2317
2318 Returns -1 if successful, REG_ESPACE if ran out of space. */
2319
2320static int
1dae0f0a
AS
2321set_image_of_range (struct range_table_work_area *work_area,
2322 re_wchar_t start, re_wchar_t end,
2323 RE_TRANSLATE_TYPE translate)
36595814 2324{
77d11aec
RS
2325 re_wchar_t cmin, cmax;
2326
2327#ifdef emacs
2328 /* For Latin-1 ranges, use set_image_of_range_1
2329 to get proper handling of ranges that include letters and nonletters.
b7c12565 2330 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2331 For other character sets, we don't bother to get this right. */
b7c12565
RS
2332 if (RE_TRANSLATE_P (translate) && start < 04400
2333 && !(start < 04200 && end >= 04377))
77d11aec 2334 {
b7c12565 2335 int newend;
77d11aec 2336 int tem;
b7c12565
RS
2337 newend = end;
2338 if (newend > 04377)
2339 newend = 04377;
2340 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2341 if (tem > 0)
2342 return tem;
2343
2344 start = 04400;
2345 if (end < 04400)
2346 return -1;
2347 }
2348#endif
2349
b7c12565
RS
2350 EXTEND_RANGE_TABLE (work_area, 2);
2351 work_area->table[work_area->used++] = (start);
2352 work_area->table[work_area->used++] = (end);
2353
2354 cmin = -1, cmax = -1;
77d11aec 2355
36595814 2356 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2357 {
2358 int ch;
77d11aec 2359
b7c12565
RS
2360 for (ch = start; ch <= end; ch++)
2361 {
2362 re_wchar_t c = TRANSLATE (ch);
2363 if (! (start <= c && c <= end))
2364 {
2365 if (cmin == -1)
2366 cmin = c, cmax = c;
2367 else
2368 {
2369 cmin = MIN (cmin, c);
2370 cmax = MAX (cmax, c);
2371 }
2372 }
2373 }
2374
2375 if (cmin != -1)
2376 {
2377 EXTEND_RANGE_TABLE (work_area, 2);
2378 work_area->table[work_area->used++] = (cmin);
2379 work_area->table[work_area->used++] = (cmax);
2380 }
2381 }
36595814 2382
77d11aec
RS
2383 return -1;
2384}
8f924df7 2385#endif /* 0 */
fa9a63c5
RM
2386\f
2387#ifndef MATCH_MAY_ALLOCATE
2388
2389/* If we cannot allocate large objects within re_match_2_internal,
2390 we make the fail stack and register vectors global.
2391 The fail stack, we grow to the maximum size when a regexp
2392 is compiled.
2393 The register vectors, we adjust in size each time we
2394 compile a regexp, according to the number of registers it needs. */
2395
2396static fail_stack_type fail_stack;
2397
2398/* Size with which the following vectors are currently allocated.
2399 That is so we can make them bigger as needed,
4bb91c68 2400 but never make them smaller. */
fa9a63c5
RM
2401static int regs_allocated_size;
2402
66f0296e
SM
2403static re_char ** regstart, ** regend;
2404static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2405
2406/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2407 but don't make them smaller. */
fa9a63c5
RM
2408
2409static
1dae0f0a 2410regex_grow_registers (int num_regs)
fa9a63c5
RM
2411{
2412 if (num_regs > regs_allocated_size)
2413 {
66f0296e
SM
2414 RETALLOC_IF (regstart, num_regs, re_char *);
2415 RETALLOC_IF (regend, num_regs, re_char *);
2416 RETALLOC_IF (best_regstart, num_regs, re_char *);
2417 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2418
2419 regs_allocated_size = num_regs;
2420 }
2421}
2422
2423#endif /* not MATCH_MAY_ALLOCATE */
2424\f
261cb4bb
PE
2425static boolean group_in_compile_stack (compile_stack_type compile_stack,
2426 regnum_t regnum);
99633e97 2427
fa9a63c5
RM
2428/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2429 Returns one of error codes defined in `regex.h', or zero for success.
2430
2431 Assumes the `allocated' (and perhaps `buffer') and `translate'
2432 fields are set in BUFP on entry.
2433
2434 If it succeeds, results are put in BUFP (if it returns an error, the
2435 contents of BUFP are undefined):
2436 `buffer' is the compiled pattern;
2437 `syntax' is set to SYNTAX;
2438 `used' is set to the length of the compiled pattern;
2439 `fastmap_accurate' is zero;
2440 `re_nsub' is the number of subexpressions in PATTERN;
2441 `not_bol' and `not_eol' are zero;
5e69f11e 2442
c0f9ea08 2443 The `fastmap' field is neither examined nor set. */
fa9a63c5 2444
505bde11
SM
/* If a jump from the end of the last alternative is pending (that is,
   `fixup_alt_jump' is non-null), store it now so that it targets
   "here", i.e. `b'.  The space for the jump has already been
   allocated.  */
#define FIXUP_ALT_JUMP()						\
  do {									\
    if (fixup_alt_jump != NULL)						\
      STORE_JUMP (jump, fixup_alt_jump, b);				\
  } while (0)
2452
2453
fa9a63c5
RM
/* Leave the surrounding function with VALUE, first releasing the
   storage we allocated: the table of `range_table_work' and the array
   of `compile_stack'.  */
#define FREE_STACK_RETURN(value)					\
  do {									\
    FREE_RANGE_TABLE_WORK_AREA (range_table_work);			\
    free (compile_stack.stack);						\
    return value;							\
  } while (0)
fa9a63c5
RM
2461
2462static reg_errcode_t
971de7fb 2463regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2464{
01618498
SM
2465 /* We fetch characters from PATTERN here. */
2466 register re_wchar_t c, c1;
5e69f11e 2467
fa9a63c5
RM
2468 /* Points to the end of the buffer, where we should append. */
2469 register unsigned char *b;
5e69f11e 2470
fa9a63c5
RM
2471 /* Keeps track of unclosed groups. */
2472 compile_stack_type compile_stack;
2473
2474 /* Points to the current (ending) position in the pattern. */
22336245
RS
2475#ifdef AIX
2476 /* `const' makes AIX compiler fail. */
66f0296e 2477 unsigned char *p = pattern;
22336245 2478#else
66f0296e 2479 re_char *p = pattern;
22336245 2480#endif
66f0296e 2481 re_char *pend = pattern + size;
5e69f11e 2482
fa9a63c5 2483 /* How to translate the characters in the pattern. */
6676cb1c 2484 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2485
2486 /* Address of the count-byte of the most recently inserted `exactn'
2487 command. This makes it possible to tell if a new exact-match
2488 character can be added to that command or if the character requires
2489 a new `exactn' command. */
2490 unsigned char *pending_exact = 0;
2491
2492 /* Address of start of the most recently finished expression.
2493 This tells, e.g., postfix * where to find the start of its
2494 operand. Reset at the beginning of groups and alternatives. */
2495 unsigned char *laststart = 0;
2496
2497 /* Address of beginning of regexp, or inside of last group. */
2498 unsigned char *begalt;
2499
2500 /* Place in the uncompiled pattern (i.e., the {) to
2501 which to go back if the interval is invalid. */
66f0296e 2502 re_char *beg_interval;
5e69f11e 2503
fa9a63c5 2504 /* Address of the place where a forward jump should go to the end of
7814e705 2505 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2506 last -- ends with a forward jump of this sort. */
2507 unsigned char *fixup_alt_jump = 0;
2508
b18215fc
RS
2509 /* Work area for range table of charset. */
2510 struct range_table_work_area range_table_work;
2511
2d1675e4
SM
2512 /* If the object matched can contain multibyte characters. */
2513 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2514
f9b0fd99
RS
2515 /* Nonzero if we have pushed down into a subpattern. */
2516 int in_subpattern = 0;
2517
2518 /* These hold the values of p, pattern, and pend from the main
2519 pattern when we have pushed into a subpattern. */
da053e48
PE
2520 re_char *main_p IF_LINT (= NULL);
2521 re_char *main_pattern IF_LINT (= NULL);
2522 re_char *main_pend IF_LINT (= NULL);
f9b0fd99 2523
fa9a63c5 2524#ifdef DEBUG
99633e97 2525 debug++;
fa9a63c5 2526 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2527 if (debug > 0)
fa9a63c5
RM
2528 {
2529 unsigned debug_count;
5e69f11e 2530
fa9a63c5 2531 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2532 putchar (pattern[debug_count]);
fa9a63c5
RM
2533 putchar ('\n');
2534 }
2535#endif /* DEBUG */
2536
2537 /* Initialize the compile stack. */
2538 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2539 if (compile_stack.stack == NULL)
2540 return REG_ESPACE;
2541
2542 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2543 compile_stack.avail = 0;
2544
b18215fc
RS
2545 range_table_work.table = 0;
2546 range_table_work.allocated = 0;
2547
fa9a63c5
RM
2548 /* Initialize the pattern buffer. */
2549 bufp->syntax = syntax;
2550 bufp->fastmap_accurate = 0;
2551 bufp->not_bol = bufp->not_eol = 0;
6224b623 2552 bufp->used_syntax = 0;
fa9a63c5
RM
2553
2554 /* Set `used' to zero, so that if we return an error, the pattern
2555 printer (for debugging) will think there's no pattern. We reset it
2556 at the end. */
2557 bufp->used = 0;
5e69f11e 2558
fa9a63c5 2559 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2560 bufp->re_nsub = 0;
fa9a63c5 2561
0b32bf0e 2562#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2563 /* Initialize the syntax table. */
2564 init_syntax_once ();
2565#endif
2566
2567 if (bufp->allocated == 0)
2568 {
2569 if (bufp->buffer)
2570 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2571 enough space. This loses if buffer's address is bogus, but
7814e705 2572 that is the user's responsibility. */
25fe55af
RS
2573 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2574 }
fa9a63c5 2575 else
7814e705 2576 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2577 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2578 }
fa9a63c5
RM
2579 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2580
2581 bufp->allocated = INIT_BUF_SIZE;
2582 }
2583
2584 begalt = b = bufp->buffer;
2585
2586 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2587 while (1)
fa9a63c5 2588 {
f9b0fd99
RS
2589 if (p == pend)
2590 {
2591 /* If this is the end of an included regexp,
2592 pop back to the main regexp and try again. */
2593 if (in_subpattern)
2594 {
2595 in_subpattern = 0;
2596 pattern = main_pattern;
2597 p = main_p;
2598 pend = main_pend;
2599 continue;
2600 }
2601 /* If this is the end of the main regexp, we are done. */
2602 break;
2603 }
2604
fa9a63c5
RM
2605 PATFETCH (c);
2606
2607 switch (c)
25fe55af 2608 {
f9b0fd99
RS
2609 case ' ':
2610 {
2611 re_char *p1 = p;
2612
2613 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2614 spaces normally. And don't try to do this recursively. */
2615 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2616 goto normal_char;
2617
2618 /* Peek past following spaces. */
2619 while (p1 != pend)
2620 {
2621 if (*p1 != ' ')
2622 break;
2623 p1++;
2624 }
2625 /* If the spaces are followed by a repetition op,
2626 treat them normally. */
c721eee5
RS
2627 if (p1 != pend
2628 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2629 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2630 goto normal_char;
2631
2632 /* Replace the spaces with the whitespace regexp. */
2633 in_subpattern = 1;
2634 main_p = p1;
2635 main_pend = pend;
2636 main_pattern = pattern;
2637 p = pattern = whitespace_regexp;
5b0534c8 2638 pend = p + strlen ((const char *) p);
f9b0fd99 2639 break;
7814e705 2640 }
f9b0fd99 2641
25fe55af
RS
2642 case '^':
2643 {
7814e705 2644 if ( /* If at start of pattern, it's an operator. */
25fe55af 2645 p == pattern + 1
7814e705 2646 /* If context independent, it's an operator. */
25fe55af 2647 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2648 /* Otherwise, depends on what's come before. */
25fe55af 2649 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2650 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2651 else
2652 goto normal_char;
2653 }
2654 break;
2655
2656
2657 case '$':
2658 {
2659 if ( /* If at end of pattern, it's an operator. */
2660 p == pend
7814e705 2661 /* If context independent, it's an operator. */
25fe55af
RS
2662 || syntax & RE_CONTEXT_INDEP_ANCHORS
2663 /* Otherwise, depends on what's next. */
2664 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2665 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2666 else
2667 goto normal_char;
2668 }
2669 break;
fa9a63c5
RM
2670
2671
2672 case '+':
25fe55af
RS
2673 case '?':
2674 if ((syntax & RE_BK_PLUS_QM)
2675 || (syntax & RE_LIMITED_OPS))
2676 goto normal_char;
2677 handle_plus:
2678 case '*':
2679 /* If there is no previous pattern... */
2680 if (!laststart)
2681 {
2682 if (syntax & RE_CONTEXT_INVALID_OPS)
2683 FREE_STACK_RETURN (REG_BADRPT);
2684 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2685 goto normal_char;
2686 }
2687
2688 {
7814e705 2689 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2690 boolean zero_times_ok = 0, many_times_ok = 0;
2691 boolean greedy = 1;
25fe55af
RS
2692
2693 /* If there is a sequence of repetition chars, collapse it
2694 down to just one (the right one). We can't combine
2695 interval operators with these because of, e.g., `a{2}*',
7814e705 2696 which should only match an even number of `a's. */
25fe55af
RS
2697
2698 for (;;)
2699 {
0b32bf0e 2700 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2701 && c == '?' && (zero_times_ok || many_times_ok))
2702 greedy = 0;
2703 else
2704 {
2705 zero_times_ok |= c != '+';
2706 many_times_ok |= c != '?';
2707 }
25fe55af
RS
2708
2709 if (p == pend)
2710 break;
ed0767d8
SM
2711 else if (*p == '*'
2712 || (!(syntax & RE_BK_PLUS_QM)
2713 && (*p == '+' || *p == '?')))
25fe55af 2714 ;
ed0767d8 2715 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2716 {
ed0767d8
SM
2717 if (p+1 == pend)
2718 FREE_STACK_RETURN (REG_EESCAPE);
2719 if (p[1] == '+' || p[1] == '?')
2720 PATFETCH (c); /* Gobble up the backslash. */
2721 else
2722 break;
25fe55af
RS
2723 }
2724 else
ed0767d8 2725 break;
25fe55af 2726 /* If we get here, we found another repeat character. */
ed0767d8
SM
2727 PATFETCH (c);
2728 }
25fe55af
RS
2729
2730 /* Star, etc. applied to an empty pattern is equivalent
2731 to an empty pattern. */
4e8a9132 2732 if (!laststart || laststart == b)
25fe55af
RS
2733 break;
2734
2735 /* Now we know whether or not zero matches is allowed
7814e705 2736 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2737 if (greedy)
2738 {
99633e97 2739 if (many_times_ok)
4e8a9132
SM
2740 {
2741 boolean simple = skip_one_char (laststart) == b;
d1dfb56c 2742 size_t startoffset = 0;
f6a3f532 2743 re_opcode_t ofj =
01618498 2744 /* Check if the loop can match the empty string. */
6df42991
SM
2745 (simple || !analyse_first (laststart, b, NULL, 0))
2746 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2747 assert (skip_one_char (laststart) <= b);
177c0ea7 2748
4e8a9132
SM
2749 if (!zero_times_ok && simple)
2750 { /* Since simple * loops can be made faster by using
2751 on_failure_keep_string_jump, we turn simple P+
2752 into PP* if P is simple. */
2753 unsigned char *p1, *p2;
2754 startoffset = b - laststart;
2755 GET_BUFFER_SPACE (startoffset);
2756 p1 = b; p2 = laststart;
2757 while (p2 < p1)
2758 *b++ = *p2++;
2759 zero_times_ok = 1;
99633e97 2760 }
4e8a9132
SM
2761
2762 GET_BUFFER_SPACE (6);
2763 if (!zero_times_ok)
2764 /* A + loop. */
f6a3f532 2765 STORE_JUMP (ofj, b, b + 6);
99633e97 2766 else
4e8a9132
SM
2767 /* Simple * loops can use on_failure_keep_string_jump
2768 depending on what follows. But since we don't know
2769 that yet, we leave the decision up to
2770 on_failure_jump_smart. */
f6a3f532 2771 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2772 laststart + startoffset, b + 6);
99633e97 2773 b += 3;
4e8a9132 2774 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2775 b += 3;
2776 }
2777 else
2778 {
4e8a9132
SM
2779 /* A simple ? pattern. */
2780 assert (zero_times_ok);
2781 GET_BUFFER_SPACE (3);
2782 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2783 b += 3;
2784 }
1c8c6d39
DL
2785 }
2786 else /* not greedy */
2787 { /* I wish the greedy and non-greedy cases could be merged. */
2788
0683b6fa 2789 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2790 if (many_times_ok)
2791 {
f6a3f532
SM
2792 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2793
6df42991
SM
2794 /* The non-greedy multiple match looks like
2795 a repeat..until: we only need a conditional jump
2796 at the end of the loop. */
f6a3f532
SM
2797 if (emptyp) BUF_PUSH (no_op);
2798 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2799 : on_failure_jump, b, laststart);
1c8c6d39
DL
2800 b += 3;
2801 if (zero_times_ok)
2802 {
2803 /* The repeat...until naturally matches one or more.
2804 To also match zero times, we need to first jump to
6df42991 2805 the end of the loop (its conditional jump). */
1c8c6d39
DL
2806 INSERT_JUMP (jump, laststart, b);
2807 b += 3;
2808 }
2809 }
2810 else
2811 {
2812 /* non-greedy a?? */
1c8c6d39
DL
2813 INSERT_JUMP (jump, laststart, b + 3);
2814 b += 3;
2815 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2816 b += 3;
2817 }
2818 }
2819 }
4e8a9132 2820 pending_exact = 0;
fa9a63c5
RM
2821 break;
2822
2823
2824 case '.':
25fe55af
RS
2825 laststart = b;
2826 BUF_PUSH (anychar);
2827 break;
fa9a63c5
RM
2828
2829
25fe55af
RS
2830 case '[':
2831 {
19ed5445
PE
2832 re_char *p1;
2833
b18215fc 2834 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2835
25fe55af 2836 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2837
25fe55af
RS
2838 /* Ensure that we have enough space to push a charset: the
2839 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2840 GET_BUFFER_SPACE (34);
2841
25fe55af 2842 laststart = b;
e318085a 2843
25fe55af 2844 /* We test `*p == '^' twice, instead of using an if
7814e705 2845 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2846 BUF_PUSH (*p == '^' ? charset_not : charset);
2847 if (*p == '^')
2848 p++;
e318085a 2849
25fe55af
RS
2850 /* Remember the first position in the bracket expression. */
2851 p1 = p;
e318085a 2852
7814e705 2853 /* Push the number of bytes in the bitmap. */
25fe55af 2854 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2855
25fe55af 2856 /* Clear the whole map. */
72af86bd 2857 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2858
25fe55af
RS
2859 /* charset_not matches newline according to a syntax bit. */
2860 if ((re_opcode_t) b[-2] == charset_not
2861 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2862 SET_LIST_BIT ('\n');
fa9a63c5 2863
7814e705 2864 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2865 for (;;)
2866 {
b18215fc 2867 boolean escaped_char = false;
2d1675e4 2868 const unsigned char *p2 = p;
abbd1bcf 2869 re_wchar_t ch;
e318085a 2870
25fe55af 2871 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2872
36595814
SM
2873 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2874 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2875 So the translation is done later in a loop. Example:
2876 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2877 PATFETCH (c);
e318085a 2878
25fe55af
RS
2879 /* \ might escape characters inside [...] and [^...]. */
2880 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2881 {
2882 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2883
2884 PATFETCH (c);
b18215fc 2885 escaped_char = true;
25fe55af 2886 }
b18215fc
RS
2887 else
2888 {
7814e705 2889 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2890 not (i.e., when the bracket expression is `[]' so
2891 far), the ']' character bit gets set way below. */
2d1675e4 2892 if (c == ']' && p2 != p1)
657fcfbd 2893 break;
25fe55af 2894 }
b18215fc 2895
25fe55af
RS
2896 /* See if we're at the beginning of a possible character
2897 class. */
b18215fc 2898
2d1675e4
SM
2899 if (!escaped_char &&
2900 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2901 {
7814e705 2902 /* Leave room for the null. */
14473664 2903 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2904 const unsigned char *class_beg;
b18215fc 2905
25fe55af
RS
2906 PATFETCH (c);
2907 c1 = 0;
ed0767d8 2908 class_beg = p;
b18215fc 2909
25fe55af
RS
2910 /* If pattern is `[[:'. */
2911 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2912
25fe55af
RS
2913 for (;;)
2914 {
14473664
SM
2915 PATFETCH (c);
2916 if ((c == ':' && *p == ']') || p == pend)
2917 break;
2918 if (c1 < CHAR_CLASS_MAX_LENGTH)
2919 str[c1++] = c;
2920 else
2921 /* This is in any case an invalid class name. */
2922 str[0] = '\0';
25fe55af
RS
2923 }
2924 str[c1] = '\0';
b18215fc
RS
2925
2926 /* If it isn't a word bracketed by `[:' and `:]':
2927 undo the ending character, the letters, and
2928 leave the leading `:' and `[' (but set bits for
2929 them). */
25fe55af
RS
2930 if (c == ':' && *p == ']')
2931 {
abbd1bcf 2932 re_wctype_t cc = re_wctype (str);
14473664
SM
2933
2934 if (cc == 0)
fa9a63c5
RM
2935 FREE_STACK_RETURN (REG_ECTYPE);
2936
14473664
SM
2937 /* Throw away the ] at the end of the character
2938 class. */
2939 PATFETCH (c);
fa9a63c5 2940
14473664 2941 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2942
cf9c99bc
KH
2943#ifndef emacs
2944 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
2945 if (re_iswctype (btowc (ch), cc))
2946 {
2947 c = TRANSLATE (ch);
ed00c2ac
KH
2948 if (c < (1 << BYTEWIDTH))
2949 SET_LIST_BIT (c);
8f924df7 2950 }
cf9c99bc
KH
2951#else /* emacs */
2952 /* Most character classes in a multibyte match
2953 just set a flag. Exceptions are is_blank,
2954 is_digit, is_cntrl, and is_xdigit, since
2955 they can only match ASCII characters. We
2956 don't need to handle them for multibyte.
2957 They are distinguished by a negative wctype. */
96cc36cc 2958
254c06a8
SM
2959 /* Setup the gl_state object to its buffer-defined
2960 value. This hardcodes the buffer-global
2961 syntax-table for ASCII chars, while the other chars
2962 will obey syntax-table properties. It's not ideal,
2963 but it's the way it's been done until now. */
d48cd3f4 2964 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 2965
cf9c99bc 2966 for (ch = 0; ch < 256; ++ch)
25fe55af 2967 {
cf9c99bc
KH
2968 c = RE_CHAR_TO_MULTIBYTE (ch);
2969 if (! CHAR_BYTE8_P (c)
2970 && re_iswctype (c, cc))
8f924df7 2971 {
cf9c99bc
KH
2972 SET_LIST_BIT (ch);
2973 c1 = TRANSLATE (c);
2974 if (c1 == c)
2975 continue;
2976 if (ASCII_CHAR_P (c1))
2977 SET_LIST_BIT (c1);
2978 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2979 SET_LIST_BIT (c1);
8f924df7 2980 }
25fe55af 2981 }
cf9c99bc
KH
2982 SET_RANGE_TABLE_WORK_AREA_BIT
2983 (range_table_work, re_wctype_to_bit (cc));
2984#endif /* emacs */
6224b623
SM
2985 /* In most cases the matching rule for char classes
2986 only uses the syntax table for multibyte chars,
2987 so that the content of the syntax-table is not
2988 hardcoded in the range_table. SPACE and WORD are
2989 the two exceptions. */
2990 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2991 bufp->used_syntax = 1;
2992
b18215fc
RS
2993 /* Repeat the loop. */
2994 continue;
25fe55af
RS
2995 }
2996 else
2997 {
ed0767d8
SM
2998 /* Go back to right after the "[:". */
2999 p = class_beg;
25fe55af 3000 SET_LIST_BIT ('[');
b18215fc
RS
3001
3002 /* Because the `:' may start a range, we
3003 can't simply set bit and repeat the loop.
7814e705 3004 Instead, just set it to C and handle below. */
b18215fc 3005 c = ':';
25fe55af
RS
3006 }
3007 }
b18215fc
RS
3008
3009 if (p < pend && p[0] == '-' && p[1] != ']')
3010 {
3011
3012 /* Discard the `-'. */
3013 PATFETCH (c1);
3014
3015 /* Fetch the character which ends the range. */
3016 PATFETCH (c1);
cf9c99bc
KH
3017#ifdef emacs
3018 if (CHAR_BYTE8_P (c1)
3019 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3020 /* Treat the range from a multibyte character to
3021 raw-byte character as empty. */
3022 c = c1 + 1;
3023#endif /* emacs */
e318085a 3024 }
25fe55af 3025 else
b18215fc
RS
3026 /* Range from C to C. */
3027 c1 = c;
3028
cf9c99bc 3029 if (c > c1)
25fe55af 3030 {
cf9c99bc
KH
3031 if (syntax & RE_NO_EMPTY_RANGES)
3032 FREE_STACK_RETURN (REG_ERANGEX);
3033 /* Else, repeat the loop. */
bf216479 3034 }
6fdd04b0 3035 else
25fe55af 3036 {
cf9c99bc
KH
3037#ifndef emacs
3038 /* Set the range into bitmap */
8f924df7 3039 for (; c <= c1; c++)
b18215fc 3040 {
cf9c99bc
KH
3041 ch = TRANSLATE (c);
3042 if (ch < (1 << BYTEWIDTH))
3043 SET_LIST_BIT (ch);
3044 }
3045#else /* emacs */
3046 if (c < 128)
3047 {
3048 ch = MIN (127, c1);
3049 SETUP_ASCII_RANGE (range_table_work, c, ch);
3050 c = ch + 1;
3051 if (CHAR_BYTE8_P (c1))
3052 c = BYTE8_TO_CHAR (128);
3053 }
3054 if (c <= c1)
3055 {
3056 if (CHAR_BYTE8_P (c))
3057 {
3058 c = CHAR_TO_BYTE8 (c);
3059 c1 = CHAR_TO_BYTE8 (c1);
3060 for (; c <= c1; c++)
3061 SET_LIST_BIT (c);
3062 }
3063 else if (multibyte)
3064 {
3065 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3066 }
3067 else
3068 {
3069 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3070 }
e934739e 3071 }
cf9c99bc 3072#endif /* emacs */
25fe55af 3073 }
e318085a
RS
3074 }
3075
25fe55af 3076 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3077 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3078 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3079 b[-1]--;
3080 b += b[-1];
fa9a63c5 3081
96cc36cc
RS
3082 /* Build real range table from work area. */
3083 if (RANGE_TABLE_WORK_USED (range_table_work)
3084 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3085 {
3086 int i;
3087 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3088
b18215fc 3089 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3090 bytes for flags, two for COUNT, and three bytes for
3091 each character. */
3092 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3093
b18215fc
RS
3094 /* Indicate the existence of range table. */
3095 laststart[1] |= 0x80;
fa9a63c5 3096
96cc36cc
RS
3097 /* Store the character class flag bits into the range table.
3098 If not in emacs, these flag bits are always 0. */
3099 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3100 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3101
b18215fc
RS
3102 STORE_NUMBER_AND_INCR (b, used / 2);
3103 for (i = 0; i < used; i++)
3104 STORE_CHARACTER_AND_INCR
3105 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3106 }
25fe55af
RS
3107 }
3108 break;
fa9a63c5
RM
3109
3110
b18215fc 3111 case '(':
25fe55af
RS
3112 if (syntax & RE_NO_BK_PARENS)
3113 goto handle_open;
3114 else
3115 goto normal_char;
fa9a63c5
RM
3116
3117
25fe55af
RS
3118 case ')':
3119 if (syntax & RE_NO_BK_PARENS)
3120 goto handle_close;
3121 else
3122 goto normal_char;
e318085a
RS
3123
3124
25fe55af
RS
3125 case '\n':
3126 if (syntax & RE_NEWLINE_ALT)
3127 goto handle_alt;
3128 else
3129 goto normal_char;
e318085a
RS
3130
3131
b18215fc 3132 case '|':
25fe55af
RS
3133 if (syntax & RE_NO_BK_VBAR)
3134 goto handle_alt;
3135 else
3136 goto normal_char;
3137
3138
3139 case '{':
3140 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3141 goto handle_interval;
3142 else
3143 goto normal_char;
3144
3145
3146 case '\\':
3147 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3148
3149 /* Do not translate the character after the \, so that we can
3150 distinguish, e.g., \B from \b, even if we normally would
3151 translate, e.g., B to b. */
36595814 3152 PATFETCH (c);
25fe55af
RS
3153
3154 switch (c)
3155 {
3156 case '(':
3157 if (syntax & RE_NO_BK_PARENS)
3158 goto normal_backslash;
3159
3160 handle_open:
505bde11
SM
3161 {
3162 int shy = 0;
c69b0314 3163 regnum_t regnum = 0;
505bde11
SM
3164 if (p+1 < pend)
3165 {
3166 /* Look for a special (?...) construct */
ed0767d8 3167 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3168 {
ed0767d8 3169 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3170 while (!shy)
505bde11 3171 {
c69b0314
SM
3172 PATFETCH (c);
3173 switch (c)
3174 {
3175 case ':': shy = 1; break;
3176 case '0':
3177 /* An explicitly specified regnum must start
3178 with non-0. */
3179 if (regnum == 0)
3180 FREE_STACK_RETURN (REG_BADPAT);
3181 case '1': case '2': case '3': case '4':
3182 case '5': case '6': case '7': case '8': case '9':
3183 regnum = 10*regnum + (c - '0'); break;
3184 default:
3185 /* Only (?:...) is supported right now. */
3186 FREE_STACK_RETURN (REG_BADPAT);
3187 }
505bde11
SM
3188 }
3189 }
505bde11
SM
3190 }
3191
3192 if (!shy)
c69b0314
SM
3193 regnum = ++bufp->re_nsub;
3194 else if (regnum)
3195 { /* It's actually not shy, but explicitly numbered. */
3196 shy = 0;
3197 if (regnum > bufp->re_nsub)
3198 bufp->re_nsub = regnum;
3199 else if (regnum > bufp->re_nsub
3200 /* Ideally, we'd want to check that the specified
3201 group can't have matched (i.e. all subgroups
3202 using the same regnum are in other branches of
3203 OR patterns), but we don't currently keep track
3204 of enough info to do that easily. */
3205 || group_in_compile_stack (compile_stack, regnum))
3206 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3207 }
c69b0314
SM
3208 else
3209 /* It's really shy. */
3210 regnum = - bufp->re_nsub;
25fe55af 3211
99633e97
SM
3212 if (COMPILE_STACK_FULL)
3213 {
3214 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3215 compile_stack_elt_t);
3216 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3217
99633e97
SM
3218 compile_stack.size <<= 1;
3219 }
25fe55af 3220
99633e97 3221 /* These are the values to restore when we hit end of this
7814e705 3222 group. They are all relative offsets, so that if the
99633e97
SM
3223 whole pattern moves because of realloc, they will still
3224 be valid. */
3225 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3226 COMPILE_STACK_TOP.fixup_alt_jump
3227 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3228 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3229 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3230
c69b0314
SM
3231 /* Do not push a start_memory for groups beyond the last one
3232 we can represent in the compiled pattern. */
3233 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3234 BUF_PUSH_2 (start_memory, regnum);
3235
3236 compile_stack.avail++;
3237
3238 fixup_alt_jump = 0;
3239 laststart = 0;
3240 begalt = b;
3241 /* If we've reached MAX_REGNUM groups, then this open
3242 won't actually generate any code, so we'll have to
3243 clear pending_exact explicitly. */
3244 pending_exact = 0;
3245 break;
505bde11 3246 }
25fe55af
RS
3247
3248 case ')':
3249 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3250
3251 if (COMPILE_STACK_EMPTY)
505bde11
SM
3252 {
3253 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3254 goto normal_backslash;
3255 else
3256 FREE_STACK_RETURN (REG_ERPAREN);
3257 }
25fe55af
RS
3258
3259 handle_close:
505bde11 3260 FIXUP_ALT_JUMP ();
25fe55af
RS
3261
3262 /* See similar code for backslashed left paren above. */
3263 if (COMPILE_STACK_EMPTY)
505bde11
SM
3264 {
3265 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3266 goto normal_char;
3267 else
3268 FREE_STACK_RETURN (REG_ERPAREN);
3269 }
25fe55af
RS
3270
3271 /* Since we just checked for an empty stack above, this
3272 ``can't happen''. */
3273 assert (compile_stack.avail != 0);
3274 {
3275 /* We don't just want to restore into `regnum', because
3276 later groups should continue to be numbered higher,
7814e705 3277 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3278 regnum_t regnum;
25fe55af
RS
3279
3280 compile_stack.avail--;
3281 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3282 fixup_alt_jump
3283 = COMPILE_STACK_TOP.fixup_alt_jump
3284 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3285 : 0;
3286 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3287 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3288 /* If we've reached MAX_REGNUM groups, then this close
3289 won't actually generate any code, so we'll have to
3290 clear pending_exact explicitly. */
3291 pending_exact = 0;
e318085a 3292
25fe55af 3293 /* We're at the end of the group, so now we know how many
7814e705 3294 groups were inside this one. */
c69b0314
SM
3295 if (regnum <= MAX_REGNUM && regnum > 0)
3296 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3297 }
3298 break;
3299
3300
3301 case '|': /* `\|'. */
3302 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3303 goto normal_backslash;
3304 handle_alt:
3305 if (syntax & RE_LIMITED_OPS)
3306 goto normal_char;
3307
3308 /* Insert before the previous alternative a jump which
7814e705 3309 jumps to this alternative if the former fails. */
25fe55af
RS
3310 GET_BUFFER_SPACE (3);
3311 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3312 pending_exact = 0;
3313 b += 3;
3314
3315 /* The alternative before this one has a jump after it
3316 which gets executed if it gets matched. Adjust that
3317 jump so it will jump to this alternative's analogous
3318 jump (put in below, which in turn will jump to the next
3319 (if any) alternative's such jump, etc.). The last such
3320 jump jumps to the correct final destination. A picture:
3321 _____ _____
3322 | | | |
3323 | v | v
d1dfb56c 3324 a | b | c
25fe55af
RS
3325
3326 If we are at `b', then fixup_alt_jump right now points to a
3327 three-byte space after `a'. We'll put in the jump, set
3328 fixup_alt_jump to right after `b', and leave behind three
3329 bytes which we'll fill in when we get to after `c'. */
3330
505bde11 3331 FIXUP_ALT_JUMP ();
25fe55af
RS
3332
3333 /* Mark and leave space for a jump after this alternative,
3334 to be filled in later either by next alternative or
3335 when we know we're at the end of a series of alternatives. */
3336 fixup_alt_jump = b;
3337 GET_BUFFER_SPACE (3);
3338 b += 3;
3339
3340 laststart = 0;
3341 begalt = b;
3342 break;
3343
3344
3345 case '{':
3346 /* If \{ is a literal. */
3347 if (!(syntax & RE_INTERVALS)
3348 /* If we're at `\{' and it's not the open-interval
3349 operator. */
4bb91c68 3350 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3351 goto normal_backslash;
3352
3353 handle_interval:
3354 {
3355 /* If got here, then the syntax allows intervals. */
3356
3357 /* At least (most) this many matches must be made. */
99633e97 3358 int lower_bound = 0, upper_bound = -1;
25fe55af 3359
ed0767d8 3360 beg_interval = p;
25fe55af 3361
25fe55af
RS
3362 GET_UNSIGNED_NUMBER (lower_bound);
3363
3364 if (c == ',')
ed0767d8 3365 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3366 else
3367 /* Interval such as `{1}' => match exactly once. */
3368 upper_bound = lower_bound;
3369
3370 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3371 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3372 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3373
3374 if (!(syntax & RE_NO_BK_BRACES))
3375 {
4bb91c68
SM
3376 if (c != '\\')
3377 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3378 if (p == pend)
3379 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3380 PATFETCH (c);
3381 }
3382
3383 if (c != '}')
4bb91c68 3384 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3385
3386 /* We just parsed a valid interval. */
3387
3388 /* If it's invalid to have no preceding re. */
3389 if (!laststart)
3390 {
3391 if (syntax & RE_CONTEXT_INVALID_OPS)
3392 FREE_STACK_RETURN (REG_BADRPT);
3393 else if (syntax & RE_CONTEXT_INDEP_OPS)
3394 laststart = b;
3395 else
3396 goto unfetch_interval;
3397 }
3398
6df42991
SM
3399 if (upper_bound == 0)
3400 /* If the upper bound is zero, just drop the sub pattern
3401 altogether. */
3402 b = laststart;
3403 else if (lower_bound == 1 && upper_bound == 1)
3404 /* Just match it once: nothing to do here. */
3405 ;
3406
3407 /* Otherwise, we have a nontrivial interval. When
3408 we're all done, the pattern will look like:
3409 set_number_at <jump count> <upper bound>
3410 set_number_at <succeed_n count> <lower bound>
3411 succeed_n <after jump addr> <succeed_n count>
3412 <body of loop>
3413 jump_n <succeed_n addr> <jump count>
3414 (The upper bound and `jump_n' are omitted if
3415 `upper_bound' is 1, though.) */
3416 else
3417 { /* If the upper bound is > 1, we need to insert
3418 more at the end of the loop. */
3419 unsigned int nbytes = (upper_bound < 0 ? 3
3420 : upper_bound > 1 ? 5 : 0);
3421 unsigned int startoffset = 0;
3422
3423 GET_BUFFER_SPACE (20); /* We might use less. */
3424
3425 if (lower_bound == 0)
3426 {
3427 /* A succeed_n that starts with 0 is really a
3428 simple on_failure_jump_loop. */
3429 INSERT_JUMP (on_failure_jump_loop, laststart,
3430 b + 3 + nbytes);
3431 b += 3;
3432 }
3433 else
3434 {
3435 /* Initialize lower bound of the `succeed_n', even
3436 though it will be set during matching by its
3437 attendant `set_number_at' (inserted next),
3438 because `re_compile_fastmap' needs to know.
3439 Jump to the `jump_n' we might insert below. */
3440 INSERT_JUMP2 (succeed_n, laststart,
3441 b + 5 + nbytes,
3442 lower_bound);
3443 b += 5;
3444
3445 /* Code to initialize the lower bound. Insert
7814e705 3446 before the `succeed_n'. The `5' is the last two
6df42991
SM
3447 bytes of this `set_number_at', plus 3 bytes of
3448 the following `succeed_n'. */
3449 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3450 b += 5;
3451 startoffset += 5;
3452 }
3453
3454 if (upper_bound < 0)
3455 {
3456 /* A negative upper bound stands for infinity,
3457 in which case it degenerates to a plain jump. */
3458 STORE_JUMP (jump, b, laststart + startoffset);
3459 b += 3;
3460 }
3461 else if (upper_bound > 1)
3462 { /* More than one repetition is allowed, so
3463 append a backward jump to the `succeed_n'
3464 that starts this interval.
3465
3466 When we've reached this during matching,
3467 we'll have matched the interval once, so
3468 jump back only `upper_bound - 1' times. */
3469 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3470 upper_bound - 1);
3471 b += 5;
3472
3473 /* The location we want to set is the second
3474 parameter of the `jump_n'; that is `b-2' as
3475 an absolute address. `laststart' will be
3476 the `set_number_at' we're about to insert;
3477 `laststart+3' the number to set, the source
3478 for the relative address. But we are
3479 inserting into the middle of the pattern --
3480 so everything is getting moved up by 5.
3481 Conclusion: (b - 2) - (laststart + 3) + 5,
3482 i.e., b - laststart.
3483
3484 We insert this at the beginning of the loop
3485 so that if we fail during matching, we'll
3486 reinitialize the bounds. */
3487 insert_op2 (set_number_at, laststart, b - laststart,
3488 upper_bound - 1, b);
3489 b += 5;
3490 }
3491 }
25fe55af
RS
3492 pending_exact = 0;
3493 beg_interval = NULL;
3494 }
3495 break;
3496
3497 unfetch_interval:
3498 /* If an invalid interval, match the characters as literals. */
3499 assert (beg_interval);
3500 p = beg_interval;
3501 beg_interval = NULL;
3502
3503 /* normal_char and normal_backslash need `c'. */
ed0767d8 3504 c = '{';
25fe55af
RS
3505
3506 if (!(syntax & RE_NO_BK_BRACES))
3507 {
ed0767d8
SM
3508 assert (p > pattern && p[-1] == '\\');
3509 goto normal_backslash;
25fe55af 3510 }
ed0767d8
SM
3511 else
3512 goto normal_char;
e318085a 3513
b18215fc 3514#ifdef emacs
25fe55af 3515 /* There is no way to specify the before_dot and after_dot
7814e705 3516 operators. rms says this is ok. --karl */
25fe55af
RS
3517 case '=':
3518 BUF_PUSH (at_dot);
3519 break;
3520
3521 case 's':
3522 laststart = b;
3523 PATFETCH (c);
3524 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3525 break;
3526
3527 case 'S':
3528 laststart = b;
3529 PATFETCH (c);
3530 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3531 break;
b18215fc
RS
3532
3533 case 'c':
3534 laststart = b;
36595814 3535 PATFETCH (c);
b18215fc
RS
3536 BUF_PUSH_2 (categoryspec, c);
3537 break;
e318085a 3538
b18215fc
RS
3539 case 'C':
3540 laststart = b;
36595814 3541 PATFETCH (c);
b18215fc
RS
3542 BUF_PUSH_2 (notcategoryspec, c);
3543 break;
3544#endif /* emacs */
e318085a 3545
e318085a 3546
25fe55af 3547 case 'w':
4bb91c68
SM
3548 if (syntax & RE_NO_GNU_OPS)
3549 goto normal_char;
25fe55af 3550 laststart = b;
1fb352e0 3551 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3552 break;
e318085a 3553
e318085a 3554
25fe55af 3555 case 'W':
4bb91c68
SM
3556 if (syntax & RE_NO_GNU_OPS)
3557 goto normal_char;
25fe55af 3558 laststart = b;
1fb352e0 3559 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3560 break;
e318085a
RS
3561
3562
25fe55af 3563 case '<':
4bb91c68
SM
3564 if (syntax & RE_NO_GNU_OPS)
3565 goto normal_char;
25fe55af
RS
3566 BUF_PUSH (wordbeg);
3567 break;
e318085a 3568
25fe55af 3569 case '>':
4bb91c68
SM
3570 if (syntax & RE_NO_GNU_OPS)
3571 goto normal_char;
25fe55af
RS
3572 BUF_PUSH (wordend);
3573 break;
e318085a 3574
669fa600
SM
3575 case '_':
3576 if (syntax & RE_NO_GNU_OPS)
3577 goto normal_char;
3578 laststart = b;
3579 PATFETCH (c);
3580 if (c == '<')
3581 BUF_PUSH (symbeg);
3582 else if (c == '>')
3583 BUF_PUSH (symend);
3584 else
3585 FREE_STACK_RETURN (REG_BADPAT);
3586 break;
3587
25fe55af 3588 case 'b':
4bb91c68
SM
3589 if (syntax & RE_NO_GNU_OPS)
3590 goto normal_char;
25fe55af
RS
3591 BUF_PUSH (wordbound);
3592 break;
e318085a 3593
25fe55af 3594 case 'B':
4bb91c68
SM
3595 if (syntax & RE_NO_GNU_OPS)
3596 goto normal_char;
25fe55af
RS
3597 BUF_PUSH (notwordbound);
3598 break;
fa9a63c5 3599
25fe55af 3600 case '`':
4bb91c68
SM
3601 if (syntax & RE_NO_GNU_OPS)
3602 goto normal_char;
25fe55af
RS
3603 BUF_PUSH (begbuf);
3604 break;
e318085a 3605
25fe55af 3606 case '\'':
4bb91c68
SM
3607 if (syntax & RE_NO_GNU_OPS)
3608 goto normal_char;
25fe55af
RS
3609 BUF_PUSH (endbuf);
3610 break;
e318085a 3611
25fe55af
RS
3612 case '1': case '2': case '3': case '4': case '5':
3613 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3614 {
3615 regnum_t reg;
e318085a 3616
0cdd06f8
SM
3617 if (syntax & RE_NO_BK_REFS)
3618 goto normal_backslash;
e318085a 3619
0cdd06f8 3620 reg = c - '0';
e318085a 3621
c69b0314
SM
3622 if (reg > bufp->re_nsub || reg < 1
3623 /* Can't back reference to a subexp before its end. */
3624 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3625 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3626
0cdd06f8
SM
3627 laststart = b;
3628 BUF_PUSH_2 (duplicate, reg);
3629 }
25fe55af 3630 break;
e318085a 3631
e318085a 3632
25fe55af
RS
3633 case '+':
3634 case '?':
3635 if (syntax & RE_BK_PLUS_QM)
3636 goto handle_plus;
3637 else
3638 goto normal_backslash;
3639
3640 default:
3641 normal_backslash:
3642 /* You might think it would be useful for \ to mean
3643 not to translate; but if we don't translate it
4bb91c68 3644 it will never match anything. */
25fe55af
RS
3645 goto normal_char;
3646 }
3647 break;
fa9a63c5
RM
3648
3649
3650 default:
25fe55af 3651 /* Expects the character in `c'. */
fa9a63c5 3652 normal_char:
36595814 3653 /* If no exactn currently being built. */
25fe55af 3654 if (!pending_exact
fa9a63c5 3655
25fe55af
RS
3656 /* If last exactn not at current position. */
3657 || pending_exact + *pending_exact + 1 != b
5e69f11e 3658
25fe55af 3659 /* We have only one byte following the exactn for the count. */
2d1675e4 3660 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3661
7814e705 3662 /* If followed by a repetition operator. */
9d99031f 3663 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3664 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3665 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3666 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3667 || ((syntax & RE_INTERVALS)
25fe55af 3668 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3669 ? p != pend && *p == '{'
3670 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3671 {
3672 /* Start building a new exactn. */
5e69f11e 3673
25fe55af 3674 laststart = b;
fa9a63c5
RM
3675
3676 BUF_PUSH_2 (exactn, 0);
3677 pending_exact = b - 1;
25fe55af 3678 }
5e69f11e 3679
2d1675e4
SM
3680 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3681 {
e0277a47
KH
3682 int len;
3683
cf9c99bc 3684 if (multibyte)
6fdd04b0 3685 {
cf9c99bc 3686 c = TRANSLATE (c);
6fdd04b0
KH
3687 len = CHAR_STRING (c, b);
3688 b += len;
3689 }
e0277a47 3690 else
6fdd04b0 3691 {
cf9c99bc
KH
3692 c1 = RE_CHAR_TO_MULTIBYTE (c);
3693 if (! CHAR_BYTE8_P (c1))
3694 {
3695 re_wchar_t c2 = TRANSLATE (c1);
3696
3697 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3698 c = c1;
409f2919 3699 }
6fdd04b0
KH
3700 *b++ = c;
3701 len = 1;
3702 }
2d1675e4
SM
3703 (*pending_exact) += len;
3704 }
3705
fa9a63c5 3706 break;
25fe55af 3707 } /* switch (c) */
fa9a63c5
RM
3708 } /* while p != pend */
3709
5e69f11e 3710
fa9a63c5 3711 /* Through the pattern now. */
5e69f11e 3712
505bde11 3713 FIXUP_ALT_JUMP ();
fa9a63c5 3714
5e69f11e 3715 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3716 FREE_STACK_RETURN (REG_EPAREN);
3717
3718 /* If we don't want backtracking, force success
3719 the first time we reach the end of the compiled pattern. */
3720 if (syntax & RE_NO_POSIX_BACKTRACKING)
3721 BUF_PUSH (succeed);
3722
fa9a63c5
RM
3723 /* We have succeeded; set the length of the buffer. */
3724 bufp->used = b - bufp->buffer;
3725
3726#ifdef DEBUG
99633e97 3727 if (debug > 0)
fa9a63c5 3728 {
505bde11 3729 re_compile_fastmap (bufp);
fa9a63c5
RM
3730 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3731 print_compiled_pattern (bufp);
3732 }
99633e97 3733 debug--;
fa9a63c5
RM
3734#endif /* DEBUG */
3735
3736#ifndef MATCH_MAY_ALLOCATE
3737 /* Initialize the failure stack to the largest possible stack. This
3738 isn't necessary unless we're trying to avoid calling alloca in
3739 the search and match routines. */
3740 {
3741 int num_regs = bufp->re_nsub + 1;
3742
320a2a73 3743 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3744 {
a26f4ccd 3745 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3746
fa9a63c5
RM
3747 if (! fail_stack.stack)
3748 fail_stack.stack
5e69f11e 3749 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3750 * sizeof (fail_stack_elt_t));
3751 else
3752 fail_stack.stack
3753 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3754 (fail_stack.size
3755 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3756 }
3757
3758 regex_grow_registers (num_regs);
3759 }
3760#endif /* not MATCH_MAY_ALLOCATE */
3761
839966f3 3762 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3763} /* regex_compile */
3764\f
3765/* Subroutines for `regex_compile'. */
3766
7814e705 3767/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3768
3769static void
971de7fb 3770store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3771{
3772 *loc = (unsigned char) op;
3773 STORE_NUMBER (loc + 1, arg);
3774}
3775
3776
3777/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3778
3779static void
971de7fb 3780store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3781{
3782 *loc = (unsigned char) op;
3783 STORE_NUMBER (loc + 1, arg1);
3784 STORE_NUMBER (loc + 3, arg2);
3785}
3786
3787
3788/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3789 for OP followed by two-byte integer parameter ARG. */
3790
3791static void
971de7fb 3792insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3793{
3794 register unsigned char *pfrom = end;
3795 register unsigned char *pto = end + 3;
3796
3797 while (pfrom != loc)
3798 *--pto = *--pfrom;
5e69f11e 3799
fa9a63c5
RM
3800 store_op1 (op, loc, arg);
3801}
3802
3803
3804/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3805
3806static void
971de7fb 3807insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3808{
3809 register unsigned char *pfrom = end;
3810 register unsigned char *pto = end + 5;
3811
3812 while (pfrom != loc)
3813 *--pto = *--pfrom;
5e69f11e 3814
fa9a63c5
RM
3815 store_op2 (op, loc, arg1, arg2);
3816}
3817
3818
3819/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3820 after an alternative or a begin-subexpression. We assume there is at
3821 least one character before the ^. */
3822
3823static boolean
971de7fb 3824at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3825{
01618498 3826 re_char *prev = p - 2;
95988fcf 3827 boolean odd_backslashes;
5e69f11e 3828
95988fcf
AS
3829 /* After a subexpression? */
3830 if (*prev == '(')
3831 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3832
3833 /* After an alternative? */
3834 else if (*prev == '|')
3835 odd_backslashes = (syntax & RE_NO_BK_VBAR) == 0;
3836
3837 /* After a shy subexpression? */
3838 else if (*prev == ':' && (syntax & RE_SHY_GROUPS))
3839 {
3840 /* Skip over optional regnum. */
3841 while (prev - 1 >= pattern && prev[-1] >= '0' && prev[-1] <= '9')
3842 --prev;
3843
3844 if (!(prev - 2 >= pattern
3845 && prev[-1] == '?' && prev[-2] == '('))
3846 return false;
3847 prev -= 2;
3848 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3849 }
3850 else
3851 return false;
3852
3853 /* Count the number of preceding backslashes. */
3854 p = prev;
3855 while (prev - 1 >= pattern && prev[-1] == '\\')
3856 --prev;
3857 return (p - prev) & odd_backslashes;
fa9a63c5
RM
3858}
3859
3860
3861/* The dual of at_begline_loc_p. This one is for $. We assume there is
3862 at least one character after the $, i.e., `P < PEND'. */
3863
3864static boolean
971de7fb 3865at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3866{
01618498 3867 re_char *next = p;
fa9a63c5 3868 boolean next_backslash = *next == '\\';
01618498 3869 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3870
fa9a63c5
RM
3871 return
3872 /* Before a subexpression? */
3873 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3874 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3875 /* Before an alternative? */
3876 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3877 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3878}
3879
3880
5e69f11e 3881/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3882 false if it's not. */
3883
3884static boolean
971de7fb 3885group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5 3886{
d1dfb56c 3887 ssize_t this_element;
fa9a63c5 3888
5e69f11e
RM
3889 for (this_element = compile_stack.avail - 1;
3890 this_element >= 0;
fa9a63c5
RM
3891 this_element--)
3892 if (compile_stack.stack[this_element].regnum == regnum)
3893 return true;
3894
3895 return false;
3896}
fa9a63c5 3897\f
f6a3f532
SM
3898/* analyse_first.
3899 If fastmap is non-NULL, go through the pattern and fill fastmap
3900 with all the possible leading chars. If fastmap is NULL, don't
3901 bother filling it up (obviously) and only return whether the
3902 pattern could potentially match the empty string.
3903
3904 Return 1 if p..pend might match the empty string.
3905 Return 0 if p..pend matches at least one char.
01618498 3906 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3907
3908static int
438105ed 3909analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
fa9a63c5 3910{
505bde11 3911 int j, k;
1fb352e0 3912 boolean not;
fa9a63c5 3913
b18215fc 3914 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3915 flag is set true. */
b18215fc
RS
3916 boolean match_any_multibyte_characters = false;
3917
f6a3f532 3918 assert (p);
5e69f11e 3919
505bde11
SM
3920 /* The loop below works as follows:
3921 - It has a working-list kept in the PATTERN_STACK and which basically
3922 starts by only containing a pointer to the first operation.
3923 - If the opcode we're looking at is a match against some set of
3924 chars, then we add those chars to the fastmap and go on to the
3925 next work element from the worklist (done via `break').
3926 - If the opcode is a control operator on the other hand, we either
3927 ignore it (if it's meaningless at this point, such as `start_memory')
3928 or execute it (if it's a jump). If the jump has several destinations
3929 (i.e. `on_failure_jump'), then we push the other destination onto the
3930 worklist.
3931 We guarantee termination by ignoring backward jumps (more or less),
3932 so that `p' is monotonically increasing. More to the point, we
3933 never set `p' (or push) anything `<= p1'. */
3934
01618498 3935 while (p < pend)
fa9a63c5 3936 {
505bde11
SM
3937 /* `p1' is used as a marker of how far back a `on_failure_jump'
3938 can go without being ignored. It is normally equal to `p'
3939 (which prevents any backward `on_failure_jump') except right
3940 after a plain `jump', to allow patterns such as:
3941 0: jump 10
3942 3..9: <body>
3943 10: on_failure_jump 3
3944 as used for the *? operator. */
01618498 3945 re_char *p1 = p;
5e69f11e 3946
fa9a63c5
RM
3947 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3948 {
f6a3f532 3949 case succeed:
01618498 3950 return 1;
fa9a63c5 3951
fa9a63c5 3952 case duplicate:
505bde11
SM
3953 /* If the first character has to match a backreference, that means
3954 that the group was empty (since it already matched). Since this
3955 is the only case that interests us here, we can assume that the
3956 backreference must match the empty string. */
3957 p++;
3958 continue;
fa9a63c5
RM
3959
3960
3961 /* Following are the cases which match a character. These end
7814e705 3962 with `break'. */
fa9a63c5
RM
3963
3964 case exactn:
e0277a47 3965 if (fastmap)
cf9c99bc
KH
3966 {
3967 /* If multibyte is nonzero, the first byte of each
3968 character is an ASCII or a leading code. Otherwise,
3969 each byte is a character. Thus, this works in both
3970 cases. */
3971 fastmap[p[1]] = 1;
3972 if (! multibyte)
3973 {
3974 /* For the case of matching this unibyte regex
3975 against multibyte, we must set a leading code of
3976 the corresponding multibyte character. */
3977 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3978
86e893e3 3979 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
3980 }
3981 }
fa9a63c5
RM
3982 break;
3983
3984
1fb352e0
SM
3985 case anychar:
3986 /* We could put all the chars except for \n (and maybe \0)
3987 but we don't bother since it is generally not worth it. */
f6a3f532 3988 if (!fastmap) break;
01618498 3989 return -1;
fa9a63c5
RM
3990
3991
b18215fc 3992 case charset_not:
1fb352e0 3993 if (!fastmap) break;
bf216479
KH
3994 {
3995 /* Chars beyond end of bitmap are possible matches. */
bf216479 3996 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 3997 j < (1 << BYTEWIDTH); j++)
bf216479
KH
3998 fastmap[j] = 1;
3999 }
4000
1fb352e0
SM
4001 /* Fallthrough */
4002 case charset:
4003 if (!fastmap) break;
4004 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4005 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4006 j >= 0; j--)
1fb352e0 4007 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4008 fastmap[j] = 1;
b18215fc 4009
6482db2e
KH
4010#ifdef emacs
4011 if (/* Any leading code can possibly start a character
1fb352e0 4012 which doesn't match the specified set of characters. */
6482db2e 4013 not
409f2919 4014 ||
6482db2e
KH
4015 /* If we can match a character class, we can match any
4016 multibyte characters. */
4017 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4018 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4019
b18215fc 4020 {
b18215fc
RS
4021 if (match_any_multibyte_characters == false)
4022 {
6482db2e
KH
4023 for (j = MIN_MULTIBYTE_LEADING_CODE;
4024 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4025 fastmap[j] = 1;
b18215fc
RS
4026 match_any_multibyte_characters = true;
4027 }
4028 }
b18215fc 4029
1fb352e0
SM
4030 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4031 && match_any_multibyte_characters == false)
4032 {
bf216479 4033 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 4034 multibyte character in the range table. */
1fb352e0 4035 int c, count;
bf216479 4036 unsigned char lc1, lc2;
b18215fc 4037
1fb352e0 4038 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4039 bits for a character class. */
1fb352e0 4040 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4041
1fb352e0
SM
4042 /* Extract the number of ranges in range table into COUNT. */
4043 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4044 for (; count > 0; count--, p += 3)
1fb352e0 4045 {
9117d724
KH
4046 /* Extract the start and end of each range. */
4047 EXTRACT_CHARACTER (c, p);
bf216479 4048 lc1 = CHAR_LEADING_CODE (c);
9117d724 4049 p += 3;
1fb352e0 4050 EXTRACT_CHARACTER (c, p);
bf216479
KH
4051 lc2 = CHAR_LEADING_CODE (c);
4052 for (j = lc1; j <= lc2; j++)
9117d724 4053 fastmap[j] = 1;
1fb352e0
SM
4054 }
4055 }
6482db2e 4056#endif
b18215fc
RS
4057 break;
4058
1fb352e0
SM
4059 case syntaxspec:
4060 case notsyntaxspec:
4061 if (!fastmap) break;
4062#ifndef emacs
4063 not = (re_opcode_t)p[-1] == notsyntaxspec;
4064 k = *p++;
4065 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4066 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4067 fastmap[j] = 1;
b18215fc 4068 break;
1fb352e0 4069#else /* emacs */
b18215fc
RS
4070 /* This match depends on text properties. These end with
4071 aborting optimizations. */
01618498 4072 return -1;
b18215fc
RS
4073
4074 case categoryspec:
b18215fc 4075 case notcategoryspec:
1fb352e0
SM
4076 if (!fastmap) break;
4077 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4078 k = *p++;
6482db2e 4079 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4080 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4081 fastmap[j] = 1;
4082
6482db2e
KH
4083 /* Any leading code can possibly start a character which
4084 has or doesn't has the specified category. */
4085 if (match_any_multibyte_characters == false)
6fdd04b0 4086 {
6482db2e
KH
4087 for (j = MIN_MULTIBYTE_LEADING_CODE;
4088 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4089 fastmap[j] = 1;
4090 match_any_multibyte_characters = true;
6fdd04b0 4091 }
b18215fc
RS
4092 break;
4093
fa9a63c5 4094 /* All cases after this match the empty string. These end with
25fe55af 4095 `continue'. */
fa9a63c5 4096
fa9a63c5
RM
4097 case before_dot:
4098 case at_dot:
4099 case after_dot:
1fb352e0 4100#endif /* !emacs */
25fe55af
RS
4101 case no_op:
4102 case begline:
4103 case endline:
fa9a63c5
RM
4104 case begbuf:
4105 case endbuf:
4106 case wordbound:
4107 case notwordbound:
4108 case wordbeg:
4109 case wordend:
669fa600
SM
4110 case symbeg:
4111 case symend:
25fe55af 4112 continue;
fa9a63c5
RM
4113
4114
fa9a63c5 4115 case jump:
25fe55af 4116 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4117 if (j < 0)
4118 /* Backward jumps can only go back to code that we've already
4119 visited. `re_compile' should make sure this is true. */
4120 break;
25fe55af 4121 p += j;
505bde11
SM
4122 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4123 {
4124 case on_failure_jump:
4125 case on_failure_keep_string_jump:
505bde11 4126 case on_failure_jump_loop:
0683b6fa 4127 case on_failure_jump_nastyloop:
505bde11
SM
4128 case on_failure_jump_smart:
4129 p++;
4130 break;
4131 default:
4132 continue;
4133 };
4134 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4135 to jump back to "just after here". */
4136 /* Fallthrough */
fa9a63c5 4137
25fe55af
RS
4138 case on_failure_jump:
4139 case on_failure_keep_string_jump:
0683b6fa 4140 case on_failure_jump_nastyloop:
505bde11
SM
4141 case on_failure_jump_loop:
4142 case on_failure_jump_smart:
25fe55af 4143 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4144 if (p + j <= p1)
ed0767d8 4145 ; /* Backward jump to be ignored. */
01618498
SM
4146 else
4147 { /* We have to look down both arms.
4148 We first go down the "straight" path so as to minimize
4149 stack usage when going through alternatives. */
4150 int r = analyse_first (p, pend, fastmap, multibyte);
4151 if (r) return r;
4152 p += j;
4153 }
25fe55af 4154 continue;
fa9a63c5
RM
4155
4156
ed0767d8
SM
4157 case jump_n:
4158 /* This code simply does not properly handle forward jump_n. */
4159 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4160 p += 4;
4161 /* jump_n can either jump or fall through. The (backward) jump
4162 case has already been handled, so we only need to look at the
4163 fallthrough case. */
4164 continue;
177c0ea7 4165
fa9a63c5 4166 case succeed_n:
ed0767d8
SM
4167 /* If N == 0, it should be an on_failure_jump_loop instead. */
4168 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4169 p += 4;
4170 /* We only care about one iteration of the loop, so we don't
4171 need to consider the case where this behaves like an
4172 on_failure_jump. */
25fe55af 4173 continue;
fa9a63c5
RM
4174
4175
4176 case set_number_at:
25fe55af
RS
4177 p += 4;
4178 continue;
fa9a63c5
RM
4179
4180
4181 case start_memory:
25fe55af 4182 case stop_memory:
505bde11 4183 p += 1;
fa9a63c5
RM
4184 continue;
4185
4186
4187 default:
25fe55af
RS
4188 abort (); /* We have listed all the cases. */
4189 } /* switch *p++ */
fa9a63c5
RM
4190
4191 /* Getting here means we have found the possible starting
25fe55af 4192 characters for one path of the pattern -- and that the empty
7814e705 4193 string does not match. We need not follow this path further. */
01618498 4194 return 0;
fa9a63c5
RM
4195 } /* while p */
4196
01618498
SM
4197 /* We reached the end without matching anything. */
4198 return 1;
4199
f6a3f532
SM
4200} /* analyse_first */
4201\f
4202/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4203 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4204 characters can start a string that matches the pattern. This fastmap
4205 is used by re_search to skip quickly over impossible starting points.
4206
4207 Character codes above (1 << BYTEWIDTH) are not represented in the
4208 fastmap, but the leading codes are represented. Thus, the fastmap
4209 indicates which character sets could start a match.
4210
4211 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4212 area as BUFP->fastmap.
4213
4214 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4215 the pattern buffer.
4216
4217 Returns 0 if we succeed, -2 if an internal error. */
4218
4219int
971de7fb 4220re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4221{
4222 char *fastmap = bufp->fastmap;
4223 int analysis;
4224
4225 assert (fastmap && bufp->buffer);
4226
72af86bd 4227 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4228 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4229
4230 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4231 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4232 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4233 return 0;
4234} /* re_compile_fastmap */
4235\f
4236/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4237 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4238 this memory for recording register information. STARTS and ENDS
4239 must be allocated using the malloc library routine, and must each
4240 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4241
4242 If NUM_REGS == 0, then subsequent matches should allocate their own
4243 register data.
4244
4245 Unless this function is called, the first search or match using
4246 PATTERN_BUFFER will allocate its own register data, without
4247 freeing the old data. */
4248
4249void
971de7fb 4250re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4251{
4252 if (num_regs)
4253 {
4254 bufp->regs_allocated = REGS_REALLOCATE;
4255 regs->num_regs = num_regs;
4256 regs->start = starts;
4257 regs->end = ends;
4258 }
4259 else
4260 {
4261 bufp->regs_allocated = REGS_UNALLOCATED;
4262 regs->num_regs = 0;
4263 regs->start = regs->end = (regoff_t *) 0;
4264 }
4265}
c0f9ea08 4266WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4267\f
7814e705 4268/* Searching routines. */
fa9a63c5
RM
4269
4270/* Like re_search_2, below, but only one string is specified, and
4271 doesn't let you say where to stop matching. */
4272
d1dfb56c
EZ
4273regoff_t
4274re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4275 ssize_t startpos, ssize_t range, struct re_registers *regs)
fa9a63c5 4276{
5e69f11e 4277 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4278 regs, size);
4279}
c0f9ea08 4280WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4281
70806df6
KH
/* Start address of whichever string holds offset P of the virtual
   concatenation: string1 for offsets below size1, string2 otherwise.
   Relies on `size1', `string1' and `string2' being in scope.  */
#define HEAD_ADDR_VSTRING(P) \
  (((P) < size1 ? string1 : string2))

/* Address of the byte at offset POS of the virtual concatenation.
   Relies on `size1', `string1' and `string2' being in scope.  */
#define POS_ADDR_VSTRING(POS) \
  (((POS) < size1 ? string1 : string2 - size1) + (POS))
fa9a63c5
RM
4289
4290/* Using the compiled pattern in BUFP->buffer, first tries to match the
4291 virtual concatenation of STRING1 and STRING2, starting first at index
4292 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4293
fa9a63c5 4294 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4295
fa9a63c5
RM
4296 RANGE is how far to scan while trying to match. RANGE = 0 means try
4297 only at STARTPOS; in general, the last start tried is STARTPOS +
4298 RANGE.
5e69f11e 4299
fa9a63c5
RM
4300 In REGS, return the indices of the virtual concatenation of STRING1
4301 and STRING2 that matched the entire BUFP->buffer and its contained
4302 subexpressions.
5e69f11e 4303
fa9a63c5
RM
4304 Do not consider matching one past the index STOP in the virtual
4305 concatenation of STRING1 and STRING2.
4306
4307 We return either the position in the strings at which the match was
4308 found, -1 if no match, or -2 if error (such as failure
4309 stack overflow). */
4310
d1dfb56c
EZ
4311regoff_t
4312re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4313 const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4314 struct re_registers *regs, ssize_t stop)
fa9a63c5 4315{
d1dfb56c 4316 regoff_t val;
66f0296e
SM
4317 re_char *string1 = (re_char*) str1;
4318 re_char *string2 = (re_char*) str2;
fa9a63c5 4319 register char *fastmap = bufp->fastmap;
6676cb1c 4320 register RE_TRANSLATE_TYPE translate = bufp->translate;
d1dfb56c
EZ
4321 size_t total_size = size1 + size2;
4322 ssize_t endpos = startpos + range;
c0f9ea08 4323 boolean anchored_start;
cf9c99bc
KH
4324 /* Nonzero if we are searching multibyte string. */
4325 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4326
fa9a63c5
RM
4327 /* Check for out-of-range STARTPOS. */
4328 if (startpos < 0 || startpos > total_size)
4329 return -1;
5e69f11e 4330
fa9a63c5 4331 /* Fix up RANGE if it might eventually take us outside
34597fa9 4332 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4333 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4334 if (endpos < 0)
4335 range = 0 - startpos;
fa9a63c5
RM
4336 else if (endpos > total_size)
4337 range = total_size - startpos;
4338
4339 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4340 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4341 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4342 {
4343 if (startpos > 0)
4344 return -1;
4345 else
7b140fd7 4346 range = 0;
fa9a63c5
RM
4347 }
4348
ae4788a8
RS
4349#ifdef emacs
4350 /* In a forward search for something that starts with \=.
4351 don't keep searching past point. */
4352 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4353 {
7b140fd7
RS
4354 range = PT_BYTE - BEGV_BYTE - startpos;
4355 if (range < 0)
ae4788a8
RS
4356 return -1;
4357 }
4358#endif /* emacs */
4359
fa9a63c5
RM
4360 /* Update the fastmap now if not correct already. */
4361 if (fastmap && !bufp->fastmap_accurate)
01618498 4362 re_compile_fastmap (bufp);
5e69f11e 4363
c8499ba5 4364 /* See whether the pattern is anchored. */
c0f9ea08 4365 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4366
b18215fc 4367#ifdef emacs
d48cd3f4 4368 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4369 {
d1dfb56c 4370 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4371
4372 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4373 }
b18215fc
RS
4374#endif
4375
fa9a63c5
RM
4376 /* Loop through the string, looking for a place to start matching. */
4377 for (;;)
5e69f11e 4378 {
c8499ba5
RS
4379 /* If the pattern is anchored,
4380 skip quickly past places we cannot match.
4381 We don't bother to treat startpos == 0 specially
4382 because that case doesn't repeat. */
4383 if (anchored_start && startpos > 0)
4384 {
c0f9ea08
SM
4385 if (! ((startpos <= size1 ? string1[startpos - 1]
4386 : string2[startpos - size1 - 1])
4387 == '\n'))
c8499ba5
RS
4388 goto advance;
4389 }
4390
fa9a63c5 4391 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4392 cannot be the start of a match. If the pattern can match the
4393 null string, however, we don't need to skip characters; we want
7814e705 4394 the first null string. */
fa9a63c5
RM
4395 if (fastmap && startpos < total_size && !bufp->can_be_null)
4396 {
66f0296e 4397 register re_char *d;
01618498 4398 register re_wchar_t buf_ch;
e934739e
RS
4399
4400 d = POS_ADDR_VSTRING (startpos);
4401
7814e705 4402 if (range > 0) /* Searching forwards. */
fa9a63c5 4403 {
fa9a63c5 4404 register int lim = 0;
d1dfb56c 4405 ssize_t irange = range;
fa9a63c5 4406
25fe55af
RS
4407 if (startpos < size1 && startpos + range >= size1)
4408 lim = range - (size1 - startpos);
fa9a63c5 4409
25fe55af
RS
4410 /* Written out as an if-else to avoid testing `translate'
4411 inside the loop. */
28ae27ae
AS
4412 if (RE_TRANSLATE_P (translate))
4413 {
e934739e
RS
4414 if (multibyte)
4415 while (range > lim)
4416 {
4417 int buf_charlen;
4418
62a6e103 4419 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4420 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4421 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4422 break;
4423
4424 range -= buf_charlen;
4425 d += buf_charlen;
4426 }
4427 else
bf216479 4428 while (range > lim)
33c46939 4429 {
cf9c99bc
KH
4430 register re_wchar_t ch, translated;
4431
bf216479 4432 buf_ch = *d;
cf9c99bc
KH
4433 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4434 translated = RE_TRANSLATE (translate, ch);
4435 if (translated != ch
4436 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4437 buf_ch = ch;
6fdd04b0 4438 if (fastmap[buf_ch])
bf216479 4439 break;
33c46939
RS
4440 d++;
4441 range--;
4442 }
e934739e 4443 }
fa9a63c5 4444 else
6fdd04b0
KH
4445 {
4446 if (multibyte)
4447 while (range > lim)
4448 {
4449 int buf_charlen;
fa9a63c5 4450
62a6e103 4451 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4452 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4453 break;
4454 range -= buf_charlen;
4455 d += buf_charlen;
4456 }
e934739e 4457 else
6fdd04b0 4458 while (range > lim && !fastmap[*d])
33c46939
RS
4459 {
4460 d++;
4461 range--;
4462 }
e934739e 4463 }
fa9a63c5
RM
4464 startpos += irange - range;
4465 }
7814e705 4466 else /* Searching backwards. */
fa9a63c5 4467 {
ba5e343c
KH
4468 if (multibyte)
4469 {
62a6e103 4470 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4471 buf_ch = TRANSLATE (buf_ch);
4472 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4473 goto advance;
4474 }
4475 else
4476 {
cf9c99bc
KH
4477 register re_wchar_t ch, translated;
4478
4479 buf_ch = *d;
4480 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4481 translated = TRANSLATE (ch);
4482 if (translated != ch
4483 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4484 buf_ch = ch;
4485 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4486 goto advance;
4487 }
fa9a63c5
RM
4488 }
4489 }
4490
4491 /* If can't match the null string, and that's all we have left, fail. */
4492 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4493 && !bufp->can_be_null)
fa9a63c5
RM
4494 return -1;
4495
4496 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4497 startpos, regs, stop);
fa9a63c5
RM
4498
4499 if (val >= 0)
4500 return startpos;
5e69f11e 4501
fa9a63c5
RM
4502 if (val == -2)
4503 return -2;
4504
4505 advance:
5e69f11e 4506 if (!range)
25fe55af 4507 break;
5e69f11e 4508 else if (range > 0)
25fe55af 4509 {
b18215fc
RS
4510 /* Update STARTPOS to the next character boundary. */
4511 if (multibyte)
4512 {
66f0296e 4513 re_char *p = POS_ADDR_VSTRING (startpos);
aa3830c4 4514 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4515
4516 range -= len;
4517 if (range < 0)
4518 break;
4519 startpos += len;
4520 }
4521 else
4522 {
b560c397
RS
4523 range--;
4524 startpos++;
4525 }
e318085a 4526 }
fa9a63c5 4527 else
25fe55af
RS
4528 {
4529 range++;
4530 startpos--;
b18215fc
RS
4531
4532 /* Update STARTPOS to the previous character boundary. */
4533 if (multibyte)
4534 {
70806df6
KH
4535 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4536 re_char *p0 = p;
4537 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4538
4539 /* Find the head of multibyte form. */
70806df6
KH
4540 PREV_CHAR_BOUNDARY (p, phead);
4541 range += p0 - 1 - p;
4542 if (range > 0)
4543 break;
b18215fc 4544
70806df6 4545 startpos -= p0 - 1 - p;
b18215fc 4546 }
25fe55af 4547 }
fa9a63c5
RM
4548 }
4549 return -1;
4550} /* re_search_2 */
c0f9ea08 4551WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4552\f
4553/* Declarations and macros for re_match_2. */
4554
261cb4bb
PE
4555static int bcmp_translate (re_char *s1, re_char *s2,
4556 register ssize_t len,
4557 RE_TRANSLATE_TYPE translate,
4558 const int multibyte);
fa9a63c5
RM
4559
/* Convert PTR, which points into `string1' or `string2', into its
   offset within the virtual concatenation of the two: an offset from
   string1 when FIRST_STRING_P (PTR) holds, otherwise an offset from
   string2 plus size1.  */
#define POINTER_TO_OFFSET(ptr) \
  (FIRST_STRING_P (ptr) \
   ? ((regoff_t) ((ptr) - string1)) \
   : ((regoff_t) ((ptr) - string2 + size1)))
4566
/* Call before fetching a character with *d.  When `d' has reached
   `dend', this either jumps to the `fail' label (if `dend' is already
   `end_match_2') or switches `d' and `dend' over to string2.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH() \
  while (d == dend) \
    { \
      /* Nothing left after the current limit => fail.  */ \
      if (dend == end_match_2) \
        goto fail; \
      /* End of string1 => continue in string2.  */ \
      dend = end_match_2; \
      d = string2; \
    }
4581
f1ad044f
SM
/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc.,
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e. past `stop').  When `d' sits at `end1',
   `d' and `dend' are switched over to string2; there is no failure
   exit.  */
#define PREFETCH_NOLIMIT() \
  if (d == end1) \
    { \
      dend = end_match_2; \
      d = string2; \
    }
fa9a63c5
RM
4592
/* Test if D is at the very beginning or the very end of the virtual
   concatenation of `string1' and `string2'.  If only one string, it's
   `string2'.  AT_STRINGS_BEG is also true whenever size2 is zero.  */
#define AT_STRINGS_BEG(d) \
  ((d) == ((size1) ? string1 : string2) || !(size2))
#define AT_STRINGS_END(d) \
  ((d) == end2)
fa9a63c5 4597
9121ca40 4598/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4599
/* The comment that used to sit at case wordbound is reproduced below;
   AT_WORD_BOUNDARY itself is no longer used, so that the multibyte
   form can be supported.

   The DEC Alpha C compiler 3.x generates incorrect code for the
   test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
   AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
   macro and introducing temporary variables works around the bug.  */
4607
9121ca40 4608#if 0
b313f9d8
PE
4609/* Test if D points to a character which is word-constituent. We have
4610 two special cases to check for: if past the end of string1, look at
4611 the first character in string2; and if before the beginning of
4612 string2, look at the last character in string1. */
4613#define WORDCHAR_P(d) \
4614 (SYNTAX ((d) == end1 ? *string2 \
4615 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4616 == Sword)
4617
fa9a63c5
RM
4618/* Test if the character before D and the one at D differ with respect
4619 to being word-constituent. */
4620#define AT_WORD_BOUNDARY(d) \
4621 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4622 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4623#endif
fa9a63c5
RM
4624
/* Release every buffer the matcher obtained for itself.  */
#ifdef MATCH_MAY_ALLOCATE
/* Hand VAR to REGEX_FREE unless it is already null, and leave it null
   afterwards.  */
# define FREE_VAR(var)							\
  do { if ((var) != NULL) { REGEX_FREE (var); (var) = NULL; } } while (0)
/* Release the failure stack first, then the four register vectors.  */
# define FREE_VARIABLES()						\
  do									\
    {									\
      REGEX_FREE_STACK (fail_stack.stack);				\
      FREE_VAR (regstart);						\
      FREE_VAR (regend);						\
      FREE_VAR (best_regstart);						\
      FREE_VAR (best_regend);						\
    }									\
  while (0)
#else
/* Nothing to do; the void expression merely inhibits a gcc warning.  */
# define FREE_VARIABLES() ((void)0)
#endif /* not MATCH_MAY_ALLOCATE */
4646
505bde11
SM
4647\f
4648/* Optimization routines. */
4649
4e8a9132
SM
4650/* If the operation is a match against one or more chars,
4651 return a pointer to the next operation, else return NULL. */
01618498 4652static re_char *
971de7fb 4653skip_one_char (const re_char *p)
4e8a9132
SM
4654{
4655 switch (SWITCH_ENUM_CAST (*p++))
4656 {
4657 case anychar:
4658 break;
177c0ea7 4659
4e8a9132
SM
4660 case exactn:
4661 p += *p + 1;
4662 break;
4663
4664 case charset_not:
4665 case charset:
4666 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4667 {
4668 int mcnt;
4669 p = CHARSET_RANGE_TABLE (p - 1);
4670 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4671 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4672 }
4673 else
4674 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4675 break;
177c0ea7 4676
4e8a9132
SM
4677 case syntaxspec:
4678 case notsyntaxspec:
1fb352e0 4679#ifdef emacs
4e8a9132
SM
4680 case categoryspec:
4681 case notcategoryspec:
4682#endif /* emacs */
4683 p++;
4684 break;
4685
4686 default:
4687 p = NULL;
4688 }
4689 return p;
4690}
4691
4692
505bde11 4693/* Jump over non-matching operations. */
839966f3 4694static re_char *
971de7fb 4695skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4696{
4697 int mcnt;
4698 while (p < pend)
4699 {
4700 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4701 {
4702 case start_memory:
505bde11
SM
4703 case stop_memory:
4704 p += 2; break;
4705 case no_op:
4706 p += 1; break;
4707 case jump:
4708 p += 1;
4709 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4710 p += mcnt;
4711 break;
4712 default:
4713 return p;
4714 }
4715 }
4716 assert (p == pend);
4717 return p;
4718}
4719
4720/* Non-zero if "p1 matches something" implies "p2 fails". */
/* BUFP is the compiled pattern.  P1 and P2 point at opcodes inside the
   used part of BUFP->buffer; P2 may also sit exactly at its end, which
   is treated like `succeed'.  No-op opcodes at P2 are skipped with
   skip_noops before the comparison; P1 is examined as given.  Whenever
   exclusivity cannot be established the result is 0.  */
4721static int
971de7fb 4722mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
505bde11 4723{
4e8a9132 4724 re_opcode_t op2;
2d1675e4 4725 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4726 unsigned char *pend = bufp->buffer + bufp->used;
4727
4e8a9132 4728 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4729 && p2 >= bufp->buffer && p2 <= pend);
4730
4731 /* Skip over open/close-group commands.
4732 If what follows this loop is a ...+ construct,
4733 look at what begins its body, since we will have to
4734 match at least one of that. */
4e8a9132
SM
4735 p2 = skip_noops (p2, pend);
4736 /* The same skip can be done for p1, except that this function
4737 is only used in the case where p1 is a simple match operator. */
4738 /* p1 = skip_noops (p1, pend); */
4739
4740 assert (p1 >= bufp->buffer && p1 < pend
4741 && p2 >= bufp->buffer && p2 <= pend);
4742
/* P2 must not be dereferenced when it equals PEND; use `succeed' then.  */
4743 op2 = p2 == pend ? succeed : *p2;
4744
4745 switch (SWITCH_ENUM_CAST (op2))
505bde11 4746 {
4e8a9132
SM
4747 case succeed:
4748 case endbuf:
4749 /* If we're at the end of the pattern, we can change. */
/* skip_one_char yields non-NULL only when P1 is one of the
   character-matching opcodes it recognizes.  */
4750 if (skip_one_char (p1))
505bde11 4751 {
505bde11
SM
4752 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4753 return 1;
505bde11 4754 }
4e8a9132 4755 break;
177c0ea7 4756
4e8a9132 4757 case endline:
4e8a9132
SM
4758 case exactn:
4759 {
/* C is the character P2 needs next: a newline for endline, otherwise
   the character read from p2 + 2 (past the opcode and the byte that
   follows it).  */
01618498 4760 register re_wchar_t c
4e8a9132 4761 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4762 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4763
4e8a9132
SM
4764 if ((re_opcode_t) *p1 == exactn)
4765 {
62a6e103 4766 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4767 {
4768 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4769 return 1;
4770 }
4771 }
505bde11 4772
4e8a9132
SM
4773 else if ((re_opcode_t) *p1 == charset
4774 || (re_opcode_t) *p1 == charset_not)
4775 {
4776 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4777
4e8a9132
SM
4778 /* Test if C is listed in charset (or charset_not)
4779 at `p1'. */
6fdd04b0 4780 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4781 {
4782 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4783 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4784 not = !not;
4785 }
4786 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4787 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4788
4e8a9132
SM
4789 /* `not' is equal to 1 if c would match, which means
4790 that we can't change to pop_failure_jump. */
4791 if (!not)
4792 {
4793 DEBUG_PRINT1 (" No match => fast loop.\n");
4794 return 1;
4795 }
4796 }
4797 else if ((re_opcode_t) *p1 == anychar
4798 && c == '\n')
4799 {
4800 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4801 return 1;
4802 }
4803 }
4804 break;
505bde11 4805
4e8a9132 4806 case charset:
4e8a9132
SM
4807 {
4808 if ((re_opcode_t) *p1 == exactn)
4809 /* Reuse the code above. */
4810 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4811
505bde11
SM
4812 /* It is hard to list up all the character in charset
4813 P2 if it includes multibyte character. Give up in
4814 such case. */
4815 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4816 {
4817 /* Now, we are sure that P2 has no range table.
4818 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4819 enough. But P1 may have range table, so the
505bde11
SM
4820 size of bitmap table of P1 is extracted by
4821 using macro `CHARSET_BITMAP_SIZE'.
4822
6fdd04b0
KH
4823 In a multibyte case, we know that all the character
4824 listed in P2 is ASCII. In a unibyte case, P1 has only a
4825 bitmap table. So, in both cases, it is enough to test
4826 only the bitmap table of P1. */
505bde11 4827
411e4203 4828 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4829 {
4830 int idx;
4831 /* We win if the charset inside the loop
4832 has no overlap with the one after the loop. */
4833 for (idx = 0;
4834 (idx < (int) p2[1]
4835 && idx < CHARSET_BITMAP_SIZE (p1));
4836 idx++)
4837 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4838 break;
4839
4840 if (idx == p2[1]
4841 || idx == CHARSET_BITMAP_SIZE (p1))
4842 {
4843 DEBUG_PRINT1 (" No match => fast loop.\n");
4844 return 1;
4845 }
4846 }
411e4203 4847 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4848 {
4849 int idx;
4850 /* We win if the charset_not inside the loop lists
7814e705 4851 every character listed in the charset after. */
505bde11
SM
4852 for (idx = 0; idx < (int) p2[1]; idx++)
4853 if (! (p2[2 + idx] == 0
4854 || (idx < CHARSET_BITMAP_SIZE (p1)
4855 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4856 break;
4857
d1dfb56c
EZ
4858 if (idx == p2[1])
4859 {
4860 DEBUG_PRINT1 (" No match => fast loop.\n");
4861 return 1;
4862 }
4e8a9132
SM
4863 }
4864 }
4865 }
609b757a 4866 break;
177c0ea7
411e4203
SM
4868 case charset_not:
/* For an exactn or charset P1, recurse with the two arguments swapped
   so that the cases above do the work.  Any other P1, including a
   second charset_not, ends at the safe default below.  */
4869 switch (SWITCH_ENUM_CAST (*p1))
4870 {
4871 case exactn:
4872 case charset:
4873 /* Reuse the code above. */
4874 return mutually_exclusive_p (bufp, p2, p1);
4875 case charset_not:
4876 /* When we have two charset_not, it's very unlikely that
4877 they don't overlap. The union of the two sets of excluded
4878 chars should cover all possible chars, which, as a matter of
4879 fact, is virtually impossible in multibyte buffers. */
36595814 4880 break;
411e4203
SM
4881 }
4882 break;
4883
/* In all remaining cases the answer depends only on P1 being a
   syntaxspec/notsyntaxspec (or categoryspec/notcategoryspec) whose
   operand byte p1[1] has the value required by P2.  */
4e8a9132 4884 case wordend:
669fa600
SM
4885 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4886 case symend:
4e8a9132 4887 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4888 && (p1[1] == Ssymbol || p1[1] == Sword));
4889 case notsyntaxspec:
4890 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4891
4892 case wordbeg:
669fa600
SM
4893 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4894 case symbeg:
4e8a9132 4895 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4896 && (p1[1] == Ssymbol || p1[1] == Sword));
4897 case syntaxspec:
4898 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4899
4900 case wordbound:
4901 return (((re_opcode_t) *p1 == notsyntaxspec
4902 || (re_opcode_t) *p1 == syntaxspec)
4903 && p1[1] == Sword);
4904
1fb352e0 4905#ifdef emacs
4e8a9132
SM
4906 case categoryspec:
4907 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4908 case notcategoryspec:
4909 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4910#endif /* emacs */
4911
4912 default:
4913 ;
505bde11
SM
4914 }
4915
4916 /* Safe default. */
4917 return 0;
4918}
4919
fa9a63c5
RM
4920\f
4921/* Matching routines. */
4922
25fe55af 4923#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4924/* re_match is like re_match_2 except it takes only a single string. */
4925
d1dfb56c 4926regoff_t
d2762c86 4927re_match (struct re_pattern_buffer *bufp, const char *string,
d1dfb56c 4928 size_t size, ssize_t pos, struct re_registers *regs)
fa9a63c5 4929{
d1dfb56c
EZ
4930 regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4931 size, pos, regs, size);
fa9a63c5
RM
4932 return result;
4933}
c0f9ea08 4934WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4935#endif /* not emacs */
4936
b18215fc
RS
4937#ifdef emacs
4938/* In Emacs, this is the string or buffer in which we
7814e705 4939 are matching. It is used for looking up syntax properties. */
/* re_match_2 copies it into gl_state.object and passes it to
   SETUP_SYNTAX_TABLE_FOR_OBJECT before it starts matching.  */
b18215fc
RS
4940Lisp_Object re_match_object;
4941#endif
fa9a63c5
RM
4942
4943/* re_match_2 matches the compiled pattern in BUFP against the
4944 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4945 and SIZE2, respectively). We start matching at POS, and stop
4946 matching at STOP.
5e69f11e 4947
fa9a63c5 4948 If REGS is non-null and the `no_sub' field of BUFP is zero, we
7814e705 4949 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4950 documentation for exactly how many groups we fill.
4951
4952 We return -1 if no match, -2 if an internal error (such as the
7814e705 4953 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4954 matched substring. */
4955
d1dfb56c
EZ
4956regoff_t
4957re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4958 size_t size1, const char *string2, size_t size2, ssize_t pos,
4959 struct re_registers *regs, ssize_t stop)
fa9a63c5 4960{
d1dfb56c 4961 regoff_t result;
25fe55af 4962
b18215fc 4963#ifdef emacs
d1dfb56c 4964 ssize_t charpos;
d48cd3f4 4965 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 4966 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4967 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4968#endif
4969
4bb91c68
SM
4970 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4971 (re_char*) string2, size2,
cc9b4df2 4972 pos, regs, stop);
fa9a63c5
RM
4973 return result;
4974}
c0f9ea08 4975WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 4976
bf216479 4977
fa9a63c5 4978/* This is a separate function so that we can force an alloca cleanup
7814e705 4979 afterwards. */
d1dfb56c
EZ
4980static regoff_t
4981re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1,
4982 size_t size1, const re_char *string2, size_t size2,
4983 ssize_t pos, struct re_registers *regs, ssize_t stop)
fa9a63c5
RM
4984{
4985 /* General temporaries. */
d1dfb56c 4986 ssize_t mcnt;
01618498 4987 size_t reg;
fa9a63c5
RM
4988
4989 /* Just past the end of the corresponding string. */
66f0296e 4990 re_char *end1, *end2;
fa9a63c5
RM
4991
4992 /* Pointers into string1 and string2, just past the last characters in
7814e705 4993 each to consider matching. */
66f0296e 4994 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4995
4996 /* Where we are in the data, and the end of the current string. */
66f0296e 4997 re_char *d, *dend;
5e69f11e 4998
99633e97
SM
4999 /* Used sometimes to remember where we were before starting matching
5000 an operator so that we can go back in case of failure. This "atomic"
5001 behavior of matching opcodes is indispensable to the correctness
5002 of the on_failure_keep_string_jump optimization. */
5003 re_char *dfail;
5004
fa9a63c5 5005 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5006 re_char *p = bufp->buffer;
5007 re_char *pend = p + bufp->used;
fa9a63c5 5008
25fe55af 5009 /* We use this to map every character in the string. */
6676cb1c 5010 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5011
cf9c99bc 5012 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5013 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5014
cf9c99bc
KH
5015 /* Nonzero if STRING1/STRING2 are multibyte. */
5016 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5017
fa9a63c5
RM
5018 /* Failure point stack. Each place that can handle a failure further
5019 down the line pushes a failure point on this stack. It consists of
505bde11 5020 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5021 the subexpressions we're currently inside, plus the number of such
5022 registers, and, finally, two char *'s. The first char * is where
5023 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5024 scanning the strings. */
5025#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5026 fail_stack_type fail_stack;
5027#endif
5028#ifdef DEBUG
fa9a63c5
RM
5029 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5030#endif
5031
0b32bf0e 5032#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5033 /* This holds the pointer to the failure stack, when
5034 it is allocated relocatably. */
5035 fail_stack_elt_t *failure_stack_ptr;
99633e97 5036#endif
fa9a63c5
RM
5037
5038 /* We fill all the registers internally, independent of what we
7814e705 5039 return, for use in backreferences. The number here includes
fa9a63c5 5040 an element for register zero. */
4bb91c68 5041 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5042
fa9a63c5
RM
5043 /* Information on the contents of registers. These are pointers into
5044 the input strings; they record just what was matched (on this
5045 attempt) by a subexpression part of the pattern, that is, the
5046 regnum-th regstart pointer points to where in the pattern we began
5047 matching and the regnum-th regend points to right after where we
5048 stopped matching the regnum-th subexpression. (The zeroth register
5049 keeps track of what the whole pattern matches.) */
5050#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5051 re_char **regstart, **regend;
fa9a63c5
RM
5052#endif
5053
fa9a63c5 5054 /* The following record the register info as found in the above
5e69f11e 5055 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5056 This happens as we backtrack through the failure points, which in
5057 turn happens only if we have not yet matched the entire string. */
5058 unsigned best_regs_set = false;
5059#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5060 re_char **best_regstart, **best_regend;
fa9a63c5 5061#endif
5e69f11e 5062
fa9a63c5
RM
5063 /* Logically, this is `best_regend[0]'. But we don't want to have to
5064 allocate space for that if we're not allocating space for anything
7814e705 5065 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5066 any of the other register vectors, and it seems rather a kludge to
5067 treat `best_regend' differently than the rest. So we keep track of
5068 the end of the best match so far in a separate variable. We
5069 initialize this to NULL so that when we backtrack the first time
5070 and need to test it, it's not garbage. */
66f0296e 5071 re_char *match_end = NULL;
fa9a63c5 5072
fa9a63c5
RM
5073#ifdef DEBUG
5074 /* Counts the total number of registers pushed. */
5e69f11e 5075 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5076#endif
5077
5078 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5079
fa9a63c5 5080 INIT_FAIL_STACK ();
5e69f11e 5081
fa9a63c5
RM
5082#ifdef MATCH_MAY_ALLOCATE
5083 /* Do not bother to initialize all the register variables if there are
5084 no groups in the pattern, as it takes a fair amount of time. If
5085 there are groups, we include space for register 0 (the whole
5086 pattern), even though we never use it, since it simplifies the
5087 array indexing. We should fix this. */
5088 if (bufp->re_nsub)
5089 {
66f0296e
SM
5090 regstart = REGEX_TALLOC (num_regs, re_char *);
5091 regend = REGEX_TALLOC (num_regs, re_char *);
5092 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5093 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5094
505bde11 5095 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5096 {
5097 FREE_VARIABLES ();
5098 return -2;
5099 }
fa9a63c5
RM
5100 }
5101 else
5102 {
5103 /* We must initialize all our variables to NULL, so that
25fe55af 5104 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5105 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5106 }
5107#endif /* MATCH_MAY_ALLOCATE */
5108
5109 /* The starting position is bogus. */
5110 if (pos < 0 || pos > size1 + size2)
5111 {
5112 FREE_VARIABLES ();
5113 return -1;
5114 }
5e69f11e 5115
fa9a63c5
RM
5116 /* Initialize subexpression text positions to -1 to mark ones that no
5117 start_memory/stop_memory has been seen for. Also initialize the
5118 register information struct. */
01618498
SM
5119 for (reg = 1; reg < num_regs; reg++)
5120 regstart[reg] = regend[reg] = NULL;
99633e97 5121
fa9a63c5 5122 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5123 `string1' is null. */
fa9a63c5
RM
5124 if (size2 == 0 && string1 != NULL)
5125 {
5126 string2 = string1;
5127 size2 = size1;
5128 string1 = 0;
5129 size1 = 0;
5130 }
5131 end1 = string1 + size1;
5132 end2 = string2 + size2;
5133
5e69f11e 5134 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5135 `dend' is the end of the input string that `d' points within. `d'
5136 is advanced into the following input string whenever necessary, but
5137 this happens before fetching; therefore, at the beginning of the
5138 loop, `d' can be pointing at the end of a string, but it cannot
5139 equal `string2'. */
419d1c74 5140 if (pos >= size1)
fa9a63c5 5141 {
419d1c74
SM
5142 /* Only match within string2. */
5143 d = string2 + pos - size1;
5144 dend = end_match_2 = string2 + stop - size1;
5145 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5146 }
5147 else
5148 {
f1ad044f 5149 if (stop < size1)
419d1c74
SM
5150 {
5151 /* Only match within string1. */
5152 end_match_1 = string1 + stop;
5153 /* BEWARE!
5154 When we reach end_match_1, PREFETCH normally switches to string2.
5155 But in the present case, this means that just doing a PREFETCH
5156 makes us jump from `stop' to `gap' within the string.
5157 What we really want here is for the search to stop as
5158 soon as we hit end_match_1. That's why we set end_match_2
5159 to end_match_1 (since PREFETCH fails as soon as we hit
5160 end_match_2). */
5161 end_match_2 = end_match_1;
5162 }
5163 else
f1ad044f
SM
5164 { /* It's important to use this code when stop == size so that
5165 moving `d' from end1 to string2 will not prevent the d == dend
5166 check from catching the end of string. */
419d1c74
SM
5167 end_match_1 = end1;
5168 end_match_2 = string2 + stop - size1;
5169 }
5170 d = string1 + pos;
5171 dend = end_match_1;
fa9a63c5
RM
5172 }
5173
5174 DEBUG_PRINT1 ("The compiled pattern is: ");
5175 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5176 DEBUG_PRINT1 ("The string to match is: `");
5177 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5178 DEBUG_PRINT1 ("'\n");
5e69f11e 5179
7814e705 5180 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5181 function if the match is complete, or it drops through if the match
5182 fails at this starting point in the input data. */
5183 for (;;)
5184 {
505bde11 5185 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5186
5187 if (p == pend)
5188 { /* End of pattern means we might have succeeded. */
25fe55af 5189 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5190
fa9a63c5 5191 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5192 longest match, try backtracking. */
5193 if (d != end_match_2)
fa9a63c5
RM
5194 {
5195 /* 1 if this match ends in the same string (string1 or string2)
5196 as the best previous match. */
5e69f11e 5197 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5198 == FIRST_STRING_P (d));
fa9a63c5
RM
5199 /* 1 if this match is the best seen so far. */
5200 boolean best_match_p;
5201
5202 /* AIX compiler got confused when this was combined
7814e705 5203 with the previous declaration. */
fa9a63c5
RM
5204 if (same_str_p)
5205 best_match_p = d > match_end;
5206 else
99633e97 5207 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5208
25fe55af
RS
5209 DEBUG_PRINT1 ("backtracking.\n");
5210
5211 if (!FAIL_STACK_EMPTY ())
5212 { /* More failure points to try. */
5213
5214 /* If exceeds best match so far, save it. */
5215 if (!best_regs_set || best_match_p)
5216 {
5217 best_regs_set = true;
5218 match_end = d;
5219
5220 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5221
01618498 5222 for (reg = 1; reg < num_regs; reg++)
25fe55af 5223 {
01618498
SM
5224 best_regstart[reg] = regstart[reg];
5225 best_regend[reg] = regend[reg];
25fe55af
RS
5226 }
5227 }
5228 goto fail;
5229 }
5230
5231 /* If no failure points, don't restore garbage. And if
5232 last match is real best match, don't restore second
5233 best one. */
5234 else if (best_regs_set && !best_match_p)
5235 {
5236 restore_best_regs:
5237 /* Restore best match. It may happen that `dend ==
5238 end_match_1' while the restored d is in string2.
5239 For example, the pattern `x.*y.*z' against the
5240 strings `x-' and `y-z-', if the two strings are
7814e705 5241 not consecutive in memory. */
25fe55af
RS
5242 DEBUG_PRINT1 ("Restoring best registers.\n");
5243
5244 d = match_end;
5245 dend = ((d >= string1 && d <= end1)
5246 ? end_match_1 : end_match_2);
fa9a63c5 5247
01618498 5248 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5249 {
01618498
SM
5250 regstart[reg] = best_regstart[reg];
5251 regend[reg] = best_regend[reg];
fa9a63c5 5252 }
25fe55af
RS
5253 }
5254 } /* d != end_match_2 */
fa9a63c5
RM
5255
5256 succeed_label:
25fe55af 5257 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5258
25fe55af
RS
5259 /* If caller wants register contents data back, do it. */
5260 if (regs && !bufp->no_sub)
fa9a63c5 5261 {
25fe55af
RS
5262 /* Have the register data arrays been allocated? */
5263 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5264 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5265 extra element beyond `num_regs' for the `-1' marker
5266 GNU code uses. */
5267 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5268 regs->start = TALLOC (regs->num_regs, regoff_t);
5269 regs->end = TALLOC (regs->num_regs, regoff_t);
5270 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5271 {
5272 FREE_VARIABLES ();
5273 return -2;
5274 }
25fe55af
RS
5275 bufp->regs_allocated = REGS_REALLOCATE;
5276 }
5277 else if (bufp->regs_allocated == REGS_REALLOCATE)
5278 { /* Yes. If we need more elements than were already
5279 allocated, reallocate them. If we need fewer, just
5280 leave it alone. */
5281 if (regs->num_regs < num_regs + 1)
5282 {
5283 regs->num_regs = num_regs + 1;
5284 RETALLOC (regs->start, regs->num_regs, regoff_t);
5285 RETALLOC (regs->end, regs->num_regs, regoff_t);
5286 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5287 {
5288 FREE_VARIABLES ();
5289 return -2;
5290 }
25fe55af
RS
5291 }
5292 }
5293 else
fa9a63c5
RM
5294 {
5295 /* These braces fend off a "empty body in an else-statement"
7814e705 5296 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5297 assert (bufp->regs_allocated == REGS_FIXED);
5298 }
5299
25fe55af
RS
5300 /* Convert the pointer data in `regstart' and `regend' to
5301 indices. Register zero has to be set differently,
5302 since we haven't kept track of any info for it. */
5303 if (regs->num_regs > 0)
5304 {
5305 regs->start[0] = pos;
99633e97 5306 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5307 }
5e69f11e 5308
25fe55af
RS
5309 /* Go through the first `min (num_regs, regs->num_regs)'
5310 registers, since that is all we initialized. */
01618498 5311 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5312 {
01618498
SM
5313 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5314 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5315 else
5316 {
01618498
SM
5317 regs->start[reg]
5318 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5319 regs->end[reg]
5320 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5321 }
fa9a63c5 5322 }
5e69f11e 5323
25fe55af
RS
5324 /* If the regs structure we return has more elements than
5325 were in the pattern, set the extra elements to -1. If
5326 we (re)allocated the registers, this is the case,
5327 because we always allocate enough to have at least one
7814e705 5328 -1 at the end. */
01618498
SM
5329 for (reg = num_regs; reg < regs->num_regs; reg++)
5330 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5331 } /* regs && !bufp->no_sub */
5332
25fe55af
RS
5333 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5334 nfailure_points_pushed, nfailure_points_popped,
5335 nfailure_points_pushed - nfailure_points_popped);
5336 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5337
99633e97 5338 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5339
25fe55af 5340 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5341
25fe55af
RS
5342 FREE_VARIABLES ();
5343 return mcnt;
5344 }
fa9a63c5 5345
7814e705 5346 /* Otherwise match next pattern command. */
fa9a63c5
RM
5347 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5348 {
25fe55af
RS
5349 /* Ignore these. Used to ignore the n of succeed_n's which
5350 currently have n == 0. */
5351 case no_op:
5352 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5353 break;
fa9a63c5
RM
5354
5355 case succeed:
25fe55af 5356 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5357 goto succeed_label;
5358
7814e705 5359 /* Match the next n pattern characters exactly. The following
25fe55af 5360 byte in the pattern defines n, and the n bytes after that
7814e705 5361 are the characters to match. */
fa9a63c5
RM
5362 case exactn:
5363 mcnt = *p++;
25fe55af 5364 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5365
99633e97
SM
5366 /* Remember the start point to rollback upon failure. */
5367 dfail = d;
5368
6fdd04b0 5369#ifndef emacs
25fe55af
RS
5370 /* This is written out as an if-else so we don't waste time
5371 testing `translate' inside the loop. */
28703c16 5372 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5373 do
5374 {
5375 PREFETCH ();
5376 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5377 {
6fdd04b0
KH
5378 d = dfail;
5379 goto fail;
e934739e 5380 }
6fdd04b0
KH
5381 d++;
5382 }
5383 while (--mcnt);
fa9a63c5 5384 else
6fdd04b0
KH
5385 do
5386 {
5387 PREFETCH ();
5388 if (*d++ != *p++)
bf216479 5389 {
6fdd04b0
KH
5390 d = dfail;
5391 goto fail;
bf216479 5392 }
6fdd04b0
KH
5393 }
5394 while (--mcnt);
5395#else /* emacs */
5396 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5397 if (target_multibyte)
6fdd04b0
KH
5398 do
5399 {
5400 int pat_charlen, buf_charlen;
cf9c99bc 5401 int pat_ch, buf_ch;
e934739e 5402
6fdd04b0 5403 PREFETCH ();
cf9c99bc 5404 if (multibyte)
62a6e103 5405 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5406 else
5407 {
5408 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5409 pat_charlen = 1;
5410 }
62a6e103 5411 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5412
6fdd04b0 5413 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5414 {
6fdd04b0
KH
5415 d = dfail;
5416 goto fail;
e934739e 5417 }
bf216479 5418
6fdd04b0
KH
5419 p += pat_charlen;
5420 d += buf_charlen;
5421 mcnt -= pat_charlen;
5422 }
5423 while (mcnt > 0);
fa9a63c5 5424 else
6fdd04b0
KH
5425 do
5426 {
abbd1bcf 5427 int pat_charlen;
cf9c99bc 5428 int pat_ch, buf_ch;
bf216479 5429
6fdd04b0 5430 PREFETCH ();
cf9c99bc
KH
5431 if (multibyte)
5432 {
62a6e103 5433 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5434 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5435 }
5436 else
5437 {
5438 pat_ch = *p;
5439 pat_charlen = 1;
5440 }
5441 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5442 if (! CHAR_BYTE8_P (buf_ch))
5443 {
5444 buf_ch = TRANSLATE (buf_ch);
5445 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5446 if (buf_ch < 0)
5447 buf_ch = *d;
5448 }
0e2501ed
AS
5449 else
5450 buf_ch = *d;
cf9c99bc 5451 if (buf_ch != pat_ch)
6fdd04b0
KH
5452 {
5453 d = dfail;
5454 goto fail;
bf216479 5455 }
cf9c99bc
KH
5456 p += pat_charlen;
5457 d++;
6fdd04b0
KH
5458 }
5459 while (--mcnt);
5460#endif
25fe55af 5461 break;
fa9a63c5
RM
5462
5463
25fe55af 5464 /* Match any character except possibly a newline or a null. */
fa9a63c5 5465 case anychar:
e934739e
RS
5466 {
5467 int buf_charlen;
01618498 5468 re_wchar_t buf_ch;
fa9a63c5 5469
e934739e 5470 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5471
e934739e 5472 PREFETCH ();
62a6e103 5473 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5474 target_multibyte);
e934739e
RS
5475 buf_ch = TRANSLATE (buf_ch);
5476
5477 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5478 && buf_ch == '\n')
5479 || ((bufp->syntax & RE_DOT_NOT_NULL)
5480 && buf_ch == '\000'))
5481 goto fail;
5482
e934739e
RS
5483 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5484 d += buf_charlen;
5485 }
fa9a63c5
RM
5486 break;
5487
5488
5489 case charset:
5490 case charset_not:
5491 {
b18215fc 5492 register unsigned int c;
fa9a63c5 5493 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5494 int len;
5495
5496 /* Start of actual range_table, or end of bitmap if there is no
5497 range table. */
da053e48 5498 re_char *range_table IF_LINT (= NULL);
b18215fc 5499
96cc36cc 5500 /* Nonzero if there is a range table. */
b18215fc
RS
5501 int range_table_exists;
5502
96cc36cc
RS
5503 /* Number of ranges of range table. This is not included
5504 in the initial byte-length of the command. */
5505 int count = 0;
fa9a63c5 5506
f5020181
AS
5507 /* Whether matching against a unibyte character. */
5508 boolean unibyte_char = false;
5509
25fe55af 5510 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5511
b18215fc 5512 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5513
b18215fc 5514 if (range_table_exists)
96cc36cc
RS
5515 {
5516 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5517 EXTRACT_NUMBER_AND_INCR (count, range_table);
5518 }
b18215fc 5519
2d1675e4 5520 PREFETCH ();
62a6e103 5521 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5522 if (target_multibyte)
5523 {
5524 int c1;
b18215fc 5525
cf9c99bc
KH
5526 c = TRANSLATE (c);
5527 c1 = RE_CHAR_TO_UNIBYTE (c);
5528 if (c1 >= 0)
f5020181
AS
5529 {
5530 unibyte_char = true;
5531 c = c1;
5532 }
cf9c99bc
KH
5533 }
5534 else
5535 {
5536 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5537
5538 if (! CHAR_BYTE8_P (c1))
5539 {
5540 c1 = TRANSLATE (c1);
5541 c1 = RE_CHAR_TO_UNIBYTE (c1);
5542 if (c1 >= 0)
f5020181
AS
5543 {
5544 unibyte_char = true;
5545 c = c1;
5546 }
cf9c99bc 5547 }
0b8be006
AS
5548 else
5549 unibyte_char = true;
cf9c99bc
KH
5550 }
5551
f5020181 5552 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5553 { /* Lookup bitmap. */
b18215fc
RS
5554 /* Cast to `unsigned' instead of `unsigned char' in
5555 case the bit list is a full 32 bytes long. */
5556 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5557 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5558 not = !not;
b18215fc 5559 }
96cc36cc 5560#ifdef emacs
b18215fc 5561 else if (range_table_exists)
96cc36cc
RS
5562 {
5563 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5564
14473664
SM
5565 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5566 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5567 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5568 | (class_bits & BIT_SPACE && ISSPACE (c))
5569 | (class_bits & BIT_UPPER && ISUPPER (c))
5570 | (class_bits & BIT_WORD && ISWORD (c)))
5571 not = !not;
5572 else
5573 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5574 }
5575#endif /* emacs */
fa9a63c5 5576
96cc36cc
RS
5577 if (range_table_exists)
5578 p = CHARSET_RANGE_TABLE_END (range_table, count);
5579 else
5580 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5581
5582 if (!not) goto fail;
5e69f11e 5583
b18215fc 5584 d += len;
fa9a63c5 5585 }
8fb31792 5586 break;
fa9a63c5
RM
5587
5588
25fe55af 5589 /* The beginning of a group is represented by start_memory.
505bde11 5590 The argument is the register number. The text
25fe55af 5591 matched within the group is recorded (in the internal
7814e705 5592 registers data structure) under the register number. */
25fe55af 5593 case start_memory:
505bde11
SM
5594 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5595
5596 /* In case we need to undo this operation (via backtracking). */
5597 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5598
25fe55af 5599 regstart[*p] = d;
4bb91c68 5600 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5601 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5602
25fe55af 5603 /* Move past the register number and inner group count. */
505bde11 5604 p += 1;
25fe55af 5605 break;
fa9a63c5
RM
5606
5607
25fe55af 5608 /* The stop_memory opcode represents the end of a group. Its
505bde11 5609 argument is the same as start_memory's: the register number. */
fa9a63c5 5610 case stop_memory:
505bde11
SM
5611 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5612
5613 assert (!REG_UNSET (regstart[*p]));
5614 /* Strictly speaking, there should be code such as:
177c0ea7 5615
0b32bf0e 5616 assert (REG_UNSET (regend[*p]));
505bde11
SM
5617 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5618
5619 But the only info to be pushed is regend[*p] and it is known to
5620 be UNSET, so there really isn't anything to push.
5621 Not pushing anything, on the other hand deprives us from the
5622 guarantee that regend[*p] is UNSET since undoing this operation
5623 will not reset its value properly. This is not important since
5624 the value will only be read on the next start_memory or at
5625 the very end and both events can only happen if this stop_memory
5626 is *not* undone. */
fa9a63c5 5627
25fe55af 5628 regend[*p] = d;
fa9a63c5
RM
5629 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5630
25fe55af 5631 /* Move past the register number and the inner group count. */
505bde11 5632 p += 1;
25fe55af 5633 break;
fa9a63c5
RM
5634
5635
5636 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5637 followed by the numeric value of <digit> as the register number. */
5638 case duplicate:
fa9a63c5 5639 {
66f0296e 5640 register re_char *d2, *dend2;
7814e705 5641 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5642 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5643
7814e705 5644 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5645 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5646 goto fail;
5e69f11e 5647
7814e705 5648 /* Where in input to try to start matching. */
25fe55af 5649 d2 = regstart[regno];
5e69f11e 5650
99633e97
SM
5651 /* Remember the start point to rollback upon failure. */
5652 dfail = d;
5653
25fe55af
RS
5654 /* Where to stop matching; if both the place to start and
5655 the place to stop matching are in the same string, then
5656 set to the place to stop, otherwise, for now have to use
5657 the end of the first string. */
fa9a63c5 5658
25fe55af 5659 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5660 == FIRST_STRING_P (regend[regno]))
5661 ? regend[regno] : end_match_1);
5662 for (;;)
5663 {
5664 /* If necessary, advance to next segment in register
25fe55af 5665 contents. */
fa9a63c5
RM
5666 while (d2 == dend2)
5667 {
5668 if (dend2 == end_match_2) break;
5669 if (dend2 == regend[regno]) break;
5670
25fe55af
RS
5671 /* End of string1 => advance to string2. */
5672 d2 = string2;
5673 dend2 = regend[regno];
fa9a63c5
RM
5674 }
5675 /* At end of register contents => success */
5676 if (d2 == dend2) break;
5677
5678 /* If necessary, advance to next segment in data. */
5679 PREFETCH ();
5680
5681 /* How many characters left in this segment to match. */
5682 mcnt = dend - d;
5e69f11e 5683
fa9a63c5 5684 /* Want how many consecutive characters we can match in
25fe55af
RS
5685 one shot, so, if necessary, adjust the count. */
5686 if (mcnt > dend2 - d2)
fa9a63c5 5687 mcnt = dend2 - d2;
5e69f11e 5688
fa9a63c5 5689 /* Compare that many; failure if mismatch, else move
25fe55af 5690 past them. */
28703c16 5691 if (RE_TRANSLATE_P (translate)
02cb78b5 5692 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5693 : memcmp (d, d2, mcnt))
99633e97
SM
5694 {
5695 d = dfail;
5696 goto fail;
5697 }
fa9a63c5 5698 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5699 }
5700 }
5701 break;
5702
5703
25fe55af 5704 /* begline matches the empty string at the beginning of the string
c0f9ea08 5705 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5706 case begline:
25fe55af 5707 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5708
25fe55af
RS
5709 if (AT_STRINGS_BEG (d))
5710 {
5711 if (!bufp->not_bol) break;
5712 }
419d1c74 5713 else
25fe55af 5714 {
bf216479 5715 unsigned c;
419d1c74 5716 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5717 if (c == '\n')
419d1c74 5718 break;
25fe55af
RS
5719 }
5720 /* In all other cases, we fail. */
5721 goto fail;
fa9a63c5
RM
5722
5723
25fe55af 5724 /* endline is the dual of begline. */
fa9a63c5 5725 case endline:
25fe55af 5726 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5727
25fe55af
RS
5728 if (AT_STRINGS_END (d))
5729 {
5730 if (!bufp->not_eol) break;
5731 }
f1ad044f 5732 else
25fe55af 5733 {
f1ad044f 5734 PREFETCH_NOLIMIT ();
c0f9ea08 5735 if (*d == '\n')
f1ad044f 5736 break;
25fe55af
RS
5737 }
5738 goto fail;
fa9a63c5
RM
5739
5740
5741 /* Match at the very beginning of the data. */
25fe55af
RS
5742 case begbuf:
5743 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5744 if (AT_STRINGS_BEG (d))
5745 break;
5746 goto fail;
fa9a63c5
RM
5747
5748
5749 /* Match at the very end of the data. */
25fe55af
RS
5750 case endbuf:
5751 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5752 if (AT_STRINGS_END (d))
5753 break;
25fe55af 5754 goto fail;
5e69f11e 5755
5e69f11e 5756
25fe55af
RS
5757 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5758 pushes NULL as the value for the string on the stack. Then
505bde11 5759 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5760 string, instead of restoring it. To see why, consider
7814e705 5761 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5762 then the . fails against the \n. But the next thing we want
5763 to do is match the \n against the \n; if we restored the
5764 string value, we would be back at the foo.
5765
5766 Because this is used only in specific cases, we don't need to
5767 check all the things that `on_failure_jump' does, to make
5768 sure the right things get saved on the stack. Hence we don't
5769 share its code. The only reason to push anything on the
5770 stack at all is that otherwise we would have to change
5771 `anychar's code to do something besides goto fail in this
5772 case; that seems worse than this. */
5773 case on_failure_keep_string_jump:
505bde11
SM
5774 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5775 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5776 mcnt, p + mcnt);
fa9a63c5 5777
505bde11
SM
5778 PUSH_FAILURE_POINT (p - 3, NULL);
5779 break;
5780
0683b6fa
SM
5781 /* A nasty loop is introduced by the non-greedy *? and +?.
5782 With such loops, the stack only ever contains one failure point
5783 at a time, so that a plain on_failure_jump_loop kind of
5784 cycle detection cannot work. Worse yet, such a detection
5785 can not only fail to detect a cycle, but it can also wrongly
5786 detect a cycle (between different instantiations of the same
6df42991 5787 loop).
0683b6fa
SM
5788 So the method used for those nasty loops is a little different:
5789 We use a special cycle-detection-stack-frame which is pushed
5790 when the on_failure_jump_nastyloop failure-point is *popped*.
5791 This special frame thus marks the beginning of one iteration
5792 through the loop and we can hence easily check right here
5793 whether something matched between the beginning and the end of
5794 the loop. */
5795 case on_failure_jump_nastyloop:
5796 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5797 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5798 mcnt, p + mcnt);
5799
5800 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5801 {
5802 int cycle = 0;
5803 CHECK_INFINITE_LOOP (p - 4, d);
5804 if (!cycle)
5805 /* If there's a cycle, just continue without pushing
5806 this failure point. The failure point is the "try again"
5807 option, which shouldn't be tried.
5808 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5809 PUSH_FAILURE_POINT (p - 3, d);
5810 }
0683b6fa
SM
5811 break;
5812
4e8a9132
SM
5813 /* Simple loop detecting on_failure_jump: just check on the
5814 failure stack if the same spot was already hit earlier. */
505bde11
SM
5815 case on_failure_jump_loop:
5816 on_failure:
5817 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5818 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5819 mcnt, p + mcnt);
6df42991
SM
5820 {
5821 int cycle = 0;
5822 CHECK_INFINITE_LOOP (p - 3, d);
5823 if (cycle)
5824 /* If there's a cycle, get out of the loop, as if the matching
5825 had failed. We used to just `goto fail' here, but that was
5826 aborting the search a bit too early: we want to keep the
5827 empty-loop-match and keep matching after the loop.
5828 We want (x?)*y\1z to match both xxyz and xxyxz. */
5829 p += mcnt;
5830 else
5831 PUSH_FAILURE_POINT (p - 3, d);
5832 }
25fe55af 5833 break;
fa9a63c5
RM
5834
5835
5836 /* Uses of on_failure_jump:
5e69f11e 5837
25fe55af
RS
5838 Each alternative starts with an on_failure_jump that points
5839 to the beginning of the next alternative. Each alternative
5840 except the last ends with a jump that in effect jumps past
5841 the rest of the alternatives. (They really jump to the
5842 ending jump of the following alternative, because tensioning
5843 these jumps is a hassle.)
fa9a63c5 5844
25fe55af
RS
5845 Repeats start with an on_failure_jump that points past both
5846 the repetition text and either the following jump or
5847 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5848 case on_failure_jump:
25fe55af 5849 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5850 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5851 mcnt, p + mcnt);
25fe55af 5852
505bde11 5853 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5854 break;
5855
4e8a9132 5856 /* This operation is used for greedy *.
505bde11
SM
5857 Compare the beginning of the repeat with what in the
5858 pattern follows its end. If we can establish that there
5859 is nothing that they would both match, i.e., that we
5860 would have to backtrack because of (as in, e.g., `a*a')
5861 then we can use a non-backtracking loop based on
4e8a9132 5862 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5863 case on_failure_jump_smart:
25fe55af 5864 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5865 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5866 mcnt, p + mcnt);
25fe55af 5867 {
01618498 5868 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5869 /* Here, we discard `const', making re_match non-reentrant. */
5870 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5871 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5872
505bde11
SM
5873 p -= 3; /* Reset so that we will re-execute the
5874 instruction once it's been changed. */
fa9a63c5 5875
4e8a9132
SM
5876 EXTRACT_NUMBER (mcnt, p2 - 2);
5877
5878 /* Ensure this is a indeed the trivial kind of loop
5879 we are expecting. */
5880 assert (skip_one_char (p1) == p2 - 3);
5881 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5882 DEBUG_STATEMENT (debug += 2);
505bde11 5883 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5884 {
505bde11 5885 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5886 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5887 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5888 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5889 }
505bde11 5890 else
fa9a63c5 5891 {
505bde11
SM
5892 /* Default to a safe `on_failure_jump' loop. */
5893 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5894 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5895 }
99633e97 5896 DEBUG_STATEMENT (debug -= 2);
25fe55af 5897 }
505bde11 5898 break;
25fe55af
RS
5899
5900 /* Unconditionally jump (without popping any failure points). */
5901 case jump:
fa9a63c5 5902 unconditional_jump:
5b370c2b 5903 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5904 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5905 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5906 p += mcnt; /* Do the jump. */
505bde11 5907 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5908 break;
5909
5910
25fe55af
RS
5911 /* Have to succeed matching what follows at least n times.
5912 After that, handle like `on_failure_jump'. */
5913 case succeed_n:
01618498 5914 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5915 EXTRACT_NUMBER (mcnt, p + 2);
5916 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5917
dc1e502d
SM
5918 /* Originally, mcnt is how many times we HAVE to succeed. */
5919 if (mcnt != 0)
25fe55af 5920 {
6dcf2d0e
SM
5921 /* Here, we discard `const', making re_match non-reentrant. */
5922 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5923 mcnt--;
01618498
SM
5924 p += 4;
5925 PUSH_NUMBER (p2, mcnt);
25fe55af 5926 }
dc1e502d
SM
5927 else
5928 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5929 goto on_failure;
25fe55af
RS
5930 break;
5931
5932 case jump_n:
01618498 5933 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5934 EXTRACT_NUMBER (mcnt, p + 2);
5935 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5936
5937 /* Originally, this is how many times we CAN jump. */
dc1e502d 5938 if (mcnt != 0)
25fe55af 5939 {
6dcf2d0e
SM
5940 /* Here, we discard `const', making re_match non-reentrant. */
5941 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5942 mcnt--;
01618498 5943 PUSH_NUMBER (p2, mcnt);
dc1e502d 5944 goto unconditional_jump;
25fe55af
RS
5945 }
5946 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5947 else
5948 p += 4;
25fe55af 5949 break;
5e69f11e 5950
fa9a63c5
RM
5951 case set_number_at:
5952 {
01618498 5953 unsigned char *p2; /* Location of the counter. */
25fe55af 5954 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5955
25fe55af 5956 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5957 /* Here, we discard `const', making re_match non-reentrant. */
5958 p2 = (unsigned char*) p + mcnt;
01618498 5959 /* Signedness doesn't matter since we only copy MCNT's bits. */
25fe55af 5960 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5961 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5962 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5963 break;
5964 }
9121ca40
KH
5965
5966 case wordbound:
66f0296e 5967 case notwordbound:
19ed5445
PE
5968 {
5969 boolean not = (re_opcode_t) *(p - 1) == notwordbound;
5970 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5971
19ed5445 5972 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5973
19ed5445
PE
5974 /* Case 1: D is at the beginning or the end of string. */
5975 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5976 not = !not;
5977 else
5978 {
5979 /* C1 is the character before D, S1 is the syntax of C1, C2
5980 is the character at D, and S2 is the syntax of C2. */
5981 re_wchar_t c1, c2;
5982 int s1, s2;
5983 int dummy;
b18215fc 5984#ifdef emacs
d1dfb56c
EZ
5985 ssize_t offset = PTR_TO_OFFSET (d - 1);
5986 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445 5987 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5988#endif
19ed5445
PE
5989 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5990 s1 = SYNTAX (c1);
b18215fc 5991#ifdef emacs
19ed5445 5992 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5993#endif
19ed5445
PE
5994 PREFETCH_NOLIMIT ();
5995 GET_CHAR_AFTER (c2, d, dummy);
5996 s2 = SYNTAX (c2);
5997
5998 if (/* Case 2: Only one of S1 and S2 is Sword. */
5999 ((s1 == Sword) != (s2 == Sword))
6000 /* Case 3: Both of S1 and S2 are Sword, and macro
6001 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
6002 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
6003 not = !not;
6004 }
6005 if (not)
6006 break;
6007 else
6008 goto fail;
6009 }
fa9a63c5
RM
6010
6011 case wordbeg:
25fe55af 6012 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6013
b18215fc
RS
6014 /* We FAIL in one of the following cases: */
6015
7814e705 6016 /* Case 1: D is at the end of string. */
b18215fc 6017 if (AT_STRINGS_END (d))
99633e97 6018 goto fail;
b18215fc
RS
6019 else
6020 {
6021 /* C1 is the character before D, S1 is the syntax of C1, C2
6022 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6023 re_wchar_t c1, c2;
6024 int s1, s2;
bf216479 6025 int dummy;
fa9a63c5 6026#ifdef emacs
d1dfb56c
EZ
6027 ssize_t offset = PTR_TO_OFFSET (d);
6028 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6029 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6030#endif
99633e97 6031 PREFETCH ();
6fdd04b0 6032 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6033 s2 = SYNTAX (c2);
177c0ea7 6034
b18215fc
RS
6035 /* Case 2: S2 is not Sword. */
6036 if (s2 != Sword)
6037 goto fail;
6038
6039 /* Case 3: D is not at the beginning of string ... */
6040 if (!AT_STRINGS_BEG (d))
6041 {
6042 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6043#ifdef emacs
5d967c7a 6044 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6045#endif
b18215fc
RS
6046 s1 = SYNTAX (c1);
6047
6048 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6049 returns 0. */
b18215fc
RS
6050 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6051 goto fail;
6052 }
6053 }
e318085a
RS
6054 break;
6055
b18215fc 6056 case wordend:
25fe55af 6057 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6058
6059 /* We FAIL in one of the following cases: */
6060
6061 /* Case 1: D is at the beginning of string. */
6062 if (AT_STRINGS_BEG (d))
e318085a 6063 goto fail;
b18215fc
RS
6064 else
6065 {
6066 /* C1 is the character before D, S1 is the syntax of C1, C2
6067 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6068 re_wchar_t c1, c2;
6069 int s1, s2;
bf216479 6070 int dummy;
5d967c7a 6071#ifdef emacs
d1dfb56c
EZ
6072 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6073 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6074 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6075#endif
99633e97 6076 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6077 s1 = SYNTAX (c1);
6078
6079 /* Case 2: S1 is not Sword. */
6080 if (s1 != Sword)
6081 goto fail;
6082
6083 /* Case 3: D is not at the end of string ... */
6084 if (!AT_STRINGS_END (d))
6085 {
f1ad044f 6086 PREFETCH_NOLIMIT ();
6fdd04b0 6087 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6088#ifdef emacs
6089 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6090#endif
b18215fc
RS
6091 s2 = SYNTAX (c2);
6092
6093 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6094 returns 0. */
b18215fc 6095 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6096 goto fail;
b18215fc
RS
6097 }
6098 }
e318085a
RS
6099 break;
6100
669fa600
SM
6101 case symbeg:
6102 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6103
6104 /* We FAIL in one of the following cases: */
6105
7814e705 6106 /* Case 1: D is at the end of string. */
669fa600
SM
6107 if (AT_STRINGS_END (d))
6108 goto fail;
6109 else
6110 {
6111 /* C1 is the character before D, S1 is the syntax of C1, C2
6112 is the character at D, and S2 is the syntax of C2. */
6113 re_wchar_t c1, c2;
6114 int s1, s2;
6115#ifdef emacs
d1dfb56c
EZ
6116 ssize_t offset = PTR_TO_OFFSET (d);
6117 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6118 UPDATE_SYNTAX_TABLE (charpos);
6119#endif
6120 PREFETCH ();
62a6e103 6121 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6122 s2 = SYNTAX (c2);
7814e705 6123
669fa600
SM
6124 /* Case 2: S2 is neither Sword nor Ssymbol. */
6125 if (s2 != Sword && s2 != Ssymbol)
6126 goto fail;
6127
6128 /* Case 3: D is not at the beginning of string ... */
6129 if (!AT_STRINGS_BEG (d))
6130 {
6131 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6132#ifdef emacs
6133 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6134#endif
6135 s1 = SYNTAX (c1);
6136
6137 /* ... and S1 is Sword or Ssymbol. */
6138 if (s1 == Sword || s1 == Ssymbol)
6139 goto fail;
6140 }
6141 }
6142 break;
6143
6144 case symend:
6145 DEBUG_PRINT1 ("EXECUTING symend.\n");
6146
6147 /* We FAIL in one of the following cases: */
6148
6149 /* Case 1: D is at the beginning of string. */
6150 if (AT_STRINGS_BEG (d))
6151 goto fail;
6152 else
6153 {
6154 /* C1 is the character before D, S1 is the syntax of C1, C2
6155 is the character at D, and S2 is the syntax of C2. */
6156 re_wchar_t c1, c2;
6157 int s1, s2;
6158#ifdef emacs
d1dfb56c
EZ
6159 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6160 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6161 UPDATE_SYNTAX_TABLE (charpos);
6162#endif
6163 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6164 s1 = SYNTAX (c1);
6165
6166 /* Case 2: S1 is neither Ssymbol nor Sword. */
6167 if (s1 != Sword && s1 != Ssymbol)
6168 goto fail;
6169
6170 /* Case 3: D is not at the end of string ... */
6171 if (!AT_STRINGS_END (d))
6172 {
6173 PREFETCH_NOLIMIT ();
62a6e103 6174 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6175#ifdef emacs
134579f2 6176 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6177#endif
6178 s2 = SYNTAX (c2);
6179
6180 /* ... and S2 is Sword or Ssymbol. */
6181 if (s2 == Sword || s2 == Ssymbol)
6182 goto fail;
b18215fc
RS
6183 }
6184 }
e318085a
RS
6185 break;
6186
fa9a63c5 6187 case syntaxspec:
1fb352e0 6188 case notsyntaxspec:
b18215fc 6189 {
19ed5445
PE
6190 boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6191 mcnt = *p++;
6192 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
6193 PREFETCH ();
6194#ifdef emacs
6195 {
d1dfb56c
EZ
6196 ssize_t offset = PTR_TO_OFFSET (d);
6197 ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445
PE
6198 UPDATE_SYNTAX_TABLE (pos1);
6199 }
25fe55af 6200#endif
19ed5445
PE
6201 {
6202 int len;
6203 re_wchar_t c;
b18215fc 6204
19ed5445
PE
6205 GET_CHAR_AFTER (c, d, len);
6206 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6207 goto fail;
6208 d += len;
6209 }
b18215fc 6210 }
8fb31792 6211 break;
fa9a63c5 6212
b18215fc 6213#ifdef emacs
1fb352e0
SM
6214 case before_dot:
6215 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6216 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6217 goto fail;
b18215fc
RS
6218 break;
6219
1fb352e0
SM
6220 case at_dot:
6221 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6222 if (PTR_BYTE_POS (d) != PT_BYTE)
6223 goto fail;
6224 break;
b18215fc 6225
1fb352e0
SM
6226 case after_dot:
6227 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6228 if (PTR_BYTE_POS (d) <= PT_BYTE)
6229 goto fail;
e318085a 6230 break;
fa9a63c5 6231
1fb352e0 6232 case categoryspec:
b18215fc 6233 case notcategoryspec:
b18215fc 6234 {
8fb31792
PE
6235 boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6236 mcnt = *p++;
6237 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n",
6238 not?"not":"", mcnt);
6239 PREFETCH ();
01618498 6240
8fb31792
PE
6241 {
6242 int len;
6243 re_wchar_t c;
6244 GET_CHAR_AFTER (c, d, len);
6245 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6246 goto fail;
6247 d += len;
6248 }
b18215fc 6249 }
fa9a63c5 6250 break;
5e69f11e 6251
1fb352e0 6252#endif /* emacs */
5e69f11e 6253
0b32bf0e
SM
6254 default:
6255 abort ();
fa9a63c5 6256 }
b18215fc 6257 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6258
6259
6260 /* We goto here if a matching operation fails. */
6261 fail:
5b370c2b 6262 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6263 if (!FAIL_STACK_EMPTY ())
505bde11 6264 {
01618498 6265 re_char *str, *pat;
505bde11 6266 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6267 DEBUG_PRINT1 ("\nFAIL:\n");
6268 POP_FAILURE_POINT (str, pat);
505bde11
SM
6269 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6270 {
6271 case on_failure_keep_string_jump:
6272 assert (str == NULL);
6273 goto continue_failure_jump;
6274
0683b6fa
SM
6275 case on_failure_jump_nastyloop:
6276 assert ((re_opcode_t)pat[-2] == no_op);
6277 PUSH_FAILURE_POINT (pat - 2, str);
6278 /* Fallthrough */
6279
505bde11
SM
6280 case on_failure_jump_loop:
6281 case on_failure_jump:
6282 case succeed_n:
6283 d = str;
6284 continue_failure_jump:
6285 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6286 p = pat + mcnt;
6287 break;
b18215fc 6288
0683b6fa
SM
6289 case no_op:
6290 /* A special frame used for nastyloops. */
6291 goto fail;
6292
505bde11 6293 default:
5e617bc2 6294 abort ();
505bde11 6295 }
fa9a63c5 6296
505bde11 6297 assert (p >= bufp->buffer && p <= pend);
b18215fc 6298
0b32bf0e 6299 if (d >= string1 && d <= end1)
fa9a63c5 6300 dend = end_match_1;
0b32bf0e 6301 }
fa9a63c5 6302 else
0b32bf0e 6303 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6304 } /* for (;;) */
6305
6306 if (best_regs_set)
6307 goto restore_best_regs;
6308
6309 FREE_VARIABLES ();
6310
b18215fc 6311 return -1; /* Failure to match. */
fa9a63c5
RM
6312} /* re_match_2 */
6313\f
6314/* Subroutine definitions for re_match_2. */
6315
fa9a63c5
RM
6316/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6317 bytes; nonzero otherwise. */
5e69f11e 6318
fa9a63c5 6319static int
d1dfb56c 6320bcmp_translate (const re_char *s1, const re_char *s2, register ssize_t len,
438105ed 6321 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6322{
2d1675e4
SM
6323 register re_char *p1 = s1, *p2 = s2;
6324 re_char *p1_end = s1 + len;
6325 re_char *p2_end = s2 + len;
e934739e 6326
4bb91c68
SM
6327 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6328 different lengths, but relying on a single `len' would break this. -sm */
6329 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6330 {
e934739e 6331 int p1_charlen, p2_charlen;
01618498 6332 re_wchar_t p1_ch, p2_ch;
e934739e 6333
6fdd04b0
KH
6334 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6335 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6336
6337 if (RE_TRANSLATE (translate, p1_ch)
6338 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6339 return 1;
e934739e
RS
6340
6341 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6342 }
e934739e
RS
6343
6344 if (p1 != p1_end || p2 != p2_end)
6345 return 1;
6346
fa9a63c5
RM
6347 return 0;
6348}
6349\f
6350/* Entry points for GNU code. */
6351
6352/* re_compile_pattern is the GNU regular expression compiler: it
6353 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6354 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6355
fa9a63c5
RM
6356 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6357 are set in BUFP on entry.
5e69f11e 6358
b18215fc 6359 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6360
6361const char *
d1dfb56c
EZ
6362re_compile_pattern (const char *pattern, size_t length,
6363 struct re_pattern_buffer *bufp)
fa9a63c5
RM
6364{
6365 reg_errcode_t ret;
5e69f11e 6366
fa9a63c5
RM
6367 /* GNU code is written to assume at least RE_NREGS registers will be set
6368 (and at least one extra will be -1). */
6369 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6370
fa9a63c5
RM
6371 /* And GNU code determines whether or not to get register information
6372 by passing null for the REGS argument to re_match, etc., not by
6373 setting no_sub. */
6374 bufp->no_sub = 0;
5e69f11e 6375
4bb91c68 6376 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6377
6378 if (!ret)
6379 return NULL;
6380 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6381}
c0f9ea08 6382WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6383\f
b18215fc
RS
6384/* Entry points compatible with 4.2 BSD regex library. We don't define
6385 them unless specifically requested. */
fa9a63c5 6386
0b32bf0e 6387#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6388
6389/* BSD has one and only one pattern buffer. */
6390static struct re_pattern_buffer re_comp_buf;
6391
6392char *
0b32bf0e 6393# ifdef _LIBC
48afdd44
RM
6394/* Make these definitions weak in libc, so POSIX programs can redefine
6395 these names if they don't use our functions, and still use
6396 regcomp/regexec below without link errors. */
6397weak_function
0b32bf0e 6398# endif
31011111 6399re_comp (const char *s)
fa9a63c5
RM
6400{
6401 reg_errcode_t ret;
5e69f11e 6402
fa9a63c5
RM
6403 if (!s)
6404 {
6405 if (!re_comp_buf.buffer)
0b32bf0e 6406 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6407 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6408 return 0;
6409 }
6410
6411 if (!re_comp_buf.buffer)
6412 {
6413 re_comp_buf.buffer = (unsigned char *) malloc (200);
6414 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6415 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6416 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6417 re_comp_buf.allocated = 200;
6418
6419 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6420 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6421 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6422 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6423 }
6424
6425 /* Since `re_exec' always passes NULL for the `regs' argument, we
6426 don't need to initialize the pattern buffer fields which affect it. */
6427
fa9a63c5 6428 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6429
fa9a63c5
RM
6430 if (!ret)
6431 return NULL;
6432
6433 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6434 return (char *) gettext (re_error_msgid[(int) ret]);
6435}
6436
6437
31011111 6438int
0b32bf0e 6439# ifdef _LIBC
48afdd44 6440weak_function
0b32bf0e 6441# endif
d1dfb56c 6442re_exec (const char *s)
fa9a63c5 6443{
d1dfb56c 6444 const size_t len = strlen (s);
fa9a63c5
RM
6445 return
6446 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6447}
6448#endif /* _REGEX_RE_COMP */
6449\f
6450/* POSIX.2 functions. Don't define these for Emacs. */
6451
6452#ifndef emacs
6453
6454/* regcomp takes a regular expression as a string and compiles it.
6455
b18215fc 6456 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6457 since POSIX says we shouldn't. Thus, we set
6458
6459 `buffer' to the compiled pattern;
6460 `used' to the length of the compiled pattern;
6461 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6462 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6463 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6464 `fastmap' to an allocated space for the fastmap;
6465 `fastmap_accurate' to zero;
fa9a63c5
RM
6466 `re_nsub' to the number of subexpressions in PATTERN.
6467
6468 PATTERN is the address of the pattern string.
6469
6470 CFLAGS is a series of bits which affect compilation.
6471
6472 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6473 use POSIX basic syntax.
6474
6475 If REG_NEWLINE is set, then . and [^...] don't match newline.
6476 Also, regexec will try a match beginning after every newline.
6477
6478 If REG_ICASE is set, then we consider upper- and lowercase
6479 versions of letters to be equivalent when matching.
6480
6481 If REG_NOSUB is set, then when PREG is passed to regexec, that
6482 routine will report only success or failure, and nothing about the
6483 registers.
6484
b18215fc 6485 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6486 the return codes and their meanings.) */
6487
d1dfb56c 6488reg_errcode_t
d2762c86
DN
6489regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6490 int cflags)
fa9a63c5
RM
6491{
6492 reg_errcode_t ret;
4bb91c68 6493 reg_syntax_t syntax
fa9a63c5
RM
6494 = (cflags & REG_EXTENDED) ?
6495 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6496
6497 /* regex_compile will allocate the space for the compiled pattern. */
6498 preg->buffer = 0;
6499 preg->allocated = 0;
6500 preg->used = 0;
5e69f11e 6501
c0f9ea08
SM
6502 /* Try to allocate space for the fastmap. */
6503 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6504
fa9a63c5
RM
6505 if (cflags & REG_ICASE)
6506 {
6507 unsigned i;
5e69f11e 6508
6676cb1c
RS
6509 preg->translate
6510 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6511 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6512 if (preg->translate == NULL)
0b32bf0e 6513 return (int) REG_ESPACE;
fa9a63c5
RM
6514
6515 /* Map uppercase characters to corresponding lowercase ones. */
6516 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6517 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6518 }
6519 else
6520 preg->translate = NULL;
6521
6522 /* If REG_NEWLINE is set, newlines are treated differently. */
6523 if (cflags & REG_NEWLINE)
6524 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6525 syntax &= ~RE_DOT_NEWLINE;
6526 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6527 }
6528 else
c0f9ea08 6529 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6530
6531 preg->no_sub = !!(cflags & REG_NOSUB);
6532
5e69f11e 6533 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6534 can use strlen here in compiling the pattern. */
4bb91c68 6535 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6536
fa9a63c5
RM
6537 /* POSIX doesn't distinguish between an unmatched open-group and an
6538 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6539 if (ret == REG_ERPAREN)
6540 ret = REG_EPAREN;
6541
6542 if (ret == REG_NOERROR && preg->fastmap)
6543 { /* Compute the fastmap now, since regexec cannot modify the pattern
6544 buffer. */
6545 re_compile_fastmap (preg);
6546 if (preg->can_be_null)
6547 { /* The fastmap can't be used anyway. */
6548 free (preg->fastmap);
6549 preg->fastmap = NULL;
6550 }
6551 }
d1dfb56c 6552 return ret;
fa9a63c5 6553}
c0f9ea08 6554WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6555
6556
6557/* regexec searches for a given pattern, specified by PREG, in the
6558 string STRING.
5e69f11e 6559
fa9a63c5 6560 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6561 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6562 least NMATCH elements, and we set them to the offsets of the
6563 corresponding matched substrings.
5e69f11e 6564
fa9a63c5
RM
6565 EFLAGS specifies `execution flags' which affect matching: if
6566 REG_NOTBOL is set, then ^ does not match at the beginning of the
6567 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6568
fa9a63c5
RM
6569 We return 0 if we find a match and REG_NOMATCH if not. */
6570
d1dfb56c 6571reg_errcode_t
d2762c86
DN
6572regexec (const regex_t *__restrict preg, const char *__restrict string,
6573 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
fa9a63c5 6574{
31011111 6575 regoff_t ret;
fa9a63c5
RM
6576 struct re_registers regs;
6577 regex_t private_preg;
d1dfb56c 6578 size_t len = strlen (string);
c0f9ea08 6579 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6580
6581 private_preg = *preg;
5e69f11e 6582
fa9a63c5
RM
6583 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6584 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6585
fa9a63c5
RM
6586 /* The user has told us exactly how many registers to return
6587 information about, via `nmatch'. We have to pass that on to the
b18215fc 6588 matching routines. */
fa9a63c5 6589 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6590
fa9a63c5
RM
6591 if (want_reg_info)
6592 {
6593 regs.num_regs = nmatch;
4bb91c68
SM
6594 regs.start = TALLOC (nmatch * 2, regoff_t);
6595 if (regs.start == NULL)
d1dfb56c 6596 return REG_NOMATCH;
4bb91c68 6597 regs.end = regs.start + nmatch;
fa9a63c5
RM
6598 }
6599
c0f9ea08
SM
6600 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6601 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6602 was a little bit longer but still only matching the real part.
6603 This works because the `endline' will check for a '\n' and will find a
6604 '\0', correctly deciding that this is not the end of a line.
6605 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6606 a convenient '\0' there. For all we know, the string could be preceded
6607 by '\n' which would throw things off. */
6608
fa9a63c5
RM
6609 /* Perform the searching operation. */
6610 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6611 /* start: */ 0, /* range: */ len,
6612 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6613
fa9a63c5
RM
6614 /* Copy the register information to the POSIX structure. */
6615 if (want_reg_info)
6616 {
6617 if (ret >= 0)
0b32bf0e
SM
6618 {
6619 unsigned r;
fa9a63c5 6620
0b32bf0e
SM
6621 for (r = 0; r < nmatch; r++)
6622 {
6623 pmatch[r].rm_so = regs.start[r];
6624 pmatch[r].rm_eo = regs.end[r];
6625 }
6626 }
fa9a63c5 6627
b18215fc 6628 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6629 free (regs.start);
fa9a63c5
RM
6630 }
6631
6632 /* We want zero return to mean success, unlike `re_search'. */
d1dfb56c 6633 return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
fa9a63c5 6634}
c0f9ea08 6635WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6636
6637
ec869672
JR
6638/* Returns a message corresponding to an error code, ERR_CODE, returned
6639 from either regcomp or regexec. We don't use PREG here.
6640
6641 ERR_CODE was previously called ERRCODE, but that name causes an
6642 error with msvc8 compiler. */
fa9a63c5
RM
6643
6644size_t
d2762c86 6645regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6646{
6647 const char *msg;
6648 size_t msg_size;
6649
ec869672
JR
6650 if (err_code < 0
6651 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6652 /* Only error codes returned by the rest of the code should be passed
b18215fc 6653 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6654 code generates an invalid error code, then the program has a bug.
6655 Dump core so we can fix it. */
6656 abort ();
6657
ec869672 6658 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6659
6660 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6661
fa9a63c5
RM
6662 if (errbuf_size != 0)
6663 {
6664 if (msg_size > errbuf_size)
0b32bf0e
SM
6665 {
6666 strncpy (errbuf, msg, errbuf_size - 1);
6667 errbuf[errbuf_size - 1] = 0;
6668 }
fa9a63c5 6669 else
0b32bf0e 6670 strcpy (errbuf, msg);
fa9a63c5
RM
6671 }
6672
6673 return msg_size;
6674}
c0f9ea08 6675WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6676
6677
6678/* Free dynamically allocated space used by PREG. */
6679
6680void
d2762c86 6681regfree (regex_t *preg)
fa9a63c5 6682{
c2cd06e6 6683 free (preg->buffer);
fa9a63c5 6684 preg->buffer = NULL;
5e69f11e 6685
fa9a63c5
RM
6686 preg->allocated = 0;
6687 preg->used = 0;
6688
c2cd06e6 6689 free (preg->fastmap);
fa9a63c5
RM
6690 preg->fastmap = NULL;
6691 preg->fastmap_accurate = 0;
6692
c2cd06e6 6693 free (preg->translate);
fa9a63c5
RM
6694 preg->translate = NULL;
6695}
c0f9ea08 6696WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6697
6698#endif /* not emacs */