Add 2012 to FSF copyright years for Emacs files
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
acaf905b 5 Copyright (C) 1993-2012 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
e468b87f 9 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
4fc5845f 19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 20 USA. */
fa9a63c5 21
6df42991 22/* TODO:
505bde11 23 - structure the opcode space into opcode+flag.
dc1e502d 24 - merge with glibc's regex.[ch].
01618498 25 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
26 need to modify the compiled regexp so that re_match can be reentrant.
27 - get rid of on_failure_jump_smart by doing the optimization in re_comp
28 rather than at run-time, so that re_match can be reentrant.
01618498 29*/
505bde11 30
fa9a63c5 31/* AIX requires this to be the first thing in the file. */
0b32bf0e 32#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
33 #pragma alloca
34#endif
35
fa9a63c5 36#ifdef HAVE_CONFIG_H
0b32bf0e 37# include <config.h>
fa9a63c5
RM
38#endif
39
0e926e56
PE
40#include <stddef.h>
41
42#ifdef emacs
4bb91c68
SM
43/* We need this for `regex.h', and perhaps for the Emacs include files. */
44# include <sys/types.h>
45#endif
fa9a63c5 46
14473664
SM
47/* Whether to use ISO C Amendment 1 wide char functions.
48 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
49#if defined _LIBC
50#define WIDE_CHAR_SUPPORT 1
51#else
14473664 52#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
53 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
54#endif
14473664 55
/* For platforms which support the ISO C Amendment 1 functionality we
   support user-defined character classes.  */
a0ad02f7 58#if WIDE_CHAR_SUPPORT
14473664
SM
59/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
60# include <wchar.h>
61# include <wctype.h>
62#endif
63
c0f9ea08
SM
64#ifdef _LIBC
65/* We have to keep the namespace clean. */
66# define regfree(preg) __regfree (preg)
67# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
68# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672 69# define regerror(err_code, preg, errbuf, errbuf_size) \
5e617bc2 70 __regerror (err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
71# define re_set_registers(bu, re, nu, st, en) \
72 __re_set_registers (bu, re, nu, st, en)
73# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
74 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
75# define re_match(bufp, string, size, pos, regs) \
76 __re_match (bufp, string, size, pos, regs)
77# define re_search(bufp, string, size, startpos, range, regs) \
78 __re_search (bufp, string, size, startpos, range, regs)
79# define re_compile_pattern(pattern, length, bufp) \
80 __re_compile_pattern (pattern, length, bufp)
81# define re_set_syntax(syntax) __re_set_syntax (syntax)
82# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
83 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
84# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
85
14473664
SM
86/* Make sure we call libc's function even if the user overrides them. */
87# define btowc __btowc
88# define iswctype __iswctype
89# define wctype __wctype
90
c0f9ea08
SM
91# define WEAK_ALIAS(a,b) weak_alias (a, b)
92
93/* We are also using some library internals. */
94# include <locale/localeinfo.h>
95# include <locale/elem-hash.h>
96# include <langinfo.h>
97#else
98# define WEAK_ALIAS(a,b)
99#endif
100
4bb91c68 101/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 102#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
103# include <libintl.h>
104#else
105# define gettext(msgid) (msgid)
106#endif
107
5e69f11e
RM
108#ifndef gettext_noop
109/* This define is so xgettext can find the internationalizable
110 strings. */
0b32bf0e 111# define gettext_noop(String) String
5e69f11e
RM
112#endif
113
fa9a63c5
RM
114/* The `emacs' switch turns on certain matching commands
115 that make sense only in Emacs. */
116#ifdef emacs
117
d7306fe6 118# include <setjmp.h>
0b32bf0e
SM
119# include "lisp.h"
120# include "buffer.h"
b18215fc
RS
121
122/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 123# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 124
0b32bf0e 125# include "syntax.h"
9117d724 126# include "character.h"
0b32bf0e 127# include "category.h"
fa9a63c5 128
7689ef0b
EZ
129# ifdef malloc
130# undef malloc
131# endif
0b32bf0e 132# define malloc xmalloc
7689ef0b
EZ
133# ifdef realloc
134# undef realloc
135# endif
0b32bf0e 136# define realloc xrealloc
7689ef0b
EZ
137# ifdef free
138# undef free
139# endif
0b32bf0e 140# define free xfree
9abbd165 141
7814e705 142/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
143# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
144# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
145
146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 147# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
148# define RE_STRING_CHAR(p, multibyte) \
149 (multibyte ? (STRING_CHAR (p)) : (*(p)))
150# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
151 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 152
4c0354d7 153# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 154
2afc21f5 155# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 156
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) and STR2 (which ends at END2).  */
bf216479
KH
160# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
161 do { \
02cb78b5 162 if (target_multibyte) \
bf216479
KH
163 { \
164 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
165 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
166 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 167 c = STRING_CHAR (dtemp); \
bf216479
KH
168 } \
169 else \
170 { \
171 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 172 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 173 } \
2d1675e4
SM
174 } while (0)
175
6fdd04b0
KH
176/* Set C a (possibly converted to multibyte) character at P, and set
177 LEN to the byte length of that character. */
/* Every use of the arguments P and LEN is parenthesized in the
   expansion, so expression arguments such as `ptr + 1' are read and
   assigned as a unit.  The macro relies on a variable named
   `target_multibyte' being in scope where it is used.  */
# define GET_CHAR_AFTER(c, p, len)		\
  do {						\
    if (target_multibyte)			\
      (c) = STRING_CHAR_AND_LENGTH (p, len);	\
    else					\
      {						\
        (c) = *(p);				\
        (len) = 1;				\
        (c) = RE_CHAR_TO_MULTIBYTE (c);		\
      }						\
  } while (0)
4e8a9132 189
fa9a63c5
RM
190#else /* not emacs */
191
192/* If we are not linking with Emacs proper,
193 we can't use the relocating allocator
194 even if config.h says that we can. */
0b32bf0e 195# undef REL_ALLOC
fa9a63c5 196
4004364e 197# include <unistd.h>
fa9a63c5 198
a77f947b
CY
199/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
200
/* Allocate SIZE bytes with malloc and return the new block.
   If the allocation fails for a nonzero SIZE, write a diagnostic to
   standard error and exit with status 1, so the function does not
   return in that case.  A null result is only passed back to the
   caller when SIZE is 0.  */
void *
xmalloc (size_t size)
{
  static const char oom_message[] = "virtual memory exhausted\n";
  void *val = malloc (size);

  if (!val && size)
    {
      /* The length is derived from the literal instead of being
         spelled out as a separate magic number.  */
      write (STDERR_FILENO, oom_message, sizeof oom_message - 1);
      exit (1);
    }
  return val;
}
213
/* Resize BLOCK to SIZE bytes and return the (possibly moved) block.
   A null BLOCK is treated as a fresh allocation.  If the allocation
   fails for a nonzero SIZE, write a diagnostic to standard error and
   exit with status 1, so the function does not return in that case.  */
void *
xrealloc (void *block, size_t size)
{
  static const char oom_message[] = "virtual memory exhausted\n";
  /* We must call malloc explicitly when BLOCK is 0, since some
     reallocs don't do this.  */
  void *val = block ? realloc (block, size) : malloc (size);

  if (!val && size)
    {
      /* The length is derived from the literal instead of being
         spelled out as a separate magic number.  */
      write (STDERR_FILENO, oom_message, sizeof oom_message - 1);
      exit (1);
    }
  return val;
}
231
a073faa6
CY
232# ifdef malloc
233# undef malloc
234# endif
235# define malloc xmalloc
236# ifdef realloc
237# undef realloc
238# endif
239# define realloc xrealloc
240
9cfdb3ec 241# include <string.h>
fa9a63c5
RM
242
243/* Define the syntax stuff for \<, \>, etc. */
244
990b2375 245/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 246enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 247
0b32bf0e 248# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 249
e934739e 250/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
251# define CHAR_CHARSET(c) 0
252# define CHARSET_LEADING_CODE_BASE(c) 0
253# define MAX_MULTIBYTE_LENGTH 1
254# define RE_MULTIBYTE_P(x) 0
bf216479 255# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
256# define WORD_BOUNDARY_P(c1, c2) (0)
257# define CHAR_HEAD_P(p) (1)
258# define SINGLE_BYTE_CHAR_P(c) (1)
259# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 260# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 261# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
262# define STRING_CHAR(p) (*(p))
263# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 264# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
265# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
266# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
267# define RE_CHAR_TO_MULTIBYTE(c) (c)
268# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 269# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 270 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
/* Non-Emacs variant: C receives the single byte at P and LEN is set
   to 1.  All arguments are parenthesized so that expression arguments
   such as `ptr + 1' expand correctly.  */
# define GET_CHAR_AFTER(c, p, len)	\
  ((c) = *(p), (len) = 1)
0b32bf0e 273# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
274# define BYTE8_TO_CHAR(c) (c)
275# define CHAR_BYTE8_P(c) (0)
bf216479 276# define CHAR_LEADING_CODE(c) (c)
8f924df7 277
fa9a63c5 278#endif /* not emacs */
4e8a9132
SM
279
280#ifndef RE_TRANSLATE
0b32bf0e
SM
281# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
282# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 283#endif
fa9a63c5
RM
284\f
285/* Get the interface, including the syntax bits. */
286#include "regex.h"
287
f71b19b6
DL
288/* isalpha etc. are used for the character classes. */
289#include <ctype.h>
fa9a63c5 290
f71b19b6 291#ifdef emacs
fa9a63c5 292
f71b19b6 293/* 1 if C is an ASCII character. */
0b32bf0e 294# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 295
f71b19b6 296/* 1 if C is a unibyte character. */
0b32bf0e 297# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 298
f71b19b6 299/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 300
f71b19b6 301/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
302# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
303# define ISCNTRL(c) ((c) < ' ')
304# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
305 || ((c) >= 'a' && (c) <= 'f') \
306 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
307
308/* This is only used for single-byte characters. */
0b32bf0e 309# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
310
311/* The rest must handle multibyte characters. */
312
0b32bf0e 313# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 314 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
315 : 1)
316
14473664 317# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 318 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
319 : 1)
320
0b32bf0e 321# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
322 ? (((c) >= 'a' && (c) <= 'z') \
323 || ((c) >= 'A' && (c) <= 'Z') \
324 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
325 : SYNTAX (c) == Sword)
326
0b32bf0e 327# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
328 ? (((c) >= 'a' && (c) <= 'z') \
329 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
330 : SYNTAX (c) == Sword)
331
5da9919f 332# define ISLOWER(c) lowercasep (c)
96cc36cc 333
0b32bf0e 334# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
335 ? ((c) > ' ' && (c) < 0177 \
336 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
337 || ((c) >= 'A' && (c) <= 'Z') \
338 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
339 : SYNTAX (c) != Sword)
340
0b32bf0e 341# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 342
5da9919f 343# define ISUPPER(c) uppercasep (c)
96cc36cc 344
0b32bf0e 345# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
346
347#else /* not emacs */
348
f71b19b6 349/* 1 if C is an ASCII character. */
0b32bf0e 350# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
351
352/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
353# define ISUNIBYTE(c) 1
354
355# ifdef isblank
0e926e56 356# define ISBLANK(c) isblank (c)
0b32bf0e
SM
357# else
358# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
359# endif
360# ifdef isgraph
0e926e56 361# define ISGRAPH(c) isgraph (c)
0b32bf0e 362# else
0e926e56 363# define ISGRAPH(c) (isprint (c) && !isspace (c))
0b32bf0e
SM
364# endif
365
0e926e56 366/* Solaris defines ISPRINT so we must undefine it first. */
4bb91c68 367# undef ISPRINT
0e926e56
PE
368# define ISPRINT(c) isprint (c)
369# define ISDIGIT(c) isdigit (c)
370# define ISALNUM(c) isalnum (c)
371# define ISALPHA(c) isalpha (c)
372# define ISCNTRL(c) iscntrl (c)
373# define ISLOWER(c) islower (c)
374# define ISPUNCT(c) ispunct (c)
375# define ISSPACE(c) isspace (c)
376# define ISUPPER(c) isupper (c)
377# define ISXDIGIT(c) isxdigit (c)
0b32bf0e 378
5e617bc2 379# define ISWORD(c) ISALPHA (c)
0b32bf0e 380
4bb91c68 381# ifdef _tolower
5e617bc2 382# define TOLOWER(c) _tolower (c)
4bb91c68 383# else
5e617bc2 384# define TOLOWER(c) tolower (c)
4bb91c68
SM
385# endif
386
387/* How many characters in the character set. */
388# define CHAR_SET_SIZE 256
389
0b32bf0e 390# ifdef SYNTAX_TABLE
f71b19b6 391
0b32bf0e 392extern char *re_syntax_table;
f71b19b6 393
0b32bf0e
SM
394# else /* not SYNTAX_TABLE */
395
0b32bf0e
SM
396static char re_syntax_table[CHAR_SET_SIZE];
397
398static void
d2762c86 399init_syntax_once (void)
0b32bf0e
SM
400{
401 register int c;
402 static int done = 0;
403
404 if (done)
405 return;
406
72af86bd 407 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 408
4bb91c68
SM
409 for (c = 0; c < CHAR_SET_SIZE; ++c)
410 if (ISALNUM (c))
411 re_syntax_table[c] = Sword;
fa9a63c5 412
669fa600 413 re_syntax_table['_'] = Ssymbol;
fa9a63c5 414
0b32bf0e
SM
415 done = 1;
416}
417
418# endif /* not SYNTAX_TABLE */
96cc36cc 419
4bb91c68
SM
420# define SYNTAX(c) re_syntax_table[(c)]
421
96cc36cc
RS
422#endif /* not emacs */
423\f
fa9a63c5
RM
424/* We remove any previous definition of `SIGN_EXTEND_CHAR',
425 since ours (we hope) works properly with all combinations of
426 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 427 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
428#undef SIGN_EXTEND_CHAR
429#if __STDC__
0b32bf0e 430# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
431#else /* not __STDC__ */
432/* As in Harbison and Steele. */
0b32bf0e 433# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
434#endif
435\f
436/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
437 use `alloca' instead of `malloc'. This is because using malloc in
438 re_search* or re_match* could cause memory leaks when C-g is used in
439 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
440 the other hand, malloc is more portable, and easier to debug.
441
fa9a63c5
RM
442 Because we sometimes use alloca, some routines have to be macros,
443 not functions -- `alloca'-allocated space disappears at the end of the
444 function it is called in. */
445
446#ifdef REGEX_MALLOC
447
0b32bf0e
SM
448# define REGEX_ALLOCATE malloc
449# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
450# define REGEX_FREE free
fa9a63c5
RM
451
452#else /* not REGEX_MALLOC */
453
454/* Emacs already defines alloca, sometimes. */
0b32bf0e 455# ifndef alloca
fa9a63c5
RM
456
457/* Make alloca work the best possible way. */
0b32bf0e
SM
458# ifdef __GNUC__
459# define alloca __builtin_alloca
460# else /* not __GNUC__ */
7f585e7a 461# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
462# include <alloca.h>
463# endif /* HAVE_ALLOCA_H */
464# endif /* not __GNUC__ */
fa9a63c5 465
0b32bf0e 466# endif /* not alloca */
fa9a63c5 467
0b32bf0e 468# define REGEX_ALLOCATE alloca
fa9a63c5
RM
469
470/* Assumes a `char *destination' variable. */
0b32bf0e 471# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 472 (destination = (char *) alloca (nsize), \
4bb91c68 473 memcpy (destination, source, osize))
fa9a63c5
RM
474
475/* No need to do anything to free, after alloca. */
0b32bf0e 476# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
477
478#endif /* not REGEX_MALLOC */
479
480/* Define how to allocate the failure stack. */
481
0b32bf0e 482#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 483
0b32bf0e 484# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 485 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 486# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 487 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 488# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
489 r_alloc_free (&failure_stack_ptr)
490
4297555e 491#else /* not using relocating allocator */
fa9a63c5 492
0b32bf0e 493# ifdef REGEX_MALLOC
fa9a63c5 494
0b32bf0e
SM
495# define REGEX_ALLOCATE_STACK malloc
496# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
497# define REGEX_FREE_STACK free
fa9a63c5 498
0b32bf0e 499# else /* not REGEX_MALLOC */
fa9a63c5 500
0b32bf0e 501# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 502
0b32bf0e 503# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 504 REGEX_REALLOCATE (source, osize, nsize)
7814e705 505/* No need to explicitly free anything. */
0b32bf0e 506# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 507
0b32bf0e 508# endif /* not REGEX_MALLOC */
4297555e 509#endif /* not using relocating allocator */
fa9a63c5
RM
510
511
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
25fe55af 515#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
516 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
517
518/* (Re)Allocate N items of type T using malloc, or fail. */
519#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
520#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
fa9a63c5
RM
521#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
522
4bb91c68 523#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
524
525#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
526
527#undef MAX
528#undef MIN
529#define MAX(a, b) ((a) > (b) ? (a) : (b))
530#define MIN(a, b) ((a) < (b) ? (a) : (b))
531
66f0296e 532/* Type of source-pattern and string chars. */
a6fc3b5c
EZ
533#ifdef _MSC_VER
534typedef unsigned char re_char;
535#else
66f0296e 536typedef const unsigned char re_char;
a6fc3b5c 537#endif
66f0296e 538
fa9a63c5
RM
539typedef char boolean;
540#define false 0
541#define true 1
542
d1dfb56c
EZ
543static regoff_t re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
544 re_char *string1, size_t size1,
545 re_char *string2, size_t size2,
546 ssize_t pos,
547 struct re_registers *regs,
548 ssize_t stop));
fa9a63c5
RM
549\f
550/* These are the command codes that appear in compiled regular
4bb91c68 551 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
552 command code can specify any interpretation whatsoever for its
553 arguments. Zero bytes may appear in the compiled regular expression. */
554
typedef enum
{
  no_op = 0,

  /* Succeed immediately, with no further backtracking.  */
  succeed,

  /* One byte N follows, and then N literal bytes.  */
  exactn,

  /* Match (more or less) any character.  */
  anychar,

  /* Match any one character that belongs to the given set.  The first
     following byte is the number of bitmap bytes; the bitmap bytes
     come next and say which characters are in the set.  Bits within
     each byte are ordered low-bit-first, and a character is in the set
     if its bit is 1.  A character too large to have a bit in the map
     is automatically not in the set.

     If the length byte has the 0x80 bit set, the bitmap is followed by
     a range table:
         2 bytes of flags for character sets (low 8 bits, high 8 bits)
             See RANGE_TABLE_WORK_BITS below.
         2 bytes, the number of pairs that follow (up to 32767)
         pairs, each 2 multibyte characters,
             each multibyte character represented as 3 bytes.  */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is not one of those specified.  */
  charset_not,

  /* Begin remembering the matched text so that it can be stored in a
     register.  One byte follows, holding the register number, which
     is in the range 0 to one less than the pattern buffer's re_nsub
     field.  */
  start_memory,

  /* Stop remembering the matched text and store it in a memory
     register.  One byte follows, holding the register number, which
     is in the range 0 to one less than `re_nsub' in the pattern
     buffer.  */
  stop_memory,

  /* Match a duplicate of something remembered.  One byte follows,
     containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeed if at beginning of buffer (if emacs) or at beginning of
     the string to be matched (if not).  */
  begbuf,

  /* The analogous test for end of buffer/string.  */
  endbuf,

  /* A two-byte relative address follows: jump to it.  */
  jump,

  /* A two-byte relative address follows: the place to resume at in
     case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but when executed it pushes a placeholder
     instead of the current string position.  */
  on_failure_keep_string_jump,

  /* Just like `on_failure_jump', except that it checks that we don't
     get stuck in an infinite loop (matching an empty string
     indefinitely).  */
  on_failure_jump_loop,

  /* Just like `on_failure_jump_loop', except that it checks for a
     different kind of loop (the kind that shows up with non-greedy
     operators).  This operation has to be immediately preceded by a
     `no_op'.  */
  on_failure_jump_nastyloop,

  /* A smart `on_failure_jump' used for greedy * and + operators.
     It analyzes the loop before which it is put; if the loop does not
     require backtracking, it changes itself to
     `on_failure_keep_string_jump' and short-circuits the loop,
     otherwise it just defaults to changing itself into
     `on_failure_jump'.  It assumes that it is pointing to just past a
     `jump'.  */
  on_failure_jump_smart,

  /* A two-byte relative address and a two-byte number N follow.
     After matching N times, jump to the address upon failure.
     Does not work if N starts at 0: use on_failure_jump_loop
     instead.  */
  succeed_n,

  /* A two-byte relative address and a two-byte number N follow.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the subsequent
     two-byte number.  The address *includes* the two bytes of
     number.  */
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound,	/* Succeeds if not at a word boundary.  */

  symbeg,	/* Succeeds if at symbol beginning.  */
  symend,	/* Succeeds if at symbol end.  */

  /* Match any character whose syntax is the one specified.  One byte
     follows, containing a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Match any character whose syntax is not the one specified.  */
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Match any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Match any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
694\f
695/* Common operations on the compiled pattern. */
696
697/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
698
699#define STORE_NUMBER(destination, number) \
700 do { \
701 (destination)[0] = (number) & 0377; \
702 (destination)[1] = (number) >> 8; \
703 } while (0)
704
705/* Same as STORE_NUMBER, except increment DESTINATION to
706 the byte after where the number is stored. Therefore, DESTINATION
707 must be an lvalue. */
708
709#define STORE_NUMBER_AND_INCR(destination, number) \
710 do { \
711 STORE_NUMBER (destination, number); \
712 (destination) += 2; \
713 } while (0)
714
715/* Put into DESTINATION a number stored in two contiguous bytes starting
716 at SOURCE. */
717
/* The high byte is sign-extended and then scaled by multiplication
   rather than with `<< 8': when that byte has its top bit set the
   value is negative, and left-shifting a negative value is undefined
   behavior in C.  */
#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8);	\
  } while (0)
723
724#ifdef DEBUG
4bb91c68 725static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
726static void
727extract_number (dest, source)
728 int *dest;
01618498 729 re_char *source;
fa9a63c5 730{
5e69f11e 731 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
732 *dest = *source & 0377;
733 *dest += temp << 8;
734}
735
4bb91c68 736# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
737# undef EXTRACT_NUMBER
738# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
739# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
740
741#endif /* DEBUG */
742
743/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
744 SOURCE must be an lvalue. */
745
746#define EXTRACT_NUMBER_AND_INCR(destination, source) \
747 do { \
748 EXTRACT_NUMBER (destination, source); \
25fe55af 749 (source) += 2; \
fa9a63c5
RM
750 } while (0)
751
752#ifdef DEBUG
4bb91c68
SM
753static void extract_number_and_incr _RE_ARGS ((int *destination,
754 re_char **source));
fa9a63c5
RM
755static void
756extract_number_and_incr (destination, source)
757 int *destination;
01618498 758 re_char **source;
5e69f11e 759{
fa9a63c5
RM
760 extract_number (destination, *source);
761 *source += 2;
762}
763
0b32bf0e
SM
764# ifndef EXTRACT_MACROS
765# undef EXTRACT_NUMBER_AND_INCR
766# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 767 extract_number_and_incr (&dest, &src)
0b32bf0e 768# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
769
770#endif /* DEBUG */
771\f
b18215fc
RS
772/* Store a multibyte character in three contiguous bytes starting
773 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 774 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
775
776#define STORE_CHARACTER_AND_INCR(destination, character) \
777 do { \
778 (destination)[0] = (character) & 0377; \
779 (destination)[1] = ((character) >> 8) & 0377; \
780 (destination)[2] = (character) >> 16; \
781 (destination) += 3; \
782 } while (0)
783
784/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 785 starting at SOURCE. */
b18215fc
RS
786
787#define EXTRACT_CHARACTER(destination, source) \
788 do { \
789 (destination) = ((source)[0] \
790 | ((source)[1] << 8) \
791 | ((source)[2] << 16)); \
792 } while (0)
793
794
795/* Macros for charset. */
796
797/* Size of bitmap of charset P in bytes. P is a start of charset,
798 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
799#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
800
801/* Nonzero if charset P has range table. */
25fe55af 802#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
803
/* Return the address of the range table of charset P.  This is not the
   start of the table proper but the point just before it, where the
   number of ranges is stored.  `4 +' means to skip the re_opcode_t, the
   size byte of the bitmap, and the 2 bytes of flags at the start of
   the range table.  */
808#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
809
810/* Extract the bit flags that start a range table. */
811#define CHARSET_RANGE_TABLE_BITS(p) \
812 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
813 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc 814
b18215fc 815/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
816 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
817 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
818 and end. */
819#define CHARSET_RANGE_TABLE_END(range_table, count) \
820 ((range_table) + (count) * 2 * 3)
821
7814e705 822/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
823 COUNT is number of ranges in RANGE_TABLE. */
824#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
825 do \
826 { \
01618498 827 re_wchar_t range_start, range_end; \
19ed5445 828 re_char *rtp; \
01618498 829 re_char *range_table_end \
b18215fc
RS
830 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
831 \
19ed5445 832 for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
b18215fc 833 { \
19ed5445
PE
834 EXTRACT_CHARACTER (range_start, rtp); \
835 EXTRACT_CHARACTER (range_end, rtp + 3); \
b18215fc
RS
836 \
837 if (range_start <= (c) && (c) <= range_end) \
838 { \
839 (not) = !(not); \
840 break; \
841 } \
842 } \
843 } \
844 while (0)
845
846/* Test if C is in range table of CHARSET. The flag NOT is negated if
847 C is listed in it. */
848#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
849 do \
850 { \
851 /* Number of ranges in range table. */ \
852 int count; \
01618498
SM
853 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
854 \
b18215fc
RS
855 EXTRACT_NUMBER_AND_INCR (count, range_table); \
856 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
857 } \
858 while (0)
859\f
fa9a63c5
RM
860/* If DEBUG is defined, Regex prints many voluminous messages about what
861 it is doing (if the variable `debug' is nonzero). If linked with the
862 main program in `iregex.c', you can enter patterns and strings
863 interactively. And if linked with the main program in `main.c' and
4bb91c68 864 the other test files, you can run the already-written tests. */
fa9a63c5
RM
865
866#ifdef DEBUG
867
868/* We use standard I/O for debugging. */
0b32bf0e 869# include <stdio.h>
fa9a63c5
RM
870
871/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 872# include <assert.h>
fa9a63c5 873
/* Debugging output is produced only while this is positive (see the
   DEBUG_PRINT* macros below).  */
static int debug = -100000;

/* Expand to E itself, so that E is compiled in.  */
# define DEBUG_STATEMENT(e) e

/* Conditional printing helpers.  Each expands to one
   `do { ... } while (0)' statement, so it behaves like a function call
   even as the body of an unbraced `if' that has an `else' branch; a
   bare `if (debug > 0) ...' expansion would capture that `else'.
   Always invoke them with a trailing semicolon.  */
# define DEBUG_PRINT1(x)                                                \
  do { if (debug > 0) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2)                                           \
  do { if (debug > 0) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3)                                       \
  do { if (debug > 0) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4)                                   \
  do { if (debug > 0) printf (x1, x2, x3, x4); } while (0)
/* P is accepted for symmetry with callers but is not used.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                          \
  do { if (debug > 0) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                 \
  do { if (debug > 0) print_double_string (w, s1, sz1, s2, sz2); } while (0)
fa9a63c5
RM
885
886
887/* Print the fastmap in human-readable form. */
888
889void
890print_fastmap (fastmap)
891 char *fastmap;
892{
893 unsigned was_a_range = 0;
5e69f11e
RM
894 unsigned i = 0;
895
fa9a63c5
RM
896 while (i < (1 << BYTEWIDTH))
897 {
898 if (fastmap[i++])
899 {
900 was_a_range = 0;
25fe55af
RS
901 putchar (i - 1);
902 while (i < (1 << BYTEWIDTH) && fastmap[i])
903 {
904 was_a_range = 1;
905 i++;
906 }
fa9a63c5 907 if (was_a_range)
25fe55af
RS
908 {
909 printf ("-");
910 putchar (i - 1);
911 }
912 }
fa9a63c5 913 }
5e69f11e 914 putchar ('\n');
fa9a63c5
RM
915}
916
917
918/* Print a compiled pattern string in human-readable form, starting at
919 the START pointer into it and ending just before the pointer END. */
920
921void
922print_partial_compiled_pattern (start, end)
01618498
SM
923 re_char *start;
924 re_char *end;
fa9a63c5
RM
925{
926 int mcnt, mcnt2;
01618498
SM
927 re_char *p = start;
928 re_char *pend = end;
fa9a63c5
RM
929
930 if (start == NULL)
931 {
a1a052df 932 fprintf (stderr, "(null)\n");
fa9a63c5
RM
933 return;
934 }
5e69f11e 935
fa9a63c5
RM
936 /* Loop over pattern commands. */
937 while (p < pend)
938 {
a1a052df 939 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
940
941 switch ((re_opcode_t) *p++)
942 {
25fe55af 943 case no_op:
a1a052df 944 fprintf (stderr, "/no_op");
25fe55af 945 break;
fa9a63c5 946
99633e97 947 case succeed:
a1a052df 948 fprintf (stderr, "/succeed");
99633e97
SM
949 break;
950
fa9a63c5
RM
951 case exactn:
952 mcnt = *p++;
a1a052df 953 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 954 do
fa9a63c5 955 {
a1a052df 956 fprintf (stderr, "/%c", *p++);
25fe55af
RS
957 }
958 while (--mcnt);
959 break;
fa9a63c5
RM
960
961 case start_memory:
a1a052df 962 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 963 break;
fa9a63c5
RM
964
965 case stop_memory:
a1a052df 966 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 967 break;
fa9a63c5
RM
968
969 case duplicate:
a1a052df 970 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
971 break;
972
973 case anychar:
a1a052df 974 fprintf (stderr, "/anychar");
fa9a63c5
RM
975 break;
976
977 case charset:
25fe55af
RS
978 case charset_not:
979 {
980 register int c, last = -100;
fa9a63c5 981 register int in_range = 0;
99633e97
SM
982 int length = CHARSET_BITMAP_SIZE (p - 1);
983 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 984
a1a052df 985 fprintf (stderr, "/charset [%s",
839966f3 986 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 987
839966f3
KH
988 if (p + *p >= pend)
989 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 990
25fe55af 991 for (c = 0; c < 256; c++)
96cc36cc 992 if (c / 8 < length
fa9a63c5
RM
993 && (p[1 + (c/8)] & (1 << (c % 8))))
994 {
995 /* Are we starting a range? */
996 if (last + 1 == c && ! in_range)
997 {
a1a052df 998 fprintf (stderr, "-");
fa9a63c5
RM
999 in_range = 1;
1000 }
1001 /* Have we broken a range? */
1002 else if (last + 1 != c && in_range)
96cc36cc 1003 {
a1a052df 1004 fprintf (stderr, "%c", last);
fa9a63c5
RM
1005 in_range = 0;
1006 }
5e69f11e 1007
fa9a63c5 1008 if (! in_range)
a1a052df 1009 fprintf (stderr, "%c", c);
fa9a63c5
RM
1010
1011 last = c;
25fe55af 1012 }
fa9a63c5
RM
1013
1014 if (in_range)
a1a052df 1015 fprintf (stderr, "%c", last);
fa9a63c5 1016
a1a052df 1017 fprintf (stderr, "]");
fa9a63c5 1018
99633e97 1019 p += 1 + length;
96cc36cc 1020
96cc36cc 1021 if (has_range_table)
99633e97
SM
1022 {
1023 int count;
a1a052df 1024 fprintf (stderr, "has-range-table");
99633e97
SM
1025
1026 /* ??? Should print the range table; for now, just skip it. */
1027 p += 2; /* skip range table bits */
1028 EXTRACT_NUMBER_AND_INCR (count, p);
1029 p = CHARSET_RANGE_TABLE_END (p, count);
1030 }
fa9a63c5
RM
1031 }
1032 break;
1033
1034 case begline:
a1a052df 1035 fprintf (stderr, "/begline");
25fe55af 1036 break;
fa9a63c5
RM
1037
1038 case endline:
a1a052df 1039 fprintf (stderr, "/endline");
25fe55af 1040 break;
fa9a63c5
RM
1041
1042 case on_failure_jump:
25fe55af 1043 extract_number_and_incr (&mcnt, &p);
a1a052df 1044 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1045 break;
fa9a63c5
RM
1046
1047 case on_failure_keep_string_jump:
25fe55af 1048 extract_number_and_incr (&mcnt, &p);
a1a052df 1049 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1050 break;
fa9a63c5 1051
0683b6fa
SM
1052 case on_failure_jump_nastyloop:
1053 extract_number_and_incr (&mcnt, &p);
a1a052df 1054 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1055 break;
1056
505bde11 1057 case on_failure_jump_loop:
fa9a63c5 1058 extract_number_and_incr (&mcnt, &p);
a1a052df 1059 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1060 break;
1061
505bde11 1062 case on_failure_jump_smart:
fa9a63c5 1063 extract_number_and_incr (&mcnt, &p);
a1a052df 1064 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1065 break;
1066
25fe55af 1067 case jump:
fa9a63c5 1068 extract_number_and_incr (&mcnt, &p);
a1a052df 1069 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1070 break;
1071
25fe55af
RS
1072 case succeed_n:
1073 extract_number_and_incr (&mcnt, &p);
1074 extract_number_and_incr (&mcnt2, &p);
a1a052df 1075 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1076 break;
5e69f11e 1077
25fe55af
RS
1078 case jump_n:
1079 extract_number_and_incr (&mcnt, &p);
1080 extract_number_and_incr (&mcnt2, &p);
a1a052df 1081 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1082 break;
5e69f11e 1083
25fe55af
RS
1084 case set_number_at:
1085 extract_number_and_incr (&mcnt, &p);
1086 extract_number_and_incr (&mcnt2, &p);
a1a052df 1087 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1088 break;
5e69f11e 1089
25fe55af 1090 case wordbound:
a1a052df 1091 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1092 break;
1093
1094 case notwordbound:
a1a052df 1095 fprintf (stderr, "/notwordbound");
25fe55af 1096 break;
fa9a63c5
RM
1097
1098 case wordbeg:
a1a052df 1099 fprintf (stderr, "/wordbeg");
fa9a63c5 1100 break;
5e69f11e 1101
fa9a63c5 1102 case wordend:
a1a052df 1103 fprintf (stderr, "/wordend");
e2543b02 1104 break;
5e69f11e 1105
669fa600 1106 case symbeg:
e2543b02 1107 fprintf (stderr, "/symbeg");
669fa600
SM
1108 break;
1109
1110 case symend:
e2543b02 1111 fprintf (stderr, "/symend");
669fa600 1112 break;
5e69f11e 1113
1fb352e0 1114 case syntaxspec:
a1a052df 1115 fprintf (stderr, "/syntaxspec");
1fb352e0 1116 mcnt = *p++;
a1a052df 1117 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1118 break;
1119
1120 case notsyntaxspec:
a1a052df 1121 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1122 mcnt = *p++;
a1a052df 1123 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1124 break;
1125
0b32bf0e 1126# ifdef emacs
fa9a63c5 1127 case before_dot:
a1a052df 1128 fprintf (stderr, "/before_dot");
25fe55af 1129 break;
fa9a63c5
RM
1130
1131 case at_dot:
a1a052df 1132 fprintf (stderr, "/at_dot");
25fe55af 1133 break;
fa9a63c5
RM
1134
1135 case after_dot:
a1a052df 1136 fprintf (stderr, "/after_dot");
25fe55af 1137 break;
fa9a63c5 1138
1fb352e0 1139 case categoryspec:
a1a052df 1140 fprintf (stderr, "/categoryspec");
fa9a63c5 1141 mcnt = *p++;
a1a052df 1142 fprintf (stderr, "/%d", mcnt);
25fe55af 1143 break;
5e69f11e 1144
1fb352e0 1145 case notcategoryspec:
a1a052df 1146 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1147 mcnt = *p++;
a1a052df 1148 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1149 break;
0b32bf0e 1150# endif /* emacs */
fa9a63c5 1151
fa9a63c5 1152 case begbuf:
a1a052df 1153 fprintf (stderr, "/begbuf");
25fe55af 1154 break;
fa9a63c5
RM
1155
1156 case endbuf:
a1a052df 1157 fprintf (stderr, "/endbuf");
25fe55af 1158 break;
fa9a63c5 1159
25fe55af 1160 default:
a1a052df 1161 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1162 }
1163
a1a052df 1164 fprintf (stderr, "\n");
fa9a63c5
RM
1165 }
1166
a1a052df 1167 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1168}
1169
1170
1171void
1172print_compiled_pattern (bufp)
1173 struct re_pattern_buffer *bufp;
1174{
01618498 1175 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1176
1177 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1178 printf ("%ld bytes used/%ld bytes allocated.\n",
1179 bufp->used, bufp->allocated);
fa9a63c5
RM
1180
1181 if (bufp->fastmap_accurate && bufp->fastmap)
1182 {
1183 printf ("fastmap: ");
1184 print_fastmap (bufp->fastmap);
1185 }
1186
1187 printf ("re_nsub: %d\t", bufp->re_nsub);
1188 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1189 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1190 printf ("no_sub: %d\t", bufp->no_sub);
1191 printf ("not_bol: %d\t", bufp->not_bol);
1192 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1193 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1194 fflush (stdout);
fa9a63c5
RM
1195 /* Perhaps we should print the translate table? */
1196}
1197
1198
1199void
1200print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1201 re_char *where;
1202 re_char *string1;
1203 re_char *string2;
d1dfb56c
EZ
1204 ssize_t size1;
1205 ssize_t size2;
fa9a63c5 1206{
d1dfb56c 1207 ssize_t this_char;
5e69f11e 1208
fa9a63c5
RM
1209 if (where == NULL)
1210 printf ("(null)");
1211 else
1212 {
1213 if (FIRST_STRING_P (where))
25fe55af
RS
1214 {
1215 for (this_char = where - string1; this_char < size1; this_char++)
1216 putchar (string1[this_char]);
fa9a63c5 1217
25fe55af
RS
1218 where = string2;
1219 }
fa9a63c5
RM
1220
1221 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1222 putchar (string2[this_char]);
fa9a63c5
RM
1223 }
1224}
1225
1226#else /* not DEBUG */
1227
0b32bf0e
SM
/* Without DEBUG, assertions and all debugging helpers expand to
   nothing, so their arguments are never evaluated.  */
# undef assert
# define assert(e)

# define DEBUG_STATEMENT(e)
# define DEBUG_PRINT1(x)
# define DEBUG_PRINT2(x1, x2)
# define DEBUG_PRINT3(x1, x2, x3)
# define DEBUG_PRINT4(x1, x2, x3, x4)
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1238
1239#endif /* not DEBUG */
1240\f
4da60324
PE
/* Use this to suppress gcc's `...may be used before initialized'
   warnings: CODE is compiled only when `lint' is defined and vanishes
   otherwise.  */
#ifdef lint
# define IF_LINT(Code) Code
#else
# define IF_LINT(Code) /* empty */
#endif
1247\f
fa9a63c5
RM
1248/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1249 also be assigned to arbitrarily: each pattern buffer stores its own
1250 syntax, so it can be changed between regex compilations. */
1251/* This has no initializer because initialized variables in Emacs
1252 become read-only after dumping. */
1253reg_syntax_t re_syntax_options;
1254
1255
1256/* Specify the precise syntax of regexps for compilation. This provides
1257 for compatibility for various utilities which historically have
1258 different, incompatible syntaxes.
1259
1260 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1261 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1262
1263reg_syntax_t
971de7fb 1264re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1265{
1266 reg_syntax_t ret = re_syntax_options;
5e69f11e 1267
fa9a63c5
RM
1268 re_syntax_options = syntax;
1269 return ret;
1270}
c0f9ea08 1271WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1272
1273/* Regexp to use to replace spaces, or NULL meaning don't. */
1274static re_char *whitespace_regexp;
1275
1276void
971de7fb 1277re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1278{
6470ea05 1279 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1280}
1281WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1282\f
1283/* This table gives an error message for each of the error codes listed
4bb91c68 1284 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1285 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1286 but why not be nice? */
fa9a63c5
RM
1287
1288static const char *re_error_msgid[] =
5e69f11e
RM
1289 {
1290 gettext_noop ("Success"), /* REG_NOERROR */
1291 gettext_noop ("No match"), /* REG_NOMATCH */
1292 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1293 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1294 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1295 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1296 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1297 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1298 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1299 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1300 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1301 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1302 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1303 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1304 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1305 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1306 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1307 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1308 };
1309\f
/* Avoiding alloca during matching, to placate r_alloc.  */

/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
   searching and matching functions should not call alloca.  On some
   systems, alloca is implemented in terms of malloc, and if we're
   using the relocating allocator routines, then malloc could cause a
   relocation, which might (if the strings being searched are in the
   ralloc heap) shift the data out from underneath the regexp
   routines.

   Here's another reason to avoid allocation: Emacs
   processes input from X in a signal handler; processing X input may
   call malloc; if input arrives while a matching routine is calling
   malloc, then we're scrod.  But Emacs can't just block input while
   calling matching routines; then we don't notice interrupts when
   they come in.  So, Emacs blocks input around all regexp calls
   except the matching calls, which it leaves unprotected, in the
   faith that they will not malloc.  */

/* Normally, this is fine.  */
#define MATCH_MAY_ALLOCATE

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
#if defined REGEX_MALLOC && defined emacs
# undef MATCH_MAY_ALLOCATE
#endif
1340
1341\f
/* Failure stack declarations and macros; both re_compile_fastmap and
   re_match_2 use a failure stack.  These have to be macros because of
   REGEX_ALLOCATE_STACK.  */


/* Approximate number of failure points for which to initially allocate space
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  May be overridden by defining it
   before this point.  */
#ifndef INIT_FAILURE_ALLOC
# define INIT_FAILURE_ALLOC 20
#endif
1353
/* Roughly the maximum number of failure points on the stack.  Would be
   exactly that if we always used TYPICAL_FAILURE_SIZE items each time
   we failed.  This is a variable only so users of regex can assign to
   it; we never change it ourselves.  We always multiply it by
   TYPICAL_FAILURE_SIZE before using it, so it should probably be a
   byte-count instead.  */
# if defined MATCH_MAY_ALLOCATE
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2 MB.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
size_t re_max_failures = 40000;
# else
size_t re_max_failures = 4000;
# endif
fa9a63c5
RM
1368
1369union fail_stack_elt
1370{
01618498 1371 re_char *pointer;
c0f9ea08
SM
1372 /* This should be the biggest `int' that's no bigger than a pointer. */
1373 long integer;
fa9a63c5
RM
1374};
1375
1376typedef union fail_stack_elt fail_stack_elt_t;
1377
1378typedef struct
1379{
1380 fail_stack_elt_t *stack;
c0f9ea08
SM
1381 size_t size;
1382 size_t avail; /* Offset of next open position. */
1383 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1384} fail_stack_type;
1385
/* Nonzero if no failure frame has been pushed on `fail_stack'.  */
#define FAIL_STACK_EMPTY()     (fail_stack.frame == 0)
fa9a63c5
RM
1387
1388
/* Define macros to initialize and free the failure stack.
   Do `return -2' if the alloc fails.

   In the allocating variant, note that `size' is set to
   INIT_FAILURE_ALLOC even though space for
   INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE elements is requested.
   The non-allocating variant only resets `avail' and `frame'; it does
   not touch `stack' or `size'.  */

#ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.stack = (fail_stack_elt_t *)                             \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE   \
                            * sizeof (fail_stack_elt_t));               \
                                                                        \
    if (fail_stack.stack == NULL)                                       \
      return -2;                                                        \
                                                                        \
    fail_stack.size = INIT_FAILURE_ALLOC;                               \
    fail_stack.avail = 0;                                               \
    fail_stack.frame = 0;                                               \
  } while (0)
#else
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.avail = 0;                                               \
    fail_stack.frame = 0;                                               \
  } while (0)

/* Resize ADDR to N elements of type T if it is non-null, otherwise
   allocate it afresh.  */
# define RETALLOC_IF(addr, n, t) \
  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
#endif
1416
1417
320a2a73
KH
/* Grow FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a
   limit which allows approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
#define FAIL_STACK_GROWTH_FACTOR 4

#define GROW_FAIL_STACK(fail_stack)                                     \
  (((fail_stack).size * sizeof (fail_stack_elt_t)                       \
    >= re_max_failures * TYPICAL_FAILURE_SIZE)                          \
   ? 0                                                                  \
   : ((fail_stack).stack                                                \
      = (fail_stack_elt_t *)                                            \
        REGEX_REALLOCATE_STACK ((fail_stack).stack,                     \
          (fail_stack).size * sizeof (fail_stack_elt_t),                \
          MIN (re_max_failures * TYPICAL_FAILURE_SIZE,                  \
               ((fail_stack).size * sizeof (fail_stack_elt_t)           \
                * FAIL_STACK_GROWTH_FACTOR))),                          \
                                                                        \
      (fail_stack).stack == NULL                                        \
      ? 0                                                               \
      : ((fail_stack).size                                              \
         = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE,                \
                 ((fail_stack).size * sizeof (fail_stack_elt_t)         \
                  * FAIL_STACK_GROWTH_FACTOR))                          \
            / sizeof (fail_stack_elt_t)),                               \
         1)))
fa9a63c5
RM
1453
1454
fa9a63c5
RM
/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  No bounds check is
   done here; the caller must have made room first.  */
#define PUSH_FAILURE_POINTER(item)                                      \
  fail_stack.stack[fail_stack.avail++].pointer = (item)

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_INT(item)                                          \
  fail_stack.stack[fail_stack.avail++].integer = (item)

/* These POP... operations complement the PUSH... operations.
   All assume that `fail_stack' is nonempty.  */
#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
fa9a63c5 1471
505bde11
SM
/* Individual items aside from the registers.  */
#define NUM_NONREG_ITEMS 3

/* Used to examine the stack (to detect infinite loops).  A handle H is
   the offset just past a frame's three non-register items: the pattern
   pointer is at H - 1, the string pointer at H - 2 and the previous
   frame's handle at H - 3.  */
#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1480
1481
505bde11
SM
/* Grow `fail_stack' until more than SPACE slots are free.  Does
   `return -2' from the enclosing function if the stack cannot grow.
   GROW_FAIL_STACK requires `destination' be declared by the caller.
   The sizes are size_t, so they are converted to unsigned long to
   match the `%lu' conversions.  */
#define ENSURE_FAIL_STACK(space)                                        \
while (REMAINING_AVAIL_SLOTS <= (space)) {                              \
  if (!GROW_FAIL_STACK (fail_stack))                                    \
    return -2;                                                          \
  DEBUG_PRINT2 ("\n Grew stack; size now: %lu\n",                       \
                (unsigned long) (fail_stack).size);                     \
  DEBUG_PRINT2 (" slots available: %lu\n",                              \
                (unsigned long) REMAINING_AVAIL_SLOTS);                 \
}
1489
/* Push register NUM onto the stack: its start pointer, its end
   pointer, and finally NUM itself so that the popper can tell which
   register the two pointers belong to.  */
#define PUSH_FAILURE_REG(num)                                           \
do {                                                                    \
  char *destination;                                                    \
  ENSURE_FAIL_STACK(3);                                                 \
  DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n",                   \
                num, regstart[num], regend[num]);                       \
  PUSH_FAILURE_POINTER (regstart[num]);                                 \
  PUSH_FAILURE_POINTER (regend[num]);                                   \
  PUSH_FAILURE_INT (num);                                               \
} while (0)
1501
01618498
SM
/* Change the counter's value to VAL, but make sure that it will
   be reset when backtracking.  The old value and the counter's address
   are pushed, followed by the marker -1 that distinguishes a counter
   entry from a register entry when popping.  */
#define PUSH_NUMBER(ptr,val)                                            \
do {                                                                    \
  char *destination;                                                    \
  int c;                                                                \
  ENSURE_FAIL_STACK(3);                                                 \
  EXTRACT_NUMBER (c, ptr);                                              \
  DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val);           \
  PUSH_FAILURE_INT (c);                                                 \
  PUSH_FAILURE_POINTER (ptr);                                           \
  PUSH_FAILURE_INT (-1);                                                \
  STORE_NUMBER (ptr, val);                                              \
} while (0)
1516
/* Pop a saved register or counter off the stack.  The topmost integer
   is either -1, marking a counter whose address and old value follow,
   or the number of a register whose end and start pointers follow.
   `pfreg' is a long, hence the `%ld' conversions.  */
#define POP_FAILURE_REG_OR_COUNT()                                      \
do {                                                                    \
  long pfreg = POP_FAILURE_INT ();                                      \
  if (pfreg == -1)                                                      \
    {                                                                   \
      /* It's a counter.  */                                            \
      /* Here, we discard `const', making re_match non-reentrant.  */   \
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER ();     \
      pfreg = POP_FAILURE_INT ();                                       \
      STORE_NUMBER (ptr, pfreg);                                        \
      DEBUG_PRINT3 (" Pop counter %p = %ld\n", ptr, pfreg);             \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      regend[pfreg] = POP_FAILURE_POINTER ();                           \
      regstart[pfreg] = POP_FAILURE_POINTER ();                         \
      DEBUG_PRINT4 (" Pop reg %ld (spanning %p -> %p)\n",               \
                    pfreg, regstart[pfreg], regend[pfreg]);             \
    }                                                                   \
} while (0)
1538
/* Check that we are not stuck in an infinite loop.  Walk the failure
   frames, newest first, for as long as their saved string position is
   STRING_PLACE or NULL; if one of them saved the pattern position
   PAT_CUR, set `cycle' to 1.  Requires `cycle', `fail_stack' and (for
   the assertion) `bufp' in scope.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)                      \
do {                                                                    \
  ssize_t failure = TOP_FAILURE_HANDLE ();                              \
  /* Check for infinite matching loops */                               \
  while (failure > 0                                                    \
         && (FAILURE_STR (failure) == string_place                      \
             || FAILURE_STR (failure) == NULL))                         \
    {                                                                   \
      assert (FAILURE_PAT (failure) >= bufp->buffer                     \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
      if (FAILURE_PAT (failure) == pat_cur)                             \
        {                                                               \
          cycle = 1;                                                    \
          break;                                                        \
        }                                                               \
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure));     \
      failure = NEXT_FAILURE_HANDLE(failure);                           \
    }                                                                   \
  /* When the walk ran off the bottom of the stack there is no frame    \
     left to inspect; FAILURE_STR (0) would index before the start of   \
     the stack.  */                                                     \
  if (failure > 0)                                                      \
    {                                                                   \
      DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure));      \
    }                                                                   \
} while (0)
6df42991 1560
/* Push the information about the state we will need
   if we ever fail back to it.

   Requires variables fail_stack, regstart, regend and
   num_regs be declared.  GROW_FAIL_STACK requires `destination' be
   declared.

   Does `return -2' (through ENSURE_FAIL_STACK) if it runs out of
   memory.  The stack offsets are size_t, so they are converted to
   unsigned long to match the `%lu' conversions.  */

#define PUSH_FAILURE_POINT(pattern, string_place)                       \
do {                                                                    \
  char *destination;                                                    \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_pushed++);                           \
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n");                             \
  DEBUG_PRINT2 (" Before push, next avail: %lu\n",                      \
                (unsigned long) (fail_stack).avail);                    \
  DEBUG_PRINT2 (" size: %lu\n", (unsigned long) (fail_stack).size);     \
                                                                        \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);                                 \
                                                                        \
  DEBUG_PRINT1 ("\n");                                                  \
                                                                        \
  DEBUG_PRINT2 (" Push frame index: %lu\n",                             \
                (unsigned long) fail_stack.frame);                      \
  PUSH_FAILURE_INT (fail_stack.frame);                                  \
                                                                        \
  DEBUG_PRINT2 (" Push string %p: `", string_place);                    \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT1 ("'\n");                                                 \
  PUSH_FAILURE_POINTER (string_place);                                  \
                                                                        \
  DEBUG_PRINT2 (" Push pattern %p: ", pattern);                         \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);                   \
  PUSH_FAILURE_POINTER (pattern);                                       \
                                                                        \
  /* Close the frame by moving the frame pointer past it.  */           \
  fail_stack.frame = fail_stack.avail;                                  \
} while (0)
fa9a63c5 1600
320a2a73
KH
/* Estimate the size of data pushed by a typical failure stack entry.
   An estimate is all we need, because all we use this for
   is to choose a limit for how big to make the failure stack.  */
/* BEWARE, the value `20' is hard-coded in emacs.c:main().  */
#define TYPICAL_FAILURE_SIZE 20

/* How many items can still be added to the stack without overflowing it.  */
#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1609
1610
/* Pops what PUSH_FAILURE_POINT pushes, together with any registers and
   counters pushed on top of that frame.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
   The arrays `regstart' and `regend' are restored as a side effect.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  The stack
   offsets are size_t, so they are converted to unsigned long to match
   the `%lu' conversions.  */

#define POP_FAILURE_POINT(str, pat)                                     \
do {                                                                    \
  assert (!FAIL_STACK_EMPTY ());                                        \
                                                                        \
  /* Remove failure points and point to how many regs pushed.  */       \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");                                \
  DEBUG_PRINT2 (" Before pop, next avail: %lu\n",                       \
                (unsigned long) fail_stack.avail);                      \
  DEBUG_PRINT2 (" size: %lu\n", (unsigned long) fail_stack.size);       \
                                                                        \
  /* Pop the saved registers.  */                                       \
  while (fail_stack.frame < fail_stack.avail)                           \
    POP_FAILURE_REG_OR_COUNT ();                                        \
                                                                        \
  pat = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT2 (" Popping pattern %p: ", pat);                          \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
                                                                        \
  /* If the saved string location is NULL, it came from an              \
     on_failure_keep_string_jump opcode, and we want to throw away the  \
     saved NULL, thus retaining our current position in the string.  */ \
  str = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT2 (" Popping string %p: `", str);                          \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
  DEBUG_PRINT1 ("'\n");                                                 \
                                                                        \
  fail_stack.frame = POP_FAILURE_INT ();                                \
  DEBUG_PRINT2 (" Popping frame index: %lu\n",                          \
                (unsigned long) fail_stack.frame);                      \
                                                                        \
  assert (fail_stack.avail >= 0);                                       \
  assert (fail_stack.frame <= fail_stack.avail);                        \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_popped++);                           \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1654
1655
1656\f
/* Registers are set to a sentinel (NULL) when they haven't yet matched.  */
#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1659\f
1660/* Subroutine declarations and macros for regex_compile. */
1661
4bb91c68
SM
1662static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1663 reg_syntax_t syntax,
1664 struct re_pattern_buffer *bufp));
1665static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1666static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1667 int arg1, int arg2));
1668static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1669 int arg, unsigned char *end));
1670static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1671 int arg1, int arg2, unsigned char *end));
01618498
SM
1672static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1673 re_char *p,
4bb91c68 1674 reg_syntax_t syntax));
01618498
SM
1675static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1676 re_char *pend,
4bb91c68 1677 reg_syntax_t syntax));
01618498
SM
1678static re_char *skip_one_char _RE_ARGS ((re_char *p));
1679static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1680 char *fastmap, const int multibyte));
fa9a63c5 1681
/* Fetch the next character in the uncompiled pattern, with no
   translation.  Stores the character in C and advances `p' past it.
   Does `return REG_EEND' from the enclosing function if the pattern is
   exhausted.  Requires `p', `pend' and `multibyte' in scope.  */
#define PATFETCH(c)                                                     \
  do {                                                                  \
    int len;                                                            \
    if (p == pend) return REG_EEND;                                     \
    c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte);                  \
    p += len;                                                           \
  } while (0)
1691
fa9a63c5
RM
1692
1693/* If `translate' is non-null, return translate[D], else just D. We
1694 cast the subscript to translate because some data is declared as
1695 `char *', to avoid warnings when a string constant is passed. But
1696 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1697#ifndef TRANSLATE
0b32bf0e 1698# define TRANSLATE(d) \
66f0296e 1699 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1700#endif
fa9a63c5
RM
1701
1702
/* Macros for outputting the compiled pattern into `buffer'.  They
   assume `b' (the current end of the compiled pattern) and `bufp' are
   in scope.  */

/* If the buffer isn't allocated when it comes in, use this.  */
#define INIT_BUF_SIZE  32

/* Make sure we have at least N more bytes of space in buffer.  */
#define GET_BUFFER_SPACE(n)                                             \
    while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated)         \
      EXTEND_BUFFER ()

/* Make sure we have one more byte of buffer space and then add C to it.  */
#define BUF_PUSH(c)                                                     \
  do {                                                                  \
    GET_BUFFER_SPACE (1);                                               \
    *b++ = (unsigned char) (c);                                         \
  } while (0)


/* Ensure we have two more bytes of buffer space and then append C1 and C2.  */
#define BUF_PUSH_2(c1, c2)                                              \
  do {                                                                  \
    GET_BUFFER_SPACE (2);                                               \
    *b++ = (unsigned char) (c1);                                        \
    *b++ = (unsigned char) (c2);                                        \
  } while (0)
1729
/* Store a jump with opcode OP at LOC to location TO.  We store a
   relative address offset by the three bytes the jump itself occupies.  */
#define STORE_JUMP(op, loc, to) \
  store_op1 (op, loc, (to) - (loc) - 3)

/* Likewise, for a two-argument jump.  */
#define STORE_JUMP2(op, loc, to, arg) \
  store_op2 (op, loc, (to) - (loc) - 3, arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.  */
#define INSERT_JUMP(op, loc, to) \
  insert_op1 (op, loc, (to) - (loc) - 3, b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
#define INSERT_JUMP2(op, loc, to, arg) \
  insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1746
1747
/* Upper bound on the size of a compiled pattern.  The bound is
   structural rather than arbitrary: offsets into the pattern are
   stored in two bytes, so raising it beyond 2^15 bytes would require
   many other things to change.  */
# define MAX_BUF_SIZE (1L << 15)

#if 0  /* Kept from the time the limit was believed to be 2^16 bytes.  */
/* Any compiler whose allocation limit is below 2^16 bytes (as with
   MSC) needs the same treatment as MSC below, i.e. a slightly smaller
   MAX_BUF_SIZE.  Otherwise you may end up reallocating to 0 bytes,
   which is not going to work too well.  You have been warned!!  */
#if defined _MSC_VER  && !defined WIN32
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.  */
# define MAX_BUF_SIZE 65500L
#else
# define MAX_BUF_SIZE (1L << 16)
#endif
#endif /* 0 */
fa9a63c5
RM
1766
/* Enlarge the compiled-pattern buffer and keep every pointer into it
   valid.

   EXTEND_BUFFER doubles `bufp->allocated', clamps the result to
   MAX_BUF_SIZE, and reallocates `bufp->buffer' to that size.  It makes
   the enclosing function return REG_ESIZE when the buffer was already
   at MAX_BUF_SIZE on entry, and REG_ESPACE when the reallocation
   leaves `bufp->buffer' null.  If the storage moved, `b', `begalt'
   and -- when non-null -- `fixup_alt_jump', `laststart' and
   `pending_exact' are rebased onto the new block.  All of those
   variables, plus `bufp', must be in scope at the point of use.

   Under __BOUNDED_POINTERS__ rebasing also rewrites each pointer's
   low and high bounds, and the high bounds are refreshed even when the
   block did not move.  */
#if __BOUNDED_POINTERS__
# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
# define MOVE_BUFFER_POINTER(P)					\
  (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer),	\
   SET_HIGH_BOUND (P),						\
   __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND		\
  else						\
    {						\
      SET_HIGH_BOUND (b);			\
      SET_HIGH_BOUND (begalt);			\
      if (fixup_alt_jump)			\
	SET_HIGH_BOUND (fixup_alt_jump);	\
      if (laststart)				\
	SET_HIGH_BOUND (laststart);		\
      if (pending_exact)			\
	SET_HIGH_BOUND (pending_exact);		\
    }
#else
/* Plain pointers: keep P at the same offset within the new block.  */
# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
#define EXTEND_BUFFER()							\
  do {									\
    unsigned char *old_buffer = bufp->buffer;				\
    if (bufp->allocated == MAX_BUF_SIZE)				\
      return REG_ESIZE;							\
    bufp->allocated <<= 1;						\
    if (bufp->allocated > MAX_BUF_SIZE)					\
      bufp->allocated = MAX_BUF_SIZE;					\
    RETALLOC (bufp->buffer, bufp->allocated, unsigned char);		\
    if (bufp->buffer == NULL)						\
      return REG_ESPACE;						\
    /* If the buffer moved, move all the pointers into it.  */		\
    if (old_buffer != bufp->buffer)					\
      {									\
	unsigned char *new_buffer = bufp->buffer;			\
	MOVE_BUFFER_POINTER (b);					\
	MOVE_BUFFER_POINTER (begalt);					\
	if (fixup_alt_jump)						\
	  MOVE_BUFFER_POINTER (fixup_alt_jump);				\
	if (laststart)							\
	  MOVE_BUFFER_POINTER (laststart);				\
	if (pending_exact)						\
	  MOVE_BUFFER_POINTER (pending_exact);				\
      }									\
    ELSE_EXTEND_BUFFER_HIGH_BOUND					\
  } while (0)
1819
1820
/* The register-number operand of {start,stop}_memory is a single
   byte, so this is the highest group number that can be reported.  */
#define MAX_REGNUM 255

/* A pattern may nevertheless contain more than MAX_REGNUM groups; the
   surplus ones are simply ignored.  */
typedef int regnum_t;


/* Types and accessors for the stack of open groups.  */

/* Offsets may point forwards or backwards, so this type has to cover
   -(MAX_BUF_SIZE - 1) .. MAX_BUF_SIZE - 1.  `int' may be too narrow
   when sizeof (int) == 2, hence `long'.  */
typedef long pattern_offset_t;

/* One saved entry of the compile stack.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The stack itself: a growable array of entries.  */
typedef struct
{
  compile_stack_elt_t *stack;
  size_t size;
  size_t avail;			/* Index of the next free slot.  */
} compile_stack_type;


#define INIT_COMPILE_STACK_SIZE 32

/* These accessors operate on a variable named `compile_stack' that
   must be in scope where they are used.  */
#define COMPILE_STACK_EMPTY  (0 == compile_stack.avail)
#define COMPILE_STACK_FULL  (compile_stack.size == compile_stack.avail)

/* The next free slot (an lvalue).  */
#define COMPILE_STACK_TOP (*(compile_stack.stack + compile_stack.avail))
1862
1cee1e27
SM
/* Explicit quit checking is compiled in only for Emacs builds that
   define QUIT together with WINDOWSNT or SYNC_INPUT (NTemacs, and
   builds that poll for input events).  There the macro runs QUIT when
   `immediate_quit' is set; in every other configuration it expands to
   a no-op expression.  */
#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
extern int immediate_quit;
# define IMMEDIATE_QUIT_CHECK			\
    do {					\
      if (immediate_quit) QUIT;			\
    } while (0)
#else
# define IMMEDIATE_QUIT_CHECK    ((void)0)
#endif
1874\f
b18215fc
RS
/* Scratch storage used while collecting the range table of a
   charset.  */
struct range_table_work_area
{
  int *table;			/* Storage for the collected values.  */
  int allocated;		/* Size of TABLE, in bytes.  */
  int used;			/* Number of ints of TABLE in use.  */
  int bits;			/* Character-class flags (BIT_* below).  */
};

/* Make sure WORK_AREA has room for N more ints.

   WORK_AREA must be the struct itself (an lvalue), not a pointer to
   it: the macro accesses its members with `.' and takes its address
   for extend_range_table_work_area.  If the table is still null
   afterwards, the enclosing function returns REG_ESPACE.  */

#define EXTEND_RANGE_TABLE(work_area, n)				\
  do {									\
    if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
      {									\
        extend_range_table_work_area (&(work_area));			\
        if ((work_area).table == 0)					\
          return (REG_ESPACE);						\
      }									\
  } while (0)

/* Turn on the class flag(s) BIT in WORK_AREA.  The expansion is fully
   parenthesized so that it can be used inside a larger expression.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)		\
  ((work_area).bits |= (bit))

/* Bits used to implement the multibyte-part of the various character classes
   such as [:alnum:] in a charset's range table.  */
#define BIT_WORD	0x1
#define BIT_LOWER	0x2
#define BIT_PUNCT	0x4
#define BIT_SPACE	0x8
#define BIT_UPPER	0x10
#define BIT_MULTIBYTE	0x20

/* Append the pair (RANGE_START, RANGE_END) to WORK_AREA, growing it
   first if needed.  May return REG_ESPACE from the enclosing function
   (see EXTEND_RANGE_TABLE).  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\
  do {									\
    EXTEND_RANGE_TABLE ((work_area), 2);				\
    (work_area).table[(work_area).used++] = (range_start);		\
    (work_area).table[(work_area).used++] = (range_end);		\
  } while (0)

/* Release the storage owned by WORK_AREA, if it has any.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)	\
  do {						\
    if ((work_area).table)			\
      free ((work_area).table);			\
  } while (0)

/* Forget the collected values and flags; the storage is kept.  */
#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
/* Number of ints currently stored in WORK_AREA.  */
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
/* Class flags currently set in WORK_AREA.  */
#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
/* The I-th stored int of WORK_AREA (an lvalue).  */
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1930\f
b18215fc 1931
/* Turn on the bit for character C in the bitmap that starts at `b'
   (which must be in scope); each byte of the map holds BYTEWIDTH
   bits.  C is expanded twice.  */
#define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1934
1935
bf216479
KH
1936#ifdef emacs
1937
cf9c99bc
KH
/* The SETUP_*_RANGE macros store the characters FROM..TO in the
   bitmap at `b' (for ASCII and unibyte characters) and in WORK_AREA
   (for multibyte characters).  They translate each character and pay
   attention to the continuity of the translated characters.

   Implementation note: these fairly big macros would be better written
   as functions, but that is not easy because the macros they call
   assume that various local variables are already declared at the
   point of use.  */

/* Variant for the case where both FROM and TO are ASCII characters.

   Every character C0 of the range is translated.  A translation that
   is not ASCII is recorded in WORK_AREA as a one-character range and
   then converted to unibyte, with C0 used instead when that conversion
   yields a negative value.  The resulting character's bit is set in
   the bitmap.  */

#define SETUP_ASCII_RANGE(work_area, FROM, TO)			       \
  do {								       \
    int C0, C1;							       \
								       \
    for (C0 = (FROM); C0 <= (TO); C0++)				       \
      {								       \
	C1 = TRANSLATE (C0);					       \
	if (! ASCII_CHAR_P (C1))				       \
	  {							       \
	    SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);	       \
	    if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0)		       \
	      C1 = C0;						       \
	  }							       \
	SET_LIST_BIT (C1);					       \
      }								       \
  } while (0)
1965
1966
/* Variant for the case where both FROM and TO are unibyte characters
   (0x80..0xFF).

   Each C0 is converted to its multibyte form C1.  If C1 satisfies
   CHAR_BYTE8_P, only the bitmap bit for C0 is set.  Otherwise C1 is
   translated to C2; the bitmap bit set is the unibyte form of C2, or
   C0 when the translation changed nothing or C2 has no unibyte form.
   C2 is then merged into WORK_AREA.  Only the pairs appended since
   this macro started (index >= USED) are considered: a pair that
   already contains C2 is left alone, a pair that C2 is adjacent to is
   widened by one, and if neither is found a new pair (C2, C2) is
   appended.  */

#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)			       \
  do {									       \
    int C0, C1, C2, I;							       \
    int USED = RANGE_TABLE_WORK_USED (work_area);			       \
									       \
    for (C0 = (FROM); C0 <= (TO); C0++)					       \
      {									       \
	C1 = RE_CHAR_TO_MULTIBYTE (C0);					       \
	if (CHAR_BYTE8_P (C1))						       \
	  SET_LIST_BIT (C0);						       \
	else								       \
	  {								       \
	    C2 = TRANSLATE (C1);					       \
	    if (C2 == C1						       \
		|| (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0)			       \
	      C1 = C0;							       \
	    SET_LIST_BIT (C1);						       \
	    for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
	      {								       \
		int from = RANGE_TABLE_WORK_ELT (work_area, I);		       \
		int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);	       \
									       \
		if (C2 >= from - 1 && C2 <= to + 1)			       \
		  {							       \
		    if (C2 == from - 1)					       \
		      RANGE_TABLE_WORK_ELT (work_area, I)--;		       \
		    else if (C2 == to + 1)				       \
		      RANGE_TABLE_WORK_ELT (work_area, I + 1)++;	       \
		    break;						       \
		  }							       \
	      }								       \
	    if (I < USED)						       \
	      SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);		       \
	  }								       \
      }									       \
  } while (0)
2005
2006
/* Variant for the case where both FROM and TO are multibyte
   characters.

   The pair (FROM, TO) itself is appended to WORK_AREA first.  Then,
   for each C0 in the range, C1 is its translation.  If C1 has a
   unibyte form -- or, failing that, C1 differs from C0 and C0 has
   one -- that unibyte character's bit is set in the bitmap.  A C1 that
   falls inside FROM..TO needs nothing more.  Any other C1 is merged
   into the pairs appended since this macro started (index >= USED):
   a pair already containing C1 is left alone, an adjacent pair is
   widened by one, and otherwise a new pair (C1, C1) is appended.  */

#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)			   \
  do {									   \
    int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area);	   \
									   \
    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));		   \
    for (C0 = (FROM); C0 <= (TO); C0++)					   \
      {									   \
	C1 = TRANSLATE (C0);						   \
	if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0				   \
	    || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))	   \
	  SET_LIST_BIT (C2);						   \
	if (C1 >= (FROM) && C1 <= (TO))					   \
	  continue;							   \
	for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
	  {								   \
	    int from = RANGE_TABLE_WORK_ELT (work_area, I);		   \
	    int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);		   \
									   \
	    if (C1 >= from - 1 && C1 <= to + 1)				   \
	      {								   \
		if (C1 == from - 1)					   \
		  RANGE_TABLE_WORK_ELT (work_area, I)--;		   \
		else if (C1 == to + 1)					   \
		  RANGE_TABLE_WORK_ELT (work_area, I + 1)++;		   \
		break;							   \
	      }								   \
	  }								   \
	if (I < USED)							   \
	  SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);		   \
      }									   \
  } while (0)
2040
2041#endif /* emacs */
2042
/* Read a run of decimal digits from the uncompiled pattern into NUM.

   Relies on `p', `pend' and `c' being in scope.  A negative NUM is
   reset to 0 before the first digit is accumulated, and NUM is left
   untouched when the first character fetched is not a digit.  On exit
   `c' holds the first non-digit character fetched.

   The enclosing function is left through FREE_STACK_RETURN with
   REG_EBRACE if the pattern ends before a non-digit is seen, and with
   REG_BADBR if NUM / 10 no longer equals the previous value after
   adding a digit.
   NOTE(review): that second test runs after the multiplication, so it
   presumably relies on wrap-around when NUM overflows -- confirm.  */
#define GET_UNSIGNED_NUMBER(num)					\
  do {									\
    if (p == pend)							\
      FREE_STACK_RETURN (REG_EBRACE);					\
    else								\
      {									\
	PATFETCH (c);							\
	while ('0' <= c && c <= '9')					\
	  {								\
	    int prev;							\
	    if (num < 0)						\
	      num = 0;							\
	    prev = num;							\
	    num = num * 10 + c - '0';					\
	    if (num / 10 != prev)					\
	      FREE_STACK_RETURN (REG_BADBR);				\
	    if (p == pend)						\
	      FREE_STACK_RETURN (REG_EBRACE);				\
	    PATFETCH (c);						\
	  }								\
      }									\
  } while (0)
77d11aec 2066\f
1fdab503 2067#if ! WIDE_CHAR_SUPPORT
01618498 2068
14473664 2069/* Map a string to the char class it names (if any). */
1fdab503 2070re_wctype_t
971de7fb 2071re_wctype (const re_char *str)
14473664 2072{
5b0534c8 2073 const char *string = (const char *) str;
14473664
SM
2074 if (STREQ (string, "alnum")) return RECC_ALNUM;
2075 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2076 else if (STREQ (string, "word")) return RECC_WORD;
2077 else if (STREQ (string, "ascii")) return RECC_ASCII;
2078 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2079 else if (STREQ (string, "graph")) return RECC_GRAPH;
2080 else if (STREQ (string, "lower")) return RECC_LOWER;
2081 else if (STREQ (string, "print")) return RECC_PRINT;
2082 else if (STREQ (string, "punct")) return RECC_PUNCT;
2083 else if (STREQ (string, "space")) return RECC_SPACE;
2084 else if (STREQ (string, "upper")) return RECC_UPPER;
2085 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2086 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2087 else if (STREQ (string, "digit")) return RECC_DIGIT;
2088 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2089 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2090 else if (STREQ (string, "blank")) return RECC_BLANK;
2091 else return 0;
2092}
2093
e0f24100 2094/* True if CH is in the char class CC. */
1fdab503 2095boolean
971de7fb 2096re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2097{
2098 switch (cc)
2099 {
f3fcc40d
AS
2100 case RECC_ALNUM: return ISALNUM (ch) != 0;
2101 case RECC_ALPHA: return ISALPHA (ch) != 0;
2102 case RECC_BLANK: return ISBLANK (ch) != 0;
2103 case RECC_CNTRL: return ISCNTRL (ch) != 0;
2104 case RECC_DIGIT: return ISDIGIT (ch) != 0;
2105 case RECC_GRAPH: return ISGRAPH (ch) != 0;
2106 case RECC_LOWER: return ISLOWER (ch) != 0;
2107 case RECC_PRINT: return ISPRINT (ch) != 0;
2108 case RECC_PUNCT: return ISPUNCT (ch) != 0;
2109 case RECC_SPACE: return ISSPACE (ch) != 0;
2110 case RECC_UPPER: return ISUPPER (ch) != 0;
2111 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2112 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
213bd7f2 2113 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
f3fcc40d 2114 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
213bd7f2 2115 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
f3fcc40d 2116 case RECC_WORD: return ISWORD (ch) != 0;
0cdd06f8
SM
2117 case RECC_ERROR: return false;
2118 default:
5e617bc2 2119 abort ();
14473664
SM
2120 }
2121}
fa9a63c5 2122
14473664
SM
2123/* Return a bit-pattern to use in the range-table bits to match multibyte
2124 chars of class CC. */
2125static int
971de7fb 2126re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2127{
2128 switch (cc)
2129 {
2130 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2131 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2132 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2133 case RECC_LOWER: return BIT_LOWER;
2134 case RECC_UPPER: return BIT_UPPER;
2135 case RECC_PUNCT: return BIT_PUNCT;
2136 case RECC_SPACE: return BIT_SPACE;
14473664 2137 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2138 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2139 default:
5e617bc2 2140 abort ();
14473664
SM
2141 }
2142}
2143#endif
77d11aec
RS
2144\f
2145/* Filling in the work area of a range. */
2146
2147/* Actually extend the space in WORK_AREA. */
2148
2149static void
971de7fb 2150extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2151{
77d11aec
RS
2152 work_area->allocated += 16 * sizeof (int);
2153 if (work_area->table)
2154 work_area->table
2155 = (int *) realloc (work_area->table, work_area->allocated);
2156 else
2157 work_area->table
2158 = (int *) malloc (work_area->allocated);
2159}
2160
8f924df7 2161#if 0
77d11aec
RS
2162#ifdef emacs
2163
2164/* Carefully find the ranges of codes that are equivalent
2165 under case conversion to the range start..end when passed through
2166 TRANSLATE. Handle the case where non-letters can come in between
2167 two upper-case letters (which happens in Latin-1).
2168 Also handle the case of groups of more than 2 case-equivalent chars.
2169
2170 The basic method is to look at consecutive characters and see
2171 if they can form a run that can be handled as one.
2172
2173 Returns -1 if successful, REG_ESPACE if ran out of space. */
2174
2175static int
1dae0f0a
AS
2176set_image_of_range_1 (struct range_table_work_area *work_area,
2177 re_wchar_t start, re_wchar_t end,
2178 RE_TRANSLATE_TYPE translate)
77d11aec
RS
2179{
2180 /* `one_case' indicates a character, or a run of characters,
2181 each of which is an isolate (no case-equivalents).
2182 This includes all ASCII non-letters.
2183
2184 `two_case' indicates a character, or a run of characters,
2185 each of which has two case-equivalent forms.
2186 This includes all ASCII letters.
2187
2188 `strange' indicates a character that has more than one
2189 case-equivalent. */
177c0ea7 2190
77d11aec
RS
2191 enum case_type {one_case, two_case, strange};
2192
2193 /* Describe the run that is in progress,
2194 which the next character can try to extend.
2195 If run_type is strange, that means there really is no run.
2196 If run_type is one_case, then run_start...run_end is the run.
2197 If run_type is two_case, then the run is run_start...run_end,
2198 and the case-equivalents end at run_eqv_end. */
2199
2200 enum case_type run_type = strange;
2201 int run_start, run_end, run_eqv_end;
2202
2203 Lisp_Object eqv_table;
2204
2205 if (!RE_TRANSLATE_P (translate))
2206 {
b7c12565 2207 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2208 work_area->table[work_area->used++] = (start);
2209 work_area->table[work_area->used++] = (end);
b7c12565 2210 return -1;
77d11aec
RS
2211 }
2212
2213 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2214
77d11aec
RS
2215 for (; start <= end; start++)
2216 {
2217 enum case_type this_type;
2218 int eqv = RE_TRANSLATE (eqv_table, start);
2219 int minchar, maxchar;
2220
2221 /* Classify this character */
2222 if (eqv == start)
2223 this_type = one_case;
2224 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2225 this_type = two_case;
2226 else
2227 this_type = strange;
2228
2229 if (start < eqv)
2230 minchar = start, maxchar = eqv;
2231 else
2232 minchar = eqv, maxchar = start;
2233
2234 /* Can this character extend the run in progress? */
2235 if (this_type == strange || this_type != run_type
2236 || !(minchar == run_end + 1
2237 && (run_type == two_case
2238 ? maxchar == run_eqv_end + 1 : 1)))
2239 {
2240 /* No, end the run.
2241 Record each of its equivalent ranges. */
2242 if (run_type == one_case)
2243 {
2244 EXTEND_RANGE_TABLE (work_area, 2);
2245 work_area->table[work_area->used++] = run_start;
2246 work_area->table[work_area->used++] = run_end;
2247 }
2248 else if (run_type == two_case)
2249 {
2250 EXTEND_RANGE_TABLE (work_area, 4);
2251 work_area->table[work_area->used++] = run_start;
2252 work_area->table[work_area->used++] = run_end;
2253 work_area->table[work_area->used++]
2254 = RE_TRANSLATE (eqv_table, run_start);
2255 work_area->table[work_area->used++]
2256 = RE_TRANSLATE (eqv_table, run_end);
2257 }
2258 run_type = strange;
2259 }
177c0ea7 2260
77d11aec
RS
2261 if (this_type == strange)
2262 {
2263 /* For a strange character, add each of its equivalents, one
2264 by one. Don't start a range. */
2265 do
2266 {
2267 EXTEND_RANGE_TABLE (work_area, 2);
2268 work_area->table[work_area->used++] = eqv;
2269 work_area->table[work_area->used++] = eqv;
2270 eqv = RE_TRANSLATE (eqv_table, eqv);
2271 }
2272 while (eqv != start);
2273 }
2274
2275 /* Add this char to the run, or start a new run. */
2276 else if (run_type == strange)
2277 {
2278 /* Initialize a new range. */
2279 run_type = this_type;
2280 run_start = start;
2281 run_end = start;
2282 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2283 }
2284 else
2285 {
2286 /* Extend a running range. */
2287 run_end = minchar;
2288 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2289 }
2290 }
2291
2292 /* If a run is still in progress at the end, finish it now
2293 by recording its equivalent ranges. */
2294 if (run_type == one_case)
2295 {
2296 EXTEND_RANGE_TABLE (work_area, 2);
2297 work_area->table[work_area->used++] = run_start;
2298 work_area->table[work_area->used++] = run_end;
2299 }
2300 else if (run_type == two_case)
2301 {
2302 EXTEND_RANGE_TABLE (work_area, 4);
2303 work_area->table[work_area->used++] = run_start;
2304 work_area->table[work_area->used++] = run_end;
2305 work_area->table[work_area->used++]
2306 = RE_TRANSLATE (eqv_table, run_start);
2307 work_area->table[work_area->used++]
2308 = RE_TRANSLATE (eqv_table, run_end);
2309 }
2310
2311 return -1;
2312}
36595814 2313
77d11aec 2314#endif /* emacs */
36595814 2315
2b34df4e 2316/* Record the image of the range start..end when passed through
36595814
SM
2317 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2318 and is not even necessarily contiguous.
b7c12565
RS
2319 Normally we approximate it with the smallest contiguous range that contains
2320 all the chars we need. However, for Latin-1 we go to extra effort
2321 to do a better job.
2322
2323 This function is not called for ASCII ranges.
77d11aec
RS
2324
2325 Returns -1 if successful, REG_ESPACE if ran out of space. */
2326
2327static int
1dae0f0a
AS
2328set_image_of_range (struct range_table_work_area *work_area,
2329 re_wchar_t start, re_wchar_t end,
2330 RE_TRANSLATE_TYPE translate)
36595814 2331{
77d11aec
RS
2332 re_wchar_t cmin, cmax;
2333
2334#ifdef emacs
2335 /* For Latin-1 ranges, use set_image_of_range_1
2336 to get proper handling of ranges that include letters and nonletters.
b7c12565 2337 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2338 For other character sets, we don't bother to get this right. */
b7c12565
RS
2339 if (RE_TRANSLATE_P (translate) && start < 04400
2340 && !(start < 04200 && end >= 04377))
77d11aec 2341 {
b7c12565 2342 int newend;
77d11aec 2343 int tem;
b7c12565
RS
2344 newend = end;
2345 if (newend > 04377)
2346 newend = 04377;
2347 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2348 if (tem > 0)
2349 return tem;
2350
2351 start = 04400;
2352 if (end < 04400)
2353 return -1;
2354 }
2355#endif
2356
b7c12565
RS
2357 EXTEND_RANGE_TABLE (work_area, 2);
2358 work_area->table[work_area->used++] = (start);
2359 work_area->table[work_area->used++] = (end);
2360
2361 cmin = -1, cmax = -1;
77d11aec 2362
36595814 2363 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2364 {
2365 int ch;
77d11aec 2366
b7c12565
RS
2367 for (ch = start; ch <= end; ch++)
2368 {
2369 re_wchar_t c = TRANSLATE (ch);
2370 if (! (start <= c && c <= end))
2371 {
2372 if (cmin == -1)
2373 cmin = c, cmax = c;
2374 else
2375 {
2376 cmin = MIN (cmin, c);
2377 cmax = MAX (cmax, c);
2378 }
2379 }
2380 }
2381
2382 if (cmin != -1)
2383 {
2384 EXTEND_RANGE_TABLE (work_area, 2);
2385 work_area->table[work_area->used++] = (cmin);
2386 work_area->table[work_area->used++] = (cmax);
2387 }
2388 }
36595814 2389
77d11aec
RS
2390 return -1;
2391}
8f924df7 2392#endif /* 0 */
fa9a63c5
RM
2393\f
2394#ifndef MATCH_MAY_ALLOCATE
2395
2396/* If we cannot allocate large objects within re_match_2_internal,
2397 we make the fail stack and register vectors global.
2398 The fail stack, we grow to the maximum size when a regexp
2399 is compiled.
2400 The register vectors, we adjust in size each time we
2401 compile a regexp, according to the number of registers it needs. */
2402
2403static fail_stack_type fail_stack;
2404
2405/* Size with which the following vectors are currently allocated.
2406 That is so we can make them bigger as needed,
4bb91c68 2407 but never make them smaller. */
fa9a63c5
RM
2408static int regs_allocated_size;
2409
66f0296e
SM
2410static re_char ** regstart, ** regend;
2411static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2412
2413/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2414 but don't make them smaller. */
fa9a63c5
RM
2415
2416static
1dae0f0a 2417regex_grow_registers (int num_regs)
fa9a63c5
RM
2418{
2419 if (num_regs > regs_allocated_size)
2420 {
66f0296e
SM
2421 RETALLOC_IF (regstart, num_regs, re_char *);
2422 RETALLOC_IF (regend, num_regs, re_char *);
2423 RETALLOC_IF (best_regstart, num_regs, re_char *);
2424 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2425
2426 regs_allocated_size = num_regs;
2427 }
2428}
2429
2430#endif /* not MATCH_MAY_ALLOCATE */
2431\f
99633e97
SM
2432static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2433 compile_stack,
2434 regnum_t regnum));
2435
fa9a63c5
RM
2436/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2437 Returns one of error codes defined in `regex.h', or zero for success.
2438
2439 Assumes the `allocated' (and perhaps `buffer') and `translate'
2440 fields are set in BUFP on entry.
2441
2442 If it succeeds, results are put in BUFP (if it returns an error, the
2443 contents of BUFP are undefined):
2444 `buffer' is the compiled pattern;
2445 `syntax' is set to SYNTAX;
2446 `used' is set to the length of the compiled pattern;
2447 `fastmap_accurate' is zero;
2448 `re_nsub' is the number of subexpressions in PATTERN;
2449 `not_bol' and `not_eol' are zero;
5e69f11e 2450
c0f9ea08 2451 The `fastmap' field is neither examined nor set. */
fa9a63c5 2452
505bde11
SM
/* If an alternative is waiting for its closing jump (`fixup_alt_jump'
   is non-null), store a `jump' there that lands at the current end of
   the buffer, `b'.  The bytes for that jump have been reserved
   already.  */
#define FIXUP_ALT_JUMP()						\
do {									\
  if (fixup_alt_jump)							\
    STORE_JUMP (jump, fixup_alt_jump, b);				\
} while (0)


/* Leave the enclosing function with VALUE after releasing what it
   allocated: the range-table work area `range_table_work' and the
   storage of `compile_stack'.  Both must be in scope at the point of
   use.  */
#define FREE_STACK_RETURN(value)		\
  do {						\
    FREE_RANGE_TABLE_WORK_AREA (range_table_work);			\
    free (compile_stack.stack);			\
    return value;				\
  } while (0)
fa9a63c5
RM
2469
2470static reg_errcode_t
971de7fb 2471regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2472{
01618498
SM
2473 /* We fetch characters from PATTERN here. */
2474 register re_wchar_t c, c1;
5e69f11e 2475
fa9a63c5
RM
2476 /* Points to the end of the buffer, where we should append. */
2477 register unsigned char *b;
5e69f11e 2478
fa9a63c5
RM
2479 /* Keeps track of unclosed groups. */
2480 compile_stack_type compile_stack;
2481
2482 /* Points to the current (ending) position in the pattern. */
22336245
RS
2483#ifdef AIX
2484 /* `const' makes AIX compiler fail. */
66f0296e 2485 unsigned char *p = pattern;
22336245 2486#else
66f0296e 2487 re_char *p = pattern;
22336245 2488#endif
66f0296e 2489 re_char *pend = pattern + size;
5e69f11e 2490
fa9a63c5 2491 /* How to translate the characters in the pattern. */
6676cb1c 2492 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2493
2494 /* Address of the count-byte of the most recently inserted `exactn'
2495 command. This makes it possible to tell if a new exact-match
2496 character can be added to that command or if the character requires
2497 a new `exactn' command. */
2498 unsigned char *pending_exact = 0;
2499
2500 /* Address of start of the most recently finished expression.
2501 This tells, e.g., postfix * where to find the start of its
2502 operand. Reset at the beginning of groups and alternatives. */
2503 unsigned char *laststart = 0;
2504
2505 /* Address of beginning of regexp, or inside of last group. */
2506 unsigned char *begalt;
2507
2508 /* Place in the uncompiled pattern (i.e., the {) to
2509 which to go back if the interval is invalid. */
66f0296e 2510 re_char *beg_interval;
5e69f11e 2511
fa9a63c5 2512 /* Address of the place where a forward jump should go to the end of
7814e705 2513 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2514 last -- ends with a forward jump of this sort. */
2515 unsigned char *fixup_alt_jump = 0;
2516
b18215fc
RS
2517 /* Work area for range table of charset. */
2518 struct range_table_work_area range_table_work;
2519
2d1675e4
SM
2520 /* If the object matched can contain multibyte characters. */
2521 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2522
f9b0fd99
RS
2523 /* Nonzero if we have pushed down into a subpattern. */
2524 int in_subpattern = 0;
2525
2526 /* These hold the values of p, pattern, and pend from the main
2527 pattern when we have pushed into a subpattern. */
da053e48
PE
2528 re_char *main_p IF_LINT (= NULL);
2529 re_char *main_pattern IF_LINT (= NULL);
2530 re_char *main_pend IF_LINT (= NULL);
f9b0fd99 2531
fa9a63c5 2532#ifdef DEBUG
99633e97 2533 debug++;
fa9a63c5 2534 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2535 if (debug > 0)
fa9a63c5
RM
2536 {
2537 unsigned debug_count;
5e69f11e 2538
fa9a63c5 2539 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2540 putchar (pattern[debug_count]);
fa9a63c5
RM
2541 putchar ('\n');
2542 }
2543#endif /* DEBUG */
2544
2545 /* Initialize the compile stack. */
2546 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2547 if (compile_stack.stack == NULL)
2548 return REG_ESPACE;
2549
2550 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2551 compile_stack.avail = 0;
2552
b18215fc
RS
2553 range_table_work.table = 0;
2554 range_table_work.allocated = 0;
2555
fa9a63c5
RM
2556 /* Initialize the pattern buffer. */
2557 bufp->syntax = syntax;
2558 bufp->fastmap_accurate = 0;
2559 bufp->not_bol = bufp->not_eol = 0;
6224b623 2560 bufp->used_syntax = 0;
fa9a63c5
RM
2561
2562 /* Set `used' to zero, so that if we return an error, the pattern
2563 printer (for debugging) will think there's no pattern. We reset it
2564 at the end. */
2565 bufp->used = 0;
5e69f11e 2566
fa9a63c5 2567 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2568 bufp->re_nsub = 0;
fa9a63c5 2569
0b32bf0e 2570#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2571 /* Initialize the syntax table. */
2572 init_syntax_once ();
2573#endif
2574
2575 if (bufp->allocated == 0)
2576 {
2577 if (bufp->buffer)
2578 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2579 enough space. This loses if buffer's address is bogus, but
7814e705 2580 that is the user's responsibility. */
25fe55af
RS
2581 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2582 }
fa9a63c5 2583 else
7814e705 2584 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2585 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2586 }
fa9a63c5
RM
2587 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2588
2589 bufp->allocated = INIT_BUF_SIZE;
2590 }
2591
2592 begalt = b = bufp->buffer;
2593
2594 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2595 while (1)
fa9a63c5 2596 {
f9b0fd99
RS
2597 if (p == pend)
2598 {
2599 /* If this is the end of an included regexp,
2600 pop back to the main regexp and try again. */
2601 if (in_subpattern)
2602 {
2603 in_subpattern = 0;
2604 pattern = main_pattern;
2605 p = main_p;
2606 pend = main_pend;
2607 continue;
2608 }
2609 /* If this is the end of the main regexp, we are done. */
2610 break;
2611 }
2612
fa9a63c5
RM
2613 PATFETCH (c);
2614
2615 switch (c)
25fe55af 2616 {
f9b0fd99
RS
2617 case ' ':
2618 {
2619 re_char *p1 = p;
2620
2621 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2622 spaces normally. And don't try to do this recursively. */
2623 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2624 goto normal_char;
2625
2626 /* Peek past following spaces. */
2627 while (p1 != pend)
2628 {
2629 if (*p1 != ' ')
2630 break;
2631 p1++;
2632 }
2633 /* If the spaces are followed by a repetition op,
2634 treat them normally. */
c721eee5
RS
2635 if (p1 != pend
2636 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2637 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2638 goto normal_char;
2639
2640 /* Replace the spaces with the whitespace regexp. */
2641 in_subpattern = 1;
2642 main_p = p1;
2643 main_pend = pend;
2644 main_pattern = pattern;
2645 p = pattern = whitespace_regexp;
5b0534c8 2646 pend = p + strlen ((const char *) p);
f9b0fd99 2647 break;
7814e705 2648 }
f9b0fd99 2649
25fe55af
RS
2650 case '^':
2651 {
7814e705 2652 if ( /* If at start of pattern, it's an operator. */
25fe55af 2653 p == pattern + 1
7814e705 2654 /* If context independent, it's an operator. */
25fe55af 2655 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2656 /* Otherwise, depends on what's come before. */
25fe55af 2657 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2658 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2659 else
2660 goto normal_char;
2661 }
2662 break;
2663
2664
2665 case '$':
2666 {
2667 if ( /* If at end of pattern, it's an operator. */
2668 p == pend
7814e705 2669 /* If context independent, it's an operator. */
25fe55af
RS
2670 || syntax & RE_CONTEXT_INDEP_ANCHORS
2671 /* Otherwise, depends on what's next. */
2672 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2673 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2674 else
2675 goto normal_char;
2676 }
2677 break;
fa9a63c5
RM
2678
2679
2680 case '+':
25fe55af
RS
2681 case '?':
2682 if ((syntax & RE_BK_PLUS_QM)
2683 || (syntax & RE_LIMITED_OPS))
2684 goto normal_char;
2685 handle_plus:
2686 case '*':
2687 /* If there is no previous pattern... */
2688 if (!laststart)
2689 {
2690 if (syntax & RE_CONTEXT_INVALID_OPS)
2691 FREE_STACK_RETURN (REG_BADRPT);
2692 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2693 goto normal_char;
2694 }
2695
2696 {
7814e705 2697 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2698 boolean zero_times_ok = 0, many_times_ok = 0;
2699 boolean greedy = 1;
25fe55af
RS
2700
2701 /* If there is a sequence of repetition chars, collapse it
2702 down to just one (the right one). We can't combine
2703 interval operators with these because of, e.g., `a{2}*',
7814e705 2704 which should only match an even number of `a's. */
25fe55af
RS
2705
2706 for (;;)
2707 {
0b32bf0e 2708 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2709 && c == '?' && (zero_times_ok || many_times_ok))
2710 greedy = 0;
2711 else
2712 {
2713 zero_times_ok |= c != '+';
2714 many_times_ok |= c != '?';
2715 }
25fe55af
RS
2716
2717 if (p == pend)
2718 break;
ed0767d8
SM
2719 else if (*p == '*'
2720 || (!(syntax & RE_BK_PLUS_QM)
2721 && (*p == '+' || *p == '?')))
25fe55af 2722 ;
ed0767d8 2723 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2724 {
ed0767d8
SM
2725 if (p+1 == pend)
2726 FREE_STACK_RETURN (REG_EESCAPE);
2727 if (p[1] == '+' || p[1] == '?')
2728 PATFETCH (c); /* Gobble up the backslash. */
2729 else
2730 break;
25fe55af
RS
2731 }
2732 else
ed0767d8 2733 break;
25fe55af 2734 /* If we get here, we found another repeat character. */
ed0767d8
SM
2735 PATFETCH (c);
2736 }
25fe55af
RS
2737
2738 /* Star, etc. applied to an empty pattern is equivalent
2739 to an empty pattern. */
4e8a9132 2740 if (!laststart || laststart == b)
25fe55af
RS
2741 break;
2742
2743 /* Now we know whether or not zero matches is allowed
7814e705 2744 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2745 if (greedy)
2746 {
99633e97 2747 if (many_times_ok)
4e8a9132
SM
2748 {
2749 boolean simple = skip_one_char (laststart) == b;
d1dfb56c 2750 size_t startoffset = 0;
f6a3f532 2751 re_opcode_t ofj =
01618498 2752 /* Check if the loop can match the empty string. */
6df42991
SM
2753 (simple || !analyse_first (laststart, b, NULL, 0))
2754 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2755 assert (skip_one_char (laststart) <= b);
177c0ea7 2756
4e8a9132
SM
2757 if (!zero_times_ok && simple)
2758 { /* Since simple * loops can be made faster by using
2759 on_failure_keep_string_jump, we turn simple P+
2760 into PP* if P is simple. */
2761 unsigned char *p1, *p2;
2762 startoffset = b - laststart;
2763 GET_BUFFER_SPACE (startoffset);
2764 p1 = b; p2 = laststart;
2765 while (p2 < p1)
2766 *b++ = *p2++;
2767 zero_times_ok = 1;
99633e97 2768 }
4e8a9132
SM
2769
2770 GET_BUFFER_SPACE (6);
2771 if (!zero_times_ok)
2772 /* A + loop. */
f6a3f532 2773 STORE_JUMP (ofj, b, b + 6);
99633e97 2774 else
4e8a9132
SM
2775 /* Simple * loops can use on_failure_keep_string_jump
2776 depending on what follows. But since we don't know
2777 that yet, we leave the decision up to
2778 on_failure_jump_smart. */
f6a3f532 2779 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2780 laststart + startoffset, b + 6);
99633e97 2781 b += 3;
4e8a9132 2782 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2783 b += 3;
2784 }
2785 else
2786 {
4e8a9132
SM
2787 /* A simple ? pattern. */
2788 assert (zero_times_ok);
2789 GET_BUFFER_SPACE (3);
2790 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2791 b += 3;
2792 }
1c8c6d39
DL
2793 }
2794 else /* not greedy */
2795 { /* I wish the greedy and non-greedy cases could be merged. */
2796
0683b6fa 2797 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2798 if (many_times_ok)
2799 {
f6a3f532
SM
2800 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2801
6df42991
SM
2802 /* The non-greedy multiple match looks like
2803 a repeat..until: we only need a conditional jump
2804 at the end of the loop. */
f6a3f532
SM
2805 if (emptyp) BUF_PUSH (no_op);
2806 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2807 : on_failure_jump, b, laststart);
1c8c6d39
DL
2808 b += 3;
2809 if (zero_times_ok)
2810 {
2811 /* The repeat...until naturally matches one or more.
2812 To also match zero times, we need to first jump to
6df42991 2813 the end of the loop (its conditional jump). */
1c8c6d39
DL
2814 INSERT_JUMP (jump, laststart, b);
2815 b += 3;
2816 }
2817 }
2818 else
2819 {
2820 /* non-greedy a?? */
1c8c6d39
DL
2821 INSERT_JUMP (jump, laststart, b + 3);
2822 b += 3;
2823 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2824 b += 3;
2825 }
2826 }
2827 }
4e8a9132 2828 pending_exact = 0;
fa9a63c5
RM
2829 break;
2830
2831
2832 case '.':
25fe55af
RS
2833 laststart = b;
2834 BUF_PUSH (anychar);
2835 break;
fa9a63c5
RM
2836
2837
25fe55af
RS
2838 case '[':
2839 {
19ed5445
PE
2840 re_char *p1;
2841
b18215fc 2842 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2843
25fe55af 2844 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2845
25fe55af
RS
2846 /* Ensure that we have enough space to push a charset: the
2847 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2848 GET_BUFFER_SPACE (34);
2849
25fe55af 2850 laststart = b;
e318085a 2851
25fe55af 2852 /* We test `*p == '^' twice, instead of using an if
7814e705 2853 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2854 BUF_PUSH (*p == '^' ? charset_not : charset);
2855 if (*p == '^')
2856 p++;
e318085a 2857
25fe55af
RS
2858 /* Remember the first position in the bracket expression. */
2859 p1 = p;
e318085a 2860
7814e705 2861 /* Push the number of bytes in the bitmap. */
25fe55af 2862 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2863
25fe55af 2864 /* Clear the whole map. */
72af86bd 2865 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2866
25fe55af
RS
2867 /* charset_not matches newline according to a syntax bit. */
2868 if ((re_opcode_t) b[-2] == charset_not
2869 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2870 SET_LIST_BIT ('\n');
fa9a63c5 2871
7814e705 2872 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2873 for (;;)
2874 {
b18215fc 2875 boolean escaped_char = false;
2d1675e4 2876 const unsigned char *p2 = p;
abbd1bcf 2877 re_wchar_t ch;
e318085a 2878
25fe55af 2879 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2880
36595814
SM
2881 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2882 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2883 So the translation is done later in a loop. Example:
2884 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2885 PATFETCH (c);
e318085a 2886
25fe55af
RS
2887 /* \ might escape characters inside [...] and [^...]. */
2888 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2889 {
2890 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2891
2892 PATFETCH (c);
b18215fc 2893 escaped_char = true;
25fe55af 2894 }
b18215fc
RS
2895 else
2896 {
7814e705 2897 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2898 not (i.e., when the bracket expression is `[]' so
2899 far), the ']' character bit gets set way below. */
2d1675e4 2900 if (c == ']' && p2 != p1)
657fcfbd 2901 break;
25fe55af 2902 }
b18215fc 2903
25fe55af
RS
2904 /* See if we're at the beginning of a possible character
2905 class. */
b18215fc 2906
2d1675e4
SM
2907 if (!escaped_char &&
2908 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2909 {
7814e705 2910 /* Leave room for the null. */
14473664 2911 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2912 const unsigned char *class_beg;
b18215fc 2913
25fe55af
RS
2914 PATFETCH (c);
2915 c1 = 0;
ed0767d8 2916 class_beg = p;
b18215fc 2917
25fe55af
RS
2918 /* If pattern is `[[:'. */
2919 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2920
25fe55af
RS
2921 for (;;)
2922 {
14473664
SM
2923 PATFETCH (c);
2924 if ((c == ':' && *p == ']') || p == pend)
2925 break;
2926 if (c1 < CHAR_CLASS_MAX_LENGTH)
2927 str[c1++] = c;
2928 else
2929 /* This is in any case an invalid class name. */
2930 str[0] = '\0';
25fe55af
RS
2931 }
2932 str[c1] = '\0';
b18215fc
RS
2933
2934 /* If it isn't a word bracketed by `[:' and `:]':
2935 undo the ending character, the letters, and
2936 leave the leading `:' and `[' (but set bits for
2937 them). */
25fe55af
RS
2938 if (c == ':' && *p == ']')
2939 {
abbd1bcf 2940 re_wctype_t cc = re_wctype (str);
14473664
SM
2941
2942 if (cc == 0)
fa9a63c5
RM
2943 FREE_STACK_RETURN (REG_ECTYPE);
2944
14473664
SM
2945 /* Throw away the ] at the end of the character
2946 class. */
2947 PATFETCH (c);
fa9a63c5 2948
14473664 2949 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2950
cf9c99bc
KH
2951#ifndef emacs
2952 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
2953 if (re_iswctype (btowc (ch), cc))
2954 {
2955 c = TRANSLATE (ch);
ed00c2ac
KH
2956 if (c < (1 << BYTEWIDTH))
2957 SET_LIST_BIT (c);
8f924df7 2958 }
cf9c99bc
KH
2959#else /* emacs */
2960 /* Most character classes in a multibyte match
2961 just set a flag. Exceptions are is_blank,
2962 is_digit, is_cntrl, and is_xdigit, since
2963 they can only match ASCII characters. We
2964 don't need to handle them for multibyte.
2965 They are distinguished by a negative wctype. */
96cc36cc 2966
254c06a8
SM
2967 /* Setup the gl_state object to its buffer-defined
2968 value. This hardcodes the buffer-global
2969 syntax-table for ASCII chars, while the other chars
2970 will obey syntax-table properties. It's not ideal,
2971 but it's the way it's been done until now. */
d48cd3f4 2972 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 2973
cf9c99bc 2974 for (ch = 0; ch < 256; ++ch)
25fe55af 2975 {
cf9c99bc
KH
2976 c = RE_CHAR_TO_MULTIBYTE (ch);
2977 if (! CHAR_BYTE8_P (c)
2978 && re_iswctype (c, cc))
8f924df7 2979 {
cf9c99bc
KH
2980 SET_LIST_BIT (ch);
2981 c1 = TRANSLATE (c);
2982 if (c1 == c)
2983 continue;
2984 if (ASCII_CHAR_P (c1))
2985 SET_LIST_BIT (c1);
2986 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2987 SET_LIST_BIT (c1);
8f924df7 2988 }
25fe55af 2989 }
cf9c99bc
KH
2990 SET_RANGE_TABLE_WORK_AREA_BIT
2991 (range_table_work, re_wctype_to_bit (cc));
2992#endif /* emacs */
6224b623
SM
2993 /* In most cases the matching rule for char classes
2994 only uses the syntax table for multibyte chars,
2995 so that the content of the syntax-table is not
2996 hardcoded in the range_table. SPACE and WORD are
2997 the two exceptions. */
2998 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2999 bufp->used_syntax = 1;
3000
b18215fc
RS
3001 /* Repeat the loop. */
3002 continue;
25fe55af
RS
3003 }
3004 else
3005 {
ed0767d8
SM
3006 /* Go back to right after the "[:". */
3007 p = class_beg;
25fe55af 3008 SET_LIST_BIT ('[');
b18215fc
RS
3009
3010 /* Because the `:' may start a range, we
3011 can't simply set bit and repeat the loop.
7814e705 3012 Instead, just set it to C and handle below. */
b18215fc 3013 c = ':';
25fe55af
RS
3014 }
3015 }
b18215fc
RS
3016
3017 if (p < pend && p[0] == '-' && p[1] != ']')
3018 {
3019
3020 /* Discard the `-'. */
3021 PATFETCH (c1);
3022
3023 /* Fetch the character which ends the range. */
3024 PATFETCH (c1);
cf9c99bc
KH
3025#ifdef emacs
3026 if (CHAR_BYTE8_P (c1)
3027 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3028 /* Treat the range from a multibyte character to
3029 raw-byte character as empty. */
3030 c = c1 + 1;
3031#endif /* emacs */
e318085a 3032 }
25fe55af 3033 else
b18215fc
RS
3034 /* Range from C to C. */
3035 c1 = c;
3036
cf9c99bc 3037 if (c > c1)
25fe55af 3038 {
cf9c99bc
KH
3039 if (syntax & RE_NO_EMPTY_RANGES)
3040 FREE_STACK_RETURN (REG_ERANGEX);
3041 /* Else, repeat the loop. */
bf216479 3042 }
6fdd04b0 3043 else
25fe55af 3044 {
cf9c99bc
KH
3045#ifndef emacs
3046 /* Set the range into bitmap */
8f924df7 3047 for (; c <= c1; c++)
b18215fc 3048 {
cf9c99bc
KH
3049 ch = TRANSLATE (c);
3050 if (ch < (1 << BYTEWIDTH))
3051 SET_LIST_BIT (ch);
3052 }
3053#else /* emacs */
3054 if (c < 128)
3055 {
3056 ch = MIN (127, c1);
3057 SETUP_ASCII_RANGE (range_table_work, c, ch);
3058 c = ch + 1;
3059 if (CHAR_BYTE8_P (c1))
3060 c = BYTE8_TO_CHAR (128);
3061 }
3062 if (c <= c1)
3063 {
3064 if (CHAR_BYTE8_P (c))
3065 {
3066 c = CHAR_TO_BYTE8 (c);
3067 c1 = CHAR_TO_BYTE8 (c1);
3068 for (; c <= c1; c++)
3069 SET_LIST_BIT (c);
3070 }
3071 else if (multibyte)
3072 {
3073 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3074 }
3075 else
3076 {
3077 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3078 }
e934739e 3079 }
cf9c99bc 3080#endif /* emacs */
25fe55af 3081 }
e318085a
RS
3082 }
3083
25fe55af 3084 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3085 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3086 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3087 b[-1]--;
3088 b += b[-1];
fa9a63c5 3089
96cc36cc
RS
3090 /* Build real range table from work area. */
3091 if (RANGE_TABLE_WORK_USED (range_table_work)
3092 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3093 {
3094 int i;
3095 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3096
b18215fc 3097 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3098 bytes for flags, two for COUNT, and three bytes for
3099 each character. */
3100 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3101
b18215fc
RS
3102 /* Indicate the existence of range table. */
3103 laststart[1] |= 0x80;
fa9a63c5 3104
96cc36cc
RS
3105 /* Store the character class flag bits into the range table.
3106 If not in emacs, these flag bits are always 0. */
3107 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3108 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3109
b18215fc
RS
3110 STORE_NUMBER_AND_INCR (b, used / 2);
3111 for (i = 0; i < used; i++)
3112 STORE_CHARACTER_AND_INCR
3113 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3114 }
25fe55af
RS
3115 }
3116 break;
fa9a63c5
RM
3117
3118
b18215fc 3119 case '(':
25fe55af
RS
3120 if (syntax & RE_NO_BK_PARENS)
3121 goto handle_open;
3122 else
3123 goto normal_char;
fa9a63c5
RM
3124
3125
25fe55af
RS
3126 case ')':
3127 if (syntax & RE_NO_BK_PARENS)
3128 goto handle_close;
3129 else
3130 goto normal_char;
e318085a
RS
3131
3132
25fe55af
RS
3133 case '\n':
3134 if (syntax & RE_NEWLINE_ALT)
3135 goto handle_alt;
3136 else
3137 goto normal_char;
e318085a
RS
3138
3139
b18215fc 3140 case '|':
25fe55af
RS
3141 if (syntax & RE_NO_BK_VBAR)
3142 goto handle_alt;
3143 else
3144 goto normal_char;
3145
3146
3147 case '{':
3148 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3149 goto handle_interval;
3150 else
3151 goto normal_char;
3152
3153
3154 case '\\':
3155 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3156
3157 /* Do not translate the character after the \, so that we can
3158 distinguish, e.g., \B from \b, even if we normally would
3159 translate, e.g., B to b. */
36595814 3160 PATFETCH (c);
25fe55af
RS
3161
3162 switch (c)
3163 {
3164 case '(':
3165 if (syntax & RE_NO_BK_PARENS)
3166 goto normal_backslash;
3167
3168 handle_open:
505bde11
SM
3169 {
3170 int shy = 0;
c69b0314 3171 regnum_t regnum = 0;
505bde11
SM
3172 if (p+1 < pend)
3173 {
3174 /* Look for a special (?...) construct */
ed0767d8 3175 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3176 {
ed0767d8 3177 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3178 while (!shy)
505bde11 3179 {
c69b0314
SM
3180 PATFETCH (c);
3181 switch (c)
3182 {
3183 case ':': shy = 1; break;
3184 case '0':
3185 /* An explicitly specified regnum must start
3186 with non-0. */
3187 if (regnum == 0)
3188 FREE_STACK_RETURN (REG_BADPAT);
3189 case '1': case '2': case '3': case '4':
3190 case '5': case '6': case '7': case '8': case '9':
3191 regnum = 10*regnum + (c - '0'); break;
3192 default:
3193 /* Only (?:...) is supported right now. */
3194 FREE_STACK_RETURN (REG_BADPAT);
3195 }
505bde11
SM
3196 }
3197 }
505bde11
SM
3198 }
3199
3200 if (!shy)
c69b0314
SM
3201 regnum = ++bufp->re_nsub;
3202 else if (regnum)
3203 { /* It's actually not shy, but explicitly numbered. */
3204 shy = 0;
3205 if (regnum > bufp->re_nsub)
3206 bufp->re_nsub = regnum;
3207 else if (regnum > bufp->re_nsub
3208 /* Ideally, we'd want to check that the specified
3209 group can't have matched (i.e. all subgroups
3210 using the same regnum are in other branches of
3211 OR patterns), but we don't currently keep track
3212 of enough info to do that easily. */
3213 || group_in_compile_stack (compile_stack, regnum))
3214 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3215 }
c69b0314
SM
3216 else
3217 /* It's really shy. */
3218 regnum = - bufp->re_nsub;
25fe55af 3219
99633e97
SM
3220 if (COMPILE_STACK_FULL)
3221 {
3222 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3223 compile_stack_elt_t);
3224 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3225
99633e97
SM
3226 compile_stack.size <<= 1;
3227 }
25fe55af 3228
99633e97 3229 /* These are the values to restore when we hit end of this
7814e705 3230 group. They are all relative offsets, so that if the
99633e97
SM
3231 whole pattern moves because of realloc, they will still
3232 be valid. */
3233 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3234 COMPILE_STACK_TOP.fixup_alt_jump
3235 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3236 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3237 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3238
c69b0314
SM
3239 /* Do not push a start_memory for groups beyond the last one
3240 we can represent in the compiled pattern. */
3241 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3242 BUF_PUSH_2 (start_memory, regnum);
3243
3244 compile_stack.avail++;
3245
3246 fixup_alt_jump = 0;
3247 laststart = 0;
3248 begalt = b;
3249 /* If we've reached MAX_REGNUM groups, then this open
3250 won't actually generate any code, so we'll have to
3251 clear pending_exact explicitly. */
3252 pending_exact = 0;
3253 break;
505bde11 3254 }
25fe55af
RS
3255
3256 case ')':
3257 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3258
3259 if (COMPILE_STACK_EMPTY)
505bde11
SM
3260 {
3261 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3262 goto normal_backslash;
3263 else
3264 FREE_STACK_RETURN (REG_ERPAREN);
3265 }
25fe55af
RS
3266
3267 handle_close:
505bde11 3268 FIXUP_ALT_JUMP ();
25fe55af
RS
3269
3270 /* See similar code for backslashed left paren above. */
3271 if (COMPILE_STACK_EMPTY)
505bde11
SM
3272 {
3273 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3274 goto normal_char;
3275 else
3276 FREE_STACK_RETURN (REG_ERPAREN);
3277 }
25fe55af
RS
3278
3279 /* Since we just checked for an empty stack above, this
3280 ``can't happen''. */
3281 assert (compile_stack.avail != 0);
3282 {
3283 /* We don't just want to restore into `regnum', because
3284 later groups should continue to be numbered higher,
7814e705 3285 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3286 regnum_t regnum;
25fe55af
RS
3287
3288 compile_stack.avail--;
3289 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3290 fixup_alt_jump
3291 = COMPILE_STACK_TOP.fixup_alt_jump
3292 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3293 : 0;
3294 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3295 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3296 /* If we've reached MAX_REGNUM groups, then this open
3297 won't actually generate any code, so we'll have to
3298 clear pending_exact explicitly. */
3299 pending_exact = 0;
e318085a 3300
25fe55af 3301 /* We're at the end of the group, so now we know how many
7814e705 3302 groups were inside this one. */
c69b0314
SM
3303 if (regnum <= MAX_REGNUM && regnum > 0)
3304 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3305 }
3306 break;
3307
3308
3309 case '|': /* `\|'. */
3310 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3311 goto normal_backslash;
3312 handle_alt:
3313 if (syntax & RE_LIMITED_OPS)
3314 goto normal_char;
3315
3316 /* Insert before the previous alternative a jump which
7814e705 3317 jumps to this alternative if the former fails. */
25fe55af
RS
3318 GET_BUFFER_SPACE (3);
3319 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3320 pending_exact = 0;
3321 b += 3;
3322
3323 /* The alternative before this one has a jump after it
3324 which gets executed if it gets matched. Adjust that
3325 jump so it will jump to this alternative's analogous
3326 jump (put in below, which in turn will jump to the next
3327 (if any) alternative's such jump, etc.). The last such
3328 jump jumps to the correct final destination. A picture:
3329 _____ _____
3330 | | | |
3331 | v | v
d1dfb56c 3332 a | b | c
25fe55af
RS
3333
3334 If we are at `b', then fixup_alt_jump right now points to a
3335 three-byte space after `a'. We'll put in the jump, set
3336 fixup_alt_jump to right after `b', and leave behind three
3337 bytes which we'll fill in when we get to after `c'. */
3338
505bde11 3339 FIXUP_ALT_JUMP ();
25fe55af
RS
3340
3341 /* Mark and leave space for a jump after this alternative,
3342 to be filled in later either by next alternative or
3343 when we know we're at the end of a series of alternatives. */
3344 fixup_alt_jump = b;
3345 GET_BUFFER_SPACE (3);
3346 b += 3;
3347
3348 laststart = 0;
3349 begalt = b;
3350 break;
3351
3352
3353 case '{':
3354 /* If \{ is a literal. */
3355 if (!(syntax & RE_INTERVALS)
3356 /* If we're at `\{' and it's not the open-interval
3357 operator. */
4bb91c68 3358 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3359 goto normal_backslash;
3360
3361 handle_interval:
3362 {
3363 /* If got here, then the syntax allows intervals. */
3364
3365 /* At least (most) this many matches must be made. */
99633e97 3366 int lower_bound = 0, upper_bound = -1;
25fe55af 3367
ed0767d8 3368 beg_interval = p;
25fe55af 3369
25fe55af
RS
3370 GET_UNSIGNED_NUMBER (lower_bound);
3371
3372 if (c == ',')
ed0767d8 3373 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3374 else
3375 /* Interval such as `{1}' => match exactly once. */
3376 upper_bound = lower_bound;
3377
3378 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3379 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3380 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3381
3382 if (!(syntax & RE_NO_BK_BRACES))
3383 {
4bb91c68
SM
3384 if (c != '\\')
3385 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3386 if (p == pend)
3387 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3388 PATFETCH (c);
3389 }
3390
3391 if (c != '}')
4bb91c68 3392 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3393
3394 /* We just parsed a valid interval. */
3395
3396 /* If it's invalid to have no preceding re. */
3397 if (!laststart)
3398 {
3399 if (syntax & RE_CONTEXT_INVALID_OPS)
3400 FREE_STACK_RETURN (REG_BADRPT);
3401 else if (syntax & RE_CONTEXT_INDEP_OPS)
3402 laststart = b;
3403 else
3404 goto unfetch_interval;
3405 }
3406
6df42991
SM
3407 if (upper_bound == 0)
3408 /* If the upper bound is zero, just drop the sub pattern
3409 altogether. */
3410 b = laststart;
3411 else if (lower_bound == 1 && upper_bound == 1)
3412 /* Just match it once: nothing to do here. */
3413 ;
3414
3415 /* Otherwise, we have a nontrivial interval. When
3416 we're all done, the pattern will look like:
3417 set_number_at <jump count> <upper bound>
3418 set_number_at <succeed_n count> <lower bound>
3419 succeed_n <after jump addr> <succeed_n count>
3420 <body of loop>
3421 jump_n <succeed_n addr> <jump count>
3422 (The upper bound and `jump_n' are omitted if
3423 `upper_bound' is 1, though.) */
3424 else
3425 { /* If the upper bound is > 1, we need to insert
3426 more at the end of the loop. */
3427 unsigned int nbytes = (upper_bound < 0 ? 3
3428 : upper_bound > 1 ? 5 : 0);
3429 unsigned int startoffset = 0;
3430
3431 GET_BUFFER_SPACE (20); /* We might use less. */
3432
3433 if (lower_bound == 0)
3434 {
3435 /* A succeed_n that starts with 0 is really a
3436 simple on_failure_jump_loop. */
3437 INSERT_JUMP (on_failure_jump_loop, laststart,
3438 b + 3 + nbytes);
3439 b += 3;
3440 }
3441 else
3442 {
3443 /* Initialize lower bound of the `succeed_n', even
3444 though it will be set during matching by its
3445 attendant `set_number_at' (inserted next),
3446 because `re_compile_fastmap' needs to know.
3447 Jump to the `jump_n' we might insert below. */
3448 INSERT_JUMP2 (succeed_n, laststart,
3449 b + 5 + nbytes,
3450 lower_bound);
3451 b += 5;
3452
3453 /* Code to initialize the lower bound. Insert
7814e705 3454 before the `succeed_n'. The `5' is the last two
6df42991
SM
3455 bytes of this `set_number_at', plus 3 bytes of
3456 the following `succeed_n'. */
3457 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3458 b += 5;
3459 startoffset += 5;
3460 }
3461
3462 if (upper_bound < 0)
3463 {
3464 /* A negative upper bound stands for infinity,
3465 in which case it degenerates to a plain jump. */
3466 STORE_JUMP (jump, b, laststart + startoffset);
3467 b += 3;
3468 }
3469 else if (upper_bound > 1)
3470 { /* More than one repetition is allowed, so
3471 append a backward jump to the `succeed_n'
3472 that starts this interval.
3473
3474 When we've reached this during matching,
3475 we'll have matched the interval once, so
3476 jump back only `upper_bound - 1' times. */
3477 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3478 upper_bound - 1);
3479 b += 5;
3480
3481 /* The location we want to set is the second
3482 parameter of the `jump_n'; that is `b-2' as
3483 an absolute address. `laststart' will be
3484 the `set_number_at' we're about to insert;
3485 `laststart+3' the number to set, the source
3486 for the relative address. But we are
3487 inserting into the middle of the pattern --
3488 so everything is getting moved up by 5.
3489 Conclusion: (b - 2) - (laststart + 3) + 5,
3490 i.e., b - laststart.
3491
3492 We insert this at the beginning of the loop
3493 so that if we fail during matching, we'll
3494 reinitialize the bounds. */
3495 insert_op2 (set_number_at, laststart, b - laststart,
3496 upper_bound - 1, b);
3497 b += 5;
3498 }
3499 }
25fe55af
RS
3500 pending_exact = 0;
3501 beg_interval = NULL;
3502 }
3503 break;
3504
3505 unfetch_interval:
3506 /* If an invalid interval, match the characters as literals. */
3507 assert (beg_interval);
3508 p = beg_interval;
3509 beg_interval = NULL;
3510
3511 /* normal_char and normal_backslash need `c'. */
ed0767d8 3512 c = '{';
25fe55af
RS
3513
3514 if (!(syntax & RE_NO_BK_BRACES))
3515 {
ed0767d8
SM
3516 assert (p > pattern && p[-1] == '\\');
3517 goto normal_backslash;
25fe55af 3518 }
ed0767d8
SM
3519 else
3520 goto normal_char;
e318085a 3521
b18215fc 3522#ifdef emacs
25fe55af 3523 /* There is no way to specify the before_dot and after_dot
7814e705 3524 operators. rms says this is ok. --karl */
25fe55af
RS
3525 case '=':
3526 BUF_PUSH (at_dot);
3527 break;
3528
3529 case 's':
3530 laststart = b;
3531 PATFETCH (c);
3532 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3533 break;
3534
3535 case 'S':
3536 laststart = b;
3537 PATFETCH (c);
3538 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3539 break;
b18215fc
RS
3540
3541 case 'c':
3542 laststart = b;
36595814 3543 PATFETCH (c);
b18215fc
RS
3544 BUF_PUSH_2 (categoryspec, c);
3545 break;
e318085a 3546
b18215fc
RS
3547 case 'C':
3548 laststart = b;
36595814 3549 PATFETCH (c);
b18215fc
RS
3550 BUF_PUSH_2 (notcategoryspec, c);
3551 break;
3552#endif /* emacs */
e318085a 3553
e318085a 3554
25fe55af 3555 case 'w':
4bb91c68
SM
3556 if (syntax & RE_NO_GNU_OPS)
3557 goto normal_char;
25fe55af 3558 laststart = b;
1fb352e0 3559 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3560 break;
e318085a 3561
e318085a 3562
25fe55af 3563 case 'W':
4bb91c68
SM
3564 if (syntax & RE_NO_GNU_OPS)
3565 goto normal_char;
25fe55af 3566 laststart = b;
1fb352e0 3567 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3568 break;
e318085a
RS
3569
3570
25fe55af 3571 case '<':
4bb91c68
SM
3572 if (syntax & RE_NO_GNU_OPS)
3573 goto normal_char;
25fe55af
RS
3574 BUF_PUSH (wordbeg);
3575 break;
e318085a 3576
25fe55af 3577 case '>':
4bb91c68
SM
3578 if (syntax & RE_NO_GNU_OPS)
3579 goto normal_char;
25fe55af
RS
3580 BUF_PUSH (wordend);
3581 break;
e318085a 3582
669fa600
SM
3583 case '_':
3584 if (syntax & RE_NO_GNU_OPS)
3585 goto normal_char;
3586 laststart = b;
3587 PATFETCH (c);
3588 if (c == '<')
3589 BUF_PUSH (symbeg);
3590 else if (c == '>')
3591 BUF_PUSH (symend);
3592 else
3593 FREE_STACK_RETURN (REG_BADPAT);
3594 break;
3595
25fe55af 3596 case 'b':
4bb91c68
SM
3597 if (syntax & RE_NO_GNU_OPS)
3598 goto normal_char;
25fe55af
RS
3599 BUF_PUSH (wordbound);
3600 break;
e318085a 3601
25fe55af 3602 case 'B':
4bb91c68
SM
3603 if (syntax & RE_NO_GNU_OPS)
3604 goto normal_char;
25fe55af
RS
3605 BUF_PUSH (notwordbound);
3606 break;
fa9a63c5 3607
25fe55af 3608 case '`':
4bb91c68
SM
3609 if (syntax & RE_NO_GNU_OPS)
3610 goto normal_char;
25fe55af
RS
3611 BUF_PUSH (begbuf);
3612 break;
e318085a 3613
25fe55af 3614 case '\'':
4bb91c68
SM
3615 if (syntax & RE_NO_GNU_OPS)
3616 goto normal_char;
25fe55af
RS
3617 BUF_PUSH (endbuf);
3618 break;
e318085a 3619
25fe55af
RS
3620 case '1': case '2': case '3': case '4': case '5':
3621 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3622 {
3623 regnum_t reg;
e318085a 3624
0cdd06f8
SM
3625 if (syntax & RE_NO_BK_REFS)
3626 goto normal_backslash;
e318085a 3627
0cdd06f8 3628 reg = c - '0';
e318085a 3629
c69b0314
SM
3630 if (reg > bufp->re_nsub || reg < 1
3631 /* Can't back reference to a subexp before its end. */
3632 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3633 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3634
0cdd06f8
SM
3635 laststart = b;
3636 BUF_PUSH_2 (duplicate, reg);
3637 }
25fe55af 3638 break;
e318085a 3639
e318085a 3640
25fe55af
RS
3641 case '+':
3642 case '?':
3643 if (syntax & RE_BK_PLUS_QM)
3644 goto handle_plus;
3645 else
3646 goto normal_backslash;
3647
3648 default:
3649 normal_backslash:
3650 /* You might think it would be useful for \ to mean
3651 not to translate; but if we don't translate it
4bb91c68 3652 it will never match anything. */
25fe55af
RS
3653 goto normal_char;
3654 }
3655 break;
fa9a63c5
RM
3656
3657
3658 default:
25fe55af 3659 /* Expects the character in `c'. */
fa9a63c5 3660 normal_char:
36595814 3661 /* If no exactn currently being built. */
25fe55af 3662 if (!pending_exact
fa9a63c5 3663
25fe55af
RS
3664 /* If last exactn not at current position. */
3665 || pending_exact + *pending_exact + 1 != b
5e69f11e 3666
25fe55af 3667 /* We have only one byte following the exactn for the count. */
2d1675e4 3668 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3669
7814e705 3670 /* If followed by a repetition operator. */
9d99031f 3671 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3672 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3673 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3674 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3675 || ((syntax & RE_INTERVALS)
25fe55af 3676 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3677 ? p != pend && *p == '{'
3678 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3679 {
3680 /* Start building a new exactn. */
5e69f11e 3681
25fe55af 3682 laststart = b;
fa9a63c5
RM
3683
3684 BUF_PUSH_2 (exactn, 0);
3685 pending_exact = b - 1;
25fe55af 3686 }
5e69f11e 3687
2d1675e4
SM
3688 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3689 {
e0277a47
KH
3690 int len;
3691
cf9c99bc 3692 if (multibyte)
6fdd04b0 3693 {
cf9c99bc 3694 c = TRANSLATE (c);
6fdd04b0
KH
3695 len = CHAR_STRING (c, b);
3696 b += len;
3697 }
e0277a47 3698 else
6fdd04b0 3699 {
cf9c99bc
KH
3700 c1 = RE_CHAR_TO_MULTIBYTE (c);
3701 if (! CHAR_BYTE8_P (c1))
3702 {
3703 re_wchar_t c2 = TRANSLATE (c1);
3704
3705 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3706 c = c1;
409f2919 3707 }
6fdd04b0
KH
3708 *b++ = c;
3709 len = 1;
3710 }
2d1675e4
SM
3711 (*pending_exact) += len;
3712 }
3713
fa9a63c5 3714 break;
25fe55af 3715 } /* switch (c) */
fa9a63c5
RM
3716 } /* while p != pend */
3717
5e69f11e 3718
fa9a63c5 3719 /* Through the pattern now. */
5e69f11e 3720
505bde11 3721 FIXUP_ALT_JUMP ();
fa9a63c5 3722
5e69f11e 3723 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3724 FREE_STACK_RETURN (REG_EPAREN);
3725
3726 /* If we don't want backtracking, force success
3727 the first time we reach the end of the compiled pattern. */
3728 if (syntax & RE_NO_POSIX_BACKTRACKING)
3729 BUF_PUSH (succeed);
3730
fa9a63c5
RM
3731 /* We have succeeded; set the length of the buffer. */
3732 bufp->used = b - bufp->buffer;
3733
3734#ifdef DEBUG
99633e97 3735 if (debug > 0)
fa9a63c5 3736 {
505bde11 3737 re_compile_fastmap (bufp);
fa9a63c5
RM
3738 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3739 print_compiled_pattern (bufp);
3740 }
99633e97 3741 debug--;
fa9a63c5
RM
3742#endif /* DEBUG */
3743
3744#ifndef MATCH_MAY_ALLOCATE
3745 /* Initialize the failure stack to the largest possible stack. This
3746 isn't necessary unless we're trying to avoid calling alloca in
3747 the search and match routines. */
3748 {
3749 int num_regs = bufp->re_nsub + 1;
3750
320a2a73 3751 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3752 {
a26f4ccd 3753 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3754
fa9a63c5
RM
3755 if (! fail_stack.stack)
3756 fail_stack.stack
5e69f11e 3757 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3758 * sizeof (fail_stack_elt_t));
3759 else
3760 fail_stack.stack
3761 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3762 (fail_stack.size
3763 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3764 }
3765
3766 regex_grow_registers (num_regs);
3767 }
3768#endif /* not MATCH_MAY_ALLOCATE */
3769
839966f3 3770 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3771} /* regex_compile */
3772\f
3773/* Subroutines for `regex_compile'. */
3774
7814e705 3775/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3776
3777static void
971de7fb 3778store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3779{
3780 *loc = (unsigned char) op;
3781 STORE_NUMBER (loc + 1, arg);
3782}
3783
3784
3785/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3786
3787static void
971de7fb 3788store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3789{
3790 *loc = (unsigned char) op;
3791 STORE_NUMBER (loc + 1, arg1);
3792 STORE_NUMBER (loc + 3, arg2);
3793}
3794
3795
3796/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3797 for OP followed by two-byte integer parameter ARG. */
3798
3799static void
971de7fb 3800insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3801{
3802 register unsigned char *pfrom = end;
3803 register unsigned char *pto = end + 3;
3804
3805 while (pfrom != loc)
3806 *--pto = *--pfrom;
5e69f11e 3807
fa9a63c5
RM
3808 store_op1 (op, loc, arg);
3809}
3810
3811
3812/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3813
3814static void
971de7fb 3815insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3816{
3817 register unsigned char *pfrom = end;
3818 register unsigned char *pto = end + 5;
3819
3820 while (pfrom != loc)
3821 *--pto = *--pfrom;
5e69f11e 3822
fa9a63c5
RM
3823 store_op2 (op, loc, arg1, arg2);
3824}
3825
3826
3827/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3828 after an alternative or a begin-subexpression. We assume there is at
3829 least one character before the ^. */
3830
3831static boolean
971de7fb 3832at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3833{
01618498 3834 re_char *prev = p - 2;
fa9a63c5 3835 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3836
fa9a63c5
RM
3837 return
3838 /* After a subexpression? */
3839 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3840 /* After an alternative? */
d2af47df
SM
3841 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3842 /* After a shy subexpression? */
3843 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3844 && prev[-1] == '?' && prev[-2] == '('
3845 && (syntax & RE_NO_BK_PARENS
3846 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3847}
3848
3849
3850/* The dual of at_begline_loc_p. This one is for $. We assume there is
3851 at least one character after the $, i.e., `P < PEND'. */
3852
3853static boolean
971de7fb 3854at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3855{
01618498 3856 re_char *next = p;
fa9a63c5 3857 boolean next_backslash = *next == '\\';
01618498 3858 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3859
fa9a63c5
RM
3860 return
3861 /* Before a subexpression? */
3862 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3863 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3864 /* Before an alternative? */
3865 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3866 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3867}
3868
3869
5e69f11e 3870/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3871 false if it's not. */
3872
3873static boolean
971de7fb 3874group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5 3875{
d1dfb56c 3876 ssize_t this_element;
fa9a63c5 3877
5e69f11e
RM
3878 for (this_element = compile_stack.avail - 1;
3879 this_element >= 0;
fa9a63c5
RM
3880 this_element--)
3881 if (compile_stack.stack[this_element].regnum == regnum)
3882 return true;
3883
3884 return false;
3885}
fa9a63c5 3886\f
f6a3f532
SM
3887/* analyse_first.
3888 If fastmap is non-NULL, go through the pattern and fill fastmap
3889 with all the possible leading chars. If fastmap is NULL, don't
3890 bother filling it up (obviously) and only return whether the
3891 pattern could potentially match the empty string.
3892
3893 Return 1 if p..pend might match the empty string.
3894 Return 0 if p..pend matches at least one char.
01618498 3895 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3896
3897static int
438105ed 3898analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
fa9a63c5 3899{
505bde11 3900 int j, k;
1fb352e0 3901 boolean not;
fa9a63c5 3902
b18215fc 3903 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3904 flag is set true. */
b18215fc
RS
3905 boolean match_any_multibyte_characters = false;
3906
f6a3f532 3907 assert (p);
5e69f11e 3908
505bde11
SM
3909 /* The loop below works as follows:
3910 - It has a working-list kept in the PATTERN_STACK and which basically
3911 starts by only containing a pointer to the first operation.
3912 - If the opcode we're looking at is a match against some set of
3913 chars, then we add those chars to the fastmap and go on to the
3914 next work element from the worklist (done via `break').
3915 - If the opcode is a control operator on the other hand, we either
3916 ignore it (if it's meaningless at this point, such as `start_memory')
3917 or execute it (if it's a jump). If the jump has several destinations
3918 (i.e. `on_failure_jump'), then we push the other destination onto the
3919 worklist.
3920 We guarantee termination by ignoring backward jumps (more or less),
3921 so that `p' is monotonically increasing. More to the point, we
3922 never set `p' (or push) anything `<= p1'. */
3923
01618498 3924 while (p < pend)
fa9a63c5 3925 {
505bde11
SM
3926 /* `p1' is used as a marker of how far back a `on_failure_jump'
3927 can go without being ignored. It is normally equal to `p'
3928 (which prevents any backward `on_failure_jump') except right
3929 after a plain `jump', to allow patterns such as:
3930 0: jump 10
3931 3..9: <body>
3932 10: on_failure_jump 3
3933 as used for the *? operator. */
01618498 3934 re_char *p1 = p;
5e69f11e 3935
fa9a63c5
RM
3936 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3937 {
f6a3f532 3938 case succeed:
01618498 3939 return 1;
fa9a63c5 3940
fa9a63c5 3941 case duplicate:
505bde11
SM
3942 /* If the first character has to match a backreference, that means
3943 that the group was empty (since it already matched). Since this
3944 is the only case that interests us here, we can assume that the
3945 backreference must match the empty string. */
3946 p++;
3947 continue;
fa9a63c5
RM
3948
3949
3950 /* Following are the cases which match a character. These end
7814e705 3951 with `break'. */
fa9a63c5
RM
3952
3953 case exactn:
e0277a47 3954 if (fastmap)
cf9c99bc
KH
3955 {
3956 /* If multibyte is nonzero, the first byte of each
3957 character is an ASCII or a leading code. Otherwise,
3958 each byte is a character. Thus, this works in both
3959 cases. */
3960 fastmap[p[1]] = 1;
3961 if (! multibyte)
3962 {
3963 /* For the case of matching this unibyte regex
3964 against multibyte, we must set a leading code of
3965 the corresponding multibyte character. */
3966 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3967
86e893e3 3968 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
3969 }
3970 }
fa9a63c5
RM
3971 break;
3972
3973
1fb352e0
SM
3974 case anychar:
3975 /* We could put all the chars except for \n (and maybe \0)
3976 but we don't bother since it is generally not worth it. */
f6a3f532 3977 if (!fastmap) break;
01618498 3978 return -1;
fa9a63c5
RM
3979
3980
b18215fc 3981 case charset_not:
1fb352e0 3982 if (!fastmap) break;
bf216479
KH
3983 {
3984 /* Chars beyond end of bitmap are possible matches. */
bf216479 3985 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 3986 j < (1 << BYTEWIDTH); j++)
bf216479
KH
3987 fastmap[j] = 1;
3988 }
3989
1fb352e0
SM
3990 /* Fallthrough */
3991 case charset:
3992 if (!fastmap) break;
3993 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3994 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3995 j >= 0; j--)
1fb352e0 3996 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 3997 fastmap[j] = 1;
b18215fc 3998
6482db2e
KH
3999#ifdef emacs
4000 if (/* Any leading code can possibly start a character
1fb352e0 4001 which doesn't match the specified set of characters. */
6482db2e 4002 not
409f2919 4003 ||
6482db2e
KH
4004 /* If we can match a character class, we can match any
4005 multibyte characters. */
4006 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4007 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4008
b18215fc 4009 {
b18215fc
RS
4010 if (match_any_multibyte_characters == false)
4011 {
6482db2e
KH
4012 for (j = MIN_MULTIBYTE_LEADING_CODE;
4013 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4014 fastmap[j] = 1;
b18215fc
RS
4015 match_any_multibyte_characters = true;
4016 }
4017 }
b18215fc 4018
1fb352e0
SM
4019 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4020 && match_any_multibyte_characters == false)
4021 {
bf216479 4022 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 4023 multibyte character in the range table. */
1fb352e0 4024 int c, count;
bf216479 4025 unsigned char lc1, lc2;
b18215fc 4026
1fb352e0 4027 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4028 bits for a character class. */
1fb352e0 4029 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4030
1fb352e0
SM
4031 /* Extract the number of ranges in range table into COUNT. */
4032 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4033 for (; count > 0; count--, p += 3)
1fb352e0 4034 {
9117d724
KH
4035 /* Extract the start and end of each range. */
4036 EXTRACT_CHARACTER (c, p);
bf216479 4037 lc1 = CHAR_LEADING_CODE (c);
9117d724 4038 p += 3;
1fb352e0 4039 EXTRACT_CHARACTER (c, p);
bf216479
KH
4040 lc2 = CHAR_LEADING_CODE (c);
4041 for (j = lc1; j <= lc2; j++)
9117d724 4042 fastmap[j] = 1;
1fb352e0
SM
4043 }
4044 }
6482db2e 4045#endif
b18215fc
RS
4046 break;
4047
1fb352e0
SM
4048 case syntaxspec:
4049 case notsyntaxspec:
4050 if (!fastmap) break;
4051#ifndef emacs
4052 not = (re_opcode_t)p[-1] == notsyntaxspec;
4053 k = *p++;
4054 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4055 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4056 fastmap[j] = 1;
b18215fc 4057 break;
1fb352e0 4058#else /* emacs */
b18215fc
RS
4059 /* This match depends on text properties. These end with
4060 aborting optimizations. */
01618498 4061 return -1;
b18215fc
RS
4062
4063 case categoryspec:
b18215fc 4064 case notcategoryspec:
1fb352e0
SM
4065 if (!fastmap) break;
4066 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4067 k = *p++;
6482db2e 4068 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4069 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4070 fastmap[j] = 1;
4071
6482db2e
KH
4072 /* Any leading code can possibly start a character which
4073 has or doesn't has the specified category. */
4074 if (match_any_multibyte_characters == false)
6fdd04b0 4075 {
6482db2e
KH
4076 for (j = MIN_MULTIBYTE_LEADING_CODE;
4077 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4078 fastmap[j] = 1;
4079 match_any_multibyte_characters = true;
6fdd04b0 4080 }
b18215fc
RS
4081 break;
4082
fa9a63c5 4083 /* All cases after this match the empty string. These end with
25fe55af 4084 `continue'. */
fa9a63c5 4085
fa9a63c5
RM
4086 case before_dot:
4087 case at_dot:
4088 case after_dot:
1fb352e0 4089#endif /* !emacs */
25fe55af
RS
4090 case no_op:
4091 case begline:
4092 case endline:
fa9a63c5
RM
4093 case begbuf:
4094 case endbuf:
4095 case wordbound:
4096 case notwordbound:
4097 case wordbeg:
4098 case wordend:
669fa600
SM
4099 case symbeg:
4100 case symend:
25fe55af 4101 continue;
fa9a63c5
RM
4102
4103
fa9a63c5 4104 case jump:
25fe55af 4105 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4106 if (j < 0)
4107 /* Backward jumps can only go back to code that we've already
4108 visited. `re_compile' should make sure this is true. */
4109 break;
25fe55af 4110 p += j;
505bde11
SM
4111 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4112 {
4113 case on_failure_jump:
4114 case on_failure_keep_string_jump:
505bde11 4115 case on_failure_jump_loop:
0683b6fa 4116 case on_failure_jump_nastyloop:
505bde11
SM
4117 case on_failure_jump_smart:
4118 p++;
4119 break;
4120 default:
4121 continue;
4122 };
4123 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4124 to jump back to "just after here". */
4125 /* Fallthrough */
fa9a63c5 4126
25fe55af
RS
4127 case on_failure_jump:
4128 case on_failure_keep_string_jump:
0683b6fa 4129 case on_failure_jump_nastyloop:
505bde11
SM
4130 case on_failure_jump_loop:
4131 case on_failure_jump_smart:
25fe55af 4132 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4133 if (p + j <= p1)
ed0767d8 4134 ; /* Backward jump to be ignored. */
01618498
SM
4135 else
4136 { /* We have to look down both arms.
4137 We first go down the "straight" path so as to minimize
4138 stack usage when going through alternatives. */
4139 int r = analyse_first (p, pend, fastmap, multibyte);
4140 if (r) return r;
4141 p += j;
4142 }
25fe55af 4143 continue;
fa9a63c5
RM
4144
4145
ed0767d8
SM
4146 case jump_n:
4147 /* This code simply does not properly handle forward jump_n. */
4148 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4149 p += 4;
4150 /* jump_n can either jump or fall through. The (backward) jump
4151 case has already been handled, so we only need to look at the
4152 fallthrough case. */
4153 continue;
177c0ea7 4154
fa9a63c5 4155 case succeed_n:
ed0767d8
SM
4156 /* If N == 0, it should be an on_failure_jump_loop instead. */
4157 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4158 p += 4;
4159 /* We only care about one iteration of the loop, so we don't
4160 need to consider the case where this behaves like an
4161 on_failure_jump. */
25fe55af 4162 continue;
fa9a63c5
RM
4163
4164
4165 case set_number_at:
25fe55af
RS
4166 p += 4;
4167 continue;
fa9a63c5
RM
4168
4169
4170 case start_memory:
25fe55af 4171 case stop_memory:
505bde11 4172 p += 1;
fa9a63c5
RM
4173 continue;
4174
4175
4176 default:
25fe55af
RS
4177 abort (); /* We have listed all the cases. */
4178 } /* switch *p++ */
fa9a63c5
RM
4179
4180 /* Getting here means we have found the possible starting
25fe55af 4181 characters for one path of the pattern -- and that the empty
7814e705 4182 string does not match. We need not follow this path further. */
01618498 4183 return 0;
fa9a63c5
RM
4184 } /* while p */
4185
01618498
SM
4186 /* We reached the end without matching anything. */
4187 return 1;
4188
f6a3f532
SM
4189} /* analyse_first */
4190\f
4191/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4192 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4193 characters can start a string that matches the pattern. This fastmap
4194 is used by re_search to skip quickly over impossible starting points.
4195
4196 Character codes above (1 << BYTEWIDTH) are not represented in the
4197 fastmap, but the leading codes are represented. Thus, the fastmap
4198 indicates which character sets could start a match.
4199
4200 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4201 area as BUFP->fastmap.
4202
4203 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4204 the pattern buffer.
4205
4206 Returns 0 if we succeed, -2 if an internal error. */
4207
4208int
971de7fb 4209re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4210{
4211 char *fastmap = bufp->fastmap;
4212 int analysis;
4213
4214 assert (fastmap && bufp->buffer);
4215
72af86bd 4216 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4217 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4218
4219 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4220 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4221 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4222 return 0;
4223} /* re_compile_fastmap */
4224\f
4225/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4226 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4227 this memory for recording register information. STARTS and ENDS
4228 must be allocated using the malloc library routine, and must each
4229 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4230
4231 If NUM_REGS == 0, then subsequent matches should allocate their own
4232 register data.
4233
4234 Unless this function is called, the first search or match using
4235 PATTERN_BUFFER will allocate its own register data, without
4236 freeing the old data. */
4237
4238void
971de7fb 4239re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4240{
4241 if (num_regs)
4242 {
4243 bufp->regs_allocated = REGS_REALLOCATE;
4244 regs->num_regs = num_regs;
4245 regs->start = starts;
4246 regs->end = ends;
4247 }
4248 else
4249 {
4250 bufp->regs_allocated = REGS_UNALLOCATED;
4251 regs->num_regs = 0;
4252 regs->start = regs->end = (regoff_t *) 0;
4253 }
4254}
c0f9ea08 4255WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4256\f
7814e705 4257/* Searching routines. */
fa9a63c5
RM
4258
4259/* Like re_search_2, below, but only one string is specified, and
4260 doesn't let you say where to stop matching. */
4261
d1dfb56c
EZ
4262regoff_t
4263re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4264 ssize_t startpos, ssize_t range, struct re_registers *regs)
fa9a63c5 4265{
5e69f11e 4266 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4267 regs, size);
4268}
c0f9ea08 4269WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4270
70806df6
KH
4271/* Head address of the string that holds offset P of the virtual
   concatenation: `string2' when P >= size1, `string1' otherwise.
   Expands to references to `string1', `string2' and `size1', which
   must be in scope where the macro is used.  */
4272#define HEAD_ADDR_VSTRING(P) \
4273 (((P) >= size1 ? string2 : string1))
4274
b18215fc
RS
4275/* Address of the byte at offset POS in the virtual concatenation.
   Offsets below size1 index `string1' directly; offsets at or above
   size1 index `string2' after subtracting size1.  */
4276#define POS_ADDR_VSTRING(POS) \
4277 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4278
4279/* Using the compiled pattern in BUFP->buffer, first tries to match the
4280 virtual concatenation of STRING1 and STRING2, starting first at index
4281 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4282
fa9a63c5 4283 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4284
fa9a63c5
RM
4285 RANGE is how far to scan while trying to match. RANGE = 0 means try
4286 only at STARTPOS; in general, the last start tried is STARTPOS +
4287 RANGE.
5e69f11e 4288
fa9a63c5
RM
4289 In REGS, return the indices of the virtual concatenation of STRING1
4290 and STRING2 that matched the entire BUFP->buffer and its contained
4291 subexpressions.
5e69f11e 4292
fa9a63c5
RM
4293 Do not consider matching one past the index STOP in the virtual
4294 concatenation of STRING1 and STRING2.
4295
4296 We return either the position in the strings at which the match was
4297 found, -1 if no match, or -2 if error (such as failure
4298 stack overflow). */
4299
d1dfb56c
EZ
4300regoff_t
4301re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4302 const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4303 struct re_registers *regs, ssize_t stop)
fa9a63c5 4304{
d1dfb56c 4305 regoff_t val;
66f0296e
SM
4306 re_char *string1 = (re_char*) str1;
4307 re_char *string2 = (re_char*) str2;
fa9a63c5 4308 register char *fastmap = bufp->fastmap;
6676cb1c 4309 register RE_TRANSLATE_TYPE translate = bufp->translate;
d1dfb56c
EZ
4310 size_t total_size = size1 + size2;
4311 ssize_t endpos = startpos + range;
c0f9ea08 4312 boolean anchored_start;
cf9c99bc
KH
4313 /* Nonzero if we are searching multibyte string. */
4314 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4315
fa9a63c5
RM
4316 /* Check for out-of-range STARTPOS. */
4317 if (startpos < 0 || startpos > total_size)
4318 return -1;
5e69f11e 4319
fa9a63c5 4320 /* Fix up RANGE if it might eventually take us outside
34597fa9 4321 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4322 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4323 if (endpos < 0)
4324 range = 0 - startpos;
fa9a63c5
RM
4325 else if (endpos > total_size)
4326 range = total_size - startpos;
4327
4328 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4329 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4330 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4331 {
4332 if (startpos > 0)
4333 return -1;
4334 else
7b140fd7 4335 range = 0;
fa9a63c5
RM
4336 }
4337
ae4788a8
RS
4338#ifdef emacs
4339 /* In a forward search for something that starts with \=.
4340 don't keep searching past point. */
4341 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4342 {
7b140fd7
RS
4343 range = PT_BYTE - BEGV_BYTE - startpos;
4344 if (range < 0)
ae4788a8
RS
4345 return -1;
4346 }
4347#endif /* emacs */
4348
fa9a63c5
RM
4349 /* Update the fastmap now if not correct already. */
4350 if (fastmap && !bufp->fastmap_accurate)
01618498 4351 re_compile_fastmap (bufp);
5e69f11e 4352
c8499ba5 4353 /* See whether the pattern is anchored. */
c0f9ea08 4354 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4355
b18215fc 4356#ifdef emacs
d48cd3f4 4357 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4358 {
d1dfb56c 4359 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4360
4361 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4362 }
b18215fc
RS
4363#endif
4364
fa9a63c5
RM
4365 /* Loop through the string, looking for a place to start matching. */
4366 for (;;)
5e69f11e 4367 {
c8499ba5
RS
4368 /* If the pattern is anchored,
4369 skip quickly past places we cannot match.
4370 We don't bother to treat startpos == 0 specially
4371 because that case doesn't repeat. */
4372 if (anchored_start && startpos > 0)
4373 {
c0f9ea08
SM
4374 if (! ((startpos <= size1 ? string1[startpos - 1]
4375 : string2[startpos - size1 - 1])
4376 == '\n'))
c8499ba5
RS
4377 goto advance;
4378 }
4379
fa9a63c5 4380 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4381 cannot be the start of a match. If the pattern can match the
4382 null string, however, we don't need to skip characters; we want
7814e705 4383 the first null string. */
fa9a63c5
RM
4384 if (fastmap && startpos < total_size && !bufp->can_be_null)
4385 {
66f0296e 4386 register re_char *d;
01618498 4387 register re_wchar_t buf_ch;
e934739e
RS
4388
4389 d = POS_ADDR_VSTRING (startpos);
4390
7814e705 4391 if (range > 0) /* Searching forwards. */
fa9a63c5 4392 {
fa9a63c5 4393 register int lim = 0;
d1dfb56c 4394 ssize_t irange = range;
fa9a63c5 4395
25fe55af
RS
4396 if (startpos < size1 && startpos + range >= size1)
4397 lim = range - (size1 - startpos);
fa9a63c5 4398
25fe55af
RS
4399 /* Written out as an if-else to avoid testing `translate'
4400 inside the loop. */
28ae27ae
AS
4401 if (RE_TRANSLATE_P (translate))
4402 {
e934739e
RS
4403 if (multibyte)
4404 while (range > lim)
4405 {
4406 int buf_charlen;
4407
62a6e103 4408 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4409 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4410 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4411 break;
4412
4413 range -= buf_charlen;
4414 d += buf_charlen;
4415 }
4416 else
bf216479 4417 while (range > lim)
33c46939 4418 {
cf9c99bc
KH
4419 register re_wchar_t ch, translated;
4420
bf216479 4421 buf_ch = *d;
cf9c99bc
KH
4422 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4423 translated = RE_TRANSLATE (translate, ch);
4424 if (translated != ch
4425 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4426 buf_ch = ch;
6fdd04b0 4427 if (fastmap[buf_ch])
bf216479 4428 break;
33c46939
RS
4429 d++;
4430 range--;
4431 }
e934739e 4432 }
fa9a63c5 4433 else
6fdd04b0
KH
4434 {
4435 if (multibyte)
4436 while (range > lim)
4437 {
4438 int buf_charlen;
fa9a63c5 4439
62a6e103 4440 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4441 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4442 break;
4443 range -= buf_charlen;
4444 d += buf_charlen;
4445 }
e934739e 4446 else
6fdd04b0 4447 while (range > lim && !fastmap[*d])
33c46939
RS
4448 {
4449 d++;
4450 range--;
4451 }
e934739e 4452 }
fa9a63c5
RM
4453 startpos += irange - range;
4454 }
7814e705 4455 else /* Searching backwards. */
fa9a63c5 4456 {
ba5e343c
KH
4457 if (multibyte)
4458 {
62a6e103 4459 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4460 buf_ch = TRANSLATE (buf_ch);
4461 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4462 goto advance;
4463 }
4464 else
4465 {
cf9c99bc
KH
4466 register re_wchar_t ch, translated;
4467
4468 buf_ch = *d;
4469 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4470 translated = TRANSLATE (ch);
4471 if (translated != ch
4472 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4473 buf_ch = ch;
4474 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4475 goto advance;
4476 }
fa9a63c5
RM
4477 }
4478 }
4479
4480 /* If can't match the null string, and that's all we have left, fail. */
4481 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4482 && !bufp->can_be_null)
fa9a63c5
RM
4483 return -1;
4484
4485 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4486 startpos, regs, stop);
fa9a63c5
RM
4487
4488 if (val >= 0)
4489 return startpos;
5e69f11e 4490
fa9a63c5
RM
4491 if (val == -2)
4492 return -2;
4493
4494 advance:
5e69f11e 4495 if (!range)
25fe55af 4496 break;
5e69f11e 4497 else if (range > 0)
25fe55af 4498 {
b18215fc
RS
4499 /* Update STARTPOS to the next character boundary. */
4500 if (multibyte)
4501 {
66f0296e 4502 re_char *p = POS_ADDR_VSTRING (startpos);
aa3830c4 4503 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4504
4505 range -= len;
4506 if (range < 0)
4507 break;
4508 startpos += len;
4509 }
4510 else
4511 {
b560c397
RS
4512 range--;
4513 startpos++;
4514 }
e318085a 4515 }
fa9a63c5 4516 else
25fe55af
RS
4517 {
4518 range++;
4519 startpos--;
b18215fc
RS
4520
4521 /* Update STARTPOS to the previous character boundary. */
4522 if (multibyte)
4523 {
70806df6
KH
4524 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4525 re_char *p0 = p;
4526 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4527
4528 /* Find the head of multibyte form. */
70806df6
KH
4529 PREV_CHAR_BOUNDARY (p, phead);
4530 range += p0 - 1 - p;
4531 if (range > 0)
4532 break;
b18215fc 4533
70806df6 4534 startpos -= p0 - 1 - p;
b18215fc 4535 }
25fe55af 4536 }
fa9a63c5
RM
4537 }
4538 return -1;
4539} /* re_search_2 */
c0f9ea08 4540WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4541\f
4542/* Declarations and macros for re_match_2. */
4543
5e617bc2
JB
4544static int bcmp_translate _RE_ARGS ((re_char *s1, re_char *s2,
4545 register ssize_t len,
4546 RE_TRANSLATE_TYPE translate,
4547 const int multibyte));
fa9a63c5
RM
4548
4549/* Convert PTR, a pointer into one of the search strings `string1'
   or `string2', into an offset.  When FIRST_STRING_P (PTR) holds the
   result is PTR's distance from `string1'; otherwise it is PTR's
   distance from `string2' plus `size1'.  */
4551#define POINTER_TO_OFFSET(ptr) \
4552 (FIRST_STRING_P (ptr) \
4553 ? ((regoff_t) ((ptr) - string1)) \
4554 : ((regoff_t) ((ptr) - string2 + size1)))
4555
fa9a63c5 4556/* Call before fetching a character with *d.  While `d' has
   reached `dend', this either jumps to the `fail' label (when `dend'
   is already `end_match_2') or switches over to string2 by setting
   `d' to `string2' and `dend' to `end_match_2'.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
fa9a63c5 4560#define PREFETCH() \
25fe55af 4561 while (d == dend) \
fa9a63c5
RM
4562 { \
4563 /* End of string2 => fail. */ \
25fe55af
RS
4564 if (dend == end_match_2) \
4565 goto fail; \
4bb91c68 4566 /* End of string1 => advance to string2. */ \
25fe55af 4567 d = string2; \
fa9a63c5
RM
4568 dend = end_match_2; \
4569 }
4570
f1ad044f
SM
4571/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc.,
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e. past `stop').  Unlike PREFETCH, it never
   jumps to `fail': when `d' is at `end1' it simply moves `d' to
   `string2' and sets `dend' to `end_match_2'.  */
4575#define PREFETCH_NOLIMIT() \
4576 if (d == end1) \
4577 { \
4578 d = string2; \
4579 dend = end_match_2; \
4580 } \
fa9a63c5
RM
4581
4582/* Test if at very beginning or at very end of the virtual concatenation
   of `string1' and `string2'.  If only one string, it's `string2'.
   AT_STRINGS_BEG compares D with `string1' when `size1' is nonzero and
   with `string2' otherwise, and is also true whenever `size2' is zero.
   AT_STRINGS_END compares D with `end2'.
   NOTE(review): the `!size2' arm presumably relies on callers having
   arranged for a lone string to be `string2' -- confirm in
   re_match_2_internal.  */
fa9a63c5 4584#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4585#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5 4586
/* Disabled due to a compiler bug -- see comment at case wordbound */

/* The comment at case wordbound is the following one, but we no longer
   use AT_WORD_BOUNDARY, in order to support the multibyte form.

   The DEC Alpha C compiler 3.x generates incorrect code for the
   test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
   AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
   macro and introducing temporary variables works around the bug.  */

#if 0
/* Test if D points to a character which is word-constituent.  We have
   two special cases to check for: if past the end of string1, look at
   the first character in string2; and if before the beginning of
   string2, look at the last character in string1.  */
#define WORDCHAR_P(d)                                                   \
  (SYNTAX ((d) == end1 ? *string2                                       \
           : (d) == string2 - 1 ? *(end1 - 1) : *(d))                   \
   == Sword)

/* Test if the character before D and the one at D differ with respect
   to being word-constituent.  Also true at either end of the virtual
   concatenation of the two strings.  */
#define AT_WORD_BOUNDARY(d)                                             \
  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)                             \
   || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
#endif
fa9a63c5
RM
4613
/* Free everything we malloc.  */
#ifdef MATCH_MAY_ALLOCATE
/* If VAR is non-null, hand it to REGEX_FREE and reset it to NULL.  */
# define FREE_VAR(var)                                                  \
  do {                                                                  \
    if (var)                                                            \
      {                                                                 \
        REGEX_FREE (var);                                               \
        var = NULL;                                                     \
      }                                                                 \
  } while (0)
/* Release the failure stack with REGEX_FREE_STACK, then the four
   register vectors (regstart, regend, best_regstart, best_regend)
   with FREE_VAR.  */
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
#endif /* not MATCH_MAY_ALLOCATE */
4635
505bde11
SM
4636\f
4637/* Optimization routines. */
4638
4e8a9132
SM
4639/* If the operation is a match against one or more chars,
4640 return a pointer to the next operation, else return NULL. */
01618498 4641static re_char *
971de7fb 4642skip_one_char (const re_char *p)
4e8a9132
SM
4643{
4644 switch (SWITCH_ENUM_CAST (*p++))
4645 {
4646 case anychar:
4647 break;
177c0ea7 4648
4e8a9132
SM
4649 case exactn:
4650 p += *p + 1;
4651 break;
4652
4653 case charset_not:
4654 case charset:
4655 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4656 {
4657 int mcnt;
4658 p = CHARSET_RANGE_TABLE (p - 1);
4659 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4660 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4661 }
4662 else
4663 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4664 break;
177c0ea7 4665
4e8a9132
SM
4666 case syntaxspec:
4667 case notsyntaxspec:
1fb352e0 4668#ifdef emacs
4e8a9132
SM
4669 case categoryspec:
4670 case notcategoryspec:
4671#endif /* emacs */
4672 p++;
4673 break;
4674
4675 default:
4676 p = NULL;
4677 }
4678 return p;
4679}
4680
4681
505bde11 4682/* Jump over non-matching operations. */
839966f3 4683static re_char *
971de7fb 4684skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4685{
4686 int mcnt;
4687 while (p < pend)
4688 {
4689 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4690 {
4691 case start_memory:
505bde11
SM
4692 case stop_memory:
4693 p += 2; break;
4694 case no_op:
4695 p += 1; break;
4696 case jump:
4697 p += 1;
4698 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4699 p += mcnt;
4700 break;
4701 default:
4702 return p;
4703 }
4704 }
4705 assert (p == pend);
4706 return p;
4707}
4708
/* Non-zero if "p1 matches something" implies "p2 fails".

   BUFP is the compiled pattern.  P1 and P2 must point into
   BUFP->buffer (P2 may also be the end of the used part of the
   buffer); this is asserted.  Group and no-op commands at P2 are
   skipped with skip_noops before its opcode is examined, and P2 at
   the end of the pattern is treated like `succeed'.

   Returns 1 only in the cases recognized below; every other
   combination returns 0, the safe default.  */
static int
mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
{
  re_opcode_t op2;
  const boolean multibyte = RE_MULTIBYTE_P (bufp);
  unsigned char *pend = bufp->buffer + bufp->used;

  assert (p1 >= bufp->buffer && p1 < pend
          && p2 >= bufp->buffer && p2 <= pend);

  /* Skip over open/close-group commands.
     If what follows this loop is a ...+ construct,
     look at what begins its body, since we will have to
     match at least one of that.  */
  p2 = skip_noops (p2, pend);
  /* The same skip can be done for p1, except that this function
     is only used in the case where p1 is a simple match operator.  */
  /* p1 = skip_noops (p1, pend); */

  assert (p1 >= bufp->buffer && p1 < pend
          && p2 >= bufp->buffer && p2 <= pend);

  /* Do not read past the end of the pattern: no opcode there means
     the pattern succeeds.  */
  op2 = p2 == pend ? succeed : *p2;

  switch (SWITCH_ENUM_CAST (op2))
    {
    case succeed:
    case endbuf:
      /* If we're at the end of the pattern, we can change.
         skip_one_char is non-NULL only when P1 is a character-matching
         operation.  */
      if (skip_one_char (p1))
        {
          DEBUG_PRINT1 (" End of pattern: fast loop.\n");
          return 1;
        }
      break;

    case endline:
    case exactn:
      {
        /* C is the character P2 requires next: a newline for endline,
           otherwise the first character of the exactn string.  */
        register re_wchar_t c
          = (re_opcode_t) *p2 == endline ? '\n'
          : RE_STRING_CHAR (p2 + 2, multibyte);

        if ((re_opcode_t) *p1 == exactn)
          {
            /* Two literal strings: exclusive if their first
               characters differ.  */
            if (c != RE_STRING_CHAR (p1 + 2, multibyte))
              {
                DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
                return 1;
              }
          }

        else if ((re_opcode_t) *p1 == charset
                 || (re_opcode_t) *p1 == charset_not)
          {
            int not = (re_opcode_t) *p1 == charset_not;

            /* Test if C is listed in charset (or charset_not)
               at `p1'.  */
            if (! multibyte || IS_REAL_ASCII (c))
              {
                if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
                    && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
                  not = !not;
              }
            else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
              CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);

            /* `not' is equal to 1 if c would match, which means
               that we can't change to pop_failure_jump.  */
            if (!not)
              {
                DEBUG_PRINT1 (" No match => fast loop.\n");
                return 1;
              }
          }
        else if ((re_opcode_t) *p1 == anychar
                 && c == '\n')
          {
            DEBUG_PRINT1 (" . != \\n => fast loop.\n");
            return 1;
          }
      }
      break;

    case charset:
      {
        if ((re_opcode_t) *p1 == exactn)
          /* Reuse the code above (the exactn case, with the roles of
             P1 and P2 swapped).  */
          return mutually_exclusive_p (bufp, p2, p1);

      /* It is hard to list up all the character in charset
         P2 if it includes multibyte character.  Give up in
         such case.  */
      else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
        {
          /* Now, we are sure that P2 has no range table.
             So, for the size of bitmap in P2, `p2[1]' is
             enough.  But P1 may have range table, so the
             size of bitmap table of P1 is extracted by
             using macro `CHARSET_BITMAP_SIZE'.

             In a multibyte case, we know that all the character
             listed in P2 is ASCII.  In a unibyte case, P1 has only a
             bitmap table.  So, in both cases, it is enough to test
             only the bitmap table of P1.  */

          if ((re_opcode_t) *p1 == charset)
            {
              int idx;
              /* We win if the charset inside the loop
                 has no overlap with the one after the loop.  */
              for (idx = 0;
                   (idx < (int) p2[1]
                    && idx < CHARSET_BITMAP_SIZE (p1));
                   idx++)
                if ((p2[2 + idx] & p1[2 + idx]) != 0)
                  break;

              /* Reaching the end of either bitmap means no common bit
                 was found.  */
              if (idx == p2[1]
                  || idx == CHARSET_BITMAP_SIZE (p1))
                {
                  DEBUG_PRINT1 (" No match => fast loop.\n");
                  return 1;
                }
            }
          else if ((re_opcode_t) *p1 == charset_not)
            {
              int idx;
              /* We win if the charset_not inside the loop lists
                 every character listed in the charset after.  */
              for (idx = 0; idx < (int) p2[1]; idx++)
                if (! (p2[2 + idx] == 0
                       || (idx < CHARSET_BITMAP_SIZE (p1)
                           && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
                  break;

              if (idx == p2[1])
                {
                  DEBUG_PRINT1 (" No match => fast loop.\n");
                  return 1;
                }
            }
        }
      }
      break;

    case charset_not:
      switch (SWITCH_ENUM_CAST (*p1))
        {
        case exactn:
        case charset:
          /* Reuse the code above (the exactn and charset cases, with
             the roles of P1 and P2 swapped).  */
          return mutually_exclusive_p (bufp, p2, p1);
        case charset_not:
          /* When we have two charset_not, it's very unlikely that
             they don't overlap.  The union of the two sets of excluded
             chars should cover all possible chars, which, as a matter of
             fact, is virtually impossible in multibyte buffers.  */
          break;
        }
      break;

    /* For the remaining opcodes the answer depends only on the opcode
       at P1 and on its argument byte P1[1].  */
    case wordend:
      return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
    case symend:
      return ((re_opcode_t) *p1 == syntaxspec
              && (p1[1] == Ssymbol || p1[1] == Sword));
    case notsyntaxspec:
      return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);

    case wordbeg:
      return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
    case symbeg:
      return ((re_opcode_t) *p1 == notsyntaxspec
              && (p1[1] == Ssymbol || p1[1] == Sword));
    case syntaxspec:
      return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);

    case wordbound:
      return (((re_opcode_t) *p1 == notsyntaxspec
               || (re_opcode_t) *p1 == syntaxspec)
              && p1[1] == Sword);

#ifdef emacs
    case categoryspec:
      return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
    case notcategoryspec:
      return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
#endif /* emacs */

    default:
      ;
    }

  /* Safe default.  */
  return 0;
}
4908
fa9a63c5
RM
4909\f
4910/* Matching routines. */
4911
25fe55af 4912#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4913/* re_match is like re_match_2 except it takes only a single string. */
4914
d1dfb56c 4915regoff_t
d2762c86 4916re_match (struct re_pattern_buffer *bufp, const char *string,
d1dfb56c 4917 size_t size, ssize_t pos, struct re_registers *regs)
fa9a63c5 4918{
d1dfb56c
EZ
4919 regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4920 size, pos, regs, size);
fa9a63c5
RM
4921 return result;
4922}
c0f9ea08 4923WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4924#endif /* not emacs */
4925
b18215fc
RS
#ifdef emacs
/* In Emacs, this is the string or buffer in which we
   are matching.  It is used for looking up syntax properties.
   re_match_2 copies it into gl_state.object before matching.  */
Lisp_Object re_match_object;
#endif
fa9a63c5
RM
4931
4932/* re_match_2 matches the compiled pattern in BUFP against the
4933 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4934 and SIZE2, respectively). We start matching at POS, and stop
4935 matching at STOP.
5e69f11e 4936
fa9a63c5 4937 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 4938 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4939 documentation for exactly how many groups we fill.
4940
4941 We return -1 if no match, -2 if an internal error (such as the
7814e705 4942 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4943 matched substring. */
4944
d1dfb56c
EZ
4945regoff_t
4946re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4947 size_t size1, const char *string2, size_t size2, ssize_t pos,
4948 struct re_registers *regs, ssize_t stop)
fa9a63c5 4949{
d1dfb56c 4950 regoff_t result;
25fe55af 4951
b18215fc 4952#ifdef emacs
d1dfb56c 4953 ssize_t charpos;
d48cd3f4 4954 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 4955 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4956 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4957#endif
4958
4bb91c68
SM
4959 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4960 (re_char*) string2, size2,
cc9b4df2 4961 pos, regs, stop);
fa9a63c5
RM
4962 return result;
4963}
c0f9ea08 4964WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 4965
bf216479 4966
fa9a63c5 4967/* This is a separate function so that we can force an alloca cleanup
7814e705 4968 afterwards. */
d1dfb56c
EZ
4969static regoff_t
4970re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1,
4971 size_t size1, const re_char *string2, size_t size2,
4972 ssize_t pos, struct re_registers *regs, ssize_t stop)
fa9a63c5
RM
4973{
4974 /* General temporaries. */
d1dfb56c 4975 ssize_t mcnt;
01618498 4976 size_t reg;
fa9a63c5
RM
4977
4978 /* Just past the end of the corresponding string. */
66f0296e 4979 re_char *end1, *end2;
fa9a63c5
RM
4980
4981 /* Pointers into string1 and string2, just past the last characters in
7814e705 4982 each to consider matching. */
66f0296e 4983 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4984
4985 /* Where we are in the data, and the end of the current string. */
66f0296e 4986 re_char *d, *dend;
5e69f11e 4987
99633e97
SM
4988 /* Used sometimes to remember where we were before starting matching
4989 an operator so that we can go back in case of failure. This "atomic"
4990 behavior of matching opcodes is indispensable to the correctness
4991 of the on_failure_keep_string_jump optimization. */
4992 re_char *dfail;
4993
fa9a63c5 4994 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4995 re_char *p = bufp->buffer;
4996 re_char *pend = p + bufp->used;
fa9a63c5 4997
25fe55af 4998 /* We use this to map every character in the string. */
6676cb1c 4999 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5000
cf9c99bc 5001 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5002 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5003
cf9c99bc
KH
5004 /* Nonzero if STRING1/STRING2 are multibyte. */
5005 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5006
fa9a63c5
RM
5007 /* Failure point stack. Each place that can handle a failure further
5008 down the line pushes a failure point on this stack. It consists of
505bde11 5009 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5010 the subexpressions we're currently inside, plus the number of such
5011 registers, and, finally, two char *'s. The first char * is where
5012 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5013 scanning the strings. */
5014#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5015 fail_stack_type fail_stack;
5016#endif
5017#ifdef DEBUG
fa9a63c5
RM
5018 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5019#endif
5020
0b32bf0e 5021#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5022 /* This holds the pointer to the failure stack, when
5023 it is allocated relocatably. */
5024 fail_stack_elt_t *failure_stack_ptr;
99633e97 5025#endif
fa9a63c5
RM
5026
5027 /* We fill all the registers internally, independent of what we
7814e705 5028 return, for use in backreferences. The number here includes
fa9a63c5 5029 an element for register zero. */
4bb91c68 5030 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5031
fa9a63c5
RM
5032 /* Information on the contents of registers. These are pointers into
5033 the input strings; they record just what was matched (on this
5034 attempt) by a subexpression part of the pattern, that is, the
5035 regnum-th regstart pointer points to where in the pattern we began
5036 matching and the regnum-th regend points to right after where we
5037 stopped matching the regnum-th subexpression. (The zeroth register
5038 keeps track of what the whole pattern matches.) */
5039#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5040 re_char **regstart, **regend;
fa9a63c5
RM
5041#endif
5042
fa9a63c5 5043 /* The following record the register info as found in the above
5e69f11e 5044 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5045 This happens as we backtrack through the failure points, which in
5046 turn happens only if we have not yet matched the entire string. */
5047 unsigned best_regs_set = false;
5048#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5049 re_char **best_regstart, **best_regend;
fa9a63c5 5050#endif
5e69f11e 5051
fa9a63c5
RM
5052 /* Logically, this is `best_regend[0]'. But we don't want to have to
5053 allocate space for that if we're not allocating space for anything
7814e705 5054 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5055 any of the other register vectors, and it seems rather a kludge to
5056 treat `best_regend' differently than the rest. So we keep track of
5057 the end of the best match so far in a separate variable. We
5058 initialize this to NULL so that when we backtrack the first time
5059 and need to test it, it's not garbage. */
66f0296e 5060 re_char *match_end = NULL;
fa9a63c5 5061
fa9a63c5
RM
5062#ifdef DEBUG
5063 /* Counts the total number of registers pushed. */
5e69f11e 5064 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5065#endif
5066
5067 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5068
fa9a63c5 5069 INIT_FAIL_STACK ();
5e69f11e 5070
fa9a63c5
RM
5071#ifdef MATCH_MAY_ALLOCATE
5072 /* Do not bother to initialize all the register variables if there are
5073 no groups in the pattern, as it takes a fair amount of time. If
5074 there are groups, we include space for register 0 (the whole
5075 pattern), even though we never use it, since it simplifies the
5076 array indexing. We should fix this. */
5077 if (bufp->re_nsub)
5078 {
66f0296e
SM
5079 regstart = REGEX_TALLOC (num_regs, re_char *);
5080 regend = REGEX_TALLOC (num_regs, re_char *);
5081 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5082 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5083
505bde11 5084 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5085 {
5086 FREE_VARIABLES ();
5087 return -2;
5088 }
fa9a63c5
RM
5089 }
5090 else
5091 {
5092 /* We must initialize all our variables to NULL, so that
25fe55af 5093 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5094 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5095 }
5096#endif /* MATCH_MAY_ALLOCATE */
5097
5098 /* The starting position is bogus. */
5099 if (pos < 0 || pos > size1 + size2)
5100 {
5101 FREE_VARIABLES ();
5102 return -1;
5103 }
5e69f11e 5104
fa9a63c5
RM
5105 /* Initialize subexpression text positions to -1 to mark ones that no
5106 start_memory/stop_memory has been seen for. Also initialize the
5107 register information struct. */
01618498
SM
5108 for (reg = 1; reg < num_regs; reg++)
5109 regstart[reg] = regend[reg] = NULL;
99633e97 5110
fa9a63c5 5111 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5112 `string1' is null. */
fa9a63c5
RM
5113 if (size2 == 0 && string1 != NULL)
5114 {
5115 string2 = string1;
5116 size2 = size1;
5117 string1 = 0;
5118 size1 = 0;
5119 }
5120 end1 = string1 + size1;
5121 end2 = string2 + size2;
5122
5e69f11e 5123 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5124 `dend' is the end of the input string that `d' points within. `d'
5125 is advanced into the following input string whenever necessary, but
5126 this happens before fetching; therefore, at the beginning of the
5127 loop, `d' can be pointing at the end of a string, but it cannot
5128 equal `string2'. */
419d1c74 5129 if (pos >= size1)
fa9a63c5 5130 {
419d1c74
SM
5131 /* Only match within string2. */
5132 d = string2 + pos - size1;
5133 dend = end_match_2 = string2 + stop - size1;
5134 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5135 }
5136 else
5137 {
f1ad044f 5138 if (stop < size1)
419d1c74
SM
5139 {
5140 /* Only match within string1. */
5141 end_match_1 = string1 + stop;
5142 /* BEWARE!
5143 When we reach end_match_1, PREFETCH normally switches to string2.
5144 But in the present case, this means that just doing a PREFETCH
5145 makes us jump from `stop' to `gap' within the string.
5146 What we really want here is for the search to stop as
5147 soon as we hit end_match_1. That's why we set end_match_2
5148 to end_match_1 (since PREFETCH fails as soon as we hit
5149 end_match_2). */
5150 end_match_2 = end_match_1;
5151 }
5152 else
f1ad044f
SM
5153 { /* It's important to use this code when stop == size so that
5154 moving `d' from end1 to string2 will not prevent the d == dend
5155 check from catching the end of string. */
419d1c74
SM
5156 end_match_1 = end1;
5157 end_match_2 = string2 + stop - size1;
5158 }
5159 d = string1 + pos;
5160 dend = end_match_1;
fa9a63c5
RM
5161 }
5162
5163 DEBUG_PRINT1 ("The compiled pattern is: ");
5164 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5165 DEBUG_PRINT1 ("The string to match is: `");
5166 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5167 DEBUG_PRINT1 ("'\n");
5e69f11e 5168
7814e705 5169 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5170 function if the match is complete, or it drops through if the match
5171 fails at this starting point in the input data. */
5172 for (;;)
5173 {
505bde11 5174 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5175
5176 if (p == pend)
5177 { /* End of pattern means we might have succeeded. */
25fe55af 5178 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5179
fa9a63c5 5180 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5181 longest match, try backtracking. */
5182 if (d != end_match_2)
fa9a63c5
RM
5183 {
5184 /* 1 if this match ends in the same string (string1 or string2)
5185 as the best previous match. */
5e69f11e 5186 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5187 == FIRST_STRING_P (d));
fa9a63c5
RM
5188 /* 1 if this match is the best seen so far. */
5189 boolean best_match_p;
5190
5191 /* AIX compiler got confused when this was combined
7814e705 5192 with the previous declaration. */
fa9a63c5
RM
5193 if (same_str_p)
5194 best_match_p = d > match_end;
5195 else
99633e97 5196 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5197
25fe55af
RS
5198 DEBUG_PRINT1 ("backtracking.\n");
5199
5200 if (!FAIL_STACK_EMPTY ())
5201 { /* More failure points to try. */
5202
5203 /* If exceeds best match so far, save it. */
5204 if (!best_regs_set || best_match_p)
5205 {
5206 best_regs_set = true;
5207 match_end = d;
5208
5209 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5210
01618498 5211 for (reg = 1; reg < num_regs; reg++)
25fe55af 5212 {
01618498
SM
5213 best_regstart[reg] = regstart[reg];
5214 best_regend[reg] = regend[reg];
25fe55af
RS
5215 }
5216 }
5217 goto fail;
5218 }
5219
5220 /* If no failure points, don't restore garbage. And if
5221 last match is real best match, don't restore second
5222 best one. */
5223 else if (best_regs_set && !best_match_p)
5224 {
5225 restore_best_regs:
5226 /* Restore best match. It may happen that `dend ==
5227 end_match_1' while the restored d is in string2.
5228 For example, the pattern `x.*y.*z' against the
5229 strings `x-' and `y-z-', if the two strings are
7814e705 5230 not consecutive in memory. */
25fe55af
RS
5231 DEBUG_PRINT1 ("Restoring best registers.\n");
5232
5233 d = match_end;
5234 dend = ((d >= string1 && d <= end1)
5235 ? end_match_1 : end_match_2);
fa9a63c5 5236
01618498 5237 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5238 {
01618498
SM
5239 regstart[reg] = best_regstart[reg];
5240 regend[reg] = best_regend[reg];
fa9a63c5 5241 }
25fe55af
RS
5242 }
5243 } /* d != end_match_2 */
fa9a63c5
RM
5244
5245 succeed_label:
25fe55af 5246 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5247
25fe55af
RS
5248 /* If caller wants register contents data back, do it. */
5249 if (regs && !bufp->no_sub)
fa9a63c5 5250 {
25fe55af
RS
5251 /* Have the register data arrays been allocated? */
5252 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5253 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5254 extra element beyond `num_regs' for the `-1' marker
5255 GNU code uses. */
5256 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5257 regs->start = TALLOC (regs->num_regs, regoff_t);
5258 regs->end = TALLOC (regs->num_regs, regoff_t);
5259 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5260 {
5261 FREE_VARIABLES ();
5262 return -2;
5263 }
25fe55af
RS
5264 bufp->regs_allocated = REGS_REALLOCATE;
5265 }
5266 else if (bufp->regs_allocated == REGS_REALLOCATE)
5267 { /* Yes. If we need more elements than were already
5268 allocated, reallocate them. If we need fewer, just
5269 leave it alone. */
5270 if (regs->num_regs < num_regs + 1)
5271 {
5272 regs->num_regs = num_regs + 1;
5273 RETALLOC (regs->start, regs->num_regs, regoff_t);
5274 RETALLOC (regs->end, regs->num_regs, regoff_t);
5275 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5276 {
5277 FREE_VARIABLES ();
5278 return -2;
5279 }
25fe55af
RS
5280 }
5281 }
5282 else
fa9a63c5
RM
5283 {
5284 /* These braces fend off a "empty body in an else-statement"
7814e705 5285 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5286 assert (bufp->regs_allocated == REGS_FIXED);
5287 }
5288
25fe55af
RS
5289 /* Convert the pointer data in `regstart' and `regend' to
5290 indices. Register zero has to be set differently,
5291 since we haven't kept track of any info for it. */
5292 if (regs->num_regs > 0)
5293 {
5294 regs->start[0] = pos;
99633e97 5295 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5296 }
5e69f11e 5297
25fe55af
RS
5298 /* Go through the first `min (num_regs, regs->num_regs)'
5299 registers, since that is all we initialized. */
01618498 5300 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5301 {
01618498
SM
5302 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5303 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5304 else
5305 {
01618498
SM
5306 regs->start[reg]
5307 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5308 regs->end[reg]
5309 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5310 }
fa9a63c5 5311 }
5e69f11e 5312
25fe55af
RS
5313 /* If the regs structure we return has more elements than
5314 were in the pattern, set the extra elements to -1. If
5315 we (re)allocated the registers, this is the case,
5316 because we always allocate enough to have at least one
7814e705 5317 -1 at the end. */
01618498
SM
5318 for (reg = num_regs; reg < regs->num_regs; reg++)
5319 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5320 } /* regs && !bufp->no_sub */
5321
25fe55af
RS
5322 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5323 nfailure_points_pushed, nfailure_points_popped,
5324 nfailure_points_pushed - nfailure_points_popped);
5325 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5326
99633e97 5327 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5328
25fe55af 5329 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5330
25fe55af
RS
5331 FREE_VARIABLES ();
5332 return mcnt;
5333 }
fa9a63c5 5334
7814e705 5335 /* Otherwise match next pattern command. */
fa9a63c5
RM
5336 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5337 {
25fe55af
RS
5338 /* Ignore these. Used to ignore the n of succeed_n's which
5339 currently have n == 0. */
5340 case no_op:
5341 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5342 break;
fa9a63c5
RM
5343
5344 case succeed:
25fe55af 5345 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5346 goto succeed_label;
5347
7814e705 5348 /* Match the next n pattern characters exactly. The following
25fe55af 5349 byte in the pattern defines n, and the n bytes after that
7814e705 5350 are the characters to match. */
fa9a63c5
RM
5351 case exactn:
5352 mcnt = *p++;
25fe55af 5353 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5354
99633e97
SM
5355 /* Remember the start point to rollback upon failure. */
5356 dfail = d;
5357
6fdd04b0 5358#ifndef emacs
25fe55af
RS
5359 /* This is written out as an if-else so we don't waste time
5360 testing `translate' inside the loop. */
28703c16 5361 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5362 do
5363 {
5364 PREFETCH ();
5365 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5366 {
6fdd04b0
KH
5367 d = dfail;
5368 goto fail;
e934739e 5369 }
6fdd04b0
KH
5370 d++;
5371 }
5372 while (--mcnt);
fa9a63c5 5373 else
6fdd04b0
KH
5374 do
5375 {
5376 PREFETCH ();
5377 if (*d++ != *p++)
bf216479 5378 {
6fdd04b0
KH
5379 d = dfail;
5380 goto fail;
bf216479 5381 }
6fdd04b0
KH
5382 }
5383 while (--mcnt);
5384#else /* emacs */
5385 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5386 if (target_multibyte)
6fdd04b0
KH
5387 do
5388 {
5389 int pat_charlen, buf_charlen;
cf9c99bc 5390 int pat_ch, buf_ch;
e934739e 5391
6fdd04b0 5392 PREFETCH ();
cf9c99bc 5393 if (multibyte)
62a6e103 5394 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5395 else
5396 {
5397 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5398 pat_charlen = 1;
5399 }
62a6e103 5400 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5401
6fdd04b0 5402 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5403 {
6fdd04b0
KH
5404 d = dfail;
5405 goto fail;
e934739e 5406 }
bf216479 5407
6fdd04b0
KH
5408 p += pat_charlen;
5409 d += buf_charlen;
5410 mcnt -= pat_charlen;
5411 }
5412 while (mcnt > 0);
fa9a63c5 5413 else
6fdd04b0
KH
5414 do
5415 {
abbd1bcf 5416 int pat_charlen;
cf9c99bc 5417 int pat_ch, buf_ch;
bf216479 5418
6fdd04b0 5419 PREFETCH ();
cf9c99bc
KH
5420 if (multibyte)
5421 {
62a6e103 5422 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5423 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5424 }
5425 else
5426 {
5427 pat_ch = *p;
5428 pat_charlen = 1;
5429 }
5430 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5431 if (! CHAR_BYTE8_P (buf_ch))
5432 {
5433 buf_ch = TRANSLATE (buf_ch);
5434 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5435 if (buf_ch < 0)
5436 buf_ch = *d;
5437 }
0e2501ed
AS
5438 else
5439 buf_ch = *d;
cf9c99bc 5440 if (buf_ch != pat_ch)
6fdd04b0
KH
5441 {
5442 d = dfail;
5443 goto fail;
bf216479 5444 }
cf9c99bc
KH
5445 p += pat_charlen;
5446 d++;
6fdd04b0
KH
5447 }
5448 while (--mcnt);
5449#endif
25fe55af 5450 break;
fa9a63c5
RM
5451
5452
25fe55af 5453 /* Match any character except possibly a newline or a null. */
fa9a63c5 5454 case anychar:
e934739e
RS
5455 {
5456 int buf_charlen;
01618498 5457 re_wchar_t buf_ch;
fa9a63c5 5458
e934739e 5459 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5460
e934739e 5461 PREFETCH ();
62a6e103 5462 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5463 target_multibyte);
e934739e
RS
5464 buf_ch = TRANSLATE (buf_ch);
5465
5466 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5467 && buf_ch == '\n')
5468 || ((bufp->syntax & RE_DOT_NOT_NULL)
5469 && buf_ch == '\000'))
5470 goto fail;
5471
e934739e
RS
5472 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5473 d += buf_charlen;
5474 }
fa9a63c5
RM
5475 break;
5476
5477
5478 case charset:
5479 case charset_not:
5480 {
b18215fc 5481 register unsigned int c;
fa9a63c5 5482 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5483 int len;
5484
5485 /* Start of actual range_table, or end of bitmap if there is no
5486 range table. */
da053e48 5487 re_char *range_table IF_LINT (= NULL);
b18215fc 5488
96cc36cc 5489 /* Nonzero if there is a range table. */
b18215fc
RS
5490 int range_table_exists;
5491
96cc36cc
RS
5492 /* Number of ranges of range table. This is not included
5493 in the initial byte-length of the command. */
5494 int count = 0;
fa9a63c5 5495
f5020181
AS
5496 /* Whether matching against a unibyte character. */
5497 boolean unibyte_char = false;
5498
25fe55af 5499 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5500
b18215fc 5501 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5502
b18215fc 5503 if (range_table_exists)
96cc36cc
RS
5504 {
5505 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5506 EXTRACT_NUMBER_AND_INCR (count, range_table);
5507 }
b18215fc 5508
2d1675e4 5509 PREFETCH ();
62a6e103 5510 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5511 if (target_multibyte)
5512 {
5513 int c1;
b18215fc 5514
cf9c99bc
KH
5515 c = TRANSLATE (c);
5516 c1 = RE_CHAR_TO_UNIBYTE (c);
5517 if (c1 >= 0)
f5020181
AS
5518 {
5519 unibyte_char = true;
5520 c = c1;
5521 }
cf9c99bc
KH
5522 }
5523 else
5524 {
5525 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5526
5527 if (! CHAR_BYTE8_P (c1))
5528 {
5529 c1 = TRANSLATE (c1);
5530 c1 = RE_CHAR_TO_UNIBYTE (c1);
5531 if (c1 >= 0)
f5020181
AS
5532 {
5533 unibyte_char = true;
5534 c = c1;
5535 }
cf9c99bc 5536 }
0b8be006
AS
5537 else
5538 unibyte_char = true;
cf9c99bc
KH
5539 }
5540
f5020181 5541 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5542 { /* Lookup bitmap. */
b18215fc
RS
5543 /* Cast to `unsigned' instead of `unsigned char' in
5544 case the bit list is a full 32 bytes long. */
5545 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5546 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5547 not = !not;
b18215fc 5548 }
96cc36cc 5549#ifdef emacs
b18215fc 5550 else if (range_table_exists)
96cc36cc
RS
5551 {
5552 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5553
14473664
SM
5554 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5555 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5556 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5557 | (class_bits & BIT_SPACE && ISSPACE (c))
5558 | (class_bits & BIT_UPPER && ISUPPER (c))
5559 | (class_bits & BIT_WORD && ISWORD (c)))
5560 not = !not;
5561 else
5562 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5563 }
5564#endif /* emacs */
fa9a63c5 5565
96cc36cc
RS
5566 if (range_table_exists)
5567 p = CHARSET_RANGE_TABLE_END (range_table, count);
5568 else
5569 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5570
5571 if (!not) goto fail;
5e69f11e 5572
b18215fc 5573 d += len;
fa9a63c5 5574 }
8fb31792 5575 break;
fa9a63c5
RM
5576
5577
25fe55af 5578 /* The beginning of a group is represented by start_memory.
505bde11 5579 The argument is the register number. The text
25fe55af 5580 matched within the group is recorded (in the internal
7814e705 5581 registers data structure) under the register number. */
25fe55af 5582 case start_memory:
505bde11
SM
5583 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5584
5585 /* In case we need to undo this operation (via backtracking). */
5586 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5587
25fe55af 5588 regstart[*p] = d;
4bb91c68 5589 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5590 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5591
25fe55af 5592 /* Move past the register number and inner group count. */
505bde11 5593 p += 1;
25fe55af 5594 break;
fa9a63c5
RM
5595
5596
25fe55af 5597 /* The stop_memory opcode represents the end of a group. Its
505bde11 5598 argument is the same as start_memory's: the register number. */
fa9a63c5 5599 case stop_memory:
505bde11
SM
5600 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5601
5602 assert (!REG_UNSET (regstart[*p]));
5603 /* Strictly speaking, there should be code such as:
177c0ea7 5604
0b32bf0e 5605 assert (REG_UNSET (regend[*p]));
505bde11
SM
5606 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5607
5608 But the only info to be pushed is regend[*p] and it is known to
5609 be UNSET, so there really isn't anything to push.
5610 Not pushing anything, on the other hand deprives us from the
5611 guarantee that regend[*p] is UNSET since undoing this operation
5612 will not reset its value properly. This is not important since
5613 the value will only be read on the next start_memory or at
5614 the very end and both events can only happen if this stop_memory
5615 is *not* undone. */
fa9a63c5 5616
25fe55af 5617 regend[*p] = d;
fa9a63c5
RM
5618 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5619
25fe55af 5620 /* Move past the register number and the inner group count. */
505bde11 5621 p += 1;
25fe55af 5622 break;
fa9a63c5
RM
5623
5624
5625 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5626 followed by the numeric value of <digit> as the register number. */
5627 case duplicate:
fa9a63c5 5628 {
66f0296e 5629 register re_char *d2, *dend2;
7814e705 5630 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5631 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5632
7814e705 5633 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5634 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5635 goto fail;
5e69f11e 5636
7814e705 5637 /* Where in input to try to start matching. */
25fe55af 5638 d2 = regstart[regno];
5e69f11e 5639
99633e97
SM
5640 /* Remember the start point to rollback upon failure. */
5641 dfail = d;
5642
25fe55af
RS
5643 /* Where to stop matching; if both the place to start and
5644 the place to stop matching are in the same string, then
5645 set to the place to stop, otherwise, for now have to use
5646 the end of the first string. */
fa9a63c5 5647
25fe55af 5648 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5649 == FIRST_STRING_P (regend[regno]))
5650 ? regend[regno] : end_match_1);
5651 for (;;)
5652 {
5653 /* If necessary, advance to next segment in register
25fe55af 5654 contents. */
fa9a63c5
RM
5655 while (d2 == dend2)
5656 {
5657 if (dend2 == end_match_2) break;
5658 if (dend2 == regend[regno]) break;
5659
25fe55af
RS
5660 /* End of string1 => advance to string2. */
5661 d2 = string2;
5662 dend2 = regend[regno];
fa9a63c5
RM
5663 }
5664 /* At end of register contents => success */
5665 if (d2 == dend2) break;
5666
5667 /* If necessary, advance to next segment in data. */
5668 PREFETCH ();
5669
5670 /* How many characters left in this segment to match. */
5671 mcnt = dend - d;
5e69f11e 5672
fa9a63c5 5673 /* Want how many consecutive characters we can match in
25fe55af
RS
5674 one shot, so, if necessary, adjust the count. */
5675 if (mcnt > dend2 - d2)
fa9a63c5 5676 mcnt = dend2 - d2;
5e69f11e 5677
fa9a63c5 5678 /* Compare that many; failure if mismatch, else move
25fe55af 5679 past them. */
28703c16 5680 if (RE_TRANSLATE_P (translate)
02cb78b5 5681 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5682 : memcmp (d, d2, mcnt))
99633e97
SM
5683 {
5684 d = dfail;
5685 goto fail;
5686 }
fa9a63c5 5687 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5688 }
5689 }
5690 break;
5691
5692
25fe55af 5693 /* begline matches the empty string at the beginning of the string
c0f9ea08 5694 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5695 case begline:
25fe55af 5696 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5697
25fe55af
RS
5698 if (AT_STRINGS_BEG (d))
5699 {
5700 if (!bufp->not_bol) break;
5701 }
419d1c74 5702 else
25fe55af 5703 {
bf216479 5704 unsigned c;
419d1c74 5705 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5706 if (c == '\n')
419d1c74 5707 break;
25fe55af
RS
5708 }
5709 /* In all other cases, we fail. */
5710 goto fail;
fa9a63c5
RM
5711
5712
25fe55af 5713 /* endline is the dual of begline. */
fa9a63c5 5714 case endline:
25fe55af 5715 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5716
25fe55af
RS
5717 if (AT_STRINGS_END (d))
5718 {
5719 if (!bufp->not_eol) break;
5720 }
f1ad044f 5721 else
25fe55af 5722 {
f1ad044f 5723 PREFETCH_NOLIMIT ();
c0f9ea08 5724 if (*d == '\n')
f1ad044f 5725 break;
25fe55af
RS
5726 }
5727 goto fail;
fa9a63c5
RM
5728
5729
5730 /* Match at the very beginning of the data. */
25fe55af
RS
5731 case begbuf:
5732 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5733 if (AT_STRINGS_BEG (d))
5734 break;
5735 goto fail;
fa9a63c5
RM
5736
5737
5738 /* Match at the very end of the data. */
25fe55af
RS
5739 case endbuf:
5740 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5741 if (AT_STRINGS_END (d))
5742 break;
25fe55af 5743 goto fail;
5e69f11e 5744
5e69f11e 5745
25fe55af
RS
5746 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5747 pushes NULL as the value for the string on the stack. Then
505bde11 5748 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5749 string, instead of restoring it. To see why, consider
7814e705 5750 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5751 then the . fails against the \n. But the next thing we want
5752 to do is match the \n against the \n; if we restored the
5753 string value, we would be back at the foo.
5754
5755 Because this is used only in specific cases, we don't need to
5756 check all the things that `on_failure_jump' does, to make
5757 sure the right things get saved on the stack. Hence we don't
5758 share its code. The only reason to push anything on the
5759 stack at all is that otherwise we would have to change
5760 `anychar's code to do something besides goto fail in this
5761 case; that seems worse than this. */
5762 case on_failure_keep_string_jump:
505bde11
SM
5763 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5764 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5765 mcnt, p + mcnt);
fa9a63c5 5766
505bde11
SM
5767 PUSH_FAILURE_POINT (p - 3, NULL);
5768 break;
5769
0683b6fa
SM
5770 /* A nasty loop is introduced by the non-greedy *? and +?.
5771 With such loops, the stack only ever contains one failure point
5772 at a time, so that a plain on_failure_jump_loop kind of
5773 cycle detection cannot work. Worse yet, such a detection
5774 can not only fail to detect a cycle, but it can also wrongly
5775 detect a cycle (between different instantiations of the same
6df42991 5776 loop).
0683b6fa
SM
5777 So the method used for those nasty loops is a little different:
5778 We use a special cycle-detection-stack-frame which is pushed
5779 when the on_failure_jump_nastyloop failure-point is *popped*.
5780 This special frame thus marks the beginning of one iteration
5781 through the loop and we can hence easily check right here
5782 whether something matched between the beginning and the end of
5783 the loop. */
5784 case on_failure_jump_nastyloop:
5785 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5786 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5787 mcnt, p + mcnt);
5788
5789 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5790 {
5791 int cycle = 0;
5792 CHECK_INFINITE_LOOP (p - 4, d);
5793 if (!cycle)
5794 /* If there's a cycle, just continue without pushing
5795 this failure point. The failure point is the "try again"
5796 option, which shouldn't be tried.
5797 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5798 PUSH_FAILURE_POINT (p - 3, d);
5799 }
0683b6fa
SM
5800 break;
5801
4e8a9132
SM
5802 /* Simple loop detecting on_failure_jump: just check on the
5803 failure stack if the same spot was already hit earlier. */
505bde11
SM
5804 case on_failure_jump_loop:
5805 on_failure:
5806 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5807 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5808 mcnt, p + mcnt);
6df42991
SM
5809 {
5810 int cycle = 0;
5811 CHECK_INFINITE_LOOP (p - 3, d);
5812 if (cycle)
5813 /* If there's a cycle, get out of the loop, as if the matching
5814 had failed. We used to just `goto fail' here, but that was
5815 aborting the search a bit too early: we want to keep the
5816 empty-loop-match and keep matching after the loop.
5817 We want (x?)*y\1z to match both xxyz and xxyxz. */
5818 p += mcnt;
5819 else
5820 PUSH_FAILURE_POINT (p - 3, d);
5821 }
25fe55af 5822 break;
fa9a63c5
RM
5823
5824
5825 /* Uses of on_failure_jump:
5e69f11e 5826
25fe55af
RS
5827 Each alternative starts with an on_failure_jump that points
5828 to the beginning of the next alternative. Each alternative
5829 except the last ends with a jump that in effect jumps past
5830 the rest of the alternatives. (They really jump to the
5831 ending jump of the following alternative, because tensioning
5832 these jumps is a hassle.)
fa9a63c5 5833
25fe55af
RS
5834 Repeats start with an on_failure_jump that points past both
5835 the repetition text and either the following jump or
5836 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5837 case on_failure_jump:
25fe55af 5838 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5839 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5840 mcnt, p + mcnt);
25fe55af 5841
505bde11 5842 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5843 break;
5844
4e8a9132 5845 /* This operation is used for greedy *.
505bde11
SM
5846 Compare the beginning of the repeat with what in the
5847 pattern follows its end. If we can establish that there
5848 is nothing that they would both match, i.e., that we
5849 would have to backtrack because of (as in, e.g., `a*a')
5850 then we can use a non-backtracking loop based on
4e8a9132 5851 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5852 case on_failure_jump_smart:
25fe55af 5853 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5854 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5855 mcnt, p + mcnt);
25fe55af 5856 {
01618498 5857 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5858 /* Here, we discard `const', making re_match non-reentrant. */
5859 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5860 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5861
505bde11
SM
5862 p -= 3; /* Reset so that we will re-execute the
5863 instruction once it's been changed. */
fa9a63c5 5864
4e8a9132
SM
5865 EXTRACT_NUMBER (mcnt, p2 - 2);
5866
5867 /* Ensure this is indeed the trivial kind of loop
5868 we are expecting. */
5869 assert (skip_one_char (p1) == p2 - 3);
5870 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5871 DEBUG_STATEMENT (debug += 2);
505bde11 5872 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5873 {
505bde11 5874 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5875 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5876 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5877 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5878 }
505bde11 5879 else
fa9a63c5 5880 {
505bde11
SM
5881 /* Default to a safe `on_failure_jump' loop. */
5882 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5883 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5884 }
99633e97 5885 DEBUG_STATEMENT (debug -= 2);
25fe55af 5886 }
505bde11 5887 break;
25fe55af
RS
5888
5889 /* Unconditionally jump (without popping any failure points). */
5890 case jump:
fa9a63c5 5891 unconditional_jump:
5b370c2b 5892 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5893 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5894 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5895 p += mcnt; /* Do the jump. */
505bde11 5896 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5897 break;
5898
5899
25fe55af
RS
5900 /* Have to succeed matching what follows at least n times.
5901 After that, handle like `on_failure_jump'. */
5902 case succeed_n:
01618498 5903 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5904 EXTRACT_NUMBER (mcnt, p + 2);
5905 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5906
dc1e502d
SM
5907 /* Originally, mcnt is how many times we HAVE to succeed. */
5908 if (mcnt != 0)
25fe55af 5909 {
6dcf2d0e
SM
5910 /* Here, we discard `const', making re_match non-reentrant. */
5911 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5912 mcnt--;
01618498
SM
5913 p += 4;
5914 PUSH_NUMBER (p2, mcnt);
25fe55af 5915 }
dc1e502d
SM
5916 else
5917 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5918 goto on_failure;
25fe55af
RS
5919 break;
5920
5921 case jump_n:
01618498 5922 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5923 EXTRACT_NUMBER (mcnt, p + 2);
5924 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5925
5926 /* Originally, this is how many times we CAN jump. */
dc1e502d 5927 if (mcnt != 0)
25fe55af 5928 {
6dcf2d0e
SM
5929 /* Here, we discard `const', making re_match non-reentrant. */
5930 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5931 mcnt--;
01618498 5932 PUSH_NUMBER (p2, mcnt);
dc1e502d 5933 goto unconditional_jump;
25fe55af
RS
5934 }
5935 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5936 else
5937 p += 4;
25fe55af 5938 break;
5e69f11e 5939
fa9a63c5
RM
5940 case set_number_at:
5941 {
01618498 5942 unsigned char *p2; /* Location of the counter. */
25fe55af 5943 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5944
25fe55af 5945 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5946 /* Here, we discard `const', making re_match non-reentrant. */
5947 p2 = (unsigned char*) p + mcnt;
01618498 5948 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5949 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5950 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5951 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5952 break;
5953 }
9121ca40
KH
5954
5955 case wordbound:
66f0296e 5956 case notwordbound:
19ed5445
PE
5957 {
5958 boolean not = (re_opcode_t) *(p - 1) == notwordbound;
5959 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5960
19ed5445 5961 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5962
19ed5445
PE
5963 /* Case 1: D is at the beginning or the end of string. */
5964 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5965 not = !not;
5966 else
5967 {
5968 /* C1 is the character before D, S1 is the syntax of C1, C2
5969 is the character at D, and S2 is the syntax of C2. */
5970 re_wchar_t c1, c2;
5971 int s1, s2;
5972 int dummy;
b18215fc 5973#ifdef emacs
d1dfb56c
EZ
5974 ssize_t offset = PTR_TO_OFFSET (d - 1);
5975 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445 5976 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5977#endif
19ed5445
PE
5978 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5979 s1 = SYNTAX (c1);
b18215fc 5980#ifdef emacs
19ed5445 5981 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5982#endif
19ed5445
PE
5983 PREFETCH_NOLIMIT ();
5984 GET_CHAR_AFTER (c2, d, dummy);
5985 s2 = SYNTAX (c2);
5986
5987 if (/* Case 2: Only one of S1 and S2 is Sword. */
5988 ((s1 == Sword) != (s2 == Sword))
5989 /* Case 3: Both of S1 and S2 are Sword, and macro
5990 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5991 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5992 not = !not;
5993 }
5994 if (not)
5995 break;
5996 else
5997 goto fail;
5998 }
fa9a63c5
RM
5999
6000 case wordbeg:
25fe55af 6001 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6002
b18215fc
RS
6003 /* We FAIL in one of the following cases: */
6004
7814e705 6005 /* Case 1: D is at the end of string. */
b18215fc 6006 if (AT_STRINGS_END (d))
99633e97 6007 goto fail;
b18215fc
RS
6008 else
6009 {
6010 /* C1 is the character before D, S1 is the syntax of C1, C2
6011 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6012 re_wchar_t c1, c2;
6013 int s1, s2;
bf216479 6014 int dummy;
fa9a63c5 6015#ifdef emacs
d1dfb56c
EZ
6016 ssize_t offset = PTR_TO_OFFSET (d);
6017 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6018 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6019#endif
99633e97 6020 PREFETCH ();
6fdd04b0 6021 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6022 s2 = SYNTAX (c2);
177c0ea7 6023
b18215fc
RS
6024 /* Case 2: S2 is not Sword. */
6025 if (s2 != Sword)
6026 goto fail;
6027
6028 /* Case 3: D is not at the beginning of string ... */
6029 if (!AT_STRINGS_BEG (d))
6030 {
6031 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6032#ifdef emacs
5d967c7a 6033 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6034#endif
b18215fc
RS
6035 s1 = SYNTAX (c1);
6036
6037 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6038 returns 0. */
b18215fc
RS
6039 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6040 goto fail;
6041 }
6042 }
e318085a
RS
6043 break;
6044
b18215fc 6045 case wordend:
25fe55af 6046 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6047
6048 /* We FAIL in one of the following cases: */
6049
6050 /* Case 1: D is at the beginning of string. */
6051 if (AT_STRINGS_BEG (d))
e318085a 6052 goto fail;
b18215fc
RS
6053 else
6054 {
6055 /* C1 is the character before D, S1 is the syntax of C1, C2
6056 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6057 re_wchar_t c1, c2;
6058 int s1, s2;
bf216479 6059 int dummy;
5d967c7a 6060#ifdef emacs
d1dfb56c
EZ
6061 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6062 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6063 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6064#endif
99633e97 6065 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6066 s1 = SYNTAX (c1);
6067
6068 /* Case 2: S1 is not Sword. */
6069 if (s1 != Sword)
6070 goto fail;
6071
6072 /* Case 3: D is not at the end of string ... */
6073 if (!AT_STRINGS_END (d))
6074 {
f1ad044f 6075 PREFETCH_NOLIMIT ();
6fdd04b0 6076 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6077#ifdef emacs
6078 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6079#endif
b18215fc
RS
6080 s2 = SYNTAX (c2);
6081
6082 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6083 returns 0. */
b18215fc 6084 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6085 goto fail;
b18215fc
RS
6086 }
6087 }
e318085a
RS
6088 break;
6089
669fa600
SM
6090 case symbeg:
6091 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6092
6093 /* We FAIL in one of the following cases: */
6094
7814e705 6095 /* Case 1: D is at the end of string. */
669fa600
SM
6096 if (AT_STRINGS_END (d))
6097 goto fail;
6098 else
6099 {
6100 /* C1 is the character before D, S1 is the syntax of C1, C2
6101 is the character at D, and S2 is the syntax of C2. */
6102 re_wchar_t c1, c2;
6103 int s1, s2;
6104#ifdef emacs
d1dfb56c
EZ
6105 ssize_t offset = PTR_TO_OFFSET (d);
6106 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6107 UPDATE_SYNTAX_TABLE (charpos);
6108#endif
6109 PREFETCH ();
62a6e103 6110 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6111 s2 = SYNTAX (c2);
7814e705 6112
669fa600
SM
6113 /* Case 2: S2 is neither Sword nor Ssymbol. */
6114 if (s2 != Sword && s2 != Ssymbol)
6115 goto fail;
6116
6117 /* Case 3: D is not at the beginning of string ... */
6118 if (!AT_STRINGS_BEG (d))
6119 {
6120 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6121#ifdef emacs
6122 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6123#endif
6124 s1 = SYNTAX (c1);
6125
6126 /* ... and S1 is Sword or Ssymbol. */
6127 if (s1 == Sword || s1 == Ssymbol)
6128 goto fail;
6129 }
6130 }
6131 break;
6132
6133 case symend:
6134 DEBUG_PRINT1 ("EXECUTING symend.\n");
6135
6136 /* We FAIL in one of the following cases: */
6137
6138 /* Case 1: D is at the beginning of string. */
6139 if (AT_STRINGS_BEG (d))
6140 goto fail;
6141 else
6142 {
6143 /* C1 is the character before D, S1 is the syntax of C1, C2
6144 is the character at D, and S2 is the syntax of C2. */
6145 re_wchar_t c1, c2;
6146 int s1, s2;
6147#ifdef emacs
d1dfb56c
EZ
6148 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6149 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
669fa600
SM
6150 UPDATE_SYNTAX_TABLE (charpos);
6151#endif
6152 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6153 s1 = SYNTAX (c1);
6154
6155 /* Case 2: S1 is neither Ssymbol nor Sword. */
6156 if (s1 != Sword && s1 != Ssymbol)
6157 goto fail;
6158
6159 /* Case 3: D is not at the end of string ... */
6160 if (!AT_STRINGS_END (d))
6161 {
6162 PREFETCH_NOLIMIT ();
62a6e103 6163 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6164#ifdef emacs
134579f2 6165 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6166#endif
6167 s2 = SYNTAX (c2);
6168
6169 /* ... and S2 is Sword or Ssymbol. */
6170 if (s2 == Sword || s2 == Ssymbol)
6171 goto fail;
b18215fc
RS
6172 }
6173 }
e318085a
RS
6174 break;
6175
fa9a63c5 6176 case syntaxspec:
1fb352e0 6177 case notsyntaxspec:
b18215fc 6178 {
19ed5445
PE
6179 boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6180 mcnt = *p++;
6181 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
6182 PREFETCH ();
6183#ifdef emacs
6184 {
d1dfb56c
EZ
6185 ssize_t offset = PTR_TO_OFFSET (d);
6186 ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
19ed5445
PE
6187 UPDATE_SYNTAX_TABLE (pos1);
6188 }
25fe55af 6189#endif
19ed5445
PE
6190 {
6191 int len;
6192 re_wchar_t c;
b18215fc 6193
19ed5445
PE
6194 GET_CHAR_AFTER (c, d, len);
6195 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6196 goto fail;
6197 d += len;
6198 }
b18215fc 6199 }
8fb31792 6200 break;
fa9a63c5 6201
b18215fc 6202#ifdef emacs
1fb352e0
SM
6203 case before_dot:
6204 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6205 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6206 goto fail;
b18215fc
RS
6207 break;
6208
1fb352e0
SM
6209 case at_dot:
6210 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6211 if (PTR_BYTE_POS (d) != PT_BYTE)
6212 goto fail;
6213 break;
b18215fc 6214
1fb352e0
SM
6215 case after_dot:
6216 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6217 if (PTR_BYTE_POS (d) <= PT_BYTE)
6218 goto fail;
e318085a 6219 break;
fa9a63c5 6220
1fb352e0 6221 case categoryspec:
b18215fc 6222 case notcategoryspec:
b18215fc 6223 {
8fb31792
PE
6224 boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6225 mcnt = *p++;
6226 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n",
6227 not?"not":"", mcnt);
6228 PREFETCH ();
01618498 6229
8fb31792
PE
6230 {
6231 int len;
6232 re_wchar_t c;
6233 GET_CHAR_AFTER (c, d, len);
6234 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6235 goto fail;
6236 d += len;
6237 }
b18215fc 6238 }
fa9a63c5 6239 break;
5e69f11e 6240
1fb352e0 6241#endif /* emacs */
5e69f11e 6242
0b32bf0e
SM
6243 default:
6244 abort ();
fa9a63c5 6245 }
b18215fc 6246 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6247
6248
6249 /* We goto here if a matching operation fails. */
6250 fail:
5b370c2b 6251 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6252 if (!FAIL_STACK_EMPTY ())
505bde11 6253 {
01618498 6254 re_char *str, *pat;
505bde11 6255 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6256 DEBUG_PRINT1 ("\nFAIL:\n");
6257 POP_FAILURE_POINT (str, pat);
505bde11
SM
6258 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6259 {
6260 case on_failure_keep_string_jump:
6261 assert (str == NULL);
6262 goto continue_failure_jump;
6263
0683b6fa
SM
6264 case on_failure_jump_nastyloop:
6265 assert ((re_opcode_t)pat[-2] == no_op);
6266 PUSH_FAILURE_POINT (pat - 2, str);
6267 /* Fallthrough */
6268
505bde11
SM
6269 case on_failure_jump_loop:
6270 case on_failure_jump:
6271 case succeed_n:
6272 d = str;
6273 continue_failure_jump:
6274 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6275 p = pat + mcnt;
6276 break;
b18215fc 6277
0683b6fa
SM
6278 case no_op:
6279 /* A special frame used for nastyloops. */
6280 goto fail;
6281
505bde11 6282 default:
5e617bc2 6283 abort ();
505bde11 6284 }
fa9a63c5 6285
505bde11 6286 assert (p >= bufp->buffer && p <= pend);
b18215fc 6287
0b32bf0e 6288 if (d >= string1 && d <= end1)
fa9a63c5 6289 dend = end_match_1;
0b32bf0e 6290 }
fa9a63c5 6291 else
0b32bf0e 6292 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6293 } /* for (;;) */
6294
6295 if (best_regs_set)
6296 goto restore_best_regs;
6297
6298 FREE_VARIABLES ();
6299
b18215fc 6300 return -1; /* Failure to match. */
fa9a63c5
RM
6301} /* re_match_2 */
6302\f
6303/* Subroutine definitions for re_match_2. */
6304
fa9a63c5
RM
6305/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6306 bytes; nonzero otherwise. */
5e69f11e 6307
fa9a63c5 6308static int
d1dfb56c 6309bcmp_translate (const re_char *s1, const re_char *s2, register ssize_t len,
438105ed 6310 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6311{
2d1675e4
SM
6312 register re_char *p1 = s1, *p2 = s2;
6313 re_char *p1_end = s1 + len;
6314 re_char *p2_end = s2 + len;
e934739e 6315
4bb91c68
SM
6316 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6317 different lengths, but relying on a single `len' would break this. -sm */
6318 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6319 {
e934739e 6320 int p1_charlen, p2_charlen;
01618498 6321 re_wchar_t p1_ch, p2_ch;
e934739e 6322
6fdd04b0
KH
6323 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6324 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6325
6326 if (RE_TRANSLATE (translate, p1_ch)
6327 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6328 return 1;
e934739e
RS
6329
6330 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6331 }
e934739e
RS
6332
6333 if (p1 != p1_end || p2 != p2_end)
6334 return 1;
6335
fa9a63c5
RM
6336 return 0;
6337}
6338\f
6339/* Entry points for GNU code. */
6340
6341/* re_compile_pattern is the GNU regular expression compiler: it
6342 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6343 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6344
fa9a63c5
RM
6345 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6346 are set in BUFP on entry.
5e69f11e 6347
b18215fc 6348 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6349
6350const char *
d1dfb56c
EZ
6351re_compile_pattern (const char *pattern, size_t length,
6352 struct re_pattern_buffer *bufp)
fa9a63c5
RM
6353{
6354 reg_errcode_t ret;
5e69f11e 6355
fa9a63c5
RM
6356 /* GNU code is written to assume at least RE_NREGS registers will be set
6357 (and at least one extra will be -1). */
6358 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6359
fa9a63c5
RM
6360 /* And GNU code determines whether or not to get register information
6361 by passing null for the REGS argument to re_match, etc., not by
6362 setting no_sub. */
6363 bufp->no_sub = 0;
5e69f11e 6364
4bb91c68 6365 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6366
6367 if (!ret)
6368 return NULL;
6369 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6370}
c0f9ea08 6371WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6372\f
b18215fc
RS
6373/* Entry points compatible with 4.2 BSD regex library. We don't define
6374 them unless specifically requested. */
fa9a63c5 6375
0b32bf0e 6376#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6377
6378/* BSD has one and only one pattern buffer. */
6379static struct re_pattern_buffer re_comp_buf;
6380
6381char *
0b32bf0e 6382# ifdef _LIBC
48afdd44
RM
6383/* Make these definitions weak in libc, so POSIX programs can redefine
6384 these names if they don't use our functions, and still use
6385 regcomp/regexec below without link errors. */
6386weak_function
0b32bf0e 6387# endif
31011111 6388re_comp (const char *s)
fa9a63c5
RM
6389{
6390 reg_errcode_t ret;
5e69f11e 6391
fa9a63c5
RM
6392 if (!s)
6393 {
6394 if (!re_comp_buf.buffer)
0b32bf0e 6395 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6396 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6397 return 0;
6398 }
6399
6400 if (!re_comp_buf.buffer)
6401 {
6402 re_comp_buf.buffer = (unsigned char *) malloc (200);
6403 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6404 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6405 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6406 re_comp_buf.allocated = 200;
6407
6408 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6409 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6410 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6411 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6412 }
6413
6414 /* Since `re_exec' always passes NULL for the `regs' argument, we
6415 don't need to initialize the pattern buffer fields which affect it. */
6416
fa9a63c5 6417 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6418
fa9a63c5
RM
6419 if (!ret)
6420 return NULL;
6421
6422 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6423 return (char *) gettext (re_error_msgid[(int) ret]);
6424}
6425
6426
31011111 6427int
0b32bf0e 6428# ifdef _LIBC
48afdd44 6429weak_function
0b32bf0e 6430# endif
d1dfb56c 6431re_exec (const char *s)
fa9a63c5 6432{
d1dfb56c 6433 const size_t len = strlen (s);
fa9a63c5
RM
6434 return
6435 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6436}
6437#endif /* _REGEX_RE_COMP */
6438\f
6439/* POSIX.2 functions. Don't define these for Emacs. */
6440
6441#ifndef emacs
6442
6443/* regcomp takes a regular expression as a string and compiles it.
6444
b18215fc 6445 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6446 since POSIX says we shouldn't. Thus, we set
6447
6448 `buffer' to the compiled pattern;
6449 `used' to the length of the compiled pattern;
6450 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6451 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6452 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6453 `fastmap' to an allocated space for the fastmap;
6454 `fastmap_accurate' to zero;
fa9a63c5
RM
6455 `re_nsub' to the number of subexpressions in PATTERN.
6456
6457 PATTERN is the address of the pattern string.
6458
6459 CFLAGS is a series of bits which affect compilation.
6460
6461 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6462 use POSIX basic syntax.
6463
6464 If REG_NEWLINE is set, then . and [^...] don't match newline.
6465 Also, regexec will try a match beginning after every newline.
6466
6467 If REG_ICASE is set, then we consider upper- and lowercase
6468 versions of letters to be equivalent when matching.
6469
6470 If REG_NOSUB is set, then when PREG is passed to regexec, that
6471 routine will report only success or failure, and nothing about the
6472 registers.
6473
b18215fc 6474 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6475 the return codes and their meanings.) */
6476
d1dfb56c 6477reg_errcode_t
d2762c86
DN
6478regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6479 int cflags)
fa9a63c5
RM
6480{
6481 reg_errcode_t ret;
4bb91c68 6482 reg_syntax_t syntax
fa9a63c5
RM
6483 = (cflags & REG_EXTENDED) ?
6484 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6485
6486 /* regex_compile will allocate the space for the compiled pattern. */
6487 preg->buffer = 0;
6488 preg->allocated = 0;
6489 preg->used = 0;
5e69f11e 6490
c0f9ea08
SM
6491 /* Try to allocate space for the fastmap. */
6492 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6493
fa9a63c5
RM
6494 if (cflags & REG_ICASE)
6495 {
6496 unsigned i;
5e69f11e 6497
6676cb1c
RS
6498 preg->translate
6499 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6500 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6501 if (preg->translate == NULL)
0b32bf0e 6502 return (int) REG_ESPACE;
fa9a63c5
RM
6503
6504 /* Map uppercase characters to corresponding lowercase ones. */
6505 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6506 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6507 }
6508 else
6509 preg->translate = NULL;
6510
6511 /* If REG_NEWLINE is set, newlines are treated differently. */
6512 if (cflags & REG_NEWLINE)
6513 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6514 syntax &= ~RE_DOT_NEWLINE;
6515 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6516 }
6517 else
c0f9ea08 6518 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6519
6520 preg->no_sub = !!(cflags & REG_NOSUB);
6521
5e69f11e 6522 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6523 can use strlen here in compiling the pattern. */
4bb91c68 6524 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6525
fa9a63c5
RM
6526 /* POSIX doesn't distinguish between an unmatched open-group and an
6527 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6528 if (ret == REG_ERPAREN)
6529 ret = REG_EPAREN;
6530
6531 if (ret == REG_NOERROR && preg->fastmap)
6532 { /* Compute the fastmap now, since regexec cannot modify the pattern
6533 buffer. */
6534 re_compile_fastmap (preg);
6535 if (preg->can_be_null)
6536 { /* The fastmap can't be used anyway. */
6537 free (preg->fastmap);
6538 preg->fastmap = NULL;
6539 }
6540 }
d1dfb56c 6541 return ret;
fa9a63c5 6542}
c0f9ea08 6543WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6544
6545
6546/* regexec searches for a given pattern, specified by PREG, in the
6547 string STRING.
5e69f11e 6548
fa9a63c5 6549 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6550 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6551 least NMATCH elements, and we set them to the offsets of the
6552 corresponding matched substrings.
5e69f11e 6553
fa9a63c5
RM
6554 EFLAGS specifies `execution flags' which affect matching: if
6555 REG_NOTBOL is set, then ^ does not match at the beginning of the
6556 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6557
fa9a63c5
RM
6558 We return 0 if we find a match and REG_NOMATCH if not. */
6559
d1dfb56c 6560reg_errcode_t
d2762c86
DN
6561regexec (const regex_t *__restrict preg, const char *__restrict string,
6562 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
fa9a63c5 6563{
31011111 6564 regoff_t ret;
fa9a63c5
RM
6565 struct re_registers regs;
6566 regex_t private_preg;
d1dfb56c 6567 size_t len = strlen (string);
c0f9ea08 6568 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6569
6570 private_preg = *preg;
5e69f11e 6571
fa9a63c5
RM
6572 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6573 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6574
fa9a63c5
RM
6575 /* The user has told us exactly how many registers to return
6576 information about, via `nmatch'. We have to pass that on to the
b18215fc 6577 matching routines. */
fa9a63c5 6578 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6579
fa9a63c5
RM
6580 if (want_reg_info)
6581 {
6582 regs.num_regs = nmatch;
4bb91c68
SM
6583 regs.start = TALLOC (nmatch * 2, regoff_t);
6584 if (regs.start == NULL)
d1dfb56c 6585 return REG_NOMATCH;
4bb91c68 6586 regs.end = regs.start + nmatch;
fa9a63c5
RM
6587 }
6588
c0f9ea08
SM
6589 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6590 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6591 was a little bit longer but still only matching the real part.
6592 This works because the `endline' will check for a '\n' and will find a
6593 '\0', correctly deciding that this is not the end of a line.
6594 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6595 a convenient '\0' there. For all we know, the string could be preceded
6596 by '\n' which would throw things off. */
6597
fa9a63c5
RM
6598 /* Perform the searching operation. */
6599 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6600 /* start: */ 0, /* range: */ len,
6601 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6602
fa9a63c5
RM
6603 /* Copy the register information to the POSIX structure. */
6604 if (want_reg_info)
6605 {
6606 if (ret >= 0)
0b32bf0e
SM
6607 {
6608 unsigned r;
fa9a63c5 6609
0b32bf0e
SM
6610 for (r = 0; r < nmatch; r++)
6611 {
6612 pmatch[r].rm_so = regs.start[r];
6613 pmatch[r].rm_eo = regs.end[r];
6614 }
6615 }
fa9a63c5 6616
b18215fc 6617 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6618 free (regs.start);
fa9a63c5
RM
6619 }
6620
6621 /* We want zero return to mean success, unlike `re_search'. */
d1dfb56c 6622 return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
fa9a63c5 6623}
c0f9ea08 6624WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6625
6626
ec869672
JR
6627/* Returns a message corresponding to an error code, ERR_CODE, returned
6628 from either regcomp or regexec. We don't use PREG here.
6629
6630 ERR_CODE was previously called ERRCODE, but that name causes an
6631 error with msvc8 compiler. */
fa9a63c5
RM
6632
6633size_t
d2762c86 6634regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6635{
6636 const char *msg;
6637 size_t msg_size;
6638
ec869672
JR
6639 if (err_code < 0
6640 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6641 /* Only error codes returned by the rest of the code should be passed
b18215fc 6642 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6643 code generates an invalid error code, then the program has a bug.
6644 Dump core so we can fix it. */
6645 abort ();
6646
ec869672 6647 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6648
6649 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6650
fa9a63c5
RM
6651 if (errbuf_size != 0)
6652 {
6653 if (msg_size > errbuf_size)
0b32bf0e
SM
6654 {
6655 strncpy (errbuf, msg, errbuf_size - 1);
6656 errbuf[errbuf_size - 1] = 0;
6657 }
fa9a63c5 6658 else
0b32bf0e 6659 strcpy (errbuf, msg);
fa9a63c5
RM
6660 }
6661
6662 return msg_size;
6663}
c0f9ea08 6664WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6665
6666
6667/* Free dynamically allocated space used by PREG. */
6668
6669void
d2762c86 6670regfree (regex_t *preg)
fa9a63c5 6671{
c2cd06e6 6672 free (preg->buffer);
fa9a63c5 6673 preg->buffer = NULL;
5e69f11e 6674
fa9a63c5
RM
6675 preg->allocated = 0;
6676 preg->used = 0;
6677
c2cd06e6 6678 free (preg->fastmap);
fa9a63c5
RM
6679 preg->fastmap = NULL;
6680 preg->fastmap_accurate = 0;
6681
c2cd06e6 6682 free (preg->translate);
fa9a63c5
RM
6683 preg->translate = NULL;
6684}
c0f9ea08 6685WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6686
6687#endif /* not emacs */