* regex.c (CHARSET_LOOKUP_RANGE_TABLE_RAW, POP_FAILURE_REG_OR_COUNT):
[bpt/emacs.git] / src / regex.c
CommitLineData
/* Extended regular expression matching and search library, version
   0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
   internationalization features.)

   Copyright (C) 1993-2011 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
   USA.  */
fa9a63c5 21
/* TODO:
   - structure the opcode space into opcode+flag.
   - merge with glibc's regex.[ch].
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
     need to modify the compiled regexp so that re_match can be reentrant.
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
     rather than at run-time, so that re_match can be reentrant.
*/
505bde11 30
fa9a63c5 31/* AIX requires this to be the first thing in the file. */
0b32bf0e 32#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
33 #pragma alloca
34#endif
35
fa9a63c5 36#ifdef HAVE_CONFIG_H
0b32bf0e 37# include <config.h>
fa9a63c5
RM
38#endif
39
4bb91c68
SM
40#if defined STDC_HEADERS && !defined emacs
41# include <stddef.h>
42#else
43/* We need this for `regex.h', and perhaps for the Emacs include files. */
44# include <sys/types.h>
45#endif
fa9a63c5 46
14473664
SM
47/* Whether to use ISO C Amendment 1 wide char functions.
48 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
49#if defined _LIBC
50#define WIDE_CHAR_SUPPORT 1
51#else
14473664 52#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
53 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
54#endif
14473664
SM
55
/* For platforms which support the ISO C Amendment 1 functionality we
   support user-defined character classes.  */
a0ad02f7 58#if WIDE_CHAR_SUPPORT
14473664
SM
59/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
60# include <wchar.h>
61# include <wctype.h>
62#endif
63
c0f9ea08
SM
64#ifdef _LIBC
65/* We have to keep the namespace clean. */
66# define regfree(preg) __regfree (preg)
67# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
68# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
69# define regerror(err_code, preg, errbuf, errbuf_size) \
70 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
71# define re_set_registers(bu, re, nu, st, en) \
72 __re_set_registers (bu, re, nu, st, en)
73# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
74 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
75# define re_match(bufp, string, size, pos, regs) \
76 __re_match (bufp, string, size, pos, regs)
77# define re_search(bufp, string, size, startpos, range, regs) \
78 __re_search (bufp, string, size, startpos, range, regs)
79# define re_compile_pattern(pattern, length, bufp) \
80 __re_compile_pattern (pattern, length, bufp)
81# define re_set_syntax(syntax) __re_set_syntax (syntax)
82# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
83 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
84# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
85
14473664
SM
86/* Make sure we call libc's function even if the user overrides them. */
87# define btowc __btowc
88# define iswctype __iswctype
89# define wctype __wctype
90
c0f9ea08
SM
91# define WEAK_ALIAS(a,b) weak_alias (a, b)
92
93/* We are also using some library internals. */
94# include <locale/localeinfo.h>
95# include <locale/elem-hash.h>
96# include <langinfo.h>
97#else
98# define WEAK_ALIAS(a,b)
99#endif
100
/* This is for other GNU distributions with internationalized messages.
   When neither HAVE_LIBINTL_H nor _LIBC is set, gettext collapses to
   the identity so message strings are used untranslated.  */
#if HAVE_LIBINTL_H || defined _LIBC
# include <libintl.h>
#else
# define gettext(msgid) (msgid)
#endif

#ifndef gettext_noop
/* This define is so xgettext can find the internationalizable
   strings.  It expands to its argument unchanged.  */
# define gettext_noop(String) String
#endif
113
fa9a63c5
RM
114/* The `emacs' switch turns on certain matching commands
115 that make sense only in Emacs. */
116#ifdef emacs
117
d7306fe6 118# include <setjmp.h>
0b32bf0e
SM
119# include "lisp.h"
120# include "buffer.h"
b18215fc
RS
121
122/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 123# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 124
0b32bf0e 125# include "syntax.h"
9117d724 126# include "character.h"
0b32bf0e 127# include "category.h"
fa9a63c5 128
7689ef0b
EZ
129# ifdef malloc
130# undef malloc
131# endif
0b32bf0e 132# define malloc xmalloc
7689ef0b
EZ
133# ifdef realloc
134# undef realloc
135# endif
0b32bf0e 136# define realloc xrealloc
7689ef0b
EZ
137# ifdef free
138# undef free
139# endif
0b32bf0e 140# define free xfree
9abbd165 141
/* Converts the pointer to the char to BEG-based offset from the start.  */
# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
/* Adds 1 to P when re_match_object is nil or a buffer, 0 otherwise.  */
# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))

/* Accessors for the `multibyte' and `target_multibyte' members of the
   pattern buffer BUFP.  */
# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)

/* Fetch the character at P: decoded with STRING_CHAR when MULTIBYTE is
   nonzero, otherwise the single byte *P.  The _AND_LENGTH variant also
   stores the byte length of that character in LEN.  */
# define RE_STRING_CHAR(p, multibyte) \
  ((multibyte) ? (STRING_CHAR (p)) : (*(p)))
# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
  ((multibyte) ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))

# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)

# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 156
6fdd04b0
KH
/* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).

   The expansion reads a variable named `target_multibyte', which must
   be in scope at the point of use.  In the multibyte case it scans
   backwards from P for a byte satisfying CHAR_HEAD_P, stopping at the
   start of whichever string P lies in, and decodes from there.  */
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
  do { \
    if (target_multibyte) \
      { \
        re_char *dtemp = (p) == (str2) ? (end1) : (p); \
        re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
        while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
        (c) = STRING_CHAR (dtemp); \
      } \
    else \
      { \
        ((c) = ((p) == (str2) ? (end1) : (p))[-1]); \
        (c) = RE_CHAR_TO_MULTIBYTE (c); \
      } \
  } while (0)
175
6fdd04b0
KH
/* Set C a (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  The expansion reads a
   variable named `target_multibyte', which must be in scope at the
   point of use.  */
# define GET_CHAR_AFTER(c, p, len) \
  do { \
    if (target_multibyte) \
      (c) = STRING_CHAR_AND_LENGTH (p, len); \
    else \
      { \
        (c) = *(p); \
        (len) = 1; \
        (c) = RE_CHAR_TO_MULTIBYTE (c); \
      } \
  } while (0)
4e8a9132 189
fa9a63c5
RM
190#else /* not emacs */
191
192/* If we are not linking with Emacs proper,
193 we can't use the relocating allocator
194 even if config.h says that we can. */
0b32bf0e 195# undef REL_ALLOC
fa9a63c5 196
4004364e 197# include <unistd.h>
fa9a63c5 198
a77f947b
CY
/* When used in Emacs's lib-src, we need xmalloc and xrealloc.  */

/* Allocate SIZE bytes with malloc and return the result.  If malloc
   returns a null pointer for a nonzero SIZE, write "virtual memory
   exhausted" to file descriptor 2 and exit with status 1.  */
void *
xmalloc (size_t size)
{
  static const char msg[] = "virtual memory exhausted\n";
  void *val = (void *) malloc (size);

  if (!val && size)
    {
      /* sizeof msg - 1 drops the terminating NUL from the byte count.  */
      write (2, msg, sizeof msg - 1);
      exit (1);
    }
  return val;
}
213
/* Resize BLOCK to SIZE bytes and return the (possibly moved) block.
   A null BLOCK is handled by calling malloc instead.  If the
   allocation returns a null pointer for a nonzero SIZE, write
   "virtual memory exhausted" to file descriptor 2 and exit with
   status 1.  */
void *
xrealloc (void *block, size_t size)
{
  static const char msg[] = "virtual memory exhausted\n";
  void *val;

  /* We must call malloc explicitly when BLOCK is 0, since some
     reallocs don't do this.  */
  if (! block)
    val = (void *) malloc (size);
  else
    val = (void *) realloc (block, size);

  if (!val && size)
    {
      /* sizeof msg - 1 drops the terminating NUL from the byte count.  */
      write (2, msg, sizeof msg - 1);
      exit (1);
    }
  return val;
}
231
a073faa6
CY
232# ifdef malloc
233# undef malloc
234# endif
235# define malloc xmalloc
236# ifdef realloc
237# undef realloc
238# endif
239# define realloc xrealloc
240
72af86bd
AS
241/* This is the normal way of making sure we have memcpy, memcmp and memset. */
242# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
243# include <string.h>
244# else
245# include <strings.h>
246# ifndef memcmp
247# define memcmp(s1, s2, n) bcmp (s1, s2, n)
0b32bf0e 248# endif
72af86bd
AS
249# ifndef memcpy
250# define memcpy(d, s, n) (bcopy (s, d, n), (d))
0b32bf0e
SM
251# endif
252# endif
fa9a63c5
RM
253
254/* Define the syntax stuff for \<, \>, etc. */
255
/* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 258
0b32bf0e 259# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 260
/* Dummy macros for non-Emacs environments.  Every character is a
   single byte here, so the multibyte helpers reduce to constants or
   plain byte accesses.  */
# define CHAR_CHARSET(c) 0
# define CHARSET_LEADING_CODE_BASE(c) 0
# define MAX_MULTIBYTE_LENGTH 1
# define RE_MULTIBYTE_P(x) 0
# define RE_TARGET_MULTIBYTE_P(x) 0
# define WORD_BOUNDARY_P(c1, c2) (0)
# define CHAR_HEAD_P(p) (1)
# define SINGLE_BYTE_CHAR_P(c) (1)
# define SAME_CHARSET_P(c1, c2) (1)
# define BYTES_BY_CHAR_HEAD(p) (1)
# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
# define STRING_CHAR(p) (*(p))
# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
# define CHAR_STRING(c, s) (*(s) = (c), 1)
# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
# define RE_CHAR_TO_MULTIBYTE(c) (c)
# define RE_CHAR_TO_UNIBYTE(c) (c)
/* Set C to the byte before P; when P is the start of STR2 the previous
   byte is the last byte of the first string, i.e. END1[-1].  */
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
  ((c) = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
/* Set C to the byte at P and LEN to 1.  */
# define GET_CHAR_AFTER(c, p, len) \
  ((c) = *(p), (len) = 1)
# define MAKE_CHAR(charset, c1, c2) (c1)
# define BYTE8_TO_CHAR(c) (c)
# define CHAR_BYTE8_P(c) (0)
# define CHAR_LEADING_CODE(c) (c)
8f924df7 288
fa9a63c5 289#endif /* not emacs */
4e8a9132
SM
290
#ifndef RE_TRANSLATE
/* Default translation: look C up in table TBL and yield the entry as
   an unsigned char.  RE_TRANSLATE_P is nonzero when TBL is non-null.  */
# define RE_TRANSLATE(TBL, C) ((unsigned char) (TBL)[C])
# define RE_TRANSLATE_P(TBL) (TBL)
#endif
fa9a63c5
RM
295\f
296/* Get the interface, including the syntax bits. */
297#include "regex.h"
298
f71b19b6
DL
299/* isalpha etc. are used for the character classes. */
300#include <ctype.h>
fa9a63c5 301
f71b19b6 302#ifdef emacs
fa9a63c5 303
/* 1 if C is an ASCII character.  */
# define IS_REAL_ASCII(c) ((c) < 0200)

/* 1 if C is a unibyte character.  */
# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))

/* The Emacs definitions should not be directly affected by locales.  */

/* In Emacs, these are only used for single-byte characters.  */
# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
# define ISCNTRL(c) ((c) < ' ')
# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
                     || ((c) >= 'a' && (c) <= 'f') \
                     || ((c) >= 'A' && (c) <= 'F'))

/* This is only used for single-byte characters.  */
# define ISBLANK(c) ((c) == ' ' || (c) == '\t')

/* The rest must handle multibyte characters.  */

/* Single-byte characters are graphic when above space and outside
   0177..0237; every other character counts as graphic.  */
# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
                    ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
                    : 1)

/* Like ISGRAPH, but space itself is printable too.  */
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
                    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
                    : 1)

/* ASCII is classified directly; anything else is alphanumeric when its
   syntax class is Sword.  */
# define ISALNUM(c) (IS_REAL_ASCII (c) \
                    ? (((c) >= 'a' && (c) <= 'z') \
                       || ((c) >= 'A' && (c) <= 'Z') \
                       || ((c) >= '0' && (c) <= '9')) \
                    : SYNTAX (c) == Sword)

# define ISALPHA(c) (IS_REAL_ASCII (c) \
                    ? (((c) >= 'a' && (c) <= 'z') \
                       || ((c) >= 'A' && (c) <= 'Z')) \
                    : SYNTAX (c) == Sword)

# define ISLOWER(c) (LOWERCASEP (c))

/* ASCII punctuation is any graphic ASCII character that is not
   alphanumeric; non-ASCII characters are punctuation when their syntax
   class is not Sword.  */
# define ISPUNCT(c) (IS_REAL_ASCII (c) \
                    ? ((c) > ' ' && (c) < 0177 \
                       && !(((c) >= 'a' && (c) <= 'z') \
                            || ((c) >= 'A' && (c) <= 'Z') \
                            || ((c) >= '0' && (c) <= '9'))) \
                    : SYNTAX (c) != Sword)

# define ISSPACE(c) (SYNTAX (c) == Swhitespace)

# define ISUPPER(c) (UPPERCASEP (c))

# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
357
358#else /* not emacs */
359
f71b19b6
DL
/* Jim Meyering writes:

   "... Some ctype macros are valid only for character codes that
   isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
   using /bin/cc or gcc but without giving an ansi option).  So, all
   ctype uses should be through macros like ISPRINT...  If
   STDC_HEADERS is defined, then autoconf has verified that the ctype
   macros don't need to be guarded with references to isascii. ...
   Defining isascii to 1 should let any compiler worth its salt
   eliminate the && through constant folding."
   Solaris defines some of these symbols so we must undefine them first.  */

# undef ISASCII
# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
#  define ISASCII(c) 1
# else
#  define ISASCII(c) isascii(c)
# endif

/* 1 if C is an ASCII character.  */
# define IS_REAL_ASCII(c) ((c) < 0200)

/* This distinction is not meaningful, except in Emacs.  */
# define ISUNIBYTE(c) 1

/* Use the library's isblank/isgraph when they exist as macros,
   otherwise fall back to equivalent hand-written tests.  */
# ifdef isblank
#  define ISBLANK(c) (ISASCII (c) && isblank (c))
# else
#  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
# endif
# ifdef isgraph
#  define ISGRAPH(c) (ISASCII (c) && isgraph (c))
# else
#  define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
# endif

# undef ISPRINT
# define ISPRINT(c) (ISASCII (c) && isprint (c))
# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
# define ISALNUM(c) (ISASCII (c) && isalnum (c))
# define ISALPHA(c) (ISASCII (c) && isalpha (c))
# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
# define ISLOWER(c) (ISASCII (c) && islower (c))
# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
# define ISSPACE(c) (ISASCII (c) && isspace (c))
# define ISUPPER(c) (ISASCII (c) && isupper (c))
# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))

# define ISWORD(c) ISALPHA(c)

# ifdef _tolower
#  define TOLOWER(c) _tolower(c)
# else
#  define TOLOWER(c) tolower(c)
# endif
415
416/* How many characters in the character set. */
417# define CHAR_SET_SIZE 256
418
0b32bf0e 419# ifdef SYNTAX_TABLE
f71b19b6 420
0b32bf0e 421extern char *re_syntax_table;
f71b19b6 422
0b32bf0e
SM
423# else /* not SYNTAX_TABLE */
424
0b32bf0e
SM
425static char re_syntax_table[CHAR_SET_SIZE];
426
427static void
d2762c86 428init_syntax_once (void)
0b32bf0e
SM
429{
430 register int c;
431 static int done = 0;
432
433 if (done)
434 return;
435
72af86bd 436 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 437
4bb91c68
SM
438 for (c = 0; c < CHAR_SET_SIZE; ++c)
439 if (ISALNUM (c))
440 re_syntax_table[c] = Sword;
fa9a63c5 441
669fa600 442 re_syntax_table['_'] = Ssymbol;
fa9a63c5 443
0b32bf0e
SM
444 done = 1;
445}
446
447# endif /* not SYNTAX_TABLE */
96cc36cc 448
4bb91c68
SM
449# define SYNTAX(c) re_syntax_table[(c)]
450
96cc36cc
RS
451#endif /* not emacs */
452\f
#ifndef NULL
# define NULL ((void *) 0)
#endif

/* We remove any previous definition of `SIGN_EXTEND_CHAR',
   since ours (we hope) works properly with all combinations of
   machines, compilers, `char' and `unsigned char' argument types.
   (Per Bothner suggested the basic approach.)  */
#undef SIGN_EXTEND_CHAR
#if __STDC__
# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
#endif
468\f
/* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
   use `alloca' instead of `malloc'.  This is because using malloc in
   re_search* or re_match* could cause memory leaks when C-g is used in
   Emacs; also, malloc is slower and causes storage fragmentation.  On
   the other hand, malloc is more portable, and easier to debug.

   Because we sometimes use alloca, some routines have to be macros,
   not functions -- `alloca'-allocated space disappears at the end of the
   function it is called in.  */

#ifdef REGEX_MALLOC

# define REGEX_ALLOCATE malloc
# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
# define REGEX_FREE free

#else /* not REGEX_MALLOC */

/* Emacs already defines alloca, sometimes.  */
# ifndef alloca

/* Make alloca work the best possible way.  */
#  ifdef __GNUC__
#   define alloca __builtin_alloca
#  else /* not __GNUC__ */
#   ifdef HAVE_ALLOCA_H
#    include <alloca.h>
#   endif /* HAVE_ALLOCA_H */
#  endif /* not __GNUC__ */

# endif /* not alloca */

# define REGEX_ALLOCATE alloca

/* Assumes a `char *destination' variable.  Allocates NSIZE fresh
   bytes and copies the first OSIZE bytes of SOURCE into them; the
   value of the expression is what memcpy returns (DESTINATION).  */
# define REGEX_REALLOCATE(source, osize, nsize) \
  (destination = (char *) alloca (nsize), \
   memcpy (destination, source, osize))

/* No need to do anything to free, after alloca.  */
# define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */

#endif /* not REGEX_MALLOC */
512
/* Define how to allocate the failure stack.  Three variants: the
   relocating allocator (r_alloc and friends, operating on a variable
   named `failure_stack_ptr'), plain malloc, or alloca.  */

#if defined REL_ALLOC && defined REGEX_MALLOC

# define REGEX_ALLOCATE_STACK(size) \
  r_alloc (&failure_stack_ptr, (size))
# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
  r_re_alloc (&failure_stack_ptr, (nsize))
# define REGEX_FREE_STACK(ptr) \
  r_alloc_free (&failure_stack_ptr)

#else /* not using relocating allocator */

# ifdef REGEX_MALLOC

#  define REGEX_ALLOCATE_STACK malloc
#  define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
#  define REGEX_FREE_STACK free

# else /* not REGEX_MALLOC */

#  define REGEX_ALLOCATE_STACK alloca

#  define REGEX_REALLOCATE_STACK(source, osize, nsize) \
   REGEX_REALLOCATE (source, osize, nsize)
/* No need to explicitly free anything.  */
#  define REGEX_FREE_STACK(arg) ((void)0)

# endif /* not REGEX_MALLOC */
#endif /* not using relocating allocator */
fa9a63c5
RM
543
544
/* True if `size1' is non-NULL and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  The expansion reads the variables `size1' and
   `string1', which must be in scope at the point of use.  */
#define FIRST_STRING_P(ptr) \
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
550
/* (Re)Allocate N items of type T using malloc, or fail.
   TALLOC yields a fresh block; RETALLOC resizes ADDR in place (ADDR is
   assigned the result); RETALLOC_IF resizes ADDR when it is non-null
   and otherwise allocates it; REGEX_TALLOC goes through REGEX_ALLOCATE
   instead of malloc.  */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc ((addr), (n) * sizeof (t)))
/* Wrapped in do/while so it behaves as a single statement wherever it
   is followed by a semicolon.  */
#define RETALLOC_IF(addr, n, t) \
  do { \
    if (addr) \
      RETALLOC ((addr), (n), t); \
    else \
      (addr) = TALLOC ((n), t); \
  } while (0)
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
557
#define BYTEWIDTH 8 /* In bits.  */

/* Nonzero when the strings S1 and S2 compare equal with strcmp.  */
#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))

/* Note: each of MAX and MIN evaluates one of its arguments twice.  */
#undef MAX
#undef MIN
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
566
66f0296e
SM
/* Type of source-pattern and string chars.  */
typedef const unsigned char re_char;

/* Local boolean type and its two values.  */
typedef char boolean;
#define false 0
#define true 1
573
4bb91c68
SM
574static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
575 re_char *string1, int size1,
576 re_char *string2, int size2,
577 int pos,
578 struct re_registers *regs,
579 int stop));
fa9a63c5
RM
580\f
/* These are the command codes that appear in compiled regular
   expressions.  Some opcodes are followed by argument bytes.  A
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

  /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one char belonging to specified set.  First
     following byte is number of bitmap bytes.  Then come bytes
     for a bitmap saying which chars are in.  Bits in each byte
     are ordered low-bit-first.  A character is in the set if its
     bit is 1.  A character too large to have a bit in the map is
     automatically not in the set.

     If the length byte has the 0x80 bit set, then that stuff
     is followed by a range table:
         2 bytes of flags for character sets (low 8 bits, high 8 bits)
             See RANGE_TABLE_WORK_BITS below.
         2 bytes, the number of pairs that follow (up to 32767)
         pairs, each 2 multibyte characters,
             each multibyte character represented as 3 bytes.  */
  charset,

  /* Same parameters as charset, but match any character that is
     not one of those specified.  */
  charset_not,

  /* Start remembering the text that is matched, for storing in a
     register.  Followed by one byte with the register number, in
     the range 0 to one less than the pattern buffer's re_nsub
     field.  */
  start_memory,

  /* Stop remembering the text that is matched and store it in a
     memory register.  Followed by one byte with the register
     number, in the range 0 to one less than `re_nsub' in the
     pattern buffer.  */
  stop_memory,

  /* Match a duplicate of something remembered.  Followed by one
     byte containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning
     of string to be matched (if not).  */
  begbuf,

  /* Analogously, for end of buffer/string.  */
  endbuf,

  /* Followed by two byte relative address to which to jump.  */
  jump,

  /* Followed by two-byte relative address of place to resume at
     in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but pushes a placeholder instead of the
     current string position when executed.  */
  on_failure_keep_string_jump,

  /* Just like `on_failure_jump', except that it checks that we
     don't get stuck in an infinite loop (matching an empty string
     indefinitely).  */
  on_failure_jump_loop,

  /* Just like `on_failure_jump_loop', except that it checks for
     a different kind of loop (the kind that shows up with non-greedy
     operators).  This operation has to be immediately preceded
     by a `no_op'.  */
  on_failure_jump_nastyloop,

  /* A smart `on_failure_jump' used for greedy * and + operators.
     It analyses the loop before which it is put and if the
     loop does not require backtracking, it changes itself to
     `on_failure_keep_string_jump' and short-circuits the loop,
     else it just defaults to changing itself into `on_failure_jump'.
     It assumes that it is pointing to just past a `jump'.  */
  on_failure_jump_smart,

  /* Followed by two-byte relative address and two-byte number n.
     After matching N times, jump to the address upon failure.
     Does not work if N starts at 0: use on_failure_jump_loop
     instead.  */
  succeed_n,

  /* Followed by two-byte relative address, and two-byte number n.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the
     subsequent two-byte number.  The address *includes* the two
     bytes of number.  */
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound,	/* Succeeds if not at a word boundary.  */

  symbeg,	/* Succeeds if at symbol beginning.  */
  symend,	/* Succeeds if at symbol end.  */

  /* Matches any character whose syntax is specified.  Followed by
     a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Matches any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
725\f
726/* Common operations on the compiled pattern. */
727
/* Store NUMBER in two contiguous bytes starting at DESTINATION.
   The low-order byte goes first.  */

#define STORE_NUMBER(destination, number) \
  do { \
    (destination)[0] = (number) & 0377; \
    (destination)[1] = (number) >> 8; \
  } while (0)

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number) \
  do { \
    STORE_NUMBER (destination, number); \
    (destination) += 2; \
  } while (0)
745
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte is the low-order byte; the second byte is
   sign-extended, so the result is in the range -32768..32767.  */

#define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source) & 0377; \
    /* Multiply rather than shift: the sign-extended byte may be \
       negative, and left-shifting a negative value is undefined.  */ \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8); \
  } while (0)
754
#ifdef DEBUG
/* Function form of EXTRACT_NUMBER: store in *DEST the number held in
   the two bytes at SOURCE (low-order byte first, second byte
   sign-extended).  */
static void
extract_number (int *dest, re_char *source)
{
  int temp = SIGN_EXTEND_CHAR (*(source + 1));
  *dest = *source & 0377;
  /* Multiply rather than shift: TEMP may be negative, and
     left-shifting a negative value is undefined.  */
  *dest += temp * (1 << 8);
}

# ifndef EXTRACT_MACROS /* To debug the macros.  */
#  undef EXTRACT_NUMBER
#  define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
# endif /* not EXTRACT_MACROS */

#endif /* DEBUG */
773
/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source) \
  do { \
    EXTRACT_NUMBER (destination, source); \
    (source) += 2; \
  } while (0)
782
#ifdef DEBUG
/* Function form of EXTRACT_NUMBER_AND_INCR: store in *DESTINATION the
   two-byte number at *SOURCE, then advance *SOURCE past it.  */
static void
extract_number_and_incr (int *destination, re_char **source)
{
  extract_number (destination, *source);
  *source += 2;
}

# ifndef EXTRACT_MACROS
#  undef EXTRACT_NUMBER_AND_INCR
#  define EXTRACT_NUMBER_AND_INCR(dest, src) \
  extract_number_and_incr (&dest, &src)
# endif /* not EXTRACT_MACROS */

#endif /* DEBUG */
802\f
b18215fc
RS
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
   character is stored.  Therefore, DESTINATION must be an lvalue.
   The low-order byte is stored first.  */

#define STORE_CHARACTER_AND_INCR(destination, character) \
  do { \
    (destination)[0] = (character) & 0377; \
    (destination)[1] = ((character) >> 8) & 0377; \
    (destination)[2] = (character) >> 16; \
    (destination) += 3; \
  } while (0)
814
/* Put into DESTINATION a character stored in three contiguous bytes
   starting at SOURCE.  The first byte is the low-order byte.  */

#define EXTRACT_CHARACTER(destination, source) \
  do { \
    (destination) = ((source)[0] \
                     | ((source)[1] << 8) \
                     | ((source)[2] << 16)); \
  } while (0)
824
825
/* Macros for charset.  */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)

/* Return the address of range table of charset P.  This is not the
   start of the table itself, but the place just before it where the
   number of ranges is stored.  `4 +' means to skip re_opcode_t and
   size of bitmap, and the 2 bytes of flags at the start of the range
   table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p) \
  ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)

/* Test if C is listed in the bitmap of charset P.  */
#define CHARSET_LOOKUP_BITMAP(p, c) \
  ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
   && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))

/* Return the address of end of RANGE_TABLE.  COUNT is number of
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.  `* 2'
   is start of range and end of range.  `* 3' is size of each start
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count) \
  ((range_table) + (count) * 2 * 3)
857
/* Test if C is in RANGE_TABLE.  A flag NOT is negated if C is in.
   COUNT is number of ranges in RANGE_TABLE.  Each range occupies six
   bytes: a three-byte start character followed by a three-byte end
   character, both inclusive.  The scan stops at the first range that
   contains C, so NOT is negated at most once.  */
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
  do \
    { \
      re_wchar_t range_start, range_end; \
      re_char *rtp; \
      re_char *range_table_end \
        = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
 \
      for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
        { \
          EXTRACT_CHARACTER (range_start, rtp); \
          EXTRACT_CHARACTER (range_end, rtp + 3); \
 \
          if (range_start <= (c) && (c) <= range_end) \
            { \
              (not) = !(not); \
              break; \
            } \
        } \
    } \
  while (0)
881
/* Test if C is in the range table of CHARSET.  The flag NOT is
   negated if C is listed in it.  The range count is read from the
   slot that CHARSET_RANGE_TABLE points at, and the ranges that follow
   it are then searched with CHARSET_LOOKUP_RANGE_TABLE_RAW.  */
#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset)                     \
  do                                                                    \
    {                                                                   \
      /* Number of ranges in range table. */                            \
      int count;                                                        \
      re_char *range_table = CHARSET_RANGE_TABLE (charset);             \
                                                                        \
      EXTRACT_NUMBER_AND_INCR (count, range_table);                     \
      CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count);  \
    }                                                                   \
  while (0)
895\f
fa9a63c5
RM
896/* If DEBUG is defined, Regex prints many voluminous messages about what
897 it is doing (if the variable `debug' is nonzero). If linked with the
898 main program in `iregex.c', you can enter patterns and strings
899 interactively. And if linked with the main program in `main.c' and
4bb91c68 900 the other test files, you can run the already-written tests. */
fa9a63c5
RM
901
902#ifdef DEBUG
903
904/* We use standard I/O for debugging. */
0b32bf0e 905# include <stdio.h>
fa9a63c5
RM
906
907/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 908# include <assert.h>
fa9a63c5 909
99633e97 910static int debug = -100000;
fa9a63c5 911
0b32bf0e
SM
/* Each printing macro expands to a single `do { ... } while (0)'
   statement, so that `if (c) DEBUG_PRINT1 (...); else ...' pairs the
   `else' with the caller's `if' rather than with the internal test of
   `debug'.  */
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x)                                                \
  do { if (debug > 0) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2)                                           \
  do { if (debug > 0) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3)                                       \
  do { if (debug > 0) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4)                                   \
  do { if (debug > 0) printf (x1, x2, x3, x4); } while (0)
/* The P argument is accepted but not used.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                          \
  do { if (debug > 0) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                 \
  do { if (debug > 0) print_double_string (w, s1, sz1, s2, sz2); } while (0)
fa9a63c5
RM
921
922
923/* Print the fastmap in human-readable form. */
924
925void
926print_fastmap (fastmap)
927 char *fastmap;
928{
929 unsigned was_a_range = 0;
5e69f11e
RM
930 unsigned i = 0;
931
fa9a63c5
RM
932 while (i < (1 << BYTEWIDTH))
933 {
934 if (fastmap[i++])
935 {
936 was_a_range = 0;
25fe55af
RS
937 putchar (i - 1);
938 while (i < (1 << BYTEWIDTH) && fastmap[i])
939 {
940 was_a_range = 1;
941 i++;
942 }
fa9a63c5 943 if (was_a_range)
25fe55af
RS
944 {
945 printf ("-");
946 putchar (i - 1);
947 }
948 }
fa9a63c5 949 }
5e69f11e 950 putchar ('\n');
fa9a63c5
RM
951}
952
953
954/* Print a compiled pattern string in human-readable form, starting at
955 the START pointer into it and ending just before the pointer END. */
956
957void
958print_partial_compiled_pattern (start, end)
01618498
SM
959 re_char *start;
960 re_char *end;
fa9a63c5
RM
961{
962 int mcnt, mcnt2;
01618498
SM
963 re_char *p = start;
964 re_char *pend = end;
fa9a63c5
RM
965
966 if (start == NULL)
967 {
a1a052df 968 fprintf (stderr, "(null)\n");
fa9a63c5
RM
969 return;
970 }
5e69f11e 971
fa9a63c5
RM
972 /* Loop over pattern commands. */
973 while (p < pend)
974 {
a1a052df 975 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
976
977 switch ((re_opcode_t) *p++)
978 {
25fe55af 979 case no_op:
a1a052df 980 fprintf (stderr, "/no_op");
25fe55af 981 break;
fa9a63c5 982
99633e97 983 case succeed:
a1a052df 984 fprintf (stderr, "/succeed");
99633e97
SM
985 break;
986
fa9a63c5
RM
987 case exactn:
988 mcnt = *p++;
a1a052df 989 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 990 do
fa9a63c5 991 {
a1a052df 992 fprintf (stderr, "/%c", *p++);
25fe55af
RS
993 }
994 while (--mcnt);
995 break;
fa9a63c5
RM
996
997 case start_memory:
a1a052df 998 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 999 break;
fa9a63c5
RM
1000
1001 case stop_memory:
a1a052df 1002 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1003 break;
fa9a63c5
RM
1004
1005 case duplicate:
a1a052df 1006 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1007 break;
1008
1009 case anychar:
a1a052df 1010 fprintf (stderr, "/anychar");
fa9a63c5
RM
1011 break;
1012
1013 case charset:
25fe55af
RS
1014 case charset_not:
1015 {
1016 register int c, last = -100;
fa9a63c5 1017 register int in_range = 0;
99633e97
SM
1018 int length = CHARSET_BITMAP_SIZE (p - 1);
1019 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1020
a1a052df 1021 fprintf (stderr, "/charset [%s",
839966f3 1022 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1023
839966f3
KH
1024 if (p + *p >= pend)
1025 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1026
25fe55af 1027 for (c = 0; c < 256; c++)
96cc36cc 1028 if (c / 8 < length
fa9a63c5
RM
1029 && (p[1 + (c/8)] & (1 << (c % 8))))
1030 {
1031 /* Are we starting a range? */
1032 if (last + 1 == c && ! in_range)
1033 {
a1a052df 1034 fprintf (stderr, "-");
fa9a63c5
RM
1035 in_range = 1;
1036 }
1037 /* Have we broken a range? */
1038 else if (last + 1 != c && in_range)
96cc36cc 1039 {
a1a052df 1040 fprintf (stderr, "%c", last);
fa9a63c5
RM
1041 in_range = 0;
1042 }
5e69f11e 1043
fa9a63c5 1044 if (! in_range)
a1a052df 1045 fprintf (stderr, "%c", c);
fa9a63c5
RM
1046
1047 last = c;
25fe55af 1048 }
fa9a63c5
RM
1049
1050 if (in_range)
a1a052df 1051 fprintf (stderr, "%c", last);
fa9a63c5 1052
a1a052df 1053 fprintf (stderr, "]");
fa9a63c5 1054
99633e97 1055 p += 1 + length;
96cc36cc 1056
96cc36cc 1057 if (has_range_table)
99633e97
SM
1058 {
1059 int count;
a1a052df 1060 fprintf (stderr, "has-range-table");
99633e97
SM
1061
1062 /* ??? Should print the range table; for now, just skip it. */
1063 p += 2; /* skip range table bits */
1064 EXTRACT_NUMBER_AND_INCR (count, p);
1065 p = CHARSET_RANGE_TABLE_END (p, count);
1066 }
fa9a63c5
RM
1067 }
1068 break;
1069
1070 case begline:
a1a052df 1071 fprintf (stderr, "/begline");
25fe55af 1072 break;
fa9a63c5
RM
1073
1074 case endline:
a1a052df 1075 fprintf (stderr, "/endline");
25fe55af 1076 break;
fa9a63c5
RM
1077
1078 case on_failure_jump:
25fe55af 1079 extract_number_and_incr (&mcnt, &p);
a1a052df 1080 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1081 break;
fa9a63c5
RM
1082
1083 case on_failure_keep_string_jump:
25fe55af 1084 extract_number_and_incr (&mcnt, &p);
a1a052df 1085 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1086 break;
fa9a63c5 1087
0683b6fa
SM
1088 case on_failure_jump_nastyloop:
1089 extract_number_and_incr (&mcnt, &p);
a1a052df 1090 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1091 break;
1092
505bde11 1093 case on_failure_jump_loop:
fa9a63c5 1094 extract_number_and_incr (&mcnt, &p);
a1a052df 1095 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1096 break;
1097
505bde11 1098 case on_failure_jump_smart:
fa9a63c5 1099 extract_number_and_incr (&mcnt, &p);
a1a052df 1100 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1101 break;
1102
25fe55af 1103 case jump:
fa9a63c5 1104 extract_number_and_incr (&mcnt, &p);
a1a052df 1105 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1106 break;
1107
25fe55af
RS
1108 case succeed_n:
1109 extract_number_and_incr (&mcnt, &p);
1110 extract_number_and_incr (&mcnt2, &p);
a1a052df 1111 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1112 break;
5e69f11e 1113
25fe55af
RS
1114 case jump_n:
1115 extract_number_and_incr (&mcnt, &p);
1116 extract_number_and_incr (&mcnt2, &p);
a1a052df 1117 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1118 break;
5e69f11e 1119
25fe55af
RS
1120 case set_number_at:
1121 extract_number_and_incr (&mcnt, &p);
1122 extract_number_and_incr (&mcnt2, &p);
a1a052df 1123 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1124 break;
5e69f11e 1125
25fe55af 1126 case wordbound:
a1a052df 1127 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1128 break;
1129
1130 case notwordbound:
a1a052df 1131 fprintf (stderr, "/notwordbound");
25fe55af 1132 break;
fa9a63c5
RM
1133
1134 case wordbeg:
a1a052df 1135 fprintf (stderr, "/wordbeg");
fa9a63c5 1136 break;
5e69f11e 1137
fa9a63c5 1138 case wordend:
a1a052df 1139 fprintf (stderr, "/wordend");
e2543b02 1140 break;
5e69f11e 1141
669fa600 1142 case symbeg:
e2543b02 1143 fprintf (stderr, "/symbeg");
669fa600
SM
1144 break;
1145
1146 case symend:
e2543b02 1147 fprintf (stderr, "/symend");
669fa600 1148 break;
5e69f11e 1149
1fb352e0 1150 case syntaxspec:
a1a052df 1151 fprintf (stderr, "/syntaxspec");
1fb352e0 1152 mcnt = *p++;
a1a052df 1153 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1154 break;
1155
1156 case notsyntaxspec:
a1a052df 1157 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1158 mcnt = *p++;
a1a052df 1159 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1160 break;
1161
0b32bf0e 1162# ifdef emacs
fa9a63c5 1163 case before_dot:
a1a052df 1164 fprintf (stderr, "/before_dot");
25fe55af 1165 break;
fa9a63c5
RM
1166
1167 case at_dot:
a1a052df 1168 fprintf (stderr, "/at_dot");
25fe55af 1169 break;
fa9a63c5
RM
1170
1171 case after_dot:
a1a052df 1172 fprintf (stderr, "/after_dot");
25fe55af 1173 break;
fa9a63c5 1174
1fb352e0 1175 case categoryspec:
a1a052df 1176 fprintf (stderr, "/categoryspec");
fa9a63c5 1177 mcnt = *p++;
a1a052df 1178 fprintf (stderr, "/%d", mcnt);
25fe55af 1179 break;
5e69f11e 1180
1fb352e0 1181 case notcategoryspec:
a1a052df 1182 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1183 mcnt = *p++;
a1a052df 1184 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1185 break;
0b32bf0e 1186# endif /* emacs */
fa9a63c5 1187
fa9a63c5 1188 case begbuf:
a1a052df 1189 fprintf (stderr, "/begbuf");
25fe55af 1190 break;
fa9a63c5
RM
1191
1192 case endbuf:
a1a052df 1193 fprintf (stderr, "/endbuf");
25fe55af 1194 break;
fa9a63c5 1195
25fe55af 1196 default:
a1a052df 1197 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1198 }
1199
a1a052df 1200 fprintf (stderr, "\n");
fa9a63c5
RM
1201 }
1202
a1a052df 1203 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1204}
1205
1206
1207void
1208print_compiled_pattern (bufp)
1209 struct re_pattern_buffer *bufp;
1210{
01618498 1211 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1212
1213 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1214 printf ("%ld bytes used/%ld bytes allocated.\n",
1215 bufp->used, bufp->allocated);
fa9a63c5
RM
1216
1217 if (bufp->fastmap_accurate && bufp->fastmap)
1218 {
1219 printf ("fastmap: ");
1220 print_fastmap (bufp->fastmap);
1221 }
1222
1223 printf ("re_nsub: %d\t", bufp->re_nsub);
1224 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1225 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1226 printf ("no_sub: %d\t", bufp->no_sub);
1227 printf ("not_bol: %d\t", bufp->not_bol);
1228 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1229 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1230 fflush (stdout);
fa9a63c5
RM
1231 /* Perhaps we should print the translate table? */
1232}
1233
1234
1235void
1236print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1237 re_char *where;
1238 re_char *string1;
1239 re_char *string2;
fa9a63c5
RM
1240 int size1;
1241 int size2;
1242{
4bb91c68 1243 int this_char;
5e69f11e 1244
fa9a63c5
RM
1245 if (where == NULL)
1246 printf ("(null)");
1247 else
1248 {
1249 if (FIRST_STRING_P (where))
25fe55af
RS
1250 {
1251 for (this_char = where - string1; this_char < size1; this_char++)
1252 putchar (string1[this_char]);
fa9a63c5 1253
25fe55af
RS
1254 where = string2;
1255 }
fa9a63c5
RM
1256
1257 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1258 putchar (string2[this_char]);
fa9a63c5
RM
1259 }
1260}
1261
1262#else /* not DEBUG */
1263
0b32bf0e
SM
1264# undef assert
1265# define assert(e)
fa9a63c5 1266
0b32bf0e
SM
1267# define DEBUG_STATEMENT(e)
1268# define DEBUG_PRINT1(x)
1269# define DEBUG_PRINT2(x1, x2)
1270# define DEBUG_PRINT3(x1, x2, x3)
1271# define DEBUG_PRINT4(x1, x2, x3, x4)
1272# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1273# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1274
1275#endif /* not DEBUG */
1276\f
1277/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1278 also be assigned to arbitrarily: each pattern buffer stores its own
1279 syntax, so it can be changed between regex compilations. */
1280/* This has no initializer because initialized variables in Emacs
1281 become read-only after dumping. */
1282reg_syntax_t re_syntax_options;
1283
1284
1285/* Specify the precise syntax of regexps for compilation. This provides
1286 for compatibility for various utilities which historically have
1287 different, incompatible syntaxes.
1288
1289 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1290 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1291
1292reg_syntax_t
971de7fb 1293re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1294{
1295 reg_syntax_t ret = re_syntax_options;
5e69f11e 1296
fa9a63c5
RM
1297 re_syntax_options = syntax;
1298 return ret;
1299}
c0f9ea08 1300WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1301
1302/* Regexp to use to replace spaces, or NULL meaning don't. */
1303static re_char *whitespace_regexp;
1304
1305void
971de7fb 1306re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1307{
6470ea05 1308 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1309}
1310WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1311\f
1312/* This table gives an error message for each of the error codes listed
4bb91c68 1313 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1314 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1315 but why not be nice? */
fa9a63c5
RM
1316
1317static const char *re_error_msgid[] =
5e69f11e
RM
1318 {
1319 gettext_noop ("Success"), /* REG_NOERROR */
1320 gettext_noop ("No match"), /* REG_NOMATCH */
1321 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1322 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1323 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1324 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1325 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1326 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1327 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1328 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1329 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1330 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1331 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1332 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1333 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1334 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1335 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1336 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1337 };
1338\f
4bb91c68 1339/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1340
1341/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1342 searching and matching functions should not call alloca. On some
1343 systems, alloca is implemented in terms of malloc, and if we're
1344 using the relocating allocator routines, then malloc could cause a
1345 relocation, which might (if the strings being searched are in the
1346 ralloc heap) shift the data out from underneath the regexp
1347 routines.
1348
5e69f11e 1349 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1350 processes input from X in a signal handler; processing X input may
1351 call malloc; if input arrives while a matching routine is calling
1352 malloc, then we're scrod. But Emacs can't just block input while
1353 calling matching routines; then we don't notice interrupts when
1354 they come in. So, Emacs blocks input around all regexp calls
1355 except the matching calls, which it leaves unprotected, in the
1356 faith that they will not malloc. */
1357
1358/* Normally, this is fine. */
1359#define MATCH_MAY_ALLOCATE
1360
fa9a63c5
RM
1361/* The match routines may not allocate if (1) they would do it with malloc
1362 and (2) it's not safe for them to use malloc.
1363 Note that if REL_ALLOC is defined, matching would not use malloc for the
1364 failure stack, but we would still use it for the register vectors;
4bb91c68 1365 so REL_ALLOC should not affect this. */
b588157e 1366#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1367# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1368#endif
1369
1370\f
1371/* Failure stack declarations and macros; both re_compile_fastmap and
1372 re_match_2 use a failure stack. These have to be macros because of
1373 REGEX_ALLOCATE_STACK. */
5e69f11e 1374
fa9a63c5 1375
320a2a73 1376/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1377 when matching. If this number is exceeded, we allocate more
1378 space, so it is not a hard limit. */
1379#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1380# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1381#endif
1382
1383/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1384 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1385 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1386 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1387 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1388# if defined MATCH_MAY_ALLOCATE
1389/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1390 whose default stack limit is 2mb. In order for a larger
1391 value to work reliably, you have to try to make it accord
1392 with the process stack limit. */
c0f9ea08
SM
1393size_t re_max_failures = 40000;
1394# else
1395size_t re_max_failures = 4000;
1396# endif
fa9a63c5
RM
1397
1398union fail_stack_elt
1399{
01618498 1400 re_char *pointer;
c0f9ea08
SM
1401 /* This should be the biggest `int' that's no bigger than a pointer. */
1402 long integer;
fa9a63c5
RM
1403};
1404
1405typedef union fail_stack_elt fail_stack_elt_t;
1406
1407typedef struct
1408{
1409 fail_stack_elt_t *stack;
c0f9ea08
SM
1410 size_t size;
1411 size_t avail; /* Offset of next open position. */
1412 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1413} fail_stack_type;
1414
505bde11 1415#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1416#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1417
1418
/* Define macros to initialize and free the failure stack.
   When the stack is allocated here, INIT_FAIL_STACK does `return -2'
   if the allocation fails.  */

#ifdef MATCH_MAY_ALLOCATE
/* NOTE(review): the allocation holds INIT_FAILURE_ALLOC
   * TYPICAL_FAILURE_SIZE elements, but `size' is set to
   INIT_FAILURE_ALLOC only -- confirm whether that is intended.  */
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.stack = (fail_stack_elt_t *)                             \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE   \
                            * sizeof (fail_stack_elt_t));               \
                                                                        \
    if (fail_stack.stack == NULL)                                       \
      return -2;                                                        \
                                                                        \
    fail_stack.size = INIT_FAILURE_ALLOC;                               \
    fail_stack.avail = 0;                                               \
    fail_stack.frame = 0;                                               \
  } while (0)

# define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)
#else
/* The stack storage is not allocated here; only the indices are
   reset.  */
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.avail = 0;                                               \
    fail_stack.frame = 0;                                               \
  } while (0)

# define RESET_FAIL_STACK() ((void)0)
#endif
1447
1448
320a2a73
KH
/* Grow FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a
   limit which allows approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.

   NOTE(review): on allocation failure the stack pointer has already
   been overwritten with NULL; whether the old storage is lost depends
   on REGEX_REALLOCATE_STACK -- confirm.  */

/* Factor to increase the failure stack size by when we increase it.
   This used to be 2, but 2 was too wasteful because the old
   discarded stacks added up to as much space as the ultimate,
   maximum-size stack.  */
#define FAIL_STACK_GROWTH_FACTOR 4

#define GROW_FAIL_STACK(fail_stack)                                     \
  (((fail_stack).size * sizeof (fail_stack_elt_t)                       \
    >= re_max_failures * TYPICAL_FAILURE_SIZE)                          \
   ? 0                                                                  \
   : ((fail_stack).stack                                                \
      = (fail_stack_elt_t *)                                            \
        REGEX_REALLOCATE_STACK ((fail_stack).stack,                     \
          (fail_stack).size * sizeof (fail_stack_elt_t),                \
          MIN (re_max_failures * TYPICAL_FAILURE_SIZE,                  \
               ((fail_stack).size * sizeof (fail_stack_elt_t)           \
                * FAIL_STACK_GROWTH_FACTOR))),                          \
                                                                        \
      (fail_stack).stack == NULL                                        \
      ? 0                                                               \
      : ((fail_stack).size                                              \
         = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE,                \
                 ((fail_stack).size * sizeof (fail_stack_elt_t)         \
                  * FAIL_STACK_GROWTH_FACTOR))                          \
            / sizeof (fail_stack_elt_t)),                               \
         1)))
RM
1484
1485
fa9a63c5
RM
/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_POINTER(item)                                      \
  fail_stack.stack[fail_stack.avail++].pointer = (item)

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_INT(item)                                          \
  fail_stack.stack[fail_stack.avail++].integer = (item)

/* Push a fail_stack_elt_t value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_ELT(item)                                          \
  fail_stack.stack[fail_stack.avail++] = (item)

/* These three POP... operations complement the three PUSH... operations.
   All assume that `fail_stack' is nonempty.  None of the PUSH or POP
   macros checks the stack bounds.  */
#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1509
505bde11
SM
1510/* Individual items aside from the registers. */
1511#define NUM_NONREG_ITEMS 3
1512
1513/* Used to examine the stack (to detect infinite loops). */
1514#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1515#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1516#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1517#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1518
1519
505bde11
SM
/* Grow the failure stack until more than SPACE slots remain
   available.  Does `return -2' if the stack cannot be grown.
   Expands to a single statement so that it can be used safely as the
   body of an `if' that has an `else'.  */
#define ENSURE_FAIL_STACK(space)                                        \
do {                                                                    \
  while (REMAINING_AVAIL_SLOTS <= (space)) {                            \
    if (!GROW_FAIL_STACK (fail_stack))                                  \
      return -2;                                                        \
    DEBUG_PRINT2 ("\n  Doubled stack; size now: %d\n",                  \
                  (int) (fail_stack).size);                             \
    DEBUG_PRINT2 ("  slots available: %d\n",                            \
                  (int) REMAINING_AVAIL_SLOTS);                         \
  }                                                                     \
} while (0)
1527
/* Push register NUM onto the stack: its start pointer, its end
   pointer, and finally NUM itself, so that the register number is the
   first item popped.  Does `return -2' (via ENSURE_FAIL_STACK) if the
   stack cannot be grown.  */
#define PUSH_FAILURE_REG(num)                                           \
do {                                                                    \
  char *destination;                                                    \
  ENSURE_FAIL_STACK(3);                                                 \
  DEBUG_PRINT4 ("    Push reg %d (spanning %p -> %p)\n",                \
                num, regstart[num], regend[num]);                       \
  PUSH_FAILURE_POINTER (regstart[num]);                                 \
  PUSH_FAILURE_POINTER (regend[num]);                                   \
  PUSH_FAILURE_INT (num);                                               \
} while (0)
1539
01618498
SM
/* Change the counter's value to VAL, but make sure that it will
   be reset when backtracking.  The old value and the counter's
   address are pushed, followed by -1, which marks the entry as a
   counter rather than a register for POP_FAILURE_REG_OR_COUNT.  */
#define PUSH_NUMBER(ptr,val)                                            \
do {                                                                    \
  char *destination;                                                    \
  int c;                                                                \
  ENSURE_FAIL_STACK(3);                                                 \
  EXTRACT_NUMBER (c, ptr);                                              \
  DEBUG_PRINT4 ("    Push number %p = %d -> %d\n", ptr, c, val);        \
  PUSH_FAILURE_INT (c);                                                 \
  PUSH_FAILURE_POINTER (ptr);                                           \
  PUSH_FAILURE_INT (-1);                                                \
  STORE_NUMBER (ptr, val);                                              \
} while (0)
1554
/* Pop a saved register or a saved counter off the stack.  The first
   integer popped is -1 for a counter; otherwise it is the number of
   the register whose end and start pointers follow.  */
#define POP_FAILURE_REG_OR_COUNT()                                      \
do {                                                                    \
  int pfreg = POP_FAILURE_INT ();                                       \
  if (pfreg == -1)                                                      \
    {                                                                   \
      /* It's a counter.  */                                            \
      /* Here, we discard `const', making re_match non-reentrant.  */   \
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER ();     \
      pfreg = POP_FAILURE_INT ();                                       \
      STORE_NUMBER (ptr, pfreg);                                        \
      DEBUG_PRINT3 ("     Pop counter %p = %d\n", ptr, pfreg);          \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      regend[pfreg] = POP_FAILURE_POINTER ();                           \
      regstart[pfreg] = POP_FAILURE_POINTER ();                         \
      DEBUG_PRINT4 ("     Pop reg %d (spanning %p -> %p)\n",            \
                    pfreg, regstart[pfreg], regend[pfreg]);             \
    }                                                                   \
} while (0)
1576
/* Check that we are not stuck in an infinite loop.  Walk the chain of
   failure frames whose saved string position is STRING_PLACE or NULL;
   if one of them saved the pattern position PAT_CUR, set the variable
   `cycle' to 1.  Assumes the variables `fail_stack', `bufp' and
   `cycle'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)                      \
do {                                                                    \
  int failure = TOP_FAILURE_HANDLE ();                                  \
  /* Check for infinite matching loops */                               \
  while (failure > 0                                                    \
         && (FAILURE_STR (failure) == string_place                      \
             || FAILURE_STR (failure) == NULL))                         \
    {                                                                   \
      assert (FAILURE_PAT (failure) >= bufp->buffer                     \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
      if (FAILURE_PAT (failure) == pat_cur)                             \
        {                                                               \
          cycle = 1;                                                    \
          break;                                                        \
        }                                                               \
      DEBUG_PRINT2 ("  Other pattern: %p\n", FAILURE_PAT (failure));    \
      failure = NEXT_FAILURE_HANDLE(failure);                           \
    }                                                                   \
  /* Handle 0 denotes no frame; FAILURE_STR (0) would index before      \
     the start of the stack, so only report a real frame.  */           \
  if (failure > 0)                                                      \
    {                                                                   \
      DEBUG_PRINT2 ("  Other string: %p\n", FAILURE_STR (failure));     \
    }                                                                   \
} while (0)
6df42991 1598
/* Push the information about the state we will need
   if we ever fail back to it: the current frame index, the string
   position STRING_PLACE and the pattern position PATTERN.  The frame
   pointer is then moved past the three pushed items.

   Requires the variable `fail_stack' be declared, and when debugging
   also `pend', `string1', `size1', `string2' and `size2'.
   GROW_FAIL_STACK requires `destination' be declared.

   Does `return -2' (via ENSURE_FAIL_STACK) if it runs out of
   memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place)                       \
do {                                                                    \
  char *destination;                                                    \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_pushed++);                           \
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n");                             \
  DEBUG_PRINT2 ("  Before push, next avail: %d\n",                      \
                (int) (fail_stack).avail);                              \
  DEBUG_PRINT2 ("                       size: %d\n",                    \
                (int) (fail_stack).size);                               \
                                                                        \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);                                 \
                                                                        \
  DEBUG_PRINT1 ("\n");                                                  \
                                                                        \
  DEBUG_PRINT2 ("  Push frame index: %d\n", (int) fail_stack.frame);    \
  PUSH_FAILURE_INT (fail_stack.frame);                                  \
                                                                        \
  DEBUG_PRINT2 ("  Push string %p: `", string_place);                   \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT1 ("'\n");                                                 \
  PUSH_FAILURE_POINTER (string_place);                                  \
                                                                        \
  DEBUG_PRINT2 ("  Push pattern %p: ", pattern);                        \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);                   \
  PUSH_FAILURE_POINTER (pattern);                                       \
                                                                        \
  /* Close the frame by moving the frame pointer past it.  */           \
  fail_stack.frame = fail_stack.avail;                                  \
} while (0)
fa9a63c5 1638
320a2a73
KH
1639/* Estimate the size of data pushed by a typical failure stack entry.
1640 An estimate is all we need, because all we use this for
1641 is to choose a limit for how big to make the failure stack. */
ada30c0e 1642/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1643#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1644
fa9a63c5
RM
1645/* How many items can still be added to the stack without overflowing it. */
1646#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1647
1648
/* Pops what PUSH_FAILURE_POINT pushes, after first popping any
   registers and counters saved above the frame.

   We restore into the parameters, both of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
   Saved registers are restored into the arrays `regstart' and
   `regend'.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */

#define POP_FAILURE_POINT(str, pat)                                     \
do {                                                                    \
  assert (!FAIL_STACK_EMPTY ());                                        \
                                                                        \
  /* Remove failure points and point to how many regs pushed.  */       \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");                                \
  DEBUG_PRINT2 ("  Before pop, next avail: %d\n",                       \
                (int) fail_stack.avail);                                \
  DEBUG_PRINT2 ("                    size: %d\n",                       \
                (int) fail_stack.size);                                 \
                                                                        \
  /* Pop the saved registers.  */                                       \
  while (fail_stack.frame < fail_stack.avail)                           \
    POP_FAILURE_REG_OR_COUNT ();                                        \
                                                                        \
  pat = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT2 ("  Popping pattern %p: ", pat);                         \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
                                                                        \
  /* If the saved string location is NULL, it came from an              \
     on_failure_keep_string_jump opcode, and we want to throw away the  \
     saved NULL, thus retaining our current position in the string.  */ \
  str = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT2 ("  Popping string %p: `", str);                         \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
  DEBUG_PRINT1 ("'\n");                                                 \
                                                                        \
  fail_stack.frame = POP_FAILURE_INT ();                                \
  DEBUG_PRINT2 ("  Popping  frame index: %d\n", (int) fail_stack.frame); \
                                                                        \
  /* `avail' is unsigned, so only the frame bound is checked.  */       \
  assert (fail_stack.frame <= fail_stack.avail);                        \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_popped++);                           \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1692
1693
1694\f
fa9a63c5 1695/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1696#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1697\f
1698/* Subroutine declarations and macros for regex_compile. */
1699
4bb91c68
SM
1700static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1701 reg_syntax_t syntax,
1702 struct re_pattern_buffer *bufp));
1703static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1704static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1705 int arg1, int arg2));
1706static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1707 int arg, unsigned char *end));
1708static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1709 int arg1, int arg2, unsigned char *end));
01618498
SM
1710static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1711 re_char *p,
4bb91c68 1712 reg_syntax_t syntax));
01618498
SM
1713static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1714 re_char *pend,
4bb91c68 1715 reg_syntax_t syntax));
01618498
SM
1716static re_char *skip_one_char _RE_ARGS ((re_char *p));
1717static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1718 char *fastmap, const int multibyte));
fa9a63c5 1719
/* Fetch the next character in the uncompiled pattern, with no
   translation.  Does `return REG_EEND' if the pattern is exhausted.
   Assumes the variables `p', `pend' and `multibyte'.  */
#define PATFETCH(c)                                                     \
  do {                                                                  \
    int len;                                                            \
    if (p == pend) return REG_EEND;                                     \
    c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte);                  \
    p += len;                                                           \
  } while (0)
1729
fa9a63c5
RM
1730
/* Map D through `translate' when a translation table is in effect,
   otherwise yield D itself.  The subscript is cast inside RE_TRANSLATE
   because some data is declared `char *' (to avoid warnings when a
   string constant is passed), yet a character used as a subscript must
   be unsigned.  */
#ifndef TRANSLATE
# define TRANSLATE(d)                                                   \
  (!RE_TRANSLATE_P (translate) ? (d) : RE_TRANSLATE (translate, (d)))
#endif
fa9a63c5
RM
1739
1740
/* Macros that append bytes of compiled pattern at `b', the write
   position inside `bufp->buffer'.  */

/* Size used for the buffer when the caller did not allocate one.  */
#define INIT_BUF_SIZE 32

/* Keep extending the buffer until N more bytes fit after `b'.  */
#define GET_BUFFER_SPACE(n)                                             \
    while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated)         \
      EXTEND_BUFFER ()

/* Reserve one byte and append C.  */
#define BUF_PUSH(c)                                                     \
  do {                                                                  \
    GET_BUFFER_SPACE (1);                                               \
    *b++ = (unsigned char) (c);                                         \
  } while (0)


/* Reserve two bytes and append C1 followed by C2.  */
#define BUF_PUSH_2(c1, c2)                                              \
  do {                                                                  \
    GET_BUFFER_SPACE (2);                                               \
    *b++ = (unsigned char) (c1);                                        \
    *b++ = (unsigned char) (c2);                                        \
  } while (0)


/* Reserve three bytes and append C1, C2 and C3 in that order.  */
#define BUF_PUSH_3(c1, c2, c3)                                          \
  do {                                                                  \
    GET_BUFFER_SPACE (3);                                               \
    *b++ = (unsigned char) (c1);                                        \
    *b++ = (unsigned char) (c2);                                        \
    *b++ = (unsigned char) (c3);                                        \
  } while (0)
1776
1777
/* Write a jump with opcode OP at LOC whose target is TO.  The operand is
   a relative address, measured from just past the three bytes that the
   jump itself occupies.  */
#define STORE_JUMP(op, loc, to)                                         \
  store_op1 (op, loc, (to) - (loc) - 3)

/* Same, for a jump that carries a second argument ARG.  */
#define STORE_JUMP2(op, loc, to, arg)                                   \
  store_op2 (op, loc, (to) - (loc) - 3, arg)

/* Inserting counterpart of STORE_JUMP; `b' is taken as the buffer end.  */
#define INSERT_JUMP(op, loc, to)                                        \
  insert_op1 (op, loc, (to) - (loc) - 3, b)

/* Inserting counterpart of STORE_JUMP2; `b' is taken as the buffer end.  */
#define INSERT_JUMP2(op, loc, to, arg)                                  \
  insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1794
1795
/* Upper bound on the size of a compiled pattern.  The value is not
   arbitrary: the arguments that represent offsets into the pattern are
   two bytes long, so if 2^15 bytes turns out to be too small, many
   things would have to change.  */
# define MAX_BUF_SIZE (1L << 15)

#if 0  /* This is when we thought it could be 2^16 bytes.  */
/* Any other compiler which, like MSC, has allocation limit below 2^16
   bytes will have to use approach similar to what was done below for
   MSC and drop MAX_BUF_SIZE a bit.  Otherwise you may end up
   reallocating to 0 bytes.  Such thing is not going to work too well.
   You have been warned!!  */
#if defined _MSC_VER  && !defined WIN32
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.  */
# define MAX_BUF_SIZE  65500L
#else
# define MAX_BUF_SIZE (1L << 16)
#endif
#endif /* 0 */
fa9a63c5
RM
1814
/* Double the size of the pattern buffer (capped at MAX_BUF_SIZE) with
   realloc, then re-point every pointer that referred into the old block
   (`b', `begalt', and -- when set -- `fixup_alt_jump', `laststart' and
   `pending_exact') at the matching place in the new one.

   Returns from the enclosing function with REG_ESIZE when the buffer is
   already MAX_BUF_SIZE bytes, and with REG_ESPACE when the reallocation
   fails.  In the failure case the previous buffer and its recorded size
   are put back into BUFP first, so the block is neither leaked nor left
   as a NULL buffer with a nonzero `allocated'.  */
#if __BOUNDED_POINTERS__
# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
# define MOVE_BUFFER_POINTER(P)                                         \
  (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer),             \
   SET_HIGH_BOUND (P),                                                  \
   __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND                                  \
  else                                                                  \
    {                                                                   \
      SET_HIGH_BOUND (b);                                               \
      SET_HIGH_BOUND (begalt);                                          \
      if (fixup_alt_jump)                                               \
        SET_HIGH_BOUND (fixup_alt_jump);                                \
      if (laststart)                                                    \
        SET_HIGH_BOUND (laststart);                                     \
      if (pending_exact)                                                \
        SET_HIGH_BOUND (pending_exact);                                 \
    }
#else
# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
#define EXTEND_BUFFER()                                                 \
  do {                                                                  \
    unsigned char *old_buffer = bufp->buffer;                           \
    size_t old_allocated = bufp->allocated;                             \
    if (bufp->allocated == MAX_BUF_SIZE)                                \
      return REG_ESIZE;                                                 \
    bufp->allocated <<= 1;                                              \
    if (bufp->allocated > MAX_BUF_SIZE)                                 \
      bufp->allocated = MAX_BUF_SIZE;                                   \
    RETALLOC (bufp->buffer, bufp->allocated, unsigned char);            \
    if (bufp->buffer == NULL)                                           \
      {                                                                 \
        /* A failed realloc leaves the old block untouched; keep it    \
           reachable and keep `allocated' describing it.  */            \
        bufp->buffer = old_buffer;                                      \
        bufp->allocated = old_allocated;                                \
        return REG_ESPACE;                                              \
      }                                                                 \
    /* If the buffer moved, move all the pointers into it.  */          \
    if (old_buffer != bufp->buffer)                                     \
      {                                                                 \
        unsigned char *new_buffer = bufp->buffer;                       \
        MOVE_BUFFER_POINTER (b);                                        \
        MOVE_BUFFER_POINTER (begalt);                                   \
        if (fixup_alt_jump)                                             \
          MOVE_BUFFER_POINTER (fixup_alt_jump);                         \
        if (laststart)                                                  \
          MOVE_BUFFER_POINTER (laststart);                              \
        if (pending_exact)                                              \
          MOVE_BUFFER_POINTER (pending_exact);                          \
      }                                                                 \
    ELSE_EXTEND_BUFFER_HIGH_BOUND                                       \
  } while (0)
1867
1868
/* One byte is reserved for the register-number argument of
   {start,stop}_memory, so the number of groups we can report things
   about is limited to what fits in that byte.  */
#define MAX_REGNUM 255

/* Patterns may still contain more than `MAX_REGNUM' registers; the
   excess is simply ignored.  */
typedef int regnum_t;


/* Types and macros for the compile stack.  */

/* Offsets can point forwards or backwards, so this type must be able to
   hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.  */
/* int may be not enough when sizeof(int) == 2.  */
typedef long pattern_offset_t;

/* One saved compile-stack entry.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The stack itself: a vector of entries plus its bookkeeping.  */
typedef struct
{
  compile_stack_elt_t *stack;
  unsigned size;
  unsigned avail;                       /* Offset of next open position.  */
} compile_stack_type;


#define INIT_COMPILE_STACK_SIZE 32

#define COMPILE_STACK_EMPTY  (0 == compile_stack.avail)
#define COMPILE_STACK_FULL  (compile_stack.size == compile_stack.avail)

/* The next available element.  */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1910
1cee1e27
SM
/* Explicit quit checking is only needed on NTemacs and whenever input
   events are processed by polling; everywhere else the check expands to
   a no-op expression.  */
#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
extern int immediate_quit;
# define IMMEDIATE_QUIT_CHECK                   \
    do {                                        \
      if (immediate_quit)                       \
        QUIT;                                   \
    } while (0)
#else
# define IMMEDIATE_QUIT_CHECK    ((void)0)
#endif
1922\f
b18215fc
RS
/* Scratch storage used while the range table of a charset is built.  */
struct range_table_work_area
{
  int *table;                   /* The work area itself.  */
  int allocated;                /* Size of the work area, in bytes.  */
  int used;                     /* Number of words actually in use.  */
  int bits;                     /* Flags recording character classes.  */
};
1931
77d11aec
RS
/* Make sure that WORK_AREA has room for N more table entries (ints); a
   range occupies two of them.
   WORK_AREA must be a `struct range_table_work_area' lvalue -- the macro
   takes its address itself -- not a pointer to one.
   The area is grown repeatedly until the request fits, so N may exceed
   the size of a single growth step.
   If it can't get the space, it returns REG_ESPACE from the surrounding
   function.  */

#define EXTEND_RANGE_TABLE(work_area, n)                                \
  do {                                                                  \
    while (((work_area).used + (n)) * sizeof (int)                      \
           > (work_area).allocated)                                     \
      {                                                                 \
        extend_range_table_work_area (&(work_area));                    \
        if ((work_area).table == 0)                                     \
          return (REG_ESPACE);                                          \
      }                                                                 \
  } while (0)
1946
96cc36cc
RS
/* Turn on BIT in the class flags of WORK_AREA.  The expansion is fully
   parenthesized so that it also behaves when used inside a larger
   expression.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)   \
  ((work_area).bits |= (bit))

/* Bits used to implement the multibyte-part of the various character classes
   such as [:alnum:] in a charset's range table.  */
#define BIT_WORD        0x1
#define BIT_LOWER       0x2
#define BIT_PUNCT       0x4
#define BIT_SPACE       0x8
#define BIT_UPPER       0x10
#define BIT_MULTIBYTE   0x20
96cc36cc 1958
b18215fc
RS
/* Append the range (RANGE_START, RANGE_END) to WORK_AREA, growing the
   area first when needed.  Because it uses EXTEND_RANGE_TABLE, it can
   return REG_ESPACE from the surrounding function.  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)    \
  do {                                                                  \
    EXTEND_RANGE_TABLE ((work_area), 2);                                \
    (work_area).table[(work_area).used++] = (range_start);              \
    (work_area).table[(work_area).used++] = (range_end);                \
  } while (0)

/* Free allocated memory for WORK_AREA.  No NULL test is needed: freeing
   a null pointer is a no-op.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)   \
  do {                                          \
    free ((work_area).table);                   \
  } while (0)

/* Forget all recorded ranges and class bits (the storage is kept).  */
#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
/* Accessors for the number of words in use, the class bits, and word I.  */
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1978\f
b18215fc 1979
/* Turn on the bit for character C in the bitmap that starts at `b'.  */
#define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= (1 << ((c) % BYTEWIDTH)))
fa9a63c5
RM
1982
1983
bf216479
KH
1984#ifdef emacs
1985
cf9c99bc
KH
1986/* Store characters in the range FROM to TO in the bitmap at B (for
1987 ASCII and unibyte characters) and WORK_AREA (for multibyte
1988 characters) while translating them and paying attention to the
1989 continuity of translated characters.
8f924df7 1990
cf9c99bc
KH
1991 Implementation note: It is better to implement these fairly big
1992 macros by a function, but it's not that easy because macros called
8f924df7 1993 in this macro assume various local variables already declared. */
bf216479 1994
cf9c99bc
KH
/* Both FROM and TO are ASCII characters.
   Each character of the range is translated.  A translation that is no
   longer ASCII is recorded in WORK_AREA as a one-character range and
   then converted to unibyte; if that conversion fails, the untranslated
   character is used for the bitmap instead.  */

#define SETUP_ASCII_RANGE(work_area, FROM, TO)                          \
  do {                                                                  \
    int C0, C1;                                                         \
                                                                        \
    for (C0 = (FROM); C0 <= (TO); C0++)                                 \
      {                                                                 \
        C1 = TRANSLATE (C0);                                            \
        if (! ASCII_CHAR_P (C1))                                        \
          {                                                             \
            SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);            \
            C1 = RE_CHAR_TO_UNIBYTE (C1);                               \
            if (C1 < 0)                                                 \
              C1 = C0;                                                  \
          }                                                             \
        SET_LIST_BIT (C1);                                              \
      }                                                                 \
  } while (0)
2013
2014
/* Both FROM and TO are unibyte characters (0x80..0xFF).
   A character whose multibyte form is a raw 8-bit byte only sets its
   own bit.  Any other character is translated: the bitmap gets the
   unibyte form of the translation (or the original character when there
   is none, or when the translation is the identity), and the
   translation is merged into one of the ranges added to WORK_AREA by
   this invocation when it lies inside or directly next to it, otherwise
   appended as a new one-character range.  */

#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)                               \
  do {                                                                         \
    int C0, C1, C2, I;                                                         \
    int USED = RANGE_TABLE_WORK_USED (work_area);                              \
                                                                               \
    for (C0 = (FROM); C0 <= (TO); C0++)                                        \
      {                                                                        \
        C1 = RE_CHAR_TO_MULTIBYTE (C0);                                        \
        if (CHAR_BYTE8_P (C1))                                                 \
          SET_LIST_BIT (C0);                                                   \
        else                                                                   \
          {                                                                    \
            C2 = TRANSLATE (C1);                                               \
            if (C2 == C1)                                                      \
              C1 = C0;                                                         \
            else                                                               \
              {                                                                \
                C1 = RE_CHAR_TO_UNIBYTE (C2);                                  \
                if (C1 < 0)                                                    \
                  C1 = C0;                                                     \
              }                                                                \
            SET_LIST_BIT (C1);                                                 \
            /* Search the ranges added so far, newest first.  */               \
            for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
              {                                                                \
                int lo = RANGE_TABLE_WORK_ELT (work_area, I);                  \
                int hi = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
                                                                               \
                if (C2 >= lo - 1 && C2 <= hi + 1)                              \
                  {                                                            \
                    if (C2 == lo - 1)                                          \
                      RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
                    else if (C2 == hi + 1)                                     \
                      RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
                    break;                                                     \
                  }                                                            \
              }                                                                \
            /* No range could absorb C2: record it on its own.  */             \
            if (I < USED)                                                      \
              SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);                 \
          }                                                                    \
      }                                                                        \
  } while (0)
2053
2054
/* Both FROM and TO are multibyte characters.
   The range itself is recorded in WORK_AREA first.  Then, for every
   character of the range, the bitmap bit of its unibyte equivalent (of
   the translation, or failing that of the character itself) is set, and
   a translation that falls outside FROM..TO is merged into a range
   added by this invocation or appended as a one-character range.  */

#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)                         \
  do {                                                                     \
    int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area);           \
                                                                           \
    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));                 \
    for (C0 = (FROM); C0 <= (TO); C0++)                                    \
      {                                                                    \
        C1 = TRANSLATE (C0);                                               \
        if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0                            \
            || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))          \
          SET_LIST_BIT (C2);                                               \
        /* Translations outside FROM..TO need an entry of their own.  */   \
        if (! (C1 >= (FROM) && C1 <= (TO)))                                \
          {                                                                \
            for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED;     \
                 I -= 2)                                                   \
              {                                                            \
                int lo = RANGE_TABLE_WORK_ELT (work_area, I);              \
                int hi = RANGE_TABLE_WORK_ELT (work_area, I + 1);          \
                                                                           \
                if (C1 >= lo - 1 && C1 <= hi + 1)                          \
                  {                                                        \
                    if (C1 == lo - 1)                                      \
                      RANGE_TABLE_WORK_ELT (work_area, I)--;               \
                    else if (C1 == hi + 1)                                 \
                      RANGE_TABLE_WORK_ELT (work_area, I + 1)++;           \
                    break;                                                 \
                  }                                                        \
              }                                                            \
            if (I < USED)                                                  \
              SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);             \
          }                                                                \
      }                                                                    \
  } while (0)
2088
2089#endif /* emacs */
2090
/* Get the next unsigned number in the uncompiled pattern.

   Reads decimal digits starting at `p' and accumulates them into NUM
   (an int lvalue; a negative NUM is reset to 0 before the first digit
   is added).  NUM is left untouched when the first character is not a
   digit.  On exit `c' holds the first non-digit character read.

   Leaves the enclosing function through FREE_STACK_RETURN with
   REG_EBRACE when the pattern ends before a terminating non-digit is
   seen, and with REG_BADBR when the number does not fit in an int.  The
   range check is made BEFORE multiplying, because signed overflow is
   undefined behaviour and cannot be detected reliably afterwards.  */
#define GET_UNSIGNED_NUMBER(num)                                        \
  do {                                                                  \
    if (p == pend)                                                      \
      FREE_STACK_RETURN (REG_EBRACE);                                   \
    else                                                                \
      {                                                                 \
        PATFETCH (c);                                                   \
        while ('0' <= c && c <= '9')                                    \
          {                                                             \
            /* Largest int value, obtained without <limits.h>          \
               (assumes int has no padding bits).  */                   \
            const int gun_max = (int) ((unsigned) -1 >> 1);             \
            const int gun_digit = c - '0';                              \
            if (num < 0)                                                \
              num = 0;                                                  \
            if (num > (gun_max - gun_digit) / 10)                       \
              FREE_STACK_RETURN (REG_BADBR);                            \
            num = num * 10 + gun_digit;                                 \
            if (p == pend)                                              \
              FREE_STACK_RETURN (REG_EBRACE);                           \
            PATFETCH (c);                                               \
          }                                                             \
      }                                                                 \
  } while (0)
77d11aec 2114\f
1fdab503 2115#if ! WIDE_CHAR_SUPPORT
01618498 2116
14473664 2117/* Map a string to the char class it names (if any). */
1fdab503 2118re_wctype_t
971de7fb 2119re_wctype (const re_char *str)
14473664 2120{
5b0534c8 2121 const char *string = (const char *) str;
14473664
SM
2122 if (STREQ (string, "alnum")) return RECC_ALNUM;
2123 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2124 else if (STREQ (string, "word")) return RECC_WORD;
2125 else if (STREQ (string, "ascii")) return RECC_ASCII;
2126 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2127 else if (STREQ (string, "graph")) return RECC_GRAPH;
2128 else if (STREQ (string, "lower")) return RECC_LOWER;
2129 else if (STREQ (string, "print")) return RECC_PRINT;
2130 else if (STREQ (string, "punct")) return RECC_PUNCT;
2131 else if (STREQ (string, "space")) return RECC_SPACE;
2132 else if (STREQ (string, "upper")) return RECC_UPPER;
2133 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2134 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2135 else if (STREQ (string, "digit")) return RECC_DIGIT;
2136 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2137 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2138 else if (STREQ (string, "blank")) return RECC_BLANK;
2139 else return 0;
2140}
2141
e0f24100 2142/* True if CH is in the char class CC. */
1fdab503 2143boolean
971de7fb 2144re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2145{
2146 switch (cc)
2147 {
0cdd06f8
SM
2148 case RECC_ALNUM: return ISALNUM (ch);
2149 case RECC_ALPHA: return ISALPHA (ch);
2150 case RECC_BLANK: return ISBLANK (ch);
2151 case RECC_CNTRL: return ISCNTRL (ch);
2152 case RECC_DIGIT: return ISDIGIT (ch);
2153 case RECC_GRAPH: return ISGRAPH (ch);
2154 case RECC_LOWER: return ISLOWER (ch);
2155 case RECC_PRINT: return ISPRINT (ch);
2156 case RECC_PUNCT: return ISPUNCT (ch);
2157 case RECC_SPACE: return ISSPACE (ch);
2158 case RECC_UPPER: return ISUPPER (ch);
2159 case RECC_XDIGIT: return ISXDIGIT (ch);
2160 case RECC_ASCII: return IS_REAL_ASCII (ch);
2161 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2162 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2163 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2164 case RECC_WORD: return ISWORD (ch);
2165 case RECC_ERROR: return false;
2166 default:
2167 abort();
14473664
SM
2168 }
2169}
fa9a63c5 2170
14473664
SM
2171/* Return a bit-pattern to use in the range-table bits to match multibyte
2172 chars of class CC. */
2173static int
971de7fb 2174re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2175{
2176 switch (cc)
2177 {
2178 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2179 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2180 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2181 case RECC_LOWER: return BIT_LOWER;
2182 case RECC_UPPER: return BIT_UPPER;
2183 case RECC_PUNCT: return BIT_PUNCT;
2184 case RECC_SPACE: return BIT_SPACE;
14473664 2185 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2186 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2187 default:
2188 abort();
14473664
SM
2189 }
2190}
2191#endif
77d11aec
RS
2192\f
2193/* Filling in the work area of a range. */
2194
2195/* Actually extend the space in WORK_AREA. */
2196
2197static void
971de7fb 2198extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2199{
77d11aec
RS
2200 work_area->allocated += 16 * sizeof (int);
2201 if (work_area->table)
2202 work_area->table
2203 = (int *) realloc (work_area->table, work_area->allocated);
2204 else
2205 work_area->table
2206 = (int *) malloc (work_area->allocated);
2207}
2208
8f924df7 2209#if 0
77d11aec
RS
2210#ifdef emacs
2211
2212/* Carefully find the ranges of codes that are equivalent
2213 under case conversion to the range start..end when passed through
2214 TRANSLATE. Handle the case where non-letters can come in between
2215 two upper-case letters (which happens in Latin-1).
2216 Also handle the case of groups of more than 2 case-equivalent chars.
2217
2218 The basic method is to look at consecutive characters and see
2219 if they can form a run that can be handled as one.
2220
2221 Returns -1 if successful, REG_ESPACE if ran out of space. */
2222
2223static int
2224set_image_of_range_1 (work_area, start, end, translate)
2225 RE_TRANSLATE_TYPE translate;
2226 struct range_table_work_area *work_area;
2227 re_wchar_t start, end;
2228{
2229 /* `one_case' indicates a character, or a run of characters,
2230 each of which is an isolate (no case-equivalents).
2231 This includes all ASCII non-letters.
2232
2233 `two_case' indicates a character, or a run of characters,
2234 each of which has two case-equivalent forms.
2235 This includes all ASCII letters.
2236
2237 `strange' indicates a character that has more than one
2238 case-equivalent. */
177c0ea7 2239
77d11aec
RS
2240 enum case_type {one_case, two_case, strange};
2241
2242 /* Describe the run that is in progress,
2243 which the next character can try to extend.
2244 If run_type is strange, that means there really is no run.
2245 If run_type is one_case, then run_start...run_end is the run.
2246 If run_type is two_case, then the run is run_start...run_end,
2247 and the case-equivalents end at run_eqv_end. */
2248
2249 enum case_type run_type = strange;
2250 int run_start, run_end, run_eqv_end;
2251
2252 Lisp_Object eqv_table;
2253
2254 if (!RE_TRANSLATE_P (translate))
2255 {
b7c12565 2256 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2257 work_area->table[work_area->used++] = (start);
2258 work_area->table[work_area->used++] = (end);
b7c12565 2259 return -1;
77d11aec
RS
2260 }
2261
2262 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2263
77d11aec
RS
2264 for (; start <= end; start++)
2265 {
2266 enum case_type this_type;
2267 int eqv = RE_TRANSLATE (eqv_table, start);
2268 int minchar, maxchar;
2269
2270 /* Classify this character */
2271 if (eqv == start)
2272 this_type = one_case;
2273 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2274 this_type = two_case;
2275 else
2276 this_type = strange;
2277
2278 if (start < eqv)
2279 minchar = start, maxchar = eqv;
2280 else
2281 minchar = eqv, maxchar = start;
2282
2283 /* Can this character extend the run in progress? */
2284 if (this_type == strange || this_type != run_type
2285 || !(minchar == run_end + 1
2286 && (run_type == two_case
2287 ? maxchar == run_eqv_end + 1 : 1)))
2288 {
2289 /* No, end the run.
2290 Record each of its equivalent ranges. */
2291 if (run_type == one_case)
2292 {
2293 EXTEND_RANGE_TABLE (work_area, 2);
2294 work_area->table[work_area->used++] = run_start;
2295 work_area->table[work_area->used++] = run_end;
2296 }
2297 else if (run_type == two_case)
2298 {
2299 EXTEND_RANGE_TABLE (work_area, 4);
2300 work_area->table[work_area->used++] = run_start;
2301 work_area->table[work_area->used++] = run_end;
2302 work_area->table[work_area->used++]
2303 = RE_TRANSLATE (eqv_table, run_start);
2304 work_area->table[work_area->used++]
2305 = RE_TRANSLATE (eqv_table, run_end);
2306 }
2307 run_type = strange;
2308 }
177c0ea7 2309
77d11aec
RS
2310 if (this_type == strange)
2311 {
2312 /* For a strange character, add each of its equivalents, one
2313 by one. Don't start a range. */
2314 do
2315 {
2316 EXTEND_RANGE_TABLE (work_area, 2);
2317 work_area->table[work_area->used++] = eqv;
2318 work_area->table[work_area->used++] = eqv;
2319 eqv = RE_TRANSLATE (eqv_table, eqv);
2320 }
2321 while (eqv != start);
2322 }
2323
2324 /* Add this char to the run, or start a new run. */
2325 else if (run_type == strange)
2326 {
2327 /* Initialize a new range. */
2328 run_type = this_type;
2329 run_start = start;
2330 run_end = start;
2331 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2332 }
2333 else
2334 {
2335 /* Extend a running range. */
2336 run_end = minchar;
2337 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2338 }
2339 }
2340
2341 /* If a run is still in progress at the end, finish it now
2342 by recording its equivalent ranges. */
2343 if (run_type == one_case)
2344 {
2345 EXTEND_RANGE_TABLE (work_area, 2);
2346 work_area->table[work_area->used++] = run_start;
2347 work_area->table[work_area->used++] = run_end;
2348 }
2349 else if (run_type == two_case)
2350 {
2351 EXTEND_RANGE_TABLE (work_area, 4);
2352 work_area->table[work_area->used++] = run_start;
2353 work_area->table[work_area->used++] = run_end;
2354 work_area->table[work_area->used++]
2355 = RE_TRANSLATE (eqv_table, run_start);
2356 work_area->table[work_area->used++]
2357 = RE_TRANSLATE (eqv_table, run_end);
2358 }
2359
2360 return -1;
2361}
36595814 2362
77d11aec 2363#endif /* emacs */
36595814 2364
2b34df4e 2365/* Record the image of the range start..end when passed through
36595814
SM
2366 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2367 and is not even necessarily contiguous.
b7c12565
RS
2368 Normally we approximate it with the smallest contiguous range that contains
2369 all the chars we need. However, for Latin-1 we go to extra effort
2370 to do a better job.
2371
2372 This function is not called for ASCII ranges.
77d11aec
RS
2373
2374 Returns -1 if successful, REG_ESPACE if ran out of space. */
2375
2376static int
36595814
SM
2377set_image_of_range (work_area, start, end, translate)
2378 RE_TRANSLATE_TYPE translate;
2379 struct range_table_work_area *work_area;
2380 re_wchar_t start, end;
2381{
77d11aec
RS
2382 re_wchar_t cmin, cmax;
2383
2384#ifdef emacs
2385 /* For Latin-1 ranges, use set_image_of_range_1
2386 to get proper handling of ranges that include letters and nonletters.
b7c12565 2387 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2388 For other character sets, we don't bother to get this right. */
b7c12565
RS
2389 if (RE_TRANSLATE_P (translate) && start < 04400
2390 && !(start < 04200 && end >= 04377))
77d11aec 2391 {
b7c12565 2392 int newend;
77d11aec 2393 int tem;
b7c12565
RS
2394 newend = end;
2395 if (newend > 04377)
2396 newend = 04377;
2397 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2398 if (tem > 0)
2399 return tem;
2400
2401 start = 04400;
2402 if (end < 04400)
2403 return -1;
2404 }
2405#endif
2406
b7c12565
RS
2407 EXTEND_RANGE_TABLE (work_area, 2);
2408 work_area->table[work_area->used++] = (start);
2409 work_area->table[work_area->used++] = (end);
2410
2411 cmin = -1, cmax = -1;
77d11aec 2412
36595814 2413 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2414 {
2415 int ch;
77d11aec 2416
b7c12565
RS
2417 for (ch = start; ch <= end; ch++)
2418 {
2419 re_wchar_t c = TRANSLATE (ch);
2420 if (! (start <= c && c <= end))
2421 {
2422 if (cmin == -1)
2423 cmin = c, cmax = c;
2424 else
2425 {
2426 cmin = MIN (cmin, c);
2427 cmax = MAX (cmax, c);
2428 }
2429 }
2430 }
2431
2432 if (cmin != -1)
2433 {
2434 EXTEND_RANGE_TABLE (work_area, 2);
2435 work_area->table[work_area->used++] = (cmin);
2436 work_area->table[work_area->used++] = (cmax);
2437 }
2438 }
36595814 2439
77d11aec
RS
2440 return -1;
2441}
8f924df7 2442#endif /* 0 */
fa9a63c5
RM
2443\f
2444#ifndef MATCH_MAY_ALLOCATE
2445
2446/* If we cannot allocate large objects within re_match_2_internal,
2447 we make the fail stack and register vectors global.
2448 The fail stack, we grow to the maximum size when a regexp
2449 is compiled.
2450 The register vectors, we adjust in size each time we
2451 compile a regexp, according to the number of registers it needs. */
2452
2453static fail_stack_type fail_stack;
2454
2455/* Size with which the following vectors are currently allocated.
2456 That is so we can make them bigger as needed,
4bb91c68 2457 but never make them smaller. */
fa9a63c5
RM
2458static int regs_allocated_size;
2459
66f0296e
SM
2460static re_char ** regstart, ** regend;
2461static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2462
2463/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2464 but don't make them smaller. */
fa9a63c5
RM
2465
2466static
2467regex_grow_registers (num_regs)
2468 int num_regs;
2469{
2470 if (num_regs > regs_allocated_size)
2471 {
66f0296e
SM
2472 RETALLOC_IF (regstart, num_regs, re_char *);
2473 RETALLOC_IF (regend, num_regs, re_char *);
2474 RETALLOC_IF (best_regstart, num_regs, re_char *);
2475 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2476
2477 regs_allocated_size = num_regs;
2478 }
2479}
2480
2481#endif /* not MATCH_MAY_ALLOCATE */
2482\f
99633e97
SM
2483static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2484 compile_stack,
2485 regnum_t regnum));
2486
fa9a63c5
RM
2487/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2488 Returns one of error codes defined in `regex.h', or zero for success.
2489
2490 Assumes the `allocated' (and perhaps `buffer') and `translate'
2491 fields are set in BUFP on entry.
2492
2493 If it succeeds, results are put in BUFP (if it returns an error, the
2494 contents of BUFP are undefined):
2495 `buffer' is the compiled pattern;
2496 `syntax' is set to SYNTAX;
2497 `used' is set to the length of the compiled pattern;
2498 `fastmap_accurate' is zero;
2499 `re_nsub' is the number of subexpressions in PATTERN;
2500 `not_bol' and `not_eol' are zero;
5e69f11e 2501
c0f9ea08 2502 The `fastmap' field is neither examined nor set. */
fa9a63c5 2503
505bde11
SM
/* If an alternative is waiting for its closing jump (`fixup_alt_jump'
   is set), store a `jump' there that leads to the current end of the
   buffer, `b'.  The space for the jump has already been allocated.  */
#define FIXUP_ALT_JUMP()                                                \
  do {                                                                  \
    if (fixup_alt_jump)                                                 \
      STORE_JUMP (jump, fixup_alt_jump, b);                             \
  } while (0)
2511
2512
fa9a63c5
RM
/* Leave the enclosing function with VALUE after releasing the storage
   it allocated: the range-table work area and the compile stack.  */
#define FREE_STACK_RETURN(value)                                \
  do {                                                          \
    FREE_RANGE_TABLE_WORK_AREA (range_table_work);              \
    free (compile_stack.stack);                                 \
    return value;                                               \
  } while (0)
fa9a63c5
RM
2520
2521static reg_errcode_t
971de7fb 2522regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2523{
01618498
SM
2524 /* We fetch characters from PATTERN here. */
2525 register re_wchar_t c, c1;
5e69f11e 2526
fa9a63c5
RM
2527 /* Points to the end of the buffer, where we should append. */
2528 register unsigned char *b;
5e69f11e 2529
fa9a63c5
RM
2530 /* Keeps track of unclosed groups. */
2531 compile_stack_type compile_stack;
2532
2533 /* Points to the current (ending) position in the pattern. */
22336245
RS
2534#ifdef AIX
2535 /* `const' makes AIX compiler fail. */
66f0296e 2536 unsigned char *p = pattern;
22336245 2537#else
66f0296e 2538 re_char *p = pattern;
22336245 2539#endif
66f0296e 2540 re_char *pend = pattern + size;
5e69f11e 2541
fa9a63c5 2542 /* How to translate the characters in the pattern. */
6676cb1c 2543 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2544
2545 /* Address of the count-byte of the most recently inserted `exactn'
2546 command. This makes it possible to tell if a new exact-match
2547 character can be added to that command or if the character requires
2548 a new `exactn' command. */
2549 unsigned char *pending_exact = 0;
2550
2551 /* Address of start of the most recently finished expression.
2552 This tells, e.g., postfix * where to find the start of its
2553 operand. Reset at the beginning of groups and alternatives. */
2554 unsigned char *laststart = 0;
2555
2556 /* Address of beginning of regexp, or inside of last group. */
2557 unsigned char *begalt;
2558
2559 /* Place in the uncompiled pattern (i.e., the {) to
2560 which to go back if the interval is invalid. */
66f0296e 2561 re_char *beg_interval;
5e69f11e 2562
fa9a63c5 2563 /* Address of the place where a forward jump should go to the end of
7814e705 2564 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2565 last -- ends with a forward jump of this sort. */
2566 unsigned char *fixup_alt_jump = 0;
2567
b18215fc
RS
2568 /* Work area for range table of charset. */
2569 struct range_table_work_area range_table_work;
2570
2d1675e4
SM
2571 /* If the object matched can contain multibyte characters. */
2572 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2573
8f924df7 2574 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2575 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2576
f9b0fd99
RS
2577 /* Nonzero if we have pushed down into a subpattern. */
2578 int in_subpattern = 0;
2579
2580 /* These hold the values of p, pattern, and pend from the main
2581 pattern when we have pushed into a subpattern. */
2582 re_char *main_p;
2583 re_char *main_pattern;
2584 re_char *main_pend;
2585
fa9a63c5 2586#ifdef DEBUG
99633e97 2587 debug++;
fa9a63c5 2588 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2589 if (debug > 0)
fa9a63c5
RM
2590 {
2591 unsigned debug_count;
5e69f11e 2592
fa9a63c5 2593 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2594 putchar (pattern[debug_count]);
fa9a63c5
RM
2595 putchar ('\n');
2596 }
2597#endif /* DEBUG */
2598
2599 /* Initialize the compile stack. */
2600 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2601 if (compile_stack.stack == NULL)
2602 return REG_ESPACE;
2603
2604 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2605 compile_stack.avail = 0;
2606
b18215fc
RS
2607 range_table_work.table = 0;
2608 range_table_work.allocated = 0;
2609
fa9a63c5
RM
2610 /* Initialize the pattern buffer. */
2611 bufp->syntax = syntax;
2612 bufp->fastmap_accurate = 0;
2613 bufp->not_bol = bufp->not_eol = 0;
6224b623 2614 bufp->used_syntax = 0;
fa9a63c5
RM
2615
2616 /* Set `used' to zero, so that if we return an error, the pattern
2617 printer (for debugging) will think there's no pattern. We reset it
2618 at the end. */
2619 bufp->used = 0;
5e69f11e 2620
fa9a63c5 2621 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2622 bufp->re_nsub = 0;
fa9a63c5 2623
0b32bf0e 2624#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2625 /* Initialize the syntax table. */
2626 init_syntax_once ();
2627#endif
2628
2629 if (bufp->allocated == 0)
2630 {
2631 if (bufp->buffer)
2632 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2633 enough space. This loses if buffer's address is bogus, but
7814e705 2634 that is the user's responsibility. */
25fe55af
RS
2635 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2636 }
fa9a63c5 2637 else
7814e705 2638 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2639 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2640 }
fa9a63c5
RM
2641 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2642
2643 bufp->allocated = INIT_BUF_SIZE;
2644 }
2645
2646 begalt = b = bufp->buffer;
2647
2648 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2649 while (1)
fa9a63c5 2650 {
f9b0fd99
RS
2651 if (p == pend)
2652 {
2653 /* If this is the end of an included regexp,
2654 pop back to the main regexp and try again. */
2655 if (in_subpattern)
2656 {
2657 in_subpattern = 0;
2658 pattern = main_pattern;
2659 p = main_p;
2660 pend = main_pend;
2661 continue;
2662 }
2663 /* If this is the end of the main regexp, we are done. */
2664 break;
2665 }
2666
fa9a63c5
RM
2667 PATFETCH (c);
2668
2669 switch (c)
25fe55af 2670 {
f9b0fd99
RS
2671 case ' ':
2672 {
2673 re_char *p1 = p;
2674
2675 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2676 spaces normally. And don't try to do this recursively. */
2677 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2678 goto normal_char;
2679
2680 /* Peek past following spaces. */
2681 while (p1 != pend)
2682 {
2683 if (*p1 != ' ')
2684 break;
2685 p1++;
2686 }
2687 /* If the spaces are followed by a repetition op,
2688 treat them normally. */
c721eee5
RS
2689 if (p1 != pend
2690 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2691 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2692 goto normal_char;
2693
2694 /* Replace the spaces with the whitespace regexp. */
2695 in_subpattern = 1;
2696 main_p = p1;
2697 main_pend = pend;
2698 main_pattern = pattern;
2699 p = pattern = whitespace_regexp;
5b0534c8 2700 pend = p + strlen ((const char *) p);
f9b0fd99 2701 break;
7814e705 2702 }
f9b0fd99 2703
25fe55af
RS
2704 case '^':
2705 {
7814e705 2706 if ( /* If at start of pattern, it's an operator. */
25fe55af 2707 p == pattern + 1
7814e705 2708 /* If context independent, it's an operator. */
25fe55af 2709 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2710 /* Otherwise, depends on what's come before. */
25fe55af 2711 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2712 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2713 else
2714 goto normal_char;
2715 }
2716 break;
2717
2718
2719 case '$':
2720 {
2721 if ( /* If at end of pattern, it's an operator. */
2722 p == pend
7814e705 2723 /* If context independent, it's an operator. */
25fe55af
RS
2724 || syntax & RE_CONTEXT_INDEP_ANCHORS
2725 /* Otherwise, depends on what's next. */
2726 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2727 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2728 else
2729 goto normal_char;
2730 }
2731 break;
fa9a63c5
RM
2732
2733
2734 case '+':
25fe55af
RS
2735 case '?':
2736 if ((syntax & RE_BK_PLUS_QM)
2737 || (syntax & RE_LIMITED_OPS))
2738 goto normal_char;
2739 handle_plus:
2740 case '*':
2741 /* If there is no previous pattern... */
2742 if (!laststart)
2743 {
2744 if (syntax & RE_CONTEXT_INVALID_OPS)
2745 FREE_STACK_RETURN (REG_BADRPT);
2746 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2747 goto normal_char;
2748 }
2749
2750 {
7814e705 2751 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2752 boolean zero_times_ok = 0, many_times_ok = 0;
2753 boolean greedy = 1;
25fe55af
RS
2754
2755 /* If there is a sequence of repetition chars, collapse it
2756 down to just one (the right one). We can't combine
2757 interval operators with these because of, e.g., `a{2}*',
7814e705 2758 which should only match an even number of `a's. */
25fe55af
RS
2759
2760 for (;;)
2761 {
0b32bf0e 2762 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2763 && c == '?' && (zero_times_ok || many_times_ok))
2764 greedy = 0;
2765 else
2766 {
2767 zero_times_ok |= c != '+';
2768 many_times_ok |= c != '?';
2769 }
25fe55af
RS
2770
2771 if (p == pend)
2772 break;
ed0767d8
SM
2773 else if (*p == '*'
2774 || (!(syntax & RE_BK_PLUS_QM)
2775 && (*p == '+' || *p == '?')))
25fe55af 2776 ;
ed0767d8 2777 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2778 {
ed0767d8
SM
2779 if (p+1 == pend)
2780 FREE_STACK_RETURN (REG_EESCAPE);
2781 if (p[1] == '+' || p[1] == '?')
2782 PATFETCH (c); /* Gobble up the backslash. */
2783 else
2784 break;
25fe55af
RS
2785 }
2786 else
ed0767d8 2787 break;
25fe55af 2788 /* If we get here, we found another repeat character. */
ed0767d8
SM
2789 PATFETCH (c);
2790 }
25fe55af
RS
2791
2792 /* Star, etc. applied to an empty pattern is equivalent
2793 to an empty pattern. */
4e8a9132 2794 if (!laststart || laststart == b)
25fe55af
RS
2795 break;
2796
2797 /* Now we know whether or not zero matches is allowed
7814e705 2798 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2799 if (greedy)
2800 {
99633e97 2801 if (many_times_ok)
4e8a9132
SM
2802 {
2803 boolean simple = skip_one_char (laststart) == b;
2804 unsigned int startoffset = 0;
f6a3f532 2805 re_opcode_t ofj =
01618498 2806 /* Check if the loop can match the empty string. */
6df42991
SM
2807 (simple || !analyse_first (laststart, b, NULL, 0))
2808 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2809 assert (skip_one_char (laststart) <= b);
177c0ea7 2810
4e8a9132
SM
2811 if (!zero_times_ok && simple)
2812 { /* Since simple * loops can be made faster by using
2813 on_failure_keep_string_jump, we turn simple P+
2814 into PP* if P is simple. */
2815 unsigned char *p1, *p2;
2816 startoffset = b - laststart;
2817 GET_BUFFER_SPACE (startoffset);
2818 p1 = b; p2 = laststart;
2819 while (p2 < p1)
2820 *b++ = *p2++;
2821 zero_times_ok = 1;
99633e97 2822 }
4e8a9132
SM
2823
2824 GET_BUFFER_SPACE (6);
2825 if (!zero_times_ok)
2826 /* A + loop. */
f6a3f532 2827 STORE_JUMP (ofj, b, b + 6);
99633e97 2828 else
4e8a9132
SM
2829 /* Simple * loops can use on_failure_keep_string_jump
2830 depending on what follows. But since we don't know
2831 that yet, we leave the decision up to
2832 on_failure_jump_smart. */
f6a3f532 2833 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2834 laststart + startoffset, b + 6);
99633e97 2835 b += 3;
4e8a9132 2836 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2837 b += 3;
2838 }
2839 else
2840 {
4e8a9132
SM
2841 /* A simple ? pattern. */
2842 assert (zero_times_ok);
2843 GET_BUFFER_SPACE (3);
2844 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2845 b += 3;
2846 }
1c8c6d39
DL
2847 }
2848 else /* not greedy */
2849 { /* I wish the greedy and non-greedy cases could be merged. */
2850
0683b6fa 2851 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2852 if (many_times_ok)
2853 {
f6a3f532
SM
2854 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2855
6df42991
SM
2856 /* The non-greedy multiple match looks like
2857 a repeat..until: we only need a conditional jump
2858 at the end of the loop. */
f6a3f532
SM
2859 if (emptyp) BUF_PUSH (no_op);
2860 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2861 : on_failure_jump, b, laststart);
1c8c6d39
DL
2862 b += 3;
2863 if (zero_times_ok)
2864 {
2865 /* The repeat...until naturally matches one or more.
2866 To also match zero times, we need to first jump to
6df42991 2867 the end of the loop (its conditional jump). */
1c8c6d39
DL
2868 INSERT_JUMP (jump, laststart, b);
2869 b += 3;
2870 }
2871 }
2872 else
2873 {
2874 /* non-greedy a?? */
1c8c6d39
DL
2875 INSERT_JUMP (jump, laststart, b + 3);
2876 b += 3;
2877 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2878 b += 3;
2879 }
2880 }
2881 }
4e8a9132 2882 pending_exact = 0;
fa9a63c5
RM
2883 break;
2884
2885
2886 case '.':
25fe55af
RS
2887 laststart = b;
2888 BUF_PUSH (anychar);
2889 break;
fa9a63c5
RM
2890
2891
25fe55af
RS
2892 case '[':
2893 {
19ed5445
PE
2894 re_char *p1;
2895
b18215fc 2896 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2897
25fe55af 2898 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2899
25fe55af
RS
2900 /* Ensure that we have enough space to push a charset: the
2901 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2902 GET_BUFFER_SPACE (34);
2903
25fe55af 2904 laststart = b;
e318085a 2905
25fe55af 2906 /* We test `*p == '^' twice, instead of using an if
7814e705 2907 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2908 BUF_PUSH (*p == '^' ? charset_not : charset);
2909 if (*p == '^')
2910 p++;
e318085a 2911
25fe55af
RS
2912 /* Remember the first position in the bracket expression. */
2913 p1 = p;
e318085a 2914
7814e705 2915 /* Push the number of bytes in the bitmap. */
25fe55af 2916 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2917
25fe55af 2918 /* Clear the whole map. */
72af86bd 2919 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2920
25fe55af
RS
2921 /* charset_not matches newline according to a syntax bit. */
2922 if ((re_opcode_t) b[-2] == charset_not
2923 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2924 SET_LIST_BIT ('\n');
fa9a63c5 2925
7814e705 2926 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2927 for (;;)
2928 {
b18215fc 2929 boolean escaped_char = false;
2d1675e4 2930 const unsigned char *p2 = p;
cf9c99bc 2931 re_wchar_t ch, c2;
e318085a 2932
25fe55af 2933 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2934
36595814
SM
2935 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2936 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2937 So the translation is done later in a loop. Example:
2938 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2939 PATFETCH (c);
e318085a 2940
25fe55af
RS
2941 /* \ might escape characters inside [...] and [^...]. */
2942 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2943 {
2944 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2945
2946 PATFETCH (c);
b18215fc 2947 escaped_char = true;
25fe55af 2948 }
b18215fc
RS
2949 else
2950 {
7814e705 2951 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2952 not (i.e., when the bracket expression is `[]' so
2953 far), the ']' character bit gets set way below. */
2d1675e4 2954 if (c == ']' && p2 != p1)
657fcfbd 2955 break;
25fe55af 2956 }
b18215fc 2957
25fe55af
RS
2958 /* See if we're at the beginning of a possible character
2959 class. */
b18215fc 2960
2d1675e4
SM
2961 if (!escaped_char &&
2962 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2963 {
7814e705 2964 /* Leave room for the null. */
14473664 2965 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2966 const unsigned char *class_beg;
b18215fc 2967
25fe55af
RS
2968 PATFETCH (c);
2969 c1 = 0;
ed0767d8 2970 class_beg = p;
b18215fc 2971
25fe55af
RS
2972 /* If pattern is `[[:'. */
2973 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2974
25fe55af
RS
2975 for (;;)
2976 {
14473664
SM
2977 PATFETCH (c);
2978 if ((c == ':' && *p == ']') || p == pend)
2979 break;
2980 if (c1 < CHAR_CLASS_MAX_LENGTH)
2981 str[c1++] = c;
2982 else
2983 /* This is in any case an invalid class name. */
2984 str[0] = '\0';
25fe55af
RS
2985 }
2986 str[c1] = '\0';
b18215fc
RS
2987
2988 /* If it isn't a word bracketed by `[:' and `:]':
2989 undo the ending character, the letters, and
2990 leave the leading `:' and `[' (but set bits for
2991 them). */
25fe55af
RS
2992 if (c == ':' && *p == ']')
2993 {
14473664 2994 re_wctype_t cc;
8f924df7 2995 int limit;
14473664
SM
2996
2997 cc = re_wctype (str);
2998
2999 if (cc == 0)
fa9a63c5
RM
3000 FREE_STACK_RETURN (REG_ECTYPE);
3001
14473664
SM
3002 /* Throw away the ] at the end of the character
3003 class. */
3004 PATFETCH (c);
fa9a63c5 3005
14473664 3006 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 3007
cf9c99bc
KH
3008#ifndef emacs
3009 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
3010 if (re_iswctype (btowc (ch), cc))
3011 {
3012 c = TRANSLATE (ch);
ed00c2ac
KH
3013 if (c < (1 << BYTEWIDTH))
3014 SET_LIST_BIT (c);
8f924df7 3015 }
cf9c99bc
KH
3016#else /* emacs */
3017 /* Most character classes in a multibyte match
3018 just set a flag. Exceptions are is_blank,
3019 is_digit, is_cntrl, and is_xdigit, since
3020 they can only match ASCII characters. We
3021 don't need to handle them for multibyte.
3022 They are distinguished by a negative wctype. */
96cc36cc 3023
254c06a8
SM
3024 /* Setup the gl_state object to its buffer-defined
3025 value. This hardcodes the buffer-global
3026 syntax-table for ASCII chars, while the other chars
3027 will obey syntax-table properties. It's not ideal,
3028 but it's the way it's been done until now. */
d48cd3f4 3029 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 3030
cf9c99bc 3031 for (ch = 0; ch < 256; ++ch)
25fe55af 3032 {
cf9c99bc
KH
3033 c = RE_CHAR_TO_MULTIBYTE (ch);
3034 if (! CHAR_BYTE8_P (c)
3035 && re_iswctype (c, cc))
8f924df7 3036 {
cf9c99bc
KH
3037 SET_LIST_BIT (ch);
3038 c1 = TRANSLATE (c);
3039 if (c1 == c)
3040 continue;
3041 if (ASCII_CHAR_P (c1))
3042 SET_LIST_BIT (c1);
3043 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3044 SET_LIST_BIT (c1);
8f924df7 3045 }
25fe55af 3046 }
cf9c99bc
KH
3047 SET_RANGE_TABLE_WORK_AREA_BIT
3048 (range_table_work, re_wctype_to_bit (cc));
3049#endif /* emacs */
6224b623
SM
3050 /* In most cases the matching rule for char classes
3051 only uses the syntax table for multibyte chars,
3052 so that the content of the syntax-table is not
3053 hardcoded in the range_table. SPACE and WORD are
3054 the two exceptions. */
3055 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3056 bufp->used_syntax = 1;
3057
b18215fc
RS
3058 /* Repeat the loop. */
3059 continue;
25fe55af
RS
3060 }
3061 else
3062 {
ed0767d8
SM
3063 /* Go back to right after the "[:". */
3064 p = class_beg;
25fe55af 3065 SET_LIST_BIT ('[');
b18215fc
RS
3066
3067 /* Because the `:' may start a range, we
3068 can't simply set bit and repeat the loop.
7814e705 3069 Instead, just set it to C and handle below. */
b18215fc 3070 c = ':';
25fe55af
RS
3071 }
3072 }
b18215fc
RS
3073
3074 if (p < pend && p[0] == '-' && p[1] != ']')
3075 {
3076
3077 /* Discard the `-'. */
3078 PATFETCH (c1);
3079
3080 /* Fetch the character which ends the range. */
3081 PATFETCH (c1);
cf9c99bc
KH
3082#ifdef emacs
3083 if (CHAR_BYTE8_P (c1)
3084 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3085 /* Treat the range from a multibyte character to
3086 raw-byte character as empty. */
3087 c = c1 + 1;
3088#endif /* emacs */
e318085a 3089 }
25fe55af 3090 else
b18215fc
RS
3091 /* Range from C to C. */
3092 c1 = c;
3093
cf9c99bc 3094 if (c > c1)
25fe55af 3095 {
cf9c99bc
KH
3096 if (syntax & RE_NO_EMPTY_RANGES)
3097 FREE_STACK_RETURN (REG_ERANGEX);
3098 /* Else, repeat the loop. */
bf216479 3099 }
6fdd04b0 3100 else
25fe55af 3101 {
cf9c99bc
KH
3102#ifndef emacs
3103 /* Set the range into bitmap */
8f924df7 3104 for (; c <= c1; c++)
b18215fc 3105 {
cf9c99bc
KH
3106 ch = TRANSLATE (c);
3107 if (ch < (1 << BYTEWIDTH))
3108 SET_LIST_BIT (ch);
3109 }
3110#else /* emacs */
3111 if (c < 128)
3112 {
3113 ch = MIN (127, c1);
3114 SETUP_ASCII_RANGE (range_table_work, c, ch);
3115 c = ch + 1;
3116 if (CHAR_BYTE8_P (c1))
3117 c = BYTE8_TO_CHAR (128);
3118 }
3119 if (c <= c1)
3120 {
3121 if (CHAR_BYTE8_P (c))
3122 {
3123 c = CHAR_TO_BYTE8 (c);
3124 c1 = CHAR_TO_BYTE8 (c1);
3125 for (; c <= c1; c++)
3126 SET_LIST_BIT (c);
3127 }
3128 else if (multibyte)
3129 {
3130 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3131 }
3132 else
3133 {
3134 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3135 }
e934739e 3136 }
cf9c99bc 3137#endif /* emacs */
25fe55af 3138 }
e318085a
RS
3139 }
3140
25fe55af 3141 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3142 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3143 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3144 b[-1]--;
3145 b += b[-1];
fa9a63c5 3146
96cc36cc
RS
3147 /* Build real range table from work area. */
3148 if (RANGE_TABLE_WORK_USED (range_table_work)
3149 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3150 {
3151 int i;
3152 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3153
b18215fc 3154 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3155 bytes for flags, two for COUNT, and three bytes for
3156 each character. */
3157 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3158
b18215fc
RS
3159 /* Indicate the existence of range table. */
3160 laststart[1] |= 0x80;
fa9a63c5 3161
96cc36cc
RS
3162 /* Store the character class flag bits into the range table.
3163 If not in emacs, these flag bits are always 0. */
3164 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3165 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3166
b18215fc
RS
3167 STORE_NUMBER_AND_INCR (b, used / 2);
3168 for (i = 0; i < used; i++)
3169 STORE_CHARACTER_AND_INCR
3170 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3171 }
25fe55af
RS
3172 }
3173 break;
fa9a63c5
RM
3174
3175
b18215fc 3176 case '(':
25fe55af
RS
3177 if (syntax & RE_NO_BK_PARENS)
3178 goto handle_open;
3179 else
3180 goto normal_char;
fa9a63c5
RM
3181
3182
25fe55af
RS
3183 case ')':
3184 if (syntax & RE_NO_BK_PARENS)
3185 goto handle_close;
3186 else
3187 goto normal_char;
e318085a
RS
3188
3189
25fe55af
RS
3190 case '\n':
3191 if (syntax & RE_NEWLINE_ALT)
3192 goto handle_alt;
3193 else
3194 goto normal_char;
e318085a
RS
3195
3196
b18215fc 3197 case '|':
25fe55af
RS
3198 if (syntax & RE_NO_BK_VBAR)
3199 goto handle_alt;
3200 else
3201 goto normal_char;
3202
3203
3204 case '{':
3205 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3206 goto handle_interval;
3207 else
3208 goto normal_char;
3209
3210
3211 case '\\':
3212 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3213
3214 /* Do not translate the character after the \, so that we can
3215 distinguish, e.g., \B from \b, even if we normally would
3216 translate, e.g., B to b. */
36595814 3217 PATFETCH (c);
25fe55af
RS
3218
3219 switch (c)
3220 {
3221 case '(':
3222 if (syntax & RE_NO_BK_PARENS)
3223 goto normal_backslash;
3224
3225 handle_open:
505bde11
SM
3226 {
3227 int shy = 0;
c69b0314 3228 regnum_t regnum = 0;
505bde11
SM
3229 if (p+1 < pend)
3230 {
3231 /* Look for a special (?...) construct */
ed0767d8 3232 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3233 {
ed0767d8 3234 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3235 while (!shy)
505bde11 3236 {
c69b0314
SM
3237 PATFETCH (c);
3238 switch (c)
3239 {
3240 case ':': shy = 1; break;
3241 case '0':
3242 /* An explicitly specified regnum must start
3243 with non-0. */
3244 if (regnum == 0)
3245 FREE_STACK_RETURN (REG_BADPAT);
3246 case '1': case '2': case '3': case '4':
3247 case '5': case '6': case '7': case '8': case '9':
3248 regnum = 10*regnum + (c - '0'); break;
3249 default:
3250 /* Only (?:...) is supported right now. */
3251 FREE_STACK_RETURN (REG_BADPAT);
3252 }
505bde11
SM
3253 }
3254 }
505bde11
SM
3255 }
3256
3257 if (!shy)
c69b0314
SM
3258 regnum = ++bufp->re_nsub;
3259 else if (regnum)
3260 { /* It's actually not shy, but explicitly numbered. */
3261 shy = 0;
3262 if (regnum > bufp->re_nsub)
3263 bufp->re_nsub = regnum;
3264 else if (regnum > bufp->re_nsub
3265 /* Ideally, we'd want to check that the specified
3266 group can't have matched (i.e. all subgroups
3267 using the same regnum are in other branches of
3268 OR patterns), but we don't currently keep track
3269 of enough info to do that easily. */
3270 || group_in_compile_stack (compile_stack, regnum))
3271 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3272 }
c69b0314
SM
3273 else
3274 /* It's really shy. */
3275 regnum = - bufp->re_nsub;
25fe55af 3276
99633e97
SM
3277 if (COMPILE_STACK_FULL)
3278 {
3279 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3280 compile_stack_elt_t);
3281 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3282
99633e97
SM
3283 compile_stack.size <<= 1;
3284 }
25fe55af 3285
99633e97 3286 /* These are the values to restore when we hit end of this
7814e705 3287 group. They are all relative offsets, so that if the
99633e97
SM
3288 whole pattern moves because of realloc, they will still
3289 be valid. */
3290 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3291 COMPILE_STACK_TOP.fixup_alt_jump
3292 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3293 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3294 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3295
c69b0314
SM
3296 /* Do not push a start_memory for groups beyond the last one
3297 we can represent in the compiled pattern. */
3298 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3299 BUF_PUSH_2 (start_memory, regnum);
3300
3301 compile_stack.avail++;
3302
3303 fixup_alt_jump = 0;
3304 laststart = 0;
3305 begalt = b;
3306 /* If we've reached MAX_REGNUM groups, then this open
3307 won't actually generate any code, so we'll have to
3308 clear pending_exact explicitly. */
3309 pending_exact = 0;
3310 break;
505bde11 3311 }
25fe55af
RS
3312
3313 case ')':
3314 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3315
3316 if (COMPILE_STACK_EMPTY)
505bde11
SM
3317 {
3318 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3319 goto normal_backslash;
3320 else
3321 FREE_STACK_RETURN (REG_ERPAREN);
3322 }
25fe55af
RS
3323
3324 handle_close:
505bde11 3325 FIXUP_ALT_JUMP ();
25fe55af
RS
3326
3327 /* See similar code for backslashed left paren above. */
3328 if (COMPILE_STACK_EMPTY)
505bde11
SM
3329 {
3330 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3331 goto normal_char;
3332 else
3333 FREE_STACK_RETURN (REG_ERPAREN);
3334 }
25fe55af
RS
3335
3336 /* Since we just checked for an empty stack above, this
3337 ``can't happen''. */
3338 assert (compile_stack.avail != 0);
3339 {
3340 /* We don't just want to restore into `regnum', because
3341 later groups should continue to be numbered higher,
7814e705 3342 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3343 regnum_t regnum;
25fe55af
RS
3344
3345 compile_stack.avail--;
3346 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3347 fixup_alt_jump
3348 = COMPILE_STACK_TOP.fixup_alt_jump
3349 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3350 : 0;
3351 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3352 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3353 /* If we've reached MAX_REGNUM groups, then this close
3354 won't actually generate any code, so we'll have to
3355 clear pending_exact explicitly. */
3356 pending_exact = 0;
e318085a 3357
25fe55af 3358 /* We're at the end of the group, so now we know how many
7814e705 3359 groups were inside this one. */
c69b0314
SM
3360 if (regnum <= MAX_REGNUM && regnum > 0)
3361 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3362 }
3363 break;
3364
3365
3366 case '|': /* `\|'. */
3367 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3368 goto normal_backslash;
3369 handle_alt:
3370 if (syntax & RE_LIMITED_OPS)
3371 goto normal_char;
3372
3373 /* Insert before the previous alternative a jump which
7814e705 3374 jumps to this alternative if the former fails. */
25fe55af
RS
3375 GET_BUFFER_SPACE (3);
3376 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3377 pending_exact = 0;
3378 b += 3;
3379
3380 /* The alternative before this one has a jump after it
3381 which gets executed if it gets matched. Adjust that
3382 jump so it will jump to this alternative's analogous
3383 jump (put in below, which in turn will jump to the next
3384 (if any) alternative's such jump, etc.). The last such
3385 jump jumps to the correct final destination. A picture:
3386 _____ _____
3387 | | | |
3388 | v | v
3389 a | b | c
3390
3391 If we are at `b', then fixup_alt_jump right now points to a
3392 three-byte space after `a'. We'll put in the jump, set
3393 fixup_alt_jump to right after `b', and leave behind three
3394 bytes which we'll fill in when we get to after `c'. */
3395
505bde11 3396 FIXUP_ALT_JUMP ();
25fe55af
RS
3397
3398 /* Mark and leave space for a jump after this alternative,
3399 to be filled in later either by next alternative or
3400 when we know we're at the end of a series of alternatives. */
3401 fixup_alt_jump = b;
3402 GET_BUFFER_SPACE (3);
3403 b += 3;
3404
3405 laststart = 0;
3406 begalt = b;
3407 break;
3408
3409
3410 case '{':
3411 /* If \{ is a literal. */
3412 if (!(syntax & RE_INTERVALS)
3413 /* If we're at `\{' and it's not the open-interval
3414 operator. */
4bb91c68 3415 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3416 goto normal_backslash;
3417
3418 handle_interval:
3419 {
3420 /* If got here, then the syntax allows intervals. */
3421
3422 /* At least (most) this many matches must be made. */
99633e97 3423 int lower_bound = 0, upper_bound = -1;
25fe55af 3424
ed0767d8 3425 beg_interval = p;
25fe55af 3426
25fe55af
RS
3427 GET_UNSIGNED_NUMBER (lower_bound);
3428
3429 if (c == ',')
ed0767d8 3430 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3431 else
3432 /* Interval such as `{1}' => match exactly once. */
3433 upper_bound = lower_bound;
3434
3435 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3436 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3437 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3438
3439 if (!(syntax & RE_NO_BK_BRACES))
3440 {
4bb91c68
SM
3441 if (c != '\\')
3442 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3443 if (p == pend)
3444 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3445 PATFETCH (c);
3446 }
3447
3448 if (c != '}')
4bb91c68 3449 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3450
3451 /* We just parsed a valid interval. */
3452
3453 /* If it's invalid to have no preceding re. */
3454 if (!laststart)
3455 {
3456 if (syntax & RE_CONTEXT_INVALID_OPS)
3457 FREE_STACK_RETURN (REG_BADRPT);
3458 else if (syntax & RE_CONTEXT_INDEP_OPS)
3459 laststart = b;
3460 else
3461 goto unfetch_interval;
3462 }
3463
6df42991
SM
3464 if (upper_bound == 0)
3465 /* If the upper bound is zero, just drop the sub pattern
3466 altogether. */
3467 b = laststart;
3468 else if (lower_bound == 1 && upper_bound == 1)
3469 /* Just match it once: nothing to do here. */
3470 ;
3471
3472 /* Otherwise, we have a nontrivial interval. When
3473 we're all done, the pattern will look like:
3474 set_number_at <jump count> <upper bound>
3475 set_number_at <succeed_n count> <lower bound>
3476 succeed_n <after jump addr> <succeed_n count>
3477 <body of loop>
3478 jump_n <succeed_n addr> <jump count>
3479 (The upper bound and `jump_n' are omitted if
3480 `upper_bound' is 1, though.) */
3481 else
3482 { /* If the upper bound is > 1, we need to insert
3483 more at the end of the loop. */
3484 unsigned int nbytes = (upper_bound < 0 ? 3
3485 : upper_bound > 1 ? 5 : 0);
3486 unsigned int startoffset = 0;
3487
3488 GET_BUFFER_SPACE (20); /* We might use less. */
3489
3490 if (lower_bound == 0)
3491 {
3492 /* A succeed_n that starts with 0 is really a
3493 simple on_failure_jump_loop. */
3494 INSERT_JUMP (on_failure_jump_loop, laststart,
3495 b + 3 + nbytes);
3496 b += 3;
3497 }
3498 else
3499 {
3500 /* Initialize lower bound of the `succeed_n', even
3501 though it will be set during matching by its
3502 attendant `set_number_at' (inserted next),
3503 because `re_compile_fastmap' needs to know.
3504 Jump to the `jump_n' we might insert below. */
3505 INSERT_JUMP2 (succeed_n, laststart,
3506 b + 5 + nbytes,
3507 lower_bound);
3508 b += 5;
3509
3510 /* Code to initialize the lower bound. Insert
7814e705 3511 before the `succeed_n'. The `5' is the last two
6df42991
SM
3512 bytes of this `set_number_at', plus 3 bytes of
3513 the following `succeed_n'. */
3514 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3515 b += 5;
3516 startoffset += 5;
3517 }
3518
3519 if (upper_bound < 0)
3520 {
3521 /* A negative upper bound stands for infinity,
3522 in which case it degenerates to a plain jump. */
3523 STORE_JUMP (jump, b, laststart + startoffset);
3524 b += 3;
3525 }
3526 else if (upper_bound > 1)
3527 { /* More than one repetition is allowed, so
3528 append a backward jump to the `succeed_n'
3529 that starts this interval.
3530
3531 When we've reached this during matching,
3532 we'll have matched the interval once, so
3533 jump back only `upper_bound - 1' times. */
3534 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3535 upper_bound - 1);
3536 b += 5;
3537
3538 /* The location we want to set is the second
3539 parameter of the `jump_n'; that is `b-2' as
3540 an absolute address. `laststart' will be
3541 the `set_number_at' we're about to insert;
3542 `laststart+3' the number to set, the source
3543 for the relative address. But we are
3544 inserting into the middle of the pattern --
3545 so everything is getting moved up by 5.
3546 Conclusion: (b - 2) - (laststart + 3) + 5,
3547 i.e., b - laststart.
3548
3549 We insert this at the beginning of the loop
3550 so that if we fail during matching, we'll
3551 reinitialize the bounds. */
3552 insert_op2 (set_number_at, laststart, b - laststart,
3553 upper_bound - 1, b);
3554 b += 5;
3555 }
3556 }
25fe55af
RS
3557 pending_exact = 0;
3558 beg_interval = NULL;
3559 }
3560 break;
3561
3562 unfetch_interval:
3563 /* If an invalid interval, match the characters as literals. */
3564 assert (beg_interval);
3565 p = beg_interval;
3566 beg_interval = NULL;
3567
3568 /* normal_char and normal_backslash need `c'. */
ed0767d8 3569 c = '{';
25fe55af
RS
3570
3571 if (!(syntax & RE_NO_BK_BRACES))
3572 {
ed0767d8
SM
3573 assert (p > pattern && p[-1] == '\\');
3574 goto normal_backslash;
25fe55af 3575 }
ed0767d8
SM
3576 else
3577 goto normal_char;
e318085a 3578
b18215fc 3579#ifdef emacs
25fe55af 3580 /* There is no way to specify the before_dot and after_dot
7814e705 3581 operators. rms says this is ok. --karl */
25fe55af
RS
3582 case '=':
3583 BUF_PUSH (at_dot);
3584 break;
3585
3586 case 's':
3587 laststart = b;
3588 PATFETCH (c);
3589 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3590 break;
3591
3592 case 'S':
3593 laststart = b;
3594 PATFETCH (c);
3595 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3596 break;
b18215fc
RS
3597
3598 case 'c':
3599 laststart = b;
36595814 3600 PATFETCH (c);
b18215fc
RS
3601 BUF_PUSH_2 (categoryspec, c);
3602 break;
e318085a 3603
b18215fc
RS
3604 case 'C':
3605 laststart = b;
36595814 3606 PATFETCH (c);
b18215fc
RS
3607 BUF_PUSH_2 (notcategoryspec, c);
3608 break;
3609#endif /* emacs */
e318085a 3610
e318085a 3611
25fe55af 3612 case 'w':
4bb91c68
SM
3613 if (syntax & RE_NO_GNU_OPS)
3614 goto normal_char;
25fe55af 3615 laststart = b;
1fb352e0 3616 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3617 break;
e318085a 3618
e318085a 3619
25fe55af 3620 case 'W':
4bb91c68
SM
3621 if (syntax & RE_NO_GNU_OPS)
3622 goto normal_char;
25fe55af 3623 laststart = b;
1fb352e0 3624 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3625 break;
e318085a
RS
3626
3627
25fe55af 3628 case '<':
4bb91c68
SM
3629 if (syntax & RE_NO_GNU_OPS)
3630 goto normal_char;
25fe55af
RS
3631 BUF_PUSH (wordbeg);
3632 break;
e318085a 3633
25fe55af 3634 case '>':
4bb91c68
SM
3635 if (syntax & RE_NO_GNU_OPS)
3636 goto normal_char;
25fe55af
RS
3637 BUF_PUSH (wordend);
3638 break;
e318085a 3639
669fa600
SM
3640 case '_':
3641 if (syntax & RE_NO_GNU_OPS)
3642 goto normal_char;
3643 laststart = b;
3644 PATFETCH (c);
3645 if (c == '<')
3646 BUF_PUSH (symbeg);
3647 else if (c == '>')
3648 BUF_PUSH (symend);
3649 else
3650 FREE_STACK_RETURN (REG_BADPAT);
3651 break;
3652
25fe55af 3653 case 'b':
4bb91c68
SM
3654 if (syntax & RE_NO_GNU_OPS)
3655 goto normal_char;
25fe55af
RS
3656 BUF_PUSH (wordbound);
3657 break;
e318085a 3658
25fe55af 3659 case 'B':
4bb91c68
SM
3660 if (syntax & RE_NO_GNU_OPS)
3661 goto normal_char;
25fe55af
RS
3662 BUF_PUSH (notwordbound);
3663 break;
fa9a63c5 3664
25fe55af 3665 case '`':
4bb91c68
SM
3666 if (syntax & RE_NO_GNU_OPS)
3667 goto normal_char;
25fe55af
RS
3668 BUF_PUSH (begbuf);
3669 break;
e318085a 3670
25fe55af 3671 case '\'':
4bb91c68
SM
3672 if (syntax & RE_NO_GNU_OPS)
3673 goto normal_char;
25fe55af
RS
3674 BUF_PUSH (endbuf);
3675 break;
e318085a 3676
25fe55af
RS
3677 case '1': case '2': case '3': case '4': case '5':
3678 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3679 {
3680 regnum_t reg;
e318085a 3681
0cdd06f8
SM
3682 if (syntax & RE_NO_BK_REFS)
3683 goto normal_backslash;
e318085a 3684
0cdd06f8 3685 reg = c - '0';
e318085a 3686
c69b0314
SM
3687 if (reg > bufp->re_nsub || reg < 1
3688 /* Can't back reference to a subexp before its end. */
3689 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3690 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3691
0cdd06f8
SM
3692 laststart = b;
3693 BUF_PUSH_2 (duplicate, reg);
3694 }
25fe55af 3695 break;
e318085a 3696
e318085a 3697
25fe55af
RS
3698 case '+':
3699 case '?':
3700 if (syntax & RE_BK_PLUS_QM)
3701 goto handle_plus;
3702 else
3703 goto normal_backslash;
3704
3705 default:
3706 normal_backslash:
3707 /* You might think it would be useful for \ to mean
3708 not to translate; but if we don't translate it
4bb91c68 3709 it will never match anything. */
25fe55af
RS
3710 goto normal_char;
3711 }
3712 break;
fa9a63c5
RM
3713
3714
3715 default:
25fe55af 3716 /* Expects the character in `c'. */
fa9a63c5 3717 normal_char:
36595814 3718 /* If no exactn currently being built. */
25fe55af 3719 if (!pending_exact
fa9a63c5 3720
25fe55af
RS
3721 /* If last exactn not at current position. */
3722 || pending_exact + *pending_exact + 1 != b
5e69f11e 3723
25fe55af 3724 /* We have only one byte following the exactn for the count. */
2d1675e4 3725 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3726
7814e705 3727 /* If followed by a repetition operator. */
9d99031f 3728 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3729 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3730 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3731 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3732 || ((syntax & RE_INTERVALS)
25fe55af 3733 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3734 ? p != pend && *p == '{'
3735 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3736 {
3737 /* Start building a new exactn. */
5e69f11e 3738
25fe55af 3739 laststart = b;
fa9a63c5
RM
3740
3741 BUF_PUSH_2 (exactn, 0);
3742 pending_exact = b - 1;
25fe55af 3743 }
5e69f11e 3744
2d1675e4
SM
3745 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3746 {
e0277a47
KH
3747 int len;
3748
cf9c99bc 3749 if (multibyte)
6fdd04b0 3750 {
cf9c99bc 3751 c = TRANSLATE (c);
6fdd04b0
KH
3752 len = CHAR_STRING (c, b);
3753 b += len;
3754 }
e0277a47 3755 else
6fdd04b0 3756 {
cf9c99bc
KH
3757 c1 = RE_CHAR_TO_MULTIBYTE (c);
3758 if (! CHAR_BYTE8_P (c1))
3759 {
3760 re_wchar_t c2 = TRANSLATE (c1);
3761
3762 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3763 c = c1;
409f2919 3764 }
6fdd04b0
KH
3765 *b++ = c;
3766 len = 1;
3767 }
2d1675e4
SM
3768 (*pending_exact) += len;
3769 }
3770
fa9a63c5 3771 break;
25fe55af 3772 } /* switch (c) */
fa9a63c5
RM
3773 } /* while p != pend */
3774
5e69f11e 3775
fa9a63c5 3776 /* Through the pattern now. */
5e69f11e 3777
505bde11 3778 FIXUP_ALT_JUMP ();
fa9a63c5 3779
5e69f11e 3780 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3781 FREE_STACK_RETURN (REG_EPAREN);
3782
3783 /* If we don't want backtracking, force success
3784 the first time we reach the end of the compiled pattern. */
3785 if (syntax & RE_NO_POSIX_BACKTRACKING)
3786 BUF_PUSH (succeed);
3787
fa9a63c5
RM
3788 /* We have succeeded; set the length of the buffer. */
3789 bufp->used = b - bufp->buffer;
3790
3791#ifdef DEBUG
99633e97 3792 if (debug > 0)
fa9a63c5 3793 {
505bde11 3794 re_compile_fastmap (bufp);
fa9a63c5
RM
3795 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3796 print_compiled_pattern (bufp);
3797 }
99633e97 3798 debug--;
fa9a63c5
RM
3799#endif /* DEBUG */
3800
3801#ifndef MATCH_MAY_ALLOCATE
3802 /* Initialize the failure stack to the largest possible stack. This
3803 isn't necessary unless we're trying to avoid calling alloca in
3804 the search and match routines. */
3805 {
3806 int num_regs = bufp->re_nsub + 1;
3807
320a2a73 3808 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3809 {
a26f4ccd 3810 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3811
fa9a63c5
RM
3812 if (! fail_stack.stack)
3813 fail_stack.stack
5e69f11e 3814 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3815 * sizeof (fail_stack_elt_t));
3816 else
3817 fail_stack.stack
3818 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3819 (fail_stack.size
3820 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3821 }
3822
3823 regex_grow_registers (num_regs);
3824 }
3825#endif /* not MATCH_MAY_ALLOCATE */
3826
839966f3 3827 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3828} /* regex_compile */
3829\f
3830/* Subroutines for `regex_compile'. */
3831
7814e705 3832/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3833
3834static void
971de7fb 3835store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3836{
3837 *loc = (unsigned char) op;
3838 STORE_NUMBER (loc + 1, arg);
3839}
3840
3841
3842/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3843
3844static void
971de7fb 3845store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3846{
3847 *loc = (unsigned char) op;
3848 STORE_NUMBER (loc + 1, arg1);
3849 STORE_NUMBER (loc + 3, arg2);
3850}
3851
3852
3853/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3854 for OP followed by two-byte integer parameter ARG. */
3855
3856static void
971de7fb 3857insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3858{
3859 register unsigned char *pfrom = end;
3860 register unsigned char *pto = end + 3;
3861
3862 while (pfrom != loc)
3863 *--pto = *--pfrom;
5e69f11e 3864
fa9a63c5
RM
3865 store_op1 (op, loc, arg);
3866}
3867
3868
3869/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3870
3871static void
971de7fb 3872insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3873{
3874 register unsigned char *pfrom = end;
3875 register unsigned char *pto = end + 5;
3876
3877 while (pfrom != loc)
3878 *--pto = *--pfrom;
5e69f11e 3879
fa9a63c5
RM
3880 store_op2 (op, loc, arg1, arg2);
3881}
3882
3883
3884/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3885 after an alternative or a begin-subexpression. We assume there is at
3886 least one character before the ^. */
3887
3888static boolean
971de7fb 3889at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3890{
01618498 3891 re_char *prev = p - 2;
fa9a63c5 3892 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3893
fa9a63c5
RM
3894 return
3895 /* After a subexpression? */
3896 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3897 /* After an alternative? */
d2af47df
SM
3898 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3899 /* After a shy subexpression? */
3900 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3901 && prev[-1] == '?' && prev[-2] == '('
3902 && (syntax & RE_NO_BK_PARENS
3903 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3904}
3905
3906
3907/* The dual of at_begline_loc_p. This one is for $. We assume there is
3908 at least one character after the $, i.e., `P < PEND'. */
3909
3910static boolean
971de7fb 3911at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3912{
01618498 3913 re_char *next = p;
fa9a63c5 3914 boolean next_backslash = *next == '\\';
01618498 3915 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3916
fa9a63c5
RM
3917 return
3918 /* Before a subexpression? */
3919 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3920 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3921 /* Before an alternative? */
3922 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3923 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3924}
3925
3926
5e69f11e 3927/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3928 false if it's not. */
3929
3930static boolean
971de7fb 3931group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5
RM
3932{
3933 int this_element;
3934
5e69f11e
RM
3935 for (this_element = compile_stack.avail - 1;
3936 this_element >= 0;
fa9a63c5
RM
3937 this_element--)
3938 if (compile_stack.stack[this_element].regnum == regnum)
3939 return true;
3940
3941 return false;
3942}
fa9a63c5 3943\f
f6a3f532
SM
3944/* analyse_first.
3945 If fastmap is non-NULL, go through the pattern and fill fastmap
3946 with all the possible leading chars. If fastmap is NULL, don't
3947 bother filling it up (obviously) and only return whether the
3948 pattern could potentially match the empty string.
3949
3950 Return 1 if p..pend might match the empty string.
3951 Return 0 if p..pend matches at least one char.
01618498 3952 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3953
3954static int
438105ed 3955analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
fa9a63c5 3956{
505bde11 3957 int j, k;
1fb352e0 3958 boolean not;
fa9a63c5 3959
b18215fc 3960 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3961 flag is set true. */
b18215fc
RS
3962 boolean match_any_multibyte_characters = false;
3963
f6a3f532 3964 assert (p);
5e69f11e 3965
505bde11
SM
3966 /* The loop below works as follows:
3967 - It has a working-list kept in the PATTERN_STACK and which basically
3968 starts by only containing a pointer to the first operation.
3969 - If the opcode we're looking at is a match against some set of
3970 chars, then we add those chars to the fastmap and go on to the
3971 next work element from the worklist (done via `break').
3972 - If the opcode is a control operator on the other hand, we either
3973 ignore it (if it's meaningless at this point, such as `start_memory')
3974 or execute it (if it's a jump). If the jump has several destinations
3975 (i.e. `on_failure_jump'), then we push the other destination onto the
3976 worklist.
3977 We guarantee termination by ignoring backward jumps (more or less),
3978 so that `p' is monotonically increasing. More to the point, we
3979 never set `p' (or push) anything `<= p1'. */
3980
01618498 3981 while (p < pend)
fa9a63c5 3982 {
505bde11
SM
3983 /* `p1' is used as a marker of how far back a `on_failure_jump'
3984 can go without being ignored. It is normally equal to `p'
3985 (which prevents any backward `on_failure_jump') except right
3986 after a plain `jump', to allow patterns such as:
3987 0: jump 10
3988 3..9: <body>
3989 10: on_failure_jump 3
3990 as used for the *? operator. */
01618498 3991 re_char *p1 = p;
5e69f11e 3992
fa9a63c5
RM
3993 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3994 {
f6a3f532 3995 case succeed:
01618498 3996 return 1;
fa9a63c5 3997
fa9a63c5 3998 case duplicate:
505bde11
SM
3999 /* If the first character has to match a backreference, that means
4000 that the group was empty (since it already matched). Since this
4001 is the only case that interests us here, we can assume that the
4002 backreference must match the empty string. */
4003 p++;
4004 continue;
fa9a63c5
RM
4005
4006
4007 /* Following are the cases which match a character. These end
7814e705 4008 with `break'. */
fa9a63c5
RM
4009
4010 case exactn:
e0277a47 4011 if (fastmap)
cf9c99bc
KH
4012 {
4013 /* If multibyte is nonzero, the first byte of each
4014 character is an ASCII or a leading code. Otherwise,
4015 each byte is a character. Thus, this works in both
4016 cases. */
4017 fastmap[p[1]] = 1;
4018 if (! multibyte)
4019 {
4020 /* For the case of matching this unibyte regex
4021 against multibyte, we must set a leading code of
4022 the corresponding multibyte character. */
4023 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4024
86e893e3 4025 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
4026 }
4027 }
fa9a63c5
RM
4028 break;
4029
4030
1fb352e0
SM
4031 case anychar:
4032 /* We could put all the chars except for \n (and maybe \0)
4033 but we don't bother since it is generally not worth it. */
f6a3f532 4034 if (!fastmap) break;
01618498 4035 return -1;
fa9a63c5
RM
4036
4037
b18215fc 4038 case charset_not:
1fb352e0 4039 if (!fastmap) break;
bf216479
KH
4040 {
4041 /* Chars beyond end of bitmap are possible matches. */
bf216479 4042 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 4043 j < (1 << BYTEWIDTH); j++)
bf216479
KH
4044 fastmap[j] = 1;
4045 }
4046
1fb352e0
SM
4047 /* Fallthrough */
4048 case charset:
4049 if (!fastmap) break;
4050 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4051 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4052 j >= 0; j--)
1fb352e0 4053 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4054 fastmap[j] = 1;
b18215fc 4055
6482db2e
KH
4056#ifdef emacs
4057 if (/* Any leading code can possibly start a character
1fb352e0 4058 which doesn't match the specified set of characters. */
6482db2e 4059 not
409f2919 4060 ||
6482db2e
KH
4061 /* If we can match a character class, we can match any
4062 multibyte characters. */
4063 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4064 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4065
b18215fc 4066 {
b18215fc
RS
4067 if (match_any_multibyte_characters == false)
4068 {
6482db2e
KH
4069 for (j = MIN_MULTIBYTE_LEADING_CODE;
4070 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4071 fastmap[j] = 1;
b18215fc
RS
4072 match_any_multibyte_characters = true;
4073 }
4074 }
b18215fc 4075
1fb352e0
SM
4076 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4077 && match_any_multibyte_characters == false)
4078 {
bf216479 4079 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 4080 multibyte character in the range table. */
1fb352e0 4081 int c, count;
bf216479 4082 unsigned char lc1, lc2;
b18215fc 4083
1fb352e0 4084 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4085 bits for a character class. */
1fb352e0 4086 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4087
1fb352e0
SM
4088 /* Extract the number of ranges in range table into COUNT. */
4089 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4090 for (; count > 0; count--, p += 3)
1fb352e0 4091 {
9117d724
KH
4092 /* Extract the start and end of each range. */
4093 EXTRACT_CHARACTER (c, p);
bf216479 4094 lc1 = CHAR_LEADING_CODE (c);
9117d724 4095 p += 3;
1fb352e0 4096 EXTRACT_CHARACTER (c, p);
bf216479
KH
4097 lc2 = CHAR_LEADING_CODE (c);
4098 for (j = lc1; j <= lc2; j++)
9117d724 4099 fastmap[j] = 1;
1fb352e0
SM
4100 }
4101 }
6482db2e 4102#endif
b18215fc
RS
4103 break;
4104
1fb352e0
SM
4105 case syntaxspec:
4106 case notsyntaxspec:
4107 if (!fastmap) break;
4108#ifndef emacs
4109 not = (re_opcode_t)p[-1] == notsyntaxspec;
4110 k = *p++;
4111 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4112 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4113 fastmap[j] = 1;
b18215fc 4114 break;
1fb352e0 4115#else /* emacs */
b18215fc
RS
4116 /* This match depends on text properties. These end with
4117 aborting optimizations. */
01618498 4118 return -1;
b18215fc
RS
4119
4120 case categoryspec:
b18215fc 4121 case notcategoryspec:
1fb352e0
SM
4122 if (!fastmap) break;
4123 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4124 k = *p++;
6482db2e 4125 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4126 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4127 fastmap[j] = 1;
4128
6482db2e
KH
4129 /* Any leading code can possibly start a character which
4130 has or doesn't has the specified category. */
4131 if (match_any_multibyte_characters == false)
6fdd04b0 4132 {
6482db2e
KH
4133 for (j = MIN_MULTIBYTE_LEADING_CODE;
4134 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4135 fastmap[j] = 1;
4136 match_any_multibyte_characters = true;
6fdd04b0 4137 }
b18215fc
RS
4138 break;
4139
fa9a63c5 4140 /* All cases after this match the empty string. These end with
25fe55af 4141 `continue'. */
fa9a63c5 4142
fa9a63c5
RM
4143 case before_dot:
4144 case at_dot:
4145 case after_dot:
1fb352e0 4146#endif /* !emacs */
25fe55af
RS
4147 case no_op:
4148 case begline:
4149 case endline:
fa9a63c5
RM
4150 case begbuf:
4151 case endbuf:
4152 case wordbound:
4153 case notwordbound:
4154 case wordbeg:
4155 case wordend:
669fa600
SM
4156 case symbeg:
4157 case symend:
25fe55af 4158 continue;
fa9a63c5
RM
4159
4160
fa9a63c5 4161 case jump:
25fe55af 4162 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4163 if (j < 0)
4164 /* Backward jumps can only go back to code that we've already
4165 visited. `re_compile' should make sure this is true. */
4166 break;
25fe55af 4167 p += j;
505bde11
SM
4168 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4169 {
4170 case on_failure_jump:
4171 case on_failure_keep_string_jump:
505bde11 4172 case on_failure_jump_loop:
0683b6fa 4173 case on_failure_jump_nastyloop:
505bde11
SM
4174 case on_failure_jump_smart:
4175 p++;
4176 break;
4177 default:
4178 continue;
4179 };
4180 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4181 to jump back to "just after here". */
4182 /* Fallthrough */
fa9a63c5 4183
25fe55af
RS
4184 case on_failure_jump:
4185 case on_failure_keep_string_jump:
0683b6fa 4186 case on_failure_jump_nastyloop:
505bde11
SM
4187 case on_failure_jump_loop:
4188 case on_failure_jump_smart:
25fe55af 4189 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4190 if (p + j <= p1)
ed0767d8 4191 ; /* Backward jump to be ignored. */
01618498
SM
4192 else
4193 { /* We have to look down both arms.
4194 We first go down the "straight" path so as to minimize
4195 stack usage when going through alternatives. */
4196 int r = analyse_first (p, pend, fastmap, multibyte);
4197 if (r) return r;
4198 p += j;
4199 }
25fe55af 4200 continue;
fa9a63c5
RM
4201
4202
ed0767d8
SM
4203 case jump_n:
4204 /* This code simply does not properly handle forward jump_n. */
4205 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4206 p += 4;
4207 /* jump_n can either jump or fall through. The (backward) jump
4208 case has already been handled, so we only need to look at the
4209 fallthrough case. */
4210 continue;
177c0ea7 4211
fa9a63c5 4212 case succeed_n:
ed0767d8
SM
4213 /* If N == 0, it should be an on_failure_jump_loop instead. */
4214 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4215 p += 4;
4216 /* We only care about one iteration of the loop, so we don't
4217 need to consider the case where this behaves like an
4218 on_failure_jump. */
25fe55af 4219 continue;
fa9a63c5
RM
4220
4221
4222 case set_number_at:
25fe55af
RS
4223 p += 4;
4224 continue;
fa9a63c5
RM
4225
4226
4227 case start_memory:
25fe55af 4228 case stop_memory:
505bde11 4229 p += 1;
fa9a63c5
RM
4230 continue;
4231
4232
4233 default:
25fe55af
RS
4234 abort (); /* We have listed all the cases. */
4235 } /* switch *p++ */
fa9a63c5
RM
4236
4237 /* Getting here means we have found the possible starting
25fe55af 4238 characters for one path of the pattern -- and that the empty
7814e705 4239 string does not match. We need not follow this path further. */
01618498 4240 return 0;
fa9a63c5
RM
4241 } /* while p */
4242
01618498
SM
4243 /* We reached the end without matching anything. */
4244 return 1;
4245
f6a3f532
SM
4246} /* analyse_first */
4247\f
4248/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4249 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4250 characters can start a string that matches the pattern. This fastmap
4251 is used by re_search to skip quickly over impossible starting points.
4252
4253 Character codes above (1 << BYTEWIDTH) are not represented in the
4254 fastmap, but the leading codes are represented. Thus, the fastmap
4255 indicates which character sets could start a match.
4256
4257 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4258 area as BUFP->fastmap.
4259
4260 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4261 the pattern buffer.
4262
4263 Returns 0 if we succeed, -2 if an internal error. */
4264
4265int
971de7fb 4266re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4267{
4268 char *fastmap = bufp->fastmap;
4269 int analysis;
4270
4271 assert (fastmap && bufp->buffer);
4272
72af86bd 4273 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4274 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4275
4276 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4277 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4278 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4279 return 0;
4280} /* re_compile_fastmap */
4281\f
4282/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4283 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4284 this memory for recording register information. STARTS and ENDS
4285 must be allocated using the malloc library routine, and must each
4286 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4287
4288 If NUM_REGS == 0, then subsequent matches should allocate their own
4289 register data.
4290
4291 Unless this function is called, the first search or match using
4292 PATTERN_BUFFER will allocate its own register data, without
4293 freeing the old data. */
4294
4295void
971de7fb 4296re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4297{
4298 if (num_regs)
4299 {
4300 bufp->regs_allocated = REGS_REALLOCATE;
4301 regs->num_regs = num_regs;
4302 regs->start = starts;
4303 regs->end = ends;
4304 }
4305 else
4306 {
4307 bufp->regs_allocated = REGS_UNALLOCATED;
4308 regs->num_regs = 0;
4309 regs->start = regs->end = (regoff_t *) 0;
4310 }
4311}
c0f9ea08 4312WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4313\f
7814e705 4314/* Searching routines. */
fa9a63c5
RM
4315
4316/* Like re_search_2, below, but only one string is specified, and
4317 doesn't let you say where to stop matching. */
4318
4319int
971de7fb 4320re_search (struct re_pattern_buffer *bufp, const char *string, int size, int startpos, int range, struct re_registers *regs)
fa9a63c5 4321{
5e69f11e 4322 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4323 regs, size);
4324}
c0f9ea08 4325WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4326
70806df6
KH
/* The three macros below address the virtual concatenation of
   `string1' and `string2'.  They expand in terms of the local
   variables `string1', `size1', `string2' and `size2' of the
   function that uses them.  */

/* Start address of whichever string position P falls in.  */
#define HEAD_ADDR_VSTRING(P) \
  ((P) >= size1 ? string2 : string1)

/* End address of whichever string position P falls in.  */
#define STOP_ADDR_VSTRING(P) \
  ((P) >= size1 ? string2 + size2 : string1 + size1)

/* Address of position POS in the virtual concatenation.  */
#define POS_ADDR_VSTRING(POS) \
  (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4338
4339/* Using the compiled pattern in BUFP->buffer, first tries to match the
4340 virtual concatenation of STRING1 and STRING2, starting first at index
4341 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4342
fa9a63c5 4343 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4344
fa9a63c5
RM
4345 RANGE is how far to scan while trying to match. RANGE = 0 means try
4346 only at STARTPOS; in general, the last start tried is STARTPOS +
4347 RANGE.
5e69f11e 4348
fa9a63c5
RM
4349 In REGS, return the indices of the virtual concatenation of STRING1
4350 and STRING2 that matched the entire BUFP->buffer and its contained
4351 subexpressions.
5e69f11e 4352
fa9a63c5
RM
4353 Do not consider matching one past the index STOP in the virtual
4354 concatenation of STRING1 and STRING2.
4355
4356 We return either the position in the strings at which the match was
4357 found, -1 if no match, or -2 if error (such as failure
4358 stack overflow). */
4359
4360int
971de7fb 4361re_search_2 (struct re_pattern_buffer *bufp, const char *str1, int size1, const char *str2, int size2, int startpos, int range, struct re_registers *regs, int stop)
fa9a63c5
RM
4362{
4363 int val;
66f0296e
SM
4364 re_char *string1 = (re_char*) str1;
4365 re_char *string2 = (re_char*) str2;
fa9a63c5 4366 register char *fastmap = bufp->fastmap;
6676cb1c 4367 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4368 int total_size = size1 + size2;
4369 int endpos = startpos + range;
c0f9ea08 4370 boolean anchored_start;
cf9c99bc
KH
4371 /* Nonzero if we are searching multibyte string. */
4372 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4373
fa9a63c5
RM
4374 /* Check for out-of-range STARTPOS. */
4375 if (startpos < 0 || startpos > total_size)
4376 return -1;
5e69f11e 4377
fa9a63c5 4378 /* Fix up RANGE if it might eventually take us outside
34597fa9 4379 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4380 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4381 if (endpos < 0)
4382 range = 0 - startpos;
fa9a63c5
RM
4383 else if (endpos > total_size)
4384 range = total_size - startpos;
4385
4386 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4387 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4388 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4389 {
4390 if (startpos > 0)
4391 return -1;
4392 else
7b140fd7 4393 range = 0;
fa9a63c5
RM
4394 }
4395
ae4788a8
RS
4396#ifdef emacs
4397 /* In a forward search for something that starts with \=.
4398 don't keep searching past point. */
4399 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4400 {
7b140fd7
RS
4401 range = PT_BYTE - BEGV_BYTE - startpos;
4402 if (range < 0)
ae4788a8
RS
4403 return -1;
4404 }
4405#endif /* emacs */
4406
fa9a63c5
RM
4407 /* Update the fastmap now if not correct already. */
4408 if (fastmap && !bufp->fastmap_accurate)
01618498 4409 re_compile_fastmap (bufp);
5e69f11e 4410
c8499ba5 4411 /* See whether the pattern is anchored. */
c0f9ea08 4412 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4413
b18215fc 4414#ifdef emacs
d48cd3f4 4415 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4416 {
99633e97 4417 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4418
4419 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4420 }
b18215fc
RS
4421#endif
4422
fa9a63c5
RM
4423 /* Loop through the string, looking for a place to start matching. */
4424 for (;;)
5e69f11e 4425 {
c8499ba5
RS
4426 /* If the pattern is anchored,
4427 skip quickly past places we cannot match.
4428 We don't bother to treat startpos == 0 specially
4429 because that case doesn't repeat. */
4430 if (anchored_start && startpos > 0)
4431 {
c0f9ea08
SM
4432 if (! ((startpos <= size1 ? string1[startpos - 1]
4433 : string2[startpos - size1 - 1])
4434 == '\n'))
c8499ba5
RS
4435 goto advance;
4436 }
4437
fa9a63c5 4438 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4439 cannot be the start of a match. If the pattern can match the
4440 null string, however, we don't need to skip characters; we want
7814e705 4441 the first null string. */
fa9a63c5
RM
4442 if (fastmap && startpos < total_size && !bufp->can_be_null)
4443 {
66f0296e 4444 register re_char *d;
01618498 4445 register re_wchar_t buf_ch;
e934739e
RS
4446
4447 d = POS_ADDR_VSTRING (startpos);
4448
7814e705 4449 if (range > 0) /* Searching forwards. */
fa9a63c5 4450 {
fa9a63c5
RM
4451 register int lim = 0;
4452 int irange = range;
4453
25fe55af
RS
4454 if (startpos < size1 && startpos + range >= size1)
4455 lim = range - (size1 - startpos);
fa9a63c5 4456
25fe55af
RS
4457 /* Written out as an if-else to avoid testing `translate'
4458 inside the loop. */
28ae27ae
AS
4459 if (RE_TRANSLATE_P (translate))
4460 {
e934739e
RS
4461 if (multibyte)
4462 while (range > lim)
4463 {
4464 int buf_charlen;
4465
62a6e103 4466 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4467 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4468 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4469 break;
4470
4471 range -= buf_charlen;
4472 d += buf_charlen;
4473 }
4474 else
bf216479 4475 while (range > lim)
33c46939 4476 {
cf9c99bc
KH
4477 register re_wchar_t ch, translated;
4478
bf216479 4479 buf_ch = *d;
cf9c99bc
KH
4480 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4481 translated = RE_TRANSLATE (translate, ch);
4482 if (translated != ch
4483 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4484 buf_ch = ch;
6fdd04b0 4485 if (fastmap[buf_ch])
bf216479 4486 break;
33c46939
RS
4487 d++;
4488 range--;
4489 }
e934739e 4490 }
fa9a63c5 4491 else
6fdd04b0
KH
4492 {
4493 if (multibyte)
4494 while (range > lim)
4495 {
4496 int buf_charlen;
fa9a63c5 4497
62a6e103 4498 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4499 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4500 break;
4501 range -= buf_charlen;
4502 d += buf_charlen;
4503 }
e934739e 4504 else
6fdd04b0 4505 while (range > lim && !fastmap[*d])
33c46939
RS
4506 {
4507 d++;
4508 range--;
4509 }
e934739e 4510 }
fa9a63c5
RM
4511 startpos += irange - range;
4512 }
7814e705 4513 else /* Searching backwards. */
fa9a63c5 4514 {
ba5e343c
KH
4515 if (multibyte)
4516 {
62a6e103 4517 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4518 buf_ch = TRANSLATE (buf_ch);
4519 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4520 goto advance;
4521 }
4522 else
4523 {
cf9c99bc
KH
4524 register re_wchar_t ch, translated;
4525
4526 buf_ch = *d;
4527 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4528 translated = TRANSLATE (ch);
4529 if (translated != ch
4530 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4531 buf_ch = ch;
4532 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4533 goto advance;
4534 }
fa9a63c5
RM
4535 }
4536 }
4537
4538 /* If can't match the null string, and that's all we have left, fail. */
4539 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4540 && !bufp->can_be_null)
fa9a63c5
RM
4541 return -1;
4542
4543 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4544 startpos, regs, stop);
fa9a63c5
RM
4545
4546 if (val >= 0)
4547 return startpos;
5e69f11e 4548
fa9a63c5
RM
4549 if (val == -2)
4550 return -2;
4551
4552 advance:
5e69f11e 4553 if (!range)
25fe55af 4554 break;
5e69f11e 4555 else if (range > 0)
25fe55af 4556 {
b18215fc
RS
4557 /* Update STARTPOS to the next character boundary. */
4558 if (multibyte)
4559 {
66f0296e
SM
4560 re_char *p = POS_ADDR_VSTRING (startpos);
4561 re_char *pend = STOP_ADDR_VSTRING (startpos);
aa3830c4 4562 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4563
4564 range -= len;
4565 if (range < 0)
4566 break;
4567 startpos += len;
4568 }
4569 else
4570 {
b560c397
RS
4571 range--;
4572 startpos++;
4573 }
e318085a 4574 }
fa9a63c5 4575 else
25fe55af
RS
4576 {
4577 range++;
4578 startpos--;
b18215fc
RS
4579
4580 /* Update STARTPOS to the previous character boundary. */
4581 if (multibyte)
4582 {
70806df6
KH
4583 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4584 re_char *p0 = p;
4585 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4586
4587 /* Find the head of multibyte form. */
70806df6
KH
4588 PREV_CHAR_BOUNDARY (p, phead);
4589 range += p0 - 1 - p;
4590 if (range > 0)
4591 break;
b18215fc 4592
70806df6 4593 startpos -= p0 - 1 - p;
b18215fc 4594 }
25fe55af 4595 }
fa9a63c5
RM
4596 }
4597 return -1;
4598} /* re_search_2 */
c0f9ea08 4599WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4600\f
4601/* Declarations and macros for re_match_2. */
4602
2d1675e4
SM
4603static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4604 register int len,
4605 RE_TRANSLATE_TYPE translate,
4606 const int multibyte));
fa9a63c5
RM
4607
/* Convert PTR, a pointer into one of the search strings `string1'
   and `string2', into an offset: relative to `string1' when
   FIRST_STRING_P (PTR) holds, otherwise relative to `string2' plus
   `size1'.  */
#define POINTER_TO_OFFSET(ptr)                          \
  ((regoff_t) (FIRST_STRING_P (ptr)                     \
               ? (ptr) - string1                        \
               : (ptr) - string2 + size1))
4614
/* Call before fetching a character with *d.  When `d' has reached
   `dend', this either jumps to `fail' (if `dend' is already
   `end_match_2') or switches over to string2.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH()                                                      \
  while (d == dend)                                                     \
    {                                                                   \
      /* End of string2 => fail.  */                                    \
      if (dend == end_match_2)                                          \
        goto fail;                                                      \
      /* End of string1 => advance to string2.  */                      \
      d = string2;                                                      \
      dend = end_match_2;                                               \
    }
4629
f1ad044f
SM
/* Invoke before reading a character through `*d' when the other
   limits have already been checked.  Intended for lookahead
   operations such as wordend, which may need to inspect parts of the
   string lying outside the LIMITs (i.e. past `stop').
   Unlike PREFETCH this never jumps to `fail': when `d' is at `end1'
   it simply switches to the start of `string2'.
   The definition deliberately ends without a line continuation, so
   that whatever follows the macro can never be absorbed into it.  */
#define PREFETCH_NOLIMIT()                                      \
  if (d == end1)                                                \
    {                                                           \
      d = string2;                                              \
      dend = end_match_2;                                       \
    }
fa9a63c5
RM
4640
/* Predicates for the very beginning and the very end of the virtual
   concatenation of `string1' and `string2'.  When there is only one
   string, it is `string2'.  AT_STRINGS_BEG is also true whenever
   `size2' is zero.  */
#define AT_STRINGS_BEG(d) (!size2 || (d) == (size1 ? string1 : string2))
#define AT_STRINGS_END(d) (end2 == (d))
fa9a63c5
RM
4645
4646
/* True if the character D designates has syntax class Sword.  Two
   positions are redirected before the character is read: D == end1
   (just past string1) reads the first character of string2, and
   D == string2 - 1 (just before string2) reads the last character of
   string1.  */
#define WORDCHAR_P(d)                                           \
  (SYNTAX (*((d) == end1 ? string2                              \
             : (d) == string2 - 1 ? end1 - 1                    \
             : (d)))                                            \
   == Sword)
4655
9121ca40 4656/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4657
4658/* The comment at case wordbound is following one, but we don't use
4659 AT_WORD_BOUNDARY anymore to support multibyte form.
4660
4661 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4662 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4663 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4664 macro and introducing temporary variables works around the bug. */
4665
9121ca40 4666#if 0
fa9a63c5
RM
4667/* Test if the character before D and the one at D differ with respect
4668 to being word-constituent. */
4669#define AT_WORD_BOUNDARY(d) \
4670 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4671 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4672#endif
fa9a63c5
RM
4673
/* Release everything the matcher allocated.

   FREE_VAR frees VAR when it is non-null and resets it to NULL; the
   dangling `else' absorbs the semicolon written after each use.
   FREE_VARIABLES releases the failure stack and then the four
   register vectors.  When MATCH_MAY_ALLOCATE is not defined it
   expands to a no-op expression.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
# define FREE_VARIABLES()                                       \
  do {                                                          \
    REGEX_FREE_STACK (fail_stack.stack);                        \
    FREE_VAR (regstart);                                        \
    FREE_VAR (regend);                                          \
    FREE_VAR (best_regstart);                                   \
    FREE_VAR (best_regend);                                     \
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
#endif /* not MATCH_MAY_ALLOCATE */
4688
505bde11
SM
4689\f
4690/* Optimization routines. */
4691
4e8a9132
SM
4692/* If the operation is a match against one or more chars,
4693 return a pointer to the next operation, else return NULL. */
01618498 4694static re_char *
971de7fb 4695skip_one_char (const re_char *p)
4e8a9132
SM
4696{
4697 switch (SWITCH_ENUM_CAST (*p++))
4698 {
4699 case anychar:
4700 break;
177c0ea7 4701
4e8a9132
SM
4702 case exactn:
4703 p += *p + 1;
4704 break;
4705
4706 case charset_not:
4707 case charset:
4708 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4709 {
4710 int mcnt;
4711 p = CHARSET_RANGE_TABLE (p - 1);
4712 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4713 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4714 }
4715 else
4716 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4717 break;
177c0ea7 4718
4e8a9132
SM
4719 case syntaxspec:
4720 case notsyntaxspec:
1fb352e0 4721#ifdef emacs
4e8a9132
SM
4722 case categoryspec:
4723 case notcategoryspec:
4724#endif /* emacs */
4725 p++;
4726 break;
4727
4728 default:
4729 p = NULL;
4730 }
4731 return p;
4732}
4733
4734
505bde11 4735/* Jump over non-matching operations. */
839966f3 4736static re_char *
971de7fb 4737skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4738{
4739 int mcnt;
4740 while (p < pend)
4741 {
4742 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4743 {
4744 case start_memory:
505bde11
SM
4745 case stop_memory:
4746 p += 2; break;
4747 case no_op:
4748 p += 1; break;
4749 case jump:
4750 p += 1;
4751 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4752 p += mcnt;
4753 break;
4754 default:
4755 return p;
4756 }
4757 }
4758 assert (p == pend);
4759 return p;
4760}
4761
4762/* Non-zero if "p1 matches something" implies "p2 fails". */
4763static int
971de7fb 4764mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
505bde11 4765{
4e8a9132 4766 re_opcode_t op2;
2d1675e4 4767 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4768 unsigned char *pend = bufp->buffer + bufp->used;
4769
4e8a9132 4770 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4771 && p2 >= bufp->buffer && p2 <= pend);
4772
4773 /* Skip over open/close-group commands.
4774 If what follows this loop is a ...+ construct,
4775 look at what begins its body, since we will have to
4776 match at least one of that. */
4e8a9132
SM
4777 p2 = skip_noops (p2, pend);
4778 /* The same skip can be done for p1, except that this function
4779 is only used in the case where p1 is a simple match operator. */
4780 /* p1 = skip_noops (p1, pend); */
4781
4782 assert (p1 >= bufp->buffer && p1 < pend
4783 && p2 >= bufp->buffer && p2 <= pend);
4784
4785 op2 = p2 == pend ? succeed : *p2;
4786
4787 switch (SWITCH_ENUM_CAST (op2))
505bde11 4788 {
4e8a9132
SM
4789 case succeed:
4790 case endbuf:
4791 /* If we're at the end of the pattern, we can change. */
4792 if (skip_one_char (p1))
505bde11 4793 {
505bde11
SM
4794 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4795 return 1;
505bde11 4796 }
4e8a9132 4797 break;
177c0ea7 4798
4e8a9132 4799 case endline:
4e8a9132
SM
4800 case exactn:
4801 {
01618498 4802 register re_wchar_t c
4e8a9132 4803 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4804 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4805
4e8a9132
SM
4806 if ((re_opcode_t) *p1 == exactn)
4807 {
62a6e103 4808 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4809 {
4810 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4811 return 1;
4812 }
4813 }
505bde11 4814
4e8a9132
SM
4815 else if ((re_opcode_t) *p1 == charset
4816 || (re_opcode_t) *p1 == charset_not)
4817 {
4818 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4819
4e8a9132
SM
4820 /* Test if C is listed in charset (or charset_not)
4821 at `p1'. */
6fdd04b0 4822 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4823 {
4824 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4825 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4826 not = !not;
4827 }
4828 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4829 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4830
4e8a9132
SM
4831 /* `not' is equal to 1 if c would match, which means
4832 that we can't change to pop_failure_jump. */
4833 if (!not)
4834 {
4835 DEBUG_PRINT1 (" No match => fast loop.\n");
4836 return 1;
4837 }
4838 }
4839 else if ((re_opcode_t) *p1 == anychar
4840 && c == '\n')
4841 {
4842 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4843 return 1;
4844 }
4845 }
4846 break;
505bde11 4847
4e8a9132 4848 case charset:
4e8a9132
SM
4849 {
4850 if ((re_opcode_t) *p1 == exactn)
4851 /* Reuse the code above. */
4852 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4853
505bde11
SM
4854 /* It is hard to list up all the character in charset
4855 P2 if it includes multibyte character. Give up in
4856 such case. */
4857 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4858 {
4859 /* Now, we are sure that P2 has no range table.
4860 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4861 enough. But P1 may have range table, so the
505bde11
SM
4862 size of bitmap table of P1 is extracted by
4863 using macro `CHARSET_BITMAP_SIZE'.
4864
6fdd04b0
KH
4865 In a multibyte case, we know that all the character
4866 listed in P2 is ASCII. In a unibyte case, P1 has only a
4867 bitmap table. So, in both cases, it is enough to test
4868 only the bitmap table of P1. */
505bde11 4869
411e4203 4870 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4871 {
4872 int idx;
4873 /* We win if the charset inside the loop
4874 has no overlap with the one after the loop. */
4875 for (idx = 0;
4876 (idx < (int) p2[1]
4877 && idx < CHARSET_BITMAP_SIZE (p1));
4878 idx++)
4879 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4880 break;
4881
4882 if (idx == p2[1]
4883 || idx == CHARSET_BITMAP_SIZE (p1))
4884 {
4885 DEBUG_PRINT1 (" No match => fast loop.\n");
4886 return 1;
4887 }
4888 }
411e4203 4889 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4890 {
4891 int idx;
4892 /* We win if the charset_not inside the loop lists
7814e705 4893 every character listed in the charset after. */
505bde11
SM
4894 for (idx = 0; idx < (int) p2[1]; idx++)
4895 if (! (p2[2 + idx] == 0
4896 || (idx < CHARSET_BITMAP_SIZE (p1)
4897 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4898 break;
4899
4e8a9132
SM
4900 if (idx == p2[1])
4901 {
4902 DEBUG_PRINT1 (" No match => fast loop.\n");
4903 return 1;
4904 }
4905 }
4906 }
4907 }
609b757a 4908 break;
177c0ea7 4909
411e4203
SM
4910 case charset_not:
4911 switch (SWITCH_ENUM_CAST (*p1))
4912 {
4913 case exactn:
4914 case charset:
4915 /* Reuse the code above. */
4916 return mutually_exclusive_p (bufp, p2, p1);
4917 case charset_not:
4918 /* When we have two charset_not, it's very unlikely that
4919 they don't overlap. The union of the two sets of excluded
4920 chars should cover all possible chars, which, as a matter of
4921 fact, is virtually impossible in multibyte buffers. */
36595814 4922 break;
411e4203
SM
4923 }
4924 break;
4925
4e8a9132 4926 case wordend:
669fa600
SM
4927 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4928 case symend:
4e8a9132 4929 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4930 && (p1[1] == Ssymbol || p1[1] == Sword));
4931 case notsyntaxspec:
4932 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4933
4934 case wordbeg:
669fa600
SM
4935 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4936 case symbeg:
4e8a9132 4937 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4938 && (p1[1] == Ssymbol || p1[1] == Sword));
4939 case syntaxspec:
4940 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4941
4942 case wordbound:
4943 return (((re_opcode_t) *p1 == notsyntaxspec
4944 || (re_opcode_t) *p1 == syntaxspec)
4945 && p1[1] == Sword);
4946
1fb352e0 4947#ifdef emacs
4e8a9132
SM
4948 case categoryspec:
4949 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4950 case notcategoryspec:
4951 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4952#endif /* emacs */
4953
4954 default:
4955 ;
505bde11
SM
4956 }
4957
4958 /* Safe default. */
4959 return 0;
4960}
4961
fa9a63c5
RM
4962\f
4963/* Matching routines. */
4964
25fe55af 4965#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4966/* re_match is like re_match_2 except it takes only a single string. */
4967
4968int
d2762c86
DN
4969re_match (struct re_pattern_buffer *bufp, const char *string,
4970 int size, int pos, struct re_registers *regs)
fa9a63c5 4971{
4bb91c68 4972 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4973 pos, regs, size);
fa9a63c5
RM
4974 return result;
4975}
c0f9ea08 4976WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4977#endif /* not emacs */
4978
b18215fc
RS
4979#ifdef emacs
4980/* In Emacs, this is the string or buffer in which we
7814e705 4981 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4982Lisp_Object re_match_object;
4983#endif
fa9a63c5
RM
4984
4985/* re_match_2 matches the compiled pattern in BUFP against the
4986 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4987 and SIZE2, respectively). We start matching at POS, and stop
4988 matching at STOP.
5e69f11e 4989
fa9a63c5 4990 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 4991 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4992 documentation for exactly how many groups we fill.
4993
4994 We return -1 if no match, -2 if an internal error (such as the
7814e705 4995 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4996 matched substring. */
4997
4998int
971de7fb 4999re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1, const char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5 5000{
b18215fc 5001 int result;
25fe55af 5002
b18215fc 5003#ifdef emacs
cc9b4df2 5004 int charpos;
d48cd3f4 5005 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 5006 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 5007 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
5008#endif
5009
4bb91c68
SM
5010 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5011 (re_char*) string2, size2,
cc9b4df2 5012 pos, regs, stop);
fa9a63c5
RM
5013 return result;
5014}
c0f9ea08 5015WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 5016
bf216479 5017
fa9a63c5 5018/* This is a separate function so that we can force an alloca cleanup
7814e705 5019 afterwards. */
fa9a63c5 5020static int
971de7fb 5021re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1, int size1, const re_char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5
RM
5022{
5023 /* General temporaries. */
5024 int mcnt;
01618498 5025 size_t reg;
fa9a63c5
RM
5026
5027 /* Just past the end of the corresponding string. */
66f0296e 5028 re_char *end1, *end2;
fa9a63c5
RM
5029
5030 /* Pointers into string1 and string2, just past the last characters in
7814e705 5031 each to consider matching. */
66f0296e 5032 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
5033
5034 /* Where we are in the data, and the end of the current string. */
66f0296e 5035 re_char *d, *dend;
5e69f11e 5036
99633e97
SM
5037 /* Used sometimes to remember where we were before starting matching
5038 an operator so that we can go back in case of failure. This "atomic"
5039 behavior of matching opcodes is indispensable to the correctness
5040 of the on_failure_keep_string_jump optimization. */
5041 re_char *dfail;
5042
fa9a63c5 5043 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5044 re_char *p = bufp->buffer;
5045 re_char *pend = p + bufp->used;
fa9a63c5 5046
25fe55af 5047 /* We use this to map every character in the string. */
6676cb1c 5048 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5049
cf9c99bc 5050 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5051 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5052
cf9c99bc
KH
5053 /* Nonzero if STRING1/STRING2 are multibyte. */
5054 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5055
fa9a63c5
RM
5056 /* Failure point stack. Each place that can handle a failure further
5057 down the line pushes a failure point on this stack. It consists of
505bde11 5058 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5059 the subexpressions we're currently inside, plus the number of such
5060 registers, and, finally, two char *'s. The first char * is where
5061 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5062 scanning the strings. */
5063#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5064 fail_stack_type fail_stack;
5065#endif
5066#ifdef DEBUG
fa9a63c5
RM
5067 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5068#endif
5069
0b32bf0e 5070#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5071 /* This holds the pointer to the failure stack, when
5072 it is allocated relocatably. */
5073 fail_stack_elt_t *failure_stack_ptr;
99633e97 5074#endif
fa9a63c5
RM
5075
5076 /* We fill all the registers internally, independent of what we
7814e705 5077 return, for use in backreferences. The number here includes
fa9a63c5 5078 an element for register zero. */
4bb91c68 5079 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5080
fa9a63c5
RM
5081 /* Information on the contents of registers. These are pointers into
5082 the input strings; they record just what was matched (on this
5083 attempt) by a subexpression part of the pattern, that is, the
5084 regnum-th regstart pointer points to where in the pattern we began
5085 matching and the regnum-th regend points to right after where we
5086 stopped matching the regnum-th subexpression. (The zeroth register
5087 keeps track of what the whole pattern matches.) */
5088#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5089 re_char **regstart, **regend;
fa9a63c5
RM
5090#endif
5091
fa9a63c5 5092 /* The following record the register info as found in the above
5e69f11e 5093 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5094 This happens as we backtrack through the failure points, which in
5095 turn happens only if we have not yet matched the entire string. */
5096 unsigned best_regs_set = false;
5097#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5098 re_char **best_regstart, **best_regend;
fa9a63c5 5099#endif
5e69f11e 5100
fa9a63c5
RM
5101 /* Logically, this is `best_regend[0]'. But we don't want to have to
5102 allocate space for that if we're not allocating space for anything
7814e705 5103 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5104 any of the other register vectors, and it seems rather a kludge to
5105 treat `best_regend' differently than the rest. So we keep track of
5106 the end of the best match so far in a separate variable. We
5107 initialize this to NULL so that when we backtrack the first time
5108 and need to test it, it's not garbage. */
66f0296e 5109 re_char *match_end = NULL;
fa9a63c5 5110
fa9a63c5
RM
5111#ifdef DEBUG
5112 /* Counts the total number of registers pushed. */
5e69f11e 5113 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5114#endif
5115
5116 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5117
fa9a63c5 5118 INIT_FAIL_STACK ();
5e69f11e 5119
fa9a63c5
RM
5120#ifdef MATCH_MAY_ALLOCATE
5121 /* Do not bother to initialize all the register variables if there are
5122 no groups in the pattern, as it takes a fair amount of time. If
5123 there are groups, we include space for register 0 (the whole
5124 pattern), even though we never use it, since it simplifies the
5125 array indexing. We should fix this. */
5126 if (bufp->re_nsub)
5127 {
66f0296e
SM
5128 regstart = REGEX_TALLOC (num_regs, re_char *);
5129 regend = REGEX_TALLOC (num_regs, re_char *);
5130 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5131 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5132
505bde11 5133 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5134 {
5135 FREE_VARIABLES ();
5136 return -2;
5137 }
fa9a63c5
RM
5138 }
5139 else
5140 {
5141 /* We must initialize all our variables to NULL, so that
25fe55af 5142 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5143 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5144 }
5145#endif /* MATCH_MAY_ALLOCATE */
5146
5147 /* The starting position is bogus. */
5148 if (pos < 0 || pos > size1 + size2)
5149 {
5150 FREE_VARIABLES ();
5151 return -1;
5152 }
5e69f11e 5153
fa9a63c5
RM
5154 /* Initialize subexpression text positions to -1 to mark ones that no
5155 start_memory/stop_memory has been seen for. Also initialize the
5156 register information struct. */
01618498
SM
5157 for (reg = 1; reg < num_regs; reg++)
5158 regstart[reg] = regend[reg] = NULL;
99633e97 5159
fa9a63c5 5160 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5161 `string1' is null. */
fa9a63c5
RM
5162 if (size2 == 0 && string1 != NULL)
5163 {
5164 string2 = string1;
5165 size2 = size1;
5166 string1 = 0;
5167 size1 = 0;
5168 }
5169 end1 = string1 + size1;
5170 end2 = string2 + size2;
5171
5e69f11e 5172 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5173 `dend' is the end of the input string that `d' points within. `d'
5174 is advanced into the following input string whenever necessary, but
5175 this happens before fetching; therefore, at the beginning of the
5176 loop, `d' can be pointing at the end of a string, but it cannot
5177 equal `string2'. */
419d1c74 5178 if (pos >= size1)
fa9a63c5 5179 {
419d1c74
SM
5180 /* Only match within string2. */
5181 d = string2 + pos - size1;
5182 dend = end_match_2 = string2 + stop - size1;
5183 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5184 }
5185 else
5186 {
f1ad044f 5187 if (stop < size1)
419d1c74
SM
5188 {
5189 /* Only match within string1. */
5190 end_match_1 = string1 + stop;
5191 /* BEWARE!
5192 When we reach end_match_1, PREFETCH normally switches to string2.
5193 But in the present case, this means that just doing a PREFETCH
5194 makes us jump from `stop' to `gap' within the string.
5195 What we really want here is for the search to stop as
5196 soon as we hit end_match_1. That's why we set end_match_2
5197 to end_match_1 (since PREFETCH fails as soon as we hit
5198 end_match_2). */
5199 end_match_2 = end_match_1;
5200 }
5201 else
f1ad044f
SM
5202 { /* It's important to use this code when stop == size so that
5203 moving `d' from end1 to string2 will not prevent the d == dend
5204 check from catching the end of string. */
419d1c74
SM
5205 end_match_1 = end1;
5206 end_match_2 = string2 + stop - size1;
5207 }
5208 d = string1 + pos;
5209 dend = end_match_1;
fa9a63c5
RM
5210 }
5211
5212 DEBUG_PRINT1 ("The compiled pattern is: ");
5213 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5214 DEBUG_PRINT1 ("The string to match is: `");
5215 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5216 DEBUG_PRINT1 ("'\n");
5e69f11e 5217
7814e705 5218 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5219 function if the match is complete, or it drops through if the match
5220 fails at this starting point in the input data. */
5221 for (;;)
5222 {
505bde11 5223 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5224
5225 if (p == pend)
5226 { /* End of pattern means we might have succeeded. */
25fe55af 5227 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5228
fa9a63c5 5229 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5230 longest match, try backtracking. */
5231 if (d != end_match_2)
fa9a63c5
RM
5232 {
5233 /* 1 if this match ends in the same string (string1 or string2)
5234 as the best previous match. */
5e69f11e 5235 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5236 == FIRST_STRING_P (d));
fa9a63c5
RM
5237 /* 1 if this match is the best seen so far. */
5238 boolean best_match_p;
5239
5240 /* AIX compiler got confused when this was combined
7814e705 5241 with the previous declaration. */
fa9a63c5
RM
5242 if (same_str_p)
5243 best_match_p = d > match_end;
5244 else
99633e97 5245 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5246
25fe55af
RS
5247 DEBUG_PRINT1 ("backtracking.\n");
5248
5249 if (!FAIL_STACK_EMPTY ())
5250 { /* More failure points to try. */
5251
5252 /* If exceeds best match so far, save it. */
5253 if (!best_regs_set || best_match_p)
5254 {
5255 best_regs_set = true;
5256 match_end = d;
5257
5258 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5259
01618498 5260 for (reg = 1; reg < num_regs; reg++)
25fe55af 5261 {
01618498
SM
5262 best_regstart[reg] = regstart[reg];
5263 best_regend[reg] = regend[reg];
25fe55af
RS
5264 }
5265 }
5266 goto fail;
5267 }
5268
5269 /* If no failure points, don't restore garbage. And if
5270 last match is real best match, don't restore second
5271 best one. */
5272 else if (best_regs_set && !best_match_p)
5273 {
5274 restore_best_regs:
5275 /* Restore best match. It may happen that `dend ==
5276 end_match_1' while the restored d is in string2.
5277 For example, the pattern `x.*y.*z' against the
5278 strings `x-' and `y-z-', if the two strings are
7814e705 5279 not consecutive in memory. */
25fe55af
RS
5280 DEBUG_PRINT1 ("Restoring best registers.\n");
5281
5282 d = match_end;
5283 dend = ((d >= string1 && d <= end1)
5284 ? end_match_1 : end_match_2);
fa9a63c5 5285
01618498 5286 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5287 {
01618498
SM
5288 regstart[reg] = best_regstart[reg];
5289 regend[reg] = best_regend[reg];
fa9a63c5 5290 }
25fe55af
RS
5291 }
5292 } /* d != end_match_2 */
fa9a63c5
RM
5293
5294 succeed_label:
25fe55af 5295 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5296
25fe55af
RS
5297 /* If caller wants register contents data back, do it. */
5298 if (regs && !bufp->no_sub)
fa9a63c5 5299 {
25fe55af
RS
5300 /* Have the register data arrays been allocated? */
5301 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5302 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5303 extra element beyond `num_regs' for the `-1' marker
5304 GNU code uses. */
5305 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5306 regs->start = TALLOC (regs->num_regs, regoff_t);
5307 regs->end = TALLOC (regs->num_regs, regoff_t);
5308 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5309 {
5310 FREE_VARIABLES ();
5311 return -2;
5312 }
25fe55af
RS
5313 bufp->regs_allocated = REGS_REALLOCATE;
5314 }
5315 else if (bufp->regs_allocated == REGS_REALLOCATE)
5316 { /* Yes. If we need more elements than were already
5317 allocated, reallocate them. If we need fewer, just
5318 leave it alone. */
5319 if (regs->num_regs < num_regs + 1)
5320 {
5321 regs->num_regs = num_regs + 1;
5322 RETALLOC (regs->start, regs->num_regs, regoff_t);
5323 RETALLOC (regs->end, regs->num_regs, regoff_t);
5324 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5325 {
5326 FREE_VARIABLES ();
5327 return -2;
5328 }
25fe55af
RS
5329 }
5330 }
5331 else
fa9a63c5
RM
5332 {
5333 /* These braces fend off a "empty body in an else-statement"
7814e705 5334 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5335 assert (bufp->regs_allocated == REGS_FIXED);
5336 }
5337
25fe55af
RS
5338 /* Convert the pointer data in `regstart' and `regend' to
5339 indices. Register zero has to be set differently,
5340 since we haven't kept track of any info for it. */
5341 if (regs->num_regs > 0)
5342 {
5343 regs->start[0] = pos;
99633e97 5344 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5345 }
5e69f11e 5346
25fe55af
RS
5347 /* Go through the first `min (num_regs, regs->num_regs)'
5348 registers, since that is all we initialized. */
01618498 5349 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5350 {
01618498
SM
5351 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5352 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5353 else
5354 {
01618498
SM
5355 regs->start[reg]
5356 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5357 regs->end[reg]
5358 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5359 }
fa9a63c5 5360 }
5e69f11e 5361
25fe55af
RS
5362 /* If the regs structure we return has more elements than
5363 were in the pattern, set the extra elements to -1. If
5364 we (re)allocated the registers, this is the case,
5365 because we always allocate enough to have at least one
7814e705 5366 -1 at the end. */
01618498
SM
5367 for (reg = num_regs; reg < regs->num_regs; reg++)
5368 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5369 } /* regs && !bufp->no_sub */
5370
25fe55af
RS
5371 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5372 nfailure_points_pushed, nfailure_points_popped,
5373 nfailure_points_pushed - nfailure_points_popped);
5374 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5375
99633e97 5376 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5377
25fe55af 5378 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5379
25fe55af
RS
5380 FREE_VARIABLES ();
5381 return mcnt;
5382 }
fa9a63c5 5383
7814e705 5384 /* Otherwise match next pattern command. */
fa9a63c5
RM
5385 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5386 {
25fe55af
RS
5387 /* Ignore these. Used to ignore the n of succeed_n's which
5388 currently have n == 0. */
5389 case no_op:
5390 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5391 break;
fa9a63c5
RM
5392
5393 case succeed:
25fe55af 5394 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5395 goto succeed_label;
5396
7814e705 5397 /* Match the next n pattern characters exactly. The following
25fe55af 5398 byte in the pattern defines n, and the n bytes after that
7814e705 5399 are the characters to match. */
fa9a63c5
RM
5400 case exactn:
5401 mcnt = *p++;
25fe55af 5402 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5403
99633e97
SM
5404 /* Remember the start point to rollback upon failure. */
5405 dfail = d;
5406
6fdd04b0 5407#ifndef emacs
25fe55af
RS
5408 /* This is written out as an if-else so we don't waste time
5409 testing `translate' inside the loop. */
28703c16 5410 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5411 do
5412 {
5413 PREFETCH ();
5414 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5415 {
6fdd04b0
KH
5416 d = dfail;
5417 goto fail;
e934739e 5418 }
6fdd04b0
KH
5419 d++;
5420 }
5421 while (--mcnt);
fa9a63c5 5422 else
6fdd04b0
KH
5423 do
5424 {
5425 PREFETCH ();
5426 if (*d++ != *p++)
bf216479 5427 {
6fdd04b0
KH
5428 d = dfail;
5429 goto fail;
bf216479 5430 }
6fdd04b0
KH
5431 }
5432 while (--mcnt);
5433#else /* emacs */
5434 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5435 if (target_multibyte)
6fdd04b0
KH
5436 do
5437 {
5438 int pat_charlen, buf_charlen;
cf9c99bc 5439 int pat_ch, buf_ch;
e934739e 5440
6fdd04b0 5441 PREFETCH ();
cf9c99bc 5442 if (multibyte)
62a6e103 5443 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5444 else
5445 {
5446 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5447 pat_charlen = 1;
5448 }
62a6e103 5449 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5450
6fdd04b0 5451 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5452 {
6fdd04b0
KH
5453 d = dfail;
5454 goto fail;
e934739e 5455 }
bf216479 5456
6fdd04b0
KH
5457 p += pat_charlen;
5458 d += buf_charlen;
5459 mcnt -= pat_charlen;
5460 }
5461 while (mcnt > 0);
fa9a63c5 5462 else
6fdd04b0
KH
5463 do
5464 {
cf9c99bc
KH
5465 int pat_charlen, buf_charlen;
5466 int pat_ch, buf_ch;
bf216479 5467
6fdd04b0 5468 PREFETCH ();
cf9c99bc
KH
5469 if (multibyte)
5470 {
62a6e103 5471 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5472 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5473 }
5474 else
5475 {
5476 pat_ch = *p;
5477 pat_charlen = 1;
5478 }
5479 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5480 if (! CHAR_BYTE8_P (buf_ch))
5481 {
5482 buf_ch = TRANSLATE (buf_ch);
5483 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5484 if (buf_ch < 0)
5485 buf_ch = *d;
5486 }
0e2501ed
AS
5487 else
5488 buf_ch = *d;
cf9c99bc 5489 if (buf_ch != pat_ch)
6fdd04b0
KH
5490 {
5491 d = dfail;
5492 goto fail;
bf216479 5493 }
cf9c99bc
KH
5494 p += pat_charlen;
5495 d++;
6fdd04b0
KH
5496 }
5497 while (--mcnt);
5498#endif
25fe55af 5499 break;
fa9a63c5
RM
5500
5501
25fe55af 5502 /* Match any character except possibly a newline or a null. */
fa9a63c5 5503 case anychar:
e934739e
RS
5504 {
5505 int buf_charlen;
01618498 5506 re_wchar_t buf_ch;
fa9a63c5 5507
e934739e 5508 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5509
e934739e 5510 PREFETCH ();
62a6e103 5511 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5512 target_multibyte);
e934739e
RS
5513 buf_ch = TRANSLATE (buf_ch);
5514
5515 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5516 && buf_ch == '\n')
5517 || ((bufp->syntax & RE_DOT_NOT_NULL)
5518 && buf_ch == '\000'))
5519 goto fail;
5520
e934739e
RS
5521 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5522 d += buf_charlen;
5523 }
fa9a63c5
RM
5524 break;
5525
5526
5527 case charset:
5528 case charset_not:
5529 {
b18215fc 5530 register unsigned int c;
fa9a63c5 5531 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5532 int len;
5533
5534 /* Start of actual range_table, or end of bitmap if there is no
5535 range table. */
01618498 5536 re_char *range_table;
b18215fc 5537
96cc36cc 5538 /* Nonzero if there is a range table. */
b18215fc
RS
5539 int range_table_exists;
5540
96cc36cc
RS
5541 /* Number of ranges of range table. This is not included
5542 in the initial byte-length of the command. */
5543 int count = 0;
fa9a63c5 5544
f5020181
AS
5545 /* Whether matching against a unibyte character. */
5546 boolean unibyte_char = false;
5547
25fe55af 5548 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5549
b18215fc 5550 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5551
b18215fc 5552 if (range_table_exists)
96cc36cc
RS
5553 {
5554 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5555 EXTRACT_NUMBER_AND_INCR (count, range_table);
5556 }
b18215fc 5557
2d1675e4 5558 PREFETCH ();
62a6e103 5559 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5560 if (target_multibyte)
5561 {
5562 int c1;
b18215fc 5563
cf9c99bc
KH
5564 c = TRANSLATE (c);
5565 c1 = RE_CHAR_TO_UNIBYTE (c);
5566 if (c1 >= 0)
f5020181
AS
5567 {
5568 unibyte_char = true;
5569 c = c1;
5570 }
cf9c99bc
KH
5571 }
5572 else
5573 {
5574 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5575
5576 if (! CHAR_BYTE8_P (c1))
5577 {
5578 c1 = TRANSLATE (c1);
5579 c1 = RE_CHAR_TO_UNIBYTE (c1);
5580 if (c1 >= 0)
f5020181
AS
5581 {
5582 unibyte_char = true;
5583 c = c1;
5584 }
cf9c99bc 5585 }
0b8be006
AS
5586 else
5587 unibyte_char = true;
cf9c99bc
KH
5588 }
5589
f5020181 5590 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5591 { /* Lookup bitmap. */
b18215fc
RS
5592 /* Cast to `unsigned' instead of `unsigned char' in
5593 case the bit list is a full 32 bytes long. */
5594 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5595 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5596 not = !not;
b18215fc 5597 }
96cc36cc 5598#ifdef emacs
b18215fc 5599 else if (range_table_exists)
96cc36cc
RS
5600 {
5601 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5602
14473664
SM
5603 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5604 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5605 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5606 | (class_bits & BIT_SPACE && ISSPACE (c))
5607 | (class_bits & BIT_UPPER && ISUPPER (c))
5608 | (class_bits & BIT_WORD && ISWORD (c)))
5609 not = !not;
5610 else
5611 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5612 }
5613#endif /* emacs */
fa9a63c5 5614
96cc36cc
RS
5615 if (range_table_exists)
5616 p = CHARSET_RANGE_TABLE_END (range_table, count);
5617 else
5618 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5619
5620 if (!not) goto fail;
5e69f11e 5621
b18215fc 5622 d += len;
fa9a63c5
RM
5623 break;
5624 }
5625
5626
25fe55af 5627 /* The beginning of a group is represented by start_memory.
505bde11 5628 The argument is the register number. The text
25fe55af 5629 matched within the group is recorded (in the internal
7814e705 5630 registers data structure) under the register number. */
25fe55af 5631 case start_memory:
505bde11
SM
5632 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5633
5634 /* In case we need to undo this operation (via backtracking). */
5635 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5636
25fe55af 5637 regstart[*p] = d;
4bb91c68 5638 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5639 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5640
25fe55af 5641 /* Move past the register number and inner group count. */
505bde11 5642 p += 1;
25fe55af 5643 break;
fa9a63c5
RM
5644
5645
25fe55af 5646 /* The stop_memory opcode represents the end of a group. Its
505bde11 5647 argument is the same as start_memory's: the register number. */
fa9a63c5 5648 case stop_memory:
505bde11
SM
5649 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5650
5651 assert (!REG_UNSET (regstart[*p]));
5652 /* Strictly speaking, there should be code such as:
177c0ea7 5653
0b32bf0e 5654 assert (REG_UNSET (regend[*p]));
505bde11
SM
5655 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5656
5657 But the only info to be pushed is regend[*p] and it is known to
5658 be UNSET, so there really isn't anything to push.
5659 Not pushing anything, on the other hand deprives us from the
5660 guarantee that regend[*p] is UNSET since undoing this operation
5661 will not reset its value properly. This is not important since
5662 the value will only be read on the next start_memory or at
5663 the very end and both events can only happen if this stop_memory
5664 is *not* undone. */
fa9a63c5 5665
25fe55af 5666 regend[*p] = d;
fa9a63c5
RM
5667 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5668
25fe55af 5669 /* Move past the register number and the inner group count. */
505bde11 5670 p += 1;
25fe55af 5671 break;
fa9a63c5
RM
5672
5673
5674 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5675 followed by the numeric value of <digit> as the register number. */
5676 case duplicate:
fa9a63c5 5677 {
66f0296e 5678 register re_char *d2, *dend2;
7814e705 5679 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5680 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5681
7814e705 5682 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5683 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5684 goto fail;
5e69f11e 5685
7814e705 5686 /* Where in input to try to start matching. */
25fe55af 5687 d2 = regstart[regno];
5e69f11e 5688
99633e97
SM
5689 /* Remember the start point to rollback upon failure. */
5690 dfail = d;
5691
25fe55af
RS
5692 /* Where to stop matching; if both the place to start and
5693 the place to stop matching are in the same string, then
5694 set to the place to stop, otherwise, for now have to use
5695 the end of the first string. */
fa9a63c5 5696
25fe55af 5697 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5698 == FIRST_STRING_P (regend[regno]))
5699 ? regend[regno] : end_match_1);
5700 for (;;)
5701 {
5702 /* If necessary, advance to next segment in register
25fe55af 5703 contents. */
fa9a63c5
RM
5704 while (d2 == dend2)
5705 {
5706 if (dend2 == end_match_2) break;
5707 if (dend2 == regend[regno]) break;
5708
25fe55af
RS
5709 /* End of string1 => advance to string2. */
5710 d2 = string2;
5711 dend2 = regend[regno];
fa9a63c5
RM
5712 }
5713 /* At end of register contents => success */
5714 if (d2 == dend2) break;
5715
5716 /* If necessary, advance to next segment in data. */
5717 PREFETCH ();
5718
5719 /* How many characters left in this segment to match. */
5720 mcnt = dend - d;
5e69f11e 5721
fa9a63c5 5722 /* Want how many consecutive characters we can match in
25fe55af
RS
5723 one shot, so, if necessary, adjust the count. */
5724 if (mcnt > dend2 - d2)
fa9a63c5 5725 mcnt = dend2 - d2;
5e69f11e 5726
fa9a63c5 5727 /* Compare that many; failure if mismatch, else move
25fe55af 5728 past them. */
28703c16 5729 if (RE_TRANSLATE_P (translate)
02cb78b5 5730 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5731 : memcmp (d, d2, mcnt))
99633e97
SM
5732 {
5733 d = dfail;
5734 goto fail;
5735 }
fa9a63c5 5736 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5737 }
5738 }
5739 break;
5740
5741
25fe55af 5742 /* begline matches the empty string at the beginning of the string
c0f9ea08 5743 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5744 case begline:
25fe55af 5745 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5746
25fe55af
RS
5747 if (AT_STRINGS_BEG (d))
5748 {
5749 if (!bufp->not_bol) break;
5750 }
419d1c74 5751 else
25fe55af 5752 {
bf216479 5753 unsigned c;
419d1c74 5754 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5755 if (c == '\n')
419d1c74 5756 break;
25fe55af
RS
5757 }
5758 /* In all other cases, we fail. */
5759 goto fail;
fa9a63c5
RM
5760
5761
25fe55af 5762 /* endline is the dual of begline. */
fa9a63c5 5763 case endline:
25fe55af 5764 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5765
25fe55af
RS
5766 if (AT_STRINGS_END (d))
5767 {
5768 if (!bufp->not_eol) break;
5769 }
f1ad044f 5770 else
25fe55af 5771 {
f1ad044f 5772 PREFETCH_NOLIMIT ();
c0f9ea08 5773 if (*d == '\n')
f1ad044f 5774 break;
25fe55af
RS
5775 }
5776 goto fail;
fa9a63c5
RM
5777
5778
5779 /* Match at the very beginning of the data. */
25fe55af
RS
5780 case begbuf:
5781 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5782 if (AT_STRINGS_BEG (d))
5783 break;
5784 goto fail;
fa9a63c5
RM
5785
5786
5787 /* Match at the very end of the data. */
25fe55af
RS
5788 case endbuf:
5789 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5790 if (AT_STRINGS_END (d))
5791 break;
25fe55af 5792 goto fail;
5e69f11e 5793
5e69f11e 5794
25fe55af
RS
5795 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5796 pushes NULL as the value for the string on the stack. Then
505bde11 5797 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5798 string, instead of restoring it. To see why, consider
7814e705 5799 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5800 then the . fails against the \n. But the next thing we want
5801 to do is match the \n against the \n; if we restored the
5802 string value, we would be back at the foo.
5803
5804 Because this is used only in specific cases, we don't need to
5805 check all the things that `on_failure_jump' does, to make
5806 sure the right things get saved on the stack. Hence we don't
5807 share its code. The only reason to push anything on the
5808 stack at all is that otherwise we would have to change
5809 `anychar's code to do something besides goto fail in this
5810 case; that seems worse than this. */
5811 case on_failure_keep_string_jump:
505bde11
SM
5812 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5813 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5814 mcnt, p + mcnt);
fa9a63c5 5815
505bde11
SM
5816 PUSH_FAILURE_POINT (p - 3, NULL);
5817 break;
5818
0683b6fa
SM
5819 /* A nasty loop is introduced by the non-greedy *? and +?.
5820 With such loops, the stack only ever contains one failure point
5821 at a time, so that a plain on_failure_jump_loop kind of
5822 cycle detection cannot work. Worse yet, such a detection
5823 can not only fail to detect a cycle, but it can also wrongly
5824 detect a cycle (between different instantiations of the same
6df42991 5825 loop).
0683b6fa
SM
5826 So the method used for those nasty loops is a little different:
5827 We use a special cycle-detection-stack-frame which is pushed
5828 when the on_failure_jump_nastyloop failure-point is *popped*.
5829 This special frame thus marks the beginning of one iteration
5830 through the loop and we can hence easily check right here
5831 whether something matched between the beginning and the end of
5832 the loop. */
5833 case on_failure_jump_nastyloop:
5834 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5835 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5836 mcnt, p + mcnt);
5837
5838 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5839 {
5840 int cycle = 0;
5841 CHECK_INFINITE_LOOP (p - 4, d);
5842 if (!cycle)
5843 /* If there's a cycle, just continue without pushing
5844 this failure point. The failure point is the "try again"
5845 option, which shouldn't be tried.
5846 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5847 PUSH_FAILURE_POINT (p - 3, d);
5848 }
0683b6fa
SM
5849 break;
5850
4e8a9132
SM
5851 /* Simple loop detecting on_failure_jump: just check on the
5852 failure stack if the same spot was already hit earlier. */
505bde11
SM
5853 case on_failure_jump_loop:
5854 on_failure:
5855 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5856 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5857 mcnt, p + mcnt);
6df42991
SM
5858 {
5859 int cycle = 0;
5860 CHECK_INFINITE_LOOP (p - 3, d);
5861 if (cycle)
5862 /* If there's a cycle, get out of the loop, as if the matching
5863 had failed. We used to just `goto fail' here, but that was
5864 aborting the search a bit too early: we want to keep the
5865 empty-loop-match and keep matching after the loop.
5866 We want (x?)*y\1z to match both xxyz and xxyxz. */
5867 p += mcnt;
5868 else
5869 PUSH_FAILURE_POINT (p - 3, d);
5870 }
25fe55af 5871 break;
fa9a63c5
RM
5872
5873
5874 /* Uses of on_failure_jump:
5e69f11e 5875
25fe55af
RS
5876 Each alternative starts with an on_failure_jump that points
5877 to the beginning of the next alternative. Each alternative
5878 except the last ends with a jump that in effect jumps past
5879 the rest of the alternatives. (They really jump to the
5880 ending jump of the following alternative, because tensioning
5881 these jumps is a hassle.)
fa9a63c5 5882
25fe55af
RS
5883 Repeats start with an on_failure_jump that points past both
5884 the repetition text and either the following jump or
5885 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5886 case on_failure_jump:
25fe55af 5887 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5888 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5889 mcnt, p + mcnt);
25fe55af 5890
505bde11 5891 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5892 break;
5893
4e8a9132 5894 /* This operation is used for greedy *.
505bde11
SM
5895 Compare the beginning of the repeat with what in the
5896 pattern follows its end. If we can establish that there
5897 is nothing that they would both match, i.e., that we
5898 would have to backtrack because of (as in, e.g., `a*a')
5899 then we can use a non-backtracking loop based on
4e8a9132 5900 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5901 case on_failure_jump_smart:
25fe55af 5902 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5903 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5904 mcnt, p + mcnt);
25fe55af 5905 {
01618498 5906 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5907 /* Here, we discard `const', making re_match non-reentrant. */
5908 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5909 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5910
505bde11
SM
5911 p -= 3; /* Reset so that we will re-execute the
5912 instruction once it's been changed. */
fa9a63c5 5913
4e8a9132
SM
5914 EXTRACT_NUMBER (mcnt, p2 - 2);
5915
5916 /* Ensure this is indeed the trivial kind of loop
5917 we are expecting. */
5918 assert (skip_one_char (p1) == p2 - 3);
5919 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5920 DEBUG_STATEMENT (debug += 2);
505bde11 5921 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5922 {
505bde11 5923 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5924 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5925 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5926 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5927 }
505bde11 5928 else
fa9a63c5 5929 {
505bde11
SM
5930 /* Default to a safe `on_failure_jump' loop. */
5931 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5932 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5933 }
99633e97 5934 DEBUG_STATEMENT (debug -= 2);
25fe55af 5935 }
505bde11 5936 break;
25fe55af
RS
5937
5938 /* Unconditionally jump (without popping any failure points). */
5939 case jump:
fa9a63c5 5940 unconditional_jump:
5b370c2b 5941 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5942 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5943 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5944 p += mcnt; /* Do the jump. */
505bde11 5945 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5946 break;
5947
5948
25fe55af
RS
5949 /* Have to succeed matching what follows at least n times.
5950 After that, handle like `on_failure_jump'. */
5951 case succeed_n:
01618498 5952 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5953 EXTRACT_NUMBER (mcnt, p + 2);
5954 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5955
dc1e502d
SM
5956 /* Originally, mcnt is how many times we HAVE to succeed. */
5957 if (mcnt != 0)
25fe55af 5958 {
6dcf2d0e
SM
5959 /* Here, we discard `const', making re_match non-reentrant. */
5960 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5961 mcnt--;
01618498
SM
5962 p += 4;
5963 PUSH_NUMBER (p2, mcnt);
25fe55af 5964 }
dc1e502d
SM
5965 else
5966 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5967 goto on_failure;
25fe55af
RS
5968 break;
5969
5970 case jump_n:
01618498 5971 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5972 EXTRACT_NUMBER (mcnt, p + 2);
5973 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5974
5975 /* Originally, this is how many times we CAN jump. */
dc1e502d 5976 if (mcnt != 0)
25fe55af 5977 {
6dcf2d0e
SM
5978 /* Here, we discard `const', making re_match non-reentrant. */
5979 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5980 mcnt--;
01618498 5981 PUSH_NUMBER (p2, mcnt);
dc1e502d 5982 goto unconditional_jump;
25fe55af
RS
5983 }
5984 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5985 else
5986 p += 4;
25fe55af 5987 break;
5e69f11e 5988
fa9a63c5
RM
5989 case set_number_at:
5990 {
01618498 5991 unsigned char *p2; /* Location of the counter. */
25fe55af 5992 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5993
25fe55af 5994 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5995 /* Here, we discard `const', making re_match non-reentrant. */
5996 p2 = (unsigned char*) p + mcnt;
01618498 5997 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5998 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5999 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6000 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
6001 break;
6002 }
9121ca40
KH
6003
6004 case wordbound:
66f0296e 6005 case notwordbound:
19ed5445
PE
6006 {
6007 boolean not = (re_opcode_t) *(p - 1) == notwordbound;
6008 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 6009
19ed5445 6010 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 6011
19ed5445
PE
6012 /* Case 1: D is at the beginning or the end of string. */
6013 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
6014 not = !not;
6015 else
6016 {
6017 /* C1 is the character before D, S1 is the syntax of C1, C2
6018 is the character at D, and S2 is the syntax of C2. */
6019 re_wchar_t c1, c2;
6020 int s1, s2;
6021 int dummy;
b18215fc 6022#ifdef emacs
19ed5445
PE
6023 int offset = PTR_TO_OFFSET (d - 1);
6024 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6025 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6026#endif
19ed5445
PE
6027 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6028 s1 = SYNTAX (c1);
b18215fc 6029#ifdef emacs
19ed5445 6030 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 6031#endif
19ed5445
PE
6032 PREFETCH_NOLIMIT ();
6033 GET_CHAR_AFTER (c2, d, dummy);
6034 s2 = SYNTAX (c2);
6035
6036 if (/* Case 2: Only one of S1 and S2 is Sword. */
6037 ((s1 == Sword) != (s2 == Sword))
6038 /* Case 3: Both of S1 and S2 are Sword, and macro
6039 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
6040 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
6041 not = !not;
6042 }
6043 if (not)
6044 break;
6045 else
6046 goto fail;
6047 }
fa9a63c5
RM
6048
6049 case wordbeg:
25fe55af 6050 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6051
b18215fc
RS
6052 /* We FAIL in one of the following cases: */
6053
7814e705 6054 /* Case 1: D is at the end of string. */
b18215fc 6055 if (AT_STRINGS_END (d))
99633e97 6056 goto fail;
b18215fc
RS
6057 else
6058 {
6059 /* C1 is the character before D, S1 is the syntax of C1, C2
6060 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6061 re_wchar_t c1, c2;
6062 int s1, s2;
bf216479 6063 int dummy;
fa9a63c5 6064#ifdef emacs
2d1675e4
SM
6065 int offset = PTR_TO_OFFSET (d);
6066 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6067 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6068#endif
99633e97 6069 PREFETCH ();
6fdd04b0 6070 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6071 s2 = SYNTAX (c2);
177c0ea7 6072
b18215fc
RS
6073 /* Case 2: S2 is not Sword. */
6074 if (s2 != Sword)
6075 goto fail;
6076
6077 /* Case 3: D is not at the beginning of string ... */
6078 if (!AT_STRINGS_BEG (d))
6079 {
6080 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6081#ifdef emacs
5d967c7a 6082 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6083#endif
b18215fc
RS
6084 s1 = SYNTAX (c1);
6085
6086 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6087 returns 0. */
b18215fc
RS
6088 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6089 goto fail;
6090 }
6091 }
e318085a
RS
6092 break;
6093
b18215fc 6094 case wordend:
25fe55af 6095 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6096
6097 /* We FAIL in one of the following cases: */
6098
6099 /* Case 1: D is at the beginning of string. */
6100 if (AT_STRINGS_BEG (d))
e318085a 6101 goto fail;
b18215fc
RS
6102 else
6103 {
6104 /* C1 is the character before D, S1 is the syntax of C1, C2
6105 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6106 re_wchar_t c1, c2;
6107 int s1, s2;
bf216479 6108 int dummy;
5d967c7a 6109#ifdef emacs
2d1675e4
SM
6110 int offset = PTR_TO_OFFSET (d) - 1;
6111 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6112 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6113#endif
99633e97 6114 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6115 s1 = SYNTAX (c1);
6116
6117 /* Case 2: S1 is not Sword. */
6118 if (s1 != Sword)
6119 goto fail;
6120
6121 /* Case 3: D is not at the end of string ... */
6122 if (!AT_STRINGS_END (d))
6123 {
f1ad044f 6124 PREFETCH_NOLIMIT ();
6fdd04b0 6125 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6126#ifdef emacs
6127 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6128#endif
b18215fc
RS
6129 s2 = SYNTAX (c2);
6130
6131 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6132 returns 0. */
b18215fc 6133 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6134 goto fail;
b18215fc
RS
6135 }
6136 }
e318085a
RS
6137 break;
6138
669fa600
SM
6139 case symbeg:
6140 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6141
6142 /* We FAIL in one of the following cases: */
6143
7814e705 6144 /* Case 1: D is at the end of string. */
669fa600
SM
6145 if (AT_STRINGS_END (d))
6146 goto fail;
6147 else
6148 {
6149 /* C1 is the character before D, S1 is the syntax of C1, C2
6150 is the character at D, and S2 is the syntax of C2. */
6151 re_wchar_t c1, c2;
6152 int s1, s2;
6153#ifdef emacs
6154 int offset = PTR_TO_OFFSET (d);
6155 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6156 UPDATE_SYNTAX_TABLE (charpos);
6157#endif
6158 PREFETCH ();
62a6e103 6159 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6160 s2 = SYNTAX (c2);
7814e705 6161
669fa600
SM
6162 /* Case 2: S2 is neither Sword nor Ssymbol. */
6163 if (s2 != Sword && s2 != Ssymbol)
6164 goto fail;
6165
6166 /* Case 3: D is not at the beginning of string ... */
6167 if (!AT_STRINGS_BEG (d))
6168 {
6169 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6170#ifdef emacs
6171 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6172#endif
6173 s1 = SYNTAX (c1);
6174
6175 /* ... and S1 is Sword or Ssymbol. */
6176 if (s1 == Sword || s1 == Ssymbol)
6177 goto fail;
6178 }
6179 }
6180 break;
6181
6182 case symend:
6183 DEBUG_PRINT1 ("EXECUTING symend.\n");
6184
6185 /* We FAIL in one of the following cases: */
6186
6187 /* Case 1: D is at the beginning of string. */
6188 if (AT_STRINGS_BEG (d))
6189 goto fail;
6190 else
6191 {
6192 /* C1 is the character before D, S1 is the syntax of C1, C2
6193 is the character at D, and S2 is the syntax of C2. */
6194 re_wchar_t c1, c2;
6195 int s1, s2;
6196#ifdef emacs
6197 int offset = PTR_TO_OFFSET (d) - 1;
6198 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6199 UPDATE_SYNTAX_TABLE (charpos);
6200#endif
6201 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6202 s1 = SYNTAX (c1);
6203
6204 /* Case 2: S1 is neither Ssymbol nor Sword. */
6205 if (s1 != Sword && s1 != Ssymbol)
6206 goto fail;
6207
6208 /* Case 3: D is not at the end of string ... */
6209 if (!AT_STRINGS_END (d))
6210 {
6211 PREFETCH_NOLIMIT ();
62a6e103 6212 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6213#ifdef emacs
134579f2 6214 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6215#endif
6216 s2 = SYNTAX (c2);
6217
6218 /* ... and S2 is Sword or Ssymbol. */
6219 if (s2 == Sword || s2 == Ssymbol)
6220 goto fail;
b18215fc
RS
6221 }
6222 }
e318085a
RS
6223 break;
6224
fa9a63c5 6225 case syntaxspec:
1fb352e0 6226 case notsyntaxspec:
b18215fc 6227 {
19ed5445
PE
6228 boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6229 mcnt = *p++;
6230 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
6231 PREFETCH ();
6232#ifdef emacs
6233 {
6234 int offset = PTR_TO_OFFSET (d);
6235 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6236 UPDATE_SYNTAX_TABLE (pos1);
6237 }
25fe55af 6238#endif
19ed5445
PE
6239 {
6240 int len;
6241 re_wchar_t c;
b18215fc 6242
19ed5445
PE
6243 GET_CHAR_AFTER (c, d, len);
6244 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6245 goto fail;
6246 d += len;
6247 }
6248 break;
b18215fc 6249 }
fa9a63c5 6250
b18215fc 6251#ifdef emacs
1fb352e0
SM
6252 case before_dot:
6253 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6254 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6255 goto fail;
b18215fc
RS
6256 break;
6257
1fb352e0
SM
6258 case at_dot:
6259 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6260 if (PTR_BYTE_POS (d) != PT_BYTE)
6261 goto fail;
6262 break;
b18215fc 6263
1fb352e0
SM
6264 case after_dot:
6265 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6266 if (PTR_BYTE_POS (d) <= PT_BYTE)
6267 goto fail;
e318085a 6268 break;
fa9a63c5 6269
1fb352e0 6270 case categoryspec:
b18215fc 6271 case notcategoryspec:
1fb352e0 6272 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6273 mcnt = *p++;
1fb352e0 6274 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6275 PREFETCH ();
6276 {
01618498
SM
6277 int len;
6278 re_wchar_t c;
6279
6fdd04b0 6280 GET_CHAR_AFTER (c, d, len);
1fb352e0 6281 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6282 goto fail;
6283 d += len;
6284 }
fa9a63c5 6285 break;
5e69f11e 6286
1fb352e0 6287#endif /* emacs */
5e69f11e 6288
0b32bf0e
SM
6289 default:
6290 abort ();
fa9a63c5 6291 }
b18215fc 6292 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6293
6294
6295 /* We goto here if a matching operation fails. */
6296 fail:
5b370c2b 6297 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6298 if (!FAIL_STACK_EMPTY ())
505bde11 6299 {
01618498 6300 re_char *str, *pat;
505bde11 6301 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6302 DEBUG_PRINT1 ("\nFAIL:\n");
6303 POP_FAILURE_POINT (str, pat);
505bde11
SM
6304 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6305 {
6306 case on_failure_keep_string_jump:
6307 assert (str == NULL);
6308 goto continue_failure_jump;
6309
0683b6fa
SM
6310 case on_failure_jump_nastyloop:
6311 assert ((re_opcode_t)pat[-2] == no_op);
6312 PUSH_FAILURE_POINT (pat - 2, str);
6313 /* Fallthrough */
6314
505bde11
SM
6315 case on_failure_jump_loop:
6316 case on_failure_jump:
6317 case succeed_n:
6318 d = str;
6319 continue_failure_jump:
6320 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6321 p = pat + mcnt;
6322 break;
b18215fc 6323
0683b6fa
SM
6324 case no_op:
6325 /* A special frame used for nastyloops. */
6326 goto fail;
6327
505bde11
SM
6328 default:
6329 abort();
6330 }
fa9a63c5 6331
505bde11 6332 assert (p >= bufp->buffer && p <= pend);
b18215fc 6333
0b32bf0e 6334 if (d >= string1 && d <= end1)
fa9a63c5 6335 dend = end_match_1;
0b32bf0e 6336 }
fa9a63c5 6337 else
0b32bf0e 6338 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6339 } /* for (;;) */
6340
6341 if (best_regs_set)
6342 goto restore_best_regs;
6343
6344 FREE_VARIABLES ();
6345
b18215fc 6346 return -1; /* Failure to match. */
fa9a63c5
RM
6347} /* re_match_2 */
6348\f
6349/* Subroutine definitions for re_match_2. */
6350
fa9a63c5
RM
6351/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6352 bytes; nonzero otherwise. */
5e69f11e 6353
fa9a63c5 6354static int
438105ed
JB
6355bcmp_translate (const re_char *s1, const re_char *s2, register int len,
6356 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6357{
2d1675e4
SM
6358 register re_char *p1 = s1, *p2 = s2;
6359 re_char *p1_end = s1 + len;
6360 re_char *p2_end = s2 + len;
e934739e 6361
4bb91c68
SM
6362 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6363 different lengths, but relying on a single `len' would break this. -sm */
6364 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6365 {
e934739e 6366 int p1_charlen, p2_charlen;
01618498 6367 re_wchar_t p1_ch, p2_ch;
e934739e 6368
6fdd04b0
KH
6369 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6370 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6371
6372 if (RE_TRANSLATE (translate, p1_ch)
6373 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6374 return 1;
e934739e
RS
6375
6376 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6377 }
e934739e
RS
6378
6379 if (p1 != p1_end || p2 != p2_end)
6380 return 1;
6381
fa9a63c5
RM
6382 return 0;
6383}
6384\f
6385/* Entry points for GNU code. */
6386
6387/* re_compile_pattern is the GNU regular expression compiler: it
6388 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6389 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6390
fa9a63c5
RM
6391 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6392 are set in BUFP on entry.
5e69f11e 6393
b18215fc 6394 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6395
6396const char *
971de7fb 6397re_compile_pattern (const char *pattern, size_t length, struct re_pattern_buffer *bufp)
fa9a63c5
RM
6398{
6399 reg_errcode_t ret;
5e69f11e 6400
fa9a63c5
RM
6401 /* GNU code is written to assume at least RE_NREGS registers will be set
6402 (and at least one extra will be -1). */
6403 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6404
fa9a63c5
RM
6405 /* And GNU code determines whether or not to get register information
6406 by passing null for the REGS argument to re_match, etc., not by
6407 setting no_sub. */
6408 bufp->no_sub = 0;
5e69f11e 6409
4bb91c68 6410 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6411
6412 if (!ret)
6413 return NULL;
6414 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6415}
c0f9ea08 6416WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6417\f
b18215fc
RS
6418/* Entry points compatible with 4.2 BSD regex library. We don't define
6419 them unless specifically requested. */
fa9a63c5 6420
0b32bf0e 6421#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6422
6423/* BSD has one and only one pattern buffer. */
6424static struct re_pattern_buffer re_comp_buf;
6425
6426char *
0b32bf0e 6427# ifdef _LIBC
48afdd44
RM
6428/* Make these definitions weak in libc, so POSIX programs can redefine
6429 these names if they don't use our functions, and still use
6430 regcomp/regexec below without link errors. */
6431weak_function
0b32bf0e 6432# endif
fa9a63c5
RM
6433re_comp (s)
6434 const char *s;
6435{
6436 reg_errcode_t ret;
5e69f11e 6437
fa9a63c5
RM
6438 if (!s)
6439 {
6440 if (!re_comp_buf.buffer)
0b32bf0e 6441 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6442 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6443 return 0;
6444 }
6445
6446 if (!re_comp_buf.buffer)
6447 {
6448 re_comp_buf.buffer = (unsigned char *) malloc (200);
6449 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6450 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6451 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6452 re_comp_buf.allocated = 200;
6453
6454 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6455 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6456 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6457 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6458 }
6459
6460 /* Since `re_exec' always passes NULL for the `regs' argument, we
6461 don't need to initialize the pattern buffer fields which affect it. */
6462
fa9a63c5 6463 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6464
fa9a63c5
RM
6465 if (!ret)
6466 return NULL;
6467
6468 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6469 return (char *) gettext (re_error_msgid[(int) ret]);
6470}
6471
6472
6473int
0b32bf0e 6474# ifdef _LIBC
48afdd44 6475weak_function
0b32bf0e 6476# endif
fa9a63c5
RM
6477re_exec (s)
6478 const char *s;
6479{
6480 const int len = strlen (s);
6481 return
6482 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6483}
6484#endif /* _REGEX_RE_COMP */
6485\f
6486/* POSIX.2 functions. Don't define these for Emacs. */
6487
6488#ifndef emacs
6489
6490/* regcomp takes a regular expression as a string and compiles it.
6491
b18215fc 6492 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6493 since POSIX says we shouldn't. Thus, we set
6494
6495 `buffer' to the compiled pattern;
6496 `used' to the length of the compiled pattern;
6497 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6498 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6499 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6500 `fastmap' to an allocated space for the fastmap;
6501 `fastmap_accurate' to zero;
fa9a63c5
RM
6502 `re_nsub' to the number of subexpressions in PATTERN.
6503
6504 PATTERN is the address of the pattern string.
6505
6506 CFLAGS is a series of bits which affect compilation.
6507
6508 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6509 use POSIX basic syntax.
6510
6511 If REG_NEWLINE is set, then . and [^...] don't match newline.
6512 Also, regexec will try a match beginning after every newline.
6513
6514 If REG_ICASE is set, then we considers upper- and lowercase
6515 versions of letters to be equivalent when matching.
6516
6517 If REG_NOSUB is set, then when PREG is passed to regexec, that
6518 routine will report only success or failure, and nothing about the
6519 registers.
6520
b18215fc 6521 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6522 the return codes and their meanings.) */
6523
6524int
d2762c86
DN
6525regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6526 int cflags)
fa9a63c5
RM
6527{
6528 reg_errcode_t ret;
4bb91c68 6529 reg_syntax_t syntax
fa9a63c5
RM
6530 = (cflags & REG_EXTENDED) ?
6531 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6532
6533 /* regex_compile will allocate the space for the compiled pattern. */
6534 preg->buffer = 0;
6535 preg->allocated = 0;
6536 preg->used = 0;
5e69f11e 6537
c0f9ea08
SM
6538 /* Try to allocate space for the fastmap. */
6539 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6540
fa9a63c5
RM
6541 if (cflags & REG_ICASE)
6542 {
6543 unsigned i;
5e69f11e 6544
6676cb1c
RS
6545 preg->translate
6546 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6547 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6548 if (preg->translate == NULL)
0b32bf0e 6549 return (int) REG_ESPACE;
fa9a63c5
RM
6550
6551 /* Map uppercase characters to corresponding lowercase ones. */
6552 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6553 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6554 }
6555 else
6556 preg->translate = NULL;
6557
6558 /* If REG_NEWLINE is set, newlines are treated differently. */
6559 if (cflags & REG_NEWLINE)
6560 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6561 syntax &= ~RE_DOT_NEWLINE;
6562 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6563 }
6564 else
c0f9ea08 6565 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6566
6567 preg->no_sub = !!(cflags & REG_NOSUB);
6568
5e69f11e 6569 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6570 can use strlen here in compiling the pattern. */
4bb91c68 6571 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6572
fa9a63c5
RM
6573 /* POSIX doesn't distinguish between an unmatched open-group and an
6574 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6575 if (ret == REG_ERPAREN)
6576 ret = REG_EPAREN;
6577
6578 if (ret == REG_NOERROR && preg->fastmap)
6579 { /* Compute the fastmap now, since regexec cannot modify the pattern
6580 buffer. */
6581 re_compile_fastmap (preg);
6582 if (preg->can_be_null)
6583 { /* The fastmap can't be used anyway. */
6584 free (preg->fastmap);
6585 preg->fastmap = NULL;
6586 }
6587 }
fa9a63c5
RM
6588 return (int) ret;
6589}
c0f9ea08 6590WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6591
6592
6593/* regexec searches for a given pattern, specified by PREG, in the
6594 string STRING.
5e69f11e 6595
fa9a63c5 6596 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6597 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6598 least NMATCH elements, and we set them to the offsets of the
6599 corresponding matched substrings.
5e69f11e 6600
fa9a63c5
RM
6601 EFLAGS specifies `execution flags' which affect matching: if
6602 REG_NOTBOL is set, then ^ does not match at the beginning of the
6603 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6604
fa9a63c5
RM
6605 We return 0 if we find a match and REG_NOMATCH if not. */
6606
6607int
d2762c86
DN
6608regexec (const regex_t *__restrict preg, const char *__restrict string,
6609 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
fa9a63c5
RM
6610{
6611 int ret;
6612 struct re_registers regs;
6613 regex_t private_preg;
6614 int len = strlen (string);
c0f9ea08 6615 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6616
6617 private_preg = *preg;
5e69f11e 6618
fa9a63c5
RM
6619 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6620 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6621
fa9a63c5
RM
6622 /* The user has told us exactly how many registers to return
6623 information about, via `nmatch'. We have to pass that on to the
b18215fc 6624 matching routines. */
fa9a63c5 6625 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6626
fa9a63c5
RM
6627 if (want_reg_info)
6628 {
6629 regs.num_regs = nmatch;
4bb91c68
SM
6630 regs.start = TALLOC (nmatch * 2, regoff_t);
6631 if (regs.start == NULL)
0b32bf0e 6632 return (int) REG_NOMATCH;
4bb91c68 6633 regs.end = regs.start + nmatch;
fa9a63c5
RM
6634 }
6635
c0f9ea08
SM
6636 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6637 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6638 was a little bit longer but still only matching the real part.
6639 This works because the `endline' will check for a '\n' and will find a
6640 '\0', correctly deciding that this is not the end of a line.
6641 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6642 a convenient '\0' there. For all we know, the string could be preceded
6643 by '\n' which would throw things off. */
6644
fa9a63c5
RM
6645 /* Perform the searching operation. */
6646 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6647 /* start: */ 0, /* range: */ len,
6648 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6649
fa9a63c5
RM
6650 /* Copy the register information to the POSIX structure. */
6651 if (want_reg_info)
6652 {
6653 if (ret >= 0)
0b32bf0e
SM
6654 {
6655 unsigned r;
fa9a63c5 6656
0b32bf0e
SM
6657 for (r = 0; r < nmatch; r++)
6658 {
6659 pmatch[r].rm_so = regs.start[r];
6660 pmatch[r].rm_eo = regs.end[r];
6661 }
6662 }
fa9a63c5 6663
b18215fc 6664 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6665 free (regs.start);
fa9a63c5
RM
6666 }
6667
6668 /* We want zero return to mean success, unlike `re_search'. */
6669 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6670}
c0f9ea08 6671WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6672
6673
ec869672
JR
6674/* Returns a message corresponding to an error code, ERR_CODE, returned
6675 from either regcomp or regexec. We don't use PREG here.
6676
6677 ERR_CODE was previously called ERRCODE, but that name causes an
6678 error with msvc8 compiler. */
fa9a63c5
RM
6679
6680size_t
d2762c86 6681regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6682{
6683 const char *msg;
6684 size_t msg_size;
6685
ec869672
JR
6686 if (err_code < 0
6687 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6688 /* Only error codes returned by the rest of the code should be passed
b18215fc 6689 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6690 code generates an invalid error code, then the program has a bug.
6691 Dump core so we can fix it. */
6692 abort ();
6693
ec869672 6694 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6695
6696 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6697
fa9a63c5
RM
6698 if (errbuf_size != 0)
6699 {
6700 if (msg_size > errbuf_size)
0b32bf0e
SM
6701 {
6702 strncpy (errbuf, msg, errbuf_size - 1);
6703 errbuf[errbuf_size - 1] = 0;
6704 }
fa9a63c5 6705 else
0b32bf0e 6706 strcpy (errbuf, msg);
fa9a63c5
RM
6707 }
6708
6709 return msg_size;
6710}
c0f9ea08 6711WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6712
6713
6714/* Free dynamically allocated space used by PREG. */
6715
6716void
d2762c86 6717regfree (regex_t *preg)
fa9a63c5 6718{
c2cd06e6 6719 free (preg->buffer);
fa9a63c5 6720 preg->buffer = NULL;
5e69f11e 6721
fa9a63c5
RM
6722 preg->allocated = 0;
6723 preg->used = 0;
6724
c2cd06e6 6725 free (preg->fastmap);
fa9a63c5
RM
6726 preg->fastmap = NULL;
6727 preg->fastmap_accurate = 0;
6728
c2cd06e6 6729 free (preg->translate);
fa9a63c5
RM
6730 preg->translate = NULL;
6731}
c0f9ea08 6732WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6733
6734#endif /* not emacs */