* search.c (boyer_moore): Rename locals to avoid shadowing.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
95df8112 5 Copyright (C) 1993-2011 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
e468b87f 9 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
4fc5845f 19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 20 USA. */
fa9a63c5 21
6df42991 22/* TODO:
505bde11 23 - structure the opcode space into opcode+flag.
dc1e502d 24 - merge with glibc's regex.[ch].
01618498 25 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
26 need to modify the compiled regexp so that re_match can be reentrant.
27 - get rid of on_failure_jump_smart by doing the optimization in re_comp
28 rather than at run-time, so that re_match can be reentrant.
01618498 29*/
505bde11 30
fa9a63c5 31/* AIX requires this to be the first thing in the file. */
0b32bf0e 32#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
33 #pragma alloca
34#endif
35
fa9a63c5 36#ifdef HAVE_CONFIG_H
0b32bf0e 37# include <config.h>
fa9a63c5
RM
38#endif
39
4bb91c68
SM
40#if defined STDC_HEADERS && !defined emacs
41# include <stddef.h>
42#else
43/* We need this for `regex.h', and perhaps for the Emacs include files. */
44# include <sys/types.h>
45#endif
fa9a63c5 46
14473664
SM
47/* Whether to use ISO C Amendment 1 wide char functions.
48 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
49#if defined _LIBC
50#define WIDE_CHAR_SUPPORT 1
51#else
14473664 52#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
53 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
54#endif
14473664
SM
55
56/* For platforms which support the ISO C Amendment 1 functionality we
57 support user-defined character classes. */
a0ad02f7 58#if WIDE_CHAR_SUPPORT
14473664
SM
59/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
60# include <wchar.h>
61# include <wctype.h>
62#endif
63
c0f9ea08
SM
64#ifdef _LIBC
65/* We have to keep the namespace clean. */
66# define regfree(preg) __regfree (preg)
67# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
68# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
69# define regerror(err_code, preg, errbuf, errbuf_size) \
70 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
71# define re_set_registers(bu, re, nu, st, en) \
72 __re_set_registers (bu, re, nu, st, en)
73# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
74 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
75# define re_match(bufp, string, size, pos, regs) \
76 __re_match (bufp, string, size, pos, regs)
77# define re_search(bufp, string, size, startpos, range, regs) \
78 __re_search (bufp, string, size, startpos, range, regs)
79# define re_compile_pattern(pattern, length, bufp) \
80 __re_compile_pattern (pattern, length, bufp)
81# define re_set_syntax(syntax) __re_set_syntax (syntax)
82# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
83 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
84# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
85
14473664
SM
86/* Make sure we call libc's function even if the user overrides them. */
87# define btowc __btowc
88# define iswctype __iswctype
89# define wctype __wctype
90
c0f9ea08
SM
91# define WEAK_ALIAS(a,b) weak_alias (a, b)
92
93/* We are also using some library internals. */
94# include <locale/localeinfo.h>
95# include <locale/elem-hash.h>
96# include <langinfo.h>
97#else
98# define WEAK_ALIAS(a,b)
99#endif
100
4bb91c68 101/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 102#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
103# include <libintl.h>
104#else
105# define gettext(msgid) (msgid)
106#endif
107
5e69f11e
RM
108#ifndef gettext_noop
109/* This define is so xgettext can find the internationalizable
110 strings. */
0b32bf0e 111# define gettext_noop(String) String
5e69f11e
RM
112#endif
113
fa9a63c5
RM
114/* The `emacs' switch turns on certain matching commands
115 that make sense only in Emacs. */
116#ifdef emacs
117
d7306fe6 118# include <setjmp.h>
0b32bf0e
SM
119# include "lisp.h"
120# include "buffer.h"
b18215fc
RS
121
122/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 123# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 124
0b32bf0e 125# include "syntax.h"
9117d724 126# include "character.h"
0b32bf0e 127# include "category.h"
fa9a63c5 128
7689ef0b
EZ
129# ifdef malloc
130# undef malloc
131# endif
0b32bf0e 132# define malloc xmalloc
7689ef0b
EZ
133# ifdef realloc
134# undef realloc
135# endif
0b32bf0e 136# define realloc xrealloc
7689ef0b
EZ
137# ifdef free
138# undef free
139# endif
0b32bf0e 140# define free xfree
9abbd165 141
7814e705 142/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
143# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
144# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
145
146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 147# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
148# define RE_STRING_CHAR(p, multibyte) \
149 (multibyte ? (STRING_CHAR (p)) : (*(p)))
150# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
151 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 152
4c0354d7 153# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 154
2afc21f5 155# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 156
6fdd04b0
KH
157/* Set C a (possibly converted to multibyte) character before P. P
158 points into a string which is the virtual concatenation of STR1
159 (which ends at END1) or STR2 (which ends at END2). */
bf216479
KH
160# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
161 do { \
02cb78b5 162 if (target_multibyte) \
bf216479
KH
163 { \
164 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
165 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
166 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 167 c = STRING_CHAR (dtemp); \
bf216479
KH
168 } \
169 else \
170 { \
171 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 172 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 173 } \
2d1675e4
SM
174 } while (0)
175
6fdd04b0
KH
176/* Set C a (possibly converted to multibyte) character at P, and set
177 LEN to the byte length of that character. */
178# define GET_CHAR_AFTER(c, p, len) \
179 do { \
02cb78b5 180 if (target_multibyte) \
62a6e103 181 (c) = STRING_CHAR_AND_LENGTH (p, len); \
6fdd04b0
KH
182 else \
183 { \
cf9c99bc 184 (c) = *p; \
6fdd04b0 185 len = 1; \
cf9c99bc 186 (c) = RE_CHAR_TO_MULTIBYTE (c); \
6fdd04b0 187 } \
8f924df7 188 } while (0)
4e8a9132 189
fa9a63c5
RM
190#else /* not emacs */
191
192/* If we are not linking with Emacs proper,
193 we can't use the relocating allocator
194 even if config.h says that we can. */
0b32bf0e 195# undef REL_ALLOC
fa9a63c5 196
4004364e 197# include <unistd.h>
fa9a63c5 198
a77f947b
CY
199/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
200
/* Allocate SIZE bytes with malloc and return the result.

   If malloc returns NULL for a nonzero SIZE, write "virtual memory
   exhausted" to file descriptor 2 and exit with status 1, so callers
   never see a failed nonzero allocation.  A NULL return is still
   possible when SIZE is 0.  */
void *
xmalloc (size_t size)
{
  static const char exhausted[] = "virtual memory exhausted\n";
  void *val = malloc (size);

  if (val == NULL && size != 0)
    {
      /* sizeof counts the terminating NUL, which must not be written.  */
      write (STDERR_FILENO, exhausted, sizeof exhausted - 1);
      exit (1);
    }
  return val;
}
213
/* Resize BLOCK to SIZE bytes and return the (possibly moved) block.

   A null BLOCK is handled by calling malloc directly, since some
   reallocs do not accept a null pointer.  If the allocator returns
   NULL for a nonzero SIZE, write "virtual memory exhausted" to file
   descriptor 2 and exit with status 1.  */
void *
xrealloc (void *block, size_t size)
{
  static const char exhausted[] = "virtual memory exhausted\n";
  void *val = block ? realloc (block, size) : malloc (size);

  if (val == NULL && size != 0)
    {
      /* sizeof counts the terminating NUL, which must not be written.  */
      write (STDERR_FILENO, exhausted, sizeof exhausted - 1);
      exit (1);
    }
  return val;
}
231
a073faa6
CY
232# ifdef malloc
233# undef malloc
234# endif
235# define malloc xmalloc
236# ifdef realloc
237# undef realloc
238# endif
239# define realloc xrealloc
240
72af86bd
AS
241/* This is the normal way of making sure we have memcpy, memcmp and memset. */
242# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
243# include <string.h>
244# else
245# include <strings.h>
246# ifndef memcmp
247# define memcmp(s1, s2, n) bcmp (s1, s2, n)
0b32bf0e 248# endif
72af86bd
AS
249# ifndef memcpy
250# define memcpy(d, s, n) (bcopy (s, d, n), (d))
0b32bf0e
SM
251# endif
252# endif
fa9a63c5
RM
253
254/* Define the syntax stuff for \<, \>, etc. */
255
990b2375 256/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 257enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 258
0b32bf0e 259# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 260
e934739e 261/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
262# define CHAR_CHARSET(c) 0
263# define CHARSET_LEADING_CODE_BASE(c) 0
264# define MAX_MULTIBYTE_LENGTH 1
265# define RE_MULTIBYTE_P(x) 0
bf216479 266# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
267# define WORD_BOUNDARY_P(c1, c2) (0)
268# define CHAR_HEAD_P(p) (1)
269# define SINGLE_BYTE_CHAR_P(c) (1)
270# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 271# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 272# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
273# define STRING_CHAR(p) (*(p))
274# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 275# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
276# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
277# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
278# define RE_CHAR_TO_MULTIBYTE(c) (c)
279# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 280# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 281 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
282# define GET_CHAR_AFTER(c, p, len) \
283 (c = *p, len = 1)
0b32bf0e 284# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
285# define BYTE8_TO_CHAR(c) (c)
286# define CHAR_BYTE8_P(c) (0)
bf216479 287# define CHAR_LEADING_CODE(c) (c)
8f924df7 288
fa9a63c5 289#endif /* not emacs */
4e8a9132
SM
290
291#ifndef RE_TRANSLATE
0b32bf0e
SM
292# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
293# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 294#endif
fa9a63c5
RM
295\f
296/* Get the interface, including the syntax bits. */
297#include "regex.h"
298
f71b19b6
DL
299/* isalpha etc. are used for the character classes. */
300#include <ctype.h>
fa9a63c5 301
f71b19b6 302#ifdef emacs
fa9a63c5 303
f71b19b6 304/* 1 if C is an ASCII character. */
0b32bf0e 305# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 306
f71b19b6 307/* 1 if C is a unibyte character. */
0b32bf0e 308# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 309
f71b19b6 310/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 311
f71b19b6 312/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
313# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
314# define ISCNTRL(c) ((c) < ' ')
315# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
316 || ((c) >= 'a' && (c) <= 'f') \
317 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
318
319/* This is only used for single-byte characters. */
0b32bf0e 320# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
321
322/* The rest must handle multibyte characters. */
323
0b32bf0e 324# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 325 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
326 : 1)
327
14473664 328# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 329 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
330 : 1)
331
0b32bf0e 332# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
333 ? (((c) >= 'a' && (c) <= 'z') \
334 || ((c) >= 'A' && (c) <= 'Z') \
335 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
336 : SYNTAX (c) == Sword)
337
0b32bf0e 338# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
339 ? (((c) >= 'a' && (c) <= 'z') \
340 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
341 : SYNTAX (c) == Sword)
342
0b32bf0e 343# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 344
0b32bf0e 345# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
346 ? ((c) > ' ' && (c) < 0177 \
347 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
348 || ((c) >= 'A' && (c) <= 'Z') \
349 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
350 : SYNTAX (c) != Sword)
351
0b32bf0e 352# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 353
0b32bf0e 354# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 355
0b32bf0e 356# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
357
358#else /* not emacs */
359
f71b19b6
DL
360/* Jim Meyering writes:
361
362 "... Some ctype macros are valid only for character codes that
363 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
364 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 365 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
366 STDC_HEADERS is defined, then autoconf has verified that the ctype
367 macros don't need to be guarded with references to isascii. ...
368 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
369 eliminate the && through constant folding."
370 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 371
4bb91c68 372# undef ISASCII
0b32bf0e
SM
373# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
374# define ISASCII(c) 1
375# else
376# define ISASCII(c) isascii(c)
377# endif
f71b19b6
DL
378
379/* 1 if C is an ASCII character. */
0b32bf0e 380# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
381
382/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
383# define ISUNIBYTE(c) 1
384
385# ifdef isblank
386# define ISBLANK(c) (ISASCII (c) && isblank (c))
387# else
388# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
389# endif
390# ifdef isgraph
391# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
392# else
393# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
394# endif
395
4bb91c68 396# undef ISPRINT
0b32bf0e
SM
397# define ISPRINT(c) (ISASCII (c) && isprint (c))
398# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
399# define ISALNUM(c) (ISASCII (c) && isalnum (c))
400# define ISALPHA(c) (ISASCII (c) && isalpha (c))
401# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
402# define ISLOWER(c) (ISASCII (c) && islower (c))
403# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
404# define ISSPACE(c) (ISASCII (c) && isspace (c))
405# define ISUPPER(c) (ISASCII (c) && isupper (c))
406# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
407
408# define ISWORD(c) ISALPHA(c)
409
4bb91c68
SM
410# ifdef _tolower
411# define TOLOWER(c) _tolower(c)
412# else
413# define TOLOWER(c) tolower(c)
414# endif
415
416/* How many characters in the character set. */
417# define CHAR_SET_SIZE 256
418
0b32bf0e 419# ifdef SYNTAX_TABLE
f71b19b6 420
0b32bf0e 421extern char *re_syntax_table;
f71b19b6 422
0b32bf0e
SM
423# else /* not SYNTAX_TABLE */
424
0b32bf0e
SM
425static char re_syntax_table[CHAR_SET_SIZE];
426
427static void
d2762c86 428init_syntax_once (void)
0b32bf0e
SM
429{
430 register int c;
431 static int done = 0;
432
433 if (done)
434 return;
435
72af86bd 436 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 437
4bb91c68
SM
438 for (c = 0; c < CHAR_SET_SIZE; ++c)
439 if (ISALNUM (c))
440 re_syntax_table[c] = Sword;
fa9a63c5 441
669fa600 442 re_syntax_table['_'] = Ssymbol;
fa9a63c5 443
0b32bf0e
SM
444 done = 1;
445}
446
447# endif /* not SYNTAX_TABLE */
96cc36cc 448
4bb91c68
SM
449# define SYNTAX(c) re_syntax_table[(c)]
450
96cc36cc
RS
451#endif /* not emacs */
452\f
fa9a63c5 453#ifndef NULL
0b32bf0e 454# define NULL (void *)0
fa9a63c5
RM
455#endif
456
457/* We remove any previous definition of `SIGN_EXTEND_CHAR',
458 since ours (we hope) works properly with all combinations of
459 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 460 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
461#undef SIGN_EXTEND_CHAR
462#if __STDC__
0b32bf0e 463# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
464#else /* not __STDC__ */
465/* As in Harbison and Steele. */
0b32bf0e 466# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
467#endif
468\f
469/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
470 use `alloca' instead of `malloc'. This is because using malloc in
471 re_search* or re_match* could cause memory leaks when C-g is used in
472 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
473 the other hand, malloc is more portable, and easier to debug.
474
fa9a63c5
RM
475 Because we sometimes use alloca, some routines have to be macros,
476 not functions -- `alloca'-allocated space disappears at the end of the
477 function it is called in. */
478
479#ifdef REGEX_MALLOC
480
0b32bf0e
SM
481# define REGEX_ALLOCATE malloc
482# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
483# define REGEX_FREE free
fa9a63c5
RM
484
485#else /* not REGEX_MALLOC */
486
487/* Emacs already defines alloca, sometimes. */
0b32bf0e 488# ifndef alloca
fa9a63c5
RM
489
490/* Make alloca work the best possible way. */
0b32bf0e
SM
491# ifdef __GNUC__
492# define alloca __builtin_alloca
493# else /* not __GNUC__ */
7f585e7a 494# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
495# include <alloca.h>
496# endif /* HAVE_ALLOCA_H */
497# endif /* not __GNUC__ */
fa9a63c5 498
0b32bf0e 499# endif /* not alloca */
fa9a63c5 500
0b32bf0e 501# define REGEX_ALLOCATE alloca
fa9a63c5
RM
502
503/* Assumes a `char *destination' variable. */
0b32bf0e 504# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 505 (destination = (char *) alloca (nsize), \
4bb91c68 506 memcpy (destination, source, osize))
fa9a63c5
RM
507
508/* No need to do anything to free, after alloca. */
0b32bf0e 509# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
510
511#endif /* not REGEX_MALLOC */
512
513/* Define how to allocate the failure stack. */
514
0b32bf0e 515#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 516
0b32bf0e 517# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 518 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 519# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 520 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 521# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
522 r_alloc_free (&failure_stack_ptr)
523
4297555e 524#else /* not using relocating allocator */
fa9a63c5 525
0b32bf0e 526# ifdef REGEX_MALLOC
fa9a63c5 527
0b32bf0e
SM
528# define REGEX_ALLOCATE_STACK malloc
529# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
530# define REGEX_FREE_STACK free
fa9a63c5 531
0b32bf0e 532# else /* not REGEX_MALLOC */
fa9a63c5 533
0b32bf0e 534# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 535
0b32bf0e 536# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 537 REGEX_REALLOCATE (source, osize, nsize)
7814e705 538/* No need to explicitly free anything. */
0b32bf0e 539# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 540
0b32bf0e 541# endif /* not REGEX_MALLOC */
4297555e 542#endif /* not using relocating allocator */
fa9a63c5
RM
543
544
545/* True if `size1' is nonzero and PTR is pointing anywhere inside
546 `string1' or just past its end. This works if PTR is NULL, which is
547 a good thing. */
25fe55af 548#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
549 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
550
/* (Re)Allocate N items of type T using malloc, or fail.  */

/* Allocate room for N objects of type T; the value is a `T *'.  */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))

/* Resize the block ADDR points at to hold N objects of type T and
   store the new address back into ADDR.  ADDR must be an lvalue.  */
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))

/* Statement form: resize ADDR if it already points at a block,
   otherwise allocate a fresh one.  The body is wrapped in
   do ... while (0) so that `RETALLOC_IF (...);' is exactly one
   statement and can be used as the branch of an if that has an else.  */
#define RETALLOC_IF(addr, n, t)                         \
  do                                                    \
    {                                                   \
      if (addr)                                         \
        RETALLOC ((addr), (n), t);                      \
      else                                              \
        (addr) = TALLOC ((n), t);                       \
    }                                                   \
  while (0)

/* Like TALLOC, but allocate through REGEX_ALLOCATE.  */
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
557
4bb91c68 558#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
559
560#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
561
562#undef MAX
563#undef MIN
564#define MAX(a, b) ((a) > (b) ? (a) : (b))
565#define MIN(a, b) ((a) < (b) ? (a) : (b))
566
66f0296e
SM
567/* Type of source-pattern and string chars. */
568typedef const unsigned char re_char;
569
fa9a63c5
RM
570typedef char boolean;
571#define false 0
572#define true 1
573
4bb91c68
SM
574static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
575 re_char *string1, int size1,
576 re_char *string2, int size2,
577 int pos,
578 struct re_registers *regs,
579 int stop));
fa9a63c5
RM
580\f
581/* These are the command codes that appear in compiled regular
4bb91c68 582 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
583 command code can specify any interpretation whatsoever for its
584 arguments. Zero bytes may appear in the compiled regular expression. */
585
typedef enum
{
  /* Does nothing.  */
  no_op = 0,

  /* Succeed immediately, with no further backtracking.  */
  succeed,

  /* One byte N follows, and after it N literal bytes.  */
  exactn,

  /* Matches (more or less) any character.  */
  anychar,

  /* Matches any single character that belongs to the given set.
     The byte after the opcode is the number of bitmap bytes, and the
     bitmap itself follows.  Bits within each byte are ordered
     low-bit-first, and a character is in the set when its bit is 1.
     A character too large to have a bit in the map is automatically
     not in the set.

     When the length byte has its 0x80 bit set, the bitmap is followed
     by a range table:
         2 bytes of flags for character sets (low 8 bits, high 8 bits);
           see RANGE_TABLE_WORK_BITS below.
         2 bytes giving the number of pairs that follow (up to 32767).
         The pairs themselves, each made of 2 multibyte characters,
           each multibyte character represented as 3 bytes.  */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is not one of those specified.  */
  charset_not,

  /* Begin remembering the matched text so that it can be stored in a
     register.  One byte follows holding the register number, which
     lies between 0 and one less than the pattern buffer's re_nsub
     field.  */
  start_memory,

  /* Stop remembering the matched text and store it in a memory
     register.  One byte follows holding the register number, which
     lies between 0 and one less than `re_nsub' in the pattern
     buffer.  */
  stop_memory,

  /* Match a duplicate of something remembered.  One byte follows
     containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds at the beginning of the buffer (in Emacs) or at the
     beginning of the string being matched (otherwise).  */
  begbuf,

  /* The same, for the end of the buffer/string.  */
  endbuf,

  /* A two-byte relative address follows; jump to it.  */
  jump,

  /* A two-byte relative address follows, naming the place to resume
     at in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, except that executing it pushes a
     placeholder rather than the current string position.  */
  on_failure_keep_string_jump,

  /* Exactly like `on_failure_jump', but additionally checks that we
     do not get stuck in an infinite loop (matching an empty string
     indefinitely).  */
  on_failure_jump_loop,

  /* Exactly like `on_failure_jump_loop', but checks for a different
     kind of loop (the kind that shows up with non-greedy operators).
     This operation has to be immediately preceded by a `no_op'.  */
  on_failure_jump_nastyloop,

  /* A smart `on_failure_jump' used for greedy * and + operators.
     It analyses the loop in front of which it is placed.  If that
     loop needs no backtracking, it turns itself into
     `on_failure_keep_string_jump' and short-circuits the loop;
     otherwise it falls back to turning itself into `on_failure_jump'.
     It assumes that it is pointing to just past a `jump'.  */
  on_failure_jump_smart,

  /* A two-byte relative address and a two-byte number N follow.
     After matching N times, jump to the address upon failure.
     Does not work if N starts at 0: use on_failure_jump_loop
     instead.  */
  succeed_n,

  /* A two-byte relative address and a two-byte number N follow.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the subsequent
     two-byte number.  The address *includes* the two bytes of
     number.  */
  set_number_at,

  wordbeg,      /* Succeeds if at word beginning.  */
  wordend,      /* Succeeds if at word end.  */

  wordbound,    /* Succeeds if at a word boundary.  */
  notwordbound, /* Succeeds if not at a word boundary.  */

  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

  /* Matches any character whose syntax is the one specified.  A byte
     follows containing a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not the one specified.  */
  notsyntaxspec

#ifdef emacs
  ,before_dot,  /* Succeeds if before point.  */
  at_dot,       /* Succeeds if at point.  */
  after_dot,    /* Succeeds if after point.  */

  /* Matches any character whose category-set contains the specified
     category.  A byte follows containing the category code (a
     mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  A byte follows containing the category code
     (a mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
725\f
726/* Common operations on the compiled pattern. */
727
728/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
729
730#define STORE_NUMBER(destination, number) \
731 do { \
732 (destination)[0] = (number) & 0377; \
733 (destination)[1] = (number) >> 8; \
734 } while (0)
735
736/* Same as STORE_NUMBER, except increment DESTINATION to
737 the byte after where the number is stored. Therefore, DESTINATION
738 must be an lvalue. */
739
740#define STORE_NUMBER_AND_INCR(destination, number) \
741 do { \
742 STORE_NUMBER (destination, number); \
743 (destination) += 2; \
744 } while (0)
745
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte supplies the low 8 bits; the second byte
   is sign-extended and supplies the high bits, so negative numbers
   can be represented.

   The high byte is scaled by multiplication rather than with `<<':
   it is negative whenever its top bit is set, and left-shifting a
   negative value is undefined behavior in C.

   DESTINATION must be an lvalue; both arguments are evaluated more
   than once.  */

#define EXTRACT_NUMBER(destination, source)                             \
  do {                                                                  \
    (destination) = *(source) & 0377;                                   \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8);     \
  } while (0)
754
755#ifdef DEBUG
4bb91c68 756static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
757static void
758extract_number (dest, source)
759 int *dest;
01618498 760 re_char *source;
fa9a63c5 761{
5e69f11e 762 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
763 *dest = *source & 0377;
764 *dest += temp << 8;
765}
766
4bb91c68 767# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
768# undef EXTRACT_NUMBER
769# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
770# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
771
772#endif /* DEBUG */
773
774/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
775 SOURCE must be an lvalue. */
776
777#define EXTRACT_NUMBER_AND_INCR(destination, source) \
778 do { \
779 EXTRACT_NUMBER (destination, source); \
25fe55af 780 (source) += 2; \
fa9a63c5
RM
781 } while (0)
782
783#ifdef DEBUG
4bb91c68
SM
784static void extract_number_and_incr _RE_ARGS ((int *destination,
785 re_char **source));
fa9a63c5
RM
786static void
787extract_number_and_incr (destination, source)
788 int *destination;
01618498 789 re_char **source;
5e69f11e 790{
fa9a63c5
RM
791 extract_number (destination, *source);
792 *source += 2;
793}
794
0b32bf0e
SM
795# ifndef EXTRACT_MACROS
796# undef EXTRACT_NUMBER_AND_INCR
797# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 798 extract_number_and_incr (&dest, &src)
0b32bf0e 799# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
800
801#endif /* DEBUG */
802\f
b18215fc
RS
803/* Store a multibyte character in three contiguous bytes starting
804 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 805 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
806
807#define STORE_CHARACTER_AND_INCR(destination, character) \
808 do { \
809 (destination)[0] = (character) & 0377; \
810 (destination)[1] = ((character) >> 8) & 0377; \
811 (destination)[2] = (character) >> 16; \
812 (destination) += 3; \
813 } while (0)
814
815/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 816 starting at SOURCE. */
b18215fc
RS
817
818#define EXTRACT_CHARACTER(destination, source) \
819 do { \
820 (destination) = ((source)[0] \
821 | ((source)[1] << 8) \
822 | ((source)[2] << 16)); \
823 } while (0)
824
825
826/* Macros for charset. */
827
828/* Size of bitmap of charset P in bytes. P is a start of charset,
829 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
830#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
831
832/* Nonzero if charset P has range table. */
25fe55af 833#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
834
835/* Return the address of the range table of charset P. This is not the
836 start of the ranges themselves, but the place just before them where
96cc36cc
RS
837 the number of ranges is stored. `4 +' means to skip re_opcode_t, the
838 size of bitmap, and the 2 bytes of flags at the start of the range table. */
839#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
840
841/* Extract the bit flags that start a range table. */
842#define CHARSET_RANGE_TABLE_BITS(p) \
843 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
844 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
845
846/* Test if C is listed in the bitmap of charset P. */
847#define CHARSET_LOOKUP_BITMAP(p, c) \
848 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
849 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
850
851/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
852 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
853 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
854 and end. */
855#define CHARSET_RANGE_TABLE_END(range_table, count) \
856 ((range_table) + (count) * 2 * 3)
857
/* Look C up in RANGE_TABLE, which holds COUNT ranges of two
   three-byte characters each.  If C lies inside one of the ranges
   (bounds included), the flag NOT is negated once and the search
   stops; otherwise NOT is left alone.  */
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
  do \
    { \
      re_wchar_t range_start, range_end; \
      re_char *p; \
      re_char *range_table_end \
        = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
      \
      p = (range_table); \
      while (p < range_table_end) \
        { \
          EXTRACT_CHARACTER (range_start, p); \
          EXTRACT_CHARACTER (range_end, p + 3); \
          \
          if (!((c) < range_start || range_end < (c))) \
            { \
              (not) = !(not); \
              break; \
            } \
          /* Step over one (start, end) pair.  */ \
          p += 2 * 3; \
        } \
    } \
  while (0)
881
/* Look C up in the range table that belongs to CHARSET.  The flag NOT
   is negated if C is covered by one of its ranges.  */
#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
  do { \
    /* Number of ranges in range table.  */ \
    int count; \
    re_char *range_table = CHARSET_RANGE_TABLE (charset); \
    \
    /* Read the range count; this leaves RANGE_TABLE on the first range.  */ \
    EXTRACT_NUMBER_AND_INCR (count, range_table); \
    CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
  } while (0)
895\f
fa9a63c5
RM
896/* If DEBUG is defined, Regex prints many voluminous messages about what
897 it is doing (if the variable `debug' is nonzero). If linked with the
898 main program in `iregex.c', you can enter patterns and strings
899 interactively. And if linked with the main program in `main.c' and
4bb91c68 900 the other test files, you can run the already-written tests. */
fa9a63c5
RM
901
902#ifdef DEBUG
903
904/* We use standard I/O for debugging. */
0b32bf0e 905# include <stdio.h>
fa9a63c5
RM
906
907/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 908# include <assert.h>
fa9a63c5 909
/* Debugging output is produced only while this is positive.  */
static int debug = -100000;

/* DEBUG_STATEMENT splices E into the code verbatim.  Each printing
   macro expands to a single `do ... while (0)' statement, so that it
   stays intact when it is used as the body of an unbraced `if' that
   has an `else'.  Use them as statements, followed by a semicolon.  */
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) \
  do { if (debug > 0) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2) \
  do { if (debug > 0) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug > 0) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug > 0) printf (x1, x2, x3, x4); } while (0)
/* The first argument P is accepted but not used.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug > 0) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug > 0) print_double_string (w, s1, sz1, s2, sz2); } while (0)
fa9a63c5
RM
921
922
923/* Print the fastmap in human-readable form. */
924
925void
926print_fastmap (fastmap)
927 char *fastmap;
928{
929 unsigned was_a_range = 0;
5e69f11e
RM
930 unsigned i = 0;
931
fa9a63c5
RM
932 while (i < (1 << BYTEWIDTH))
933 {
934 if (fastmap[i++])
935 {
936 was_a_range = 0;
25fe55af
RS
937 putchar (i - 1);
938 while (i < (1 << BYTEWIDTH) && fastmap[i])
939 {
940 was_a_range = 1;
941 i++;
942 }
fa9a63c5 943 if (was_a_range)
25fe55af
RS
944 {
945 printf ("-");
946 putchar (i - 1);
947 }
948 }
fa9a63c5 949 }
5e69f11e 950 putchar ('\n');
fa9a63c5
RM
951}
952
953
954/* Print a compiled pattern string in human-readable form, starting at
955 the START pointer into it and ending just before the pointer END. */
956
957void
958print_partial_compiled_pattern (start, end)
01618498
SM
959 re_char *start;
960 re_char *end;
fa9a63c5
RM
961{
962 int mcnt, mcnt2;
01618498
SM
963 re_char *p = start;
964 re_char *pend = end;
fa9a63c5
RM
965
966 if (start == NULL)
967 {
a1a052df 968 fprintf (stderr, "(null)\n");
fa9a63c5
RM
969 return;
970 }
5e69f11e 971
fa9a63c5
RM
972 /* Loop over pattern commands. */
973 while (p < pend)
974 {
a1a052df 975 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
976
977 switch ((re_opcode_t) *p++)
978 {
25fe55af 979 case no_op:
a1a052df 980 fprintf (stderr, "/no_op");
25fe55af 981 break;
fa9a63c5 982
99633e97 983 case succeed:
a1a052df 984 fprintf (stderr, "/succeed");
99633e97
SM
985 break;
986
fa9a63c5
RM
987 case exactn:
988 mcnt = *p++;
a1a052df 989 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 990 do
fa9a63c5 991 {
a1a052df 992 fprintf (stderr, "/%c", *p++);
25fe55af
RS
993 }
994 while (--mcnt);
995 break;
fa9a63c5
RM
996
997 case start_memory:
a1a052df 998 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 999 break;
fa9a63c5
RM
1000
1001 case stop_memory:
a1a052df 1002 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1003 break;
fa9a63c5
RM
1004
1005 case duplicate:
a1a052df 1006 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1007 break;
1008
1009 case anychar:
a1a052df 1010 fprintf (stderr, "/anychar");
fa9a63c5
RM
1011 break;
1012
1013 case charset:
25fe55af
RS
1014 case charset_not:
1015 {
1016 register int c, last = -100;
fa9a63c5 1017 register int in_range = 0;
99633e97
SM
1018 int length = CHARSET_BITMAP_SIZE (p - 1);
1019 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1020
a1a052df 1021 fprintf (stderr, "/charset [%s",
839966f3 1022 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1023
839966f3
KH
1024 if (p + *p >= pend)
1025 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1026
25fe55af 1027 for (c = 0; c < 256; c++)
96cc36cc 1028 if (c / 8 < length
fa9a63c5
RM
1029 && (p[1 + (c/8)] & (1 << (c % 8))))
1030 {
1031 /* Are we starting a range? */
1032 if (last + 1 == c && ! in_range)
1033 {
a1a052df 1034 fprintf (stderr, "-");
fa9a63c5
RM
1035 in_range = 1;
1036 }
1037 /* Have we broken a range? */
1038 else if (last + 1 != c && in_range)
96cc36cc 1039 {
a1a052df 1040 fprintf (stderr, "%c", last);
fa9a63c5
RM
1041 in_range = 0;
1042 }
5e69f11e 1043
fa9a63c5 1044 if (! in_range)
a1a052df 1045 fprintf (stderr, "%c", c);
fa9a63c5
RM
1046
1047 last = c;
25fe55af 1048 }
fa9a63c5
RM
1049
1050 if (in_range)
a1a052df 1051 fprintf (stderr, "%c", last);
fa9a63c5 1052
a1a052df 1053 fprintf (stderr, "]");
fa9a63c5 1054
99633e97 1055 p += 1 + length;
96cc36cc 1056
96cc36cc 1057 if (has_range_table)
99633e97
SM
1058 {
1059 int count;
a1a052df 1060 fprintf (stderr, "has-range-table");
99633e97
SM
1061
1062 /* ??? Should print the range table; for now, just skip it. */
1063 p += 2; /* skip range table bits */
1064 EXTRACT_NUMBER_AND_INCR (count, p);
1065 p = CHARSET_RANGE_TABLE_END (p, count);
1066 }
fa9a63c5
RM
1067 }
1068 break;
1069
1070 case begline:
a1a052df 1071 fprintf (stderr, "/begline");
25fe55af 1072 break;
fa9a63c5
RM
1073
1074 case endline:
a1a052df 1075 fprintf (stderr, "/endline");
25fe55af 1076 break;
fa9a63c5
RM
1077
1078 case on_failure_jump:
25fe55af 1079 extract_number_and_incr (&mcnt, &p);
a1a052df 1080 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1081 break;
fa9a63c5
RM
1082
1083 case on_failure_keep_string_jump:
25fe55af 1084 extract_number_and_incr (&mcnt, &p);
a1a052df 1085 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1086 break;
fa9a63c5 1087
0683b6fa
SM
1088 case on_failure_jump_nastyloop:
1089 extract_number_and_incr (&mcnt, &p);
a1a052df 1090 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1091 break;
1092
505bde11 1093 case on_failure_jump_loop:
fa9a63c5 1094 extract_number_and_incr (&mcnt, &p);
a1a052df 1095 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1096 break;
1097
505bde11 1098 case on_failure_jump_smart:
fa9a63c5 1099 extract_number_and_incr (&mcnt, &p);
a1a052df 1100 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1101 break;
1102
25fe55af 1103 case jump:
fa9a63c5 1104 extract_number_and_incr (&mcnt, &p);
a1a052df 1105 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1106 break;
1107
25fe55af
RS
1108 case succeed_n:
1109 extract_number_and_incr (&mcnt, &p);
1110 extract_number_and_incr (&mcnt2, &p);
a1a052df 1111 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1112 break;
5e69f11e 1113
25fe55af
RS
1114 case jump_n:
1115 extract_number_and_incr (&mcnt, &p);
1116 extract_number_and_incr (&mcnt2, &p);
a1a052df 1117 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1118 break;
5e69f11e 1119
25fe55af
RS
1120 case set_number_at:
1121 extract_number_and_incr (&mcnt, &p);
1122 extract_number_and_incr (&mcnt2, &p);
a1a052df 1123 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1124 break;
5e69f11e 1125
25fe55af 1126 case wordbound:
a1a052df 1127 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1128 break;
1129
1130 case notwordbound:
a1a052df 1131 fprintf (stderr, "/notwordbound");
25fe55af 1132 break;
fa9a63c5
RM
1133
1134 case wordbeg:
a1a052df 1135 fprintf (stderr, "/wordbeg");
fa9a63c5 1136 break;
5e69f11e 1137
fa9a63c5 1138 case wordend:
a1a052df 1139 fprintf (stderr, "/wordend");
e2543b02 1140 break;
5e69f11e 1141
669fa600 1142 case symbeg:
e2543b02 1143 fprintf (stderr, "/symbeg");
669fa600
SM
1144 break;
1145
1146 case symend:
e2543b02 1147 fprintf (stderr, "/symend");
669fa600 1148 break;
5e69f11e 1149
1fb352e0 1150 case syntaxspec:
a1a052df 1151 fprintf (stderr, "/syntaxspec");
1fb352e0 1152 mcnt = *p++;
a1a052df 1153 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1154 break;
1155
1156 case notsyntaxspec:
a1a052df 1157 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1158 mcnt = *p++;
a1a052df 1159 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1160 break;
1161
0b32bf0e 1162# ifdef emacs
fa9a63c5 1163 case before_dot:
a1a052df 1164 fprintf (stderr, "/before_dot");
25fe55af 1165 break;
fa9a63c5
RM
1166
1167 case at_dot:
a1a052df 1168 fprintf (stderr, "/at_dot");
25fe55af 1169 break;
fa9a63c5
RM
1170
1171 case after_dot:
a1a052df 1172 fprintf (stderr, "/after_dot");
25fe55af 1173 break;
fa9a63c5 1174
1fb352e0 1175 case categoryspec:
a1a052df 1176 fprintf (stderr, "/categoryspec");
fa9a63c5 1177 mcnt = *p++;
a1a052df 1178 fprintf (stderr, "/%d", mcnt);
25fe55af 1179 break;
5e69f11e 1180
1fb352e0 1181 case notcategoryspec:
a1a052df 1182 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1183 mcnt = *p++;
a1a052df 1184 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1185 break;
0b32bf0e 1186# endif /* emacs */
fa9a63c5 1187
fa9a63c5 1188 case begbuf:
a1a052df 1189 fprintf (stderr, "/begbuf");
25fe55af 1190 break;
fa9a63c5
RM
1191
1192 case endbuf:
a1a052df 1193 fprintf (stderr, "/endbuf");
25fe55af 1194 break;
fa9a63c5 1195
25fe55af 1196 default:
a1a052df 1197 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1198 }
1199
a1a052df 1200 fprintf (stderr, "\n");
fa9a63c5
RM
1201 }
1202
a1a052df 1203 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1204}
1205
1206
1207void
1208print_compiled_pattern (bufp)
1209 struct re_pattern_buffer *bufp;
1210{
01618498 1211 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1212
1213 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1214 printf ("%ld bytes used/%ld bytes allocated.\n",
1215 bufp->used, bufp->allocated);
fa9a63c5
RM
1216
1217 if (bufp->fastmap_accurate && bufp->fastmap)
1218 {
1219 printf ("fastmap: ");
1220 print_fastmap (bufp->fastmap);
1221 }
1222
1223 printf ("re_nsub: %d\t", bufp->re_nsub);
1224 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1225 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1226 printf ("no_sub: %d\t", bufp->no_sub);
1227 printf ("not_bol: %d\t", bufp->not_bol);
1228 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1229 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1230 fflush (stdout);
fa9a63c5
RM
1231 /* Perhaps we should print the translate table? */
1232}
1233
1234
1235void
1236print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1237 re_char *where;
1238 re_char *string1;
1239 re_char *string2;
fa9a63c5
RM
1240 int size1;
1241 int size2;
1242{
4bb91c68 1243 int this_char;
5e69f11e 1244
fa9a63c5
RM
1245 if (where == NULL)
1246 printf ("(null)");
1247 else
1248 {
1249 if (FIRST_STRING_P (where))
25fe55af
RS
1250 {
1251 for (this_char = where - string1; this_char < size1; this_char++)
1252 putchar (string1[this_char]);
fa9a63c5 1253
25fe55af
RS
1254 where = string2;
1255 }
fa9a63c5
RM
1256
1257 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1258 putchar (string2[this_char]);
fa9a63c5
RM
1259 }
1260}
1261
1262#else /* not DEBUG */
1263
0b32bf0e
SM
1264# undef assert
1265# define assert(e)
fa9a63c5 1266
0b32bf0e
SM
1267# define DEBUG_STATEMENT(e)
1268# define DEBUG_PRINT1(x)
1269# define DEBUG_PRINT2(x1, x2)
1270# define DEBUG_PRINT3(x1, x2, x3)
1271# define DEBUG_PRINT4(x1, x2, x3, x4)
1272# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1273# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1274
1275#endif /* not DEBUG */
1276\f
1277/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1278 also be assigned to arbitrarily: each pattern buffer stores its own
1279 syntax, so it can be changed between regex compilations. */
1280/* This has no initializer because initialized variables in Emacs
1281 become read-only after dumping. */
1282reg_syntax_t re_syntax_options;
1283
1284
1285/* Specify the precise syntax of regexps for compilation. This provides
1286 for compatibility for various utilities which historically have
1287 different, incompatible syntaxes.
1288
1289 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1290 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1291
1292reg_syntax_t
971de7fb 1293re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1294{
1295 reg_syntax_t ret = re_syntax_options;
5e69f11e 1296
fa9a63c5
RM
1297 re_syntax_options = syntax;
1298 return ret;
1299}
c0f9ea08 1300WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1301
1302/* Regexp to use to replace spaces, or NULL meaning don't. */
1303static re_char *whitespace_regexp;
1304
1305void
971de7fb 1306re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1307{
6470ea05 1308 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1309}
1310WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1311\f
1312/* This table gives an error message for each of the error codes listed
4bb91c68 1313 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1314 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1315 but why not be nice? */
fa9a63c5
RM
1316
1317static const char *re_error_msgid[] =
5e69f11e
RM
1318 {
1319 gettext_noop ("Success"), /* REG_NOERROR */
1320 gettext_noop ("No match"), /* REG_NOMATCH */
1321 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1322 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1323 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1324 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1325 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1326 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1327 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1328 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1329 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1330 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1331 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1332 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1333 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1334 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1335 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1336 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1337 };
1338\f
4bb91c68 1339/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1340
1341/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1342 searching and matching functions should not call alloca. On some
1343 systems, alloca is implemented in terms of malloc, and if we're
1344 using the relocating allocator routines, then malloc could cause a
1345 relocation, which might (if the strings being searched are in the
1346 ralloc heap) shift the data out from underneath the regexp
1347 routines.
1348
5e69f11e 1349 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1350 processes input from X in a signal handler; processing X input may
1351 call malloc; if input arrives while a matching routine is calling
1352 malloc, then we're scrod. But Emacs can't just block input while
1353 calling matching routines; then we don't notice interrupts when
1354 they come in. So, Emacs blocks input around all regexp calls
1355 except the matching calls, which it leaves unprotected, in the
1356 faith that they will not malloc. */
1357
1358/* Normally, this is fine. */
1359#define MATCH_MAY_ALLOCATE
1360
fa9a63c5
RM
1361/* The match routines may not allocate if (1) they would do it with malloc
1362 and (2) it's not safe for them to use malloc.
1363 Note that if REL_ALLOC is defined, matching would not use malloc for the
1364 failure stack, but we would still use it for the register vectors;
4bb91c68 1365 so REL_ALLOC should not affect this. */
b588157e 1366#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1367# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1368#endif
1369
1370\f
1371/* Failure stack declarations and macros; both re_compile_fastmap and
1372 re_match_2 use a failure stack. These have to be macros because of
1373 REGEX_ALLOCATE_STACK. */
5e69f11e 1374
fa9a63c5 1375
320a2a73 1376/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1377 when matching. If this number is exceeded, we allocate more
1378 space, so it is not a hard limit. */
1379#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1380# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1381#endif
1382
/* Roughly the maximum number of failure points on the stack.  It would
   be exactly that if we always used TYPICAL_FAILURE_SIZE items each
   time we failed.  This is a variable only so users of regex can assign
   to it; we never change it ourselves.  We always multiply it by
   TYPICAL_FAILURE_SIZE before using it, so it should probably be a
   byte-count instead.  */
c0f9ea08
SM
1388# if defined MATCH_MAY_ALLOCATE
1389/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1390 whose default stack limit is 2mb. In order for a larger
1391 value to work reliably, you have to try to make it accord
1392 with the process stack limit. */
c0f9ea08
SM
1393size_t re_max_failures = 40000;
1394# else
1395size_t re_max_failures = 4000;
1396# endif
fa9a63c5
RM
1397
1398union fail_stack_elt
1399{
01618498 1400 re_char *pointer;
c0f9ea08
SM
1401 /* This should be the biggest `int' that's no bigger than a pointer. */
1402 long integer;
fa9a63c5
RM
1403};
1404
1405typedef union fail_stack_elt fail_stack_elt_t;
1406
1407typedef struct
1408{
1409 fail_stack_elt_t *stack;
c0f9ea08
SM
1410 size_t size;
1411 size_t avail; /* Offset of next open position. */
1412 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1413} fail_stack_type;
1414
/* Nonzero when the frame offset of `fail_stack' is zero.  */
#define FAIL_STACK_EMPTY() (!fail_stack.frame)
/* Nonzero when the next open position has reached the stack size.  */
#define FAIL_STACK_FULL() (fail_stack.size == fail_stack.avail)
1417
1418
/* INIT_FAIL_STACK prepares `fail_stack' for use and RESET_FAIL_STACK
   releases it again.  When matching may allocate, INIT_FAIL_STACK
   obtains the initial storage itself and makes the enclosing function
   `return -2' if that allocation fails; otherwise it only resets the
   two offsets and RESET_FAIL_STACK does nothing.

   NOTE(review): the allocation covers INIT_FAILURE_ALLOC
   * TYPICAL_FAILURE_SIZE elements, yet `size' is set to
   INIT_FAILURE_ALLOC -- confirm that this is intended.  */

#ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.stack = (fail_stack_elt_t *) \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
                            * sizeof (fail_stack_elt_t)); \
    \
    if (!fail_stack.stack) \
      return -2; \
    \
    fail_stack.frame = 0; \
    fail_stack.avail = 0; \
    fail_stack.size = INIT_FAILURE_ALLOC; \
  } while (0)

# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
#else
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.frame = 0; \
    fail_stack.avail = 0; \
  } while (0)

# define RESET_FAIL_STACK() ((void) 0)
#endif
1447
1448
320a2a73
KH
/* Grow FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a
   limit which allows approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */
fa9a63c5 1456
320a2a73
KH
/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
1462#define FAIL_STACK_GROWTH_FACTOR 4
1463
/* The size FAIL_STACK is given by one growth step: its current size in
   bytes times FAIL_STACK_GROWTH_FACTOR, capped at
   re_max_failures * TYPICAL_FAILURE_SIZE.  GROW_FAIL_STACK passes this
   value as the new size to REGEX_REALLOCATE_STACK and divides it by the
   element size to obtain the new `size'.  */
#define GROWN_FAIL_STACK_BYTES(fail_stack) \
  MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
       ((fail_stack).size * sizeof (fail_stack_elt_t) \
        * FAIL_STACK_GROWTH_FACTOR))

/* Enlarge FAIL_STACK by one growth step.  Evaluates to 0 if the stack
   has already reached the limit or if the reallocation yields NULL,
   and to 1 after the stack pointer and `size' have been updated.  */
#define GROW_FAIL_STACK(fail_stack) \
  (((fail_stack).size * sizeof (fail_stack_elt_t) \
    >= re_max_failures * TYPICAL_FAILURE_SIZE) \
   ? 0 \
   : ((fail_stack).stack \
      = (fail_stack_elt_t *) \
        REGEX_REALLOCATE_STACK ((fail_stack).stack, \
          (fail_stack).size * sizeof (fail_stack_elt_t), \
          GROWN_FAIL_STACK_BYTES (fail_stack)), \
      \
      (fail_stack).stack == NULL \
      ? 0 \
      : ((fail_stack).size \
         = (GROWN_FAIL_STACK_BYTES (fail_stack) \
            / sizeof (fail_stack_elt_t)), \
         1)))
fa9a63c5
RM
1484
1485
fa9a63c5
RM
/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  No bounds check is done
   here; room must have been made beforehand.  The expansion is
   parenthesized so that it behaves as one expression wherever it is
   used.  */
#define PUSH_FAILURE_POINTER(item) \
  (fail_stack.stack[fail_stack.avail++].pointer = (item))

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  No bounds check is done
   here.  */
#define PUSH_FAILURE_INT(item) \
  (fail_stack.stack[fail_stack.avail++].integer = (item))

/* Push a fail_stack_elt_t value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  No bounds check is done
   here.  */
#define PUSH_FAILURE_ELT(item) \
  (fail_stack.stack[fail_stack.avail++] = (item))
1503
/* The counterparts of the three PUSH... operations: each one steps
   `fail_stack.avail' back by one and yields the slot found there.
   All assume that `fail_stack' is nonempty.  */
#define POP_FAILURE_POINTER() (fail_stack.stack[--fail_stack.avail].pointer)
#define POP_FAILURE_INT() (fail_stack.stack[--fail_stack.avail].integer)
#define POP_FAILURE_ELT() (fail_stack.stack[--fail_stack.avail])
1509
505bde11
SM
1510/* Individual items aside from the registers. */
1511#define NUM_NONREG_ITEMS 3
1512
/* Accessors used to examine the stack (to detect infinite loops).
   The handle H of a frame is the offset just past the frame's three
   fixed items: the pattern pointer sits at H - 1, the string pointer
   at H - 2, and the handle of the next frame down at H - 3.
   TOP_FAILURE_HANDLE yields the handle of the topmost frame.  */
#define FAILURE_PAT(h) (fail_stack.stack[(h) - 1].pointer)
#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
#define NEXT_FAILURE_HANDLE(h) (fail_stack.stack[(h) - 3].integer)
#define TOP_FAILURE_HANDLE() (fail_stack.frame)
fa9a63c5
RM
1518
1519
505bde11
SM
/* Make sure that more than SPACE slots are free on `fail_stack',
   growing the stack as often as that takes.  Does `return -2' from the
   enclosing function if the stack cannot grow.  GROW_FAIL_STACK needs
   a variable `destination' to be declared by the user of this macro.
   The expansion is a bare `while' loop, so use it only as a complete
   statement.  */
#define ENSURE_FAIL_STACK(space) \
while (REMAINING_AVAIL_SLOTS <= (space)) { \
  if (!GROW_FAIL_STACK (fail_stack)) \
    return -2; \
  /* The counters are size_t; convert them for the `%d' conversions.  */ \
  DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
                (int) (fail_stack).size); \
  DEBUG_PRINT2 (" slots available: %d\n", (int) REMAINING_AVAIL_SLOTS); \
}
1527
/* Save register NUM on the failure stack as three items: its start
   pointer, its end pointer, and finally NUM itself.  Does `return -2'
   (through ENSURE_FAIL_STACK) if the stack cannot be grown.  */
#define PUSH_FAILURE_REG(num) \
  do \
    { \
      /* Needed by the stack-growing code.  */ \
      char *destination; \
      ENSURE_FAIL_STACK (3); \
      DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
                    num, regstart[num], regend[num]); \
      PUSH_FAILURE_POINTER (regstart[num]); \
      PUSH_FAILURE_POINTER (regend[num]); \
      PUSH_FAILURE_INT (num); \
    } \
  while (0)
1539
01618498
SM
/* Store VAL into the counter at PTR, after saving the counter's old
   value on the failure stack so that it is restored when backtracking.
   The saved record is: old value, PTR, and the marker -1 that tells it
   apart from a saved register.  Does `return -2' (through
   ENSURE_FAIL_STACK) if the stack cannot be grown.  */
#define PUSH_NUMBER(ptr,val) \
  do \
    { \
      /* Needed by the stack-growing code.  */ \
      char *destination; \
      int c; \
      ENSURE_FAIL_STACK (3); \
      EXTRACT_NUMBER (c, ptr); \
      DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
      PUSH_FAILURE_INT (c); \
      PUSH_FAILURE_POINTER (ptr); \
      PUSH_FAILURE_INT (-1); \
      STORE_NUMBER (ptr, val); \
    } \
  while (0)
1554
/* Pop one saved record off the failure stack.  The topmost integer
   says what it is: a register number means a saved register, whose
   end and start pointers are restored; -1 means a saved counter, whose
   old value is written back into the compiled pattern.  */
#define POP_FAILURE_REG_OR_COUNT() \
  do \
    { \
      int reg = POP_FAILURE_INT (); \
      if (reg != -1) \
        { \
          regend[reg] = POP_FAILURE_POINTER (); \
          regstart[reg] = POP_FAILURE_POINTER (); \
          DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
                        reg, regstart[reg], regend[reg]); \
        } \
      else \
        { \
          /* It's a counter.  */ \
          /* Here, we discard `const', making re_match non-reentrant.  */ \
          unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
          reg = POP_FAILURE_INT (); \
          STORE_NUMBER (ptr, reg); \
          DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
        } \
    } \
  while (0)
1576
/* Check that we are not stuck in an infinite loop.

   Walks the failure frames from the top of `fail_stack' for as long as
   their saved string position is STRING_PLACE or NULL.  If one of
   those frames saved PAT_CUR as its pattern position, the variable
   `cycle' of the surrounding code is set to 1 and the walk stops.
   Requires `fail_stack' and `cycle' to be in scope, and `bufp' too
   when assertions are enabled.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
do { \
  int failure = TOP_FAILURE_HANDLE (); \
  /* Check for infinite matching loops */ \
  while (failure > 0 \
         && (FAILURE_STR (failure) == (string_place) \
             || FAILURE_STR (failure) == NULL)) \
    { \
      assert (FAILURE_PAT (failure) >= bufp->buffer \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
      if (FAILURE_PAT (failure) == (pat_cur)) \
        { \
          cycle = 1; \
          break; \
        } \
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
      failure = NEXT_FAILURE_HANDLE(failure); \
    } \
  /* FAILURE is 0 once the whole stack has been walked; there is then \
     no frame left whose string position could be read.  */ \
  if (failure > 0) \
    { \
      DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
    } \
} while (0)
6df42991 1598
/* Push the information about the state we will need
   if we ever fail back to it.

   Three items are pushed, in this order: the current frame offset,
   STRING_PLACE and PATTERN.  The frame offset is then moved to the new
   top of the stack.

   Requires the variable fail_stack to be declared; the debugging
   output additionally uses string1, size1, string2, size2, pend and
   nfailure_points_pushed.  GROW_FAIL_STACK requires `destination' be
   declared, which this macro does itself.

   Does `return -2' (through ENSURE_FAIL_STACK) if runs out of memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place) \
do { \
  char *destination; \
  \
  DEBUG_STATEMENT (nfailure_points_pushed++); \
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
  /* The stack counters are size_t; convert them for `%d'.  */ \
  DEBUG_PRINT2 (" Before push, next avail: %d\n", \
                (int) (fail_stack).avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) (fail_stack).size); \
  \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
  \
  DEBUG_PRINT1 ("\n"); \
  \
  DEBUG_PRINT2 (" Push frame index: %d\n", (int) fail_stack.frame); \
  PUSH_FAILURE_INT (fail_stack.frame); \
  \
  DEBUG_PRINT2 (" Push string %p: `", string_place); \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
  PUSH_FAILURE_POINTER (string_place); \
  \
  DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
  PUSH_FAILURE_POINTER (pattern); \
  \
  /* Close the frame by moving the frame pointer past it.  */ \
  fail_stack.frame = fail_stack.avail; \
} while (0)
fa9a63c5 1638
320a2a73
KH
1639/* Estimate the size of data pushed by a typical failure stack entry.
1640 An estimate is all we need, because all we use this for
1641 is to choose a limit for how big to make the failure stack. */
ada30c0e 1642/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1643#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1644
fa9a63c5
RM
1645/* How many items can still be added to the stack without overflowing it. */
1646#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1647
1648
/* Pops what PUSH_FAILURE_POINT pushes, together with any registers and
   counters that were saved on top of that frame.

   We restore into the parameters, both of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.

   Saved registers are restored into the arrays `regstart' and
   `regend', which must be in scope along with `fail_stack'.  When
   debugging, `pend', `string1', `size1', `string2', `size2' and
   `nfailure_points_popped' are used as well.  */

#define POP_FAILURE_POINT(str, pat) \
do { \
  assert (!FAIL_STACK_EMPTY ()); \
  \
  /* Remove failure points and point to how many regs pushed.  */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  /* The stack counters are size_t; convert them for `%d'.  */ \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", (int) fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) fail_stack.size); \
  \
  /* Pop the saved registers.  */ \
  while (fail_stack.frame < fail_stack.avail) \
    POP_FAILURE_REG_OR_COUNT (); \
  \
  pat = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
  \
  /* The saved string location may be NULL, in which case it came from \
     an on_failure_keep_string_jump opcode.  It is handed back in STR \
     unchanged; dealing with a NULL is left to the code using this \
     macro.  */ \
  str = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
  \
  fail_stack.frame = POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping frame index: %d\n", (int) fail_stack.frame); \
  \
  /* `avail' is unsigned, so only its relation to `frame' is checked.  */ \
  assert (fail_stack.frame <= fail_stack.avail); \
  \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1692
1693
1694\f
fa9a63c5 1695/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1696#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1697\f
1698/* Subroutine declarations and macros for regex_compile. */
1699
4bb91c68
SM
1700static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1701 reg_syntax_t syntax,
1702 struct re_pattern_buffer *bufp));
1703static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1704static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1705 int arg1, int arg2));
1706static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1707 int arg, unsigned char *end));
1708static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1709 int arg1, int arg2, unsigned char *end));
01618498
SM
1710static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1711 re_char *p,
4bb91c68 1712 reg_syntax_t syntax));
01618498
SM
1713static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1714 re_char *pend,
4bb91c68 1715 reg_syntax_t syntax));
01618498
SM
1716static re_char *skip_one_char _RE_ARGS ((re_char *p));
1717static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1718 char *fastmap, const int multibyte));
fa9a63c5 1719
fa9a63c5 1720/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1721 translation. */
36595814 1722#define PATFETCH(c) \
2d1675e4
SM
1723 do { \
1724 int len; \
1725 if (p == pend) return REG_EEND; \
62a6e103 1726 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
2d1675e4 1727 p += len; \
fa9a63c5
RM
1728 } while (0)
1729
fa9a63c5
RM
1730
1731/* If `translate' is non-null, return translate[D], else just D. We
1732 cast the subscript to translate because some data is declared as
1733 `char *', to avoid warnings when a string constant is passed. But
1734 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1735#ifndef TRANSLATE
0b32bf0e 1736# define TRANSLATE(d) \
66f0296e 1737 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1738#endif
fa9a63c5
RM
1739
1740
1741/* Macros for outputting the compiled pattern into `buffer'. */
1742
1743/* If the buffer isn't allocated when it comes in, use this. */
1744#define INIT_BUF_SIZE 32
1745
4bb91c68 1746/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1747#define GET_BUFFER_SPACE(n) \
01618498 1748 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1749 EXTEND_BUFFER ()
1750
1751/* Make sure we have one more byte of buffer space and then add C to it. */
1752#define BUF_PUSH(c) \
1753 do { \
1754 GET_BUFFER_SPACE (1); \
1755 *b++ = (unsigned char) (c); \
1756 } while (0)
1757
1758
1759/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1760#define BUF_PUSH_2(c1, c2) \
1761 do { \
1762 GET_BUFFER_SPACE (2); \
1763 *b++ = (unsigned char) (c1); \
1764 *b++ = (unsigned char) (c2); \
1765 } while (0)
1766
1767
4bb91c68 1768/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1769#define BUF_PUSH_3(c1, c2, c3) \
1770 do { \
1771 GET_BUFFER_SPACE (3); \
1772 *b++ = (unsigned char) (c1); \
1773 *b++ = (unsigned char) (c2); \
1774 *b++ = (unsigned char) (c3); \
1775 } while (0)
1776
1777
1778/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1779 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1780#define STORE_JUMP(op, loc, to) \
1781 store_op1 (op, loc, (to) - (loc) - 3)
1782
1783/* Likewise, for a two-argument jump. */
1784#define STORE_JUMP2(op, loc, to, arg) \
1785 store_op2 (op, loc, (to) - (loc) - 3, arg)
1786
4bb91c68 1787/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1788#define INSERT_JUMP(op, loc, to) \
1789 insert_op1 (op, loc, (to) - (loc) - 3, b)
1790
1791/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1792#define INSERT_JUMP2(op, loc, to, arg) \
1793 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1794
1795
1796/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1797 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1798 be too small, many things would have to change. */
839966f3
KH
1799# define MAX_BUF_SIZE (1L << 15)
1800
1801#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1802/* Any other compiler which, like MSC, has allocation limit below 2^16
1803 bytes will have to use approach similar to what was done below for
1804 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1805 reallocating to 0 bytes. Such thing is not going to work too well.
1806 You have been warned!! */
1807#if defined _MSC_VER && !defined WIN32
1808/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1809# define MAX_BUF_SIZE 65500L
1810#else
1811# define MAX_BUF_SIZE (1L << 16)
1812#endif
839966f3 1813#endif /* 0 */
fa9a63c5
RM
1814
/* Double the size of the buffer via realloc (clamping the new size to
   MAX_BUF_SIZE) and reset the pointers that pointed into the old block
   to point to the correct places in the new one.  If the buffer is
   already MAX_BUF_SIZE bytes long, return REG_ESIZE from the
   surrounding function; if the reallocation fails, return REG_ESPACE.  */
1819#if __BOUNDED_POINTERS__
1820# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
381880b0
CY
1821# define MOVE_BUFFER_POINTER(P) \
1822 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1823 SET_HIGH_BOUND (P), \
1824 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
4bb91c68
SM
1825# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1826 else \
1827 { \
1828 SET_HIGH_BOUND (b); \
1829 SET_HIGH_BOUND (begalt); \
1830 if (fixup_alt_jump) \
1831 SET_HIGH_BOUND (fixup_alt_jump); \
1832 if (laststart) \
1833 SET_HIGH_BOUND (laststart); \
1834 if (pending_exact) \
1835 SET_HIGH_BOUND (pending_exact); \
1836 }
1837#else
381880b0 1838# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
4bb91c68
SM
1839# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1840#endif
fa9a63c5 1841#define EXTEND_BUFFER() \
25fe55af 1842 do { \
381880b0 1843 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1844 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1845 return REG_ESIZE; \
1846 bufp->allocated <<= 1; \
1847 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1848 bufp->allocated = MAX_BUF_SIZE; \
01618498 1849 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1850 if (bufp->buffer == NULL) \
1851 return REG_ESPACE; \
1852 /* If the buffer moved, move all the pointers into it. */ \
1853 if (old_buffer != bufp->buffer) \
1854 { \
381880b0 1855 unsigned char *new_buffer = bufp->buffer; \
4bb91c68
SM
1856 MOVE_BUFFER_POINTER (b); \
1857 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1858 if (fixup_alt_jump) \
4bb91c68 1859 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1860 if (laststart) \
4bb91c68 1861 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1862 if (pending_exact) \
4bb91c68 1863 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1864 } \
4bb91c68 1865 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1866 } while (0)
1867
1868
1869/* Since we have one byte reserved for the register number argument to
1870 {start,stop}_memory, the maximum number of groups we can report
1871 things about is what fits in that byte. */
1872#define MAX_REGNUM 255
1873
1874/* But patterns can have more than `MAX_REGNUM' registers. We just
1875 ignore the excess. */
098d42af 1876typedef int regnum_t;
fa9a63c5
RM
1877
1878
1879/* Macros for the compile stack. */
1880
1881/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1882 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1883/* int may be not enough when sizeof(int) == 2. */
1884typedef long pattern_offset_t;
fa9a63c5
RM
1885
1886typedef struct
1887{
1888 pattern_offset_t begalt_offset;
1889 pattern_offset_t fixup_alt_jump;
5e69f11e 1890 pattern_offset_t laststart_offset;
fa9a63c5
RM
1891 regnum_t regnum;
1892} compile_stack_elt_t;
1893
1894
1895typedef struct
1896{
1897 compile_stack_elt_t *stack;
1898 unsigned size;
1899 unsigned avail; /* Offset of next open position. */
1900} compile_stack_type;
1901
1902
1903#define INIT_COMPILE_STACK_SIZE 32
1904
1905#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1906#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1907
4bb91c68 1908/* The next available element. */
fa9a63c5
RM
1909#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1910
1cee1e27
SM
1911/* Explicit quit checking is only used on NTemacs and whenever we
1912 use polling to process input events. */
1913#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1914extern int immediate_quit;
1915# define IMMEDIATE_QUIT_CHECK \
1916 do { \
1917 if (immediate_quit) QUIT; \
1918 } while (0)
1919#else
1920# define IMMEDIATE_QUIT_CHECK ((void)0)
1921#endif
1922\f
b18215fc
RS
1923/* Structure to manage work area for range table. */
1924struct range_table_work_area
1925{
1926 int *table; /* actual work area. */
1927 int allocated; /* allocated size for work area in bytes. */
7814e705 1928 int used; /* actually used size in words. */
96cc36cc 1929 int bits; /* flag to record character classes */
b18215fc
RS
1930};
1931
77d11aec
RS
/* Make sure that WORK_AREA has room for N more `int' elements.
   WORK_AREA must be a `struct range_table_work_area' lvalue (not a
   pointer), because its members are accessed with `.' and its address
   is taken.
   If it can't get the space, it returns REG_ESPACE from the
   surrounding function.  */
1936
1937#define EXTEND_RANGE_TABLE(work_area, n) \
1938 do { \
8f924df7 1939 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
77d11aec 1940 { \
8f924df7
KH
1941 extend_range_table_work_area (&work_area); \
1942 if ((work_area).table == 0) \
77d11aec
RS
1943 return (REG_ESPACE); \
1944 } \
b18215fc
RS
1945 } while (0)
1946
96cc36cc
RS
/* Turn on BIT in the `bits' flag word of WORK_AREA.  The whole
   expansion is parenthesized so that the macro can be used safely
   inside a larger expression.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)			\
  ((work_area).bits |= (bit))
1949
14473664
SM
1950/* Bits used to implement the multibyte-part of the various character classes
1951 such as [:alnum:] in a charset's range table. */
1952#define BIT_WORD 0x1
1953#define BIT_LOWER 0x2
1954#define BIT_PUNCT 0x4
1955#define BIT_SPACE 0x8
1956#define BIT_UPPER 0x10
1957#define BIT_MULTIBYTE 0x20
96cc36cc 1958
b18215fc
RS
1959/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1960#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
77d11aec 1961 do { \
8f924df7 1962 EXTEND_RANGE_TABLE ((work_area), 2); \
b18215fc
RS
1963 (work_area).table[(work_area).used++] = (range_start); \
1964 (work_area).table[(work_area).used++] = (range_end); \
1965 } while (0)
1966
7814e705 1967/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1968#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1969 do { \
1970 if ((work_area).table) \
1971 free ((work_area).table); \
1972 } while (0)
1973
96cc36cc 1974#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1975#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1976#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1977#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1978\f
b18215fc 1979
fa9a63c5 1980/* Set the bit for character C in a list. */
01618498 1981#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1982
1983
bf216479
KH
1984#ifdef emacs
1985
cf9c99bc
KH
1986/* Store characters in the range FROM to TO in the bitmap at B (for
1987 ASCII and unibyte characters) and WORK_AREA (for multibyte
1988 characters) while translating them and paying attention to the
1989 continuity of translated characters.
8f924df7 1990
cf9c99bc
KH
1991 Implementation note: It is better to implement these fairly big
1992 macros by a function, but it's not that easy because macros called
8f924df7 1993 in this macro assume various local variables already declared. */
bf216479 1994
cf9c99bc
KH
1995/* Both FROM and TO are ASCII characters. */
1996
1997#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1998 do { \
1999 int C0, C1; \
2000 \
2001 for (C0 = (FROM); C0 <= (TO); C0++) \
2002 { \
2003 C1 = TRANSLATE (C0); \
2004 if (! ASCII_CHAR_P (C1)) \
2005 { \
2006 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2007 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
2008 C1 = C0; \
2009 } \
2010 SET_LIST_BIT (C1); \
2011 } \
2012 } while (0)
2013
2014
2015/* Both FROM and TO are unibyte characters (0x80..0xFF). */
2016
2017#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
2018 do { \
2019 int C0, C1, C2, I; \
2020 int USED = RANGE_TABLE_WORK_USED (work_area); \
2021 \
2022 for (C0 = (FROM); C0 <= (TO); C0++) \
2023 { \
2024 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
2025 if (CHAR_BYTE8_P (C1)) \
2026 SET_LIST_BIT (C0); \
2027 else \
2028 { \
2029 C2 = TRANSLATE (C1); \
2030 if (C2 == C1 \
2031 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
2032 C1 = C0; \
2033 SET_LIST_BIT (C1); \
2034 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2035 { \
2036 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2037 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2038 \
2039 if (C2 >= from - 1 && C2 <= to + 1) \
2040 { \
2041 if (C2 == from - 1) \
2042 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2043 else if (C2 == to + 1) \
2044 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2045 break; \
2046 } \
2047 } \
2048 if (I < USED) \
2049 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
2050 } \
2051 } \
2052 } while (0)
2053
2054
78edd3b7 2055/* Both FROM and TO are multibyte characters. */
cf9c99bc
KH
2056
2057#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
2058 do { \
2059 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
2060 \
2061 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
2062 for (C0 = (FROM); C0 <= (TO); C0++) \
2063 { \
2064 C1 = TRANSLATE (C0); \
2065 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
2066 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
2067 SET_LIST_BIT (C2); \
2068 if (C1 >= (FROM) && C1 <= (TO)) \
2069 continue; \
2070 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2071 { \
2072 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2073 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2074 \
2075 if (C1 >= from - 1 && C1 <= to + 1) \
2076 { \
2077 if (C1 == from - 1) \
2078 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2079 else if (C1 == to + 1) \
2080 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2081 break; \
2082 } \
2083 } \
2084 if (I < USED) \
2085 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2086 } \
bf216479
KH
2087 } while (0)
2088
2089#endif /* emacs */
2090
fa9a63c5 2091/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 2092#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2093 do { \
2094 if (p == pend) \
2095 FREE_STACK_RETURN (REG_EBRACE); \
2096 else \
2097 { \
2098 PATFETCH (c); \
2099 while ('0' <= c && c <= '9') \
2100 { \
2101 int prev; \
2102 if (num < 0) \
2103 num = 0; \
2104 prev = num; \
2105 num = num * 10 + c - '0'; \
2106 if (num / 10 != prev) \
2107 FREE_STACK_RETURN (REG_BADBR); \
2108 if (p == pend) \
2109 FREE_STACK_RETURN (REG_EBRACE); \
2110 PATFETCH (c); \
2111 } \
2112 } \
2113 } while (0)
77d11aec 2114\f
1fdab503 2115#if ! WIDE_CHAR_SUPPORT
01618498 2116
14473664 2117/* Map a string to the char class it names (if any). */
1fdab503 2118re_wctype_t
971de7fb 2119re_wctype (const re_char *str)
14473664 2120{
5b0534c8 2121 const char *string = (const char *) str;
14473664
SM
2122 if (STREQ (string, "alnum")) return RECC_ALNUM;
2123 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2124 else if (STREQ (string, "word")) return RECC_WORD;
2125 else if (STREQ (string, "ascii")) return RECC_ASCII;
2126 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2127 else if (STREQ (string, "graph")) return RECC_GRAPH;
2128 else if (STREQ (string, "lower")) return RECC_LOWER;
2129 else if (STREQ (string, "print")) return RECC_PRINT;
2130 else if (STREQ (string, "punct")) return RECC_PUNCT;
2131 else if (STREQ (string, "space")) return RECC_SPACE;
2132 else if (STREQ (string, "upper")) return RECC_UPPER;
2133 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2134 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2135 else if (STREQ (string, "digit")) return RECC_DIGIT;
2136 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2137 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2138 else if (STREQ (string, "blank")) return RECC_BLANK;
2139 else return 0;
2140}
2141
e0f24100 2142/* True if CH is in the char class CC. */
1fdab503 2143boolean
971de7fb 2144re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2145{
2146 switch (cc)
2147 {
0cdd06f8
SM
2148 case RECC_ALNUM: return ISALNUM (ch);
2149 case RECC_ALPHA: return ISALPHA (ch);
2150 case RECC_BLANK: return ISBLANK (ch);
2151 case RECC_CNTRL: return ISCNTRL (ch);
2152 case RECC_DIGIT: return ISDIGIT (ch);
2153 case RECC_GRAPH: return ISGRAPH (ch);
2154 case RECC_LOWER: return ISLOWER (ch);
2155 case RECC_PRINT: return ISPRINT (ch);
2156 case RECC_PUNCT: return ISPUNCT (ch);
2157 case RECC_SPACE: return ISSPACE (ch);
2158 case RECC_UPPER: return ISUPPER (ch);
2159 case RECC_XDIGIT: return ISXDIGIT (ch);
2160 case RECC_ASCII: return IS_REAL_ASCII (ch);
2161 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2162 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2163 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2164 case RECC_WORD: return ISWORD (ch);
2165 case RECC_ERROR: return false;
2166 default:
2167 abort();
14473664
SM
2168 }
2169}
fa9a63c5 2170
14473664
SM
2171/* Return a bit-pattern to use in the range-table bits to match multibyte
2172 chars of class CC. */
2173static int
971de7fb 2174re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2175{
2176 switch (cc)
2177 {
2178 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2179 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2180 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2181 case RECC_LOWER: return BIT_LOWER;
2182 case RECC_UPPER: return BIT_UPPER;
2183 case RECC_PUNCT: return BIT_PUNCT;
2184 case RECC_SPACE: return BIT_SPACE;
14473664 2185 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2186 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2187 default:
2188 abort();
14473664
SM
2189 }
2190}
2191#endif
77d11aec
RS
2192\f
2193/* Filling in the work area of a range. */
2194
2195/* Actually extend the space in WORK_AREA. */
2196
2197static void
971de7fb 2198extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2199{
77d11aec
RS
2200 work_area->allocated += 16 * sizeof (int);
2201 if (work_area->table)
2202 work_area->table
2203 = (int *) realloc (work_area->table, work_area->allocated);
2204 else
2205 work_area->table
2206 = (int *) malloc (work_area->allocated);
2207}
2208
8f924df7 2209#if 0
77d11aec
RS
2210#ifdef emacs
2211
2212/* Carefully find the ranges of codes that are equivalent
2213 under case conversion to the range start..end when passed through
2214 TRANSLATE. Handle the case where non-letters can come in between
2215 two upper-case letters (which happens in Latin-1).
2216 Also handle the case of groups of more than 2 case-equivalent chars.
2217
2218 The basic method is to look at consecutive characters and see
2219 if they can form a run that can be handled as one.
2220
2221 Returns -1 if successful, REG_ESPACE if ran out of space. */
2222
2223static int
2224set_image_of_range_1 (work_area, start, end, translate)
2225 RE_TRANSLATE_TYPE translate;
2226 struct range_table_work_area *work_area;
2227 re_wchar_t start, end;
2228{
2229 /* `one_case' indicates a character, or a run of characters,
2230 each of which is an isolate (no case-equivalents).
2231 This includes all ASCII non-letters.
2232
2233 `two_case' indicates a character, or a run of characters,
2234 each of which has two case-equivalent forms.
2235 This includes all ASCII letters.
2236
2237 `strange' indicates a character that has more than one
2238 case-equivalent. */
177c0ea7 2239
77d11aec
RS
2240 enum case_type {one_case, two_case, strange};
2241
2242 /* Describe the run that is in progress,
2243 which the next character can try to extend.
2244 If run_type is strange, that means there really is no run.
2245 If run_type is one_case, then run_start...run_end is the run.
2246 If run_type is two_case, then the run is run_start...run_end,
2247 and the case-equivalents end at run_eqv_end. */
2248
2249 enum case_type run_type = strange;
2250 int run_start, run_end, run_eqv_end;
2251
2252 Lisp_Object eqv_table;
2253
2254 if (!RE_TRANSLATE_P (translate))
2255 {
b7c12565 2256 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2257 work_area->table[work_area->used++] = (start);
2258 work_area->table[work_area->used++] = (end);
b7c12565 2259 return -1;
77d11aec
RS
2260 }
2261
2262 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2263
77d11aec
RS
2264 for (; start <= end; start++)
2265 {
2266 enum case_type this_type;
2267 int eqv = RE_TRANSLATE (eqv_table, start);
2268 int minchar, maxchar;
2269
2270 /* Classify this character */
2271 if (eqv == start)
2272 this_type = one_case;
2273 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2274 this_type = two_case;
2275 else
2276 this_type = strange;
2277
2278 if (start < eqv)
2279 minchar = start, maxchar = eqv;
2280 else
2281 minchar = eqv, maxchar = start;
2282
2283 /* Can this character extend the run in progress? */
2284 if (this_type == strange || this_type != run_type
2285 || !(minchar == run_end + 1
2286 && (run_type == two_case
2287 ? maxchar == run_eqv_end + 1 : 1)))
2288 {
2289 /* No, end the run.
2290 Record each of its equivalent ranges. */
2291 if (run_type == one_case)
2292 {
2293 EXTEND_RANGE_TABLE (work_area, 2);
2294 work_area->table[work_area->used++] = run_start;
2295 work_area->table[work_area->used++] = run_end;
2296 }
2297 else if (run_type == two_case)
2298 {
2299 EXTEND_RANGE_TABLE (work_area, 4);
2300 work_area->table[work_area->used++] = run_start;
2301 work_area->table[work_area->used++] = run_end;
2302 work_area->table[work_area->used++]
2303 = RE_TRANSLATE (eqv_table, run_start);
2304 work_area->table[work_area->used++]
2305 = RE_TRANSLATE (eqv_table, run_end);
2306 }
2307 run_type = strange;
2308 }
177c0ea7 2309
77d11aec
RS
2310 if (this_type == strange)
2311 {
2312 /* For a strange character, add each of its equivalents, one
2313 by one. Don't start a range. */
2314 do
2315 {
2316 EXTEND_RANGE_TABLE (work_area, 2);
2317 work_area->table[work_area->used++] = eqv;
2318 work_area->table[work_area->used++] = eqv;
2319 eqv = RE_TRANSLATE (eqv_table, eqv);
2320 }
2321 while (eqv != start);
2322 }
2323
2324 /* Add this char to the run, or start a new run. */
2325 else if (run_type == strange)
2326 {
2327 /* Initialize a new range. */
2328 run_type = this_type;
2329 run_start = start;
2330 run_end = start;
2331 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2332 }
2333 else
2334 {
2335 /* Extend a running range. */
2336 run_end = minchar;
2337 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2338 }
2339 }
2340
2341 /* If a run is still in progress at the end, finish it now
2342 by recording its equivalent ranges. */
2343 if (run_type == one_case)
2344 {
2345 EXTEND_RANGE_TABLE (work_area, 2);
2346 work_area->table[work_area->used++] = run_start;
2347 work_area->table[work_area->used++] = run_end;
2348 }
2349 else if (run_type == two_case)
2350 {
2351 EXTEND_RANGE_TABLE (work_area, 4);
2352 work_area->table[work_area->used++] = run_start;
2353 work_area->table[work_area->used++] = run_end;
2354 work_area->table[work_area->used++]
2355 = RE_TRANSLATE (eqv_table, run_start);
2356 work_area->table[work_area->used++]
2357 = RE_TRANSLATE (eqv_table, run_end);
2358 }
2359
2360 return -1;
2361}
36595814 2362
77d11aec 2363#endif /* emacs */
36595814 2364
2b34df4e 2365/* Record the image of the range start..end when passed through
36595814
SM
2366 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2367 and is not even necessarily contiguous.
b7c12565
RS
2368 Normally we approximate it with the smallest contiguous range that contains
2369 all the chars we need. However, for Latin-1 we go to extra effort
2370 to do a better job.
2371
2372 This function is not called for ASCII ranges.
77d11aec
RS
2373
2374 Returns -1 if successful, REG_ESPACE if ran out of space. */
2375
2376static int
36595814
SM
2377set_image_of_range (work_area, start, end, translate)
2378 RE_TRANSLATE_TYPE translate;
2379 struct range_table_work_area *work_area;
2380 re_wchar_t start, end;
2381{
77d11aec
RS
2382 re_wchar_t cmin, cmax;
2383
2384#ifdef emacs
2385 /* For Latin-1 ranges, use set_image_of_range_1
2386 to get proper handling of ranges that include letters and nonletters.
b7c12565 2387 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2388 For other character sets, we don't bother to get this right. */
b7c12565
RS
2389 if (RE_TRANSLATE_P (translate) && start < 04400
2390 && !(start < 04200 && end >= 04377))
77d11aec 2391 {
b7c12565 2392 int newend;
77d11aec 2393 int tem;
b7c12565
RS
2394 newend = end;
2395 if (newend > 04377)
2396 newend = 04377;
2397 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2398 if (tem > 0)
2399 return tem;
2400
2401 start = 04400;
2402 if (end < 04400)
2403 return -1;
2404 }
2405#endif
2406
b7c12565
RS
2407 EXTEND_RANGE_TABLE (work_area, 2);
2408 work_area->table[work_area->used++] = (start);
2409 work_area->table[work_area->used++] = (end);
2410
2411 cmin = -1, cmax = -1;
77d11aec 2412
36595814 2413 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2414 {
2415 int ch;
77d11aec 2416
b7c12565
RS
2417 for (ch = start; ch <= end; ch++)
2418 {
2419 re_wchar_t c = TRANSLATE (ch);
2420 if (! (start <= c && c <= end))
2421 {
2422 if (cmin == -1)
2423 cmin = c, cmax = c;
2424 else
2425 {
2426 cmin = MIN (cmin, c);
2427 cmax = MAX (cmax, c);
2428 }
2429 }
2430 }
2431
2432 if (cmin != -1)
2433 {
2434 EXTEND_RANGE_TABLE (work_area, 2);
2435 work_area->table[work_area->used++] = (cmin);
2436 work_area->table[work_area->used++] = (cmax);
2437 }
2438 }
36595814 2439
77d11aec
RS
2440 return -1;
2441}
8f924df7 2442#endif /* 0 */
fa9a63c5
RM
2443\f
2444#ifndef MATCH_MAY_ALLOCATE
2445
2446/* If we cannot allocate large objects within re_match_2_internal,
2447 we make the fail stack and register vectors global.
2448 The fail stack, we grow to the maximum size when a regexp
2449 is compiled.
2450 The register vectors, we adjust in size each time we
2451 compile a regexp, according to the number of registers it needs. */
2452
2453static fail_stack_type fail_stack;
2454
2455/* Size with which the following vectors are currently allocated.
2456 That is so we can make them bigger as needed,
4bb91c68 2457 but never make them smaller. */
fa9a63c5
RM
2458static int regs_allocated_size;
2459
66f0296e
SM
2460static re_char ** regstart, ** regend;
2461static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2462
2463/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2464 but don't make them smaller. */
fa9a63c5
RM
2465
2466static
2467regex_grow_registers (num_regs)
2468 int num_regs;
2469{
2470 if (num_regs > regs_allocated_size)
2471 {
66f0296e
SM
2472 RETALLOC_IF (regstart, num_regs, re_char *);
2473 RETALLOC_IF (regend, num_regs, re_char *);
2474 RETALLOC_IF (best_regstart, num_regs, re_char *);
2475 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2476
2477 regs_allocated_size = num_regs;
2478 }
2479}
2480
2481#endif /* not MATCH_MAY_ALLOCATE */
2482\f
99633e97
SM
2483static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2484 compile_stack,
2485 regnum_t regnum));
2486
fa9a63c5
RM
2487/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2488 Returns one of error codes defined in `regex.h', or zero for success.
2489
2490 Assumes the `allocated' (and perhaps `buffer') and `translate'
2491 fields are set in BUFP on entry.
2492
2493 If it succeeds, results are put in BUFP (if it returns an error, the
2494 contents of BUFP are undefined):
2495 `buffer' is the compiled pattern;
2496 `syntax' is set to SYNTAX;
2497 `used' is set to the length of the compiled pattern;
2498 `fastmap_accurate' is zero;
2499 `re_nsub' is the number of subexpressions in PATTERN;
2500 `not_bol' and `not_eol' are zero;
5e69f11e 2501
c0f9ea08 2502 The `fastmap' field is neither examined nor set. */
fa9a63c5 2503
505bde11
SM
2504/* Insert the `jump' from the end of last alternative to "here".
2505 The space for the jump has already been allocated. */
2506#define FIXUP_ALT_JUMP() \
2507do { \
2508 if (fixup_alt_jump) \
2509 STORE_JUMP (jump, fixup_alt_jump, b); \
2510} while (0)
2511
2512
fa9a63c5
RM
2513/* Return, freeing storage we allocated. */
2514#define FREE_STACK_RETURN(value) \
b18215fc
RS
2515 do { \
2516 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2517 free (compile_stack.stack); \
2518 return value; \
2519 } while (0)
fa9a63c5
RM
2520
2521static reg_errcode_t
971de7fb 2522regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2523{
01618498
SM
2524 /* We fetch characters from PATTERN here. */
2525 register re_wchar_t c, c1;
5e69f11e 2526
fa9a63c5 2527 /* A random temporary spot in PATTERN. */
66f0296e 2528 re_char *p1;
fa9a63c5
RM
2529
2530 /* Points to the end of the buffer, where we should append. */
2531 register unsigned char *b;
5e69f11e 2532
fa9a63c5
RM
2533 /* Keeps track of unclosed groups. */
2534 compile_stack_type compile_stack;
2535
2536 /* Points to the current (ending) position in the pattern. */
22336245
RS
2537#ifdef AIX
2538 /* `const' makes AIX compiler fail. */
66f0296e 2539 unsigned char *p = pattern;
22336245 2540#else
66f0296e 2541 re_char *p = pattern;
22336245 2542#endif
66f0296e 2543 re_char *pend = pattern + size;
5e69f11e 2544
fa9a63c5 2545 /* How to translate the characters in the pattern. */
6676cb1c 2546 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2547
2548 /* Address of the count-byte of the most recently inserted `exactn'
2549 command. This makes it possible to tell if a new exact-match
2550 character can be added to that command or if the character requires
2551 a new `exactn' command. */
2552 unsigned char *pending_exact = 0;
2553
2554 /* Address of start of the most recently finished expression.
2555 This tells, e.g., postfix * where to find the start of its
2556 operand. Reset at the beginning of groups and alternatives. */
2557 unsigned char *laststart = 0;
2558
2559 /* Address of beginning of regexp, or inside of last group. */
2560 unsigned char *begalt;
2561
2562 /* Place in the uncompiled pattern (i.e., the {) to
2563 which to go back if the interval is invalid. */
66f0296e 2564 re_char *beg_interval;
5e69f11e 2565
fa9a63c5 2566 /* Address of the place where a forward jump should go to the end of
7814e705 2567 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2568 last -- ends with a forward jump of this sort. */
2569 unsigned char *fixup_alt_jump = 0;
2570
b18215fc
RS
2571 /* Work area for range table of charset. */
2572 struct range_table_work_area range_table_work;
2573
2d1675e4
SM
2574 /* If the object matched can contain multibyte characters. */
2575 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2576
8f924df7 2577 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2578 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2579
f9b0fd99
RS
2580 /* Nonzero if we have pushed down into a subpattern. */
2581 int in_subpattern = 0;
2582
2583 /* These hold the values of p, pattern, and pend from the main
2584 pattern when we have pushed into a subpattern. */
2585 re_char *main_p;
2586 re_char *main_pattern;
2587 re_char *main_pend;
2588
fa9a63c5 2589#ifdef DEBUG
99633e97 2590 debug++;
fa9a63c5 2591 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2592 if (debug > 0)
fa9a63c5
RM
2593 {
2594 unsigned debug_count;
5e69f11e 2595
fa9a63c5 2596 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2597 putchar (pattern[debug_count]);
fa9a63c5
RM
2598 putchar ('\n');
2599 }
2600#endif /* DEBUG */
2601
2602 /* Initialize the compile stack. */
2603 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2604 if (compile_stack.stack == NULL)
2605 return REG_ESPACE;
2606
2607 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2608 compile_stack.avail = 0;
2609
b18215fc
RS
2610 range_table_work.table = 0;
2611 range_table_work.allocated = 0;
2612
fa9a63c5
RM
2613 /* Initialize the pattern buffer. */
2614 bufp->syntax = syntax;
2615 bufp->fastmap_accurate = 0;
2616 bufp->not_bol = bufp->not_eol = 0;
6224b623 2617 bufp->used_syntax = 0;
fa9a63c5
RM
2618
2619 /* Set `used' to zero, so that if we return an error, the pattern
2620 printer (for debugging) will think there's no pattern. We reset it
2621 at the end. */
2622 bufp->used = 0;
5e69f11e 2623
fa9a63c5 2624 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2625 bufp->re_nsub = 0;
fa9a63c5 2626
0b32bf0e 2627#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2628 /* Initialize the syntax table. */
2629 init_syntax_once ();
2630#endif
2631
2632 if (bufp->allocated == 0)
2633 {
2634 if (bufp->buffer)
2635 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2636 enough space. This loses if buffer's address is bogus, but
7814e705 2637 that is the user's responsibility. */
25fe55af
RS
2638 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2639 }
fa9a63c5 2640 else
7814e705 2641 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2642 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2643 }
fa9a63c5
RM
2644 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2645
2646 bufp->allocated = INIT_BUF_SIZE;
2647 }
2648
2649 begalt = b = bufp->buffer;
2650
2651 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2652 while (1)
fa9a63c5 2653 {
f9b0fd99
RS
2654 if (p == pend)
2655 {
2656 /* If this is the end of an included regexp,
2657 pop back to the main regexp and try again. */
2658 if (in_subpattern)
2659 {
2660 in_subpattern = 0;
2661 pattern = main_pattern;
2662 p = main_p;
2663 pend = main_pend;
2664 continue;
2665 }
2666 /* If this is the end of the main regexp, we are done. */
2667 break;
2668 }
2669
fa9a63c5
RM
2670 PATFETCH (c);
2671
2672 switch (c)
25fe55af 2673 {
f9b0fd99
RS
2674 case ' ':
2675 {
2676 re_char *p1 = p;
2677
2678 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2679 spaces normally. And don't try to do this recursively. */
2680 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2681 goto normal_char;
2682
2683 /* Peek past following spaces. */
2684 while (p1 != pend)
2685 {
2686 if (*p1 != ' ')
2687 break;
2688 p1++;
2689 }
2690 /* If the spaces are followed by a repetition op,
2691 treat them normally. */
c721eee5
RS
2692 if (p1 != pend
2693 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2694 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2695 goto normal_char;
2696
2697 /* Replace the spaces with the whitespace regexp. */
2698 in_subpattern = 1;
2699 main_p = p1;
2700 main_pend = pend;
2701 main_pattern = pattern;
2702 p = pattern = whitespace_regexp;
5b0534c8 2703 pend = p + strlen ((const char *) p);
f9b0fd99 2704 break;
7814e705 2705 }
f9b0fd99 2706
25fe55af
RS
2707 case '^':
2708 {
7814e705 2709 if ( /* If at start of pattern, it's an operator. */
25fe55af 2710 p == pattern + 1
7814e705 2711 /* If context independent, it's an operator. */
25fe55af 2712 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2713 /* Otherwise, depends on what's come before. */
25fe55af 2714 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2715 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2716 else
2717 goto normal_char;
2718 }
2719 break;
2720
2721
2722 case '$':
2723 {
2724 if ( /* If at end of pattern, it's an operator. */
2725 p == pend
7814e705 2726 /* If context independent, it's an operator. */
25fe55af
RS
2727 || syntax & RE_CONTEXT_INDEP_ANCHORS
2728 /* Otherwise, depends on what's next. */
2729 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2730 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2731 else
2732 goto normal_char;
2733 }
2734 break;
fa9a63c5
RM
2735
2736
2737 case '+':
25fe55af
RS
2738 case '?':
2739 if ((syntax & RE_BK_PLUS_QM)
2740 || (syntax & RE_LIMITED_OPS))
2741 goto normal_char;
2742 handle_plus:
2743 case '*':
2744 /* If there is no previous pattern... */
2745 if (!laststart)
2746 {
2747 if (syntax & RE_CONTEXT_INVALID_OPS)
2748 FREE_STACK_RETURN (REG_BADRPT);
2749 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2750 goto normal_char;
2751 }
2752
2753 {
7814e705 2754 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2755 boolean zero_times_ok = 0, many_times_ok = 0;
2756 boolean greedy = 1;
25fe55af
RS
2757
2758 /* If there is a sequence of repetition chars, collapse it
2759 down to just one (the right one). We can't combine
2760 interval operators with these because of, e.g., `a{2}*',
7814e705 2761 which should only match an even number of `a's. */
25fe55af
RS
2762
2763 for (;;)
2764 {
0b32bf0e 2765 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2766 && c == '?' && (zero_times_ok || many_times_ok))
2767 greedy = 0;
2768 else
2769 {
2770 zero_times_ok |= c != '+';
2771 many_times_ok |= c != '?';
2772 }
25fe55af
RS
2773
2774 if (p == pend)
2775 break;
ed0767d8
SM
2776 else if (*p == '*'
2777 || (!(syntax & RE_BK_PLUS_QM)
2778 && (*p == '+' || *p == '?')))
25fe55af 2779 ;
ed0767d8 2780 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2781 {
ed0767d8
SM
2782 if (p+1 == pend)
2783 FREE_STACK_RETURN (REG_EESCAPE);
2784 if (p[1] == '+' || p[1] == '?')
2785 PATFETCH (c); /* Gobble up the backslash. */
2786 else
2787 break;
25fe55af
RS
2788 }
2789 else
ed0767d8 2790 break;
25fe55af 2791 /* If we get here, we found another repeat character. */
ed0767d8
SM
2792 PATFETCH (c);
2793 }
25fe55af
RS
2794
2795 /* Star, etc. applied to an empty pattern is equivalent
2796 to an empty pattern. */
4e8a9132 2797 if (!laststart || laststart == b)
25fe55af
RS
2798 break;
2799
2800 /* Now we know whether or not zero matches is allowed
7814e705 2801 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2802 if (greedy)
2803 {
99633e97 2804 if (many_times_ok)
4e8a9132
SM
2805 {
2806 boolean simple = skip_one_char (laststart) == b;
2807 unsigned int startoffset = 0;
f6a3f532 2808 re_opcode_t ofj =
01618498 2809 /* Check if the loop can match the empty string. */
6df42991
SM
2810 (simple || !analyse_first (laststart, b, NULL, 0))
2811 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2812 assert (skip_one_char (laststart) <= b);
177c0ea7 2813
4e8a9132
SM
2814 if (!zero_times_ok && simple)
2815 { /* Since simple * loops can be made faster by using
2816 on_failure_keep_string_jump, we turn simple P+
2817 into PP* if P is simple. */
2818 unsigned char *p1, *p2;
2819 startoffset = b - laststart;
2820 GET_BUFFER_SPACE (startoffset);
2821 p1 = b; p2 = laststart;
2822 while (p2 < p1)
2823 *b++ = *p2++;
2824 zero_times_ok = 1;
99633e97 2825 }
4e8a9132
SM
2826
2827 GET_BUFFER_SPACE (6);
2828 if (!zero_times_ok)
2829 /* A + loop. */
f6a3f532 2830 STORE_JUMP (ofj, b, b + 6);
99633e97 2831 else
4e8a9132
SM
2832 /* Simple * loops can use on_failure_keep_string_jump
2833 depending on what follows. But since we don't know
2834 that yet, we leave the decision up to
2835 on_failure_jump_smart. */
f6a3f532 2836 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2837 laststart + startoffset, b + 6);
99633e97 2838 b += 3;
4e8a9132 2839 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2840 b += 3;
2841 }
2842 else
2843 {
4e8a9132
SM
2844 /* A simple ? pattern. */
2845 assert (zero_times_ok);
2846 GET_BUFFER_SPACE (3);
2847 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2848 b += 3;
2849 }
1c8c6d39
DL
2850 }
2851 else /* not greedy */
2852 { /* I wish the greedy and non-greedy cases could be merged. */
2853
0683b6fa 2854 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2855 if (many_times_ok)
2856 {
f6a3f532
SM
2857 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2858
6df42991
SM
2859 /* The non-greedy multiple match looks like
2860 a repeat..until: we only need a conditional jump
2861 at the end of the loop. */
f6a3f532
SM
2862 if (emptyp) BUF_PUSH (no_op);
2863 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2864 : on_failure_jump, b, laststart);
1c8c6d39
DL
2865 b += 3;
2866 if (zero_times_ok)
2867 {
2868 /* The repeat...until naturally matches one or more.
2869 To also match zero times, we need to first jump to
6df42991 2870 the end of the loop (its conditional jump). */
1c8c6d39
DL
2871 INSERT_JUMP (jump, laststart, b);
2872 b += 3;
2873 }
2874 }
2875 else
2876 {
2877 /* non-greedy a?? */
1c8c6d39
DL
2878 INSERT_JUMP (jump, laststart, b + 3);
2879 b += 3;
2880 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2881 b += 3;
2882 }
2883 }
2884 }
4e8a9132 2885 pending_exact = 0;
fa9a63c5
RM
2886 break;
2887
2888
2889 case '.':
25fe55af
RS
2890 laststart = b;
2891 BUF_PUSH (anychar);
2892 break;
fa9a63c5
RM
2893
2894
25fe55af
RS
2895 case '[':
2896 {
b18215fc 2897 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2898
25fe55af 2899 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2900
25fe55af
RS
2901 /* Ensure that we have enough space to push a charset: the
2902 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2903 GET_BUFFER_SPACE (34);
2904
25fe55af 2905 laststart = b;
e318085a 2906
25fe55af 2907 /* We test `*p == '^' twice, instead of using an if
7814e705 2908 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2909 BUF_PUSH (*p == '^' ? charset_not : charset);
2910 if (*p == '^')
2911 p++;
e318085a 2912
25fe55af
RS
2913 /* Remember the first position in the bracket expression. */
2914 p1 = p;
e318085a 2915
7814e705 2916 /* Push the number of bytes in the bitmap. */
25fe55af 2917 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2918
25fe55af 2919 /* Clear the whole map. */
72af86bd 2920 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2921
25fe55af
RS
2922 /* charset_not matches newline according to a syntax bit. */
2923 if ((re_opcode_t) b[-2] == charset_not
2924 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2925 SET_LIST_BIT ('\n');
fa9a63c5 2926
7814e705 2927 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2928 for (;;)
2929 {
b18215fc 2930 boolean escaped_char = false;
2d1675e4 2931 const unsigned char *p2 = p;
cf9c99bc 2932 re_wchar_t ch, c2;
e318085a 2933
25fe55af 2934 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2935
36595814
SM
2936 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2937 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2938 So the translation is done later in a loop. Example:
2939 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2940 PATFETCH (c);
e318085a 2941
25fe55af
RS
2942 /* \ might escape characters inside [...] and [^...]. */
2943 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2944 {
2945 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2946
2947 PATFETCH (c);
b18215fc 2948 escaped_char = true;
25fe55af 2949 }
b18215fc
RS
2950 else
2951 {
7814e705 2952 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2953 not (i.e., when the bracket expression is `[]' so
2954 far), the ']' character bit gets set way below. */
2d1675e4 2955 if (c == ']' && p2 != p1)
657fcfbd 2956 break;
25fe55af 2957 }
b18215fc 2958
25fe55af
RS
2959 /* See if we're at the beginning of a possible character
2960 class. */
b18215fc 2961
2d1675e4
SM
2962 if (!escaped_char &&
2963 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2964 {
7814e705 2965 /* Leave room for the null. */
14473664 2966 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2967 const unsigned char *class_beg;
b18215fc 2968
25fe55af
RS
2969 PATFETCH (c);
2970 c1 = 0;
ed0767d8 2971 class_beg = p;
b18215fc 2972
25fe55af
RS
2973 /* If pattern is `[[:'. */
2974 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2975
25fe55af
RS
2976 for (;;)
2977 {
14473664
SM
2978 PATFETCH (c);
2979 if ((c == ':' && *p == ']') || p == pend)
2980 break;
2981 if (c1 < CHAR_CLASS_MAX_LENGTH)
2982 str[c1++] = c;
2983 else
2984 /* This is in any case an invalid class name. */
2985 str[0] = '\0';
25fe55af
RS
2986 }
2987 str[c1] = '\0';
b18215fc
RS
2988
2989 /* If it isn't a word bracketed by `[:' and `:]':
2990 undo the ending character, the letters, and
2991 leave the leading `:' and `[' (but set bits for
2992 them). */
25fe55af
RS
2993 if (c == ':' && *p == ']')
2994 {
14473664 2995 re_wctype_t cc;
8f924df7 2996 int limit;
14473664
SM
2997
2998 cc = re_wctype (str);
2999
3000 if (cc == 0)
fa9a63c5
RM
3001 FREE_STACK_RETURN (REG_ECTYPE);
3002
14473664
SM
3003 /* Throw away the ] at the end of the character
3004 class. */
3005 PATFETCH (c);
fa9a63c5 3006
14473664 3007 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 3008
cf9c99bc
KH
3009#ifndef emacs
3010 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
3011 if (re_iswctype (btowc (ch), cc))
3012 {
3013 c = TRANSLATE (ch);
ed00c2ac
KH
3014 if (c < (1 << BYTEWIDTH))
3015 SET_LIST_BIT (c);
8f924df7 3016 }
cf9c99bc
KH
3017#else /* emacs */
3018 /* Most character classes in a multibyte match
3019 just set a flag. Exceptions are is_blank,
3020 is_digit, is_cntrl, and is_xdigit, since
3021 they can only match ASCII characters. We
3022 don't need to handle them for multibyte.
3023 They are distinguished by a negative wctype. */
96cc36cc 3024
254c06a8
SM
3025 /* Setup the gl_state object to its buffer-defined
3026 value. This hardcodes the buffer-global
3027 syntax-table for ASCII chars, while the other chars
3028 will obey syntax-table properties. It's not ideal,
3029 but it's the way it's been done until now. */
d48cd3f4 3030 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 3031
cf9c99bc 3032 for (ch = 0; ch < 256; ++ch)
25fe55af 3033 {
cf9c99bc
KH
3034 c = RE_CHAR_TO_MULTIBYTE (ch);
3035 if (! CHAR_BYTE8_P (c)
3036 && re_iswctype (c, cc))
8f924df7 3037 {
cf9c99bc
KH
3038 SET_LIST_BIT (ch);
3039 c1 = TRANSLATE (c);
3040 if (c1 == c)
3041 continue;
3042 if (ASCII_CHAR_P (c1))
3043 SET_LIST_BIT (c1);
3044 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3045 SET_LIST_BIT (c1);
8f924df7 3046 }
25fe55af 3047 }
cf9c99bc
KH
3048 SET_RANGE_TABLE_WORK_AREA_BIT
3049 (range_table_work, re_wctype_to_bit (cc));
3050#endif /* emacs */
6224b623
SM
3051 /* In most cases the matching rule for char classes
3052 only uses the syntax table for multibyte chars,
3053 so that the content of the syntax-table is not
3054 hardcoded in the range_table. SPACE and WORD are
3055 the two exceptions. */
3056 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3057 bufp->used_syntax = 1;
3058
b18215fc
RS
3059 /* Repeat the loop. */
3060 continue;
25fe55af
RS
3061 }
3062 else
3063 {
ed0767d8
SM
3064 /* Go back to right after the "[:". */
3065 p = class_beg;
25fe55af 3066 SET_LIST_BIT ('[');
b18215fc
RS
3067
3068 /* Because the `:' may starts the range, we
3069 can't simply set bit and repeat the loop.
7814e705 3070 Instead, just set it to C and handle below. */
b18215fc 3071 c = ':';
25fe55af
RS
3072 }
3073 }
b18215fc
RS
3074
3075 if (p < pend && p[0] == '-' && p[1] != ']')
3076 {
3077
3078 /* Discard the `-'. */
3079 PATFETCH (c1);
3080
3081 /* Fetch the character which ends the range. */
3082 PATFETCH (c1);
cf9c99bc
KH
3083#ifdef emacs
3084 if (CHAR_BYTE8_P (c1)
3085 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3086 /* Treat the range from a multibyte character to
3087 raw-byte character as empty. */
3088 c = c1 + 1;
3089#endif /* emacs */
e318085a 3090 }
25fe55af 3091 else
b18215fc
RS
3092 /* Range from C to C. */
3093 c1 = c;
3094
cf9c99bc 3095 if (c > c1)
25fe55af 3096 {
cf9c99bc
KH
3097 if (syntax & RE_NO_EMPTY_RANGES)
3098 FREE_STACK_RETURN (REG_ERANGEX);
3099 /* Else, repeat the loop. */
bf216479 3100 }
6fdd04b0 3101 else
25fe55af 3102 {
cf9c99bc
KH
3103#ifndef emacs
3104 /* Set the range into bitmap */
8f924df7 3105 for (; c <= c1; c++)
b18215fc 3106 {
cf9c99bc
KH
3107 ch = TRANSLATE (c);
3108 if (ch < (1 << BYTEWIDTH))
3109 SET_LIST_BIT (ch);
3110 }
3111#else /* emacs */
3112 if (c < 128)
3113 {
3114 ch = MIN (127, c1);
3115 SETUP_ASCII_RANGE (range_table_work, c, ch);
3116 c = ch + 1;
3117 if (CHAR_BYTE8_P (c1))
3118 c = BYTE8_TO_CHAR (128);
3119 }
3120 if (c <= c1)
3121 {
3122 if (CHAR_BYTE8_P (c))
3123 {
3124 c = CHAR_TO_BYTE8 (c);
3125 c1 = CHAR_TO_BYTE8 (c1);
3126 for (; c <= c1; c++)
3127 SET_LIST_BIT (c);
3128 }
3129 else if (multibyte)
3130 {
3131 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3132 }
3133 else
3134 {
3135 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3136 }
e934739e 3137 }
cf9c99bc 3138#endif /* emacs */
25fe55af 3139 }
e318085a
RS
3140 }
3141
25fe55af 3142 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3143 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3144 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3145 b[-1]--;
3146 b += b[-1];
fa9a63c5 3147
96cc36cc
RS
3148 /* Build real range table from work area. */
3149 if (RANGE_TABLE_WORK_USED (range_table_work)
3150 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3151 {
3152 int i;
3153 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3154
b18215fc 3155 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3156 bytes for flags, two for COUNT, and three bytes for
3157 each character. */
3158 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3159
b18215fc
RS
3160 /* Indicate the existence of range table. */
3161 laststart[1] |= 0x80;
fa9a63c5 3162
96cc36cc
RS
3163 /* Store the character class flag bits into the range table.
3164 If not in emacs, these flag bits are always 0. */
3165 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3166 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3167
b18215fc
RS
3168 STORE_NUMBER_AND_INCR (b, used / 2);
3169 for (i = 0; i < used; i++)
3170 STORE_CHARACTER_AND_INCR
3171 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3172 }
25fe55af
RS
3173 }
3174 break;
fa9a63c5
RM
3175
3176
b18215fc 3177 case '(':
25fe55af
RS
3178 if (syntax & RE_NO_BK_PARENS)
3179 goto handle_open;
3180 else
3181 goto normal_char;
fa9a63c5
RM
3182
3183
25fe55af
RS
3184 case ')':
3185 if (syntax & RE_NO_BK_PARENS)
3186 goto handle_close;
3187 else
3188 goto normal_char;
e318085a
RS
3189
3190
25fe55af
RS
3191 case '\n':
3192 if (syntax & RE_NEWLINE_ALT)
3193 goto handle_alt;
3194 else
3195 goto normal_char;
e318085a
RS
3196
3197
b18215fc 3198 case '|':
25fe55af
RS
3199 if (syntax & RE_NO_BK_VBAR)
3200 goto handle_alt;
3201 else
3202 goto normal_char;
3203
3204
3205 case '{':
3206 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3207 goto handle_interval;
3208 else
3209 goto normal_char;
3210
3211
3212 case '\\':
3213 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3214
3215 /* Do not translate the character after the \, so that we can
3216 distinguish, e.g., \B from \b, even if we normally would
3217 translate, e.g., B to b. */
36595814 3218 PATFETCH (c);
25fe55af
RS
3219
3220 switch (c)
3221 {
3222 case '(':
3223 if (syntax & RE_NO_BK_PARENS)
3224 goto normal_backslash;
3225
3226 handle_open:
505bde11
SM
3227 {
3228 int shy = 0;
c69b0314 3229 regnum_t regnum = 0;
505bde11
SM
3230 if (p+1 < pend)
3231 {
3232 /* Look for a special (?...) construct */
ed0767d8 3233 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3234 {
ed0767d8 3235 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3236 while (!shy)
505bde11 3237 {
c69b0314
SM
3238 PATFETCH (c);
3239 switch (c)
3240 {
3241 case ':': shy = 1; break;
3242 case '0':
3243 /* An explicitly specified regnum must start
3244 with non-0. */
3245 if (regnum == 0)
3246 FREE_STACK_RETURN (REG_BADPAT);
3247 case '1': case '2': case '3': case '4':
3248 case '5': case '6': case '7': case '8': case '9':
3249 regnum = 10*regnum + (c - '0'); break;
3250 default:
3251 /* Only (?:...) and (?NUM:...) are supported right now. */
3252 FREE_STACK_RETURN (REG_BADPAT);
3253 }
505bde11
SM
3254 }
3255 }
505bde11
SM
3256 }
3257
3258 if (!shy)
c69b0314
SM
3259 regnum = ++bufp->re_nsub;
3260 else if (regnum)
3261 { /* It's actually not shy, but explicitly numbered. */
3262 shy = 0;
3263 if (regnum > bufp->re_nsub)
3264 bufp->re_nsub = regnum;
3265 else if (regnum > bufp->re_nsub
3266 /* Ideally, we'd want to check that the specified
3267 group can't have matched (i.e. all subgroups
3268 using the same regnum are in other branches of
3269 OR patterns), but we don't currently keep track
3270 of enough info to do that easily. */
3271 || group_in_compile_stack (compile_stack, regnum))
3272 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3273 }
c69b0314
SM
3274 else
3275 /* It's really shy. */
3276 regnum = - bufp->re_nsub;
25fe55af 3277
99633e97
SM
3278 if (COMPILE_STACK_FULL)
3279 {
3280 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3281 compile_stack_elt_t);
3282 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3283
99633e97
SM
3284 compile_stack.size <<= 1;
3285 }
25fe55af 3286
99633e97 3287 /* These are the values to restore when we hit end of this
7814e705 3288 group. They are all relative offsets, so that if the
99633e97
SM
3289 whole pattern moves because of realloc, they will still
3290 be valid. */
3291 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3292 COMPILE_STACK_TOP.fixup_alt_jump
3293 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3294 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3295 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3296
c69b0314
SM
3297 /* Do not push a start_memory for groups beyond the last one
3298 we can represent in the compiled pattern. */
3299 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3300 BUF_PUSH_2 (start_memory, regnum);
3301
3302 compile_stack.avail++;
3303
3304 fixup_alt_jump = 0;
3305 laststart = 0;
3306 begalt = b;
3307 /* If we've reached MAX_REGNUM groups, then this open
3308 won't actually generate any code, so we'll have to
3309 clear pending_exact explicitly. */
3310 pending_exact = 0;
3311 break;
505bde11 3312 }
25fe55af
RS
3313
3314 case ')':
3315 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3316
3317 if (COMPILE_STACK_EMPTY)
505bde11
SM
3318 {
3319 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3320 goto normal_backslash;
3321 else
3322 FREE_STACK_RETURN (REG_ERPAREN);
3323 }
25fe55af
RS
3324
3325 handle_close:
505bde11 3326 FIXUP_ALT_JUMP ();
25fe55af
RS
3327
3328 /* See similar code for backslashed left paren above. */
3329 if (COMPILE_STACK_EMPTY)
505bde11
SM
3330 {
3331 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3332 goto normal_char;
3333 else
3334 FREE_STACK_RETURN (REG_ERPAREN);
3335 }
25fe55af
RS
3336
3337 /* Since we just checked for an empty stack above, this
3338 ``can't happen''. */
3339 assert (compile_stack.avail != 0);
3340 {
3341 /* We don't just want to restore into `regnum', because
3342 later groups should continue to be numbered higher,
7814e705 3343 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3344 regnum_t regnum;
25fe55af
RS
3345
3346 compile_stack.avail--;
3347 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3348 fixup_alt_jump
3349 = COMPILE_STACK_TOP.fixup_alt_jump
3350 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3351 : 0;
3352 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3353 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3354 /* If we've reached MAX_REGNUM groups, then this open
3355 won't actually generate any code, so we'll have to
3356 clear pending_exact explicitly. */
3357 pending_exact = 0;
e318085a 3358
25fe55af 3359 /* We're at the end of the group, so now we know how many
7814e705 3360 groups were inside this one. */
c69b0314
SM
3361 if (regnum <= MAX_REGNUM && regnum > 0)
3362 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3363 }
3364 break;
3365
3366
3367 case '|': /* `\|'. */
3368 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3369 goto normal_backslash;
3370 handle_alt:
3371 if (syntax & RE_LIMITED_OPS)
3372 goto normal_char;
3373
3374 /* Insert before the previous alternative a jump which
7814e705 3375 jumps to this alternative if the former fails. */
25fe55af
RS
3376 GET_BUFFER_SPACE (3);
3377 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3378 pending_exact = 0;
3379 b += 3;
3380
3381 /* The alternative before this one has a jump after it
3382 which gets executed if it gets matched. Adjust that
3383 jump so it will jump to this alternative's analogous
3384 jump (put in below, which in turn will jump to the next
3385 (if any) alternative's such jump, etc.). The last such
3386 jump jumps to the correct final destination. A picture:
3387 _____ _____
3388 | | | |
3389 | v | v
3390 a | b | c
3391
3392 If we are at `b', then fixup_alt_jump right now points to a
3393 three-byte space after `a'. We'll put in the jump, set
3394 fixup_alt_jump to right after `b', and leave behind three
3395 bytes which we'll fill in when we get to after `c'. */
3396
505bde11 3397 FIXUP_ALT_JUMP ();
25fe55af
RS
3398
3399 /* Mark and leave space for a jump after this alternative,
3400 to be filled in later either by next alternative or
3401 when we know we're at the end of a series of alternatives. */
3402 fixup_alt_jump = b;
3403 GET_BUFFER_SPACE (3);
3404 b += 3;
3405
3406 laststart = 0;
3407 begalt = b;
3408 break;
3409
3410
3411 case '{':
3412 /* If \{ is a literal. */
3413 if (!(syntax & RE_INTERVALS)
3414 /* If we're at `\{' and it's not the open-interval
3415 operator. */
4bb91c68 3416 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3417 goto normal_backslash;
3418
3419 handle_interval:
3420 {
3421 /* If got here, then the syntax allows intervals. */
3422
3423 /* At least (most) this many matches must be made. */
99633e97 3424 int lower_bound = 0, upper_bound = -1;
25fe55af 3425
ed0767d8 3426 beg_interval = p;
25fe55af 3427
25fe55af
RS
3428 GET_UNSIGNED_NUMBER (lower_bound);
3429
3430 if (c == ',')
ed0767d8 3431 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3432 else
3433 /* Interval such as `{1}' => match exactly once. */
3434 upper_bound = lower_bound;
3435
3436 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3437 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3438 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3439
3440 if (!(syntax & RE_NO_BK_BRACES))
3441 {
4bb91c68
SM
3442 if (c != '\\')
3443 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3444 if (p == pend)
3445 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3446 PATFETCH (c);
3447 }
3448
3449 if (c != '}')
4bb91c68 3450 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3451
3452 /* We just parsed a valid interval. */
3453
3454 /* If it's invalid to have no preceding re. */
3455 if (!laststart)
3456 {
3457 if (syntax & RE_CONTEXT_INVALID_OPS)
3458 FREE_STACK_RETURN (REG_BADRPT);
3459 else if (syntax & RE_CONTEXT_INDEP_OPS)
3460 laststart = b;
3461 else
3462 goto unfetch_interval;
3463 }
3464
6df42991
SM
3465 if (upper_bound == 0)
3466 /* If the upper bound is zero, just drop the sub pattern
3467 altogether. */
3468 b = laststart;
3469 else if (lower_bound == 1 && upper_bound == 1)
3470 /* Just match it once: nothing to do here. */
3471 ;
3472
3473 /* Otherwise, we have a nontrivial interval. When
3474 we're all done, the pattern will look like:
3475 set_number_at <jump count> <upper bound>
3476 set_number_at <succeed_n count> <lower bound>
3477 succeed_n <after jump addr> <succeed_n count>
3478 <body of loop>
3479 jump_n <succeed_n addr> <jump count>
3480 (The upper bound and `jump_n' are omitted if
3481 `upper_bound' is 1, though.) */
3482 else
3483 { /* If the upper bound is > 1, we need to insert
3484 more at the end of the loop. */
3485 unsigned int nbytes = (upper_bound < 0 ? 3
3486 : upper_bound > 1 ? 5 : 0);
3487 unsigned int startoffset = 0;
3488
3489 GET_BUFFER_SPACE (20); /* We might use less. */
3490
3491 if (lower_bound == 0)
3492 {
3493 /* A succeed_n that starts with 0 is really a
3494 simple on_failure_jump_loop. */
3495 INSERT_JUMP (on_failure_jump_loop, laststart,
3496 b + 3 + nbytes);
3497 b += 3;
3498 }
3499 else
3500 {
3501 /* Initialize lower bound of the `succeed_n', even
3502 though it will be set during matching by its
3503 attendant `set_number_at' (inserted next),
3504 because `re_compile_fastmap' needs to know.
3505 Jump to the `jump_n' we might insert below. */
3506 INSERT_JUMP2 (succeed_n, laststart,
3507 b + 5 + nbytes,
3508 lower_bound);
3509 b += 5;
3510
3511 /* Code to initialize the lower bound. Insert
7814e705 3512 before the `succeed_n'. The `5' is the last two
6df42991
SM
3513 bytes of this `set_number_at', plus 3 bytes of
3514 the following `succeed_n'. */
3515 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3516 b += 5;
3517 startoffset += 5;
3518 }
3519
3520 if (upper_bound < 0)
3521 {
3522 /* A negative upper bound stands for infinity,
3523 in which case it degenerates to a plain jump. */
3524 STORE_JUMP (jump, b, laststart + startoffset);
3525 b += 3;
3526 }
3527 else if (upper_bound > 1)
3528 { /* More than one repetition is allowed, so
3529 append a backward jump to the `succeed_n'
3530 that starts this interval.
3531
3532 When we've reached this during matching,
3533 we'll have matched the interval once, so
3534 jump back only `upper_bound - 1' times. */
3535 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3536 upper_bound - 1);
3537 b += 5;
3538
3539 /* The location we want to set is the second
3540 parameter of the `jump_n'; that is `b-2' as
3541 an absolute address. `laststart' will be
3542 the `set_number_at' we're about to insert;
3543 `laststart+3' the number to set, the source
3544 for the relative address. But we are
3545 inserting into the middle of the pattern --
3546 so everything is getting moved up by 5.
3547 Conclusion: (b - 2) - (laststart + 3) + 5,
3548 i.e., b - laststart.
3549
3550 We insert this at the beginning of the loop
3551 so that if we fail during matching, we'll
3552 reinitialize the bounds. */
3553 insert_op2 (set_number_at, laststart, b - laststart,
3554 upper_bound - 1, b);
3555 b += 5;
3556 }
3557 }
25fe55af
RS
3558 pending_exact = 0;
3559 beg_interval = NULL;
3560 }
3561 break;
3562
3563 unfetch_interval:
3564 /* If an invalid interval, match the characters as literals. */
3565 assert (beg_interval);
3566 p = beg_interval;
3567 beg_interval = NULL;
3568
3569 /* normal_char and normal_backslash need `c'. */
ed0767d8 3570 c = '{';
25fe55af
RS
3571
3572 if (!(syntax & RE_NO_BK_BRACES))
3573 {
ed0767d8
SM
3574 assert (p > pattern && p[-1] == '\\');
3575 goto normal_backslash;
25fe55af 3576 }
ed0767d8
SM
3577 else
3578 goto normal_char;
e318085a 3579
b18215fc 3580#ifdef emacs
25fe55af 3581 /* There is no way to specify the before_dot and after_dot
7814e705 3582 operators. rms says this is ok. --karl */
25fe55af
RS
3583 case '=':
3584 BUF_PUSH (at_dot);
3585 break;
3586
3587 case 's':
3588 laststart = b;
3589 PATFETCH (c);
3590 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3591 break;
3592
3593 case 'S':
3594 laststart = b;
3595 PATFETCH (c);
3596 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3597 break;
b18215fc
RS
3598
3599 case 'c':
3600 laststart = b;
36595814 3601 PATFETCH (c);
b18215fc
RS
3602 BUF_PUSH_2 (categoryspec, c);
3603 break;
e318085a 3604
b18215fc
RS
3605 case 'C':
3606 laststart = b;
36595814 3607 PATFETCH (c);
b18215fc
RS
3608 BUF_PUSH_2 (notcategoryspec, c);
3609 break;
3610#endif /* emacs */
e318085a 3611
e318085a 3612
25fe55af 3613 case 'w':
4bb91c68
SM
3614 if (syntax & RE_NO_GNU_OPS)
3615 goto normal_char;
25fe55af 3616 laststart = b;
1fb352e0 3617 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3618 break;
e318085a 3619
e318085a 3620
25fe55af 3621 case 'W':
4bb91c68
SM
3622 if (syntax & RE_NO_GNU_OPS)
3623 goto normal_char;
25fe55af 3624 laststart = b;
1fb352e0 3625 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3626 break;
e318085a
RS
3627
3628
25fe55af 3629 case '<':
4bb91c68
SM
3630 if (syntax & RE_NO_GNU_OPS)
3631 goto normal_char;
25fe55af
RS
3632 BUF_PUSH (wordbeg);
3633 break;
e318085a 3634
25fe55af 3635 case '>':
4bb91c68
SM
3636 if (syntax & RE_NO_GNU_OPS)
3637 goto normal_char;
25fe55af
RS
3638 BUF_PUSH (wordend);
3639 break;
e318085a 3640
669fa600
SM
3641 case '_':
3642 if (syntax & RE_NO_GNU_OPS)
3643 goto normal_char;
3644 laststart = b;
3645 PATFETCH (c);
3646 if (c == '<')
3647 BUF_PUSH (symbeg);
3648 else if (c == '>')
3649 BUF_PUSH (symend);
3650 else
3651 FREE_STACK_RETURN (REG_BADPAT);
3652 break;
3653
25fe55af 3654 case 'b':
4bb91c68
SM
3655 if (syntax & RE_NO_GNU_OPS)
3656 goto normal_char;
25fe55af
RS
3657 BUF_PUSH (wordbound);
3658 break;
e318085a 3659
25fe55af 3660 case 'B':
4bb91c68
SM
3661 if (syntax & RE_NO_GNU_OPS)
3662 goto normal_char;
25fe55af
RS
3663 BUF_PUSH (notwordbound);
3664 break;
fa9a63c5 3665
25fe55af 3666 case '`':
4bb91c68
SM
3667 if (syntax & RE_NO_GNU_OPS)
3668 goto normal_char;
25fe55af
RS
3669 BUF_PUSH (begbuf);
3670 break;
e318085a 3671
25fe55af 3672 case '\'':
4bb91c68
SM
3673 if (syntax & RE_NO_GNU_OPS)
3674 goto normal_char;
25fe55af
RS
3675 BUF_PUSH (endbuf);
3676 break;
e318085a 3677
25fe55af
RS
3678 case '1': case '2': case '3': case '4': case '5':
3679 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3680 {
3681 regnum_t reg;
e318085a 3682
0cdd06f8
SM
3683 if (syntax & RE_NO_BK_REFS)
3684 goto normal_backslash;
e318085a 3685
0cdd06f8 3686 reg = c - '0';
e318085a 3687
c69b0314
SM
3688 if (reg > bufp->re_nsub || reg < 1
3689 /* Can't back reference to a subexp before its end. */
3690 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3691 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3692
0cdd06f8
SM
3693 laststart = b;
3694 BUF_PUSH_2 (duplicate, reg);
3695 }
25fe55af 3696 break;
e318085a 3697
e318085a 3698
25fe55af
RS
3699 case '+':
3700 case '?':
3701 if (syntax & RE_BK_PLUS_QM)
3702 goto handle_plus;
3703 else
3704 goto normal_backslash;
3705
3706 default:
3707 normal_backslash:
3708 /* You might think it would be useful for \ to mean
3709 not to translate; but if we don't translate it
4bb91c68 3710 it will never match anything. */
25fe55af
RS
3711 goto normal_char;
3712 }
3713 break;
fa9a63c5
RM
3714
3715
3716 default:
25fe55af 3717 /* Expects the character in `c'. */
fa9a63c5 3718 normal_char:
36595814 3719 /* If no exactn currently being built. */
25fe55af 3720 if (!pending_exact
fa9a63c5 3721
25fe55af
RS
3722 /* If last exactn not at current position. */
3723 || pending_exact + *pending_exact + 1 != b
5e69f11e 3724
25fe55af 3725 /* We have only one byte following the exactn for the count. */
2d1675e4 3726 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3727
7814e705 3728 /* If followed by a repetition operator. */
9d99031f 3729 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3730 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3731 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3732 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3733 || ((syntax & RE_INTERVALS)
25fe55af 3734 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3735 ? p != pend && *p == '{'
3736 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3737 {
3738 /* Start building a new exactn. */
5e69f11e 3739
25fe55af 3740 laststart = b;
fa9a63c5
RM
3741
3742 BUF_PUSH_2 (exactn, 0);
3743 pending_exact = b - 1;
25fe55af 3744 }
5e69f11e 3745
2d1675e4
SM
3746 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3747 {
e0277a47
KH
3748 int len;
3749
cf9c99bc 3750 if (multibyte)
6fdd04b0 3751 {
cf9c99bc 3752 c = TRANSLATE (c);
6fdd04b0
KH
3753 len = CHAR_STRING (c, b);
3754 b += len;
3755 }
e0277a47 3756 else
6fdd04b0 3757 {
cf9c99bc
KH
3758 c1 = RE_CHAR_TO_MULTIBYTE (c);
3759 if (! CHAR_BYTE8_P (c1))
3760 {
3761 re_wchar_t c2 = TRANSLATE (c1);
3762
3763 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3764 c = c1;
409f2919 3765 }
6fdd04b0
KH
3766 *b++ = c;
3767 len = 1;
3768 }
2d1675e4
SM
3769 (*pending_exact) += len;
3770 }
3771
fa9a63c5 3772 break;
25fe55af 3773 } /* switch (c) */
fa9a63c5
RM
3774 } /* while p != pend */
3775
5e69f11e 3776
fa9a63c5 3777 /* Through the pattern now. */
5e69f11e 3778
505bde11 3779 FIXUP_ALT_JUMP ();
fa9a63c5 3780
5e69f11e 3781 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3782 FREE_STACK_RETURN (REG_EPAREN);
3783
3784 /* If we don't want backtracking, force success
3785 the first time we reach the end of the compiled pattern. */
3786 if (syntax & RE_NO_POSIX_BACKTRACKING)
3787 BUF_PUSH (succeed);
3788
fa9a63c5
RM
3789 /* We have succeeded; set the length of the buffer. */
3790 bufp->used = b - bufp->buffer;
3791
3792#ifdef DEBUG
99633e97 3793 if (debug > 0)
fa9a63c5 3794 {
505bde11 3795 re_compile_fastmap (bufp);
fa9a63c5
RM
3796 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3797 print_compiled_pattern (bufp);
3798 }
99633e97 3799 debug--;
fa9a63c5
RM
3800#endif /* DEBUG */
3801
3802#ifndef MATCH_MAY_ALLOCATE
3803 /* Initialize the failure stack to the largest possible stack. This
3804 isn't necessary unless we're trying to avoid calling alloca in
3805 the search and match routines. */
3806 {
3807 int num_regs = bufp->re_nsub + 1;
3808
320a2a73 3809 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3810 {
a26f4ccd 3811 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3812
fa9a63c5
RM
3813 if (! fail_stack.stack)
3814 fail_stack.stack
5e69f11e 3815 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3816 * sizeof (fail_stack_elt_t));
3817 else
3818 fail_stack.stack
3819 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3820 (fail_stack.size
3821 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3822 }
3823
3824 regex_grow_registers (num_regs);
3825 }
3826#endif /* not MATCH_MAY_ALLOCATE */
3827
839966f3 3828 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3829} /* regex_compile */
3830\f
3831/* Subroutines for `regex_compile'. */
3832
7814e705 3833/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3834
3835static void
971de7fb 3836store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3837{
3838 *loc = (unsigned char) op;
3839 STORE_NUMBER (loc + 1, arg);
3840}
3841
3842
3843/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3844
3845static void
971de7fb 3846store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3847{
3848 *loc = (unsigned char) op;
3849 STORE_NUMBER (loc + 1, arg1);
3850 STORE_NUMBER (loc + 3, arg2);
3851}
3852
3853
3854/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3855 for OP followed by two-byte integer parameter ARG. */
3856
3857static void
971de7fb 3858insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3859{
3860 register unsigned char *pfrom = end;
3861 register unsigned char *pto = end + 3;
3862
3863 while (pfrom != loc)
3864 *--pto = *--pfrom;
5e69f11e 3865
fa9a63c5
RM
3866 store_op1 (op, loc, arg);
3867}
3868
3869
3870/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3871
3872static void
971de7fb 3873insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3874{
3875 register unsigned char *pfrom = end;
3876 register unsigned char *pto = end + 5;
3877
3878 while (pfrom != loc)
3879 *--pto = *--pfrom;
5e69f11e 3880
fa9a63c5
RM
3881 store_op2 (op, loc, arg1, arg2);
3882}
3883
3884
3885/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3886 after an alternative or a begin-subexpression. We assume there is at
3887 least one character before the ^. */
3888
3889static boolean
971de7fb 3890at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3891{
01618498 3892 re_char *prev = p - 2;
fa9a63c5 3893 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3894
fa9a63c5
RM
3895 return
3896 /* After a subexpression? */
3897 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3898 /* After an alternative? */
d2af47df
SM
3899 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3900 /* After a shy subexpression? */
3901 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3902 && prev[-1] == '?' && prev[-2] == '('
3903 && (syntax & RE_NO_BK_PARENS
3904 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3905}
3906
3907
3908/* The dual of at_begline_loc_p. This one is for $. We assume there is
3909 at least one character after the $, i.e., `P < PEND'. */
3910
3911static boolean
971de7fb 3912at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3913{
01618498 3914 re_char *next = p;
fa9a63c5 3915 boolean next_backslash = *next == '\\';
01618498 3916 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3917
fa9a63c5
RM
3918 return
3919 /* Before a subexpression? */
3920 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3921 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3922 /* Before an alternative? */
3923 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3924 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3925}
3926
3927
5e69f11e 3928/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3929 false if it's not. */
3930
3931static boolean
971de7fb 3932group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5
RM
3933{
3934 int this_element;
3935
5e69f11e
RM
3936 for (this_element = compile_stack.avail - 1;
3937 this_element >= 0;
fa9a63c5
RM
3938 this_element--)
3939 if (compile_stack.stack[this_element].regnum == regnum)
3940 return true;
3941
3942 return false;
3943}
fa9a63c5 3944\f
f6a3f532
SM
3945/* analyse_first.
3946 If fastmap is non-NULL, go through the pattern and fill fastmap
3947 with all the possible leading chars. If fastmap is NULL, don't
3948 bother filling it up (obviously) and only return whether the
3949 pattern could potentially match the empty string.
3950
3951 Return 1 if p..pend might match the empty string.
3952 Return 0 if p..pend matches at least one char.
01618498 3953 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3954
3955static int
438105ed 3956analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
fa9a63c5 3957{
505bde11 3958 int j, k;
1fb352e0 3959 boolean not;
fa9a63c5 3960
b18215fc 3961 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3962 flag is set true. */
b18215fc
RS
3963 boolean match_any_multibyte_characters = false;
3964
f6a3f532 3965 assert (p);
5e69f11e 3966
505bde11
SM
3967 /* The loop below works as follows:
3968 - It has a working-list kept in the PATTERN_STACK and which basically
3969 starts by only containing a pointer to the first operation.
3970 - If the opcode we're looking at is a match against some set of
3971 chars, then we add those chars to the fastmap and go on to the
3972 next work element from the worklist (done via `break').
3973 - If the opcode is a control operator on the other hand, we either
3974 ignore it (if it's meaningless at this point, such as `start_memory')
3975 or execute it (if it's a jump). If the jump has several destinations
3976 (i.e. `on_failure_jump'), then we push the other destination onto the
3977 worklist.
3978 We guarantee termination by ignoring backward jumps (more or less),
3979 so that `p' is monotonically increasing. More to the point, we
3980 never set `p' (or push) anything `<= p1'. */
3981
01618498 3982 while (p < pend)
fa9a63c5 3983 {
505bde11
SM
3984 /* `p1' is used as a marker of how far back a `on_failure_jump'
3985 can go without being ignored. It is normally equal to `p'
3986 (which prevents any backward `on_failure_jump') except right
3987 after a plain `jump', to allow patterns such as:
3988 0: jump 10
3989 3..9: <body>
3990 10: on_failure_jump 3
3991 as used for the *? operator. */
01618498 3992 re_char *p1 = p;
5e69f11e 3993
fa9a63c5
RM
3994 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3995 {
f6a3f532 3996 case succeed:
01618498 3997 return 1;
fa9a63c5 3998
fa9a63c5 3999 case duplicate:
505bde11
SM
4000 /* If the first character has to match a backreference, that means
4001 that the group was empty (since it already matched). Since this
4002 is the only case that interests us here, we can assume that the
4003 backreference must match the empty string. */
4004 p++;
4005 continue;
fa9a63c5
RM
4006
4007
4008 /* Following are the cases which match a character. These end
7814e705 4009 with `break'. */
fa9a63c5
RM
4010
4011 case exactn:
e0277a47 4012 if (fastmap)
cf9c99bc
KH
4013 {
4014 /* If multibyte is nonzero, the first byte of each
4015 character is an ASCII or a leading code. Otherwise,
4016 each byte is a character. Thus, this works in both
4017 cases. */
4018 fastmap[p[1]] = 1;
4019 if (! multibyte)
4020 {
4021 /* For the case of matching this unibyte regex
4022 against multibyte, we must set a leading code of
4023 the corresponding multibyte character. */
4024 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4025
86e893e3 4026 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
4027 }
4028 }
fa9a63c5
RM
4029 break;
4030
4031
1fb352e0
SM
4032 case anychar:
4033 /* We could put all the chars except for \n (and maybe \0)
4034 but we don't bother since it is generally not worth it. */
f6a3f532 4035 if (!fastmap) break;
01618498 4036 return -1;
fa9a63c5
RM
4037
4038
b18215fc 4039 case charset_not:
1fb352e0 4040 if (!fastmap) break;
bf216479
KH
4041 {
4042 /* Chars beyond end of bitmap are possible matches. */
bf216479 4043 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 4044 j < (1 << BYTEWIDTH); j++)
bf216479
KH
4045 fastmap[j] = 1;
4046 }
4047
1fb352e0
SM
4048 /* Fallthrough */
4049 case charset:
4050 if (!fastmap) break;
4051 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4052 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4053 j >= 0; j--)
1fb352e0 4054 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4055 fastmap[j] = 1;
b18215fc 4056
6482db2e
KH
4057#ifdef emacs
4058 if (/* Any leading code can possibly start a character
1fb352e0 4059 which doesn't match the specified set of characters. */
6482db2e 4060 not
409f2919 4061 ||
6482db2e
KH
4062 /* If we can match a character class, we can match any
4063 multibyte characters. */
4064 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4065 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4066
b18215fc 4067 {
b18215fc
RS
4068 if (match_any_multibyte_characters == false)
4069 {
6482db2e
KH
4070 for (j = MIN_MULTIBYTE_LEADING_CODE;
4071 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4072 fastmap[j] = 1;
b18215fc
RS
4073 match_any_multibyte_characters = true;
4074 }
4075 }
b18215fc 4076
1fb352e0
SM
4077 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4078 && match_any_multibyte_characters == false)
4079 {
bf216479 4080 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 4081 multibyte character in the range table. */
1fb352e0 4082 int c, count;
bf216479 4083 unsigned char lc1, lc2;
b18215fc 4084
1fb352e0 4085 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4086 bits for a character class. */
1fb352e0 4087 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4088
1fb352e0
SM
4089 /* Extract the number of ranges in range table into COUNT. */
4090 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4091 for (; count > 0; count--, p += 3)
1fb352e0 4092 {
9117d724
KH
4093 /* Extract the start and end of each range. */
4094 EXTRACT_CHARACTER (c, p);
bf216479 4095 lc1 = CHAR_LEADING_CODE (c);
9117d724 4096 p += 3;
1fb352e0 4097 EXTRACT_CHARACTER (c, p);
bf216479
KH
4098 lc2 = CHAR_LEADING_CODE (c);
4099 for (j = lc1; j <= lc2; j++)
9117d724 4100 fastmap[j] = 1;
1fb352e0
SM
4101 }
4102 }
6482db2e 4103#endif
b18215fc
RS
4104 break;
4105
1fb352e0
SM
4106 case syntaxspec:
4107 case notsyntaxspec:
4108 if (!fastmap) break;
4109#ifndef emacs
4110 not = (re_opcode_t)p[-1] == notsyntaxspec;
4111 k = *p++;
4112 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4113 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4114 fastmap[j] = 1;
b18215fc 4115 break;
1fb352e0 4116#else /* emacs */
b18215fc
RS
4117 /* This match depends on text properties. These end with
4118 aborting optimizations. */
01618498 4119 return -1;
b18215fc
RS
4120
4121 case categoryspec:
b18215fc 4122 case notcategoryspec:
1fb352e0
SM
4123 if (!fastmap) break;
4124 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4125 k = *p++;
6482db2e 4126 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4127 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4128 fastmap[j] = 1;
4129
6482db2e
KH
4130 /* Any leading code can possibly start a character which
4131 has or doesn't has the specified category. */
4132 if (match_any_multibyte_characters == false)
6fdd04b0 4133 {
6482db2e
KH
4134 for (j = MIN_MULTIBYTE_LEADING_CODE;
4135 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4136 fastmap[j] = 1;
4137 match_any_multibyte_characters = true;
6fdd04b0 4138 }
b18215fc
RS
4139 break;
4140
fa9a63c5 4141 /* All cases after this match the empty string. These end with
25fe55af 4142 `continue'. */
fa9a63c5 4143
fa9a63c5
RM
4144 case before_dot:
4145 case at_dot:
4146 case after_dot:
1fb352e0 4147#endif /* !emacs */
25fe55af
RS
4148 case no_op:
4149 case begline:
4150 case endline:
fa9a63c5
RM
4151 case begbuf:
4152 case endbuf:
4153 case wordbound:
4154 case notwordbound:
4155 case wordbeg:
4156 case wordend:
669fa600
SM
4157 case symbeg:
4158 case symend:
25fe55af 4159 continue;
fa9a63c5
RM
4160
4161
fa9a63c5 4162 case jump:
25fe55af 4163 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4164 if (j < 0)
4165 /* Backward jumps can only go back to code that we've already
4166 visited. `re_compile' should make sure this is true. */
4167 break;
25fe55af 4168 p += j;
505bde11
SM
4169 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4170 {
4171 case on_failure_jump:
4172 case on_failure_keep_string_jump:
505bde11 4173 case on_failure_jump_loop:
0683b6fa 4174 case on_failure_jump_nastyloop:
505bde11
SM
4175 case on_failure_jump_smart:
4176 p++;
4177 break;
4178 default:
4179 continue;
4180 };
4181 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4182 to jump back to "just after here". */
4183 /* Fallthrough */
fa9a63c5 4184
25fe55af
RS
4185 case on_failure_jump:
4186 case on_failure_keep_string_jump:
0683b6fa 4187 case on_failure_jump_nastyloop:
505bde11
SM
4188 case on_failure_jump_loop:
4189 case on_failure_jump_smart:
25fe55af 4190 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4191 if (p + j <= p1)
ed0767d8 4192 ; /* Backward jump to be ignored. */
01618498
SM
4193 else
4194 { /* We have to look down both arms.
4195 We first go down the "straight" path so as to minimize
4196 stack usage when going through alternatives. */
4197 int r = analyse_first (p, pend, fastmap, multibyte);
4198 if (r) return r;
4199 p += j;
4200 }
25fe55af 4201 continue;
fa9a63c5
RM
4202
4203
ed0767d8
SM
4204 case jump_n:
4205 /* This code simply does not properly handle forward jump_n. */
4206 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4207 p += 4;
4208 /* jump_n can either jump or fall through. The (backward) jump
4209 case has already been handled, so we only need to look at the
4210 fallthrough case. */
4211 continue;
177c0ea7 4212
fa9a63c5 4213 case succeed_n:
ed0767d8
SM
4214 /* If N == 0, it should be an on_failure_jump_loop instead. */
4215 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4216 p += 4;
4217 /* We only care about one iteration of the loop, so we don't
4218 need to consider the case where this behaves like an
4219 on_failure_jump. */
25fe55af 4220 continue;
fa9a63c5
RM
4221
4222
4223 case set_number_at:
25fe55af
RS
4224 p += 4;
4225 continue;
fa9a63c5
RM
4226
4227
4228 case start_memory:
25fe55af 4229 case stop_memory:
505bde11 4230 p += 1;
fa9a63c5
RM
4231 continue;
4232
4233
4234 default:
25fe55af
RS
4235 abort (); /* We have listed all the cases. */
4236 } /* switch *p++ */
fa9a63c5
RM
4237
4238 /* Getting here means we have found the possible starting
25fe55af 4239 characters for one path of the pattern -- and that the empty
7814e705 4240 string does not match. We need not follow this path further. */
01618498 4241 return 0;
fa9a63c5
RM
4242 } /* while p */
4243
01618498
SM
4244 /* We reached the end without matching anything. */
4245 return 1;
4246
f6a3f532
SM
4247} /* analyse_first */
4248\f
4249/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4250 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4251 characters can start a string that matches the pattern. This fastmap
4252 is used by re_search to skip quickly over impossible starting points.
4253
4254 Character codes above (1 << BYTEWIDTH) are not represented in the
4255 fastmap, but the leading codes are represented. Thus, the fastmap
4256 indicates which character sets could start a match.
4257
4258 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4259 area as BUFP->fastmap.
4260
4261 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4262 the pattern buffer.
4263
4264 Returns 0 if we succeed, -2 if an internal error. */
4265
4266int
971de7fb 4267re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4268{
4269 char *fastmap = bufp->fastmap;
4270 int analysis;
4271
4272 assert (fastmap && bufp->buffer);
4273
72af86bd 4274 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4275 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4276
4277 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4278 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4279 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4280 return 0;
4281} /* re_compile_fastmap */
4282\f
4283/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4284 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4285 this memory for recording register information. STARTS and ENDS
4286 must be allocated using the malloc library routine, and must each
4287 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4288
4289 If NUM_REGS == 0, then subsequent matches should allocate their own
4290 register data.
4291
4292 Unless this function is called, the first search or match using
4293 PATTERN_BUFFER will allocate its own register data, without
4294 freeing the old data. */
4295
4296void
971de7fb 4297re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4298{
4299 if (num_regs)
4300 {
4301 bufp->regs_allocated = REGS_REALLOCATE;
4302 regs->num_regs = num_regs;
4303 regs->start = starts;
4304 regs->end = ends;
4305 }
4306 else
4307 {
4308 bufp->regs_allocated = REGS_UNALLOCATED;
4309 regs->num_regs = 0;
4310 regs->start = regs->end = (regoff_t *) 0;
4311 }
4312}
c0f9ea08 4313WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4314\f
7814e705 4315/* Searching routines. */
fa9a63c5
RM
4316
/* Single-string front end to re_search_2 (below).  STRING, of length
   SIZE, is passed as the second string with an empty first string, and
   the stop position is fixed at SIZE, so the caller cannot say where
   to stop matching.  */

int
re_search (struct re_pattern_buffer *bufp, const char *string, int size, int startpos, int range, struct re_registers *regs)
{
  const int stop = size;

  return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
                      regs, stop);
}
c0f9ea08 4326WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4327
70806df6
KH
/* The three macros below address the virtual concatenation of
   `string1' (length `size1') followed by `string2' (length `size2');
   those four names must be in scope wherever the macros are used.  */

/* Start address of whichever string holds position P.  */
#define HEAD_ADDR_VSTRING(P)						\
  ((P) < size1 ? string1 : string2)

/* End address of whichever string holds position P.  */
#define STOP_ADDR_VSTRING(P)						\
  ((P) < size1 ? string1 + size1 : string2 + size2)

/* Address of position POS within the virtual concatenation.  */
#define POS_ADDR_VSTRING(POS)						\
  (((POS) < size1 ? string1 : string2 - size1) + (POS))
fa9a63c5
RM
4339
4340/* Using the compiled pattern in BUFP->buffer, first tries to match the
4341 virtual concatenation of STRING1 and STRING2, starting first at index
4342 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4343
fa9a63c5 4344 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4345
fa9a63c5
RM
4346 RANGE is how far to scan while trying to match. RANGE = 0 means try
4347 only at STARTPOS; in general, the last start tried is STARTPOS +
4348 RANGE.
5e69f11e 4349
fa9a63c5
RM
4350 In REGS, return the indices of the virtual concatenation of STRING1
4351 and STRING2 that matched the entire BUFP->buffer and its contained
4352 subexpressions.
5e69f11e 4353
fa9a63c5
RM
4354 Do not consider matching one past the index STOP in the virtual
4355 concatenation of STRING1 and STRING2.
4356
4357 We return either the position in the strings at which the match was
4358 found, -1 if no match, or -2 if error (such as failure
4359 stack overflow). */
4360
4361int
971de7fb 4362re_search_2 (struct re_pattern_buffer *bufp, const char *str1, int size1, const char *str2, int size2, int startpos, int range, struct re_registers *regs, int stop)
fa9a63c5
RM
4363{
4364 int val;
66f0296e
SM
4365 re_char *string1 = (re_char*) str1;
4366 re_char *string2 = (re_char*) str2;
fa9a63c5 4367 register char *fastmap = bufp->fastmap;
6676cb1c 4368 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4369 int total_size = size1 + size2;
4370 int endpos = startpos + range;
c0f9ea08 4371 boolean anchored_start;
cf9c99bc
KH
4372 /* Nonzero if we are searching multibyte string. */
4373 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4374
fa9a63c5
RM
4375 /* Check for out-of-range STARTPOS. */
4376 if (startpos < 0 || startpos > total_size)
4377 return -1;
5e69f11e 4378
fa9a63c5 4379 /* Fix up RANGE if it might eventually take us outside
34597fa9 4380 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4381 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4382 if (endpos < 0)
4383 range = 0 - startpos;
fa9a63c5
RM
4384 else if (endpos > total_size)
4385 range = total_size - startpos;
4386
4387 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4388 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4389 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4390 {
4391 if (startpos > 0)
4392 return -1;
4393 else
7b140fd7 4394 range = 0;
fa9a63c5
RM
4395 }
4396
ae4788a8
RS
4397#ifdef emacs
4398 /* In a forward search for something that starts with \=.
4399 don't keep searching past point. */
4400 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4401 {
7b140fd7
RS
4402 range = PT_BYTE - BEGV_BYTE - startpos;
4403 if (range < 0)
ae4788a8
RS
4404 return -1;
4405 }
4406#endif /* emacs */
4407
fa9a63c5
RM
4408 /* Update the fastmap now if not correct already. */
4409 if (fastmap && !bufp->fastmap_accurate)
01618498 4410 re_compile_fastmap (bufp);
5e69f11e 4411
c8499ba5 4412 /* See whether the pattern is anchored. */
c0f9ea08 4413 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4414
b18215fc 4415#ifdef emacs
d48cd3f4 4416 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4417 {
99633e97 4418 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4419
4420 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4421 }
b18215fc
RS
4422#endif
4423
fa9a63c5
RM
4424 /* Loop through the string, looking for a place to start matching. */
4425 for (;;)
5e69f11e 4426 {
c8499ba5
RS
4427 /* If the pattern is anchored,
4428 skip quickly past places we cannot match.
4429 We don't bother to treat startpos == 0 specially
4430 because that case doesn't repeat. */
4431 if (anchored_start && startpos > 0)
4432 {
c0f9ea08
SM
4433 if (! ((startpos <= size1 ? string1[startpos - 1]
4434 : string2[startpos - size1 - 1])
4435 == '\n'))
c8499ba5
RS
4436 goto advance;
4437 }
4438
fa9a63c5 4439 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4440 cannot be the start of a match. If the pattern can match the
4441 null string, however, we don't need to skip characters; we want
7814e705 4442 the first null string. */
fa9a63c5
RM
4443 if (fastmap && startpos < total_size && !bufp->can_be_null)
4444 {
66f0296e 4445 register re_char *d;
01618498 4446 register re_wchar_t buf_ch;
e934739e
RS
4447
4448 d = POS_ADDR_VSTRING (startpos);
4449
7814e705 4450 if (range > 0) /* Searching forwards. */
fa9a63c5 4451 {
fa9a63c5
RM
4452 register int lim = 0;
4453 int irange = range;
4454
25fe55af
RS
4455 if (startpos < size1 && startpos + range >= size1)
4456 lim = range - (size1 - startpos);
fa9a63c5 4457
25fe55af
RS
4458 /* Written out as an if-else to avoid testing `translate'
4459 inside the loop. */
28ae27ae
AS
4460 if (RE_TRANSLATE_P (translate))
4461 {
e934739e
RS
4462 if (multibyte)
4463 while (range > lim)
4464 {
4465 int buf_charlen;
4466
62a6e103 4467 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4468 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4469 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4470 break;
4471
4472 range -= buf_charlen;
4473 d += buf_charlen;
4474 }
4475 else
bf216479 4476 while (range > lim)
33c46939 4477 {
cf9c99bc
KH
4478 register re_wchar_t ch, translated;
4479
bf216479 4480 buf_ch = *d;
cf9c99bc
KH
4481 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4482 translated = RE_TRANSLATE (translate, ch);
4483 if (translated != ch
4484 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4485 buf_ch = ch;
6fdd04b0 4486 if (fastmap[buf_ch])
bf216479 4487 break;
33c46939
RS
4488 d++;
4489 range--;
4490 }
e934739e 4491 }
fa9a63c5 4492 else
6fdd04b0
KH
4493 {
4494 if (multibyte)
4495 while (range > lim)
4496 {
4497 int buf_charlen;
fa9a63c5 4498
62a6e103 4499 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4500 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4501 break;
4502 range -= buf_charlen;
4503 d += buf_charlen;
4504 }
e934739e 4505 else
6fdd04b0 4506 while (range > lim && !fastmap[*d])
33c46939
RS
4507 {
4508 d++;
4509 range--;
4510 }
e934739e 4511 }
fa9a63c5
RM
4512 startpos += irange - range;
4513 }
7814e705 4514 else /* Searching backwards. */
fa9a63c5 4515 {
ba5e343c
KH
4516 if (multibyte)
4517 {
62a6e103 4518 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4519 buf_ch = TRANSLATE (buf_ch);
4520 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4521 goto advance;
4522 }
4523 else
4524 {
cf9c99bc
KH
4525 register re_wchar_t ch, translated;
4526
4527 buf_ch = *d;
4528 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4529 translated = TRANSLATE (ch);
4530 if (translated != ch
4531 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4532 buf_ch = ch;
4533 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4534 goto advance;
4535 }
fa9a63c5
RM
4536 }
4537 }
4538
4539 /* If can't match the null string, and that's all we have left, fail. */
4540 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4541 && !bufp->can_be_null)
fa9a63c5
RM
4542 return -1;
4543
4544 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4545 startpos, regs, stop);
fa9a63c5
RM
4546
4547 if (val >= 0)
4548 return startpos;
5e69f11e 4549
fa9a63c5
RM
4550 if (val == -2)
4551 return -2;
4552
4553 advance:
5e69f11e 4554 if (!range)
25fe55af 4555 break;
5e69f11e 4556 else if (range > 0)
25fe55af 4557 {
b18215fc
RS
4558 /* Update STARTPOS to the next character boundary. */
4559 if (multibyte)
4560 {
66f0296e
SM
4561 re_char *p = POS_ADDR_VSTRING (startpos);
4562 re_char *pend = STOP_ADDR_VSTRING (startpos);
aa3830c4 4563 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4564
4565 range -= len;
4566 if (range < 0)
4567 break;
4568 startpos += len;
4569 }
4570 else
4571 {
b560c397
RS
4572 range--;
4573 startpos++;
4574 }
e318085a 4575 }
fa9a63c5 4576 else
25fe55af
RS
4577 {
4578 range++;
4579 startpos--;
b18215fc
RS
4580
4581 /* Update STARTPOS to the previous character boundary. */
4582 if (multibyte)
4583 {
70806df6
KH
4584 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4585 re_char *p0 = p;
4586 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4587
4588 /* Find the head of multibyte form. */
70806df6
KH
4589 PREV_CHAR_BOUNDARY (p, phead);
4590 range += p0 - 1 - p;
4591 if (range > 0)
4592 break;
b18215fc 4593
70806df6 4594 startpos -= p0 - 1 - p;
b18215fc 4595 }
25fe55af 4596 }
fa9a63c5
RM
4597 }
4598 return -1;
4599} /* re_search_2 */
c0f9ea08 4600WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4601\f
4602/* Declarations and macros for re_match_2. */
4603
2d1675e4
SM
4604static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4605 register int len,
4606 RE_TRANSLATE_TYPE translate,
4607 const int multibyte));
fa9a63c5
RM
4608
/* Convert PTR, a pointer into one of the search strings `string1' and
   `string2', into an offset.  A pointer into `string1' (as decided by
   FIRST_STRING_P) is counted from the start of `string1'; any other
   pointer is counted from the start of `string2' with `size1' added.  */
#define POINTER_TO_OFFSET(ptr)						\
  ((regoff_t) (FIRST_STRING_P (ptr)					\
	       ? (ptr) - string1					\
	       : (ptr) - string2 + size1))
4615
/* Call before fetching a character with *d.  While `d' sits at the
   current limit `dend', either move on to string2 or, if the limit
   already is `end_match_2', jump to the `fail' label.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH()							\
  while (d == dend)							\
    {									\
      if (dend != end_match_2)						\
	{								\
	  /* End of string1 => advance to string2.  */		\
	  d = string2;							\
	  dend = end_match_2;						\
	}								\
      else								\
	/* End of string2 => fail.  */					\
	goto fail;							\
    }
4630
f1ad044f
SM
/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc.,
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e., past `stop').  If `d' is at the end of
   string1 (`end1'), move it to the start of string2 and make
   `end_match_2' the new `dend'; otherwise do nothing.

   The body is wrapped in `do ... while (0)' so that
   `PREFETCH_NOLIMIT ();' is a single statement, safe as the unbraced
   body of an `if' that has an `else'.  The macro deliberately ends
   without a trailing line continuation.  */
#define PREFETCH_NOLIMIT() \
  do \
    { \
      if (d == end1) \
        { \
          d = string2; \
          dend = end_match_2; \
        } \
    } \
  while (0)
RM
4641
/* Predicates telling whether D sits at the very beginning, respectively
   the very end, of the virtual concatenation of `string1' and
   `string2'.  If only one string, it's `string2'.  AT_STRINGS_BEG is
   also true whenever `size2' is zero.  */
#define AT_STRINGS_BEG(d) \
  ((size1 ? (d) == string1 : (d) == string2) || size2 == 0)
#define AT_STRINGS_END(d) (end2 == (d))
fa9a63c5
RM
4646
4647
/* Non-zero if the character D designates has word syntax.  Two
   positions are special-cased: when D is just past the end of string1
   (`end1') the first character of string2 is examined, and when D is
   one before the beginning of string2 the last character of string1
   is examined.  Anywhere else, *D itself is examined.  */
#define WORDCHAR_P(d) \
  (Sword == SYNTAX ((d) == end1 ? *string2 \
                    : (d) == string2 - 1 ? *(end1 - 1) \
                    : *(d)))
4656
9121ca40 4657/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4658
4659/* The comment at case wordbound is the following one, but we no longer
4660 use AT_WORD_BOUNDARY, in order to support multibyte form.
4661
4662 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4663 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4664 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4665 macro and introducing temporary variables works around the bug. */
4666
9121ca40 4667#if 0
fa9a63c5
RM
4668/* Test if the character before D and the one at D differ with respect
4669 to being word-constituent. */
4670#define AT_WORD_BOUNDARY(d) \
4671 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4672 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4673#endif
fa9a63c5
RM
4674
/* Release everything the matcher allocated for one match attempt.  */
#ifdef MATCH_MAY_ALLOCATE
/* Free VAR and reset it to NULL, unless it is already null.  The
   dangling `else' swallows the semicolon that follows each use.  */
# define FREE_VAR(var) \
  if (var) \
    { \
      REGEX_FREE (var); \
      var = NULL; \
    } \
  else
/* Free the failure stack and the four register vectors.  */
# define FREE_VARIABLES() \
  do \
    { \
      REGEX_FREE_STACK (fail_stack.stack); \
      FREE_VAR (regstart); \
      FREE_VAR (regend); \
      FREE_VAR (best_regstart); \
      FREE_VAR (best_regend); \
    } \
  while (0)
#else
/* Do nothing!  But inhibit gcc warning.  */
# define FREE_VARIABLES() ((void)0)
#endif /* not MATCH_MAY_ALLOCATE */
4689
505bde11
SM
4690\f
4691/* Optimization routines. */
4692
4e8a9132
SM
4693/* If the operation is a match against one or more chars,
4694 return a pointer to the next operation, else return NULL. */
01618498 4695static re_char *
971de7fb 4696skip_one_char (const re_char *p)
4e8a9132
SM
4697{
4698 switch (SWITCH_ENUM_CAST (*p++))
4699 {
4700 case anychar:
4701 break;
177c0ea7 4702
4e8a9132
SM
4703 case exactn:
4704 p += *p + 1;
4705 break;
4706
4707 case charset_not:
4708 case charset:
4709 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4710 {
4711 int mcnt;
4712 p = CHARSET_RANGE_TABLE (p - 1);
4713 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4714 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4715 }
4716 else
4717 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4718 break;
177c0ea7 4719
4e8a9132
SM
4720 case syntaxspec:
4721 case notsyntaxspec:
1fb352e0 4722#ifdef emacs
4e8a9132
SM
4723 case categoryspec:
4724 case notcategoryspec:
4725#endif /* emacs */
4726 p++;
4727 break;
4728
4729 default:
4730 p = NULL;
4731 }
4732 return p;
4733}
4734
4735
505bde11 4736/* Jump over non-matching operations. */
839966f3 4737static re_char *
971de7fb 4738skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4739{
4740 int mcnt;
4741 while (p < pend)
4742 {
4743 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4744 {
4745 case start_memory:
505bde11
SM
4746 case stop_memory:
4747 p += 2; break;
4748 case no_op:
4749 p += 1; break;
4750 case jump:
4751 p += 1;
4752 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4753 p += mcnt;
4754 break;
4755 default:
4756 return p;
4757 }
4758 }
4759 assert (p == pend);
4760 return p;
4761}
4762
4763/* Non-zero if "p1 matches something" implies "p2 fails". */
4764static int
971de7fb 4765mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
505bde11 4766{
4e8a9132 4767 re_opcode_t op2;
2d1675e4 4768 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4769 unsigned char *pend = bufp->buffer + bufp->used;
4770
4e8a9132 4771 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4772 && p2 >= bufp->buffer && p2 <= pend);
4773
4774 /* Skip over open/close-group commands.
4775 If what follows this loop is a ...+ construct,
4776 look at what begins its body, since we will have to
4777 match at least one of that. */
4e8a9132
SM
4778 p2 = skip_noops (p2, pend);
4779 /* The same skip can be done for p1, except that this function
4780 is only used in the case where p1 is a simple match operator. */
4781 /* p1 = skip_noops (p1, pend); */
4782
4783 assert (p1 >= bufp->buffer && p1 < pend
4784 && p2 >= bufp->buffer && p2 <= pend);
4785
4786 op2 = p2 == pend ? succeed : *p2;
4787
4788 switch (SWITCH_ENUM_CAST (op2))
505bde11 4789 {
4e8a9132
SM
4790 case succeed:
4791 case endbuf:
4792 /* If we're at the end of the pattern, we can change. */
4793 if (skip_one_char (p1))
505bde11 4794 {
505bde11
SM
4795 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4796 return 1;
505bde11 4797 }
4e8a9132 4798 break;
177c0ea7 4799
4e8a9132 4800 case endline:
4e8a9132
SM
4801 case exactn:
4802 {
01618498 4803 register re_wchar_t c
4e8a9132 4804 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4805 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4806
4e8a9132
SM
4807 if ((re_opcode_t) *p1 == exactn)
4808 {
62a6e103 4809 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4810 {
4811 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4812 return 1;
4813 }
4814 }
505bde11 4815
4e8a9132
SM
4816 else if ((re_opcode_t) *p1 == charset
4817 || (re_opcode_t) *p1 == charset_not)
4818 {
4819 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4820
4e8a9132
SM
4821 /* Test if C is listed in charset (or charset_not)
4822 at `p1'. */
6fdd04b0 4823 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4824 {
4825 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4826 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4827 not = !not;
4828 }
4829 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4830 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4831
4e8a9132
SM
4832 /* `not' is equal to 1 if c would match, which means
4833 that we can't change to pop_failure_jump. */
4834 if (!not)
4835 {
4836 DEBUG_PRINT1 (" No match => fast loop.\n");
4837 return 1;
4838 }
4839 }
4840 else if ((re_opcode_t) *p1 == anychar
4841 && c == '\n')
4842 {
4843 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4844 return 1;
4845 }
4846 }
4847 break;
505bde11 4848
4e8a9132 4849 case charset:
4e8a9132
SM
4850 {
4851 if ((re_opcode_t) *p1 == exactn)
4852 /* Reuse the code above. */
4853 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4854
505bde11
SM
4855 /* It is hard to list up all the character in charset
4856 P2 if it includes multibyte character. Give up in
4857 such case. */
4858 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4859 {
4860 /* Now, we are sure that P2 has no range table.
4861 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4862 enough. But P1 may have range table, so the
505bde11
SM
4863 size of bitmap table of P1 is extracted by
4864 using macro `CHARSET_BITMAP_SIZE'.
4865
6fdd04b0
KH
4866 In a multibyte case, we know that all the character
4867 listed in P2 is ASCII. In a unibyte case, P1 has only a
4868 bitmap table. So, in both cases, it is enough to test
4869 only the bitmap table of P1. */
505bde11 4870
411e4203 4871 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4872 {
4873 int idx;
4874 /* We win if the charset inside the loop
4875 has no overlap with the one after the loop. */
4876 for (idx = 0;
4877 (idx < (int) p2[1]
4878 && idx < CHARSET_BITMAP_SIZE (p1));
4879 idx++)
4880 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4881 break;
4882
4883 if (idx == p2[1]
4884 || idx == CHARSET_BITMAP_SIZE (p1))
4885 {
4886 DEBUG_PRINT1 (" No match => fast loop.\n");
4887 return 1;
4888 }
4889 }
411e4203 4890 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4891 {
4892 int idx;
4893 /* We win if the charset_not inside the loop lists
7814e705 4894 every character listed in the charset after. */
505bde11
SM
4895 for (idx = 0; idx < (int) p2[1]; idx++)
4896 if (! (p2[2 + idx] == 0
4897 || (idx < CHARSET_BITMAP_SIZE (p1)
4898 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4899 break;
4900
4e8a9132
SM
4901 if (idx == p2[1])
4902 {
4903 DEBUG_PRINT1 (" No match => fast loop.\n");
4904 return 1;
4905 }
4906 }
4907 }
4908 }
609b757a 4909 break;
177c0ea7 4910
411e4203
SM
4911 case charset_not:
4912 switch (SWITCH_ENUM_CAST (*p1))
4913 {
4914 case exactn:
4915 case charset:
4916 /* Reuse the code above. */
4917 return mutually_exclusive_p (bufp, p2, p1);
4918 case charset_not:
4919 /* When we have two charset_not, it's very unlikely that
4920 they don't overlap. The union of the two sets of excluded
4921 chars should cover all possible chars, which, as a matter of
4922 fact, is virtually impossible in multibyte buffers. */
36595814 4923 break;
411e4203
SM
4924 }
4925 break;
4926
4e8a9132 4927 case wordend:
669fa600
SM
4928 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4929 case symend:
4e8a9132 4930 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4931 && (p1[1] == Ssymbol || p1[1] == Sword));
4932 case notsyntaxspec:
4933 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4934
4935 case wordbeg:
669fa600
SM
4936 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4937 case symbeg:
4e8a9132 4938 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4939 && (p1[1] == Ssymbol || p1[1] == Sword));
4940 case syntaxspec:
4941 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4942
4943 case wordbound:
4944 return (((re_opcode_t) *p1 == notsyntaxspec
4945 || (re_opcode_t) *p1 == syntaxspec)
4946 && p1[1] == Sword);
4947
1fb352e0 4948#ifdef emacs
4e8a9132
SM
4949 case categoryspec:
4950 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4951 case notcategoryspec:
4952 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4953#endif /* emacs */
4954
4955 default:
4956 ;
505bde11
SM
4957 }
4958
4959 /* Safe default. */
4960 return 0;
4961}
4962
fa9a63c5
RM
4963\f
4964/* Matching routines. */
4965
25fe55af 4966#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4967/* re_match is like re_match_2 except it takes only a single string. */
4968
4969int
d2762c86
DN
4970re_match (struct re_pattern_buffer *bufp, const char *string,
4971 int size, int pos, struct re_registers *regs)
fa9a63c5 4972{
4bb91c68 4973 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4974 pos, regs, size);
fa9a63c5
RM
4975 return result;
4976}
c0f9ea08 4977WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4978#endif /* not emacs */
4979
b18215fc
RS
4980#ifdef emacs
4981/* In Emacs, this is the string or buffer in which we
7814e705 4982 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4983Lisp_Object re_match_object;
4984#endif
fa9a63c5
RM
4985
4986/* re_match_2 matches the compiled pattern in BUFP against the
4987 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4988 and SIZE2, respectively). We start matching at POS, and stop
4989 matching at STOP.
5e69f11e 4990
fa9a63c5 4991 If REGS is non-null and the `no_sub' field of BUFP is zero, we
7814e705 4992 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4993 documentation for exactly how many groups we fill.
4994
4995 We return -1 if no match, -2 if an internal error (such as the
7814e705 4996 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4997 matched substring. */
4998
4999int
971de7fb 5000re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1, const char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5 5001{
b18215fc 5002 int result;
25fe55af 5003
b18215fc 5004#ifdef emacs
cc9b4df2 5005 int charpos;
d48cd3f4 5006 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 5007 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 5008 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
5009#endif
5010
4bb91c68
SM
5011 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5012 (re_char*) string2, size2,
cc9b4df2 5013 pos, regs, stop);
fa9a63c5
RM
5014 return result;
5015}
c0f9ea08 5016WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 5017
bf216479 5018
fa9a63c5 5019/* This is a separate function so that we can force an alloca cleanup
7814e705 5020 afterwards. */
fa9a63c5 5021static int
971de7fb 5022re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1, int size1, const re_char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5
RM
5023{
5024 /* General temporaries. */
5025 int mcnt;
01618498 5026 size_t reg;
66f0296e 5027 boolean not;
fa9a63c5
RM
5028
5029 /* Just past the end of the corresponding string. */
66f0296e 5030 re_char *end1, *end2;
fa9a63c5
RM
5031
5032 /* Pointers into string1 and string2, just past the last characters in
7814e705 5033 each to consider matching. */
66f0296e 5034 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
5035
5036 /* Where we are in the data, and the end of the current string. */
66f0296e 5037 re_char *d, *dend;
5e69f11e 5038
99633e97
SM
5039 /* Used sometimes to remember where we were before starting matching
5040 an operator so that we can go back in case of failure. This "atomic"
5041 behavior of matching opcodes is indispensable to the correctness
5042 of the on_failure_keep_string_jump optimization. */
5043 re_char *dfail;
5044
fa9a63c5 5045 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5046 re_char *p = bufp->buffer;
5047 re_char *pend = p + bufp->used;
fa9a63c5 5048
25fe55af 5049 /* We use this to map every character in the string. */
6676cb1c 5050 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5051
cf9c99bc 5052 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5053 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5054
cf9c99bc
KH
5055 /* Nonzero if STRING1/STRING2 are multibyte. */
5056 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5057
fa9a63c5
RM
5058 /* Failure point stack. Each place that can handle a failure further
5059 down the line pushes a failure point on this stack. It consists of
505bde11 5060 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5061 the subexpressions we're currently inside, plus the number of such
5062 registers, and, finally, two char *'s. The first char * is where
5063 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5064 scanning the strings. */
5065#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5066 fail_stack_type fail_stack;
5067#endif
5068#ifdef DEBUG
fa9a63c5
RM
5069 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5070#endif
5071
0b32bf0e 5072#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5073 /* This holds the pointer to the failure stack, when
5074 it is allocated relocatably. */
5075 fail_stack_elt_t *failure_stack_ptr;
99633e97 5076#endif
fa9a63c5
RM
5077
5078 /* We fill all the registers internally, independent of what we
7814e705 5079 return, for use in backreferences. The number here includes
fa9a63c5 5080 an element for register zero. */
4bb91c68 5081 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5082
fa9a63c5
RM
5083 /* Information on the contents of registers. These are pointers into
5084 the input strings; they record just what was matched (on this
5085 attempt) by a subexpression part of the pattern, that is, the
5086 regnum-th regstart pointer points to where in the pattern we began
5087 matching and the regnum-th regend points to right after where we
5088 stopped matching the regnum-th subexpression. (The zeroth register
5089 keeps track of what the whole pattern matches.) */
5090#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5091 re_char **regstart, **regend;
fa9a63c5
RM
5092#endif
5093
fa9a63c5 5094 /* The following record the register info as found in the above
5e69f11e 5095 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5096 This happens as we backtrack through the failure points, which in
5097 turn happens only if we have not yet matched the entire string. */
5098 unsigned best_regs_set = false;
5099#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5100 re_char **best_regstart, **best_regend;
fa9a63c5 5101#endif
5e69f11e 5102
fa9a63c5
RM
5103 /* Logically, this is `best_regend[0]'. But we don't want to have to
5104 allocate space for that if we're not allocating space for anything
7814e705 5105 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5106 any of the other register vectors, and it seems rather a kludge to
5107 treat `best_regend' differently than the rest. So we keep track of
5108 the end of the best match so far in a separate variable. We
5109 initialize this to NULL so that when we backtrack the first time
5110 and need to test it, it's not garbage. */
66f0296e 5111 re_char *match_end = NULL;
fa9a63c5 5112
fa9a63c5
RM
5113#ifdef DEBUG
5114 /* Counts the total number of registers pushed. */
5e69f11e 5115 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5116#endif
5117
5118 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5119
fa9a63c5 5120 INIT_FAIL_STACK ();
5e69f11e 5121
fa9a63c5
RM
5122#ifdef MATCH_MAY_ALLOCATE
5123 /* Do not bother to initialize all the register variables if there are
5124 no groups in the pattern, as it takes a fair amount of time. If
5125 there are groups, we include space for register 0 (the whole
5126 pattern), even though we never use it, since it simplifies the
5127 array indexing. We should fix this. */
5128 if (bufp->re_nsub)
5129 {
66f0296e
SM
5130 regstart = REGEX_TALLOC (num_regs, re_char *);
5131 regend = REGEX_TALLOC (num_regs, re_char *);
5132 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5133 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5134
505bde11 5135 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5136 {
5137 FREE_VARIABLES ();
5138 return -2;
5139 }
fa9a63c5
RM
5140 }
5141 else
5142 {
5143 /* We must initialize all our variables to NULL, so that
25fe55af 5144 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5145 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5146 }
5147#endif /* MATCH_MAY_ALLOCATE */
5148
5149 /* The starting position is bogus. */
5150 if (pos < 0 || pos > size1 + size2)
5151 {
5152 FREE_VARIABLES ();
5153 return -1;
5154 }
5e69f11e 5155
fa9a63c5
RM
5156 /* Initialize subexpression text positions to -1 to mark ones that no
5157 start_memory/stop_memory has been seen for. Also initialize the
5158 register information struct. */
01618498
SM
5159 for (reg = 1; reg < num_regs; reg++)
5160 regstart[reg] = regend[reg] = NULL;
99633e97 5161
fa9a63c5 5162 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5163 `string1' is null. */
fa9a63c5
RM
5164 if (size2 == 0 && string1 != NULL)
5165 {
5166 string2 = string1;
5167 size2 = size1;
5168 string1 = 0;
5169 size1 = 0;
5170 }
5171 end1 = string1 + size1;
5172 end2 = string2 + size2;
5173
5e69f11e 5174 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5175 `dend' is the end of the input string that `d' points within. `d'
5176 is advanced into the following input string whenever necessary, but
5177 this happens before fetching; therefore, at the beginning of the
5178 loop, `d' can be pointing at the end of a string, but it cannot
5179 equal `string2'. */
419d1c74 5180 if (pos >= size1)
fa9a63c5 5181 {
419d1c74
SM
5182 /* Only match within string2. */
5183 d = string2 + pos - size1;
5184 dend = end_match_2 = string2 + stop - size1;
5185 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5186 }
5187 else
5188 {
f1ad044f 5189 if (stop < size1)
419d1c74
SM
5190 {
5191 /* Only match within string1. */
5192 end_match_1 = string1 + stop;
5193 /* BEWARE!
5194 When we reach end_match_1, PREFETCH normally switches to string2.
5195 But in the present case, this means that just doing a PREFETCH
5196 makes us jump from `stop' to `gap' within the string.
5197 What we really want here is for the search to stop as
5198 soon as we hit end_match_1. That's why we set end_match_2
5199 to end_match_1 (since PREFETCH fails as soon as we hit
5200 end_match_2). */
5201 end_match_2 = end_match_1;
5202 }
5203 else
f1ad044f
SM
5204 { /* It's important to use this code when stop == size so that
5205 moving `d' from end1 to string2 will not prevent the d == dend
5206 check from catching the end of string. */
419d1c74
SM
5207 end_match_1 = end1;
5208 end_match_2 = string2 + stop - size1;
5209 }
5210 d = string1 + pos;
5211 dend = end_match_1;
fa9a63c5
RM
5212 }
5213
5214 DEBUG_PRINT1 ("The compiled pattern is: ");
5215 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5216 DEBUG_PRINT1 ("The string to match is: `");
5217 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5218 DEBUG_PRINT1 ("'\n");
5e69f11e 5219
7814e705 5220 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5221 function if the match is complete, or it drops through if the match
5222 fails at this starting point in the input data. */
5223 for (;;)
5224 {
505bde11 5225 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5226
5227 if (p == pend)
5228 { /* End of pattern means we might have succeeded. */
25fe55af 5229 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5230
fa9a63c5 5231 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5232 longest match, try backtracking. */
5233 if (d != end_match_2)
fa9a63c5
RM
5234 {
5235 /* 1 if this match ends in the same string (string1 or string2)
5236 as the best previous match. */
5e69f11e 5237 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5238 == FIRST_STRING_P (d));
fa9a63c5
RM
5239 /* 1 if this match is the best seen so far. */
5240 boolean best_match_p;
5241
5242 /* AIX compiler got confused when this was combined
7814e705 5243 with the previous declaration. */
fa9a63c5
RM
5244 if (same_str_p)
5245 best_match_p = d > match_end;
5246 else
99633e97 5247 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5248
25fe55af
RS
5249 DEBUG_PRINT1 ("backtracking.\n");
5250
5251 if (!FAIL_STACK_EMPTY ())
5252 { /* More failure points to try. */
5253
5254 /* If exceeds best match so far, save it. */
5255 if (!best_regs_set || best_match_p)
5256 {
5257 best_regs_set = true;
5258 match_end = d;
5259
5260 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5261
01618498 5262 for (reg = 1; reg < num_regs; reg++)
25fe55af 5263 {
01618498
SM
5264 best_regstart[reg] = regstart[reg];
5265 best_regend[reg] = regend[reg];
25fe55af
RS
5266 }
5267 }
5268 goto fail;
5269 }
5270
5271 /* If no failure points, don't restore garbage. And if
5272 last match is real best match, don't restore second
5273 best one. */
5274 else if (best_regs_set && !best_match_p)
5275 {
5276 restore_best_regs:
5277 /* Restore best match. It may happen that `dend ==
5278 end_match_1' while the restored d is in string2.
5279 For example, the pattern `x.*y.*z' against the
5280 strings `x-' and `y-z-', if the two strings are
7814e705 5281 not consecutive in memory. */
25fe55af
RS
5282 DEBUG_PRINT1 ("Restoring best registers.\n");
5283
5284 d = match_end;
5285 dend = ((d >= string1 && d <= end1)
5286 ? end_match_1 : end_match_2);
fa9a63c5 5287
01618498 5288 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5289 {
01618498
SM
5290 regstart[reg] = best_regstart[reg];
5291 regend[reg] = best_regend[reg];
fa9a63c5 5292 }
25fe55af
RS
5293 }
5294 } /* d != end_match_2 */
fa9a63c5
RM
5295
5296 succeed_label:
25fe55af 5297 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5298
25fe55af
RS
5299 /* If caller wants register contents data back, do it. */
5300 if (regs && !bufp->no_sub)
fa9a63c5 5301 {
25fe55af
RS
5302 /* Have the register data arrays been allocated? */
5303 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5304 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5305 extra element beyond `num_regs' for the `-1' marker
5306 GNU code uses. */
5307 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5308 regs->start = TALLOC (regs->num_regs, regoff_t);
5309 regs->end = TALLOC (regs->num_regs, regoff_t);
5310 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5311 {
5312 FREE_VARIABLES ();
5313 return -2;
5314 }
25fe55af
RS
5315 bufp->regs_allocated = REGS_REALLOCATE;
5316 }
5317 else if (bufp->regs_allocated == REGS_REALLOCATE)
5318 { /* Yes. If we need more elements than were already
5319 allocated, reallocate them. If we need fewer, just
5320 leave it alone. */
5321 if (regs->num_regs < num_regs + 1)
5322 {
5323 regs->num_regs = num_regs + 1;
5324 RETALLOC (regs->start, regs->num_regs, regoff_t);
5325 RETALLOC (regs->end, regs->num_regs, regoff_t);
5326 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5327 {
5328 FREE_VARIABLES ();
5329 return -2;
5330 }
25fe55af
RS
5331 }
5332 }
5333 else
fa9a63c5
RM
5334 {
5335 /* These braces fend off a "empty body in an else-statement"
7814e705 5336 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5337 assert (bufp->regs_allocated == REGS_FIXED);
5338 }
5339
25fe55af
RS
5340 /* Convert the pointer data in `regstart' and `regend' to
5341 indices. Register zero has to be set differently,
5342 since we haven't kept track of any info for it. */
5343 if (regs->num_regs > 0)
5344 {
5345 regs->start[0] = pos;
99633e97 5346 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5347 }
5e69f11e 5348
25fe55af
RS
5349 /* Go through the first `min (num_regs, regs->num_regs)'
5350 registers, since that is all we initialized. */
01618498 5351 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5352 {
01618498
SM
5353 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5354 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5355 else
5356 {
01618498
SM
5357 regs->start[reg]
5358 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5359 regs->end[reg]
5360 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5361 }
fa9a63c5 5362 }
5e69f11e 5363
25fe55af
RS
5364 /* If the regs structure we return has more elements than
5365 were in the pattern, set the extra elements to -1. If
5366 we (re)allocated the registers, this is the case,
5367 because we always allocate enough to have at least one
7814e705 5368 -1 at the end. */
01618498
SM
5369 for (reg = num_regs; reg < regs->num_regs; reg++)
5370 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5371 } /* regs && !bufp->no_sub */
5372
25fe55af
RS
5373 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5374 nfailure_points_pushed, nfailure_points_popped,
5375 nfailure_points_pushed - nfailure_points_popped);
5376 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5377
99633e97 5378 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5379
25fe55af 5380 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5381
25fe55af
RS
5382 FREE_VARIABLES ();
5383 return mcnt;
5384 }
fa9a63c5 5385
7814e705 5386 /* Otherwise match next pattern command. */
fa9a63c5
RM
5387 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5388 {
25fe55af
RS
5389 /* Ignore these. Used to ignore the n of succeed_n's which
5390 currently have n == 0. */
5391 case no_op:
5392 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5393 break;
fa9a63c5
RM
5394
5395 case succeed:
25fe55af 5396 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5397 goto succeed_label;
5398
7814e705 5399 /* Match the next n pattern characters exactly. The following
25fe55af 5400 byte in the pattern defines n, and the n bytes after that
7814e705 5401 are the characters to match. */
fa9a63c5
RM
5402 case exactn:
5403 mcnt = *p++;
25fe55af 5404 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5405
99633e97
SM
5406 /* Remember the start point to rollback upon failure. */
5407 dfail = d;
5408
6fdd04b0 5409#ifndef emacs
25fe55af
RS
5410 /* This is written out as an if-else so we don't waste time
5411 testing `translate' inside the loop. */
28703c16 5412 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5413 do
5414 {
5415 PREFETCH ();
5416 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5417 {
6fdd04b0
KH
5418 d = dfail;
5419 goto fail;
e934739e 5420 }
6fdd04b0
KH
5421 d++;
5422 }
5423 while (--mcnt);
fa9a63c5 5424 else
6fdd04b0
KH
5425 do
5426 {
5427 PREFETCH ();
5428 if (*d++ != *p++)
bf216479 5429 {
6fdd04b0
KH
5430 d = dfail;
5431 goto fail;
bf216479 5432 }
6fdd04b0
KH
5433 }
5434 while (--mcnt);
5435#else /* emacs */
5436 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5437 if (target_multibyte)
6fdd04b0
KH
5438 do
5439 {
5440 int pat_charlen, buf_charlen;
cf9c99bc 5441 int pat_ch, buf_ch;
e934739e 5442
6fdd04b0 5443 PREFETCH ();
cf9c99bc 5444 if (multibyte)
62a6e103 5445 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5446 else
5447 {
5448 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5449 pat_charlen = 1;
5450 }
62a6e103 5451 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5452
6fdd04b0 5453 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5454 {
6fdd04b0
KH
5455 d = dfail;
5456 goto fail;
e934739e 5457 }
bf216479 5458
6fdd04b0
KH
5459 p += pat_charlen;
5460 d += buf_charlen;
5461 mcnt -= pat_charlen;
5462 }
5463 while (mcnt > 0);
fa9a63c5 5464 else
6fdd04b0
KH
5465 do
5466 {
cf9c99bc
KH
5467 int pat_charlen, buf_charlen;
5468 int pat_ch, buf_ch;
bf216479 5469
6fdd04b0 5470 PREFETCH ();
cf9c99bc
KH
5471 if (multibyte)
5472 {
62a6e103 5473 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5474 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5475 }
5476 else
5477 {
5478 pat_ch = *p;
5479 pat_charlen = 1;
5480 }
5481 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5482 if (! CHAR_BYTE8_P (buf_ch))
5483 {
5484 buf_ch = TRANSLATE (buf_ch);
5485 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5486 if (buf_ch < 0)
5487 buf_ch = *d;
5488 }
0e2501ed
AS
5489 else
5490 buf_ch = *d;
cf9c99bc 5491 if (buf_ch != pat_ch)
6fdd04b0
KH
5492 {
5493 d = dfail;
5494 goto fail;
bf216479 5495 }
cf9c99bc
KH
5496 p += pat_charlen;
5497 d++;
6fdd04b0
KH
5498 }
5499 while (--mcnt);
5500#endif
25fe55af 5501 break;
fa9a63c5
RM
5502
5503
25fe55af 5504 /* Match any character except possibly a newline or a null. */
fa9a63c5 5505 case anychar:
e934739e
RS
5506 {
5507 int buf_charlen;
01618498 5508 re_wchar_t buf_ch;
fa9a63c5 5509
e934739e 5510 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5511
e934739e 5512 PREFETCH ();
62a6e103 5513 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5514 target_multibyte);
e934739e
RS
5515 buf_ch = TRANSLATE (buf_ch);
5516
5517 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5518 && buf_ch == '\n')
5519 || ((bufp->syntax & RE_DOT_NOT_NULL)
5520 && buf_ch == '\000'))
5521 goto fail;
5522
e934739e
RS
5523 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5524 d += buf_charlen;
5525 }
fa9a63c5
RM
5526 break;
5527
5528
5529 case charset:
5530 case charset_not:
5531 {
b18215fc 5532 register unsigned int c;
fa9a63c5 5533 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5534 int len;
5535
5536 /* Start of actual range_table, or end of bitmap if there is no
5537 range table. */
01618498 5538 re_char *range_table;
b18215fc 5539
96cc36cc 5540 /* Nonzero if there is a range table. */
b18215fc
RS
5541 int range_table_exists;
5542
96cc36cc
RS
5543 /* Number of ranges of range table. This is not included
5544 in the initial byte-length of the command. */
5545 int count = 0;
fa9a63c5 5546
f5020181
AS
5547 /* Whether matching against a unibyte character. */
5548 boolean unibyte_char = false;
5549
25fe55af 5550 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5551
b18215fc 5552 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5553
b18215fc 5554 if (range_table_exists)
96cc36cc
RS
5555 {
5556 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5557 EXTRACT_NUMBER_AND_INCR (count, range_table);
5558 }
b18215fc 5559
2d1675e4 5560 PREFETCH ();
62a6e103 5561 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5562 if (target_multibyte)
5563 {
5564 int c1;
b18215fc 5565
cf9c99bc
KH
5566 c = TRANSLATE (c);
5567 c1 = RE_CHAR_TO_UNIBYTE (c);
5568 if (c1 >= 0)
f5020181
AS
5569 {
5570 unibyte_char = true;
5571 c = c1;
5572 }
cf9c99bc
KH
5573 }
5574 else
5575 {
5576 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5577
5578 if (! CHAR_BYTE8_P (c1))
5579 {
5580 c1 = TRANSLATE (c1);
5581 c1 = RE_CHAR_TO_UNIBYTE (c1);
5582 if (c1 >= 0)
f5020181
AS
5583 {
5584 unibyte_char = true;
5585 c = c1;
5586 }
cf9c99bc 5587 }
0b8be006
AS
5588 else
5589 unibyte_char = true;
cf9c99bc
KH
5590 }
5591
f5020181 5592 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5593 { /* Lookup bitmap. */
b18215fc
RS
5594 /* Cast to `unsigned' instead of `unsigned char' in
5595 case the bit list is a full 32 bytes long. */
5596 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5597 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5598 not = !not;
b18215fc 5599 }
96cc36cc 5600#ifdef emacs
b18215fc 5601 else if (range_table_exists)
96cc36cc
RS
5602 {
5603 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5604
14473664
SM
5605 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5606 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5607 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5608 | (class_bits & BIT_SPACE && ISSPACE (c))
5609 | (class_bits & BIT_UPPER && ISUPPER (c))
5610 | (class_bits & BIT_WORD && ISWORD (c)))
5611 not = !not;
5612 else
5613 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5614 }
5615#endif /* emacs */
fa9a63c5 5616
96cc36cc
RS
5617 if (range_table_exists)
5618 p = CHARSET_RANGE_TABLE_END (range_table, count);
5619 else
5620 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5621
5622 if (!not) goto fail;
5e69f11e 5623
b18215fc 5624 d += len;
fa9a63c5
RM
5625 break;
5626 }
5627
5628
25fe55af 5629 /* The beginning of a group is represented by start_memory.
505bde11 5630 The argument is the register number. The text
25fe55af 5631 matched within the group is recorded (in the internal
7814e705 5632 registers data structure) under the register number. */
25fe55af 5633 case start_memory:
505bde11
SM
5634 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5635
5636 /* In case we need to undo this operation (via backtracking). */
5637 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5638
25fe55af 5639 regstart[*p] = d;
4bb91c68 5640 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5641 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5642
25fe55af 5643 /* Move past the register number. */
505bde11 5644 p += 1;
25fe55af 5645 break;
fa9a63c5
RM
5646
5647
25fe55af 5648 /* The stop_memory opcode represents the end of a group. Its
505bde11 5649 argument is the same as start_memory's: the register number. */
fa9a63c5 5650 case stop_memory:
505bde11
SM
5651 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5652
5653 assert (!REG_UNSET (regstart[*p]));
5654 /* Strictly speaking, there should be code such as:
177c0ea7 5655
0b32bf0e 5656 assert (REG_UNSET (regend[*p]));
505bde11
SM
5657 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5658
5659 But the only info to be pushed is regend[*p] and it is known to
5660 be UNSET, so there really isn't anything to push.
5661 Not pushing anything, on the other hand deprives us from the
5662 guarantee that regend[*p] is UNSET since undoing this operation
5663 will not reset its value properly. This is not important since
5664 the value will only be read on the next start_memory or at
5665 the very end and both events can only happen if this stop_memory
5666 is *not* undone. */
fa9a63c5 5667
25fe55af 5668 regend[*p] = d;
fa9a63c5
RM
5669 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5670
25fe55af 5671 /* Move past the register number. */
505bde11 5672 p += 1;
25fe55af 5673 break;
fa9a63c5
RM
5674
5675
5676 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5677 followed by the numeric value of <digit> as the register number. */
5678 case duplicate:
fa9a63c5 5679 {
66f0296e 5680 register re_char *d2, *dend2;
7814e705 5681 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5682 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5683
7814e705 5684 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5685 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5686 goto fail;
5e69f11e 5687
7814e705 5688 /* Where in input to try to start matching. */
25fe55af 5689 d2 = regstart[regno];
5e69f11e 5690
99633e97
SM
5691 /* Remember the start point to rollback upon failure. */
5692 dfail = d;
5693
25fe55af
RS
5694 /* Where to stop matching; if both the place to start and
5695 the place to stop matching are in the same string, then
5696 set to the place to stop, otherwise, for now have to use
5697 the end of the first string. */
fa9a63c5 5698
25fe55af 5699 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5700 == FIRST_STRING_P (regend[regno]))
5701 ? regend[regno] : end_match_1);
5702 for (;;)
5703 {
5704 /* If necessary, advance to next segment in register
25fe55af 5705 contents. */
fa9a63c5
RM
5706 while (d2 == dend2)
5707 {
5708 if (dend2 == end_match_2) break;
5709 if (dend2 == regend[regno]) break;
5710
25fe55af
RS
5711 /* End of string1 => advance to string2. */
5712 d2 = string2;
5713 dend2 = regend[regno];
fa9a63c5
RM
5714 }
5715 /* At end of register contents => success */
5716 if (d2 == dend2) break;
5717
5718 /* If necessary, advance to next segment in data. */
5719 PREFETCH ();
5720
5721 /* How many characters left in this segment to match. */
5722 mcnt = dend - d;
5e69f11e 5723
fa9a63c5 5724 /* Want how many consecutive characters we can match in
25fe55af
RS
5725 one shot, so, if necessary, adjust the count. */
5726 if (mcnt > dend2 - d2)
fa9a63c5 5727 mcnt = dend2 - d2;
5e69f11e 5728
fa9a63c5 5729 /* Compare that many; failure if mismatch, else move
25fe55af 5730 past them. */
28703c16 5731 if (RE_TRANSLATE_P (translate)
02cb78b5 5732 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5733 : memcmp (d, d2, mcnt))
99633e97
SM
5734 {
5735 d = dfail;
5736 goto fail;
5737 }
fa9a63c5 5738 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5739 }
5740 }
5741 break;
5742
5743
25fe55af 5744 /* begline matches the empty string at the beginning of the string
c0f9ea08 5745 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5746 case begline:
25fe55af 5747 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5748
25fe55af
RS
5749 if (AT_STRINGS_BEG (d))
5750 {
5751 if (!bufp->not_bol) break;
5752 }
419d1c74 5753 else
25fe55af 5754 {
bf216479 5755 unsigned c;
419d1c74 5756 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5757 if (c == '\n')
419d1c74 5758 break;
25fe55af
RS
5759 }
5760 /* In all other cases, we fail. */
5761 goto fail;
fa9a63c5
RM
5762
5763
25fe55af 5764 /* endline is the dual of begline. */
fa9a63c5 5765 case endline:
25fe55af 5766 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5767
25fe55af
RS
5768 if (AT_STRINGS_END (d))
5769 {
5770 if (!bufp->not_eol) break;
5771 }
f1ad044f 5772 else
25fe55af 5773 {
f1ad044f 5774 PREFETCH_NOLIMIT ();
c0f9ea08 5775 if (*d == '\n')
f1ad044f 5776 break;
25fe55af
RS
5777 }
5778 goto fail;
fa9a63c5
RM
5779
5780
5781 /* Match at the very beginning of the data. */
25fe55af
RS
5782 case begbuf:
5783 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5784 if (AT_STRINGS_BEG (d))
5785 break;
5786 goto fail;
fa9a63c5
RM
5787
5788
5789 /* Match at the very end of the data. */
25fe55af
RS
5790 case endbuf:
5791 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5792 if (AT_STRINGS_END (d))
5793 break;
25fe55af 5794 goto fail;
5e69f11e 5795
5e69f11e 5796
25fe55af
RS
5797 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5798 pushes NULL as the value for the string on the stack. Then
505bde11 5799 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5800 string, instead of restoring it. To see why, consider
7814e705 5801 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5802 then the . fails against the \n. But the next thing we want
5803 to do is match the \n against the \n; if we restored the
5804 string value, we would be back at the foo.
5805
5806 Because this is used only in specific cases, we don't need to
5807 check all the things that `on_failure_jump' does, to make
5808 sure the right things get saved on the stack. Hence we don't
5809 share its code. The only reason to push anything on the
5810 stack at all is that otherwise we would have to change
5811 `anychar's code to do something besides goto fail in this
5812 case; that seems worse than this. */
5813 case on_failure_keep_string_jump:
505bde11
SM
5814 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5815 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5816 mcnt, p + mcnt);
fa9a63c5 5817
505bde11
SM
5818 PUSH_FAILURE_POINT (p - 3, NULL);
5819 break;
5820
0683b6fa
SM
5821 /* A nasty loop is introduced by the non-greedy *? and +?.
5822 With such loops, the stack only ever contains one failure point
5823 at a time, so that a plain on_failure_jump_loop kind of
5824 cycle detection cannot work. Worse yet, such a detection
5825 can not only fail to detect a cycle, but it can also wrongly
5826 detect a cycle (between different instantiations of the same
6df42991 5827 loop).
0683b6fa
SM
5828 So the method used for those nasty loops is a little different:
5829 We use a special cycle-detection-stack-frame which is pushed
5830 when the on_failure_jump_nastyloop failure-point is *popped*.
5831 This special frame thus marks the beginning of one iteration
5832 through the loop and we can hence easily check right here
5833 whether something matched between the beginning and the end of
5834 the loop. */
5835 case on_failure_jump_nastyloop:
5836 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5837 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5838 mcnt, p + mcnt);
5839
5840 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5841 {
5842 int cycle = 0;
5843 CHECK_INFINITE_LOOP (p - 4, d);
5844 if (!cycle)
5845 /* If there's a cycle, just continue without pushing
5846 this failure point. The failure point is the "try again"
5847 option, which shouldn't be tried.
5848 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5849 PUSH_FAILURE_POINT (p - 3, d);
5850 }
0683b6fa
SM
5851 break;
5852
4e8a9132
SM
5853 /* Simple loop detecting on_failure_jump: just check on the
5854 failure stack if the same spot was already hit earlier. */
505bde11
SM
5855 case on_failure_jump_loop:
5856 on_failure:
5857 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5858 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5859 mcnt, p + mcnt);
6df42991
SM
5860 {
5861 int cycle = 0;
5862 CHECK_INFINITE_LOOP (p - 3, d);
5863 if (cycle)
5864 /* If there's a cycle, get out of the loop, as if the matching
5865 had failed. We used to just `goto fail' here, but that was
5866 aborting the search a bit too early: we want to keep the
5867 empty-loop-match and keep matching after the loop.
5868 We want (x?)*y\1z to match both xxyz and xxyxz. */
5869 p += mcnt;
5870 else
5871 PUSH_FAILURE_POINT (p - 3, d);
5872 }
25fe55af 5873 break;
fa9a63c5
RM
5874
5875
5876 /* Uses of on_failure_jump:
5e69f11e 5877
25fe55af
RS
5878 Each alternative starts with an on_failure_jump that points
5879 to the beginning of the next alternative. Each alternative
5880 except the last ends with a jump that in effect jumps past
5881 the rest of the alternatives. (They really jump to the
5882 ending jump of the following alternative, because tensioning
5883 these jumps is a hassle.)
fa9a63c5 5884
25fe55af
RS
5885 Repeats start with an on_failure_jump that points past both
5886 the repetition text and either the following jump or
5887 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5888 case on_failure_jump:
25fe55af 5889 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5890 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5891 mcnt, p + mcnt);
25fe55af 5892
505bde11 5893 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5894 break;
5895
4e8a9132 5896 /* This operation is used for greedy *.
505bde11
SM
5897 Compare the beginning of the repeat with what in the
5898 pattern follows its end. If we can establish that there
5899 is nothing that they would both match, i.e., that we
5900 would have to backtrack because of (as in, e.g., `a*a')
5901 then we can use a non-backtracking loop based on
4e8a9132 5902 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5903 case on_failure_jump_smart:
25fe55af 5904 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5905 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5906 mcnt, p + mcnt);
25fe55af 5907 {
01618498 5908 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5909 /* Here, we discard `const', making re_match non-reentrant. */
5910 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5911 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5912
505bde11
SM
5913 p -= 3; /* Reset so that we will re-execute the
5914 instruction once it's been changed. */
fa9a63c5 5915
4e8a9132
SM
5916 EXTRACT_NUMBER (mcnt, p2 - 2);
5917
5918 /* Ensure this is indeed the trivial kind of loop
5919 we are expecting. */
5920 assert (skip_one_char (p1) == p2 - 3);
5921 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5922 DEBUG_STATEMENT (debug += 2);
505bde11 5923 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5924 {
505bde11 5925 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5926 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5927 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5928 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5929 }
505bde11 5930 else
fa9a63c5 5931 {
505bde11
SM
5932 /* Default to a safe `on_failure_jump' loop. */
5933 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5934 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5935 }
99633e97 5936 DEBUG_STATEMENT (debug -= 2);
25fe55af 5937 }
505bde11 5938 break;
25fe55af
RS
5939
5940 /* Unconditionally jump (without popping any failure points). */
5941 case jump:
fa9a63c5 5942 unconditional_jump:
5b370c2b 5943 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5944 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5945 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5946 p += mcnt; /* Do the jump. */
505bde11 5947 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5948 break;
5949
5950
25fe55af
RS
5951 /* Have to succeed matching what follows at least n times.
5952 After that, handle like `on_failure_jump'. */
5953 case succeed_n:
01618498 5954 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5955 EXTRACT_NUMBER (mcnt, p + 2);
5956 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5957
dc1e502d
SM
5958 /* Originally, mcnt is how many times we HAVE to succeed. */
5959 if (mcnt != 0)
25fe55af 5960 {
6dcf2d0e
SM
5961 /* Here, we discard `const', making re_match non-reentrant. */
5962 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5963 mcnt--;
01618498
SM
5964 p += 4;
5965 PUSH_NUMBER (p2, mcnt);
25fe55af 5966 }
dc1e502d
SM
5967 else
5968 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5969 goto on_failure;
25fe55af
RS
5970 break;
5971
5972 case jump_n:
01618498 5973 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5974 EXTRACT_NUMBER (mcnt, p + 2);
5975 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5976
5977 /* Originally, this is how many times we CAN jump. */
dc1e502d 5978 if (mcnt != 0)
25fe55af 5979 {
6dcf2d0e
SM
5980 /* Here, we discard `const', making re_match non-reentrant. */
5981 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5982 mcnt--;
01618498 5983 PUSH_NUMBER (p2, mcnt);
dc1e502d 5984 goto unconditional_jump;
25fe55af
RS
5985 }
5986 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5987 else
5988 p += 4;
25fe55af 5989 break;
5e69f11e 5990
fa9a63c5
RM
5991 case set_number_at:
5992 {
01618498 5993 unsigned char *p2; /* Location of the counter. */
25fe55af 5994 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5995
25fe55af 5996 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5997 /* Here, we discard `const', making re_match non-reentrant. */
5998 p2 = (unsigned char*) p + mcnt;
01618498 5999 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 6000 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
6001 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6002 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
6003 break;
6004 }
9121ca40
KH
6005
6006 case wordbound:
66f0296e
SM
6007 case notwordbound:
6008 not = (re_opcode_t) *(p - 1) == notwordbound;
6009 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 6010
99633e97 6011 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 6012
b18215fc 6013 /* Case 1: D is at the beginning or the end of string. */
9121ca40 6014 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 6015 not = !not;
b18215fc
RS
6016 else
6017 {
6018 /* C1 is the character before D, S1 is the syntax of C1, C2
6019 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6020 re_wchar_t c1, c2;
6021 int s1, s2;
bf216479 6022 int dummy;
b18215fc 6023#ifdef emacs
2d1675e4
SM
6024 int offset = PTR_TO_OFFSET (d - 1);
6025 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 6026 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6027#endif
66f0296e 6028 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6029 s1 = SYNTAX (c1);
6030#ifdef emacs
5d967c7a 6031 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 6032#endif
f1ad044f 6033 PREFETCH_NOLIMIT ();
6fdd04b0 6034 GET_CHAR_AFTER (c2, d, dummy);
b18215fc
RS
6035 s2 = SYNTAX (c2);
6036
6037 if (/* Case 2: Only one of S1 and S2 is Sword. */
6038 ((s1 == Sword) != (s2 == Sword))
6039 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 6040 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 6041 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
6042 not = !not;
6043 }
6044 if (not)
9121ca40 6045 break;
b18215fc 6046 else
9121ca40 6047 goto fail;
fa9a63c5
RM
6048
6049 case wordbeg:
25fe55af 6050 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6051
b18215fc
RS
6052 /* We FAIL in one of the following cases: */
6053
7814e705 6054 /* Case 1: D is at the end of string. */
b18215fc 6055 if (AT_STRINGS_END (d))
99633e97 6056 goto fail;
b18215fc
RS
6057 else
6058 {
6059 /* C1 is the character before D, S1 is the syntax of C1, C2
6060 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6061 re_wchar_t c1, c2;
6062 int s1, s2;
bf216479 6063 int dummy;
fa9a63c5 6064#ifdef emacs
2d1675e4
SM
6065 int offset = PTR_TO_OFFSET (d);
6066 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6067 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6068#endif
99633e97 6069 PREFETCH ();
6fdd04b0 6070 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6071 s2 = SYNTAX (c2);
177c0ea7 6072
b18215fc
RS
6073 /* Case 2: S2 is not Sword. */
6074 if (s2 != Sword)
6075 goto fail;
6076
6077 /* Case 3: D is not at the beginning of string ... */
6078 if (!AT_STRINGS_BEG (d))
6079 {
6080 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6081#ifdef emacs
5d967c7a 6082 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6083#endif
b18215fc
RS
6084 s1 = SYNTAX (c1);
6085
6086 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6087 returns 0. */
b18215fc
RS
6088 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6089 goto fail;
6090 }
6091 }
e318085a
RS
6092 break;
6093
b18215fc 6094 case wordend:
25fe55af 6095 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6096
6097 /* We FAIL in one of the following cases: */
6098
6099 /* Case 1: D is at the beginning of string. */
6100 if (AT_STRINGS_BEG (d))
e318085a 6101 goto fail;
b18215fc
RS
6102 else
6103 {
6104 /* C1 is the character before D, S1 is the syntax of C1, C2
6105 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6106 re_wchar_t c1, c2;
6107 int s1, s2;
bf216479 6108 int dummy;
5d967c7a 6109#ifdef emacs
2d1675e4
SM
6110 int offset = PTR_TO_OFFSET (d) - 1;
6111 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6112 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6113#endif
99633e97 6114 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6115 s1 = SYNTAX (c1);
6116
6117 /* Case 2: S1 is not Sword. */
6118 if (s1 != Sword)
6119 goto fail;
6120
6121 /* Case 3: D is not at the end of string ... */
6122 if (!AT_STRINGS_END (d))
6123 {
f1ad044f 6124 PREFETCH_NOLIMIT ();
6fdd04b0 6125 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6126#ifdef emacs
6127 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6128#endif
b18215fc
RS
6129 s2 = SYNTAX (c2);
6130
6131 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6132 returns 0. */
b18215fc 6133 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6134 goto fail;
b18215fc
RS
6135 }
6136 }
e318085a
RS
6137 break;
6138
669fa600
SM
6139 case symbeg:
6140 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6141
6142 /* We FAIL in one of the following cases: */
6143
7814e705 6144 /* Case 1: D is at the end of string. */
669fa600
SM
6145 if (AT_STRINGS_END (d))
6146 goto fail;
6147 else
6148 {
6149 /* C1 is the character before D, S1 is the syntax of C1, C2
6150 is the character at D, and S2 is the syntax of C2. */
6151 re_wchar_t c1, c2;
6152 int s1, s2;
6153#ifdef emacs
6154 int offset = PTR_TO_OFFSET (d);
6155 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6156 UPDATE_SYNTAX_TABLE (charpos);
6157#endif
6158 PREFETCH ();
62a6e103 6159 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6160 s2 = SYNTAX (c2);
7814e705 6161
669fa600
SM
6162 /* Case 2: S2 is neither Sword nor Ssymbol. */
6163 if (s2 != Sword && s2 != Ssymbol)
6164 goto fail;
6165
6166 /* Case 3: D is not at the beginning of string ... */
6167 if (!AT_STRINGS_BEG (d))
6168 {
6169 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6170#ifdef emacs
6171 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6172#endif
6173 s1 = SYNTAX (c1);
6174
6175 /* ... and S1 is Sword or Ssymbol. */
6176 if (s1 == Sword || s1 == Ssymbol)
6177 goto fail;
6178 }
6179 }
6180 break;
6181
6182 case symend:
6183 DEBUG_PRINT1 ("EXECUTING symend.\n");
6184
6185 /* We FAIL in one of the following cases: */
6186
6187 /* Case 1: D is at the beginning of string. */
6188 if (AT_STRINGS_BEG (d))
6189 goto fail;
6190 else
6191 {
6192 /* C1 is the character before D, S1 is the syntax of C1, C2
6193 is the character at D, and S2 is the syntax of C2. */
6194 re_wchar_t c1, c2;
6195 int s1, s2;
6196#ifdef emacs
6197 int offset = PTR_TO_OFFSET (d) - 1;
6198 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6199 UPDATE_SYNTAX_TABLE (charpos);
6200#endif
6201 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6202 s1 = SYNTAX (c1);
6203
6204 /* Case 2: S1 is neither Ssymbol nor Sword. */
6205 if (s1 != Sword && s1 != Ssymbol)
6206 goto fail;
6207
6208 /* Case 3: D is not at the end of string ... */
6209 if (!AT_STRINGS_END (d))
6210 {
6211 PREFETCH_NOLIMIT ();
62a6e103 6212 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6213#ifdef emacs
134579f2 6214 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6215#endif
6216 s2 = SYNTAX (c2);
6217
6218 /* ... and S2 is Sword or Ssymbol. */
6219 if (s2 == Sword || s2 == Ssymbol)
6220 goto fail;
b18215fc
RS
6221 }
6222 }
e318085a
RS
6223 break;
6224
fa9a63c5 6225 case syntaxspec:
1fb352e0
SM
6226 case notsyntaxspec:
6227 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6228 mcnt = *p++;
1fb352e0 6229 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6230 PREFETCH ();
b18215fc
RS
6231#ifdef emacs
6232 {
2d1675e4
SM
6233 int offset = PTR_TO_OFFSET (d);
6234 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6235 UPDATE_SYNTAX_TABLE (pos1);
6236 }
25fe55af 6237#endif
b18215fc 6238 {
01618498
SM
6239 int len;
6240 re_wchar_t c;
b18215fc 6241
6fdd04b0 6242 GET_CHAR_AFTER (c, d, len);
990b2375 6243 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6244 goto fail;
b18215fc
RS
6245 d += len;
6246 }
fa9a63c5
RM
6247 break;
6248
b18215fc 6249#ifdef emacs
1fb352e0
SM
6250 case before_dot:
6251 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6252 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6253 goto fail;
b18215fc
RS
6254 break;
6255
1fb352e0
SM
6256 case at_dot:
6257 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6258 if (PTR_BYTE_POS (d) != PT_BYTE)
6259 goto fail;
6260 break;
b18215fc 6261
1fb352e0
SM
6262 case after_dot:
6263 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6264 if (PTR_BYTE_POS (d) <= PT_BYTE)
6265 goto fail;
e318085a 6266 break;
fa9a63c5 6267
1fb352e0 6268 case categoryspec:
b18215fc 6269 case notcategoryspec:
1fb352e0 6270 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6271 mcnt = *p++;
1fb352e0 6272 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6273 PREFETCH ();
6274 {
01618498
SM
6275 int len;
6276 re_wchar_t c;
6277
6fdd04b0 6278 GET_CHAR_AFTER (c, d, len);
1fb352e0 6279 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6280 goto fail;
6281 d += len;
6282 }
fa9a63c5 6283 break;
5e69f11e 6284
1fb352e0 6285#endif /* emacs */
5e69f11e 6286
0b32bf0e
SM
6287 default:
6288 abort ();
fa9a63c5 6289 }
b18215fc 6290 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6291
6292
6293 /* We goto here if a matching operation fails. */
6294 fail:
5b370c2b 6295 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6296 if (!FAIL_STACK_EMPTY ())
505bde11 6297 {
01618498 6298 re_char *str, *pat;
505bde11 6299 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6300 DEBUG_PRINT1 ("\nFAIL:\n");
6301 POP_FAILURE_POINT (str, pat);
505bde11
SM
6302 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6303 {
6304 case on_failure_keep_string_jump:
6305 assert (str == NULL);
6306 goto continue_failure_jump;
6307
0683b6fa
SM
6308 case on_failure_jump_nastyloop:
6309 assert ((re_opcode_t)pat[-2] == no_op);
6310 PUSH_FAILURE_POINT (pat - 2, str);
6311 /* Fallthrough */
6312
505bde11
SM
6313 case on_failure_jump_loop:
6314 case on_failure_jump:
6315 case succeed_n:
6316 d = str;
6317 continue_failure_jump:
6318 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6319 p = pat + mcnt;
6320 break;
b18215fc 6321
0683b6fa
SM
6322 case no_op:
6323 /* A special frame used for nastyloops. */
6324 goto fail;
6325
505bde11
SM
6326 default:
6327 abort();
6328 }
fa9a63c5 6329
505bde11 6330 assert (p >= bufp->buffer && p <= pend);
b18215fc 6331
0b32bf0e 6332 if (d >= string1 && d <= end1)
fa9a63c5 6333 dend = end_match_1;
0b32bf0e 6334 }
fa9a63c5 6335 else
0b32bf0e 6336 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6337 } /* for (;;) */
6338
6339 if (best_regs_set)
6340 goto restore_best_regs;
6341
6342 FREE_VARIABLES ();
6343
b18215fc 6344 return -1; /* Failure to match. */
fa9a63c5
RM
6345} /* re_match_2 */
6346\f
6347/* Subroutine definitions for re_match_2. */
6348
fa9a63c5
RM
6349/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6350 bytes; nonzero otherwise. */
5e69f11e 6351
fa9a63c5 6352static int
438105ed
JB
6353bcmp_translate (const re_char *s1, const re_char *s2, register int len,
6354 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6355{
2d1675e4
SM
6356 register re_char *p1 = s1, *p2 = s2;
6357 re_char *p1_end = s1 + len;
6358 re_char *p2_end = s2 + len;
e934739e 6359
4bb91c68
SM
6360 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6361 different lengths, but relying on a single `len' would break this. -sm */
6362 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6363 {
e934739e 6364 int p1_charlen, p2_charlen;
01618498 6365 re_wchar_t p1_ch, p2_ch;
e934739e 6366
6fdd04b0
KH
6367 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6368 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6369
6370 if (RE_TRANSLATE (translate, p1_ch)
6371 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6372 return 1;
e934739e
RS
6373
6374 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6375 }
e934739e
RS
6376
6377 if (p1 != p1_end || p2 != p2_end)
6378 return 1;
6379
fa9a63c5
RM
6380 return 0;
6381}
6382\f
6383/* Entry points for GNU code. */
6384
6385/* re_compile_pattern is the GNU regular expression compiler: it
6386 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6387 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6388
fa9a63c5
RM
6389 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6390 are set in BUFP on entry.
5e69f11e 6391
b18215fc 6392 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6393
6394const char *
971de7fb 6395re_compile_pattern (const char *pattern, size_t length, struct re_pattern_buffer *bufp)
fa9a63c5
RM
6396{
6397 reg_errcode_t ret;
5e69f11e 6398
fa9a63c5
RM
6399 /* GNU code is written to assume at least RE_NREGS registers will be set
6400 (and at least one extra will be -1). */
6401 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6402
fa9a63c5
RM
6403 /* And GNU code determines whether or not to get register information
6404 by passing null for the REGS argument to re_match, etc., not by
6405 setting no_sub. */
6406 bufp->no_sub = 0;
5e69f11e 6407
4bb91c68 6408 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6409
6410 if (!ret)
6411 return NULL;
6412 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6413}
c0f9ea08 6414WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6415\f
b18215fc
RS
6416/* Entry points compatible with 4.2 BSD regex library. We don't define
6417 them unless specifically requested. */
fa9a63c5 6418
0b32bf0e 6419#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6420
6421/* BSD has one and only one pattern buffer. */
6422static struct re_pattern_buffer re_comp_buf;
6423
6424char *
0b32bf0e 6425# ifdef _LIBC
48afdd44
RM
6426/* Make these definitions weak in libc, so POSIX programs can redefine
6427 these names if they don't use our functions, and still use
6428 regcomp/regexec below without link errors. */
6429weak_function
0b32bf0e 6430# endif
fa9a63c5
RM
6431re_comp (s)
6432 const char *s;
6433{
6434 reg_errcode_t ret;
5e69f11e 6435
fa9a63c5
RM
6436 if (!s)
6437 {
6438 if (!re_comp_buf.buffer)
0b32bf0e 6439 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6440 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6441 return 0;
6442 }
6443
6444 if (!re_comp_buf.buffer)
6445 {
6446 re_comp_buf.buffer = (unsigned char *) malloc (200);
6447 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6448 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6449 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6450 re_comp_buf.allocated = 200;
6451
6452 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6453 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6454 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6455 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6456 }
6457
6458 /* Since `re_exec' always passes NULL for the `regs' argument, we
6459 don't need to initialize the pattern buffer fields which affect it. */
6460
fa9a63c5 6461 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6462
fa9a63c5
RM
6463 if (!ret)
6464 return NULL;
6465
6466 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6467 return (char *) gettext (re_error_msgid[(int) ret]);
6468}
6469
6470
6471int
0b32bf0e 6472# ifdef _LIBC
48afdd44 6473weak_function
0b32bf0e 6474# endif
fa9a63c5
RM
6475re_exec (s)
6476 const char *s;
6477{
6478 const int len = strlen (s);
6479 return
6480 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6481}
6482#endif /* _REGEX_RE_COMP */
6483\f
6484/* POSIX.2 functions. Don't define these for Emacs. */
6485
6486#ifndef emacs
6487
6488/* regcomp takes a regular expression as a string and compiles it.
6489
b18215fc 6490 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6491 since POSIX says we shouldn't. Thus, we set
6492
6493 `buffer' to the compiled pattern;
6494 `used' to the length of the compiled pattern;
6495 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6496 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6497 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6498 `fastmap' to an allocated space for the fastmap;
6499 `fastmap_accurate' to zero;
fa9a63c5
RM
6500 `re_nsub' to the number of subexpressions in PATTERN.
6501
6502 PATTERN is the address of the pattern string.
6503
6504 CFLAGS is a series of bits which affect compilation.
6505
6506 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6507 use POSIX basic syntax.
6508
6509 If REG_NEWLINE is set, then . and [^...] don't match newline.
6510 Also, regexec will try a match beginning after every newline.
6511
6512 If REG_ICASE is set, then we considers upper- and lowercase
6513 versions of letters to be equivalent when matching.
6514
6515 If REG_NOSUB is set, then when PREG is passed to regexec, that
6516 routine will report only success or failure, and nothing about the
6517 registers.
6518
b18215fc 6519 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6520 the return codes and their meanings.) */
6521
6522int
d2762c86
DN
6523regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6524 int cflags)
fa9a63c5
RM
6525{
6526 reg_errcode_t ret;
4bb91c68 6527 reg_syntax_t syntax
fa9a63c5
RM
6528 = (cflags & REG_EXTENDED) ?
6529 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6530
6531 /* regex_compile will allocate the space for the compiled pattern. */
6532 preg->buffer = 0;
6533 preg->allocated = 0;
6534 preg->used = 0;
5e69f11e 6535
c0f9ea08
SM
6536 /* Try to allocate space for the fastmap. */
6537 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6538
fa9a63c5
RM
6539 if (cflags & REG_ICASE)
6540 {
6541 unsigned i;
5e69f11e 6542
6676cb1c
RS
6543 preg->translate
6544 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6545 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6546 if (preg->translate == NULL)
0b32bf0e 6547 return (int) REG_ESPACE;
fa9a63c5
RM
6548
6549 /* Map uppercase characters to corresponding lowercase ones. */
6550 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6551 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6552 }
6553 else
6554 preg->translate = NULL;
6555
6556 /* If REG_NEWLINE is set, newlines are treated differently. */
6557 if (cflags & REG_NEWLINE)
6558 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6559 syntax &= ~RE_DOT_NEWLINE;
6560 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6561 }
6562 else
c0f9ea08 6563 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6564
6565 preg->no_sub = !!(cflags & REG_NOSUB);
6566
5e69f11e 6567 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6568 can use strlen here in compiling the pattern. */
4bb91c68 6569 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6570
fa9a63c5
RM
6571 /* POSIX doesn't distinguish between an unmatched open-group and an
6572 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6573 if (ret == REG_ERPAREN)
6574 ret = REG_EPAREN;
6575
6576 if (ret == REG_NOERROR && preg->fastmap)
6577 { /* Compute the fastmap now, since regexec cannot modify the pattern
6578 buffer. */
6579 re_compile_fastmap (preg);
6580 if (preg->can_be_null)
6581 { /* The fastmap can't be used anyway. */
6582 free (preg->fastmap);
6583 preg->fastmap = NULL;
6584 }
6585 }
fa9a63c5
RM
6586 return (int) ret;
6587}
c0f9ea08 6588WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6589
6590
6591/* regexec searches for a given pattern, specified by PREG, in the
6592 string STRING.
5e69f11e 6593
fa9a63c5 6594 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6595 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6596 least NMATCH elements, and we set them to the offsets of the
6597 corresponding matched substrings.
5e69f11e 6598
fa9a63c5
RM
6599 EFLAGS specifies `execution flags' which affect matching: if
6600 REG_NOTBOL is set, then ^ does not match at the beginning of the
6601 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6602
fa9a63c5
RM
6603 We return 0 if we find a match and REG_NOMATCH if not. */
6604
6605int
d2762c86
DN
6606regexec (const regex_t *__restrict preg, const char *__restrict string,
6607 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
fa9a63c5
RM
6608{
6609 int ret;
6610 struct re_registers regs;
6611 regex_t private_preg;
6612 int len = strlen (string);
c0f9ea08 6613 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6614
6615 private_preg = *preg;
5e69f11e 6616
fa9a63c5
RM
6617 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6618 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6619
fa9a63c5
RM
6620 /* The user has told us exactly how many registers to return
6621 information about, via `nmatch'. We have to pass that on to the
b18215fc 6622 matching routines. */
fa9a63c5 6623 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6624
fa9a63c5
RM
6625 if (want_reg_info)
6626 {
6627 regs.num_regs = nmatch;
4bb91c68
SM
6628 regs.start = TALLOC (nmatch * 2, regoff_t);
6629 if (regs.start == NULL)
0b32bf0e 6630 return (int) REG_NOMATCH;
4bb91c68 6631 regs.end = regs.start + nmatch;
fa9a63c5
RM
6632 }
6633
c0f9ea08
SM
6634 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6635 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6636 was a little bit longer but still only matching the real part.
6637 This works because the `endline' will check for a '\n' and will find a
6638 '\0', correctly deciding that this is not the end of a line.
6639 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6640 a convenient '\0' there. For all we know, the string could be preceded
6641 by '\n' which would throw things off. */
6642
fa9a63c5
RM
6643 /* Perform the searching operation. */
6644 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6645 /* start: */ 0, /* range: */ len,
6646 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6647
fa9a63c5
RM
6648 /* Copy the register information to the POSIX structure. */
6649 if (want_reg_info)
6650 {
6651 if (ret >= 0)
0b32bf0e
SM
6652 {
6653 unsigned r;
fa9a63c5 6654
0b32bf0e
SM
6655 for (r = 0; r < nmatch; r++)
6656 {
6657 pmatch[r].rm_so = regs.start[r];
6658 pmatch[r].rm_eo = regs.end[r];
6659 }
6660 }
fa9a63c5 6661
b18215fc 6662 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6663 free (regs.start);
fa9a63c5
RM
6664 }
6665
6666 /* We want zero return to mean success, unlike `re_search'. */
6667 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6668}
c0f9ea08 6669WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6670
6671
ec869672
JR
6672/* Returns a message corresponding to an error code, ERR_CODE, returned
6673 from either regcomp or regexec. We don't use PREG here.
6674
6675 ERR_CODE was previously called ERRCODE, but that name causes an
6676 error with msvc8 compiler. */
fa9a63c5
RM
6677
6678size_t
d2762c86 6679regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6680{
6681 const char *msg;
6682 size_t msg_size;
6683
ec869672
JR
6684 if (err_code < 0
6685 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6686 /* Only error codes returned by the rest of the code should be passed
b18215fc 6687 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6688 code generates an invalid error code, then the program has a bug.
6689 Dump core so we can fix it. */
6690 abort ();
6691
ec869672 6692 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6693
6694 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6695
fa9a63c5
RM
6696 if (errbuf_size != 0)
6697 {
6698 if (msg_size > errbuf_size)
0b32bf0e
SM
6699 {
6700 strncpy (errbuf, msg, errbuf_size - 1);
6701 errbuf[errbuf_size - 1] = 0;
6702 }
fa9a63c5 6703 else
0b32bf0e 6704 strcpy (errbuf, msg);
fa9a63c5
RM
6705 }
6706
6707 return msg_size;
6708}
c0f9ea08 6709WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6710
6711
6712/* Free dynamically allocated space used by PREG. */
6713
6714void
d2762c86 6715regfree (regex_t *preg)
fa9a63c5 6716{
c2cd06e6 6717 free (preg->buffer);
fa9a63c5 6718 preg->buffer = NULL;
5e69f11e 6719
fa9a63c5
RM
6720 preg->allocated = 0;
6721 preg->used = 0;
6722
c2cd06e6 6723 free (preg->fastmap);
fa9a63c5
RM
6724 preg->fastmap = NULL;
6725 preg->fastmap_accurate = 0;
6726
c2cd06e6 6727 free (preg->translate);
fa9a63c5
RM
6728 preg->translate = NULL;
6729}
c0f9ea08 6730WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6731
6732#endif /* not emacs */