Convert consecutive FSF copyright years to ranges.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
73b0cd50 5 Copyright (C) 1993-2011
e468b87f 6 Free Software Foundation, Inc.
bc78d348 7
fa9a63c5
RM
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
e468b87f 10 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
4fc5845f 20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 21 USA. */
fa9a63c5 22
6df42991 23/* TODO:
505bde11 24 - structure the opcode space into opcode+flag.
dc1e502d 25 - merge with glibc's regex.[ch].
01618498 26 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
27 need to modify the compiled regexp so that re_match can be reentrant.
28 - get rid of on_failure_jump_smart by doing the optimization in re_comp
29 rather than at run-time, so that re_match can be reentrant.
01618498 30*/
505bde11 31
fa9a63c5 32/* AIX requires this to be the first thing in the file. */
0b32bf0e 33#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
34 #pragma alloca
35#endif
36
fa9a63c5 37#ifdef HAVE_CONFIG_H
0b32bf0e 38# include <config.h>
fa9a63c5
RM
39#endif
40
4bb91c68
SM
41#if defined STDC_HEADERS && !defined emacs
42# include <stddef.h>
43#else
44/* We need this for `regex.h', and perhaps for the Emacs include files. */
45# include <sys/types.h>
46#endif
fa9a63c5 47
14473664
SM
48/* Whether to use ISO C Amendment 1 wide char functions.
49 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
50#if defined _LIBC
51#define WIDE_CHAR_SUPPORT 1
52#else
14473664 53#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
54 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
55#endif
14473664
SM
56
57/* For platform which support the ISO C amendement 1 functionality we
58 support user defined character classes. */
a0ad02f7 59#if WIDE_CHAR_SUPPORT
14473664
SM
60/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
61# include <wchar.h>
62# include <wctype.h>
63#endif
64
c0f9ea08
SM
65#ifdef _LIBC
66/* We have to keep the namespace clean. */
67# define regfree(preg) __regfree (preg)
68# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
69# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
70# define regerror(err_code, preg, errbuf, errbuf_size) \
71 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
72# define re_set_registers(bu, re, nu, st, en) \
73 __re_set_registers (bu, re, nu, st, en)
74# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
75 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
76# define re_match(bufp, string, size, pos, regs) \
77 __re_match (bufp, string, size, pos, regs)
78# define re_search(bufp, string, size, startpos, range, regs) \
79 __re_search (bufp, string, size, startpos, range, regs)
80# define re_compile_pattern(pattern, length, bufp) \
81 __re_compile_pattern (pattern, length, bufp)
82# define re_set_syntax(syntax) __re_set_syntax (syntax)
83# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
84 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
85# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
86
14473664
SM
87/* Make sure we call libc's function even if the user overrides them. */
88# define btowc __btowc
89# define iswctype __iswctype
90# define wctype __wctype
91
c0f9ea08
SM
92# define WEAK_ALIAS(a,b) weak_alias (a, b)
93
94/* We are also using some library internals. */
95# include <locale/localeinfo.h>
96# include <locale/elem-hash.h>
97# include <langinfo.h>
98#else
99# define WEAK_ALIAS(a,b)
100#endif
101
4bb91c68 102/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 103#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
104# include <libintl.h>
105#else
106# define gettext(msgid) (msgid)
107#endif
108
5e69f11e
RM
109#ifndef gettext_noop
110/* This define is so xgettext can find the internationalizable
111 strings. */
0b32bf0e 112# define gettext_noop(String) String
5e69f11e
RM
113#endif
114
fa9a63c5
RM
115/* The `emacs' switch turns on certain matching commands
116 that make sense only in Emacs. */
117#ifdef emacs
118
d7306fe6 119# include <setjmp.h>
0b32bf0e
SM
120# include "lisp.h"
121# include "buffer.h"
b18215fc
RS
122
123/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 124# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 125
0b32bf0e 126# include "syntax.h"
9117d724 127# include "character.h"
0b32bf0e 128# include "category.h"
fa9a63c5 129
7689ef0b
EZ
130# ifdef malloc
131# undef malloc
132# endif
0b32bf0e 133# define malloc xmalloc
7689ef0b
EZ
134# ifdef realloc
135# undef realloc
136# endif
0b32bf0e 137# define realloc xrealloc
7689ef0b
EZ
138# ifdef free
139# undef free
140# endif
0b32bf0e 141# define free xfree
9abbd165 142
7814e705 143/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
144# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
145# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
146
147# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 148# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
149# define RE_STRING_CHAR(p, multibyte) \
150 (multibyte ? (STRING_CHAR (p)) : (*(p)))
151# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
152 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 153
4c0354d7 154# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 155
2afc21f5 156# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 157
6fdd04b0
KH
158/* Set C a (possibly converted to multibyte) character before P. P
159 points into a string which is the virtual concatenation of STR1
160 (which ends at END1) or STR2 (which ends at END2). */
bf216479
KH
161# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
162 do { \
02cb78b5 163 if (target_multibyte) \
bf216479
KH
164 { \
165 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
166 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
167 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 168 c = STRING_CHAR (dtemp); \
bf216479
KH
169 } \
170 else \
171 { \
172 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 173 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 174 } \
2d1675e4
SM
175 } while (0)
176
6fdd04b0
KH
177/* Set C a (possibly converted to multibyte) character at P, and set
178 LEN to the byte length of that character. */
179# define GET_CHAR_AFTER(c, p, len) \
180 do { \
02cb78b5 181 if (target_multibyte) \
62a6e103 182 (c) = STRING_CHAR_AND_LENGTH (p, len); \
6fdd04b0
KH
183 else \
184 { \
cf9c99bc 185 (c) = *p; \
6fdd04b0 186 len = 1; \
cf9c99bc 187 (c) = RE_CHAR_TO_MULTIBYTE (c); \
6fdd04b0 188 } \
8f924df7 189 } while (0)
4e8a9132 190
fa9a63c5
RM
191#else /* not emacs */
192
193/* If we are not linking with Emacs proper,
194 we can't use the relocating allocator
195 even if config.h says that we can. */
0b32bf0e 196# undef REL_ALLOC
fa9a63c5 197
4004364e 198# include <unistd.h>
fa9a63c5 199
a77f947b
CY
200/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
201
202void *
d2762c86 203xmalloc (size_t size)
a77f947b
CY
204{
205 register void *val;
206 val = (void *) malloc (size);
207 if (!val && size)
208 {
209 write (2, "virtual memory exhausted\n", 25);
210 exit (1);
211 }
212 return val;
213}
214
215void *
d2762c86 216xrealloc (void *block, size_t size)
a77f947b
CY
217{
218 register void *val;
219 /* We must call malloc explicitly when BLOCK is 0, since some
220 reallocs don't do this. */
221 if (! block)
222 val = (void *) malloc (size);
223 else
224 val = (void *) realloc (block, size);
225 if (!val && size)
226 {
227 write (2, "virtual memory exhausted\n", 25);
228 exit (1);
229 }
230 return val;
231}
232
a073faa6
CY
233# ifdef malloc
234# undef malloc
235# endif
236# define malloc xmalloc
237# ifdef realloc
238# undef realloc
239# endif
240# define realloc xrealloc
241
72af86bd
AS
242/* This is the normal way of making sure we have memcpy, memcmp and memset. */
243# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
244# include <string.h>
245# else
246# include <strings.h>
247# ifndef memcmp
248# define memcmp(s1, s2, n) bcmp (s1, s2, n)
0b32bf0e 249# endif
72af86bd
AS
250# ifndef memcpy
251# define memcpy(d, s, n) (bcopy (s, d, n), (d))
0b32bf0e
SM
252# endif
253# endif
fa9a63c5
RM
254
255/* Define the syntax stuff for \<, \>, etc. */
256
990b2375 257/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 258enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 259
0b32bf0e 260# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 261
e934739e 262/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
263# define CHAR_CHARSET(c) 0
264# define CHARSET_LEADING_CODE_BASE(c) 0
265# define MAX_MULTIBYTE_LENGTH 1
266# define RE_MULTIBYTE_P(x) 0
bf216479 267# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
268# define WORD_BOUNDARY_P(c1, c2) (0)
269# define CHAR_HEAD_P(p) (1)
270# define SINGLE_BYTE_CHAR_P(c) (1)
271# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 272# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 273# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
274# define STRING_CHAR(p) (*(p))
275# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 276# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
277# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
278# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
279# define RE_CHAR_TO_MULTIBYTE(c) (c)
280# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 281# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 282 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
283# define GET_CHAR_AFTER(c, p, len) \
284 (c = *p, len = 1)
0b32bf0e 285# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
286# define BYTE8_TO_CHAR(c) (c)
287# define CHAR_BYTE8_P(c) (0)
bf216479 288# define CHAR_LEADING_CODE(c) (c)
8f924df7 289
fa9a63c5 290#endif /* not emacs */
4e8a9132
SM
291
292#ifndef RE_TRANSLATE
0b32bf0e
SM
293# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
294# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 295#endif
fa9a63c5
RM
296\f
297/* Get the interface, including the syntax bits. */
298#include "regex.h"
299
f71b19b6
DL
300/* isalpha etc. are used for the character classes. */
301#include <ctype.h>
fa9a63c5 302
f71b19b6 303#ifdef emacs
fa9a63c5 304
f71b19b6 305/* 1 if C is an ASCII character. */
0b32bf0e 306# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 307
f71b19b6 308/* 1 if C is a unibyte character. */
0b32bf0e 309# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 310
f71b19b6 311/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 312
f71b19b6 313/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
314# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
315# define ISCNTRL(c) ((c) < ' ')
316# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
317 || ((c) >= 'a' && (c) <= 'f') \
318 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
319
320/* This is only used for single-byte characters. */
0b32bf0e 321# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
322
323/* The rest must handle multibyte characters. */
324
0b32bf0e 325# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 326 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
327 : 1)
328
14473664 329# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 330 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
331 : 1)
332
0b32bf0e 333# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
334 ? (((c) >= 'a' && (c) <= 'z') \
335 || ((c) >= 'A' && (c) <= 'Z') \
336 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
337 : SYNTAX (c) == Sword)
338
0b32bf0e 339# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
340 ? (((c) >= 'a' && (c) <= 'z') \
341 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
342 : SYNTAX (c) == Sword)
343
0b32bf0e 344# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 345
0b32bf0e 346# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
347 ? ((c) > ' ' && (c) < 0177 \
348 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
349 || ((c) >= 'A' && (c) <= 'Z') \
350 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
351 : SYNTAX (c) != Sword)
352
0b32bf0e 353# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 354
0b32bf0e 355# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 356
0b32bf0e 357# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
358
359#else /* not emacs */
360
f71b19b6
DL
361/* Jim Meyering writes:
362
363 "... Some ctype macros are valid only for character codes that
364 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
365 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 366 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
367 STDC_HEADERS is defined, then autoconf has verified that the ctype
368 macros don't need to be guarded with references to isascii. ...
369 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
370 eliminate the && through constant folding."
371 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 372
4bb91c68 373# undef ISASCII
0b32bf0e
SM
374# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
375# define ISASCII(c) 1
376# else
377# define ISASCII(c) isascii(c)
378# endif
f71b19b6
DL
379
380/* 1 if C is an ASCII character. */
0b32bf0e 381# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
382
383/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
384# define ISUNIBYTE(c) 1
385
386# ifdef isblank
387# define ISBLANK(c) (ISASCII (c) && isblank (c))
388# else
389# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
390# endif
391# ifdef isgraph
392# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
393# else
394# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
395# endif
396
4bb91c68 397# undef ISPRINT
0b32bf0e
SM
398# define ISPRINT(c) (ISASCII (c) && isprint (c))
399# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
400# define ISALNUM(c) (ISASCII (c) && isalnum (c))
401# define ISALPHA(c) (ISASCII (c) && isalpha (c))
402# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
403# define ISLOWER(c) (ISASCII (c) && islower (c))
404# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
405# define ISSPACE(c) (ISASCII (c) && isspace (c))
406# define ISUPPER(c) (ISASCII (c) && isupper (c))
407# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
408
409# define ISWORD(c) ISALPHA(c)
410
4bb91c68
SM
411# ifdef _tolower
412# define TOLOWER(c) _tolower(c)
413# else
414# define TOLOWER(c) tolower(c)
415# endif
416
417/* How many characters in the character set. */
418# define CHAR_SET_SIZE 256
419
0b32bf0e 420# ifdef SYNTAX_TABLE
f71b19b6 421
0b32bf0e 422extern char *re_syntax_table;
f71b19b6 423
0b32bf0e
SM
424# else /* not SYNTAX_TABLE */
425
0b32bf0e
SM
426static char re_syntax_table[CHAR_SET_SIZE];
427
428static void
d2762c86 429init_syntax_once (void)
0b32bf0e
SM
430{
431 register int c;
432 static int done = 0;
433
434 if (done)
435 return;
436
72af86bd 437 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 438
4bb91c68
SM
439 for (c = 0; c < CHAR_SET_SIZE; ++c)
440 if (ISALNUM (c))
441 re_syntax_table[c] = Sword;
fa9a63c5 442
669fa600 443 re_syntax_table['_'] = Ssymbol;
fa9a63c5 444
0b32bf0e
SM
445 done = 1;
446}
447
448# endif /* not SYNTAX_TABLE */
96cc36cc 449
4bb91c68
SM
450# define SYNTAX(c) re_syntax_table[(c)]
451
96cc36cc
RS
452#endif /* not emacs */
453\f
fa9a63c5 454#ifndef NULL
0b32bf0e 455# define NULL (void *)0
fa9a63c5
RM
456#endif
457
458/* We remove any previous definition of `SIGN_EXTEND_CHAR',
459 since ours (we hope) works properly with all combinations of
460 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 461 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
462#undef SIGN_EXTEND_CHAR
463#if __STDC__
0b32bf0e 464# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
465#else /* not __STDC__ */
466/* As in Harbison and Steele. */
0b32bf0e 467# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
468#endif
469\f
470/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
471 use `alloca' instead of `malloc'. This is because using malloc in
472 re_search* or re_match* could cause memory leaks when C-g is used in
473 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
474 the other hand, malloc is more portable, and easier to debug.
475
fa9a63c5
RM
476 Because we sometimes use alloca, some routines have to be macros,
477 not functions -- `alloca'-allocated space disappears at the end of the
478 function it is called in. */
479
480#ifdef REGEX_MALLOC
481
0b32bf0e
SM
482# define REGEX_ALLOCATE malloc
483# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
484# define REGEX_FREE free
fa9a63c5
RM
485
486#else /* not REGEX_MALLOC */
487
488/* Emacs already defines alloca, sometimes. */
0b32bf0e 489# ifndef alloca
fa9a63c5
RM
490
491/* Make alloca work the best possible way. */
0b32bf0e
SM
492# ifdef __GNUC__
493# define alloca __builtin_alloca
494# else /* not __GNUC__ */
7f585e7a 495# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
496# include <alloca.h>
497# endif /* HAVE_ALLOCA_H */
498# endif /* not __GNUC__ */
fa9a63c5 499
0b32bf0e 500# endif /* not alloca */
fa9a63c5 501
0b32bf0e 502# define REGEX_ALLOCATE alloca
fa9a63c5
RM
503
504/* Assumes a `char *destination' variable. */
0b32bf0e 505# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 506 (destination = (char *) alloca (nsize), \
4bb91c68 507 memcpy (destination, source, osize))
fa9a63c5
RM
508
509/* No need to do anything to free, after alloca. */
0b32bf0e 510# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
511
512#endif /* not REGEX_MALLOC */
513
514/* Define how to allocate the failure stack. */
515
0b32bf0e 516#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 517
0b32bf0e 518# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 519 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 520# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 521 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 522# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
523 r_alloc_free (&failure_stack_ptr)
524
4297555e 525#else /* not using relocating allocator */
fa9a63c5 526
0b32bf0e 527# ifdef REGEX_MALLOC
fa9a63c5 528
0b32bf0e
SM
529# define REGEX_ALLOCATE_STACK malloc
530# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
531# define REGEX_FREE_STACK free
fa9a63c5 532
0b32bf0e 533# else /* not REGEX_MALLOC */
fa9a63c5 534
0b32bf0e 535# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 536
0b32bf0e 537# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 538 REGEX_REALLOCATE (source, osize, nsize)
7814e705 539/* No need to explicitly free anything. */
0b32bf0e 540# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 541
0b32bf0e 542# endif /* not REGEX_MALLOC */
4297555e 543#endif /* not using relocating allocator */
fa9a63c5
RM
544
545
546/* True if `size1' is non-NULL and PTR is pointing anywhere inside
547 `string1' or just past its end. This works if PTR is NULL, which is
548 a good thing. */
25fe55af 549#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
550 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
551
552/* (Re)Allocate N items of type T using malloc, or fail. */
553#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
554#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
555#define RETALLOC_IF(addr, n, t) \
556 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
557#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
558
4bb91c68 559#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
560
561#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
562
563#undef MAX
564#undef MIN
565#define MAX(a, b) ((a) > (b) ? (a) : (b))
566#define MIN(a, b) ((a) < (b) ? (a) : (b))
567
66f0296e
SM
568/* Type of source-pattern and string chars. */
569typedef const unsigned char re_char;
570
fa9a63c5
RM
571typedef char boolean;
572#define false 0
573#define true 1
574
4bb91c68
SM
575static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
576 re_char *string1, int size1,
577 re_char *string2, int size2,
578 int pos,
579 struct re_registers *regs,
580 int stop));
fa9a63c5
RM
581\f
582/* These are the command codes that appear in compiled regular
4bb91c68 583 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
584 command code can specify any interpretation whatsoever for its
585 arguments. Zero bytes may appear in the compiled regular expression. */
586
587typedef enum
588{
589 no_op = 0,
590
4bb91c68 591 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
592 succeed,
593
25fe55af 594 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
595 exactn,
596
25fe55af 597 /* Matches any (more or less) character. */
fa9a63c5
RM
598 anychar,
599
25fe55af
RS
600 /* Matches any one char belonging to specified set. First
601 following byte is number of bitmap bytes. Then come bytes
602 for a bitmap saying which chars are in. Bits in each byte
603 are ordered low-bit-first. A character is in the set if its
604 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
605 automatically not in the set.
606
607 If the length byte has the 0x80 bit set, then that stuff
608 is followed by a range table:
609 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 610 See RANGE_TABLE_WORK_BITS below.
01618498 611 2 bytes, the number of pairs that follow (upto 32767)
96cc36cc 612 pairs, each 2 multibyte characters,
0b32bf0e 613 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
614 charset,
615
25fe55af 616 /* Same parameters as charset, but match any character that is
4bb91c68 617 not one of those specified. */
fa9a63c5
RM
618 charset_not,
619
25fe55af
RS
620 /* Start remembering the text that is matched, for storing in a
621 register. Followed by one byte with the register number, in
622 the range 0 to one less than the pattern buffer's re_nsub
505bde11 623 field. */
fa9a63c5
RM
624 start_memory,
625
25fe55af
RS
626 /* Stop remembering the text that is matched and store it in a
627 memory register. Followed by one byte with the register
628 number, in the range 0 to one less than `re_nsub' in the
505bde11 629 pattern buffer. */
fa9a63c5
RM
630 stop_memory,
631
25fe55af 632 /* Match a duplicate of something remembered. Followed by one
4bb91c68 633 byte containing the register number. */
fa9a63c5
RM
634 duplicate,
635
25fe55af 636 /* Fail unless at beginning of line. */
fa9a63c5
RM
637 begline,
638
4bb91c68 639 /* Fail unless at end of line. */
fa9a63c5
RM
640 endline,
641
25fe55af
RS
642 /* Succeeds if at beginning of buffer (if emacs) or at beginning
643 of string to be matched (if not). */
fa9a63c5
RM
644 begbuf,
645
25fe55af 646 /* Analogously, for end of buffer/string. */
fa9a63c5 647 endbuf,
5e69f11e 648
25fe55af 649 /* Followed by two byte relative address to which to jump. */
5e69f11e 650 jump,
fa9a63c5 651
25fe55af 652 /* Followed by two-byte relative address of place to resume at
7814e705 653 in case of failure. */
fa9a63c5 654 on_failure_jump,
5e69f11e 655
25fe55af
RS
656 /* Like on_failure_jump, but pushes a placeholder instead of the
657 current string position when executed. */
fa9a63c5 658 on_failure_keep_string_jump,
5e69f11e 659
505bde11
SM
660 /* Just like `on_failure_jump', except that it checks that we
661 don't get stuck in an infinite loop (matching an empty string
662 indefinitely). */
663 on_failure_jump_loop,
664
0683b6fa
SM
665 /* Just like `on_failure_jump_loop', except that it checks for
666 a different kind of loop (the kind that shows up with non-greedy
667 operators). This operation has to be immediately preceded
668 by a `no_op'. */
669 on_failure_jump_nastyloop,
670
0b32bf0e 671 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
672 It analyses the loop before which it is put and if the
673 loop does not require backtracking, it changes itself to
4e8a9132
SM
674 `on_failure_keep_string_jump' and short-circuits the loop,
675 else it just defaults to changing itself into `on_failure_jump'.
676 It assumes that it is pointing to just past a `jump'. */
505bde11 677 on_failure_jump_smart,
fa9a63c5 678
25fe55af 679 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
680 After matching N times, jump to the address upon failure.
681 Does not work if N starts at 0: use on_failure_jump_loop
682 instead. */
fa9a63c5
RM
683 succeed_n,
684
25fe55af
RS
685 /* Followed by two-byte relative address, and two-byte number n.
686 Jump to the address N times, then fail. */
fa9a63c5
RM
687 jump_n,
688
25fe55af 689 /* Set the following two-byte relative address to the
7814e705 690 subsequent two-byte number. The address *includes* the two
25fe55af 691 bytes of number. */
fa9a63c5
RM
692 set_number_at,
693
fa9a63c5
RM
694 wordbeg, /* Succeeds if at word beginning. */
695 wordend, /* Succeeds if at word end. */
696
697 wordbound, /* Succeeds if at a word boundary. */
7814e705 698 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 699
669fa600
SM
700 symbeg, /* Succeeds if at symbol beginning. */
701 symend, /* Succeeds if at symbol end. */
702
fa9a63c5 703 /* Matches any character whose syntax is specified. Followed by
25fe55af 704 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
705 syntaxspec,
706
707 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
708 notsyntaxspec
709
710#ifdef emacs
711 ,before_dot, /* Succeeds if before point. */
712 at_dot, /* Succeeds if at point. */
713 after_dot, /* Succeeds if after point. */
b18215fc
RS
714
715 /* Matches any character whose category-set contains the specified
7814e705
JB
716 category. The operator is followed by a byte which contains a
717 category code (mnemonic ASCII character). */
b18215fc
RS
718 categoryspec,
719
720 /* Matches any character whose category-set does not contain the
721 specified category. The operator is followed by a byte which
722 contains the category code (mnemonic ASCII character). */
723 notcategoryspec
fa9a63c5
RM
724#endif /* emacs */
725} re_opcode_t;
726\f
727/* Common operations on the compiled pattern. */
728
729/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
730
731#define STORE_NUMBER(destination, number) \
732 do { \
733 (destination)[0] = (number) & 0377; \
734 (destination)[1] = (number) >> 8; \
735 } while (0)
736
737/* Same as STORE_NUMBER, except increment DESTINATION to
738 the byte after where the number is stored. Therefore, DESTINATION
739 must be an lvalue. */
740
741#define STORE_NUMBER_AND_INCR(destination, number) \
742 do { \
743 STORE_NUMBER (destination, number); \
744 (destination) += 2; \
745 } while (0)
746
747/* Put into DESTINATION a number stored in two contiguous bytes starting
748 at SOURCE. */
749
750#define EXTRACT_NUMBER(destination, source) \
751 do { \
752 (destination) = *(source) & 0377; \
753 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
754 } while (0)
755
756#ifdef DEBUG
4bb91c68 757static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
758static void
759extract_number (dest, source)
760 int *dest;
01618498 761 re_char *source;
fa9a63c5 762{
5e69f11e 763 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
764 *dest = *source & 0377;
765 *dest += temp << 8;
766}
767
4bb91c68 768# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
769# undef EXTRACT_NUMBER
770# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
771# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
772
773#endif /* DEBUG */
774
775/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
776 SOURCE must be an lvalue. */
777
778#define EXTRACT_NUMBER_AND_INCR(destination, source) \
779 do { \
780 EXTRACT_NUMBER (destination, source); \
25fe55af 781 (source) += 2; \
fa9a63c5
RM
782 } while (0)
783
784#ifdef DEBUG
4bb91c68
SM
785static void extract_number_and_incr _RE_ARGS ((int *destination,
786 re_char **source));
fa9a63c5
RM
787static void
788extract_number_and_incr (destination, source)
789 int *destination;
01618498 790 re_char **source;
5e69f11e 791{
fa9a63c5
RM
792 extract_number (destination, *source);
793 *source += 2;
794}
795
0b32bf0e
SM
796# ifndef EXTRACT_MACROS
797# undef EXTRACT_NUMBER_AND_INCR
798# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 799 extract_number_and_incr (&dest, &src)
0b32bf0e 800# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
801
802#endif /* DEBUG */
803\f
b18215fc
RS
804/* Store a multibyte character in three contiguous bytes starting
805 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 806 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
807
808#define STORE_CHARACTER_AND_INCR(destination, character) \
809 do { \
810 (destination)[0] = (character) & 0377; \
811 (destination)[1] = ((character) >> 8) & 0377; \
812 (destination)[2] = (character) >> 16; \
813 (destination) += 3; \
814 } while (0)
815
816/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 817 starting at SOURCE. */
b18215fc
RS
818
819#define EXTRACT_CHARACTER(destination, source) \
820 do { \
821 (destination) = ((source)[0] \
822 | ((source)[1] << 8) \
823 | ((source)[2] << 16)); \
824 } while (0)
825
826
827/* Macros for charset. */
828
829/* Size of bitmap of charset P in bytes. P is a start of charset,
830 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
831#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
832
833/* Nonzero if charset P has range table. */
25fe55af 834#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
835
836/* Return the address of range table of charset P. But not the start
837 of table itself, but the before where the number of ranges is
96cc36cc
RS
838 stored. `2 +' means to skip re_opcode_t and size of bitmap,
839 and the 2 bytes of flags at the start of the range table. */
840#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
841
842/* Extract the bit flags that start a range table. */
843#define CHARSET_RANGE_TABLE_BITS(p) \
844 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
845 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
846
847/* Test if C is listed in the bitmap of charset P. */
848#define CHARSET_LOOKUP_BITMAP(p, c) \
849 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
850 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
851
852/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
853 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
854 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
855 and end. */
856#define CHARSET_RANGE_TABLE_END(range_table, count) \
857 ((range_table) + (count) * 2 * 3)
858
7814e705 859/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
860 COUNT is number of ranges in RANGE_TABLE. */
861#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
862 do \
863 { \
01618498
SM
864 re_wchar_t range_start, range_end; \
865 re_char *p; \
866 re_char *range_table_end \
b18215fc
RS
867 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
868 \
869 for (p = (range_table); p < range_table_end; p += 2 * 3) \
870 { \
871 EXTRACT_CHARACTER (range_start, p); \
872 EXTRACT_CHARACTER (range_end, p + 3); \
873 \
874 if (range_start <= (c) && (c) <= range_end) \
875 { \
876 (not) = !(not); \
877 break; \
878 } \
879 } \
880 } \
881 while (0)
882
883/* Test if C is in range table of CHARSET. The flag NOT is negated if
884 C is listed in it. */
885#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
886 do \
887 { \
888 /* Number of ranges in range table. */ \
889 int count; \
01618498
SM
890 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
891 \
b18215fc
RS
892 EXTRACT_NUMBER_AND_INCR (count, range_table); \
893 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
894 } \
895 while (0)
896\f
fa9a63c5
RM
897/* If DEBUG is defined, Regex prints many voluminous messages about what
898 it is doing (if the variable `debug' is nonzero). If linked with the
899 main program in `iregex.c', you can enter patterns and strings
900 interactively. And if linked with the main program in `main.c' and
4bb91c68 901 the other test files, you can run the already-written tests. */
fa9a63c5
RM
902
903#ifdef DEBUG
904
905/* We use standard I/O for debugging. */
0b32bf0e 906# include <stdio.h>
fa9a63c5
RM
907
908/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 909# include <assert.h>
fa9a63c5 910
99633e97 911static int debug = -100000;
fa9a63c5 912
0b32bf0e
SM
913# define DEBUG_STATEMENT(e) e
914# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
915# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
916# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
917# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
918# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 919 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 920# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 921 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
922
923
924/* Print the fastmap in human-readable form. */
925
926void
927print_fastmap (fastmap)
928 char *fastmap;
929{
930 unsigned was_a_range = 0;
5e69f11e
RM
931 unsigned i = 0;
932
fa9a63c5
RM
933 while (i < (1 << BYTEWIDTH))
934 {
935 if (fastmap[i++])
936 {
937 was_a_range = 0;
25fe55af
RS
938 putchar (i - 1);
939 while (i < (1 << BYTEWIDTH) && fastmap[i])
940 {
941 was_a_range = 1;
942 i++;
943 }
fa9a63c5 944 if (was_a_range)
25fe55af
RS
945 {
946 printf ("-");
947 putchar (i - 1);
948 }
949 }
fa9a63c5 950 }
5e69f11e 951 putchar ('\n');
fa9a63c5
RM
952}
953
954
955/* Print a compiled pattern string in human-readable form, starting at
956 the START pointer into it and ending just before the pointer END. */
957
958void
959print_partial_compiled_pattern (start, end)
01618498
SM
960 re_char *start;
961 re_char *end;
fa9a63c5
RM
962{
963 int mcnt, mcnt2;
01618498
SM
964 re_char *p = start;
965 re_char *pend = end;
fa9a63c5
RM
966
967 if (start == NULL)
968 {
a1a052df 969 fprintf (stderr, "(null)\n");
fa9a63c5
RM
970 return;
971 }
5e69f11e 972
fa9a63c5
RM
973 /* Loop over pattern commands. */
974 while (p < pend)
975 {
a1a052df 976 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
977
978 switch ((re_opcode_t) *p++)
979 {
25fe55af 980 case no_op:
a1a052df 981 fprintf (stderr, "/no_op");
25fe55af 982 break;
fa9a63c5 983
99633e97 984 case succeed:
a1a052df 985 fprintf (stderr, "/succeed");
99633e97
SM
986 break;
987
fa9a63c5
RM
988 case exactn:
989 mcnt = *p++;
a1a052df 990 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 991 do
fa9a63c5 992 {
a1a052df 993 fprintf (stderr, "/%c", *p++);
25fe55af
RS
994 }
995 while (--mcnt);
996 break;
fa9a63c5
RM
997
998 case start_memory:
a1a052df 999 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1000 break;
fa9a63c5
RM
1001
1002 case stop_memory:
a1a052df 1003 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1004 break;
fa9a63c5
RM
1005
1006 case duplicate:
a1a052df 1007 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1008 break;
1009
1010 case anychar:
a1a052df 1011 fprintf (stderr, "/anychar");
fa9a63c5
RM
1012 break;
1013
1014 case charset:
25fe55af
RS
1015 case charset_not:
1016 {
1017 register int c, last = -100;
fa9a63c5 1018 register int in_range = 0;
99633e97
SM
1019 int length = CHARSET_BITMAP_SIZE (p - 1);
1020 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1021
a1a052df 1022 fprintf (stderr, "/charset [%s",
839966f3 1023 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1024
839966f3
KH
1025 if (p + *p >= pend)
1026 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1027
25fe55af 1028 for (c = 0; c < 256; c++)
96cc36cc 1029 if (c / 8 < length
fa9a63c5
RM
1030 && (p[1 + (c/8)] & (1 << (c % 8))))
1031 {
1032 /* Are we starting a range? */
1033 if (last + 1 == c && ! in_range)
1034 {
a1a052df 1035 fprintf (stderr, "-");
fa9a63c5
RM
1036 in_range = 1;
1037 }
1038 /* Have we broken a range? */
1039 else if (last + 1 != c && in_range)
96cc36cc 1040 {
a1a052df 1041 fprintf (stderr, "%c", last);
fa9a63c5
RM
1042 in_range = 0;
1043 }
5e69f11e 1044
fa9a63c5 1045 if (! in_range)
a1a052df 1046 fprintf (stderr, "%c", c);
fa9a63c5
RM
1047
1048 last = c;
25fe55af 1049 }
fa9a63c5
RM
1050
1051 if (in_range)
a1a052df 1052 fprintf (stderr, "%c", last);
fa9a63c5 1053
a1a052df 1054 fprintf (stderr, "]");
fa9a63c5 1055
99633e97 1056 p += 1 + length;
96cc36cc 1057
96cc36cc 1058 if (has_range_table)
99633e97
SM
1059 {
1060 int count;
a1a052df 1061 fprintf (stderr, "has-range-table");
99633e97
SM
1062
1063 /* ??? Should print the range table; for now, just skip it. */
1064 p += 2; /* skip range table bits */
1065 EXTRACT_NUMBER_AND_INCR (count, p);
1066 p = CHARSET_RANGE_TABLE_END (p, count);
1067 }
fa9a63c5
RM
1068 }
1069 break;
1070
1071 case begline:
a1a052df 1072 fprintf (stderr, "/begline");
25fe55af 1073 break;
fa9a63c5
RM
1074
1075 case endline:
a1a052df 1076 fprintf (stderr, "/endline");
25fe55af 1077 break;
fa9a63c5
RM
1078
1079 case on_failure_jump:
25fe55af 1080 extract_number_and_incr (&mcnt, &p);
a1a052df 1081 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1082 break;
fa9a63c5
RM
1083
1084 case on_failure_keep_string_jump:
25fe55af 1085 extract_number_and_incr (&mcnt, &p);
a1a052df 1086 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1087 break;
fa9a63c5 1088
0683b6fa
SM
1089 case on_failure_jump_nastyloop:
1090 extract_number_and_incr (&mcnt, &p);
a1a052df 1091 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1092 break;
1093
505bde11 1094 case on_failure_jump_loop:
fa9a63c5 1095 extract_number_and_incr (&mcnt, &p);
a1a052df 1096 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1097 break;
1098
505bde11 1099 case on_failure_jump_smart:
fa9a63c5 1100 extract_number_and_incr (&mcnt, &p);
a1a052df 1101 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1102 break;
1103
25fe55af 1104 case jump:
fa9a63c5 1105 extract_number_and_incr (&mcnt, &p);
a1a052df 1106 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1107 break;
1108
25fe55af
RS
1109 case succeed_n:
1110 extract_number_and_incr (&mcnt, &p);
1111 extract_number_and_incr (&mcnt2, &p);
a1a052df 1112 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1113 break;
5e69f11e 1114
25fe55af
RS
1115 case jump_n:
1116 extract_number_and_incr (&mcnt, &p);
1117 extract_number_and_incr (&mcnt2, &p);
a1a052df 1118 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1119 break;
5e69f11e 1120
25fe55af
RS
1121 case set_number_at:
1122 extract_number_and_incr (&mcnt, &p);
1123 extract_number_and_incr (&mcnt2, &p);
a1a052df 1124 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1125 break;
5e69f11e 1126
25fe55af 1127 case wordbound:
a1a052df 1128 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1129 break;
1130
1131 case notwordbound:
a1a052df 1132 fprintf (stderr, "/notwordbound");
25fe55af 1133 break;
fa9a63c5
RM
1134
1135 case wordbeg:
a1a052df 1136 fprintf (stderr, "/wordbeg");
fa9a63c5 1137 break;
5e69f11e 1138
fa9a63c5 1139 case wordend:
a1a052df 1140 fprintf (stderr, "/wordend");
e2543b02 1141 break;
5e69f11e 1142
669fa600 1143 case symbeg:
e2543b02 1144 fprintf (stderr, "/symbeg");
669fa600
SM
1145 break;
1146
1147 case symend:
e2543b02 1148 fprintf (stderr, "/symend");
669fa600 1149 break;
5e69f11e 1150
1fb352e0 1151 case syntaxspec:
a1a052df 1152 fprintf (stderr, "/syntaxspec");
1fb352e0 1153 mcnt = *p++;
a1a052df 1154 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1155 break;
1156
1157 case notsyntaxspec:
a1a052df 1158 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1159 mcnt = *p++;
a1a052df 1160 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1161 break;
1162
0b32bf0e 1163# ifdef emacs
fa9a63c5 1164 case before_dot:
a1a052df 1165 fprintf (stderr, "/before_dot");
25fe55af 1166 break;
fa9a63c5
RM
1167
1168 case at_dot:
a1a052df 1169 fprintf (stderr, "/at_dot");
25fe55af 1170 break;
fa9a63c5
RM
1171
1172 case after_dot:
a1a052df 1173 fprintf (stderr, "/after_dot");
25fe55af 1174 break;
fa9a63c5 1175
1fb352e0 1176 case categoryspec:
a1a052df 1177 fprintf (stderr, "/categoryspec");
fa9a63c5 1178 mcnt = *p++;
a1a052df 1179 fprintf (stderr, "/%d", mcnt);
25fe55af 1180 break;
5e69f11e 1181
1fb352e0 1182 case notcategoryspec:
a1a052df 1183 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1184 mcnt = *p++;
a1a052df 1185 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1186 break;
0b32bf0e 1187# endif /* emacs */
fa9a63c5 1188
fa9a63c5 1189 case begbuf:
a1a052df 1190 fprintf (stderr, "/begbuf");
25fe55af 1191 break;
fa9a63c5
RM
1192
1193 case endbuf:
a1a052df 1194 fprintf (stderr, "/endbuf");
25fe55af 1195 break;
fa9a63c5 1196
25fe55af 1197 default:
a1a052df 1198 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1199 }
1200
a1a052df 1201 fprintf (stderr, "\n");
fa9a63c5
RM
1202 }
1203
a1a052df 1204 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1205}
1206
1207
1208void
1209print_compiled_pattern (bufp)
1210 struct re_pattern_buffer *bufp;
1211{
01618498 1212 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1213
1214 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1215 printf ("%ld bytes used/%ld bytes allocated.\n",
1216 bufp->used, bufp->allocated);
fa9a63c5
RM
1217
1218 if (bufp->fastmap_accurate && bufp->fastmap)
1219 {
1220 printf ("fastmap: ");
1221 print_fastmap (bufp->fastmap);
1222 }
1223
1224 printf ("re_nsub: %d\t", bufp->re_nsub);
1225 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1226 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1227 printf ("no_sub: %d\t", bufp->no_sub);
1228 printf ("not_bol: %d\t", bufp->not_bol);
1229 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1230 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1231 fflush (stdout);
fa9a63c5
RM
1232 /* Perhaps we should print the translate table? */
1233}
1234
1235
1236void
1237print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1238 re_char *where;
1239 re_char *string1;
1240 re_char *string2;
fa9a63c5
RM
1241 int size1;
1242 int size2;
1243{
4bb91c68 1244 int this_char;
5e69f11e 1245
fa9a63c5
RM
1246 if (where == NULL)
1247 printf ("(null)");
1248 else
1249 {
1250 if (FIRST_STRING_P (where))
25fe55af
RS
1251 {
1252 for (this_char = where - string1; this_char < size1; this_char++)
1253 putchar (string1[this_char]);
fa9a63c5 1254
25fe55af
RS
1255 where = string2;
1256 }
fa9a63c5
RM
1257
1258 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1259 putchar (string2[this_char]);
fa9a63c5
RM
1260 }
1261}
1262
1263#else /* not DEBUG */
1264
0b32bf0e
SM
1265# undef assert
1266# define assert(e)
fa9a63c5 1267
0b32bf0e
SM
1268# define DEBUG_STATEMENT(e)
1269# define DEBUG_PRINT1(x)
1270# define DEBUG_PRINT2(x1, x2)
1271# define DEBUG_PRINT3(x1, x2, x3)
1272# define DEBUG_PRINT4(x1, x2, x3, x4)
1273# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1274# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1275
1276#endif /* not DEBUG */
1277\f
1278/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1279 also be assigned to arbitrarily: each pattern buffer stores its own
1280 syntax, so it can be changed between regex compilations. */
1281/* This has no initializer because initialized variables in Emacs
1282 become read-only after dumping. */
1283reg_syntax_t re_syntax_options;
1284
1285
1286/* Specify the precise syntax of regexps for compilation. This provides
1287 for compatibility for various utilities which historically have
1288 different, incompatible syntaxes.
1289
1290 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1291 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1292
1293reg_syntax_t
971de7fb 1294re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1295{
1296 reg_syntax_t ret = re_syntax_options;
5e69f11e 1297
fa9a63c5
RM
1298 re_syntax_options = syntax;
1299 return ret;
1300}
c0f9ea08 1301WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1302
1303/* Regexp to use to replace spaces, or NULL meaning don't. */
1304static re_char *whitespace_regexp;
1305
1306void
971de7fb 1307re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1308{
6470ea05 1309 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1310}
1311WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1312\f
1313/* This table gives an error message for each of the error codes listed
4bb91c68 1314 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1315 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1316 but why not be nice? */
fa9a63c5
RM
1317
1318static const char *re_error_msgid[] =
5e69f11e
RM
1319 {
1320 gettext_noop ("Success"), /* REG_NOERROR */
1321 gettext_noop ("No match"), /* REG_NOMATCH */
1322 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1323 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1324 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1325 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1326 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1327 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1328 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1329 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1330 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1331 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1332 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1333 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1334 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1335 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1336 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1337 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1338 };
1339\f
4bb91c68 1340/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1341
1342/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1343 searching and matching functions should not call alloca. On some
1344 systems, alloca is implemented in terms of malloc, and if we're
1345 using the relocating allocator routines, then malloc could cause a
1346 relocation, which might (if the strings being searched are in the
1347 ralloc heap) shift the data out from underneath the regexp
1348 routines.
1349
5e69f11e 1350 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1351 processes input from X in a signal handler; processing X input may
1352 call malloc; if input arrives while a matching routine is calling
1353 malloc, then we're scrod. But Emacs can't just block input while
1354 calling matching routines; then we don't notice interrupts when
1355 they come in. So, Emacs blocks input around all regexp calls
1356 except the matching calls, which it leaves unprotected, in the
1357 faith that they will not malloc. */
1358
1359/* Normally, this is fine. */
1360#define MATCH_MAY_ALLOCATE
1361
fa9a63c5
RM
1362/* The match routines may not allocate if (1) they would do it with malloc
1363 and (2) it's not safe for them to use malloc.
1364 Note that if REL_ALLOC is defined, matching would not use malloc for the
1365 failure stack, but we would still use it for the register vectors;
4bb91c68 1366 so REL_ALLOC should not affect this. */
b588157e 1367#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1368# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1369#endif
1370
1371\f
1372/* Failure stack declarations and macros; both re_compile_fastmap and
1373 re_match_2 use a failure stack. These have to be macros because of
1374 REGEX_ALLOCATE_STACK. */
5e69f11e 1375
fa9a63c5 1376
320a2a73 1377/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1378 when matching. If this number is exceeded, we allocate more
1379 space, so it is not a hard limit. */
1380#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1381# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1382#endif
1383
1384/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1385 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1386 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1387 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1388 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1389# if defined MATCH_MAY_ALLOCATE
1390/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1391 whose default stack limit is 2mb. In order for a larger
1392 value to work reliably, you have to try to make it accord
1393 with the process stack limit. */
c0f9ea08
SM
1394size_t re_max_failures = 40000;
1395# else
1396size_t re_max_failures = 4000;
1397# endif
fa9a63c5
RM
1398
1399union fail_stack_elt
1400{
01618498 1401 re_char *pointer;
c0f9ea08
SM
1402 /* This should be the biggest `int' that's no bigger than a pointer. */
1403 long integer;
fa9a63c5
RM
1404};
1405
1406typedef union fail_stack_elt fail_stack_elt_t;
1407
1408typedef struct
1409{
1410 fail_stack_elt_t *stack;
c0f9ea08
SM
1411 size_t size;
1412 size_t avail; /* Offset of next open position. */
1413 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1414} fail_stack_type;
1415
505bde11 1416#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1417#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1418
1419
1420/* Define macros to initialize and free the failure stack.
1421 Do `return -2' if the alloc fails. */
1422
1423#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1424# define INIT_FAIL_STACK() \
fa9a63c5
RM
1425 do { \
1426 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1427 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1428 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1429 \
1430 if (fail_stack.stack == NULL) \
1431 return -2; \
1432 \
1433 fail_stack.size = INIT_FAILURE_ALLOC; \
1434 fail_stack.avail = 0; \
505bde11 1435 fail_stack.frame = 0; \
fa9a63c5
RM
1436 } while (0)
1437
0b32bf0e 1438# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1439#else
0b32bf0e 1440# define INIT_FAIL_STACK() \
fa9a63c5
RM
1441 do { \
1442 fail_stack.avail = 0; \
505bde11 1443 fail_stack.frame = 0; \
fa9a63c5
RM
1444 } while (0)
1445
0b32bf0e 1446# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1447#endif
1448
1449
320a2a73
KH
1450/* Double the size of FAIL_STACK, up to a limit
1451 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1452
1453 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1454 allocating space for it or it was already too large.
1455
4bb91c68 1456 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1457
320a2a73
KH
1458/* Factor to increase the failure stack size by
1459 when we increase it.
1460 This used to be 2, but 2 was too wasteful
1461 because the old discarded stacks added up to as much space
1462 were as ultimate, maximum-size stack. */
1463#define FAIL_STACK_GROWTH_FACTOR 4
1464
1465#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1466 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1467 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1468 ? 0 \
320a2a73
KH
1469 : ((fail_stack).stack \
1470 = (fail_stack_elt_t *) \
25fe55af
RS
1471 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1472 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1473 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1474 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1475 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1476 \
1477 (fail_stack).stack == NULL \
1478 ? 0 \
6453db45
KH
1479 : ((fail_stack).size \
1480 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1481 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1482 * FAIL_STACK_GROWTH_FACTOR)) \
1483 / sizeof (fail_stack_elt_t)), \
25fe55af 1484 1)))
fa9a63c5
RM
1485
1486
fa9a63c5
RM
1487/* Push a pointer value onto the failure stack.
1488 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1489 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1490#define PUSH_FAILURE_POINTER(item) \
01618498 1491 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1492
1493/* This pushes an integer-valued item onto the failure stack.
1494 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1495 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1496#define PUSH_FAILURE_INT(item) \
1497 fail_stack.stack[fail_stack.avail++].integer = (item)
1498
1499/* Push a fail_stack_elt_t value onto the failure stack.
1500 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1501 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1502#define PUSH_FAILURE_ELT(item) \
1503 fail_stack.stack[fail_stack.avail++] = (item)
1504
1505/* These three POP... operations complement the three PUSH... operations.
1506 All assume that `fail_stack' is nonempty. */
1507#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1508#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1509#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1510
505bde11
SM
1511/* Individual items aside from the registers. */
1512#define NUM_NONREG_ITEMS 3
1513
1514/* Used to examine the stack (to detect infinite loops). */
1515#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1516#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1517#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1518#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1519
1520
505bde11
SM
1521#define ENSURE_FAIL_STACK(space) \
1522while (REMAINING_AVAIL_SLOTS <= space) { \
1523 if (!GROW_FAIL_STACK (fail_stack)) \
1524 return -2; \
1525 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1526 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1527}
1528
1529/* Push register NUM onto the stack. */
1530#define PUSH_FAILURE_REG(num) \
1531do { \
1532 char *destination; \
1533 ENSURE_FAIL_STACK(3); \
1534 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1535 num, regstart[num], regend[num]); \
1536 PUSH_FAILURE_POINTER (regstart[num]); \
1537 PUSH_FAILURE_POINTER (regend[num]); \
1538 PUSH_FAILURE_INT (num); \
1539} while (0)
1540
01618498
SM
1541/* Change the counter's value to VAL, but make sure that it will
1542 be reset when backtracking. */
1543#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1544do { \
1545 char *destination; \
1546 int c; \
1547 ENSURE_FAIL_STACK(3); \
1548 EXTRACT_NUMBER (c, ptr); \
01618498 1549 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1550 PUSH_FAILURE_INT (c); \
1551 PUSH_FAILURE_POINTER (ptr); \
1552 PUSH_FAILURE_INT (-1); \
01618498 1553 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1554} while (0)
1555
505bde11 1556/* Pop a saved register off the stack. */
dc1e502d 1557#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1558do { \
1559 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1560 if (reg == -1) \
1561 { \
1562 /* It's a counter. */ \
6dcf2d0e
SM
1563 /* Here, we discard `const', making re_match non-reentrant. */ \
1564 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1565 reg = POP_FAILURE_INT (); \
1566 STORE_NUMBER (ptr, reg); \
1567 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1568 } \
1569 else \
1570 { \
1571 regend[reg] = POP_FAILURE_POINTER (); \
1572 regstart[reg] = POP_FAILURE_POINTER (); \
1573 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1574 reg, regstart[reg], regend[reg]); \
1575 } \
505bde11
SM
1576} while (0)
1577
1578/* Check that we are not stuck in an infinite loop. */
1579#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1580do { \
f6df485f 1581 int failure = TOP_FAILURE_HANDLE (); \
505bde11 1582 /* Check for infinite matching loops */ \
f6df485f
RS
1583 while (failure > 0 \
1584 && (FAILURE_STR (failure) == string_place \
1585 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1586 { \
1587 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1588 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1589 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1590 { \
6df42991
SM
1591 cycle = 1; \
1592 break; \
f6df485f 1593 } \
66f0296e 1594 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1595 failure = NEXT_FAILURE_HANDLE(failure); \
1596 } \
1597 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1598} while (0)
6df42991 1599
fa9a63c5 1600/* Push the information about the state we will need
5e69f11e
RM
1601 if we ever fail back to it.
1602
505bde11 1603 Requires variables fail_stack, regstart, regend and
320a2a73 1604 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1605 declared.
5e69f11e 1606
fa9a63c5
RM
1607 Does `return FAILURE_CODE' if runs out of memory. */
1608
505bde11
SM
1609#define PUSH_FAILURE_POINT(pattern, string_place) \
1610do { \
1611 char *destination; \
1612 /* Must be int, so when we don't save any registers, the arithmetic \
1613 of 0 + -1 isn't done as unsigned. */ \
1614 \
505bde11 1615 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1616 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1617 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1618 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1619 \
1620 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1621 \
1622 DEBUG_PRINT1 ("\n"); \
1623 \
1624 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1625 PUSH_FAILURE_INT (fail_stack.frame); \
1626 \
1627 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1628 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1629 DEBUG_PRINT1 ("'\n"); \
1630 PUSH_FAILURE_POINTER (string_place); \
1631 \
1632 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1633 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1634 PUSH_FAILURE_POINTER (pattern); \
1635 \
1636 /* Close the frame by moving the frame pointer past it. */ \
1637 fail_stack.frame = fail_stack.avail; \
1638} while (0)
fa9a63c5 1639
320a2a73
KH
1640/* Estimate the size of data pushed by a typical failure stack entry.
1641 An estimate is all we need, because all we use this for
1642 is to choose a limit for how big to make the failure stack. */
ada30c0e 1643/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1644#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1645
fa9a63c5
RM
1646/* How many items can still be added to the stack without overflowing it. */
1647#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1648
1649
1650/* Pops what PUSH_FAIL_STACK pushes.
1651
1652 We restore into the parameters, all of which should be lvalues:
1653 STR -- the saved data position.
1654 PAT -- the saved pattern position.
fa9a63c5 1655 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1656
fa9a63c5 1657 Also assumes the variables `fail_stack' and (if debugging), `bufp',
7814e705 1658 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1659
505bde11
SM
1660#define POP_FAILURE_POINT(str, pat) \
1661do { \
fa9a63c5
RM
1662 assert (!FAIL_STACK_EMPTY ()); \
1663 \
1664 /* Remove failure points and point to how many regs pushed. */ \
1665 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1666 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1667 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1668 \
505bde11
SM
1669 /* Pop the saved registers. */ \
1670 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1671 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1672 \
01618498 1673 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1674 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1675 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1676 \
1677 /* If the saved string location is NULL, it came from an \
1678 on_failure_keep_string_jump opcode, and we want to throw away the \
1679 saved NULL, thus retaining our current position in the string. */ \
01618498 1680 str = POP_FAILURE_POINTER (); \
505bde11 1681 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1682 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1683 DEBUG_PRINT1 ("'\n"); \
1684 \
505bde11
SM
1685 fail_stack.frame = POP_FAILURE_INT (); \
1686 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1687 \
505bde11
SM
1688 assert (fail_stack.avail >= 0); \
1689 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1690 \
fa9a63c5 1691 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1692} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1693
1694
1695\f
fa9a63c5 1696/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1697#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1698\f
1699/* Subroutine declarations and macros for regex_compile. */
1700
4bb91c68
SM
1701static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1702 reg_syntax_t syntax,
1703 struct re_pattern_buffer *bufp));
1704static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1705static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1706 int arg1, int arg2));
1707static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1708 int arg, unsigned char *end));
1709static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1710 int arg1, int arg2, unsigned char *end));
01618498
SM
1711static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1712 re_char *p,
4bb91c68 1713 reg_syntax_t syntax));
01618498
SM
1714static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1715 re_char *pend,
4bb91c68 1716 reg_syntax_t syntax));
01618498
SM
1717static re_char *skip_one_char _RE_ARGS ((re_char *p));
1718static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1719 char *fastmap, const int multibyte));
fa9a63c5 1720
fa9a63c5 1721/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1722 translation. */
36595814 1723#define PATFETCH(c) \
2d1675e4
SM
1724 do { \
1725 int len; \
1726 if (p == pend) return REG_EEND; \
62a6e103 1727 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
2d1675e4 1728 p += len; \
fa9a63c5
RM
1729 } while (0)
1730
fa9a63c5
RM
1731
1732/* If `translate' is non-null, return translate[D], else just D. We
1733 cast the subscript to translate because some data is declared as
1734 `char *', to avoid warnings when a string constant is passed. But
1735 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1736#ifndef TRANSLATE
0b32bf0e 1737# define TRANSLATE(d) \
66f0296e 1738 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1739#endif
fa9a63c5
RM
1740
1741
1742/* Macros for outputting the compiled pattern into `buffer'. */
1743
1744/* If the buffer isn't allocated when it comes in, use this. */
1745#define INIT_BUF_SIZE 32
1746
4bb91c68 1747/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1748#define GET_BUFFER_SPACE(n) \
01618498 1749 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1750 EXTEND_BUFFER ()
1751
1752/* Make sure we have one more byte of buffer space and then add C to it. */
1753#define BUF_PUSH(c) \
1754 do { \
1755 GET_BUFFER_SPACE (1); \
1756 *b++ = (unsigned char) (c); \
1757 } while (0)
1758
1759
1760/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1761#define BUF_PUSH_2(c1, c2) \
1762 do { \
1763 GET_BUFFER_SPACE (2); \
1764 *b++ = (unsigned char) (c1); \
1765 *b++ = (unsigned char) (c2); \
1766 } while (0)
1767
1768
4bb91c68 1769/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1770#define BUF_PUSH_3(c1, c2, c3) \
1771 do { \
1772 GET_BUFFER_SPACE (3); \
1773 *b++ = (unsigned char) (c1); \
1774 *b++ = (unsigned char) (c2); \
1775 *b++ = (unsigned char) (c3); \
1776 } while (0)
1777
1778
1779/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1780 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1781#define STORE_JUMP(op, loc, to) \
1782 store_op1 (op, loc, (to) - (loc) - 3)
1783
1784/* Likewise, for a two-argument jump. */
1785#define STORE_JUMP2(op, loc, to, arg) \
1786 store_op2 (op, loc, (to) - (loc) - 3, arg)
1787
4bb91c68 1788/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1789#define INSERT_JUMP(op, loc, to) \
1790 insert_op1 (op, loc, (to) - (loc) - 3, b)
1791
1792/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1793#define INSERT_JUMP2(op, loc, to, arg) \
1794 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1795
1796
1797/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1798 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1799 be too small, many things would have to change. */
839966f3
KH
1800# define MAX_BUF_SIZE (1L << 15)
1801
1802#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1803/* Any other compiler which, like MSC, has allocation limit below 2^16
1804 bytes will have to use approach similar to what was done below for
1805 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1806 reallocating to 0 bytes. Such thing is not going to work too well.
1807 You have been warned!! */
1808#if defined _MSC_VER && !defined WIN32
1809/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1810# define MAX_BUF_SIZE 65500L
1811#else
1812# define MAX_BUF_SIZE (1L << 16)
1813#endif
839966f3 1814#endif /* 0 */
fa9a63c5
RM
1815
1816/* Extend the buffer by twice its current size via realloc and
1817 reset the pointers that pointed into the old block to point to the
1818 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1819 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1820#if __BOUNDED_POINTERS__
1821# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
381880b0
CY
1822# define MOVE_BUFFER_POINTER(P) \
1823 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1824 SET_HIGH_BOUND (P), \
1825 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
4bb91c68
SM
1826# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1827 else \
1828 { \
1829 SET_HIGH_BOUND (b); \
1830 SET_HIGH_BOUND (begalt); \
1831 if (fixup_alt_jump) \
1832 SET_HIGH_BOUND (fixup_alt_jump); \
1833 if (laststart) \
1834 SET_HIGH_BOUND (laststart); \
1835 if (pending_exact) \
1836 SET_HIGH_BOUND (pending_exact); \
1837 }
1838#else
381880b0 1839# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
4bb91c68
SM
1840# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1841#endif
fa9a63c5 1842#define EXTEND_BUFFER() \
25fe55af 1843 do { \
381880b0 1844 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1845 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1846 return REG_ESIZE; \
1847 bufp->allocated <<= 1; \
1848 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1849 bufp->allocated = MAX_BUF_SIZE; \
01618498 1850 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1851 if (bufp->buffer == NULL) \
1852 return REG_ESPACE; \
1853 /* If the buffer moved, move all the pointers into it. */ \
1854 if (old_buffer != bufp->buffer) \
1855 { \
381880b0 1856 unsigned char *new_buffer = bufp->buffer; \
4bb91c68
SM
1857 MOVE_BUFFER_POINTER (b); \
1858 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1859 if (fixup_alt_jump) \
4bb91c68 1860 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1861 if (laststart) \
4bb91c68 1862 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1863 if (pending_exact) \
4bb91c68 1864 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1865 } \
4bb91c68 1866 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1867 } while (0)
1868
1869
1870/* Since we have one byte reserved for the register number argument to
1871 {start,stop}_memory, the maximum number of groups we can report
1872 things about is what fits in that byte. */
1873#define MAX_REGNUM 255
1874
1875/* But patterns can have more than `MAX_REGNUM' registers. We just
1876 ignore the excess. */
098d42af 1877typedef int regnum_t;
fa9a63c5
RM
1878
1879
1880/* Macros for the compile stack. */
1881
1882/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1883 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1884/* int may be not enough when sizeof(int) == 2. */
1885typedef long pattern_offset_t;
fa9a63c5
RM
1886
1887typedef struct
1888{
1889 pattern_offset_t begalt_offset;
1890 pattern_offset_t fixup_alt_jump;
5e69f11e 1891 pattern_offset_t laststart_offset;
fa9a63c5
RM
1892 regnum_t regnum;
1893} compile_stack_elt_t;
1894
1895
1896typedef struct
1897{
1898 compile_stack_elt_t *stack;
1899 unsigned size;
1900 unsigned avail; /* Offset of next open position. */
1901} compile_stack_type;
1902
1903
1904#define INIT_COMPILE_STACK_SIZE 32
1905
1906#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1907#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1908
4bb91c68 1909/* The next available element. */
fa9a63c5
RM
1910#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1911
1cee1e27
SM
1912/* Explicit quit checking is only used on NTemacs and whenever we
1913 use polling to process input events. */
1914#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1915extern int immediate_quit;
1916# define IMMEDIATE_QUIT_CHECK \
1917 do { \
1918 if (immediate_quit) QUIT; \
1919 } while (0)
1920#else
1921# define IMMEDIATE_QUIT_CHECK ((void)0)
1922#endif
1923\f
b18215fc
RS
1924/* Structure to manage work area for range table. */
1925struct range_table_work_area
1926{
1927 int *table; /* actual work area. */
1928 int allocated; /* allocated size for work area in bytes. */
7814e705 1929 int used; /* actually used size in words. */
96cc36cc 1930 int bits; /* flag to record character classes */
b18215fc
RS
1931};
1932
77d11aec
RS
1933/* Make sure that WORK_AREA can hold more N multibyte characters.
1934 This is used only in set_image_of_range and set_image_of_range_1.
1935 It expects WORK_AREA to be a pointer.
1936 If it can't get the space, it returns from the surrounding function. */
1937
1938#define EXTEND_RANGE_TABLE(work_area, n) \
1939 do { \
8f924df7 1940 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
77d11aec 1941 { \
8f924df7
KH
1942 extend_range_table_work_area (&work_area); \
1943 if ((work_area).table == 0) \
77d11aec
RS
1944 return (REG_ESPACE); \
1945 } \
b18215fc
RS
1946 } while (0)
1947
96cc36cc
RS
1948#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1949 (work_area).bits |= (bit)
1950
14473664
SM
1951/* Bits used to implement the multibyte-part of the various character classes
1952 such as [:alnum:] in a charset's range table. */
1953#define BIT_WORD 0x1
1954#define BIT_LOWER 0x2
1955#define BIT_PUNCT 0x4
1956#define BIT_SPACE 0x8
1957#define BIT_UPPER 0x10
1958#define BIT_MULTIBYTE 0x20
96cc36cc 1959
b18215fc
RS
1960/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1961#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
77d11aec 1962 do { \
8f924df7 1963 EXTEND_RANGE_TABLE ((work_area), 2); \
b18215fc
RS
1964 (work_area).table[(work_area).used++] = (range_start); \
1965 (work_area).table[(work_area).used++] = (range_end); \
1966 } while (0)
1967
7814e705 1968/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1969#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1970 do { \
1971 if ((work_area).table) \
1972 free ((work_area).table); \
1973 } while (0)
1974
96cc36cc 1975#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1976#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1977#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1978#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1979\f
b18215fc 1980
fa9a63c5 1981/* Set the bit for character C in a list. */
01618498 1982#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1983
1984
bf216479
KH
1985#ifdef emacs
1986
cf9c99bc
KH
1987/* Store characters in the range FROM to TO in the bitmap at B (for
1988 ASCII and unibyte characters) and WORK_AREA (for multibyte
1989 characters) while translating them and paying attention to the
1990 continuity of translated characters.
8f924df7 1991
cf9c99bc
KH
1992 Implementation note: It is better to implement these fairly big
1993 macros by a function, but it's not that easy because macros called
8f924df7 1994 in this macro assume various local variables already declared. */
bf216479 1995
cf9c99bc
KH
1996/* Both FROM and TO are ASCII characters. */
1997
1998#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1999 do { \
2000 int C0, C1; \
2001 \
2002 for (C0 = (FROM); C0 <= (TO); C0++) \
2003 { \
2004 C1 = TRANSLATE (C0); \
2005 if (! ASCII_CHAR_P (C1)) \
2006 { \
2007 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2008 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
2009 C1 = C0; \
2010 } \
2011 SET_LIST_BIT (C1); \
2012 } \
2013 } while (0)
2014
2015
2016/* Both FROM and TO are unibyte characters (0x80..0xFF). */
2017
2018#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
2019 do { \
2020 int C0, C1, C2, I; \
2021 int USED = RANGE_TABLE_WORK_USED (work_area); \
2022 \
2023 for (C0 = (FROM); C0 <= (TO); C0++) \
2024 { \
2025 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
2026 if (CHAR_BYTE8_P (C1)) \
2027 SET_LIST_BIT (C0); \
2028 else \
2029 { \
2030 C2 = TRANSLATE (C1); \
2031 if (C2 == C1 \
2032 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
2033 C1 = C0; \
2034 SET_LIST_BIT (C1); \
2035 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2036 { \
2037 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2038 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2039 \
2040 if (C2 >= from - 1 && C2 <= to + 1) \
2041 { \
2042 if (C2 == from - 1) \
2043 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2044 else if (C2 == to + 1) \
2045 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2046 break; \
2047 } \
2048 } \
2049 if (I < USED) \
2050 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
2051 } \
2052 } \
2053 } while (0)
2054
2055
78edd3b7 2056/* Both FROM and TO are multibyte characters. */
cf9c99bc
KH
2057
2058#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
2059 do { \
2060 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
2061 \
2062 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
2063 for (C0 = (FROM); C0 <= (TO); C0++) \
2064 { \
2065 C1 = TRANSLATE (C0); \
2066 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
2067 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
2068 SET_LIST_BIT (C2); \
2069 if (C1 >= (FROM) && C1 <= (TO)) \
2070 continue; \
2071 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2072 { \
2073 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2074 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2075 \
2076 if (C1 >= from - 1 && C1 <= to + 1) \
2077 { \
2078 if (C1 == from - 1) \
2079 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2080 else if (C1 == to + 1) \
2081 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2082 break; \
2083 } \
2084 } \
2085 if (I < USED) \
2086 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2087 } \
bf216479
KH
2088 } while (0)
2089
2090#endif /* emacs */
2091
fa9a63c5 2092/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 2093#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2094 do { \
2095 if (p == pend) \
2096 FREE_STACK_RETURN (REG_EBRACE); \
2097 else \
2098 { \
2099 PATFETCH (c); \
2100 while ('0' <= c && c <= '9') \
2101 { \
2102 int prev; \
2103 if (num < 0) \
2104 num = 0; \
2105 prev = num; \
2106 num = num * 10 + c - '0'; \
2107 if (num / 10 != prev) \
2108 FREE_STACK_RETURN (REG_BADBR); \
2109 if (p == pend) \
2110 FREE_STACK_RETURN (REG_EBRACE); \
2111 PATFETCH (c); \
2112 } \
2113 } \
2114 } while (0)
77d11aec 2115\f
1fdab503 2116#if ! WIDE_CHAR_SUPPORT
01618498 2117
14473664 2118/* Map a string to the char class it names (if any). */
1fdab503 2119re_wctype_t
971de7fb 2120re_wctype (const re_char *str)
14473664 2121{
ada30c0e 2122 const char *string = str;
14473664
SM
2123 if (STREQ (string, "alnum")) return RECC_ALNUM;
2124 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2125 else if (STREQ (string, "word")) return RECC_WORD;
2126 else if (STREQ (string, "ascii")) return RECC_ASCII;
2127 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2128 else if (STREQ (string, "graph")) return RECC_GRAPH;
2129 else if (STREQ (string, "lower")) return RECC_LOWER;
2130 else if (STREQ (string, "print")) return RECC_PRINT;
2131 else if (STREQ (string, "punct")) return RECC_PUNCT;
2132 else if (STREQ (string, "space")) return RECC_SPACE;
2133 else if (STREQ (string, "upper")) return RECC_UPPER;
2134 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2135 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2136 else if (STREQ (string, "digit")) return RECC_DIGIT;
2137 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2138 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2139 else if (STREQ (string, "blank")) return RECC_BLANK;
2140 else return 0;
2141}
2142
e0f24100 2143/* True if CH is in the char class CC. */
1fdab503 2144boolean
971de7fb 2145re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2146{
2147 switch (cc)
2148 {
0cdd06f8
SM
2149 case RECC_ALNUM: return ISALNUM (ch);
2150 case RECC_ALPHA: return ISALPHA (ch);
2151 case RECC_BLANK: return ISBLANK (ch);
2152 case RECC_CNTRL: return ISCNTRL (ch);
2153 case RECC_DIGIT: return ISDIGIT (ch);
2154 case RECC_GRAPH: return ISGRAPH (ch);
2155 case RECC_LOWER: return ISLOWER (ch);
2156 case RECC_PRINT: return ISPRINT (ch);
2157 case RECC_PUNCT: return ISPUNCT (ch);
2158 case RECC_SPACE: return ISSPACE (ch);
2159 case RECC_UPPER: return ISUPPER (ch);
2160 case RECC_XDIGIT: return ISXDIGIT (ch);
2161 case RECC_ASCII: return IS_REAL_ASCII (ch);
2162 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2163 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2164 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2165 case RECC_WORD: return ISWORD (ch);
2166 case RECC_ERROR: return false;
2167 default:
2168 abort();
14473664
SM
2169 }
2170}
fa9a63c5 2171
14473664
SM
2172/* Return a bit-pattern to use in the range-table bits to match multibyte
2173 chars of class CC. */
2174static int
971de7fb 2175re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2176{
2177 switch (cc)
2178 {
2179 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2180 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2181 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2182 case RECC_LOWER: return BIT_LOWER;
2183 case RECC_UPPER: return BIT_UPPER;
2184 case RECC_PUNCT: return BIT_PUNCT;
2185 case RECC_SPACE: return BIT_SPACE;
14473664 2186 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2187 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2188 default:
2189 abort();
14473664
SM
2190 }
2191}
2192#endif
77d11aec
RS
2193\f
2194/* Filling in the work area of a range. */
2195
2196/* Actually extend the space in WORK_AREA. */
2197
2198static void
971de7fb 2199extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2200{
77d11aec
RS
2201 work_area->allocated += 16 * sizeof (int);
2202 if (work_area->table)
2203 work_area->table
2204 = (int *) realloc (work_area->table, work_area->allocated);
2205 else
2206 work_area->table
2207 = (int *) malloc (work_area->allocated);
2208}
2209
8f924df7 2210#if 0
77d11aec
RS
2211#ifdef emacs
2212
2213/* Carefully find the ranges of codes that are equivalent
2214 under case conversion to the range start..end when passed through
2215 TRANSLATE. Handle the case where non-letters can come in between
2216 two upper-case letters (which happens in Latin-1).
2217 Also handle the case of groups of more than 2 case-equivalent chars.
2218
2219 The basic method is to look at consecutive characters and see
2220 if they can form a run that can be handled as one.
2221
2222 Returns -1 if successful, REG_ESPACE if ran out of space. */
2223
2224static int
2225set_image_of_range_1 (work_area, start, end, translate)
2226 RE_TRANSLATE_TYPE translate;
2227 struct range_table_work_area *work_area;
2228 re_wchar_t start, end;
2229{
2230 /* `one_case' indicates a character, or a run of characters,
2231 each of which is an isolate (no case-equivalents).
2232 This includes all ASCII non-letters.
2233
2234 `two_case' indicates a character, or a run of characters,
2235 each of which has two case-equivalent forms.
2236 This includes all ASCII letters.
2237
2238 `strange' indicates a character that has more than one
2239 case-equivalent. */
177c0ea7 2240
77d11aec
RS
2241 enum case_type {one_case, two_case, strange};
2242
2243 /* Describe the run that is in progress,
2244 which the next character can try to extend.
2245 If run_type is strange, that means there really is no run.
2246 If run_type is one_case, then run_start...run_end is the run.
2247 If run_type is two_case, then the run is run_start...run_end,
2248 and the case-equivalents end at run_eqv_end. */
2249
2250 enum case_type run_type = strange;
2251 int run_start, run_end, run_eqv_end;
2252
2253 Lisp_Object eqv_table;
2254
2255 if (!RE_TRANSLATE_P (translate))
2256 {
b7c12565 2257 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2258 work_area->table[work_area->used++] = (start);
2259 work_area->table[work_area->used++] = (end);
b7c12565 2260 return -1;
77d11aec
RS
2261 }
2262
2263 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2264
77d11aec
RS
2265 for (; start <= end; start++)
2266 {
2267 enum case_type this_type;
2268 int eqv = RE_TRANSLATE (eqv_table, start);
2269 int minchar, maxchar;
2270
2271 /* Classify this character */
2272 if (eqv == start)
2273 this_type = one_case;
2274 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2275 this_type = two_case;
2276 else
2277 this_type = strange;
2278
2279 if (start < eqv)
2280 minchar = start, maxchar = eqv;
2281 else
2282 minchar = eqv, maxchar = start;
2283
2284 /* Can this character extend the run in progress? */
2285 if (this_type == strange || this_type != run_type
2286 || !(minchar == run_end + 1
2287 && (run_type == two_case
2288 ? maxchar == run_eqv_end + 1 : 1)))
2289 {
2290 /* No, end the run.
2291 Record each of its equivalent ranges. */
2292 if (run_type == one_case)
2293 {
2294 EXTEND_RANGE_TABLE (work_area, 2);
2295 work_area->table[work_area->used++] = run_start;
2296 work_area->table[work_area->used++] = run_end;
2297 }
2298 else if (run_type == two_case)
2299 {
2300 EXTEND_RANGE_TABLE (work_area, 4);
2301 work_area->table[work_area->used++] = run_start;
2302 work_area->table[work_area->used++] = run_end;
2303 work_area->table[work_area->used++]
2304 = RE_TRANSLATE (eqv_table, run_start);
2305 work_area->table[work_area->used++]
2306 = RE_TRANSLATE (eqv_table, run_end);
2307 }
2308 run_type = strange;
2309 }
177c0ea7 2310
77d11aec
RS
2311 if (this_type == strange)
2312 {
2313 /* For a strange character, add each of its equivalents, one
2314 by one. Don't start a range. */
2315 do
2316 {
2317 EXTEND_RANGE_TABLE (work_area, 2);
2318 work_area->table[work_area->used++] = eqv;
2319 work_area->table[work_area->used++] = eqv;
2320 eqv = RE_TRANSLATE (eqv_table, eqv);
2321 }
2322 while (eqv != start);
2323 }
2324
2325 /* Add this char to the run, or start a new run. */
2326 else if (run_type == strange)
2327 {
2328 /* Initialize a new range. */
2329 run_type = this_type;
2330 run_start = start;
2331 run_end = start;
2332 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2333 }
2334 else
2335 {
2336 /* Extend a running range. */
2337 run_end = minchar;
2338 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2339 }
2340 }
2341
2342 /* If a run is still in progress at the end, finish it now
2343 by recording its equivalent ranges. */
2344 if (run_type == one_case)
2345 {
2346 EXTEND_RANGE_TABLE (work_area, 2);
2347 work_area->table[work_area->used++] = run_start;
2348 work_area->table[work_area->used++] = run_end;
2349 }
2350 else if (run_type == two_case)
2351 {
2352 EXTEND_RANGE_TABLE (work_area, 4);
2353 work_area->table[work_area->used++] = run_start;
2354 work_area->table[work_area->used++] = run_end;
2355 work_area->table[work_area->used++]
2356 = RE_TRANSLATE (eqv_table, run_start);
2357 work_area->table[work_area->used++]
2358 = RE_TRANSLATE (eqv_table, run_end);
2359 }
2360
2361 return -1;
2362}
36595814 2363
77d11aec 2364#endif /* emacs */
36595814 2365
2b34df4e 2366/* Record the image of the range start..end when passed through
36595814
SM
2367 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2368 and is not even necessarily contiguous.
b7c12565
RS
2369 Normally we approximate it with the smallest contiguous range that contains
2370 all the chars we need. However, for Latin-1 we go to extra effort
2371 to do a better job.
2372
2373 This function is not called for ASCII ranges.
77d11aec
RS
2374
2375 Returns -1 if successful, REG_ESPACE if ran out of space. */
2376
2377static int
36595814
SM
2378set_image_of_range (work_area, start, end, translate)
2379 RE_TRANSLATE_TYPE translate;
2380 struct range_table_work_area *work_area;
2381 re_wchar_t start, end;
2382{
77d11aec
RS
2383 re_wchar_t cmin, cmax;
2384
2385#ifdef emacs
2386 /* For Latin-1 ranges, use set_image_of_range_1
2387 to get proper handling of ranges that include letters and nonletters.
b7c12565 2388 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2389 For other character sets, we don't bother to get this right. */
b7c12565
RS
2390 if (RE_TRANSLATE_P (translate) && start < 04400
2391 && !(start < 04200 && end >= 04377))
77d11aec 2392 {
b7c12565 2393 int newend;
77d11aec 2394 int tem;
b7c12565
RS
2395 newend = end;
2396 if (newend > 04377)
2397 newend = 04377;
2398 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2399 if (tem > 0)
2400 return tem;
2401
2402 start = 04400;
2403 if (end < 04400)
2404 return -1;
2405 }
2406#endif
2407
b7c12565
RS
2408 EXTEND_RANGE_TABLE (work_area, 2);
2409 work_area->table[work_area->used++] = (start);
2410 work_area->table[work_area->used++] = (end);
2411
2412 cmin = -1, cmax = -1;
77d11aec 2413
36595814 2414 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2415 {
2416 int ch;
77d11aec 2417
b7c12565
RS
2418 for (ch = start; ch <= end; ch++)
2419 {
2420 re_wchar_t c = TRANSLATE (ch);
2421 if (! (start <= c && c <= end))
2422 {
2423 if (cmin == -1)
2424 cmin = c, cmax = c;
2425 else
2426 {
2427 cmin = MIN (cmin, c);
2428 cmax = MAX (cmax, c);
2429 }
2430 }
2431 }
2432
2433 if (cmin != -1)
2434 {
2435 EXTEND_RANGE_TABLE (work_area, 2);
2436 work_area->table[work_area->used++] = (cmin);
2437 work_area->table[work_area->used++] = (cmax);
2438 }
2439 }
36595814 2440
77d11aec
RS
2441 return -1;
2442}
8f924df7 2443#endif /* 0 */
fa9a63c5
RM
2444\f
2445#ifndef MATCH_MAY_ALLOCATE
2446
2447/* If we cannot allocate large objects within re_match_2_internal,
2448 we make the fail stack and register vectors global.
2449 The fail stack, we grow to the maximum size when a regexp
2450 is compiled.
2451 The register vectors, we adjust in size each time we
2452 compile a regexp, according to the number of registers it needs. */
2453
2454static fail_stack_type fail_stack;
2455
2456/* Size with which the following vectors are currently allocated.
2457 That is so we can make them bigger as needed,
4bb91c68 2458 but never make them smaller. */
fa9a63c5
RM
2459static int regs_allocated_size;
2460
66f0296e
SM
2461static re_char ** regstart, ** regend;
2462static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2463
2464/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2465 but don't make them smaller. */
fa9a63c5
RM
2466
2467static
2468regex_grow_registers (num_regs)
2469 int num_regs;
2470{
2471 if (num_regs > regs_allocated_size)
2472 {
66f0296e
SM
2473 RETALLOC_IF (regstart, num_regs, re_char *);
2474 RETALLOC_IF (regend, num_regs, re_char *);
2475 RETALLOC_IF (best_regstart, num_regs, re_char *);
2476 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2477
2478 regs_allocated_size = num_regs;
2479 }
2480}
2481
2482#endif /* not MATCH_MAY_ALLOCATE */
2483\f
99633e97
SM
2484static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2485 compile_stack,
2486 regnum_t regnum));
2487
fa9a63c5
RM
2488/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2489 Returns one of error codes defined in `regex.h', or zero for success.
2490
2491 Assumes the `allocated' (and perhaps `buffer') and `translate'
2492 fields are set in BUFP on entry.
2493
2494 If it succeeds, results are put in BUFP (if it returns an error, the
2495 contents of BUFP are undefined):
2496 `buffer' is the compiled pattern;
2497 `syntax' is set to SYNTAX;
2498 `used' is set to the length of the compiled pattern;
2499 `fastmap_accurate' is zero;
2500 `re_nsub' is the number of subexpressions in PATTERN;
2501 `not_bol' and `not_eol' are zero;
5e69f11e 2502
c0f9ea08 2503 The `fastmap' field is neither examined nor set. */
fa9a63c5 2504
505bde11
SM
2505/* Insert the `jump' from the end of last alternative to "here".
2506 The space for the jump has already been allocated. */
2507#define FIXUP_ALT_JUMP() \
2508do { \
2509 if (fixup_alt_jump) \
2510 STORE_JUMP (jump, fixup_alt_jump, b); \
2511} while (0)
2512
2513
fa9a63c5
RM
2514/* Return, freeing storage we allocated. */
2515#define FREE_STACK_RETURN(value) \
b18215fc
RS
2516 do { \
2517 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2518 free (compile_stack.stack); \
2519 return value; \
2520 } while (0)
fa9a63c5
RM
2521
2522static reg_errcode_t
971de7fb 2523regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2524{
01618498
SM
2525 /* We fetch characters from PATTERN here. */
2526 register re_wchar_t c, c1;
5e69f11e 2527
fa9a63c5 2528 /* A random temporary spot in PATTERN. */
66f0296e 2529 re_char *p1;
fa9a63c5
RM
2530
2531 /* Points to the end of the buffer, where we should append. */
2532 register unsigned char *b;
5e69f11e 2533
fa9a63c5
RM
2534 /* Keeps track of unclosed groups. */
2535 compile_stack_type compile_stack;
2536
2537 /* Points to the current (ending) position in the pattern. */
22336245
RS
2538#ifdef AIX
2539 /* `const' makes AIX compiler fail. */
66f0296e 2540 unsigned char *p = pattern;
22336245 2541#else
66f0296e 2542 re_char *p = pattern;
22336245 2543#endif
66f0296e 2544 re_char *pend = pattern + size;
5e69f11e 2545
fa9a63c5 2546 /* How to translate the characters in the pattern. */
6676cb1c 2547 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2548
2549 /* Address of the count-byte of the most recently inserted `exactn'
2550 command. This makes it possible to tell if a new exact-match
2551 character can be added to that command or if the character requires
2552 a new `exactn' command. */
2553 unsigned char *pending_exact = 0;
2554
2555 /* Address of start of the most recently finished expression.
2556 This tells, e.g., postfix * where to find the start of its
2557 operand. Reset at the beginning of groups and alternatives. */
2558 unsigned char *laststart = 0;
2559
2560 /* Address of beginning of regexp, or inside of last group. */
2561 unsigned char *begalt;
2562
2563 /* Place in the uncompiled pattern (i.e., the {) to
2564 which to go back if the interval is invalid. */
66f0296e 2565 re_char *beg_interval;
5e69f11e 2566
fa9a63c5 2567 /* Address of the place where a forward jump should go to the end of
7814e705 2568 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2569 last -- ends with a forward jump of this sort. */
2570 unsigned char *fixup_alt_jump = 0;
2571
b18215fc
RS
2572 /* Work area for range table of charset. */
2573 struct range_table_work_area range_table_work;
2574
2d1675e4
SM
2575 /* If the object matched can contain multibyte characters. */
2576 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2577
8f924df7 2578 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2579 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2580
f9b0fd99
RS
2581 /* Nonzero if we have pushed down into a subpattern. */
2582 int in_subpattern = 0;
2583
2584 /* These hold the values of p, pattern, and pend from the main
2585 pattern when we have pushed into a subpattern. */
2586 re_char *main_p;
2587 re_char *main_pattern;
2588 re_char *main_pend;
2589
fa9a63c5 2590#ifdef DEBUG
99633e97 2591 debug++;
fa9a63c5 2592 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2593 if (debug > 0)
fa9a63c5
RM
2594 {
2595 unsigned debug_count;
5e69f11e 2596
fa9a63c5 2597 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2598 putchar (pattern[debug_count]);
fa9a63c5
RM
2599 putchar ('\n');
2600 }
2601#endif /* DEBUG */
2602
2603 /* Initialize the compile stack. */
2604 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2605 if (compile_stack.stack == NULL)
2606 return REG_ESPACE;
2607
2608 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2609 compile_stack.avail = 0;
2610
b18215fc
RS
2611 range_table_work.table = 0;
2612 range_table_work.allocated = 0;
2613
fa9a63c5
RM
2614 /* Initialize the pattern buffer. */
2615 bufp->syntax = syntax;
2616 bufp->fastmap_accurate = 0;
2617 bufp->not_bol = bufp->not_eol = 0;
6224b623 2618 bufp->used_syntax = 0;
fa9a63c5
RM
2619
2620 /* Set `used' to zero, so that if we return an error, the pattern
2621 printer (for debugging) will think there's no pattern. We reset it
2622 at the end. */
2623 bufp->used = 0;
5e69f11e 2624
fa9a63c5 2625 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2626 bufp->re_nsub = 0;
fa9a63c5 2627
0b32bf0e 2628#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2629 /* Initialize the syntax table. */
2630 init_syntax_once ();
2631#endif
2632
2633 if (bufp->allocated == 0)
2634 {
2635 if (bufp->buffer)
2636 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2637 enough space. This loses if buffer's address is bogus, but
7814e705 2638 that is the user's responsibility. */
25fe55af
RS
2639 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2640 }
fa9a63c5 2641 else
7814e705 2642 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2643 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2644 }
fa9a63c5
RM
2645 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2646
2647 bufp->allocated = INIT_BUF_SIZE;
2648 }
2649
2650 begalt = b = bufp->buffer;
2651
2652 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2653 while (1)
fa9a63c5 2654 {
f9b0fd99
RS
2655 if (p == pend)
2656 {
2657 /* If this is the end of an included regexp,
2658 pop back to the main regexp and try again. */
2659 if (in_subpattern)
2660 {
2661 in_subpattern = 0;
2662 pattern = main_pattern;
2663 p = main_p;
2664 pend = main_pend;
2665 continue;
2666 }
2667 /* If this is the end of the main regexp, we are done. */
2668 break;
2669 }
2670
fa9a63c5
RM
2671 PATFETCH (c);
2672
2673 switch (c)
25fe55af 2674 {
f9b0fd99
RS
2675 case ' ':
2676 {
2677 re_char *p1 = p;
2678
2679 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2680 spaces normally. And don't try to do this recursively. */
2681 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2682 goto normal_char;
2683
2684 /* Peek past following spaces. */
2685 while (p1 != pend)
2686 {
2687 if (*p1 != ' ')
2688 break;
2689 p1++;
2690 }
2691 /* If the spaces are followed by a repetition op,
2692 treat them normally. */
c721eee5
RS
2693 if (p1 != pend
2694 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2695 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2696 goto normal_char;
2697
2698 /* Replace the spaces with the whitespace regexp. */
2699 in_subpattern = 1;
2700 main_p = p1;
2701 main_pend = pend;
2702 main_pattern = pattern;
2703 p = pattern = whitespace_regexp;
2704 pend = p + strlen (p);
2705 break;
7814e705 2706 }
f9b0fd99 2707
25fe55af
RS
2708 case '^':
2709 {
7814e705 2710 if ( /* If at start of pattern, it's an operator. */
25fe55af 2711 p == pattern + 1
7814e705 2712 /* If context independent, it's an operator. */
25fe55af 2713 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2714 /* Otherwise, depends on what's come before. */
25fe55af 2715 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2716 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2717 else
2718 goto normal_char;
2719 }
2720 break;
2721
2722
2723 case '$':
2724 {
2725 if ( /* If at end of pattern, it's an operator. */
2726 p == pend
7814e705 2727 /* If context independent, it's an operator. */
25fe55af
RS
2728 || syntax & RE_CONTEXT_INDEP_ANCHORS
2729 /* Otherwise, depends on what's next. */
2730 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2731 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2732 else
2733 goto normal_char;
2734 }
2735 break;
fa9a63c5
RM
2736
2737
2738 case '+':
25fe55af
RS
2739 case '?':
2740 if ((syntax & RE_BK_PLUS_QM)
2741 || (syntax & RE_LIMITED_OPS))
2742 goto normal_char;
2743 handle_plus:
2744 case '*':
2745 /* If there is no previous pattern... */
2746 if (!laststart)
2747 {
2748 if (syntax & RE_CONTEXT_INVALID_OPS)
2749 FREE_STACK_RETURN (REG_BADRPT);
2750 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2751 goto normal_char;
2752 }
2753
2754 {
7814e705 2755 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2756 boolean zero_times_ok = 0, many_times_ok = 0;
2757 boolean greedy = 1;
25fe55af
RS
2758
2759 /* If there is a sequence of repetition chars, collapse it
2760 down to just one (the right one). We can't combine
2761 interval operators with these because of, e.g., `a{2}*',
7814e705 2762 which should only match an even number of `a's. */
25fe55af
RS
2763
2764 for (;;)
2765 {
0b32bf0e 2766 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2767 && c == '?' && (zero_times_ok || many_times_ok))
2768 greedy = 0;
2769 else
2770 {
2771 zero_times_ok |= c != '+';
2772 many_times_ok |= c != '?';
2773 }
25fe55af
RS
2774
2775 if (p == pend)
2776 break;
ed0767d8
SM
2777 else if (*p == '*'
2778 || (!(syntax & RE_BK_PLUS_QM)
2779 && (*p == '+' || *p == '?')))
25fe55af 2780 ;
ed0767d8 2781 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2782 {
ed0767d8
SM
2783 if (p+1 == pend)
2784 FREE_STACK_RETURN (REG_EESCAPE);
2785 if (p[1] == '+' || p[1] == '?')
2786 PATFETCH (c); /* Gobble up the backslash. */
2787 else
2788 break;
25fe55af
RS
2789 }
2790 else
ed0767d8 2791 break;
25fe55af 2792 /* If we get here, we found another repeat character. */
ed0767d8
SM
2793 PATFETCH (c);
2794 }
25fe55af
RS
2795
2796 /* Star, etc. applied to an empty pattern is equivalent
2797 to an empty pattern. */
4e8a9132 2798 if (!laststart || laststart == b)
25fe55af
RS
2799 break;
2800
2801 /* Now we know whether or not zero matches is allowed
7814e705 2802 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2803 if (greedy)
2804 {
99633e97 2805 if (many_times_ok)
4e8a9132
SM
2806 {
2807 boolean simple = skip_one_char (laststart) == b;
2808 unsigned int startoffset = 0;
f6a3f532 2809 re_opcode_t ofj =
01618498 2810 /* Check if the loop can match the empty string. */
6df42991
SM
2811 (simple || !analyse_first (laststart, b, NULL, 0))
2812 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2813 assert (skip_one_char (laststart) <= b);
177c0ea7 2814
4e8a9132
SM
2815 if (!zero_times_ok && simple)
2816 { /* Since simple * loops can be made faster by using
2817 on_failure_keep_string_jump, we turn simple P+
2818 into PP* if P is simple. */
2819 unsigned char *p1, *p2;
2820 startoffset = b - laststart;
2821 GET_BUFFER_SPACE (startoffset);
2822 p1 = b; p2 = laststart;
2823 while (p2 < p1)
2824 *b++ = *p2++;
2825 zero_times_ok = 1;
99633e97 2826 }
4e8a9132
SM
2827
2828 GET_BUFFER_SPACE (6);
2829 if (!zero_times_ok)
2830 /* A + loop. */
f6a3f532 2831 STORE_JUMP (ofj, b, b + 6);
99633e97 2832 else
4e8a9132
SM
2833 /* Simple * loops can use on_failure_keep_string_jump
2834 depending on what follows. But since we don't know
2835 that yet, we leave the decision up to
2836 on_failure_jump_smart. */
f6a3f532 2837 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2838 laststart + startoffset, b + 6);
99633e97 2839 b += 3;
4e8a9132 2840 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2841 b += 3;
2842 }
2843 else
2844 {
4e8a9132
SM
2845 /* A simple ? pattern. */
2846 assert (zero_times_ok);
2847 GET_BUFFER_SPACE (3);
2848 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2849 b += 3;
2850 }
1c8c6d39
DL
2851 }
2852 else /* not greedy */
2853 { /* I wish the greedy and non-greedy cases could be merged. */
2854
0683b6fa 2855 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2856 if (many_times_ok)
2857 {
f6a3f532
SM
2858 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2859
6df42991
SM
2860 /* The non-greedy multiple match looks like
2861 a repeat..until: we only need a conditional jump
2862 at the end of the loop. */
f6a3f532
SM
2863 if (emptyp) BUF_PUSH (no_op);
2864 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2865 : on_failure_jump, b, laststart);
1c8c6d39
DL
2866 b += 3;
2867 if (zero_times_ok)
2868 {
2869 /* The repeat...until naturally matches one or more.
2870 To also match zero times, we need to first jump to
6df42991 2871 the end of the loop (its conditional jump). */
1c8c6d39
DL
2872 INSERT_JUMP (jump, laststart, b);
2873 b += 3;
2874 }
2875 }
2876 else
2877 {
2878 /* non-greedy a?? */
1c8c6d39
DL
2879 INSERT_JUMP (jump, laststart, b + 3);
2880 b += 3;
2881 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2882 b += 3;
2883 }
2884 }
2885 }
4e8a9132 2886 pending_exact = 0;
fa9a63c5
RM
2887 break;
2888
2889
2890 case '.':
25fe55af
RS
2891 laststart = b;
2892 BUF_PUSH (anychar);
2893 break;
fa9a63c5
RM
2894
2895
25fe55af
RS
2896 case '[':
2897 {
b18215fc 2898 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2899
25fe55af 2900 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2901
25fe55af
RS
2902 /* Ensure that we have enough space to push a charset: the
2903 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2904 GET_BUFFER_SPACE (34);
2905
25fe55af 2906 laststart = b;
e318085a 2907
25fe55af 2908 /* We test `*p == '^' twice, instead of using an if
7814e705 2909 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2910 BUF_PUSH (*p == '^' ? charset_not : charset);
2911 if (*p == '^')
2912 p++;
e318085a 2913
25fe55af
RS
2914 /* Remember the first position in the bracket expression. */
2915 p1 = p;
e318085a 2916
7814e705 2917 /* Push the number of bytes in the bitmap. */
25fe55af 2918 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2919
25fe55af 2920 /* Clear the whole map. */
72af86bd 2921 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2922
25fe55af
RS
2923 /* charset_not matches newline according to a syntax bit. */
2924 if ((re_opcode_t) b[-2] == charset_not
2925 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2926 SET_LIST_BIT ('\n');
fa9a63c5 2927
7814e705 2928 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2929 for (;;)
2930 {
b18215fc 2931 boolean escaped_char = false;
2d1675e4 2932 const unsigned char *p2 = p;
cf9c99bc 2933 re_wchar_t ch, c2;
e318085a 2934
25fe55af 2935 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2936
36595814
SM
2937 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2938 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2939 So the translation is done later in a loop. Example:
2940 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2941 PATFETCH (c);
e318085a 2942
25fe55af
RS
2943 /* \ might escape characters inside [...] and [^...]. */
2944 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2945 {
2946 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2947
2948 PATFETCH (c);
b18215fc 2949 escaped_char = true;
25fe55af 2950 }
b18215fc
RS
2951 else
2952 {
7814e705 2953 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2954 not (i.e., when the bracket expression is `[]' so
2955 far), the ']' character bit gets set way below. */
2d1675e4 2956 if (c == ']' && p2 != p1)
657fcfbd 2957 break;
25fe55af 2958 }
b18215fc 2959
25fe55af
RS
2960 /* See if we're at the beginning of a possible character
2961 class. */
b18215fc 2962
2d1675e4
SM
2963 if (!escaped_char &&
2964 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2965 {
7814e705 2966 /* Leave room for the null. */
14473664 2967 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2968 const unsigned char *class_beg;
b18215fc 2969
25fe55af
RS
2970 PATFETCH (c);
2971 c1 = 0;
ed0767d8 2972 class_beg = p;
b18215fc 2973
25fe55af
RS
2974 /* If pattern is `[[:'. */
2975 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2976
25fe55af
RS
2977 for (;;)
2978 {
14473664
SM
2979 PATFETCH (c);
2980 if ((c == ':' && *p == ']') || p == pend)
2981 break;
2982 if (c1 < CHAR_CLASS_MAX_LENGTH)
2983 str[c1++] = c;
2984 else
2985 /* This is in any case an invalid class name. */
2986 str[0] = '\0';
25fe55af
RS
2987 }
2988 str[c1] = '\0';
b18215fc
RS
2989
2990 /* If isn't a word bracketed by `[:' and `:]':
2991 undo the ending character, the letters, and
2992 leave the leading `:' and `[' (but set bits for
2993 them). */
25fe55af
RS
2994 if (c == ':' && *p == ']')
2995 {
14473664 2996 re_wctype_t cc;
8f924df7 2997 int limit;
14473664
SM
2998
2999 cc = re_wctype (str);
3000
3001 if (cc == 0)
fa9a63c5
RM
3002 FREE_STACK_RETURN (REG_ECTYPE);
3003
14473664
SM
3004 /* Throw away the ] at the end of the character
3005 class. */
3006 PATFETCH (c);
fa9a63c5 3007
14473664 3008 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 3009
cf9c99bc
KH
3010#ifndef emacs
3011 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
3012 if (re_iswctype (btowc (ch), cc))
3013 {
3014 c = TRANSLATE (ch);
ed00c2ac
KH
3015 if (c < (1 << BYTEWIDTH))
3016 SET_LIST_BIT (c);
8f924df7 3017 }
cf9c99bc
KH
3018#else /* emacs */
3019 /* Most character classes in a multibyte match
3020 just set a flag. Exceptions are is_blank,
3021 is_digit, is_cntrl, and is_xdigit, since
3022 they can only match ASCII characters. We
3023 don't need to handle them for multibyte.
3024 They are distinguished by a negative wctype. */
96cc36cc 3025
254c06a8
SM
3026 /* Setup the gl_state object to its buffer-defined
3027 value. This hardcodes the buffer-global
3028 syntax-table for ASCII chars, while the other chars
3029 will obey syntax-table properties. It's not ideal,
3030 but it's the way it's been done until now. */
d48cd3f4 3031 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 3032
cf9c99bc 3033 for (ch = 0; ch < 256; ++ch)
25fe55af 3034 {
cf9c99bc
KH
3035 c = RE_CHAR_TO_MULTIBYTE (ch);
3036 if (! CHAR_BYTE8_P (c)
3037 && re_iswctype (c, cc))
8f924df7 3038 {
cf9c99bc
KH
3039 SET_LIST_BIT (ch);
3040 c1 = TRANSLATE (c);
3041 if (c1 == c)
3042 continue;
3043 if (ASCII_CHAR_P (c1))
3044 SET_LIST_BIT (c1);
3045 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3046 SET_LIST_BIT (c1);
8f924df7 3047 }
25fe55af 3048 }
cf9c99bc
KH
3049 SET_RANGE_TABLE_WORK_AREA_BIT
3050 (range_table_work, re_wctype_to_bit (cc));
3051#endif /* emacs */
6224b623
SM
3052 /* In most cases the matching rule for char classes
3053 only uses the syntax table for multibyte chars,
3054 so that the content of the syntax-table it is not
3055 hardcoded in the range_table. SPACE and WORD are
3056 the two exceptions. */
3057 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3058 bufp->used_syntax = 1;
3059
b18215fc
RS
3060 /* Repeat the loop. */
3061 continue;
25fe55af
RS
3062 }
3063 else
3064 {
ed0767d8
SM
3065 /* Go back to right after the "[:". */
3066 p = class_beg;
25fe55af 3067 SET_LIST_BIT ('[');
b18215fc
RS
3068
3069 /* Because the `:' may starts the range, we
3070 can't simply set bit and repeat the loop.
7814e705 3071 Instead, just set it to C and handle below. */
b18215fc 3072 c = ':';
25fe55af
RS
3073 }
3074 }
b18215fc
RS
3075
3076 if (p < pend && p[0] == '-' && p[1] != ']')
3077 {
3078
3079 /* Discard the `-'. */
3080 PATFETCH (c1);
3081
3082 /* Fetch the character which ends the range. */
3083 PATFETCH (c1);
cf9c99bc
KH
3084#ifdef emacs
3085 if (CHAR_BYTE8_P (c1)
3086 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3087 /* Treat the range from a multibyte character to
3088 raw-byte character as empty. */
3089 c = c1 + 1;
3090#endif /* emacs */
e318085a 3091 }
25fe55af 3092 else
b18215fc
RS
3093 /* Range from C to C. */
3094 c1 = c;
3095
cf9c99bc 3096 if (c > c1)
25fe55af 3097 {
cf9c99bc
KH
3098 if (syntax & RE_NO_EMPTY_RANGES)
3099 FREE_STACK_RETURN (REG_ERANGEX);
3100 /* Else, repeat the loop. */
bf216479 3101 }
6fdd04b0 3102 else
25fe55af 3103 {
cf9c99bc
KH
3104#ifndef emacs
3105 /* Set the range into bitmap */
8f924df7 3106 for (; c <= c1; c++)
b18215fc 3107 {
cf9c99bc
KH
3108 ch = TRANSLATE (c);
3109 if (ch < (1 << BYTEWIDTH))
3110 SET_LIST_BIT (ch);
3111 }
3112#else /* emacs */
3113 if (c < 128)
3114 {
3115 ch = MIN (127, c1);
3116 SETUP_ASCII_RANGE (range_table_work, c, ch);
3117 c = ch + 1;
3118 if (CHAR_BYTE8_P (c1))
3119 c = BYTE8_TO_CHAR (128);
3120 }
3121 if (c <= c1)
3122 {
3123 if (CHAR_BYTE8_P (c))
3124 {
3125 c = CHAR_TO_BYTE8 (c);
3126 c1 = CHAR_TO_BYTE8 (c1);
3127 for (; c <= c1; c++)
3128 SET_LIST_BIT (c);
3129 }
3130 else if (multibyte)
3131 {
3132 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3133 }
3134 else
3135 {
3136 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3137 }
e934739e 3138 }
cf9c99bc 3139#endif /* emacs */
25fe55af 3140 }
e318085a
RS
3141 }
3142
25fe55af 3143 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3144 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3145 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3146 b[-1]--;
3147 b += b[-1];
fa9a63c5 3148
96cc36cc
RS
3149 /* Build real range table from work area. */
3150 if (RANGE_TABLE_WORK_USED (range_table_work)
3151 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3152 {
3153 int i;
3154 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3155
b18215fc 3156 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3157 bytes for flags, two for COUNT, and three bytes for
3158 each character. */
3159 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3160
b18215fc
RS
3161 /* Indicate the existence of range table. */
3162 laststart[1] |= 0x80;
fa9a63c5 3163
96cc36cc
RS
3164 /* Store the character class flag bits into the range table.
3165 If not in emacs, these flag bits are always 0. */
3166 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3167 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3168
b18215fc
RS
3169 STORE_NUMBER_AND_INCR (b, used / 2);
3170 for (i = 0; i < used; i++)
3171 STORE_CHARACTER_AND_INCR
3172 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3173 }
25fe55af
RS
3174 }
3175 break;
fa9a63c5
RM
3176
3177
b18215fc 3178 case '(':
25fe55af
RS
3179 if (syntax & RE_NO_BK_PARENS)
3180 goto handle_open;
3181 else
3182 goto normal_char;
fa9a63c5
RM
3183
3184
25fe55af
RS
3185 case ')':
3186 if (syntax & RE_NO_BK_PARENS)
3187 goto handle_close;
3188 else
3189 goto normal_char;
e318085a
RS
3190
3191
25fe55af
RS
3192 case '\n':
3193 if (syntax & RE_NEWLINE_ALT)
3194 goto handle_alt;
3195 else
3196 goto normal_char;
e318085a
RS
3197
3198
b18215fc 3199 case '|':
25fe55af
RS
3200 if (syntax & RE_NO_BK_VBAR)
3201 goto handle_alt;
3202 else
3203 goto normal_char;
3204
3205
3206 case '{':
3207 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3208 goto handle_interval;
3209 else
3210 goto normal_char;
3211
3212
3213 case '\\':
3214 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3215
3216 /* Do not translate the character after the \, so that we can
3217 distinguish, e.g., \B from \b, even if we normally would
3218 translate, e.g., B to b. */
36595814 3219 PATFETCH (c);
25fe55af
RS
3220
3221 switch (c)
3222 {
3223 case '(':
3224 if (syntax & RE_NO_BK_PARENS)
3225 goto normal_backslash;
3226
3227 handle_open:
505bde11
SM
3228 {
3229 int shy = 0;
c69b0314 3230 regnum_t regnum = 0;
505bde11
SM
3231 if (p+1 < pend)
3232 {
3233 /* Look for a special (?...) construct */
ed0767d8 3234 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3235 {
ed0767d8 3236 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3237 while (!shy)
505bde11 3238 {
c69b0314
SM
3239 PATFETCH (c);
3240 switch (c)
3241 {
3242 case ':': shy = 1; break;
3243 case '0':
3244 /* An explicitly specified regnum must start
3245 with non-0. */
3246 if (regnum == 0)
3247 FREE_STACK_RETURN (REG_BADPAT);
3248 case '1': case '2': case '3': case '4':
3249 case '5': case '6': case '7': case '8': case '9':
3250 regnum = 10*regnum + (c - '0'); break;
3251 default:
3252 /* Only (?:...) is supported right now. */
3253 FREE_STACK_RETURN (REG_BADPAT);
3254 }
505bde11
SM
3255 }
3256 }
505bde11
SM
3257 }
3258
3259 if (!shy)
c69b0314
SM
3260 regnum = ++bufp->re_nsub;
3261 else if (regnum)
3262 { /* It's actually not shy, but explicitly numbered. */
3263 shy = 0;
3264 if (regnum > bufp->re_nsub)
3265 bufp->re_nsub = regnum;
3266 else if (regnum > bufp->re_nsub
3267 /* Ideally, we'd want to check that the specified
3268 group can't have matched (i.e. all subgroups
3269 using the same regnum are in other branches of
3270 OR patterns), but we don't currently keep track
3271 of enough info to do that easily. */
3272 || group_in_compile_stack (compile_stack, regnum))
3273 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3274 }
c69b0314
SM
3275 else
3276 /* It's really shy. */
3277 regnum = - bufp->re_nsub;
25fe55af 3278
99633e97
SM
3279 if (COMPILE_STACK_FULL)
3280 {
3281 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3282 compile_stack_elt_t);
3283 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3284
99633e97
SM
3285 compile_stack.size <<= 1;
3286 }
25fe55af 3287
99633e97 3288 /* These are the values to restore when we hit end of this
7814e705 3289 group. They are all relative offsets, so that if the
99633e97
SM
3290 whole pattern moves because of realloc, they will still
3291 be valid. */
3292 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3293 COMPILE_STACK_TOP.fixup_alt_jump
3294 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3295 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3296 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3297
c69b0314
SM
3298 /* Do not push a start_memory for groups beyond the last one
3299 we can represent in the compiled pattern. */
3300 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3301 BUF_PUSH_2 (start_memory, regnum);
3302
3303 compile_stack.avail++;
3304
3305 fixup_alt_jump = 0;
3306 laststart = 0;
3307 begalt = b;
3308 /* If we've reached MAX_REGNUM groups, then this open
3309 won't actually generate any code, so we'll have to
3310 clear pending_exact explicitly. */
3311 pending_exact = 0;
3312 break;
505bde11 3313 }
25fe55af
RS
3314
3315 case ')':
3316 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3317
3318 if (COMPILE_STACK_EMPTY)
505bde11
SM
3319 {
3320 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3321 goto normal_backslash;
3322 else
3323 FREE_STACK_RETURN (REG_ERPAREN);
3324 }
25fe55af
RS
3325
3326 handle_close:
505bde11 3327 FIXUP_ALT_JUMP ();
25fe55af
RS
3328
3329 /* See similar code for backslashed left paren above. */
3330 if (COMPILE_STACK_EMPTY)
505bde11
SM
3331 {
3332 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3333 goto normal_char;
3334 else
3335 FREE_STACK_RETURN (REG_ERPAREN);
3336 }
25fe55af
RS
3337
3338 /* Since we just checked for an empty stack above, this
3339 ``can't happen''. */
3340 assert (compile_stack.avail != 0);
3341 {
3342 /* We don't just want to restore into `regnum', because
3343 later groups should continue to be numbered higher,
7814e705 3344 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3345 regnum_t regnum;
25fe55af
RS
3346
3347 compile_stack.avail--;
3348 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3349 fixup_alt_jump
3350 = COMPILE_STACK_TOP.fixup_alt_jump
3351 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3352 : 0;
3353 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3354 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3355 /* If we've reached MAX_REGNUM groups, then this open
3356 won't actually generate any code, so we'll have to
3357 clear pending_exact explicitly. */
3358 pending_exact = 0;
e318085a 3359
25fe55af 3360 /* We're at the end of the group, so now we know how many
7814e705 3361 groups were inside this one. */
c69b0314
SM
3362 if (regnum <= MAX_REGNUM && regnum > 0)
3363 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3364 }
3365 break;
3366
3367
3368 case '|': /* `\|'. */
3369 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3370 goto normal_backslash;
3371 handle_alt:
3372 if (syntax & RE_LIMITED_OPS)
3373 goto normal_char;
3374
3375 /* Insert before the previous alternative a jump which
7814e705 3376 jumps to this alternative if the former fails. */
25fe55af
RS
3377 GET_BUFFER_SPACE (3);
3378 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3379 pending_exact = 0;
3380 b += 3;
3381
3382 /* The alternative before this one has a jump after it
3383 which gets executed if it gets matched. Adjust that
3384 jump so it will jump to this alternative's analogous
3385 jump (put in below, which in turn will jump to the next
3386 (if any) alternative's such jump, etc.). The last such
3387 jump jumps to the correct final destination. A picture:
3388 _____ _____
3389 | | | |
3390 | v | v
3391 a | b | c
3392
3393 If we are at `b', then fixup_alt_jump right now points to a
3394 three-byte space after `a'. We'll put in the jump, set
3395 fixup_alt_jump to right after `b', and leave behind three
3396 bytes which we'll fill in when we get to after `c'. */
3397
505bde11 3398 FIXUP_ALT_JUMP ();
25fe55af
RS
3399
3400 /* Mark and leave space for a jump after this alternative,
3401 to be filled in later either by next alternative or
3402 when know we're at the end of a series of alternatives. */
3403 fixup_alt_jump = b;
3404 GET_BUFFER_SPACE (3);
3405 b += 3;
3406
3407 laststart = 0;
3408 begalt = b;
3409 break;
3410
3411
3412 case '{':
3413 /* If \{ is a literal. */
3414 if (!(syntax & RE_INTERVALS)
3415 /* If we're at `\{' and it's not the open-interval
3416 operator. */
4bb91c68 3417 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3418 goto normal_backslash;
3419
3420 handle_interval:
3421 {
3422 /* If got here, then the syntax allows intervals. */
3423
3424 /* At least (most) this many matches must be made. */
99633e97 3425 int lower_bound = 0, upper_bound = -1;
25fe55af 3426
ed0767d8 3427 beg_interval = p;
25fe55af 3428
25fe55af
RS
3429 GET_UNSIGNED_NUMBER (lower_bound);
3430
3431 if (c == ',')
ed0767d8 3432 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3433 else
3434 /* Interval such as `{1}' => match exactly once. */
3435 upper_bound = lower_bound;
3436
3437 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3438 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3439 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3440
3441 if (!(syntax & RE_NO_BK_BRACES))
3442 {
4bb91c68
SM
3443 if (c != '\\')
3444 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3445 if (p == pend)
3446 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3447 PATFETCH (c);
3448 }
3449
3450 if (c != '}')
4bb91c68 3451 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3452
3453 /* We just parsed a valid interval. */
3454
3455 /* If it's invalid to have no preceding re. */
3456 if (!laststart)
3457 {
3458 if (syntax & RE_CONTEXT_INVALID_OPS)
3459 FREE_STACK_RETURN (REG_BADRPT);
3460 else if (syntax & RE_CONTEXT_INDEP_OPS)
3461 laststart = b;
3462 else
3463 goto unfetch_interval;
3464 }
3465
6df42991
SM
3466 if (upper_bound == 0)
3467 /* If the upper bound is zero, just drop the sub pattern
3468 altogether. */
3469 b = laststart;
3470 else if (lower_bound == 1 && upper_bound == 1)
3471 /* Just match it once: nothing to do here. */
3472 ;
3473
3474 /* Otherwise, we have a nontrivial interval. When
3475 we're all done, the pattern will look like:
3476 set_number_at <jump count> <upper bound>
3477 set_number_at <succeed_n count> <lower bound>
3478 succeed_n <after jump addr> <succeed_n count>
3479 <body of loop>
3480 jump_n <succeed_n addr> <jump count>
3481 (The upper bound and `jump_n' are omitted if
3482 `upper_bound' is 1, though.) */
3483 else
3484 { /* If the upper bound is > 1, we need to insert
3485 more at the end of the loop. */
3486 unsigned int nbytes = (upper_bound < 0 ? 3
3487 : upper_bound > 1 ? 5 : 0);
3488 unsigned int startoffset = 0;
3489
3490 GET_BUFFER_SPACE (20); /* We might use less. */
3491
3492 if (lower_bound == 0)
3493 {
3494 /* A succeed_n that starts with 0 is really a
3495 a simple on_failure_jump_loop. */
3496 INSERT_JUMP (on_failure_jump_loop, laststart,
3497 b + 3 + nbytes);
3498 b += 3;
3499 }
3500 else
3501 {
3502 /* Initialize lower bound of the `succeed_n', even
3503 though it will be set during matching by its
3504 attendant `set_number_at' (inserted next),
3505 because `re_compile_fastmap' needs to know.
3506 Jump to the `jump_n' we might insert below. */
3507 INSERT_JUMP2 (succeed_n, laststart,
3508 b + 5 + nbytes,
3509 lower_bound);
3510 b += 5;
3511
3512 /* Code to initialize the lower bound. Insert
7814e705 3513 before the `succeed_n'. The `5' is the last two
6df42991
SM
3514 bytes of this `set_number_at', plus 3 bytes of
3515 the following `succeed_n'. */
3516 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3517 b += 5;
3518 startoffset += 5;
3519 }
3520
3521 if (upper_bound < 0)
3522 {
3523 /* A negative upper bound stands for infinity,
3524 in which case it degenerates to a plain jump. */
3525 STORE_JUMP (jump, b, laststart + startoffset);
3526 b += 3;
3527 }
3528 else if (upper_bound > 1)
3529 { /* More than one repetition is allowed, so
3530 append a backward jump to the `succeed_n'
3531 that starts this interval.
3532
3533 When we've reached this during matching,
3534 we'll have matched the interval once, so
3535 jump back only `upper_bound - 1' times. */
3536 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3537 upper_bound - 1);
3538 b += 5;
3539
3540 /* The location we want to set is the second
3541 parameter of the `jump_n'; that is `b-2' as
3542 an absolute address. `laststart' will be
3543 the `set_number_at' we're about to insert;
3544 `laststart+3' the number to set, the source
3545 for the relative address. But we are
3546 inserting into the middle of the pattern --
3547 so everything is getting moved up by 5.
3548 Conclusion: (b - 2) - (laststart + 3) + 5,
3549 i.e., b - laststart.
3550
3551 We insert this at the beginning of the loop
3552 so that if we fail during matching, we'll
3553 reinitialize the bounds. */
3554 insert_op2 (set_number_at, laststart, b - laststart,
3555 upper_bound - 1, b);
3556 b += 5;
3557 }
3558 }
25fe55af
RS
3559 pending_exact = 0;
3560 beg_interval = NULL;
3561 }
3562 break;
3563
3564 unfetch_interval:
3565 /* If an invalid interval, match the characters as literals. */
3566 assert (beg_interval);
3567 p = beg_interval;
3568 beg_interval = NULL;
3569
3570 /* normal_char and normal_backslash need `c'. */
ed0767d8 3571 c = '{';
25fe55af
RS
3572
3573 if (!(syntax & RE_NO_BK_BRACES))
3574 {
ed0767d8
SM
3575 assert (p > pattern && p[-1] == '\\');
3576 goto normal_backslash;
25fe55af 3577 }
ed0767d8
SM
3578 else
3579 goto normal_char;
e318085a 3580
b18215fc 3581#ifdef emacs
25fe55af 3582 /* There is no way to specify the before_dot and after_dot
7814e705 3583 operators. rms says this is ok. --karl */
25fe55af
RS
3584 case '=':
3585 BUF_PUSH (at_dot);
3586 break;
3587
3588 case 's':
3589 laststart = b;
3590 PATFETCH (c);
3591 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3592 break;
3593
3594 case 'S':
3595 laststart = b;
3596 PATFETCH (c);
3597 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3598 break;
b18215fc
RS
3599
3600 case 'c':
3601 laststart = b;
36595814 3602 PATFETCH (c);
b18215fc
RS
3603 BUF_PUSH_2 (categoryspec, c);
3604 break;
e318085a 3605
b18215fc
RS
3606 case 'C':
3607 laststart = b;
36595814 3608 PATFETCH (c);
b18215fc
RS
3609 BUF_PUSH_2 (notcategoryspec, c);
3610 break;
3611#endif /* emacs */
e318085a 3612
e318085a 3613
25fe55af 3614 case 'w':
4bb91c68
SM
3615 if (syntax & RE_NO_GNU_OPS)
3616 goto normal_char;
25fe55af 3617 laststart = b;
1fb352e0 3618 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3619 break;
e318085a 3620
e318085a 3621
25fe55af 3622 case 'W':
4bb91c68
SM
3623 if (syntax & RE_NO_GNU_OPS)
3624 goto normal_char;
25fe55af 3625 laststart = b;
1fb352e0 3626 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3627 break;
e318085a
RS
3628
3629
25fe55af 3630 case '<':
4bb91c68
SM
3631 if (syntax & RE_NO_GNU_OPS)
3632 goto normal_char;
25fe55af
RS
3633 BUF_PUSH (wordbeg);
3634 break;
e318085a 3635
25fe55af 3636 case '>':
4bb91c68
SM
3637 if (syntax & RE_NO_GNU_OPS)
3638 goto normal_char;
25fe55af
RS
3639 BUF_PUSH (wordend);
3640 break;
e318085a 3641
669fa600
SM
3642 case '_':
3643 if (syntax & RE_NO_GNU_OPS)
3644 goto normal_char;
3645 laststart = b;
3646 PATFETCH (c);
3647 if (c == '<')
3648 BUF_PUSH (symbeg);
3649 else if (c == '>')
3650 BUF_PUSH (symend);
3651 else
3652 FREE_STACK_RETURN (REG_BADPAT);
3653 break;
3654
25fe55af 3655 case 'b':
4bb91c68
SM
3656 if (syntax & RE_NO_GNU_OPS)
3657 goto normal_char;
25fe55af
RS
3658 BUF_PUSH (wordbound);
3659 break;
e318085a 3660
25fe55af 3661 case 'B':
4bb91c68
SM
3662 if (syntax & RE_NO_GNU_OPS)
3663 goto normal_char;
25fe55af
RS
3664 BUF_PUSH (notwordbound);
3665 break;
fa9a63c5 3666
25fe55af 3667 case '`':
4bb91c68
SM
3668 if (syntax & RE_NO_GNU_OPS)
3669 goto normal_char;
25fe55af
RS
3670 BUF_PUSH (begbuf);
3671 break;
e318085a 3672
25fe55af 3673 case '\'':
4bb91c68
SM
3674 if (syntax & RE_NO_GNU_OPS)
3675 goto normal_char;
25fe55af
RS
3676 BUF_PUSH (endbuf);
3677 break;
e318085a 3678
25fe55af
RS
3679 case '1': case '2': case '3': case '4': case '5':
3680 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3681 {
3682 regnum_t reg;
e318085a 3683
0cdd06f8
SM
3684 if (syntax & RE_NO_BK_REFS)
3685 goto normal_backslash;
e318085a 3686
0cdd06f8 3687 reg = c - '0';
e318085a 3688
c69b0314
SM
3689 if (reg > bufp->re_nsub || reg < 1
3690 /* Can't back reference to a subexp before its end. */
3691 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3692 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3693
0cdd06f8
SM
3694 laststart = b;
3695 BUF_PUSH_2 (duplicate, reg);
3696 }
25fe55af 3697 break;
e318085a 3698
e318085a 3699
25fe55af
RS
3700 case '+':
3701 case '?':
3702 if (syntax & RE_BK_PLUS_QM)
3703 goto handle_plus;
3704 else
3705 goto normal_backslash;
3706
3707 default:
3708 normal_backslash:
3709 /* You might think it would be useful for \ to mean
3710 not to translate; but if we don't translate it
4bb91c68 3711 it will never match anything. */
25fe55af
RS
3712 goto normal_char;
3713 }
3714 break;
fa9a63c5
RM
3715
3716
3717 default:
25fe55af 3718 /* Expects the character in `c'. */
fa9a63c5 3719 normal_char:
36595814 3720 /* If no exactn currently being built. */
25fe55af 3721 if (!pending_exact
fa9a63c5 3722
25fe55af
RS
3723 /* If last exactn not at current position. */
3724 || pending_exact + *pending_exact + 1 != b
5e69f11e 3725
25fe55af 3726 /* We have only one byte following the exactn for the count. */
2d1675e4 3727 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3728
7814e705 3729 /* If followed by a repetition operator. */
9d99031f 3730 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3731 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3732 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3733 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3734 || ((syntax & RE_INTERVALS)
25fe55af 3735 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3736 ? p != pend && *p == '{'
3737 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3738 {
3739 /* Start building a new exactn. */
5e69f11e 3740
25fe55af 3741 laststart = b;
fa9a63c5
RM
3742
3743 BUF_PUSH_2 (exactn, 0);
3744 pending_exact = b - 1;
25fe55af 3745 }
5e69f11e 3746
2d1675e4
SM
3747 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3748 {
e0277a47
KH
3749 int len;
3750
cf9c99bc 3751 if (multibyte)
6fdd04b0 3752 {
cf9c99bc 3753 c = TRANSLATE (c);
6fdd04b0
KH
3754 len = CHAR_STRING (c, b);
3755 b += len;
3756 }
e0277a47 3757 else
6fdd04b0 3758 {
cf9c99bc
KH
3759 c1 = RE_CHAR_TO_MULTIBYTE (c);
3760 if (! CHAR_BYTE8_P (c1))
3761 {
3762 re_wchar_t c2 = TRANSLATE (c1);
3763
3764 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3765 c = c1;
409f2919 3766 }
6fdd04b0
KH
3767 *b++ = c;
3768 len = 1;
3769 }
2d1675e4
SM
3770 (*pending_exact) += len;
3771 }
3772
fa9a63c5 3773 break;
25fe55af 3774 } /* switch (c) */
fa9a63c5
RM
3775 } /* while p != pend */
3776
5e69f11e 3777
fa9a63c5 3778 /* Through the pattern now. */
5e69f11e 3779
505bde11 3780 FIXUP_ALT_JUMP ();
fa9a63c5 3781
5e69f11e 3782 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3783 FREE_STACK_RETURN (REG_EPAREN);
3784
3785 /* If we don't want backtracking, force success
3786 the first time we reach the end of the compiled pattern. */
3787 if (syntax & RE_NO_POSIX_BACKTRACKING)
3788 BUF_PUSH (succeed);
3789
fa9a63c5
RM
3790 /* We have succeeded; set the length of the buffer. */
3791 bufp->used = b - bufp->buffer;
3792
3793#ifdef DEBUG
99633e97 3794 if (debug > 0)
fa9a63c5 3795 {
505bde11 3796 re_compile_fastmap (bufp);
fa9a63c5
RM
3797 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3798 print_compiled_pattern (bufp);
3799 }
99633e97 3800 debug--;
fa9a63c5
RM
3801#endif /* DEBUG */
3802
3803#ifndef MATCH_MAY_ALLOCATE
3804 /* Initialize the failure stack to the largest possible stack. This
3805 isn't necessary unless we're trying to avoid calling alloca in
3806 the search and match routines. */
3807 {
3808 int num_regs = bufp->re_nsub + 1;
3809
320a2a73 3810 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3811 {
a26f4ccd 3812 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3813
fa9a63c5
RM
3814 if (! fail_stack.stack)
3815 fail_stack.stack
5e69f11e 3816 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3817 * sizeof (fail_stack_elt_t));
3818 else
3819 fail_stack.stack
3820 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3821 (fail_stack.size
3822 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3823 }
3824
3825 regex_grow_registers (num_regs);
3826 }
3827#endif /* not MATCH_MAY_ALLOCATE */
3828
839966f3 3829 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3830} /* regex_compile */
3831\f
3832/* Subroutines for `regex_compile'. */
3833
7814e705 3834/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3835
3836static void
971de7fb 3837store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3838{
3839 *loc = (unsigned char) op;
3840 STORE_NUMBER (loc + 1, arg);
3841}
3842
3843
3844/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3845
3846static void
971de7fb 3847store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3848{
3849 *loc = (unsigned char) op;
3850 STORE_NUMBER (loc + 1, arg1);
3851 STORE_NUMBER (loc + 3, arg2);
3852}
3853
3854
3855/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3856 for OP followed by two-byte integer parameter ARG. */
3857
3858static void
971de7fb 3859insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3860{
3861 register unsigned char *pfrom = end;
3862 register unsigned char *pto = end + 3;
3863
3864 while (pfrom != loc)
3865 *--pto = *--pfrom;
5e69f11e 3866
fa9a63c5
RM
3867 store_op1 (op, loc, arg);
3868}
3869
3870
3871/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3872
3873static void
971de7fb 3874insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3875{
3876 register unsigned char *pfrom = end;
3877 register unsigned char *pto = end + 5;
3878
3879 while (pfrom != loc)
3880 *--pto = *--pfrom;
5e69f11e 3881
fa9a63c5
RM
3882 store_op2 (op, loc, arg1, arg2);
3883}
3884
3885
3886/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3887 after an alternative or a begin-subexpression. We assume there is at
3888 least one character before the ^. */
3889
3890static boolean
971de7fb 3891at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3892{
01618498 3893 re_char *prev = p - 2;
fa9a63c5 3894 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3895
fa9a63c5
RM
3896 return
3897 /* After a subexpression? */
3898 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3899 /* After an alternative? */
d2af47df
SM
3900 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3901 /* After a shy subexpression? */
3902 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3903 && prev[-1] == '?' && prev[-2] == '('
3904 && (syntax & RE_NO_BK_PARENS
3905 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3906}
3907
3908
3909/* The dual of at_begline_loc_p. This one is for $. We assume there is
3910 at least one character after the $, i.e., `P < PEND'. */
3911
3912static boolean
971de7fb 3913at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3914{
01618498 3915 re_char *next = p;
fa9a63c5 3916 boolean next_backslash = *next == '\\';
01618498 3917 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3918
fa9a63c5
RM
3919 return
3920 /* Before a subexpression? */
3921 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3922 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3923 /* Before an alternative? */
3924 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3925 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3926}
3927
3928
5e69f11e 3929/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3930 false if it's not. */
3931
3932static boolean
971de7fb 3933group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5
RM
3934{
3935 int this_element;
3936
5e69f11e
RM
3937 for (this_element = compile_stack.avail - 1;
3938 this_element >= 0;
fa9a63c5
RM
3939 this_element--)
3940 if (compile_stack.stack[this_element].regnum == regnum)
3941 return true;
3942
3943 return false;
3944}
fa9a63c5 3945\f
f6a3f532
SM
3946/* analyse_first.
3947 If fastmap is non-NULL, go through the pattern and fill fastmap
3948 with all the possible leading chars. If fastmap is NULL, don't
3949 bother filling it up (obviously) and only return whether the
3950 pattern could potentially match the empty string.
3951
3952 Return 1 if p..pend might match the empty string.
3953 Return 0 if p..pend matches at least one char.
01618498 3954 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3955
3956static int
438105ed 3957analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
fa9a63c5 3958{
505bde11 3959 int j, k;
1fb352e0 3960 boolean not;
fa9a63c5 3961
b18215fc 3962 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3963 flag is set true. */
b18215fc
RS
3964 boolean match_any_multibyte_characters = false;
3965
f6a3f532 3966 assert (p);
5e69f11e 3967
505bde11
SM
3968 /* The loop below works as follows:
3969 - It has a working-list kept in the PATTERN_STACK and which basically
3970 starts by only containing a pointer to the first operation.
3971 - If the opcode we're looking at is a match against some set of
3972 chars, then we add those chars to the fastmap and go on to the
3973 next work element from the worklist (done via `break').
3974 - If the opcode is a control operator on the other hand, we either
3975 ignore it (if it's meaningless at this point, such as `start_memory')
3976 or execute it (if it's a jump). If the jump has several destinations
3977 (i.e. `on_failure_jump'), then we push the other destination onto the
3978 worklist.
3979 We guarantee termination by ignoring backward jumps (more or less),
3980 so that `p' is monotonically increasing. More to the point, we
3981 never set `p' (or push) anything `<= p1'. */
3982
01618498 3983 while (p < pend)
fa9a63c5 3984 {
505bde11
SM
3985 /* `p1' is used as a marker of how far back a `on_failure_jump'
3986 can go without being ignored. It is normally equal to `p'
3987 (which prevents any backward `on_failure_jump') except right
3988 after a plain `jump', to allow patterns such as:
3989 0: jump 10
3990 3..9: <body>
3991 10: on_failure_jump 3
3992 as used for the *? operator. */
01618498 3993 re_char *p1 = p;
5e69f11e 3994
fa9a63c5
RM
3995 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3996 {
f6a3f532 3997 case succeed:
01618498 3998 return 1;
fa9a63c5 3999
fa9a63c5 4000 case duplicate:
505bde11
SM
4001 /* If the first character has to match a backreference, that means
4002 that the group was empty (since it already matched). Since this
4003 is the only case that interests us here, we can assume that the
4004 backreference must match the empty string. */
4005 p++;
4006 continue;
fa9a63c5
RM
4007
4008
4009 /* Following are the cases which match a character. These end
7814e705 4010 with `break'. */
fa9a63c5
RM
4011
4012 case exactn:
e0277a47 4013 if (fastmap)
cf9c99bc
KH
4014 {
4015 /* If multibyte is nonzero, the first byte of each
4016 character is an ASCII or a leading code. Otherwise,
4017 each byte is a character. Thus, this works in both
4018 cases. */
4019 fastmap[p[1]] = 1;
4020 if (! multibyte)
4021 {
4022 /* For the case of matching this unibyte regex
4023 against multibyte, we must set a leading code of
4024 the corresponding multibyte character. */
4025 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4026
86e893e3 4027 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
4028 }
4029 }
fa9a63c5
RM
4030 break;
4031
4032
1fb352e0
SM
4033 case anychar:
4034 /* We could put all the chars except for \n (and maybe \0)
4035 but we don't bother since it is generally not worth it. */
f6a3f532 4036 if (!fastmap) break;
01618498 4037 return -1;
fa9a63c5
RM
4038
4039
b18215fc 4040 case charset_not:
1fb352e0 4041 if (!fastmap) break;
bf216479
KH
4042 {
4043 /* Chars beyond end of bitmap are possible matches. */
bf216479 4044 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 4045 j < (1 << BYTEWIDTH); j++)
bf216479
KH
4046 fastmap[j] = 1;
4047 }
4048
1fb352e0
SM
4049 /* Fallthrough */
4050 case charset:
4051 if (!fastmap) break;
4052 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4053 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4054 j >= 0; j--)
1fb352e0 4055 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4056 fastmap[j] = 1;
b18215fc 4057
6482db2e
KH
4058#ifdef emacs
4059 if (/* Any leading code can possibly start a character
1fb352e0 4060 which doesn't match the specified set of characters. */
6482db2e 4061 not
409f2919 4062 ||
6482db2e
KH
4063 /* If we can match a character class, we can match any
4064 multibyte characters. */
4065 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4066 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4067
b18215fc 4068 {
b18215fc
RS
4069 if (match_any_multibyte_characters == false)
4070 {
6482db2e
KH
4071 for (j = MIN_MULTIBYTE_LEADING_CODE;
4072 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4073 fastmap[j] = 1;
b18215fc
RS
4074 match_any_multibyte_characters = true;
4075 }
4076 }
b18215fc 4077
1fb352e0
SM
4078 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4079 && match_any_multibyte_characters == false)
4080 {
bf216479 4081 /* Set fastmap[I] to 1 where I is a leading code of each
51e4f4a8 4082 multibyte character in the range table. */
1fb352e0 4083 int c, count;
bf216479 4084 unsigned char lc1, lc2;
b18215fc 4085
1fb352e0 4086 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4087 bits for a character class. */
1fb352e0 4088 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4089
1fb352e0
SM
4090 /* Extract the number of ranges in range table into COUNT. */
4091 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4092 for (; count > 0; count--, p += 3)
1fb352e0 4093 {
9117d724
KH
4094 /* Extract the start and end of each range. */
4095 EXTRACT_CHARACTER (c, p);
bf216479 4096 lc1 = CHAR_LEADING_CODE (c);
9117d724 4097 p += 3;
1fb352e0 4098 EXTRACT_CHARACTER (c, p);
bf216479
KH
4099 lc2 = CHAR_LEADING_CODE (c);
4100 for (j = lc1; j <= lc2; j++)
9117d724 4101 fastmap[j] = 1;
1fb352e0
SM
4102 }
4103 }
6482db2e 4104#endif
b18215fc
RS
4105 break;
4106
1fb352e0
SM
4107 case syntaxspec:
4108 case notsyntaxspec:
4109 if (!fastmap) break;
4110#ifndef emacs
4111 not = (re_opcode_t)p[-1] == notsyntaxspec;
4112 k = *p++;
4113 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4114 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4115 fastmap[j] = 1;
b18215fc 4116 break;
1fb352e0 4117#else /* emacs */
b18215fc
RS
4118 /* This match depends on text properties. These end with
4119 aborting optimizations. */
01618498 4120 return -1;
b18215fc
RS
4121
4122 case categoryspec:
b18215fc 4123 case notcategoryspec:
1fb352e0
SM
4124 if (!fastmap) break;
4125 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4126 k = *p++;
6482db2e 4127 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4128 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4129 fastmap[j] = 1;
4130
6482db2e
KH
4131 /* Any leading code can possibly start a character which
4132 has or doesn't has the specified category. */
4133 if (match_any_multibyte_characters == false)
6fdd04b0 4134 {
6482db2e
KH
4135 for (j = MIN_MULTIBYTE_LEADING_CODE;
4136 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4137 fastmap[j] = 1;
4138 match_any_multibyte_characters = true;
6fdd04b0 4139 }
b18215fc
RS
4140 break;
4141
fa9a63c5 4142 /* All cases after this match the empty string. These end with
25fe55af 4143 `continue'. */
fa9a63c5 4144
fa9a63c5
RM
4145 case before_dot:
4146 case at_dot:
4147 case after_dot:
1fb352e0 4148#endif /* !emacs */
25fe55af
RS
4149 case no_op:
4150 case begline:
4151 case endline:
fa9a63c5
RM
4152 case begbuf:
4153 case endbuf:
4154 case wordbound:
4155 case notwordbound:
4156 case wordbeg:
4157 case wordend:
669fa600
SM
4158 case symbeg:
4159 case symend:
25fe55af 4160 continue;
fa9a63c5
RM
4161
4162
fa9a63c5 4163 case jump:
25fe55af 4164 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4165 if (j < 0)
4166 /* Backward jumps can only go back to code that we've already
4167 visited. `re_compile' should make sure this is true. */
4168 break;
25fe55af 4169 p += j;
505bde11
SM
4170 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4171 {
4172 case on_failure_jump:
4173 case on_failure_keep_string_jump:
505bde11 4174 case on_failure_jump_loop:
0683b6fa 4175 case on_failure_jump_nastyloop:
505bde11
SM
4176 case on_failure_jump_smart:
4177 p++;
4178 break;
4179 default:
4180 continue;
4181 };
4182 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4183 to jump back to "just after here". */
4184 /* Fallthrough */
fa9a63c5 4185
25fe55af
RS
4186 case on_failure_jump:
4187 case on_failure_keep_string_jump:
0683b6fa 4188 case on_failure_jump_nastyloop:
505bde11
SM
4189 case on_failure_jump_loop:
4190 case on_failure_jump_smart:
25fe55af 4191 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4192 if (p + j <= p1)
ed0767d8 4193 ; /* Backward jump to be ignored. */
01618498
SM
4194 else
4195 { /* We have to look down both arms.
4196 We first go down the "straight" path so as to minimize
4197 stack usage when going through alternatives. */
4198 int r = analyse_first (p, pend, fastmap, multibyte);
4199 if (r) return r;
4200 p += j;
4201 }
25fe55af 4202 continue;
fa9a63c5
RM
4203
4204
ed0767d8
SM
4205 case jump_n:
4206 /* This code simply does not properly handle forward jump_n. */
4207 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4208 p += 4;
4209 /* jump_n can either jump or fall through. The (backward) jump
4210 case has already been handled, so we only need to look at the
4211 fallthrough case. */
4212 continue;
177c0ea7 4213
fa9a63c5 4214 case succeed_n:
ed0767d8
SM
4215 /* If N == 0, it should be an on_failure_jump_loop instead. */
4216 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4217 p += 4;
4218 /* We only care about one iteration of the loop, so we don't
4219 need to consider the case where this behaves like an
4220 on_failure_jump. */
25fe55af 4221 continue;
fa9a63c5
RM
4222
4223
4224 case set_number_at:
25fe55af
RS
4225 p += 4;
4226 continue;
fa9a63c5
RM
4227
4228
4229 case start_memory:
25fe55af 4230 case stop_memory:
505bde11 4231 p += 1;
fa9a63c5
RM
4232 continue;
4233
4234
4235 default:
25fe55af
RS
4236 abort (); /* We have listed all the cases. */
4237 } /* switch *p++ */
fa9a63c5
RM
4238
4239 /* Getting here means we have found the possible starting
25fe55af 4240 characters for one path of the pattern -- and that the empty
7814e705 4241 string does not match. We need not follow this path further. */
01618498 4242 return 0;
fa9a63c5
RM
4243 } /* while p */
4244
01618498
SM
4245 /* We reached the end without matching anything. */
4246 return 1;
4247
f6a3f532
SM
4248} /* analyse_first */
4249\f
4250/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4251 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4252 characters can start a string that matches the pattern. This fastmap
4253 is used by re_search to skip quickly over impossible starting points.
4254
4255 Character codes above (1 << BYTEWIDTH) are not represented in the
4256 fastmap, but the leading codes are represented. Thus, the fastmap
4257 indicates which character sets could start a match.
4258
4259 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4260 area as BUFP->fastmap.
4261
4262 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4263 the pattern buffer.
4264
4265 Returns 0 if we succeed, -2 if an internal error. */
4266
4267int
971de7fb 4268re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4269{
4270 char *fastmap = bufp->fastmap;
4271 int analysis;
4272
4273 assert (fastmap && bufp->buffer);
4274
72af86bd 4275 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4276 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4277
4278 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4279 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4280 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4281 return 0;
4282} /* re_compile_fastmap */
4283\f
4284/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4285 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4286 this memory for recording register information. STARTS and ENDS
4287 must be allocated using the malloc library routine, and must each
4288 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4289
4290 If NUM_REGS == 0, then subsequent matches should allocate their own
4291 register data.
4292
4293 Unless this function is called, the first search or match using
4294 PATTERN_BUFFER will allocate its own register data, without
4295 freeing the old data. */
4296
4297void
971de7fb 4298re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4299{
4300 if (num_regs)
4301 {
4302 bufp->regs_allocated = REGS_REALLOCATE;
4303 regs->num_regs = num_regs;
4304 regs->start = starts;
4305 regs->end = ends;
4306 }
4307 else
4308 {
4309 bufp->regs_allocated = REGS_UNALLOCATED;
4310 regs->num_regs = 0;
4311 regs->start = regs->end = (regoff_t *) 0;
4312 }
4313}
c0f9ea08 4314WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4315\f
7814e705 4316/* Searching routines. */
fa9a63c5
RM
4317
4318/* Like re_search_2, below, but only one string is specified, and
4319 doesn't let you say where to stop matching. */
4320
4321int
971de7fb 4322re_search (struct re_pattern_buffer *bufp, const char *string, int size, int startpos, int range, struct re_registers *regs)
fa9a63c5 4323{
5e69f11e 4324 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4325 regs, size);
4326}
c0f9ea08 4327WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4328
70806df6
KH
4329/* Head address of virtual concatenation of string. */
4330#define HEAD_ADDR_VSTRING(P) \
4331 (((P) >= size1 ? string2 : string1))
4332
b18215fc
RS
4333/* End address of virtual concatenation of string. */
4334#define STOP_ADDR_VSTRING(P) \
4335 (((P) >= size1 ? string2 + size2 : string1 + size1))
4336
4337/* Address of POS in the concatenation of virtual string. */
4338#define POS_ADDR_VSTRING(POS) \
4339 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4340
4341/* Using the compiled pattern in BUFP->buffer, first tries to match the
4342 virtual concatenation of STRING1 and STRING2, starting first at index
4343 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4344
fa9a63c5 4345 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4346
fa9a63c5
RM
4347 RANGE is how far to scan while trying to match. RANGE = 0 means try
4348 only at STARTPOS; in general, the last start tried is STARTPOS +
4349 RANGE.
5e69f11e 4350
fa9a63c5
RM
4351 In REGS, return the indices of the virtual concatenation of STRING1
4352 and STRING2 that matched the entire BUFP->buffer and its contained
4353 subexpressions.
5e69f11e 4354
fa9a63c5
RM
4355 Do not consider matching one past the index STOP in the virtual
4356 concatenation of STRING1 and STRING2.
4357
4358 We return either the position in the strings at which the match was
4359 found, -1 if no match, or -2 if error (such as failure
4360 stack overflow). */
4361
4362int
971de7fb 4363re_search_2 (struct re_pattern_buffer *bufp, const char *str1, int size1, const char *str2, int size2, int startpos, int range, struct re_registers *regs, int stop)
fa9a63c5
RM
4364{
4365 int val;
66f0296e
SM
4366 re_char *string1 = (re_char*) str1;
4367 re_char *string2 = (re_char*) str2;
fa9a63c5 4368 register char *fastmap = bufp->fastmap;
6676cb1c 4369 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4370 int total_size = size1 + size2;
4371 int endpos = startpos + range;
c0f9ea08 4372 boolean anchored_start;
cf9c99bc
KH
4373 /* Nonzero if we are searching multibyte string. */
4374 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4375
fa9a63c5
RM
4376 /* Check for out-of-range STARTPOS. */
4377 if (startpos < 0 || startpos > total_size)
4378 return -1;
5e69f11e 4379
fa9a63c5 4380 /* Fix up RANGE if it might eventually take us outside
34597fa9 4381 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4382 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4383 if (endpos < 0)
4384 range = 0 - startpos;
fa9a63c5
RM
4385 else if (endpos > total_size)
4386 range = total_size - startpos;
4387
4388 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4389 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4390 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4391 {
4392 if (startpos > 0)
4393 return -1;
4394 else
7b140fd7 4395 range = 0;
fa9a63c5
RM
4396 }
4397
ae4788a8
RS
4398#ifdef emacs
4399 /* In a forward search for something that starts with \=.
4400 don't keep searching past point. */
4401 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4402 {
7b140fd7
RS
4403 range = PT_BYTE - BEGV_BYTE - startpos;
4404 if (range < 0)
ae4788a8
RS
4405 return -1;
4406 }
4407#endif /* emacs */
4408
fa9a63c5
RM
4409 /* Update the fastmap now if not correct already. */
4410 if (fastmap && !bufp->fastmap_accurate)
01618498 4411 re_compile_fastmap (bufp);
5e69f11e 4412
c8499ba5 4413 /* See whether the pattern is anchored. */
c0f9ea08 4414 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4415
b18215fc 4416#ifdef emacs
d48cd3f4 4417 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4418 {
99633e97 4419 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4420
4421 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4422 }
b18215fc
RS
4423#endif
4424
fa9a63c5
RM
4425 /* Loop through the string, looking for a place to start matching. */
4426 for (;;)
5e69f11e 4427 {
c8499ba5
RS
4428 /* If the pattern is anchored,
4429 skip quickly past places we cannot match.
4430 We don't bother to treat startpos == 0 specially
4431 because that case doesn't repeat. */
4432 if (anchored_start && startpos > 0)
4433 {
c0f9ea08
SM
4434 if (! ((startpos <= size1 ? string1[startpos - 1]
4435 : string2[startpos - size1 - 1])
4436 == '\n'))
c8499ba5
RS
4437 goto advance;
4438 }
4439
fa9a63c5 4440 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4441 cannot be the start of a match. If the pattern can match the
4442 null string, however, we don't need to skip characters; we want
7814e705 4443 the first null string. */
fa9a63c5
RM
4444 if (fastmap && startpos < total_size && !bufp->can_be_null)
4445 {
66f0296e 4446 register re_char *d;
01618498 4447 register re_wchar_t buf_ch;
e934739e
RS
4448
4449 d = POS_ADDR_VSTRING (startpos);
4450
7814e705 4451 if (range > 0) /* Searching forwards. */
fa9a63c5 4452 {
fa9a63c5
RM
4453 register int lim = 0;
4454 int irange = range;
4455
25fe55af
RS
4456 if (startpos < size1 && startpos + range >= size1)
4457 lim = range - (size1 - startpos);
fa9a63c5 4458
25fe55af
RS
4459 /* Written out as an if-else to avoid testing `translate'
4460 inside the loop. */
28ae27ae
AS
4461 if (RE_TRANSLATE_P (translate))
4462 {
e934739e
RS
4463 if (multibyte)
4464 while (range > lim)
4465 {
4466 int buf_charlen;
4467
62a6e103 4468 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4469 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4470 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4471 break;
4472
4473 range -= buf_charlen;
4474 d += buf_charlen;
4475 }
4476 else
bf216479 4477 while (range > lim)
33c46939 4478 {
cf9c99bc
KH
4479 register re_wchar_t ch, translated;
4480
bf216479 4481 buf_ch = *d;
cf9c99bc
KH
4482 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4483 translated = RE_TRANSLATE (translate, ch);
4484 if (translated != ch
4485 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4486 buf_ch = ch;
6fdd04b0 4487 if (fastmap[buf_ch])
bf216479 4488 break;
33c46939
RS
4489 d++;
4490 range--;
4491 }
e934739e 4492 }
fa9a63c5 4493 else
6fdd04b0
KH
4494 {
4495 if (multibyte)
4496 while (range > lim)
4497 {
4498 int buf_charlen;
fa9a63c5 4499
62a6e103 4500 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4501 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4502 break;
4503 range -= buf_charlen;
4504 d += buf_charlen;
4505 }
e934739e 4506 else
6fdd04b0 4507 while (range > lim && !fastmap[*d])
33c46939
RS
4508 {
4509 d++;
4510 range--;
4511 }
e934739e 4512 }
fa9a63c5
RM
4513 startpos += irange - range;
4514 }
7814e705 4515 else /* Searching backwards. */
fa9a63c5 4516 {
ba5e343c
KH
4517 if (multibyte)
4518 {
62a6e103 4519 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4520 buf_ch = TRANSLATE (buf_ch);
4521 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4522 goto advance;
4523 }
4524 else
4525 {
cf9c99bc
KH
4526 register re_wchar_t ch, translated;
4527
4528 buf_ch = *d;
4529 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4530 translated = TRANSLATE (ch);
4531 if (translated != ch
4532 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4533 buf_ch = ch;
4534 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4535 goto advance;
4536 }
fa9a63c5
RM
4537 }
4538 }
4539
4540 /* If can't match the null string, and that's all we have left, fail. */
4541 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4542 && !bufp->can_be_null)
fa9a63c5
RM
4543 return -1;
4544
4545 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4546 startpos, regs, stop);
fa9a63c5
RM
4547
4548 if (val >= 0)
4549 return startpos;
5e69f11e 4550
fa9a63c5
RM
4551 if (val == -2)
4552 return -2;
4553
4554 advance:
5e69f11e 4555 if (!range)
25fe55af 4556 break;
5e69f11e 4557 else if (range > 0)
25fe55af 4558 {
b18215fc
RS
4559 /* Update STARTPOS to the next character boundary. */
4560 if (multibyte)
4561 {
66f0296e
SM
4562 re_char *p = POS_ADDR_VSTRING (startpos);
4563 re_char *pend = STOP_ADDR_VSTRING (startpos);
aa3830c4 4564 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4565
4566 range -= len;
4567 if (range < 0)
4568 break;
4569 startpos += len;
4570 }
4571 else
4572 {
b560c397
RS
4573 range--;
4574 startpos++;
4575 }
e318085a 4576 }
fa9a63c5 4577 else
25fe55af
RS
4578 {
4579 range++;
4580 startpos--;
b18215fc
RS
4581
4582 /* Update STARTPOS to the previous character boundary. */
4583 if (multibyte)
4584 {
70806df6
KH
4585 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4586 re_char *p0 = p;
4587 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4588
4589 /* Find the head of multibyte form. */
70806df6
KH
4590 PREV_CHAR_BOUNDARY (p, phead);
4591 range += p0 - 1 - p;
4592 if (range > 0)
4593 break;
b18215fc 4594
70806df6 4595 startpos -= p0 - 1 - p;
b18215fc 4596 }
25fe55af 4597 }
fa9a63c5
RM
4598 }
4599 return -1;
4600} /* re_search_2 */
c0f9ea08 4601WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4602\f
4603/* Declarations and macros for re_match_2. */
4604
2d1675e4
SM
4605static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4606 register int len,
4607 RE_TRANSLATE_TYPE translate,
4608 const int multibyte));
fa9a63c5
RM
4609
4610/* This converts PTR, a pointer into one of the search strings `string1'
4611 and `string2' into an offset from the beginning of that string. */
4612#define POINTER_TO_OFFSET(ptr) \
4613 (FIRST_STRING_P (ptr) \
4614 ? ((regoff_t) ((ptr) - string1)) \
4615 : ((regoff_t) ((ptr) - string2 + size1)))
4616
fa9a63c5 4617/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4618 string2 if necessary.
4619 Check re_match_2_internal for a discussion of why end_match_2 might
4620 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4621#define PREFETCH() \
25fe55af 4622 while (d == dend) \
fa9a63c5
RM
4623 { \
4624 /* End of string2 => fail. */ \
25fe55af
RS
4625 if (dend == end_match_2) \
4626 goto fail; \
4bb91c68 4627 /* End of string1 => advance to string2. */ \
25fe55af 4628 d = string2; \
fa9a63c5
RM
4629 dend = end_match_2; \
4630 }
4631
f1ad044f
SM
4632/* Call before fetching a char with *d if you already checked other limits.
4633 This is meant for use in lookahead operations like wordend, etc..
4634 where we might need to look at parts of the string that might be
4635 outside of the LIMITs (i.e past `stop'). */
4636#define PREFETCH_NOLIMIT() \
4637 if (d == end1) \
4638 { \
4639 d = string2; \
4640 dend = end_match_2; \
4641 } \
fa9a63c5
RM
4642
4643/* Test if at very beginning or at very end of the virtual concatenation
7814e705 4644 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4645#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4646#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4647
4648
4649/* Test if D points to a character which is word-constituent. We have
4650 two special cases to check for: if past the end of string1, look at
4651 the first character in string2; and if before the beginning of
4652 string2, look at the last character in string1. */
4653#define WORDCHAR_P(d) \
4654 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4655 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4656 == Sword)
4657
9121ca40 4658/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4659
4660/* The comment at case wordbound is following one, but we don't use
4661 AT_WORD_BOUNDARY anymore to support multibyte form.
4662
4663 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4664 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4665 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4666 macro and introducing temporary variables works around the bug. */
4667
9121ca40 4668#if 0
fa9a63c5
RM
4669/* Test if the character before D and the one at D differ with respect
4670 to being word-constituent. */
4671#define AT_WORD_BOUNDARY(d) \
4672 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4673 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4674#endif
fa9a63c5
RM
4675
4676/* Free everything we malloc. */
4677#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4678# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4679# define FREE_VARIABLES() \
fa9a63c5
RM
4680 do { \
4681 REGEX_FREE_STACK (fail_stack.stack); \
4682 FREE_VAR (regstart); \
4683 FREE_VAR (regend); \
fa9a63c5
RM
4684 FREE_VAR (best_regstart); \
4685 FREE_VAR (best_regend); \
fa9a63c5
RM
4686 } while (0)
4687#else
0b32bf0e 4688# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4689#endif /* not MATCH_MAY_ALLOCATE */
4690
505bde11
SM
4691\f
4692/* Optimization routines. */
4693
4e8a9132
SM
4694/* If the operation is a match against one or more chars,
4695 return a pointer to the next operation, else return NULL. */
01618498 4696static re_char *
971de7fb 4697skip_one_char (const re_char *p)
4e8a9132
SM
4698{
4699 switch (SWITCH_ENUM_CAST (*p++))
4700 {
4701 case anychar:
4702 break;
177c0ea7 4703
4e8a9132
SM
4704 case exactn:
4705 p += *p + 1;
4706 break;
4707
4708 case charset_not:
4709 case charset:
4710 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4711 {
4712 int mcnt;
4713 p = CHARSET_RANGE_TABLE (p - 1);
4714 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4715 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4716 }
4717 else
4718 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4719 break;
177c0ea7 4720
4e8a9132
SM
4721 case syntaxspec:
4722 case notsyntaxspec:
1fb352e0 4723#ifdef emacs
4e8a9132
SM
4724 case categoryspec:
4725 case notcategoryspec:
4726#endif /* emacs */
4727 p++;
4728 break;
4729
4730 default:
4731 p = NULL;
4732 }
4733 return p;
4734}
4735
4736
505bde11 4737/* Jump over non-matching operations. */
839966f3 4738static re_char *
971de7fb 4739skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4740{
4741 int mcnt;
4742 while (p < pend)
4743 {
4744 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4745 {
4746 case start_memory:
505bde11
SM
4747 case stop_memory:
4748 p += 2; break;
4749 case no_op:
4750 p += 1; break;
4751 case jump:
4752 p += 1;
4753 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4754 p += mcnt;
4755 break;
4756 default:
4757 return p;
4758 }
4759 }
4760 assert (p == pend);
4761 return p;
4762}
4763
4764/* Non-zero if "p1 matches something" implies "p2 fails". */
4765static int
971de7fb 4766mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
505bde11 4767{
4e8a9132 4768 re_opcode_t op2;
2d1675e4 4769 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4770 unsigned char *pend = bufp->buffer + bufp->used;
4771
4e8a9132 4772 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4773 && p2 >= bufp->buffer && p2 <= pend);
4774
4775 /* Skip over open/close-group commands.
4776 If what follows this loop is a ...+ construct,
4777 look at what begins its body, since we will have to
4778 match at least one of that. */
4e8a9132
SM
4779 p2 = skip_noops (p2, pend);
4780 /* The same skip can be done for p1, except that this function
4781 is only used in the case where p1 is a simple match operator. */
4782 /* p1 = skip_noops (p1, pend); */
4783
4784 assert (p1 >= bufp->buffer && p1 < pend
4785 && p2 >= bufp->buffer && p2 <= pend);
4786
4787 op2 = p2 == pend ? succeed : *p2;
4788
4789 switch (SWITCH_ENUM_CAST (op2))
505bde11 4790 {
4e8a9132
SM
4791 case succeed:
4792 case endbuf:
4793 /* If we're at the end of the pattern, we can change. */
4794 if (skip_one_char (p1))
505bde11 4795 {
505bde11
SM
4796 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4797 return 1;
505bde11 4798 }
4e8a9132 4799 break;
177c0ea7 4800
4e8a9132 4801 case endline:
4e8a9132
SM
4802 case exactn:
4803 {
01618498 4804 register re_wchar_t c
4e8a9132 4805 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4806 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4807
4e8a9132
SM
4808 if ((re_opcode_t) *p1 == exactn)
4809 {
62a6e103 4810 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4811 {
4812 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4813 return 1;
4814 }
4815 }
505bde11 4816
4e8a9132
SM
4817 else if ((re_opcode_t) *p1 == charset
4818 || (re_opcode_t) *p1 == charset_not)
4819 {
4820 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4821
4e8a9132
SM
4822 /* Test if C is listed in charset (or charset_not)
4823 at `p1'. */
6fdd04b0 4824 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4825 {
4826 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4827 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4828 not = !not;
4829 }
4830 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4831 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4832
4e8a9132
SM
4833 /* `not' is equal to 1 if c would match, which means
4834 that we can't change to pop_failure_jump. */
4835 if (!not)
4836 {
4837 DEBUG_PRINT1 (" No match => fast loop.\n");
4838 return 1;
4839 }
4840 }
4841 else if ((re_opcode_t) *p1 == anychar
4842 && c == '\n')
4843 {
4844 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4845 return 1;
4846 }
4847 }
4848 break;
505bde11 4849
4e8a9132 4850 case charset:
4e8a9132
SM
4851 {
4852 if ((re_opcode_t) *p1 == exactn)
4853 /* Reuse the code above. */
4854 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4855
505bde11
SM
4856 /* It is hard to list up all the character in charset
4857 P2 if it includes multibyte character. Give up in
4858 such case. */
4859 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4860 {
4861 /* Now, we are sure that P2 has no range table.
4862 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4863 enough. But P1 may have range table, so the
505bde11
SM
4864 size of bitmap table of P1 is extracted by
4865 using macro `CHARSET_BITMAP_SIZE'.
4866
6fdd04b0
KH
4867 In a multibyte case, we know that all the character
4868 listed in P2 is ASCII. In a unibyte case, P1 has only a
4869 bitmap table. So, in both cases, it is enough to test
4870 only the bitmap table of P1. */
505bde11 4871
411e4203 4872 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4873 {
4874 int idx;
4875 /* We win if the charset inside the loop
4876 has no overlap with the one after the loop. */
4877 for (idx = 0;
4878 (idx < (int) p2[1]
4879 && idx < CHARSET_BITMAP_SIZE (p1));
4880 idx++)
4881 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4882 break;
4883
4884 if (idx == p2[1]
4885 || idx == CHARSET_BITMAP_SIZE (p1))
4886 {
4887 DEBUG_PRINT1 (" No match => fast loop.\n");
4888 return 1;
4889 }
4890 }
411e4203 4891 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4892 {
4893 int idx;
4894 /* We win if the charset_not inside the loop lists
7814e705 4895 every character listed in the charset after. */
505bde11
SM
4896 for (idx = 0; idx < (int) p2[1]; idx++)
4897 if (! (p2[2 + idx] == 0
4898 || (idx < CHARSET_BITMAP_SIZE (p1)
4899 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4900 break;
4901
4e8a9132
SM
4902 if (idx == p2[1])
4903 {
4904 DEBUG_PRINT1 (" No match => fast loop.\n");
4905 return 1;
4906 }
4907 }
4908 }
4909 }
609b757a 4910 break;
177c0ea7 4911
411e4203
SM
4912 case charset_not:
4913 switch (SWITCH_ENUM_CAST (*p1))
4914 {
4915 case exactn:
4916 case charset:
4917 /* Reuse the code above. */
4918 return mutually_exclusive_p (bufp, p2, p1);
4919 case charset_not:
4920 /* When we have two charset_not, it's very unlikely that
4921 they don't overlap. The union of the two sets of excluded
4922 chars should cover all possible chars, which, as a matter of
4923 fact, is virtually impossible in multibyte buffers. */
36595814 4924 break;
411e4203
SM
4925 }
4926 break;
4927
4e8a9132 4928 case wordend:
669fa600
SM
4929 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4930 case symend:
4e8a9132 4931 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4932 && (p1[1] == Ssymbol || p1[1] == Sword));
4933 case notsyntaxspec:
4934 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4935
4936 case wordbeg:
669fa600
SM
4937 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4938 case symbeg:
4e8a9132 4939 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4940 && (p1[1] == Ssymbol || p1[1] == Sword));
4941 case syntaxspec:
4942 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4943
4944 case wordbound:
4945 return (((re_opcode_t) *p1 == notsyntaxspec
4946 || (re_opcode_t) *p1 == syntaxspec)
4947 && p1[1] == Sword);
4948
1fb352e0 4949#ifdef emacs
4e8a9132
SM
4950 case categoryspec:
4951 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4952 case notcategoryspec:
4953 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4954#endif /* emacs */
4955
4956 default:
4957 ;
505bde11
SM
4958 }
4959
4960 /* Safe default. */
4961 return 0;
4962}
4963
fa9a63c5
RM
4964\f
4965/* Matching routines. */
4966
25fe55af 4967#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4968/* re_match is like re_match_2 except it takes only a single string. */
4969
4970int
d2762c86
DN
4971re_match (struct re_pattern_buffer *bufp, const char *string,
4972 int size, int pos, struct re_registers *regs)
fa9a63c5 4973{
4bb91c68 4974 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4975 pos, regs, size);
fa9a63c5
RM
4976 return result;
4977}
c0f9ea08 4978WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4979#endif /* not emacs */
4980
b18215fc
RS
4981#ifdef emacs
4982/* In Emacs, this is the string or buffer in which we
7814e705 4983 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4984Lisp_Object re_match_object;
4985#endif
fa9a63c5
RM
4986
4987/* re_match_2 matches the compiled pattern in BUFP against the
4988 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4989 and SIZE2, respectively). We start matching at POS, and stop
4990 matching at STOP.
5e69f11e 4991
fa9a63c5 4992 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 4993 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4994 documentation for exactly how many groups we fill.
4995
4996 We return -1 if no match, -2 if an internal error (such as the
7814e705 4997 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4998 matched substring. */
4999
5000int
971de7fb 5001re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1, const char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5 5002{
b18215fc 5003 int result;
25fe55af 5004
b18215fc 5005#ifdef emacs
cc9b4df2 5006 int charpos;
d48cd3f4 5007 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 5008 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 5009 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
5010#endif
5011
4bb91c68
SM
5012 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5013 (re_char*) string2, size2,
cc9b4df2 5014 pos, regs, stop);
fa9a63c5
RM
5015 return result;
5016}
c0f9ea08 5017WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 5018
bf216479 5019
fa9a63c5 5020/* This is a separate function so that we can force an alloca cleanup
7814e705 5021 afterwards. */
fa9a63c5 5022static int
971de7fb 5023re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1, int size1, const re_char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5
RM
5024{
5025 /* General temporaries. */
5026 int mcnt;
01618498 5027 size_t reg;
66f0296e 5028 boolean not;
fa9a63c5
RM
5029
5030 /* Just past the end of the corresponding string. */
66f0296e 5031 re_char *end1, *end2;
fa9a63c5
RM
5032
5033 /* Pointers into string1 and string2, just past the last characters in
7814e705 5034 each to consider matching. */
66f0296e 5035 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
5036
5037 /* Where we are in the data, and the end of the current string. */
66f0296e 5038 re_char *d, *dend;
5e69f11e 5039
99633e97
SM
5040 /* Used sometimes to remember where we were before starting matching
5041 an operator so that we can go back in case of failure. This "atomic"
5042 behavior of matching opcodes is indispensable to the correctness
5043 of the on_failure_keep_string_jump optimization. */
5044 re_char *dfail;
5045
fa9a63c5 5046 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5047 re_char *p = bufp->buffer;
5048 re_char *pend = p + bufp->used;
fa9a63c5 5049
25fe55af 5050 /* We use this to map every character in the string. */
6676cb1c 5051 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5052
cf9c99bc 5053 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5054 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5055
cf9c99bc
KH
5056 /* Nonzero if STRING1/STRING2 are multibyte. */
5057 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5058
fa9a63c5
RM
5059 /* Failure point stack. Each place that can handle a failure further
5060 down the line pushes a failure point on this stack. It consists of
505bde11 5061 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5062 the subexpressions we're currently inside, plus the number of such
5063 registers, and, finally, two char *'s. The first char * is where
5064 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5065 scanning the strings. */
5066#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5067 fail_stack_type fail_stack;
5068#endif
5069#ifdef DEBUG
fa9a63c5
RM
5070 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5071#endif
5072
0b32bf0e 5073#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5074 /* This holds the pointer to the failure stack, when
5075 it is allocated relocatably. */
5076 fail_stack_elt_t *failure_stack_ptr;
99633e97 5077#endif
fa9a63c5
RM
5078
5079 /* We fill all the registers internally, independent of what we
7814e705 5080 return, for use in backreferences. The number here includes
fa9a63c5 5081 an element for register zero. */
4bb91c68 5082 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5083
fa9a63c5
RM
5084 /* Information on the contents of registers. These are pointers into
5085 the input strings; they record just what was matched (on this
5086 attempt) by a subexpression part of the pattern, that is, the
5087 regnum-th regstart pointer points to where in the pattern we began
5088 matching and the regnum-th regend points to right after where we
5089 stopped matching the regnum-th subexpression. (The zeroth register
5090 keeps track of what the whole pattern matches.) */
5091#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5092 re_char **regstart, **regend;
fa9a63c5
RM
5093#endif
5094
fa9a63c5 5095 /* The following record the register info as found in the above
5e69f11e 5096 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5097 This happens as we backtrack through the failure points, which in
5098 turn happens only if we have not yet matched the entire string. */
5099 unsigned best_regs_set = false;
5100#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5101 re_char **best_regstart, **best_regend;
fa9a63c5 5102#endif
5e69f11e 5103
fa9a63c5
RM
5104 /* Logically, this is `best_regend[0]'. But we don't want to have to
5105 allocate space for that if we're not allocating space for anything
7814e705 5106 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5107 any of the other register vectors, and it seems rather a kludge to
5108 treat `best_regend' differently than the rest. So we keep track of
5109 the end of the best match so far in a separate variable. We
5110 initialize this to NULL so that when we backtrack the first time
5111 and need to test it, it's not garbage. */
66f0296e 5112 re_char *match_end = NULL;
fa9a63c5 5113
fa9a63c5
RM
5114#ifdef DEBUG
5115 /* Counts the total number of registers pushed. */
5e69f11e 5116 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5117#endif
5118
5119 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5120
fa9a63c5 5121 INIT_FAIL_STACK ();
5e69f11e 5122
fa9a63c5
RM
5123#ifdef MATCH_MAY_ALLOCATE
5124 /* Do not bother to initialize all the register variables if there are
5125 no groups in the pattern, as it takes a fair amount of time. If
5126 there are groups, we include space for register 0 (the whole
5127 pattern), even though we never use it, since it simplifies the
5128 array indexing. We should fix this. */
5129 if (bufp->re_nsub)
5130 {
66f0296e
SM
5131 regstart = REGEX_TALLOC (num_regs, re_char *);
5132 regend = REGEX_TALLOC (num_regs, re_char *);
5133 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5134 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5135
505bde11 5136 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5137 {
5138 FREE_VARIABLES ();
5139 return -2;
5140 }
fa9a63c5
RM
5141 }
5142 else
5143 {
5144 /* We must initialize all our variables to NULL, so that
25fe55af 5145 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5146 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5147 }
5148#endif /* MATCH_MAY_ALLOCATE */
5149
5150 /* The starting position is bogus. */
5151 if (pos < 0 || pos > size1 + size2)
5152 {
5153 FREE_VARIABLES ();
5154 return -1;
5155 }
5e69f11e 5156
fa9a63c5
RM
5157 /* Initialize subexpression text positions to -1 to mark ones that no
5158 start_memory/stop_memory has been seen for. Also initialize the
5159 register information struct. */
01618498
SM
5160 for (reg = 1; reg < num_regs; reg++)
5161 regstart[reg] = regend[reg] = NULL;
99633e97 5162
fa9a63c5 5163 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5164 `string1' is null. */
fa9a63c5
RM
5165 if (size2 == 0 && string1 != NULL)
5166 {
5167 string2 = string1;
5168 size2 = size1;
5169 string1 = 0;
5170 size1 = 0;
5171 }
5172 end1 = string1 + size1;
5173 end2 = string2 + size2;
5174
5e69f11e 5175 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5176 `dend' is the end of the input string that `d' points within. `d'
5177 is advanced into the following input string whenever necessary, but
5178 this happens before fetching; therefore, at the beginning of the
5179 loop, `d' can be pointing at the end of a string, but it cannot
5180 equal `string2'. */
419d1c74 5181 if (pos >= size1)
fa9a63c5 5182 {
419d1c74
SM
5183 /* Only match within string2. */
5184 d = string2 + pos - size1;
5185 dend = end_match_2 = string2 + stop - size1;
5186 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5187 }
5188 else
5189 {
f1ad044f 5190 if (stop < size1)
419d1c74
SM
5191 {
5192 /* Only match within string1. */
5193 end_match_1 = string1 + stop;
5194 /* BEWARE!
5195 When we reach end_match_1, PREFETCH normally switches to string2.
5196 But in the present case, this means that just doing a PREFETCH
5197 makes us jump from `stop' to `gap' within the string.
5198 What we really want here is for the search to stop as
5199 soon as we hit end_match_1. That's why we set end_match_2
5200 to end_match_1 (since PREFETCH fails as soon as we hit
5201 end_match_2). */
5202 end_match_2 = end_match_1;
5203 }
5204 else
f1ad044f
SM
5205 { /* It's important to use this code when stop == size so that
5206 moving `d' from end1 to string2 will not prevent the d == dend
5207 check from catching the end of string. */
419d1c74
SM
5208 end_match_1 = end1;
5209 end_match_2 = string2 + stop - size1;
5210 }
5211 d = string1 + pos;
5212 dend = end_match_1;
fa9a63c5
RM
5213 }
5214
5215 DEBUG_PRINT1 ("The compiled pattern is: ");
5216 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5217 DEBUG_PRINT1 ("The string to match is: `");
5218 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5219 DEBUG_PRINT1 ("'\n");
5e69f11e 5220
7814e705 5221 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5222 function if the match is complete, or it drops through if the match
5223 fails at this starting point in the input data. */
5224 for (;;)
5225 {
505bde11 5226 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5227
5228 if (p == pend)
5229 { /* End of pattern means we might have succeeded. */
25fe55af 5230 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5231
fa9a63c5 5232 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5233 longest match, try backtracking. */
5234 if (d != end_match_2)
fa9a63c5
RM
5235 {
5236 /* 1 if this match ends in the same string (string1 or string2)
5237 as the best previous match. */
5e69f11e 5238 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5239 == FIRST_STRING_P (d));
fa9a63c5
RM
5240 /* 1 if this match is the best seen so far. */
5241 boolean best_match_p;
5242
5243 /* AIX compiler got confused when this was combined
7814e705 5244 with the previous declaration. */
fa9a63c5
RM
5245 if (same_str_p)
5246 best_match_p = d > match_end;
5247 else
99633e97 5248 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5249
25fe55af
RS
5250 DEBUG_PRINT1 ("backtracking.\n");
5251
5252 if (!FAIL_STACK_EMPTY ())
5253 { /* More failure points to try. */
5254
5255 /* If exceeds best match so far, save it. */
5256 if (!best_regs_set || best_match_p)
5257 {
5258 best_regs_set = true;
5259 match_end = d;
5260
5261 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5262
01618498 5263 for (reg = 1; reg < num_regs; reg++)
25fe55af 5264 {
01618498
SM
5265 best_regstart[reg] = regstart[reg];
5266 best_regend[reg] = regend[reg];
25fe55af
RS
5267 }
5268 }
5269 goto fail;
5270 }
5271
5272 /* If no failure points, don't restore garbage. And if
5273 last match is real best match, don't restore second
5274 best one. */
5275 else if (best_regs_set && !best_match_p)
5276 {
5277 restore_best_regs:
5278 /* Restore best match. It may happen that `dend ==
5279 end_match_1' while the restored d is in string2.
5280 For example, the pattern `x.*y.*z' against the
5281 strings `x-' and `y-z-', if the two strings are
7814e705 5282 not consecutive in memory. */
25fe55af
RS
5283 DEBUG_PRINT1 ("Restoring best registers.\n");
5284
5285 d = match_end;
5286 dend = ((d >= string1 && d <= end1)
5287 ? end_match_1 : end_match_2);
fa9a63c5 5288
01618498 5289 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5290 {
01618498
SM
5291 regstart[reg] = best_regstart[reg];
5292 regend[reg] = best_regend[reg];
fa9a63c5 5293 }
25fe55af
RS
5294 }
5295 } /* d != end_match_2 */
fa9a63c5
RM
5296
5297 succeed_label:
25fe55af 5298 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5299
25fe55af
RS
5300 /* If caller wants register contents data back, do it. */
5301 if (regs && !bufp->no_sub)
fa9a63c5 5302 {
25fe55af
RS
5303 /* Have the register data arrays been allocated? */
5304 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5305 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5306 extra element beyond `num_regs' for the `-1' marker
5307 GNU code uses. */
5308 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5309 regs->start = TALLOC (regs->num_regs, regoff_t);
5310 regs->end = TALLOC (regs->num_regs, regoff_t);
5311 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5312 {
5313 FREE_VARIABLES ();
5314 return -2;
5315 }
25fe55af
RS
5316 bufp->regs_allocated = REGS_REALLOCATE;
5317 }
5318 else if (bufp->regs_allocated == REGS_REALLOCATE)
5319 { /* Yes. If we need more elements than were already
5320 allocated, reallocate them. If we need fewer, just
5321 leave it alone. */
5322 if (regs->num_regs < num_regs + 1)
5323 {
5324 regs->num_regs = num_regs + 1;
5325 RETALLOC (regs->start, regs->num_regs, regoff_t);
5326 RETALLOC (regs->end, regs->num_regs, regoff_t);
5327 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5328 {
5329 FREE_VARIABLES ();
5330 return -2;
5331 }
25fe55af
RS
5332 }
5333 }
5334 else
fa9a63c5
RM
5335 {
5336 /* These braces fend off a "empty body in an else-statement"
7814e705 5337 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5338 assert (bufp->regs_allocated == REGS_FIXED);
5339 }
5340
25fe55af
RS
5341 /* Convert the pointer data in `regstart' and `regend' to
5342 indices. Register zero has to be set differently,
5343 since we haven't kept track of any info for it. */
5344 if (regs->num_regs > 0)
5345 {
5346 regs->start[0] = pos;
99633e97 5347 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5348 }
5e69f11e 5349
25fe55af
RS
5350 /* Go through the first `min (num_regs, regs->num_regs)'
5351 registers, since that is all we initialized. */
01618498 5352 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5353 {
01618498
SM
5354 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5355 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5356 else
5357 {
01618498
SM
5358 regs->start[reg]
5359 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5360 regs->end[reg]
5361 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5362 }
fa9a63c5 5363 }
5e69f11e 5364
25fe55af
RS
5365 /* If the regs structure we return has more elements than
5366 were in the pattern, set the extra elements to -1. If
5367 we (re)allocated the registers, this is the case,
5368 because we always allocate enough to have at least one
7814e705 5369 -1 at the end. */
01618498
SM
5370 for (reg = num_regs; reg < regs->num_regs; reg++)
5371 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5372 } /* regs && !bufp->no_sub */
5373
25fe55af
RS
5374 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5375 nfailure_points_pushed, nfailure_points_popped,
5376 nfailure_points_pushed - nfailure_points_popped);
5377 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5378
99633e97 5379 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5380
25fe55af 5381 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5382
25fe55af
RS
5383 FREE_VARIABLES ();
5384 return mcnt;
5385 }
fa9a63c5 5386
7814e705 5387 /* Otherwise match next pattern command. */
fa9a63c5
RM
5388 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5389 {
25fe55af
RS
5390 /* Ignore these. Used to ignore the n of succeed_n's which
5391 currently have n == 0. */
5392 case no_op:
5393 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5394 break;
fa9a63c5
RM
5395
5396 case succeed:
25fe55af 5397 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5398 goto succeed_label;
5399
7814e705 5400 /* Match the next n pattern characters exactly. The following
25fe55af 5401 byte in the pattern defines n, and the n bytes after that
7814e705 5402 are the characters to match. */
fa9a63c5
RM
5403 case exactn:
5404 mcnt = *p++;
25fe55af 5405 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5406
99633e97
SM
5407 /* Remember the start point to rollback upon failure. */
5408 dfail = d;
5409
6fdd04b0 5410#ifndef emacs
25fe55af
RS
5411 /* This is written out as an if-else so we don't waste time
5412 testing `translate' inside the loop. */
28703c16 5413 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5414 do
5415 {
5416 PREFETCH ();
5417 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5418 {
6fdd04b0
KH
5419 d = dfail;
5420 goto fail;
e934739e 5421 }
6fdd04b0
KH
5422 d++;
5423 }
5424 while (--mcnt);
fa9a63c5 5425 else
6fdd04b0
KH
5426 do
5427 {
5428 PREFETCH ();
5429 if (*d++ != *p++)
bf216479 5430 {
6fdd04b0
KH
5431 d = dfail;
5432 goto fail;
bf216479 5433 }
6fdd04b0
KH
5434 }
5435 while (--mcnt);
5436#else /* emacs */
5437 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5438 if (target_multibyte)
6fdd04b0
KH
5439 do
5440 {
5441 int pat_charlen, buf_charlen;
cf9c99bc 5442 int pat_ch, buf_ch;
e934739e 5443
6fdd04b0 5444 PREFETCH ();
cf9c99bc 5445 if (multibyte)
62a6e103 5446 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5447 else
5448 {
5449 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5450 pat_charlen = 1;
5451 }
62a6e103 5452 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5453
6fdd04b0 5454 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5455 {
6fdd04b0
KH
5456 d = dfail;
5457 goto fail;
e934739e 5458 }
bf216479 5459
6fdd04b0
KH
5460 p += pat_charlen;
5461 d += buf_charlen;
5462 mcnt -= pat_charlen;
5463 }
5464 while (mcnt > 0);
fa9a63c5 5465 else
6fdd04b0
KH
5466 do
5467 {
cf9c99bc
KH
5468 int pat_charlen, buf_charlen;
5469 int pat_ch, buf_ch;
bf216479 5470
6fdd04b0 5471 PREFETCH ();
cf9c99bc
KH
5472 if (multibyte)
5473 {
62a6e103 5474 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5475 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5476 }
5477 else
5478 {
5479 pat_ch = *p;
5480 pat_charlen = 1;
5481 }
5482 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5483 if (! CHAR_BYTE8_P (buf_ch))
5484 {
5485 buf_ch = TRANSLATE (buf_ch);
5486 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5487 if (buf_ch < 0)
5488 buf_ch = *d;
5489 }
0e2501ed
AS
5490 else
5491 buf_ch = *d;
cf9c99bc 5492 if (buf_ch != pat_ch)
6fdd04b0
KH
5493 {
5494 d = dfail;
5495 goto fail;
bf216479 5496 }
cf9c99bc
KH
5497 p += pat_charlen;
5498 d++;
6fdd04b0
KH
5499 }
5500 while (--mcnt);
5501#endif
25fe55af 5502 break;
fa9a63c5
RM
5503
5504
25fe55af 5505 /* Match any character except possibly a newline or a null. */
fa9a63c5 5506 case anychar:
e934739e
RS
5507 {
5508 int buf_charlen;
01618498 5509 re_wchar_t buf_ch;
fa9a63c5 5510
e934739e 5511 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5512
e934739e 5513 PREFETCH ();
62a6e103 5514 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5515 target_multibyte);
e934739e
RS
5516 buf_ch = TRANSLATE (buf_ch);
5517
5518 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5519 && buf_ch == '\n')
5520 || ((bufp->syntax & RE_DOT_NOT_NULL)
5521 && buf_ch == '\000'))
5522 goto fail;
5523
e934739e
RS
5524 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5525 d += buf_charlen;
5526 }
fa9a63c5
RM
5527 break;
5528
5529
5530 case charset:
5531 case charset_not:
5532 {
b18215fc 5533 register unsigned int c;
fa9a63c5 5534 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5535 int len;
5536
5537 /* Start of actual range_table, or end of bitmap if there is no
5538 range table. */
01618498 5539 re_char *range_table;
b18215fc 5540
96cc36cc 5541 /* Nonzero if there is a range table. */
b18215fc
RS
5542 int range_table_exists;
5543
96cc36cc
RS
5544 /* Number of ranges of range table. This is not included
5545 in the initial byte-length of the command. */
5546 int count = 0;
fa9a63c5 5547
f5020181
AS
5548 /* Whether matching against a unibyte character. */
5549 boolean unibyte_char = false;
5550
25fe55af 5551 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5552
b18215fc 5553 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5554
b18215fc 5555 if (range_table_exists)
96cc36cc
RS
5556 {
5557 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5558 EXTRACT_NUMBER_AND_INCR (count, range_table);
5559 }
b18215fc 5560
2d1675e4 5561 PREFETCH ();
62a6e103 5562 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5563 if (target_multibyte)
5564 {
5565 int c1;
b18215fc 5566
cf9c99bc
KH
5567 c = TRANSLATE (c);
5568 c1 = RE_CHAR_TO_UNIBYTE (c);
5569 if (c1 >= 0)
f5020181
AS
5570 {
5571 unibyte_char = true;
5572 c = c1;
5573 }
cf9c99bc
KH
5574 }
5575 else
5576 {
5577 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5578
5579 if (! CHAR_BYTE8_P (c1))
5580 {
5581 c1 = TRANSLATE (c1);
5582 c1 = RE_CHAR_TO_UNIBYTE (c1);
5583 if (c1 >= 0)
f5020181
AS
5584 {
5585 unibyte_char = true;
5586 c = c1;
5587 }
cf9c99bc 5588 }
0b8be006
AS
5589 else
5590 unibyte_char = true;
cf9c99bc
KH
5591 }
5592
f5020181 5593 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5594 { /* Lookup bitmap. */
b18215fc
RS
5595 /* Cast to `unsigned' instead of `unsigned char' in
5596 case the bit list is a full 32 bytes long. */
5597 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5598 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5599 not = !not;
b18215fc 5600 }
96cc36cc 5601#ifdef emacs
b18215fc 5602 else if (range_table_exists)
96cc36cc
RS
5603 {
5604 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5605
14473664
SM
5606 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5607 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5608 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5609 | (class_bits & BIT_SPACE && ISSPACE (c))
5610 | (class_bits & BIT_UPPER && ISUPPER (c))
5611 | (class_bits & BIT_WORD && ISWORD (c)))
5612 not = !not;
5613 else
5614 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5615 }
5616#endif /* emacs */
fa9a63c5 5617
96cc36cc
RS
5618 if (range_table_exists)
5619 p = CHARSET_RANGE_TABLE_END (range_table, count);
5620 else
5621 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5622
5623 if (!not) goto fail;
5e69f11e 5624
b18215fc 5625 d += len;
fa9a63c5
RM
5626 break;
5627 }
5628
5629
25fe55af 5630 /* The beginning of a group is represented by start_memory.
505bde11 5631 The argument is the register number. The text
25fe55af 5632 matched within the group is recorded (in the internal
7814e705 5633 registers data structure) under the register number. */
25fe55af 5634 case start_memory:
505bde11
SM
5635 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5636
5637 /* In case we need to undo this operation (via backtracking). */
5638 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5639
25fe55af 5640 regstart[*p] = d;
4bb91c68 5641 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5642 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5643
25fe55af 5644 /* Move past the register number and inner group count. */
505bde11 5645 p += 1;
25fe55af 5646 break;
fa9a63c5
RM
5647
5648
25fe55af 5649 /* The stop_memory opcode represents the end of a group. Its
505bde11 5650 argument is the same as start_memory's: the register number. */
fa9a63c5 5651 case stop_memory:
505bde11
SM
5652 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5653
5654 assert (!REG_UNSET (regstart[*p]));
5655 /* Strictly speaking, there should be code such as:
177c0ea7 5656
0b32bf0e 5657 assert (REG_UNSET (regend[*p]));
505bde11
SM
5658 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5659
5660 But the only info to be pushed is regend[*p] and it is known to
5661 be UNSET, so there really isn't anything to push.
5662 Not pushing anything, on the other hand deprives us from the
5663 guarantee that regend[*p] is UNSET since undoing this operation
5664 will not reset its value properly. This is not important since
5665 the value will only be read on the next start_memory or at
5666 the very end and both events can only happen if this stop_memory
5667 is *not* undone. */
fa9a63c5 5668
25fe55af 5669 regend[*p] = d;
fa9a63c5
RM
5670 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5671
25fe55af 5672 /* Move past the register number and the inner group count. */
505bde11 5673 p += 1;
25fe55af 5674 break;
fa9a63c5
RM
5675
5676
5677 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5678 followed by the numeric value of <digit> as the register number. */
5679 case duplicate:
fa9a63c5 5680 {
66f0296e 5681 register re_char *d2, *dend2;
7814e705 5682 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5683 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5684
7814e705 5685 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5686 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5687 goto fail;
5e69f11e 5688
7814e705 5689 /* Where in input to try to start matching. */
25fe55af 5690 d2 = regstart[regno];
5e69f11e 5691
99633e97
SM
5692 /* Remember the start point to rollback upon failure. */
5693 dfail = d;
5694
25fe55af
RS
5695 /* Where to stop matching; if both the place to start and
5696 the place to stop matching are in the same string, then
5697 set to the place to stop, otherwise, for now have to use
5698 the end of the first string. */
fa9a63c5 5699
25fe55af 5700 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5701 == FIRST_STRING_P (regend[regno]))
5702 ? regend[regno] : end_match_1);
5703 for (;;)
5704 {
5705 /* If necessary, advance to next segment in register
25fe55af 5706 contents. */
fa9a63c5
RM
5707 while (d2 == dend2)
5708 {
5709 if (dend2 == end_match_2) break;
5710 if (dend2 == regend[regno]) break;
5711
25fe55af
RS
5712 /* End of string1 => advance to string2. */
5713 d2 = string2;
5714 dend2 = regend[regno];
fa9a63c5
RM
5715 }
5716 /* At end of register contents => success */
5717 if (d2 == dend2) break;
5718
5719 /* If necessary, advance to next segment in data. */
5720 PREFETCH ();
5721
5722 /* How many characters left in this segment to match. */
5723 mcnt = dend - d;
5e69f11e 5724
fa9a63c5 5725 /* Want how many consecutive characters we can match in
25fe55af
RS
5726 one shot, so, if necessary, adjust the count. */
5727 if (mcnt > dend2 - d2)
fa9a63c5 5728 mcnt = dend2 - d2;
5e69f11e 5729
fa9a63c5 5730 /* Compare that many; failure if mismatch, else move
25fe55af 5731 past them. */
28703c16 5732 if (RE_TRANSLATE_P (translate)
02cb78b5 5733 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5734 : memcmp (d, d2, mcnt))
99633e97
SM
5735 {
5736 d = dfail;
5737 goto fail;
5738 }
fa9a63c5 5739 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5740 }
5741 }
5742 break;
5743
5744
25fe55af 5745 /* begline matches the empty string at the beginning of the string
c0f9ea08 5746 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5747 case begline:
25fe55af 5748 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5749
25fe55af
RS
5750 if (AT_STRINGS_BEG (d))
5751 {
5752 if (!bufp->not_bol) break;
5753 }
419d1c74 5754 else
25fe55af 5755 {
bf216479 5756 unsigned c;
419d1c74 5757 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5758 if (c == '\n')
419d1c74 5759 break;
25fe55af
RS
5760 }
5761 /* In all other cases, we fail. */
5762 goto fail;
fa9a63c5
RM
5763
5764
25fe55af 5765 /* endline is the dual of begline. */
fa9a63c5 5766 case endline:
25fe55af 5767 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5768
25fe55af
RS
5769 if (AT_STRINGS_END (d))
5770 {
5771 if (!bufp->not_eol) break;
5772 }
f1ad044f 5773 else
25fe55af 5774 {
f1ad044f 5775 PREFETCH_NOLIMIT ();
c0f9ea08 5776 if (*d == '\n')
f1ad044f 5777 break;
25fe55af
RS
5778 }
5779 goto fail;
fa9a63c5
RM
5780
5781
5782 /* Match at the very beginning of the data. */
25fe55af
RS
5783 case begbuf:
5784 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5785 if (AT_STRINGS_BEG (d))
5786 break;
5787 goto fail;
fa9a63c5
RM
5788
5789
5790 /* Match at the very end of the data. */
25fe55af
RS
5791 case endbuf:
5792 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5793 if (AT_STRINGS_END (d))
5794 break;
25fe55af 5795 goto fail;
5e69f11e 5796
5e69f11e 5797
25fe55af
RS
5798 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5799 pushes NULL as the value for the string on the stack. Then
505bde11 5800 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5801 string, instead of restoring it. To see why, consider
7814e705 5802 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5803 then the . fails against the \n. But the next thing we want
5804 to do is match the \n against the \n; if we restored the
5805 string value, we would be back at the foo.
5806
5807 Because this is used only in specific cases, we don't need to
5808 check all the things that `on_failure_jump' does, to make
5809 sure the right things get saved on the stack. Hence we don't
5810 share its code. The only reason to push anything on the
5811 stack at all is that otherwise we would have to change
5812 `anychar's code to do something besides goto fail in this
5813 case; that seems worse than this. */
5814 case on_failure_keep_string_jump:
505bde11
SM
5815 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5816 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5817 mcnt, p + mcnt);
fa9a63c5 5818
505bde11
SM
5819 PUSH_FAILURE_POINT (p - 3, NULL);
5820 break;
5821
0683b6fa
SM
5822 /* A nasty loop is introduced by the non-greedy *? and +?.
5823 With such loops, the stack only ever contains one failure point
5824 at a time, so that a plain on_failure_jump_loop kind of
5825 cycle detection cannot work. Worse yet, such a detection
5826 can not only fail to detect a cycle, but it can also wrongly
5827 detect a cycle (between different instantiations of the same
6df42991 5828 loop).
0683b6fa
SM
5829 So the method used for those nasty loops is a little different:
5830 We use a special cycle-detection-stack-frame which is pushed
5831 when the on_failure_jump_nastyloop failure-point is *popped*.
5832 This special frame thus marks the beginning of one iteration
5833 through the loop and we can hence easily check right here
5834 whether something matched between the beginning and the end of
5835 the loop. */
5836 case on_failure_jump_nastyloop:
5837 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5838 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5839 mcnt, p + mcnt);
5840
5841 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5842 {
5843 int cycle = 0;
5844 CHECK_INFINITE_LOOP (p - 4, d);
5845 if (!cycle)
5846 /* If there's a cycle, just continue without pushing
5847 this failure point. The failure point is the "try again"
5848 option, which shouldn't be tried.
5849 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5850 PUSH_FAILURE_POINT (p - 3, d);
5851 }
0683b6fa
SM
5852 break;
5853
4e8a9132
SM
5854 /* Simple loop detecting on_failure_jump: just check on the
5855 failure stack if the same spot was already hit earlier. */
505bde11
SM
5856 case on_failure_jump_loop:
5857 on_failure:
5858 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5859 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5860 mcnt, p + mcnt);
6df42991
SM
5861 {
5862 int cycle = 0;
5863 CHECK_INFINITE_LOOP (p - 3, d);
5864 if (cycle)
5865 /* If there's a cycle, get out of the loop, as if the matching
5866 had failed. We used to just `goto fail' here, but that was
5867 aborting the search a bit too early: we want to keep the
5868 empty-loop-match and keep matching after the loop.
5869 We want (x?)*y\1z to match both xxyz and xxyxz. */
5870 p += mcnt;
5871 else
5872 PUSH_FAILURE_POINT (p - 3, d);
5873 }
25fe55af 5874 break;
fa9a63c5
RM
5875
5876
5877 /* Uses of on_failure_jump:
5e69f11e 5878
25fe55af
RS
5879 Each alternative starts with an on_failure_jump that points
5880 to the beginning of the next alternative. Each alternative
5881 except the last ends with a jump that in effect jumps past
5882 the rest of the alternatives. (They really jump to the
5883 ending jump of the following alternative, because tensioning
5884 these jumps is a hassle.)
fa9a63c5 5885
25fe55af
RS
5886 Repeats start with an on_failure_jump that points past both
5887 the repetition text and either the following jump or
5888 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5889 case on_failure_jump:
25fe55af 5890 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5891 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5892 mcnt, p + mcnt);
25fe55af 5893
505bde11 5894 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5895 break;
5896
4e8a9132 5897 /* This operation is used for greedy *.
505bde11
SM
5898 Compare the beginning of the repeat with what in the
5899 pattern follows its end. If we can establish that there
5900 is nothing that they would both match, i.e., that we
5901 would have to backtrack because of (as in, e.g., `a*a')
5902 then we can use a non-backtracking loop based on
4e8a9132 5903 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5904 case on_failure_jump_smart:
25fe55af 5905 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5906 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5907 mcnt, p + mcnt);
25fe55af 5908 {
01618498 5909 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5910 /* Here, we discard `const', making re_match non-reentrant. */
5911 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5912 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5913
505bde11
SM
5914 p -= 3; /* Reset so that we will re-execute the
5915 instruction once it's been changed. */
fa9a63c5 5916
4e8a9132
SM
5917 EXTRACT_NUMBER (mcnt, p2 - 2);
5918
5919 /* Ensure this is a indeed the trivial kind of loop
5920 we are expecting. */
5921 assert (skip_one_char (p1) == p2 - 3);
5922 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5923 DEBUG_STATEMENT (debug += 2);
505bde11 5924 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5925 {
505bde11 5926 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5927 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5928 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5929 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5930 }
505bde11 5931 else
fa9a63c5 5932 {
505bde11
SM
5933 /* Default to a safe `on_failure_jump' loop. */
5934 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5935 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5936 }
99633e97 5937 DEBUG_STATEMENT (debug -= 2);
25fe55af 5938 }
505bde11 5939 break;
25fe55af
RS
5940
5941 /* Unconditionally jump (without popping any failure points). */
5942 case jump:
fa9a63c5 5943 unconditional_jump:
5b370c2b 5944 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5945 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5946 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5947 p += mcnt; /* Do the jump. */
505bde11 5948 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5949 break;
5950
5951
25fe55af
RS
5952 /* Have to succeed matching what follows at least n times.
5953 After that, handle like `on_failure_jump'. */
5954 case succeed_n:
01618498 5955 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5956 EXTRACT_NUMBER (mcnt, p + 2);
5957 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5958
dc1e502d
SM
5959 /* Originally, mcnt is how many times we HAVE to succeed. */
5960 if (mcnt != 0)
25fe55af 5961 {
6dcf2d0e
SM
5962 /* Here, we discard `const', making re_match non-reentrant. */
5963 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5964 mcnt--;
01618498
SM
5965 p += 4;
5966 PUSH_NUMBER (p2, mcnt);
25fe55af 5967 }
dc1e502d
SM
5968 else
5969 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5970 goto on_failure;
25fe55af
RS
5971 break;
5972
5973 case jump_n:
01618498 5974 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5975 EXTRACT_NUMBER (mcnt, p + 2);
5976 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5977
5978 /* Originally, this is how many times we CAN jump. */
dc1e502d 5979 if (mcnt != 0)
25fe55af 5980 {
6dcf2d0e
SM
5981 /* Here, we discard `const', making re_match non-reentrant. */
5982 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5983 mcnt--;
01618498 5984 PUSH_NUMBER (p2, mcnt);
dc1e502d 5985 goto unconditional_jump;
25fe55af
RS
5986 }
5987 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5988 else
5989 p += 4;
25fe55af 5990 break;
5e69f11e 5991
fa9a63c5
RM
5992 case set_number_at:
5993 {
01618498 5994 unsigned char *p2; /* Location of the counter. */
25fe55af 5995 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5996
25fe55af 5997 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5998 /* Here, we discard `const', making re_match non-reentrant. */
5999 p2 = (unsigned char*) p + mcnt;
01618498 6000 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 6001 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
6002 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6003 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
6004 break;
6005 }
9121ca40
KH
6006
6007 case wordbound:
66f0296e
SM
6008 case notwordbound:
6009 not = (re_opcode_t) *(p - 1) == notwordbound;
6010 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 6011
99633e97 6012 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 6013
b18215fc 6014 /* Case 1: D is at the beginning or the end of string. */
9121ca40 6015 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 6016 not = !not;
b18215fc
RS
6017 else
6018 {
6019 /* C1 is the character before D, S1 is the syntax of C1, C2
6020 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6021 re_wchar_t c1, c2;
6022 int s1, s2;
bf216479 6023 int dummy;
b18215fc 6024#ifdef emacs
2d1675e4
SM
6025 int offset = PTR_TO_OFFSET (d - 1);
6026 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 6027 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6028#endif
66f0296e 6029 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6030 s1 = SYNTAX (c1);
6031#ifdef emacs
5d967c7a 6032 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 6033#endif
f1ad044f 6034 PREFETCH_NOLIMIT ();
6fdd04b0 6035 GET_CHAR_AFTER (c2, d, dummy);
b18215fc
RS
6036 s2 = SYNTAX (c2);
6037
6038 if (/* Case 2: Only one of S1 and S2 is Sword. */
6039 ((s1 == Sword) != (s2 == Sword))
6040 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 6041 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 6042 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
6043 not = !not;
6044 }
6045 if (not)
9121ca40 6046 break;
b18215fc 6047 else
9121ca40 6048 goto fail;
fa9a63c5
RM
6049
6050 case wordbeg:
25fe55af 6051 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6052
b18215fc
RS
6053 /* We FAIL in one of the following cases: */
6054
7814e705 6055 /* Case 1: D is at the end of string. */
b18215fc 6056 if (AT_STRINGS_END (d))
99633e97 6057 goto fail;
b18215fc
RS
6058 else
6059 {
6060 /* C1 is the character before D, S1 is the syntax of C1, C2
6061 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6062 re_wchar_t c1, c2;
6063 int s1, s2;
bf216479 6064 int dummy;
fa9a63c5 6065#ifdef emacs
2d1675e4
SM
6066 int offset = PTR_TO_OFFSET (d);
6067 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6068 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6069#endif
99633e97 6070 PREFETCH ();
6fdd04b0 6071 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6072 s2 = SYNTAX (c2);
177c0ea7 6073
b18215fc
RS
6074 /* Case 2: S2 is not Sword. */
6075 if (s2 != Sword)
6076 goto fail;
6077
6078 /* Case 3: D is not at the beginning of string ... */
6079 if (!AT_STRINGS_BEG (d))
6080 {
6081 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6082#ifdef emacs
5d967c7a 6083 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6084#endif
b18215fc
RS
6085 s1 = SYNTAX (c1);
6086
6087 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6088 returns 0. */
b18215fc
RS
6089 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6090 goto fail;
6091 }
6092 }
e318085a
RS
6093 break;
6094
b18215fc 6095 case wordend:
25fe55af 6096 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6097
6098 /* We FAIL in one of the following cases: */
6099
6100 /* Case 1: D is at the beginning of string. */
6101 if (AT_STRINGS_BEG (d))
e318085a 6102 goto fail;
b18215fc
RS
6103 else
6104 {
6105 /* C1 is the character before D, S1 is the syntax of C1, C2
6106 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6107 re_wchar_t c1, c2;
6108 int s1, s2;
bf216479 6109 int dummy;
5d967c7a 6110#ifdef emacs
2d1675e4
SM
6111 int offset = PTR_TO_OFFSET (d) - 1;
6112 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6113 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6114#endif
99633e97 6115 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6116 s1 = SYNTAX (c1);
6117
6118 /* Case 2: S1 is not Sword. */
6119 if (s1 != Sword)
6120 goto fail;
6121
6122 /* Case 3: D is not at the end of string ... */
6123 if (!AT_STRINGS_END (d))
6124 {
f1ad044f 6125 PREFETCH_NOLIMIT ();
6fdd04b0 6126 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6127#ifdef emacs
6128 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6129#endif
b18215fc
RS
6130 s2 = SYNTAX (c2);
6131
6132 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6133 returns 0. */
b18215fc 6134 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6135 goto fail;
b18215fc
RS
6136 }
6137 }
e318085a
RS
6138 break;
6139
669fa600
SM
6140 case symbeg:
6141 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6142
6143 /* We FAIL in one of the following cases: */
6144
7814e705 6145 /* Case 1: D is at the end of string. */
669fa600
SM
6146 if (AT_STRINGS_END (d))
6147 goto fail;
6148 else
6149 {
6150 /* C1 is the character before D, S1 is the syntax of C1, C2
6151 is the character at D, and S2 is the syntax of C2. */
6152 re_wchar_t c1, c2;
6153 int s1, s2;
6154#ifdef emacs
6155 int offset = PTR_TO_OFFSET (d);
6156 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6157 UPDATE_SYNTAX_TABLE (charpos);
6158#endif
6159 PREFETCH ();
62a6e103 6160 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6161 s2 = SYNTAX (c2);
7814e705 6162
669fa600
SM
6163 /* Case 2: S2 is neither Sword nor Ssymbol. */
6164 if (s2 != Sword && s2 != Ssymbol)
6165 goto fail;
6166
6167 /* Case 3: D is not at the beginning of string ... */
6168 if (!AT_STRINGS_BEG (d))
6169 {
6170 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6171#ifdef emacs
6172 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6173#endif
6174 s1 = SYNTAX (c1);
6175
6176 /* ... and S1 is Sword or Ssymbol. */
6177 if (s1 == Sword || s1 == Ssymbol)
6178 goto fail;
6179 }
6180 }
6181 break;
6182
6183 case symend:
6184 DEBUG_PRINT1 ("EXECUTING symend.\n");
6185
6186 /* We FAIL in one of the following cases: */
6187
6188 /* Case 1: D is at the beginning of string. */
6189 if (AT_STRINGS_BEG (d))
6190 goto fail;
6191 else
6192 {
6193 /* C1 is the character before D, S1 is the syntax of C1, C2
6194 is the character at D, and S2 is the syntax of C2. */
6195 re_wchar_t c1, c2;
6196 int s1, s2;
6197#ifdef emacs
6198 int offset = PTR_TO_OFFSET (d) - 1;
6199 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6200 UPDATE_SYNTAX_TABLE (charpos);
6201#endif
6202 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6203 s1 = SYNTAX (c1);
6204
6205 /* Case 2: S1 is neither Ssymbol nor Sword. */
6206 if (s1 != Sword && s1 != Ssymbol)
6207 goto fail;
6208
6209 /* Case 3: D is not at the end of string ... */
6210 if (!AT_STRINGS_END (d))
6211 {
6212 PREFETCH_NOLIMIT ();
62a6e103 6213 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6214#ifdef emacs
134579f2 6215 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6216#endif
6217 s2 = SYNTAX (c2);
6218
6219 /* ... and S2 is Sword or Ssymbol. */
6220 if (s2 == Sword || s2 == Ssymbol)
6221 goto fail;
b18215fc
RS
6222 }
6223 }
e318085a
RS
6224 break;
6225
fa9a63c5 6226 case syntaxspec:
1fb352e0
SM
6227 case notsyntaxspec:
6228 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6229 mcnt = *p++;
1fb352e0 6230 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6231 PREFETCH ();
b18215fc
RS
6232#ifdef emacs
6233 {
2d1675e4
SM
6234 int offset = PTR_TO_OFFSET (d);
6235 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6236 UPDATE_SYNTAX_TABLE (pos1);
6237 }
25fe55af 6238#endif
b18215fc 6239 {
01618498
SM
6240 int len;
6241 re_wchar_t c;
b18215fc 6242
6fdd04b0 6243 GET_CHAR_AFTER (c, d, len);
990b2375 6244 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6245 goto fail;
b18215fc
RS
6246 d += len;
6247 }
fa9a63c5
RM
6248 break;
6249
b18215fc 6250#ifdef emacs
1fb352e0
SM
6251 case before_dot:
6252 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6253 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6254 goto fail;
b18215fc
RS
6255 break;
6256
1fb352e0
SM
6257 case at_dot:
6258 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6259 if (PTR_BYTE_POS (d) != PT_BYTE)
6260 goto fail;
6261 break;
b18215fc 6262
1fb352e0
SM
6263 case after_dot:
6264 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6265 if (PTR_BYTE_POS (d) <= PT_BYTE)
6266 goto fail;
e318085a 6267 break;
fa9a63c5 6268
1fb352e0 6269 case categoryspec:
b18215fc 6270 case notcategoryspec:
1fb352e0 6271 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6272 mcnt = *p++;
1fb352e0 6273 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6274 PREFETCH ();
6275 {
01618498
SM
6276 int len;
6277 re_wchar_t c;
6278
6fdd04b0 6279 GET_CHAR_AFTER (c, d, len);
1fb352e0 6280 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6281 goto fail;
6282 d += len;
6283 }
fa9a63c5 6284 break;
5e69f11e 6285
1fb352e0 6286#endif /* emacs */
5e69f11e 6287
0b32bf0e
SM
6288 default:
6289 abort ();
fa9a63c5 6290 }
b18215fc 6291 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6292
6293
6294 /* We goto here if a matching operation fails. */
6295 fail:
5b370c2b 6296 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6297 if (!FAIL_STACK_EMPTY ())
505bde11 6298 {
01618498 6299 re_char *str, *pat;
505bde11 6300 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6301 DEBUG_PRINT1 ("\nFAIL:\n");
6302 POP_FAILURE_POINT (str, pat);
505bde11
SM
6303 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6304 {
6305 case on_failure_keep_string_jump:
6306 assert (str == NULL);
6307 goto continue_failure_jump;
6308
0683b6fa
SM
6309 case on_failure_jump_nastyloop:
6310 assert ((re_opcode_t)pat[-2] == no_op);
6311 PUSH_FAILURE_POINT (pat - 2, str);
6312 /* Fallthrough */
6313
505bde11
SM
6314 case on_failure_jump_loop:
6315 case on_failure_jump:
6316 case succeed_n:
6317 d = str;
6318 continue_failure_jump:
6319 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6320 p = pat + mcnt;
6321 break;
b18215fc 6322
0683b6fa
SM
6323 case no_op:
6324 /* A special frame used for nastyloops. */
6325 goto fail;
6326
505bde11
SM
6327 default:
6328 abort();
6329 }
fa9a63c5 6330
505bde11 6331 assert (p >= bufp->buffer && p <= pend);
b18215fc 6332
0b32bf0e 6333 if (d >= string1 && d <= end1)
fa9a63c5 6334 dend = end_match_1;
0b32bf0e 6335 }
fa9a63c5 6336 else
0b32bf0e 6337 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6338 } /* for (;;) */
6339
6340 if (best_regs_set)
6341 goto restore_best_regs;
6342
6343 FREE_VARIABLES ();
6344
b18215fc 6345 return -1; /* Failure to match. */
fa9a63c5
RM
6346} /* re_match_2 */
6347\f
6348/* Subroutine definitions for re_match_2. */
6349
fa9a63c5
RM
6350/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6351 bytes; nonzero otherwise. */
5e69f11e 6352
fa9a63c5 6353static int
438105ed
JB
6354bcmp_translate (const re_char *s1, const re_char *s2, register int len,
6355 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6356{
2d1675e4
SM
6357 register re_char *p1 = s1, *p2 = s2;
6358 re_char *p1_end = s1 + len;
6359 re_char *p2_end = s2 + len;
e934739e 6360
4bb91c68
SM
6361 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6362 different lengths, but relying on a single `len' would break this. -sm */
6363 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6364 {
e934739e 6365 int p1_charlen, p2_charlen;
01618498 6366 re_wchar_t p1_ch, p2_ch;
e934739e 6367
6fdd04b0
KH
6368 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6369 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6370
6371 if (RE_TRANSLATE (translate, p1_ch)
6372 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6373 return 1;
e934739e
RS
6374
6375 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6376 }
e934739e
RS
6377
6378 if (p1 != p1_end || p2 != p2_end)
6379 return 1;
6380
fa9a63c5
RM
6381 return 0;
6382}
6383\f
6384/* Entry points for GNU code. */
6385
6386/* re_compile_pattern is the GNU regular expression compiler: it
6387 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6388 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6389
fa9a63c5
RM
6390 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6391 are set in BUFP on entry.
5e69f11e 6392
b18215fc 6393 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6394
6395const char *
971de7fb 6396re_compile_pattern (const char *pattern, size_t length, struct re_pattern_buffer *bufp)
fa9a63c5
RM
6397{
6398 reg_errcode_t ret;
5e69f11e 6399
fa9a63c5
RM
6400 /* GNU code is written to assume at least RE_NREGS registers will be set
6401 (and at least one extra will be -1). */
6402 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6403
fa9a63c5
RM
6404 /* And GNU code determines whether or not to get register information
6405 by passing null for the REGS argument to re_match, etc., not by
6406 setting no_sub. */
6407 bufp->no_sub = 0;
5e69f11e 6408
4bb91c68 6409 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6410
6411 if (!ret)
6412 return NULL;
6413 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6414}
c0f9ea08 6415WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6416\f
b18215fc
RS
6417/* Entry points compatible with 4.2 BSD regex library. We don't define
6418 them unless specifically requested. */
fa9a63c5 6419
0b32bf0e 6420#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6421
6422/* BSD has one and only one pattern buffer. */
6423static struct re_pattern_buffer re_comp_buf;
6424
6425char *
0b32bf0e 6426# ifdef _LIBC
48afdd44
RM
6427/* Make these definitions weak in libc, so POSIX programs can redefine
6428 these names if they don't use our functions, and still use
6429 regcomp/regexec below without link errors. */
6430weak_function
0b32bf0e 6431# endif
fa9a63c5
RM
6432re_comp (s)
6433 const char *s;
6434{
6435 reg_errcode_t ret;
5e69f11e 6436
fa9a63c5
RM
6437 if (!s)
6438 {
6439 if (!re_comp_buf.buffer)
0b32bf0e 6440 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6441 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6442 return 0;
6443 }
6444
6445 if (!re_comp_buf.buffer)
6446 {
6447 re_comp_buf.buffer = (unsigned char *) malloc (200);
6448 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6449 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6450 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6451 re_comp_buf.allocated = 200;
6452
6453 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6454 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6455 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6456 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6457 }
6458
6459 /* Since `re_exec' always passes NULL for the `regs' argument, we
6460 don't need to initialize the pattern buffer fields which affect it. */
6461
fa9a63c5 6462 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6463
fa9a63c5
RM
6464 if (!ret)
6465 return NULL;
6466
6467 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6468 return (char *) gettext (re_error_msgid[(int) ret]);
6469}
6470
6471
6472int
0b32bf0e 6473# ifdef _LIBC
48afdd44 6474weak_function
0b32bf0e 6475# endif
fa9a63c5
RM
6476re_exec (s)
6477 const char *s;
6478{
6479 const int len = strlen (s);
6480 return
6481 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6482}
6483#endif /* _REGEX_RE_COMP */
6484\f
6485/* POSIX.2 functions. Don't define these for Emacs. */
6486
6487#ifndef emacs
6488
6489/* regcomp takes a regular expression as a string and compiles it.
6490
b18215fc 6491 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6492 since POSIX says we shouldn't. Thus, we set
6493
6494 `buffer' to the compiled pattern;
6495 `used' to the length of the compiled pattern;
6496 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6497 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6498 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6499 `fastmap' to an allocated space for the fastmap;
6500 `fastmap_accurate' to zero;
fa9a63c5
RM
6501 `re_nsub' to the number of subexpressions in PATTERN.
6502
6503 PATTERN is the address of the pattern string.
6504
6505 CFLAGS is a series of bits which affect compilation.
6506
6507 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6508 use POSIX basic syntax.
6509
6510 If REG_NEWLINE is set, then . and [^...] don't match newline.
6511 Also, regexec will try a match beginning after every newline.
6512
6513 If REG_ICASE is set, then we considers upper- and lowercase
6514 versions of letters to be equivalent when matching.
6515
6516 If REG_NOSUB is set, then when PREG is passed to regexec, that
6517 routine will report only success or failure, and nothing about the
6518 registers.
6519
b18215fc 6520 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6521 the return codes and their meanings.) */
6522
6523int
d2762c86
DN
6524regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6525 int cflags)
fa9a63c5
RM
6526{
6527 reg_errcode_t ret;
4bb91c68 6528 reg_syntax_t syntax
fa9a63c5
RM
6529 = (cflags & REG_EXTENDED) ?
6530 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6531
6532 /* regex_compile will allocate the space for the compiled pattern. */
6533 preg->buffer = 0;
6534 preg->allocated = 0;
6535 preg->used = 0;
5e69f11e 6536
c0f9ea08
SM
6537 /* Try to allocate space for the fastmap. */
6538 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6539
fa9a63c5
RM
6540 if (cflags & REG_ICASE)
6541 {
6542 unsigned i;
5e69f11e 6543
6676cb1c
RS
6544 preg->translate
6545 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6546 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6547 if (preg->translate == NULL)
0b32bf0e 6548 return (int) REG_ESPACE;
fa9a63c5
RM
6549
6550 /* Map uppercase characters to corresponding lowercase ones. */
6551 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6552 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6553 }
6554 else
6555 preg->translate = NULL;
6556
6557 /* If REG_NEWLINE is set, newlines are treated differently. */
6558 if (cflags & REG_NEWLINE)
6559 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6560 syntax &= ~RE_DOT_NEWLINE;
6561 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6562 }
6563 else
c0f9ea08 6564 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6565
6566 preg->no_sub = !!(cflags & REG_NOSUB);
6567
5e69f11e 6568 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6569 can use strlen here in compiling the pattern. */
4bb91c68 6570 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6571
fa9a63c5
RM
6572 /* POSIX doesn't distinguish between an unmatched open-group and an
6573 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6574 if (ret == REG_ERPAREN)
6575 ret = REG_EPAREN;
6576
6577 if (ret == REG_NOERROR && preg->fastmap)
6578 { /* Compute the fastmap now, since regexec cannot modify the pattern
6579 buffer. */
6580 re_compile_fastmap (preg);
6581 if (preg->can_be_null)
6582 { /* The fastmap can't be used anyway. */
6583 free (preg->fastmap);
6584 preg->fastmap = NULL;
6585 }
6586 }
fa9a63c5
RM
6587 return (int) ret;
6588}
c0f9ea08 6589WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6590
6591
6592/* regexec searches for a given pattern, specified by PREG, in the
6593 string STRING.
5e69f11e 6594
fa9a63c5 6595 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6596 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6597 least NMATCH elements, and we set them to the offsets of the
6598 corresponding matched substrings.
5e69f11e 6599
fa9a63c5
RM
6600 EFLAGS specifies `execution flags' which affect matching: if
6601 REG_NOTBOL is set, then ^ does not match at the beginning of the
6602 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6603
fa9a63c5
RM
6604 We return 0 if we find a match and REG_NOMATCH if not. */
6605
6606int
d2762c86
DN
6607regexec (const regex_t *__restrict preg, const char *__restrict string,
6608 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
fa9a63c5
RM
6609{
6610 int ret;
6611 struct re_registers regs;
6612 regex_t private_preg;
6613 int len = strlen (string);
c0f9ea08 6614 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6615
6616 private_preg = *preg;
5e69f11e 6617
fa9a63c5
RM
6618 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6619 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6620
fa9a63c5
RM
6621 /* The user has told us exactly how many registers to return
6622 information about, via `nmatch'. We have to pass that on to the
b18215fc 6623 matching routines. */
fa9a63c5 6624 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6625
fa9a63c5
RM
6626 if (want_reg_info)
6627 {
6628 regs.num_regs = nmatch;
4bb91c68
SM
6629 regs.start = TALLOC (nmatch * 2, regoff_t);
6630 if (regs.start == NULL)
0b32bf0e 6631 return (int) REG_NOMATCH;
4bb91c68 6632 regs.end = regs.start + nmatch;
fa9a63c5
RM
6633 }
6634
c0f9ea08
SM
6635 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6636 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6637 was a little bit longer but still only matching the real part.
6638 This works because the `endline' will check for a '\n' and will find a
6639 '\0', correctly deciding that this is not the end of a line.
6640 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6641 a convenient '\0' there. For all we know, the string could be preceded
6642 by '\n' which would throw things off. */
6643
fa9a63c5
RM
6644 /* Perform the searching operation. */
6645 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6646 /* start: */ 0, /* range: */ len,
6647 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6648
fa9a63c5
RM
6649 /* Copy the register information to the POSIX structure. */
6650 if (want_reg_info)
6651 {
6652 if (ret >= 0)
0b32bf0e
SM
6653 {
6654 unsigned r;
fa9a63c5 6655
0b32bf0e
SM
6656 for (r = 0; r < nmatch; r++)
6657 {
6658 pmatch[r].rm_so = regs.start[r];
6659 pmatch[r].rm_eo = regs.end[r];
6660 }
6661 }
fa9a63c5 6662
b18215fc 6663 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6664 free (regs.start);
fa9a63c5
RM
6665 }
6666
6667 /* We want zero return to mean success, unlike `re_search'. */
6668 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6669}
c0f9ea08 6670WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6671
6672
ec869672
JR
6673/* Returns a message corresponding to an error code, ERR_CODE, returned
6674 from either regcomp or regexec. We don't use PREG here.
6675
6676 ERR_CODE was previously called ERRCODE, but that name causes an
6677 error with msvc8 compiler. */
fa9a63c5
RM
6678
6679size_t
d2762c86 6680regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
fa9a63c5
RM
6681{
6682 const char *msg;
6683 size_t msg_size;
6684
ec869672
JR
6685 if (err_code < 0
6686 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6687 /* Only error codes returned by the rest of the code should be passed
b18215fc 6688 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6689 code generates an invalid error code, then the program has a bug.
6690 Dump core so we can fix it. */
6691 abort ();
6692
ec869672 6693 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6694
6695 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6696
fa9a63c5
RM
6697 if (errbuf_size != 0)
6698 {
6699 if (msg_size > errbuf_size)
0b32bf0e
SM
6700 {
6701 strncpy (errbuf, msg, errbuf_size - 1);
6702 errbuf[errbuf_size - 1] = 0;
6703 }
fa9a63c5 6704 else
0b32bf0e 6705 strcpy (errbuf, msg);
fa9a63c5
RM
6706 }
6707
6708 return msg_size;
6709}
c0f9ea08 6710WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6711
6712
6713/* Free dynamically allocated space used by PREG. */
6714
6715void
d2762c86 6716regfree (regex_t *preg)
fa9a63c5 6717{
c2cd06e6 6718 free (preg->buffer);
fa9a63c5 6719 preg->buffer = NULL;
5e69f11e 6720
fa9a63c5
RM
6721 preg->allocated = 0;
6722 preg->used = 0;
6723
c2cd06e6 6724 free (preg->fastmap);
fa9a63c5
RM
6725 preg->fastmap = NULL;
6726 preg->fastmap_accurate = 0;
6727
c2cd06e6 6728 free (preg->translate);
fa9a63c5
RM
6729 preg->translate = NULL;
6730}
c0f9ea08 6731WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6732
6733#endif /* not emacs */