Fix previous conflict.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
114f9c96 6 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
e468b87f 7 Free Software Foundation, Inc.
bc78d348 8
fa9a63c5
RM
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
e468b87f 11 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
4fc5845f 21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 22 USA. */
fa9a63c5 23
6df42991 24/* TODO:
505bde11 25 - structure the opcode space into opcode+flag.
dc1e502d 26 - merge with glibc's regex.[ch].
01618498 27 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
28 need to modify the compiled regexp so that re_match can be reentrant.
29 - get rid of on_failure_jump_smart by doing the optimization in re_comp
30 rather than at run-time, so that re_match can be reentrant.
01618498 31*/
505bde11 32
fa9a63c5 33/* AIX requires this to be the first thing in the file. */
0b32bf0e 34#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
35 #pragma alloca
36#endif
37
fa9a63c5 38#ifdef HAVE_CONFIG_H
0b32bf0e 39# include <config.h>
fa9a63c5
RM
40#endif
41
4bb91c68
SM
42#if defined STDC_HEADERS && !defined emacs
43# include <stddef.h>
44#else
45/* We need this for `regex.h', and perhaps for the Emacs include files. */
46# include <sys/types.h>
47#endif
fa9a63c5 48
14473664
SM
49/* Whether to use ISO C Amendment 1 wide char functions.
50 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
51#if defined _LIBC
52#define WIDE_CHAR_SUPPORT 1
53#else
14473664 54#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
55 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
56#endif
14473664
SM
57
/* For platforms which support the ISO C Amendment 1 functionality we
   support user-defined character classes.  */
a0ad02f7 60#if WIDE_CHAR_SUPPORT
14473664
SM
61/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
62# include <wchar.h>
63# include <wctype.h>
64#endif
65
c0f9ea08
SM
66#ifdef _LIBC
67/* We have to keep the namespace clean. */
68# define regfree(preg) __regfree (preg)
69# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
70# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
71# define regerror(err_code, preg, errbuf, errbuf_size) \
72 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
73# define re_set_registers(bu, re, nu, st, en) \
74 __re_set_registers (bu, re, nu, st, en)
75# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
77# define re_match(bufp, string, size, pos, regs) \
78 __re_match (bufp, string, size, pos, regs)
79# define re_search(bufp, string, size, startpos, range, regs) \
80 __re_search (bufp, string, size, startpos, range, regs)
81# define re_compile_pattern(pattern, length, bufp) \
82 __re_compile_pattern (pattern, length, bufp)
83# define re_set_syntax(syntax) __re_set_syntax (syntax)
84# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
86# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
87
14473664
SM
88/* Make sure we call libc's function even if the user overrides them. */
89# define btowc __btowc
90# define iswctype __iswctype
91# define wctype __wctype
92
c0f9ea08
SM
93# define WEAK_ALIAS(a,b) weak_alias (a, b)
94
95/* We are also using some library internals. */
96# include <locale/localeinfo.h>
97# include <locale/elem-hash.h>
98# include <langinfo.h>
99#else
100# define WEAK_ALIAS(a,b)
101#endif
102
4bb91c68 103/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 104#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
105# include <libintl.h>
106#else
107# define gettext(msgid) (msgid)
108#endif
109
5e69f11e
RM
110#ifndef gettext_noop
111/* This define is so xgettext can find the internationalizable
112 strings. */
0b32bf0e 113# define gettext_noop(String) String
5e69f11e
RM
114#endif
115
fa9a63c5
RM
116/* The `emacs' switch turns on certain matching commands
117 that make sense only in Emacs. */
118#ifdef emacs
119
d7306fe6 120# include <setjmp.h>
0b32bf0e
SM
121# include "lisp.h"
122# include "buffer.h"
b18215fc
RS
123
124/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 125# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 126
0b32bf0e 127# include "syntax.h"
9117d724 128# include "character.h"
0b32bf0e 129# include "category.h"
fa9a63c5 130
7689ef0b
EZ
131# ifdef malloc
132# undef malloc
133# endif
0b32bf0e 134# define malloc xmalloc
7689ef0b
EZ
135# ifdef realloc
136# undef realloc
137# endif
0b32bf0e 138# define realloc xrealloc
7689ef0b
EZ
139# ifdef free
140# undef free
141# endif
0b32bf0e 142# define free xfree
9abbd165 143
7814e705 144/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
145# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
146# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
147
148# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 149# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
150# define RE_STRING_CHAR(p, multibyte) \
151 (multibyte ? (STRING_CHAR (p)) : (*(p)))
152# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
153 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 154
4c0354d7 155# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 156
2afc21f5 157# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 158
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) and STR2 (which ends at END2).  */
bf216479
KH
162# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
163 do { \
02cb78b5 164 if (target_multibyte) \
bf216479
KH
165 { \
166 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
167 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
168 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 169 c = STRING_CHAR (dtemp); \
bf216479
KH
170 } \
171 else \
172 { \
173 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 174 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 175 } \
2d1675e4
SM
176 } while (0)
177
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
180# define GET_CHAR_AFTER(c, p, len) \
181 do { \
02cb78b5 182 if (target_multibyte) \
62a6e103 183 (c) = STRING_CHAR_AND_LENGTH (p, len); \
6fdd04b0
KH
184 else \
185 { \
cf9c99bc 186 (c) = *p; \
6fdd04b0 187 len = 1; \
cf9c99bc 188 (c) = RE_CHAR_TO_MULTIBYTE (c); \
6fdd04b0 189 } \
8f924df7 190 } while (0)
4e8a9132 191
fa9a63c5
RM
192#else /* not emacs */
193
194/* If we are not linking with Emacs proper,
195 we can't use the relocating allocator
196 even if config.h says that we can. */
0b32bf0e 197# undef REL_ALLOC
fa9a63c5 198
0b32bf0e
SM
199# if defined STDC_HEADERS || defined _LIBC
200# include <stdlib.h>
201# else
fa9a63c5
RM
202char *malloc ();
203char *realloc ();
0b32bf0e 204# endif
fa9a63c5 205
a77f947b
CY
206/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
207
/* Allocate SIZE bytes with malloc and return the new block.

   If malloc returns a null pointer for a nonzero SIZE, write
   "virtual memory exhausted" to file descriptor 2 and exit with
   status 1.  A null return is therefore only possible when SIZE is 0.  */
void *
xmalloc (size_t size)
{
  void *val = malloc (size);

  if (!val && size)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}
221
/* Resize BLOCK to SIZE bytes and return the (possibly moved) block.

   A null BLOCK is handled by calling malloc directly, since some
   reallocs do not accept a null pointer.  If the allocation returns a
   null pointer for a nonzero SIZE, write "virtual memory exhausted" to
   file descriptor 2 and exit with status 1.  */
void *
xrealloc (void *block, size_t size)
{
  void *val;

  /* We must call malloc explicitly when BLOCK is 0, since some
     reallocs don't do this.  */
  if (! block)
    val = malloc (size);
  else
    val = realloc (block, size);

  if (!val && size)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}
241
a073faa6
CY
242# ifdef malloc
243# undef malloc
244# endif
245# define malloc xmalloc
246# ifdef realloc
247# undef realloc
248# endif
249# define realloc xrealloc
250
72af86bd
AS
251/* This is the normal way of making sure we have memcpy, memcmp and memset. */
252# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
253# include <string.h>
254# else
255# include <strings.h>
256# ifndef memcmp
257# define memcmp(s1, s2, n) bcmp (s1, s2, n)
0b32bf0e 258# endif
72af86bd
AS
259# ifndef memcpy
260# define memcpy(d, s, n) (bcopy (s, d, n), (d))
0b32bf0e
SM
261# endif
262# endif
fa9a63c5
RM
263
264/* Define the syntax stuff for \<, \>, etc. */
265
990b2375 266/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 267enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 268
0b32bf0e 269# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 270
e934739e 271/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
272# define CHAR_CHARSET(c) 0
273# define CHARSET_LEADING_CODE_BASE(c) 0
274# define MAX_MULTIBYTE_LENGTH 1
275# define RE_MULTIBYTE_P(x) 0
bf216479 276# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
277# define WORD_BOUNDARY_P(c1, c2) (0)
278# define CHAR_HEAD_P(p) (1)
279# define SINGLE_BYTE_CHAR_P(c) (1)
280# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 281# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 282# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
283# define STRING_CHAR(p) (*(p))
284# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 285# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
286# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
287# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
288# define RE_CHAR_TO_MULTIBYTE(c) (c)
289# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 290# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 291 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
292# define GET_CHAR_AFTER(c, p, len) \
293 (c = *p, len = 1)
0b32bf0e 294# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
295# define BYTE8_TO_CHAR(c) (c)
296# define CHAR_BYTE8_P(c) (0)
bf216479 297# define CHAR_LEADING_CODE(c) (c)
8f924df7 298
fa9a63c5 299#endif /* not emacs */
4e8a9132
SM
300
301#ifndef RE_TRANSLATE
0b32bf0e
SM
302# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
303# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 304#endif
fa9a63c5
RM
305\f
306/* Get the interface, including the syntax bits. */
307#include "regex.h"
308
f71b19b6
DL
309/* isalpha etc. are used for the character classes. */
310#include <ctype.h>
fa9a63c5 311
f71b19b6 312#ifdef emacs
fa9a63c5 313
f71b19b6 314/* 1 if C is an ASCII character. */
0b32bf0e 315# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 316
f71b19b6 317/* 1 if C is a unibyte character. */
0b32bf0e 318# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 319
f71b19b6 320/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 321
f71b19b6 322/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
323# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
324# define ISCNTRL(c) ((c) < ' ')
325# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
326 || ((c) >= 'a' && (c) <= 'f') \
327 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
328
329/* This is only used for single-byte characters. */
0b32bf0e 330# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
331
332/* The rest must handle multibyte characters. */
333
0b32bf0e 334# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 335 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
336 : 1)
337
14473664 338# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 339 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
340 : 1)
341
0b32bf0e 342# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
343 ? (((c) >= 'a' && (c) <= 'z') \
344 || ((c) >= 'A' && (c) <= 'Z') \
345 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
346 : SYNTAX (c) == Sword)
347
0b32bf0e 348# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
349 ? (((c) >= 'a' && (c) <= 'z') \
350 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
351 : SYNTAX (c) == Sword)
352
0b32bf0e 353# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 354
0b32bf0e 355# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
356 ? ((c) > ' ' && (c) < 0177 \
357 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
358 || ((c) >= 'A' && (c) <= 'Z') \
359 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
360 : SYNTAX (c) != Sword)
361
0b32bf0e 362# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 363
0b32bf0e 364# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 365
0b32bf0e 366# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
367
368#else /* not emacs */
369
f71b19b6
DL
370/* Jim Meyering writes:
371
372 "... Some ctype macros are valid only for character codes that
373 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
374 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 375 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
376 STDC_HEADERS is defined, then autoconf has verified that the ctype
377 macros don't need to be guarded with references to isascii. ...
378 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
379 eliminate the && through constant folding."
380 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 381
4bb91c68 382# undef ISASCII
0b32bf0e
SM
383# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
384# define ISASCII(c) 1
385# else
386# define ISASCII(c) isascii(c)
387# endif
f71b19b6
DL
388
389/* 1 if C is an ASCII character. */
0b32bf0e 390# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
391
392/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
393# define ISUNIBYTE(c) 1
394
395# ifdef isblank
396# define ISBLANK(c) (ISASCII (c) && isblank (c))
397# else
398# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
399# endif
400# ifdef isgraph
401# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
402# else
403# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
404# endif
405
4bb91c68 406# undef ISPRINT
0b32bf0e
SM
407# define ISPRINT(c) (ISASCII (c) && isprint (c))
408# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
409# define ISALNUM(c) (ISASCII (c) && isalnum (c))
410# define ISALPHA(c) (ISASCII (c) && isalpha (c))
411# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
412# define ISLOWER(c) (ISASCII (c) && islower (c))
413# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
414# define ISSPACE(c) (ISASCII (c) && isspace (c))
415# define ISUPPER(c) (ISASCII (c) && isupper (c))
416# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
417
418# define ISWORD(c) ISALPHA(c)
419
4bb91c68
SM
420# ifdef _tolower
421# define TOLOWER(c) _tolower(c)
422# else
423# define TOLOWER(c) tolower(c)
424# endif
425
426/* How many characters in the character set. */
427# define CHAR_SET_SIZE 256
428
0b32bf0e 429# ifdef SYNTAX_TABLE
f71b19b6 430
0b32bf0e 431extern char *re_syntax_table;
f71b19b6 432
0b32bf0e
SM
433# else /* not SYNTAX_TABLE */
434
0b32bf0e
SM
435static char re_syntax_table[CHAR_SET_SIZE];
436
437static void
438init_syntax_once ()
439{
440 register int c;
441 static int done = 0;
442
443 if (done)
444 return;
445
72af86bd 446 memset (re_syntax_table, 0, sizeof re_syntax_table);
0b32bf0e 447
4bb91c68
SM
448 for (c = 0; c < CHAR_SET_SIZE; ++c)
449 if (ISALNUM (c))
450 re_syntax_table[c] = Sword;
fa9a63c5 451
669fa600 452 re_syntax_table['_'] = Ssymbol;
fa9a63c5 453
0b32bf0e
SM
454 done = 1;
455}
456
457# endif /* not SYNTAX_TABLE */
96cc36cc 458
4bb91c68
SM
459# define SYNTAX(c) re_syntax_table[(c)]
460
96cc36cc
RS
461#endif /* not emacs */
462\f
fa9a63c5 463#ifndef NULL
0b32bf0e 464# define NULL (void *)0
fa9a63c5
RM
465#endif
466
467/* We remove any previous definition of `SIGN_EXTEND_CHAR',
468 since ours (we hope) works properly with all combinations of
469 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 470 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
471#undef SIGN_EXTEND_CHAR
472#if __STDC__
0b32bf0e 473# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
474#else /* not __STDC__ */
475/* As in Harbison and Steele. */
0b32bf0e 476# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
477#endif
478\f
479/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
480 use `alloca' instead of `malloc'. This is because using malloc in
481 re_search* or re_match* could cause memory leaks when C-g is used in
482 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
483 the other hand, malloc is more portable, and easier to debug.
484
fa9a63c5
RM
485 Because we sometimes use alloca, some routines have to be macros,
486 not functions -- `alloca'-allocated space disappears at the end of the
487 function it is called in. */
488
489#ifdef REGEX_MALLOC
490
0b32bf0e
SM
491# define REGEX_ALLOCATE malloc
492# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
493# define REGEX_FREE free
fa9a63c5
RM
494
495#else /* not REGEX_MALLOC */
496
497/* Emacs already defines alloca, sometimes. */
0b32bf0e 498# ifndef alloca
fa9a63c5
RM
499
500/* Make alloca work the best possible way. */
0b32bf0e
SM
501# ifdef __GNUC__
502# define alloca __builtin_alloca
503# else /* not __GNUC__ */
7f585e7a 504# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
505# include <alloca.h>
506# endif /* HAVE_ALLOCA_H */
507# endif /* not __GNUC__ */
fa9a63c5 508
0b32bf0e 509# endif /* not alloca */
fa9a63c5 510
0b32bf0e 511# define REGEX_ALLOCATE alloca
fa9a63c5
RM
512
513/* Assumes a `char *destination' variable. */
0b32bf0e 514# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 515 (destination = (char *) alloca (nsize), \
4bb91c68 516 memcpy (destination, source, osize))
fa9a63c5
RM
517
518/* No need to do anything to free, after alloca. */
0b32bf0e 519# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
520
521#endif /* not REGEX_MALLOC */
522
523/* Define how to allocate the failure stack. */
524
0b32bf0e 525#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 526
0b32bf0e 527# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 528 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 529# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 530 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 531# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
532 r_alloc_free (&failure_stack_ptr)
533
4297555e 534#else /* not using relocating allocator */
fa9a63c5 535
0b32bf0e 536# ifdef REGEX_MALLOC
fa9a63c5 537
0b32bf0e
SM
538# define REGEX_ALLOCATE_STACK malloc
539# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
540# define REGEX_FREE_STACK free
fa9a63c5 541
0b32bf0e 542# else /* not REGEX_MALLOC */
fa9a63c5 543
0b32bf0e 544# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 545
0b32bf0e 546# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 547 REGEX_REALLOCATE (source, osize, nsize)
7814e705 548/* No need to explicitly free anything. */
0b32bf0e 549# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 550
0b32bf0e 551# endif /* not REGEX_MALLOC */
4297555e 552#endif /* not using relocating allocator */
fa9a63c5
RM
553
554
555/* True if `size1' is non-NULL and PTR is pointing anywhere inside
556 `string1' or just past its end. This works if PTR is NULL, which is
557 a good thing. */
25fe55af 558#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
559 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
560
561/* (Re)Allocate N items of type T using malloc, or fail. */
562#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
563#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
564#define RETALLOC_IF(addr, n, t) \
565 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
566#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
567
4bb91c68 568#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
569
570#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
571
572#undef MAX
573#undef MIN
574#define MAX(a, b) ((a) > (b) ? (a) : (b))
575#define MIN(a, b) ((a) < (b) ? (a) : (b))
576
66f0296e
SM
577/* Type of source-pattern and string chars. */
578typedef const unsigned char re_char;
579
fa9a63c5
RM
580typedef char boolean;
581#define false 0
582#define true 1
583
4bb91c68
SM
584static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
585 re_char *string1, int size1,
586 re_char *string2, int size2,
587 int pos,
588 struct re_registers *regs,
589 int stop));
fa9a63c5
RM
590\f
591/* These are the command codes that appear in compiled regular
4bb91c68 592 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
593 command code can specify any interpretation whatsoever for its
594 arguments. Zero bytes may appear in the compiled regular expression. */
595
typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

  /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one char belonging to the specified set.  The first
     following byte is the number of bitmap bytes.  Then come the bytes
     of a bitmap saying which chars are in the set.  Bits in each byte
     are ordered low-bit-first.  A character is in the set if its bit
     is 1.  A character too large to have a bit in the map is
     automatically not in the set.

     If the length byte has the 0x80 bit set, then the bitmap is
     followed by a range table:
	 2 bytes of flags for character sets (low 8 bits, high 8 bits)
	   See RANGE_TABLE_WORK_BITS below.
	 2 bytes, the number of pairs that follow (up to 32767)
	 pairs, each 2 multibyte characters,
	   each multibyte character represented as 3 bytes.  */
  charset,

  /* Same parameters as charset, but match any character that is
     not one of those specified.  */
  charset_not,

  /* Start remembering the text that is matched, for storing in a
     register.  Followed by one byte with the register number, in
     the range 0 to one less than the pattern buffer's re_nsub
     field.  */
  start_memory,

  /* Stop remembering the text that is matched and store it in a
     memory register.  Followed by one byte with the register
     number, in the range 0 to one less than `re_nsub' in the
     pattern buffer.  */
  stop_memory,

  /* Match a duplicate of something remembered.  Followed by one
     byte containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning
     of string to be matched (if not).  */
  begbuf,

  /* Analogously, for end of buffer/string.  */
  endbuf,

  /* Followed by a two-byte relative address to which to jump.  */
  jump,

  /* Followed by a two-byte relative address of the place to resume
     at in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but pushes a placeholder instead of the
     current string position when executed.  */
  on_failure_keep_string_jump,

  /* Just like `on_failure_jump', except that it checks that we
     don't get stuck in an infinite loop (matching an empty string
     indefinitely).  */
  on_failure_jump_loop,

  /* Just like `on_failure_jump_loop', except that it checks for
     a different kind of loop (the kind that shows up with non-greedy
     operators).  This operation has to be immediately preceded
     by a `no_op'.  */
  on_failure_jump_nastyloop,

  /* A smart `on_failure_jump' used for greedy * and + operators.
     It analyses the loop before which it is put and if the
     loop does not require backtracking, it changes itself to
     `on_failure_keep_string_jump' and short-circuits the loop,
     else it just defaults to changing itself into `on_failure_jump'.
     It assumes that it is pointing to just past a `jump'.  */
  on_failure_jump_smart,

  /* Followed by a two-byte relative address and a two-byte number n.
     After matching N times, jump to the address upon failure.
     Does not work if N starts at 0: use on_failure_jump_loop
     instead.  */
  succeed_n,

  /* Followed by a two-byte relative address and a two-byte number n.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the
     subsequent two-byte number.  The address *includes* the two
     bytes of number.  */
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound,	/* Succeeds if not at a word boundary.  */

  symbeg,	/* Succeeds if at symbol beginning.  */
  symend,	/* Succeeds if at symbol end.  */

  /* Matches any character whose syntax is specified.  Followed by
     a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Matches any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
735\f
736/* Common operations on the compiled pattern. */
737
738/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
739
740#define STORE_NUMBER(destination, number) \
741 do { \
742 (destination)[0] = (number) & 0377; \
743 (destination)[1] = (number) >> 8; \
744 } while (0)
745
746/* Same as STORE_NUMBER, except increment DESTINATION to
747 the byte after where the number is stored. Therefore, DESTINATION
748 must be an lvalue. */
749
750#define STORE_NUMBER_AND_INCR(destination, number) \
751 do { \
752 STORE_NUMBER (destination, number); \
753 (destination) += 2; \
754 } while (0)
755
/* Put into DESTINATION the number stored in the two contiguous bytes
   starting at SOURCE: the first byte is the low-order byte, the second
   is the sign-extended high-order byte (the layout STORE_NUMBER writes).

   The high byte is scaled by multiplication rather than by `<< 8',
   because it is negative for negative numbers and left-shifting a
   negative value is undefined behavior in ISO C.

   DESTINATION must be an lvalue; SOURCE is evaluated twice.  */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
764
765#ifdef DEBUG
4bb91c68 766static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
767static void
768extract_number (dest, source)
769 int *dest;
01618498 770 re_char *source;
fa9a63c5 771{
5e69f11e 772 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
773 *dest = *source & 0377;
774 *dest += temp << 8;
775}
776
4bb91c68 777# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
778# undef EXTRACT_NUMBER
779# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
780# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
781
782#endif /* DEBUG */
783
784/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
785 SOURCE must be an lvalue. */
786
787#define EXTRACT_NUMBER_AND_INCR(destination, source) \
788 do { \
789 EXTRACT_NUMBER (destination, source); \
25fe55af 790 (source) += 2; \
fa9a63c5
RM
791 } while (0)
792
793#ifdef DEBUG
4bb91c68
SM
794static void extract_number_and_incr _RE_ARGS ((int *destination,
795 re_char **source));
fa9a63c5
RM
796static void
797extract_number_and_incr (destination, source)
798 int *destination;
01618498 799 re_char **source;
5e69f11e 800{
fa9a63c5
RM
801 extract_number (destination, *source);
802 *source += 2;
803}
804
0b32bf0e
SM
805# ifndef EXTRACT_MACROS
806# undef EXTRACT_NUMBER_AND_INCR
807# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 808 extract_number_and_incr (&dest, &src)
0b32bf0e 809# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
810
811#endif /* DEBUG */
812\f
b18215fc
RS
/* Store a multibyte character in three contiguous bytes starting at
   DESTINATION, and increment DESTINATION to the byte after where the
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
b18215fc
RS
816
817#define STORE_CHARACTER_AND_INCR(destination, character) \
818 do { \
819 (destination)[0] = (character) & 0377; \
820 (destination)[1] = ((character) >> 8) & 0377; \
821 (destination)[2] = (character) >> 16; \
822 (destination) += 3; \
823 } while (0)
824
825/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 826 starting at SOURCE. */
b18215fc
RS
827
828#define EXTRACT_CHARACTER(destination, source) \
829 do { \
830 (destination) = ((source)[0] \
831 | ((source)[1] << 8) \
832 | ((source)[2] << 16)); \
833 } while (0)
834
835
836/* Macros for charset. */
837
838/* Size of bitmap of charset P in bytes. P is a start of charset,
839 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
840#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
841
842/* Nonzero if charset P has range table. */
25fe55af 843#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
844
/* Return the address of the range table of charset P -- not the start
   of the table proper, but the two bytes just before it, where the
   number of ranges is stored.  The `4 +' skips the re_opcode_t byte,
   the bitmap-size byte, and the 2 bytes of flags at the start of the
   range table.  */
849#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
850
851/* Extract the bit flags that start a range table. */
852#define CHARSET_RANGE_TABLE_BITS(p) \
853 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
854 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
855
856/* Test if C is listed in the bitmap of charset P. */
857#define CHARSET_LOOKUP_BITMAP(p, c) \
858 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
859 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
860
/* Return the address of the end of RANGE_TABLE.  COUNT is the number of
   ranges in RANGE_TABLE, each range being a (start, end) pair.  The
   `* 2' accounts for the start and the end of each range; the `* 3' is
   the size in bytes of each start and each end.  */
865#define CHARSET_RANGE_TABLE_END(range_table, count) \
866 ((range_table) + (count) * 2 * 3)
867
/* Test if C is in RANGE_TABLE.  The flag NOT is negated if C is in
   one of the ranges; the scan stops at the first matching range.
   COUNT is number of ranges in RANGE_TABLE.

   The locals carry a trailing underscore so that an argument spelled
   `p', `range_start', etc. by the caller is not captured by them.  */
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
  do \
    { \
      re_wchar_t range_start_, range_end_; \
      re_char *rt_cur_; \
      re_char *rt_end_ \
        = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
      \
      /* Each entry is a 3-byte start followed by a 3-byte end.  */ \
      for (rt_cur_ = (range_table); rt_cur_ < rt_end_; rt_cur_ += 2 * 3) \
        { \
          EXTRACT_CHARACTER (range_start_, rt_cur_); \
          EXTRACT_CHARACTER (range_end_, rt_cur_ + 3); \
          \
          if (range_start_ <= (c) && (c) <= range_end_) \
            { \
              (not) = !(not); \
              break; \
            } \
        } \
    } \
  while (0)
891
/* Test if C is in range table of CHARSET.  The flag NOT is negated if
   C is listed in it.  The range count is read from the start of the
   table and the ranges that follow it are then scanned.

   The locals carry a trailing underscore so that they cannot capture
   identifiers appearing in the caller's arguments.  */
#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
  do \
    { \
      /* Number of ranges in range table.  */ \
      int count_; \
      re_char *range_table_ = CHARSET_RANGE_TABLE (charset); \
      \
      /* This also advances range_table_ past the stored count.  */ \
      EXTRACT_NUMBER_AND_INCR (count_, range_table_); \
      CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table_, count_); \
    } \
  while (0)
905\f
fa9a63c5
RM
906/* If DEBUG is defined, Regex prints many voluminous messages about what
907 it is doing (if the variable `debug' is nonzero). If linked with the
908 main program in `iregex.c', you can enter patterns and strings
909 interactively. And if linked with the main program in `main.c' and
4bb91c68 910 the other test files, you can run the already-written tests. */
fa9a63c5
RM
911
912#ifdef DEBUG
913
914/* We use standard I/O for debugging. */
0b32bf0e 915# include <stdio.h>
fa9a63c5
RM
916
917/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 918# include <assert.h>
fa9a63c5 919
/* Debugging output is produced only while this is positive.  */
static int debug = -100000;

/* Each printing macro is wrapped in `do ... while (0)' so that it
   behaves as a single statement; a bare `if' would steal the `else'
   of an enclosing if/else.  */
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) \
  do { if (debug > 0) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2) \
  do { if (debug > 0) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug > 0) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug > 0) printf (x1, x2, x3, x4); } while (0)
/* P (the pattern buffer) is accepted but not used.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug > 0) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug > 0) print_double_string (w, s1, sz1, s2, sz2); } while (0)
fa9a63c5
RM
931
932
933/* Print the fastmap in human-readable form. */
934
935void
936print_fastmap (fastmap)
937 char *fastmap;
938{
939 unsigned was_a_range = 0;
5e69f11e
RM
940 unsigned i = 0;
941
fa9a63c5
RM
942 while (i < (1 << BYTEWIDTH))
943 {
944 if (fastmap[i++])
945 {
946 was_a_range = 0;
25fe55af
RS
947 putchar (i - 1);
948 while (i < (1 << BYTEWIDTH) && fastmap[i])
949 {
950 was_a_range = 1;
951 i++;
952 }
fa9a63c5 953 if (was_a_range)
25fe55af
RS
954 {
955 printf ("-");
956 putchar (i - 1);
957 }
958 }
fa9a63c5 959 }
5e69f11e 960 putchar ('\n');
fa9a63c5
RM
961}
962
963
964/* Print a compiled pattern string in human-readable form, starting at
965 the START pointer into it and ending just before the pointer END. */
966
967void
968print_partial_compiled_pattern (start, end)
01618498
SM
969 re_char *start;
970 re_char *end;
fa9a63c5
RM
971{
972 int mcnt, mcnt2;
01618498
SM
973 re_char *p = start;
974 re_char *pend = end;
fa9a63c5
RM
975
976 if (start == NULL)
977 {
a1a052df 978 fprintf (stderr, "(null)\n");
fa9a63c5
RM
979 return;
980 }
5e69f11e 981
fa9a63c5
RM
982 /* Loop over pattern commands. */
983 while (p < pend)
984 {
a1a052df 985 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
986
987 switch ((re_opcode_t) *p++)
988 {
25fe55af 989 case no_op:
a1a052df 990 fprintf (stderr, "/no_op");
25fe55af 991 break;
fa9a63c5 992
99633e97 993 case succeed:
a1a052df 994 fprintf (stderr, "/succeed");
99633e97
SM
995 break;
996
fa9a63c5
RM
997 case exactn:
998 mcnt = *p++;
a1a052df 999 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 1000 do
fa9a63c5 1001 {
a1a052df 1002 fprintf (stderr, "/%c", *p++);
25fe55af
RS
1003 }
1004 while (--mcnt);
1005 break;
fa9a63c5
RM
1006
1007 case start_memory:
a1a052df 1008 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1009 break;
fa9a63c5
RM
1010
1011 case stop_memory:
a1a052df 1012 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1013 break;
fa9a63c5
RM
1014
1015 case duplicate:
a1a052df 1016 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1017 break;
1018
1019 case anychar:
a1a052df 1020 fprintf (stderr, "/anychar");
fa9a63c5
RM
1021 break;
1022
1023 case charset:
25fe55af
RS
1024 case charset_not:
1025 {
1026 register int c, last = -100;
fa9a63c5 1027 register int in_range = 0;
99633e97
SM
1028 int length = CHARSET_BITMAP_SIZE (p - 1);
1029 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1030
a1a052df 1031 fprintf (stderr, "/charset [%s",
839966f3 1032 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1033
839966f3
KH
1034 if (p + *p >= pend)
1035 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1036
25fe55af 1037 for (c = 0; c < 256; c++)
96cc36cc 1038 if (c / 8 < length
fa9a63c5
RM
1039 && (p[1 + (c/8)] & (1 << (c % 8))))
1040 {
1041 /* Are we starting a range? */
1042 if (last + 1 == c && ! in_range)
1043 {
a1a052df 1044 fprintf (stderr, "-");
fa9a63c5
RM
1045 in_range = 1;
1046 }
1047 /* Have we broken a range? */
1048 else if (last + 1 != c && in_range)
96cc36cc 1049 {
a1a052df 1050 fprintf (stderr, "%c", last);
fa9a63c5
RM
1051 in_range = 0;
1052 }
5e69f11e 1053
fa9a63c5 1054 if (! in_range)
a1a052df 1055 fprintf (stderr, "%c", c);
fa9a63c5
RM
1056
1057 last = c;
25fe55af 1058 }
fa9a63c5
RM
1059
1060 if (in_range)
a1a052df 1061 fprintf (stderr, "%c", last);
fa9a63c5 1062
a1a052df 1063 fprintf (stderr, "]");
fa9a63c5 1064
99633e97 1065 p += 1 + length;
96cc36cc 1066
96cc36cc 1067 if (has_range_table)
99633e97
SM
1068 {
1069 int count;
a1a052df 1070 fprintf (stderr, "has-range-table");
99633e97
SM
1071
1072 /* ??? Should print the range table; for now, just skip it. */
1073 p += 2; /* skip range table bits */
1074 EXTRACT_NUMBER_AND_INCR (count, p);
1075 p = CHARSET_RANGE_TABLE_END (p, count);
1076 }
fa9a63c5
RM
1077 }
1078 break;
1079
1080 case begline:
a1a052df 1081 fprintf (stderr, "/begline");
25fe55af 1082 break;
fa9a63c5
RM
1083
1084 case endline:
a1a052df 1085 fprintf (stderr, "/endline");
25fe55af 1086 break;
fa9a63c5
RM
1087
1088 case on_failure_jump:
25fe55af 1089 extract_number_and_incr (&mcnt, &p);
a1a052df 1090 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1091 break;
fa9a63c5
RM
1092
1093 case on_failure_keep_string_jump:
25fe55af 1094 extract_number_and_incr (&mcnt, &p);
a1a052df 1095 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1096 break;
fa9a63c5 1097
0683b6fa
SM
1098 case on_failure_jump_nastyloop:
1099 extract_number_and_incr (&mcnt, &p);
a1a052df 1100 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1101 break;
1102
505bde11 1103 case on_failure_jump_loop:
fa9a63c5 1104 extract_number_and_incr (&mcnt, &p);
a1a052df 1105 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1106 break;
1107
505bde11 1108 case on_failure_jump_smart:
fa9a63c5 1109 extract_number_and_incr (&mcnt, &p);
a1a052df 1110 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1111 break;
1112
25fe55af 1113 case jump:
fa9a63c5 1114 extract_number_and_incr (&mcnt, &p);
a1a052df 1115 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1116 break;
1117
25fe55af
RS
1118 case succeed_n:
1119 extract_number_and_incr (&mcnt, &p);
1120 extract_number_and_incr (&mcnt2, &p);
a1a052df 1121 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1122 break;
5e69f11e 1123
25fe55af
RS
1124 case jump_n:
1125 extract_number_and_incr (&mcnt, &p);
1126 extract_number_and_incr (&mcnt2, &p);
a1a052df 1127 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1128 break;
5e69f11e 1129
25fe55af
RS
1130 case set_number_at:
1131 extract_number_and_incr (&mcnt, &p);
1132 extract_number_and_incr (&mcnt2, &p);
a1a052df 1133 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1134 break;
5e69f11e 1135
25fe55af 1136 case wordbound:
a1a052df 1137 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1138 break;
1139
1140 case notwordbound:
a1a052df 1141 fprintf (stderr, "/notwordbound");
25fe55af 1142 break;
fa9a63c5
RM
1143
1144 case wordbeg:
a1a052df 1145 fprintf (stderr, "/wordbeg");
fa9a63c5 1146 break;
5e69f11e 1147
fa9a63c5 1148 case wordend:
a1a052df 1149 fprintf (stderr, "/wordend");
e2543b02 1150 break;
5e69f11e 1151
669fa600 1152 case symbeg:
e2543b02 1153 fprintf (stderr, "/symbeg");
669fa600
SM
1154 break;
1155
1156 case symend:
e2543b02 1157 fprintf (stderr, "/symend");
669fa600 1158 break;
5e69f11e 1159
1fb352e0 1160 case syntaxspec:
a1a052df 1161 fprintf (stderr, "/syntaxspec");
1fb352e0 1162 mcnt = *p++;
a1a052df 1163 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1164 break;
1165
1166 case notsyntaxspec:
a1a052df 1167 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1168 mcnt = *p++;
a1a052df 1169 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1170 break;
1171
0b32bf0e 1172# ifdef emacs
fa9a63c5 1173 case before_dot:
a1a052df 1174 fprintf (stderr, "/before_dot");
25fe55af 1175 break;
fa9a63c5
RM
1176
1177 case at_dot:
a1a052df 1178 fprintf (stderr, "/at_dot");
25fe55af 1179 break;
fa9a63c5
RM
1180
1181 case after_dot:
a1a052df 1182 fprintf (stderr, "/after_dot");
25fe55af 1183 break;
fa9a63c5 1184
1fb352e0 1185 case categoryspec:
a1a052df 1186 fprintf (stderr, "/categoryspec");
fa9a63c5 1187 mcnt = *p++;
a1a052df 1188 fprintf (stderr, "/%d", mcnt);
25fe55af 1189 break;
5e69f11e 1190
1fb352e0 1191 case notcategoryspec:
a1a052df 1192 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1193 mcnt = *p++;
a1a052df 1194 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1195 break;
0b32bf0e 1196# endif /* emacs */
fa9a63c5 1197
fa9a63c5 1198 case begbuf:
a1a052df 1199 fprintf (stderr, "/begbuf");
25fe55af 1200 break;
fa9a63c5
RM
1201
1202 case endbuf:
a1a052df 1203 fprintf (stderr, "/endbuf");
25fe55af 1204 break;
fa9a63c5 1205
25fe55af 1206 default:
a1a052df 1207 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1208 }
1209
a1a052df 1210 fprintf (stderr, "\n");
fa9a63c5
RM
1211 }
1212
a1a052df 1213 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1214}
1215
1216
1217void
1218print_compiled_pattern (bufp)
1219 struct re_pattern_buffer *bufp;
1220{
01618498 1221 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1222
1223 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1224 printf ("%ld bytes used/%ld bytes allocated.\n",
1225 bufp->used, bufp->allocated);
fa9a63c5
RM
1226
1227 if (bufp->fastmap_accurate && bufp->fastmap)
1228 {
1229 printf ("fastmap: ");
1230 print_fastmap (bufp->fastmap);
1231 }
1232
1233 printf ("re_nsub: %d\t", bufp->re_nsub);
1234 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1235 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1236 printf ("no_sub: %d\t", bufp->no_sub);
1237 printf ("not_bol: %d\t", bufp->not_bol);
1238 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1239 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1240 fflush (stdout);
fa9a63c5
RM
1241 /* Perhaps we should print the translate table? */
1242}
1243
1244
1245void
1246print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1247 re_char *where;
1248 re_char *string1;
1249 re_char *string2;
fa9a63c5
RM
1250 int size1;
1251 int size2;
1252{
4bb91c68 1253 int this_char;
5e69f11e 1254
fa9a63c5
RM
1255 if (where == NULL)
1256 printf ("(null)");
1257 else
1258 {
1259 if (FIRST_STRING_P (where))
25fe55af
RS
1260 {
1261 for (this_char = where - string1; this_char < size1; this_char++)
1262 putchar (string1[this_char]);
fa9a63c5 1263
25fe55af
RS
1264 where = string2;
1265 }
fa9a63c5
RM
1266
1267 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1268 putchar (string2[this_char]);
fa9a63c5
RM
1269 }
1270}
1271
1272#else /* not DEBUG */
1273
0b32bf0e
SM
/* Without DEBUG, assertions and all debugging hooks expand to
   nothing, so their arguments are never evaluated.  */
# undef assert
# define assert(e)

# define DEBUG_STATEMENT(e)
# define DEBUG_PRINT1(x)
# define DEBUG_PRINT2(x1, x2)
# define DEBUG_PRINT3(x1, x2, x3)
# define DEBUG_PRINT4(x1, x2, x3, x4)
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1284
1285#endif /* not DEBUG */
1286\f
1287/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1288 also be assigned to arbitrarily: each pattern buffer stores its own
1289 syntax, so it can be changed between regex compilations. */
1290/* This has no initializer because initialized variables in Emacs
1291 become read-only after dumping. */
1292reg_syntax_t re_syntax_options;
1293
1294
1295/* Specify the precise syntax of regexps for compilation. This provides
1296 for compatibility for various utilities which historically have
1297 different, incompatible syntaxes.
1298
1299 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1300 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1301
1302reg_syntax_t
971de7fb 1303re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1304{
1305 reg_syntax_t ret = re_syntax_options;
5e69f11e 1306
fa9a63c5
RM
1307 re_syntax_options = syntax;
1308 return ret;
1309}
c0f9ea08 1310WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1311
1312/* Regexp to use to replace spaces, or NULL meaning don't. */
1313static re_char *whitespace_regexp;
1314
1315void
971de7fb 1316re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1317{
6470ea05 1318 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1319}
1320WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1321\f
1322/* This table gives an error message for each of the error codes listed
4bb91c68 1323 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1324 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1325 but why not be nice? */
fa9a63c5
RM
1326
1327static const char *re_error_msgid[] =
5e69f11e
RM
1328 {
1329 gettext_noop ("Success"), /* REG_NOERROR */
1330 gettext_noop ("No match"), /* REG_NOMATCH */
1331 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1332 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1333 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1334 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1335 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1336 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1337 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1338 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1339 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1340 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1341 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1342 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1343 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1344 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1345 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1346 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1347 };
1348\f
/* Avoiding alloca during matching, to placate r_alloc.  */

/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
   searching and matching functions should not call alloca.  On some
   systems, alloca is implemented in terms of malloc, and if we're
   using the relocating allocator routines, then malloc could cause a
   relocation, which might (if the strings being searched are in the
   ralloc heap) shift the data out from underneath the regexp
   routines.

   Here's another reason to avoid allocation: Emacs
   processes input from X in a signal handler; processing X input may
   call malloc; if input arrives while a matching routine is calling
   malloc, then we're scrod.  But Emacs can't just block input while
   calling matching routines; then we don't notice interrupts when
   they come in.  So, Emacs blocks input around all regexp calls
   except the matching calls, which it leaves unprotected, in the
   faith that they will not malloc.  */

/* Normally, this is fine.  */
#define MATCH_MAY_ALLOCATE

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
#if defined REGEX_MALLOC && defined emacs
# undef MATCH_MAY_ALLOCATE
#endif
1379
1380\f
1381/* Failure stack declarations and macros; both re_compile_fastmap and
1382 re_match_2 use a failure stack. These have to be macros because of
1383 REGEX_ALLOCATE_STACK. */
5e69f11e 1384
fa9a63c5 1385
/* Approximate number of failure points for which to initially allocate space
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  May be predefined to override
   the default.  */
#ifndef INIT_FAILURE_ALLOC
# define INIT_FAILURE_ALLOC 20
#endif
1392
/* Roughly the maximum number of failure points on the stack.  Would be
   exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
   This is a variable only so users of regex can assign to it; we never
   change it ourselves.  We always multiply it by TYPICAL_FAILURE_SIZE
   before using it, so it should probably be a byte-count instead.  */
# if defined MATCH_MAY_ALLOCATE
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
size_t re_max_failures = 40000;
# else
size_t re_max_failures = 4000;
# endif
fa9a63c5
RM
1407
1408union fail_stack_elt
1409{
01618498 1410 re_char *pointer;
c0f9ea08
SM
1411 /* This should be the biggest `int' that's no bigger than a pointer. */
1412 long integer;
fa9a63c5
RM
1413};
1414
1415typedef union fail_stack_elt fail_stack_elt_t;
1416
1417typedef struct
1418{
1419 fail_stack_elt_t *stack;
c0f9ea08
SM
1420 size_t size;
1421 size_t avail; /* Offset of next open position. */
1422 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1423} fail_stack_type;
1424
/* True when no failure frame has been closed yet, resp. when every
   allocated slot is in use.  Both assume the variable `fail_stack'.  */
#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1427
1428
/* Define macros to initialize and free the failure stack.
   Do `return -2' if the alloc fails.  Both assume the variable
   `fail_stack'.  */

#ifdef MATCH_MAY_ALLOCATE
/* NOTE(review): room for INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE
   slots is allocated, but only INIT_FAILURE_ALLOC is recorded as the
   size.  This is kept as is; confirm whether it is intended.  */
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.stack = (fail_stack_elt_t *) \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
                            * sizeof (fail_stack_elt_t)); \
    \
    if (fail_stack.stack == NULL) \
      return -2; \
    \
    fail_stack.size = INIT_FAILURE_ALLOC; \
    fail_stack.avail = 0; \
    fail_stack.frame = 0; \
  } while (0)

# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
#else
/* Nothing is allocated here; only the stack offsets are reset.  */
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.avail = 0; \
    fail_stack.frame = 0; \
  } while (0)

# define RESET_FAIL_STACK() ((void)0)
#endif
1457
1458
320a2a73
KH
/* Grow FAIL_STACK by FAIL_STACK_GROWTH_FACTOR, up to a limit
   which allows approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
#define FAIL_STACK_GROWTH_FACTOR 4

/* The limit and the new size are computed in bytes; the result is
   converted back to a slot count when it is stored in `size'.  */
#define GROW_FAIL_STACK(fail_stack) \
  (((fail_stack).size * sizeof (fail_stack_elt_t) \
    >= re_max_failures * TYPICAL_FAILURE_SIZE) \
   ? 0 \
   : ((fail_stack).stack \
      = (fail_stack_elt_t *) \
        REGEX_REALLOCATE_STACK ((fail_stack).stack, \
          (fail_stack).size * sizeof (fail_stack_elt_t), \
          MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
               ((fail_stack).size * sizeof (fail_stack_elt_t) \
                * FAIL_STACK_GROWTH_FACTOR))), \
      \
      (fail_stack).stack == NULL \
      ? 0 \
      : ((fail_stack).size \
         = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
                 ((fail_stack).size * sizeof (fail_stack_elt_t) \
                  * FAIL_STACK_GROWTH_FACTOR)) \
            / sizeof (fail_stack_elt_t)), \
         1)))
fa9a63c5
RM
1494
1495
fa9a63c5
RM
/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_POINTER(item) \
  (fail_stack.stack[fail_stack.avail++].pointer = (item))

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_INT(item) \
  (fail_stack.stack[fail_stack.avail++].integer = (item))

/* Push a fail_stack_elt_t value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_ELT(item) \
  (fail_stack.stack[fail_stack.avail++] = (item))

/* These three POP... operations complement the three PUSH... operations.
   All assume that `fail_stack' is nonempty.  The expansions are
   parenthesized so they combine safely with surrounding operators.  */
#define POP_FAILURE_POINTER() (fail_stack.stack[--fail_stack.avail].pointer)
#define POP_FAILURE_INT() (fail_stack.stack[--fail_stack.avail].integer)
#define POP_FAILURE_ELT() (fail_stack.stack[--fail_stack.avail])
1519
505bde11
SM
/* Individual items aside from the registers.  */
#define NUM_NONREG_ITEMS 3

/* Used to examine the stack (to detect infinite loops).  A handle H
   is the stack offset just past a frame's three fixed items, which
   lie, from H downwards: pattern position, string position, and the
   handle of the previous frame.  */
#define FAILURE_PAT(h) (fail_stack.stack[(h) - 1].pointer)
#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
#define NEXT_FAILURE_HANDLE(h) (fail_stack.stack[(h) - 3].integer)
#define TOP_FAILURE_HANDLE() (fail_stack.frame)
fa9a63c5
RM
1528
1529
505bde11
SM
/* Grow the failure stack until more than SPACE slots are free.
   Does `return -2' if the stack cannot be grown.  Assumes the variable
   `fail_stack' (and whatever GROW_FAIL_STACK requires).  Wrapped in
   `do ... while (0)' so that it is a single statement.  */
#define ENSURE_FAIL_STACK(space) \
do { \
  while (REMAINING_AVAIL_SLOTS <= (space)) \
    { \
      if (!GROW_FAIL_STACK (fail_stack)) \
        return -2; \
      /* The counts are size_t; cast them to match `%d'.  */ \
      DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
                    (int) (fail_stack).size); \
      DEBUG_PRINT2 (" slots available: %d\n", \
                    (int) REMAINING_AVAIL_SLOTS); \
    } \
} while (0)
1537
/* Push register NUM onto the stack: its start, its end, and then NUM
   itself as the tag on top.  Assumes the variables `fail_stack',
   `regstart' and `regend'.  `destination' is declared for the benefit
   of the stack-growing macros.  */
#define PUSH_FAILURE_REG(num) \
do { \
  char *destination; \
  ENSURE_FAIL_STACK(3); \
  DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
                num, regstart[num], regend[num]); \
  PUSH_FAILURE_POINTER (regstart[num]); \
  PUSH_FAILURE_POINTER (regend[num]); \
  PUSH_FAILURE_INT (num); \
} while (0)
1549
01618498
SM
/* Change the counter's value to VAL, but make sure that it will
   be reset when backtracking.  The old value stored at PTR and PTR
   itself are pushed, topped by the tag -1 which marks the entry as a
   counter rather than a register; only then is VAL stored at PTR.  */
#define PUSH_NUMBER(ptr,val) \
do { \
  char *destination; \
  int c; \
  ENSURE_FAIL_STACK(3); \
  EXTRACT_NUMBER (c, ptr); \
  DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
  PUSH_FAILURE_INT (c); \
  PUSH_FAILURE_POINTER (ptr); \
  PUSH_FAILURE_INT (-1); \
  STORE_NUMBER (ptr, val); \
} while (0)
1564
/* Pop a saved register or a saved counter off the stack.  The tag on
   top tells which: -1 means a counter, whose old value is stored back
   into the pattern; any other value is a register number, whose
   `regend' and `regstart' entries are restored.  */
#define POP_FAILURE_REG_OR_COUNT() \
do { \
  int reg = POP_FAILURE_INT (); \
  if (reg == -1) \
    { \
      /* It's a counter.  */ \
      /* Here, we discard `const', making re_match non-reentrant.  */ \
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
      reg = POP_FAILURE_INT (); \
      STORE_NUMBER (ptr, reg); \
      DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
    } \
  else \
    { \
      regend[reg] = POP_FAILURE_POINTER (); \
      regstart[reg] = POP_FAILURE_POINTER (); \
      DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
                    reg, regstart[reg], regend[reg]); \
    } \
} while (0)
1586
/* Check that we are not stuck in an infinite loop.  Walk down the
   failure frames whose saved string position is STRING_PLACE (or
   NULL); if one of them saved the pattern position PAT_CUR, set the
   caller's variable `cycle' to 1.  Assumes the variables `fail_stack',
   `cycle' and (for the assertion) `bufp'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
do { \
  int failure = TOP_FAILURE_HANDLE (); \
  /* Check for infinite matching loops */ \
  while (failure > 0 \
         && (FAILURE_STR (failure) == (string_place) \
             || FAILURE_STR (failure) == NULL)) \
    { \
      assert (FAILURE_PAT (failure) >= bufp->buffer \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
      if (FAILURE_PAT (failure) == (pat_cur)) \
        { \
          cycle = 1; \
          break; \
        } \
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
      failure = NEXT_FAILURE_HANDLE(failure); \
    } \
  /* Handle 0 denotes the bottom of the stack; there is no frame \
     below it whose string could be printed.  */ \
  if (failure > 0) \
    { \
      DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
    } \
} while (0)
6df42991 1608
/* Push the information about the state we will need
   if we ever fail back to it.

   Three items are pushed: the current frame index, STRING_PLACE and
   PATTERN; the frame index is then moved past them.

   Requires variables fail_stack, regstart, regend and
   num_regs be declared.  GROW_FAIL_STACK requires `destination' be
   declared.

   Does `return -2' (via ENSURE_FAIL_STACK) if it runs out of memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place) \
do { \
  char *destination; \
  \
  DEBUG_STATEMENT (nfailure_points_pushed++); \
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
  /* The stack offsets are size_t; cast them to match `%d'.  */ \
  DEBUG_PRINT2 (" Before push, next avail: %d\n", \
                (int) (fail_stack).avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) (fail_stack).size); \
  \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
  \
  DEBUG_PRINT1 ("\n"); \
  \
  DEBUG_PRINT2 (" Push frame index: %d\n", (int) fail_stack.frame); \
  PUSH_FAILURE_INT (fail_stack.frame); \
  \
  DEBUG_PRINT2 (" Push string %p: `", string_place); \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT1 ("'\n"); \
  PUSH_FAILURE_POINTER (string_place); \
  \
  DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
  PUSH_FAILURE_POINTER (pattern); \
  \
  /* Close the frame by moving the frame pointer past it.  */ \
  fail_stack.frame = fail_stack.avail; \
} while (0)
fa9a63c5 1648
320a2a73
KH
/* Estimate the size of data pushed by a typical failure stack entry.
   An estimate is all we need, because all we use this for
   is to choose a limit for how big to make the failure stack.  */
/* BEWARE, the value `20' is hard-coded in emacs.c:main().  */
#define TYPICAL_FAILURE_SIZE 20

/* How many items can still be added to the stack without overflowing it.  */
#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1657
1658
/* Pops what PUSH_FAILURE_POINT pushes, after first popping any
   registers and counters saved above the current frame.

   We restore into the parameters, both of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
   The arrays `regstart' and `regend' are restored as a side effect.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */

#define POP_FAILURE_POINT(str, pat) \
do { \
  assert (!FAIL_STACK_EMPTY ()); \
  \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  /* The stack offsets are size_t; cast them to match `%d'.  */ \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", (int) fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) fail_stack.size); \
  \
  /* Pop the saved registers.  */ \
  while (fail_stack.frame < fail_stack.avail) \
    POP_FAILURE_REG_OR_COUNT (); \
  \
  pat = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
  \
  /* If the saved string location is NULL, it came from an \
     on_failure_keep_string_jump opcode, and we want to throw away the \
     saved NULL, thus retaining our current position in the string.  */ \
  str = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
  \
  fail_stack.frame = POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping frame index: %d\n", (int) fail_stack.frame); \
  \
  /* `avail' is unsigned, so only the relation to `frame' can be \
     checked.  */ \
  assert (fail_stack.frame <= fail_stack.avail); \
  \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1702
1703
1704\f
/* Registers are set to a sentinel (NULL) when they haven't yet matched.  */
#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1707\f
1708/* Subroutine declarations and macros for regex_compile. */
1709
4bb91c68
SM
1710static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1711 reg_syntax_t syntax,
1712 struct re_pattern_buffer *bufp));
1713static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1714static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1715 int arg1, int arg2));
1716static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1717 int arg, unsigned char *end));
1718static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1719 int arg1, int arg2, unsigned char *end));
01618498
SM
1720static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1721 re_char *p,
4bb91c68 1722 reg_syntax_t syntax));
01618498
SM
1723static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1724 re_char *pend,
4bb91c68 1725 reg_syntax_t syntax));
01618498
SM
1726static re_char *skip_one_char _RE_ARGS ((re_char *p));
1727static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1728 char *fastmap, const int multibyte));
fa9a63c5 1729
/* Fetch the next character in the uncompiled pattern, with no
   translation.  Stores the character in C and advances `p' past it;
   does `return REG_EEND' if the pattern is already exhausted.
   Assumes the variables `p', `pend' and `multibyte'.  */
#define PATFETCH(c) \
  do { \
    int len; \
    if (p == pend) return REG_EEND; \
    c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
    p += len; \
  } while (0)
1739
fa9a63c5
RM
1740
/* If `translate' is non-null, return translate[D], else just D.  We
   cast the subscript to translate because some data is declared as
   `char *', to avoid warnings when a string constant is passed.  But
   when we use a character as a subscript we must make it unsigned.
   A definition of TRANSLATE supplied earlier takes precedence.  */
#ifndef TRANSLATE
# define TRANSLATE(d) \
  (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
#endif
fa9a63c5
RM
1749
1750
1751/* Macros for outputting the compiled pattern into `buffer'. */
1752
1753/* If the buffer isn't allocated when it comes in, use this. */
1754#define INIT_BUF_SIZE 32
1755
/* Make sure we have at least N more bytes of space in buffer.
   Expands against the locals `b' (current output position) and `bufp'
   of the enclosing function.  The size test is repeated after every
   EXTEND_BUFFER, since one extension may still be too small.  */
#define GET_BUFFER_SPACE(n)                                             \
  do {                                                                  \
    while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated)        \
      EXTEND_BUFFER ();                                                 \
  } while (0)
1760
1761/* Make sure we have one more byte of buffer space and then add C to it. */
1762#define BUF_PUSH(c) \
1763 do { \
1764 GET_BUFFER_SPACE (1); \
1765 *b++ = (unsigned char) (c); \
1766 } while (0)
1767
1768
1769/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1770#define BUF_PUSH_2(c1, c2) \
1771 do { \
1772 GET_BUFFER_SPACE (2); \
1773 *b++ = (unsigned char) (c1); \
1774 *b++ = (unsigned char) (c2); \
1775 } while (0)
1776
1777
4bb91c68 1778/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1779#define BUF_PUSH_3(c1, c2, c3) \
1780 do { \
1781 GET_BUFFER_SPACE (3); \
1782 *b++ = (unsigned char) (c1); \
1783 *b++ = (unsigned char) (c2); \
1784 *b++ = (unsigned char) (c3); \
1785 } while (0)
1786
1787
1788/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1789 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1790#define STORE_JUMP(op, loc, to) \
1791 store_op1 (op, loc, (to) - (loc) - 3)
1792
1793/* Likewise, for a two-argument jump. */
1794#define STORE_JUMP2(op, loc, to, arg) \
1795 store_op2 (op, loc, (to) - (loc) - 3, arg)
1796
4bb91c68 1797/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1798#define INSERT_JUMP(op, loc, to) \
1799 insert_op1 (op, loc, (to) - (loc) - 3, b)
1800
1801/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1802#define INSERT_JUMP2(op, loc, to, arg) \
1803 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1804
1805
1806/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1807 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1808 be too small, many things would have to change. */
839966f3
KH
1809# define MAX_BUF_SIZE (1L << 15)
1810
1811#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1812/* Any other compiler which, like MSC, has allocation limit below 2^16
1813 bytes will have to use approach similar to what was done below for
1814 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1815 reallocating to 0 bytes. Such thing is not going to work too well.
1816 You have been warned!! */
1817#if defined _MSC_VER && !defined WIN32
1818/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1819# define MAX_BUF_SIZE 65500L
1820#else
1821# define MAX_BUF_SIZE (1L << 16)
1822#endif
839966f3 1823#endif /* 0 */
fa9a63c5
RM
1824
/* Double the size of the buffer via realloc (never going beyond
   MAX_BUF_SIZE) and reset the pointers that pointed into the old block
   to point to the correct places in the new one.  If the buffer is
   already MAX_BUF_SIZE bytes long, return REG_ESIZE from the enclosing
   function; if the reallocation fails, return REG_ESPACE.  */
1829#if __BOUNDED_POINTERS__
1830# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
381880b0
CY
1831# define MOVE_BUFFER_POINTER(P) \
1832 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1833 SET_HIGH_BOUND (P), \
1834 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
4bb91c68
SM
1835# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1836 else \
1837 { \
1838 SET_HIGH_BOUND (b); \
1839 SET_HIGH_BOUND (begalt); \
1840 if (fixup_alt_jump) \
1841 SET_HIGH_BOUND (fixup_alt_jump); \
1842 if (laststart) \
1843 SET_HIGH_BOUND (laststart); \
1844 if (pending_exact) \
1845 SET_HIGH_BOUND (pending_exact); \
1846 }
1847#else
381880b0 1848# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
4bb91c68
SM
1849# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1850#endif
fa9a63c5 1851#define EXTEND_BUFFER() \
25fe55af 1852 do { \
381880b0 1853 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1854 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1855 return REG_ESIZE; \
1856 bufp->allocated <<= 1; \
1857 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1858 bufp->allocated = MAX_BUF_SIZE; \
01618498 1859 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1860 if (bufp->buffer == NULL) \
1861 return REG_ESPACE; \
1862 /* If the buffer moved, move all the pointers into it. */ \
1863 if (old_buffer != bufp->buffer) \
1864 { \
381880b0 1865 unsigned char *new_buffer = bufp->buffer; \
4bb91c68
SM
1866 MOVE_BUFFER_POINTER (b); \
1867 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1868 if (fixup_alt_jump) \
4bb91c68 1869 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1870 if (laststart) \
4bb91c68 1871 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1872 if (pending_exact) \
4bb91c68 1873 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1874 } \
4bb91c68 1875 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1876 } while (0)
1877
1878
1879/* Since we have one byte reserved for the register number argument to
1880 {start,stop}_memory, the maximum number of groups we can report
1881 things about is what fits in that byte. */
1882#define MAX_REGNUM 255
1883
1884/* But patterns can have more than `MAX_REGNUM' registers. We just
1885 ignore the excess. */
098d42af 1886typedef int regnum_t;
fa9a63c5
RM
1887
1888
1889/* Macros for the compile stack. */
1890
1891/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1892 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1893/* int may be not enough when sizeof(int) == 2. */
1894typedef long pattern_offset_t;
fa9a63c5
RM
1895
1896typedef struct
1897{
1898 pattern_offset_t begalt_offset;
1899 pattern_offset_t fixup_alt_jump;
5e69f11e 1900 pattern_offset_t laststart_offset;
fa9a63c5
RM
1901 regnum_t regnum;
1902} compile_stack_elt_t;
1903
1904
1905typedef struct
1906{
1907 compile_stack_elt_t *stack;
1908 unsigned size;
1909 unsigned avail; /* Offset of next open position. */
1910} compile_stack_type;
1911
1912
1913#define INIT_COMPILE_STACK_SIZE 32
1914
1915#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1916#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1917
4bb91c68 1918/* The next available element. */
fa9a63c5
RM
1919#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1920
1cee1e27
SM
1921/* Explicit quit checking is only used on NTemacs and whenever we
1922 use polling to process input events. */
1923#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1924extern int immediate_quit;
1925# define IMMEDIATE_QUIT_CHECK \
1926 do { \
1927 if (immediate_quit) QUIT; \
1928 } while (0)
1929#else
1930# define IMMEDIATE_QUIT_CHECK ((void)0)
1931#endif
1932\f
b18215fc
RS
1933/* Structure to manage work area for range table. */
1934struct range_table_work_area
1935{
1936 int *table; /* actual work area. */
1937 int allocated; /* allocated size for work area in bytes. */
7814e705 1938 int used; /* actually used size in words. */
96cc36cc 1939 int bits; /* flag to record character classes */
b18215fc
RS
1940};
1941
77d11aec
RS
/* Make sure that WORK_AREA has room for N more table entries (ints).
   WORK_AREA must be a `struct range_table_work_area' lvalue, not a
   pointer: its fields are accessed with `.' and its address is passed
   to extend_range_table_work_area.
   The table is extended at most once per use, so N must not exceed the
   16 entries that extend_range_table_work_area adds.
   If the space cannot be obtained, this returns REG_ESPACE from the
   surrounding function.  */

#define EXTEND_RANGE_TABLE(work_area, n)                                \
  do {                                                                  \
    if (((work_area).used + (n)) * sizeof (int)                         \
        > (size_t) (work_area).allocated)                               \
      {                                                                 \
        extend_range_table_work_area (&(work_area));                    \
        /* A null table is how the helper reports failure.  */          \
        if ((work_area).table == 0)                                     \
          return (REG_ESPACE);                                          \
      }                                                                 \
  } while (0)
1956
96cc36cc
RS
1957#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1958 (work_area).bits |= (bit)
1959
14473664
SM
1960/* Bits used to implement the multibyte-part of the various character classes
1961 such as [:alnum:] in a charset's range table. */
1962#define BIT_WORD 0x1
1963#define BIT_LOWER 0x2
1964#define BIT_PUNCT 0x4
1965#define BIT_SPACE 0x8
1966#define BIT_UPPER 0x10
1967#define BIT_MULTIBYTE 0x20
96cc36cc 1968
b18215fc
RS
1969/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1970#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
77d11aec 1971 do { \
8f924df7 1972 EXTEND_RANGE_TABLE ((work_area), 2); \
b18215fc
RS
1973 (work_area).table[(work_area).used++] = (range_start); \
1974 (work_area).table[(work_area).used++] = (range_end); \
1975 } while (0)
1976
7814e705 1977/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1978#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1979 do { \
1980 if ((work_area).table) \
1981 free ((work_area).table); \
1982 } while (0)
1983
96cc36cc 1984#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1985#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1986#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1987#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1988\f
b18215fc 1989
fa9a63c5 1990/* Set the bit for character C in a list. */
01618498 1991#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1992
1993
bf216479
KH
1994#ifdef emacs
1995
cf9c99bc
KH
1996/* Store characters in the range FROM to TO in the bitmap at B (for
1997 ASCII and unibyte characters) and WORK_AREA (for multibyte
1998 characters) while translating them and paying attention to the
1999 continuity of translated characters.
8f924df7 2000
cf9c99bc
KH
2001 Implementation note: It is better to implement these fairly big
2002 macros by a function, but it's not that easy because macros called
8f924df7 2003 in this macro assume various local variables already declared. */
bf216479 2004
cf9c99bc
KH
2005/* Both FROM and TO are ASCII characters. */
2006
2007#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
2008 do { \
2009 int C0, C1; \
2010 \
2011 for (C0 = (FROM); C0 <= (TO); C0++) \
2012 { \
2013 C1 = TRANSLATE (C0); \
2014 if (! ASCII_CHAR_P (C1)) \
2015 { \
2016 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2017 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
2018 C1 = C0; \
2019 } \
2020 SET_LIST_BIT (C1); \
2021 } \
2022 } while (0)
2023
2024
2025/* Both FROM and TO are unibyte characters (0x80..0xFF). */
2026
2027#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
2028 do { \
2029 int C0, C1, C2, I; \
2030 int USED = RANGE_TABLE_WORK_USED (work_area); \
2031 \
2032 for (C0 = (FROM); C0 <= (TO); C0++) \
2033 { \
2034 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
2035 if (CHAR_BYTE8_P (C1)) \
2036 SET_LIST_BIT (C0); \
2037 else \
2038 { \
2039 C2 = TRANSLATE (C1); \
2040 if (C2 == C1 \
2041 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
2042 C1 = C0; \
2043 SET_LIST_BIT (C1); \
2044 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2045 { \
2046 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2047 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2048 \
2049 if (C2 >= from - 1 && C2 <= to + 1) \
2050 { \
2051 if (C2 == from - 1) \
2052 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2053 else if (C2 == to + 1) \
2054 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2055 break; \
2056 } \
2057 } \
2058 if (I < USED) \
2059 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
2060 } \
2061 } \
2062 } while (0)
2063
2064
78edd3b7 2065/* Both FROM and TO are multibyte characters. */
cf9c99bc
KH
2066
2067#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
2068 do { \
2069 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
2070 \
2071 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
2072 for (C0 = (FROM); C0 <= (TO); C0++) \
2073 { \
2074 C1 = TRANSLATE (C0); \
2075 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
2076 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
2077 SET_LIST_BIT (C2); \
2078 if (C1 >= (FROM) && C1 <= (TO)) \
2079 continue; \
2080 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2081 { \
2082 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2083 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2084 \
2085 if (C1 >= from - 1 && C1 <= to + 1) \
2086 { \
2087 if (C1 == from - 1) \
2088 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2089 else if (C1 == to + 1) \
2090 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2091 break; \
2092 } \
2093 } \
2094 if (I < USED) \
2095 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2096 } \
bf216479
KH
2097 } while (0)
2098
2099#endif /* emacs */
2100
fa9a63c5 2101/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 2102#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2103 do { \
2104 if (p == pend) \
2105 FREE_STACK_RETURN (REG_EBRACE); \
2106 else \
2107 { \
2108 PATFETCH (c); \
2109 while ('0' <= c && c <= '9') \
2110 { \
2111 int prev; \
2112 if (num < 0) \
2113 num = 0; \
2114 prev = num; \
2115 num = num * 10 + c - '0'; \
2116 if (num / 10 != prev) \
2117 FREE_STACK_RETURN (REG_BADBR); \
2118 if (p == pend) \
2119 FREE_STACK_RETURN (REG_EBRACE); \
2120 PATFETCH (c); \
2121 } \
2122 } \
2123 } while (0)
77d11aec 2124\f
1fdab503 2125#if ! WIDE_CHAR_SUPPORT
01618498 2126
14473664 2127/* Map a string to the char class it names (if any). */
1fdab503 2128re_wctype_t
971de7fb 2129re_wctype (const re_char *str)
14473664 2130{
ada30c0e 2131 const char *string = str;
14473664
SM
2132 if (STREQ (string, "alnum")) return RECC_ALNUM;
2133 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2134 else if (STREQ (string, "word")) return RECC_WORD;
2135 else if (STREQ (string, "ascii")) return RECC_ASCII;
2136 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2137 else if (STREQ (string, "graph")) return RECC_GRAPH;
2138 else if (STREQ (string, "lower")) return RECC_LOWER;
2139 else if (STREQ (string, "print")) return RECC_PRINT;
2140 else if (STREQ (string, "punct")) return RECC_PUNCT;
2141 else if (STREQ (string, "space")) return RECC_SPACE;
2142 else if (STREQ (string, "upper")) return RECC_UPPER;
2143 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2144 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2145 else if (STREQ (string, "digit")) return RECC_DIGIT;
2146 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2147 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2148 else if (STREQ (string, "blank")) return RECC_BLANK;
2149 else return 0;
2150}
2151
e0f24100 2152/* True if CH is in the char class CC. */
1fdab503 2153boolean
971de7fb 2154re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2155{
2156 switch (cc)
2157 {
0cdd06f8
SM
2158 case RECC_ALNUM: return ISALNUM (ch);
2159 case RECC_ALPHA: return ISALPHA (ch);
2160 case RECC_BLANK: return ISBLANK (ch);
2161 case RECC_CNTRL: return ISCNTRL (ch);
2162 case RECC_DIGIT: return ISDIGIT (ch);
2163 case RECC_GRAPH: return ISGRAPH (ch);
2164 case RECC_LOWER: return ISLOWER (ch);
2165 case RECC_PRINT: return ISPRINT (ch);
2166 case RECC_PUNCT: return ISPUNCT (ch);
2167 case RECC_SPACE: return ISSPACE (ch);
2168 case RECC_UPPER: return ISUPPER (ch);
2169 case RECC_XDIGIT: return ISXDIGIT (ch);
2170 case RECC_ASCII: return IS_REAL_ASCII (ch);
2171 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2172 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2173 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2174 case RECC_WORD: return ISWORD (ch);
2175 case RECC_ERROR: return false;
2176 default:
2177 abort();
14473664
SM
2178 }
2179}
fa9a63c5 2180
14473664
SM
2181/* Return a bit-pattern to use in the range-table bits to match multibyte
2182 chars of class CC. */
2183static int
971de7fb 2184re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2185{
2186 switch (cc)
2187 {
2188 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2189 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2190 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2191 case RECC_LOWER: return BIT_LOWER;
2192 case RECC_UPPER: return BIT_UPPER;
2193 case RECC_PUNCT: return BIT_PUNCT;
2194 case RECC_SPACE: return BIT_SPACE;
14473664 2195 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2196 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2197 default:
2198 abort();
14473664
SM
2199 }
2200}
2201#endif
77d11aec
RS
2202\f
2203/* Filling in the work area of a range. */
2204
2205/* Actually extend the space in WORK_AREA. */
2206
2207static void
971de7fb 2208extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2209{
77d11aec
RS
2210 work_area->allocated += 16 * sizeof (int);
2211 if (work_area->table)
2212 work_area->table
2213 = (int *) realloc (work_area->table, work_area->allocated);
2214 else
2215 work_area->table
2216 = (int *) malloc (work_area->allocated);
2217}
2218
8f924df7 2219#if 0
77d11aec
RS
2220#ifdef emacs
2221
2222/* Carefully find the ranges of codes that are equivalent
2223 under case conversion to the range start..end when passed through
2224 TRANSLATE. Handle the case where non-letters can come in between
2225 two upper-case letters (which happens in Latin-1).
2226 Also handle the case of groups of more than 2 case-equivalent chars.
2227
2228 The basic method is to look at consecutive characters and see
2229 if they can form a run that can be handled as one.
2230
2231 Returns -1 if successful, REG_ESPACE if ran out of space. */
2232
2233static int
2234set_image_of_range_1 (work_area, start, end, translate)
2235 RE_TRANSLATE_TYPE translate;
2236 struct range_table_work_area *work_area;
2237 re_wchar_t start, end;
2238{
2239 /* `one_case' indicates a character, or a run of characters,
2240 each of which is an isolate (no case-equivalents).
2241 This includes all ASCII non-letters.
2242
2243 `two_case' indicates a character, or a run of characters,
2244 each of which has two case-equivalent forms.
2245 This includes all ASCII letters.
2246
2247 `strange' indicates a character that has more than one
2248 case-equivalent. */
177c0ea7 2249
77d11aec
RS
2250 enum case_type {one_case, two_case, strange};
2251
2252 /* Describe the run that is in progress,
2253 which the next character can try to extend.
2254 If run_type is strange, that means there really is no run.
2255 If run_type is one_case, then run_start...run_end is the run.
2256 If run_type is two_case, then the run is run_start...run_end,
2257 and the case-equivalents end at run_eqv_end. */
2258
2259 enum case_type run_type = strange;
2260 int run_start, run_end, run_eqv_end;
2261
2262 Lisp_Object eqv_table;
2263
2264 if (!RE_TRANSLATE_P (translate))
2265 {
b7c12565 2266 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2267 work_area->table[work_area->used++] = (start);
2268 work_area->table[work_area->used++] = (end);
b7c12565 2269 return -1;
77d11aec
RS
2270 }
2271
2272 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2273
77d11aec
RS
2274 for (; start <= end; start++)
2275 {
2276 enum case_type this_type;
2277 int eqv = RE_TRANSLATE (eqv_table, start);
2278 int minchar, maxchar;
2279
2280 /* Classify this character */
2281 if (eqv == start)
2282 this_type = one_case;
2283 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2284 this_type = two_case;
2285 else
2286 this_type = strange;
2287
2288 if (start < eqv)
2289 minchar = start, maxchar = eqv;
2290 else
2291 minchar = eqv, maxchar = start;
2292
2293 /* Can this character extend the run in progress? */
2294 if (this_type == strange || this_type != run_type
2295 || !(minchar == run_end + 1
2296 && (run_type == two_case
2297 ? maxchar == run_eqv_end + 1 : 1)))
2298 {
2299 /* No, end the run.
2300 Record each of its equivalent ranges. */
2301 if (run_type == one_case)
2302 {
2303 EXTEND_RANGE_TABLE (work_area, 2);
2304 work_area->table[work_area->used++] = run_start;
2305 work_area->table[work_area->used++] = run_end;
2306 }
2307 else if (run_type == two_case)
2308 {
2309 EXTEND_RANGE_TABLE (work_area, 4);
2310 work_area->table[work_area->used++] = run_start;
2311 work_area->table[work_area->used++] = run_end;
2312 work_area->table[work_area->used++]
2313 = RE_TRANSLATE (eqv_table, run_start);
2314 work_area->table[work_area->used++]
2315 = RE_TRANSLATE (eqv_table, run_end);
2316 }
2317 run_type = strange;
2318 }
177c0ea7 2319
77d11aec
RS
2320 if (this_type == strange)
2321 {
2322 /* For a strange character, add each of its equivalents, one
2323 by one. Don't start a range. */
2324 do
2325 {
2326 EXTEND_RANGE_TABLE (work_area, 2);
2327 work_area->table[work_area->used++] = eqv;
2328 work_area->table[work_area->used++] = eqv;
2329 eqv = RE_TRANSLATE (eqv_table, eqv);
2330 }
2331 while (eqv != start);
2332 }
2333
2334 /* Add this char to the run, or start a new run. */
2335 else if (run_type == strange)
2336 {
2337 /* Initialize a new range. */
2338 run_type = this_type;
2339 run_start = start;
2340 run_end = start;
2341 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2342 }
2343 else
2344 {
2345 /* Extend a running range. */
2346 run_end = minchar;
2347 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2348 }
2349 }
2350
2351 /* If a run is still in progress at the end, finish it now
2352 by recording its equivalent ranges. */
2353 if (run_type == one_case)
2354 {
2355 EXTEND_RANGE_TABLE (work_area, 2);
2356 work_area->table[work_area->used++] = run_start;
2357 work_area->table[work_area->used++] = run_end;
2358 }
2359 else if (run_type == two_case)
2360 {
2361 EXTEND_RANGE_TABLE (work_area, 4);
2362 work_area->table[work_area->used++] = run_start;
2363 work_area->table[work_area->used++] = run_end;
2364 work_area->table[work_area->used++]
2365 = RE_TRANSLATE (eqv_table, run_start);
2366 work_area->table[work_area->used++]
2367 = RE_TRANSLATE (eqv_table, run_end);
2368 }
2369
2370 return -1;
2371}
36595814 2372
77d11aec 2373#endif /* emacs */
36595814 2374
2b34df4e 2375/* Record the image of the range start..end when passed through
36595814
SM
2376 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2377 and is not even necessarily contiguous.
b7c12565
RS
2378 Normally we approximate it with the smallest contiguous range that contains
2379 all the chars we need. However, for Latin-1 we go to extra effort
2380 to do a better job.
2381
2382 This function is not called for ASCII ranges.
77d11aec
RS
2383
2384 Returns -1 if successful, REG_ESPACE if ran out of space. */
2385
2386static int
36595814
SM
2387set_image_of_range (work_area, start, end, translate)
2388 RE_TRANSLATE_TYPE translate;
2389 struct range_table_work_area *work_area;
2390 re_wchar_t start, end;
2391{
77d11aec
RS
2392 re_wchar_t cmin, cmax;
2393
2394#ifdef emacs
2395 /* For Latin-1 ranges, use set_image_of_range_1
2396 to get proper handling of ranges that include letters and nonletters.
b7c12565 2397 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2398 For other character sets, we don't bother to get this right. */
b7c12565
RS
2399 if (RE_TRANSLATE_P (translate) && start < 04400
2400 && !(start < 04200 && end >= 04377))
77d11aec 2401 {
b7c12565 2402 int newend;
77d11aec 2403 int tem;
b7c12565
RS
2404 newend = end;
2405 if (newend > 04377)
2406 newend = 04377;
2407 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2408 if (tem > 0)
2409 return tem;
2410
2411 start = 04400;
2412 if (end < 04400)
2413 return -1;
2414 }
2415#endif
2416
b7c12565
RS
2417 EXTEND_RANGE_TABLE (work_area, 2);
2418 work_area->table[work_area->used++] = (start);
2419 work_area->table[work_area->used++] = (end);
2420
2421 cmin = -1, cmax = -1;
77d11aec 2422
36595814 2423 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2424 {
2425 int ch;
77d11aec 2426
b7c12565
RS
2427 for (ch = start; ch <= end; ch++)
2428 {
2429 re_wchar_t c = TRANSLATE (ch);
2430 if (! (start <= c && c <= end))
2431 {
2432 if (cmin == -1)
2433 cmin = c, cmax = c;
2434 else
2435 {
2436 cmin = MIN (cmin, c);
2437 cmax = MAX (cmax, c);
2438 }
2439 }
2440 }
2441
2442 if (cmin != -1)
2443 {
2444 EXTEND_RANGE_TABLE (work_area, 2);
2445 work_area->table[work_area->used++] = (cmin);
2446 work_area->table[work_area->used++] = (cmax);
2447 }
2448 }
36595814 2449
77d11aec
RS
2450 return -1;
2451}
8f924df7 2452#endif /* 0 */
fa9a63c5
RM
2453\f
2454#ifndef MATCH_MAY_ALLOCATE
2455
2456/* If we cannot allocate large objects within re_match_2_internal,
2457 we make the fail stack and register vectors global.
2458 The fail stack, we grow to the maximum size when a regexp
2459 is compiled.
2460 The register vectors, we adjust in size each time we
2461 compile a regexp, according to the number of registers it needs. */
2462
2463static fail_stack_type fail_stack;
2464
2465/* Size with which the following vectors are currently allocated.
2466 That is so we can make them bigger as needed,
4bb91c68 2467 but never make them smaller. */
fa9a63c5
RM
2468static int regs_allocated_size;
2469
66f0296e
SM
2470static re_char ** regstart, ** regend;
2471static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2472
2473/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2474 but don't make them smaller. */
fa9a63c5
RM
2475
2476static
2477regex_grow_registers (num_regs)
2478 int num_regs;
2479{
2480 if (num_regs > regs_allocated_size)
2481 {
66f0296e
SM
2482 RETALLOC_IF (regstart, num_regs, re_char *);
2483 RETALLOC_IF (regend, num_regs, re_char *);
2484 RETALLOC_IF (best_regstart, num_regs, re_char *);
2485 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2486
2487 regs_allocated_size = num_regs;
2488 }
2489}
2490
2491#endif /* not MATCH_MAY_ALLOCATE */
2492\f
99633e97
SM
2493static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2494 compile_stack,
2495 regnum_t regnum));
2496
fa9a63c5
RM
2497/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2498 Returns one of error codes defined in `regex.h', or zero for success.
2499
2500 Assumes the `allocated' (and perhaps `buffer') and `translate'
2501 fields are set in BUFP on entry.
2502
2503 If it succeeds, results are put in BUFP (if it returns an error, the
2504 contents of BUFP are undefined):
2505 `buffer' is the compiled pattern;
2506 `syntax' is set to SYNTAX;
2507 `used' is set to the length of the compiled pattern;
2508 `fastmap_accurate' is zero;
2509 `re_nsub' is the number of subexpressions in PATTERN;
2510 `not_bol' and `not_eol' are zero;
5e69f11e 2511
c0f9ea08 2512 The `fastmap' field is neither examined nor set. */
fa9a63c5 2513
505bde11
SM
2514/* Insert the `jump' from the end of last alternative to "here".
2515 The space for the jump has already been allocated. */
2516#define FIXUP_ALT_JUMP() \
2517do { \
2518 if (fixup_alt_jump) \
2519 STORE_JUMP (jump, fixup_alt_jump, b); \
2520} while (0)
2521
2522
fa9a63c5
RM
2523/* Return, freeing storage we allocated. */
2524#define FREE_STACK_RETURN(value) \
b18215fc
RS
2525 do { \
2526 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2527 free (compile_stack.stack); \
2528 return value; \
2529 } while (0)
fa9a63c5
RM
2530
2531static reg_errcode_t
971de7fb 2532regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2533{
01618498
SM
2534 /* We fetch characters from PATTERN here. */
2535 register re_wchar_t c, c1;
5e69f11e 2536
fa9a63c5 2537 /* A random temporary spot in PATTERN. */
66f0296e 2538 re_char *p1;
fa9a63c5
RM
2539
2540 /* Points to the end of the buffer, where we should append. */
2541 register unsigned char *b;
5e69f11e 2542
fa9a63c5
RM
2543 /* Keeps track of unclosed groups. */
2544 compile_stack_type compile_stack;
2545
2546 /* Points to the current (ending) position in the pattern. */
22336245
RS
2547#ifdef AIX
2548 /* `const' makes AIX compiler fail. */
66f0296e 2549 unsigned char *p = pattern;
22336245 2550#else
66f0296e 2551 re_char *p = pattern;
22336245 2552#endif
66f0296e 2553 re_char *pend = pattern + size;
5e69f11e 2554
fa9a63c5 2555 /* How to translate the characters in the pattern. */
6676cb1c 2556 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2557
2558 /* Address of the count-byte of the most recently inserted `exactn'
2559 command. This makes it possible to tell if a new exact-match
2560 character can be added to that command or if the character requires
2561 a new `exactn' command. */
2562 unsigned char *pending_exact = 0;
2563
2564 /* Address of start of the most recently finished expression.
2565 This tells, e.g., postfix * where to find the start of its
2566 operand. Reset at the beginning of groups and alternatives. */
2567 unsigned char *laststart = 0;
2568
2569 /* Address of beginning of regexp, or inside of last group. */
2570 unsigned char *begalt;
2571
2572 /* Place in the uncompiled pattern (i.e., the {) to
2573 which to go back if the interval is invalid. */
66f0296e 2574 re_char *beg_interval;
5e69f11e 2575
fa9a63c5 2576 /* Address of the place where a forward jump should go to the end of
7814e705 2577 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2578 last -- ends with a forward jump of this sort. */
2579 unsigned char *fixup_alt_jump = 0;
2580
b18215fc
RS
2581 /* Work area for range table of charset. */
2582 struct range_table_work_area range_table_work;
2583
2d1675e4
SM
2584 /* If the object matched can contain multibyte characters. */
2585 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2586
8f924df7 2587 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2588 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2589
f9b0fd99
RS
2590 /* Nonzero if we have pushed down into a subpattern. */
2591 int in_subpattern = 0;
2592
2593 /* These hold the values of p, pattern, and pend from the main
2594 pattern when we have pushed into a subpattern. */
2595 re_char *main_p;
2596 re_char *main_pattern;
2597 re_char *main_pend;
2598
fa9a63c5 2599#ifdef DEBUG
99633e97 2600 debug++;
fa9a63c5 2601 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2602 if (debug > 0)
fa9a63c5
RM
2603 {
2604 unsigned debug_count;
5e69f11e 2605
fa9a63c5 2606 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2607 putchar (pattern[debug_count]);
fa9a63c5
RM
2608 putchar ('\n');
2609 }
2610#endif /* DEBUG */
2611
2612 /* Initialize the compile stack. */
2613 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2614 if (compile_stack.stack == NULL)
2615 return REG_ESPACE;
2616
2617 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2618 compile_stack.avail = 0;
2619
b18215fc
RS
2620 range_table_work.table = 0;
2621 range_table_work.allocated = 0;
2622
fa9a63c5
RM
2623 /* Initialize the pattern buffer. */
2624 bufp->syntax = syntax;
2625 bufp->fastmap_accurate = 0;
2626 bufp->not_bol = bufp->not_eol = 0;
6224b623 2627 bufp->used_syntax = 0;
fa9a63c5
RM
2628
2629 /* Set `used' to zero, so that if we return an error, the pattern
2630 printer (for debugging) will think there's no pattern. We reset it
2631 at the end. */
2632 bufp->used = 0;
5e69f11e 2633
fa9a63c5 2634 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2635 bufp->re_nsub = 0;
fa9a63c5 2636
0b32bf0e 2637#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2638 /* Initialize the syntax table. */
2639 init_syntax_once ();
2640#endif
2641
2642 if (bufp->allocated == 0)
2643 {
2644 if (bufp->buffer)
2645 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2646 enough space. This loses if buffer's address is bogus, but
7814e705 2647 that is the user's responsibility. */
25fe55af
RS
2648 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2649 }
fa9a63c5 2650 else
7814e705 2651 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2652 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2653 }
fa9a63c5
RM
2654 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2655
2656 bufp->allocated = INIT_BUF_SIZE;
2657 }
2658
2659 begalt = b = bufp->buffer;
2660
2661 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2662 while (1)
fa9a63c5 2663 {
f9b0fd99
RS
2664 if (p == pend)
2665 {
2666 /* If this is the end of an included regexp,
2667 pop back to the main regexp and try again. */
2668 if (in_subpattern)
2669 {
2670 in_subpattern = 0;
2671 pattern = main_pattern;
2672 p = main_p;
2673 pend = main_pend;
2674 continue;
2675 }
2676 /* If this is the end of the main regexp, we are done. */
2677 break;
2678 }
2679
fa9a63c5
RM
2680 PATFETCH (c);
2681
2682 switch (c)
25fe55af 2683 {
f9b0fd99
RS
2684 case ' ':
2685 {
2686 re_char *p1 = p;
2687
2688 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2689 spaces normally. And don't try to do this recursively. */
2690 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2691 goto normal_char;
2692
2693 /* Peek past following spaces. */
2694 while (p1 != pend)
2695 {
2696 if (*p1 != ' ')
2697 break;
2698 p1++;
2699 }
2700 /* If the spaces are followed by a repetition op,
2701 treat them normally. */
c721eee5
RS
2702 if (p1 != pend
2703 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2704 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2705 goto normal_char;
2706
2707 /* Replace the spaces with the whitespace regexp. */
2708 in_subpattern = 1;
2709 main_p = p1;
2710 main_pend = pend;
2711 main_pattern = pattern;
2712 p = pattern = whitespace_regexp;
2713 pend = p + strlen (p);
2714 break;
7814e705 2715 }
f9b0fd99 2716
25fe55af
RS
2717 case '^':
2718 {
7814e705 2719 if ( /* If at start of pattern, it's an operator. */
25fe55af 2720 p == pattern + 1
7814e705 2721 /* If context independent, it's an operator. */
25fe55af 2722 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2723 /* Otherwise, depends on what's come before. */
25fe55af 2724 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2725 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2726 else
2727 goto normal_char;
2728 }
2729 break;
2730
2731
2732 case '$':
2733 {
2734 if ( /* If at end of pattern, it's an operator. */
2735 p == pend
7814e705 2736 /* If context independent, it's an operator. */
25fe55af
RS
2737 || syntax & RE_CONTEXT_INDEP_ANCHORS
2738 /* Otherwise, depends on what's next. */
2739 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2740 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2741 else
2742 goto normal_char;
2743 }
2744 break;
fa9a63c5
RM
2745
2746
2747 case '+':
25fe55af
RS
2748 case '?':
2749 if ((syntax & RE_BK_PLUS_QM)
2750 || (syntax & RE_LIMITED_OPS))
2751 goto normal_char;
2752 handle_plus:
2753 case '*':
2754 /* If there is no previous pattern... */
2755 if (!laststart)
2756 {
2757 if (syntax & RE_CONTEXT_INVALID_OPS)
2758 FREE_STACK_RETURN (REG_BADRPT);
2759 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2760 goto normal_char;
2761 }
2762
2763 {
7814e705 2764 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2765 boolean zero_times_ok = 0, many_times_ok = 0;
2766 boolean greedy = 1;
25fe55af
RS
2767
2768 /* If there is a sequence of repetition chars, collapse it
2769 down to just one (the right one). We can't combine
2770 interval operators with these because of, e.g., `a{2}*',
7814e705 2771 which should only match an even number of `a's. */
25fe55af
RS
2772
2773 for (;;)
2774 {
0b32bf0e 2775 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2776 && c == '?' && (zero_times_ok || many_times_ok))
2777 greedy = 0;
2778 else
2779 {
2780 zero_times_ok |= c != '+';
2781 many_times_ok |= c != '?';
2782 }
25fe55af
RS
2783
2784 if (p == pend)
2785 break;
ed0767d8
SM
2786 else if (*p == '*'
2787 || (!(syntax & RE_BK_PLUS_QM)
2788 && (*p == '+' || *p == '?')))
25fe55af 2789 ;
ed0767d8 2790 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2791 {
ed0767d8
SM
2792 if (p+1 == pend)
2793 FREE_STACK_RETURN (REG_EESCAPE);
2794 if (p[1] == '+' || p[1] == '?')
2795 PATFETCH (c); /* Gobble up the backslash. */
2796 else
2797 break;
25fe55af
RS
2798 }
2799 else
ed0767d8 2800 break;
25fe55af 2801 /* If we get here, we found another repeat character. */
ed0767d8
SM
2802 PATFETCH (c);
2803 }
25fe55af
RS
2804
2805 /* Star, etc. applied to an empty pattern is equivalent
2806 to an empty pattern. */
4e8a9132 2807 if (!laststart || laststart == b)
25fe55af
RS
2808 break;
2809
2810 /* Now we know whether or not zero matches is allowed
7814e705 2811 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2812 if (greedy)
2813 {
99633e97 2814 if (many_times_ok)
4e8a9132
SM
2815 {
2816 boolean simple = skip_one_char (laststart) == b;
2817 unsigned int startoffset = 0;
f6a3f532 2818 re_opcode_t ofj =
01618498 2819 /* Check if the loop can match the empty string. */
6df42991
SM
2820 (simple || !analyse_first (laststart, b, NULL, 0))
2821 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2822 assert (skip_one_char (laststart) <= b);
177c0ea7 2823
4e8a9132
SM
2824 if (!zero_times_ok && simple)
2825 { /* Since simple * loops can be made faster by using
2826 on_failure_keep_string_jump, we turn simple P+
2827 into PP* if P is simple. */
2828 unsigned char *p1, *p2;
2829 startoffset = b - laststart;
2830 GET_BUFFER_SPACE (startoffset);
2831 p1 = b; p2 = laststart;
2832 while (p2 < p1)
2833 *b++ = *p2++;
2834 zero_times_ok = 1;
99633e97 2835 }
4e8a9132
SM
2836
2837 GET_BUFFER_SPACE (6);
2838 if (!zero_times_ok)
2839 /* A + loop. */
f6a3f532 2840 STORE_JUMP (ofj, b, b + 6);
99633e97 2841 else
4e8a9132
SM
2842 /* Simple * loops can use on_failure_keep_string_jump
2843 depending on what follows. But since we don't know
2844 that yet, we leave the decision up to
2845 on_failure_jump_smart. */
f6a3f532 2846 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2847 laststart + startoffset, b + 6);
99633e97 2848 b += 3;
4e8a9132 2849 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2850 b += 3;
2851 }
2852 else
2853 {
4e8a9132
SM
2854 /* A simple ? pattern. */
2855 assert (zero_times_ok);
2856 GET_BUFFER_SPACE (3);
2857 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2858 b += 3;
2859 }
1c8c6d39
DL
2860 }
2861 else /* not greedy */
2862 { /* I wish the greedy and non-greedy cases could be merged. */
2863
0683b6fa 2864 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2865 if (many_times_ok)
2866 {
f6a3f532
SM
2867 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2868
6df42991
SM
2869 /* The non-greedy multiple match looks like
2870 a repeat..until: we only need a conditional jump
2871 at the end of the loop. */
f6a3f532
SM
2872 if (emptyp) BUF_PUSH (no_op);
2873 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2874 : on_failure_jump, b, laststart);
1c8c6d39
DL
2875 b += 3;
2876 if (zero_times_ok)
2877 {
2878 /* The repeat...until naturally matches one or more.
2879 To also match zero times, we need to first jump to
6df42991 2880 the end of the loop (its conditional jump). */
1c8c6d39
DL
2881 INSERT_JUMP (jump, laststart, b);
2882 b += 3;
2883 }
2884 }
2885 else
2886 {
2887 /* non-greedy a?? */
1c8c6d39
DL
2888 INSERT_JUMP (jump, laststart, b + 3);
2889 b += 3;
2890 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2891 b += 3;
2892 }
2893 }
2894 }
4e8a9132 2895 pending_exact = 0;
fa9a63c5
RM
2896 break;
2897
2898
2899 case '.':
25fe55af
RS
2900 laststart = b;
2901 BUF_PUSH (anychar);
2902 break;
fa9a63c5
RM
2903
2904
25fe55af
RS
2905 case '[':
2906 {
b18215fc 2907 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2908
25fe55af 2909 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2910
25fe55af
RS
2911 /* Ensure that we have enough space to push a charset: the
2912 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2913 GET_BUFFER_SPACE (34);
2914
25fe55af 2915 laststart = b;
e318085a 2916
25fe55af 2917 /* We test `*p == '^' twice, instead of using an if
7814e705 2918 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2919 BUF_PUSH (*p == '^' ? charset_not : charset);
2920 if (*p == '^')
2921 p++;
e318085a 2922
25fe55af
RS
2923 /* Remember the first position in the bracket expression. */
2924 p1 = p;
e318085a 2925
7814e705 2926 /* Push the number of bytes in the bitmap. */
25fe55af 2927 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2928
25fe55af 2929 /* Clear the whole map. */
72af86bd 2930 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2931
25fe55af
RS
2932 /* charset_not matches newline according to a syntax bit. */
2933 if ((re_opcode_t) b[-2] == charset_not
2934 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2935 SET_LIST_BIT ('\n');
fa9a63c5 2936
7814e705 2937 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2938 for (;;)
2939 {
b18215fc 2940 boolean escaped_char = false;
2d1675e4 2941 const unsigned char *p2 = p;
cf9c99bc 2942 re_wchar_t ch, c2;
e318085a 2943
25fe55af 2944 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2945
36595814
SM
2946 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2947 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2948 So the translation is done later in a loop. Example:
2949 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2950 PATFETCH (c);
e318085a 2951
25fe55af
RS
2952 /* \ might escape characters inside [...] and [^...]. */
2953 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2954 {
2955 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2956
2957 PATFETCH (c);
b18215fc 2958 escaped_char = true;
25fe55af 2959 }
b18215fc
RS
2960 else
2961 {
7814e705 2962 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2963 not (i.e., when the bracket expression is `[]' so
2964 far), the ']' character bit gets set way below. */
2d1675e4 2965 if (c == ']' && p2 != p1)
657fcfbd 2966 break;
25fe55af 2967 }
b18215fc 2968
25fe55af
RS
2969 /* See if we're at the beginning of a possible character
2970 class. */
b18215fc 2971
2d1675e4
SM
2972 if (!escaped_char &&
2973 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2974 {
7814e705 2975 /* Leave room for the null. */
14473664 2976 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2977 const unsigned char *class_beg;
b18215fc 2978
25fe55af
RS
2979 PATFETCH (c);
2980 c1 = 0;
ed0767d8 2981 class_beg = p;
b18215fc 2982
25fe55af
RS
2983 /* If pattern is `[[:'. */
2984 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2985
25fe55af
RS
2986 for (;;)
2987 {
14473664
SM
2988 PATFETCH (c);
2989 if ((c == ':' && *p == ']') || p == pend)
2990 break;
2991 if (c1 < CHAR_CLASS_MAX_LENGTH)
2992 str[c1++] = c;
2993 else
2994 /* This is in any case an invalid class name. */
2995 str[0] = '\0';
25fe55af
RS
2996 }
2997 str[c1] = '\0';
b18215fc
RS
2998
2999 /* If it isn't a word bracketed by `[:' and `:]':
3000 undo the ending character, the letters, and
3001 leave the leading `:' and `[' (but set bits for
3002 them). */
25fe55af
RS
3003 if (c == ':' && *p == ']')
3004 {
14473664 3005 re_wctype_t cc;
8f924df7 3006 int limit;
14473664
SM
3007
3008 cc = re_wctype (str);
3009
3010 if (cc == 0)
fa9a63c5
RM
3011 FREE_STACK_RETURN (REG_ECTYPE);
3012
14473664
SM
3013 /* Throw away the ] at the end of the character
3014 class. */
3015 PATFETCH (c);
fa9a63c5 3016
14473664 3017 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 3018
cf9c99bc
KH
3019#ifndef emacs
3020 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
3021 if (re_iswctype (btowc (ch), cc))
3022 {
3023 c = TRANSLATE (ch);
ed00c2ac
KH
3024 if (c < (1 << BYTEWIDTH))
3025 SET_LIST_BIT (c);
8f924df7 3026 }
cf9c99bc
KH
3027#else /* emacs */
3028 /* Most character classes in a multibyte match
3029 just set a flag. Exceptions are is_blank,
3030 is_digit, is_cntrl, and is_xdigit, since
3031 they can only match ASCII characters. We
3032 don't need to handle them for multibyte.
3033 They are distinguished by a negative wctype. */
96cc36cc 3034
254c06a8
SM
3035 /* Setup the gl_state object to its buffer-defined
3036 value. This hardcodes the buffer-global
3037 syntax-table for ASCII chars, while the other chars
3038 will obey syntax-table properties. It's not ideal,
3039 but it's the way it's been done until now. */
d48cd3f4 3040 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 3041
cf9c99bc 3042 for (ch = 0; ch < 256; ++ch)
25fe55af 3043 {
cf9c99bc
KH
3044 c = RE_CHAR_TO_MULTIBYTE (ch);
3045 if (! CHAR_BYTE8_P (c)
3046 && re_iswctype (c, cc))
8f924df7 3047 {
cf9c99bc
KH
3048 SET_LIST_BIT (ch);
3049 c1 = TRANSLATE (c);
3050 if (c1 == c)
3051 continue;
3052 if (ASCII_CHAR_P (c1))
3053 SET_LIST_BIT (c1);
3054 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3055 SET_LIST_BIT (c1);
8f924df7 3056 }
25fe55af 3057 }
cf9c99bc
KH
3058 SET_RANGE_TABLE_WORK_AREA_BIT
3059 (range_table_work, re_wctype_to_bit (cc));
3060#endif /* emacs */
6224b623
SM
3061 /* In most cases the matching rule for char classes
3062 only uses the syntax table for multibyte chars,
3063 so that the content of the syntax-table is not
3064 hardcoded in the range_table. SPACE and WORD are
3065 the two exceptions. */
3066 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3067 bufp->used_syntax = 1;
3068
b18215fc
RS
3069 /* Repeat the loop. */
3070 continue;
25fe55af
RS
3071 }
3072 else
3073 {
ed0767d8
SM
3074 /* Go back to right after the "[:". */
3075 p = class_beg;
25fe55af 3076 SET_LIST_BIT ('[');
b18215fc
RS
3077
3078 /* Because the `:' may start a range, we
3079 can't simply set bit and repeat the loop.
7814e705 3080 Instead, just set it to C and handle below. */
b18215fc 3081 c = ':';
25fe55af
RS
3082 }
3083 }
b18215fc
RS
3084
3085 if (p < pend && p[0] == '-' && p[1] != ']')
3086 {
3087
3088 /* Discard the `-'. */
3089 PATFETCH (c1);
3090
3091 /* Fetch the character which ends the range. */
3092 PATFETCH (c1);
cf9c99bc
KH
3093#ifdef emacs
3094 if (CHAR_BYTE8_P (c1)
3095 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3096 /* Treat the range from a multibyte character to
3097 raw-byte character as empty. */
3098 c = c1 + 1;
3099#endif /* emacs */
e318085a 3100 }
25fe55af 3101 else
b18215fc
RS
3102 /* Range from C to C. */
3103 c1 = c;
3104
cf9c99bc 3105 if (c > c1)
25fe55af 3106 {
cf9c99bc
KH
3107 if (syntax & RE_NO_EMPTY_RANGES)
3108 FREE_STACK_RETURN (REG_ERANGEX);
3109 /* Else, repeat the loop. */
bf216479 3110 }
6fdd04b0 3111 else
25fe55af 3112 {
cf9c99bc
KH
3113#ifndef emacs
3114 /* Set the range into bitmap */
8f924df7 3115 for (; c <= c1; c++)
b18215fc 3116 {
cf9c99bc
KH
3117 ch = TRANSLATE (c);
3118 if (ch < (1 << BYTEWIDTH))
3119 SET_LIST_BIT (ch);
3120 }
3121#else /* emacs */
3122 if (c < 128)
3123 {
3124 ch = MIN (127, c1);
3125 SETUP_ASCII_RANGE (range_table_work, c, ch);
3126 c = ch + 1;
3127 if (CHAR_BYTE8_P (c1))
3128 c = BYTE8_TO_CHAR (128);
3129 }
3130 if (c <= c1)
3131 {
3132 if (CHAR_BYTE8_P (c))
3133 {
3134 c = CHAR_TO_BYTE8 (c);
3135 c1 = CHAR_TO_BYTE8 (c1);
3136 for (; c <= c1; c++)
3137 SET_LIST_BIT (c);
3138 }
3139 else if (multibyte)
3140 {
3141 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3142 }
3143 else
3144 {
3145 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3146 }
e934739e 3147 }
cf9c99bc 3148#endif /* emacs */
25fe55af 3149 }
e318085a
RS
3150 }
3151
25fe55af 3152 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3153 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3154 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3155 b[-1]--;
3156 b += b[-1];
fa9a63c5 3157
96cc36cc
RS
3158 /* Build real range table from work area. */
3159 if (RANGE_TABLE_WORK_USED (range_table_work)
3160 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3161 {
3162 int i;
3163 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3164
b18215fc 3165 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3166 bytes for flags, two for COUNT, and three bytes for
3167 each character. */
3168 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3169
b18215fc
RS
3170 /* Indicate the existence of range table. */
3171 laststart[1] |= 0x80;
fa9a63c5 3172
96cc36cc
RS
3173 /* Store the character class flag bits into the range table.
3174 If not in emacs, these flag bits are always 0. */
3175 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3176 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3177
b18215fc
RS
3178 STORE_NUMBER_AND_INCR (b, used / 2);
3179 for (i = 0; i < used; i++)
3180 STORE_CHARACTER_AND_INCR
3181 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3182 }
25fe55af
RS
3183 }
3184 break;
fa9a63c5
RM
3185
3186
b18215fc 3187 case '(':
25fe55af
RS
3188 if (syntax & RE_NO_BK_PARENS)
3189 goto handle_open;
3190 else
3191 goto normal_char;
fa9a63c5
RM
3192
3193
25fe55af
RS
3194 case ')':
3195 if (syntax & RE_NO_BK_PARENS)
3196 goto handle_close;
3197 else
3198 goto normal_char;
e318085a
RS
3199
3200
25fe55af
RS
3201 case '\n':
3202 if (syntax & RE_NEWLINE_ALT)
3203 goto handle_alt;
3204 else
3205 goto normal_char;
e318085a
RS
3206
3207
b18215fc 3208 case '|':
25fe55af
RS
3209 if (syntax & RE_NO_BK_VBAR)
3210 goto handle_alt;
3211 else
3212 goto normal_char;
3213
3214
3215 case '{':
3216 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3217 goto handle_interval;
3218 else
3219 goto normal_char;
3220
3221
3222 case '\\':
3223 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3224
3225 /* Do not translate the character after the \, so that we can
3226 distinguish, e.g., \B from \b, even if we normally would
3227 translate, e.g., B to b. */
36595814 3228 PATFETCH (c);
25fe55af
RS
3229
3230 switch (c)
3231 {
3232 case '(':
3233 if (syntax & RE_NO_BK_PARENS)
3234 goto normal_backslash;
3235
3236 handle_open:
505bde11
SM
3237 {
3238 int shy = 0;
c69b0314 3239 regnum_t regnum = 0;
505bde11
SM
3240 if (p+1 < pend)
3241 {
3242 /* Look for a special (?...) construct */
ed0767d8 3243 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3244 {
ed0767d8 3245 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3246 while (!shy)
505bde11 3247 {
c69b0314
SM
3248 PATFETCH (c);
3249 switch (c)
3250 {
3251 case ':': shy = 1; break;
3252 case '0':
3253 /* An explicitly specified regnum must start
3254 with non-0. */
3255 if (regnum == 0)
3256 FREE_STACK_RETURN (REG_BADPAT);
3257 case '1': case '2': case '3': case '4':
3258 case '5': case '6': case '7': case '8': case '9':
3259 regnum = 10*regnum + (c - '0'); break;
3260 default:
3261 /* Only (?:...) is supported right now. */
3262 FREE_STACK_RETURN (REG_BADPAT);
3263 }
505bde11
SM
3264 }
3265 }
505bde11
SM
3266 }
3267
3268 if (!shy)
c69b0314
SM
3269 regnum = ++bufp->re_nsub;
3270 else if (regnum)
3271 { /* It's actually not shy, but explicitly numbered. */
3272 shy = 0;
3273 if (regnum > bufp->re_nsub)
3274 bufp->re_nsub = regnum;
3275 else if (regnum > bufp->re_nsub
3276 /* Ideally, we'd want to check that the specified
3277 group can't have matched (i.e. all subgroups
3278 using the same regnum are in other branches of
3279 OR patterns), but we don't currently keep track
3280 of enough info to do that easily. */
3281 || group_in_compile_stack (compile_stack, regnum))
3282 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3283 }
c69b0314
SM
3284 else
3285 /* It's really shy. */
3286 regnum = - bufp->re_nsub;
25fe55af 3287
99633e97
SM
3288 if (COMPILE_STACK_FULL)
3289 {
3290 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3291 compile_stack_elt_t);
3292 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3293
99633e97
SM
3294 compile_stack.size <<= 1;
3295 }
25fe55af 3296
99633e97 3297 /* These are the values to restore when we hit end of this
7814e705 3298 group. They are all relative offsets, so that if the
99633e97
SM
3299 whole pattern moves because of realloc, they will still
3300 be valid. */
3301 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3302 COMPILE_STACK_TOP.fixup_alt_jump
3303 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3304 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3305 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3306
c69b0314
SM
3307 /* Do not push a start_memory for groups beyond the last one
3308 we can represent in the compiled pattern. */
3309 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3310 BUF_PUSH_2 (start_memory, regnum);
3311
3312 compile_stack.avail++;
3313
3314 fixup_alt_jump = 0;
3315 laststart = 0;
3316 begalt = b;
3317 /* If we've reached MAX_REGNUM groups, then this open
3318 won't actually generate any code, so we'll have to
3319 clear pending_exact explicitly. */
3320 pending_exact = 0;
3321 break;
505bde11 3322 }
25fe55af
RS
3323
3324 case ')':
3325 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3326
3327 if (COMPILE_STACK_EMPTY)
505bde11
SM
3328 {
3329 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3330 goto normal_backslash;
3331 else
3332 FREE_STACK_RETURN (REG_ERPAREN);
3333 }
25fe55af
RS
3334
3335 handle_close:
505bde11 3336 FIXUP_ALT_JUMP ();
25fe55af
RS
3337
3338 /* See similar code for backslashed left paren above. */
3339 if (COMPILE_STACK_EMPTY)
505bde11
SM
3340 {
3341 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3342 goto normal_char;
3343 else
3344 FREE_STACK_RETURN (REG_ERPAREN);
3345 }
25fe55af
RS
3346
3347 /* Since we just checked for an empty stack above, this
3348 ``can't happen''. */
3349 assert (compile_stack.avail != 0);
3350 {
3351 /* We don't just want to restore into `regnum', because
3352 later groups should continue to be numbered higher,
7814e705 3353 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3354 regnum_t regnum;
25fe55af
RS
3355
3356 compile_stack.avail--;
3357 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3358 fixup_alt_jump
3359 = COMPILE_STACK_TOP.fixup_alt_jump
3360 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3361 : 0;
3362 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3363 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3364 /* If we've reached MAX_REGNUM groups, then this close
3365 won't actually generate any code, so we'll have to
3366 clear pending_exact explicitly. */
3367 pending_exact = 0;
e318085a 3368
25fe55af 3369 /* We're at the end of the group, so now we know how many
7814e705 3370 groups were inside this one. */
c69b0314
SM
3371 if (regnum <= MAX_REGNUM && regnum > 0)
3372 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3373 }
3374 break;
3375
3376
3377 case '|': /* `\|'. */
3378 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3379 goto normal_backslash;
3380 handle_alt:
3381 if (syntax & RE_LIMITED_OPS)
3382 goto normal_char;
3383
3384 /* Insert before the previous alternative a jump which
7814e705 3385 jumps to this alternative if the former fails. */
25fe55af
RS
3386 GET_BUFFER_SPACE (3);
3387 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3388 pending_exact = 0;
3389 b += 3;
3390
3391 /* The alternative before this one has a jump after it
3392 which gets executed if it gets matched. Adjust that
3393 jump so it will jump to this alternative's analogous
3394 jump (put in below, which in turn will jump to the next
3395 (if any) alternative's such jump, etc.). The last such
3396 jump jumps to the correct final destination. A picture:
3397 _____ _____
3398 | | | |
3399 | v | v
3400 a | b | c
3401
3402 If we are at `b', then fixup_alt_jump right now points to a
3403 three-byte space after `a'. We'll put in the jump, set
3404 fixup_alt_jump to right after `b', and leave behind three
3405 bytes which we'll fill in when we get to after `c'. */
3406
505bde11 3407 FIXUP_ALT_JUMP ();
25fe55af
RS
3408
3409 /* Mark and leave space for a jump after this alternative,
3410 to be filled in later either by next alternative or
3411 when we know we're at the end of a series of alternatives. */
3412 fixup_alt_jump = b;
3413 GET_BUFFER_SPACE (3);
3414 b += 3;
3415
3416 laststart = 0;
3417 begalt = b;
3418 break;
3419
3420
3421 case '{':
3422 /* If \{ is a literal. */
3423 if (!(syntax & RE_INTERVALS)
3424 /* If we're at `\{' and it's not the open-interval
3425 operator. */
4bb91c68 3426 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3427 goto normal_backslash;
3428
3429 handle_interval:
3430 {
3431 /* If got here, then the syntax allows intervals. */
3432
3433 /* At least (most) this many matches must be made. */
99633e97 3434 int lower_bound = 0, upper_bound = -1;
25fe55af 3435
ed0767d8 3436 beg_interval = p;
25fe55af 3437
25fe55af
RS
3438 GET_UNSIGNED_NUMBER (lower_bound);
3439
3440 if (c == ',')
ed0767d8 3441 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3442 else
3443 /* Interval such as `{1}' => match exactly once. */
3444 upper_bound = lower_bound;
3445
3446 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3447 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3448 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3449
3450 if (!(syntax & RE_NO_BK_BRACES))
3451 {
4bb91c68
SM
3452 if (c != '\\')
3453 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3454 if (p == pend)
3455 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3456 PATFETCH (c);
3457 }
3458
3459 if (c != '}')
4bb91c68 3460 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3461
3462 /* We just parsed a valid interval. */
3463
3464 /* If it's invalid to have no preceding re. */
3465 if (!laststart)
3466 {
3467 if (syntax & RE_CONTEXT_INVALID_OPS)
3468 FREE_STACK_RETURN (REG_BADRPT);
3469 else if (syntax & RE_CONTEXT_INDEP_OPS)
3470 laststart = b;
3471 else
3472 goto unfetch_interval;
3473 }
3474
6df42991
SM
3475 if (upper_bound == 0)
3476 /* If the upper bound is zero, just drop the sub pattern
3477 altogether. */
3478 b = laststart;
3479 else if (lower_bound == 1 && upper_bound == 1)
3480 /* Just match it once: nothing to do here. */
3481 ;
3482
3483 /* Otherwise, we have a nontrivial interval. When
3484 we're all done, the pattern will look like:
3485 set_number_at <jump count> <upper bound>
3486 set_number_at <succeed_n count> <lower bound>
3487 succeed_n <after jump addr> <succeed_n count>
3488 <body of loop>
3489 jump_n <succeed_n addr> <jump count>
3490 (The upper bound and `jump_n' are omitted if
3491 `upper_bound' is 1, though.) */
3492 else
3493 { /* If the upper bound is > 1, we need to insert
3494 more at the end of the loop. */
3495 unsigned int nbytes = (upper_bound < 0 ? 3
3496 : upper_bound > 1 ? 5 : 0);
3497 unsigned int startoffset = 0;
3498
3499 GET_BUFFER_SPACE (20); /* We might use less. */
3500
3501 if (lower_bound == 0)
3502 {
3503 /* A succeed_n that starts with 0 is really a
3504 simple on_failure_jump_loop. */
3505 INSERT_JUMP (on_failure_jump_loop, laststart,
3506 b + 3 + nbytes);
3507 b += 3;
3508 }
3509 else
3510 {
3511 /* Initialize lower bound of the `succeed_n', even
3512 though it will be set during matching by its
3513 attendant `set_number_at' (inserted next),
3514 because `re_compile_fastmap' needs to know.
3515 Jump to the `jump_n' we might insert below. */
3516 INSERT_JUMP2 (succeed_n, laststart,
3517 b + 5 + nbytes,
3518 lower_bound);
3519 b += 5;
3520
3521 /* Code to initialize the lower bound. Insert
7814e705 3522 before the `succeed_n'. The `5' is the last two
6df42991
SM
3523 bytes of this `set_number_at', plus 3 bytes of
3524 the following `succeed_n'. */
3525 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3526 b += 5;
3527 startoffset += 5;
3528 }
3529
3530 if (upper_bound < 0)
3531 {
3532 /* A negative upper bound stands for infinity,
3533 in which case it degenerates to a plain jump. */
3534 STORE_JUMP (jump, b, laststart + startoffset);
3535 b += 3;
3536 }
3537 else if (upper_bound > 1)
3538 { /* More than one repetition is allowed, so
3539 append a backward jump to the `succeed_n'
3540 that starts this interval.
3541
3542 When we've reached this during matching,
3543 we'll have matched the interval once, so
3544 jump back only `upper_bound - 1' times. */
3545 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3546 upper_bound - 1);
3547 b += 5;
3548
3549 /* The location we want to set is the second
3550 parameter of the `jump_n'; that is `b-2' as
3551 an absolute address. `laststart' will be
3552 the `set_number_at' we're about to insert;
3553 `laststart+3' the number to set, the source
3554 for the relative address. But we are
3555 inserting into the middle of the pattern --
3556 so everything is getting moved up by 5.
3557 Conclusion: (b - 2) - (laststart + 3) + 5,
3558 i.e., b - laststart.
3559
3560 We insert this at the beginning of the loop
3561 so that if we fail during matching, we'll
3562 reinitialize the bounds. */
3563 insert_op2 (set_number_at, laststart, b - laststart,
3564 upper_bound - 1, b);
3565 b += 5;
3566 }
3567 }
25fe55af
RS
3568 pending_exact = 0;
3569 beg_interval = NULL;
3570 }
3571 break;
3572
3573 unfetch_interval:
3574 /* If an invalid interval, match the characters as literals. */
3575 assert (beg_interval);
3576 p = beg_interval;
3577 beg_interval = NULL;
3578
3579 /* normal_char and normal_backslash need `c'. */
ed0767d8 3580 c = '{';
25fe55af
RS
3581
3582 if (!(syntax & RE_NO_BK_BRACES))
3583 {
ed0767d8
SM
3584 assert (p > pattern && p[-1] == '\\');
3585 goto normal_backslash;
25fe55af 3586 }
ed0767d8
SM
3587 else
3588 goto normal_char;
e318085a 3589
b18215fc 3590#ifdef emacs
25fe55af 3591 /* There is no way to specify the before_dot and after_dot
7814e705 3592 operators. rms says this is ok. --karl */
25fe55af
RS
3593 case '=':
3594 BUF_PUSH (at_dot);
3595 break;
3596
3597 case 's':
3598 laststart = b;
3599 PATFETCH (c);
3600 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3601 break;
3602
3603 case 'S':
3604 laststart = b;
3605 PATFETCH (c);
3606 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3607 break;
b18215fc
RS
3608
3609 case 'c':
3610 laststart = b;
36595814 3611 PATFETCH (c);
b18215fc
RS
3612 BUF_PUSH_2 (categoryspec, c);
3613 break;
e318085a 3614
b18215fc
RS
3615 case 'C':
3616 laststart = b;
36595814 3617 PATFETCH (c);
b18215fc
RS
3618 BUF_PUSH_2 (notcategoryspec, c);
3619 break;
3620#endif /* emacs */
e318085a 3621
e318085a 3622
25fe55af 3623 case 'w':
4bb91c68
SM
3624 if (syntax & RE_NO_GNU_OPS)
3625 goto normal_char;
25fe55af 3626 laststart = b;
1fb352e0 3627 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3628 break;
e318085a 3629
e318085a 3630
25fe55af 3631 case 'W':
4bb91c68
SM
3632 if (syntax & RE_NO_GNU_OPS)
3633 goto normal_char;
25fe55af 3634 laststart = b;
1fb352e0 3635 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3636 break;
e318085a
RS
3637
3638
25fe55af 3639 case '<':
4bb91c68
SM
3640 if (syntax & RE_NO_GNU_OPS)
3641 goto normal_char;
25fe55af
RS
3642 BUF_PUSH (wordbeg);
3643 break;
e318085a 3644
25fe55af 3645 case '>':
4bb91c68
SM
3646 if (syntax & RE_NO_GNU_OPS)
3647 goto normal_char;
25fe55af
RS
3648 BUF_PUSH (wordend);
3649 break;
e318085a 3650
669fa600
SM
3651 case '_':
3652 if (syntax & RE_NO_GNU_OPS)
3653 goto normal_char;
3654 laststart = b;
3655 PATFETCH (c);
3656 if (c == '<')
3657 BUF_PUSH (symbeg);
3658 else if (c == '>')
3659 BUF_PUSH (symend);
3660 else
3661 FREE_STACK_RETURN (REG_BADPAT);
3662 break;
3663
25fe55af 3664 case 'b':
4bb91c68
SM
3665 if (syntax & RE_NO_GNU_OPS)
3666 goto normal_char;
25fe55af
RS
3667 BUF_PUSH (wordbound);
3668 break;
e318085a 3669
25fe55af 3670 case 'B':
4bb91c68
SM
3671 if (syntax & RE_NO_GNU_OPS)
3672 goto normal_char;
25fe55af
RS
3673 BUF_PUSH (notwordbound);
3674 break;
fa9a63c5 3675
25fe55af 3676 case '`':
4bb91c68
SM
3677 if (syntax & RE_NO_GNU_OPS)
3678 goto normal_char;
25fe55af
RS
3679 BUF_PUSH (begbuf);
3680 break;
e318085a 3681
25fe55af 3682 case '\'':
4bb91c68
SM
3683 if (syntax & RE_NO_GNU_OPS)
3684 goto normal_char;
25fe55af
RS
3685 BUF_PUSH (endbuf);
3686 break;
e318085a 3687
25fe55af
RS
3688 case '1': case '2': case '3': case '4': case '5':
3689 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3690 {
3691 regnum_t reg;
e318085a 3692
0cdd06f8
SM
3693 if (syntax & RE_NO_BK_REFS)
3694 goto normal_backslash;
e318085a 3695
0cdd06f8 3696 reg = c - '0';
e318085a 3697
c69b0314
SM
3698 if (reg > bufp->re_nsub || reg < 1
3699 /* Can't back reference to a subexp before its end. */
3700 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3701 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3702
0cdd06f8
SM
3703 laststart = b;
3704 BUF_PUSH_2 (duplicate, reg);
3705 }
25fe55af 3706 break;
e318085a 3707
e318085a 3708
25fe55af
RS
3709 case '+':
3710 case '?':
3711 if (syntax & RE_BK_PLUS_QM)
3712 goto handle_plus;
3713 else
3714 goto normal_backslash;
3715
3716 default:
3717 normal_backslash:
3718 /* You might think it would be useful for \ to mean
3719 not to translate; but if we don't translate it
4bb91c68 3720 it will never match anything. */
25fe55af
RS
3721 goto normal_char;
3722 }
3723 break;
fa9a63c5
RM
3724
3725
3726 default:
25fe55af 3727 /* Expects the character in `c'. */
fa9a63c5 3728 normal_char:
36595814 3729 /* If no exactn currently being built. */
25fe55af 3730 if (!pending_exact
fa9a63c5 3731
25fe55af
RS
3732 /* If last exactn not at current position. */
3733 || pending_exact + *pending_exact + 1 != b
5e69f11e 3734
25fe55af 3735 /* We have only one byte following the exactn for the count. */
2d1675e4 3736 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3737
7814e705 3738 /* If followed by a repetition operator. */
9d99031f 3739 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3740 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3741 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3742 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3743 || ((syntax & RE_INTERVALS)
25fe55af 3744 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3745 ? p != pend && *p == '{'
3746 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3747 {
3748 /* Start building a new exactn. */
5e69f11e 3749
25fe55af 3750 laststart = b;
fa9a63c5
RM
3751
3752 BUF_PUSH_2 (exactn, 0);
3753 pending_exact = b - 1;
25fe55af 3754 }
5e69f11e 3755
2d1675e4
SM
3756 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3757 {
e0277a47
KH
3758 int len;
3759
cf9c99bc 3760 if (multibyte)
6fdd04b0 3761 {
cf9c99bc 3762 c = TRANSLATE (c);
6fdd04b0
KH
3763 len = CHAR_STRING (c, b);
3764 b += len;
3765 }
e0277a47 3766 else
6fdd04b0 3767 {
cf9c99bc
KH
3768 c1 = RE_CHAR_TO_MULTIBYTE (c);
3769 if (! CHAR_BYTE8_P (c1))
3770 {
3771 re_wchar_t c2 = TRANSLATE (c1);
3772
3773 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3774 c = c1;
409f2919 3775 }
6fdd04b0
KH
3776 *b++ = c;
3777 len = 1;
3778 }
2d1675e4
SM
3779 (*pending_exact) += len;
3780 }
3781
fa9a63c5 3782 break;
25fe55af 3783 } /* switch (c) */
fa9a63c5
RM
3784 } /* while p != pend */
3785
5e69f11e 3786
fa9a63c5 3787 /* Through the pattern now. */
5e69f11e 3788
505bde11 3789 FIXUP_ALT_JUMP ();
fa9a63c5 3790
5e69f11e 3791 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3792 FREE_STACK_RETURN (REG_EPAREN);
3793
3794 /* If we don't want backtracking, force success
3795 the first time we reach the end of the compiled pattern. */
3796 if (syntax & RE_NO_POSIX_BACKTRACKING)
3797 BUF_PUSH (succeed);
3798
fa9a63c5
RM
3799 /* We have succeeded; set the length of the buffer. */
3800 bufp->used = b - bufp->buffer;
3801
3802#ifdef DEBUG
99633e97 3803 if (debug > 0)
fa9a63c5 3804 {
505bde11 3805 re_compile_fastmap (bufp);
fa9a63c5
RM
3806 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3807 print_compiled_pattern (bufp);
3808 }
99633e97 3809 debug--;
fa9a63c5
RM
3810#endif /* DEBUG */
3811
3812#ifndef MATCH_MAY_ALLOCATE
3813 /* Initialize the failure stack to the largest possible stack. This
3814 isn't necessary unless we're trying to avoid calling alloca in
3815 the search and match routines. */
3816 {
3817 int num_regs = bufp->re_nsub + 1;
3818
320a2a73 3819 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3820 {
a26f4ccd 3821 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3822
fa9a63c5
RM
3823 if (! fail_stack.stack)
3824 fail_stack.stack
5e69f11e 3825 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3826 * sizeof (fail_stack_elt_t));
3827 else
3828 fail_stack.stack
3829 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3830 (fail_stack.size
3831 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3832 }
3833
3834 regex_grow_registers (num_regs);
3835 }
3836#endif /* not MATCH_MAY_ALLOCATE */
3837
839966f3 3838 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3839} /* regex_compile */
3840\f
3841/* Subroutines for `regex_compile'. */
3842
7814e705 3843/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3844
3845static void
971de7fb 3846store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3847{
3848 *loc = (unsigned char) op;
3849 STORE_NUMBER (loc + 1, arg);
3850}
3851
3852
3853/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3854
3855static void
971de7fb 3856store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3857{
3858 *loc = (unsigned char) op;
3859 STORE_NUMBER (loc + 1, arg1);
3860 STORE_NUMBER (loc + 3, arg2);
3861}
3862
3863
3864/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3865 for OP followed by two-byte integer parameter ARG. */
3866
3867static void
971de7fb 3868insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3869{
3870 register unsigned char *pfrom = end;
3871 register unsigned char *pto = end + 3;
3872
3873 while (pfrom != loc)
3874 *--pto = *--pfrom;
5e69f11e 3875
fa9a63c5
RM
3876 store_op1 (op, loc, arg);
3877}
3878
3879
3880/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3881
3882static void
971de7fb 3883insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3884{
3885 register unsigned char *pfrom = end;
3886 register unsigned char *pto = end + 5;
3887
3888 while (pfrom != loc)
3889 *--pto = *--pfrom;
5e69f11e 3890
fa9a63c5
RM
3891 store_op2 (op, loc, arg1, arg2);
3892}
3893
3894
3895/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3896 after an alternative or a begin-subexpression. We assume there is at
3897 least one character before the ^. */
3898
3899static boolean
971de7fb 3900at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3901{
01618498 3902 re_char *prev = p - 2;
fa9a63c5 3903 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3904
fa9a63c5
RM
3905 return
3906 /* After a subexpression? */
3907 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3908 /* After an alternative? */
d2af47df
SM
3909 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3910 /* After a shy subexpression? */
3911 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3912 && prev[-1] == '?' && prev[-2] == '('
3913 && (syntax & RE_NO_BK_PARENS
3914 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3915}
3916
3917
3918/* The dual of at_begline_loc_p. This one is for $. We assume there is
3919 at least one character after the $, i.e., `P < PEND'. */
3920
3921static boolean
971de7fb 3922at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3923{
01618498 3924 re_char *next = p;
fa9a63c5 3925 boolean next_backslash = *next == '\\';
01618498 3926 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3927
fa9a63c5
RM
3928 return
3929 /* Before a subexpression? */
3930 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3931 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3932 /* Before an alternative? */
3933 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3934 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3935}
3936
3937
5e69f11e 3938/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3939 false if it's not. */
3940
3941static boolean
971de7fb 3942group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5
RM
3943{
3944 int this_element;
3945
5e69f11e
RM
3946 for (this_element = compile_stack.avail - 1;
3947 this_element >= 0;
fa9a63c5
RM
3948 this_element--)
3949 if (compile_stack.stack[this_element].regnum == regnum)
3950 return true;
3951
3952 return false;
3953}
fa9a63c5 3954\f
f6a3f532
SM
3955/* analyse_first.
3956 If fastmap is non-NULL, go through the pattern and fill fastmap
3957 with all the possible leading chars. If fastmap is NULL, don't
3958 bother filling it up (obviously) and only return whether the
3959 pattern could potentially match the empty string.
3960
3961 Return 1 if p..pend might match the empty string.
3962 Return 0 if p..pend matches at least one char.
01618498 3963 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3964
3965static int
438105ed 3966analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
fa9a63c5 3967{
505bde11 3968 int j, k;
1fb352e0 3969 boolean not;
fa9a63c5 3970
b18215fc 3971 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3972 flag is set true. */
b18215fc
RS
3973 boolean match_any_multibyte_characters = false;
3974
f6a3f532 3975 assert (p);
5e69f11e 3976
505bde11
SM
3977 /* The loop below works as follows:
3978 - It has a working-list kept in the PATTERN_STACK and which basically
3979 starts by only containing a pointer to the first operation.
3980 - If the opcode we're looking at is a match against some set of
3981 chars, then we add those chars to the fastmap and go on to the
3982 next work element from the worklist (done via `break').
3983 - If the opcode is a control operator on the other hand, we either
3984 ignore it (if it's meaningless at this point, such as `start_memory')
3985 or execute it (if it's a jump). If the jump has several destinations
3986 (i.e. `on_failure_jump'), then we push the other destination onto the
3987 worklist.
3988 We guarantee termination by ignoring backward jumps (more or less),
3989 so that `p' is monotonically increasing. More to the point, we
3990 never set `p' (or push) anything `<= p1'. */
3991
01618498 3992 while (p < pend)
fa9a63c5 3993 {
505bde11
SM
3994 /* `p1' is used as a marker of how far back a `on_failure_jump'
3995 can go without being ignored. It is normally equal to `p'
3996 (which prevents any backward `on_failure_jump') except right
3997 after a plain `jump', to allow patterns such as:
3998 0: jump 10
3999 3..9: <body>
4000 10: on_failure_jump 3
4001 as used for the *? operator. */
01618498 4002 re_char *p1 = p;
5e69f11e 4003
fa9a63c5
RM
4004 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4005 {
f6a3f532 4006 case succeed:
01618498 4007 return 1;
f6a3f532 4008 continue;
fa9a63c5 4009
fa9a63c5 4010 case duplicate:
505bde11
SM
4011 /* If the first character has to match a backreference, that means
4012 that the group was empty (since it already matched). Since this
4013 is the only case that interests us here, we can assume that the
4014 backreference must match the empty string. */
4015 p++;
4016 continue;
fa9a63c5
RM
4017
4018
4019 /* Following are the cases which match a character. These end
7814e705 4020 with `break'. */
fa9a63c5
RM
4021
4022 case exactn:
e0277a47 4023 if (fastmap)
cf9c99bc
KH
4024 {
4025 /* If multibyte is nonzero, the first byte of each
4026 character is an ASCII or a leading code. Otherwise,
4027 each byte is a character. Thus, this works in both
4028 cases. */
4029 fastmap[p[1]] = 1;
4030 if (! multibyte)
4031 {
4032 /* For the case of matching this unibyte regex
4033 against multibyte, we must set a leading code of
4034 the corresponding multibyte character. */
4035 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4036
86e893e3 4037 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
4038 }
4039 }
fa9a63c5
RM
4040 break;
4041
4042
1fb352e0
SM
4043 case anychar:
4044 /* We could put all the chars except for \n (and maybe \0)
4045 but we don't bother since it is generally not worth it. */
f6a3f532 4046 if (!fastmap) break;
01618498 4047 return -1;
fa9a63c5
RM
4048
4049
b18215fc 4050 case charset_not:
1fb352e0 4051 if (!fastmap) break;
bf216479
KH
4052 {
4053 /* Chars beyond end of bitmap are possible matches. */
bf216479 4054 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 4055 j < (1 << BYTEWIDTH); j++)
bf216479
KH
4056 fastmap[j] = 1;
4057 }
4058
1fb352e0
SM
4059 /* Fallthrough */
4060 case charset:
4061 if (!fastmap) break;
4062 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4063 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4064 j >= 0; j--)
1fb352e0 4065 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4066 fastmap[j] = 1;
b18215fc 4067
6482db2e
KH
4068#ifdef emacs
4069 if (/* Any leading code can possibly start a character
1fb352e0 4070 which doesn't match the specified set of characters. */
6482db2e 4071 not
409f2919 4072 ||
6482db2e
KH
4073 /* If we can match a character class, we can match any
4074 multibyte characters. */
4075 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4076 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4077
b18215fc 4078 {
b18215fc
RS
4079 if (match_any_multibyte_characters == false)
4080 {
6482db2e
KH
4081 for (j = MIN_MULTIBYTE_LEADING_CODE;
4082 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4083 fastmap[j] = 1;
b18215fc
RS
4084 match_any_multibyte_characters = true;
4085 }
4086 }
b18215fc 4087
1fb352e0
SM
4088 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4089 && match_any_multibyte_characters == false)
4090 {
bf216479 4091 /* Set fastmap[I] to 1 where I is a leading code of each
9117d724 4092 multibyte characer in the range table. */
1fb352e0 4093 int c, count;
bf216479 4094 unsigned char lc1, lc2;
b18215fc 4095
1fb352e0 4096 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4097 bits for a character class. */
1fb352e0 4098 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4099
1fb352e0
SM
4100 /* Extract the number of ranges in range table into COUNT. */
4101 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4102 for (; count > 0; count--, p += 3)
1fb352e0 4103 {
9117d724
KH
4104 /* Extract the start and end of each range. */
4105 EXTRACT_CHARACTER (c, p);
bf216479 4106 lc1 = CHAR_LEADING_CODE (c);
9117d724 4107 p += 3;
1fb352e0 4108 EXTRACT_CHARACTER (c, p);
bf216479
KH
4109 lc2 = CHAR_LEADING_CODE (c);
4110 for (j = lc1; j <= lc2; j++)
9117d724 4111 fastmap[j] = 1;
1fb352e0
SM
4112 }
4113 }
6482db2e 4114#endif
b18215fc
RS
4115 break;
4116
1fb352e0
SM
4117 case syntaxspec:
4118 case notsyntaxspec:
4119 if (!fastmap) break;
4120#ifndef emacs
4121 not = (re_opcode_t)p[-1] == notsyntaxspec;
4122 k = *p++;
4123 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4124 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4125 fastmap[j] = 1;
b18215fc 4126 break;
1fb352e0 4127#else /* emacs */
b18215fc
RS
4128 /* This match depends on text properties. These end with
4129 aborting optimizations. */
01618498 4130 return -1;
b18215fc
RS
4131
4132 case categoryspec:
b18215fc 4133 case notcategoryspec:
1fb352e0
SM
4134 if (!fastmap) break;
4135 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4136 k = *p++;
6482db2e 4137 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4138 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4139 fastmap[j] = 1;
4140
6482db2e
KH
4141 /* Any leading code can possibly start a character which
4142 has or doesn't has the specified category. */
4143 if (match_any_multibyte_characters == false)
6fdd04b0 4144 {
6482db2e
KH
4145 for (j = MIN_MULTIBYTE_LEADING_CODE;
4146 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4147 fastmap[j] = 1;
4148 match_any_multibyte_characters = true;
6fdd04b0 4149 }
b18215fc
RS
4150 break;
4151
fa9a63c5 4152 /* All cases after this match the empty string. These end with
25fe55af 4153 `continue'. */
fa9a63c5 4154
fa9a63c5
RM
4155 case before_dot:
4156 case at_dot:
4157 case after_dot:
1fb352e0 4158#endif /* !emacs */
25fe55af
RS
4159 case no_op:
4160 case begline:
4161 case endline:
fa9a63c5
RM
4162 case begbuf:
4163 case endbuf:
4164 case wordbound:
4165 case notwordbound:
4166 case wordbeg:
4167 case wordend:
669fa600
SM
4168 case symbeg:
4169 case symend:
25fe55af 4170 continue;
fa9a63c5
RM
4171
4172
fa9a63c5 4173 case jump:
25fe55af 4174 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4175 if (j < 0)
4176 /* Backward jumps can only go back to code that we've already
4177 visited. `re_compile' should make sure this is true. */
4178 break;
25fe55af 4179 p += j;
505bde11
SM
4180 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4181 {
4182 case on_failure_jump:
4183 case on_failure_keep_string_jump:
505bde11 4184 case on_failure_jump_loop:
0683b6fa 4185 case on_failure_jump_nastyloop:
505bde11
SM
4186 case on_failure_jump_smart:
4187 p++;
4188 break;
4189 default:
4190 continue;
4191 };
4192 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4193 to jump back to "just after here". */
4194 /* Fallthrough */
fa9a63c5 4195
25fe55af
RS
4196 case on_failure_jump:
4197 case on_failure_keep_string_jump:
0683b6fa 4198 case on_failure_jump_nastyloop:
505bde11
SM
4199 case on_failure_jump_loop:
4200 case on_failure_jump_smart:
25fe55af 4201 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4202 if (p + j <= p1)
ed0767d8 4203 ; /* Backward jump to be ignored. */
01618498
SM
4204 else
4205 { /* We have to look down both arms.
4206 We first go down the "straight" path so as to minimize
4207 stack usage when going through alternatives. */
4208 int r = analyse_first (p, pend, fastmap, multibyte);
4209 if (r) return r;
4210 p += j;
4211 }
25fe55af 4212 continue;
fa9a63c5
RM
4213
4214
ed0767d8
SM
4215 case jump_n:
4216 /* This code simply does not properly handle forward jump_n. */
4217 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4218 p += 4;
4219 /* jump_n can either jump or fall through. The (backward) jump
4220 case has already been handled, so we only need to look at the
4221 fallthrough case. */
4222 continue;
177c0ea7 4223
fa9a63c5 4224 case succeed_n:
ed0767d8
SM
4225 /* If N == 0, it should be an on_failure_jump_loop instead. */
4226 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4227 p += 4;
4228 /* We only care about one iteration of the loop, so we don't
4229 need to consider the case where this behaves like an
4230 on_failure_jump. */
25fe55af 4231 continue;
fa9a63c5
RM
4232
4233
4234 case set_number_at:
25fe55af
RS
4235 p += 4;
4236 continue;
fa9a63c5
RM
4237
4238
4239 case start_memory:
25fe55af 4240 case stop_memory:
505bde11 4241 p += 1;
fa9a63c5
RM
4242 continue;
4243
4244
4245 default:
25fe55af
RS
4246 abort (); /* We have listed all the cases. */
4247 } /* switch *p++ */
fa9a63c5
RM
4248
4249 /* Getting here means we have found the possible starting
25fe55af 4250 characters for one path of the pattern -- and that the empty
7814e705 4251 string does not match. We need not follow this path further. */
01618498 4252 return 0;
fa9a63c5
RM
4253 } /* while p */
4254
01618498
SM
4255 /* We reached the end without matching anything. */
4256 return 1;
4257
f6a3f532
SM
4258} /* analyse_first */
4259\f
4260/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4261 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4262 characters can start a string that matches the pattern. This fastmap
4263 is used by re_search to skip quickly over impossible starting points.
4264
4265 Character codes above (1 << BYTEWIDTH) are not represented in the
4266 fastmap, but the leading codes are represented. Thus, the fastmap
4267 indicates which character sets could start a match.
4268
4269 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4270 area as BUFP->fastmap.
4271
4272 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4273 the pattern buffer.
4274
4275 Returns 0 if we succeed, -2 if an internal error. */
4276
4277int
971de7fb 4278re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4279{
4280 char *fastmap = bufp->fastmap;
4281 int analysis;
4282
4283 assert (fastmap && bufp->buffer);
4284
72af86bd 4285 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4286 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4287
4288 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4289 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4290 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4291 return 0;
4292} /* re_compile_fastmap */
4293\f
4294/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4295 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4296 this memory for recording register information. STARTS and ENDS
4297 must be allocated using the malloc library routine, and must each
4298 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4299
4300 If NUM_REGS == 0, then subsequent matches should allocate their own
4301 register data.
4302
4303 Unless this function is called, the first search or match using
4304 PATTERN_BUFFER will allocate its own register data, without
4305 freeing the old data. */
4306
4307void
971de7fb 4308re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4309{
4310 if (num_regs)
4311 {
4312 bufp->regs_allocated = REGS_REALLOCATE;
4313 regs->num_regs = num_regs;
4314 regs->start = starts;
4315 regs->end = ends;
4316 }
4317 else
4318 {
4319 bufp->regs_allocated = REGS_UNALLOCATED;
4320 regs->num_regs = 0;
4321 regs->start = regs->end = (regoff_t *) 0;
4322 }
4323}
c0f9ea08 4324WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4325\f
7814e705 4326/* Searching routines. */
fa9a63c5
RM
4327
4328/* Like re_search_2, below, but only one string is specified, and
4329 doesn't let you say where to stop matching. */
4330
4331int
971de7fb 4332re_search (struct re_pattern_buffer *bufp, const char *string, int size, int startpos, int range, struct re_registers *regs)
fa9a63c5 4333{
5e69f11e 4334 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4335 regs, size);
4336}
c0f9ea08 4337WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4338
70806df6
KH
/* The three macros below treat `string1' (of length `size1') and
   `string2' (of length `size2') as one virtual string; they rely on
   those variables being in scope at the point of use.  */

/* Start of whichever real string contains offset P of the virtual
   concatenation.  */
#define HEAD_ADDR_VSTRING(P)					\
  ((P) < size1 ? string1 : string2)

/* End of whichever real string contains offset P of the virtual
   concatenation.  */
#define STOP_ADDR_VSTRING(P)					\
  ((P) < size1 ? string1 + size1 : string2 + size2)

/* Address corresponding to offset POS of the virtual
   concatenation.  */
#define POS_ADDR_VSTRING(POS)					\
  ((POS) < size1 ? string1 + (POS) : string2 - size1 + (POS))
fa9a63c5
RM
4350
4351/* Using the compiled pattern in BUFP->buffer, first tries to match the
4352 virtual concatenation of STRING1 and STRING2, starting first at index
4353 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4354
fa9a63c5 4355 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4356
fa9a63c5
RM
4357 RANGE is how far to scan while trying to match. RANGE = 0 means try
4358 only at STARTPOS; in general, the last start tried is STARTPOS +
4359 RANGE.
5e69f11e 4360
fa9a63c5
RM
4361 In REGS, return the indices of the virtual concatenation of STRING1
4362 and STRING2 that matched the entire BUFP->buffer and its contained
4363 subexpressions.
5e69f11e 4364
fa9a63c5
RM
4365 Do not consider matching one past the index STOP in the virtual
4366 concatenation of STRING1 and STRING2.
4367
4368 We return either the position in the strings at which the match was
4369 found, -1 if no match, or -2 if error (such as failure
4370 stack overflow). */
4371
4372int
971de7fb 4373re_search_2 (struct re_pattern_buffer *bufp, const char *str1, int size1, const char *str2, int size2, int startpos, int range, struct re_registers *regs, int stop)
fa9a63c5
RM
4374{
4375 int val;
66f0296e
SM
4376 re_char *string1 = (re_char*) str1;
4377 re_char *string2 = (re_char*) str2;
fa9a63c5 4378 register char *fastmap = bufp->fastmap;
6676cb1c 4379 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4380 int total_size = size1 + size2;
4381 int endpos = startpos + range;
c0f9ea08 4382 boolean anchored_start;
cf9c99bc
KH
4383 /* Nonzero if we are searching multibyte string. */
4384 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4385
fa9a63c5
RM
4386 /* Check for out-of-range STARTPOS. */
4387 if (startpos < 0 || startpos > total_size)
4388 return -1;
5e69f11e 4389
fa9a63c5 4390 /* Fix up RANGE if it might eventually take us outside
34597fa9 4391 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4392 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4393 if (endpos < 0)
4394 range = 0 - startpos;
fa9a63c5
RM
4395 else if (endpos > total_size)
4396 range = total_size - startpos;
4397
4398 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4399 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4400 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4401 {
4402 if (startpos > 0)
4403 return -1;
4404 else
7b140fd7 4405 range = 0;
fa9a63c5
RM
4406 }
4407
ae4788a8
RS
4408#ifdef emacs
4409 /* In a forward search for something that starts with \=.
4410 don't keep searching past point. */
4411 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4412 {
7b140fd7
RS
4413 range = PT_BYTE - BEGV_BYTE - startpos;
4414 if (range < 0)
ae4788a8
RS
4415 return -1;
4416 }
4417#endif /* emacs */
4418
fa9a63c5
RM
4419 /* Update the fastmap now if not correct already. */
4420 if (fastmap && !bufp->fastmap_accurate)
01618498 4421 re_compile_fastmap (bufp);
5e69f11e 4422
c8499ba5 4423 /* See whether the pattern is anchored. */
c0f9ea08 4424 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4425
b18215fc 4426#ifdef emacs
d48cd3f4 4427 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4428 {
99633e97 4429 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4430
4431 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4432 }
b18215fc
RS
4433#endif
4434
fa9a63c5
RM
4435 /* Loop through the string, looking for a place to start matching. */
4436 for (;;)
5e69f11e 4437 {
c8499ba5
RS
4438 /* If the pattern is anchored,
4439 skip quickly past places we cannot match.
4440 We don't bother to treat startpos == 0 specially
4441 because that case doesn't repeat. */
4442 if (anchored_start && startpos > 0)
4443 {
c0f9ea08
SM
4444 if (! ((startpos <= size1 ? string1[startpos - 1]
4445 : string2[startpos - size1 - 1])
4446 == '\n'))
c8499ba5
RS
4447 goto advance;
4448 }
4449
fa9a63c5 4450 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4451 cannot be the start of a match. If the pattern can match the
4452 null string, however, we don't need to skip characters; we want
7814e705 4453 the first null string. */
fa9a63c5
RM
4454 if (fastmap && startpos < total_size && !bufp->can_be_null)
4455 {
66f0296e 4456 register re_char *d;
01618498 4457 register re_wchar_t buf_ch;
e934739e
RS
4458
4459 d = POS_ADDR_VSTRING (startpos);
4460
7814e705 4461 if (range > 0) /* Searching forwards. */
fa9a63c5 4462 {
fa9a63c5
RM
4463 register int lim = 0;
4464 int irange = range;
4465
25fe55af
RS
4466 if (startpos < size1 && startpos + range >= size1)
4467 lim = range - (size1 - startpos);
fa9a63c5 4468
25fe55af
RS
4469 /* Written out as an if-else to avoid testing `translate'
4470 inside the loop. */
28ae27ae
AS
4471 if (RE_TRANSLATE_P (translate))
4472 {
e934739e
RS
4473 if (multibyte)
4474 while (range > lim)
4475 {
4476 int buf_charlen;
4477
62a6e103 4478 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4479 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4480 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4481 break;
4482
4483 range -= buf_charlen;
4484 d += buf_charlen;
4485 }
4486 else
bf216479 4487 while (range > lim)
33c46939 4488 {
cf9c99bc
KH
4489 register re_wchar_t ch, translated;
4490
bf216479 4491 buf_ch = *d;
cf9c99bc
KH
4492 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4493 translated = RE_TRANSLATE (translate, ch);
4494 if (translated != ch
4495 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4496 buf_ch = ch;
6fdd04b0 4497 if (fastmap[buf_ch])
bf216479 4498 break;
33c46939
RS
4499 d++;
4500 range--;
4501 }
e934739e 4502 }
fa9a63c5 4503 else
6fdd04b0
KH
4504 {
4505 if (multibyte)
4506 while (range > lim)
4507 {
4508 int buf_charlen;
fa9a63c5 4509
62a6e103 4510 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4511 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4512 break;
4513 range -= buf_charlen;
4514 d += buf_charlen;
4515 }
e934739e 4516 else
6fdd04b0 4517 while (range > lim && !fastmap[*d])
33c46939
RS
4518 {
4519 d++;
4520 range--;
4521 }
e934739e 4522 }
fa9a63c5
RM
4523 startpos += irange - range;
4524 }
7814e705 4525 else /* Searching backwards. */
fa9a63c5 4526 {
ba5e343c
KH
4527 if (multibyte)
4528 {
62a6e103 4529 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4530 buf_ch = TRANSLATE (buf_ch);
4531 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4532 goto advance;
4533 }
4534 else
4535 {
cf9c99bc
KH
4536 register re_wchar_t ch, translated;
4537
4538 buf_ch = *d;
4539 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4540 translated = TRANSLATE (ch);
4541 if (translated != ch
4542 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4543 buf_ch = ch;
4544 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4545 goto advance;
4546 }
fa9a63c5
RM
4547 }
4548 }
4549
4550 /* If can't match the null string, and that's all we have left, fail. */
4551 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4552 && !bufp->can_be_null)
fa9a63c5
RM
4553 return -1;
4554
4555 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4556 startpos, regs, stop);
fa9a63c5
RM
4557
4558 if (val >= 0)
4559 return startpos;
5e69f11e 4560
fa9a63c5
RM
4561 if (val == -2)
4562 return -2;
4563
4564 advance:
5e69f11e 4565 if (!range)
25fe55af 4566 break;
5e69f11e 4567 else if (range > 0)
25fe55af 4568 {
b18215fc
RS
4569 /* Update STARTPOS to the next character boundary. */
4570 if (multibyte)
4571 {
66f0296e
SM
4572 re_char *p = POS_ADDR_VSTRING (startpos);
4573 re_char *pend = STOP_ADDR_VSTRING (startpos);
aa3830c4 4574 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4575
4576 range -= len;
4577 if (range < 0)
4578 break;
4579 startpos += len;
4580 }
4581 else
4582 {
b560c397
RS
4583 range--;
4584 startpos++;
4585 }
e318085a 4586 }
fa9a63c5 4587 else
25fe55af
RS
4588 {
4589 range++;
4590 startpos--;
b18215fc
RS
4591
4592 /* Update STARTPOS to the previous character boundary. */
4593 if (multibyte)
4594 {
70806df6
KH
4595 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4596 re_char *p0 = p;
4597 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4598
4599 /* Find the head of multibyte form. */
70806df6
KH
4600 PREV_CHAR_BOUNDARY (p, phead);
4601 range += p0 - 1 - p;
4602 if (range > 0)
4603 break;
b18215fc 4604
70806df6 4605 startpos -= p0 - 1 - p;
b18215fc 4606 }
25fe55af 4607 }
fa9a63c5
RM
4608 }
4609 return -1;
4610} /* re_search_2 */
c0f9ea08 4611WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4612\f
4613/* Declarations and macros for re_match_2. */
4614
2d1675e4
SM
4615static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4616 register int len,
4617 RE_TRANSLATE_TYPE translate,
4618 const int multibyte));
fa9a63c5
RM
4619
4620/* This converts PTR, a pointer into one of the search strings `string1'
4621 and `string2' into an offset from the beginning of that string. */
4622#define POINTER_TO_OFFSET(ptr) \
4623 (FIRST_STRING_P (ptr) \
4624 ? ((regoff_t) ((ptr) - string1)) \
4625 : ((regoff_t) ((ptr) - string2 + size1)))
4626
fa9a63c5 4627/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4628 string2 if necessary.
4629 Check re_match_2_internal for a discussion of why end_match_2 might
4630 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4631#define PREFETCH() \
25fe55af 4632 while (d == dend) \
fa9a63c5
RM
4633 { \
4634 /* End of string2 => fail. */ \
25fe55af
RS
4635 if (dend == end_match_2) \
4636 goto fail; \
4bb91c68 4637 /* End of string1 => advance to string2. */ \
25fe55af 4638 d = string2; \
fa9a63c5
RM
4639 dend = end_match_2; \
4640 }
4641
f1ad044f
SM
4642/* Call before fetching a char with *d if you already checked other limits.
4643 This is meant for use in lookahead operations like wordend, etc.,
4644 where we might need to look at parts of the string that might be
4645 outside of the LIMITs (i.e. past `stop'). */
4646#define PREFETCH_NOLIMIT() \
4647 if (d == end1) \
4648 { \
4649 d = string2; \
4650 dend = end_match_2; \
4651 } \
fa9a63c5
RM
4652
4653/* Test if at very beginning or at very end of the virtual concatenation
7814e705 4654 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4655#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4656#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4657
4658
4659/* Test if D points to a character which is word-constituent. We have
4660 two special cases to check for: if past the end of string1, look at
4661 the first character in string2; and if before the beginning of
4662 string2, look at the last character in string1. */
4663#define WORDCHAR_P(d) \
4664 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4665 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4666 == Sword)
4667
9121ca40 4668/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4669
4670/* The comment at case wordbound is the following one, but we no longer
4671 use AT_WORD_BOUNDARY, so that multibyte forms can be supported.
4672
4673 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4674 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4675 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4676 macro and introducing temporary variables works around the bug. */
4677
9121ca40 4678#if 0
fa9a63c5
RM
4679/* Test if the character before D and the one at D differ with respect
4680 to being word-constituent. */
4681#define AT_WORD_BOUNDARY(d) \
4682 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4683 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4684#endif
fa9a63c5
RM
4685
4686/* Free everything we malloc. */
4687#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4688# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4689# define FREE_VARIABLES() \
fa9a63c5
RM
4690 do { \
4691 REGEX_FREE_STACK (fail_stack.stack); \
4692 FREE_VAR (regstart); \
4693 FREE_VAR (regend); \
fa9a63c5
RM
4694 FREE_VAR (best_regstart); \
4695 FREE_VAR (best_regend); \
fa9a63c5
RM
4696 } while (0)
4697#else
0b32bf0e 4698# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4699#endif /* not MATCH_MAY_ALLOCATE */
4700
505bde11
SM
4701\f
4702/* Optimization routines. */
4703
4e8a9132
SM
4704/* If the operation is a match against one or more chars,
4705 return a pointer to the next operation, else return NULL. */
01618498 4706static re_char *
971de7fb 4707skip_one_char (const re_char *p)
4e8a9132
SM
4708{
4709 switch (SWITCH_ENUM_CAST (*p++))
4710 {
4711 case anychar:
4712 break;
177c0ea7 4713
4e8a9132
SM
4714 case exactn:
4715 p += *p + 1;
4716 break;
4717
4718 case charset_not:
4719 case charset:
4720 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4721 {
4722 int mcnt;
4723 p = CHARSET_RANGE_TABLE (p - 1);
4724 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4725 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4726 }
4727 else
4728 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4729 break;
177c0ea7 4730
4e8a9132
SM
4731 case syntaxspec:
4732 case notsyntaxspec:
1fb352e0 4733#ifdef emacs
4e8a9132
SM
4734 case categoryspec:
4735 case notcategoryspec:
4736#endif /* emacs */
4737 p++;
4738 break;
4739
4740 default:
4741 p = NULL;
4742 }
4743 return p;
4744}
4745
4746
505bde11 4747/* Jump over non-matching operations. */
839966f3 4748static re_char *
971de7fb 4749skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4750{
4751 int mcnt;
4752 while (p < pend)
4753 {
4754 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4755 {
4756 case start_memory:
505bde11
SM
4757 case stop_memory:
4758 p += 2; break;
4759 case no_op:
4760 p += 1; break;
4761 case jump:
4762 p += 1;
4763 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4764 p += mcnt;
4765 break;
4766 default:
4767 return p;
4768 }
4769 }
4770 assert (p == pend);
4771 return p;
4772}
4773
4774/* Non-zero if "p1 matches something" implies "p2 fails". */
4775static int
971de7fb 4776mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
505bde11 4777{
4e8a9132 4778 re_opcode_t op2;
2d1675e4 4779 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4780 unsigned char *pend = bufp->buffer + bufp->used;
4781
4e8a9132 4782 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4783 && p2 >= bufp->buffer && p2 <= pend);
4784
4785 /* Skip over open/close-group commands.
4786 If what follows this loop is a ...+ construct,
4787 look at what begins its body, since we will have to
4788 match at least one of that. */
4e8a9132
SM
4789 p2 = skip_noops (p2, pend);
4790 /* The same skip can be done for p1, except that this function
4791 is only used in the case where p1 is a simple match operator. */
4792 /* p1 = skip_noops (p1, pend); */
4793
4794 assert (p1 >= bufp->buffer && p1 < pend
4795 && p2 >= bufp->buffer && p2 <= pend);
4796
4797 op2 = p2 == pend ? succeed : *p2;
4798
4799 switch (SWITCH_ENUM_CAST (op2))
505bde11 4800 {
4e8a9132
SM
4801 case succeed:
4802 case endbuf:
4803 /* If we're at the end of the pattern, we can change. */
4804 if (skip_one_char (p1))
505bde11 4805 {
505bde11
SM
4806 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4807 return 1;
505bde11 4808 }
4e8a9132 4809 break;
177c0ea7 4810
4e8a9132 4811 case endline:
4e8a9132
SM
4812 case exactn:
4813 {
01618498 4814 register re_wchar_t c
4e8a9132 4815 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4816 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4817
4e8a9132
SM
4818 if ((re_opcode_t) *p1 == exactn)
4819 {
62a6e103 4820 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4821 {
4822 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4823 return 1;
4824 }
4825 }
505bde11 4826
4e8a9132
SM
4827 else if ((re_opcode_t) *p1 == charset
4828 || (re_opcode_t) *p1 == charset_not)
4829 {
4830 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4831
4e8a9132
SM
4832 /* Test if C is listed in charset (or charset_not)
4833 at `p1'. */
6fdd04b0 4834 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4835 {
4836 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4837 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4838 not = !not;
4839 }
4840 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4841 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4842
4e8a9132
SM
4843 /* `not' is equal to 1 if c would match, which means
4844 that we can't change to pop_failure_jump. */
4845 if (!not)
4846 {
4847 DEBUG_PRINT1 (" No match => fast loop.\n");
4848 return 1;
4849 }
4850 }
4851 else if ((re_opcode_t) *p1 == anychar
4852 && c == '\n')
4853 {
4854 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4855 return 1;
4856 }
4857 }
4858 break;
505bde11 4859
4e8a9132 4860 case charset:
4e8a9132
SM
4861 {
4862 if ((re_opcode_t) *p1 == exactn)
4863 /* Reuse the code above. */
4864 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4865
505bde11
SM
4866 /* It is hard to list up all the character in charset
4867 P2 if it includes multibyte character. Give up in
4868 such case. */
4869 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4870 {
4871 /* Now, we are sure that P2 has no range table.
4872 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4873 enough. But P1 may have range table, so the
505bde11
SM
4874 size of bitmap table of P1 is extracted by
4875 using macro `CHARSET_BITMAP_SIZE'.
4876
6fdd04b0
KH
4877 In a multibyte case, we know that all the character
4878 listed in P2 is ASCII. In a unibyte case, P1 has only a
4879 bitmap table. So, in both cases, it is enough to test
4880 only the bitmap table of P1. */
505bde11 4881
411e4203 4882 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4883 {
4884 int idx;
4885 /* We win if the charset inside the loop
4886 has no overlap with the one after the loop. */
4887 for (idx = 0;
4888 (idx < (int) p2[1]
4889 && idx < CHARSET_BITMAP_SIZE (p1));
4890 idx++)
4891 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4892 break;
4893
4894 if (idx == p2[1]
4895 || idx == CHARSET_BITMAP_SIZE (p1))
4896 {
4897 DEBUG_PRINT1 (" No match => fast loop.\n");
4898 return 1;
4899 }
4900 }
411e4203 4901 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4902 {
4903 int idx;
4904 /* We win if the charset_not inside the loop lists
7814e705 4905 every character listed in the charset after. */
505bde11
SM
4906 for (idx = 0; idx < (int) p2[1]; idx++)
4907 if (! (p2[2 + idx] == 0
4908 || (idx < CHARSET_BITMAP_SIZE (p1)
4909 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4910 break;
4911
4e8a9132
SM
4912 if (idx == p2[1])
4913 {
4914 DEBUG_PRINT1 (" No match => fast loop.\n");
4915 return 1;
4916 }
4917 }
4918 }
4919 }
609b757a 4920 break;
177c0ea7 4921
411e4203
SM
4922 case charset_not:
4923 switch (SWITCH_ENUM_CAST (*p1))
4924 {
4925 case exactn:
4926 case charset:
4927 /* Reuse the code above. */
4928 return mutually_exclusive_p (bufp, p2, p1);
4929 case charset_not:
4930 /* When we have two charset_not, it's very unlikely that
4931 they don't overlap. The union of the two sets of excluded
4932 chars should cover all possible chars, which, as a matter of
4933 fact, is virtually impossible in multibyte buffers. */
36595814 4934 break;
411e4203
SM
4935 }
4936 break;
4937
4e8a9132 4938 case wordend:
669fa600
SM
4939 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4940 case symend:
4e8a9132 4941 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4942 && (p1[1] == Ssymbol || p1[1] == Sword));
4943 case notsyntaxspec:
4944 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4945
4946 case wordbeg:
669fa600
SM
4947 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4948 case symbeg:
4e8a9132 4949 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4950 && (p1[1] == Ssymbol || p1[1] == Sword));
4951 case syntaxspec:
4952 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4953
4954 case wordbound:
4955 return (((re_opcode_t) *p1 == notsyntaxspec
4956 || (re_opcode_t) *p1 == syntaxspec)
4957 && p1[1] == Sword);
4958
1fb352e0 4959#ifdef emacs
4e8a9132
SM
4960 case categoryspec:
4961 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4962 case notcategoryspec:
4963 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4964#endif /* emacs */
4965
4966 default:
4967 ;
505bde11
SM
4968 }
4969
4970 /* Safe default. */
4971 return 0;
4972}
4973
fa9a63c5
RM
4974\f
4975/* Matching routines. */
4976
25fe55af 4977#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4978/* re_match is like re_match_2 except it takes only a single string. */
4979
4980int
4981re_match (bufp, string, size, pos, regs)
4982 struct re_pattern_buffer *bufp;
4983 const char *string;
4984 int size, pos;
4985 struct re_registers *regs;
4986{
4bb91c68 4987 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4988 pos, regs, size);
fa9a63c5
RM
4989 return result;
4990}
c0f9ea08 4991WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4992#endif /* not emacs */
4993
b18215fc
RS
4994#ifdef emacs
4995/* In Emacs, this is the string or buffer in which we
7814e705 4996 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4997Lisp_Object re_match_object;
4998#endif
fa9a63c5
RM
4999
5000/* re_match_2 matches the compiled pattern in BUFP against
5001 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5002 and SIZE2, respectively). We start matching at POS, and stop
5003 matching at STOP.
5e69f11e 5004
fa9a63c5 5005 If REGS is non-null and the `no_sub' field of BUFP is zero, we
7814e705 5006 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
5007 documentation for exactly how many groups we fill.
5008
5009 We return -1 if no match, -2 if an internal error (such as the
7814e705 5010 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
5011 matched substring. */
5012
5013int
971de7fb 5014re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1, const char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5 5015{
b18215fc 5016 int result;
25fe55af 5017
b18215fc 5018#ifdef emacs
cc9b4df2 5019 int charpos;
d48cd3f4 5020 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 5021 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 5022 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
5023#endif
5024
4bb91c68
SM
5025 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5026 (re_char*) string2, size2,
cc9b4df2 5027 pos, regs, stop);
fa9a63c5
RM
5028 return result;
5029}
c0f9ea08 5030WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 5031
bf216479 5032
fa9a63c5 5033/* This is a separate function so that we can force an alloca cleanup
7814e705 5034 afterwards. */
fa9a63c5 5035static int
971de7fb 5036re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1, int size1, const re_char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5
RM
5037{
5038 /* General temporaries. */
5039 int mcnt;
01618498 5040 size_t reg;
66f0296e 5041 boolean not;
fa9a63c5
RM
5042
5043 /* Just past the end of the corresponding string. */
66f0296e 5044 re_char *end1, *end2;
fa9a63c5
RM
5045
5046 /* Pointers into string1 and string2, just past the last characters in
7814e705 5047 each to consider matching. */
66f0296e 5048 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
5049
5050 /* Where we are in the data, and the end of the current string. */
66f0296e 5051 re_char *d, *dend;
5e69f11e 5052
99633e97
SM
5053 /* Used sometimes to remember where we were before starting matching
5054 an operator so that we can go back in case of failure. This "atomic"
5055 behavior of matching opcodes is indispensable to the correctness
5056 of the on_failure_keep_string_jump optimization. */
5057 re_char *dfail;
5058
fa9a63c5 5059 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5060 re_char *p = bufp->buffer;
5061 re_char *pend = p + bufp->used;
fa9a63c5 5062
25fe55af 5063 /* We use this to map every character in the string. */
6676cb1c 5064 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5065
cf9c99bc 5066 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5067 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5068
cf9c99bc
KH
5069 /* Nonzero if STRING1/STRING2 are multibyte. */
5070 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5071
fa9a63c5
RM
5072 /* Failure point stack. Each place that can handle a failure further
5073 down the line pushes a failure point on this stack. It consists of
505bde11 5074 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5075 the subexpressions we're currently inside, plus the number of such
5076 registers, and, finally, two char *'s. The first char * is where
5077 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5078 scanning the strings. */
5079#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5080 fail_stack_type fail_stack;
5081#endif
5082#ifdef DEBUG
fa9a63c5
RM
5083 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5084#endif
5085
0b32bf0e 5086#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5087 /* This holds the pointer to the failure stack, when
5088 it is allocated relocatably. */
5089 fail_stack_elt_t *failure_stack_ptr;
99633e97 5090#endif
fa9a63c5
RM
5091
5092 /* We fill all the registers internally, independent of what we
7814e705 5093 return, for use in backreferences. The number here includes
fa9a63c5 5094 an element for register zero. */
4bb91c68 5095 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5096
fa9a63c5
RM
5097 /* Information on the contents of registers. These are pointers into
5098 the input strings; they record just what was matched (on this
5099 attempt) by a subexpression part of the pattern, that is, the
5100 regnum-th regstart pointer points to where in the pattern we began
5101 matching and the regnum-th regend points to right after where we
5102 stopped matching the regnum-th subexpression. (The zeroth register
5103 keeps track of what the whole pattern matches.) */
5104#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5105 re_char **regstart, **regend;
fa9a63c5
RM
5106#endif
5107
fa9a63c5 5108 /* The following record the register info as found in the above
5e69f11e 5109 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5110 This happens as we backtrack through the failure points, which in
5111 turn happens only if we have not yet matched the entire string. */
5112 unsigned best_regs_set = false;
5113#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5114 re_char **best_regstart, **best_regend;
fa9a63c5 5115#endif
5e69f11e 5116
fa9a63c5
RM
5117 /* Logically, this is `best_regend[0]'. But we don't want to have to
5118 allocate space for that if we're not allocating space for anything
7814e705 5119 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5120 any of the other register vectors, and it seems rather a kludge to
5121 treat `best_regend' differently than the rest. So we keep track of
5122 the end of the best match so far in a separate variable. We
5123 initialize this to NULL so that when we backtrack the first time
5124 and need to test it, it's not garbage. */
66f0296e 5125 re_char *match_end = NULL;
fa9a63c5 5126
fa9a63c5
RM
5127#ifdef DEBUG
5128 /* Counts the total number of registers pushed. */
5e69f11e 5129 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5130#endif
5131
5132 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5133
fa9a63c5 5134 INIT_FAIL_STACK ();
5e69f11e 5135
fa9a63c5
RM
5136#ifdef MATCH_MAY_ALLOCATE
5137 /* Do not bother to initialize all the register variables if there are
5138 no groups in the pattern, as it takes a fair amount of time. If
5139 there are groups, we include space for register 0 (the whole
5140 pattern), even though we never use it, since it simplifies the
5141 array indexing. We should fix this. */
5142 if (bufp->re_nsub)
5143 {
66f0296e
SM
5144 regstart = REGEX_TALLOC (num_regs, re_char *);
5145 regend = REGEX_TALLOC (num_regs, re_char *);
5146 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5147 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5148
505bde11 5149 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5150 {
5151 FREE_VARIABLES ();
5152 return -2;
5153 }
fa9a63c5
RM
5154 }
5155 else
5156 {
5157 /* We must initialize all our variables to NULL, so that
25fe55af 5158 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5159 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5160 }
5161#endif /* MATCH_MAY_ALLOCATE */
5162
5163 /* The starting position is bogus. */
5164 if (pos < 0 || pos > size1 + size2)
5165 {
5166 FREE_VARIABLES ();
5167 return -1;
5168 }
5e69f11e 5169
fa9a63c5
RM
5170 /* Initialize subexpression text positions to -1 to mark ones that no
5171 start_memory/stop_memory has been seen for. Also initialize the
5172 register information struct. */
01618498
SM
5173 for (reg = 1; reg < num_regs; reg++)
5174 regstart[reg] = regend[reg] = NULL;
99633e97 5175
fa9a63c5 5176 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5177 `string1' is null. */
fa9a63c5
RM
5178 if (size2 == 0 && string1 != NULL)
5179 {
5180 string2 = string1;
5181 size2 = size1;
5182 string1 = 0;
5183 size1 = 0;
5184 }
5185 end1 = string1 + size1;
5186 end2 = string2 + size2;
5187
5e69f11e 5188 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5189 `dend' is the end of the input string that `d' points within. `d'
5190 is advanced into the following input string whenever necessary, but
5191 this happens before fetching; therefore, at the beginning of the
5192 loop, `d' can be pointing at the end of a string, but it cannot
5193 equal `string2'. */
419d1c74 5194 if (pos >= size1)
fa9a63c5 5195 {
419d1c74
SM
5196 /* Only match within string2. */
5197 d = string2 + pos - size1;
5198 dend = end_match_2 = string2 + stop - size1;
5199 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5200 }
5201 else
5202 {
f1ad044f 5203 if (stop < size1)
419d1c74
SM
5204 {
5205 /* Only match within string1. */
5206 end_match_1 = string1 + stop;
5207 /* BEWARE!
5208 When we reach end_match_1, PREFETCH normally switches to string2.
5209 But in the present case, this means that just doing a PREFETCH
5210 makes us jump from `stop' to `gap' within the string.
5211 What we really want here is for the search to stop as
5212 soon as we hit end_match_1. That's why we set end_match_2
5213 to end_match_1 (since PREFETCH fails as soon as we hit
5214 end_match_2). */
5215 end_match_2 = end_match_1;
5216 }
5217 else
f1ad044f
SM
5218 { /* It's important to use this code when stop == size so that
5219 moving `d' from end1 to string2 will not prevent the d == dend
5220 check from catching the end of string. */
419d1c74
SM
5221 end_match_1 = end1;
5222 end_match_2 = string2 + stop - size1;
5223 }
5224 d = string1 + pos;
5225 dend = end_match_1;
fa9a63c5
RM
5226 }
5227
5228 DEBUG_PRINT1 ("The compiled pattern is: ");
5229 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5230 DEBUG_PRINT1 ("The string to match is: `");
5231 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5232 DEBUG_PRINT1 ("'\n");
5e69f11e 5233
7814e705 5234 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5235 function if the match is complete, or it drops through if the match
5236 fails at this starting point in the input data. */
5237 for (;;)
5238 {
505bde11 5239 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5240
5241 if (p == pend)
5242 { /* End of pattern means we might have succeeded. */
25fe55af 5243 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5244
fa9a63c5 5245 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5246 longest match, try backtracking. */
5247 if (d != end_match_2)
fa9a63c5
RM
5248 {
5249 /* 1 if this match ends in the same string (string1 or string2)
5250 as the best previous match. */
5e69f11e 5251 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5252 == FIRST_STRING_P (d));
fa9a63c5
RM
5253 /* 1 if this match is the best seen so far. */
5254 boolean best_match_p;
5255
5256 /* AIX compiler got confused when this was combined
7814e705 5257 with the previous declaration. */
fa9a63c5
RM
5258 if (same_str_p)
5259 best_match_p = d > match_end;
5260 else
99633e97 5261 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5262
25fe55af
RS
5263 DEBUG_PRINT1 ("backtracking.\n");
5264
5265 if (!FAIL_STACK_EMPTY ())
5266 { /* More failure points to try. */
5267
5268 /* If exceeds best match so far, save it. */
5269 if (!best_regs_set || best_match_p)
5270 {
5271 best_regs_set = true;
5272 match_end = d;
5273
5274 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5275
01618498 5276 for (reg = 1; reg < num_regs; reg++)
25fe55af 5277 {
01618498
SM
5278 best_regstart[reg] = regstart[reg];
5279 best_regend[reg] = regend[reg];
25fe55af
RS
5280 }
5281 }
5282 goto fail;
5283 }
5284
5285 /* If no failure points, don't restore garbage. And if
5286 last match is real best match, don't restore second
5287 best one. */
5288 else if (best_regs_set && !best_match_p)
5289 {
5290 restore_best_regs:
5291 /* Restore best match. It may happen that `dend ==
5292 end_match_1' while the restored d is in string2.
5293 For example, the pattern `x.*y.*z' against the
5294 strings `x-' and `y-z-', if the two strings are
7814e705 5295 not consecutive in memory. */
25fe55af
RS
5296 DEBUG_PRINT1 ("Restoring best registers.\n");
5297
5298 d = match_end;
5299 dend = ((d >= string1 && d <= end1)
5300 ? end_match_1 : end_match_2);
fa9a63c5 5301
01618498 5302 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5303 {
01618498
SM
5304 regstart[reg] = best_regstart[reg];
5305 regend[reg] = best_regend[reg];
fa9a63c5 5306 }
25fe55af
RS
5307 }
5308 } /* d != end_match_2 */
fa9a63c5
RM
5309
5310 succeed_label:
25fe55af 5311 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5312
25fe55af
RS
5313 /* If caller wants register contents data back, do it. */
5314 if (regs && !bufp->no_sub)
fa9a63c5 5315 {
25fe55af
RS
5316 /* Have the register data arrays been allocated? */
5317 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5318 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5319 extra element beyond `num_regs' for the `-1' marker
5320 GNU code uses. */
5321 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5322 regs->start = TALLOC (regs->num_regs, regoff_t);
5323 regs->end = TALLOC (regs->num_regs, regoff_t);
5324 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5325 {
5326 FREE_VARIABLES ();
5327 return -2;
5328 }
25fe55af
RS
5329 bufp->regs_allocated = REGS_REALLOCATE;
5330 }
5331 else if (bufp->regs_allocated == REGS_REALLOCATE)
5332 { /* Yes. If we need more elements than were already
5333 allocated, reallocate them. If we need fewer, just
5334 leave it alone. */
5335 if (regs->num_regs < num_regs + 1)
5336 {
5337 regs->num_regs = num_regs + 1;
5338 RETALLOC (regs->start, regs->num_regs, regoff_t);
5339 RETALLOC (regs->end, regs->num_regs, regoff_t);
5340 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5341 {
5342 FREE_VARIABLES ();
5343 return -2;
5344 }
25fe55af
RS
5345 }
5346 }
5347 else
fa9a63c5
RM
5348 {
5349 /* These braces fend off a "empty body in an else-statement"
7814e705 5350 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5351 assert (bufp->regs_allocated == REGS_FIXED);
5352 }
5353
25fe55af
RS
5354 /* Convert the pointer data in `regstart' and `regend' to
5355 indices. Register zero has to be set differently,
5356 since we haven't kept track of any info for it. */
5357 if (regs->num_regs > 0)
5358 {
5359 regs->start[0] = pos;
99633e97 5360 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5361 }
5e69f11e 5362
25fe55af
RS
5363 /* Go through the first `min (num_regs, regs->num_regs)'
5364 registers, since that is all we initialized. */
01618498 5365 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5366 {
01618498
SM
5367 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5368 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5369 else
5370 {
01618498
SM
5371 regs->start[reg]
5372 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5373 regs->end[reg]
5374 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5375 }
fa9a63c5 5376 }
5e69f11e 5377
25fe55af
RS
5378 /* If the regs structure we return has more elements than
5379 were in the pattern, set the extra elements to -1. If
5380 we (re)allocated the registers, this is the case,
5381 because we always allocate enough to have at least one
7814e705 5382 -1 at the end. */
01618498
SM
5383 for (reg = num_regs; reg < regs->num_regs; reg++)
5384 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5385 } /* regs && !bufp->no_sub */
5386
25fe55af
RS
5387 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5388 nfailure_points_pushed, nfailure_points_popped,
5389 nfailure_points_pushed - nfailure_points_popped);
5390 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5391
99633e97 5392 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5393
25fe55af 5394 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5395
25fe55af
RS
5396 FREE_VARIABLES ();
5397 return mcnt;
5398 }
fa9a63c5 5399
7814e705 5400 /* Otherwise match next pattern command. */
fa9a63c5
RM
5401 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5402 {
25fe55af
RS
5403 /* Ignore these. Used to ignore the n of succeed_n's which
5404 currently have n == 0. */
5405 case no_op:
5406 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5407 break;
fa9a63c5
RM
5408
5409 case succeed:
25fe55af 5410 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5411 goto succeed_label;
5412
7814e705 5413 /* Match the next n pattern characters exactly. The following
25fe55af 5414 byte in the pattern defines n, and the n bytes after that
7814e705 5415 are the characters to match. */
fa9a63c5
RM
5416 case exactn:
5417 mcnt = *p++;
25fe55af 5418 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5419
99633e97
SM
5420 /* Remember the start point to rollback upon failure. */
5421 dfail = d;
5422
6fdd04b0 5423#ifndef emacs
25fe55af
RS
5424 /* This is written out as an if-else so we don't waste time
5425 testing `translate' inside the loop. */
28703c16 5426 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5427 do
5428 {
5429 PREFETCH ();
5430 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5431 {
6fdd04b0
KH
5432 d = dfail;
5433 goto fail;
e934739e 5434 }
6fdd04b0
KH
5435 d++;
5436 }
5437 while (--mcnt);
fa9a63c5 5438 else
6fdd04b0
KH
5439 do
5440 {
5441 PREFETCH ();
5442 if (*d++ != *p++)
bf216479 5443 {
6fdd04b0
KH
5444 d = dfail;
5445 goto fail;
bf216479 5446 }
6fdd04b0
KH
5447 }
5448 while (--mcnt);
5449#else /* emacs */
5450 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5451 if (target_multibyte)
6fdd04b0
KH
5452 do
5453 {
5454 int pat_charlen, buf_charlen;
cf9c99bc 5455 int pat_ch, buf_ch;
e934739e 5456
6fdd04b0 5457 PREFETCH ();
cf9c99bc 5458 if (multibyte)
62a6e103 5459 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5460 else
5461 {
5462 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5463 pat_charlen = 1;
5464 }
62a6e103 5465 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5466
6fdd04b0 5467 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5468 {
6fdd04b0
KH
5469 d = dfail;
5470 goto fail;
e934739e 5471 }
bf216479 5472
6fdd04b0
KH
5473 p += pat_charlen;
5474 d += buf_charlen;
5475 mcnt -= pat_charlen;
5476 }
5477 while (mcnt > 0);
fa9a63c5 5478 else
6fdd04b0
KH
5479 do
5480 {
cf9c99bc
KH
5481 int pat_charlen, buf_charlen;
5482 int pat_ch, buf_ch;
bf216479 5483
6fdd04b0 5484 PREFETCH ();
cf9c99bc
KH
5485 if (multibyte)
5486 {
62a6e103 5487 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5488 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5489 }
5490 else
5491 {
5492 pat_ch = *p;
5493 pat_charlen = 1;
5494 }
5495 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5496 if (! CHAR_BYTE8_P (buf_ch))
5497 {
5498 buf_ch = TRANSLATE (buf_ch);
5499 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5500 if (buf_ch < 0)
5501 buf_ch = *d;
5502 }
0e2501ed
AS
5503 else
5504 buf_ch = *d;
cf9c99bc 5505 if (buf_ch != pat_ch)
6fdd04b0
KH
5506 {
5507 d = dfail;
5508 goto fail;
bf216479 5509 }
cf9c99bc
KH
5510 p += pat_charlen;
5511 d++;
6fdd04b0
KH
5512 }
5513 while (--mcnt);
5514#endif
25fe55af 5515 break;
fa9a63c5
RM
5516
5517
25fe55af 5518 /* Match any character except possibly a newline or a null. */
fa9a63c5 5519 case anychar:
e934739e
RS
5520 {
5521 int buf_charlen;
01618498 5522 re_wchar_t buf_ch;
fa9a63c5 5523
e934739e 5524 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5525
e934739e 5526 PREFETCH ();
62a6e103 5527 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5528 target_multibyte);
e934739e
RS
5529 buf_ch = TRANSLATE (buf_ch);
5530
5531 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5532 && buf_ch == '\n')
5533 || ((bufp->syntax & RE_DOT_NOT_NULL)
5534 && buf_ch == '\000'))
5535 goto fail;
5536
e934739e
RS
5537 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5538 d += buf_charlen;
5539 }
fa9a63c5
RM
5540 break;
5541
5542
5543 case charset:
5544 case charset_not:
5545 {
b18215fc 5546 register unsigned int c;
fa9a63c5 5547 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5548 int len;
5549
5550 /* Start of actual range_table, or end of bitmap if there is no
5551 range table. */
01618498 5552 re_char *range_table;
b18215fc 5553
96cc36cc 5554 /* Nonzero if there is a range table. */
b18215fc
RS
5555 int range_table_exists;
5556
96cc36cc
RS
5557 /* Number of ranges of range table. This is not included
5558 in the initial byte-length of the command. */
5559 int count = 0;
fa9a63c5 5560
f5020181
AS
5561 /* Whether matching against a unibyte character. */
5562 boolean unibyte_char = false;
5563
25fe55af 5564 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5565
b18215fc 5566 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5567
b18215fc 5568 if (range_table_exists)
96cc36cc
RS
5569 {
5570 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5571 EXTRACT_NUMBER_AND_INCR (count, range_table);
5572 }
b18215fc 5573
2d1675e4 5574 PREFETCH ();
62a6e103 5575 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5576 if (target_multibyte)
5577 {
5578 int c1;
b18215fc 5579
cf9c99bc
KH
5580 c = TRANSLATE (c);
5581 c1 = RE_CHAR_TO_UNIBYTE (c);
5582 if (c1 >= 0)
f5020181
AS
5583 {
5584 unibyte_char = true;
5585 c = c1;
5586 }
cf9c99bc
KH
5587 }
5588 else
5589 {
5590 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5591
5592 if (! CHAR_BYTE8_P (c1))
5593 {
5594 c1 = TRANSLATE (c1);
5595 c1 = RE_CHAR_TO_UNIBYTE (c1);
5596 if (c1 >= 0)
f5020181
AS
5597 {
5598 unibyte_char = true;
5599 c = c1;
5600 }
cf9c99bc 5601 }
0b8be006
AS
5602 else
5603 unibyte_char = true;
cf9c99bc
KH
5604 }
5605
f5020181 5606 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5607 { /* Lookup bitmap. */
b18215fc
RS
5608 /* Cast to `unsigned' instead of `unsigned char' in
5609 case the bit list is a full 32 bytes long. */
5610 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5611 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5612 not = !not;
b18215fc 5613 }
96cc36cc 5614#ifdef emacs
b18215fc 5615 else if (range_table_exists)
96cc36cc
RS
5616 {
5617 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5618
14473664
SM
5619 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5620 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5621 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5622 | (class_bits & BIT_SPACE && ISSPACE (c))
5623 | (class_bits & BIT_UPPER && ISUPPER (c))
5624 | (class_bits & BIT_WORD && ISWORD (c)))
5625 not = !not;
5626 else
5627 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5628 }
5629#endif /* emacs */
fa9a63c5 5630
96cc36cc
RS
5631 if (range_table_exists)
5632 p = CHARSET_RANGE_TABLE_END (range_table, count);
5633 else
5634 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5635
5636 if (!not) goto fail;
5e69f11e 5637
b18215fc 5638 d += len;
fa9a63c5
RM
5639 break;
5640 }
5641
5642
25fe55af 5643 /* The beginning of a group is represented by start_memory.
505bde11 5644 The argument is the register number. The text
25fe55af 5645 matched within the group is recorded (in the internal
7814e705 5646 registers data structure) under the register number. */
25fe55af 5647 case start_memory:
505bde11
SM
5648 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5649
5650 /* In case we need to undo this operation (via backtracking). */
5651 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5652
25fe55af 5653 regstart[*p] = d;
4bb91c68 5654 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5655 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5656
25fe55af 5657 /* Move past the register number and inner group count. */
505bde11 5658 p += 1;
25fe55af 5659 break;
fa9a63c5
RM
5660
5661
25fe55af 5662 /* The stop_memory opcode represents the end of a group. Its
505bde11 5663 argument is the same as start_memory's: the register number. */
fa9a63c5 5664 case stop_memory:
505bde11
SM
5665 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5666
5667 assert (!REG_UNSET (regstart[*p]));
5668 /* Strictly speaking, there should be code such as:
177c0ea7 5669
0b32bf0e 5670 assert (REG_UNSET (regend[*p]));
505bde11
SM
5671 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5672
5673 But the only info to be pushed is regend[*p] and it is known to
5674 be UNSET, so there really isn't anything to push.
5675 Not pushing anything, on the other hand deprives us from the
5676 guarantee that regend[*p] is UNSET since undoing this operation
5677 will not reset its value properly. This is not important since
5678 the value will only be read on the next start_memory or at
5679 the very end and both events can only happen if this stop_memory
5680 is *not* undone. */
fa9a63c5 5681
25fe55af 5682 regend[*p] = d;
fa9a63c5
RM
5683 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5684
25fe55af 5685 /* Move past the register number and the inner group count. */
505bde11 5686 p += 1;
25fe55af 5687 break;
fa9a63c5
RM
5688
5689
5690 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5691 followed by the numeric value of <digit> as the register number. */
5692 case duplicate:
fa9a63c5 5693 {
66f0296e 5694 register re_char *d2, *dend2;
7814e705 5695 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5696 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5697
7814e705 5698 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5699 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5700 goto fail;
5e69f11e 5701
7814e705 5702 /* Where in input to try to start matching. */
25fe55af 5703 d2 = regstart[regno];
5e69f11e 5704
99633e97
SM
5705 /* Remember the start point to rollback upon failure. */
5706 dfail = d;
5707
25fe55af
RS
5708 /* Where to stop matching; if both the place to start and
5709 the place to stop matching are in the same string, then
5710 set to the place to stop, otherwise, for now have to use
5711 the end of the first string. */
fa9a63c5 5712
25fe55af 5713 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5714 == FIRST_STRING_P (regend[regno]))
5715 ? regend[regno] : end_match_1);
5716 for (;;)
5717 {
5718 /* If necessary, advance to next segment in register
25fe55af 5719 contents. */
fa9a63c5
RM
5720 while (d2 == dend2)
5721 {
5722 if (dend2 == end_match_2) break;
5723 if (dend2 == regend[regno]) break;
5724
25fe55af
RS
5725 /* End of string1 => advance to string2. */
5726 d2 = string2;
5727 dend2 = regend[regno];
fa9a63c5
RM
5728 }
5729 /* At end of register contents => success */
5730 if (d2 == dend2) break;
5731
5732 /* If necessary, advance to next segment in data. */
5733 PREFETCH ();
5734
5735 /* How many characters left in this segment to match. */
5736 mcnt = dend - d;
5e69f11e 5737
fa9a63c5 5738 /* Want how many consecutive characters we can match in
25fe55af
RS
5739 one shot, so, if necessary, adjust the count. */
5740 if (mcnt > dend2 - d2)
fa9a63c5 5741 mcnt = dend2 - d2;
5e69f11e 5742
fa9a63c5 5743 /* Compare that many; failure if mismatch, else move
25fe55af 5744 past them. */
28703c16 5745 if (RE_TRANSLATE_P (translate)
02cb78b5 5746 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5747 : memcmp (d, d2, mcnt))
99633e97
SM
5748 {
5749 d = dfail;
5750 goto fail;
5751 }
fa9a63c5 5752 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5753 }
5754 }
5755 break;
5756
5757
25fe55af 5758 /* begline matches the empty string at the beginning of the string
c0f9ea08 5759 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5760 case begline:
25fe55af 5761 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5762
25fe55af
RS
5763 if (AT_STRINGS_BEG (d))
5764 {
5765 if (!bufp->not_bol) break;
5766 }
419d1c74 5767 else
25fe55af 5768 {
bf216479 5769 unsigned c;
419d1c74 5770 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5771 if (c == '\n')
419d1c74 5772 break;
25fe55af
RS
5773 }
5774 /* In all other cases, we fail. */
5775 goto fail;
fa9a63c5
RM
5776
5777
25fe55af 5778 /* endline is the dual of begline. */
fa9a63c5 5779 case endline:
25fe55af 5780 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5781
25fe55af
RS
5782 if (AT_STRINGS_END (d))
5783 {
5784 if (!bufp->not_eol) break;
5785 }
f1ad044f 5786 else
25fe55af 5787 {
f1ad044f 5788 PREFETCH_NOLIMIT ();
c0f9ea08 5789 if (*d == '\n')
f1ad044f 5790 break;
25fe55af
RS
5791 }
5792 goto fail;
fa9a63c5
RM
5793
5794
5795 /* Match at the very beginning of the data. */
25fe55af
RS
5796 case begbuf:
5797 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5798 if (AT_STRINGS_BEG (d))
5799 break;
5800 goto fail;
fa9a63c5
RM
5801
5802
5803 /* Match at the very end of the data. */
25fe55af
RS
5804 case endbuf:
5805 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5806 if (AT_STRINGS_END (d))
5807 break;
25fe55af 5808 goto fail;
5e69f11e 5809
5e69f11e 5810
25fe55af
RS
5811 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5812 pushes NULL as the value for the string on the stack. Then
505bde11 5813 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5814 string, instead of restoring it. To see why, consider
7814e705 5815 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5816 then the . fails against the \n. But the next thing we want
5817 to do is match the \n against the \n; if we restored the
5818 string value, we would be back at the foo.
5819
5820 Because this is used only in specific cases, we don't need to
5821 check all the things that `on_failure_jump' does, to make
5822 sure the right things get saved on the stack. Hence we don't
5823 share its code. The only reason to push anything on the
5824 stack at all is that otherwise we would have to change
5825 `anychar's code to do something besides goto fail in this
5826 case; that seems worse than this. */
5827 case on_failure_keep_string_jump:
505bde11
SM
5828 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5829 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5830 mcnt, p + mcnt);
fa9a63c5 5831
505bde11
SM
5832 PUSH_FAILURE_POINT (p - 3, NULL);
5833 break;
5834
0683b6fa
SM
5835 /* A nasty loop is introduced by the non-greedy *? and +?.
5836 With such loops, the stack only ever contains one failure point
5837 at a time, so that a plain on_failure_jump_loop kind of
5838 cycle detection cannot work. Worse yet, such a detection
5839 can not only fail to detect a cycle, but it can also wrongly
5840 detect a cycle (between different instantiations of the same
6df42991 5841 loop).
0683b6fa
SM
5842 So the method used for those nasty loops is a little different:
5843 We use a special cycle-detection-stack-frame which is pushed
5844 when the on_failure_jump_nastyloop failure-point is *popped*.
5845 This special frame thus marks the beginning of one iteration
5846 through the loop and we can hence easily check right here
5847 whether something matched between the beginning and the end of
5848 the loop. */
5849 case on_failure_jump_nastyloop:
5850 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5851 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5852 mcnt, p + mcnt);
5853
5854 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5855 {
5856 int cycle = 0;
5857 CHECK_INFINITE_LOOP (p - 4, d);
5858 if (!cycle)
5859 /* If there's a cycle, just continue without pushing
5860 this failure point. The failure point is the "try again"
5861 option, which shouldn't be tried.
5862 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5863 PUSH_FAILURE_POINT (p - 3, d);
5864 }
0683b6fa
SM
5865 break;
5866
4e8a9132
SM
5867 /* Simple loop detecting on_failure_jump: just check on the
5868 failure stack if the same spot was already hit earlier. */
505bde11
SM
5869 case on_failure_jump_loop:
5870 on_failure:
5871 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5872 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5873 mcnt, p + mcnt);
6df42991
SM
5874 {
5875 int cycle = 0;
5876 CHECK_INFINITE_LOOP (p - 3, d);
5877 if (cycle)
5878 /* If there's a cycle, get out of the loop, as if the matching
5879 had failed. We used to just `goto fail' here, but that was
5880 aborting the search a bit too early: we want to keep the
5881 empty-loop-match and keep matching after the loop.
5882 We want (x?)*y\1z to match both xxyz and xxyxz. */
5883 p += mcnt;
5884 else
5885 PUSH_FAILURE_POINT (p - 3, d);
5886 }
25fe55af 5887 break;
fa9a63c5
RM
5888
5889
5890 /* Uses of on_failure_jump:
5e69f11e 5891
25fe55af
RS
5892 Each alternative starts with an on_failure_jump that points
5893 to the beginning of the next alternative. Each alternative
5894 except the last ends with a jump that in effect jumps past
5895 the rest of the alternatives. (They really jump to the
5896 ending jump of the following alternative, because tensioning
5897 these jumps is a hassle.)
fa9a63c5 5898
25fe55af
RS
5899 Repeats start with an on_failure_jump that points past both
5900 the repetition text and either the following jump or
5901 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5902 case on_failure_jump:
25fe55af 5903 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5904 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5905 mcnt, p + mcnt);
25fe55af 5906
505bde11 5907 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5908 break;
5909
4e8a9132 5910 /* This operation is used for greedy *.
505bde11
SM
5911 Compare the beginning of the repeat with what in the
5912 pattern follows its end. If we can establish that there
5913 is nothing that they would both match, i.e., that we
5914 would have to backtrack because of (as in, e.g., `a*a')
5915 then we can use a non-backtracking loop based on
4e8a9132 5916 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5917 case on_failure_jump_smart:
25fe55af 5918 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5919 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5920 mcnt, p + mcnt);
25fe55af 5921 {
01618498 5922 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5923 /* Here, we discard `const', making re_match non-reentrant. */
5924 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5925 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5926
505bde11
SM
5927 p -= 3; /* Reset so that we will re-execute the
5928 instruction once it's been changed. */
fa9a63c5 5929
4e8a9132
SM
5930 EXTRACT_NUMBER (mcnt, p2 - 2);
5931
5932 /* Ensure this is a indeed the trivial kind of loop
5933 we are expecting. */
5934 assert (skip_one_char (p1) == p2 - 3);
5935 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5936 DEBUG_STATEMENT (debug += 2);
505bde11 5937 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5938 {
505bde11 5939 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5940 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5941 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5942 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5943 }
505bde11 5944 else
fa9a63c5 5945 {
505bde11
SM
5946 /* Default to a safe `on_failure_jump' loop. */
5947 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5948 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5949 }
99633e97 5950 DEBUG_STATEMENT (debug -= 2);
25fe55af 5951 }
505bde11 5952 break;
25fe55af
RS
5953
5954 /* Unconditionally jump (without popping any failure points). */
5955 case jump:
fa9a63c5 5956 unconditional_jump:
5b370c2b 5957 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5958 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5959 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5960 p += mcnt; /* Do the jump. */
505bde11 5961 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5962 break;
5963
5964
25fe55af
RS
5965 /* Have to succeed matching what follows at least n times.
5966 After that, handle like `on_failure_jump'. */
5967 case succeed_n:
01618498 5968 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5969 EXTRACT_NUMBER (mcnt, p + 2);
5970 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5971
dc1e502d
SM
5972 /* Originally, mcnt is how many times we HAVE to succeed. */
5973 if (mcnt != 0)
25fe55af 5974 {
6dcf2d0e
SM
5975 /* Here, we discard `const', making re_match non-reentrant. */
5976 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5977 mcnt--;
01618498
SM
5978 p += 4;
5979 PUSH_NUMBER (p2, mcnt);
25fe55af 5980 }
dc1e502d
SM
5981 else
5982 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5983 goto on_failure;
25fe55af
RS
5984 break;
5985
5986 case jump_n:
01618498 5987 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5988 EXTRACT_NUMBER (mcnt, p + 2);
5989 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5990
5991 /* Originally, this is how many times we CAN jump. */
dc1e502d 5992 if (mcnt != 0)
25fe55af 5993 {
6dcf2d0e
SM
5994 /* Here, we discard `const', making re_match non-reentrant. */
5995 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5996 mcnt--;
01618498 5997 PUSH_NUMBER (p2, mcnt);
dc1e502d 5998 goto unconditional_jump;
25fe55af
RS
5999 }
6000 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
6001 else
6002 p += 4;
25fe55af 6003 break;
5e69f11e 6004
fa9a63c5
RM
6005 case set_number_at:
6006 {
01618498 6007 unsigned char *p2; /* Location of the counter. */
25fe55af 6008 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 6009
25fe55af 6010 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
6011 /* Here, we discard `const', making re_match non-reentrant. */
6012 p2 = (unsigned char*) p + mcnt;
01618498 6013 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 6014 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
6015 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6016 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
6017 break;
6018 }
9121ca40
KH
6019
6020 case wordbound:
66f0296e
SM
6021 case notwordbound:
6022 not = (re_opcode_t) *(p - 1) == notwordbound;
6023 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 6024
99633e97 6025 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 6026
b18215fc 6027 /* Case 1: D is at the beginning or the end of string. */
9121ca40 6028 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 6029 not = !not;
b18215fc
RS
6030 else
6031 {
6032 /* C1 is the character before D, S1 is the syntax of C1, C2
6033 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6034 re_wchar_t c1, c2;
6035 int s1, s2;
bf216479 6036 int dummy;
b18215fc 6037#ifdef emacs
2d1675e4
SM
6038 int offset = PTR_TO_OFFSET (d - 1);
6039 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 6040 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6041#endif
66f0296e 6042 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6043 s1 = SYNTAX (c1);
6044#ifdef emacs
5d967c7a 6045 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 6046#endif
f1ad044f 6047 PREFETCH_NOLIMIT ();
6fdd04b0 6048 GET_CHAR_AFTER (c2, d, dummy);
b18215fc
RS
6049 s2 = SYNTAX (c2);
6050
6051 if (/* Case 2: Only one of S1 and S2 is Sword. */
6052 ((s1 == Sword) != (s2 == Sword))
6053 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 6054 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 6055 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
6056 not = !not;
6057 }
6058 if (not)
9121ca40 6059 break;
b18215fc 6060 else
9121ca40 6061 goto fail;
fa9a63c5
RM
6062
6063 case wordbeg:
25fe55af 6064 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6065
b18215fc
RS
6066 /* We FAIL in one of the following cases: */
6067
7814e705 6068 /* Case 1: D is at the end of string. */
b18215fc 6069 if (AT_STRINGS_END (d))
99633e97 6070 goto fail;
b18215fc
RS
6071 else
6072 {
6073 /* C1 is the character before D, S1 is the syntax of C1, C2
6074 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6075 re_wchar_t c1, c2;
6076 int s1, s2;
bf216479 6077 int dummy;
fa9a63c5 6078#ifdef emacs
2d1675e4
SM
6079 int offset = PTR_TO_OFFSET (d);
6080 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6081 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6082#endif
99633e97 6083 PREFETCH ();
6fdd04b0 6084 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6085 s2 = SYNTAX (c2);
177c0ea7 6086
b18215fc
RS
6087 /* Case 2: S2 is not Sword. */
6088 if (s2 != Sword)
6089 goto fail;
6090
6091 /* Case 3: D is not at the beginning of string ... */
6092 if (!AT_STRINGS_BEG (d))
6093 {
6094 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6095#ifdef emacs
5d967c7a 6096 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6097#endif
b18215fc
RS
6098 s1 = SYNTAX (c1);
6099
6100 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6101 returns 0. */
b18215fc
RS
6102 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6103 goto fail;
6104 }
6105 }
e318085a
RS
6106 break;
6107
b18215fc 6108 case wordend:
25fe55af 6109 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6110
6111 /* We FAIL in one of the following cases: */
6112
6113 /* Case 1: D is at the beginning of string. */
6114 if (AT_STRINGS_BEG (d))
e318085a 6115 goto fail;
b18215fc
RS
6116 else
6117 {
6118 /* C1 is the character before D, S1 is the syntax of C1, C2
6119 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6120 re_wchar_t c1, c2;
6121 int s1, s2;
bf216479 6122 int dummy;
5d967c7a 6123#ifdef emacs
2d1675e4
SM
6124 int offset = PTR_TO_OFFSET (d) - 1;
6125 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6126 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6127#endif
99633e97 6128 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6129 s1 = SYNTAX (c1);
6130
6131 /* Case 2: S1 is not Sword. */
6132 if (s1 != Sword)
6133 goto fail;
6134
6135 /* Case 3: D is not at the end of string ... */
6136 if (!AT_STRINGS_END (d))
6137 {
f1ad044f 6138 PREFETCH_NOLIMIT ();
6fdd04b0 6139 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6140#ifdef emacs
6141 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6142#endif
b18215fc
RS
6143 s2 = SYNTAX (c2);
6144
6145 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6146 returns 0. */
b18215fc 6147 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6148 goto fail;
b18215fc
RS
6149 }
6150 }
e318085a
RS
6151 break;
6152
669fa600
SM
6153 case symbeg:
6154 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6155
6156 /* We FAIL in one of the following cases: */
6157
7814e705 6158 /* Case 1: D is at the end of string. */
669fa600
SM
6159 if (AT_STRINGS_END (d))
6160 goto fail;
6161 else
6162 {
6163 /* C1 is the character before D, S1 is the syntax of C1, C2
6164 is the character at D, and S2 is the syntax of C2. */
6165 re_wchar_t c1, c2;
6166 int s1, s2;
6167#ifdef emacs
6168 int offset = PTR_TO_OFFSET (d);
6169 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6170 UPDATE_SYNTAX_TABLE (charpos);
6171#endif
6172 PREFETCH ();
62a6e103 6173 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6174 s2 = SYNTAX (c2);
7814e705 6175
669fa600
SM
6176 /* Case 2: S2 is neither Sword nor Ssymbol. */
6177 if (s2 != Sword && s2 != Ssymbol)
6178 goto fail;
6179
6180 /* Case 3: D is not at the beginning of string ... */
6181 if (!AT_STRINGS_BEG (d))
6182 {
6183 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6184#ifdef emacs
6185 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6186#endif
6187 s1 = SYNTAX (c1);
6188
6189 /* ... and S1 is Sword or Ssymbol. */
6190 if (s1 == Sword || s1 == Ssymbol)
6191 goto fail;
6192 }
6193 }
6194 break;
6195
6196 case symend:
6197 DEBUG_PRINT1 ("EXECUTING symend.\n");
6198
6199 /* We FAIL in one of the following cases: */
6200
6201 /* Case 1: D is at the beginning of string. */
6202 if (AT_STRINGS_BEG (d))
6203 goto fail;
6204 else
6205 {
6206 /* C1 is the character before D, S1 is the syntax of C1, C2
6207 is the character at D, and S2 is the syntax of C2. */
6208 re_wchar_t c1, c2;
6209 int s1, s2;
6210#ifdef emacs
6211 int offset = PTR_TO_OFFSET (d) - 1;
6212 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6213 UPDATE_SYNTAX_TABLE (charpos);
6214#endif
6215 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6216 s1 = SYNTAX (c1);
6217
6218 /* Case 2: S1 is neither Ssymbol nor Sword. */
6219 if (s1 != Sword && s1 != Ssymbol)
6220 goto fail;
6221
6222 /* Case 3: D is not at the end of string ... */
6223 if (!AT_STRINGS_END (d))
6224 {
6225 PREFETCH_NOLIMIT ();
62a6e103 6226 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6227#ifdef emacs
134579f2 6228 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6229#endif
6230 s2 = SYNTAX (c2);
6231
6232 /* ... and S2 is Sword or Ssymbol. */
6233 if (s2 == Sword || s2 == Ssymbol)
6234 goto fail;
b18215fc
RS
6235 }
6236 }
e318085a
RS
6237 break;
6238
fa9a63c5 6239 case syntaxspec:
1fb352e0
SM
6240 case notsyntaxspec:
6241 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6242 mcnt = *p++;
1fb352e0 6243 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6244 PREFETCH ();
b18215fc
RS
6245#ifdef emacs
6246 {
2d1675e4
SM
6247 int offset = PTR_TO_OFFSET (d);
6248 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6249 UPDATE_SYNTAX_TABLE (pos1);
6250 }
25fe55af 6251#endif
b18215fc 6252 {
01618498
SM
6253 int len;
6254 re_wchar_t c;
b18215fc 6255
6fdd04b0 6256 GET_CHAR_AFTER (c, d, len);
990b2375 6257 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6258 goto fail;
b18215fc
RS
6259 d += len;
6260 }
fa9a63c5
RM
6261 break;
6262
b18215fc 6263#ifdef emacs
1fb352e0
SM
6264 case before_dot:
6265 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6266 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6267 goto fail;
b18215fc
RS
6268 break;
6269
1fb352e0
SM
6270 case at_dot:
6271 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6272 if (PTR_BYTE_POS (d) != PT_BYTE)
6273 goto fail;
6274 break;
b18215fc 6275
1fb352e0
SM
6276 case after_dot:
6277 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6278 if (PTR_BYTE_POS (d) <= PT_BYTE)
6279 goto fail;
e318085a 6280 break;
fa9a63c5 6281
1fb352e0 6282 case categoryspec:
b18215fc 6283 case notcategoryspec:
1fb352e0 6284 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6285 mcnt = *p++;
1fb352e0 6286 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6287 PREFETCH ();
6288 {
01618498
SM
6289 int len;
6290 re_wchar_t c;
6291
6fdd04b0 6292 GET_CHAR_AFTER (c, d, len);
1fb352e0 6293 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6294 goto fail;
6295 d += len;
6296 }
fa9a63c5 6297 break;
5e69f11e 6298
1fb352e0 6299#endif /* emacs */
5e69f11e 6300
0b32bf0e
SM
6301 default:
6302 abort ();
fa9a63c5 6303 }
b18215fc 6304 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6305
6306
6307 /* We goto here if a matching operation fails. */
6308 fail:
5b370c2b 6309 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6310 if (!FAIL_STACK_EMPTY ())
505bde11 6311 {
01618498 6312 re_char *str, *pat;
505bde11 6313 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6314 DEBUG_PRINT1 ("\nFAIL:\n");
6315 POP_FAILURE_POINT (str, pat);
505bde11
SM
6316 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6317 {
6318 case on_failure_keep_string_jump:
6319 assert (str == NULL);
6320 goto continue_failure_jump;
6321
0683b6fa
SM
6322 case on_failure_jump_nastyloop:
6323 assert ((re_opcode_t)pat[-2] == no_op);
6324 PUSH_FAILURE_POINT (pat - 2, str);
6325 /* Fallthrough */
6326
505bde11
SM
6327 case on_failure_jump_loop:
6328 case on_failure_jump:
6329 case succeed_n:
6330 d = str;
6331 continue_failure_jump:
6332 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6333 p = pat + mcnt;
6334 break;
b18215fc 6335
0683b6fa
SM
6336 case no_op:
6337 /* A special frame used for nastyloops. */
6338 goto fail;
6339
505bde11
SM
6340 default:
6341 abort();
6342 }
fa9a63c5 6343
505bde11 6344 assert (p >= bufp->buffer && p <= pend);
b18215fc 6345
0b32bf0e 6346 if (d >= string1 && d <= end1)
fa9a63c5 6347 dend = end_match_1;
0b32bf0e 6348 }
fa9a63c5 6349 else
0b32bf0e 6350 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6351 } /* for (;;) */
6352
6353 if (best_regs_set)
6354 goto restore_best_regs;
6355
6356 FREE_VARIABLES ();
6357
b18215fc 6358 return -1; /* Failure to match. */
fa9a63c5
RM
6359} /* re_match_2 */
6360\f
6361/* Subroutine definitions for re_match_2. */
6362
fa9a63c5
RM
6363/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6364 bytes; nonzero otherwise. */
5e69f11e 6365
fa9a63c5 6366static int
438105ed
JB
6367bcmp_translate (const re_char *s1, const re_char *s2, register int len,
6368 RE_TRANSLATE_TYPE translate, const int target_multibyte)
fa9a63c5 6369{
2d1675e4
SM
6370 register re_char *p1 = s1, *p2 = s2;
6371 re_char *p1_end = s1 + len;
6372 re_char *p2_end = s2 + len;
e934739e 6373
4bb91c68
SM
6374 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6375 different lengths, but relying on a single `len' would break this. -sm */
6376 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6377 {
e934739e 6378 int p1_charlen, p2_charlen;
01618498 6379 re_wchar_t p1_ch, p2_ch;
e934739e 6380
6fdd04b0
KH
6381 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6382 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6383
6384 if (RE_TRANSLATE (translate, p1_ch)
6385 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6386 return 1;
e934739e
RS
6387
6388 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6389 }
e934739e
RS
6390
6391 if (p1 != p1_end || p2 != p2_end)
6392 return 1;
6393
fa9a63c5
RM
6394 return 0;
6395}
6396\f
6397/* Entry points for GNU code. */
6398
6399/* re_compile_pattern is the GNU regular expression compiler: it
6400 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6401 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6402
fa9a63c5
RM
6403 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6404 are set in BUFP on entry.
5e69f11e 6405
b18215fc 6406 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6407
6408const char *
971de7fb 6409re_compile_pattern (const char *pattern, size_t length, struct re_pattern_buffer *bufp)
fa9a63c5
RM
6410{
6411 reg_errcode_t ret;
5e69f11e 6412
fa9a63c5
RM
6413 /* GNU code is written to assume at least RE_NREGS registers will be set
6414 (and at least one extra will be -1). */
6415 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6416
fa9a63c5
RM
6417 /* And GNU code determines whether or not to get register information
6418 by passing null for the REGS argument to re_match, etc., not by
6419 setting no_sub. */
6420 bufp->no_sub = 0;
5e69f11e 6421
4bb91c68 6422 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6423
6424 if (!ret)
6425 return NULL;
6426 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6427}
c0f9ea08 6428WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6429\f
b18215fc
RS
6430/* Entry points compatible with 4.2 BSD regex library. We don't define
6431 them unless specifically requested. */
fa9a63c5 6432
0b32bf0e 6433#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6434
6435/* BSD has one and only one pattern buffer. */
6436static struct re_pattern_buffer re_comp_buf;
6437
6438char *
0b32bf0e 6439# ifdef _LIBC
48afdd44
RM
6440/* Make these definitions weak in libc, so POSIX programs can redefine
6441 these names if they don't use our functions, and still use
6442 regcomp/regexec below without link errors. */
6443weak_function
0b32bf0e 6444# endif
fa9a63c5
RM
6445re_comp (s)
6446 const char *s;
6447{
6448 reg_errcode_t ret;
5e69f11e 6449
fa9a63c5
RM
6450 if (!s)
6451 {
6452 if (!re_comp_buf.buffer)
0b32bf0e 6453 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6454 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6455 return 0;
6456 }
6457
6458 if (!re_comp_buf.buffer)
6459 {
6460 re_comp_buf.buffer = (unsigned char *) malloc (200);
6461 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6462 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6463 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6464 re_comp_buf.allocated = 200;
6465
6466 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6467 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6468 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6469 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6470 }
6471
6472 /* Since `re_exec' always passes NULL for the `regs' argument, we
6473 don't need to initialize the pattern buffer fields which affect it. */
6474
fa9a63c5 6475 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6476
fa9a63c5
RM
6477 if (!ret)
6478 return NULL;
6479
6480 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6481 return (char *) gettext (re_error_msgid[(int) ret]);
6482}
6483
6484
6485int
0b32bf0e 6486# ifdef _LIBC
48afdd44 6487weak_function
0b32bf0e 6488# endif
fa9a63c5
RM
6489re_exec (s)
6490 const char *s;
6491{
6492 const int len = strlen (s);
6493 return
6494 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6495}
6496#endif /* _REGEX_RE_COMP */
6497\f
6498/* POSIX.2 functions. Don't define these for Emacs. */
6499
6500#ifndef emacs
6501
6502/* regcomp takes a regular expression as a string and compiles it.
6503
b18215fc 6504 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6505 since POSIX says we shouldn't. Thus, we set
6506
6507 `buffer' to the compiled pattern;
6508 `used' to the length of the compiled pattern;
6509 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6510 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6511 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6512 `fastmap' to an allocated space for the fastmap;
6513 `fastmap_accurate' to zero;
fa9a63c5
RM
6514 `re_nsub' to the number of subexpressions in PATTERN.
6515
6516 PATTERN is the address of the pattern string.
6517
6518 CFLAGS is a series of bits which affect compilation.
6519
6520 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6521 use POSIX basic syntax.
6522
6523 If REG_NEWLINE is set, then . and [^...] don't match newline.
6524 Also, regexec will try a match beginning after every newline.
6525
6526 If REG_ICASE is set, then we considers upper- and lowercase
6527 versions of letters to be equivalent when matching.
6528
6529 If REG_NOSUB is set, then when PREG is passed to regexec, that
6530 routine will report only success or failure, and nothing about the
6531 registers.
6532
b18215fc 6533 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6534 the return codes and their meanings.) */
6535
6536int
6537regcomp (preg, pattern, cflags)
ada30c0e
SM
6538 regex_t *__restrict preg;
6539 const char *__restrict pattern;
fa9a63c5
RM
6540 int cflags;
6541{
6542 reg_errcode_t ret;
4bb91c68 6543 reg_syntax_t syntax
fa9a63c5
RM
6544 = (cflags & REG_EXTENDED) ?
6545 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6546
6547 /* regex_compile will allocate the space for the compiled pattern. */
6548 preg->buffer = 0;
6549 preg->allocated = 0;
6550 preg->used = 0;
5e69f11e 6551
c0f9ea08
SM
6552 /* Try to allocate space for the fastmap. */
6553 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6554
fa9a63c5
RM
6555 if (cflags & REG_ICASE)
6556 {
6557 unsigned i;
5e69f11e 6558
6676cb1c
RS
6559 preg->translate
6560 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6561 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6562 if (preg->translate == NULL)
0b32bf0e 6563 return (int) REG_ESPACE;
fa9a63c5
RM
6564
6565 /* Map uppercase characters to corresponding lowercase ones. */
6566 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6567 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6568 }
6569 else
6570 preg->translate = NULL;
6571
6572 /* If REG_NEWLINE is set, newlines are treated differently. */
6573 if (cflags & REG_NEWLINE)
6574 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6575 syntax &= ~RE_DOT_NEWLINE;
6576 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6577 }
6578 else
c0f9ea08 6579 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6580
6581 preg->no_sub = !!(cflags & REG_NOSUB);
6582
5e69f11e 6583 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6584 can use strlen here in compiling the pattern. */
4bb91c68 6585 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6586
fa9a63c5
RM
6587 /* POSIX doesn't distinguish between an unmatched open-group and an
6588 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6589 if (ret == REG_ERPAREN)
6590 ret = REG_EPAREN;
6591
6592 if (ret == REG_NOERROR && preg->fastmap)
6593 { /* Compute the fastmap now, since regexec cannot modify the pattern
6594 buffer. */
6595 re_compile_fastmap (preg);
6596 if (preg->can_be_null)
6597 { /* The fastmap can't be used anyway. */
6598 free (preg->fastmap);
6599 preg->fastmap = NULL;
6600 }
6601 }
fa9a63c5
RM
6602 return (int) ret;
6603}
c0f9ea08 6604WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6605
6606
6607/* regexec searches for a given pattern, specified by PREG, in the
6608 string STRING.
5e69f11e 6609
fa9a63c5 6610 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6611 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6612 least NMATCH elements, and we set them to the offsets of the
6613 corresponding matched substrings.
5e69f11e 6614
fa9a63c5
RM
6615 EFLAGS specifies `execution flags' which affect matching: if
6616 REG_NOTBOL is set, then ^ does not match at the beginning of the
6617 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6618
fa9a63c5
RM
6619 We return 0 if we find a match and REG_NOMATCH if not. */
6620
6621int
6622regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6623 const regex_t *__restrict preg;
6624 const char *__restrict string;
5e69f11e 6625 size_t nmatch;
9f2dbe01 6626 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6627 int eflags;
6628{
6629 int ret;
6630 struct re_registers regs;
6631 regex_t private_preg;
6632 int len = strlen (string);
c0f9ea08 6633 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6634
6635 private_preg = *preg;
5e69f11e 6636
fa9a63c5
RM
6637 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6638 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6639
fa9a63c5
RM
6640 /* The user has told us exactly how many registers to return
6641 information about, via `nmatch'. We have to pass that on to the
b18215fc 6642 matching routines. */
fa9a63c5 6643 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6644
fa9a63c5
RM
6645 if (want_reg_info)
6646 {
6647 regs.num_regs = nmatch;
4bb91c68
SM
6648 regs.start = TALLOC (nmatch * 2, regoff_t);
6649 if (regs.start == NULL)
0b32bf0e 6650 return (int) REG_NOMATCH;
4bb91c68 6651 regs.end = regs.start + nmatch;
fa9a63c5
RM
6652 }
6653
c0f9ea08
SM
6654 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6655 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6656 was a little bit longer but still only matching the real part.
6657 This works because the `endline' will check for a '\n' and will find a
6658 '\0', correctly deciding that this is not the end of a line.
6659 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6660 a convenient '\0' there. For all we know, the string could be preceded
6661 by '\n' which would throw things off. */
6662
fa9a63c5
RM
6663 /* Perform the searching operation. */
6664 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6665 /* start: */ 0, /* range: */ len,
6666 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6667
fa9a63c5
RM
6668 /* Copy the register information to the POSIX structure. */
6669 if (want_reg_info)
6670 {
6671 if (ret >= 0)
0b32bf0e
SM
6672 {
6673 unsigned r;
fa9a63c5 6674
0b32bf0e
SM
6675 for (r = 0; r < nmatch; r++)
6676 {
6677 pmatch[r].rm_so = regs.start[r];
6678 pmatch[r].rm_eo = regs.end[r];
6679 }
6680 }
fa9a63c5 6681
b18215fc 6682 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6683 free (regs.start);
fa9a63c5
RM
6684 }
6685
6686 /* We want zero return to mean success, unlike `re_search'. */
6687 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6688}
c0f9ea08 6689WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6690
6691
ec869672
JR
6692/* Returns a message corresponding to an error code, ERR_CODE, returned
6693 from either regcomp or regexec. We don't use PREG here.
6694
6695 ERR_CODE was previously called ERRCODE, but that name causes an
6696 error with msvc8 compiler. */
fa9a63c5
RM
6697
6698size_t
ec869672
JR
6699regerror (err_code, preg, errbuf, errbuf_size)
6700 int err_code;
fa9a63c5
RM
6701 const regex_t *preg;
6702 char *errbuf;
6703 size_t errbuf_size;
6704{
6705 const char *msg;
6706 size_t msg_size;
6707
ec869672
JR
6708 if (err_code < 0
6709 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6710 /* Only error codes returned by the rest of the code should be passed
b18215fc 6711 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6712 code generates an invalid error code, then the program has a bug.
6713 Dump core so we can fix it. */
6714 abort ();
6715
ec869672 6716 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6717
6718 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6719
fa9a63c5
RM
6720 if (errbuf_size != 0)
6721 {
6722 if (msg_size > errbuf_size)
0b32bf0e
SM
6723 {
6724 strncpy (errbuf, msg, errbuf_size - 1);
6725 errbuf[errbuf_size - 1] = 0;
6726 }
fa9a63c5 6727 else
0b32bf0e 6728 strcpy (errbuf, msg);
fa9a63c5
RM
6729 }
6730
6731 return msg_size;
6732}
c0f9ea08 6733WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6734
6735
6736/* Free dynamically allocated space used by PREG. */
6737
6738void
6739regfree (preg)
6740 regex_t *preg;
6741{
c2cd06e6 6742 free (preg->buffer);
fa9a63c5 6743 preg->buffer = NULL;
5e69f11e 6744
fa9a63c5
RM
6745 preg->allocated = 0;
6746 preg->used = 0;
6747
c2cd06e6 6748 free (preg->fastmap);
fa9a63c5
RM
6749 preg->fastmap = NULL;
6750 preg->fastmap_accurate = 0;
6751
c2cd06e6 6752 free (preg->translate);
fa9a63c5
RM
6753 preg->translate = NULL;
6754}
c0f9ea08 6755WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6756
6757#endif /* not emacs */
839966f3
KH
6758
6759/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6760 (do not change this comment) */