(search.o): Depend on charset.h.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
4e6835db 6 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
bc78d348 7
fa9a63c5
RM
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
4fc5845f 20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 21 USA. */
fa9a63c5 22
6df42991 23/* TODO:
505bde11 24 - structure the opcode space into opcode+flag.
dc1e502d 25 - merge with glibc's regex.[ch].
01618498 26 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
27 need to modify the compiled regexp so that re_match can be reentrant.
28 - get rid of on_failure_jump_smart by doing the optimization in re_comp
29 rather than at run-time, so that re_match can be reentrant.
01618498 30*/
505bde11 31
fa9a63c5 32/* AIX requires this to be the first thing in the file. */
0b32bf0e 33#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
34 #pragma alloca
35#endif
36
fa9a63c5 37#ifdef HAVE_CONFIG_H
0b32bf0e 38# include <config.h>
fa9a63c5
RM
39#endif
40
4bb91c68
SM
41#if defined STDC_HEADERS && !defined emacs
42# include <stddef.h>
43#else
44/* We need this for `regex.h', and perhaps for the Emacs include files. */
45# include <sys/types.h>
46#endif
fa9a63c5 47
14473664
SM
48/* Whether to use ISO C Amendment 1 wide char functions.
49 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
50#if defined _LIBC
51#define WIDE_CHAR_SUPPORT 1
52#else
14473664 53#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
54 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
55#endif
14473664
SM
56
57/* For platforms which support the ISO C amendment 1 functionality we
58 support user defined character classes. */
a0ad02f7 59#if WIDE_CHAR_SUPPORT
14473664
SM
60/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
61# include <wchar.h>
62# include <wctype.h>
63#endif
64
c0f9ea08
SM
65#ifdef _LIBC
66/* We have to keep the namespace clean. */
67# define regfree(preg) __regfree (preg)
68# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
69# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
70# define regerror(err_code, preg, errbuf, errbuf_size) \
71 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
72# define re_set_registers(bu, re, nu, st, en) \
73 __re_set_registers (bu, re, nu, st, en)
74# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
75 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
76# define re_match(bufp, string, size, pos, regs) \
77 __re_match (bufp, string, size, pos, regs)
78# define re_search(bufp, string, size, startpos, range, regs) \
79 __re_search (bufp, string, size, startpos, range, regs)
80# define re_compile_pattern(pattern, length, bufp) \
81 __re_compile_pattern (pattern, length, bufp)
82# define re_set_syntax(syntax) __re_set_syntax (syntax)
83# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
84 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
85# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
86
14473664
SM
87/* Make sure we call libc's function even if the user overrides them. */
88# define btowc __btowc
89# define iswctype __iswctype
90# define wctype __wctype
91
c0f9ea08
SM
92# define WEAK_ALIAS(a,b) weak_alias (a, b)
93
94/* We are also using some library internals. */
95# include <locale/localeinfo.h>
96# include <locale/elem-hash.h>
97# include <langinfo.h>
98#else
99# define WEAK_ALIAS(a,b)
100#endif
101
4bb91c68 102/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 103#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
104# include <libintl.h>
105#else
106# define gettext(msgid) (msgid)
107#endif
108
5e69f11e
RM
109#ifndef gettext_noop
110/* This define is so xgettext can find the internationalizable
111 strings. */
0b32bf0e 112# define gettext_noop(String) String
5e69f11e
RM
113#endif
114
fa9a63c5
RM
115/* The `emacs' switch turns on certain matching commands
116 that make sense only in Emacs. */
117#ifdef emacs
118
0b32bf0e
SM
119# include "lisp.h"
120# include "buffer.h"
b18215fc
RS
121
122/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 123# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 124
0b32bf0e 125# include "syntax.h"
9117d724 126# include "character.h"
0b32bf0e 127# include "category.h"
fa9a63c5 128
7689ef0b
EZ
129# ifdef malloc
130# undef malloc
131# endif
0b32bf0e 132# define malloc xmalloc
7689ef0b
EZ
133# ifdef realloc
134# undef realloc
135# endif
0b32bf0e 136# define realloc xrealloc
7689ef0b
EZ
137# ifdef free
138# undef free
139# endif
0b32bf0e 140# define free xfree
9abbd165 141
7814e705 142/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
143# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
144# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
145
146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 147# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
cf9c99bc 148# define RE_STRING_CHAR(p, s, multibyte) \
4e8a9132 149 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
cf9c99bc 150# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
2d1675e4
SM
151 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
152
cf9c99bc
KH
153# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)]
154
155# define RE_CHAR_TO_UNIBYTE(c) \
156 (ASCII_CHAR_P (c) ? (c) \
157 : CHAR_BYTE8_P (c) ? CHAR_TO_BYTE8 (c) \
158 : multibyte_char_to_unibyte_safe (c))
159
6fdd04b0
KH
160/* Set C a (possibly converted to multibyte) character before P. P
161 points into a string which is the virtual concatenation of STR1
162 (which ends at END1) or STR2 (which ends at END2). */
bf216479
KH
163# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
164 do { \
6fdd04b0 165 if (multibyte) \
bf216479
KH
166 { \
167 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
168 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
169 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
170 c = STRING_CHAR (dtemp, (p) - dtemp); \
171 } \
172 else \
173 { \
174 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 175 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 176 } \
2d1675e4
SM
177 } while (0)
178
6fdd04b0
KH
179/* Set C a (possibly converted to multibyte) character at P, and set
180 LEN to the byte length of that character. */
181# define GET_CHAR_AFTER(c, p, len) \
182 do { \
183 if (multibyte) \
cf9c99bc 184 (c) = STRING_CHAR_AND_LENGTH (p, 0, len); \
6fdd04b0
KH
185 else \
186 { \
cf9c99bc 187 (c) = *p; \
6fdd04b0 188 len = 1; \
cf9c99bc 189 (c) = RE_CHAR_TO_MULTIBYTE (c); \
6fdd04b0 190 } \
8f924df7 191 } while (0)
4e8a9132 192
fa9a63c5
RM
193#else /* not emacs */
194
195/* If we are not linking with Emacs proper,
196 we can't use the relocating allocator
197 even if config.h says that we can. */
0b32bf0e 198# undef REL_ALLOC
fa9a63c5 199
0b32bf0e
SM
200# if defined STDC_HEADERS || defined _LIBC
201# include <stdlib.h>
202# else
fa9a63c5
RM
203char *malloc ();
204char *realloc ();
0b32bf0e 205# endif
fa9a63c5 206
a77f947b
CY
207/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
208
/* Allocate SIZE bytes with malloc and return the new block.  If no
   memory could be obtained for a nonzero SIZE, write a message to
   file descriptor 2 and terminate the process with exit status 1.
   A null result for a zero-byte request is passed back unchanged.  */
void *
xmalloc (size)
     size_t size;
{
  void *mem = (void *) malloc (size);
  int exhausted = (size != 0 && !mem);

  if (exhausted)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }

  return mem;
}
222
/* Resize BLOCK to SIZE bytes and return the resulting block.  A null
   BLOCK is served by malloc instead of realloc.  If no memory could
   be obtained for a nonzero SIZE, write a message to file descriptor
   2 and terminate the process with exit status 1.  */
void *
xrealloc (block, size)
     void *block;
     size_t size;
{
  /* Some reallocs do not accept a null pointer, so route that case
     to malloc explicitly.  */
  void *mem = (block
	       ? (void *) realloc (block, size)
	       : (void *) malloc (size));
  int exhausted = (size != 0 && !mem);

  if (exhausted)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }

  return mem;
}
242
a073faa6
CY
243# ifdef malloc
244# undef malloc
245# endif
246# define malloc xmalloc
247# ifdef realloc
248# undef realloc
249# endif
250# define realloc xrealloc
251
9e4ecb26 252/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 253 If nothing else has been done, use the method below. */
0b32bf0e
SM
254# ifdef INHIBIT_STRING_HEADER
255# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
256# if !defined bzero && !defined bcopy
257# undef INHIBIT_STRING_HEADER
258# endif
259# endif
260# endif
9e4ecb26 261
4bb91c68 262/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
263 This is used in most programs--a few other programs avoid this
264 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
265# ifndef INHIBIT_STRING_HEADER
266# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
267# include <string.h>
0b32bf0e 268# ifndef bzero
4bb91c68
SM
269# ifndef _LIBC
270# define bzero(s, n) (memset (s, '\0', n), (s))
271# else
272# define bzero(s, n) __bzero (s, n)
273# endif
0b32bf0e
SM
274# endif
275# else
276# include <strings.h>
4bb91c68
SM
277# ifndef memcmp
278# define memcmp(s1, s2, n) bcmp (s1, s2, n)
279# endif
280# ifndef memcpy
281# define memcpy(d, s, n) (bcopy (s, d, n), (d))
282# endif
0b32bf0e
SM
283# endif
284# endif
fa9a63c5
RM
285
286/* Define the syntax stuff for \<, \>, etc. */
287
990b2375 288/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 289enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 290
0b32bf0e
SM
291# ifdef SWITCH_ENUM_BUG
292# define SWITCH_ENUM_CAST(x) ((int)(x))
293# else
294# define SWITCH_ENUM_CAST(x) (x)
295# endif
fa9a63c5 296
e934739e 297/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
298# define BASE_LEADING_CODE_P(c) (0)
299# define CHAR_CHARSET(c) 0
300# define CHARSET_LEADING_CODE_BASE(c) 0
301# define MAX_MULTIBYTE_LENGTH 1
302# define RE_MULTIBYTE_P(x) 0
bf216479 303# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
304# define WORD_BOUNDARY_P(c1, c2) (0)
305# define CHAR_HEAD_P(p) (1)
306# define SINGLE_BYTE_CHAR_P(c) (1)
307# define SAME_CHARSET_P(c1, c2) (1)
308# define MULTIBYTE_FORM_LENGTH(p, s) (1)
70806df6 309# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
0b32bf0e 310# define STRING_CHAR(p, s) (*(p))
cf9c99bc 311# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s))
0b32bf0e
SM
312# define CHAR_STRING(c, s) (*(s) = (c), 1)
/* Without multibyte support every character occupies one byte: yield
   the byte at P and set ACTUAL_LEN to 1.  S is accepted only for
   compatibility with the multibyte interface and is ignored.  */
# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
/* Takes the same four arguments (P, S, LEN, MULTIBYTE) as the Emacs
   definition so that shared callers expand correctly in both builds.
   MULTIBYTE is ignored here.  */
# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
  STRING_CHAR_AND_LENGTH ((p), (s), (len))
315# define RE_CHAR_TO_MULTIBYTE(c) (c)
316# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 317# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 318 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
/* Set C to the character at P and LEN to its byte length, which is
   always 1 in this non-Emacs definition.  Every argument is
   parenthesized so that an expression such as `ptr + 1' can be
   passed for P without being split by operator precedence.  */
# define GET_CHAR_AFTER(c, p, len)		\
  ((c) = *(p), (len) = 1)
0b32bf0e 321# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
322# define BYTE8_TO_CHAR(c) (c)
323# define CHAR_BYTE8_P(c) (0)
bf216479 324# define CHAR_LEADING_CODE(c) (c)
8f924df7 325
fa9a63c5 326#endif /* not emacs */
4e8a9132
SM
327
328#ifndef RE_TRANSLATE
0b32bf0e
SM
329# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
330# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 331#endif
fa9a63c5
RM
332\f
333/* Get the interface, including the syntax bits. */
334#include "regex.h"
335
f71b19b6
DL
336/* isalpha etc. are used for the character classes. */
337#include <ctype.h>
fa9a63c5 338
f71b19b6 339#ifdef emacs
fa9a63c5 340
f71b19b6 341/* 1 if C is an ASCII character. */
0b32bf0e 342# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 343
f71b19b6 344/* 1 if C is a unibyte character. */
0b32bf0e 345# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 346
f71b19b6 347/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 348
f71b19b6 349/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
350# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
351# define ISCNTRL(c) ((c) < ' ')
352# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
353 || ((c) >= 'a' && (c) <= 'f') \
354 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
355
356/* This is only used for single-byte characters. */
0b32bf0e 357# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
358
359/* The rest must handle multibyte characters. */
360
0b32bf0e 361# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 362 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
363 : 1)
364
14473664 365# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 366 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
367 : 1)
368
0b32bf0e 369# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
370 ? (((c) >= 'a' && (c) <= 'z') \
371 || ((c) >= 'A' && (c) <= 'Z') \
372 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
373 : SYNTAX (c) == Sword)
374
0b32bf0e 375# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
376 ? (((c) >= 'a' && (c) <= 'z') \
377 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
378 : SYNTAX (c) == Sword)
379
0b32bf0e 380# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 381
0b32bf0e 382# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
383 ? ((c) > ' ' && (c) < 0177 \
384 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
385 || ((c) >= 'A' && (c) <= 'Z') \
386 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
387 : SYNTAX (c) != Sword)
388
0b32bf0e 389# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 390
0b32bf0e 391# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 392
0b32bf0e 393# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
394
395#else /* not emacs */
396
f71b19b6
DL
397/* Jim Meyering writes:
398
399 "... Some ctype macros are valid only for character codes that
400 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
401 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 402 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
403 STDC_HEADERS is defined, then autoconf has verified that the ctype
404 macros don't need to be guarded with references to isascii. ...
405 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
406 eliminate the && through constant folding."
407 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 408
4bb91c68 409# undef ISASCII
0b32bf0e
SM
410# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
411# define ISASCII(c) 1
412# else
413# define ISASCII(c) isascii(c)
414# endif
f71b19b6
DL
415
416/* 1 if C is an ASCII character. */
0b32bf0e 417# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
418
419/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
420# define ISUNIBYTE(c) 1
421
422# ifdef isblank
423# define ISBLANK(c) (ISASCII (c) && isblank (c))
424# else
425# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
426# endif
427# ifdef isgraph
428# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
429# else
430# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
431# endif
432
4bb91c68 433# undef ISPRINT
0b32bf0e
SM
434# define ISPRINT(c) (ISASCII (c) && isprint (c))
435# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
436# define ISALNUM(c) (ISASCII (c) && isalnum (c))
437# define ISALPHA(c) (ISASCII (c) && isalpha (c))
438# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
439# define ISLOWER(c) (ISASCII (c) && islower (c))
440# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
441# define ISSPACE(c) (ISASCII (c) && isspace (c))
442# define ISUPPER(c) (ISASCII (c) && isupper (c))
443# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
444
445# define ISWORD(c) ISALPHA(c)
446
4bb91c68
SM
447# ifdef _tolower
448# define TOLOWER(c) _tolower(c)
449# else
450# define TOLOWER(c) tolower(c)
451# endif
452
453/* How many characters in the character set. */
454# define CHAR_SET_SIZE 256
455
0b32bf0e 456# ifdef SYNTAX_TABLE
f71b19b6 457
0b32bf0e 458extern char *re_syntax_table;
f71b19b6 459
0b32bf0e
SM
460# else /* not SYNTAX_TABLE */
461
0b32bf0e
SM
462static char re_syntax_table[CHAR_SET_SIZE];
463
464static void
465init_syntax_once ()
466{
467 register int c;
468 static int done = 0;
469
470 if (done)
471 return;
472
473 bzero (re_syntax_table, sizeof re_syntax_table);
474
4bb91c68
SM
475 for (c = 0; c < CHAR_SET_SIZE; ++c)
476 if (ISALNUM (c))
477 re_syntax_table[c] = Sword;
fa9a63c5 478
669fa600 479 re_syntax_table['_'] = Ssymbol;
fa9a63c5 480
0b32bf0e
SM
481 done = 1;
482}
483
484# endif /* not SYNTAX_TABLE */
96cc36cc 485
4bb91c68
SM
486# define SYNTAX(c) re_syntax_table[(c)]
487
96cc36cc
RS
488#endif /* not emacs */
489\f
fa9a63c5 490#ifndef NULL
0b32bf0e 491# define NULL (void *)0
fa9a63c5
RM
492#endif
493
494/* We remove any previous definition of `SIGN_EXTEND_CHAR',
495 since ours (we hope) works properly with all combinations of
496 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 497 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
498#undef SIGN_EXTEND_CHAR
499#if __STDC__
0b32bf0e 500# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
501#else /* not __STDC__ */
502/* As in Harbison and Steele. */
0b32bf0e 503# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
504#endif
505\f
506/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
507 use `alloca' instead of `malloc'. This is because using malloc in
508 re_search* or re_match* could cause memory leaks when C-g is used in
509 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
510 the other hand, malloc is more portable, and easier to debug.
511
fa9a63c5
RM
512 Because we sometimes use alloca, some routines have to be macros,
513 not functions -- `alloca'-allocated space disappears at the end of the
514 function it is called in. */
515
516#ifdef REGEX_MALLOC
517
0b32bf0e
SM
518# define REGEX_ALLOCATE malloc
519# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
520# define REGEX_FREE free
fa9a63c5
RM
521
522#else /* not REGEX_MALLOC */
523
524/* Emacs already defines alloca, sometimes. */
0b32bf0e 525# ifndef alloca
fa9a63c5
RM
526
527/* Make alloca work the best possible way. */
0b32bf0e
SM
528# ifdef __GNUC__
529# define alloca __builtin_alloca
530# else /* not __GNUC__ */
7f585e7a 531# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
532# include <alloca.h>
533# endif /* HAVE_ALLOCA_H */
534# endif /* not __GNUC__ */
fa9a63c5 535
0b32bf0e 536# endif /* not alloca */
fa9a63c5 537
0b32bf0e 538# define REGEX_ALLOCATE alloca
fa9a63c5
RM
539
540/* Assumes a `char *destination' variable. */
0b32bf0e 541# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 542 (destination = (char *) alloca (nsize), \
4bb91c68 543 memcpy (destination, source, osize))
fa9a63c5
RM
544
545/* No need to do anything to free, after alloca. */
0b32bf0e 546# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
547
548#endif /* not REGEX_MALLOC */
549
550/* Define how to allocate the failure stack. */
551
0b32bf0e 552#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 553
0b32bf0e 554# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 555 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 556# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 557 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 558# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
559 r_alloc_free (&failure_stack_ptr)
560
4297555e 561#else /* not using relocating allocator */
fa9a63c5 562
0b32bf0e 563# ifdef REGEX_MALLOC
fa9a63c5 564
0b32bf0e
SM
565# define REGEX_ALLOCATE_STACK malloc
566# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
567# define REGEX_FREE_STACK free
fa9a63c5 568
0b32bf0e 569# else /* not REGEX_MALLOC */
fa9a63c5 570
0b32bf0e 571# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 572
0b32bf0e 573# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 574 REGEX_REALLOCATE (source, osize, nsize)
7814e705 575/* No need to explicitly free anything. */
0b32bf0e 576# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 577
0b32bf0e 578# endif /* not REGEX_MALLOC */
4297555e 579#endif /* not using relocating allocator */
fa9a63c5
RM
580
581
582/* True if `size1' is non-NULL and PTR is pointing anywhere inside
583 `string1' or just past its end. This works if PTR is NULL, which is
584 a good thing. */
25fe55af 585#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
586 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
587
588/* (Re)Allocate N items of type T using malloc, or fail. */
589#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
590#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
591#define RETALLOC_IF(addr, n, t) \
592 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
593#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
594
4bb91c68 595#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
596
597#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
598
599#undef MAX
600#undef MIN
601#define MAX(a, b) ((a) > (b) ? (a) : (b))
602#define MIN(a, b) ((a) < (b) ? (a) : (b))
603
66f0296e
SM
604/* Type of source-pattern and string chars. */
605typedef const unsigned char re_char;
606
fa9a63c5
RM
607typedef char boolean;
608#define false 0
609#define true 1
610
4bb91c68
SM
611static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
612 re_char *string1, int size1,
613 re_char *string2, int size2,
614 int pos,
615 struct re_registers *regs,
616 int stop));
fa9a63c5
RM
617\f
618/* These are the command codes that appear in compiled regular
4bb91c68 619 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
620 command code can specify any interpretation whatsoever for its
621 arguments. Zero bytes may appear in the compiled regular expression. */
622
623typedef enum
624{
625 no_op = 0,
626
4bb91c68 627 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
628 succeed,
629
25fe55af 630 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
631 exactn,
632
25fe55af 633 /* Matches any (more or less) character. */
fa9a63c5
RM
634 anychar,
635
25fe55af
RS
636 /* Matches any one char belonging to specified set. First
637 following byte is number of bitmap bytes. Then come bytes
638 for a bitmap saying which chars are in. Bits in each byte
639 are ordered low-bit-first. A character is in the set if its
640 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
641 automatically not in the set.
642
643 If the length byte has the 0x80 bit set, then that stuff
644 is followed by a range table:
645 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 646 See RANGE_TABLE_WORK_BITS below.
01618498 647 2 bytes, the number of pairs that follow (up to 32767)
96cc36cc 648 pairs, each 2 multibyte characters,
0b32bf0e 649 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
650 charset,
651
25fe55af 652 /* Same parameters as charset, but match any character that is
4bb91c68 653 not one of those specified. */
fa9a63c5
RM
654 charset_not,
655
25fe55af
RS
656 /* Start remembering the text that is matched, for storing in a
657 register. Followed by one byte with the register number, in
658 the range 0 to one less than the pattern buffer's re_nsub
505bde11 659 field. */
fa9a63c5
RM
660 start_memory,
661
25fe55af
RS
662 /* Stop remembering the text that is matched and store it in a
663 memory register. Followed by one byte with the register
664 number, in the range 0 to one less than `re_nsub' in the
505bde11 665 pattern buffer. */
fa9a63c5
RM
666 stop_memory,
667
25fe55af 668 /* Match a duplicate of something remembered. Followed by one
4bb91c68 669 byte containing the register number. */
fa9a63c5
RM
670 duplicate,
671
25fe55af 672 /* Fail unless at beginning of line. */
fa9a63c5
RM
673 begline,
674
4bb91c68 675 /* Fail unless at end of line. */
fa9a63c5
RM
676 endline,
677
25fe55af
RS
678 /* Succeeds if at beginning of buffer (if emacs) or at beginning
679 of string to be matched (if not). */
fa9a63c5
RM
680 begbuf,
681
25fe55af 682 /* Analogously, for end of buffer/string. */
fa9a63c5 683 endbuf,
5e69f11e 684
25fe55af 685 /* Followed by two byte relative address to which to jump. */
5e69f11e 686 jump,
fa9a63c5 687
25fe55af 688 /* Followed by two-byte relative address of place to resume at
7814e705 689 in case of failure. */
fa9a63c5 690 on_failure_jump,
5e69f11e 691
25fe55af
RS
692 /* Like on_failure_jump, but pushes a placeholder instead of the
693 current string position when executed. */
fa9a63c5 694 on_failure_keep_string_jump,
5e69f11e 695
505bde11
SM
696 /* Just like `on_failure_jump', except that it checks that we
697 don't get stuck in an infinite loop (matching an empty string
698 indefinitely). */
699 on_failure_jump_loop,
700
0683b6fa
SM
701 /* Just like `on_failure_jump_loop', except that it checks for
702 a different kind of loop (the kind that shows up with non-greedy
703 operators). This operation has to be immediately preceded
704 by a `no_op'. */
705 on_failure_jump_nastyloop,
706
0b32bf0e 707 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
708 It analyses the loop before which it is put and if the
709 loop does not require backtracking, it changes itself to
4e8a9132
SM
710 `on_failure_keep_string_jump' and short-circuits the loop,
711 else it just defaults to changing itself into `on_failure_jump'.
712 It assumes that it is pointing to just past a `jump'. */
505bde11 713 on_failure_jump_smart,
fa9a63c5 714
25fe55af 715 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
716 After matching N times, jump to the address upon failure.
717 Does not work if N starts at 0: use on_failure_jump_loop
718 instead. */
fa9a63c5
RM
719 succeed_n,
720
25fe55af
RS
721 /* Followed by two-byte relative address, and two-byte number n.
722 Jump to the address N times, then fail. */
fa9a63c5
RM
723 jump_n,
724
25fe55af 725 /* Set the following two-byte relative address to the
7814e705 726 subsequent two-byte number. The address *includes* the two
25fe55af 727 bytes of number. */
fa9a63c5
RM
728 set_number_at,
729
fa9a63c5
RM
730 wordbeg, /* Succeeds if at word beginning. */
731 wordend, /* Succeeds if at word end. */
732
733 wordbound, /* Succeeds if at a word boundary. */
7814e705 734 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 735
669fa600
SM
736 symbeg, /* Succeeds if at symbol beginning. */
737 symend, /* Succeeds if at symbol end. */
738
fa9a63c5 739 /* Matches any character whose syntax is specified. Followed by
25fe55af 740 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
741 syntaxspec,
742
743 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
744 notsyntaxspec
745
746#ifdef emacs
747 ,before_dot, /* Succeeds if before point. */
748 at_dot, /* Succeeds if at point. */
749 after_dot, /* Succeeds if after point. */
b18215fc
RS
750
751 /* Matches any character whose category-set contains the specified
7814e705
JB
752 category. The operator is followed by a byte which contains a
753 category code (mnemonic ASCII character). */
b18215fc
RS
754 categoryspec,
755
756 /* Matches any character whose category-set does not contain the
757 specified category. The operator is followed by a byte which
758 contains the category code (mnemonic ASCII character). */
759 notcategoryspec
fa9a63c5
RM
760#endif /* emacs */
761} re_opcode_t;
762\f
763/* Common operations on the compiled pattern. */
764
/* Write NUMBER into the two bytes at DESTINATION, low-order byte
   first and the remaining bits in the second byte.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    *(destination) = (number) & 0xFF;					\
    *((destination) + 1) = (number) >> 8;				\
  } while (0)
772
773/* Same as STORE_NUMBER, except increment DESTINATION to
774 the byte after where the number is stored. Therefore, DESTINATION
775 must be an lvalue. */
776
777#define STORE_NUMBER_AND_INCR(destination, number) \
778 do { \
779 STORE_NUMBER (destination, number); \
780 (destination) += 2; \
781 } while (0)
782
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte supplies the low-order bits; the second
   byte is sign-extended and supplies the high-order bits.  The high
   byte is scaled by multiplication rather than by `<<' because
   left-shifting a negative value is undefined behavior in ISO C.  */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8);	\
  } while (0)
791
792#ifdef DEBUG
4bb91c68 793static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
794static void
795extract_number (dest, source)
796 int *dest;
01618498 797 re_char *source;
fa9a63c5 798{
5e69f11e 799 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
800 *dest = *source & 0377;
801 *dest += temp << 8;
802}
803
4bb91c68 804# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
805# undef EXTRACT_NUMBER
806# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
807# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
808
809#endif /* DEBUG */
810
811/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
812 SOURCE must be an lvalue. */
813
814#define EXTRACT_NUMBER_AND_INCR(destination, source) \
815 do { \
816 EXTRACT_NUMBER (destination, source); \
25fe55af 817 (source) += 2; \
fa9a63c5
RM
818 } while (0)
819
820#ifdef DEBUG
4bb91c68
SM
821static void extract_number_and_incr _RE_ARGS ((int *destination,
822 re_char **source));
fa9a63c5
RM
823static void
824extract_number_and_incr (destination, source)
825 int *destination;
01618498 826 re_char **source;
5e69f11e 827{
fa9a63c5
RM
828 extract_number (destination, *source);
829 *source += 2;
830}
831
0b32bf0e
SM
832# ifndef EXTRACT_MACROS
833# undef EXTRACT_NUMBER_AND_INCR
834# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 835 extract_number_and_incr (&dest, &src)
0b32bf0e 836# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
837
838#endif /* DEBUG */
839\f
b18215fc
RS
/* Write CHARACTER into the three bytes at DESTINATION, lowest-order
   byte first, then advance DESTINATION past them.  DESTINATION is
   assigned to, so it must be an lvalue.  */

#define STORE_CHARACTER_AND_INCR(destination, character)		\
  do {									\
    *(destination) = (character) & 0xFF;				\
    *((destination) + 1) = ((character) >> 8) & 0xFF;			\
    *((destination) + 2) = (character) >> 16;				\
    (destination) += 3;							\
  } while (0)
851
/* Assemble into DESTINATION the character held in the three bytes at
   SOURCE, where the first byte carries the lowest-order bits.  */

#define EXTRACT_CHARACTER(destination, source)				\
  do {									\
    (destination) = (*(source)						\
		     | (*((source) + 1) << 8)				\
		     | (*((source) + 2) << 16));			\
  } while (0)
861
862
863/* Macros for charset. */
864
865/* Size of bitmap of charset P in bytes. P is a start of charset,
866 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
867#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
868
869/* Nonzero if charset P has range table. */
25fe55af 870#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
871
872/* Return the address of range table of charset P. But not the start
873 of table itself, but the before where the number of ranges is
96cc36cc
RS
874 stored. `4 +' means to skip re_opcode_t and size of bitmap,
875 and the 2 bytes of flags at the start of the range table. */
876#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
877
878/* Extract the bit flags that start a range table. */
879#define CHARSET_RANGE_TABLE_BITS(p) \
880 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
881 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
882
883/* Test if C is listed in the bitmap of charset P. */
884#define CHARSET_LOOKUP_BITMAP(p, c) \
885 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
886 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
887
888/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
889 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
890 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
891 and end. */
892#define CHARSET_RANGE_TABLE_END(range_table, count) \
893 ((range_table) + (count) * 2 * 3)
894
7814e705 895/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
896 COUNT is number of ranges in RANGE_TABLE. */
897#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
898 do \
899 { \
01618498
SM
900 re_wchar_t range_start, range_end; \
901 re_char *p; \
902 re_char *range_table_end \
b18215fc
RS
903 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
904 \
905 for (p = (range_table); p < range_table_end; p += 2 * 3) \
906 { \
907 EXTRACT_CHARACTER (range_start, p); \
908 EXTRACT_CHARACTER (range_end, p + 3); \
909 \
910 if (range_start <= (c) && (c) <= range_end) \
911 { \
912 (not) = !(not); \
913 break; \
914 } \
915 } \
916 } \
917 while (0)
918
919/* Test if C is in range table of CHARSET. The flag NOT is negated if
920 C is listed in it. */
921#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
922 do \
923 { \
924 /* Number of ranges in range table. */ \
925 int count; \
01618498
SM
926 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
927 \
b18215fc
RS
928 EXTRACT_NUMBER_AND_INCR (count, range_table); \
929 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
930 } \
931 while (0)
932\f
fa9a63c5
RM
933/* If DEBUG is defined, Regex prints many voluminous messages about what
934 it is doing (if the variable `debug' is nonzero). If linked with the
935 main program in `iregex.c', you can enter patterns and strings
936 interactively. And if linked with the main program in `main.c' and
4bb91c68 937 the other test files, you can run the already-written tests. */
fa9a63c5
RM
938
939#ifdef DEBUG
940
941/* We use standard I/O for debugging. */
0b32bf0e 942# include <stdio.h>
fa9a63c5
RM
943
944/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 945# include <assert.h>
fa9a63c5 946
99633e97 947static int debug = -100000;
fa9a63c5 948
0b32bf0e
SM
949# define DEBUG_STATEMENT(e) e
950# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
951# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
952# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
953# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
954# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 955 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 956# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 957 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
958
959
960/* Print the fastmap in human-readable form. */
961
962void
963print_fastmap (fastmap)
964 char *fastmap;
965{
966 unsigned was_a_range = 0;
5e69f11e
RM
967 unsigned i = 0;
968
fa9a63c5
RM
969 while (i < (1 << BYTEWIDTH))
970 {
971 if (fastmap[i++])
972 {
973 was_a_range = 0;
25fe55af
RS
974 putchar (i - 1);
975 while (i < (1 << BYTEWIDTH) && fastmap[i])
976 {
977 was_a_range = 1;
978 i++;
979 }
fa9a63c5 980 if (was_a_range)
25fe55af
RS
981 {
982 printf ("-");
983 putchar (i - 1);
984 }
985 }
fa9a63c5 986 }
5e69f11e 987 putchar ('\n');
fa9a63c5
RM
988}
989
990
991/* Print a compiled pattern string in human-readable form, starting at
992 the START pointer into it and ending just before the pointer END. */
993
994void
995print_partial_compiled_pattern (start, end)
01618498
SM
996 re_char *start;
997 re_char *end;
fa9a63c5
RM
998{
999 int mcnt, mcnt2;
01618498
SM
1000 re_char *p = start;
1001 re_char *pend = end;
fa9a63c5
RM
1002
1003 if (start == NULL)
1004 {
a1a052df 1005 fprintf (stderr, "(null)\n");
fa9a63c5
RM
1006 return;
1007 }
5e69f11e 1008
fa9a63c5
RM
1009 /* Loop over pattern commands. */
1010 while (p < pend)
1011 {
a1a052df 1012 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
1013
1014 switch ((re_opcode_t) *p++)
1015 {
25fe55af 1016 case no_op:
a1a052df 1017 fprintf (stderr, "/no_op");
25fe55af 1018 break;
fa9a63c5 1019
99633e97 1020 case succeed:
a1a052df 1021 fprintf (stderr, "/succeed");
99633e97
SM
1022 break;
1023
fa9a63c5
RM
1024 case exactn:
1025 mcnt = *p++;
a1a052df 1026 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 1027 do
fa9a63c5 1028 {
a1a052df 1029 fprintf (stderr, "/%c", *p++);
25fe55af
RS
1030 }
1031 while (--mcnt);
1032 break;
fa9a63c5
RM
1033
1034 case start_memory:
a1a052df 1035 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1036 break;
fa9a63c5
RM
1037
1038 case stop_memory:
a1a052df 1039 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1040 break;
fa9a63c5
RM
1041
1042 case duplicate:
a1a052df 1043 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1044 break;
1045
1046 case anychar:
a1a052df 1047 fprintf (stderr, "/anychar");
fa9a63c5
RM
1048 break;
1049
1050 case charset:
25fe55af
RS
1051 case charset_not:
1052 {
1053 register int c, last = -100;
fa9a63c5 1054 register int in_range = 0;
99633e97
SM
1055 int length = CHARSET_BITMAP_SIZE (p - 1);
1056 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1057
a1a052df 1058 fprintf (stderr, "/charset [%s",
839966f3 1059 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1060
839966f3
KH
1061 if (p + *p >= pend)
1062 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1063
25fe55af 1064 for (c = 0; c < 256; c++)
96cc36cc 1065 if (c / 8 < length
fa9a63c5
RM
1066 && (p[1 + (c/8)] & (1 << (c % 8))))
1067 {
1068 /* Are we starting a range? */
1069 if (last + 1 == c && ! in_range)
1070 {
a1a052df 1071 fprintf (stderr, "-");
fa9a63c5
RM
1072 in_range = 1;
1073 }
1074 /* Have we broken a range? */
1075 else if (last + 1 != c && in_range)
96cc36cc 1076 {
a1a052df 1077 fprintf (stderr, "%c", last);
fa9a63c5
RM
1078 in_range = 0;
1079 }
5e69f11e 1080
fa9a63c5 1081 if (! in_range)
a1a052df 1082 fprintf (stderr, "%c", c);
fa9a63c5
RM
1083
1084 last = c;
25fe55af 1085 }
fa9a63c5
RM
1086
1087 if (in_range)
a1a052df 1088 fprintf (stderr, "%c", last);
fa9a63c5 1089
a1a052df 1090 fprintf (stderr, "]");
fa9a63c5 1091
99633e97 1092 p += 1 + length;
96cc36cc 1093
96cc36cc 1094 if (has_range_table)
99633e97
SM
1095 {
1096 int count;
a1a052df 1097 fprintf (stderr, "has-range-table");
99633e97
SM
1098
1099 /* ??? Should print the range table; for now, just skip it. */
1100 p += 2; /* skip range table bits */
1101 EXTRACT_NUMBER_AND_INCR (count, p);
1102 p = CHARSET_RANGE_TABLE_END (p, count);
1103 }
fa9a63c5
RM
1104 }
1105 break;
1106
1107 case begline:
a1a052df 1108 fprintf (stderr, "/begline");
25fe55af 1109 break;
fa9a63c5
RM
1110
1111 case endline:
a1a052df 1112 fprintf (stderr, "/endline");
25fe55af 1113 break;
fa9a63c5
RM
1114
1115 case on_failure_jump:
25fe55af 1116 extract_number_and_incr (&mcnt, &p);
a1a052df 1117 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1118 break;
fa9a63c5
RM
1119
1120 case on_failure_keep_string_jump:
25fe55af 1121 extract_number_and_incr (&mcnt, &p);
a1a052df 1122 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1123 break;
fa9a63c5 1124
0683b6fa
SM
1125 case on_failure_jump_nastyloop:
1126 extract_number_and_incr (&mcnt, &p);
a1a052df 1127 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1128 break;
1129
505bde11 1130 case on_failure_jump_loop:
fa9a63c5 1131 extract_number_and_incr (&mcnt, &p);
a1a052df 1132 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1133 break;
1134
505bde11 1135 case on_failure_jump_smart:
fa9a63c5 1136 extract_number_and_incr (&mcnt, &p);
a1a052df 1137 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1138 break;
1139
25fe55af 1140 case jump:
fa9a63c5 1141 extract_number_and_incr (&mcnt, &p);
a1a052df 1142 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1143 break;
1144
25fe55af
RS
1145 case succeed_n:
1146 extract_number_and_incr (&mcnt, &p);
1147 extract_number_and_incr (&mcnt2, &p);
a1a052df 1148 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1149 break;
5e69f11e 1150
25fe55af
RS
1151 case jump_n:
1152 extract_number_and_incr (&mcnt, &p);
1153 extract_number_and_incr (&mcnt2, &p);
a1a052df 1154 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1155 break;
5e69f11e 1156
25fe55af
RS
1157 case set_number_at:
1158 extract_number_and_incr (&mcnt, &p);
1159 extract_number_and_incr (&mcnt2, &p);
a1a052df 1160 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1161 break;
5e69f11e 1162
25fe55af 1163 case wordbound:
a1a052df 1164 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1165 break;
1166
1167 case notwordbound:
a1a052df 1168 fprintf (stderr, "/notwordbound");
25fe55af 1169 break;
fa9a63c5
RM
1170
1171 case wordbeg:
a1a052df 1172 fprintf (stderr, "/wordbeg");
fa9a63c5 1173 break;
5e69f11e 1174
fa9a63c5 1175 case wordend:
a1a052df 1176 fprintf (stderr, "/wordend");
e2543b02 1177 break;
5e69f11e 1178
669fa600 1179 case symbeg:
e2543b02 1180 fprintf (stderr, "/symbeg");
669fa600
SM
1181 break;
1182
1183 case symend:
e2543b02 1184 fprintf (stderr, "/symend");
669fa600 1185 break;
5e69f11e 1186
1fb352e0 1187 case syntaxspec:
a1a052df 1188 fprintf (stderr, "/syntaxspec");
1fb352e0 1189 mcnt = *p++;
a1a052df 1190 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1191 break;
1192
1193 case notsyntaxspec:
a1a052df 1194 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1195 mcnt = *p++;
a1a052df 1196 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1197 break;
1198
0b32bf0e 1199# ifdef emacs
fa9a63c5 1200 case before_dot:
a1a052df 1201 fprintf (stderr, "/before_dot");
25fe55af 1202 break;
fa9a63c5
RM
1203
1204 case at_dot:
a1a052df 1205 fprintf (stderr, "/at_dot");
25fe55af 1206 break;
fa9a63c5
RM
1207
1208 case after_dot:
a1a052df 1209 fprintf (stderr, "/after_dot");
25fe55af 1210 break;
fa9a63c5 1211
1fb352e0 1212 case categoryspec:
a1a052df 1213 fprintf (stderr, "/categoryspec");
fa9a63c5 1214 mcnt = *p++;
a1a052df 1215 fprintf (stderr, "/%d", mcnt);
25fe55af 1216 break;
5e69f11e 1217
1fb352e0 1218 case notcategoryspec:
a1a052df 1219 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1220 mcnt = *p++;
a1a052df 1221 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1222 break;
0b32bf0e 1223# endif /* emacs */
fa9a63c5 1224
fa9a63c5 1225 case begbuf:
a1a052df 1226 fprintf (stderr, "/begbuf");
25fe55af 1227 break;
fa9a63c5
RM
1228
1229 case endbuf:
a1a052df 1230 fprintf (stderr, "/endbuf");
25fe55af 1231 break;
fa9a63c5 1232
25fe55af 1233 default:
a1a052df 1234 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1235 }
1236
a1a052df 1237 fprintf (stderr, "\n");
fa9a63c5
RM
1238 }
1239
a1a052df 1240 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1241}
1242
1243
1244void
1245print_compiled_pattern (bufp)
1246 struct re_pattern_buffer *bufp;
1247{
01618498 1248 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1249
1250 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1251 printf ("%ld bytes used/%ld bytes allocated.\n",
1252 bufp->used, bufp->allocated);
fa9a63c5
RM
1253
1254 if (bufp->fastmap_accurate && bufp->fastmap)
1255 {
1256 printf ("fastmap: ");
1257 print_fastmap (bufp->fastmap);
1258 }
1259
1260 printf ("re_nsub: %d\t", bufp->re_nsub);
1261 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1262 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1263 printf ("no_sub: %d\t", bufp->no_sub);
1264 printf ("not_bol: %d\t", bufp->not_bol);
1265 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1266 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1267 fflush (stdout);
fa9a63c5
RM
1268 /* Perhaps we should print the translate table? */
1269}
1270
1271
1272void
1273print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1274 re_char *where;
1275 re_char *string1;
1276 re_char *string2;
fa9a63c5
RM
1277 int size1;
1278 int size2;
1279{
4bb91c68 1280 int this_char;
5e69f11e 1281
fa9a63c5
RM
1282 if (where == NULL)
1283 printf ("(null)");
1284 else
1285 {
1286 if (FIRST_STRING_P (where))
25fe55af
RS
1287 {
1288 for (this_char = where - string1; this_char < size1; this_char++)
1289 putchar (string1[this_char]);
fa9a63c5 1290
25fe55af
RS
1291 where = string2;
1292 }
fa9a63c5
RM
1293
1294 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1295 putchar (string2[this_char]);
fa9a63c5
RM
1296 }
1297}
1298
1299#else /* not DEBUG */
1300
0b32bf0e
SM
1301# undef assert
1302# define assert(e)
fa9a63c5 1303
0b32bf0e
SM
1304# define DEBUG_STATEMENT(e)
1305# define DEBUG_PRINT1(x)
1306# define DEBUG_PRINT2(x1, x2)
1307# define DEBUG_PRINT3(x1, x2, x3)
1308# define DEBUG_PRINT4(x1, x2, x3, x4)
1309# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1310# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1311
1312#endif /* not DEBUG */
1313\f
1314/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1315 also be assigned to arbitrarily: each pattern buffer stores its own
1316 syntax, so it can be changed between regex compilations. */
1317/* This has no initializer because initialized variables in Emacs
1318 become read-only after dumping. */
1319reg_syntax_t re_syntax_options;
1320
1321
1322/* Specify the precise syntax of regexps for compilation. This provides
1323 for compatibility for various utilities which historically have
1324 different, incompatible syntaxes.
1325
1326 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1327 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1328
1329reg_syntax_t
1330re_set_syntax (syntax)
f9b0fd99 1331 reg_syntax_t syntax;
fa9a63c5
RM
1332{
1333 reg_syntax_t ret = re_syntax_options;
5e69f11e 1334
fa9a63c5
RM
1335 re_syntax_options = syntax;
1336 return ret;
1337}
c0f9ea08 1338WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1339
1340/* Regexp to use to replace spaces, or NULL meaning don't. */
1341static re_char *whitespace_regexp;
1342
1343void
1344re_set_whitespace_regexp (regexp)
6470ea05 1345 const char *regexp;
f9b0fd99 1346{
6470ea05 1347 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1348}
1349WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1350\f
1351/* This table gives an error message for each of the error codes listed
4bb91c68 1352 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1353 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1354 but why not be nice? */
fa9a63c5
RM
1355
1356static const char *re_error_msgid[] =
5e69f11e
RM
1357 {
1358 gettext_noop ("Success"), /* REG_NOERROR */
1359 gettext_noop ("No match"), /* REG_NOMATCH */
1360 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1361 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1362 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1363 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1364 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1365 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1366 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1367 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1368 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1369 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1370 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1371 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1372 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1373 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1374 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1375 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1376 };
1377\f
4bb91c68 1378/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1379
1380/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1381 searching and matching functions should not call alloca. On some
1382 systems, alloca is implemented in terms of malloc, and if we're
1383 using the relocating allocator routines, then malloc could cause a
1384 relocation, which might (if the strings being searched are in the
1385 ralloc heap) shift the data out from underneath the regexp
1386 routines.
1387
5e69f11e 1388 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1389 processes input from X in a signal handler; processing X input may
1390 call malloc; if input arrives while a matching routine is calling
1391 malloc, then we're scrod. But Emacs can't just block input while
1392 calling matching routines; then we don't notice interrupts when
1393 they come in. So, Emacs blocks input around all regexp calls
1394 except the matching calls, which it leaves unprotected, in the
1395 faith that they will not malloc. */
1396
1397/* Normally, this is fine. */
1398#define MATCH_MAY_ALLOCATE
1399
1400/* When using GNU C, we are not REALLY using the C alloca, no matter
1401 what config.h may say. So don't take precautions for it. */
1402#ifdef __GNUC__
0b32bf0e 1403# undef C_ALLOCA
fa9a63c5
RM
1404#endif
1405
1406/* The match routines may not allocate if (1) they would do it with malloc
1407 and (2) it's not safe for them to use malloc.
1408 Note that if REL_ALLOC is defined, matching would not use malloc for the
1409 failure stack, but we would still use it for the register vectors;
4bb91c68 1410 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1411#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1412# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1413#endif
1414
1415\f
1416/* Failure stack declarations and macros; both re_compile_fastmap and
1417 re_match_2 use a failure stack. These have to be macros because of
1418 REGEX_ALLOCATE_STACK. */
5e69f11e 1419
fa9a63c5 1420
320a2a73 1421/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1422 when matching. If this number is exceeded, we allocate more
1423 space, so it is not a hard limit. */
1424#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1425# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1426#endif
1427
1428/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1429 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1430 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1431 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1432 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1433# if defined MATCH_MAY_ALLOCATE
1434/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1435 whose default stack limit is 2mb. In order for a larger
1436 value to work reliably, you have to try to make it accord
1437 with the process stack limit. */
c0f9ea08
SM
1438size_t re_max_failures = 40000;
1439# else
1440size_t re_max_failures = 4000;
1441# endif
fa9a63c5
RM
1442
1443union fail_stack_elt
1444{
01618498 1445 re_char *pointer;
c0f9ea08
SM
1446 /* This should be the biggest `int' that's no bigger than a pointer. */
1447 long integer;
fa9a63c5
RM
1448};
1449
1450typedef union fail_stack_elt fail_stack_elt_t;
1451
1452typedef struct
1453{
1454 fail_stack_elt_t *stack;
c0f9ea08
SM
1455 size_t size;
1456 size_t avail; /* Offset of next open position. */
1457 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1458} fail_stack_type;
1459
505bde11 1460#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1461#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1462
1463
1464/* Define macros to initialize and free the failure stack.
1465 Do `return -2' if the alloc fails. */
1466
1467#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1468# define INIT_FAIL_STACK() \
fa9a63c5
RM
1469 do { \
1470 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1471 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1472 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1473 \
1474 if (fail_stack.stack == NULL) \
1475 return -2; \
1476 \
1477 fail_stack.size = INIT_FAILURE_ALLOC; \
1478 fail_stack.avail = 0; \
505bde11 1479 fail_stack.frame = 0; \
fa9a63c5
RM
1480 } while (0)
1481
0b32bf0e 1482# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1483#else
0b32bf0e 1484# define INIT_FAIL_STACK() \
fa9a63c5
RM
1485 do { \
1486 fail_stack.avail = 0; \
505bde11 1487 fail_stack.frame = 0; \
fa9a63c5
RM
1488 } while (0)
1489
0b32bf0e 1490# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1491#endif
1492
1493
320a2a73
KH
/* Grow FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a limit
   which allows approximately `re_max_failures' items.

   Return 1 if it succeeds, and 0 if we either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */
fa9a63c5 1501
320a2a73
KH
/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
1507#define FAIL_STACK_GROWTH_FACTOR 4
1508
1509#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1510 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1511 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1512 ? 0 \
320a2a73
KH
1513 : ((fail_stack).stack \
1514 = (fail_stack_elt_t *) \
25fe55af
RS
1515 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1516 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1517 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1518 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1519 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1520 \
1521 (fail_stack).stack == NULL \
1522 ? 0 \
6453db45
KH
1523 : ((fail_stack).size \
1524 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1525 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1526 * FAIL_STACK_GROWTH_FACTOR)) \
1527 / sizeof (fail_stack_elt_t)), \
25fe55af 1528 1)))
fa9a63c5
RM
1529
1530
fa9a63c5
RM
1531/* Push a pointer value onto the failure stack.
1532 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1533 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1534#define PUSH_FAILURE_POINTER(item) \
01618498 1535 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1536
1537/* This pushes an integer-valued item onto the failure stack.
1538 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1539 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1540#define PUSH_FAILURE_INT(item) \
1541 fail_stack.stack[fail_stack.avail++].integer = (item)
1542
1543/* Push a fail_stack_elt_t value onto the failure stack.
1544 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1545 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1546#define PUSH_FAILURE_ELT(item) \
1547 fail_stack.stack[fail_stack.avail++] = (item)
1548
1549/* These three POP... operations complement the three PUSH... operations.
1550 All assume that `fail_stack' is nonempty. */
1551#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1552#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1553#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1554
505bde11
SM
1555/* Individual items aside from the registers. */
1556#define NUM_NONREG_ITEMS 3
1557
1558/* Used to examine the stack (to detect infinite loops). */
1559#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1560#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1561#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1562#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1563
1564
505bde11
SM
/* Grow the failure stack until more than SPACE slots are free.
   Does `return -2' from the enclosing function if the stack cannot
   be grown.  Assumes the variable `fail_stack'; GROW_FAIL_STACK also
   requires `destination' be declared.

   Wrapped in `do ... while (0)' so that the expansion plus the
   caller's semicolon form exactly one statement.  The sizes are
   size_t, so they are cast to int to match the %d conversions.  */
#define ENSURE_FAIL_STACK(space)                                        \
do {                                                                    \
  while (REMAINING_AVAIL_SLOTS <= (space))                              \
    {                                                                   \
      if (!GROW_FAIL_STACK (fail_stack))                                \
        return -2;                                                      \
      DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n",                 \
                    (int) (fail_stack).size);                           \
      DEBUG_PRINT2 (" slots available: %d\n",                           \
                    (int) REMAINING_AVAIL_SLOTS);                       \
    }                                                                   \
} while (0)
1572
1573/* Push register NUM onto the stack. */
1574#define PUSH_FAILURE_REG(num) \
1575do { \
1576 char *destination; \
1577 ENSURE_FAIL_STACK(3); \
1578 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1579 num, regstart[num], regend[num]); \
1580 PUSH_FAILURE_POINTER (regstart[num]); \
1581 PUSH_FAILURE_POINTER (regend[num]); \
1582 PUSH_FAILURE_INT (num); \
1583} while (0)
1584
01618498
SM
1585/* Change the counter's value to VAL, but make sure that it will
1586 be reset when backtracking. */
1587#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1588do { \
1589 char *destination; \
1590 int c; \
1591 ENSURE_FAIL_STACK(3); \
1592 EXTRACT_NUMBER (c, ptr); \
01618498 1593 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1594 PUSH_FAILURE_INT (c); \
1595 PUSH_FAILURE_POINTER (ptr); \
1596 PUSH_FAILURE_INT (-1); \
01618498 1597 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1598} while (0)
1599
505bde11 1600/* Pop a saved register off the stack. */
dc1e502d 1601#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1602do { \
1603 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1604 if (reg == -1) \
1605 { \
1606 /* It's a counter. */ \
6dcf2d0e
SM
1607 /* Here, we discard `const', making re_match non-reentrant. */ \
1608 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1609 reg = POP_FAILURE_INT (); \
1610 STORE_NUMBER (ptr, reg); \
1611 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1612 } \
1613 else \
1614 { \
1615 regend[reg] = POP_FAILURE_POINTER (); \
1616 regstart[reg] = POP_FAILURE_POINTER (); \
1617 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1618 reg, regstart[reg], regend[reg]); \
1619 } \
505bde11
SM
1620} while (0)
1621
/* Check that we are not stuck in an infinite loop.

   Walk the chain of failure frames, starting at the top one, for as
   long as the frame's saved string position equals STRING_PLACE or is
   NULL.  If one of those frames saved the pattern position PAT_CUR,
   set `cycle' to 1.

   Assumes the variables `fail_stack', `bufp' and `cycle'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)                      \
do {                                                                    \
  int failure = TOP_FAILURE_HANDLE ();                                  \
  /* Check for infinite matching loops */                               \
  while (failure > 0                                                    \
         && (FAILURE_STR (failure) == (string_place)                    \
             || FAILURE_STR (failure) == NULL))                         \
    {                                                                   \
      assert (FAILURE_PAT (failure) >= bufp->buffer                     \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
      if (FAILURE_PAT (failure) == (pat_cur))                           \
        {                                                               \
          cycle = 1;                                                    \
          break;                                                        \
        }                                                               \
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure));     \
      failure = NEXT_FAILURE_HANDLE(failure);                           \
    }                                                                   \
  /* A handle of 0 means the chain was exhausted; FAILURE_STR (0)       \
     would index before the start of the stack, so skip the print.  */  \
  if (failure > 0)                                                      \
    {                                                                   \
      DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure));      \
    }                                                                   \
} while (0)
6df42991 1643
fa9a63c5 1644/* Push the information about the state we will need
5e69f11e
RM
1645 if we ever fail back to it.
1646
505bde11 1647 Requires variables fail_stack, regstart, regend and
320a2a73 1648 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1649 declared.
5e69f11e 1650
fa9a63c5
RM
1651 Does `return FAILURE_CODE' if runs out of memory. */
1652
505bde11
SM
1653#define PUSH_FAILURE_POINT(pattern, string_place) \
1654do { \
1655 char *destination; \
1656 /* Must be int, so when we don't save any registers, the arithmetic \
1657 of 0 + -1 isn't done as unsigned. */ \
1658 \
505bde11 1659 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1660 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1661 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1662 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1663 \
1664 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1665 \
1666 DEBUG_PRINT1 ("\n"); \
1667 \
1668 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1669 PUSH_FAILURE_INT (fail_stack.frame); \
1670 \
1671 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1672 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1673 DEBUG_PRINT1 ("'\n"); \
1674 PUSH_FAILURE_POINTER (string_place); \
1675 \
1676 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1677 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1678 PUSH_FAILURE_POINTER (pattern); \
1679 \
1680 /* Close the frame by moving the frame pointer past it. */ \
1681 fail_stack.frame = fail_stack.avail; \
1682} while (0)
fa9a63c5 1683
320a2a73
KH
1684/* Estimate the size of data pushed by a typical failure stack entry.
1685 An estimate is all we need, because all we use this for
1686 is to choose a limit for how big to make the failure stack. */
ada30c0e 1687/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1688#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1689
fa9a63c5
RM
1690/* How many items can still be added to the stack without overflowing it. */
1691#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1692
1693
/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.

   The arrays of string positions `regstart' and `regend' are not
   parameters: they are restored as variables, via
   POP_FAILURE_REG_OR_COUNT.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */
fa9a63c5 1703
505bde11
SM
1704#define POP_FAILURE_POINT(str, pat) \
1705do { \
fa9a63c5
RM
1706 assert (!FAIL_STACK_EMPTY ()); \
1707 \
1708 /* Remove failure points and point to how many regs pushed. */ \
1709 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1710 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1711 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1712 \
505bde11
SM
1713 /* Pop the saved registers. */ \
1714 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1715 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1716 \
01618498 1717 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1718 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1719 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1720 \
1721 /* If the saved string location is NULL, it came from an \
1722 on_failure_keep_string_jump opcode, and we want to throw away the \
1723 saved NULL, thus retaining our current position in the string. */ \
01618498 1724 str = POP_FAILURE_POINTER (); \
505bde11 1725 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1726 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1727 DEBUG_PRINT1 ("'\n"); \
1728 \
505bde11
SM
1729 fail_stack.frame = POP_FAILURE_INT (); \
1730 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1731 \
505bde11
SM
1732 assert (fail_stack.avail >= 0); \
1733 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1734 \
fa9a63c5 1735 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1736} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1737
1738
1739\f
fa9a63c5 1740/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1741#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1742\f
1743/* Subroutine declarations and macros for regex_compile. */
1744
4bb91c68
SM
1745static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1746 reg_syntax_t syntax,
1747 struct re_pattern_buffer *bufp));
1748static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1749static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1750 int arg1, int arg2));
1751static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1752 int arg, unsigned char *end));
1753static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1754 int arg1, int arg2, unsigned char *end));
01618498
SM
1755static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1756 re_char *p,
4bb91c68 1757 reg_syntax_t syntax));
01618498
SM
1758static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1759 re_char *pend,
4bb91c68 1760 reg_syntax_t syntax));
01618498
SM
1761static re_char *skip_one_char _RE_ARGS ((re_char *p));
1762static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1763 char *fastmap, const int multibyte));
fa9a63c5 1764
/* Read the next character of the uncompiled pattern into C, with no
   translation, and advance `p' past it.  If `p' has already reached
   `pend', make the enclosing function return REG_EEND instead.  */
#define PATFETCH(c)							\
  do {									\
    int patfetch_len_;							\
    if (p == pend)							\
      return REG_EEND;							\
    c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, patfetch_len_, multibyte); \
    p += patfetch_len_;							\
  } while (0)
1774
fa9a63c5
RM
1775
1776/* If `translate' is non-null, return translate[D], else just D. We
1777 cast the subscript to translate because some data is declared as
1778 `char *', to avoid warnings when a string constant is passed. But
1779 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1780#ifndef TRANSLATE
0b32bf0e 1781# define TRANSLATE(d) \
66f0296e 1782 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1783#endif
fa9a63c5
RM
1784
1785
1786/* Macros for outputting the compiled pattern into `buffer'. */
1787
1788/* If the buffer isn't allocated when it comes in, use this. */
1789#define INIT_BUF_SIZE 32
1790
/* Keep extending the pattern buffer until at least N more bytes fit
   after the current append position `b'.  */
#define GET_BUFFER_SPACE(n)						\
    while (bufp->allocated < (size_t) (b - bufp->buffer + (n)))		\
      EXTEND_BUFFER ()
1795
/* Append the single byte C to the pattern buffer, growing the buffer
   first if it is full.  */
#define BUF_PUSH(c)							\
  do {									\
    GET_BUFFER_SPACE (1);						\
    *b = (unsigned char) (c);						\
    b++;								\
  } while (0)
1802
1803
/* Append the two bytes C1 and C2, in that order, after making sure the
   pattern buffer has room for both.  */
#define BUF_PUSH_2(c1, c2)						\
  do {									\
    GET_BUFFER_SPACE (2);						\
    *b = (unsigned char) (c1);						\
    b++;								\
    *b = (unsigned char) (c2);						\
    b++;								\
  } while (0)
1811
1812
/* Append the three bytes C1, C2 and C3, in that order, after making
   sure the pattern buffer has room for all of them.  */
#define BUF_PUSH_3(c1, c2, c3)						\
  do {									\
    GET_BUFFER_SPACE (3);						\
    *b = (unsigned char) (c1);						\
    b++;								\
    *b = (unsigned char) (c2);						\
    b++;								\
    *b = (unsigned char) (c3);						\
    b++;								\
  } while (0)
1821
1822
1823/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1824 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1825#define STORE_JUMP(op, loc, to) \
1826 store_op1 (op, loc, (to) - (loc) - 3)
1827
1828/* Likewise, for a two-argument jump. */
1829#define STORE_JUMP2(op, loc, to, arg) \
1830 store_op2 (op, loc, (to) - (loc) - 3, arg)
1831
4bb91c68 1832/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1833#define INSERT_JUMP(op, loc, to) \
1834 insert_op1 (op, loc, (to) - (loc) - 3, b)
1835
1836/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1837#define INSERT_JUMP2(op, loc, to, arg) \
1838 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1839
1840
1841/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1842 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1843 be too small, many things would have to change. */
839966f3
KH
1844# define MAX_BUF_SIZE (1L << 15)
1845
1846#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1847/* Any other compiler which, like MSC, has allocation limit below 2^16
1848 bytes will have to use approach similar to what was done below for
1849 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1850 reallocating to 0 bytes. Such thing is not going to work too well.
1851 You have been warned!! */
1852#if defined _MSC_VER && !defined WIN32
1853/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1854# define MAX_BUF_SIZE 65500L
1855#else
1856# define MAX_BUF_SIZE (1L << 16)
1857#endif
839966f3 1858#endif /* 0 */
fa9a63c5
RM
1859
/* Extend the buffer by twice its current size via realloc and
   reset the pointers that pointed into the old block to point to the
   correct places in the new one.  If extending the buffer results in it
   being larger than MAX_BUF_SIZE, then flag memory exhausted.

   Returns REG_ESIZE from the enclosing function when the buffer is
   already MAX_BUF_SIZE bytes, and REG_ESPACE when the reallocation
   yields a null buffer.

   Pointers are relocated by keeping their byte offset from the start of
   the old block and applying it to the new block.  That offset is
   bounded by the buffer size, whereas the distance between the old and
   the new block is the difference of two unrelated addresses and need
   not fit in an `int' on hosts with 64-bit pointers.  */
#if __BOUNDED_POINTERS__
# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
# define MOVE_BUFFER_POINTER(P)						\
  (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer),		\
   SET_HIGH_BOUND (P),							\
   __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND					\
  else									\
    {									\
      SET_HIGH_BOUND (b);						\
      SET_HIGH_BOUND (begalt);						\
      if (fixup_alt_jump)						\
	SET_HIGH_BOUND (fixup_alt_jump);				\
      if (laststart)							\
	SET_HIGH_BOUND (laststart);					\
      if (pending_exact)						\
	SET_HIGH_BOUND (pending_exact);					\
    }
#else
# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
#define EXTEND_BUFFER()							\
  do {									\
    unsigned char *old_buffer = bufp->buffer;				\
    if (bufp->allocated == MAX_BUF_SIZE)				\
      return REG_ESIZE;							\
    bufp->allocated <<= 1;						\
    if (bufp->allocated > MAX_BUF_SIZE)					\
      bufp->allocated = MAX_BUF_SIZE;					\
    RETALLOC (bufp->buffer, bufp->allocated, unsigned char);		\
    if (bufp->buffer == NULL)						\
      return REG_ESPACE;						\
    /* If the buffer moved, move all the pointers into it.  */		\
    if (old_buffer != bufp->buffer)					\
      {									\
	unsigned char *new_buffer = bufp->buffer;			\
	MOVE_BUFFER_POINTER (b);					\
	MOVE_BUFFER_POINTER (begalt);					\
	/* The remaining pointers are optional; null means unset.  */	\
	if (fixup_alt_jump)						\
	  MOVE_BUFFER_POINTER (fixup_alt_jump);				\
	if (laststart)							\
	  MOVE_BUFFER_POINTER (laststart);				\
	if (pending_exact)						\
	  MOVE_BUFFER_POINTER (pending_exact);				\
      }									\
    ELSE_EXTEND_BUFFER_HIGH_BOUND					\
  } while (0)
1910
1911
1912/* Since we have one byte reserved for the register number argument to
1913 {start,stop}_memory, the maximum number of groups we can report
1914 things about is what fits in that byte. */
1915#define MAX_REGNUM 255
1916
1917/* But patterns can have more than `MAX_REGNUM' registers. We just
1918 ignore the excess. */
098d42af 1919typedef int regnum_t;
fa9a63c5
RM
1920
1921
1922/* Macros for the compile stack. */
1923
1924/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1925 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1926/* int may be not enough when sizeof(int) == 2. */
1927typedef long pattern_offset_t;
fa9a63c5
RM
1928
1929typedef struct
1930{
1931 pattern_offset_t begalt_offset;
1932 pattern_offset_t fixup_alt_jump;
5e69f11e 1933 pattern_offset_t laststart_offset;
fa9a63c5
RM
1934 regnum_t regnum;
1935} compile_stack_elt_t;
1936
1937
1938typedef struct
1939{
1940 compile_stack_elt_t *stack;
1941 unsigned size;
1942 unsigned avail; /* Offset of next open position. */
1943} compile_stack_type;
1944
1945
1946#define INIT_COMPILE_STACK_SIZE 32
1947
1948#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1949#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1950
4bb91c68 1951/* The next available element. */
fa9a63c5
RM
1952#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1953
1cee1e27
SM
1954/* Explicit quit checking is only used on NTemacs and whenever we
1955 use polling to process input events. */
1956#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1957extern int immediate_quit;
1958# define IMMEDIATE_QUIT_CHECK \
1959 do { \
1960 if (immediate_quit) QUIT; \
1961 } while (0)
1962#else
1963# define IMMEDIATE_QUIT_CHECK ((void)0)
1964#endif
1965\f
b18215fc
RS
1966/* Structure to manage work area for range table. */
1967struct range_table_work_area
1968{
1969 int *table; /* actual work area. */
1970 int allocated; /* allocated size for work area in bytes. */
7814e705 1971 int used; /* actually used size in words. */
96cc36cc 1972 int bits; /* flag to record character classes */
b18215fc
RS
1973};
1974
77d11aec
RS
/* Make sure that WORK_AREA has room for N more table entries.
   WORK_AREA must be a `struct range_table_work_area' lvalue, not a
   pointer: its members are accessed with `.' and its address is passed
   to extend_range_table_work_area.
   The table is extended repeatedly until N entries fit, so N may be
   larger than a single growth step.
   If it can't get the space, it returns REG_ESPACE from the
   surrounding function.  */

#define EXTEND_RANGE_TABLE(work_area, n)				\
  do {									\
    while (((work_area).used + (n)) * sizeof (int)			\
	   > (work_area).allocated)					\
      {									\
	extend_range_table_work_area (&(work_area));			\
	/* A null table is how the helper reports failure.  */		\
	if ((work_area).table == 0)					\
	  return (REG_ESPACE);						\
      }									\
  } while (0)
1989
96cc36cc
RS
1990#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1991 (work_area).bits |= (bit)
1992
14473664
SM
1993/* Bits used to implement the multibyte-part of the various character classes
1994 such as [:alnum:] in a charset's range table. */
1995#define BIT_WORD 0x1
1996#define BIT_LOWER 0x2
1997#define BIT_PUNCT 0x4
1998#define BIT_SPACE 0x8
1999#define BIT_UPPER 0x10
2000#define BIT_MULTIBYTE 0x20
96cc36cc 2001
b18215fc
RS
/* Append the pair (RANGE_START, RANGE_END) to the table of WORK_AREA,
   extending the table first when it lacks room for two more entries.  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\
  do {									\
    EXTEND_RANGE_TABLE ((work_area), 2);				\
    (work_area).table[(work_area).used] = (range_start);		\
    (work_area).used++;							\
    (work_area).table[(work_area).used] = (range_end);			\
    (work_area).used++;							\
  } while (0)
2009
7814e705 2010/* Free allocated memory for WORK_AREA. */
b18215fc
RS
2011#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
2012 do { \
2013 if ((work_area).table) \
2014 free ((work_area).table); \
2015 } while (0)
2016
96cc36cc 2017#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 2018#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 2019#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 2020#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 2021\f
b18215fc 2022
fa9a63c5 2023/* Set the bit for character C in a list. */
01618498 2024#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
2025
2026
bf216479
KH
2027#ifdef emacs
2028
cf9c99bc
KH
2029/* Store characters in the range FROM to TO in the bitmap at B (for
2030 ASCII and unibyte characters) and WORK_AREA (for multibyte
2031 characters) while translating them and paying attention to the
2032 continuity of translated characters.
8f924df7 2033
cf9c99bc
KH
2034 Implementation note: It is better to implement these fairly big
2035 macros by a function, but it's not that easy because macros called
8f924df7 2036 in this macro assume various local variables already declared. */
bf216479 2037
cf9c99bc
KH
/* Both FROM and TO are ASCII characters.

   Each character of the range is translated.  A translation that is
   not ASCII is recorded in WORK_AREA as a one-character range, and the
   bitmap bit set is that of its unibyte equivalent, or of the
   untranslated character when there is no such equivalent.  */

#define SETUP_ASCII_RANGE(work_area, FROM, TO)				\
  do {									\
    int src_, dst_;							\
									\
    for (src_ = (FROM); src_ <= (TO); src_++)				\
      {									\
	dst_ = TRANSLATE (src_);					\
	if (! ASCII_CHAR_P (dst_))					\
	  {								\
	    SET_RANGE_TABLE_WORK_AREA ((work_area), dst_, dst_);	\
	    dst_ = RE_CHAR_TO_UNIBYTE (dst_);				\
	    if (dst_ < 0)						\
	      dst_ = src_;						\
	  }								\
	SET_LIST_BIT (dst_);						\
      }									\
  } while (0)
2056
2057
/* Both FROM and TO are unibyte characters (0x80..0xFF).

   A character whose multibyte form satisfies CHAR_BYTE8_P only gets its
   own bitmap bit.  Any other character is translated: the bitmap bit
   set is that of the translation's unibyte equivalent (or of the
   character itself when it is unchanged or has no such equivalent), and
   the translated character is merged into a range added to WORK_AREA
   by this macro when it lies inside or right next to one, or appended
   as a new one-character range otherwise.  */

#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)			       \
  do {									       \
    int src_, mb_, tr_, bit_, idx_;					       \
    /* Ranges below this index existed before this macro ran.  */	       \
    int base_ = RANGE_TABLE_WORK_USED (work_area);			       \
									       \
    for (src_ = (FROM); src_ <= (TO); src_++)				       \
      {									       \
	mb_ = RE_CHAR_TO_MULTIBYTE (src_);				       \
	if (CHAR_BYTE8_P (mb_))						       \
	  {								       \
	    SET_LIST_BIT (src_);					       \
	    continue;							       \
	  }								       \
	tr_ = TRANSLATE (mb_);						       \
	bit_ = src_;							       \
	if (tr_ != mb_)							       \
	  {								       \
	    bit_ = RE_CHAR_TO_UNIBYTE (tr_);				       \
	    if (bit_ < 0)						       \
	      bit_ = src_;						       \
	  }								       \
	SET_LIST_BIT (bit_);						       \
	for (idx_ = RANGE_TABLE_WORK_USED (work_area) - 2; idx_ >= base_;     \
	     idx_ -= 2)							       \
	  {								       \
	    int lo_ = RANGE_TABLE_WORK_ELT (work_area, idx_);		       \
	    int hi_ = RANGE_TABLE_WORK_ELT (work_area, idx_ + 1);	       \
									       \
	    if (tr_ < lo_ - 1 || tr_ > hi_ + 1)				       \
	      continue;							       \
	    /* Inside or adjacent: widen the range by one if needed.  */      \
	    if (tr_ == lo_ - 1)						       \
	      RANGE_TABLE_WORK_ELT (work_area, idx_)--;			       \
	    else if (tr_ == hi_ + 1)					       \
	      RANGE_TABLE_WORK_ELT (work_area, idx_ + 1)++;		       \
	    break;							       \
	  }								       \
	/* The scan ran off the bottom: no range could take it.  */	       \
	if (idx_ < base_)						       \
	  SET_RANGE_TABLE_WORK_AREA ((work_area), tr_, tr_);		       \
      }									       \
  } while (0)
2096
2097
/* Both FROM and TO are multibyte characters.

   The range FROM..TO itself is recorded in WORK_AREA first.  Then each
   character is translated: a bitmap bit is set for the unibyte
   equivalent of the translation, or failing that of the character
   itself when the translation changed it; and a translation that falls
   outside FROM..TO is merged into a range added by this macro when it
   lies inside or right next to one, or appended as a new one-character
   range otherwise.  */

#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)			       \
  do {									       \
    int src_, tr_, uni_, idx_;						       \
    /* Ranges below this index existed before this macro ran.  */	       \
    int base_ = RANGE_TABLE_WORK_USED (work_area);			       \
									       \
    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));		       \
    for (src_ = (FROM); src_ <= (TO); src_++)				       \
      {									       \
	tr_ = TRANSLATE (src_);						       \
	uni_ = RE_CHAR_TO_UNIBYTE (tr_);				       \
	if (uni_ < 0 && tr_ != src_)					       \
	  uni_ = RE_CHAR_TO_UNIBYTE (src_);				       \
	if (uni_ >= 0)							       \
	  SET_LIST_BIT (uni_);						       \
	if (tr_ < (FROM) || tr_ > (TO))					       \
	  {								       \
	    for (idx_ = RANGE_TABLE_WORK_USED (work_area) - 2;		       \
		 idx_ >= base_; idx_ -= 2)				       \
	      {								       \
		int lo_ = RANGE_TABLE_WORK_ELT (work_area, idx_);	       \
		int hi_ = RANGE_TABLE_WORK_ELT (work_area, idx_ + 1);	       \
									       \
		if (tr_ < lo_ - 1 || tr_ > hi_ + 1)			       \
		  continue;						       \
		/* Inside or adjacent: widen the range if needed.  */	       \
		if (tr_ == lo_ - 1)					       \
		  RANGE_TABLE_WORK_ELT (work_area, idx_)--;		       \
		else if (tr_ == hi_ + 1)				       \
		  RANGE_TABLE_WORK_ELT (work_area, idx_ + 1)++;		       \
		break;							       \
	      }								       \
	    /* The scan ran off the bottom: no range could take it.  */       \
	    if (idx_ < base_)						       \
	      SET_RANGE_TABLE_WORK_AREA ((work_area), tr_, tr_);	       \
	  }								       \
      }									       \
  } while (0)
2131
2132#endif /* emacs */
2133
/* Parse a run of decimal digits from the uncompiled pattern into NUM.
   The first non-digit character read is left in `c'.  NUM is left
   untouched when no digit is found; a negative NUM is reset to zero
   before the first digit is accumulated.
   Returns REG_EBRACE from the enclosing function if the pattern ends
   before or right after a digit, and REG_BADBR if the value does not
   survive the multiply-by-ten round trip.  */
#define GET_UNSIGNED_NUMBER(num)					\
  do {									\
    if (p == pend)							\
      FREE_STACK_RETURN (REG_EBRACE);					\
    PATFETCH (c);							\
    while ('0' <= c && c <= '9')					\
      {									\
	int before_;							\
	if (num < 0)							\
	  num = 0;							\
	before_ = num;							\
	num = num * 10 + c - '0';					\
	if (num / 10 != before_)					\
	  FREE_STACK_RETURN (REG_BADBR);				\
	if (p == pend)							\
	  FREE_STACK_RETURN (REG_EBRACE);				\
	PATFETCH (c);							\
      }									\
  } while (0)
77d11aec 2157\f
1fdab503 2158#if ! WIDE_CHAR_SUPPORT
01618498 2159
14473664 2160/* Map a string to the char class it names (if any). */
1fdab503 2161re_wctype_t
ada30c0e
SM
2162re_wctype (str)
2163 re_char *str;
14473664 2164{
ada30c0e 2165 const char *string = str;
14473664
SM
2166 if (STREQ (string, "alnum")) return RECC_ALNUM;
2167 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2168 else if (STREQ (string, "word")) return RECC_WORD;
2169 else if (STREQ (string, "ascii")) return RECC_ASCII;
2170 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2171 else if (STREQ (string, "graph")) return RECC_GRAPH;
2172 else if (STREQ (string, "lower")) return RECC_LOWER;
2173 else if (STREQ (string, "print")) return RECC_PRINT;
2174 else if (STREQ (string, "punct")) return RECC_PUNCT;
2175 else if (STREQ (string, "space")) return RECC_SPACE;
2176 else if (STREQ (string, "upper")) return RECC_UPPER;
2177 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2178 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2179 else if (STREQ (string, "digit")) return RECC_DIGIT;
2180 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2181 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2182 else if (STREQ (string, "blank")) return RECC_BLANK;
2183 else return 0;
2184}
2185
2186/* True iff CH is in the char class CC. */
1fdab503 2187boolean
14473664
SM
2188re_iswctype (ch, cc)
2189 int ch;
2190 re_wctype_t cc;
2191{
2192 switch (cc)
2193 {
0cdd06f8
SM
2194 case RECC_ALNUM: return ISALNUM (ch);
2195 case RECC_ALPHA: return ISALPHA (ch);
2196 case RECC_BLANK: return ISBLANK (ch);
2197 case RECC_CNTRL: return ISCNTRL (ch);
2198 case RECC_DIGIT: return ISDIGIT (ch);
2199 case RECC_GRAPH: return ISGRAPH (ch);
2200 case RECC_LOWER: return ISLOWER (ch);
2201 case RECC_PRINT: return ISPRINT (ch);
2202 case RECC_PUNCT: return ISPUNCT (ch);
2203 case RECC_SPACE: return ISSPACE (ch);
2204 case RECC_UPPER: return ISUPPER (ch);
2205 case RECC_XDIGIT: return ISXDIGIT (ch);
2206 case RECC_ASCII: return IS_REAL_ASCII (ch);
2207 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2208 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2209 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2210 case RECC_WORD: return ISWORD (ch);
2211 case RECC_ERROR: return false;
2212 default:
2213 abort();
14473664
SM
2214 }
2215}
fa9a63c5 2216
14473664
SM
2217/* Return a bit-pattern to use in the range-table bits to match multibyte
2218 chars of class CC. */
2219static int
2220re_wctype_to_bit (cc)
2221 re_wctype_t cc;
2222{
2223 switch (cc)
2224 {
2225 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2226 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2227 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2228 case RECC_LOWER: return BIT_LOWER;
2229 case RECC_UPPER: return BIT_UPPER;
2230 case RECC_PUNCT: return BIT_PUNCT;
2231 case RECC_SPACE: return BIT_SPACE;
14473664 2232 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2233 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2234 default:
2235 abort();
14473664
SM
2236 }
2237}
2238#endif
77d11aec
RS
2239\f
2240/* Filling in the work area of a range. */
2241
2242/* Actually extend the space in WORK_AREA. */
2243
2244static void
2245extend_range_table_work_area (work_area)
2246 struct range_table_work_area *work_area;
177c0ea7 2247{
77d11aec
RS
2248 work_area->allocated += 16 * sizeof (int);
2249 if (work_area->table)
2250 work_area->table
2251 = (int *) realloc (work_area->table, work_area->allocated);
2252 else
2253 work_area->table
2254 = (int *) malloc (work_area->allocated);
2255}
2256
8f924df7 2257#if 0
77d11aec
RS
2258#ifdef emacs
2259
2260/* Carefully find the ranges of codes that are equivalent
2261 under case conversion to the range start..end when passed through
2262 TRANSLATE. Handle the case where non-letters can come in between
2263 two upper-case letters (which happens in Latin-1).
2264 Also handle the case of groups of more than 2 case-equivalent chars.
2265
2266 The basic method is to look at consecutive characters and see
2267 if they can form a run that can be handled as one.
2268
2269 Returns -1 if successful, REG_ESPACE if ran out of space. */
2270
2271static int
2272set_image_of_range_1 (work_area, start, end, translate)
2273 RE_TRANSLATE_TYPE translate;
2274 struct range_table_work_area *work_area;
2275 re_wchar_t start, end;
2276{
2277 /* `one_case' indicates a character, or a run of characters,
2278 each of which is an isolate (no case-equivalents).
2279 This includes all ASCII non-letters.
2280
2281 `two_case' indicates a character, or a run of characters,
2282 each of which has two case-equivalent forms.
2283 This includes all ASCII letters.
2284
2285 `strange' indicates a character that has more than one
2286 case-equivalent. */
177c0ea7 2287
77d11aec
RS
2288 enum case_type {one_case, two_case, strange};
2289
2290 /* Describe the run that is in progress,
2291 which the next character can try to extend.
2292 If run_type is strange, that means there really is no run.
2293 If run_type is one_case, then run_start...run_end is the run.
2294 If run_type is two_case, then the run is run_start...run_end,
2295 and the case-equivalents end at run_eqv_end. */
2296
2297 enum case_type run_type = strange;
2298 int run_start, run_end, run_eqv_end;
2299
2300 Lisp_Object eqv_table;
2301
2302 if (!RE_TRANSLATE_P (translate))
2303 {
b7c12565 2304 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2305 work_area->table[work_area->used++] = (start);
2306 work_area->table[work_area->used++] = (end);
b7c12565 2307 return -1;
77d11aec
RS
2308 }
2309
2310 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2311
77d11aec
RS
2312 for (; start <= end; start++)
2313 {
2314 enum case_type this_type;
2315 int eqv = RE_TRANSLATE (eqv_table, start);
2316 int minchar, maxchar;
2317
2318 /* Classify this character */
2319 if (eqv == start)
2320 this_type = one_case;
2321 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2322 this_type = two_case;
2323 else
2324 this_type = strange;
2325
2326 if (start < eqv)
2327 minchar = start, maxchar = eqv;
2328 else
2329 minchar = eqv, maxchar = start;
2330
2331 /* Can this character extend the run in progress? */
2332 if (this_type == strange || this_type != run_type
2333 || !(minchar == run_end + 1
2334 && (run_type == two_case
2335 ? maxchar == run_eqv_end + 1 : 1)))
2336 {
2337 /* No, end the run.
2338 Record each of its equivalent ranges. */
2339 if (run_type == one_case)
2340 {
2341 EXTEND_RANGE_TABLE (work_area, 2);
2342 work_area->table[work_area->used++] = run_start;
2343 work_area->table[work_area->used++] = run_end;
2344 }
2345 else if (run_type == two_case)
2346 {
2347 EXTEND_RANGE_TABLE (work_area, 4);
2348 work_area->table[work_area->used++] = run_start;
2349 work_area->table[work_area->used++] = run_end;
2350 work_area->table[work_area->used++]
2351 = RE_TRANSLATE (eqv_table, run_start);
2352 work_area->table[work_area->used++]
2353 = RE_TRANSLATE (eqv_table, run_end);
2354 }
2355 run_type = strange;
2356 }
177c0ea7 2357
77d11aec
RS
2358 if (this_type == strange)
2359 {
2360 /* For a strange character, add each of its equivalents, one
2361 by one. Don't start a range. */
2362 do
2363 {
2364 EXTEND_RANGE_TABLE (work_area, 2);
2365 work_area->table[work_area->used++] = eqv;
2366 work_area->table[work_area->used++] = eqv;
2367 eqv = RE_TRANSLATE (eqv_table, eqv);
2368 }
2369 while (eqv != start);
2370 }
2371
2372 /* Add this char to the run, or start a new run. */
2373 else if (run_type == strange)
2374 {
2375 /* Initialize a new range. */
2376 run_type = this_type;
2377 run_start = start;
2378 run_end = start;
2379 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2380 }
2381 else
2382 {
2383 /* Extend a running range. */
2384 run_end = minchar;
2385 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2386 }
2387 }
2388
2389 /* If a run is still in progress at the end, finish it now
2390 by recording its equivalent ranges. */
2391 if (run_type == one_case)
2392 {
2393 EXTEND_RANGE_TABLE (work_area, 2);
2394 work_area->table[work_area->used++] = run_start;
2395 work_area->table[work_area->used++] = run_end;
2396 }
2397 else if (run_type == two_case)
2398 {
2399 EXTEND_RANGE_TABLE (work_area, 4);
2400 work_area->table[work_area->used++] = run_start;
2401 work_area->table[work_area->used++] = run_end;
2402 work_area->table[work_area->used++]
2403 = RE_TRANSLATE (eqv_table, run_start);
2404 work_area->table[work_area->used++]
2405 = RE_TRANSLATE (eqv_table, run_end);
2406 }
2407
2408 return -1;
2409}
36595814 2410
77d11aec 2411#endif /* emacs */
36595814 2412
/* Record the image of the range start..end when passed through
36595814
SM
2414 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2415 and is not even necessarily contiguous.
b7c12565
RS
2416 Normally we approximate it with the smallest contiguous range that contains
2417 all the chars we need. However, for Latin-1 we go to extra effort
2418 to do a better job.
2419
2420 This function is not called for ASCII ranges.
77d11aec
RS
2421
2422 Returns -1 if successful, REG_ESPACE if ran out of space. */
2423
2424static int
36595814
SM
2425set_image_of_range (work_area, start, end, translate)
2426 RE_TRANSLATE_TYPE translate;
2427 struct range_table_work_area *work_area;
2428 re_wchar_t start, end;
2429{
77d11aec
RS
2430 re_wchar_t cmin, cmax;
2431
2432#ifdef emacs
2433 /* For Latin-1 ranges, use set_image_of_range_1
2434 to get proper handling of ranges that include letters and nonletters.
b7c12565 2435 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2436 For other character sets, we don't bother to get this right. */
b7c12565
RS
2437 if (RE_TRANSLATE_P (translate) && start < 04400
2438 && !(start < 04200 && end >= 04377))
77d11aec 2439 {
b7c12565 2440 int newend;
77d11aec 2441 int tem;
b7c12565
RS
2442 newend = end;
2443 if (newend > 04377)
2444 newend = 04377;
2445 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2446 if (tem > 0)
2447 return tem;
2448
2449 start = 04400;
2450 if (end < 04400)
2451 return -1;
2452 }
2453#endif
2454
b7c12565
RS
2455 EXTEND_RANGE_TABLE (work_area, 2);
2456 work_area->table[work_area->used++] = (start);
2457 work_area->table[work_area->used++] = (end);
2458
2459 cmin = -1, cmax = -1;
77d11aec 2460
36595814 2461 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2462 {
2463 int ch;
77d11aec 2464
b7c12565
RS
2465 for (ch = start; ch <= end; ch++)
2466 {
2467 re_wchar_t c = TRANSLATE (ch);
2468 if (! (start <= c && c <= end))
2469 {
2470 if (cmin == -1)
2471 cmin = c, cmax = c;
2472 else
2473 {
2474 cmin = MIN (cmin, c);
2475 cmax = MAX (cmax, c);
2476 }
2477 }
2478 }
2479
2480 if (cmin != -1)
2481 {
2482 EXTEND_RANGE_TABLE (work_area, 2);
2483 work_area->table[work_area->used++] = (cmin);
2484 work_area->table[work_area->used++] = (cmax);
2485 }
2486 }
36595814 2487
77d11aec
RS
2488 return -1;
2489}
8f924df7 2490#endif /* 0 */
fa9a63c5
RM
2491\f
2492#ifndef MATCH_MAY_ALLOCATE
2493
2494/* If we cannot allocate large objects within re_match_2_internal,
2495 we make the fail stack and register vectors global.
2496 The fail stack, we grow to the maximum size when a regexp
2497 is compiled.
2498 The register vectors, we adjust in size each time we
2499 compile a regexp, according to the number of registers it needs. */
2500
2501static fail_stack_type fail_stack;
2502
2503/* Size with which the following vectors are currently allocated.
2504 That is so we can make them bigger as needed,
4bb91c68 2505 but never make them smaller. */
fa9a63c5
RM
2506static int regs_allocated_size;
2507
66f0296e
SM
2508static re_char ** regstart, ** regend;
2509static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2510
2511/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2512 but don't make them smaller. */
fa9a63c5
RM
2513
2514static
2515regex_grow_registers (num_regs)
2516 int num_regs;
2517{
2518 if (num_regs > regs_allocated_size)
2519 {
66f0296e
SM
2520 RETALLOC_IF (regstart, num_regs, re_char *);
2521 RETALLOC_IF (regend, num_regs, re_char *);
2522 RETALLOC_IF (best_regstart, num_regs, re_char *);
2523 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2524
2525 regs_allocated_size = num_regs;
2526 }
2527}
2528
2529#endif /* not MATCH_MAY_ALLOCATE */
2530\f
99633e97
SM
2531static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2532 compile_stack,
2533 regnum_t regnum));
2534
fa9a63c5
RM
2535/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2536 Returns one of error codes defined in `regex.h', or zero for success.
2537
2538 Assumes the `allocated' (and perhaps `buffer') and `translate'
2539 fields are set in BUFP on entry.
2540
2541 If it succeeds, results are put in BUFP (if it returns an error, the
2542 contents of BUFP are undefined):
2543 `buffer' is the compiled pattern;
2544 `syntax' is set to SYNTAX;
2545 `used' is set to the length of the compiled pattern;
2546 `fastmap_accurate' is zero;
2547 `re_nsub' is the number of subexpressions in PATTERN;
2548 `not_bol' and `not_eol' are zero;
5e69f11e 2549
c0f9ea08 2550 The `fastmap' field is neither examined nor set. */
fa9a63c5 2551
505bde11
SM
2552/* Insert the `jump' from the end of last alternative to "here".
2553 The space for the jump has already been allocated. */
2554#define FIXUP_ALT_JUMP() \
2555do { \
2556 if (fixup_alt_jump) \
2557 STORE_JUMP (jump, fixup_alt_jump, b); \
2558} while (0)
2559
2560
fa9a63c5
RM
2561/* Return, freeing storage we allocated. */
2562#define FREE_STACK_RETURN(value) \
b18215fc
RS
2563 do { \
2564 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2565 free (compile_stack.stack); \
2566 return value; \
2567 } while (0)
fa9a63c5
RM
2568
2569static reg_errcode_t
2570regex_compile (pattern, size, syntax, bufp)
66f0296e 2571 re_char *pattern;
4bb91c68 2572 size_t size;
fa9a63c5
RM
2573 reg_syntax_t syntax;
2574 struct re_pattern_buffer *bufp;
2575{
01618498
SM
2576 /* We fetch characters from PATTERN here. */
2577 register re_wchar_t c, c1;
5e69f11e 2578
fa9a63c5 2579 /* A random temporary spot in PATTERN. */
66f0296e 2580 re_char *p1;
fa9a63c5
RM
2581
2582 /* Points to the end of the buffer, where we should append. */
2583 register unsigned char *b;
5e69f11e 2584
fa9a63c5
RM
2585 /* Keeps track of unclosed groups. */
2586 compile_stack_type compile_stack;
2587
2588 /* Points to the current (ending) position in the pattern. */
22336245
RS
2589#ifdef AIX
2590 /* `const' makes AIX compiler fail. */
66f0296e 2591 unsigned char *p = pattern;
22336245 2592#else
66f0296e 2593 re_char *p = pattern;
22336245 2594#endif
66f0296e 2595 re_char *pend = pattern + size;
5e69f11e 2596
fa9a63c5 2597 /* How to translate the characters in the pattern. */
6676cb1c 2598 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2599
2600 /* Address of the count-byte of the most recently inserted `exactn'
2601 command. This makes it possible to tell if a new exact-match
2602 character can be added to that command or if the character requires
2603 a new `exactn' command. */
2604 unsigned char *pending_exact = 0;
2605
2606 /* Address of start of the most recently finished expression.
2607 This tells, e.g., postfix * where to find the start of its
2608 operand. Reset at the beginning of groups and alternatives. */
2609 unsigned char *laststart = 0;
2610
2611 /* Address of beginning of regexp, or inside of last group. */
2612 unsigned char *begalt;
2613
2614 /* Place in the uncompiled pattern (i.e., the {) to
2615 which to go back if the interval is invalid. */
66f0296e 2616 re_char *beg_interval;
5e69f11e 2617
fa9a63c5 2618 /* Address of the place where a forward jump should go to the end of
7814e705 2619 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2620 last -- ends with a forward jump of this sort. */
2621 unsigned char *fixup_alt_jump = 0;
2622
2623 /* Counts open-groups as they are encountered. Remembered for the
2624 matching close-group on the compile stack, so the same register
2625 number is put in the stop_memory as the start_memory. */
2626 regnum_t regnum = 0;
2627
b18215fc
RS
2628 /* Work area for range table of charset. */
2629 struct range_table_work_area range_table_work;
2630
2d1675e4
SM
2631 /* If the object matched can contain multibyte characters. */
2632 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2633
8f924df7 2634 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2635 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2636
f9b0fd99
RS
2637 /* Nonzero if we have pushed down into a subpattern. */
2638 int in_subpattern = 0;
2639
2640 /* These hold the values of p, pattern, and pend from the main
2641 pattern when we have pushed into a subpattern. */
2642 re_char *main_p;
2643 re_char *main_pattern;
2644 re_char *main_pend;
2645
fa9a63c5 2646#ifdef DEBUG
99633e97 2647 debug++;
fa9a63c5 2648 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2649 if (debug > 0)
fa9a63c5
RM
2650 {
2651 unsigned debug_count;
5e69f11e 2652
fa9a63c5 2653 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2654 putchar (pattern[debug_count]);
fa9a63c5
RM
2655 putchar ('\n');
2656 }
2657#endif /* DEBUG */
2658
2659 /* Initialize the compile stack. */
2660 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2661 if (compile_stack.stack == NULL)
2662 return REG_ESPACE;
2663
2664 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2665 compile_stack.avail = 0;
2666
b18215fc
RS
2667 range_table_work.table = 0;
2668 range_table_work.allocated = 0;
2669
fa9a63c5
RM
2670 /* Initialize the pattern buffer. */
2671 bufp->syntax = syntax;
2672 bufp->fastmap_accurate = 0;
2673 bufp->not_bol = bufp->not_eol = 0;
6224b623 2674 bufp->used_syntax = 0;
fa9a63c5
RM
2675
2676 /* Set `used' to zero, so that if we return an error, the pattern
2677 printer (for debugging) will think there's no pattern. We reset it
2678 at the end. */
2679 bufp->used = 0;
5e69f11e 2680
fa9a63c5 2681 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2682 bufp->re_nsub = 0;
fa9a63c5 2683
0b32bf0e 2684#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2685 /* Initialize the syntax table. */
2686 init_syntax_once ();
2687#endif
2688
2689 if (bufp->allocated == 0)
2690 {
2691 if (bufp->buffer)
2692 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2693 enough space. This loses if buffer's address is bogus, but
7814e705 2694 that is the user's responsibility. */
25fe55af
RS
2695 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2696 }
fa9a63c5 2697 else
7814e705 2698 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2699 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2700 }
fa9a63c5
RM
2701 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2702
2703 bufp->allocated = INIT_BUF_SIZE;
2704 }
2705
2706 begalt = b = bufp->buffer;
2707
2708 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2709 while (1)
fa9a63c5 2710 {
f9b0fd99
RS
2711 if (p == pend)
2712 {
2713 /* If this is the end of an included regexp,
2714 pop back to the main regexp and try again. */
2715 if (in_subpattern)
2716 {
2717 in_subpattern = 0;
2718 pattern = main_pattern;
2719 p = main_p;
2720 pend = main_pend;
2721 continue;
2722 }
2723 /* If this is the end of the main regexp, we are done. */
2724 break;
2725 }
2726
fa9a63c5
RM
2727 PATFETCH (c);
2728
2729 switch (c)
25fe55af 2730 {
f9b0fd99
RS
2731 case ' ':
2732 {
2733 re_char *p1 = p;
2734
2735 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2736 spaces normally. And don't try to do this recursively. */
2737 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2738 goto normal_char;
2739
2740 /* Peek past following spaces. */
2741 while (p1 != pend)
2742 {
2743 if (*p1 != ' ')
2744 break;
2745 p1++;
2746 }
2747 /* If the spaces are followed by a repetition op,
2748 treat them normally. */
c721eee5
RS
2749 if (p1 != pend
2750 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2751 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2752 goto normal_char;
2753
2754 /* Replace the spaces with the whitespace regexp. */
2755 in_subpattern = 1;
2756 main_p = p1;
2757 main_pend = pend;
2758 main_pattern = pattern;
2759 p = pattern = whitespace_regexp;
2760 pend = p + strlen (p);
2761 break;
7814e705 2762 }
f9b0fd99 2763
25fe55af
RS
2764 case '^':
2765 {
7814e705 2766 if ( /* If at start of pattern, it's an operator. */
25fe55af 2767 p == pattern + 1
7814e705 2768 /* If context independent, it's an operator. */
25fe55af 2769 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2770 /* Otherwise, depends on what's come before. */
25fe55af 2771 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2772 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2773 else
2774 goto normal_char;
2775 }
2776 break;
2777
2778
2779 case '$':
2780 {
2781 if ( /* If at end of pattern, it's an operator. */
2782 p == pend
7814e705 2783 /* If context independent, it's an operator. */
25fe55af
RS
2784 || syntax & RE_CONTEXT_INDEP_ANCHORS
2785 /* Otherwise, depends on what's next. */
2786 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2787 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2788 else
2789 goto normal_char;
2790 }
2791 break;
fa9a63c5
RM
2792
2793
2794 case '+':
25fe55af
RS
2795 case '?':
2796 if ((syntax & RE_BK_PLUS_QM)
2797 || (syntax & RE_LIMITED_OPS))
2798 goto normal_char;
2799 handle_plus:
2800 case '*':
2801 /* If there is no previous pattern... */
2802 if (!laststart)
2803 {
2804 if (syntax & RE_CONTEXT_INVALID_OPS)
2805 FREE_STACK_RETURN (REG_BADRPT);
2806 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2807 goto normal_char;
2808 }
2809
2810 {
7814e705 2811 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2812 boolean zero_times_ok = 0, many_times_ok = 0;
2813 boolean greedy = 1;
25fe55af
RS
2814
2815 /* If there is a sequence of repetition chars, collapse it
2816 down to just one (the right one). We can't combine
2817 interval operators with these because of, e.g., `a{2}*',
7814e705 2818 which should only match an even number of `a's. */
25fe55af
RS
2819
2820 for (;;)
2821 {
0b32bf0e 2822 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2823 && c == '?' && (zero_times_ok || many_times_ok))
2824 greedy = 0;
2825 else
2826 {
2827 zero_times_ok |= c != '+';
2828 many_times_ok |= c != '?';
2829 }
25fe55af
RS
2830
2831 if (p == pend)
2832 break;
ed0767d8
SM
2833 else if (*p == '*'
2834 || (!(syntax & RE_BK_PLUS_QM)
2835 && (*p == '+' || *p == '?')))
25fe55af 2836 ;
ed0767d8 2837 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2838 {
ed0767d8
SM
2839 if (p+1 == pend)
2840 FREE_STACK_RETURN (REG_EESCAPE);
2841 if (p[1] == '+' || p[1] == '?')
2842 PATFETCH (c); /* Gobble up the backslash. */
2843 else
2844 break;
25fe55af
RS
2845 }
2846 else
ed0767d8 2847 break;
25fe55af 2848 /* If we get here, we found another repeat character. */
ed0767d8
SM
2849 PATFETCH (c);
2850 }
25fe55af
RS
2851
2852 /* Star, etc. applied to an empty pattern is equivalent
2853 to an empty pattern. */
4e8a9132 2854 if (!laststart || laststart == b)
25fe55af
RS
2855 break;
2856
2857 /* Now we know whether or not zero matches is allowed
7814e705 2858 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2859 if (greedy)
2860 {
99633e97 2861 if (many_times_ok)
4e8a9132
SM
2862 {
2863 boolean simple = skip_one_char (laststart) == b;
2864 unsigned int startoffset = 0;
f6a3f532 2865 re_opcode_t ofj =
01618498 2866 /* Check if the loop can match the empty string. */
6df42991
SM
2867 (simple || !analyse_first (laststart, b, NULL, 0))
2868 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2869 assert (skip_one_char (laststart) <= b);
177c0ea7 2870
4e8a9132
SM
2871 if (!zero_times_ok && simple)
2872 { /* Since simple * loops can be made faster by using
2873 on_failure_keep_string_jump, we turn simple P+
2874 into PP* if P is simple. */
2875 unsigned char *p1, *p2;
2876 startoffset = b - laststart;
2877 GET_BUFFER_SPACE (startoffset);
2878 p1 = b; p2 = laststart;
2879 while (p2 < p1)
2880 *b++ = *p2++;
2881 zero_times_ok = 1;
99633e97 2882 }
4e8a9132
SM
2883
2884 GET_BUFFER_SPACE (6);
2885 if (!zero_times_ok)
2886 /* A + loop. */
f6a3f532 2887 STORE_JUMP (ofj, b, b + 6);
99633e97 2888 else
4e8a9132
SM
2889 /* Simple * loops can use on_failure_keep_string_jump
2890 depending on what follows. But since we don't know
2891 that yet, we leave the decision up to
2892 on_failure_jump_smart. */
f6a3f532 2893 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2894 laststart + startoffset, b + 6);
99633e97 2895 b += 3;
4e8a9132 2896 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2897 b += 3;
2898 }
2899 else
2900 {
4e8a9132
SM
2901 /* A simple ? pattern. */
2902 assert (zero_times_ok);
2903 GET_BUFFER_SPACE (3);
2904 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2905 b += 3;
2906 }
1c8c6d39
DL
2907 }
2908 else /* not greedy */
2909 { /* I wish the greedy and non-greedy cases could be merged. */
2910
0683b6fa 2911 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2912 if (many_times_ok)
2913 {
f6a3f532
SM
2914 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2915
6df42991
SM
2916 /* The non-greedy multiple match looks like
2917 a repeat..until: we only need a conditional jump
2918 at the end of the loop. */
f6a3f532
SM
2919 if (emptyp) BUF_PUSH (no_op);
2920 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2921 : on_failure_jump, b, laststart);
1c8c6d39
DL
2922 b += 3;
2923 if (zero_times_ok)
2924 {
2925 /* The repeat...until naturally matches one or more.
2926 To also match zero times, we need to first jump to
6df42991 2927 the end of the loop (its conditional jump). */
1c8c6d39
DL
2928 INSERT_JUMP (jump, laststart, b);
2929 b += 3;
2930 }
2931 }
2932 else
2933 {
2934 /* non-greedy a?? */
1c8c6d39
DL
2935 INSERT_JUMP (jump, laststart, b + 3);
2936 b += 3;
2937 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2938 b += 3;
2939 }
2940 }
2941 }
4e8a9132 2942 pending_exact = 0;
fa9a63c5
RM
2943 break;
2944
2945
2946 case '.':
25fe55af
RS
2947 laststart = b;
2948 BUF_PUSH (anychar);
2949 break;
fa9a63c5
RM
2950
2951
25fe55af
RS
2952 case '[':
2953 {
b18215fc 2954 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2955
25fe55af 2956 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2957
25fe55af
RS
2958 /* Ensure that we have enough space to push a charset: the
2959 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2960 GET_BUFFER_SPACE (34);
2961
25fe55af 2962 laststart = b;
e318085a 2963
25fe55af 2964 /* We test `*p == '^' twice, instead of using an if
7814e705 2965 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2966 BUF_PUSH (*p == '^' ? charset_not : charset);
2967 if (*p == '^')
2968 p++;
e318085a 2969
25fe55af
RS
2970 /* Remember the first position in the bracket expression. */
2971 p1 = p;
e318085a 2972
7814e705 2973 /* Push the number of bytes in the bitmap. */
25fe55af 2974 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2975
25fe55af
RS
2976 /* Clear the whole map. */
2977 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2978
25fe55af
RS
2979 /* charset_not matches newline according to a syntax bit. */
2980 if ((re_opcode_t) b[-2] == charset_not
2981 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2982 SET_LIST_BIT ('\n');
fa9a63c5 2983
7814e705 2984 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2985 for (;;)
2986 {
b18215fc 2987 boolean escaped_char = false;
2d1675e4 2988 const unsigned char *p2 = p;
cf9c99bc 2989 re_wchar_t ch, c2;
e318085a 2990
25fe55af 2991 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2992
36595814
SM
2993 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2994 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2995 So the translation is done later in a loop. Example:
2996 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2997 PATFETCH (c);
e318085a 2998
25fe55af
RS
2999 /* \ might escape characters inside [...] and [^...]. */
3000 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3001 {
3002 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
3003
3004 PATFETCH (c);
b18215fc 3005 escaped_char = true;
25fe55af 3006 }
b18215fc
RS
3007 else
3008 {
7814e705 3009 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
3010 not (i.e., when the bracket expression is `[]' so
3011 far), the ']' character bit gets set way below. */
2d1675e4 3012 if (c == ']' && p2 != p1)
657fcfbd 3013 break;
25fe55af 3014 }
b18215fc 3015
25fe55af
RS
3016 /* See if we're at the beginning of a possible character
3017 class. */
b18215fc 3018
2d1675e4
SM
3019 if (!escaped_char &&
3020 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 3021 {
7814e705 3022 /* Leave room for the null. */
14473664 3023 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 3024 const unsigned char *class_beg;
b18215fc 3025
25fe55af
RS
3026 PATFETCH (c);
3027 c1 = 0;
ed0767d8 3028 class_beg = p;
b18215fc 3029
25fe55af
RS
3030 /* If pattern is `[[:'. */
3031 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 3032
25fe55af
RS
3033 for (;;)
3034 {
14473664
SM
3035 PATFETCH (c);
3036 if ((c == ':' && *p == ']') || p == pend)
3037 break;
3038 if (c1 < CHAR_CLASS_MAX_LENGTH)
3039 str[c1++] = c;
3040 else
3041 /* This is in any case an invalid class name. */
3042 str[0] = '\0';
25fe55af
RS
3043 }
3044 str[c1] = '\0';
b18215fc
RS
3045
3046 /* If it isn't a word bracketed by `[:' and `:]':
3047 undo the ending character, the letters, and
3048 leave the leading `:' and `[' (but set bits for
3049 them). */
25fe55af
RS
3050 if (c == ':' && *p == ']')
3051 {
14473664 3052 re_wctype_t cc;
8f924df7 3053 int limit;
14473664
SM
3054
3055 cc = re_wctype (str);
3056
3057 if (cc == 0)
fa9a63c5
RM
3058 FREE_STACK_RETURN (REG_ECTYPE);
3059
14473664
SM
3060 /* Throw away the ] at the end of the character
3061 class. */
3062 PATFETCH (c);
fa9a63c5 3063
14473664 3064 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 3065
cf9c99bc
KH
3066#ifndef emacs
3067 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
3068 if (re_iswctype (btowc (ch), cc))
3069 {
3070 c = TRANSLATE (ch);
ed00c2ac
KH
3071 if (c < (1 << BYTEWIDTH))
3072 SET_LIST_BIT (c);
8f924df7 3073 }
cf9c99bc
KH
3074#else /* emacs */
3075 /* Most character classes in a multibyte match
3076 just set a flag. Exceptions are is_blank,
3077 is_digit, is_cntrl, and is_xdigit, since
3078 they can only match ASCII characters. We
3079 don't need to handle them for multibyte.
3080 They are distinguished by a negative wctype. */
96cc36cc 3081
cf9c99bc 3082 for (ch = 0; ch < 256; ++ch)
25fe55af 3083 {
cf9c99bc
KH
3084 c = RE_CHAR_TO_MULTIBYTE (ch);
3085 if (! CHAR_BYTE8_P (c)
3086 && re_iswctype (c, cc))
8f924df7 3087 {
cf9c99bc
KH
3088 SET_LIST_BIT (ch);
3089 c1 = TRANSLATE (c);
3090 if (c1 == c)
3091 continue;
3092 if (ASCII_CHAR_P (c1))
3093 SET_LIST_BIT (c1);
3094 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3095 SET_LIST_BIT (c1);
8f924df7 3096 }
25fe55af 3097 }
cf9c99bc
KH
3098 SET_RANGE_TABLE_WORK_AREA_BIT
3099 (range_table_work, re_wctype_to_bit (cc));
3100#endif /* emacs */
6224b623
SM
3101 /* In most cases the matching rule for char classes
3102 only uses the syntax table for multibyte chars,
3103 so that the content of the syntax-table is not
3104 hardcoded in the range_table. SPACE and WORD are
3105 the two exceptions. */
3106 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3107 bufp->used_syntax = 1;
3108
b18215fc
RS
3109 /* Repeat the loop. */
3110 continue;
25fe55af
RS
3111 }
3112 else
3113 {
ed0767d8
SM
3114 /* Go back to right after the "[:". */
3115 p = class_beg;
25fe55af 3116 SET_LIST_BIT ('[');
b18215fc
RS
3117
3118 /* Because the `:' may start a range, we
3119 can't simply set bit and repeat the loop.
7814e705 3120 Instead, just set it to C and handle below. */
b18215fc 3121 c = ':';
25fe55af
RS
3122 }
3123 }
b18215fc
RS
3124
3125 if (p < pend && p[0] == '-' && p[1] != ']')
3126 {
3127
3128 /* Discard the `-'. */
3129 PATFETCH (c1);
3130
3131 /* Fetch the character which ends the range. */
3132 PATFETCH (c1);
cf9c99bc
KH
3133#ifdef emacs
3134 if (CHAR_BYTE8_P (c1)
3135 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3136 /* Treat the range from a multibyte character to
3137 raw-byte character as empty. */
3138 c = c1 + 1;
3139#endif /* emacs */
e318085a 3140 }
25fe55af 3141 else
b18215fc
RS
3142 /* Range from C to C. */
3143 c1 = c;
3144
cf9c99bc 3145 if (c > c1)
25fe55af 3146 {
cf9c99bc
KH
3147 if (syntax & RE_NO_EMPTY_RANGES)
3148 FREE_STACK_RETURN (REG_ERANGEX);
3149 /* Else, repeat the loop. */
bf216479 3150 }
6fdd04b0 3151 else
25fe55af 3152 {
cf9c99bc
KH
3153#ifndef emacs
3154 /* Set the range into bitmap */
8f924df7 3155 for (; c <= c1; c++)
b18215fc 3156 {
cf9c99bc
KH
3157 ch = TRANSLATE (c);
3158 if (ch < (1 << BYTEWIDTH))
3159 SET_LIST_BIT (ch);
3160 }
3161#else /* emacs */
3162 if (c < 128)
3163 {
3164 ch = MIN (127, c1);
3165 SETUP_ASCII_RANGE (range_table_work, c, ch);
3166 c = ch + 1;
3167 if (CHAR_BYTE8_P (c1))
3168 c = BYTE8_TO_CHAR (128);
3169 }
3170 if (c <= c1)
3171 {
3172 if (CHAR_BYTE8_P (c))
3173 {
3174 c = CHAR_TO_BYTE8 (c);
3175 c1 = CHAR_TO_BYTE8 (c1);
3176 for (; c <= c1; c++)
3177 SET_LIST_BIT (c);
3178 }
3179 else if (multibyte)
3180 {
3181 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3182 }
3183 else
3184 {
3185 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3186 }
e934739e 3187 }
cf9c99bc 3188#endif /* emacs */
25fe55af 3189 }
e318085a
RS
3190 }
3191
25fe55af 3192 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3193 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3194 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3195 b[-1]--;
3196 b += b[-1];
fa9a63c5 3197
96cc36cc
RS
3198 /* Build real range table from work area. */
3199 if (RANGE_TABLE_WORK_USED (range_table_work)
3200 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3201 {
3202 int i;
3203 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3204
b18215fc 3205 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3206 bytes for flags, two for COUNT, and three bytes for
3207 each character. */
3208 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3209
b18215fc
RS
3210 /* Indicate the existence of range table. */
3211 laststart[1] |= 0x80;
fa9a63c5 3212
96cc36cc
RS
3213 /* Store the character class flag bits into the range table.
3214 If not in emacs, these flag bits are always 0. */
3215 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3216 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3217
b18215fc
RS
3218 STORE_NUMBER_AND_INCR (b, used / 2);
3219 for (i = 0; i < used; i++)
3220 STORE_CHARACTER_AND_INCR
3221 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3222 }
25fe55af
RS
3223 }
3224 break;
fa9a63c5
RM
3225
3226
b18215fc 3227 case '(':
25fe55af
RS
3228 if (syntax & RE_NO_BK_PARENS)
3229 goto handle_open;
3230 else
3231 goto normal_char;
fa9a63c5
RM
3232
3233
25fe55af
RS
3234 case ')':
3235 if (syntax & RE_NO_BK_PARENS)
3236 goto handle_close;
3237 else
3238 goto normal_char;
e318085a
RS
3239
3240
25fe55af
RS
3241 case '\n':
3242 if (syntax & RE_NEWLINE_ALT)
3243 goto handle_alt;
3244 else
3245 goto normal_char;
e318085a
RS
3246
3247
b18215fc 3248 case '|':
25fe55af
RS
3249 if (syntax & RE_NO_BK_VBAR)
3250 goto handle_alt;
3251 else
3252 goto normal_char;
3253
3254
3255 case '{':
3256 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3257 goto handle_interval;
3258 else
3259 goto normal_char;
3260
3261
3262 case '\\':
3263 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3264
3265 /* Do not translate the character after the \, so that we can
3266 distinguish, e.g., \B from \b, even if we normally would
3267 translate, e.g., B to b. */
36595814 3268 PATFETCH (c);
25fe55af
RS
3269
3270 switch (c)
3271 {
3272 case '(':
3273 if (syntax & RE_NO_BK_PARENS)
3274 goto normal_backslash;
3275
3276 handle_open:
505bde11
SM
3277 {
3278 int shy = 0;
3279 if (p+1 < pend)
3280 {
3281 /* Look for a special (?...) construct */
ed0767d8 3282 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3283 {
ed0767d8 3284 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
3285 PATFETCH (c);
3286 switch (c)
3287 {
3288 case ':': shy = 1; break;
3289 default:
3290 /* Only (?:...) is supported right now. */
3291 FREE_STACK_RETURN (REG_BADPAT);
3292 }
3293 }
505bde11
SM
3294 }
3295
3296 if (!shy)
3297 {
3298 bufp->re_nsub++;
3299 regnum++;
3300 }
25fe55af 3301
99633e97
SM
3302 if (COMPILE_STACK_FULL)
3303 {
3304 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3305 compile_stack_elt_t);
3306 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3307
99633e97
SM
3308 compile_stack.size <<= 1;
3309 }
25fe55af 3310
99633e97 3311 /* These are the values to restore when we hit end of this
7814e705 3312 group. They are all relative offsets, so that if the
99633e97
SM
3313 whole pattern moves because of realloc, they will still
3314 be valid. */
3315 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3316 COMPILE_STACK_TOP.fixup_alt_jump
3317 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3318 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3319 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
3320
3321 /* Do not push a
3322 start_memory for groups beyond the last one we can
3323 represent in the compiled pattern. */
3324 if (regnum <= MAX_REGNUM && !shy)
3325 BUF_PUSH_2 (start_memory, regnum);
3326
3327 compile_stack.avail++;
3328
3329 fixup_alt_jump = 0;
3330 laststart = 0;
3331 begalt = b;
3332 /* If we've reached MAX_REGNUM groups, then this open
3333 won't actually generate any code, so we'll have to
3334 clear pending_exact explicitly. */
3335 pending_exact = 0;
3336 break;
505bde11 3337 }
25fe55af
RS
3338
3339 case ')':
3340 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3341
3342 if (COMPILE_STACK_EMPTY)
505bde11
SM
3343 {
3344 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3345 goto normal_backslash;
3346 else
3347 FREE_STACK_RETURN (REG_ERPAREN);
3348 }
25fe55af
RS
3349
3350 handle_close:
505bde11 3351 FIXUP_ALT_JUMP ();
25fe55af
RS
3352
3353 /* See similar code for backslashed left paren above. */
3354 if (COMPILE_STACK_EMPTY)
505bde11
SM
3355 {
3356 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3357 goto normal_char;
3358 else
3359 FREE_STACK_RETURN (REG_ERPAREN);
3360 }
25fe55af
RS
3361
3362 /* Since we just checked for an empty stack above, this
3363 ``can't happen''. */
3364 assert (compile_stack.avail != 0);
3365 {
3366 /* We don't just want to restore into `regnum', because
3367 later groups should continue to be numbered higher,
7814e705 3368 as in `(ab)c(de)' -- the second group is #2. */
25fe55af
RS
3369 regnum_t this_group_regnum;
3370
3371 compile_stack.avail--;
3372 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3373 fixup_alt_jump
3374 = COMPILE_STACK_TOP.fixup_alt_jump
3375 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3376 : 0;
3377 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3378 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3379 /* If we've reached MAX_REGNUM groups, then this close
3380 won't actually generate any code, so we'll have to
3381 clear pending_exact explicitly. */
3382 pending_exact = 0;
e318085a 3383
25fe55af 3384 /* We're at the end of the group, so now we know how many
7814e705 3385 groups were inside this one. */
505bde11
SM
3386 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
3387 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
3388 }
3389 break;
3390
3391
3392 case '|': /* `\|'. */
3393 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3394 goto normal_backslash;
3395 handle_alt:
3396 if (syntax & RE_LIMITED_OPS)
3397 goto normal_char;
3398
3399 /* Insert before the previous alternative a jump which
7814e705 3400 jumps to this alternative if the former fails. */
25fe55af
RS
3401 GET_BUFFER_SPACE (3);
3402 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3403 pending_exact = 0;
3404 b += 3;
3405
3406 /* The alternative before this one has a jump after it
3407 which gets executed if it gets matched. Adjust that
3408 jump so it will jump to this alternative's analogous
3409 jump (put in below, which in turn will jump to the next
3410 (if any) alternative's such jump, etc.). The last such
3411 jump jumps to the correct final destination. A picture:
3412 _____ _____
3413 | | | |
3414 | v | v
3415 a | b | c
3416
3417 If we are at `b', then fixup_alt_jump right now points to a
3418 three-byte space after `a'. We'll put in the jump, set
3419 fixup_alt_jump to right after `b', and leave behind three
3420 bytes which we'll fill in when we get to after `c'. */
3421
505bde11 3422 FIXUP_ALT_JUMP ();
25fe55af
RS
3423
3424 /* Mark and leave space for a jump after this alternative,
3425 to be filled in later either by next alternative or
3426 when we know we're at the end of a series of alternatives. */
3427 fixup_alt_jump = b;
3428 GET_BUFFER_SPACE (3);
3429 b += 3;
3430
3431 laststart = 0;
3432 begalt = b;
3433 break;
3434
3435
3436 case '{':
3437 /* If \{ is a literal. */
3438 if (!(syntax & RE_INTERVALS)
3439 /* If we're at `\{' and it's not the open-interval
3440 operator. */
4bb91c68 3441 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3442 goto normal_backslash;
3443
3444 handle_interval:
3445 {
3446 /* If got here, then the syntax allows intervals. */
3447
3448 /* At least (most) this many matches must be made. */
99633e97 3449 int lower_bound = 0, upper_bound = -1;
25fe55af 3450
ed0767d8 3451 beg_interval = p;
25fe55af 3452
25fe55af
RS
3453 GET_UNSIGNED_NUMBER (lower_bound);
3454
3455 if (c == ',')
ed0767d8 3456 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3457 else
3458 /* Interval such as `{1}' => match exactly once. */
3459 upper_bound = lower_bound;
3460
3461 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3462 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3463 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3464
3465 if (!(syntax & RE_NO_BK_BRACES))
3466 {
4bb91c68
SM
3467 if (c != '\\')
3468 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3469 if (p == pend)
3470 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3471 PATFETCH (c);
3472 }
3473
3474 if (c != '}')
4bb91c68 3475 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3476
3477 /* We just parsed a valid interval. */
3478
3479 /* If it's invalid to have no preceding re. */
3480 if (!laststart)
3481 {
3482 if (syntax & RE_CONTEXT_INVALID_OPS)
3483 FREE_STACK_RETURN (REG_BADRPT);
3484 else if (syntax & RE_CONTEXT_INDEP_OPS)
3485 laststart = b;
3486 else
3487 goto unfetch_interval;
3488 }
3489
6df42991
SM
3490 if (upper_bound == 0)
3491 /* If the upper bound is zero, just drop the sub pattern
3492 altogether. */
3493 b = laststart;
3494 else if (lower_bound == 1 && upper_bound == 1)
3495 /* Just match it once: nothing to do here. */
3496 ;
3497
3498 /* Otherwise, we have a nontrivial interval. When
3499 we're all done, the pattern will look like:
3500 set_number_at <jump count> <upper bound>
3501 set_number_at <succeed_n count> <lower bound>
3502 succeed_n <after jump addr> <succeed_n count>
3503 <body of loop>
3504 jump_n <succeed_n addr> <jump count>
3505 (The upper bound and `jump_n' are omitted if
3506 `upper_bound' is 1, though.) */
3507 else
3508 { /* If the upper bound is > 1, we need to insert
3509 more at the end of the loop. */
3510 unsigned int nbytes = (upper_bound < 0 ? 3
3511 : upper_bound > 1 ? 5 : 0);
3512 unsigned int startoffset = 0;
3513
3514 GET_BUFFER_SPACE (20); /* We might use less. */
3515
3516 if (lower_bound == 0)
3517 {
3518 /* A succeed_n that starts with 0 is really
3519 a simple on_failure_jump_loop. */
3520 INSERT_JUMP (on_failure_jump_loop, laststart,
3521 b + 3 + nbytes);
3522 b += 3;
3523 }
3524 else
3525 {
3526 /* Initialize lower bound of the `succeed_n', even
3527 though it will be set during matching by its
3528 attendant `set_number_at' (inserted next),
3529 because `re_compile_fastmap' needs to know.
3530 Jump to the `jump_n' we might insert below. */
3531 INSERT_JUMP2 (succeed_n, laststart,
3532 b + 5 + nbytes,
3533 lower_bound);
3534 b += 5;
3535
3536 /* Code to initialize the lower bound. Insert
7814e705 3537 before the `succeed_n'. The `5' is the last two
6df42991
SM
3538 bytes of this `set_number_at', plus 3 bytes of
3539 the following `succeed_n'. */
3540 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3541 b += 5;
3542 startoffset += 5;
3543 }
3544
3545 if (upper_bound < 0)
3546 {
3547 /* A negative upper bound stands for infinity,
3548 in which case it degenerates to a plain jump. */
3549 STORE_JUMP (jump, b, laststart + startoffset);
3550 b += 3;
3551 }
3552 else if (upper_bound > 1)
3553 { /* More than one repetition is allowed, so
3554 append a backward jump to the `succeed_n'
3555 that starts this interval.
3556
3557 When we've reached this during matching,
3558 we'll have matched the interval once, so
3559 jump back only `upper_bound - 1' times. */
3560 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3561 upper_bound - 1);
3562 b += 5;
3563
3564 /* The location we want to set is the second
3565 parameter of the `jump_n'; that is `b-2' as
3566 an absolute address. `laststart' will be
3567 the `set_number_at' we're about to insert;
3568 `laststart+3' the number to set, the source
3569 for the relative address. But we are
3570 inserting into the middle of the pattern --
3571 so everything is getting moved up by 5.
3572 Conclusion: (b - 2) - (laststart + 3) + 5,
3573 i.e., b - laststart.
3574
3575 We insert this at the beginning of the loop
3576 so that if we fail during matching, we'll
3577 reinitialize the bounds. */
3578 insert_op2 (set_number_at, laststart, b - laststart,
3579 upper_bound - 1, b);
3580 b += 5;
3581 }
3582 }
25fe55af
RS
3583 pending_exact = 0;
3584 beg_interval = NULL;
3585 }
3586 break;
3587
3588 unfetch_interval:
3589 /* If an invalid interval, match the characters as literals. */
3590 assert (beg_interval);
3591 p = beg_interval;
3592 beg_interval = NULL;
3593
3594 /* normal_char and normal_backslash need `c'. */
ed0767d8 3595 c = '{';
25fe55af
RS
3596
3597 if (!(syntax & RE_NO_BK_BRACES))
3598 {
ed0767d8
SM
3599 assert (p > pattern && p[-1] == '\\');
3600 goto normal_backslash;
25fe55af 3601 }
ed0767d8
SM
3602 else
3603 goto normal_char;
e318085a 3604
b18215fc 3605#ifdef emacs
25fe55af 3606 /* There is no way to specify the before_dot and after_dot
7814e705 3607 operators. rms says this is ok. --karl */
25fe55af
RS
3608 case '=':
3609 BUF_PUSH (at_dot);
3610 break;
3611
3612 case 's':
3613 laststart = b;
3614 PATFETCH (c);
3615 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3616 break;
3617
3618 case 'S':
3619 laststart = b;
3620 PATFETCH (c);
3621 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3622 break;
b18215fc
RS
3623
3624 case 'c':
3625 laststart = b;
36595814 3626 PATFETCH (c);
b18215fc
RS
3627 BUF_PUSH_2 (categoryspec, c);
3628 break;
e318085a 3629
b18215fc
RS
3630 case 'C':
3631 laststart = b;
36595814 3632 PATFETCH (c);
b18215fc
RS
3633 BUF_PUSH_2 (notcategoryspec, c);
3634 break;
3635#endif /* emacs */
e318085a 3636
e318085a 3637
25fe55af 3638 case 'w':
4bb91c68
SM
3639 if (syntax & RE_NO_GNU_OPS)
3640 goto normal_char;
25fe55af 3641 laststart = b;
1fb352e0 3642 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3643 break;
e318085a 3644
e318085a 3645
25fe55af 3646 case 'W':
4bb91c68
SM
3647 if (syntax & RE_NO_GNU_OPS)
3648 goto normal_char;
25fe55af 3649 laststart = b;
1fb352e0 3650 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3651 break;
e318085a
RS
3652
3653
25fe55af 3654 case '<':
4bb91c68
SM
3655 if (syntax & RE_NO_GNU_OPS)
3656 goto normal_char;
25fe55af
RS
3657 BUF_PUSH (wordbeg);
3658 break;
e318085a 3659
25fe55af 3660 case '>':
4bb91c68
SM
3661 if (syntax & RE_NO_GNU_OPS)
3662 goto normal_char;
25fe55af
RS
3663 BUF_PUSH (wordend);
3664 break;
e318085a 3665
669fa600
SM
3666 case '_':
3667 if (syntax & RE_NO_GNU_OPS)
3668 goto normal_char;
3669 laststart = b;
3670 PATFETCH (c);
3671 if (c == '<')
3672 BUF_PUSH (symbeg);
3673 else if (c == '>')
3674 BUF_PUSH (symend);
3675 else
3676 FREE_STACK_RETURN (REG_BADPAT);
3677 break;
3678
25fe55af 3679 case 'b':
4bb91c68
SM
3680 if (syntax & RE_NO_GNU_OPS)
3681 goto normal_char;
25fe55af
RS
3682 BUF_PUSH (wordbound);
3683 break;
e318085a 3684
25fe55af 3685 case 'B':
4bb91c68
SM
3686 if (syntax & RE_NO_GNU_OPS)
3687 goto normal_char;
25fe55af
RS
3688 BUF_PUSH (notwordbound);
3689 break;
fa9a63c5 3690
25fe55af 3691 case '`':
4bb91c68
SM
3692 if (syntax & RE_NO_GNU_OPS)
3693 goto normal_char;
25fe55af
RS
3694 BUF_PUSH (begbuf);
3695 break;
e318085a 3696
25fe55af 3697 case '\'':
4bb91c68
SM
3698 if (syntax & RE_NO_GNU_OPS)
3699 goto normal_char;
25fe55af
RS
3700 BUF_PUSH (endbuf);
3701 break;
e318085a 3702
25fe55af
RS
3703 case '1': case '2': case '3': case '4': case '5':
3704 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3705 {
3706 regnum_t reg;
e318085a 3707
0cdd06f8
SM
3708 if (syntax & RE_NO_BK_REFS)
3709 goto normal_backslash;
e318085a 3710
0cdd06f8 3711 reg = c - '0';
e318085a 3712
0cdd06f8
SM
3713 /* Can't back reference to a subexpression before its end. */
3714 if (reg > regnum || group_in_compile_stack (compile_stack, reg))
3715 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3716
0cdd06f8
SM
3717 laststart = b;
3718 BUF_PUSH_2 (duplicate, reg);
3719 }
25fe55af 3720 break;
e318085a 3721
e318085a 3722
25fe55af
RS
3723 case '+':
3724 case '?':
3725 if (syntax & RE_BK_PLUS_QM)
3726 goto handle_plus;
3727 else
3728 goto normal_backslash;
3729
3730 default:
3731 normal_backslash:
3732 /* You might think it would be useful for \ to mean
3733 not to translate; but if we don't translate it
4bb91c68 3734 it will never match anything. */
25fe55af
RS
3735 goto normal_char;
3736 }
3737 break;
fa9a63c5
RM
3738
3739
3740 default:
25fe55af 3741 /* Expects the character in `c'. */
fa9a63c5 3742 normal_char:
36595814 3743 /* If no exactn currently being built. */
25fe55af 3744 if (!pending_exact
fa9a63c5 3745
25fe55af
RS
3746 /* If last exactn not at current position. */
3747 || pending_exact + *pending_exact + 1 != b
5e69f11e 3748
25fe55af 3749 /* We have only one byte following the exactn for the count. */
2d1675e4 3750 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3751
7814e705 3752 /* If followed by a repetition operator. */
9d99031f 3753 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3754 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3755 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3756 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3757 || ((syntax & RE_INTERVALS)
25fe55af 3758 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3759 ? p != pend && *p == '{'
3760 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3761 {
3762 /* Start building a new exactn. */
5e69f11e 3763
25fe55af 3764 laststart = b;
fa9a63c5
RM
3765
3766 BUF_PUSH_2 (exactn, 0);
3767 pending_exact = b - 1;
25fe55af 3768 }
5e69f11e 3769
2d1675e4
SM
3770 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3771 {
e0277a47
KH
3772 int len;
3773
cf9c99bc 3774 if (multibyte)
6fdd04b0 3775 {
cf9c99bc 3776 c = TRANSLATE (c);
6fdd04b0
KH
3777 len = CHAR_STRING (c, b);
3778 b += len;
3779 }
e0277a47 3780 else
6fdd04b0 3781 {
cf9c99bc
KH
3782 c1 = RE_CHAR_TO_MULTIBYTE (c);
3783 if (! CHAR_BYTE8_P (c1))
3784 {
3785 re_wchar_t c2 = TRANSLATE (c1);
3786
3787 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3788 c = c1;
3789 }
6fdd04b0
KH
3790 *b++ = c;
3791 len = 1;
3792 }
2d1675e4
SM
3793 (*pending_exact) += len;
3794 }
3795
fa9a63c5 3796 break;
25fe55af 3797 } /* switch (c) */
fa9a63c5
RM
3798 } /* while p != pend */
3799
5e69f11e 3800
fa9a63c5 3801 /* Through the pattern now. */
5e69f11e 3802
505bde11 3803 FIXUP_ALT_JUMP ();
fa9a63c5 3804
5e69f11e 3805 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3806 FREE_STACK_RETURN (REG_EPAREN);
3807
3808 /* If we don't want backtracking, force success
3809 the first time we reach the end of the compiled pattern. */
3810 if (syntax & RE_NO_POSIX_BACKTRACKING)
3811 BUF_PUSH (succeed);
3812
fa9a63c5
RM
3813 /* We have succeeded; set the length of the buffer. */
3814 bufp->used = b - bufp->buffer;
3815
3816#ifdef DEBUG
99633e97 3817 if (debug > 0)
fa9a63c5 3818 {
505bde11 3819 re_compile_fastmap (bufp);
fa9a63c5
RM
3820 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3821 print_compiled_pattern (bufp);
3822 }
99633e97 3823 debug--;
fa9a63c5
RM
3824#endif /* DEBUG */
3825
3826#ifndef MATCH_MAY_ALLOCATE
3827 /* Initialize the failure stack to the largest possible stack. This
3828 isn't necessary unless we're trying to avoid calling alloca in
3829 the search and match routines. */
3830 {
3831 int num_regs = bufp->re_nsub + 1;
3832
320a2a73 3833 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3834 {
a26f4ccd 3835 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3836
fa9a63c5
RM
3837 if (! fail_stack.stack)
3838 fail_stack.stack
5e69f11e 3839 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3840 * sizeof (fail_stack_elt_t));
3841 else
3842 fail_stack.stack
3843 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3844 (fail_stack.size
3845 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3846 }
3847
3848 regex_grow_registers (num_regs);
3849 }
3850#endif /* not MATCH_MAY_ALLOCATE */
3851
839966f3 3852 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3853} /* regex_compile */
3854\f
3855/* Subroutines for `regex_compile'. */
3856
7814e705 3857/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3858
3859static void
3860store_op1 (op, loc, arg)
3861 re_opcode_t op;
3862 unsigned char *loc;
3863 int arg;
3864{
3865 *loc = (unsigned char) op;
3866 STORE_NUMBER (loc + 1, arg);
3867}
3868
3869
3870/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3871
3872static void
3873store_op2 (op, loc, arg1, arg2)
3874 re_opcode_t op;
3875 unsigned char *loc;
3876 int arg1, arg2;
3877{
3878 *loc = (unsigned char) op;
3879 STORE_NUMBER (loc + 1, arg1);
3880 STORE_NUMBER (loc + 3, arg2);
3881}
3882
3883
3884/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3885 for OP followed by two-byte integer parameter ARG. */
3886
3887static void
3888insert_op1 (op, loc, arg, end)
3889 re_opcode_t op;
3890 unsigned char *loc;
3891 int arg;
5e69f11e 3892 unsigned char *end;
fa9a63c5
RM
3893{
3894 register unsigned char *pfrom = end;
3895 register unsigned char *pto = end + 3;
3896
3897 while (pfrom != loc)
3898 *--pto = *--pfrom;
5e69f11e 3899
fa9a63c5
RM
3900 store_op1 (op, loc, arg);
3901}
3902
3903
3904/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3905
3906static void
3907insert_op2 (op, loc, arg1, arg2, end)
3908 re_opcode_t op;
3909 unsigned char *loc;
3910 int arg1, arg2;
5e69f11e 3911 unsigned char *end;
fa9a63c5
RM
3912{
3913 register unsigned char *pfrom = end;
3914 register unsigned char *pto = end + 5;
3915
3916 while (pfrom != loc)
3917 *--pto = *--pfrom;
5e69f11e 3918
fa9a63c5
RM
3919 store_op2 (op, loc, arg1, arg2);
3920}
3921
3922
3923/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3924 after an alternative or a begin-subexpression. We assume there is at
3925 least one character before the ^. */
3926
3927static boolean
3928at_begline_loc_p (pattern, p, syntax)
01618498 3929 re_char *pattern, *p;
fa9a63c5
RM
3930 reg_syntax_t syntax;
3931{
01618498 3932 re_char *prev = p - 2;
fa9a63c5 3933 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3934
fa9a63c5
RM
3935 return
3936 /* After a subexpression? */
3937 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3938 /* After an alternative? */
d2af47df
SM
3939 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3940 /* After a shy subexpression? */
3941 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3942 && prev[-1] == '?' && prev[-2] == '('
3943 && (syntax & RE_NO_BK_PARENS
3944 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3945}
3946
3947
3948/* The dual of at_begline_loc_p. This one is for $. We assume there is
3949 at least one character after the $, i.e., `P < PEND'. */
3950
3951static boolean
3952at_endline_loc_p (p, pend, syntax)
01618498 3953 re_char *p, *pend;
99633e97 3954 reg_syntax_t syntax;
fa9a63c5 3955{
01618498 3956 re_char *next = p;
fa9a63c5 3957 boolean next_backslash = *next == '\\';
01618498 3958 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3959
fa9a63c5
RM
3960 return
3961 /* Before a subexpression? */
3962 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3963 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3964 /* Before an alternative? */
3965 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3966 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3967}
3968
3969
5e69f11e 3970/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3971 false if it's not. */
3972
3973static boolean
3974group_in_compile_stack (compile_stack, regnum)
3975 compile_stack_type compile_stack;
3976 regnum_t regnum;
3977{
3978 int this_element;
3979
5e69f11e
RM
3980 for (this_element = compile_stack.avail - 1;
3981 this_element >= 0;
fa9a63c5
RM
3982 this_element--)
3983 if (compile_stack.stack[this_element].regnum == regnum)
3984 return true;
3985
3986 return false;
3987}
fa9a63c5 3988\f
f6a3f532
SM
3989/* analyse_first.
3990 If fastmap is non-NULL, go through the pattern and fill fastmap
3991 with all the possible leading chars. If fastmap is NULL, don't
3992 bother filling it up (obviously) and only return whether the
3993 pattern could potentially match the empty string.
3994
3995 Return 1 if p..pend might match the empty string.
3996 Return 0 if p..pend matches at least one char.
01618498 3997 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3998
3999static int
4000analyse_first (p, pend, fastmap, multibyte)
01618498 4001 re_char *p, *pend;
f6a3f532
SM
4002 char *fastmap;
4003 const int multibyte;
fa9a63c5 4004{
505bde11 4005 int j, k;
1fb352e0 4006 boolean not;
fa9a63c5 4007
b18215fc 4008 /* If all elements for base leading-codes in fastmap is set, this
7814e705 4009 flag is set true. */
b18215fc
RS
4010 boolean match_any_multibyte_characters = false;
4011
f6a3f532 4012 assert (p);
5e69f11e 4013
505bde11
SM
4014 /* The loop below works as follows:
4015 - It has a working-list kept in the PATTERN_STACK and which basically
4016 starts by only containing a pointer to the first operation.
4017 - If the opcode we're looking at is a match against some set of
4018 chars, then we add those chars to the fastmap and go on to the
4019 next work element from the worklist (done via `break').
4020 - If the opcode is a control operator on the other hand, we either
4021 ignore it (if it's meaningless at this point, such as `start_memory')
4022 or execute it (if it's a jump). If the jump has several destinations
4023 (i.e. `on_failure_jump'), then we push the other destination onto the
4024 worklist.
4025 We guarantee termination by ignoring backward jumps (more or less),
4026 so that `p' is monotonically increasing. More to the point, we
4027 never set `p' (or push) anything `<= p1'. */
4028
01618498 4029 while (p < pend)
fa9a63c5 4030 {
505bde11
SM
4031 /* `p1' is used as a marker of how far back a `on_failure_jump'
4032 can go without being ignored. It is normally equal to `p'
4033 (which prevents any backward `on_failure_jump') except right
4034 after a plain `jump', to allow patterns such as:
4035 0: jump 10
4036 3..9: <body>
4037 10: on_failure_jump 3
4038 as used for the *? operator. */
01618498 4039 re_char *p1 = p;
5e69f11e 4040
fa9a63c5
RM
4041 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4042 {
f6a3f532 4043 case succeed:
01618498 4044 return 1;
f6a3f532 4045 continue;
fa9a63c5 4046
fa9a63c5 4047 case duplicate:
505bde11
SM
4048 /* If the first character has to match a backreference, that means
4049 that the group was empty (since it already matched). Since this
4050 is the only case that interests us here, we can assume that the
4051 backreference must match the empty string. */
4052 p++;
4053 continue;
fa9a63c5
RM
4054
4055
4056 /* Following are the cases which match a character. These end
7814e705 4057 with `break'. */
fa9a63c5
RM
4058
4059 case exactn:
e0277a47 4060 if (fastmap)
cf9c99bc
KH
4061 {
4062 /* If multibyte is nonzero, the first byte of each
4063 character is an ASCII or a leading code. Otherwise,
4064 each byte is a character. Thus, this works in both
4065 cases. */
4066 fastmap[p[1]] = 1;
4067 if (! multibyte)
4068 {
4069 /* For the case of matching this unibyte regex
4070 against multibyte, we must set a leading code of
4071 the corresponding multibyte character. */
4072 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4073
4074 if (! CHAR_BYTE8_P (c))
4075 fastmap[CHAR_LEADING_CODE (c)] = 1;
4076 }
4077 }
fa9a63c5
RM
4078 break;
4079
4080
1fb352e0
SM
4081 case anychar:
4082 /* We could put all the chars except for \n (and maybe \0)
4083 but we don't bother since it is generally not worth it. */
f6a3f532 4084 if (!fastmap) break;
01618498 4085 return -1;
fa9a63c5
RM
4086
4087
b18215fc 4088 case charset_not:
1fb352e0 4089 if (!fastmap) break;
bf216479
KH
4090 {
4091 /* Chars beyond end of bitmap are possible matches. */
bf216479 4092 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 4093 j < (1 << BYTEWIDTH); j++)
bf216479
KH
4094 fastmap[j] = 1;
4095 }
4096
1fb352e0
SM
4097 /* Fallthrough */
4098 case charset:
4099 if (!fastmap) break;
4100 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4101 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4102 j >= 0; j--)
1fb352e0 4103 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4104 fastmap[j] = 1;
b18215fc 4105
1fb352e0 4106 if ((not && multibyte)
bf216479 4107 /* Any leading code can possibly start a character
1fb352e0
SM
4108 which doesn't match the specified set of characters. */
4109 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4110 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4111 /* If we can match a character class, we can match
6fdd04b0 4112 any multibyte characters. */
b18215fc 4113 {
b18215fc
RS
4114 if (match_any_multibyte_characters == false)
4115 {
6fdd04b0
KH
4116 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
4117 fastmap[j] = 1;
b18215fc
RS
4118 match_any_multibyte_characters = true;
4119 }
4120 }
b18215fc 4121
1fb352e0
SM
4122 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4123 && match_any_multibyte_characters == false)
4124 {
bf216479 4125 /* Set fastmap[I] to 1 where I is a leading code of each
9117d724 4126 multibyte characer in the range table. */
1fb352e0 4127 int c, count;
bf216479 4128 unsigned char lc1, lc2;
b18215fc 4129
1fb352e0 4130 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4131 bits for a character class. */
1fb352e0 4132 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4133
1fb352e0
SM
4134 /* Extract the number of ranges in range table into COUNT. */
4135 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4136 for (; count > 0; count--, p += 3)
1fb352e0 4137 {
9117d724
KH
4138 /* Extract the start and end of each range. */
4139 EXTRACT_CHARACTER (c, p);
bf216479 4140 lc1 = CHAR_LEADING_CODE (c);
9117d724 4141 p += 3;
1fb352e0 4142 EXTRACT_CHARACTER (c, p);
bf216479
KH
4143 lc2 = CHAR_LEADING_CODE (c);
4144 for (j = lc1; j <= lc2; j++)
9117d724 4145 fastmap[j] = 1;
1fb352e0
SM
4146 }
4147 }
b18215fc
RS
4148 break;
4149
1fb352e0
SM
4150 case syntaxspec:
4151 case notsyntaxspec:
4152 if (!fastmap) break;
4153#ifndef emacs
4154 not = (re_opcode_t)p[-1] == notsyntaxspec;
4155 k = *p++;
4156 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4157 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4158 fastmap[j] = 1;
b18215fc 4159 break;
1fb352e0 4160#else /* emacs */
b18215fc
RS
4161 /* This match depends on text properties. These end with
4162 aborting optimizations. */
01618498 4163 return -1;
b18215fc
RS
4164
4165 case categoryspec:
b18215fc 4166 case notcategoryspec:
1fb352e0
SM
4167 if (!fastmap) break;
4168 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4169 k = *p++;
bf216479 4170 for (j = (multibyte ? 127 : (1 << BYTEWIDTH)); j >= 0; j--)
1fb352e0 4171 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4172 fastmap[j] = 1;
4173
1fb352e0 4174 if (multibyte)
6fdd04b0
KH
4175 {
4176 /* Any character set can possibly contain a character
4177 whose category is K (or not). */
4178 if (match_any_multibyte_characters == false)
4179 {
4180 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
4181 fastmap[j] = 1;
4182 match_any_multibyte_characters = true;
4183 }
4184 }
b18215fc
RS
4185 break;
4186
fa9a63c5 4187 /* All cases after this match the empty string. These end with
25fe55af 4188 `continue'. */
fa9a63c5 4189
fa9a63c5
RM
4190 case before_dot:
4191 case at_dot:
4192 case after_dot:
1fb352e0 4193#endif /* !emacs */
25fe55af
RS
4194 case no_op:
4195 case begline:
4196 case endline:
fa9a63c5
RM
4197 case begbuf:
4198 case endbuf:
4199 case wordbound:
4200 case notwordbound:
4201 case wordbeg:
4202 case wordend:
669fa600
SM
4203 case symbeg:
4204 case symend:
25fe55af 4205 continue;
fa9a63c5
RM
4206
4207
fa9a63c5 4208 case jump:
25fe55af 4209 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4210 if (j < 0)
4211 /* Backward jumps can only go back to code that we've already
4212 visited. `re_compile' should make sure this is true. */
4213 break;
25fe55af 4214 p += j;
505bde11
SM
4215 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4216 {
4217 case on_failure_jump:
4218 case on_failure_keep_string_jump:
505bde11 4219 case on_failure_jump_loop:
0683b6fa 4220 case on_failure_jump_nastyloop:
505bde11
SM
4221 case on_failure_jump_smart:
4222 p++;
4223 break;
4224 default:
4225 continue;
4226 };
4227 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4228 to jump back to "just after here". */
4229 /* Fallthrough */
fa9a63c5 4230
25fe55af
RS
4231 case on_failure_jump:
4232 case on_failure_keep_string_jump:
0683b6fa 4233 case on_failure_jump_nastyloop:
505bde11
SM
4234 case on_failure_jump_loop:
4235 case on_failure_jump_smart:
25fe55af 4236 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4237 if (p + j <= p1)
ed0767d8 4238 ; /* Backward jump to be ignored. */
01618498
SM
4239 else
4240 { /* We have to look down both arms.
4241 We first go down the "straight" path so as to minimize
4242 stack usage when going through alternatives. */
4243 int r = analyse_first (p, pend, fastmap, multibyte);
4244 if (r) return r;
4245 p += j;
4246 }
25fe55af 4247 continue;
fa9a63c5
RM
4248
4249
ed0767d8
SM
4250 case jump_n:
4251 /* This code simply does not properly handle forward jump_n. */
4252 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4253 p += 4;
4254 /* jump_n can either jump or fall through. The (backward) jump
4255 case has already been handled, so we only need to look at the
4256 fallthrough case. */
4257 continue;
177c0ea7 4258
fa9a63c5 4259 case succeed_n:
ed0767d8
SM
4260 /* If N == 0, it should be an on_failure_jump_loop instead. */
4261 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4262 p += 4;
4263 /* We only care about one iteration of the loop, so we don't
4264 need to consider the case where this behaves like an
4265 on_failure_jump. */
25fe55af 4266 continue;
fa9a63c5
RM
4267
4268
4269 case set_number_at:
25fe55af
RS
4270 p += 4;
4271 continue;
fa9a63c5
RM
4272
4273
4274 case start_memory:
25fe55af 4275 case stop_memory:
505bde11 4276 p += 1;
fa9a63c5
RM
4277 continue;
4278
4279
4280 default:
25fe55af
RS
4281 abort (); /* We have listed all the cases. */
4282 } /* switch *p++ */
fa9a63c5
RM
4283
4284 /* Getting here means we have found the possible starting
25fe55af 4285 characters for one path of the pattern -- and that the empty
7814e705 4286 string does not match. We need not follow this path further. */
01618498 4287 return 0;
fa9a63c5
RM
4288 } /* while p */
4289
01618498
SM
4290 /* We reached the end without matching anything. */
4291 return 1;
4292
f6a3f532
SM
4293} /* analyse_first */
4294\f
4295/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4296 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4297 characters can start a string that matches the pattern. This fastmap
4298 is used by re_search to skip quickly over impossible starting points.
4299
4300 Character codes above (1 << BYTEWIDTH) are not represented in the
4301 fastmap, but the leading codes are represented. Thus, the fastmap
4302 indicates which character sets could start a match.
4303
4304 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4305 area as BUFP->fastmap.
4306
4307 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4308 the pattern buffer.
4309
4310 Returns 0 if we succeed, -2 if an internal error. */
4311
4312int
4313re_compile_fastmap (bufp)
4314 struct re_pattern_buffer *bufp;
4315{
4316 char *fastmap = bufp->fastmap;
4317 int analysis;
4318
4319 assert (fastmap && bufp->buffer);
4320
7814e705 4321 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4322 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4323
4324 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4325 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4326 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4327 return 0;
4328} /* re_compile_fastmap */
4329\f
4330/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4331 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4332 this memory for recording register information. STARTS and ENDS
4333 must be allocated using the malloc library routine, and must each
4334 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4335
4336 If NUM_REGS == 0, then subsequent matches should allocate their own
4337 register data.
4338
4339 Unless this function is called, the first search or match using
4340 PATTERN_BUFFER will allocate its own register data, without
4341 freeing the old data. */
4342
4343void
4344re_set_registers (bufp, regs, num_regs, starts, ends)
4345 struct re_pattern_buffer *bufp;
4346 struct re_registers *regs;
4347 unsigned num_regs;
4348 regoff_t *starts, *ends;
4349{
4350 if (num_regs)
4351 {
4352 bufp->regs_allocated = REGS_REALLOCATE;
4353 regs->num_regs = num_regs;
4354 regs->start = starts;
4355 regs->end = ends;
4356 }
4357 else
4358 {
4359 bufp->regs_allocated = REGS_UNALLOCATED;
4360 regs->num_regs = 0;
4361 regs->start = regs->end = (regoff_t *) 0;
4362 }
4363}
c0f9ea08 4364WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4365\f
7814e705 4366/* Searching routines. */
fa9a63c5
RM
4367
4368/* Like re_search_2, below, but only one string is specified, and
4369 doesn't let you say where to stop matching. */
4370
4371int
4372re_search (bufp, string, size, startpos, range, regs)
4373 struct re_pattern_buffer *bufp;
4374 const char *string;
4375 int size, startpos, range;
4376 struct re_registers *regs;
4377{
5e69f11e 4378 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4379 regs, size);
4380}
c0f9ea08 4381WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4382
70806df6
KH
/* The three macros below address the virtual concatenation of two
   strings.  They expand to expressions that use the variables
   `string1', `size1', `string2' and (for STOP_ADDR_VSTRING) `size2',
   which must be in scope where the macros are used.  A position below
   `size1' lies in string1, any other position lies in string2.  Each
   macro evaluates its argument more than once.  */

/* Head address of virtual concatenation of string.  */
#define HEAD_ADDR_VSTRING(P)		\
  (((P) >= size1 ? string2 : string1))

/* End address of virtual concatenation of string.  */
#define STOP_ADDR_VSTRING(P)				\
  (((P) >= size1 ? string2 + size2 : string1 + size1))

/* Address of POS in the concatenation of virtual string.
   The offset into string2 is computed first, so that we never form
   the intermediate pointer `string2 - size1', which would point
   before the start of string2.  */
#define POS_ADDR_VSTRING(POS)					\
  (((POS) >= size1 ? string2 + ((POS) - size1) : string1 + (POS)))
fa9a63c5
RM
4394
4395/* Using the compiled pattern in BUFP->buffer, first tries to match the
4396 virtual concatenation of STRING1 and STRING2, starting first at index
4397 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4398
fa9a63c5 4399 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4400
fa9a63c5
RM
4401 RANGE is how far to scan while trying to match. RANGE = 0 means try
4402 only at STARTPOS; in general, the last start tried is STARTPOS +
4403 RANGE.
5e69f11e 4404
fa9a63c5
RM
4405 In REGS, return the indices of the virtual concatenation of STRING1
4406 and STRING2 that matched the entire BUFP->buffer and its contained
4407 subexpressions.
5e69f11e 4408
fa9a63c5
RM
4409 Do not consider matching one past the index STOP in the virtual
4410 concatenation of STRING1 and STRING2.
4411
4412 We return either the position in the strings at which the match was
4413 found, -1 if no match, or -2 if error (such as failure
4414 stack overflow). */
4415
4416int
66f0296e 4417re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4418 struct re_pattern_buffer *bufp;
66f0296e 4419 const char *str1, *str2;
fa9a63c5
RM
4420 int size1, size2;
4421 int startpos;
4422 int range;
4423 struct re_registers *regs;
4424 int stop;
4425{
4426 int val;
66f0296e
SM
4427 re_char *string1 = (re_char*) str1;
4428 re_char *string2 = (re_char*) str2;
fa9a63c5 4429 register char *fastmap = bufp->fastmap;
6676cb1c 4430 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4431 int total_size = size1 + size2;
4432 int endpos = startpos + range;
c0f9ea08 4433 boolean anchored_start;
cf9c99bc
KH
4434 /* Nonzero if we are searching multibyte string. */
4435 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4436
fa9a63c5
RM
4437 /* Check for out-of-range STARTPOS. */
4438 if (startpos < 0 || startpos > total_size)
4439 return -1;
5e69f11e 4440
fa9a63c5 4441 /* Fix up RANGE if it might eventually take us outside
34597fa9 4442 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4443 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4444 if (endpos < 0)
4445 range = 0 - startpos;
fa9a63c5
RM
4446 else if (endpos > total_size)
4447 range = total_size - startpos;
4448
4449 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4450 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4451 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4452 {
4453 if (startpos > 0)
4454 return -1;
4455 else
7b140fd7 4456 range = 0;
fa9a63c5
RM
4457 }
4458
ae4788a8
RS
4459#ifdef emacs
4460 /* In a forward search for something that starts with \=.
4461 don't keep searching past point. */
4462 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4463 {
7b140fd7
RS
4464 range = PT_BYTE - BEGV_BYTE - startpos;
4465 if (range < 0)
ae4788a8
RS
4466 return -1;
4467 }
4468#endif /* emacs */
4469
fa9a63c5
RM
4470 /* Update the fastmap now if not correct already. */
4471 if (fastmap && !bufp->fastmap_accurate)
01618498 4472 re_compile_fastmap (bufp);
5e69f11e 4473
c8499ba5 4474 /* See whether the pattern is anchored. */
c0f9ea08 4475 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4476
b18215fc 4477#ifdef emacs
cc9b4df2
KH
4478 gl_state.object = re_match_object;
4479 {
99633e97 4480 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4481
4482 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4483 }
b18215fc
RS
4484#endif
4485
fa9a63c5
RM
4486 /* Loop through the string, looking for a place to start matching. */
4487 for (;;)
5e69f11e 4488 {
c8499ba5
RS
4489 /* If the pattern is anchored,
4490 skip quickly past places we cannot match.
4491 We don't bother to treat startpos == 0 specially
4492 because that case doesn't repeat. */
4493 if (anchored_start && startpos > 0)
4494 {
c0f9ea08
SM
4495 if (! ((startpos <= size1 ? string1[startpos - 1]
4496 : string2[startpos - size1 - 1])
4497 == '\n'))
c8499ba5
RS
4498 goto advance;
4499 }
4500
fa9a63c5 4501 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4502 cannot be the start of a match. If the pattern can match the
4503 null string, however, we don't need to skip characters; we want
7814e705 4504 the first null string. */
fa9a63c5
RM
4505 if (fastmap && startpos < total_size && !bufp->can_be_null)
4506 {
66f0296e 4507 register re_char *d;
01618498 4508 register re_wchar_t buf_ch;
e934739e
RS
4509
4510 d = POS_ADDR_VSTRING (startpos);
4511
7814e705 4512 if (range > 0) /* Searching forwards. */
fa9a63c5 4513 {
fa9a63c5
RM
4514 register int lim = 0;
4515 int irange = range;
4516
25fe55af
RS
4517 if (startpos < size1 && startpos + range >= size1)
4518 lim = range - (size1 - startpos);
fa9a63c5 4519
25fe55af
RS
4520 /* Written out as an if-else to avoid testing `translate'
4521 inside the loop. */
28ae27ae
AS
4522 if (RE_TRANSLATE_P (translate))
4523 {
e934739e
RS
4524 if (multibyte)
4525 while (range > lim)
4526 {
4527 int buf_charlen;
4528
4529 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4530 buf_charlen);
e934739e 4531 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4532 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4533 break;
4534
4535 range -= buf_charlen;
4536 d += buf_charlen;
4537 }
4538 else
bf216479 4539 while (range > lim)
33c46939 4540 {
cf9c99bc
KH
4541 register re_wchar_t ch, translated;
4542
bf216479 4543 buf_ch = *d;
cf9c99bc
KH
4544 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4545 translated = RE_TRANSLATE (translate, ch);
4546 if (translated != ch
4547 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4548 buf_ch = ch;
6fdd04b0 4549 if (fastmap[buf_ch])
bf216479 4550 break;
33c46939
RS
4551 d++;
4552 range--;
4553 }
e934739e 4554 }
fa9a63c5 4555 else
6fdd04b0
KH
4556 {
4557 if (multibyte)
4558 while (range > lim)
4559 {
4560 int buf_charlen;
fa9a63c5 4561
6fdd04b0
KH
4562 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4563 buf_charlen);
4564 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4565 break;
4566 range -= buf_charlen;
4567 d += buf_charlen;
4568 }
e934739e 4569 else
6fdd04b0 4570 while (range > lim && !fastmap[*d])
33c46939
RS
4571 {
4572 d++;
4573 range--;
4574 }
e934739e 4575 }
fa9a63c5
RM
4576 startpos += irange - range;
4577 }
7814e705 4578 else /* Searching backwards. */
fa9a63c5 4579 {
2d1675e4
SM
4580 int room = (startpos >= size1
4581 ? size2 + size1 - startpos
4582 : size1 - startpos);
ba5e343c
KH
4583 if (multibyte)
4584 {
6fdd04b0 4585 buf_ch = STRING_CHAR (d, room);
ba5e343c
KH
4586 buf_ch = TRANSLATE (buf_ch);
4587 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4588 goto advance;
4589 }
4590 else
4591 {
cf9c99bc
KH
4592 register re_wchar_t ch, translated;
4593
4594 buf_ch = *d;
4595 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4596 translated = TRANSLATE (ch);
4597 if (translated != ch
4598 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4599 buf_ch = ch;
4600 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4601 goto advance;
4602 }
fa9a63c5
RM
4603 }
4604 }
4605
4606 /* If can't match the null string, and that's all we have left, fail. */
4607 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4608 && !bufp->can_be_null)
fa9a63c5
RM
4609 return -1;
4610
4611 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4612 startpos, regs, stop);
4613#ifndef REGEX_MALLOC
0b32bf0e 4614# ifdef C_ALLOCA
fa9a63c5 4615 alloca (0);
0b32bf0e 4616# endif
fa9a63c5
RM
4617#endif
4618
4619 if (val >= 0)
4620 return startpos;
5e69f11e 4621
fa9a63c5
RM
4622 if (val == -2)
4623 return -2;
4624
4625 advance:
5e69f11e 4626 if (!range)
25fe55af 4627 break;
5e69f11e 4628 else if (range > 0)
25fe55af 4629 {
b18215fc
RS
4630 /* Update STARTPOS to the next character boundary. */
4631 if (multibyte)
4632 {
66f0296e
SM
4633 re_char *p = POS_ADDR_VSTRING (startpos);
4634 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4635 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4636
4637 range -= len;
4638 if (range < 0)
4639 break;
4640 startpos += len;
4641 }
4642 else
4643 {
b560c397
RS
4644 range--;
4645 startpos++;
4646 }
e318085a 4647 }
fa9a63c5 4648 else
25fe55af
RS
4649 {
4650 range++;
4651 startpos--;
b18215fc
RS
4652
4653 /* Update STARTPOS to the previous character boundary. */
4654 if (multibyte)
4655 {
70806df6
KH
4656 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4657 re_char *p0 = p;
4658 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4659
4660 /* Find the head of multibyte form. */
70806df6
KH
4661 PREV_CHAR_BOUNDARY (p, phead);
4662 range += p0 - 1 - p;
4663 if (range > 0)
4664 break;
b18215fc 4665
70806df6 4666 startpos -= p0 - 1 - p;
b18215fc 4667 }
25fe55af 4668 }
fa9a63c5
RM
4669 }
4670 return -1;
4671} /* re_search_2 */
c0f9ea08 4672WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4673\f
4674/* Declarations and macros for re_match_2. */
4675
2d1675e4
SM
4676static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4677 register int len,
4678 RE_TRANSLATE_TYPE translate,
4679 const int multibyte));
fa9a63c5
RM
4680
4681/* This converts PTR, a pointer into one of the search strings `string1'
4682 and `string2' into an offset from the beginning of that string. */
4683#define POINTER_TO_OFFSET(ptr) \
4684 (FIRST_STRING_P (ptr) \
4685 ? ((regoff_t) ((ptr) - string1)) \
4686 : ((regoff_t) ((ptr) - string2 + size1)))
4687
fa9a63c5 4688/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4689 string2 if necessary.
4690 Check re_match_2_internal for a discussion of why end_match_2 might
4691 not be within string2 (but be equal to end_match_1 instead).
 The expansion reads and assigns `d' and `dend', reads `string2' and
 `end_match_2', and jumps to the label `fail' when `dend' is already
 `end_match_2'; all of these must be in scope at the point of use. */
fa9a63c5 4692#define PREFETCH() \
25fe55af 4693 while (d == dend) \
fa9a63c5
RM
4694 { \
4695 /* End of string2 => fail. */ \
25fe55af
RS
4696 if (dend == end_match_2) \
4697 goto fail; \
4bb91c68 4698 /* End of string1 => advance to string2. */ \
25fe55af 4699 d = string2; \
fa9a63c5
RM
4700 dend = end_match_2; \
4701 }
4702
f1ad044f
SM
4703/* Call before fetching a char with *d if you already checked other limits.
4704 This is meant for use in lookahead operations like wordend, etc.,
4705 where we might need to look at parts of the string that might be
4706 outside of the LIMITs (i.e. past `stop').
 Unlike PREFETCH this never jumps to `fail': it only moves `d' (and
 `dend') over to string2 when `d' sits exactly at `end1'.  The
 expansion is a bare `if' statement with no `else', so brace it when
 using it as the body of an if/else. */
4707#define PREFETCH_NOLIMIT() \
4708 if (d == end1) \
4709 { \
4710 d = string2; \
4711 dend = end_match_2; \
4712 } \
fa9a63c5
RM
4713
4714/* Test if at very beginning or at very end of the virtual concatenation
7814e705 4715 of `string1' and `string2'. If only one string, it's `string2'.
 Note that AT_STRINGS_BEG is also true, whatever D is, whenever
 `size2' is zero. */
fa9a63c5 4716#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4717#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4718
4719
4720/* Test if D points to a character which is word-constituent. We have
4721 two special cases to check for: if past the end of string1, look at
4722 the first character in string2; and if before the beginning of
4723 string2, look at the last character in string1.
 D appears up to three times in the expansion, so it must have no
 side effects.  The character is read as a single byte (*D). */
4724#define WORDCHAR_P(d) \
4725 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4726 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4727 == Sword)
4728
9121ca40 4729/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4730
4731/* The comment at case wordbound is the following one, but we no longer
4732 use AT_WORD_BOUNDARY, in order to support multibyte forms.
4733
4734 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4735 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4736 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4737 macro and introducing temporary variables works around the bug. */
4738
9121ca40 4739#if 0
fa9a63c5
RM
4740/* Test if the character before D and the one at D differ with respect
4741 to being word-constituent. */
4742#define AT_WORD_BOUNDARY(d) \
4743 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4744 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4745#endif
fa9a63c5
RM
4746
4747/* Free everything we malloc.
 FREE_VAR releases VAR with REGEX_FREE only when it is non-null and
 then resets it to NULL.  Its expansion ends in a dangling `else' so
 that the semicolon written after each use becomes the (empty)
 else-branch.  FREE_VARIABLES releases the failure stack and the four
 register vectors; when MATCH_MAY_ALLOCATE is not defined it expands
 to a no-op expression. */
4748#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4749# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4750# define FREE_VARIABLES() \
fa9a63c5
RM
4751 do { \
4752 REGEX_FREE_STACK (fail_stack.stack); \
4753 FREE_VAR (regstart); \
4754 FREE_VAR (regend); \
fa9a63c5
RM
4755 FREE_VAR (best_regstart); \
4756 FREE_VAR (best_regend); \
fa9a63c5
RM
4757 } while (0)
4758#else
0b32bf0e 4759# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4760#endif /* not MATCH_MAY_ALLOCATE */
4761
505bde11
SM
4762\f
4763/* Optimization routines. */
4764
4e8a9132
SM
4765/* If the operation is a match against one or more chars,
4766 return a pointer to the next operation, else return NULL. */
01618498 4767static re_char *
4e8a9132 4768skip_one_char (p)
01618498 4769 re_char *p;
4e8a9132
SM
4770{
4771 switch (SWITCH_ENUM_CAST (*p++))
4772 {
4773 case anychar:
4774 break;
177c0ea7 4775
4e8a9132
SM
4776 case exactn:
4777 p += *p + 1;
4778 break;
4779
4780 case charset_not:
4781 case charset:
4782 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4783 {
4784 int mcnt;
4785 p = CHARSET_RANGE_TABLE (p - 1);
4786 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4787 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4788 }
4789 else
4790 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4791 break;
177c0ea7 4792
4e8a9132
SM
4793 case syntaxspec:
4794 case notsyntaxspec:
1fb352e0 4795#ifdef emacs
4e8a9132
SM
4796 case categoryspec:
4797 case notcategoryspec:
4798#endif /* emacs */
4799 p++;
4800 break;
4801
4802 default:
4803 p = NULL;
4804 }
4805 return p;
4806}
4807
4808
505bde11 4809/* Jump over non-matching operations. */
839966f3 4810static re_char *
4e8a9132 4811skip_noops (p, pend)
839966f3 4812 re_char *p, *pend;
505bde11
SM
4813{
4814 int mcnt;
4815 while (p < pend)
4816 {
4817 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4818 {
4819 case start_memory:
505bde11
SM
4820 case stop_memory:
4821 p += 2; break;
4822 case no_op:
4823 p += 1; break;
4824 case jump:
4825 p += 1;
4826 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4827 p += mcnt;
4828 break;
4829 default:
4830 return p;
4831 }
4832 }
4833 assert (p == pend);
4834 return p;
4835}
4836
4837/* Non-zero if "p1 matches something" implies "p2 fails". */
4838static int
4839mutually_exclusive_p (bufp, p1, p2)
4840 struct re_pattern_buffer *bufp;
839966f3 4841 re_char *p1, *p2;
505bde11 4842{
4e8a9132 4843 re_opcode_t op2;
2d1675e4 4844 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4845 unsigned char *pend = bufp->buffer + bufp->used;
4846
4e8a9132 4847 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4848 && p2 >= bufp->buffer && p2 <= pend);
4849
4850 /* Skip over open/close-group commands.
4851 If what follows this loop is a ...+ construct,
4852 look at what begins its body, since we will have to
4853 match at least one of that. */
4e8a9132
SM
4854 p2 = skip_noops (p2, pend);
4855 /* The same skip can be done for p1, except that this function
4856 is only used in the case where p1 is a simple match operator. */
4857 /* p1 = skip_noops (p1, pend); */
4858
4859 assert (p1 >= bufp->buffer && p1 < pend
4860 && p2 >= bufp->buffer && p2 <= pend);
4861
4862 op2 = p2 == pend ? succeed : *p2;
4863
4864 switch (SWITCH_ENUM_CAST (op2))
505bde11 4865 {
4e8a9132
SM
4866 case succeed:
4867 case endbuf:
4868 /* If we're at the end of the pattern, we can change. */
4869 if (skip_one_char (p1))
505bde11 4870 {
505bde11
SM
4871 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4872 return 1;
505bde11 4873 }
4e8a9132 4874 break;
177c0ea7 4875
4e8a9132 4876 case endline:
4e8a9132
SM
4877 case exactn:
4878 {
01618498 4879 register re_wchar_t c
4e8a9132 4880 = (re_opcode_t) *p2 == endline ? '\n'
cf9c99bc 4881 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2, multibyte);
505bde11 4882
4e8a9132
SM
4883 if ((re_opcode_t) *p1 == exactn)
4884 {
cf9c99bc 4885 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2, multibyte))
4e8a9132
SM
4886 {
4887 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4888 return 1;
4889 }
4890 }
505bde11 4891
4e8a9132
SM
4892 else if ((re_opcode_t) *p1 == charset
4893 || (re_opcode_t) *p1 == charset_not)
4894 {
4895 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4896
4e8a9132
SM
4897 /* Test if C is listed in charset (or charset_not)
4898 at `p1'. */
6fdd04b0 4899 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4900 {
4901 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4902 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4903 not = !not;
4904 }
4905 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4906 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4907
4e8a9132
SM
4908 /* `not' is equal to 1 if c would match, which means
4909 that we can't change to pop_failure_jump. */
4910 if (!not)
4911 {
4912 DEBUG_PRINT1 (" No match => fast loop.\n");
4913 return 1;
4914 }
4915 }
4916 else if ((re_opcode_t) *p1 == anychar
4917 && c == '\n')
4918 {
4919 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4920 return 1;
4921 }
4922 }
4923 break;
505bde11 4924
4e8a9132 4925 case charset:
4e8a9132
SM
4926 {
4927 if ((re_opcode_t) *p1 == exactn)
4928 /* Reuse the code above. */
4929 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4930
505bde11
SM
4931 /* It is hard to list up all the character in charset
4932 P2 if it includes multibyte character. Give up in
4933 such case. */
4934 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4935 {
4936 /* Now, we are sure that P2 has no range table.
4937 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4938 enough. But P1 may have range table, so the
505bde11
SM
4939 size of bitmap table of P1 is extracted by
4940 using macro `CHARSET_BITMAP_SIZE'.
4941
6fdd04b0
KH
4942 In a multibyte case, we know that all the character
4943 listed in P2 is ASCII. In a unibyte case, P1 has only a
4944 bitmap table. So, in both cases, it is enough to test
4945 only the bitmap table of P1. */
505bde11 4946
411e4203 4947 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4948 {
4949 int idx;
4950 /* We win if the charset inside the loop
4951 has no overlap with the one after the loop. */
4952 for (idx = 0;
4953 (idx < (int) p2[1]
4954 && idx < CHARSET_BITMAP_SIZE (p1));
4955 idx++)
4956 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4957 break;
4958
4959 if (idx == p2[1]
4960 || idx == CHARSET_BITMAP_SIZE (p1))
4961 {
4962 DEBUG_PRINT1 (" No match => fast loop.\n");
4963 return 1;
4964 }
4965 }
411e4203 4966 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4967 {
4968 int idx;
4969 /* We win if the charset_not inside the loop lists
7814e705 4970 every character listed in the charset after. */
505bde11
SM
4971 for (idx = 0; idx < (int) p2[1]; idx++)
4972 if (! (p2[2 + idx] == 0
4973 || (idx < CHARSET_BITMAP_SIZE (p1)
4974 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4975 break;
4976
4e8a9132
SM
4977 if (idx == p2[1])
4978 {
4979 DEBUG_PRINT1 (" No match => fast loop.\n");
4980 return 1;
4981 }
4982 }
4983 }
4984 }
609b757a 4985 break;
177c0ea7 4986
411e4203
SM
4987 case charset_not:
4988 switch (SWITCH_ENUM_CAST (*p1))
4989 {
4990 case exactn:
4991 case charset:
4992 /* Reuse the code above. */
4993 return mutually_exclusive_p (bufp, p2, p1);
4994 case charset_not:
4995 /* When we have two charset_not, it's very unlikely that
4996 they don't overlap. The union of the two sets of excluded
4997 chars should cover all possible chars, which, as a matter of
4998 fact, is virtually impossible in multibyte buffers. */
36595814 4999 break;
411e4203
SM
5000 }
5001 break;
5002
4e8a9132 5003 case wordend:
669fa600
SM
5004 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
5005 case symend:
4e8a9132 5006 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
5007 && (p1[1] == Ssymbol || p1[1] == Sword));
5008 case notsyntaxspec:
5009 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
5010
5011 case wordbeg:
669fa600
SM
5012 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
5013 case symbeg:
4e8a9132 5014 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
5015 && (p1[1] == Ssymbol || p1[1] == Sword));
5016 case syntaxspec:
5017 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
5018
5019 case wordbound:
5020 return (((re_opcode_t) *p1 == notsyntaxspec
5021 || (re_opcode_t) *p1 == syntaxspec)
5022 && p1[1] == Sword);
5023
1fb352e0 5024#ifdef emacs
4e8a9132
SM
5025 case categoryspec:
5026 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
5027 case notcategoryspec:
5028 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
5029#endif /* emacs */
5030
5031 default:
5032 ;
505bde11
SM
5033 }
5034
5035 /* Safe default. */
5036 return 0;
5037}
5038
fa9a63c5
RM
5039\f
5040/* Matching routines. */
5041
25fe55af 5042#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
5043/* re_match is like re_match_2 except it takes only a single string. */
5044
5045int
5046re_match (bufp, string, size, pos, regs)
5047 struct re_pattern_buffer *bufp;
5048 const char *string;
5049 int size, pos;
5050 struct re_registers *regs;
5051{
4bb91c68 5052 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 5053 pos, regs, size);
0b32bf0e 5054# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 5055 alloca (0);
0b32bf0e 5056# endif
fa9a63c5
RM
5057 return result;
5058}
c0f9ea08 5059WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
5060#endif /* not emacs */
5061
b18215fc
RS
5062#ifdef emacs
5063/* In Emacs, this is the string or buffer in which we
7814e705 5064 are matching. It is used for looking up syntax properties.
 re_match_2 copies it into gl_state.object and passes it to
 SETUP_SYNTAX_TABLE_FOR_OBJECT before it starts matching. */
b18215fc
RS
5065Lisp_Object re_match_object;
5066#endif
fa9a63c5
RM
5067
5068/* re_match_2 matches the compiled pattern in BUFP against the
5069 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5070 and SIZE2, respectively). We start matching at POS, and stop
5071 matching at STOP.
5e69f11e 5072
fa9a63c5 5073 If REGS is non-null and the `no_sub' field of BUFP is zero, we
7814e705 5074 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
5075 documentation for exactly how many groups we fill.
5076
5077 We return -1 if no match, -2 if an internal error (such as the
7814e705 5078 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
5079 matched substring. */
5080
5081int
5082re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
5083 struct re_pattern_buffer *bufp;
5084 const char *string1, *string2;
5085 int size1, size2;
5086 int pos;
5087 struct re_registers *regs;
5088 int stop;
5089{
b18215fc 5090 int result;
25fe55af 5091
b18215fc 5092#ifdef emacs
cc9b4df2
KH
5093 int charpos;
5094 gl_state.object = re_match_object;
99633e97 5095 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 5096 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
5097#endif
5098
4bb91c68
SM
5099 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5100 (re_char*) string2, size2,
cc9b4df2 5101 pos, regs, stop);
0b32bf0e 5102#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 5103 alloca (0);
a60198e5 5104#endif
fa9a63c5
RM
5105 return result;
5106}
c0f9ea08 5107WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 5108
bf216479 5109
fa9a63c5 5110/* This is a separate function so that we can force an alloca cleanup
7814e705 5111 afterwards. */
fa9a63c5
RM
5112static int
5113re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5114 struct re_pattern_buffer *bufp;
66f0296e 5115 re_char *string1, *string2;
fa9a63c5
RM
5116 int size1, size2;
5117 int pos;
5118 struct re_registers *regs;
5119 int stop;
5120{
5121 /* General temporaries. */
5122 int mcnt;
01618498 5123 size_t reg;
66f0296e 5124 boolean not;
fa9a63c5
RM
5125
5126 /* Just past the end of the corresponding string. */
66f0296e 5127 re_char *end1, *end2;
fa9a63c5
RM
5128
5129 /* Pointers into string1 and string2, just past the last characters in
7814e705 5130 each to consider matching. */
66f0296e 5131 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
5132
5133 /* Where we are in the data, and the end of the current string. */
66f0296e 5134 re_char *d, *dend;
5e69f11e 5135
99633e97
SM
5136 /* Used sometimes to remember where we were before starting matching
5137 an operator so that we can go back in case of failure. This "atomic"
5138 behavior of matching opcodes is indispensable to the correctness
5139 of the on_failure_keep_string_jump optimization. */
5140 re_char *dfail;
5141
fa9a63c5 5142 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5143 re_char *p = bufp->buffer;
5144 re_char *pend = p + bufp->used;
fa9a63c5 5145
25fe55af 5146 /* We use this to map every character in the string. */
6676cb1c 5147 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5148
cf9c99bc 5149 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5150 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5151
cf9c99bc
KH
5152 /* Nonzero if STRING1/STRING2 are multibyte. */
5153 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5154
fa9a63c5
RM
5155 /* Failure point stack. Each place that can handle a failure further
5156 down the line pushes a failure point on this stack. It consists of
505bde11 5157 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5158 the subexpressions we're currently inside, plus the number of such
5159 registers, and, finally, two char *'s. The first char * is where
5160 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5161 scanning the strings. */
5162#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5163 fail_stack_type fail_stack;
5164#endif
5165#ifdef DEBUG
fa9a63c5
RM
5166 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5167#endif
5168
0b32bf0e 5169#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5170 /* This holds the pointer to the failure stack, when
5171 it is allocated relocatably. */
5172 fail_stack_elt_t *failure_stack_ptr;
99633e97 5173#endif
fa9a63c5
RM
5174
5175 /* We fill all the registers internally, independent of what we
7814e705 5176 return, for use in backreferences. The number here includes
fa9a63c5 5177 an element for register zero. */
4bb91c68 5178 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5179
fa9a63c5
RM
5180 /* Information on the contents of registers. These are pointers into
5181 the input strings; they record just what was matched (on this
5182 attempt) by a subexpression part of the pattern, that is, the
5183 regnum-th regstart pointer points to where in the pattern we began
5184 matching and the regnum-th regend points to right after where we
5185 stopped matching the regnum-th subexpression. (The zeroth register
5186 keeps track of what the whole pattern matches.) */
5187#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5188 re_char **regstart, **regend;
fa9a63c5
RM
5189#endif
5190
fa9a63c5 5191 /* The following record the register info as found in the above
5e69f11e 5192 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5193 This happens as we backtrack through the failure points, which in
5194 turn happens only if we have not yet matched the entire string. */
5195 unsigned best_regs_set = false;
5196#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5197 re_char **best_regstart, **best_regend;
fa9a63c5 5198#endif
5e69f11e 5199
fa9a63c5
RM
5200 /* Logically, this is `best_regend[0]'. But we don't want to have to
5201 allocate space for that if we're not allocating space for anything
7814e705 5202 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5203 any of the other register vectors, and it seems rather a kludge to
5204 treat `best_regend' differently than the rest. So we keep track of
5205 the end of the best match so far in a separate variable. We
5206 initialize this to NULL so that when we backtrack the first time
5207 and need to test it, it's not garbage. */
66f0296e 5208 re_char *match_end = NULL;
fa9a63c5 5209
fa9a63c5
RM
5210#ifdef DEBUG
5211 /* Counts the total number of registers pushed. */
5e69f11e 5212 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5213#endif
5214
5215 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5216
fa9a63c5 5217 INIT_FAIL_STACK ();
5e69f11e 5218
fa9a63c5
RM
5219#ifdef MATCH_MAY_ALLOCATE
5220 /* Do not bother to initialize all the register variables if there are
5221 no groups in the pattern, as it takes a fair amount of time. If
5222 there are groups, we include space for register 0 (the whole
5223 pattern), even though we never use it, since it simplifies the
5224 array indexing. We should fix this. */
5225 if (bufp->re_nsub)
5226 {
66f0296e
SM
5227 regstart = REGEX_TALLOC (num_regs, re_char *);
5228 regend = REGEX_TALLOC (num_regs, re_char *);
5229 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5230 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5231
505bde11 5232 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5233 {
5234 FREE_VARIABLES ();
5235 return -2;
5236 }
fa9a63c5
RM
5237 }
5238 else
5239 {
5240 /* We must initialize all our variables to NULL, so that
25fe55af 5241 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5242 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5243 }
5244#endif /* MATCH_MAY_ALLOCATE */
5245
5246 /* The starting position is bogus. */
5247 if (pos < 0 || pos > size1 + size2)
5248 {
5249 FREE_VARIABLES ();
5250 return -1;
5251 }
5e69f11e 5252
fa9a63c5
RM
5253 /* Initialize subexpression text positions to NULL to mark ones that no
5254 start_memory/stop_memory has been seen for. Also initialize the
5255 register information struct. */
01618498
SM
5256 for (reg = 1; reg < num_regs; reg++)
5257 regstart[reg] = regend[reg] = NULL;
99633e97 5258
fa9a63c5 5259 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5260 `string1' is null. */
fa9a63c5
RM
5261 if (size2 == 0 && string1 != NULL)
5262 {
5263 string2 = string1;
5264 size2 = size1;
5265 string1 = 0;
5266 size1 = 0;
5267 }
5268 end1 = string1 + size1;
5269 end2 = string2 + size2;
5270
5e69f11e 5271 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5272 `dend' is the end of the input string that `d' points within. `d'
5273 is advanced into the following input string whenever necessary, but
5274 this happens before fetching; therefore, at the beginning of the
5275 loop, `d' can be pointing at the end of a string, but it cannot
5276 equal `string2'. */
419d1c74 5277 if (pos >= size1)
fa9a63c5 5278 {
419d1c74
SM
5279 /* Only match within string2. */
5280 d = string2 + pos - size1;
5281 dend = end_match_2 = string2 + stop - size1;
5282 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5283 }
5284 else
5285 {
f1ad044f 5286 if (stop < size1)
419d1c74
SM
5287 {
5288 /* Only match within string1. */
5289 end_match_1 = string1 + stop;
5290 /* BEWARE!
5291 When we reach end_match_1, PREFETCH normally switches to string2.
5292 But in the present case, this means that just doing a PREFETCH
5293 makes us jump from `stop' to `gap' within the string.
5294 What we really want here is for the search to stop as
5295 soon as we hit end_match_1. That's why we set end_match_2
5296 to end_match_1 (since PREFETCH fails as soon as we hit
5297 end_match_2). */
5298 end_match_2 = end_match_1;
5299 }
5300 else
f1ad044f
SM
5301 { /* It's important to use this code when stop == size so that
5302 moving `d' from end1 to string2 will not prevent the d == dend
5303 check from catching the end of string. */
419d1c74
SM
5304 end_match_1 = end1;
5305 end_match_2 = string2 + stop - size1;
5306 }
5307 d = string1 + pos;
5308 dend = end_match_1;
fa9a63c5
RM
5309 }
5310
5311 DEBUG_PRINT1 ("The compiled pattern is: ");
5312 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5313 DEBUG_PRINT1 ("The string to match is: `");
5314 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5315 DEBUG_PRINT1 ("'\n");
5e69f11e 5316
7814e705 5317 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5318 function if the match is complete, or it drops through if the match
5319 fails at this starting point in the input data. */
5320 for (;;)
5321 {
505bde11 5322 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5323
5324 if (p == pend)
5325 { /* End of pattern means we might have succeeded. */
25fe55af 5326 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5327
fa9a63c5 5328 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5329 longest match, try backtracking. */
5330 if (d != end_match_2)
fa9a63c5
RM
5331 {
5332 /* 1 if this match ends in the same string (string1 or string2)
5333 as the best previous match. */
5e69f11e 5334 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5335 == FIRST_STRING_P (d));
fa9a63c5
RM
5336 /* 1 if this match is the best seen so far. */
5337 boolean best_match_p;
5338
5339 /* AIX compiler got confused when this was combined
7814e705 5340 with the previous declaration. */
fa9a63c5
RM
5341 if (same_str_p)
5342 best_match_p = d > match_end;
5343 else
99633e97 5344 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5345
25fe55af
RS
5346 DEBUG_PRINT1 ("backtracking.\n");
5347
5348 if (!FAIL_STACK_EMPTY ())
5349 { /* More failure points to try. */
5350
5351 /* If exceeds best match so far, save it. */
5352 if (!best_regs_set || best_match_p)
5353 {
5354 best_regs_set = true;
5355 match_end = d;
5356
5357 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5358
01618498 5359 for (reg = 1; reg < num_regs; reg++)
25fe55af 5360 {
01618498
SM
5361 best_regstart[reg] = regstart[reg];
5362 best_regend[reg] = regend[reg];
25fe55af
RS
5363 }
5364 }
5365 goto fail;
5366 }
5367
5368 /* If no failure points, don't restore garbage. And if
5369 last match is real best match, don't restore second
5370 best one. */
5371 else if (best_regs_set && !best_match_p)
5372 {
5373 restore_best_regs:
5374 /* Restore best match. It may happen that `dend ==
5375 end_match_1' while the restored d is in string2.
5376 For example, the pattern `x.*y.*z' against the
5377 strings `x-' and `y-z-', if the two strings are
7814e705 5378 not consecutive in memory. */
25fe55af
RS
5379 DEBUG_PRINT1 ("Restoring best registers.\n");
5380
5381 d = match_end;
5382 dend = ((d >= string1 && d <= end1)
5383 ? end_match_1 : end_match_2);
fa9a63c5 5384
01618498 5385 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5386 {
01618498
SM
5387 regstart[reg] = best_regstart[reg];
5388 regend[reg] = best_regend[reg];
fa9a63c5 5389 }
25fe55af
RS
5390 }
5391 } /* d != end_match_2 */
fa9a63c5
RM
5392
5393 succeed_label:
25fe55af 5394 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5395
25fe55af
RS
5396 /* If caller wants register contents data back, do it. */
5397 if (regs && !bufp->no_sub)
fa9a63c5 5398 {
25fe55af
RS
5399 /* Have the register data arrays been allocated? */
5400 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5401 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5402 extra element beyond `num_regs' for the `-1' marker
5403 GNU code uses. */
5404 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5405 regs->start = TALLOC (regs->num_regs, regoff_t);
5406 regs->end = TALLOC (regs->num_regs, regoff_t);
5407 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5408 {
5409 FREE_VARIABLES ();
5410 return -2;
5411 }
25fe55af
RS
5412 bufp->regs_allocated = REGS_REALLOCATE;
5413 }
5414 else if (bufp->regs_allocated == REGS_REALLOCATE)
5415 { /* Yes. If we need more elements than were already
5416 allocated, reallocate them. If we need fewer, just
5417 leave it alone. */
5418 if (regs->num_regs < num_regs + 1)
5419 {
5420 regs->num_regs = num_regs + 1;
5421 RETALLOC (regs->start, regs->num_regs, regoff_t);
5422 RETALLOC (regs->end, regs->num_regs, regoff_t);
5423 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5424 {
5425 FREE_VARIABLES ();
5426 return -2;
5427 }
25fe55af
RS
5428 }
5429 }
5430 else
fa9a63c5
RM
5431 {
5432 /* These braces fend off a "empty body in an else-statement"
7814e705 5433 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5434 assert (bufp->regs_allocated == REGS_FIXED);
5435 }
5436
25fe55af
RS
5437 /* Convert the pointer data in `regstart' and `regend' to
5438 indices. Register zero has to be set differently,
5439 since we haven't kept track of any info for it. */
5440 if (regs->num_regs > 0)
5441 {
5442 regs->start[0] = pos;
99633e97 5443 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5444 }
5e69f11e 5445
25fe55af
RS
5446 /* Go through the first `min (num_regs, regs->num_regs)'
5447 registers, since that is all we initialized. */
01618498 5448 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5449 {
01618498
SM
5450 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5451 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5452 else
5453 {
01618498
SM
5454 regs->start[reg]
5455 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5456 regs->end[reg]
5457 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5458 }
fa9a63c5 5459 }
5e69f11e 5460
25fe55af
RS
5461 /* If the regs structure we return has more elements than
5462 were in the pattern, set the extra elements to -1. If
5463 we (re)allocated the registers, this is the case,
5464 because we always allocate enough to have at least one
7814e705 5465 -1 at the end. */
01618498
SM
5466 for (reg = num_regs; reg < regs->num_regs; reg++)
5467 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5468 } /* regs && !bufp->no_sub */
5469
25fe55af
RS
5470 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5471 nfailure_points_pushed, nfailure_points_popped,
5472 nfailure_points_pushed - nfailure_points_popped);
5473 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5474
99633e97 5475 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5476
25fe55af 5477 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5478
25fe55af
RS
5479 FREE_VARIABLES ();
5480 return mcnt;
5481 }
fa9a63c5 5482
7814e705 5483 /* Otherwise match next pattern command. */
fa9a63c5
RM
5484 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5485 {
25fe55af
RS
5486 /* Ignore these. Used to ignore the n of succeed_n's which
5487 currently have n == 0. */
5488 case no_op:
5489 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5490 break;
fa9a63c5
RM
5491
5492 case succeed:
25fe55af 5493 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5494 goto succeed_label;
5495
7814e705 5496 /* Match the next n pattern characters exactly. The following
25fe55af 5497 byte in the pattern defines n, and the n bytes after that
7814e705 5498 are the characters to match. */
fa9a63c5
RM
5499 case exactn:
5500 mcnt = *p++;
25fe55af 5501 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5502
99633e97
SM
5503 /* Remember the start point to rollback upon failure. */
5504 dfail = d;
5505
6fdd04b0 5506#ifndef emacs
25fe55af
RS
5507 /* This is written out as an if-else so we don't waste time
5508 testing `translate' inside the loop. */
28703c16 5509 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5510 do
5511 {
5512 PREFETCH ();
5513 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5514 {
6fdd04b0
KH
5515 d = dfail;
5516 goto fail;
e934739e 5517 }
6fdd04b0
KH
5518 d++;
5519 }
5520 while (--mcnt);
fa9a63c5 5521 else
6fdd04b0
KH
5522 do
5523 {
5524 PREFETCH ();
5525 if (*d++ != *p++)
bf216479 5526 {
6fdd04b0
KH
5527 d = dfail;
5528 goto fail;
bf216479 5529 }
6fdd04b0
KH
5530 }
5531 while (--mcnt);
5532#else /* emacs */
5533 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5534 if (target_multibyte)
6fdd04b0
KH
5535 do
5536 {
5537 int pat_charlen, buf_charlen;
cf9c99bc 5538 int pat_ch, buf_ch;
e934739e 5539
6fdd04b0 5540 PREFETCH ();
cf9c99bc
KH
5541 if (multibyte)
5542 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5543 else
5544 {
5545 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5546 pat_charlen = 1;
5547 }
6fdd04b0 5548 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e 5549
6fdd04b0 5550 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5551 {
6fdd04b0
KH
5552 d = dfail;
5553 goto fail;
e934739e 5554 }
bf216479 5555
6fdd04b0
KH
5556 p += pat_charlen;
5557 d += buf_charlen;
5558 mcnt -= pat_charlen;
5559 }
5560 while (mcnt > 0);
fa9a63c5 5561 else
6fdd04b0
KH
5562 do
5563 {
cf9c99bc
KH
5564 int pat_charlen, buf_charlen;
5565 int pat_ch, buf_ch;
bf216479 5566
6fdd04b0 5567 PREFETCH ();
cf9c99bc
KH
5568 if (multibyte)
5569 {
5570 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5571 if (CHAR_BYTE8_P (pat_ch))
5572 pat_ch = CHAR_TO_BYTE8 (pat_ch);
5573 else
5574 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
5575 }
5576 else
5577 {
5578 pat_ch = *p;
5579 pat_charlen = 1;
5580 }
5581 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5582 if (! CHAR_BYTE8_P (buf_ch))
5583 {
5584 buf_ch = TRANSLATE (buf_ch);
5585 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5586 if (buf_ch < 0)
5587 buf_ch = *d;
5588 }
5589 if (buf_ch != pat_ch)
6fdd04b0
KH
5590 {
5591 d = dfail;
5592 goto fail;
bf216479 5593 }
cf9c99bc
KH
5594 p += pat_charlen;
5595 d++;
6fdd04b0
KH
5596 }
5597 while (--mcnt);
5598#endif
25fe55af 5599 break;
fa9a63c5
RM
5600
5601
25fe55af 5602 /* Match any character except possibly a newline or a null. */
fa9a63c5 5603 case anychar:
e934739e
RS
5604 {
5605 int buf_charlen;
01618498 5606 re_wchar_t buf_ch;
fa9a63c5 5607
e934739e 5608 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5609
e934739e 5610 PREFETCH ();
cf9c99bc
KH
5611 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen,
5612 target_multibyte);
e934739e
RS
5613 buf_ch = TRANSLATE (buf_ch);
5614
5615 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5616 && buf_ch == '\n')
5617 || ((bufp->syntax & RE_DOT_NOT_NULL)
5618 && buf_ch == '\000'))
5619 goto fail;
5620
e934739e
RS
5621 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5622 d += buf_charlen;
5623 }
fa9a63c5
RM
5624 break;
5625
5626
5627 case charset:
5628 case charset_not:
5629 {
b18215fc 5630 register unsigned int c;
fa9a63c5 5631 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5632 int len;
5633
5634 /* Start of actual range_table, or end of bitmap if there is no
5635 range table. */
01618498 5636 re_char *range_table;
b18215fc 5637
96cc36cc 5638 /* Nonzero if there is a range table. */
b18215fc
RS
5639 int range_table_exists;
5640
96cc36cc
RS
5641 /* Number of ranges of range table. This is not included
5642 in the initial byte-length of the command. */
5643 int count = 0;
fa9a63c5 5644
25fe55af 5645 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5646
b18215fc 5647 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5648
b18215fc 5649 if (range_table_exists)
96cc36cc
RS
5650 {
5651 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5652 EXTRACT_NUMBER_AND_INCR (count, range_table);
5653 }
b18215fc 5654
2d1675e4 5655 PREFETCH ();
cf9c99bc
KH
5656 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len, target_multibyte);
5657 if (target_multibyte)
5658 {
5659 int c1;
b18215fc 5660
cf9c99bc
KH
5661 c = TRANSLATE (c);
5662 c1 = RE_CHAR_TO_UNIBYTE (c);
5663 if (c1 >= 0)
5664 c = c1;
5665 }
5666 else
5667 {
5668 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5669
5670 if (! CHAR_BYTE8_P (c1))
5671 {
5672 c1 = TRANSLATE (c1);
5673 c1 = RE_CHAR_TO_UNIBYTE (c1);
5674 if (c1 >= 0)
5675 c = c1;
5676 }
5677 }
5678
5679 if (c < (1 << BYTEWIDTH))
b18215fc 5680 { /* Lookup bitmap. */
b18215fc
RS
5681 /* Cast to `unsigned' instead of `unsigned char' in
5682 case the bit list is a full 32 bytes long. */
5683 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5684 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5685 not = !not;
b18215fc 5686 }
96cc36cc 5687#ifdef emacs
b18215fc 5688 else if (range_table_exists)
96cc36cc
RS
5689 {
5690 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5691
14473664
SM
5692 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5693 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5694 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5695 | (class_bits & BIT_SPACE && ISSPACE (c))
5696 | (class_bits & BIT_UPPER && ISUPPER (c))
5697 | (class_bits & BIT_WORD && ISWORD (c)))
5698 not = !not;
5699 else
5700 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5701 }
5702#endif /* emacs */
fa9a63c5 5703
96cc36cc
RS
5704 if (range_table_exists)
5705 p = CHARSET_RANGE_TABLE_END (range_table, count);
5706 else
5707 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5708
5709 if (!not) goto fail;
5e69f11e 5710
b18215fc 5711 d += len;
fa9a63c5
RM
5712 break;
5713 }
5714
5715
25fe55af 5716 /* The beginning of a group is represented by start_memory.
505bde11 5717 The argument is the register number. The text
25fe55af 5718 matched within the group is recorded (in the internal
7814e705 5719 registers data structure) under the register number. */
25fe55af 5720 case start_memory:
505bde11
SM
5721 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5722
5723 /* In case we need to undo this operation (via backtracking). */
5724 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5725
25fe55af 5726 regstart[*p] = d;
4bb91c68 5727 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5728 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5729
25fe55af 5730 /* Move past the register number and inner group count. */
505bde11 5731 p += 1;
25fe55af 5732 break;
fa9a63c5
RM
5733
5734
25fe55af 5735 /* The stop_memory opcode represents the end of a group. Its
505bde11 5736 argument is the same as start_memory's: the register number. */
fa9a63c5 5737 case stop_memory:
505bde11
SM
5738 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5739
5740 assert (!REG_UNSET (regstart[*p]));
5741 /* Strictly speaking, there should be code such as:
177c0ea7 5742
0b32bf0e 5743 assert (REG_UNSET (regend[*p]));
505bde11
SM
5744 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5745
5746 But the only info to be pushed is regend[*p] and it is known to
5747 be UNSET, so there really isn't anything to push.
5748 Not pushing anything, on the other hand deprives us from the
5749 guarantee that regend[*p] is UNSET since undoing this operation
5750 will not reset its value properly. This is not important since
5751 the value will only be read on the next start_memory or at
5752 the very end and both events can only happen if this stop_memory
5753 is *not* undone. */
fa9a63c5 5754
25fe55af 5755 regend[*p] = d;
fa9a63c5
RM
5756 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5757
25fe55af 5758 /* Move past the register number and the inner group count. */
505bde11 5759 p += 1;
25fe55af 5760 break;
fa9a63c5
RM
5761
5762
5763 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5764 followed by the numeric value of <digit> as the register number. */
5765 case duplicate:
fa9a63c5 5766 {
66f0296e 5767 register re_char *d2, *dend2;
7814e705 5768 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5769 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5770
7814e705 5771 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5772 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5773 goto fail;
5e69f11e 5774
7814e705 5775 /* Where in input to try to start matching. */
25fe55af 5776 d2 = regstart[regno];
5e69f11e 5777
99633e97
SM
5778 /* Remember the start point to rollback upon failure. */
5779 dfail = d;
5780
25fe55af
RS
5781 /* Where to stop matching; if both the place to start and
5782 the place to stop matching are in the same string, then
5783 set to the place to stop, otherwise, for now have to use
5784 the end of the first string. */
fa9a63c5 5785
25fe55af 5786 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5787 == FIRST_STRING_P (regend[regno]))
5788 ? regend[regno] : end_match_1);
5789 for (;;)
5790 {
5791 /* If necessary, advance to next segment in register
25fe55af 5792 contents. */
fa9a63c5
RM
5793 while (d2 == dend2)
5794 {
5795 if (dend2 == end_match_2) break;
5796 if (dend2 == regend[regno]) break;
5797
25fe55af
RS
5798 /* End of string1 => advance to string2. */
5799 d2 = string2;
5800 dend2 = regend[regno];
fa9a63c5
RM
5801 }
5802 /* At end of register contents => success */
5803 if (d2 == dend2) break;
5804
5805 /* If necessary, advance to next segment in data. */
5806 PREFETCH ();
5807
5808 /* How many characters left in this segment to match. */
5809 mcnt = dend - d;
5e69f11e 5810
fa9a63c5 5811 /* Want how many consecutive characters we can match in
25fe55af
RS
5812 one shot, so, if necessary, adjust the count. */
5813 if (mcnt > dend2 - d2)
fa9a63c5 5814 mcnt = dend2 - d2;
5e69f11e 5815
fa9a63c5 5816 /* Compare that many; failure if mismatch, else move
25fe55af 5817 past them. */
28703c16 5818 if (RE_TRANSLATE_P (translate)
2d1675e4 5819 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5820 : memcmp (d, d2, mcnt))
99633e97
SM
5821 {
5822 d = dfail;
5823 goto fail;
5824 }
fa9a63c5 5825 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5826 }
5827 }
5828 break;
5829
5830
25fe55af 5831 /* begline matches the empty string at the beginning of the string
c0f9ea08 5832 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5833 case begline:
25fe55af 5834 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5835
25fe55af
RS
5836 if (AT_STRINGS_BEG (d))
5837 {
5838 if (!bufp->not_bol) break;
5839 }
419d1c74 5840 else
25fe55af 5841 {
bf216479 5842 unsigned c;
419d1c74 5843 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5844 if (c == '\n')
419d1c74 5845 break;
25fe55af
RS
5846 }
5847 /* In all other cases, we fail. */
5848 goto fail;
fa9a63c5
RM
5849
5850
25fe55af 5851 /* endline is the dual of begline. */
fa9a63c5 5852 case endline:
25fe55af 5853 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5854
25fe55af
RS
5855 if (AT_STRINGS_END (d))
5856 {
5857 if (!bufp->not_eol) break;
5858 }
f1ad044f 5859 else
25fe55af 5860 {
f1ad044f 5861 PREFETCH_NOLIMIT ();
c0f9ea08 5862 if (*d == '\n')
f1ad044f 5863 break;
25fe55af
RS
5864 }
5865 goto fail;
fa9a63c5
RM
5866
5867
5868 /* Match at the very beginning of the data. */
25fe55af
RS
5869 case begbuf:
5870 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5871 if (AT_STRINGS_BEG (d))
5872 break;
5873 goto fail;
fa9a63c5
RM
5874
5875
5876 /* Match at the very end of the data. */
25fe55af
RS
5877 case endbuf:
5878 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5879 if (AT_STRINGS_END (d))
5880 break;
25fe55af 5881 goto fail;
5e69f11e 5882
5e69f11e 5883
25fe55af
RS
5884 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5885 pushes NULL as the value for the string on the stack. Then
505bde11 5886 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5887 string, instead of restoring it. To see why, consider
7814e705 5888 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5889 then the . fails against the \n. But the next thing we want
5890 to do is match the \n against the \n; if we restored the
5891 string value, we would be back at the foo.
5892
5893 Because this is used only in specific cases, we don't need to
5894 check all the things that `on_failure_jump' does, to make
5895 sure the right things get saved on the stack. Hence we don't
5896 share its code. The only reason to push anything on the
5897 stack at all is that otherwise we would have to change
5898 `anychar's code to do something besides goto fail in this
5899 case; that seems worse than this. */
5900 case on_failure_keep_string_jump:
505bde11
SM
5901 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5902 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5903 mcnt, p + mcnt);
fa9a63c5 5904
505bde11
SM
5905 PUSH_FAILURE_POINT (p - 3, NULL);
5906 break;
5907
0683b6fa
SM
5908 /* A nasty loop is introduced by the non-greedy *? and +?.
5909 With such loops, the stack only ever contains one failure point
5910 at a time, so that a plain on_failure_jump_loop kind of
5911 cycle detection cannot work. Worse yet, such a detection
5912 can not only fail to detect a cycle, but it can also wrongly
5913 detect a cycle (between different instantiations of the same
6df42991 5914 loop).
0683b6fa
SM
5915 So the method used for those nasty loops is a little different:
5916 We use a special cycle-detection-stack-frame which is pushed
5917 when the on_failure_jump_nastyloop failure-point is *popped*.
5918 This special frame thus marks the beginning of one iteration
5919 through the loop and we can hence easily check right here
5920 whether something matched between the beginning and the end of
5921 the loop. */
5922 case on_failure_jump_nastyloop:
5923 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5924 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5925 mcnt, p + mcnt);
5926
5927 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5928 {
5929 int cycle = 0;
5930 CHECK_INFINITE_LOOP (p - 4, d);
5931 if (!cycle)
5932 /* If there's a cycle, just continue without pushing
5933 this failure point. The failure point is the "try again"
5934 option, which shouldn't be tried.
5935 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5936 PUSH_FAILURE_POINT (p - 3, d);
5937 }
0683b6fa
SM
5938 break;
5939
4e8a9132
SM
5940 /* Simple loop detecting on_failure_jump: just check on the
5941 failure stack if the same spot was already hit earlier. */
505bde11
SM
5942 case on_failure_jump_loop:
5943 on_failure:
5944 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5945 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5946 mcnt, p + mcnt);
6df42991
SM
5947 {
5948 int cycle = 0;
5949 CHECK_INFINITE_LOOP (p - 3, d);
5950 if (cycle)
5951 /* If there's a cycle, get out of the loop, as if the matching
5952 had failed. We used to just `goto fail' here, but that was
5953 aborting the search a bit too early: we want to keep the
5954 empty-loop-match and keep matching after the loop.
5955 We want (x?)*y\1z to match both xxyz and xxyxz. */
5956 p += mcnt;
5957 else
5958 PUSH_FAILURE_POINT (p - 3, d);
5959 }
25fe55af 5960 break;
fa9a63c5
RM
5961
5962
5963 /* Uses of on_failure_jump:
5e69f11e 5964
25fe55af
RS
5965 Each alternative starts with an on_failure_jump that points
5966 to the beginning of the next alternative. Each alternative
5967 except the last ends with a jump that in effect jumps past
5968 the rest of the alternatives. (They really jump to the
5969 ending jump of the following alternative, because tensioning
5970 these jumps is a hassle.)
fa9a63c5 5971
25fe55af
RS
5972 Repeats start with an on_failure_jump that points past both
5973 the repetition text and either the following jump or
5974 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5975 case on_failure_jump:
25fe55af 5976 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5977 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5978 mcnt, p + mcnt);
25fe55af 5979
505bde11 5980 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5981 break;
5982
4e8a9132 5983 /* This operation is used for greedy *.
505bde11
SM
5984 Compare the beginning of the repeat with what in the
5985 pattern follows its end. If we can establish that there
5986 is nothing that they would both match, i.e., that we
5987 would have to backtrack because of (as in, e.g., `a*a')
5988 then we can use a non-backtracking loop based on
4e8a9132 5989 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5990 case on_failure_jump_smart:
25fe55af 5991 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5992 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5993 mcnt, p + mcnt);
25fe55af 5994 {
01618498 5995 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5996 /* Here, we discard `const', making re_match non-reentrant. */
5997 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5998 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5999
505bde11
SM
6000 p -= 3; /* Reset so that we will re-execute the
6001 instruction once it's been changed. */
fa9a63c5 6002
4e8a9132
SM
6003 EXTRACT_NUMBER (mcnt, p2 - 2);
6004
6005 /* Ensure this is a indeed the trivial kind of loop
6006 we are expecting. */
6007 assert (skip_one_char (p1) == p2 - 3);
6008 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 6009 DEBUG_STATEMENT (debug += 2);
505bde11 6010 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 6011 {
505bde11 6012 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 6013 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 6014 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 6015 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 6016 }
505bde11 6017 else
fa9a63c5 6018 {
505bde11
SM
6019 /* Default to a safe `on_failure_jump' loop. */
6020 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 6021 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 6022 }
99633e97 6023 DEBUG_STATEMENT (debug -= 2);
25fe55af 6024 }
505bde11 6025 break;
25fe55af
RS
6026
6027 /* Unconditionally jump (without popping any failure points). */
6028 case jump:
fa9a63c5 6029 unconditional_jump:
5b370c2b 6030 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6031 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 6032 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 6033 p += mcnt; /* Do the jump. */
505bde11 6034 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
6035 break;
6036
6037
25fe55af
RS
6038 /* Have to succeed matching what follows at least n times.
6039 After that, handle like `on_failure_jump'. */
6040 case succeed_n:
01618498 6041 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
6042 EXTRACT_NUMBER (mcnt, p + 2);
6043 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 6044
dc1e502d
SM
6045 /* Originally, mcnt is how many times we HAVE to succeed. */
6046 if (mcnt != 0)
25fe55af 6047 {
6dcf2d0e
SM
6048 /* Here, we discard `const', making re_match non-reentrant. */
6049 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 6050 mcnt--;
01618498
SM
6051 p += 4;
6052 PUSH_NUMBER (p2, mcnt);
25fe55af 6053 }
dc1e502d
SM
6054 else
6055 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
6056 goto on_failure;
25fe55af
RS
6057 break;
6058
6059 case jump_n:
01618498 6060 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
6061 EXTRACT_NUMBER (mcnt, p + 2);
6062 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
6063
6064 /* Originally, this is how many times we CAN jump. */
dc1e502d 6065 if (mcnt != 0)
25fe55af 6066 {
6dcf2d0e
SM
6067 /* Here, we discard `const', making re_match non-reentrant. */
6068 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 6069 mcnt--;
01618498 6070 PUSH_NUMBER (p2, mcnt);
dc1e502d 6071 goto unconditional_jump;
25fe55af
RS
6072 }
6073 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
6074 else
6075 p += 4;
25fe55af 6076 break;
5e69f11e 6077
fa9a63c5
RM
6078 case set_number_at:
6079 {
01618498 6080 unsigned char *p2; /* Location of the counter. */
25fe55af 6081 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 6082
25fe55af 6083 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
6084 /* Here, we discard `const', making re_match non-reentrant. */
6085 p2 = (unsigned char*) p + mcnt;
01618498 6086 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 6087 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
6088 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6089 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
6090 break;
6091 }
9121ca40
KH
6092
6093 case wordbound:
66f0296e
SM
6094 case notwordbound:
6095 not = (re_opcode_t) *(p - 1) == notwordbound;
6096 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 6097
99633e97 6098 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 6099
b18215fc 6100 /* Case 1: D is at the beginning or the end of string. */
9121ca40 6101 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 6102 not = !not;
b18215fc
RS
6103 else
6104 {
6105 /* C1 is the character before D, S1 is the syntax of C1, C2
6106 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6107 re_wchar_t c1, c2;
6108 int s1, s2;
bf216479 6109 int dummy;
b18215fc 6110#ifdef emacs
2d1675e4
SM
6111 int offset = PTR_TO_OFFSET (d - 1);
6112 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 6113 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6114#endif
66f0296e 6115 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6116 s1 = SYNTAX (c1);
6117#ifdef emacs
5d967c7a 6118 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 6119#endif
f1ad044f 6120 PREFETCH_NOLIMIT ();
6fdd04b0 6121 GET_CHAR_AFTER (c2, d, dummy);
b18215fc
RS
6122 s2 = SYNTAX (c2);
6123
6124 if (/* Case 2: Only one of S1 and S2 is Sword. */
6125 ((s1 == Sword) != (s2 == Sword))
6126 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 6127 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 6128 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
6129 not = !not;
6130 }
6131 if (not)
9121ca40 6132 break;
b18215fc 6133 else
9121ca40 6134 goto fail;
fa9a63c5
RM
6135
6136 case wordbeg:
25fe55af 6137 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6138
b18215fc
RS
6139 /* We FAIL in one of the following cases: */
6140
7814e705 6141 /* Case 1: D is at the end of string. */
b18215fc 6142 if (AT_STRINGS_END (d))
99633e97 6143 goto fail;
b18215fc
RS
6144 else
6145 {
6146 /* C1 is the character before D, S1 is the syntax of C1, C2
6147 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6148 re_wchar_t c1, c2;
6149 int s1, s2;
bf216479 6150 int dummy;
fa9a63c5 6151#ifdef emacs
2d1675e4
SM
6152 int offset = PTR_TO_OFFSET (d);
6153 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6154 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6155#endif
99633e97 6156 PREFETCH ();
6fdd04b0 6157 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6158 s2 = SYNTAX (c2);
177c0ea7 6159
b18215fc
RS
6160 /* Case 2: S2 is not Sword. */
6161 if (s2 != Sword)
6162 goto fail;
6163
6164 /* Case 3: D is not at the beginning of string ... */
6165 if (!AT_STRINGS_BEG (d))
6166 {
6167 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6168#ifdef emacs
5d967c7a 6169 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6170#endif
b18215fc
RS
6171 s1 = SYNTAX (c1);
6172
6173 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6174 returns 0. */
b18215fc
RS
6175 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6176 goto fail;
6177 }
6178 }
e318085a
RS
6179 break;
6180
b18215fc 6181 case wordend:
25fe55af 6182 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6183
6184 /* We FAIL in one of the following cases: */
6185
6186 /* Case 1: D is at the beginning of string. */
6187 if (AT_STRINGS_BEG (d))
e318085a 6188 goto fail;
b18215fc
RS
6189 else
6190 {
6191 /* C1 is the character before D, S1 is the syntax of C1, C2
6192 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6193 re_wchar_t c1, c2;
6194 int s1, s2;
bf216479 6195 int dummy;
5d967c7a 6196#ifdef emacs
2d1675e4
SM
6197 int offset = PTR_TO_OFFSET (d) - 1;
6198 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6199 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6200#endif
99633e97 6201 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6202 s1 = SYNTAX (c1);
6203
6204 /* Case 2: S1 is not Sword. */
6205 if (s1 != Sword)
6206 goto fail;
6207
6208 /* Case 3: D is not at the end of string ... */
6209 if (!AT_STRINGS_END (d))
6210 {
f1ad044f 6211 PREFETCH_NOLIMIT ();
6fdd04b0 6212 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6213#ifdef emacs
6214 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6215#endif
b18215fc
RS
6216 s2 = SYNTAX (c2);
6217
6218 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6219 returns 0. */
b18215fc 6220 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6221 goto fail;
b18215fc
RS
6222 }
6223 }
e318085a
RS
6224 break;
6225
669fa600
SM
6226 case symbeg:
6227 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6228
6229 /* We FAIL in one of the following cases: */
6230
7814e705 6231 /* Case 1: D is at the end of string. */
669fa600
SM
6232 if (AT_STRINGS_END (d))
6233 goto fail;
6234 else
6235 {
6236 /* C1 is the character before D, S1 is the syntax of C1, C2
6237 is the character at D, and S2 is the syntax of C2. */
6238 re_wchar_t c1, c2;
6239 int s1, s2;
6240#ifdef emacs
6241 int offset = PTR_TO_OFFSET (d);
6242 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6243 UPDATE_SYNTAX_TABLE (charpos);
6244#endif
6245 PREFETCH ();
cf9c99bc 6246 c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
669fa600 6247 s2 = SYNTAX (c2);
7814e705 6248
669fa600
SM
6249 /* Case 2: S2 is neither Sword nor Ssymbol. */
6250 if (s2 != Sword && s2 != Ssymbol)
6251 goto fail;
6252
6253 /* Case 3: D is not at the beginning of string ... */
6254 if (!AT_STRINGS_BEG (d))
6255 {
6256 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6257#ifdef emacs
6258 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6259#endif
6260 s1 = SYNTAX (c1);
6261
6262 /* ... and S1 is Sword or Ssymbol. */
6263 if (s1 == Sword || s1 == Ssymbol)
6264 goto fail;
6265 }
6266 }
6267 break;
6268
6269 case symend:
6270 DEBUG_PRINT1 ("EXECUTING symend.\n");
6271
6272 /* We FAIL in one of the following cases: */
6273
6274 /* Case 1: D is at the beginning of string. */
6275 if (AT_STRINGS_BEG (d))
6276 goto fail;
6277 else
6278 {
6279 /* C1 is the character before D, S1 is the syntax of C1, C2
6280 is the character at D, and S2 is the syntax of C2. */
6281 re_wchar_t c1, c2;
6282 int s1, s2;
6283#ifdef emacs
6284 int offset = PTR_TO_OFFSET (d) - 1;
6285 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6286 UPDATE_SYNTAX_TABLE (charpos);
6287#endif
6288 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6289 s1 = SYNTAX (c1);
6290
6291 /* Case 2: S1 is neither Ssymbol nor Sword. */
6292 if (s1 != Sword && s1 != Ssymbol)
6293 goto fail;
6294
6295 /* Case 3: D is not at the end of string ... */
6296 if (!AT_STRINGS_END (d))
6297 {
6298 PREFETCH_NOLIMIT ();
cf9c99bc 6299 c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
669fa600 6300#ifdef emacs
134579f2 6301 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6302#endif
6303 s2 = SYNTAX (c2);
6304
6305 /* ... and S2 is Sword or Ssymbol. */
6306 if (s2 == Sword || s2 == Ssymbol)
6307 goto fail;
b18215fc
RS
6308 }
6309 }
e318085a
RS
6310 break;
6311
fa9a63c5 6312 case syntaxspec:
1fb352e0
SM
6313 case notsyntaxspec:
6314 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6315 mcnt = *p++;
1fb352e0 6316 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6317 PREFETCH ();
b18215fc
RS
6318#ifdef emacs
6319 {
2d1675e4
SM
6320 int offset = PTR_TO_OFFSET (d);
6321 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6322 UPDATE_SYNTAX_TABLE (pos1);
6323 }
25fe55af 6324#endif
b18215fc 6325 {
01618498
SM
6326 int len;
6327 re_wchar_t c;
b18215fc 6328
6fdd04b0 6329 GET_CHAR_AFTER (c, d, len);
990b2375 6330 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6331 goto fail;
b18215fc
RS
6332 d += len;
6333 }
fa9a63c5
RM
6334 break;
6335
b18215fc 6336#ifdef emacs
1fb352e0
SM
6337 case before_dot:
6338 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6339 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6340 goto fail;
b18215fc
RS
6341 break;
6342
1fb352e0
SM
6343 case at_dot:
6344 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6345 if (PTR_BYTE_POS (d) != PT_BYTE)
6346 goto fail;
6347 break;
b18215fc 6348
1fb352e0
SM
6349 case after_dot:
6350 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6351 if (PTR_BYTE_POS (d) <= PT_BYTE)
6352 goto fail;
e318085a 6353 break;
fa9a63c5 6354
1fb352e0 6355 case categoryspec:
b18215fc 6356 case notcategoryspec:
1fb352e0 6357 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6358 mcnt = *p++;
1fb352e0 6359 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6360 PREFETCH ();
6361 {
01618498
SM
6362 int len;
6363 re_wchar_t c;
6364
6fdd04b0 6365 GET_CHAR_AFTER (c, d, len);
1fb352e0 6366 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6367 goto fail;
6368 d += len;
6369 }
fa9a63c5 6370 break;
5e69f11e 6371
1fb352e0 6372#endif /* emacs */
5e69f11e 6373
0b32bf0e
SM
6374 default:
6375 abort ();
fa9a63c5 6376 }
b18215fc 6377 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6378
6379
6380 /* We goto here if a matching operation fails. */
6381 fail:
5b370c2b 6382 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6383 if (!FAIL_STACK_EMPTY ())
505bde11 6384 {
01618498 6385 re_char *str, *pat;
505bde11 6386 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6387 DEBUG_PRINT1 ("\nFAIL:\n");
6388 POP_FAILURE_POINT (str, pat);
505bde11
SM
6389 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6390 {
6391 case on_failure_keep_string_jump:
6392 assert (str == NULL);
6393 goto continue_failure_jump;
6394
0683b6fa
SM
6395 case on_failure_jump_nastyloop:
6396 assert ((re_opcode_t)pat[-2] == no_op);
6397 PUSH_FAILURE_POINT (pat - 2, str);
6398 /* Fallthrough */
6399
505bde11
SM
6400 case on_failure_jump_loop:
6401 case on_failure_jump:
6402 case succeed_n:
6403 d = str;
6404 continue_failure_jump:
6405 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6406 p = pat + mcnt;
6407 break;
b18215fc 6408
0683b6fa
SM
6409 case no_op:
6410 /* A special frame used for nastyloops. */
6411 goto fail;
6412
505bde11
SM
6413 default:
6414 abort();
6415 }
fa9a63c5 6416
505bde11 6417 assert (p >= bufp->buffer && p <= pend);
b18215fc 6418
0b32bf0e 6419 if (d >= string1 && d <= end1)
fa9a63c5 6420 dend = end_match_1;
0b32bf0e 6421 }
fa9a63c5 6422 else
0b32bf0e 6423 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6424 } /* for (;;) */
6425
6426 if (best_regs_set)
6427 goto restore_best_regs;
6428
6429 FREE_VARIABLES ();
6430
b18215fc 6431 return -1; /* Failure to match. */
fa9a63c5
RM
6432} /* re_match_2 */
6433\f
6434/* Subroutine definitions for re_match_2. */
6435
fa9a63c5
RM
6436/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6437 bytes; nonzero otherwise. */
5e69f11e 6438
fa9a63c5 6439static int
2d1675e4
SM
6440bcmp_translate (s1, s2, len, translate, multibyte)
6441 re_char *s1, *s2;
fa9a63c5 6442 register int len;
6676cb1c 6443 RE_TRANSLATE_TYPE translate;
2d1675e4 6444 const int multibyte;
fa9a63c5 6445{
2d1675e4
SM
6446 register re_char *p1 = s1, *p2 = s2;
6447 re_char *p1_end = s1 + len;
6448 re_char *p2_end = s2 + len;
e934739e 6449
4bb91c68
SM
6450 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6451 different lengths, but relying on a single `len' would break this. -sm */
6452 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6453 {
e934739e 6454 int p1_charlen, p2_charlen;
01618498 6455 re_wchar_t p1_ch, p2_ch;
e934739e 6456
6fdd04b0
KH
6457 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6458 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6459
6460 if (RE_TRANSLATE (translate, p1_ch)
6461 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6462 return 1;
e934739e
RS
6463
6464 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6465 }
e934739e
RS
6466
6467 if (p1 != p1_end || p2 != p2_end)
6468 return 1;
6469
fa9a63c5
RM
6470 return 0;
6471}
6472\f
6473/* Entry points for GNU code. */
6474
6475/* re_compile_pattern is the GNU regular expression compiler: it
6476 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6477 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6478
fa9a63c5
RM
6479 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6480 are set in BUFP on entry.
5e69f11e 6481
b18215fc 6482 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6483
6484const char *
6485re_compile_pattern (pattern, length, bufp)
6486 const char *pattern;
0b32bf0e 6487 size_t length;
fa9a63c5
RM
6488 struct re_pattern_buffer *bufp;
6489{
6490 reg_errcode_t ret;
5e69f11e 6491
1208f11a
RS
6492#ifdef emacs
6493 gl_state.current_syntax_table = current_buffer->syntax_table;
6494#endif
6495
fa9a63c5
RM
6496 /* GNU code is written to assume at least RE_NREGS registers will be set
6497 (and at least one extra will be -1). */
6498 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6499
fa9a63c5
RM
6500 /* And GNU code determines whether or not to get register information
6501 by passing null for the REGS argument to re_match, etc., not by
6502 setting no_sub. */
6503 bufp->no_sub = 0;
5e69f11e 6504
4bb91c68 6505 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6506
6507 if (!ret)
6508 return NULL;
6509 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6510}
c0f9ea08 6511WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6512\f
b18215fc
RS
6513/* Entry points compatible with 4.2 BSD regex library. We don't define
6514 them unless specifically requested. */
fa9a63c5 6515
0b32bf0e 6516#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6517
6518/* BSD has one and only one pattern buffer. */
6519static struct re_pattern_buffer re_comp_buf;
6520
6521char *
0b32bf0e 6522# ifdef _LIBC
48afdd44
RM
6523/* Make these definitions weak in libc, so POSIX programs can redefine
6524 these names if they don't use our functions, and still use
6525 regcomp/regexec below without link errors. */
6526weak_function
0b32bf0e 6527# endif
fa9a63c5
RM
6528re_comp (s)
6529 const char *s;
6530{
6531 reg_errcode_t ret;
5e69f11e 6532
fa9a63c5
RM
6533 if (!s)
6534 {
6535 if (!re_comp_buf.buffer)
0b32bf0e 6536 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6537 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6538 return 0;
6539 }
6540
6541 if (!re_comp_buf.buffer)
6542 {
6543 re_comp_buf.buffer = (unsigned char *) malloc (200);
6544 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6545 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6546 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6547 re_comp_buf.allocated = 200;
6548
6549 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6550 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6551 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6552 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6553 }
6554
6555 /* Since `re_exec' always passes NULL for the `regs' argument, we
6556 don't need to initialize the pattern buffer fields which affect it. */
6557
fa9a63c5 6558 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6559
fa9a63c5
RM
6560 if (!ret)
6561 return NULL;
6562
6563 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6564 return (char *) gettext (re_error_msgid[(int) ret]);
6565}
6566
6567
6568int
0b32bf0e 6569# ifdef _LIBC
48afdd44 6570weak_function
0b32bf0e 6571# endif
fa9a63c5
RM
6572re_exec (s)
6573 const char *s;
6574{
6575 const int len = strlen (s);
6576 return
6577 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6578}
6579#endif /* _REGEX_RE_COMP */
6580\f
6581/* POSIX.2 functions. Don't define these for Emacs. */
6582
6583#ifndef emacs
6584
6585/* regcomp takes a regular expression as a string and compiles it.
6586
b18215fc 6587 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6588 since POSIX says we shouldn't. Thus, we set
6589
6590 `buffer' to the compiled pattern;
6591 `used' to the length of the compiled pattern;
6592 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6593 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6594 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6595 `fastmap' to an allocated space for the fastmap;
6596 `fastmap_accurate' to zero;
fa9a63c5
RM
6597 `re_nsub' to the number of subexpressions in PATTERN.
6598
6599 PATTERN is the address of the pattern string.
6600
6601 CFLAGS is a series of bits which affect compilation.
6602
6603 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6604 use POSIX basic syntax.
6605
6606 If REG_NEWLINE is set, then . and [^...] don't match newline.
6607 Also, regexec will try a match beginning after every newline.
6608
6609 If REG_ICASE is set, then we considers upper- and lowercase
6610 versions of letters to be equivalent when matching.
6611
6612 If REG_NOSUB is set, then when PREG is passed to regexec, that
6613 routine will report only success or failure, and nothing about the
6614 registers.
6615
b18215fc 6616 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6617 the return codes and their meanings.) */
6618
6619int
6620regcomp (preg, pattern, cflags)
ada30c0e
SM
6621 regex_t *__restrict preg;
6622 const char *__restrict pattern;
fa9a63c5
RM
6623 int cflags;
6624{
6625 reg_errcode_t ret;
4bb91c68 6626 reg_syntax_t syntax
fa9a63c5
RM
6627 = (cflags & REG_EXTENDED) ?
6628 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6629
6630 /* regex_compile will allocate the space for the compiled pattern. */
6631 preg->buffer = 0;
6632 preg->allocated = 0;
6633 preg->used = 0;
5e69f11e 6634
c0f9ea08
SM
6635 /* Try to allocate space for the fastmap. */
6636 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6637
fa9a63c5
RM
6638 if (cflags & REG_ICASE)
6639 {
6640 unsigned i;
5e69f11e 6641
6676cb1c
RS
6642 preg->translate
6643 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6644 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6645 if (preg->translate == NULL)
0b32bf0e 6646 return (int) REG_ESPACE;
fa9a63c5
RM
6647
6648 /* Map uppercase characters to corresponding lowercase ones. */
6649 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6650 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6651 }
6652 else
6653 preg->translate = NULL;
6654
6655 /* If REG_NEWLINE is set, newlines are treated differently. */
6656 if (cflags & REG_NEWLINE)
6657 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6658 syntax &= ~RE_DOT_NEWLINE;
6659 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6660 }
6661 else
c0f9ea08 6662 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6663
6664 preg->no_sub = !!(cflags & REG_NOSUB);
6665
5e69f11e 6666 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6667 can use strlen here in compiling the pattern. */
4bb91c68 6668 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6669
fa9a63c5
RM
6670 /* POSIX doesn't distinguish between an unmatched open-group and an
6671 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6672 if (ret == REG_ERPAREN)
6673 ret = REG_EPAREN;
6674
6675 if (ret == REG_NOERROR && preg->fastmap)
6676 { /* Compute the fastmap now, since regexec cannot modify the pattern
6677 buffer. */
6678 re_compile_fastmap (preg);
6679 if (preg->can_be_null)
6680 { /* The fastmap can't be used anyway. */
6681 free (preg->fastmap);
6682 preg->fastmap = NULL;
6683 }
6684 }
fa9a63c5
RM
6685 return (int) ret;
6686}
c0f9ea08 6687WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6688
6689
6690/* regexec searches for a given pattern, specified by PREG, in the
6691 string STRING.
5e69f11e 6692
fa9a63c5 6693 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6694 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6695 least NMATCH elements, and we set them to the offsets of the
6696 corresponding matched substrings.
5e69f11e 6697
fa9a63c5
RM
6698 EFLAGS specifies `execution flags' which affect matching: if
6699 REG_NOTBOL is set, then ^ does not match at the beginning of the
6700 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6701
fa9a63c5
RM
6702 We return 0 if we find a match and REG_NOMATCH if not. */
6703
6704int
6705regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6706 const regex_t *__restrict preg;
6707 const char *__restrict string;
5e69f11e 6708 size_t nmatch;
9f2dbe01 6709 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6710 int eflags;
6711{
6712 int ret;
6713 struct re_registers regs;
6714 regex_t private_preg;
6715 int len = strlen (string);
c0f9ea08 6716 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6717
6718 private_preg = *preg;
5e69f11e 6719
fa9a63c5
RM
6720 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6721 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6722
fa9a63c5
RM
6723 /* The user has told us exactly how many registers to return
6724 information about, via `nmatch'. We have to pass that on to the
b18215fc 6725 matching routines. */
fa9a63c5 6726 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6727
fa9a63c5
RM
6728 if (want_reg_info)
6729 {
6730 regs.num_regs = nmatch;
4bb91c68
SM
6731 regs.start = TALLOC (nmatch * 2, regoff_t);
6732 if (regs.start == NULL)
0b32bf0e 6733 return (int) REG_NOMATCH;
4bb91c68 6734 regs.end = regs.start + nmatch;
fa9a63c5
RM
6735 }
6736
c0f9ea08
SM
6737 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6738 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6739 was a little bit longer but still only matching the real part.
6740 This works because the `endline' will check for a '\n' and will find a
6741 '\0', correctly deciding that this is not the end of a line.
6742 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6743 a convenient '\0' there. For all we know, the string could be preceded
6744 by '\n' which would throw things off. */
6745
fa9a63c5
RM
6746 /* Perform the searching operation. */
6747 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6748 /* start: */ 0, /* range: */ len,
6749 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6750
fa9a63c5
RM
6751 /* Copy the register information to the POSIX structure. */
6752 if (want_reg_info)
6753 {
6754 if (ret >= 0)
0b32bf0e
SM
6755 {
6756 unsigned r;
fa9a63c5 6757
0b32bf0e
SM
6758 for (r = 0; r < nmatch; r++)
6759 {
6760 pmatch[r].rm_so = regs.start[r];
6761 pmatch[r].rm_eo = regs.end[r];
6762 }
6763 }
fa9a63c5 6764
b18215fc 6765 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6766 free (regs.start);
fa9a63c5
RM
6767 }
6768
6769 /* We want zero return to mean success, unlike `re_search'. */
6770 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6771}
c0f9ea08 6772WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6773
6774
ec869672
JR
6775/* Returns a message corresponding to an error code, ERR_CODE, returned
6776 from either regcomp or regexec. We don't use PREG here.
6777
6778 ERR_CODE was previously called ERRCODE, but that name causes an
6779 error with msvc8 compiler. */
fa9a63c5
RM
6780
6781size_t
ec869672
JR
6782regerror (err_code, preg, errbuf, errbuf_size)
6783 int err_code;
fa9a63c5
RM
6784 const regex_t *preg;
6785 char *errbuf;
6786 size_t errbuf_size;
6787{
6788 const char *msg;
6789 size_t msg_size;
6790
ec869672
JR
6791 if (err_code < 0
6792 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6793 /* Only error codes returned by the rest of the code should be passed
b18215fc 6794 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6795 code generates an invalid error code, then the program has a bug.
6796 Dump core so we can fix it. */
6797 abort ();
6798
ec869672 6799 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6800
6801 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6802
fa9a63c5
RM
6803 if (errbuf_size != 0)
6804 {
6805 if (msg_size > errbuf_size)
0b32bf0e
SM
6806 {
6807 strncpy (errbuf, msg, errbuf_size - 1);
6808 errbuf[errbuf_size - 1] = 0;
6809 }
fa9a63c5 6810 else
0b32bf0e 6811 strcpy (errbuf, msg);
fa9a63c5
RM
6812 }
6813
6814 return msg_size;
6815}
c0f9ea08 6816WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6817
6818
6819/* Free dynamically allocated space used by PREG. */
6820
6821void
6822regfree (preg)
6823 regex_t *preg;
6824{
6825 if (preg->buffer != NULL)
6826 free (preg->buffer);
6827 preg->buffer = NULL;
5e69f11e 6828
fa9a63c5
RM
6829 preg->allocated = 0;
6830 preg->used = 0;
6831
6832 if (preg->fastmap != NULL)
6833 free (preg->fastmap);
6834 preg->fastmap = NULL;
6835 preg->fastmap_accurate = 0;
6836
6837 if (preg->translate != NULL)
6838 free (preg->translate);
6839 preg->translate = NULL;
6840}
c0f9ea08 6841WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6842
6843#endif /* not emacs */
839966f3
KH
6844
6845/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6846 (do not change this comment) */