(bindat--unpack-group, bindat--length-group)
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
1cee1e27 6 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
bc78d348 7
fa9a63c5
RM
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
4fc5845f 20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 21 USA. */
fa9a63c5 22
6df42991 23/* TODO:
505bde11 24 - structure the opcode space into opcode+flag.
dc1e502d 25 - merge with glibc's regex.[ch].
01618498 26 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
27 need to modify the compiled regexp so that re_match can be reentrant.
28 - get rid of on_failure_jump_smart by doing the optimization in re_comp
29 rather than at run-time, so that re_match can be reentrant.
01618498 30*/
505bde11 31
fa9a63c5 32/* AIX requires this to be the first thing in the file. */
0b32bf0e 33#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
34 #pragma alloca
35#endif
36
fa9a63c5 37#ifdef HAVE_CONFIG_H
0b32bf0e 38# include <config.h>
fa9a63c5
RM
39#endif
40
4bb91c68
SM
41#if defined STDC_HEADERS && !defined emacs
42# include <stddef.h>
43#else
44/* We need this for `regex.h', and perhaps for the Emacs include files. */
45# include <sys/types.h>
46#endif
fa9a63c5 47
14473664
SM
48/* Whether to use ISO C Amendment 1 wide char functions.
49 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
50#if defined _LIBC
51#define WIDE_CHAR_SUPPORT 1
52#else
14473664 53#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
54 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
55#endif
14473664
SM
56
57/* For platform which support the ISO C amendement 1 functionality we
58 support user defined character classes. */
a0ad02f7 59#if WIDE_CHAR_SUPPORT
14473664
SM
60/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
61# include <wchar.h>
62# include <wctype.h>
63#endif
64
c0f9ea08
SM
65#ifdef _LIBC
66/* We have to keep the namespace clean. */
67# define regfree(preg) __regfree (preg)
68# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
69# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
70# define regerror(errcode, preg, errbuf, errbuf_size) \
71 __regerror(errcode, preg, errbuf, errbuf_size)
72# define re_set_registers(bu, re, nu, st, en) \
73 __re_set_registers (bu, re, nu, st, en)
74# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
75 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
76# define re_match(bufp, string, size, pos, regs) \
77 __re_match (bufp, string, size, pos, regs)
78# define re_search(bufp, string, size, startpos, range, regs) \
79 __re_search (bufp, string, size, startpos, range, regs)
80# define re_compile_pattern(pattern, length, bufp) \
81 __re_compile_pattern (pattern, length, bufp)
82# define re_set_syntax(syntax) __re_set_syntax (syntax)
83# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
84 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
85# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
86
14473664
SM
87/* Make sure we call libc's function even if the user overrides them. */
88# define btowc __btowc
89# define iswctype __iswctype
90# define wctype __wctype
91
c0f9ea08
SM
92# define WEAK_ALIAS(a,b) weak_alias (a, b)
93
94/* We are also using some library internals. */
95# include <locale/localeinfo.h>
96# include <locale/elem-hash.h>
97# include <langinfo.h>
98#else
99# define WEAK_ALIAS(a,b)
100#endif
101
4bb91c68 102/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 103#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
104# include <libintl.h>
105#else
106# define gettext(msgid) (msgid)
107#endif
108
5e69f11e
RM
109#ifndef gettext_noop
110/* This define is so xgettext can find the internationalizable
111 strings. */
0b32bf0e 112# define gettext_noop(String) String
5e69f11e
RM
113#endif
114
fa9a63c5
RM
115/* The `emacs' switch turns on certain matching commands
116 that make sense only in Emacs. */
117#ifdef emacs
118
0b32bf0e
SM
119# include "lisp.h"
120# include "buffer.h"
b18215fc
RS
121
122/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 123# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 124
0b32bf0e
SM
125# include "syntax.h"
126# include "charset.h"
127# include "category.h"
fa9a63c5 128
7689ef0b
EZ
129# ifdef malloc
130# undef malloc
131# endif
0b32bf0e 132# define malloc xmalloc
7689ef0b
EZ
133# ifdef realloc
134# undef realloc
135# endif
0b32bf0e 136# define realloc xrealloc
7689ef0b
EZ
137# ifdef free
138# undef free
139# endif
0b32bf0e 140# define free xfree
9abbd165 141
7814e705 142/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
143# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
144# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
145
146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
147# define RE_STRING_CHAR(p, s) \
4e8a9132 148 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
0b32bf0e 149# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
2d1675e4
SM
150 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
151
152/* Set C a (possibly multibyte) character before P. P points into a
153 string which is the virtual concatenation of STR1 (which ends at
154 END1) or STR2 (which ends at END2). */
0b32bf0e 155# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
2d1675e4
SM
156 do { \
157 if (multibyte) \
158 { \
0b32bf0e
SM
159 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
160 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
70806df6
KH
161 re_char *d0 = dtemp; \
162 PREV_CHAR_BOUNDARY (d0, dlimit); \
163 c = STRING_CHAR (d0, dtemp - d0); \
2d1675e4
SM
164 } \
165 else \
166 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
167 } while (0)
168
4e8a9132 169
fa9a63c5
RM
170#else /* not emacs */
171
172/* If we are not linking with Emacs proper,
173 we can't use the relocating allocator
174 even if config.h says that we can. */
0b32bf0e 175# undef REL_ALLOC
fa9a63c5 176
0b32bf0e
SM
177# if defined STDC_HEADERS || defined _LIBC
178# include <stdlib.h>
179# else
fa9a63c5
RM
180char *malloc ();
181char *realloc ();
0b32bf0e 182# endif
fa9a63c5 183
a77f947b
CY
184/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
185
186void *
187xmalloc (size)
188 size_t size;
189{
190 register void *val;
191 val = (void *) malloc (size);
192 if (!val && size)
193 {
194 write (2, "virtual memory exhausted\n", 25);
195 exit (1);
196 }
197 return val;
198}
199
200void *
201xrealloc (block, size)
202 void *block;
203 size_t size;
204{
205 register void *val;
206 /* We must call malloc explicitly when BLOCK is 0, since some
207 reallocs don't do this. */
208 if (! block)
209 val = (void *) malloc (size);
210 else
211 val = (void *) realloc (block, size);
212 if (!val && size)
213 {
214 write (2, "virtual memory exhausted\n", 25);
215 exit (1);
216 }
217 return val;
218}
219
a073faa6
CY
220# ifdef malloc
221# undef malloc
222# endif
223# define malloc xmalloc
224# ifdef realloc
225# undef realloc
226# endif
227# define realloc xrealloc
228
9e4ecb26 229/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 230 If nothing else has been done, use the method below. */
0b32bf0e
SM
231# ifdef INHIBIT_STRING_HEADER
232# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
233# if !defined bzero && !defined bcopy
234# undef INHIBIT_STRING_HEADER
235# endif
236# endif
237# endif
9e4ecb26 238
4bb91c68 239/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
240 This is used in most programs--a few other programs avoid this
241 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
242# ifndef INHIBIT_STRING_HEADER
243# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
244# include <string.h>
0b32bf0e 245# ifndef bzero
4bb91c68
SM
246# ifndef _LIBC
247# define bzero(s, n) (memset (s, '\0', n), (s))
248# else
249# define bzero(s, n) __bzero (s, n)
250# endif
0b32bf0e
SM
251# endif
252# else
253# include <strings.h>
4bb91c68
SM
254# ifndef memcmp
255# define memcmp(s1, s2, n) bcmp (s1, s2, n)
256# endif
257# ifndef memcpy
258# define memcpy(d, s, n) (bcopy (s, d, n), (d))
259# endif
0b32bf0e
SM
260# endif
261# endif
fa9a63c5
RM
262
263/* Define the syntax stuff for \<, \>, etc. */
264
990b2375 265/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 266enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 267
0b32bf0e
SM
268# ifdef SWITCH_ENUM_BUG
269# define SWITCH_ENUM_CAST(x) ((int)(x))
270# else
271# define SWITCH_ENUM_CAST(x) (x)
272# endif
fa9a63c5 273
e934739e 274/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
275# define BASE_LEADING_CODE_P(c) (0)
276# define CHAR_CHARSET(c) 0
277# define CHARSET_LEADING_CODE_BASE(c) 0
278# define MAX_MULTIBYTE_LENGTH 1
279# define RE_MULTIBYTE_P(x) 0
280# define WORD_BOUNDARY_P(c1, c2) (0)
281# define CHAR_HEAD_P(p) (1)
282# define SINGLE_BYTE_CHAR_P(c) (1)
283# define SAME_CHARSET_P(c1, c2) (1)
284# define MULTIBYTE_FORM_LENGTH(p, s) (1)
70806df6 285# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
0b32bf0e
SM
286# define STRING_CHAR(p, s) (*(p))
287# define RE_STRING_CHAR STRING_CHAR
288# define CHAR_STRING(c, s) (*(s) = (c), 1)
289# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
290# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
291# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 292 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
0b32bf0e 293# define MAKE_CHAR(charset, c1, c2) (c1)
fa9a63c5 294#endif /* not emacs */
4e8a9132
SM
295
296#ifndef RE_TRANSLATE
0b32bf0e
SM
297# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
298# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 299#endif
fa9a63c5
RM
300\f
301/* Get the interface, including the syntax bits. */
302#include "regex.h"
303
f71b19b6
DL
304/* isalpha etc. are used for the character classes. */
305#include <ctype.h>
fa9a63c5 306
f71b19b6 307#ifdef emacs
fa9a63c5 308
f71b19b6 309/* 1 if C is an ASCII character. */
0b32bf0e 310# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 311
f71b19b6 312/* 1 if C is a unibyte character. */
0b32bf0e 313# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 314
f71b19b6 315/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 316
f71b19b6 317/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
318# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
319# define ISCNTRL(c) ((c) < ' ')
320# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
321 || ((c) >= 'a' && (c) <= 'f') \
322 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
323
324/* This is only used for single-byte characters. */
0b32bf0e 325# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
326
327/* The rest must handle multibyte characters. */
328
0b32bf0e 329# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 330 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
331 : 1)
332
14473664 333# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 334 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
335 : 1)
336
0b32bf0e 337# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
338 ? (((c) >= 'a' && (c) <= 'z') \
339 || ((c) >= 'A' && (c) <= 'Z') \
340 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
341 : SYNTAX (c) == Sword)
342
0b32bf0e 343# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
344 ? (((c) >= 'a' && (c) <= 'z') \
345 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
346 : SYNTAX (c) == Sword)
347
0b32bf0e 348# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 349
0b32bf0e 350# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
351 ? ((c) > ' ' && (c) < 0177 \
352 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
353 || ((c) >= 'A' && (c) <= 'Z') \
354 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
355 : SYNTAX (c) != Sword)
356
0b32bf0e 357# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 358
0b32bf0e 359# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 360
0b32bf0e 361# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
362
363#else /* not emacs */
364
f71b19b6
DL
365/* Jim Meyering writes:
366
367 "... Some ctype macros are valid only for character codes that
368 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
369 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 370 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
371 STDC_HEADERS is defined, then autoconf has verified that the ctype
372 macros don't need to be guarded with references to isascii. ...
373 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
374 eliminate the && through constant folding."
375 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 376
4bb91c68 377# undef ISASCII
0b32bf0e
SM
378# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
379# define ISASCII(c) 1
380# else
381# define ISASCII(c) isascii(c)
382# endif
f71b19b6
DL
383
384/* 1 if C is an ASCII character. */
0b32bf0e 385# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
386
387/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
388# define ISUNIBYTE(c) 1
389
390# ifdef isblank
391# define ISBLANK(c) (ISASCII (c) && isblank (c))
392# else
393# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
394# endif
395# ifdef isgraph
396# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
397# else
398# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
399# endif
400
4bb91c68 401# undef ISPRINT
0b32bf0e
SM
402# define ISPRINT(c) (ISASCII (c) && isprint (c))
403# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
404# define ISALNUM(c) (ISASCII (c) && isalnum (c))
405# define ISALPHA(c) (ISASCII (c) && isalpha (c))
406# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
407# define ISLOWER(c) (ISASCII (c) && islower (c))
408# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
409# define ISSPACE(c) (ISASCII (c) && isspace (c))
410# define ISUPPER(c) (ISASCII (c) && isupper (c))
411# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
412
413# define ISWORD(c) ISALPHA(c)
414
4bb91c68
SM
415# ifdef _tolower
416# define TOLOWER(c) _tolower(c)
417# else
418# define TOLOWER(c) tolower(c)
419# endif
420
421/* How many characters in the character set. */
422# define CHAR_SET_SIZE 256
423
0b32bf0e 424# ifdef SYNTAX_TABLE
f71b19b6 425
0b32bf0e 426extern char *re_syntax_table;
f71b19b6 427
0b32bf0e
SM
428# else /* not SYNTAX_TABLE */
429
0b32bf0e
SM
430static char re_syntax_table[CHAR_SET_SIZE];
431
432static void
433init_syntax_once ()
434{
435 register int c;
436 static int done = 0;
437
438 if (done)
439 return;
440
441 bzero (re_syntax_table, sizeof re_syntax_table);
442
4bb91c68
SM
443 for (c = 0; c < CHAR_SET_SIZE; ++c)
444 if (ISALNUM (c))
445 re_syntax_table[c] = Sword;
fa9a63c5 446
669fa600 447 re_syntax_table['_'] = Ssymbol;
fa9a63c5 448
0b32bf0e
SM
449 done = 1;
450}
451
452# endif /* not SYNTAX_TABLE */
96cc36cc 453
4bb91c68
SM
454# define SYNTAX(c) re_syntax_table[(c)]
455
96cc36cc
RS
456#endif /* not emacs */
457\f
fa9a63c5 458#ifndef NULL
0b32bf0e 459# define NULL (void *)0
fa9a63c5
RM
460#endif
461
462/* We remove any previous definition of `SIGN_EXTEND_CHAR',
463 since ours (we hope) works properly with all combinations of
464 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 465 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
466#undef SIGN_EXTEND_CHAR
467#if __STDC__
0b32bf0e 468# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
469#else /* not __STDC__ */
470/* As in Harbison and Steele. */
0b32bf0e 471# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
472#endif
473\f
474/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
475 use `alloca' instead of `malloc'. This is because using malloc in
476 re_search* or re_match* could cause memory leaks when C-g is used in
477 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
478 the other hand, malloc is more portable, and easier to debug.
479
fa9a63c5
RM
480 Because we sometimes use alloca, some routines have to be macros,
481 not functions -- `alloca'-allocated space disappears at the end of the
482 function it is called in. */
483
484#ifdef REGEX_MALLOC
485
0b32bf0e
SM
486# define REGEX_ALLOCATE malloc
487# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
488# define REGEX_FREE free
fa9a63c5
RM
489
490#else /* not REGEX_MALLOC */
491
492/* Emacs already defines alloca, sometimes. */
0b32bf0e 493# ifndef alloca
fa9a63c5
RM
494
495/* Make alloca work the best possible way. */
0b32bf0e
SM
496# ifdef __GNUC__
497# define alloca __builtin_alloca
498# else /* not __GNUC__ */
499# if HAVE_ALLOCA_H
500# include <alloca.h>
501# endif /* HAVE_ALLOCA_H */
502# endif /* not __GNUC__ */
fa9a63c5 503
0b32bf0e 504# endif /* not alloca */
fa9a63c5 505
0b32bf0e 506# define REGEX_ALLOCATE alloca
fa9a63c5
RM
507
508/* Assumes a `char *destination' variable. */
0b32bf0e 509# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 510 (destination = (char *) alloca (nsize), \
4bb91c68 511 memcpy (destination, source, osize))
fa9a63c5
RM
512
513/* No need to do anything to free, after alloca. */
0b32bf0e 514# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
515
516#endif /* not REGEX_MALLOC */
517
518/* Define how to allocate the failure stack. */
519
0b32bf0e 520#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 521
0b32bf0e 522# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 523 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 524# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 525 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 526# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
527 r_alloc_free (&failure_stack_ptr)
528
4297555e 529#else /* not using relocating allocator */
fa9a63c5 530
0b32bf0e 531# ifdef REGEX_MALLOC
fa9a63c5 532
0b32bf0e
SM
533# define REGEX_ALLOCATE_STACK malloc
534# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
535# define REGEX_FREE_STACK free
fa9a63c5 536
0b32bf0e 537# else /* not REGEX_MALLOC */
fa9a63c5 538
0b32bf0e 539# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 540
0b32bf0e 541# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 542 REGEX_REALLOCATE (source, osize, nsize)
7814e705 543/* No need to explicitly free anything. */
0b32bf0e 544# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 545
0b32bf0e 546# endif /* not REGEX_MALLOC */
4297555e 547#endif /* not using relocating allocator */
fa9a63c5
RM
548
549
550/* True if `size1' is non-NULL and PTR is pointing anywhere inside
551 `string1' or just past its end. This works if PTR is NULL, which is
552 a good thing. */
25fe55af 553#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
554 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
555
556/* (Re)Allocate N items of type T using malloc, or fail. */
557#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
558#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
559#define RETALLOC_IF(addr, n, t) \
560 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
561#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
562
4bb91c68 563#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
564
565#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
566
567#undef MAX
568#undef MIN
569#define MAX(a, b) ((a) > (b) ? (a) : (b))
570#define MIN(a, b) ((a) < (b) ? (a) : (b))
571
6470ea05
SM
572/* Type of source-pattern and string chars. */
573typedef const unsigned char re_char;
574
fa9a63c5
RM
575typedef char boolean;
576#define false 0
577#define true 1
578
4bb91c68
SM
579static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
580 re_char *string1, int size1,
581 re_char *string2, int size2,
582 int pos,
583 struct re_registers *regs,
584 int stop));
fa9a63c5
RM
585\f
586/* These are the command codes that appear in compiled regular
4bb91c68 587 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
588 command code can specify any interpretation whatsoever for its
589 arguments. Zero bytes may appear in the compiled regular expression. */
590
591typedef enum
592{
593 no_op = 0,
594
4bb91c68 595 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
596 succeed,
597
25fe55af 598 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
599 exactn,
600
25fe55af 601 /* Matches any (more or less) character. */
fa9a63c5
RM
602 anychar,
603
25fe55af
RS
604 /* Matches any one char belonging to specified set. First
605 following byte is number of bitmap bytes. Then come bytes
606 for a bitmap saying which chars are in. Bits in each byte
607 are ordered low-bit-first. A character is in the set if its
608 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
609 automatically not in the set.
610
611 If the length byte has the 0x80 bit set, then that stuff
612 is followed by a range table:
613 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 614 See RANGE_TABLE_WORK_BITS below.
01618498 615 2 bytes, the number of pairs that follow (upto 32767)
96cc36cc 616 pairs, each 2 multibyte characters,
0b32bf0e 617 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
618 charset,
619
25fe55af 620 /* Same parameters as charset, but match any character that is
4bb91c68 621 not one of those specified. */
fa9a63c5
RM
622 charset_not,
623
25fe55af
RS
624 /* Start remembering the text that is matched, for storing in a
625 register. Followed by one byte with the register number, in
626 the range 0 to one less than the pattern buffer's re_nsub
505bde11 627 field. */
fa9a63c5
RM
628 start_memory,
629
25fe55af
RS
630 /* Stop remembering the text that is matched and store it in a
631 memory register. Followed by one byte with the register
632 number, in the range 0 to one less than `re_nsub' in the
505bde11 633 pattern buffer. */
fa9a63c5
RM
634 stop_memory,
635
25fe55af 636 /* Match a duplicate of something remembered. Followed by one
4bb91c68 637 byte containing the register number. */
fa9a63c5
RM
638 duplicate,
639
25fe55af 640 /* Fail unless at beginning of line. */
fa9a63c5
RM
641 begline,
642
4bb91c68 643 /* Fail unless at end of line. */
fa9a63c5
RM
644 endline,
645
25fe55af
RS
646 /* Succeeds if at beginning of buffer (if emacs) or at beginning
647 of string to be matched (if not). */
fa9a63c5
RM
648 begbuf,
649
25fe55af 650 /* Analogously, for end of buffer/string. */
fa9a63c5 651 endbuf,
5e69f11e 652
25fe55af 653 /* Followed by two byte relative address to which to jump. */
5e69f11e 654 jump,
fa9a63c5 655
25fe55af 656 /* Followed by two-byte relative address of place to resume at
7814e705 657 in case of failure. */
fa9a63c5 658 on_failure_jump,
5e69f11e 659
25fe55af
RS
660 /* Like on_failure_jump, but pushes a placeholder instead of the
661 current string position when executed. */
fa9a63c5 662 on_failure_keep_string_jump,
5e69f11e 663
505bde11
SM
664 /* Just like `on_failure_jump', except that it checks that we
665 don't get stuck in an infinite loop (matching an empty string
666 indefinitely). */
667 on_failure_jump_loop,
668
0683b6fa
SM
669 /* Just like `on_failure_jump_loop', except that it checks for
670 a different kind of loop (the kind that shows up with non-greedy
671 operators). This operation has to be immediately preceded
672 by a `no_op'. */
673 on_failure_jump_nastyloop,
674
0b32bf0e 675 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
676 It analyses the loop before which it is put and if the
677 loop does not require backtracking, it changes itself to
4e8a9132
SM
678 `on_failure_keep_string_jump' and short-circuits the loop,
679 else it just defaults to changing itself into `on_failure_jump'.
680 It assumes that it is pointing to just past a `jump'. */
505bde11 681 on_failure_jump_smart,
fa9a63c5 682
25fe55af 683 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
684 After matching N times, jump to the address upon failure.
685 Does not work if N starts at 0: use on_failure_jump_loop
686 instead. */
fa9a63c5
RM
687 succeed_n,
688
25fe55af
RS
689 /* Followed by two-byte relative address, and two-byte number n.
690 Jump to the address N times, then fail. */
fa9a63c5
RM
691 jump_n,
692
25fe55af 693 /* Set the following two-byte relative address to the
7814e705 694 subsequent two-byte number. The address *includes* the two
25fe55af 695 bytes of number. */
fa9a63c5
RM
696 set_number_at,
697
fa9a63c5
RM
698 wordbeg, /* Succeeds if at word beginning. */
699 wordend, /* Succeeds if at word end. */
700
701 wordbound, /* Succeeds if at a word boundary. */
7814e705 702 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 703
669fa600
SM
704 symbeg, /* Succeeds if at symbol beginning. */
705 symend, /* Succeeds if at symbol end. */
706
fa9a63c5 707 /* Matches any character whose syntax is specified. Followed by
25fe55af 708 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
709 syntaxspec,
710
711 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
712 notsyntaxspec
713
714#ifdef emacs
715 ,before_dot, /* Succeeds if before point. */
716 at_dot, /* Succeeds if at point. */
717 after_dot, /* Succeeds if after point. */
b18215fc
RS
718
719 /* Matches any character whose category-set contains the specified
7814e705
JB
720 category. The operator is followed by a byte which contains a
721 category code (mnemonic ASCII character). */
b18215fc
RS
722 categoryspec,
723
724 /* Matches any character whose category-set does not contain the
725 specified category. The operator is followed by a byte which
726 contains the category code (mnemonic ASCII character). */
727 notcategoryspec
fa9a63c5
RM
728#endif /* emacs */
729} re_opcode_t;
730\f
731/* Common operations on the compiled pattern. */
732
733/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
734
735#define STORE_NUMBER(destination, number) \
736 do { \
737 (destination)[0] = (number) & 0377; \
738 (destination)[1] = (number) >> 8; \
739 } while (0)
740
741/* Same as STORE_NUMBER, except increment DESTINATION to
742 the byte after where the number is stored. Therefore, DESTINATION
743 must be an lvalue. */
744
745#define STORE_NUMBER_AND_INCR(destination, number) \
746 do { \
747 STORE_NUMBER (destination, number); \
748 (destination) += 2; \
749 } while (0)
750
751/* Put into DESTINATION a number stored in two contiguous bytes starting
752 at SOURCE. */
753
754#define EXTRACT_NUMBER(destination, source) \
755 do { \
756 (destination) = *(source) & 0377; \
757 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
758 } while (0)
759
760#ifdef DEBUG
4bb91c68 761static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
762static void
763extract_number (dest, source)
764 int *dest;
01618498 765 re_char *source;
fa9a63c5 766{
5e69f11e 767 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
768 *dest = *source & 0377;
769 *dest += temp << 8;
770}
771
4bb91c68 772# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
773# undef EXTRACT_NUMBER
774# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
775# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
776
777#endif /* DEBUG */
778
779/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
780 SOURCE must be an lvalue. */
781
782#define EXTRACT_NUMBER_AND_INCR(destination, source) \
783 do { \
784 EXTRACT_NUMBER (destination, source); \
25fe55af 785 (source) += 2; \
fa9a63c5
RM
786 } while (0)
787
788#ifdef DEBUG
4bb91c68
SM
789static void extract_number_and_incr _RE_ARGS ((int *destination,
790 re_char **source));
fa9a63c5
RM
791static void
792extract_number_and_incr (destination, source)
793 int *destination;
01618498 794 re_char **source;
5e69f11e 795{
fa9a63c5
RM
796 extract_number (destination, *source);
797 *source += 2;
798}
799
0b32bf0e
SM
800# ifndef EXTRACT_MACROS
801# undef EXTRACT_NUMBER_AND_INCR
802# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 803 extract_number_and_incr (&dest, &src)
0b32bf0e 804# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
805
806#endif /* DEBUG */
807\f
b18215fc
RS
808/* Store a multibyte character in three contiguous bytes starting
809 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 810 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
811
812#define STORE_CHARACTER_AND_INCR(destination, character) \
813 do { \
814 (destination)[0] = (character) & 0377; \
815 (destination)[1] = ((character) >> 8) & 0377; \
816 (destination)[2] = (character) >> 16; \
817 (destination) += 3; \
818 } while (0)
819
820/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 821 starting at SOURCE. */
b18215fc
RS
822
823#define EXTRACT_CHARACTER(destination, source) \
824 do { \
825 (destination) = ((source)[0] \
826 | ((source)[1] << 8) \
827 | ((source)[2] << 16)); \
828 } while (0)
829
830
831/* Macros for charset. */
832
833/* Size of bitmap of charset P in bytes. P is a start of charset,
834 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
835#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
836
837/* Nonzero if charset P has range table. */
25fe55af 838#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
839
840/* Return the address of range table of charset P. But not the start
841 of table itself, but the before where the number of ranges is
96cc36cc
RS
842 stored. `2 +' means to skip re_opcode_t and size of bitmap,
843 and the 2 bytes of flags at the start of the range table. */
844#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
845
846/* Extract the bit flags that start a range table. */
847#define CHARSET_RANGE_TABLE_BITS(p) \
848 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
849 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
850
851/* Test if C is listed in the bitmap of charset P. */
852#define CHARSET_LOOKUP_BITMAP(p, c) \
853 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
854 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
855
856/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
857 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
858 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
859 and end. */
860#define CHARSET_RANGE_TABLE_END(range_table, count) \
861 ((range_table) + (count) * 2 * 3)
862
7814e705 863/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
864 COUNT is number of ranges in RANGE_TABLE. */
865#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
866 do \
867 { \
01618498
SM
868 re_wchar_t range_start, range_end; \
869 re_char *p; \
870 re_char *range_table_end \
b18215fc
RS
871 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
872 \
873 for (p = (range_table); p < range_table_end; p += 2 * 3) \
874 { \
875 EXTRACT_CHARACTER (range_start, p); \
876 EXTRACT_CHARACTER (range_end, p + 3); \
877 \
878 if (range_start <= (c) && (c) <= range_end) \
879 { \
880 (not) = !(not); \
881 break; \
882 } \
883 } \
884 } \
885 while (0)
886
887/* Test if C is in range table of CHARSET. The flag NOT is negated if
888 C is listed in it. */
889#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
890 do \
891 { \
892 /* Number of ranges in range table. */ \
893 int count; \
01618498
SM
894 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
895 \
b18215fc
RS
896 EXTRACT_NUMBER_AND_INCR (count, range_table); \
897 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
898 } \
899 while (0)
900\f
fa9a63c5
RM
901/* If DEBUG is defined, Regex prints many voluminous messages about what
902 it is doing (if the variable `debug' is nonzero). If linked with the
903 main program in `iregex.c', you can enter patterns and strings
904 interactively. And if linked with the main program in `main.c' and
4bb91c68 905 the other test files, you can run the already-written tests. */
fa9a63c5
RM
906
907#ifdef DEBUG
908
909/* We use standard I/O for debugging. */
0b32bf0e 910# include <stdio.h>
fa9a63c5
RM
911
912/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 913# include <assert.h>
fa9a63c5 914
99633e97 915static int debug = -100000;
fa9a63c5 916
0b32bf0e
SM
917# define DEBUG_STATEMENT(e) e
918# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
919# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
920# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
921# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
922# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 923 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 924# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 925 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
926
927
928/* Print the fastmap in human-readable form. */
929
930void
931print_fastmap (fastmap)
932 char *fastmap;
933{
934 unsigned was_a_range = 0;
5e69f11e
RM
935 unsigned i = 0;
936
fa9a63c5
RM
937 while (i < (1 << BYTEWIDTH))
938 {
939 if (fastmap[i++])
940 {
941 was_a_range = 0;
25fe55af
RS
942 putchar (i - 1);
943 while (i < (1 << BYTEWIDTH) && fastmap[i])
944 {
945 was_a_range = 1;
946 i++;
947 }
fa9a63c5 948 if (was_a_range)
25fe55af
RS
949 {
950 printf ("-");
951 putchar (i - 1);
952 }
953 }
fa9a63c5 954 }
5e69f11e 955 putchar ('\n');
fa9a63c5
RM
956}
957
958
959/* Print a compiled pattern string in human-readable form, starting at
960 the START pointer into it and ending just before the pointer END. */
961
962void
963print_partial_compiled_pattern (start, end)
01618498
SM
964 re_char *start;
965 re_char *end;
fa9a63c5
RM
966{
967 int mcnt, mcnt2;
01618498
SM
968 re_char *p = start;
969 re_char *pend = end;
fa9a63c5
RM
970
971 if (start == NULL)
972 {
a1a052df 973 fprintf (stderr, "(null)\n");
fa9a63c5
RM
974 return;
975 }
5e69f11e 976
fa9a63c5
RM
977 /* Loop over pattern commands. */
978 while (p < pend)
979 {
a1a052df 980 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
981
982 switch ((re_opcode_t) *p++)
983 {
25fe55af 984 case no_op:
a1a052df 985 fprintf (stderr, "/no_op");
25fe55af 986 break;
fa9a63c5 987
99633e97 988 case succeed:
a1a052df 989 fprintf (stderr, "/succeed");
99633e97
SM
990 break;
991
fa9a63c5
RM
992 case exactn:
993 mcnt = *p++;
a1a052df 994 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 995 do
fa9a63c5 996 {
a1a052df 997 fprintf (stderr, "/%c", *p++);
25fe55af
RS
998 }
999 while (--mcnt);
1000 break;
fa9a63c5
RM
1001
1002 case start_memory:
a1a052df 1003 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1004 break;
fa9a63c5
RM
1005
1006 case stop_memory:
a1a052df 1007 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1008 break;
fa9a63c5
RM
1009
1010 case duplicate:
a1a052df 1011 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1012 break;
1013
1014 case anychar:
a1a052df 1015 fprintf (stderr, "/anychar");
fa9a63c5
RM
1016 break;
1017
1018 case charset:
25fe55af
RS
1019 case charset_not:
1020 {
1021 register int c, last = -100;
fa9a63c5 1022 register int in_range = 0;
99633e97
SM
1023 int length = CHARSET_BITMAP_SIZE (p - 1);
1024 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1025
a1a052df 1026 fprintf (stderr, "/charset [%s",
275a3409 1027 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1028
275a3409
RS
1029 if (p + *p >= pend)
1030 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1031
25fe55af 1032 for (c = 0; c < 256; c++)
96cc36cc 1033 if (c / 8 < length
fa9a63c5
RM
1034 && (p[1 + (c/8)] & (1 << (c % 8))))
1035 {
1036 /* Are we starting a range? */
1037 if (last + 1 == c && ! in_range)
1038 {
a1a052df 1039 fprintf (stderr, "-");
fa9a63c5
RM
1040 in_range = 1;
1041 }
1042 /* Have we broken a range? */
1043 else if (last + 1 != c && in_range)
96cc36cc 1044 {
a1a052df 1045 fprintf (stderr, "%c", last);
fa9a63c5
RM
1046 in_range = 0;
1047 }
5e69f11e 1048
fa9a63c5 1049 if (! in_range)
a1a052df 1050 fprintf (stderr, "%c", c);
fa9a63c5
RM
1051
1052 last = c;
25fe55af 1053 }
fa9a63c5
RM
1054
1055 if (in_range)
a1a052df 1056 fprintf (stderr, "%c", last);
fa9a63c5 1057
a1a052df 1058 fprintf (stderr, "]");
fa9a63c5 1059
99633e97 1060 p += 1 + length;
96cc36cc 1061
96cc36cc 1062 if (has_range_table)
99633e97
SM
1063 {
1064 int count;
a1a052df 1065 fprintf (stderr, "has-range-table");
99633e97
SM
1066
1067 /* ??? Should print the range table; for now, just skip it. */
1068 p += 2; /* skip range table bits */
1069 EXTRACT_NUMBER_AND_INCR (count, p);
1070 p = CHARSET_RANGE_TABLE_END (p, count);
1071 }
fa9a63c5
RM
1072 }
1073 break;
1074
1075 case begline:
a1a052df 1076 fprintf (stderr, "/begline");
25fe55af 1077 break;
fa9a63c5
RM
1078
1079 case endline:
a1a052df 1080 fprintf (stderr, "/endline");
25fe55af 1081 break;
fa9a63c5
RM
1082
1083 case on_failure_jump:
25fe55af 1084 extract_number_and_incr (&mcnt, &p);
a1a052df 1085 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1086 break;
fa9a63c5
RM
1087
1088 case on_failure_keep_string_jump:
25fe55af 1089 extract_number_and_incr (&mcnt, &p);
a1a052df 1090 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1091 break;
fa9a63c5 1092
0683b6fa
SM
1093 case on_failure_jump_nastyloop:
1094 extract_number_and_incr (&mcnt, &p);
a1a052df 1095 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1096 break;
1097
505bde11 1098 case on_failure_jump_loop:
fa9a63c5 1099 extract_number_and_incr (&mcnt, &p);
a1a052df 1100 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1101 break;
1102
505bde11 1103 case on_failure_jump_smart:
fa9a63c5 1104 extract_number_and_incr (&mcnt, &p);
a1a052df 1105 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1106 break;
1107
25fe55af 1108 case jump:
fa9a63c5 1109 extract_number_and_incr (&mcnt, &p);
a1a052df 1110 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1111 break;
1112
25fe55af
RS
1113 case succeed_n:
1114 extract_number_and_incr (&mcnt, &p);
1115 extract_number_and_incr (&mcnt2, &p);
a1a052df 1116 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1117 break;
5e69f11e 1118
25fe55af
RS
1119 case jump_n:
1120 extract_number_and_incr (&mcnt, &p);
1121 extract_number_and_incr (&mcnt2, &p);
a1a052df 1122 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1123 break;
5e69f11e 1124
25fe55af
RS
1125 case set_number_at:
1126 extract_number_and_incr (&mcnt, &p);
1127 extract_number_and_incr (&mcnt2, &p);
a1a052df 1128 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1129 break;
5e69f11e 1130
25fe55af 1131 case wordbound:
a1a052df 1132 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1133 break;
1134
1135 case notwordbound:
a1a052df 1136 fprintf (stderr, "/notwordbound");
25fe55af 1137 break;
fa9a63c5
RM
1138
1139 case wordbeg:
a1a052df 1140 fprintf (stderr, "/wordbeg");
fa9a63c5 1141 break;
5e69f11e 1142
fa9a63c5 1143 case wordend:
a1a052df 1144 fprintf (stderr, "/wordend");
e2543b02 1145 break;
5e69f11e 1146
669fa600 1147 case symbeg:
e2543b02 1148 fprintf (stderr, "/symbeg");
669fa600
SM
1149 break;
1150
1151 case symend:
e2543b02 1152 fprintf (stderr, "/symend");
669fa600
SM
1153 break;
1154
1fb352e0 1155 case syntaxspec:
a1a052df 1156 fprintf (stderr, "/syntaxspec");
1fb352e0 1157 mcnt = *p++;
a1a052df 1158 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1159 break;
1160
1161 case notsyntaxspec:
a1a052df 1162 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1163 mcnt = *p++;
a1a052df 1164 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1165 break;
1166
0b32bf0e 1167# ifdef emacs
fa9a63c5 1168 case before_dot:
a1a052df 1169 fprintf (stderr, "/before_dot");
25fe55af 1170 break;
fa9a63c5
RM
1171
1172 case at_dot:
a1a052df 1173 fprintf (stderr, "/at_dot");
25fe55af 1174 break;
fa9a63c5
RM
1175
1176 case after_dot:
a1a052df 1177 fprintf (stderr, "/after_dot");
25fe55af 1178 break;
fa9a63c5 1179
1fb352e0 1180 case categoryspec:
a1a052df 1181 fprintf (stderr, "/categoryspec");
fa9a63c5 1182 mcnt = *p++;
a1a052df 1183 fprintf (stderr, "/%d", mcnt);
25fe55af 1184 break;
5e69f11e 1185
1fb352e0 1186 case notcategoryspec:
a1a052df 1187 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1188 mcnt = *p++;
a1a052df 1189 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1190 break;
0b32bf0e 1191# endif /* emacs */
fa9a63c5 1192
fa9a63c5 1193 case begbuf:
a1a052df 1194 fprintf (stderr, "/begbuf");
25fe55af 1195 break;
fa9a63c5
RM
1196
1197 case endbuf:
a1a052df 1198 fprintf (stderr, "/endbuf");
25fe55af 1199 break;
fa9a63c5 1200
25fe55af 1201 default:
a1a052df 1202 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1203 }
1204
a1a052df 1205 fprintf (stderr, "\n");
fa9a63c5
RM
1206 }
1207
a1a052df 1208 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1209}
1210
1211
1212void
1213print_compiled_pattern (bufp)
1214 struct re_pattern_buffer *bufp;
1215{
01618498 1216 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1217
1218 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1219 printf ("%ld bytes used/%ld bytes allocated.\n",
1220 bufp->used, bufp->allocated);
fa9a63c5
RM
1221
1222 if (bufp->fastmap_accurate && bufp->fastmap)
1223 {
1224 printf ("fastmap: ");
1225 print_fastmap (bufp->fastmap);
1226 }
1227
1228 printf ("re_nsub: %d\t", bufp->re_nsub);
1229 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1230 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1231 printf ("no_sub: %d\t", bufp->no_sub);
1232 printf ("not_bol: %d\t", bufp->not_bol);
1233 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1234 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1235 fflush (stdout);
fa9a63c5
RM
1236 /* Perhaps we should print the translate table? */
1237}
1238
1239
1240void
1241print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1242 re_char *where;
1243 re_char *string1;
1244 re_char *string2;
fa9a63c5
RM
1245 int size1;
1246 int size2;
1247{
4bb91c68 1248 int this_char;
5e69f11e 1249
fa9a63c5
RM
1250 if (where == NULL)
1251 printf ("(null)");
1252 else
1253 {
1254 if (FIRST_STRING_P (where))
25fe55af
RS
1255 {
1256 for (this_char = where - string1; this_char < size1; this_char++)
1257 putchar (string1[this_char]);
fa9a63c5 1258
25fe55af
RS
1259 where = string2;
1260 }
fa9a63c5
RM
1261
1262 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1263 putchar (string2[this_char]);
fa9a63c5
RM
1264 }
1265}
1266
1267#else /* not DEBUG */
1268
0b32bf0e
SM
1269# undef assert
1270# define assert(e)
fa9a63c5 1271
0b32bf0e
SM
1272# define DEBUG_STATEMENT(e)
1273# define DEBUG_PRINT1(x)
1274# define DEBUG_PRINT2(x1, x2)
1275# define DEBUG_PRINT3(x1, x2, x3)
1276# define DEBUG_PRINT4(x1, x2, x3, x4)
1277# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1278# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1279
1280#endif /* not DEBUG */
1281\f
1282/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1283 also be assigned to arbitrarily: each pattern buffer stores its own
1284 syntax, so it can be changed between regex compilations. */
1285/* This has no initializer because initialized variables in Emacs
1286 become read-only after dumping. */
1287reg_syntax_t re_syntax_options;
1288
1289
1290/* Specify the precise syntax of regexps for compilation. This provides
1291 for compatibility for various utilities which historically have
1292 different, incompatible syntaxes.
1293
1294 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1295 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1296
1297reg_syntax_t
1298re_set_syntax (syntax)
f9b0fd99 1299 reg_syntax_t syntax;
fa9a63c5
RM
1300{
1301 reg_syntax_t ret = re_syntax_options;
5e69f11e 1302
fa9a63c5
RM
1303 re_syntax_options = syntax;
1304 return ret;
1305}
c0f9ea08 1306WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1307
1308/* Regexp to use to replace spaces, or NULL meaning don't. */
1309static re_char *whitespace_regexp;
1310
1311void
1312re_set_whitespace_regexp (regexp)
6470ea05 1313 const char *regexp;
f9b0fd99 1314{
6470ea05 1315 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1316}
1317WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1318\f
1319/* This table gives an error message for each of the error codes listed
4bb91c68 1320 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1321 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1322 but why not be nice? */
fa9a63c5
RM
1323
1324static const char *re_error_msgid[] =
5e69f11e
RM
1325 {
1326 gettext_noop ("Success"), /* REG_NOERROR */
1327 gettext_noop ("No match"), /* REG_NOMATCH */
1328 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1329 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1330 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1331 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1332 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1333 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1334 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1335 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1336 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1337 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1338 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1339 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1340 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1341 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1342 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1343 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1344 };
1345\f
4bb91c68 1346/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1347
1348/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1349 searching and matching functions should not call alloca. On some
1350 systems, alloca is implemented in terms of malloc, and if we're
1351 using the relocating allocator routines, then malloc could cause a
1352 relocation, which might (if the strings being searched are in the
1353 ralloc heap) shift the data out from underneath the regexp
1354 routines.
1355
5e69f11e 1356 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1357 processes input from X in a signal handler; processing X input may
1358 call malloc; if input arrives while a matching routine is calling
1359 malloc, then we're scrod. But Emacs can't just block input while
1360 calling matching routines; then we don't notice interrupts when
1361 they come in. So, Emacs blocks input around all regexp calls
1362 except the matching calls, which it leaves unprotected, in the
1363 faith that they will not malloc. */
1364
1365/* Normally, this is fine. */
1366#define MATCH_MAY_ALLOCATE
1367
1368/* When using GNU C, we are not REALLY using the C alloca, no matter
1369 what config.h may say. So don't take precautions for it. */
1370#ifdef __GNUC__
0b32bf0e 1371# undef C_ALLOCA
fa9a63c5
RM
1372#endif
1373
1374/* The match routines may not allocate if (1) they would do it with malloc
1375 and (2) it's not safe for them to use malloc.
1376 Note that if REL_ALLOC is defined, matching would not use malloc for the
1377 failure stack, but we would still use it for the register vectors;
4bb91c68 1378 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1379#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1380# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1381#endif
1382
1383\f
1384/* Failure stack declarations and macros; both re_compile_fastmap and
1385 re_match_2 use a failure stack. These have to be macros because of
1386 REGEX_ALLOCATE_STACK. */
5e69f11e 1387
fa9a63c5 1388
320a2a73 1389/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1390 when matching. If this number is exceeded, we allocate more
1391 space, so it is not a hard limit. */
1392#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1393# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1394#endif
1395
1396/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1397 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1398 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1399 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1400 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1401# if defined MATCH_MAY_ALLOCATE
1402/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1403 whose default stack limit is 2mb. In order for a larger
1404 value to work reliably, you have to try to make it accord
1405 with the process stack limit. */
c0f9ea08
SM
1406size_t re_max_failures = 40000;
1407# else
1408size_t re_max_failures = 4000;
1409# endif
fa9a63c5
RM
1410
1411union fail_stack_elt
1412{
01618498 1413 re_char *pointer;
c0f9ea08
SM
1414 /* This should be the biggest `int' that's no bigger than a pointer. */
1415 long integer;
fa9a63c5
RM
1416};
1417
1418typedef union fail_stack_elt fail_stack_elt_t;
1419
1420typedef struct
1421{
1422 fail_stack_elt_t *stack;
c0f9ea08
SM
1423 size_t size;
1424 size_t avail; /* Offset of next open position. */
1425 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1426} fail_stack_type;
1427
505bde11 1428#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1429#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1430
1431
1432/* Define macros to initialize and free the failure stack.
1433 Do `return -2' if the alloc fails. */
1434
1435#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1436# define INIT_FAIL_STACK() \
fa9a63c5
RM
1437 do { \
1438 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1439 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1440 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1441 \
1442 if (fail_stack.stack == NULL) \
1443 return -2; \
1444 \
1445 fail_stack.size = INIT_FAILURE_ALLOC; \
1446 fail_stack.avail = 0; \
505bde11 1447 fail_stack.frame = 0; \
fa9a63c5
RM
1448 } while (0)
1449
0b32bf0e 1450# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1451#else
0b32bf0e 1452# define INIT_FAIL_STACK() \
fa9a63c5
RM
1453 do { \
1454 fail_stack.avail = 0; \
505bde11 1455 fail_stack.frame = 0; \
fa9a63c5
RM
1456 } while (0)
1457
0b32bf0e 1458# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1459#endif
1460
1461
320a2a73
KH
1462/* Double the size of FAIL_STACK, up to a limit
1463 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1464
1465 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1466 allocating space for it or it was already too large.
1467
4bb91c68 1468 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1469
320a2a73
KH
1470/* Factor to increase the failure stack size by
1471 when we increase it.
1472 This used to be 2, but 2 was too wasteful
1473 because the old discarded stacks added up to as much space
1474 were as ultimate, maximum-size stack. */
1475#define FAIL_STACK_GROWTH_FACTOR 4
1476
1477#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1478 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1479 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1480 ? 0 \
320a2a73
KH
1481 : ((fail_stack).stack \
1482 = (fail_stack_elt_t *) \
25fe55af
RS
1483 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1484 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1485 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1486 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1487 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1488 \
1489 (fail_stack).stack == NULL \
1490 ? 0 \
6453db45
KH
1491 : ((fail_stack).size \
1492 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1493 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1494 * FAIL_STACK_GROWTH_FACTOR)) \
1495 / sizeof (fail_stack_elt_t)), \
25fe55af 1496 1)))
fa9a63c5
RM
1497
1498
fa9a63c5
RM
1499/* Push a pointer value onto the failure stack.
1500 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1501 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1502#define PUSH_FAILURE_POINTER(item) \
01618498 1503 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1504
1505/* This pushes an integer-valued item onto the failure stack.
1506 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1507 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1508#define PUSH_FAILURE_INT(item) \
1509 fail_stack.stack[fail_stack.avail++].integer = (item)
1510
1511/* Push a fail_stack_elt_t value onto the failure stack.
1512 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1513 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1514#define PUSH_FAILURE_ELT(item) \
1515 fail_stack.stack[fail_stack.avail++] = (item)
1516
1517/* These three POP... operations complement the three PUSH... operations.
1518 All assume that `fail_stack' is nonempty. */
1519#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1520#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1521#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1522
505bde11
SM
1523/* Individual items aside from the registers. */
1524#define NUM_NONREG_ITEMS 3
1525
1526/* Used to examine the stack (to detect infinite loops). */
1527#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1528#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1529#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1530#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1531
1532
505bde11
SM
1533#define ENSURE_FAIL_STACK(space) \
1534while (REMAINING_AVAIL_SLOTS <= space) { \
1535 if (!GROW_FAIL_STACK (fail_stack)) \
1536 return -2; \
1537 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1538 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1539}
1540
1541/* Push register NUM onto the stack. */
1542#define PUSH_FAILURE_REG(num) \
1543do { \
1544 char *destination; \
1545 ENSURE_FAIL_STACK(3); \
1546 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1547 num, regstart[num], regend[num]); \
1548 PUSH_FAILURE_POINTER (regstart[num]); \
1549 PUSH_FAILURE_POINTER (regend[num]); \
1550 PUSH_FAILURE_INT (num); \
1551} while (0)
1552
01618498
SM
1553/* Change the counter's value to VAL, but make sure that it will
1554 be reset when backtracking. */
1555#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1556do { \
1557 char *destination; \
1558 int c; \
1559 ENSURE_FAIL_STACK(3); \
1560 EXTRACT_NUMBER (c, ptr); \
01618498 1561 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1562 PUSH_FAILURE_INT (c); \
1563 PUSH_FAILURE_POINTER (ptr); \
1564 PUSH_FAILURE_INT (-1); \
01618498 1565 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1566} while (0)
1567
505bde11 1568/* Pop a saved register off the stack. */
dc1e502d 1569#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1570do { \
1571 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1572 if (reg == -1) \
1573 { \
1574 /* It's a counter. */ \
6dcf2d0e
SM
1575 /* Here, we discard `const', making re_match non-reentrant. */ \
1576 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1577 reg = POP_FAILURE_INT (); \
1578 STORE_NUMBER (ptr, reg); \
1579 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1580 } \
1581 else \
1582 { \
1583 regend[reg] = POP_FAILURE_POINTER (); \
1584 regstart[reg] = POP_FAILURE_POINTER (); \
1585 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1586 reg, regstart[reg], regend[reg]); \
1587 } \
505bde11
SM
1588} while (0)
1589
1590/* Check that we are not stuck in an infinite loop. */
1591#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1592do { \
f6df485f 1593 int failure = TOP_FAILURE_HANDLE (); \
505bde11 1594 /* Check for infinite matching loops */ \
f6df485f
RS
1595 while (failure > 0 \
1596 && (FAILURE_STR (failure) == string_place \
1597 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1598 { \
1599 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1600 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1601 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1602 { \
6df42991
SM
1603 cycle = 1; \
1604 break; \
f6df485f 1605 } \
66f0296e 1606 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1607 failure = NEXT_FAILURE_HANDLE(failure); \
1608 } \
1609 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1610} while (0)
6df42991 1611
fa9a63c5 1612/* Push the information about the state we will need
5e69f11e
RM
1613 if we ever fail back to it.
1614
505bde11 1615 Requires variables fail_stack, regstart, regend and
320a2a73 1616 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1617 declared.
5e69f11e 1618
fa9a63c5
RM
1619 Does `return FAILURE_CODE' if runs out of memory. */
1620
505bde11
SM
1621#define PUSH_FAILURE_POINT(pattern, string_place) \
1622do { \
1623 char *destination; \
1624 /* Must be int, so when we don't save any registers, the arithmetic \
1625 of 0 + -1 isn't done as unsigned. */ \
1626 \
505bde11 1627 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1628 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1629 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1630 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1631 \
1632 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1633 \
1634 DEBUG_PRINT1 ("\n"); \
1635 \
1636 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1637 PUSH_FAILURE_INT (fail_stack.frame); \
1638 \
1639 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1640 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1641 DEBUG_PRINT1 ("'\n"); \
1642 PUSH_FAILURE_POINTER (string_place); \
1643 \
1644 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1645 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1646 PUSH_FAILURE_POINTER (pattern); \
1647 \
1648 /* Close the frame by moving the frame pointer past it. */ \
1649 fail_stack.frame = fail_stack.avail; \
1650} while (0)
fa9a63c5 1651
320a2a73
KH
1652/* Estimate the size of data pushed by a typical failure stack entry.
1653 An estimate is all we need, because all we use this for
1654 is to choose a limit for how big to make the failure stack. */
ada30c0e 1655/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1656#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1657
fa9a63c5
RM
1658/* How many items can still be added to the stack without overflowing it. */
1659#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1660
1661
1662/* Pops what PUSH_FAIL_STACK pushes.
1663
1664 We restore into the parameters, all of which should be lvalues:
1665 STR -- the saved data position.
1666 PAT -- the saved pattern position.
fa9a63c5 1667 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1668
fa9a63c5 1669 Also assumes the variables `fail_stack' and (if debugging), `bufp',
7814e705 1670 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1671
505bde11
SM
1672#define POP_FAILURE_POINT(str, pat) \
1673do { \
fa9a63c5
RM
1674 assert (!FAIL_STACK_EMPTY ()); \
1675 \
1676 /* Remove failure points and point to how many regs pushed. */ \
1677 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1678 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1679 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1680 \
505bde11
SM
1681 /* Pop the saved registers. */ \
1682 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1683 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1684 \
01618498 1685 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1686 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1687 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1688 \
1689 /* If the saved string location is NULL, it came from an \
1690 on_failure_keep_string_jump opcode, and we want to throw away the \
1691 saved NULL, thus retaining our current position in the string. */ \
01618498 1692 str = POP_FAILURE_POINTER (); \
505bde11 1693 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1694 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1695 DEBUG_PRINT1 ("'\n"); \
1696 \
505bde11
SM
1697 fail_stack.frame = POP_FAILURE_INT (); \
1698 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1699 \
505bde11
SM
1700 assert (fail_stack.avail >= 0); \
1701 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1702 \
fa9a63c5 1703 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1704} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1705
1706
1707\f
fa9a63c5 1708/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1709#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1710\f
1711/* Subroutine declarations and macros for regex_compile. */
1712
4bb91c68
SM
1713static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1714 reg_syntax_t syntax,
1715 struct re_pattern_buffer *bufp));
1716static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1717static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1718 int arg1, int arg2));
1719static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1720 int arg, unsigned char *end));
1721static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1722 int arg1, int arg2, unsigned char *end));
01618498
SM
1723static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1724 re_char *p,
4bb91c68 1725 reg_syntax_t syntax));
01618498
SM
1726static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1727 re_char *pend,
4bb91c68 1728 reg_syntax_t syntax));
01618498
SM
1729static re_char *skip_one_char _RE_ARGS ((re_char *p));
1730static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1731 char *fastmap, const int multibyte));
fa9a63c5 1732
fa9a63c5 1733/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1734 translation. */
36595814 1735#define PATFETCH(c) \
2d1675e4
SM
1736 do { \
1737 int len; \
1738 if (p == pend) return REG_EEND; \
1739 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1740 p += len; \
fa9a63c5
RM
1741 } while (0)
1742
fa9a63c5
RM
1743
1744/* If `translate' is non-null, return translate[D], else just D. We
1745 cast the subscript to translate because some data is declared as
1746 `char *', to avoid warnings when a string constant is passed. But
1747 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1748#ifndef TRANSLATE
0b32bf0e 1749# define TRANSLATE(d) \
66f0296e 1750 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1751#endif
fa9a63c5
RM
1752
1753
1754/* Macros for outputting the compiled pattern into `buffer'. */
1755
1756/* If the buffer isn't allocated when it comes in, use this. */
1757#define INIT_BUF_SIZE 32
1758
4bb91c68 1759/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1760#define GET_BUFFER_SPACE(n) \
01618498 1761 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1762 EXTEND_BUFFER ()
1763
1764/* Make sure we have one more byte of buffer space and then add C to it. */
1765#define BUF_PUSH(c) \
1766 do { \
1767 GET_BUFFER_SPACE (1); \
1768 *b++ = (unsigned char) (c); \
1769 } while (0)
1770
1771
1772/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1773#define BUF_PUSH_2(c1, c2) \
1774 do { \
1775 GET_BUFFER_SPACE (2); \
1776 *b++ = (unsigned char) (c1); \
1777 *b++ = (unsigned char) (c2); \
1778 } while (0)
1779
1780
4bb91c68 1781/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1782#define BUF_PUSH_3(c1, c2, c3) \
1783 do { \
1784 GET_BUFFER_SPACE (3); \
1785 *b++ = (unsigned char) (c1); \
1786 *b++ = (unsigned char) (c2); \
1787 *b++ = (unsigned char) (c3); \
1788 } while (0)
1789
1790
1791/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1792 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1793#define STORE_JUMP(op, loc, to) \
1794 store_op1 (op, loc, (to) - (loc) - 3)
1795
1796/* Likewise, for a two-argument jump. */
1797#define STORE_JUMP2(op, loc, to, arg) \
1798 store_op2 (op, loc, (to) - (loc) - 3, arg)
1799
4bb91c68 1800/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1801#define INSERT_JUMP(op, loc, to) \
1802 insert_op1 (op, loc, (to) - (loc) - 3, b)
1803
1804/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1805#define INSERT_JUMP2(op, loc, to, arg) \
1806 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1807
1808
1809/* This is not an arbitrary limit: the arguments which represent offsets
275a3409 1810 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1811 be too small, many things would have to change. */
275a3409
RS
1812# define MAX_BUF_SIZE (1L << 15)
1813
1814#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1815/* Any other compiler which, like MSC, has allocation limit below 2^16
1816 bytes will have to use approach similar to what was done below for
1817 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1818 reallocating to 0 bytes. Such thing is not going to work too well.
1819 You have been warned!! */
1820#if defined _MSC_VER && !defined WIN32
1821/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1822# define MAX_BUF_SIZE 65500L
1823#else
1824# define MAX_BUF_SIZE (1L << 16)
1825#endif
275a3409 1826#endif /* 0 */
fa9a63c5
RM
1827
1828/* Extend the buffer by twice its current size via realloc and
1829 reset the pointers that pointed into the old block to point to the
1830 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1831 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1832#if __BOUNDED_POINTERS__
1833# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1834# define MOVE_BUFFER_POINTER(P) \
1835 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1836# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1837 else \
1838 { \
1839 SET_HIGH_BOUND (b); \
1840 SET_HIGH_BOUND (begalt); \
1841 if (fixup_alt_jump) \
1842 SET_HIGH_BOUND (fixup_alt_jump); \
1843 if (laststart) \
1844 SET_HIGH_BOUND (laststart); \
1845 if (pending_exact) \
1846 SET_HIGH_BOUND (pending_exact); \
1847 }
1848#else
1849# define MOVE_BUFFER_POINTER(P) (P) += incr
1850# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1851#endif
fa9a63c5 1852#define EXTEND_BUFFER() \
25fe55af 1853 do { \
01618498 1854 re_char *old_buffer = bufp->buffer; \
25fe55af 1855 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1856 return REG_ESIZE; \
1857 bufp->allocated <<= 1; \
1858 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1859 bufp->allocated = MAX_BUF_SIZE; \
01618498 1860 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1861 if (bufp->buffer == NULL) \
1862 return REG_ESPACE; \
1863 /* If the buffer moved, move all the pointers into it. */ \
1864 if (old_buffer != bufp->buffer) \
1865 { \
4bb91c68
SM
1866 int incr = bufp->buffer - old_buffer; \
1867 MOVE_BUFFER_POINTER (b); \
1868 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1869 if (fixup_alt_jump) \
4bb91c68 1870 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1871 if (laststart) \
4bb91c68 1872 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1873 if (pending_exact) \
4bb91c68 1874 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1875 } \
4bb91c68 1876 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1877 } while (0)
1878
1879
1880/* Since we have one byte reserved for the register number argument to
1881 {start,stop}_memory, the maximum number of groups we can report
1882 things about is what fits in that byte. */
1883#define MAX_REGNUM 255
1884
1885/* But patterns can have more than `MAX_REGNUM' registers. We just
1886 ignore the excess. */
098d42af 1887typedef int regnum_t;
fa9a63c5
RM
1888
1889
1890/* Macros for the compile stack. */
1891
1892/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1893 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1894/* int may be not enough when sizeof(int) == 2. */
1895typedef long pattern_offset_t;
fa9a63c5
RM
1896
1897typedef struct
1898{
1899 pattern_offset_t begalt_offset;
1900 pattern_offset_t fixup_alt_jump;
5e69f11e 1901 pattern_offset_t laststart_offset;
fa9a63c5
RM
1902 regnum_t regnum;
1903} compile_stack_elt_t;
1904
1905
1906typedef struct
1907{
1908 compile_stack_elt_t *stack;
1909 unsigned size;
1910 unsigned avail; /* Offset of next open position. */
1911} compile_stack_type;
1912
1913
1914#define INIT_COMPILE_STACK_SIZE 32
1915
1916#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1917#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1918
4bb91c68 1919/* The next available element. */
fa9a63c5
RM
1920#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1921
1cee1e27
SM
1922/* Explicit quit checking is only used on NTemacs and whenever we
1923 use polling to process input events. */
1924#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1925extern int immediate_quit;
1926# define IMMEDIATE_QUIT_CHECK \
1927 do { \
1928 if (immediate_quit) QUIT; \
1929 } while (0)
1930#else
1931# define IMMEDIATE_QUIT_CHECK ((void)0)
1932#endif
1933\f
b18215fc
RS
1934/* Structure to manage work area for range table. */
1935struct range_table_work_area
1936{
1937 int *table; /* actual work area. */
1938 int allocated; /* allocated size for work area in bytes. */
7814e705 1939 int used; /* actually used size in words. */
96cc36cc 1940 int bits; /* flag to record character classes */
b18215fc
RS
1941};
1942
77d11aec
RS
1943/* Make sure that WORK_AREA can hold more N multibyte characters.
1944 This is used only in set_image_of_range and set_image_of_range_1.
1945 It expects WORK_AREA to be a pointer.
1946 If it can't get the space, it returns from the surrounding function. */
1947
1948#define EXTEND_RANGE_TABLE(work_area, n) \
1949 do { \
1950 if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
1951 { \
1952 extend_range_table_work_area (work_area); \
1953 if ((work_area)->table == 0) \
1954 return (REG_ESPACE); \
1955 } \
b18215fc
RS
1956 } while (0)
1957
96cc36cc
RS
1958#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1959 (work_area).bits |= (bit)
1960
14473664
SM
1961/* Bits used to implement the multibyte-part of the various character classes
1962 such as [:alnum:] in a charset's range table. */
1963#define BIT_WORD 0x1
1964#define BIT_LOWER 0x2
1965#define BIT_PUNCT 0x4
1966#define BIT_SPACE 0x8
1967#define BIT_UPPER 0x10
1968#define BIT_MULTIBYTE 0x20
96cc36cc 1969
36595814
SM
1970/* Set a range START..END to WORK_AREA.
1971 The range is passed through TRANSLATE, so START and END
1972 should be untranslated. */
77d11aec
RS
1973#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
1974 do { \
1975 int tem; \
1976 tem = set_image_of_range (&work_area, start, end, translate); \
1977 if (tem > 0) \
1978 FREE_STACK_RETURN (tem); \
b18215fc
RS
1979 } while (0)
1980
7814e705 1981/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1982#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1983 do { \
1984 if ((work_area).table) \
1985 free ((work_area).table); \
1986 } while (0)
1987
96cc36cc 1988#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1989#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1990#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1991#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1992\f
b18215fc 1993
fa9a63c5 1994/* Set the bit for character C in a list. */
01618498 1995#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1996
1997
1998/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1999#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2000 do { \
2001 if (p == pend) \
2002 FREE_STACK_RETURN (REG_EBRACE); \
2003 else \
2004 { \
2005 PATFETCH (c); \
2006 while ('0' <= c && c <= '9') \
2007 { \
2008 int prev; \
2009 if (num < 0) \
2010 num = 0; \
2011 prev = num; \
2012 num = num * 10 + c - '0'; \
2013 if (num / 10 != prev) \
2014 FREE_STACK_RETURN (REG_BADBR); \
2015 if (p == pend) \
2016 FREE_STACK_RETURN (REG_EBRACE); \
2017 PATFETCH (c); \
2018 } \
2019 } \
2020 } while (0)
77d11aec 2021\f
1fdab503 2022#if ! WIDE_CHAR_SUPPORT
01618498 2023
14473664 2024/* Map a string to the char class it names (if any). */
1fdab503 2025re_wctype_t
ada30c0e
SM
2026re_wctype (str)
2027 re_char *str;
14473664 2028{
ada30c0e 2029 const char *string = str;
14473664
SM
2030 if (STREQ (string, "alnum")) return RECC_ALNUM;
2031 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2032 else if (STREQ (string, "word")) return RECC_WORD;
2033 else if (STREQ (string, "ascii")) return RECC_ASCII;
2034 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2035 else if (STREQ (string, "graph")) return RECC_GRAPH;
2036 else if (STREQ (string, "lower")) return RECC_LOWER;
2037 else if (STREQ (string, "print")) return RECC_PRINT;
2038 else if (STREQ (string, "punct")) return RECC_PUNCT;
2039 else if (STREQ (string, "space")) return RECC_SPACE;
2040 else if (STREQ (string, "upper")) return RECC_UPPER;
2041 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2042 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2043 else if (STREQ (string, "digit")) return RECC_DIGIT;
2044 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2045 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2046 else if (STREQ (string, "blank")) return RECC_BLANK;
2047 else return 0;
2048}
2049
2050/* True iff CH is in the char class CC. */
1fdab503 2051boolean
14473664
SM
2052re_iswctype (ch, cc)
2053 int ch;
2054 re_wctype_t cc;
2055{
2056 switch (cc)
2057 {
0cdd06f8
SM
2058 case RECC_ALNUM: return ISALNUM (ch);
2059 case RECC_ALPHA: return ISALPHA (ch);
2060 case RECC_BLANK: return ISBLANK (ch);
2061 case RECC_CNTRL: return ISCNTRL (ch);
2062 case RECC_DIGIT: return ISDIGIT (ch);
2063 case RECC_GRAPH: return ISGRAPH (ch);
2064 case RECC_LOWER: return ISLOWER (ch);
2065 case RECC_PRINT: return ISPRINT (ch);
2066 case RECC_PUNCT: return ISPUNCT (ch);
2067 case RECC_SPACE: return ISSPACE (ch);
2068 case RECC_UPPER: return ISUPPER (ch);
2069 case RECC_XDIGIT: return ISXDIGIT (ch);
2070 case RECC_ASCII: return IS_REAL_ASCII (ch);
2071 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2072 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2073 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2074 case RECC_WORD: return ISWORD (ch);
2075 case RECC_ERROR: return false;
2076 default:
2077 abort();
14473664
SM
2078 }
2079}
fa9a63c5 2080
14473664
SM
2081/* Return a bit-pattern to use in the range-table bits to match multibyte
2082 chars of class CC. */
2083static int
2084re_wctype_to_bit (cc)
2085 re_wctype_t cc;
2086{
2087 switch (cc)
2088 {
2089 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2090 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2091 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2092 case RECC_LOWER: return BIT_LOWER;
2093 case RECC_UPPER: return BIT_UPPER;
2094 case RECC_PUNCT: return BIT_PUNCT;
2095 case RECC_SPACE: return BIT_SPACE;
14473664 2096 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2097 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2098 default:
2099 abort();
14473664
SM
2100 }
2101}
2102#endif
77d11aec
RS
2103\f
2104/* Filling in the work area of a range. */
2105
2106/* Actually extend the space in WORK_AREA. */
2107
2108static void
2109extend_range_table_work_area (work_area)
2110 struct range_table_work_area *work_area;
177c0ea7 2111{
77d11aec
RS
2112 work_area->allocated += 16 * sizeof (int);
2113 if (work_area->table)
2114 work_area->table
a073faa6 2115 = (int *) realloc (work_area->table, work_area->allocated);
77d11aec
RS
2116 else
2117 work_area->table
a073faa6 2118 = (int *) malloc (work_area->allocated);
77d11aec
RS
2119}
2120
2121#ifdef emacs
2122
2123/* Carefully find the ranges of codes that are equivalent
2124 under case conversion to the range start..end when passed through
2125 TRANSLATE. Handle the case where non-letters can come in between
2126 two upper-case letters (which happens in Latin-1).
2127 Also handle the case of groups of more than 2 case-equivalent chars.
2128
2129 The basic method is to look at consecutive characters and see
2130 if they can form a run that can be handled as one.
2131
2132 Returns -1 if successful, REG_ESPACE if ran out of space. */
2133
2134static int
2135set_image_of_range_1 (work_area, start, end, translate)
2136 RE_TRANSLATE_TYPE translate;
2137 struct range_table_work_area *work_area;
2138 re_wchar_t start, end;
2139{
2140 /* `one_case' indicates a character, or a run of characters,
2141 each of which is an isolate (no case-equivalents).
2142 This includes all ASCII non-letters.
2143
2144 `two_case' indicates a character, or a run of characters,
2145 each of which has two case-equivalent forms.
2146 This includes all ASCII letters.
2147
2148 `strange' indicates a character that has more than one
2149 case-equivalent. */
177c0ea7 2150
77d11aec
RS
2151 enum case_type {one_case, two_case, strange};
2152
2153 /* Describe the run that is in progress,
2154 which the next character can try to extend.
2155 If run_type is strange, that means there really is no run.
2156 If run_type is one_case, then run_start...run_end is the run.
2157 If run_type is two_case, then the run is run_start...run_end,
2158 and the case-equivalents end at run_eqv_end. */
2159
2160 enum case_type run_type = strange;
2161 int run_start, run_end, run_eqv_end;
2162
2163 Lisp_Object eqv_table;
2164
2165 if (!RE_TRANSLATE_P (translate))
2166 {
b7c12565 2167 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2168 work_area->table[work_area->used++] = (start);
2169 work_area->table[work_area->used++] = (end);
b7c12565 2170 return -1;
77d11aec
RS
2171 }
2172
2173 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2174
77d11aec
RS
2175 for (; start <= end; start++)
2176 {
2177 enum case_type this_type;
2178 int eqv = RE_TRANSLATE (eqv_table, start);
2179 int minchar, maxchar;
2180
2181 /* Classify this character */
2182 if (eqv == start)
2183 this_type = one_case;
2184 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2185 this_type = two_case;
2186 else
2187 this_type = strange;
2188
2189 if (start < eqv)
2190 minchar = start, maxchar = eqv;
2191 else
2192 minchar = eqv, maxchar = start;
2193
2194 /* Can this character extend the run in progress? */
2195 if (this_type == strange || this_type != run_type
2196 || !(minchar == run_end + 1
2197 && (run_type == two_case
2198 ? maxchar == run_eqv_end + 1 : 1)))
2199 {
2200 /* No, end the run.
2201 Record each of its equivalent ranges. */
2202 if (run_type == one_case)
2203 {
2204 EXTEND_RANGE_TABLE (work_area, 2);
2205 work_area->table[work_area->used++] = run_start;
2206 work_area->table[work_area->used++] = run_end;
2207 }
2208 else if (run_type == two_case)
2209 {
2210 EXTEND_RANGE_TABLE (work_area, 4);
2211 work_area->table[work_area->used++] = run_start;
2212 work_area->table[work_area->used++] = run_end;
2213 work_area->table[work_area->used++]
2214 = RE_TRANSLATE (eqv_table, run_start);
2215 work_area->table[work_area->used++]
2216 = RE_TRANSLATE (eqv_table, run_end);
2217 }
2218 run_type = strange;
2219 }
177c0ea7 2220
77d11aec
RS
2221 if (this_type == strange)
2222 {
2223 /* For a strange character, add each of its equivalents, one
2224 by one. Don't start a range. */
2225 do
2226 {
2227 EXTEND_RANGE_TABLE (work_area, 2);
2228 work_area->table[work_area->used++] = eqv;
2229 work_area->table[work_area->used++] = eqv;
2230 eqv = RE_TRANSLATE (eqv_table, eqv);
2231 }
2232 while (eqv != start);
2233 }
2234
2235 /* Add this char to the run, or start a new run. */
2236 else if (run_type == strange)
2237 {
2238 /* Initialize a new range. */
2239 run_type = this_type;
2240 run_start = start;
2241 run_end = start;
2242 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2243 }
2244 else
2245 {
2246 /* Extend a running range. */
2247 run_end = minchar;
2248 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2249 }
2250 }
2251
2252 /* If a run is still in progress at the end, finish it now
2253 by recording its equivalent ranges. */
2254 if (run_type == one_case)
2255 {
2256 EXTEND_RANGE_TABLE (work_area, 2);
2257 work_area->table[work_area->used++] = run_start;
2258 work_area->table[work_area->used++] = run_end;
2259 }
2260 else if (run_type == two_case)
2261 {
2262 EXTEND_RANGE_TABLE (work_area, 4);
2263 work_area->table[work_area->used++] = run_start;
2264 work_area->table[work_area->used++] = run_end;
2265 work_area->table[work_area->used++]
2266 = RE_TRANSLATE (eqv_table, run_start);
2267 work_area->table[work_area->used++]
2268 = RE_TRANSLATE (eqv_table, run_end);
2269 }
2270
2271 return -1;
2272}
36595814 2273
77d11aec 2274#endif /* emacs */
36595814 2275
b7c12565 2276/* Record the the image of the range start..end when passed through
36595814
SM
2277 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2278 and is not even necessarily contiguous.
b7c12565
RS
2279 Normally we approximate it with the smallest contiguous range that contains
2280 all the chars we need. However, for Latin-1 we go to extra effort
2281 to do a better job.
2282
2283 This function is not called for ASCII ranges.
77d11aec
RS
2284
2285 Returns -1 if successful, REG_ESPACE if ran out of space. */
2286
2287static int
36595814
SM
2288set_image_of_range (work_area, start, end, translate)
2289 RE_TRANSLATE_TYPE translate;
2290 struct range_table_work_area *work_area;
2291 re_wchar_t start, end;
2292{
77d11aec
RS
2293 re_wchar_t cmin, cmax;
2294
2295#ifdef emacs
2296 /* For Latin-1 ranges, use set_image_of_range_1
2297 to get proper handling of ranges that include letters and nonletters.
b7c12565 2298 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2299 For other character sets, we don't bother to get this right. */
b7c12565
RS
2300 if (RE_TRANSLATE_P (translate) && start < 04400
2301 && !(start < 04200 && end >= 04377))
77d11aec 2302 {
b7c12565 2303 int newend;
77d11aec 2304 int tem;
b7c12565
RS
2305 newend = end;
2306 if (newend > 04377)
2307 newend = 04377;
2308 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2309 if (tem > 0)
2310 return tem;
2311
2312 start = 04400;
2313 if (end < 04400)
2314 return -1;
2315 }
2316#endif
2317
b7c12565
RS
2318 EXTEND_RANGE_TABLE (work_area, 2);
2319 work_area->table[work_area->used++] = (start);
2320 work_area->table[work_area->used++] = (end);
2321
2322 cmin = -1, cmax = -1;
77d11aec 2323
36595814 2324 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2325 {
2326 int ch;
77d11aec 2327
b7c12565
RS
2328 for (ch = start; ch <= end; ch++)
2329 {
2330 re_wchar_t c = TRANSLATE (ch);
2331 if (! (start <= c && c <= end))
2332 {
2333 if (cmin == -1)
2334 cmin = c, cmax = c;
2335 else
2336 {
2337 cmin = MIN (cmin, c);
2338 cmax = MAX (cmax, c);
2339 }
2340 }
2341 }
2342
2343 if (cmin != -1)
2344 {
2345 EXTEND_RANGE_TABLE (work_area, 2);
2346 work_area->table[work_area->used++] = (cmin);
2347 work_area->table[work_area->used++] = (cmax);
2348 }
2349 }
36595814 2350
77d11aec
RS
2351 return -1;
2352}
fa9a63c5
RM
2353\f
2354#ifndef MATCH_MAY_ALLOCATE
2355
2356/* If we cannot allocate large objects within re_match_2_internal,
2357 we make the fail stack and register vectors global.
2358 The fail stack, we grow to the maximum size when a regexp
2359 is compiled.
2360 The register vectors, we adjust in size each time we
2361 compile a regexp, according to the number of registers it needs. */
2362
2363static fail_stack_type fail_stack;
2364
2365/* Size with which the following vectors are currently allocated.
2366 That is so we can make them bigger as needed,
4bb91c68 2367 but never make them smaller. */
fa9a63c5
RM
2368static int regs_allocated_size;
2369
66f0296e
SM
2370static re_char ** regstart, ** regend;
2371static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2372
2373/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2374 but don't make them smaller. */
fa9a63c5
RM
2375
2376static
2377regex_grow_registers (num_regs)
2378 int num_regs;
2379{
2380 if (num_regs > regs_allocated_size)
2381 {
66f0296e
SM
2382 RETALLOC_IF (regstart, num_regs, re_char *);
2383 RETALLOC_IF (regend, num_regs, re_char *);
2384 RETALLOC_IF (best_regstart, num_regs, re_char *);
2385 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2386
2387 regs_allocated_size = num_regs;
2388 }
2389}
2390
2391#endif /* not MATCH_MAY_ALLOCATE */
2392\f
99633e97
SM
2393static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2394 compile_stack,
2395 regnum_t regnum));
2396
fa9a63c5
RM
2397/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2398 Returns one of error codes defined in `regex.h', or zero for success.
2399
2400 Assumes the `allocated' (and perhaps `buffer') and `translate'
2401 fields are set in BUFP on entry.
2402
2403 If it succeeds, results are put in BUFP (if it returns an error, the
2404 contents of BUFP are undefined):
2405 `buffer' is the compiled pattern;
2406 `syntax' is set to SYNTAX;
2407 `used' is set to the length of the compiled pattern;
2408 `fastmap_accurate' is zero;
2409 `re_nsub' is the number of subexpressions in PATTERN;
2410 `not_bol' and `not_eol' are zero;
5e69f11e 2411
c0f9ea08 2412 The `fastmap' field is neither examined nor set. */
fa9a63c5 2413
505bde11
SM
2414/* Insert the `jump' from the end of last alternative to "here".
2415 The space for the jump has already been allocated. */
2416#define FIXUP_ALT_JUMP() \
2417do { \
2418 if (fixup_alt_jump) \
2419 STORE_JUMP (jump, fixup_alt_jump, b); \
2420} while (0)
2421
2422
fa9a63c5
RM
2423/* Return, freeing storage we allocated. */
2424#define FREE_STACK_RETURN(value) \
b18215fc
RS
2425 do { \
2426 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2427 free (compile_stack.stack); \
2428 return value; \
2429 } while (0)
fa9a63c5
RM
2430
2431static reg_errcode_t
2432regex_compile (pattern, size, syntax, bufp)
66f0296e 2433 re_char *pattern;
4bb91c68 2434 size_t size;
fa9a63c5
RM
2435 reg_syntax_t syntax;
2436 struct re_pattern_buffer *bufp;
2437{
01618498
SM
2438 /* We fetch characters from PATTERN here. */
2439 register re_wchar_t c, c1;
5e69f11e 2440
fa9a63c5 2441 /* A random temporary spot in PATTERN. */
66f0296e 2442 re_char *p1;
fa9a63c5
RM
2443
2444 /* Points to the end of the buffer, where we should append. */
2445 register unsigned char *b;
5e69f11e 2446
fa9a63c5
RM
2447 /* Keeps track of unclosed groups. */
2448 compile_stack_type compile_stack;
2449
2450 /* Points to the current (ending) position in the pattern. */
22336245
RS
2451#ifdef AIX
2452 /* `const' makes AIX compiler fail. */
66f0296e 2453 unsigned char *p = pattern;
22336245 2454#else
66f0296e 2455 re_char *p = pattern;
22336245 2456#endif
66f0296e 2457 re_char *pend = pattern + size;
5e69f11e 2458
fa9a63c5 2459 /* How to translate the characters in the pattern. */
6676cb1c 2460 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2461
2462 /* Address of the count-byte of the most recently inserted `exactn'
2463 command. This makes it possible to tell if a new exact-match
2464 character can be added to that command or if the character requires
2465 a new `exactn' command. */
2466 unsigned char *pending_exact = 0;
2467
2468 /* Address of start of the most recently finished expression.
2469 This tells, e.g., postfix * where to find the start of its
2470 operand. Reset at the beginning of groups and alternatives. */
2471 unsigned char *laststart = 0;
2472
2473 /* Address of beginning of regexp, or inside of last group. */
2474 unsigned char *begalt;
2475
2476 /* Place in the uncompiled pattern (i.e., the {) to
2477 which to go back if the interval is invalid. */
66f0296e 2478 re_char *beg_interval;
5e69f11e 2479
fa9a63c5 2480 /* Address of the place where a forward jump should go to the end of
7814e705 2481 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2482 last -- ends with a forward jump of this sort. */
2483 unsigned char *fixup_alt_jump = 0;
2484
2485 /* Counts open-groups as they are encountered. Remembered for the
2486 matching close-group on the compile stack, so the same register
2487 number is put in the stop_memory as the start_memory. */
2488 regnum_t regnum = 0;
2489
b18215fc
RS
2490 /* Work area for range table of charset. */
2491 struct range_table_work_area range_table_work;
2492
2d1675e4
SM
2493 /* If the object matched can contain multibyte characters. */
2494 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2495
f9b0fd99
RS
2496 /* Nonzero if we have pushed down into a subpattern. */
2497 int in_subpattern = 0;
2498
2499 /* These hold the values of p, pattern, and pend from the main
2500 pattern when we have pushed into a subpattern. */
2501 re_char *main_p;
2502 re_char *main_pattern;
2503 re_char *main_pend;
2504
fa9a63c5 2505#ifdef DEBUG
99633e97 2506 debug++;
fa9a63c5 2507 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2508 if (debug > 0)
fa9a63c5
RM
2509 {
2510 unsigned debug_count;
5e69f11e 2511
fa9a63c5 2512 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2513 putchar (pattern[debug_count]);
fa9a63c5
RM
2514 putchar ('\n');
2515 }
2516#endif /* DEBUG */
2517
2518 /* Initialize the compile stack. */
2519 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2520 if (compile_stack.stack == NULL)
2521 return REG_ESPACE;
2522
2523 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2524 compile_stack.avail = 0;
2525
b18215fc
RS
2526 range_table_work.table = 0;
2527 range_table_work.allocated = 0;
2528
fa9a63c5
RM
2529 /* Initialize the pattern buffer. */
2530 bufp->syntax = syntax;
2531 bufp->fastmap_accurate = 0;
2532 bufp->not_bol = bufp->not_eol = 0;
6224b623 2533 bufp->used_syntax = 0;
fa9a63c5
RM
2534
2535 /* Set `used' to zero, so that if we return an error, the pattern
2536 printer (for debugging) will think there's no pattern. We reset it
2537 at the end. */
2538 bufp->used = 0;
5e69f11e 2539
fa9a63c5 2540 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2541 bufp->re_nsub = 0;
fa9a63c5 2542
0b32bf0e 2543#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2544 /* Initialize the syntax table. */
2545 init_syntax_once ();
2546#endif
2547
2548 if (bufp->allocated == 0)
2549 {
2550 if (bufp->buffer)
2551 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2552 enough space. This loses if buffer's address is bogus, but
7814e705 2553 that is the user's responsibility. */
25fe55af
RS
2554 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2555 }
fa9a63c5 2556 else
7814e705 2557 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2558 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2559 }
fa9a63c5
RM
2560 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2561
2562 bufp->allocated = INIT_BUF_SIZE;
2563 }
2564
2565 begalt = b = bufp->buffer;
2566
2567 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2568 while (1)
fa9a63c5 2569 {
f9b0fd99
RS
2570 if (p == pend)
2571 {
2572 /* If this is the end of an included regexp,
2573 pop back to the main regexp and try again. */
2574 if (in_subpattern)
2575 {
2576 in_subpattern = 0;
2577 pattern = main_pattern;
2578 p = main_p;
2579 pend = main_pend;
2580 continue;
2581 }
2582 /* If this is the end of the main regexp, we are done. */
2583 break;
2584 }
2585
fa9a63c5
RM
2586 PATFETCH (c);
2587
2588 switch (c)
25fe55af 2589 {
f9b0fd99
RS
2590 case ' ':
2591 {
2592 re_char *p1 = p;
2593
2594 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2595 spaces normally. And don't try to do this recursively. */
2596 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2597 goto normal_char;
2598
2599 /* Peek past following spaces. */
2600 while (p1 != pend)
2601 {
2602 if (*p1 != ' ')
2603 break;
2604 p1++;
2605 }
2606 /* If the spaces are followed by a repetition op,
2607 treat them normally. */
c721eee5
RS
2608 if (p1 != pend
2609 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2610 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2611 goto normal_char;
2612
2613 /* Replace the spaces with the whitespace regexp. */
2614 in_subpattern = 1;
2615 main_p = p1;
2616 main_pend = pend;
2617 main_pattern = pattern;
2618 p = pattern = whitespace_regexp;
2619 pend = p + strlen (p);
2620 break;
7814e705 2621 }
f9b0fd99 2622
25fe55af
RS
2623 case '^':
2624 {
7814e705 2625 if ( /* If at start of pattern, it's an operator. */
25fe55af 2626 p == pattern + 1
7814e705 2627 /* If context independent, it's an operator. */
25fe55af 2628 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2629 /* Otherwise, depends on what's come before. */
25fe55af 2630 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2631 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2632 else
2633 goto normal_char;
2634 }
2635 break;
2636
2637
2638 case '$':
2639 {
2640 if ( /* If at end of pattern, it's an operator. */
2641 p == pend
7814e705 2642 /* If context independent, it's an operator. */
25fe55af
RS
2643 || syntax & RE_CONTEXT_INDEP_ANCHORS
2644 /* Otherwise, depends on what's next. */
2645 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2646 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2647 else
2648 goto normal_char;
2649 }
2650 break;
fa9a63c5
RM
2651
2652
2653 case '+':
25fe55af
RS
2654 case '?':
2655 if ((syntax & RE_BK_PLUS_QM)
2656 || (syntax & RE_LIMITED_OPS))
2657 goto normal_char;
2658 handle_plus:
2659 case '*':
2660 /* If there is no previous pattern... */
2661 if (!laststart)
2662 {
2663 if (syntax & RE_CONTEXT_INVALID_OPS)
2664 FREE_STACK_RETURN (REG_BADRPT);
2665 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2666 goto normal_char;
2667 }
2668
2669 {
7814e705 2670 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2671 boolean zero_times_ok = 0, many_times_ok = 0;
2672 boolean greedy = 1;
25fe55af
RS
2673
2674 /* If there is a sequence of repetition chars, collapse it
2675 down to just one (the right one). We can't combine
2676 interval operators with these because of, e.g., `a{2}*',
7814e705 2677 which should only match an even number of `a's. */
25fe55af
RS
2678
2679 for (;;)
2680 {
0b32bf0e 2681 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2682 && c == '?' && (zero_times_ok || many_times_ok))
2683 greedy = 0;
2684 else
2685 {
2686 zero_times_ok |= c != '+';
2687 many_times_ok |= c != '?';
2688 }
25fe55af
RS
2689
2690 if (p == pend)
2691 break;
ed0767d8
SM
2692 else if (*p == '*'
2693 || (!(syntax & RE_BK_PLUS_QM)
2694 && (*p == '+' || *p == '?')))
25fe55af 2695 ;
ed0767d8 2696 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2697 {
ed0767d8
SM
2698 if (p+1 == pend)
2699 FREE_STACK_RETURN (REG_EESCAPE);
2700 if (p[1] == '+' || p[1] == '?')
2701 PATFETCH (c); /* Gobble up the backslash. */
2702 else
2703 break;
25fe55af
RS
2704 }
2705 else
ed0767d8 2706 break;
25fe55af 2707 /* If we get here, we found another repeat character. */
ed0767d8
SM
2708 PATFETCH (c);
2709 }
25fe55af
RS
2710
2711 /* Star, etc. applied to an empty pattern is equivalent
2712 to an empty pattern. */
4e8a9132 2713 if (!laststart || laststart == b)
25fe55af
RS
2714 break;
2715
2716 /* Now we know whether or not zero matches is allowed
7814e705 2717 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2718 if (greedy)
2719 {
99633e97 2720 if (many_times_ok)
4e8a9132
SM
2721 {
2722 boolean simple = skip_one_char (laststart) == b;
2723 unsigned int startoffset = 0;
f6a3f532 2724 re_opcode_t ofj =
01618498 2725 /* Check if the loop can match the empty string. */
6df42991
SM
2726 (simple || !analyse_first (laststart, b, NULL, 0))
2727 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2728 assert (skip_one_char (laststart) <= b);
177c0ea7 2729
4e8a9132
SM
2730 if (!zero_times_ok && simple)
2731 { /* Since simple * loops can be made faster by using
2732 on_failure_keep_string_jump, we turn simple P+
2733 into PP* if P is simple. */
2734 unsigned char *p1, *p2;
2735 startoffset = b - laststart;
2736 GET_BUFFER_SPACE (startoffset);
2737 p1 = b; p2 = laststart;
2738 while (p2 < p1)
2739 *b++ = *p2++;
2740 zero_times_ok = 1;
99633e97 2741 }
4e8a9132
SM
2742
2743 GET_BUFFER_SPACE (6);
2744 if (!zero_times_ok)
2745 /* A + loop. */
f6a3f532 2746 STORE_JUMP (ofj, b, b + 6);
99633e97 2747 else
4e8a9132
SM
2748 /* Simple * loops can use on_failure_keep_string_jump
2749 depending on what follows. But since we don't know
2750 that yet, we leave the decision up to
2751 on_failure_jump_smart. */
f6a3f532 2752 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2753 laststart + startoffset, b + 6);
99633e97 2754 b += 3;
4e8a9132 2755 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2756 b += 3;
2757 }
2758 else
2759 {
4e8a9132
SM
2760 /* A simple ? pattern. */
2761 assert (zero_times_ok);
2762 GET_BUFFER_SPACE (3);
2763 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2764 b += 3;
2765 }
1c8c6d39
DL
2766 }
2767 else /* not greedy */
2768 { /* I wish the greedy and non-greedy cases could be merged. */
2769
0683b6fa 2770 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2771 if (many_times_ok)
2772 {
f6a3f532
SM
2773 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2774
6df42991
SM
2775 /* The non-greedy multiple match looks like
2776 a repeat..until: we only need a conditional jump
2777 at the end of the loop. */
f6a3f532
SM
2778 if (emptyp) BUF_PUSH (no_op);
2779 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2780 : on_failure_jump, b, laststart);
1c8c6d39
DL
2781 b += 3;
2782 if (zero_times_ok)
2783 {
2784 /* The repeat...until naturally matches one or more.
2785 To also match zero times, we need to first jump to
6df42991 2786 the end of the loop (its conditional jump). */
1c8c6d39
DL
2787 INSERT_JUMP (jump, laststart, b);
2788 b += 3;
2789 }
2790 }
2791 else
2792 {
2793 /* non-greedy a?? */
1c8c6d39
DL
2794 INSERT_JUMP (jump, laststart, b + 3);
2795 b += 3;
2796 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2797 b += 3;
2798 }
2799 }
2800 }
4e8a9132 2801 pending_exact = 0;
fa9a63c5
RM
2802 break;
2803
2804
2805 case '.':
25fe55af
RS
2806 laststart = b;
2807 BUF_PUSH (anychar);
2808 break;
fa9a63c5
RM
2809
2810
25fe55af
RS
2811 case '[':
2812 {
b18215fc 2813 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2814
25fe55af 2815 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2816
25fe55af
RS
2817 /* Ensure that we have enough space to push a charset: the
2818 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2819 GET_BUFFER_SPACE (34);
2820
25fe55af 2821 laststart = b;
e318085a 2822
25fe55af 2823 /* We test `*p == '^' twice, instead of using an if
7814e705 2824 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2825 BUF_PUSH (*p == '^' ? charset_not : charset);
2826 if (*p == '^')
2827 p++;
e318085a 2828
25fe55af
RS
2829 /* Remember the first position in the bracket expression. */
2830 p1 = p;
e318085a 2831
7814e705 2832 /* Push the number of bytes in the bitmap. */
25fe55af 2833 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2834
25fe55af
RS
2835 /* Clear the whole map. */
2836 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2837
25fe55af
RS
2838 /* charset_not matches newline according to a syntax bit. */
2839 if ((re_opcode_t) b[-2] == charset_not
2840 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2841 SET_LIST_BIT ('\n');
fa9a63c5 2842
7814e705 2843 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2844 for (;;)
2845 {
b18215fc 2846 boolean escaped_char = false;
2d1675e4 2847 const unsigned char *p2 = p;
e318085a 2848
25fe55af 2849 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2850
36595814
SM
2851 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2852 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2853 So the translation is done later in a loop. Example:
2854 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2855 PATFETCH (c);
e318085a 2856
25fe55af
RS
2857 /* \ might escape characters inside [...] and [^...]. */
2858 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2859 {
2860 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2861
2862 PATFETCH (c);
b18215fc 2863 escaped_char = true;
25fe55af 2864 }
b18215fc
RS
2865 else
2866 {
7814e705 2867 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2868 not (i.e., when the bracket expression is `[]' so
2869 far), the ']' character bit gets set way below. */
2d1675e4 2870 if (c == ']' && p2 != p1)
657fcfbd 2871 break;
25fe55af 2872 }
b18215fc 2873
b18215fc
RS
2874 /* What should we do for the character which is
2875 greater than 0x7F, but not BASE_LEADING_CODE_P?
2876 XXX */
2877
25fe55af
RS
2878 /* See if we're at the beginning of a possible character
2879 class. */
b18215fc 2880
2d1675e4
SM
2881 if (!escaped_char &&
2882 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2883 {
7814e705 2884 /* Leave room for the null. */
14473664 2885 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2886 const unsigned char *class_beg;
b18215fc 2887
25fe55af
RS
2888 PATFETCH (c);
2889 c1 = 0;
ed0767d8 2890 class_beg = p;
b18215fc 2891
25fe55af
RS
2892 /* If pattern is `[[:'. */
2893 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2894
25fe55af
RS
2895 for (;;)
2896 {
14473664
SM
2897 PATFETCH (c);
2898 if ((c == ':' && *p == ']') || p == pend)
2899 break;
2900 if (c1 < CHAR_CLASS_MAX_LENGTH)
2901 str[c1++] = c;
2902 else
2903 /* This is in any case an invalid class name. */
2904 str[0] = '\0';
25fe55af
RS
2905 }
2906 str[c1] = '\0';
b18215fc
RS
2907
2908 /* If isn't a word bracketed by `[:' and `:]':
2909 undo the ending character, the letters, and
2910 leave the leading `:' and `[' (but set bits for
2911 them). */
25fe55af
RS
2912 if (c == ':' && *p == ']')
2913 {
36595814 2914 re_wchar_t ch;
14473664
SM
2915 re_wctype_t cc;
2916
2917 cc = re_wctype (str);
2918
2919 if (cc == 0)
fa9a63c5
RM
2920 FREE_STACK_RETURN (REG_ECTYPE);
2921
14473664
SM
2922 /* Throw away the ] at the end of the character
2923 class. */
2924 PATFETCH (c);
fa9a63c5 2925
14473664 2926 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2927
96cc36cc
RS
2928 /* Most character classes in a multibyte match
2929 just set a flag. Exceptions are is_blank,
2930 is_digit, is_cntrl, and is_xdigit, since
2931 they can only match ASCII characters. We
14473664
SM
2932 don't need to handle them for multibyte.
2933 They are distinguished by a negative wctype. */
96cc36cc 2934
2d1675e4 2935 if (multibyte)
14473664
SM
2936 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2937 re_wctype_to_bit (cc));
96cc36cc 2938
14473664 2939 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
25fe55af 2940 {
7ae68633 2941 int translated = TRANSLATE (ch);
151ccb74 2942 if (translated < (1 << BYTEWIDTH)
6358f8b2 2943 && re_iswctype (btowc (ch), cc))
96cc36cc 2944 SET_LIST_BIT (translated);
25fe55af 2945 }
b18215fc 2946
6224b623
SM
2947 /* In most cases the matching rule for char classes
2948 only uses the syntax table for multibyte chars,
2949 so that the content of the syntax-table it is not
2950 hardcoded in the range_table. SPACE and WORD are
2951 the two exceptions. */
2952 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2953 bufp->used_syntax = 1;
2954
b18215fc
RS
2955 /* Repeat the loop. */
2956 continue;
25fe55af
RS
2957 }
2958 else
2959 {
ed0767d8
SM
2960 /* Go back to right after the "[:". */
2961 p = class_beg;
25fe55af 2962 SET_LIST_BIT ('[');
b18215fc
RS
2963
2964 /* Because the `:' may starts the range, we
2965 can't simply set bit and repeat the loop.
7814e705 2966 Instead, just set it to C and handle below. */
b18215fc 2967 c = ':';
25fe55af
RS
2968 }
2969 }
b18215fc
RS
2970
2971 if (p < pend && p[0] == '-' && p[1] != ']')
2972 {
2973
2974 /* Discard the `-'. */
2975 PATFETCH (c1);
2976
2977 /* Fetch the character which ends the range. */
2978 PATFETCH (c1);
b18215fc 2979
b54f61ed 2980 if (SINGLE_BYTE_CHAR_P (c))
e934739e 2981 {
b54f61ed
KH
2982 if (! SINGLE_BYTE_CHAR_P (c1))
2983 {
3ff2446d
KH
2984 /* Handle a range starting with a
2985 character of less than 256, and ending
2986 with a character of not less than 256.
2987 Split that into two ranges, the low one
2988 ending at 0377, and the high one
2989 starting at the smallest character in
2990 the charset of C1 and ending at C1. */
b54f61ed 2991 int charset = CHAR_CHARSET (c1);
36595814
SM
2992 re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
2993
b54f61ed
KH
2994 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2995 c2, c1);
333526e0 2996 c1 = 0377;
b54f61ed 2997 }
e934739e
RS
2998 }
2999 else if (!SAME_CHARSET_P (c, c1))
b3e4c897 3000 FREE_STACK_RETURN (REG_ERANGEX);
e318085a 3001 }
25fe55af 3002 else
b18215fc
RS
3003 /* Range from C to C. */
3004 c1 = c;
3005
3006 /* Set the range ... */
3007 if (SINGLE_BYTE_CHAR_P (c))
3008 /* ... into bitmap. */
25fe55af 3009 {
01618498 3010 re_wchar_t this_char;
36595814 3011 re_wchar_t range_start = c, range_end = c1;
b18215fc
RS
3012
3013 /* If the start is after the end, the range is empty. */
3014 if (range_start > range_end)
3015 {
3016 if (syntax & RE_NO_EMPTY_RANGES)
3017 FREE_STACK_RETURN (REG_ERANGE);
3018 /* Else, repeat the loop. */
3019 }
3020 else
3021 {
3022 for (this_char = range_start; this_char <= range_end;
3023 this_char++)
3aaab9a0
KH
3024 {
3025 int translated = TRANSLATE (this_char);
3026 if (translated < (1 << BYTEWIDTH))
3027 SET_LIST_BIT (translated);
3028 else
3029 SET_RANGE_TABLE_WORK_AREA
3030 (range_table_work, translated, translated);
3031 }
e934739e 3032 }
25fe55af 3033 }
e318085a 3034 else
b18215fc
RS
3035 /* ... into range table. */
3036 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
3037 }
3038
25fe55af 3039 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3040 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3041 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3042 b[-1]--;
3043 b += b[-1];
fa9a63c5 3044
96cc36cc
RS
3045 /* Build real range table from work area. */
3046 if (RANGE_TABLE_WORK_USED (range_table_work)
3047 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3048 {
3049 int i;
3050 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3051
b18215fc 3052 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3053 bytes for flags, two for COUNT, and three bytes for
3054 each character. */
3055 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3056
b18215fc
RS
3057 /* Indicate the existence of range table. */
3058 laststart[1] |= 0x80;
fa9a63c5 3059
96cc36cc
RS
3060 /* Store the character class flag bits into the range table.
3061 If not in emacs, these flag bits are always 0. */
3062 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3063 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3064
b18215fc
RS
3065 STORE_NUMBER_AND_INCR (b, used / 2);
3066 for (i = 0; i < used; i++)
3067 STORE_CHARACTER_AND_INCR
3068 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3069 }
25fe55af
RS
3070 }
3071 break;
fa9a63c5
RM
3072
3073
b18215fc 3074 case '(':
25fe55af
RS
3075 if (syntax & RE_NO_BK_PARENS)
3076 goto handle_open;
3077 else
3078 goto normal_char;
fa9a63c5
RM
3079
3080
25fe55af
RS
3081 case ')':
3082 if (syntax & RE_NO_BK_PARENS)
3083 goto handle_close;
3084 else
3085 goto normal_char;
e318085a
RS
3086
3087
25fe55af
RS
3088 case '\n':
3089 if (syntax & RE_NEWLINE_ALT)
3090 goto handle_alt;
3091 else
3092 goto normal_char;
e318085a
RS
3093
3094
b18215fc 3095 case '|':
25fe55af
RS
3096 if (syntax & RE_NO_BK_VBAR)
3097 goto handle_alt;
3098 else
3099 goto normal_char;
3100
3101
3102 case '{':
3103 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3104 goto handle_interval;
3105 else
3106 goto normal_char;
3107
3108
3109 case '\\':
3110 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3111
3112 /* Do not translate the character after the \, so that we can
3113 distinguish, e.g., \B from \b, even if we normally would
3114 translate, e.g., B to b. */
36595814 3115 PATFETCH (c);
25fe55af
RS
3116
3117 switch (c)
3118 {
3119 case '(':
3120 if (syntax & RE_NO_BK_PARENS)
3121 goto normal_backslash;
3122
3123 handle_open:
505bde11
SM
3124 {
3125 int shy = 0;
3126 if (p+1 < pend)
3127 {
3128 /* Look for a special (?...) construct */
ed0767d8 3129 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3130 {
ed0767d8 3131 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
3132 PATFETCH (c);
3133 switch (c)
3134 {
3135 case ':': shy = 1; break;
3136 default:
3137 /* Only (?:...) is supported right now. */
3138 FREE_STACK_RETURN (REG_BADPAT);
3139 }
3140 }
505bde11
SM
3141 }
3142
3143 if (!shy)
3144 {
3145 bufp->re_nsub++;
3146 regnum++;
3147 }
25fe55af 3148
99633e97
SM
3149 if (COMPILE_STACK_FULL)
3150 {
3151 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3152 compile_stack_elt_t);
3153 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3154
99633e97
SM
3155 compile_stack.size <<= 1;
3156 }
25fe55af 3157
99633e97 3158 /* These are the values to restore when we hit end of this
7814e705 3159 group. They are all relative offsets, so that if the
99633e97
SM
3160 whole pattern moves because of realloc, they will still
3161 be valid. */
3162 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3163 COMPILE_STACK_TOP.fixup_alt_jump
3164 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3165 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3166 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
3167
3168 /* Do not push a
3169 start_memory for groups beyond the last one we can
3170 represent in the compiled pattern. */
3171 if (regnum <= MAX_REGNUM && !shy)
3172 BUF_PUSH_2 (start_memory, regnum);
3173
3174 compile_stack.avail++;
3175
3176 fixup_alt_jump = 0;
3177 laststart = 0;
3178 begalt = b;
3179 /* If we've reached MAX_REGNUM groups, then this open
3180 won't actually generate any code, so we'll have to
3181 clear pending_exact explicitly. */
3182 pending_exact = 0;
3183 break;
505bde11 3184 }
25fe55af
RS
3185
3186 case ')':
3187 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3188
3189 if (COMPILE_STACK_EMPTY)
505bde11
SM
3190 {
3191 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3192 goto normal_backslash;
3193 else
3194 FREE_STACK_RETURN (REG_ERPAREN);
3195 }
25fe55af
RS
3196
3197 handle_close:
505bde11 3198 FIXUP_ALT_JUMP ();
25fe55af
RS
3199
3200 /* See similar code for backslashed left paren above. */
3201 if (COMPILE_STACK_EMPTY)
505bde11
SM
3202 {
3203 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3204 goto normal_char;
3205 else
3206 FREE_STACK_RETURN (REG_ERPAREN);
3207 }
25fe55af
RS
3208
3209 /* Since we just checked for an empty stack above, this
3210 ``can't happen''. */
3211 assert (compile_stack.avail != 0);
3212 {
3213 /* We don't just want to restore into `regnum', because
3214 later groups should continue to be numbered higher,
7814e705 3215 as in `(ab)c(de)' -- the second group is #2. */
25fe55af
RS
3216 regnum_t this_group_regnum;
3217
3218 compile_stack.avail--;
3219 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3220 fixup_alt_jump
3221 = COMPILE_STACK_TOP.fixup_alt_jump
3222 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3223 : 0;
3224 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3225 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3226 /* If we've reached MAX_REGNUM groups, then this open
3227 won't actually generate any code, so we'll have to
3228 clear pending_exact explicitly. */
3229 pending_exact = 0;
e318085a 3230
25fe55af 3231 /* We're at the end of the group, so now we know how many
7814e705 3232 groups were inside this one. */
505bde11
SM
3233 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
3234 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
3235 }
3236 break;
3237
3238
3239 case '|': /* `\|'. */
3240 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3241 goto normal_backslash;
3242 handle_alt:
3243 if (syntax & RE_LIMITED_OPS)
3244 goto normal_char;
3245
3246 /* Insert before the previous alternative a jump which
7814e705 3247 jumps to this alternative if the former fails. */
25fe55af
RS
3248 GET_BUFFER_SPACE (3);
3249 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3250 pending_exact = 0;
3251 b += 3;
3252
3253 /* The alternative before this one has a jump after it
3254 which gets executed if it gets matched. Adjust that
3255 jump so it will jump to this alternative's analogous
3256 jump (put in below, which in turn will jump to the next
3257 (if any) alternative's such jump, etc.). The last such
3258 jump jumps to the correct final destination. A picture:
3259 _____ _____
3260 | | | |
3261 | v | v
3262 a | b | c
3263
3264 If we are at `b', then fixup_alt_jump right now points to a
3265 three-byte space after `a'. We'll put in the jump, set
3266 fixup_alt_jump to right after `b', and leave behind three
3267 bytes which we'll fill in when we get to after `c'. */
3268
505bde11 3269 FIXUP_ALT_JUMP ();
25fe55af
RS
3270
3271 /* Mark and leave space for a jump after this alternative,
3272 to be filled in later either by next alternative or
3273 when know we're at the end of a series of alternatives. */
3274 fixup_alt_jump = b;
3275 GET_BUFFER_SPACE (3);
3276 b += 3;
3277
3278 laststart = 0;
3279 begalt = b;
3280 break;
3281
3282
3283 case '{':
3284 /* If \{ is a literal. */
3285 if (!(syntax & RE_INTERVALS)
3286 /* If we're at `\{' and it's not the open-interval
3287 operator. */
4bb91c68 3288 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3289 goto normal_backslash;
3290
3291 handle_interval:
3292 {
3293 /* If got here, then the syntax allows intervals. */
3294
3295 /* At least (most) this many matches must be made. */
99633e97 3296 int lower_bound = 0, upper_bound = -1;
25fe55af 3297
ed0767d8 3298 beg_interval = p;
25fe55af 3299
25fe55af
RS
3300 GET_UNSIGNED_NUMBER (lower_bound);
3301
3302 if (c == ',')
ed0767d8 3303 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3304 else
3305 /* Interval such as `{1}' => match exactly once. */
3306 upper_bound = lower_bound;
3307
3308 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3309 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3310 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3311
3312 if (!(syntax & RE_NO_BK_BRACES))
3313 {
4bb91c68
SM
3314 if (c != '\\')
3315 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3316 if (p == pend)
3317 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3318 PATFETCH (c);
3319 }
3320
3321 if (c != '}')
4bb91c68 3322 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3323
3324 /* We just parsed a valid interval. */
3325
3326 /* If it's invalid to have no preceding re. */
3327 if (!laststart)
3328 {
3329 if (syntax & RE_CONTEXT_INVALID_OPS)
3330 FREE_STACK_RETURN (REG_BADRPT);
3331 else if (syntax & RE_CONTEXT_INDEP_OPS)
3332 laststart = b;
3333 else
3334 goto unfetch_interval;
3335 }
3336
6df42991
SM
3337 if (upper_bound == 0)
3338 /* If the upper bound is zero, just drop the sub pattern
3339 altogether. */
3340 b = laststart;
3341 else if (lower_bound == 1 && upper_bound == 1)
3342 /* Just match it once: nothing to do here. */
3343 ;
3344
3345 /* Otherwise, we have a nontrivial interval. When
3346 we're all done, the pattern will look like:
3347 set_number_at <jump count> <upper bound>
3348 set_number_at <succeed_n count> <lower bound>
3349 succeed_n <after jump addr> <succeed_n count>
3350 <body of loop>
3351 jump_n <succeed_n addr> <jump count>
3352 (The upper bound and `jump_n' are omitted if
3353 `upper_bound' is 1, though.) */
3354 else
3355 { /* If the upper bound is > 1, we need to insert
3356 more at the end of the loop. */
3357 unsigned int nbytes = (upper_bound < 0 ? 3
3358 : upper_bound > 1 ? 5 : 0);
3359 unsigned int startoffset = 0;
3360
3361 GET_BUFFER_SPACE (20); /* We might use less. */
3362
3363 if (lower_bound == 0)
3364 {
3365 /* A succeed_n that starts with 0 is really a
3366 a simple on_failure_jump_loop. */
3367 INSERT_JUMP (on_failure_jump_loop, laststart,
3368 b + 3 + nbytes);
3369 b += 3;
3370 }
3371 else
3372 {
3373 /* Initialize lower bound of the `succeed_n', even
3374 though it will be set during matching by its
3375 attendant `set_number_at' (inserted next),
3376 because `re_compile_fastmap' needs to know.
3377 Jump to the `jump_n' we might insert below. */
3378 INSERT_JUMP2 (succeed_n, laststart,
3379 b + 5 + nbytes,
3380 lower_bound);
3381 b += 5;
3382
3383 /* Code to initialize the lower bound. Insert
7814e705 3384 before the `succeed_n'. The `5' is the last two
6df42991
SM
3385 bytes of this `set_number_at', plus 3 bytes of
3386 the following `succeed_n'. */
3387 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3388 b += 5;
3389 startoffset += 5;
3390 }
3391
3392 if (upper_bound < 0)
3393 {
3394 /* A negative upper bound stands for infinity,
3395 in which case it degenerates to a plain jump. */
3396 STORE_JUMP (jump, b, laststart + startoffset);
3397 b += 3;
3398 }
3399 else if (upper_bound > 1)
3400 { /* More than one repetition is allowed, so
3401 append a backward jump to the `succeed_n'
3402 that starts this interval.
3403
3404 When we've reached this during matching,
3405 we'll have matched the interval once, so
3406 jump back only `upper_bound - 1' times. */
3407 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3408 upper_bound - 1);
3409 b += 5;
3410
3411 /* The location we want to set is the second
3412 parameter of the `jump_n'; that is `b-2' as
3413 an absolute address. `laststart' will be
3414 the `set_number_at' we're about to insert;
3415 `laststart+3' the number to set, the source
3416 for the relative address. But we are
3417 inserting into the middle of the pattern --
3418 so everything is getting moved up by 5.
3419 Conclusion: (b - 2) - (laststart + 3) + 5,
3420 i.e., b - laststart.
3421
3422 We insert this at the beginning of the loop
3423 so that if we fail during matching, we'll
3424 reinitialize the bounds. */
3425 insert_op2 (set_number_at, laststart, b - laststart,
3426 upper_bound - 1, b);
3427 b += 5;
3428 }
3429 }
25fe55af
RS
3430 pending_exact = 0;
3431 beg_interval = NULL;
3432 }
3433 break;
3434
3435 unfetch_interval:
3436 /* If an invalid interval, match the characters as literals. */
3437 assert (beg_interval);
3438 p = beg_interval;
3439 beg_interval = NULL;
3440
3441 /* normal_char and normal_backslash need `c'. */
ed0767d8 3442 c = '{';
25fe55af
RS
3443
3444 if (!(syntax & RE_NO_BK_BRACES))
3445 {
ed0767d8
SM
3446 assert (p > pattern && p[-1] == '\\');
3447 goto normal_backslash;
25fe55af 3448 }
ed0767d8
SM
3449 else
3450 goto normal_char;
e318085a 3451
b18215fc 3452#ifdef emacs
25fe55af 3453 /* There is no way to specify the before_dot and after_dot
7814e705 3454 operators. rms says this is ok. --karl */
25fe55af
RS
3455 case '=':
3456 BUF_PUSH (at_dot);
3457 break;
3458
3459 case 's':
3460 laststart = b;
3461 PATFETCH (c);
3462 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3463 break;
3464
3465 case 'S':
3466 laststart = b;
3467 PATFETCH (c);
3468 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3469 break;
b18215fc
RS
3470
3471 case 'c':
3472 laststart = b;
36595814 3473 PATFETCH (c);
b18215fc
RS
3474 BUF_PUSH_2 (categoryspec, c);
3475 break;
e318085a 3476
b18215fc
RS
3477 case 'C':
3478 laststart = b;
36595814 3479 PATFETCH (c);
b18215fc
RS
3480 BUF_PUSH_2 (notcategoryspec, c);
3481 break;
3482#endif /* emacs */
e318085a 3483
e318085a 3484
25fe55af 3485 case 'w':
4bb91c68
SM
3486 if (syntax & RE_NO_GNU_OPS)
3487 goto normal_char;
25fe55af 3488 laststart = b;
1fb352e0 3489 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3490 break;
e318085a 3491
e318085a 3492
25fe55af 3493 case 'W':
4bb91c68
SM
3494 if (syntax & RE_NO_GNU_OPS)
3495 goto normal_char;
25fe55af 3496 laststart = b;
1fb352e0 3497 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3498 break;
e318085a
RS
3499
3500
25fe55af 3501 case '<':
4bb91c68
SM
3502 if (syntax & RE_NO_GNU_OPS)
3503 goto normal_char;
25fe55af
RS
3504 BUF_PUSH (wordbeg);
3505 break;
e318085a 3506
25fe55af 3507 case '>':
4bb91c68
SM
3508 if (syntax & RE_NO_GNU_OPS)
3509 goto normal_char;
25fe55af
RS
3510 BUF_PUSH (wordend);
3511 break;
e318085a 3512
669fa600
SM
3513 case '_':
3514 if (syntax & RE_NO_GNU_OPS)
3515 goto normal_char;
3516 laststart = b;
3517 PATFETCH (c);
3518 if (c == '<')
3519 BUF_PUSH (symbeg);
3520 else if (c == '>')
3521 BUF_PUSH (symend);
3522 else
3523 FREE_STACK_RETURN (REG_BADPAT);
3524 break;
3525
25fe55af 3526 case 'b':
4bb91c68
SM
3527 if (syntax & RE_NO_GNU_OPS)
3528 goto normal_char;
25fe55af
RS
3529 BUF_PUSH (wordbound);
3530 break;
e318085a 3531
25fe55af 3532 case 'B':
4bb91c68
SM
3533 if (syntax & RE_NO_GNU_OPS)
3534 goto normal_char;
25fe55af
RS
3535 BUF_PUSH (notwordbound);
3536 break;
fa9a63c5 3537
25fe55af 3538 case '`':
4bb91c68
SM
3539 if (syntax & RE_NO_GNU_OPS)
3540 goto normal_char;
25fe55af
RS
3541 BUF_PUSH (begbuf);
3542 break;
e318085a 3543
25fe55af 3544 case '\'':
4bb91c68
SM
3545 if (syntax & RE_NO_GNU_OPS)
3546 goto normal_char;
25fe55af
RS
3547 BUF_PUSH (endbuf);
3548 break;
e318085a 3549
25fe55af
RS
3550 case '1': case '2': case '3': case '4': case '5':
3551 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3552 {
3553 regnum_t reg;
e318085a 3554
0cdd06f8
SM
3555 if (syntax & RE_NO_BK_REFS)
3556 goto normal_backslash;
e318085a 3557
0cdd06f8 3558 reg = c - '0';
e318085a 3559
0cdd06f8
SM
3560 /* Can't back reference to a subexpression before its end. */
3561 if (reg > regnum || group_in_compile_stack (compile_stack, reg))
3562 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3563
0cdd06f8
SM
3564 laststart = b;
3565 BUF_PUSH_2 (duplicate, reg);
3566 }
25fe55af 3567 break;
e318085a 3568
e318085a 3569
25fe55af
RS
3570 case '+':
3571 case '?':
3572 if (syntax & RE_BK_PLUS_QM)
3573 goto handle_plus;
3574 else
3575 goto normal_backslash;
3576
3577 default:
3578 normal_backslash:
3579 /* You might think it would be useful for \ to mean
3580 not to translate; but if we don't translate it
4bb91c68 3581 it will never match anything. */
25fe55af
RS
3582 goto normal_char;
3583 }
3584 break;
fa9a63c5
RM
3585
3586
3587 default:
25fe55af 3588 /* Expects the character in `c'. */
fa9a63c5 3589 normal_char:
36595814 3590 /* If no exactn currently being built. */
25fe55af 3591 if (!pending_exact
fa9a63c5 3592
25fe55af
RS
3593 /* If last exactn not at current position. */
3594 || pending_exact + *pending_exact + 1 != b
5e69f11e 3595
25fe55af 3596 /* We have only one byte following the exactn for the count. */
2d1675e4 3597 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3598
7814e705 3599 /* If followed by a repetition operator. */
9d99031f 3600 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3601 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3602 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3603 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3604 || ((syntax & RE_INTERVALS)
25fe55af 3605 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3606 ? p != pend && *p == '{'
3607 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3608 {
3609 /* Start building a new exactn. */
5e69f11e 3610
25fe55af 3611 laststart = b;
fa9a63c5
RM
3612
3613 BUF_PUSH_2 (exactn, 0);
3614 pending_exact = b - 1;
25fe55af 3615 }
5e69f11e 3616
2d1675e4
SM
3617 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3618 {
e0277a47
KH
3619 int len;
3620
36595814 3621 c = TRANSLATE (c);
e0277a47
KH
3622 if (multibyte)
3623 len = CHAR_STRING (c, b);
3624 else
3625 *b = c, len = 1;
2d1675e4
SM
3626 b += len;
3627 (*pending_exact) += len;
3628 }
3629
fa9a63c5 3630 break;
25fe55af 3631 } /* switch (c) */
fa9a63c5
RM
3632 } /* while p != pend */
3633
5e69f11e 3634
fa9a63c5 3635 /* Through the pattern now. */
5e69f11e 3636
505bde11 3637 FIXUP_ALT_JUMP ();
fa9a63c5 3638
5e69f11e 3639 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3640 FREE_STACK_RETURN (REG_EPAREN);
3641
3642 /* If we don't want backtracking, force success
3643 the first time we reach the end of the compiled pattern. */
3644 if (syntax & RE_NO_POSIX_BACKTRACKING)
3645 BUF_PUSH (succeed);
3646
fa9a63c5
RM
3647 /* We have succeeded; set the length of the buffer. */
3648 bufp->used = b - bufp->buffer;
3649
3650#ifdef DEBUG
99633e97 3651 if (debug > 0)
fa9a63c5 3652 {
505bde11 3653 re_compile_fastmap (bufp);
fa9a63c5
RM
3654 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3655 print_compiled_pattern (bufp);
3656 }
99633e97 3657 debug--;
fa9a63c5
RM
3658#endif /* DEBUG */
3659
3660#ifndef MATCH_MAY_ALLOCATE
3661 /* Initialize the failure stack to the largest possible stack. This
3662 isn't necessary unless we're trying to avoid calling alloca in
3663 the search and match routines. */
3664 {
3665 int num_regs = bufp->re_nsub + 1;
3666
320a2a73 3667 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3668 {
a26f4ccd 3669 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3670
fa9a63c5
RM
3671 if (! fail_stack.stack)
3672 fail_stack.stack
a073faa6 3673 = (fail_stack_elt_t *) malloc (fail_stack.size
8ea094cf 3674 * sizeof (fail_stack_elt_t));
fa9a63c5
RM
3675 else
3676 fail_stack.stack
a073faa6 3677 = (fail_stack_elt_t *) realloc (fail_stack.stack,
8ea094cf
AS
3678 (fail_stack.size
3679 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3680 }
3681
3682 regex_grow_registers (num_regs);
3683 }
3684#endif /* not MATCH_MAY_ALLOCATE */
3685
7c67c9f6 3686 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3687} /* regex_compile */
3688\f
3689/* Subroutines for `regex_compile'. */
3690
7814e705 3691/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3692
3693static void
3694store_op1 (op, loc, arg)
3695 re_opcode_t op;
3696 unsigned char *loc;
3697 int arg;
3698{
3699 *loc = (unsigned char) op;
3700 STORE_NUMBER (loc + 1, arg);
3701}
3702
3703
3704/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3705
3706static void
3707store_op2 (op, loc, arg1, arg2)
3708 re_opcode_t op;
3709 unsigned char *loc;
3710 int arg1, arg2;
3711{
3712 *loc = (unsigned char) op;
3713 STORE_NUMBER (loc + 1, arg1);
3714 STORE_NUMBER (loc + 3, arg2);
3715}
3716
3717
3718/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3719 for OP followed by two-byte integer parameter ARG. */
3720
3721static void
3722insert_op1 (op, loc, arg, end)
3723 re_opcode_t op;
3724 unsigned char *loc;
3725 int arg;
5e69f11e 3726 unsigned char *end;
fa9a63c5
RM
3727{
3728 register unsigned char *pfrom = end;
3729 register unsigned char *pto = end + 3;
3730
3731 while (pfrom != loc)
3732 *--pto = *--pfrom;
5e69f11e 3733
fa9a63c5
RM
3734 store_op1 (op, loc, arg);
3735}
3736
3737
3738/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3739
3740static void
3741insert_op2 (op, loc, arg1, arg2, end)
3742 re_opcode_t op;
3743 unsigned char *loc;
3744 int arg1, arg2;
5e69f11e 3745 unsigned char *end;
fa9a63c5
RM
3746{
3747 register unsigned char *pfrom = end;
3748 register unsigned char *pto = end + 5;
3749
3750 while (pfrom != loc)
3751 *--pto = *--pfrom;
5e69f11e 3752
fa9a63c5
RM
3753 store_op2 (op, loc, arg1, arg2);
3754}
3755
3756
3757/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3758 after an alternative or a begin-subexpression. We assume there is at
3759 least one character before the ^. */
3760
3761static boolean
3762at_begline_loc_p (pattern, p, syntax)
01618498 3763 re_char *pattern, *p;
fa9a63c5
RM
3764 reg_syntax_t syntax;
3765{
01618498 3766 re_char *prev = p - 2;
fa9a63c5 3767 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3768
fa9a63c5
RM
3769 return
3770 /* After a subexpression? */
3771 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3772 /* After an alternative? */
d2af47df
SM
3773 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3774 /* After a shy subexpression? */
3775 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3776 && prev[-1] == '?' && prev[-2] == '('
3777 && (syntax & RE_NO_BK_PARENS
3778 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3779}
3780
3781
3782/* The dual of at_begline_loc_p. This one is for $. We assume there is
3783 at least one character after the $, i.e., `P < PEND'. */
3784
3785static boolean
3786at_endline_loc_p (p, pend, syntax)
01618498 3787 re_char *p, *pend;
99633e97 3788 reg_syntax_t syntax;
fa9a63c5 3789{
01618498 3790 re_char *next = p;
fa9a63c5 3791 boolean next_backslash = *next == '\\';
01618498 3792 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3793
fa9a63c5
RM
3794 return
3795 /* Before a subexpression? */
3796 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3797 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3798 /* Before an alternative? */
3799 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3800 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3801}
3802
3803
5e69f11e 3804/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3805 false if it's not. */
3806
3807static boolean
3808group_in_compile_stack (compile_stack, regnum)
3809 compile_stack_type compile_stack;
3810 regnum_t regnum;
3811{
3812 int this_element;
3813
5e69f11e
RM
3814 for (this_element = compile_stack.avail - 1;
3815 this_element >= 0;
fa9a63c5
RM
3816 this_element--)
3817 if (compile_stack.stack[this_element].regnum == regnum)
3818 return true;
3819
3820 return false;
3821}
fa9a63c5 3822\f
f6a3f532
SM
3823/* analyse_first.
3824 If fastmap is non-NULL, go through the pattern and fill fastmap
3825 with all the possible leading chars. If fastmap is NULL, don't
3826 bother filling it up (obviously) and only return whether the
3827 pattern could potentially match the empty string.
3828
3829 Return 1 if p..pend might match the empty string.
3830 Return 0 if p..pend matches at least one char.
01618498 3831 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3832
3833static int
3834analyse_first (p, pend, fastmap, multibyte)
01618498 3835 re_char *p, *pend;
f6a3f532
SM
3836 char *fastmap;
3837 const int multibyte;
fa9a63c5 3838{
505bde11 3839 int j, k;
1fb352e0 3840 boolean not;
fa9a63c5 3841
b18215fc 3842 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3843 flag is set true. */
b18215fc
RS
3844 boolean match_any_multibyte_characters = false;
3845
f6a3f532 3846 assert (p);
5e69f11e 3847
505bde11
SM
3848 /* The loop below works as follows:
3849 - It has a working-list kept in the PATTERN_STACK and which basically
3850 starts by only containing a pointer to the first operation.
3851 - If the opcode we're looking at is a match against some set of
3852 chars, then we add those chars to the fastmap and go on to the
3853 next work element from the worklist (done via `break').
3854 - If the opcode is a control operator on the other hand, we either
3855 ignore it (if it's meaningless at this point, such as `start_memory')
3856 or execute it (if it's a jump). If the jump has several destinations
3857 (i.e. `on_failure_jump'), then we push the other destination onto the
3858 worklist.
3859 We guarantee termination by ignoring backward jumps (more or less),
3860 so that `p' is monotonically increasing. More to the point, we
3861 never set `p' (or push) anything `<= p1'. */
3862
01618498 3863 while (p < pend)
fa9a63c5 3864 {
505bde11
SM
3865 /* `p1' is used as a marker of how far back a `on_failure_jump'
3866 can go without being ignored. It is normally equal to `p'
3867 (which prevents any backward `on_failure_jump') except right
3868 after a plain `jump', to allow patterns such as:
3869 0: jump 10
3870 3..9: <body>
3871 10: on_failure_jump 3
3872 as used for the *? operator. */
01618498 3873 re_char *p1 = p;
5e69f11e 3874
fa9a63c5
RM
3875 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3876 {
f6a3f532 3877 case succeed:
01618498 3878 return 1;
f6a3f532 3879 continue;
fa9a63c5 3880
fa9a63c5 3881 case duplicate:
505bde11
SM
3882 /* If the first character has to match a backreference, that means
3883 that the group was empty (since it already matched). Since this
3884 is the only case that interests us here, we can assume that the
3885 backreference must match the empty string. */
3886 p++;
3887 continue;
fa9a63c5
RM
3888
3889
3890 /* Following are the cases which match a character. These end
7814e705 3891 with `break'. */
fa9a63c5
RM
3892
3893 case exactn:
e0277a47
KH
3894 if (fastmap)
3895 {
3896 int c = RE_STRING_CHAR (p + 1, pend - p);
4560a582
SM
3897 /* When fast-scanning, the fastmap can be indexed either with
3898 a char (smaller than 256) or with the first byte of
3899 a char's byte sequence. So we have to conservatively add
3900 both to the table. */
e0277a47
KH
3901 if (SINGLE_BYTE_CHAR_P (c))
3902 fastmap[c] = 1;
4560a582 3903 fastmap[p[1]] = 1;
e0277a47 3904 }
fa9a63c5
RM
3905 break;
3906
3907
1fb352e0
SM
3908 case anychar:
3909 /* We could put all the chars except for \n (and maybe \0)
3910 but we don't bother since it is generally not worth it. */
f6a3f532 3911 if (!fastmap) break;
01618498 3912 return -1;
fa9a63c5
RM
3913
3914
b18215fc 3915 case charset_not:
ba5c004d
RS
3916 /* Chars beyond end of bitmap are possible matches.
3917 All the single-byte codes can occur in multibyte buffers.
3918 So any that are not listed in the charset
3919 are possible matches, even in multibyte buffers. */
1fb352e0 3920 if (!fastmap) break;
4560a582
SM
3921 /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
3922 because it will automatically be set when needed by virtue of
3923 being larger than the highest char of its charset (0xbf) but
3924 smaller than (1<<BYTEWIDTH). */
b18215fc 3925 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3926 j < (1 << BYTEWIDTH); j++)
b18215fc 3927 fastmap[j] = 1;
1fb352e0
SM
3928 /* Fallthrough */
3929 case charset:
3930 if (!fastmap) break;
3931 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3932 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3933 j >= 0; j--)
1fb352e0 3934 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
4560a582
SM
3935 {
3936 fastmap[j] = 1;
3937#ifdef emacs
3938 if (j >= 0x80 && j < 0xa0)
3939 fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
3940#endif
3941 }
b18215fc 3942
1fb352e0
SM
3943 if ((not && multibyte)
3944 /* Any character set can possibly contain a character
3945 which doesn't match the specified set of characters. */
3946 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3947 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3948 /* If we can match a character class, we can match
3949 any character set. */
b18215fc
RS
3950 {
3951 set_fastmap_for_multibyte_characters:
3952 if (match_any_multibyte_characters == false)
3953 {
3954 for (j = 0x80; j < 0xA0; j++) /* XXX */
3955 if (BASE_LEADING_CODE_P (j))
3956 fastmap[j] = 1;
3957 match_any_multibyte_characters = true;
3958 }
3959 }
b18215fc 3960
1fb352e0
SM
3961 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3962 && match_any_multibyte_characters == false)
3963 {
3964 /* Set fastmap[I] 1 where I is a base leading code of each
3965 multibyte character in the range table. */
3966 int c, count;
b18215fc 3967
1fb352e0 3968 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3969 bits for a character class. */
1fb352e0 3970 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3971
1fb352e0
SM
3972 /* Extract the number of ranges in range table into COUNT. */
3973 EXTRACT_NUMBER_AND_INCR (count, p);
3974 for (; count > 0; count--, p += 2 * 3) /* XXX */
3975 {
3976 /* Extract the start of each range. */
3977 EXTRACT_CHARACTER (c, p);
3978 j = CHAR_CHARSET (c);
3979 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3980 }
3981 }
b18215fc
RS
3982 break;
3983
1fb352e0
SM
3984 case syntaxspec:
3985 case notsyntaxspec:
3986 if (!fastmap) break;
3987#ifndef emacs
3988 not = (re_opcode_t)p[-1] == notsyntaxspec;
3989 k = *p++;
3990 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3991 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3992 fastmap[j] = 1;
b18215fc 3993 break;
1fb352e0 3994#else /* emacs */
b18215fc
RS
3995 /* This match depends on text properties. These end with
3996 aborting optimizations. */
01618498 3997 return -1;
b18215fc
RS
3998
3999 case categoryspec:
b18215fc 4000 case notcategoryspec:
1fb352e0
SM
4001 if (!fastmap) break;
4002 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4003 k = *p++;
1fb352e0
SM
4004 for (j = 0; j < (1 << BYTEWIDTH); j++)
4005 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4006 fastmap[j] = 1;
4007
1fb352e0 4008 if (multibyte)
b18215fc 4009 /* Any character set can possibly contain a character
1fb352e0 4010 whose category is K (or not). */
b18215fc
RS
4011 goto set_fastmap_for_multibyte_characters;
4012 break;
4013
fa9a63c5 4014 /* All cases after this match the empty string. These end with
25fe55af 4015 `continue'. */
fa9a63c5 4016
fa9a63c5
RM
4017 case before_dot:
4018 case at_dot:
4019 case after_dot:
1fb352e0 4020#endif /* !emacs */
25fe55af
RS
4021 case no_op:
4022 case begline:
4023 case endline:
fa9a63c5
RM
4024 case begbuf:
4025 case endbuf:
4026 case wordbound:
4027 case notwordbound:
4028 case wordbeg:
4029 case wordend:
669fa600
SM
4030 case symbeg:
4031 case symend:
25fe55af 4032 continue;
fa9a63c5
RM
4033
4034
fa9a63c5 4035 case jump:
25fe55af 4036 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4037 if (j < 0)
4038 /* Backward jumps can only go back to code that we've already
4039 visited. `re_compile' should make sure this is true. */
4040 break;
25fe55af 4041 p += j;
505bde11
SM
4042 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4043 {
4044 case on_failure_jump:
4045 case on_failure_keep_string_jump:
505bde11 4046 case on_failure_jump_loop:
0683b6fa 4047 case on_failure_jump_nastyloop:
505bde11
SM
4048 case on_failure_jump_smart:
4049 p++;
4050 break;
4051 default:
4052 continue;
4053 };
4054 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4055 to jump back to "just after here". */
4056 /* Fallthrough */
fa9a63c5 4057
25fe55af
RS
4058 case on_failure_jump:
4059 case on_failure_keep_string_jump:
0683b6fa 4060 case on_failure_jump_nastyloop:
505bde11
SM
4061 case on_failure_jump_loop:
4062 case on_failure_jump_smart:
25fe55af 4063 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4064 if (p + j <= p1)
ed0767d8 4065 ; /* Backward jump to be ignored. */
01618498
SM
4066 else
4067 { /* We have to look down both arms.
4068 We first go down the "straight" path so as to minimize
4069 stack usage when going through alternatives. */
4070 int r = analyse_first (p, pend, fastmap, multibyte);
4071 if (r) return r;
4072 p += j;
4073 }
25fe55af 4074 continue;
fa9a63c5
RM
4075
4076
ed0767d8
SM
4077 case jump_n:
4078 /* This code simply does not properly handle forward jump_n. */
4079 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4080 p += 4;
4081 /* jump_n can either jump or fall through. The (backward) jump
4082 case has already been handled, so we only need to look at the
4083 fallthrough case. */
4084 continue;
177c0ea7 4085
fa9a63c5 4086 case succeed_n:
ed0767d8
SM
4087 /* If N == 0, it should be an on_failure_jump_loop instead. */
4088 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4089 p += 4;
4090 /* We only care about one iteration of the loop, so we don't
4091 need to consider the case where this behaves like an
4092 on_failure_jump. */
25fe55af 4093 continue;
fa9a63c5
RM
4094
4095
4096 case set_number_at:
25fe55af
RS
4097 p += 4;
4098 continue;
fa9a63c5
RM
4099
4100
4101 case start_memory:
25fe55af 4102 case stop_memory:
505bde11 4103 p += 1;
fa9a63c5
RM
4104 continue;
4105
4106
4107 default:
25fe55af
RS
4108 abort (); /* We have listed all the cases. */
4109 } /* switch *p++ */
fa9a63c5
RM
4110
4111 /* Getting here means we have found the possible starting
25fe55af 4112 characters for one path of the pattern -- and that the empty
7814e705 4113 string does not match. We need not follow this path further. */
01618498 4114 return 0;
fa9a63c5
RM
4115 } /* while p */
4116
01618498
SM
4117 /* We reached the end without matching anything. */
4118 return 1;
4119
f6a3f532
SM
4120} /* analyse_first */
4121\f
4122/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4123 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4124 characters can start a string that matches the pattern. This fastmap
4125 is used by re_search to skip quickly over impossible starting points.
4126
4127 Character codes above (1 << BYTEWIDTH) are not represented in the
4128 fastmap, but the leading codes are represented. Thus, the fastmap
4129 indicates which character sets could start a match.
4130
4131 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4132 area as BUFP->fastmap.
4133
4134 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4135 the pattern buffer.
4136
4137 Returns 0 if we succeed, -2 if an internal error. */
4138
4139int
4140re_compile_fastmap (bufp)
4141 struct re_pattern_buffer *bufp;
4142{
4143 char *fastmap = bufp->fastmap;
4144 int analysis;
4145
4146 assert (fastmap && bufp->buffer);
4147
7814e705 4148 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4149 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4150
4151 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4152 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4153 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4154 return 0;
4155} /* re_compile_fastmap */
4156\f
4157/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4158 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4159 this memory for recording register information. STARTS and ENDS
4160 must be allocated using the malloc library routine, and must each
4161 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4162
4163 If NUM_REGS == 0, then subsequent matches should allocate their own
4164 register data.
4165
4166 Unless this function is called, the first search or match using
4167 PATTERN_BUFFER will allocate its own register data, without
4168 freeing the old data. */
4169
4170void
4171re_set_registers (bufp, regs, num_regs, starts, ends)
4172 struct re_pattern_buffer *bufp;
4173 struct re_registers *regs;
4174 unsigned num_regs;
4175 regoff_t *starts, *ends;
4176{
4177 if (num_regs)
4178 {
4179 bufp->regs_allocated = REGS_REALLOCATE;
4180 regs->num_regs = num_regs;
4181 regs->start = starts;
4182 regs->end = ends;
4183 }
4184 else
4185 {
4186 bufp->regs_allocated = REGS_UNALLOCATED;
4187 regs->num_regs = 0;
4188 regs->start = regs->end = (regoff_t *) 0;
4189 }
4190}
c0f9ea08 4191WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4192\f
7814e705 4193/* Searching routines. */
fa9a63c5
RM
4194
4195/* Like re_search_2, below, but only one string is specified, and
4196 doesn't let you say where to stop matching. */
4197
4198int
4199re_search (bufp, string, size, startpos, range, regs)
4200 struct re_pattern_buffer *bufp;
4201 const char *string;
4202 int size, startpos, range;
4203 struct re_registers *regs;
4204{
5e69f11e 4205 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4206 regs, size);
4207}
c0f9ea08 4208WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4209
70806df6
KH
4210/* Head address of virtual concatenation of string. */
4211#define HEAD_ADDR_VSTRING(P) \
4212 (((P) >= size1 ? string2 : string1))
4213
b18215fc
RS
4214/* End address of virtual concatenation of string. */
4215#define STOP_ADDR_VSTRING(P) \
4216 (((P) >= size1 ? string2 + size2 : string1 + size1))
4217
4218/* Address of POS in the concatenation of virtual string. */
4219#define POS_ADDR_VSTRING(POS) \
4220 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4221
4222/* Using the compiled pattern in BUFP->buffer, first tries to match the
4223 virtual concatenation of STRING1 and STRING2, starting first at index
4224 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4225
fa9a63c5 4226 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4227
fa9a63c5
RM
4228 RANGE is how far to scan while trying to match. RANGE = 0 means try
4229 only at STARTPOS; in general, the last start tried is STARTPOS +
4230 RANGE.
5e69f11e 4231
fa9a63c5
RM
4232 In REGS, return the indices of the virtual concatenation of STRING1
4233 and STRING2 that matched the entire BUFP->buffer and its contained
4234 subexpressions.
5e69f11e 4235
fa9a63c5
RM
4236 Do not consider matching one past the index STOP in the virtual
4237 concatenation of STRING1 and STRING2.
4238
4239 We return either the position in the strings at which the match was
4240 found, -1 if no match, or -2 if error (such as failure
4241 stack overflow). */
4242
4243int
66f0296e 4244re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4245 struct re_pattern_buffer *bufp;
66f0296e 4246 const char *str1, *str2;
fa9a63c5
RM
4247 int size1, size2;
4248 int startpos;
4249 int range;
4250 struct re_registers *regs;
4251 int stop;
4252{
4253 int val;
66f0296e
SM
4254 re_char *string1 = (re_char*) str1;
4255 re_char *string2 = (re_char*) str2;
fa9a63c5 4256 register char *fastmap = bufp->fastmap;
6676cb1c 4257 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4258 int total_size = size1 + size2;
4259 int endpos = startpos + range;
c0f9ea08 4260 boolean anchored_start;
fa9a63c5 4261
7814e705 4262 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4263 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4264
fa9a63c5
RM
4265 /* Check for out-of-range STARTPOS. */
4266 if (startpos < 0 || startpos > total_size)
4267 return -1;
5e69f11e 4268
fa9a63c5 4269 /* Fix up RANGE if it might eventually take us outside
34597fa9 4270 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4271 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4272 if (endpos < 0)
4273 range = 0 - startpos;
fa9a63c5
RM
4274 else if (endpos > total_size)
4275 range = total_size - startpos;
4276
4277 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4278 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4279 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4280 {
4281 if (startpos > 0)
4282 return -1;
4283 else
7b140fd7 4284 range = 0;
fa9a63c5
RM
4285 }
4286
ae4788a8
RS
4287#ifdef emacs
4288 /* In a forward search for something that starts with \=.
4289 don't keep searching past point. */
4290 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4291 {
7b140fd7
RS
4292 range = PT_BYTE - BEGV_BYTE - startpos;
4293 if (range < 0)
ae4788a8
RS
4294 return -1;
4295 }
4296#endif /* emacs */
4297
fa9a63c5
RM
4298 /* Update the fastmap now if not correct already. */
4299 if (fastmap && !bufp->fastmap_accurate)
01618498 4300 re_compile_fastmap (bufp);
5e69f11e 4301
c8499ba5 4302 /* See whether the pattern is anchored. */
c0f9ea08 4303 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4304
b18215fc 4305#ifdef emacs
cc9b4df2
KH
4306 gl_state.object = re_match_object;
4307 {
99633e97 4308 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4309
4310 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4311 }
b18215fc
RS
4312#endif
4313
fa9a63c5
RM
4314 /* Loop through the string, looking for a place to start matching. */
4315 for (;;)
5e69f11e 4316 {
c8499ba5
RS
4317 /* If the pattern is anchored,
4318 skip quickly past places we cannot match.
4319 We don't bother to treat startpos == 0 specially
4320 because that case doesn't repeat. */
4321 if (anchored_start && startpos > 0)
4322 {
c0f9ea08
SM
4323 if (! ((startpos <= size1 ? string1[startpos - 1]
4324 : string2[startpos - size1 - 1])
4325 == '\n'))
c8499ba5
RS
4326 goto advance;
4327 }
4328
fa9a63c5 4329 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4330 cannot be the start of a match. If the pattern can match the
4331 null string, however, we don't need to skip characters; we want
7814e705 4332 the first null string. */
fa9a63c5
RM
4333 if (fastmap && startpos < total_size && !bufp->can_be_null)
4334 {
66f0296e 4335 register re_char *d;
01618498 4336 register re_wchar_t buf_ch;
e934739e
RS
4337
4338 d = POS_ADDR_VSTRING (startpos);
4339
7814e705 4340 if (range > 0) /* Searching forwards. */
fa9a63c5 4341 {
fa9a63c5
RM
4342 register int lim = 0;
4343 int irange = range;
4344
25fe55af
RS
4345 if (startpos < size1 && startpos + range >= size1)
4346 lim = range - (size1 - startpos);
fa9a63c5 4347
25fe55af
RS
4348 /* Written out as an if-else to avoid testing `translate'
4349 inside the loop. */
28ae27ae
AS
4350 if (RE_TRANSLATE_P (translate))
4351 {
e934739e
RS
4352 if (multibyte)
4353 while (range > lim)
4354 {
4355 int buf_charlen;
4356
4357 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4358 buf_charlen);
4359
4360 buf_ch = RE_TRANSLATE (translate, buf_ch);
4361 if (buf_ch >= 0400
4362 || fastmap[buf_ch])
4363 break;
4364
4365 range -= buf_charlen;
4366 d += buf_charlen;
4367 }
4368 else
cf1982d9
EZ
4369 {
4370 /* Convert *d to integer to shut up GCC's
4371 whining about comparison that is always
4372 true. */
4373 int di = *d;
4374
4375 while (range > lim
4376 && !fastmap[RE_TRANSLATE (translate, di)])
4377 {
4378 di = *(++d);
4379 range--;
4380 }
4381 }
e934739e 4382 }
fa9a63c5 4383 else
4560a582 4384 do
33c46939 4385 {
4560a582
SM
4386 re_char *d_start = d;
4387 while (range > lim && !fastmap[*d])
4388 {
4389 d++;
4390 range--;
4391 }
4392#ifdef emacs
4393 if (multibyte && range > lim)
4394 {
4395 /* Check that we are at the beginning of a char. */
4396 int at_boundary;
4397 AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
4398 if (at_boundary)
4399 break;
4400 else
4401 { /* We have matched an internal byte of a char
4402 rather than the leading byte, so it's a false
4403 positive: we should keep scanning. */
4404 d++; range--;
4405 }
4406 }
4407 else
4408#endif
4409 break;
4410 } while (1);
fa9a63c5
RM
4411
4412 startpos += irange - range;
4413 }
7814e705 4414 else /* Searching backwards. */
fa9a63c5 4415 {
2d1675e4
SM
4416 int room = (startpos >= size1
4417 ? size2 + size1 - startpos
4418 : size1 - startpos);
4419 buf_ch = RE_STRING_CHAR (d, room);
4420 buf_ch = TRANSLATE (buf_ch);
fa9a63c5 4421
e934739e
RS
4422 if (! (buf_ch >= 0400
4423 || fastmap[buf_ch]))
fa9a63c5
RM
4424 goto advance;
4425 }
4426 }
4427
4428 /* If can't match the null string, and that's all we have left, fail. */
4429 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4430 && !bufp->can_be_null)
fa9a63c5
RM
4431 return -1;
4432
4433 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4434 startpos, regs, stop);
4435#ifndef REGEX_MALLOC
0b32bf0e 4436# ifdef C_ALLOCA
fa9a63c5 4437 alloca (0);
0b32bf0e 4438# endif
fa9a63c5
RM
4439#endif
4440
4441 if (val >= 0)
4442 return startpos;
5e69f11e 4443
fa9a63c5
RM
4444 if (val == -2)
4445 return -2;
4446
4447 advance:
5e69f11e 4448 if (!range)
25fe55af 4449 break;
5e69f11e 4450 else if (range > 0)
25fe55af 4451 {
b18215fc
RS
4452 /* Update STARTPOS to the next character boundary. */
4453 if (multibyte)
4454 {
66f0296e
SM
4455 re_char *p = POS_ADDR_VSTRING (startpos);
4456 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4457 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4458
4459 range -= len;
4460 if (range < 0)
4461 break;
4462 startpos += len;
4463 }
4464 else
4465 {
b560c397
RS
4466 range--;
4467 startpos++;
4468 }
e318085a 4469 }
fa9a63c5 4470 else
25fe55af
RS
4471 {
4472 range++;
4473 startpos--;
b18215fc
RS
4474
4475 /* Update STARTPOS to the previous character boundary. */
4476 if (multibyte)
4477 {
70806df6
KH
4478 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4479 re_char *p0 = p;
4480 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4481
4482 /* Find the head of multibyte form. */
70806df6
KH
4483 PREV_CHAR_BOUNDARY (p, phead);
4484 range += p0 - 1 - p;
4485 if (range > 0)
4486 break;
b18215fc 4487
70806df6 4488 startpos -= p0 - 1 - p;
b18215fc 4489 }
25fe55af 4490 }
fa9a63c5
RM
4491 }
4492 return -1;
4493} /* re_search_2 */
c0f9ea08 4494WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4495\f
4496/* Declarations and macros for re_match_2. */
4497
2d1675e4
SM
4498static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4499 register int len,
4500 RE_TRANSLATE_TYPE translate,
4501 const int multibyte));
fa9a63c5
RM
4502
4503/* This converts PTR, a pointer into one of the search strings `string1'
4504 and `string2' into an offset from the beginning of that string. */
4505#define POINTER_TO_OFFSET(ptr) \
4506 (FIRST_STRING_P (ptr) \
4507 ? ((regoff_t) ((ptr) - string1)) \
4508 : ((regoff_t) ((ptr) - string2 + size1)))
4509
fa9a63c5 4510/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4511 string2 if necessary.
4512 Check re_match_2_internal for a discussion of why end_match_2 might
4513 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4514#define PREFETCH() \
25fe55af 4515 while (d == dend) \
fa9a63c5
RM
4516 { \
4517 /* End of string2 => fail. */ \
25fe55af
RS
4518 if (dend == end_match_2) \
4519 goto fail; \
4bb91c68 4520 /* End of string1 => advance to string2. */ \
25fe55af 4521 d = string2; \
fa9a63c5
RM
4522 dend = end_match_2; \
4523 }
4524
f1ad044f
SM
4525/* Call before fetching a char with *d if you already checked other limits.
4526 This is meant for use in lookahead operations like wordend, etc..
4527 where we might need to look at parts of the string that might be
4528 outside of the LIMITs (i.e past `stop'). */
4529#define PREFETCH_NOLIMIT() \
4530 if (d == end1) \
4531 { \
4532 d = string2; \
4533 dend = end_match_2; \
4534 } \
fa9a63c5
RM
4535
4536/* Test if at very beginning or at very end of the virtual concatenation
7814e705 4537 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4538#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4539#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4540
4541
4542/* Test if D points to a character which is word-constituent. We have
4543 two special cases to check for: if past the end of string1, look at
4544 the first character in string2; and if before the beginning of
4545 string2, look at the last character in string1. */
4546#define WORDCHAR_P(d) \
4547 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4548 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4549 == Sword)
4550
9121ca40 4551/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4552
4553/* The comment at case wordbound is following one, but we don't use
4554 AT_WORD_BOUNDARY anymore to support multibyte form.
4555
4556 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4557 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4558 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4559 macro and introducing temporary variables works around the bug. */
4560
9121ca40 4561#if 0
fa9a63c5
RM
4562/* Test if the character before D and the one at D differ with respect
4563 to being word-constituent. */
4564#define AT_WORD_BOUNDARY(d) \
4565 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4566 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4567#endif
fa9a63c5
RM
4568
4569/* Free everything we malloc. */
4570#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4571# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4572# define FREE_VARIABLES() \
fa9a63c5
RM
4573 do { \
4574 REGEX_FREE_STACK (fail_stack.stack); \
4575 FREE_VAR (regstart); \
4576 FREE_VAR (regend); \
fa9a63c5
RM
4577 FREE_VAR (best_regstart); \
4578 FREE_VAR (best_regend); \
fa9a63c5
RM
4579 } while (0)
4580#else
0b32bf0e 4581# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4582#endif /* not MATCH_MAY_ALLOCATE */
4583
505bde11
SM
4584\f
4585/* Optimization routines. */
4586
4e8a9132
SM
4587/* If the operation is a match against one or more chars,
4588 return a pointer to the next operation, else return NULL. */
01618498 4589static re_char *
4e8a9132 4590skip_one_char (p)
01618498 4591 re_char *p;
4e8a9132
SM
4592{
4593 switch (SWITCH_ENUM_CAST (*p++))
4594 {
4595 case anychar:
4596 break;
177c0ea7 4597
4e8a9132
SM
4598 case exactn:
4599 p += *p + 1;
4600 break;
4601
4602 case charset_not:
4603 case charset:
4604 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4605 {
4606 int mcnt;
4607 p = CHARSET_RANGE_TABLE (p - 1);
4608 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4609 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4610 }
4611 else
4612 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4613 break;
177c0ea7 4614
4e8a9132
SM
4615 case syntaxspec:
4616 case notsyntaxspec:
1fb352e0 4617#ifdef emacs
4e8a9132
SM
4618 case categoryspec:
4619 case notcategoryspec:
4620#endif /* emacs */
4621 p++;
4622 break;
4623
4624 default:
4625 p = NULL;
4626 }
4627 return p;
4628}
4629
4630
505bde11 4631/* Jump over non-matching operations. */
275a3409 4632static re_char *
4e8a9132 4633skip_noops (p, pend)
275a3409 4634 re_char *p, *pend;
505bde11
SM
4635{
4636 int mcnt;
4637 while (p < pend)
4638 {
4639 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4640 {
4641 case start_memory:
505bde11
SM
4642 case stop_memory:
4643 p += 2; break;
4644 case no_op:
4645 p += 1; break;
4646 case jump:
4647 p += 1;
4648 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4649 p += mcnt;
4650 break;
4651 default:
4652 return p;
4653 }
4654 }
4655 assert (p == pend);
4656 return p;
4657}
4658
4659/* Non-zero if "p1 matches something" implies "p2 fails". */
4660static int
4661mutually_exclusive_p (bufp, p1, p2)
4662 struct re_pattern_buffer *bufp;
275a3409 4663 re_char *p1, *p2;
505bde11 4664{
4e8a9132 4665 re_opcode_t op2;
2d1675e4 4666 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4667 unsigned char *pend = bufp->buffer + bufp->used;
4668
4e8a9132 4669 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4670 && p2 >= bufp->buffer && p2 <= pend);
4671
4672 /* Skip over open/close-group commands.
4673 If what follows this loop is a ...+ construct,
4674 look at what begins its body, since we will have to
4675 match at least one of that. */
4e8a9132
SM
4676 p2 = skip_noops (p2, pend);
4677 /* The same skip can be done for p1, except that this function
4678 is only used in the case where p1 is a simple match operator. */
4679 /* p1 = skip_noops (p1, pend); */
4680
4681 assert (p1 >= bufp->buffer && p1 < pend
4682 && p2 >= bufp->buffer && p2 <= pend);
4683
4684 op2 = p2 == pend ? succeed : *p2;
4685
4686 switch (SWITCH_ENUM_CAST (op2))
505bde11 4687 {
4e8a9132
SM
4688 case succeed:
4689 case endbuf:
4690 /* If we're at the end of the pattern, we can change. */
4691 if (skip_one_char (p1))
505bde11 4692 {
505bde11
SM
4693 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4694 return 1;
505bde11 4695 }
4e8a9132 4696 break;
177c0ea7 4697
4e8a9132 4698 case endline:
4e8a9132
SM
4699 case exactn:
4700 {
01618498 4701 register re_wchar_t c
4e8a9132 4702 = (re_opcode_t) *p2 == endline ? '\n'
411e4203 4703 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
505bde11 4704
4e8a9132
SM
4705 if ((re_opcode_t) *p1 == exactn)
4706 {
4707 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4708 {
4709 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4710 return 1;
4711 }
4712 }
505bde11 4713
4e8a9132
SM
4714 else if ((re_opcode_t) *p1 == charset
4715 || (re_opcode_t) *p1 == charset_not)
4716 {
4717 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4718
4e8a9132
SM
4719 /* Test if C is listed in charset (or charset_not)
4720 at `p1'. */
4721 if (SINGLE_BYTE_CHAR_P (c))
4722 {
4723 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4724 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4725 not = !not;
4726 }
4727 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4728 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4729
4e8a9132
SM
4730 /* `not' is equal to 1 if c would match, which means
4731 that we can't change to pop_failure_jump. */
4732 if (!not)
4733 {
4734 DEBUG_PRINT1 (" No match => fast loop.\n");
4735 return 1;
4736 }
4737 }
4738 else if ((re_opcode_t) *p1 == anychar
4739 && c == '\n')
4740 {
4741 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4742 return 1;
4743 }
4744 }
4745 break;
505bde11 4746
4e8a9132 4747 case charset:
4e8a9132
SM
4748 {
4749 if ((re_opcode_t) *p1 == exactn)
4750 /* Reuse the code above. */
4751 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4752
505bde11
SM
4753 /* It is hard to list up all the character in charset
4754 P2 if it includes multibyte character. Give up in
4755 such case. */
4756 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4757 {
4758 /* Now, we are sure that P2 has no range table.
4759 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4760 enough. But P1 may have range table, so the
505bde11
SM
4761 size of bitmap table of P1 is extracted by
4762 using macro `CHARSET_BITMAP_SIZE'.
4763
4764 Since we know that all the character listed in
4765 P2 is ASCII, it is enough to test only bitmap
4766 table of P1. */
4767
411e4203 4768 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4769 {
4770 int idx;
4771 /* We win if the charset inside the loop
4772 has no overlap with the one after the loop. */
4773 for (idx = 0;
4774 (idx < (int) p2[1]
4775 && idx < CHARSET_BITMAP_SIZE (p1));
4776 idx++)
4777 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4778 break;
4779
4780 if (idx == p2[1]
4781 || idx == CHARSET_BITMAP_SIZE (p1))
4782 {
4783 DEBUG_PRINT1 (" No match => fast loop.\n");
4784 return 1;
4785 }
4786 }
411e4203 4787 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4788 {
4789 int idx;
4790 /* We win if the charset_not inside the loop lists
7814e705 4791 every character listed in the charset after. */
505bde11
SM
4792 for (idx = 0; idx < (int) p2[1]; idx++)
4793 if (! (p2[2 + idx] == 0
4794 || (idx < CHARSET_BITMAP_SIZE (p1)
4795 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4796 break;
4797
4e8a9132
SM
4798 if (idx == p2[1])
4799 {
4800 DEBUG_PRINT1 (" No match => fast loop.\n");
4801 return 1;
4802 }
4803 }
4804 }
4805 }
609b757a 4806 break;
177c0ea7 4807
411e4203
SM
4808 case charset_not:
4809 switch (SWITCH_ENUM_CAST (*p1))
4810 {
4811 case exactn:
4812 case charset:
4813 /* Reuse the code above. */
4814 return mutually_exclusive_p (bufp, p2, p1);
4815 case charset_not:
4816 /* When we have two charset_not, it's very unlikely that
4817 they don't overlap. The union of the two sets of excluded
4818 chars should cover all possible chars, which, as a matter of
4819 fact, is virtually impossible in multibyte buffers. */
36595814 4820 break;
411e4203
SM
4821 }
4822 break;
4823
4e8a9132 4824 case wordend:
669fa600
SM
4825 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4826 case symend:
4e8a9132 4827 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4828 && (p1[1] == Ssymbol || p1[1] == Sword));
4829 case notsyntaxspec:
4830 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4831
4832 case wordbeg:
669fa600
SM
4833 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4834 case symbeg:
4e8a9132 4835 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4836 && (p1[1] == Ssymbol || p1[1] == Sword));
4837 case syntaxspec:
4838 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4839
4840 case wordbound:
4841 return (((re_opcode_t) *p1 == notsyntaxspec
4842 || (re_opcode_t) *p1 == syntaxspec)
4843 && p1[1] == Sword);
4844
1fb352e0 4845#ifdef emacs
4e8a9132
SM
4846 case categoryspec:
4847 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4848 case notcategoryspec:
4849 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4850#endif /* emacs */
4851
4852 default:
4853 ;
505bde11
SM
4854 }
4855
4856 /* Safe default. */
4857 return 0;
4858}
4859
fa9a63c5
RM
4860\f
4861/* Matching routines. */
4862
25fe55af 4863#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4864/* re_match is like re_match_2 except it takes only a single string. */
4865
4866int
4867re_match (bufp, string, size, pos, regs)
4868 struct re_pattern_buffer *bufp;
4869 const char *string;
4870 int size, pos;
4871 struct re_registers *regs;
4872{
4bb91c68 4873 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4874 pos, regs, size);
0b32bf0e 4875# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4876 alloca (0);
0b32bf0e 4877# endif
fa9a63c5
RM
4878 return result;
4879}
c0f9ea08 4880WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4881#endif /* not emacs */
4882
b18215fc
RS
4883#ifdef emacs
4884/* In Emacs, this is the string or buffer in which we
7814e705 4885 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4886Lisp_Object re_match_object;
4887#endif
fa9a63c5
RM
4888
4889/* re_match_2 matches the compiled pattern in BUFP against the
4890 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4891 and SIZE2, respectively). We start matching at POS, and stop
4892 matching at STOP.
5e69f11e 4893
fa9a63c5 4894 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 4895 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4896 documentation for exactly how many groups we fill.
4897
4898 We return -1 if no match, -2 if an internal error (such as the
7814e705 4899 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4900 matched substring. */
4901
4902int
4903re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4904 struct re_pattern_buffer *bufp;
4905 const char *string1, *string2;
4906 int size1, size2;
4907 int pos;
4908 struct re_registers *regs;
4909 int stop;
4910{
b18215fc 4911 int result;
25fe55af 4912
b18215fc 4913#ifdef emacs
cc9b4df2
KH
4914 int charpos;
4915 gl_state.object = re_match_object;
99633e97 4916 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4917 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4918#endif
4919
4bb91c68
SM
4920 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4921 (re_char*) string2, size2,
cc9b4df2 4922 pos, regs, stop);
0b32bf0e 4923#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4924 alloca (0);
a60198e5 4925#endif
fa9a63c5
RM
4926 return result;
4927}
c0f9ea08 4928WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5
RM
4929
4930/* This is a separate function so that we can force an alloca cleanup
7814e705 4931 afterwards. */
fa9a63c5
RM
4932static int
4933re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4934 struct re_pattern_buffer *bufp;
66f0296e 4935 re_char *string1, *string2;
fa9a63c5
RM
4936 int size1, size2;
4937 int pos;
4938 struct re_registers *regs;
4939 int stop;
4940{
4941 /* General temporaries. */
4942 int mcnt;
01618498 4943 size_t reg;
66f0296e 4944 boolean not;
fa9a63c5
RM
4945
4946 /* Just past the end of the corresponding string. */
66f0296e 4947 re_char *end1, *end2;
fa9a63c5
RM
4948
4949 /* Pointers into string1 and string2, just past the last characters in
7814e705 4950 each to consider matching. */
66f0296e 4951 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4952
4953 /* Where we are in the data, and the end of the current string. */
66f0296e 4954 re_char *d, *dend;
5e69f11e 4955
99633e97
SM
4956 /* Used sometimes to remember where we were before starting matching
4957 an operator so that we can go back in case of failure. This "atomic"
4958 behavior of matching opcodes is indispensable to the correctness
4959 of the on_failure_keep_string_jump optimization. */
4960 re_char *dfail;
4961
fa9a63c5 4962 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4963 re_char *p = bufp->buffer;
4964 re_char *pend = p + bufp->used;
fa9a63c5 4965
7814e705 4966 /* We use this to map every character in the string. */
6676cb1c 4967 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4968
46ac3660 4969 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4970 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4971
fa9a63c5
RM
4972 /* Failure point stack. Each place that can handle a failure further
4973 down the line pushes a failure point on this stack. It consists of
505bde11 4974 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4975 the subexpressions we're currently inside, plus the number of such
4976 registers, and, finally, two char *'s. The first char * is where
4977 to resume scanning the pattern; the second one is where to resume
7814e705
JB
4978 scanning the strings. */
4979#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4980 fail_stack_type fail_stack;
4981#endif
4982#ifdef DEBUG
fa9a63c5
RM
4983 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4984#endif
4985
0b32bf0e 4986#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4987 /* This holds the pointer to the failure stack, when
4988 it is allocated relocatably. */
4989 fail_stack_elt_t *failure_stack_ptr;
99633e97 4990#endif
fa9a63c5
RM
4991
4992 /* We fill all the registers internally, independent of what we
7814e705 4993 return, for use in backreferences. The number here includes
fa9a63c5 4994 an element for register zero. */
4bb91c68 4995 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4996
fa9a63c5
RM
4997 /* Information on the contents of registers. These are pointers into
4998 the input strings; they record just what was matched (on this
4999 attempt) by a subexpression part of the pattern, that is, the
5000 regnum-th regstart pointer points to where in the pattern we began
5001 matching and the regnum-th regend points to right after where we
5002 stopped matching the regnum-th subexpression. (The zeroth register
5003 keeps track of what the whole pattern matches.) */
5004#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5005 re_char **regstart, **regend;
fa9a63c5
RM
5006#endif
5007
fa9a63c5 5008 /* The following record the register info as found in the above
5e69f11e 5009 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5010 This happens as we backtrack through the failure points, which in
5011 turn happens only if we have not yet matched the entire string. */
5012 unsigned best_regs_set = false;
5013#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5014 re_char **best_regstart, **best_regend;
fa9a63c5 5015#endif
5e69f11e 5016
fa9a63c5
RM
5017 /* Logically, this is `best_regend[0]'. But we don't want to have to
5018 allocate space for that if we're not allocating space for anything
7814e705 5019 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5020 any of the other register vectors, and it seems rather a kludge to
5021 treat `best_regend' differently than the rest. So we keep track of
5022 the end of the best match so far in a separate variable. We
5023 initialize this to NULL so that when we backtrack the first time
5024 and need to test it, it's not garbage. */
66f0296e 5025 re_char *match_end = NULL;
fa9a63c5 5026
fa9a63c5
RM
5027#ifdef DEBUG
5028 /* Counts the total number of registers pushed. */
5e69f11e 5029 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5030#endif
5031
5032 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5033
fa9a63c5 5034 INIT_FAIL_STACK ();
5e69f11e 5035
fa9a63c5
RM
5036#ifdef MATCH_MAY_ALLOCATE
5037 /* Do not bother to initialize all the register variables if there are
5038 no groups in the pattern, as it takes a fair amount of time. If
5039 there are groups, we include space for register 0 (the whole
5040 pattern), even though we never use it, since it simplifies the
5041 array indexing. We should fix this. */
5042 if (bufp->re_nsub)
5043 {
66f0296e
SM
5044 regstart = REGEX_TALLOC (num_regs, re_char *);
5045 regend = REGEX_TALLOC (num_regs, re_char *);
5046 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5047 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5048
505bde11 5049 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5050 {
5051 FREE_VARIABLES ();
5052 return -2;
5053 }
fa9a63c5
RM
5054 }
5055 else
5056 {
5057 /* We must initialize all our variables to NULL, so that
25fe55af 5058 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5059 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5060 }
5061#endif /* MATCH_MAY_ALLOCATE */
5062
5063 /* The starting position is bogus. */
5064 if (pos < 0 || pos > size1 + size2)
5065 {
5066 FREE_VARIABLES ();
5067 return -1;
5068 }
5e69f11e 5069
fa9a63c5
RM
5070 /* Initialize subexpression text positions to -1 to mark ones that no
5071 start_memory/stop_memory has been seen for. Also initialize the
5072 register information struct. */
01618498
SM
5073 for (reg = 1; reg < num_regs; reg++)
5074 regstart[reg] = regend[reg] = NULL;
99633e97 5075
fa9a63c5 5076 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5077 `string1' is null. */
fa9a63c5
RM
5078 if (size2 == 0 && string1 != NULL)
5079 {
5080 string2 = string1;
5081 size2 = size1;
5082 string1 = 0;
5083 size1 = 0;
5084 }
5085 end1 = string1 + size1;
5086 end2 = string2 + size2;
5087
5e69f11e 5088 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5089 `dend' is the end of the input string that `d' points within. `d'
5090 is advanced into the following input string whenever necessary, but
5091 this happens before fetching; therefore, at the beginning of the
5092 loop, `d' can be pointing at the end of a string, but it cannot
5093 equal `string2'. */
419d1c74 5094 if (pos >= size1)
fa9a63c5 5095 {
419d1c74
SM
5096 /* Only match within string2. */
5097 d = string2 + pos - size1;
5098 dend = end_match_2 = string2 + stop - size1;
5099 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5100 }
5101 else
5102 {
f1ad044f 5103 if (stop < size1)
419d1c74
SM
5104 {
5105 /* Only match within string1. */
5106 end_match_1 = string1 + stop;
5107 /* BEWARE!
5108 When we reach end_match_1, PREFETCH normally switches to string2.
5109 But in the present case, this means that just doing a PREFETCH
5110 makes us jump from `stop' to `gap' within the string.
5111 What we really want here is for the search to stop as
5112 soon as we hit end_match_1. That's why we set end_match_2
5113 to end_match_1 (since PREFETCH fails as soon as we hit
5114 end_match_2). */
5115 end_match_2 = end_match_1;
5116 }
5117 else
f1ad044f
SM
5118 { /* It's important to use this code when stop == size so that
5119 moving `d' from end1 to string2 will not prevent the d == dend
5120 check from catching the end of string. */
419d1c74
SM
5121 end_match_1 = end1;
5122 end_match_2 = string2 + stop - size1;
5123 }
5124 d = string1 + pos;
5125 dend = end_match_1;
fa9a63c5
RM
5126 }
5127
5128 DEBUG_PRINT1 ("The compiled pattern is: ");
5129 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5130 DEBUG_PRINT1 ("The string to match is: `");
5131 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5132 DEBUG_PRINT1 ("'\n");
5e69f11e 5133
7814e705 5134 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5135 function if the match is complete, or it drops through if the match
5136 fails at this starting point in the input data. */
5137 for (;;)
5138 {
505bde11 5139 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5140
5141 if (p == pend)
5142 { /* End of pattern means we might have succeeded. */
25fe55af 5143 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5144
fa9a63c5 5145 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5146 longest match, try backtracking. */
5147 if (d != end_match_2)
fa9a63c5
RM
5148 {
5149 /* 1 if this match ends in the same string (string1 or string2)
5150 as the best previous match. */
5e69f11e 5151 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5152 == FIRST_STRING_P (d));
fa9a63c5
RM
5153 /* 1 if this match is the best seen so far. */
5154 boolean best_match_p;
5155
5156 /* AIX compiler got confused when this was combined
7814e705 5157 with the previous declaration. */
fa9a63c5
RM
5158 if (same_str_p)
5159 best_match_p = d > match_end;
5160 else
99633e97 5161 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5162
25fe55af
RS
5163 DEBUG_PRINT1 ("backtracking.\n");
5164
5165 if (!FAIL_STACK_EMPTY ())
5166 { /* More failure points to try. */
5167
5168 /* If exceeds best match so far, save it. */
5169 if (!best_regs_set || best_match_p)
5170 {
5171 best_regs_set = true;
5172 match_end = d;
5173
5174 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5175
01618498 5176 for (reg = 1; reg < num_regs; reg++)
25fe55af 5177 {
01618498
SM
5178 best_regstart[reg] = regstart[reg];
5179 best_regend[reg] = regend[reg];
25fe55af
RS
5180 }
5181 }
5182 goto fail;
5183 }
5184
5185 /* If no failure points, don't restore garbage. And if
5186 last match is real best match, don't restore second
5187 best one. */
5188 else if (best_regs_set && !best_match_p)
5189 {
5190 restore_best_regs:
5191 /* Restore best match. It may happen that `dend ==
5192 end_match_1' while the restored d is in string2.
5193 For example, the pattern `x.*y.*z' against the
5194 strings `x-' and `y-z-', if the two strings are
7814e705 5195 not consecutive in memory. */
25fe55af
RS
5196 DEBUG_PRINT1 ("Restoring best registers.\n");
5197
5198 d = match_end;
5199 dend = ((d >= string1 && d <= end1)
5200 ? end_match_1 : end_match_2);
fa9a63c5 5201
01618498 5202 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5203 {
01618498
SM
5204 regstart[reg] = best_regstart[reg];
5205 regend[reg] = best_regend[reg];
fa9a63c5 5206 }
25fe55af
RS
5207 }
5208 } /* d != end_match_2 */
fa9a63c5
RM
5209
5210 succeed_label:
25fe55af 5211 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5212
25fe55af
RS
5213 /* If caller wants register contents data back, do it. */
5214 if (regs && !bufp->no_sub)
fa9a63c5 5215 {
25fe55af
RS
5216 /* Have the register data arrays been allocated? */
5217 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5218 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5219 extra element beyond `num_regs' for the `-1' marker
5220 GNU code uses. */
5221 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5222 regs->start = TALLOC (regs->num_regs, regoff_t);
5223 regs->end = TALLOC (regs->num_regs, regoff_t);
5224 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5225 {
5226 FREE_VARIABLES ();
5227 return -2;
5228 }
25fe55af
RS
5229 bufp->regs_allocated = REGS_REALLOCATE;
5230 }
5231 else if (bufp->regs_allocated == REGS_REALLOCATE)
5232 { /* Yes. If we need more elements than were already
5233 allocated, reallocate them. If we need fewer, just
5234 leave it alone. */
5235 if (regs->num_regs < num_regs + 1)
5236 {
5237 regs->num_regs = num_regs + 1;
5238 RETALLOC (regs->start, regs->num_regs, regoff_t);
5239 RETALLOC (regs->end, regs->num_regs, regoff_t);
5240 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5241 {
5242 FREE_VARIABLES ();
5243 return -2;
5244 }
25fe55af
RS
5245 }
5246 }
5247 else
fa9a63c5
RM
5248 {
5249 /* These braces fend off a "empty body in an else-statement"
7814e705 5250 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5251 assert (bufp->regs_allocated == REGS_FIXED);
5252 }
5253
25fe55af
RS
5254 /* Convert the pointer data in `regstart' and `regend' to
5255 indices. Register zero has to be set differently,
5256 since we haven't kept track of any info for it. */
5257 if (regs->num_regs > 0)
5258 {
5259 regs->start[0] = pos;
99633e97 5260 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5261 }
5e69f11e 5262
25fe55af
RS
5263 /* Go through the first `min (num_regs, regs->num_regs)'
5264 registers, since that is all we initialized. */
01618498 5265 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5266 {
01618498
SM
5267 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5268 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5269 else
5270 {
01618498
SM
5271 regs->start[reg]
5272 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5273 regs->end[reg]
5274 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5275 }
fa9a63c5 5276 }
5e69f11e 5277
25fe55af
RS
5278 /* If the regs structure we return has more elements than
5279 were in the pattern, set the extra elements to -1. If
5280 we (re)allocated the registers, this is the case,
5281 because we always allocate enough to have at least one
7814e705 5282 -1 at the end. */
01618498
SM
5283 for (reg = num_regs; reg < regs->num_regs; reg++)
5284 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5285 } /* regs && !bufp->no_sub */
5286
25fe55af
RS
5287 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5288 nfailure_points_pushed, nfailure_points_popped,
5289 nfailure_points_pushed - nfailure_points_popped);
5290 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5291
99633e97 5292 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5293
25fe55af 5294 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5295
25fe55af
RS
5296 FREE_VARIABLES ();
5297 return mcnt;
5298 }
fa9a63c5 5299
7814e705 5300 /* Otherwise match next pattern command. */
fa9a63c5
RM
5301 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5302 {
25fe55af
RS
5303 /* Ignore these. Used to ignore the n of succeed_n's which
5304 currently have n == 0. */
5305 case no_op:
5306 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5307 break;
fa9a63c5
RM
5308
5309 case succeed:
25fe55af 5310 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5311 goto succeed_label;
5312
7814e705 5313 /* Match the next n pattern characters exactly. The following
25fe55af 5314 byte in the pattern defines n, and the n bytes after that
7814e705 5315 are the characters to match. */
fa9a63c5
RM
5316 case exactn:
5317 mcnt = *p++;
25fe55af 5318 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5319
99633e97
SM
5320 /* Remember the start point to rollback upon failure. */
5321 dfail = d;
5322
25fe55af
RS
5323 /* This is written out as an if-else so we don't waste time
5324 testing `translate' inside the loop. */
28703c16 5325 if (RE_TRANSLATE_P (translate))
fa9a63c5 5326 {
e934739e
RS
5327 if (multibyte)
5328 do
5329 {
5330 int pat_charlen, buf_charlen;
e71b1971 5331 unsigned int pat_ch, buf_ch;
e934739e
RS
5332
5333 PREFETCH ();
5334 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5335 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5336
5337 if (RE_TRANSLATE (translate, buf_ch)
5338 != pat_ch)
99633e97
SM
5339 {
5340 d = dfail;
5341 goto fail;
5342 }
e934739e
RS
5343
5344 p += pat_charlen;
5345 d += buf_charlen;
5346 mcnt -= pat_charlen;
5347 }
5348 while (mcnt > 0);
5349 else
e934739e
RS
5350 do
5351 {
cf1982d9
EZ
5352 /* Avoid compiler whining about comparison being
5353 always true. */
5354 int di;
5355
e934739e 5356 PREFETCH ();
cf1982d9
EZ
5357 di = *d;
5358 if (RE_TRANSLATE (translate, di) != *p++)
99633e97
SM
5359 {
5360 d = dfail;
5361 goto fail;
5362 }
33c46939 5363 d++;
e934739e
RS
5364 }
5365 while (--mcnt);
fa9a63c5
RM
5366 }
5367 else
5368 {
5369 do
5370 {
5371 PREFETCH ();
99633e97
SM
5372 if (*d++ != *p++)
5373 {
5374 d = dfail;
5375 goto fail;
5376 }
fa9a63c5
RM
5377 }
5378 while (--mcnt);
5379 }
25fe55af 5380 break;
fa9a63c5
RM
5381
5382
25fe55af 5383 /* Match any character except possibly a newline or a null. */
fa9a63c5 5384 case anychar:
e934739e
RS
5385 {
5386 int buf_charlen;
01618498 5387 re_wchar_t buf_ch;
fa9a63c5 5388
e934739e 5389 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5390
e934739e 5391 PREFETCH ();
2d1675e4 5392 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
5393 buf_ch = TRANSLATE (buf_ch);
5394
5395 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5396 && buf_ch == '\n')
5397 || ((bufp->syntax & RE_DOT_NOT_NULL)
5398 && buf_ch == '\000'))
5399 goto fail;
5400
e934739e
RS
5401 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5402 d += buf_charlen;
5403 }
fa9a63c5
RM
5404 break;
5405
5406
5407 case charset:
5408 case charset_not:
5409 {
b18215fc 5410 register unsigned int c;
fa9a63c5 5411 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5412 int len;
5413
5414 /* Start of actual range_table, or end of bitmap if there is no
5415 range table. */
01618498 5416 re_char *range_table;
b18215fc 5417
96cc36cc 5418 /* Nonzero if there is a range table. */
b18215fc
RS
5419 int range_table_exists;
5420
96cc36cc
RS
5421 /* Number of ranges of range table. This is not included
5422 in the initial byte-length of the command. */
5423 int count = 0;
fa9a63c5 5424
25fe55af 5425 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5426
b18215fc 5427 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5428
b18215fc 5429 if (range_table_exists)
96cc36cc
RS
5430 {
5431 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5432 EXTRACT_NUMBER_AND_INCR (count, range_table);
5433 }
b18215fc 5434
2d1675e4
SM
5435 PREFETCH ();
5436 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5437 c = TRANSLATE (c); /* The character to match. */
b18215fc
RS
5438
5439 if (SINGLE_BYTE_CHAR_P (c))
5440 { /* Lookup bitmap. */
b18215fc
RS
5441 /* Cast to `unsigned' instead of `unsigned char' in
5442 case the bit list is a full 32 bytes long. */
5443 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5444 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5445 not = !not;
b18215fc 5446 }
96cc36cc 5447#ifdef emacs
b18215fc 5448 else if (range_table_exists)
96cc36cc
RS
5449 {
5450 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5451
14473664
SM
5452 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5453 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5454 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5455 | (class_bits & BIT_SPACE && ISSPACE (c))
5456 | (class_bits & BIT_UPPER && ISUPPER (c))
5457 | (class_bits & BIT_WORD && ISWORD (c)))
5458 not = !not;
5459 else
5460 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5461 }
5462#endif /* emacs */
fa9a63c5 5463
96cc36cc
RS
5464 if (range_table_exists)
5465 p = CHARSET_RANGE_TABLE_END (range_table, count);
5466 else
5467 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5468
5469 if (!not) goto fail;
5e69f11e 5470
b18215fc 5471 d += len;
fa9a63c5
RM
5472 break;
5473 }
5474
5475
25fe55af 5476 /* The beginning of a group is represented by start_memory.
505bde11 5477 The argument is the register number. The text
25fe55af 5478 matched within the group is recorded (in the internal
7814e705 5479 registers data structure) under the register number. */
25fe55af 5480 case start_memory:
505bde11
SM
5481 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5482
5483 /* In case we need to undo this operation (via backtracking). */
5484 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5485
25fe55af 5486 regstart[*p] = d;
4bb91c68 5487 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5488 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5489
25fe55af 5490 /* Move past the register number and inner group count. */
505bde11 5491 p += 1;
25fe55af 5492 break;
fa9a63c5
RM
5493
5494
25fe55af 5495 /* The stop_memory opcode represents the end of a group. Its
505bde11 5496 argument is the same as start_memory's: the register number. */
fa9a63c5 5497 case stop_memory:
505bde11
SM
5498 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5499
5500 assert (!REG_UNSET (regstart[*p]));
5501 /* Strictly speaking, there should be code such as:
177c0ea7 5502
0b32bf0e 5503 assert (REG_UNSET (regend[*p]));
505bde11
SM
5504 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5505
5506 But the only info to be pushed is regend[*p] and it is known to
5507 be UNSET, so there really isn't anything to push.
5508 Not pushing anything, on the other hand deprives us from the
5509 guarantee that regend[*p] is UNSET since undoing this operation
5510 will not reset its value properly. This is not important since
5511 the value will only be read on the next start_memory or at
5512 the very end and both events can only happen if this stop_memory
5513 is *not* undone. */
fa9a63c5 5514
25fe55af 5515 regend[*p] = d;
fa9a63c5
RM
5516 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5517
25fe55af 5518 /* Move past the register number and the inner group count. */
505bde11 5519 p += 1;
25fe55af 5520 break;
fa9a63c5
RM
5521
5522
5523 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5524 followed by the numeric value of <digit> as the register number. */
5525 case duplicate:
fa9a63c5 5526 {
66f0296e 5527 register re_char *d2, *dend2;
7814e705 5528 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5529 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5530
7814e705 5531 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5532 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5533 goto fail;
5e69f11e 5534
7814e705 5535 /* Where in input to try to start matching. */
25fe55af 5536 d2 = regstart[regno];
5e69f11e 5537
99633e97
SM
5538 /* Remember the start point to rollback upon failure. */
5539 dfail = d;
5540
25fe55af
RS
5541 /* Where to stop matching; if both the place to start and
5542 the place to stop matching are in the same string, then
5543 set to the place to stop, otherwise, for now have to use
5544 the end of the first string. */
fa9a63c5 5545
25fe55af 5546 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5547 == FIRST_STRING_P (regend[regno]))
5548 ? regend[regno] : end_match_1);
5549 for (;;)
5550 {
5551 /* If necessary, advance to next segment in register
25fe55af 5552 contents. */
fa9a63c5
RM
5553 while (d2 == dend2)
5554 {
5555 if (dend2 == end_match_2) break;
5556 if (dend2 == regend[regno]) break;
5557
25fe55af
RS
5558 /* End of string1 => advance to string2. */
5559 d2 = string2;
5560 dend2 = regend[regno];
fa9a63c5
RM
5561 }
5562 /* At end of register contents => success */
5563 if (d2 == dend2) break;
5564
5565 /* If necessary, advance to next segment in data. */
5566 PREFETCH ();
5567
5568 /* How many characters left in this segment to match. */
5569 mcnt = dend - d;
5e69f11e 5570
fa9a63c5 5571 /* Want how many consecutive characters we can match in
25fe55af
RS
5572 one shot, so, if necessary, adjust the count. */
5573 if (mcnt > dend2 - d2)
fa9a63c5 5574 mcnt = dend2 - d2;
5e69f11e 5575
fa9a63c5 5576 /* Compare that many; failure if mismatch, else move
25fe55af 5577 past them. */
28703c16 5578 if (RE_TRANSLATE_P (translate)
2d1675e4 5579 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5580 : memcmp (d, d2, mcnt))
99633e97
SM
5581 {
5582 d = dfail;
5583 goto fail;
5584 }
fa9a63c5 5585 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5586 }
5587 }
5588 break;
5589
5590
25fe55af 5591 /* begline matches the empty string at the beginning of the string
c0f9ea08 5592 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5593 case begline:
25fe55af 5594 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5595
25fe55af
RS
5596 if (AT_STRINGS_BEG (d))
5597 {
5598 if (!bufp->not_bol) break;
5599 }
419d1c74 5600 else
25fe55af 5601 {
419d1c74
SM
5602 unsigned char c;
5603 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5604 if (c == '\n')
419d1c74 5605 break;
25fe55af
RS
5606 }
5607 /* In all other cases, we fail. */
5608 goto fail;
fa9a63c5
RM
5609
5610
25fe55af 5611 /* endline is the dual of begline. */
fa9a63c5 5612 case endline:
25fe55af 5613 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5614
25fe55af
RS
5615 if (AT_STRINGS_END (d))
5616 {
5617 if (!bufp->not_eol) break;
5618 }
f1ad044f 5619 else
25fe55af 5620 {
f1ad044f 5621 PREFETCH_NOLIMIT ();
c0f9ea08 5622 if (*d == '\n')
f1ad044f 5623 break;
25fe55af
RS
5624 }
5625 goto fail;
fa9a63c5
RM
5626
5627
5628 /* Match at the very beginning of the data. */
25fe55af
RS
5629 case begbuf:
5630 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5631 if (AT_STRINGS_BEG (d))
5632 break;
5633 goto fail;
fa9a63c5
RM
5634
5635
5636 /* Match at the very end of the data. */
25fe55af
RS
5637 case endbuf:
5638 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5639 if (AT_STRINGS_END (d))
5640 break;
25fe55af 5641 goto fail;
5e69f11e 5642
5e69f11e 5643
25fe55af
RS
5644 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5645 pushes NULL as the value for the string on the stack. Then
505bde11 5646 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5647 string, instead of restoring it. To see why, consider
7814e705 5648 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5649 then the . fails against the \n. But the next thing we want
5650 to do is match the \n against the \n; if we restored the
5651 string value, we would be back at the foo.
5652
5653 Because this is used only in specific cases, we don't need to
5654 check all the things that `on_failure_jump' does, to make
5655 sure the right things get saved on the stack. Hence we don't
5656 share its code. The only reason to push anything on the
5657 stack at all is that otherwise we would have to change
5658 `anychar's code to do something besides goto fail in this
5659 case; that seems worse than this. */
5660 case on_failure_keep_string_jump:
505bde11
SM
5661 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5662 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5663 mcnt, p + mcnt);
fa9a63c5 5664
505bde11
SM
5665 PUSH_FAILURE_POINT (p - 3, NULL);
5666 break;
5667
0683b6fa
SM
5668 /* A nasty loop is introduced by the non-greedy *? and +?.
5669 With such loops, the stack only ever contains one failure point
5670 at a time, so that a plain on_failure_jump_loop kind of
5671 cycle detection cannot work. Worse yet, such a detection
5672 can not only fail to detect a cycle, but it can also wrongly
5673 detect a cycle (between different instantiations of the same
6df42991 5674 loop).
0683b6fa
SM
5675 So the method used for those nasty loops is a little different:
5676 We use a special cycle-detection-stack-frame which is pushed
5677 when the on_failure_jump_nastyloop failure-point is *popped*.
5678 This special frame thus marks the beginning of one iteration
5679 through the loop and we can hence easily check right here
5680 whether something matched between the beginning and the end of
5681 the loop. */
5682 case on_failure_jump_nastyloop:
5683 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5684 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5685 mcnt, p + mcnt);
5686
5687 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5688 {
5689 int cycle = 0;
5690 CHECK_INFINITE_LOOP (p - 4, d);
5691 if (!cycle)
5692 /* If there's a cycle, just continue without pushing
5693 this failure point. The failure point is the "try again"
5694 option, which shouldn't be tried.
5695 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5696 PUSH_FAILURE_POINT (p - 3, d);
5697 }
0683b6fa
SM
5698 break;
5699
4e8a9132
SM
5700 /* Simple loop detecting on_failure_jump: just check on the
5701 failure stack if the same spot was already hit earlier. */
505bde11
SM
5702 case on_failure_jump_loop:
5703 on_failure:
5704 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5705 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5706 mcnt, p + mcnt);
6df42991
SM
5707 {
5708 int cycle = 0;
5709 CHECK_INFINITE_LOOP (p - 3, d);
5710 if (cycle)
5711 /* If there's a cycle, get out of the loop, as if the matching
5712 had failed. We used to just `goto fail' here, but that was
5713 aborting the search a bit too early: we want to keep the
5714 empty-loop-match and keep matching after the loop.
5715 We want (x?)*y\1z to match both xxyz and xxyxz. */
5716 p += mcnt;
5717 else
5718 PUSH_FAILURE_POINT (p - 3, d);
5719 }
25fe55af 5720 break;
fa9a63c5
RM
5721
5722
5723 /* Uses of on_failure_jump:
5e69f11e 5724
25fe55af
RS
5725 Each alternative starts with an on_failure_jump that points
5726 to the beginning of the next alternative. Each alternative
5727 except the last ends with a jump that in effect jumps past
5728 the rest of the alternatives. (They really jump to the
5729 ending jump of the following alternative, because tensioning
5730 these jumps is a hassle.)
fa9a63c5 5731
25fe55af
RS
5732 Repeats start with an on_failure_jump that points past both
5733 the repetition text and either the following jump or
5734 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5735 case on_failure_jump:
25fe55af 5736 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5737 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5738 mcnt, p + mcnt);
25fe55af 5739
505bde11 5740 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5741 break;
5742
4e8a9132 5743 /* This operation is used for greedy *.
505bde11
SM
5744 Compare the beginning of the repeat with what in the
5745 pattern follows its end. If we can establish that there
5746 is nothing that they would both match, i.e., that we
5747 would have to backtrack because of (as in, e.g., `a*a')
5748 then we can use a non-backtracking loop based on
4e8a9132 5749 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5750 case on_failure_jump_smart:
25fe55af 5751 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5752 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5753 mcnt, p + mcnt);
25fe55af 5754 {
01618498 5755 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5756 /* Here, we discard `const', making re_match non-reentrant. */
5757 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5758 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5759
505bde11
SM
5760 p -= 3; /* Reset so that we will re-execute the
5761 instruction once it's been changed. */
fa9a63c5 5762
4e8a9132
SM
5763 EXTRACT_NUMBER (mcnt, p2 - 2);
5764
5765 /* Ensure this is a indeed the trivial kind of loop
5766 we are expecting. */
5767 assert (skip_one_char (p1) == p2 - 3);
5768 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5769 DEBUG_STATEMENT (debug += 2);
505bde11 5770 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5771 {
505bde11 5772 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5773 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5774 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5775 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5776 }
505bde11 5777 else
fa9a63c5 5778 {
505bde11
SM
5779 /* Default to a safe `on_failure_jump' loop. */
5780 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5781 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5782 }
99633e97 5783 DEBUG_STATEMENT (debug -= 2);
25fe55af 5784 }
505bde11 5785 break;
25fe55af
RS
5786
5787 /* Unconditionally jump (without popping any failure points). */
5788 case jump:
fa9a63c5 5789 unconditional_jump:
5b370c2b 5790 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5791 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5792 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5793 p += mcnt; /* Do the jump. */
505bde11 5794 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5795 break;
5796
5797
25fe55af
RS
5798 /* Have to succeed matching what follows at least n times.
5799 After that, handle like `on_failure_jump'. */
5800 case succeed_n:
01618498 5801 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5802 EXTRACT_NUMBER (mcnt, p + 2);
5803 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5804
dc1e502d
SM
5805 /* Originally, mcnt is how many times we HAVE to succeed. */
5806 if (mcnt != 0)
25fe55af 5807 {
6dcf2d0e
SM
5808 /* Here, we discard `const', making re_match non-reentrant. */
5809 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5810 mcnt--;
01618498
SM
5811 p += 4;
5812 PUSH_NUMBER (p2, mcnt);
25fe55af 5813 }
dc1e502d
SM
5814 else
5815 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5816 goto on_failure;
25fe55af
RS
5817 break;
5818
5819 case jump_n:
01618498 5820 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5821 EXTRACT_NUMBER (mcnt, p + 2);
5822 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5823
5824 /* Originally, this is how many times we CAN jump. */
dc1e502d 5825 if (mcnt != 0)
25fe55af 5826 {
6dcf2d0e
SM
5827 /* Here, we discard `const', making re_match non-reentrant. */
5828 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5829 mcnt--;
01618498 5830 PUSH_NUMBER (p2, mcnt);
dc1e502d 5831 goto unconditional_jump;
25fe55af
RS
5832 }
5833 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5834 else
5835 p += 4;
25fe55af 5836 break;
5e69f11e 5837
fa9a63c5
RM
5838 case set_number_at:
5839 {
01618498 5840 unsigned char *p2; /* Location of the counter. */
25fe55af 5841 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5842
25fe55af 5843 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5844 /* Here, we discard `const', making re_match non-reentrant. */
5845 p2 = (unsigned char*) p + mcnt;
01618498 5846 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5847 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5848 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5849 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5850 break;
5851 }
9121ca40
KH
5852
5853 case wordbound:
66f0296e
SM
5854 case notwordbound:
5855 not = (re_opcode_t) *(p - 1) == notwordbound;
5856 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5857
99633e97 5858 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5859
b18215fc 5860 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5861 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5862 not = !not;
b18215fc
RS
5863 else
5864 {
5865 /* C1 is the character before D, S1 is the syntax of C1, C2
5866 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5867 re_wchar_t c1, c2;
5868 int s1, s2;
b18215fc 5869#ifdef emacs
2d1675e4
SM
5870 int offset = PTR_TO_OFFSET (d - 1);
5871 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5872 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5873#endif
66f0296e 5874 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5875 s1 = SYNTAX (c1);
5876#ifdef emacs
5d967c7a 5877 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5878#endif
f1ad044f 5879 PREFETCH_NOLIMIT ();
2d1675e4 5880 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc
RS
5881 s2 = SYNTAX (c2);
5882
5883 if (/* Case 2: Only one of S1 and S2 is Sword. */
5884 ((s1 == Sword) != (s2 == Sword))
5885 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 5886 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5887 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5888 not = !not;
5889 }
5890 if (not)
9121ca40 5891 break;
b18215fc 5892 else
9121ca40 5893 goto fail;
fa9a63c5
RM
5894
5895 case wordbeg:
25fe55af 5896 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5897
b18215fc
RS
5898 /* We FAIL in one of the following cases: */
5899
7814e705 5900 /* Case 1: D is at the end of string. */
b18215fc 5901 if (AT_STRINGS_END (d))
99633e97 5902 goto fail;
b18215fc
RS
5903 else
5904 {
5905 /* C1 is the character before D, S1 is the syntax of C1, C2
5906 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5907 re_wchar_t c1, c2;
5908 int s1, s2;
fa9a63c5 5909#ifdef emacs
2d1675e4
SM
5910 int offset = PTR_TO_OFFSET (d);
5911 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5912 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5913#endif
99633e97 5914 PREFETCH ();
2d1675e4 5915 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc 5916 s2 = SYNTAX (c2);
177c0ea7 5917
b18215fc
RS
5918 /* Case 2: S2 is not Sword. */
5919 if (s2 != Sword)
5920 goto fail;
5921
5922 /* Case 3: D is not at the beginning of string ... */
5923 if (!AT_STRINGS_BEG (d))
5924 {
5925 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5926#ifdef emacs
5d967c7a 5927 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5928#endif
b18215fc
RS
5929 s1 = SYNTAX (c1);
5930
5931 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 5932 returns 0. */
b18215fc
RS
5933 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5934 goto fail;
5935 }
5936 }
e318085a
RS
5937 break;
5938
b18215fc 5939 case wordend:
25fe55af 5940 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5941
5942 /* We FAIL in one of the following cases: */
5943
5944 /* Case 1: D is at the beginning of string. */
5945 if (AT_STRINGS_BEG (d))
e318085a 5946 goto fail;
b18215fc
RS
5947 else
5948 {
5949 /* C1 is the character before D, S1 is the syntax of C1, C2
5950 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5951 re_wchar_t c1, c2;
5952 int s1, s2;
5d967c7a 5953#ifdef emacs
2d1675e4
SM
5954 int offset = PTR_TO_OFFSET (d) - 1;
5955 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5956 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5957#endif
99633e97 5958 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5959 s1 = SYNTAX (c1);
5960
5961 /* Case 2: S1 is not Sword. */
5962 if (s1 != Sword)
5963 goto fail;
5964
5965 /* Case 3: D is not at the end of string ... */
5966 if (!AT_STRINGS_END (d))
5967 {
f1ad044f 5968 PREFETCH_NOLIMIT ();
2d1675e4 5969 c2 = RE_STRING_CHAR (d, dend - d);
5d967c7a 5970#ifdef emacs
134579f2 5971 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5d967c7a 5972#endif
b18215fc
RS
5973 s2 = SYNTAX (c2);
5974
5975 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 5976 returns 0. */
b18215fc 5977 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5978 goto fail;
b18215fc
RS
5979 }
5980 }
e318085a
RS
5981 break;
5982
669fa600
SM
5983 case symbeg:
5984 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
5985
5986 /* We FAIL in one of the following cases: */
5987
7814e705 5988 /* Case 1: D is at the end of string. */
669fa600
SM
5989 if (AT_STRINGS_END (d))
5990 goto fail;
5991 else
5992 {
5993 /* C1 is the character before D, S1 is the syntax of C1, C2
5994 is the character at D, and S2 is the syntax of C2. */
5995 re_wchar_t c1, c2;
5996 int s1, s2;
5997#ifdef emacs
5998 int offset = PTR_TO_OFFSET (d);
5999 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6000 UPDATE_SYNTAX_TABLE (charpos);
6001#endif
6002 PREFETCH ();
6003 c2 = RE_STRING_CHAR (d, dend - d);
6004 s2 = SYNTAX (c2);
7814e705 6005
669fa600
SM
6006 /* Case 2: S2 is neither Sword nor Ssymbol. */
6007 if (s2 != Sword && s2 != Ssymbol)
6008 goto fail;
6009
6010 /* Case 3: D is not at the beginning of string ... */
6011 if (!AT_STRINGS_BEG (d))
6012 {
6013 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6014#ifdef emacs
6015 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6016#endif
6017 s1 = SYNTAX (c1);
6018
6019 /* ... and S1 is Sword or Ssymbol. */
6020 if (s1 == Sword || s1 == Ssymbol)
6021 goto fail;
6022 }
6023 }
6024 break;
6025
6026 case symend:
6027 DEBUG_PRINT1 ("EXECUTING symend.\n");
6028
6029 /* We FAIL in one of the following cases: */
6030
6031 /* Case 1: D is at the beginning of string. */
6032 if (AT_STRINGS_BEG (d))
6033 goto fail;
6034 else
6035 {
6036 /* C1 is the character before D, S1 is the syntax of C1, C2
6037 is the character at D, and S2 is the syntax of C2. */
6038 re_wchar_t c1, c2;
6039 int s1, s2;
6040#ifdef emacs
6041 int offset = PTR_TO_OFFSET (d) - 1;
6042 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6043 UPDATE_SYNTAX_TABLE (charpos);
6044#endif
6045 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6046 s1 = SYNTAX (c1);
6047
6048 /* Case 2: S1 is neither Ssymbol nor Sword. */
6049 if (s1 != Sword && s1 != Ssymbol)
6050 goto fail;
6051
6052 /* Case 3: D is not at the end of string ... */
6053 if (!AT_STRINGS_END (d))
6054 {
6055 PREFETCH_NOLIMIT ();
6056 c2 = RE_STRING_CHAR (d, dend - d);
6057#ifdef emacs
134579f2 6058 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6059#endif
6060 s2 = SYNTAX (c2);
6061
6062 /* ... and S2 is Sword or Ssymbol. */
6063 if (s2 == Sword || s2 == Ssymbol)
6064 goto fail;
6065 }
6066 }
6067 break;
6068
fa9a63c5 6069 case syntaxspec:
1fb352e0
SM
6070 case notsyntaxspec:
6071 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6072 mcnt = *p++;
1fb352e0 6073 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6074 PREFETCH ();
b18215fc
RS
6075#ifdef emacs
6076 {
2d1675e4
SM
6077 int offset = PTR_TO_OFFSET (d);
6078 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6079 UPDATE_SYNTAX_TABLE (pos1);
6080 }
25fe55af 6081#endif
b18215fc 6082 {
01618498
SM
6083 int len;
6084 re_wchar_t c;
b18215fc 6085
2d1675e4 6086 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6087
990b2375 6088 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6089 goto fail;
b18215fc
RS
6090 d += len;
6091 }
fa9a63c5
RM
6092 break;
6093
b18215fc 6094#ifdef emacs
1fb352e0
SM
6095 case before_dot:
6096 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6097 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6098 goto fail;
b18215fc
RS
6099 break;
6100
1fb352e0
SM
6101 case at_dot:
6102 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6103 if (PTR_BYTE_POS (d) != PT_BYTE)
6104 goto fail;
6105 break;
b18215fc 6106
1fb352e0
SM
6107 case after_dot:
6108 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6109 if (PTR_BYTE_POS (d) <= PT_BYTE)
6110 goto fail;
e318085a 6111 break;
fa9a63c5 6112
1fb352e0 6113 case categoryspec:
b18215fc 6114 case notcategoryspec:
1fb352e0 6115 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6116 mcnt = *p++;
1fb352e0 6117 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6118 PREFETCH ();
6119 {
01618498
SM
6120 int len;
6121 re_wchar_t c;
6122
2d1675e4 6123 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6124
1fb352e0 6125 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6126 goto fail;
6127 d += len;
6128 }
fa9a63c5 6129 break;
5e69f11e 6130
1fb352e0 6131#endif /* emacs */
5e69f11e 6132
0b32bf0e
SM
6133 default:
6134 abort ();
fa9a63c5 6135 }
b18215fc 6136 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6137
6138
6139 /* We goto here if a matching operation fails. */
6140 fail:
5b370c2b 6141 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6142 if (!FAIL_STACK_EMPTY ())
505bde11 6143 {
01618498 6144 re_char *str, *pat;
505bde11 6145 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6146 DEBUG_PRINT1 ("\nFAIL:\n");
6147 POP_FAILURE_POINT (str, pat);
505bde11
SM
6148 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6149 {
6150 case on_failure_keep_string_jump:
6151 assert (str == NULL);
6152 goto continue_failure_jump;
6153
0683b6fa
SM
6154 case on_failure_jump_nastyloop:
6155 assert ((re_opcode_t)pat[-2] == no_op);
6156 PUSH_FAILURE_POINT (pat - 2, str);
6157 /* Fallthrough */
6158
505bde11
SM
6159 case on_failure_jump_loop:
6160 case on_failure_jump:
6161 case succeed_n:
6162 d = str;
6163 continue_failure_jump:
6164 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6165 p = pat + mcnt;
6166 break;
b18215fc 6167
0683b6fa
SM
6168 case no_op:
6169 /* A special frame used for nastyloops. */
6170 goto fail;
6171
505bde11
SM
6172 default:
6173 abort();
6174 }
fa9a63c5 6175
505bde11 6176 assert (p >= bufp->buffer && p <= pend);
b18215fc 6177
0b32bf0e 6178 if (d >= string1 && d <= end1)
fa9a63c5 6179 dend = end_match_1;
0b32bf0e 6180 }
fa9a63c5 6181 else
0b32bf0e 6182 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6183 } /* for (;;) */
6184
6185 if (best_regs_set)
6186 goto restore_best_regs;
6187
6188 FREE_VARIABLES ();
6189
b18215fc 6190 return -1; /* Failure to match. */
fa9a63c5
RM
6191} /* re_match_2 */
6192\f
6193/* Subroutine definitions for re_match_2. */
6194
fa9a63c5
RM
6195/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6196 bytes; nonzero otherwise. */
5e69f11e 6197
fa9a63c5 6198static int
2d1675e4
SM
6199bcmp_translate (s1, s2, len, translate, multibyte)
6200 re_char *s1, *s2;
fa9a63c5 6201 register int len;
6676cb1c 6202 RE_TRANSLATE_TYPE translate;
2d1675e4 6203 const int multibyte;
fa9a63c5 6204{
2d1675e4
SM
6205 register re_char *p1 = s1, *p2 = s2;
6206 re_char *p1_end = s1 + len;
6207 re_char *p2_end = s2 + len;
e934739e 6208
4bb91c68
SM
6209 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6210 different lengths, but relying on a single `len' would break this. -sm */
6211 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6212 {
e934739e 6213 int p1_charlen, p2_charlen;
01618498 6214 re_wchar_t p1_ch, p2_ch;
e934739e 6215
2d1675e4
SM
6216 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
6217 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
e934739e
RS
6218
6219 if (RE_TRANSLATE (translate, p1_ch)
6220 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6221 return 1;
e934739e
RS
6222
6223 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6224 }
e934739e
RS
6225
6226 if (p1 != p1_end || p2 != p2_end)
6227 return 1;
6228
fa9a63c5
RM
6229 return 0;
6230}
6231\f
6232/* Entry points for GNU code. */
6233
6234/* re_compile_pattern is the GNU regular expression compiler: it
6235 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6236 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6237
fa9a63c5
RM
6238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6239 are set in BUFP on entry.
5e69f11e 6240
b18215fc 6241 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6242
6243const char *
6244re_compile_pattern (pattern, length, bufp)
6245 const char *pattern;
0b32bf0e 6246 size_t length;
fa9a63c5
RM
6247 struct re_pattern_buffer *bufp;
6248{
6249 reg_errcode_t ret;
5e69f11e 6250
1208f11a
RS
6251#ifdef emacs
6252 gl_state.current_syntax_table = current_buffer->syntax_table;
6253#endif
6254
fa9a63c5
RM
6255 /* GNU code is written to assume at least RE_NREGS registers will be set
6256 (and at least one extra will be -1). */
6257 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6258
fa9a63c5
RM
6259 /* And GNU code determines whether or not to get register information
6260 by passing null for the REGS argument to re_match, etc., not by
6261 setting no_sub. */
6262 bufp->no_sub = 0;
5e69f11e 6263
4bb91c68 6264 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6265
6266 if (!ret)
6267 return NULL;
6268 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6269}
c0f9ea08 6270WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6271\f
b18215fc
RS
6272/* Entry points compatible with 4.2 BSD regex library. We don't define
6273 them unless specifically requested. */
fa9a63c5 6274
0b32bf0e 6275#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6276
6277/* BSD has one and only one pattern buffer. */
6278static struct re_pattern_buffer re_comp_buf;
6279
6280char *
0b32bf0e 6281# ifdef _LIBC
48afdd44
RM
6282/* Make these definitions weak in libc, so POSIX programs can redefine
6283 these names if they don't use our functions, and still use
6284 regcomp/regexec below without link errors. */
6285weak_function
0b32bf0e 6286# endif
fa9a63c5
RM
6287re_comp (s)
6288 const char *s;
6289{
6290 reg_errcode_t ret;
5e69f11e 6291
fa9a63c5
RM
6292 if (!s)
6293 {
6294 if (!re_comp_buf.buffer)
0b32bf0e 6295 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6296 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6297 return 0;
6298 }
6299
6300 if (!re_comp_buf.buffer)
6301 {
6302 re_comp_buf.buffer = (unsigned char *) malloc (200);
6303 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6304 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6305 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6306 re_comp_buf.allocated = 200;
6307
6308 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6309 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6310 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6311 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6312 }
6313
6314 /* Since `re_exec' always passes NULL for the `regs' argument, we
6315 don't need to initialize the pattern buffer fields which affect it. */
6316
fa9a63c5 6317 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6318
fa9a63c5
RM
6319 if (!ret)
6320 return NULL;
6321
6322 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6323 return (char *) gettext (re_error_msgid[(int) ret]);
6324}
6325
6326
6327int
0b32bf0e 6328# ifdef _LIBC
48afdd44 6329weak_function
0b32bf0e 6330# endif
fa9a63c5
RM
6331re_exec (s)
6332 const char *s;
6333{
6334 const int len = strlen (s);
6335 return
6336 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6337}
6338#endif /* _REGEX_RE_COMP */
6339\f
6340/* POSIX.2 functions. Don't define these for Emacs. */
6341
6342#ifndef emacs
6343
6344/* regcomp takes a regular expression as a string and compiles it.
6345
b18215fc 6346 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6347 since POSIX says we shouldn't. Thus, we set
6348
6349 `buffer' to the compiled pattern;
6350 `used' to the length of the compiled pattern;
6351 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6352 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6353 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6354 `fastmap' to an allocated space for the fastmap;
6355 `fastmap_accurate' to zero;
fa9a63c5
RM
6356 `re_nsub' to the number of subexpressions in PATTERN.
6357
6358 PATTERN is the address of the pattern string.
6359
6360 CFLAGS is a series of bits which affect compilation.
6361
6362 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6363 use POSIX basic syntax.
6364
6365 If REG_NEWLINE is set, then . and [^...] don't match newline.
6366 Also, regexec will try a match beginning after every newline.
6367
6368 If REG_ICASE is set, then we considers upper- and lowercase
6369 versions of letters to be equivalent when matching.
6370
6371 If REG_NOSUB is set, then when PREG is passed to regexec, that
6372 routine will report only success or failure, and nothing about the
6373 registers.
6374
b18215fc 6375 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6376 the return codes and their meanings.) */
6377
6378int
6379regcomp (preg, pattern, cflags)
ada30c0e
SM
6380 regex_t *__restrict preg;
6381 const char *__restrict pattern;
fa9a63c5
RM
6382 int cflags;
6383{
6384 reg_errcode_t ret;
4bb91c68 6385 reg_syntax_t syntax
fa9a63c5
RM
6386 = (cflags & REG_EXTENDED) ?
6387 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6388
6389 /* regex_compile will allocate the space for the compiled pattern. */
6390 preg->buffer = 0;
6391 preg->allocated = 0;
6392 preg->used = 0;
5e69f11e 6393
c0f9ea08 6394 /* Try to allocate space for the fastmap. */
a073faa6 6395 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6396
fa9a63c5
RM
6397 if (cflags & REG_ICASE)
6398 {
6399 unsigned i;
5e69f11e 6400
6676cb1c 6401 preg->translate
a073faa6 6402 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
8ea094cf 6403 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6404 if (preg->translate == NULL)
0b32bf0e 6405 return (int) REG_ESPACE;
fa9a63c5
RM
6406
6407 /* Map uppercase characters to corresponding lowercase ones. */
6408 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6409 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6410 }
6411 else
6412 preg->translate = NULL;
6413
6414 /* If REG_NEWLINE is set, newlines are treated differently. */
6415 if (cflags & REG_NEWLINE)
6416 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6417 syntax &= ~RE_DOT_NEWLINE;
6418 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6419 }
6420 else
c0f9ea08 6421 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6422
6423 preg->no_sub = !!(cflags & REG_NOSUB);
6424
5e69f11e 6425 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6426 can use strlen here in compiling the pattern. */
4bb91c68 6427 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6428
fa9a63c5
RM
6429 /* POSIX doesn't distinguish between an unmatched open-group and an
6430 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6431 if (ret == REG_ERPAREN)
6432 ret = REG_EPAREN;
6433
6434 if (ret == REG_NOERROR && preg->fastmap)
6435 { /* Compute the fastmap now, since regexec cannot modify the pattern
6436 buffer. */
6437 re_compile_fastmap (preg);
6438 if (preg->can_be_null)
6439 { /* The fastmap can't be used anyway. */
6440 free (preg->fastmap);
6441 preg->fastmap = NULL;
6442 }
6443 }
fa9a63c5
RM
6444 return (int) ret;
6445}
c0f9ea08 6446WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6447
6448
6449/* regexec searches for a given pattern, specified by PREG, in the
6450 string STRING.
5e69f11e 6451
fa9a63c5 6452 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6453 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6454 least NMATCH elements, and we set them to the offsets of the
6455 corresponding matched substrings.
5e69f11e 6456
fa9a63c5
RM
6457 EFLAGS specifies `execution flags' which affect matching: if
6458 REG_NOTBOL is set, then ^ does not match at the beginning of the
6459 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6460
fa9a63c5
RM
6461 We return 0 if we find a match and REG_NOMATCH if not. */
6462
6463int
6464regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6465 const regex_t *__restrict preg;
6466 const char *__restrict string;
5e69f11e 6467 size_t nmatch;
9f2dbe01 6468 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6469 int eflags;
6470{
6471 int ret;
6472 struct re_registers regs;
6473 regex_t private_preg;
6474 int len = strlen (string);
c0f9ea08 6475 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6476
6477 private_preg = *preg;
5e69f11e 6478
fa9a63c5
RM
6479 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6480 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6481
fa9a63c5
RM
6482 /* The user has told us exactly how many registers to return
6483 information about, via `nmatch'. We have to pass that on to the
b18215fc 6484 matching routines. */
fa9a63c5 6485 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6486
fa9a63c5
RM
6487 if (want_reg_info)
6488 {
6489 regs.num_regs = nmatch;
4bb91c68
SM
6490 regs.start = TALLOC (nmatch * 2, regoff_t);
6491 if (regs.start == NULL)
0b32bf0e 6492 return (int) REG_NOMATCH;
4bb91c68 6493 regs.end = regs.start + nmatch;
fa9a63c5
RM
6494 }
6495
c0f9ea08
SM
6496 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6497 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6498 was a little bit longer but still only matching the real part.
6499 This works because the `endline' will check for a '\n' and will find a
6500 '\0', correctly deciding that this is not the end of a line.
6501 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6502 a convenient '\0' there. For all we know, the string could be preceded
6503 by '\n' which would throw things off. */
6504
fa9a63c5
RM
6505 /* Perform the searching operation. */
6506 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6507 /* start: */ 0, /* range: */ len,
6508 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6509
fa9a63c5
RM
6510 /* Copy the register information to the POSIX structure. */
6511 if (want_reg_info)
6512 {
6513 if (ret >= 0)
0b32bf0e
SM
6514 {
6515 unsigned r;
fa9a63c5 6516
0b32bf0e
SM
6517 for (r = 0; r < nmatch; r++)
6518 {
6519 pmatch[r].rm_so = regs.start[r];
6520 pmatch[r].rm_eo = regs.end[r];
6521 }
6522 }
fa9a63c5 6523
b18215fc 6524 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6525 free (regs.start);
fa9a63c5
RM
6526 }
6527
6528 /* We want zero return to mean success, unlike `re_search'. */
6529 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6530}
c0f9ea08 6531WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6532
6533
6534/* Returns a message corresponding to an error code, ERRCODE, returned
6535 from either regcomp or regexec. We don't use PREG here. */
6536
6537size_t
6538regerror (errcode, preg, errbuf, errbuf_size)
6539 int errcode;
6540 const regex_t *preg;
6541 char *errbuf;
6542 size_t errbuf_size;
6543{
6544 const char *msg;
6545 size_t msg_size;
6546
6547 if (errcode < 0
6548 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6549 /* Only error codes returned by the rest of the code should be passed
b18215fc 6550 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6551 code generates an invalid error code, then the program has a bug.
6552 Dump core so we can fix it. */
6553 abort ();
6554
6555 msg = gettext (re_error_msgid[errcode]);
6556
6557 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6558
fa9a63c5
RM
6559 if (errbuf_size != 0)
6560 {
6561 if (msg_size > errbuf_size)
0b32bf0e
SM
6562 {
6563 strncpy (errbuf, msg, errbuf_size - 1);
6564 errbuf[errbuf_size - 1] = 0;
6565 }
fa9a63c5 6566 else
0b32bf0e 6567 strcpy (errbuf, msg);
fa9a63c5
RM
6568 }
6569
6570 return msg_size;
6571}
c0f9ea08 6572WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6573
6574
6575/* Free dynamically allocated space used by PREG. */
6576
6577void
6578regfree (preg)
6579 regex_t *preg;
6580{
6581 if (preg->buffer != NULL)
6582 free (preg->buffer);
6583 preg->buffer = NULL;
5e69f11e 6584
fa9a63c5
RM
6585 preg->allocated = 0;
6586 preg->used = 0;
6587
6588 if (preg->fastmap != NULL)
6589 free (preg->fastmap);
6590 preg->fastmap = NULL;
6591 preg->fastmap_accurate = 0;
6592
6593 if (preg->translate != NULL)
6594 free (preg->translate);
6595 preg->translate = NULL;
6596}
c0f9ea08 6597WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6598
6599#endif /* not emacs */
ab5796a9
MB
6600
6601/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6602 (do not change this comment) */