Merged from emacs@sv.gnu.org
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
1cee1e27 6 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
bc78d348 7
fa9a63c5
RM
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
4fc5845f 20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
25fe55af 21 USA. */
fa9a63c5 22
6df42991 23/* TODO:
505bde11 24 - structure the opcode space into opcode+flag.
dc1e502d 25 - merge with glibc's regex.[ch].
01618498 26 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
27 need to modify the compiled regexp so that re_match can be reentrant.
28 - get rid of on_failure_jump_smart by doing the optimization in re_comp
29 rather than at run-time, so that re_match can be reentrant.
01618498 30*/
505bde11 31
fa9a63c5 32/* AIX requires this to be the first thing in the file. */
0b32bf0e 33#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
34 #pragma alloca
35#endif
36
fa9a63c5 37#ifdef HAVE_CONFIG_H
0b32bf0e 38# include <config.h>
fa9a63c5
RM
39#endif
40
4bb91c68
SM
41#if defined STDC_HEADERS && !defined emacs
42# include <stddef.h>
43#else
44/* We need this for `regex.h', and perhaps for the Emacs include files. */
45# include <sys/types.h>
46#endif
fa9a63c5 47
14473664
SM
48/* Whether to use ISO C Amendment 1 wide char functions.
49 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
50#if defined _LIBC
51#define WIDE_CHAR_SUPPORT 1
52#else
14473664 53#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
54 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
55#endif
14473664
SM
56
57/* For platform which support the ISO C amendement 1 functionality we
58 support user defined character classes. */
a0ad02f7 59#if WIDE_CHAR_SUPPORT
14473664
SM
60/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
61# include <wchar.h>
62# include <wctype.h>
63#endif
64
c0f9ea08
SM
65#ifdef _LIBC
66/* We have to keep the namespace clean. */
67# define regfree(preg) __regfree (preg)
68# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
69# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
70# define regerror(errcode, preg, errbuf, errbuf_size) \
71 __regerror(errcode, preg, errbuf, errbuf_size)
72# define re_set_registers(bu, re, nu, st, en) \
73 __re_set_registers (bu, re, nu, st, en)
74# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
75 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
76# define re_match(bufp, string, size, pos, regs) \
77 __re_match (bufp, string, size, pos, regs)
78# define re_search(bufp, string, size, startpos, range, regs) \
79 __re_search (bufp, string, size, startpos, range, regs)
80# define re_compile_pattern(pattern, length, bufp) \
81 __re_compile_pattern (pattern, length, bufp)
82# define re_set_syntax(syntax) __re_set_syntax (syntax)
83# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
84 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
85# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
86
14473664
SM
87/* Make sure we call libc's function even if the user overrides them. */
88# define btowc __btowc
89# define iswctype __iswctype
90# define wctype __wctype
91
c0f9ea08
SM
92# define WEAK_ALIAS(a,b) weak_alias (a, b)
93
94/* We are also using some library internals. */
95# include <locale/localeinfo.h>
96# include <locale/elem-hash.h>
97# include <langinfo.h>
98#else
99# define WEAK_ALIAS(a,b)
100#endif
101
4bb91c68 102/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 103#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
104# include <libintl.h>
105#else
106# define gettext(msgid) (msgid)
107#endif
108
5e69f11e
RM
109#ifndef gettext_noop
110/* This define is so xgettext can find the internationalizable
111 strings. */
0b32bf0e 112# define gettext_noop(String) String
5e69f11e
RM
113#endif
114
fa9a63c5
RM
115/* The `emacs' switch turns on certain matching commands
116 that make sense only in Emacs. */
117#ifdef emacs
118
0b32bf0e
SM
119# include "lisp.h"
120# include "buffer.h"
b18215fc
RS
121
122/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 123# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 124
0b32bf0e
SM
125# include "syntax.h"
126# include "charset.h"
127# include "category.h"
fa9a63c5 128
7689ef0b
EZ
129# ifdef malloc
130# undef malloc
131# endif
0b32bf0e 132# define malloc xmalloc
7689ef0b
EZ
133# ifdef realloc
134# undef realloc
135# endif
0b32bf0e 136# define realloc xrealloc
7689ef0b
EZ
137# ifdef free
138# undef free
139# endif
0b32bf0e 140# define free xfree
9abbd165 141
0b32bf0e
SM
142/* Converts the pointer to the char to BEG-based offset from the start. */
143# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
144# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
145
146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
147# define RE_STRING_CHAR(p, s) \
4e8a9132 148 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
0b32bf0e 149# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
2d1675e4
SM
150 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
151
152/* Set C a (possibly multibyte) character before P. P points into a
153 string which is the virtual concatenation of STR1 (which ends at
154 END1) or STR2 (which ends at END2). */
0b32bf0e 155# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
2d1675e4
SM
156 do { \
157 if (multibyte) \
158 { \
0b32bf0e
SM
159 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
160 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
70806df6
KH
161 re_char *d0 = dtemp; \
162 PREV_CHAR_BOUNDARY (d0, dlimit); \
163 c = STRING_CHAR (d0, dtemp - d0); \
2d1675e4
SM
164 } \
165 else \
166 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
167 } while (0)
168
4e8a9132 169
fa9a63c5
RM
170#else /* not emacs */
171
172/* If we are not linking with Emacs proper,
173 we can't use the relocating allocator
174 even if config.h says that we can. */
0b32bf0e 175# undef REL_ALLOC
fa9a63c5 176
0b32bf0e
SM
177# if defined STDC_HEADERS || defined _LIBC
178# include <stdlib.h>
179# else
fa9a63c5
RM
180char *malloc ();
181char *realloc ();
0b32bf0e 182# endif
fa9a63c5 183
a77f947b
CY
184/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
185
186void *
187xmalloc (size)
188 size_t size;
189{
190 register void *val;
191 val = (void *) malloc (size);
192 if (!val && size)
193 {
194 write (2, "virtual memory exhausted\n", 25);
195 exit (1);
196 }
197 return val;
198}
199
200void *
201xrealloc (block, size)
202 void *block;
203 size_t size;
204{
205 register void *val;
206 /* We must call malloc explicitly when BLOCK is 0, since some
207 reallocs don't do this. */
208 if (! block)
209 val = (void *) malloc (size);
210 else
211 val = (void *) realloc (block, size);
212 if (!val && size)
213 {
214 write (2, "virtual memory exhausted\n", 25);
215 exit (1);
216 }
217 return val;
218}
219
a073faa6
CY
220# ifdef malloc
221# undef malloc
222# endif
223# define malloc xmalloc
224# ifdef realloc
225# undef realloc
226# endif
227# define realloc xrealloc
228
9e4ecb26 229/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 230 If nothing else has been done, use the method below. */
0b32bf0e
SM
231# ifdef INHIBIT_STRING_HEADER
232# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
233# if !defined bzero && !defined bcopy
234# undef INHIBIT_STRING_HEADER
235# endif
236# endif
237# endif
9e4ecb26 238
4bb91c68 239/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
240 This is used in most programs--a few other programs avoid this
241 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
242# ifndef INHIBIT_STRING_HEADER
243# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
244# include <string.h>
0b32bf0e 245# ifndef bzero
4bb91c68
SM
246# ifndef _LIBC
247# define bzero(s, n) (memset (s, '\0', n), (s))
248# else
249# define bzero(s, n) __bzero (s, n)
250# endif
0b32bf0e
SM
251# endif
252# else
253# include <strings.h>
4bb91c68
SM
254# ifndef memcmp
255# define memcmp(s1, s2, n) bcmp (s1, s2, n)
256# endif
257# ifndef memcpy
258# define memcpy(d, s, n) (bcopy (s, d, n), (d))
259# endif
0b32bf0e
SM
260# endif
261# endif
fa9a63c5
RM
262
263/* Define the syntax stuff for \<, \>, etc. */
264
990b2375 265/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 266enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 267
0b32bf0e
SM
268# ifdef SWITCH_ENUM_BUG
269# define SWITCH_ENUM_CAST(x) ((int)(x))
270# else
271# define SWITCH_ENUM_CAST(x) (x)
272# endif
fa9a63c5 273
e934739e 274/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
275# define BASE_LEADING_CODE_P(c) (0)
276# define CHAR_CHARSET(c) 0
277# define CHARSET_LEADING_CODE_BASE(c) 0
278# define MAX_MULTIBYTE_LENGTH 1
279# define RE_MULTIBYTE_P(x) 0
280# define WORD_BOUNDARY_P(c1, c2) (0)
281# define CHAR_HEAD_P(p) (1)
282# define SINGLE_BYTE_CHAR_P(c) (1)
283# define SAME_CHARSET_P(c1, c2) (1)
284# define MULTIBYTE_FORM_LENGTH(p, s) (1)
70806df6 285# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
0b32bf0e
SM
286# define STRING_CHAR(p, s) (*(p))
287# define RE_STRING_CHAR STRING_CHAR
288# define CHAR_STRING(c, s) (*(s) = (c), 1)
289# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
290# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
291# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 292 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
0b32bf0e 293# define MAKE_CHAR(charset, c1, c2) (c1)
fa9a63c5 294#endif /* not emacs */
4e8a9132
SM
295
296#ifndef RE_TRANSLATE
0b32bf0e
SM
297# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
298# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 299#endif
fa9a63c5
RM
300\f
301/* Get the interface, including the syntax bits. */
302#include "regex.h"
303
f71b19b6
DL
304/* isalpha etc. are used for the character classes. */
305#include <ctype.h>
fa9a63c5 306
f71b19b6 307#ifdef emacs
fa9a63c5 308
f71b19b6 309/* 1 if C is an ASCII character. */
0b32bf0e 310# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 311
f71b19b6 312/* 1 if C is a unibyte character. */
0b32bf0e 313# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 314
f71b19b6 315/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 316
f71b19b6 317/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
318# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
319# define ISCNTRL(c) ((c) < ' ')
320# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
321 || ((c) >= 'a' && (c) <= 'f') \
322 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
323
324/* This is only used for single-byte characters. */
0b32bf0e 325# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
326
327/* The rest must handle multibyte characters. */
328
0b32bf0e 329# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 330 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
331 : 1)
332
14473664 333# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 334 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
335 : 1)
336
0b32bf0e 337# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
338 ? (((c) >= 'a' && (c) <= 'z') \
339 || ((c) >= 'A' && (c) <= 'Z') \
340 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
341 : SYNTAX (c) == Sword)
342
0b32bf0e 343# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
344 ? (((c) >= 'a' && (c) <= 'z') \
345 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
346 : SYNTAX (c) == Sword)
347
0b32bf0e 348# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 349
0b32bf0e 350# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
351 ? ((c) > ' ' && (c) < 0177 \
352 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
353 || ((c) >= 'A' && (c) <= 'Z') \
354 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
355 : SYNTAX (c) != Sword)
356
0b32bf0e 357# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 358
0b32bf0e 359# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 360
0b32bf0e 361# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
362
363#else /* not emacs */
364
f71b19b6
DL
365/* Jim Meyering writes:
366
367 "... Some ctype macros are valid only for character codes that
368 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
369 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 370 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
371 STDC_HEADERS is defined, then autoconf has verified that the ctype
372 macros don't need to be guarded with references to isascii. ...
373 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
374 eliminate the && through constant folding."
375 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 376
4bb91c68 377# undef ISASCII
0b32bf0e
SM
378# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
379# define ISASCII(c) 1
380# else
381# define ISASCII(c) isascii(c)
382# endif
f71b19b6
DL
383
384/* 1 if C is an ASCII character. */
0b32bf0e 385# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
386
387/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
388# define ISUNIBYTE(c) 1
389
390# ifdef isblank
391# define ISBLANK(c) (ISASCII (c) && isblank (c))
392# else
393# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
394# endif
395# ifdef isgraph
396# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
397# else
398# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
399# endif
400
4bb91c68 401# undef ISPRINT
0b32bf0e
SM
402# define ISPRINT(c) (ISASCII (c) && isprint (c))
403# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
404# define ISALNUM(c) (ISASCII (c) && isalnum (c))
405# define ISALPHA(c) (ISASCII (c) && isalpha (c))
406# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
407# define ISLOWER(c) (ISASCII (c) && islower (c))
408# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
409# define ISSPACE(c) (ISASCII (c) && isspace (c))
410# define ISUPPER(c) (ISASCII (c) && isupper (c))
411# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
412
413# define ISWORD(c) ISALPHA(c)
414
4bb91c68
SM
415# ifdef _tolower
416# define TOLOWER(c) _tolower(c)
417# else
418# define TOLOWER(c) tolower(c)
419# endif
420
421/* How many characters in the character set. */
422# define CHAR_SET_SIZE 256
423
0b32bf0e 424# ifdef SYNTAX_TABLE
f71b19b6 425
0b32bf0e 426extern char *re_syntax_table;
f71b19b6 427
0b32bf0e
SM
428# else /* not SYNTAX_TABLE */
429
0b32bf0e
SM
430static char re_syntax_table[CHAR_SET_SIZE];
431
432static void
433init_syntax_once ()
434{
435 register int c;
436 static int done = 0;
437
438 if (done)
439 return;
440
441 bzero (re_syntax_table, sizeof re_syntax_table);
442
4bb91c68
SM
443 for (c = 0; c < CHAR_SET_SIZE; ++c)
444 if (ISALNUM (c))
445 re_syntax_table[c] = Sword;
fa9a63c5 446
669fa600 447 re_syntax_table['_'] = Ssymbol;
fa9a63c5 448
0b32bf0e
SM
449 done = 1;
450}
451
452# endif /* not SYNTAX_TABLE */
96cc36cc 453
4bb91c68
SM
454# define SYNTAX(c) re_syntax_table[(c)]
455
96cc36cc
RS
456#endif /* not emacs */
457\f
fa9a63c5 458#ifndef NULL
0b32bf0e 459# define NULL (void *)0
fa9a63c5
RM
460#endif
461
462/* We remove any previous definition of `SIGN_EXTEND_CHAR',
463 since ours (we hope) works properly with all combinations of
464 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 465 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
466#undef SIGN_EXTEND_CHAR
467#if __STDC__
0b32bf0e 468# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
469#else /* not __STDC__ */
470/* As in Harbison and Steele. */
0b32bf0e 471# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
472#endif
473\f
474/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
475 use `alloca' instead of `malloc'. This is because using malloc in
476 re_search* or re_match* could cause memory leaks when C-g is used in
477 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
478 the other hand, malloc is more portable, and easier to debug.
479
fa9a63c5
RM
480 Because we sometimes use alloca, some routines have to be macros,
481 not functions -- `alloca'-allocated space disappears at the end of the
482 function it is called in. */
483
484#ifdef REGEX_MALLOC
485
0b32bf0e
SM
486# define REGEX_ALLOCATE malloc
487# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
488# define REGEX_FREE free
fa9a63c5
RM
489
490#else /* not REGEX_MALLOC */
491
492/* Emacs already defines alloca, sometimes. */
0b32bf0e 493# ifndef alloca
fa9a63c5
RM
494
495/* Make alloca work the best possible way. */
0b32bf0e
SM
496# ifdef __GNUC__
497# define alloca __builtin_alloca
498# else /* not __GNUC__ */
499# if HAVE_ALLOCA_H
500# include <alloca.h>
501# endif /* HAVE_ALLOCA_H */
502# endif /* not __GNUC__ */
fa9a63c5 503
0b32bf0e 504# endif /* not alloca */
fa9a63c5 505
0b32bf0e 506# define REGEX_ALLOCATE alloca
fa9a63c5
RM
507
508/* Assumes a `char *destination' variable. */
0b32bf0e 509# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 510 (destination = (char *) alloca (nsize), \
4bb91c68 511 memcpy (destination, source, osize))
fa9a63c5
RM
512
513/* No need to do anything to free, after alloca. */
0b32bf0e 514# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
515
516#endif /* not REGEX_MALLOC */
517
518/* Define how to allocate the failure stack. */
519
0b32bf0e 520#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 521
0b32bf0e 522# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 523 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 524# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 525 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 526# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
527 r_alloc_free (&failure_stack_ptr)
528
4297555e 529#else /* not using relocating allocator */
fa9a63c5 530
0b32bf0e 531# ifdef REGEX_MALLOC
fa9a63c5 532
0b32bf0e
SM
533# define REGEX_ALLOCATE_STACK malloc
534# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
535# define REGEX_FREE_STACK free
fa9a63c5 536
0b32bf0e 537# else /* not REGEX_MALLOC */
fa9a63c5 538
0b32bf0e 539# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 540
0b32bf0e 541# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 542 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 543/* No need to explicitly free anything. */
0b32bf0e 544# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 545
0b32bf0e 546# endif /* not REGEX_MALLOC */
4297555e 547#endif /* not using relocating allocator */
fa9a63c5
RM
548
549
550/* True if `size1' is non-NULL and PTR is pointing anywhere inside
551 `string1' or just past its end. This works if PTR is NULL, which is
552 a good thing. */
25fe55af 553#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
554 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
555
556/* (Re)Allocate N items of type T using malloc, or fail. */
557#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
558#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
559#define RETALLOC_IF(addr, n, t) \
560 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
561#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
562
4bb91c68 563#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
564
565#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
566
567#undef MAX
568#undef MIN
569#define MAX(a, b) ((a) > (b) ? (a) : (b))
570#define MIN(a, b) ((a) < (b) ? (a) : (b))
571
6470ea05
SM
572/* Type of source-pattern and string chars. */
573typedef const unsigned char re_char;
574
fa9a63c5
RM
575typedef char boolean;
576#define false 0
577#define true 1
578
4bb91c68
SM
579static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
580 re_char *string1, int size1,
581 re_char *string2, int size2,
582 int pos,
583 struct re_registers *regs,
584 int stop));
fa9a63c5
RM
585\f
586/* These are the command codes that appear in compiled regular
4bb91c68 587 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
588 command code can specify any interpretation whatsoever for its
589 arguments. Zero bytes may appear in the compiled regular expression. */
590
591typedef enum
592{
593 no_op = 0,
594
4bb91c68 595 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
596 succeed,
597
25fe55af 598 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
599 exactn,
600
25fe55af 601 /* Matches any (more or less) character. */
fa9a63c5
RM
602 anychar,
603
25fe55af
RS
604 /* Matches any one char belonging to specified set. First
605 following byte is number of bitmap bytes. Then come bytes
606 for a bitmap saying which chars are in. Bits in each byte
607 are ordered low-bit-first. A character is in the set if its
608 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
609 automatically not in the set.
610
611 If the length byte has the 0x80 bit set, then that stuff
612 is followed by a range table:
613 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 614 See RANGE_TABLE_WORK_BITS below.
01618498 615 2 bytes, the number of pairs that follow (upto 32767)
96cc36cc 616 pairs, each 2 multibyte characters,
0b32bf0e 617 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
618 charset,
619
25fe55af 620 /* Same parameters as charset, but match any character that is
4bb91c68 621 not one of those specified. */
fa9a63c5
RM
622 charset_not,
623
25fe55af
RS
624 /* Start remembering the text that is matched, for storing in a
625 register. Followed by one byte with the register number, in
626 the range 0 to one less than the pattern buffer's re_nsub
505bde11 627 field. */
fa9a63c5
RM
628 start_memory,
629
25fe55af
RS
630 /* Stop remembering the text that is matched and store it in a
631 memory register. Followed by one byte with the register
632 number, in the range 0 to one less than `re_nsub' in the
505bde11 633 pattern buffer. */
fa9a63c5
RM
634 stop_memory,
635
25fe55af 636 /* Match a duplicate of something remembered. Followed by one
4bb91c68 637 byte containing the register number. */
fa9a63c5
RM
638 duplicate,
639
25fe55af 640 /* Fail unless at beginning of line. */
fa9a63c5
RM
641 begline,
642
4bb91c68 643 /* Fail unless at end of line. */
fa9a63c5
RM
644 endline,
645
25fe55af
RS
646 /* Succeeds if at beginning of buffer (if emacs) or at beginning
647 of string to be matched (if not). */
fa9a63c5
RM
648 begbuf,
649
25fe55af 650 /* Analogously, for end of buffer/string. */
fa9a63c5 651 endbuf,
5e69f11e 652
25fe55af 653 /* Followed by two byte relative address to which to jump. */
5e69f11e 654 jump,
fa9a63c5 655
25fe55af
RS
656 /* Followed by two-byte relative address of place to resume at
657 in case of failure. */
fa9a63c5 658 on_failure_jump,
5e69f11e 659
25fe55af
RS
660 /* Like on_failure_jump, but pushes a placeholder instead of the
661 current string position when executed. */
fa9a63c5 662 on_failure_keep_string_jump,
5e69f11e 663
505bde11
SM
664 /* Just like `on_failure_jump', except that it checks that we
665 don't get stuck in an infinite loop (matching an empty string
666 indefinitely). */
667 on_failure_jump_loop,
668
0683b6fa
SM
669 /* Just like `on_failure_jump_loop', except that it checks for
670 a different kind of loop (the kind that shows up with non-greedy
671 operators). This operation has to be immediately preceded
672 by a `no_op'. */
673 on_failure_jump_nastyloop,
674
0b32bf0e 675 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
676 It analyses the loop before which it is put and if the
677 loop does not require backtracking, it changes itself to
4e8a9132
SM
678 `on_failure_keep_string_jump' and short-circuits the loop,
679 else it just defaults to changing itself into `on_failure_jump'.
680 It assumes that it is pointing to just past a `jump'. */
505bde11 681 on_failure_jump_smart,
fa9a63c5 682
25fe55af 683 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
684 After matching N times, jump to the address upon failure.
685 Does not work if N starts at 0: use on_failure_jump_loop
686 instead. */
fa9a63c5
RM
687 succeed_n,
688
25fe55af
RS
689 /* Followed by two-byte relative address, and two-byte number n.
690 Jump to the address N times, then fail. */
fa9a63c5
RM
691 jump_n,
692
25fe55af
RS
693 /* Set the following two-byte relative address to the
694 subsequent two-byte number. The address *includes* the two
695 bytes of number. */
fa9a63c5
RM
696 set_number_at,
697
fa9a63c5
RM
698 wordbeg, /* Succeeds if at word beginning. */
699 wordend, /* Succeeds if at word end. */
700
701 wordbound, /* Succeeds if at a word boundary. */
1fb352e0 702 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 703
669fa600
SM
704 symbeg, /* Succeeds if at symbol beginning. */
705 symend, /* Succeeds if at symbol end. */
706
fa9a63c5 707 /* Matches any character whose syntax is specified. Followed by
25fe55af 708 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
709 syntaxspec,
710
711 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
712 notsyntaxspec
713
714#ifdef emacs
715 ,before_dot, /* Succeeds if before point. */
716 at_dot, /* Succeeds if at point. */
717 after_dot, /* Succeeds if after point. */
b18215fc
RS
718
719 /* Matches any character whose category-set contains the specified
25fe55af
RS
720 category. The operator is followed by a byte which contains a
721 category code (mnemonic ASCII character). */
b18215fc
RS
722 categoryspec,
723
724 /* Matches any character whose category-set does not contain the
725 specified category. The operator is followed by a byte which
726 contains the category code (mnemonic ASCII character). */
727 notcategoryspec
fa9a63c5
RM
728#endif /* emacs */
729} re_opcode_t;
730\f
731/* Common operations on the compiled pattern. */
732
733/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
734
735#define STORE_NUMBER(destination, number) \
736 do { \
737 (destination)[0] = (number) & 0377; \
738 (destination)[1] = (number) >> 8; \
739 } while (0)
740
741/* Same as STORE_NUMBER, except increment DESTINATION to
742 the byte after where the number is stored. Therefore, DESTINATION
743 must be an lvalue. */
744
745#define STORE_NUMBER_AND_INCR(destination, number) \
746 do { \
747 STORE_NUMBER (destination, number); \
748 (destination) += 2; \
749 } while (0)
750
751/* Put into DESTINATION a number stored in two contiguous bytes starting
752 at SOURCE. */
753
754#define EXTRACT_NUMBER(destination, source) \
755 do { \
756 (destination) = *(source) & 0377; \
757 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
758 } while (0)
759
760#ifdef DEBUG
4bb91c68 761static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
762static void
763extract_number (dest, source)
764 int *dest;
01618498 765 re_char *source;
fa9a63c5 766{
5e69f11e 767 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
768 *dest = *source & 0377;
769 *dest += temp << 8;
770}
771
4bb91c68 772# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
773# undef EXTRACT_NUMBER
774# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
775# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
776
777#endif /* DEBUG */
778
779/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
780 SOURCE must be an lvalue. */
781
782#define EXTRACT_NUMBER_AND_INCR(destination, source) \
783 do { \
784 EXTRACT_NUMBER (destination, source); \
25fe55af 785 (source) += 2; \
fa9a63c5
RM
786 } while (0)
787
788#ifdef DEBUG
4bb91c68
SM
789static void extract_number_and_incr _RE_ARGS ((int *destination,
790 re_char **source));
fa9a63c5
RM
791static void
792extract_number_and_incr (destination, source)
793 int *destination;
01618498 794 re_char **source;
5e69f11e 795{
fa9a63c5
RM
796 extract_number (destination, *source);
797 *source += 2;
798}
799
0b32bf0e
SM
800# ifndef EXTRACT_MACROS
801# undef EXTRACT_NUMBER_AND_INCR
802# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 803 extract_number_and_incr (&dest, &src)
0b32bf0e 804# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
805
806#endif /* DEBUG */
807\f
b18215fc
RS
808/* Store a multibyte character in three contiguous bytes starting
809 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 810 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
811
812#define STORE_CHARACTER_AND_INCR(destination, character) \
813 do { \
814 (destination)[0] = (character) & 0377; \
815 (destination)[1] = ((character) >> 8) & 0377; \
816 (destination)[2] = (character) >> 16; \
817 (destination) += 3; \
818 } while (0)
819
820/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 821 starting at SOURCE. */
b18215fc
RS
822
823#define EXTRACT_CHARACTER(destination, source) \
824 do { \
825 (destination) = ((source)[0] \
826 | ((source)[1] << 8) \
827 | ((source)[2] << 16)); \
828 } while (0)
829
830
831/* Macros for charset. */
832
833/* Size of bitmap of charset P in bytes. P is a start of charset,
834 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
835#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
836
837/* Nonzero if charset P has range table. */
25fe55af 838#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
839
840/* Return the address of range table of charset P. But not the start
841 of table itself, but the before where the number of ranges is
96cc36cc
RS
842 stored. `2 +' means to skip re_opcode_t and size of bitmap,
843 and the 2 bytes of flags at the start of the range table. */
844#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
845
846/* Extract the bit flags that start a range table. */
847#define CHARSET_RANGE_TABLE_BITS(p) \
848 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
849 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
850
851/* Test if C is listed in the bitmap of charset P. */
852#define CHARSET_LOOKUP_BITMAP(p, c) \
853 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
854 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
855
856/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
857 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
858 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
859 and end. */
860#define CHARSET_RANGE_TABLE_END(range_table, count) \
861 ((range_table) + (count) * 2 * 3)
862
25fe55af 863/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
864 COUNT is number of ranges in RANGE_TABLE. */
865#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
866 do \
867 { \
01618498
SM
868 re_wchar_t range_start, range_end; \
869 re_char *p; \
870 re_char *range_table_end \
b18215fc
RS
871 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
872 \
873 for (p = (range_table); p < range_table_end; p += 2 * 3) \
874 { \
875 EXTRACT_CHARACTER (range_start, p); \
876 EXTRACT_CHARACTER (range_end, p + 3); \
877 \
878 if (range_start <= (c) && (c) <= range_end) \
879 { \
880 (not) = !(not); \
881 break; \
882 } \
883 } \
884 } \
885 while (0)
886
887/* Test if C is in range table of CHARSET. The flag NOT is negated if
888 C is listed in it. */
889#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
890 do \
891 { \
892 /* Number of ranges in range table. */ \
893 int count; \
01618498
SM
894 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
895 \
b18215fc
RS
896 EXTRACT_NUMBER_AND_INCR (count, range_table); \
897 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
898 } \
899 while (0)
900\f
fa9a63c5
RM
901/* If DEBUG is defined, Regex prints many voluminous messages about what
902 it is doing (if the variable `debug' is nonzero). If linked with the
903 main program in `iregex.c', you can enter patterns and strings
904 interactively. And if linked with the main program in `main.c' and
4bb91c68 905 the other test files, you can run the already-written tests. */
fa9a63c5
RM
906
907#ifdef DEBUG
908
909/* We use standard I/O for debugging. */
0b32bf0e 910# include <stdio.h>
fa9a63c5
RM
911
912/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 913# include <assert.h>
fa9a63c5 914
99633e97 915static int debug = -100000;
fa9a63c5 916
0b32bf0e
SM
917# define DEBUG_STATEMENT(e) e
918# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
919# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
920# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
921# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
922# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 923 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 924# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 925 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
926
927
928/* Print the fastmap in human-readable form. */
929
930void
931print_fastmap (fastmap)
932 char *fastmap;
933{
934 unsigned was_a_range = 0;
5e69f11e
RM
935 unsigned i = 0;
936
fa9a63c5
RM
937 while (i < (1 << BYTEWIDTH))
938 {
939 if (fastmap[i++])
940 {
941 was_a_range = 0;
25fe55af
RS
942 putchar (i - 1);
943 while (i < (1 << BYTEWIDTH) && fastmap[i])
944 {
945 was_a_range = 1;
946 i++;
947 }
fa9a63c5 948 if (was_a_range)
25fe55af
RS
949 {
950 printf ("-");
951 putchar (i - 1);
952 }
953 }
fa9a63c5 954 }
5e69f11e 955 putchar ('\n');
fa9a63c5
RM
956}
957
958
959/* Print a compiled pattern string in human-readable form, starting at
960 the START pointer into it and ending just before the pointer END. */
961
962void
963print_partial_compiled_pattern (start, end)
01618498
SM
964 re_char *start;
965 re_char *end;
fa9a63c5
RM
966{
967 int mcnt, mcnt2;
01618498
SM
968 re_char *p = start;
969 re_char *pend = end;
fa9a63c5
RM
970
971 if (start == NULL)
972 {
a1a052df 973 fprintf (stderr, "(null)\n");
fa9a63c5
RM
974 return;
975 }
5e69f11e 976
fa9a63c5
RM
977 /* Loop over pattern commands. */
978 while (p < pend)
979 {
a1a052df 980 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
981
982 switch ((re_opcode_t) *p++)
983 {
25fe55af 984 case no_op:
a1a052df 985 fprintf (stderr, "/no_op");
25fe55af 986 break;
fa9a63c5 987
99633e97 988 case succeed:
a1a052df 989 fprintf (stderr, "/succeed");
99633e97
SM
990 break;
991
fa9a63c5
RM
992 case exactn:
993 mcnt = *p++;
a1a052df 994 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 995 do
fa9a63c5 996 {
a1a052df 997 fprintf (stderr, "/%c", *p++);
25fe55af
RS
998 }
999 while (--mcnt);
1000 break;
fa9a63c5
RM
1001
1002 case start_memory:
a1a052df 1003 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1004 break;
fa9a63c5
RM
1005
1006 case stop_memory:
a1a052df 1007 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1008 break;
fa9a63c5
RM
1009
1010 case duplicate:
a1a052df 1011 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1012 break;
1013
1014 case anychar:
a1a052df 1015 fprintf (stderr, "/anychar");
fa9a63c5
RM
1016 break;
1017
1018 case charset:
25fe55af
RS
1019 case charset_not:
1020 {
1021 register int c, last = -100;
fa9a63c5 1022 register int in_range = 0;
99633e97
SM
1023 int length = CHARSET_BITMAP_SIZE (p - 1);
1024 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1025
a1a052df 1026 fprintf (stderr, "/charset [%s",
275a3409 1027 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1028
275a3409
RS
1029 if (p + *p >= pend)
1030 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1031
25fe55af 1032 for (c = 0; c < 256; c++)
96cc36cc 1033 if (c / 8 < length
fa9a63c5
RM
1034 && (p[1 + (c/8)] & (1 << (c % 8))))
1035 {
1036 /* Are we starting a range? */
1037 if (last + 1 == c && ! in_range)
1038 {
a1a052df 1039 fprintf (stderr, "-");
fa9a63c5
RM
1040 in_range = 1;
1041 }
1042 /* Have we broken a range? */
1043 else if (last + 1 != c && in_range)
96cc36cc 1044 {
a1a052df 1045 fprintf (stderr, "%c", last);
fa9a63c5
RM
1046 in_range = 0;
1047 }
5e69f11e 1048
fa9a63c5 1049 if (! in_range)
a1a052df 1050 fprintf (stderr, "%c", c);
fa9a63c5
RM
1051
1052 last = c;
25fe55af 1053 }
fa9a63c5
RM
1054
1055 if (in_range)
a1a052df 1056 fprintf (stderr, "%c", last);
fa9a63c5 1057
a1a052df 1058 fprintf (stderr, "]");
fa9a63c5 1059
99633e97 1060 p += 1 + length;
96cc36cc 1061
96cc36cc 1062 if (has_range_table)
99633e97
SM
1063 {
1064 int count;
a1a052df 1065 fprintf (stderr, "has-range-table");
99633e97
SM
1066
1067 /* ??? Should print the range table; for now, just skip it. */
1068 p += 2; /* skip range table bits */
1069 EXTRACT_NUMBER_AND_INCR (count, p);
1070 p = CHARSET_RANGE_TABLE_END (p, count);
1071 }
fa9a63c5
RM
1072 }
1073 break;
1074
1075 case begline:
a1a052df 1076 fprintf (stderr, "/begline");
25fe55af 1077 break;
fa9a63c5
RM
1078
1079 case endline:
a1a052df 1080 fprintf (stderr, "/endline");
25fe55af 1081 break;
fa9a63c5
RM
1082
1083 case on_failure_jump:
25fe55af 1084 extract_number_and_incr (&mcnt, &p);
a1a052df 1085 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1086 break;
fa9a63c5
RM
1087
1088 case on_failure_keep_string_jump:
25fe55af 1089 extract_number_and_incr (&mcnt, &p);
a1a052df 1090 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1091 break;
fa9a63c5 1092
0683b6fa
SM
1093 case on_failure_jump_nastyloop:
1094 extract_number_and_incr (&mcnt, &p);
a1a052df 1095 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1096 break;
1097
505bde11 1098 case on_failure_jump_loop:
fa9a63c5 1099 extract_number_and_incr (&mcnt, &p);
a1a052df 1100 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1101 break;
1102
505bde11 1103 case on_failure_jump_smart:
fa9a63c5 1104 extract_number_and_incr (&mcnt, &p);
a1a052df 1105 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1106 break;
1107
25fe55af 1108 case jump:
fa9a63c5 1109 extract_number_and_incr (&mcnt, &p);
a1a052df 1110 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1111 break;
1112
25fe55af
RS
1113 case succeed_n:
1114 extract_number_and_incr (&mcnt, &p);
1115 extract_number_and_incr (&mcnt2, &p);
a1a052df 1116 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1117 break;
5e69f11e 1118
25fe55af
RS
1119 case jump_n:
1120 extract_number_and_incr (&mcnt, &p);
1121 extract_number_and_incr (&mcnt2, &p);
a1a052df 1122 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1123 break;
5e69f11e 1124
25fe55af
RS
1125 case set_number_at:
1126 extract_number_and_incr (&mcnt, &p);
1127 extract_number_and_incr (&mcnt2, &p);
a1a052df 1128 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1129 break;
5e69f11e 1130
25fe55af 1131 case wordbound:
a1a052df 1132 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1133 break;
1134
1135 case notwordbound:
a1a052df 1136 fprintf (stderr, "/notwordbound");
25fe55af 1137 break;
fa9a63c5
RM
1138
1139 case wordbeg:
a1a052df 1140 fprintf (stderr, "/wordbeg");
fa9a63c5 1141 break;
5e69f11e 1142
fa9a63c5 1143 case wordend:
a1a052df 1144 fprintf (stderr, "/wordend");
e2543b02 1145 break;
5e69f11e 1146
669fa600 1147 case symbeg:
e2543b02 1148 fprintf (stderr, "/symbeg");
669fa600
SM
1149 break;
1150
1151 case symend:
e2543b02 1152 fprintf (stderr, "/symend");
669fa600
SM
1153 break;
1154
1fb352e0 1155 case syntaxspec:
a1a052df 1156 fprintf (stderr, "/syntaxspec");
1fb352e0 1157 mcnt = *p++;
a1a052df 1158 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1159 break;
1160
1161 case notsyntaxspec:
a1a052df 1162 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1163 mcnt = *p++;
a1a052df 1164 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1165 break;
1166
0b32bf0e 1167# ifdef emacs
fa9a63c5 1168 case before_dot:
a1a052df 1169 fprintf (stderr, "/before_dot");
25fe55af 1170 break;
fa9a63c5
RM
1171
1172 case at_dot:
a1a052df 1173 fprintf (stderr, "/at_dot");
25fe55af 1174 break;
fa9a63c5
RM
1175
1176 case after_dot:
a1a052df 1177 fprintf (stderr, "/after_dot");
25fe55af 1178 break;
fa9a63c5 1179
1fb352e0 1180 case categoryspec:
a1a052df 1181 fprintf (stderr, "/categoryspec");
fa9a63c5 1182 mcnt = *p++;
a1a052df 1183 fprintf (stderr, "/%d", mcnt);
25fe55af 1184 break;
5e69f11e 1185
1fb352e0 1186 case notcategoryspec:
a1a052df 1187 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1188 mcnt = *p++;
a1a052df 1189 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1190 break;
0b32bf0e 1191# endif /* emacs */
fa9a63c5 1192
fa9a63c5 1193 case begbuf:
a1a052df 1194 fprintf (stderr, "/begbuf");
25fe55af 1195 break;
fa9a63c5
RM
1196
1197 case endbuf:
a1a052df 1198 fprintf (stderr, "/endbuf");
25fe55af 1199 break;
fa9a63c5 1200
25fe55af 1201 default:
a1a052df 1202 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1203 }
1204
a1a052df 1205 fprintf (stderr, "\n");
fa9a63c5
RM
1206 }
1207
a1a052df 1208 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1209}
1210
1211
1212void
1213print_compiled_pattern (bufp)
1214 struct re_pattern_buffer *bufp;
1215{
01618498 1216 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1217
1218 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1219 printf ("%ld bytes used/%ld bytes allocated.\n",
1220 bufp->used, bufp->allocated);
fa9a63c5
RM
1221
1222 if (bufp->fastmap_accurate && bufp->fastmap)
1223 {
1224 printf ("fastmap: ");
1225 print_fastmap (bufp->fastmap);
1226 }
1227
1228 printf ("re_nsub: %d\t", bufp->re_nsub);
1229 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1230 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1231 printf ("no_sub: %d\t", bufp->no_sub);
1232 printf ("not_bol: %d\t", bufp->not_bol);
1233 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1234 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1235 fflush (stdout);
fa9a63c5
RM
1236 /* Perhaps we should print the translate table? */
1237}
1238
1239
1240void
1241print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1242 re_char *where;
1243 re_char *string1;
1244 re_char *string2;
fa9a63c5
RM
1245 int size1;
1246 int size2;
1247{
4bb91c68 1248 int this_char;
5e69f11e 1249
fa9a63c5
RM
1250 if (where == NULL)
1251 printf ("(null)");
1252 else
1253 {
1254 if (FIRST_STRING_P (where))
25fe55af
RS
1255 {
1256 for (this_char = where - string1; this_char < size1; this_char++)
1257 putchar (string1[this_char]);
fa9a63c5 1258
25fe55af
RS
1259 where = string2;
1260 }
fa9a63c5
RM
1261
1262 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1263 putchar (string2[this_char]);
fa9a63c5
RM
1264 }
1265}
1266
1267#else /* not DEBUG */
1268
0b32bf0e
SM
1269# undef assert
1270# define assert(e)
fa9a63c5 1271
0b32bf0e
SM
1272# define DEBUG_STATEMENT(e)
1273# define DEBUG_PRINT1(x)
1274# define DEBUG_PRINT2(x1, x2)
1275# define DEBUG_PRINT3(x1, x2, x3)
1276# define DEBUG_PRINT4(x1, x2, x3, x4)
1277# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1278# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1279
1280#endif /* not DEBUG */
1281\f
1282/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1283 also be assigned to arbitrarily: each pattern buffer stores its own
1284 syntax, so it can be changed between regex compilations. */
1285/* This has no initializer because initialized variables in Emacs
1286 become read-only after dumping. */
1287reg_syntax_t re_syntax_options;
1288
1289
1290/* Specify the precise syntax of regexps for compilation. This provides
1291 for compatibility for various utilities which historically have
1292 different, incompatible syntaxes.
1293
1294 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1295 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1296
1297reg_syntax_t
1298re_set_syntax (syntax)
f9b0fd99 1299 reg_syntax_t syntax;
fa9a63c5
RM
1300{
1301 reg_syntax_t ret = re_syntax_options;
5e69f11e 1302
fa9a63c5
RM
1303 re_syntax_options = syntax;
1304 return ret;
1305}
c0f9ea08 1306WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1307
1308/* Regexp to use to replace spaces, or NULL meaning don't. */
1309static re_char *whitespace_regexp;
1310
1311void
1312re_set_whitespace_regexp (regexp)
6470ea05 1313 const char *regexp;
f9b0fd99 1314{
6470ea05 1315 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1316}
1317WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1318\f
1319/* This table gives an error message for each of the error codes listed
4bb91c68 1320 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1321 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1322 but why not be nice? */
fa9a63c5
RM
1323
1324static const char *re_error_msgid[] =
5e69f11e
RM
1325 {
1326 gettext_noop ("Success"), /* REG_NOERROR */
1327 gettext_noop ("No match"), /* REG_NOMATCH */
1328 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1329 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1330 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1331 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1332 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1333 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1334 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1335 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1336 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1337 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1338 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1339 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1340 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1341 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1342 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1343 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1344 };
1345\f
4bb91c68 1346/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1347
1348/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1349 searching and matching functions should not call alloca. On some
1350 systems, alloca is implemented in terms of malloc, and if we're
1351 using the relocating allocator routines, then malloc could cause a
1352 relocation, which might (if the strings being searched are in the
1353 ralloc heap) shift the data out from underneath the regexp
1354 routines.
1355
5e69f11e 1356 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1357 processes input from X in a signal handler; processing X input may
1358 call malloc; if input arrives while a matching routine is calling
1359 malloc, then we're scrod. But Emacs can't just block input while
1360 calling matching routines; then we don't notice interrupts when
1361 they come in. So, Emacs blocks input around all regexp calls
1362 except the matching calls, which it leaves unprotected, in the
1363 faith that they will not malloc. */
1364
1365/* Normally, this is fine. */
1366#define MATCH_MAY_ALLOCATE
1367
1368/* When using GNU C, we are not REALLY using the C alloca, no matter
1369 what config.h may say. So don't take precautions for it. */
1370#ifdef __GNUC__
0b32bf0e 1371# undef C_ALLOCA
fa9a63c5
RM
1372#endif
1373
1374/* The match routines may not allocate if (1) they would do it with malloc
1375 and (2) it's not safe for them to use malloc.
1376 Note that if REL_ALLOC is defined, matching would not use malloc for the
1377 failure stack, but we would still use it for the register vectors;
4bb91c68 1378 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1379#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1380# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1381#endif
1382
1383\f
1384/* Failure stack declarations and macros; both re_compile_fastmap and
1385 re_match_2 use a failure stack. These have to be macros because of
1386 REGEX_ALLOCATE_STACK. */
5e69f11e 1387
fa9a63c5 1388
320a2a73 1389/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1390 when matching. If this number is exceeded, we allocate more
1391 space, so it is not a hard limit. */
1392#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1393# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1394#endif
1395
1396/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1397 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1398 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1399 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1400 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1401# if defined MATCH_MAY_ALLOCATE
1402/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1403 whose default stack limit is 2mb. In order for a larger
1404 value to work reliably, you have to try to make it accord
1405 with the process stack limit. */
c0f9ea08
SM
1406size_t re_max_failures = 40000;
1407# else
1408size_t re_max_failures = 4000;
1409# endif
fa9a63c5
RM
1410
1411union fail_stack_elt
1412{
01618498 1413 re_char *pointer;
c0f9ea08
SM
1414 /* This should be the biggest `int' that's no bigger than a pointer. */
1415 long integer;
fa9a63c5
RM
1416};
1417
1418typedef union fail_stack_elt fail_stack_elt_t;
1419
1420typedef struct
1421{
1422 fail_stack_elt_t *stack;
c0f9ea08
SM
1423 size_t size;
1424 size_t avail; /* Offset of next open position. */
1425 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1426} fail_stack_type;
1427
505bde11 1428#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1429#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1430
1431
1432/* Define macros to initialize and free the failure stack.
1433 Do `return -2' if the alloc fails. */
1434
1435#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1436# define INIT_FAIL_STACK() \
fa9a63c5
RM
1437 do { \
1438 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1439 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1440 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1441 \
1442 if (fail_stack.stack == NULL) \
1443 return -2; \
1444 \
1445 fail_stack.size = INIT_FAILURE_ALLOC; \
1446 fail_stack.avail = 0; \
505bde11 1447 fail_stack.frame = 0; \
fa9a63c5
RM
1448 } while (0)
1449
0b32bf0e 1450# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1451#else
0b32bf0e 1452# define INIT_FAIL_STACK() \
fa9a63c5
RM
1453 do { \
1454 fail_stack.avail = 0; \
505bde11 1455 fail_stack.frame = 0; \
fa9a63c5
RM
1456 } while (0)
1457
0b32bf0e 1458# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1459#endif
1460
1461
320a2a73
KH
1462/* Double the size of FAIL_STACK, up to a limit
1463 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1464
1465 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1466 allocating space for it or it was already too large.
1467
4bb91c68 1468 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1469
320a2a73
KH
1470/* Factor to increase the failure stack size by
1471 when we increase it.
1472 This used to be 2, but 2 was too wasteful
1473 because the old discarded stacks added up to as much space
1474 were as ultimate, maximum-size stack. */
1475#define FAIL_STACK_GROWTH_FACTOR 4
1476
1477#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1478 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1479 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1480 ? 0 \
320a2a73
KH
1481 : ((fail_stack).stack \
1482 = (fail_stack_elt_t *) \
25fe55af
RS
1483 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1484 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1485 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1486 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1487 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1488 \
1489 (fail_stack).stack == NULL \
1490 ? 0 \
6453db45
KH
1491 : ((fail_stack).size \
1492 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1493 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1494 * FAIL_STACK_GROWTH_FACTOR)) \
1495 / sizeof (fail_stack_elt_t)), \
25fe55af 1496 1)))
fa9a63c5
RM
1497
1498
fa9a63c5
RM
1499/* Push a pointer value onto the failure stack.
1500 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1501 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1502#define PUSH_FAILURE_POINTER(item) \
01618498 1503 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1504
1505/* This pushes an integer-valued item onto the failure stack.
1506 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1507 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1508#define PUSH_FAILURE_INT(item) \
1509 fail_stack.stack[fail_stack.avail++].integer = (item)
1510
1511/* Push a fail_stack_elt_t value onto the failure stack.
1512 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1513 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1514#define PUSH_FAILURE_ELT(item) \
1515 fail_stack.stack[fail_stack.avail++] = (item)
1516
1517/* These three POP... operations complement the three PUSH... operations.
1518 All assume that `fail_stack' is nonempty. */
1519#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1520#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1521#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1522
505bde11
SM
1523/* Individual items aside from the registers. */
1524#define NUM_NONREG_ITEMS 3
1525
1526/* Used to examine the stack (to detect infinite loops). */
1527#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1528#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1529#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1530#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1531
1532
505bde11
SM
1533#define ENSURE_FAIL_STACK(space) \
1534while (REMAINING_AVAIL_SLOTS <= space) { \
1535 if (!GROW_FAIL_STACK (fail_stack)) \
1536 return -2; \
1537 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1538 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1539}
1540
1541/* Push register NUM onto the stack. */
1542#define PUSH_FAILURE_REG(num) \
1543do { \
1544 char *destination; \
1545 ENSURE_FAIL_STACK(3); \
1546 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1547 num, regstart[num], regend[num]); \
1548 PUSH_FAILURE_POINTER (regstart[num]); \
1549 PUSH_FAILURE_POINTER (regend[num]); \
1550 PUSH_FAILURE_INT (num); \
1551} while (0)
1552
01618498
SM
1553/* Change the counter's value to VAL, but make sure that it will
1554 be reset when backtracking. */
1555#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1556do { \
1557 char *destination; \
1558 int c; \
1559 ENSURE_FAIL_STACK(3); \
1560 EXTRACT_NUMBER (c, ptr); \
01618498 1561 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1562 PUSH_FAILURE_INT (c); \
1563 PUSH_FAILURE_POINTER (ptr); \
1564 PUSH_FAILURE_INT (-1); \
01618498 1565 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1566} while (0)
1567
505bde11 1568/* Pop a saved register off the stack. */
dc1e502d 1569#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1570do { \
1571 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1572 if (reg == -1) \
1573 { \
1574 /* It's a counter. */ \
6dcf2d0e
SM
1575 /* Here, we discard `const', making re_match non-reentrant. */ \
1576 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1577 reg = POP_FAILURE_INT (); \
1578 STORE_NUMBER (ptr, reg); \
1579 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1580 } \
1581 else \
1582 { \
1583 regend[reg] = POP_FAILURE_POINTER (); \
1584 regstart[reg] = POP_FAILURE_POINTER (); \
1585 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1586 reg, regstart[reg], regend[reg]); \
1587 } \
505bde11
SM
1588} while (0)
1589
1590/* Check that we are not stuck in an infinite loop. */
1591#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1592do { \
f6df485f 1593 int failure = TOP_FAILURE_HANDLE (); \
505bde11 1594 /* Check for infinite matching loops */ \
f6df485f
RS
1595 while (failure > 0 \
1596 && (FAILURE_STR (failure) == string_place \
1597 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1598 { \
1599 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1600 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1601 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1602 { \
6df42991
SM
1603 cycle = 1; \
1604 break; \
f6df485f 1605 } \
66f0296e 1606 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1607 failure = NEXT_FAILURE_HANDLE(failure); \
1608 } \
1609 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1610} while (0)
6df42991 1611
fa9a63c5 1612/* Push the information about the state we will need
5e69f11e
RM
1613 if we ever fail back to it.
1614
505bde11 1615 Requires variables fail_stack, regstart, regend and
320a2a73 1616 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1617 declared.
5e69f11e 1618
fa9a63c5
RM
1619 Does `return FAILURE_CODE' if runs out of memory. */
1620
505bde11
SM
1621#define PUSH_FAILURE_POINT(pattern, string_place) \
1622do { \
1623 char *destination; \
1624 /* Must be int, so when we don't save any registers, the arithmetic \
1625 of 0 + -1 isn't done as unsigned. */ \
1626 \
505bde11 1627 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1628 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1629 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1630 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1631 \
1632 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1633 \
1634 DEBUG_PRINT1 ("\n"); \
1635 \
1636 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1637 PUSH_FAILURE_INT (fail_stack.frame); \
1638 \
1639 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1640 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1641 DEBUG_PRINT1 ("'\n"); \
1642 PUSH_FAILURE_POINTER (string_place); \
1643 \
1644 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1645 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1646 PUSH_FAILURE_POINTER (pattern); \
1647 \
1648 /* Close the frame by moving the frame pointer past it. */ \
1649 fail_stack.frame = fail_stack.avail; \
1650} while (0)
fa9a63c5 1651
320a2a73
KH
1652/* Estimate the size of data pushed by a typical failure stack entry.
1653 An estimate is all we need, because all we use this for
1654 is to choose a limit for how big to make the failure stack. */
ada30c0e 1655/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1656#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1657
fa9a63c5
RM
1658/* How many items can still be added to the stack without overflowing it. */
1659#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1660
1661
1662/* Pops what PUSH_FAIL_STACK pushes.
1663
1664 We restore into the parameters, all of which should be lvalues:
1665 STR -- the saved data position.
1666 PAT -- the saved pattern position.
fa9a63c5 1667 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1668
fa9a63c5 1669 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1670 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1671
505bde11
SM
1672#define POP_FAILURE_POINT(str, pat) \
1673do { \
fa9a63c5
RM
1674 assert (!FAIL_STACK_EMPTY ()); \
1675 \
1676 /* Remove failure points and point to how many regs pushed. */ \
1677 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1678 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1679 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1680 \
505bde11
SM
1681 /* Pop the saved registers. */ \
1682 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1683 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1684 \
01618498 1685 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1686 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1687 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1688 \
1689 /* If the saved string location is NULL, it came from an \
1690 on_failure_keep_string_jump opcode, and we want to throw away the \
1691 saved NULL, thus retaining our current position in the string. */ \
01618498 1692 str = POP_FAILURE_POINTER (); \
505bde11 1693 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1694 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1695 DEBUG_PRINT1 ("'\n"); \
1696 \
505bde11
SM
1697 fail_stack.frame = POP_FAILURE_INT (); \
1698 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1699 \
505bde11
SM
1700 assert (fail_stack.avail >= 0); \
1701 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1702 \
fa9a63c5 1703 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1704} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1705
1706
1707\f
fa9a63c5 1708/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1709#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1710\f
1711/* Subroutine declarations and macros for regex_compile. */
1712
4bb91c68
SM
1713static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1714 reg_syntax_t syntax,
1715 struct re_pattern_buffer *bufp));
1716static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1717static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1718 int arg1, int arg2));
1719static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1720 int arg, unsigned char *end));
1721static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1722 int arg1, int arg2, unsigned char *end));
01618498
SM
1723static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1724 re_char *p,
4bb91c68 1725 reg_syntax_t syntax));
01618498
SM
1726static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1727 re_char *pend,
4bb91c68 1728 reg_syntax_t syntax));
01618498
SM
1729static re_char *skip_one_char _RE_ARGS ((re_char *p));
1730static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1731 char *fastmap, const int multibyte));
fa9a63c5 1732
fa9a63c5 1733/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1734 translation. */
36595814 1735#define PATFETCH(c) \
2d1675e4
SM
1736 do { \
1737 int len; \
1738 if (p == pend) return REG_EEND; \
1739 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1740 p += len; \
fa9a63c5
RM
1741 } while (0)
1742
fa9a63c5
RM
1743
1744/* If `translate' is non-null, return translate[D], else just D. We
1745 cast the subscript to translate because some data is declared as
1746 `char *', to avoid warnings when a string constant is passed. But
1747 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1748#ifndef TRANSLATE
0b32bf0e 1749# define TRANSLATE(d) \
66f0296e 1750 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1751#endif
fa9a63c5
RM
1752
1753
1754/* Macros for outputting the compiled pattern into `buffer'. */
1755
1756/* If the buffer isn't allocated when it comes in, use this. */
1757#define INIT_BUF_SIZE 32
1758
4bb91c68 1759/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1760#define GET_BUFFER_SPACE(n) \
01618498 1761 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1762 EXTEND_BUFFER ()
1763
1764/* Make sure we have one more byte of buffer space and then add C to it. */
1765#define BUF_PUSH(c) \
1766 do { \
1767 GET_BUFFER_SPACE (1); \
1768 *b++ = (unsigned char) (c); \
1769 } while (0)
1770
1771
1772/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1773#define BUF_PUSH_2(c1, c2) \
1774 do { \
1775 GET_BUFFER_SPACE (2); \
1776 *b++ = (unsigned char) (c1); \
1777 *b++ = (unsigned char) (c2); \
1778 } while (0)
1779
1780
4bb91c68 1781/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1782#define BUF_PUSH_3(c1, c2, c3) \
1783 do { \
1784 GET_BUFFER_SPACE (3); \
1785 *b++ = (unsigned char) (c1); \
1786 *b++ = (unsigned char) (c2); \
1787 *b++ = (unsigned char) (c3); \
1788 } while (0)
1789
1790
1791/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1792 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1793#define STORE_JUMP(op, loc, to) \
1794 store_op1 (op, loc, (to) - (loc) - 3)
1795
1796/* Likewise, for a two-argument jump. */
1797#define STORE_JUMP2(op, loc, to, arg) \
1798 store_op2 (op, loc, (to) - (loc) - 3, arg)
1799
4bb91c68 1800/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1801#define INSERT_JUMP(op, loc, to) \
1802 insert_op1 (op, loc, (to) - (loc) - 3, b)
1803
1804/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1805#define INSERT_JUMP2(op, loc, to, arg) \
1806 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1807
1808
1809/* This is not an arbitrary limit: the arguments which represent offsets
275a3409 1810 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1811 be too small, many things would have to change. */
275a3409
RS
1812# define MAX_BUF_SIZE (1L << 15)
1813
1814#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1815/* Any other compiler which, like MSC, has allocation limit below 2^16
1816 bytes will have to use approach similar to what was done below for
1817 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1818 reallocating to 0 bytes. Such thing is not going to work too well.
1819 You have been warned!! */
1820#if defined _MSC_VER && !defined WIN32
1821/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1822# define MAX_BUF_SIZE 65500L
1823#else
1824# define MAX_BUF_SIZE (1L << 16)
1825#endif
275a3409 1826#endif /* 0 */
fa9a63c5
RM
1827
1828/* Extend the buffer by twice its current size via realloc and
1829 reset the pointers that pointed into the old block to point to the
1830 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1831 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1832#if __BOUNDED_POINTERS__
1833# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1834# define MOVE_BUFFER_POINTER(P) \
1835 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1836# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1837 else \
1838 { \
1839 SET_HIGH_BOUND (b); \
1840 SET_HIGH_BOUND (begalt); \
1841 if (fixup_alt_jump) \
1842 SET_HIGH_BOUND (fixup_alt_jump); \
1843 if (laststart) \
1844 SET_HIGH_BOUND (laststart); \
1845 if (pending_exact) \
1846 SET_HIGH_BOUND (pending_exact); \
1847 }
1848#else
1849# define MOVE_BUFFER_POINTER(P) (P) += incr
1850# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1851#endif
fa9a63c5 1852#define EXTEND_BUFFER() \
25fe55af 1853 do { \
01618498 1854 re_char *old_buffer = bufp->buffer; \
25fe55af 1855 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1856 return REG_ESIZE; \
1857 bufp->allocated <<= 1; \
1858 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1859 bufp->allocated = MAX_BUF_SIZE; \
01618498 1860 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1861 if (bufp->buffer == NULL) \
1862 return REG_ESPACE; \
1863 /* If the buffer moved, move all the pointers into it. */ \
1864 if (old_buffer != bufp->buffer) \
1865 { \
4bb91c68
SM
1866 int incr = bufp->buffer - old_buffer; \
1867 MOVE_BUFFER_POINTER (b); \
1868 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1869 if (fixup_alt_jump) \
4bb91c68 1870 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1871 if (laststart) \
4bb91c68 1872 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1873 if (pending_exact) \
4bb91c68 1874 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1875 } \
4bb91c68 1876 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1877 } while (0)
1878
1879
1880/* Since we have one byte reserved for the register number argument to
1881 {start,stop}_memory, the maximum number of groups we can report
1882 things about is what fits in that byte. */
1883#define MAX_REGNUM 255
1884
1885/* But patterns can have more than `MAX_REGNUM' registers. We just
1886 ignore the excess. */
098d42af 1887typedef int regnum_t;
fa9a63c5
RM
1888
1889
1890/* Macros for the compile stack. */
1891
1892/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1893 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1894/* int may be not enough when sizeof(int) == 2. */
1895typedef long pattern_offset_t;
fa9a63c5
RM
1896
1897typedef struct
1898{
1899 pattern_offset_t begalt_offset;
1900 pattern_offset_t fixup_alt_jump;
5e69f11e 1901 pattern_offset_t laststart_offset;
fa9a63c5
RM
1902 regnum_t regnum;
1903} compile_stack_elt_t;
1904
1905
1906typedef struct
1907{
1908 compile_stack_elt_t *stack;
1909 unsigned size;
1910 unsigned avail; /* Offset of next open position. */
1911} compile_stack_type;
1912
1913
1914#define INIT_COMPILE_STACK_SIZE 32
1915
1916#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1917#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1918
4bb91c68 1919/* The next available element. */
fa9a63c5
RM
1920#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1921
1cee1e27
SM
1922/* Explicit quit checking is only used on NTemacs and whenever we
1923 use polling to process input events. */
1924#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1925extern int immediate_quit;
1926# define IMMEDIATE_QUIT_CHECK \
1927 do { \
1928 if (immediate_quit) QUIT; \
1929 } while (0)
1930#else
1931# define IMMEDIATE_QUIT_CHECK ((void)0)
1932#endif
1933\f
b18215fc
RS
1934/* Structure to manage work area for range table. */
1935struct range_table_work_area
1936{
1937 int *table; /* actual work area. */
1938 int allocated; /* allocated size for work area in bytes. */
25fe55af 1939 int used; /* actually used size in words. */
96cc36cc 1940 int bits; /* flag to record character classes */
b18215fc
RS
1941};
1942
77d11aec
RS
1943/* Make sure that WORK_AREA can hold more N multibyte characters.
1944 This is used only in set_image_of_range and set_image_of_range_1.
1945 It expects WORK_AREA to be a pointer.
1946 If it can't get the space, it returns from the surrounding function. */
1947
1948#define EXTEND_RANGE_TABLE(work_area, n) \
1949 do { \
1950 if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
1951 { \
1952 extend_range_table_work_area (work_area); \
1953 if ((work_area)->table == 0) \
1954 return (REG_ESPACE); \
1955 } \
b18215fc
RS
1956 } while (0)
1957
96cc36cc
RS
1958#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1959 (work_area).bits |= (bit)
1960
14473664
SM
1961/* Bits used to implement the multibyte-part of the various character classes
1962 such as [:alnum:] in a charset's range table. */
1963#define BIT_WORD 0x1
1964#define BIT_LOWER 0x2
1965#define BIT_PUNCT 0x4
1966#define BIT_SPACE 0x8
1967#define BIT_UPPER 0x10
1968#define BIT_MULTIBYTE 0x20
96cc36cc 1969
36595814
SM
1970/* Set a range START..END to WORK_AREA.
1971 The range is passed through TRANSLATE, so START and END
1972 should be untranslated. */
77d11aec
RS
1973#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
1974 do { \
1975 int tem; \
1976 tem = set_image_of_range (&work_area, start, end, translate); \
1977 if (tem > 0) \
1978 FREE_STACK_RETURN (tem); \
b18215fc
RS
1979 } while (0)
1980
25fe55af 1981/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1982#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1983 do { \
1984 if ((work_area).table) \
1985 free ((work_area).table); \
1986 } while (0)
1987
96cc36cc 1988#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1989#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1990#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1991#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1992\f
b18215fc 1993
fa9a63c5 1994/* Set the bit for character C in a list. */
01618498 1995#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1996
1997
1998/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1999#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2000 do { \
2001 if (p == pend) \
2002 FREE_STACK_RETURN (REG_EBRACE); \
2003 else \
2004 { \
2005 PATFETCH (c); \
2006 while ('0' <= c && c <= '9') \
2007 { \
2008 int prev; \
2009 if (num < 0) \
2010 num = 0; \
2011 prev = num; \
2012 num = num * 10 + c - '0'; \
2013 if (num / 10 != prev) \
2014 FREE_STACK_RETURN (REG_BADBR); \
2015 if (p == pend) \
2016 FREE_STACK_RETURN (REG_EBRACE); \
2017 PATFETCH (c); \
2018 } \
2019 } \
2020 } while (0)
77d11aec 2021\f
1fdab503 2022#if ! WIDE_CHAR_SUPPORT
01618498 2023
14473664 2024/* Map a string to the char class it names (if any). */
1fdab503 2025re_wctype_t
ada30c0e
SM
2026re_wctype (str)
2027 re_char *str;
14473664 2028{
ada30c0e 2029 const char *string = str;
14473664
SM
2030 if (STREQ (string, "alnum")) return RECC_ALNUM;
2031 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2032 else if (STREQ (string, "word")) return RECC_WORD;
2033 else if (STREQ (string, "ascii")) return RECC_ASCII;
2034 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2035 else if (STREQ (string, "graph")) return RECC_GRAPH;
2036 else if (STREQ (string, "lower")) return RECC_LOWER;
2037 else if (STREQ (string, "print")) return RECC_PRINT;
2038 else if (STREQ (string, "punct")) return RECC_PUNCT;
2039 else if (STREQ (string, "space")) return RECC_SPACE;
2040 else if (STREQ (string, "upper")) return RECC_UPPER;
2041 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2042 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2043 else if (STREQ (string, "digit")) return RECC_DIGIT;
2044 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2045 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2046 else if (STREQ (string, "blank")) return RECC_BLANK;
2047 else return 0;
2048}
2049
2050/* True iff CH is in the char class CC. */
1fdab503 2051boolean
14473664
SM
2052re_iswctype (ch, cc)
2053 int ch;
2054 re_wctype_t cc;
2055{
2056 switch (cc)
2057 {
0cdd06f8
SM
2058 case RECC_ALNUM: return ISALNUM (ch);
2059 case RECC_ALPHA: return ISALPHA (ch);
2060 case RECC_BLANK: return ISBLANK (ch);
2061 case RECC_CNTRL: return ISCNTRL (ch);
2062 case RECC_DIGIT: return ISDIGIT (ch);
2063 case RECC_GRAPH: return ISGRAPH (ch);
2064 case RECC_LOWER: return ISLOWER (ch);
2065 case RECC_PRINT: return ISPRINT (ch);
2066 case RECC_PUNCT: return ISPUNCT (ch);
2067 case RECC_SPACE: return ISSPACE (ch);
2068 case RECC_UPPER: return ISUPPER (ch);
2069 case RECC_XDIGIT: return ISXDIGIT (ch);
2070 case RECC_ASCII: return IS_REAL_ASCII (ch);
2071 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2072 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2073 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2074 case RECC_WORD: return ISWORD (ch);
2075 case RECC_ERROR: return false;
2076 default:
2077 abort();
14473664
SM
2078 }
2079}
fa9a63c5 2080
14473664
SM
2081/* Return a bit-pattern to use in the range-table bits to match multibyte
2082 chars of class CC. */
2083static int
2084re_wctype_to_bit (cc)
2085 re_wctype_t cc;
2086{
2087 switch (cc)
2088 {
2089 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2090 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2091 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2092 case RECC_LOWER: return BIT_LOWER;
2093 case RECC_UPPER: return BIT_UPPER;
2094 case RECC_PUNCT: return BIT_PUNCT;
2095 case RECC_SPACE: return BIT_SPACE;
14473664 2096 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2097 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2098 default:
2099 abort();
14473664
SM
2100 }
2101}
2102#endif
77d11aec
RS
2103\f
2104/* Filling in the work area of a range. */
2105
2106/* Actually extend the space in WORK_AREA. */
2107
2108static void
2109extend_range_table_work_area (work_area)
2110 struct range_table_work_area *work_area;
177c0ea7 2111{
77d11aec
RS
2112 work_area->allocated += 16 * sizeof (int);
2113 if (work_area->table)
2114 work_area->table
a073faa6 2115 = (int *) realloc (work_area->table, work_area->allocated);
77d11aec
RS
2116 else
2117 work_area->table
a073faa6 2118 = (int *) malloc (work_area->allocated);
77d11aec
RS
2119}
2120
2121#ifdef emacs
2122
2123/* Carefully find the ranges of codes that are equivalent
2124 under case conversion to the range start..end when passed through
2125 TRANSLATE. Handle the case where non-letters can come in between
2126 two upper-case letters (which happens in Latin-1).
2127 Also handle the case of groups of more than 2 case-equivalent chars.
2128
2129 The basic method is to look at consecutive characters and see
2130 if they can form a run that can be handled as one.
2131
2132 Returns -1 if successful, REG_ESPACE if ran out of space. */
2133
2134static int
2135set_image_of_range_1 (work_area, start, end, translate)
2136 RE_TRANSLATE_TYPE translate;
2137 struct range_table_work_area *work_area;
2138 re_wchar_t start, end;
2139{
2140 /* `one_case' indicates a character, or a run of characters,
2141 each of which is an isolate (no case-equivalents).
2142 This includes all ASCII non-letters.
2143
2144 `two_case' indicates a character, or a run of characters,
2145 each of which has two case-equivalent forms.
2146 This includes all ASCII letters.
2147
2148 `strange' indicates a character that has more than one
2149 case-equivalent. */
177c0ea7 2150
77d11aec
RS
2151 enum case_type {one_case, two_case, strange};
2152
2153 /* Describe the run that is in progress,
2154 which the next character can try to extend.
2155 If run_type is strange, that means there really is no run.
2156 If run_type is one_case, then run_start...run_end is the run.
2157 If run_type is two_case, then the run is run_start...run_end,
2158 and the case-equivalents end at run_eqv_end. */
2159
2160 enum case_type run_type = strange;
2161 int run_start, run_end, run_eqv_end;
2162
2163 Lisp_Object eqv_table;
2164
2165 if (!RE_TRANSLATE_P (translate))
2166 {
b7c12565 2167 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2168 work_area->table[work_area->used++] = (start);
2169 work_area->table[work_area->used++] = (end);
b7c12565 2170 return -1;
77d11aec
RS
2171 }
2172
2173 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2174
77d11aec
RS
2175 for (; start <= end; start++)
2176 {
2177 enum case_type this_type;
2178 int eqv = RE_TRANSLATE (eqv_table, start);
2179 int minchar, maxchar;
2180
2181 /* Classify this character */
2182 if (eqv == start)
2183 this_type = one_case;
2184 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2185 this_type = two_case;
2186 else
2187 this_type = strange;
2188
2189 if (start < eqv)
2190 minchar = start, maxchar = eqv;
2191 else
2192 minchar = eqv, maxchar = start;
2193
2194 /* Can this character extend the run in progress? */
2195 if (this_type == strange || this_type != run_type
2196 || !(minchar == run_end + 1
2197 && (run_type == two_case
2198 ? maxchar == run_eqv_end + 1 : 1)))
2199 {
2200 /* No, end the run.
2201 Record each of its equivalent ranges. */
2202 if (run_type == one_case)
2203 {
2204 EXTEND_RANGE_TABLE (work_area, 2);
2205 work_area->table[work_area->used++] = run_start;
2206 work_area->table[work_area->used++] = run_end;
2207 }
2208 else if (run_type == two_case)
2209 {
2210 EXTEND_RANGE_TABLE (work_area, 4);
2211 work_area->table[work_area->used++] = run_start;
2212 work_area->table[work_area->used++] = run_end;
2213 work_area->table[work_area->used++]
2214 = RE_TRANSLATE (eqv_table, run_start);
2215 work_area->table[work_area->used++]
2216 = RE_TRANSLATE (eqv_table, run_end);
2217 }
2218 run_type = strange;
2219 }
177c0ea7 2220
77d11aec
RS
2221 if (this_type == strange)
2222 {
2223 /* For a strange character, add each of its equivalents, one
2224 by one. Don't start a range. */
2225 do
2226 {
2227 EXTEND_RANGE_TABLE (work_area, 2);
2228 work_area->table[work_area->used++] = eqv;
2229 work_area->table[work_area->used++] = eqv;
2230 eqv = RE_TRANSLATE (eqv_table, eqv);
2231 }
2232 while (eqv != start);
2233 }
2234
2235 /* Add this char to the run, or start a new run. */
2236 else if (run_type == strange)
2237 {
2238 /* Initialize a new range. */
2239 run_type = this_type;
2240 run_start = start;
2241 run_end = start;
2242 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2243 }
2244 else
2245 {
2246 /* Extend a running range. */
2247 run_end = minchar;
2248 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2249 }
2250 }
2251
2252 /* If a run is still in progress at the end, finish it now
2253 by recording its equivalent ranges. */
2254 if (run_type == one_case)
2255 {
2256 EXTEND_RANGE_TABLE (work_area, 2);
2257 work_area->table[work_area->used++] = run_start;
2258 work_area->table[work_area->used++] = run_end;
2259 }
2260 else if (run_type == two_case)
2261 {
2262 EXTEND_RANGE_TABLE (work_area, 4);
2263 work_area->table[work_area->used++] = run_start;
2264 work_area->table[work_area->used++] = run_end;
2265 work_area->table[work_area->used++]
2266 = RE_TRANSLATE (eqv_table, run_start);
2267 work_area->table[work_area->used++]
2268 = RE_TRANSLATE (eqv_table, run_end);
2269 }
2270
2271 return -1;
2272}
36595814 2273
77d11aec 2274#endif /* emacs */
36595814 2275
b7c12565 2276/* Record the the image of the range start..end when passed through
36595814
SM
2277 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2278 and is not even necessarily contiguous.
b7c12565
RS
2279 Normally we approximate it with the smallest contiguous range that contains
2280 all the chars we need. However, for Latin-1 we go to extra effort
2281 to do a better job.
2282
2283 This function is not called for ASCII ranges.
77d11aec
RS
2284
2285 Returns -1 if successful, REG_ESPACE if ran out of space. */
2286
2287static int
36595814
SM
2288set_image_of_range (work_area, start, end, translate)
2289 RE_TRANSLATE_TYPE translate;
2290 struct range_table_work_area *work_area;
2291 re_wchar_t start, end;
2292{
77d11aec
RS
2293 re_wchar_t cmin, cmax;
2294
2295#ifdef emacs
2296 /* For Latin-1 ranges, use set_image_of_range_1
2297 to get proper handling of ranges that include letters and nonletters.
b7c12565 2298 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2299 For other character sets, we don't bother to get this right. */
b7c12565
RS
2300 if (RE_TRANSLATE_P (translate) && start < 04400
2301 && !(start < 04200 && end >= 04377))
77d11aec 2302 {
b7c12565 2303 int newend;
77d11aec 2304 int tem;
b7c12565
RS
2305 newend = end;
2306 if (newend > 04377)
2307 newend = 04377;
2308 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2309 if (tem > 0)
2310 return tem;
2311
2312 start = 04400;
2313 if (end < 04400)
2314 return -1;
2315 }
2316#endif
2317
b7c12565
RS
2318 EXTEND_RANGE_TABLE (work_area, 2);
2319 work_area->table[work_area->used++] = (start);
2320 work_area->table[work_area->used++] = (end);
2321
2322 cmin = -1, cmax = -1;
77d11aec 2323
36595814 2324 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2325 {
2326 int ch;
77d11aec 2327
b7c12565
RS
2328 for (ch = start; ch <= end; ch++)
2329 {
2330 re_wchar_t c = TRANSLATE (ch);
2331 if (! (start <= c && c <= end))
2332 {
2333 if (cmin == -1)
2334 cmin = c, cmax = c;
2335 else
2336 {
2337 cmin = MIN (cmin, c);
2338 cmax = MAX (cmax, c);
2339 }
2340 }
2341 }
2342
2343 if (cmin != -1)
2344 {
2345 EXTEND_RANGE_TABLE (work_area, 2);
2346 work_area->table[work_area->used++] = (cmin);
2347 work_area->table[work_area->used++] = (cmax);
2348 }
2349 }
36595814 2350
77d11aec
RS
2351 return -1;
2352}
fa9a63c5
RM
2353\f
2354#ifndef MATCH_MAY_ALLOCATE
2355
2356/* If we cannot allocate large objects within re_match_2_internal,
2357 we make the fail stack and register vectors global.
2358 The fail stack, we grow to the maximum size when a regexp
2359 is compiled.
2360 The register vectors, we adjust in size each time we
2361 compile a regexp, according to the number of registers it needs. */
2362
2363static fail_stack_type fail_stack;
2364
2365/* Size with which the following vectors are currently allocated.
2366 That is so we can make them bigger as needed,
4bb91c68 2367 but never make them smaller. */
fa9a63c5
RM
2368static int regs_allocated_size;
2369
66f0296e
SM
2370static re_char ** regstart, ** regend;
2371static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2372
2373/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2374 but don't make them smaller. */
fa9a63c5
RM
2375
2376static
2377regex_grow_registers (num_regs)
2378 int num_regs;
2379{
2380 if (num_regs > regs_allocated_size)
2381 {
66f0296e
SM
2382 RETALLOC_IF (regstart, num_regs, re_char *);
2383 RETALLOC_IF (regend, num_regs, re_char *);
2384 RETALLOC_IF (best_regstart, num_regs, re_char *);
2385 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2386
2387 regs_allocated_size = num_regs;
2388 }
2389}
2390
2391#endif /* not MATCH_MAY_ALLOCATE */
2392\f
99633e97
SM
2393static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2394 compile_stack,
2395 regnum_t regnum));
2396
fa9a63c5
RM
2397/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2398 Returns one of error codes defined in `regex.h', or zero for success.
2399
2400 Assumes the `allocated' (and perhaps `buffer') and `translate'
2401 fields are set in BUFP on entry.
2402
2403 If it succeeds, results are put in BUFP (if it returns an error, the
2404 contents of BUFP are undefined):
2405 `buffer' is the compiled pattern;
2406 `syntax' is set to SYNTAX;
2407 `used' is set to the length of the compiled pattern;
2408 `fastmap_accurate' is zero;
2409 `re_nsub' is the number of subexpressions in PATTERN;
2410 `not_bol' and `not_eol' are zero;
5e69f11e 2411
c0f9ea08 2412 The `fastmap' field is neither examined nor set. */
fa9a63c5 2413
505bde11
SM
2414/* Insert the `jump' from the end of last alternative to "here".
2415 The space for the jump has already been allocated. */
2416#define FIXUP_ALT_JUMP() \
2417do { \
2418 if (fixup_alt_jump) \
2419 STORE_JUMP (jump, fixup_alt_jump, b); \
2420} while (0)
2421
2422
fa9a63c5
RM
2423/* Return, freeing storage we allocated. */
2424#define FREE_STACK_RETURN(value) \
b18215fc
RS
2425 do { \
2426 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2427 free (compile_stack.stack); \
2428 return value; \
2429 } while (0)
fa9a63c5
RM
2430
2431static reg_errcode_t
2432regex_compile (pattern, size, syntax, bufp)
66f0296e 2433 re_char *pattern;
4bb91c68 2434 size_t size;
fa9a63c5
RM
2435 reg_syntax_t syntax;
2436 struct re_pattern_buffer *bufp;
2437{
01618498
SM
2438 /* We fetch characters from PATTERN here. */
2439 register re_wchar_t c, c1;
5e69f11e 2440
fa9a63c5 2441 /* A random temporary spot in PATTERN. */
66f0296e 2442 re_char *p1;
fa9a63c5
RM
2443
2444 /* Points to the end of the buffer, where we should append. */
2445 register unsigned char *b;
5e69f11e 2446
fa9a63c5
RM
2447 /* Keeps track of unclosed groups. */
2448 compile_stack_type compile_stack;
2449
2450 /* Points to the current (ending) position in the pattern. */
22336245
RS
2451#ifdef AIX
2452 /* `const' makes AIX compiler fail. */
66f0296e 2453 unsigned char *p = pattern;
22336245 2454#else
66f0296e 2455 re_char *p = pattern;
22336245 2456#endif
66f0296e 2457 re_char *pend = pattern + size;
5e69f11e 2458
fa9a63c5 2459 /* How to translate the characters in the pattern. */
6676cb1c 2460 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2461
2462 /* Address of the count-byte of the most recently inserted `exactn'
2463 command. This makes it possible to tell if a new exact-match
2464 character can be added to that command or if the character requires
2465 a new `exactn' command. */
2466 unsigned char *pending_exact = 0;
2467
2468 /* Address of start of the most recently finished expression.
2469 This tells, e.g., postfix * where to find the start of its
2470 operand. Reset at the beginning of groups and alternatives. */
2471 unsigned char *laststart = 0;
2472
2473 /* Address of beginning of regexp, or inside of last group. */
2474 unsigned char *begalt;
2475
2476 /* Place in the uncompiled pattern (i.e., the {) to
2477 which to go back if the interval is invalid. */
66f0296e 2478 re_char *beg_interval;
5e69f11e 2479
fa9a63c5 2480 /* Address of the place where a forward jump should go to the end of
25fe55af 2481 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2482 last -- ends with a forward jump of this sort. */
2483 unsigned char *fixup_alt_jump = 0;
2484
2485 /* Counts open-groups as they are encountered. Remembered for the
2486 matching close-group on the compile stack, so the same register
2487 number is put in the stop_memory as the start_memory. */
2488 regnum_t regnum = 0;
2489
b18215fc
RS
2490 /* Work area for range table of charset. */
2491 struct range_table_work_area range_table_work;
2492
2d1675e4
SM
2493 /* If the object matched can contain multibyte characters. */
2494 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2495
f9b0fd99
RS
2496 /* Nonzero if we have pushed down into a subpattern. */
2497 int in_subpattern = 0;
2498
2499 /* These hold the values of p, pattern, and pend from the main
2500 pattern when we have pushed into a subpattern. */
2501 re_char *main_p;
2502 re_char *main_pattern;
2503 re_char *main_pend;
2504
fa9a63c5 2505#ifdef DEBUG
99633e97 2506 debug++;
fa9a63c5 2507 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2508 if (debug > 0)
fa9a63c5
RM
2509 {
2510 unsigned debug_count;
5e69f11e 2511
fa9a63c5 2512 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2513 putchar (pattern[debug_count]);
fa9a63c5
RM
2514 putchar ('\n');
2515 }
2516#endif /* DEBUG */
2517
2518 /* Initialize the compile stack. */
2519 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2520 if (compile_stack.stack == NULL)
2521 return REG_ESPACE;
2522
2523 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2524 compile_stack.avail = 0;
2525
b18215fc
RS
2526 range_table_work.table = 0;
2527 range_table_work.allocated = 0;
2528
fa9a63c5
RM
2529 /* Initialize the pattern buffer. */
2530 bufp->syntax = syntax;
2531 bufp->fastmap_accurate = 0;
2532 bufp->not_bol = bufp->not_eol = 0;
2533
2534 /* Set `used' to zero, so that if we return an error, the pattern
2535 printer (for debugging) will think there's no pattern. We reset it
2536 at the end. */
2537 bufp->used = 0;
5e69f11e 2538
fa9a63c5 2539 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2540 bufp->re_nsub = 0;
fa9a63c5 2541
0b32bf0e 2542#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2543 /* Initialize the syntax table. */
2544 init_syntax_once ();
2545#endif
2546
2547 if (bufp->allocated == 0)
2548 {
2549 if (bufp->buffer)
2550 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2551 enough space. This loses if buffer's address is bogus, but
2552 that is the user's responsibility. */
2553 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2554 }
fa9a63c5 2555 else
25fe55af
RS
2556 { /* Caller did not allocate a buffer. Do it for them. */
2557 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2558 }
fa9a63c5
RM
2559 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2560
2561 bufp->allocated = INIT_BUF_SIZE;
2562 }
2563
2564 begalt = b = bufp->buffer;
2565
2566 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2567 while (1)
fa9a63c5 2568 {
f9b0fd99
RS
2569 if (p == pend)
2570 {
2571 /* If this is the end of an included regexp,
2572 pop back to the main regexp and try again. */
2573 if (in_subpattern)
2574 {
2575 in_subpattern = 0;
2576 pattern = main_pattern;
2577 p = main_p;
2578 pend = main_pend;
2579 continue;
2580 }
2581 /* If this is the end of the main regexp, we are done. */
2582 break;
2583 }
2584
fa9a63c5
RM
2585 PATFETCH (c);
2586
2587 switch (c)
25fe55af 2588 {
f9b0fd99
RS
2589 case ' ':
2590 {
2591 re_char *p1 = p;
2592
2593 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2594 spaces normally. And don't try to do this recursively. */
2595 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2596 goto normal_char;
2597
2598 /* Peek past following spaces. */
2599 while (p1 != pend)
2600 {
2601 if (*p1 != ' ')
2602 break;
2603 p1++;
2604 }
2605 /* If the spaces are followed by a repetition op,
2606 treat them normally. */
c721eee5
RS
2607 if (p1 != pend
2608 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2609 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2610 goto normal_char;
2611
2612 /* Replace the spaces with the whitespace regexp. */
2613 in_subpattern = 1;
2614 main_p = p1;
2615 main_pend = pend;
2616 main_pattern = pattern;
2617 p = pattern = whitespace_regexp;
2618 pend = p + strlen (p);
2619 break;
2620 }
2621
25fe55af
RS
2622 case '^':
2623 {
2624 if ( /* If at start of pattern, it's an operator. */
2625 p == pattern + 1
2626 /* If context independent, it's an operator. */
2627 || syntax & RE_CONTEXT_INDEP_ANCHORS
2628 /* Otherwise, depends on what's come before. */
2629 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2630 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2631 else
2632 goto normal_char;
2633 }
2634 break;
2635
2636
2637 case '$':
2638 {
2639 if ( /* If at end of pattern, it's an operator. */
2640 p == pend
2641 /* If context independent, it's an operator. */
2642 || syntax & RE_CONTEXT_INDEP_ANCHORS
2643 /* Otherwise, depends on what's next. */
2644 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2645 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2646 else
2647 goto normal_char;
2648 }
2649 break;
fa9a63c5
RM
2650
2651
2652 case '+':
25fe55af
RS
2653 case '?':
2654 if ((syntax & RE_BK_PLUS_QM)
2655 || (syntax & RE_LIMITED_OPS))
2656 goto normal_char;
2657 handle_plus:
2658 case '*':
2659 /* If there is no previous pattern... */
2660 if (!laststart)
2661 {
2662 if (syntax & RE_CONTEXT_INVALID_OPS)
2663 FREE_STACK_RETURN (REG_BADRPT);
2664 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2665 goto normal_char;
2666 }
2667
2668 {
25fe55af 2669 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2670 boolean zero_times_ok = 0, many_times_ok = 0;
2671 boolean greedy = 1;
25fe55af
RS
2672
2673 /* If there is a sequence of repetition chars, collapse it
2674 down to just one (the right one). We can't combine
2675 interval operators with these because of, e.g., `a{2}*',
2676 which should only match an even number of `a's. */
2677
2678 for (;;)
2679 {
0b32bf0e 2680 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2681 && c == '?' && (zero_times_ok || many_times_ok))
2682 greedy = 0;
2683 else
2684 {
2685 zero_times_ok |= c != '+';
2686 many_times_ok |= c != '?';
2687 }
25fe55af
RS
2688
2689 if (p == pend)
2690 break;
ed0767d8
SM
2691 else if (*p == '*'
2692 || (!(syntax & RE_BK_PLUS_QM)
2693 && (*p == '+' || *p == '?')))
25fe55af 2694 ;
ed0767d8 2695 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2696 {
ed0767d8
SM
2697 if (p+1 == pend)
2698 FREE_STACK_RETURN (REG_EESCAPE);
2699 if (p[1] == '+' || p[1] == '?')
2700 PATFETCH (c); /* Gobble up the backslash. */
2701 else
2702 break;
25fe55af
RS
2703 }
2704 else
ed0767d8 2705 break;
25fe55af 2706 /* If we get here, we found another repeat character. */
ed0767d8
SM
2707 PATFETCH (c);
2708 }
25fe55af
RS
2709
2710 /* Star, etc. applied to an empty pattern is equivalent
2711 to an empty pattern. */
4e8a9132 2712 if (!laststart || laststart == b)
25fe55af
RS
2713 break;
2714
2715 /* Now we know whether or not zero matches is allowed
2716 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2717 if (greedy)
2718 {
99633e97 2719 if (many_times_ok)
4e8a9132
SM
2720 {
2721 boolean simple = skip_one_char (laststart) == b;
2722 unsigned int startoffset = 0;
f6a3f532 2723 re_opcode_t ofj =
01618498 2724 /* Check if the loop can match the empty string. */
6df42991
SM
2725 (simple || !analyse_first (laststart, b, NULL, 0))
2726 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2727 assert (skip_one_char (laststart) <= b);
177c0ea7 2728
4e8a9132
SM
2729 if (!zero_times_ok && simple)
2730 { /* Since simple * loops can be made faster by using
2731 on_failure_keep_string_jump, we turn simple P+
2732 into PP* if P is simple. */
2733 unsigned char *p1, *p2;
2734 startoffset = b - laststart;
2735 GET_BUFFER_SPACE (startoffset);
2736 p1 = b; p2 = laststart;
2737 while (p2 < p1)
2738 *b++ = *p2++;
2739 zero_times_ok = 1;
99633e97 2740 }
4e8a9132
SM
2741
2742 GET_BUFFER_SPACE (6);
2743 if (!zero_times_ok)
2744 /* A + loop. */
f6a3f532 2745 STORE_JUMP (ofj, b, b + 6);
99633e97 2746 else
4e8a9132
SM
2747 /* Simple * loops can use on_failure_keep_string_jump
2748 depending on what follows. But since we don't know
2749 that yet, we leave the decision up to
2750 on_failure_jump_smart. */
f6a3f532 2751 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2752 laststart + startoffset, b + 6);
99633e97 2753 b += 3;
4e8a9132 2754 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2755 b += 3;
2756 }
2757 else
2758 {
4e8a9132
SM
2759 /* A simple ? pattern. */
2760 assert (zero_times_ok);
2761 GET_BUFFER_SPACE (3);
2762 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2763 b += 3;
2764 }
1c8c6d39
DL
2765 }
2766 else /* not greedy */
2767 { /* I wish the greedy and non-greedy cases could be merged. */
2768
0683b6fa 2769 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2770 if (many_times_ok)
2771 {
f6a3f532
SM
2772 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2773
6df42991
SM
2774 /* The non-greedy multiple match looks like
2775 a repeat..until: we only need a conditional jump
2776 at the end of the loop. */
f6a3f532
SM
2777 if (emptyp) BUF_PUSH (no_op);
2778 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2779 : on_failure_jump, b, laststart);
1c8c6d39
DL
2780 b += 3;
2781 if (zero_times_ok)
2782 {
2783 /* The repeat...until naturally matches one or more.
2784 To also match zero times, we need to first jump to
6df42991 2785 the end of the loop (its conditional jump). */
1c8c6d39
DL
2786 INSERT_JUMP (jump, laststart, b);
2787 b += 3;
2788 }
2789 }
2790 else
2791 {
2792 /* non-greedy a?? */
1c8c6d39
DL
2793 INSERT_JUMP (jump, laststart, b + 3);
2794 b += 3;
2795 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2796 b += 3;
2797 }
2798 }
2799 }
4e8a9132 2800 pending_exact = 0;
fa9a63c5
RM
2801 break;
2802
2803
2804 case '.':
25fe55af
RS
2805 laststart = b;
2806 BUF_PUSH (anychar);
2807 break;
fa9a63c5
RM
2808
2809
25fe55af
RS
2810 case '[':
2811 {
b18215fc 2812 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2813
25fe55af 2814 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2815
25fe55af
RS
2816 /* Ensure that we have enough space to push a charset: the
2817 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2818 GET_BUFFER_SPACE (34);
2819
25fe55af 2820 laststart = b;
e318085a 2821
25fe55af
RS
2822 /* We test `*p == '^' twice, instead of using an if
2823 statement, so we only need one BUF_PUSH. */
2824 BUF_PUSH (*p == '^' ? charset_not : charset);
2825 if (*p == '^')
2826 p++;
e318085a 2827
25fe55af
RS
2828 /* Remember the first position in the bracket expression. */
2829 p1 = p;
e318085a 2830
25fe55af
RS
2831 /* Push the number of bytes in the bitmap. */
2832 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2833
25fe55af
RS
2834 /* Clear the whole map. */
2835 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2836
25fe55af
RS
2837 /* charset_not matches newline according to a syntax bit. */
2838 if ((re_opcode_t) b[-2] == charset_not
2839 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2840 SET_LIST_BIT ('\n');
fa9a63c5 2841
25fe55af
RS
2842 /* Read in characters and ranges, setting map bits. */
2843 for (;;)
2844 {
b18215fc 2845 boolean escaped_char = false;
2d1675e4 2846 const unsigned char *p2 = p;
e318085a 2847
25fe55af 2848 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2849
36595814
SM
2850 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2851 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2852 So the translation is done later in a loop. Example:
2853 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2854 PATFETCH (c);
e318085a 2855
25fe55af
RS
2856 /* \ might escape characters inside [...] and [^...]. */
2857 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2858 {
2859 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2860
2861 PATFETCH (c);
b18215fc 2862 escaped_char = true;
25fe55af 2863 }
b18215fc
RS
2864 else
2865 {
657fcfbd
RS
2866 /* Could be the end of the bracket expression. If it's
2867 not (i.e., when the bracket expression is `[]' so
2868 far), the ']' character bit gets set way below. */
2d1675e4 2869 if (c == ']' && p2 != p1)
657fcfbd 2870 break;
25fe55af 2871 }
b18215fc 2872
b18215fc
RS
2873 /* What should we do for the character which is
2874 greater than 0x7F, but not BASE_LEADING_CODE_P?
2875 XXX */
2876
25fe55af
RS
2877 /* See if we're at the beginning of a possible character
2878 class. */
b18215fc 2879
2d1675e4
SM
2880 if (!escaped_char &&
2881 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2882 {
2883 /* Leave room for the null. */
14473664 2884 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2885 const unsigned char *class_beg;
b18215fc 2886
25fe55af
RS
2887 PATFETCH (c);
2888 c1 = 0;
ed0767d8 2889 class_beg = p;
b18215fc 2890
25fe55af
RS
2891 /* If pattern is `[[:'. */
2892 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2893
25fe55af
RS
2894 for (;;)
2895 {
14473664
SM
2896 PATFETCH (c);
2897 if ((c == ':' && *p == ']') || p == pend)
2898 break;
2899 if (c1 < CHAR_CLASS_MAX_LENGTH)
2900 str[c1++] = c;
2901 else
2902 /* This is in any case an invalid class name. */
2903 str[0] = '\0';
25fe55af
RS
2904 }
2905 str[c1] = '\0';
b18215fc
RS
2906
2907 /* If isn't a word bracketed by `[:' and `:]':
2908 undo the ending character, the letters, and
2909 leave the leading `:' and `[' (but set bits for
2910 them). */
25fe55af
RS
2911 if (c == ':' && *p == ']')
2912 {
36595814 2913 re_wchar_t ch;
14473664
SM
2914 re_wctype_t cc;
2915
2916 cc = re_wctype (str);
2917
2918 if (cc == 0)
fa9a63c5
RM
2919 FREE_STACK_RETURN (REG_ECTYPE);
2920
14473664
SM
2921 /* Throw away the ] at the end of the character
2922 class. */
2923 PATFETCH (c);
fa9a63c5 2924
14473664 2925 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2926
96cc36cc
RS
2927 /* Most character classes in a multibyte match
2928 just set a flag. Exceptions are is_blank,
2929 is_digit, is_cntrl, and is_xdigit, since
2930 they can only match ASCII characters. We
14473664
SM
2931 don't need to handle them for multibyte.
2932 They are distinguished by a negative wctype. */
96cc36cc 2933
2d1675e4 2934 if (multibyte)
14473664
SM
2935 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2936 re_wctype_to_bit (cc));
96cc36cc 2937
14473664 2938 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
25fe55af 2939 {
7ae68633 2940 int translated = TRANSLATE (ch);
14473664 2941 if (re_iswctype (btowc (ch), cc))
96cc36cc 2942 SET_LIST_BIT (translated);
25fe55af 2943 }
b18215fc
RS
2944
2945 /* Repeat the loop. */
2946 continue;
25fe55af
RS
2947 }
2948 else
2949 {
ed0767d8
SM
2950 /* Go back to right after the "[:". */
2951 p = class_beg;
25fe55af 2952 SET_LIST_BIT ('[');
b18215fc
RS
2953
2954 /* Because the `:' may starts the range, we
2955 can't simply set bit and repeat the loop.
25fe55af 2956 Instead, just set it to C and handle below. */
b18215fc 2957 c = ':';
25fe55af
RS
2958 }
2959 }
b18215fc
RS
2960
2961 if (p < pend && p[0] == '-' && p[1] != ']')
2962 {
2963
2964 /* Discard the `-'. */
2965 PATFETCH (c1);
2966
2967 /* Fetch the character which ends the range. */
2968 PATFETCH (c1);
b18215fc 2969
b54f61ed 2970 if (SINGLE_BYTE_CHAR_P (c))
e934739e 2971 {
b54f61ed
KH
2972 if (! SINGLE_BYTE_CHAR_P (c1))
2973 {
3ff2446d
KH
2974 /* Handle a range starting with a
2975 character of less than 256, and ending
2976 with a character of not less than 256.
2977 Split that into two ranges, the low one
2978 ending at 0377, and the high one
2979 starting at the smallest character in
2980 the charset of C1 and ending at C1. */
b54f61ed 2981 int charset = CHAR_CHARSET (c1);
36595814
SM
2982 re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
2983
b54f61ed
KH
2984 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2985 c2, c1);
333526e0 2986 c1 = 0377;
b54f61ed 2987 }
e934739e
RS
2988 }
2989 else if (!SAME_CHARSET_P (c, c1))
b3e4c897 2990 FREE_STACK_RETURN (REG_ERANGEX);
e318085a 2991 }
25fe55af 2992 else
b18215fc
RS
2993 /* Range from C to C. */
2994 c1 = c;
2995
2996 /* Set the range ... */
2997 if (SINGLE_BYTE_CHAR_P (c))
2998 /* ... into bitmap. */
25fe55af 2999 {
01618498 3000 re_wchar_t this_char;
36595814 3001 re_wchar_t range_start = c, range_end = c1;
b18215fc
RS
3002
3003 /* If the start is after the end, the range is empty. */
3004 if (range_start > range_end)
3005 {
3006 if (syntax & RE_NO_EMPTY_RANGES)
3007 FREE_STACK_RETURN (REG_ERANGE);
3008 /* Else, repeat the loop. */
3009 }
3010 else
3011 {
3012 for (this_char = range_start; this_char <= range_end;
3013 this_char++)
3014 SET_LIST_BIT (TRANSLATE (this_char));
e934739e 3015 }
25fe55af 3016 }
e318085a 3017 else
b18215fc
RS
3018 /* ... into range table. */
3019 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
3020 }
3021
25fe55af
RS
3022 /* Discard any (non)matching list bytes that are all 0 at the
3023 end of the map. Decrease the map-length byte too. */
3024 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3025 b[-1]--;
3026 b += b[-1];
fa9a63c5 3027
96cc36cc
RS
3028 /* Build real range table from work area. */
3029 if (RANGE_TABLE_WORK_USED (range_table_work)
3030 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3031 {
3032 int i;
3033 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3034
b18215fc 3035 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3036 bytes for flags, two for COUNT, and three bytes for
3037 each character. */
3038 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3039
b18215fc
RS
3040 /* Indicate the existence of range table. */
3041 laststart[1] |= 0x80;
fa9a63c5 3042
96cc36cc
RS
3043 /* Store the character class flag bits into the range table.
3044 If not in emacs, these flag bits are always 0. */
3045 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3046 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3047
b18215fc
RS
3048 STORE_NUMBER_AND_INCR (b, used / 2);
3049 for (i = 0; i < used; i++)
3050 STORE_CHARACTER_AND_INCR
3051 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3052 }
25fe55af
RS
3053 }
3054 break;
fa9a63c5
RM
3055
3056
b18215fc 3057 case '(':
25fe55af
RS
3058 if (syntax & RE_NO_BK_PARENS)
3059 goto handle_open;
3060 else
3061 goto normal_char;
fa9a63c5
RM
3062
3063
25fe55af
RS
3064 case ')':
3065 if (syntax & RE_NO_BK_PARENS)
3066 goto handle_close;
3067 else
3068 goto normal_char;
e318085a
RS
3069
3070
25fe55af
RS
3071 case '\n':
3072 if (syntax & RE_NEWLINE_ALT)
3073 goto handle_alt;
3074 else
3075 goto normal_char;
e318085a
RS
3076
3077
b18215fc 3078 case '|':
25fe55af
RS
3079 if (syntax & RE_NO_BK_VBAR)
3080 goto handle_alt;
3081 else
3082 goto normal_char;
3083
3084
3085 case '{':
3086 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3087 goto handle_interval;
3088 else
3089 goto normal_char;
3090
3091
3092 case '\\':
3093 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3094
3095 /* Do not translate the character after the \, so that we can
3096 distinguish, e.g., \B from \b, even if we normally would
3097 translate, e.g., B to b. */
36595814 3098 PATFETCH (c);
25fe55af
RS
3099
3100 switch (c)
3101 {
3102 case '(':
3103 if (syntax & RE_NO_BK_PARENS)
3104 goto normal_backslash;
3105
3106 handle_open:
505bde11
SM
3107 {
3108 int shy = 0;
3109 if (p+1 < pend)
3110 {
3111 /* Look for a special (?...) construct */
ed0767d8 3112 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3113 {
ed0767d8 3114 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
3115 PATFETCH (c);
3116 switch (c)
3117 {
3118 case ':': shy = 1; break;
3119 default:
3120 /* Only (?:...) is supported right now. */
3121 FREE_STACK_RETURN (REG_BADPAT);
3122 }
3123 }
505bde11
SM
3124 }
3125
3126 if (!shy)
3127 {
3128 bufp->re_nsub++;
3129 regnum++;
3130 }
25fe55af 3131
99633e97
SM
3132 if (COMPILE_STACK_FULL)
3133 {
3134 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3135 compile_stack_elt_t);
3136 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3137
99633e97
SM
3138 compile_stack.size <<= 1;
3139 }
25fe55af 3140
99633e97
SM
3141 /* These are the values to restore when we hit end of this
3142 group. They are all relative offsets, so that if the
3143 whole pattern moves because of realloc, they will still
3144 be valid. */
3145 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3146 COMPILE_STACK_TOP.fixup_alt_jump
3147 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3148 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3149 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
3150
3151 /* Do not push a
3152 start_memory for groups beyond the last one we can
3153 represent in the compiled pattern. */
3154 if (regnum <= MAX_REGNUM && !shy)
3155 BUF_PUSH_2 (start_memory, regnum);
3156
3157 compile_stack.avail++;
3158
3159 fixup_alt_jump = 0;
3160 laststart = 0;
3161 begalt = b;
3162 /* If we've reached MAX_REGNUM groups, then this open
3163 won't actually generate any code, so we'll have to
3164 clear pending_exact explicitly. */
3165 pending_exact = 0;
3166 break;
505bde11 3167 }
25fe55af
RS
3168
3169 case ')':
3170 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3171
3172 if (COMPILE_STACK_EMPTY)
505bde11
SM
3173 {
3174 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3175 goto normal_backslash;
3176 else
3177 FREE_STACK_RETURN (REG_ERPAREN);
3178 }
25fe55af
RS
3179
3180 handle_close:
505bde11 3181 FIXUP_ALT_JUMP ();
25fe55af
RS
3182
3183 /* See similar code for backslashed left paren above. */
3184 if (COMPILE_STACK_EMPTY)
505bde11
SM
3185 {
3186 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3187 goto normal_char;
3188 else
3189 FREE_STACK_RETURN (REG_ERPAREN);
3190 }
25fe55af
RS
3191
3192 /* Since we just checked for an empty stack above, this
3193 ``can't happen''. */
3194 assert (compile_stack.avail != 0);
3195 {
3196 /* We don't just want to restore into `regnum', because
3197 later groups should continue to be numbered higher,
3198 as in `(ab)c(de)' -- the second group is #2. */
3199 regnum_t this_group_regnum;
3200
3201 compile_stack.avail--;
3202 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3203 fixup_alt_jump
3204 = COMPILE_STACK_TOP.fixup_alt_jump
3205 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3206 : 0;
3207 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3208 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3209 /* If we've reached MAX_REGNUM groups, then this open
3210 won't actually generate any code, so we'll have to
3211 clear pending_exact explicitly. */
3212 pending_exact = 0;
e318085a 3213
25fe55af
RS
3214 /* We're at the end of the group, so now we know how many
3215 groups were inside this one. */
505bde11
SM
3216 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
3217 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
3218 }
3219 break;
3220
3221
3222 case '|': /* `\|'. */
3223 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3224 goto normal_backslash;
3225 handle_alt:
3226 if (syntax & RE_LIMITED_OPS)
3227 goto normal_char;
3228
3229 /* Insert before the previous alternative a jump which
3230 jumps to this alternative if the former fails. */
3231 GET_BUFFER_SPACE (3);
3232 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3233 pending_exact = 0;
3234 b += 3;
3235
3236 /* The alternative before this one has a jump after it
3237 which gets executed if it gets matched. Adjust that
3238 jump so it will jump to this alternative's analogous
3239 jump (put in below, which in turn will jump to the next
3240 (if any) alternative's such jump, etc.). The last such
3241 jump jumps to the correct final destination. A picture:
3242 _____ _____
3243 | | | |
3244 | v | v
3245 a | b | c
3246
3247 If we are at `b', then fixup_alt_jump right now points to a
3248 three-byte space after `a'. We'll put in the jump, set
3249 fixup_alt_jump to right after `b', and leave behind three
3250 bytes which we'll fill in when we get to after `c'. */
3251
505bde11 3252 FIXUP_ALT_JUMP ();
25fe55af
RS
3253
3254 /* Mark and leave space for a jump after this alternative,
3255 to be filled in later either by next alternative or
3256 when know we're at the end of a series of alternatives. */
3257 fixup_alt_jump = b;
3258 GET_BUFFER_SPACE (3);
3259 b += 3;
3260
3261 laststart = 0;
3262 begalt = b;
3263 break;
3264
3265
3266 case '{':
3267 /* If \{ is a literal. */
3268 if (!(syntax & RE_INTERVALS)
3269 /* If we're at `\{' and it's not the open-interval
3270 operator. */
4bb91c68 3271 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3272 goto normal_backslash;
3273
3274 handle_interval:
3275 {
3276 /* If got here, then the syntax allows intervals. */
3277
3278 /* At least (most) this many matches must be made. */
99633e97 3279 int lower_bound = 0, upper_bound = -1;
25fe55af 3280
ed0767d8 3281 beg_interval = p;
25fe55af 3282
25fe55af
RS
3283 GET_UNSIGNED_NUMBER (lower_bound);
3284
3285 if (c == ',')
ed0767d8 3286 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3287 else
3288 /* Interval such as `{1}' => match exactly once. */
3289 upper_bound = lower_bound;
3290
3291 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3292 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3293 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3294
3295 if (!(syntax & RE_NO_BK_BRACES))
3296 {
4bb91c68
SM
3297 if (c != '\\')
3298 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3299 if (p == pend)
3300 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3301 PATFETCH (c);
3302 }
3303
3304 if (c != '}')
4bb91c68 3305 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3306
3307 /* We just parsed a valid interval. */
3308
3309 /* If it's invalid to have no preceding re. */
3310 if (!laststart)
3311 {
3312 if (syntax & RE_CONTEXT_INVALID_OPS)
3313 FREE_STACK_RETURN (REG_BADRPT);
3314 else if (syntax & RE_CONTEXT_INDEP_OPS)
3315 laststart = b;
3316 else
3317 goto unfetch_interval;
3318 }
3319
6df42991
SM
3320 if (upper_bound == 0)
3321 /* If the upper bound is zero, just drop the sub pattern
3322 altogether. */
3323 b = laststart;
3324 else if (lower_bound == 1 && upper_bound == 1)
3325 /* Just match it once: nothing to do here. */
3326 ;
3327
3328 /* Otherwise, we have a nontrivial interval. When
3329 we're all done, the pattern will look like:
3330 set_number_at <jump count> <upper bound>
3331 set_number_at <succeed_n count> <lower bound>
3332 succeed_n <after jump addr> <succeed_n count>
3333 <body of loop>
3334 jump_n <succeed_n addr> <jump count>
3335 (The upper bound and `jump_n' are omitted if
3336 `upper_bound' is 1, though.) */
3337 else
3338 { /* If the upper bound is > 1, we need to insert
3339 more at the end of the loop. */
3340 unsigned int nbytes = (upper_bound < 0 ? 3
3341 : upper_bound > 1 ? 5 : 0);
3342 unsigned int startoffset = 0;
3343
3344 GET_BUFFER_SPACE (20); /* We might use less. */
3345
3346 if (lower_bound == 0)
3347 {
3348 /* A succeed_n that starts with 0 is really a
3349 a simple on_failure_jump_loop. */
3350 INSERT_JUMP (on_failure_jump_loop, laststart,
3351 b + 3 + nbytes);
3352 b += 3;
3353 }
3354 else
3355 {
3356 /* Initialize lower bound of the `succeed_n', even
3357 though it will be set during matching by its
3358 attendant `set_number_at' (inserted next),
3359 because `re_compile_fastmap' needs to know.
3360 Jump to the `jump_n' we might insert below. */
3361 INSERT_JUMP2 (succeed_n, laststart,
3362 b + 5 + nbytes,
3363 lower_bound);
3364 b += 5;
3365
3366 /* Code to initialize the lower bound. Insert
3367 before the `succeed_n'. The `5' is the last two
3368 bytes of this `set_number_at', plus 3 bytes of
3369 the following `succeed_n'. */
3370 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3371 b += 5;
3372 startoffset += 5;
3373 }
3374
3375 if (upper_bound < 0)
3376 {
3377 /* A negative upper bound stands for infinity,
3378 in which case it degenerates to a plain jump. */
3379 STORE_JUMP (jump, b, laststart + startoffset);
3380 b += 3;
3381 }
3382 else if (upper_bound > 1)
3383 { /* More than one repetition is allowed, so
3384 append a backward jump to the `succeed_n'
3385 that starts this interval.
3386
3387 When we've reached this during matching,
3388 we'll have matched the interval once, so
3389 jump back only `upper_bound - 1' times. */
3390 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3391 upper_bound - 1);
3392 b += 5;
3393
3394 /* The location we want to set is the second
3395 parameter of the `jump_n'; that is `b-2' as
3396 an absolute address. `laststart' will be
3397 the `set_number_at' we're about to insert;
3398 `laststart+3' the number to set, the source
3399 for the relative address. But we are
3400 inserting into the middle of the pattern --
3401 so everything is getting moved up by 5.
3402 Conclusion: (b - 2) - (laststart + 3) + 5,
3403 i.e., b - laststart.
3404
3405 We insert this at the beginning of the loop
3406 so that if we fail during matching, we'll
3407 reinitialize the bounds. */
3408 insert_op2 (set_number_at, laststart, b - laststart,
3409 upper_bound - 1, b);
3410 b += 5;
3411 }
3412 }
25fe55af
RS
3413 pending_exact = 0;
3414 beg_interval = NULL;
3415 }
3416 break;
3417
3418 unfetch_interval:
3419 /* If an invalid interval, match the characters as literals. */
3420 assert (beg_interval);
3421 p = beg_interval;
3422 beg_interval = NULL;
3423
3424 /* normal_char and normal_backslash need `c'. */
ed0767d8 3425 c = '{';
25fe55af
RS
3426
3427 if (!(syntax & RE_NO_BK_BRACES))
3428 {
ed0767d8
SM
3429 assert (p > pattern && p[-1] == '\\');
3430 goto normal_backslash;
25fe55af 3431 }
ed0767d8
SM
3432 else
3433 goto normal_char;
e318085a 3434
b18215fc 3435#ifdef emacs
25fe55af
RS
3436 /* There is no way to specify the before_dot and after_dot
3437 operators. rms says this is ok. --karl */
3438 case '=':
3439 BUF_PUSH (at_dot);
3440 break;
3441
3442 case 's':
3443 laststart = b;
3444 PATFETCH (c);
3445 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3446 break;
3447
3448 case 'S':
3449 laststart = b;
3450 PATFETCH (c);
3451 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3452 break;
b18215fc
RS
3453
3454 case 'c':
3455 laststart = b;
36595814 3456 PATFETCH (c);
b18215fc
RS
3457 BUF_PUSH_2 (categoryspec, c);
3458 break;
e318085a 3459
b18215fc
RS
3460 case 'C':
3461 laststart = b;
36595814 3462 PATFETCH (c);
b18215fc
RS
3463 BUF_PUSH_2 (notcategoryspec, c);
3464 break;
3465#endif /* emacs */
e318085a 3466
e318085a 3467
25fe55af 3468 case 'w':
4bb91c68
SM
3469 if (syntax & RE_NO_GNU_OPS)
3470 goto normal_char;
25fe55af 3471 laststart = b;
1fb352e0 3472 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3473 break;
e318085a 3474
e318085a 3475
25fe55af 3476 case 'W':
4bb91c68
SM
3477 if (syntax & RE_NO_GNU_OPS)
3478 goto normal_char;
25fe55af 3479 laststart = b;
1fb352e0 3480 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3481 break;
e318085a
RS
3482
3483
25fe55af 3484 case '<':
4bb91c68
SM
3485 if (syntax & RE_NO_GNU_OPS)
3486 goto normal_char;
25fe55af
RS
3487 BUF_PUSH (wordbeg);
3488 break;
e318085a 3489
25fe55af 3490 case '>':
4bb91c68
SM
3491 if (syntax & RE_NO_GNU_OPS)
3492 goto normal_char;
25fe55af
RS
3493 BUF_PUSH (wordend);
3494 break;
e318085a 3495
669fa600
SM
3496 case '_':
3497 if (syntax & RE_NO_GNU_OPS)
3498 goto normal_char;
3499 laststart = b;
3500 PATFETCH (c);
3501 if (c == '<')
3502 BUF_PUSH (symbeg);
3503 else if (c == '>')
3504 BUF_PUSH (symend);
3505 else
3506 FREE_STACK_RETURN (REG_BADPAT);
3507 break;
3508
25fe55af 3509 case 'b':
4bb91c68
SM
3510 if (syntax & RE_NO_GNU_OPS)
3511 goto normal_char;
25fe55af
RS
3512 BUF_PUSH (wordbound);
3513 break;
e318085a 3514
25fe55af 3515 case 'B':
4bb91c68
SM
3516 if (syntax & RE_NO_GNU_OPS)
3517 goto normal_char;
25fe55af
RS
3518 BUF_PUSH (notwordbound);
3519 break;
fa9a63c5 3520
25fe55af 3521 case '`':
4bb91c68
SM
3522 if (syntax & RE_NO_GNU_OPS)
3523 goto normal_char;
25fe55af
RS
3524 BUF_PUSH (begbuf);
3525 break;
e318085a 3526
25fe55af 3527 case '\'':
4bb91c68
SM
3528 if (syntax & RE_NO_GNU_OPS)
3529 goto normal_char;
25fe55af
RS
3530 BUF_PUSH (endbuf);
3531 break;
e318085a 3532
25fe55af
RS
3533 case '1': case '2': case '3': case '4': case '5':
3534 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3535 {
3536 regnum_t reg;
e318085a 3537
0cdd06f8
SM
3538 if (syntax & RE_NO_BK_REFS)
3539 goto normal_backslash;
e318085a 3540
0cdd06f8 3541 reg = c - '0';
e318085a 3542
0cdd06f8
SM
3543 /* Can't back reference to a subexpression before its end. */
3544 if (reg > regnum || group_in_compile_stack (compile_stack, reg))
3545 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3546
0cdd06f8
SM
3547 laststart = b;
3548 BUF_PUSH_2 (duplicate, reg);
3549 }
25fe55af 3550 break;
e318085a 3551
e318085a 3552
25fe55af
RS
3553 case '+':
3554 case '?':
3555 if (syntax & RE_BK_PLUS_QM)
3556 goto handle_plus;
3557 else
3558 goto normal_backslash;
3559
3560 default:
3561 normal_backslash:
3562 /* You might think it would be useful for \ to mean
3563 not to translate; but if we don't translate it
4bb91c68 3564 it will never match anything. */
25fe55af
RS
3565 goto normal_char;
3566 }
3567 break;
fa9a63c5
RM
3568
3569
3570 default:
25fe55af 3571 /* Expects the character in `c'. */
fa9a63c5 3572 normal_char:
36595814 3573 /* If no exactn currently being built. */
25fe55af 3574 if (!pending_exact
fa9a63c5 3575
25fe55af
RS
3576 /* If last exactn not at current position. */
3577 || pending_exact + *pending_exact + 1 != b
5e69f11e 3578
25fe55af 3579 /* We have only one byte following the exactn for the count. */
2d1675e4 3580 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3581
25fe55af 3582 /* If followed by a repetition operator. */
9d99031f 3583 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3584 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3585 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3586 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3587 || ((syntax & RE_INTERVALS)
25fe55af 3588 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3589 ? p != pend && *p == '{'
3590 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3591 {
3592 /* Start building a new exactn. */
5e69f11e 3593
25fe55af 3594 laststart = b;
fa9a63c5
RM
3595
3596 BUF_PUSH_2 (exactn, 0);
3597 pending_exact = b - 1;
25fe55af 3598 }
5e69f11e 3599
2d1675e4
SM
3600 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3601 {
e0277a47
KH
3602 int len;
3603
36595814 3604 c = TRANSLATE (c);
e0277a47
KH
3605 if (multibyte)
3606 len = CHAR_STRING (c, b);
3607 else
3608 *b = c, len = 1;
2d1675e4
SM
3609 b += len;
3610 (*pending_exact) += len;
3611 }
3612
fa9a63c5 3613 break;
25fe55af 3614 } /* switch (c) */
fa9a63c5
RM
3615 } /* while p != pend */
3616
5e69f11e 3617
fa9a63c5 3618 /* Through the pattern now. */
5e69f11e 3619
505bde11 3620 FIXUP_ALT_JUMP ();
fa9a63c5 3621
5e69f11e 3622 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3623 FREE_STACK_RETURN (REG_EPAREN);
3624
3625 /* If we don't want backtracking, force success
3626 the first time we reach the end of the compiled pattern. */
3627 if (syntax & RE_NO_POSIX_BACKTRACKING)
3628 BUF_PUSH (succeed);
3629
fa9a63c5
RM
3630 /* We have succeeded; set the length of the buffer. */
3631 bufp->used = b - bufp->buffer;
3632
3633#ifdef DEBUG
99633e97 3634 if (debug > 0)
fa9a63c5 3635 {
505bde11 3636 re_compile_fastmap (bufp);
fa9a63c5
RM
3637 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3638 print_compiled_pattern (bufp);
3639 }
99633e97 3640 debug--;
fa9a63c5
RM
3641#endif /* DEBUG */
3642
3643#ifndef MATCH_MAY_ALLOCATE
3644 /* Initialize the failure stack to the largest possible stack. This
3645 isn't necessary unless we're trying to avoid calling alloca in
3646 the search and match routines. */
3647 {
3648 int num_regs = bufp->re_nsub + 1;
3649
320a2a73 3650 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3651 {
a26f4ccd 3652 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3653
fa9a63c5
RM
3654 if (! fail_stack.stack)
3655 fail_stack.stack
a073faa6 3656 = (fail_stack_elt_t *) malloc (fail_stack.size
8ea094cf 3657 * sizeof (fail_stack_elt_t));
fa9a63c5
RM
3658 else
3659 fail_stack.stack
a073faa6 3660 = (fail_stack_elt_t *) realloc (fail_stack.stack,
8ea094cf
AS
3661 (fail_stack.size
3662 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3663 }
3664
3665 regex_grow_registers (num_regs);
3666 }
3667#endif /* not MATCH_MAY_ALLOCATE */
3668
7c67c9f6 3669 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3670} /* regex_compile */
3671\f
3672/* Subroutines for `regex_compile'. */
3673
25fe55af 3674/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3675
3676static void
3677store_op1 (op, loc, arg)
3678 re_opcode_t op;
3679 unsigned char *loc;
3680 int arg;
3681{
3682 *loc = (unsigned char) op;
3683 STORE_NUMBER (loc + 1, arg);
3684}
3685
3686
3687/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3688
3689static void
3690store_op2 (op, loc, arg1, arg2)
3691 re_opcode_t op;
3692 unsigned char *loc;
3693 int arg1, arg2;
3694{
3695 *loc = (unsigned char) op;
3696 STORE_NUMBER (loc + 1, arg1);
3697 STORE_NUMBER (loc + 3, arg2);
3698}
3699
3700
3701/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3702 for OP followed by two-byte integer parameter ARG. */
3703
3704static void
3705insert_op1 (op, loc, arg, end)
3706 re_opcode_t op;
3707 unsigned char *loc;
3708 int arg;
5e69f11e 3709 unsigned char *end;
fa9a63c5
RM
3710{
3711 register unsigned char *pfrom = end;
3712 register unsigned char *pto = end + 3;
3713
3714 while (pfrom != loc)
3715 *--pto = *--pfrom;
5e69f11e 3716
fa9a63c5
RM
3717 store_op1 (op, loc, arg);
3718}
3719
3720
3721/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3722
3723static void
3724insert_op2 (op, loc, arg1, arg2, end)
3725 re_opcode_t op;
3726 unsigned char *loc;
3727 int arg1, arg2;
5e69f11e 3728 unsigned char *end;
fa9a63c5
RM
3729{
3730 register unsigned char *pfrom = end;
3731 register unsigned char *pto = end + 5;
3732
3733 while (pfrom != loc)
3734 *--pto = *--pfrom;
5e69f11e 3735
fa9a63c5
RM
3736 store_op2 (op, loc, arg1, arg2);
3737}
3738
3739
3740/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3741 after an alternative or a begin-subexpression. We assume there is at
3742 least one character before the ^. */
3743
3744static boolean
3745at_begline_loc_p (pattern, p, syntax)
01618498 3746 re_char *pattern, *p;
fa9a63c5
RM
3747 reg_syntax_t syntax;
3748{
01618498 3749 re_char *prev = p - 2;
fa9a63c5 3750 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3751
fa9a63c5
RM
3752 return
3753 /* After a subexpression? */
3754 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3755 /* After an alternative? */
d2af47df
SM
3756 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3757 /* After a shy subexpression? */
3758 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3759 && prev[-1] == '?' && prev[-2] == '('
3760 && (syntax & RE_NO_BK_PARENS
3761 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3762}
3763
3764
3765/* The dual of at_begline_loc_p. This one is for $. We assume there is
3766 at least one character after the $, i.e., `P < PEND'. */
3767
3768static boolean
3769at_endline_loc_p (p, pend, syntax)
01618498 3770 re_char *p, *pend;
99633e97 3771 reg_syntax_t syntax;
fa9a63c5 3772{
01618498 3773 re_char *next = p;
fa9a63c5 3774 boolean next_backslash = *next == '\\';
01618498 3775 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3776
fa9a63c5
RM
3777 return
3778 /* Before a subexpression? */
3779 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3780 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3781 /* Before an alternative? */
3782 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3783 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3784}
3785
3786
5e69f11e 3787/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3788 false if it's not. */
3789
3790static boolean
3791group_in_compile_stack (compile_stack, regnum)
3792 compile_stack_type compile_stack;
3793 regnum_t regnum;
3794{
3795 int this_element;
3796
5e69f11e
RM
3797 for (this_element = compile_stack.avail - 1;
3798 this_element >= 0;
fa9a63c5
RM
3799 this_element--)
3800 if (compile_stack.stack[this_element].regnum == regnum)
3801 return true;
3802
3803 return false;
3804}
fa9a63c5 3805\f
f6a3f532
SM
3806/* analyse_first.
3807 If fastmap is non-NULL, go through the pattern and fill fastmap
3808 with all the possible leading chars. If fastmap is NULL, don't
3809 bother filling it up (obviously) and only return whether the
3810 pattern could potentially match the empty string.
3811
3812 Return 1 if p..pend might match the empty string.
3813 Return 0 if p..pend matches at least one char.
01618498 3814 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3815
3816static int
3817analyse_first (p, pend, fastmap, multibyte)
01618498 3818 re_char *p, *pend;
f6a3f532
SM
3819 char *fastmap;
3820 const int multibyte;
fa9a63c5 3821{
505bde11 3822 int j, k;
1fb352e0 3823 boolean not;
fa9a63c5 3824
b18215fc 3825 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3826 flag is set true. */
b18215fc
RS
3827 boolean match_any_multibyte_characters = false;
3828
f6a3f532 3829 assert (p);
5e69f11e 3830
505bde11
SM
3831 /* The loop below works as follows:
3832 - It has a working-list kept in the PATTERN_STACK and which basically
3833 starts by only containing a pointer to the first operation.
3834 - If the opcode we're looking at is a match against some set of
3835 chars, then we add those chars to the fastmap and go on to the
3836 next work element from the worklist (done via `break').
3837 - If the opcode is a control operator on the other hand, we either
3838 ignore it (if it's meaningless at this point, such as `start_memory')
3839 or execute it (if it's a jump). If the jump has several destinations
3840 (i.e. `on_failure_jump'), then we push the other destination onto the
3841 worklist.
3842 We guarantee termination by ignoring backward jumps (more or less),
3843 so that `p' is monotonically increasing. More to the point, we
3844 never set `p' (or push) anything `<= p1'. */
3845
01618498 3846 while (p < pend)
fa9a63c5 3847 {
505bde11
SM
3848 /* `p1' is used as a marker of how far back a `on_failure_jump'
3849 can go without being ignored. It is normally equal to `p'
3850 (which prevents any backward `on_failure_jump') except right
3851 after a plain `jump', to allow patterns such as:
3852 0: jump 10
3853 3..9: <body>
3854 10: on_failure_jump 3
3855 as used for the *? operator. */
01618498 3856 re_char *p1 = p;
5e69f11e 3857
fa9a63c5
RM
3858 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3859 {
f6a3f532 3860 case succeed:
01618498 3861 return 1;
f6a3f532 3862 continue;
fa9a63c5 3863
fa9a63c5 3864 case duplicate:
505bde11
SM
3865 /* If the first character has to match a backreference, that means
3866 that the group was empty (since it already matched). Since this
3867 is the only case that interests us here, we can assume that the
3868 backreference must match the empty string. */
3869 p++;
3870 continue;
fa9a63c5
RM
3871
3872
3873 /* Following are the cases which match a character. These end
25fe55af 3874 with `break'. */
fa9a63c5
RM
3875
3876 case exactn:
e0277a47
KH
3877 if (fastmap)
3878 {
3879 int c = RE_STRING_CHAR (p + 1, pend - p);
3880
3881 if (SINGLE_BYTE_CHAR_P (c))
3882 fastmap[c] = 1;
3883 else
3884 fastmap[p[1]] = 1;
3885 }
fa9a63c5
RM
3886 break;
3887
3888
1fb352e0
SM
3889 case anychar:
3890 /* We could put all the chars except for \n (and maybe \0)
3891 but we don't bother since it is generally not worth it. */
f6a3f532 3892 if (!fastmap) break;
01618498 3893 return -1;
fa9a63c5
RM
3894
3895
b18215fc 3896 case charset_not:
ba5c004d
RS
3897 /* Chars beyond end of bitmap are possible matches.
3898 All the single-byte codes can occur in multibyte buffers.
3899 So any that are not listed in the charset
3900 are possible matches, even in multibyte buffers. */
1fb352e0 3901 if (!fastmap) break;
b18215fc 3902 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3903 j < (1 << BYTEWIDTH); j++)
b18215fc 3904 fastmap[j] = 1;
1fb352e0
SM
3905 /* Fallthrough */
3906 case charset:
3907 if (!fastmap) break;
3908 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3909 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3910 j >= 0; j--)
1fb352e0 3911 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
b18215fc
RS
3912 fastmap[j] = 1;
3913
1fb352e0
SM
3914 if ((not && multibyte)
3915 /* Any character set can possibly contain a character
3916 which doesn't match the specified set of characters. */
3917 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3918 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3919 /* If we can match a character class, we can match
3920 any character set. */
b18215fc
RS
3921 {
3922 set_fastmap_for_multibyte_characters:
3923 if (match_any_multibyte_characters == false)
3924 {
3925 for (j = 0x80; j < 0xA0; j++) /* XXX */
3926 if (BASE_LEADING_CODE_P (j))
3927 fastmap[j] = 1;
3928 match_any_multibyte_characters = true;
3929 }
3930 }
b18215fc 3931
1fb352e0
SM
3932 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3933 && match_any_multibyte_characters == false)
3934 {
3935 /* Set fastmap[I] 1 where I is a base leading code of each
3936 multibyte character in the range table. */
3937 int c, count;
b18215fc 3938
1fb352e0 3939 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3940 bits for a character class. */
1fb352e0 3941 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3942
1fb352e0
SM
3943 /* Extract the number of ranges in range table into COUNT. */
3944 EXTRACT_NUMBER_AND_INCR (count, p);
3945 for (; count > 0; count--, p += 2 * 3) /* XXX */
3946 {
3947 /* Extract the start of each range. */
3948 EXTRACT_CHARACTER (c, p);
3949 j = CHAR_CHARSET (c);
3950 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3951 }
3952 }
b18215fc
RS
3953 break;
3954
1fb352e0
SM
3955 case syntaxspec:
3956 case notsyntaxspec:
3957 if (!fastmap) break;
3958#ifndef emacs
3959 not = (re_opcode_t)p[-1] == notsyntaxspec;
3960 k = *p++;
3961 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3962 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3963 fastmap[j] = 1;
b18215fc 3964 break;
1fb352e0 3965#else /* emacs */
b18215fc
RS
3966 /* This match depends on text properties. These end with
3967 aborting optimizations. */
01618498 3968 return -1;
b18215fc
RS
3969
3970 case categoryspec:
b18215fc 3971 case notcategoryspec:
1fb352e0
SM
3972 if (!fastmap) break;
3973 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 3974 k = *p++;
1fb352e0
SM
3975 for (j = 0; j < (1 << BYTEWIDTH); j++)
3976 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
3977 fastmap[j] = 1;
3978
1fb352e0 3979 if (multibyte)
b18215fc 3980 /* Any character set can possibly contain a character
1fb352e0 3981 whose category is K (or not). */
b18215fc
RS
3982 goto set_fastmap_for_multibyte_characters;
3983 break;
3984
fa9a63c5 3985 /* All cases after this match the empty string. These end with
25fe55af 3986 `continue'. */
fa9a63c5 3987
fa9a63c5
RM
3988 case before_dot:
3989 case at_dot:
3990 case after_dot:
1fb352e0 3991#endif /* !emacs */
25fe55af
RS
3992 case no_op:
3993 case begline:
3994 case endline:
fa9a63c5
RM
3995 case begbuf:
3996 case endbuf:
3997 case wordbound:
3998 case notwordbound:
3999 case wordbeg:
4000 case wordend:
669fa600
SM
4001 case symbeg:
4002 case symend:
25fe55af 4003 continue;
fa9a63c5
RM
4004
4005
fa9a63c5 4006 case jump:
25fe55af 4007 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4008 if (j < 0)
4009 /* Backward jumps can only go back to code that we've already
4010 visited. `re_compile' should make sure this is true. */
4011 break;
25fe55af 4012 p += j;
505bde11
SM
4013 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4014 {
4015 case on_failure_jump:
4016 case on_failure_keep_string_jump:
505bde11 4017 case on_failure_jump_loop:
0683b6fa 4018 case on_failure_jump_nastyloop:
505bde11
SM
4019 case on_failure_jump_smart:
4020 p++;
4021 break;
4022 default:
4023 continue;
4024 };
4025 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4026 to jump back to "just after here". */
4027 /* Fallthrough */
fa9a63c5 4028
25fe55af
RS
4029 case on_failure_jump:
4030 case on_failure_keep_string_jump:
0683b6fa 4031 case on_failure_jump_nastyloop:
505bde11
SM
4032 case on_failure_jump_loop:
4033 case on_failure_jump_smart:
25fe55af 4034 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4035 if (p + j <= p1)
ed0767d8 4036 ; /* Backward jump to be ignored. */
01618498
SM
4037 else
4038 { /* We have to look down both arms.
4039 We first go down the "straight" path so as to minimize
4040 stack usage when going through alternatives. */
4041 int r = analyse_first (p, pend, fastmap, multibyte);
4042 if (r) return r;
4043 p += j;
4044 }
25fe55af 4045 continue;
fa9a63c5
RM
4046
4047
ed0767d8
SM
4048 case jump_n:
4049 /* This code simply does not properly handle forward jump_n. */
4050 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4051 p += 4;
4052 /* jump_n can either jump or fall through. The (backward) jump
4053 case has already been handled, so we only need to look at the
4054 fallthrough case. */
4055 continue;
177c0ea7 4056
fa9a63c5 4057 case succeed_n:
ed0767d8
SM
4058 /* If N == 0, it should be an on_failure_jump_loop instead. */
4059 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4060 p += 4;
4061 /* We only care about one iteration of the loop, so we don't
4062 need to consider the case where this behaves like an
4063 on_failure_jump. */
25fe55af 4064 continue;
fa9a63c5
RM
4065
4066
4067 case set_number_at:
25fe55af
RS
4068 p += 4;
4069 continue;
fa9a63c5
RM
4070
4071
4072 case start_memory:
25fe55af 4073 case stop_memory:
505bde11 4074 p += 1;
fa9a63c5
RM
4075 continue;
4076
4077
4078 default:
25fe55af
RS
4079 abort (); /* We have listed all the cases. */
4080 } /* switch *p++ */
fa9a63c5
RM
4081
4082 /* Getting here means we have found the possible starting
25fe55af 4083 characters for one path of the pattern -- and that the empty
01618498
SM
4084 string does not match. We need not follow this path further. */
4085 return 0;
fa9a63c5
RM
4086 } /* while p */
4087
01618498
SM
4088 /* We reached the end without matching anything. */
4089 return 1;
4090
f6a3f532
SM
4091} /* analyse_first */
4092\f
4093/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4094 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4095 characters can start a string that matches the pattern. This fastmap
4096 is used by re_search to skip quickly over impossible starting points.
4097
4098 Character codes above (1 << BYTEWIDTH) are not represented in the
4099 fastmap, but the leading codes are represented. Thus, the fastmap
4100 indicates which character sets could start a match.
4101
4102 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4103 area as BUFP->fastmap.
4104
4105 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4106 the pattern buffer.
4107
4108 Returns 0 if we succeed, -2 if an internal error. */
4109
4110int
4111re_compile_fastmap (bufp)
4112 struct re_pattern_buffer *bufp;
4113{
4114 char *fastmap = bufp->fastmap;
4115 int analysis;
4116
4117 assert (fastmap && bufp->buffer);
4118
4119 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4120 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4121
4122 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4123 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4124 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4125 return 0;
4126} /* re_compile_fastmap */
4127\f
4128/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4129 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4130 this memory for recording register information. STARTS and ENDS
4131 must be allocated using the malloc library routine, and must each
4132 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4133
4134 If NUM_REGS == 0, then subsequent matches should allocate their own
4135 register data.
4136
4137 Unless this function is called, the first search or match using
4138 PATTERN_BUFFER will allocate its own register data, without
4139 freeing the old data. */
4140
4141void
4142re_set_registers (bufp, regs, num_regs, starts, ends)
4143 struct re_pattern_buffer *bufp;
4144 struct re_registers *regs;
4145 unsigned num_regs;
4146 regoff_t *starts, *ends;
4147{
4148 if (num_regs)
4149 {
4150 bufp->regs_allocated = REGS_REALLOCATE;
4151 regs->num_regs = num_regs;
4152 regs->start = starts;
4153 regs->end = ends;
4154 }
4155 else
4156 {
4157 bufp->regs_allocated = REGS_UNALLOCATED;
4158 regs->num_regs = 0;
4159 regs->start = regs->end = (regoff_t *) 0;
4160 }
4161}
c0f9ea08 4162WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4163\f
25fe55af 4164/* Searching routines. */
fa9a63c5
RM
4165
4166/* Like re_search_2, below, but only one string is specified, and
4167 doesn't let you say where to stop matching. */
4168
4169int
4170re_search (bufp, string, size, startpos, range, regs)
4171 struct re_pattern_buffer *bufp;
4172 const char *string;
4173 int size, startpos, range;
4174 struct re_registers *regs;
4175{
5e69f11e 4176 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4177 regs, size);
4178}
c0f9ea08 4179WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4180
70806df6
KH
4181/* Head address of virtual concatenation of string. */
4182#define HEAD_ADDR_VSTRING(P) \
4183 (((P) >= size1 ? string2 : string1))
4184
b18215fc
RS
4185/* End address of virtual concatenation of string. */
4186#define STOP_ADDR_VSTRING(P) \
4187 (((P) >= size1 ? string2 + size2 : string1 + size1))
4188
4189/* Address of POS in the concatenation of virtual string. */
4190#define POS_ADDR_VSTRING(POS) \
4191 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4192
4193/* Using the compiled pattern in BUFP->buffer, first tries to match the
4194 virtual concatenation of STRING1 and STRING2, starting first at index
4195 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4196
fa9a63c5 4197 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4198
fa9a63c5
RM
4199 RANGE is how far to scan while trying to match. RANGE = 0 means try
4200 only at STARTPOS; in general, the last start tried is STARTPOS +
4201 RANGE.
5e69f11e 4202
fa9a63c5
RM
4203 In REGS, return the indices of the virtual concatenation of STRING1
4204 and STRING2 that matched the entire BUFP->buffer and its contained
4205 subexpressions.
5e69f11e 4206
fa9a63c5
RM
4207 Do not consider matching one past the index STOP in the virtual
4208 concatenation of STRING1 and STRING2.
4209
4210 We return either the position in the strings at which the match was
4211 found, -1 if no match, or -2 if error (such as failure
4212 stack overflow). */
4213
4214int
66f0296e 4215re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4216 struct re_pattern_buffer *bufp;
66f0296e 4217 const char *str1, *str2;
fa9a63c5
RM
4218 int size1, size2;
4219 int startpos;
4220 int range;
4221 struct re_registers *regs;
4222 int stop;
4223{
4224 int val;
66f0296e
SM
4225 re_char *string1 = (re_char*) str1;
4226 re_char *string2 = (re_char*) str2;
fa9a63c5 4227 register char *fastmap = bufp->fastmap;
6676cb1c 4228 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4229 int total_size = size1 + size2;
4230 int endpos = startpos + range;
c0f9ea08 4231 boolean anchored_start;
fa9a63c5 4232
25fe55af 4233 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4234 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4235
fa9a63c5
RM
4236 /* Check for out-of-range STARTPOS. */
4237 if (startpos < 0 || startpos > total_size)
4238 return -1;
5e69f11e 4239
fa9a63c5 4240 /* Fix up RANGE if it might eventually take us outside
34597fa9 4241 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4242 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4243 if (endpos < 0)
4244 range = 0 - startpos;
fa9a63c5
RM
4245 else if (endpos > total_size)
4246 range = total_size - startpos;
4247
4248 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4249 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4250 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4251 {
4252 if (startpos > 0)
4253 return -1;
4254 else
7b140fd7 4255 range = 0;
fa9a63c5
RM
4256 }
4257
ae4788a8
RS
4258#ifdef emacs
4259 /* In a forward search for something that starts with \=.
4260 don't keep searching past point. */
4261 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4262 {
7b140fd7
RS
4263 range = PT_BYTE - BEGV_BYTE - startpos;
4264 if (range < 0)
ae4788a8
RS
4265 return -1;
4266 }
4267#endif /* emacs */
4268
fa9a63c5
RM
4269 /* Update the fastmap now if not correct already. */
4270 if (fastmap && !bufp->fastmap_accurate)
01618498 4271 re_compile_fastmap (bufp);
5e69f11e 4272
c8499ba5 4273 /* See whether the pattern is anchored. */
c0f9ea08 4274 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4275
b18215fc 4276#ifdef emacs
cc9b4df2
KH
4277 gl_state.object = re_match_object;
4278 {
99633e97 4279 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4280
4281 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4282 }
b18215fc
RS
4283#endif
4284
fa9a63c5
RM
4285 /* Loop through the string, looking for a place to start matching. */
4286 for (;;)
5e69f11e 4287 {
c8499ba5
RS
4288 /* If the pattern is anchored,
4289 skip quickly past places we cannot match.
4290 We don't bother to treat startpos == 0 specially
4291 because that case doesn't repeat. */
4292 if (anchored_start && startpos > 0)
4293 {
c0f9ea08
SM
4294 if (! ((startpos <= size1 ? string1[startpos - 1]
4295 : string2[startpos - size1 - 1])
4296 == '\n'))
c8499ba5
RS
4297 goto advance;
4298 }
4299
fa9a63c5 4300 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4301 cannot be the start of a match. If the pattern can match the
4302 null string, however, we don't need to skip characters; we want
4303 the first null string. */
fa9a63c5
RM
4304 if (fastmap && startpos < total_size && !bufp->can_be_null)
4305 {
66f0296e 4306 register re_char *d;
01618498 4307 register re_wchar_t buf_ch;
e934739e
RS
4308
4309 d = POS_ADDR_VSTRING (startpos);
4310
25fe55af 4311 if (range > 0) /* Searching forwards. */
fa9a63c5 4312 {
fa9a63c5
RM
4313 register int lim = 0;
4314 int irange = range;
4315
25fe55af
RS
4316 if (startpos < size1 && startpos + range >= size1)
4317 lim = range - (size1 - startpos);
fa9a63c5 4318
25fe55af
RS
4319 /* Written out as an if-else to avoid testing `translate'
4320 inside the loop. */
28ae27ae
AS
4321 if (RE_TRANSLATE_P (translate))
4322 {
e934739e
RS
4323 if (multibyte)
4324 while (range > lim)
4325 {
4326 int buf_charlen;
4327
4328 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4329 buf_charlen);
4330
4331 buf_ch = RE_TRANSLATE (translate, buf_ch);
4332 if (buf_ch >= 0400
4333 || fastmap[buf_ch])
4334 break;
4335
4336 range -= buf_charlen;
4337 d += buf_charlen;
4338 }
4339 else
cf1982d9
EZ
4340 {
4341 /* Convert *d to integer to shut up GCC's
4342 whining about comparison that is always
4343 true. */
4344 int di = *d;
4345
4346 while (range > lim
4347 && !fastmap[RE_TRANSLATE (translate, di)])
4348 {
4349 di = *(++d);
4350 range--;
4351 }
4352 }
e934739e 4353 }
fa9a63c5 4354 else
66f0296e 4355 while (range > lim && !fastmap[*d])
33c46939
RS
4356 {
4357 d++;
4358 range--;
4359 }
fa9a63c5
RM
4360
4361 startpos += irange - range;
4362 }
25fe55af 4363 else /* Searching backwards. */
fa9a63c5 4364 {
2d1675e4
SM
4365 int room = (startpos >= size1
4366 ? size2 + size1 - startpos
4367 : size1 - startpos);
4368 buf_ch = RE_STRING_CHAR (d, room);
4369 buf_ch = TRANSLATE (buf_ch);
fa9a63c5 4370
e934739e
RS
4371 if (! (buf_ch >= 0400
4372 || fastmap[buf_ch]))
fa9a63c5
RM
4373 goto advance;
4374 }
4375 }
4376
4377 /* If can't match the null string, and that's all we have left, fail. */
4378 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4379 && !bufp->can_be_null)
fa9a63c5
RM
4380 return -1;
4381
4382 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4383 startpos, regs, stop);
4384#ifndef REGEX_MALLOC
0b32bf0e 4385# ifdef C_ALLOCA
fa9a63c5 4386 alloca (0);
0b32bf0e 4387# endif
fa9a63c5
RM
4388#endif
4389
4390 if (val >= 0)
4391 return startpos;
5e69f11e 4392
fa9a63c5
RM
4393 if (val == -2)
4394 return -2;
4395
4396 advance:
5e69f11e 4397 if (!range)
25fe55af 4398 break;
5e69f11e 4399 else if (range > 0)
25fe55af 4400 {
b18215fc
RS
4401 /* Update STARTPOS to the next character boundary. */
4402 if (multibyte)
4403 {
66f0296e
SM
4404 re_char *p = POS_ADDR_VSTRING (startpos);
4405 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4406 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4407
4408 range -= len;
4409 if (range < 0)
4410 break;
4411 startpos += len;
4412 }
4413 else
4414 {
b560c397
RS
4415 range--;
4416 startpos++;
4417 }
e318085a 4418 }
fa9a63c5 4419 else
25fe55af
RS
4420 {
4421 range++;
4422 startpos--;
b18215fc
RS
4423
4424 /* Update STARTPOS to the previous character boundary. */
4425 if (multibyte)
4426 {
70806df6
KH
4427 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4428 re_char *p0 = p;
4429 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4430
4431 /* Find the head of multibyte form. */
70806df6
KH
4432 PREV_CHAR_BOUNDARY (p, phead);
4433 range += p0 - 1 - p;
4434 if (range > 0)
4435 break;
b18215fc 4436
70806df6 4437 startpos -= p0 - 1 - p;
b18215fc 4438 }
25fe55af 4439 }
fa9a63c5
RM
4440 }
4441 return -1;
4442} /* re_search_2 */
c0f9ea08 4443WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4444\f
4445/* Declarations and macros for re_match_2. */
4446
2d1675e4
SM
4447static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4448 register int len,
4449 RE_TRANSLATE_TYPE translate,
4450 const int multibyte));
fa9a63c5
RM
4451
4452/* This converts PTR, a pointer into one of the search strings `string1'
4453 and `string2' into an offset from the beginning of that string. */
4454#define POINTER_TO_OFFSET(ptr) \
4455 (FIRST_STRING_P (ptr) \
4456 ? ((regoff_t) ((ptr) - string1)) \
4457 : ((regoff_t) ((ptr) - string2 + size1)))
4458
fa9a63c5 4459/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4460 string2 if necessary.
4461 Check re_match_2_internal for a discussion of why end_match_2 might
4462 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4463#define PREFETCH() \
25fe55af 4464 while (d == dend) \
fa9a63c5
RM
4465 { \
4466 /* End of string2 => fail. */ \
25fe55af
RS
4467 if (dend == end_match_2) \
4468 goto fail; \
4bb91c68 4469 /* End of string1 => advance to string2. */ \
25fe55af 4470 d = string2; \
fa9a63c5
RM
4471 dend = end_match_2; \
4472 }
4473
f1ad044f
SM
4474/* Call before fetching a char with *d if you already checked other limits.
4475 This is meant for use in lookahead operations like wordend, etc..
4476 where we might need to look at parts of the string that might be
4477 outside of the LIMITs (i.e past `stop'). */
4478#define PREFETCH_NOLIMIT() \
4479 if (d == end1) \
4480 { \
4481 d = string2; \
4482 dend = end_match_2; \
4483 } \
fa9a63c5
RM
4484
4485/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 4486 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4487#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4488#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4489
4490
4491/* Test if D points to a character which is word-constituent. We have
4492 two special cases to check for: if past the end of string1, look at
4493 the first character in string2; and if before the beginning of
4494 string2, look at the last character in string1. */
4495#define WORDCHAR_P(d) \
4496 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4497 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4498 == Sword)
4499
9121ca40 4500/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4501
4502/* The comment at case wordbound is following one, but we don't use
4503 AT_WORD_BOUNDARY anymore to support multibyte form.
4504
4505 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
4506 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4507 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4508 macro and introducing temporary variables works around the bug. */
4509
9121ca40 4510#if 0
fa9a63c5
RM
4511/* Test if the character before D and the one at D differ with respect
4512 to being word-constituent. */
4513#define AT_WORD_BOUNDARY(d) \
4514 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4515 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4516#endif
fa9a63c5
RM
4517
4518/* Free everything we malloc. */
4519#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4520# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4521# define FREE_VARIABLES() \
fa9a63c5
RM
4522 do { \
4523 REGEX_FREE_STACK (fail_stack.stack); \
4524 FREE_VAR (regstart); \
4525 FREE_VAR (regend); \
fa9a63c5
RM
4526 FREE_VAR (best_regstart); \
4527 FREE_VAR (best_regend); \
fa9a63c5
RM
4528 } while (0)
4529#else
0b32bf0e 4530# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4531#endif /* not MATCH_MAY_ALLOCATE */
4532
505bde11
SM
4533\f
4534/* Optimization routines. */
4535
4e8a9132
SM
4536/* If the operation is a match against one or more chars,
4537 return a pointer to the next operation, else return NULL. */
01618498 4538static re_char *
4e8a9132 4539skip_one_char (p)
01618498 4540 re_char *p;
4e8a9132
SM
4541{
4542 switch (SWITCH_ENUM_CAST (*p++))
4543 {
4544 case anychar:
4545 break;
177c0ea7 4546
4e8a9132
SM
4547 case exactn:
4548 p += *p + 1;
4549 break;
4550
4551 case charset_not:
4552 case charset:
4553 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4554 {
4555 int mcnt;
4556 p = CHARSET_RANGE_TABLE (p - 1);
4557 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4558 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4559 }
4560 else
4561 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4562 break;
177c0ea7 4563
4e8a9132
SM
4564 case syntaxspec:
4565 case notsyntaxspec:
1fb352e0 4566#ifdef emacs
4e8a9132
SM
4567 case categoryspec:
4568 case notcategoryspec:
4569#endif /* emacs */
4570 p++;
4571 break;
4572
4573 default:
4574 p = NULL;
4575 }
4576 return p;
4577}
4578
4579
505bde11 4580/* Jump over non-matching operations. */
275a3409 4581static re_char *
4e8a9132 4582skip_noops (p, pend)
275a3409 4583 re_char *p, *pend;
505bde11
SM
4584{
4585 int mcnt;
4586 while (p < pend)
4587 {
4588 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4589 {
4590 case start_memory:
505bde11
SM
4591 case stop_memory:
4592 p += 2; break;
4593 case no_op:
4594 p += 1; break;
4595 case jump:
4596 p += 1;
4597 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4598 p += mcnt;
4599 break;
4600 default:
4601 return p;
4602 }
4603 }
4604 assert (p == pend);
4605 return p;
4606}
4607
4608/* Non-zero if "p1 matches something" implies "p2 fails". */
4609static int
4610mutually_exclusive_p (bufp, p1, p2)
4611 struct re_pattern_buffer *bufp;
275a3409 4612 re_char *p1, *p2;
505bde11 4613{
4e8a9132 4614 re_opcode_t op2;
2d1675e4 4615 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4616 unsigned char *pend = bufp->buffer + bufp->used;
4617
4e8a9132 4618 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4619 && p2 >= bufp->buffer && p2 <= pend);
4620
4621 /* Skip over open/close-group commands.
4622 If what follows this loop is a ...+ construct,
4623 look at what begins its body, since we will have to
4624 match at least one of that. */
4e8a9132
SM
4625 p2 = skip_noops (p2, pend);
4626 /* The same skip can be done for p1, except that this function
4627 is only used in the case where p1 is a simple match operator. */
4628 /* p1 = skip_noops (p1, pend); */
4629
4630 assert (p1 >= bufp->buffer && p1 < pend
4631 && p2 >= bufp->buffer && p2 <= pend);
4632
4633 op2 = p2 == pend ? succeed : *p2;
4634
4635 switch (SWITCH_ENUM_CAST (op2))
505bde11 4636 {
4e8a9132
SM
4637 case succeed:
4638 case endbuf:
4639 /* If we're at the end of the pattern, we can change. */
4640 if (skip_one_char (p1))
505bde11 4641 {
505bde11
SM
4642 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4643 return 1;
505bde11 4644 }
4e8a9132 4645 break;
177c0ea7 4646
4e8a9132 4647 case endline:
4e8a9132
SM
4648 case exactn:
4649 {
01618498 4650 register re_wchar_t c
4e8a9132 4651 = (re_opcode_t) *p2 == endline ? '\n'
411e4203 4652 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
505bde11 4653
4e8a9132
SM
4654 if ((re_opcode_t) *p1 == exactn)
4655 {
4656 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4657 {
4658 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4659 return 1;
4660 }
4661 }
505bde11 4662
4e8a9132
SM
4663 else if ((re_opcode_t) *p1 == charset
4664 || (re_opcode_t) *p1 == charset_not)
4665 {
4666 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4667
4e8a9132
SM
4668 /* Test if C is listed in charset (or charset_not)
4669 at `p1'. */
4670 if (SINGLE_BYTE_CHAR_P (c))
4671 {
4672 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4673 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4674 not = !not;
4675 }
4676 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4677 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4678
4e8a9132
SM
4679 /* `not' is equal to 1 if c would match, which means
4680 that we can't change to pop_failure_jump. */
4681 if (!not)
4682 {
4683 DEBUG_PRINT1 (" No match => fast loop.\n");
4684 return 1;
4685 }
4686 }
4687 else if ((re_opcode_t) *p1 == anychar
4688 && c == '\n')
4689 {
4690 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4691 return 1;
4692 }
4693 }
4694 break;
505bde11 4695
4e8a9132 4696 case charset:
4e8a9132
SM
4697 {
4698 if ((re_opcode_t) *p1 == exactn)
4699 /* Reuse the code above. */
4700 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4701
505bde11
SM
4702 /* It is hard to list up all the character in charset
4703 P2 if it includes multibyte character. Give up in
4704 such case. */
4705 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4706 {
4707 /* Now, we are sure that P2 has no range table.
4708 So, for the size of bitmap in P2, `p2[1]' is
4709 enough. But P1 may have range table, so the
4710 size of bitmap table of P1 is extracted by
4711 using macro `CHARSET_BITMAP_SIZE'.
4712
4713 Since we know that all the character listed in
4714 P2 is ASCII, it is enough to test only bitmap
4715 table of P1. */
4716
411e4203 4717 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4718 {
4719 int idx;
4720 /* We win if the charset inside the loop
4721 has no overlap with the one after the loop. */
4722 for (idx = 0;
4723 (idx < (int) p2[1]
4724 && idx < CHARSET_BITMAP_SIZE (p1));
4725 idx++)
4726 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4727 break;
4728
4729 if (idx == p2[1]
4730 || idx == CHARSET_BITMAP_SIZE (p1))
4731 {
4732 DEBUG_PRINT1 (" No match => fast loop.\n");
4733 return 1;
4734 }
4735 }
411e4203 4736 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4737 {
4738 int idx;
4739 /* We win if the charset_not inside the loop lists
4740 every character listed in the charset after. */
4741 for (idx = 0; idx < (int) p2[1]; idx++)
4742 if (! (p2[2 + idx] == 0
4743 || (idx < CHARSET_BITMAP_SIZE (p1)
4744 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4745 break;
4746
4e8a9132
SM
4747 if (idx == p2[1])
4748 {
4749 DEBUG_PRINT1 (" No match => fast loop.\n");
4750 return 1;
4751 }
4752 }
4753 }
4754 }
609b757a 4755 break;
177c0ea7 4756
411e4203
SM
4757 case charset_not:
4758 switch (SWITCH_ENUM_CAST (*p1))
4759 {
4760 case exactn:
4761 case charset:
4762 /* Reuse the code above. */
4763 return mutually_exclusive_p (bufp, p2, p1);
4764 case charset_not:
4765 /* When we have two charset_not, it's very unlikely that
4766 they don't overlap. The union of the two sets of excluded
4767 chars should cover all possible chars, which, as a matter of
4768 fact, is virtually impossible in multibyte buffers. */
36595814 4769 break;
411e4203
SM
4770 }
4771 break;
4772
4e8a9132 4773 case wordend:
669fa600
SM
4774 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4775 case symend:
4e8a9132 4776 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4777 && (p1[1] == Ssymbol || p1[1] == Sword));
4778 case notsyntaxspec:
4779 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4780
4781 case wordbeg:
669fa600
SM
4782 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4783 case symbeg:
4e8a9132 4784 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4785 && (p1[1] == Ssymbol || p1[1] == Sword));
4786 case syntaxspec:
4787 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4788
4789 case wordbound:
4790 return (((re_opcode_t) *p1 == notsyntaxspec
4791 || (re_opcode_t) *p1 == syntaxspec)
4792 && p1[1] == Sword);
4793
1fb352e0 4794#ifdef emacs
4e8a9132
SM
4795 case categoryspec:
4796 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4797 case notcategoryspec:
4798 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4799#endif /* emacs */
4800
4801 default:
4802 ;
505bde11
SM
4803 }
4804
4805 /* Safe default. */
4806 return 0;
4807}
4808
fa9a63c5
RM
4809\f
4810/* Matching routines. */
4811
25fe55af 4812#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4813/* re_match is like re_match_2 except it takes only a single string. */
4814
4815int
4816re_match (bufp, string, size, pos, regs)
4817 struct re_pattern_buffer *bufp;
4818 const char *string;
4819 int size, pos;
4820 struct re_registers *regs;
4821{
4bb91c68 4822 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4823 pos, regs, size);
0b32bf0e 4824# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4825 alloca (0);
0b32bf0e 4826# endif
fa9a63c5
RM
4827 return result;
4828}
c0f9ea08 4829WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4830#endif /* not emacs */
4831
b18215fc
RS
4832#ifdef emacs
4833/* In Emacs, this is the string or buffer in which we
25fe55af 4834 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4835Lisp_Object re_match_object;
4836#endif
fa9a63c5
RM
4837
4838/* re_match_2 matches the compiled pattern in BUFP against the
4839 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4840 and SIZE2, respectively). We start matching at POS, and stop
4841 matching at STOP.
5e69f11e 4842
fa9a63c5 4843 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4844 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4845 documentation for exactly how many groups we fill.
4846
4847 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4848 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4849 matched substring. */
4850
4851int
4852re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4853 struct re_pattern_buffer *bufp;
4854 const char *string1, *string2;
4855 int size1, size2;
4856 int pos;
4857 struct re_registers *regs;
4858 int stop;
4859{
b18215fc 4860 int result;
25fe55af 4861
b18215fc 4862#ifdef emacs
cc9b4df2
KH
4863 int charpos;
4864 gl_state.object = re_match_object;
99633e97 4865 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4866 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4867#endif
4868
4bb91c68
SM
4869 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4870 (re_char*) string2, size2,
cc9b4df2 4871 pos, regs, stop);
0b32bf0e 4872#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4873 alloca (0);
a60198e5 4874#endif
fa9a63c5
RM
4875 return result;
4876}
c0f9ea08 4877WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5
RM
4878
4879/* This is a separate function so that we can force an alloca cleanup
25fe55af 4880 afterwards. */
fa9a63c5
RM
4881static int
4882re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4883 struct re_pattern_buffer *bufp;
66f0296e 4884 re_char *string1, *string2;
fa9a63c5
RM
4885 int size1, size2;
4886 int pos;
4887 struct re_registers *regs;
4888 int stop;
4889{
4890 /* General temporaries. */
4891 int mcnt;
01618498 4892 size_t reg;
66f0296e 4893 boolean not;
fa9a63c5
RM
4894
4895 /* Just past the end of the corresponding string. */
66f0296e 4896 re_char *end1, *end2;
fa9a63c5
RM
4897
4898 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4899 each to consider matching. */
66f0296e 4900 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4901
4902 /* Where we are in the data, and the end of the current string. */
66f0296e 4903 re_char *d, *dend;
5e69f11e 4904
99633e97
SM
4905 /* Used sometimes to remember where we were before starting matching
4906 an operator so that we can go back in case of failure. This "atomic"
4907 behavior of matching opcodes is indispensable to the correctness
4908 of the on_failure_keep_string_jump optimization. */
4909 re_char *dfail;
4910
fa9a63c5 4911 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4912 re_char *p = bufp->buffer;
4913 re_char *pend = p + bufp->used;
fa9a63c5 4914
25fe55af 4915 /* We use this to map every character in the string. */
6676cb1c 4916 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4917
25fe55af 4918 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4919 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4920
fa9a63c5
RM
4921 /* Failure point stack. Each place that can handle a failure further
4922 down the line pushes a failure point on this stack. It consists of
505bde11 4923 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4924 the subexpressions we're currently inside, plus the number of such
4925 registers, and, finally, two char *'s. The first char * is where
4926 to resume scanning the pattern; the second one is where to resume
505bde11 4927 scanning the strings. */
25fe55af 4928#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4929 fail_stack_type fail_stack;
4930#endif
4931#ifdef DEBUG
fa9a63c5
RM
4932 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4933#endif
4934
0b32bf0e 4935#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4936 /* This holds the pointer to the failure stack, when
4937 it is allocated relocatably. */
4938 fail_stack_elt_t *failure_stack_ptr;
99633e97 4939#endif
fa9a63c5
RM
4940
4941 /* We fill all the registers internally, independent of what we
25fe55af 4942 return, for use in backreferences. The number here includes
fa9a63c5 4943 an element for register zero. */
4bb91c68 4944 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4945
fa9a63c5
RM
4946 /* Information on the contents of registers. These are pointers into
4947 the input strings; they record just what was matched (on this
4948 attempt) by a subexpression part of the pattern, that is, the
4949 regnum-th regstart pointer points to where in the pattern we began
4950 matching and the regnum-th regend points to right after where we
4951 stopped matching the regnum-th subexpression. (The zeroth register
4952 keeps track of what the whole pattern matches.) */
4953#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4954 re_char **regstart, **regend;
fa9a63c5
RM
4955#endif
4956
fa9a63c5 4957 /* The following record the register info as found in the above
5e69f11e 4958 variables when we find a match better than any we've seen before.
fa9a63c5
RM
4959 This happens as we backtrack through the failure points, which in
4960 turn happens only if we have not yet matched the entire string. */
4961 unsigned best_regs_set = false;
4962#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4963 re_char **best_regstart, **best_regend;
fa9a63c5 4964#endif
5e69f11e 4965
fa9a63c5
RM
4966 /* Logically, this is `best_regend[0]'. But we don't want to have to
4967 allocate space for that if we're not allocating space for anything
25fe55af 4968 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
4969 any of the other register vectors, and it seems rather a kludge to
4970 treat `best_regend' differently than the rest. So we keep track of
4971 the end of the best match so far in a separate variable. We
4972 initialize this to NULL so that when we backtrack the first time
4973 and need to test it, it's not garbage. */
66f0296e 4974 re_char *match_end = NULL;
fa9a63c5 4975
fa9a63c5
RM
4976#ifdef DEBUG
4977 /* Counts the total number of registers pushed. */
5e69f11e 4978 unsigned num_regs_pushed = 0;
fa9a63c5
RM
4979#endif
4980
4981 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 4982
fa9a63c5 4983 INIT_FAIL_STACK ();
5e69f11e 4984
fa9a63c5
RM
4985#ifdef MATCH_MAY_ALLOCATE
4986 /* Do not bother to initialize all the register variables if there are
4987 no groups in the pattern, as it takes a fair amount of time. If
4988 there are groups, we include space for register 0 (the whole
4989 pattern), even though we never use it, since it simplifies the
4990 array indexing. We should fix this. */
4991 if (bufp->re_nsub)
4992 {
66f0296e
SM
4993 regstart = REGEX_TALLOC (num_regs, re_char *);
4994 regend = REGEX_TALLOC (num_regs, re_char *);
4995 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4996 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 4997
505bde11 4998 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
4999 {
5000 FREE_VARIABLES ();
5001 return -2;
5002 }
fa9a63c5
RM
5003 }
5004 else
5005 {
5006 /* We must initialize all our variables to NULL, so that
25fe55af 5007 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5008 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5009 }
5010#endif /* MATCH_MAY_ALLOCATE */
5011
5012 /* The starting position is bogus. */
5013 if (pos < 0 || pos > size1 + size2)
5014 {
5015 FREE_VARIABLES ();
5016 return -1;
5017 }
5e69f11e 5018
fa9a63c5
RM
5019 /* Initialize subexpression text positions to -1 to mark ones that no
5020 start_memory/stop_memory has been seen for. Also initialize the
5021 register information struct. */
01618498
SM
5022 for (reg = 1; reg < num_regs; reg++)
5023 regstart[reg] = regend[reg] = NULL;
99633e97 5024
fa9a63c5 5025 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 5026 `string1' is null. */
fa9a63c5
RM
5027 if (size2 == 0 && string1 != NULL)
5028 {
5029 string2 = string1;
5030 size2 = size1;
5031 string1 = 0;
5032 size1 = 0;
5033 }
5034 end1 = string1 + size1;
5035 end2 = string2 + size2;
5036
5e69f11e 5037 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5038 `dend' is the end of the input string that `d' points within. `d'
5039 is advanced into the following input string whenever necessary, but
5040 this happens before fetching; therefore, at the beginning of the
5041 loop, `d' can be pointing at the end of a string, but it cannot
5042 equal `string2'. */
419d1c74 5043 if (pos >= size1)
fa9a63c5 5044 {
419d1c74
SM
5045 /* Only match within string2. */
5046 d = string2 + pos - size1;
5047 dend = end_match_2 = string2 + stop - size1;
5048 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5049 }
5050 else
5051 {
f1ad044f 5052 if (stop < size1)
419d1c74
SM
5053 {
5054 /* Only match within string1. */
5055 end_match_1 = string1 + stop;
5056 /* BEWARE!
5057 When we reach end_match_1, PREFETCH normally switches to string2.
5058 But in the present case, this means that just doing a PREFETCH
5059 makes us jump from `stop' to `gap' within the string.
5060 What we really want here is for the search to stop as
5061 soon as we hit end_match_1. That's why we set end_match_2
5062 to end_match_1 (since PREFETCH fails as soon as we hit
5063 end_match_2). */
5064 end_match_2 = end_match_1;
5065 }
5066 else
f1ad044f
SM
5067 { /* It's important to use this code when stop == size so that
5068 moving `d' from end1 to string2 will not prevent the d == dend
5069 check from catching the end of string. */
419d1c74
SM
5070 end_match_1 = end1;
5071 end_match_2 = string2 + stop - size1;
5072 }
5073 d = string1 + pos;
5074 dend = end_match_1;
fa9a63c5
RM
5075 }
5076
5077 DEBUG_PRINT1 ("The compiled pattern is: ");
5078 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5079 DEBUG_PRINT1 ("The string to match is: `");
5080 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5081 DEBUG_PRINT1 ("'\n");
5e69f11e 5082
25fe55af 5083 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5084 function if the match is complete, or it drops through if the match
5085 fails at this starting point in the input data. */
5086 for (;;)
5087 {
505bde11 5088 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5089
5090 if (p == pend)
5091 { /* End of pattern means we might have succeeded. */
25fe55af 5092 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5093
fa9a63c5 5094 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5095 longest match, try backtracking. */
5096 if (d != end_match_2)
fa9a63c5
RM
5097 {
5098 /* 1 if this match ends in the same string (string1 or string2)
5099 as the best previous match. */
5e69f11e 5100 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5101 == FIRST_STRING_P (d));
fa9a63c5
RM
5102 /* 1 if this match is the best seen so far. */
5103 boolean best_match_p;
5104
5105 /* AIX compiler got confused when this was combined
25fe55af 5106 with the previous declaration. */
fa9a63c5
RM
5107 if (same_str_p)
5108 best_match_p = d > match_end;
5109 else
99633e97 5110 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5111
25fe55af
RS
5112 DEBUG_PRINT1 ("backtracking.\n");
5113
5114 if (!FAIL_STACK_EMPTY ())
5115 { /* More failure points to try. */
5116
5117 /* If exceeds best match so far, save it. */
5118 if (!best_regs_set || best_match_p)
5119 {
5120 best_regs_set = true;
5121 match_end = d;
5122
5123 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5124
01618498 5125 for (reg = 1; reg < num_regs; reg++)
25fe55af 5126 {
01618498
SM
5127 best_regstart[reg] = regstart[reg];
5128 best_regend[reg] = regend[reg];
25fe55af
RS
5129 }
5130 }
5131 goto fail;
5132 }
5133
5134 /* If no failure points, don't restore garbage. And if
5135 last match is real best match, don't restore second
5136 best one. */
5137 else if (best_regs_set && !best_match_p)
5138 {
5139 restore_best_regs:
5140 /* Restore best match. It may happen that `dend ==
5141 end_match_1' while the restored d is in string2.
5142 For example, the pattern `x.*y.*z' against the
5143 strings `x-' and `y-z-', if the two strings are
5144 not consecutive in memory. */
5145 DEBUG_PRINT1 ("Restoring best registers.\n");
5146
5147 d = match_end;
5148 dend = ((d >= string1 && d <= end1)
5149 ? end_match_1 : end_match_2);
fa9a63c5 5150
01618498 5151 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5152 {
01618498
SM
5153 regstart[reg] = best_regstart[reg];
5154 regend[reg] = best_regend[reg];
fa9a63c5 5155 }
25fe55af
RS
5156 }
5157 } /* d != end_match_2 */
fa9a63c5
RM
5158
5159 succeed_label:
25fe55af 5160 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5161
25fe55af
RS
5162 /* If caller wants register contents data back, do it. */
5163 if (regs && !bufp->no_sub)
fa9a63c5 5164 {
25fe55af
RS
5165 /* Have the register data arrays been allocated? */
5166 if (bufp->regs_allocated == REGS_UNALLOCATED)
5167 { /* No. So allocate them with malloc. We need one
5168 extra element beyond `num_regs' for the `-1' marker
5169 GNU code uses. */
5170 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5171 regs->start = TALLOC (regs->num_regs, regoff_t);
5172 regs->end = TALLOC (regs->num_regs, regoff_t);
5173 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5174 {
5175 FREE_VARIABLES ();
5176 return -2;
5177 }
25fe55af
RS
5178 bufp->regs_allocated = REGS_REALLOCATE;
5179 }
5180 else if (bufp->regs_allocated == REGS_REALLOCATE)
5181 { /* Yes. If we need more elements than were already
5182 allocated, reallocate them. If we need fewer, just
5183 leave it alone. */
5184 if (regs->num_regs < num_regs + 1)
5185 {
5186 regs->num_regs = num_regs + 1;
5187 RETALLOC (regs->start, regs->num_regs, regoff_t);
5188 RETALLOC (regs->end, regs->num_regs, regoff_t);
5189 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5190 {
5191 FREE_VARIABLES ();
5192 return -2;
5193 }
25fe55af
RS
5194 }
5195 }
5196 else
fa9a63c5
RM
5197 {
5198 /* These braces fend off a "empty body in an else-statement"
25fe55af 5199 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5200 assert (bufp->regs_allocated == REGS_FIXED);
5201 }
5202
25fe55af
RS
5203 /* Convert the pointer data in `regstart' and `regend' to
5204 indices. Register zero has to be set differently,
5205 since we haven't kept track of any info for it. */
5206 if (regs->num_regs > 0)
5207 {
5208 regs->start[0] = pos;
99633e97 5209 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5210 }
5e69f11e 5211
25fe55af
RS
5212 /* Go through the first `min (num_regs, regs->num_regs)'
5213 registers, since that is all we initialized. */
01618498 5214 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5215 {
01618498
SM
5216 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5217 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5218 else
5219 {
01618498
SM
5220 regs->start[reg]
5221 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5222 regs->end[reg]
5223 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5224 }
fa9a63c5 5225 }
5e69f11e 5226
25fe55af
RS
5227 /* If the regs structure we return has more elements than
5228 were in the pattern, set the extra elements to -1. If
5229 we (re)allocated the registers, this is the case,
5230 because we always allocate enough to have at least one
5231 -1 at the end. */
01618498
SM
5232 for (reg = num_regs; reg < regs->num_regs; reg++)
5233 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5234 } /* regs && !bufp->no_sub */
5235
25fe55af
RS
5236 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5237 nfailure_points_pushed, nfailure_points_popped,
5238 nfailure_points_pushed - nfailure_points_popped);
5239 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5240
99633e97 5241 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5242
25fe55af 5243 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5244
25fe55af
RS
5245 FREE_VARIABLES ();
5246 return mcnt;
5247 }
fa9a63c5 5248
25fe55af 5249 /* Otherwise match next pattern command. */
fa9a63c5
RM
5250 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5251 {
25fe55af
RS
5252 /* Ignore these. Used to ignore the n of succeed_n's which
5253 currently have n == 0. */
5254 case no_op:
5255 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5256 break;
fa9a63c5
RM
5257
5258 case succeed:
25fe55af 5259 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5260 goto succeed_label;
5261
25fe55af
RS
5262 /* Match the next n pattern characters exactly. The following
5263 byte in the pattern defines n, and the n bytes after that
5264 are the characters to match. */
fa9a63c5
RM
5265 case exactn:
5266 mcnt = *p++;
25fe55af 5267 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5268
99633e97
SM
5269 /* Remember the start point to rollback upon failure. */
5270 dfail = d;
5271
25fe55af
RS
5272 /* This is written out as an if-else so we don't waste time
5273 testing `translate' inside the loop. */
28703c16 5274 if (RE_TRANSLATE_P (translate))
fa9a63c5 5275 {
e934739e
RS
5276 if (multibyte)
5277 do
5278 {
5279 int pat_charlen, buf_charlen;
e71b1971 5280 unsigned int pat_ch, buf_ch;
e934739e
RS
5281
5282 PREFETCH ();
5283 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5284 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5285
5286 if (RE_TRANSLATE (translate, buf_ch)
5287 != pat_ch)
99633e97
SM
5288 {
5289 d = dfail;
5290 goto fail;
5291 }
e934739e
RS
5292
5293 p += pat_charlen;
5294 d += buf_charlen;
5295 mcnt -= pat_charlen;
5296 }
5297 while (mcnt > 0);
5298 else
e934739e
RS
5299 do
5300 {
cf1982d9
EZ
5301 /* Avoid compiler whining about comparison being
5302 always true. */
5303 int di;
5304
e934739e 5305 PREFETCH ();
cf1982d9
EZ
5306 di = *d;
5307 if (RE_TRANSLATE (translate, di) != *p++)
99633e97
SM
5308 {
5309 d = dfail;
5310 goto fail;
5311 }
33c46939 5312 d++;
e934739e
RS
5313 }
5314 while (--mcnt);
fa9a63c5
RM
5315 }
5316 else
5317 {
5318 do
5319 {
5320 PREFETCH ();
99633e97
SM
5321 if (*d++ != *p++)
5322 {
5323 d = dfail;
5324 goto fail;
5325 }
fa9a63c5
RM
5326 }
5327 while (--mcnt);
5328 }
25fe55af 5329 break;
fa9a63c5
RM
5330
5331
25fe55af 5332 /* Match any character except possibly a newline or a null. */
fa9a63c5 5333 case anychar:
e934739e
RS
5334 {
5335 int buf_charlen;
01618498 5336 re_wchar_t buf_ch;
fa9a63c5 5337
e934739e 5338 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5339
e934739e 5340 PREFETCH ();
2d1675e4 5341 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
5342 buf_ch = TRANSLATE (buf_ch);
5343
5344 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5345 && buf_ch == '\n')
5346 || ((bufp->syntax & RE_DOT_NOT_NULL)
5347 && buf_ch == '\000'))
5348 goto fail;
5349
e934739e
RS
5350 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5351 d += buf_charlen;
5352 }
fa9a63c5
RM
5353 break;
5354
5355
5356 case charset:
5357 case charset_not:
5358 {
b18215fc 5359 register unsigned int c;
fa9a63c5 5360 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5361 int len;
5362
5363 /* Start of actual range_table, or end of bitmap if there is no
5364 range table. */
01618498 5365 re_char *range_table;
b18215fc 5366
96cc36cc 5367 /* Nonzero if there is a range table. */
b18215fc
RS
5368 int range_table_exists;
5369
96cc36cc
RS
5370 /* Number of ranges of range table. This is not included
5371 in the initial byte-length of the command. */
5372 int count = 0;
fa9a63c5 5373
25fe55af 5374 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5375
b18215fc 5376 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5377
b18215fc 5378 if (range_table_exists)
96cc36cc
RS
5379 {
5380 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5381 EXTRACT_NUMBER_AND_INCR (count, range_table);
5382 }
b18215fc 5383
2d1675e4
SM
5384 PREFETCH ();
5385 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5386 c = TRANSLATE (c); /* The character to match. */
b18215fc
RS
5387
5388 if (SINGLE_BYTE_CHAR_P (c))
5389 { /* Lookup bitmap. */
b18215fc
RS
5390 /* Cast to `unsigned' instead of `unsigned char' in
5391 case the bit list is a full 32 bytes long. */
5392 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5393 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5394 not = !not;
b18215fc 5395 }
96cc36cc 5396#ifdef emacs
b18215fc 5397 else if (range_table_exists)
96cc36cc
RS
5398 {
5399 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5400
14473664
SM
5401 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5402 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5403 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5404 | (class_bits & BIT_SPACE && ISSPACE (c))
5405 | (class_bits & BIT_UPPER && ISUPPER (c))
5406 | (class_bits & BIT_WORD && ISWORD (c)))
5407 not = !not;
5408 else
5409 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5410 }
5411#endif /* emacs */
fa9a63c5 5412
96cc36cc
RS
5413 if (range_table_exists)
5414 p = CHARSET_RANGE_TABLE_END (range_table, count);
5415 else
5416 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5417
5418 if (!not) goto fail;
5e69f11e 5419
b18215fc 5420 d += len;
fa9a63c5
RM
5421 break;
5422 }
5423
5424
25fe55af 5425 /* The beginning of a group is represented by start_memory.
505bde11 5426 The argument is the register number. The text
25fe55af
RS
5427 matched within the group is recorded (in the internal
5428 registers data structure) under the register number. */
5429 case start_memory:
505bde11
SM
5430 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5431
5432 /* In case we need to undo this operation (via backtracking). */
5433 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5434
25fe55af 5435 regstart[*p] = d;
4bb91c68 5436 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5437 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5438
25fe55af 5439 /* Move past the register number and inner group count. */
505bde11 5440 p += 1;
25fe55af 5441 break;
fa9a63c5
RM
5442
5443
25fe55af 5444 /* The stop_memory opcode represents the end of a group. Its
505bde11 5445 argument is the same as start_memory's: the register number. */
fa9a63c5 5446 case stop_memory:
505bde11
SM
5447 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5448
5449 assert (!REG_UNSET (regstart[*p]));
5450 /* Strictly speaking, there should be code such as:
177c0ea7 5451
0b32bf0e 5452 assert (REG_UNSET (regend[*p]));
505bde11
SM
5453 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5454
5455 But the only info to be pushed is regend[*p] and it is known to
5456 be UNSET, so there really isn't anything to push.
5457 Not pushing anything, on the other hand deprives us from the
5458 guarantee that regend[*p] is UNSET since undoing this operation
5459 will not reset its value properly. This is not important since
5460 the value will only be read on the next start_memory or at
5461 the very end and both events can only happen if this stop_memory
5462 is *not* undone. */
fa9a63c5 5463
25fe55af 5464 regend[*p] = d;
fa9a63c5
RM
5465 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5466
25fe55af 5467 /* Move past the register number and the inner group count. */
505bde11 5468 p += 1;
25fe55af 5469 break;
fa9a63c5
RM
5470
5471
5472 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5473 followed by the numeric value of <digit> as the register number. */
5474 case duplicate:
fa9a63c5 5475 {
66f0296e 5476 register re_char *d2, *dend2;
25fe55af 5477 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5478 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5479
25fe55af
RS
5480 /* Can't back reference a group which we've never matched. */
5481 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5482 goto fail;
5e69f11e 5483
25fe55af
RS
5484 /* Where in input to try to start matching. */
5485 d2 = regstart[regno];
5e69f11e 5486
99633e97
SM
5487 /* Remember the start point to rollback upon failure. */
5488 dfail = d;
5489
25fe55af
RS
5490 /* Where to stop matching; if both the place to start and
5491 the place to stop matching are in the same string, then
5492 set to the place to stop, otherwise, for now have to use
5493 the end of the first string. */
fa9a63c5 5494
25fe55af 5495 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5496 == FIRST_STRING_P (regend[regno]))
5497 ? regend[regno] : end_match_1);
5498 for (;;)
5499 {
5500 /* If necessary, advance to next segment in register
25fe55af 5501 contents. */
fa9a63c5
RM
5502 while (d2 == dend2)
5503 {
5504 if (dend2 == end_match_2) break;
5505 if (dend2 == regend[regno]) break;
5506
25fe55af
RS
5507 /* End of string1 => advance to string2. */
5508 d2 = string2;
5509 dend2 = regend[regno];
fa9a63c5
RM
5510 }
5511 /* At end of register contents => success */
5512 if (d2 == dend2) break;
5513
5514 /* If necessary, advance to next segment in data. */
5515 PREFETCH ();
5516
5517 /* How many characters left in this segment to match. */
5518 mcnt = dend - d;
5e69f11e 5519
fa9a63c5 5520 /* Want how many consecutive characters we can match in
25fe55af
RS
5521 one shot, so, if necessary, adjust the count. */
5522 if (mcnt > dend2 - d2)
fa9a63c5 5523 mcnt = dend2 - d2;
5e69f11e 5524
fa9a63c5 5525 /* Compare that many; failure if mismatch, else move
25fe55af 5526 past them. */
28703c16 5527 if (RE_TRANSLATE_P (translate)
2d1675e4 5528 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5529 : memcmp (d, d2, mcnt))
99633e97
SM
5530 {
5531 d = dfail;
5532 goto fail;
5533 }
fa9a63c5 5534 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5535 }
5536 }
5537 break;
5538
5539
25fe55af 5540 /* begline matches the empty string at the beginning of the string
c0f9ea08 5541 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5542 case begline:
25fe55af 5543 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5544
25fe55af
RS
5545 if (AT_STRINGS_BEG (d))
5546 {
5547 if (!bufp->not_bol) break;
5548 }
419d1c74 5549 else
25fe55af 5550 {
419d1c74
SM
5551 unsigned char c;
5552 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5553 if (c == '\n')
419d1c74 5554 break;
25fe55af
RS
5555 }
5556 /* In all other cases, we fail. */
5557 goto fail;
fa9a63c5
RM
5558
5559
25fe55af 5560 /* endline is the dual of begline. */
fa9a63c5 5561 case endline:
25fe55af 5562 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5563
25fe55af
RS
5564 if (AT_STRINGS_END (d))
5565 {
5566 if (!bufp->not_eol) break;
5567 }
f1ad044f 5568 else
25fe55af 5569 {
f1ad044f 5570 PREFETCH_NOLIMIT ();
c0f9ea08 5571 if (*d == '\n')
f1ad044f 5572 break;
25fe55af
RS
5573 }
5574 goto fail;
fa9a63c5
RM
5575
5576
5577 /* Match at the very beginning of the data. */
25fe55af
RS
5578 case begbuf:
5579 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5580 if (AT_STRINGS_BEG (d))
5581 break;
5582 goto fail;
fa9a63c5
RM
5583
5584
5585 /* Match at the very end of the data. */
25fe55af
RS
5586 case endbuf:
5587 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5588 if (AT_STRINGS_END (d))
5589 break;
25fe55af 5590 goto fail;
5e69f11e 5591
5e69f11e 5592
25fe55af
RS
5593 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5594 pushes NULL as the value for the string on the stack. Then
505bde11 5595 `POP_FAILURE_POINT' will keep the current value for the
25fe55af
RS
5596 string, instead of restoring it. To see why, consider
5597 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5598 then the . fails against the \n. But the next thing we want
5599 to do is match the \n against the \n; if we restored the
5600 string value, we would be back at the foo.
5601
5602 Because this is used only in specific cases, we don't need to
5603 check all the things that `on_failure_jump' does, to make
5604 sure the right things get saved on the stack. Hence we don't
5605 share its code. The only reason to push anything on the
5606 stack at all is that otherwise we would have to change
5607 `anychar's code to do something besides goto fail in this
5608 case; that seems worse than this. */
5609 case on_failure_keep_string_jump:
505bde11
SM
5610 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5611 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5612 mcnt, p + mcnt);
fa9a63c5 5613
505bde11
SM
5614 PUSH_FAILURE_POINT (p - 3, NULL);
5615 break;
5616
0683b6fa
SM
5617 /* A nasty loop is introduced by the non-greedy *? and +?.
5618 With such loops, the stack only ever contains one failure point
5619 at a time, so that a plain on_failure_jump_loop kind of
5620 cycle detection cannot work. Worse yet, such a detection
5621 can not only fail to detect a cycle, but it can also wrongly
5622 detect a cycle (between different instantiations of the same
6df42991 5623 loop).
0683b6fa
SM
5624 So the method used for those nasty loops is a little different:
5625 We use a special cycle-detection-stack-frame which is pushed
5626 when the on_failure_jump_nastyloop failure-point is *popped*.
5627 This special frame thus marks the beginning of one iteration
5628 through the loop and we can hence easily check right here
5629 whether something matched between the beginning and the end of
5630 the loop. */
5631 case on_failure_jump_nastyloop:
5632 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5633 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5634 mcnt, p + mcnt);
5635
5636 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5637 {
5638 int cycle = 0;
5639 CHECK_INFINITE_LOOP (p - 4, d);
5640 if (!cycle)
5641 /* If there's a cycle, just continue without pushing
5642 this failure point. The failure point is the "try again"
5643 option, which shouldn't be tried.
5644 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5645 PUSH_FAILURE_POINT (p - 3, d);
5646 }
0683b6fa
SM
5647 break;
5648
4e8a9132
SM
5649 /* Simple loop detecting on_failure_jump: just check on the
5650 failure stack if the same spot was already hit earlier. */
505bde11
SM
5651 case on_failure_jump_loop:
5652 on_failure:
5653 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5654 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5655 mcnt, p + mcnt);
6df42991
SM
5656 {
5657 int cycle = 0;
5658 CHECK_INFINITE_LOOP (p - 3, d);
5659 if (cycle)
5660 /* If there's a cycle, get out of the loop, as if the matching
5661 had failed. We used to just `goto fail' here, but that was
5662 aborting the search a bit too early: we want to keep the
5663 empty-loop-match and keep matching after the loop.
5664 We want (x?)*y\1z to match both xxyz and xxyxz. */
5665 p += mcnt;
5666 else
5667 PUSH_FAILURE_POINT (p - 3, d);
5668 }
25fe55af 5669 break;
fa9a63c5
RM
5670
5671
5672 /* Uses of on_failure_jump:
5e69f11e 5673
25fe55af
RS
5674 Each alternative starts with an on_failure_jump that points
5675 to the beginning of the next alternative. Each alternative
5676 except the last ends with a jump that in effect jumps past
5677 the rest of the alternatives. (They really jump to the
5678 ending jump of the following alternative, because tensioning
5679 these jumps is a hassle.)
fa9a63c5 5680
25fe55af
RS
5681 Repeats start with an on_failure_jump that points past both
5682 the repetition text and either the following jump or
5683 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5684 case on_failure_jump:
25fe55af 5685 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5686 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5687 mcnt, p + mcnt);
25fe55af 5688
505bde11 5689 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5690 break;
5691
4e8a9132 5692 /* This operation is used for greedy *.
505bde11
SM
5693 Compare the beginning of the repeat with what in the
5694 pattern follows its end. If we can establish that there
5695 is nothing that they would both match, i.e., that we
5696 would have to backtrack because of (as in, e.g., `a*a')
5697 then we can use a non-backtracking loop based on
4e8a9132 5698 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5699 case on_failure_jump_smart:
25fe55af 5700 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5701 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5702 mcnt, p + mcnt);
25fe55af 5703 {
01618498 5704 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5705 /* Here, we discard `const', making re_match non-reentrant. */
5706 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5707 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5708
505bde11
SM
5709 p -= 3; /* Reset so that we will re-execute the
5710 instruction once it's been changed. */
fa9a63c5 5711
4e8a9132
SM
5712 EXTRACT_NUMBER (mcnt, p2 - 2);
5713
5714 /* Ensure this is a indeed the trivial kind of loop
5715 we are expecting. */
5716 assert (skip_one_char (p1) == p2 - 3);
5717 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5718 DEBUG_STATEMENT (debug += 2);
505bde11 5719 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5720 {
505bde11 5721 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5722 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5723 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5724 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5725 }
505bde11 5726 else
fa9a63c5 5727 {
505bde11
SM
5728 /* Default to a safe `on_failure_jump' loop. */
5729 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5730 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5731 }
99633e97 5732 DEBUG_STATEMENT (debug -= 2);
25fe55af 5733 }
505bde11 5734 break;
25fe55af
RS
5735
5736 /* Unconditionally jump (without popping any failure points). */
5737 case jump:
fa9a63c5 5738 unconditional_jump:
5b370c2b 5739 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5740 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5741 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5742 p += mcnt; /* Do the jump. */
505bde11 5743 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5744 break;
5745
5746
25fe55af
RS
5747 /* Have to succeed matching what follows at least n times.
5748 After that, handle like `on_failure_jump'. */
5749 case succeed_n:
01618498 5750 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5751 EXTRACT_NUMBER (mcnt, p + 2);
5752 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5753
dc1e502d
SM
5754 /* Originally, mcnt is how many times we HAVE to succeed. */
5755 if (mcnt != 0)
25fe55af 5756 {
6dcf2d0e
SM
5757 /* Here, we discard `const', making re_match non-reentrant. */
5758 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5759 mcnt--;
01618498
SM
5760 p += 4;
5761 PUSH_NUMBER (p2, mcnt);
25fe55af 5762 }
dc1e502d
SM
5763 else
5764 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5765 goto on_failure;
25fe55af
RS
5766 break;
5767
5768 case jump_n:
01618498 5769 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5770 EXTRACT_NUMBER (mcnt, p + 2);
5771 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5772
5773 /* Originally, this is how many times we CAN jump. */
dc1e502d 5774 if (mcnt != 0)
25fe55af 5775 {
6dcf2d0e
SM
5776 /* Here, we discard `const', making re_match non-reentrant. */
5777 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5778 mcnt--;
01618498 5779 PUSH_NUMBER (p2, mcnt);
dc1e502d 5780 goto unconditional_jump;
25fe55af
RS
5781 }
5782 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5783 else
5784 p += 4;
25fe55af 5785 break;
5e69f11e 5786
fa9a63c5
RM
5787 case set_number_at:
5788 {
01618498 5789 unsigned char *p2; /* Location of the counter. */
25fe55af 5790 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5791
25fe55af 5792 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5793 /* Here, we discard `const', making re_match non-reentrant. */
5794 p2 = (unsigned char*) p + mcnt;
01618498 5795 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5796 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5797 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5798 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5799 break;
5800 }
9121ca40
KH
5801
5802 case wordbound:
66f0296e
SM
5803 case notwordbound:
5804 not = (re_opcode_t) *(p - 1) == notwordbound;
5805 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5806
99633e97 5807 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5808
b18215fc 5809 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5810 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5811 not = !not;
b18215fc
RS
5812 else
5813 {
5814 /* C1 is the character before D, S1 is the syntax of C1, C2
5815 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5816 re_wchar_t c1, c2;
5817 int s1, s2;
b18215fc 5818#ifdef emacs
2d1675e4
SM
5819 int offset = PTR_TO_OFFSET (d - 1);
5820 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5821 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5822#endif
66f0296e 5823 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5824 s1 = SYNTAX (c1);
5825#ifdef emacs
5d967c7a 5826 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5827#endif
f1ad044f 5828 PREFETCH_NOLIMIT ();
2d1675e4 5829 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc
RS
5830 s2 = SYNTAX (c2);
5831
5832 if (/* Case 2: Only one of S1 and S2 is Sword. */
5833 ((s1 == Sword) != (s2 == Sword))
5834 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5835 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5836 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5837 not = !not;
5838 }
5839 if (not)
9121ca40 5840 break;
b18215fc 5841 else
9121ca40 5842 goto fail;
fa9a63c5
RM
5843
5844 case wordbeg:
25fe55af 5845 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5846
b18215fc
RS
5847 /* We FAIL in one of the following cases: */
5848
25fe55af 5849 /* Case 1: D is at the end of string. */
b18215fc 5850 if (AT_STRINGS_END (d))
99633e97 5851 goto fail;
b18215fc
RS
5852 else
5853 {
5854 /* C1 is the character before D, S1 is the syntax of C1, C2
5855 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5856 re_wchar_t c1, c2;
5857 int s1, s2;
fa9a63c5 5858#ifdef emacs
2d1675e4
SM
5859 int offset = PTR_TO_OFFSET (d);
5860 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5861 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5862#endif
99633e97 5863 PREFETCH ();
2d1675e4 5864 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc 5865 s2 = SYNTAX (c2);
177c0ea7 5866
b18215fc
RS
5867 /* Case 2: S2 is not Sword. */
5868 if (s2 != Sword)
5869 goto fail;
5870
5871 /* Case 3: D is not at the beginning of string ... */
5872 if (!AT_STRINGS_BEG (d))
5873 {
5874 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5875#ifdef emacs
5d967c7a 5876 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5877#endif
b18215fc
RS
5878 s1 = SYNTAX (c1);
5879
5880 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5881 returns 0. */
b18215fc
RS
5882 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5883 goto fail;
5884 }
5885 }
e318085a
RS
5886 break;
5887
b18215fc 5888 case wordend:
25fe55af 5889 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5890
5891 /* We FAIL in one of the following cases: */
5892
5893 /* Case 1: D is at the beginning of string. */
5894 if (AT_STRINGS_BEG (d))
e318085a 5895 goto fail;
b18215fc
RS
5896 else
5897 {
5898 /* C1 is the character before D, S1 is the syntax of C1, C2
5899 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5900 re_wchar_t c1, c2;
5901 int s1, s2;
5d967c7a 5902#ifdef emacs
2d1675e4
SM
5903 int offset = PTR_TO_OFFSET (d) - 1;
5904 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5905 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5906#endif
99633e97 5907 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5908 s1 = SYNTAX (c1);
5909
5910 /* Case 2: S1 is not Sword. */
5911 if (s1 != Sword)
5912 goto fail;
5913
5914 /* Case 3: D is not at the end of string ... */
5915 if (!AT_STRINGS_END (d))
5916 {
f1ad044f 5917 PREFETCH_NOLIMIT ();
2d1675e4 5918 c2 = RE_STRING_CHAR (d, dend - d);
5d967c7a 5919#ifdef emacs
134579f2 5920 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5d967c7a 5921#endif
b18215fc
RS
5922 s2 = SYNTAX (c2);
5923
5924 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5925 returns 0. */
b18215fc 5926 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5927 goto fail;
b18215fc
RS
5928 }
5929 }
e318085a
RS
5930 break;
5931
669fa600
SM
5932 case symbeg:
5933 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
5934
5935 /* We FAIL in one of the following cases: */
5936
5937 /* Case 1: D is at the end of string. */
5938 if (AT_STRINGS_END (d))
5939 goto fail;
5940 else
5941 {
5942 /* C1 is the character before D, S1 is the syntax of C1, C2
5943 is the character at D, and S2 is the syntax of C2. */
5944 re_wchar_t c1, c2;
5945 int s1, s2;
5946#ifdef emacs
5947 int offset = PTR_TO_OFFSET (d);
5948 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5949 UPDATE_SYNTAX_TABLE (charpos);
5950#endif
5951 PREFETCH ();
5952 c2 = RE_STRING_CHAR (d, dend - d);
5953 s2 = SYNTAX (c2);
5954
5955 /* Case 2: S2 is neither Sword nor Ssymbol. */
5956 if (s2 != Sword && s2 != Ssymbol)
5957 goto fail;
5958
5959 /* Case 3: D is not at the beginning of string ... */
5960 if (!AT_STRINGS_BEG (d))
5961 {
5962 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5963#ifdef emacs
5964 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5965#endif
5966 s1 = SYNTAX (c1);
5967
5968 /* ... and S1 is Sword or Ssymbol. */
5969 if (s1 == Sword || s1 == Ssymbol)
5970 goto fail;
5971 }
5972 }
5973 break;
5974
5975 case symend:
5976 DEBUG_PRINT1 ("EXECUTING symend.\n");
5977
5978 /* We FAIL in one of the following cases: */
5979
5980 /* Case 1: D is at the beginning of string. */
5981 if (AT_STRINGS_BEG (d))
5982 goto fail;
5983 else
5984 {
5985 /* C1 is the character before D, S1 is the syntax of C1, C2
5986 is the character at D, and S2 is the syntax of C2. */
5987 re_wchar_t c1, c2;
5988 int s1, s2;
5989#ifdef emacs
5990 int offset = PTR_TO_OFFSET (d) - 1;
5991 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5992 UPDATE_SYNTAX_TABLE (charpos);
5993#endif
5994 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5995 s1 = SYNTAX (c1);
5996
5997 /* Case 2: S1 is neither Ssymbol nor Sword. */
5998 if (s1 != Sword && s1 != Ssymbol)
5999 goto fail;
6000
6001 /* Case 3: D is not at the end of string ... */
6002 if (!AT_STRINGS_END (d))
6003 {
6004 PREFETCH_NOLIMIT ();
6005 c2 = RE_STRING_CHAR (d, dend - d);
6006#ifdef emacs
134579f2 6007 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6008#endif
6009 s2 = SYNTAX (c2);
6010
6011 /* ... and S2 is Sword or Ssymbol. */
6012 if (s2 == Sword || s2 == Ssymbol)
6013 goto fail;
6014 }
6015 }
6016 break;
6017
fa9a63c5 6018 case syntaxspec:
1fb352e0
SM
6019 case notsyntaxspec:
6020 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6021 mcnt = *p++;
1fb352e0 6022 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6023 PREFETCH ();
b18215fc
RS
6024#ifdef emacs
6025 {
2d1675e4
SM
6026 int offset = PTR_TO_OFFSET (d);
6027 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6028 UPDATE_SYNTAX_TABLE (pos1);
6029 }
25fe55af 6030#endif
b18215fc 6031 {
01618498
SM
6032 int len;
6033 re_wchar_t c;
b18215fc 6034
2d1675e4 6035 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6036
990b2375 6037 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6038 goto fail;
b18215fc
RS
6039 d += len;
6040 }
fa9a63c5
RM
6041 break;
6042
b18215fc 6043#ifdef emacs
1fb352e0
SM
6044 case before_dot:
6045 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6046 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6047 goto fail;
b18215fc
RS
6048 break;
6049
1fb352e0
SM
6050 case at_dot:
6051 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6052 if (PTR_BYTE_POS (d) != PT_BYTE)
6053 goto fail;
6054 break;
b18215fc 6055
1fb352e0
SM
6056 case after_dot:
6057 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6058 if (PTR_BYTE_POS (d) <= PT_BYTE)
6059 goto fail;
e318085a 6060 break;
fa9a63c5 6061
1fb352e0 6062 case categoryspec:
b18215fc 6063 case notcategoryspec:
1fb352e0 6064 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6065 mcnt = *p++;
1fb352e0 6066 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6067 PREFETCH ();
6068 {
01618498
SM
6069 int len;
6070 re_wchar_t c;
6071
2d1675e4 6072 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6073
1fb352e0 6074 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6075 goto fail;
6076 d += len;
6077 }
fa9a63c5 6078 break;
5e69f11e 6079
1fb352e0 6080#endif /* emacs */
5e69f11e 6081
0b32bf0e
SM
6082 default:
6083 abort ();
fa9a63c5 6084 }
b18215fc 6085 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6086
6087
6088 /* We goto here if a matching operation fails. */
6089 fail:
5b370c2b 6090 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6091 if (!FAIL_STACK_EMPTY ())
505bde11 6092 {
01618498 6093 re_char *str, *pat;
505bde11 6094 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6095 DEBUG_PRINT1 ("\nFAIL:\n");
6096 POP_FAILURE_POINT (str, pat);
505bde11
SM
6097 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6098 {
6099 case on_failure_keep_string_jump:
6100 assert (str == NULL);
6101 goto continue_failure_jump;
6102
0683b6fa
SM
6103 case on_failure_jump_nastyloop:
6104 assert ((re_opcode_t)pat[-2] == no_op);
6105 PUSH_FAILURE_POINT (pat - 2, str);
6106 /* Fallthrough */
6107
505bde11
SM
6108 case on_failure_jump_loop:
6109 case on_failure_jump:
6110 case succeed_n:
6111 d = str;
6112 continue_failure_jump:
6113 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6114 p = pat + mcnt;
6115 break;
b18215fc 6116
0683b6fa
SM
6117 case no_op:
6118 /* A special frame used for nastyloops. */
6119 goto fail;
6120
505bde11
SM
6121 default:
6122 abort();
6123 }
fa9a63c5 6124
505bde11 6125 assert (p >= bufp->buffer && p <= pend);
b18215fc 6126
0b32bf0e 6127 if (d >= string1 && d <= end1)
fa9a63c5 6128 dend = end_match_1;
0b32bf0e 6129 }
fa9a63c5 6130 else
0b32bf0e 6131 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6132 } /* for (;;) */
6133
6134 if (best_regs_set)
6135 goto restore_best_regs;
6136
6137 FREE_VARIABLES ();
6138
b18215fc 6139 return -1; /* Failure to match. */
fa9a63c5
RM
6140} /* re_match_2 */
6141\f
6142/* Subroutine definitions for re_match_2. */
6143
fa9a63c5
RM
6144/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6145 bytes; nonzero otherwise. */
5e69f11e 6146
fa9a63c5 6147static int
2d1675e4
SM
6148bcmp_translate (s1, s2, len, translate, multibyte)
6149 re_char *s1, *s2;
fa9a63c5 6150 register int len;
6676cb1c 6151 RE_TRANSLATE_TYPE translate;
2d1675e4 6152 const int multibyte;
fa9a63c5 6153{
2d1675e4
SM
6154 register re_char *p1 = s1, *p2 = s2;
6155 re_char *p1_end = s1 + len;
6156 re_char *p2_end = s2 + len;
e934739e 6157
4bb91c68
SM
6158 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6159 different lengths, but relying on a single `len' would break this. -sm */
6160 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6161 {
e934739e 6162 int p1_charlen, p2_charlen;
01618498 6163 re_wchar_t p1_ch, p2_ch;
e934739e 6164
2d1675e4
SM
6165 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
6166 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
e934739e
RS
6167
6168 if (RE_TRANSLATE (translate, p1_ch)
6169 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6170 return 1;
e934739e
RS
6171
6172 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6173 }
e934739e
RS
6174
6175 if (p1 != p1_end || p2 != p2_end)
6176 return 1;
6177
fa9a63c5
RM
6178 return 0;
6179}
6180\f
6181/* Entry points for GNU code. */
6182
6183/* re_compile_pattern is the GNU regular expression compiler: it
6184 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6185 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6186
fa9a63c5
RM
6187 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6188 are set in BUFP on entry.
5e69f11e 6189
b18215fc 6190 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6191
6192const char *
6193re_compile_pattern (pattern, length, bufp)
6194 const char *pattern;
0b32bf0e 6195 size_t length;
fa9a63c5
RM
6196 struct re_pattern_buffer *bufp;
6197{
6198 reg_errcode_t ret;
5e69f11e 6199
fa9a63c5
RM
6200 /* GNU code is written to assume at least RE_NREGS registers will be set
6201 (and at least one extra will be -1). */
6202 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6203
fa9a63c5
RM
6204 /* And GNU code determines whether or not to get register information
6205 by passing null for the REGS argument to re_match, etc., not by
6206 setting no_sub. */
6207 bufp->no_sub = 0;
5e69f11e 6208
4bb91c68 6209 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6210
6211 if (!ret)
6212 return NULL;
6213 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6214}
c0f9ea08 6215WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6216\f
b18215fc
RS
6217/* Entry points compatible with 4.2 BSD regex library. We don't define
6218 them unless specifically requested. */
fa9a63c5 6219
0b32bf0e 6220#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6221
6222/* BSD has one and only one pattern buffer. */
6223static struct re_pattern_buffer re_comp_buf;
6224
6225char *
0b32bf0e 6226# ifdef _LIBC
48afdd44
RM
6227/* Make these definitions weak in libc, so POSIX programs can redefine
6228 these names if they don't use our functions, and still use
6229 regcomp/regexec below without link errors. */
6230weak_function
0b32bf0e 6231# endif
fa9a63c5
RM
6232re_comp (s)
6233 const char *s;
6234{
6235 reg_errcode_t ret;
5e69f11e 6236
fa9a63c5
RM
6237 if (!s)
6238 {
6239 if (!re_comp_buf.buffer)
0b32bf0e 6240 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6241 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6242 return 0;
6243 }
6244
6245 if (!re_comp_buf.buffer)
6246 {
6247 re_comp_buf.buffer = (unsigned char *) malloc (200);
6248 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6249 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6250 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6251 re_comp_buf.allocated = 200;
6252
6253 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6254 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6255 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6256 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6257 }
6258
6259 /* Since `re_exec' always passes NULL for the `regs' argument, we
6260 don't need to initialize the pattern buffer fields which affect it. */
6261
fa9a63c5 6262 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6263
fa9a63c5
RM
6264 if (!ret)
6265 return NULL;
6266
6267 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6268 return (char *) gettext (re_error_msgid[(int) ret]);
6269}
6270
6271
6272int
0b32bf0e 6273# ifdef _LIBC
48afdd44 6274weak_function
0b32bf0e 6275# endif
fa9a63c5
RM
6276re_exec (s)
6277 const char *s;
6278{
6279 const int len = strlen (s);
6280 return
6281 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6282}
6283#endif /* _REGEX_RE_COMP */
6284\f
6285/* POSIX.2 functions. Don't define these for Emacs. */
6286
6287#ifndef emacs
6288
6289/* regcomp takes a regular expression as a string and compiles it.
6290
b18215fc 6291 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6292 since POSIX says we shouldn't. Thus, we set
6293
6294 `buffer' to the compiled pattern;
6295 `used' to the length of the compiled pattern;
6296 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6297 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6298 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6299 `fastmap' to an allocated space for the fastmap;
6300 `fastmap_accurate' to zero;
fa9a63c5
RM
6301 `re_nsub' to the number of subexpressions in PATTERN.
6302
6303 PATTERN is the address of the pattern string.
6304
6305 CFLAGS is a series of bits which affect compilation.
6306
6307 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6308 use POSIX basic syntax.
6309
6310 If REG_NEWLINE is set, then . and [^...] don't match newline.
6311 Also, regexec will try a match beginning after every newline.
6312
6313 If REG_ICASE is set, then we considers upper- and lowercase
6314 versions of letters to be equivalent when matching.
6315
6316 If REG_NOSUB is set, then when PREG is passed to regexec, that
6317 routine will report only success or failure, and nothing about the
6318 registers.
6319
b18215fc 6320 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6321 the return codes and their meanings.) */
6322
6323int
6324regcomp (preg, pattern, cflags)
ada30c0e
SM
6325 regex_t *__restrict preg;
6326 const char *__restrict pattern;
fa9a63c5
RM
6327 int cflags;
6328{
6329 reg_errcode_t ret;
4bb91c68 6330 reg_syntax_t syntax
fa9a63c5
RM
6331 = (cflags & REG_EXTENDED) ?
6332 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6333
6334 /* regex_compile will allocate the space for the compiled pattern. */
6335 preg->buffer = 0;
6336 preg->allocated = 0;
6337 preg->used = 0;
5e69f11e 6338
c0f9ea08 6339 /* Try to allocate space for the fastmap. */
a073faa6 6340 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6341
fa9a63c5
RM
6342 if (cflags & REG_ICASE)
6343 {
6344 unsigned i;
5e69f11e 6345
6676cb1c 6346 preg->translate
a073faa6 6347 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
8ea094cf 6348 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6349 if (preg->translate == NULL)
0b32bf0e 6350 return (int) REG_ESPACE;
fa9a63c5
RM
6351
6352 /* Map uppercase characters to corresponding lowercase ones. */
6353 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6354 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6355 }
6356 else
6357 preg->translate = NULL;
6358
6359 /* If REG_NEWLINE is set, newlines are treated differently. */
6360 if (cflags & REG_NEWLINE)
6361 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6362 syntax &= ~RE_DOT_NEWLINE;
6363 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6364 }
6365 else
c0f9ea08 6366 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6367
6368 preg->no_sub = !!(cflags & REG_NOSUB);
6369
5e69f11e 6370 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6371 can use strlen here in compiling the pattern. */
4bb91c68 6372 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6373
fa9a63c5
RM
6374 /* POSIX doesn't distinguish between an unmatched open-group and an
6375 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6376 if (ret == REG_ERPAREN)
6377 ret = REG_EPAREN;
6378
6379 if (ret == REG_NOERROR && preg->fastmap)
6380 { /* Compute the fastmap now, since regexec cannot modify the pattern
6381 buffer. */
6382 re_compile_fastmap (preg);
6383 if (preg->can_be_null)
6384 { /* The fastmap can't be used anyway. */
6385 free (preg->fastmap);
6386 preg->fastmap = NULL;
6387 }
6388 }
fa9a63c5
RM
6389 return (int) ret;
6390}
c0f9ea08 6391WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6392
6393
6394/* regexec searches for a given pattern, specified by PREG, in the
6395 string STRING.
5e69f11e 6396
fa9a63c5 6397 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6398 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6399 least NMATCH elements, and we set them to the offsets of the
6400 corresponding matched substrings.
5e69f11e 6401
fa9a63c5
RM
6402 EFLAGS specifies `execution flags' which affect matching: if
6403 REG_NOTBOL is set, then ^ does not match at the beginning of the
6404 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6405
fa9a63c5
RM
6406 We return 0 if we find a match and REG_NOMATCH if not. */
6407
6408int
6409regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6410 const regex_t *__restrict preg;
6411 const char *__restrict string;
5e69f11e 6412 size_t nmatch;
9f2dbe01 6413 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6414 int eflags;
6415{
6416 int ret;
6417 struct re_registers regs;
6418 regex_t private_preg;
6419 int len = strlen (string);
c0f9ea08 6420 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6421
6422 private_preg = *preg;
5e69f11e 6423
fa9a63c5
RM
6424 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6425 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6426
fa9a63c5
RM
6427 /* The user has told us exactly how many registers to return
6428 information about, via `nmatch'. We have to pass that on to the
b18215fc 6429 matching routines. */
fa9a63c5 6430 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6431
fa9a63c5
RM
6432 if (want_reg_info)
6433 {
6434 regs.num_regs = nmatch;
4bb91c68
SM
6435 regs.start = TALLOC (nmatch * 2, regoff_t);
6436 if (regs.start == NULL)
0b32bf0e 6437 return (int) REG_NOMATCH;
4bb91c68 6438 regs.end = regs.start + nmatch;
fa9a63c5
RM
6439 }
6440
c0f9ea08
SM
6441 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6442 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6443 was a little bit longer but still only matching the real part.
6444 This works because the `endline' will check for a '\n' and will find a
6445 '\0', correctly deciding that this is not the end of a line.
6446 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6447 a convenient '\0' there. For all we know, the string could be preceded
6448 by '\n' which would throw things off. */
6449
fa9a63c5
RM
6450 /* Perform the searching operation. */
6451 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6452 /* start: */ 0, /* range: */ len,
6453 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6454
fa9a63c5
RM
6455 /* Copy the register information to the POSIX structure. */
6456 if (want_reg_info)
6457 {
6458 if (ret >= 0)
0b32bf0e
SM
6459 {
6460 unsigned r;
fa9a63c5 6461
0b32bf0e
SM
6462 for (r = 0; r < nmatch; r++)
6463 {
6464 pmatch[r].rm_so = regs.start[r];
6465 pmatch[r].rm_eo = regs.end[r];
6466 }
6467 }
fa9a63c5 6468
b18215fc 6469 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6470 free (regs.start);
fa9a63c5
RM
6471 }
6472
6473 /* We want zero return to mean success, unlike `re_search'. */
6474 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6475}
c0f9ea08 6476WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6477
6478
6479/* Returns a message corresponding to an error code, ERRCODE, returned
6480 from either regcomp or regexec. We don't use PREG here. */
6481
6482size_t
6483regerror (errcode, preg, errbuf, errbuf_size)
6484 int errcode;
6485 const regex_t *preg;
6486 char *errbuf;
6487 size_t errbuf_size;
6488{
6489 const char *msg;
6490 size_t msg_size;
6491
6492 if (errcode < 0
6493 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6494 /* Only error codes returned by the rest of the code should be passed
b18215fc 6495 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6496 code generates an invalid error code, then the program has a bug.
6497 Dump core so we can fix it. */
6498 abort ();
6499
6500 msg = gettext (re_error_msgid[errcode]);
6501
6502 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6503
fa9a63c5
RM
6504 if (errbuf_size != 0)
6505 {
6506 if (msg_size > errbuf_size)
0b32bf0e
SM
6507 {
6508 strncpy (errbuf, msg, errbuf_size - 1);
6509 errbuf[errbuf_size - 1] = 0;
6510 }
fa9a63c5 6511 else
0b32bf0e 6512 strcpy (errbuf, msg);
fa9a63c5
RM
6513 }
6514
6515 return msg_size;
6516}
c0f9ea08 6517WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6518
6519
6520/* Free dynamically allocated space used by PREG. */
6521
6522void
6523regfree (preg)
6524 regex_t *preg;
6525{
6526 if (preg->buffer != NULL)
6527 free (preg->buffer);
6528 preg->buffer = NULL;
5e69f11e 6529
fa9a63c5
RM
6530 preg->allocated = 0;
6531 preg->used = 0;
6532
6533 if (preg->fastmap != NULL)
6534 free (preg->fastmap);
6535 preg->fastmap = NULL;
6536 preg->fastmap_accurate = 0;
6537
6538 if (preg->translate != NULL)
6539 free (preg->translate);
6540 preg->translate = NULL;
6541}
c0f9ea08 6542WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6543
6544#endif /* not emacs */
ab5796a9
MB
6545
6546/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6547 (do not change this comment) */