* regex.c (xmalloc, xrealloc): Define these when not linked to
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
1cee1e27 6 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
bc78d348 7
fa9a63c5
RM
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
4fc5845f 20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
25fe55af 21 USA. */
fa9a63c5 22
6df42991 23/* TODO:
505bde11 24 - structure the opcode space into opcode+flag.
dc1e502d 25 - merge with glibc's regex.[ch].
01618498 26 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
27 need to modify the compiled regexp so that re_match can be reentrant.
28 - get rid of on_failure_jump_smart by doing the optimization in re_comp
29 rather than at run-time, so that re_match can be reentrant.
01618498 30*/
505bde11 31
fa9a63c5 32/* AIX requires this to be the first thing in the file. */
0b32bf0e 33#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
34 #pragma alloca
35#endif
36
fa9a63c5 37#ifdef HAVE_CONFIG_H
0b32bf0e 38# include <config.h>
fa9a63c5
RM
39#endif
40
4bb91c68
SM
41#if defined STDC_HEADERS && !defined emacs
42# include <stddef.h>
43#else
44/* We need this for `regex.h', and perhaps for the Emacs include files. */
45# include <sys/types.h>
46#endif
fa9a63c5 47
14473664
SM
48/* Whether to use ISO C Amendment 1 wide char functions.
49 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
50#if defined _LIBC
51#define WIDE_CHAR_SUPPORT 1
52#else
14473664 53#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
54 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
55#endif
14473664
SM
56
57/* For platform which support the ISO C amendement 1 functionality we
58 support user defined character classes. */
a0ad02f7 59#if WIDE_CHAR_SUPPORT
14473664
SM
60/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
61# include <wchar.h>
62# include <wctype.h>
63#endif
64
c0f9ea08
SM
65#ifdef _LIBC
66/* We have to keep the namespace clean. */
67# define regfree(preg) __regfree (preg)
68# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
69# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
70# define regerror(errcode, preg, errbuf, errbuf_size) \
71 __regerror(errcode, preg, errbuf, errbuf_size)
72# define re_set_registers(bu, re, nu, st, en) \
73 __re_set_registers (bu, re, nu, st, en)
74# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
75 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
76# define re_match(bufp, string, size, pos, regs) \
77 __re_match (bufp, string, size, pos, regs)
78# define re_search(bufp, string, size, startpos, range, regs) \
79 __re_search (bufp, string, size, startpos, range, regs)
80# define re_compile_pattern(pattern, length, bufp) \
81 __re_compile_pattern (pattern, length, bufp)
82# define re_set_syntax(syntax) __re_set_syntax (syntax)
83# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
84 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
85# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
86
14473664
SM
87/* Make sure we call libc's function even if the user overrides them. */
88# define btowc __btowc
89# define iswctype __iswctype
90# define wctype __wctype
91
c0f9ea08
SM
92# define WEAK_ALIAS(a,b) weak_alias (a, b)
93
94/* We are also using some library internals. */
95# include <locale/localeinfo.h>
96# include <locale/elem-hash.h>
97# include <langinfo.h>
98#else
99# define WEAK_ALIAS(a,b)
100#endif
101
4bb91c68 102/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 103#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
104# include <libintl.h>
105#else
106# define gettext(msgid) (msgid)
107#endif
108
5e69f11e
RM
109#ifndef gettext_noop
110/* This define is so xgettext can find the internationalizable
111 strings. */
0b32bf0e 112# define gettext_noop(String) String
5e69f11e
RM
113#endif
114
fa9a63c5
RM
115/* The `emacs' switch turns on certain matching commands
116 that make sense only in Emacs. */
117#ifdef emacs
118
0b32bf0e
SM
119# include "lisp.h"
120# include "buffer.h"
b18215fc
RS
121
122/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 123# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 124
0b32bf0e
SM
125# include "syntax.h"
126# include "charset.h"
127# include "category.h"
fa9a63c5 128
7689ef0b
EZ
129# ifdef malloc
130# undef malloc
131# endif
0b32bf0e 132# define malloc xmalloc
7689ef0b
EZ
133# ifdef realloc
134# undef realloc
135# endif
0b32bf0e 136# define realloc xrealloc
7689ef0b
EZ
137# ifdef free
138# undef free
139# endif
0b32bf0e 140# define free xfree
9abbd165 141
0b32bf0e
SM
142/* Converts the pointer to the char to BEG-based offset from the start. */
143# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
144# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
145
146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
147# define RE_STRING_CHAR(p, s) \
4e8a9132 148 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
0b32bf0e 149# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
2d1675e4
SM
150 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
151
152/* Set C a (possibly multibyte) character before P. P points into a
153 string which is the virtual concatenation of STR1 (which ends at
154 END1) or STR2 (which ends at END2). */
0b32bf0e 155# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
2d1675e4
SM
156 do { \
157 if (multibyte) \
158 { \
0b32bf0e
SM
159 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
160 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
70806df6
KH
161 re_char *d0 = dtemp; \
162 PREV_CHAR_BOUNDARY (d0, dlimit); \
163 c = STRING_CHAR (d0, dtemp - d0); \
2d1675e4
SM
164 } \
165 else \
166 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
167 } while (0)
168
4e8a9132 169
fa9a63c5
RM
170#else /* not emacs */
171
172/* If we are not linking with Emacs proper,
173 we can't use the relocating allocator
174 even if config.h says that we can. */
0b32bf0e 175# undef REL_ALLOC
fa9a63c5 176
0b32bf0e
SM
177# if defined STDC_HEADERS || defined _LIBC
178# include <stdlib.h>
179# else
fa9a63c5
RM
180char *malloc ();
181char *realloc ();
0b32bf0e 182# endif
fa9a63c5 183
a77f947b
CY
184/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
185
186void *
187xmalloc (size)
188 size_t size;
189{
190 register void *val;
191 val = (void *) malloc (size);
192 if (!val && size)
193 {
194 write (2, "virtual memory exhausted\n", 25);
195 exit (1);
196 }
197 return val;
198}
199
200void *
201xrealloc (block, size)
202 void *block;
203 size_t size;
204{
205 register void *val;
206 /* We must call malloc explicitly when BLOCK is 0, since some
207 reallocs don't do this. */
208 if (! block)
209 val = (void *) malloc (size);
210 else
211 val = (void *) realloc (block, size);
212 if (!val && size)
213 {
214 write (2, "virtual memory exhausted\n", 25);
215 exit (1);
216 }
217 return val;
218}
219
9e4ecb26 220/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 221 If nothing else has been done, use the method below. */
0b32bf0e
SM
222# ifdef INHIBIT_STRING_HEADER
223# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
224# if !defined bzero && !defined bcopy
225# undef INHIBIT_STRING_HEADER
226# endif
227# endif
228# endif
9e4ecb26 229
4bb91c68 230/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
231 This is used in most programs--a few other programs avoid this
232 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
233# ifndef INHIBIT_STRING_HEADER
234# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
235# include <string.h>
0b32bf0e 236# ifndef bzero
4bb91c68
SM
237# ifndef _LIBC
238# define bzero(s, n) (memset (s, '\0', n), (s))
239# else
240# define bzero(s, n) __bzero (s, n)
241# endif
0b32bf0e
SM
242# endif
243# else
244# include <strings.h>
4bb91c68
SM
245# ifndef memcmp
246# define memcmp(s1, s2, n) bcmp (s1, s2, n)
247# endif
248# ifndef memcpy
249# define memcpy(d, s, n) (bcopy (s, d, n), (d))
250# endif
0b32bf0e
SM
251# endif
252# endif
fa9a63c5
RM
253
254/* Define the syntax stuff for \<, \>, etc. */
255
990b2375 256/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 257enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 258
0b32bf0e
SM
259# ifdef SWITCH_ENUM_BUG
260# define SWITCH_ENUM_CAST(x) ((int)(x))
261# else
262# define SWITCH_ENUM_CAST(x) (x)
263# endif
fa9a63c5 264
e934739e 265/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
266# define BASE_LEADING_CODE_P(c) (0)
267# define CHAR_CHARSET(c) 0
268# define CHARSET_LEADING_CODE_BASE(c) 0
269# define MAX_MULTIBYTE_LENGTH 1
270# define RE_MULTIBYTE_P(x) 0
271# define WORD_BOUNDARY_P(c1, c2) (0)
272# define CHAR_HEAD_P(p) (1)
273# define SINGLE_BYTE_CHAR_P(c) (1)
274# define SAME_CHARSET_P(c1, c2) (1)
275# define MULTIBYTE_FORM_LENGTH(p, s) (1)
70806df6 276# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
0b32bf0e
SM
277# define STRING_CHAR(p, s) (*(p))
278# define RE_STRING_CHAR STRING_CHAR
279# define CHAR_STRING(c, s) (*(s) = (c), 1)
280# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
281# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
282# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 283 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
0b32bf0e 284# define MAKE_CHAR(charset, c1, c2) (c1)
fa9a63c5 285#endif /* not emacs */
4e8a9132
SM
286
287#ifndef RE_TRANSLATE
0b32bf0e
SM
288# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
289# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 290#endif
fa9a63c5
RM
291\f
292/* Get the interface, including the syntax bits. */
293#include "regex.h"
294
f71b19b6
DL
295/* isalpha etc. are used for the character classes. */
296#include <ctype.h>
fa9a63c5 297
f71b19b6 298#ifdef emacs
fa9a63c5 299
f71b19b6 300/* 1 if C is an ASCII character. */
0b32bf0e 301# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 302
f71b19b6 303/* 1 if C is a unibyte character. */
0b32bf0e 304# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 305
f71b19b6 306/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 307
f71b19b6 308/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
309# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
310# define ISCNTRL(c) ((c) < ' ')
311# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
312 || ((c) >= 'a' && (c) <= 'f') \
313 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
314
315/* This is only used for single-byte characters. */
0b32bf0e 316# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
317
318/* The rest must handle multibyte characters. */
319
0b32bf0e 320# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 321 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
322 : 1)
323
14473664 324# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 325 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
326 : 1)
327
0b32bf0e 328# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
329 ? (((c) >= 'a' && (c) <= 'z') \
330 || ((c) >= 'A' && (c) <= 'Z') \
331 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
332 : SYNTAX (c) == Sword)
333
0b32bf0e 334# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
335 ? (((c) >= 'a' && (c) <= 'z') \
336 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
337 : SYNTAX (c) == Sword)
338
0b32bf0e 339# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 340
0b32bf0e 341# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
342 ? ((c) > ' ' && (c) < 0177 \
343 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
344 || ((c) >= 'A' && (c) <= 'Z') \
345 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
346 : SYNTAX (c) != Sword)
347
0b32bf0e 348# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 349
0b32bf0e 350# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 351
0b32bf0e 352# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
353
354#else /* not emacs */
355
f71b19b6
DL
356/* Jim Meyering writes:
357
358 "... Some ctype macros are valid only for character codes that
359 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
360 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 361 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
362 STDC_HEADERS is defined, then autoconf has verified that the ctype
363 macros don't need to be guarded with references to isascii. ...
364 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
365 eliminate the && through constant folding."
366 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 367
4bb91c68 368# undef ISASCII
0b32bf0e
SM
369# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
370# define ISASCII(c) 1
371# else
372# define ISASCII(c) isascii(c)
373# endif
f71b19b6
DL
374
375/* 1 if C is an ASCII character. */
0b32bf0e 376# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
377
378/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
379# define ISUNIBYTE(c) 1
380
381# ifdef isblank
382# define ISBLANK(c) (ISASCII (c) && isblank (c))
383# else
384# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
385# endif
386# ifdef isgraph
387# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
388# else
389# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
390# endif
391
4bb91c68 392# undef ISPRINT
0b32bf0e
SM
393# define ISPRINT(c) (ISASCII (c) && isprint (c))
394# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
395# define ISALNUM(c) (ISASCII (c) && isalnum (c))
396# define ISALPHA(c) (ISASCII (c) && isalpha (c))
397# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
398# define ISLOWER(c) (ISASCII (c) && islower (c))
399# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
400# define ISSPACE(c) (ISASCII (c) && isspace (c))
401# define ISUPPER(c) (ISASCII (c) && isupper (c))
402# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
403
404# define ISWORD(c) ISALPHA(c)
405
4bb91c68
SM
406# ifdef _tolower
407# define TOLOWER(c) _tolower(c)
408# else
409# define TOLOWER(c) tolower(c)
410# endif
411
412/* How many characters in the character set. */
413# define CHAR_SET_SIZE 256
414
0b32bf0e 415# ifdef SYNTAX_TABLE
f71b19b6 416
0b32bf0e 417extern char *re_syntax_table;
f71b19b6 418
0b32bf0e
SM
419# else /* not SYNTAX_TABLE */
420
0b32bf0e
SM
421static char re_syntax_table[CHAR_SET_SIZE];
422
423static void
424init_syntax_once ()
425{
426 register int c;
427 static int done = 0;
428
429 if (done)
430 return;
431
432 bzero (re_syntax_table, sizeof re_syntax_table);
433
4bb91c68
SM
434 for (c = 0; c < CHAR_SET_SIZE; ++c)
435 if (ISALNUM (c))
436 re_syntax_table[c] = Sword;
fa9a63c5 437
669fa600 438 re_syntax_table['_'] = Ssymbol;
fa9a63c5 439
0b32bf0e
SM
440 done = 1;
441}
442
443# endif /* not SYNTAX_TABLE */
96cc36cc 444
4bb91c68
SM
445# define SYNTAX(c) re_syntax_table[(c)]
446
96cc36cc
RS
447#endif /* not emacs */
448\f
fa9a63c5 449#ifndef NULL
0b32bf0e 450# define NULL (void *)0
fa9a63c5
RM
451#endif
452
453/* We remove any previous definition of `SIGN_EXTEND_CHAR',
454 since ours (we hope) works properly with all combinations of
455 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 456 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
457#undef SIGN_EXTEND_CHAR
458#if __STDC__
0b32bf0e 459# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
460#else /* not __STDC__ */
461/* As in Harbison and Steele. */
0b32bf0e 462# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
463#endif
464\f
465/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
466 use `alloca' instead of `malloc'. This is because using malloc in
467 re_search* or re_match* could cause memory leaks when C-g is used in
468 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
469 the other hand, malloc is more portable, and easier to debug.
470
fa9a63c5
RM
471 Because we sometimes use alloca, some routines have to be macros,
472 not functions -- `alloca'-allocated space disappears at the end of the
473 function it is called in. */
474
475#ifdef REGEX_MALLOC
476
0b32bf0e
SM
477# define REGEX_ALLOCATE malloc
478# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
479# define REGEX_FREE free
fa9a63c5
RM
480
481#else /* not REGEX_MALLOC */
482
483/* Emacs already defines alloca, sometimes. */
0b32bf0e 484# ifndef alloca
fa9a63c5
RM
485
486/* Make alloca work the best possible way. */
0b32bf0e
SM
487# ifdef __GNUC__
488# define alloca __builtin_alloca
489# else /* not __GNUC__ */
490# if HAVE_ALLOCA_H
491# include <alloca.h>
492# endif /* HAVE_ALLOCA_H */
493# endif /* not __GNUC__ */
fa9a63c5 494
0b32bf0e 495# endif /* not alloca */
fa9a63c5 496
0b32bf0e 497# define REGEX_ALLOCATE alloca
fa9a63c5
RM
498
499/* Assumes a `char *destination' variable. */
0b32bf0e 500# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 501 (destination = (char *) alloca (nsize), \
4bb91c68 502 memcpy (destination, source, osize))
fa9a63c5
RM
503
504/* No need to do anything to free, after alloca. */
0b32bf0e 505# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
506
507#endif /* not REGEX_MALLOC */
508
509/* Define how to allocate the failure stack. */
510
0b32bf0e 511#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 512
0b32bf0e 513# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 514 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 515# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 516 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 517# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
518 r_alloc_free (&failure_stack_ptr)
519
4297555e 520#else /* not using relocating allocator */
fa9a63c5 521
0b32bf0e 522# ifdef REGEX_MALLOC
fa9a63c5 523
0b32bf0e
SM
524# define REGEX_ALLOCATE_STACK malloc
525# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
526# define REGEX_FREE_STACK free
fa9a63c5 527
0b32bf0e 528# else /* not REGEX_MALLOC */
fa9a63c5 529
0b32bf0e 530# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 531
0b32bf0e 532# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 533 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 534/* No need to explicitly free anything. */
0b32bf0e 535# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 536
0b32bf0e 537# endif /* not REGEX_MALLOC */
4297555e 538#endif /* not using relocating allocator */
fa9a63c5
RM
539
540
541/* True if `size1' is non-NULL and PTR is pointing anywhere inside
542 `string1' or just past its end. This works if PTR is NULL, which is
543 a good thing. */
25fe55af 544#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
545 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
546
547/* (Re)Allocate N items of type T using malloc, or fail. */
548#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
549#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
550#define RETALLOC_IF(addr, n, t) \
551 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
552#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
553
4bb91c68 554#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
555
556#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
557
558#undef MAX
559#undef MIN
560#define MAX(a, b) ((a) > (b) ? (a) : (b))
561#define MIN(a, b) ((a) < (b) ? (a) : (b))
562
6470ea05
SM
563/* Type of source-pattern and string chars. */
564typedef const unsigned char re_char;
565
fa9a63c5
RM
566typedef char boolean;
567#define false 0
568#define true 1
569
4bb91c68
SM
570static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
571 re_char *string1, int size1,
572 re_char *string2, int size2,
573 int pos,
574 struct re_registers *regs,
575 int stop));
fa9a63c5
RM
576\f
577/* These are the command codes that appear in compiled regular
4bb91c68 578 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
579 command code can specify any interpretation whatsoever for its
580 arguments. Zero bytes may appear in the compiled regular expression. */
581
582typedef enum
583{
584 no_op = 0,
585
4bb91c68 586 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
587 succeed,
588
25fe55af 589 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
590 exactn,
591
25fe55af 592 /* Matches any (more or less) character. */
fa9a63c5
RM
593 anychar,
594
25fe55af
RS
595 /* Matches any one char belonging to specified set. First
596 following byte is number of bitmap bytes. Then come bytes
597 for a bitmap saying which chars are in. Bits in each byte
598 are ordered low-bit-first. A character is in the set if its
599 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
600 automatically not in the set.
601
602 If the length byte has the 0x80 bit set, then that stuff
603 is followed by a range table:
604 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 605 See RANGE_TABLE_WORK_BITS below.
01618498 606 2 bytes, the number of pairs that follow (upto 32767)
96cc36cc 607 pairs, each 2 multibyte characters,
0b32bf0e 608 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
609 charset,
610
25fe55af 611 /* Same parameters as charset, but match any character that is
4bb91c68 612 not one of those specified. */
fa9a63c5
RM
613 charset_not,
614
25fe55af
RS
615 /* Start remembering the text that is matched, for storing in a
616 register. Followed by one byte with the register number, in
617 the range 0 to one less than the pattern buffer's re_nsub
505bde11 618 field. */
fa9a63c5
RM
619 start_memory,
620
25fe55af
RS
621 /* Stop remembering the text that is matched and store it in a
622 memory register. Followed by one byte with the register
623 number, in the range 0 to one less than `re_nsub' in the
505bde11 624 pattern buffer. */
fa9a63c5
RM
625 stop_memory,
626
25fe55af 627 /* Match a duplicate of something remembered. Followed by one
4bb91c68 628 byte containing the register number. */
fa9a63c5
RM
629 duplicate,
630
25fe55af 631 /* Fail unless at beginning of line. */
fa9a63c5
RM
632 begline,
633
4bb91c68 634 /* Fail unless at end of line. */
fa9a63c5
RM
635 endline,
636
25fe55af
RS
637 /* Succeeds if at beginning of buffer (if emacs) or at beginning
638 of string to be matched (if not). */
fa9a63c5
RM
639 begbuf,
640
25fe55af 641 /* Analogously, for end of buffer/string. */
fa9a63c5 642 endbuf,
5e69f11e 643
25fe55af 644 /* Followed by two byte relative address to which to jump. */
5e69f11e 645 jump,
fa9a63c5 646
25fe55af
RS
647 /* Followed by two-byte relative address of place to resume at
648 in case of failure. */
fa9a63c5 649 on_failure_jump,
5e69f11e 650
25fe55af
RS
651 /* Like on_failure_jump, but pushes a placeholder instead of the
652 current string position when executed. */
fa9a63c5 653 on_failure_keep_string_jump,
5e69f11e 654
505bde11
SM
655 /* Just like `on_failure_jump', except that it checks that we
656 don't get stuck in an infinite loop (matching an empty string
657 indefinitely). */
658 on_failure_jump_loop,
659
0683b6fa
SM
660 /* Just like `on_failure_jump_loop', except that it checks for
661 a different kind of loop (the kind that shows up with non-greedy
662 operators). This operation has to be immediately preceded
663 by a `no_op'. */
664 on_failure_jump_nastyloop,
665
0b32bf0e 666 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
667 It analyses the loop before which it is put and if the
668 loop does not require backtracking, it changes itself to
4e8a9132
SM
669 `on_failure_keep_string_jump' and short-circuits the loop,
670 else it just defaults to changing itself into `on_failure_jump'.
671 It assumes that it is pointing to just past a `jump'. */
505bde11 672 on_failure_jump_smart,
fa9a63c5 673
25fe55af 674 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
675 After matching N times, jump to the address upon failure.
676 Does not work if N starts at 0: use on_failure_jump_loop
677 instead. */
fa9a63c5
RM
678 succeed_n,
679
25fe55af
RS
680 /* Followed by two-byte relative address, and two-byte number n.
681 Jump to the address N times, then fail. */
fa9a63c5
RM
682 jump_n,
683
25fe55af
RS
684 /* Set the following two-byte relative address to the
685 subsequent two-byte number. The address *includes* the two
686 bytes of number. */
fa9a63c5
RM
687 set_number_at,
688
fa9a63c5
RM
689 wordbeg, /* Succeeds if at word beginning. */
690 wordend, /* Succeeds if at word end. */
691
692 wordbound, /* Succeeds if at a word boundary. */
1fb352e0 693 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 694
669fa600
SM
695 symbeg, /* Succeeds if at symbol beginning. */
696 symend, /* Succeeds if at symbol end. */
697
fa9a63c5 698 /* Matches any character whose syntax is specified. Followed by
25fe55af 699 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
700 syntaxspec,
701
702 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
703 notsyntaxspec
704
705#ifdef emacs
706 ,before_dot, /* Succeeds if before point. */
707 at_dot, /* Succeeds if at point. */
708 after_dot, /* Succeeds if after point. */
b18215fc
RS
709
710 /* Matches any character whose category-set contains the specified
25fe55af
RS
711 category. The operator is followed by a byte which contains a
712 category code (mnemonic ASCII character). */
b18215fc
RS
713 categoryspec,
714
715 /* Matches any character whose category-set does not contain the
716 specified category. The operator is followed by a byte which
717 contains the category code (mnemonic ASCII character). */
718 notcategoryspec
fa9a63c5
RM
719#endif /* emacs */
720} re_opcode_t;
721\f
722/* Common operations on the compiled pattern. */
723
724/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
725
726#define STORE_NUMBER(destination, number) \
727 do { \
728 (destination)[0] = (number) & 0377; \
729 (destination)[1] = (number) >> 8; \
730 } while (0)
731
732/* Same as STORE_NUMBER, except increment DESTINATION to
733 the byte after where the number is stored. Therefore, DESTINATION
734 must be an lvalue. */
735
736#define STORE_NUMBER_AND_INCR(destination, number) \
737 do { \
738 STORE_NUMBER (destination, number); \
739 (destination) += 2; \
740 } while (0)
741
742/* Put into DESTINATION a number stored in two contiguous bytes starting
743 at SOURCE. */
744
745#define EXTRACT_NUMBER(destination, source) \
746 do { \
747 (destination) = *(source) & 0377; \
748 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
749 } while (0)
750
751#ifdef DEBUG
4bb91c68 752static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
753static void
754extract_number (dest, source)
755 int *dest;
01618498 756 re_char *source;
fa9a63c5 757{
5e69f11e 758 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
759 *dest = *source & 0377;
760 *dest += temp << 8;
761}
762
4bb91c68 763# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
764# undef EXTRACT_NUMBER
765# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
766# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
767
768#endif /* DEBUG */
769
770/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
771 SOURCE must be an lvalue. */
772
773#define EXTRACT_NUMBER_AND_INCR(destination, source) \
774 do { \
775 EXTRACT_NUMBER (destination, source); \
25fe55af 776 (source) += 2; \
fa9a63c5
RM
777 } while (0)
778
779#ifdef DEBUG
4bb91c68
SM
780static void extract_number_and_incr _RE_ARGS ((int *destination,
781 re_char **source));
fa9a63c5
RM
782static void
783extract_number_and_incr (destination, source)
784 int *destination;
01618498 785 re_char **source;
5e69f11e 786{
fa9a63c5
RM
787 extract_number (destination, *source);
788 *source += 2;
789}
790
0b32bf0e
SM
791# ifndef EXTRACT_MACROS
792# undef EXTRACT_NUMBER_AND_INCR
793# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 794 extract_number_and_incr (&dest, &src)
0b32bf0e 795# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
796
797#endif /* DEBUG */
798\f
b18215fc
RS
799/* Store a multibyte character in three contiguous bytes starting
800 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 801 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
802
803#define STORE_CHARACTER_AND_INCR(destination, character) \
804 do { \
805 (destination)[0] = (character) & 0377; \
806 (destination)[1] = ((character) >> 8) & 0377; \
807 (destination)[2] = (character) >> 16; \
808 (destination) += 3; \
809 } while (0)
810
811/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 812 starting at SOURCE. */
b18215fc
RS
813
814#define EXTRACT_CHARACTER(destination, source) \
815 do { \
816 (destination) = ((source)[0] \
817 | ((source)[1] << 8) \
818 | ((source)[2] << 16)); \
819 } while (0)
820
821
822/* Macros for charset. */
823
824/* Size of bitmap of charset P in bytes. P is a start of charset,
825 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
826#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
827
828/* Nonzero if charset P has range table. */
25fe55af 829#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
830
831/* Return the address of range table of charset P. But not the start
832 of table itself, but the before where the number of ranges is
96cc36cc
RS
833 stored. `2 +' means to skip re_opcode_t and size of bitmap,
834 and the 2 bytes of flags at the start of the range table. */
835#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
836
837/* Extract the bit flags that start a range table. */
838#define CHARSET_RANGE_TABLE_BITS(p) \
839 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
840 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
841
842/* Test if C is listed in the bitmap of charset P. */
843#define CHARSET_LOOKUP_BITMAP(p, c) \
844 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
845 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
846
847/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
848 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
849 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
850 and end. */
851#define CHARSET_RANGE_TABLE_END(range_table, count) \
852 ((range_table) + (count) * 2 * 3)
853
25fe55af 854/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
855 COUNT is number of ranges in RANGE_TABLE. */
856#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
857 do \
858 { \
01618498
SM
859 re_wchar_t range_start, range_end; \
860 re_char *p; \
861 re_char *range_table_end \
b18215fc
RS
862 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
863 \
864 for (p = (range_table); p < range_table_end; p += 2 * 3) \
865 { \
866 EXTRACT_CHARACTER (range_start, p); \
867 EXTRACT_CHARACTER (range_end, p + 3); \
868 \
869 if (range_start <= (c) && (c) <= range_end) \
870 { \
871 (not) = !(not); \
872 break; \
873 } \
874 } \
875 } \
876 while (0)
877
878/* Test if C is in range table of CHARSET. The flag NOT is negated if
879 C is listed in it. */
880#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
881 do \
882 { \
883 /* Number of ranges in range table. */ \
884 int count; \
01618498
SM
885 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
886 \
b18215fc
RS
887 EXTRACT_NUMBER_AND_INCR (count, range_table); \
888 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
889 } \
890 while (0)
891\f
fa9a63c5
RM
892/* If DEBUG is defined, Regex prints many voluminous messages about what
893 it is doing (if the variable `debug' is nonzero). If linked with the
894 main program in `iregex.c', you can enter patterns and strings
895 interactively. And if linked with the main program in `main.c' and
4bb91c68 896 the other test files, you can run the already-written tests. */
fa9a63c5
RM
897
898#ifdef DEBUG
899
900/* We use standard I/O for debugging. */
0b32bf0e 901# include <stdio.h>
fa9a63c5
RM
902
903/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 904# include <assert.h>
fa9a63c5 905
99633e97 906static int debug = -100000;
fa9a63c5 907
0b32bf0e
SM
908# define DEBUG_STATEMENT(e) e
909# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
910# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
911# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
912# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
913# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 914 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 915# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 916 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
917
918
919/* Print the fastmap in human-readable form. */
920
921void
922print_fastmap (fastmap)
923 char *fastmap;
924{
925 unsigned was_a_range = 0;
5e69f11e
RM
926 unsigned i = 0;
927
fa9a63c5
RM
928 while (i < (1 << BYTEWIDTH))
929 {
930 if (fastmap[i++])
931 {
932 was_a_range = 0;
25fe55af
RS
933 putchar (i - 1);
934 while (i < (1 << BYTEWIDTH) && fastmap[i])
935 {
936 was_a_range = 1;
937 i++;
938 }
fa9a63c5 939 if (was_a_range)
25fe55af
RS
940 {
941 printf ("-");
942 putchar (i - 1);
943 }
944 }
fa9a63c5 945 }
5e69f11e 946 putchar ('\n');
fa9a63c5
RM
947}
948
949
950/* Print a compiled pattern string in human-readable form, starting at
951 the START pointer into it and ending just before the pointer END. */
952
953void
954print_partial_compiled_pattern (start, end)
01618498
SM
955 re_char *start;
956 re_char *end;
fa9a63c5
RM
957{
958 int mcnt, mcnt2;
01618498
SM
959 re_char *p = start;
960 re_char *pend = end;
fa9a63c5
RM
961
962 if (start == NULL)
963 {
a1a052df 964 fprintf (stderr, "(null)\n");
fa9a63c5
RM
965 return;
966 }
5e69f11e 967
fa9a63c5
RM
968 /* Loop over pattern commands. */
969 while (p < pend)
970 {
a1a052df 971 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
972
973 switch ((re_opcode_t) *p++)
974 {
25fe55af 975 case no_op:
a1a052df 976 fprintf (stderr, "/no_op");
25fe55af 977 break;
fa9a63c5 978
99633e97 979 case succeed:
a1a052df 980 fprintf (stderr, "/succeed");
99633e97
SM
981 break;
982
fa9a63c5
RM
983 case exactn:
984 mcnt = *p++;
a1a052df 985 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 986 do
fa9a63c5 987 {
a1a052df 988 fprintf (stderr, "/%c", *p++);
25fe55af
RS
989 }
990 while (--mcnt);
991 break;
fa9a63c5
RM
992
993 case start_memory:
a1a052df 994 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 995 break;
fa9a63c5
RM
996
997 case stop_memory:
a1a052df 998 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 999 break;
fa9a63c5
RM
1000
1001 case duplicate:
a1a052df 1002 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1003 break;
1004
1005 case anychar:
a1a052df 1006 fprintf (stderr, "/anychar");
fa9a63c5
RM
1007 break;
1008
1009 case charset:
25fe55af
RS
1010 case charset_not:
1011 {
1012 register int c, last = -100;
fa9a63c5 1013 register int in_range = 0;
99633e97
SM
1014 int length = CHARSET_BITMAP_SIZE (p - 1);
1015 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1016
a1a052df 1017 fprintf (stderr, "/charset [%s",
275a3409 1018 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1019
275a3409
RS
1020 if (p + *p >= pend)
1021 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1022
25fe55af 1023 for (c = 0; c < 256; c++)
96cc36cc 1024 if (c / 8 < length
fa9a63c5
RM
1025 && (p[1 + (c/8)] & (1 << (c % 8))))
1026 {
1027 /* Are we starting a range? */
1028 if (last + 1 == c && ! in_range)
1029 {
a1a052df 1030 fprintf (stderr, "-");
fa9a63c5
RM
1031 in_range = 1;
1032 }
1033 /* Have we broken a range? */
1034 else if (last + 1 != c && in_range)
96cc36cc 1035 {
a1a052df 1036 fprintf (stderr, "%c", last);
fa9a63c5
RM
1037 in_range = 0;
1038 }
5e69f11e 1039
fa9a63c5 1040 if (! in_range)
a1a052df 1041 fprintf (stderr, "%c", c);
fa9a63c5
RM
1042
1043 last = c;
25fe55af 1044 }
fa9a63c5
RM
1045
1046 if (in_range)
a1a052df 1047 fprintf (stderr, "%c", last);
fa9a63c5 1048
a1a052df 1049 fprintf (stderr, "]");
fa9a63c5 1050
99633e97 1051 p += 1 + length;
96cc36cc 1052
96cc36cc 1053 if (has_range_table)
99633e97
SM
1054 {
1055 int count;
a1a052df 1056 fprintf (stderr, "has-range-table");
99633e97
SM
1057
1058 /* ??? Should print the range table; for now, just skip it. */
1059 p += 2; /* skip range table bits */
1060 EXTRACT_NUMBER_AND_INCR (count, p);
1061 p = CHARSET_RANGE_TABLE_END (p, count);
1062 }
fa9a63c5
RM
1063 }
1064 break;
1065
1066 case begline:
a1a052df 1067 fprintf (stderr, "/begline");
25fe55af 1068 break;
fa9a63c5
RM
1069
1070 case endline:
a1a052df 1071 fprintf (stderr, "/endline");
25fe55af 1072 break;
fa9a63c5
RM
1073
1074 case on_failure_jump:
25fe55af 1075 extract_number_and_incr (&mcnt, &p);
a1a052df 1076 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1077 break;
fa9a63c5
RM
1078
1079 case on_failure_keep_string_jump:
25fe55af 1080 extract_number_and_incr (&mcnt, &p);
a1a052df 1081 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1082 break;
fa9a63c5 1083
0683b6fa
SM
1084 case on_failure_jump_nastyloop:
1085 extract_number_and_incr (&mcnt, &p);
a1a052df 1086 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1087 break;
1088
505bde11 1089 case on_failure_jump_loop:
fa9a63c5 1090 extract_number_and_incr (&mcnt, &p);
a1a052df 1091 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1092 break;
1093
505bde11 1094 case on_failure_jump_smart:
fa9a63c5 1095 extract_number_and_incr (&mcnt, &p);
a1a052df 1096 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1097 break;
1098
25fe55af 1099 case jump:
fa9a63c5 1100 extract_number_and_incr (&mcnt, &p);
a1a052df 1101 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1102 break;
1103
25fe55af
RS
1104 case succeed_n:
1105 extract_number_and_incr (&mcnt, &p);
1106 extract_number_and_incr (&mcnt2, &p);
a1a052df 1107 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1108 break;
5e69f11e 1109
25fe55af
RS
1110 case jump_n:
1111 extract_number_and_incr (&mcnt, &p);
1112 extract_number_and_incr (&mcnt2, &p);
a1a052df 1113 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1114 break;
5e69f11e 1115
25fe55af
RS
1116 case set_number_at:
1117 extract_number_and_incr (&mcnt, &p);
1118 extract_number_and_incr (&mcnt2, &p);
a1a052df 1119 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1120 break;
5e69f11e 1121
25fe55af 1122 case wordbound:
a1a052df 1123 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1124 break;
1125
1126 case notwordbound:
a1a052df 1127 fprintf (stderr, "/notwordbound");
25fe55af 1128 break;
fa9a63c5
RM
1129
1130 case wordbeg:
a1a052df 1131 fprintf (stderr, "/wordbeg");
fa9a63c5 1132 break;
5e69f11e 1133
fa9a63c5 1134 case wordend:
a1a052df 1135 fprintf (stderr, "/wordend");
e2543b02 1136 break;
5e69f11e 1137
669fa600 1138 case symbeg:
e2543b02 1139 fprintf (stderr, "/symbeg");
669fa600
SM
1140 break;
1141
1142 case symend:
e2543b02 1143 fprintf (stderr, "/symend");
669fa600
SM
1144 break;
1145
1fb352e0 1146 case syntaxspec:
a1a052df 1147 fprintf (stderr, "/syntaxspec");
1fb352e0 1148 mcnt = *p++;
a1a052df 1149 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1150 break;
1151
1152 case notsyntaxspec:
a1a052df 1153 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1154 mcnt = *p++;
a1a052df 1155 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1156 break;
1157
0b32bf0e 1158# ifdef emacs
fa9a63c5 1159 case before_dot:
a1a052df 1160 fprintf (stderr, "/before_dot");
25fe55af 1161 break;
fa9a63c5
RM
1162
1163 case at_dot:
a1a052df 1164 fprintf (stderr, "/at_dot");
25fe55af 1165 break;
fa9a63c5
RM
1166
1167 case after_dot:
a1a052df 1168 fprintf (stderr, "/after_dot");
25fe55af 1169 break;
fa9a63c5 1170
1fb352e0 1171 case categoryspec:
a1a052df 1172 fprintf (stderr, "/categoryspec");
fa9a63c5 1173 mcnt = *p++;
a1a052df 1174 fprintf (stderr, "/%d", mcnt);
25fe55af 1175 break;
5e69f11e 1176
1fb352e0 1177 case notcategoryspec:
a1a052df 1178 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1179 mcnt = *p++;
a1a052df 1180 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1181 break;
0b32bf0e 1182# endif /* emacs */
fa9a63c5 1183
fa9a63c5 1184 case begbuf:
a1a052df 1185 fprintf (stderr, "/begbuf");
25fe55af 1186 break;
fa9a63c5
RM
1187
1188 case endbuf:
a1a052df 1189 fprintf (stderr, "/endbuf");
25fe55af 1190 break;
fa9a63c5 1191
25fe55af 1192 default:
a1a052df 1193 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1194 }
1195
a1a052df 1196 fprintf (stderr, "\n");
fa9a63c5
RM
1197 }
1198
a1a052df 1199 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1200}
1201
1202
1203void
1204print_compiled_pattern (bufp)
1205 struct re_pattern_buffer *bufp;
1206{
01618498 1207 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1208
1209 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1210 printf ("%ld bytes used/%ld bytes allocated.\n",
1211 bufp->used, bufp->allocated);
fa9a63c5
RM
1212
1213 if (bufp->fastmap_accurate && bufp->fastmap)
1214 {
1215 printf ("fastmap: ");
1216 print_fastmap (bufp->fastmap);
1217 }
1218
1219 printf ("re_nsub: %d\t", bufp->re_nsub);
1220 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1221 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1222 printf ("no_sub: %d\t", bufp->no_sub);
1223 printf ("not_bol: %d\t", bufp->not_bol);
1224 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1225 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1226 fflush (stdout);
fa9a63c5
RM
1227 /* Perhaps we should print the translate table? */
1228}
1229
1230
1231void
1232print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1233 re_char *where;
1234 re_char *string1;
1235 re_char *string2;
fa9a63c5
RM
1236 int size1;
1237 int size2;
1238{
4bb91c68 1239 int this_char;
5e69f11e 1240
fa9a63c5
RM
1241 if (where == NULL)
1242 printf ("(null)");
1243 else
1244 {
1245 if (FIRST_STRING_P (where))
25fe55af
RS
1246 {
1247 for (this_char = where - string1; this_char < size1; this_char++)
1248 putchar (string1[this_char]);
fa9a63c5 1249
25fe55af
RS
1250 where = string2;
1251 }
fa9a63c5
RM
1252
1253 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1254 putchar (string2[this_char]);
fa9a63c5
RM
1255 }
1256}
1257
1258#else /* not DEBUG */
1259
0b32bf0e
SM
1260# undef assert
1261# define assert(e)
fa9a63c5 1262
0b32bf0e
SM
1263# define DEBUG_STATEMENT(e)
1264# define DEBUG_PRINT1(x)
1265# define DEBUG_PRINT2(x1, x2)
1266# define DEBUG_PRINT3(x1, x2, x3)
1267# define DEBUG_PRINT4(x1, x2, x3, x4)
1268# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1269# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1270
1271#endif /* not DEBUG */
1272\f
1273/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1274 also be assigned to arbitrarily: each pattern buffer stores its own
1275 syntax, so it can be changed between regex compilations. */
1276/* This has no initializer because initialized variables in Emacs
1277 become read-only after dumping. */
1278reg_syntax_t re_syntax_options;
1279
1280
1281/* Specify the precise syntax of regexps for compilation. This provides
1282 for compatibility for various utilities which historically have
1283 different, incompatible syntaxes.
1284
1285 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1286 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1287
1288reg_syntax_t
1289re_set_syntax (syntax)
f9b0fd99 1290 reg_syntax_t syntax;
fa9a63c5
RM
1291{
1292 reg_syntax_t ret = re_syntax_options;
5e69f11e 1293
fa9a63c5
RM
1294 re_syntax_options = syntax;
1295 return ret;
1296}
c0f9ea08 1297WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1298
1299/* Regexp to use to replace spaces, or NULL meaning don't. */
1300static re_char *whitespace_regexp;
1301
1302void
1303re_set_whitespace_regexp (regexp)
6470ea05 1304 const char *regexp;
f9b0fd99 1305{
6470ea05 1306 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1307}
1308WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1309\f
1310/* This table gives an error message for each of the error codes listed
4bb91c68 1311 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1312 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1313 but why not be nice? */
fa9a63c5
RM
1314
1315static const char *re_error_msgid[] =
5e69f11e
RM
1316 {
1317 gettext_noop ("Success"), /* REG_NOERROR */
1318 gettext_noop ("No match"), /* REG_NOMATCH */
1319 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1320 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1321 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1322 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1323 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1324 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1325 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1326 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1327 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1328 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1329 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1330 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1331 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1332 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1333 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1334 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1335 };
1336\f
4bb91c68 1337/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1338
1339/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1340 searching and matching functions should not call alloca. On some
1341 systems, alloca is implemented in terms of malloc, and if we're
1342 using the relocating allocator routines, then malloc could cause a
1343 relocation, which might (if the strings being searched are in the
1344 ralloc heap) shift the data out from underneath the regexp
1345 routines.
1346
5e69f11e 1347 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1348 processes input from X in a signal handler; processing X input may
1349 call malloc; if input arrives while a matching routine is calling
1350 malloc, then we're scrod. But Emacs can't just block input while
1351 calling matching routines; then we don't notice interrupts when
1352 they come in. So, Emacs blocks input around all regexp calls
1353 except the matching calls, which it leaves unprotected, in the
1354 faith that they will not malloc. */
1355
1356/* Normally, this is fine. */
1357#define MATCH_MAY_ALLOCATE
1358
1359/* When using GNU C, we are not REALLY using the C alloca, no matter
1360 what config.h may say. So don't take precautions for it. */
1361#ifdef __GNUC__
0b32bf0e 1362# undef C_ALLOCA
fa9a63c5
RM
1363#endif
1364
1365/* The match routines may not allocate if (1) they would do it with malloc
1366 and (2) it's not safe for them to use malloc.
1367 Note that if REL_ALLOC is defined, matching would not use malloc for the
1368 failure stack, but we would still use it for the register vectors;
4bb91c68 1369 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1370#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1371# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1372#endif
1373
1374\f
1375/* Failure stack declarations and macros; both re_compile_fastmap and
1376 re_match_2 use a failure stack. These have to be macros because of
1377 REGEX_ALLOCATE_STACK. */
5e69f11e 1378
fa9a63c5 1379
320a2a73 1380/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1381 when matching. If this number is exceeded, we allocate more
1382 space, so it is not a hard limit. */
1383#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1384# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1385#endif
1386
1387/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1388 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1389 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1390 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1391 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1392# if defined MATCH_MAY_ALLOCATE
1393/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1394 whose default stack limit is 2mb. In order for a larger
1395 value to work reliably, you have to try to make it accord
1396 with the process stack limit. */
c0f9ea08
SM
1397size_t re_max_failures = 40000;
1398# else
1399size_t re_max_failures = 4000;
1400# endif
fa9a63c5
RM
1401
1402union fail_stack_elt
1403{
01618498 1404 re_char *pointer;
c0f9ea08
SM
1405 /* This should be the biggest `int' that's no bigger than a pointer. */
1406 long integer;
fa9a63c5
RM
1407};
1408
1409typedef union fail_stack_elt fail_stack_elt_t;
1410
1411typedef struct
1412{
1413 fail_stack_elt_t *stack;
c0f9ea08
SM
1414 size_t size;
1415 size_t avail; /* Offset of next open position. */
1416 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1417} fail_stack_type;
1418
505bde11 1419#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1420#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1421
1422
1423/* Define macros to initialize and free the failure stack.
1424 Do `return -2' if the alloc fails. */
1425
1426#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1427# define INIT_FAIL_STACK() \
fa9a63c5
RM
1428 do { \
1429 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1430 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1431 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1432 \
1433 if (fail_stack.stack == NULL) \
1434 return -2; \
1435 \
1436 fail_stack.size = INIT_FAILURE_ALLOC; \
1437 fail_stack.avail = 0; \
505bde11 1438 fail_stack.frame = 0; \
fa9a63c5
RM
1439 } while (0)
1440
0b32bf0e 1441# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1442#else
0b32bf0e 1443# define INIT_FAIL_STACK() \
fa9a63c5
RM
1444 do { \
1445 fail_stack.avail = 0; \
505bde11 1446 fail_stack.frame = 0; \
fa9a63c5
RM
1447 } while (0)
1448
0b32bf0e 1449# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1450#endif
1451
1452
320a2a73
KH
1453/* Double the size of FAIL_STACK, up to a limit
1454 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1455
1456 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1457 allocating space for it or it was already too large.
1458
4bb91c68 1459 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1460
320a2a73
KH
1461/* Factor to increase the failure stack size by
1462 when we increase it.
1463 This used to be 2, but 2 was too wasteful
1464 because the old discarded stacks added up to as much space
1465 were as ultimate, maximum-size stack. */
1466#define FAIL_STACK_GROWTH_FACTOR 4
1467
1468#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1469 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1470 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1471 ? 0 \
320a2a73
KH
1472 : ((fail_stack).stack \
1473 = (fail_stack_elt_t *) \
25fe55af
RS
1474 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1475 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1476 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1477 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1478 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1479 \
1480 (fail_stack).stack == NULL \
1481 ? 0 \
6453db45
KH
1482 : ((fail_stack).size \
1483 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1484 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1485 * FAIL_STACK_GROWTH_FACTOR)) \
1486 / sizeof (fail_stack_elt_t)), \
25fe55af 1487 1)))
fa9a63c5
RM
1488
1489
fa9a63c5
RM
1490/* Push a pointer value onto the failure stack.
1491 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1492 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1493#define PUSH_FAILURE_POINTER(item) \
01618498 1494 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1495
1496/* This pushes an integer-valued item onto the failure stack.
1497 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1498 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1499#define PUSH_FAILURE_INT(item) \
1500 fail_stack.stack[fail_stack.avail++].integer = (item)
1501
1502/* Push a fail_stack_elt_t value onto the failure stack.
1503 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1504 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1505#define PUSH_FAILURE_ELT(item) \
1506 fail_stack.stack[fail_stack.avail++] = (item)
1507
1508/* These three POP... operations complement the three PUSH... operations.
1509 All assume that `fail_stack' is nonempty. */
1510#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1511#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1512#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1513
505bde11
SM
1514/* Individual items aside from the registers. */
1515#define NUM_NONREG_ITEMS 3
1516
1517/* Used to examine the stack (to detect infinite loops). */
1518#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1519#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1520#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1521#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1522
1523
505bde11
SM
1524#define ENSURE_FAIL_STACK(space) \
1525while (REMAINING_AVAIL_SLOTS <= space) { \
1526 if (!GROW_FAIL_STACK (fail_stack)) \
1527 return -2; \
1528 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1529 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1530}
1531
1532/* Push register NUM onto the stack. */
1533#define PUSH_FAILURE_REG(num) \
1534do { \
1535 char *destination; \
1536 ENSURE_FAIL_STACK(3); \
1537 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1538 num, regstart[num], regend[num]); \
1539 PUSH_FAILURE_POINTER (regstart[num]); \
1540 PUSH_FAILURE_POINTER (regend[num]); \
1541 PUSH_FAILURE_INT (num); \
1542} while (0)
1543
01618498
SM
1544/* Change the counter's value to VAL, but make sure that it will
1545 be reset when backtracking. */
1546#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1547do { \
1548 char *destination; \
1549 int c; \
1550 ENSURE_FAIL_STACK(3); \
1551 EXTRACT_NUMBER (c, ptr); \
01618498 1552 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1553 PUSH_FAILURE_INT (c); \
1554 PUSH_FAILURE_POINTER (ptr); \
1555 PUSH_FAILURE_INT (-1); \
01618498 1556 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1557} while (0)
1558
505bde11 1559/* Pop a saved register off the stack. */
dc1e502d 1560#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1561do { \
1562 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1563 if (reg == -1) \
1564 { \
1565 /* It's a counter. */ \
6dcf2d0e
SM
1566 /* Here, we discard `const', making re_match non-reentrant. */ \
1567 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1568 reg = POP_FAILURE_INT (); \
1569 STORE_NUMBER (ptr, reg); \
1570 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1571 } \
1572 else \
1573 { \
1574 regend[reg] = POP_FAILURE_POINTER (); \
1575 regstart[reg] = POP_FAILURE_POINTER (); \
1576 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1577 reg, regstart[reg], regend[reg]); \
1578 } \
505bde11
SM
1579} while (0)
1580
1581/* Check that we are not stuck in an infinite loop. */
1582#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1583do { \
f6df485f 1584 int failure = TOP_FAILURE_HANDLE (); \
505bde11 1585 /* Check for infinite matching loops */ \
f6df485f
RS
1586 while (failure > 0 \
1587 && (FAILURE_STR (failure) == string_place \
1588 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1589 { \
1590 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1591 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1592 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1593 { \
6df42991
SM
1594 cycle = 1; \
1595 break; \
f6df485f 1596 } \
66f0296e 1597 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1598 failure = NEXT_FAILURE_HANDLE(failure); \
1599 } \
1600 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1601} while (0)
6df42991 1602
fa9a63c5 1603/* Push the information about the state we will need
5e69f11e
RM
1604 if we ever fail back to it.
1605
505bde11 1606 Requires variables fail_stack, regstart, regend and
320a2a73 1607 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1608 declared.
5e69f11e 1609
fa9a63c5
RM
1610 Does `return FAILURE_CODE' if runs out of memory. */
1611
505bde11
SM
1612#define PUSH_FAILURE_POINT(pattern, string_place) \
1613do { \
1614 char *destination; \
1615 /* Must be int, so when we don't save any registers, the arithmetic \
1616 of 0 + -1 isn't done as unsigned. */ \
1617 \
505bde11 1618 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1619 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1620 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1621 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1622 \
1623 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1624 \
1625 DEBUG_PRINT1 ("\n"); \
1626 \
1627 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1628 PUSH_FAILURE_INT (fail_stack.frame); \
1629 \
1630 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1631 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1632 DEBUG_PRINT1 ("'\n"); \
1633 PUSH_FAILURE_POINTER (string_place); \
1634 \
1635 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1636 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1637 PUSH_FAILURE_POINTER (pattern); \
1638 \
1639 /* Close the frame by moving the frame pointer past it. */ \
1640 fail_stack.frame = fail_stack.avail; \
1641} while (0)
fa9a63c5 1642
320a2a73
KH
1643/* Estimate the size of data pushed by a typical failure stack entry.
1644 An estimate is all we need, because all we use this for
1645 is to choose a limit for how big to make the failure stack. */
ada30c0e 1646/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1647#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1648
fa9a63c5
RM
1649/* How many items can still be added to the stack without overflowing it. */
1650#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1651
1652
1653/* Pops what PUSH_FAIL_STACK pushes.
1654
1655 We restore into the parameters, all of which should be lvalues:
1656 STR -- the saved data position.
1657 PAT -- the saved pattern position.
fa9a63c5 1658 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1659
fa9a63c5 1660 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1661 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1662
505bde11
SM
1663#define POP_FAILURE_POINT(str, pat) \
1664do { \
fa9a63c5
RM
1665 assert (!FAIL_STACK_EMPTY ()); \
1666 \
1667 /* Remove failure points and point to how many regs pushed. */ \
1668 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1669 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1670 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1671 \
505bde11
SM
1672 /* Pop the saved registers. */ \
1673 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1674 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1675 \
01618498 1676 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1677 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1678 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1679 \
1680 /* If the saved string location is NULL, it came from an \
1681 on_failure_keep_string_jump opcode, and we want to throw away the \
1682 saved NULL, thus retaining our current position in the string. */ \
01618498 1683 str = POP_FAILURE_POINTER (); \
505bde11 1684 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1685 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1686 DEBUG_PRINT1 ("'\n"); \
1687 \
505bde11
SM
1688 fail_stack.frame = POP_FAILURE_INT (); \
1689 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1690 \
505bde11
SM
1691 assert (fail_stack.avail >= 0); \
1692 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1693 \
fa9a63c5 1694 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1695} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1696
1697
1698\f
fa9a63c5 1699/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1700#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1701\f
1702/* Subroutine declarations and macros for regex_compile. */
1703
4bb91c68
SM
1704static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1705 reg_syntax_t syntax,
1706 struct re_pattern_buffer *bufp));
1707static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1708static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1709 int arg1, int arg2));
1710static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1711 int arg, unsigned char *end));
1712static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1713 int arg1, int arg2, unsigned char *end));
01618498
SM
1714static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1715 re_char *p,
4bb91c68 1716 reg_syntax_t syntax));
01618498
SM
1717static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1718 re_char *pend,
4bb91c68 1719 reg_syntax_t syntax));
01618498
SM
1720static re_char *skip_one_char _RE_ARGS ((re_char *p));
1721static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1722 char *fastmap, const int multibyte));
fa9a63c5 1723
fa9a63c5 1724/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1725 translation. */
36595814 1726#define PATFETCH(c) \
2d1675e4
SM
1727 do { \
1728 int len; \
1729 if (p == pend) return REG_EEND; \
1730 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1731 p += len; \
fa9a63c5
RM
1732 } while (0)
1733
fa9a63c5
RM
1734
1735/* If `translate' is non-null, return translate[D], else just D. We
1736 cast the subscript to translate because some data is declared as
1737 `char *', to avoid warnings when a string constant is passed. But
1738 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1739#ifndef TRANSLATE
0b32bf0e 1740# define TRANSLATE(d) \
66f0296e 1741 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1742#endif
fa9a63c5
RM
1743
1744
1745/* Macros for outputting the compiled pattern into `buffer'. */
1746
1747/* If the buffer isn't allocated when it comes in, use this. */
1748#define INIT_BUF_SIZE 32
1749
4bb91c68 1750/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1751#define GET_BUFFER_SPACE(n) \
01618498 1752 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1753 EXTEND_BUFFER ()
1754
1755/* Make sure we have one more byte of buffer space and then add C to it. */
1756#define BUF_PUSH(c) \
1757 do { \
1758 GET_BUFFER_SPACE (1); \
1759 *b++ = (unsigned char) (c); \
1760 } while (0)
1761
1762
1763/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1764#define BUF_PUSH_2(c1, c2) \
1765 do { \
1766 GET_BUFFER_SPACE (2); \
1767 *b++ = (unsigned char) (c1); \
1768 *b++ = (unsigned char) (c2); \
1769 } while (0)
1770
1771
4bb91c68 1772/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1773#define BUF_PUSH_3(c1, c2, c3) \
1774 do { \
1775 GET_BUFFER_SPACE (3); \
1776 *b++ = (unsigned char) (c1); \
1777 *b++ = (unsigned char) (c2); \
1778 *b++ = (unsigned char) (c3); \
1779 } while (0)
1780
1781
1782/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1783 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1784#define STORE_JUMP(op, loc, to) \
1785 store_op1 (op, loc, (to) - (loc) - 3)
1786
1787/* Likewise, for a two-argument jump. */
1788#define STORE_JUMP2(op, loc, to, arg) \
1789 store_op2 (op, loc, (to) - (loc) - 3, arg)
1790
4bb91c68 1791/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1792#define INSERT_JUMP(op, loc, to) \
1793 insert_op1 (op, loc, (to) - (loc) - 3, b)
1794
1795/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1796#define INSERT_JUMP2(op, loc, to, arg) \
1797 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1798
1799
1800/* This is not an arbitrary limit: the arguments which represent offsets
275a3409 1801 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1802 be too small, many things would have to change. */
275a3409
RS
1803# define MAX_BUF_SIZE (1L << 15)
1804
1805#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1806/* Any other compiler which, like MSC, has allocation limit below 2^16
1807 bytes will have to use approach similar to what was done below for
1808 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1809 reallocating to 0 bytes. Such thing is not going to work too well.
1810 You have been warned!! */
1811#if defined _MSC_VER && !defined WIN32
1812/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1813# define MAX_BUF_SIZE 65500L
1814#else
1815# define MAX_BUF_SIZE (1L << 16)
1816#endif
275a3409 1817#endif /* 0 */
fa9a63c5
RM
1818
1819/* Extend the buffer by twice its current size via realloc and
1820 reset the pointers that pointed into the old block to point to the
1821 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1822 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1823#if __BOUNDED_POINTERS__
1824# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1825# define MOVE_BUFFER_POINTER(P) \
1826 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1827# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1828 else \
1829 { \
1830 SET_HIGH_BOUND (b); \
1831 SET_HIGH_BOUND (begalt); \
1832 if (fixup_alt_jump) \
1833 SET_HIGH_BOUND (fixup_alt_jump); \
1834 if (laststart) \
1835 SET_HIGH_BOUND (laststart); \
1836 if (pending_exact) \
1837 SET_HIGH_BOUND (pending_exact); \
1838 }
1839#else
1840# define MOVE_BUFFER_POINTER(P) (P) += incr
1841# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1842#endif
fa9a63c5 1843#define EXTEND_BUFFER() \
25fe55af 1844 do { \
01618498 1845 re_char *old_buffer = bufp->buffer; \
25fe55af 1846 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1847 return REG_ESIZE; \
1848 bufp->allocated <<= 1; \
1849 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1850 bufp->allocated = MAX_BUF_SIZE; \
01618498 1851 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1852 if (bufp->buffer == NULL) \
1853 return REG_ESPACE; \
1854 /* If the buffer moved, move all the pointers into it. */ \
1855 if (old_buffer != bufp->buffer) \
1856 { \
4bb91c68
SM
1857 int incr = bufp->buffer - old_buffer; \
1858 MOVE_BUFFER_POINTER (b); \
1859 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1860 if (fixup_alt_jump) \
4bb91c68 1861 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1862 if (laststart) \
4bb91c68 1863 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1864 if (pending_exact) \
4bb91c68 1865 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1866 } \
4bb91c68 1867 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1868 } while (0)
1869
1870
1871/* Since we have one byte reserved for the register number argument to
1872 {start,stop}_memory, the maximum number of groups we can report
1873 things about is what fits in that byte. */
1874#define MAX_REGNUM 255
1875
1876/* But patterns can have more than `MAX_REGNUM' registers. We just
1877 ignore the excess. */
098d42af 1878typedef int regnum_t;
fa9a63c5
RM
1879
1880
1881/* Macros for the compile stack. */
1882
1883/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1884 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1885/* int may be not enough when sizeof(int) == 2. */
1886typedef long pattern_offset_t;
fa9a63c5
RM
1887
1888typedef struct
1889{
1890 pattern_offset_t begalt_offset;
1891 pattern_offset_t fixup_alt_jump;
5e69f11e 1892 pattern_offset_t laststart_offset;
fa9a63c5
RM
1893 regnum_t regnum;
1894} compile_stack_elt_t;
1895
1896
1897typedef struct
1898{
1899 compile_stack_elt_t *stack;
1900 unsigned size;
1901 unsigned avail; /* Offset of next open position. */
1902} compile_stack_type;
1903
1904
1905#define INIT_COMPILE_STACK_SIZE 32
1906
1907#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1908#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1909
4bb91c68 1910/* The next available element. */
fa9a63c5
RM
1911#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1912
1cee1e27
SM
1913/* Explicit quit checking is only used on NTemacs and whenever we
1914 use polling to process input events. */
1915#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1916extern int immediate_quit;
1917# define IMMEDIATE_QUIT_CHECK \
1918 do { \
1919 if (immediate_quit) QUIT; \
1920 } while (0)
1921#else
1922# define IMMEDIATE_QUIT_CHECK ((void)0)
1923#endif
1924\f
b18215fc
RS
1925/* Structure to manage work area for range table. */
1926struct range_table_work_area
1927{
1928 int *table; /* actual work area. */
1929 int allocated; /* allocated size for work area in bytes. */
25fe55af 1930 int used; /* actually used size in words. */
96cc36cc 1931 int bits; /* flag to record character classes */
b18215fc
RS
1932};
1933
77d11aec
RS
1934/* Make sure that WORK_AREA can hold more N multibyte characters.
1935 This is used only in set_image_of_range and set_image_of_range_1.
1936 It expects WORK_AREA to be a pointer.
1937 If it can't get the space, it returns from the surrounding function. */
1938
1939#define EXTEND_RANGE_TABLE(work_area, n) \
1940 do { \
1941 if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
1942 { \
1943 extend_range_table_work_area (work_area); \
1944 if ((work_area)->table == 0) \
1945 return (REG_ESPACE); \
1946 } \
b18215fc
RS
1947 } while (0)
1948
96cc36cc
RS
1949#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1950 (work_area).bits |= (bit)
1951
14473664
SM
1952/* Bits used to implement the multibyte-part of the various character classes
1953 such as [:alnum:] in a charset's range table. */
1954#define BIT_WORD 0x1
1955#define BIT_LOWER 0x2
1956#define BIT_PUNCT 0x4
1957#define BIT_SPACE 0x8
1958#define BIT_UPPER 0x10
1959#define BIT_MULTIBYTE 0x20
96cc36cc 1960
36595814
SM
1961/* Set a range START..END to WORK_AREA.
1962 The range is passed through TRANSLATE, so START and END
1963 should be untranslated. */
77d11aec
RS
1964#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
1965 do { \
1966 int tem; \
1967 tem = set_image_of_range (&work_area, start, end, translate); \
1968 if (tem > 0) \
1969 FREE_STACK_RETURN (tem); \
b18215fc
RS
1970 } while (0)
1971
25fe55af 1972/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1973#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1974 do { \
1975 if ((work_area).table) \
1976 free ((work_area).table); \
1977 } while (0)
1978
96cc36cc 1979#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1980#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1981#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1982#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1983\f
b18215fc 1984
fa9a63c5 1985/* Set the bit for character C in a list. */
01618498 1986#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1987
1988
1989/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1990#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
1991 do { \
1992 if (p == pend) \
1993 FREE_STACK_RETURN (REG_EBRACE); \
1994 else \
1995 { \
1996 PATFETCH (c); \
1997 while ('0' <= c && c <= '9') \
1998 { \
1999 int prev; \
2000 if (num < 0) \
2001 num = 0; \
2002 prev = num; \
2003 num = num * 10 + c - '0'; \
2004 if (num / 10 != prev) \
2005 FREE_STACK_RETURN (REG_BADBR); \
2006 if (p == pend) \
2007 FREE_STACK_RETURN (REG_EBRACE); \
2008 PATFETCH (c); \
2009 } \
2010 } \
2011 } while (0)
77d11aec 2012\f
1fdab503 2013#if ! WIDE_CHAR_SUPPORT
01618498 2014
14473664 2015/* Map a string to the char class it names (if any). */
1fdab503 2016re_wctype_t
ada30c0e
SM
2017re_wctype (str)
2018 re_char *str;
14473664 2019{
ada30c0e 2020 const char *string = str;
14473664
SM
2021 if (STREQ (string, "alnum")) return RECC_ALNUM;
2022 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2023 else if (STREQ (string, "word")) return RECC_WORD;
2024 else if (STREQ (string, "ascii")) return RECC_ASCII;
2025 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2026 else if (STREQ (string, "graph")) return RECC_GRAPH;
2027 else if (STREQ (string, "lower")) return RECC_LOWER;
2028 else if (STREQ (string, "print")) return RECC_PRINT;
2029 else if (STREQ (string, "punct")) return RECC_PUNCT;
2030 else if (STREQ (string, "space")) return RECC_SPACE;
2031 else if (STREQ (string, "upper")) return RECC_UPPER;
2032 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2033 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2034 else if (STREQ (string, "digit")) return RECC_DIGIT;
2035 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2036 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2037 else if (STREQ (string, "blank")) return RECC_BLANK;
2038 else return 0;
2039}
2040
2041/* True iff CH is in the char class CC. */
1fdab503 2042boolean
14473664
SM
2043re_iswctype (ch, cc)
2044 int ch;
2045 re_wctype_t cc;
2046{
2047 switch (cc)
2048 {
0cdd06f8
SM
2049 case RECC_ALNUM: return ISALNUM (ch);
2050 case RECC_ALPHA: return ISALPHA (ch);
2051 case RECC_BLANK: return ISBLANK (ch);
2052 case RECC_CNTRL: return ISCNTRL (ch);
2053 case RECC_DIGIT: return ISDIGIT (ch);
2054 case RECC_GRAPH: return ISGRAPH (ch);
2055 case RECC_LOWER: return ISLOWER (ch);
2056 case RECC_PRINT: return ISPRINT (ch);
2057 case RECC_PUNCT: return ISPUNCT (ch);
2058 case RECC_SPACE: return ISSPACE (ch);
2059 case RECC_UPPER: return ISUPPER (ch);
2060 case RECC_XDIGIT: return ISXDIGIT (ch);
2061 case RECC_ASCII: return IS_REAL_ASCII (ch);
2062 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2063 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2064 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2065 case RECC_WORD: return ISWORD (ch);
2066 case RECC_ERROR: return false;
2067 default:
2068 abort();
14473664
SM
2069 }
2070}
fa9a63c5 2071
14473664
SM
2072/* Return a bit-pattern to use in the range-table bits to match multibyte
2073 chars of class CC. */
2074static int
2075re_wctype_to_bit (cc)
2076 re_wctype_t cc;
2077{
2078 switch (cc)
2079 {
2080 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2081 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2082 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2083 case RECC_LOWER: return BIT_LOWER;
2084 case RECC_UPPER: return BIT_UPPER;
2085 case RECC_PUNCT: return BIT_PUNCT;
2086 case RECC_SPACE: return BIT_SPACE;
14473664 2087 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2088 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2089 default:
2090 abort();
14473664
SM
2091 }
2092}
2093#endif
77d11aec
RS
2094\f
2095/* Filling in the work area of a range. */
2096
2097/* Actually extend the space in WORK_AREA. */
2098
2099static void
2100extend_range_table_work_area (work_area)
2101 struct range_table_work_area *work_area;
177c0ea7 2102{
77d11aec
RS
2103 work_area->allocated += 16 * sizeof (int);
2104 if (work_area->table)
2105 work_area->table
8a169494 2106 = (int *) xrealloc (work_area->table, work_area->allocated);
77d11aec
RS
2107 else
2108 work_area->table
4aea1d4b 2109 = (int *) xmalloc (work_area->allocated);
77d11aec
RS
2110}
2111
2112#ifdef emacs
2113
2114/* Carefully find the ranges of codes that are equivalent
2115 under case conversion to the range start..end when passed through
2116 TRANSLATE. Handle the case where non-letters can come in between
2117 two upper-case letters (which happens in Latin-1).
2118 Also handle the case of groups of more than 2 case-equivalent chars.
2119
2120 The basic method is to look at consecutive characters and see
2121 if they can form a run that can be handled as one.
2122
2123 Returns -1 if successful, REG_ESPACE if ran out of space. */
2124
2125static int
2126set_image_of_range_1 (work_area, start, end, translate)
2127 RE_TRANSLATE_TYPE translate;
2128 struct range_table_work_area *work_area;
2129 re_wchar_t start, end;
2130{
2131 /* `one_case' indicates a character, or a run of characters,
2132 each of which is an isolate (no case-equivalents).
2133 This includes all ASCII non-letters.
2134
2135 `two_case' indicates a character, or a run of characters,
2136 each of which has two case-equivalent forms.
2137 This includes all ASCII letters.
2138
2139 `strange' indicates a character that has more than one
2140 case-equivalent. */
177c0ea7 2141
77d11aec
RS
2142 enum case_type {one_case, two_case, strange};
2143
2144 /* Describe the run that is in progress,
2145 which the next character can try to extend.
2146 If run_type is strange, that means there really is no run.
2147 If run_type is one_case, then run_start...run_end is the run.
2148 If run_type is two_case, then the run is run_start...run_end,
2149 and the case-equivalents end at run_eqv_end. */
2150
2151 enum case_type run_type = strange;
2152 int run_start, run_end, run_eqv_end;
2153
2154 Lisp_Object eqv_table;
2155
2156 if (!RE_TRANSLATE_P (translate))
2157 {
b7c12565 2158 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2159 work_area->table[work_area->used++] = (start);
2160 work_area->table[work_area->used++] = (end);
b7c12565 2161 return -1;
77d11aec
RS
2162 }
2163
2164 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2165
77d11aec
RS
2166 for (; start <= end; start++)
2167 {
2168 enum case_type this_type;
2169 int eqv = RE_TRANSLATE (eqv_table, start);
2170 int minchar, maxchar;
2171
2172 /* Classify this character */
2173 if (eqv == start)
2174 this_type = one_case;
2175 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2176 this_type = two_case;
2177 else
2178 this_type = strange;
2179
2180 if (start < eqv)
2181 minchar = start, maxchar = eqv;
2182 else
2183 minchar = eqv, maxchar = start;
2184
2185 /* Can this character extend the run in progress? */
2186 if (this_type == strange || this_type != run_type
2187 || !(minchar == run_end + 1
2188 && (run_type == two_case
2189 ? maxchar == run_eqv_end + 1 : 1)))
2190 {
2191 /* No, end the run.
2192 Record each of its equivalent ranges. */
2193 if (run_type == one_case)
2194 {
2195 EXTEND_RANGE_TABLE (work_area, 2);
2196 work_area->table[work_area->used++] = run_start;
2197 work_area->table[work_area->used++] = run_end;
2198 }
2199 else if (run_type == two_case)
2200 {
2201 EXTEND_RANGE_TABLE (work_area, 4);
2202 work_area->table[work_area->used++] = run_start;
2203 work_area->table[work_area->used++] = run_end;
2204 work_area->table[work_area->used++]
2205 = RE_TRANSLATE (eqv_table, run_start);
2206 work_area->table[work_area->used++]
2207 = RE_TRANSLATE (eqv_table, run_end);
2208 }
2209 run_type = strange;
2210 }
177c0ea7 2211
77d11aec
RS
2212 if (this_type == strange)
2213 {
2214 /* For a strange character, add each of its equivalents, one
2215 by one. Don't start a range. */
2216 do
2217 {
2218 EXTEND_RANGE_TABLE (work_area, 2);
2219 work_area->table[work_area->used++] = eqv;
2220 work_area->table[work_area->used++] = eqv;
2221 eqv = RE_TRANSLATE (eqv_table, eqv);
2222 }
2223 while (eqv != start);
2224 }
2225
2226 /* Add this char to the run, or start a new run. */
2227 else if (run_type == strange)
2228 {
2229 /* Initialize a new range. */
2230 run_type = this_type;
2231 run_start = start;
2232 run_end = start;
2233 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2234 }
2235 else
2236 {
2237 /* Extend a running range. */
2238 run_end = minchar;
2239 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2240 }
2241 }
2242
2243 /* If a run is still in progress at the end, finish it now
2244 by recording its equivalent ranges. */
2245 if (run_type == one_case)
2246 {
2247 EXTEND_RANGE_TABLE (work_area, 2);
2248 work_area->table[work_area->used++] = run_start;
2249 work_area->table[work_area->used++] = run_end;
2250 }
2251 else if (run_type == two_case)
2252 {
2253 EXTEND_RANGE_TABLE (work_area, 4);
2254 work_area->table[work_area->used++] = run_start;
2255 work_area->table[work_area->used++] = run_end;
2256 work_area->table[work_area->used++]
2257 = RE_TRANSLATE (eqv_table, run_start);
2258 work_area->table[work_area->used++]
2259 = RE_TRANSLATE (eqv_table, run_end);
2260 }
2261
2262 return -1;
2263}
36595814 2264
77d11aec 2265#endif /* emacs */
36595814 2266
b7c12565 2267/* Record the the image of the range start..end when passed through
36595814
SM
2268 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2269 and is not even necessarily contiguous.
b7c12565
RS
2270 Normally we approximate it with the smallest contiguous range that contains
2271 all the chars we need. However, for Latin-1 we go to extra effort
2272 to do a better job.
2273
2274 This function is not called for ASCII ranges.
77d11aec
RS
2275
2276 Returns -1 if successful, REG_ESPACE if ran out of space. */
2277
2278static int
36595814
SM
2279set_image_of_range (work_area, start, end, translate)
2280 RE_TRANSLATE_TYPE translate;
2281 struct range_table_work_area *work_area;
2282 re_wchar_t start, end;
2283{
77d11aec
RS
2284 re_wchar_t cmin, cmax;
2285
2286#ifdef emacs
2287 /* For Latin-1 ranges, use set_image_of_range_1
2288 to get proper handling of ranges that include letters and nonletters.
b7c12565 2289 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2290 For other character sets, we don't bother to get this right. */
b7c12565
RS
2291 if (RE_TRANSLATE_P (translate) && start < 04400
2292 && !(start < 04200 && end >= 04377))
77d11aec 2293 {
b7c12565 2294 int newend;
77d11aec 2295 int tem;
b7c12565
RS
2296 newend = end;
2297 if (newend > 04377)
2298 newend = 04377;
2299 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2300 if (tem > 0)
2301 return tem;
2302
2303 start = 04400;
2304 if (end < 04400)
2305 return -1;
2306 }
2307#endif
2308
b7c12565
RS
2309 EXTEND_RANGE_TABLE (work_area, 2);
2310 work_area->table[work_area->used++] = (start);
2311 work_area->table[work_area->used++] = (end);
2312
2313 cmin = -1, cmax = -1;
77d11aec 2314
36595814 2315 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2316 {
2317 int ch;
77d11aec 2318
b7c12565
RS
2319 for (ch = start; ch <= end; ch++)
2320 {
2321 re_wchar_t c = TRANSLATE (ch);
2322 if (! (start <= c && c <= end))
2323 {
2324 if (cmin == -1)
2325 cmin = c, cmax = c;
2326 else
2327 {
2328 cmin = MIN (cmin, c);
2329 cmax = MAX (cmax, c);
2330 }
2331 }
2332 }
2333
2334 if (cmin != -1)
2335 {
2336 EXTEND_RANGE_TABLE (work_area, 2);
2337 work_area->table[work_area->used++] = (cmin);
2338 work_area->table[work_area->used++] = (cmax);
2339 }
2340 }
36595814 2341
77d11aec
RS
2342 return -1;
2343}
fa9a63c5
RM
2344\f
2345#ifndef MATCH_MAY_ALLOCATE
2346
2347/* If we cannot allocate large objects within re_match_2_internal,
2348 we make the fail stack and register vectors global.
2349 The fail stack, we grow to the maximum size when a regexp
2350 is compiled.
2351 The register vectors, we adjust in size each time we
2352 compile a regexp, according to the number of registers it needs. */
2353
2354static fail_stack_type fail_stack;
2355
2356/* Size with which the following vectors are currently allocated.
2357 That is so we can make them bigger as needed,
4bb91c68 2358 but never make them smaller. */
fa9a63c5
RM
2359static int regs_allocated_size;
2360
66f0296e
SM
2361static re_char ** regstart, ** regend;
2362static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2363
2364/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2365 but don't make them smaller. */
fa9a63c5
RM
2366
2367static
2368regex_grow_registers (num_regs)
2369 int num_regs;
2370{
2371 if (num_regs > regs_allocated_size)
2372 {
66f0296e
SM
2373 RETALLOC_IF (regstart, num_regs, re_char *);
2374 RETALLOC_IF (regend, num_regs, re_char *);
2375 RETALLOC_IF (best_regstart, num_regs, re_char *);
2376 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2377
2378 regs_allocated_size = num_regs;
2379 }
2380}
2381
2382#endif /* not MATCH_MAY_ALLOCATE */
2383\f
99633e97
SM
2384static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2385 compile_stack,
2386 regnum_t regnum));
2387
fa9a63c5
RM
2388/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2389 Returns one of error codes defined in `regex.h', or zero for success.
2390
2391 Assumes the `allocated' (and perhaps `buffer') and `translate'
2392 fields are set in BUFP on entry.
2393
2394 If it succeeds, results are put in BUFP (if it returns an error, the
2395 contents of BUFP are undefined):
2396 `buffer' is the compiled pattern;
2397 `syntax' is set to SYNTAX;
2398 `used' is set to the length of the compiled pattern;
2399 `fastmap_accurate' is zero;
2400 `re_nsub' is the number of subexpressions in PATTERN;
2401 `not_bol' and `not_eol' are zero;
5e69f11e 2402
c0f9ea08 2403 The `fastmap' field is neither examined nor set. */
fa9a63c5 2404
505bde11
SM
2405/* Insert the `jump' from the end of last alternative to "here".
2406 The space for the jump has already been allocated. */
2407#define FIXUP_ALT_JUMP() \
2408do { \
2409 if (fixup_alt_jump) \
2410 STORE_JUMP (jump, fixup_alt_jump, b); \
2411} while (0)
2412
2413
fa9a63c5
RM
2414/* Return, freeing storage we allocated. */
2415#define FREE_STACK_RETURN(value) \
b18215fc
RS
2416 do { \
2417 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2418 free (compile_stack.stack); \
2419 return value; \
2420 } while (0)
fa9a63c5
RM
2421
2422static reg_errcode_t
2423regex_compile (pattern, size, syntax, bufp)
66f0296e 2424 re_char *pattern;
4bb91c68 2425 size_t size;
fa9a63c5
RM
2426 reg_syntax_t syntax;
2427 struct re_pattern_buffer *bufp;
2428{
01618498
SM
2429 /* We fetch characters from PATTERN here. */
2430 register re_wchar_t c, c1;
5e69f11e 2431
fa9a63c5 2432 /* A random temporary spot in PATTERN. */
66f0296e 2433 re_char *p1;
fa9a63c5
RM
2434
2435 /* Points to the end of the buffer, where we should append. */
2436 register unsigned char *b;
5e69f11e 2437
fa9a63c5
RM
2438 /* Keeps track of unclosed groups. */
2439 compile_stack_type compile_stack;
2440
2441 /* Points to the current (ending) position in the pattern. */
22336245
RS
2442#ifdef AIX
2443 /* `const' makes AIX compiler fail. */
66f0296e 2444 unsigned char *p = pattern;
22336245 2445#else
66f0296e 2446 re_char *p = pattern;
22336245 2447#endif
66f0296e 2448 re_char *pend = pattern + size;
5e69f11e 2449
fa9a63c5 2450 /* How to translate the characters in the pattern. */
6676cb1c 2451 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2452
2453 /* Address of the count-byte of the most recently inserted `exactn'
2454 command. This makes it possible to tell if a new exact-match
2455 character can be added to that command or if the character requires
2456 a new `exactn' command. */
2457 unsigned char *pending_exact = 0;
2458
2459 /* Address of start of the most recently finished expression.
2460 This tells, e.g., postfix * where to find the start of its
2461 operand. Reset at the beginning of groups and alternatives. */
2462 unsigned char *laststart = 0;
2463
2464 /* Address of beginning of regexp, or inside of last group. */
2465 unsigned char *begalt;
2466
2467 /* Place in the uncompiled pattern (i.e., the {) to
2468 which to go back if the interval is invalid. */
66f0296e 2469 re_char *beg_interval;
5e69f11e 2470
fa9a63c5 2471 /* Address of the place where a forward jump should go to the end of
25fe55af 2472 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2473 last -- ends with a forward jump of this sort. */
2474 unsigned char *fixup_alt_jump = 0;
2475
2476 /* Counts open-groups as they are encountered. Remembered for the
2477 matching close-group on the compile stack, so the same register
2478 number is put in the stop_memory as the start_memory. */
2479 regnum_t regnum = 0;
2480
b18215fc
RS
2481 /* Work area for range table of charset. */
2482 struct range_table_work_area range_table_work;
2483
2d1675e4
SM
2484 /* If the object matched can contain multibyte characters. */
2485 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2486
f9b0fd99
RS
2487 /* Nonzero if we have pushed down into a subpattern. */
2488 int in_subpattern = 0;
2489
2490 /* These hold the values of p, pattern, and pend from the main
2491 pattern when we have pushed into a subpattern. */
2492 re_char *main_p;
2493 re_char *main_pattern;
2494 re_char *main_pend;
2495
fa9a63c5 2496#ifdef DEBUG
99633e97 2497 debug++;
fa9a63c5 2498 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2499 if (debug > 0)
fa9a63c5
RM
2500 {
2501 unsigned debug_count;
5e69f11e 2502
fa9a63c5 2503 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2504 putchar (pattern[debug_count]);
fa9a63c5
RM
2505 putchar ('\n');
2506 }
2507#endif /* DEBUG */
2508
2509 /* Initialize the compile stack. */
2510 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2511 if (compile_stack.stack == NULL)
2512 return REG_ESPACE;
2513
2514 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2515 compile_stack.avail = 0;
2516
b18215fc
RS
2517 range_table_work.table = 0;
2518 range_table_work.allocated = 0;
2519
fa9a63c5
RM
2520 /* Initialize the pattern buffer. */
2521 bufp->syntax = syntax;
2522 bufp->fastmap_accurate = 0;
2523 bufp->not_bol = bufp->not_eol = 0;
2524
2525 /* Set `used' to zero, so that if we return an error, the pattern
2526 printer (for debugging) will think there's no pattern. We reset it
2527 at the end. */
2528 bufp->used = 0;
5e69f11e 2529
fa9a63c5 2530 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2531 bufp->re_nsub = 0;
fa9a63c5 2532
0b32bf0e 2533#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2534 /* Initialize the syntax table. */
2535 init_syntax_once ();
2536#endif
2537
2538 if (bufp->allocated == 0)
2539 {
2540 if (bufp->buffer)
2541 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2542 enough space. This loses if buffer's address is bogus, but
2543 that is the user's responsibility. */
2544 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2545 }
fa9a63c5 2546 else
25fe55af
RS
2547 { /* Caller did not allocate a buffer. Do it for them. */
2548 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2549 }
fa9a63c5
RM
2550 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2551
2552 bufp->allocated = INIT_BUF_SIZE;
2553 }
2554
2555 begalt = b = bufp->buffer;
2556
2557 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2558 while (1)
fa9a63c5 2559 {
f9b0fd99
RS
2560 if (p == pend)
2561 {
2562 /* If this is the end of an included regexp,
2563 pop back to the main regexp and try again. */
2564 if (in_subpattern)
2565 {
2566 in_subpattern = 0;
2567 pattern = main_pattern;
2568 p = main_p;
2569 pend = main_pend;
2570 continue;
2571 }
2572 /* If this is the end of the main regexp, we are done. */
2573 break;
2574 }
2575
fa9a63c5
RM
2576 PATFETCH (c);
2577
2578 switch (c)
25fe55af 2579 {
f9b0fd99
RS
2580 case ' ':
2581 {
2582 re_char *p1 = p;
2583
2584 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2585 spaces normally. And don't try to do this recursively. */
2586 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2587 goto normal_char;
2588
2589 /* Peek past following spaces. */
2590 while (p1 != pend)
2591 {
2592 if (*p1 != ' ')
2593 break;
2594 p1++;
2595 }
2596 /* If the spaces are followed by a repetition op,
2597 treat them normally. */
c721eee5
RS
2598 if (p1 != pend
2599 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2600 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2601 goto normal_char;
2602
2603 /* Replace the spaces with the whitespace regexp. */
2604 in_subpattern = 1;
2605 main_p = p1;
2606 main_pend = pend;
2607 main_pattern = pattern;
2608 p = pattern = whitespace_regexp;
2609 pend = p + strlen (p);
2610 break;
2611 }
2612
25fe55af
RS
2613 case '^':
2614 {
2615 if ( /* If at start of pattern, it's an operator. */
2616 p == pattern + 1
2617 /* If context independent, it's an operator. */
2618 || syntax & RE_CONTEXT_INDEP_ANCHORS
2619 /* Otherwise, depends on what's come before. */
2620 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2621 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2622 else
2623 goto normal_char;
2624 }
2625 break;
2626
2627
2628 case '$':
2629 {
2630 if ( /* If at end of pattern, it's an operator. */
2631 p == pend
2632 /* If context independent, it's an operator. */
2633 || syntax & RE_CONTEXT_INDEP_ANCHORS
2634 /* Otherwise, depends on what's next. */
2635 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2636 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2637 else
2638 goto normal_char;
2639 }
2640 break;
fa9a63c5
RM
2641
2642
2643 case '+':
25fe55af
RS
2644 case '?':
2645 if ((syntax & RE_BK_PLUS_QM)
2646 || (syntax & RE_LIMITED_OPS))
2647 goto normal_char;
2648 handle_plus:
2649 case '*':
2650 /* If there is no previous pattern... */
2651 if (!laststart)
2652 {
2653 if (syntax & RE_CONTEXT_INVALID_OPS)
2654 FREE_STACK_RETURN (REG_BADRPT);
2655 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2656 goto normal_char;
2657 }
2658
2659 {
25fe55af 2660 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2661 boolean zero_times_ok = 0, many_times_ok = 0;
2662 boolean greedy = 1;
25fe55af
RS
2663
2664 /* If there is a sequence of repetition chars, collapse it
2665 down to just one (the right one). We can't combine
2666 interval operators with these because of, e.g., `a{2}*',
2667 which should only match an even number of `a's. */
2668
2669 for (;;)
2670 {
0b32bf0e 2671 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2672 && c == '?' && (zero_times_ok || many_times_ok))
2673 greedy = 0;
2674 else
2675 {
2676 zero_times_ok |= c != '+';
2677 many_times_ok |= c != '?';
2678 }
25fe55af
RS
2679
2680 if (p == pend)
2681 break;
ed0767d8
SM
2682 else if (*p == '*'
2683 || (!(syntax & RE_BK_PLUS_QM)
2684 && (*p == '+' || *p == '?')))
25fe55af 2685 ;
ed0767d8 2686 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2687 {
ed0767d8
SM
2688 if (p+1 == pend)
2689 FREE_STACK_RETURN (REG_EESCAPE);
2690 if (p[1] == '+' || p[1] == '?')
2691 PATFETCH (c); /* Gobble up the backslash. */
2692 else
2693 break;
25fe55af
RS
2694 }
2695 else
ed0767d8 2696 break;
25fe55af 2697 /* If we get here, we found another repeat character. */
ed0767d8
SM
2698 PATFETCH (c);
2699 }
25fe55af
RS
2700
2701 /* Star, etc. applied to an empty pattern is equivalent
2702 to an empty pattern. */
4e8a9132 2703 if (!laststart || laststart == b)
25fe55af
RS
2704 break;
2705
2706 /* Now we know whether or not zero matches is allowed
2707 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2708 if (greedy)
2709 {
99633e97 2710 if (many_times_ok)
4e8a9132
SM
2711 {
2712 boolean simple = skip_one_char (laststart) == b;
2713 unsigned int startoffset = 0;
f6a3f532 2714 re_opcode_t ofj =
01618498 2715 /* Check if the loop can match the empty string. */
6df42991
SM
2716 (simple || !analyse_first (laststart, b, NULL, 0))
2717 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2718 assert (skip_one_char (laststart) <= b);
177c0ea7 2719
4e8a9132
SM
2720 if (!zero_times_ok && simple)
2721 { /* Since simple * loops can be made faster by using
2722 on_failure_keep_string_jump, we turn simple P+
2723 into PP* if P is simple. */
2724 unsigned char *p1, *p2;
2725 startoffset = b - laststart;
2726 GET_BUFFER_SPACE (startoffset);
2727 p1 = b; p2 = laststart;
2728 while (p2 < p1)
2729 *b++ = *p2++;
2730 zero_times_ok = 1;
99633e97 2731 }
4e8a9132
SM
2732
2733 GET_BUFFER_SPACE (6);
2734 if (!zero_times_ok)
2735 /* A + loop. */
f6a3f532 2736 STORE_JUMP (ofj, b, b + 6);
99633e97 2737 else
4e8a9132
SM
2738 /* Simple * loops can use on_failure_keep_string_jump
2739 depending on what follows. But since we don't know
2740 that yet, we leave the decision up to
2741 on_failure_jump_smart. */
f6a3f532 2742 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2743 laststart + startoffset, b + 6);
99633e97 2744 b += 3;
4e8a9132 2745 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2746 b += 3;
2747 }
2748 else
2749 {
4e8a9132
SM
2750 /* A simple ? pattern. */
2751 assert (zero_times_ok);
2752 GET_BUFFER_SPACE (3);
2753 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2754 b += 3;
2755 }
1c8c6d39
DL
2756 }
2757 else /* not greedy */
2758 { /* I wish the greedy and non-greedy cases could be merged. */
2759
0683b6fa 2760 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2761 if (many_times_ok)
2762 {
f6a3f532
SM
2763 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2764
6df42991
SM
2765 /* The non-greedy multiple match looks like
2766 a repeat..until: we only need a conditional jump
2767 at the end of the loop. */
f6a3f532
SM
2768 if (emptyp) BUF_PUSH (no_op);
2769 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2770 : on_failure_jump, b, laststart);
1c8c6d39
DL
2771 b += 3;
2772 if (zero_times_ok)
2773 {
2774 /* The repeat...until naturally matches one or more.
2775 To also match zero times, we need to first jump to
6df42991 2776 the end of the loop (its conditional jump). */
1c8c6d39
DL
2777 INSERT_JUMP (jump, laststart, b);
2778 b += 3;
2779 }
2780 }
2781 else
2782 {
2783 /* non-greedy a?? */
1c8c6d39
DL
2784 INSERT_JUMP (jump, laststart, b + 3);
2785 b += 3;
2786 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2787 b += 3;
2788 }
2789 }
2790 }
4e8a9132 2791 pending_exact = 0;
fa9a63c5
RM
2792 break;
2793
2794
2795 case '.':
25fe55af
RS
2796 laststart = b;
2797 BUF_PUSH (anychar);
2798 break;
fa9a63c5
RM
2799
2800
25fe55af
RS
2801 case '[':
2802 {
b18215fc 2803 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2804
25fe55af 2805 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2806
25fe55af
RS
2807 /* Ensure that we have enough space to push a charset: the
2808 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2809 GET_BUFFER_SPACE (34);
2810
25fe55af 2811 laststart = b;
e318085a 2812
25fe55af
RS
2813 /* We test `*p == '^' twice, instead of using an if
2814 statement, so we only need one BUF_PUSH. */
2815 BUF_PUSH (*p == '^' ? charset_not : charset);
2816 if (*p == '^')
2817 p++;
e318085a 2818
25fe55af
RS
2819 /* Remember the first position in the bracket expression. */
2820 p1 = p;
e318085a 2821
25fe55af
RS
2822 /* Push the number of bytes in the bitmap. */
2823 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2824
25fe55af
RS
2825 /* Clear the whole map. */
2826 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2827
25fe55af
RS
2828 /* charset_not matches newline according to a syntax bit. */
2829 if ((re_opcode_t) b[-2] == charset_not
2830 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2831 SET_LIST_BIT ('\n');
fa9a63c5 2832
25fe55af
RS
2833 /* Read in characters and ranges, setting map bits. */
2834 for (;;)
2835 {
b18215fc 2836 boolean escaped_char = false;
2d1675e4 2837 const unsigned char *p2 = p;
e318085a 2838
25fe55af 2839 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2840
36595814
SM
2841 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2842 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2843 So the translation is done later in a loop. Example:
2844 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2845 PATFETCH (c);
e318085a 2846
25fe55af
RS
2847 /* \ might escape characters inside [...] and [^...]. */
2848 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2849 {
2850 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2851
2852 PATFETCH (c);
b18215fc 2853 escaped_char = true;
25fe55af 2854 }
b18215fc
RS
2855 else
2856 {
657fcfbd
RS
2857 /* Could be the end of the bracket expression. If it's
2858 not (i.e., when the bracket expression is `[]' so
2859 far), the ']' character bit gets set way below. */
2d1675e4 2860 if (c == ']' && p2 != p1)
657fcfbd 2861 break;
25fe55af 2862 }
b18215fc 2863
b18215fc
RS
2864 /* What should we do for the character which is
2865 greater than 0x7F, but not BASE_LEADING_CODE_P?
2866 XXX */
2867
25fe55af
RS
2868 /* See if we're at the beginning of a possible character
2869 class. */
b18215fc 2870
2d1675e4
SM
2871 if (!escaped_char &&
2872 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2873 {
2874 /* Leave room for the null. */
14473664 2875 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2876 const unsigned char *class_beg;
b18215fc 2877
25fe55af
RS
2878 PATFETCH (c);
2879 c1 = 0;
ed0767d8 2880 class_beg = p;
b18215fc 2881
25fe55af
RS
2882 /* If pattern is `[[:'. */
2883 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2884
25fe55af
RS
2885 for (;;)
2886 {
14473664
SM
2887 PATFETCH (c);
2888 if ((c == ':' && *p == ']') || p == pend)
2889 break;
2890 if (c1 < CHAR_CLASS_MAX_LENGTH)
2891 str[c1++] = c;
2892 else
2893 /* This is in any case an invalid class name. */
2894 str[0] = '\0';
25fe55af
RS
2895 }
2896 str[c1] = '\0';
b18215fc
RS
2897
2898 /* If isn't a word bracketed by `[:' and `:]':
2899 undo the ending character, the letters, and
2900 leave the leading `:' and `[' (but set bits for
2901 them). */
25fe55af
RS
2902 if (c == ':' && *p == ']')
2903 {
36595814 2904 re_wchar_t ch;
14473664
SM
2905 re_wctype_t cc;
2906
2907 cc = re_wctype (str);
2908
2909 if (cc == 0)
fa9a63c5
RM
2910 FREE_STACK_RETURN (REG_ECTYPE);
2911
14473664
SM
2912 /* Throw away the ] at the end of the character
2913 class. */
2914 PATFETCH (c);
fa9a63c5 2915
14473664 2916 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2917
96cc36cc
RS
2918 /* Most character classes in a multibyte match
2919 just set a flag. Exceptions are is_blank,
2920 is_digit, is_cntrl, and is_xdigit, since
2921 they can only match ASCII characters. We
14473664
SM
2922 don't need to handle them for multibyte.
2923 They are distinguished by a negative wctype. */
96cc36cc 2924
2d1675e4 2925 if (multibyte)
14473664
SM
2926 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2927 re_wctype_to_bit (cc));
96cc36cc 2928
14473664 2929 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
25fe55af 2930 {
7ae68633 2931 int translated = TRANSLATE (ch);
14473664 2932 if (re_iswctype (btowc (ch), cc))
96cc36cc 2933 SET_LIST_BIT (translated);
25fe55af 2934 }
b18215fc
RS
2935
2936 /* Repeat the loop. */
2937 continue;
25fe55af
RS
2938 }
2939 else
2940 {
ed0767d8
SM
2941 /* Go back to right after the "[:". */
2942 p = class_beg;
25fe55af 2943 SET_LIST_BIT ('[');
b18215fc
RS
2944
2945 /* Because the `:' may starts the range, we
2946 can't simply set bit and repeat the loop.
25fe55af 2947 Instead, just set it to C and handle below. */
b18215fc 2948 c = ':';
25fe55af
RS
2949 }
2950 }
b18215fc
RS
2951
2952 if (p < pend && p[0] == '-' && p[1] != ']')
2953 {
2954
2955 /* Discard the `-'. */
2956 PATFETCH (c1);
2957
2958 /* Fetch the character which ends the range. */
2959 PATFETCH (c1);
b18215fc 2960
b54f61ed 2961 if (SINGLE_BYTE_CHAR_P (c))
e934739e 2962 {
b54f61ed
KH
2963 if (! SINGLE_BYTE_CHAR_P (c1))
2964 {
3ff2446d
KH
2965 /* Handle a range starting with a
2966 character of less than 256, and ending
2967 with a character of not less than 256.
2968 Split that into two ranges, the low one
2969 ending at 0377, and the high one
2970 starting at the smallest character in
2971 the charset of C1 and ending at C1. */
b54f61ed 2972 int charset = CHAR_CHARSET (c1);
36595814
SM
2973 re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
2974
b54f61ed
KH
2975 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2976 c2, c1);
333526e0 2977 c1 = 0377;
b54f61ed 2978 }
e934739e
RS
2979 }
2980 else if (!SAME_CHARSET_P (c, c1))
b3e4c897 2981 FREE_STACK_RETURN (REG_ERANGEX);
e318085a 2982 }
25fe55af 2983 else
b18215fc
RS
2984 /* Range from C to C. */
2985 c1 = c;
2986
2987 /* Set the range ... */
2988 if (SINGLE_BYTE_CHAR_P (c))
2989 /* ... into bitmap. */
25fe55af 2990 {
01618498 2991 re_wchar_t this_char;
36595814 2992 re_wchar_t range_start = c, range_end = c1;
b18215fc
RS
2993
2994 /* If the start is after the end, the range is empty. */
2995 if (range_start > range_end)
2996 {
2997 if (syntax & RE_NO_EMPTY_RANGES)
2998 FREE_STACK_RETURN (REG_ERANGE);
2999 /* Else, repeat the loop. */
3000 }
3001 else
3002 {
3003 for (this_char = range_start; this_char <= range_end;
3004 this_char++)
3005 SET_LIST_BIT (TRANSLATE (this_char));
e934739e 3006 }
25fe55af 3007 }
e318085a 3008 else
b18215fc
RS
3009 /* ... into range table. */
3010 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
3011 }
3012
25fe55af
RS
3013 /* Discard any (non)matching list bytes that are all 0 at the
3014 end of the map. Decrease the map-length byte too. */
3015 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3016 b[-1]--;
3017 b += b[-1];
fa9a63c5 3018
96cc36cc
RS
3019 /* Build real range table from work area. */
3020 if (RANGE_TABLE_WORK_USED (range_table_work)
3021 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3022 {
3023 int i;
3024 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3025
b18215fc 3026 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3027 bytes for flags, two for COUNT, and three bytes for
3028 each character. */
3029 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3030
b18215fc
RS
3031 /* Indicate the existence of range table. */
3032 laststart[1] |= 0x80;
fa9a63c5 3033
96cc36cc
RS
3034 /* Store the character class flag bits into the range table.
3035 If not in emacs, these flag bits are always 0. */
3036 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3037 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3038
b18215fc
RS
3039 STORE_NUMBER_AND_INCR (b, used / 2);
3040 for (i = 0; i < used; i++)
3041 STORE_CHARACTER_AND_INCR
3042 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3043 }
25fe55af
RS
3044 }
3045 break;
fa9a63c5
RM
3046
3047
b18215fc 3048 case '(':
25fe55af
RS
3049 if (syntax & RE_NO_BK_PARENS)
3050 goto handle_open;
3051 else
3052 goto normal_char;
fa9a63c5
RM
3053
3054
25fe55af
RS
3055 case ')':
3056 if (syntax & RE_NO_BK_PARENS)
3057 goto handle_close;
3058 else
3059 goto normal_char;
e318085a
RS
3060
3061
25fe55af
RS
3062 case '\n':
3063 if (syntax & RE_NEWLINE_ALT)
3064 goto handle_alt;
3065 else
3066 goto normal_char;
e318085a
RS
3067
3068
b18215fc 3069 case '|':
25fe55af
RS
3070 if (syntax & RE_NO_BK_VBAR)
3071 goto handle_alt;
3072 else
3073 goto normal_char;
3074
3075
3076 case '{':
3077 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3078 goto handle_interval;
3079 else
3080 goto normal_char;
3081
3082
3083 case '\\':
3084 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3085
3086 /* Do not translate the character after the \, so that we can
3087 distinguish, e.g., \B from \b, even if we normally would
3088 translate, e.g., B to b. */
36595814 3089 PATFETCH (c);
25fe55af
RS
3090
3091 switch (c)
3092 {
3093 case '(':
3094 if (syntax & RE_NO_BK_PARENS)
3095 goto normal_backslash;
3096
3097 handle_open:
505bde11
SM
3098 {
3099 int shy = 0;
3100 if (p+1 < pend)
3101 {
3102 /* Look for a special (?...) construct */
ed0767d8 3103 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3104 {
ed0767d8 3105 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
3106 PATFETCH (c);
3107 switch (c)
3108 {
3109 case ':': shy = 1; break;
3110 default:
3111 /* Only (?:...) is supported right now. */
3112 FREE_STACK_RETURN (REG_BADPAT);
3113 }
3114 }
505bde11
SM
3115 }
3116
3117 if (!shy)
3118 {
3119 bufp->re_nsub++;
3120 regnum++;
3121 }
25fe55af 3122
99633e97
SM
3123 if (COMPILE_STACK_FULL)
3124 {
3125 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3126 compile_stack_elt_t);
3127 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3128
99633e97
SM
3129 compile_stack.size <<= 1;
3130 }
25fe55af 3131
99633e97
SM
3132 /* These are the values to restore when we hit end of this
3133 group. They are all relative offsets, so that if the
3134 whole pattern moves because of realloc, they will still
3135 be valid. */
3136 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3137 COMPILE_STACK_TOP.fixup_alt_jump
3138 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3139 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3140 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
3141
3142 /* Do not push a
3143 start_memory for groups beyond the last one we can
3144 represent in the compiled pattern. */
3145 if (regnum <= MAX_REGNUM && !shy)
3146 BUF_PUSH_2 (start_memory, regnum);
3147
3148 compile_stack.avail++;
3149
3150 fixup_alt_jump = 0;
3151 laststart = 0;
3152 begalt = b;
3153 /* If we've reached MAX_REGNUM groups, then this open
3154 won't actually generate any code, so we'll have to
3155 clear pending_exact explicitly. */
3156 pending_exact = 0;
3157 break;
505bde11 3158 }
25fe55af
RS
3159
3160 case ')':
3161 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3162
3163 if (COMPILE_STACK_EMPTY)
505bde11
SM
3164 {
3165 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3166 goto normal_backslash;
3167 else
3168 FREE_STACK_RETURN (REG_ERPAREN);
3169 }
25fe55af
RS
3170
3171 handle_close:
505bde11 3172 FIXUP_ALT_JUMP ();
25fe55af
RS
3173
3174 /* See similar code for backslashed left paren above. */
3175 if (COMPILE_STACK_EMPTY)
505bde11
SM
3176 {
3177 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3178 goto normal_char;
3179 else
3180 FREE_STACK_RETURN (REG_ERPAREN);
3181 }
25fe55af
RS
3182
3183 /* Since we just checked for an empty stack above, this
3184 ``can't happen''. */
3185 assert (compile_stack.avail != 0);
3186 {
3187 /* We don't just want to restore into `regnum', because
3188 later groups should continue to be numbered higher,
3189 as in `(ab)c(de)' -- the second group is #2. */
3190 regnum_t this_group_regnum;
3191
3192 compile_stack.avail--;
3193 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3194 fixup_alt_jump
3195 = COMPILE_STACK_TOP.fixup_alt_jump
3196 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3197 : 0;
3198 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3199 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3200 /* If we've reached MAX_REGNUM groups, then this open
3201 won't actually generate any code, so we'll have to
3202 clear pending_exact explicitly. */
3203 pending_exact = 0;
e318085a 3204
25fe55af
RS
3205 /* We're at the end of the group, so now we know how many
3206 groups were inside this one. */
505bde11
SM
3207 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
3208 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
3209 }
3210 break;
3211
3212
3213 case '|': /* `\|'. */
3214 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3215 goto normal_backslash;
3216 handle_alt:
3217 if (syntax & RE_LIMITED_OPS)
3218 goto normal_char;
3219
3220 /* Insert before the previous alternative a jump which
3221 jumps to this alternative if the former fails. */
3222 GET_BUFFER_SPACE (3);
3223 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3224 pending_exact = 0;
3225 b += 3;
3226
3227 /* The alternative before this one has a jump after it
3228 which gets executed if it gets matched. Adjust that
3229 jump so it will jump to this alternative's analogous
3230 jump (put in below, which in turn will jump to the next
3231 (if any) alternative's such jump, etc.). The last such
3232 jump jumps to the correct final destination. A picture:
3233 _____ _____
3234 | | | |
3235 | v | v
3236 a | b | c
3237
3238 If we are at `b', then fixup_alt_jump right now points to a
3239 three-byte space after `a'. We'll put in the jump, set
3240 fixup_alt_jump to right after `b', and leave behind three
3241 bytes which we'll fill in when we get to after `c'. */
3242
505bde11 3243 FIXUP_ALT_JUMP ();
25fe55af
RS
3244
3245 /* Mark and leave space for a jump after this alternative,
3246 to be filled in later either by next alternative or
3247 when know we're at the end of a series of alternatives. */
3248 fixup_alt_jump = b;
3249 GET_BUFFER_SPACE (3);
3250 b += 3;
3251
3252 laststart = 0;
3253 begalt = b;
3254 break;
3255
3256
3257 case '{':
3258 /* If \{ is a literal. */
3259 if (!(syntax & RE_INTERVALS)
3260 /* If we're at `\{' and it's not the open-interval
3261 operator. */
4bb91c68 3262 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3263 goto normal_backslash;
3264
3265 handle_interval:
3266 {
3267 /* If got here, then the syntax allows intervals. */
3268
3269 /* At least (most) this many matches must be made. */
99633e97 3270 int lower_bound = 0, upper_bound = -1;
25fe55af 3271
ed0767d8 3272 beg_interval = p;
25fe55af 3273
25fe55af
RS
3274 GET_UNSIGNED_NUMBER (lower_bound);
3275
3276 if (c == ',')
ed0767d8 3277 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3278 else
3279 /* Interval such as `{1}' => match exactly once. */
3280 upper_bound = lower_bound;
3281
3282 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3283 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3284 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3285
3286 if (!(syntax & RE_NO_BK_BRACES))
3287 {
4bb91c68
SM
3288 if (c != '\\')
3289 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3290 if (p == pend)
3291 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3292 PATFETCH (c);
3293 }
3294
3295 if (c != '}')
4bb91c68 3296 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3297
3298 /* We just parsed a valid interval. */
3299
3300 /* If it's invalid to have no preceding re. */
3301 if (!laststart)
3302 {
3303 if (syntax & RE_CONTEXT_INVALID_OPS)
3304 FREE_STACK_RETURN (REG_BADRPT);
3305 else if (syntax & RE_CONTEXT_INDEP_OPS)
3306 laststart = b;
3307 else
3308 goto unfetch_interval;
3309 }
3310
6df42991
SM
3311 if (upper_bound == 0)
3312 /* If the upper bound is zero, just drop the sub pattern
3313 altogether. */
3314 b = laststart;
3315 else if (lower_bound == 1 && upper_bound == 1)
3316 /* Just match it once: nothing to do here. */
3317 ;
3318
3319 /* Otherwise, we have a nontrivial interval. When
3320 we're all done, the pattern will look like:
3321 set_number_at <jump count> <upper bound>
3322 set_number_at <succeed_n count> <lower bound>
3323 succeed_n <after jump addr> <succeed_n count>
3324 <body of loop>
3325 jump_n <succeed_n addr> <jump count>
3326 (The upper bound and `jump_n' are omitted if
3327 `upper_bound' is 1, though.) */
3328 else
3329 { /* If the upper bound is > 1, we need to insert
3330 more at the end of the loop. */
3331 unsigned int nbytes = (upper_bound < 0 ? 3
3332 : upper_bound > 1 ? 5 : 0);
3333 unsigned int startoffset = 0;
3334
3335 GET_BUFFER_SPACE (20); /* We might use less. */
3336
3337 if (lower_bound == 0)
3338 {
3339 /* A succeed_n that starts with 0 is really a
3340 a simple on_failure_jump_loop. */
3341 INSERT_JUMP (on_failure_jump_loop, laststart,
3342 b + 3 + nbytes);
3343 b += 3;
3344 }
3345 else
3346 {
3347 /* Initialize lower bound of the `succeed_n', even
3348 though it will be set during matching by its
3349 attendant `set_number_at' (inserted next),
3350 because `re_compile_fastmap' needs to know.
3351 Jump to the `jump_n' we might insert below. */
3352 INSERT_JUMP2 (succeed_n, laststart,
3353 b + 5 + nbytes,
3354 lower_bound);
3355 b += 5;
3356
3357 /* Code to initialize the lower bound. Insert
3358 before the `succeed_n'. The `5' is the last two
3359 bytes of this `set_number_at', plus 3 bytes of
3360 the following `succeed_n'. */
3361 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3362 b += 5;
3363 startoffset += 5;
3364 }
3365
3366 if (upper_bound < 0)
3367 {
3368 /* A negative upper bound stands for infinity,
3369 in which case it degenerates to a plain jump. */
3370 STORE_JUMP (jump, b, laststart + startoffset);
3371 b += 3;
3372 }
3373 else if (upper_bound > 1)
3374 { /* More than one repetition is allowed, so
3375 append a backward jump to the `succeed_n'
3376 that starts this interval.
3377
3378 When we've reached this during matching,
3379 we'll have matched the interval once, so
3380 jump back only `upper_bound - 1' times. */
3381 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3382 upper_bound - 1);
3383 b += 5;
3384
3385 /* The location we want to set is the second
3386 parameter of the `jump_n'; that is `b-2' as
3387 an absolute address. `laststart' will be
3388 the `set_number_at' we're about to insert;
3389 `laststart+3' the number to set, the source
3390 for the relative address. But we are
3391 inserting into the middle of the pattern --
3392 so everything is getting moved up by 5.
3393 Conclusion: (b - 2) - (laststart + 3) + 5,
3394 i.e., b - laststart.
3395
3396 We insert this at the beginning of the loop
3397 so that if we fail during matching, we'll
3398 reinitialize the bounds. */
3399 insert_op2 (set_number_at, laststart, b - laststart,
3400 upper_bound - 1, b);
3401 b += 5;
3402 }
3403 }
25fe55af
RS
3404 pending_exact = 0;
3405 beg_interval = NULL;
3406 }
3407 break;
3408
3409 unfetch_interval:
3410 /* If an invalid interval, match the characters as literals. */
3411 assert (beg_interval);
3412 p = beg_interval;
3413 beg_interval = NULL;
3414
3415 /* normal_char and normal_backslash need `c'. */
ed0767d8 3416 c = '{';
25fe55af
RS
3417
3418 if (!(syntax & RE_NO_BK_BRACES))
3419 {
ed0767d8
SM
3420 assert (p > pattern && p[-1] == '\\');
3421 goto normal_backslash;
25fe55af 3422 }
ed0767d8
SM
3423 else
3424 goto normal_char;
e318085a 3425
b18215fc 3426#ifdef emacs
25fe55af
RS
3427 /* There is no way to specify the before_dot and after_dot
3428 operators. rms says this is ok. --karl */
3429 case '=':
3430 BUF_PUSH (at_dot);
3431 break;
3432
3433 case 's':
3434 laststart = b;
3435 PATFETCH (c);
3436 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3437 break;
3438
3439 case 'S':
3440 laststart = b;
3441 PATFETCH (c);
3442 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3443 break;
b18215fc
RS
3444
3445 case 'c':
3446 laststart = b;
36595814 3447 PATFETCH (c);
b18215fc
RS
3448 BUF_PUSH_2 (categoryspec, c);
3449 break;
e318085a 3450
b18215fc
RS
3451 case 'C':
3452 laststart = b;
36595814 3453 PATFETCH (c);
b18215fc
RS
3454 BUF_PUSH_2 (notcategoryspec, c);
3455 break;
3456#endif /* emacs */
e318085a 3457
e318085a 3458
25fe55af 3459 case 'w':
4bb91c68
SM
3460 if (syntax & RE_NO_GNU_OPS)
3461 goto normal_char;
25fe55af 3462 laststart = b;
1fb352e0 3463 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3464 break;
e318085a 3465
e318085a 3466
25fe55af 3467 case 'W':
4bb91c68
SM
3468 if (syntax & RE_NO_GNU_OPS)
3469 goto normal_char;
25fe55af 3470 laststart = b;
1fb352e0 3471 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3472 break;
e318085a
RS
3473
3474
25fe55af 3475 case '<':
4bb91c68
SM
3476 if (syntax & RE_NO_GNU_OPS)
3477 goto normal_char;
25fe55af
RS
3478 BUF_PUSH (wordbeg);
3479 break;
e318085a 3480
25fe55af 3481 case '>':
4bb91c68
SM
3482 if (syntax & RE_NO_GNU_OPS)
3483 goto normal_char;
25fe55af
RS
3484 BUF_PUSH (wordend);
3485 break;
e318085a 3486
669fa600
SM
3487 case '_':
3488 if (syntax & RE_NO_GNU_OPS)
3489 goto normal_char;
3490 laststart = b;
3491 PATFETCH (c);
3492 if (c == '<')
3493 BUF_PUSH (symbeg);
3494 else if (c == '>')
3495 BUF_PUSH (symend);
3496 else
3497 FREE_STACK_RETURN (REG_BADPAT);
3498 break;
3499
25fe55af 3500 case 'b':
4bb91c68
SM
3501 if (syntax & RE_NO_GNU_OPS)
3502 goto normal_char;
25fe55af
RS
3503 BUF_PUSH (wordbound);
3504 break;
e318085a 3505
25fe55af 3506 case 'B':
4bb91c68
SM
3507 if (syntax & RE_NO_GNU_OPS)
3508 goto normal_char;
25fe55af
RS
3509 BUF_PUSH (notwordbound);
3510 break;
fa9a63c5 3511
25fe55af 3512 case '`':
4bb91c68
SM
3513 if (syntax & RE_NO_GNU_OPS)
3514 goto normal_char;
25fe55af
RS
3515 BUF_PUSH (begbuf);
3516 break;
e318085a 3517
25fe55af 3518 case '\'':
4bb91c68
SM
3519 if (syntax & RE_NO_GNU_OPS)
3520 goto normal_char;
25fe55af
RS
3521 BUF_PUSH (endbuf);
3522 break;
e318085a 3523
25fe55af
RS
3524 case '1': case '2': case '3': case '4': case '5':
3525 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3526 {
3527 regnum_t reg;
e318085a 3528
0cdd06f8
SM
3529 if (syntax & RE_NO_BK_REFS)
3530 goto normal_backslash;
e318085a 3531
0cdd06f8 3532 reg = c - '0';
e318085a 3533
0cdd06f8
SM
3534 /* Can't back reference to a subexpression before its end. */
3535 if (reg > regnum || group_in_compile_stack (compile_stack, reg))
3536 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3537
0cdd06f8
SM
3538 laststart = b;
3539 BUF_PUSH_2 (duplicate, reg);
3540 }
25fe55af 3541 break;
e318085a 3542
e318085a 3543
25fe55af
RS
3544 case '+':
3545 case '?':
3546 if (syntax & RE_BK_PLUS_QM)
3547 goto handle_plus;
3548 else
3549 goto normal_backslash;
3550
3551 default:
3552 normal_backslash:
3553 /* You might think it would be useful for \ to mean
3554 not to translate; but if we don't translate it
4bb91c68 3555 it will never match anything. */
25fe55af
RS
3556 goto normal_char;
3557 }
3558 break;
fa9a63c5
RM
3559
3560
3561 default:
25fe55af 3562 /* Expects the character in `c'. */
fa9a63c5 3563 normal_char:
36595814 3564 /* If no exactn currently being built. */
25fe55af 3565 if (!pending_exact
fa9a63c5 3566
25fe55af
RS
3567 /* If last exactn not at current position. */
3568 || pending_exact + *pending_exact + 1 != b
5e69f11e 3569
25fe55af 3570 /* We have only one byte following the exactn for the count. */
2d1675e4 3571 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3572
25fe55af 3573 /* If followed by a repetition operator. */
9d99031f 3574 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3575 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3576 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3577 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3578 || ((syntax & RE_INTERVALS)
25fe55af 3579 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3580 ? p != pend && *p == '{'
3581 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3582 {
3583 /* Start building a new exactn. */
5e69f11e 3584
25fe55af 3585 laststart = b;
fa9a63c5
RM
3586
3587 BUF_PUSH_2 (exactn, 0);
3588 pending_exact = b - 1;
25fe55af 3589 }
5e69f11e 3590
2d1675e4
SM
3591 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3592 {
e0277a47
KH
3593 int len;
3594
36595814 3595 c = TRANSLATE (c);
e0277a47
KH
3596 if (multibyte)
3597 len = CHAR_STRING (c, b);
3598 else
3599 *b = c, len = 1;
2d1675e4
SM
3600 b += len;
3601 (*pending_exact) += len;
3602 }
3603
fa9a63c5 3604 break;
25fe55af 3605 } /* switch (c) */
fa9a63c5
RM
3606 } /* while p != pend */
3607
5e69f11e 3608
fa9a63c5 3609 /* Through the pattern now. */
5e69f11e 3610
505bde11 3611 FIXUP_ALT_JUMP ();
fa9a63c5 3612
5e69f11e 3613 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3614 FREE_STACK_RETURN (REG_EPAREN);
3615
3616 /* If we don't want backtracking, force success
3617 the first time we reach the end of the compiled pattern. */
3618 if (syntax & RE_NO_POSIX_BACKTRACKING)
3619 BUF_PUSH (succeed);
3620
fa9a63c5
RM
3621 /* We have succeeded; set the length of the buffer. */
3622 bufp->used = b - bufp->buffer;
3623
3624#ifdef DEBUG
99633e97 3625 if (debug > 0)
fa9a63c5 3626 {
505bde11 3627 re_compile_fastmap (bufp);
fa9a63c5
RM
3628 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3629 print_compiled_pattern (bufp);
3630 }
99633e97 3631 debug--;
fa9a63c5
RM
3632#endif /* DEBUG */
3633
3634#ifndef MATCH_MAY_ALLOCATE
3635 /* Initialize the failure stack to the largest possible stack. This
3636 isn't necessary unless we're trying to avoid calling alloca in
3637 the search and match routines. */
3638 {
3639 int num_regs = bufp->re_nsub + 1;
3640
320a2a73 3641 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3642 {
a26f4ccd 3643 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3644
fa9a63c5
RM
3645 if (! fail_stack.stack)
3646 fail_stack.stack
7b1d7d66
RS
3647 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3648 * sizeof (fail_stack_elt_t));
fa9a63c5
RM
3649 else
3650 fail_stack.stack
7b1d7d66
RS
3651 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3652 (fail_stack.size
3653 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3654 }
3655
3656 regex_grow_registers (num_regs);
3657 }
3658#endif /* not MATCH_MAY_ALLOCATE */
3659
7c67c9f6 3660 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3661} /* regex_compile */
3662\f
3663/* Subroutines for `regex_compile'. */
3664
25fe55af 3665/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3666
3667static void
3668store_op1 (op, loc, arg)
3669 re_opcode_t op;
3670 unsigned char *loc;
3671 int arg;
3672{
3673 *loc = (unsigned char) op;
3674 STORE_NUMBER (loc + 1, arg);
3675}
3676
3677
3678/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3679
3680static void
3681store_op2 (op, loc, arg1, arg2)
3682 re_opcode_t op;
3683 unsigned char *loc;
3684 int arg1, arg2;
3685{
3686 *loc = (unsigned char) op;
3687 STORE_NUMBER (loc + 1, arg1);
3688 STORE_NUMBER (loc + 3, arg2);
3689}
3690
3691
3692/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3693 for OP followed by two-byte integer parameter ARG. */
3694
3695static void
3696insert_op1 (op, loc, arg, end)
3697 re_opcode_t op;
3698 unsigned char *loc;
3699 int arg;
5e69f11e 3700 unsigned char *end;
fa9a63c5
RM
3701{
3702 register unsigned char *pfrom = end;
3703 register unsigned char *pto = end + 3;
3704
3705 while (pfrom != loc)
3706 *--pto = *--pfrom;
5e69f11e 3707
fa9a63c5
RM
3708 store_op1 (op, loc, arg);
3709}
3710
3711
3712/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3713
3714static void
3715insert_op2 (op, loc, arg1, arg2, end)
3716 re_opcode_t op;
3717 unsigned char *loc;
3718 int arg1, arg2;
5e69f11e 3719 unsigned char *end;
fa9a63c5
RM
3720{
3721 register unsigned char *pfrom = end;
3722 register unsigned char *pto = end + 5;
3723
3724 while (pfrom != loc)
3725 *--pto = *--pfrom;
5e69f11e 3726
fa9a63c5
RM
3727 store_op2 (op, loc, arg1, arg2);
3728}
3729
3730
3731/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3732 after an alternative or a begin-subexpression. We assume there is at
3733 least one character before the ^. */
3734
3735static boolean
3736at_begline_loc_p (pattern, p, syntax)
01618498 3737 re_char *pattern, *p;
fa9a63c5
RM
3738 reg_syntax_t syntax;
3739{
01618498 3740 re_char *prev = p - 2;
fa9a63c5 3741 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3742
fa9a63c5
RM
3743 return
3744 /* After a subexpression? */
3745 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3746 /* After an alternative? */
d2af47df
SM
3747 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3748 /* After a shy subexpression? */
3749 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3750 && prev[-1] == '?' && prev[-2] == '('
3751 && (syntax & RE_NO_BK_PARENS
3752 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3753}
3754
3755
3756/* The dual of at_begline_loc_p. This one is for $. We assume there is
3757 at least one character after the $, i.e., `P < PEND'. */
3758
3759static boolean
3760at_endline_loc_p (p, pend, syntax)
01618498 3761 re_char *p, *pend;
99633e97 3762 reg_syntax_t syntax;
fa9a63c5 3763{
01618498 3764 re_char *next = p;
fa9a63c5 3765 boolean next_backslash = *next == '\\';
01618498 3766 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3767
fa9a63c5
RM
3768 return
3769 /* Before a subexpression? */
3770 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3771 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3772 /* Before an alternative? */
3773 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3774 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3775}
3776
3777
5e69f11e 3778/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3779 false if it's not. */
3780
3781static boolean
3782group_in_compile_stack (compile_stack, regnum)
3783 compile_stack_type compile_stack;
3784 regnum_t regnum;
3785{
3786 int this_element;
3787
5e69f11e
RM
3788 for (this_element = compile_stack.avail - 1;
3789 this_element >= 0;
fa9a63c5
RM
3790 this_element--)
3791 if (compile_stack.stack[this_element].regnum == regnum)
3792 return true;
3793
3794 return false;
3795}
fa9a63c5 3796\f
f6a3f532
SM
3797/* analyse_first.
3798 If fastmap is non-NULL, go through the pattern and fill fastmap
3799 with all the possible leading chars. If fastmap is NULL, don't
3800 bother filling it up (obviously) and only return whether the
3801 pattern could potentially match the empty string.
3802
3803 Return 1 if p..pend might match the empty string.
3804 Return 0 if p..pend matches at least one char.
01618498 3805 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3806
3807static int
3808analyse_first (p, pend, fastmap, multibyte)
01618498 3809 re_char *p, *pend;
f6a3f532
SM
3810 char *fastmap;
3811 const int multibyte;
fa9a63c5 3812{
505bde11 3813 int j, k;
1fb352e0 3814 boolean not;
fa9a63c5 3815
b18215fc 3816 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3817 flag is set true. */
b18215fc
RS
3818 boolean match_any_multibyte_characters = false;
3819
f6a3f532 3820 assert (p);
5e69f11e 3821
505bde11
SM
3822 /* The loop below works as follows:
3823 - It has a working-list kept in the PATTERN_STACK and which basically
3824 starts by only containing a pointer to the first operation.
3825 - If the opcode we're looking at is a match against some set of
3826 chars, then we add those chars to the fastmap and go on to the
3827 next work element from the worklist (done via `break').
3828 - If the opcode is a control operator on the other hand, we either
3829 ignore it (if it's meaningless at this point, such as `start_memory')
3830 or execute it (if it's a jump). If the jump has several destinations
3831 (i.e. `on_failure_jump'), then we push the other destination onto the
3832 worklist.
3833 We guarantee termination by ignoring backward jumps (more or less),
3834 so that `p' is monotonically increasing. More to the point, we
3835 never set `p' (or push) anything `<= p1'. */
3836
01618498 3837 while (p < pend)
fa9a63c5 3838 {
505bde11
SM
3839 /* `p1' is used as a marker of how far back a `on_failure_jump'
3840 can go without being ignored. It is normally equal to `p'
3841 (which prevents any backward `on_failure_jump') except right
3842 after a plain `jump', to allow patterns such as:
3843 0: jump 10
3844 3..9: <body>
3845 10: on_failure_jump 3
3846 as used for the *? operator. */
01618498 3847 re_char *p1 = p;
5e69f11e 3848
fa9a63c5
RM
3849 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3850 {
f6a3f532 3851 case succeed:
01618498 3852 return 1;
f6a3f532 3853 continue;
fa9a63c5 3854
fa9a63c5 3855 case duplicate:
505bde11
SM
3856 /* If the first character has to match a backreference, that means
3857 that the group was empty (since it already matched). Since this
3858 is the only case that interests us here, we can assume that the
3859 backreference must match the empty string. */
3860 p++;
3861 continue;
fa9a63c5
RM
3862
3863
3864 /* Following are the cases which match a character. These end
25fe55af 3865 with `break'. */
fa9a63c5
RM
3866
3867 case exactn:
e0277a47
KH
3868 if (fastmap)
3869 {
3870 int c = RE_STRING_CHAR (p + 1, pend - p);
3871
3872 if (SINGLE_BYTE_CHAR_P (c))
3873 fastmap[c] = 1;
3874 else
3875 fastmap[p[1]] = 1;
3876 }
fa9a63c5
RM
3877 break;
3878
3879
1fb352e0
SM
3880 case anychar:
3881 /* We could put all the chars except for \n (and maybe \0)
3882 but we don't bother since it is generally not worth it. */
f6a3f532 3883 if (!fastmap) break;
01618498 3884 return -1;
fa9a63c5
RM
3885
3886
b18215fc 3887 case charset_not:
ba5c004d
RS
3888 /* Chars beyond end of bitmap are possible matches.
3889 All the single-byte codes can occur in multibyte buffers.
3890 So any that are not listed in the charset
3891 are possible matches, even in multibyte buffers. */
1fb352e0 3892 if (!fastmap) break;
b18215fc 3893 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3894 j < (1 << BYTEWIDTH); j++)
b18215fc 3895 fastmap[j] = 1;
1fb352e0
SM
3896 /* Fallthrough */
3897 case charset:
3898 if (!fastmap) break;
3899 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3900 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3901 j >= 0; j--)
1fb352e0 3902 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
b18215fc
RS
3903 fastmap[j] = 1;
3904
1fb352e0
SM
3905 if ((not && multibyte)
3906 /* Any character set can possibly contain a character
3907 which doesn't match the specified set of characters. */
3908 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3909 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3910 /* If we can match a character class, we can match
3911 any character set. */
b18215fc
RS
3912 {
3913 set_fastmap_for_multibyte_characters:
3914 if (match_any_multibyte_characters == false)
3915 {
3916 for (j = 0x80; j < 0xA0; j++) /* XXX */
3917 if (BASE_LEADING_CODE_P (j))
3918 fastmap[j] = 1;
3919 match_any_multibyte_characters = true;
3920 }
3921 }
b18215fc 3922
1fb352e0
SM
3923 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3924 && match_any_multibyte_characters == false)
3925 {
3926 /* Set fastmap[I] 1 where I is a base leading code of each
3927 multibyte character in the range table. */
3928 int c, count;
b18215fc 3929
1fb352e0 3930 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3931 bits for a character class. */
1fb352e0 3932 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3933
1fb352e0
SM
3934 /* Extract the number of ranges in range table into COUNT. */
3935 EXTRACT_NUMBER_AND_INCR (count, p);
3936 for (; count > 0; count--, p += 2 * 3) /* XXX */
3937 {
3938 /* Extract the start of each range. */
3939 EXTRACT_CHARACTER (c, p);
3940 j = CHAR_CHARSET (c);
3941 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3942 }
3943 }
b18215fc
RS
3944 break;
3945
1fb352e0
SM
3946 case syntaxspec:
3947 case notsyntaxspec:
3948 if (!fastmap) break;
3949#ifndef emacs
3950 not = (re_opcode_t)p[-1] == notsyntaxspec;
3951 k = *p++;
3952 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3953 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3954 fastmap[j] = 1;
b18215fc 3955 break;
1fb352e0 3956#else /* emacs */
b18215fc
RS
3957 /* This match depends on text properties. These end with
3958 aborting optimizations. */
01618498 3959 return -1;
b18215fc
RS
3960
3961 case categoryspec:
b18215fc 3962 case notcategoryspec:
1fb352e0
SM
3963 if (!fastmap) break;
3964 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 3965 k = *p++;
1fb352e0
SM
3966 for (j = 0; j < (1 << BYTEWIDTH); j++)
3967 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
3968 fastmap[j] = 1;
3969
1fb352e0 3970 if (multibyte)
b18215fc 3971 /* Any character set can possibly contain a character
1fb352e0 3972 whose category is K (or not). */
b18215fc
RS
3973 goto set_fastmap_for_multibyte_characters;
3974 break;
3975
fa9a63c5 3976 /* All cases after this match the empty string. These end with
25fe55af 3977 `continue'. */
fa9a63c5 3978
fa9a63c5
RM
3979 case before_dot:
3980 case at_dot:
3981 case after_dot:
1fb352e0 3982#endif /* !emacs */
25fe55af
RS
3983 case no_op:
3984 case begline:
3985 case endline:
fa9a63c5
RM
3986 case begbuf:
3987 case endbuf:
3988 case wordbound:
3989 case notwordbound:
3990 case wordbeg:
3991 case wordend:
669fa600
SM
3992 case symbeg:
3993 case symend:
25fe55af 3994 continue;
fa9a63c5
RM
3995
3996
fa9a63c5 3997 case jump:
25fe55af 3998 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
3999 if (j < 0)
4000 /* Backward jumps can only go back to code that we've already
4001 visited. `re_compile' should make sure this is true. */
4002 break;
25fe55af 4003 p += j;
505bde11
SM
4004 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4005 {
4006 case on_failure_jump:
4007 case on_failure_keep_string_jump:
505bde11 4008 case on_failure_jump_loop:
0683b6fa 4009 case on_failure_jump_nastyloop:
505bde11
SM
4010 case on_failure_jump_smart:
4011 p++;
4012 break;
4013 default:
4014 continue;
4015 };
4016 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4017 to jump back to "just after here". */
4018 /* Fallthrough */
fa9a63c5 4019
25fe55af
RS
4020 case on_failure_jump:
4021 case on_failure_keep_string_jump:
0683b6fa 4022 case on_failure_jump_nastyloop:
505bde11
SM
4023 case on_failure_jump_loop:
4024 case on_failure_jump_smart:
25fe55af 4025 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4026 if (p + j <= p1)
ed0767d8 4027 ; /* Backward jump to be ignored. */
01618498
SM
4028 else
4029 { /* We have to look down both arms.
4030 We first go down the "straight" path so as to minimize
4031 stack usage when going through alternatives. */
4032 int r = analyse_first (p, pend, fastmap, multibyte);
4033 if (r) return r;
4034 p += j;
4035 }
25fe55af 4036 continue;
fa9a63c5
RM
4037
4038
ed0767d8
SM
4039 case jump_n:
4040 /* This code simply does not properly handle forward jump_n. */
4041 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4042 p += 4;
4043 /* jump_n can either jump or fall through. The (backward) jump
4044 case has already been handled, so we only need to look at the
4045 fallthrough case. */
4046 continue;
177c0ea7 4047
fa9a63c5 4048 case succeed_n:
ed0767d8
SM
4049 /* If N == 0, it should be an on_failure_jump_loop instead. */
4050 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4051 p += 4;
4052 /* We only care about one iteration of the loop, so we don't
4053 need to consider the case where this behaves like an
4054 on_failure_jump. */
25fe55af 4055 continue;
fa9a63c5
RM
4056
4057
4058 case set_number_at:
25fe55af
RS
4059 p += 4;
4060 continue;
fa9a63c5
RM
4061
4062
4063 case start_memory:
25fe55af 4064 case stop_memory:
505bde11 4065 p += 1;
fa9a63c5
RM
4066 continue;
4067
4068
4069 default:
25fe55af
RS
4070 abort (); /* We have listed all the cases. */
4071 } /* switch *p++ */
fa9a63c5
RM
4072
4073 /* Getting here means we have found the possible starting
25fe55af 4074 characters for one path of the pattern -- and that the empty
01618498
SM
4075 string does not match. We need not follow this path further. */
4076 return 0;
fa9a63c5
RM
4077 } /* while p */
4078
01618498
SM
4079 /* We reached the end without matching anything. */
4080 return 1;
4081
f6a3f532
SM
4082} /* analyse_first */
4083\f
4084/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4085 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4086 characters can start a string that matches the pattern. This fastmap
4087 is used by re_search to skip quickly over impossible starting points.
4088
4089 Character codes above (1 << BYTEWIDTH) are not represented in the
4090 fastmap, but the leading codes are represented. Thus, the fastmap
4091 indicates which character sets could start a match.
4092
4093 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4094 area as BUFP->fastmap.
4095
4096 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4097 the pattern buffer.
4098
4099 Returns 0 if we succeed, -2 if an internal error. */
4100
4101int
4102re_compile_fastmap (bufp)
4103 struct re_pattern_buffer *bufp;
4104{
4105 char *fastmap = bufp->fastmap;
4106 int analysis;
4107
4108 assert (fastmap && bufp->buffer);
4109
4110 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4111 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4112
4113 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4114 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4115 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4116 return 0;
4117} /* re_compile_fastmap */
4118\f
4119/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4120 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4121 this memory for recording register information. STARTS and ENDS
4122 must be allocated using the malloc library routine, and must each
4123 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4124
4125 If NUM_REGS == 0, then subsequent matches should allocate their own
4126 register data.
4127
4128 Unless this function is called, the first search or match using
4129 PATTERN_BUFFER will allocate its own register data, without
4130 freeing the old data. */
4131
4132void
4133re_set_registers (bufp, regs, num_regs, starts, ends)
4134 struct re_pattern_buffer *bufp;
4135 struct re_registers *regs;
4136 unsigned num_regs;
4137 regoff_t *starts, *ends;
4138{
4139 if (num_regs)
4140 {
4141 bufp->regs_allocated = REGS_REALLOCATE;
4142 regs->num_regs = num_regs;
4143 regs->start = starts;
4144 regs->end = ends;
4145 }
4146 else
4147 {
4148 bufp->regs_allocated = REGS_UNALLOCATED;
4149 regs->num_regs = 0;
4150 regs->start = regs->end = (regoff_t *) 0;
4151 }
4152}
c0f9ea08 4153WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4154\f
25fe55af 4155/* Searching routines. */
fa9a63c5
RM
4156
4157/* Like re_search_2, below, but only one string is specified, and
4158 doesn't let you say where to stop matching. */
4159
4160int
4161re_search (bufp, string, size, startpos, range, regs)
4162 struct re_pattern_buffer *bufp;
4163 const char *string;
4164 int size, startpos, range;
4165 struct re_registers *regs;
4166{
5e69f11e 4167 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4168 regs, size);
4169}
c0f9ea08 4170WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4171
70806df6
KH
4172/* Head address of virtual concatenation of string. */
4173#define HEAD_ADDR_VSTRING(P) \
4174 (((P) >= size1 ? string2 : string1))
4175
b18215fc
RS
4176/* End address of virtual concatenation of string. */
4177#define STOP_ADDR_VSTRING(P) \
4178 (((P) >= size1 ? string2 + size2 : string1 + size1))
4179
4180/* Address of POS in the concatenation of virtual string. */
4181#define POS_ADDR_VSTRING(POS) \
4182 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4183
4184/* Using the compiled pattern in BUFP->buffer, first tries to match the
4185 virtual concatenation of STRING1 and STRING2, starting first at index
4186 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4187
fa9a63c5 4188 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4189
fa9a63c5
RM
4190 RANGE is how far to scan while trying to match. RANGE = 0 means try
4191 only at STARTPOS; in general, the last start tried is STARTPOS +
4192 RANGE.
5e69f11e 4193
fa9a63c5
RM
4194 In REGS, return the indices of the virtual concatenation of STRING1
4195 and STRING2 that matched the entire BUFP->buffer and its contained
4196 subexpressions.
5e69f11e 4197
fa9a63c5
RM
4198 Do not consider matching one past the index STOP in the virtual
4199 concatenation of STRING1 and STRING2.
4200
4201 We return either the position in the strings at which the match was
4202 found, -1 if no match, or -2 if error (such as failure
4203 stack overflow). */
4204
4205int
66f0296e 4206re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4207 struct re_pattern_buffer *bufp;
66f0296e 4208 const char *str1, *str2;
fa9a63c5
RM
4209 int size1, size2;
4210 int startpos;
4211 int range;
4212 struct re_registers *regs;
4213 int stop;
4214{
4215 int val;
66f0296e
SM
4216 re_char *string1 = (re_char*) str1;
4217 re_char *string2 = (re_char*) str2;
fa9a63c5 4218 register char *fastmap = bufp->fastmap;
6676cb1c 4219 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4220 int total_size = size1 + size2;
4221 int endpos = startpos + range;
c0f9ea08 4222 boolean anchored_start;
fa9a63c5 4223
25fe55af 4224 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4225 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4226
fa9a63c5
RM
4227 /* Check for out-of-range STARTPOS. */
4228 if (startpos < 0 || startpos > total_size)
4229 return -1;
5e69f11e 4230
fa9a63c5 4231 /* Fix up RANGE if it might eventually take us outside
34597fa9 4232 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4233 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4234 if (endpos < 0)
4235 range = 0 - startpos;
fa9a63c5
RM
4236 else if (endpos > total_size)
4237 range = total_size - startpos;
4238
4239 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4240 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4241 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4242 {
4243 if (startpos > 0)
4244 return -1;
4245 else
7b140fd7 4246 range = 0;
fa9a63c5
RM
4247 }
4248
ae4788a8
RS
4249#ifdef emacs
4250 /* In a forward search for something that starts with \=.
4251 don't keep searching past point. */
4252 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4253 {
7b140fd7
RS
4254 range = PT_BYTE - BEGV_BYTE - startpos;
4255 if (range < 0)
ae4788a8
RS
4256 return -1;
4257 }
4258#endif /* emacs */
4259
fa9a63c5
RM
4260 /* Update the fastmap now if not correct already. */
4261 if (fastmap && !bufp->fastmap_accurate)
01618498 4262 re_compile_fastmap (bufp);
5e69f11e 4263
c8499ba5 4264 /* See whether the pattern is anchored. */
c0f9ea08 4265 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4266
b18215fc 4267#ifdef emacs
cc9b4df2
KH
4268 gl_state.object = re_match_object;
4269 {
99633e97 4270 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4271
4272 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4273 }
b18215fc
RS
4274#endif
4275
fa9a63c5
RM
4276 /* Loop through the string, looking for a place to start matching. */
4277 for (;;)
5e69f11e 4278 {
c8499ba5
RS
4279 /* If the pattern is anchored,
4280 skip quickly past places we cannot match.
4281 We don't bother to treat startpos == 0 specially
4282 because that case doesn't repeat. */
4283 if (anchored_start && startpos > 0)
4284 {
c0f9ea08
SM
4285 if (! ((startpos <= size1 ? string1[startpos - 1]
4286 : string2[startpos - size1 - 1])
4287 == '\n'))
c8499ba5
RS
4288 goto advance;
4289 }
4290
fa9a63c5 4291 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4292 cannot be the start of a match. If the pattern can match the
4293 null string, however, we don't need to skip characters; we want
4294 the first null string. */
fa9a63c5
RM
4295 if (fastmap && startpos < total_size && !bufp->can_be_null)
4296 {
66f0296e 4297 register re_char *d;
01618498 4298 register re_wchar_t buf_ch;
e934739e
RS
4299
4300 d = POS_ADDR_VSTRING (startpos);
4301
25fe55af 4302 if (range > 0) /* Searching forwards. */
fa9a63c5 4303 {
fa9a63c5
RM
4304 register int lim = 0;
4305 int irange = range;
4306
25fe55af
RS
4307 if (startpos < size1 && startpos + range >= size1)
4308 lim = range - (size1 - startpos);
fa9a63c5 4309
25fe55af
RS
4310 /* Written out as an if-else to avoid testing `translate'
4311 inside the loop. */
28ae27ae
AS
4312 if (RE_TRANSLATE_P (translate))
4313 {
e934739e
RS
4314 if (multibyte)
4315 while (range > lim)
4316 {
4317 int buf_charlen;
4318
4319 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4320 buf_charlen);
4321
4322 buf_ch = RE_TRANSLATE (translate, buf_ch);
4323 if (buf_ch >= 0400
4324 || fastmap[buf_ch])
4325 break;
4326
4327 range -= buf_charlen;
4328 d += buf_charlen;
4329 }
4330 else
cf1982d9
EZ
4331 {
4332 /* Convert *d to integer to shut up GCC's
4333 whining about comparison that is always
4334 true. */
4335 int di = *d;
4336
4337 while (range > lim
4338 && !fastmap[RE_TRANSLATE (translate, di)])
4339 {
4340 di = *(++d);
4341 range--;
4342 }
4343 }
e934739e 4344 }
fa9a63c5 4345 else
66f0296e 4346 while (range > lim && !fastmap[*d])
33c46939
RS
4347 {
4348 d++;
4349 range--;
4350 }
fa9a63c5
RM
4351
4352 startpos += irange - range;
4353 }
25fe55af 4354 else /* Searching backwards. */
fa9a63c5 4355 {
2d1675e4
SM
4356 int room = (startpos >= size1
4357 ? size2 + size1 - startpos
4358 : size1 - startpos);
4359 buf_ch = RE_STRING_CHAR (d, room);
4360 buf_ch = TRANSLATE (buf_ch);
fa9a63c5 4361
e934739e
RS
4362 if (! (buf_ch >= 0400
4363 || fastmap[buf_ch]))
fa9a63c5
RM
4364 goto advance;
4365 }
4366 }
4367
4368 /* If can't match the null string, and that's all we have left, fail. */
4369 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4370 && !bufp->can_be_null)
fa9a63c5
RM
4371 return -1;
4372
4373 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4374 startpos, regs, stop);
4375#ifndef REGEX_MALLOC
0b32bf0e 4376# ifdef C_ALLOCA
fa9a63c5 4377 alloca (0);
0b32bf0e 4378# endif
fa9a63c5
RM
4379#endif
4380
4381 if (val >= 0)
4382 return startpos;
5e69f11e 4383
fa9a63c5
RM
4384 if (val == -2)
4385 return -2;
4386
4387 advance:
5e69f11e 4388 if (!range)
25fe55af 4389 break;
5e69f11e 4390 else if (range > 0)
25fe55af 4391 {
b18215fc
RS
4392 /* Update STARTPOS to the next character boundary. */
4393 if (multibyte)
4394 {
66f0296e
SM
4395 re_char *p = POS_ADDR_VSTRING (startpos);
4396 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4397 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4398
4399 range -= len;
4400 if (range < 0)
4401 break;
4402 startpos += len;
4403 }
4404 else
4405 {
b560c397
RS
4406 range--;
4407 startpos++;
4408 }
e318085a 4409 }
fa9a63c5 4410 else
25fe55af
RS
4411 {
4412 range++;
4413 startpos--;
b18215fc
RS
4414
4415 /* Update STARTPOS to the previous character boundary. */
4416 if (multibyte)
4417 {
70806df6
KH
4418 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4419 re_char *p0 = p;
4420 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4421
4422 /* Find the head of multibyte form. */
70806df6
KH
4423 PREV_CHAR_BOUNDARY (p, phead);
4424 range += p0 - 1 - p;
4425 if (range > 0)
4426 break;
b18215fc 4427
70806df6 4428 startpos -= p0 - 1 - p;
b18215fc 4429 }
25fe55af 4430 }
fa9a63c5
RM
4431 }
4432 return -1;
4433} /* re_search_2 */
c0f9ea08 4434WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4435\f
4436/* Declarations and macros for re_match_2. */
4437
2d1675e4
SM
4438static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4439 register int len,
4440 RE_TRANSLATE_TYPE translate,
4441 const int multibyte));
fa9a63c5
RM
4442
4443/* This converts PTR, a pointer into one of the search strings `string1'
4444 and `string2' into an offset from the beginning of that string. */
4445#define POINTER_TO_OFFSET(ptr) \
4446 (FIRST_STRING_P (ptr) \
4447 ? ((regoff_t) ((ptr) - string1)) \
4448 : ((regoff_t) ((ptr) - string2 + size1)))
4449
fa9a63c5 4450/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4451 string2 if necessary.
4452 Check re_match_2_internal for a discussion of why end_match_2 might
4453 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4454#define PREFETCH() \
25fe55af 4455 while (d == dend) \
fa9a63c5
RM
4456 { \
4457 /* End of string2 => fail. */ \
25fe55af
RS
4458 if (dend == end_match_2) \
4459 goto fail; \
4bb91c68 4460 /* End of string1 => advance to string2. */ \
25fe55af 4461 d = string2; \
fa9a63c5
RM
4462 dend = end_match_2; \
4463 }
4464
f1ad044f
SM
4465/* Call before fetching a char with *d if you already checked other limits.
4466 This is meant for use in lookahead operations like wordend, etc..
4467 where we might need to look at parts of the string that might be
4468 outside of the LIMITs (i.e past `stop'). */
4469#define PREFETCH_NOLIMIT() \
4470 if (d == end1) \
4471 { \
4472 d = string2; \
4473 dend = end_match_2; \
4474 } \
fa9a63c5
RM
4475
4476/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 4477 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4478#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4479#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4480
4481
4482/* Test if D points to a character which is word-constituent. We have
4483 two special cases to check for: if past the end of string1, look at
4484 the first character in string2; and if before the beginning of
4485 string2, look at the last character in string1. */
4486#define WORDCHAR_P(d) \
4487 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4488 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4489 == Sword)
4490
9121ca40 4491/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4492
4493/* The comment at case wordbound is following one, but we don't use
4494 AT_WORD_BOUNDARY anymore to support multibyte form.
4495
4496 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
4497 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4498 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4499 macro and introducing temporary variables works around the bug. */
4500
9121ca40 4501#if 0
fa9a63c5
RM
4502/* Test if the character before D and the one at D differ with respect
4503 to being word-constituent. */
4504#define AT_WORD_BOUNDARY(d) \
4505 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4506 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4507#endif
fa9a63c5
RM
4508
4509/* Free everything we malloc. */
4510#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4511# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4512# define FREE_VARIABLES() \
fa9a63c5
RM
4513 do { \
4514 REGEX_FREE_STACK (fail_stack.stack); \
4515 FREE_VAR (regstart); \
4516 FREE_VAR (regend); \
fa9a63c5
RM
4517 FREE_VAR (best_regstart); \
4518 FREE_VAR (best_regend); \
fa9a63c5
RM
4519 } while (0)
4520#else
0b32bf0e 4521# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4522#endif /* not MATCH_MAY_ALLOCATE */
4523
505bde11
SM
4524\f
4525/* Optimization routines. */
4526
4e8a9132
SM
4527/* If the operation is a match against one or more chars,
4528 return a pointer to the next operation, else return NULL. */
01618498 4529static re_char *
4e8a9132 4530skip_one_char (p)
01618498 4531 re_char *p;
4e8a9132
SM
4532{
4533 switch (SWITCH_ENUM_CAST (*p++))
4534 {
4535 case anychar:
4536 break;
177c0ea7 4537
4e8a9132
SM
4538 case exactn:
4539 p += *p + 1;
4540 break;
4541
4542 case charset_not:
4543 case charset:
4544 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4545 {
4546 int mcnt;
4547 p = CHARSET_RANGE_TABLE (p - 1);
4548 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4549 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4550 }
4551 else
4552 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4553 break;
177c0ea7 4554
4e8a9132
SM
4555 case syntaxspec:
4556 case notsyntaxspec:
1fb352e0 4557#ifdef emacs
4e8a9132
SM
4558 case categoryspec:
4559 case notcategoryspec:
4560#endif /* emacs */
4561 p++;
4562 break;
4563
4564 default:
4565 p = NULL;
4566 }
4567 return p;
4568}
4569
4570
505bde11 4571/* Jump over non-matching operations. */
275a3409 4572static re_char *
4e8a9132 4573skip_noops (p, pend)
275a3409 4574 re_char *p, *pend;
505bde11
SM
4575{
4576 int mcnt;
4577 while (p < pend)
4578 {
4579 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4580 {
4581 case start_memory:
505bde11
SM
4582 case stop_memory:
4583 p += 2; break;
4584 case no_op:
4585 p += 1; break;
4586 case jump:
4587 p += 1;
4588 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4589 p += mcnt;
4590 break;
4591 default:
4592 return p;
4593 }
4594 }
4595 assert (p == pend);
4596 return p;
4597}
4598
4599/* Non-zero if "p1 matches something" implies "p2 fails". */
4600static int
4601mutually_exclusive_p (bufp, p1, p2)
4602 struct re_pattern_buffer *bufp;
275a3409 4603 re_char *p1, *p2;
505bde11 4604{
4e8a9132 4605 re_opcode_t op2;
2d1675e4 4606 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4607 unsigned char *pend = bufp->buffer + bufp->used;
4608
4e8a9132 4609 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4610 && p2 >= bufp->buffer && p2 <= pend);
4611
4612 /* Skip over open/close-group commands.
4613 If what follows this loop is a ...+ construct,
4614 look at what begins its body, since we will have to
4615 match at least one of that. */
4e8a9132
SM
4616 p2 = skip_noops (p2, pend);
4617 /* The same skip can be done for p1, except that this function
4618 is only used in the case where p1 is a simple match operator. */
4619 /* p1 = skip_noops (p1, pend); */
4620
4621 assert (p1 >= bufp->buffer && p1 < pend
4622 && p2 >= bufp->buffer && p2 <= pend);
4623
4624 op2 = p2 == pend ? succeed : *p2;
4625
4626 switch (SWITCH_ENUM_CAST (op2))
505bde11 4627 {
4e8a9132
SM
4628 case succeed:
4629 case endbuf:
4630 /* If we're at the end of the pattern, we can change. */
4631 if (skip_one_char (p1))
505bde11 4632 {
505bde11
SM
4633 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4634 return 1;
505bde11 4635 }
4e8a9132 4636 break;
177c0ea7 4637
4e8a9132 4638 case endline:
4e8a9132
SM
4639 case exactn:
4640 {
01618498 4641 register re_wchar_t c
4e8a9132 4642 = (re_opcode_t) *p2 == endline ? '\n'
411e4203 4643 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
505bde11 4644
4e8a9132
SM
4645 if ((re_opcode_t) *p1 == exactn)
4646 {
4647 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4648 {
4649 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4650 return 1;
4651 }
4652 }
505bde11 4653
4e8a9132
SM
4654 else if ((re_opcode_t) *p1 == charset
4655 || (re_opcode_t) *p1 == charset_not)
4656 {
4657 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4658
4e8a9132
SM
4659 /* Test if C is listed in charset (or charset_not)
4660 at `p1'. */
4661 if (SINGLE_BYTE_CHAR_P (c))
4662 {
4663 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4664 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4665 not = !not;
4666 }
4667 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4668 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4669
4e8a9132
SM
4670 /* `not' is equal to 1 if c would match, which means
4671 that we can't change to pop_failure_jump. */
4672 if (!not)
4673 {
4674 DEBUG_PRINT1 (" No match => fast loop.\n");
4675 return 1;
4676 }
4677 }
4678 else if ((re_opcode_t) *p1 == anychar
4679 && c == '\n')
4680 {
4681 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4682 return 1;
4683 }
4684 }
4685 break;
505bde11 4686
4e8a9132 4687 case charset:
4e8a9132
SM
4688 {
4689 if ((re_opcode_t) *p1 == exactn)
4690 /* Reuse the code above. */
4691 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4692
505bde11
SM
4693 /* It is hard to list up all the character in charset
4694 P2 if it includes multibyte character. Give up in
4695 such case. */
4696 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4697 {
4698 /* Now, we are sure that P2 has no range table.
4699 So, for the size of bitmap in P2, `p2[1]' is
4700 enough. But P1 may have range table, so the
4701 size of bitmap table of P1 is extracted by
4702 using macro `CHARSET_BITMAP_SIZE'.
4703
4704 Since we know that all the character listed in
4705 P2 is ASCII, it is enough to test only bitmap
4706 table of P1. */
4707
411e4203 4708 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4709 {
4710 int idx;
4711 /* We win if the charset inside the loop
4712 has no overlap with the one after the loop. */
4713 for (idx = 0;
4714 (idx < (int) p2[1]
4715 && idx < CHARSET_BITMAP_SIZE (p1));
4716 idx++)
4717 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4718 break;
4719
4720 if (idx == p2[1]
4721 || idx == CHARSET_BITMAP_SIZE (p1))
4722 {
4723 DEBUG_PRINT1 (" No match => fast loop.\n");
4724 return 1;
4725 }
4726 }
411e4203 4727 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4728 {
4729 int idx;
4730 /* We win if the charset_not inside the loop lists
4731 every character listed in the charset after. */
4732 for (idx = 0; idx < (int) p2[1]; idx++)
4733 if (! (p2[2 + idx] == 0
4734 || (idx < CHARSET_BITMAP_SIZE (p1)
4735 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4736 break;
4737
4e8a9132
SM
4738 if (idx == p2[1])
4739 {
4740 DEBUG_PRINT1 (" No match => fast loop.\n");
4741 return 1;
4742 }
4743 }
4744 }
4745 }
609b757a 4746 break;
177c0ea7 4747
411e4203
SM
4748 case charset_not:
4749 switch (SWITCH_ENUM_CAST (*p1))
4750 {
4751 case exactn:
4752 case charset:
4753 /* Reuse the code above. */
4754 return mutually_exclusive_p (bufp, p2, p1);
4755 case charset_not:
4756 /* When we have two charset_not, it's very unlikely that
4757 they don't overlap. The union of the two sets of excluded
4758 chars should cover all possible chars, which, as a matter of
4759 fact, is virtually impossible in multibyte buffers. */
36595814 4760 break;
411e4203
SM
4761 }
4762 break;
4763
4e8a9132 4764 case wordend:
669fa600
SM
4765 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4766 case symend:
4e8a9132 4767 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4768 && (p1[1] == Ssymbol || p1[1] == Sword));
4769 case notsyntaxspec:
4770 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4771
4772 case wordbeg:
669fa600
SM
4773 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4774 case symbeg:
4e8a9132 4775 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4776 && (p1[1] == Ssymbol || p1[1] == Sword));
4777 case syntaxspec:
4778 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4779
4780 case wordbound:
4781 return (((re_opcode_t) *p1 == notsyntaxspec
4782 || (re_opcode_t) *p1 == syntaxspec)
4783 && p1[1] == Sword);
4784
1fb352e0 4785#ifdef emacs
4e8a9132
SM
4786 case categoryspec:
4787 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4788 case notcategoryspec:
4789 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4790#endif /* emacs */
4791
4792 default:
4793 ;
505bde11
SM
4794 }
4795
4796 /* Safe default. */
4797 return 0;
4798}
4799
fa9a63c5
RM
4800\f
4801/* Matching routines. */
4802
25fe55af 4803#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4804/* re_match is like re_match_2 except it takes only a single string. */
4805
4806int
4807re_match (bufp, string, size, pos, regs)
4808 struct re_pattern_buffer *bufp;
4809 const char *string;
4810 int size, pos;
4811 struct re_registers *regs;
4812{
4bb91c68 4813 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4814 pos, regs, size);
0b32bf0e 4815# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4816 alloca (0);
0b32bf0e 4817# endif
fa9a63c5
RM
4818 return result;
4819}
c0f9ea08 4820WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4821#endif /* not emacs */
4822
b18215fc
RS
4823#ifdef emacs
4824/* In Emacs, this is the string or buffer in which we
25fe55af 4825 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4826Lisp_Object re_match_object;
4827#endif
fa9a63c5
RM
4828
4829/* re_match_2 matches the compiled pattern in BUFP against the
4830 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4831 and SIZE2, respectively). We start matching at POS, and stop
4832 matching at STOP.
5e69f11e 4833
fa9a63c5 4834 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4835 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4836 documentation for exactly how many groups we fill.
4837
4838 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4839 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4840 matched substring. */
4841
4842int
4843re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4844 struct re_pattern_buffer *bufp;
4845 const char *string1, *string2;
4846 int size1, size2;
4847 int pos;
4848 struct re_registers *regs;
4849 int stop;
4850{
b18215fc 4851 int result;
25fe55af 4852
b18215fc 4853#ifdef emacs
cc9b4df2
KH
4854 int charpos;
4855 gl_state.object = re_match_object;
99633e97 4856 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4857 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4858#endif
4859
4bb91c68
SM
4860 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4861 (re_char*) string2, size2,
cc9b4df2 4862 pos, regs, stop);
0b32bf0e 4863#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4864 alloca (0);
a60198e5 4865#endif
fa9a63c5
RM
4866 return result;
4867}
c0f9ea08 4868WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5
RM
4869
4870/* This is a separate function so that we can force an alloca cleanup
25fe55af 4871 afterwards. */
fa9a63c5
RM
4872static int
4873re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4874 struct re_pattern_buffer *bufp;
66f0296e 4875 re_char *string1, *string2;
fa9a63c5
RM
4876 int size1, size2;
4877 int pos;
4878 struct re_registers *regs;
4879 int stop;
4880{
4881 /* General temporaries. */
4882 int mcnt;
01618498 4883 size_t reg;
66f0296e 4884 boolean not;
fa9a63c5
RM
4885
4886 /* Just past the end of the corresponding string. */
66f0296e 4887 re_char *end1, *end2;
fa9a63c5
RM
4888
4889 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4890 each to consider matching. */
66f0296e 4891 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4892
4893 /* Where we are in the data, and the end of the current string. */
66f0296e 4894 re_char *d, *dend;
5e69f11e 4895
99633e97
SM
4896 /* Used sometimes to remember where we were before starting matching
4897 an operator so that we can go back in case of failure. This "atomic"
4898 behavior of matching opcodes is indispensable to the correctness
4899 of the on_failure_keep_string_jump optimization. */
4900 re_char *dfail;
4901
fa9a63c5 4902 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4903 re_char *p = bufp->buffer;
4904 re_char *pend = p + bufp->used;
fa9a63c5 4905
25fe55af 4906 /* We use this to map every character in the string. */
6676cb1c 4907 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4908
25fe55af 4909 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4910 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4911
fa9a63c5
RM
4912 /* Failure point stack. Each place that can handle a failure further
4913 down the line pushes a failure point on this stack. It consists of
505bde11 4914 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4915 the subexpressions we're currently inside, plus the number of such
4916 registers, and, finally, two char *'s. The first char * is where
4917 to resume scanning the pattern; the second one is where to resume
505bde11 4918 scanning the strings. */
25fe55af 4919#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4920 fail_stack_type fail_stack;
4921#endif
4922#ifdef DEBUG
fa9a63c5
RM
4923 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4924#endif
4925
0b32bf0e 4926#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4927 /* This holds the pointer to the failure stack, when
4928 it is allocated relocatably. */
4929 fail_stack_elt_t *failure_stack_ptr;
99633e97 4930#endif
fa9a63c5
RM
4931
4932 /* We fill all the registers internally, independent of what we
25fe55af 4933 return, for use in backreferences. The number here includes
fa9a63c5 4934 an element for register zero. */
4bb91c68 4935 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4936
fa9a63c5
RM
4937 /* Information on the contents of registers. These are pointers into
4938 the input strings; they record just what was matched (on this
4939 attempt) by a subexpression part of the pattern, that is, the
4940 regnum-th regstart pointer points to where in the pattern we began
4941 matching and the regnum-th regend points to right after where we
4942 stopped matching the regnum-th subexpression. (The zeroth register
4943 keeps track of what the whole pattern matches.) */
4944#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4945 re_char **regstart, **regend;
fa9a63c5
RM
4946#endif
4947
fa9a63c5 4948 /* The following record the register info as found in the above
5e69f11e 4949 variables when we find a match better than any we've seen before.
fa9a63c5
RM
4950 This happens as we backtrack through the failure points, which in
4951 turn happens only if we have not yet matched the entire string. */
4952 unsigned best_regs_set = false;
4953#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4954 re_char **best_regstart, **best_regend;
fa9a63c5 4955#endif
5e69f11e 4956
fa9a63c5
RM
4957 /* Logically, this is `best_regend[0]'. But we don't want to have to
4958 allocate space for that if we're not allocating space for anything
25fe55af 4959 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
4960 any of the other register vectors, and it seems rather a kludge to
4961 treat `best_regend' differently than the rest. So we keep track of
4962 the end of the best match so far in a separate variable. We
4963 initialize this to NULL so that when we backtrack the first time
4964 and need to test it, it's not garbage. */
66f0296e 4965 re_char *match_end = NULL;
fa9a63c5 4966
fa9a63c5
RM
4967#ifdef DEBUG
4968 /* Counts the total number of registers pushed. */
5e69f11e 4969 unsigned num_regs_pushed = 0;
fa9a63c5
RM
4970#endif
4971
4972 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 4973
fa9a63c5 4974 INIT_FAIL_STACK ();
5e69f11e 4975
fa9a63c5
RM
4976#ifdef MATCH_MAY_ALLOCATE
4977 /* Do not bother to initialize all the register variables if there are
4978 no groups in the pattern, as it takes a fair amount of time. If
4979 there are groups, we include space for register 0 (the whole
4980 pattern), even though we never use it, since it simplifies the
4981 array indexing. We should fix this. */
4982 if (bufp->re_nsub)
4983 {
66f0296e
SM
4984 regstart = REGEX_TALLOC (num_regs, re_char *);
4985 regend = REGEX_TALLOC (num_regs, re_char *);
4986 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4987 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 4988
505bde11 4989 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
4990 {
4991 FREE_VARIABLES ();
4992 return -2;
4993 }
fa9a63c5
RM
4994 }
4995 else
4996 {
4997 /* We must initialize all our variables to NULL, so that
25fe55af 4998 `FREE_VARIABLES' doesn't try to free them. */
505bde11 4999 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5000 }
5001#endif /* MATCH_MAY_ALLOCATE */
5002
5003 /* The starting position is bogus. */
5004 if (pos < 0 || pos > size1 + size2)
5005 {
5006 FREE_VARIABLES ();
5007 return -1;
5008 }
5e69f11e 5009
fa9a63c5
RM
5010 /* Initialize subexpression text positions to -1 to mark ones that no
5011 start_memory/stop_memory has been seen for. Also initialize the
5012 register information struct. */
01618498
SM
5013 for (reg = 1; reg < num_regs; reg++)
5014 regstart[reg] = regend[reg] = NULL;
99633e97 5015
fa9a63c5 5016 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 5017 `string1' is null. */
fa9a63c5
RM
5018 if (size2 == 0 && string1 != NULL)
5019 {
5020 string2 = string1;
5021 size2 = size1;
5022 string1 = 0;
5023 size1 = 0;
5024 }
5025 end1 = string1 + size1;
5026 end2 = string2 + size2;
5027
5e69f11e 5028 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5029 `dend' is the end of the input string that `d' points within. `d'
5030 is advanced into the following input string whenever necessary, but
5031 this happens before fetching; therefore, at the beginning of the
5032 loop, `d' can be pointing at the end of a string, but it cannot
5033 equal `string2'. */
419d1c74 5034 if (pos >= size1)
fa9a63c5 5035 {
419d1c74
SM
5036 /* Only match within string2. */
5037 d = string2 + pos - size1;
5038 dend = end_match_2 = string2 + stop - size1;
5039 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5040 }
5041 else
5042 {
f1ad044f 5043 if (stop < size1)
419d1c74
SM
5044 {
5045 /* Only match within string1. */
5046 end_match_1 = string1 + stop;
5047 /* BEWARE!
5048 When we reach end_match_1, PREFETCH normally switches to string2.
5049 But in the present case, this means that just doing a PREFETCH
5050 makes us jump from `stop' to `gap' within the string.
5051 What we really want here is for the search to stop as
5052 soon as we hit end_match_1. That's why we set end_match_2
5053 to end_match_1 (since PREFETCH fails as soon as we hit
5054 end_match_2). */
5055 end_match_2 = end_match_1;
5056 }
5057 else
f1ad044f
SM
5058 { /* It's important to use this code when stop == size so that
5059 moving `d' from end1 to string2 will not prevent the d == dend
5060 check from catching the end of string. */
419d1c74
SM
5061 end_match_1 = end1;
5062 end_match_2 = string2 + stop - size1;
5063 }
5064 d = string1 + pos;
5065 dend = end_match_1;
fa9a63c5
RM
5066 }
5067
5068 DEBUG_PRINT1 ("The compiled pattern is: ");
5069 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5070 DEBUG_PRINT1 ("The string to match is: `");
5071 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5072 DEBUG_PRINT1 ("'\n");
5e69f11e 5073
25fe55af 5074 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5075 function if the match is complete, or it drops through if the match
5076 fails at this starting point in the input data. */
5077 for (;;)
5078 {
505bde11 5079 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5080
5081 if (p == pend)
5082 { /* End of pattern means we might have succeeded. */
25fe55af 5083 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5084
fa9a63c5 5085 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5086 longest match, try backtracking. */
5087 if (d != end_match_2)
fa9a63c5
RM
5088 {
5089 /* 1 if this match ends in the same string (string1 or string2)
5090 as the best previous match. */
5e69f11e 5091 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5092 == FIRST_STRING_P (d));
fa9a63c5
RM
5093 /* 1 if this match is the best seen so far. */
5094 boolean best_match_p;
5095
5096 /* AIX compiler got confused when this was combined
25fe55af 5097 with the previous declaration. */
fa9a63c5
RM
5098 if (same_str_p)
5099 best_match_p = d > match_end;
5100 else
99633e97 5101 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5102
25fe55af
RS
5103 DEBUG_PRINT1 ("backtracking.\n");
5104
5105 if (!FAIL_STACK_EMPTY ())
5106 { /* More failure points to try. */
5107
5108 /* If exceeds best match so far, save it. */
5109 if (!best_regs_set || best_match_p)
5110 {
5111 best_regs_set = true;
5112 match_end = d;
5113
5114 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5115
01618498 5116 for (reg = 1; reg < num_regs; reg++)
25fe55af 5117 {
01618498
SM
5118 best_regstart[reg] = regstart[reg];
5119 best_regend[reg] = regend[reg];
25fe55af
RS
5120 }
5121 }
5122 goto fail;
5123 }
5124
5125 /* If no failure points, don't restore garbage. And if
5126 last match is real best match, don't restore second
5127 best one. */
5128 else if (best_regs_set && !best_match_p)
5129 {
5130 restore_best_regs:
5131 /* Restore best match. It may happen that `dend ==
5132 end_match_1' while the restored d is in string2.
5133 For example, the pattern `x.*y.*z' against the
5134 strings `x-' and `y-z-', if the two strings are
5135 not consecutive in memory. */
5136 DEBUG_PRINT1 ("Restoring best registers.\n");
5137
5138 d = match_end;
5139 dend = ((d >= string1 && d <= end1)
5140 ? end_match_1 : end_match_2);
fa9a63c5 5141
01618498 5142 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5143 {
01618498
SM
5144 regstart[reg] = best_regstart[reg];
5145 regend[reg] = best_regend[reg];
fa9a63c5 5146 }
25fe55af
RS
5147 }
5148 } /* d != end_match_2 */
fa9a63c5
RM
5149
5150 succeed_label:
25fe55af 5151 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5152
25fe55af
RS
5153 /* If caller wants register contents data back, do it. */
5154 if (regs && !bufp->no_sub)
fa9a63c5 5155 {
25fe55af
RS
5156 /* Have the register data arrays been allocated? */
5157 if (bufp->regs_allocated == REGS_UNALLOCATED)
5158 { /* No. So allocate them with malloc. We need one
5159 extra element beyond `num_regs' for the `-1' marker
5160 GNU code uses. */
5161 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5162 regs->start = TALLOC (regs->num_regs, regoff_t);
5163 regs->end = TALLOC (regs->num_regs, regoff_t);
5164 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5165 {
5166 FREE_VARIABLES ();
5167 return -2;
5168 }
25fe55af
RS
5169 bufp->regs_allocated = REGS_REALLOCATE;
5170 }
5171 else if (bufp->regs_allocated == REGS_REALLOCATE)
5172 { /* Yes. If we need more elements than were already
5173 allocated, reallocate them. If we need fewer, just
5174 leave it alone. */
5175 if (regs->num_regs < num_regs + 1)
5176 {
5177 regs->num_regs = num_regs + 1;
5178 RETALLOC (regs->start, regs->num_regs, regoff_t);
5179 RETALLOC (regs->end, regs->num_regs, regoff_t);
5180 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5181 {
5182 FREE_VARIABLES ();
5183 return -2;
5184 }
25fe55af
RS
5185 }
5186 }
5187 else
fa9a63c5
RM
5188 {
5189 /* These braces fend off a "empty body in an else-statement"
25fe55af 5190 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5191 assert (bufp->regs_allocated == REGS_FIXED);
5192 }
5193
25fe55af
RS
5194 /* Convert the pointer data in `regstart' and `regend' to
5195 indices. Register zero has to be set differently,
5196 since we haven't kept track of any info for it. */
5197 if (regs->num_regs > 0)
5198 {
5199 regs->start[0] = pos;
99633e97 5200 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5201 }
5e69f11e 5202
25fe55af
RS
5203 /* Go through the first `min (num_regs, regs->num_regs)'
5204 registers, since that is all we initialized. */
01618498 5205 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5206 {
01618498
SM
5207 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5208 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5209 else
5210 {
01618498
SM
5211 regs->start[reg]
5212 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5213 regs->end[reg]
5214 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5215 }
fa9a63c5 5216 }
5e69f11e 5217
25fe55af
RS
5218 /* If the regs structure we return has more elements than
5219 were in the pattern, set the extra elements to -1. If
5220 we (re)allocated the registers, this is the case,
5221 because we always allocate enough to have at least one
5222 -1 at the end. */
01618498
SM
5223 for (reg = num_regs; reg < regs->num_regs; reg++)
5224 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5225 } /* regs && !bufp->no_sub */
5226
25fe55af
RS
5227 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5228 nfailure_points_pushed, nfailure_points_popped,
5229 nfailure_points_pushed - nfailure_points_popped);
5230 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5231
99633e97 5232 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5233
25fe55af 5234 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5235
25fe55af
RS
5236 FREE_VARIABLES ();
5237 return mcnt;
5238 }
fa9a63c5 5239
25fe55af 5240 /* Otherwise match next pattern command. */
fa9a63c5
RM
5241 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5242 {
25fe55af
RS
5243 /* Ignore these. Used to ignore the n of succeed_n's which
5244 currently have n == 0. */
5245 case no_op:
5246 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5247 break;
fa9a63c5
RM
5248
5249 case succeed:
25fe55af 5250 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5251 goto succeed_label;
5252
25fe55af
RS
5253 /* Match the next n pattern characters exactly. The following
5254 byte in the pattern defines n, and the n bytes after that
5255 are the characters to match. */
fa9a63c5
RM
5256 case exactn:
5257 mcnt = *p++;
25fe55af 5258 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5259
99633e97
SM
5260 /* Remember the start point to rollback upon failure. */
5261 dfail = d;
5262
25fe55af
RS
5263 /* This is written out as an if-else so we don't waste time
5264 testing `translate' inside the loop. */
28703c16 5265 if (RE_TRANSLATE_P (translate))
fa9a63c5 5266 {
e934739e
RS
5267 if (multibyte)
5268 do
5269 {
5270 int pat_charlen, buf_charlen;
e71b1971 5271 unsigned int pat_ch, buf_ch;
e934739e
RS
5272
5273 PREFETCH ();
5274 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5275 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5276
5277 if (RE_TRANSLATE (translate, buf_ch)
5278 != pat_ch)
99633e97
SM
5279 {
5280 d = dfail;
5281 goto fail;
5282 }
e934739e
RS
5283
5284 p += pat_charlen;
5285 d += buf_charlen;
5286 mcnt -= pat_charlen;
5287 }
5288 while (mcnt > 0);
5289 else
e934739e
RS
5290 do
5291 {
cf1982d9
EZ
5292 /* Avoid compiler whining about comparison being
5293 always true. */
5294 int di;
5295
e934739e 5296 PREFETCH ();
cf1982d9
EZ
5297 di = *d;
5298 if (RE_TRANSLATE (translate, di) != *p++)
99633e97
SM
5299 {
5300 d = dfail;
5301 goto fail;
5302 }
33c46939 5303 d++;
e934739e
RS
5304 }
5305 while (--mcnt);
fa9a63c5
RM
5306 }
5307 else
5308 {
5309 do
5310 {
5311 PREFETCH ();
99633e97
SM
5312 if (*d++ != *p++)
5313 {
5314 d = dfail;
5315 goto fail;
5316 }
fa9a63c5
RM
5317 }
5318 while (--mcnt);
5319 }
25fe55af 5320 break;
fa9a63c5
RM
5321
5322
25fe55af 5323 /* Match any character except possibly a newline or a null. */
fa9a63c5 5324 case anychar:
e934739e
RS
5325 {
5326 int buf_charlen;
01618498 5327 re_wchar_t buf_ch;
fa9a63c5 5328
e934739e 5329 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5330
e934739e 5331 PREFETCH ();
2d1675e4 5332 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
5333 buf_ch = TRANSLATE (buf_ch);
5334
5335 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5336 && buf_ch == '\n')
5337 || ((bufp->syntax & RE_DOT_NOT_NULL)
5338 && buf_ch == '\000'))
5339 goto fail;
5340
e934739e
RS
5341 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5342 d += buf_charlen;
5343 }
fa9a63c5
RM
5344 break;
5345
5346
5347 case charset:
5348 case charset_not:
5349 {
b18215fc 5350 register unsigned int c;
fa9a63c5 5351 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5352 int len;
5353
5354 /* Start of actual range_table, or end of bitmap if there is no
5355 range table. */
01618498 5356 re_char *range_table;
b18215fc 5357
96cc36cc 5358 /* Nonzero if there is a range table. */
b18215fc
RS
5359 int range_table_exists;
5360
96cc36cc
RS
5361 /* Number of ranges of range table. This is not included
5362 in the initial byte-length of the command. */
5363 int count = 0;
fa9a63c5 5364
25fe55af 5365 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5366
b18215fc 5367 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5368
b18215fc 5369 if (range_table_exists)
96cc36cc
RS
5370 {
5371 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5372 EXTRACT_NUMBER_AND_INCR (count, range_table);
5373 }
b18215fc 5374
2d1675e4
SM
5375 PREFETCH ();
5376 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5377 c = TRANSLATE (c); /* The character to match. */
b18215fc
RS
5378
5379 if (SINGLE_BYTE_CHAR_P (c))
5380 { /* Lookup bitmap. */
b18215fc
RS
5381 /* Cast to `unsigned' instead of `unsigned char' in
5382 case the bit list is a full 32 bytes long. */
5383 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5384 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5385 not = !not;
b18215fc 5386 }
96cc36cc 5387#ifdef emacs
b18215fc 5388 else if (range_table_exists)
96cc36cc
RS
5389 {
5390 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5391
14473664
SM
5392 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5393 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5394 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5395 | (class_bits & BIT_SPACE && ISSPACE (c))
5396 | (class_bits & BIT_UPPER && ISUPPER (c))
5397 | (class_bits & BIT_WORD && ISWORD (c)))
5398 not = !not;
5399 else
5400 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5401 }
5402#endif /* emacs */
fa9a63c5 5403
96cc36cc
RS
5404 if (range_table_exists)
5405 p = CHARSET_RANGE_TABLE_END (range_table, count);
5406 else
5407 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5408
5409 if (!not) goto fail;
5e69f11e 5410
b18215fc 5411 d += len;
fa9a63c5
RM
5412 break;
5413 }
5414
5415
25fe55af 5416 /* The beginning of a group is represented by start_memory.
505bde11 5417 The argument is the register number. The text
25fe55af
RS
5418 matched within the group is recorded (in the internal
5419 registers data structure) under the register number. */
5420 case start_memory:
505bde11
SM
5421 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5422
5423 /* In case we need to undo this operation (via backtracking). */
5424 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5425
25fe55af 5426 regstart[*p] = d;
4bb91c68 5427 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5428 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5429
25fe55af 5430 /* Move past the register number and inner group count. */
505bde11 5431 p += 1;
25fe55af 5432 break;
fa9a63c5
RM
5433
5434
25fe55af 5435 /* The stop_memory opcode represents the end of a group. Its
505bde11 5436 argument is the same as start_memory's: the register number. */
fa9a63c5 5437 case stop_memory:
505bde11
SM
5438 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5439
5440 assert (!REG_UNSET (regstart[*p]));
5441 /* Strictly speaking, there should be code such as:
177c0ea7 5442
0b32bf0e 5443 assert (REG_UNSET (regend[*p]));
505bde11
SM
5444 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5445
5446 But the only info to be pushed is regend[*p] and it is known to
5447 be UNSET, so there really isn't anything to push.
5448 Not pushing anything, on the other hand deprives us from the
5449 guarantee that regend[*p] is UNSET since undoing this operation
5450 will not reset its value properly. This is not important since
5451 the value will only be read on the next start_memory or at
5452 the very end and both events can only happen if this stop_memory
5453 is *not* undone. */
fa9a63c5 5454
25fe55af 5455 regend[*p] = d;
fa9a63c5
RM
5456 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5457
25fe55af 5458 /* Move past the register number and the inner group count. */
505bde11 5459 p += 1;
25fe55af 5460 break;
fa9a63c5
RM
5461
5462
5463 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5464 followed by the numeric value of <digit> as the register number. */
5465 case duplicate:
fa9a63c5 5466 {
66f0296e 5467 register re_char *d2, *dend2;
25fe55af 5468 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5469 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5470
25fe55af
RS
5471 /* Can't back reference a group which we've never matched. */
5472 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5473 goto fail;
5e69f11e 5474
25fe55af
RS
5475 /* Where in input to try to start matching. */
5476 d2 = regstart[regno];
5e69f11e 5477
99633e97
SM
5478 /* Remember the start point to rollback upon failure. */
5479 dfail = d;
5480
25fe55af
RS
5481 /* Where to stop matching; if both the place to start and
5482 the place to stop matching are in the same string, then
5483 set to the place to stop, otherwise, for now have to use
5484 the end of the first string. */
fa9a63c5 5485
25fe55af 5486 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5487 == FIRST_STRING_P (regend[regno]))
5488 ? regend[regno] : end_match_1);
5489 for (;;)
5490 {
5491 /* If necessary, advance to next segment in register
25fe55af 5492 contents. */
fa9a63c5
RM
5493 while (d2 == dend2)
5494 {
5495 if (dend2 == end_match_2) break;
5496 if (dend2 == regend[regno]) break;
5497
25fe55af
RS
5498 /* End of string1 => advance to string2. */
5499 d2 = string2;
5500 dend2 = regend[regno];
fa9a63c5
RM
5501 }
5502 /* At end of register contents => success */
5503 if (d2 == dend2) break;
5504
5505 /* If necessary, advance to next segment in data. */
5506 PREFETCH ();
5507
5508 /* How many characters left in this segment to match. */
5509 mcnt = dend - d;
5e69f11e 5510
fa9a63c5 5511 /* Want how many consecutive characters we can match in
25fe55af
RS
5512 one shot, so, if necessary, adjust the count. */
5513 if (mcnt > dend2 - d2)
fa9a63c5 5514 mcnt = dend2 - d2;
5e69f11e 5515
fa9a63c5 5516 /* Compare that many; failure if mismatch, else move
25fe55af 5517 past them. */
28703c16 5518 if (RE_TRANSLATE_P (translate)
2d1675e4 5519 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5520 : memcmp (d, d2, mcnt))
99633e97
SM
5521 {
5522 d = dfail;
5523 goto fail;
5524 }
fa9a63c5 5525 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5526 }
5527 }
5528 break;
5529
5530
25fe55af 5531 /* begline matches the empty string at the beginning of the string
c0f9ea08 5532 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5533 case begline:
25fe55af 5534 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5535
25fe55af
RS
5536 if (AT_STRINGS_BEG (d))
5537 {
5538 if (!bufp->not_bol) break;
5539 }
419d1c74 5540 else
25fe55af 5541 {
419d1c74
SM
5542 unsigned char c;
5543 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5544 if (c == '\n')
419d1c74 5545 break;
25fe55af
RS
5546 }
5547 /* In all other cases, we fail. */
5548 goto fail;
fa9a63c5
RM
5549
5550
25fe55af 5551 /* endline is the dual of begline. */
fa9a63c5 5552 case endline:
25fe55af 5553 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5554
25fe55af
RS
5555 if (AT_STRINGS_END (d))
5556 {
5557 if (!bufp->not_eol) break;
5558 }
f1ad044f 5559 else
25fe55af 5560 {
f1ad044f 5561 PREFETCH_NOLIMIT ();
c0f9ea08 5562 if (*d == '\n')
f1ad044f 5563 break;
25fe55af
RS
5564 }
5565 goto fail;
fa9a63c5
RM
5566
5567
5568 /* Match at the very beginning of the data. */
25fe55af
RS
5569 case begbuf:
5570 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5571 if (AT_STRINGS_BEG (d))
5572 break;
5573 goto fail;
fa9a63c5
RM
5574
5575
5576 /* Match at the very end of the data. */
25fe55af
RS
5577 case endbuf:
5578 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5579 if (AT_STRINGS_END (d))
5580 break;
25fe55af 5581 goto fail;
5e69f11e 5582
5e69f11e 5583
25fe55af
RS
5584 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5585 pushes NULL as the value for the string on the stack. Then
505bde11 5586 `POP_FAILURE_POINT' will keep the current value for the
25fe55af
RS
5587 string, instead of restoring it. To see why, consider
5588 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5589 then the . fails against the \n. But the next thing we want
5590 to do is match the \n against the \n; if we restored the
5591 string value, we would be back at the foo.
5592
5593 Because this is used only in specific cases, we don't need to
5594 check all the things that `on_failure_jump' does, to make
5595 sure the right things get saved on the stack. Hence we don't
5596 share its code. The only reason to push anything on the
5597 stack at all is that otherwise we would have to change
5598 `anychar's code to do something besides goto fail in this
5599 case; that seems worse than this. */
5600 case on_failure_keep_string_jump:
505bde11
SM
5601 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5602 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5603 mcnt, p + mcnt);
fa9a63c5 5604
505bde11
SM
5605 PUSH_FAILURE_POINT (p - 3, NULL);
5606 break;
5607
0683b6fa
SM
5608 /* A nasty loop is introduced by the non-greedy *? and +?.
5609 With such loops, the stack only ever contains one failure point
5610 at a time, so that a plain on_failure_jump_loop kind of
5611 cycle detection cannot work. Worse yet, such a detection
5612 can not only fail to detect a cycle, but it can also wrongly
5613 detect a cycle (between different instantiations of the same
6df42991 5614 loop).
0683b6fa
SM
5615 So the method used for those nasty loops is a little different:
5616 We use a special cycle-detection-stack-frame which is pushed
5617 when the on_failure_jump_nastyloop failure-point is *popped*.
5618 This special frame thus marks the beginning of one iteration
5619 through the loop and we can hence easily check right here
5620 whether something matched between the beginning and the end of
5621 the loop. */
5622 case on_failure_jump_nastyloop:
5623 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5624 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5625 mcnt, p + mcnt);
5626
5627 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5628 {
5629 int cycle = 0;
5630 CHECK_INFINITE_LOOP (p - 4, d);
5631 if (!cycle)
5632 /* If there's a cycle, just continue without pushing
5633 this failure point. The failure point is the "try again"
5634 option, which shouldn't be tried.
5635 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5636 PUSH_FAILURE_POINT (p - 3, d);
5637 }
0683b6fa
SM
5638 break;
5639
4e8a9132
SM
5640 /* Simple loop detecting on_failure_jump: just check on the
5641 failure stack if the same spot was already hit earlier. */
505bde11
SM
5642 case on_failure_jump_loop:
5643 on_failure:
5644 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5645 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5646 mcnt, p + mcnt);
6df42991
SM
5647 {
5648 int cycle = 0;
5649 CHECK_INFINITE_LOOP (p - 3, d);
5650 if (cycle)
5651 /* If there's a cycle, get out of the loop, as if the matching
5652 had failed. We used to just `goto fail' here, but that was
5653 aborting the search a bit too early: we want to keep the
5654 empty-loop-match and keep matching after the loop.
5655 We want (x?)*y\1z to match both xxyz and xxyxz. */
5656 p += mcnt;
5657 else
5658 PUSH_FAILURE_POINT (p - 3, d);
5659 }
25fe55af 5660 break;
fa9a63c5
RM
5661
5662
5663 /* Uses of on_failure_jump:
5e69f11e 5664
25fe55af
RS
5665 Each alternative starts with an on_failure_jump that points
5666 to the beginning of the next alternative. Each alternative
5667 except the last ends with a jump that in effect jumps past
5668 the rest of the alternatives. (They really jump to the
5669 ending jump of the following alternative, because tensioning
5670 these jumps is a hassle.)
fa9a63c5 5671
25fe55af
RS
5672 Repeats start with an on_failure_jump that points past both
5673 the repetition text and either the following jump or
5674 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5675 case on_failure_jump:
25fe55af 5676 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5677 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5678 mcnt, p + mcnt);
25fe55af 5679
505bde11 5680 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5681 break;
5682
4e8a9132 5683 /* This operation is used for greedy *.
505bde11
SM
5684 Compare the beginning of the repeat with what in the
5685 pattern follows its end. If we can establish that there
5686 is nothing that they would both match, i.e., that we
5687 would have to backtrack because of (as in, e.g., `a*a')
5688 then we can use a non-backtracking loop based on
4e8a9132 5689 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5690 case on_failure_jump_smart:
25fe55af 5691 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5692 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5693 mcnt, p + mcnt);
25fe55af 5694 {
01618498 5695 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5696 /* Here, we discard `const', making re_match non-reentrant. */
5697 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5698 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5699
505bde11
SM
5700 p -= 3; /* Reset so that we will re-execute the
5701 instruction once it's been changed. */
fa9a63c5 5702
4e8a9132
SM
5703 EXTRACT_NUMBER (mcnt, p2 - 2);
5704
5705 /* Ensure this is a indeed the trivial kind of loop
5706 we are expecting. */
5707 assert (skip_one_char (p1) == p2 - 3);
5708 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5709 DEBUG_STATEMENT (debug += 2);
505bde11 5710 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5711 {
505bde11 5712 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5713 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5714 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5715 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5716 }
505bde11 5717 else
fa9a63c5 5718 {
505bde11
SM
5719 /* Default to a safe `on_failure_jump' loop. */
5720 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5721 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5722 }
99633e97 5723 DEBUG_STATEMENT (debug -= 2);
25fe55af 5724 }
505bde11 5725 break;
25fe55af
RS
5726
5727 /* Unconditionally jump (without popping any failure points). */
5728 case jump:
fa9a63c5 5729 unconditional_jump:
5b370c2b 5730 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5731 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5732 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5733 p += mcnt; /* Do the jump. */
505bde11 5734 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5735 break;
5736
5737
25fe55af
RS
5738 /* Have to succeed matching what follows at least n times.
5739 After that, handle like `on_failure_jump'. */
5740 case succeed_n:
01618498 5741 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5742 EXTRACT_NUMBER (mcnt, p + 2);
5743 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5744
dc1e502d
SM
5745 /* Originally, mcnt is how many times we HAVE to succeed. */
5746 if (mcnt != 0)
25fe55af 5747 {
6dcf2d0e
SM
5748 /* Here, we discard `const', making re_match non-reentrant. */
5749 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5750 mcnt--;
01618498
SM
5751 p += 4;
5752 PUSH_NUMBER (p2, mcnt);
25fe55af 5753 }
dc1e502d
SM
5754 else
5755 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5756 goto on_failure;
25fe55af
RS
5757 break;
5758
5759 case jump_n:
01618498 5760 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5761 EXTRACT_NUMBER (mcnt, p + 2);
5762 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5763
5764 /* Originally, this is how many times we CAN jump. */
dc1e502d 5765 if (mcnt != 0)
25fe55af 5766 {
6dcf2d0e
SM
5767 /* Here, we discard `const', making re_match non-reentrant. */
5768 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5769 mcnt--;
01618498 5770 PUSH_NUMBER (p2, mcnt);
dc1e502d 5771 goto unconditional_jump;
25fe55af
RS
5772 }
5773 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5774 else
5775 p += 4;
25fe55af 5776 break;
5e69f11e 5777
fa9a63c5
RM
5778 case set_number_at:
5779 {
01618498 5780 unsigned char *p2; /* Location of the counter. */
25fe55af 5781 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5782
25fe55af 5783 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5784 /* Here, we discard `const', making re_match non-reentrant. */
5785 p2 = (unsigned char*) p + mcnt;
01618498 5786 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5787 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5788 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5789 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5790 break;
5791 }
9121ca40
KH
5792
5793 case wordbound:
66f0296e
SM
5794 case notwordbound:
5795 not = (re_opcode_t) *(p - 1) == notwordbound;
5796 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5797
99633e97 5798 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5799
b18215fc 5800 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5801 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5802 not = !not;
b18215fc
RS
5803 else
5804 {
5805 /* C1 is the character before D, S1 is the syntax of C1, C2
5806 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5807 re_wchar_t c1, c2;
5808 int s1, s2;
b18215fc 5809#ifdef emacs
2d1675e4
SM
5810 int offset = PTR_TO_OFFSET (d - 1);
5811 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5812 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5813#endif
66f0296e 5814 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5815 s1 = SYNTAX (c1);
5816#ifdef emacs
5d967c7a 5817 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5818#endif
f1ad044f 5819 PREFETCH_NOLIMIT ();
2d1675e4 5820 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc
RS
5821 s2 = SYNTAX (c2);
5822
5823 if (/* Case 2: Only one of S1 and S2 is Sword. */
5824 ((s1 == Sword) != (s2 == Sword))
5825 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5826 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5827 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5828 not = !not;
5829 }
5830 if (not)
9121ca40 5831 break;
b18215fc 5832 else
9121ca40 5833 goto fail;
fa9a63c5
RM
5834
5835 case wordbeg:
25fe55af 5836 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5837
b18215fc
RS
5838 /* We FAIL in one of the following cases: */
5839
25fe55af 5840 /* Case 1: D is at the end of string. */
b18215fc 5841 if (AT_STRINGS_END (d))
99633e97 5842 goto fail;
b18215fc
RS
5843 else
5844 {
5845 /* C1 is the character before D, S1 is the syntax of C1, C2
5846 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5847 re_wchar_t c1, c2;
5848 int s1, s2;
fa9a63c5 5849#ifdef emacs
2d1675e4
SM
5850 int offset = PTR_TO_OFFSET (d);
5851 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5852 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5853#endif
99633e97 5854 PREFETCH ();
2d1675e4 5855 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc 5856 s2 = SYNTAX (c2);
177c0ea7 5857
b18215fc
RS
5858 /* Case 2: S2 is not Sword. */
5859 if (s2 != Sword)
5860 goto fail;
5861
5862 /* Case 3: D is not at the beginning of string ... */
5863 if (!AT_STRINGS_BEG (d))
5864 {
5865 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5866#ifdef emacs
5d967c7a 5867 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5868#endif
b18215fc
RS
5869 s1 = SYNTAX (c1);
5870
5871 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5872 returns 0. */
b18215fc
RS
5873 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5874 goto fail;
5875 }
5876 }
e318085a
RS
5877 break;
5878
b18215fc 5879 case wordend:
25fe55af 5880 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5881
5882 /* We FAIL in one of the following cases: */
5883
5884 /* Case 1: D is at the beginning of string. */
5885 if (AT_STRINGS_BEG (d))
e318085a 5886 goto fail;
b18215fc
RS
5887 else
5888 {
5889 /* C1 is the character before D, S1 is the syntax of C1, C2
5890 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5891 re_wchar_t c1, c2;
5892 int s1, s2;
5d967c7a 5893#ifdef emacs
2d1675e4
SM
5894 int offset = PTR_TO_OFFSET (d) - 1;
5895 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5896 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5897#endif
99633e97 5898 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5899 s1 = SYNTAX (c1);
5900
5901 /* Case 2: S1 is not Sword. */
5902 if (s1 != Sword)
5903 goto fail;
5904
5905 /* Case 3: D is not at the end of string ... */
5906 if (!AT_STRINGS_END (d))
5907 {
f1ad044f 5908 PREFETCH_NOLIMIT ();
2d1675e4 5909 c2 = RE_STRING_CHAR (d, dend - d);
5d967c7a 5910#ifdef emacs
134579f2 5911 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5d967c7a 5912#endif
b18215fc
RS
5913 s2 = SYNTAX (c2);
5914
5915 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5916 returns 0. */
b18215fc 5917 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5918 goto fail;
b18215fc
RS
5919 }
5920 }
e318085a
RS
5921 break;
5922
669fa600
SM
5923 case symbeg:
5924 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
5925
5926 /* We FAIL in one of the following cases: */
5927
5928 /* Case 1: D is at the end of string. */
5929 if (AT_STRINGS_END (d))
5930 goto fail;
5931 else
5932 {
5933 /* C1 is the character before D, S1 is the syntax of C1, C2
5934 is the character at D, and S2 is the syntax of C2. */
5935 re_wchar_t c1, c2;
5936 int s1, s2;
5937#ifdef emacs
5938 int offset = PTR_TO_OFFSET (d);
5939 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5940 UPDATE_SYNTAX_TABLE (charpos);
5941#endif
5942 PREFETCH ();
5943 c2 = RE_STRING_CHAR (d, dend - d);
5944 s2 = SYNTAX (c2);
5945
5946 /* Case 2: S2 is neither Sword nor Ssymbol. */
5947 if (s2 != Sword && s2 != Ssymbol)
5948 goto fail;
5949
5950 /* Case 3: D is not at the beginning of string ... */
5951 if (!AT_STRINGS_BEG (d))
5952 {
5953 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5954#ifdef emacs
5955 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5956#endif
5957 s1 = SYNTAX (c1);
5958
5959 /* ... and S1 is Sword or Ssymbol. */
5960 if (s1 == Sword || s1 == Ssymbol)
5961 goto fail;
5962 }
5963 }
5964 break;
5965
5966 case symend:
5967 DEBUG_PRINT1 ("EXECUTING symend.\n");
5968
5969 /* We FAIL in one of the following cases: */
5970
5971 /* Case 1: D is at the beginning of string. */
5972 if (AT_STRINGS_BEG (d))
5973 goto fail;
5974 else
5975 {
5976 /* C1 is the character before D, S1 is the syntax of C1, C2
5977 is the character at D, and S2 is the syntax of C2. */
5978 re_wchar_t c1, c2;
5979 int s1, s2;
5980#ifdef emacs
5981 int offset = PTR_TO_OFFSET (d) - 1;
5982 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5983 UPDATE_SYNTAX_TABLE (charpos);
5984#endif
5985 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5986 s1 = SYNTAX (c1);
5987
5988 /* Case 2: S1 is neither Ssymbol nor Sword. */
5989 if (s1 != Sword && s1 != Ssymbol)
5990 goto fail;
5991
5992 /* Case 3: D is not at the end of string ... */
5993 if (!AT_STRINGS_END (d))
5994 {
5995 PREFETCH_NOLIMIT ();
5996 c2 = RE_STRING_CHAR (d, dend - d);
5997#ifdef emacs
134579f2 5998 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
5999#endif
6000 s2 = SYNTAX (c2);
6001
6002 /* ... and S2 is Sword or Ssymbol. */
6003 if (s2 == Sword || s2 == Ssymbol)
6004 goto fail;
6005 }
6006 }
6007 break;
6008
fa9a63c5 6009 case syntaxspec:
1fb352e0
SM
6010 case notsyntaxspec:
6011 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6012 mcnt = *p++;
1fb352e0 6013 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6014 PREFETCH ();
b18215fc
RS
6015#ifdef emacs
6016 {
2d1675e4
SM
6017 int offset = PTR_TO_OFFSET (d);
6018 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6019 UPDATE_SYNTAX_TABLE (pos1);
6020 }
25fe55af 6021#endif
b18215fc 6022 {
01618498
SM
6023 int len;
6024 re_wchar_t c;
b18215fc 6025
2d1675e4 6026 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6027
990b2375 6028 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6029 goto fail;
b18215fc
RS
6030 d += len;
6031 }
fa9a63c5
RM
6032 break;
6033
b18215fc 6034#ifdef emacs
1fb352e0
SM
6035 case before_dot:
6036 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6037 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6038 goto fail;
b18215fc
RS
6039 break;
6040
1fb352e0
SM
6041 case at_dot:
6042 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6043 if (PTR_BYTE_POS (d) != PT_BYTE)
6044 goto fail;
6045 break;
b18215fc 6046
1fb352e0
SM
6047 case after_dot:
6048 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6049 if (PTR_BYTE_POS (d) <= PT_BYTE)
6050 goto fail;
e318085a 6051 break;
fa9a63c5 6052
1fb352e0 6053 case categoryspec:
b18215fc 6054 case notcategoryspec:
1fb352e0 6055 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6056 mcnt = *p++;
1fb352e0 6057 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6058 PREFETCH ();
6059 {
01618498
SM
6060 int len;
6061 re_wchar_t c;
6062
2d1675e4 6063 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6064
1fb352e0 6065 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6066 goto fail;
6067 d += len;
6068 }
fa9a63c5 6069 break;
5e69f11e 6070
1fb352e0 6071#endif /* emacs */
5e69f11e 6072
0b32bf0e
SM
6073 default:
6074 abort ();
fa9a63c5 6075 }
b18215fc 6076 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6077
6078
6079 /* We goto here if a matching operation fails. */
6080 fail:
5b370c2b 6081 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6082 if (!FAIL_STACK_EMPTY ())
505bde11 6083 {
01618498 6084 re_char *str, *pat;
505bde11 6085 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6086 DEBUG_PRINT1 ("\nFAIL:\n");
6087 POP_FAILURE_POINT (str, pat);
505bde11
SM
6088 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6089 {
6090 case on_failure_keep_string_jump:
6091 assert (str == NULL);
6092 goto continue_failure_jump;
6093
0683b6fa
SM
6094 case on_failure_jump_nastyloop:
6095 assert ((re_opcode_t)pat[-2] == no_op);
6096 PUSH_FAILURE_POINT (pat - 2, str);
6097 /* Fallthrough */
6098
505bde11
SM
6099 case on_failure_jump_loop:
6100 case on_failure_jump:
6101 case succeed_n:
6102 d = str;
6103 continue_failure_jump:
6104 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6105 p = pat + mcnt;
6106 break;
b18215fc 6107
0683b6fa
SM
6108 case no_op:
6109 /* A special frame used for nastyloops. */
6110 goto fail;
6111
505bde11
SM
6112 default:
6113 abort();
6114 }
fa9a63c5 6115
505bde11 6116 assert (p >= bufp->buffer && p <= pend);
b18215fc 6117
0b32bf0e 6118 if (d >= string1 && d <= end1)
fa9a63c5 6119 dend = end_match_1;
0b32bf0e 6120 }
fa9a63c5 6121 else
0b32bf0e 6122 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6123 } /* for (;;) */
6124
6125 if (best_regs_set)
6126 goto restore_best_regs;
6127
6128 FREE_VARIABLES ();
6129
b18215fc 6130 return -1; /* Failure to match. */
fa9a63c5
RM
6131} /* re_match_2 */
6132\f
6133/* Subroutine definitions for re_match_2. */
6134
fa9a63c5
RM
6135/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6136 bytes; nonzero otherwise. */
5e69f11e 6137
fa9a63c5 6138static int
2d1675e4
SM
6139bcmp_translate (s1, s2, len, translate, multibyte)
6140 re_char *s1, *s2;
fa9a63c5 6141 register int len;
6676cb1c 6142 RE_TRANSLATE_TYPE translate;
2d1675e4 6143 const int multibyte;
fa9a63c5 6144{
2d1675e4
SM
6145 register re_char *p1 = s1, *p2 = s2;
6146 re_char *p1_end = s1 + len;
6147 re_char *p2_end = s2 + len;
e934739e 6148
4bb91c68
SM
6149 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6150 different lengths, but relying on a single `len' would break this. -sm */
6151 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6152 {
e934739e 6153 int p1_charlen, p2_charlen;
01618498 6154 re_wchar_t p1_ch, p2_ch;
e934739e 6155
2d1675e4
SM
6156 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
6157 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
e934739e
RS
6158
6159 if (RE_TRANSLATE (translate, p1_ch)
6160 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6161 return 1;
e934739e
RS
6162
6163 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6164 }
e934739e
RS
6165
6166 if (p1 != p1_end || p2 != p2_end)
6167 return 1;
6168
fa9a63c5
RM
6169 return 0;
6170}
6171\f
6172/* Entry points for GNU code. */
6173
6174/* re_compile_pattern is the GNU regular expression compiler: it
6175 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6176 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6177
fa9a63c5
RM
6178 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6179 are set in BUFP on entry.
5e69f11e 6180
b18215fc 6181 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6182
6183const char *
6184re_compile_pattern (pattern, length, bufp)
6185 const char *pattern;
0b32bf0e 6186 size_t length;
fa9a63c5
RM
6187 struct re_pattern_buffer *bufp;
6188{
6189 reg_errcode_t ret;
5e69f11e 6190
fa9a63c5
RM
6191 /* GNU code is written to assume at least RE_NREGS registers will be set
6192 (and at least one extra will be -1). */
6193 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6194
fa9a63c5
RM
6195 /* And GNU code determines whether or not to get register information
6196 by passing null for the REGS argument to re_match, etc., not by
6197 setting no_sub. */
6198 bufp->no_sub = 0;
5e69f11e 6199
4bb91c68 6200 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6201
6202 if (!ret)
6203 return NULL;
6204 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6205}
c0f9ea08 6206WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6207\f
b18215fc
RS
6208/* Entry points compatible with 4.2 BSD regex library. We don't define
6209 them unless specifically requested. */
fa9a63c5 6210
0b32bf0e 6211#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6212
6213/* BSD has one and only one pattern buffer. */
6214static struct re_pattern_buffer re_comp_buf;
6215
6216char *
0b32bf0e 6217# ifdef _LIBC
48afdd44
RM
6218/* Make these definitions weak in libc, so POSIX programs can redefine
6219 these names if they don't use our functions, and still use
6220 regcomp/regexec below without link errors. */
6221weak_function
0b32bf0e 6222# endif
fa9a63c5
RM
6223re_comp (s)
6224 const char *s;
6225{
6226 reg_errcode_t ret;
5e69f11e 6227
fa9a63c5
RM
6228 if (!s)
6229 {
6230 if (!re_comp_buf.buffer)
0b32bf0e 6231 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6232 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6233 return 0;
6234 }
6235
6236 if (!re_comp_buf.buffer)
6237 {
6238 re_comp_buf.buffer = (unsigned char *) malloc (200);
6239 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6240 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6241 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6242 re_comp_buf.allocated = 200;
6243
6244 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6245 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6246 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6247 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6248 }
6249
6250 /* Since `re_exec' always passes NULL for the `regs' argument, we
6251 don't need to initialize the pattern buffer fields which affect it. */
6252
fa9a63c5 6253 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6254
fa9a63c5
RM
6255 if (!ret)
6256 return NULL;
6257
6258 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6259 return (char *) gettext (re_error_msgid[(int) ret]);
6260}
6261
6262
6263int
0b32bf0e 6264# ifdef _LIBC
48afdd44 6265weak_function
0b32bf0e 6266# endif
fa9a63c5
RM
6267re_exec (s)
6268 const char *s;
6269{
6270 const int len = strlen (s);
6271 return
6272 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6273}
6274#endif /* _REGEX_RE_COMP */
6275\f
6276/* POSIX.2 functions. Don't define these for Emacs. */
6277
6278#ifndef emacs
6279
6280/* regcomp takes a regular expression as a string and compiles it.
6281
b18215fc 6282 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6283 since POSIX says we shouldn't. Thus, we set
6284
6285 `buffer' to the compiled pattern;
6286 `used' to the length of the compiled pattern;
6287 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6288 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6289 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6290 `fastmap' to an allocated space for the fastmap;
6291 `fastmap_accurate' to zero;
fa9a63c5
RM
6292 `re_nsub' to the number of subexpressions in PATTERN.
6293
6294 PATTERN is the address of the pattern string.
6295
6296 CFLAGS is a series of bits which affect compilation.
6297
6298 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6299 use POSIX basic syntax.
6300
6301 If REG_NEWLINE is set, then . and [^...] don't match newline.
6302 Also, regexec will try a match beginning after every newline.
6303
6304 If REG_ICASE is set, then we considers upper- and lowercase
6305 versions of letters to be equivalent when matching.
6306
6307 If REG_NOSUB is set, then when PREG is passed to regexec, that
6308 routine will report only success or failure, and nothing about the
6309 registers.
6310
b18215fc 6311 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6312 the return codes and their meanings.) */
6313
6314int
6315regcomp (preg, pattern, cflags)
ada30c0e
SM
6316 regex_t *__restrict preg;
6317 const char *__restrict pattern;
fa9a63c5
RM
6318 int cflags;
6319{
6320 reg_errcode_t ret;
4bb91c68 6321 reg_syntax_t syntax
fa9a63c5
RM
6322 = (cflags & REG_EXTENDED) ?
6323 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6324
6325 /* regex_compile will allocate the space for the compiled pattern. */
6326 preg->buffer = 0;
6327 preg->allocated = 0;
6328 preg->used = 0;
5e69f11e 6329
c0f9ea08 6330 /* Try to allocate space for the fastmap. */
7b1d7d66 6331 preg->fastmap = (char *) xmalloc (1 << BYTEWIDTH);
5e69f11e 6332
fa9a63c5
RM
6333 if (cflags & REG_ICASE)
6334 {
6335 unsigned i;
5e69f11e 6336
6676cb1c 6337 preg->translate
7b1d7d66
RS
6338 = (RE_TRANSLATE_TYPE) xmalloc (CHAR_SET_SIZE
6339 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6340 if (preg->translate == NULL)
0b32bf0e 6341 return (int) REG_ESPACE;
fa9a63c5
RM
6342
6343 /* Map uppercase characters to corresponding lowercase ones. */
6344 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6345 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6346 }
6347 else
6348 preg->translate = NULL;
6349
6350 /* If REG_NEWLINE is set, newlines are treated differently. */
6351 if (cflags & REG_NEWLINE)
6352 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6353 syntax &= ~RE_DOT_NEWLINE;
6354 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6355 }
6356 else
c0f9ea08 6357 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6358
6359 preg->no_sub = !!(cflags & REG_NOSUB);
6360
5e69f11e 6361 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6362 can use strlen here in compiling the pattern. */
4bb91c68 6363 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6364
fa9a63c5
RM
6365 /* POSIX doesn't distinguish between an unmatched open-group and an
6366 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6367 if (ret == REG_ERPAREN)
6368 ret = REG_EPAREN;
6369
6370 if (ret == REG_NOERROR && preg->fastmap)
6371 { /* Compute the fastmap now, since regexec cannot modify the pattern
6372 buffer. */
6373 re_compile_fastmap (preg);
6374 if (preg->can_be_null)
6375 { /* The fastmap can't be used anyway. */
6376 free (preg->fastmap);
6377 preg->fastmap = NULL;
6378 }
6379 }
fa9a63c5
RM
6380 return (int) ret;
6381}
c0f9ea08 6382WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6383
6384
6385/* regexec searches for a given pattern, specified by PREG, in the
6386 string STRING.
5e69f11e 6387
fa9a63c5 6388 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6389 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6390 least NMATCH elements, and we set them to the offsets of the
6391 corresponding matched substrings.
5e69f11e 6392
fa9a63c5
RM
6393 EFLAGS specifies `execution flags' which affect matching: if
6394 REG_NOTBOL is set, then ^ does not match at the beginning of the
6395 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6396
fa9a63c5
RM
6397 We return 0 if we find a match and REG_NOMATCH if not. */
6398
6399int
6400regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6401 const regex_t *__restrict preg;
6402 const char *__restrict string;
5e69f11e 6403 size_t nmatch;
9f2dbe01 6404 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6405 int eflags;
6406{
6407 int ret;
6408 struct re_registers regs;
6409 regex_t private_preg;
6410 int len = strlen (string);
c0f9ea08 6411 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6412
6413 private_preg = *preg;
5e69f11e 6414
fa9a63c5
RM
6415 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6416 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6417
fa9a63c5
RM
6418 /* The user has told us exactly how many registers to return
6419 information about, via `nmatch'. We have to pass that on to the
b18215fc 6420 matching routines. */
fa9a63c5 6421 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6422
fa9a63c5
RM
6423 if (want_reg_info)
6424 {
6425 regs.num_regs = nmatch;
4bb91c68
SM
6426 regs.start = TALLOC (nmatch * 2, regoff_t);
6427 if (regs.start == NULL)
0b32bf0e 6428 return (int) REG_NOMATCH;
4bb91c68 6429 regs.end = regs.start + nmatch;
fa9a63c5
RM
6430 }
6431
c0f9ea08
SM
6432 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6433 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6434 was a little bit longer but still only matching the real part.
6435 This works because the `endline' will check for a '\n' and will find a
6436 '\0', correctly deciding that this is not the end of a line.
6437 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6438 a convenient '\0' there. For all we know, the string could be preceded
6439 by '\n' which would throw things off. */
6440
fa9a63c5
RM
6441 /* Perform the searching operation. */
6442 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6443 /* start: */ 0, /* range: */ len,
6444 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6445
fa9a63c5
RM
6446 /* Copy the register information to the POSIX structure. */
6447 if (want_reg_info)
6448 {
6449 if (ret >= 0)
0b32bf0e
SM
6450 {
6451 unsigned r;
fa9a63c5 6452
0b32bf0e
SM
6453 for (r = 0; r < nmatch; r++)
6454 {
6455 pmatch[r].rm_so = regs.start[r];
6456 pmatch[r].rm_eo = regs.end[r];
6457 }
6458 }
fa9a63c5 6459
b18215fc 6460 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6461 free (regs.start);
fa9a63c5
RM
6462 }
6463
6464 /* We want zero return to mean success, unlike `re_search'. */
6465 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6466}
c0f9ea08 6467WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6468
6469
6470/* Returns a message corresponding to an error code, ERRCODE, returned
6471 from either regcomp or regexec. We don't use PREG here. */
6472
6473size_t
6474regerror (errcode, preg, errbuf, errbuf_size)
6475 int errcode;
6476 const regex_t *preg;
6477 char *errbuf;
6478 size_t errbuf_size;
6479{
6480 const char *msg;
6481 size_t msg_size;
6482
6483 if (errcode < 0
6484 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6485 /* Only error codes returned by the rest of the code should be passed
b18215fc 6486 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6487 code generates an invalid error code, then the program has a bug.
6488 Dump core so we can fix it. */
6489 abort ();
6490
6491 msg = gettext (re_error_msgid[errcode]);
6492
6493 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6494
fa9a63c5
RM
6495 if (errbuf_size != 0)
6496 {
6497 if (msg_size > errbuf_size)
0b32bf0e
SM
6498 {
6499 strncpy (errbuf, msg, errbuf_size - 1);
6500 errbuf[errbuf_size - 1] = 0;
6501 }
fa9a63c5 6502 else
0b32bf0e 6503 strcpy (errbuf, msg);
fa9a63c5
RM
6504 }
6505
6506 return msg_size;
6507}
c0f9ea08 6508WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6509
6510
6511/* Free dynamically allocated space used by PREG. */
6512
6513void
6514regfree (preg)
6515 regex_t *preg;
6516{
6517 if (preg->buffer != NULL)
6518 free (preg->buffer);
6519 preg->buffer = NULL;
5e69f11e 6520
fa9a63c5
RM
6521 preg->allocated = 0;
6522 preg->used = 0;
6523
6524 if (preg->fastmap != NULL)
6525 free (preg->fastmap);
6526 preg->fastmap = NULL;
6527 preg->fastmap_accurate = 0;
6528
6529 if (preg->translate != NULL)
6530 free (preg->translate);
6531 preg->translate = NULL;
6532}
c0f9ea08 6533WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6534
6535#endif /* not emacs */
ab5796a9
MB
6536
6537/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6538 (do not change this comment) */