(Ffunction, Fquote): Signal error if not 1 argument.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
64be3a42
GM
6 2002, 2003, 2004, 2005, 2006, 2007
7 Free Software Foundation, Inc.
bc78d348 8
fa9a63c5
RM
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
64be3a42 11 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
4fc5845f 21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 22 USA. */
fa9a63c5 23
6df42991 24/* TODO:
505bde11 25 - structure the opcode space into opcode+flag.
dc1e502d 26 - merge with glibc's regex.[ch].
01618498 27 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
28 need to modify the compiled regexp so that re_match can be reentrant.
29 - get rid of on_failure_jump_smart by doing the optimization in re_comp
30 rather than at run-time, so that re_match can be reentrant.
01618498 31*/
505bde11 32
fa9a63c5 33/* AIX requires this to be the first thing in the file. */
0b32bf0e 34#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
35 #pragma alloca
36#endif
37
fa9a63c5 38#ifdef HAVE_CONFIG_H
0b32bf0e 39# include <config.h>
fa9a63c5
RM
40#endif
41
4bb91c68
SM
42#if defined STDC_HEADERS && !defined emacs
43# include <stddef.h>
44#else
45/* We need this for `regex.h', and perhaps for the Emacs include files. */
46# include <sys/types.h>
47#endif
fa9a63c5 48
14473664
SM
49/* Whether to use ISO C Amendment 1 wide char functions.
50 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
51#if defined _LIBC
52#define WIDE_CHAR_SUPPORT 1
53#else
14473664 54#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
55 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
56#endif
14473664
SM
57
58/* For platform which support the ISO C amendement 1 functionality we
59 support user defined character classes. */
a0ad02f7 60#if WIDE_CHAR_SUPPORT
14473664
SM
61/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
62# include <wchar.h>
63# include <wctype.h>
64#endif
65
c0f9ea08
SM
66#ifdef _LIBC
67/* We have to keep the namespace clean. */
68# define regfree(preg) __regfree (preg)
69# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
70# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
71# define regerror(err_code, preg, errbuf, errbuf_size) \
72 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
73# define re_set_registers(bu, re, nu, st, en) \
74 __re_set_registers (bu, re, nu, st, en)
75# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
77# define re_match(bufp, string, size, pos, regs) \
78 __re_match (bufp, string, size, pos, regs)
79# define re_search(bufp, string, size, startpos, range, regs) \
80 __re_search (bufp, string, size, startpos, range, regs)
81# define re_compile_pattern(pattern, length, bufp) \
82 __re_compile_pattern (pattern, length, bufp)
83# define re_set_syntax(syntax) __re_set_syntax (syntax)
84# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
86# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
87
14473664
SM
88/* Make sure we call libc's function even if the user overrides them. */
89# define btowc __btowc
90# define iswctype __iswctype
91# define wctype __wctype
92
c0f9ea08
SM
93# define WEAK_ALIAS(a,b) weak_alias (a, b)
94
95/* We are also using some library internals. */
96# include <locale/localeinfo.h>
97# include <locale/elem-hash.h>
98# include <langinfo.h>
99#else
100# define WEAK_ALIAS(a,b)
101#endif
102
4bb91c68 103/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 104#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
105# include <libintl.h>
106#else
107# define gettext(msgid) (msgid)
108#endif
109
5e69f11e
RM
110#ifndef gettext_noop
111/* This define is so xgettext can find the internationalizable
112 strings. */
0b32bf0e 113# define gettext_noop(String) String
5e69f11e
RM
114#endif
115
fa9a63c5
RM
116/* The `emacs' switch turns on certain matching commands
117 that make sense only in Emacs. */
118#ifdef emacs
119
0b32bf0e
SM
120# include "lisp.h"
121# include "buffer.h"
b18215fc
RS
122
123/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 124# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 125
0b32bf0e
SM
126# include "syntax.h"
127# include "charset.h"
128# include "category.h"
fa9a63c5 129
7689ef0b
EZ
130# ifdef malloc
131# undef malloc
132# endif
0b32bf0e 133# define malloc xmalloc
7689ef0b
EZ
134# ifdef realloc
135# undef realloc
136# endif
0b32bf0e 137# define realloc xrealloc
7689ef0b
EZ
138# ifdef free
139# undef free
140# endif
0b32bf0e 141# define free xfree
9abbd165 142
7814e705 143/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
144# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
145# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
146
147# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
148# define RE_STRING_CHAR(p, s) \
4e8a9132 149 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
0b32bf0e 150# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
2d1675e4
SM
151 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
152
153/* Set C a (possibly multibyte) character before P. P points into a
154 string which is the virtual concatenation of STR1 (which ends at
155 END1) or STR2 (which ends at END2). */
0b32bf0e 156# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
2d1675e4
SM
157 do { \
158 if (multibyte) \
159 { \
0b32bf0e
SM
160 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
161 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
70806df6
KH
162 re_char *d0 = dtemp; \
163 PREV_CHAR_BOUNDARY (d0, dlimit); \
164 c = STRING_CHAR (d0, dtemp - d0); \
2d1675e4
SM
165 } \
166 else \
167 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
168 } while (0)
169
4e8a9132 170
fa9a63c5
RM
171#else /* not emacs */
172
173/* If we are not linking with Emacs proper,
174 we can't use the relocating allocator
175 even if config.h says that we can. */
0b32bf0e 176# undef REL_ALLOC
fa9a63c5 177
0b32bf0e
SM
178# if defined STDC_HEADERS || defined _LIBC
179# include <stdlib.h>
180# else
fa9a63c5
RM
181char *malloc ();
182char *realloc ();
0b32bf0e 183# endif
fa9a63c5 184
a77f947b
CY
185/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
186
187void *
188xmalloc (size)
189 size_t size;
190{
191 register void *val;
192 val = (void *) malloc (size);
193 if (!val && size)
194 {
195 write (2, "virtual memory exhausted\n", 25);
196 exit (1);
197 }
198 return val;
199}
200
201void *
202xrealloc (block, size)
203 void *block;
204 size_t size;
205{
206 register void *val;
207 /* We must call malloc explicitly when BLOCK is 0, since some
208 reallocs don't do this. */
209 if (! block)
210 val = (void *) malloc (size);
211 else
212 val = (void *) realloc (block, size);
213 if (!val && size)
214 {
215 write (2, "virtual memory exhausted\n", 25);
216 exit (1);
217 }
218 return val;
219}
220
a073faa6
CY
221# ifdef malloc
222# undef malloc
223# endif
224# define malloc xmalloc
225# ifdef realloc
226# undef realloc
227# endif
228# define realloc xrealloc
229
9e4ecb26 230/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 231 If nothing else has been done, use the method below. */
0b32bf0e
SM
232# ifdef INHIBIT_STRING_HEADER
233# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
234# if !defined bzero && !defined bcopy
235# undef INHIBIT_STRING_HEADER
236# endif
237# endif
238# endif
9e4ecb26 239
4bb91c68 240/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
241 This is used in most programs--a few other programs avoid this
242 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
243# ifndef INHIBIT_STRING_HEADER
244# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
245# include <string.h>
0b32bf0e 246# ifndef bzero
4bb91c68
SM
247# ifndef _LIBC
248# define bzero(s, n) (memset (s, '\0', n), (s))
249# else
250# define bzero(s, n) __bzero (s, n)
251# endif
0b32bf0e
SM
252# endif
253# else
254# include <strings.h>
4bb91c68
SM
255# ifndef memcmp
256# define memcmp(s1, s2, n) bcmp (s1, s2, n)
257# endif
258# ifndef memcpy
259# define memcpy(d, s, n) (bcopy (s, d, n), (d))
260# endif
0b32bf0e
SM
261# endif
262# endif
fa9a63c5
RM
263
264/* Define the syntax stuff for \<, \>, etc. */
265
990b2375 266/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 267enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 268
0b32bf0e
SM
269# ifdef SWITCH_ENUM_BUG
270# define SWITCH_ENUM_CAST(x) ((int)(x))
271# else
272# define SWITCH_ENUM_CAST(x) (x)
273# endif
fa9a63c5 274
e934739e 275/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
276# define BASE_LEADING_CODE_P(c) (0)
277# define CHAR_CHARSET(c) 0
278# define CHARSET_LEADING_CODE_BASE(c) 0
279# define MAX_MULTIBYTE_LENGTH 1
280# define RE_MULTIBYTE_P(x) 0
281# define WORD_BOUNDARY_P(c1, c2) (0)
282# define CHAR_HEAD_P(p) (1)
283# define SINGLE_BYTE_CHAR_P(c) (1)
284# define SAME_CHARSET_P(c1, c2) (1)
285# define MULTIBYTE_FORM_LENGTH(p, s) (1)
70806df6 286# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
0b32bf0e
SM
287# define STRING_CHAR(p, s) (*(p))
288# define RE_STRING_CHAR STRING_CHAR
289# define CHAR_STRING(c, s) (*(s) = (c), 1)
290# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
291# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
292# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 293 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
0b32bf0e 294# define MAKE_CHAR(charset, c1, c2) (c1)
fa9a63c5 295#endif /* not emacs */
4e8a9132
SM
296
297#ifndef RE_TRANSLATE
0b32bf0e
SM
298# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
299# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 300#endif
fa9a63c5
RM
301\f
302/* Get the interface, including the syntax bits. */
303#include "regex.h"
304
f71b19b6
DL
305/* isalpha etc. are used for the character classes. */
306#include <ctype.h>
fa9a63c5 307
f71b19b6 308#ifdef emacs
fa9a63c5 309
f71b19b6 310/* 1 if C is an ASCII character. */
0b32bf0e 311# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 312
f71b19b6 313/* 1 if C is a unibyte character. */
0b32bf0e 314# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 315
f71b19b6 316/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 317
f71b19b6 318/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
319# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
320# define ISCNTRL(c) ((c) < ' ')
321# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
322 || ((c) >= 'a' && (c) <= 'f') \
323 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
324
325/* This is only used for single-byte characters. */
0b32bf0e 326# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
327
328/* The rest must handle multibyte characters. */
329
0b32bf0e 330# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 331 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
332 : 1)
333
14473664 334# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 335 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
336 : 1)
337
0b32bf0e 338# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
339 ? (((c) >= 'a' && (c) <= 'z') \
340 || ((c) >= 'A' && (c) <= 'Z') \
341 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
342 : SYNTAX (c) == Sword)
343
0b32bf0e 344# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
345 ? (((c) >= 'a' && (c) <= 'z') \
346 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
347 : SYNTAX (c) == Sword)
348
0b32bf0e 349# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 350
0b32bf0e 351# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
352 ? ((c) > ' ' && (c) < 0177 \
353 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
354 || ((c) >= 'A' && (c) <= 'Z') \
355 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
356 : SYNTAX (c) != Sword)
357
0b32bf0e 358# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 359
0b32bf0e 360# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 361
0b32bf0e 362# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
363
364#else /* not emacs */
365
f71b19b6
DL
366/* Jim Meyering writes:
367
368 "... Some ctype macros are valid only for character codes that
369 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
370 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 371 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
372 STDC_HEADERS is defined, then autoconf has verified that the ctype
373 macros don't need to be guarded with references to isascii. ...
374 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
375 eliminate the && through constant folding."
376 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 377
4bb91c68 378# undef ISASCII
0b32bf0e
SM
379# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
380# define ISASCII(c) 1
381# else
382# define ISASCII(c) isascii(c)
383# endif
f71b19b6
DL
384
385/* 1 if C is an ASCII character. */
0b32bf0e 386# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
387
388/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
389# define ISUNIBYTE(c) 1
390
391# ifdef isblank
392# define ISBLANK(c) (ISASCII (c) && isblank (c))
393# else
394# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
395# endif
396# ifdef isgraph
397# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
398# else
399# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
400# endif
401
4bb91c68 402# undef ISPRINT
0b32bf0e
SM
403# define ISPRINT(c) (ISASCII (c) && isprint (c))
404# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
405# define ISALNUM(c) (ISASCII (c) && isalnum (c))
406# define ISALPHA(c) (ISASCII (c) && isalpha (c))
407# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
408# define ISLOWER(c) (ISASCII (c) && islower (c))
409# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
410# define ISSPACE(c) (ISASCII (c) && isspace (c))
411# define ISUPPER(c) (ISASCII (c) && isupper (c))
412# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
413
414# define ISWORD(c) ISALPHA(c)
415
4bb91c68
SM
416# ifdef _tolower
417# define TOLOWER(c) _tolower(c)
418# else
419# define TOLOWER(c) tolower(c)
420# endif
421
422/* How many characters in the character set. */
423# define CHAR_SET_SIZE 256
424
0b32bf0e 425# ifdef SYNTAX_TABLE
f71b19b6 426
0b32bf0e 427extern char *re_syntax_table;
f71b19b6 428
0b32bf0e
SM
429# else /* not SYNTAX_TABLE */
430
0b32bf0e
SM
431static char re_syntax_table[CHAR_SET_SIZE];
432
433static void
434init_syntax_once ()
435{
436 register int c;
437 static int done = 0;
438
439 if (done)
440 return;
441
442 bzero (re_syntax_table, sizeof re_syntax_table);
443
4bb91c68
SM
444 for (c = 0; c < CHAR_SET_SIZE; ++c)
445 if (ISALNUM (c))
446 re_syntax_table[c] = Sword;
fa9a63c5 447
669fa600 448 re_syntax_table['_'] = Ssymbol;
fa9a63c5 449
0b32bf0e
SM
450 done = 1;
451}
452
453# endif /* not SYNTAX_TABLE */
96cc36cc 454
4bb91c68
SM
455# define SYNTAX(c) re_syntax_table[(c)]
456
96cc36cc
RS
457#endif /* not emacs */
458\f
fa9a63c5 459#ifndef NULL
0b32bf0e 460# define NULL (void *)0
fa9a63c5
RM
461#endif
462
463/* We remove any previous definition of `SIGN_EXTEND_CHAR',
464 since ours (we hope) works properly with all combinations of
465 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 466 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
467#undef SIGN_EXTEND_CHAR
468#if __STDC__
0b32bf0e 469# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
470#else /* not __STDC__ */
471/* As in Harbison and Steele. */
0b32bf0e 472# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
473#endif
474\f
475/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
476 use `alloca' instead of `malloc'. This is because using malloc in
477 re_search* or re_match* could cause memory leaks when C-g is used in
478 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
479 the other hand, malloc is more portable, and easier to debug.
480
fa9a63c5
RM
481 Because we sometimes use alloca, some routines have to be macros,
482 not functions -- `alloca'-allocated space disappears at the end of the
483 function it is called in. */
484
485#ifdef REGEX_MALLOC
486
0b32bf0e
SM
487# define REGEX_ALLOCATE malloc
488# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
489# define REGEX_FREE free
fa9a63c5
RM
490
491#else /* not REGEX_MALLOC */
492
493/* Emacs already defines alloca, sometimes. */
0b32bf0e 494# ifndef alloca
fa9a63c5
RM
495
496/* Make alloca work the best possible way. */
0b32bf0e
SM
497# ifdef __GNUC__
498# define alloca __builtin_alloca
499# else /* not __GNUC__ */
500# if HAVE_ALLOCA_H
501# include <alloca.h>
502# endif /* HAVE_ALLOCA_H */
503# endif /* not __GNUC__ */
fa9a63c5 504
0b32bf0e 505# endif /* not alloca */
fa9a63c5 506
0b32bf0e 507# define REGEX_ALLOCATE alloca
fa9a63c5
RM
508
509/* Assumes a `char *destination' variable. */
0b32bf0e 510# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 511 (destination = (char *) alloca (nsize), \
4bb91c68 512 memcpy (destination, source, osize))
fa9a63c5
RM
513
514/* No need to do anything to free, after alloca. */
0b32bf0e 515# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
516
517#endif /* not REGEX_MALLOC */
518
519/* Define how to allocate the failure stack. */
520
0b32bf0e 521#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 522
0b32bf0e 523# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 524 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 525# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 526 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 527# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
528 r_alloc_free (&failure_stack_ptr)
529
4297555e 530#else /* not using relocating allocator */
fa9a63c5 531
0b32bf0e 532# ifdef REGEX_MALLOC
fa9a63c5 533
0b32bf0e
SM
534# define REGEX_ALLOCATE_STACK malloc
535# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
536# define REGEX_FREE_STACK free
fa9a63c5 537
0b32bf0e 538# else /* not REGEX_MALLOC */
fa9a63c5 539
0b32bf0e 540# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 541
0b32bf0e 542# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 543 REGEX_REALLOCATE (source, osize, nsize)
7814e705 544/* No need to explicitly free anything. */
0b32bf0e 545# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 546
0b32bf0e 547# endif /* not REGEX_MALLOC */
4297555e 548#endif /* not using relocating allocator */
fa9a63c5
RM
549
550
551/* True if `size1' is non-NULL and PTR is pointing anywhere inside
552 `string1' or just past its end. This works if PTR is NULL, which is
553 a good thing. */
25fe55af 554#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
555 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
556
557/* (Re)Allocate N items of type T using malloc, or fail. */
558#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
559#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
560#define RETALLOC_IF(addr, n, t) \
561 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
562#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
563
4bb91c68 564#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
565
566#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
567
568#undef MAX
569#undef MIN
570#define MAX(a, b) ((a) > (b) ? (a) : (b))
571#define MIN(a, b) ((a) < (b) ? (a) : (b))
572
6470ea05
SM
573/* Type of source-pattern and string chars. */
574typedef const unsigned char re_char;
575
fa9a63c5
RM
576typedef char boolean;
577#define false 0
578#define true 1
579
4bb91c68
SM
580static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
581 re_char *string1, int size1,
582 re_char *string2, int size2,
583 int pos,
584 struct re_registers *regs,
585 int stop));
fa9a63c5
RM
586\f
587/* These are the command codes that appear in compiled regular
4bb91c68 588 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
589 command code can specify any interpretation whatsoever for its
590 arguments. Zero bytes may appear in the compiled regular expression. */
591
592typedef enum
593{
594 no_op = 0,
595
4bb91c68 596 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
597 succeed,
598
25fe55af 599 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
600 exactn,
601
25fe55af 602 /* Matches any (more or less) character. */
fa9a63c5
RM
603 anychar,
604
25fe55af
RS
605 /* Matches any one char belonging to specified set. First
606 following byte is number of bitmap bytes. Then come bytes
607 for a bitmap saying which chars are in. Bits in each byte
608 are ordered low-bit-first. A character is in the set if its
609 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
610 automatically not in the set.
611
612 If the length byte has the 0x80 bit set, then that stuff
613 is followed by a range table:
614 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 615 See RANGE_TABLE_WORK_BITS below.
01618498 616 2 bytes, the number of pairs that follow (upto 32767)
96cc36cc 617 pairs, each 2 multibyte characters,
0b32bf0e 618 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
619 charset,
620
25fe55af 621 /* Same parameters as charset, but match any character that is
4bb91c68 622 not one of those specified. */
fa9a63c5
RM
623 charset_not,
624
25fe55af
RS
625 /* Start remembering the text that is matched, for storing in a
626 register. Followed by one byte with the register number, in
627 the range 0 to one less than the pattern buffer's re_nsub
505bde11 628 field. */
fa9a63c5
RM
629 start_memory,
630
25fe55af
RS
631 /* Stop remembering the text that is matched and store it in a
632 memory register. Followed by one byte with the register
633 number, in the range 0 to one less than `re_nsub' in the
505bde11 634 pattern buffer. */
fa9a63c5
RM
635 stop_memory,
636
25fe55af 637 /* Match a duplicate of something remembered. Followed by one
4bb91c68 638 byte containing the register number. */
fa9a63c5
RM
639 duplicate,
640
25fe55af 641 /* Fail unless at beginning of line. */
fa9a63c5
RM
642 begline,
643
4bb91c68 644 /* Fail unless at end of line. */
fa9a63c5
RM
645 endline,
646
25fe55af
RS
647 /* Succeeds if at beginning of buffer (if emacs) or at beginning
648 of string to be matched (if not). */
fa9a63c5
RM
649 begbuf,
650
25fe55af 651 /* Analogously, for end of buffer/string. */
fa9a63c5 652 endbuf,
5e69f11e 653
25fe55af 654 /* Followed by two byte relative address to which to jump. */
5e69f11e 655 jump,
fa9a63c5 656
25fe55af 657 /* Followed by two-byte relative address of place to resume at
7814e705 658 in case of failure. */
fa9a63c5 659 on_failure_jump,
5e69f11e 660
25fe55af
RS
661 /* Like on_failure_jump, but pushes a placeholder instead of the
662 current string position when executed. */
fa9a63c5 663 on_failure_keep_string_jump,
5e69f11e 664
505bde11
SM
665 /* Just like `on_failure_jump', except that it checks that we
666 don't get stuck in an infinite loop (matching an empty string
667 indefinitely). */
668 on_failure_jump_loop,
669
0683b6fa
SM
670 /* Just like `on_failure_jump_loop', except that it checks for
671 a different kind of loop (the kind that shows up with non-greedy
672 operators). This operation has to be immediately preceded
673 by a `no_op'. */
674 on_failure_jump_nastyloop,
675
0b32bf0e 676 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
677 It analyses the loop before which it is put and if the
678 loop does not require backtracking, it changes itself to
4e8a9132
SM
679 `on_failure_keep_string_jump' and short-circuits the loop,
680 else it just defaults to changing itself into `on_failure_jump'.
681 It assumes that it is pointing to just past a `jump'. */
505bde11 682 on_failure_jump_smart,
fa9a63c5 683
25fe55af 684 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
685 After matching N times, jump to the address upon failure.
686 Does not work if N starts at 0: use on_failure_jump_loop
687 instead. */
fa9a63c5
RM
688 succeed_n,
689
25fe55af
RS
690 /* Followed by two-byte relative address, and two-byte number n.
691 Jump to the address N times, then fail. */
fa9a63c5
RM
692 jump_n,
693
25fe55af 694 /* Set the following two-byte relative address to the
7814e705 695 subsequent two-byte number. The address *includes* the two
25fe55af 696 bytes of number. */
fa9a63c5
RM
697 set_number_at,
698
fa9a63c5
RM
699 wordbeg, /* Succeeds if at word beginning. */
700 wordend, /* Succeeds if at word end. */
701
702 wordbound, /* Succeeds if at a word boundary. */
7814e705 703 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 704
669fa600
SM
705 symbeg, /* Succeeds if at symbol beginning. */
706 symend, /* Succeeds if at symbol end. */
707
fa9a63c5 708 /* Matches any character whose syntax is specified. Followed by
25fe55af 709 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
710 syntaxspec,
711
712 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
713 notsyntaxspec
714
715#ifdef emacs
716 ,before_dot, /* Succeeds if before point. */
717 at_dot, /* Succeeds if at point. */
718 after_dot, /* Succeeds if after point. */
b18215fc
RS
719
720 /* Matches any character whose category-set contains the specified
7814e705
JB
721 category. The operator is followed by a byte which contains a
722 category code (mnemonic ASCII character). */
b18215fc
RS
723 categoryspec,
724
725 /* Matches any character whose category-set does not contain the
726 specified category. The operator is followed by a byte which
727 contains the category code (mnemonic ASCII character). */
728 notcategoryspec
fa9a63c5
RM
729#endif /* emacs */
730} re_opcode_t;
731\f
732/* Common operations on the compiled pattern. */
733
734/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
735
736#define STORE_NUMBER(destination, number) \
737 do { \
738 (destination)[0] = (number) & 0377; \
739 (destination)[1] = (number) >> 8; \
740 } while (0)
741
742/* Same as STORE_NUMBER, except increment DESTINATION to
743 the byte after where the number is stored. Therefore, DESTINATION
744 must be an lvalue. */
745
746#define STORE_NUMBER_AND_INCR(destination, number) \
747 do { \
748 STORE_NUMBER (destination, number); \
749 (destination) += 2; \
750 } while (0)
751
752/* Put into DESTINATION a number stored in two contiguous bytes starting
753 at SOURCE. */
754
755#define EXTRACT_NUMBER(destination, source) \
756 do { \
757 (destination) = *(source) & 0377; \
758 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
759 } while (0)
760
761#ifdef DEBUG
4bb91c68 762static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
763static void
764extract_number (dest, source)
765 int *dest;
01618498 766 re_char *source;
fa9a63c5 767{
5e69f11e 768 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
769 *dest = *source & 0377;
770 *dest += temp << 8;
771}
772
4bb91c68 773# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
774# undef EXTRACT_NUMBER
775# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
776# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
777
778#endif /* DEBUG */
779
780/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
781 SOURCE must be an lvalue. */
782
783#define EXTRACT_NUMBER_AND_INCR(destination, source) \
784 do { \
785 EXTRACT_NUMBER (destination, source); \
25fe55af 786 (source) += 2; \
fa9a63c5
RM
787 } while (0)
788
789#ifdef DEBUG
4bb91c68
SM
790static void extract_number_and_incr _RE_ARGS ((int *destination,
791 re_char **source));
fa9a63c5
RM
792static void
793extract_number_and_incr (destination, source)
794 int *destination;
01618498 795 re_char **source;
5e69f11e 796{
fa9a63c5
RM
797 extract_number (destination, *source);
798 *source += 2;
799}
800
0b32bf0e
SM
801# ifndef EXTRACT_MACROS
802# undef EXTRACT_NUMBER_AND_INCR
803# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 804 extract_number_and_incr (&dest, &src)
0b32bf0e 805# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
806
807#endif /* DEBUG */
808\f
b18215fc
RS
809/* Store a multibyte character in three contiguous bytes starting
810 DESTINATION, and increment DESTINATION to the byte after where the
7814e705 811 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
812
813#define STORE_CHARACTER_AND_INCR(destination, character) \
814 do { \
815 (destination)[0] = (character) & 0377; \
816 (destination)[1] = ((character) >> 8) & 0377; \
817 (destination)[2] = (character) >> 16; \
818 (destination) += 3; \
819 } while (0)
820
821/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 822 starting at SOURCE. */
b18215fc
RS
823
824#define EXTRACT_CHARACTER(destination, source) \
825 do { \
826 (destination) = ((source)[0] \
827 | ((source)[1] << 8) \
828 | ((source)[2] << 16)); \
829 } while (0)
830
831
832/* Macros for charset. */
833
834/* Size of bitmap of charset P in bytes. P is a start of charset,
835 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
836#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
837
838/* Nonzero if charset P has range table. */
25fe55af 839#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
840
841/* Return the address of range table of charset P. But not the start
842 of table itself, but the before where the number of ranges is
96cc36cc
RS
843 stored. `2 +' means to skip re_opcode_t and size of bitmap,
844 and the 2 bytes of flags at the start of the range table. */
845#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
846
847/* Extract the bit flags that start a range table. */
848#define CHARSET_RANGE_TABLE_BITS(p) \
849 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
850 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
851
852/* Test if C is listed in the bitmap of charset P. */
853#define CHARSET_LOOKUP_BITMAP(p, c) \
854 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
855 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
856
857/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
858 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
859 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
860 and end. */
861#define CHARSET_RANGE_TABLE_END(range_table, count) \
862 ((range_table) + (count) * 2 * 3)
863
7814e705 864/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
865 COUNT is number of ranges in RANGE_TABLE. */
866#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
867 do \
868 { \
01618498
SM
869 re_wchar_t range_start, range_end; \
870 re_char *p; \
871 re_char *range_table_end \
b18215fc
RS
872 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
873 \
874 for (p = (range_table); p < range_table_end; p += 2 * 3) \
875 { \
876 EXTRACT_CHARACTER (range_start, p); \
877 EXTRACT_CHARACTER (range_end, p + 3); \
878 \
879 if (range_start <= (c) && (c) <= range_end) \
880 { \
881 (not) = !(not); \
882 break; \
883 } \
884 } \
885 } \
886 while (0)
887
888/* Test if C is in range table of CHARSET. The flag NOT is negated if
889 C is listed in it. */
890#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
891 do \
892 { \
893 /* Number of ranges in range table. */ \
894 int count; \
01618498
SM
895 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
896 \
b18215fc
RS
897 EXTRACT_NUMBER_AND_INCR (count, range_table); \
898 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
899 } \
900 while (0)
901\f
fa9a63c5
RM
902/* If DEBUG is defined, Regex prints many voluminous messages about what
903 it is doing (if the variable `debug' is nonzero). If linked with the
904 main program in `iregex.c', you can enter patterns and strings
905 interactively. And if linked with the main program in `main.c' and
4bb91c68 906 the other test files, you can run the already-written tests. */
fa9a63c5
RM
907
908#ifdef DEBUG
909
910/* We use standard I/O for debugging. */
0b32bf0e 911# include <stdio.h>
fa9a63c5
RM
912
913/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 914# include <assert.h>
fa9a63c5 915
99633e97 916static int debug = -100000;
fa9a63c5 917
0b32bf0e
SM
918# define DEBUG_STATEMENT(e) e
919# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
920# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
921# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
922# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
923# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 924 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 925# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 926 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
927
928
929/* Print the fastmap in human-readable form. */
930
931void
932print_fastmap (fastmap)
933 char *fastmap;
934{
935 unsigned was_a_range = 0;
5e69f11e
RM
936 unsigned i = 0;
937
fa9a63c5
RM
938 while (i < (1 << BYTEWIDTH))
939 {
940 if (fastmap[i++])
941 {
942 was_a_range = 0;
25fe55af
RS
943 putchar (i - 1);
944 while (i < (1 << BYTEWIDTH) && fastmap[i])
945 {
946 was_a_range = 1;
947 i++;
948 }
fa9a63c5 949 if (was_a_range)
25fe55af
RS
950 {
951 printf ("-");
952 putchar (i - 1);
953 }
954 }
fa9a63c5 955 }
5e69f11e 956 putchar ('\n');
fa9a63c5
RM
957}
958
959
960/* Print a compiled pattern string in human-readable form, starting at
961 the START pointer into it and ending just before the pointer END. */
962
963void
964print_partial_compiled_pattern (start, end)
01618498
SM
965 re_char *start;
966 re_char *end;
fa9a63c5
RM
967{
968 int mcnt, mcnt2;
01618498
SM
969 re_char *p = start;
970 re_char *pend = end;
fa9a63c5
RM
971
972 if (start == NULL)
973 {
a1a052df 974 fprintf (stderr, "(null)\n");
fa9a63c5
RM
975 return;
976 }
5e69f11e 977
fa9a63c5
RM
978 /* Loop over pattern commands. */
979 while (p < pend)
980 {
a1a052df 981 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
982
983 switch ((re_opcode_t) *p++)
984 {
25fe55af 985 case no_op:
a1a052df 986 fprintf (stderr, "/no_op");
25fe55af 987 break;
fa9a63c5 988
99633e97 989 case succeed:
a1a052df 990 fprintf (stderr, "/succeed");
99633e97
SM
991 break;
992
fa9a63c5
RM
993 case exactn:
994 mcnt = *p++;
a1a052df 995 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 996 do
fa9a63c5 997 {
a1a052df 998 fprintf (stderr, "/%c", *p++);
25fe55af
RS
999 }
1000 while (--mcnt);
1001 break;
fa9a63c5
RM
1002
1003 case start_memory:
a1a052df 1004 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1005 break;
fa9a63c5
RM
1006
1007 case stop_memory:
a1a052df 1008 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1009 break;
fa9a63c5
RM
1010
1011 case duplicate:
a1a052df 1012 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1013 break;
1014
1015 case anychar:
a1a052df 1016 fprintf (stderr, "/anychar");
fa9a63c5
RM
1017 break;
1018
1019 case charset:
25fe55af
RS
1020 case charset_not:
1021 {
1022 register int c, last = -100;
fa9a63c5 1023 register int in_range = 0;
99633e97
SM
1024 int length = CHARSET_BITMAP_SIZE (p - 1);
1025 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1026
a1a052df 1027 fprintf (stderr, "/charset [%s",
275a3409 1028 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1029
275a3409
RS
1030 if (p + *p >= pend)
1031 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1032
25fe55af 1033 for (c = 0; c < 256; c++)
96cc36cc 1034 if (c / 8 < length
fa9a63c5
RM
1035 && (p[1 + (c/8)] & (1 << (c % 8))))
1036 {
1037 /* Are we starting a range? */
1038 if (last + 1 == c && ! in_range)
1039 {
a1a052df 1040 fprintf (stderr, "-");
fa9a63c5
RM
1041 in_range = 1;
1042 }
1043 /* Have we broken a range? */
1044 else if (last + 1 != c && in_range)
96cc36cc 1045 {
a1a052df 1046 fprintf (stderr, "%c", last);
fa9a63c5
RM
1047 in_range = 0;
1048 }
5e69f11e 1049
fa9a63c5 1050 if (! in_range)
a1a052df 1051 fprintf (stderr, "%c", c);
fa9a63c5
RM
1052
1053 last = c;
25fe55af 1054 }
fa9a63c5
RM
1055
1056 if (in_range)
a1a052df 1057 fprintf (stderr, "%c", last);
fa9a63c5 1058
a1a052df 1059 fprintf (stderr, "]");
fa9a63c5 1060
99633e97 1061 p += 1 + length;
96cc36cc 1062
96cc36cc 1063 if (has_range_table)
99633e97
SM
1064 {
1065 int count;
a1a052df 1066 fprintf (stderr, "has-range-table");
99633e97
SM
1067
1068 /* ??? Should print the range table; for now, just skip it. */
1069 p += 2; /* skip range table bits */
1070 EXTRACT_NUMBER_AND_INCR (count, p);
1071 p = CHARSET_RANGE_TABLE_END (p, count);
1072 }
fa9a63c5
RM
1073 }
1074 break;
1075
1076 case begline:
a1a052df 1077 fprintf (stderr, "/begline");
25fe55af 1078 break;
fa9a63c5
RM
1079
1080 case endline:
a1a052df 1081 fprintf (stderr, "/endline");
25fe55af 1082 break;
fa9a63c5
RM
1083
1084 case on_failure_jump:
25fe55af 1085 extract_number_and_incr (&mcnt, &p);
a1a052df 1086 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1087 break;
fa9a63c5
RM
1088
1089 case on_failure_keep_string_jump:
25fe55af 1090 extract_number_and_incr (&mcnt, &p);
a1a052df 1091 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1092 break;
fa9a63c5 1093
0683b6fa
SM
1094 case on_failure_jump_nastyloop:
1095 extract_number_and_incr (&mcnt, &p);
a1a052df 1096 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1097 break;
1098
505bde11 1099 case on_failure_jump_loop:
fa9a63c5 1100 extract_number_and_incr (&mcnt, &p);
a1a052df 1101 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1102 break;
1103
505bde11 1104 case on_failure_jump_smart:
fa9a63c5 1105 extract_number_and_incr (&mcnt, &p);
a1a052df 1106 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1107 break;
1108
25fe55af 1109 case jump:
fa9a63c5 1110 extract_number_and_incr (&mcnt, &p);
a1a052df 1111 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1112 break;
1113
25fe55af
RS
1114 case succeed_n:
1115 extract_number_and_incr (&mcnt, &p);
1116 extract_number_and_incr (&mcnt2, &p);
a1a052df 1117 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1118 break;
5e69f11e 1119
25fe55af
RS
1120 case jump_n:
1121 extract_number_and_incr (&mcnt, &p);
1122 extract_number_and_incr (&mcnt2, &p);
a1a052df 1123 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1124 break;
5e69f11e 1125
25fe55af
RS
1126 case set_number_at:
1127 extract_number_and_incr (&mcnt, &p);
1128 extract_number_and_incr (&mcnt2, &p);
a1a052df 1129 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1130 break;
5e69f11e 1131
25fe55af 1132 case wordbound:
a1a052df 1133 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1134 break;
1135
1136 case notwordbound:
a1a052df 1137 fprintf (stderr, "/notwordbound");
25fe55af 1138 break;
fa9a63c5
RM
1139
1140 case wordbeg:
a1a052df 1141 fprintf (stderr, "/wordbeg");
fa9a63c5 1142 break;
5e69f11e 1143
fa9a63c5 1144 case wordend:
a1a052df 1145 fprintf (stderr, "/wordend");
e2543b02 1146 break;
5e69f11e 1147
669fa600 1148 case symbeg:
e2543b02 1149 fprintf (stderr, "/symbeg");
669fa600
SM
1150 break;
1151
1152 case symend:
e2543b02 1153 fprintf (stderr, "/symend");
669fa600
SM
1154 break;
1155
1fb352e0 1156 case syntaxspec:
a1a052df 1157 fprintf (stderr, "/syntaxspec");
1fb352e0 1158 mcnt = *p++;
a1a052df 1159 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1160 break;
1161
1162 case notsyntaxspec:
a1a052df 1163 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1164 mcnt = *p++;
a1a052df 1165 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1166 break;
1167
0b32bf0e 1168# ifdef emacs
fa9a63c5 1169 case before_dot:
a1a052df 1170 fprintf (stderr, "/before_dot");
25fe55af 1171 break;
fa9a63c5
RM
1172
1173 case at_dot:
a1a052df 1174 fprintf (stderr, "/at_dot");
25fe55af 1175 break;
fa9a63c5
RM
1176
1177 case after_dot:
a1a052df 1178 fprintf (stderr, "/after_dot");
25fe55af 1179 break;
fa9a63c5 1180
1fb352e0 1181 case categoryspec:
a1a052df 1182 fprintf (stderr, "/categoryspec");
fa9a63c5 1183 mcnt = *p++;
a1a052df 1184 fprintf (stderr, "/%d", mcnt);
25fe55af 1185 break;
5e69f11e 1186
1fb352e0 1187 case notcategoryspec:
a1a052df 1188 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1189 mcnt = *p++;
a1a052df 1190 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1191 break;
0b32bf0e 1192# endif /* emacs */
fa9a63c5 1193
fa9a63c5 1194 case begbuf:
a1a052df 1195 fprintf (stderr, "/begbuf");
25fe55af 1196 break;
fa9a63c5
RM
1197
1198 case endbuf:
a1a052df 1199 fprintf (stderr, "/endbuf");
25fe55af 1200 break;
fa9a63c5 1201
25fe55af 1202 default:
a1a052df 1203 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1204 }
1205
a1a052df 1206 fprintf (stderr, "\n");
fa9a63c5
RM
1207 }
1208
a1a052df 1209 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1210}
1211
1212
1213void
1214print_compiled_pattern (bufp)
1215 struct re_pattern_buffer *bufp;
1216{
01618498 1217 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1218
1219 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1220 printf ("%ld bytes used/%ld bytes allocated.\n",
1221 bufp->used, bufp->allocated);
fa9a63c5
RM
1222
1223 if (bufp->fastmap_accurate && bufp->fastmap)
1224 {
1225 printf ("fastmap: ");
1226 print_fastmap (bufp->fastmap);
1227 }
1228
1229 printf ("re_nsub: %d\t", bufp->re_nsub);
1230 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1231 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1232 printf ("no_sub: %d\t", bufp->no_sub);
1233 printf ("not_bol: %d\t", bufp->not_bol);
1234 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1235 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1236 fflush (stdout);
fa9a63c5
RM
1237 /* Perhaps we should print the translate table? */
1238}
1239
1240
1241void
1242print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1243 re_char *where;
1244 re_char *string1;
1245 re_char *string2;
fa9a63c5
RM
1246 int size1;
1247 int size2;
1248{
4bb91c68 1249 int this_char;
5e69f11e 1250
fa9a63c5
RM
1251 if (where == NULL)
1252 printf ("(null)");
1253 else
1254 {
1255 if (FIRST_STRING_P (where))
25fe55af
RS
1256 {
1257 for (this_char = where - string1; this_char < size1; this_char++)
1258 putchar (string1[this_char]);
fa9a63c5 1259
25fe55af
RS
1260 where = string2;
1261 }
fa9a63c5
RM
1262
1263 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1264 putchar (string2[this_char]);
fa9a63c5
RM
1265 }
1266}
1267
1268#else /* not DEBUG */
1269
0b32bf0e
SM
1270# undef assert
1271# define assert(e)
fa9a63c5 1272
0b32bf0e
SM
1273# define DEBUG_STATEMENT(e)
1274# define DEBUG_PRINT1(x)
1275# define DEBUG_PRINT2(x1, x2)
1276# define DEBUG_PRINT3(x1, x2, x3)
1277# define DEBUG_PRINT4(x1, x2, x3, x4)
1278# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1279# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1280
1281#endif /* not DEBUG */
1282\f
1283/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1284 also be assigned to arbitrarily: each pattern buffer stores its own
1285 syntax, so it can be changed between regex compilations. */
1286/* This has no initializer because initialized variables in Emacs
1287 become read-only after dumping. */
1288reg_syntax_t re_syntax_options;
1289
1290
1291/* Specify the precise syntax of regexps for compilation. This provides
1292 for compatibility for various utilities which historically have
1293 different, incompatible syntaxes.
1294
1295 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1296 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1297
1298reg_syntax_t
1299re_set_syntax (syntax)
f9b0fd99 1300 reg_syntax_t syntax;
fa9a63c5
RM
1301{
1302 reg_syntax_t ret = re_syntax_options;
5e69f11e 1303
fa9a63c5
RM
1304 re_syntax_options = syntax;
1305 return ret;
1306}
c0f9ea08 1307WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1308
1309/* Regexp to use to replace spaces, or NULL meaning don't. */
1310static re_char *whitespace_regexp;
1311
1312void
1313re_set_whitespace_regexp (regexp)
6470ea05 1314 const char *regexp;
f9b0fd99 1315{
6470ea05 1316 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1317}
1318WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1319\f
1320/* This table gives an error message for each of the error codes listed
4bb91c68 1321 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1322 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1323 but why not be nice? */
fa9a63c5
RM
1324
1325static const char *re_error_msgid[] =
5e69f11e
RM
1326 {
1327 gettext_noop ("Success"), /* REG_NOERROR */
1328 gettext_noop ("No match"), /* REG_NOMATCH */
1329 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1330 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1331 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1332 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1333 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1334 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1335 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1336 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1337 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1338 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1339 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1340 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1341 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1342 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1343 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1344 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1345 };
1346\f
4bb91c68 1347/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1348
1349/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1350 searching and matching functions should not call alloca. On some
1351 systems, alloca is implemented in terms of malloc, and if we're
1352 using the relocating allocator routines, then malloc could cause a
1353 relocation, which might (if the strings being searched are in the
1354 ralloc heap) shift the data out from underneath the regexp
1355 routines.
1356
5e69f11e 1357 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1358 processes input from X in a signal handler; processing X input may
1359 call malloc; if input arrives while a matching routine is calling
1360 malloc, then we're scrod. But Emacs can't just block input while
1361 calling matching routines; then we don't notice interrupts when
1362 they come in. So, Emacs blocks input around all regexp calls
1363 except the matching calls, which it leaves unprotected, in the
1364 faith that they will not malloc. */
1365
1366/* Normally, this is fine. */
1367#define MATCH_MAY_ALLOCATE
1368
1369/* When using GNU C, we are not REALLY using the C alloca, no matter
1370 what config.h may say. So don't take precautions for it. */
1371#ifdef __GNUC__
0b32bf0e 1372# undef C_ALLOCA
fa9a63c5
RM
1373#endif
1374
1375/* The match routines may not allocate if (1) they would do it with malloc
1376 and (2) it's not safe for them to use malloc.
1377 Note that if REL_ALLOC is defined, matching would not use malloc for the
1378 failure stack, but we would still use it for the register vectors;
4bb91c68 1379 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1380#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1381# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1382#endif
1383
1384\f
1385/* Failure stack declarations and macros; both re_compile_fastmap and
1386 re_match_2 use a failure stack. These have to be macros because of
1387 REGEX_ALLOCATE_STACK. */
5e69f11e 1388
fa9a63c5 1389
320a2a73 1390/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1391 when matching. If this number is exceeded, we allocate more
1392 space, so it is not a hard limit. */
1393#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1394# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1395#endif
1396
1397/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1398 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1399 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1400 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1401 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1402# if defined MATCH_MAY_ALLOCATE
1403/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1404 whose default stack limit is 2mb. In order for a larger
1405 value to work reliably, you have to try to make it accord
1406 with the process stack limit. */
c0f9ea08
SM
1407size_t re_max_failures = 40000;
1408# else
1409size_t re_max_failures = 4000;
1410# endif
fa9a63c5
RM
1411
1412union fail_stack_elt
1413{
01618498 1414 re_char *pointer;
c0f9ea08
SM
1415 /* This should be the biggest `int' that's no bigger than a pointer. */
1416 long integer;
fa9a63c5
RM
1417};
1418
1419typedef union fail_stack_elt fail_stack_elt_t;
1420
1421typedef struct
1422{
1423 fail_stack_elt_t *stack;
c0f9ea08
SM
1424 size_t size;
1425 size_t avail; /* Offset of next open position. */
1426 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1427} fail_stack_type;
1428
505bde11 1429#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1430#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1431
1432
1433/* Define macros to initialize and free the failure stack.
1434 Do `return -2' if the alloc fails. */
1435
1436#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1437# define INIT_FAIL_STACK() \
fa9a63c5
RM
1438 do { \
1439 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1440 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1441 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1442 \
1443 if (fail_stack.stack == NULL) \
1444 return -2; \
1445 \
1446 fail_stack.size = INIT_FAILURE_ALLOC; \
1447 fail_stack.avail = 0; \
505bde11 1448 fail_stack.frame = 0; \
fa9a63c5
RM
1449 } while (0)
1450
0b32bf0e 1451# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1452#else
0b32bf0e 1453# define INIT_FAIL_STACK() \
fa9a63c5
RM
1454 do { \
1455 fail_stack.avail = 0; \
505bde11 1456 fail_stack.frame = 0; \
fa9a63c5
RM
1457 } while (0)
1458
0b32bf0e 1459# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1460#endif
1461
1462
320a2a73
KH
1463/* Double the size of FAIL_STACK, up to a limit
1464 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1465
1466 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1467 allocating space for it or it was already too large.
1468
4bb91c68 1469 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1470
320a2a73
KH
1471/* Factor to increase the failure stack size by
1472 when we increase it.
1473 This used to be 2, but 2 was too wasteful
1474 because the old discarded stacks added up to as much space
1475 were as ultimate, maximum-size stack. */
1476#define FAIL_STACK_GROWTH_FACTOR 4
1477
1478#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1479 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1480 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1481 ? 0 \
320a2a73
KH
1482 : ((fail_stack).stack \
1483 = (fail_stack_elt_t *) \
25fe55af
RS
1484 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1485 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1486 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1487 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1488 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1489 \
1490 (fail_stack).stack == NULL \
1491 ? 0 \
6453db45
KH
1492 : ((fail_stack).size \
1493 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1494 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1495 * FAIL_STACK_GROWTH_FACTOR)) \
1496 / sizeof (fail_stack_elt_t)), \
25fe55af 1497 1)))
fa9a63c5
RM
1498
1499
fa9a63c5
RM
1500/* Push a pointer value onto the failure stack.
1501 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1502 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1503#define PUSH_FAILURE_POINTER(item) \
01618498 1504 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1505
1506/* This pushes an integer-valued item onto the failure stack.
1507 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1508 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1509#define PUSH_FAILURE_INT(item) \
1510 fail_stack.stack[fail_stack.avail++].integer = (item)
1511
1512/* Push a fail_stack_elt_t value onto the failure stack.
1513 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1514 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1515#define PUSH_FAILURE_ELT(item) \
1516 fail_stack.stack[fail_stack.avail++] = (item)
1517
1518/* These three POP... operations complement the three PUSH... operations.
1519 All assume that `fail_stack' is nonempty. */
1520#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1521#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1522#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1523
505bde11
SM
1524/* Individual items aside from the registers. */
1525#define NUM_NONREG_ITEMS 3
1526
1527/* Used to examine the stack (to detect infinite loops). */
1528#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1529#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1530#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1531#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1532
1533
505bde11
SM
1534#define ENSURE_FAIL_STACK(space) \
1535while (REMAINING_AVAIL_SLOTS <= space) { \
1536 if (!GROW_FAIL_STACK (fail_stack)) \
1537 return -2; \
1538 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1539 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1540}
1541
1542/* Push register NUM onto the stack. */
1543#define PUSH_FAILURE_REG(num) \
1544do { \
1545 char *destination; \
1546 ENSURE_FAIL_STACK(3); \
1547 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1548 num, regstart[num], regend[num]); \
1549 PUSH_FAILURE_POINTER (regstart[num]); \
1550 PUSH_FAILURE_POINTER (regend[num]); \
1551 PUSH_FAILURE_INT (num); \
1552} while (0)
1553
01618498
SM
1554/* Change the counter's value to VAL, but make sure that it will
1555 be reset when backtracking. */
1556#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1557do { \
1558 char *destination; \
1559 int c; \
1560 ENSURE_FAIL_STACK(3); \
1561 EXTRACT_NUMBER (c, ptr); \
01618498 1562 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1563 PUSH_FAILURE_INT (c); \
1564 PUSH_FAILURE_POINTER (ptr); \
1565 PUSH_FAILURE_INT (-1); \
01618498 1566 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1567} while (0)
1568
505bde11 1569/* Pop a saved register off the stack. */
dc1e502d 1570#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1571do { \
1572 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1573 if (reg == -1) \
1574 { \
1575 /* It's a counter. */ \
6dcf2d0e
SM
1576 /* Here, we discard `const', making re_match non-reentrant. */ \
1577 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1578 reg = POP_FAILURE_INT (); \
1579 STORE_NUMBER (ptr, reg); \
1580 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1581 } \
1582 else \
1583 { \
1584 regend[reg] = POP_FAILURE_POINTER (); \
1585 regstart[reg] = POP_FAILURE_POINTER (); \
1586 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1587 reg, regstart[reg], regend[reg]); \
1588 } \
505bde11
SM
1589} while (0)
1590
1591/* Check that we are not stuck in an infinite loop. */
1592#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1593do { \
f6df485f 1594 int failure = TOP_FAILURE_HANDLE (); \
505bde11 1595 /* Check for infinite matching loops */ \
f6df485f
RS
1596 while (failure > 0 \
1597 && (FAILURE_STR (failure) == string_place \
1598 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1599 { \
1600 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1601 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1602 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1603 { \
6df42991
SM
1604 cycle = 1; \
1605 break; \
f6df485f 1606 } \
66f0296e 1607 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1608 failure = NEXT_FAILURE_HANDLE(failure); \
1609 } \
1610 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1611} while (0)
6df42991 1612
fa9a63c5 1613/* Push the information about the state we will need
5e69f11e
RM
1614 if we ever fail back to it.
1615
505bde11 1616 Requires variables fail_stack, regstart, regend and
320a2a73 1617 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1618 declared.
5e69f11e 1619
fa9a63c5
RM
1620 Does `return FAILURE_CODE' if runs out of memory. */
1621
505bde11
SM
1622#define PUSH_FAILURE_POINT(pattern, string_place) \
1623do { \
1624 char *destination; \
1625 /* Must be int, so when we don't save any registers, the arithmetic \
1626 of 0 + -1 isn't done as unsigned. */ \
1627 \
505bde11 1628 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1629 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1630 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1631 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1632 \
1633 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1634 \
1635 DEBUG_PRINT1 ("\n"); \
1636 \
1637 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1638 PUSH_FAILURE_INT (fail_stack.frame); \
1639 \
1640 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1641 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1642 DEBUG_PRINT1 ("'\n"); \
1643 PUSH_FAILURE_POINTER (string_place); \
1644 \
1645 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1646 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1647 PUSH_FAILURE_POINTER (pattern); \
1648 \
1649 /* Close the frame by moving the frame pointer past it. */ \
1650 fail_stack.frame = fail_stack.avail; \
1651} while (0)
fa9a63c5 1652
320a2a73
KH
1653/* Estimate the size of data pushed by a typical failure stack entry.
1654 An estimate is all we need, because all we use this for
1655 is to choose a limit for how big to make the failure stack. */
ada30c0e 1656/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1657#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1658
fa9a63c5
RM
1659/* How many items can still be added to the stack without overflowing it. */
1660#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1661
1662
1663/* Pops what PUSH_FAIL_STACK pushes.
1664
1665 We restore into the parameters, all of which should be lvalues:
1666 STR -- the saved data position.
1667 PAT -- the saved pattern position.
fa9a63c5 1668 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1669
fa9a63c5 1670 Also assumes the variables `fail_stack' and (if debugging), `bufp',
7814e705 1671 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1672
505bde11
SM
1673#define POP_FAILURE_POINT(str, pat) \
1674do { \
fa9a63c5
RM
1675 assert (!FAIL_STACK_EMPTY ()); \
1676 \
1677 /* Remove failure points and point to how many regs pushed. */ \
1678 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1679 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1680 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1681 \
505bde11
SM
1682 /* Pop the saved registers. */ \
1683 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1684 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1685 \
01618498 1686 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1687 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1688 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1689 \
1690 /* If the saved string location is NULL, it came from an \
1691 on_failure_keep_string_jump opcode, and we want to throw away the \
1692 saved NULL, thus retaining our current position in the string. */ \
01618498 1693 str = POP_FAILURE_POINTER (); \
505bde11 1694 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1695 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1696 DEBUG_PRINT1 ("'\n"); \
1697 \
505bde11
SM
1698 fail_stack.frame = POP_FAILURE_INT (); \
1699 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1700 \
505bde11
SM
1701 assert (fail_stack.avail >= 0); \
1702 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1703 \
fa9a63c5 1704 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1705} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1706
1707
1708\f
fa9a63c5 1709/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1710#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1711\f
1712/* Subroutine declarations and macros for regex_compile. */
1713
4bb91c68
SM
1714static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1715 reg_syntax_t syntax,
1716 struct re_pattern_buffer *bufp));
1717static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1718static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1719 int arg1, int arg2));
1720static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1721 int arg, unsigned char *end));
1722static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1723 int arg1, int arg2, unsigned char *end));
01618498
SM
1724static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1725 re_char *p,
4bb91c68 1726 reg_syntax_t syntax));
01618498
SM
1727static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1728 re_char *pend,
4bb91c68 1729 reg_syntax_t syntax));
01618498
SM
1730static re_char *skip_one_char _RE_ARGS ((re_char *p));
1731static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1732 char *fastmap, const int multibyte));
fa9a63c5 1733
fa9a63c5 1734/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1735 translation. */
36595814 1736#define PATFETCH(c) \
2d1675e4
SM
1737 do { \
1738 int len; \
1739 if (p == pend) return REG_EEND; \
1740 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1741 p += len; \
fa9a63c5
RM
1742 } while (0)
1743
fa9a63c5
RM
1744
1745/* If `translate' is non-null, return translate[D], else just D. We
1746 cast the subscript to translate because some data is declared as
1747 `char *', to avoid warnings when a string constant is passed. But
1748 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1749#ifndef TRANSLATE
0b32bf0e 1750# define TRANSLATE(d) \
66f0296e 1751 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1752#endif
fa9a63c5
RM
1753
1754
1755/* Macros for outputting the compiled pattern into `buffer'. */
1756
1757/* If the buffer isn't allocated when it comes in, use this. */
1758#define INIT_BUF_SIZE 32
1759
4bb91c68 1760/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1761#define GET_BUFFER_SPACE(n) \
01618498 1762 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1763 EXTEND_BUFFER ()
1764
1765/* Make sure we have one more byte of buffer space and then add C to it. */
1766#define BUF_PUSH(c) \
1767 do { \
1768 GET_BUFFER_SPACE (1); \
1769 *b++ = (unsigned char) (c); \
1770 } while (0)
1771
1772
1773/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1774#define BUF_PUSH_2(c1, c2) \
1775 do { \
1776 GET_BUFFER_SPACE (2); \
1777 *b++ = (unsigned char) (c1); \
1778 *b++ = (unsigned char) (c2); \
1779 } while (0)
1780
1781
4bb91c68 1782/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1783#define BUF_PUSH_3(c1, c2, c3) \
1784 do { \
1785 GET_BUFFER_SPACE (3); \
1786 *b++ = (unsigned char) (c1); \
1787 *b++ = (unsigned char) (c2); \
1788 *b++ = (unsigned char) (c3); \
1789 } while (0)
1790
1791
1792/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1793 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1794#define STORE_JUMP(op, loc, to) \
1795 store_op1 (op, loc, (to) - (loc) - 3)
1796
1797/* Likewise, for a two-argument jump. */
1798#define STORE_JUMP2(op, loc, to, arg) \
1799 store_op2 (op, loc, (to) - (loc) - 3, arg)
1800
4bb91c68 1801/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1802#define INSERT_JUMP(op, loc, to) \
1803 insert_op1 (op, loc, (to) - (loc) - 3, b)
1804
1805/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1806#define INSERT_JUMP2(op, loc, to, arg) \
1807 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1808
1809
1810/* This is not an arbitrary limit: the arguments which represent offsets
275a3409 1811 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1812 be too small, many things would have to change. */
275a3409
RS
1813# define MAX_BUF_SIZE (1L << 15)
1814
1815#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1816/* Any other compiler which, like MSC, has allocation limit below 2^16
1817 bytes will have to use approach similar to what was done below for
1818 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1819 reallocating to 0 bytes. Such thing is not going to work too well.
1820 You have been warned!! */
1821#if defined _MSC_VER && !defined WIN32
1822/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1823# define MAX_BUF_SIZE 65500L
1824#else
1825# define MAX_BUF_SIZE (1L << 16)
1826#endif
275a3409 1827#endif /* 0 */
fa9a63c5
RM
1828
1829/* Extend the buffer by twice its current size via realloc and
1830 reset the pointers that pointed into the old block to point to the
1831 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1832 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1833#if __BOUNDED_POINTERS__
1834# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1835# define MOVE_BUFFER_POINTER(P) \
1836 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1837# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1838 else \
1839 { \
1840 SET_HIGH_BOUND (b); \
1841 SET_HIGH_BOUND (begalt); \
1842 if (fixup_alt_jump) \
1843 SET_HIGH_BOUND (fixup_alt_jump); \
1844 if (laststart) \
1845 SET_HIGH_BOUND (laststart); \
1846 if (pending_exact) \
1847 SET_HIGH_BOUND (pending_exact); \
1848 }
1849#else
1850# define MOVE_BUFFER_POINTER(P) (P) += incr
1851# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1852#endif
fa9a63c5 1853#define EXTEND_BUFFER() \
25fe55af 1854 do { \
01618498 1855 re_char *old_buffer = bufp->buffer; \
25fe55af 1856 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1857 return REG_ESIZE; \
1858 bufp->allocated <<= 1; \
1859 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1860 bufp->allocated = MAX_BUF_SIZE; \
01618498 1861 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1862 if (bufp->buffer == NULL) \
1863 return REG_ESPACE; \
1864 /* If the buffer moved, move all the pointers into it. */ \
1865 if (old_buffer != bufp->buffer) \
1866 { \
4bb91c68
SM
1867 int incr = bufp->buffer - old_buffer; \
1868 MOVE_BUFFER_POINTER (b); \
1869 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1870 if (fixup_alt_jump) \
4bb91c68 1871 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1872 if (laststart) \
4bb91c68 1873 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1874 if (pending_exact) \
4bb91c68 1875 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1876 } \
4bb91c68 1877 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1878 } while (0)
1879
1880
1881/* Since we have one byte reserved for the register number argument to
1882 {start,stop}_memory, the maximum number of groups we can report
1883 things about is what fits in that byte. */
1884#define MAX_REGNUM 255
1885
1886/* But patterns can have more than `MAX_REGNUM' registers. We just
1887 ignore the excess. */
098d42af 1888typedef int regnum_t;
fa9a63c5
RM
1889
1890
1891/* Macros for the compile stack. */
1892
1893/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1894 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1895/* int may be not enough when sizeof(int) == 2. */
1896typedef long pattern_offset_t;
fa9a63c5
RM
1897
1898typedef struct
1899{
1900 pattern_offset_t begalt_offset;
1901 pattern_offset_t fixup_alt_jump;
5e69f11e 1902 pattern_offset_t laststart_offset;
fa9a63c5
RM
1903 regnum_t regnum;
1904} compile_stack_elt_t;
1905
1906
1907typedef struct
1908{
1909 compile_stack_elt_t *stack;
1910 unsigned size;
1911 unsigned avail; /* Offset of next open position. */
1912} compile_stack_type;
1913
1914
1915#define INIT_COMPILE_STACK_SIZE 32
1916
1917#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1918#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1919
4bb91c68 1920/* The next available element. */
fa9a63c5
RM
1921#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1922
1cee1e27
SM
1923/* Explicit quit checking is only used on NTemacs and whenever we
1924 use polling to process input events. */
1925#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1926extern int immediate_quit;
1927# define IMMEDIATE_QUIT_CHECK \
1928 do { \
1929 if (immediate_quit) QUIT; \
1930 } while (0)
1931#else
1932# define IMMEDIATE_QUIT_CHECK ((void)0)
1933#endif
1934\f
b18215fc
RS
1935/* Structure to manage work area for range table. */
1936struct range_table_work_area
1937{
1938 int *table; /* actual work area. */
1939 int allocated; /* allocated size for work area in bytes. */
7814e705 1940 int used; /* actually used size in words. */
96cc36cc 1941 int bits; /* flag to record character classes */
b18215fc
RS
1942};
1943
77d11aec
RS
1944/* Make sure that WORK_AREA can hold more N multibyte characters.
1945 This is used only in set_image_of_range and set_image_of_range_1.
1946 It expects WORK_AREA to be a pointer.
1947 If it can't get the space, it returns from the surrounding function. */
1948
1949#define EXTEND_RANGE_TABLE(work_area, n) \
1950 do { \
1951 if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
1952 { \
1953 extend_range_table_work_area (work_area); \
1954 if ((work_area)->table == 0) \
1955 return (REG_ESPACE); \
1956 } \
b18215fc
RS
1957 } while (0)
1958
96cc36cc
RS
1959#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1960 (work_area).bits |= (bit)
1961
14473664
SM
1962/* Bits used to implement the multibyte-part of the various character classes
1963 such as [:alnum:] in a charset's range table. */
1964#define BIT_WORD 0x1
1965#define BIT_LOWER 0x2
1966#define BIT_PUNCT 0x4
1967#define BIT_SPACE 0x8
1968#define BIT_UPPER 0x10
1969#define BIT_MULTIBYTE 0x20
96cc36cc 1970
36595814
SM
1971/* Set a range START..END to WORK_AREA.
1972 The range is passed through TRANSLATE, so START and END
1973 should be untranslated. */
77d11aec
RS
1974#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
1975 do { \
1976 int tem; \
1977 tem = set_image_of_range (&work_area, start, end, translate); \
1978 if (tem > 0) \
1979 FREE_STACK_RETURN (tem); \
b18215fc
RS
1980 } while (0)
1981
7814e705 1982/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1983#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1984 do { \
1985 if ((work_area).table) \
1986 free ((work_area).table); \
1987 } while (0)
1988
96cc36cc 1989#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1990#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1991#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1992#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1993\f
b18215fc 1994
fa9a63c5 1995/* Set the bit for character C in a list. */
01618498 1996#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1997
1998
1999/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 2000#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2001 do { \
2002 if (p == pend) \
2003 FREE_STACK_RETURN (REG_EBRACE); \
2004 else \
2005 { \
2006 PATFETCH (c); \
2007 while ('0' <= c && c <= '9') \
2008 { \
2009 int prev; \
2010 if (num < 0) \
2011 num = 0; \
2012 prev = num; \
2013 num = num * 10 + c - '0'; \
2014 if (num / 10 != prev) \
2015 FREE_STACK_RETURN (REG_BADBR); \
2016 if (p == pend) \
2017 FREE_STACK_RETURN (REG_EBRACE); \
2018 PATFETCH (c); \
2019 } \
2020 } \
2021 } while (0)
77d11aec 2022\f
1fdab503 2023#if ! WIDE_CHAR_SUPPORT
01618498 2024
14473664 2025/* Map a string to the char class it names (if any). */
1fdab503 2026re_wctype_t
ada30c0e
SM
2027re_wctype (str)
2028 re_char *str;
14473664 2029{
ada30c0e 2030 const char *string = str;
14473664
SM
2031 if (STREQ (string, "alnum")) return RECC_ALNUM;
2032 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2033 else if (STREQ (string, "word")) return RECC_WORD;
2034 else if (STREQ (string, "ascii")) return RECC_ASCII;
2035 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2036 else if (STREQ (string, "graph")) return RECC_GRAPH;
2037 else if (STREQ (string, "lower")) return RECC_LOWER;
2038 else if (STREQ (string, "print")) return RECC_PRINT;
2039 else if (STREQ (string, "punct")) return RECC_PUNCT;
2040 else if (STREQ (string, "space")) return RECC_SPACE;
2041 else if (STREQ (string, "upper")) return RECC_UPPER;
2042 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2043 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2044 else if (STREQ (string, "digit")) return RECC_DIGIT;
2045 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2046 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2047 else if (STREQ (string, "blank")) return RECC_BLANK;
2048 else return 0;
2049}
2050
e0f24100 2051/* True if CH is in the char class CC. */
1fdab503 2052boolean
14473664
SM
2053re_iswctype (ch, cc)
2054 int ch;
2055 re_wctype_t cc;
2056{
2057 switch (cc)
2058 {
0cdd06f8
SM
2059 case RECC_ALNUM: return ISALNUM (ch);
2060 case RECC_ALPHA: return ISALPHA (ch);
2061 case RECC_BLANK: return ISBLANK (ch);
2062 case RECC_CNTRL: return ISCNTRL (ch);
2063 case RECC_DIGIT: return ISDIGIT (ch);
2064 case RECC_GRAPH: return ISGRAPH (ch);
2065 case RECC_LOWER: return ISLOWER (ch);
2066 case RECC_PRINT: return ISPRINT (ch);
2067 case RECC_PUNCT: return ISPUNCT (ch);
2068 case RECC_SPACE: return ISSPACE (ch);
2069 case RECC_UPPER: return ISUPPER (ch);
2070 case RECC_XDIGIT: return ISXDIGIT (ch);
2071 case RECC_ASCII: return IS_REAL_ASCII (ch);
2072 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2073 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2074 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2075 case RECC_WORD: return ISWORD (ch);
2076 case RECC_ERROR: return false;
2077 default:
2078 abort();
14473664
SM
2079 }
2080}
fa9a63c5 2081
14473664
SM
2082/* Return a bit-pattern to use in the range-table bits to match multibyte
2083 chars of class CC. */
2084static int
2085re_wctype_to_bit (cc)
2086 re_wctype_t cc;
2087{
2088 switch (cc)
2089 {
2090 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2091 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2092 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2093 case RECC_LOWER: return BIT_LOWER;
2094 case RECC_UPPER: return BIT_UPPER;
2095 case RECC_PUNCT: return BIT_PUNCT;
2096 case RECC_SPACE: return BIT_SPACE;
14473664 2097 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2098 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2099 default:
2100 abort();
14473664
SM
2101 }
2102}
2103#endif
77d11aec
RS
2104\f
2105/* Filling in the work area of a range. */
2106
2107/* Actually extend the space in WORK_AREA. */
2108
2109static void
2110extend_range_table_work_area (work_area)
2111 struct range_table_work_area *work_area;
177c0ea7 2112{
77d11aec
RS
2113 work_area->allocated += 16 * sizeof (int);
2114 if (work_area->table)
2115 work_area->table
a073faa6 2116 = (int *) realloc (work_area->table, work_area->allocated);
77d11aec
RS
2117 else
2118 work_area->table
a073faa6 2119 = (int *) malloc (work_area->allocated);
77d11aec
RS
2120}
2121
2122#ifdef emacs
2123
2124/* Carefully find the ranges of codes that are equivalent
2125 under case conversion to the range start..end when passed through
2126 TRANSLATE. Handle the case where non-letters can come in between
2127 two upper-case letters (which happens in Latin-1).
2128 Also handle the case of groups of more than 2 case-equivalent chars.
2129
2130 The basic method is to look at consecutive characters and see
2131 if they can form a run that can be handled as one.
2132
2133 Returns -1 if successful, REG_ESPACE if ran out of space. */
2134
2135static int
2136set_image_of_range_1 (work_area, start, end, translate)
2137 RE_TRANSLATE_TYPE translate;
2138 struct range_table_work_area *work_area;
2139 re_wchar_t start, end;
2140{
2141 /* `one_case' indicates a character, or a run of characters,
2142 each of which is an isolate (no case-equivalents).
2143 This includes all ASCII non-letters.
2144
2145 `two_case' indicates a character, or a run of characters,
2146 each of which has two case-equivalent forms.
2147 This includes all ASCII letters.
2148
2149 `strange' indicates a character that has more than one
2150 case-equivalent. */
177c0ea7 2151
77d11aec
RS
2152 enum case_type {one_case, two_case, strange};
2153
2154 /* Describe the run that is in progress,
2155 which the next character can try to extend.
2156 If run_type is strange, that means there really is no run.
2157 If run_type is one_case, then run_start...run_end is the run.
2158 If run_type is two_case, then the run is run_start...run_end,
2159 and the case-equivalents end at run_eqv_end. */
2160
2161 enum case_type run_type = strange;
2162 int run_start, run_end, run_eqv_end;
2163
2164 Lisp_Object eqv_table;
2165
2166 if (!RE_TRANSLATE_P (translate))
2167 {
b7c12565 2168 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2169 work_area->table[work_area->used++] = (start);
2170 work_area->table[work_area->used++] = (end);
b7c12565 2171 return -1;
77d11aec
RS
2172 }
2173
2174 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2175
77d11aec
RS
2176 for (; start <= end; start++)
2177 {
2178 enum case_type this_type;
2179 int eqv = RE_TRANSLATE (eqv_table, start);
2180 int minchar, maxchar;
2181
2182 /* Classify this character */
2183 if (eqv == start)
2184 this_type = one_case;
2185 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2186 this_type = two_case;
2187 else
2188 this_type = strange;
2189
2190 if (start < eqv)
2191 minchar = start, maxchar = eqv;
2192 else
2193 minchar = eqv, maxchar = start;
2194
2195 /* Can this character extend the run in progress? */
2196 if (this_type == strange || this_type != run_type
2197 || !(minchar == run_end + 1
2198 && (run_type == two_case
2199 ? maxchar == run_eqv_end + 1 : 1)))
2200 {
2201 /* No, end the run.
2202 Record each of its equivalent ranges. */
2203 if (run_type == one_case)
2204 {
2205 EXTEND_RANGE_TABLE (work_area, 2);
2206 work_area->table[work_area->used++] = run_start;
2207 work_area->table[work_area->used++] = run_end;
2208 }
2209 else if (run_type == two_case)
2210 {
2211 EXTEND_RANGE_TABLE (work_area, 4);
2212 work_area->table[work_area->used++] = run_start;
2213 work_area->table[work_area->used++] = run_end;
2214 work_area->table[work_area->used++]
2215 = RE_TRANSLATE (eqv_table, run_start);
2216 work_area->table[work_area->used++]
2217 = RE_TRANSLATE (eqv_table, run_end);
2218 }
2219 run_type = strange;
2220 }
177c0ea7 2221
77d11aec
RS
2222 if (this_type == strange)
2223 {
2224 /* For a strange character, add each of its equivalents, one
2225 by one. Don't start a range. */
2226 do
2227 {
2228 EXTEND_RANGE_TABLE (work_area, 2);
2229 work_area->table[work_area->used++] = eqv;
2230 work_area->table[work_area->used++] = eqv;
2231 eqv = RE_TRANSLATE (eqv_table, eqv);
2232 }
2233 while (eqv != start);
2234 }
2235
2236 /* Add this char to the run, or start a new run. */
2237 else if (run_type == strange)
2238 {
2239 /* Initialize a new range. */
2240 run_type = this_type;
2241 run_start = start;
2242 run_end = start;
2243 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2244 }
2245 else
2246 {
2247 /* Extend a running range. */
2248 run_end = minchar;
2249 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2250 }
2251 }
2252
2253 /* If a run is still in progress at the end, finish it now
2254 by recording its equivalent ranges. */
2255 if (run_type == one_case)
2256 {
2257 EXTEND_RANGE_TABLE (work_area, 2);
2258 work_area->table[work_area->used++] = run_start;
2259 work_area->table[work_area->used++] = run_end;
2260 }
2261 else if (run_type == two_case)
2262 {
2263 EXTEND_RANGE_TABLE (work_area, 4);
2264 work_area->table[work_area->used++] = run_start;
2265 work_area->table[work_area->used++] = run_end;
2266 work_area->table[work_area->used++]
2267 = RE_TRANSLATE (eqv_table, run_start);
2268 work_area->table[work_area->used++]
2269 = RE_TRANSLATE (eqv_table, run_end);
2270 }
2271
2272 return -1;
2273}
36595814 2274
77d11aec 2275#endif /* emacs */
36595814 2276
b7c12565 2277/* Record the the image of the range start..end when passed through
36595814
SM
2278 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2279 and is not even necessarily contiguous.
b7c12565
RS
2280 Normally we approximate it with the smallest contiguous range that contains
2281 all the chars we need. However, for Latin-1 we go to extra effort
2282 to do a better job.
2283
2284 This function is not called for ASCII ranges.
77d11aec
RS
2285
2286 Returns -1 if successful, REG_ESPACE if ran out of space. */
2287
2288static int
36595814
SM
2289set_image_of_range (work_area, start, end, translate)
2290 RE_TRANSLATE_TYPE translate;
2291 struct range_table_work_area *work_area;
2292 re_wchar_t start, end;
2293{
77d11aec
RS
2294 re_wchar_t cmin, cmax;
2295
2296#ifdef emacs
2297 /* For Latin-1 ranges, use set_image_of_range_1
2298 to get proper handling of ranges that include letters and nonletters.
b7c12565 2299 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2300 For other character sets, we don't bother to get this right. */
b7c12565
RS
2301 if (RE_TRANSLATE_P (translate) && start < 04400
2302 && !(start < 04200 && end >= 04377))
77d11aec 2303 {
b7c12565 2304 int newend;
77d11aec 2305 int tem;
b7c12565
RS
2306 newend = end;
2307 if (newend > 04377)
2308 newend = 04377;
2309 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2310 if (tem > 0)
2311 return tem;
2312
2313 start = 04400;
2314 if (end < 04400)
2315 return -1;
2316 }
2317#endif
2318
b7c12565
RS
2319 EXTEND_RANGE_TABLE (work_area, 2);
2320 work_area->table[work_area->used++] = (start);
2321 work_area->table[work_area->used++] = (end);
2322
2323 cmin = -1, cmax = -1;
77d11aec 2324
36595814 2325 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2326 {
2327 int ch;
77d11aec 2328
b7c12565
RS
2329 for (ch = start; ch <= end; ch++)
2330 {
2331 re_wchar_t c = TRANSLATE (ch);
2332 if (! (start <= c && c <= end))
2333 {
2334 if (cmin == -1)
2335 cmin = c, cmax = c;
2336 else
2337 {
2338 cmin = MIN (cmin, c);
2339 cmax = MAX (cmax, c);
2340 }
2341 }
2342 }
2343
2344 if (cmin != -1)
2345 {
2346 EXTEND_RANGE_TABLE (work_area, 2);
2347 work_area->table[work_area->used++] = (cmin);
2348 work_area->table[work_area->used++] = (cmax);
2349 }
2350 }
36595814 2351
77d11aec
RS
2352 return -1;
2353}
fa9a63c5
RM
2354\f
2355#ifndef MATCH_MAY_ALLOCATE
2356
2357/* If we cannot allocate large objects within re_match_2_internal,
2358 we make the fail stack and register vectors global.
2359 The fail stack, we grow to the maximum size when a regexp
2360 is compiled.
2361 The register vectors, we adjust in size each time we
2362 compile a regexp, according to the number of registers it needs. */
2363
2364static fail_stack_type fail_stack;
2365
2366/* Size with which the following vectors are currently allocated.
2367 That is so we can make them bigger as needed,
4bb91c68 2368 but never make them smaller. */
fa9a63c5
RM
2369static int regs_allocated_size;
2370
66f0296e
SM
2371static re_char ** regstart, ** regend;
2372static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2373
2374/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2375 but don't make them smaller. */
fa9a63c5
RM
2376
2377static
2378regex_grow_registers (num_regs)
2379 int num_regs;
2380{
2381 if (num_regs > regs_allocated_size)
2382 {
66f0296e
SM
2383 RETALLOC_IF (regstart, num_regs, re_char *);
2384 RETALLOC_IF (regend, num_regs, re_char *);
2385 RETALLOC_IF (best_regstart, num_regs, re_char *);
2386 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2387
2388 regs_allocated_size = num_regs;
2389 }
2390}
2391
2392#endif /* not MATCH_MAY_ALLOCATE */
2393\f
99633e97
SM
2394static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2395 compile_stack,
2396 regnum_t regnum));
2397
fa9a63c5
RM
2398/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2399 Returns one of error codes defined in `regex.h', or zero for success.
2400
2401 Assumes the `allocated' (and perhaps `buffer') and `translate'
2402 fields are set in BUFP on entry.
2403
2404 If it succeeds, results are put in BUFP (if it returns an error, the
2405 contents of BUFP are undefined):
2406 `buffer' is the compiled pattern;
2407 `syntax' is set to SYNTAX;
2408 `used' is set to the length of the compiled pattern;
2409 `fastmap_accurate' is zero;
2410 `re_nsub' is the number of subexpressions in PATTERN;
2411 `not_bol' and `not_eol' are zero;
5e69f11e 2412
c0f9ea08 2413 The `fastmap' field is neither examined nor set. */
fa9a63c5 2414
505bde11
SM
2415/* Insert the `jump' from the end of last alternative to "here".
2416 The space for the jump has already been allocated. */
2417#define FIXUP_ALT_JUMP() \
2418do { \
2419 if (fixup_alt_jump) \
2420 STORE_JUMP (jump, fixup_alt_jump, b); \
2421} while (0)
2422
2423
fa9a63c5
RM
2424/* Return, freeing storage we allocated. */
2425#define FREE_STACK_RETURN(value) \
b18215fc
RS
2426 do { \
2427 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2428 free (compile_stack.stack); \
2429 return value; \
2430 } while (0)
fa9a63c5
RM
2431
2432static reg_errcode_t
2433regex_compile (pattern, size, syntax, bufp)
66f0296e 2434 re_char *pattern;
4bb91c68 2435 size_t size;
fa9a63c5
RM
2436 reg_syntax_t syntax;
2437 struct re_pattern_buffer *bufp;
2438{
01618498
SM
2439 /* We fetch characters from PATTERN here. */
2440 register re_wchar_t c, c1;
5e69f11e 2441
fa9a63c5 2442 /* A random temporary spot in PATTERN. */
66f0296e 2443 re_char *p1;
fa9a63c5
RM
2444
2445 /* Points to the end of the buffer, where we should append. */
2446 register unsigned char *b;
5e69f11e 2447
fa9a63c5
RM
2448 /* Keeps track of unclosed groups. */
2449 compile_stack_type compile_stack;
2450
2451 /* Points to the current (ending) position in the pattern. */
22336245
RS
2452#ifdef AIX
2453 /* `const' makes AIX compiler fail. */
66f0296e 2454 unsigned char *p = pattern;
22336245 2455#else
66f0296e 2456 re_char *p = pattern;
22336245 2457#endif
66f0296e 2458 re_char *pend = pattern + size;
5e69f11e 2459
fa9a63c5 2460 /* How to translate the characters in the pattern. */
6676cb1c 2461 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2462
2463 /* Address of the count-byte of the most recently inserted `exactn'
2464 command. This makes it possible to tell if a new exact-match
2465 character can be added to that command or if the character requires
2466 a new `exactn' command. */
2467 unsigned char *pending_exact = 0;
2468
2469 /* Address of start of the most recently finished expression.
2470 This tells, e.g., postfix * where to find the start of its
2471 operand. Reset at the beginning of groups and alternatives. */
2472 unsigned char *laststart = 0;
2473
2474 /* Address of beginning of regexp, or inside of last group. */
2475 unsigned char *begalt;
2476
2477 /* Place in the uncompiled pattern (i.e., the {) to
2478 which to go back if the interval is invalid. */
66f0296e 2479 re_char *beg_interval;
5e69f11e 2480
fa9a63c5 2481 /* Address of the place where a forward jump should go to the end of
7814e705 2482 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2483 last -- ends with a forward jump of this sort. */
2484 unsigned char *fixup_alt_jump = 0;
2485
b18215fc
RS
2486 /* Work area for range table of charset. */
2487 struct range_table_work_area range_table_work;
2488
2d1675e4
SM
2489 /* If the object matched can contain multibyte characters. */
2490 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2491
f9b0fd99
RS
2492 /* Nonzero if we have pushed down into a subpattern. */
2493 int in_subpattern = 0;
2494
2495 /* These hold the values of p, pattern, and pend from the main
2496 pattern when we have pushed into a subpattern. */
2497 re_char *main_p;
2498 re_char *main_pattern;
2499 re_char *main_pend;
2500
fa9a63c5 2501#ifdef DEBUG
99633e97 2502 debug++;
fa9a63c5 2503 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2504 if (debug > 0)
fa9a63c5
RM
2505 {
2506 unsigned debug_count;
5e69f11e 2507
fa9a63c5 2508 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2509 putchar (pattern[debug_count]);
fa9a63c5
RM
2510 putchar ('\n');
2511 }
2512#endif /* DEBUG */
2513
2514 /* Initialize the compile stack. */
2515 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2516 if (compile_stack.stack == NULL)
2517 return REG_ESPACE;
2518
2519 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2520 compile_stack.avail = 0;
2521
b18215fc
RS
2522 range_table_work.table = 0;
2523 range_table_work.allocated = 0;
2524
fa9a63c5
RM
2525 /* Initialize the pattern buffer. */
2526 bufp->syntax = syntax;
2527 bufp->fastmap_accurate = 0;
2528 bufp->not_bol = bufp->not_eol = 0;
6224b623 2529 bufp->used_syntax = 0;
fa9a63c5
RM
2530
2531 /* Set `used' to zero, so that if we return an error, the pattern
2532 printer (for debugging) will think there's no pattern. We reset it
2533 at the end. */
2534 bufp->used = 0;
5e69f11e 2535
fa9a63c5 2536 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2537 bufp->re_nsub = 0;
fa9a63c5 2538
0b32bf0e 2539#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2540 /* Initialize the syntax table. */
2541 init_syntax_once ();
2542#endif
2543
2544 if (bufp->allocated == 0)
2545 {
2546 if (bufp->buffer)
2547 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2548 enough space. This loses if buffer's address is bogus, but
7814e705 2549 that is the user's responsibility. */
25fe55af
RS
2550 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2551 }
fa9a63c5 2552 else
7814e705 2553 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2554 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2555 }
fa9a63c5
RM
2556 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2557
2558 bufp->allocated = INIT_BUF_SIZE;
2559 }
2560
2561 begalt = b = bufp->buffer;
2562
2563 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2564 while (1)
fa9a63c5 2565 {
f9b0fd99
RS
2566 if (p == pend)
2567 {
2568 /* If this is the end of an included regexp,
2569 pop back to the main regexp and try again. */
2570 if (in_subpattern)
2571 {
2572 in_subpattern = 0;
2573 pattern = main_pattern;
2574 p = main_p;
2575 pend = main_pend;
2576 continue;
2577 }
2578 /* If this is the end of the main regexp, we are done. */
2579 break;
2580 }
2581
fa9a63c5
RM
2582 PATFETCH (c);
2583
2584 switch (c)
25fe55af 2585 {
f9b0fd99
RS
2586 case ' ':
2587 {
2588 re_char *p1 = p;
2589
2590 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2591 spaces normally. And don't try to do this recursively. */
2592 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2593 goto normal_char;
2594
2595 /* Peek past following spaces. */
2596 while (p1 != pend)
2597 {
2598 if (*p1 != ' ')
2599 break;
2600 p1++;
2601 }
2602 /* If the spaces are followed by a repetition op,
2603 treat them normally. */
c721eee5
RS
2604 if (p1 != pend
2605 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2606 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2607 goto normal_char;
2608
2609 /* Replace the spaces with the whitespace regexp. */
2610 in_subpattern = 1;
2611 main_p = p1;
2612 main_pend = pend;
2613 main_pattern = pattern;
2614 p = pattern = whitespace_regexp;
2615 pend = p + strlen (p);
2616 break;
7814e705 2617 }
f9b0fd99 2618
25fe55af
RS
2619 case '^':
2620 {
7814e705 2621 if ( /* If at start of pattern, it's an operator. */
25fe55af 2622 p == pattern + 1
7814e705 2623 /* If context independent, it's an operator. */
25fe55af 2624 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2625 /* Otherwise, depends on what's come before. */
25fe55af 2626 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2627 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2628 else
2629 goto normal_char;
2630 }
2631 break;
2632
2633
2634 case '$':
2635 {
2636 if ( /* If at end of pattern, it's an operator. */
2637 p == pend
7814e705 2638 /* If context independent, it's an operator. */
25fe55af
RS
2639 || syntax & RE_CONTEXT_INDEP_ANCHORS
2640 /* Otherwise, depends on what's next. */
2641 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2642 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2643 else
2644 goto normal_char;
2645 }
2646 break;
fa9a63c5
RM
2647
2648
2649 case '+':
25fe55af
RS
2650 case '?':
2651 if ((syntax & RE_BK_PLUS_QM)
2652 || (syntax & RE_LIMITED_OPS))
2653 goto normal_char;
2654 handle_plus:
2655 case '*':
2656 /* If there is no previous pattern... */
2657 if (!laststart)
2658 {
2659 if (syntax & RE_CONTEXT_INVALID_OPS)
2660 FREE_STACK_RETURN (REG_BADRPT);
2661 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2662 goto normal_char;
2663 }
2664
2665 {
7814e705 2666 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2667 boolean zero_times_ok = 0, many_times_ok = 0;
2668 boolean greedy = 1;
25fe55af
RS
2669
2670 /* If there is a sequence of repetition chars, collapse it
2671 down to just one (the right one). We can't combine
2672 interval operators with these because of, e.g., `a{2}*',
7814e705 2673 which should only match an even number of `a's. */
25fe55af
RS
2674
2675 for (;;)
2676 {
0b32bf0e 2677 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2678 && c == '?' && (zero_times_ok || many_times_ok))
2679 greedy = 0;
2680 else
2681 {
2682 zero_times_ok |= c != '+';
2683 many_times_ok |= c != '?';
2684 }
25fe55af
RS
2685
2686 if (p == pend)
2687 break;
ed0767d8
SM
2688 else if (*p == '*'
2689 || (!(syntax & RE_BK_PLUS_QM)
2690 && (*p == '+' || *p == '?')))
25fe55af 2691 ;
ed0767d8 2692 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2693 {
ed0767d8
SM
2694 if (p+1 == pend)
2695 FREE_STACK_RETURN (REG_EESCAPE);
2696 if (p[1] == '+' || p[1] == '?')
2697 PATFETCH (c); /* Gobble up the backslash. */
2698 else
2699 break;
25fe55af
RS
2700 }
2701 else
ed0767d8 2702 break;
25fe55af 2703 /* If we get here, we found another repeat character. */
ed0767d8
SM
2704 PATFETCH (c);
2705 }
25fe55af
RS
2706
2707 /* Star, etc. applied to an empty pattern is equivalent
2708 to an empty pattern. */
4e8a9132 2709 if (!laststart || laststart == b)
25fe55af
RS
2710 break;
2711
2712 /* Now we know whether or not zero matches is allowed
7814e705 2713 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2714 if (greedy)
2715 {
99633e97 2716 if (many_times_ok)
4e8a9132
SM
2717 {
2718 boolean simple = skip_one_char (laststart) == b;
2719 unsigned int startoffset = 0;
f6a3f532 2720 re_opcode_t ofj =
01618498 2721 /* Check if the loop can match the empty string. */
6df42991
SM
2722 (simple || !analyse_first (laststart, b, NULL, 0))
2723 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2724 assert (skip_one_char (laststart) <= b);
177c0ea7 2725
4e8a9132
SM
2726 if (!zero_times_ok && simple)
2727 { /* Since simple * loops can be made faster by using
2728 on_failure_keep_string_jump, we turn simple P+
2729 into PP* if P is simple. */
2730 unsigned char *p1, *p2;
2731 startoffset = b - laststart;
2732 GET_BUFFER_SPACE (startoffset);
2733 p1 = b; p2 = laststart;
2734 while (p2 < p1)
2735 *b++ = *p2++;
2736 zero_times_ok = 1;
99633e97 2737 }
4e8a9132
SM
2738
2739 GET_BUFFER_SPACE (6);
2740 if (!zero_times_ok)
2741 /* A + loop. */
f6a3f532 2742 STORE_JUMP (ofj, b, b + 6);
99633e97 2743 else
4e8a9132
SM
2744 /* Simple * loops can use on_failure_keep_string_jump
2745 depending on what follows. But since we don't know
2746 that yet, we leave the decision up to
2747 on_failure_jump_smart. */
f6a3f532 2748 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2749 laststart + startoffset, b + 6);
99633e97 2750 b += 3;
4e8a9132 2751 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2752 b += 3;
2753 }
2754 else
2755 {
4e8a9132
SM
2756 /* A simple ? pattern. */
2757 assert (zero_times_ok);
2758 GET_BUFFER_SPACE (3);
2759 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2760 b += 3;
2761 }
1c8c6d39
DL
2762 }
2763 else /* not greedy */
2764 { /* I wish the greedy and non-greedy cases could be merged. */
2765
0683b6fa 2766 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2767 if (many_times_ok)
2768 {
f6a3f532
SM
2769 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2770
6df42991
SM
2771 /* The non-greedy multiple match looks like
2772 a repeat..until: we only need a conditional jump
2773 at the end of the loop. */
f6a3f532
SM
2774 if (emptyp) BUF_PUSH (no_op);
2775 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2776 : on_failure_jump, b, laststart);
1c8c6d39
DL
2777 b += 3;
2778 if (zero_times_ok)
2779 {
2780 /* The repeat...until naturally matches one or more.
2781 To also match zero times, we need to first jump to
6df42991 2782 the end of the loop (its conditional jump). */
1c8c6d39
DL
2783 INSERT_JUMP (jump, laststart, b);
2784 b += 3;
2785 }
2786 }
2787 else
2788 {
2789 /* non-greedy a?? */
1c8c6d39
DL
2790 INSERT_JUMP (jump, laststart, b + 3);
2791 b += 3;
2792 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2793 b += 3;
2794 }
2795 }
2796 }
4e8a9132 2797 pending_exact = 0;
fa9a63c5
RM
2798 break;
2799
2800
2801 case '.':
25fe55af
RS
2802 laststart = b;
2803 BUF_PUSH (anychar);
2804 break;
fa9a63c5
RM
2805
2806
25fe55af
RS
2807 case '[':
2808 {
b18215fc 2809 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2810
25fe55af 2811 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2812
25fe55af
RS
2813 /* Ensure that we have enough space to push a charset: the
2814 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2815 GET_BUFFER_SPACE (34);
2816
25fe55af 2817 laststart = b;
e318085a 2818
25fe55af 2819 /* We test `*p == '^' twice, instead of using an if
7814e705 2820 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2821 BUF_PUSH (*p == '^' ? charset_not : charset);
2822 if (*p == '^')
2823 p++;
e318085a 2824
25fe55af
RS
2825 /* Remember the first position in the bracket expression. */
2826 p1 = p;
e318085a 2827
7814e705 2828 /* Push the number of bytes in the bitmap. */
25fe55af 2829 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2830
25fe55af
RS
2831 /* Clear the whole map. */
2832 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2833
25fe55af
RS
2834 /* charset_not matches newline according to a syntax bit. */
2835 if ((re_opcode_t) b[-2] == charset_not
2836 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2837 SET_LIST_BIT ('\n');
fa9a63c5 2838
7814e705 2839 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2840 for (;;)
2841 {
b18215fc 2842 boolean escaped_char = false;
2d1675e4 2843 const unsigned char *p2 = p;
e318085a 2844
25fe55af 2845 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2846
36595814
SM
2847 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2848 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2849 So the translation is done later in a loop. Example:
2850 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2851 PATFETCH (c);
e318085a 2852
25fe55af
RS
2853 /* \ might escape characters inside [...] and [^...]. */
2854 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2855 {
2856 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2857
2858 PATFETCH (c);
b18215fc 2859 escaped_char = true;
25fe55af 2860 }
b18215fc
RS
2861 else
2862 {
7814e705 2863 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2864 not (i.e., when the bracket expression is `[]' so
2865 far), the ']' character bit gets set way below. */
2d1675e4 2866 if (c == ']' && p2 != p1)
657fcfbd 2867 break;
25fe55af 2868 }
b18215fc 2869
b18215fc
RS
2870 /* What should we do for the character which is
2871 greater than 0x7F, but not BASE_LEADING_CODE_P?
2872 XXX */
2873
25fe55af
RS
2874 /* See if we're at the beginning of a possible character
2875 class. */
b18215fc 2876
2d1675e4
SM
2877 if (!escaped_char &&
2878 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2879 {
7814e705 2880 /* Leave room for the null. */
14473664 2881 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2882 const unsigned char *class_beg;
b18215fc 2883
25fe55af
RS
2884 PATFETCH (c);
2885 c1 = 0;
ed0767d8 2886 class_beg = p;
b18215fc 2887
25fe55af
RS
2888 /* If pattern is `[[:'. */
2889 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2890
25fe55af
RS
2891 for (;;)
2892 {
14473664
SM
2893 PATFETCH (c);
2894 if ((c == ':' && *p == ']') || p == pend)
2895 break;
2896 if (c1 < CHAR_CLASS_MAX_LENGTH)
2897 str[c1++] = c;
2898 else
2899 /* This is in any case an invalid class name. */
2900 str[0] = '\0';
25fe55af
RS
2901 }
2902 str[c1] = '\0';
b18215fc
RS
2903
2904 /* If isn't a word bracketed by `[:' and `:]':
2905 undo the ending character, the letters, and
2906 leave the leading `:' and `[' (but set bits for
2907 them). */
25fe55af
RS
2908 if (c == ':' && *p == ']')
2909 {
36595814 2910 re_wchar_t ch;
14473664
SM
2911 re_wctype_t cc;
2912
2913 cc = re_wctype (str);
2914
2915 if (cc == 0)
fa9a63c5
RM
2916 FREE_STACK_RETURN (REG_ECTYPE);
2917
14473664
SM
2918 /* Throw away the ] at the end of the character
2919 class. */
2920 PATFETCH (c);
fa9a63c5 2921
14473664 2922 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2923
96cc36cc
RS
2924 /* Most character classes in a multibyte match
2925 just set a flag. Exceptions are is_blank,
2926 is_digit, is_cntrl, and is_xdigit, since
2927 they can only match ASCII characters. We
14473664
SM
2928 don't need to handle them for multibyte.
2929 They are distinguished by a negative wctype. */
96cc36cc 2930
2d1675e4 2931 if (multibyte)
14473664
SM
2932 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2933 re_wctype_to_bit (cc));
96cc36cc 2934
14473664 2935 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
25fe55af 2936 {
7ae68633 2937 int translated = TRANSLATE (ch);
151ccb74 2938 if (translated < (1 << BYTEWIDTH)
6358f8b2 2939 && re_iswctype (btowc (ch), cc))
96cc36cc 2940 SET_LIST_BIT (translated);
25fe55af 2941 }
b18215fc 2942
6224b623
SM
2943 /* In most cases the matching rule for char classes
2944 only uses the syntax table for multibyte chars,
2945 so that the content of the syntax-table it is not
2946 hardcoded in the range_table. SPACE and WORD are
2947 the two exceptions. */
2948 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2949 bufp->used_syntax = 1;
2950
b18215fc
RS
2951 /* Repeat the loop. */
2952 continue;
25fe55af
RS
2953 }
2954 else
2955 {
ed0767d8
SM
2956 /* Go back to right after the "[:". */
2957 p = class_beg;
25fe55af 2958 SET_LIST_BIT ('[');
b18215fc
RS
2959
2960 /* Because the `:' may starts the range, we
2961 can't simply set bit and repeat the loop.
7814e705 2962 Instead, just set it to C and handle below. */
b18215fc 2963 c = ':';
25fe55af
RS
2964 }
2965 }
b18215fc
RS
2966
2967 if (p < pend && p[0] == '-' && p[1] != ']')
2968 {
2969
2970 /* Discard the `-'. */
2971 PATFETCH (c1);
2972
2973 /* Fetch the character which ends the range. */
2974 PATFETCH (c1);
b18215fc 2975
b54f61ed 2976 if (SINGLE_BYTE_CHAR_P (c))
e934739e 2977 {
b54f61ed
KH
2978 if (! SINGLE_BYTE_CHAR_P (c1))
2979 {
3ff2446d
KH
2980 /* Handle a range starting with a
2981 character of less than 256, and ending
2982 with a character of not less than 256.
2983 Split that into two ranges, the low one
2984 ending at 0377, and the high one
2985 starting at the smallest character in
2986 the charset of C1 and ending at C1. */
b54f61ed 2987 int charset = CHAR_CHARSET (c1);
36595814
SM
2988 re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
2989
b54f61ed
KH
2990 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2991 c2, c1);
333526e0 2992 c1 = 0377;
b54f61ed 2993 }
e934739e
RS
2994 }
2995 else if (!SAME_CHARSET_P (c, c1))
b3e4c897 2996 FREE_STACK_RETURN (REG_ERANGEX);
e318085a 2997 }
25fe55af 2998 else
b18215fc
RS
2999 /* Range from C to C. */
3000 c1 = c;
3001
3002 /* Set the range ... */
3003 if (SINGLE_BYTE_CHAR_P (c))
3004 /* ... into bitmap. */
25fe55af 3005 {
01618498 3006 re_wchar_t this_char;
36595814 3007 re_wchar_t range_start = c, range_end = c1;
b18215fc
RS
3008
3009 /* If the start is after the end, the range is empty. */
3010 if (range_start > range_end)
3011 {
3012 if (syntax & RE_NO_EMPTY_RANGES)
3013 FREE_STACK_RETURN (REG_ERANGE);
3014 /* Else, repeat the loop. */
3015 }
3016 else
3017 {
3018 for (this_char = range_start; this_char <= range_end;
3019 this_char++)
3aaab9a0
KH
3020 {
3021 int translated = TRANSLATE (this_char);
3022 if (translated < (1 << BYTEWIDTH))
3023 SET_LIST_BIT (translated);
3024 else
3025 SET_RANGE_TABLE_WORK_AREA
3026 (range_table_work, translated, translated);
3027 }
e934739e 3028 }
25fe55af 3029 }
e318085a 3030 else
b18215fc
RS
3031 /* ... into range table. */
3032 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
3033 }
3034
25fe55af 3035 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3036 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3037 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3038 b[-1]--;
3039 b += b[-1];
fa9a63c5 3040
96cc36cc
RS
3041 /* Build real range table from work area. */
3042 if (RANGE_TABLE_WORK_USED (range_table_work)
3043 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3044 {
3045 int i;
3046 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3047
b18215fc 3048 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3049 bytes for flags, two for COUNT, and three bytes for
3050 each character. */
3051 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3052
b18215fc
RS
3053 /* Indicate the existence of range table. */
3054 laststart[1] |= 0x80;
fa9a63c5 3055
96cc36cc
RS
3056 /* Store the character class flag bits into the range table.
3057 If not in emacs, these flag bits are always 0. */
3058 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3059 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3060
b18215fc
RS
3061 STORE_NUMBER_AND_INCR (b, used / 2);
3062 for (i = 0; i < used; i++)
3063 STORE_CHARACTER_AND_INCR
3064 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3065 }
25fe55af
RS
3066 }
3067 break;
fa9a63c5
RM
3068
3069
b18215fc 3070 case '(':
25fe55af
RS
3071 if (syntax & RE_NO_BK_PARENS)
3072 goto handle_open;
3073 else
3074 goto normal_char;
fa9a63c5
RM
3075
3076
25fe55af
RS
3077 case ')':
3078 if (syntax & RE_NO_BK_PARENS)
3079 goto handle_close;
3080 else
3081 goto normal_char;
e318085a
RS
3082
3083
25fe55af
RS
3084 case '\n':
3085 if (syntax & RE_NEWLINE_ALT)
3086 goto handle_alt;
3087 else
3088 goto normal_char;
e318085a
RS
3089
3090
b18215fc 3091 case '|':
25fe55af
RS
3092 if (syntax & RE_NO_BK_VBAR)
3093 goto handle_alt;
3094 else
3095 goto normal_char;
3096
3097
3098 case '{':
3099 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3100 goto handle_interval;
3101 else
3102 goto normal_char;
3103
3104
3105 case '\\':
3106 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3107
3108 /* Do not translate the character after the \, so that we can
3109 distinguish, e.g., \B from \b, even if we normally would
3110 translate, e.g., B to b. */
36595814 3111 PATFETCH (c);
25fe55af
RS
3112
3113 switch (c)
3114 {
3115 case '(':
3116 if (syntax & RE_NO_BK_PARENS)
3117 goto normal_backslash;
3118
3119 handle_open:
505bde11
SM
3120 {
3121 int shy = 0;
c69b0314 3122 regnum_t regnum = 0;
505bde11
SM
3123 if (p+1 < pend)
3124 {
3125 /* Look for a special (?...) construct */
ed0767d8 3126 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3127 {
ed0767d8 3128 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3129 while (!shy)
505bde11 3130 {
c69b0314
SM
3131 PATFETCH (c);
3132 switch (c)
3133 {
3134 case ':': shy = 1; break;
3135 case '0':
3136 /* An explicitly specified regnum must start
3137 with non-0. */
3138 if (regnum == 0)
3139 FREE_STACK_RETURN (REG_BADPAT);
3140 case '1': case '2': case '3': case '4':
3141 case '5': case '6': case '7': case '8': case '9':
3142 regnum = 10*regnum + (c - '0'); break;
3143 default:
3144 /* Only (?:...) is supported right now. */
3145 FREE_STACK_RETURN (REG_BADPAT);
3146 }
505bde11
SM
3147 }
3148 }
505bde11
SM
3149 }
3150
3151 if (!shy)
c69b0314
SM
3152 regnum = ++bufp->re_nsub;
3153 else if (regnum)
3154 { /* It's actually not shy, but explicitly numbered. */
3155 shy = 0;
3156 if (regnum > bufp->re_nsub)
3157 bufp->re_nsub = regnum;
3158 else if (regnum > bufp->re_nsub
3159 /* Ideally, we'd want to check that the specified
3160 group can't have matched (i.e. all subgroups
3161 using the same regnum are in other branches of
3162 OR patterns), but we don't currently keep track
3163 of enough info to do that easily. */
3164 || group_in_compile_stack (compile_stack, regnum))
3165 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3166 }
c69b0314
SM
3167 else
3168 /* It's really shy. */
3169 regnum = - bufp->re_nsub;
25fe55af 3170
99633e97
SM
3171 if (COMPILE_STACK_FULL)
3172 {
3173 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3174 compile_stack_elt_t);
3175 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3176
99633e97
SM
3177 compile_stack.size <<= 1;
3178 }
25fe55af 3179
99633e97 3180 /* These are the values to restore when we hit end of this
7814e705 3181 group. They are all relative offsets, so that if the
99633e97
SM
3182 whole pattern moves because of realloc, they will still
3183 be valid. */
3184 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3185 COMPILE_STACK_TOP.fixup_alt_jump
3186 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3187 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3188 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3189
c69b0314
SM
3190 /* Do not push a start_memory for groups beyond the last one
3191 we can represent in the compiled pattern. */
3192 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3193 BUF_PUSH_2 (start_memory, regnum);
3194
3195 compile_stack.avail++;
3196
3197 fixup_alt_jump = 0;
3198 laststart = 0;
3199 begalt = b;
3200 /* If we've reached MAX_REGNUM groups, then this open
3201 won't actually generate any code, so we'll have to
3202 clear pending_exact explicitly. */
3203 pending_exact = 0;
3204 break;
505bde11 3205 }
25fe55af
RS
3206
3207 case ')':
3208 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3209
3210 if (COMPILE_STACK_EMPTY)
505bde11
SM
3211 {
3212 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3213 goto normal_backslash;
3214 else
3215 FREE_STACK_RETURN (REG_ERPAREN);
3216 }
25fe55af
RS
3217
3218 handle_close:
505bde11 3219 FIXUP_ALT_JUMP ();
25fe55af
RS
3220
3221 /* See similar code for backslashed left paren above. */
3222 if (COMPILE_STACK_EMPTY)
505bde11
SM
3223 {
3224 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3225 goto normal_char;
3226 else
3227 FREE_STACK_RETURN (REG_ERPAREN);
3228 }
25fe55af
RS
3229
3230 /* Since we just checked for an empty stack above, this
3231 ``can't happen''. */
3232 assert (compile_stack.avail != 0);
3233 {
3234 /* We don't just want to restore into `regnum', because
3235 later groups should continue to be numbered higher,
7814e705 3236 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3237 regnum_t regnum;
25fe55af
RS
3238
3239 compile_stack.avail--;
3240 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3241 fixup_alt_jump
3242 = COMPILE_STACK_TOP.fixup_alt_jump
3243 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3244 : 0;
3245 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3246 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3247 /* If we've reached MAX_REGNUM groups, then this open
3248 won't actually generate any code, so we'll have to
3249 clear pending_exact explicitly. */
3250 pending_exact = 0;
e318085a 3251
25fe55af 3252 /* We're at the end of the group, so now we know how many
7814e705 3253 groups were inside this one. */
c69b0314
SM
3254 if (regnum <= MAX_REGNUM && regnum > 0)
3255 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3256 }
3257 break;
3258
3259
3260 case '|': /* `\|'. */
3261 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3262 goto normal_backslash;
3263 handle_alt:
3264 if (syntax & RE_LIMITED_OPS)
3265 goto normal_char;
3266
3267 /* Insert before the previous alternative a jump which
7814e705 3268 jumps to this alternative if the former fails. */
25fe55af
RS
3269 GET_BUFFER_SPACE (3);
3270 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3271 pending_exact = 0;
3272 b += 3;
3273
3274 /* The alternative before this one has a jump after it
3275 which gets executed if it gets matched. Adjust that
3276 jump so it will jump to this alternative's analogous
3277 jump (put in below, which in turn will jump to the next
3278 (if any) alternative's such jump, etc.). The last such
3279 jump jumps to the correct final destination. A picture:
3280 _____ _____
3281 | | | |
3282 | v | v
3283 a | b | c
3284
3285 If we are at `b', then fixup_alt_jump right now points to a
3286 three-byte space after `a'. We'll put in the jump, set
3287 fixup_alt_jump to right after `b', and leave behind three
3288 bytes which we'll fill in when we get to after `c'. */
3289
505bde11 3290 FIXUP_ALT_JUMP ();
25fe55af
RS
3291
3292 /* Mark and leave space for a jump after this alternative,
3293 to be filled in later either by next alternative or
3294 when know we're at the end of a series of alternatives. */
3295 fixup_alt_jump = b;
3296 GET_BUFFER_SPACE (3);
3297 b += 3;
3298
3299 laststart = 0;
3300 begalt = b;
3301 break;
3302
3303
3304 case '{':
3305 /* If \{ is a literal. */
3306 if (!(syntax & RE_INTERVALS)
3307 /* If we're at `\{' and it's not the open-interval
3308 operator. */
4bb91c68 3309 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3310 goto normal_backslash;
3311
3312 handle_interval:
3313 {
3314 /* If got here, then the syntax allows intervals. */
3315
3316 /* At least (most) this many matches must be made. */
99633e97 3317 int lower_bound = 0, upper_bound = -1;
25fe55af 3318
ed0767d8 3319 beg_interval = p;
25fe55af 3320
25fe55af
RS
3321 GET_UNSIGNED_NUMBER (lower_bound);
3322
3323 if (c == ',')
ed0767d8 3324 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3325 else
3326 /* Interval such as `{1}' => match exactly once. */
3327 upper_bound = lower_bound;
3328
3329 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3330 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3331 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3332
3333 if (!(syntax & RE_NO_BK_BRACES))
3334 {
4bb91c68
SM
3335 if (c != '\\')
3336 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3337 if (p == pend)
3338 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3339 PATFETCH (c);
3340 }
3341
3342 if (c != '}')
4bb91c68 3343 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3344
3345 /* We just parsed a valid interval. */
3346
3347 /* If it's invalid to have no preceding re. */
3348 if (!laststart)
3349 {
3350 if (syntax & RE_CONTEXT_INVALID_OPS)
3351 FREE_STACK_RETURN (REG_BADRPT);
3352 else if (syntax & RE_CONTEXT_INDEP_OPS)
3353 laststart = b;
3354 else
3355 goto unfetch_interval;
3356 }
3357
6df42991
SM
3358 if (upper_bound == 0)
3359 /* If the upper bound is zero, just drop the sub pattern
3360 altogether. */
3361 b = laststart;
3362 else if (lower_bound == 1 && upper_bound == 1)
3363 /* Just match it once: nothing to do here. */
3364 ;
3365
3366 /* Otherwise, we have a nontrivial interval. When
3367 we're all done, the pattern will look like:
3368 set_number_at <jump count> <upper bound>
3369 set_number_at <succeed_n count> <lower bound>
3370 succeed_n <after jump addr> <succeed_n count>
3371 <body of loop>
3372 jump_n <succeed_n addr> <jump count>
3373 (The upper bound and `jump_n' are omitted if
3374 `upper_bound' is 1, though.) */
3375 else
3376 { /* If the upper bound is > 1, we need to insert
3377 more at the end of the loop. */
3378 unsigned int nbytes = (upper_bound < 0 ? 3
3379 : upper_bound > 1 ? 5 : 0);
3380 unsigned int startoffset = 0;
3381
3382 GET_BUFFER_SPACE (20); /* We might use less. */
3383
3384 if (lower_bound == 0)
3385 {
3386 /* A succeed_n that starts with 0 is really a
3387 a simple on_failure_jump_loop. */
3388 INSERT_JUMP (on_failure_jump_loop, laststart,
3389 b + 3 + nbytes);
3390 b += 3;
3391 }
3392 else
3393 {
3394 /* Initialize lower bound of the `succeed_n', even
3395 though it will be set during matching by its
3396 attendant `set_number_at' (inserted next),
3397 because `re_compile_fastmap' needs to know.
3398 Jump to the `jump_n' we might insert below. */
3399 INSERT_JUMP2 (succeed_n, laststart,
3400 b + 5 + nbytes,
3401 lower_bound);
3402 b += 5;
3403
3404 /* Code to initialize the lower bound. Insert
7814e705 3405 before the `succeed_n'. The `5' is the last two
6df42991
SM
3406 bytes of this `set_number_at', plus 3 bytes of
3407 the following `succeed_n'. */
3408 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3409 b += 5;
3410 startoffset += 5;
3411 }
3412
3413 if (upper_bound < 0)
3414 {
3415 /* A negative upper bound stands for infinity,
3416 in which case it degenerates to a plain jump. */
3417 STORE_JUMP (jump, b, laststart + startoffset);
3418 b += 3;
3419 }
3420 else if (upper_bound > 1)
3421 { /* More than one repetition is allowed, so
3422 append a backward jump to the `succeed_n'
3423 that starts this interval.
3424
3425 When we've reached this during matching,
3426 we'll have matched the interval once, so
3427 jump back only `upper_bound - 1' times. */
3428 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3429 upper_bound - 1);
3430 b += 5;
3431
3432 /* The location we want to set is the second
3433 parameter of the `jump_n'; that is `b-2' as
3434 an absolute address. `laststart' will be
3435 the `set_number_at' we're about to insert;
3436 `laststart+3' the number to set, the source
3437 for the relative address. But we are
3438 inserting into the middle of the pattern --
3439 so everything is getting moved up by 5.
3440 Conclusion: (b - 2) - (laststart + 3) + 5,
3441 i.e., b - laststart.
3442
3443 We insert this at the beginning of the loop
3444 so that if we fail during matching, we'll
3445 reinitialize the bounds. */
3446 insert_op2 (set_number_at, laststart, b - laststart,
3447 upper_bound - 1, b);
3448 b += 5;
3449 }
3450 }
25fe55af
RS
3451 pending_exact = 0;
3452 beg_interval = NULL;
3453 }
3454 break;
3455
3456 unfetch_interval:
3457 /* If an invalid interval, match the characters as literals. */
3458 assert (beg_interval);
3459 p = beg_interval;
3460 beg_interval = NULL;
3461
3462 /* normal_char and normal_backslash need `c'. */
ed0767d8 3463 c = '{';
25fe55af
RS
3464
3465 if (!(syntax & RE_NO_BK_BRACES))
3466 {
ed0767d8
SM
3467 assert (p > pattern && p[-1] == '\\');
3468 goto normal_backslash;
25fe55af 3469 }
ed0767d8
SM
3470 else
3471 goto normal_char;
e318085a 3472
b18215fc 3473#ifdef emacs
25fe55af 3474 /* There is no way to specify the before_dot and after_dot
7814e705 3475 operators. rms says this is ok. --karl */
25fe55af
RS
3476 case '=':
3477 BUF_PUSH (at_dot);
3478 break;
3479
3480 case 's':
3481 laststart = b;
3482 PATFETCH (c);
3483 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3484 break;
3485
3486 case 'S':
3487 laststart = b;
3488 PATFETCH (c);
3489 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3490 break;
b18215fc
RS
3491
3492 case 'c':
3493 laststart = b;
36595814 3494 PATFETCH (c);
b18215fc
RS
3495 BUF_PUSH_2 (categoryspec, c);
3496 break;
e318085a 3497
b18215fc
RS
3498 case 'C':
3499 laststart = b;
36595814 3500 PATFETCH (c);
b18215fc
RS
3501 BUF_PUSH_2 (notcategoryspec, c);
3502 break;
3503#endif /* emacs */
e318085a 3504
e318085a 3505
25fe55af 3506 case 'w':
4bb91c68
SM
3507 if (syntax & RE_NO_GNU_OPS)
3508 goto normal_char;
25fe55af 3509 laststart = b;
1fb352e0 3510 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3511 break;
e318085a 3512
e318085a 3513
25fe55af 3514 case 'W':
4bb91c68
SM
3515 if (syntax & RE_NO_GNU_OPS)
3516 goto normal_char;
25fe55af 3517 laststart = b;
1fb352e0 3518 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3519 break;
e318085a
RS
3520
3521
25fe55af 3522 case '<':
4bb91c68
SM
3523 if (syntax & RE_NO_GNU_OPS)
3524 goto normal_char;
25fe55af
RS
3525 BUF_PUSH (wordbeg);
3526 break;
e318085a 3527
25fe55af 3528 case '>':
4bb91c68
SM
3529 if (syntax & RE_NO_GNU_OPS)
3530 goto normal_char;
25fe55af
RS
3531 BUF_PUSH (wordend);
3532 break;
e318085a 3533
669fa600
SM
3534 case '_':
3535 if (syntax & RE_NO_GNU_OPS)
3536 goto normal_char;
3537 laststart = b;
3538 PATFETCH (c);
3539 if (c == '<')
3540 BUF_PUSH (symbeg);
3541 else if (c == '>')
3542 BUF_PUSH (symend);
3543 else
3544 FREE_STACK_RETURN (REG_BADPAT);
3545 break;
3546
25fe55af 3547 case 'b':
4bb91c68
SM
3548 if (syntax & RE_NO_GNU_OPS)
3549 goto normal_char;
25fe55af
RS
3550 BUF_PUSH (wordbound);
3551 break;
e318085a 3552
25fe55af 3553 case 'B':
4bb91c68
SM
3554 if (syntax & RE_NO_GNU_OPS)
3555 goto normal_char;
25fe55af
RS
3556 BUF_PUSH (notwordbound);
3557 break;
fa9a63c5 3558
25fe55af 3559 case '`':
4bb91c68
SM
3560 if (syntax & RE_NO_GNU_OPS)
3561 goto normal_char;
25fe55af
RS
3562 BUF_PUSH (begbuf);
3563 break;
e318085a 3564
25fe55af 3565 case '\'':
4bb91c68
SM
3566 if (syntax & RE_NO_GNU_OPS)
3567 goto normal_char;
25fe55af
RS
3568 BUF_PUSH (endbuf);
3569 break;
e318085a 3570
25fe55af
RS
3571 case '1': case '2': case '3': case '4': case '5':
3572 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3573 {
3574 regnum_t reg;
e318085a 3575
0cdd06f8
SM
3576 if (syntax & RE_NO_BK_REFS)
3577 goto normal_backslash;
e318085a 3578
0cdd06f8 3579 reg = c - '0';
e318085a 3580
c69b0314
SM
3581 if (reg > bufp->re_nsub || reg < 1
3582 /* Can't back reference to a subexp before its end. */
3583 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3584 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3585
0cdd06f8
SM
3586 laststart = b;
3587 BUF_PUSH_2 (duplicate, reg);
3588 }
25fe55af 3589 break;
e318085a 3590
e318085a 3591
25fe55af
RS
3592 case '+':
3593 case '?':
3594 if (syntax & RE_BK_PLUS_QM)
3595 goto handle_plus;
3596 else
3597 goto normal_backslash;
3598
3599 default:
3600 normal_backslash:
3601 /* You might think it would be useful for \ to mean
3602 not to translate; but if we don't translate it
4bb91c68 3603 it will never match anything. */
25fe55af
RS
3604 goto normal_char;
3605 }
3606 break;
fa9a63c5
RM
3607
3608
3609 default:
25fe55af 3610 /* Expects the character in `c'. */
fa9a63c5 3611 normal_char:
36595814 3612 /* If no exactn currently being built. */
25fe55af 3613 if (!pending_exact
fa9a63c5 3614
25fe55af
RS
3615 /* If last exactn not at current position. */
3616 || pending_exact + *pending_exact + 1 != b
5e69f11e 3617
25fe55af 3618 /* We have only one byte following the exactn for the count. */
2d1675e4 3619 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3620
7814e705 3621 /* If followed by a repetition operator. */
9d99031f 3622 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3623 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3624 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3625 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3626 || ((syntax & RE_INTERVALS)
25fe55af 3627 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3628 ? p != pend && *p == '{'
3629 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3630 {
3631 /* Start building a new exactn. */
5e69f11e 3632
25fe55af 3633 laststart = b;
fa9a63c5
RM
3634
3635 BUF_PUSH_2 (exactn, 0);
3636 pending_exact = b - 1;
25fe55af 3637 }
5e69f11e 3638
2d1675e4
SM
3639 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3640 {
e0277a47
KH
3641 int len;
3642
36595814 3643 c = TRANSLATE (c);
e0277a47
KH
3644 if (multibyte)
3645 len = CHAR_STRING (c, b);
3646 else
3647 *b = c, len = 1;
2d1675e4
SM
3648 b += len;
3649 (*pending_exact) += len;
3650 }
3651
fa9a63c5 3652 break;
25fe55af 3653 } /* switch (c) */
fa9a63c5
RM
3654 } /* while p != pend */
3655
5e69f11e 3656
fa9a63c5 3657 /* Through the pattern now. */
5e69f11e 3658
505bde11 3659 FIXUP_ALT_JUMP ();
fa9a63c5 3660
5e69f11e 3661 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3662 FREE_STACK_RETURN (REG_EPAREN);
3663
3664 /* If we don't want backtracking, force success
3665 the first time we reach the end of the compiled pattern. */
3666 if (syntax & RE_NO_POSIX_BACKTRACKING)
3667 BUF_PUSH (succeed);
3668
fa9a63c5
RM
3669 /* We have succeeded; set the length of the buffer. */
3670 bufp->used = b - bufp->buffer;
3671
3672#ifdef DEBUG
99633e97 3673 if (debug > 0)
fa9a63c5 3674 {
505bde11 3675 re_compile_fastmap (bufp);
fa9a63c5
RM
3676 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3677 print_compiled_pattern (bufp);
3678 }
99633e97 3679 debug--;
fa9a63c5
RM
3680#endif /* DEBUG */
3681
3682#ifndef MATCH_MAY_ALLOCATE
3683 /* Initialize the failure stack to the largest possible stack. This
3684 isn't necessary unless we're trying to avoid calling alloca in
3685 the search and match routines. */
3686 {
3687 int num_regs = bufp->re_nsub + 1;
3688
320a2a73 3689 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3690 {
a26f4ccd 3691 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3692
fa9a63c5
RM
3693 if (! fail_stack.stack)
3694 fail_stack.stack
a073faa6 3695 = (fail_stack_elt_t *) malloc (fail_stack.size
8ea094cf 3696 * sizeof (fail_stack_elt_t));
fa9a63c5
RM
3697 else
3698 fail_stack.stack
a073faa6 3699 = (fail_stack_elt_t *) realloc (fail_stack.stack,
8ea094cf
AS
3700 (fail_stack.size
3701 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3702 }
3703
3704 regex_grow_registers (num_regs);
3705 }
3706#endif /* not MATCH_MAY_ALLOCATE */
3707
7c67c9f6 3708 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3709} /* regex_compile */
3710\f
3711/* Subroutines for `regex_compile'. */
3712
7814e705 3713/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3714
3715static void
3716store_op1 (op, loc, arg)
3717 re_opcode_t op;
3718 unsigned char *loc;
3719 int arg;
3720{
3721 *loc = (unsigned char) op;
3722 STORE_NUMBER (loc + 1, arg);
3723}
3724
3725
3726/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3727
3728static void
3729store_op2 (op, loc, arg1, arg2)
3730 re_opcode_t op;
3731 unsigned char *loc;
3732 int arg1, arg2;
3733{
3734 *loc = (unsigned char) op;
3735 STORE_NUMBER (loc + 1, arg1);
3736 STORE_NUMBER (loc + 3, arg2);
3737}
3738
3739
3740/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3741 for OP followed by two-byte integer parameter ARG. */
3742
3743static void
3744insert_op1 (op, loc, arg, end)
3745 re_opcode_t op;
3746 unsigned char *loc;
3747 int arg;
5e69f11e 3748 unsigned char *end;
fa9a63c5
RM
3749{
3750 register unsigned char *pfrom = end;
3751 register unsigned char *pto = end + 3;
3752
3753 while (pfrom != loc)
3754 *--pto = *--pfrom;
5e69f11e 3755
fa9a63c5
RM
3756 store_op1 (op, loc, arg);
3757}
3758
3759
3760/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3761
3762static void
3763insert_op2 (op, loc, arg1, arg2, end)
3764 re_opcode_t op;
3765 unsigned char *loc;
3766 int arg1, arg2;
5e69f11e 3767 unsigned char *end;
fa9a63c5
RM
3768{
3769 register unsigned char *pfrom = end;
3770 register unsigned char *pto = end + 5;
3771
3772 while (pfrom != loc)
3773 *--pto = *--pfrom;
5e69f11e 3774
fa9a63c5
RM
3775 store_op2 (op, loc, arg1, arg2);
3776}
3777
3778
3779/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3780 after an alternative or a begin-subexpression. We assume there is at
3781 least one character before the ^. */
3782
3783static boolean
3784at_begline_loc_p (pattern, p, syntax)
01618498 3785 re_char *pattern, *p;
fa9a63c5
RM
3786 reg_syntax_t syntax;
3787{
01618498 3788 re_char *prev = p - 2;
fa9a63c5 3789 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3790
fa9a63c5
RM
3791 return
3792 /* After a subexpression? */
3793 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3794 /* After an alternative? */
d2af47df
SM
3795 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3796 /* After a shy subexpression? */
3797 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3798 && prev[-1] == '?' && prev[-2] == '('
3799 && (syntax & RE_NO_BK_PARENS
3800 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3801}
3802
3803
3804/* The dual of at_begline_loc_p. This one is for $. We assume there is
3805 at least one character after the $, i.e., `P < PEND'. */
3806
3807static boolean
3808at_endline_loc_p (p, pend, syntax)
01618498 3809 re_char *p, *pend;
99633e97 3810 reg_syntax_t syntax;
fa9a63c5 3811{
01618498 3812 re_char *next = p;
fa9a63c5 3813 boolean next_backslash = *next == '\\';
01618498 3814 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3815
fa9a63c5
RM
3816 return
3817 /* Before a subexpression? */
3818 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3819 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3820 /* Before an alternative? */
3821 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3822 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3823}
3824
3825
5e69f11e 3826/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3827 false if it's not. */
3828
3829static boolean
3830group_in_compile_stack (compile_stack, regnum)
3831 compile_stack_type compile_stack;
3832 regnum_t regnum;
3833{
3834 int this_element;
3835
5e69f11e
RM
3836 for (this_element = compile_stack.avail - 1;
3837 this_element >= 0;
fa9a63c5
RM
3838 this_element--)
3839 if (compile_stack.stack[this_element].regnum == regnum)
3840 return true;
3841
3842 return false;
3843}
fa9a63c5 3844\f
f6a3f532
SM
3845/* analyse_first.
3846 If fastmap is non-NULL, go through the pattern and fill fastmap
3847 with all the possible leading chars. If fastmap is NULL, don't
3848 bother filling it up (obviously) and only return whether the
3849 pattern could potentially match the empty string.
3850
3851 Return 1 if p..pend might match the empty string.
3852 Return 0 if p..pend matches at least one char.
01618498 3853 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3854
3855static int
3856analyse_first (p, pend, fastmap, multibyte)
01618498 3857 re_char *p, *pend;
f6a3f532
SM
3858 char *fastmap;
3859 const int multibyte;
fa9a63c5 3860{
505bde11 3861 int j, k;
1fb352e0 3862 boolean not;
fa9a63c5 3863
b18215fc 3864 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3865 flag is set true. */
b18215fc
RS
3866 boolean match_any_multibyte_characters = false;
3867
f6a3f532 3868 assert (p);
5e69f11e 3869
505bde11
SM
3870 /* The loop below works as follows:
3871 - It has a working-list kept in the PATTERN_STACK and which basically
3872 starts by only containing a pointer to the first operation.
3873 - If the opcode we're looking at is a match against some set of
3874 chars, then we add those chars to the fastmap and go on to the
3875 next work element from the worklist (done via `break').
3876 - If the opcode is a control operator on the other hand, we either
3877 ignore it (if it's meaningless at this point, such as `start_memory')
3878 or execute it (if it's a jump). If the jump has several destinations
3879 (i.e. `on_failure_jump'), then we push the other destination onto the
3880 worklist.
3881 We guarantee termination by ignoring backward jumps (more or less),
3882 so that `p' is monotonically increasing. More to the point, we
3883 never set `p' (or push) anything `<= p1'. */
3884
01618498 3885 while (p < pend)
fa9a63c5 3886 {
505bde11
SM
3887 /* `p1' is used as a marker of how far back a `on_failure_jump'
3888 can go without being ignored. It is normally equal to `p'
3889 (which prevents any backward `on_failure_jump') except right
3890 after a plain `jump', to allow patterns such as:
3891 0: jump 10
3892 3..9: <body>
3893 10: on_failure_jump 3
3894 as used for the *? operator. */
01618498 3895 re_char *p1 = p;
5e69f11e 3896
fa9a63c5
RM
3897 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3898 {
f6a3f532 3899 case succeed:
01618498 3900 return 1;
f6a3f532 3901 continue;
fa9a63c5 3902
fa9a63c5 3903 case duplicate:
505bde11
SM
3904 /* If the first character has to match a backreference, that means
3905 that the group was empty (since it already matched). Since this
3906 is the only case that interests us here, we can assume that the
3907 backreference must match the empty string. */
3908 p++;
3909 continue;
fa9a63c5
RM
3910
3911
3912 /* Following are the cases which match a character. These end
7814e705 3913 with `break'. */
fa9a63c5
RM
3914
3915 case exactn:
e0277a47
KH
3916 if (fastmap)
3917 {
3918 int c = RE_STRING_CHAR (p + 1, pend - p);
4560a582
SM
3919 /* When fast-scanning, the fastmap can be indexed either with
3920 a char (smaller than 256) or with the first byte of
3921 a char's byte sequence. So we have to conservatively add
3922 both to the table. */
e0277a47
KH
3923 if (SINGLE_BYTE_CHAR_P (c))
3924 fastmap[c] = 1;
4560a582 3925 fastmap[p[1]] = 1;
e0277a47 3926 }
fa9a63c5
RM
3927 break;
3928
3929
1fb352e0
SM
3930 case anychar:
3931 /* We could put all the chars except for \n (and maybe \0)
3932 but we don't bother since it is generally not worth it. */
f6a3f532 3933 if (!fastmap) break;
01618498 3934 return -1;
fa9a63c5
RM
3935
3936
b18215fc 3937 case charset_not:
ba5c004d
RS
3938 /* Chars beyond end of bitmap are possible matches.
3939 All the single-byte codes can occur in multibyte buffers.
3940 So any that are not listed in the charset
3941 are possible matches, even in multibyte buffers. */
1fb352e0 3942 if (!fastmap) break;
4560a582
SM
3943 /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
3944 because it will automatically be set when needed by virtue of
3945 being larger than the highest char of its charset (0xbf) but
3946 smaller than (1<<BYTEWIDTH). */
b18215fc 3947 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3948 j < (1 << BYTEWIDTH); j++)
b18215fc 3949 fastmap[j] = 1;
1fb352e0
SM
3950 /* Fallthrough */
3951 case charset:
3952 if (!fastmap) break;
3953 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3954 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3955 j >= 0; j--)
1fb352e0 3956 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
4560a582
SM
3957 {
3958 fastmap[j] = 1;
3959#ifdef emacs
3960 if (j >= 0x80 && j < 0xa0)
3961 fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
3962#endif
3963 }
b18215fc 3964
1fb352e0
SM
3965 if ((not && multibyte)
3966 /* Any character set can possibly contain a character
3967 which doesn't match the specified set of characters. */
3968 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3969 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3970 /* If we can match a character class, we can match
3971 any character set. */
b18215fc
RS
3972 {
3973 set_fastmap_for_multibyte_characters:
3974 if (match_any_multibyte_characters == false)
3975 {
3976 for (j = 0x80; j < 0xA0; j++) /* XXX */
3977 if (BASE_LEADING_CODE_P (j))
3978 fastmap[j] = 1;
3979 match_any_multibyte_characters = true;
3980 }
3981 }
b18215fc 3982
1fb352e0
SM
3983 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3984 && match_any_multibyte_characters == false)
3985 {
3986 /* Set fastmap[I] 1 where I is a base leading code of each
3987 multibyte character in the range table. */
3988 int c, count;
b18215fc 3989
1fb352e0 3990 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3991 bits for a character class. */
1fb352e0 3992 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3993
1fb352e0
SM
3994 /* Extract the number of ranges in range table into COUNT. */
3995 EXTRACT_NUMBER_AND_INCR (count, p);
3996 for (; count > 0; count--, p += 2 * 3) /* XXX */
3997 {
3998 /* Extract the start of each range. */
3999 EXTRACT_CHARACTER (c, p);
4000 j = CHAR_CHARSET (c);
4001 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
4002 }
4003 }
b18215fc
RS
4004 break;
4005
1fb352e0
SM
4006 case syntaxspec:
4007 case notsyntaxspec:
4008 if (!fastmap) break;
4009#ifndef emacs
4010 not = (re_opcode_t)p[-1] == notsyntaxspec;
4011 k = *p++;
4012 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4013 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4014 fastmap[j] = 1;
b18215fc 4015 break;
1fb352e0 4016#else /* emacs */
b18215fc
RS
4017 /* This match depends on text properties. These end with
4018 aborting optimizations. */
01618498 4019 return -1;
b18215fc
RS
4020
4021 case categoryspec:
b18215fc 4022 case notcategoryspec:
1fb352e0
SM
4023 if (!fastmap) break;
4024 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4025 k = *p++;
1fb352e0
SM
4026 for (j = 0; j < (1 << BYTEWIDTH); j++)
4027 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4028 fastmap[j] = 1;
4029
1fb352e0 4030 if (multibyte)
b18215fc 4031 /* Any character set can possibly contain a character
1fb352e0 4032 whose category is K (or not). */
b18215fc
RS
4033 goto set_fastmap_for_multibyte_characters;
4034 break;
4035
fa9a63c5 4036 /* All cases after this match the empty string. These end with
25fe55af 4037 `continue'. */
fa9a63c5 4038
fa9a63c5
RM
4039 case before_dot:
4040 case at_dot:
4041 case after_dot:
1fb352e0 4042#endif /* !emacs */
25fe55af
RS
4043 case no_op:
4044 case begline:
4045 case endline:
fa9a63c5
RM
4046 case begbuf:
4047 case endbuf:
4048 case wordbound:
4049 case notwordbound:
4050 case wordbeg:
4051 case wordend:
669fa600
SM
4052 case symbeg:
4053 case symend:
25fe55af 4054 continue;
fa9a63c5
RM
4055
4056
fa9a63c5 4057 case jump:
25fe55af 4058 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4059 if (j < 0)
4060 /* Backward jumps can only go back to code that we've already
4061 visited. `re_compile' should make sure this is true. */
4062 break;
25fe55af 4063 p += j;
505bde11
SM
4064 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4065 {
4066 case on_failure_jump:
4067 case on_failure_keep_string_jump:
505bde11 4068 case on_failure_jump_loop:
0683b6fa 4069 case on_failure_jump_nastyloop:
505bde11
SM
4070 case on_failure_jump_smart:
4071 p++;
4072 break;
4073 default:
4074 continue;
4075 };
4076 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4077 to jump back to "just after here". */
4078 /* Fallthrough */
fa9a63c5 4079
25fe55af
RS
4080 case on_failure_jump:
4081 case on_failure_keep_string_jump:
0683b6fa 4082 case on_failure_jump_nastyloop:
505bde11
SM
4083 case on_failure_jump_loop:
4084 case on_failure_jump_smart:
25fe55af 4085 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4086 if (p + j <= p1)
ed0767d8 4087 ; /* Backward jump to be ignored. */
01618498
SM
4088 else
4089 { /* We have to look down both arms.
4090 We first go down the "straight" path so as to minimize
4091 stack usage when going through alternatives. */
4092 int r = analyse_first (p, pend, fastmap, multibyte);
4093 if (r) return r;
4094 p += j;
4095 }
25fe55af 4096 continue;
fa9a63c5
RM
4097
4098
ed0767d8
SM
4099 case jump_n:
4100 /* This code simply does not properly handle forward jump_n. */
4101 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4102 p += 4;
4103 /* jump_n can either jump or fall through. The (backward) jump
4104 case has already been handled, so we only need to look at the
4105 fallthrough case. */
4106 continue;
177c0ea7 4107
fa9a63c5 4108 case succeed_n:
ed0767d8
SM
4109 /* If N == 0, it should be an on_failure_jump_loop instead. */
4110 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4111 p += 4;
4112 /* We only care about one iteration of the loop, so we don't
4113 need to consider the case where this behaves like an
4114 on_failure_jump. */
25fe55af 4115 continue;
fa9a63c5
RM
4116
4117
4118 case set_number_at:
25fe55af
RS
4119 p += 4;
4120 continue;
fa9a63c5
RM
4121
4122
4123 case start_memory:
25fe55af 4124 case stop_memory:
505bde11 4125 p += 1;
fa9a63c5
RM
4126 continue;
4127
4128
4129 default:
25fe55af
RS
4130 abort (); /* We have listed all the cases. */
4131 } /* switch *p++ */
fa9a63c5
RM
4132
4133 /* Getting here means we have found the possible starting
25fe55af 4134 characters for one path of the pattern -- and that the empty
7814e705 4135 string does not match. We need not follow this path further. */
01618498 4136 return 0;
fa9a63c5
RM
4137 } /* while p */
4138
01618498
SM
4139 /* We reached the end without matching anything. */
4140 return 1;
4141
f6a3f532
SM
4142} /* analyse_first */
4143\f
4144/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4145 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4146 characters can start a string that matches the pattern. This fastmap
4147 is used by re_search to skip quickly over impossible starting points.
4148
4149 Character codes above (1 << BYTEWIDTH) are not represented in the
4150 fastmap, but the leading codes are represented. Thus, the fastmap
4151 indicates which character sets could start a match.
4152
4153 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4154 area as BUFP->fastmap.
4155
4156 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4157 the pattern buffer.
4158
4159 Returns 0 if we succeed, -2 if an internal error. */
4160
4161int
4162re_compile_fastmap (bufp)
4163 struct re_pattern_buffer *bufp;
4164{
4165 char *fastmap = bufp->fastmap;
4166 int analysis;
4167
4168 assert (fastmap && bufp->buffer);
4169
7814e705 4170 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4171 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4172
4173 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4174 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4175 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4176 return 0;
4177} /* re_compile_fastmap */
4178\f
4179/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4180 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4181 this memory for recording register information. STARTS and ENDS
4182 must be allocated using the malloc library routine, and must each
4183 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4184
4185 If NUM_REGS == 0, then subsequent matches should allocate their own
4186 register data.
4187
4188 Unless this function is called, the first search or match using
4189 PATTERN_BUFFER will allocate its own register data, without
4190 freeing the old data. */
4191
4192void
4193re_set_registers (bufp, regs, num_regs, starts, ends)
4194 struct re_pattern_buffer *bufp;
4195 struct re_registers *regs;
4196 unsigned num_regs;
4197 regoff_t *starts, *ends;
4198{
4199 if (num_regs)
4200 {
4201 bufp->regs_allocated = REGS_REALLOCATE;
4202 regs->num_regs = num_regs;
4203 regs->start = starts;
4204 regs->end = ends;
4205 }
4206 else
4207 {
4208 bufp->regs_allocated = REGS_UNALLOCATED;
4209 regs->num_regs = 0;
4210 regs->start = regs->end = (regoff_t *) 0;
4211 }
4212}
c0f9ea08 4213WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4214\f
7814e705 4215/* Searching routines. */
fa9a63c5
RM
4216
4217/* Like re_search_2, below, but only one string is specified, and
4218 doesn't let you say where to stop matching. */
4219
4220int
4221re_search (bufp, string, size, startpos, range, regs)
4222 struct re_pattern_buffer *bufp;
4223 const char *string;
4224 int size, startpos, range;
4225 struct re_registers *regs;
4226{
5e69f11e 4227 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4228 regs, size);
4229}
c0f9ea08 4230WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4231
70806df6
KH
4232/* Head address of virtual concatenation of string. */
4233#define HEAD_ADDR_VSTRING(P) \
4234 (((P) >= size1 ? string2 : string1))
4235
b18215fc
RS
4236/* End address of virtual concatenation of string. */
4237#define STOP_ADDR_VSTRING(P) \
4238 (((P) >= size1 ? string2 + size2 : string1 + size1))
4239
4240/* Address of POS in the concatenation of virtual string. */
4241#define POS_ADDR_VSTRING(POS) \
4242 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4243
4244/* Using the compiled pattern in BUFP->buffer, first tries to match the
4245 virtual concatenation of STRING1 and STRING2, starting first at index
4246 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4247
fa9a63c5 4248 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4249
fa9a63c5
RM
4250 RANGE is how far to scan while trying to match. RANGE = 0 means try
4251 only at STARTPOS; in general, the last start tried is STARTPOS +
4252 RANGE.
5e69f11e 4253
fa9a63c5
RM
4254 In REGS, return the indices of the virtual concatenation of STRING1
4255 and STRING2 that matched the entire BUFP->buffer and its contained
4256 subexpressions.
5e69f11e 4257
fa9a63c5
RM
4258 Do not consider matching one past the index STOP in the virtual
4259 concatenation of STRING1 and STRING2.
4260
4261 We return either the position in the strings at which the match was
4262 found, -1 if no match, or -2 if error (such as failure
4263 stack overflow). */
4264
4265int
66f0296e 4266re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4267 struct re_pattern_buffer *bufp;
66f0296e 4268 const char *str1, *str2;
fa9a63c5
RM
4269 int size1, size2;
4270 int startpos;
4271 int range;
4272 struct re_registers *regs;
4273 int stop;
4274{
4275 int val;
66f0296e
SM
4276 re_char *string1 = (re_char*) str1;
4277 re_char *string2 = (re_char*) str2;
fa9a63c5 4278 register char *fastmap = bufp->fastmap;
6676cb1c 4279 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4280 int total_size = size1 + size2;
4281 int endpos = startpos + range;
c0f9ea08 4282 boolean anchored_start;
fa9a63c5 4283
7814e705 4284 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4285 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4286
fa9a63c5
RM
4287 /* Check for out-of-range STARTPOS. */
4288 if (startpos < 0 || startpos > total_size)
4289 return -1;
5e69f11e 4290
fa9a63c5 4291 /* Fix up RANGE if it might eventually take us outside
34597fa9 4292 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4293 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4294 if (endpos < 0)
4295 range = 0 - startpos;
fa9a63c5
RM
4296 else if (endpos > total_size)
4297 range = total_size - startpos;
4298
4299 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4300 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4301 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4302 {
4303 if (startpos > 0)
4304 return -1;
4305 else
7b140fd7 4306 range = 0;
fa9a63c5
RM
4307 }
4308
ae4788a8
RS
4309#ifdef emacs
4310 /* In a forward search for something that starts with \=.
4311 don't keep searching past point. */
4312 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4313 {
7b140fd7
RS
4314 range = PT_BYTE - BEGV_BYTE - startpos;
4315 if (range < 0)
ae4788a8
RS
4316 return -1;
4317 }
4318#endif /* emacs */
4319
fa9a63c5
RM
4320 /* Update the fastmap now if not correct already. */
4321 if (fastmap && !bufp->fastmap_accurate)
01618498 4322 re_compile_fastmap (bufp);
5e69f11e 4323
c8499ba5 4324 /* See whether the pattern is anchored. */
c0f9ea08 4325 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4326
b18215fc 4327#ifdef emacs
cc9b4df2
KH
4328 gl_state.object = re_match_object;
4329 {
99633e97 4330 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4331
4332 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4333 }
b18215fc
RS
4334#endif
4335
fa9a63c5
RM
4336 /* Loop through the string, looking for a place to start matching. */
4337 for (;;)
5e69f11e 4338 {
c8499ba5
RS
4339 /* If the pattern is anchored,
4340 skip quickly past places we cannot match.
4341 We don't bother to treat startpos == 0 specially
4342 because that case doesn't repeat. */
4343 if (anchored_start && startpos > 0)
4344 {
c0f9ea08
SM
4345 if (! ((startpos <= size1 ? string1[startpos - 1]
4346 : string2[startpos - size1 - 1])
4347 == '\n'))
c8499ba5
RS
4348 goto advance;
4349 }
4350
fa9a63c5 4351 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4352 cannot be the start of a match. If the pattern can match the
4353 null string, however, we don't need to skip characters; we want
7814e705 4354 the first null string. */
fa9a63c5
RM
4355 if (fastmap && startpos < total_size && !bufp->can_be_null)
4356 {
66f0296e 4357 register re_char *d;
01618498 4358 register re_wchar_t buf_ch;
e934739e
RS
4359
4360 d = POS_ADDR_VSTRING (startpos);
4361
7814e705 4362 if (range > 0) /* Searching forwards. */
fa9a63c5 4363 {
fa9a63c5
RM
4364 register int lim = 0;
4365 int irange = range;
4366
25fe55af
RS
4367 if (startpos < size1 && startpos + range >= size1)
4368 lim = range - (size1 - startpos);
fa9a63c5 4369
25fe55af
RS
4370 /* Written out as an if-else to avoid testing `translate'
4371 inside the loop. */
28ae27ae
AS
4372 if (RE_TRANSLATE_P (translate))
4373 {
e934739e
RS
4374 if (multibyte)
4375 while (range > lim)
4376 {
4377 int buf_charlen;
4378
4379 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4380 buf_charlen);
4381
4382 buf_ch = RE_TRANSLATE (translate, buf_ch);
4383 if (buf_ch >= 0400
4384 || fastmap[buf_ch])
4385 break;
4386
4387 range -= buf_charlen;
4388 d += buf_charlen;
4389 }
4390 else
cf1982d9
EZ
4391 {
4392 /* Convert *d to integer to shut up GCC's
4393 whining about comparison that is always
4394 true. */
4395 int di = *d;
4396
4397 while (range > lim
4398 && !fastmap[RE_TRANSLATE (translate, di)])
4399 {
4400 di = *(++d);
4401 range--;
4402 }
4403 }
e934739e 4404 }
fa9a63c5 4405 else
4560a582 4406 do
33c46939 4407 {
4560a582
SM
4408 re_char *d_start = d;
4409 while (range > lim && !fastmap[*d])
4410 {
4411 d++;
4412 range--;
4413 }
4414#ifdef emacs
4415 if (multibyte && range > lim)
4416 {
4417 /* Check that we are at the beginning of a char. */
4418 int at_boundary;
4419 AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
4420 if (at_boundary)
4421 break;
4422 else
4423 { /* We have matched an internal byte of a char
4424 rather than the leading byte, so it's a false
4425 positive: we should keep scanning. */
4426 d++; range--;
4427 }
4428 }
4429 else
4430#endif
4431 break;
4432 } while (1);
fa9a63c5
RM
4433
4434 startpos += irange - range;
4435 }
7814e705 4436 else /* Searching backwards. */
fa9a63c5 4437 {
2d1675e4
SM
4438 int room = (startpos >= size1
4439 ? size2 + size1 - startpos
4440 : size1 - startpos);
4441 buf_ch = RE_STRING_CHAR (d, room);
4442 buf_ch = TRANSLATE (buf_ch);
fa9a63c5 4443
e934739e
RS
4444 if (! (buf_ch >= 0400
4445 || fastmap[buf_ch]))
fa9a63c5
RM
4446 goto advance;
4447 }
4448 }
4449
4450 /* If can't match the null string, and that's all we have left, fail. */
4451 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4452 && !bufp->can_be_null)
fa9a63c5
RM
4453 return -1;
4454
4455 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4456 startpos, regs, stop);
4457#ifndef REGEX_MALLOC
0b32bf0e 4458# ifdef C_ALLOCA
fa9a63c5 4459 alloca (0);
0b32bf0e 4460# endif
fa9a63c5
RM
4461#endif
4462
4463 if (val >= 0)
4464 return startpos;
5e69f11e 4465
fa9a63c5
RM
4466 if (val == -2)
4467 return -2;
4468
4469 advance:
5e69f11e 4470 if (!range)
25fe55af 4471 break;
5e69f11e 4472 else if (range > 0)
25fe55af 4473 {
b18215fc
RS
4474 /* Update STARTPOS to the next character boundary. */
4475 if (multibyte)
4476 {
66f0296e
SM
4477 re_char *p = POS_ADDR_VSTRING (startpos);
4478 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4479 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4480
4481 range -= len;
4482 if (range < 0)
4483 break;
4484 startpos += len;
4485 }
4486 else
4487 {
b560c397
RS
4488 range--;
4489 startpos++;
4490 }
e318085a 4491 }
fa9a63c5 4492 else
25fe55af
RS
4493 {
4494 range++;
4495 startpos--;
b18215fc
RS
4496
4497 /* Update STARTPOS to the previous character boundary. */
4498 if (multibyte)
4499 {
70806df6
KH
4500 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4501 re_char *p0 = p;
4502 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4503
4504 /* Find the head of multibyte form. */
70806df6
KH
4505 PREV_CHAR_BOUNDARY (p, phead);
4506 range += p0 - 1 - p;
4507 if (range > 0)
4508 break;
b18215fc 4509
70806df6 4510 startpos -= p0 - 1 - p;
b18215fc 4511 }
25fe55af 4512 }
fa9a63c5
RM
4513 }
4514 return -1;
4515} /* re_search_2 */
c0f9ea08 4516WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4517\f
4518/* Declarations and macros for re_match_2. */
4519
2d1675e4
SM
4520static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4521 register int len,
4522 RE_TRANSLATE_TYPE translate,
4523 const int multibyte));
fa9a63c5
RM
4524
4525/* This converts PTR, a pointer into one of the search strings `string1'
4526 and `string2' into an offset from the beginning of that string. */
4527#define POINTER_TO_OFFSET(ptr) \
4528 (FIRST_STRING_P (ptr) \
4529 ? ((regoff_t) ((ptr) - string1)) \
4530 : ((regoff_t) ((ptr) - string2 + size1)))
4531
fa9a63c5 4532/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4533 string2 if necessary.
4534 Check re_match_2_internal for a discussion of why end_match_2 might
4535 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4536#define PREFETCH() \
25fe55af 4537 while (d == dend) \
fa9a63c5
RM
4538 { \
4539 /* End of string2 => fail. */ \
25fe55af
RS
4540 if (dend == end_match_2) \
4541 goto fail; \
4bb91c68 4542 /* End of string1 => advance to string2. */ \
25fe55af 4543 d = string2; \
fa9a63c5
RM
4544 dend = end_match_2; \
4545 }
4546
f1ad044f
SM
4547/* Call before fetching a char with *d if you already checked other limits.
4548 This is meant for use in lookahead operations like wordend, etc..
4549 where we might need to look at parts of the string that might be
4550 outside of the LIMITs (i.e past `stop'). */
4551#define PREFETCH_NOLIMIT() \
4552 if (d == end1) \
4553 { \
4554 d = string2; \
4555 dend = end_match_2; \
4556 } \
fa9a63c5
RM
4557
4558/* Test if at very beginning or at very end of the virtual concatenation
7814e705 4559 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4560#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4561#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4562
4563
4564/* Test if D points to a character which is word-constituent. We have
4565 two special cases to check for: if past the end of string1, look at
4566 the first character in string2; and if before the beginning of
4567 string2, look at the last character in string1. */
4568#define WORDCHAR_P(d) \
4569 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4570 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4571 == Sword)
4572
9121ca40 4573/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4574
4575/* The comment at case wordbound is following one, but we don't use
4576 AT_WORD_BOUNDARY anymore to support multibyte form.
4577
4578 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af 4579 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7814e705 4580 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4581 macro and introducing temporary variables works around the bug. */
4582
9121ca40 4583#if 0
fa9a63c5
RM
4584/* Test if the character before D and the one at D differ with respect
4585 to being word-constituent. */
4586#define AT_WORD_BOUNDARY(d) \
4587 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4588 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4589#endif
fa9a63c5
RM
4590
4591/* Free everything we malloc. */
4592#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4593# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4594# define FREE_VARIABLES() \
fa9a63c5
RM
4595 do { \
4596 REGEX_FREE_STACK (fail_stack.stack); \
4597 FREE_VAR (regstart); \
4598 FREE_VAR (regend); \
fa9a63c5
RM
4599 FREE_VAR (best_regstart); \
4600 FREE_VAR (best_regend); \
fa9a63c5
RM
4601 } while (0)
4602#else
0b32bf0e 4603# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4604#endif /* not MATCH_MAY_ALLOCATE */
4605
505bde11
SM
4606\f
4607/* Optimization routines. */
4608
4e8a9132
SM
4609/* If the operation is a match against one or more chars,
4610 return a pointer to the next operation, else return NULL. */
01618498 4611static re_char *
4e8a9132 4612skip_one_char (p)
01618498 4613 re_char *p;
4e8a9132
SM
4614{
4615 switch (SWITCH_ENUM_CAST (*p++))
4616 {
4617 case anychar:
4618 break;
177c0ea7 4619
4e8a9132
SM
4620 case exactn:
4621 p += *p + 1;
4622 break;
4623
4624 case charset_not:
4625 case charset:
4626 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4627 {
4628 int mcnt;
4629 p = CHARSET_RANGE_TABLE (p - 1);
4630 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4631 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4632 }
4633 else
4634 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4635 break;
177c0ea7 4636
4e8a9132
SM
4637 case syntaxspec:
4638 case notsyntaxspec:
1fb352e0 4639#ifdef emacs
4e8a9132
SM
4640 case categoryspec:
4641 case notcategoryspec:
4642#endif /* emacs */
4643 p++;
4644 break;
4645
4646 default:
4647 p = NULL;
4648 }
4649 return p;
4650}
4651
4652
505bde11 4653/* Jump over non-matching operations. */
275a3409 4654static re_char *
4e8a9132 4655skip_noops (p, pend)
275a3409 4656 re_char *p, *pend;
505bde11
SM
4657{
4658 int mcnt;
4659 while (p < pend)
4660 {
4661 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4662 {
4663 case start_memory:
505bde11
SM
4664 case stop_memory:
4665 p += 2; break;
4666 case no_op:
4667 p += 1; break;
4668 case jump:
4669 p += 1;
4670 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4671 p += mcnt;
4672 break;
4673 default:
4674 return p;
4675 }
4676 }
4677 assert (p == pend);
4678 return p;
4679}
4680
4681/* Non-zero if "p1 matches something" implies "p2 fails". */
4682static int
4683mutually_exclusive_p (bufp, p1, p2)
4684 struct re_pattern_buffer *bufp;
275a3409 4685 re_char *p1, *p2;
505bde11 4686{
4e8a9132 4687 re_opcode_t op2;
2d1675e4 4688 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4689 unsigned char *pend = bufp->buffer + bufp->used;
4690
4e8a9132 4691 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4692 && p2 >= bufp->buffer && p2 <= pend);
4693
4694 /* Skip over open/close-group commands.
4695 If what follows this loop is a ...+ construct,
4696 look at what begins its body, since we will have to
4697 match at least one of that. */
4e8a9132
SM
4698 p2 = skip_noops (p2, pend);
4699 /* The same skip can be done for p1, except that this function
4700 is only used in the case where p1 is a simple match operator. */
4701 /* p1 = skip_noops (p1, pend); */
4702
4703 assert (p1 >= bufp->buffer && p1 < pend
4704 && p2 >= bufp->buffer && p2 <= pend);
4705
4706 op2 = p2 == pend ? succeed : *p2;
4707
4708 switch (SWITCH_ENUM_CAST (op2))
505bde11 4709 {
4e8a9132
SM
4710 case succeed:
4711 case endbuf:
4712 /* If we're at the end of the pattern, we can change. */
4713 if (skip_one_char (p1))
505bde11 4714 {
505bde11
SM
4715 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4716 return 1;
505bde11 4717 }
4e8a9132 4718 break;
177c0ea7 4719
4e8a9132 4720 case endline:
4e8a9132
SM
4721 case exactn:
4722 {
01618498 4723 register re_wchar_t c
4e8a9132 4724 = (re_opcode_t) *p2 == endline ? '\n'
411e4203 4725 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
505bde11 4726
4e8a9132
SM
4727 if ((re_opcode_t) *p1 == exactn)
4728 {
4729 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4730 {
4731 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4732 return 1;
4733 }
4734 }
505bde11 4735
4e8a9132
SM
4736 else if ((re_opcode_t) *p1 == charset
4737 || (re_opcode_t) *p1 == charset_not)
4738 {
4739 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4740
4e8a9132
SM
4741 /* Test if C is listed in charset (or charset_not)
4742 at `p1'. */
4743 if (SINGLE_BYTE_CHAR_P (c))
4744 {
4745 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4746 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4747 not = !not;
4748 }
4749 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4750 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4751
4e8a9132
SM
4752 /* `not' is equal to 1 if c would match, which means
4753 that we can't change to pop_failure_jump. */
4754 if (!not)
4755 {
4756 DEBUG_PRINT1 (" No match => fast loop.\n");
4757 return 1;
4758 }
4759 }
4760 else if ((re_opcode_t) *p1 == anychar
4761 && c == '\n')
4762 {
4763 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4764 return 1;
4765 }
4766 }
4767 break;
505bde11 4768
4e8a9132 4769 case charset:
4e8a9132
SM
4770 {
4771 if ((re_opcode_t) *p1 == exactn)
4772 /* Reuse the code above. */
4773 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4774
505bde11
SM
4775 /* It is hard to list up all the character in charset
4776 P2 if it includes multibyte character. Give up in
4777 such case. */
4778 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4779 {
4780 /* Now, we are sure that P2 has no range table.
4781 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4782 enough. But P1 may have range table, so the
505bde11
SM
4783 size of bitmap table of P1 is extracted by
4784 using macro `CHARSET_BITMAP_SIZE'.
4785
4786 Since we know that all the character listed in
4787 P2 is ASCII, it is enough to test only bitmap
4788 table of P1. */
4789
411e4203 4790 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4791 {
4792 int idx;
4793 /* We win if the charset inside the loop
4794 has no overlap with the one after the loop. */
4795 for (idx = 0;
4796 (idx < (int) p2[1]
4797 && idx < CHARSET_BITMAP_SIZE (p1));
4798 idx++)
4799 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4800 break;
4801
4802 if (idx == p2[1]
4803 || idx == CHARSET_BITMAP_SIZE (p1))
4804 {
4805 DEBUG_PRINT1 (" No match => fast loop.\n");
4806 return 1;
4807 }
4808 }
411e4203 4809 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4810 {
4811 int idx;
4812 /* We win if the charset_not inside the loop lists
7814e705 4813 every character listed in the charset after. */
505bde11
SM
4814 for (idx = 0; idx < (int) p2[1]; idx++)
4815 if (! (p2[2 + idx] == 0
4816 || (idx < CHARSET_BITMAP_SIZE (p1)
4817 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4818 break;
4819
4e8a9132
SM
4820 if (idx == p2[1])
4821 {
4822 DEBUG_PRINT1 (" No match => fast loop.\n");
4823 return 1;
4824 }
4825 }
4826 }
4827 }
609b757a 4828 break;
177c0ea7 4829
411e4203
SM
4830 case charset_not:
4831 switch (SWITCH_ENUM_CAST (*p1))
4832 {
4833 case exactn:
4834 case charset:
4835 /* Reuse the code above. */
4836 return mutually_exclusive_p (bufp, p2, p1);
4837 case charset_not:
4838 /* When we have two charset_not, it's very unlikely that
4839 they don't overlap. The union of the two sets of excluded
4840 chars should cover all possible chars, which, as a matter of
4841 fact, is virtually impossible in multibyte buffers. */
36595814 4842 break;
411e4203
SM
4843 }
4844 break;
4845
4e8a9132 4846 case wordend:
669fa600
SM
4847 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4848 case symend:
4e8a9132 4849 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4850 && (p1[1] == Ssymbol || p1[1] == Sword));
4851 case notsyntaxspec:
4852 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4853
4854 case wordbeg:
669fa600
SM
4855 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4856 case symbeg:
4e8a9132 4857 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4858 && (p1[1] == Ssymbol || p1[1] == Sword));
4859 case syntaxspec:
4860 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4861
4862 case wordbound:
4863 return (((re_opcode_t) *p1 == notsyntaxspec
4864 || (re_opcode_t) *p1 == syntaxspec)
4865 && p1[1] == Sword);
4866
1fb352e0 4867#ifdef emacs
4e8a9132
SM
4868 case categoryspec:
4869 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4870 case notcategoryspec:
4871 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4872#endif /* emacs */
4873
4874 default:
4875 ;
505bde11
SM
4876 }
4877
4878 /* Safe default. */
4879 return 0;
4880}
4881
fa9a63c5
RM
4882\f
4883/* Matching routines. */
4884
25fe55af 4885#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4886/* re_match is like re_match_2 except it takes only a single string. */
4887
4888int
4889re_match (bufp, string, size, pos, regs)
4890 struct re_pattern_buffer *bufp;
4891 const char *string;
4892 int size, pos;
4893 struct re_registers *regs;
4894{
4bb91c68 4895 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4896 pos, regs, size);
0b32bf0e 4897# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4898 alloca (0);
0b32bf0e 4899# endif
fa9a63c5
RM
4900 return result;
4901}
c0f9ea08 4902WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4903#endif /* not emacs */
4904
b18215fc
RS
4905#ifdef emacs
4906/* In Emacs, this is the string or buffer in which we
7814e705 4907 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4908Lisp_Object re_match_object;
4909#endif
fa9a63c5
RM
4910
4911/* re_match_2 matches the compiled pattern in BUFP against the
4912 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4913 and SIZE2, respectively). We start matching at POS, and stop
4914 matching at STOP.
5e69f11e 4915
fa9a63c5 4916 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 4917 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4918 documentation for exactly how many groups we fill.
4919
4920 We return -1 if no match, -2 if an internal error (such as the
7814e705 4921 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4922 matched substring. */
4923
4924int
4925re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4926 struct re_pattern_buffer *bufp;
4927 const char *string1, *string2;
4928 int size1, size2;
4929 int pos;
4930 struct re_registers *regs;
4931 int stop;
4932{
b18215fc 4933 int result;
25fe55af 4934
b18215fc 4935#ifdef emacs
cc9b4df2
KH
4936 int charpos;
4937 gl_state.object = re_match_object;
99633e97 4938 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4939 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4940#endif
4941
4bb91c68
SM
4942 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4943 (re_char*) string2, size2,
cc9b4df2 4944 pos, regs, stop);
0b32bf0e 4945#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4946 alloca (0);
a60198e5 4947#endif
fa9a63c5
RM
4948 return result;
4949}
c0f9ea08 4950WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5
RM
4951
4952/* This is a separate function so that we can force an alloca cleanup
7814e705 4953 afterwards. */
fa9a63c5
RM
4954static int
4955re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4956 struct re_pattern_buffer *bufp;
66f0296e 4957 re_char *string1, *string2;
fa9a63c5
RM
4958 int size1, size2;
4959 int pos;
4960 struct re_registers *regs;
4961 int stop;
4962{
4963 /* General temporaries. */
4964 int mcnt;
01618498 4965 size_t reg;
66f0296e 4966 boolean not;
fa9a63c5
RM
4967
4968 /* Just past the end of the corresponding string. */
66f0296e 4969 re_char *end1, *end2;
fa9a63c5
RM
4970
4971 /* Pointers into string1 and string2, just past the last characters in
7814e705 4972 each to consider matching. */
66f0296e 4973 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4974
4975 /* Where we are in the data, and the end of the current string. */
66f0296e 4976 re_char *d, *dend;
5e69f11e 4977
99633e97
SM
4978 /* Used sometimes to remember where we were before starting matching
4979 an operator so that we can go back in case of failure. This "atomic"
4980 behavior of matching opcodes is indispensable to the correctness
4981 of the on_failure_keep_string_jump optimization. */
4982 re_char *dfail;
4983
fa9a63c5 4984 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4985 re_char *p = bufp->buffer;
4986 re_char *pend = p + bufp->used;
fa9a63c5 4987
7814e705 4988 /* We use this to map every character in the string. */
6676cb1c 4989 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4990
46ac3660 4991 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4992 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4993
fa9a63c5
RM
4994 /* Failure point stack. Each place that can handle a failure further
4995 down the line pushes a failure point on this stack. It consists of
505bde11 4996 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4997 the subexpressions we're currently inside, plus the number of such
4998 registers, and, finally, two char *'s. The first char * is where
4999 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5000 scanning the strings. */
5001#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5002 fail_stack_type fail_stack;
5003#endif
5004#ifdef DEBUG
fa9a63c5
RM
5005 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5006#endif
5007
0b32bf0e 5008#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5009 /* This holds the pointer to the failure stack, when
5010 it is allocated relocatably. */
5011 fail_stack_elt_t *failure_stack_ptr;
99633e97 5012#endif
fa9a63c5
RM
5013
5014 /* We fill all the registers internally, independent of what we
7814e705 5015 return, for use in backreferences. The number here includes
fa9a63c5 5016 an element for register zero. */
4bb91c68 5017 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5018
fa9a63c5
RM
5019 /* Information on the contents of registers. These are pointers into
5020 the input strings; they record just what was matched (on this
5021 attempt) by a subexpression part of the pattern, that is, the
5022 regnum-th regstart pointer points to where in the pattern we began
5023 matching and the regnum-th regend points to right after where we
5024 stopped matching the regnum-th subexpression. (The zeroth register
5025 keeps track of what the whole pattern matches.) */
5026#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5027 re_char **regstart, **regend;
fa9a63c5
RM
5028#endif
5029
fa9a63c5 5030 /* The following record the register info as found in the above
5e69f11e 5031 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5032 This happens as we backtrack through the failure points, which in
5033 turn happens only if we have not yet matched the entire string. */
5034 unsigned best_regs_set = false;
5035#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5036 re_char **best_regstart, **best_regend;
fa9a63c5 5037#endif
5e69f11e 5038
fa9a63c5
RM
5039 /* Logically, this is `best_regend[0]'. But we don't want to have to
5040 allocate space for that if we're not allocating space for anything
7814e705 5041 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5042 any of the other register vectors, and it seems rather a kludge to
5043 treat `best_regend' differently than the rest. So we keep track of
5044 the end of the best match so far in a separate variable. We
5045 initialize this to NULL so that when we backtrack the first time
5046 and need to test it, it's not garbage. */
66f0296e 5047 re_char *match_end = NULL;
fa9a63c5 5048
fa9a63c5
RM
5049#ifdef DEBUG
5050 /* Counts the total number of registers pushed. */
5e69f11e 5051 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5052#endif
5053
5054 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5055
fa9a63c5 5056 INIT_FAIL_STACK ();
5e69f11e 5057
fa9a63c5
RM
5058#ifdef MATCH_MAY_ALLOCATE
5059 /* Do not bother to initialize all the register variables if there are
5060 no groups in the pattern, as it takes a fair amount of time. If
5061 there are groups, we include space for register 0 (the whole
5062 pattern), even though we never use it, since it simplifies the
5063 array indexing. We should fix this. */
5064 if (bufp->re_nsub)
5065 {
66f0296e
SM
5066 regstart = REGEX_TALLOC (num_regs, re_char *);
5067 regend = REGEX_TALLOC (num_regs, re_char *);
5068 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5069 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5070
505bde11 5071 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5072 {
5073 FREE_VARIABLES ();
5074 return -2;
5075 }
fa9a63c5
RM
5076 }
5077 else
5078 {
5079 /* We must initialize all our variables to NULL, so that
25fe55af 5080 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5081 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5082 }
5083#endif /* MATCH_MAY_ALLOCATE */
5084
5085 /* The starting position is bogus. */
5086 if (pos < 0 || pos > size1 + size2)
5087 {
5088 FREE_VARIABLES ();
5089 return -1;
5090 }
5e69f11e 5091
fa9a63c5
RM
5092 /* Initialize subexpression text positions to -1 to mark ones that no
5093 start_memory/stop_memory has been seen for. Also initialize the
5094 register information struct. */
01618498
SM
5095 for (reg = 1; reg < num_regs; reg++)
5096 regstart[reg] = regend[reg] = NULL;
99633e97 5097
fa9a63c5 5098 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5099 `string1' is null. */
fa9a63c5
RM
5100 if (size2 == 0 && string1 != NULL)
5101 {
5102 string2 = string1;
5103 size2 = size1;
5104 string1 = 0;
5105 size1 = 0;
5106 }
5107 end1 = string1 + size1;
5108 end2 = string2 + size2;
5109
5e69f11e 5110 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5111 `dend' is the end of the input string that `d' points within. `d'
5112 is advanced into the following input string whenever necessary, but
5113 this happens before fetching; therefore, at the beginning of the
5114 loop, `d' can be pointing at the end of a string, but it cannot
5115 equal `string2'. */
419d1c74 5116 if (pos >= size1)
fa9a63c5 5117 {
419d1c74
SM
5118 /* Only match within string2. */
5119 d = string2 + pos - size1;
5120 dend = end_match_2 = string2 + stop - size1;
5121 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5122 }
5123 else
5124 {
f1ad044f 5125 if (stop < size1)
419d1c74
SM
5126 {
5127 /* Only match within string1. */
5128 end_match_1 = string1 + stop;
5129 /* BEWARE!
5130 When we reach end_match_1, PREFETCH normally switches to string2.
5131 But in the present case, this means that just doing a PREFETCH
5132 makes us jump from `stop' to `gap' within the string.
5133 What we really want here is for the search to stop as
5134 soon as we hit end_match_1. That's why we set end_match_2
5135 to end_match_1 (since PREFETCH fails as soon as we hit
5136 end_match_2). */
5137 end_match_2 = end_match_1;
5138 }
5139 else
f1ad044f
SM
5140 { /* It's important to use this code when stop == size so that
5141 moving `d' from end1 to string2 will not prevent the d == dend
5142 check from catching the end of string. */
419d1c74
SM
5143 end_match_1 = end1;
5144 end_match_2 = string2 + stop - size1;
5145 }
5146 d = string1 + pos;
5147 dend = end_match_1;
fa9a63c5
RM
5148 }
5149
5150 DEBUG_PRINT1 ("The compiled pattern is: ");
5151 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5152 DEBUG_PRINT1 ("The string to match is: `");
5153 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5154 DEBUG_PRINT1 ("'\n");
5e69f11e 5155
7814e705 5156 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5157 function if the match is complete, or it drops through if the match
5158 fails at this starting point in the input data. */
5159 for (;;)
5160 {
505bde11 5161 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5162
5163 if (p == pend)
5164 { /* End of pattern means we might have succeeded. */
25fe55af 5165 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5166
fa9a63c5 5167 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5168 longest match, try backtracking. */
5169 if (d != end_match_2)
fa9a63c5
RM
5170 {
5171 /* 1 if this match ends in the same string (string1 or string2)
5172 as the best previous match. */
5e69f11e 5173 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5174 == FIRST_STRING_P (d));
fa9a63c5
RM
5175 /* 1 if this match is the best seen so far. */
5176 boolean best_match_p;
5177
5178 /* AIX compiler got confused when this was combined
7814e705 5179 with the previous declaration. */
fa9a63c5
RM
5180 if (same_str_p)
5181 best_match_p = d > match_end;
5182 else
99633e97 5183 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5184
25fe55af
RS
5185 DEBUG_PRINT1 ("backtracking.\n");
5186
5187 if (!FAIL_STACK_EMPTY ())
5188 { /* More failure points to try. */
5189
5190 /* If exceeds best match so far, save it. */
5191 if (!best_regs_set || best_match_p)
5192 {
5193 best_regs_set = true;
5194 match_end = d;
5195
5196 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5197
01618498 5198 for (reg = 1; reg < num_regs; reg++)
25fe55af 5199 {
01618498
SM
5200 best_regstart[reg] = regstart[reg];
5201 best_regend[reg] = regend[reg];
25fe55af
RS
5202 }
5203 }
5204 goto fail;
5205 }
5206
5207 /* If no failure points, don't restore garbage. And if
5208 last match is real best match, don't restore second
5209 best one. */
5210 else if (best_regs_set && !best_match_p)
5211 {
5212 restore_best_regs:
5213 /* Restore best match. It may happen that `dend ==
5214 end_match_1' while the restored d is in string2.
5215 For example, the pattern `x.*y.*z' against the
5216 strings `x-' and `y-z-', if the two strings are
7814e705 5217 not consecutive in memory. */
25fe55af
RS
5218 DEBUG_PRINT1 ("Restoring best registers.\n");
5219
5220 d = match_end;
5221 dend = ((d >= string1 && d <= end1)
5222 ? end_match_1 : end_match_2);
fa9a63c5 5223
01618498 5224 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5225 {
01618498
SM
5226 regstart[reg] = best_regstart[reg];
5227 regend[reg] = best_regend[reg];
fa9a63c5 5228 }
25fe55af
RS
5229 }
5230 } /* d != end_match_2 */
fa9a63c5
RM
5231
5232 succeed_label:
25fe55af 5233 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5234
25fe55af
RS
5235 /* If caller wants register contents data back, do it. */
5236 if (regs && !bufp->no_sub)
fa9a63c5 5237 {
25fe55af
RS
5238 /* Have the register data arrays been allocated? */
5239 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5240 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5241 extra element beyond `num_regs' for the `-1' marker
5242 GNU code uses. */
5243 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5244 regs->start = TALLOC (regs->num_regs, regoff_t);
5245 regs->end = TALLOC (regs->num_regs, regoff_t);
5246 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5247 {
5248 FREE_VARIABLES ();
5249 return -2;
5250 }
25fe55af
RS
5251 bufp->regs_allocated = REGS_REALLOCATE;
5252 }
5253 else if (bufp->regs_allocated == REGS_REALLOCATE)
5254 { /* Yes. If we need more elements than were already
5255 allocated, reallocate them. If we need fewer, just
5256 leave it alone. */
5257 if (regs->num_regs < num_regs + 1)
5258 {
5259 regs->num_regs = num_regs + 1;
5260 RETALLOC (regs->start, regs->num_regs, regoff_t);
5261 RETALLOC (regs->end, regs->num_regs, regoff_t);
5262 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5263 {
5264 FREE_VARIABLES ();
5265 return -2;
5266 }
25fe55af
RS
5267 }
5268 }
5269 else
fa9a63c5
RM
5270 {
5271 /* These braces fend off a "empty body in an else-statement"
7814e705 5272 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5273 assert (bufp->regs_allocated == REGS_FIXED);
5274 }
5275
25fe55af
RS
5276 /* Convert the pointer data in `regstart' and `regend' to
5277 indices. Register zero has to be set differently,
5278 since we haven't kept track of any info for it. */
5279 if (regs->num_regs > 0)
5280 {
5281 regs->start[0] = pos;
99633e97 5282 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5283 }
5e69f11e 5284
25fe55af
RS
5285 /* Go through the first `min (num_regs, regs->num_regs)'
5286 registers, since that is all we initialized. */
01618498 5287 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5288 {
01618498
SM
5289 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5290 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5291 else
5292 {
01618498
SM
5293 regs->start[reg]
5294 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5295 regs->end[reg]
5296 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5297 }
fa9a63c5 5298 }
5e69f11e 5299
25fe55af
RS
5300 /* If the regs structure we return has more elements than
5301 were in the pattern, set the extra elements to -1. If
5302 we (re)allocated the registers, this is the case,
5303 because we always allocate enough to have at least one
7814e705 5304 -1 at the end. */
01618498
SM
5305 for (reg = num_regs; reg < regs->num_regs; reg++)
5306 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5307 } /* regs && !bufp->no_sub */
5308
25fe55af
RS
5309 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5310 nfailure_points_pushed, nfailure_points_popped,
5311 nfailure_points_pushed - nfailure_points_popped);
5312 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5313
99633e97 5314 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5315
25fe55af 5316 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5317
25fe55af
RS
5318 FREE_VARIABLES ();
5319 return mcnt;
5320 }
fa9a63c5 5321
7814e705 5322 /* Otherwise match next pattern command. */
fa9a63c5
RM
5323 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5324 {
25fe55af
RS
5325 /* Ignore these. Used to ignore the n of succeed_n's which
5326 currently have n == 0. */
5327 case no_op:
5328 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5329 break;
fa9a63c5
RM
5330
5331 case succeed:
25fe55af 5332 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5333 goto succeed_label;
5334
7814e705 5335 /* Match the next n pattern characters exactly. The following
25fe55af 5336 byte in the pattern defines n, and the n bytes after that
7814e705 5337 are the characters to match. */
fa9a63c5
RM
5338 case exactn:
5339 mcnt = *p++;
25fe55af 5340 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5341
99633e97
SM
5342 /* Remember the start point to rollback upon failure. */
5343 dfail = d;
5344
25fe55af
RS
5345 /* This is written out as an if-else so we don't waste time
5346 testing `translate' inside the loop. */
28703c16 5347 if (RE_TRANSLATE_P (translate))
fa9a63c5 5348 {
e934739e
RS
5349 if (multibyte)
5350 do
5351 {
5352 int pat_charlen, buf_charlen;
e71b1971 5353 unsigned int pat_ch, buf_ch;
e934739e
RS
5354
5355 PREFETCH ();
5356 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5357 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5358
5359 if (RE_TRANSLATE (translate, buf_ch)
5360 != pat_ch)
99633e97
SM
5361 {
5362 d = dfail;
5363 goto fail;
5364 }
e934739e
RS
5365
5366 p += pat_charlen;
5367 d += buf_charlen;
5368 mcnt -= pat_charlen;
5369 }
5370 while (mcnt > 0);
5371 else
e934739e
RS
5372 do
5373 {
cf1982d9
EZ
5374 /* Avoid compiler whining about comparison being
5375 always true. */
5376 int di;
5377
e934739e 5378 PREFETCH ();
cf1982d9
EZ
5379 di = *d;
5380 if (RE_TRANSLATE (translate, di) != *p++)
99633e97
SM
5381 {
5382 d = dfail;
5383 goto fail;
5384 }
33c46939 5385 d++;
e934739e
RS
5386 }
5387 while (--mcnt);
fa9a63c5
RM
5388 }
5389 else
5390 {
5391 do
5392 {
5393 PREFETCH ();
99633e97
SM
5394 if (*d++ != *p++)
5395 {
5396 d = dfail;
5397 goto fail;
5398 }
fa9a63c5
RM
5399 }
5400 while (--mcnt);
5401 }
25fe55af 5402 break;
fa9a63c5
RM
5403
5404
25fe55af 5405 /* Match any character except possibly a newline or a null. */
fa9a63c5 5406 case anychar:
e934739e
RS
5407 {
5408 int buf_charlen;
01618498 5409 re_wchar_t buf_ch;
fa9a63c5 5410
e934739e 5411 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5412
e934739e 5413 PREFETCH ();
2d1675e4 5414 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
5415 buf_ch = TRANSLATE (buf_ch);
5416
5417 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5418 && buf_ch == '\n')
5419 || ((bufp->syntax & RE_DOT_NOT_NULL)
5420 && buf_ch == '\000'))
5421 goto fail;
5422
e934739e
RS
5423 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5424 d += buf_charlen;
5425 }
fa9a63c5
RM
5426 break;
5427
5428
5429 case charset:
5430 case charset_not:
5431 {
b18215fc 5432 register unsigned int c;
fa9a63c5 5433 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5434 int len;
5435
5436 /* Start of actual range_table, or end of bitmap if there is no
5437 range table. */
01618498 5438 re_char *range_table;
b18215fc 5439
96cc36cc 5440 /* Nonzero if there is a range table. */
b18215fc
RS
5441 int range_table_exists;
5442
96cc36cc
RS
5443 /* Number of ranges of range table. This is not included
5444 in the initial byte-length of the command. */
5445 int count = 0;
fa9a63c5 5446
25fe55af 5447 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5448
b18215fc 5449 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5450
b18215fc 5451 if (range_table_exists)
96cc36cc
RS
5452 {
5453 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5454 EXTRACT_NUMBER_AND_INCR (count, range_table);
5455 }
b18215fc 5456
2d1675e4
SM
5457 PREFETCH ();
5458 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5459 c = TRANSLATE (c); /* The character to match. */
b18215fc
RS
5460
5461 if (SINGLE_BYTE_CHAR_P (c))
5462 { /* Lookup bitmap. */
b18215fc
RS
5463 /* Cast to `unsigned' instead of `unsigned char' in
5464 case the bit list is a full 32 bytes long. */
5465 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5466 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5467 not = !not;
b18215fc 5468 }
96cc36cc 5469#ifdef emacs
b18215fc 5470 else if (range_table_exists)
96cc36cc
RS
5471 {
5472 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5473
14473664
SM
5474 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5475 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5476 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5477 | (class_bits & BIT_SPACE && ISSPACE (c))
5478 | (class_bits & BIT_UPPER && ISUPPER (c))
5479 | (class_bits & BIT_WORD && ISWORD (c)))
5480 not = !not;
5481 else
5482 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5483 }
5484#endif /* emacs */
fa9a63c5 5485
96cc36cc
RS
5486 if (range_table_exists)
5487 p = CHARSET_RANGE_TABLE_END (range_table, count);
5488 else
5489 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5490
5491 if (!not) goto fail;
5e69f11e 5492
b18215fc 5493 d += len;
fa9a63c5
RM
5494 break;
5495 }
5496
5497
25fe55af 5498 /* The beginning of a group is represented by start_memory.
505bde11 5499 The argument is the register number. The text
25fe55af 5500 matched within the group is recorded (in the internal
7814e705 5501 registers data structure) under the register number. */
25fe55af 5502 case start_memory:
505bde11
SM
5503 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5504
5505 /* In case we need to undo this operation (via backtracking). */
5506 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5507
25fe55af 5508 regstart[*p] = d;
4bb91c68 5509 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5510 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5511
25fe55af 5512 /* Move past the register number and inner group count. */
505bde11 5513 p += 1;
25fe55af 5514 break;
fa9a63c5
RM
5515
5516
25fe55af 5517 /* The stop_memory opcode represents the end of a group. Its
505bde11 5518 argument is the same as start_memory's: the register number. */
fa9a63c5 5519 case stop_memory:
505bde11
SM
5520 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5521
5522 assert (!REG_UNSET (regstart[*p]));
5523 /* Strictly speaking, there should be code such as:
177c0ea7 5524
0b32bf0e 5525 assert (REG_UNSET (regend[*p]));
505bde11
SM
5526 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5527
5528 But the only info to be pushed is regend[*p] and it is known to
5529 be UNSET, so there really isn't anything to push.
5530 Not pushing anything, on the other hand deprives us from the
5531 guarantee that regend[*p] is UNSET since undoing this operation
5532 will not reset its value properly. This is not important since
5533 the value will only be read on the next start_memory or at
5534 the very end and both events can only happen if this stop_memory
5535 is *not* undone. */
fa9a63c5 5536
25fe55af 5537 regend[*p] = d;
fa9a63c5
RM
5538 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5539
25fe55af 5540 /* Move past the register number and the inner group count. */
505bde11 5541 p += 1;
25fe55af 5542 break;
fa9a63c5
RM
5543
5544
5545 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5546 followed by the numeric value of <digit> as the register number. */
5547 case duplicate:
fa9a63c5 5548 {
66f0296e 5549 register re_char *d2, *dend2;
7814e705 5550 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5551 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5552
7814e705 5553 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5554 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5555 goto fail;
5e69f11e 5556
7814e705 5557 /* Where in input to try to start matching. */
25fe55af 5558 d2 = regstart[regno];
5e69f11e 5559
99633e97
SM
5560 /* Remember the start point to rollback upon failure. */
5561 dfail = d;
5562
25fe55af
RS
5563 /* Where to stop matching; if both the place to start and
5564 the place to stop matching are in the same string, then
5565 set to the place to stop, otherwise, for now have to use
5566 the end of the first string. */
fa9a63c5 5567
25fe55af 5568 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5569 == FIRST_STRING_P (regend[regno]))
5570 ? regend[regno] : end_match_1);
5571 for (;;)
5572 {
5573 /* If necessary, advance to next segment in register
25fe55af 5574 contents. */
fa9a63c5
RM
5575 while (d2 == dend2)
5576 {
5577 if (dend2 == end_match_2) break;
5578 if (dend2 == regend[regno]) break;
5579
25fe55af
RS
5580 /* End of string1 => advance to string2. */
5581 d2 = string2;
5582 dend2 = regend[regno];
fa9a63c5
RM
5583 }
5584 /* At end of register contents => success */
5585 if (d2 == dend2) break;
5586
5587 /* If necessary, advance to next segment in data. */
5588 PREFETCH ();
5589
5590 /* How many characters left in this segment to match. */
5591 mcnt = dend - d;
5e69f11e 5592
fa9a63c5 5593 /* Want how many consecutive characters we can match in
25fe55af
RS
5594 one shot, so, if necessary, adjust the count. */
5595 if (mcnt > dend2 - d2)
fa9a63c5 5596 mcnt = dend2 - d2;
5e69f11e 5597
fa9a63c5 5598 /* Compare that many; failure if mismatch, else move
25fe55af 5599 past them. */
28703c16 5600 if (RE_TRANSLATE_P (translate)
2d1675e4 5601 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5602 : memcmp (d, d2, mcnt))
99633e97
SM
5603 {
5604 d = dfail;
5605 goto fail;
5606 }
fa9a63c5 5607 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5608 }
5609 }
5610 break;
5611
5612
25fe55af 5613 /* begline matches the empty string at the beginning of the string
c0f9ea08 5614 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5615 case begline:
25fe55af 5616 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5617
25fe55af
RS
5618 if (AT_STRINGS_BEG (d))
5619 {
5620 if (!bufp->not_bol) break;
5621 }
419d1c74 5622 else
25fe55af 5623 {
419d1c74
SM
5624 unsigned char c;
5625 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5626 if (c == '\n')
419d1c74 5627 break;
25fe55af
RS
5628 }
5629 /* In all other cases, we fail. */
5630 goto fail;
fa9a63c5
RM
5631
5632
25fe55af 5633 /* endline is the dual of begline. */
fa9a63c5 5634 case endline:
25fe55af 5635 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5636
25fe55af
RS
5637 if (AT_STRINGS_END (d))
5638 {
5639 if (!bufp->not_eol) break;
5640 }
f1ad044f 5641 else
25fe55af 5642 {
f1ad044f 5643 PREFETCH_NOLIMIT ();
c0f9ea08 5644 if (*d == '\n')
f1ad044f 5645 break;
25fe55af
RS
5646 }
5647 goto fail;
fa9a63c5
RM
5648
5649
5650 /* Match at the very beginning of the data. */
25fe55af
RS
5651 case begbuf:
5652 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5653 if (AT_STRINGS_BEG (d))
5654 break;
5655 goto fail;
fa9a63c5
RM
5656
5657
5658 /* Match at the very end of the data. */
25fe55af
RS
5659 case endbuf:
5660 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5661 if (AT_STRINGS_END (d))
5662 break;
25fe55af 5663 goto fail;
5e69f11e 5664
5e69f11e 5665
25fe55af
RS
5666 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5667 pushes NULL as the value for the string on the stack. Then
505bde11 5668 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5669 string, instead of restoring it. To see why, consider
7814e705 5670 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5671 then the . fails against the \n. But the next thing we want
5672 to do is match the \n against the \n; if we restored the
5673 string value, we would be back at the foo.
5674
5675 Because this is used only in specific cases, we don't need to
5676 check all the things that `on_failure_jump' does, to make
5677 sure the right things get saved on the stack. Hence we don't
5678 share its code. The only reason to push anything on the
5679 stack at all is that otherwise we would have to change
5680 `anychar's code to do something besides goto fail in this
5681 case; that seems worse than this. */
5682 case on_failure_keep_string_jump:
505bde11
SM
5683 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5684 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5685 mcnt, p + mcnt);
fa9a63c5 5686
505bde11
SM
5687 PUSH_FAILURE_POINT (p - 3, NULL);
5688 break;
5689
0683b6fa
SM
5690 /* A nasty loop is introduced by the non-greedy *? and +?.
5691 With such loops, the stack only ever contains one failure point
5692 at a time, so that a plain on_failure_jump_loop kind of
5693 cycle detection cannot work. Worse yet, such a detection
5694 can not only fail to detect a cycle, but it can also wrongly
5695 detect a cycle (between different instantiations of the same
6df42991 5696 loop).
0683b6fa
SM
5697 So the method used for those nasty loops is a little different:
5698 We use a special cycle-detection-stack-frame which is pushed
5699 when the on_failure_jump_nastyloop failure-point is *popped*.
5700 This special frame thus marks the beginning of one iteration
5701 through the loop and we can hence easily check right here
5702 whether something matched between the beginning and the end of
5703 the loop. */
5704 case on_failure_jump_nastyloop:
5705 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5706 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5707 mcnt, p + mcnt);
5708
5709 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5710 {
5711 int cycle = 0;
5712 CHECK_INFINITE_LOOP (p - 4, d);
5713 if (!cycle)
5714 /* If there's a cycle, just continue without pushing
5715 this failure point. The failure point is the "try again"
5716 option, which shouldn't be tried.
5717 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5718 PUSH_FAILURE_POINT (p - 3, d);
5719 }
0683b6fa
SM
5720 break;
5721
4e8a9132
SM
5722 /* Simple loop detecting on_failure_jump: just check on the
5723 failure stack if the same spot was already hit earlier. */
505bde11
SM
5724 case on_failure_jump_loop:
5725 on_failure:
5726 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5727 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5728 mcnt, p + mcnt);
6df42991
SM
5729 {
5730 int cycle = 0;
5731 CHECK_INFINITE_LOOP (p - 3, d);
5732 if (cycle)
5733 /* If there's a cycle, get out of the loop, as if the matching
5734 had failed. We used to just `goto fail' here, but that was
5735 aborting the search a bit too early: we want to keep the
5736 empty-loop-match and keep matching after the loop.
5737 We want (x?)*y\1z to match both xxyz and xxyxz. */
5738 p += mcnt;
5739 else
5740 PUSH_FAILURE_POINT (p - 3, d);
5741 }
25fe55af 5742 break;
fa9a63c5
RM
5743
5744
5745 /* Uses of on_failure_jump:
5e69f11e 5746
25fe55af
RS
5747 Each alternative starts with an on_failure_jump that points
5748 to the beginning of the next alternative. Each alternative
5749 except the last ends with a jump that in effect jumps past
5750 the rest of the alternatives. (They really jump to the
5751 ending jump of the following alternative, because tensioning
5752 these jumps is a hassle.)
fa9a63c5 5753
25fe55af
RS
5754 Repeats start with an on_failure_jump that points past both
5755 the repetition text and either the following jump or
5756 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5757 case on_failure_jump:
25fe55af 5758 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5759 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5760 mcnt, p + mcnt);
25fe55af 5761
505bde11 5762 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5763 break;
5764
4e8a9132 5765 /* This operation is used for greedy *.
505bde11
SM
5766 Compare the beginning of the repeat with what in the
5767 pattern follows its end. If we can establish that there
5768 is nothing that they would both match, i.e., that we
5769 would have to backtrack because of (as in, e.g., `a*a')
5770 then we can use a non-backtracking loop based on
4e8a9132 5771 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5772 case on_failure_jump_smart:
25fe55af 5773 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5774 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5775 mcnt, p + mcnt);
25fe55af 5776 {
01618498 5777 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5778 /* Here, we discard `const', making re_match non-reentrant. */
5779 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5780 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5781
505bde11
SM
5782 p -= 3; /* Reset so that we will re-execute the
5783 instruction once it's been changed. */
fa9a63c5 5784
4e8a9132
SM
5785 EXTRACT_NUMBER (mcnt, p2 - 2);
5786
5787 /* Ensure this is a indeed the trivial kind of loop
5788 we are expecting. */
5789 assert (skip_one_char (p1) == p2 - 3);
5790 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5791 DEBUG_STATEMENT (debug += 2);
505bde11 5792 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5793 {
505bde11 5794 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5795 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5796 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5797 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5798 }
505bde11 5799 else
fa9a63c5 5800 {
505bde11
SM
5801 /* Default to a safe `on_failure_jump' loop. */
5802 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5803 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5804 }
99633e97 5805 DEBUG_STATEMENT (debug -= 2);
25fe55af 5806 }
505bde11 5807 break;
25fe55af
RS
5808
5809 /* Unconditionally jump (without popping any failure points). */
5810 case jump:
fa9a63c5 5811 unconditional_jump:
5b370c2b 5812 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5813 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5814 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5815 p += mcnt; /* Do the jump. */
505bde11 5816 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5817 break;
5818
5819
25fe55af
RS
5820 /* Have to succeed matching what follows at least n times.
5821 After that, handle like `on_failure_jump'. */
5822 case succeed_n:
01618498 5823 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5824 EXTRACT_NUMBER (mcnt, p + 2);
5825 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5826
dc1e502d
SM
5827 /* Originally, mcnt is how many times we HAVE to succeed. */
5828 if (mcnt != 0)
25fe55af 5829 {
6dcf2d0e
SM
5830 /* Here, we discard `const', making re_match non-reentrant. */
5831 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5832 mcnt--;
01618498
SM
5833 p += 4;
5834 PUSH_NUMBER (p2, mcnt);
25fe55af 5835 }
dc1e502d
SM
5836 else
5837 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5838 goto on_failure;
25fe55af
RS
5839 break;
5840
5841 case jump_n:
01618498 5842 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5843 EXTRACT_NUMBER (mcnt, p + 2);
5844 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5845
5846 /* Originally, this is how many times we CAN jump. */
dc1e502d 5847 if (mcnt != 0)
25fe55af 5848 {
6dcf2d0e
SM
5849 /* Here, we discard `const', making re_match non-reentrant. */
5850 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5851 mcnt--;
01618498 5852 PUSH_NUMBER (p2, mcnt);
dc1e502d 5853 goto unconditional_jump;
25fe55af
RS
5854 }
5855 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5856 else
5857 p += 4;
25fe55af 5858 break;
5e69f11e 5859
fa9a63c5
RM
5860 case set_number_at:
5861 {
01618498 5862 unsigned char *p2; /* Location of the counter. */
25fe55af 5863 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5864
25fe55af 5865 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5866 /* Here, we discard `const', making re_match non-reentrant. */
5867 p2 = (unsigned char*) p + mcnt;
01618498 5868 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5869 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5870 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5871 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5872 break;
5873 }
9121ca40
KH
5874
5875 case wordbound:
66f0296e
SM
5876 case notwordbound:
5877 not = (re_opcode_t) *(p - 1) == notwordbound;
5878 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5879
99633e97 5880 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5881
b18215fc 5882 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5883 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5884 not = !not;
b18215fc
RS
5885 else
5886 {
5887 /* C1 is the character before D, S1 is the syntax of C1, C2
5888 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5889 re_wchar_t c1, c2;
5890 int s1, s2;
b18215fc 5891#ifdef emacs
2d1675e4
SM
5892 int offset = PTR_TO_OFFSET (d - 1);
5893 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5894 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5895#endif
66f0296e 5896 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5897 s1 = SYNTAX (c1);
5898#ifdef emacs
5d967c7a 5899 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5900#endif
f1ad044f 5901 PREFETCH_NOLIMIT ();
2d1675e4 5902 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc
RS
5903 s2 = SYNTAX (c2);
5904
5905 if (/* Case 2: Only one of S1 and S2 is Sword. */
5906 ((s1 == Sword) != (s2 == Sword))
5907 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 5908 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5909 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5910 not = !not;
5911 }
5912 if (not)
9121ca40 5913 break;
b18215fc 5914 else
9121ca40 5915 goto fail;
fa9a63c5
RM
5916
5917 case wordbeg:
25fe55af 5918 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5919
b18215fc
RS
5920 /* We FAIL in one of the following cases: */
5921
7814e705 5922 /* Case 1: D is at the end of string. */
b18215fc 5923 if (AT_STRINGS_END (d))
99633e97 5924 goto fail;
b18215fc
RS
5925 else
5926 {
5927 /* C1 is the character before D, S1 is the syntax of C1, C2
5928 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5929 re_wchar_t c1, c2;
5930 int s1, s2;
fa9a63c5 5931#ifdef emacs
2d1675e4
SM
5932 int offset = PTR_TO_OFFSET (d);
5933 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5934 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5935#endif
99633e97 5936 PREFETCH ();
2d1675e4 5937 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc 5938 s2 = SYNTAX (c2);
177c0ea7 5939
b18215fc
RS
5940 /* Case 2: S2 is not Sword. */
5941 if (s2 != Sword)
5942 goto fail;
5943
5944 /* Case 3: D is not at the beginning of string ... */
5945 if (!AT_STRINGS_BEG (d))
5946 {
5947 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5948#ifdef emacs
5d967c7a 5949 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5950#endif
b18215fc
RS
5951 s1 = SYNTAX (c1);
5952
5953 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 5954 returns 0. */
b18215fc
RS
5955 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5956 goto fail;
5957 }
5958 }
e318085a
RS
5959 break;
5960
b18215fc 5961 case wordend:
25fe55af 5962 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5963
5964 /* We FAIL in one of the following cases: */
5965
5966 /* Case 1: D is at the beginning of string. */
5967 if (AT_STRINGS_BEG (d))
e318085a 5968 goto fail;
b18215fc
RS
5969 else
5970 {
5971 /* C1 is the character before D, S1 is the syntax of C1, C2
5972 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5973 re_wchar_t c1, c2;
5974 int s1, s2;
5d967c7a 5975#ifdef emacs
2d1675e4
SM
5976 int offset = PTR_TO_OFFSET (d) - 1;
5977 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5978 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5979#endif
99633e97 5980 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5981 s1 = SYNTAX (c1);
5982
5983 /* Case 2: S1 is not Sword. */
5984 if (s1 != Sword)
5985 goto fail;
5986
5987 /* Case 3: D is not at the end of string ... */
5988 if (!AT_STRINGS_END (d))
5989 {
f1ad044f 5990 PREFETCH_NOLIMIT ();
2d1675e4 5991 c2 = RE_STRING_CHAR (d, dend - d);
5d967c7a 5992#ifdef emacs
134579f2 5993 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5d967c7a 5994#endif
b18215fc
RS
5995 s2 = SYNTAX (c2);
5996
5997 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 5998 returns 0. */
b18215fc 5999 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6000 goto fail;
b18215fc
RS
6001 }
6002 }
e318085a
RS
6003 break;
6004
669fa600
SM
6005 case symbeg:
6006 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6007
6008 /* We FAIL in one of the following cases: */
6009
7814e705 6010 /* Case 1: D is at the end of string. */
669fa600
SM
6011 if (AT_STRINGS_END (d))
6012 goto fail;
6013 else
6014 {
6015 /* C1 is the character before D, S1 is the syntax of C1, C2
6016 is the character at D, and S2 is the syntax of C2. */
6017 re_wchar_t c1, c2;
6018 int s1, s2;
6019#ifdef emacs
6020 int offset = PTR_TO_OFFSET (d);
6021 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6022 UPDATE_SYNTAX_TABLE (charpos);
6023#endif
6024 PREFETCH ();
6025 c2 = RE_STRING_CHAR (d, dend - d);
6026 s2 = SYNTAX (c2);
7814e705 6027
669fa600
SM
6028 /* Case 2: S2 is neither Sword nor Ssymbol. */
6029 if (s2 != Sword && s2 != Ssymbol)
6030 goto fail;
6031
6032 /* Case 3: D is not at the beginning of string ... */
6033 if (!AT_STRINGS_BEG (d))
6034 {
6035 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6036#ifdef emacs
6037 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6038#endif
6039 s1 = SYNTAX (c1);
6040
6041 /* ... and S1 is Sword or Ssymbol. */
6042 if (s1 == Sword || s1 == Ssymbol)
6043 goto fail;
6044 }
6045 }
6046 break;
6047
6048 case symend:
6049 DEBUG_PRINT1 ("EXECUTING symend.\n");
6050
6051 /* We FAIL in one of the following cases: */
6052
6053 /* Case 1: D is at the beginning of string. */
6054 if (AT_STRINGS_BEG (d))
6055 goto fail;
6056 else
6057 {
6058 /* C1 is the character before D, S1 is the syntax of C1, C2
6059 is the character at D, and S2 is the syntax of C2. */
6060 re_wchar_t c1, c2;
6061 int s1, s2;
6062#ifdef emacs
6063 int offset = PTR_TO_OFFSET (d) - 1;
6064 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6065 UPDATE_SYNTAX_TABLE (charpos);
6066#endif
6067 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6068 s1 = SYNTAX (c1);
6069
6070 /* Case 2: S1 is neither Ssymbol nor Sword. */
6071 if (s1 != Sword && s1 != Ssymbol)
6072 goto fail;
6073
6074 /* Case 3: D is not at the end of string ... */
6075 if (!AT_STRINGS_END (d))
6076 {
6077 PREFETCH_NOLIMIT ();
6078 c2 = RE_STRING_CHAR (d, dend - d);
6079#ifdef emacs
134579f2 6080 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6081#endif
6082 s2 = SYNTAX (c2);
6083
6084 /* ... and S2 is Sword or Ssymbol. */
6085 if (s2 == Sword || s2 == Ssymbol)
6086 goto fail;
6087 }
6088 }
6089 break;
6090
fa9a63c5 6091 case syntaxspec:
1fb352e0
SM
6092 case notsyntaxspec:
6093 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6094 mcnt = *p++;
1fb352e0 6095 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6096 PREFETCH ();
b18215fc
RS
6097#ifdef emacs
6098 {
2d1675e4
SM
6099 int offset = PTR_TO_OFFSET (d);
6100 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6101 UPDATE_SYNTAX_TABLE (pos1);
6102 }
25fe55af 6103#endif
b18215fc 6104 {
01618498
SM
6105 int len;
6106 re_wchar_t c;
b18215fc 6107
2d1675e4 6108 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6109
990b2375 6110 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6111 goto fail;
b18215fc
RS
6112 d += len;
6113 }
fa9a63c5
RM
6114 break;
6115
b18215fc 6116#ifdef emacs
1fb352e0
SM
6117 case before_dot:
6118 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6119 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6120 goto fail;
b18215fc
RS
6121 break;
6122
1fb352e0
SM
6123 case at_dot:
6124 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6125 if (PTR_BYTE_POS (d) != PT_BYTE)
6126 goto fail;
6127 break;
b18215fc 6128
1fb352e0
SM
6129 case after_dot:
6130 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6131 if (PTR_BYTE_POS (d) <= PT_BYTE)
6132 goto fail;
e318085a 6133 break;
fa9a63c5 6134
1fb352e0 6135 case categoryspec:
b18215fc 6136 case notcategoryspec:
1fb352e0 6137 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6138 mcnt = *p++;
1fb352e0 6139 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6140 PREFETCH ();
6141 {
01618498
SM
6142 int len;
6143 re_wchar_t c;
6144
2d1675e4 6145 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6146
1fb352e0 6147 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6148 goto fail;
6149 d += len;
6150 }
fa9a63c5 6151 break;
5e69f11e 6152
1fb352e0 6153#endif /* emacs */
5e69f11e 6154
0b32bf0e
SM
6155 default:
6156 abort ();
fa9a63c5 6157 }
b18215fc 6158 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6159
6160
6161 /* We goto here if a matching operation fails. */
6162 fail:
5b370c2b 6163 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6164 if (!FAIL_STACK_EMPTY ())
505bde11 6165 {
01618498 6166 re_char *str, *pat;
505bde11 6167 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6168 DEBUG_PRINT1 ("\nFAIL:\n");
6169 POP_FAILURE_POINT (str, pat);
505bde11
SM
6170 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6171 {
6172 case on_failure_keep_string_jump:
6173 assert (str == NULL);
6174 goto continue_failure_jump;
6175
0683b6fa
SM
6176 case on_failure_jump_nastyloop:
6177 assert ((re_opcode_t)pat[-2] == no_op);
6178 PUSH_FAILURE_POINT (pat - 2, str);
6179 /* Fallthrough */
6180
505bde11
SM
6181 case on_failure_jump_loop:
6182 case on_failure_jump:
6183 case succeed_n:
6184 d = str;
6185 continue_failure_jump:
6186 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6187 p = pat + mcnt;
6188 break;
b18215fc 6189
0683b6fa
SM
6190 case no_op:
6191 /* A special frame used for nastyloops. */
6192 goto fail;
6193
505bde11
SM
6194 default:
6195 abort();
6196 }
fa9a63c5 6197
505bde11 6198 assert (p >= bufp->buffer && p <= pend);
b18215fc 6199
0b32bf0e 6200 if (d >= string1 && d <= end1)
fa9a63c5 6201 dend = end_match_1;
0b32bf0e 6202 }
fa9a63c5 6203 else
0b32bf0e 6204 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6205 } /* for (;;) */
6206
6207 if (best_regs_set)
6208 goto restore_best_regs;
6209
6210 FREE_VARIABLES ();
6211
b18215fc 6212 return -1; /* Failure to match. */
fa9a63c5
RM
6213} /* re_match_2 */
6214\f
6215/* Subroutine definitions for re_match_2. */
6216
fa9a63c5
RM
6217/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6218 bytes; nonzero otherwise. */
5e69f11e 6219
fa9a63c5 6220static int
2d1675e4
SM
6221bcmp_translate (s1, s2, len, translate, multibyte)
6222 re_char *s1, *s2;
fa9a63c5 6223 register int len;
6676cb1c 6224 RE_TRANSLATE_TYPE translate;
2d1675e4 6225 const int multibyte;
fa9a63c5 6226{
2d1675e4
SM
6227 register re_char *p1 = s1, *p2 = s2;
6228 re_char *p1_end = s1 + len;
6229 re_char *p2_end = s2 + len;
e934739e 6230
4bb91c68
SM
6231 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6232 different lengths, but relying on a single `len' would break this. -sm */
6233 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6234 {
e934739e 6235 int p1_charlen, p2_charlen;
01618498 6236 re_wchar_t p1_ch, p2_ch;
e934739e 6237
2d1675e4
SM
6238 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
6239 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
e934739e
RS
6240
6241 if (RE_TRANSLATE (translate, p1_ch)
6242 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6243 return 1;
e934739e
RS
6244
6245 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6246 }
e934739e
RS
6247
6248 if (p1 != p1_end || p2 != p2_end)
6249 return 1;
6250
fa9a63c5
RM
6251 return 0;
6252}
6253\f
6254/* Entry points for GNU code. */
6255
6256/* re_compile_pattern is the GNU regular expression compiler: it
6257 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6258 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6259
fa9a63c5
RM
6260 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6261 are set in BUFP on entry.
5e69f11e 6262
b18215fc 6263 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6264
6265const char *
6266re_compile_pattern (pattern, length, bufp)
6267 const char *pattern;
0b32bf0e 6268 size_t length;
fa9a63c5
RM
6269 struct re_pattern_buffer *bufp;
6270{
6271 reg_errcode_t ret;
5e69f11e 6272
1208f11a
RS
6273#ifdef emacs
6274 gl_state.current_syntax_table = current_buffer->syntax_table;
6275#endif
6276
fa9a63c5
RM
6277 /* GNU code is written to assume at least RE_NREGS registers will be set
6278 (and at least one extra will be -1). */
6279 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6280
fa9a63c5
RM
6281 /* And GNU code determines whether or not to get register information
6282 by passing null for the REGS argument to re_match, etc., not by
6283 setting no_sub. */
6284 bufp->no_sub = 0;
5e69f11e 6285
4bb91c68 6286 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6287
6288 if (!ret)
6289 return NULL;
6290 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6291}
c0f9ea08 6292WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6293\f
b18215fc
RS
6294/* Entry points compatible with 4.2 BSD regex library. We don't define
6295 them unless specifically requested. */
fa9a63c5 6296
0b32bf0e 6297#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6298
6299/* BSD has one and only one pattern buffer. */
6300static struct re_pattern_buffer re_comp_buf;
6301
6302char *
0b32bf0e 6303# ifdef _LIBC
48afdd44
RM
6304/* Make these definitions weak in libc, so POSIX programs can redefine
6305 these names if they don't use our functions, and still use
6306 regcomp/regexec below without link errors. */
6307weak_function
0b32bf0e 6308# endif
fa9a63c5
RM
6309re_comp (s)
6310 const char *s;
6311{
6312 reg_errcode_t ret;
5e69f11e 6313
fa9a63c5
RM
6314 if (!s)
6315 {
6316 if (!re_comp_buf.buffer)
0b32bf0e 6317 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6318 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6319 return 0;
6320 }
6321
6322 if (!re_comp_buf.buffer)
6323 {
6324 re_comp_buf.buffer = (unsigned char *) malloc (200);
6325 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6326 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6327 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6328 re_comp_buf.allocated = 200;
6329
6330 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6331 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6332 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6333 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6334 }
6335
6336 /* Since `re_exec' always passes NULL for the `regs' argument, we
6337 don't need to initialize the pattern buffer fields which affect it. */
6338
fa9a63c5 6339 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6340
fa9a63c5
RM
6341 if (!ret)
6342 return NULL;
6343
6344 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6345 return (char *) gettext (re_error_msgid[(int) ret]);
6346}
6347
6348
6349int
0b32bf0e 6350# ifdef _LIBC
48afdd44 6351weak_function
0b32bf0e 6352# endif
fa9a63c5
RM
6353re_exec (s)
6354 const char *s;
6355{
6356 const int len = strlen (s);
6357 return
6358 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6359}
6360#endif /* _REGEX_RE_COMP */
6361\f
6362/* POSIX.2 functions. Don't define these for Emacs. */
6363
6364#ifndef emacs
6365
6366/* regcomp takes a regular expression as a string and compiles it.
6367
b18215fc 6368 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6369 since POSIX says we shouldn't. Thus, we set
6370
6371 `buffer' to the compiled pattern;
6372 `used' to the length of the compiled pattern;
6373 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6374 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6375 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6376 `fastmap' to an allocated space for the fastmap;
6377 `fastmap_accurate' to zero;
fa9a63c5
RM
6378 `re_nsub' to the number of subexpressions in PATTERN.
6379
6380 PATTERN is the address of the pattern string.
6381
6382 CFLAGS is a series of bits which affect compilation.
6383
6384 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6385 use POSIX basic syntax.
6386
6387 If REG_NEWLINE is set, then . and [^...] don't match newline.
6388 Also, regexec will try a match beginning after every newline.
6389
6390 If REG_ICASE is set, then we considers upper- and lowercase
6391 versions of letters to be equivalent when matching.
6392
6393 If REG_NOSUB is set, then when PREG is passed to regexec, that
6394 routine will report only success or failure, and nothing about the
6395 registers.
6396
b18215fc 6397 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6398 the return codes and their meanings.) */
6399
6400int
6401regcomp (preg, pattern, cflags)
ada30c0e
SM
6402 regex_t *__restrict preg;
6403 const char *__restrict pattern;
fa9a63c5
RM
6404 int cflags;
6405{
6406 reg_errcode_t ret;
4bb91c68 6407 reg_syntax_t syntax
fa9a63c5
RM
6408 = (cflags & REG_EXTENDED) ?
6409 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6410
6411 /* regex_compile will allocate the space for the compiled pattern. */
6412 preg->buffer = 0;
6413 preg->allocated = 0;
6414 preg->used = 0;
5e69f11e 6415
c0f9ea08 6416 /* Try to allocate space for the fastmap. */
a073faa6 6417 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6418
fa9a63c5
RM
6419 if (cflags & REG_ICASE)
6420 {
6421 unsigned i;
5e69f11e 6422
6676cb1c 6423 preg->translate
a073faa6 6424 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
8ea094cf 6425 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6426 if (preg->translate == NULL)
0b32bf0e 6427 return (int) REG_ESPACE;
fa9a63c5
RM
6428
6429 /* Map uppercase characters to corresponding lowercase ones. */
6430 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6431 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6432 }
6433 else
6434 preg->translate = NULL;
6435
6436 /* If REG_NEWLINE is set, newlines are treated differently. */
6437 if (cflags & REG_NEWLINE)
6438 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6439 syntax &= ~RE_DOT_NEWLINE;
6440 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6441 }
6442 else
c0f9ea08 6443 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6444
6445 preg->no_sub = !!(cflags & REG_NOSUB);
6446
5e69f11e 6447 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6448 can use strlen here in compiling the pattern. */
4bb91c68 6449 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6450
fa9a63c5
RM
6451 /* POSIX doesn't distinguish between an unmatched open-group and an
6452 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6453 if (ret == REG_ERPAREN)
6454 ret = REG_EPAREN;
6455
6456 if (ret == REG_NOERROR && preg->fastmap)
6457 { /* Compute the fastmap now, since regexec cannot modify the pattern
6458 buffer. */
6459 re_compile_fastmap (preg);
6460 if (preg->can_be_null)
6461 { /* The fastmap can't be used anyway. */
6462 free (preg->fastmap);
6463 preg->fastmap = NULL;
6464 }
6465 }
fa9a63c5
RM
6466 return (int) ret;
6467}
c0f9ea08 6468WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6469
6470
6471/* regexec searches for a given pattern, specified by PREG, in the
6472 string STRING.
5e69f11e 6473
fa9a63c5 6474 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6475 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6476 least NMATCH elements, and we set them to the offsets of the
6477 corresponding matched substrings.
5e69f11e 6478
fa9a63c5
RM
6479 EFLAGS specifies `execution flags' which affect matching: if
6480 REG_NOTBOL is set, then ^ does not match at the beginning of the
6481 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6482
fa9a63c5
RM
6483 We return 0 if we find a match and REG_NOMATCH if not. */
6484
6485int
6486regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6487 const regex_t *__restrict preg;
6488 const char *__restrict string;
5e69f11e 6489 size_t nmatch;
9f2dbe01 6490 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6491 int eflags;
6492{
6493 int ret;
6494 struct re_registers regs;
6495 regex_t private_preg;
6496 int len = strlen (string);
c0f9ea08 6497 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6498
6499 private_preg = *preg;
5e69f11e 6500
fa9a63c5
RM
6501 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6502 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6503
fa9a63c5
RM
6504 /* The user has told us exactly how many registers to return
6505 information about, via `nmatch'. We have to pass that on to the
b18215fc 6506 matching routines. */
fa9a63c5 6507 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6508
fa9a63c5
RM
6509 if (want_reg_info)
6510 {
6511 regs.num_regs = nmatch;
4bb91c68
SM
6512 regs.start = TALLOC (nmatch * 2, regoff_t);
6513 if (regs.start == NULL)
0b32bf0e 6514 return (int) REG_NOMATCH;
4bb91c68 6515 regs.end = regs.start + nmatch;
fa9a63c5
RM
6516 }
6517
c0f9ea08
SM
6518 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6519 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6520 was a little bit longer but still only matching the real part.
6521 This works because the `endline' will check for a '\n' and will find a
6522 '\0', correctly deciding that this is not the end of a line.
6523 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6524 a convenient '\0' there. For all we know, the string could be preceded
6525 by '\n' which would throw things off. */
6526
fa9a63c5
RM
6527 /* Perform the searching operation. */
6528 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6529 /* start: */ 0, /* range: */ len,
6530 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6531
fa9a63c5
RM
6532 /* Copy the register information to the POSIX structure. */
6533 if (want_reg_info)
6534 {
6535 if (ret >= 0)
0b32bf0e
SM
6536 {
6537 unsigned r;
fa9a63c5 6538
0b32bf0e
SM
6539 for (r = 0; r < nmatch; r++)
6540 {
6541 pmatch[r].rm_so = regs.start[r];
6542 pmatch[r].rm_eo = regs.end[r];
6543 }
6544 }
fa9a63c5 6545
b18215fc 6546 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6547 free (regs.start);
fa9a63c5
RM
6548 }
6549
6550 /* We want zero return to mean success, unlike `re_search'. */
6551 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6552}
c0f9ea08 6553WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6554
6555
ec869672
JR
6556/* Returns a message corresponding to an error code, ERR_CODE, returned
6557 from either regcomp or regexec. We don't use PREG here.
6558
6559 ERR_CODE was previously called ERRCODE, but that name causes an
6560 error with msvc8 compiler. */
fa9a63c5
RM
6561
6562size_t
ec869672
JR
6563regerror (err_code, preg, errbuf, errbuf_size)
6564 int err_code;
fa9a63c5
RM
6565 const regex_t *preg;
6566 char *errbuf;
6567 size_t errbuf_size;
6568{
6569 const char *msg;
6570 size_t msg_size;
6571
ec869672
JR
6572 if (err_code < 0
6573 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6574 /* Only error codes returned by the rest of the code should be passed
b18215fc 6575 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6576 code generates an invalid error code, then the program has a bug.
6577 Dump core so we can fix it. */
6578 abort ();
6579
ec869672 6580 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6581
6582 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6583
fa9a63c5
RM
6584 if (errbuf_size != 0)
6585 {
6586 if (msg_size > errbuf_size)
0b32bf0e
SM
6587 {
6588 strncpy (errbuf, msg, errbuf_size - 1);
6589 errbuf[errbuf_size - 1] = 0;
6590 }
fa9a63c5 6591 else
0b32bf0e 6592 strcpy (errbuf, msg);
fa9a63c5
RM
6593 }
6594
6595 return msg_size;
6596}
c0f9ea08 6597WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6598
6599
6600/* Free dynamically allocated space used by PREG. */
6601
6602void
6603regfree (preg)
6604 regex_t *preg;
6605{
6606 if (preg->buffer != NULL)
6607 free (preg->buffer);
6608 preg->buffer = NULL;
5e69f11e 6609
fa9a63c5
RM
6610 preg->allocated = 0;
6611 preg->used = 0;
6612
6613 if (preg->fastmap != NULL)
6614 free (preg->fastmap);
6615 preg->fastmap = NULL;
6616 preg->fastmap_accurate = 0;
6617
6618 if (preg->translate != NULL)
6619 free (preg->translate);
6620 preg->translate = NULL;
6621}
c0f9ea08 6622WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6623
6624#endif /* not emacs */
ab5796a9
MB
6625
6626/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6627 (do not change this comment) */