Add descriptions about configuration and invocation.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
1cee1e27 6 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
bc78d348 7
fa9a63c5
RM
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
4fc5845f 20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
25fe55af 21 USA. */
fa9a63c5 22
6df42991 23/* TODO:
505bde11 24 - structure the opcode space into opcode+flag.
dc1e502d 25 - merge with glibc's regex.[ch].
01618498 26 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
27 need to modify the compiled regexp so that re_match can be reentrant.
28 - get rid of on_failure_jump_smart by doing the optimization in re_comp
29 rather than at run-time, so that re_match can be reentrant.
01618498 30*/
505bde11 31
fa9a63c5 32/* AIX requires this to be the first thing in the file. */
0b32bf0e 33#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
34 #pragma alloca
35#endif
36
fa9a63c5 37#ifdef HAVE_CONFIG_H
0b32bf0e 38# include <config.h>
fa9a63c5
RM
39#endif
40
4bb91c68
SM
41#if defined STDC_HEADERS && !defined emacs
42# include <stddef.h>
43#else
44/* We need this for `regex.h', and perhaps for the Emacs include files. */
45# include <sys/types.h>
46#endif
fa9a63c5 47
14473664
SM
48/* Whether to use ISO C Amendment 1 wide char functions.
49 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
50#if defined _LIBC
51#define WIDE_CHAR_SUPPORT 1
52#else
14473664 53#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
54 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
55#endif
14473664
SM
56
57/* For platform which support the ISO C amendement 1 functionality we
58 support user defined character classes. */
a0ad02f7 59#if WIDE_CHAR_SUPPORT
14473664
SM
60/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
61# include <wchar.h>
62# include <wctype.h>
63#endif
64
c0f9ea08
SM
65#ifdef _LIBC
66/* We have to keep the namespace clean. */
67# define regfree(preg) __regfree (preg)
68# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
69# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
70# define regerror(errcode, preg, errbuf, errbuf_size) \
71 __regerror(errcode, preg, errbuf, errbuf_size)
72# define re_set_registers(bu, re, nu, st, en) \
73 __re_set_registers (bu, re, nu, st, en)
74# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
75 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
76# define re_match(bufp, string, size, pos, regs) \
77 __re_match (bufp, string, size, pos, regs)
78# define re_search(bufp, string, size, startpos, range, regs) \
79 __re_search (bufp, string, size, startpos, range, regs)
80# define re_compile_pattern(pattern, length, bufp) \
81 __re_compile_pattern (pattern, length, bufp)
82# define re_set_syntax(syntax) __re_set_syntax (syntax)
83# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
84 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
85# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
86
14473664
SM
87/* Make sure we call libc's function even if the user overrides them. */
88# define btowc __btowc
89# define iswctype __iswctype
90# define wctype __wctype
91
c0f9ea08
SM
92# define WEAK_ALIAS(a,b) weak_alias (a, b)
93
94/* We are also using some library internals. */
95# include <locale/localeinfo.h>
96# include <locale/elem-hash.h>
97# include <langinfo.h>
98#else
99# define WEAK_ALIAS(a,b)
100#endif
101
4bb91c68 102/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 103#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
104# include <libintl.h>
105#else
106# define gettext(msgid) (msgid)
107#endif
108
5e69f11e
RM
109#ifndef gettext_noop
110/* This define is so xgettext can find the internationalizable
111 strings. */
0b32bf0e 112# define gettext_noop(String) String
5e69f11e
RM
113#endif
114
fa9a63c5
RM
115/* The `emacs' switch turns on certain matching commands
116 that make sense only in Emacs. */
117#ifdef emacs
118
0b32bf0e
SM
119# include "lisp.h"
120# include "buffer.h"
b18215fc
RS
121
122/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 123# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 124
0b32bf0e
SM
125# include "syntax.h"
126# include "charset.h"
127# include "category.h"
fa9a63c5 128
7689ef0b
EZ
129# ifdef malloc
130# undef malloc
131# endif
0b32bf0e 132# define malloc xmalloc
7689ef0b
EZ
133# ifdef realloc
134# undef realloc
135# endif
0b32bf0e 136# define realloc xrealloc
7689ef0b
EZ
137# ifdef free
138# undef free
139# endif
0b32bf0e 140# define free xfree
9abbd165 141
0b32bf0e
SM
142/* Converts the pointer to the char to BEG-based offset from the start. */
143# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
144# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
145
146# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
147# define RE_STRING_CHAR(p, s) \
4e8a9132 148 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
0b32bf0e 149# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
2d1675e4
SM
150 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
151
152/* Set C a (possibly multibyte) character before P. P points into a
153 string which is the virtual concatenation of STR1 (which ends at
154 END1) or STR2 (which ends at END2). */
0b32bf0e 155# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
2d1675e4
SM
156 do { \
157 if (multibyte) \
158 { \
0b32bf0e
SM
159 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
160 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
70806df6
KH
161 re_char *d0 = dtemp; \
162 PREV_CHAR_BOUNDARY (d0, dlimit); \
163 c = STRING_CHAR (d0, dtemp - d0); \
2d1675e4
SM
164 } \
165 else \
166 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
167 } while (0)
168
4e8a9132 169
fa9a63c5
RM
170#else /* not emacs */
171
172/* If we are not linking with Emacs proper,
173 we can't use the relocating allocator
174 even if config.h says that we can. */
0b32bf0e 175# undef REL_ALLOC
fa9a63c5 176
0b32bf0e
SM
177# if defined STDC_HEADERS || defined _LIBC
178# include <stdlib.h>
179# else
fa9a63c5
RM
180char *malloc ();
181char *realloc ();
0b32bf0e 182# endif
fa9a63c5 183
a77f947b
CY
184/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
185
186void *
187xmalloc (size)
188 size_t size;
189{
190 register void *val;
191 val = (void *) malloc (size);
192 if (!val && size)
193 {
194 write (2, "virtual memory exhausted\n", 25);
195 exit (1);
196 }
197 return val;
198}
199
200void *
201xrealloc (block, size)
202 void *block;
203 size_t size;
204{
205 register void *val;
206 /* We must call malloc explicitly when BLOCK is 0, since some
207 reallocs don't do this. */
208 if (! block)
209 val = (void *) malloc (size);
210 else
211 val = (void *) realloc (block, size);
212 if (!val && size)
213 {
214 write (2, "virtual memory exhausted\n", 25);
215 exit (1);
216 }
217 return val;
218}
219
a073faa6
CY
220# ifdef malloc
221# undef malloc
222# endif
223# define malloc xmalloc
224# ifdef realloc
225# undef realloc
226# endif
227# define realloc xrealloc
228
9e4ecb26 229/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 230 If nothing else has been done, use the method below. */
0b32bf0e
SM
231# ifdef INHIBIT_STRING_HEADER
232# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
233# if !defined bzero && !defined bcopy
234# undef INHIBIT_STRING_HEADER
235# endif
236# endif
237# endif
9e4ecb26 238
4bb91c68 239/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
240 This is used in most programs--a few other programs avoid this
241 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
242# ifndef INHIBIT_STRING_HEADER
243# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
244# include <string.h>
0b32bf0e 245# ifndef bzero
4bb91c68
SM
246# ifndef _LIBC
247# define bzero(s, n) (memset (s, '\0', n), (s))
248# else
249# define bzero(s, n) __bzero (s, n)
250# endif
0b32bf0e
SM
251# endif
252# else
253# include <strings.h>
4bb91c68
SM
254# ifndef memcmp
255# define memcmp(s1, s2, n) bcmp (s1, s2, n)
256# endif
257# ifndef memcpy
258# define memcpy(d, s, n) (bcopy (s, d, n), (d))
259# endif
0b32bf0e
SM
260# endif
261# endif
fa9a63c5
RM
262
263/* Define the syntax stuff for \<, \>, etc. */
264
990b2375 265/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 266enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 267
0b32bf0e
SM
268# ifdef SWITCH_ENUM_BUG
269# define SWITCH_ENUM_CAST(x) ((int)(x))
270# else
271# define SWITCH_ENUM_CAST(x) (x)
272# endif
fa9a63c5 273
e934739e 274/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
275# define BASE_LEADING_CODE_P(c) (0)
276# define CHAR_CHARSET(c) 0
277# define CHARSET_LEADING_CODE_BASE(c) 0
278# define MAX_MULTIBYTE_LENGTH 1
279# define RE_MULTIBYTE_P(x) 0
280# define WORD_BOUNDARY_P(c1, c2) (0)
281# define CHAR_HEAD_P(p) (1)
282# define SINGLE_BYTE_CHAR_P(c) (1)
283# define SAME_CHARSET_P(c1, c2) (1)
284# define MULTIBYTE_FORM_LENGTH(p, s) (1)
70806df6 285# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
0b32bf0e
SM
286# define STRING_CHAR(p, s) (*(p))
287# define RE_STRING_CHAR STRING_CHAR
288# define CHAR_STRING(c, s) (*(s) = (c), 1)
289# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
290# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
291# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 292 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
0b32bf0e 293# define MAKE_CHAR(charset, c1, c2) (c1)
fa9a63c5 294#endif /* not emacs */
4e8a9132
SM
295
296#ifndef RE_TRANSLATE
0b32bf0e
SM
297# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
298# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 299#endif
fa9a63c5
RM
300\f
301/* Get the interface, including the syntax bits. */
302#include "regex.h"
303
f71b19b6
DL
304/* isalpha etc. are used for the character classes. */
305#include <ctype.h>
fa9a63c5 306
f71b19b6 307#ifdef emacs
fa9a63c5 308
f71b19b6 309/* 1 if C is an ASCII character. */
0b32bf0e 310# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 311
f71b19b6 312/* 1 if C is a unibyte character. */
0b32bf0e 313# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 314
f71b19b6 315/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 316
f71b19b6 317/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
318# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
319# define ISCNTRL(c) ((c) < ' ')
320# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
321 || ((c) >= 'a' && (c) <= 'f') \
322 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
323
324/* This is only used for single-byte characters. */
0b32bf0e 325# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
326
327/* The rest must handle multibyte characters. */
328
0b32bf0e 329# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 330 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
331 : 1)
332
14473664 333# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 334 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
335 : 1)
336
0b32bf0e 337# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
338 ? (((c) >= 'a' && (c) <= 'z') \
339 || ((c) >= 'A' && (c) <= 'Z') \
340 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
341 : SYNTAX (c) == Sword)
342
0b32bf0e 343# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
344 ? (((c) >= 'a' && (c) <= 'z') \
345 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
346 : SYNTAX (c) == Sword)
347
0b32bf0e 348# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 349
0b32bf0e 350# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
351 ? ((c) > ' ' && (c) < 0177 \
352 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
353 || ((c) >= 'A' && (c) <= 'Z') \
354 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
355 : SYNTAX (c) != Sword)
356
0b32bf0e 357# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 358
0b32bf0e 359# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 360
0b32bf0e 361# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
362
363#else /* not emacs */
364
f71b19b6
DL
365/* Jim Meyering writes:
366
367 "... Some ctype macros are valid only for character codes that
368 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
369 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 370 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
371 STDC_HEADERS is defined, then autoconf has verified that the ctype
372 macros don't need to be guarded with references to isascii. ...
373 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
374 eliminate the && through constant folding."
375 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 376
4bb91c68 377# undef ISASCII
0b32bf0e
SM
378# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
379# define ISASCII(c) 1
380# else
381# define ISASCII(c) isascii(c)
382# endif
f71b19b6
DL
383
384/* 1 if C is an ASCII character. */
0b32bf0e 385# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
386
387/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
388# define ISUNIBYTE(c) 1
389
390# ifdef isblank
391# define ISBLANK(c) (ISASCII (c) && isblank (c))
392# else
393# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
394# endif
395# ifdef isgraph
396# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
397# else
398# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
399# endif
400
4bb91c68 401# undef ISPRINT
0b32bf0e
SM
402# define ISPRINT(c) (ISASCII (c) && isprint (c))
403# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
404# define ISALNUM(c) (ISASCII (c) && isalnum (c))
405# define ISALPHA(c) (ISASCII (c) && isalpha (c))
406# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
407# define ISLOWER(c) (ISASCII (c) && islower (c))
408# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
409# define ISSPACE(c) (ISASCII (c) && isspace (c))
410# define ISUPPER(c) (ISASCII (c) && isupper (c))
411# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
412
413# define ISWORD(c) ISALPHA(c)
414
4bb91c68
SM
415# ifdef _tolower
416# define TOLOWER(c) _tolower(c)
417# else
418# define TOLOWER(c) tolower(c)
419# endif
420
421/* How many characters in the character set. */
422# define CHAR_SET_SIZE 256
423
0b32bf0e 424# ifdef SYNTAX_TABLE
f71b19b6 425
0b32bf0e 426extern char *re_syntax_table;
f71b19b6 427
0b32bf0e
SM
428# else /* not SYNTAX_TABLE */
429
0b32bf0e
SM
430static char re_syntax_table[CHAR_SET_SIZE];
431
432static void
433init_syntax_once ()
434{
435 register int c;
436 static int done = 0;
437
438 if (done)
439 return;
440
441 bzero (re_syntax_table, sizeof re_syntax_table);
442
4bb91c68
SM
443 for (c = 0; c < CHAR_SET_SIZE; ++c)
444 if (ISALNUM (c))
445 re_syntax_table[c] = Sword;
fa9a63c5 446
669fa600 447 re_syntax_table['_'] = Ssymbol;
fa9a63c5 448
0b32bf0e
SM
449 done = 1;
450}
451
452# endif /* not SYNTAX_TABLE */
96cc36cc 453
4bb91c68
SM
454# define SYNTAX(c) re_syntax_table[(c)]
455
96cc36cc
RS
456#endif /* not emacs */
457\f
fa9a63c5 458#ifndef NULL
0b32bf0e 459# define NULL (void *)0
fa9a63c5
RM
460#endif
461
462/* We remove any previous definition of `SIGN_EXTEND_CHAR',
463 since ours (we hope) works properly with all combinations of
464 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 465 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
466#undef SIGN_EXTEND_CHAR
467#if __STDC__
0b32bf0e 468# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
469#else /* not __STDC__ */
470/* As in Harbison and Steele. */
0b32bf0e 471# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
472#endif
473\f
474/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
475 use `alloca' instead of `malloc'. This is because using malloc in
476 re_search* or re_match* could cause memory leaks when C-g is used in
477 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
478 the other hand, malloc is more portable, and easier to debug.
479
fa9a63c5
RM
480 Because we sometimes use alloca, some routines have to be macros,
481 not functions -- `alloca'-allocated space disappears at the end of the
482 function it is called in. */
483
484#ifdef REGEX_MALLOC
485
0b32bf0e
SM
486# define REGEX_ALLOCATE malloc
487# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
488# define REGEX_FREE free
fa9a63c5
RM
489
490#else /* not REGEX_MALLOC */
491
492/* Emacs already defines alloca, sometimes. */
0b32bf0e 493# ifndef alloca
fa9a63c5
RM
494
495/* Make alloca work the best possible way. */
0b32bf0e
SM
496# ifdef __GNUC__
497# define alloca __builtin_alloca
498# else /* not __GNUC__ */
499# if HAVE_ALLOCA_H
500# include <alloca.h>
501# endif /* HAVE_ALLOCA_H */
502# endif /* not __GNUC__ */
fa9a63c5 503
0b32bf0e 504# endif /* not alloca */
fa9a63c5 505
0b32bf0e 506# define REGEX_ALLOCATE alloca
fa9a63c5
RM
507
508/* Assumes a `char *destination' variable. */
0b32bf0e 509# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 510 (destination = (char *) alloca (nsize), \
4bb91c68 511 memcpy (destination, source, osize))
fa9a63c5
RM
512
513/* No need to do anything to free, after alloca. */
0b32bf0e 514# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
515
516#endif /* not REGEX_MALLOC */
517
518/* Define how to allocate the failure stack. */
519
0b32bf0e 520#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 521
0b32bf0e 522# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 523 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 524# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 525 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 526# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
527 r_alloc_free (&failure_stack_ptr)
528
4297555e 529#else /* not using relocating allocator */
fa9a63c5 530
0b32bf0e 531# ifdef REGEX_MALLOC
fa9a63c5 532
0b32bf0e
SM
533# define REGEX_ALLOCATE_STACK malloc
534# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
535# define REGEX_FREE_STACK free
fa9a63c5 536
0b32bf0e 537# else /* not REGEX_MALLOC */
fa9a63c5 538
0b32bf0e 539# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 540
0b32bf0e 541# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 542 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 543/* No need to explicitly free anything. */
0b32bf0e 544# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 545
0b32bf0e 546# endif /* not REGEX_MALLOC */
4297555e 547#endif /* not using relocating allocator */
fa9a63c5
RM
548
549
550/* True if `size1' is non-NULL and PTR is pointing anywhere inside
551 `string1' or just past its end. This works if PTR is NULL, which is
552 a good thing. */
25fe55af 553#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
554 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
555
556/* (Re)Allocate N items of type T using malloc, or fail. */
557#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
558#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
559#define RETALLOC_IF(addr, n, t) \
560 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
561#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
562
4bb91c68 563#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
564
565#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
566
567#undef MAX
568#undef MIN
569#define MAX(a, b) ((a) > (b) ? (a) : (b))
570#define MIN(a, b) ((a) < (b) ? (a) : (b))
571
6470ea05
SM
572/* Type of source-pattern and string chars. */
573typedef const unsigned char re_char;
574
fa9a63c5
RM
575typedef char boolean;
576#define false 0
577#define true 1
578
4bb91c68
SM
579static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
580 re_char *string1, int size1,
581 re_char *string2, int size2,
582 int pos,
583 struct re_registers *regs,
584 int stop));
fa9a63c5
RM
585\f
586/* These are the command codes that appear in compiled regular
4bb91c68 587 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
588 command code can specify any interpretation whatsoever for its
589 arguments. Zero bytes may appear in the compiled regular expression. */
590
591typedef enum
592{
593 no_op = 0,
594
4bb91c68 595 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
596 succeed,
597
25fe55af 598 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
599 exactn,
600
25fe55af 601 /* Matches any (more or less) character. */
fa9a63c5
RM
602 anychar,
603
25fe55af
RS
604 /* Matches any one char belonging to specified set. First
605 following byte is number of bitmap bytes. Then come bytes
606 for a bitmap saying which chars are in. Bits in each byte
607 are ordered low-bit-first. A character is in the set if its
608 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
609 automatically not in the set.
610
611 If the length byte has the 0x80 bit set, then that stuff
612 is followed by a range table:
613 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 614 See RANGE_TABLE_WORK_BITS below.
01618498 615 2 bytes, the number of pairs that follow (upto 32767)
96cc36cc 616 pairs, each 2 multibyte characters,
0b32bf0e 617 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
618 charset,
619
25fe55af 620 /* Same parameters as charset, but match any character that is
4bb91c68 621 not one of those specified. */
fa9a63c5
RM
622 charset_not,
623
25fe55af
RS
624 /* Start remembering the text that is matched, for storing in a
625 register. Followed by one byte with the register number, in
626 the range 0 to one less than the pattern buffer's re_nsub
505bde11 627 field. */
fa9a63c5
RM
628 start_memory,
629
25fe55af
RS
630 /* Stop remembering the text that is matched and store it in a
631 memory register. Followed by one byte with the register
632 number, in the range 0 to one less than `re_nsub' in the
505bde11 633 pattern buffer. */
fa9a63c5
RM
634 stop_memory,
635
25fe55af 636 /* Match a duplicate of something remembered. Followed by one
4bb91c68 637 byte containing the register number. */
fa9a63c5
RM
638 duplicate,
639
25fe55af 640 /* Fail unless at beginning of line. */
fa9a63c5
RM
641 begline,
642
4bb91c68 643 /* Fail unless at end of line. */
fa9a63c5
RM
644 endline,
645
25fe55af
RS
646 /* Succeeds if at beginning of buffer (if emacs) or at beginning
647 of string to be matched (if not). */
fa9a63c5
RM
648 begbuf,
649
25fe55af 650 /* Analogously, for end of buffer/string. */
fa9a63c5 651 endbuf,
5e69f11e 652
25fe55af 653 /* Followed by two byte relative address to which to jump. */
5e69f11e 654 jump,
fa9a63c5 655
25fe55af
RS
656 /* Followed by two-byte relative address of place to resume at
657 in case of failure. */
fa9a63c5 658 on_failure_jump,
5e69f11e 659
25fe55af
RS
660 /* Like on_failure_jump, but pushes a placeholder instead of the
661 current string position when executed. */
fa9a63c5 662 on_failure_keep_string_jump,
5e69f11e 663
505bde11
SM
664 /* Just like `on_failure_jump', except that it checks that we
665 don't get stuck in an infinite loop (matching an empty string
666 indefinitely). */
667 on_failure_jump_loop,
668
0683b6fa
SM
669 /* Just like `on_failure_jump_loop', except that it checks for
670 a different kind of loop (the kind that shows up with non-greedy
671 operators). This operation has to be immediately preceded
672 by a `no_op'. */
673 on_failure_jump_nastyloop,
674
0b32bf0e 675 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
676 It analyses the loop before which it is put and if the
677 loop does not require backtracking, it changes itself to
4e8a9132
SM
678 `on_failure_keep_string_jump' and short-circuits the loop,
679 else it just defaults to changing itself into `on_failure_jump'.
680 It assumes that it is pointing to just past a `jump'. */
505bde11 681 on_failure_jump_smart,
fa9a63c5 682
25fe55af 683 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
684 After matching N times, jump to the address upon failure.
685 Does not work if N starts at 0: use on_failure_jump_loop
686 instead. */
fa9a63c5
RM
687 succeed_n,
688
25fe55af
RS
689 /* Followed by two-byte relative address, and two-byte number n.
690 Jump to the address N times, then fail. */
fa9a63c5
RM
691 jump_n,
692
25fe55af
RS
693 /* Set the following two-byte relative address to the
694 subsequent two-byte number. The address *includes* the two
695 bytes of number. */
fa9a63c5
RM
696 set_number_at,
697
fa9a63c5
RM
698 wordbeg, /* Succeeds if at word beginning. */
699 wordend, /* Succeeds if at word end. */
700
701 wordbound, /* Succeeds if at a word boundary. */
1fb352e0 702 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5 703
669fa600
SM
704 symbeg, /* Succeeds if at symbol beginning. */
705 symend, /* Succeeds if at symbol end. */
706
fa9a63c5 707 /* Matches any character whose syntax is specified. Followed by
25fe55af 708 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
709 syntaxspec,
710
711 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
712 notsyntaxspec
713
714#ifdef emacs
715 ,before_dot, /* Succeeds if before point. */
716 at_dot, /* Succeeds if at point. */
717 after_dot, /* Succeeds if after point. */
b18215fc
RS
718
719 /* Matches any character whose category-set contains the specified
25fe55af
RS
720 category. The operator is followed by a byte which contains a
721 category code (mnemonic ASCII character). */
b18215fc
RS
722 categoryspec,
723
724 /* Matches any character whose category-set does not contain the
725 specified category. The operator is followed by a byte which
726 contains the category code (mnemonic ASCII character). */
727 notcategoryspec
fa9a63c5
RM
728#endif /* emacs */
729} re_opcode_t;
730\f
731/* Common operations on the compiled pattern. */
732
733/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
734
735#define STORE_NUMBER(destination, number) \
736 do { \
737 (destination)[0] = (number) & 0377; \
738 (destination)[1] = (number) >> 8; \
739 } while (0)
740
741/* Same as STORE_NUMBER, except increment DESTINATION to
742 the byte after where the number is stored. Therefore, DESTINATION
743 must be an lvalue. */
744
745#define STORE_NUMBER_AND_INCR(destination, number) \
746 do { \
747 STORE_NUMBER (destination, number); \
748 (destination) += 2; \
749 } while (0)
750
751/* Put into DESTINATION a number stored in two contiguous bytes starting
752 at SOURCE. */
753
754#define EXTRACT_NUMBER(destination, source) \
755 do { \
756 (destination) = *(source) & 0377; \
757 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
758 } while (0)
759
760#ifdef DEBUG
4bb91c68 761static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
762static void
763extract_number (dest, source)
764 int *dest;
01618498 765 re_char *source;
fa9a63c5 766{
5e69f11e 767 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
768 *dest = *source & 0377;
769 *dest += temp << 8;
770}
771
4bb91c68 772# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
773# undef EXTRACT_NUMBER
774# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
775# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
776
777#endif /* DEBUG */
778
779/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
780 SOURCE must be an lvalue. */
781
782#define EXTRACT_NUMBER_AND_INCR(destination, source) \
783 do { \
784 EXTRACT_NUMBER (destination, source); \
25fe55af 785 (source) += 2; \
fa9a63c5
RM
786 } while (0)
787
788#ifdef DEBUG
4bb91c68
SM
789static void extract_number_and_incr _RE_ARGS ((int *destination,
790 re_char **source));
fa9a63c5
RM
791static void
792extract_number_and_incr (destination, source)
793 int *destination;
01618498 794 re_char **source;
5e69f11e 795{
fa9a63c5
RM
796 extract_number (destination, *source);
797 *source += 2;
798}
799
0b32bf0e
SM
800# ifndef EXTRACT_MACROS
801# undef EXTRACT_NUMBER_AND_INCR
802# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 803 extract_number_and_incr (&dest, &src)
0b32bf0e 804# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
805
806#endif /* DEBUG */
807\f
b18215fc
RS
808/* Store a multibyte character in three contiguous bytes starting
809 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 810 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
811
812#define STORE_CHARACTER_AND_INCR(destination, character) \
813 do { \
814 (destination)[0] = (character) & 0377; \
815 (destination)[1] = ((character) >> 8) & 0377; \
816 (destination)[2] = (character) >> 16; \
817 (destination) += 3; \
818 } while (0)
819
820/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 821 starting at SOURCE. */
b18215fc
RS
822
823#define EXTRACT_CHARACTER(destination, source) \
824 do { \
825 (destination) = ((source)[0] \
826 | ((source)[1] << 8) \
827 | ((source)[2] << 16)); \
828 } while (0)
829
830
831/* Macros for charset. */
832
833/* Size of bitmap of charset P in bytes. P is a start of charset,
834 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
835#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
836
837/* Nonzero if charset P has range table. */
25fe55af 838#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
839
840/* Return the address of range table of charset P. But not the start
841 of table itself, but the before where the number of ranges is
96cc36cc
RS
842 stored. `2 +' means to skip re_opcode_t and size of bitmap,
843 and the 2 bytes of flags at the start of the range table. */
844#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
845
846/* Extract the bit flags that start a range table. */
847#define CHARSET_RANGE_TABLE_BITS(p) \
848 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
849 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
850
851/* Test if C is listed in the bitmap of charset P. */
852#define CHARSET_LOOKUP_BITMAP(p, c) \
853 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
854 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
855
856/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
857 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
858 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
859 and end. */
860#define CHARSET_RANGE_TABLE_END(range_table, count) \
861 ((range_table) + (count) * 2 * 3)
862
25fe55af 863/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
864 COUNT is number of ranges in RANGE_TABLE. */
865#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
866 do \
867 { \
01618498
SM
868 re_wchar_t range_start, range_end; \
869 re_char *p; \
870 re_char *range_table_end \
b18215fc
RS
871 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
872 \
873 for (p = (range_table); p < range_table_end; p += 2 * 3) \
874 { \
875 EXTRACT_CHARACTER (range_start, p); \
876 EXTRACT_CHARACTER (range_end, p + 3); \
877 \
878 if (range_start <= (c) && (c) <= range_end) \
879 { \
880 (not) = !(not); \
881 break; \
882 } \
883 } \
884 } \
885 while (0)
886
887/* Test if C is in range table of CHARSET. The flag NOT is negated if
888 C is listed in it. */
889#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
890 do \
891 { \
892 /* Number of ranges in range table. */ \
893 int count; \
01618498
SM
894 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
895 \
b18215fc
RS
896 EXTRACT_NUMBER_AND_INCR (count, range_table); \
897 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
898 } \
899 while (0)
900\f
fa9a63c5
RM
901/* If DEBUG is defined, Regex prints many voluminous messages about what
902 it is doing (if the variable `debug' is nonzero). If linked with the
903 main program in `iregex.c', you can enter patterns and strings
904 interactively. And if linked with the main program in `main.c' and
4bb91c68 905 the other test files, you can run the already-written tests. */
fa9a63c5
RM
906
907#ifdef DEBUG
908
909/* We use standard I/O for debugging. */
0b32bf0e 910# include <stdio.h>
fa9a63c5
RM
911
912/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 913# include <assert.h>
fa9a63c5 914
99633e97 915static int debug = -100000;
fa9a63c5 916
0b32bf0e
SM
917# define DEBUG_STATEMENT(e) e
918# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
919# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
920# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
921# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
922# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 923 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 924# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 925 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
926
927
928/* Print the fastmap in human-readable form. */
929
930void
931print_fastmap (fastmap)
932 char *fastmap;
933{
934 unsigned was_a_range = 0;
5e69f11e
RM
935 unsigned i = 0;
936
fa9a63c5
RM
937 while (i < (1 << BYTEWIDTH))
938 {
939 if (fastmap[i++])
940 {
941 was_a_range = 0;
25fe55af
RS
942 putchar (i - 1);
943 while (i < (1 << BYTEWIDTH) && fastmap[i])
944 {
945 was_a_range = 1;
946 i++;
947 }
fa9a63c5 948 if (was_a_range)
25fe55af
RS
949 {
950 printf ("-");
951 putchar (i - 1);
952 }
953 }
fa9a63c5 954 }
5e69f11e 955 putchar ('\n');
fa9a63c5
RM
956}
957
958
959/* Print a compiled pattern string in human-readable form, starting at
960 the START pointer into it and ending just before the pointer END. */
961
962void
963print_partial_compiled_pattern (start, end)
01618498
SM
964 re_char *start;
965 re_char *end;
fa9a63c5
RM
966{
967 int mcnt, mcnt2;
01618498
SM
968 re_char *p = start;
969 re_char *pend = end;
fa9a63c5
RM
970
971 if (start == NULL)
972 {
a1a052df 973 fprintf (stderr, "(null)\n");
fa9a63c5
RM
974 return;
975 }
5e69f11e 976
fa9a63c5
RM
977 /* Loop over pattern commands. */
978 while (p < pend)
979 {
a1a052df 980 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
981
982 switch ((re_opcode_t) *p++)
983 {
25fe55af 984 case no_op:
a1a052df 985 fprintf (stderr, "/no_op");
25fe55af 986 break;
fa9a63c5 987
99633e97 988 case succeed:
a1a052df 989 fprintf (stderr, "/succeed");
99633e97
SM
990 break;
991
fa9a63c5
RM
992 case exactn:
993 mcnt = *p++;
a1a052df 994 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 995 do
fa9a63c5 996 {
a1a052df 997 fprintf (stderr, "/%c", *p++);
25fe55af
RS
998 }
999 while (--mcnt);
1000 break;
fa9a63c5
RM
1001
1002 case start_memory:
a1a052df 1003 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1004 break;
fa9a63c5
RM
1005
1006 case stop_memory:
a1a052df 1007 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1008 break;
fa9a63c5
RM
1009
1010 case duplicate:
a1a052df 1011 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1012 break;
1013
1014 case anychar:
a1a052df 1015 fprintf (stderr, "/anychar");
fa9a63c5
RM
1016 break;
1017
1018 case charset:
25fe55af
RS
1019 case charset_not:
1020 {
1021 register int c, last = -100;
fa9a63c5 1022 register int in_range = 0;
99633e97
SM
1023 int length = CHARSET_BITMAP_SIZE (p - 1);
1024 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1025
a1a052df 1026 fprintf (stderr, "/charset [%s",
275a3409 1027 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1028
275a3409
RS
1029 if (p + *p >= pend)
1030 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1031
25fe55af 1032 for (c = 0; c < 256; c++)
96cc36cc 1033 if (c / 8 < length
fa9a63c5
RM
1034 && (p[1 + (c/8)] & (1 << (c % 8))))
1035 {
1036 /* Are we starting a range? */
1037 if (last + 1 == c && ! in_range)
1038 {
a1a052df 1039 fprintf (stderr, "-");
fa9a63c5
RM
1040 in_range = 1;
1041 }
1042 /* Have we broken a range? */
1043 else if (last + 1 != c && in_range)
96cc36cc 1044 {
a1a052df 1045 fprintf (stderr, "%c", last);
fa9a63c5
RM
1046 in_range = 0;
1047 }
5e69f11e 1048
fa9a63c5 1049 if (! in_range)
a1a052df 1050 fprintf (stderr, "%c", c);
fa9a63c5
RM
1051
1052 last = c;
25fe55af 1053 }
fa9a63c5
RM
1054
1055 if (in_range)
a1a052df 1056 fprintf (stderr, "%c", last);
fa9a63c5 1057
a1a052df 1058 fprintf (stderr, "]");
fa9a63c5 1059
99633e97 1060 p += 1 + length;
96cc36cc 1061
96cc36cc 1062 if (has_range_table)
99633e97
SM
1063 {
1064 int count;
a1a052df 1065 fprintf (stderr, "has-range-table");
99633e97
SM
1066
1067 /* ??? Should print the range table; for now, just skip it. */
1068 p += 2; /* skip range table bits */
1069 EXTRACT_NUMBER_AND_INCR (count, p);
1070 p = CHARSET_RANGE_TABLE_END (p, count);
1071 }
fa9a63c5
RM
1072 }
1073 break;
1074
1075 case begline:
a1a052df 1076 fprintf (stderr, "/begline");
25fe55af 1077 break;
fa9a63c5
RM
1078
1079 case endline:
a1a052df 1080 fprintf (stderr, "/endline");
25fe55af 1081 break;
fa9a63c5
RM
1082
1083 case on_failure_jump:
25fe55af 1084 extract_number_and_incr (&mcnt, &p);
a1a052df 1085 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1086 break;
fa9a63c5
RM
1087
1088 case on_failure_keep_string_jump:
25fe55af 1089 extract_number_and_incr (&mcnt, &p);
a1a052df 1090 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1091 break;
fa9a63c5 1092
0683b6fa
SM
1093 case on_failure_jump_nastyloop:
1094 extract_number_and_incr (&mcnt, &p);
a1a052df 1095 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1096 break;
1097
505bde11 1098 case on_failure_jump_loop:
fa9a63c5 1099 extract_number_and_incr (&mcnt, &p);
a1a052df 1100 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1101 break;
1102
505bde11 1103 case on_failure_jump_smart:
fa9a63c5 1104 extract_number_and_incr (&mcnt, &p);
a1a052df 1105 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1106 break;
1107
25fe55af 1108 case jump:
fa9a63c5 1109 extract_number_and_incr (&mcnt, &p);
a1a052df 1110 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1111 break;
1112
25fe55af
RS
1113 case succeed_n:
1114 extract_number_and_incr (&mcnt, &p);
1115 extract_number_and_incr (&mcnt2, &p);
a1a052df 1116 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1117 break;
5e69f11e 1118
25fe55af
RS
1119 case jump_n:
1120 extract_number_and_incr (&mcnt, &p);
1121 extract_number_and_incr (&mcnt2, &p);
a1a052df 1122 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1123 break;
5e69f11e 1124
25fe55af
RS
1125 case set_number_at:
1126 extract_number_and_incr (&mcnt, &p);
1127 extract_number_and_incr (&mcnt2, &p);
a1a052df 1128 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1129 break;
5e69f11e 1130
25fe55af 1131 case wordbound:
a1a052df 1132 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1133 break;
1134
1135 case notwordbound:
a1a052df 1136 fprintf (stderr, "/notwordbound");
25fe55af 1137 break;
fa9a63c5
RM
1138
1139 case wordbeg:
a1a052df 1140 fprintf (stderr, "/wordbeg");
fa9a63c5 1141 break;
5e69f11e 1142
fa9a63c5 1143 case wordend:
a1a052df 1144 fprintf (stderr, "/wordend");
e2543b02 1145 break;
5e69f11e 1146
669fa600 1147 case symbeg:
e2543b02 1148 fprintf (stderr, "/symbeg");
669fa600
SM
1149 break;
1150
1151 case symend:
e2543b02 1152 fprintf (stderr, "/symend");
669fa600
SM
1153 break;
1154
1fb352e0 1155 case syntaxspec:
a1a052df 1156 fprintf (stderr, "/syntaxspec");
1fb352e0 1157 mcnt = *p++;
a1a052df 1158 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1159 break;
1160
1161 case notsyntaxspec:
a1a052df 1162 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1163 mcnt = *p++;
a1a052df 1164 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1165 break;
1166
0b32bf0e 1167# ifdef emacs
fa9a63c5 1168 case before_dot:
a1a052df 1169 fprintf (stderr, "/before_dot");
25fe55af 1170 break;
fa9a63c5
RM
1171
1172 case at_dot:
a1a052df 1173 fprintf (stderr, "/at_dot");
25fe55af 1174 break;
fa9a63c5
RM
1175
1176 case after_dot:
a1a052df 1177 fprintf (stderr, "/after_dot");
25fe55af 1178 break;
fa9a63c5 1179
1fb352e0 1180 case categoryspec:
a1a052df 1181 fprintf (stderr, "/categoryspec");
fa9a63c5 1182 mcnt = *p++;
a1a052df 1183 fprintf (stderr, "/%d", mcnt);
25fe55af 1184 break;
5e69f11e 1185
1fb352e0 1186 case notcategoryspec:
a1a052df 1187 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1188 mcnt = *p++;
a1a052df 1189 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1190 break;
0b32bf0e 1191# endif /* emacs */
fa9a63c5 1192
fa9a63c5 1193 case begbuf:
a1a052df 1194 fprintf (stderr, "/begbuf");
25fe55af 1195 break;
fa9a63c5
RM
1196
1197 case endbuf:
a1a052df 1198 fprintf (stderr, "/endbuf");
25fe55af 1199 break;
fa9a63c5 1200
25fe55af 1201 default:
a1a052df 1202 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1203 }
1204
a1a052df 1205 fprintf (stderr, "\n");
fa9a63c5
RM
1206 }
1207
a1a052df 1208 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1209}
1210
1211
1212void
1213print_compiled_pattern (bufp)
1214 struct re_pattern_buffer *bufp;
1215{
01618498 1216 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1217
1218 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1219 printf ("%ld bytes used/%ld bytes allocated.\n",
1220 bufp->used, bufp->allocated);
fa9a63c5
RM
1221
1222 if (bufp->fastmap_accurate && bufp->fastmap)
1223 {
1224 printf ("fastmap: ");
1225 print_fastmap (bufp->fastmap);
1226 }
1227
1228 printf ("re_nsub: %d\t", bufp->re_nsub);
1229 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1230 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1231 printf ("no_sub: %d\t", bufp->no_sub);
1232 printf ("not_bol: %d\t", bufp->not_bol);
1233 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1234 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1235 fflush (stdout);
fa9a63c5
RM
1236 /* Perhaps we should print the translate table? */
1237}
1238
1239
1240void
1241print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1242 re_char *where;
1243 re_char *string1;
1244 re_char *string2;
fa9a63c5
RM
1245 int size1;
1246 int size2;
1247{
4bb91c68 1248 int this_char;
5e69f11e 1249
fa9a63c5
RM
1250 if (where == NULL)
1251 printf ("(null)");
1252 else
1253 {
1254 if (FIRST_STRING_P (where))
25fe55af
RS
1255 {
1256 for (this_char = where - string1; this_char < size1; this_char++)
1257 putchar (string1[this_char]);
fa9a63c5 1258
25fe55af
RS
1259 where = string2;
1260 }
fa9a63c5
RM
1261
1262 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1263 putchar (string2[this_char]);
fa9a63c5
RM
1264 }
1265}
1266
1267#else /* not DEBUG */
1268
0b32bf0e
SM
1269# undef assert
1270# define assert(e)
fa9a63c5 1271
0b32bf0e
SM
1272# define DEBUG_STATEMENT(e)
1273# define DEBUG_PRINT1(x)
1274# define DEBUG_PRINT2(x1, x2)
1275# define DEBUG_PRINT3(x1, x2, x3)
1276# define DEBUG_PRINT4(x1, x2, x3, x4)
1277# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1278# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1279
1280#endif /* not DEBUG */
1281\f
1282/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1283 also be assigned to arbitrarily: each pattern buffer stores its own
1284 syntax, so it can be changed between regex compilations. */
1285/* This has no initializer because initialized variables in Emacs
1286 become read-only after dumping. */
1287reg_syntax_t re_syntax_options;
1288
1289
1290/* Specify the precise syntax of regexps for compilation. This provides
1291 for compatibility for various utilities which historically have
1292 different, incompatible syntaxes.
1293
1294 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1295 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1296
1297reg_syntax_t
1298re_set_syntax (syntax)
f9b0fd99 1299 reg_syntax_t syntax;
fa9a63c5
RM
1300{
1301 reg_syntax_t ret = re_syntax_options;
5e69f11e 1302
fa9a63c5
RM
1303 re_syntax_options = syntax;
1304 return ret;
1305}
c0f9ea08 1306WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1307
1308/* Regexp to use to replace spaces, or NULL meaning don't. */
1309static re_char *whitespace_regexp;
1310
1311void
1312re_set_whitespace_regexp (regexp)
6470ea05 1313 const char *regexp;
f9b0fd99 1314{
6470ea05 1315 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1316}
1317WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1318\f
1319/* This table gives an error message for each of the error codes listed
4bb91c68 1320 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1321 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1322 but why not be nice? */
fa9a63c5
RM
1323
1324static const char *re_error_msgid[] =
5e69f11e
RM
1325 {
1326 gettext_noop ("Success"), /* REG_NOERROR */
1327 gettext_noop ("No match"), /* REG_NOMATCH */
1328 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1329 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1330 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1331 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1332 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1333 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1334 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1335 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1336 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1337 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1338 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1339 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1340 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1341 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1342 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1343 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1344 };
1345\f
4bb91c68 1346/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1347
1348/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1349 searching and matching functions should not call alloca. On some
1350 systems, alloca is implemented in terms of malloc, and if we're
1351 using the relocating allocator routines, then malloc could cause a
1352 relocation, which might (if the strings being searched are in the
1353 ralloc heap) shift the data out from underneath the regexp
1354 routines.
1355
5e69f11e 1356 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1357 processes input from X in a signal handler; processing X input may
1358 call malloc; if input arrives while a matching routine is calling
1359 malloc, then we're scrod. But Emacs can't just block input while
1360 calling matching routines; then we don't notice interrupts when
1361 they come in. So, Emacs blocks input around all regexp calls
1362 except the matching calls, which it leaves unprotected, in the
1363 faith that they will not malloc. */
1364
1365/* Normally, this is fine. */
1366#define MATCH_MAY_ALLOCATE
1367
1368/* When using GNU C, we are not REALLY using the C alloca, no matter
1369 what config.h may say. So don't take precautions for it. */
1370#ifdef __GNUC__
0b32bf0e 1371# undef C_ALLOCA
fa9a63c5
RM
1372#endif
1373
1374/* The match routines may not allocate if (1) they would do it with malloc
1375 and (2) it's not safe for them to use malloc.
1376 Note that if REL_ALLOC is defined, matching would not use malloc for the
1377 failure stack, but we would still use it for the register vectors;
4bb91c68 1378 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1379#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1380# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1381#endif
1382
1383\f
1384/* Failure stack declarations and macros; both re_compile_fastmap and
1385 re_match_2 use a failure stack. These have to be macros because of
1386 REGEX_ALLOCATE_STACK. */
5e69f11e 1387
fa9a63c5 1388
320a2a73 1389/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1390 when matching. If this number is exceeded, we allocate more
1391 space, so it is not a hard limit. */
1392#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1393# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1394#endif
1395
1396/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1397 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1398 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1399 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1400 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1401# if defined MATCH_MAY_ALLOCATE
1402/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1403 whose default stack limit is 2mb. In order for a larger
1404 value to work reliably, you have to try to make it accord
1405 with the process stack limit. */
c0f9ea08
SM
1406size_t re_max_failures = 40000;
1407# else
1408size_t re_max_failures = 4000;
1409# endif
fa9a63c5
RM
1410
1411union fail_stack_elt
1412{
01618498 1413 re_char *pointer;
c0f9ea08
SM
1414 /* This should be the biggest `int' that's no bigger than a pointer. */
1415 long integer;
fa9a63c5
RM
1416};
1417
1418typedef union fail_stack_elt fail_stack_elt_t;
1419
1420typedef struct
1421{
1422 fail_stack_elt_t *stack;
c0f9ea08
SM
1423 size_t size;
1424 size_t avail; /* Offset of next open position. */
1425 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1426} fail_stack_type;
1427
505bde11 1428#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1429#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1430
1431
1432/* Define macros to initialize and free the failure stack.
1433 Do `return -2' if the alloc fails. */
1434
1435#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1436# define INIT_FAIL_STACK() \
fa9a63c5
RM
1437 do { \
1438 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1439 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1440 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1441 \
1442 if (fail_stack.stack == NULL) \
1443 return -2; \
1444 \
1445 fail_stack.size = INIT_FAILURE_ALLOC; \
1446 fail_stack.avail = 0; \
505bde11 1447 fail_stack.frame = 0; \
fa9a63c5
RM
1448 } while (0)
1449
0b32bf0e 1450# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1451#else
0b32bf0e 1452# define INIT_FAIL_STACK() \
fa9a63c5
RM
1453 do { \
1454 fail_stack.avail = 0; \
505bde11 1455 fail_stack.frame = 0; \
fa9a63c5
RM
1456 } while (0)
1457
0b32bf0e 1458# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1459#endif
1460
1461
320a2a73
KH
1462/* Double the size of FAIL_STACK, up to a limit
1463 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1464
1465 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1466 allocating space for it or it was already too large.
1467
4bb91c68 1468 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1469
320a2a73
KH
1470/* Factor to increase the failure stack size by
1471 when we increase it.
1472 This used to be 2, but 2 was too wasteful
1473 because the old discarded stacks added up to as much space
1474 were as ultimate, maximum-size stack. */
1475#define FAIL_STACK_GROWTH_FACTOR 4
1476
1477#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1478 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1479 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1480 ? 0 \
320a2a73
KH
1481 : ((fail_stack).stack \
1482 = (fail_stack_elt_t *) \
25fe55af
RS
1483 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1484 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1485 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1486 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1487 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1488 \
1489 (fail_stack).stack == NULL \
1490 ? 0 \
6453db45
KH
1491 : ((fail_stack).size \
1492 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1493 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1494 * FAIL_STACK_GROWTH_FACTOR)) \
1495 / sizeof (fail_stack_elt_t)), \
25fe55af 1496 1)))
fa9a63c5
RM
1497
1498
fa9a63c5
RM
1499/* Push a pointer value onto the failure stack.
1500 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1501 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1502#define PUSH_FAILURE_POINTER(item) \
01618498 1503 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1504
1505/* This pushes an integer-valued item onto the failure stack.
1506 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1507 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1508#define PUSH_FAILURE_INT(item) \
1509 fail_stack.stack[fail_stack.avail++].integer = (item)
1510
1511/* Push a fail_stack_elt_t value onto the failure stack.
1512 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1513 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1514#define PUSH_FAILURE_ELT(item) \
1515 fail_stack.stack[fail_stack.avail++] = (item)
1516
1517/* These three POP... operations complement the three PUSH... operations.
1518 All assume that `fail_stack' is nonempty. */
1519#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1520#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1521#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1522
505bde11
SM
1523/* Individual items aside from the registers. */
1524#define NUM_NONREG_ITEMS 3
1525
1526/* Used to examine the stack (to detect infinite loops). */
1527#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1528#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1529#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1530#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1531
1532
505bde11
SM
1533#define ENSURE_FAIL_STACK(space) \
1534while (REMAINING_AVAIL_SLOTS <= space) { \
1535 if (!GROW_FAIL_STACK (fail_stack)) \
1536 return -2; \
1537 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1538 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1539}
1540
1541/* Push register NUM onto the stack. */
1542#define PUSH_FAILURE_REG(num) \
1543do { \
1544 char *destination; \
1545 ENSURE_FAIL_STACK(3); \
1546 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1547 num, regstart[num], regend[num]); \
1548 PUSH_FAILURE_POINTER (regstart[num]); \
1549 PUSH_FAILURE_POINTER (regend[num]); \
1550 PUSH_FAILURE_INT (num); \
1551} while (0)
1552
01618498
SM
1553/* Change the counter's value to VAL, but make sure that it will
1554 be reset when backtracking. */
1555#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1556do { \
1557 char *destination; \
1558 int c; \
1559 ENSURE_FAIL_STACK(3); \
1560 EXTRACT_NUMBER (c, ptr); \
01618498 1561 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1562 PUSH_FAILURE_INT (c); \
1563 PUSH_FAILURE_POINTER (ptr); \
1564 PUSH_FAILURE_INT (-1); \
01618498 1565 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1566} while (0)
1567
505bde11 1568/* Pop a saved register off the stack. */
dc1e502d 1569#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1570do { \
1571 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1572 if (reg == -1) \
1573 { \
1574 /* It's a counter. */ \
6dcf2d0e
SM
1575 /* Here, we discard `const', making re_match non-reentrant. */ \
1576 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1577 reg = POP_FAILURE_INT (); \
1578 STORE_NUMBER (ptr, reg); \
1579 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1580 } \
1581 else \
1582 { \
1583 regend[reg] = POP_FAILURE_POINTER (); \
1584 regstart[reg] = POP_FAILURE_POINTER (); \
1585 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1586 reg, regstart[reg], regend[reg]); \
1587 } \
505bde11
SM
1588} while (0)
1589
1590/* Check that we are not stuck in an infinite loop. */
1591#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1592do { \
f6df485f 1593 int failure = TOP_FAILURE_HANDLE (); \
505bde11 1594 /* Check for infinite matching loops */ \
f6df485f
RS
1595 while (failure > 0 \
1596 && (FAILURE_STR (failure) == string_place \
1597 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1598 { \
1599 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1600 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1601 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1602 { \
6df42991
SM
1603 cycle = 1; \
1604 break; \
f6df485f 1605 } \
66f0296e 1606 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1607 failure = NEXT_FAILURE_HANDLE(failure); \
1608 } \
1609 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1610} while (0)
6df42991 1611
fa9a63c5 1612/* Push the information about the state we will need
5e69f11e
RM
1613 if we ever fail back to it.
1614
505bde11 1615 Requires variables fail_stack, regstart, regend and
320a2a73 1616 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1617 declared.
5e69f11e 1618
fa9a63c5
RM
1619 Does `return FAILURE_CODE' if runs out of memory. */
1620
505bde11
SM
1621#define PUSH_FAILURE_POINT(pattern, string_place) \
1622do { \
1623 char *destination; \
1624 /* Must be int, so when we don't save any registers, the arithmetic \
1625 of 0 + -1 isn't done as unsigned. */ \
1626 \
505bde11 1627 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1628 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1629 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1630 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1631 \
1632 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1633 \
1634 DEBUG_PRINT1 ("\n"); \
1635 \
1636 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1637 PUSH_FAILURE_INT (fail_stack.frame); \
1638 \
1639 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1640 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1641 DEBUG_PRINT1 ("'\n"); \
1642 PUSH_FAILURE_POINTER (string_place); \
1643 \
1644 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1645 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1646 PUSH_FAILURE_POINTER (pattern); \
1647 \
1648 /* Close the frame by moving the frame pointer past it. */ \
1649 fail_stack.frame = fail_stack.avail; \
1650} while (0)
fa9a63c5 1651
320a2a73
KH
1652/* Estimate the size of data pushed by a typical failure stack entry.
1653 An estimate is all we need, because all we use this for
1654 is to choose a limit for how big to make the failure stack. */
ada30c0e 1655/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1656#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1657
fa9a63c5
RM
1658/* How many items can still be added to the stack without overflowing it. */
1659#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1660
1661
1662/* Pops what PUSH_FAIL_STACK pushes.
1663
1664 We restore into the parameters, all of which should be lvalues:
1665 STR -- the saved data position.
1666 PAT -- the saved pattern position.
fa9a63c5 1667 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1668
fa9a63c5 1669 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1670 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1671
505bde11
SM
1672#define POP_FAILURE_POINT(str, pat) \
1673do { \
fa9a63c5
RM
1674 assert (!FAIL_STACK_EMPTY ()); \
1675 \
1676 /* Remove failure points and point to how many regs pushed. */ \
1677 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1678 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1679 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1680 \
505bde11
SM
1681 /* Pop the saved registers. */ \
1682 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1683 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1684 \
01618498 1685 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1686 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1687 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1688 \
1689 /* If the saved string location is NULL, it came from an \
1690 on_failure_keep_string_jump opcode, and we want to throw away the \
1691 saved NULL, thus retaining our current position in the string. */ \
01618498 1692 str = POP_FAILURE_POINTER (); \
505bde11 1693 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1694 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1695 DEBUG_PRINT1 ("'\n"); \
1696 \
505bde11
SM
1697 fail_stack.frame = POP_FAILURE_INT (); \
1698 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1699 \
505bde11
SM
1700 assert (fail_stack.avail >= 0); \
1701 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1702 \
fa9a63c5 1703 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1704} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1705
1706
1707\f
fa9a63c5 1708/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1709#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1710\f
1711/* Subroutine declarations and macros for regex_compile. */
1712
4bb91c68
SM
1713static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1714 reg_syntax_t syntax,
1715 struct re_pattern_buffer *bufp));
1716static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1717static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1718 int arg1, int arg2));
1719static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1720 int arg, unsigned char *end));
1721static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1722 int arg1, int arg2, unsigned char *end));
01618498
SM
1723static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1724 re_char *p,
4bb91c68 1725 reg_syntax_t syntax));
01618498
SM
1726static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1727 re_char *pend,
4bb91c68 1728 reg_syntax_t syntax));
01618498
SM
1729static re_char *skip_one_char _RE_ARGS ((re_char *p));
1730static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1731 char *fastmap, const int multibyte));
fa9a63c5 1732
fa9a63c5 1733/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1734 translation. */
36595814 1735#define PATFETCH(c) \
2d1675e4
SM
1736 do { \
1737 int len; \
1738 if (p == pend) return REG_EEND; \
1739 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1740 p += len; \
fa9a63c5
RM
1741 } while (0)
1742
fa9a63c5
RM
1743
1744/* If `translate' is non-null, return translate[D], else just D. We
1745 cast the subscript to translate because some data is declared as
1746 `char *', to avoid warnings when a string constant is passed. But
1747 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1748#ifndef TRANSLATE
0b32bf0e 1749# define TRANSLATE(d) \
66f0296e 1750 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1751#endif
fa9a63c5
RM
1752
1753
1754/* Macros for outputting the compiled pattern into `buffer'. */
1755
1756/* If the buffer isn't allocated when it comes in, use this. */
1757#define INIT_BUF_SIZE 32
1758
4bb91c68 1759/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1760#define GET_BUFFER_SPACE(n) \
01618498 1761 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1762 EXTEND_BUFFER ()
1763
1764/* Make sure we have one more byte of buffer space and then add C to it. */
1765#define BUF_PUSH(c) \
1766 do { \
1767 GET_BUFFER_SPACE (1); \
1768 *b++ = (unsigned char) (c); \
1769 } while (0)
1770
1771
1772/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1773#define BUF_PUSH_2(c1, c2) \
1774 do { \
1775 GET_BUFFER_SPACE (2); \
1776 *b++ = (unsigned char) (c1); \
1777 *b++ = (unsigned char) (c2); \
1778 } while (0)
1779
1780
4bb91c68 1781/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1782#define BUF_PUSH_3(c1, c2, c3) \
1783 do { \
1784 GET_BUFFER_SPACE (3); \
1785 *b++ = (unsigned char) (c1); \
1786 *b++ = (unsigned char) (c2); \
1787 *b++ = (unsigned char) (c3); \
1788 } while (0)
1789
1790
1791/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1792 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1793#define STORE_JUMP(op, loc, to) \
1794 store_op1 (op, loc, (to) - (loc) - 3)
1795
1796/* Likewise, for a two-argument jump. */
1797#define STORE_JUMP2(op, loc, to, arg) \
1798 store_op2 (op, loc, (to) - (loc) - 3, arg)
1799
4bb91c68 1800/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1801#define INSERT_JUMP(op, loc, to) \
1802 insert_op1 (op, loc, (to) - (loc) - 3, b)
1803
1804/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1805#define INSERT_JUMP2(op, loc, to, arg) \
1806 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1807
1808
1809/* This is not an arbitrary limit: the arguments which represent offsets
275a3409 1810 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1811 be too small, many things would have to change. */
275a3409
RS
1812# define MAX_BUF_SIZE (1L << 15)
1813
1814#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1815/* Any other compiler which, like MSC, has allocation limit below 2^16
1816 bytes will have to use approach similar to what was done below for
1817 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1818 reallocating to 0 bytes. Such thing is not going to work too well.
1819 You have been warned!! */
1820#if defined _MSC_VER && !defined WIN32
1821/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1822# define MAX_BUF_SIZE 65500L
1823#else
1824# define MAX_BUF_SIZE (1L << 16)
1825#endif
275a3409 1826#endif /* 0 */
fa9a63c5
RM
1827
1828/* Extend the buffer by twice its current size via realloc and
1829 reset the pointers that pointed into the old block to point to the
1830 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1831 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1832#if __BOUNDED_POINTERS__
1833# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1834# define MOVE_BUFFER_POINTER(P) \
1835 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1836# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1837 else \
1838 { \
1839 SET_HIGH_BOUND (b); \
1840 SET_HIGH_BOUND (begalt); \
1841 if (fixup_alt_jump) \
1842 SET_HIGH_BOUND (fixup_alt_jump); \
1843 if (laststart) \
1844 SET_HIGH_BOUND (laststart); \
1845 if (pending_exact) \
1846 SET_HIGH_BOUND (pending_exact); \
1847 }
1848#else
1849# define MOVE_BUFFER_POINTER(P) (P) += incr
1850# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1851#endif
fa9a63c5 1852#define EXTEND_BUFFER() \
25fe55af 1853 do { \
01618498 1854 re_char *old_buffer = bufp->buffer; \
25fe55af 1855 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1856 return REG_ESIZE; \
1857 bufp->allocated <<= 1; \
1858 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1859 bufp->allocated = MAX_BUF_SIZE; \
01618498 1860 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1861 if (bufp->buffer == NULL) \
1862 return REG_ESPACE; \
1863 /* If the buffer moved, move all the pointers into it. */ \
1864 if (old_buffer != bufp->buffer) \
1865 { \
4bb91c68
SM
1866 int incr = bufp->buffer - old_buffer; \
1867 MOVE_BUFFER_POINTER (b); \
1868 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1869 if (fixup_alt_jump) \
4bb91c68 1870 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1871 if (laststart) \
4bb91c68 1872 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1873 if (pending_exact) \
4bb91c68 1874 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1875 } \
4bb91c68 1876 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1877 } while (0)
1878
1879
1880/* Since we have one byte reserved for the register number argument to
1881 {start,stop}_memory, the maximum number of groups we can report
1882 things about is what fits in that byte. */
1883#define MAX_REGNUM 255
1884
1885/* But patterns can have more than `MAX_REGNUM' registers. We just
1886 ignore the excess. */
098d42af 1887typedef int regnum_t;
fa9a63c5
RM
1888
1889
1890/* Macros for the compile stack. */
1891
1892/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1893 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1894/* int may be not enough when sizeof(int) == 2. */
1895typedef long pattern_offset_t;
fa9a63c5
RM
1896
1897typedef struct
1898{
1899 pattern_offset_t begalt_offset;
1900 pattern_offset_t fixup_alt_jump;
5e69f11e 1901 pattern_offset_t laststart_offset;
fa9a63c5
RM
1902 regnum_t regnum;
1903} compile_stack_elt_t;
1904
1905
1906typedef struct
1907{
1908 compile_stack_elt_t *stack;
1909 unsigned size;
1910 unsigned avail; /* Offset of next open position. */
1911} compile_stack_type;
1912
1913
1914#define INIT_COMPILE_STACK_SIZE 32
1915
1916#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1917#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1918
4bb91c68 1919/* The next available element. */
fa9a63c5
RM
1920#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1921
1cee1e27
SM
1922/* Explicit quit checking is only used on NTemacs and whenever we
1923 use polling to process input events. */
1924#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1925extern int immediate_quit;
1926# define IMMEDIATE_QUIT_CHECK \
1927 do { \
1928 if (immediate_quit) QUIT; \
1929 } while (0)
1930#else
1931# define IMMEDIATE_QUIT_CHECK ((void)0)
1932#endif
1933\f
b18215fc
RS
1934/* Structure to manage work area for range table. */
1935struct range_table_work_area
1936{
1937 int *table; /* actual work area. */
1938 int allocated; /* allocated size for work area in bytes. */
25fe55af 1939 int used; /* actually used size in words. */
96cc36cc 1940 int bits; /* flag to record character classes */
b18215fc
RS
1941};
1942
77d11aec
RS
1943/* Make sure that WORK_AREA can hold more N multibyte characters.
1944 This is used only in set_image_of_range and set_image_of_range_1.
1945 It expects WORK_AREA to be a pointer.
1946 If it can't get the space, it returns from the surrounding function. */
1947
1948#define EXTEND_RANGE_TABLE(work_area, n) \
1949 do { \
1950 if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
1951 { \
1952 extend_range_table_work_area (work_area); \
1953 if ((work_area)->table == 0) \
1954 return (REG_ESPACE); \
1955 } \
b18215fc
RS
1956 } while (0)
1957
96cc36cc
RS
1958#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1959 (work_area).bits |= (bit)
1960
14473664
SM
1961/* Bits used to implement the multibyte-part of the various character classes
1962 such as [:alnum:] in a charset's range table. */
1963#define BIT_WORD 0x1
1964#define BIT_LOWER 0x2
1965#define BIT_PUNCT 0x4
1966#define BIT_SPACE 0x8
1967#define BIT_UPPER 0x10
1968#define BIT_MULTIBYTE 0x20
96cc36cc 1969
36595814
SM
1970/* Set a range START..END to WORK_AREA.
1971 The range is passed through TRANSLATE, so START and END
1972 should be untranslated. */
77d11aec
RS
1973#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
1974 do { \
1975 int tem; \
1976 tem = set_image_of_range (&work_area, start, end, translate); \
1977 if (tem > 0) \
1978 FREE_STACK_RETURN (tem); \
b18215fc
RS
1979 } while (0)
1980
25fe55af 1981/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1982#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1983 do { \
1984 if ((work_area).table) \
1985 free ((work_area).table); \
1986 } while (0)
1987
96cc36cc 1988#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1989#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1990#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1991#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1992\f
b18215fc 1993
fa9a63c5 1994/* Set the bit for character C in a list. */
01618498 1995#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1996
1997
1998/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1999#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2000 do { \
2001 if (p == pend) \
2002 FREE_STACK_RETURN (REG_EBRACE); \
2003 else \
2004 { \
2005 PATFETCH (c); \
2006 while ('0' <= c && c <= '9') \
2007 { \
2008 int prev; \
2009 if (num < 0) \
2010 num = 0; \
2011 prev = num; \
2012 num = num * 10 + c - '0'; \
2013 if (num / 10 != prev) \
2014 FREE_STACK_RETURN (REG_BADBR); \
2015 if (p == pend) \
2016 FREE_STACK_RETURN (REG_EBRACE); \
2017 PATFETCH (c); \
2018 } \
2019 } \
2020 } while (0)
77d11aec 2021\f
1fdab503 2022#if ! WIDE_CHAR_SUPPORT
01618498 2023
14473664 2024/* Map a string to the char class it names (if any). */
1fdab503 2025re_wctype_t
ada30c0e
SM
2026re_wctype (str)
2027 re_char *str;
14473664 2028{
ada30c0e 2029 const char *string = str;
14473664
SM
2030 if (STREQ (string, "alnum")) return RECC_ALNUM;
2031 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2032 else if (STREQ (string, "word")) return RECC_WORD;
2033 else if (STREQ (string, "ascii")) return RECC_ASCII;
2034 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2035 else if (STREQ (string, "graph")) return RECC_GRAPH;
2036 else if (STREQ (string, "lower")) return RECC_LOWER;
2037 else if (STREQ (string, "print")) return RECC_PRINT;
2038 else if (STREQ (string, "punct")) return RECC_PUNCT;
2039 else if (STREQ (string, "space")) return RECC_SPACE;
2040 else if (STREQ (string, "upper")) return RECC_UPPER;
2041 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2042 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2043 else if (STREQ (string, "digit")) return RECC_DIGIT;
2044 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2045 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2046 else if (STREQ (string, "blank")) return RECC_BLANK;
2047 else return 0;
2048}
2049
2050/* True iff CH is in the char class CC. */
1fdab503 2051boolean
14473664
SM
2052re_iswctype (ch, cc)
2053 int ch;
2054 re_wctype_t cc;
2055{
2056 switch (cc)
2057 {
0cdd06f8
SM
2058 case RECC_ALNUM: return ISALNUM (ch);
2059 case RECC_ALPHA: return ISALPHA (ch);
2060 case RECC_BLANK: return ISBLANK (ch);
2061 case RECC_CNTRL: return ISCNTRL (ch);
2062 case RECC_DIGIT: return ISDIGIT (ch);
2063 case RECC_GRAPH: return ISGRAPH (ch);
2064 case RECC_LOWER: return ISLOWER (ch);
2065 case RECC_PRINT: return ISPRINT (ch);
2066 case RECC_PUNCT: return ISPUNCT (ch);
2067 case RECC_SPACE: return ISSPACE (ch);
2068 case RECC_UPPER: return ISUPPER (ch);
2069 case RECC_XDIGIT: return ISXDIGIT (ch);
2070 case RECC_ASCII: return IS_REAL_ASCII (ch);
2071 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2072 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2073 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2074 case RECC_WORD: return ISWORD (ch);
2075 case RECC_ERROR: return false;
2076 default:
2077 abort();
14473664
SM
2078 }
2079}
fa9a63c5 2080
14473664
SM
2081/* Return a bit-pattern to use in the range-table bits to match multibyte
2082 chars of class CC. */
2083static int
2084re_wctype_to_bit (cc)
2085 re_wctype_t cc;
2086{
2087 switch (cc)
2088 {
2089 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2090 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2091 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2092 case RECC_LOWER: return BIT_LOWER;
2093 case RECC_UPPER: return BIT_UPPER;
2094 case RECC_PUNCT: return BIT_PUNCT;
2095 case RECC_SPACE: return BIT_SPACE;
14473664 2096 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2097 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2098 default:
2099 abort();
14473664
SM
2100 }
2101}
2102#endif
77d11aec
RS
2103\f
2104/* Filling in the work area of a range. */
2105
2106/* Actually extend the space in WORK_AREA. */
2107
2108static void
2109extend_range_table_work_area (work_area)
2110 struct range_table_work_area *work_area;
177c0ea7 2111{
77d11aec
RS
2112 work_area->allocated += 16 * sizeof (int);
2113 if (work_area->table)
2114 work_area->table
a073faa6 2115 = (int *) realloc (work_area->table, work_area->allocated);
77d11aec
RS
2116 else
2117 work_area->table
a073faa6 2118 = (int *) malloc (work_area->allocated);
77d11aec
RS
2119}
2120
2121#ifdef emacs
2122
2123/* Carefully find the ranges of codes that are equivalent
2124 under case conversion to the range start..end when passed through
2125 TRANSLATE. Handle the case where non-letters can come in between
2126 two upper-case letters (which happens in Latin-1).
2127 Also handle the case of groups of more than 2 case-equivalent chars.
2128
2129 The basic method is to look at consecutive characters and see
2130 if they can form a run that can be handled as one.
2131
2132 Returns -1 if successful, REG_ESPACE if ran out of space. */
2133
2134static int
2135set_image_of_range_1 (work_area, start, end, translate)
2136 RE_TRANSLATE_TYPE translate;
2137 struct range_table_work_area *work_area;
2138 re_wchar_t start, end;
2139{
2140 /* `one_case' indicates a character, or a run of characters,
2141 each of which is an isolate (no case-equivalents).
2142 This includes all ASCII non-letters.
2143
2144 `two_case' indicates a character, or a run of characters,
2145 each of which has two case-equivalent forms.
2146 This includes all ASCII letters.
2147
2148 `strange' indicates a character that has more than one
2149 case-equivalent. */
177c0ea7 2150
77d11aec
RS
2151 enum case_type {one_case, two_case, strange};
2152
2153 /* Describe the run that is in progress,
2154 which the next character can try to extend.
2155 If run_type is strange, that means there really is no run.
2156 If run_type is one_case, then run_start...run_end is the run.
2157 If run_type is two_case, then the run is run_start...run_end,
2158 and the case-equivalents end at run_eqv_end. */
2159
2160 enum case_type run_type = strange;
2161 int run_start, run_end, run_eqv_end;
2162
2163 Lisp_Object eqv_table;
2164
2165 if (!RE_TRANSLATE_P (translate))
2166 {
b7c12565 2167 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2168 work_area->table[work_area->used++] = (start);
2169 work_area->table[work_area->used++] = (end);
b7c12565 2170 return -1;
77d11aec
RS
2171 }
2172
2173 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2174
77d11aec
RS
2175 for (; start <= end; start++)
2176 {
2177 enum case_type this_type;
2178 int eqv = RE_TRANSLATE (eqv_table, start);
2179 int minchar, maxchar;
2180
2181 /* Classify this character */
2182 if (eqv == start)
2183 this_type = one_case;
2184 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2185 this_type = two_case;
2186 else
2187 this_type = strange;
2188
2189 if (start < eqv)
2190 minchar = start, maxchar = eqv;
2191 else
2192 minchar = eqv, maxchar = start;
2193
2194 /* Can this character extend the run in progress? */
2195 if (this_type == strange || this_type != run_type
2196 || !(minchar == run_end + 1
2197 && (run_type == two_case
2198 ? maxchar == run_eqv_end + 1 : 1)))
2199 {
2200 /* No, end the run.
2201 Record each of its equivalent ranges. */
2202 if (run_type == one_case)
2203 {
2204 EXTEND_RANGE_TABLE (work_area, 2);
2205 work_area->table[work_area->used++] = run_start;
2206 work_area->table[work_area->used++] = run_end;
2207 }
2208 else if (run_type == two_case)
2209 {
2210 EXTEND_RANGE_TABLE (work_area, 4);
2211 work_area->table[work_area->used++] = run_start;
2212 work_area->table[work_area->used++] = run_end;
2213 work_area->table[work_area->used++]
2214 = RE_TRANSLATE (eqv_table, run_start);
2215 work_area->table[work_area->used++]
2216 = RE_TRANSLATE (eqv_table, run_end);
2217 }
2218 run_type = strange;
2219 }
177c0ea7 2220
77d11aec
RS
2221 if (this_type == strange)
2222 {
2223 /* For a strange character, add each of its equivalents, one
2224 by one. Don't start a range. */
2225 do
2226 {
2227 EXTEND_RANGE_TABLE (work_area, 2);
2228 work_area->table[work_area->used++] = eqv;
2229 work_area->table[work_area->used++] = eqv;
2230 eqv = RE_TRANSLATE (eqv_table, eqv);
2231 }
2232 while (eqv != start);
2233 }
2234
2235 /* Add this char to the run, or start a new run. */
2236 else if (run_type == strange)
2237 {
2238 /* Initialize a new range. */
2239 run_type = this_type;
2240 run_start = start;
2241 run_end = start;
2242 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2243 }
2244 else
2245 {
2246 /* Extend a running range. */
2247 run_end = minchar;
2248 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2249 }
2250 }
2251
2252 /* If a run is still in progress at the end, finish it now
2253 by recording its equivalent ranges. */
2254 if (run_type == one_case)
2255 {
2256 EXTEND_RANGE_TABLE (work_area, 2);
2257 work_area->table[work_area->used++] = run_start;
2258 work_area->table[work_area->used++] = run_end;
2259 }
2260 else if (run_type == two_case)
2261 {
2262 EXTEND_RANGE_TABLE (work_area, 4);
2263 work_area->table[work_area->used++] = run_start;
2264 work_area->table[work_area->used++] = run_end;
2265 work_area->table[work_area->used++]
2266 = RE_TRANSLATE (eqv_table, run_start);
2267 work_area->table[work_area->used++]
2268 = RE_TRANSLATE (eqv_table, run_end);
2269 }
2270
2271 return -1;
2272}
36595814 2273
77d11aec 2274#endif /* emacs */
36595814 2275
b7c12565 2276/* Record the the image of the range start..end when passed through
36595814
SM
2277 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2278 and is not even necessarily contiguous.
b7c12565
RS
2279 Normally we approximate it with the smallest contiguous range that contains
2280 all the chars we need. However, for Latin-1 we go to extra effort
2281 to do a better job.
2282
2283 This function is not called for ASCII ranges.
77d11aec
RS
2284
2285 Returns -1 if successful, REG_ESPACE if ran out of space. */
2286
2287static int
36595814
SM
2288set_image_of_range (work_area, start, end, translate)
2289 RE_TRANSLATE_TYPE translate;
2290 struct range_table_work_area *work_area;
2291 re_wchar_t start, end;
2292{
77d11aec
RS
2293 re_wchar_t cmin, cmax;
2294
2295#ifdef emacs
2296 /* For Latin-1 ranges, use set_image_of_range_1
2297 to get proper handling of ranges that include letters and nonletters.
b7c12565 2298 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2299 For other character sets, we don't bother to get this right. */
b7c12565
RS
2300 if (RE_TRANSLATE_P (translate) && start < 04400
2301 && !(start < 04200 && end >= 04377))
77d11aec 2302 {
b7c12565 2303 int newend;
77d11aec 2304 int tem;
b7c12565
RS
2305 newend = end;
2306 if (newend > 04377)
2307 newend = 04377;
2308 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2309 if (tem > 0)
2310 return tem;
2311
2312 start = 04400;
2313 if (end < 04400)
2314 return -1;
2315 }
2316#endif
2317
b7c12565
RS
2318 EXTEND_RANGE_TABLE (work_area, 2);
2319 work_area->table[work_area->used++] = (start);
2320 work_area->table[work_area->used++] = (end);
2321
2322 cmin = -1, cmax = -1;
77d11aec 2323
36595814 2324 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2325 {
2326 int ch;
77d11aec 2327
b7c12565
RS
2328 for (ch = start; ch <= end; ch++)
2329 {
2330 re_wchar_t c = TRANSLATE (ch);
2331 if (! (start <= c && c <= end))
2332 {
2333 if (cmin == -1)
2334 cmin = c, cmax = c;
2335 else
2336 {
2337 cmin = MIN (cmin, c);
2338 cmax = MAX (cmax, c);
2339 }
2340 }
2341 }
2342
2343 if (cmin != -1)
2344 {
2345 EXTEND_RANGE_TABLE (work_area, 2);
2346 work_area->table[work_area->used++] = (cmin);
2347 work_area->table[work_area->used++] = (cmax);
2348 }
2349 }
36595814 2350
77d11aec
RS
2351 return -1;
2352}
fa9a63c5
RM
2353\f
2354#ifndef MATCH_MAY_ALLOCATE
2355
2356/* If we cannot allocate large objects within re_match_2_internal,
2357 we make the fail stack and register vectors global.
2358 The fail stack, we grow to the maximum size when a regexp
2359 is compiled.
2360 The register vectors, we adjust in size each time we
2361 compile a regexp, according to the number of registers it needs. */
2362
2363static fail_stack_type fail_stack;
2364
2365/* Size with which the following vectors are currently allocated.
2366 That is so we can make them bigger as needed,
4bb91c68 2367 but never make them smaller. */
fa9a63c5
RM
2368static int regs_allocated_size;
2369
66f0296e
SM
2370static re_char ** regstart, ** regend;
2371static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2372
2373/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2374 but don't make them smaller. */
fa9a63c5
RM
2375
2376static
2377regex_grow_registers (num_regs)
2378 int num_regs;
2379{
2380 if (num_regs > regs_allocated_size)
2381 {
66f0296e
SM
2382 RETALLOC_IF (regstart, num_regs, re_char *);
2383 RETALLOC_IF (regend, num_regs, re_char *);
2384 RETALLOC_IF (best_regstart, num_regs, re_char *);
2385 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2386
2387 regs_allocated_size = num_regs;
2388 }
2389}
2390
2391#endif /* not MATCH_MAY_ALLOCATE */
2392\f
99633e97
SM
2393static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2394 compile_stack,
2395 regnum_t regnum));
2396
fa9a63c5
RM
2397/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2398 Returns one of error codes defined in `regex.h', or zero for success.
2399
2400 Assumes the `allocated' (and perhaps `buffer') and `translate'
2401 fields are set in BUFP on entry.
2402
2403 If it succeeds, results are put in BUFP (if it returns an error, the
2404 contents of BUFP are undefined):
2405 `buffer' is the compiled pattern;
2406 `syntax' is set to SYNTAX;
2407 `used' is set to the length of the compiled pattern;
2408 `fastmap_accurate' is zero;
2409 `re_nsub' is the number of subexpressions in PATTERN;
2410 `not_bol' and `not_eol' are zero;
5e69f11e 2411
c0f9ea08 2412 The `fastmap' field is neither examined nor set. */
fa9a63c5 2413
505bde11
SM
2414/* Insert the `jump' from the end of last alternative to "here".
2415 The space for the jump has already been allocated. */
2416#define FIXUP_ALT_JUMP() \
2417do { \
2418 if (fixup_alt_jump) \
2419 STORE_JUMP (jump, fixup_alt_jump, b); \
2420} while (0)
2421
2422
fa9a63c5
RM
2423/* Return, freeing storage we allocated. */
2424#define FREE_STACK_RETURN(value) \
b18215fc
RS
2425 do { \
2426 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2427 free (compile_stack.stack); \
2428 return value; \
2429 } while (0)
fa9a63c5
RM
2430
2431static reg_errcode_t
2432regex_compile (pattern, size, syntax, bufp)
66f0296e 2433 re_char *pattern;
4bb91c68 2434 size_t size;
fa9a63c5
RM
2435 reg_syntax_t syntax;
2436 struct re_pattern_buffer *bufp;
2437{
01618498
SM
2438 /* We fetch characters from PATTERN here. */
2439 register re_wchar_t c, c1;
5e69f11e 2440
fa9a63c5 2441 /* A random temporary spot in PATTERN. */
66f0296e 2442 re_char *p1;
fa9a63c5
RM
2443
2444 /* Points to the end of the buffer, where we should append. */
2445 register unsigned char *b;
5e69f11e 2446
fa9a63c5
RM
2447 /* Keeps track of unclosed groups. */
2448 compile_stack_type compile_stack;
2449
2450 /* Points to the current (ending) position in the pattern. */
22336245
RS
2451#ifdef AIX
2452 /* `const' makes AIX compiler fail. */
66f0296e 2453 unsigned char *p = pattern;
22336245 2454#else
66f0296e 2455 re_char *p = pattern;
22336245 2456#endif
66f0296e 2457 re_char *pend = pattern + size;
5e69f11e 2458
fa9a63c5 2459 /* How to translate the characters in the pattern. */
6676cb1c 2460 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2461
2462 /* Address of the count-byte of the most recently inserted `exactn'
2463 command. This makes it possible to tell if a new exact-match
2464 character can be added to that command or if the character requires
2465 a new `exactn' command. */
2466 unsigned char *pending_exact = 0;
2467
2468 /* Address of start of the most recently finished expression.
2469 This tells, e.g., postfix * where to find the start of its
2470 operand. Reset at the beginning of groups and alternatives. */
2471 unsigned char *laststart = 0;
2472
2473 /* Address of beginning of regexp, or inside of last group. */
2474 unsigned char *begalt;
2475
2476 /* Place in the uncompiled pattern (i.e., the {) to
2477 which to go back if the interval is invalid. */
66f0296e 2478 re_char *beg_interval;
5e69f11e 2479
fa9a63c5 2480 /* Address of the place where a forward jump should go to the end of
25fe55af 2481 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2482 last -- ends with a forward jump of this sort. */
2483 unsigned char *fixup_alt_jump = 0;
2484
2485 /* Counts open-groups as they are encountered. Remembered for the
2486 matching close-group on the compile stack, so the same register
2487 number is put in the stop_memory as the start_memory. */
2488 regnum_t regnum = 0;
2489
b18215fc
RS
2490 /* Work area for range table of charset. */
2491 struct range_table_work_area range_table_work;
2492
2d1675e4
SM
2493 /* If the object matched can contain multibyte characters. */
2494 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2495
f9b0fd99
RS
2496 /* Nonzero if we have pushed down into a subpattern. */
2497 int in_subpattern = 0;
2498
2499 /* These hold the values of p, pattern, and pend from the main
2500 pattern when we have pushed into a subpattern. */
2501 re_char *main_p;
2502 re_char *main_pattern;
2503 re_char *main_pend;
2504
fa9a63c5 2505#ifdef DEBUG
99633e97 2506 debug++;
fa9a63c5 2507 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2508 if (debug > 0)
fa9a63c5
RM
2509 {
2510 unsigned debug_count;
5e69f11e 2511
fa9a63c5 2512 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2513 putchar (pattern[debug_count]);
fa9a63c5
RM
2514 putchar ('\n');
2515 }
2516#endif /* DEBUG */
2517
2518 /* Initialize the compile stack. */
2519 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2520 if (compile_stack.stack == NULL)
2521 return REG_ESPACE;
2522
2523 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2524 compile_stack.avail = 0;
2525
b18215fc
RS
2526 range_table_work.table = 0;
2527 range_table_work.allocated = 0;
2528
fa9a63c5
RM
2529 /* Initialize the pattern buffer. */
2530 bufp->syntax = syntax;
2531 bufp->fastmap_accurate = 0;
2532 bufp->not_bol = bufp->not_eol = 0;
6224b623 2533 bufp->used_syntax = 0;
fa9a63c5
RM
2534
2535 /* Set `used' to zero, so that if we return an error, the pattern
2536 printer (for debugging) will think there's no pattern. We reset it
2537 at the end. */
2538 bufp->used = 0;
5e69f11e 2539
fa9a63c5 2540 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2541 bufp->re_nsub = 0;
fa9a63c5 2542
0b32bf0e 2543#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2544 /* Initialize the syntax table. */
2545 init_syntax_once ();
2546#endif
2547
2548 if (bufp->allocated == 0)
2549 {
2550 if (bufp->buffer)
2551 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2552 enough space. This loses if buffer's address is bogus, but
2553 that is the user's responsibility. */
2554 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2555 }
fa9a63c5 2556 else
25fe55af
RS
2557 { /* Caller did not allocate a buffer. Do it for them. */
2558 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2559 }
fa9a63c5
RM
2560 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2561
2562 bufp->allocated = INIT_BUF_SIZE;
2563 }
2564
2565 begalt = b = bufp->buffer;
2566
2567 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2568 while (1)
fa9a63c5 2569 {
f9b0fd99
RS
2570 if (p == pend)
2571 {
2572 /* If this is the end of an included regexp,
2573 pop back to the main regexp and try again. */
2574 if (in_subpattern)
2575 {
2576 in_subpattern = 0;
2577 pattern = main_pattern;
2578 p = main_p;
2579 pend = main_pend;
2580 continue;
2581 }
2582 /* If this is the end of the main regexp, we are done. */
2583 break;
2584 }
2585
fa9a63c5
RM
2586 PATFETCH (c);
2587
2588 switch (c)
25fe55af 2589 {
f9b0fd99
RS
2590 case ' ':
2591 {
2592 re_char *p1 = p;
2593
2594 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2595 spaces normally. And don't try to do this recursively. */
2596 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2597 goto normal_char;
2598
2599 /* Peek past following spaces. */
2600 while (p1 != pend)
2601 {
2602 if (*p1 != ' ')
2603 break;
2604 p1++;
2605 }
2606 /* If the spaces are followed by a repetition op,
2607 treat them normally. */
c721eee5
RS
2608 if (p1 != pend
2609 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2610 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2611 goto normal_char;
2612
2613 /* Replace the spaces with the whitespace regexp. */
2614 in_subpattern = 1;
2615 main_p = p1;
2616 main_pend = pend;
2617 main_pattern = pattern;
2618 p = pattern = whitespace_regexp;
2619 pend = p + strlen (p);
2620 break;
2621 }
2622
25fe55af
RS
2623 case '^':
2624 {
2625 if ( /* If at start of pattern, it's an operator. */
2626 p == pattern + 1
2627 /* If context independent, it's an operator. */
2628 || syntax & RE_CONTEXT_INDEP_ANCHORS
2629 /* Otherwise, depends on what's come before. */
2630 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2631 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2632 else
2633 goto normal_char;
2634 }
2635 break;
2636
2637
2638 case '$':
2639 {
2640 if ( /* If at end of pattern, it's an operator. */
2641 p == pend
2642 /* If context independent, it's an operator. */
2643 || syntax & RE_CONTEXT_INDEP_ANCHORS
2644 /* Otherwise, depends on what's next. */
2645 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2646 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2647 else
2648 goto normal_char;
2649 }
2650 break;
fa9a63c5
RM
2651
2652
2653 case '+':
25fe55af
RS
2654 case '?':
2655 if ((syntax & RE_BK_PLUS_QM)
2656 || (syntax & RE_LIMITED_OPS))
2657 goto normal_char;
2658 handle_plus:
2659 case '*':
2660 /* If there is no previous pattern... */
2661 if (!laststart)
2662 {
2663 if (syntax & RE_CONTEXT_INVALID_OPS)
2664 FREE_STACK_RETURN (REG_BADRPT);
2665 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2666 goto normal_char;
2667 }
2668
2669 {
25fe55af 2670 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2671 boolean zero_times_ok = 0, many_times_ok = 0;
2672 boolean greedy = 1;
25fe55af
RS
2673
2674 /* If there is a sequence of repetition chars, collapse it
2675 down to just one (the right one). We can't combine
2676 interval operators with these because of, e.g., `a{2}*',
2677 which should only match an even number of `a's. */
2678
2679 for (;;)
2680 {
0b32bf0e 2681 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2682 && c == '?' && (zero_times_ok || many_times_ok))
2683 greedy = 0;
2684 else
2685 {
2686 zero_times_ok |= c != '+';
2687 many_times_ok |= c != '?';
2688 }
25fe55af
RS
2689
2690 if (p == pend)
2691 break;
ed0767d8
SM
2692 else if (*p == '*'
2693 || (!(syntax & RE_BK_PLUS_QM)
2694 && (*p == '+' || *p == '?')))
25fe55af 2695 ;
ed0767d8 2696 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2697 {
ed0767d8
SM
2698 if (p+1 == pend)
2699 FREE_STACK_RETURN (REG_EESCAPE);
2700 if (p[1] == '+' || p[1] == '?')
2701 PATFETCH (c); /* Gobble up the backslash. */
2702 else
2703 break;
25fe55af
RS
2704 }
2705 else
ed0767d8 2706 break;
25fe55af 2707 /* If we get here, we found another repeat character. */
ed0767d8
SM
2708 PATFETCH (c);
2709 }
25fe55af
RS
2710
2711 /* Star, etc. applied to an empty pattern is equivalent
2712 to an empty pattern. */
4e8a9132 2713 if (!laststart || laststart == b)
25fe55af
RS
2714 break;
2715
2716 /* Now we know whether or not zero matches is allowed
2717 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2718 if (greedy)
2719 {
99633e97 2720 if (many_times_ok)
4e8a9132
SM
2721 {
2722 boolean simple = skip_one_char (laststart) == b;
2723 unsigned int startoffset = 0;
f6a3f532 2724 re_opcode_t ofj =
01618498 2725 /* Check if the loop can match the empty string. */
6df42991
SM
2726 (simple || !analyse_first (laststart, b, NULL, 0))
2727 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2728 assert (skip_one_char (laststart) <= b);
177c0ea7 2729
4e8a9132
SM
2730 if (!zero_times_ok && simple)
2731 { /* Since simple * loops can be made faster by using
2732 on_failure_keep_string_jump, we turn simple P+
2733 into PP* if P is simple. */
2734 unsigned char *p1, *p2;
2735 startoffset = b - laststart;
2736 GET_BUFFER_SPACE (startoffset);
2737 p1 = b; p2 = laststart;
2738 while (p2 < p1)
2739 *b++ = *p2++;
2740 zero_times_ok = 1;
99633e97 2741 }
4e8a9132
SM
2742
2743 GET_BUFFER_SPACE (6);
2744 if (!zero_times_ok)
2745 /* A + loop. */
f6a3f532 2746 STORE_JUMP (ofj, b, b + 6);
99633e97 2747 else
4e8a9132
SM
2748 /* Simple * loops can use on_failure_keep_string_jump
2749 depending on what follows. But since we don't know
2750 that yet, we leave the decision up to
2751 on_failure_jump_smart. */
f6a3f532 2752 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2753 laststart + startoffset, b + 6);
99633e97 2754 b += 3;
4e8a9132 2755 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2756 b += 3;
2757 }
2758 else
2759 {
4e8a9132
SM
2760 /* A simple ? pattern. */
2761 assert (zero_times_ok);
2762 GET_BUFFER_SPACE (3);
2763 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2764 b += 3;
2765 }
1c8c6d39
DL
2766 }
2767 else /* not greedy */
2768 { /* I wish the greedy and non-greedy cases could be merged. */
2769
0683b6fa 2770 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2771 if (many_times_ok)
2772 {
f6a3f532
SM
2773 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2774
6df42991
SM
2775 /* The non-greedy multiple match looks like
2776 a repeat..until: we only need a conditional jump
2777 at the end of the loop. */
f6a3f532
SM
2778 if (emptyp) BUF_PUSH (no_op);
2779 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2780 : on_failure_jump, b, laststart);
1c8c6d39
DL
2781 b += 3;
2782 if (zero_times_ok)
2783 {
2784 /* The repeat...until naturally matches one or more.
2785 To also match zero times, we need to first jump to
6df42991 2786 the end of the loop (its conditional jump). */
1c8c6d39
DL
2787 INSERT_JUMP (jump, laststart, b);
2788 b += 3;
2789 }
2790 }
2791 else
2792 {
2793 /* non-greedy a?? */
1c8c6d39
DL
2794 INSERT_JUMP (jump, laststart, b + 3);
2795 b += 3;
2796 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2797 b += 3;
2798 }
2799 }
2800 }
4e8a9132 2801 pending_exact = 0;
fa9a63c5
RM
2802 break;
2803
2804
2805 case '.':
25fe55af
RS
2806 laststart = b;
2807 BUF_PUSH (anychar);
2808 break;
fa9a63c5
RM
2809
2810
25fe55af
RS
2811 case '[':
2812 {
b18215fc 2813 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2814
25fe55af 2815 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2816
25fe55af
RS
2817 /* Ensure that we have enough space to push a charset: the
2818 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2819 GET_BUFFER_SPACE (34);
2820
25fe55af 2821 laststart = b;
e318085a 2822
25fe55af
RS
2823 /* We test `*p == '^' twice, instead of using an if
2824 statement, so we only need one BUF_PUSH. */
2825 BUF_PUSH (*p == '^' ? charset_not : charset);
2826 if (*p == '^')
2827 p++;
e318085a 2828
25fe55af
RS
2829 /* Remember the first position in the bracket expression. */
2830 p1 = p;
e318085a 2831
25fe55af
RS
2832 /* Push the number of bytes in the bitmap. */
2833 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2834
25fe55af
RS
2835 /* Clear the whole map. */
2836 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2837
25fe55af
RS
2838 /* charset_not matches newline according to a syntax bit. */
2839 if ((re_opcode_t) b[-2] == charset_not
2840 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2841 SET_LIST_BIT ('\n');
fa9a63c5 2842
25fe55af
RS
2843 /* Read in characters and ranges, setting map bits. */
2844 for (;;)
2845 {
b18215fc 2846 boolean escaped_char = false;
2d1675e4 2847 const unsigned char *p2 = p;
e318085a 2848
25fe55af 2849 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2850
36595814
SM
2851 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2852 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2853 So the translation is done later in a loop. Example:
2854 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2855 PATFETCH (c);
e318085a 2856
25fe55af
RS
2857 /* \ might escape characters inside [...] and [^...]. */
2858 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2859 {
2860 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2861
2862 PATFETCH (c);
b18215fc 2863 escaped_char = true;
25fe55af 2864 }
b18215fc
RS
2865 else
2866 {
657fcfbd
RS
2867 /* Could be the end of the bracket expression. If it's
2868 not (i.e., when the bracket expression is `[]' so
2869 far), the ']' character bit gets set way below. */
2d1675e4 2870 if (c == ']' && p2 != p1)
657fcfbd 2871 break;
25fe55af 2872 }
b18215fc 2873
b18215fc
RS
2874 /* What should we do for the character which is
2875 greater than 0x7F, but not BASE_LEADING_CODE_P?
2876 XXX */
2877
25fe55af
RS
2878 /* See if we're at the beginning of a possible character
2879 class. */
b18215fc 2880
2d1675e4
SM
2881 if (!escaped_char &&
2882 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2883 {
2884 /* Leave room for the null. */
14473664 2885 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2886 const unsigned char *class_beg;
b18215fc 2887
25fe55af
RS
2888 PATFETCH (c);
2889 c1 = 0;
ed0767d8 2890 class_beg = p;
b18215fc 2891
25fe55af
RS
2892 /* If pattern is `[[:'. */
2893 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2894
25fe55af
RS
2895 for (;;)
2896 {
14473664
SM
2897 PATFETCH (c);
2898 if ((c == ':' && *p == ']') || p == pend)
2899 break;
2900 if (c1 < CHAR_CLASS_MAX_LENGTH)
2901 str[c1++] = c;
2902 else
2903 /* This is in any case an invalid class name. */
2904 str[0] = '\0';
25fe55af
RS
2905 }
2906 str[c1] = '\0';
b18215fc
RS
2907
2908 /* If isn't a word bracketed by `[:' and `:]':
2909 undo the ending character, the letters, and
2910 leave the leading `:' and `[' (but set bits for
2911 them). */
25fe55af
RS
2912 if (c == ':' && *p == ']')
2913 {
36595814 2914 re_wchar_t ch;
14473664
SM
2915 re_wctype_t cc;
2916
2917 cc = re_wctype (str);
2918
2919 if (cc == 0)
fa9a63c5
RM
2920 FREE_STACK_RETURN (REG_ECTYPE);
2921
14473664
SM
2922 /* Throw away the ] at the end of the character
2923 class. */
2924 PATFETCH (c);
fa9a63c5 2925
14473664 2926 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2927
96cc36cc
RS
2928 /* Most character classes in a multibyte match
2929 just set a flag. Exceptions are is_blank,
2930 is_digit, is_cntrl, and is_xdigit, since
2931 they can only match ASCII characters. We
14473664
SM
2932 don't need to handle them for multibyte.
2933 They are distinguished by a negative wctype. */
96cc36cc 2934
2d1675e4 2935 if (multibyte)
14473664
SM
2936 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2937 re_wctype_to_bit (cc));
96cc36cc 2938
14473664 2939 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
25fe55af 2940 {
7ae68633 2941 int translated = TRANSLATE (ch);
14473664 2942 if (re_iswctype (btowc (ch), cc))
96cc36cc 2943 SET_LIST_BIT (translated);
25fe55af 2944 }
b18215fc 2945
6224b623
SM
2946 /* In most cases the matching rule for char classes
2947 only uses the syntax table for multibyte chars,
2948 so that the content of the syntax-table it is not
2949 hardcoded in the range_table. SPACE and WORD are
2950 the two exceptions. */
2951 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2952 bufp->used_syntax = 1;
2953
b18215fc
RS
2954 /* Repeat the loop. */
2955 continue;
25fe55af
RS
2956 }
2957 else
2958 {
ed0767d8
SM
2959 /* Go back to right after the "[:". */
2960 p = class_beg;
25fe55af 2961 SET_LIST_BIT ('[');
b18215fc
RS
2962
2963 /* Because the `:' may starts the range, we
2964 can't simply set bit and repeat the loop.
25fe55af 2965 Instead, just set it to C and handle below. */
b18215fc 2966 c = ':';
25fe55af
RS
2967 }
2968 }
b18215fc
RS
2969
2970 if (p < pend && p[0] == '-' && p[1] != ']')
2971 {
2972
2973 /* Discard the `-'. */
2974 PATFETCH (c1);
2975
2976 /* Fetch the character which ends the range. */
2977 PATFETCH (c1);
b18215fc 2978
b54f61ed 2979 if (SINGLE_BYTE_CHAR_P (c))
e934739e 2980 {
b54f61ed
KH
2981 if (! SINGLE_BYTE_CHAR_P (c1))
2982 {
3ff2446d
KH
2983 /* Handle a range starting with a
2984 character of less than 256, and ending
2985 with a character of not less than 256.
2986 Split that into two ranges, the low one
2987 ending at 0377, and the high one
2988 starting at the smallest character in
2989 the charset of C1 and ending at C1. */
b54f61ed 2990 int charset = CHAR_CHARSET (c1);
36595814
SM
2991 re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
2992
b54f61ed
KH
2993 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2994 c2, c1);
333526e0 2995 c1 = 0377;
b54f61ed 2996 }
e934739e
RS
2997 }
2998 else if (!SAME_CHARSET_P (c, c1))
b3e4c897 2999 FREE_STACK_RETURN (REG_ERANGEX);
e318085a 3000 }
25fe55af 3001 else
b18215fc
RS
3002 /* Range from C to C. */
3003 c1 = c;
3004
3005 /* Set the range ... */
3006 if (SINGLE_BYTE_CHAR_P (c))
3007 /* ... into bitmap. */
25fe55af 3008 {
01618498 3009 re_wchar_t this_char;
36595814 3010 re_wchar_t range_start = c, range_end = c1;
b18215fc
RS
3011
3012 /* If the start is after the end, the range is empty. */
3013 if (range_start > range_end)
3014 {
3015 if (syntax & RE_NO_EMPTY_RANGES)
3016 FREE_STACK_RETURN (REG_ERANGE);
3017 /* Else, repeat the loop. */
3018 }
3019 else
3020 {
3021 for (this_char = range_start; this_char <= range_end;
3022 this_char++)
3023 SET_LIST_BIT (TRANSLATE (this_char));
e934739e 3024 }
25fe55af 3025 }
e318085a 3026 else
b18215fc
RS
3027 /* ... into range table. */
3028 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
3029 }
3030
25fe55af
RS
3031 /* Discard any (non)matching list bytes that are all 0 at the
3032 end of the map. Decrease the map-length byte too. */
3033 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3034 b[-1]--;
3035 b += b[-1];
fa9a63c5 3036
96cc36cc
RS
3037 /* Build real range table from work area. */
3038 if (RANGE_TABLE_WORK_USED (range_table_work)
3039 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3040 {
3041 int i;
3042 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3043
b18215fc 3044 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3045 bytes for flags, two for COUNT, and three bytes for
3046 each character. */
3047 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3048
b18215fc
RS
3049 /* Indicate the existence of range table. */
3050 laststart[1] |= 0x80;
fa9a63c5 3051
96cc36cc
RS
3052 /* Store the character class flag bits into the range table.
3053 If not in emacs, these flag bits are always 0. */
3054 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3055 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3056
b18215fc
RS
3057 STORE_NUMBER_AND_INCR (b, used / 2);
3058 for (i = 0; i < used; i++)
3059 STORE_CHARACTER_AND_INCR
3060 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3061 }
25fe55af
RS
3062 }
3063 break;
fa9a63c5
RM
3064
3065
b18215fc 3066 case '(':
25fe55af
RS
3067 if (syntax & RE_NO_BK_PARENS)
3068 goto handle_open;
3069 else
3070 goto normal_char;
fa9a63c5
RM
3071
3072
25fe55af
RS
3073 case ')':
3074 if (syntax & RE_NO_BK_PARENS)
3075 goto handle_close;
3076 else
3077 goto normal_char;
e318085a
RS
3078
3079
25fe55af
RS
3080 case '\n':
3081 if (syntax & RE_NEWLINE_ALT)
3082 goto handle_alt;
3083 else
3084 goto normal_char;
e318085a
RS
3085
3086
b18215fc 3087 case '|':
25fe55af
RS
3088 if (syntax & RE_NO_BK_VBAR)
3089 goto handle_alt;
3090 else
3091 goto normal_char;
3092
3093
3094 case '{':
3095 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3096 goto handle_interval;
3097 else
3098 goto normal_char;
3099
3100
3101 case '\\':
3102 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3103
3104 /* Do not translate the character after the \, so that we can
3105 distinguish, e.g., \B from \b, even if we normally would
3106 translate, e.g., B to b. */
36595814 3107 PATFETCH (c);
25fe55af
RS
3108
3109 switch (c)
3110 {
3111 case '(':
3112 if (syntax & RE_NO_BK_PARENS)
3113 goto normal_backslash;
3114
3115 handle_open:
505bde11
SM
3116 {
3117 int shy = 0;
3118 if (p+1 < pend)
3119 {
3120 /* Look for a special (?...) construct */
ed0767d8 3121 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3122 {
ed0767d8 3123 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
3124 PATFETCH (c);
3125 switch (c)
3126 {
3127 case ':': shy = 1; break;
3128 default:
3129 /* Only (?:...) is supported right now. */
3130 FREE_STACK_RETURN (REG_BADPAT);
3131 }
3132 }
505bde11
SM
3133 }
3134
3135 if (!shy)
3136 {
3137 bufp->re_nsub++;
3138 regnum++;
3139 }
25fe55af 3140
99633e97
SM
3141 if (COMPILE_STACK_FULL)
3142 {
3143 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3144 compile_stack_elt_t);
3145 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3146
99633e97
SM
3147 compile_stack.size <<= 1;
3148 }
25fe55af 3149
99633e97
SM
3150 /* These are the values to restore when we hit end of this
3151 group. They are all relative offsets, so that if the
3152 whole pattern moves because of realloc, they will still
3153 be valid. */
3154 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3155 COMPILE_STACK_TOP.fixup_alt_jump
3156 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3157 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3158 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
3159
3160 /* Do not push a
3161 start_memory for groups beyond the last one we can
3162 represent in the compiled pattern. */
3163 if (regnum <= MAX_REGNUM && !shy)
3164 BUF_PUSH_2 (start_memory, regnum);
3165
3166 compile_stack.avail++;
3167
3168 fixup_alt_jump = 0;
3169 laststart = 0;
3170 begalt = b;
3171 /* If we've reached MAX_REGNUM groups, then this open
3172 won't actually generate any code, so we'll have to
3173 clear pending_exact explicitly. */
3174 pending_exact = 0;
3175 break;
505bde11 3176 }
25fe55af
RS
3177
3178 case ')':
3179 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3180
3181 if (COMPILE_STACK_EMPTY)
505bde11
SM
3182 {
3183 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3184 goto normal_backslash;
3185 else
3186 FREE_STACK_RETURN (REG_ERPAREN);
3187 }
25fe55af
RS
3188
3189 handle_close:
505bde11 3190 FIXUP_ALT_JUMP ();
25fe55af
RS
3191
3192 /* See similar code for backslashed left paren above. */
3193 if (COMPILE_STACK_EMPTY)
505bde11
SM
3194 {
3195 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3196 goto normal_char;
3197 else
3198 FREE_STACK_RETURN (REG_ERPAREN);
3199 }
25fe55af
RS
3200
3201 /* Since we just checked for an empty stack above, this
3202 ``can't happen''. */
3203 assert (compile_stack.avail != 0);
3204 {
3205 /* We don't just want to restore into `regnum', because
3206 later groups should continue to be numbered higher,
3207 as in `(ab)c(de)' -- the second group is #2. */
3208 regnum_t this_group_regnum;
3209
3210 compile_stack.avail--;
3211 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3212 fixup_alt_jump
3213 = COMPILE_STACK_TOP.fixup_alt_jump
3214 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3215 : 0;
3216 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3217 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3218 /* If we've reached MAX_REGNUM groups, then this open
3219 won't actually generate any code, so we'll have to
3220 clear pending_exact explicitly. */
3221 pending_exact = 0;
e318085a 3222
25fe55af
RS
3223 /* We're at the end of the group, so now we know how many
3224 groups were inside this one. */
505bde11
SM
3225 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
3226 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
3227 }
3228 break;
3229
3230
3231 case '|': /* `\|'. */
3232 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3233 goto normal_backslash;
3234 handle_alt:
3235 if (syntax & RE_LIMITED_OPS)
3236 goto normal_char;
3237
3238 /* Insert before the previous alternative a jump which
3239 jumps to this alternative if the former fails. */
3240 GET_BUFFER_SPACE (3);
3241 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3242 pending_exact = 0;
3243 b += 3;
3244
3245 /* The alternative before this one has a jump after it
3246 which gets executed if it gets matched. Adjust that
3247 jump so it will jump to this alternative's analogous
3248 jump (put in below, which in turn will jump to the next
3249 (if any) alternative's such jump, etc.). The last such
3250 jump jumps to the correct final destination. A picture:
3251 _____ _____
3252 | | | |
3253 | v | v
3254 a | b | c
3255
3256 If we are at `b', then fixup_alt_jump right now points to a
3257 three-byte space after `a'. We'll put in the jump, set
3258 fixup_alt_jump to right after `b', and leave behind three
3259 bytes which we'll fill in when we get to after `c'. */
3260
505bde11 3261 FIXUP_ALT_JUMP ();
25fe55af
RS
3262
3263 /* Mark and leave space for a jump after this alternative,
3264 to be filled in later either by next alternative or
3265 when know we're at the end of a series of alternatives. */
3266 fixup_alt_jump = b;
3267 GET_BUFFER_SPACE (3);
3268 b += 3;
3269
3270 laststart = 0;
3271 begalt = b;
3272 break;
3273
3274
3275 case '{':
3276 /* If \{ is a literal. */
3277 if (!(syntax & RE_INTERVALS)
3278 /* If we're at `\{' and it's not the open-interval
3279 operator. */
4bb91c68 3280 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3281 goto normal_backslash;
3282
3283 handle_interval:
3284 {
3285 /* If got here, then the syntax allows intervals. */
3286
3287 /* At least (most) this many matches must be made. */
99633e97 3288 int lower_bound = 0, upper_bound = -1;
25fe55af 3289
ed0767d8 3290 beg_interval = p;
25fe55af 3291
25fe55af
RS
3292 GET_UNSIGNED_NUMBER (lower_bound);
3293
3294 if (c == ',')
ed0767d8 3295 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3296 else
3297 /* Interval such as `{1}' => match exactly once. */
3298 upper_bound = lower_bound;
3299
3300 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3301 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3302 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3303
3304 if (!(syntax & RE_NO_BK_BRACES))
3305 {
4bb91c68
SM
3306 if (c != '\\')
3307 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3308 if (p == pend)
3309 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3310 PATFETCH (c);
3311 }
3312
3313 if (c != '}')
4bb91c68 3314 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3315
3316 /* We just parsed a valid interval. */
3317
3318 /* If it's invalid to have no preceding re. */
3319 if (!laststart)
3320 {
3321 if (syntax & RE_CONTEXT_INVALID_OPS)
3322 FREE_STACK_RETURN (REG_BADRPT);
3323 else if (syntax & RE_CONTEXT_INDEP_OPS)
3324 laststart = b;
3325 else
3326 goto unfetch_interval;
3327 }
3328
6df42991
SM
3329 if (upper_bound == 0)
3330 /* If the upper bound is zero, just drop the sub pattern
3331 altogether. */
3332 b = laststart;
3333 else if (lower_bound == 1 && upper_bound == 1)
3334 /* Just match it once: nothing to do here. */
3335 ;
3336
3337 /* Otherwise, we have a nontrivial interval. When
3338 we're all done, the pattern will look like:
3339 set_number_at <jump count> <upper bound>
3340 set_number_at <succeed_n count> <lower bound>
3341 succeed_n <after jump addr> <succeed_n count>
3342 <body of loop>
3343 jump_n <succeed_n addr> <jump count>
3344 (The upper bound and `jump_n' are omitted if
3345 `upper_bound' is 1, though.) */
3346 else
3347 { /* If the upper bound is > 1, we need to insert
3348 more at the end of the loop. */
3349 unsigned int nbytes = (upper_bound < 0 ? 3
3350 : upper_bound > 1 ? 5 : 0);
3351 unsigned int startoffset = 0;
3352
3353 GET_BUFFER_SPACE (20); /* We might use less. */
3354
3355 if (lower_bound == 0)
3356 {
3357 /* A succeed_n that starts with 0 is really a
3358 a simple on_failure_jump_loop. */
3359 INSERT_JUMP (on_failure_jump_loop, laststart,
3360 b + 3 + nbytes);
3361 b += 3;
3362 }
3363 else
3364 {
3365 /* Initialize lower bound of the `succeed_n', even
3366 though it will be set during matching by its
3367 attendant `set_number_at' (inserted next),
3368 because `re_compile_fastmap' needs to know.
3369 Jump to the `jump_n' we might insert below. */
3370 INSERT_JUMP2 (succeed_n, laststart,
3371 b + 5 + nbytes,
3372 lower_bound);
3373 b += 5;
3374
3375 /* Code to initialize the lower bound. Insert
3376 before the `succeed_n'. The `5' is the last two
3377 bytes of this `set_number_at', plus 3 bytes of
3378 the following `succeed_n'. */
3379 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3380 b += 5;
3381 startoffset += 5;
3382 }
3383
3384 if (upper_bound < 0)
3385 {
3386 /* A negative upper bound stands for infinity,
3387 in which case it degenerates to a plain jump. */
3388 STORE_JUMP (jump, b, laststart + startoffset);
3389 b += 3;
3390 }
3391 else if (upper_bound > 1)
3392 { /* More than one repetition is allowed, so
3393 append a backward jump to the `succeed_n'
3394 that starts this interval.
3395
3396 When we've reached this during matching,
3397 we'll have matched the interval once, so
3398 jump back only `upper_bound - 1' times. */
3399 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3400 upper_bound - 1);
3401 b += 5;
3402
3403 /* The location we want to set is the second
3404 parameter of the `jump_n'; that is `b-2' as
3405 an absolute address. `laststart' will be
3406 the `set_number_at' we're about to insert;
3407 `laststart+3' the number to set, the source
3408 for the relative address. But we are
3409 inserting into the middle of the pattern --
3410 so everything is getting moved up by 5.
3411 Conclusion: (b - 2) - (laststart + 3) + 5,
3412 i.e., b - laststart.
3413
3414 We insert this at the beginning of the loop
3415 so that if we fail during matching, we'll
3416 reinitialize the bounds. */
3417 insert_op2 (set_number_at, laststart, b - laststart,
3418 upper_bound - 1, b);
3419 b += 5;
3420 }
3421 }
25fe55af
RS
3422 pending_exact = 0;
3423 beg_interval = NULL;
3424 }
3425 break;
3426
3427 unfetch_interval:
3428 /* If an invalid interval, match the characters as literals. */
3429 assert (beg_interval);
3430 p = beg_interval;
3431 beg_interval = NULL;
3432
3433 /* normal_char and normal_backslash need `c'. */
ed0767d8 3434 c = '{';
25fe55af
RS
3435
3436 if (!(syntax & RE_NO_BK_BRACES))
3437 {
ed0767d8
SM
3438 assert (p > pattern && p[-1] == '\\');
3439 goto normal_backslash;
25fe55af 3440 }
ed0767d8
SM
3441 else
3442 goto normal_char;
e318085a 3443
b18215fc 3444#ifdef emacs
25fe55af
RS
3445 /* There is no way to specify the before_dot and after_dot
3446 operators. rms says this is ok. --karl */
3447 case '=':
3448 BUF_PUSH (at_dot);
3449 break;
3450
3451 case 's':
3452 laststart = b;
3453 PATFETCH (c);
3454 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3455 break;
3456
3457 case 'S':
3458 laststart = b;
3459 PATFETCH (c);
3460 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3461 break;
b18215fc
RS
3462
3463 case 'c':
3464 laststart = b;
36595814 3465 PATFETCH (c);
b18215fc
RS
3466 BUF_PUSH_2 (categoryspec, c);
3467 break;
e318085a 3468
b18215fc
RS
3469 case 'C':
3470 laststart = b;
36595814 3471 PATFETCH (c);
b18215fc
RS
3472 BUF_PUSH_2 (notcategoryspec, c);
3473 break;
3474#endif /* emacs */
e318085a 3475
e318085a 3476
25fe55af 3477 case 'w':
4bb91c68
SM
3478 if (syntax & RE_NO_GNU_OPS)
3479 goto normal_char;
25fe55af 3480 laststart = b;
1fb352e0 3481 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3482 break;
e318085a 3483
e318085a 3484
25fe55af 3485 case 'W':
4bb91c68
SM
3486 if (syntax & RE_NO_GNU_OPS)
3487 goto normal_char;
25fe55af 3488 laststart = b;
1fb352e0 3489 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3490 break;
e318085a
RS
3491
3492
25fe55af 3493 case '<':
4bb91c68
SM
3494 if (syntax & RE_NO_GNU_OPS)
3495 goto normal_char;
25fe55af
RS
3496 BUF_PUSH (wordbeg);
3497 break;
e318085a 3498
25fe55af 3499 case '>':
4bb91c68
SM
3500 if (syntax & RE_NO_GNU_OPS)
3501 goto normal_char;
25fe55af
RS
3502 BUF_PUSH (wordend);
3503 break;
e318085a 3504
669fa600
SM
3505 case '_':
3506 if (syntax & RE_NO_GNU_OPS)
3507 goto normal_char;
3508 laststart = b;
3509 PATFETCH (c);
3510 if (c == '<')
3511 BUF_PUSH (symbeg);
3512 else if (c == '>')
3513 BUF_PUSH (symend);
3514 else
3515 FREE_STACK_RETURN (REG_BADPAT);
3516 break;
3517
25fe55af 3518 case 'b':
4bb91c68
SM
3519 if (syntax & RE_NO_GNU_OPS)
3520 goto normal_char;
25fe55af
RS
3521 BUF_PUSH (wordbound);
3522 break;
e318085a 3523
25fe55af 3524 case 'B':
4bb91c68
SM
3525 if (syntax & RE_NO_GNU_OPS)
3526 goto normal_char;
25fe55af
RS
3527 BUF_PUSH (notwordbound);
3528 break;
fa9a63c5 3529
25fe55af 3530 case '`':
4bb91c68
SM
3531 if (syntax & RE_NO_GNU_OPS)
3532 goto normal_char;
25fe55af
RS
3533 BUF_PUSH (begbuf);
3534 break;
e318085a 3535
25fe55af 3536 case '\'':
4bb91c68
SM
3537 if (syntax & RE_NO_GNU_OPS)
3538 goto normal_char;
25fe55af
RS
3539 BUF_PUSH (endbuf);
3540 break;
e318085a 3541
25fe55af
RS
3542 case '1': case '2': case '3': case '4': case '5':
3543 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3544 {
3545 regnum_t reg;
e318085a 3546
0cdd06f8
SM
3547 if (syntax & RE_NO_BK_REFS)
3548 goto normal_backslash;
e318085a 3549
0cdd06f8 3550 reg = c - '0';
e318085a 3551
0cdd06f8
SM
3552 /* Can't back reference to a subexpression before its end. */
3553 if (reg > regnum || group_in_compile_stack (compile_stack, reg))
3554 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3555
0cdd06f8
SM
3556 laststart = b;
3557 BUF_PUSH_2 (duplicate, reg);
3558 }
25fe55af 3559 break;
e318085a 3560
e318085a 3561
25fe55af
RS
3562 case '+':
3563 case '?':
3564 if (syntax & RE_BK_PLUS_QM)
3565 goto handle_plus;
3566 else
3567 goto normal_backslash;
3568
3569 default:
3570 normal_backslash:
3571 /* You might think it would be useful for \ to mean
3572 not to translate; but if we don't translate it
4bb91c68 3573 it will never match anything. */
25fe55af
RS
3574 goto normal_char;
3575 }
3576 break;
fa9a63c5
RM
3577
3578
3579 default:
25fe55af 3580 /* Expects the character in `c'. */
fa9a63c5 3581 normal_char:
36595814 3582 /* If no exactn currently being built. */
25fe55af 3583 if (!pending_exact
fa9a63c5 3584
25fe55af
RS
3585 /* If last exactn not at current position. */
3586 || pending_exact + *pending_exact + 1 != b
5e69f11e 3587
25fe55af 3588 /* We have only one byte following the exactn for the count. */
2d1675e4 3589 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3590
25fe55af 3591 /* If followed by a repetition operator. */
9d99031f 3592 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3593 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3594 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3595 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3596 || ((syntax & RE_INTERVALS)
25fe55af 3597 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3598 ? p != pend && *p == '{'
3599 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3600 {
3601 /* Start building a new exactn. */
5e69f11e 3602
25fe55af 3603 laststart = b;
fa9a63c5
RM
3604
3605 BUF_PUSH_2 (exactn, 0);
3606 pending_exact = b - 1;
25fe55af 3607 }
5e69f11e 3608
2d1675e4
SM
3609 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3610 {
e0277a47
KH
3611 int len;
3612
36595814 3613 c = TRANSLATE (c);
e0277a47
KH
3614 if (multibyte)
3615 len = CHAR_STRING (c, b);
3616 else
3617 *b = c, len = 1;
2d1675e4
SM
3618 b += len;
3619 (*pending_exact) += len;
3620 }
3621
fa9a63c5 3622 break;
25fe55af 3623 } /* switch (c) */
fa9a63c5
RM
3624 } /* while p != pend */
3625
5e69f11e 3626
fa9a63c5 3627 /* Through the pattern now. */
5e69f11e 3628
505bde11 3629 FIXUP_ALT_JUMP ();
fa9a63c5 3630
5e69f11e 3631 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3632 FREE_STACK_RETURN (REG_EPAREN);
3633
3634 /* If we don't want backtracking, force success
3635 the first time we reach the end of the compiled pattern. */
3636 if (syntax & RE_NO_POSIX_BACKTRACKING)
3637 BUF_PUSH (succeed);
3638
fa9a63c5
RM
3639 /* We have succeeded; set the length of the buffer. */
3640 bufp->used = b - bufp->buffer;
3641
3642#ifdef DEBUG
99633e97 3643 if (debug > 0)
fa9a63c5 3644 {
505bde11 3645 re_compile_fastmap (bufp);
fa9a63c5
RM
3646 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3647 print_compiled_pattern (bufp);
3648 }
99633e97 3649 debug--;
fa9a63c5
RM
3650#endif /* DEBUG */
3651
3652#ifndef MATCH_MAY_ALLOCATE
3653 /* Initialize the failure stack to the largest possible stack. This
3654 isn't necessary unless we're trying to avoid calling alloca in
3655 the search and match routines. */
3656 {
3657 int num_regs = bufp->re_nsub + 1;
3658
320a2a73 3659 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3660 {
a26f4ccd 3661 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3662
fa9a63c5
RM
3663 if (! fail_stack.stack)
3664 fail_stack.stack
a073faa6 3665 = (fail_stack_elt_t *) malloc (fail_stack.size
8ea094cf 3666 * sizeof (fail_stack_elt_t));
fa9a63c5
RM
3667 else
3668 fail_stack.stack
a073faa6 3669 = (fail_stack_elt_t *) realloc (fail_stack.stack,
8ea094cf
AS
3670 (fail_stack.size
3671 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3672 }
3673
3674 regex_grow_registers (num_regs);
3675 }
3676#endif /* not MATCH_MAY_ALLOCATE */
3677
7c67c9f6 3678 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3679} /* regex_compile */
3680\f
3681/* Subroutines for `regex_compile'. */
3682
25fe55af 3683/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3684
3685static void
3686store_op1 (op, loc, arg)
3687 re_opcode_t op;
3688 unsigned char *loc;
3689 int arg;
3690{
3691 *loc = (unsigned char) op;
3692 STORE_NUMBER (loc + 1, arg);
3693}
3694
3695
3696/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3697
3698static void
3699store_op2 (op, loc, arg1, arg2)
3700 re_opcode_t op;
3701 unsigned char *loc;
3702 int arg1, arg2;
3703{
3704 *loc = (unsigned char) op;
3705 STORE_NUMBER (loc + 1, arg1);
3706 STORE_NUMBER (loc + 3, arg2);
3707}
3708
3709
3710/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3711 for OP followed by two-byte integer parameter ARG. */
3712
3713static void
3714insert_op1 (op, loc, arg, end)
3715 re_opcode_t op;
3716 unsigned char *loc;
3717 int arg;
5e69f11e 3718 unsigned char *end;
fa9a63c5
RM
3719{
3720 register unsigned char *pfrom = end;
3721 register unsigned char *pto = end + 3;
3722
3723 while (pfrom != loc)
3724 *--pto = *--pfrom;
5e69f11e 3725
fa9a63c5
RM
3726 store_op1 (op, loc, arg);
3727}
3728
3729
3730/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3731
3732static void
3733insert_op2 (op, loc, arg1, arg2, end)
3734 re_opcode_t op;
3735 unsigned char *loc;
3736 int arg1, arg2;
5e69f11e 3737 unsigned char *end;
fa9a63c5
RM
3738{
3739 register unsigned char *pfrom = end;
3740 register unsigned char *pto = end + 5;
3741
3742 while (pfrom != loc)
3743 *--pto = *--pfrom;
5e69f11e 3744
fa9a63c5
RM
3745 store_op2 (op, loc, arg1, arg2);
3746}
3747
3748
3749/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3750 after an alternative or a begin-subexpression. We assume there is at
3751 least one character before the ^. */
3752
3753static boolean
3754at_begline_loc_p (pattern, p, syntax)
01618498 3755 re_char *pattern, *p;
fa9a63c5
RM
3756 reg_syntax_t syntax;
3757{
01618498 3758 re_char *prev = p - 2;
fa9a63c5 3759 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3760
fa9a63c5
RM
3761 return
3762 /* After a subexpression? */
3763 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3764 /* After an alternative? */
d2af47df
SM
3765 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3766 /* After a shy subexpression? */
3767 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3768 && prev[-1] == '?' && prev[-2] == '('
3769 && (syntax & RE_NO_BK_PARENS
3770 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3771}
3772
3773
3774/* The dual of at_begline_loc_p. This one is for $. We assume there is
3775 at least one character after the $, i.e., `P < PEND'. */
3776
3777static boolean
3778at_endline_loc_p (p, pend, syntax)
01618498 3779 re_char *p, *pend;
99633e97 3780 reg_syntax_t syntax;
fa9a63c5 3781{
01618498 3782 re_char *next = p;
fa9a63c5 3783 boolean next_backslash = *next == '\\';
01618498 3784 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3785
fa9a63c5
RM
3786 return
3787 /* Before a subexpression? */
3788 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3789 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3790 /* Before an alternative? */
3791 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3792 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3793}
3794
3795
5e69f11e 3796/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3797 false if it's not. */
3798
3799static boolean
3800group_in_compile_stack (compile_stack, regnum)
3801 compile_stack_type compile_stack;
3802 regnum_t regnum;
3803{
3804 int this_element;
3805
5e69f11e
RM
3806 for (this_element = compile_stack.avail - 1;
3807 this_element >= 0;
fa9a63c5
RM
3808 this_element--)
3809 if (compile_stack.stack[this_element].regnum == regnum)
3810 return true;
3811
3812 return false;
3813}
fa9a63c5 3814\f
f6a3f532
SM
3815/* analyse_first.
3816 If fastmap is non-NULL, go through the pattern and fill fastmap
3817 with all the possible leading chars. If fastmap is NULL, don't
3818 bother filling it up (obviously) and only return whether the
3819 pattern could potentially match the empty string.
3820
3821 Return 1 if p..pend might match the empty string.
3822 Return 0 if p..pend matches at least one char.
01618498 3823 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3824
3825static int
3826analyse_first (p, pend, fastmap, multibyte)
01618498 3827 re_char *p, *pend;
f6a3f532
SM
3828 char *fastmap;
3829 const int multibyte;
fa9a63c5 3830{
505bde11 3831 int j, k;
1fb352e0 3832 boolean not;
fa9a63c5 3833
b18215fc 3834 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3835 flag is set true. */
b18215fc
RS
3836 boolean match_any_multibyte_characters = false;
3837
f6a3f532 3838 assert (p);
5e69f11e 3839
505bde11
SM
3840 /* The loop below works as follows:
3841 - It has a working-list kept in the PATTERN_STACK and which basically
3842 starts by only containing a pointer to the first operation.
3843 - If the opcode we're looking at is a match against some set of
3844 chars, then we add those chars to the fastmap and go on to the
3845 next work element from the worklist (done via `break').
3846 - If the opcode is a control operator on the other hand, we either
3847 ignore it (if it's meaningless at this point, such as `start_memory')
3848 or execute it (if it's a jump). If the jump has several destinations
3849 (i.e. `on_failure_jump'), then we push the other destination onto the
3850 worklist.
3851 We guarantee termination by ignoring backward jumps (more or less),
3852 so that `p' is monotonically increasing. More to the point, we
3853 never set `p' (or push) anything `<= p1'. */
3854
01618498 3855 while (p < pend)
fa9a63c5 3856 {
505bde11
SM
3857 /* `p1' is used as a marker of how far back a `on_failure_jump'
3858 can go without being ignored. It is normally equal to `p'
3859 (which prevents any backward `on_failure_jump') except right
3860 after a plain `jump', to allow patterns such as:
3861 0: jump 10
3862 3..9: <body>
3863 10: on_failure_jump 3
3864 as used for the *? operator. */
01618498 3865 re_char *p1 = p;
5e69f11e 3866
fa9a63c5
RM
3867 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3868 {
f6a3f532 3869 case succeed:
01618498 3870 return 1;
f6a3f532 3871 continue;
fa9a63c5 3872
fa9a63c5 3873 case duplicate:
505bde11
SM
3874 /* If the first character has to match a backreference, that means
3875 that the group was empty (since it already matched). Since this
3876 is the only case that interests us here, we can assume that the
3877 backreference must match the empty string. */
3878 p++;
3879 continue;
fa9a63c5
RM
3880
3881
3882 /* Following are the cases which match a character. These end
25fe55af 3883 with `break'. */
fa9a63c5
RM
3884
3885 case exactn:
e0277a47
KH
3886 if (fastmap)
3887 {
3888 int c = RE_STRING_CHAR (p + 1, pend - p);
4560a582
SM
3889 /* When fast-scanning, the fastmap can be indexed either with
3890 a char (smaller than 256) or with the first byte of
3891 a char's byte sequence. So we have to conservatively add
3892 both to the table. */
e0277a47
KH
3893 if (SINGLE_BYTE_CHAR_P (c))
3894 fastmap[c] = 1;
4560a582 3895 fastmap[p[1]] = 1;
e0277a47 3896 }
fa9a63c5
RM
3897 break;
3898
3899
1fb352e0
SM
3900 case anychar:
3901 /* We could put all the chars except for \n (and maybe \0)
3902 but we don't bother since it is generally not worth it. */
f6a3f532 3903 if (!fastmap) break;
01618498 3904 return -1;
fa9a63c5
RM
3905
3906
b18215fc 3907 case charset_not:
ba5c004d
RS
3908 /* Chars beyond end of bitmap are possible matches.
3909 All the single-byte codes can occur in multibyte buffers.
3910 So any that are not listed in the charset
3911 are possible matches, even in multibyte buffers. */
1fb352e0 3912 if (!fastmap) break;
4560a582
SM
3913 /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
3914 because it will automatically be set when needed by virtue of
3915 being larger than the highest char of its charset (0xbf) but
3916 smaller than (1<<BYTEWIDTH). */
b18215fc 3917 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3918 j < (1 << BYTEWIDTH); j++)
b18215fc 3919 fastmap[j] = 1;
1fb352e0
SM
3920 /* Fallthrough */
3921 case charset:
3922 if (!fastmap) break;
3923 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3924 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3925 j >= 0; j--)
1fb352e0 3926 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
4560a582
SM
3927 {
3928 fastmap[j] = 1;
3929#ifdef emacs
3930 if (j >= 0x80 && j < 0xa0)
3931 fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
3932#endif
3933 }
b18215fc 3934
1fb352e0
SM
3935 if ((not && multibyte)
3936 /* Any character set can possibly contain a character
3937 which doesn't match the specified set of characters. */
3938 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3939 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3940 /* If we can match a character class, we can match
3941 any character set. */
b18215fc
RS
3942 {
3943 set_fastmap_for_multibyte_characters:
3944 if (match_any_multibyte_characters == false)
3945 {
3946 for (j = 0x80; j < 0xA0; j++) /* XXX */
3947 if (BASE_LEADING_CODE_P (j))
3948 fastmap[j] = 1;
3949 match_any_multibyte_characters = true;
3950 }
3951 }
b18215fc 3952
1fb352e0
SM
3953 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3954 && match_any_multibyte_characters == false)
3955 {
3956 /* Set fastmap[I] 1 where I is a base leading code of each
3957 multibyte character in the range table. */
3958 int c, count;
b18215fc 3959
1fb352e0 3960 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3961 bits for a character class. */
1fb352e0 3962 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3963
1fb352e0
SM
3964 /* Extract the number of ranges in range table into COUNT. */
3965 EXTRACT_NUMBER_AND_INCR (count, p);
3966 for (; count > 0; count--, p += 2 * 3) /* XXX */
3967 {
3968 /* Extract the start of each range. */
3969 EXTRACT_CHARACTER (c, p);
3970 j = CHAR_CHARSET (c);
3971 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3972 }
3973 }
b18215fc
RS
3974 break;
3975
1fb352e0
SM
3976 case syntaxspec:
3977 case notsyntaxspec:
3978 if (!fastmap) break;
3979#ifndef emacs
3980 not = (re_opcode_t)p[-1] == notsyntaxspec;
3981 k = *p++;
3982 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3983 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3984 fastmap[j] = 1;
b18215fc 3985 break;
1fb352e0 3986#else /* emacs */
b18215fc
RS
3987 /* This match depends on text properties. These end with
3988 aborting optimizations. */
01618498 3989 return -1;
b18215fc
RS
3990
3991 case categoryspec:
b18215fc 3992 case notcategoryspec:
1fb352e0
SM
3993 if (!fastmap) break;
3994 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 3995 k = *p++;
1fb352e0
SM
3996 for (j = 0; j < (1 << BYTEWIDTH); j++)
3997 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
3998 fastmap[j] = 1;
3999
1fb352e0 4000 if (multibyte)
b18215fc 4001 /* Any character set can possibly contain a character
1fb352e0 4002 whose category is K (or not). */
b18215fc
RS
4003 goto set_fastmap_for_multibyte_characters;
4004 break;
4005
fa9a63c5 4006 /* All cases after this match the empty string. These end with
25fe55af 4007 `continue'. */
fa9a63c5 4008
fa9a63c5
RM
4009 case before_dot:
4010 case at_dot:
4011 case after_dot:
1fb352e0 4012#endif /* !emacs */
25fe55af
RS
4013 case no_op:
4014 case begline:
4015 case endline:
fa9a63c5
RM
4016 case begbuf:
4017 case endbuf:
4018 case wordbound:
4019 case notwordbound:
4020 case wordbeg:
4021 case wordend:
669fa600
SM
4022 case symbeg:
4023 case symend:
25fe55af 4024 continue;
fa9a63c5
RM
4025
4026
fa9a63c5 4027 case jump:
25fe55af 4028 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4029 if (j < 0)
4030 /* Backward jumps can only go back to code that we've already
4031 visited. `re_compile' should make sure this is true. */
4032 break;
25fe55af 4033 p += j;
505bde11
SM
4034 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4035 {
4036 case on_failure_jump:
4037 case on_failure_keep_string_jump:
505bde11 4038 case on_failure_jump_loop:
0683b6fa 4039 case on_failure_jump_nastyloop:
505bde11
SM
4040 case on_failure_jump_smart:
4041 p++;
4042 break;
4043 default:
4044 continue;
4045 };
4046 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4047 to jump back to "just after here". */
4048 /* Fallthrough */
fa9a63c5 4049
25fe55af
RS
4050 case on_failure_jump:
4051 case on_failure_keep_string_jump:
0683b6fa 4052 case on_failure_jump_nastyloop:
505bde11
SM
4053 case on_failure_jump_loop:
4054 case on_failure_jump_smart:
25fe55af 4055 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4056 if (p + j <= p1)
ed0767d8 4057 ; /* Backward jump to be ignored. */
01618498
SM
4058 else
4059 { /* We have to look down both arms.
4060 We first go down the "straight" path so as to minimize
4061 stack usage when going through alternatives. */
4062 int r = analyse_first (p, pend, fastmap, multibyte);
4063 if (r) return r;
4064 p += j;
4065 }
25fe55af 4066 continue;
fa9a63c5
RM
4067
4068
ed0767d8
SM
4069 case jump_n:
4070 /* This code simply does not properly handle forward jump_n. */
4071 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4072 p += 4;
4073 /* jump_n can either jump or fall through. The (backward) jump
4074 case has already been handled, so we only need to look at the
4075 fallthrough case. */
4076 continue;
177c0ea7 4077
fa9a63c5 4078 case succeed_n:
ed0767d8
SM
4079 /* If N == 0, it should be an on_failure_jump_loop instead. */
4080 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4081 p += 4;
4082 /* We only care about one iteration of the loop, so we don't
4083 need to consider the case where this behaves like an
4084 on_failure_jump. */
25fe55af 4085 continue;
fa9a63c5
RM
4086
4087
4088 case set_number_at:
25fe55af
RS
4089 p += 4;
4090 continue;
fa9a63c5
RM
4091
4092
4093 case start_memory:
25fe55af 4094 case stop_memory:
505bde11 4095 p += 1;
fa9a63c5
RM
4096 continue;
4097
4098
4099 default:
25fe55af
RS
4100 abort (); /* We have listed all the cases. */
4101 } /* switch *p++ */
fa9a63c5
RM
4102
4103 /* Getting here means we have found the possible starting
25fe55af 4104 characters for one path of the pattern -- and that the empty
01618498
SM
4105 string does not match. We need not follow this path further. */
4106 return 0;
fa9a63c5
RM
4107 } /* while p */
4108
01618498
SM
4109 /* We reached the end without matching anything. */
4110 return 1;
4111
f6a3f532
SM
4112} /* analyse_first */
4113\f
4114/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4115 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4116 characters can start a string that matches the pattern. This fastmap
4117 is used by re_search to skip quickly over impossible starting points.
4118
4119 Character codes above (1 << BYTEWIDTH) are not represented in the
4120 fastmap, but the leading codes are represented. Thus, the fastmap
4121 indicates which character sets could start a match.
4122
4123 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4124 area as BUFP->fastmap.
4125
4126 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4127 the pattern buffer.
4128
4129 Returns 0 if we succeed, -2 if an internal error. */
4130
4131int
4132re_compile_fastmap (bufp)
4133 struct re_pattern_buffer *bufp;
4134{
4135 char *fastmap = bufp->fastmap;
4136 int analysis;
4137
4138 assert (fastmap && bufp->buffer);
4139
4140 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4141 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4142
4143 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4144 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4145 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4146 return 0;
4147} /* re_compile_fastmap */
4148\f
4149/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4150 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4151 this memory for recording register information. STARTS and ENDS
4152 must be allocated using the malloc library routine, and must each
4153 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4154
4155 If NUM_REGS == 0, then subsequent matches should allocate their own
4156 register data.
4157
4158 Unless this function is called, the first search or match using
4159 PATTERN_BUFFER will allocate its own register data, without
4160 freeing the old data. */
4161
4162void
4163re_set_registers (bufp, regs, num_regs, starts, ends)
4164 struct re_pattern_buffer *bufp;
4165 struct re_registers *regs;
4166 unsigned num_regs;
4167 regoff_t *starts, *ends;
4168{
4169 if (num_regs)
4170 {
4171 bufp->regs_allocated = REGS_REALLOCATE;
4172 regs->num_regs = num_regs;
4173 regs->start = starts;
4174 regs->end = ends;
4175 }
4176 else
4177 {
4178 bufp->regs_allocated = REGS_UNALLOCATED;
4179 regs->num_regs = 0;
4180 regs->start = regs->end = (regoff_t *) 0;
4181 }
4182}
c0f9ea08 4183WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4184\f
25fe55af 4185/* Searching routines. */
fa9a63c5
RM
4186
4187/* Like re_search_2, below, but only one string is specified, and
4188 doesn't let you say where to stop matching. */
4189
4190int
4191re_search (bufp, string, size, startpos, range, regs)
4192 struct re_pattern_buffer *bufp;
4193 const char *string;
4194 int size, startpos, range;
4195 struct re_registers *regs;
4196{
5e69f11e 4197 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4198 regs, size);
4199}
c0f9ea08 4200WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4201
70806df6
KH
4202/* Head address of virtual concatenation of string. */
4203#define HEAD_ADDR_VSTRING(P) \
4204 (((P) >= size1 ? string2 : string1))
4205
b18215fc
RS
4206/* End address of virtual concatenation of string. */
4207#define STOP_ADDR_VSTRING(P) \
4208 (((P) >= size1 ? string2 + size2 : string1 + size1))
4209
4210/* Address of POS in the concatenation of virtual string. */
4211#define POS_ADDR_VSTRING(POS) \
4212 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4213
4214/* Using the compiled pattern in BUFP->buffer, first tries to match the
4215 virtual concatenation of STRING1 and STRING2, starting first at index
4216 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4217
fa9a63c5 4218 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4219
fa9a63c5
RM
4220 RANGE is how far to scan while trying to match. RANGE = 0 means try
4221 only at STARTPOS; in general, the last start tried is STARTPOS +
4222 RANGE.
5e69f11e 4223
fa9a63c5
RM
4224 In REGS, return the indices of the virtual concatenation of STRING1
4225 and STRING2 that matched the entire BUFP->buffer and its contained
4226 subexpressions.
5e69f11e 4227
fa9a63c5
RM
4228 Do not consider matching one past the index STOP in the virtual
4229 concatenation of STRING1 and STRING2.
4230
4231 We return either the position in the strings at which the match was
4232 found, -1 if no match, or -2 if error (such as failure
4233 stack overflow). */
4234
4235int
66f0296e 4236re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4237 struct re_pattern_buffer *bufp;
66f0296e 4238 const char *str1, *str2;
fa9a63c5
RM
4239 int size1, size2;
4240 int startpos;
4241 int range;
4242 struct re_registers *regs;
4243 int stop;
4244{
4245 int val;
66f0296e
SM
4246 re_char *string1 = (re_char*) str1;
4247 re_char *string2 = (re_char*) str2;
fa9a63c5 4248 register char *fastmap = bufp->fastmap;
6676cb1c 4249 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4250 int total_size = size1 + size2;
4251 int endpos = startpos + range;
c0f9ea08 4252 boolean anchored_start;
fa9a63c5 4253
25fe55af 4254 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4255 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4256
fa9a63c5
RM
4257 /* Check for out-of-range STARTPOS. */
4258 if (startpos < 0 || startpos > total_size)
4259 return -1;
5e69f11e 4260
fa9a63c5 4261 /* Fix up RANGE if it might eventually take us outside
34597fa9 4262 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4263 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4264 if (endpos < 0)
4265 range = 0 - startpos;
fa9a63c5
RM
4266 else if (endpos > total_size)
4267 range = total_size - startpos;
4268
4269 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4270 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4271 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4272 {
4273 if (startpos > 0)
4274 return -1;
4275 else
7b140fd7 4276 range = 0;
fa9a63c5
RM
4277 }
4278
ae4788a8
RS
4279#ifdef emacs
4280 /* In a forward search for something that starts with \=.
4281 don't keep searching past point. */
4282 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4283 {
7b140fd7
RS
4284 range = PT_BYTE - BEGV_BYTE - startpos;
4285 if (range < 0)
ae4788a8
RS
4286 return -1;
4287 }
4288#endif /* emacs */
4289
fa9a63c5
RM
4290 /* Update the fastmap now if not correct already. */
4291 if (fastmap && !bufp->fastmap_accurate)
01618498 4292 re_compile_fastmap (bufp);
5e69f11e 4293
c8499ba5 4294 /* See whether the pattern is anchored. */
c0f9ea08 4295 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4296
b18215fc 4297#ifdef emacs
cc9b4df2
KH
4298 gl_state.object = re_match_object;
4299 {
99633e97 4300 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4301
4302 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4303 }
b18215fc
RS
4304#endif
4305
fa9a63c5
RM
4306 /* Loop through the string, looking for a place to start matching. */
4307 for (;;)
5e69f11e 4308 {
c8499ba5
RS
4309 /* If the pattern is anchored,
4310 skip quickly past places we cannot match.
4311 We don't bother to treat startpos == 0 specially
4312 because that case doesn't repeat. */
4313 if (anchored_start && startpos > 0)
4314 {
c0f9ea08
SM
4315 if (! ((startpos <= size1 ? string1[startpos - 1]
4316 : string2[startpos - size1 - 1])
4317 == '\n'))
c8499ba5
RS
4318 goto advance;
4319 }
4320
fa9a63c5 4321 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4322 cannot be the start of a match. If the pattern can match the
4323 null string, however, we don't need to skip characters; we want
4324 the first null string. */
fa9a63c5
RM
4325 if (fastmap && startpos < total_size && !bufp->can_be_null)
4326 {
66f0296e 4327 register re_char *d;
01618498 4328 register re_wchar_t buf_ch;
e934739e
RS
4329
4330 d = POS_ADDR_VSTRING (startpos);
4331
25fe55af 4332 if (range > 0) /* Searching forwards. */
fa9a63c5 4333 {
fa9a63c5
RM
4334 register int lim = 0;
4335 int irange = range;
4336
25fe55af
RS
4337 if (startpos < size1 && startpos + range >= size1)
4338 lim = range - (size1 - startpos);
fa9a63c5 4339
25fe55af
RS
4340 /* Written out as an if-else to avoid testing `translate'
4341 inside the loop. */
28ae27ae
AS
4342 if (RE_TRANSLATE_P (translate))
4343 {
e934739e
RS
4344 if (multibyte)
4345 while (range > lim)
4346 {
4347 int buf_charlen;
4348
4349 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4350 buf_charlen);
4351
4352 buf_ch = RE_TRANSLATE (translate, buf_ch);
4353 if (buf_ch >= 0400
4354 || fastmap[buf_ch])
4355 break;
4356
4357 range -= buf_charlen;
4358 d += buf_charlen;
4359 }
4360 else
cf1982d9
EZ
4361 {
4362 /* Convert *d to integer to shut up GCC's
4363 whining about comparison that is always
4364 true. */
4365 int di = *d;
4366
4367 while (range > lim
4368 && !fastmap[RE_TRANSLATE (translate, di)])
4369 {
4370 di = *(++d);
4371 range--;
4372 }
4373 }
e934739e 4374 }
fa9a63c5 4375 else
4560a582 4376 do
33c46939 4377 {
4560a582
SM
4378 re_char *d_start = d;
4379 while (range > lim && !fastmap[*d])
4380 {
4381 d++;
4382 range--;
4383 }
4384#ifdef emacs
4385 if (multibyte && range > lim)
4386 {
4387 /* Check that we are at the beginning of a char. */
4388 int at_boundary;
4389 AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
4390 if (at_boundary)
4391 break;
4392 else
4393 { /* We have matched an internal byte of a char
4394 rather than the leading byte, so it's a false
4395 positive: we should keep scanning. */
4396 d++; range--;
4397 }
4398 }
4399 else
4400#endif
4401 break;
4402 } while (1);
fa9a63c5
RM
4403
4404 startpos += irange - range;
4405 }
25fe55af 4406 else /* Searching backwards. */
fa9a63c5 4407 {
2d1675e4
SM
4408 int room = (startpos >= size1
4409 ? size2 + size1 - startpos
4410 : size1 - startpos);
4411 buf_ch = RE_STRING_CHAR (d, room);
4412 buf_ch = TRANSLATE (buf_ch);
fa9a63c5 4413
e934739e
RS
4414 if (! (buf_ch >= 0400
4415 || fastmap[buf_ch]))
fa9a63c5
RM
4416 goto advance;
4417 }
4418 }
4419
4420 /* If can't match the null string, and that's all we have left, fail. */
4421 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4422 && !bufp->can_be_null)
fa9a63c5
RM
4423 return -1;
4424
4425 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4426 startpos, regs, stop);
4427#ifndef REGEX_MALLOC
0b32bf0e 4428# ifdef C_ALLOCA
fa9a63c5 4429 alloca (0);
0b32bf0e 4430# endif
fa9a63c5
RM
4431#endif
4432
4433 if (val >= 0)
4434 return startpos;
5e69f11e 4435
fa9a63c5
RM
4436 if (val == -2)
4437 return -2;
4438
4439 advance:
5e69f11e 4440 if (!range)
25fe55af 4441 break;
5e69f11e 4442 else if (range > 0)
25fe55af 4443 {
b18215fc
RS
4444 /* Update STARTPOS to the next character boundary. */
4445 if (multibyte)
4446 {
66f0296e
SM
4447 re_char *p = POS_ADDR_VSTRING (startpos);
4448 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4449 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4450
4451 range -= len;
4452 if (range < 0)
4453 break;
4454 startpos += len;
4455 }
4456 else
4457 {
b560c397
RS
4458 range--;
4459 startpos++;
4460 }
e318085a 4461 }
fa9a63c5 4462 else
25fe55af
RS
4463 {
4464 range++;
4465 startpos--;
b18215fc
RS
4466
4467 /* Update STARTPOS to the previous character boundary. */
4468 if (multibyte)
4469 {
70806df6
KH
4470 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4471 re_char *p0 = p;
4472 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4473
4474 /* Find the head of multibyte form. */
70806df6
KH
4475 PREV_CHAR_BOUNDARY (p, phead);
4476 range += p0 - 1 - p;
4477 if (range > 0)
4478 break;
b18215fc 4479
70806df6 4480 startpos -= p0 - 1 - p;
b18215fc 4481 }
25fe55af 4482 }
fa9a63c5
RM
4483 }
4484 return -1;
4485} /* re_search_2 */
c0f9ea08 4486WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4487\f
4488/* Declarations and macros for re_match_2. */
4489
2d1675e4
SM
4490static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4491 register int len,
4492 RE_TRANSLATE_TYPE translate,
4493 const int multibyte));
fa9a63c5
RM
4494
4495/* This converts PTR, a pointer into one of the search strings `string1'
4496 and `string2' into an offset from the beginning of that string. */
4497#define POINTER_TO_OFFSET(ptr) \
4498 (FIRST_STRING_P (ptr) \
4499 ? ((regoff_t) ((ptr) - string1)) \
4500 : ((regoff_t) ((ptr) - string2 + size1)))
4501
fa9a63c5 4502/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4503 string2 if necessary.
4504 Check re_match_2_internal for a discussion of why end_match_2 might
4505 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4506#define PREFETCH() \
25fe55af 4507 while (d == dend) \
fa9a63c5
RM
4508 { \
4509 /* End of string2 => fail. */ \
25fe55af
RS
4510 if (dend == end_match_2) \
4511 goto fail; \
4bb91c68 4512 /* End of string1 => advance to string2. */ \
25fe55af 4513 d = string2; \
fa9a63c5
RM
4514 dend = end_match_2; \
4515 }
4516
f1ad044f
SM
4517/* Call before fetching a char with *d if you already checked other limits.
4518 This is meant for use in lookahead operations like wordend, etc..
4519 where we might need to look at parts of the string that might be
4520 outside of the LIMITs (i.e past `stop'). */
4521#define PREFETCH_NOLIMIT() \
4522 if (d == end1) \
4523 { \
4524 d = string2; \
4525 dend = end_match_2; \
4526 } \
fa9a63c5
RM
4527
4528/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 4529 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4530#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4531#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4532
4533
4534/* Test if D points to a character which is word-constituent. We have
4535 two special cases to check for: if past the end of string1, look at
4536 the first character in string2; and if before the beginning of
4537 string2, look at the last character in string1. */
4538#define WORDCHAR_P(d) \
4539 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4540 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4541 == Sword)
4542
9121ca40 4543/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4544
4545/* The comment at case wordbound is following one, but we don't use
4546 AT_WORD_BOUNDARY anymore to support multibyte form.
4547
4548 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
4549 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4550 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4551 macro and introducing temporary variables works around the bug. */
4552
9121ca40 4553#if 0
fa9a63c5
RM
4554/* Test if the character before D and the one at D differ with respect
4555 to being word-constituent. */
4556#define AT_WORD_BOUNDARY(d) \
4557 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4558 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4559#endif
fa9a63c5
RM
4560
4561/* Free everything we malloc. */
4562#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4563# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4564# define FREE_VARIABLES() \
fa9a63c5
RM
4565 do { \
4566 REGEX_FREE_STACK (fail_stack.stack); \
4567 FREE_VAR (regstart); \
4568 FREE_VAR (regend); \
fa9a63c5
RM
4569 FREE_VAR (best_regstart); \
4570 FREE_VAR (best_regend); \
fa9a63c5
RM
4571 } while (0)
4572#else
0b32bf0e 4573# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4574#endif /* not MATCH_MAY_ALLOCATE */
4575
505bde11
SM
4576\f
4577/* Optimization routines. */
4578
4e8a9132
SM
4579/* If the operation is a match against one or more chars,
4580 return a pointer to the next operation, else return NULL. */
01618498 4581static re_char *
4e8a9132 4582skip_one_char (p)
01618498 4583 re_char *p;
4e8a9132
SM
4584{
4585 switch (SWITCH_ENUM_CAST (*p++))
4586 {
4587 case anychar:
4588 break;
177c0ea7 4589
4e8a9132
SM
4590 case exactn:
4591 p += *p + 1;
4592 break;
4593
4594 case charset_not:
4595 case charset:
4596 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4597 {
4598 int mcnt;
4599 p = CHARSET_RANGE_TABLE (p - 1);
4600 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4601 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4602 }
4603 else
4604 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4605 break;
177c0ea7 4606
4e8a9132
SM
4607 case syntaxspec:
4608 case notsyntaxspec:
1fb352e0 4609#ifdef emacs
4e8a9132
SM
4610 case categoryspec:
4611 case notcategoryspec:
4612#endif /* emacs */
4613 p++;
4614 break;
4615
4616 default:
4617 p = NULL;
4618 }
4619 return p;
4620}
4621
4622
505bde11 4623/* Jump over non-matching operations. */
275a3409 4624static re_char *
4e8a9132 4625skip_noops (p, pend)
275a3409 4626 re_char *p, *pend;
505bde11
SM
4627{
4628 int mcnt;
4629 while (p < pend)
4630 {
4631 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4632 {
4633 case start_memory:
505bde11
SM
4634 case stop_memory:
4635 p += 2; break;
4636 case no_op:
4637 p += 1; break;
4638 case jump:
4639 p += 1;
4640 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4641 p += mcnt;
4642 break;
4643 default:
4644 return p;
4645 }
4646 }
4647 assert (p == pend);
4648 return p;
4649}
4650
4651/* Non-zero if "p1 matches something" implies "p2 fails". */
4652static int
4653mutually_exclusive_p (bufp, p1, p2)
4654 struct re_pattern_buffer *bufp;
275a3409 4655 re_char *p1, *p2;
505bde11 4656{
4e8a9132 4657 re_opcode_t op2;
2d1675e4 4658 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4659 unsigned char *pend = bufp->buffer + bufp->used;
4660
4e8a9132 4661 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4662 && p2 >= bufp->buffer && p2 <= pend);
4663
4664 /* Skip over open/close-group commands.
4665 If what follows this loop is a ...+ construct,
4666 look at what begins its body, since we will have to
4667 match at least one of that. */
4e8a9132
SM
4668 p2 = skip_noops (p2, pend);
4669 /* The same skip can be done for p1, except that this function
4670 is only used in the case where p1 is a simple match operator. */
4671 /* p1 = skip_noops (p1, pend); */
4672
4673 assert (p1 >= bufp->buffer && p1 < pend
4674 && p2 >= bufp->buffer && p2 <= pend);
4675
4676 op2 = p2 == pend ? succeed : *p2;
4677
4678 switch (SWITCH_ENUM_CAST (op2))
505bde11 4679 {
4e8a9132
SM
4680 case succeed:
4681 case endbuf:
4682 /* If we're at the end of the pattern, we can change. */
4683 if (skip_one_char (p1))
505bde11 4684 {
505bde11
SM
4685 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4686 return 1;
505bde11 4687 }
4e8a9132 4688 break;
177c0ea7 4689
4e8a9132 4690 case endline:
4e8a9132
SM
4691 case exactn:
4692 {
01618498 4693 register re_wchar_t c
4e8a9132 4694 = (re_opcode_t) *p2 == endline ? '\n'
411e4203 4695 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
505bde11 4696
4e8a9132
SM
4697 if ((re_opcode_t) *p1 == exactn)
4698 {
4699 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4700 {
4701 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4702 return 1;
4703 }
4704 }
505bde11 4705
4e8a9132
SM
4706 else if ((re_opcode_t) *p1 == charset
4707 || (re_opcode_t) *p1 == charset_not)
4708 {
4709 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4710
4e8a9132
SM
4711 /* Test if C is listed in charset (or charset_not)
4712 at `p1'. */
4713 if (SINGLE_BYTE_CHAR_P (c))
4714 {
4715 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4716 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4717 not = !not;
4718 }
4719 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4720 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4721
4e8a9132
SM
4722 /* `not' is equal to 1 if c would match, which means
4723 that we can't change to pop_failure_jump. */
4724 if (!not)
4725 {
4726 DEBUG_PRINT1 (" No match => fast loop.\n");
4727 return 1;
4728 }
4729 }
4730 else if ((re_opcode_t) *p1 == anychar
4731 && c == '\n')
4732 {
4733 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4734 return 1;
4735 }
4736 }
4737 break;
505bde11 4738
4e8a9132 4739 case charset:
4e8a9132
SM
4740 {
4741 if ((re_opcode_t) *p1 == exactn)
4742 /* Reuse the code above. */
4743 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4744
505bde11
SM
4745 /* It is hard to list up all the character in charset
4746 P2 if it includes multibyte character. Give up in
4747 such case. */
4748 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4749 {
4750 /* Now, we are sure that P2 has no range table.
4751 So, for the size of bitmap in P2, `p2[1]' is
4752 enough. But P1 may have range table, so the
4753 size of bitmap table of P1 is extracted by
4754 using macro `CHARSET_BITMAP_SIZE'.
4755
4756 Since we know that all the character listed in
4757 P2 is ASCII, it is enough to test only bitmap
4758 table of P1. */
4759
411e4203 4760 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4761 {
4762 int idx;
4763 /* We win if the charset inside the loop
4764 has no overlap with the one after the loop. */
4765 for (idx = 0;
4766 (idx < (int) p2[1]
4767 && idx < CHARSET_BITMAP_SIZE (p1));
4768 idx++)
4769 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4770 break;
4771
4772 if (idx == p2[1]
4773 || idx == CHARSET_BITMAP_SIZE (p1))
4774 {
4775 DEBUG_PRINT1 (" No match => fast loop.\n");
4776 return 1;
4777 }
4778 }
411e4203 4779 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4780 {
4781 int idx;
4782 /* We win if the charset_not inside the loop lists
4783 every character listed in the charset after. */
4784 for (idx = 0; idx < (int) p2[1]; idx++)
4785 if (! (p2[2 + idx] == 0
4786 || (idx < CHARSET_BITMAP_SIZE (p1)
4787 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4788 break;
4789
4e8a9132
SM
4790 if (idx == p2[1])
4791 {
4792 DEBUG_PRINT1 (" No match => fast loop.\n");
4793 return 1;
4794 }
4795 }
4796 }
4797 }
609b757a 4798 break;
177c0ea7 4799
411e4203
SM
4800 case charset_not:
4801 switch (SWITCH_ENUM_CAST (*p1))
4802 {
4803 case exactn:
4804 case charset:
4805 /* Reuse the code above. */
4806 return mutually_exclusive_p (bufp, p2, p1);
4807 case charset_not:
4808 /* When we have two charset_not, it's very unlikely that
4809 they don't overlap. The union of the two sets of excluded
4810 chars should cover all possible chars, which, as a matter of
4811 fact, is virtually impossible in multibyte buffers. */
36595814 4812 break;
411e4203
SM
4813 }
4814 break;
4815
4e8a9132 4816 case wordend:
669fa600
SM
4817 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4818 case symend:
4e8a9132 4819 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4820 && (p1[1] == Ssymbol || p1[1] == Sword));
4821 case notsyntaxspec:
4822 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4823
4824 case wordbeg:
669fa600
SM
4825 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4826 case symbeg:
4e8a9132 4827 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4828 && (p1[1] == Ssymbol || p1[1] == Sword));
4829 case syntaxspec:
4830 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4831
4832 case wordbound:
4833 return (((re_opcode_t) *p1 == notsyntaxspec
4834 || (re_opcode_t) *p1 == syntaxspec)
4835 && p1[1] == Sword);
4836
1fb352e0 4837#ifdef emacs
4e8a9132
SM
4838 case categoryspec:
4839 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4840 case notcategoryspec:
4841 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4842#endif /* emacs */
4843
4844 default:
4845 ;
505bde11
SM
4846 }
4847
4848 /* Safe default. */
4849 return 0;
4850}
4851
fa9a63c5
RM
4852\f
4853/* Matching routines. */
4854
25fe55af 4855#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4856/* re_match is like re_match_2 except it takes only a single string. */
4857
4858int
4859re_match (bufp, string, size, pos, regs)
4860 struct re_pattern_buffer *bufp;
4861 const char *string;
4862 int size, pos;
4863 struct re_registers *regs;
4864{
4bb91c68 4865 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4866 pos, regs, size);
0b32bf0e 4867# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4868 alloca (0);
0b32bf0e 4869# endif
fa9a63c5
RM
4870 return result;
4871}
c0f9ea08 4872WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4873#endif /* not emacs */
4874
b18215fc
RS
4875#ifdef emacs
4876/* In Emacs, this is the string or buffer in which we
25fe55af 4877 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4878Lisp_Object re_match_object;
4879#endif
fa9a63c5
RM
4880
4881/* re_match_2 matches the compiled pattern in BUFP against the
4882 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4883 and SIZE2, respectively). We start matching at POS, and stop
4884 matching at STOP.
5e69f11e 4885
fa9a63c5 4886 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4887 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4888 documentation for exactly how many groups we fill.
4889
4890 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4891 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4892 matched substring. */
4893
4894int
4895re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4896 struct re_pattern_buffer *bufp;
4897 const char *string1, *string2;
4898 int size1, size2;
4899 int pos;
4900 struct re_registers *regs;
4901 int stop;
4902{
b18215fc 4903 int result;
25fe55af 4904
b18215fc 4905#ifdef emacs
cc9b4df2
KH
4906 int charpos;
4907 gl_state.object = re_match_object;
99633e97 4908 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4909 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4910#endif
4911
4bb91c68
SM
4912 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4913 (re_char*) string2, size2,
cc9b4df2 4914 pos, regs, stop);
0b32bf0e 4915#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4916 alloca (0);
a60198e5 4917#endif
fa9a63c5
RM
4918 return result;
4919}
c0f9ea08 4920WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5
RM
4921
4922/* This is a separate function so that we can force an alloca cleanup
25fe55af 4923 afterwards. */
fa9a63c5
RM
4924static int
4925re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4926 struct re_pattern_buffer *bufp;
66f0296e 4927 re_char *string1, *string2;
fa9a63c5
RM
4928 int size1, size2;
4929 int pos;
4930 struct re_registers *regs;
4931 int stop;
4932{
4933 /* General temporaries. */
4934 int mcnt;
01618498 4935 size_t reg;
66f0296e 4936 boolean not;
fa9a63c5
RM
4937
4938 /* Just past the end of the corresponding string. */
66f0296e 4939 re_char *end1, *end2;
fa9a63c5
RM
4940
4941 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4942 each to consider matching. */
66f0296e 4943 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4944
4945 /* Where we are in the data, and the end of the current string. */
66f0296e 4946 re_char *d, *dend;
5e69f11e 4947
99633e97
SM
4948 /* Used sometimes to remember where we were before starting matching
4949 an operator so that we can go back in case of failure. This "atomic"
4950 behavior of matching opcodes is indispensable to the correctness
4951 of the on_failure_keep_string_jump optimization. */
4952 re_char *dfail;
4953
fa9a63c5 4954 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4955 re_char *p = bufp->buffer;
4956 re_char *pend = p + bufp->used;
fa9a63c5 4957
25fe55af 4958 /* We use this to map every character in the string. */
6676cb1c 4959 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4960
25fe55af 4961 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4962 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4963
fa9a63c5
RM
4964 /* Failure point stack. Each place that can handle a failure further
4965 down the line pushes a failure point on this stack. It consists of
505bde11 4966 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4967 the subexpressions we're currently inside, plus the number of such
4968 registers, and, finally, two char *'s. The first char * is where
4969 to resume scanning the pattern; the second one is where to resume
505bde11 4970 scanning the strings. */
25fe55af 4971#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4972 fail_stack_type fail_stack;
4973#endif
4974#ifdef DEBUG
fa9a63c5
RM
4975 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4976#endif
4977
0b32bf0e 4978#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4979 /* This holds the pointer to the failure stack, when
4980 it is allocated relocatably. */
4981 fail_stack_elt_t *failure_stack_ptr;
99633e97 4982#endif
fa9a63c5
RM
4983
4984 /* We fill all the registers internally, independent of what we
25fe55af 4985 return, for use in backreferences. The number here includes
fa9a63c5 4986 an element for register zero. */
4bb91c68 4987 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4988
fa9a63c5
RM
4989 /* Information on the contents of registers. These are pointers into
4990 the input strings; they record just what was matched (on this
4991 attempt) by a subexpression part of the pattern, that is, the
4992 regnum-th regstart pointer points to where in the pattern we began
4993 matching and the regnum-th regend points to right after where we
4994 stopped matching the regnum-th subexpression. (The zeroth register
4995 keeps track of what the whole pattern matches.) */
4996#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4997 re_char **regstart, **regend;
fa9a63c5
RM
4998#endif
4999
fa9a63c5 5000 /* The following record the register info as found in the above
5e69f11e 5001 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5002 This happens as we backtrack through the failure points, which in
5003 turn happens only if we have not yet matched the entire string. */
5004 unsigned best_regs_set = false;
5005#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5006 re_char **best_regstart, **best_regend;
fa9a63c5 5007#endif
5e69f11e 5008
fa9a63c5
RM
5009 /* Logically, this is `best_regend[0]'. But we don't want to have to
5010 allocate space for that if we're not allocating space for anything
25fe55af 5011 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5012 any of the other register vectors, and it seems rather a kludge to
5013 treat `best_regend' differently than the rest. So we keep track of
5014 the end of the best match so far in a separate variable. We
5015 initialize this to NULL so that when we backtrack the first time
5016 and need to test it, it's not garbage. */
66f0296e 5017 re_char *match_end = NULL;
fa9a63c5 5018
fa9a63c5
RM
5019#ifdef DEBUG
5020 /* Counts the total number of registers pushed. */
5e69f11e 5021 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5022#endif
5023
5024 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5025
fa9a63c5 5026 INIT_FAIL_STACK ();
5e69f11e 5027
fa9a63c5
RM
5028#ifdef MATCH_MAY_ALLOCATE
5029 /* Do not bother to initialize all the register variables if there are
5030 no groups in the pattern, as it takes a fair amount of time. If
5031 there are groups, we include space for register 0 (the whole
5032 pattern), even though we never use it, since it simplifies the
5033 array indexing. We should fix this. */
5034 if (bufp->re_nsub)
5035 {
66f0296e
SM
5036 regstart = REGEX_TALLOC (num_regs, re_char *);
5037 regend = REGEX_TALLOC (num_regs, re_char *);
5038 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5039 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5040
505bde11 5041 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5042 {
5043 FREE_VARIABLES ();
5044 return -2;
5045 }
fa9a63c5
RM
5046 }
5047 else
5048 {
5049 /* We must initialize all our variables to NULL, so that
25fe55af 5050 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5051 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5052 }
5053#endif /* MATCH_MAY_ALLOCATE */
5054
5055 /* The starting position is bogus. */
5056 if (pos < 0 || pos > size1 + size2)
5057 {
5058 FREE_VARIABLES ();
5059 return -1;
5060 }
5e69f11e 5061
fa9a63c5
RM
5062 /* Initialize subexpression text positions to -1 to mark ones that no
5063 start_memory/stop_memory has been seen for. Also initialize the
5064 register information struct. */
01618498
SM
5065 for (reg = 1; reg < num_regs; reg++)
5066 regstart[reg] = regend[reg] = NULL;
99633e97 5067
fa9a63c5 5068 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 5069 `string1' is null. */
fa9a63c5
RM
5070 if (size2 == 0 && string1 != NULL)
5071 {
5072 string2 = string1;
5073 size2 = size1;
5074 string1 = 0;
5075 size1 = 0;
5076 }
5077 end1 = string1 + size1;
5078 end2 = string2 + size2;
5079
5e69f11e 5080 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5081 `dend' is the end of the input string that `d' points within. `d'
5082 is advanced into the following input string whenever necessary, but
5083 this happens before fetching; therefore, at the beginning of the
5084 loop, `d' can be pointing at the end of a string, but it cannot
5085 equal `string2'. */
419d1c74 5086 if (pos >= size1)
fa9a63c5 5087 {
419d1c74
SM
5088 /* Only match within string2. */
5089 d = string2 + pos - size1;
5090 dend = end_match_2 = string2 + stop - size1;
5091 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5092 }
5093 else
5094 {
f1ad044f 5095 if (stop < size1)
419d1c74
SM
5096 {
5097 /* Only match within string1. */
5098 end_match_1 = string1 + stop;
5099 /* BEWARE!
5100 When we reach end_match_1, PREFETCH normally switches to string2.
5101 But in the present case, this means that just doing a PREFETCH
5102 makes us jump from `stop' to `gap' within the string.
5103 What we really want here is for the search to stop as
5104 soon as we hit end_match_1. That's why we set end_match_2
5105 to end_match_1 (since PREFETCH fails as soon as we hit
5106 end_match_2). */
5107 end_match_2 = end_match_1;
5108 }
5109 else
f1ad044f
SM
5110 { /* It's important to use this code when stop == size so that
5111 moving `d' from end1 to string2 will not prevent the d == dend
5112 check from catching the end of string. */
419d1c74
SM
5113 end_match_1 = end1;
5114 end_match_2 = string2 + stop - size1;
5115 }
5116 d = string1 + pos;
5117 dend = end_match_1;
fa9a63c5
RM
5118 }
5119
5120 DEBUG_PRINT1 ("The compiled pattern is: ");
5121 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5122 DEBUG_PRINT1 ("The string to match is: `");
5123 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5124 DEBUG_PRINT1 ("'\n");
5e69f11e 5125
25fe55af 5126 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5127 function if the match is complete, or it drops through if the match
5128 fails at this starting point in the input data. */
5129 for (;;)
5130 {
505bde11 5131 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5132
5133 if (p == pend)
5134 { /* End of pattern means we might have succeeded. */
25fe55af 5135 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5136
fa9a63c5 5137 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5138 longest match, try backtracking. */
5139 if (d != end_match_2)
fa9a63c5
RM
5140 {
5141 /* 1 if this match ends in the same string (string1 or string2)
5142 as the best previous match. */
5e69f11e 5143 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5144 == FIRST_STRING_P (d));
fa9a63c5
RM
5145 /* 1 if this match is the best seen so far. */
5146 boolean best_match_p;
5147
5148 /* AIX compiler got confused when this was combined
25fe55af 5149 with the previous declaration. */
fa9a63c5
RM
5150 if (same_str_p)
5151 best_match_p = d > match_end;
5152 else
99633e97 5153 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5154
25fe55af
RS
5155 DEBUG_PRINT1 ("backtracking.\n");
5156
5157 if (!FAIL_STACK_EMPTY ())
5158 { /* More failure points to try. */
5159
5160 /* If exceeds best match so far, save it. */
5161 if (!best_regs_set || best_match_p)
5162 {
5163 best_regs_set = true;
5164 match_end = d;
5165
5166 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5167
01618498 5168 for (reg = 1; reg < num_regs; reg++)
25fe55af 5169 {
01618498
SM
5170 best_regstart[reg] = regstart[reg];
5171 best_regend[reg] = regend[reg];
25fe55af
RS
5172 }
5173 }
5174 goto fail;
5175 }
5176
5177 /* If no failure points, don't restore garbage. And if
5178 last match is real best match, don't restore second
5179 best one. */
5180 else if (best_regs_set && !best_match_p)
5181 {
5182 restore_best_regs:
5183 /* Restore best match. It may happen that `dend ==
5184 end_match_1' while the restored d is in string2.
5185 For example, the pattern `x.*y.*z' against the
5186 strings `x-' and `y-z-', if the two strings are
5187 not consecutive in memory. */
5188 DEBUG_PRINT1 ("Restoring best registers.\n");
5189
5190 d = match_end;
5191 dend = ((d >= string1 && d <= end1)
5192 ? end_match_1 : end_match_2);
fa9a63c5 5193
01618498 5194 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5195 {
01618498
SM
5196 regstart[reg] = best_regstart[reg];
5197 regend[reg] = best_regend[reg];
fa9a63c5 5198 }
25fe55af
RS
5199 }
5200 } /* d != end_match_2 */
fa9a63c5
RM
5201
5202 succeed_label:
25fe55af 5203 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5204
25fe55af
RS
5205 /* If caller wants register contents data back, do it. */
5206 if (regs && !bufp->no_sub)
fa9a63c5 5207 {
25fe55af
RS
5208 /* Have the register data arrays been allocated? */
5209 if (bufp->regs_allocated == REGS_UNALLOCATED)
5210 { /* No. So allocate them with malloc. We need one
5211 extra element beyond `num_regs' for the `-1' marker
5212 GNU code uses. */
5213 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5214 regs->start = TALLOC (regs->num_regs, regoff_t);
5215 regs->end = TALLOC (regs->num_regs, regoff_t);
5216 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5217 {
5218 FREE_VARIABLES ();
5219 return -2;
5220 }
25fe55af
RS
5221 bufp->regs_allocated = REGS_REALLOCATE;
5222 }
5223 else if (bufp->regs_allocated == REGS_REALLOCATE)
5224 { /* Yes. If we need more elements than were already
5225 allocated, reallocate them. If we need fewer, just
5226 leave it alone. */
5227 if (regs->num_regs < num_regs + 1)
5228 {
5229 regs->num_regs = num_regs + 1;
5230 RETALLOC (regs->start, regs->num_regs, regoff_t);
5231 RETALLOC (regs->end, regs->num_regs, regoff_t);
5232 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5233 {
5234 FREE_VARIABLES ();
5235 return -2;
5236 }
25fe55af
RS
5237 }
5238 }
5239 else
fa9a63c5
RM
5240 {
5241 /* These braces fend off a "empty body in an else-statement"
25fe55af 5242 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5243 assert (bufp->regs_allocated == REGS_FIXED);
5244 }
5245
25fe55af
RS
5246 /* Convert the pointer data in `regstart' and `regend' to
5247 indices. Register zero has to be set differently,
5248 since we haven't kept track of any info for it. */
5249 if (regs->num_regs > 0)
5250 {
5251 regs->start[0] = pos;
99633e97 5252 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5253 }
5e69f11e 5254
25fe55af
RS
5255 /* Go through the first `min (num_regs, regs->num_regs)'
5256 registers, since that is all we initialized. */
01618498 5257 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5258 {
01618498
SM
5259 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5260 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5261 else
5262 {
01618498
SM
5263 regs->start[reg]
5264 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5265 regs->end[reg]
5266 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5267 }
fa9a63c5 5268 }
5e69f11e 5269
25fe55af
RS
5270 /* If the regs structure we return has more elements than
5271 were in the pattern, set the extra elements to -1. If
5272 we (re)allocated the registers, this is the case,
5273 because we always allocate enough to have at least one
5274 -1 at the end. */
01618498
SM
5275 for (reg = num_regs; reg < regs->num_regs; reg++)
5276 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5277 } /* regs && !bufp->no_sub */
5278
25fe55af
RS
5279 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5280 nfailure_points_pushed, nfailure_points_popped,
5281 nfailure_points_pushed - nfailure_points_popped);
5282 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5283
99633e97 5284 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5285
25fe55af 5286 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5287
25fe55af
RS
5288 FREE_VARIABLES ();
5289 return mcnt;
5290 }
fa9a63c5 5291
25fe55af 5292 /* Otherwise match next pattern command. */
fa9a63c5
RM
5293 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5294 {
25fe55af
RS
5295 /* Ignore these. Used to ignore the n of succeed_n's which
5296 currently have n == 0. */
5297 case no_op:
5298 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5299 break;
fa9a63c5
RM
5300
5301 case succeed:
25fe55af 5302 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5303 goto succeed_label;
5304
25fe55af
RS
5305 /* Match the next n pattern characters exactly. The following
5306 byte in the pattern defines n, and the n bytes after that
5307 are the characters to match. */
fa9a63c5
RM
5308 case exactn:
5309 mcnt = *p++;
25fe55af 5310 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5311
99633e97
SM
5312 /* Remember the start point to rollback upon failure. */
5313 dfail = d;
5314
25fe55af
RS
5315 /* This is written out as an if-else so we don't waste time
5316 testing `translate' inside the loop. */
28703c16 5317 if (RE_TRANSLATE_P (translate))
fa9a63c5 5318 {
e934739e
RS
5319 if (multibyte)
5320 do
5321 {
5322 int pat_charlen, buf_charlen;
e71b1971 5323 unsigned int pat_ch, buf_ch;
e934739e
RS
5324
5325 PREFETCH ();
5326 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5327 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5328
5329 if (RE_TRANSLATE (translate, buf_ch)
5330 != pat_ch)
99633e97
SM
5331 {
5332 d = dfail;
5333 goto fail;
5334 }
e934739e
RS
5335
5336 p += pat_charlen;
5337 d += buf_charlen;
5338 mcnt -= pat_charlen;
5339 }
5340 while (mcnt > 0);
5341 else
e934739e
RS
5342 do
5343 {
cf1982d9
EZ
5344 /* Avoid compiler whining about comparison being
5345 always true. */
5346 int di;
5347
e934739e 5348 PREFETCH ();
cf1982d9
EZ
5349 di = *d;
5350 if (RE_TRANSLATE (translate, di) != *p++)
99633e97
SM
5351 {
5352 d = dfail;
5353 goto fail;
5354 }
33c46939 5355 d++;
e934739e
RS
5356 }
5357 while (--mcnt);
fa9a63c5
RM
5358 }
5359 else
5360 {
5361 do
5362 {
5363 PREFETCH ();
99633e97
SM
5364 if (*d++ != *p++)
5365 {
5366 d = dfail;
5367 goto fail;
5368 }
fa9a63c5
RM
5369 }
5370 while (--mcnt);
5371 }
25fe55af 5372 break;
fa9a63c5
RM
5373
5374
25fe55af 5375 /* Match any character except possibly a newline or a null. */
fa9a63c5 5376 case anychar:
e934739e
RS
5377 {
5378 int buf_charlen;
01618498 5379 re_wchar_t buf_ch;
fa9a63c5 5380
e934739e 5381 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5382
e934739e 5383 PREFETCH ();
2d1675e4 5384 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
5385 buf_ch = TRANSLATE (buf_ch);
5386
5387 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5388 && buf_ch == '\n')
5389 || ((bufp->syntax & RE_DOT_NOT_NULL)
5390 && buf_ch == '\000'))
5391 goto fail;
5392
e934739e
RS
5393 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5394 d += buf_charlen;
5395 }
fa9a63c5
RM
5396 break;
5397
5398
5399 case charset:
5400 case charset_not:
5401 {
b18215fc 5402 register unsigned int c;
fa9a63c5 5403 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5404 int len;
5405
5406 /* Start of actual range_table, or end of bitmap if there is no
5407 range table. */
01618498 5408 re_char *range_table;
b18215fc 5409
96cc36cc 5410 /* Nonzero if there is a range table. */
b18215fc
RS
5411 int range_table_exists;
5412
96cc36cc
RS
5413 /* Number of ranges of range table. This is not included
5414 in the initial byte-length of the command. */
5415 int count = 0;
fa9a63c5 5416
25fe55af 5417 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5418
b18215fc 5419 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5420
b18215fc 5421 if (range_table_exists)
96cc36cc
RS
5422 {
5423 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5424 EXTRACT_NUMBER_AND_INCR (count, range_table);
5425 }
b18215fc 5426
2d1675e4
SM
5427 PREFETCH ();
5428 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5429 c = TRANSLATE (c); /* The character to match. */
b18215fc
RS
5430
5431 if (SINGLE_BYTE_CHAR_P (c))
5432 { /* Lookup bitmap. */
b18215fc
RS
5433 /* Cast to `unsigned' instead of `unsigned char' in
5434 case the bit list is a full 32 bytes long. */
5435 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5436 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5437 not = !not;
b18215fc 5438 }
96cc36cc 5439#ifdef emacs
b18215fc 5440 else if (range_table_exists)
96cc36cc
RS
5441 {
5442 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5443
14473664
SM
5444 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5445 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5446 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5447 | (class_bits & BIT_SPACE && ISSPACE (c))
5448 | (class_bits & BIT_UPPER && ISUPPER (c))
5449 | (class_bits & BIT_WORD && ISWORD (c)))
5450 not = !not;
5451 else
5452 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5453 }
5454#endif /* emacs */
fa9a63c5 5455
96cc36cc
RS
5456 if (range_table_exists)
5457 p = CHARSET_RANGE_TABLE_END (range_table, count);
5458 else
5459 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5460
5461 if (!not) goto fail;
5e69f11e 5462
b18215fc 5463 d += len;
fa9a63c5
RM
5464 break;
5465 }
5466
5467
25fe55af 5468 /* The beginning of a group is represented by start_memory.
505bde11 5469 The argument is the register number. The text
25fe55af
RS
5470 matched within the group is recorded (in the internal
5471 registers data structure) under the register number. */
5472 case start_memory:
505bde11
SM
5473 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5474
5475 /* In case we need to undo this operation (via backtracking). */
5476 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5477
25fe55af 5478 regstart[*p] = d;
4bb91c68 5479 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5480 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5481
25fe55af 5482 /* Move past the register number and inner group count. */
505bde11 5483 p += 1;
25fe55af 5484 break;
fa9a63c5
RM
5485
5486
25fe55af 5487 /* The stop_memory opcode represents the end of a group. Its
505bde11 5488 argument is the same as start_memory's: the register number. */
fa9a63c5 5489 case stop_memory:
505bde11
SM
5490 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5491
5492 assert (!REG_UNSET (regstart[*p]));
5493 /* Strictly speaking, there should be code such as:
177c0ea7 5494
0b32bf0e 5495 assert (REG_UNSET (regend[*p]));
505bde11
SM
5496 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5497
5498 But the only info to be pushed is regend[*p] and it is known to
5499 be UNSET, so there really isn't anything to push.
5500 Not pushing anything, on the other hand deprives us from the
5501 guarantee that regend[*p] is UNSET since undoing this operation
5502 will not reset its value properly. This is not important since
5503 the value will only be read on the next start_memory or at
5504 the very end and both events can only happen if this stop_memory
5505 is *not* undone. */
fa9a63c5 5506
25fe55af 5507 regend[*p] = d;
fa9a63c5
RM
5508 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5509
25fe55af 5510 /* Move past the register number and the inner group count. */
505bde11 5511 p += 1;
25fe55af 5512 break;
fa9a63c5
RM
5513
5514
5515 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5516 followed by the numeric value of <digit> as the register number. */
5517 case duplicate:
fa9a63c5 5518 {
66f0296e 5519 register re_char *d2, *dend2;
25fe55af 5520 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5521 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5522
25fe55af
RS
5523 /* Can't back reference a group which we've never matched. */
5524 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5525 goto fail;
5e69f11e 5526
25fe55af
RS
5527 /* Where in input to try to start matching. */
5528 d2 = regstart[regno];
5e69f11e 5529
99633e97
SM
5530 /* Remember the start point to rollback upon failure. */
5531 dfail = d;
5532
25fe55af
RS
5533 /* Where to stop matching; if both the place to start and
5534 the place to stop matching are in the same string, then
5535 set to the place to stop, otherwise, for now have to use
5536 the end of the first string. */
fa9a63c5 5537
25fe55af 5538 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5539 == FIRST_STRING_P (regend[regno]))
5540 ? regend[regno] : end_match_1);
5541 for (;;)
5542 {
5543 /* If necessary, advance to next segment in register
25fe55af 5544 contents. */
fa9a63c5
RM
5545 while (d2 == dend2)
5546 {
5547 if (dend2 == end_match_2) break;
5548 if (dend2 == regend[regno]) break;
5549
25fe55af
RS
5550 /* End of string1 => advance to string2. */
5551 d2 = string2;
5552 dend2 = regend[regno];
fa9a63c5
RM
5553 }
5554 /* At end of register contents => success */
5555 if (d2 == dend2) break;
5556
5557 /* If necessary, advance to next segment in data. */
5558 PREFETCH ();
5559
5560 /* How many characters left in this segment to match. */
5561 mcnt = dend - d;
5e69f11e 5562
fa9a63c5 5563 /* Want how many consecutive characters we can match in
25fe55af
RS
5564 one shot, so, if necessary, adjust the count. */
5565 if (mcnt > dend2 - d2)
fa9a63c5 5566 mcnt = dend2 - d2;
5e69f11e 5567
fa9a63c5 5568 /* Compare that many; failure if mismatch, else move
25fe55af 5569 past them. */
28703c16 5570 if (RE_TRANSLATE_P (translate)
2d1675e4 5571 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5572 : memcmp (d, d2, mcnt))
99633e97
SM
5573 {
5574 d = dfail;
5575 goto fail;
5576 }
fa9a63c5 5577 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5578 }
5579 }
5580 break;
5581
5582
25fe55af 5583 /* begline matches the empty string at the beginning of the string
c0f9ea08 5584 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5585 case begline:
25fe55af 5586 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5587
25fe55af
RS
5588 if (AT_STRINGS_BEG (d))
5589 {
5590 if (!bufp->not_bol) break;
5591 }
419d1c74 5592 else
25fe55af 5593 {
419d1c74
SM
5594 unsigned char c;
5595 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5596 if (c == '\n')
419d1c74 5597 break;
25fe55af
RS
5598 }
5599 /* In all other cases, we fail. */
5600 goto fail;
fa9a63c5
RM
5601
5602
25fe55af 5603 /* endline is the dual of begline. */
fa9a63c5 5604 case endline:
25fe55af 5605 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5606
25fe55af
RS
5607 if (AT_STRINGS_END (d))
5608 {
5609 if (!bufp->not_eol) break;
5610 }
f1ad044f 5611 else
25fe55af 5612 {
f1ad044f 5613 PREFETCH_NOLIMIT ();
c0f9ea08 5614 if (*d == '\n')
f1ad044f 5615 break;
25fe55af
RS
5616 }
5617 goto fail;
fa9a63c5
RM
5618
5619
5620 /* Match at the very beginning of the data. */
25fe55af
RS
5621 case begbuf:
5622 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5623 if (AT_STRINGS_BEG (d))
5624 break;
5625 goto fail;
fa9a63c5
RM
5626
5627
5628 /* Match at the very end of the data. */
25fe55af
RS
5629 case endbuf:
5630 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5631 if (AT_STRINGS_END (d))
5632 break;
25fe55af 5633 goto fail;
5e69f11e 5634
5e69f11e 5635
25fe55af
RS
5636 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5637 pushes NULL as the value for the string on the stack. Then
505bde11 5638 `POP_FAILURE_POINT' will keep the current value for the
25fe55af
RS
5639 string, instead of restoring it. To see why, consider
5640 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5641 then the . fails against the \n. But the next thing we want
5642 to do is match the \n against the \n; if we restored the
5643 string value, we would be back at the foo.
5644
5645 Because this is used only in specific cases, we don't need to
5646 check all the things that `on_failure_jump' does, to make
5647 sure the right things get saved on the stack. Hence we don't
5648 share its code. The only reason to push anything on the
5649 stack at all is that otherwise we would have to change
5650 `anychar's code to do something besides goto fail in this
5651 case; that seems worse than this. */
5652 case on_failure_keep_string_jump:
505bde11
SM
5653 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5654 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5655 mcnt, p + mcnt);
fa9a63c5 5656
505bde11
SM
5657 PUSH_FAILURE_POINT (p - 3, NULL);
5658 break;
5659
0683b6fa
SM
5660 /* A nasty loop is introduced by the non-greedy *? and +?.
5661 With such loops, the stack only ever contains one failure point
5662 at a time, so that a plain on_failure_jump_loop kind of
5663 cycle detection cannot work. Worse yet, such a detection
5664 can not only fail to detect a cycle, but it can also wrongly
5665 detect a cycle (between different instantiations of the same
6df42991 5666 loop).
0683b6fa
SM
5667 So the method used for those nasty loops is a little different:
5668 We use a special cycle-detection-stack-frame which is pushed
5669 when the on_failure_jump_nastyloop failure-point is *popped*.
5670 This special frame thus marks the beginning of one iteration
5671 through the loop and we can hence easily check right here
5672 whether something matched between the beginning and the end of
5673 the loop. */
5674 case on_failure_jump_nastyloop:
5675 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5676 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5677 mcnt, p + mcnt);
5678
5679 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5680 {
5681 int cycle = 0;
5682 CHECK_INFINITE_LOOP (p - 4, d);
5683 if (!cycle)
5684 /* If there's a cycle, just continue without pushing
5685 this failure point. The failure point is the "try again"
5686 option, which shouldn't be tried.
5687 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5688 PUSH_FAILURE_POINT (p - 3, d);
5689 }
0683b6fa
SM
5690 break;
5691
4e8a9132
SM
5692 /* Simple loop detecting on_failure_jump: just check on the
5693 failure stack if the same spot was already hit earlier. */
505bde11
SM
5694 case on_failure_jump_loop:
5695 on_failure:
5696 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5697 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5698 mcnt, p + mcnt);
6df42991
SM
5699 {
5700 int cycle = 0;
5701 CHECK_INFINITE_LOOP (p - 3, d);
5702 if (cycle)
5703 /* If there's a cycle, get out of the loop, as if the matching
5704 had failed. We used to just `goto fail' here, but that was
5705 aborting the search a bit too early: we want to keep the
5706 empty-loop-match and keep matching after the loop.
5707 We want (x?)*y\1z to match both xxyz and xxyxz. */
5708 p += mcnt;
5709 else
5710 PUSH_FAILURE_POINT (p - 3, d);
5711 }
25fe55af 5712 break;
fa9a63c5
RM
5713
5714
5715 /* Uses of on_failure_jump:
5e69f11e 5716
25fe55af
RS
5717 Each alternative starts with an on_failure_jump that points
5718 to the beginning of the next alternative. Each alternative
5719 except the last ends with a jump that in effect jumps past
5720 the rest of the alternatives. (They really jump to the
5721 ending jump of the following alternative, because tensioning
5722 these jumps is a hassle.)
fa9a63c5 5723
25fe55af
RS
5724 Repeats start with an on_failure_jump that points past both
5725 the repetition text and either the following jump or
5726 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5727 case on_failure_jump:
25fe55af 5728 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5729 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5730 mcnt, p + mcnt);
25fe55af 5731
505bde11 5732 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5733 break;
5734
4e8a9132 5735 /* This operation is used for greedy *.
505bde11
SM
5736 Compare the beginning of the repeat with what in the
5737 pattern follows its end. If we can establish that there
5738 is nothing that they would both match, i.e., that we
5739 would have to backtrack because of (as in, e.g., `a*a')
5740 then we can use a non-backtracking loop based on
4e8a9132 5741 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5742 case on_failure_jump_smart:
25fe55af 5743 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5744 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5745 mcnt, p + mcnt);
25fe55af 5746 {
01618498 5747 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5748 /* Here, we discard `const', making re_match non-reentrant. */
5749 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5750 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5751
505bde11
SM
5752 p -= 3; /* Reset so that we will re-execute the
5753 instruction once it's been changed. */
fa9a63c5 5754
4e8a9132
SM
5755 EXTRACT_NUMBER (mcnt, p2 - 2);
5756
5757 /* Ensure this is a indeed the trivial kind of loop
5758 we are expecting. */
5759 assert (skip_one_char (p1) == p2 - 3);
5760 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5761 DEBUG_STATEMENT (debug += 2);
505bde11 5762 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5763 {
505bde11 5764 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5765 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5766 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5767 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5768 }
505bde11 5769 else
fa9a63c5 5770 {
505bde11
SM
5771 /* Default to a safe `on_failure_jump' loop. */
5772 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5773 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5774 }
99633e97 5775 DEBUG_STATEMENT (debug -= 2);
25fe55af 5776 }
505bde11 5777 break;
25fe55af
RS
5778
5779 /* Unconditionally jump (without popping any failure points). */
5780 case jump:
fa9a63c5 5781 unconditional_jump:
5b370c2b 5782 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5783 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5784 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5785 p += mcnt; /* Do the jump. */
505bde11 5786 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5787 break;
5788
5789
25fe55af
RS
5790 /* Have to succeed matching what follows at least n times.
5791 After that, handle like `on_failure_jump'. */
5792 case succeed_n:
01618498 5793 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5794 EXTRACT_NUMBER (mcnt, p + 2);
5795 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5796
dc1e502d
SM
5797 /* Originally, mcnt is how many times we HAVE to succeed. */
5798 if (mcnt != 0)
25fe55af 5799 {
6dcf2d0e
SM
5800 /* Here, we discard `const', making re_match non-reentrant. */
5801 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5802 mcnt--;
01618498
SM
5803 p += 4;
5804 PUSH_NUMBER (p2, mcnt);
25fe55af 5805 }
dc1e502d
SM
5806 else
5807 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5808 goto on_failure;
25fe55af
RS
5809 break;
5810
5811 case jump_n:
01618498 5812 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5813 EXTRACT_NUMBER (mcnt, p + 2);
5814 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5815
5816 /* Originally, this is how many times we CAN jump. */
dc1e502d 5817 if (mcnt != 0)
25fe55af 5818 {
6dcf2d0e
SM
5819 /* Here, we discard `const', making re_match non-reentrant. */
5820 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5821 mcnt--;
01618498 5822 PUSH_NUMBER (p2, mcnt);
dc1e502d 5823 goto unconditional_jump;
25fe55af
RS
5824 }
5825 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5826 else
5827 p += 4;
25fe55af 5828 break;
5e69f11e 5829
fa9a63c5
RM
5830 case set_number_at:
5831 {
01618498 5832 unsigned char *p2; /* Location of the counter. */
25fe55af 5833 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5834
25fe55af 5835 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5836 /* Here, we discard `const', making re_match non-reentrant. */
5837 p2 = (unsigned char*) p + mcnt;
01618498 5838 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5839 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5840 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5841 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5842 break;
5843 }
9121ca40
KH
5844
5845 case wordbound:
66f0296e
SM
5846 case notwordbound:
5847 not = (re_opcode_t) *(p - 1) == notwordbound;
5848 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5849
99633e97 5850 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5851
b18215fc 5852 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5853 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5854 not = !not;
b18215fc
RS
5855 else
5856 {
5857 /* C1 is the character before D, S1 is the syntax of C1, C2
5858 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5859 re_wchar_t c1, c2;
5860 int s1, s2;
b18215fc 5861#ifdef emacs
2d1675e4
SM
5862 int offset = PTR_TO_OFFSET (d - 1);
5863 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5864 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5865#endif
66f0296e 5866 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5867 s1 = SYNTAX (c1);
5868#ifdef emacs
5d967c7a 5869 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5870#endif
f1ad044f 5871 PREFETCH_NOLIMIT ();
2d1675e4 5872 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc
RS
5873 s2 = SYNTAX (c2);
5874
5875 if (/* Case 2: Only one of S1 and S2 is Sword. */
5876 ((s1 == Sword) != (s2 == Sword))
5877 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5878 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5879 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5880 not = !not;
5881 }
5882 if (not)
9121ca40 5883 break;
b18215fc 5884 else
9121ca40 5885 goto fail;
fa9a63c5
RM
5886
5887 case wordbeg:
25fe55af 5888 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5889
b18215fc
RS
5890 /* We FAIL in one of the following cases: */
5891
25fe55af 5892 /* Case 1: D is at the end of string. */
b18215fc 5893 if (AT_STRINGS_END (d))
99633e97 5894 goto fail;
b18215fc
RS
5895 else
5896 {
5897 /* C1 is the character before D, S1 is the syntax of C1, C2
5898 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5899 re_wchar_t c1, c2;
5900 int s1, s2;
fa9a63c5 5901#ifdef emacs
2d1675e4
SM
5902 int offset = PTR_TO_OFFSET (d);
5903 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5904 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5905#endif
99633e97 5906 PREFETCH ();
2d1675e4 5907 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc 5908 s2 = SYNTAX (c2);
177c0ea7 5909
b18215fc
RS
5910 /* Case 2: S2 is not Sword. */
5911 if (s2 != Sword)
5912 goto fail;
5913
5914 /* Case 3: D is not at the beginning of string ... */
5915 if (!AT_STRINGS_BEG (d))
5916 {
5917 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5918#ifdef emacs
5d967c7a 5919 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5920#endif
b18215fc
RS
5921 s1 = SYNTAX (c1);
5922
5923 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5924 returns 0. */
b18215fc
RS
5925 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5926 goto fail;
5927 }
5928 }
e318085a
RS
5929 break;
5930
b18215fc 5931 case wordend:
25fe55af 5932 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5933
5934 /* We FAIL in one of the following cases: */
5935
5936 /* Case 1: D is at the beginning of string. */
5937 if (AT_STRINGS_BEG (d))
e318085a 5938 goto fail;
b18215fc
RS
5939 else
5940 {
5941 /* C1 is the character before D, S1 is the syntax of C1, C2
5942 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5943 re_wchar_t c1, c2;
5944 int s1, s2;
5d967c7a 5945#ifdef emacs
2d1675e4
SM
5946 int offset = PTR_TO_OFFSET (d) - 1;
5947 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5948 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5949#endif
99633e97 5950 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5951 s1 = SYNTAX (c1);
5952
5953 /* Case 2: S1 is not Sword. */
5954 if (s1 != Sword)
5955 goto fail;
5956
5957 /* Case 3: D is not at the end of string ... */
5958 if (!AT_STRINGS_END (d))
5959 {
f1ad044f 5960 PREFETCH_NOLIMIT ();
2d1675e4 5961 c2 = RE_STRING_CHAR (d, dend - d);
5d967c7a 5962#ifdef emacs
134579f2 5963 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5d967c7a 5964#endif
b18215fc
RS
5965 s2 = SYNTAX (c2);
5966
5967 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5968 returns 0. */
b18215fc 5969 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5970 goto fail;
b18215fc
RS
5971 }
5972 }
e318085a
RS
5973 break;
5974
669fa600
SM
5975 case symbeg:
5976 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
5977
5978 /* We FAIL in one of the following cases: */
5979
5980 /* Case 1: D is at the end of string. */
5981 if (AT_STRINGS_END (d))
5982 goto fail;
5983 else
5984 {
5985 /* C1 is the character before D, S1 is the syntax of C1, C2
5986 is the character at D, and S2 is the syntax of C2. */
5987 re_wchar_t c1, c2;
5988 int s1, s2;
5989#ifdef emacs
5990 int offset = PTR_TO_OFFSET (d);
5991 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5992 UPDATE_SYNTAX_TABLE (charpos);
5993#endif
5994 PREFETCH ();
5995 c2 = RE_STRING_CHAR (d, dend - d);
5996 s2 = SYNTAX (c2);
5997
5998 /* Case 2: S2 is neither Sword nor Ssymbol. */
5999 if (s2 != Sword && s2 != Ssymbol)
6000 goto fail;
6001
6002 /* Case 3: D is not at the beginning of string ... */
6003 if (!AT_STRINGS_BEG (d))
6004 {
6005 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6006#ifdef emacs
6007 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6008#endif
6009 s1 = SYNTAX (c1);
6010
6011 /* ... and S1 is Sword or Ssymbol. */
6012 if (s1 == Sword || s1 == Ssymbol)
6013 goto fail;
6014 }
6015 }
6016 break;
6017
6018 case symend:
6019 DEBUG_PRINT1 ("EXECUTING symend.\n");
6020
6021 /* We FAIL in one of the following cases: */
6022
6023 /* Case 1: D is at the beginning of string. */
6024 if (AT_STRINGS_BEG (d))
6025 goto fail;
6026 else
6027 {
6028 /* C1 is the character before D, S1 is the syntax of C1, C2
6029 is the character at D, and S2 is the syntax of C2. */
6030 re_wchar_t c1, c2;
6031 int s1, s2;
6032#ifdef emacs
6033 int offset = PTR_TO_OFFSET (d) - 1;
6034 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6035 UPDATE_SYNTAX_TABLE (charpos);
6036#endif
6037 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6038 s1 = SYNTAX (c1);
6039
6040 /* Case 2: S1 is neither Ssymbol nor Sword. */
6041 if (s1 != Sword && s1 != Ssymbol)
6042 goto fail;
6043
6044 /* Case 3: D is not at the end of string ... */
6045 if (!AT_STRINGS_END (d))
6046 {
6047 PREFETCH_NOLIMIT ();
6048 c2 = RE_STRING_CHAR (d, dend - d);
6049#ifdef emacs
134579f2 6050 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6051#endif
6052 s2 = SYNTAX (c2);
6053
6054 /* ... and S2 is Sword or Ssymbol. */
6055 if (s2 == Sword || s2 == Ssymbol)
6056 goto fail;
6057 }
6058 }
6059 break;
6060
fa9a63c5 6061 case syntaxspec:
1fb352e0
SM
6062 case notsyntaxspec:
6063 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6064 mcnt = *p++;
1fb352e0 6065 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6066 PREFETCH ();
b18215fc
RS
6067#ifdef emacs
6068 {
2d1675e4
SM
6069 int offset = PTR_TO_OFFSET (d);
6070 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6071 UPDATE_SYNTAX_TABLE (pos1);
6072 }
25fe55af 6073#endif
b18215fc 6074 {
01618498
SM
6075 int len;
6076 re_wchar_t c;
b18215fc 6077
2d1675e4 6078 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6079
990b2375 6080 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6081 goto fail;
b18215fc
RS
6082 d += len;
6083 }
fa9a63c5
RM
6084 break;
6085
b18215fc 6086#ifdef emacs
1fb352e0
SM
6087 case before_dot:
6088 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6089 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6090 goto fail;
b18215fc
RS
6091 break;
6092
1fb352e0
SM
6093 case at_dot:
6094 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6095 if (PTR_BYTE_POS (d) != PT_BYTE)
6096 goto fail;
6097 break;
b18215fc 6098
1fb352e0
SM
6099 case after_dot:
6100 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6101 if (PTR_BYTE_POS (d) <= PT_BYTE)
6102 goto fail;
e318085a 6103 break;
fa9a63c5 6104
1fb352e0 6105 case categoryspec:
b18215fc 6106 case notcategoryspec:
1fb352e0 6107 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6108 mcnt = *p++;
1fb352e0 6109 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6110 PREFETCH ();
6111 {
01618498
SM
6112 int len;
6113 re_wchar_t c;
6114
2d1675e4 6115 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 6116
1fb352e0 6117 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6118 goto fail;
6119 d += len;
6120 }
fa9a63c5 6121 break;
5e69f11e 6122
1fb352e0 6123#endif /* emacs */
5e69f11e 6124
0b32bf0e
SM
6125 default:
6126 abort ();
fa9a63c5 6127 }
b18215fc 6128 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6129
6130
6131 /* We goto here if a matching operation fails. */
6132 fail:
5b370c2b 6133 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6134 if (!FAIL_STACK_EMPTY ())
505bde11 6135 {
01618498 6136 re_char *str, *pat;
505bde11 6137 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6138 DEBUG_PRINT1 ("\nFAIL:\n");
6139 POP_FAILURE_POINT (str, pat);
505bde11
SM
6140 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6141 {
6142 case on_failure_keep_string_jump:
6143 assert (str == NULL);
6144 goto continue_failure_jump;
6145
0683b6fa
SM
6146 case on_failure_jump_nastyloop:
6147 assert ((re_opcode_t)pat[-2] == no_op);
6148 PUSH_FAILURE_POINT (pat - 2, str);
6149 /* Fallthrough */
6150
505bde11
SM
6151 case on_failure_jump_loop:
6152 case on_failure_jump:
6153 case succeed_n:
6154 d = str;
6155 continue_failure_jump:
6156 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6157 p = pat + mcnt;
6158 break;
b18215fc 6159
0683b6fa
SM
6160 case no_op:
6161 /* A special frame used for nastyloops. */
6162 goto fail;
6163
505bde11
SM
6164 default:
6165 abort();
6166 }
fa9a63c5 6167
505bde11 6168 assert (p >= bufp->buffer && p <= pend);
b18215fc 6169
0b32bf0e 6170 if (d >= string1 && d <= end1)
fa9a63c5 6171 dend = end_match_1;
0b32bf0e 6172 }
fa9a63c5 6173 else
0b32bf0e 6174 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6175 } /* for (;;) */
6176
6177 if (best_regs_set)
6178 goto restore_best_regs;
6179
6180 FREE_VARIABLES ();
6181
b18215fc 6182 return -1; /* Failure to match. */
fa9a63c5
RM
6183} /* re_match_2 */
6184\f
6185/* Subroutine definitions for re_match_2. */
6186
fa9a63c5
RM
6187/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6188 bytes; nonzero otherwise. */
5e69f11e 6189
fa9a63c5 6190static int
2d1675e4
SM
6191bcmp_translate (s1, s2, len, translate, multibyte)
6192 re_char *s1, *s2;
fa9a63c5 6193 register int len;
6676cb1c 6194 RE_TRANSLATE_TYPE translate;
2d1675e4 6195 const int multibyte;
fa9a63c5 6196{
2d1675e4
SM
6197 register re_char *p1 = s1, *p2 = s2;
6198 re_char *p1_end = s1 + len;
6199 re_char *p2_end = s2 + len;
e934739e 6200
4bb91c68
SM
6201 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6202 different lengths, but relying on a single `len' would break this. -sm */
6203 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6204 {
e934739e 6205 int p1_charlen, p2_charlen;
01618498 6206 re_wchar_t p1_ch, p2_ch;
e934739e 6207
2d1675e4
SM
6208 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
6209 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
e934739e
RS
6210
6211 if (RE_TRANSLATE (translate, p1_ch)
6212 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6213 return 1;
e934739e
RS
6214
6215 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6216 }
e934739e
RS
6217
6218 if (p1 != p1_end || p2 != p2_end)
6219 return 1;
6220
fa9a63c5
RM
6221 return 0;
6222}
6223\f
6224/* Entry points for GNU code. */
6225
6226/* re_compile_pattern is the GNU regular expression compiler: it
6227 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6228 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6229
fa9a63c5
RM
6230 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6231 are set in BUFP on entry.
5e69f11e 6232
b18215fc 6233 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6234
6235const char *
6236re_compile_pattern (pattern, length, bufp)
6237 const char *pattern;
0b32bf0e 6238 size_t length;
fa9a63c5
RM
6239 struct re_pattern_buffer *bufp;
6240{
6241 reg_errcode_t ret;
5e69f11e 6242
1208f11a
RS
6243#ifdef emacs
6244 gl_state.current_syntax_table = current_buffer->syntax_table;
6245#endif
6246
fa9a63c5
RM
6247 /* GNU code is written to assume at least RE_NREGS registers will be set
6248 (and at least one extra will be -1). */
6249 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6250
fa9a63c5
RM
6251 /* And GNU code determines whether or not to get register information
6252 by passing null for the REGS argument to re_match, etc., not by
6253 setting no_sub. */
6254 bufp->no_sub = 0;
5e69f11e 6255
4bb91c68 6256 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6257
6258 if (!ret)
6259 return NULL;
6260 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6261}
c0f9ea08 6262WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6263\f
b18215fc
RS
6264/* Entry points compatible with 4.2 BSD regex library. We don't define
6265 them unless specifically requested. */
fa9a63c5 6266
0b32bf0e 6267#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6268
6269/* BSD has one and only one pattern buffer. */
6270static struct re_pattern_buffer re_comp_buf;
6271
6272char *
0b32bf0e 6273# ifdef _LIBC
48afdd44
RM
6274/* Make these definitions weak in libc, so POSIX programs can redefine
6275 these names if they don't use our functions, and still use
6276 regcomp/regexec below without link errors. */
6277weak_function
0b32bf0e 6278# endif
fa9a63c5
RM
6279re_comp (s)
6280 const char *s;
6281{
6282 reg_errcode_t ret;
5e69f11e 6283
fa9a63c5
RM
6284 if (!s)
6285 {
6286 if (!re_comp_buf.buffer)
0b32bf0e 6287 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6288 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6289 return 0;
6290 }
6291
6292 if (!re_comp_buf.buffer)
6293 {
6294 re_comp_buf.buffer = (unsigned char *) malloc (200);
6295 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6296 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6297 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6298 re_comp_buf.allocated = 200;
6299
6300 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6301 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6302 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6303 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6304 }
6305
6306 /* Since `re_exec' always passes NULL for the `regs' argument, we
6307 don't need to initialize the pattern buffer fields which affect it. */
6308
fa9a63c5 6309 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6310
fa9a63c5
RM
6311 if (!ret)
6312 return NULL;
6313
6314 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6315 return (char *) gettext (re_error_msgid[(int) ret]);
6316}
6317
6318
6319int
0b32bf0e 6320# ifdef _LIBC
48afdd44 6321weak_function
0b32bf0e 6322# endif
fa9a63c5
RM
6323re_exec (s)
6324 const char *s;
6325{
6326 const int len = strlen (s);
6327 return
6328 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6329}
6330#endif /* _REGEX_RE_COMP */
6331\f
6332/* POSIX.2 functions. Don't define these for Emacs. */
6333
6334#ifndef emacs
6335
6336/* regcomp takes a regular expression as a string and compiles it.
6337
b18215fc 6338 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6339 since POSIX says we shouldn't. Thus, we set
6340
6341 `buffer' to the compiled pattern;
6342 `used' to the length of the compiled pattern;
6343 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6344 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6345 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6346 `fastmap' to an allocated space for the fastmap;
6347 `fastmap_accurate' to zero;
fa9a63c5
RM
6348 `re_nsub' to the number of subexpressions in PATTERN.
6349
6350 PATTERN is the address of the pattern string.
6351
6352 CFLAGS is a series of bits which affect compilation.
6353
6354 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6355 use POSIX basic syntax.
6356
6357 If REG_NEWLINE is set, then . and [^...] don't match newline.
6358 Also, regexec will try a match beginning after every newline.
6359
6360 If REG_ICASE is set, then we considers upper- and lowercase
6361 versions of letters to be equivalent when matching.
6362
6363 If REG_NOSUB is set, then when PREG is passed to regexec, that
6364 routine will report only success or failure, and nothing about the
6365 registers.
6366
b18215fc 6367 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6368 the return codes and their meanings.) */
6369
6370int
6371regcomp (preg, pattern, cflags)
ada30c0e
SM
6372 regex_t *__restrict preg;
6373 const char *__restrict pattern;
fa9a63c5
RM
6374 int cflags;
6375{
6376 reg_errcode_t ret;
4bb91c68 6377 reg_syntax_t syntax
fa9a63c5
RM
6378 = (cflags & REG_EXTENDED) ?
6379 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6380
6381 /* regex_compile will allocate the space for the compiled pattern. */
6382 preg->buffer = 0;
6383 preg->allocated = 0;
6384 preg->used = 0;
5e69f11e 6385
c0f9ea08 6386 /* Try to allocate space for the fastmap. */
a073faa6 6387 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6388
fa9a63c5
RM
6389 if (cflags & REG_ICASE)
6390 {
6391 unsigned i;
5e69f11e 6392
6676cb1c 6393 preg->translate
a073faa6 6394 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
8ea094cf 6395 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6396 if (preg->translate == NULL)
0b32bf0e 6397 return (int) REG_ESPACE;
fa9a63c5
RM
6398
6399 /* Map uppercase characters to corresponding lowercase ones. */
6400 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6401 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6402 }
6403 else
6404 preg->translate = NULL;
6405
6406 /* If REG_NEWLINE is set, newlines are treated differently. */
6407 if (cflags & REG_NEWLINE)
6408 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6409 syntax &= ~RE_DOT_NEWLINE;
6410 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6411 }
6412 else
c0f9ea08 6413 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6414
6415 preg->no_sub = !!(cflags & REG_NOSUB);
6416
5e69f11e 6417 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6418 can use strlen here in compiling the pattern. */
4bb91c68 6419 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6420
fa9a63c5
RM
6421 /* POSIX doesn't distinguish between an unmatched open-group and an
6422 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6423 if (ret == REG_ERPAREN)
6424 ret = REG_EPAREN;
6425
6426 if (ret == REG_NOERROR && preg->fastmap)
6427 { /* Compute the fastmap now, since regexec cannot modify the pattern
6428 buffer. */
6429 re_compile_fastmap (preg);
6430 if (preg->can_be_null)
6431 { /* The fastmap can't be used anyway. */
6432 free (preg->fastmap);
6433 preg->fastmap = NULL;
6434 }
6435 }
fa9a63c5
RM
6436 return (int) ret;
6437}
c0f9ea08 6438WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6439
6440
6441/* regexec searches for a given pattern, specified by PREG, in the
6442 string STRING.
5e69f11e 6443
fa9a63c5 6444 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6445 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6446 least NMATCH elements, and we set them to the offsets of the
6447 corresponding matched substrings.
5e69f11e 6448
fa9a63c5
RM
6449 EFLAGS specifies `execution flags' which affect matching: if
6450 REG_NOTBOL is set, then ^ does not match at the beginning of the
6451 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6452
fa9a63c5
RM
6453 We return 0 if we find a match and REG_NOMATCH if not. */
6454
6455int
6456regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6457 const regex_t *__restrict preg;
6458 const char *__restrict string;
5e69f11e 6459 size_t nmatch;
9f2dbe01 6460 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6461 int eflags;
6462{
6463 int ret;
6464 struct re_registers regs;
6465 regex_t private_preg;
6466 int len = strlen (string);
c0f9ea08 6467 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6468
6469 private_preg = *preg;
5e69f11e 6470
fa9a63c5
RM
6471 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6472 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6473
fa9a63c5
RM
6474 /* The user has told us exactly how many registers to return
6475 information about, via `nmatch'. We have to pass that on to the
b18215fc 6476 matching routines. */
fa9a63c5 6477 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6478
fa9a63c5
RM
6479 if (want_reg_info)
6480 {
6481 regs.num_regs = nmatch;
4bb91c68
SM
6482 regs.start = TALLOC (nmatch * 2, regoff_t);
6483 if (regs.start == NULL)
0b32bf0e 6484 return (int) REG_NOMATCH;
4bb91c68 6485 regs.end = regs.start + nmatch;
fa9a63c5
RM
6486 }
6487
c0f9ea08
SM
6488 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6489 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6490 was a little bit longer but still only matching the real part.
6491 This works because the `endline' will check for a '\n' and will find a
6492 '\0', correctly deciding that this is not the end of a line.
6493 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6494 a convenient '\0' there. For all we know, the string could be preceded
6495 by '\n' which would throw things off. */
6496
fa9a63c5
RM
6497 /* Perform the searching operation. */
6498 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6499 /* start: */ 0, /* range: */ len,
6500 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6501
fa9a63c5
RM
6502 /* Copy the register information to the POSIX structure. */
6503 if (want_reg_info)
6504 {
6505 if (ret >= 0)
0b32bf0e
SM
6506 {
6507 unsigned r;
fa9a63c5 6508
0b32bf0e
SM
6509 for (r = 0; r < nmatch; r++)
6510 {
6511 pmatch[r].rm_so = regs.start[r];
6512 pmatch[r].rm_eo = regs.end[r];
6513 }
6514 }
fa9a63c5 6515
b18215fc 6516 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6517 free (regs.start);
fa9a63c5
RM
6518 }
6519
6520 /* We want zero return to mean success, unlike `re_search'. */
6521 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6522}
c0f9ea08 6523WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6524
6525
6526/* Returns a message corresponding to an error code, ERRCODE, returned
6527 from either regcomp or regexec. We don't use PREG here. */
6528
6529size_t
6530regerror (errcode, preg, errbuf, errbuf_size)
6531 int errcode;
6532 const regex_t *preg;
6533 char *errbuf;
6534 size_t errbuf_size;
6535{
6536 const char *msg;
6537 size_t msg_size;
6538
6539 if (errcode < 0
6540 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6541 /* Only error codes returned by the rest of the code should be passed
b18215fc 6542 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6543 code generates an invalid error code, then the program has a bug.
6544 Dump core so we can fix it. */
6545 abort ();
6546
6547 msg = gettext (re_error_msgid[errcode]);
6548
6549 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6550
fa9a63c5
RM
6551 if (errbuf_size != 0)
6552 {
6553 if (msg_size > errbuf_size)
0b32bf0e
SM
6554 {
6555 strncpy (errbuf, msg, errbuf_size - 1);
6556 errbuf[errbuf_size - 1] = 0;
6557 }
fa9a63c5 6558 else
0b32bf0e 6559 strcpy (errbuf, msg);
fa9a63c5
RM
6560 }
6561
6562 return msg_size;
6563}
c0f9ea08 6564WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6565
6566
6567/* Free dynamically allocated space used by PREG. */
6568
6569void
6570regfree (preg)
6571 regex_t *preg;
6572{
6573 if (preg->buffer != NULL)
6574 free (preg->buffer);
6575 preg->buffer = NULL;
5e69f11e 6576
fa9a63c5
RM
6577 preg->allocated = 0;
6578 preg->used = 0;
6579
6580 if (preg->fastmap != NULL)
6581 free (preg->fastmap);
6582 preg->fastmap = NULL;
6583 preg->fastmap_accurate = 0;
6584
6585 if (preg->translate != NULL)
6586 free (preg->translate);
6587 preg->translate = NULL;
6588}
c0f9ea08 6589WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6590
6591#endif /* not emacs */
ab5796a9
MB
6592
6593/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6594 (do not change this comment) */