*** empty log message ***
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
4bb91c68 5 Copyright (C) 1993,94,95,96,97,98,99,2000 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
ba4a8e51 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25fe55af 20 USA. */
fa9a63c5 21
505bde11 22/* TODO:
505bde11 23 - structure the opcode space into opcode+flag.
dc1e502d
SM
24 - merge with glibc's regex.[ch].
25 - replace succeed_n + jump_n with a combined operation so that the counter
26 can simply be decremented when popping the failure_point without having
27 to stack up failure_count entries.
f6a3f532 28 */
505bde11 29
fa9a63c5 30/* AIX requires this to be the first thing in the file. */
0b32bf0e 31#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
32 #pragma alloca
33#endif
34
68d96f02 35#undef _GNU_SOURCE
fa9a63c5
RM
36#define _GNU_SOURCE
37
38#ifdef HAVE_CONFIG_H
0b32bf0e 39# include <config.h>
fa9a63c5
RM
40#endif
41
4bb91c68
SM
42#if defined STDC_HEADERS && !defined emacs
43# include <stddef.h>
44#else
45/* We need this for `regex.h', and perhaps for the Emacs include files. */
46# include <sys/types.h>
47#endif
fa9a63c5 48
c0f9ea08
SM
49#ifdef _LIBC
50/* We have to keep the namespace clean. */
51# define regfree(preg) __regfree (preg)
52# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
53# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
54# define regerror(errcode, preg, errbuf, errbuf_size) \
55 __regerror(errcode, preg, errbuf, errbuf_size)
56# define re_set_registers(bu, re, nu, st, en) \
57 __re_set_registers (bu, re, nu, st, en)
58# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
59 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
60# define re_match(bufp, string, size, pos, regs) \
61 __re_match (bufp, string, size, pos, regs)
62# define re_search(bufp, string, size, startpos, range, regs) \
63 __re_search (bufp, string, size, startpos, range, regs)
64# define re_compile_pattern(pattern, length, bufp) \
65 __re_compile_pattern (pattern, length, bufp)
66# define re_set_syntax(syntax) __re_set_syntax (syntax)
67# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
68 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
69# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
70
71# define WEAK_ALIAS(a,b) weak_alias (a, b)
72
73/* We are also using some library internals. */
74# include <locale/localeinfo.h>
75# include <locale/elem-hash.h>
76# include <langinfo.h>
77#else
78# define WEAK_ALIAS(a,b)
79#endif
80
4bb91c68 81/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 82#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
83# include <libintl.h>
84#else
85# define gettext(msgid) (msgid)
86#endif
87
5e69f11e
RM
88#ifndef gettext_noop
89/* This define is so xgettext can find the internationalizable
90 strings. */
0b32bf0e 91# define gettext_noop(String) String
5e69f11e
RM
92#endif
93
fa9a63c5
RM
94/* The `emacs' switch turns on certain matching commands
95 that make sense only in Emacs. */
96#ifdef emacs
97
0b32bf0e
SM
98# include "lisp.h"
99# include "buffer.h"
b18215fc
RS
100
101/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 102# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 103
0b32bf0e
SM
104# include "syntax.h"
105# include "charset.h"
106# include "category.h"
fa9a63c5 107
0b32bf0e
SM
108# define malloc xmalloc
109# define realloc xrealloc
110# define free xfree
9abbd165 111
0b32bf0e
SM
112/* Converts the pointer to the char to BEG-based offset from the start. */
113# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
114# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
115
116# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
117# define RE_STRING_CHAR(p, s) \
4e8a9132 118 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
0b32bf0e 119# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
2d1675e4
SM
120 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
121
122/* Set C a (possibly multibyte) character before P. P points into a
123 string which is the virtual concatenation of STR1 (which ends at
124 END1) or STR2 (which ends at END2). */
0b32bf0e 125# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
2d1675e4
SM
126 do { \
127 if (multibyte) \
128 { \
0b32bf0e
SM
129 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
130 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
131 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
132 c = STRING_CHAR (dtemp, (p) - dtemp); \
2d1675e4
SM
133 } \
134 else \
135 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
136 } while (0)
137
4e8a9132 138
fa9a63c5
RM
139#else /* not emacs */
140
141/* If we are not linking with Emacs proper,
142 we can't use the relocating allocator
143 even if config.h says that we can. */
0b32bf0e 144# undef REL_ALLOC
fa9a63c5 145
0b32bf0e
SM
146# if defined STDC_HEADERS || defined _LIBC
147# include <stdlib.h>
148# else
fa9a63c5
RM
149char *malloc ();
150char *realloc ();
0b32bf0e 151# endif
fa9a63c5 152
9e4ecb26 153/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 154 If nothing else has been done, use the method below. */
0b32bf0e
SM
155# ifdef INHIBIT_STRING_HEADER
156# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
157# if !defined bzero && !defined bcopy
158# undef INHIBIT_STRING_HEADER
159# endif
160# endif
161# endif
9e4ecb26 162
4bb91c68 163/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
164 This is used in most programs--a few other programs avoid this
165 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
166# ifndef INHIBIT_STRING_HEADER
167# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
168# include <string.h>
0b32bf0e 169# ifndef bzero
4bb91c68
SM
170# ifndef _LIBC
171# define bzero(s, n) (memset (s, '\0', n), (s))
172# else
173# define bzero(s, n) __bzero (s, n)
174# endif
0b32bf0e
SM
175# endif
176# else
177# include <strings.h>
4bb91c68
SM
178# ifndef memcmp
179# define memcmp(s1, s2, n) bcmp (s1, s2, n)
180# endif
181# ifndef memcpy
182# define memcpy(d, s, n) (bcopy (s, d, n), (d))
183# endif
0b32bf0e
SM
184# endif
185# endif
fa9a63c5
RM
186
187/* Define the syntax stuff for \<, \>, etc. */
188
990b2375
SM
189/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
190enum syntaxcode { Swhitespace = 0, Sword = 1 };
fa9a63c5 191
0b32bf0e
SM
192# ifdef SWITCH_ENUM_BUG
193# define SWITCH_ENUM_CAST(x) ((int)(x))
194# else
195# define SWITCH_ENUM_CAST(x) (x)
196# endif
fa9a63c5 197
e934739e 198/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
199# define BASE_LEADING_CODE_P(c) (0)
200# define CHAR_CHARSET(c) 0
201# define CHARSET_LEADING_CODE_BASE(c) 0
202# define MAX_MULTIBYTE_LENGTH 1
203# define RE_MULTIBYTE_P(x) 0
204# define WORD_BOUNDARY_P(c1, c2) (0)
205# define CHAR_HEAD_P(p) (1)
206# define SINGLE_BYTE_CHAR_P(c) (1)
207# define SAME_CHARSET_P(c1, c2) (1)
208# define MULTIBYTE_FORM_LENGTH(p, s) (1)
209# define STRING_CHAR(p, s) (*(p))
210# define RE_STRING_CHAR STRING_CHAR
211# define CHAR_STRING(c, s) (*(s) = (c), 1)
212# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
213# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
214# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 215 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
0b32bf0e 216# define MAKE_CHAR(charset, c1, c2) (c1)
fa9a63c5 217#endif /* not emacs */
4e8a9132
SM
218
219#ifndef RE_TRANSLATE
0b32bf0e
SM
220# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
221# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 222#endif
fa9a63c5
RM
223\f
224/* Get the interface, including the syntax bits. */
225#include "regex.h"
226
f71b19b6
DL
227/* isalpha etc. are used for the character classes. */
228#include <ctype.h>
fa9a63c5 229
f71b19b6 230#ifdef emacs
fa9a63c5 231
f71b19b6 232/* 1 if C is an ASCII character. */
0b32bf0e 233# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 234
f71b19b6 235/* 1 if C is a unibyte character. */
0b32bf0e 236# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 237
f71b19b6 238/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 239
f71b19b6 240/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
241# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
242# define ISCNTRL(c) ((c) < ' ')
243# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
244 || ((c) >= 'a' && (c) <= 'f') \
245 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
246
247/* This is only used for single-byte characters. */
0b32bf0e 248# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
249
250/* The rest must handle multibyte characters. */
251
0b32bf0e 252# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 253 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
254 : 1)
255
0b32bf0e 256# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 257 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
258 : 1)
259
0b32bf0e 260# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
261 ? (((c) >= 'a' && (c) <= 'z') \
262 || ((c) >= 'A' && (c) <= 'Z') \
263 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
264 : SYNTAX (c) == Sword)
265
0b32bf0e 266# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
267 ? (((c) >= 'a' && (c) <= 'z') \
268 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
269 : SYNTAX (c) == Sword)
270
0b32bf0e 271# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 272
0b32bf0e 273# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
274 ? ((c) > ' ' && (c) < 0177 \
275 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
276 || ((c) >= 'A' && (c) <= 'Z') \
277 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
278 : SYNTAX (c) != Sword)
279
0b32bf0e 280# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 281
0b32bf0e 282# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 283
0b32bf0e 284# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
285
286#else /* not emacs */
287
f71b19b6
DL
288/* Jim Meyering writes:
289
290 "... Some ctype macros are valid only for character codes that
291 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
292 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 293 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
294 STDC_HEADERS is defined, then autoconf has verified that the ctype
295 macros don't need to be guarded with references to isascii. ...
296 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
297 eliminate the && through constant folding."
298 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 299
4bb91c68 300# undef ISASCII
0b32bf0e
SM
301# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
302# define ISASCII(c) 1
303# else
304# define ISASCII(c) isascii(c)
305# endif
f71b19b6
DL
306
307/* 1 if C is an ASCII character. */
0b32bf0e 308# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
309
310/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
311# define ISUNIBYTE(c) 1
312
313# ifdef isblank
314# define ISBLANK(c) (ISASCII (c) && isblank (c))
315# else
316# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
317# endif
318# ifdef isgraph
319# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
320# else
321# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
322# endif
323
4bb91c68 324# undef ISPRINT
0b32bf0e
SM
325# define ISPRINT(c) (ISASCII (c) && isprint (c))
326# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
327# define ISALNUM(c) (ISASCII (c) && isalnum (c))
328# define ISALPHA(c) (ISASCII (c) && isalpha (c))
329# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
330# define ISLOWER(c) (ISASCII (c) && islower (c))
331# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
332# define ISSPACE(c) (ISASCII (c) && isspace (c))
333# define ISUPPER(c) (ISASCII (c) && isupper (c))
334# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
335
336# define ISWORD(c) ISALPHA(c)
337
4bb91c68
SM
338# ifdef _tolower
339# define TOLOWER(c) _tolower(c)
340# else
341# define TOLOWER(c) tolower(c)
342# endif
343
344/* How many characters in the character set. */
345# define CHAR_SET_SIZE 256
346
0b32bf0e 347# ifdef SYNTAX_TABLE
f71b19b6 348
0b32bf0e 349extern char *re_syntax_table;
f71b19b6 350
0b32bf0e
SM
351# else /* not SYNTAX_TABLE */
352
0b32bf0e
SM
353static char re_syntax_table[CHAR_SET_SIZE];
354
355static void
356init_syntax_once ()
357{
358 register int c;
359 static int done = 0;
360
361 if (done)
362 return;
363
364 bzero (re_syntax_table, sizeof re_syntax_table);
365
4bb91c68
SM
366 for (c = 0; c < CHAR_SET_SIZE; ++c)
367 if (ISALNUM (c))
368 re_syntax_table[c] = Sword;
fa9a63c5 369
0b32bf0e 370 re_syntax_table['_'] = Sword;
fa9a63c5 371
0b32bf0e
SM
372 done = 1;
373}
374
375# endif /* not SYNTAX_TABLE */
96cc36cc 376
4bb91c68
SM
377# define SYNTAX(c) re_syntax_table[(c)]
378
96cc36cc
RS
379#endif /* not emacs */
380\f
fa9a63c5 381#ifndef NULL
0b32bf0e 382# define NULL (void *)0
fa9a63c5
RM
383#endif
384
385/* We remove any previous definition of `SIGN_EXTEND_CHAR',
386 since ours (we hope) works properly with all combinations of
387 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 388 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
389#undef SIGN_EXTEND_CHAR
390#if __STDC__
0b32bf0e 391# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
392#else /* not __STDC__ */
393/* As in Harbison and Steele. */
0b32bf0e 394# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
395#endif
396\f
397/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
398 use `alloca' instead of `malloc'. This is because using malloc in
399 re_search* or re_match* could cause memory leaks when C-g is used in
400 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
401 the other hand, malloc is more portable, and easier to debug.
402
fa9a63c5
RM
403 Because we sometimes use alloca, some routines have to be macros,
404 not functions -- `alloca'-allocated space disappears at the end of the
405 function it is called in. */
406
407#ifdef REGEX_MALLOC
408
0b32bf0e
SM
409# define REGEX_ALLOCATE malloc
410# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
411# define REGEX_FREE free
fa9a63c5
RM
412
413#else /* not REGEX_MALLOC */
414
415/* Emacs already defines alloca, sometimes. */
0b32bf0e 416# ifndef alloca
fa9a63c5
RM
417
418/* Make alloca work the best possible way. */
0b32bf0e
SM
419# ifdef __GNUC__
420# define alloca __builtin_alloca
421# else /* not __GNUC__ */
422# if HAVE_ALLOCA_H
423# include <alloca.h>
424# endif /* HAVE_ALLOCA_H */
425# endif /* not __GNUC__ */
fa9a63c5 426
0b32bf0e 427# endif /* not alloca */
fa9a63c5 428
0b32bf0e 429# define REGEX_ALLOCATE alloca
fa9a63c5
RM
430
431/* Assumes a `char *destination' variable. */
0b32bf0e 432# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 433 (destination = (char *) alloca (nsize), \
4bb91c68 434 memcpy (destination, source, osize))
fa9a63c5
RM
435
436/* No need to do anything to free, after alloca. */
0b32bf0e 437# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
438
439#endif /* not REGEX_MALLOC */
440
441/* Define how to allocate the failure stack. */
442
0b32bf0e 443#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 444
0b32bf0e 445# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 446 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 447# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 448 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 449# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
450 r_alloc_free (&failure_stack_ptr)
451
4297555e 452#else /* not using relocating allocator */
fa9a63c5 453
0b32bf0e 454# ifdef REGEX_MALLOC
fa9a63c5 455
0b32bf0e
SM
456# define REGEX_ALLOCATE_STACK malloc
457# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
458# define REGEX_FREE_STACK free
fa9a63c5 459
0b32bf0e 460# else /* not REGEX_MALLOC */
fa9a63c5 461
0b32bf0e 462# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 463
0b32bf0e 464# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 465 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 466/* No need to explicitly free anything. */
0b32bf0e 467# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 468
0b32bf0e 469# endif /* not REGEX_MALLOC */
4297555e 470#endif /* not using relocating allocator */
fa9a63c5
RM
471
472
473/* True if `size1' is non-NULL and PTR is pointing anywhere inside
474 `string1' or just past its end. This works if PTR is NULL, which is
475 a good thing. */
25fe55af 476#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
477 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
478
479/* (Re)Allocate N items of type T using malloc, or fail. */
480#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
481#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
482#define RETALLOC_IF(addr, n, t) \
483 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
484#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
485
4bb91c68 486#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
487
488#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
489
490#undef MAX
491#undef MIN
492#define MAX(a, b) ((a) > (b) ? (a) : (b))
493#define MIN(a, b) ((a) < (b) ? (a) : (b))
494
66f0296e
SM
495/* Type of source-pattern and string chars. */
496typedef const unsigned char re_char;
497
fa9a63c5
RM
498typedef char boolean;
499#define false 0
500#define true 1
501
4bb91c68
SM
502static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
503 re_char *string1, int size1,
504 re_char *string2, int size2,
505 int pos,
506 struct re_registers *regs,
507 int stop));
fa9a63c5
RM
508\f
509/* These are the command codes that appear in compiled regular
4bb91c68 510 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
511 command code can specify any interpretation whatsoever for its
512 arguments. Zero bytes may appear in the compiled regular expression. */
513
514typedef enum
515{
516 no_op = 0,
517
4bb91c68 518 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
519 succeed,
520
25fe55af 521 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
522 exactn,
523
25fe55af 524 /* Matches any (more or less) character. */
fa9a63c5
RM
525 anychar,
526
25fe55af
RS
527 /* Matches any one char belonging to specified set. First
528 following byte is number of bitmap bytes. Then come bytes
529 for a bitmap saying which chars are in. Bits in each byte
530 are ordered low-bit-first. A character is in the set if its
531 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
532 automatically not in the set.
533
534 If the length byte has the 0x80 bit set, then that stuff
535 is followed by a range table:
536 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 537 See RANGE_TABLE_WORK_BITS below.
96cc36cc
RS
538 2 bytes, the number of pairs that follow
539 pairs, each 2 multibyte characters,
0b32bf0e 540 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
541 charset,
542
25fe55af 543 /* Same parameters as charset, but match any character that is
4bb91c68 544 not one of those specified. */
fa9a63c5
RM
545 charset_not,
546
25fe55af
RS
547 /* Start remembering the text that is matched, for storing in a
548 register. Followed by one byte with the register number, in
549 the range 0 to one less than the pattern buffer's re_nsub
505bde11 550 field. */
fa9a63c5
RM
551 start_memory,
552
25fe55af
RS
553 /* Stop remembering the text that is matched and store it in a
554 memory register. Followed by one byte with the register
555 number, in the range 0 to one less than `re_nsub' in the
505bde11 556 pattern buffer. */
fa9a63c5
RM
557 stop_memory,
558
25fe55af 559 /* Match a duplicate of something remembered. Followed by one
4bb91c68 560 byte containing the register number. */
fa9a63c5
RM
561 duplicate,
562
25fe55af 563 /* Fail unless at beginning of line. */
fa9a63c5
RM
564 begline,
565
4bb91c68 566 /* Fail unless at end of line. */
fa9a63c5
RM
567 endline,
568
25fe55af
RS
569 /* Succeeds if at beginning of buffer (if emacs) or at beginning
570 of string to be matched (if not). */
fa9a63c5
RM
571 begbuf,
572
25fe55af 573 /* Analogously, for end of buffer/string. */
fa9a63c5 574 endbuf,
5e69f11e 575
25fe55af 576 /* Followed by two byte relative address to which to jump. */
5e69f11e 577 jump,
fa9a63c5 578
25fe55af
RS
579 /* Followed by two-byte relative address of place to resume at
580 in case of failure. */
fa9a63c5 581 on_failure_jump,
5e69f11e 582
25fe55af
RS
583 /* Like on_failure_jump, but pushes a placeholder instead of the
584 current string position when executed. */
fa9a63c5 585 on_failure_keep_string_jump,
5e69f11e 586
505bde11
SM
587 /* Just like `on_failure_jump', except that it checks that we
588 don't get stuck in an infinite loop (matching an empty string
589 indefinitely). */
590 on_failure_jump_loop,
591
0683b6fa
SM
592 /* Just like `on_failure_jump_loop', except that it checks for
593 a different kind of loop (the kind that shows up with non-greedy
594 operators). This operation has to be immediately preceded
595 by a `no_op'. */
596 on_failure_jump_nastyloop,
597
0b32bf0e 598 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
599 It analyses the loop before which it is put and if the
600 loop does not require backtracking, it changes itself to
4e8a9132
SM
601 `on_failure_keep_string_jump' and short-circuits the loop,
602 else it just defaults to changing itself into `on_failure_jump'.
603 It assumes that it is pointing to just past a `jump'. */
505bde11 604 on_failure_jump_smart,
fa9a63c5 605
25fe55af 606 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
607 After matching N times, jump to the address upon failure.
608 Does not work if N starts at 0: use on_failure_jump_loop
609 instead. */
fa9a63c5
RM
610 succeed_n,
611
25fe55af
RS
612 /* Followed by two-byte relative address, and two-byte number n.
613 Jump to the address N times, then fail. */
fa9a63c5
RM
614 jump_n,
615
25fe55af
RS
616 /* Set the following two-byte relative address to the
617 subsequent two-byte number. The address *includes* the two
618 bytes of number. */
fa9a63c5
RM
619 set_number_at,
620
fa9a63c5
RM
621 wordbeg, /* Succeeds if at word beginning. */
622 wordend, /* Succeeds if at word end. */
623
624 wordbound, /* Succeeds if at a word boundary. */
1fb352e0 625 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5
RM
626
627 /* Matches any character whose syntax is specified. Followed by
25fe55af 628 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
629 syntaxspec,
630
631 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
632 notsyntaxspec
633
634#ifdef emacs
635 ,before_dot, /* Succeeds if before point. */
636 at_dot, /* Succeeds if at point. */
637 after_dot, /* Succeeds if after point. */
b18215fc
RS
638
639 /* Matches any character whose category-set contains the specified
25fe55af
RS
640 category. The operator is followed by a byte which contains a
641 category code (mnemonic ASCII character). */
b18215fc
RS
642 categoryspec,
643
644 /* Matches any character whose category-set does not contain the
645 specified category. The operator is followed by a byte which
646 contains the category code (mnemonic ASCII character). */
647 notcategoryspec
fa9a63c5
RM
648#endif /* emacs */
649} re_opcode_t;
650\f
651/* Common operations on the compiled pattern. */
652
653/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
654
655#define STORE_NUMBER(destination, number) \
656 do { \
657 (destination)[0] = (number) & 0377; \
658 (destination)[1] = (number) >> 8; \
659 } while (0)
660
661/* Same as STORE_NUMBER, except increment DESTINATION to
662 the byte after where the number is stored. Therefore, DESTINATION
663 must be an lvalue. */
664
665#define STORE_NUMBER_AND_INCR(destination, number) \
666 do { \
667 STORE_NUMBER (destination, number); \
668 (destination) += 2; \
669 } while (0)
670
671/* Put into DESTINATION a number stored in two contiguous bytes starting
672 at SOURCE. */
673
674#define EXTRACT_NUMBER(destination, source) \
675 do { \
676 (destination) = *(source) & 0377; \
677 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
678 } while (0)
679
680#ifdef DEBUG
4bb91c68 681static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
682static void
683extract_number (dest, source)
684 int *dest;
685 unsigned char *source;
686{
5e69f11e 687 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
688 *dest = *source & 0377;
689 *dest += temp << 8;
690}
691
4bb91c68 692# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
693# undef EXTRACT_NUMBER
694# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
695# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
696
697#endif /* DEBUG */
698
699/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
700 SOURCE must be an lvalue. */
701
702#define EXTRACT_NUMBER_AND_INCR(destination, source) \
703 do { \
704 EXTRACT_NUMBER (destination, source); \
25fe55af 705 (source) += 2; \
fa9a63c5
RM
706 } while (0)
707
708#ifdef DEBUG
4bb91c68
SM
709static void extract_number_and_incr _RE_ARGS ((int *destination,
710 re_char **source));
fa9a63c5
RM
711static void
712extract_number_and_incr (destination, source)
713 int *destination;
714 unsigned char **source;
5e69f11e 715{
fa9a63c5
RM
716 extract_number (destination, *source);
717 *source += 2;
718}
719
0b32bf0e
SM
720# ifndef EXTRACT_MACROS
721# undef EXTRACT_NUMBER_AND_INCR
722# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 723 extract_number_and_incr (&dest, &src)
0b32bf0e 724# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
725
726#endif /* DEBUG */
727\f
b18215fc
RS
728/* Store a multibyte character in three contiguous bytes starting
729 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 730 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
731
732#define STORE_CHARACTER_AND_INCR(destination, character) \
733 do { \
734 (destination)[0] = (character) & 0377; \
735 (destination)[1] = ((character) >> 8) & 0377; \
736 (destination)[2] = (character) >> 16; \
737 (destination) += 3; \
738 } while (0)
739
740/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 741 starting at SOURCE. */
b18215fc
RS
742
743#define EXTRACT_CHARACTER(destination, source) \
744 do { \
745 (destination) = ((source)[0] \
746 | ((source)[1] << 8) \
747 | ((source)[2] << 16)); \
748 } while (0)
749
750
751/* Macros for charset. */
752
753/* Size of bitmap of charset P in bytes. P is a start of charset,
754 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
755#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
756
757/* Nonzero if charset P has range table. */
25fe55af 758#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
759
760/* Return the address of range table of charset P. But not the start
761 of table itself, but the before where the number of ranges is
96cc36cc
RS
762 stored. `2 +' means to skip re_opcode_t and size of bitmap,
763 and the 2 bytes of flags at the start of the range table. */
764#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
765
766/* Extract the bit flags that start a range table. */
767#define CHARSET_RANGE_TABLE_BITS(p) \
768 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
769 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
770
771/* Test if C is listed in the bitmap of charset P. */
772#define CHARSET_LOOKUP_BITMAP(p, c) \
773 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
774 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
775
776/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
777 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
778 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
779 and end. */
780#define CHARSET_RANGE_TABLE_END(range_table, count) \
781 ((range_table) + (count) * 2 * 3)
782
25fe55af 783/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
784 COUNT is number of ranges in RANGE_TABLE. */
785#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
786 do \
787 { \
788 int range_start, range_end; \
789 unsigned char *p; \
790 unsigned char *range_table_end \
791 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
792 \
793 for (p = (range_table); p < range_table_end; p += 2 * 3) \
794 { \
795 EXTRACT_CHARACTER (range_start, p); \
796 EXTRACT_CHARACTER (range_end, p + 3); \
797 \
798 if (range_start <= (c) && (c) <= range_end) \
799 { \
800 (not) = !(not); \
801 break; \
802 } \
803 } \
804 } \
805 while (0)
806
807/* Test if C is in range table of CHARSET. The flag NOT is negated if
808 C is listed in it. */
809#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
810 do \
811 { \
812 /* Number of ranges in range table. */ \
813 int count; \
814 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
815 \
816 EXTRACT_NUMBER_AND_INCR (count, range_table); \
817 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
818 } \
819 while (0)
820\f
fa9a63c5
RM
821/* If DEBUG is defined, Regex prints many voluminous messages about what
822 it is doing (if the variable `debug' is nonzero). If linked with the
823 main program in `iregex.c', you can enter patterns and strings
824 interactively. And if linked with the main program in `main.c' and
4bb91c68 825 the other test files, you can run the already-written tests. */
fa9a63c5
RM
826
827#ifdef DEBUG
828
829/* We use standard I/O for debugging. */
0b32bf0e 830# include <stdio.h>
fa9a63c5
RM
831
832/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 833# include <assert.h>
fa9a63c5 834
99633e97 835static int debug = -100000;
fa9a63c5 836
0b32bf0e
SM
837# define DEBUG_STATEMENT(e) e
838# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
839# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
840# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
841# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
842# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 843 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 844# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 845 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
846
847
848/* Print the fastmap in human-readable form. */
849
850void
851print_fastmap (fastmap)
852 char *fastmap;
853{
854 unsigned was_a_range = 0;
5e69f11e
RM
855 unsigned i = 0;
856
fa9a63c5
RM
857 while (i < (1 << BYTEWIDTH))
858 {
859 if (fastmap[i++])
860 {
861 was_a_range = 0;
25fe55af
RS
862 putchar (i - 1);
863 while (i < (1 << BYTEWIDTH) && fastmap[i])
864 {
865 was_a_range = 1;
866 i++;
867 }
fa9a63c5 868 if (was_a_range)
25fe55af
RS
869 {
870 printf ("-");
871 putchar (i - 1);
872 }
873 }
fa9a63c5 874 }
5e69f11e 875 putchar ('\n');
fa9a63c5
RM
876}
877
878
879/* Print a compiled pattern string in human-readable form, starting at
880 the START pointer into it and ending just before the pointer END. */
881
882void
883print_partial_compiled_pattern (start, end)
884 unsigned char *start;
885 unsigned char *end;
886{
887 int mcnt, mcnt2;
888 unsigned char *p = start;
889 unsigned char *pend = end;
890
891 if (start == NULL)
892 {
893 printf ("(null)\n");
894 return;
895 }
5e69f11e 896
fa9a63c5
RM
897 /* Loop over pattern commands. */
898 while (p < pend)
899 {
900 printf ("%d:\t", p - start);
901
902 switch ((re_opcode_t) *p++)
903 {
25fe55af
RS
904 case no_op:
905 printf ("/no_op");
906 break;
fa9a63c5 907
99633e97
SM
908 case succeed:
909 printf ("/succeed");
910 break;
911
fa9a63c5
RM
912 case exactn:
913 mcnt = *p++;
25fe55af
RS
914 printf ("/exactn/%d", mcnt);
915 do
fa9a63c5 916 {
25fe55af 917 putchar ('/');
fa9a63c5 918 putchar (*p++);
25fe55af
RS
919 }
920 while (--mcnt);
921 break;
fa9a63c5
RM
922
923 case start_memory:
505bde11 924 printf ("/start_memory/%d", *p++);
25fe55af 925 break;
fa9a63c5
RM
926
927 case stop_memory:
505bde11 928 printf ("/stop_memory/%d", *p++);
25fe55af 929 break;
fa9a63c5
RM
930
931 case duplicate:
932 printf ("/duplicate/%d", *p++);
933 break;
934
935 case anychar:
936 printf ("/anychar");
937 break;
938
939 case charset:
25fe55af
RS
940 case charset_not:
941 {
942 register int c, last = -100;
fa9a63c5 943 register int in_range = 0;
99633e97
SM
944 int length = CHARSET_BITMAP_SIZE (p - 1);
945 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5
RM
946
947 printf ("/charset [%s",
25fe55af 948 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 949
25fe55af 950 assert (p + *p < pend);
fa9a63c5 951
25fe55af 952 for (c = 0; c < 256; c++)
96cc36cc 953 if (c / 8 < length
fa9a63c5
RM
954 && (p[1 + (c/8)] & (1 << (c % 8))))
955 {
956 /* Are we starting a range? */
957 if (last + 1 == c && ! in_range)
958 {
959 putchar ('-');
960 in_range = 1;
961 }
962 /* Have we broken a range? */
963 else if (last + 1 != c && in_range)
96cc36cc 964 {
fa9a63c5
RM
965 putchar (last);
966 in_range = 0;
967 }
5e69f11e 968
fa9a63c5
RM
969 if (! in_range)
970 putchar (c);
971
972 last = c;
25fe55af 973 }
fa9a63c5
RM
974
975 if (in_range)
976 putchar (last);
977
978 putchar (']');
979
99633e97 980 p += 1 + length;
96cc36cc 981
96cc36cc 982 if (has_range_table)
99633e97
SM
983 {
984 int count;
985 printf ("has-range-table");
986
987 /* ??? Should print the range table; for now, just skip it. */
988 p += 2; /* skip range table bits */
989 EXTRACT_NUMBER_AND_INCR (count, p);
990 p = CHARSET_RANGE_TABLE_END (p, count);
991 }
fa9a63c5
RM
992 }
993 break;
994
995 case begline:
996 printf ("/begline");
25fe55af 997 break;
fa9a63c5
RM
998
999 case endline:
25fe55af
RS
1000 printf ("/endline");
1001 break;
fa9a63c5
RM
1002
1003 case on_failure_jump:
25fe55af
RS
1004 extract_number_and_incr (&mcnt, &p);
1005 printf ("/on_failure_jump to %d", p + mcnt - start);
1006 break;
fa9a63c5
RM
1007
1008 case on_failure_keep_string_jump:
25fe55af
RS
1009 extract_number_and_incr (&mcnt, &p);
1010 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
1011 break;
fa9a63c5 1012
0683b6fa
SM
1013 case on_failure_jump_nastyloop:
1014 extract_number_and_incr (&mcnt, &p);
1015 printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
1016 break;
1017
505bde11 1018 case on_failure_jump_loop:
fa9a63c5 1019 extract_number_and_incr (&mcnt, &p);
505bde11 1020 printf ("/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1021 break;
1022
505bde11 1023 case on_failure_jump_smart:
fa9a63c5 1024 extract_number_and_incr (&mcnt, &p);
505bde11 1025 printf ("/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1026 break;
1027
25fe55af 1028 case jump:
fa9a63c5 1029 extract_number_and_incr (&mcnt, &p);
25fe55af 1030 printf ("/jump to %d", p + mcnt - start);
fa9a63c5
RM
1031 break;
1032
25fe55af
RS
1033 case succeed_n:
1034 extract_number_and_incr (&mcnt, &p);
1035 extract_number_and_incr (&mcnt2, &p);
99633e97 1036 printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1037 break;
5e69f11e 1038
25fe55af
RS
1039 case jump_n:
1040 extract_number_and_incr (&mcnt, &p);
1041 extract_number_and_incr (&mcnt2, &p);
99633e97 1042 printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1043 break;
5e69f11e 1044
25fe55af
RS
1045 case set_number_at:
1046 extract_number_and_incr (&mcnt, &p);
1047 extract_number_and_incr (&mcnt2, &p);
99633e97 1048 printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1049 break;
5e69f11e 1050
25fe55af 1051 case wordbound:
fa9a63c5
RM
1052 printf ("/wordbound");
1053 break;
1054
1055 case notwordbound:
1056 printf ("/notwordbound");
25fe55af 1057 break;
fa9a63c5
RM
1058
1059 case wordbeg:
1060 printf ("/wordbeg");
1061 break;
5e69f11e 1062
fa9a63c5
RM
1063 case wordend:
1064 printf ("/wordend");
5e69f11e 1065
1fb352e0
SM
1066 case syntaxspec:
1067 printf ("/syntaxspec");
1068 mcnt = *p++;
1069 printf ("/%d", mcnt);
1070 break;
1071
1072 case notsyntaxspec:
1073 printf ("/notsyntaxspec");
1074 mcnt = *p++;
1075 printf ("/%d", mcnt);
1076 break;
1077
0b32bf0e 1078# ifdef emacs
fa9a63c5
RM
1079 case before_dot:
1080 printf ("/before_dot");
25fe55af 1081 break;
fa9a63c5
RM
1082
1083 case at_dot:
1084 printf ("/at_dot");
25fe55af 1085 break;
fa9a63c5
RM
1086
1087 case after_dot:
1088 printf ("/after_dot");
25fe55af 1089 break;
fa9a63c5 1090
1fb352e0
SM
1091 case categoryspec:
1092 printf ("/categoryspec");
fa9a63c5
RM
1093 mcnt = *p++;
1094 printf ("/%d", mcnt);
25fe55af 1095 break;
5e69f11e 1096
1fb352e0
SM
1097 case notcategoryspec:
1098 printf ("/notcategoryspec");
fa9a63c5
RM
1099 mcnt = *p++;
1100 printf ("/%d", mcnt);
1101 break;
0b32bf0e 1102# endif /* emacs */
fa9a63c5 1103
fa9a63c5
RM
1104 case begbuf:
1105 printf ("/begbuf");
25fe55af 1106 break;
fa9a63c5
RM
1107
1108 case endbuf:
1109 printf ("/endbuf");
25fe55af 1110 break;
fa9a63c5 1111
25fe55af
RS
1112 default:
1113 printf ("?%d", *(p-1));
fa9a63c5
RM
1114 }
1115
1116 putchar ('\n');
1117 }
1118
1119 printf ("%d:\tend of pattern.\n", p - start);
1120}
1121
1122
1123void
1124print_compiled_pattern (bufp)
1125 struct re_pattern_buffer *bufp;
1126{
1127 unsigned char *buffer = bufp->buffer;
1128
1129 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1130 printf ("%ld bytes used/%ld bytes allocated.\n",
1131 bufp->used, bufp->allocated);
fa9a63c5
RM
1132
1133 if (bufp->fastmap_accurate && bufp->fastmap)
1134 {
1135 printf ("fastmap: ");
1136 print_fastmap (bufp->fastmap);
1137 }
1138
1139 printf ("re_nsub: %d\t", bufp->re_nsub);
1140 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1141 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1142 printf ("no_sub: %d\t", bufp->no_sub);
1143 printf ("not_bol: %d\t", bufp->not_bol);
1144 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1145 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1146 fflush (stdout);
fa9a63c5
RM
1147 /* Perhaps we should print the translate table? */
1148}
1149
1150
1151void
1152print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1153 re_char *where;
1154 re_char *string1;
1155 re_char *string2;
fa9a63c5
RM
1156 int size1;
1157 int size2;
1158{
4bb91c68 1159 int this_char;
5e69f11e 1160
fa9a63c5
RM
1161 if (where == NULL)
1162 printf ("(null)");
1163 else
1164 {
1165 if (FIRST_STRING_P (where))
25fe55af
RS
1166 {
1167 for (this_char = where - string1; this_char < size1; this_char++)
1168 putchar (string1[this_char]);
fa9a63c5 1169
25fe55af
RS
1170 where = string2;
1171 }
fa9a63c5
RM
1172
1173 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1174 putchar (string2[this_char]);
fa9a63c5
RM
1175 }
1176}
1177
1178#else /* not DEBUG */
1179
0b32bf0e
SM
1180# undef assert
1181# define assert(e)
fa9a63c5 1182
0b32bf0e
SM
1183# define DEBUG_STATEMENT(e)
1184# define DEBUG_PRINT1(x)
1185# define DEBUG_PRINT2(x1, x2)
1186# define DEBUG_PRINT3(x1, x2, x3)
1187# define DEBUG_PRINT4(x1, x2, x3, x4)
1188# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1189# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1190
1191#endif /* not DEBUG */
1192\f
1193/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1194 also be assigned to arbitrarily: each pattern buffer stores its own
1195 syntax, so it can be changed between regex compilations. */
1196/* This has no initializer because initialized variables in Emacs
1197 become read-only after dumping. */
1198reg_syntax_t re_syntax_options;
1199
1200
1201/* Specify the precise syntax of regexps for compilation. This provides
1202 for compatibility for various utilities which historically have
1203 different, incompatible syntaxes.
1204
1205 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1206 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1207
1208reg_syntax_t
1209re_set_syntax (syntax)
1210 reg_syntax_t syntax;
1211{
1212 reg_syntax_t ret = re_syntax_options;
5e69f11e 1213
fa9a63c5
RM
1214 re_syntax_options = syntax;
1215 return ret;
1216}
c0f9ea08 1217WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1218\f
1219/* This table gives an error message for each of the error codes listed
4bb91c68 1220 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1221 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1222 but why not be nice? */
fa9a63c5
RM
1223
1224static const char *re_error_msgid[] =
5e69f11e
RM
1225 {
1226 gettext_noop ("Success"), /* REG_NOERROR */
1227 gettext_noop ("No match"), /* REG_NOMATCH */
1228 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1229 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1230 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1231 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1232 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1233 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1234 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1235 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1236 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1237 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1238 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1239 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1240 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1241 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1242 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
fa9a63c5
RM
1243 };
1244\f
4bb91c68 1245/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1246
1247/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1248 searching and matching functions should not call alloca. On some
1249 systems, alloca is implemented in terms of malloc, and if we're
1250 using the relocating allocator routines, then malloc could cause a
1251 relocation, which might (if the strings being searched are in the
1252 ralloc heap) shift the data out from underneath the regexp
1253 routines.
1254
5e69f11e 1255 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1256 processes input from X in a signal handler; processing X input may
1257 call malloc; if input arrives while a matching routine is calling
1258 malloc, then we're scrod. But Emacs can't just block input while
1259 calling matching routines; then we don't notice interrupts when
1260 they come in. So, Emacs blocks input around all regexp calls
1261 except the matching calls, which it leaves unprotected, in the
1262 faith that they will not malloc. */
1263
1264/* Normally, this is fine. */
1265#define MATCH_MAY_ALLOCATE
1266
1267/* When using GNU C, we are not REALLY using the C alloca, no matter
1268 what config.h may say. So don't take precautions for it. */
1269#ifdef __GNUC__
0b32bf0e 1270# undef C_ALLOCA
fa9a63c5
RM
1271#endif
1272
1273/* The match routines may not allocate if (1) they would do it with malloc
1274 and (2) it's not safe for them to use malloc.
1275 Note that if REL_ALLOC is defined, matching would not use malloc for the
1276 failure stack, but we would still use it for the register vectors;
4bb91c68 1277 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1278#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1279# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1280#endif
1281
1282\f
1283/* Failure stack declarations and macros; both re_compile_fastmap and
1284 re_match_2 use a failure stack. These have to be macros because of
1285 REGEX_ALLOCATE_STACK. */
5e69f11e 1286
fa9a63c5 1287
320a2a73 1288/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1289 when matching. If this number is exceeded, we allocate more
1290 space, so it is not a hard limit. */
1291#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1292# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1293#endif
1294
1295/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1296 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1297 This is a variable only so users of regex can assign to it; we never
c0f9ea08
SM
1298 change it ourselves. */
1299# if defined MATCH_MAY_ALLOCATE
1300/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1301 whose default stack limit is 2mb. In order for a larger
1302 value to work reliably, you have to try to make it accord
1303 with the process stack limit. */
c0f9ea08
SM
1304size_t re_max_failures = 40000;
1305# else
1306size_t re_max_failures = 4000;
1307# endif
fa9a63c5
RM
1308
1309union fail_stack_elt
1310{
c0f9ea08
SM
1311 const unsigned char *pointer;
1312 /* This should be the biggest `int' that's no bigger than a pointer. */
1313 long integer;
fa9a63c5
RM
1314};
1315
1316typedef union fail_stack_elt fail_stack_elt_t;
1317
1318typedef struct
1319{
1320 fail_stack_elt_t *stack;
c0f9ea08
SM
1321 size_t size;
1322 size_t avail; /* Offset of next open position. */
1323 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1324} fail_stack_type;
1325
505bde11
SM
1326#define PATTERN_STACK_EMPTY() (fail_stack.avail == 0)
1327#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1328#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1329
1330
1331/* Define macros to initialize and free the failure stack.
1332 Do `return -2' if the alloc fails. */
1333
1334#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1335# define INIT_FAIL_STACK() \
fa9a63c5
RM
1336 do { \
1337 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1338 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1339 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1340 \
1341 if (fail_stack.stack == NULL) \
1342 return -2; \
1343 \
1344 fail_stack.size = INIT_FAILURE_ALLOC; \
1345 fail_stack.avail = 0; \
505bde11 1346 fail_stack.frame = 0; \
fa9a63c5
RM
1347 } while (0)
1348
0b32bf0e 1349# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1350#else
0b32bf0e 1351# define INIT_FAIL_STACK() \
fa9a63c5
RM
1352 do { \
1353 fail_stack.avail = 0; \
505bde11 1354 fail_stack.frame = 0; \
fa9a63c5
RM
1355 } while (0)
1356
0b32bf0e 1357# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1358#endif
1359
1360
320a2a73
KH
1361/* Double the size of FAIL_STACK, up to a limit
1362 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1363
1364 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1365 allocating space for it or it was already too large.
1366
4bb91c68 1367 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1368
320a2a73
KH
1369/* Factor to increase the failure stack size by
1370 when we increase it.
1371 This used to be 2, but 2 was too wasteful
1372 because the old discarded stacks added up to as much space
1373 were as ultimate, maximum-size stack. */
1374#define FAIL_STACK_GROWTH_FACTOR 4
1375
1376#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1377 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1378 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1379 ? 0 \
320a2a73
KH
1380 : ((fail_stack).stack \
1381 = (fail_stack_elt_t *) \
25fe55af
RS
1382 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1383 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1384 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1385 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1386 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1387 \
1388 (fail_stack).stack == NULL \
1389 ? 0 \
6453db45
KH
1390 : ((fail_stack).size \
1391 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1392 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1393 * FAIL_STACK_GROWTH_FACTOR)) \
1394 / sizeof (fail_stack_elt_t)), \
25fe55af 1395 1)))
fa9a63c5
RM
1396
1397
5e69f11e 1398/* Push pointer POINTER on FAIL_STACK.
fa9a63c5
RM
1399 Return 1 if was able to do so and 0 if ran out of memory allocating
1400 space to do so. */
1401#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1402 ((FAIL_STACK_FULL () \
320a2a73 1403 && !GROW_FAIL_STACK (FAIL_STACK)) \
fa9a63c5
RM
1404 ? 0 \
1405 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1406 1))
505bde11 1407#define POP_PATTERN_OP() POP_FAILURE_POINTER ()
fa9a63c5
RM
1408
1409/* Push a pointer value onto the failure stack.
1410 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1411 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1412#define PUSH_FAILURE_POINTER(item) \
1413 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1414
1415/* This pushes an integer-valued item onto the failure stack.
1416 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1417 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1418#define PUSH_FAILURE_INT(item) \
1419 fail_stack.stack[fail_stack.avail++].integer = (item)
1420
1421/* Push a fail_stack_elt_t value onto the failure stack.
1422 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1423 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1424#define PUSH_FAILURE_ELT(item) \
1425 fail_stack.stack[fail_stack.avail++] = (item)
1426
1427/* These three POP... operations complement the three PUSH... operations.
1428 All assume that `fail_stack' is nonempty. */
1429#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1430#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1431#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1432
505bde11
SM
1433/* Individual items aside from the registers. */
1434#define NUM_NONREG_ITEMS 3
1435
1436/* Used to examine the stack (to detect infinite loops). */
1437#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1438#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1439#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1440#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1441
1442
505bde11
SM
1443#define ENSURE_FAIL_STACK(space) \
1444while (REMAINING_AVAIL_SLOTS <= space) { \
1445 if (!GROW_FAIL_STACK (fail_stack)) \
1446 return -2; \
1447 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1448 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1449}
1450
1451/* Push register NUM onto the stack. */
1452#define PUSH_FAILURE_REG(num) \
1453do { \
1454 char *destination; \
1455 ENSURE_FAIL_STACK(3); \
1456 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1457 num, regstart[num], regend[num]); \
1458 PUSH_FAILURE_POINTER (regstart[num]); \
1459 PUSH_FAILURE_POINTER (regend[num]); \
1460 PUSH_FAILURE_INT (num); \
1461} while (0)
1462
dc1e502d
SM
1463#define PUSH_FAILURE_COUNT(ptr) \
1464do { \
1465 char *destination; \
1466 int c; \
1467 ENSURE_FAIL_STACK(3); \
1468 EXTRACT_NUMBER (c, ptr); \
1469 DEBUG_PRINT3 (" Push counter %p = %d\n", ptr, c); \
1470 PUSH_FAILURE_INT (c); \
1471 PUSH_FAILURE_POINTER (ptr); \
1472 PUSH_FAILURE_INT (-1); \
1473} while (0)
1474
505bde11 1475/* Pop a saved register off the stack. */
dc1e502d 1476#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1477do { \
1478 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1479 if (reg == -1) \
1480 { \
1481 /* It's a counter. */ \
1482 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
1483 reg = POP_FAILURE_INT (); \
1484 STORE_NUMBER (ptr, reg); \
1485 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1486 } \
1487 else \
1488 { \
1489 regend[reg] = POP_FAILURE_POINTER (); \
1490 regstart[reg] = POP_FAILURE_POINTER (); \
1491 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1492 reg, regstart[reg], regend[reg]); \
1493 } \
505bde11
SM
1494} while (0)
1495
1496/* Check that we are not stuck in an infinite loop. */
1497#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1498do { \
1499 int failure = TOP_FAILURE_HANDLE(); \
1500 /* Check for infinite matching loops */ \
1501 while (failure > 0 && \
1502 (FAILURE_STR (failure) == string_place \
1503 || FAILURE_STR (failure) == NULL)) \
1504 { \
1505 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1506 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11
SM
1507 if (FAILURE_PAT (failure) == pat_cur) \
1508 goto fail; \
66f0296e 1509 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1510 failure = NEXT_FAILURE_HANDLE(failure); \
1511 } \
1512 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1513} while (0)
1514
fa9a63c5 1515/* Push the information about the state we will need
5e69f11e
RM
1516 if we ever fail back to it.
1517
505bde11 1518 Requires variables fail_stack, regstart, regend and
320a2a73 1519 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1520 declared.
5e69f11e 1521
fa9a63c5
RM
1522 Does `return FAILURE_CODE' if runs out of memory. */
1523
505bde11
SM
1524#define PUSH_FAILURE_POINT(pattern, string_place) \
1525do { \
1526 char *destination; \
1527 /* Must be int, so when we don't save any registers, the arithmetic \
1528 of 0 + -1 isn't done as unsigned. */ \
1529 \
505bde11 1530 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1531 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1532 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1533 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1534 \
1535 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1536 \
1537 DEBUG_PRINT1 ("\n"); \
1538 \
1539 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1540 PUSH_FAILURE_INT (fail_stack.frame); \
1541 \
1542 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1543 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1544 DEBUG_PRINT1 ("'\n"); \
1545 PUSH_FAILURE_POINTER (string_place); \
1546 \
1547 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1548 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1549 PUSH_FAILURE_POINTER (pattern); \
1550 \
1551 /* Close the frame by moving the frame pointer past it. */ \
1552 fail_stack.frame = fail_stack.avail; \
1553} while (0)
fa9a63c5 1554
320a2a73
KH
1555/* Estimate the size of data pushed by a typical failure stack entry.
1556 An estimate is all we need, because all we use this for
1557 is to choose a limit for how big to make the failure stack. */
1558
1559#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1560
fa9a63c5
RM
1561/* How many items can still be added to the stack without overflowing it. */
1562#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1563
1564
1565/* Pops what PUSH_FAIL_STACK pushes.
1566
1567 We restore into the parameters, all of which should be lvalues:
1568 STR -- the saved data position.
1569 PAT -- the saved pattern position.
fa9a63c5 1570 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1571
fa9a63c5 1572 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1573 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1574
505bde11
SM
1575#define POP_FAILURE_POINT(str, pat) \
1576do { \
fa9a63c5
RM
1577 assert (!FAIL_STACK_EMPTY ()); \
1578 \
1579 /* Remove failure points and point to how many regs pushed. */ \
1580 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1581 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1582 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1583 \
505bde11
SM
1584 /* Pop the saved registers. */ \
1585 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1586 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1587 \
505bde11
SM
1588 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1589 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1590 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1591 \
1592 /* If the saved string location is NULL, it came from an \
1593 on_failure_keep_string_jump opcode, and we want to throw away the \
1594 saved NULL, thus retaining our current position in the string. */ \
66f0296e 1595 str = (re_char *) POP_FAILURE_POINTER (); \
505bde11 1596 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1597 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1598 DEBUG_PRINT1 ("'\n"); \
1599 \
505bde11
SM
1600 fail_stack.frame = POP_FAILURE_INT (); \
1601 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1602 \
505bde11
SM
1603 assert (fail_stack.avail >= 0); \
1604 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1605 \
fa9a63c5 1606 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1607} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1608
1609
1610\f
fa9a63c5 1611/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1612#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1613\f
1614/* Subroutine declarations and macros for regex_compile. */
1615
4bb91c68
SM
1616static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1617 reg_syntax_t syntax,
1618 struct re_pattern_buffer *bufp));
1619static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1620static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1621 int arg1, int arg2));
1622static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1623 int arg, unsigned char *end));
1624static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1625 int arg1, int arg2, unsigned char *end));
1626static boolean at_begline_loc_p _RE_ARGS ((const unsigned char *pattern,
1627 const unsigned char *p,
1628 reg_syntax_t syntax));
1629static boolean at_endline_loc_p _RE_ARGS ((const unsigned char *p,
1630 const unsigned char *pend,
1631 reg_syntax_t syntax));
1632static unsigned char *skip_one_char _RE_ARGS ((unsigned char *p));
1633static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend,
1634 char *fastmap, const int multibyte));
fa9a63c5 1635
5e69f11e 1636/* Fetch the next character in the uncompiled pattern---translating it
fa9a63c5
RM
1637 if necessary. Also cast from a signed character in the constant
1638 string passed to us by the user to an unsigned char that we can use
1639 as an array index (in, e.g., `translate'). */
1640#define PATFETCH(c) \
99633e97
SM
1641 do { \
1642 PATFETCH_RAW (c); \
2d1675e4 1643 c = TRANSLATE (c); \
fa9a63c5
RM
1644 } while (0)
1645
1646/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1647 translation. */
fa9a63c5 1648#define PATFETCH_RAW(c) \
2d1675e4
SM
1649 do { \
1650 int len; \
1651 if (p == pend) return REG_EEND; \
1652 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1653 p += len; \
fa9a63c5
RM
1654 } while (0)
1655
fa9a63c5
RM
1656
1657/* If `translate' is non-null, return translate[D], else just D. We
1658 cast the subscript to translate because some data is declared as
1659 `char *', to avoid warnings when a string constant is passed. But
1660 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1661#ifndef TRANSLATE
0b32bf0e 1662# define TRANSLATE(d) \
66f0296e 1663 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1664#endif
fa9a63c5
RM
1665
1666
1667/* Macros for outputting the compiled pattern into `buffer'. */
1668
1669/* If the buffer isn't allocated when it comes in, use this. */
1670#define INIT_BUF_SIZE 32
1671
4bb91c68 1672/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1673#define GET_BUFFER_SPACE(n) \
4bb91c68 1674 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1675 EXTEND_BUFFER ()
1676
1677/* Make sure we have one more byte of buffer space and then add C to it. */
1678#define BUF_PUSH(c) \
1679 do { \
1680 GET_BUFFER_SPACE (1); \
1681 *b++ = (unsigned char) (c); \
1682 } while (0)
1683
1684
1685/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1686#define BUF_PUSH_2(c1, c2) \
1687 do { \
1688 GET_BUFFER_SPACE (2); \
1689 *b++ = (unsigned char) (c1); \
1690 *b++ = (unsigned char) (c2); \
1691 } while (0)
1692
1693
4bb91c68 1694/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1695#define BUF_PUSH_3(c1, c2, c3) \
1696 do { \
1697 GET_BUFFER_SPACE (3); \
1698 *b++ = (unsigned char) (c1); \
1699 *b++ = (unsigned char) (c2); \
1700 *b++ = (unsigned char) (c3); \
1701 } while (0)
1702
1703
1704/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1705 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1706#define STORE_JUMP(op, loc, to) \
1707 store_op1 (op, loc, (to) - (loc) - 3)
1708
1709/* Likewise, for a two-argument jump. */
1710#define STORE_JUMP2(op, loc, to, arg) \
1711 store_op2 (op, loc, (to) - (loc) - 3, arg)
1712
4bb91c68 1713/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1714#define INSERT_JUMP(op, loc, to) \
1715 insert_op1 (op, loc, (to) - (loc) - 3, b)
1716
1717/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1718#define INSERT_JUMP2(op, loc, to, arg) \
1719 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1720
1721
1722/* This is not an arbitrary limit: the arguments which represent offsets
4bb91c68 1723 into the pattern are two bytes long. So if 2^16 bytes turns out to
fa9a63c5 1724 be too small, many things would have to change. */
4bb91c68
SM
1725/* Any other compiler which, like MSC, has allocation limit below 2^16
1726 bytes will have to use approach similar to what was done below for
1727 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1728 reallocating to 0 bytes. Such thing is not going to work too well.
1729 You have been warned!! */
1730#if defined _MSC_VER && !defined WIN32
1731/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1732# define MAX_BUF_SIZE 65500L
1733#else
1734# define MAX_BUF_SIZE (1L << 16)
1735#endif
fa9a63c5
RM
1736
1737/* Extend the buffer by twice its current size via realloc and
1738 reset the pointers that pointed into the old block to point to the
1739 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1740 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1741#if __BOUNDED_POINTERS__
1742# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1743# define MOVE_BUFFER_POINTER(P) \
1744 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1745# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1746 else \
1747 { \
1748 SET_HIGH_BOUND (b); \
1749 SET_HIGH_BOUND (begalt); \
1750 if (fixup_alt_jump) \
1751 SET_HIGH_BOUND (fixup_alt_jump); \
1752 if (laststart) \
1753 SET_HIGH_BOUND (laststart); \
1754 if (pending_exact) \
1755 SET_HIGH_BOUND (pending_exact); \
1756 }
1757#else
1758# define MOVE_BUFFER_POINTER(P) (P) += incr
1759# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1760#endif
fa9a63c5 1761#define EXTEND_BUFFER() \
25fe55af 1762 do { \
fa9a63c5 1763 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1764 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1765 return REG_ESIZE; \
1766 bufp->allocated <<= 1; \
1767 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1768 bufp->allocated = MAX_BUF_SIZE; \
fa9a63c5
RM
1769 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1770 if (bufp->buffer == NULL) \
1771 return REG_ESPACE; \
1772 /* If the buffer moved, move all the pointers into it. */ \
1773 if (old_buffer != bufp->buffer) \
1774 { \
4bb91c68
SM
1775 int incr = bufp->buffer - old_buffer; \
1776 MOVE_BUFFER_POINTER (b); \
1777 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1778 if (fixup_alt_jump) \
4bb91c68 1779 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1780 if (laststart) \
4bb91c68 1781 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1782 if (pending_exact) \
4bb91c68 1783 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1784 } \
4bb91c68 1785 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1786 } while (0)
1787
1788
1789/* Since we have one byte reserved for the register number argument to
1790 {start,stop}_memory, the maximum number of groups we can report
1791 things about is what fits in that byte. */
1792#define MAX_REGNUM 255
1793
1794/* But patterns can have more than `MAX_REGNUM' registers. We just
1795 ignore the excess. */
1796typedef unsigned regnum_t;
1797
1798
1799/* Macros for the compile stack. */
1800
1801/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1802 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1803/* int may be not enough when sizeof(int) == 2. */
1804typedef long pattern_offset_t;
fa9a63c5
RM
1805
1806typedef struct
1807{
1808 pattern_offset_t begalt_offset;
1809 pattern_offset_t fixup_alt_jump;
5e69f11e 1810 pattern_offset_t laststart_offset;
fa9a63c5
RM
1811 regnum_t regnum;
1812} compile_stack_elt_t;
1813
1814
1815typedef struct
1816{
1817 compile_stack_elt_t *stack;
1818 unsigned size;
1819 unsigned avail; /* Offset of next open position. */
1820} compile_stack_type;
1821
1822
1823#define INIT_COMPILE_STACK_SIZE 32
1824
1825#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1826#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1827
4bb91c68 1828/* The next available element. */
fa9a63c5
RM
1829#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1830
1831
b18215fc
RS
1832/* Structure to manage work area for range table. */
1833struct range_table_work_area
1834{
1835 int *table; /* actual work area. */
1836 int allocated; /* allocated size for work area in bytes. */
25fe55af 1837 int used; /* actually used size in words. */
96cc36cc 1838 int bits; /* flag to record character classes */
b18215fc
RS
1839};
1840
1841/* Make sure that WORK_AREA can hold more N multibyte characters. */
1842#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1843 do { \
1844 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1845 { \
1846 (work_area).allocated += 16 * sizeof (int); \
1847 if ((work_area).table) \
1848 (work_area).table \
1849 = (int *) realloc ((work_area).table, (work_area).allocated); \
1850 else \
1851 (work_area).table \
1852 = (int *) malloc ((work_area).allocated); \
1853 if ((work_area).table == 0) \
1854 FREE_STACK_RETURN (REG_ESPACE); \
1855 } \
1856 } while (0)
1857
96cc36cc
RS
1858#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1859 (work_area).bits |= (bit)
1860
1861/* These bits represent the various character classes such as [:alnum:]
1862 in a charset's range table. */
1863#define BIT_ALNUM 0x1
1864#define BIT_ALPHA 0x2
1865#define BIT_WORD 0x4
f71b19b6
DL
1866#define BIT_ASCII 0x8
1867#define BIT_NONASCII 0x10
96cc36cc
RS
1868#define BIT_GRAPH 0x20
1869#define BIT_LOWER 0x40
1870#define BIT_PRINT 0x80
1871#define BIT_PUNCT 0x100
1872#define BIT_SPACE 0x200
1873#define BIT_UPPER 0x400
f71b19b6
DL
1874#define BIT_UNIBYTE 0x800
1875#define BIT_MULTIBYTE 0x1000
96cc36cc 1876
b18215fc
RS
1877/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1878#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1879 do { \
1880 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1881 (work_area).table[(work_area).used++] = (range_start); \
1882 (work_area).table[(work_area).used++] = (range_end); \
1883 } while (0)
1884
25fe55af 1885/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1886#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1887 do { \
1888 if ((work_area).table) \
1889 free ((work_area).table); \
1890 } while (0)
1891
96cc36cc 1892#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1893#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1894#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc
RS
1895#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1896
1897
fa9a63c5 1898/* Set the bit for character C in a list. */
4bb91c68
SM
1899#define SET_LIST_BIT(c) \
1900 (b[((unsigned char) (c)) / BYTEWIDTH] \
fa9a63c5
RM
1901 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1902
1903
1904/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1905#define GET_UNSIGNED_NUMBER(num) \
99633e97 1906 do { if (p != pend) \
fa9a63c5 1907 { \
25fe55af 1908 PATFETCH (c); \
4bb91c68 1909 while ('0' <= c && c <= '9') \
25fe55af
RS
1910 { \
1911 if (num < 0) \
1912 num = 0; \
1913 num = num * 10 + c - '0'; \
1914 if (p == pend) \
1915 break; \
1916 PATFETCH (c); \
1917 } \
1918 } \
99633e97 1919 } while (0)
fa9a63c5
RM
1920
1921#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1922
1923#define IS_CHAR_CLASS(string) \
1924 (STREQ (string, "alpha") || STREQ (string, "upper") \
1925 || STREQ (string, "lower") || STREQ (string, "digit") \
1926 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1927 || STREQ (string, "space") || STREQ (string, "print") \
1928 || STREQ (string, "punct") || STREQ (string, "graph") \
96cc36cc 1929 || STREQ (string, "cntrl") || STREQ (string, "blank") \
f71b19b6
DL
1930 || STREQ (string, "word") \
1931 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1932 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
99633e97
SM
1933
1934/* QUIT is only used on NTemacs. */
4bb91c68 1935#if !defined WINDOWSNT || !defined emacs || !defined QUIT
0b32bf0e
SM
1936# undef QUIT
1937# define QUIT
99633e97 1938#endif
fa9a63c5
RM
1939\f
1940#ifndef MATCH_MAY_ALLOCATE
1941
1942/* If we cannot allocate large objects within re_match_2_internal,
1943 we make the fail stack and register vectors global.
1944 The fail stack, we grow to the maximum size when a regexp
1945 is compiled.
1946 The register vectors, we adjust in size each time we
1947 compile a regexp, according to the number of registers it needs. */
1948
1949static fail_stack_type fail_stack;
1950
1951/* Size with which the following vectors are currently allocated.
1952 That is so we can make them bigger as needed,
4bb91c68 1953 but never make them smaller. */
fa9a63c5
RM
1954static int regs_allocated_size;
1955
66f0296e
SM
1956static re_char ** regstart, ** regend;
1957static re_char **best_regstart, **best_regend;
fa9a63c5
RM
1958
1959/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 1960 but don't make them smaller. */
fa9a63c5
RM
1961
1962static
1963regex_grow_registers (num_regs)
1964 int num_regs;
1965{
1966 if (num_regs > regs_allocated_size)
1967 {
66f0296e
SM
1968 RETALLOC_IF (regstart, num_regs, re_char *);
1969 RETALLOC_IF (regend, num_regs, re_char *);
1970 RETALLOC_IF (best_regstart, num_regs, re_char *);
1971 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
1972
1973 regs_allocated_size = num_regs;
1974 }
1975}
1976
1977#endif /* not MATCH_MAY_ALLOCATE */
1978\f
99633e97
SM
1979static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1980 compile_stack,
1981 regnum_t regnum));
1982
fa9a63c5
RM
1983/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1984 Returns one of error codes defined in `regex.h', or zero for success.
1985
1986 Assumes the `allocated' (and perhaps `buffer') and `translate'
1987 fields are set in BUFP on entry.
1988
1989 If it succeeds, results are put in BUFP (if it returns an error, the
1990 contents of BUFP are undefined):
1991 `buffer' is the compiled pattern;
1992 `syntax' is set to SYNTAX;
1993 `used' is set to the length of the compiled pattern;
1994 `fastmap_accurate' is zero;
1995 `re_nsub' is the number of subexpressions in PATTERN;
1996 `not_bol' and `not_eol' are zero;
5e69f11e 1997
c0f9ea08 1998 The `fastmap' field is neither examined nor set. */
fa9a63c5 1999
505bde11
SM
2000/* Insert the `jump' from the end of last alternative to "here".
2001 The space for the jump has already been allocated. */
2002#define FIXUP_ALT_JUMP() \
2003do { \
2004 if (fixup_alt_jump) \
2005 STORE_JUMP (jump, fixup_alt_jump, b); \
2006} while (0)
2007
2008
fa9a63c5
RM
2009/* Return, freeing storage we allocated. */
2010#define FREE_STACK_RETURN(value) \
b18215fc
RS
2011 do { \
2012 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2013 free (compile_stack.stack); \
2014 return value; \
2015 } while (0)
fa9a63c5
RM
2016
2017static reg_errcode_t
2018regex_compile (pattern, size, syntax, bufp)
66f0296e 2019 re_char *pattern;
4bb91c68 2020 size_t size;
fa9a63c5
RM
2021 reg_syntax_t syntax;
2022 struct re_pattern_buffer *bufp;
2023{
2024 /* We fetch characters from PATTERN here. Even though PATTERN is
2025 `char *' (i.e., signed), we declare these variables as unsigned, so
2026 they can be reliably used as array indices. */
b18215fc 2027 register unsigned int c, c1;
5e69f11e 2028
fa9a63c5 2029 /* A random temporary spot in PATTERN. */
66f0296e 2030 re_char *p1;
fa9a63c5
RM
2031
2032 /* Points to the end of the buffer, where we should append. */
2033 register unsigned char *b;
5e69f11e 2034
fa9a63c5
RM
2035 /* Keeps track of unclosed groups. */
2036 compile_stack_type compile_stack;
2037
2038 /* Points to the current (ending) position in the pattern. */
22336245
RS
2039#ifdef AIX
2040 /* `const' makes AIX compiler fail. */
66f0296e 2041 unsigned char *p = pattern;
22336245 2042#else
66f0296e 2043 re_char *p = pattern;
22336245 2044#endif
66f0296e 2045 re_char *pend = pattern + size;
5e69f11e 2046
fa9a63c5 2047 /* How to translate the characters in the pattern. */
6676cb1c 2048 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2049
2050 /* Address of the count-byte of the most recently inserted `exactn'
2051 command. This makes it possible to tell if a new exact-match
2052 character can be added to that command or if the character requires
2053 a new `exactn' command. */
2054 unsigned char *pending_exact = 0;
2055
2056 /* Address of start of the most recently finished expression.
2057 This tells, e.g., postfix * where to find the start of its
2058 operand. Reset at the beginning of groups and alternatives. */
2059 unsigned char *laststart = 0;
2060
2061 /* Address of beginning of regexp, or inside of last group. */
2062 unsigned char *begalt;
2063
2064 /* Place in the uncompiled pattern (i.e., the {) to
2065 which to go back if the interval is invalid. */
66f0296e 2066 re_char *beg_interval;
5e69f11e 2067
fa9a63c5 2068 /* Address of the place where a forward jump should go to the end of
25fe55af 2069 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2070 last -- ends with a forward jump of this sort. */
2071 unsigned char *fixup_alt_jump = 0;
2072
2073 /* Counts open-groups as they are encountered. Remembered for the
2074 matching close-group on the compile stack, so the same register
2075 number is put in the stop_memory as the start_memory. */
2076 regnum_t regnum = 0;
2077
b18215fc
RS
2078 /* Work area for range table of charset. */
2079 struct range_table_work_area range_table_work;
2080
2d1675e4
SM
2081 /* If the object matched can contain multibyte characters. */
2082 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2083
fa9a63c5 2084#ifdef DEBUG
99633e97 2085 debug++;
fa9a63c5 2086 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2087 if (debug > 0)
fa9a63c5
RM
2088 {
2089 unsigned debug_count;
5e69f11e 2090
fa9a63c5 2091 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2092 putchar (pattern[debug_count]);
fa9a63c5
RM
2093 putchar ('\n');
2094 }
2095#endif /* DEBUG */
2096
2097 /* Initialize the compile stack. */
2098 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2099 if (compile_stack.stack == NULL)
2100 return REG_ESPACE;
2101
2102 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2103 compile_stack.avail = 0;
2104
b18215fc
RS
2105 range_table_work.table = 0;
2106 range_table_work.allocated = 0;
2107
fa9a63c5
RM
2108 /* Initialize the pattern buffer. */
2109 bufp->syntax = syntax;
2110 bufp->fastmap_accurate = 0;
2111 bufp->not_bol = bufp->not_eol = 0;
2112
2113 /* Set `used' to zero, so that if we return an error, the pattern
2114 printer (for debugging) will think there's no pattern. We reset it
2115 at the end. */
2116 bufp->used = 0;
5e69f11e 2117
fa9a63c5 2118 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2119 bufp->re_nsub = 0;
fa9a63c5 2120
0b32bf0e 2121#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2122 /* Initialize the syntax table. */
2123 init_syntax_once ();
2124#endif
2125
2126 if (bufp->allocated == 0)
2127 {
2128 if (bufp->buffer)
2129 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2130 enough space. This loses if buffer's address is bogus, but
2131 that is the user's responsibility. */
2132 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2133 }
fa9a63c5 2134 else
25fe55af
RS
2135 { /* Caller did not allocate a buffer. Do it for them. */
2136 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2137 }
fa9a63c5
RM
2138 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2139
2140 bufp->allocated = INIT_BUF_SIZE;
2141 }
2142
2143 begalt = b = bufp->buffer;
2144
2145 /* Loop through the uncompiled pattern until we're at the end. */
2146 while (p != pend)
2147 {
2148 PATFETCH (c);
2149
2150 switch (c)
25fe55af
RS
2151 {
2152 case '^':
2153 {
2154 if ( /* If at start of pattern, it's an operator. */
2155 p == pattern + 1
2156 /* If context independent, it's an operator. */
2157 || syntax & RE_CONTEXT_INDEP_ANCHORS
2158 /* Otherwise, depends on what's come before. */
2159 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2160 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2161 else
2162 goto normal_char;
2163 }
2164 break;
2165
2166
2167 case '$':
2168 {
2169 if ( /* If at end of pattern, it's an operator. */
2170 p == pend
2171 /* If context independent, it's an operator. */
2172 || syntax & RE_CONTEXT_INDEP_ANCHORS
2173 /* Otherwise, depends on what's next. */
2174 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2175 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2176 else
2177 goto normal_char;
2178 }
2179 break;
fa9a63c5
RM
2180
2181
2182 case '+':
25fe55af
RS
2183 case '?':
2184 if ((syntax & RE_BK_PLUS_QM)
2185 || (syntax & RE_LIMITED_OPS))
2186 goto normal_char;
2187 handle_plus:
2188 case '*':
2189 /* If there is no previous pattern... */
2190 if (!laststart)
2191 {
2192 if (syntax & RE_CONTEXT_INVALID_OPS)
2193 FREE_STACK_RETURN (REG_BADRPT);
2194 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2195 goto normal_char;
2196 }
2197
2198 {
25fe55af 2199 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2200 boolean zero_times_ok = 0, many_times_ok = 0;
2201 boolean greedy = 1;
25fe55af
RS
2202
2203 /* If there is a sequence of repetition chars, collapse it
2204 down to just one (the right one). We can't combine
2205 interval operators with these because of, e.g., `a{2}*',
2206 which should only match an even number of `a's. */
2207
2208 for (;;)
2209 {
0b32bf0e 2210 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2211 && c == '?' && (zero_times_ok || many_times_ok))
2212 greedy = 0;
2213 else
2214 {
2215 zero_times_ok |= c != '+';
2216 many_times_ok |= c != '?';
2217 }
25fe55af
RS
2218
2219 if (p == pend)
2220 break;
ed0767d8
SM
2221 else if (*p == '*'
2222 || (!(syntax & RE_BK_PLUS_QM)
2223 && (*p == '+' || *p == '?')))
25fe55af 2224 ;
ed0767d8 2225 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2226 {
ed0767d8
SM
2227 if (p+1 == pend)
2228 FREE_STACK_RETURN (REG_EESCAPE);
2229 if (p[1] == '+' || p[1] == '?')
2230 PATFETCH (c); /* Gobble up the backslash. */
2231 else
2232 break;
25fe55af
RS
2233 }
2234 else
ed0767d8 2235 break;
25fe55af 2236 /* If we get here, we found another repeat character. */
ed0767d8
SM
2237 PATFETCH (c);
2238 }
25fe55af
RS
2239
2240 /* Star, etc. applied to an empty pattern is equivalent
2241 to an empty pattern. */
4e8a9132 2242 if (!laststart || laststart == b)
25fe55af
RS
2243 break;
2244
2245 /* Now we know whether or not zero matches is allowed
2246 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2247 if (greedy)
2248 {
99633e97 2249 if (many_times_ok)
4e8a9132
SM
2250 {
2251 boolean simple = skip_one_char (laststart) == b;
2252 unsigned int startoffset = 0;
f6a3f532
SM
2253 re_opcode_t ofj =
2254 (simple || !analyse_first (laststart, b, NULL, 0)) ?
2255 on_failure_jump : on_failure_jump_loop;
4e8a9132
SM
2256 assert (skip_one_char (laststart) <= b);
2257
2258 if (!zero_times_ok && simple)
2259 { /* Since simple * loops can be made faster by using
2260 on_failure_keep_string_jump, we turn simple P+
2261 into PP* if P is simple. */
2262 unsigned char *p1, *p2;
2263 startoffset = b - laststart;
2264 GET_BUFFER_SPACE (startoffset);
2265 p1 = b; p2 = laststart;
2266 while (p2 < p1)
2267 *b++ = *p2++;
2268 zero_times_ok = 1;
99633e97 2269 }
4e8a9132
SM
2270
2271 GET_BUFFER_SPACE (6);
2272 if (!zero_times_ok)
2273 /* A + loop. */
f6a3f532 2274 STORE_JUMP (ofj, b, b + 6);
99633e97 2275 else
4e8a9132
SM
2276 /* Simple * loops can use on_failure_keep_string_jump
2277 depending on what follows. But since we don't know
2278 that yet, we leave the decision up to
2279 on_failure_jump_smart. */
f6a3f532 2280 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2281 laststart + startoffset, b + 6);
99633e97 2282 b += 3;
4e8a9132 2283 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2284 b += 3;
2285 }
2286 else
2287 {
4e8a9132
SM
2288 /* A simple ? pattern. */
2289 assert (zero_times_ok);
2290 GET_BUFFER_SPACE (3);
2291 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2292 b += 3;
2293 }
1c8c6d39
DL
2294 }
2295 else /* not greedy */
2296 { /* I wish the greedy and non-greedy cases could be merged. */
2297
0683b6fa 2298 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2299 if (many_times_ok)
2300 {
f6a3f532
SM
2301 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2302
505bde11 2303 /* The non-greedy multiple match looks like a repeat..until:
1c8c6d39 2304 we only need a conditional jump at the end of the loop */
f6a3f532
SM
2305 if (emptyp) BUF_PUSH (no_op);
2306 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2307 : on_failure_jump, b, laststart);
1c8c6d39
DL
2308 b += 3;
2309 if (zero_times_ok)
2310 {
2311 /* The repeat...until naturally matches one or more.
2312 To also match zero times, we need to first jump to
2313 the end of the loop (its conditional jump). */
1c8c6d39
DL
2314 INSERT_JUMP (jump, laststart, b);
2315 b += 3;
2316 }
2317 }
2318 else
2319 {
2320 /* non-greedy a?? */
1c8c6d39
DL
2321 INSERT_JUMP (jump, laststart, b + 3);
2322 b += 3;
2323 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2324 b += 3;
2325 }
2326 }
2327 }
4e8a9132 2328 pending_exact = 0;
fa9a63c5
RM
2329 break;
2330
2331
2332 case '.':
25fe55af
RS
2333 laststart = b;
2334 BUF_PUSH (anychar);
2335 break;
fa9a63c5
RM
2336
2337
25fe55af
RS
2338 case '[':
2339 {
b18215fc 2340 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2341
25fe55af 2342 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2343
25fe55af
RS
2344 /* Ensure that we have enough space to push a charset: the
2345 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2346 GET_BUFFER_SPACE (34);
2347
25fe55af 2348 laststart = b;
e318085a 2349
25fe55af
RS
2350 /* We test `*p == '^' twice, instead of using an if
2351 statement, so we only need one BUF_PUSH. */
2352 BUF_PUSH (*p == '^' ? charset_not : charset);
2353 if (*p == '^')
2354 p++;
e318085a 2355
25fe55af
RS
2356 /* Remember the first position in the bracket expression. */
2357 p1 = p;
e318085a 2358
25fe55af
RS
2359 /* Push the number of bytes in the bitmap. */
2360 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2361
25fe55af
RS
2362 /* Clear the whole map. */
2363 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2364
25fe55af
RS
2365 /* charset_not matches newline according to a syntax bit. */
2366 if ((re_opcode_t) b[-2] == charset_not
2367 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2368 SET_LIST_BIT ('\n');
fa9a63c5 2369
25fe55af
RS
2370 /* Read in characters and ranges, setting map bits. */
2371 for (;;)
2372 {
b18215fc 2373 boolean escaped_char = false;
2d1675e4 2374 const unsigned char *p2 = p;
e318085a 2375
25fe55af 2376 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2377
25fe55af 2378 PATFETCH (c);
e318085a 2379
25fe55af
RS
2380 /* \ might escape characters inside [...] and [^...]. */
2381 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2382 {
2383 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2384
2385 PATFETCH (c);
b18215fc 2386 escaped_char = true;
25fe55af 2387 }
b18215fc
RS
2388 else
2389 {
657fcfbd
RS
2390 /* Could be the end of the bracket expression. If it's
2391 not (i.e., when the bracket expression is `[]' so
2392 far), the ']' character bit gets set way below. */
2d1675e4 2393 if (c == ']' && p2 != p1)
657fcfbd 2394 break;
25fe55af 2395 }
b18215fc 2396
b18215fc
RS
2397 /* What should we do for the character which is
2398 greater than 0x7F, but not BASE_LEADING_CODE_P?
2399 XXX */
2400
25fe55af
RS
2401 /* See if we're at the beginning of a possible character
2402 class. */
b18215fc 2403
2d1675e4
SM
2404 if (!escaped_char &&
2405 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2406 {
2407 /* Leave room for the null. */
25fe55af 2408 char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2409 const unsigned char *class_beg;
b18215fc 2410
25fe55af
RS
2411 PATFETCH (c);
2412 c1 = 0;
ed0767d8 2413 class_beg = p;
b18215fc 2414
25fe55af
RS
2415 /* If pattern is `[[:'. */
2416 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2417
25fe55af
RS
2418 for (;;)
2419 {
2420 PATFETCH (c);
2421 if (c == ':' || c == ']' || p == pend
2422 || c1 == CHAR_CLASS_MAX_LENGTH)
2423 break;
2424 str[c1++] = c;
2425 }
2426 str[c1] = '\0';
b18215fc
RS
2427
2428 /* If isn't a word bracketed by `[:' and `:]':
2429 undo the ending character, the letters, and
2430 leave the leading `:' and `[' (but set bits for
2431 them). */
25fe55af
RS
2432 if (c == ':' && *p == ']')
2433 {
2434 int ch;
2435 boolean is_alnum = STREQ (str, "alnum");
2436 boolean is_alpha = STREQ (str, "alpha");
f71b19b6 2437 boolean is_ascii = STREQ (str, "ascii");
25fe55af
RS
2438 boolean is_blank = STREQ (str, "blank");
2439 boolean is_cntrl = STREQ (str, "cntrl");
2440 boolean is_digit = STREQ (str, "digit");
2441 boolean is_graph = STREQ (str, "graph");
2442 boolean is_lower = STREQ (str, "lower");
f71b19b6
DL
2443 boolean is_multibyte = STREQ (str, "multibyte");
2444 boolean is_nonascii = STREQ (str, "nonascii");
25fe55af
RS
2445 boolean is_print = STREQ (str, "print");
2446 boolean is_punct = STREQ (str, "punct");
2447 boolean is_space = STREQ (str, "space");
f71b19b6 2448 boolean is_unibyte = STREQ (str, "unibyte");
25fe55af 2449 boolean is_upper = STREQ (str, "upper");
96cc36cc 2450 boolean is_word = STREQ (str, "word");
f71b19b6 2451 boolean is_xdigit = STREQ (str, "xdigit");
25fe55af
RS
2452
2453 if (!IS_CHAR_CLASS (str))
fa9a63c5
RM
2454 FREE_STACK_RETURN (REG_ECTYPE);
2455
25fe55af
RS
2456 /* Throw away the ] at the end of the character
2457 class. */
2458 PATFETCH (c);
fa9a63c5 2459
25fe55af 2460 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2461
96cc36cc
RS
2462 /* Most character classes in a multibyte match
2463 just set a flag. Exceptions are is_blank,
2464 is_digit, is_cntrl, and is_xdigit, since
2465 they can only match ASCII characters. We
2466 don't need to handle them for multibyte. */
2467
2d1675e4 2468 if (multibyte)
96cc36cc
RS
2469 {
2470 int bit = 0;
2471
2472 if (is_alnum) bit = BIT_ALNUM;
2473 if (is_alpha) bit = BIT_ALPHA;
f71b19b6 2474 if (is_ascii) bit = BIT_ASCII;
96cc36cc
RS
2475 if (is_graph) bit = BIT_GRAPH;
2476 if (is_lower) bit = BIT_LOWER;
f71b19b6
DL
2477 if (is_multibyte) bit = BIT_MULTIBYTE;
2478 if (is_nonascii) bit = BIT_NONASCII;
96cc36cc
RS
2479 if (is_print) bit = BIT_PRINT;
2480 if (is_punct) bit = BIT_PUNCT;
2481 if (is_space) bit = BIT_SPACE;
f71b19b6 2482 if (is_unibyte) bit = BIT_UNIBYTE;
96cc36cc
RS
2483 if (is_upper) bit = BIT_UPPER;
2484 if (is_word) bit = BIT_WORD;
2485 if (bit)
2486 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2487 bit);
2488 }
2489
2490 /* Handle character classes for ASCII characters. */
25fe55af
RS
2491 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2492 {
7ae68633 2493 int translated = TRANSLATE (ch);
fa9a63c5
RM
2494 /* This was split into 3 if's to
2495 avoid an arbitrary limit in some compiler. */
25fe55af
RS
2496 if ( (is_alnum && ISALNUM (ch))
2497 || (is_alpha && ISALPHA (ch))
2498 || (is_blank && ISBLANK (ch))
2499 || (is_cntrl && ISCNTRL (ch)))
7ae68633 2500 SET_LIST_BIT (translated);
fa9a63c5 2501 if ( (is_digit && ISDIGIT (ch))
25fe55af
RS
2502 || (is_graph && ISGRAPH (ch))
2503 || (is_lower && ISLOWER (ch))
2504 || (is_print && ISPRINT (ch)))
7ae68633 2505 SET_LIST_BIT (translated);
fa9a63c5 2506 if ( (is_punct && ISPUNCT (ch))
25fe55af
RS
2507 || (is_space && ISSPACE (ch))
2508 || (is_upper && ISUPPER (ch))
2509 || (is_xdigit && ISXDIGIT (ch)))
7ae68633 2510 SET_LIST_BIT (translated);
f71b19b6
DL
2511 if ( (is_ascii && IS_REAL_ASCII (ch))
2512 || (is_nonascii && !IS_REAL_ASCII (ch))
2513 || (is_unibyte && ISUNIBYTE (ch))
2514 || (is_multibyte && !ISUNIBYTE (ch)))
2515 SET_LIST_BIT (translated);
2516
96cc36cc
RS
2517 if ( (is_word && ISWORD (ch)))
2518 SET_LIST_BIT (translated);
25fe55af 2519 }
b18215fc
RS
2520
2521 /* Repeat the loop. */
2522 continue;
25fe55af
RS
2523 }
2524 else
2525 {
ed0767d8
SM
2526 /* Go back to right after the "[:". */
2527 p = class_beg;
25fe55af 2528 SET_LIST_BIT ('[');
b18215fc
RS
2529
2530 /* Because the `:' may starts the range, we
2531 can't simply set bit and repeat the loop.
25fe55af 2532 Instead, just set it to C and handle below. */
b18215fc 2533 c = ':';
25fe55af
RS
2534 }
2535 }
b18215fc
RS
2536
2537 if (p < pend && p[0] == '-' && p[1] != ']')
2538 {
2539
2540 /* Discard the `-'. */
2541 PATFETCH (c1);
2542
2543 /* Fetch the character which ends the range. */
2544 PATFETCH (c1);
b18215fc 2545
b54f61ed 2546 if (SINGLE_BYTE_CHAR_P (c))
e934739e 2547 {
b54f61ed
KH
2548 if (! SINGLE_BYTE_CHAR_P (c1))
2549 {
2550 /* Handle a range such as \177-\377 in
2551 multibyte mode. Split that into two
2552 ranges, the low one ending at 0237, and
2553 the high one starting at the smallest
2554 character in the charset of C1 and
2555 ending at C1. */
2556 int charset = CHAR_CHARSET (c1);
2557 int c2 = MAKE_CHAR (charset, 0, 0);
2558
2559 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2560 c2, c1);
2561 c1 = 0237;
2562 }
e934739e
RS
2563 }
2564 else if (!SAME_CHARSET_P (c, c1))
b18215fc 2565 FREE_STACK_RETURN (REG_ERANGE);
e318085a 2566 }
25fe55af 2567 else
b18215fc
RS
2568 /* Range from C to C. */
2569 c1 = c;
2570
2571 /* Set the range ... */
2572 if (SINGLE_BYTE_CHAR_P (c))
2573 /* ... into bitmap. */
25fe55af 2574 {
b18215fc
RS
2575 unsigned this_char;
2576 int range_start = c, range_end = c1;
2577
2578 /* If the start is after the end, the range is empty. */
2579 if (range_start > range_end)
2580 {
2581 if (syntax & RE_NO_EMPTY_RANGES)
2582 FREE_STACK_RETURN (REG_ERANGE);
2583 /* Else, repeat the loop. */
2584 }
2585 else
2586 {
2587 for (this_char = range_start; this_char <= range_end;
2588 this_char++)
2589 SET_LIST_BIT (TRANSLATE (this_char));
e934739e 2590 }
25fe55af 2591 }
e318085a 2592 else
b18215fc
RS
2593 /* ... into range table. */
2594 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
2595 }
2596
25fe55af
RS
2597 /* Discard any (non)matching list bytes that are all 0 at the
2598 end of the map. Decrease the map-length byte too. */
2599 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2600 b[-1]--;
2601 b += b[-1];
fa9a63c5 2602
96cc36cc
RS
2603 /* Build real range table from work area. */
2604 if (RANGE_TABLE_WORK_USED (range_table_work)
2605 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
2606 {
2607 int i;
2608 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2609
b18215fc 2610 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
2611 bytes for flags, two for COUNT, and three bytes for
2612 each character. */
2613 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 2614
b18215fc
RS
2615 /* Indicate the existence of range table. */
2616 laststart[1] |= 0x80;
fa9a63c5 2617
96cc36cc
RS
2618 /* Store the character class flag bits into the range table.
2619 If not in emacs, these flag bits are always 0. */
2620 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2621 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2622
b18215fc
RS
2623 STORE_NUMBER_AND_INCR (b, used / 2);
2624 for (i = 0; i < used; i++)
2625 STORE_CHARACTER_AND_INCR
2626 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2627 }
25fe55af
RS
2628 }
2629 break;
fa9a63c5
RM
2630
2631
b18215fc 2632 case '(':
25fe55af
RS
2633 if (syntax & RE_NO_BK_PARENS)
2634 goto handle_open;
2635 else
2636 goto normal_char;
fa9a63c5
RM
2637
2638
25fe55af
RS
2639 case ')':
2640 if (syntax & RE_NO_BK_PARENS)
2641 goto handle_close;
2642 else
2643 goto normal_char;
e318085a
RS
2644
2645
25fe55af
RS
2646 case '\n':
2647 if (syntax & RE_NEWLINE_ALT)
2648 goto handle_alt;
2649 else
2650 goto normal_char;
e318085a
RS
2651
2652
b18215fc 2653 case '|':
25fe55af
RS
2654 if (syntax & RE_NO_BK_VBAR)
2655 goto handle_alt;
2656 else
2657 goto normal_char;
2658
2659
2660 case '{':
2661 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2662 goto handle_interval;
2663 else
2664 goto normal_char;
2665
2666
2667 case '\\':
2668 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2669
2670 /* Do not translate the character after the \, so that we can
2671 distinguish, e.g., \B from \b, even if we normally would
2672 translate, e.g., B to b. */
2673 PATFETCH_RAW (c);
2674
2675 switch (c)
2676 {
2677 case '(':
2678 if (syntax & RE_NO_BK_PARENS)
2679 goto normal_backslash;
2680
2681 handle_open:
505bde11
SM
2682 {
2683 int shy = 0;
2684 if (p+1 < pend)
2685 {
2686 /* Look for a special (?...) construct */
ed0767d8 2687 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 2688 {
ed0767d8 2689 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
2690 PATFETCH (c);
2691 switch (c)
2692 {
2693 case ':': shy = 1; break;
2694 default:
2695 /* Only (?:...) is supported right now. */
2696 FREE_STACK_RETURN (REG_BADPAT);
2697 }
2698 }
505bde11
SM
2699 }
2700
2701 if (!shy)
2702 {
2703 bufp->re_nsub++;
2704 regnum++;
2705 }
25fe55af 2706
99633e97
SM
2707 if (COMPILE_STACK_FULL)
2708 {
2709 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2710 compile_stack_elt_t);
2711 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 2712
99633e97
SM
2713 compile_stack.size <<= 1;
2714 }
25fe55af 2715
99633e97
SM
2716 /* These are the values to restore when we hit end of this
2717 group. They are all relative offsets, so that if the
2718 whole pattern moves because of realloc, they will still
2719 be valid. */
2720 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2721 COMPILE_STACK_TOP.fixup_alt_jump
2722 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2723 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2724 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
2725
2726 /* Do not push a
2727 start_memory for groups beyond the last one we can
2728 represent in the compiled pattern. */
2729 if (regnum <= MAX_REGNUM && !shy)
2730 BUF_PUSH_2 (start_memory, regnum);
2731
2732 compile_stack.avail++;
2733
2734 fixup_alt_jump = 0;
2735 laststart = 0;
2736 begalt = b;
2737 /* If we've reached MAX_REGNUM groups, then this open
2738 won't actually generate any code, so we'll have to
2739 clear pending_exact explicitly. */
2740 pending_exact = 0;
2741 break;
505bde11 2742 }
25fe55af
RS
2743
2744 case ')':
2745 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2746
2747 if (COMPILE_STACK_EMPTY)
505bde11
SM
2748 {
2749 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2750 goto normal_backslash;
2751 else
2752 FREE_STACK_RETURN (REG_ERPAREN);
2753 }
25fe55af
RS
2754
2755 handle_close:
505bde11 2756 FIXUP_ALT_JUMP ();
25fe55af
RS
2757
2758 /* See similar code for backslashed left paren above. */
2759 if (COMPILE_STACK_EMPTY)
505bde11
SM
2760 {
2761 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2762 goto normal_char;
2763 else
2764 FREE_STACK_RETURN (REG_ERPAREN);
2765 }
25fe55af
RS
2766
2767 /* Since we just checked for an empty stack above, this
2768 ``can't happen''. */
2769 assert (compile_stack.avail != 0);
2770 {
2771 /* We don't just want to restore into `regnum', because
2772 later groups should continue to be numbered higher,
2773 as in `(ab)c(de)' -- the second group is #2. */
2774 regnum_t this_group_regnum;
2775
2776 compile_stack.avail--;
2777 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2778 fixup_alt_jump
2779 = COMPILE_STACK_TOP.fixup_alt_jump
2780 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2781 : 0;
2782 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2783 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
2784 /* If we've reached MAX_REGNUM groups, then this open
2785 won't actually generate any code, so we'll have to
2786 clear pending_exact explicitly. */
2787 pending_exact = 0;
e318085a 2788
25fe55af
RS
2789 /* We're at the end of the group, so now we know how many
2790 groups were inside this one. */
505bde11
SM
2791 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
2792 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
2793 }
2794 break;
2795
2796
2797 case '|': /* `\|'. */
2798 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2799 goto normal_backslash;
2800 handle_alt:
2801 if (syntax & RE_LIMITED_OPS)
2802 goto normal_char;
2803
2804 /* Insert before the previous alternative a jump which
2805 jumps to this alternative if the former fails. */
2806 GET_BUFFER_SPACE (3);
2807 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2808 pending_exact = 0;
2809 b += 3;
2810
2811 /* The alternative before this one has a jump after it
2812 which gets executed if it gets matched. Adjust that
2813 jump so it will jump to this alternative's analogous
2814 jump (put in below, which in turn will jump to the next
2815 (if any) alternative's such jump, etc.). The last such
2816 jump jumps to the correct final destination. A picture:
2817 _____ _____
2818 | | | |
2819 | v | v
2820 a | b | c
2821
2822 If we are at `b', then fixup_alt_jump right now points to a
2823 three-byte space after `a'. We'll put in the jump, set
2824 fixup_alt_jump to right after `b', and leave behind three
2825 bytes which we'll fill in when we get to after `c'. */
2826
505bde11 2827 FIXUP_ALT_JUMP ();
25fe55af
RS
2828
2829 /* Mark and leave space for a jump after this alternative,
2830 to be filled in later either by next alternative or
2831 when know we're at the end of a series of alternatives. */
2832 fixup_alt_jump = b;
2833 GET_BUFFER_SPACE (3);
2834 b += 3;
2835
2836 laststart = 0;
2837 begalt = b;
2838 break;
2839
2840
2841 case '{':
2842 /* If \{ is a literal. */
2843 if (!(syntax & RE_INTERVALS)
2844 /* If we're at `\{' and it's not the open-interval
2845 operator. */
4bb91c68 2846 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
2847 goto normal_backslash;
2848
2849 handle_interval:
2850 {
2851 /* If got here, then the syntax allows intervals. */
2852
2853 /* At least (most) this many matches must be made. */
99633e97 2854 int lower_bound = 0, upper_bound = -1;
25fe55af 2855
ed0767d8 2856 beg_interval = p;
25fe55af
RS
2857
2858 if (p == pend)
4bb91c68 2859 FREE_STACK_RETURN (REG_EBRACE);
25fe55af
RS
2860
2861 GET_UNSIGNED_NUMBER (lower_bound);
2862
2863 if (c == ',')
ed0767d8 2864 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
2865 else
2866 /* Interval such as `{1}' => match exactly once. */
2867 upper_bound = lower_bound;
2868
2869 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 2870 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 2871 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
2872
2873 if (!(syntax & RE_NO_BK_BRACES))
2874 {
4bb91c68
SM
2875 if (c != '\\')
2876 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
2877
2878 PATFETCH (c);
2879 }
2880
2881 if (c != '}')
4bb91c68 2882 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
2883
2884 /* We just parsed a valid interval. */
2885
2886 /* If it's invalid to have no preceding re. */
2887 if (!laststart)
2888 {
2889 if (syntax & RE_CONTEXT_INVALID_OPS)
2890 FREE_STACK_RETURN (REG_BADRPT);
2891 else if (syntax & RE_CONTEXT_INDEP_OPS)
2892 laststart = b;
2893 else
2894 goto unfetch_interval;
2895 }
2896
25fe55af 2897 if (upper_bound == 0)
ed0767d8
SM
2898 /* If the upper bound is zero, just drop the sub pattern
2899 altogether. */
2900 b = laststart;
2901 else if (lower_bound == 1 && upper_bound == 1)
2902 /* Just match it once: nothing to do here. */
2903 ;
25fe55af
RS
2904
2905 /* Otherwise, we have a nontrivial interval. When
2906 we're all done, the pattern will look like:
2907 set_number_at <jump count> <upper bound>
2908 set_number_at <succeed_n count> <lower bound>
2909 succeed_n <after jump addr> <succeed_n count>
2910 <body of loop>
2911 jump_n <succeed_n addr> <jump count>
2912 (The upper bound and `jump_n' are omitted if
2913 `upper_bound' is 1, though.) */
2914 else
2915 { /* If the upper bound is > 1, we need to insert
2916 more at the end of the loop. */
ed0767d8
SM
2917 unsigned int nbytes = (upper_bound < 0 ? 3
2918 : upper_bound > 1 ? 5 : 0);
2919 unsigned int startoffset = 0;
2920
2921 GET_BUFFER_SPACE (20); /* We might use less. */
2922
2923 if (lower_bound == 0)
2924 {
2925 /* A succeed_n that starts with 0 is really a
2926 a simple on_failure_jump_loop. */
2927 INSERT_JUMP (on_failure_jump_loop, laststart,
2928 b + 3 + nbytes);
2929 b += 3;
2930 }
2931 else
2932 {
2933 /* Initialize lower bound of the `succeed_n', even
2934 though it will be set during matching by its
2935 attendant `set_number_at' (inserted next),
2936 because `re_compile_fastmap' needs to know.
2937 Jump to the `jump_n' we might insert below. */
2938 INSERT_JUMP2 (succeed_n, laststart,
2939 b + 5 + nbytes,
2940 lower_bound);
2941 b += 5;
2942
2943 /* Code to initialize the lower bound. Insert
2944 before the `succeed_n'. The `5' is the last two
2945 bytes of this `set_number_at', plus 3 bytes of
2946 the following `succeed_n'. */
2947 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2948 b += 5;
2949 startoffset += 5;
2950 }
2951
2952 if (upper_bound < 0)
2953 {
2954 /* A negative upper bound stands for infinity,
2955 in which case it degenerates to a plain jump. */
2956 STORE_JUMP (jump, b, laststart + startoffset);
2957 b += 3;
2958 }
2959 else if (upper_bound > 1)
25fe55af
RS
2960 { /* More than one repetition is allowed, so
2961 append a backward jump to the `succeed_n'
2962 that starts this interval.
2963
2964 When we've reached this during matching,
2965 we'll have matched the interval once, so
2966 jump back only `upper_bound - 1' times. */
ed0767d8 2967 STORE_JUMP2 (jump_n, b, laststart + startoffset,
25fe55af
RS
2968 upper_bound - 1);
2969 b += 5;
2970
2971 /* The location we want to set is the second
2972 parameter of the `jump_n'; that is `b-2' as
2973 an absolute address. `laststart' will be
2974 the `set_number_at' we're about to insert;
2975 `laststart+3' the number to set, the source
2976 for the relative address. But we are
2977 inserting into the middle of the pattern --
2978 so everything is getting moved up by 5.
2979 Conclusion: (b - 2) - (laststart + 3) + 5,
2980 i.e., b - laststart.
2981
2982 We insert this at the beginning of the loop
2983 so that if we fail during matching, we'll
2984 reinitialize the bounds. */
2985 insert_op2 (set_number_at, laststart, b - laststart,
2986 upper_bound - 1, b);
2987 b += 5;
2988 }
2989 }
2990 pending_exact = 0;
2991 beg_interval = NULL;
2992 }
2993 break;
2994
2995 unfetch_interval:
2996 /* If an invalid interval, match the characters as literals. */
2997 assert (beg_interval);
2998 p = beg_interval;
2999 beg_interval = NULL;
3000
3001 /* normal_char and normal_backslash need `c'. */
ed0767d8 3002 c = '{';
25fe55af
RS
3003
3004 if (!(syntax & RE_NO_BK_BRACES))
3005 {
ed0767d8
SM
3006 assert (p > pattern && p[-1] == '\\');
3007 goto normal_backslash;
25fe55af 3008 }
ed0767d8
SM
3009 else
3010 goto normal_char;
e318085a 3011
b18215fc 3012#ifdef emacs
25fe55af
RS
3013 /* There is no way to specify the before_dot and after_dot
3014 operators. rms says this is ok. --karl */
3015 case '=':
3016 BUF_PUSH (at_dot);
3017 break;
3018
3019 case 's':
3020 laststart = b;
3021 PATFETCH (c);
3022 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3023 break;
3024
3025 case 'S':
3026 laststart = b;
3027 PATFETCH (c);
3028 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3029 break;
b18215fc
RS
3030
3031 case 'c':
3032 laststart = b;
3033 PATFETCH_RAW (c);
3034 BUF_PUSH_2 (categoryspec, c);
3035 break;
e318085a 3036
b18215fc
RS
3037 case 'C':
3038 laststart = b;
3039 PATFETCH_RAW (c);
3040 BUF_PUSH_2 (notcategoryspec, c);
3041 break;
3042#endif /* emacs */
e318085a 3043
e318085a 3044
25fe55af 3045 case 'w':
4bb91c68
SM
3046 if (syntax & RE_NO_GNU_OPS)
3047 goto normal_char;
25fe55af 3048 laststart = b;
1fb352e0 3049 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3050 break;
e318085a 3051
e318085a 3052
25fe55af 3053 case 'W':
4bb91c68
SM
3054 if (syntax & RE_NO_GNU_OPS)
3055 goto normal_char;
25fe55af 3056 laststart = b;
1fb352e0 3057 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3058 break;
e318085a
RS
3059
3060
25fe55af 3061 case '<':
4bb91c68
SM
3062 if (syntax & RE_NO_GNU_OPS)
3063 goto normal_char;
25fe55af
RS
3064 BUF_PUSH (wordbeg);
3065 break;
e318085a 3066
25fe55af 3067 case '>':
4bb91c68
SM
3068 if (syntax & RE_NO_GNU_OPS)
3069 goto normal_char;
25fe55af
RS
3070 BUF_PUSH (wordend);
3071 break;
e318085a 3072
25fe55af 3073 case 'b':
4bb91c68
SM
3074 if (syntax & RE_NO_GNU_OPS)
3075 goto normal_char;
25fe55af
RS
3076 BUF_PUSH (wordbound);
3077 break;
e318085a 3078
25fe55af 3079 case 'B':
4bb91c68
SM
3080 if (syntax & RE_NO_GNU_OPS)
3081 goto normal_char;
25fe55af
RS
3082 BUF_PUSH (notwordbound);
3083 break;
fa9a63c5 3084
25fe55af 3085 case '`':
4bb91c68
SM
3086 if (syntax & RE_NO_GNU_OPS)
3087 goto normal_char;
25fe55af
RS
3088 BUF_PUSH (begbuf);
3089 break;
e318085a 3090
25fe55af 3091 case '\'':
4bb91c68
SM
3092 if (syntax & RE_NO_GNU_OPS)
3093 goto normal_char;
25fe55af
RS
3094 BUF_PUSH (endbuf);
3095 break;
e318085a 3096
25fe55af
RS
3097 case '1': case '2': case '3': case '4': case '5':
3098 case '6': case '7': case '8': case '9':
3099 if (syntax & RE_NO_BK_REFS)
3100 goto normal_char;
e318085a 3101
25fe55af 3102 c1 = c - '0';
e318085a 3103
25fe55af
RS
3104 if (c1 > regnum)
3105 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3106
25fe55af 3107 /* Can't back reference to a subexpression if inside of it. */
4bb91c68 3108 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
25fe55af 3109 goto normal_char;
e318085a 3110
25fe55af
RS
3111 laststart = b;
3112 BUF_PUSH_2 (duplicate, c1);
3113 break;
e318085a 3114
e318085a 3115
25fe55af
RS
3116 case '+':
3117 case '?':
3118 if (syntax & RE_BK_PLUS_QM)
3119 goto handle_plus;
3120 else
3121 goto normal_backslash;
3122
3123 default:
3124 normal_backslash:
3125 /* You might think it would be useful for \ to mean
3126 not to translate; but if we don't translate it
4bb91c68 3127 it will never match anything. */
25fe55af
RS
3128 c = TRANSLATE (c);
3129 goto normal_char;
3130 }
3131 break;
fa9a63c5
RM
3132
3133
3134 default:
25fe55af 3135 /* Expects the character in `c'. */
fa9a63c5
RM
3136 normal_char:
3137 /* If no exactn currently being built. */
25fe55af 3138 if (!pending_exact
fa9a63c5 3139
25fe55af
RS
3140 /* If last exactn not at current position. */
3141 || pending_exact + *pending_exact + 1 != b
5e69f11e 3142
25fe55af 3143 /* We have only one byte following the exactn for the count. */
2d1675e4 3144 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3145
25fe55af 3146 /* If followed by a repetition operator. */
9d99031f 3147 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3148 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3149 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3150 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3151 || ((syntax & RE_INTERVALS)
25fe55af 3152 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3153 ? p != pend && *p == '{'
3154 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3155 {
3156 /* Start building a new exactn. */
5e69f11e 3157
25fe55af 3158 laststart = b;
fa9a63c5
RM
3159
3160 BUF_PUSH_2 (exactn, 0);
3161 pending_exact = b - 1;
25fe55af 3162 }
5e69f11e 3163
2d1675e4
SM
3164 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3165 {
e0277a47
KH
3166 int len;
3167
3168 if (multibyte)
3169 len = CHAR_STRING (c, b);
3170 else
3171 *b = c, len = 1;
2d1675e4
SM
3172 b += len;
3173 (*pending_exact) += len;
3174 }
3175
fa9a63c5 3176 break;
25fe55af 3177 } /* switch (c) */
fa9a63c5
RM
3178 } /* while p != pend */
3179
5e69f11e 3180
fa9a63c5 3181 /* Through the pattern now. */
5e69f11e 3182
505bde11 3183 FIXUP_ALT_JUMP ();
fa9a63c5 3184
5e69f11e 3185 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3186 FREE_STACK_RETURN (REG_EPAREN);
3187
3188 /* If we don't want backtracking, force success
3189 the first time we reach the end of the compiled pattern. */
3190 if (syntax & RE_NO_POSIX_BACKTRACKING)
3191 BUF_PUSH (succeed);
3192
3193 free (compile_stack.stack);
3194
3195 /* We have succeeded; set the length of the buffer. */
3196 bufp->used = b - bufp->buffer;
3197
3198#ifdef DEBUG
99633e97 3199 if (debug > 0)
fa9a63c5 3200 {
505bde11 3201 re_compile_fastmap (bufp);
fa9a63c5
RM
3202 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3203 print_compiled_pattern (bufp);
3204 }
99633e97 3205 debug--;
fa9a63c5
RM
3206#endif /* DEBUG */
3207
3208#ifndef MATCH_MAY_ALLOCATE
3209 /* Initialize the failure stack to the largest possible stack. This
3210 isn't necessary unless we're trying to avoid calling alloca in
3211 the search and match routines. */
3212 {
3213 int num_regs = bufp->re_nsub + 1;
3214
320a2a73 3215 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3216 {
a26f4ccd 3217 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3218
fa9a63c5
RM
3219 if (! fail_stack.stack)
3220 fail_stack.stack
5e69f11e 3221 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3222 * sizeof (fail_stack_elt_t));
3223 else
3224 fail_stack.stack
3225 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3226 (fail_stack.size
3227 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3228 }
3229
3230 regex_grow_registers (num_regs);
3231 }
3232#endif /* not MATCH_MAY_ALLOCATE */
3233
3234 return REG_NOERROR;
3235} /* regex_compile */
3236\f
3237/* Subroutines for `regex_compile'. */
3238
25fe55af 3239/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3240
3241static void
3242store_op1 (op, loc, arg)
3243 re_opcode_t op;
3244 unsigned char *loc;
3245 int arg;
3246{
3247 *loc = (unsigned char) op;
3248 STORE_NUMBER (loc + 1, arg);
3249}
3250
3251
3252/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3253
3254static void
3255store_op2 (op, loc, arg1, arg2)
3256 re_opcode_t op;
3257 unsigned char *loc;
3258 int arg1, arg2;
3259{
3260 *loc = (unsigned char) op;
3261 STORE_NUMBER (loc + 1, arg1);
3262 STORE_NUMBER (loc + 3, arg2);
3263}
3264
3265
3266/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3267 for OP followed by two-byte integer parameter ARG. */
3268
3269static void
3270insert_op1 (op, loc, arg, end)
3271 re_opcode_t op;
3272 unsigned char *loc;
3273 int arg;
5e69f11e 3274 unsigned char *end;
fa9a63c5
RM
3275{
3276 register unsigned char *pfrom = end;
3277 register unsigned char *pto = end + 3;
3278
3279 while (pfrom != loc)
3280 *--pto = *--pfrom;
5e69f11e 3281
fa9a63c5
RM
3282 store_op1 (op, loc, arg);
3283}
3284
3285
3286/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3287
3288static void
3289insert_op2 (op, loc, arg1, arg2, end)
3290 re_opcode_t op;
3291 unsigned char *loc;
3292 int arg1, arg2;
5e69f11e 3293 unsigned char *end;
fa9a63c5
RM
3294{
3295 register unsigned char *pfrom = end;
3296 register unsigned char *pto = end + 5;
3297
3298 while (pfrom != loc)
3299 *--pto = *--pfrom;
5e69f11e 3300
fa9a63c5
RM
3301 store_op2 (op, loc, arg1, arg2);
3302}
3303
3304
3305/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3306 after an alternative or a begin-subexpression. We assume there is at
3307 least one character before the ^. */
3308
3309static boolean
3310at_begline_loc_p (pattern, p, syntax)
99633e97 3311 const unsigned char *pattern, *p;
fa9a63c5
RM
3312 reg_syntax_t syntax;
3313{
4e8a9132 3314 const unsigned char *prev = p - 2;
fa9a63c5 3315 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3316
fa9a63c5
RM
3317 return
3318 /* After a subexpression? */
3319 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3320 /* After an alternative? */
d2af47df
SM
3321 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3322 /* After a shy subexpression? */
3323 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3324 && prev[-1] == '?' && prev[-2] == '('
3325 && (syntax & RE_NO_BK_PARENS
3326 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3327}
3328
3329
3330/* The dual of at_begline_loc_p. This one is for $. We assume there is
3331 at least one character after the $, i.e., `P < PEND'. */
3332
3333static boolean
3334at_endline_loc_p (p, pend, syntax)
99633e97
SM
3335 const unsigned char *p, *pend;
3336 reg_syntax_t syntax;
fa9a63c5 3337{
4e8a9132 3338 const unsigned char *next = p;
fa9a63c5 3339 boolean next_backslash = *next == '\\';
4e8a9132 3340 const unsigned char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3341
fa9a63c5
RM
3342 return
3343 /* Before a subexpression? */
3344 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3345 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3346 /* Before an alternative? */
3347 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3348 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3349}
3350
3351
5e69f11e 3352/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3353 false if it's not. */
3354
3355static boolean
3356group_in_compile_stack (compile_stack, regnum)
3357 compile_stack_type compile_stack;
3358 regnum_t regnum;
3359{
3360 int this_element;
3361
5e69f11e
RM
3362 for (this_element = compile_stack.avail - 1;
3363 this_element >= 0;
fa9a63c5
RM
3364 this_element--)
3365 if (compile_stack.stack[this_element].regnum == regnum)
3366 return true;
3367
3368 return false;
3369}
fa9a63c5 3370\f
f6a3f532
SM
3371/* analyse_first.
3372 If fastmap is non-NULL, go through the pattern and fill fastmap
3373 with all the possible leading chars. If fastmap is NULL, don't
3374 bother filling it up (obviously) and only return whether the
3375 pattern could potentially match the empty string.
3376
3377 Return 1 if p..pend might match the empty string.
3378 Return 0 if p..pend matches at least one char.
3379 Return -1 if p..pend matches at least one char, but fastmap was not
3380 updated accurately.
3381 Return -2 if an error occurred. */
3382
3383static int
3384analyse_first (p, pend, fastmap, multibyte)
3385 unsigned char *p, *pend;
3386 char *fastmap;
3387 const int multibyte;
fa9a63c5 3388{
505bde11 3389 int j, k;
1fb352e0 3390 boolean not;
fa9a63c5
RM
3391#ifdef MATCH_MAY_ALLOCATE
3392 fail_stack_type fail_stack;
3393#endif
3394#ifndef REGEX_MALLOC
3395 char *destination;
3396#endif
5e69f11e 3397
0b32bf0e 3398#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
3399 /* This holds the pointer to the failure stack, when
3400 it is allocated relocatably. */
3401 fail_stack_elt_t *failure_stack_ptr;
99633e97 3402#endif
fa9a63c5
RM
3403
3404 /* Assume that each path through the pattern can be null until
4bb91c68 3405 proven otherwise. We set this false at the bottom of switch
fa9a63c5
RM
3406 statement, to which we get only if a particular path doesn't
3407 match the empty string. */
3408 boolean path_can_be_null = true;
3409
b18215fc 3410 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3411 flag is set true. */
b18215fc
RS
3412 boolean match_any_multibyte_characters = false;
3413
f6a3f532 3414 assert (p);
5e69f11e 3415
fa9a63c5 3416 INIT_FAIL_STACK ();
5e69f11e 3417
505bde11
SM
3418 /* The loop below works as follows:
3419 - It has a working-list kept in the PATTERN_STACK and which basically
3420 starts by only containing a pointer to the first operation.
3421 - If the opcode we're looking at is a match against some set of
3422 chars, then we add those chars to the fastmap and go on to the
3423 next work element from the worklist (done via `break').
3424 - If the opcode is a control operator on the other hand, we either
3425 ignore it (if it's meaningless at this point, such as `start_memory')
3426 or execute it (if it's a jump). If the jump has several destinations
3427 (i.e. `on_failure_jump'), then we push the other destination onto the
3428 worklist.
3429 We guarantee termination by ignoring backward jumps (more or less),
3430 so that `p' is monotonically increasing. More to the point, we
3431 never set `p' (or push) anything `<= p1'. */
3432
f6a3f532 3433 while (1)
fa9a63c5 3434 {
505bde11
SM
3435 /* `p1' is used as a marker of how far back a `on_failure_jump'
3436 can go without being ignored. It is normally equal to `p'
3437 (which prevents any backward `on_failure_jump') except right
3438 after a plain `jump', to allow patterns such as:
3439 0: jump 10
3440 3..9: <body>
3441 10: on_failure_jump 3
3442 as used for the *? operator. */
3443 unsigned char *p1 = p;
3444
f6a3f532 3445 if (p >= pend)
fa9a63c5 3446 {
f6a3f532
SM
3447 if (path_can_be_null)
3448 return (RESET_FAIL_STACK (), 1);
fa9a63c5 3449
f6a3f532
SM
3450 /* We have reached the (effective) end of pattern. */
3451 if (PATTERN_STACK_EMPTY ())
3452 return (RESET_FAIL_STACK (), 0);
fa9a63c5 3453
f6a3f532
SM
3454 p = (unsigned char*) POP_PATTERN_OP ();
3455 path_can_be_null = true;
3456 continue;
fa9a63c5
RM
3457 }
3458
25fe55af 3459 /* We should never be about to go beyond the end of the pattern. */
fa9a63c5 3460 assert (p < pend);
5e69f11e 3461
fa9a63c5
RM
3462 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3463 {
f6a3f532
SM
3464 case succeed:
3465 p = pend;
3466 continue;
fa9a63c5 3467
fa9a63c5 3468 case duplicate:
505bde11
SM
3469 /* If the first character has to match a backreference, that means
3470 that the group was empty (since it already matched). Since this
3471 is the only case that interests us here, we can assume that the
3472 backreference must match the empty string. */
3473 p++;
3474 continue;
fa9a63c5
RM
3475
3476
3477 /* Following are the cases which match a character. These end
25fe55af 3478 with `break'. */
fa9a63c5
RM
3479
3480 case exactn:
e0277a47
KH
3481 if (fastmap)
3482 {
3483 int c = RE_STRING_CHAR (p + 1, pend - p);
3484
3485 if (SINGLE_BYTE_CHAR_P (c))
3486 fastmap[c] = 1;
3487 else
3488 fastmap[p[1]] = 1;
3489 }
fa9a63c5
RM
3490 break;
3491
3492
1fb352e0
SM
3493 case anychar:
3494 /* We could put all the chars except for \n (and maybe \0)
3495 but we don't bother since it is generally not worth it. */
f6a3f532
SM
3496 if (!fastmap) break;
3497 return (RESET_FAIL_STACK (), -1);
fa9a63c5
RM
3498
3499
b18215fc 3500 case charset_not:
ba5c004d
RS
3501 /* Chars beyond end of bitmap are possible matches.
3502 All the single-byte codes can occur in multibyte buffers.
3503 So any that are not listed in the charset
3504 are possible matches, even in multibyte buffers. */
1fb352e0 3505 if (!fastmap) break;
b18215fc 3506 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3507 j < (1 << BYTEWIDTH); j++)
b18215fc 3508 fastmap[j] = 1;
1fb352e0
SM
3509 /* Fallthrough */
3510 case charset:
3511 if (!fastmap) break;
3512 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3513 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3514 j >= 0; j--)
1fb352e0 3515 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
b18215fc
RS
3516 fastmap[j] = 1;
3517
1fb352e0
SM
3518 if ((not && multibyte)
3519 /* Any character set can possibly contain a character
3520 which doesn't match the specified set of characters. */
3521 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3522 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3523 /* If we can match a character class, we can match
3524 any character set. */
b18215fc
RS
3525 {
3526 set_fastmap_for_multibyte_characters:
3527 if (match_any_multibyte_characters == false)
3528 {
3529 for (j = 0x80; j < 0xA0; j++) /* XXX */
3530 if (BASE_LEADING_CODE_P (j))
3531 fastmap[j] = 1;
3532 match_any_multibyte_characters = true;
3533 }
3534 }
b18215fc 3535
1fb352e0
SM
3536 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3537 && match_any_multibyte_characters == false)
3538 {
3539 /* Set fastmap[I] 1 where I is a base leading code of each
3540 multibyte character in the range table. */
3541 int c, count;
b18215fc 3542
1fb352e0 3543 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3544 bits for a character class. */
1fb352e0 3545 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3546
1fb352e0
SM
3547 /* Extract the number of ranges in range table into COUNT. */
3548 EXTRACT_NUMBER_AND_INCR (count, p);
3549 for (; count > 0; count--, p += 2 * 3) /* XXX */
3550 {
3551 /* Extract the start of each range. */
3552 EXTRACT_CHARACTER (c, p);
3553 j = CHAR_CHARSET (c);
3554 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3555 }
3556 }
b18215fc
RS
3557 break;
3558
1fb352e0
SM
3559 case syntaxspec:
3560 case notsyntaxspec:
3561 if (!fastmap) break;
3562#ifndef emacs
3563 not = (re_opcode_t)p[-1] == notsyntaxspec;
3564 k = *p++;
3565 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3566 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3567 fastmap[j] = 1;
b18215fc 3568 break;
1fb352e0 3569#else /* emacs */
b18215fc
RS
3570 /* This match depends on text properties. These end with
3571 aborting optimizations. */
f6a3f532 3572 return (RESET_FAIL_STACK (), -1);
b18215fc
RS
3573
3574 case categoryspec:
b18215fc 3575 case notcategoryspec:
1fb352e0
SM
3576 if (!fastmap) break;
3577 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 3578 k = *p++;
1fb352e0
SM
3579 for (j = 0; j < (1 << BYTEWIDTH); j++)
3580 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
3581 fastmap[j] = 1;
3582
1fb352e0 3583 if (multibyte)
b18215fc 3584 /* Any character set can possibly contain a character
1fb352e0 3585 whose category is K (or not). */
b18215fc
RS
3586 goto set_fastmap_for_multibyte_characters;
3587 break;
3588
fa9a63c5 3589 /* All cases after this match the empty string. These end with
25fe55af 3590 `continue'. */
fa9a63c5 3591
fa9a63c5
RM
3592 case before_dot:
3593 case at_dot:
3594 case after_dot:
1fb352e0 3595#endif /* !emacs */
25fe55af
RS
3596 case no_op:
3597 case begline:
3598 case endline:
fa9a63c5
RM
3599 case begbuf:
3600 case endbuf:
3601 case wordbound:
3602 case notwordbound:
3603 case wordbeg:
3604 case wordend:
25fe55af 3605 continue;
fa9a63c5
RM
3606
3607
fa9a63c5 3608 case jump:
25fe55af 3609 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
3610 if (j < 0)
3611 /* Backward jumps can only go back to code that we've already
3612 visited. `re_compile' should make sure this is true. */
3613 break;
25fe55af 3614 p += j;
505bde11
SM
3615 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
3616 {
3617 case on_failure_jump:
3618 case on_failure_keep_string_jump:
505bde11 3619 case on_failure_jump_loop:
0683b6fa 3620 case on_failure_jump_nastyloop:
505bde11
SM
3621 case on_failure_jump_smart:
3622 p++;
3623 break;
3624 default:
3625 continue;
3626 };
3627 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3628 to jump back to "just after here". */
3629 /* Fallthrough */
fa9a63c5 3630
25fe55af
RS
3631 case on_failure_jump:
3632 case on_failure_keep_string_jump:
0683b6fa 3633 case on_failure_jump_nastyloop:
505bde11
SM
3634 case on_failure_jump_loop:
3635 case on_failure_jump_smart:
25fe55af 3636 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 3637 if (p + j <= p1)
ed0767d8
SM
3638 ; /* Backward jump to be ignored. */
3639 else if (!PUSH_PATTERN_OP (p + j, fail_stack))
3640 return (RESET_FAIL_STACK (), -2);
25fe55af 3641 continue;
fa9a63c5
RM
3642
3643
ed0767d8
SM
3644 case jump_n:
3645 /* This code simply does not properly handle forward jump_n. */
3646 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
3647 p += 4;
3648 /* jump_n can either jump or fall through. The (backward) jump
3649 case has already been handled, so we only need to look at the
3650 fallthrough case. */
3651 continue;
3652
fa9a63c5 3653 case succeed_n:
ed0767d8
SM
3654 /* If N == 0, it should be an on_failure_jump_loop instead. */
3655 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
3656 p += 4;
3657 /* We only care about one iteration of the loop, so we don't
3658 need to consider the case where this behaves like an
3659 on_failure_jump. */
25fe55af 3660 continue;
fa9a63c5
RM
3661
3662
3663 case set_number_at:
25fe55af
RS
3664 p += 4;
3665 continue;
fa9a63c5
RM
3666
3667
3668 case start_memory:
25fe55af 3669 case stop_memory:
505bde11 3670 p += 1;
fa9a63c5
RM
3671 continue;
3672
3673
3674 default:
25fe55af
RS
3675 abort (); /* We have listed all the cases. */
3676 } /* switch *p++ */
fa9a63c5
RM
3677
3678 /* Getting here means we have found the possible starting
25fe55af
RS
3679 characters for one path of the pattern -- and that the empty
3680 string does not match. We need not follow this path further.
3681 Instead, look at the next alternative (remembered on the
3682 stack), or quit if no more. The test at the top of the loop
3683 does these things. */
fa9a63c5
RM
3684 path_can_be_null = false;
3685 p = pend;
3686 } /* while p */
3687
f6a3f532
SM
3688 return (RESET_FAIL_STACK (), 0);
3689} /* analyse_first */
3690\f
3691/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3692 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3693 characters can start a string that matches the pattern. This fastmap
3694 is used by re_search to skip quickly over impossible starting points.
3695
3696 Character codes above (1 << BYTEWIDTH) are not represented in the
3697 fastmap, but the leading codes are represented. Thus, the fastmap
3698 indicates which character sets could start a match.
3699
3700 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3701 area as BUFP->fastmap.
3702
3703 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3704 the pattern buffer.
3705
3706 Returns 0 if we succeed, -2 if an internal error. */
3707
3708int
3709re_compile_fastmap (bufp)
3710 struct re_pattern_buffer *bufp;
3711{
3712 char *fastmap = bufp->fastmap;
3713 int analysis;
3714
3715 assert (fastmap && bufp->buffer);
3716
3717 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3718 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3719
3720 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 3721 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 3722 bufp->can_be_null = (analysis != 0);
f6a3f532
SM
3723 if (analysis < -1)
3724 return analysis;
fa9a63c5
RM
3725 return 0;
3726} /* re_compile_fastmap */
3727\f
3728/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3729 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3730 this memory for recording register information. STARTS and ENDS
3731 must be allocated using the malloc library routine, and must each
3732 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3733
3734 If NUM_REGS == 0, then subsequent matches should allocate their own
3735 register data.
3736
3737 Unless this function is called, the first search or match using
3738 PATTERN_BUFFER will allocate its own register data, without
3739 freeing the old data. */
3740
3741void
3742re_set_registers (bufp, regs, num_regs, starts, ends)
3743 struct re_pattern_buffer *bufp;
3744 struct re_registers *regs;
3745 unsigned num_regs;
3746 regoff_t *starts, *ends;
3747{
3748 if (num_regs)
3749 {
3750 bufp->regs_allocated = REGS_REALLOCATE;
3751 regs->num_regs = num_regs;
3752 regs->start = starts;
3753 regs->end = ends;
3754 }
3755 else
3756 {
3757 bufp->regs_allocated = REGS_UNALLOCATED;
3758 regs->num_regs = 0;
3759 regs->start = regs->end = (regoff_t *) 0;
3760 }
3761}
c0f9ea08 3762WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 3763\f
25fe55af 3764/* Searching routines. */
fa9a63c5
RM
3765
3766/* Like re_search_2, below, but only one string is specified, and
3767 doesn't let you say where to stop matching. */
3768
3769int
3770re_search (bufp, string, size, startpos, range, regs)
3771 struct re_pattern_buffer *bufp;
3772 const char *string;
3773 int size, startpos, range;
3774 struct re_registers *regs;
3775{
5e69f11e 3776 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
3777 regs, size);
3778}
c0f9ea08 3779WEAK_ALIAS (__re_search, re_search)
fa9a63c5 3780
b18215fc
RS
3781/* End address of virtual concatenation of string. */
3782#define STOP_ADDR_VSTRING(P) \
3783 (((P) >= size1 ? string2 + size2 : string1 + size1))
3784
3785/* Address of POS in the concatenation of virtual string. */
3786#define POS_ADDR_VSTRING(POS) \
3787 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
3788
3789/* Using the compiled pattern in BUFP->buffer, first tries to match the
3790 virtual concatenation of STRING1 and STRING2, starting first at index
3791 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 3792
fa9a63c5 3793 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 3794
fa9a63c5
RM
3795 RANGE is how far to scan while trying to match. RANGE = 0 means try
3796 only at STARTPOS; in general, the last start tried is STARTPOS +
3797 RANGE.
5e69f11e 3798
fa9a63c5
RM
3799 In REGS, return the indices of the virtual concatenation of STRING1
3800 and STRING2 that matched the entire BUFP->buffer and its contained
3801 subexpressions.
5e69f11e 3802
fa9a63c5
RM
3803 Do not consider matching one past the index STOP in the virtual
3804 concatenation of STRING1 and STRING2.
3805
3806 We return either the position in the strings at which the match was
3807 found, -1 if no match, or -2 if error (such as failure
3808 stack overflow). */
3809
3810int
66f0296e 3811re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 3812 struct re_pattern_buffer *bufp;
66f0296e 3813 const char *str1, *str2;
fa9a63c5
RM
3814 int size1, size2;
3815 int startpos;
3816 int range;
3817 struct re_registers *regs;
3818 int stop;
3819{
3820 int val;
66f0296e
SM
3821 re_char *string1 = (re_char*) str1;
3822 re_char *string2 = (re_char*) str2;
fa9a63c5 3823 register char *fastmap = bufp->fastmap;
6676cb1c 3824 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
3825 int total_size = size1 + size2;
3826 int endpos = startpos + range;
c0f9ea08 3827 boolean anchored_start;
fa9a63c5 3828
25fe55af 3829 /* Nonzero if we have to concern multibyte character. */
2d1675e4 3830 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 3831
fa9a63c5
RM
3832 /* Check for out-of-range STARTPOS. */
3833 if (startpos < 0 || startpos > total_size)
3834 return -1;
5e69f11e 3835
fa9a63c5 3836 /* Fix up RANGE if it might eventually take us outside
34597fa9 3837 the virtual concatenation of STRING1 and STRING2.
5e69f11e 3838 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
3839 if (endpos < 0)
3840 range = 0 - startpos;
fa9a63c5
RM
3841 else if (endpos > total_size)
3842 range = total_size - startpos;
3843
3844 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 3845 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
3846 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3847 {
3848 if (startpos > 0)
3849 return -1;
3850 else
7b140fd7 3851 range = 0;
fa9a63c5
RM
3852 }
3853
ae4788a8
RS
3854#ifdef emacs
3855 /* In a forward search for something that starts with \=.
3856 don't keep searching past point. */
3857 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3858 {
7b140fd7
RS
3859 range = PT_BYTE - BEGV_BYTE - startpos;
3860 if (range < 0)
ae4788a8
RS
3861 return -1;
3862 }
3863#endif /* emacs */
3864
fa9a63c5
RM
3865 /* Update the fastmap now if not correct already. */
3866 if (fastmap && !bufp->fastmap_accurate)
3867 if (re_compile_fastmap (bufp) == -2)
3868 return -2;
5e69f11e 3869
c8499ba5 3870 /* See whether the pattern is anchored. */
c0f9ea08 3871 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 3872
b18215fc 3873#ifdef emacs
cc9b4df2
KH
3874 gl_state.object = re_match_object;
3875 {
99633e97 3876 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
3877
3878 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3879 }
b18215fc
RS
3880#endif
3881
fa9a63c5
RM
3882 /* Loop through the string, looking for a place to start matching. */
3883 for (;;)
5e69f11e 3884 {
c8499ba5
RS
3885 /* If the pattern is anchored,
3886 skip quickly past places we cannot match.
3887 We don't bother to treat startpos == 0 specially
3888 because that case doesn't repeat. */
3889 if (anchored_start && startpos > 0)
3890 {
c0f9ea08
SM
3891 if (! ((startpos <= size1 ? string1[startpos - 1]
3892 : string2[startpos - size1 - 1])
3893 == '\n'))
c8499ba5
RS
3894 goto advance;
3895 }
3896
fa9a63c5 3897 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
3898 cannot be the start of a match. If the pattern can match the
3899 null string, however, we don't need to skip characters; we want
3900 the first null string. */
fa9a63c5
RM
3901 if (fastmap && startpos < total_size && !bufp->can_be_null)
3902 {
66f0296e 3903 register re_char *d;
e934739e
RS
3904 register unsigned int buf_ch;
3905
3906 d = POS_ADDR_VSTRING (startpos);
3907
25fe55af 3908 if (range > 0) /* Searching forwards. */
fa9a63c5 3909 {
fa9a63c5
RM
3910 register int lim = 0;
3911 int irange = range;
3912
25fe55af
RS
3913 if (startpos < size1 && startpos + range >= size1)
3914 lim = range - (size1 - startpos);
fa9a63c5 3915
25fe55af
RS
3916 /* Written out as an if-else to avoid testing `translate'
3917 inside the loop. */
28ae27ae
AS
3918 if (RE_TRANSLATE_P (translate))
3919 {
e934739e
RS
3920 if (multibyte)
3921 while (range > lim)
3922 {
3923 int buf_charlen;
3924
3925 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
3926 buf_charlen);
3927
3928 buf_ch = RE_TRANSLATE (translate, buf_ch);
3929 if (buf_ch >= 0400
3930 || fastmap[buf_ch])
3931 break;
3932
3933 range -= buf_charlen;
3934 d += buf_charlen;
3935 }
3936 else
3937 while (range > lim
66f0296e 3938 && !fastmap[RE_TRANSLATE (translate, *d)])
33c46939
RS
3939 {
3940 d++;
3941 range--;
3942 }
e934739e 3943 }
fa9a63c5 3944 else
66f0296e 3945 while (range > lim && !fastmap[*d])
33c46939
RS
3946 {
3947 d++;
3948 range--;
3949 }
fa9a63c5
RM
3950
3951 startpos += irange - range;
3952 }
25fe55af 3953 else /* Searching backwards. */
fa9a63c5 3954 {
2d1675e4
SM
3955 int room = (startpos >= size1
3956 ? size2 + size1 - startpos
3957 : size1 - startpos);
3958 buf_ch = RE_STRING_CHAR (d, room);
3959 buf_ch = TRANSLATE (buf_ch);
fa9a63c5 3960
e934739e
RS
3961 if (! (buf_ch >= 0400
3962 || fastmap[buf_ch]))
fa9a63c5
RM
3963 goto advance;
3964 }
3965 }
3966
3967 /* If can't match the null string, and that's all we have left, fail. */
3968 if (range >= 0 && startpos == total_size && fastmap
25fe55af 3969 && !bufp->can_be_null)
fa9a63c5
RM
3970 return -1;
3971
3972 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3973 startpos, regs, stop);
3974#ifndef REGEX_MALLOC
0b32bf0e 3975# ifdef C_ALLOCA
fa9a63c5 3976 alloca (0);
0b32bf0e 3977# endif
fa9a63c5
RM
3978#endif
3979
3980 if (val >= 0)
3981 return startpos;
5e69f11e 3982
fa9a63c5
RM
3983 if (val == -2)
3984 return -2;
3985
3986 advance:
5e69f11e 3987 if (!range)
25fe55af 3988 break;
5e69f11e 3989 else if (range > 0)
25fe55af 3990 {
b18215fc
RS
3991 /* Update STARTPOS to the next character boundary. */
3992 if (multibyte)
3993 {
66f0296e
SM
3994 re_char *p = POS_ADDR_VSTRING (startpos);
3995 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
3996 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
3997
3998 range -= len;
3999 if (range < 0)
4000 break;
4001 startpos += len;
4002 }
4003 else
4004 {
b560c397
RS
4005 range--;
4006 startpos++;
4007 }
e318085a 4008 }
fa9a63c5 4009 else
25fe55af
RS
4010 {
4011 range++;
4012 startpos--;
b18215fc
RS
4013
4014 /* Update STARTPOS to the previous character boundary. */
4015 if (multibyte)
4016 {
66f0296e 4017 re_char *p = POS_ADDR_VSTRING (startpos);
b18215fc
RS
4018 int len = 0;
4019
4020 /* Find the head of multibyte form. */
5d967c7a 4021 while (!CHAR_HEAD_P (*p))
b18215fc
RS
4022 p--, len++;
4023
4024 /* Adjust it. */
4025#if 0 /* XXX */
4026 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
4027 ;
4028 else
4029#endif
4030 {
4031 range += len;
4032 if (range > 0)
4033 break;
4034
4035 startpos -= len;
4036 }
4037 }
25fe55af 4038 }
fa9a63c5
RM
4039 }
4040 return -1;
4041} /* re_search_2 */
c0f9ea08 4042WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4043\f
4044/* Declarations and macros for re_match_2. */
4045
2d1675e4
SM
4046static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4047 register int len,
4048 RE_TRANSLATE_TYPE translate,
4049 const int multibyte));
fa9a63c5
RM
4050
4051/* This converts PTR, a pointer into one of the search strings `string1'
4052 and `string2' into an offset from the beginning of that string. */
4053#define POINTER_TO_OFFSET(ptr) \
4054 (FIRST_STRING_P (ptr) \
4055 ? ((regoff_t) ((ptr) - string1)) \
4056 : ((regoff_t) ((ptr) - string2 + size1)))
4057
fa9a63c5 4058/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4059 string2 if necessary.
4060 Check re_match_2_internal for a discussion of why end_match_2 might
4061 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4062#define PREFETCH() \
25fe55af 4063 while (d == dend) \
fa9a63c5
RM
4064 { \
4065 /* End of string2 => fail. */ \
25fe55af
RS
4066 if (dend == end_match_2) \
4067 goto fail; \
4bb91c68 4068 /* End of string1 => advance to string2. */ \
25fe55af 4069 d = string2; \
fa9a63c5
RM
4070 dend = end_match_2; \
4071 }
4072
f1ad044f
SM
4073/* Call before fetching a char with *d if you already checked other limits.
4074 This is meant for use in lookahead operations like wordend, etc..
4075 where we might need to look at parts of the string that might be
4076 outside of the LIMITs (i.e past `stop'). */
4077#define PREFETCH_NOLIMIT() \
4078 if (d == end1) \
4079 { \
4080 d = string2; \
4081 dend = end_match_2; \
4082 } \
fa9a63c5
RM
4083
4084/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 4085 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4086#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4087#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4088
4089
4090/* Test if D points to a character which is word-constituent. We have
4091 two special cases to check for: if past the end of string1, look at
4092 the first character in string2; and if before the beginning of
4093 string2, look at the last character in string1. */
4094#define WORDCHAR_P(d) \
4095 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4096 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4097 == Sword)
4098
9121ca40 4099/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4100
4101/* The comment at case wordbound is following one, but we don't use
4102 AT_WORD_BOUNDARY anymore to support multibyte form.
4103
4104 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
4105 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4106 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4107 macro and introducing temporary variables works around the bug. */
4108
9121ca40 4109#if 0
fa9a63c5
RM
4110/* Test if the character before D and the one at D differ with respect
4111 to being word-constituent. */
4112#define AT_WORD_BOUNDARY(d) \
4113 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4114 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4115#endif
fa9a63c5
RM
4116
4117/* Free everything we malloc. */
4118#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4119# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4120# define FREE_VARIABLES() \
fa9a63c5
RM
4121 do { \
4122 REGEX_FREE_STACK (fail_stack.stack); \
4123 FREE_VAR (regstart); \
4124 FREE_VAR (regend); \
fa9a63c5
RM
4125 FREE_VAR (best_regstart); \
4126 FREE_VAR (best_regend); \
fa9a63c5
RM
4127 } while (0)
4128#else
0b32bf0e 4129# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4130#endif /* not MATCH_MAY_ALLOCATE */
4131
505bde11
SM
4132\f
4133/* Optimization routines. */
4134
4e8a9132
SM
4135/* If the operation is a match against one or more chars,
4136 return a pointer to the next operation, else return NULL. */
4137static unsigned char *
4138skip_one_char (p)
4139 unsigned char *p;
4140{
4141 switch (SWITCH_ENUM_CAST (*p++))
4142 {
4143 case anychar:
4144 break;
4145
4146 case exactn:
4147 p += *p + 1;
4148 break;
4149
4150 case charset_not:
4151 case charset:
4152 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4153 {
4154 int mcnt;
4155 p = CHARSET_RANGE_TABLE (p - 1);
4156 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4157 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4158 }
4159 else
4160 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4161 break;
4162
4e8a9132
SM
4163 case syntaxspec:
4164 case notsyntaxspec:
1fb352e0 4165#ifdef emacs
4e8a9132
SM
4166 case categoryspec:
4167 case notcategoryspec:
4168#endif /* emacs */
4169 p++;
4170 break;
4171
4172 default:
4173 p = NULL;
4174 }
4175 return p;
4176}
4177
4178
505bde11
SM
4179/* Jump over non-matching operations. */
4180static unsigned char *
4e8a9132 4181skip_noops (p, pend)
505bde11 4182 unsigned char *p, *pend;
505bde11
SM
4183{
4184 int mcnt;
4185 while (p < pend)
4186 {
4187 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4188 {
4189 case start_memory:
505bde11
SM
4190 case stop_memory:
4191 p += 2; break;
4192 case no_op:
4193 p += 1; break;
4194 case jump:
4195 p += 1;
4196 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4197 p += mcnt;
4198 break;
4199 default:
4200 return p;
4201 }
4202 }
4203 assert (p == pend);
4204 return p;
4205}
4206
4207/* Non-zero if "p1 matches something" implies "p2 fails". */
4208static int
4209mutually_exclusive_p (bufp, p1, p2)
4210 struct re_pattern_buffer *bufp;
4211 unsigned char *p1, *p2;
4212{
4e8a9132 4213 re_opcode_t op2;
2d1675e4 4214 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4215 unsigned char *pend = bufp->buffer + bufp->used;
4216
4e8a9132 4217 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4218 && p2 >= bufp->buffer && p2 <= pend);
4219
4220 /* Skip over open/close-group commands.
4221 If what follows this loop is a ...+ construct,
4222 look at what begins its body, since we will have to
4223 match at least one of that. */
4e8a9132
SM
4224 p2 = skip_noops (p2, pend);
4225 /* The same skip can be done for p1, except that this function
4226 is only used in the case where p1 is a simple match operator. */
4227 /* p1 = skip_noops (p1, pend); */
4228
4229 assert (p1 >= bufp->buffer && p1 < pend
4230 && p2 >= bufp->buffer && p2 <= pend);
4231
4232 op2 = p2 == pend ? succeed : *p2;
4233
4234 switch (SWITCH_ENUM_CAST (op2))
505bde11 4235 {
4e8a9132
SM
4236 case succeed:
4237 case endbuf:
4238 /* If we're at the end of the pattern, we can change. */
4239 if (skip_one_char (p1))
505bde11 4240 {
505bde11
SM
4241 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4242 return 1;
505bde11 4243 }
4e8a9132
SM
4244 break;
4245
4246 case endline:
4e8a9132
SM
4247 case exactn:
4248 {
4249 register unsigned int c
4250 = (re_opcode_t) *p2 == endline ? '\n'
4251 : RE_STRING_CHAR(p2 + 2, pend - p2 - 2);
505bde11 4252
4e8a9132
SM
4253 if ((re_opcode_t) *p1 == exactn)
4254 {
4255 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4256 {
4257 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4258 return 1;
4259 }
4260 }
505bde11 4261
4e8a9132
SM
4262 else if ((re_opcode_t) *p1 == charset
4263 || (re_opcode_t) *p1 == charset_not)
4264 {
4265 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4266
4e8a9132
SM
4267 /* Test if C is listed in charset (or charset_not)
4268 at `p1'. */
4269 if (SINGLE_BYTE_CHAR_P (c))
4270 {
4271 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4272 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4273 not = !not;
4274 }
4275 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4276 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4277
4e8a9132
SM
4278 /* `not' is equal to 1 if c would match, which means
4279 that we can't change to pop_failure_jump. */
4280 if (!not)
4281 {
4282 DEBUG_PRINT1 (" No match => fast loop.\n");
4283 return 1;
4284 }
4285 }
4286 else if ((re_opcode_t) *p1 == anychar
4287 && c == '\n')
4288 {
4289 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4290 return 1;
4291 }
4292 }
4293 break;
505bde11 4294
4e8a9132
SM
4295 case charset:
4296 case charset_not:
4297 {
4298 if ((re_opcode_t) *p1 == exactn)
4299 /* Reuse the code above. */
4300 return mutually_exclusive_p (bufp, p2, p1);
505bde11
SM
4301
4302
4303 /* It is hard to list up all the character in charset
4304 P2 if it includes multibyte character. Give up in
4305 such case. */
4306 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4307 {
4308 /* Now, we are sure that P2 has no range table.
4309 So, for the size of bitmap in P2, `p2[1]' is
4310 enough. But P1 may have range table, so the
4311 size of bitmap table of P1 is extracted by
4312 using macro `CHARSET_BITMAP_SIZE'.
4313
4314 Since we know that all the character listed in
4315 P2 is ASCII, it is enough to test only bitmap
4316 table of P1. */
4317
4318 if (*p1 == *p2)
4319 {
4320 int idx;
4321 /* We win if the charset inside the loop
4322 has no overlap with the one after the loop. */
4323 for (idx = 0;
4324 (idx < (int) p2[1]
4325 && idx < CHARSET_BITMAP_SIZE (p1));
4326 idx++)
4327 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4328 break;
4329
4330 if (idx == p2[1]
4331 || idx == CHARSET_BITMAP_SIZE (p1))
4332 {
4333 DEBUG_PRINT1 (" No match => fast loop.\n");
4334 return 1;
4335 }
4336 }
4337 else if ((re_opcode_t) *p1 == charset
4338 || (re_opcode_t) *p1 == charset_not)
4339 {
4340 int idx;
4341 /* We win if the charset_not inside the loop lists
4342 every character listed in the charset after. */
4343 for (idx = 0; idx < (int) p2[1]; idx++)
4344 if (! (p2[2 + idx] == 0
4345 || (idx < CHARSET_BITMAP_SIZE (p1)
4346 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4347 break;
4348
4e8a9132
SM
4349 if (idx == p2[1])
4350 {
4351 DEBUG_PRINT1 (" No match => fast loop.\n");
4352 return 1;
4353 }
4354 }
4355 }
4356 }
4357
4e8a9132
SM
4358 case wordend:
4359 case notsyntaxspec:
4360 return ((re_opcode_t) *p1 == syntaxspec
4361 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4362
4363 case wordbeg:
4364 case syntaxspec:
4365 return ((re_opcode_t) *p1 == notsyntaxspec
4366 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4367
4368 case wordbound:
4369 return (((re_opcode_t) *p1 == notsyntaxspec
4370 || (re_opcode_t) *p1 == syntaxspec)
4371 && p1[1] == Sword);
4372
1fb352e0 4373#ifdef emacs
4e8a9132
SM
4374 case categoryspec:
4375 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4376 case notcategoryspec:
4377 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4378#endif /* emacs */
4379
4380 default:
4381 ;
505bde11
SM
4382 }
4383
4384 /* Safe default. */
4385 return 0;
4386}
4387
fa9a63c5
RM
4388\f
4389/* Matching routines. */
4390
25fe55af 4391#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4392/* re_match is like re_match_2 except it takes only a single string. */
4393
4394int
4395re_match (bufp, string, size, pos, regs)
4396 struct re_pattern_buffer *bufp;
4397 const char *string;
4398 int size, pos;
4399 struct re_registers *regs;
4400{
4bb91c68 4401 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4402 pos, regs, size);
0b32bf0e 4403# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4404 alloca (0);
0b32bf0e 4405# endif
fa9a63c5
RM
4406 return result;
4407}
c0f9ea08 4408WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4409#endif /* not emacs */
4410
b18215fc
RS
4411#ifdef emacs
4412/* In Emacs, this is the string or buffer in which we
25fe55af 4413 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4414Lisp_Object re_match_object;
4415#endif
fa9a63c5
RM
4416
4417/* re_match_2 matches the compiled pattern in BUFP against the
4418 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4419 and SIZE2, respectively). We start matching at POS, and stop
4420 matching at STOP.
5e69f11e 4421
fa9a63c5 4422 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4423 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4424 documentation for exactly how many groups we fill.
4425
4426 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4427 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4428 matched substring. */
4429
4430int
4431re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4432 struct re_pattern_buffer *bufp;
4433 const char *string1, *string2;
4434 int size1, size2;
4435 int pos;
4436 struct re_registers *regs;
4437 int stop;
4438{
b18215fc 4439 int result;
25fe55af 4440
b18215fc 4441#ifdef emacs
cc9b4df2
KH
4442 int charpos;
4443 gl_state.object = re_match_object;
99633e97 4444 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4445 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4446#endif
4447
4bb91c68
SM
4448 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4449 (re_char*) string2, size2,
cc9b4df2 4450 pos, regs, stop);
0b32bf0e 4451#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4452 alloca (0);
a60198e5 4453#endif
fa9a63c5
RM
4454 return result;
4455}
c0f9ea08 4456WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5
RM
4457
4458/* This is a separate function so that we can force an alloca cleanup
25fe55af 4459 afterwards. */
fa9a63c5
RM
4460static int
4461re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4462 struct re_pattern_buffer *bufp;
66f0296e 4463 re_char *string1, *string2;
fa9a63c5
RM
4464 int size1, size2;
4465 int pos;
4466 struct re_registers *regs;
4467 int stop;
4468{
4469 /* General temporaries. */
4470 int mcnt;
66f0296e 4471 boolean not;
fa9a63c5
RM
4472 unsigned char *p1;
4473
4474 /* Just past the end of the corresponding string. */
66f0296e 4475 re_char *end1, *end2;
fa9a63c5
RM
4476
4477 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4478 each to consider matching. */
66f0296e 4479 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4480
4481 /* Where we are in the data, and the end of the current string. */
66f0296e 4482 re_char *d, *dend;
5e69f11e 4483
99633e97
SM
4484 /* Used sometimes to remember where we were before starting matching
4485 an operator so that we can go back in case of failure. This "atomic"
4486 behavior of matching opcodes is indispensable to the correctness
4487 of the on_failure_keep_string_jump optimization. */
4488 re_char *dfail;
4489
fa9a63c5
RM
4490 /* Where we are in the pattern, and the end of the pattern. */
4491 unsigned char *p = bufp->buffer;
4492 register unsigned char *pend = p + bufp->used;
4493
25fe55af 4494 /* We use this to map every character in the string. */
6676cb1c 4495 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4496
25fe55af 4497 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4498 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4499
fa9a63c5
RM
4500 /* Failure point stack. Each place that can handle a failure further
4501 down the line pushes a failure point on this stack. It consists of
505bde11 4502 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4503 the subexpressions we're currently inside, plus the number of such
4504 registers, and, finally, two char *'s. The first char * is where
4505 to resume scanning the pattern; the second one is where to resume
505bde11 4506 scanning the strings. */
25fe55af 4507#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4508 fail_stack_type fail_stack;
4509#endif
4510#ifdef DEBUG
fa9a63c5
RM
4511 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4512#endif
4513
0b32bf0e 4514#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4515 /* This holds the pointer to the failure stack, when
4516 it is allocated relocatably. */
4517 fail_stack_elt_t *failure_stack_ptr;
99633e97 4518#endif
fa9a63c5
RM
4519
4520 /* We fill all the registers internally, independent of what we
25fe55af 4521 return, for use in backreferences. The number here includes
fa9a63c5 4522 an element for register zero. */
4bb91c68 4523 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4524
fa9a63c5
RM
4525 /* Information on the contents of registers. These are pointers into
4526 the input strings; they record just what was matched (on this
4527 attempt) by a subexpression part of the pattern, that is, the
4528 regnum-th regstart pointer points to where in the pattern we began
4529 matching and the regnum-th regend points to right after where we
4530 stopped matching the regnum-th subexpression. (The zeroth register
4531 keeps track of what the whole pattern matches.) */
4532#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4533 re_char **regstart, **regend;
fa9a63c5
RM
4534#endif
4535
fa9a63c5 4536 /* The following record the register info as found in the above
5e69f11e 4537 variables when we find a match better than any we've seen before.
fa9a63c5
RM
4538 This happens as we backtrack through the failure points, which in
4539 turn happens only if we have not yet matched the entire string. */
4540 unsigned best_regs_set = false;
4541#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4542 re_char **best_regstart, **best_regend;
fa9a63c5 4543#endif
5e69f11e 4544
fa9a63c5
RM
4545 /* Logically, this is `best_regend[0]'. But we don't want to have to
4546 allocate space for that if we're not allocating space for anything
25fe55af 4547 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
4548 any of the other register vectors, and it seems rather a kludge to
4549 treat `best_regend' differently than the rest. So we keep track of
4550 the end of the best match so far in a separate variable. We
4551 initialize this to NULL so that when we backtrack the first time
4552 and need to test it, it's not garbage. */
66f0296e 4553 re_char *match_end = NULL;
fa9a63c5 4554
fa9a63c5
RM
4555#ifdef DEBUG
4556 /* Counts the total number of registers pushed. */
5e69f11e 4557 unsigned num_regs_pushed = 0;
fa9a63c5
RM
4558#endif
4559
4560 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 4561
fa9a63c5 4562 INIT_FAIL_STACK ();
5e69f11e 4563
fa9a63c5
RM
4564#ifdef MATCH_MAY_ALLOCATE
4565 /* Do not bother to initialize all the register variables if there are
4566 no groups in the pattern, as it takes a fair amount of time. If
4567 there are groups, we include space for register 0 (the whole
4568 pattern), even though we never use it, since it simplifies the
4569 array indexing. We should fix this. */
4570 if (bufp->re_nsub)
4571 {
66f0296e
SM
4572 regstart = REGEX_TALLOC (num_regs, re_char *);
4573 regend = REGEX_TALLOC (num_regs, re_char *);
4574 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4575 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 4576
505bde11 4577 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
4578 {
4579 FREE_VARIABLES ();
4580 return -2;
4581 }
fa9a63c5
RM
4582 }
4583 else
4584 {
4585 /* We must initialize all our variables to NULL, so that
25fe55af 4586 `FREE_VARIABLES' doesn't try to free them. */
505bde11 4587 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
4588 }
4589#endif /* MATCH_MAY_ALLOCATE */
4590
4591 /* The starting position is bogus. */
4592 if (pos < 0 || pos > size1 + size2)
4593 {
4594 FREE_VARIABLES ();
4595 return -1;
4596 }
5e69f11e 4597
fa9a63c5
RM
4598 /* Initialize subexpression text positions to -1 to mark ones that no
4599 start_memory/stop_memory has been seen for. Also initialize the
4600 register information struct. */
4601 for (mcnt = 1; mcnt < num_regs; mcnt++)
4bb91c68 4602 regstart[mcnt] = regend[mcnt] = NULL;
99633e97 4603
fa9a63c5 4604 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 4605 `string1' is null. */
fa9a63c5
RM
4606 if (size2 == 0 && string1 != NULL)
4607 {
4608 string2 = string1;
4609 size2 = size1;
4610 string1 = 0;
4611 size1 = 0;
4612 }
4613 end1 = string1 + size1;
4614 end2 = string2 + size2;
4615
5e69f11e 4616 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
4617 `dend' is the end of the input string that `d' points within. `d'
4618 is advanced into the following input string whenever necessary, but
4619 this happens before fetching; therefore, at the beginning of the
4620 loop, `d' can be pointing at the end of a string, but it cannot
4621 equal `string2'. */
419d1c74 4622 if (pos >= size1)
fa9a63c5 4623 {
419d1c74
SM
4624 /* Only match within string2. */
4625 d = string2 + pos - size1;
4626 dend = end_match_2 = string2 + stop - size1;
4627 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
4628 }
4629 else
4630 {
f1ad044f 4631 if (stop < size1)
419d1c74
SM
4632 {
4633 /* Only match within string1. */
4634 end_match_1 = string1 + stop;
4635 /* BEWARE!
4636 When we reach end_match_1, PREFETCH normally switches to string2.
4637 But in the present case, this means that just doing a PREFETCH
4638 makes us jump from `stop' to `gap' within the string.
4639 What we really want here is for the search to stop as
4640 soon as we hit end_match_1. That's why we set end_match_2
4641 to end_match_1 (since PREFETCH fails as soon as we hit
4642 end_match_2). */
4643 end_match_2 = end_match_1;
4644 }
4645 else
f1ad044f
SM
4646 { /* It's important to use this code when stop == size so that
4647 moving `d' from end1 to string2 will not prevent the d == dend
4648 check from catching the end of string. */
419d1c74
SM
4649 end_match_1 = end1;
4650 end_match_2 = string2 + stop - size1;
4651 }
4652 d = string1 + pos;
4653 dend = end_match_1;
fa9a63c5
RM
4654 }
4655
4656 DEBUG_PRINT1 ("The compiled pattern is: ");
4657 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4658 DEBUG_PRINT1 ("The string to match is: `");
4659 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4660 DEBUG_PRINT1 ("'\n");
5e69f11e 4661
25fe55af 4662 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
4663 function if the match is complete, or it drops through if the match
4664 fails at this starting point in the input data. */
4665 for (;;)
4666 {
505bde11 4667 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
4668
4669 if (p == pend)
4670 { /* End of pattern means we might have succeeded. */
25fe55af 4671 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 4672
fa9a63c5 4673 /* If we haven't matched the entire string, and we want the
25fe55af
RS
4674 longest match, try backtracking. */
4675 if (d != end_match_2)
fa9a63c5
RM
4676 {
4677 /* 1 if this match ends in the same string (string1 or string2)
4678 as the best previous match. */
5e69f11e 4679 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 4680 == FIRST_STRING_P (d));
fa9a63c5
RM
4681 /* 1 if this match is the best seen so far. */
4682 boolean best_match_p;
4683
4684 /* AIX compiler got confused when this was combined
25fe55af 4685 with the previous declaration. */
fa9a63c5
RM
4686 if (same_str_p)
4687 best_match_p = d > match_end;
4688 else
99633e97 4689 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 4690
25fe55af
RS
4691 DEBUG_PRINT1 ("backtracking.\n");
4692
4693 if (!FAIL_STACK_EMPTY ())
4694 { /* More failure points to try. */
4695
4696 /* If exceeds best match so far, save it. */
4697 if (!best_regs_set || best_match_p)
4698 {
4699 best_regs_set = true;
4700 match_end = d;
4701
4702 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4703
4704 for (mcnt = 1; mcnt < num_regs; mcnt++)
4705 {
4706 best_regstart[mcnt] = regstart[mcnt];
4707 best_regend[mcnt] = regend[mcnt];
4708 }
4709 }
4710 goto fail;
4711 }
4712
4713 /* If no failure points, don't restore garbage. And if
4714 last match is real best match, don't restore second
4715 best one. */
4716 else if (best_regs_set && !best_match_p)
4717 {
4718 restore_best_regs:
4719 /* Restore best match. It may happen that `dend ==
4720 end_match_1' while the restored d is in string2.
4721 For example, the pattern `x.*y.*z' against the
4722 strings `x-' and `y-z-', if the two strings are
4723 not consecutive in memory. */
4724 DEBUG_PRINT1 ("Restoring best registers.\n");
4725
4726 d = match_end;
4727 dend = ((d >= string1 && d <= end1)
4728 ? end_match_1 : end_match_2);
fa9a63c5
RM
4729
4730 for (mcnt = 1; mcnt < num_regs; mcnt++)
4731 {
4732 regstart[mcnt] = best_regstart[mcnt];
4733 regend[mcnt] = best_regend[mcnt];
4734 }
25fe55af
RS
4735 }
4736 } /* d != end_match_2 */
fa9a63c5
RM
4737
4738 succeed_label:
25fe55af 4739 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 4740
25fe55af
RS
4741 /* If caller wants register contents data back, do it. */
4742 if (regs && !bufp->no_sub)
fa9a63c5 4743 {
25fe55af
RS
4744 /* Have the register data arrays been allocated? */
4745 if (bufp->regs_allocated == REGS_UNALLOCATED)
4746 { /* No. So allocate them with malloc. We need one
4747 extra element beyond `num_regs' for the `-1' marker
4748 GNU code uses. */
4749 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4750 regs->start = TALLOC (regs->num_regs, regoff_t);
4751 regs->end = TALLOC (regs->num_regs, regoff_t);
4752 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
4753 {
4754 FREE_VARIABLES ();
4755 return -2;
4756 }
25fe55af
RS
4757 bufp->regs_allocated = REGS_REALLOCATE;
4758 }
4759 else if (bufp->regs_allocated == REGS_REALLOCATE)
4760 { /* Yes. If we need more elements than were already
4761 allocated, reallocate them. If we need fewer, just
4762 leave it alone. */
4763 if (regs->num_regs < num_regs + 1)
4764 {
4765 regs->num_regs = num_regs + 1;
4766 RETALLOC (regs->start, regs->num_regs, regoff_t);
4767 RETALLOC (regs->end, regs->num_regs, regoff_t);
4768 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
4769 {
4770 FREE_VARIABLES ();
4771 return -2;
4772 }
25fe55af
RS
4773 }
4774 }
4775 else
fa9a63c5
RM
4776 {
4777 /* These braces fend off a "empty body in an else-statement"
25fe55af 4778 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
4779 assert (bufp->regs_allocated == REGS_FIXED);
4780 }
4781
25fe55af
RS
4782 /* Convert the pointer data in `regstart' and `regend' to
4783 indices. Register zero has to be set differently,
4784 since we haven't kept track of any info for it. */
4785 if (regs->num_regs > 0)
4786 {
4787 regs->start[0] = pos;
99633e97 4788 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 4789 }
5e69f11e 4790
25fe55af
RS
4791 /* Go through the first `min (num_regs, regs->num_regs)'
4792 registers, since that is all we initialized. */
fa9a63c5
RM
4793 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4794 {
25fe55af
RS
4795 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4796 regs->start[mcnt] = regs->end[mcnt] = -1;
4797 else
4798 {
fa9a63c5
RM
4799 regs->start[mcnt]
4800 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
25fe55af 4801 regs->end[mcnt]
fa9a63c5 4802 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
25fe55af 4803 }
fa9a63c5 4804 }
5e69f11e 4805
25fe55af
RS
4806 /* If the regs structure we return has more elements than
4807 were in the pattern, set the extra elements to -1. If
4808 we (re)allocated the registers, this is the case,
4809 because we always allocate enough to have at least one
4810 -1 at the end. */
4811 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4812 regs->start[mcnt] = regs->end[mcnt] = -1;
fa9a63c5
RM
4813 } /* regs && !bufp->no_sub */
4814
25fe55af
RS
4815 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4816 nfailure_points_pushed, nfailure_points_popped,
4817 nfailure_points_pushed - nfailure_points_popped);
4818 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 4819
99633e97 4820 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 4821
25fe55af 4822 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 4823
25fe55af
RS
4824 FREE_VARIABLES ();
4825 return mcnt;
4826 }
fa9a63c5 4827
25fe55af 4828 /* Otherwise match next pattern command. */
fa9a63c5
RM
4829 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4830 {
25fe55af
RS
4831 /* Ignore these. Used to ignore the n of succeed_n's which
4832 currently have n == 0. */
4833 case no_op:
4834 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4835 break;
fa9a63c5
RM
4836
4837 case succeed:
25fe55af 4838 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
4839 goto succeed_label;
4840
25fe55af
RS
4841 /* Match the next n pattern characters exactly. The following
4842 byte in the pattern defines n, and the n bytes after that
4843 are the characters to match. */
fa9a63c5
RM
4844 case exactn:
4845 mcnt = *p++;
25fe55af 4846 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 4847
99633e97
SM
4848 /* Remember the start point to rollback upon failure. */
4849 dfail = d;
4850
25fe55af
RS
4851 /* This is written out as an if-else so we don't waste time
4852 testing `translate' inside the loop. */
28703c16 4853 if (RE_TRANSLATE_P (translate))
fa9a63c5 4854 {
e934739e
RS
4855 if (multibyte)
4856 do
4857 {
4858 int pat_charlen, buf_charlen;
e71b1971 4859 unsigned int pat_ch, buf_ch;
e934739e
RS
4860
4861 PREFETCH ();
4862 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4863 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4864
4865 if (RE_TRANSLATE (translate, buf_ch)
4866 != pat_ch)
99633e97
SM
4867 {
4868 d = dfail;
4869 goto fail;
4870 }
e934739e
RS
4871
4872 p += pat_charlen;
4873 d += buf_charlen;
4874 mcnt -= pat_charlen;
4875 }
4876 while (mcnt > 0);
4877 else
e934739e
RS
4878 do
4879 {
4880 PREFETCH ();
66f0296e 4881 if (RE_TRANSLATE (translate, *d) != *p++)
99633e97
SM
4882 {
4883 d = dfail;
4884 goto fail;
4885 }
33c46939 4886 d++;
e934739e
RS
4887 }
4888 while (--mcnt);
fa9a63c5
RM
4889 }
4890 else
4891 {
4892 do
4893 {
4894 PREFETCH ();
99633e97
SM
4895 if (*d++ != *p++)
4896 {
4897 d = dfail;
4898 goto fail;
4899 }
fa9a63c5
RM
4900 }
4901 while (--mcnt);
4902 }
25fe55af 4903 break;
fa9a63c5
RM
4904
4905
25fe55af 4906 /* Match any character except possibly a newline or a null. */
fa9a63c5 4907 case anychar:
e934739e
RS
4908 {
4909 int buf_charlen;
e71b1971 4910 unsigned int buf_ch;
fa9a63c5 4911
e934739e 4912 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 4913
e934739e 4914 PREFETCH ();
2d1675e4 4915 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
4916 buf_ch = TRANSLATE (buf_ch);
4917
4918 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4919 && buf_ch == '\n')
4920 || ((bufp->syntax & RE_DOT_NOT_NULL)
4921 && buf_ch == '\000'))
4922 goto fail;
4923
e934739e
RS
4924 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4925 d += buf_charlen;
4926 }
fa9a63c5
RM
4927 break;
4928
4929
4930 case charset:
4931 case charset_not:
4932 {
b18215fc 4933 register unsigned int c;
fa9a63c5 4934 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4935 int len;
4936
4937 /* Start of actual range_table, or end of bitmap if there is no
4938 range table. */
4939 unsigned char *range_table;
4940
96cc36cc 4941 /* Nonzero if there is a range table. */
b18215fc
RS
4942 int range_table_exists;
4943
96cc36cc
RS
4944 /* Number of ranges of range table. This is not included
4945 in the initial byte-length of the command. */
4946 int count = 0;
fa9a63c5 4947
25fe55af 4948 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 4949
b18215fc 4950 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 4951
b18215fc 4952 if (range_table_exists)
96cc36cc
RS
4953 {
4954 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4955 EXTRACT_NUMBER_AND_INCR (count, range_table);
4956 }
b18215fc 4957
2d1675e4
SM
4958 PREFETCH ();
4959 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
4960 c = TRANSLATE (c); /* The character to match. */
b18215fc
RS
4961
4962 if (SINGLE_BYTE_CHAR_P (c))
4963 { /* Lookup bitmap. */
b18215fc
RS
4964 /* Cast to `unsigned' instead of `unsigned char' in
4965 case the bit list is a full 32 bytes long. */
4966 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
4967 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4968 not = !not;
b18215fc 4969 }
96cc36cc 4970#ifdef emacs
b18215fc 4971 else if (range_table_exists)
96cc36cc
RS
4972 {
4973 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
4974
4975 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4976 | (class_bits & BIT_ALPHA && ISALPHA (c))
f71b19b6 4977 | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
96cc36cc
RS
4978 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4979 | (class_bits & BIT_LOWER && ISLOWER (c))
f71b19b6
DL
4980 | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
4981 | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
96cc36cc
RS
4982 | (class_bits & BIT_PRINT && ISPRINT (c))
4983 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4984 | (class_bits & BIT_SPACE && ISSPACE (c))
f71b19b6 4985 | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
96cc36cc
RS
4986 | (class_bits & BIT_UPPER && ISUPPER (c))
4987 | (class_bits & BIT_WORD && ISWORD (c)))
4988 not = !not;
4989 else
4990 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4991 }
4992#endif /* emacs */
fa9a63c5 4993
96cc36cc
RS
4994 if (range_table_exists)
4995 p = CHARSET_RANGE_TABLE_END (range_table, count);
4996 else
4997 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
4998
4999 if (!not) goto fail;
5e69f11e 5000
b18215fc 5001 d += len;
fa9a63c5
RM
5002 break;
5003 }
5004
5005
25fe55af 5006 /* The beginning of a group is represented by start_memory.
505bde11 5007 The argument is the register number. The text
25fe55af
RS
5008 matched within the group is recorded (in the internal
5009 registers data structure) under the register number. */
5010 case start_memory:
505bde11
SM
5011 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5012
5013 /* In case we need to undo this operation (via backtracking). */
5014 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5015
25fe55af 5016 regstart[*p] = d;
4bb91c68 5017 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5018 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5019
25fe55af 5020 /* Move past the register number and inner group count. */
505bde11 5021 p += 1;
25fe55af 5022 break;
fa9a63c5
RM
5023
5024
25fe55af 5025 /* The stop_memory opcode represents the end of a group. Its
505bde11 5026 argument is the same as start_memory's: the register number. */
fa9a63c5 5027 case stop_memory:
505bde11
SM
5028 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5029
5030 assert (!REG_UNSET (regstart[*p]));
5031 /* Strictly speaking, there should be code such as:
5032
0b32bf0e 5033 assert (REG_UNSET (regend[*p]));
505bde11
SM
5034 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5035
5036 But the only info to be pushed is regend[*p] and it is known to
5037 be UNSET, so there really isn't anything to push.
5038 Not pushing anything, on the other hand deprives us from the
5039 guarantee that regend[*p] is UNSET since undoing this operation
5040 will not reset its value properly. This is not important since
5041 the value will only be read on the next start_memory or at
5042 the very end and both events can only happen if this stop_memory
5043 is *not* undone. */
fa9a63c5 5044
25fe55af 5045 regend[*p] = d;
fa9a63c5
RM
5046 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5047
25fe55af 5048 /* Move past the register number and the inner group count. */
505bde11 5049 p += 1;
25fe55af 5050 break;
fa9a63c5
RM
5051
5052
5053 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5054 followed by the numeric value of <digit> as the register number. */
5055 case duplicate:
fa9a63c5 5056 {
66f0296e 5057 register re_char *d2, *dend2;
25fe55af 5058 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5059 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5060
25fe55af
RS
5061 /* Can't back reference a group which we've never matched. */
5062 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5063 goto fail;
5e69f11e 5064
25fe55af
RS
5065 /* Where in input to try to start matching. */
5066 d2 = regstart[regno];
5e69f11e 5067
99633e97
SM
5068 /* Remember the start point to rollback upon failure. */
5069 dfail = d;
5070
25fe55af
RS
5071 /* Where to stop matching; if both the place to start and
5072 the place to stop matching are in the same string, then
5073 set to the place to stop, otherwise, for now have to use
5074 the end of the first string. */
fa9a63c5 5075
25fe55af 5076 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5077 == FIRST_STRING_P (regend[regno]))
5078 ? regend[regno] : end_match_1);
5079 for (;;)
5080 {
5081 /* If necessary, advance to next segment in register
25fe55af 5082 contents. */
fa9a63c5
RM
5083 while (d2 == dend2)
5084 {
5085 if (dend2 == end_match_2) break;
5086 if (dend2 == regend[regno]) break;
5087
25fe55af
RS
5088 /* End of string1 => advance to string2. */
5089 d2 = string2;
5090 dend2 = regend[regno];
fa9a63c5
RM
5091 }
5092 /* At end of register contents => success */
5093 if (d2 == dend2) break;
5094
5095 /* If necessary, advance to next segment in data. */
5096 PREFETCH ();
5097
5098 /* How many characters left in this segment to match. */
5099 mcnt = dend - d;
5e69f11e 5100
fa9a63c5 5101 /* Want how many consecutive characters we can match in
25fe55af
RS
5102 one shot, so, if necessary, adjust the count. */
5103 if (mcnt > dend2 - d2)
fa9a63c5 5104 mcnt = dend2 - d2;
5e69f11e 5105
fa9a63c5 5106 /* Compare that many; failure if mismatch, else move
25fe55af 5107 past them. */
28703c16 5108 if (RE_TRANSLATE_P (translate)
2d1675e4 5109 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5110 : memcmp (d, d2, mcnt))
99633e97
SM
5111 {
5112 d = dfail;
5113 goto fail;
5114 }
fa9a63c5 5115 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5116 }
5117 }
5118 break;
5119
5120
25fe55af 5121 /* begline matches the empty string at the beginning of the string
c0f9ea08 5122 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5123 case begline:
25fe55af 5124 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5125
25fe55af
RS
5126 if (AT_STRINGS_BEG (d))
5127 {
5128 if (!bufp->not_bol) break;
5129 }
419d1c74 5130 else
25fe55af 5131 {
419d1c74
SM
5132 unsigned char c;
5133 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5134 if (c == '\n')
419d1c74 5135 break;
25fe55af
RS
5136 }
5137 /* In all other cases, we fail. */
5138 goto fail;
fa9a63c5
RM
5139
5140
25fe55af 5141 /* endline is the dual of begline. */
fa9a63c5 5142 case endline:
25fe55af 5143 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5144
25fe55af
RS
5145 if (AT_STRINGS_END (d))
5146 {
5147 if (!bufp->not_eol) break;
5148 }
f1ad044f 5149 else
25fe55af 5150 {
f1ad044f 5151 PREFETCH_NOLIMIT ();
c0f9ea08 5152 if (*d == '\n')
f1ad044f 5153 break;
25fe55af
RS
5154 }
5155 goto fail;
fa9a63c5
RM
5156
5157
5158 /* Match at the very beginning of the data. */
25fe55af
RS
5159 case begbuf:
5160 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5161 if (AT_STRINGS_BEG (d))
5162 break;
5163 goto fail;
fa9a63c5
RM
5164
5165
5166 /* Match at the very end of the data. */
25fe55af
RS
5167 case endbuf:
5168 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5169 if (AT_STRINGS_END (d))
5170 break;
25fe55af 5171 goto fail;
5e69f11e 5172
5e69f11e 5173
25fe55af
RS
5174 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5175 pushes NULL as the value for the string on the stack. Then
505bde11 5176 `POP_FAILURE_POINT' will keep the current value for the
25fe55af
RS
5177 string, instead of restoring it. To see why, consider
5178 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5179 then the . fails against the \n. But the next thing we want
5180 to do is match the \n against the \n; if we restored the
5181 string value, we would be back at the foo.
5182
5183 Because this is used only in specific cases, we don't need to
5184 check all the things that `on_failure_jump' does, to make
5185 sure the right things get saved on the stack. Hence we don't
5186 share its code. The only reason to push anything on the
5187 stack at all is that otherwise we would have to change
5188 `anychar's code to do something besides goto fail in this
5189 case; that seems worse than this. */
5190 case on_failure_keep_string_jump:
505bde11
SM
5191 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5192 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5193 mcnt, p + mcnt);
fa9a63c5 5194
505bde11
SM
5195 PUSH_FAILURE_POINT (p - 3, NULL);
5196 break;
5197
0683b6fa
SM
5198 /* A nasty loop is introduced by the non-greedy *? and +?.
5199 With such loops, the stack only ever contains one failure point
5200 at a time, so that a plain on_failure_jump_loop kind of
5201 cycle detection cannot work. Worse yet, such a detection
5202 can not only fail to detect a cycle, but it can also wrongly
5203 detect a cycle (between different instantiations of the same
5204 loop.
5205 So the method used for those nasty loops is a little different:
5206 We use a special cycle-detection-stack-frame which is pushed
5207 when the on_failure_jump_nastyloop failure-point is *popped*.
5208 This special frame thus marks the beginning of one iteration
5209 through the loop and we can hence easily check right here
5210 whether something matched between the beginning and the end of
5211 the loop. */
5212 case on_failure_jump_nastyloop:
5213 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5214 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5215 mcnt, p + mcnt);
5216
5217 assert ((re_opcode_t)p[-4] == no_op);
5218 CHECK_INFINITE_LOOP (p - 4, d);
5219 PUSH_FAILURE_POINT (p - 3, d);
5220 break;
5221
505bde11 5222
4e8a9132
SM
5223 /* Simple loop detecting on_failure_jump: just check on the
5224 failure stack if the same spot was already hit earlier. */
505bde11
SM
5225 case on_failure_jump_loop:
5226 on_failure:
5227 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5228 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5229 mcnt, p + mcnt);
5230
5231 CHECK_INFINITE_LOOP (p - 3, d);
5232 PUSH_FAILURE_POINT (p - 3, d);
25fe55af 5233 break;
fa9a63c5
RM
5234
5235
5236 /* Uses of on_failure_jump:
5e69f11e 5237
25fe55af
RS
5238 Each alternative starts with an on_failure_jump that points
5239 to the beginning of the next alternative. Each alternative
5240 except the last ends with a jump that in effect jumps past
5241 the rest of the alternatives. (They really jump to the
5242 ending jump of the following alternative, because tensioning
5243 these jumps is a hassle.)
fa9a63c5 5244
25fe55af
RS
5245 Repeats start with an on_failure_jump that points past both
5246 the repetition text and either the following jump or
5247 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5248 case on_failure_jump:
27c3b45d 5249 QUIT;
25fe55af 5250 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5251 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5252 mcnt, p + mcnt);
25fe55af 5253
505bde11 5254 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5255 break;
5256
4e8a9132 5257 /* This operation is used for greedy *.
505bde11
SM
5258 Compare the beginning of the repeat with what in the
5259 pattern follows its end. If we can establish that there
5260 is nothing that they would both match, i.e., that we
5261 would have to backtrack because of (as in, e.g., `a*a')
5262 then we can use a non-backtracking loop based on
4e8a9132 5263 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5264 case on_failure_jump_smart:
27c3b45d 5265 QUIT;
25fe55af 5266 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5267 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5268 mcnt, p + mcnt);
25fe55af 5269 {
505bde11
SM
5270 unsigned char *p1 = p; /* Next operation. */
5271 unsigned char *p2 = p + mcnt; /* Destination of the jump. */
fa9a63c5 5272
505bde11
SM
5273 p -= 3; /* Reset so that we will re-execute the
5274 instruction once it's been changed. */
fa9a63c5 5275
4e8a9132
SM
5276 EXTRACT_NUMBER (mcnt, p2 - 2);
5277
5278 /* Ensure this is a indeed the trivial kind of loop
5279 we are expecting. */
5280 assert (skip_one_char (p1) == p2 - 3);
5281 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5282 DEBUG_STATEMENT (debug += 2);
505bde11 5283 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5284 {
505bde11 5285 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132
SM
5286 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
5287 *p = (unsigned char) on_failure_keep_string_jump;
5288 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5289 }
505bde11 5290 else
fa9a63c5 5291 {
505bde11
SM
5292 /* Default to a safe `on_failure_jump' loop. */
5293 DEBUG_PRINT1 (" smart default => slow loop.\n");
4e8a9132 5294 *p = (unsigned char) on_failure_jump;
fa9a63c5 5295 }
99633e97 5296 DEBUG_STATEMENT (debug -= 2);
25fe55af 5297 }
505bde11 5298 break;
25fe55af
RS
5299
5300 /* Unconditionally jump (without popping any failure points). */
5301 case jump:
fa9a63c5 5302 unconditional_jump:
27c3b45d 5303 QUIT;
fa9a63c5 5304 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5305 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5306 p += mcnt; /* Do the jump. */
505bde11 5307 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5308 break;
5309
5310
25fe55af
RS
5311 /* Have to succeed matching what follows at least n times.
5312 After that, handle like `on_failure_jump'. */
5313 case succeed_n:
5314 EXTRACT_NUMBER (mcnt, p + 2);
5315 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5316
dc1e502d
SM
5317 /* Originally, mcnt is how many times we HAVE to succeed. */
5318 if (mcnt != 0)
25fe55af 5319 {
dc1e502d
SM
5320 mcnt--;
5321 p += 2;
5322 PUSH_FAILURE_COUNT (p);
dc1e502d 5323 DEBUG_PRINT3 (" Setting %p to %d.\n", p, mcnt);
4bb91c68 5324 STORE_NUMBER_AND_INCR (p, mcnt);
25fe55af 5325 }
dc1e502d
SM
5326 else
5327 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5328 goto on_failure;
25fe55af
RS
5329 break;
5330
5331 case jump_n:
5332 EXTRACT_NUMBER (mcnt, p + 2);
5333 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5334
5335 /* Originally, this is how many times we CAN jump. */
dc1e502d 5336 if (mcnt != 0)
25fe55af 5337 {
dc1e502d
SM
5338 mcnt--;
5339 PUSH_FAILURE_COUNT (p + 2);
5340 STORE_NUMBER (p + 2, mcnt);
5341 goto unconditional_jump;
25fe55af
RS
5342 }
5343 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5344 else
5345 p += 4;
25fe55af 5346 break;
5e69f11e 5347
fa9a63c5
RM
5348 case set_number_at:
5349 {
25fe55af 5350 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5351
25fe55af
RS
5352 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5353 p1 = p + mcnt;
5354 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11 5355 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
dc1e502d 5356 PUSH_FAILURE_COUNT (p1);
fa9a63c5 5357 STORE_NUMBER (p1, mcnt);
25fe55af
RS
5358 break;
5359 }
9121ca40
KH
5360
5361 case wordbound:
66f0296e
SM
5362 case notwordbound:
5363 not = (re_opcode_t) *(p - 1) == notwordbound;
5364 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5365
99633e97 5366 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5367
b18215fc 5368 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5369 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5370 not = !not;
b18215fc
RS
5371 else
5372 {
5373 /* C1 is the character before D, S1 is the syntax of C1, C2
5374 is the character at D, and S2 is the syntax of C2. */
5375 int c1, c2, s1, s2;
b18215fc 5376#ifdef emacs
2d1675e4
SM
5377 int offset = PTR_TO_OFFSET (d - 1);
5378 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5379 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5380#endif
66f0296e 5381 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5382 s1 = SYNTAX (c1);
5383#ifdef emacs
5d967c7a 5384 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5385#endif
f1ad044f 5386 PREFETCH_NOLIMIT ();
2d1675e4 5387 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc
RS
5388 s2 = SYNTAX (c2);
5389
5390 if (/* Case 2: Only one of S1 and S2 is Sword. */
5391 ((s1 == Sword) != (s2 == Sword))
5392 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5393 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5394 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5395 not = !not;
5396 }
5397 if (not)
9121ca40 5398 break;
b18215fc 5399 else
9121ca40 5400 goto fail;
fa9a63c5
RM
5401
5402 case wordbeg:
25fe55af 5403 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5404
b18215fc
RS
5405 /* We FAIL in one of the following cases: */
5406
25fe55af 5407 /* Case 1: D is at the end of string. */
b18215fc 5408 if (AT_STRINGS_END (d))
99633e97 5409 goto fail;
b18215fc
RS
5410 else
5411 {
5412 /* C1 is the character before D, S1 is the syntax of C1, C2
5413 is the character at D, and S2 is the syntax of C2. */
5414 int c1, c2, s1, s2;
fa9a63c5 5415#ifdef emacs
2d1675e4
SM
5416 int offset = PTR_TO_OFFSET (d);
5417 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5418 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5419#endif
99633e97 5420 PREFETCH ();
2d1675e4 5421 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc 5422 s2 = SYNTAX (c2);
25fe55af 5423
b18215fc
RS
5424 /* Case 2: S2 is not Sword. */
5425 if (s2 != Sword)
5426 goto fail;
5427
5428 /* Case 3: D is not at the beginning of string ... */
5429 if (!AT_STRINGS_BEG (d))
5430 {
5431 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5432#ifdef emacs
5d967c7a 5433 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5434#endif
b18215fc
RS
5435 s1 = SYNTAX (c1);
5436
5437 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5438 returns 0. */
b18215fc
RS
5439 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5440 goto fail;
5441 }
5442 }
e318085a
RS
5443 break;
5444
b18215fc 5445 case wordend:
25fe55af 5446 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5447
5448 /* We FAIL in one of the following cases: */
5449
5450 /* Case 1: D is at the beginning of string. */
5451 if (AT_STRINGS_BEG (d))
e318085a 5452 goto fail;
b18215fc
RS
5453 else
5454 {
5455 /* C1 is the character before D, S1 is the syntax of C1, C2
5456 is the character at D, and S2 is the syntax of C2. */
5457 int c1, c2, s1, s2;
5d967c7a 5458#ifdef emacs
2d1675e4
SM
5459 int offset = PTR_TO_OFFSET (d) - 1;
5460 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5461 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5462#endif
99633e97 5463 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5464 s1 = SYNTAX (c1);
5465
5466 /* Case 2: S1 is not Sword. */
5467 if (s1 != Sword)
5468 goto fail;
5469
5470 /* Case 3: D is not at the end of string ... */
5471 if (!AT_STRINGS_END (d))
5472 {
f1ad044f 5473 PREFETCH_NOLIMIT ();
2d1675e4 5474 c2 = RE_STRING_CHAR (d, dend - d);
5d967c7a
RS
5475#ifdef emacs
5476 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5477#endif
b18215fc
RS
5478 s2 = SYNTAX (c2);
5479
5480 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5481 returns 0. */
b18215fc 5482 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5483 goto fail;
b18215fc
RS
5484 }
5485 }
e318085a
RS
5486 break;
5487
fa9a63c5 5488 case syntaxspec:
1fb352e0
SM
5489 case notsyntaxspec:
5490 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 5491 mcnt = *p++;
1fb352e0 5492 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 5493 PREFETCH ();
b18215fc
RS
5494#ifdef emacs
5495 {
2d1675e4
SM
5496 int offset = PTR_TO_OFFSET (d);
5497 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
5498 UPDATE_SYNTAX_TABLE (pos1);
5499 }
25fe55af 5500#endif
b18215fc
RS
5501 {
5502 int c, len;
5503
2d1675e4 5504 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 5505
990b2375 5506 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 5507 goto fail;
b18215fc
RS
5508 d += len;
5509 }
fa9a63c5
RM
5510 break;
5511
b18215fc 5512#ifdef emacs
1fb352e0
SM
5513 case before_dot:
5514 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5515 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 5516 goto fail;
b18215fc
RS
5517 break;
5518
1fb352e0
SM
5519 case at_dot:
5520 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5521 if (PTR_BYTE_POS (d) != PT_BYTE)
5522 goto fail;
5523 break;
b18215fc 5524
1fb352e0
SM
5525 case after_dot:
5526 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5527 if (PTR_BYTE_POS (d) <= PT_BYTE)
5528 goto fail;
e318085a 5529 break;
fa9a63c5 5530
1fb352e0 5531 case categoryspec:
b18215fc 5532 case notcategoryspec:
1fb352e0 5533 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 5534 mcnt = *p++;
1fb352e0 5535 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
5536 PREFETCH ();
5537 {
5538 int c, len;
2d1675e4 5539 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 5540
1fb352e0 5541 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
5542 goto fail;
5543 d += len;
5544 }
fa9a63c5 5545 break;
5e69f11e 5546
1fb352e0 5547#endif /* emacs */
5e69f11e 5548
0b32bf0e
SM
5549 default:
5550 abort ();
fa9a63c5 5551 }
b18215fc 5552 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
5553
5554
5555 /* We goto here if a matching operation fails. */
5556 fail:
27c3b45d 5557 QUIT;
fa9a63c5 5558 if (!FAIL_STACK_EMPTY ())
505bde11 5559 {
66f0296e 5560 re_char *str;
505bde11
SM
5561 unsigned char *pat;
5562 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
5563 DEBUG_PRINT1 ("\nFAIL:\n");
5564 POP_FAILURE_POINT (str, pat);
505bde11
SM
5565 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
5566 {
5567 case on_failure_keep_string_jump:
5568 assert (str == NULL);
5569 goto continue_failure_jump;
5570
0683b6fa
SM
5571 case on_failure_jump_nastyloop:
5572 assert ((re_opcode_t)pat[-2] == no_op);
5573 PUSH_FAILURE_POINT (pat - 2, str);
5574 /* Fallthrough */
5575
505bde11
SM
5576 case on_failure_jump_loop:
5577 case on_failure_jump:
5578 case succeed_n:
5579 d = str;
5580 continue_failure_jump:
5581 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5582 p = pat + mcnt;
5583 break;
b18215fc 5584
0683b6fa
SM
5585 case no_op:
5586 /* A special frame used for nastyloops. */
5587 goto fail;
5588
505bde11
SM
5589 default:
5590 abort();
5591 }
fa9a63c5 5592
505bde11 5593 assert (p >= bufp->buffer && p <= pend);
b18215fc 5594
0b32bf0e 5595 if (d >= string1 && d <= end1)
fa9a63c5 5596 dend = end_match_1;
0b32bf0e 5597 }
fa9a63c5 5598 else
0b32bf0e 5599 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
5600 } /* for (;;) */
5601
5602 if (best_regs_set)
5603 goto restore_best_regs;
5604
5605 FREE_VARIABLES ();
5606
b18215fc 5607 return -1; /* Failure to match. */
fa9a63c5
RM
5608} /* re_match_2 */
5609\f
5610/* Subroutine definitions for re_match_2. */
5611
fa9a63c5
RM
5612/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5613 bytes; nonzero otherwise. */
5e69f11e 5614
fa9a63c5 5615static int
2d1675e4
SM
5616bcmp_translate (s1, s2, len, translate, multibyte)
5617 re_char *s1, *s2;
fa9a63c5 5618 register int len;
6676cb1c 5619 RE_TRANSLATE_TYPE translate;
2d1675e4 5620 const int multibyte;
fa9a63c5 5621{
2d1675e4
SM
5622 register re_char *p1 = s1, *p2 = s2;
5623 re_char *p1_end = s1 + len;
5624 re_char *p2_end = s2 + len;
e934739e 5625
4bb91c68
SM
5626 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
5627 different lengths, but relying on a single `len' would break this. -sm */
5628 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 5629 {
e934739e
RS
5630 int p1_charlen, p2_charlen;
5631 int p1_ch, p2_ch;
5632
2d1675e4
SM
5633 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
5634 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
e934739e
RS
5635
5636 if (RE_TRANSLATE (translate, p1_ch)
5637 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 5638 return 1;
e934739e
RS
5639
5640 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 5641 }
e934739e
RS
5642
5643 if (p1 != p1_end || p2 != p2_end)
5644 return 1;
5645
fa9a63c5
RM
5646 return 0;
5647}
5648\f
5649/* Entry points for GNU code. */
5650
5651/* re_compile_pattern is the GNU regular expression compiler: it
5652 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5653 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 5654
fa9a63c5
RM
5655 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5656 are set in BUFP on entry.
5e69f11e 5657
b18215fc 5658 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
5659
5660const char *
5661re_compile_pattern (pattern, length, bufp)
5662 const char *pattern;
0b32bf0e 5663 size_t length;
fa9a63c5
RM
5664 struct re_pattern_buffer *bufp;
5665{
5666 reg_errcode_t ret;
5e69f11e 5667
fa9a63c5
RM
5668 /* GNU code is written to assume at least RE_NREGS registers will be set
5669 (and at least one extra will be -1). */
5670 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 5671
fa9a63c5
RM
5672 /* And GNU code determines whether or not to get register information
5673 by passing null for the REGS argument to re_match, etc., not by
5674 setting no_sub. */
5675 bufp->no_sub = 0;
5e69f11e 5676
4bb91c68 5677 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
5678
5679 if (!ret)
5680 return NULL;
5681 return gettext (re_error_msgid[(int) ret]);
5e69f11e 5682}
c0f9ea08 5683WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 5684\f
b18215fc
RS
5685/* Entry points compatible with 4.2 BSD regex library. We don't define
5686 them unless specifically requested. */
fa9a63c5 5687
0b32bf0e 5688#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
5689
5690/* BSD has one and only one pattern buffer. */
5691static struct re_pattern_buffer re_comp_buf;
5692
5693char *
0b32bf0e 5694# ifdef _LIBC
48afdd44
RM
5695/* Make these definitions weak in libc, so POSIX programs can redefine
5696 these names if they don't use our functions, and still use
5697 regcomp/regexec below without link errors. */
5698weak_function
0b32bf0e 5699# endif
fa9a63c5
RM
5700re_comp (s)
5701 const char *s;
5702{
5703 reg_errcode_t ret;
5e69f11e 5704
fa9a63c5
RM
5705 if (!s)
5706 {
5707 if (!re_comp_buf.buffer)
0b32bf0e 5708 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 5709 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
5710 return 0;
5711 }
5712
5713 if (!re_comp_buf.buffer)
5714 {
5715 re_comp_buf.buffer = (unsigned char *) malloc (200);
5716 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
5717 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5718 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
5719 re_comp_buf.allocated = 200;
5720
5721 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5722 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
5723 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5724 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
5725 }
5726
5727 /* Since `re_exec' always passes NULL for the `regs' argument, we
5728 don't need to initialize the pattern buffer fields which affect it. */
5729
fa9a63c5 5730 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 5731
fa9a63c5
RM
5732 if (!ret)
5733 return NULL;
5734
5735 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5736 return (char *) gettext (re_error_msgid[(int) ret]);
5737}
5738
5739
5740int
0b32bf0e 5741# ifdef _LIBC
48afdd44 5742weak_function
0b32bf0e 5743# endif
fa9a63c5
RM
5744re_exec (s)
5745 const char *s;
5746{
5747 const int len = strlen (s);
5748 return
5749 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5750}
5751#endif /* _REGEX_RE_COMP */
5752\f
5753/* POSIX.2 functions. Don't define these for Emacs. */
5754
5755#ifndef emacs
5756
5757/* regcomp takes a regular expression as a string and compiles it.
5758
b18215fc 5759 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
5760 since POSIX says we shouldn't. Thus, we set
5761
5762 `buffer' to the compiled pattern;
5763 `used' to the length of the compiled pattern;
5764 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5765 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5766 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
5767 `fastmap' to an allocated space for the fastmap;
5768 `fastmap_accurate' to zero;
fa9a63c5
RM
5769 `re_nsub' to the number of subexpressions in PATTERN.
5770
5771 PATTERN is the address of the pattern string.
5772
5773 CFLAGS is a series of bits which affect compilation.
5774
5775 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5776 use POSIX basic syntax.
5777
5778 If REG_NEWLINE is set, then . and [^...] don't match newline.
5779 Also, regexec will try a match beginning after every newline.
5780
5781 If REG_ICASE is set, then we considers upper- and lowercase
5782 versions of letters to be equivalent when matching.
5783
5784 If REG_NOSUB is set, then when PREG is passed to regexec, that
5785 routine will report only success or failure, and nothing about the
5786 registers.
5787
b18215fc 5788 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
5789 the return codes and their meanings.) */
5790
5791int
5792regcomp (preg, pattern, cflags)
5793 regex_t *preg;
5e69f11e 5794 const char *pattern;
fa9a63c5
RM
5795 int cflags;
5796{
5797 reg_errcode_t ret;
4bb91c68 5798 reg_syntax_t syntax
fa9a63c5
RM
5799 = (cflags & REG_EXTENDED) ?
5800 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5801
5802 /* regex_compile will allocate the space for the compiled pattern. */
5803 preg->buffer = 0;
5804 preg->allocated = 0;
5805 preg->used = 0;
5e69f11e 5806
c0f9ea08
SM
5807 /* Try to allocate space for the fastmap. */
5808 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 5809
fa9a63c5
RM
5810 if (cflags & REG_ICASE)
5811 {
5812 unsigned i;
5e69f11e 5813
6676cb1c
RS
5814 preg->translate
5815 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5816 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 5817 if (preg->translate == NULL)
0b32bf0e 5818 return (int) REG_ESPACE;
fa9a63c5
RM
5819
5820 /* Map uppercase characters to corresponding lowercase ones. */
5821 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 5822 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
5823 }
5824 else
5825 preg->translate = NULL;
5826
5827 /* If REG_NEWLINE is set, newlines are treated differently. */
5828 if (cflags & REG_NEWLINE)
5829 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5830 syntax &= ~RE_DOT_NEWLINE;
5831 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
5832 }
5833 else
c0f9ea08 5834 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
5835
5836 preg->no_sub = !!(cflags & REG_NOSUB);
5837
5e69f11e 5838 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 5839 can use strlen here in compiling the pattern. */
4bb91c68 5840 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 5841
fa9a63c5
RM
5842 /* POSIX doesn't distinguish between an unmatched open-group and an
5843 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
5844 if (ret == REG_ERPAREN)
5845 ret = REG_EPAREN;
5846
5847 if (ret == REG_NOERROR && preg->fastmap)
5848 { /* Compute the fastmap now, since regexec cannot modify the pattern
5849 buffer. */
5850 re_compile_fastmap (preg);
5851 if (preg->can_be_null)
5852 { /* The fastmap can't be used anyway. */
5853 free (preg->fastmap);
5854 preg->fastmap = NULL;
5855 }
5856 }
fa9a63c5
RM
5857 return (int) ret;
5858}
c0f9ea08 5859WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
5860
5861
5862/* regexec searches for a given pattern, specified by PREG, in the
5863 string STRING.
5e69f11e 5864
fa9a63c5 5865 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 5866 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
5867 least NMATCH elements, and we set them to the offsets of the
5868 corresponding matched substrings.
5e69f11e 5869
fa9a63c5
RM
5870 EFLAGS specifies `execution flags' which affect matching: if
5871 REG_NOTBOL is set, then ^ does not match at the beginning of the
5872 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 5873
fa9a63c5
RM
5874 We return 0 if we find a match and REG_NOMATCH if not. */
5875
5876int
5877regexec (preg, string, nmatch, pmatch, eflags)
5878 const regex_t *preg;
5e69f11e
RM
5879 const char *string;
5880 size_t nmatch;
5881 regmatch_t pmatch[];
fa9a63c5
RM
5882 int eflags;
5883{
5884 int ret;
5885 struct re_registers regs;
5886 regex_t private_preg;
5887 int len = strlen (string);
c0f9ea08 5888 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
5889
5890 private_preg = *preg;
5e69f11e 5891
fa9a63c5
RM
5892 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5893 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 5894
fa9a63c5
RM
5895 /* The user has told us exactly how many registers to return
5896 information about, via `nmatch'. We have to pass that on to the
b18215fc 5897 matching routines. */
fa9a63c5 5898 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 5899
fa9a63c5
RM
5900 if (want_reg_info)
5901 {
5902 regs.num_regs = nmatch;
4bb91c68
SM
5903 regs.start = TALLOC (nmatch * 2, regoff_t);
5904 if (regs.start == NULL)
0b32bf0e 5905 return (int) REG_NOMATCH;
4bb91c68 5906 regs.end = regs.start + nmatch;
fa9a63c5
RM
5907 }
5908
c0f9ea08
SM
5909 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
5910 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
5911 was a little bit longer but still only matching the real part.
5912 This works because the `endline' will check for a '\n' and will find a
5913 '\0', correctly deciding that this is not the end of a line.
5914 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
5915 a convenient '\0' there. For all we know, the string could be preceded
5916 by '\n' which would throw things off. */
5917
fa9a63c5
RM
5918 /* Perform the searching operation. */
5919 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
5920 /* start: */ 0, /* range: */ len,
5921 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 5922
fa9a63c5
RM
5923 /* Copy the register information to the POSIX structure. */
5924 if (want_reg_info)
5925 {
5926 if (ret >= 0)
0b32bf0e
SM
5927 {
5928 unsigned r;
fa9a63c5 5929
0b32bf0e
SM
5930 for (r = 0; r < nmatch; r++)
5931 {
5932 pmatch[r].rm_so = regs.start[r];
5933 pmatch[r].rm_eo = regs.end[r];
5934 }
5935 }
fa9a63c5 5936
b18215fc 5937 /* If we needed the temporary register info, free the space now. */
fa9a63c5 5938 free (regs.start);
fa9a63c5
RM
5939 }
5940
5941 /* We want zero return to mean success, unlike `re_search'. */
5942 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5943}
c0f9ea08 5944WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
5945
5946
5947/* Returns a message corresponding to an error code, ERRCODE, returned
5948 from either regcomp or regexec. We don't use PREG here. */
5949
5950size_t
5951regerror (errcode, preg, errbuf, errbuf_size)
5952 int errcode;
5953 const regex_t *preg;
5954 char *errbuf;
5955 size_t errbuf_size;
5956{
5957 const char *msg;
5958 size_t msg_size;
5959
5960 if (errcode < 0
5961 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 5962 /* Only error codes returned by the rest of the code should be passed
b18215fc 5963 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
5964 code generates an invalid error code, then the program has a bug.
5965 Dump core so we can fix it. */
5966 abort ();
5967
5968 msg = gettext (re_error_msgid[errcode]);
5969
5970 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 5971
fa9a63c5
RM
5972 if (errbuf_size != 0)
5973 {
5974 if (msg_size > errbuf_size)
0b32bf0e
SM
5975 {
5976 strncpy (errbuf, msg, errbuf_size - 1);
5977 errbuf[errbuf_size - 1] = 0;
5978 }
fa9a63c5 5979 else
0b32bf0e 5980 strcpy (errbuf, msg);
fa9a63c5
RM
5981 }
5982
5983 return msg_size;
5984}
c0f9ea08 5985WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
5986
5987
5988/* Free dynamically allocated space used by PREG. */
5989
5990void
5991regfree (preg)
5992 regex_t *preg;
5993{
5994 if (preg->buffer != NULL)
5995 free (preg->buffer);
5996 preg->buffer = NULL;
5e69f11e 5997
fa9a63c5
RM
5998 preg->allocated = 0;
5999 preg->used = 0;
6000
6001 if (preg->fastmap != NULL)
6002 free (preg->fastmap);
6003 preg->fastmap = NULL;
6004 preg->fastmap_accurate = 0;
6005
6006 if (preg->translate != NULL)
6007 free (preg->translate);
6008 preg->translate = NULL;
6009}
c0f9ea08 6010WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6011
6012#endif /* not emacs */