(longlines-search-forward, longlines-search-backward)
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
fff23de2 6 2002, 2003, 2004, 2005, 2006, 2007, 2008
e468b87f 7 Free Software Foundation, Inc.
bc78d348 8
fa9a63c5
RM
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
e468b87f 11 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
4fc5845f 21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 22 USA. */
fa9a63c5 23
6df42991 24/* TODO:
505bde11 25 - structure the opcode space into opcode+flag.
dc1e502d 26 - merge with glibc's regex.[ch].
01618498 27 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
28 need to modify the compiled regexp so that re_match can be reentrant.
29 - get rid of on_failure_jump_smart by doing the optimization in re_comp
30 rather than at run-time, so that re_match can be reentrant.
01618498 31*/
505bde11 32
fa9a63c5 33/* AIX requires this to be the first thing in the file. */
0b32bf0e 34#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
35 #pragma alloca
36#endif
37
fa9a63c5 38#ifdef HAVE_CONFIG_H
0b32bf0e 39# include <config.h>
fa9a63c5
RM
40#endif
41
4bb91c68
SM
42#if defined STDC_HEADERS && !defined emacs
43# include <stddef.h>
44#else
45/* We need this for `regex.h', and perhaps for the Emacs include files. */
46# include <sys/types.h>
47#endif
fa9a63c5 48
14473664
SM
49/* Whether to use ISO C Amendment 1 wide char functions.
50 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
51#if defined _LIBC
52#define WIDE_CHAR_SUPPORT 1
53#else
14473664 54#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
55 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
56#endif
14473664
SM
57
58/* For platforms which support the ISO C Amendment 1 functionality we
59 support user defined character classes. */
a0ad02f7 60#if WIDE_CHAR_SUPPORT
14473664
SM
61/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
62# include <wchar.h>
63# include <wctype.h>
64#endif
65
c0f9ea08
SM
66#ifdef _LIBC
67/* We have to keep the namespace clean. */
68# define regfree(preg) __regfree (preg)
69# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
70# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
71# define regerror(err_code, preg, errbuf, errbuf_size) \
72 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
73# define re_set_registers(bu, re, nu, st, en) \
74 __re_set_registers (bu, re, nu, st, en)
75# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
77# define re_match(bufp, string, size, pos, regs) \
78 __re_match (bufp, string, size, pos, regs)
79# define re_search(bufp, string, size, startpos, range, regs) \
80 __re_search (bufp, string, size, startpos, range, regs)
81# define re_compile_pattern(pattern, length, bufp) \
82 __re_compile_pattern (pattern, length, bufp)
83# define re_set_syntax(syntax) __re_set_syntax (syntax)
84# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
86# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
87
14473664
SM
88/* Make sure we call libc's function even if the user overrides them. */
89# define btowc __btowc
90# define iswctype __iswctype
91# define wctype __wctype
92
c0f9ea08
SM
93# define WEAK_ALIAS(a,b) weak_alias (a, b)
94
95/* We are also using some library internals. */
96# include <locale/localeinfo.h>
97# include <locale/elem-hash.h>
98# include <langinfo.h>
99#else
100# define WEAK_ALIAS(a,b)
101#endif
102
4bb91c68 103/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 104#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
105# include <libintl.h>
106#else
107# define gettext(msgid) (msgid)
108#endif
109
5e69f11e
RM
110#ifndef gettext_noop
111/* This define is so xgettext can find the internationalizable
112 strings. */
0b32bf0e 113# define gettext_noop(String) String
5e69f11e
RM
114#endif
115
fa9a63c5
RM
116/* The `emacs' switch turns on certain matching commands
117 that make sense only in Emacs. */
118#ifdef emacs
119
0b32bf0e
SM
120# include "lisp.h"
121# include "buffer.h"
b18215fc
RS
122
123/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 124# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 125
0b32bf0e 126# include "syntax.h"
9117d724 127# include "character.h"
0b32bf0e 128# include "category.h"
fa9a63c5 129
7689ef0b
EZ
130# ifdef malloc
131# undef malloc
132# endif
0b32bf0e 133# define malloc xmalloc
7689ef0b
EZ
134# ifdef realloc
135# undef realloc
136# endif
0b32bf0e 137# define realloc xrealloc
7689ef0b
EZ
138# ifdef free
139# undef free
140# endif
0b32bf0e 141# define free xfree
9abbd165 142
7814e705 143/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
144# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
145# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
146
147# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 148# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
cf9c99bc 149# define RE_STRING_CHAR(p, s, multibyte) \
4e8a9132 150 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
cf9c99bc 151# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
2d1675e4
SM
152 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
153
cf9c99bc
KH
154# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)]
155
156# define RE_CHAR_TO_UNIBYTE(c) \
157 (ASCII_CHAR_P (c) ? (c) \
158 : CHAR_BYTE8_P (c) ? CHAR_TO_BYTE8 (c) \
159 : multibyte_char_to_unibyte_safe (c))
160
6fdd04b0
KH
161/* Set C to the (possibly converted to multibyte) character before P. P
162 points into a string which is the virtual concatenation of STR1
163 (which ends at END1) and STR2 (which ends at END2). */
bf216479
KH
164# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
165 do { \
02cb78b5 166 if (target_multibyte) \
bf216479
KH
167 { \
168 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
169 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
170 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
171 c = STRING_CHAR (dtemp, (p) - dtemp); \
172 } \
173 else \
174 { \
175 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 176 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 177 } \
2d1675e4
SM
178 } while (0)
179
6fdd04b0
KH
180/* Set C to the (possibly converted to multibyte) character at P, and set
181 LEN to the byte length of that character. */
/* Store in C the character found at P and in LEN its length in bytes.
   The macro reads a variable named `target_multibyte' from the scope
   it is expanded in: when that is nonzero the character and length
   come from STRING_CHAR_AND_LENGTH; otherwise the single byte at P is
   taken, LEN is 1, and the byte is mapped through
   RE_CHAR_TO_MULTIBYTE.  P and LEN are parenthesized in the expansion
   so that expression arguments such as `d + 1' are read as a unit.  */
# define GET_CHAR_AFTER(c, p, len) \
  do { \
    if (target_multibyte) \
      (c) = STRING_CHAR_AND_LENGTH (p, 0, len); \
    else \
      { \
        (c) = *(p); \
        (len) = 1; \
        (c) = RE_CHAR_TO_MULTIBYTE (c); \
      } \
  } while (0)
4e8a9132 193
fa9a63c5
RM
194#else /* not emacs */
195
196/* If we are not linking with Emacs proper,
197 we can't use the relocating allocator
198 even if config.h says that we can. */
0b32bf0e 199# undef REL_ALLOC
fa9a63c5 200
0b32bf0e
SM
201# if defined STDC_HEADERS || defined _LIBC
202# include <stdlib.h>
203# else
fa9a63c5
RM
204char *malloc ();
205char *realloc ();
0b32bf0e 206# endif
fa9a63c5 207
a77f947b
CY
208/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
209
/* Allocate SIZE bytes with malloc and return the block.  If malloc
   yields a null pointer for a nonzero SIZE, write "virtual memory
   exhausted" to file descriptor 2 and exit with status 1.  A null
   result for a zero-byte request is returned to the caller as is.
   NOTE(review): no declaration of `write' is visible in this part of
   the file -- confirm that <unistd.h> (or equivalent) is in scope.  */
void *
xmalloc (size)
     size_t size;
{
  void *block = malloc (size);

  if (size && !block)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return block;
}
223
/* Resize BLOCK to SIZE bytes and return the resulting block.  A null
   BLOCK is handed to malloc rather than realloc, since some realloc
   implementations do not treat a null pointer as a fresh allocation.
   If the result is null for a nonzero SIZE, write "virtual memory
   exhausted" to file descriptor 2 and exit with status 1.  */
void *
xrealloc (block, size)
     void *block;
     size_t size;
{
  void *resized;

  if (block)
    resized = realloc (block, size);
  else
    resized = malloc (size);

  if (size && !resized)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return resized;
}
243
a073faa6
CY
244# ifdef malloc
245# undef malloc
246# endif
247# define malloc xmalloc
248# ifdef realloc
249# undef realloc
250# endif
251# define realloc xrealloc
252
9e4ecb26 253/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 254 If nothing else has been done, use the method below. */
0b32bf0e
SM
255# ifdef INHIBIT_STRING_HEADER
256# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
257# if !defined bzero && !defined bcopy
258# undef INHIBIT_STRING_HEADER
259# endif
260# endif
261# endif
9e4ecb26 262
4bb91c68 263/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
264 This is used in most programs--a few other programs avoid this
265 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
266# ifndef INHIBIT_STRING_HEADER
267# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
268# include <string.h>
0b32bf0e 269# ifndef bzero
4bb91c68
SM
270# ifndef _LIBC
271# define bzero(s, n) (memset (s, '\0', n), (s))
272# else
273# define bzero(s, n) __bzero (s, n)
274# endif
0b32bf0e
SM
275# endif
276# else
277# include <strings.h>
4bb91c68
SM
278# ifndef memcmp
279# define memcmp(s1, s2, n) bcmp (s1, s2, n)
280# endif
281# ifndef memcpy
282# define memcpy(d, s, n) (bcopy (s, d, n), (d))
283# endif
0b32bf0e
SM
284# endif
285# endif
fa9a63c5
RM
286
287/* Define the syntax stuff for \<, \>, etc. */
288
990b2375 289/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 290enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 291
0b32bf0e 292# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 293
e934739e 294/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
295# define BASE_LEADING_CODE_P(c) (0)
296# define CHAR_CHARSET(c) 0
297# define CHARSET_LEADING_CODE_BASE(c) 0
298# define MAX_MULTIBYTE_LENGTH 1
299# define RE_MULTIBYTE_P(x) 0
bf216479 300# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
301# define WORD_BOUNDARY_P(c1, c2) (0)
302# define CHAR_HEAD_P(p) (1)
303# define SINGLE_BYTE_CHAR_P(c) (1)
304# define SAME_CHARSET_P(c1, c2) (1)
305# define MULTIBYTE_FORM_LENGTH(p, s) (1)
70806df6 306# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
0b32bf0e 307# define STRING_CHAR(p, s) (*(p))
cf9c99bc 308# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s))
0b32bf0e
SM
309# define CHAR_STRING(c, s) (*(s) = (c), 1)
310# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
99f76c1e 311# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) STRING_CHAR_AND_LENGTH ((p), (s), (len))
cf9c99bc
KH
312# define RE_CHAR_TO_MULTIBYTE(c) (c)
313# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 314# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 315 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
/* Store in C the byte found at P and set LEN to 1.  Every argument is
   parenthesized in the expansion so that an expression such as
   `d + 1' passed for P is dereferenced as a whole.  */
# define GET_CHAR_AFTER(c, p, len) \
  ((c) = *(p), (len) = 1)
0b32bf0e 318# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
319# define BYTE8_TO_CHAR(c) (c)
320# define CHAR_BYTE8_P(c) (0)
bf216479 321# define CHAR_LEADING_CODE(c) (c)
8f924df7 322
fa9a63c5 323#endif /* not emacs */
4e8a9132
SM
324
325#ifndef RE_TRANSLATE
0b32bf0e
SM
326# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
327# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 328#endif
fa9a63c5
RM
329\f
330/* Get the interface, including the syntax bits. */
331#include "regex.h"
332
f71b19b6
DL
333/* isalpha etc. are used for the character classes. */
334#include <ctype.h>
fa9a63c5 335
f71b19b6 336#ifdef emacs
fa9a63c5 337
f71b19b6 338/* 1 if C is an ASCII character. */
0b32bf0e 339# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 340
f71b19b6 341/* 1 if C is a unibyte character. */
0b32bf0e 342# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 343
f71b19b6 344/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 345
f71b19b6 346/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
347# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
348# define ISCNTRL(c) ((c) < ' ')
349# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
350 || ((c) >= 'a' && (c) <= 'f') \
351 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
352
353/* This is only used for single-byte characters. */
0b32bf0e 354# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
355
356/* The rest must handle multibyte characters. */
357
0b32bf0e 358# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 359 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
360 : 1)
361
14473664 362# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 363 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
364 : 1)
365
0b32bf0e 366# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
367 ? (((c) >= 'a' && (c) <= 'z') \
368 || ((c) >= 'A' && (c) <= 'Z') \
369 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
370 : SYNTAX (c) == Sword)
371
0b32bf0e 372# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
373 ? (((c) >= 'a' && (c) <= 'z') \
374 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
375 : SYNTAX (c) == Sword)
376
0b32bf0e 377# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 378
0b32bf0e 379# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
380 ? ((c) > ' ' && (c) < 0177 \
381 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
382 || ((c) >= 'A' && (c) <= 'Z') \
383 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
384 : SYNTAX (c) != Sword)
385
0b32bf0e 386# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 387
0b32bf0e 388# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 389
0b32bf0e 390# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
391
392#else /* not emacs */
393
f71b19b6
DL
394/* Jim Meyering writes:
395
396 "... Some ctype macros are valid only for character codes that
397 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
398 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 399 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
400 STDC_HEADERS is defined, then autoconf has verified that the ctype
401 macros don't need to be guarded with references to isascii. ...
402 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
403 eliminate the && through constant folding."
404 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 405
4bb91c68 406# undef ISASCII
0b32bf0e
SM
407# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
408# define ISASCII(c) 1
409# else
410# define ISASCII(c) isascii(c)
411# endif
f71b19b6
DL
412
413/* 1 if C is an ASCII character. */
0b32bf0e 414# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
415
416/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
417# define ISUNIBYTE(c) 1
418
419# ifdef isblank
420# define ISBLANK(c) (ISASCII (c) && isblank (c))
421# else
422# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
423# endif
424# ifdef isgraph
425# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
426# else
427# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
428# endif
429
4bb91c68 430# undef ISPRINT
0b32bf0e
SM
431# define ISPRINT(c) (ISASCII (c) && isprint (c))
432# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
433# define ISALNUM(c) (ISASCII (c) && isalnum (c))
434# define ISALPHA(c) (ISASCII (c) && isalpha (c))
435# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
436# define ISLOWER(c) (ISASCII (c) && islower (c))
437# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
438# define ISSPACE(c) (ISASCII (c) && isspace (c))
439# define ISUPPER(c) (ISASCII (c) && isupper (c))
440# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
441
442# define ISWORD(c) ISALPHA(c)
443
4bb91c68
SM
444# ifdef _tolower
445# define TOLOWER(c) _tolower(c)
446# else
447# define TOLOWER(c) tolower(c)
448# endif
449
450/* How many characters in the character set. */
451# define CHAR_SET_SIZE 256
452
0b32bf0e 453# ifdef SYNTAX_TABLE
f71b19b6 454
0b32bf0e 455extern char *re_syntax_table;
f71b19b6 456
0b32bf0e
SM
457# else /* not SYNTAX_TABLE */
458
0b32bf0e
SM
459static char re_syntax_table[CHAR_SET_SIZE];
460
461static void
462init_syntax_once ()
463{
464 register int c;
465 static int done = 0;
466
467 if (done)
468 return;
469
470 bzero (re_syntax_table, sizeof re_syntax_table);
471
4bb91c68
SM
472 for (c = 0; c < CHAR_SET_SIZE; ++c)
473 if (ISALNUM (c))
474 re_syntax_table[c] = Sword;
fa9a63c5 475
669fa600 476 re_syntax_table['_'] = Ssymbol;
fa9a63c5 477
0b32bf0e
SM
478 done = 1;
479}
480
481# endif /* not SYNTAX_TABLE */
96cc36cc 482
4bb91c68
SM
483# define SYNTAX(c) re_syntax_table[(c)]
484
96cc36cc
RS
485#endif /* not emacs */
486\f
fa9a63c5 487#ifndef NULL
0b32bf0e 488# define NULL (void *)0
fa9a63c5
RM
489#endif
490
491/* We remove any previous definition of `SIGN_EXTEND_CHAR',
492 since ours (we hope) works properly with all combinations of
493 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 494 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
495#undef SIGN_EXTEND_CHAR
496#if __STDC__
0b32bf0e 497# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
498#else /* not __STDC__ */
499/* As in Harbison and Steele. */
0b32bf0e 500# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
501#endif
502\f
503/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
504 use `alloca' instead of `malloc'. This is because using malloc in
505 re_search* or re_match* could cause memory leaks when C-g is used in
506 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
507 the other hand, malloc is more portable, and easier to debug.
508
fa9a63c5
RM
509 Because we sometimes use alloca, some routines have to be macros,
510 not functions -- `alloca'-allocated space disappears at the end of the
511 function it is called in. */
512
513#ifdef REGEX_MALLOC
514
0b32bf0e
SM
515# define REGEX_ALLOCATE malloc
516# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
517# define REGEX_FREE free
fa9a63c5
RM
518
519#else /* not REGEX_MALLOC */
520
521/* Emacs already defines alloca, sometimes. */
0b32bf0e 522# ifndef alloca
fa9a63c5
RM
523
524/* Make alloca work the best possible way. */
0b32bf0e
SM
525# ifdef __GNUC__
526# define alloca __builtin_alloca
527# else /* not __GNUC__ */
7f585e7a 528# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
529# include <alloca.h>
530# endif /* HAVE_ALLOCA_H */
531# endif /* not __GNUC__ */
fa9a63c5 532
0b32bf0e 533# endif /* not alloca */
fa9a63c5 534
0b32bf0e 535# define REGEX_ALLOCATE alloca
fa9a63c5
RM
536
537/* Assumes a `char *destination' variable. */
0b32bf0e 538# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 539 (destination = (char *) alloca (nsize), \
4bb91c68 540 memcpy (destination, source, osize))
fa9a63c5
RM
541
542/* No need to do anything to free, after alloca. */
0b32bf0e 543# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
544
545#endif /* not REGEX_MALLOC */
546
547/* Define how to allocate the failure stack. */
548
0b32bf0e 549#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 550
0b32bf0e 551# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 552 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 553# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 554 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 555# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
556 r_alloc_free (&failure_stack_ptr)
557
4297555e 558#else /* not using relocating allocator */
fa9a63c5 559
0b32bf0e 560# ifdef REGEX_MALLOC
fa9a63c5 561
0b32bf0e
SM
562# define REGEX_ALLOCATE_STACK malloc
563# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
564# define REGEX_FREE_STACK free
fa9a63c5 565
0b32bf0e 566# else /* not REGEX_MALLOC */
fa9a63c5 567
0b32bf0e 568# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 569
0b32bf0e 570# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 571 REGEX_REALLOCATE (source, osize, nsize)
7814e705 572/* No need to explicitly free anything. */
0b32bf0e 573# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 574
0b32bf0e 575# endif /* not REGEX_MALLOC */
4297555e 576#endif /* not using relocating allocator */
fa9a63c5
RM
577
578
579/* True if `size1' is nonzero and PTR is pointing anywhere inside
580 `string1' or just past its end. This works if PTR is NULL, which is
581 a good thing. */
25fe55af 582#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
583 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
584
/* (Re)Allocate N items of type T using malloc, or fail.
   TALLOC returns a fresh array of N items of type T.
   RETALLOC resizes ADDR to N items and stores realloc's result back
   in ADDR, which must therefore be an lvalue.
   RETALLOC_IF is a statement: it does what RETALLOC does when ADDR is
   non-null and what TALLOC does (assigning to ADDR) otherwise.
   REGEX_TALLOC is like TALLOC but allocates with REGEX_ALLOCATE.  */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
#define RETALLOC_IF(addr, n, t) \
  do { \
    if (addr) \
      RETALLOC ((addr), (n), t); \
    else \
      (addr) = TALLOC ((n), t); \
  } while (0)
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
591
4bb91c68 592#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
593
594#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
595
596#undef MAX
597#undef MIN
598#define MAX(a, b) ((a) > (b) ? (a) : (b))
599#define MIN(a, b) ((a) < (b) ? (a) : (b))
600
66f0296e
SM
601/* Type of source-pattern and string chars. */
602typedef const unsigned char re_char;
603
fa9a63c5
RM
604typedef char boolean;
605#define false 0
606#define true 1
607
4bb91c68
SM
608static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
609 re_char *string1, int size1,
610 re_char *string2, int size2,
611 int pos,
612 struct re_registers *regs,
613 int stop));
fa9a63c5
RM
614\f
615/* These are the command codes that appear in compiled regular
4bb91c68 616 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
617 command code can specify any interpretation whatsoever for its
618 arguments. Zero bytes may appear in the compiled regular expression. */
619
/* The opcodes.  no_op is 0 and each following enumerator takes the
   next integer, so the order of the entries below is significant.  */
typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

  /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one char belonging to specified set.  First
     following byte is number of bitmap bytes.  Then come bytes
     for a bitmap saying which chars are in.  Bits in each byte
     are ordered low-bit-first.  A character is in the set if its
     bit is 1.  A character too large to have a bit in the map is
     automatically not in the set.

     If the length byte has the 0x80 bit set, then that stuff
     is followed by a range table:
         2 bytes of flags for character sets (low 8 bits, high 8 bits)
             See RANGE_TABLE_WORK_BITS below.
         2 bytes, the number of pairs that follow (up to 32767)
         pairs, each 2 multibyte characters,
             each multibyte character represented as 3 bytes.  */
  charset,

  /* Same parameters as charset, but match any character that is
     not one of those specified.  */
  charset_not,

  /* Start remembering the text that is matched, for storing in a
     register.  Followed by one byte with the register number, in
     the range 0 to one less than the pattern buffer's re_nsub
     field.  */
  start_memory,

  /* Stop remembering the text that is matched and store it in a
     memory register.  Followed by one byte with the register
     number, in the range 0 to one less than `re_nsub' in the
     pattern buffer.  */
  stop_memory,

  /* Match a duplicate of something remembered.  Followed by one
     byte containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning
     of string to be matched (if not).  */
  begbuf,

  /* Analogously, for end of buffer/string.  */
  endbuf,

  /* Followed by two byte relative address to which to jump.  */
  jump,

  /* Followed by two-byte relative address of place to resume at
     in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but pushes a placeholder instead of the
     current string position when executed.  */
  on_failure_keep_string_jump,

  /* Just like `on_failure_jump', except that it checks that we
     don't get stuck in an infinite loop (matching an empty string
     indefinitely).  */
  on_failure_jump_loop,

  /* Just like `on_failure_jump_loop', except that it checks for
     a different kind of loop (the kind that shows up with non-greedy
     operators).  This operation has to be immediately preceded
     by a `no_op'.  */
  on_failure_jump_nastyloop,

  /* A smart `on_failure_jump' used for greedy * and + operators.
     It analyses the loop before which it is put and if the
     loop does not require backtracking, it changes itself to
     `on_failure_keep_string_jump' and short-circuits the loop,
     else it just defaults to changing itself into `on_failure_jump'.
     It assumes that it is pointing to just past a `jump'.  */
  on_failure_jump_smart,

  /* Followed by two-byte relative address and two-byte number n.
     After matching N times, jump to the address upon failure.
     Does not work if N starts at 0: use on_failure_jump_loop
     instead.  */
  succeed_n,

  /* Followed by two-byte relative address, and two-byte number n.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the
     subsequent two-byte number.  The address *includes* the two
     bytes of number.  */
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound,	/* Succeeds if not at a word boundary.  */

  symbeg,	/* Succeeds if at symbol beginning.  */
  symend,	/* Succeeds if at symbol end.  */

  /* Matches any character whose syntax is specified.  Followed by
     a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec

  /* The Emacs-only opcodes start with a leading comma so that
     notsyntaxspec stays the last enumerator, without a trailing
     comma, when `emacs' is not defined.  */
#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Matches any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
759\f
760/* Common operations on the compiled pattern. */
761
762/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
763
764#define STORE_NUMBER(destination, number) \
765 do { \
766 (destination)[0] = (number) & 0377; \
767 (destination)[1] = (number) >> 8; \
768 } while (0)
769
770/* Same as STORE_NUMBER, except increment DESTINATION to
771 the byte after where the number is stored. Therefore, DESTINATION
772 must be an lvalue. */
773
774#define STORE_NUMBER_AND_INCR(destination, number) \
775 do { \
776 STORE_NUMBER (destination, number); \
777 (destination) += 2; \
778 } while (0)
779
780/* Put into DESTINATION a number stored in two contiguous bytes starting
781 at SOURCE. */
782
/* DESTINATION receives the signed number whose low byte is at SOURCE
   and whose high byte is at SOURCE + 1.  The sign-extended high byte
   is scaled by multiplication rather than by `<< 8', because
   left-shifting a negative value is undefined behavior in ISO C.  */
#define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source) & 0377; \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256; \
  } while (0)
788
789#ifdef DEBUG
4bb91c68 790static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
791static void
792extract_number (dest, source)
793 int *dest;
01618498 794 re_char *source;
fa9a63c5 795{
5e69f11e 796 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
797 *dest = *source & 0377;
798 *dest += temp << 8;
799}
800
4bb91c68 801# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
802# undef EXTRACT_NUMBER
803# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
804# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
805
806#endif /* DEBUG */
807
808/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
809 SOURCE must be an lvalue. */
810
811#define EXTRACT_NUMBER_AND_INCR(destination, source) \
812 do { \
813 EXTRACT_NUMBER (destination, source); \
25fe55af 814 (source) += 2; \
fa9a63c5
RM
815 } while (0)
816
817#ifdef DEBUG
4bb91c68
SM
818static void extract_number_and_incr _RE_ARGS ((int *destination,
819 re_char **source));
fa9a63c5
RM
820static void
821extract_number_and_incr (destination, source)
822 int *destination;
01618498 823 re_char **source;
5e69f11e 824{
fa9a63c5
RM
825 extract_number (destination, *source);
826 *source += 2;
827}
828
0b32bf0e
SM
829# ifndef EXTRACT_MACROS
830# undef EXTRACT_NUMBER_AND_INCR
831# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 832 extract_number_and_incr (&dest, &src)
0b32bf0e 833# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
834
835#endif /* DEBUG */
836\f
b18215fc
RS
837/* Store a multibyte character in three contiguous bytes starting
838 at DESTINATION, and increment DESTINATION to the byte after where the
7814e705 839 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
840
841#define STORE_CHARACTER_AND_INCR(destination, character) \
842 do { \
843 (destination)[0] = (character) & 0377; \
844 (destination)[1] = ((character) >> 8) & 0377; \
845 (destination)[2] = (character) >> 16; \
846 (destination) += 3; \
847 } while (0)
848
849/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 850 starting at SOURCE. */
b18215fc
RS
851
852#define EXTRACT_CHARACTER(destination, source) \
853 do { \
854 (destination) = ((source)[0] \
855 | ((source)[1] << 8) \
856 | ((source)[2] << 16)); \
857 } while (0)
858
859
860/* Macros for charset. */
861
862/* Size of bitmap of charset P in bytes. P is a start of charset,
863 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
864#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
865
866/* Nonzero if charset P has range table. */
25fe55af 867#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
868
869/* Return the address of the range table of charset P: not the start
870 of the range pairs themselves, but the two bytes before them, where
96cc36cc
RS
871 the number of ranges is stored. `4 + CHARSET_BITMAP_SIZE (p)' skips
872 the re_opcode_t, the bitmap-size byte, the bitmap, and the 2 bytes of flags. */
873#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
874
875/* Extract the bit flags that start a range table. */
876#define CHARSET_RANGE_TABLE_BITS(p) \
877 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
878 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
879
880/* Test if C is listed in the bitmap of charset P. */
881#define CHARSET_LOOKUP_BITMAP(p, c) \
882 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
883 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
884
885/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
886 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
887 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
888 and end. */
889#define CHARSET_RANGE_TABLE_END(range_table, count) \
890 ((range_table) + (count) * 2 * 3)
891
7814e705 892/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
893 COUNT is number of ranges in RANGE_TABLE. */
894#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
895 do \
896 { \
01618498
SM
897 re_wchar_t range_start, range_end; \
898 re_char *p; \
899 re_char *range_table_end \
b18215fc
RS
900 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
901 \
902 for (p = (range_table); p < range_table_end; p += 2 * 3) \
903 { \
904 EXTRACT_CHARACTER (range_start, p); \
905 EXTRACT_CHARACTER (range_end, p + 3); \
906 \
907 if (range_start <= (c) && (c) <= range_end) \
908 { \
909 (not) = !(not); \
910 break; \
911 } \
912 } \
913 } \
914 while (0)
915
916/* Test if C is in range table of CHARSET. The flag NOT is negated if
917 C is listed in it. */
918#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
919 do \
920 { \
921 /* Number of ranges in range table. */ \
922 int count; \
01618498
SM
923 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
924 \
b18215fc
RS
925 EXTRACT_NUMBER_AND_INCR (count, range_table); \
926 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
927 } \
928 while (0)
929\f
fa9a63c5
RM
930/* If DEBUG is defined, Regex prints many voluminous messages about what
931 it is doing (if the variable `debug' is nonzero). If linked with the
932 main program in `iregex.c', you can enter patterns and strings
933 interactively. And if linked with the main program in `main.c' and
4bb91c68 934 the other test files, you can run the already-written tests. */
fa9a63c5
RM
935
936#ifdef DEBUG
937
938/* We use standard I/O for debugging. */
0b32bf0e 939# include <stdio.h>
fa9a63c5
RM
940
941/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 942# include <assert.h>
fa9a63c5 943
99633e97 944static int debug = -100000;
fa9a63c5 945
0b32bf0e
SM
946# define DEBUG_STATEMENT(e) e
947# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
948# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
949# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
950# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
951# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 952 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 953# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 954 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
955
956
957/* Print the fastmap in human-readable form. */
958
959void
960print_fastmap (fastmap)
961 char *fastmap;
962{
963 unsigned was_a_range = 0;
5e69f11e
RM
964 unsigned i = 0;
965
fa9a63c5
RM
966 while (i < (1 << BYTEWIDTH))
967 {
968 if (fastmap[i++])
969 {
970 was_a_range = 0;
25fe55af
RS
971 putchar (i - 1);
972 while (i < (1 << BYTEWIDTH) && fastmap[i])
973 {
974 was_a_range = 1;
975 i++;
976 }
fa9a63c5 977 if (was_a_range)
25fe55af
RS
978 {
979 printf ("-");
980 putchar (i - 1);
981 }
982 }
fa9a63c5 983 }
5e69f11e 984 putchar ('\n');
fa9a63c5
RM
985}
986
987
988/* Print a compiled pattern string in human-readable form, starting at
989 the START pointer into it and ending just before the pointer END. */
990
991void
992print_partial_compiled_pattern (start, end)
01618498
SM
993 re_char *start;
994 re_char *end;
fa9a63c5
RM
995{
996 int mcnt, mcnt2;
01618498
SM
997 re_char *p = start;
998 re_char *pend = end;
fa9a63c5
RM
999
1000 if (start == NULL)
1001 {
a1a052df 1002 fprintf (stderr, "(null)\n");
fa9a63c5
RM
1003 return;
1004 }
5e69f11e 1005
fa9a63c5
RM
1006 /* Loop over pattern commands. */
1007 while (p < pend)
1008 {
a1a052df 1009 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
1010
1011 switch ((re_opcode_t) *p++)
1012 {
25fe55af 1013 case no_op:
a1a052df 1014 fprintf (stderr, "/no_op");
25fe55af 1015 break;
fa9a63c5 1016
99633e97 1017 case succeed:
a1a052df 1018 fprintf (stderr, "/succeed");
99633e97
SM
1019 break;
1020
fa9a63c5
RM
1021 case exactn:
1022 mcnt = *p++;
a1a052df 1023 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 1024 do
fa9a63c5 1025 {
a1a052df 1026 fprintf (stderr, "/%c", *p++);
25fe55af
RS
1027 }
1028 while (--mcnt);
1029 break;
fa9a63c5
RM
1030
1031 case start_memory:
a1a052df 1032 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1033 break;
fa9a63c5
RM
1034
1035 case stop_memory:
a1a052df 1036 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1037 break;
fa9a63c5
RM
1038
1039 case duplicate:
a1a052df 1040 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1041 break;
1042
1043 case anychar:
a1a052df 1044 fprintf (stderr, "/anychar");
fa9a63c5
RM
1045 break;
1046
1047 case charset:
25fe55af
RS
1048 case charset_not:
1049 {
1050 register int c, last = -100;
fa9a63c5 1051 register int in_range = 0;
99633e97
SM
1052 int length = CHARSET_BITMAP_SIZE (p - 1);
1053 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1054
a1a052df 1055 fprintf (stderr, "/charset [%s",
839966f3 1056 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1057
839966f3
KH
1058 if (p + *p >= pend)
1059 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1060
25fe55af 1061 for (c = 0; c < 256; c++)
96cc36cc 1062 if (c / 8 < length
fa9a63c5
RM
1063 && (p[1 + (c/8)] & (1 << (c % 8))))
1064 {
1065 /* Are we starting a range? */
1066 if (last + 1 == c && ! in_range)
1067 {
a1a052df 1068 fprintf (stderr, "-");
fa9a63c5
RM
1069 in_range = 1;
1070 }
1071 /* Have we broken a range? */
1072 else if (last + 1 != c && in_range)
96cc36cc 1073 {
a1a052df 1074 fprintf (stderr, "%c", last);
fa9a63c5
RM
1075 in_range = 0;
1076 }
5e69f11e 1077
fa9a63c5 1078 if (! in_range)
a1a052df 1079 fprintf (stderr, "%c", c);
fa9a63c5
RM
1080
1081 last = c;
25fe55af 1082 }
fa9a63c5
RM
1083
1084 if (in_range)
a1a052df 1085 fprintf (stderr, "%c", last);
fa9a63c5 1086
a1a052df 1087 fprintf (stderr, "]");
fa9a63c5 1088
99633e97 1089 p += 1 + length;
96cc36cc 1090
96cc36cc 1091 if (has_range_table)
99633e97
SM
1092 {
1093 int count;
a1a052df 1094 fprintf (stderr, "has-range-table");
99633e97
SM
1095
1096 /* ??? Should print the range table; for now, just skip it. */
1097 p += 2; /* skip range table bits */
1098 EXTRACT_NUMBER_AND_INCR (count, p);
1099 p = CHARSET_RANGE_TABLE_END (p, count);
1100 }
fa9a63c5
RM
1101 }
1102 break;
1103
1104 case begline:
a1a052df 1105 fprintf (stderr, "/begline");
25fe55af 1106 break;
fa9a63c5
RM
1107
1108 case endline:
a1a052df 1109 fprintf (stderr, "/endline");
25fe55af 1110 break;
fa9a63c5
RM
1111
1112 case on_failure_jump:
25fe55af 1113 extract_number_and_incr (&mcnt, &p);
a1a052df 1114 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1115 break;
fa9a63c5
RM
1116
1117 case on_failure_keep_string_jump:
25fe55af 1118 extract_number_and_incr (&mcnt, &p);
a1a052df 1119 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1120 break;
fa9a63c5 1121
0683b6fa
SM
1122 case on_failure_jump_nastyloop:
1123 extract_number_and_incr (&mcnt, &p);
a1a052df 1124 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1125 break;
1126
505bde11 1127 case on_failure_jump_loop:
fa9a63c5 1128 extract_number_and_incr (&mcnt, &p);
a1a052df 1129 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1130 break;
1131
505bde11 1132 case on_failure_jump_smart:
fa9a63c5 1133 extract_number_and_incr (&mcnt, &p);
a1a052df 1134 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1135 break;
1136
25fe55af 1137 case jump:
fa9a63c5 1138 extract_number_and_incr (&mcnt, &p);
a1a052df 1139 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1140 break;
1141
25fe55af
RS
1142 case succeed_n:
1143 extract_number_and_incr (&mcnt, &p);
1144 extract_number_and_incr (&mcnt2, &p);
a1a052df 1145 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1146 break;
5e69f11e 1147
25fe55af
RS
1148 case jump_n:
1149 extract_number_and_incr (&mcnt, &p);
1150 extract_number_and_incr (&mcnt2, &p);
a1a052df 1151 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1152 break;
5e69f11e 1153
25fe55af
RS
1154 case set_number_at:
1155 extract_number_and_incr (&mcnt, &p);
1156 extract_number_and_incr (&mcnt2, &p);
a1a052df 1157 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1158 break;
5e69f11e 1159
25fe55af 1160 case wordbound:
a1a052df 1161 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1162 break;
1163
1164 case notwordbound:
a1a052df 1165 fprintf (stderr, "/notwordbound");
25fe55af 1166 break;
fa9a63c5
RM
1167
1168 case wordbeg:
a1a052df 1169 fprintf (stderr, "/wordbeg");
fa9a63c5 1170 break;
5e69f11e 1171
fa9a63c5 1172 case wordend:
a1a052df 1173 fprintf (stderr, "/wordend");
e2543b02 1174 break;
5e69f11e 1175
669fa600 1176 case symbeg:
e2543b02 1177 fprintf (stderr, "/symbeg");
669fa600
SM
1178 break;
1179
1180 case symend:
e2543b02 1181 fprintf (stderr, "/symend");
669fa600 1182 break;
5e69f11e 1183
1fb352e0 1184 case syntaxspec:
a1a052df 1185 fprintf (stderr, "/syntaxspec");
1fb352e0 1186 mcnt = *p++;
a1a052df 1187 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1188 break;
1189
1190 case notsyntaxspec:
a1a052df 1191 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1192 mcnt = *p++;
a1a052df 1193 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1194 break;
1195
0b32bf0e 1196# ifdef emacs
fa9a63c5 1197 case before_dot:
a1a052df 1198 fprintf (stderr, "/before_dot");
25fe55af 1199 break;
fa9a63c5
RM
1200
1201 case at_dot:
a1a052df 1202 fprintf (stderr, "/at_dot");
25fe55af 1203 break;
fa9a63c5
RM
1204
1205 case after_dot:
a1a052df 1206 fprintf (stderr, "/after_dot");
25fe55af 1207 break;
fa9a63c5 1208
1fb352e0 1209 case categoryspec:
a1a052df 1210 fprintf (stderr, "/categoryspec");
fa9a63c5 1211 mcnt = *p++;
a1a052df 1212 fprintf (stderr, "/%d", mcnt);
25fe55af 1213 break;
5e69f11e 1214
1fb352e0 1215 case notcategoryspec:
a1a052df 1216 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1217 mcnt = *p++;
a1a052df 1218 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1219 break;
0b32bf0e 1220# endif /* emacs */
fa9a63c5 1221
fa9a63c5 1222 case begbuf:
a1a052df 1223 fprintf (stderr, "/begbuf");
25fe55af 1224 break;
fa9a63c5
RM
1225
1226 case endbuf:
a1a052df 1227 fprintf (stderr, "/endbuf");
25fe55af 1228 break;
fa9a63c5 1229
25fe55af 1230 default:
a1a052df 1231 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1232 }
1233
a1a052df 1234 fprintf (stderr, "\n");
fa9a63c5
RM
1235 }
1236
a1a052df 1237 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1238}
1239
1240
1241void
1242print_compiled_pattern (bufp)
1243 struct re_pattern_buffer *bufp;
1244{
01618498 1245 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1246
1247 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1248 printf ("%ld bytes used/%ld bytes allocated.\n",
1249 bufp->used, bufp->allocated);
fa9a63c5
RM
1250
1251 if (bufp->fastmap_accurate && bufp->fastmap)
1252 {
1253 printf ("fastmap: ");
1254 print_fastmap (bufp->fastmap);
1255 }
1256
1257 printf ("re_nsub: %d\t", bufp->re_nsub);
1258 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1259 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1260 printf ("no_sub: %d\t", bufp->no_sub);
1261 printf ("not_bol: %d\t", bufp->not_bol);
1262 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1263 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1264 fflush (stdout);
fa9a63c5
RM
1265 /* Perhaps we should print the translate table? */
1266}
1267
1268
1269void
1270print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1271 re_char *where;
1272 re_char *string1;
1273 re_char *string2;
fa9a63c5
RM
1274 int size1;
1275 int size2;
1276{
4bb91c68 1277 int this_char;
5e69f11e 1278
fa9a63c5
RM
1279 if (where == NULL)
1280 printf ("(null)");
1281 else
1282 {
1283 if (FIRST_STRING_P (where))
25fe55af
RS
1284 {
1285 for (this_char = where - string1; this_char < size1; this_char++)
1286 putchar (string1[this_char]);
fa9a63c5 1287
25fe55af
RS
1288 where = string2;
1289 }
fa9a63c5
RM
1290
1291 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1292 putchar (string2[this_char]);
fa9a63c5
RM
1293 }
1294}
1295
1296#else /* not DEBUG */
1297
0b32bf0e
SM
1298# undef assert
1299# define assert(e)
fa9a63c5 1300
0b32bf0e
SM
1301# define DEBUG_STATEMENT(e)
1302# define DEBUG_PRINT1(x)
1303# define DEBUG_PRINT2(x1, x2)
1304# define DEBUG_PRINT3(x1, x2, x3)
1305# define DEBUG_PRINT4(x1, x2, x3, x4)
1306# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1307# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1308
1309#endif /* not DEBUG */
1310\f
1311/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1312 also be assigned to arbitrarily: each pattern buffer stores its own
1313 syntax, so it can be changed between regex compilations. */
1314/* This has no initializer because initialized variables in Emacs
1315 become read-only after dumping. */
1316reg_syntax_t re_syntax_options;
1317
1318
1319/* Specify the precise syntax of regexps for compilation. This provides
1320 for compatibility for various utilities which historically have
1321 different, incompatible syntaxes.
1322
1323 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1324 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1325
1326reg_syntax_t
1327re_set_syntax (syntax)
f9b0fd99 1328 reg_syntax_t syntax;
fa9a63c5
RM
1329{
1330 reg_syntax_t ret = re_syntax_options;
5e69f11e 1331
fa9a63c5
RM
1332 re_syntax_options = syntax;
1333 return ret;
1334}
c0f9ea08 1335WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1336
1337/* Regexp to use to replace spaces, or NULL meaning don't. */
1338static re_char *whitespace_regexp;
1339
1340void
1341re_set_whitespace_regexp (regexp)
6470ea05 1342 const char *regexp;
f9b0fd99 1343{
6470ea05 1344 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1345}
1346WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1347\f
1348/* This table gives an error message for each of the error codes listed
4bb91c68 1349 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1350 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1351 but why not be nice? */
fa9a63c5
RM
1352
1353static const char *re_error_msgid[] =
5e69f11e
RM
1354 {
1355 gettext_noop ("Success"), /* REG_NOERROR */
1356 gettext_noop ("No match"), /* REG_NOMATCH */
1357 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1358 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1359 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1360 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1361 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1362 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1363 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1364 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1365 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1366 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1367 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1368 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1369 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1370 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1371 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1372 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1373 };
1374\f
4bb91c68 1375/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1376
1377/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1378 searching and matching functions should not call alloca. On some
1379 systems, alloca is implemented in terms of malloc, and if we're
1380 using the relocating allocator routines, then malloc could cause a
1381 relocation, which might (if the strings being searched are in the
1382 ralloc heap) shift the data out from underneath the regexp
1383 routines.
1384
5e69f11e 1385 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1386 processes input from X in a signal handler; processing X input may
1387 call malloc; if input arrives while a matching routine is calling
1388 malloc, then we're scrod. But Emacs can't just block input while
1389 calling matching routines; then we don't notice interrupts when
1390 they come in. So, Emacs blocks input around all regexp calls
1391 except the matching calls, which it leaves unprotected, in the
1392 faith that they will not malloc. */
1393
1394/* Normally, this is fine. */
1395#define MATCH_MAY_ALLOCATE
1396
fa9a63c5
RM
1397/* The match routines may not allocate if (1) they would do it with malloc
1398 and (2) it's not safe for them to use malloc.
1399 Note that if REL_ALLOC is defined, matching would not use malloc for the
1400 failure stack, but we would still use it for the register vectors;
4bb91c68 1401 so REL_ALLOC should not affect this. */
b588157e 1402#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1403# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1404#endif
1405
1406\f
1407/* Failure stack declarations and macros; both re_compile_fastmap and
1408 re_match_2 use a failure stack. These have to be macros because of
1409 REGEX_ALLOCATE_STACK. */
5e69f11e 1410
fa9a63c5 1411
320a2a73 1412/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1413 when matching. If this number is exceeded, we allocate more
1414 space, so it is not a hard limit. */
1415#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1416# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1417#endif
1418
1419/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1420 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1421 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1422 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1423 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1424# if defined MATCH_MAY_ALLOCATE
1425/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1426 whose default stack limit is 2mb. In order for a larger
1427 value to work reliably, you have to try to make it accord
1428 with the process stack limit. */
c0f9ea08
SM
1429size_t re_max_failures = 40000;
1430# else
1431size_t re_max_failures = 4000;
1432# endif
fa9a63c5
RM
1433
1434union fail_stack_elt
1435{
01618498 1436 re_char *pointer;
c0f9ea08
SM
1437 /* This should be the biggest `int' that's no bigger than a pointer. */
1438 long integer;
fa9a63c5
RM
1439};
1440
1441typedef union fail_stack_elt fail_stack_elt_t;
1442
1443typedef struct
1444{
1445 fail_stack_elt_t *stack;
c0f9ea08
SM
1446 size_t size;
1447 size_t avail; /* Offset of next open position. */
1448 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1449} fail_stack_type;
1450
505bde11 1451#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1452#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1453
1454
1455/* Define macros to initialize and free the failure stack.
1456 Do `return -2' if the alloc fails. */
1457
1458#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1459# define INIT_FAIL_STACK() \
fa9a63c5
RM
1460 do { \
1461 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1462 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1463 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1464 \
1465 if (fail_stack.stack == NULL) \
1466 return -2; \
1467 \
1468 fail_stack.size = INIT_FAILURE_ALLOC; \
1469 fail_stack.avail = 0; \
505bde11 1470 fail_stack.frame = 0; \
fa9a63c5
RM
1471 } while (0)
1472
0b32bf0e 1473# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1474#else
0b32bf0e 1475# define INIT_FAIL_STACK() \
fa9a63c5
RM
1476 do { \
1477 fail_stack.avail = 0; \
505bde11 1478 fail_stack.frame = 0; \
fa9a63c5
RM
1479 } while (0)
1480
0b32bf0e 1481# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1482#endif
1483
1484
320a2a73
KH
1485/* Double the size of FAIL_STACK, up to a limit
1486 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1487
1488 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1489 allocating space for it or it was already too large.
1490
4bb91c68 1491 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1492
320a2a73
KH
1493/* Factor to increase the failure stack size by
1494 when we increase it.
1495 This used to be 2, but 2 was too wasteful
1496 because the old discarded stacks added up to as much space
1497 as the ultimate, maximum-size stack. */
1498#define FAIL_STACK_GROWTH_FACTOR 4
1499
1500#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1501 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1502 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1503 ? 0 \
320a2a73
KH
1504 : ((fail_stack).stack \
1505 = (fail_stack_elt_t *) \
25fe55af
RS
1506 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1507 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1508 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1509 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1510 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1511 \
1512 (fail_stack).stack == NULL \
1513 ? 0 \
6453db45
KH
1514 : ((fail_stack).size \
1515 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1516 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1517 * FAIL_STACK_GROWTH_FACTOR)) \
1518 / sizeof (fail_stack_elt_t)), \
25fe55af 1519 1)))
fa9a63c5
RM
1520
1521
fa9a63c5
RM
1522/* Push a pointer value onto the failure stack.
1523 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1524 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1525#define PUSH_FAILURE_POINTER(item) \
01618498 1526 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1527
1528/* This pushes an integer-valued item onto the failure stack.
1529 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1530 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1531#define PUSH_FAILURE_INT(item) \
1532 fail_stack.stack[fail_stack.avail++].integer = (item)
1533
1534/* Push a fail_stack_elt_t value onto the failure stack.
1535 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1536 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1537#define PUSH_FAILURE_ELT(item) \
1538 fail_stack.stack[fail_stack.avail++] = (item)
1539
1540/* These three POP... operations complement the three PUSH... operations.
1541 All assume that `fail_stack' is nonempty. */
1542#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1543#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1544#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1545
505bde11
SM
1546/* Individual items aside from the registers. */
1547#define NUM_NONREG_ITEMS 3
1548
1549/* Used to examine the stack (to detect infinite loops). */
1550#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1551#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1552#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1553#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1554
1555
505bde11
SM
/* Make sure more than SPACE slots remain free on the failure stack,
   calling GROW_FAIL_STACK as many times as needed.  Does `return -2'
   from the enclosing function if the stack cannot be grown.  Assumes
   the variable `fail_stack'.

   The body is wrapped in do ... while (0) so the macro expands to a
   single statement, and the size_t values are cast to match `%d'.  */
#define ENSURE_FAIL_STACK(space)					\
do {									\
  while (REMAINING_AVAIL_SLOTS <= (space)) {				\
    if (!GROW_FAIL_STACK (fail_stack))					\
      return -2;							\
    DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n",			\
		  (int) (fail_stack).size);				\
    DEBUG_PRINT2 (" slots available: %d\n",				\
		  (int) REMAINING_AVAIL_SLOTS);				\
  }									\
} while (0)
1563
1564/* Push register NUM onto the stack. */
1565#define PUSH_FAILURE_REG(num) \
1566do { \
1567 char *destination; \
1568 ENSURE_FAIL_STACK(3); \
1569 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1570 num, regstart[num], regend[num]); \
1571 PUSH_FAILURE_POINTER (regstart[num]); \
1572 PUSH_FAILURE_POINTER (regend[num]); \
1573 PUSH_FAILURE_INT (num); \
1574} while (0)
1575
01618498
SM
1576/* Change the counter's value to VAL, but make sure that it will
1577 be reset when backtracking. */
1578#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1579do { \
1580 char *destination; \
1581 int c; \
1582 ENSURE_FAIL_STACK(3); \
1583 EXTRACT_NUMBER (c, ptr); \
01618498 1584 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1585 PUSH_FAILURE_INT (c); \
1586 PUSH_FAILURE_POINTER (ptr); \
1587 PUSH_FAILURE_INT (-1); \
01618498 1588 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1589} while (0)
1590
505bde11 1591/* Pop a saved register off the stack. */
dc1e502d 1592#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1593do { \
1594 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1595 if (reg == -1) \
1596 { \
1597 /* It's a counter. */ \
6dcf2d0e
SM
1598 /* Here, we discard `const', making re_match non-reentrant. */ \
1599 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1600 reg = POP_FAILURE_INT (); \
1601 STORE_NUMBER (ptr, reg); \
1602 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1603 } \
1604 else \
1605 { \
1606 regend[reg] = POP_FAILURE_POINTER (); \
1607 regstart[reg] = POP_FAILURE_POINTER (); \
1608 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1609 reg, regstart[reg], regend[reg]); \
1610 } \
505bde11
SM
1611} while (0)
1612
/* Check that we are not stuck in an infinite loop.  Starting from the
   top failure frame, follow the chain of frames for as long as the
   saved string position equals STRING_PLACE or is NULL; if one of those
   frames saved the pattern position PAT_CUR, set `cycle' to 1 and stop.
   Assumes the variables `fail_stack', `cycle' and `bufp'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)			\
do {									\
  int failure = TOP_FAILURE_HANDLE ();					\
  /* Check for infinite matching loops */				\
  while (failure > 0							\
	 && (FAILURE_STR (failure) == string_place			\
	     || FAILURE_STR (failure) == NULL))				\
    {									\
      assert (FAILURE_PAT (failure) >= bufp->buffer			\
	      && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);	\
      if (FAILURE_PAT (failure) == pat_cur)				\
	{								\
	  cycle = 1;							\
	  break;							\
	}								\
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure));	\
      failure = NEXT_FAILURE_HANDLE(failure);				\
    }									\
  /* FAILURE may be 0 here (chain exhausted); FAILURE_STR would then	\
     index before the start of the stack, so print NULL instead.  */	\
  DEBUG_PRINT2 (" Other string: %p\n",					\
		failure > 0 ? FAILURE_STR (failure) : NULL);		\
} while (0)
6df42991 1634
fa9a63c5 1635/* Push the information about the state we will need
5e69f11e
RM
1636 if we ever fail back to it.
1637
505bde11 1638 Requires variables fail_stack, regstart, regend and
320a2a73 1639 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1640 declared.
5e69f11e 1641
fa9a63c5
RM
1642 Does `return -2' if it runs out of memory. */
1643
505bde11
SM
1644#define PUSH_FAILURE_POINT(pattern, string_place) \
1645do { \
1646 char *destination; \
1647 /* Must be int, so when we don't save any registers, the arithmetic \
1648 of 0 + -1 isn't done as unsigned. */ \
1649 \
505bde11 1650 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1651 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1652 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1653 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1654 \
1655 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1656 \
1657 DEBUG_PRINT1 ("\n"); \
1658 \
1659 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1660 PUSH_FAILURE_INT (fail_stack.frame); \
1661 \
1662 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1663 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1664 DEBUG_PRINT1 ("'\n"); \
1665 PUSH_FAILURE_POINTER (string_place); \
1666 \
1667 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1668 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1669 PUSH_FAILURE_POINTER (pattern); \
1670 \
1671 /* Close the frame by moving the frame pointer past it. */ \
1672 fail_stack.frame = fail_stack.avail; \
1673} while (0)
fa9a63c5 1674
320a2a73
KH
1675/* Estimate the size of data pushed by a typical failure stack entry.
1676 An estimate is all we need, because all we use this for
1677 is to choose a limit for how big to make the failure stack. */
ada30c0e 1678/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1679#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1680
fa9a63c5
RM
1681/* How many items can still be added to the stack without overflowing it. */
1682#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1683
1684
1685/* Pops what PUSH_FAILURE_POINT pushes.
1686
1687 We restore into the parameters, all of which should be lvalues:
1688 STR -- the saved data position.
1689 PAT -- the saved pattern position.
fa9a63c5 1690 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1691
fa9a63c5 1692 Also assumes the variables `fail_stack' and (if debugging), `bufp',
7814e705 1693 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1694
505bde11
SM
1695#define POP_FAILURE_POINT(str, pat) \
1696do { \
fa9a63c5
RM
1697 assert (!FAIL_STACK_EMPTY ()); \
1698 \
1699 /* Remove failure points and point to how many regs pushed. */ \
1700 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1701 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1702 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1703 \
505bde11
SM
1704 /* Pop the saved registers. */ \
1705 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1706 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1707 \
01618498 1708 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1709 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1710 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1711 \
1712 /* If the saved string location is NULL, it came from an \
1713 on_failure_keep_string_jump opcode, and we want to throw away the \
1714 saved NULL, thus retaining our current position in the string. */ \
01618498 1715 str = POP_FAILURE_POINTER (); \
505bde11 1716 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1717 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1718 DEBUG_PRINT1 ("'\n"); \
1719 \
505bde11
SM
1720 fail_stack.frame = POP_FAILURE_INT (); \
1721 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1722 \
505bde11
SM
1723 assert (fail_stack.avail >= 0); \
1724 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1725 \
fa9a63c5 1726 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1727} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1728
1729
1730\f
fa9a63c5 1731/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1732#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1733\f
1734/* Subroutine declarations and macros for regex_compile. */
1735
4bb91c68
SM
1736static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1737 reg_syntax_t syntax,
1738 struct re_pattern_buffer *bufp));
1739static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1740static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1741 int arg1, int arg2));
1742static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1743 int arg, unsigned char *end));
1744static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1745 int arg1, int arg2, unsigned char *end));
01618498
SM
1746static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1747 re_char *p,
4bb91c68 1748 reg_syntax_t syntax));
01618498
SM
1749static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1750 re_char *pend,
4bb91c68 1751 reg_syntax_t syntax));
01618498
SM
1752static re_char *skip_one_char _RE_ARGS ((re_char *p));
1753static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1754 char *fastmap, const int multibyte));
fa9a63c5 1755
fa9a63c5 1756/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1757 translation. */
36595814 1758#define PATFETCH(c) \
2d1675e4
SM
1759 do { \
1760 int len; \
1761 if (p == pend) return REG_EEND; \
cf9c99bc 1762 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len, multibyte); \
2d1675e4 1763 p += len; \
fa9a63c5
RM
1764 } while (0)
1765
fa9a63c5
RM
1766
1767/* If `translate' is non-null, return translate[D], else just D. We
1768 cast the subscript to translate because some data is declared as
1769 `char *', to avoid warnings when a string constant is passed. But
1770 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1771#ifndef TRANSLATE
0b32bf0e 1772# define TRANSLATE(d) \
66f0296e 1773 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1774#endif
fa9a63c5
RM
1775
1776
1777/* Macros for outputting the compiled pattern into `buffer'. */
1778
1779/* If the buffer isn't allocated when it comes in, use this. */
1780#define INIT_BUF_SIZE 32
1781
4bb91c68 1782/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1783#define GET_BUFFER_SPACE(n) \
01618498 1784 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1785 EXTEND_BUFFER ()
1786
1787/* Make sure we have one more byte of buffer space and then add C to it. */
1788#define BUF_PUSH(c) \
1789 do { \
1790 GET_BUFFER_SPACE (1); \
1791 *b++ = (unsigned char) (c); \
1792 } while (0)
1793
1794
1795/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1796#define BUF_PUSH_2(c1, c2) \
1797 do { \
1798 GET_BUFFER_SPACE (2); \
1799 *b++ = (unsigned char) (c1); \
1800 *b++ = (unsigned char) (c2); \
1801 } while (0)
1802
1803
4bb91c68 1804/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1805#define BUF_PUSH_3(c1, c2, c3) \
1806 do { \
1807 GET_BUFFER_SPACE (3); \
1808 *b++ = (unsigned char) (c1); \
1809 *b++ = (unsigned char) (c2); \
1810 *b++ = (unsigned char) (c3); \
1811 } while (0)
1812
1813
1814/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1815 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1816#define STORE_JUMP(op, loc, to) \
1817 store_op1 (op, loc, (to) - (loc) - 3)
1818
1819/* Likewise, for a two-argument jump. */
1820#define STORE_JUMP2(op, loc, to, arg) \
1821 store_op2 (op, loc, (to) - (loc) - 3, arg)
1822
4bb91c68 1823/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1824#define INSERT_JUMP(op, loc, to) \
1825 insert_op1 (op, loc, (to) - (loc) - 3, b)
1826
1827/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1828#define INSERT_JUMP2(op, loc, to, arg) \
1829 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1830
1831
1832/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1833 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1834 be too small, many things would have to change. */
839966f3
KH
1835# define MAX_BUF_SIZE (1L << 15)
1836
1837#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1838/* Any other compiler which, like MSC, has allocation limit below 2^16
1839 bytes will have to use approach similar to what was done below for
1840 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1841 reallocating to 0 bytes. Such thing is not going to work too well.
1842 You have been warned!! */
1843#if defined _MSC_VER && !defined WIN32
1844/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1845# define MAX_BUF_SIZE 65500L
1846#else
1847# define MAX_BUF_SIZE (1L << 16)
1848#endif
839966f3 1849#endif /* 0 */
fa9a63c5
RM
1850
1851/* Extend the buffer to twice its current size via realloc and
1852 reset the pointers that pointed into the old block to point to the
1853 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1854 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1855#if __BOUNDED_POINTERS__
1856# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
381880b0
CY
1857# define MOVE_BUFFER_POINTER(P) \
1858 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1859 SET_HIGH_BOUND (P), \
1860 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
4bb91c68
SM
1861# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1862 else \
1863 { \
1864 SET_HIGH_BOUND (b); \
1865 SET_HIGH_BOUND (begalt); \
1866 if (fixup_alt_jump) \
1867 SET_HIGH_BOUND (fixup_alt_jump); \
1868 if (laststart) \
1869 SET_HIGH_BOUND (laststart); \
1870 if (pending_exact) \
1871 SET_HIGH_BOUND (pending_exact); \
1872 }
1873#else
381880b0 1874# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
4bb91c68
SM
1875# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1876#endif
fa9a63c5 1877#define EXTEND_BUFFER() \
25fe55af 1878 do { \
381880b0 1879 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1880 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1881 return REG_ESIZE; \
1882 bufp->allocated <<= 1; \
1883 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1884 bufp->allocated = MAX_BUF_SIZE; \
01618498 1885 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1886 if (bufp->buffer == NULL) \
1887 return REG_ESPACE; \
1888 /* If the buffer moved, move all the pointers into it. */ \
1889 if (old_buffer != bufp->buffer) \
1890 { \
381880b0 1891 unsigned char *new_buffer = bufp->buffer; \
4bb91c68
SM
1892 MOVE_BUFFER_POINTER (b); \
1893 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1894 if (fixup_alt_jump) \
4bb91c68 1895 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1896 if (laststart) \
4bb91c68 1897 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1898 if (pending_exact) \
4bb91c68 1899 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1900 } \
4bb91c68 1901 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1902 } while (0)
1903
1904
1905/* Since we have one byte reserved for the register number argument to
1906 {start,stop}_memory, the maximum number of groups we can report
1907 things about is what fits in that byte. */
1908#define MAX_REGNUM 255
1909
1910/* But patterns can have more than `MAX_REGNUM' registers. We just
1911 ignore the excess. */
098d42af 1912typedef int regnum_t;
fa9a63c5
RM
1913
1914
1915/* Macros for the compile stack. */
1916
1917/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1918 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1919/* int may be not enough when sizeof(int) == 2. */
1920typedef long pattern_offset_t;
fa9a63c5
RM
1921
1922typedef struct
1923{
1924 pattern_offset_t begalt_offset;
1925 pattern_offset_t fixup_alt_jump;
5e69f11e 1926 pattern_offset_t laststart_offset;
fa9a63c5
RM
1927 regnum_t regnum;
1928} compile_stack_elt_t;
1929
1930
1931typedef struct
1932{
1933 compile_stack_elt_t *stack;
1934 unsigned size;
1935 unsigned avail; /* Offset of next open position. */
1936} compile_stack_type;
1937
1938
1939#define INIT_COMPILE_STACK_SIZE 32
1940
1941#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1942#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1943
4bb91c68 1944/* The next available element. */
fa9a63c5
RM
1945#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1946
1cee1e27
SM
1947/* Explicit quit checking is only used on NTemacs and whenever we
1948 use polling to process input events. */
1949#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1950extern int immediate_quit;
1951# define IMMEDIATE_QUIT_CHECK \
1952 do { \
1953 if (immediate_quit) QUIT; \
1954 } while (0)
1955#else
1956# define IMMEDIATE_QUIT_CHECK ((void)0)
1957#endif
1958\f
b18215fc
RS
1959/* Structure to manage work area for range table. */
1960struct range_table_work_area
1961{
1962 int *table; /* actual work area. */
1963 int allocated; /* allocated size for work area in bytes. */
7814e705 1964 int used; /* actually used size in words. */
96cc36cc 1965 int bits; /* flag to record character classes */
b18215fc
RS
1966};
1967
77d11aec
RS
1968/* Make sure that WORK_AREA can hold N more entries (ints).
1969 This is used by SET_RANGE_TABLE_WORK_AREA.
1970 It expects WORK_AREA to be a struct lvalue, not a pointer.
1971 If it can't get the space, it returns from the surrounding function. */
1972
1973#define EXTEND_RANGE_TABLE(work_area, n) \
1974 do { \
8f924df7 1975 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
77d11aec 1976 { \
8f924df7
KH
1977 extend_range_table_work_area (&work_area); \
1978 if ((work_area).table == 0) \
77d11aec
RS
1979 return (REG_ESPACE); \
1980 } \
b18215fc
RS
1981 } while (0)
1982
96cc36cc
RS
/* Turn on the flag bit(s) BIT in the `bits' member of WORK_AREA.
   WORK_AREA must be a struct lvalue (it is accessed with `.').
   The expansion is fully parenthesized so that it stays a single
   operand when the macro is used inside a larger expression.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)	\
  ((work_area).bits |= (bit))
1985
14473664
SM
1986/* Bits used to implement the multibyte-part of the various character classes
1987 such as [:alnum:] in a charset's range table. */
1988#define BIT_WORD 0x1
1989#define BIT_LOWER 0x2
1990#define BIT_PUNCT 0x4
1991#define BIT_SPACE 0x8
1992#define BIT_UPPER 0x10
1993#define BIT_MULTIBYTE 0x20
96cc36cc 1994
b18215fc
RS
1995/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1996#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
77d11aec 1997 do { \
8f924df7 1998 EXTEND_RANGE_TABLE ((work_area), 2); \
b18215fc
RS
1999 (work_area).table[(work_area).used++] = (range_start); \
2000 (work_area).table[(work_area).used++] = (range_end); \
2001 } while (0)
2002
7814e705 2003/* Free allocated memory for WORK_AREA. */
b18215fc
RS
2004#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
2005 do { \
2006 if ((work_area).table) \
2007 free ((work_area).table); \
2008 } while (0)
2009
96cc36cc 2010#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 2011#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 2012#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 2013#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 2014\f
b18215fc 2015
fa9a63c5 2016/* Set the bit for character C in a list. */
01618498 2017#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
2018
2019
bf216479
KH
2020#ifdef emacs
2021
cf9c99bc
KH
2022/* Store characters in the range FROM to TO in the bitmap at B (for
2023 ASCII and unibyte characters) and WORK_AREA (for multibyte
2024 characters) while translating them and paying attention to the
2025 continuity of translated characters.
8f924df7 2026
cf9c99bc
KH
2027 Implementation note: It is better to implement these fairly big
2028 macros by a function, but it's not that easy because macros called
8f924df7 2029 in this macro assume various local variables already declared. */
bf216479 2030
cf9c99bc
KH
2031/* Both FROM and TO are ASCII characters. */
2032
2033#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
2034 do { \
2035 int C0, C1; \
2036 \
2037 for (C0 = (FROM); C0 <= (TO); C0++) \
2038 { \
2039 C1 = TRANSLATE (C0); \
2040 if (! ASCII_CHAR_P (C1)) \
2041 { \
2042 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2043 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
2044 C1 = C0; \
2045 } \
2046 SET_LIST_BIT (C1); \
2047 } \
2048 } while (0)
2049
2050
2051/* Both FROM and TO are unibyte characters (0x80..0xFF). */
2052
2053#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
2054 do { \
2055 int C0, C1, C2, I; \
2056 int USED = RANGE_TABLE_WORK_USED (work_area); \
2057 \
2058 for (C0 = (FROM); C0 <= (TO); C0++) \
2059 { \
2060 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
2061 if (CHAR_BYTE8_P (C1)) \
2062 SET_LIST_BIT (C0); \
2063 else \
2064 { \
2065 C2 = TRANSLATE (C1); \
2066 if (C2 == C1 \
2067 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
2068 C1 = C0; \
2069 SET_LIST_BIT (C1); \
2070 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2071 { \
2072 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2073 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2074 \
2075 if (C2 >= from - 1 && C2 <= to + 1) \
2076 { \
2077 if (C2 == from - 1) \
2078 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2079 else if (C2 == to + 1) \
2080 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2081 break; \
2082 } \
2083 } \
2084 if (I < USED) \
2085 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
2086 } \
2087 } \
2088 } while (0)
2089
2090
2091/* Both FROM and TO are multibyte characters. */
2092
2093#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
2094 do { \
2095 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
2096 \
2097 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
2098 for (C0 = (FROM); C0 <= (TO); C0++) \
2099 { \
2100 C1 = TRANSLATE (C0); \
2101 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
2102 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
2103 SET_LIST_BIT (C2); \
2104 if (C1 >= (FROM) && C1 <= (TO)) \
2105 continue; \
2106 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2107 { \
2108 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2109 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2110 \
2111 if (C1 >= from - 1 && C1 <= to + 1) \
2112 { \
2113 if (C1 == from - 1) \
2114 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2115 else if (C1 == to + 1) \
2116 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2117 break; \
2118 } \
2119 } \
2120 if (I < USED) \
2121 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2122 } \
bf216479
KH
2123 } while (0)
2124
2125#endif /* emacs */
2126
fa9a63c5 2127/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 2128#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2129 do { \
2130 if (p == pend) \
2131 FREE_STACK_RETURN (REG_EBRACE); \
2132 else \
2133 { \
2134 PATFETCH (c); \
2135 while ('0' <= c && c <= '9') \
2136 { \
2137 int prev; \
2138 if (num < 0) \
2139 num = 0; \
2140 prev = num; \
2141 num = num * 10 + c - '0'; \
2142 if (num / 10 != prev) \
2143 FREE_STACK_RETURN (REG_BADBR); \
2144 if (p == pend) \
2145 FREE_STACK_RETURN (REG_EBRACE); \
2146 PATFETCH (c); \
2147 } \
2148 } \
2149 } while (0)
77d11aec 2150\f
1fdab503 2151#if ! WIDE_CHAR_SUPPORT
01618498 2152
14473664 2153/* Map a string to the char class it names (if any). */
1fdab503 2154re_wctype_t
ada30c0e
SM
2155re_wctype (str)
2156 re_char *str;
14473664 2157{
ada30c0e 2158 const char *string = str;
14473664
SM
2159 if (STREQ (string, "alnum")) return RECC_ALNUM;
2160 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2161 else if (STREQ (string, "word")) return RECC_WORD;
2162 else if (STREQ (string, "ascii")) return RECC_ASCII;
2163 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2164 else if (STREQ (string, "graph")) return RECC_GRAPH;
2165 else if (STREQ (string, "lower")) return RECC_LOWER;
2166 else if (STREQ (string, "print")) return RECC_PRINT;
2167 else if (STREQ (string, "punct")) return RECC_PUNCT;
2168 else if (STREQ (string, "space")) return RECC_SPACE;
2169 else if (STREQ (string, "upper")) return RECC_UPPER;
2170 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2171 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2172 else if (STREQ (string, "digit")) return RECC_DIGIT;
2173 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2174 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2175 else if (STREQ (string, "blank")) return RECC_BLANK;
2176 else return 0;
2177}
2178
e0f24100 2179/* True if CH is in the char class CC. */
1fdab503 2180boolean
14473664
SM
2181re_iswctype (ch, cc)
2182 int ch;
2183 re_wctype_t cc;
2184{
2185 switch (cc)
2186 {
0cdd06f8
SM
2187 case RECC_ALNUM: return ISALNUM (ch);
2188 case RECC_ALPHA: return ISALPHA (ch);
2189 case RECC_BLANK: return ISBLANK (ch);
2190 case RECC_CNTRL: return ISCNTRL (ch);
2191 case RECC_DIGIT: return ISDIGIT (ch);
2192 case RECC_GRAPH: return ISGRAPH (ch);
2193 case RECC_LOWER: return ISLOWER (ch);
2194 case RECC_PRINT: return ISPRINT (ch);
2195 case RECC_PUNCT: return ISPUNCT (ch);
2196 case RECC_SPACE: return ISSPACE (ch);
2197 case RECC_UPPER: return ISUPPER (ch);
2198 case RECC_XDIGIT: return ISXDIGIT (ch);
2199 case RECC_ASCII: return IS_REAL_ASCII (ch);
2200 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2201 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2202 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2203 case RECC_WORD: return ISWORD (ch);
2204 case RECC_ERROR: return false;
2205 default:
2206 abort();
14473664
SM
2207 }
2208}
fa9a63c5 2209
14473664
SM
2210/* Return a bit-pattern to use in the range-table bits to match multibyte
2211 chars of class CC. */
2212static int
2213re_wctype_to_bit (cc)
2214 re_wctype_t cc;
2215{
2216 switch (cc)
2217 {
2218 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2219 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2220 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2221 case RECC_LOWER: return BIT_LOWER;
2222 case RECC_UPPER: return BIT_UPPER;
2223 case RECC_PUNCT: return BIT_PUNCT;
2224 case RECC_SPACE: return BIT_SPACE;
14473664 2225 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2226 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2227 default:
2228 abort();
14473664
SM
2229 }
2230}
2231#endif
77d11aec
RS
2232\f
2233/* Filling in the work area of a range. */
2234
2235/* Actually extend the space in WORK_AREA. */
2236
2237static void
2238extend_range_table_work_area (work_area)
2239 struct range_table_work_area *work_area;
177c0ea7 2240{
77d11aec
RS
2241 work_area->allocated += 16 * sizeof (int);
2242 if (work_area->table)
2243 work_area->table
2244 = (int *) realloc (work_area->table, work_area->allocated);
2245 else
2246 work_area->table
2247 = (int *) malloc (work_area->allocated);
2248}
2249
8f924df7 2250#if 0
77d11aec
RS
2251#ifdef emacs
2252
2253/* Carefully find the ranges of codes that are equivalent
2254 under case conversion to the range start..end when passed through
2255 TRANSLATE. Handle the case where non-letters can come in between
2256 two upper-case letters (which happens in Latin-1).
2257 Also handle the case of groups of more than 2 case-equivalent chars.
2258
2259 The basic method is to look at consecutive characters and see
2260 if they can form a run that can be handled as one.
2261
2262 Returns -1 if successful, REG_ESPACE if ran out of space. */
2263
2264static int
2265set_image_of_range_1 (work_area, start, end, translate)
2266 RE_TRANSLATE_TYPE translate;
2267 struct range_table_work_area *work_area;
2268 re_wchar_t start, end;
2269{
2270 /* `one_case' indicates a character, or a run of characters,
2271 each of which is an isolate (no case-equivalents).
2272 This includes all ASCII non-letters.
2273
2274 `two_case' indicates a character, or a run of characters,
2275 each of which has two case-equivalent forms.
2276 This includes all ASCII letters.
2277
2278 `strange' indicates a character that has more than one
2279 case-equivalent. */
177c0ea7 2280
77d11aec
RS
2281 enum case_type {one_case, two_case, strange};
2282
2283 /* Describe the run that is in progress,
2284 which the next character can try to extend.
2285 If run_type is strange, that means there really is no run.
2286 If run_type is one_case, then run_start...run_end is the run.
2287 If run_type is two_case, then the run is run_start...run_end,
2288 and the case-equivalents end at run_eqv_end. */
2289
2290 enum case_type run_type = strange;
2291 int run_start, run_end, run_eqv_end;
2292
2293 Lisp_Object eqv_table;
2294
2295 if (!RE_TRANSLATE_P (translate))
2296 {
b7c12565 2297 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2298 work_area->table[work_area->used++] = (start);
2299 work_area->table[work_area->used++] = (end);
b7c12565 2300 return -1;
77d11aec
RS
2301 }
2302
2303 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2304
77d11aec
RS
2305 for (; start <= end; start++)
2306 {
2307 enum case_type this_type;
2308 int eqv = RE_TRANSLATE (eqv_table, start);
2309 int minchar, maxchar;
2310
2311 /* Classify this character */
2312 if (eqv == start)
2313 this_type = one_case;
2314 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2315 this_type = two_case;
2316 else
2317 this_type = strange;
2318
2319 if (start < eqv)
2320 minchar = start, maxchar = eqv;
2321 else
2322 minchar = eqv, maxchar = start;
2323
2324 /* Can this character extend the run in progress? */
2325 if (this_type == strange || this_type != run_type
2326 || !(minchar == run_end + 1
2327 && (run_type == two_case
2328 ? maxchar == run_eqv_end + 1 : 1)))
2329 {
2330 /* No, end the run.
2331 Record each of its equivalent ranges. */
2332 if (run_type == one_case)
2333 {
2334 EXTEND_RANGE_TABLE (work_area, 2);
2335 work_area->table[work_area->used++] = run_start;
2336 work_area->table[work_area->used++] = run_end;
2337 }
2338 else if (run_type == two_case)
2339 {
2340 EXTEND_RANGE_TABLE (work_area, 4);
2341 work_area->table[work_area->used++] = run_start;
2342 work_area->table[work_area->used++] = run_end;
2343 work_area->table[work_area->used++]
2344 = RE_TRANSLATE (eqv_table, run_start);
2345 work_area->table[work_area->used++]
2346 = RE_TRANSLATE (eqv_table, run_end);
2347 }
2348 run_type = strange;
2349 }
177c0ea7 2350
77d11aec
RS
2351 if (this_type == strange)
2352 {
2353 /* For a strange character, add each of its equivalents, one
2354 by one. Don't start a range. */
2355 do
2356 {
2357 EXTEND_RANGE_TABLE (work_area, 2);
2358 work_area->table[work_area->used++] = eqv;
2359 work_area->table[work_area->used++] = eqv;
2360 eqv = RE_TRANSLATE (eqv_table, eqv);
2361 }
2362 while (eqv != start);
2363 }
2364
2365 /* Add this char to the run, or start a new run. */
2366 else if (run_type == strange)
2367 {
2368 /* Initialize a new range. */
2369 run_type = this_type;
2370 run_start = start;
2371 run_end = start;
2372 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2373 }
2374 else
2375 {
2376 /* Extend a running range. */
2377 run_end = minchar;
2378 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2379 }
2380 }
2381
2382 /* If a run is still in progress at the end, finish it now
2383 by recording its equivalent ranges. */
2384 if (run_type == one_case)
2385 {
2386 EXTEND_RANGE_TABLE (work_area, 2);
2387 work_area->table[work_area->used++] = run_start;
2388 work_area->table[work_area->used++] = run_end;
2389 }
2390 else if (run_type == two_case)
2391 {
2392 EXTEND_RANGE_TABLE (work_area, 4);
2393 work_area->table[work_area->used++] = run_start;
2394 work_area->table[work_area->used++] = run_end;
2395 work_area->table[work_area->used++]
2396 = RE_TRANSLATE (eqv_table, run_start);
2397 work_area->table[work_area->used++]
2398 = RE_TRANSLATE (eqv_table, run_end);
2399 }
2400
2401 return -1;
2402}
36595814 2403
77d11aec 2404#endif /* emacs */
36595814 2405
b7c12565 2406/* Record the image of the range start..end when passed through
36595814
SM
2407 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2408 and is not even necessarily contiguous.
b7c12565
RS
2409 Normally we approximate it with the smallest contiguous range that contains
2410 all the chars we need. However, for Latin-1 we go to extra effort
2411 to do a better job.
2412
2413 This function is not called for ASCII ranges.
77d11aec
RS
2414
2415 Returns -1 if successful, REG_ESPACE if ran out of space. */
2416
2417static int
36595814
SM
2418set_image_of_range (work_area, start, end, translate)
2419 RE_TRANSLATE_TYPE translate;
2420 struct range_table_work_area *work_area;
2421 re_wchar_t start, end;
2422{
77d11aec
RS
2423 re_wchar_t cmin, cmax;
2424
2425#ifdef emacs
2426 /* For Latin-1 ranges, use set_image_of_range_1
2427 to get proper handling of ranges that include letters and nonletters.
b7c12565 2428 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2429 For other character sets, we don't bother to get this right. */
b7c12565
RS
2430 if (RE_TRANSLATE_P (translate) && start < 04400
2431 && !(start < 04200 && end >= 04377))
77d11aec 2432 {
b7c12565 2433 int newend;
77d11aec 2434 int tem;
b7c12565
RS
2435 newend = end;
2436 if (newend > 04377)
2437 newend = 04377;
2438 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2439 if (tem > 0)
2440 return tem;
2441
2442 start = 04400;
2443 if (end < 04400)
2444 return -1;
2445 }
2446#endif
2447
b7c12565
RS
2448 EXTEND_RANGE_TABLE (work_area, 2);
2449 work_area->table[work_area->used++] = (start);
2450 work_area->table[work_area->used++] = (end);
2451
2452 cmin = -1, cmax = -1;
77d11aec 2453
36595814 2454 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2455 {
2456 int ch;
77d11aec 2457
b7c12565
RS
2458 for (ch = start; ch <= end; ch++)
2459 {
2460 re_wchar_t c = TRANSLATE (ch);
2461 if (! (start <= c && c <= end))
2462 {
2463 if (cmin == -1)
2464 cmin = c, cmax = c;
2465 else
2466 {
2467 cmin = MIN (cmin, c);
2468 cmax = MAX (cmax, c);
2469 }
2470 }
2471 }
2472
2473 if (cmin != -1)
2474 {
2475 EXTEND_RANGE_TABLE (work_area, 2);
2476 work_area->table[work_area->used++] = (cmin);
2477 work_area->table[work_area->used++] = (cmax);
2478 }
2479 }
36595814 2480
77d11aec
RS
2481 return -1;
2482}
8f924df7 2483#endif /* 0 */
fa9a63c5
RM
2484\f
2485#ifndef MATCH_MAY_ALLOCATE
2486
2487/* If we cannot allocate large objects within re_match_2_internal,
2488 we make the fail stack and register vectors global.
2489 The fail stack, we grow to the maximum size when a regexp
2490 is compiled.
2491 The register vectors, we adjust in size each time we
2492 compile a regexp, according to the number of registers it needs. */
2493
2494static fail_stack_type fail_stack;
2495
2496/* Size with which the following vectors are currently allocated.
2497 That is so we can make them bigger as needed,
4bb91c68 2498 but never make them smaller. */
fa9a63c5
RM
2499static int regs_allocated_size;
2500
66f0296e
SM
2501static re_char ** regstart, ** regend;
2502static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2503
2504/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2505 but don't make them smaller. */
fa9a63c5
RM
2506
2507static
2508regex_grow_registers (num_regs)
2509 int num_regs;
2510{
2511 if (num_regs > regs_allocated_size)
2512 {
66f0296e
SM
2513 RETALLOC_IF (regstart, num_regs, re_char *);
2514 RETALLOC_IF (regend, num_regs, re_char *);
2515 RETALLOC_IF (best_regstart, num_regs, re_char *);
2516 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2517
2518 regs_allocated_size = num_regs;
2519 }
2520}
2521
2522#endif /* not MATCH_MAY_ALLOCATE */
2523\f
99633e97
SM
2524static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2525 compile_stack,
2526 regnum_t regnum));
2527
fa9a63c5
RM
2528/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2529 Returns one of error codes defined in `regex.h', or zero for success.
2530
2531 Assumes the `allocated' (and perhaps `buffer') and `translate'
2532 fields are set in BUFP on entry.
2533
2534 If it succeeds, results are put in BUFP (if it returns an error, the
2535 contents of BUFP are undefined):
2536 `buffer' is the compiled pattern;
2537 `syntax' is set to SYNTAX;
2538 `used' is set to the length of the compiled pattern;
2539 `fastmap_accurate' is zero;
2540 `re_nsub' is the number of subexpressions in PATTERN;
2541 `not_bol' and `not_eol' are zero;
5e69f11e 2542
c0f9ea08 2543 The `fastmap' field is neither examined nor set. */
fa9a63c5 2544
505bde11
SM
2545/* Insert the `jump' from the end of last alternative to "here".
2546 The space for the jump has already been allocated. */
2547#define FIXUP_ALT_JUMP() \
2548do { \
2549 if (fixup_alt_jump) \
2550 STORE_JUMP (jump, fixup_alt_jump, b); \
2551} while (0)
2552
2553
fa9a63c5
RM
2554/* Return, freeing storage we allocated. */
2555#define FREE_STACK_RETURN(value) \
b18215fc
RS
2556 do { \
2557 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2558 free (compile_stack.stack); \
2559 return value; \
2560 } while (0)
fa9a63c5
RM
2561
2562static reg_errcode_t
2563regex_compile (pattern, size, syntax, bufp)
66f0296e 2564 re_char *pattern;
4bb91c68 2565 size_t size;
fa9a63c5
RM
2566 reg_syntax_t syntax;
2567 struct re_pattern_buffer *bufp;
2568{
01618498
SM
2569 /* We fetch characters from PATTERN here. */
2570 register re_wchar_t c, c1;
5e69f11e 2571
fa9a63c5 2572 /* A random temporary spot in PATTERN. */
66f0296e 2573 re_char *p1;
fa9a63c5
RM
2574
2575 /* Points to the end of the buffer, where we should append. */
2576 register unsigned char *b;
5e69f11e 2577
fa9a63c5
RM
2578 /* Keeps track of unclosed groups. */
2579 compile_stack_type compile_stack;
2580
2581 /* Points to the current (ending) position in the pattern. */
22336245
RS
2582#ifdef AIX
2583 /* `const' makes AIX compiler fail. */
66f0296e 2584 unsigned char *p = pattern;
22336245 2585#else
66f0296e 2586 re_char *p = pattern;
22336245 2587#endif
66f0296e 2588 re_char *pend = pattern + size;
5e69f11e 2589
fa9a63c5 2590 /* How to translate the characters in the pattern. */
6676cb1c 2591 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2592
2593 /* Address of the count-byte of the most recently inserted `exactn'
2594 command. This makes it possible to tell if a new exact-match
2595 character can be added to that command or if the character requires
2596 a new `exactn' command. */
2597 unsigned char *pending_exact = 0;
2598
2599 /* Address of start of the most recently finished expression.
2600 This tells, e.g., postfix * where to find the start of its
2601 operand. Reset at the beginning of groups and alternatives. */
2602 unsigned char *laststart = 0;
2603
2604 /* Address of beginning of regexp, or inside of last group. */
2605 unsigned char *begalt;
2606
2607 /* Place in the uncompiled pattern (i.e., the {) to
2608 which to go back if the interval is invalid. */
66f0296e 2609 re_char *beg_interval;
5e69f11e 2610
fa9a63c5 2611 /* Address of the place where a forward jump should go to the end of
7814e705 2612 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2613 last -- ends with a forward jump of this sort. */
2614 unsigned char *fixup_alt_jump = 0;
2615
b18215fc
RS
2616 /* Work area for range table of charset. */
2617 struct range_table_work_area range_table_work;
2618
2d1675e4
SM
2619 /* If the object matched can contain multibyte characters. */
2620 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2621
8f924df7 2622 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2623 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2624
f9b0fd99
RS
2625 /* Nonzero if we have pushed down into a subpattern. */
2626 int in_subpattern = 0;
2627
2628 /* These hold the values of p, pattern, and pend from the main
2629 pattern when we have pushed into a subpattern. */
2630 re_char *main_p;
2631 re_char *main_pattern;
2632 re_char *main_pend;
2633
fa9a63c5 2634#ifdef DEBUG
99633e97 2635 debug++;
fa9a63c5 2636 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2637 if (debug > 0)
fa9a63c5
RM
2638 {
2639 unsigned debug_count;
5e69f11e 2640
fa9a63c5 2641 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2642 putchar (pattern[debug_count]);
fa9a63c5
RM
2643 putchar ('\n');
2644 }
2645#endif /* DEBUG */
2646
2647 /* Initialize the compile stack. */
2648 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2649 if (compile_stack.stack == NULL)
2650 return REG_ESPACE;
2651
2652 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2653 compile_stack.avail = 0;
2654
b18215fc
RS
2655 range_table_work.table = 0;
2656 range_table_work.allocated = 0;
2657
fa9a63c5
RM
2658 /* Initialize the pattern buffer. */
2659 bufp->syntax = syntax;
2660 bufp->fastmap_accurate = 0;
2661 bufp->not_bol = bufp->not_eol = 0;
6224b623 2662 bufp->used_syntax = 0;
fa9a63c5
RM
2663
2664 /* Set `used' to zero, so that if we return an error, the pattern
2665 printer (for debugging) will think there's no pattern. We reset it
2666 at the end. */
2667 bufp->used = 0;
5e69f11e 2668
fa9a63c5 2669 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2670 bufp->re_nsub = 0;
fa9a63c5 2671
0b32bf0e 2672#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2673 /* Initialize the syntax table. */
2674 init_syntax_once ();
2675#endif
2676
2677 if (bufp->allocated == 0)
2678 {
2679 if (bufp->buffer)
2680 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2681 enough space. This loses if buffer's address is bogus, but
7814e705 2682 that is the user's responsibility. */
25fe55af
RS
2683 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2684 }
fa9a63c5 2685 else
7814e705 2686 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2687 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2688 }
fa9a63c5
RM
2689 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2690
2691 bufp->allocated = INIT_BUF_SIZE;
2692 }
2693
2694 begalt = b = bufp->buffer;
2695
2696 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2697 while (1)
fa9a63c5 2698 {
f9b0fd99
RS
2699 if (p == pend)
2700 {
2701 /* If this is the end of an included regexp,
2702 pop back to the main regexp and try again. */
2703 if (in_subpattern)
2704 {
2705 in_subpattern = 0;
2706 pattern = main_pattern;
2707 p = main_p;
2708 pend = main_pend;
2709 continue;
2710 }
2711 /* If this is the end of the main regexp, we are done. */
2712 break;
2713 }
2714
fa9a63c5
RM
2715 PATFETCH (c);
2716
2717 switch (c)
25fe55af 2718 {
f9b0fd99
RS
2719 case ' ':
2720 {
2721 re_char *p1 = p;
2722
2723 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2724 spaces normally. And don't try to do this recursively. */
2725 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2726 goto normal_char;
2727
2728 /* Peek past following spaces. */
2729 while (p1 != pend)
2730 {
2731 if (*p1 != ' ')
2732 break;
2733 p1++;
2734 }
2735 /* If the spaces are followed by a repetition op,
2736 treat them normally. */
c721eee5
RS
2737 if (p1 != pend
2738 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2739 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2740 goto normal_char;
2741
2742 /* Replace the spaces with the whitespace regexp. */
2743 in_subpattern = 1;
2744 main_p = p1;
2745 main_pend = pend;
2746 main_pattern = pattern;
2747 p = pattern = whitespace_regexp;
2748 pend = p + strlen (p);
2749 break;
7814e705 2750 }
f9b0fd99 2751
25fe55af
RS
2752 case '^':
2753 {
7814e705 2754 if ( /* If at start of pattern, it's an operator. */
25fe55af 2755 p == pattern + 1
7814e705 2756 /* If context independent, it's an operator. */
25fe55af 2757 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2758 /* Otherwise, depends on what's come before. */
25fe55af 2759 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2760 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2761 else
2762 goto normal_char;
2763 }
2764 break;
2765
2766
2767 case '$':
2768 {
2769 if ( /* If at end of pattern, it's an operator. */
2770 p == pend
7814e705 2771 /* If context independent, it's an operator. */
25fe55af
RS
2772 || syntax & RE_CONTEXT_INDEP_ANCHORS
2773 /* Otherwise, depends on what's next. */
2774 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2775 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2776 else
2777 goto normal_char;
2778 }
2779 break;
fa9a63c5
RM
2780
2781
2782 case '+':
25fe55af
RS
2783 case '?':
2784 if ((syntax & RE_BK_PLUS_QM)
2785 || (syntax & RE_LIMITED_OPS))
2786 goto normal_char;
2787 handle_plus:
2788 case '*':
2789 /* If there is no previous pattern... */
2790 if (!laststart)
2791 {
2792 if (syntax & RE_CONTEXT_INVALID_OPS)
2793 FREE_STACK_RETURN (REG_BADRPT);
2794 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2795 goto normal_char;
2796 }
2797
2798 {
7814e705 2799 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2800 boolean zero_times_ok = 0, many_times_ok = 0;
2801 boolean greedy = 1;
25fe55af
RS
2802
2803 /* If there is a sequence of repetition chars, collapse it
2804 down to just one (the right one). We can't combine
2805 interval operators with these because of, e.g., `a{2}*',
7814e705 2806 which should only match an even number of `a's. */
25fe55af
RS
2807
2808 for (;;)
2809 {
0b32bf0e 2810 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2811 && c == '?' && (zero_times_ok || many_times_ok))
2812 greedy = 0;
2813 else
2814 {
2815 zero_times_ok |= c != '+';
2816 many_times_ok |= c != '?';
2817 }
25fe55af
RS
2818
2819 if (p == pend)
2820 break;
ed0767d8
SM
2821 else if (*p == '*'
2822 || (!(syntax & RE_BK_PLUS_QM)
2823 && (*p == '+' || *p == '?')))
25fe55af 2824 ;
ed0767d8 2825 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2826 {
ed0767d8
SM
2827 if (p+1 == pend)
2828 FREE_STACK_RETURN (REG_EESCAPE);
2829 if (p[1] == '+' || p[1] == '?')
2830 PATFETCH (c); /* Gobble up the backslash. */
2831 else
2832 break;
25fe55af
RS
2833 }
2834 else
ed0767d8 2835 break;
25fe55af 2836 /* If we get here, we found another repeat character. */
ed0767d8
SM
2837 PATFETCH (c);
2838 }
25fe55af
RS
2839
2840 /* Star, etc. applied to an empty pattern is equivalent
2841 to an empty pattern. */
4e8a9132 2842 if (!laststart || laststart == b)
25fe55af
RS
2843 break;
2844
2845 /* Now we know whether or not zero matches is allowed
7814e705 2846 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2847 if (greedy)
2848 {
99633e97 2849 if (many_times_ok)
4e8a9132
SM
2850 {
2851 boolean simple = skip_one_char (laststart) == b;
2852 unsigned int startoffset = 0;
f6a3f532 2853 re_opcode_t ofj =
01618498 2854 /* Check if the loop can match the empty string. */
6df42991
SM
2855 (simple || !analyse_first (laststart, b, NULL, 0))
2856 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2857 assert (skip_one_char (laststart) <= b);
177c0ea7 2858
4e8a9132
SM
2859 if (!zero_times_ok && simple)
2860 { /* Since simple * loops can be made faster by using
2861 on_failure_keep_string_jump, we turn simple P+
2862 into PP* if P is simple. */
2863 unsigned char *p1, *p2;
2864 startoffset = b - laststart;
2865 GET_BUFFER_SPACE (startoffset);
2866 p1 = b; p2 = laststart;
2867 while (p2 < p1)
2868 *b++ = *p2++;
2869 zero_times_ok = 1;
99633e97 2870 }
4e8a9132
SM
2871
2872 GET_BUFFER_SPACE (6);
2873 if (!zero_times_ok)
2874 /* A + loop. */
f6a3f532 2875 STORE_JUMP (ofj, b, b + 6);
99633e97 2876 else
4e8a9132
SM
2877 /* Simple * loops can use on_failure_keep_string_jump
2878 depending on what follows. But since we don't know
2879 that yet, we leave the decision up to
2880 on_failure_jump_smart. */
f6a3f532 2881 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2882 laststart + startoffset, b + 6);
99633e97 2883 b += 3;
4e8a9132 2884 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2885 b += 3;
2886 }
2887 else
2888 {
4e8a9132
SM
2889 /* A simple ? pattern. */
2890 assert (zero_times_ok);
2891 GET_BUFFER_SPACE (3);
2892 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2893 b += 3;
2894 }
1c8c6d39
DL
2895 }
2896 else /* not greedy */
2897 { /* I wish the greedy and non-greedy cases could be merged. */
2898
0683b6fa 2899 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2900 if (many_times_ok)
2901 {
f6a3f532
SM
2902 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2903
6df42991
SM
2904 /* The non-greedy multiple match looks like
2905 a repeat..until: we only need a conditional jump
2906 at the end of the loop. */
f6a3f532
SM
2907 if (emptyp) BUF_PUSH (no_op);
2908 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2909 : on_failure_jump, b, laststart);
1c8c6d39
DL
2910 b += 3;
2911 if (zero_times_ok)
2912 {
2913 /* The repeat...until naturally matches one or more.
2914 To also match zero times, we need to first jump to
6df42991 2915 the end of the loop (its conditional jump). */
1c8c6d39
DL
2916 INSERT_JUMP (jump, laststart, b);
2917 b += 3;
2918 }
2919 }
2920 else
2921 {
2922 /* non-greedy a?? */
1c8c6d39
DL
2923 INSERT_JUMP (jump, laststart, b + 3);
2924 b += 3;
2925 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2926 b += 3;
2927 }
2928 }
2929 }
4e8a9132 2930 pending_exact = 0;
fa9a63c5
RM
2931 break;
2932
2933
2934 case '.':
25fe55af
RS
2935 laststart = b;
2936 BUF_PUSH (anychar);
2937 break;
fa9a63c5
RM
2938
2939
25fe55af
RS
2940 case '[':
2941 {
b18215fc 2942 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2943
25fe55af 2944 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2945
25fe55af
RS
2946 /* Ensure that we have enough space to push a charset: the
2947 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2948 GET_BUFFER_SPACE (34);
2949
25fe55af 2950 laststart = b;
e318085a 2951
25fe55af 2952 /* We test `*p == '^' twice, instead of using an if
7814e705 2953 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2954 BUF_PUSH (*p == '^' ? charset_not : charset);
2955 if (*p == '^')
2956 p++;
e318085a 2957
25fe55af
RS
2958 /* Remember the first position in the bracket expression. */
2959 p1 = p;
e318085a 2960
7814e705 2961 /* Push the number of bytes in the bitmap. */
25fe55af 2962 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2963
25fe55af
RS
2964 /* Clear the whole map. */
2965 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2966
25fe55af
RS
2967 /* charset_not matches newline according to a syntax bit. */
2968 if ((re_opcode_t) b[-2] == charset_not
2969 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2970 SET_LIST_BIT ('\n');
fa9a63c5 2971
7814e705 2972 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2973 for (;;)
2974 {
b18215fc 2975 boolean escaped_char = false;
2d1675e4 2976 const unsigned char *p2 = p;
cf9c99bc 2977 re_wchar_t ch, c2;
e318085a 2978
25fe55af 2979 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2980
36595814
SM
2981 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2982 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2983 So the translation is done later in a loop. Example:
2984 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2985 PATFETCH (c);
e318085a 2986
25fe55af
RS
2987 /* \ might escape characters inside [...] and [^...]. */
2988 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2989 {
2990 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2991
2992 PATFETCH (c);
b18215fc 2993 escaped_char = true;
25fe55af 2994 }
b18215fc
RS
2995 else
2996 {
7814e705 2997 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2998 not (i.e., when the bracket expression is `[]' so
2999 far), the ']' character bit gets set way below. */
2d1675e4 3000 if (c == ']' && p2 != p1)
657fcfbd 3001 break;
25fe55af 3002 }
b18215fc 3003
25fe55af
RS
3004 /* See if we're at the beginning of a possible character
3005 class. */
b18215fc 3006
2d1675e4
SM
3007 if (!escaped_char &&
3008 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 3009 {
7814e705 3010 /* Leave room for the null. */
14473664 3011 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 3012 const unsigned char *class_beg;
b18215fc 3013
25fe55af
RS
3014 PATFETCH (c);
3015 c1 = 0;
ed0767d8 3016 class_beg = p;
b18215fc 3017
25fe55af
RS
3018 /* If pattern is `[[:'. */
3019 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 3020
25fe55af
RS
3021 for (;;)
3022 {
14473664
SM
3023 PATFETCH (c);
3024 if ((c == ':' && *p == ']') || p == pend)
3025 break;
3026 if (c1 < CHAR_CLASS_MAX_LENGTH)
3027 str[c1++] = c;
3028 else
3029 /* This is in any case an invalid class name. */
3030 str[0] = '\0';
25fe55af
RS
3031 }
3032 str[c1] = '\0';
b18215fc
RS
3033
3034 /* If it isn't a word bracketed by `[:' and `:]':
3035 undo the ending character, the letters, and
3036 leave the leading `:' and `[' (but set bits for
3037 them). */
25fe55af
RS
3038 if (c == ':' && *p == ']')
3039 {
14473664 3040 re_wctype_t cc;
8f924df7 3041 int limit;
14473664
SM
3042
3043 cc = re_wctype (str);
3044
3045 if (cc == 0)
fa9a63c5
RM
3046 FREE_STACK_RETURN (REG_ECTYPE);
3047
14473664
SM
3048 /* Throw away the ] at the end of the character
3049 class. */
3050 PATFETCH (c);
fa9a63c5 3051
14473664 3052 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 3053
cf9c99bc
KH
3054#ifndef emacs
3055 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
3056 if (re_iswctype (btowc (ch), cc))
3057 {
3058 c = TRANSLATE (ch);
ed00c2ac
KH
3059 if (c < (1 << BYTEWIDTH))
3060 SET_LIST_BIT (c);
8f924df7 3061 }
cf9c99bc
KH
3062#else /* emacs */
3063 /* Most character classes in a multibyte match
3064 just set a flag. Exceptions are is_blank,
3065 is_digit, is_cntrl, and is_xdigit, since
3066 they can only match ASCII characters. We
3067 don't need to handle them for multibyte.
3068 They are distinguished by a negative wctype. */
96cc36cc 3069
cf9c99bc 3070 for (ch = 0; ch < 256; ++ch)
25fe55af 3071 {
cf9c99bc
KH
3072 c = RE_CHAR_TO_MULTIBYTE (ch);
3073 if (! CHAR_BYTE8_P (c)
3074 && re_iswctype (c, cc))
8f924df7 3075 {
cf9c99bc
KH
3076 SET_LIST_BIT (ch);
3077 c1 = TRANSLATE (c);
3078 if (c1 == c)
3079 continue;
3080 if (ASCII_CHAR_P (c1))
3081 SET_LIST_BIT (c1);
3082 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3083 SET_LIST_BIT (c1);
8f924df7 3084 }
25fe55af 3085 }
cf9c99bc
KH
3086 SET_RANGE_TABLE_WORK_AREA_BIT
3087 (range_table_work, re_wctype_to_bit (cc));
3088#endif /* emacs */
6224b623
SM
3089 /* In most cases the matching rule for char classes
3090 only uses the syntax table for multibyte chars,
3091 so that the content of the syntax-table is not
3092 hardcoded in the range_table. SPACE and WORD are
3093 the two exceptions. */
3094 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3095 bufp->used_syntax = 1;
3096
b18215fc
RS
3097 /* Repeat the loop. */
3098 continue;
25fe55af
RS
3099 }
3100 else
3101 {
ed0767d8
SM
3102 /* Go back to right after the "[:". */
3103 p = class_beg;
25fe55af 3104 SET_LIST_BIT ('[');
b18215fc
RS
3105
3106 Because the `:' may start the range, we
3107 can't simply set bit and repeat the loop.
7814e705 3108 Instead, just set it to C and handle below. */
b18215fc 3109 c = ':';
25fe55af
RS
3110 }
3111 }
b18215fc
RS
3112
3113 if (p < pend && p[0] == '-' && p[1] != ']')
3114 {
3115
3116 /* Discard the `-'. */
3117 PATFETCH (c1);
3118
3119 /* Fetch the character which ends the range. */
3120 PATFETCH (c1);
cf9c99bc
KH
3121#ifdef emacs
3122 if (CHAR_BYTE8_P (c1)
3123 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3124 /* Treat the range from a multibyte character to
3125 raw-byte character as empty. */
3126 c = c1 + 1;
3127#endif /* emacs */
e318085a 3128 }
25fe55af 3129 else
b18215fc
RS
3130 /* Range from C to C. */
3131 c1 = c;
3132
cf9c99bc 3133 if (c > c1)
25fe55af 3134 {
cf9c99bc
KH
3135 if (syntax & RE_NO_EMPTY_RANGES)
3136 FREE_STACK_RETURN (REG_ERANGEX);
3137 /* Else, repeat the loop. */
bf216479 3138 }
6fdd04b0 3139 else
25fe55af 3140 {
cf9c99bc
KH
3141#ifndef emacs
3142 /* Set the range into bitmap */
8f924df7 3143 for (; c <= c1; c++)
b18215fc 3144 {
cf9c99bc
KH
3145 ch = TRANSLATE (c);
3146 if (ch < (1 << BYTEWIDTH))
3147 SET_LIST_BIT (ch);
3148 }
3149#else /* emacs */
3150 if (c < 128)
3151 {
3152 ch = MIN (127, c1);
3153 SETUP_ASCII_RANGE (range_table_work, c, ch);
3154 c = ch + 1;
3155 if (CHAR_BYTE8_P (c1))
3156 c = BYTE8_TO_CHAR (128);
3157 }
3158 if (c <= c1)
3159 {
3160 if (CHAR_BYTE8_P (c))
3161 {
3162 c = CHAR_TO_BYTE8 (c);
3163 c1 = CHAR_TO_BYTE8 (c1);
3164 for (; c <= c1; c++)
3165 SET_LIST_BIT (c);
3166 }
3167 else if (multibyte)
3168 {
3169 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3170 }
3171 else
3172 {
3173 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3174 }
e934739e 3175 }
cf9c99bc 3176#endif /* emacs */
25fe55af 3177 }
e318085a
RS
3178 }
3179
25fe55af 3180 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3181 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3182 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3183 b[-1]--;
3184 b += b[-1];
fa9a63c5 3185
96cc36cc
RS
3186 /* Build real range table from work area. */
3187 if (RANGE_TABLE_WORK_USED (range_table_work)
3188 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3189 {
3190 int i;
3191 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3192
b18215fc 3193 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3194 bytes for flags, two for COUNT, and three bytes for
3195 each character. */
3196 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3197
b18215fc
RS
3198 /* Indicate the existence of range table. */
3199 laststart[1] |= 0x80;
fa9a63c5 3200
96cc36cc
RS
3201 /* Store the character class flag bits into the range table.
3202 If not in emacs, these flag bits are always 0. */
3203 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3204 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3205
b18215fc
RS
3206 STORE_NUMBER_AND_INCR (b, used / 2);
3207 for (i = 0; i < used; i++)
3208 STORE_CHARACTER_AND_INCR
3209 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3210 }
25fe55af
RS
3211 }
3212 break;
fa9a63c5
RM
3213
3214
b18215fc 3215 case '(':
25fe55af
RS
3216 if (syntax & RE_NO_BK_PARENS)
3217 goto handle_open;
3218 else
3219 goto normal_char;
fa9a63c5
RM
3220
3221
25fe55af
RS
3222 case ')':
3223 if (syntax & RE_NO_BK_PARENS)
3224 goto handle_close;
3225 else
3226 goto normal_char;
e318085a
RS
3227
3228
25fe55af
RS
3229 case '\n':
3230 if (syntax & RE_NEWLINE_ALT)
3231 goto handle_alt;
3232 else
3233 goto normal_char;
e318085a
RS
3234
3235
b18215fc 3236 case '|':
25fe55af
RS
3237 if (syntax & RE_NO_BK_VBAR)
3238 goto handle_alt;
3239 else
3240 goto normal_char;
3241
3242
3243 case '{':
3244 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3245 goto handle_interval;
3246 else
3247 goto normal_char;
3248
3249
3250 case '\\':
3251 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3252
3253 /* Do not translate the character after the \, so that we can
3254 distinguish, e.g., \B from \b, even if we normally would
3255 translate, e.g., B to b. */
36595814 3256 PATFETCH (c);
25fe55af
RS
3257
3258 switch (c)
3259 {
3260 case '(':
3261 if (syntax & RE_NO_BK_PARENS)
3262 goto normal_backslash;
3263
3264 handle_open:
505bde11
SM
3265 {
3266 int shy = 0;
c69b0314 3267 regnum_t regnum = 0;
505bde11
SM
3268 if (p+1 < pend)
3269 {
3270 /* Look for a special (?...) construct */
ed0767d8 3271 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3272 {
ed0767d8 3273 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3274 while (!shy)
505bde11 3275 {
c69b0314
SM
3276 PATFETCH (c);
3277 switch (c)
3278 {
3279 case ':': shy = 1; break;
3280 case '0':
3281 /* An explicitly specified regnum must start
3282 with non-0. */
3283 if (regnum == 0)
3284 FREE_STACK_RETURN (REG_BADPAT);
3285 case '1': case '2': case '3': case '4':
3286 case '5': case '6': case '7': case '8': case '9':
3287 regnum = 10*regnum + (c - '0'); break;
3288 default:
3289 /* Only (?:...) and (?N:...) are supported right now. */
3290 FREE_STACK_RETURN (REG_BADPAT);
3291 }
505bde11
SM
3292 }
3293 }
505bde11
SM
3294 }
3295
3296 if (!shy)
c69b0314
SM
3297 regnum = ++bufp->re_nsub;
3298 else if (regnum)
3299 { /* It's actually not shy, but explicitly numbered. */
3300 shy = 0;
3301 if (regnum > bufp->re_nsub)
3302 bufp->re_nsub = regnum;
3303 else if (regnum > bufp->re_nsub
3304 /* Ideally, we'd want to check that the specified
3305 group can't have matched (i.e. all subgroups
3306 using the same regnum are in other branches of
3307 OR patterns), but we don't currently keep track
3308 of enough info to do that easily. */
3309 || group_in_compile_stack (compile_stack, regnum))
3310 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3311 }
c69b0314
SM
3312 else
3313 /* It's really shy. */
3314 regnum = - bufp->re_nsub;
25fe55af 3315
99633e97
SM
3316 if (COMPILE_STACK_FULL)
3317 {
3318 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3319 compile_stack_elt_t);
3320 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3321
99633e97
SM
3322 compile_stack.size <<= 1;
3323 }
25fe55af 3324
99633e97 3325 /* These are the values to restore when we hit end of this
7814e705 3326 group. They are all relative offsets, so that if the
99633e97
SM
3327 whole pattern moves because of realloc, they will still
3328 be valid. */
3329 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3330 COMPILE_STACK_TOP.fixup_alt_jump
3331 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3332 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3333 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3334
c69b0314
SM
3335 /* Do not push a start_memory for groups beyond the last one
3336 we can represent in the compiled pattern. */
3337 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3338 BUF_PUSH_2 (start_memory, regnum);
3339
3340 compile_stack.avail++;
3341
3342 fixup_alt_jump = 0;
3343 laststart = 0;
3344 begalt = b;
3345 /* If we've reached MAX_REGNUM groups, then this open
3346 won't actually generate any code, so we'll have to
3347 clear pending_exact explicitly. */
3348 pending_exact = 0;
3349 break;
505bde11 3350 }
25fe55af
RS
3351
3352 case ')':
3353 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3354
3355 if (COMPILE_STACK_EMPTY)
505bde11
SM
3356 {
3357 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3358 goto normal_backslash;
3359 else
3360 FREE_STACK_RETURN (REG_ERPAREN);
3361 }
25fe55af
RS
3362
3363 handle_close:
505bde11 3364 FIXUP_ALT_JUMP ();
25fe55af
RS
3365
3366 /* See similar code for backslashed left paren above. */
3367 if (COMPILE_STACK_EMPTY)
505bde11
SM
3368 {
3369 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3370 goto normal_char;
3371 else
3372 FREE_STACK_RETURN (REG_ERPAREN);
3373 }
25fe55af
RS
3374
3375 /* Since we just checked for an empty stack above, this
3376 ``can't happen''. */
3377 assert (compile_stack.avail != 0);
3378 {
3379 /* We don't just want to restore into `regnum', because
3380 later groups should continue to be numbered higher,
7814e705 3381 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3382 regnum_t regnum;
25fe55af
RS
3383
3384 compile_stack.avail--;
3385 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3386 fixup_alt_jump
3387 = COMPILE_STACK_TOP.fixup_alt_jump
3388 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3389 : 0;
3390 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3391 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3392 /* If we've reached MAX_REGNUM groups, then this close
3393 won't actually generate any code, so we'll have to
3394 clear pending_exact explicitly. */
3395 pending_exact = 0;
e318085a 3396
25fe55af 3397 /* We're at the end of the group, so now we know how many
7814e705 3398 groups were inside this one. */
c69b0314
SM
3399 if (regnum <= MAX_REGNUM && regnum > 0)
3400 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3401 }
3402 break;
3403
3404
3405 case '|': /* `\|'. */
3406 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3407 goto normal_backslash;
3408 handle_alt:
3409 if (syntax & RE_LIMITED_OPS)
3410 goto normal_char;
3411
3412 /* Insert before the previous alternative a jump which
7814e705 3413 jumps to this alternative if the former fails. */
25fe55af
RS
3414 GET_BUFFER_SPACE (3);
3415 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3416 pending_exact = 0;
3417 b += 3;
3418
3419 /* The alternative before this one has a jump after it
3420 which gets executed if it gets matched. Adjust that
3421 jump so it will jump to this alternative's analogous
3422 jump (put in below, which in turn will jump to the next
3423 (if any) alternative's such jump, etc.). The last such
3424 jump jumps to the correct final destination. A picture:
3425 _____ _____
3426 | | | |
3427 | v | v
3428 a | b | c
3429
3430 If we are at `b', then fixup_alt_jump right now points to a
3431 three-byte space after `a'. We'll put in the jump, set
3432 fixup_alt_jump to right after `b', and leave behind three
3433 bytes which we'll fill in when we get to after `c'. */
3434
505bde11 3435 FIXUP_ALT_JUMP ();
25fe55af
RS
3436
3437 /* Mark and leave space for a jump after this alternative,
3438 to be filled in later either by next alternative or
3439 when we know we're at the end of a series of alternatives. */
3440 fixup_alt_jump = b;
3441 GET_BUFFER_SPACE (3);
3442 b += 3;
3443
3444 laststart = 0;
3445 begalt = b;
3446 break;
3447
3448
3449 case '{':
3450 /* If \{ is a literal. */
3451 if (!(syntax & RE_INTERVALS)
3452 /* If we're at `\{' and it's not the open-interval
3453 operator. */
4bb91c68 3454 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3455 goto normal_backslash;
3456
3457 handle_interval:
3458 {
3459 /* If got here, then the syntax allows intervals. */
3460
3461 /* At least (most) this many matches must be made. */
99633e97 3462 int lower_bound = 0, upper_bound = -1;
25fe55af 3463
ed0767d8 3464 beg_interval = p;
25fe55af 3465
25fe55af
RS
3466 GET_UNSIGNED_NUMBER (lower_bound);
3467
3468 if (c == ',')
ed0767d8 3469 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3470 else
3471 /* Interval such as `{1}' => match exactly once. */
3472 upper_bound = lower_bound;
3473
3474 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3475 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3476 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3477
3478 if (!(syntax & RE_NO_BK_BRACES))
3479 {
4bb91c68
SM
3480 if (c != '\\')
3481 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3482 if (p == pend)
3483 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3484 PATFETCH (c);
3485 }
3486
3487 if (c != '}')
4bb91c68 3488 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3489
3490 /* We just parsed a valid interval. */
3491
3492 /* If it's invalid to have no preceding re. */
3493 if (!laststart)
3494 {
3495 if (syntax & RE_CONTEXT_INVALID_OPS)
3496 FREE_STACK_RETURN (REG_BADRPT);
3497 else if (syntax & RE_CONTEXT_INDEP_OPS)
3498 laststart = b;
3499 else
3500 goto unfetch_interval;
3501 }
3502
6df42991
SM
3503 if (upper_bound == 0)
3504 /* If the upper bound is zero, just drop the sub pattern
3505 altogether. */
3506 b = laststart;
3507 else if (lower_bound == 1 && upper_bound == 1)
3508 /* Just match it once: nothing to do here. */
3509 ;
3510
3511 /* Otherwise, we have a nontrivial interval. When
3512 we're all done, the pattern will look like:
3513 set_number_at <jump count> <upper bound>
3514 set_number_at <succeed_n count> <lower bound>
3515 succeed_n <after jump addr> <succeed_n count>
3516 <body of loop>
3517 jump_n <succeed_n addr> <jump count>
3518 (The upper bound and `jump_n' are omitted if
3519 `upper_bound' is 1, though.) */
3520 else
3521 { /* If the upper bound is > 1, we need to insert
3522 more at the end of the loop. */
3523 unsigned int nbytes = (upper_bound < 0 ? 3
3524 : upper_bound > 1 ? 5 : 0);
3525 unsigned int startoffset = 0;
3526
3527 GET_BUFFER_SPACE (20); /* We might use less. */
3528
3529 if (lower_bound == 0)
3530 {
3531 /* A succeed_n that starts with 0 is really a
3532 simple on_failure_jump_loop. */
3533 INSERT_JUMP (on_failure_jump_loop, laststart,
3534 b + 3 + nbytes);
3535 b += 3;
3536 }
3537 else
3538 {
3539 /* Initialize lower bound of the `succeed_n', even
3540 though it will be set during matching by its
3541 attendant `set_number_at' (inserted next),
3542 because `re_compile_fastmap' needs to know.
3543 Jump to the `jump_n' we might insert below. */
3544 INSERT_JUMP2 (succeed_n, laststart,
3545 b + 5 + nbytes,
3546 lower_bound);
3547 b += 5;
3548
3549 /* Code to initialize the lower bound. Insert
7814e705 3550 before the `succeed_n'. The `5' is the last two
6df42991
SM
3551 bytes of this `set_number_at', plus 3 bytes of
3552 the following `succeed_n'. */
3553 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3554 b += 5;
3555 startoffset += 5;
3556 }
3557
3558 if (upper_bound < 0)
3559 {
3560 /* A negative upper bound stands for infinity,
3561 in which case it degenerates to a plain jump. */
3562 STORE_JUMP (jump, b, laststart + startoffset);
3563 b += 3;
3564 }
3565 else if (upper_bound > 1)
3566 { /* More than one repetition is allowed, so
3567 append a backward jump to the `succeed_n'
3568 that starts this interval.
3569
3570 When we've reached this during matching,
3571 we'll have matched the interval once, so
3572 jump back only `upper_bound - 1' times. */
3573 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3574 upper_bound - 1);
3575 b += 5;
3576
3577 /* The location we want to set is the second
3578 parameter of the `jump_n'; that is `b-2' as
3579 an absolute address. `laststart' will be
3580 the `set_number_at' we're about to insert;
3581 `laststart+3' the number to set, the source
3582 for the relative address. But we are
3583 inserting into the middle of the pattern --
3584 so everything is getting moved up by 5.
3585 Conclusion: (b - 2) - (laststart + 3) + 5,
3586 i.e., b - laststart.
3587
3588 We insert this at the beginning of the loop
3589 so that if we fail during matching, we'll
3590 reinitialize the bounds. */
3591 insert_op2 (set_number_at, laststart, b - laststart,
3592 upper_bound - 1, b);
3593 b += 5;
3594 }
3595 }
25fe55af
RS
3596 pending_exact = 0;
3597 beg_interval = NULL;
3598 }
3599 break;
3600
3601 unfetch_interval:
3602 /* If an invalid interval, match the characters as literals. */
3603 assert (beg_interval);
3604 p = beg_interval;
3605 beg_interval = NULL;
3606
3607 /* normal_char and normal_backslash need `c'. */
ed0767d8 3608 c = '{';
25fe55af
RS
3609
3610 if (!(syntax & RE_NO_BK_BRACES))
3611 {
ed0767d8
SM
3612 assert (p > pattern && p[-1] == '\\');
3613 goto normal_backslash;
25fe55af 3614 }
ed0767d8
SM
3615 else
3616 goto normal_char;
e318085a 3617
b18215fc 3618#ifdef emacs
25fe55af 3619 /* There is no way to specify the before_dot and after_dot
7814e705 3620 operators. rms says this is ok. --karl */
25fe55af
RS
3621 case '=':
3622 BUF_PUSH (at_dot);
3623 break;
3624
3625 case 's':
3626 laststart = b;
3627 PATFETCH (c);
3628 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3629 break;
3630
3631 case 'S':
3632 laststart = b;
3633 PATFETCH (c);
3634 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3635 break;
b18215fc
RS
3636
3637 case 'c':
3638 laststart = b;
36595814 3639 PATFETCH (c);
b18215fc
RS
3640 BUF_PUSH_2 (categoryspec, c);
3641 break;
e318085a 3642
b18215fc
RS
3643 case 'C':
3644 laststart = b;
36595814 3645 PATFETCH (c);
b18215fc
RS
3646 BUF_PUSH_2 (notcategoryspec, c);
3647 break;
3648#endif /* emacs */
e318085a 3649
e318085a 3650
25fe55af 3651 case 'w':
4bb91c68
SM
3652 if (syntax & RE_NO_GNU_OPS)
3653 goto normal_char;
25fe55af 3654 laststart = b;
1fb352e0 3655 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3656 break;
e318085a 3657
e318085a 3658
25fe55af 3659 case 'W':
4bb91c68
SM
3660 if (syntax & RE_NO_GNU_OPS)
3661 goto normal_char;
25fe55af 3662 laststart = b;
1fb352e0 3663 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3664 break;
e318085a
RS
3665
3666
25fe55af 3667 case '<':
4bb91c68
SM
3668 if (syntax & RE_NO_GNU_OPS)
3669 goto normal_char;
25fe55af
RS
3670 BUF_PUSH (wordbeg);
3671 break;
e318085a 3672
25fe55af 3673 case '>':
4bb91c68
SM
3674 if (syntax & RE_NO_GNU_OPS)
3675 goto normal_char;
25fe55af
RS
3676 BUF_PUSH (wordend);
3677 break;
e318085a 3678
669fa600
SM
3679 case '_':
3680 if (syntax & RE_NO_GNU_OPS)
3681 goto normal_char;
3682 laststart = b;
3683 PATFETCH (c);
3684 if (c == '<')
3685 BUF_PUSH (symbeg);
3686 else if (c == '>')
3687 BUF_PUSH (symend);
3688 else
3689 FREE_STACK_RETURN (REG_BADPAT);
3690 break;
3691
25fe55af 3692 case 'b':
4bb91c68
SM
3693 if (syntax & RE_NO_GNU_OPS)
3694 goto normal_char;
25fe55af
RS
3695 BUF_PUSH (wordbound);
3696 break;
e318085a 3697
25fe55af 3698 case 'B':
4bb91c68
SM
3699 if (syntax & RE_NO_GNU_OPS)
3700 goto normal_char;
25fe55af
RS
3701 BUF_PUSH (notwordbound);
3702 break;
fa9a63c5 3703
25fe55af 3704 case '`':
4bb91c68
SM
3705 if (syntax & RE_NO_GNU_OPS)
3706 goto normal_char;
25fe55af
RS
3707 BUF_PUSH (begbuf);
3708 break;
e318085a 3709
25fe55af 3710 case '\'':
4bb91c68
SM
3711 if (syntax & RE_NO_GNU_OPS)
3712 goto normal_char;
25fe55af
RS
3713 BUF_PUSH (endbuf);
3714 break;
e318085a 3715
25fe55af
RS
3716 case '1': case '2': case '3': case '4': case '5':
3717 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3718 {
3719 regnum_t reg;
e318085a 3720
0cdd06f8
SM
3721 if (syntax & RE_NO_BK_REFS)
3722 goto normal_backslash;
e318085a 3723
0cdd06f8 3724 reg = c - '0';
e318085a 3725
c69b0314
SM
3726 if (reg > bufp->re_nsub || reg < 1
3727 /* Can't back reference to a subexp before its end. */
3728 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3729 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3730
0cdd06f8
SM
3731 laststart = b;
3732 BUF_PUSH_2 (duplicate, reg);
3733 }
25fe55af 3734 break;
e318085a 3735
e318085a 3736
25fe55af
RS
3737 case '+':
3738 case '?':
3739 if (syntax & RE_BK_PLUS_QM)
3740 goto handle_plus;
3741 else
3742 goto normal_backslash;
3743
3744 default:
3745 normal_backslash:
3746 /* You might think it would be useful for \ to mean
3747 not to translate; but if we don't translate it
4bb91c68 3748 it will never match anything. */
25fe55af
RS
3749 goto normal_char;
3750 }
3751 break;
fa9a63c5
RM
3752
3753
3754 default:
25fe55af 3755 /* Expects the character in `c'. */
fa9a63c5 3756 normal_char:
36595814 3757 /* If no exactn currently being built. */
25fe55af 3758 if (!pending_exact
fa9a63c5 3759
25fe55af
RS
3760 /* If last exactn not at current position. */
3761 || pending_exact + *pending_exact + 1 != b
5e69f11e 3762
25fe55af 3763 /* We have only one byte following the exactn for the count. */
2d1675e4 3764 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3765
7814e705 3766 /* If followed by a repetition operator. */
9d99031f 3767 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3768 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3769 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3770 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3771 || ((syntax & RE_INTERVALS)
25fe55af 3772 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3773 ? p != pend && *p == '{'
3774 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3775 {
3776 /* Start building a new exactn. */
5e69f11e 3777
25fe55af 3778 laststart = b;
fa9a63c5
RM
3779
3780 BUF_PUSH_2 (exactn, 0);
3781 pending_exact = b - 1;
25fe55af 3782 }
5e69f11e 3783
2d1675e4
SM
3784 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3785 {
e0277a47
KH
3786 int len;
3787
cf9c99bc 3788 if (multibyte)
6fdd04b0 3789 {
cf9c99bc 3790 c = TRANSLATE (c);
6fdd04b0
KH
3791 len = CHAR_STRING (c, b);
3792 b += len;
3793 }
e0277a47 3794 else
6fdd04b0 3795 {
cf9c99bc
KH
3796 c1 = RE_CHAR_TO_MULTIBYTE (c);
3797 if (! CHAR_BYTE8_P (c1))
3798 {
3799 re_wchar_t c2 = TRANSLATE (c1);
3800
3801 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3802 c = c1;
3803 }
6fdd04b0
KH
3804 *b++ = c;
3805 len = 1;
3806 }
2d1675e4
SM
3807 (*pending_exact) += len;
3808 }
3809
fa9a63c5 3810 break;
25fe55af 3811 } /* switch (c) */
fa9a63c5
RM
3812 } /* while p != pend */
3813
5e69f11e 3814
fa9a63c5 3815 /* Through the pattern now. */
5e69f11e 3816
505bde11 3817 FIXUP_ALT_JUMP ();
fa9a63c5 3818
5e69f11e 3819 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3820 FREE_STACK_RETURN (REG_EPAREN);
3821
3822 /* If we don't want backtracking, force success
3823 the first time we reach the end of the compiled pattern. */
3824 if (syntax & RE_NO_POSIX_BACKTRACKING)
3825 BUF_PUSH (succeed);
3826
fa9a63c5
RM
3827 /* We have succeeded; set the length of the buffer. */
3828 bufp->used = b - bufp->buffer;
3829
3830#ifdef DEBUG
99633e97 3831 if (debug > 0)
fa9a63c5 3832 {
505bde11 3833 re_compile_fastmap (bufp);
fa9a63c5
RM
3834 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3835 print_compiled_pattern (bufp);
3836 }
99633e97 3837 debug--;
fa9a63c5
RM
3838#endif /* DEBUG */
3839
3840#ifndef MATCH_MAY_ALLOCATE
3841 /* Initialize the failure stack to the largest possible stack. This
3842 isn't necessary unless we're trying to avoid calling alloca in
3843 the search and match routines. */
3844 {
3845 int num_regs = bufp->re_nsub + 1;
3846
320a2a73 3847 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3848 {
a26f4ccd 3849 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3850
fa9a63c5
RM
3851 if (! fail_stack.stack)
3852 fail_stack.stack
5e69f11e 3853 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3854 * sizeof (fail_stack_elt_t));
3855 else
3856 fail_stack.stack
3857 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3858 (fail_stack.size
3859 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3860 }
3861
3862 regex_grow_registers (num_regs);
3863 }
3864#endif /* not MATCH_MAY_ALLOCATE */
3865
839966f3 3866 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3867} /* regex_compile */
3868\f
3869/* Subroutines for `regex_compile'. */
3870
7814e705 3871/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3872
3873static void
3874store_op1 (op, loc, arg)
3875 re_opcode_t op;
3876 unsigned char *loc;
3877 int arg;
3878{
3879 *loc = (unsigned char) op;
3880 STORE_NUMBER (loc + 1, arg);
3881}
3882
3883
3884/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3885
3886static void
3887store_op2 (op, loc, arg1, arg2)
3888 re_opcode_t op;
3889 unsigned char *loc;
3890 int arg1, arg2;
3891{
3892 *loc = (unsigned char) op;
3893 STORE_NUMBER (loc + 1, arg1);
3894 STORE_NUMBER (loc + 3, arg2);
3895}
3896
3897
3898/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3899 for OP followed by two-byte integer parameter ARG. */
3900
3901static void
3902insert_op1 (op, loc, arg, end)
3903 re_opcode_t op;
3904 unsigned char *loc;
3905 int arg;
5e69f11e 3906 unsigned char *end;
fa9a63c5
RM
3907{
3908 register unsigned char *pfrom = end;
3909 register unsigned char *pto = end + 3;
3910
3911 while (pfrom != loc)
3912 *--pto = *--pfrom;
5e69f11e 3913
fa9a63c5
RM
3914 store_op1 (op, loc, arg);
3915}
3916
3917
3918/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3919
3920static void
3921insert_op2 (op, loc, arg1, arg2, end)
3922 re_opcode_t op;
3923 unsigned char *loc;
3924 int arg1, arg2;
5e69f11e 3925 unsigned char *end;
fa9a63c5
RM
3926{
3927 register unsigned char *pfrom = end;
3928 register unsigned char *pto = end + 5;
3929
3930 while (pfrom != loc)
3931 *--pto = *--pfrom;
5e69f11e 3932
fa9a63c5
RM
3933 store_op2 (op, loc, arg1, arg2);
3934}
3935
3936
3937/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3938 after an alternative or a begin-subexpression. We assume there is at
3939 least one character before the ^. */
3940
3941static boolean
3942at_begline_loc_p (pattern, p, syntax)
01618498 3943 re_char *pattern, *p;
fa9a63c5
RM
3944 reg_syntax_t syntax;
3945{
01618498 3946 re_char *prev = p - 2;
fa9a63c5 3947 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3948
fa9a63c5
RM
3949 return
3950 /* After a subexpression? */
3951 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3952 /* After an alternative? */
d2af47df
SM
3953 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3954 /* After a shy subexpression? */
3955 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3956 && prev[-1] == '?' && prev[-2] == '('
3957 && (syntax & RE_NO_BK_PARENS
3958 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3959}
3960
3961
3962/* The dual of at_begline_loc_p. This one is for $. We assume there is
3963 at least one character after the $, i.e., `P < PEND'. */
3964
3965static boolean
3966at_endline_loc_p (p, pend, syntax)
01618498 3967 re_char *p, *pend;
99633e97 3968 reg_syntax_t syntax;
fa9a63c5 3969{
01618498 3970 re_char *next = p;
fa9a63c5 3971 boolean next_backslash = *next == '\\';
01618498 3972 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3973
fa9a63c5
RM
3974 return
3975 /* Before a subexpression? */
3976 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3977 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3978 /* Before an alternative? */
3979 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3980 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3981}
3982
3983
5e69f11e 3984/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3985 false if it's not. */
3986
3987static boolean
3988group_in_compile_stack (compile_stack, regnum)
3989 compile_stack_type compile_stack;
3990 regnum_t regnum;
3991{
3992 int this_element;
3993
5e69f11e
RM
3994 for (this_element = compile_stack.avail - 1;
3995 this_element >= 0;
fa9a63c5
RM
3996 this_element--)
3997 if (compile_stack.stack[this_element].regnum == regnum)
3998 return true;
3999
4000 return false;
4001}
fa9a63c5 4002\f
f6a3f532
SM
4003/* analyse_first.
4004 If fastmap is non-NULL, go through the pattern and fill fastmap
4005 with all the possible leading chars. If fastmap is NULL, don't
4006 bother filling it up (obviously) and only return whether the
4007 pattern could potentially match the empty string.
4008
4009 Return 1 if p..pend might match the empty string.
4010 Return 0 if p..pend matches at least one char.
01618498 4011 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
4012
4013static int
4014analyse_first (p, pend, fastmap, multibyte)
01618498 4015 re_char *p, *pend;
f6a3f532
SM
4016 char *fastmap;
4017 const int multibyte;
fa9a63c5 4018{
505bde11 4019 int j, k;
1fb352e0 4020 boolean not;
fa9a63c5 4021
b18215fc 4022 /* If all elements for base leading-codes in fastmap is set, this
7814e705 4023 flag is set true. */
b18215fc
RS
4024 boolean match_any_multibyte_characters = false;
4025
f6a3f532 4026 assert (p);
5e69f11e 4027
505bde11
SM
4028 /* The loop below works as follows:
4029 - It has a working-list kept in the PATTERN_STACK and which basically
4030 starts by only containing a pointer to the first operation.
4031 - If the opcode we're looking at is a match against some set of
4032 chars, then we add those chars to the fastmap and go on to the
4033 next work element from the worklist (done via `break').
4034 - If the opcode is a control operator on the other hand, we either
4035 ignore it (if it's meaningless at this point, such as `start_memory')
4036 or execute it (if it's a jump). If the jump has several destinations
4037 (i.e. `on_failure_jump'), then we push the other destination onto the
4038 worklist.
4039 We guarantee termination by ignoring backward jumps (more or less),
4040 so that `p' is monotonically increasing. More to the point, we
4041 never set `p' (or push) anything `<= p1'. */
4042
01618498 4043 while (p < pend)
fa9a63c5 4044 {
505bde11
SM
4045 /* `p1' is used as a marker of how far back a `on_failure_jump'
4046 can go without being ignored. It is normally equal to `p'
4047 (which prevents any backward `on_failure_jump') except right
4048 after a plain `jump', to allow patterns such as:
4049 0: jump 10
4050 3..9: <body>
4051 10: on_failure_jump 3
4052 as used for the *? operator. */
01618498 4053 re_char *p1 = p;
5e69f11e 4054
fa9a63c5
RM
4055 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4056 {
f6a3f532 4057 case succeed:
01618498 4058 return 1;
f6a3f532 4059 continue;
fa9a63c5 4060
fa9a63c5 4061 case duplicate:
505bde11
SM
4062 /* If the first character has to match a backreference, that means
4063 that the group was empty (since it already matched). Since this
4064 is the only case that interests us here, we can assume that the
4065 backreference must match the empty string. */
4066 p++;
4067 continue;
fa9a63c5
RM
4068
4069
4070 /* Following are the cases which match a character. These end
7814e705 4071 with `break'. */
fa9a63c5
RM
4072
4073 case exactn:
e0277a47 4074 if (fastmap)
cf9c99bc
KH
4075 {
4076 /* If multibyte is nonzero, the first byte of each
4077 character is an ASCII or a leading code. Otherwise,
4078 each byte is a character. Thus, this works in both
4079 cases. */
4080 fastmap[p[1]] = 1;
4081 if (! multibyte)
4082 {
4083 /* For the case of matching this unibyte regex
4084 against multibyte, we must set a leading code of
4085 the corresponding multibyte character. */
4086 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4087
4088 if (! CHAR_BYTE8_P (c))
4089 fastmap[CHAR_LEADING_CODE (c)] = 1;
4090 }
4091 }
fa9a63c5
RM
4092 break;
4093
4094
1fb352e0
SM
4095 case anychar:
4096 /* We could put all the chars except for \n (and maybe \0)
4097 but we don't bother since it is generally not worth it. */
f6a3f532 4098 if (!fastmap) break;
01618498 4099 return -1;
fa9a63c5
RM
4100
4101
b18215fc 4102 case charset_not:
1fb352e0 4103 if (!fastmap) break;
bf216479
KH
4104 {
4105 /* Chars beyond end of bitmap are possible matches. */
bf216479 4106 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 4107 j < (1 << BYTEWIDTH); j++)
bf216479
KH
4108 fastmap[j] = 1;
4109 }
4110
1fb352e0
SM
4111 /* Fallthrough */
4112 case charset:
4113 if (!fastmap) break;
4114 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4115 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4116 j >= 0; j--)
1fb352e0 4117 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4118 fastmap[j] = 1;
b18215fc 4119
6482db2e
KH
4120#ifdef emacs
4121 if (/* Any leading code can possibly start a character
1fb352e0 4122 which doesn't match the specified set of characters. */
6482db2e
KH
4123 not
4124 ||
4125 /* If we can match a character class, we can match any
4126 multibyte characters. */
4127 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4128 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4129
b18215fc 4130 {
b18215fc
RS
4131 if (match_any_multibyte_characters == false)
4132 {
6482db2e
KH
4133 for (j = MIN_MULTIBYTE_LEADING_CODE;
4134 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4135 fastmap[j] = 1;
b18215fc
RS
4136 match_any_multibyte_characters = true;
4137 }
4138 }
b18215fc 4139
1fb352e0
SM
4140 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4141 && match_any_multibyte_characters == false)
4142 {
bf216479 4143 /* Set fastmap[I] to 1 where I is a leading code of each
9117d724 4144 multibyte characer in the range table. */
1fb352e0 4145 int c, count;
bf216479 4146 unsigned char lc1, lc2;
b18215fc 4147
1fb352e0 4148 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4149 bits for a character class. */
1fb352e0 4150 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4151
1fb352e0
SM
4152 /* Extract the number of ranges in range table into COUNT. */
4153 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4154 for (; count > 0; count--, p += 3)
1fb352e0 4155 {
9117d724
KH
4156 /* Extract the start and end of each range. */
4157 EXTRACT_CHARACTER (c, p);
bf216479 4158 lc1 = CHAR_LEADING_CODE (c);
9117d724 4159 p += 3;
1fb352e0 4160 EXTRACT_CHARACTER (c, p);
bf216479
KH
4161 lc2 = CHAR_LEADING_CODE (c);
4162 for (j = lc1; j <= lc2; j++)
9117d724 4163 fastmap[j] = 1;
1fb352e0
SM
4164 }
4165 }
6482db2e 4166#endif
b18215fc
RS
4167 break;
4168
1fb352e0
SM
4169 case syntaxspec:
4170 case notsyntaxspec:
4171 if (!fastmap) break;
4172#ifndef emacs
4173 not = (re_opcode_t)p[-1] == notsyntaxspec;
4174 k = *p++;
4175 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4176 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4177 fastmap[j] = 1;
b18215fc 4178 break;
1fb352e0 4179#else /* emacs */
b18215fc
RS
4180 /* This match depends on text properties. These end with
4181 aborting optimizations. */
01618498 4182 return -1;
b18215fc
RS
4183
4184 case categoryspec:
b18215fc 4185 case notcategoryspec:
1fb352e0
SM
4186 if (!fastmap) break;
4187 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4188 k = *p++;
6482db2e 4189 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4190 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4191 fastmap[j] = 1;
4192
6482db2e
KH
4193 /* Any leading code can possibly start a character which
4194 has or doesn't has the specified category. */
4195 if (match_any_multibyte_characters == false)
6fdd04b0 4196 {
6482db2e
KH
4197 for (j = MIN_MULTIBYTE_LEADING_CODE;
4198 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4199 fastmap[j] = 1;
4200 match_any_multibyte_characters = true;
6fdd04b0 4201 }
b18215fc
RS
4202 break;
4203
fa9a63c5 4204 /* All cases after this match the empty string. These end with
25fe55af 4205 `continue'. */
fa9a63c5 4206
fa9a63c5
RM
4207 case before_dot:
4208 case at_dot:
4209 case after_dot:
1fb352e0 4210#endif /* !emacs */
25fe55af
RS
4211 case no_op:
4212 case begline:
4213 case endline:
fa9a63c5
RM
4214 case begbuf:
4215 case endbuf:
4216 case wordbound:
4217 case notwordbound:
4218 case wordbeg:
4219 case wordend:
669fa600
SM
4220 case symbeg:
4221 case symend:
25fe55af 4222 continue;
fa9a63c5
RM
4223
4224
fa9a63c5 4225 case jump:
25fe55af 4226 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4227 if (j < 0)
4228 /* Backward jumps can only go back to code that we've already
4229 visited. `re_compile' should make sure this is true. */
4230 break;
25fe55af 4231 p += j;
505bde11
SM
4232 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4233 {
4234 case on_failure_jump:
4235 case on_failure_keep_string_jump:
505bde11 4236 case on_failure_jump_loop:
0683b6fa 4237 case on_failure_jump_nastyloop:
505bde11
SM
4238 case on_failure_jump_smart:
4239 p++;
4240 break;
4241 default:
4242 continue;
4243 };
4244 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4245 to jump back to "just after here". */
4246 /* Fallthrough */
fa9a63c5 4247
25fe55af
RS
4248 case on_failure_jump:
4249 case on_failure_keep_string_jump:
0683b6fa 4250 case on_failure_jump_nastyloop:
505bde11
SM
4251 case on_failure_jump_loop:
4252 case on_failure_jump_smart:
25fe55af 4253 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4254 if (p + j <= p1)
ed0767d8 4255 ; /* Backward jump to be ignored. */
01618498
SM
4256 else
4257 { /* We have to look down both arms.
4258 We first go down the "straight" path so as to minimize
4259 stack usage when going through alternatives. */
4260 int r = analyse_first (p, pend, fastmap, multibyte);
4261 if (r) return r;
4262 p += j;
4263 }
25fe55af 4264 continue;
fa9a63c5
RM
4265
4266
ed0767d8
SM
4267 case jump_n:
4268 /* This code simply does not properly handle forward jump_n. */
4269 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4270 p += 4;
4271 /* jump_n can either jump or fall through. The (backward) jump
4272 case has already been handled, so we only need to look at the
4273 fallthrough case. */
4274 continue;
177c0ea7 4275
fa9a63c5 4276 case succeed_n:
ed0767d8
SM
4277 /* If N == 0, it should be an on_failure_jump_loop instead. */
4278 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4279 p += 4;
4280 /* We only care about one iteration of the loop, so we don't
4281 need to consider the case where this behaves like an
4282 on_failure_jump. */
25fe55af 4283 continue;
fa9a63c5
RM
4284
4285
4286 case set_number_at:
25fe55af
RS
4287 p += 4;
4288 continue;
fa9a63c5
RM
4289
4290
4291 case start_memory:
25fe55af 4292 case stop_memory:
505bde11 4293 p += 1;
fa9a63c5
RM
4294 continue;
4295
4296
4297 default:
25fe55af
RS
4298 abort (); /* We have listed all the cases. */
4299 } /* switch *p++ */
fa9a63c5
RM
4300
4301 /* Getting here means we have found the possible starting
25fe55af 4302 characters for one path of the pattern -- and that the empty
7814e705 4303 string does not match. We need not follow this path further. */
01618498 4304 return 0;
fa9a63c5
RM
4305 } /* while p */
4306
01618498
SM
4307 /* We reached the end without matching anything. */
4308 return 1;
4309
f6a3f532
SM
4310} /* analyse_first */
4311\f
4312/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4313 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4314 characters can start a string that matches the pattern. This fastmap
4315 is used by re_search to skip quickly over impossible starting points.
4316
4317 Character codes above (1 << BYTEWIDTH) are not represented in the
4318 fastmap, but the leading codes are represented. Thus, the fastmap
4319 indicates which character sets could start a match.
4320
4321 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4322 area as BUFP->fastmap.
4323
4324 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4325 the pattern buffer.
4326
4327 Returns 0 if we succeed, -2 if an internal error. */
4328
4329int
4330re_compile_fastmap (bufp)
4331 struct re_pattern_buffer *bufp;
4332{
4333 char *fastmap = bufp->fastmap;
4334 int analysis;
4335
4336 assert (fastmap && bufp->buffer);
4337
7814e705 4338 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4339 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4340
4341 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4342 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4343 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4344 return 0;
4345} /* re_compile_fastmap */
4346\f
4347/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4348 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4349 this memory for recording register information. STARTS and ENDS
4350 must be allocated using the malloc library routine, and must each
4351 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4352
4353 If NUM_REGS == 0, then subsequent matches should allocate their own
4354 register data.
4355
4356 Unless this function is called, the first search or match using
4357 PATTERN_BUFFER will allocate its own register data, without
4358 freeing the old data. */
4359
4360void
4361re_set_registers (bufp, regs, num_regs, starts, ends)
4362 struct re_pattern_buffer *bufp;
4363 struct re_registers *regs;
4364 unsigned num_regs;
4365 regoff_t *starts, *ends;
4366{
4367 if (num_regs)
4368 {
4369 bufp->regs_allocated = REGS_REALLOCATE;
4370 regs->num_regs = num_regs;
4371 regs->start = starts;
4372 regs->end = ends;
4373 }
4374 else
4375 {
4376 bufp->regs_allocated = REGS_UNALLOCATED;
4377 regs->num_regs = 0;
4378 regs->start = regs->end = (regoff_t *) 0;
4379 }
4380}
c0f9ea08 4381WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4382\f
7814e705 4383/* Searching routines. */
fa9a63c5
RM
4384
/* Single-string front end to re_search_2, below: STRING is passed as
   the second string with an empty first string, and the stop position
   is fixed at SIZE, so the caller cannot say where to stop matching.  */

int
re_search (bufp, string, size, startpos, range, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int size, startpos, range;
     struct re_registers *regs;
{
  const int stop = size;

  return re_search_2 (bufp, NULL, 0, string, size,
		      startpos, range, regs, stop);
}
c0f9ea08 4398WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4399
70806df6
KH
/* The three macros below address the virtual concatenation of
   `string1' (length `size1') and `string2' (length `size2'); they
   expect those names to be in scope at the point of use.  */

/* Head address of the string that contains position P.  */
#define HEAD_ADDR_VSTRING(P)		\
  ((P) >= size1 ? string2 : string1)

/* End address of the string that contains position P.  */
#define STOP_ADDR_VSTRING(P)				\
  ((P) >= size1 ? string2 + size2 : string1 + size1)

/* Address of position POS in the virtual concatenation.  The offset
   into string2 is computed before it is added to the pointer, so that
   no intermediate pointer lying before the start of string2 is ever
   formed.  */
#define POS_ADDR_VSTRING(POS)					\
  ((POS) >= size1 ? string2 + ((POS) - size1) : string1 + (POS))
fa9a63c5
RM
4411
4412/* Using the compiled pattern in BUFP->buffer, first tries to match the
4413 virtual concatenation of STRING1 and STRING2, starting first at index
4414 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4415
fa9a63c5 4416 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4417
fa9a63c5
RM
4418 RANGE is how far to scan while trying to match. RANGE = 0 means try
4419 only at STARTPOS; in general, the last start tried is STARTPOS +
4420 RANGE.
5e69f11e 4421
fa9a63c5
RM
4422 In REGS, return the indices of the virtual concatenation of STRING1
4423 and STRING2 that matched the entire BUFP->buffer and its contained
4424 subexpressions.
5e69f11e 4425
fa9a63c5
RM
4426 Do not consider matching one past the index STOP in the virtual
4427 concatenation of STRING1 and STRING2.
4428
4429 We return either the position in the strings at which the match was
4430 found, -1 if no match, or -2 if error (such as failure
4431 stack overflow). */
4432
4433int
66f0296e 4434re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4435 struct re_pattern_buffer *bufp;
66f0296e 4436 const char *str1, *str2;
fa9a63c5
RM
4437 int size1, size2;
4438 int startpos;
4439 int range;
4440 struct re_registers *regs;
4441 int stop;
4442{
4443 int val;
66f0296e
SM
4444 re_char *string1 = (re_char*) str1;
4445 re_char *string2 = (re_char*) str2;
fa9a63c5 4446 register char *fastmap = bufp->fastmap;
6676cb1c 4447 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4448 int total_size = size1 + size2;
4449 int endpos = startpos + range;
c0f9ea08 4450 boolean anchored_start;
cf9c99bc
KH
4451 /* Nonzero if we are searching multibyte string. */
4452 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4453
fa9a63c5
RM
4454 /* Check for out-of-range STARTPOS. */
4455 if (startpos < 0 || startpos > total_size)
4456 return -1;
5e69f11e 4457
fa9a63c5 4458 /* Fix up RANGE if it might eventually take us outside
34597fa9 4459 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4460 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4461 if (endpos < 0)
4462 range = 0 - startpos;
fa9a63c5
RM
4463 else if (endpos > total_size)
4464 range = total_size - startpos;
4465
4466 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4467 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4468 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4469 {
4470 if (startpos > 0)
4471 return -1;
4472 else
7b140fd7 4473 range = 0;
fa9a63c5
RM
4474 }
4475
ae4788a8
RS
4476#ifdef emacs
4477 /* In a forward search for something that starts with \=.
4478 don't keep searching past point. */
4479 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4480 {
7b140fd7
RS
4481 range = PT_BYTE - BEGV_BYTE - startpos;
4482 if (range < 0)
ae4788a8
RS
4483 return -1;
4484 }
4485#endif /* emacs */
4486
fa9a63c5
RM
4487 /* Update the fastmap now if not correct already. */
4488 if (fastmap && !bufp->fastmap_accurate)
01618498 4489 re_compile_fastmap (bufp);
5e69f11e 4490
c8499ba5 4491 /* See whether the pattern is anchored. */
c0f9ea08 4492 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4493
b18215fc 4494#ifdef emacs
cc9b4df2
KH
4495 gl_state.object = re_match_object;
4496 {
99633e97 4497 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4498
4499 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4500 }
b18215fc
RS
4501#endif
4502
fa9a63c5
RM
4503 /* Loop through the string, looking for a place to start matching. */
4504 for (;;)
5e69f11e 4505 {
c8499ba5
RS
4506 /* If the pattern is anchored,
4507 skip quickly past places we cannot match.
4508 We don't bother to treat startpos == 0 specially
4509 because that case doesn't repeat. */
4510 if (anchored_start && startpos > 0)
4511 {
c0f9ea08
SM
4512 if (! ((startpos <= size1 ? string1[startpos - 1]
4513 : string2[startpos - size1 - 1])
4514 == '\n'))
c8499ba5
RS
4515 goto advance;
4516 }
4517
fa9a63c5 4518 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4519 cannot be the start of a match. If the pattern can match the
4520 null string, however, we don't need to skip characters; we want
7814e705 4521 the first null string. */
fa9a63c5
RM
4522 if (fastmap && startpos < total_size && !bufp->can_be_null)
4523 {
66f0296e 4524 register re_char *d;
01618498 4525 register re_wchar_t buf_ch;
e934739e
RS
4526
4527 d = POS_ADDR_VSTRING (startpos);
4528
7814e705 4529 if (range > 0) /* Searching forwards. */
fa9a63c5 4530 {
fa9a63c5
RM
4531 register int lim = 0;
4532 int irange = range;
4533
25fe55af
RS
4534 if (startpos < size1 && startpos + range >= size1)
4535 lim = range - (size1 - startpos);
fa9a63c5 4536
25fe55af
RS
4537 /* Written out as an if-else to avoid testing `translate'
4538 inside the loop. */
28ae27ae
AS
4539 if (RE_TRANSLATE_P (translate))
4540 {
e934739e
RS
4541 if (multibyte)
4542 while (range > lim)
4543 {
4544 int buf_charlen;
4545
4546 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4547 buf_charlen);
e934739e 4548 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4549 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4550 break;
4551
4552 range -= buf_charlen;
4553 d += buf_charlen;
4554 }
4555 else
bf216479 4556 while (range > lim)
33c46939 4557 {
cf9c99bc
KH
4558 register re_wchar_t ch, translated;
4559
bf216479 4560 buf_ch = *d;
cf9c99bc
KH
4561 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4562 translated = RE_TRANSLATE (translate, ch);
4563 if (translated != ch
4564 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4565 buf_ch = ch;
6fdd04b0 4566 if (fastmap[buf_ch])
bf216479 4567 break;
33c46939
RS
4568 d++;
4569 range--;
4570 }
e934739e 4571 }
fa9a63c5 4572 else
6fdd04b0
KH
4573 {
4574 if (multibyte)
4575 while (range > lim)
4576 {
4577 int buf_charlen;
fa9a63c5 4578
6fdd04b0
KH
4579 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4580 buf_charlen);
4581 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4582 break;
4583 range -= buf_charlen;
4584 d += buf_charlen;
4585 }
e934739e 4586 else
6fdd04b0 4587 while (range > lim && !fastmap[*d])
33c46939
RS
4588 {
4589 d++;
4590 range--;
4591 }
e934739e 4592 }
fa9a63c5
RM
4593 startpos += irange - range;
4594 }
7814e705 4595 else /* Searching backwards. */
fa9a63c5 4596 {
2d1675e4
SM
4597 int room = (startpos >= size1
4598 ? size2 + size1 - startpos
4599 : size1 - startpos);
ba5e343c
KH
4600 if (multibyte)
4601 {
6fdd04b0 4602 buf_ch = STRING_CHAR (d, room);
ba5e343c
KH
4603 buf_ch = TRANSLATE (buf_ch);
4604 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4605 goto advance;
4606 }
4607 else
4608 {
cf9c99bc
KH
4609 register re_wchar_t ch, translated;
4610
4611 buf_ch = *d;
4612 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4613 translated = TRANSLATE (ch);
4614 if (translated != ch
4615 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4616 buf_ch = ch;
4617 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4618 goto advance;
4619 }
fa9a63c5
RM
4620 }
4621 }
4622
4623 /* If can't match the null string, and that's all we have left, fail. */
4624 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4625 && !bufp->can_be_null)
fa9a63c5
RM
4626 return -1;
4627
4628 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4629 startpos, regs, stop);
fa9a63c5
RM
4630
4631 if (val >= 0)
4632 return startpos;
5e69f11e 4633
fa9a63c5
RM
4634 if (val == -2)
4635 return -2;
4636
4637 advance:
5e69f11e 4638 if (!range)
25fe55af 4639 break;
5e69f11e 4640 else if (range > 0)
25fe55af 4641 {
b18215fc
RS
4642 /* Update STARTPOS to the next character boundary. */
4643 if (multibyte)
4644 {
66f0296e
SM
4645 re_char *p = POS_ADDR_VSTRING (startpos);
4646 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4647 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4648
4649 range -= len;
4650 if (range < 0)
4651 break;
4652 startpos += len;
4653 }
4654 else
4655 {
b560c397
RS
4656 range--;
4657 startpos++;
4658 }
e318085a 4659 }
fa9a63c5 4660 else
25fe55af
RS
4661 {
4662 range++;
4663 startpos--;
b18215fc
RS
4664
4665 /* Update STARTPOS to the previous character boundary. */
4666 if (multibyte)
4667 {
70806df6
KH
4668 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4669 re_char *p0 = p;
4670 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4671
4672 /* Find the head of multibyte form. */
70806df6
KH
4673 PREV_CHAR_BOUNDARY (p, phead);
4674 range += p0 - 1 - p;
4675 if (range > 0)
4676 break;
b18215fc 4677
70806df6 4678 startpos -= p0 - 1 - p;
b18215fc 4679 }
25fe55af 4680 }
fa9a63c5
RM
4681 }
4682 return -1;
4683} /* re_search_2 */
c0f9ea08 4684WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4685\f
4686/* Declarations and macros for re_match_2. */
4687
2d1675e4
SM
4688static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4689 register int len,
4690 RE_TRANSLATE_TYPE translate,
4691 const int multibyte));
fa9a63c5
RM
4692
/* Translate PTR, which must point into `string1' or `string2', into an
   offset within the virtual concatenation of the two strings: a
   pointer into `string1' yields its distance from `string1', and a
   pointer into `string2' yields its distance from `string2' plus
   `size1'.  Uses the `string1', `string2' and `size1' variables of the
   function in which it is expanded.  */
#define POINTER_TO_OFFSET(ptr) \
  ((regoff_t) (FIRST_STRING_P (ptr) \
               ? (ptr) - string1 \
               : (ptr) - string2 + size1))
4699
/* Invoke before reading a character through `*d'.  As long as `d' sits
   at the end of the current string (`dend'), either give up -- when
   that end is `end_match_2' there is nothing further to read -- or
   carry on at the start of `string2'.
   See re_match_2_internal for why `end_match_2' may be equal to
   `end_match_1' instead of lying within `string2'.  */
#define PREFETCH() \
  while (d == dend) \
    { \
      /* Already at the final limit => fail.  */ \
      if (dend == end_match_2) \
        goto fail; \
      /* Finished string1 => continue in string2.  */ \
      dend = end_match_2; \
      d = string2; \
    }
4714
f1ad044f
SM
/* Invoke before reading a character through `*d' when the other limits
   have already been checked.  Intended for lookahead operations such
   as wordend, which may have to inspect text lying outside of the
   LIMITs (i.e. past `stop').  Unlike PREFETCH this never fails: it
   only moves `d' from the end of string1 to the start of string2.  */
#define PREFETCH_NOLIMIT() \
  if (d == end1) \
    { \
      dend = end_match_2; \
      d = string2; \
    }
fa9a63c5
RM
4725
/* Nonzero if D is at the very beginning (AT_STRINGS_BEG) or the very
   end (AT_STRINGS_END) of the virtual concatenation of `string1' and
   `string2'.  When there is only one string, it is `string2'.
   AT_STRINGS_BEG is also nonzero whenever `size2' is zero.  */
#define AT_STRINGS_BEG(d) \
  ((d) == (size1 != 0 ? string1 : string2) || size2 == 0)
#define AT_STRINGS_END(d) (end2 == (d))
fa9a63c5
RM
4730
4731
/* Nonzero if the character D points to has word syntax (Sword).  Two
   positions need special treatment: when D is just past the end of
   string1, the first character of string2 is examined; when D is just
   before the beginning of string2, the last character of string1 is
   examined.  */
#define WORDCHAR_P(d) \
  (Sword == SYNTAX ((d) == end1 ? string2[0] \
                    : (d) == string2 - 1 ? end1[-1] \
                    : *(d)))
4740
/* AT_WORD_BOUNDARY is disabled.  It was first switched off because of
   a compiler bug, described in the note below (which used to sit at
   `case wordbound'), and it is no longer used at all so that multibyte
   forms can be supported.

   The DEC Alpha C compiler 3.x generates incorrect code for the
   test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
   AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
   macro and introducing temporary variables works around the bug.  */

#if 0
/* Nonzero if D is at either end of the strings, or if the character
   before D and the character at D differ with respect to being
   word-constituent.  */
#define AT_WORD_BOUNDARY(d) \
  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
   || WORDCHAR_P ((d) - 1) != WORDCHAR_P (d))
#endif
fa9a63c5
RM
4758
/* Release everything the matcher allocated.  */
#ifdef MATCH_MAY_ALLOCATE
/* Free VAR if it is non-null and reset it to NULL.  The dangling
   `else' swallows the semicolon written after the invocation.  */
# define FREE_VAR(var) \
  if (var) \
    { \
      REGEX_FREE (var); \
      var = NULL; \
    } \
  else
/* Free the failure stack, then the four register vectors.  */
# define FREE_VARIABLES() \
  do \
    { \
      REGEX_FREE_STACK (fail_stack.stack); \
      FREE_VAR (regstart); \
      FREE_VAR (regend); \
      FREE_VAR (best_regstart); \
      FREE_VAR (best_regend); \
    } \
  while (0)
#else
/* Do nothing, but still expand to an expression so that gcc does not
   warn.  */
# define FREE_VARIABLES() ((void) 0)
#endif /* not MATCH_MAY_ALLOCATE */
4773
505bde11
SM
4774\f
4775/* Optimization routines. */
4776
4e8a9132
SM
4777/* If the operation is a match against one or more chars,
4778 return a pointer to the next operation, else return NULL. */
01618498 4779static re_char *
4e8a9132 4780skip_one_char (p)
01618498 4781 re_char *p;
4e8a9132
SM
4782{
4783 switch (SWITCH_ENUM_CAST (*p++))
4784 {
4785 case anychar:
4786 break;
177c0ea7 4787
4e8a9132
SM
4788 case exactn:
4789 p += *p + 1;
4790 break;
4791
4792 case charset_not:
4793 case charset:
4794 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4795 {
4796 int mcnt;
4797 p = CHARSET_RANGE_TABLE (p - 1);
4798 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4799 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4800 }
4801 else
4802 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4803 break;
177c0ea7 4804
4e8a9132
SM
4805 case syntaxspec:
4806 case notsyntaxspec:
1fb352e0 4807#ifdef emacs
4e8a9132
SM
4808 case categoryspec:
4809 case notcategoryspec:
4810#endif /* emacs */
4811 p++;
4812 break;
4813
4814 default:
4815 p = NULL;
4816 }
4817 return p;
4818}
4819
4820
505bde11 4821/* Jump over non-matching operations. */
839966f3 4822static re_char *
4e8a9132 4823skip_noops (p, pend)
839966f3 4824 re_char *p, *pend;
505bde11
SM
4825{
4826 int mcnt;
4827 while (p < pend)
4828 {
4829 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4830 {
4831 case start_memory:
505bde11
SM
4832 case stop_memory:
4833 p += 2; break;
4834 case no_op:
4835 p += 1; break;
4836 case jump:
4837 p += 1;
4838 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4839 p += mcnt;
4840 break;
4841 default:
4842 return p;
4843 }
4844 }
4845 assert (p == pend);
4846 return p;
4847}
4848
4849/* Non-zero if "p1 matches something" implies "p2 fails". */
4850static int
4851mutually_exclusive_p (bufp, p1, p2)
4852 struct re_pattern_buffer *bufp;
839966f3 4853 re_char *p1, *p2;
505bde11 4854{
4e8a9132 4855 re_opcode_t op2;
2d1675e4 4856 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4857 unsigned char *pend = bufp->buffer + bufp->used;
4858
4e8a9132 4859 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4860 && p2 >= bufp->buffer && p2 <= pend);
4861
4862 /* Skip over open/close-group commands.
4863 If what follows this loop is a ...+ construct,
4864 look at what begins its body, since we will have to
4865 match at least one of that. */
4e8a9132
SM
4866 p2 = skip_noops (p2, pend);
4867 /* The same skip can be done for p1, except that this function
4868 is only used in the case where p1 is a simple match operator. */
4869 /* p1 = skip_noops (p1, pend); */
4870
4871 assert (p1 >= bufp->buffer && p1 < pend
4872 && p2 >= bufp->buffer && p2 <= pend);
4873
4874 op2 = p2 == pend ? succeed : *p2;
4875
4876 switch (SWITCH_ENUM_CAST (op2))
505bde11 4877 {
4e8a9132
SM
4878 case succeed:
4879 case endbuf:
4880 /* If we're at the end of the pattern, we can change. */
4881 if (skip_one_char (p1))
505bde11 4882 {
505bde11
SM
4883 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4884 return 1;
505bde11 4885 }
4e8a9132 4886 break;
177c0ea7 4887
4e8a9132 4888 case endline:
4e8a9132
SM
4889 case exactn:
4890 {
01618498 4891 register re_wchar_t c
4e8a9132 4892 = (re_opcode_t) *p2 == endline ? '\n'
cf9c99bc 4893 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2, multibyte);
505bde11 4894
4e8a9132
SM
4895 if ((re_opcode_t) *p1 == exactn)
4896 {
cf9c99bc 4897 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2, multibyte))
4e8a9132
SM
4898 {
4899 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4900 return 1;
4901 }
4902 }
505bde11 4903
4e8a9132
SM
4904 else if ((re_opcode_t) *p1 == charset
4905 || (re_opcode_t) *p1 == charset_not)
4906 {
4907 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4908
4e8a9132
SM
4909 /* Test if C is listed in charset (or charset_not)
4910 at `p1'. */
6fdd04b0 4911 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4912 {
4913 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4914 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4915 not = !not;
4916 }
4917 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4918 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4919
4e8a9132
SM
4920 /* `not' is equal to 1 if c would match, which means
4921 that we can't change to pop_failure_jump. */
4922 if (!not)
4923 {
4924 DEBUG_PRINT1 (" No match => fast loop.\n");
4925 return 1;
4926 }
4927 }
4928 else if ((re_opcode_t) *p1 == anychar
4929 && c == '\n')
4930 {
4931 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4932 return 1;
4933 }
4934 }
4935 break;
505bde11 4936
4e8a9132 4937 case charset:
4e8a9132
SM
4938 {
4939 if ((re_opcode_t) *p1 == exactn)
4940 /* Reuse the code above. */
4941 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4942
505bde11
SM
4943 /* It is hard to list up all the character in charset
4944 P2 if it includes multibyte character. Give up in
4945 such case. */
4946 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4947 {
4948 /* Now, we are sure that P2 has no range table.
4949 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4950 enough. But P1 may have range table, so the
505bde11
SM
4951 size of bitmap table of P1 is extracted by
4952 using macro `CHARSET_BITMAP_SIZE'.
4953
6fdd04b0
KH
4954 In a multibyte case, we know that all the character
4955 listed in P2 is ASCII. In a unibyte case, P1 has only a
4956 bitmap table. So, in both cases, it is enough to test
4957 only the bitmap table of P1. */
505bde11 4958
411e4203 4959 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4960 {
4961 int idx;
4962 /* We win if the charset inside the loop
4963 has no overlap with the one after the loop. */
4964 for (idx = 0;
4965 (idx < (int) p2[1]
4966 && idx < CHARSET_BITMAP_SIZE (p1));
4967 idx++)
4968 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4969 break;
4970
4971 if (idx == p2[1]
4972 || idx == CHARSET_BITMAP_SIZE (p1))
4973 {
4974 DEBUG_PRINT1 (" No match => fast loop.\n");
4975 return 1;
4976 }
4977 }
411e4203 4978 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4979 {
4980 int idx;
4981 /* We win if the charset_not inside the loop lists
7814e705 4982 every character listed in the charset after. */
505bde11
SM
4983 for (idx = 0; idx < (int) p2[1]; idx++)
4984 if (! (p2[2 + idx] == 0
4985 || (idx < CHARSET_BITMAP_SIZE (p1)
4986 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4987 break;
4988
4e8a9132
SM
4989 if (idx == p2[1])
4990 {
4991 DEBUG_PRINT1 (" No match => fast loop.\n");
4992 return 1;
4993 }
4994 }
4995 }
4996 }
609b757a 4997 break;
177c0ea7 4998
411e4203
SM
4999 case charset_not:
5000 switch (SWITCH_ENUM_CAST (*p1))
5001 {
5002 case exactn:
5003 case charset:
5004 /* Reuse the code above. */
5005 return mutually_exclusive_p (bufp, p2, p1);
5006 case charset_not:
5007 /* When we have two charset_not, it's very unlikely that
5008 they don't overlap. The union of the two sets of excluded
5009 chars should cover all possible chars, which, as a matter of
5010 fact, is virtually impossible in multibyte buffers. */
36595814 5011 break;
411e4203
SM
5012 }
5013 break;
5014
4e8a9132 5015 case wordend:
669fa600
SM
5016 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
5017 case symend:
4e8a9132 5018 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
5019 && (p1[1] == Ssymbol || p1[1] == Sword));
5020 case notsyntaxspec:
5021 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
5022
5023 case wordbeg:
669fa600
SM
5024 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
5025 case symbeg:
4e8a9132 5026 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
5027 && (p1[1] == Ssymbol || p1[1] == Sword));
5028 case syntaxspec:
5029 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
5030
5031 case wordbound:
5032 return (((re_opcode_t) *p1 == notsyntaxspec
5033 || (re_opcode_t) *p1 == syntaxspec)
5034 && p1[1] == Sword);
5035
1fb352e0 5036#ifdef emacs
4e8a9132
SM
5037 case categoryspec:
5038 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
5039 case notcategoryspec:
5040 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
5041#endif /* emacs */
5042
5043 default:
5044 ;
505bde11
SM
5045 }
5046
5047 /* Safe default. */
5048 return 0;
5049}
5050
fa9a63c5
RM
5051\f
5052/* Matching routines. */
5053
25fe55af 5054#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
5055/* re_match is like re_match_2 except it takes only a single string. */
5056
5057int
5058re_match (bufp, string, size, pos, regs)
5059 struct re_pattern_buffer *bufp;
5060 const char *string;
5061 int size, pos;
5062 struct re_registers *regs;
5063{
4bb91c68 5064 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 5065 pos, regs, size);
fa9a63c5
RM
5066 return result;
5067}
c0f9ea08 5068WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
5069#endif /* not emacs */
5070
b18215fc
RS
#ifdef emacs
/* In Emacs, the string or buffer that is being matched against.
   re_match_2 stores it in `gl_state.object' and uses it to set up the
   syntax table, so that syntax properties can be looked up.  */
Lisp_Object re_match_object;
#endif
fa9a63c5
RM
5076
5077/* re_match_2 matches the compiled pattern in BUFP against the
5078 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5079 and SIZE2, respectively). We start matching at POS, and stop
5080 matching at STOP.
5e69f11e 5081
fa9a63c5 5082 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 5083 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
5084 documentation for exactly how many groups we fill.
5085
5086 We return -1 if no match, -2 if an internal error (such as the
7814e705 5087 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
5088 matched substring. */
5089
5090int
5091re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
5092 struct re_pattern_buffer *bufp;
5093 const char *string1, *string2;
5094 int size1, size2;
5095 int pos;
5096 struct re_registers *regs;
5097 int stop;
5098{
b18215fc 5099 int result;
25fe55af 5100
b18215fc 5101#ifdef emacs
cc9b4df2
KH
5102 int charpos;
5103 gl_state.object = re_match_object;
99633e97 5104 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 5105 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
5106#endif
5107
4bb91c68
SM
5108 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5109 (re_char*) string2, size2,
cc9b4df2 5110 pos, regs, stop);
fa9a63c5
RM
5111 return result;
5112}
c0f9ea08 5113WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 5114
bf216479 5115
fa9a63c5 5116/* This is a separate function so that we can force an alloca cleanup
7814e705 5117 afterwards. */
fa9a63c5
RM
5118static int
5119re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5120 struct re_pattern_buffer *bufp;
66f0296e 5121 re_char *string1, *string2;
fa9a63c5
RM
5122 int size1, size2;
5123 int pos;
5124 struct re_registers *regs;
5125 int stop;
5126{
5127 /* General temporaries. */
5128 int mcnt;
01618498 5129 size_t reg;
66f0296e 5130 boolean not;
fa9a63c5
RM
5131
5132 /* Just past the end of the corresponding string. */
66f0296e 5133 re_char *end1, *end2;
fa9a63c5
RM
5134
5135 /* Pointers into string1 and string2, just past the last characters in
7814e705 5136 each to consider matching. */
66f0296e 5137 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
5138
5139 /* Where we are in the data, and the end of the current string. */
66f0296e 5140 re_char *d, *dend;
5e69f11e 5141
99633e97
SM
5142 /* Used sometimes to remember where we were before starting matching
5143 an operator so that we can go back in case of failure. This "atomic"
5144 behavior of matching opcodes is indispensable to the correctness
5145 of the on_failure_keep_string_jump optimization. */
5146 re_char *dfail;
5147
fa9a63c5 5148 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5149 re_char *p = bufp->buffer;
5150 re_char *pend = p + bufp->used;
fa9a63c5 5151
25fe55af 5152 /* We use this to map every character in the string. */
6676cb1c 5153 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5154
cf9c99bc 5155 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5156 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5157
cf9c99bc
KH
5158 /* Nonzero if STRING1/STRING2 are multibyte. */
5159 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5160
fa9a63c5
RM
5161 /* Failure point stack. Each place that can handle a failure further
5162 down the line pushes a failure point on this stack. It consists of
505bde11 5163 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5164 the subexpressions we're currently inside, plus the number of such
5165 registers, and, finally, two char *'s. The first char * is where
5166 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5167 scanning the strings. */
5168#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5169 fail_stack_type fail_stack;
5170#endif
5171#ifdef DEBUG
fa9a63c5
RM
5172 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5173#endif
5174
0b32bf0e 5175#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5176 /* This holds the pointer to the failure stack, when
5177 it is allocated relocatably. */
5178 fail_stack_elt_t *failure_stack_ptr;
99633e97 5179#endif
fa9a63c5
RM
5180
5181 /* We fill all the registers internally, independent of what we
7814e705 5182 return, for use in backreferences. The number here includes
fa9a63c5 5183 an element for register zero. */
4bb91c68 5184 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5185
fa9a63c5
RM
5186 /* Information on the contents of registers. These are pointers into
5187 the input strings; they record just what was matched (on this
5188 attempt) by a subexpression part of the pattern, that is, the
5189 regnum-th regstart pointer points to where in the pattern we began
5190 matching and the regnum-th regend points to right after where we
5191 stopped matching the regnum-th subexpression. (The zeroth register
5192 keeps track of what the whole pattern matches.) */
5193#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5194 re_char **regstart, **regend;
fa9a63c5
RM
5195#endif
5196
fa9a63c5 5197 /* The following record the register info as found in the above
5e69f11e 5198 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5199 This happens as we backtrack through the failure points, which in
5200 turn happens only if we have not yet matched the entire string. */
5201 unsigned best_regs_set = false;
5202#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5203 re_char **best_regstart, **best_regend;
fa9a63c5 5204#endif
5e69f11e 5205
fa9a63c5
RM
5206 /* Logically, this is `best_regend[0]'. But we don't want to have to
5207 allocate space for that if we're not allocating space for anything
7814e705 5208 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5209 any of the other register vectors, and it seems rather a kludge to
5210 treat `best_regend' differently than the rest. So we keep track of
5211 the end of the best match so far in a separate variable. We
5212 initialize this to NULL so that when we backtrack the first time
5213 and need to test it, it's not garbage. */
66f0296e 5214 re_char *match_end = NULL;
fa9a63c5 5215
fa9a63c5
RM
5216#ifdef DEBUG
5217 /* Counts the total number of registers pushed. */
5e69f11e 5218 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5219#endif
5220
5221 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5222
fa9a63c5 5223 INIT_FAIL_STACK ();
5e69f11e 5224
fa9a63c5
RM
5225#ifdef MATCH_MAY_ALLOCATE
5226 /* Do not bother to initialize all the register variables if there are
5227 no groups in the pattern, as it takes a fair amount of time. If
5228 there are groups, we include space for register 0 (the whole
5229 pattern), even though we never use it, since it simplifies the
5230 array indexing. We should fix this. */
5231 if (bufp->re_nsub)
5232 {
66f0296e
SM
5233 regstart = REGEX_TALLOC (num_regs, re_char *);
5234 regend = REGEX_TALLOC (num_regs, re_char *);
5235 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5236 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5237
505bde11 5238 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5239 {
5240 FREE_VARIABLES ();
5241 return -2;
5242 }
fa9a63c5
RM
5243 }
5244 else
5245 {
5246 /* We must initialize all our variables to NULL, so that
25fe55af 5247 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5248 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5249 }
5250#endif /* MATCH_MAY_ALLOCATE */
5251
5252 /* The starting position is bogus. */
5253 if (pos < 0 || pos > size1 + size2)
5254 {
5255 FREE_VARIABLES ();
5256 return -1;
5257 }
5e69f11e 5258
fa9a63c5
RM
5259 /* Initialize subexpression text positions to -1 to mark ones that no
5260 start_memory/stop_memory has been seen for. Also initialize the
5261 register information struct. */
01618498
SM
5262 for (reg = 1; reg < num_regs; reg++)
5263 regstart[reg] = regend[reg] = NULL;
99633e97 5264
fa9a63c5 5265 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5266 `string1' is null. */
fa9a63c5
RM
5267 if (size2 == 0 && string1 != NULL)
5268 {
5269 string2 = string1;
5270 size2 = size1;
5271 string1 = 0;
5272 size1 = 0;
5273 }
5274 end1 = string1 + size1;
5275 end2 = string2 + size2;
5276
5e69f11e 5277 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5278 `dend' is the end of the input string that `d' points within. `d'
5279 is advanced into the following input string whenever necessary, but
5280 this happens before fetching; therefore, at the beginning of the
5281 loop, `d' can be pointing at the end of a string, but it cannot
5282 equal `string2'. */
419d1c74 5283 if (pos >= size1)
fa9a63c5 5284 {
419d1c74
SM
5285 /* Only match within string2. */
5286 d = string2 + pos - size1;
5287 dend = end_match_2 = string2 + stop - size1;
5288 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5289 }
5290 else
5291 {
f1ad044f 5292 if (stop < size1)
419d1c74
SM
5293 {
5294 /* Only match within string1. */
5295 end_match_1 = string1 + stop;
5296 /* BEWARE!
5297 When we reach end_match_1, PREFETCH normally switches to string2.
5298 But in the present case, this means that just doing a PREFETCH
5299 makes us jump from `stop' to `gap' within the string.
5300 What we really want here is for the search to stop as
5301 soon as we hit end_match_1. That's why we set end_match_2
5302 to end_match_1 (since PREFETCH fails as soon as we hit
5303 end_match_2). */
5304 end_match_2 = end_match_1;
5305 }
5306 else
f1ad044f
SM
5307 { /* It's important to use this code when stop == size so that
5308 moving `d' from end1 to string2 will not prevent the d == dend
5309 check from catching the end of string. */
419d1c74
SM
5310 end_match_1 = end1;
5311 end_match_2 = string2 + stop - size1;
5312 }
5313 d = string1 + pos;
5314 dend = end_match_1;
fa9a63c5
RM
5315 }
5316
5317 DEBUG_PRINT1 ("The compiled pattern is: ");
5318 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5319 DEBUG_PRINT1 ("The string to match is: `");
5320 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5321 DEBUG_PRINT1 ("'\n");
5e69f11e 5322
7814e705 5323 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5324 function if the match is complete, or it drops through if the match
5325 fails at this starting point in the input data. */
5326 for (;;)
5327 {
505bde11 5328 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5329
5330 if (p == pend)
5331 { /* End of pattern means we might have succeeded. */
25fe55af 5332 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5333
fa9a63c5 5334 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5335 longest match, try backtracking. */
5336 if (d != end_match_2)
fa9a63c5
RM
5337 {
5338 /* 1 if this match ends in the same string (string1 or string2)
5339 as the best previous match. */
5e69f11e 5340 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5341 == FIRST_STRING_P (d));
fa9a63c5
RM
5342 /* 1 if this match is the best seen so far. */
5343 boolean best_match_p;
5344
5345 /* AIX compiler got confused when this was combined
7814e705 5346 with the previous declaration. */
fa9a63c5
RM
5347 if (same_str_p)
5348 best_match_p = d > match_end;
5349 else
99633e97 5350 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5351
25fe55af
RS
5352 DEBUG_PRINT1 ("backtracking.\n");
5353
5354 if (!FAIL_STACK_EMPTY ())
5355 { /* More failure points to try. */
5356
5357 /* If exceeds best match so far, save it. */
5358 if (!best_regs_set || best_match_p)
5359 {
5360 best_regs_set = true;
5361 match_end = d;
5362
5363 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5364
01618498 5365 for (reg = 1; reg < num_regs; reg++)
25fe55af 5366 {
01618498
SM
5367 best_regstart[reg] = regstart[reg];
5368 best_regend[reg] = regend[reg];
25fe55af
RS
5369 }
5370 }
5371 goto fail;
5372 }
5373
5374 /* If no failure points, don't restore garbage. And if
5375 last match is real best match, don't restore second
5376 best one. */
5377 else if (best_regs_set && !best_match_p)
5378 {
5379 restore_best_regs:
5380 /* Restore best match. It may happen that `dend ==
5381 end_match_1' while the restored d is in string2.
5382 For example, the pattern `x.*y.*z' against the
5383 strings `x-' and `y-z-', if the two strings are
7814e705 5384 not consecutive in memory. */
25fe55af
RS
5385 DEBUG_PRINT1 ("Restoring best registers.\n");
5386
5387 d = match_end;
5388 dend = ((d >= string1 && d <= end1)
5389 ? end_match_1 : end_match_2);
fa9a63c5 5390
01618498 5391 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5392 {
01618498
SM
5393 regstart[reg] = best_regstart[reg];
5394 regend[reg] = best_regend[reg];
fa9a63c5 5395 }
25fe55af
RS
5396 }
5397 } /* d != end_match_2 */
fa9a63c5
RM
5398
5399 succeed_label:
25fe55af 5400 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5401
25fe55af
RS
5402 /* If caller wants register contents data back, do it. */
5403 if (regs && !bufp->no_sub)
fa9a63c5 5404 {
25fe55af
RS
5405 /* Have the register data arrays been allocated? */
5406 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5407 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5408 extra element beyond `num_regs' for the `-1' marker
5409 GNU code uses. */
5410 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5411 regs->start = TALLOC (regs->num_regs, regoff_t);
5412 regs->end = TALLOC (regs->num_regs, regoff_t);
5413 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5414 {
5415 FREE_VARIABLES ();
5416 return -2;
5417 }
25fe55af
RS
5418 bufp->regs_allocated = REGS_REALLOCATE;
5419 }
5420 else if (bufp->regs_allocated == REGS_REALLOCATE)
5421 { /* Yes. If we need more elements than were already
5422 allocated, reallocate them. If we need fewer, just
5423 leave it alone. */
5424 if (regs->num_regs < num_regs + 1)
5425 {
5426 regs->num_regs = num_regs + 1;
5427 RETALLOC (regs->start, regs->num_regs, regoff_t);
5428 RETALLOC (regs->end, regs->num_regs, regoff_t);
5429 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5430 {
5431 FREE_VARIABLES ();
5432 return -2;
5433 }
25fe55af
RS
5434 }
5435 }
5436 else
fa9a63c5
RM
5437 {
5438 /* These braces fend off a "empty body in an else-statement"
7814e705 5439 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5440 assert (bufp->regs_allocated == REGS_FIXED);
5441 }
5442
25fe55af
RS
5443 /* Convert the pointer data in `regstart' and `regend' to
5444 indices. Register zero has to be set differently,
5445 since we haven't kept track of any info for it. */
5446 if (regs->num_regs > 0)
5447 {
5448 regs->start[0] = pos;
99633e97 5449 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5450 }
5e69f11e 5451
25fe55af
RS
5452 /* Go through the first `min (num_regs, regs->num_regs)'
5453 registers, since that is all we initialized. */
01618498 5454 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5455 {
01618498
SM
5456 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5457 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5458 else
5459 {
01618498
SM
5460 regs->start[reg]
5461 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5462 regs->end[reg]
5463 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5464 }
fa9a63c5 5465 }
5e69f11e 5466
25fe55af
RS
5467 /* If the regs structure we return has more elements than
5468 were in the pattern, set the extra elements to -1. If
5469 we (re)allocated the registers, this is the case,
5470 because we always allocate enough to have at least one
7814e705 5471 -1 at the end. */
01618498
SM
5472 for (reg = num_regs; reg < regs->num_regs; reg++)
5473 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5474 } /* regs && !bufp->no_sub */
5475
25fe55af
RS
5476 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5477 nfailure_points_pushed, nfailure_points_popped,
5478 nfailure_points_pushed - nfailure_points_popped);
5479 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5480
99633e97 5481 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5482
25fe55af 5483 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5484
25fe55af
RS
5485 FREE_VARIABLES ();
5486 return mcnt;
5487 }
fa9a63c5 5488
7814e705 5489 /* Otherwise match next pattern command. */
fa9a63c5
RM
5490 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5491 {
25fe55af
RS
5492 /* Ignore these. Used to ignore the n of succeed_n's which
5493 currently have n == 0. */
5494 case no_op:
5495 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5496 break;
fa9a63c5
RM
5497
5498 case succeed:
25fe55af 5499 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5500 goto succeed_label;
5501
7814e705 5502 /* Match the next n pattern characters exactly. The following
25fe55af 5503 byte in the pattern defines n, and the n bytes after that
7814e705 5504 are the characters to match. */
fa9a63c5
RM
5505 case exactn:
5506 mcnt = *p++;
25fe55af 5507 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5508
99633e97
SM
5509 /* Remember the start point to rollback upon failure. */
5510 dfail = d;
5511
6fdd04b0 5512#ifndef emacs
25fe55af
RS
5513 /* This is written out as an if-else so we don't waste time
5514 testing `translate' inside the loop. */
28703c16 5515 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5516 do
5517 {
5518 PREFETCH ();
5519 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5520 {
6fdd04b0
KH
5521 d = dfail;
5522 goto fail;
e934739e 5523 }
6fdd04b0
KH
5524 d++;
5525 }
5526 while (--mcnt);
fa9a63c5 5527 else
6fdd04b0
KH
5528 do
5529 {
5530 PREFETCH ();
5531 if (*d++ != *p++)
bf216479 5532 {
6fdd04b0
KH
5533 d = dfail;
5534 goto fail;
bf216479 5535 }
6fdd04b0
KH
5536 }
5537 while (--mcnt);
5538#else /* emacs */
5539 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5540 if (target_multibyte)
6fdd04b0
KH
5541 do
5542 {
5543 int pat_charlen, buf_charlen;
cf9c99bc 5544 int pat_ch, buf_ch;
e934739e 5545
6fdd04b0 5546 PREFETCH ();
cf9c99bc
KH
5547 if (multibyte)
5548 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5549 else
5550 {
5551 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5552 pat_charlen = 1;
5553 }
6fdd04b0 5554 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e 5555
6fdd04b0 5556 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5557 {
6fdd04b0
KH
5558 d = dfail;
5559 goto fail;
e934739e 5560 }
bf216479 5561
6fdd04b0
KH
5562 p += pat_charlen;
5563 d += buf_charlen;
5564 mcnt -= pat_charlen;
5565 }
5566 while (mcnt > 0);
fa9a63c5 5567 else
6fdd04b0
KH
5568 do
5569 {
cf9c99bc
KH
5570 int pat_charlen, buf_charlen;
5571 int pat_ch, buf_ch;
bf216479 5572
6fdd04b0 5573 PREFETCH ();
cf9c99bc
KH
5574 if (multibyte)
5575 {
5576 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5577 if (CHAR_BYTE8_P (pat_ch))
5578 pat_ch = CHAR_TO_BYTE8 (pat_ch);
5579 else
5580 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
5581 }
5582 else
5583 {
5584 pat_ch = *p;
5585 pat_charlen = 1;
5586 }
5587 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5588 if (! CHAR_BYTE8_P (buf_ch))
5589 {
5590 buf_ch = TRANSLATE (buf_ch);
5591 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5592 if (buf_ch < 0)
5593 buf_ch = *d;
5594 }
0e2501ed
AS
5595 else
5596 buf_ch = *d;
cf9c99bc 5597 if (buf_ch != pat_ch)
6fdd04b0
KH
5598 {
5599 d = dfail;
5600 goto fail;
bf216479 5601 }
cf9c99bc
KH
5602 p += pat_charlen;
5603 d++;
6fdd04b0
KH
5604 }
5605 while (--mcnt);
5606#endif
25fe55af 5607 break;
fa9a63c5
RM
5608
5609
25fe55af 5610 /* Match any character except possibly a newline or a null. */
fa9a63c5 5611 case anychar:
e934739e
RS
5612 {
5613 int buf_charlen;
01618498 5614 re_wchar_t buf_ch;
fa9a63c5 5615
e934739e 5616 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5617
e934739e 5618 PREFETCH ();
cf9c99bc
KH
5619 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen,
5620 target_multibyte);
e934739e
RS
5621 buf_ch = TRANSLATE (buf_ch);
5622
5623 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5624 && buf_ch == '\n')
5625 || ((bufp->syntax & RE_DOT_NOT_NULL)
5626 && buf_ch == '\000'))
5627 goto fail;
5628
e934739e
RS
5629 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5630 d += buf_charlen;
5631 }
fa9a63c5
RM
5632 break;
5633
5634
5635 case charset:
5636 case charset_not:
5637 {
b18215fc 5638 register unsigned int c;
fa9a63c5 5639 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5640 int len;
5641
5642 /* Start of actual range_table, or end of bitmap if there is no
5643 range table. */
01618498 5644 re_char *range_table;
b18215fc 5645
96cc36cc 5646 /* Nonzero if there is a range table. */
b18215fc
RS
5647 int range_table_exists;
5648
96cc36cc
RS
5649 /* Number of ranges of range table. This is not included
5650 in the initial byte-length of the command. */
5651 int count = 0;
fa9a63c5 5652
f5020181
AS
5653 /* Whether matching against a unibyte character. */
5654 boolean unibyte_char = false;
5655
25fe55af 5656 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5657
b18215fc 5658 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5659
b18215fc 5660 if (range_table_exists)
96cc36cc
RS
5661 {
5662 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5663 EXTRACT_NUMBER_AND_INCR (count, range_table);
5664 }
b18215fc 5665
2d1675e4 5666 PREFETCH ();
cf9c99bc
KH
5667 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len, target_multibyte);
5668 if (target_multibyte)
5669 {
5670 int c1;
b18215fc 5671
cf9c99bc
KH
5672 c = TRANSLATE (c);
5673 c1 = RE_CHAR_TO_UNIBYTE (c);
5674 if (c1 >= 0)
f5020181
AS
5675 {
5676 unibyte_char = true;
5677 c = c1;
5678 }
cf9c99bc
KH
5679 }
5680 else
5681 {
5682 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5683
5684 if (! CHAR_BYTE8_P (c1))
5685 {
5686 c1 = TRANSLATE (c1);
5687 c1 = RE_CHAR_TO_UNIBYTE (c1);
5688 if (c1 >= 0)
f5020181
AS
5689 {
5690 unibyte_char = true;
5691 c = c1;
5692 }
cf9c99bc 5693 }
0b8be006
AS
5694 else
5695 unibyte_char = true;
cf9c99bc
KH
5696 }
5697
f5020181 5698 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5699 { /* Lookup bitmap. */
b18215fc
RS
5700 /* Cast to `unsigned' instead of `unsigned char' in
5701 case the bit list is a full 32 bytes long. */
5702 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5703 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5704 not = !not;
b18215fc 5705 }
96cc36cc 5706#ifdef emacs
b18215fc 5707 else if (range_table_exists)
96cc36cc
RS
5708 {
5709 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5710
14473664
SM
5711 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5712 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5713 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5714 | (class_bits & BIT_SPACE && ISSPACE (c))
5715 | (class_bits & BIT_UPPER && ISUPPER (c))
5716 | (class_bits & BIT_WORD && ISWORD (c)))
5717 not = !not;
5718 else
5719 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5720 }
5721#endif /* emacs */
fa9a63c5 5722
96cc36cc
RS
5723 if (range_table_exists)
5724 p = CHARSET_RANGE_TABLE_END (range_table, count);
5725 else
5726 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5727
5728 if (!not) goto fail;
5e69f11e 5729
b18215fc 5730 d += len;
fa9a63c5
RM
5731 break;
5732 }
5733
5734
25fe55af 5735 /* The beginning of a group is represented by start_memory.
505bde11 5736 The argument is the register number. The text
25fe55af 5737 matched within the group is recorded (in the internal
7814e705 5738 registers data structure) under the register number. */
25fe55af 5739 case start_memory:
505bde11
SM
5740 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5741
5742 /* In case we need to undo this operation (via backtracking). */
5743 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5744
25fe55af 5745 regstart[*p] = d;
4bb91c68 5746 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5747 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5748
25fe55af 5749 /* Move past the register number and inner group count. */
505bde11 5750 p += 1;
25fe55af 5751 break;
fa9a63c5
RM
5752
5753
25fe55af 5754 /* The stop_memory opcode represents the end of a group. Its
505bde11 5755 argument is the same as start_memory's: the register number. */
fa9a63c5 5756 case stop_memory:
505bde11
SM
5757 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5758
5759 assert (!REG_UNSET (regstart[*p]));
5760 /* Strictly speaking, there should be code such as:
177c0ea7 5761
0b32bf0e 5762 assert (REG_UNSET (regend[*p]));
505bde11
SM
5763 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5764
5765 But the only info to be pushed is regend[*p] and it is known to
5766 be UNSET, so there really isn't anything to push.
5767 Not pushing anything, on the other hand deprives us from the
5768 guarantee that regend[*p] is UNSET since undoing this operation
5769 will not reset its value properly. This is not important since
5770 the value will only be read on the next start_memory or at
5771 the very end and both events can only happen if this stop_memory
5772 is *not* undone. */
fa9a63c5 5773
25fe55af 5774 regend[*p] = d;
fa9a63c5
RM
5775 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5776
25fe55af 5777 /* Move past the register number and the inner group count. */
505bde11 5778 p += 1;
25fe55af 5779 break;
fa9a63c5
RM
5780
5781
5782 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5783 followed by the numeric value of <digit> as the register number. */
5784 case duplicate:
fa9a63c5 5785 {
66f0296e 5786 register re_char *d2, *dend2;
7814e705 5787 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5788 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5789
7814e705 5790 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5791 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5792 goto fail;
5e69f11e 5793
7814e705 5794 /* Where in input to try to start matching. */
25fe55af 5795 d2 = regstart[regno];
5e69f11e 5796
99633e97
SM
5797 /* Remember the start point to rollback upon failure. */
5798 dfail = d;
5799
25fe55af
RS
5800 /* Where to stop matching; if both the place to start and
5801 the place to stop matching are in the same string, then
5802 set to the place to stop, otherwise, for now have to use
5803 the end of the first string. */
fa9a63c5 5804
25fe55af 5805 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5806 == FIRST_STRING_P (regend[regno]))
5807 ? regend[regno] : end_match_1);
5808 for (;;)
5809 {
5810 /* If necessary, advance to next segment in register
25fe55af 5811 contents. */
fa9a63c5
RM
5812 while (d2 == dend2)
5813 {
5814 if (dend2 == end_match_2) break;
5815 if (dend2 == regend[regno]) break;
5816
25fe55af
RS
5817 /* End of string1 => advance to string2. */
5818 d2 = string2;
5819 dend2 = regend[regno];
fa9a63c5
RM
5820 }
5821 /* At end of register contents => success */
5822 if (d2 == dend2) break;
5823
5824 /* If necessary, advance to next segment in data. */
5825 PREFETCH ();
5826
5827 /* How many characters left in this segment to match. */
5828 mcnt = dend - d;
5e69f11e 5829
fa9a63c5 5830 /* Want how many consecutive characters we can match in
25fe55af
RS
5831 one shot, so, if necessary, adjust the count. */
5832 if (mcnt > dend2 - d2)
fa9a63c5 5833 mcnt = dend2 - d2;
5e69f11e 5834
fa9a63c5 5835 /* Compare that many; failure if mismatch, else move
25fe55af 5836 past them. */
28703c16 5837 if (RE_TRANSLATE_P (translate)
02cb78b5 5838 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5839 : memcmp (d, d2, mcnt))
99633e97
SM
5840 {
5841 d = dfail;
5842 goto fail;
5843 }
fa9a63c5 5844 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5845 }
5846 }
5847 break;
5848
5849
25fe55af 5850 /* begline matches the empty string at the beginning of the string
c0f9ea08 5851 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5852 case begline:
25fe55af 5853 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5854
25fe55af
RS
5855 if (AT_STRINGS_BEG (d))
5856 {
5857 if (!bufp->not_bol) break;
5858 }
419d1c74 5859 else
25fe55af 5860 {
bf216479 5861 unsigned c;
419d1c74 5862 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5863 if (c == '\n')
419d1c74 5864 break;
25fe55af
RS
5865 }
5866 /* In all other cases, we fail. */
5867 goto fail;
fa9a63c5
RM
5868
5869
25fe55af 5870 /* endline is the dual of begline. */
fa9a63c5 5871 case endline:
25fe55af 5872 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5873
25fe55af
RS
5874 if (AT_STRINGS_END (d))
5875 {
5876 if (!bufp->not_eol) break;
5877 }
f1ad044f 5878 else
25fe55af 5879 {
f1ad044f 5880 PREFETCH_NOLIMIT ();
c0f9ea08 5881 if (*d == '\n')
f1ad044f 5882 break;
25fe55af
RS
5883 }
5884 goto fail;
fa9a63c5
RM
5885
5886
5887 /* Match at the very beginning of the data. */
25fe55af
RS
5888 case begbuf:
5889 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5890 if (AT_STRINGS_BEG (d))
5891 break;
5892 goto fail;
fa9a63c5
RM
5893
5894
5895 /* Match at the very end of the data. */
25fe55af
RS
5896 case endbuf:
5897 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5898 if (AT_STRINGS_END (d))
5899 break;
25fe55af 5900 goto fail;
5e69f11e 5901
5e69f11e 5902
25fe55af
RS
5903 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5904 pushes NULL as the value for the string on the stack. Then
505bde11 5905 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5906 string, instead of restoring it. To see why, consider
7814e705 5907 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5908 then the . fails against the \n. But the next thing we want
5909 to do is match the \n against the \n; if we restored the
5910 string value, we would be back at the foo.
5911
5912 Because this is used only in specific cases, we don't need to
5913 check all the things that `on_failure_jump' does, to make
5914 sure the right things get saved on the stack. Hence we don't
5915 share its code. The only reason to push anything on the
5916 stack at all is that otherwise we would have to change
5917 `anychar's code to do something besides goto fail in this
5918 case; that seems worse than this. */
5919 case on_failure_keep_string_jump:
505bde11
SM
5920 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5921 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5922 mcnt, p + mcnt);
fa9a63c5 5923
505bde11
SM
5924 PUSH_FAILURE_POINT (p - 3, NULL);
5925 break;
5926
0683b6fa
SM
5927 /* A nasty loop is introduced by the non-greedy *? and +?.
5928 With such loops, the stack only ever contains one failure point
5929 at a time, so that a plain on_failure_jump_loop kind of
5930 cycle detection cannot work. Worse yet, such a detection
5931 can not only fail to detect a cycle, but it can also wrongly
5932 detect a cycle (between different instantiations of the same
6df42991 5933 loop).
0683b6fa
SM
5934 So the method used for those nasty loops is a little different:
5935 We use a special cycle-detection-stack-frame which is pushed
5936 when the on_failure_jump_nastyloop failure-point is *popped*.
5937 This special frame thus marks the beginning of one iteration
5938 through the loop and we can hence easily check right here
5939 whether something matched between the beginning and the end of
5940 the loop. */
5941 case on_failure_jump_nastyloop:
5942 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5943 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5944 mcnt, p + mcnt);
5945
5946 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5947 {
5948 int cycle = 0;
5949 CHECK_INFINITE_LOOP (p - 4, d);
5950 if (!cycle)
5951 /* If there's a cycle, just continue without pushing
5952 this failure point. The failure point is the "try again"
5953 option, which shouldn't be tried.
5954 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5955 PUSH_FAILURE_POINT (p - 3, d);
5956 }
0683b6fa
SM
5957 break;
5958
4e8a9132
SM
5959 /* Simple loop detecting on_failure_jump: just check on the
5960 failure stack if the same spot was already hit earlier. */
505bde11
SM
5961 case on_failure_jump_loop:
5962 on_failure:
5963 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5964 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5965 mcnt, p + mcnt);
6df42991
SM
5966 {
5967 int cycle = 0;
5968 CHECK_INFINITE_LOOP (p - 3, d);
5969 if (cycle)
5970 /* If there's a cycle, get out of the loop, as if the matching
5971 had failed. We used to just `goto fail' here, but that was
5972 aborting the search a bit too early: we want to keep the
5973 empty-loop-match and keep matching after the loop.
5974 We want (x?)*y\1z to match both xxyz and xxyxz. */
5975 p += mcnt;
5976 else
5977 PUSH_FAILURE_POINT (p - 3, d);
5978 }
25fe55af 5979 break;
fa9a63c5
RM
5980
5981
5982 /* Uses of on_failure_jump:
5e69f11e 5983
25fe55af
RS
5984 Each alternative starts with an on_failure_jump that points
5985 to the beginning of the next alternative. Each alternative
5986 except the last ends with a jump that in effect jumps past
5987 the rest of the alternatives. (They really jump to the
5988 ending jump of the following alternative, because tensioning
5989 these jumps is a hassle.)
fa9a63c5 5990
25fe55af
RS
5991 Repeats start with an on_failure_jump that points past both
5992 the repetition text and either the following jump or
5993 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5994 case on_failure_jump:
25fe55af 5995 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5996 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5997 mcnt, p + mcnt);
25fe55af 5998
505bde11 5999 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
6000 break;
6001
4e8a9132 6002 /* This operation is used for greedy *.
505bde11
SM
6003 Compare the beginning of the repeat with what in the
6004 pattern follows its end. If we can establish that there
6005 is nothing that they would both match, i.e., that we
6006 would have to backtrack because of (as in, e.g., `a*a')
6007 then we can use a non-backtracking loop based on
4e8a9132 6008 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 6009 case on_failure_jump_smart:
25fe55af 6010 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
6011 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
6012 mcnt, p + mcnt);
25fe55af 6013 {
01618498 6014 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
6015 /* Here, we discard `const', making re_match non-reentrant. */
6016 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
6017 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 6018
505bde11
SM
6019 p -= 3; /* Reset so that we will re-execute the
6020 instruction once it's been changed. */
fa9a63c5 6021
4e8a9132
SM
6022 EXTRACT_NUMBER (mcnt, p2 - 2);
6023
6024 /* Ensure this is a indeed the trivial kind of loop
6025 we are expecting. */
6026 assert (skip_one_char (p1) == p2 - 3);
6027 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 6028 DEBUG_STATEMENT (debug += 2);
505bde11 6029 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 6030 {
505bde11 6031 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 6032 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 6033 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 6034 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 6035 }
505bde11 6036 else
fa9a63c5 6037 {
505bde11
SM
6038 /* Default to a safe `on_failure_jump' loop. */
6039 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 6040 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 6041 }
99633e97 6042 DEBUG_STATEMENT (debug -= 2);
25fe55af 6043 }
505bde11 6044 break;
25fe55af
RS
6045
6046 /* Unconditionally jump (without popping any failure points). */
6047 case jump:
fa9a63c5 6048 unconditional_jump:
5b370c2b 6049 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6050 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 6051 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 6052 p += mcnt; /* Do the jump. */
505bde11 6053 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
6054 break;
6055
6056
25fe55af
RS
6057 /* Have to succeed matching what follows at least n times.
6058 After that, handle like `on_failure_jump'. */
6059 case succeed_n:
01618498 6060 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
6061 EXTRACT_NUMBER (mcnt, p + 2);
6062 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 6063
dc1e502d
SM
6064 /* Originally, mcnt is how many times we HAVE to succeed. */
6065 if (mcnt != 0)
25fe55af 6066 {
6dcf2d0e
SM
6067 /* Here, we discard `const', making re_match non-reentrant. */
6068 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 6069 mcnt--;
01618498
SM
6070 p += 4;
6071 PUSH_NUMBER (p2, mcnt);
25fe55af 6072 }
dc1e502d
SM
6073 else
6074 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
6075 goto on_failure;
25fe55af
RS
6076 break;
6077
6078 case jump_n:
01618498 6079 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
6080 EXTRACT_NUMBER (mcnt, p + 2);
6081 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
6082
6083 /* Originally, this is how many times we CAN jump. */
dc1e502d 6084 if (mcnt != 0)
25fe55af 6085 {
6dcf2d0e
SM
6086 /* Here, we discard `const', making re_match non-reentrant. */
6087 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 6088 mcnt--;
01618498 6089 PUSH_NUMBER (p2, mcnt);
dc1e502d 6090 goto unconditional_jump;
25fe55af
RS
6091 }
6092 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
6093 else
6094 p += 4;
25fe55af 6095 break;
5e69f11e 6096
fa9a63c5
RM
6097 case set_number_at:
6098 {
01618498 6099 unsigned char *p2; /* Location of the counter. */
25fe55af 6100 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 6101
25fe55af 6102 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
6103 /* Here, we discard `const', making re_match non-reentrant. */
6104 p2 = (unsigned char*) p + mcnt;
01618498 6105 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 6106 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
6107 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6108 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
6109 break;
6110 }
9121ca40
KH
6111
6112 case wordbound:
66f0296e
SM
6113 case notwordbound:
6114 not = (re_opcode_t) *(p - 1) == notwordbound;
6115 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 6116
99633e97 6117 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 6118
b18215fc 6119 /* Case 1: D is at the beginning or the end of string. */
9121ca40 6120 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 6121 not = !not;
b18215fc
RS
6122 else
6123 {
6124 /* C1 is the character before D, S1 is the syntax of C1, C2
6125 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6126 re_wchar_t c1, c2;
6127 int s1, s2;
bf216479 6128 int dummy;
b18215fc 6129#ifdef emacs
2d1675e4
SM
6130 int offset = PTR_TO_OFFSET (d - 1);
6131 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 6132 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6133#endif
66f0296e 6134 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6135 s1 = SYNTAX (c1);
6136#ifdef emacs
5d967c7a 6137 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 6138#endif
f1ad044f 6139 PREFETCH_NOLIMIT ();
6fdd04b0 6140 GET_CHAR_AFTER (c2, d, dummy);
b18215fc
RS
6141 s2 = SYNTAX (c2);
6142
6143 if (/* Case 2: Only one of S1 and S2 is Sword. */
6144 ((s1 == Sword) != (s2 == Sword))
6145 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 6146 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 6147 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
6148 not = !not;
6149 }
6150 if (not)
9121ca40 6151 break;
b18215fc 6152 else
9121ca40 6153 goto fail;
fa9a63c5
RM
6154
6155 case wordbeg:
25fe55af 6156 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6157
b18215fc
RS
6158 /* We FAIL in one of the following cases: */
6159
7814e705 6160 /* Case 1: D is at the end of string. */
b18215fc 6161 if (AT_STRINGS_END (d))
99633e97 6162 goto fail;
b18215fc
RS
6163 else
6164 {
6165 /* C1 is the character before D, S1 is the syntax of C1, C2
6166 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6167 re_wchar_t c1, c2;
6168 int s1, s2;
bf216479 6169 int dummy;
fa9a63c5 6170#ifdef emacs
2d1675e4
SM
6171 int offset = PTR_TO_OFFSET (d);
6172 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6173 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6174#endif
99633e97 6175 PREFETCH ();
6fdd04b0 6176 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6177 s2 = SYNTAX (c2);
177c0ea7 6178
b18215fc
RS
6179 /* Case 2: S2 is not Sword. */
6180 if (s2 != Sword)
6181 goto fail;
6182
6183 /* Case 3: D is not at the beginning of string ... */
6184 if (!AT_STRINGS_BEG (d))
6185 {
6186 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6187#ifdef emacs
5d967c7a 6188 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6189#endif
b18215fc
RS
6190 s1 = SYNTAX (c1);
6191
6192 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6193 returns 0. */
b18215fc
RS
6194 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6195 goto fail;
6196 }
6197 }
e318085a
RS
6198 break;
6199
b18215fc 6200 case wordend:
25fe55af 6201 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6202
6203 /* We FAIL in one of the following cases: */
6204
6205 /* Case 1: D is at the beginning of string. */
6206 if (AT_STRINGS_BEG (d))
e318085a 6207 goto fail;
b18215fc
RS
6208 else
6209 {
6210 /* C1 is the character before D, S1 is the syntax of C1, C2
6211 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6212 re_wchar_t c1, c2;
6213 int s1, s2;
bf216479 6214 int dummy;
5d967c7a 6215#ifdef emacs
2d1675e4
SM
6216 int offset = PTR_TO_OFFSET (d) - 1;
6217 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6218 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6219#endif
99633e97 6220 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6221 s1 = SYNTAX (c1);
6222
6223 /* Case 2: S1 is not Sword. */
6224 if (s1 != Sword)
6225 goto fail;
6226
6227 /* Case 3: D is not at the end of string ... */
6228 if (!AT_STRINGS_END (d))
6229 {
f1ad044f 6230 PREFETCH_NOLIMIT ();
6fdd04b0 6231 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6232#ifdef emacs
6233 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6234#endif
b18215fc
RS
6235 s2 = SYNTAX (c2);
6236
6237 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6238 returns 0. */
b18215fc 6239 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6240 goto fail;
b18215fc
RS
6241 }
6242 }
e318085a
RS
6243 break;
6244
669fa600
SM
6245 case symbeg:
6246 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6247
6248 /* We FAIL in one of the following cases: */
6249
7814e705 6250 /* Case 1: D is at the end of string. */
669fa600
SM
6251 if (AT_STRINGS_END (d))
6252 goto fail;
6253 else
6254 {
6255 /* C1 is the character before D, S1 is the syntax of C1, C2
6256 is the character at D, and S2 is the syntax of C2. */
6257 re_wchar_t c1, c2;
6258 int s1, s2;
6259#ifdef emacs
6260 int offset = PTR_TO_OFFSET (d);
6261 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6262 UPDATE_SYNTAX_TABLE (charpos);
6263#endif
6264 PREFETCH ();
cf9c99bc 6265 c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
669fa600 6266 s2 = SYNTAX (c2);
7814e705 6267
669fa600
SM
6268 /* Case 2: S2 is neither Sword nor Ssymbol. */
6269 if (s2 != Sword && s2 != Ssymbol)
6270 goto fail;
6271
6272 /* Case 3: D is not at the beginning of string ... */
6273 if (!AT_STRINGS_BEG (d))
6274 {
6275 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6276#ifdef emacs
6277 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6278#endif
6279 s1 = SYNTAX (c1);
6280
6281 /* ... and S1 is Sword or Ssymbol. */
6282 if (s1 == Sword || s1 == Ssymbol)
6283 goto fail;
6284 }
6285 }
6286 break;
6287
6288 case symend:
6289 DEBUG_PRINT1 ("EXECUTING symend.\n");
6290
6291 /* We FAIL in one of the following cases: */
6292
6293 /* Case 1: D is at the beginning of string. */
6294 if (AT_STRINGS_BEG (d))
6295 goto fail;
6296 else
6297 {
6298 /* C1 is the character before D, S1 is the syntax of C1, C2
6299 is the character at D, and S2 is the syntax of C2. */
6300 re_wchar_t c1, c2;
6301 int s1, s2;
6302#ifdef emacs
6303 int offset = PTR_TO_OFFSET (d) - 1;
6304 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6305 UPDATE_SYNTAX_TABLE (charpos);
6306#endif
6307 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6308 s1 = SYNTAX (c1);
6309
6310 /* Case 2: S1 is neither Ssymbol nor Sword. */
6311 if (s1 != Sword && s1 != Ssymbol)
6312 goto fail;
6313
6314 /* Case 3: D is not at the end of string ... */
6315 if (!AT_STRINGS_END (d))
6316 {
6317 PREFETCH_NOLIMIT ();
cf9c99bc 6318 c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
669fa600 6319#ifdef emacs
134579f2 6320 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6321#endif
6322 s2 = SYNTAX (c2);
6323
6324 /* ... and S2 is Sword or Ssymbol. */
6325 if (s2 == Sword || s2 == Ssymbol)
6326 goto fail;
b18215fc
RS
6327 }
6328 }
e318085a
RS
6329 break;
6330
fa9a63c5 6331 case syntaxspec:
1fb352e0
SM
6332 case notsyntaxspec:
6333 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6334 mcnt = *p++;
1fb352e0 6335 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6336 PREFETCH ();
b18215fc
RS
6337#ifdef emacs
6338 {
2d1675e4
SM
6339 int offset = PTR_TO_OFFSET (d);
6340 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6341 UPDATE_SYNTAX_TABLE (pos1);
6342 }
25fe55af 6343#endif
b18215fc 6344 {
01618498
SM
6345 int len;
6346 re_wchar_t c;
b18215fc 6347
6fdd04b0 6348 GET_CHAR_AFTER (c, d, len);
990b2375 6349 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6350 goto fail;
b18215fc
RS
6351 d += len;
6352 }
fa9a63c5
RM
6353 break;
6354
b18215fc 6355#ifdef emacs
1fb352e0
SM
6356 case before_dot:
6357 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6358 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6359 goto fail;
b18215fc
RS
6360 break;
6361
1fb352e0
SM
6362 case at_dot:
6363 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6364 if (PTR_BYTE_POS (d) != PT_BYTE)
6365 goto fail;
6366 break;
b18215fc 6367
1fb352e0
SM
6368 case after_dot:
6369 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6370 if (PTR_BYTE_POS (d) <= PT_BYTE)
6371 goto fail;
e318085a 6372 break;
fa9a63c5 6373
1fb352e0 6374 case categoryspec:
b18215fc 6375 case notcategoryspec:
1fb352e0 6376 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6377 mcnt = *p++;
1fb352e0 6378 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6379 PREFETCH ();
6380 {
01618498
SM
6381 int len;
6382 re_wchar_t c;
6383
6fdd04b0 6384 GET_CHAR_AFTER (c, d, len);
1fb352e0 6385 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6386 goto fail;
6387 d += len;
6388 }
fa9a63c5 6389 break;
5e69f11e 6390
1fb352e0 6391#endif /* emacs */
5e69f11e 6392
0b32bf0e
SM
6393 default:
6394 abort ();
fa9a63c5 6395 }
b18215fc 6396 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6397
6398
6399 /* We goto here if a matching operation fails. */
6400 fail:
5b370c2b 6401 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6402 if (!FAIL_STACK_EMPTY ())
505bde11 6403 {
01618498 6404 re_char *str, *pat;
505bde11 6405 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6406 DEBUG_PRINT1 ("\nFAIL:\n");
6407 POP_FAILURE_POINT (str, pat);
505bde11
SM
6408 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6409 {
6410 case on_failure_keep_string_jump:
6411 assert (str == NULL);
6412 goto continue_failure_jump;
6413
0683b6fa
SM
6414 case on_failure_jump_nastyloop:
6415 assert ((re_opcode_t)pat[-2] == no_op);
6416 PUSH_FAILURE_POINT (pat - 2, str);
6417 /* Fallthrough */
6418
505bde11
SM
6419 case on_failure_jump_loop:
6420 case on_failure_jump:
6421 case succeed_n:
6422 d = str;
6423 continue_failure_jump:
6424 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6425 p = pat + mcnt;
6426 break;
b18215fc 6427
0683b6fa
SM
6428 case no_op:
6429 /* A special frame used for nastyloops. */
6430 goto fail;
6431
505bde11
SM
6432 default:
6433 abort();
6434 }
fa9a63c5 6435
505bde11 6436 assert (p >= bufp->buffer && p <= pend);
b18215fc 6437
0b32bf0e 6438 if (d >= string1 && d <= end1)
fa9a63c5 6439 dend = end_match_1;
0b32bf0e 6440 }
fa9a63c5 6441 else
0b32bf0e 6442 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6443 } /* for (;;) */
6444
6445 if (best_regs_set)
6446 goto restore_best_regs;
6447
6448 FREE_VARIABLES ();
6449
b18215fc 6450 return -1; /* Failure to match. */
fa9a63c5
RM
6451} /* re_match_2 */
6452\f
6453/* Subroutine definitions for re_match_2. */
6454
fa9a63c5
RM
6455/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6456 bytes; nonzero otherwise. */
5e69f11e 6457
fa9a63c5 6458static int
02cb78b5 6459bcmp_translate (s1, s2, len, translate, target_multibyte)
2d1675e4 6460 re_char *s1, *s2;
fa9a63c5 6461 register int len;
6676cb1c 6462 RE_TRANSLATE_TYPE translate;
02cb78b5 6463 const int target_multibyte;
fa9a63c5 6464{
2d1675e4
SM
6465 register re_char *p1 = s1, *p2 = s2;
6466 re_char *p1_end = s1 + len;
6467 re_char *p2_end = s2 + len;
e934739e 6468
4bb91c68
SM
6469 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6470 different lengths, but relying on a single `len' would break this. -sm */
6471 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6472 {
e934739e 6473 int p1_charlen, p2_charlen;
01618498 6474 re_wchar_t p1_ch, p2_ch;
e934739e 6475
6fdd04b0
KH
6476 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6477 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6478
6479 if (RE_TRANSLATE (translate, p1_ch)
6480 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6481 return 1;
e934739e
RS
6482
6483 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6484 }
e934739e
RS
6485
6486 if (p1 != p1_end || p2 != p2_end)
6487 return 1;
6488
fa9a63c5
RM
6489 return 0;
6490}
6491\f
6492/* Entry points for GNU code. */
6493
6494/* re_compile_pattern is the GNU regular expression compiler: it
6495 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6496 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6497
fa9a63c5
RM
6498 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6499 are set in BUFP on entry.
5e69f11e 6500
b18215fc 6501 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6502
6503const char *
6504re_compile_pattern (pattern, length, bufp)
6505 const char *pattern;
0b32bf0e 6506 size_t length;
fa9a63c5
RM
6507 struct re_pattern_buffer *bufp;
6508{
6509 reg_errcode_t ret;
5e69f11e 6510
1208f11a
RS
6511#ifdef emacs
6512 gl_state.current_syntax_table = current_buffer->syntax_table;
6513#endif
6514
fa9a63c5
RM
6515 /* GNU code is written to assume at least RE_NREGS registers will be set
6516 (and at least one extra will be -1). */
6517 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6518
fa9a63c5
RM
6519 /* And GNU code determines whether or not to get register information
6520 by passing null for the REGS argument to re_match, etc., not by
6521 setting no_sub. */
6522 bufp->no_sub = 0;
5e69f11e 6523
4bb91c68 6524 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6525
6526 if (!ret)
6527 return NULL;
6528 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6529}
c0f9ea08 6530WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6531\f
b18215fc
RS
6532/* Entry points compatible with 4.2 BSD regex library. We don't define
6533 them unless specifically requested. */
fa9a63c5 6534
0b32bf0e 6535#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6536
6537/* BSD has one and only one pattern buffer. */
6538static struct re_pattern_buffer re_comp_buf;
6539
6540char *
0b32bf0e 6541# ifdef _LIBC
48afdd44
RM
6542/* Make these definitions weak in libc, so POSIX programs can redefine
6543 these names if they don't use our functions, and still use
6544 regcomp/regexec below without link errors. */
6545weak_function
0b32bf0e 6546# endif
fa9a63c5
RM
6547re_comp (s)
6548 const char *s;
6549{
6550 reg_errcode_t ret;
5e69f11e 6551
fa9a63c5
RM
6552 if (!s)
6553 {
6554 if (!re_comp_buf.buffer)
0b32bf0e 6555 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6556 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6557 return 0;
6558 }
6559
6560 if (!re_comp_buf.buffer)
6561 {
6562 re_comp_buf.buffer = (unsigned char *) malloc (200);
6563 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6564 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6565 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6566 re_comp_buf.allocated = 200;
6567
6568 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6569 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6570 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6571 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6572 }
6573
6574 /* Since `re_exec' always passes NULL for the `regs' argument, we
6575 don't need to initialize the pattern buffer fields which affect it. */
6576
fa9a63c5 6577 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6578
fa9a63c5
RM
6579 if (!ret)
6580 return NULL;
6581
6582 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6583 return (char *) gettext (re_error_msgid[(int) ret]);
6584}
6585
6586
6587int
0b32bf0e 6588# ifdef _LIBC
48afdd44 6589weak_function
0b32bf0e 6590# endif
fa9a63c5
RM
6591re_exec (s)
6592 const char *s;
6593{
6594 const int len = strlen (s);
6595 return
6596 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6597}
6598#endif /* _REGEX_RE_COMP */
6599\f
6600/* POSIX.2 functions. Don't define these for Emacs. */
6601
6602#ifndef emacs
6603
6604/* regcomp takes a regular expression as a string and compiles it.
6605
b18215fc 6606 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6607 since POSIX says we shouldn't. Thus, we set
6608
6609 `buffer' to the compiled pattern;
6610 `used' to the length of the compiled pattern;
6611 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6612 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6613 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6614 `fastmap' to an allocated space for the fastmap;
6615 `fastmap_accurate' to zero;
fa9a63c5
RM
6616 `re_nsub' to the number of subexpressions in PATTERN.
6617
6618 PATTERN is the address of the pattern string.
6619
6620 CFLAGS is a series of bits which affect compilation.
6621
6622 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6623 use POSIX basic syntax.
6624
6625 If REG_NEWLINE is set, then . and [^...] don't match newline.
6626 Also, regexec will try a match beginning after every newline.
6627
6628 If REG_ICASE is set, then we considers upper- and lowercase
6629 versions of letters to be equivalent when matching.
6630
6631 If REG_NOSUB is set, then when PREG is passed to regexec, that
6632 routine will report only success or failure, and nothing about the
6633 registers.
6634
b18215fc 6635 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6636 the return codes and their meanings.) */
6637
6638int
6639regcomp (preg, pattern, cflags)
ada30c0e
SM
6640 regex_t *__restrict preg;
6641 const char *__restrict pattern;
fa9a63c5
RM
6642 int cflags;
6643{
6644 reg_errcode_t ret;
4bb91c68 6645 reg_syntax_t syntax
fa9a63c5
RM
6646 = (cflags & REG_EXTENDED) ?
6647 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6648
6649 /* regex_compile will allocate the space for the compiled pattern. */
6650 preg->buffer = 0;
6651 preg->allocated = 0;
6652 preg->used = 0;
5e69f11e 6653
c0f9ea08
SM
6654 /* Try to allocate space for the fastmap. */
6655 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6656
fa9a63c5
RM
6657 if (cflags & REG_ICASE)
6658 {
6659 unsigned i;
5e69f11e 6660
6676cb1c
RS
6661 preg->translate
6662 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6663 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6664 if (preg->translate == NULL)
0b32bf0e 6665 return (int) REG_ESPACE;
fa9a63c5
RM
6666
6667 /* Map uppercase characters to corresponding lowercase ones. */
6668 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6669 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6670 }
6671 else
6672 preg->translate = NULL;
6673
6674 /* If REG_NEWLINE is set, newlines are treated differently. */
6675 if (cflags & REG_NEWLINE)
6676 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6677 syntax &= ~RE_DOT_NEWLINE;
6678 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6679 }
6680 else
c0f9ea08 6681 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6682
6683 preg->no_sub = !!(cflags & REG_NOSUB);
6684
5e69f11e 6685 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6686 can use strlen here in compiling the pattern. */
4bb91c68 6687 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6688
fa9a63c5
RM
6689 /* POSIX doesn't distinguish between an unmatched open-group and an
6690 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6691 if (ret == REG_ERPAREN)
6692 ret = REG_EPAREN;
6693
6694 if (ret == REG_NOERROR && preg->fastmap)
6695 { /* Compute the fastmap now, since regexec cannot modify the pattern
6696 buffer. */
6697 re_compile_fastmap (preg);
6698 if (preg->can_be_null)
6699 { /* The fastmap can't be used anyway. */
6700 free (preg->fastmap);
6701 preg->fastmap = NULL;
6702 }
6703 }
fa9a63c5
RM
6704 return (int) ret;
6705}
c0f9ea08 6706WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6707
6708
6709/* regexec searches for a given pattern, specified by PREG, in the
6710 string STRING.
5e69f11e 6711
fa9a63c5 6712 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6713 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6714 least NMATCH elements, and we set them to the offsets of the
6715 corresponding matched substrings.
5e69f11e 6716
fa9a63c5
RM
6717 EFLAGS specifies `execution flags' which affect matching: if
6718 REG_NOTBOL is set, then ^ does not match at the beginning of the
6719 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6720
fa9a63c5
RM
6721 We return 0 if we find a match and REG_NOMATCH if not. */
6722
6723int
6724regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6725 const regex_t *__restrict preg;
6726 const char *__restrict string;
5e69f11e 6727 size_t nmatch;
9f2dbe01 6728 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6729 int eflags;
6730{
6731 int ret;
6732 struct re_registers regs;
6733 regex_t private_preg;
6734 int len = strlen (string);
c0f9ea08 6735 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6736
6737 private_preg = *preg;
5e69f11e 6738
fa9a63c5
RM
6739 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6740 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6741
fa9a63c5
RM
6742 /* The user has told us exactly how many registers to return
6743 information about, via `nmatch'. We have to pass that on to the
b18215fc 6744 matching routines. */
fa9a63c5 6745 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6746
fa9a63c5
RM
6747 if (want_reg_info)
6748 {
6749 regs.num_regs = nmatch;
4bb91c68
SM
6750 regs.start = TALLOC (nmatch * 2, regoff_t);
6751 if (regs.start == NULL)
0b32bf0e 6752 return (int) REG_NOMATCH;
4bb91c68 6753 regs.end = regs.start + nmatch;
fa9a63c5
RM
6754 }
6755
c0f9ea08
SM
6756 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6757 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6758 was a little bit longer but still only matching the real part.
6759 This works because the `endline' will check for a '\n' and will find a
6760 '\0', correctly deciding that this is not the end of a line.
6761 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6762 a convenient '\0' there. For all we know, the string could be preceded
6763 by '\n' which would throw things off. */
6764
fa9a63c5
RM
6765 /* Perform the searching operation. */
6766 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6767 /* start: */ 0, /* range: */ len,
6768 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6769
fa9a63c5
RM
6770 /* Copy the register information to the POSIX structure. */
6771 if (want_reg_info)
6772 {
6773 if (ret >= 0)
0b32bf0e
SM
6774 {
6775 unsigned r;
fa9a63c5 6776
0b32bf0e
SM
6777 for (r = 0; r < nmatch; r++)
6778 {
6779 pmatch[r].rm_so = regs.start[r];
6780 pmatch[r].rm_eo = regs.end[r];
6781 }
6782 }
fa9a63c5 6783
b18215fc 6784 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6785 free (regs.start);
fa9a63c5
RM
6786 }
6787
6788 /* We want zero return to mean success, unlike `re_search'. */
6789 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6790}
c0f9ea08 6791WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6792
6793
ec869672
JR
6794/* Returns a message corresponding to an error code, ERR_CODE, returned
6795 from either regcomp or regexec. We don't use PREG here.
6796
6797 ERR_CODE was previously called ERRCODE, but that name causes an
6798 error with msvc8 compiler. */
fa9a63c5
RM
6799
6800size_t
ec869672
JR
6801regerror (err_code, preg, errbuf, errbuf_size)
6802 int err_code;
fa9a63c5
RM
6803 const regex_t *preg;
6804 char *errbuf;
6805 size_t errbuf_size;
6806{
6807 const char *msg;
6808 size_t msg_size;
6809
ec869672
JR
6810 if (err_code < 0
6811 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6812 /* Only error codes returned by the rest of the code should be passed
b18215fc 6813 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6814 code generates an invalid error code, then the program has a bug.
6815 Dump core so we can fix it. */
6816 abort ();
6817
ec869672 6818 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6819
6820 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6821
fa9a63c5
RM
6822 if (errbuf_size != 0)
6823 {
6824 if (msg_size > errbuf_size)
0b32bf0e
SM
6825 {
6826 strncpy (errbuf, msg, errbuf_size - 1);
6827 errbuf[errbuf_size - 1] = 0;
6828 }
fa9a63c5 6829 else
0b32bf0e 6830 strcpy (errbuf, msg);
fa9a63c5
RM
6831 }
6832
6833 return msg_size;
6834}
c0f9ea08 6835WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6836
6837
6838/* Free dynamically allocated space used by PREG. */
6839
6840void
6841regfree (preg)
6842 regex_t *preg;
6843{
c2cd06e6 6844 free (preg->buffer);
fa9a63c5 6845 preg->buffer = NULL;
5e69f11e 6846
fa9a63c5
RM
6847 preg->allocated = 0;
6848 preg->used = 0;
6849
c2cd06e6 6850 free (preg->fastmap);
fa9a63c5
RM
6851 preg->fastmap = NULL;
6852 preg->fastmap_accurate = 0;
6853
c2cd06e6 6854 free (preg->translate);
fa9a63c5
RM
6855 preg->translate = NULL;
6856}
c0f9ea08 6857WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6858
6859#endif /* not emacs */
839966f3
KH
6860
6861/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6862 (do not change this comment) */