* lib-src/fakemail.c (action): Convert function definitions to standard C.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
114f9c96 6 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
e468b87f 7 Free Software Foundation, Inc.
bc78d348 8
fa9a63c5
RM
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
e468b87f 11 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
4fc5845f 21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 22 USA. */
fa9a63c5 23
6df42991 24/* TODO:
505bde11 25 - structure the opcode space into opcode+flag.
dc1e502d 26 - merge with glibc's regex.[ch].
01618498 27 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
28 need to modify the compiled regexp so that re_match can be reentrant.
29 - get rid of on_failure_jump_smart by doing the optimization in re_comp
30 rather than at run-time, so that re_match can be reentrant.
01618498 31*/
505bde11 32
fa9a63c5 33/* AIX requires this to be the first thing in the file. */
0b32bf0e 34#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
35 #pragma alloca
36#endif
37
fa9a63c5 38#ifdef HAVE_CONFIG_H
0b32bf0e 39# include <config.h>
fa9a63c5
RM
40#endif
41
4bb91c68
SM
42#if defined STDC_HEADERS && !defined emacs
43# include <stddef.h>
44#else
45/* We need this for `regex.h', and perhaps for the Emacs include files. */
46# include <sys/types.h>
47#endif
fa9a63c5 48
14473664
SM
49/* Whether to use ISO C Amendment 1 wide char functions.
50 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
51#if defined _LIBC
52#define WIDE_CHAR_SUPPORT 1
53#else
14473664 54#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
55 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
56#endif
14473664
SM
57
/* For platforms which support the ISO C Amendment 1 functionality we
   support user-defined character classes.  */
a0ad02f7 60#if WIDE_CHAR_SUPPORT
14473664
SM
61/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
62# include <wchar.h>
63# include <wctype.h>
64#endif
65
c0f9ea08
SM
66#ifdef _LIBC
67/* We have to keep the namespace clean. */
68# define regfree(preg) __regfree (preg)
69# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
70# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
71# define regerror(err_code, preg, errbuf, errbuf_size) \
72 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
73# define re_set_registers(bu, re, nu, st, en) \
74 __re_set_registers (bu, re, nu, st, en)
75# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
77# define re_match(bufp, string, size, pos, regs) \
78 __re_match (bufp, string, size, pos, regs)
79# define re_search(bufp, string, size, startpos, range, regs) \
80 __re_search (bufp, string, size, startpos, range, regs)
81# define re_compile_pattern(pattern, length, bufp) \
82 __re_compile_pattern (pattern, length, bufp)
83# define re_set_syntax(syntax) __re_set_syntax (syntax)
84# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
86# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
87
14473664
SM
88/* Make sure we call libc's function even if the user overrides them. */
89# define btowc __btowc
90# define iswctype __iswctype
91# define wctype __wctype
92
c0f9ea08
SM
93# define WEAK_ALIAS(a,b) weak_alias (a, b)
94
95/* We are also using some library internals. */
96# include <locale/localeinfo.h>
97# include <locale/elem-hash.h>
98# include <langinfo.h>
99#else
100# define WEAK_ALIAS(a,b)
101#endif
102
4bb91c68 103/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 104#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
105# include <libintl.h>
106#else
107# define gettext(msgid) (msgid)
108#endif
109
5e69f11e
RM
110#ifndef gettext_noop
111/* This define is so xgettext can find the internationalizable
112 strings. */
0b32bf0e 113# define gettext_noop(String) String
5e69f11e
RM
114#endif
115
fa9a63c5
RM
116/* The `emacs' switch turns on certain matching commands
117 that make sense only in Emacs. */
118#ifdef emacs
119
d7306fe6 120# include <setjmp.h>
0b32bf0e
SM
121# include "lisp.h"
122# include "buffer.h"
b18215fc
RS
123
124/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 125# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 126
0b32bf0e 127# include "syntax.h"
9117d724 128# include "character.h"
0b32bf0e 129# include "category.h"
fa9a63c5 130
7689ef0b
EZ
131# ifdef malloc
132# undef malloc
133# endif
0b32bf0e 134# define malloc xmalloc
7689ef0b
EZ
135# ifdef realloc
136# undef realloc
137# endif
0b32bf0e 138# define realloc xrealloc
7689ef0b
EZ
139# ifdef free
140# undef free
141# endif
0b32bf0e 142# define free xfree
9abbd165 143
7814e705 144/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
145# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
146# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
147
148# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 149# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
150# define RE_STRING_CHAR(p, multibyte) \
151 (multibyte ? (STRING_CHAR (p)) : (*(p)))
152# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
153 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 154
4c0354d7 155# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 156
2afc21f5 157# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 158
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character before P.
   P points into a string which is the virtual concatenation of STR1
   (which ends at END1) and STR2 (which ends at END2).  */
bf216479
KH
162# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
163 do { \
02cb78b5 164 if (target_multibyte) \
bf216479
KH
165 { \
166 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
167 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
168 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
62a6e103 169 c = STRING_CHAR (dtemp); \
bf216479
KH
170 } \
171 else \
172 { \
173 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
cf9c99bc 174 (c) = RE_CHAR_TO_MULTIBYTE (c); \
bf216479 175 } \
2d1675e4
SM
176 } while (0)
177
6fdd04b0
KH
/* Set C to the (possibly converted to multibyte) character at P, and
   set LEN to the byte length of that character.  */
180# define GET_CHAR_AFTER(c, p, len) \
181 do { \
02cb78b5 182 if (target_multibyte) \
62a6e103 183 (c) = STRING_CHAR_AND_LENGTH (p, len); \
6fdd04b0
KH
184 else \
185 { \
cf9c99bc 186 (c) = *p; \
6fdd04b0 187 len = 1; \
cf9c99bc 188 (c) = RE_CHAR_TO_MULTIBYTE (c); \
6fdd04b0 189 } \
8f924df7 190 } while (0)
4e8a9132 191
fa9a63c5
RM
192#else /* not emacs */
193
194/* If we are not linking with Emacs proper,
195 we can't use the relocating allocator
196 even if config.h says that we can. */
0b32bf0e 197# undef REL_ALLOC
fa9a63c5 198
0b32bf0e
SM
199# if defined STDC_HEADERS || defined _LIBC
200# include <stdlib.h>
201# else
fa9a63c5
RM
202char *malloc ();
203char *realloc ();
0b32bf0e 204# endif
fa9a63c5 205
a77f947b
CY
206/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
207
/* Allocate SIZE bytes with malloc and return the new block.

   If malloc returns a null pointer for a nonzero SIZE, write
   "virtual memory exhausted" to file descriptor 2 and exit with
   status 1.  A null return for a zero SIZE is passed through to the
   caller unchanged.  */
void *
xmalloc (size_t size)
{
  void *val = malloc (size);

  if (!val && size)
    {
      /* The result of write is ignored on purpose: the process exits
	 on the next line whether or not the message got out.  */
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}
221
/* Resize BLOCK to SIZE bytes and return the (possibly moved) block.

   A null BLOCK is handled by calling malloc directly, since some
   reallocs do not accept a null pointer.  If the allocation returns a
   null pointer for a nonzero SIZE, write "virtual memory exhausted"
   to file descriptor 2 and exit with status 1.  */
void *
xrealloc (void *block, size_t size)
{
  void *val;

  if (block)
    val = realloc (block, size);
  else
    val = malloc (size);

  if (!val && size)
    {
      /* The result of write is ignored on purpose: the process exits
	 on the next line whether or not the message got out.  */
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}
241
a073faa6
CY
242# ifdef malloc
243# undef malloc
244# endif
245# define malloc xmalloc
246# ifdef realloc
247# undef realloc
248# endif
249# define realloc xrealloc
250
9e4ecb26 251/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 252 If nothing else has been done, use the method below. */
0b32bf0e
SM
253# ifdef INHIBIT_STRING_HEADER
254# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
255# if !defined bzero && !defined bcopy
256# undef INHIBIT_STRING_HEADER
257# endif
258# endif
259# endif
9e4ecb26 260
4bb91c68 261/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
262 This is used in most programs--a few other programs avoid this
263 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
264# ifndef INHIBIT_STRING_HEADER
265# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
266# include <string.h>
0b32bf0e 267# ifndef bzero
4bb91c68
SM
268# ifndef _LIBC
269# define bzero(s, n) (memset (s, '\0', n), (s))
270# else
271# define bzero(s, n) __bzero (s, n)
272# endif
0b32bf0e
SM
273# endif
274# else
275# include <strings.h>
4bb91c68
SM
276# ifndef memcmp
277# define memcmp(s1, s2, n) bcmp (s1, s2, n)
278# endif
279# ifndef memcpy
280# define memcpy(d, s, n) (bcopy (s, d, n), (d))
281# endif
0b32bf0e
SM
282# endif
283# endif
fa9a63c5
RM
284
285/* Define the syntax stuff for \<, \>, etc. */
286
990b2375 287/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 288enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 289
0b32bf0e 290# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 291
e934739e 292/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
293# define CHAR_CHARSET(c) 0
294# define CHARSET_LEADING_CODE_BASE(c) 0
295# define MAX_MULTIBYTE_LENGTH 1
296# define RE_MULTIBYTE_P(x) 0
bf216479 297# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
298# define WORD_BOUNDARY_P(c1, c2) (0)
299# define CHAR_HEAD_P(p) (1)
300# define SINGLE_BYTE_CHAR_P(c) (1)
301# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 302# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 303# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
304# define STRING_CHAR(p) (*(p))
305# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 306# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
307# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
308# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
309# define RE_CHAR_TO_MULTIBYTE(c) (c)
310# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 311# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 312 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
313# define GET_CHAR_AFTER(c, p, len) \
314 (c = *p, len = 1)
0b32bf0e 315# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
316# define BYTE8_TO_CHAR(c) (c)
317# define CHAR_BYTE8_P(c) (0)
bf216479 318# define CHAR_LEADING_CODE(c) (c)
8f924df7 319
fa9a63c5 320#endif /* not emacs */
4e8a9132
SM
321
322#ifndef RE_TRANSLATE
0b32bf0e
SM
323# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
324# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 325#endif
fa9a63c5
RM
326\f
327/* Get the interface, including the syntax bits. */
328#include "regex.h"
329
f71b19b6
DL
330/* isalpha etc. are used for the character classes. */
331#include <ctype.h>
fa9a63c5 332
f71b19b6 333#ifdef emacs
fa9a63c5 334
f71b19b6 335/* 1 if C is an ASCII character. */
0b32bf0e 336# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 337
f71b19b6 338/* 1 if C is a unibyte character. */
0b32bf0e 339# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 340
f71b19b6 341/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 342
f71b19b6 343/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
344# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
345# define ISCNTRL(c) ((c) < ' ')
346# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
347 || ((c) >= 'a' && (c) <= 'f') \
348 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
349
350/* This is only used for single-byte characters. */
0b32bf0e 351# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
352
353/* The rest must handle multibyte characters. */
354
0b32bf0e 355# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 356 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
357 : 1)
358
14473664 359# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 360 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
361 : 1)
362
0b32bf0e 363# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
364 ? (((c) >= 'a' && (c) <= 'z') \
365 || ((c) >= 'A' && (c) <= 'Z') \
366 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
367 : SYNTAX (c) == Sword)
368
0b32bf0e 369# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
370 ? (((c) >= 'a' && (c) <= 'z') \
371 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
372 : SYNTAX (c) == Sword)
373
0b32bf0e 374# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 375
0b32bf0e 376# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
377 ? ((c) > ' ' && (c) < 0177 \
378 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
379 || ((c) >= 'A' && (c) <= 'Z') \
380 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
381 : SYNTAX (c) != Sword)
382
0b32bf0e 383# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 384
0b32bf0e 385# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 386
0b32bf0e 387# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
388
389#else /* not emacs */
390
f71b19b6
DL
391/* Jim Meyering writes:
392
393 "... Some ctype macros are valid only for character codes that
394 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
395 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 396 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
397 STDC_HEADERS is defined, then autoconf has verified that the ctype
398 macros don't need to be guarded with references to isascii. ...
399 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
400 eliminate the && through constant folding."
401 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 402
4bb91c68 403# undef ISASCII
0b32bf0e
SM
404# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
405# define ISASCII(c) 1
406# else
407# define ISASCII(c) isascii(c)
408# endif
f71b19b6
DL
409
410/* 1 if C is an ASCII character. */
0b32bf0e 411# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
412
413/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
414# define ISUNIBYTE(c) 1
415
416# ifdef isblank
417# define ISBLANK(c) (ISASCII (c) && isblank (c))
418# else
419# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
420# endif
421# ifdef isgraph
422# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
423# else
424# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
425# endif
426
4bb91c68 427# undef ISPRINT
0b32bf0e
SM
428# define ISPRINT(c) (ISASCII (c) && isprint (c))
429# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
430# define ISALNUM(c) (ISASCII (c) && isalnum (c))
431# define ISALPHA(c) (ISASCII (c) && isalpha (c))
432# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
433# define ISLOWER(c) (ISASCII (c) && islower (c))
434# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
435# define ISSPACE(c) (ISASCII (c) && isspace (c))
436# define ISUPPER(c) (ISASCII (c) && isupper (c))
437# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
438
439# define ISWORD(c) ISALPHA(c)
440
4bb91c68
SM
441# ifdef _tolower
442# define TOLOWER(c) _tolower(c)
443# else
444# define TOLOWER(c) tolower(c)
445# endif
446
447/* How many characters in the character set. */
448# define CHAR_SET_SIZE 256
449
0b32bf0e 450# ifdef SYNTAX_TABLE
f71b19b6 451
0b32bf0e 452extern char *re_syntax_table;
f71b19b6 453
0b32bf0e
SM
454# else /* not SYNTAX_TABLE */
455
0b32bf0e
SM
456static char re_syntax_table[CHAR_SET_SIZE];
457
458static void
459init_syntax_once ()
460{
461 register int c;
462 static int done = 0;
463
464 if (done)
465 return;
466
467 bzero (re_syntax_table, sizeof re_syntax_table);
468
4bb91c68
SM
469 for (c = 0; c < CHAR_SET_SIZE; ++c)
470 if (ISALNUM (c))
471 re_syntax_table[c] = Sword;
fa9a63c5 472
669fa600 473 re_syntax_table['_'] = Ssymbol;
fa9a63c5 474
0b32bf0e
SM
475 done = 1;
476}
477
478# endif /* not SYNTAX_TABLE */
96cc36cc 479
4bb91c68
SM
480# define SYNTAX(c) re_syntax_table[(c)]
481
96cc36cc
RS
482#endif /* not emacs */
483\f
fa9a63c5 484#ifndef NULL
0b32bf0e 485# define NULL (void *)0
fa9a63c5
RM
486#endif
487
488/* We remove any previous definition of `SIGN_EXTEND_CHAR',
489 since ours (we hope) works properly with all combinations of
490 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 491 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
492#undef SIGN_EXTEND_CHAR
493#if __STDC__
0b32bf0e 494# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
495#else /* not __STDC__ */
496/* As in Harbison and Steele. */
0b32bf0e 497# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
498#endif
499\f
500/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
501 use `alloca' instead of `malloc'. This is because using malloc in
502 re_search* or re_match* could cause memory leaks when C-g is used in
503 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
504 the other hand, malloc is more portable, and easier to debug.
505
fa9a63c5
RM
506 Because we sometimes use alloca, some routines have to be macros,
507 not functions -- `alloca'-allocated space disappears at the end of the
508 function it is called in. */
509
510#ifdef REGEX_MALLOC
511
0b32bf0e
SM
512# define REGEX_ALLOCATE malloc
513# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
514# define REGEX_FREE free
fa9a63c5
RM
515
516#else /* not REGEX_MALLOC */
517
518/* Emacs already defines alloca, sometimes. */
0b32bf0e 519# ifndef alloca
fa9a63c5
RM
520
521/* Make alloca work the best possible way. */
0b32bf0e
SM
522# ifdef __GNUC__
523# define alloca __builtin_alloca
524# else /* not __GNUC__ */
7f585e7a 525# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
526# include <alloca.h>
527# endif /* HAVE_ALLOCA_H */
528# endif /* not __GNUC__ */
fa9a63c5 529
0b32bf0e 530# endif /* not alloca */
fa9a63c5 531
0b32bf0e 532# define REGEX_ALLOCATE alloca
fa9a63c5
RM
533
534/* Assumes a `char *destination' variable. */
0b32bf0e 535# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 536 (destination = (char *) alloca (nsize), \
4bb91c68 537 memcpy (destination, source, osize))
fa9a63c5
RM
538
539/* No need to do anything to free, after alloca. */
0b32bf0e 540# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
541
542#endif /* not REGEX_MALLOC */
543
544/* Define how to allocate the failure stack. */
545
0b32bf0e 546#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 547
0b32bf0e 548# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 549 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 550# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 551 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 552# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
553 r_alloc_free (&failure_stack_ptr)
554
4297555e 555#else /* not using relocating allocator */
fa9a63c5 556
0b32bf0e 557# ifdef REGEX_MALLOC
fa9a63c5 558
0b32bf0e
SM
559# define REGEX_ALLOCATE_STACK malloc
560# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
561# define REGEX_FREE_STACK free
fa9a63c5 562
0b32bf0e 563# else /* not REGEX_MALLOC */
fa9a63c5 564
0b32bf0e 565# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 566
0b32bf0e 567# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 568 REGEX_REALLOCATE (source, osize, nsize)
7814e705 569/* No need to explicitly free anything. */
0b32bf0e 570# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 571
0b32bf0e 572# endif /* not REGEX_MALLOC */
4297555e 573#endif /* not using relocating allocator */
fa9a63c5
RM
574
575
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
25fe55af 579#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
580 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
581
582/* (Re)Allocate N items of type T using malloc, or fail. */
583#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
584#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
585#define RETALLOC_IF(addr, n, t) \
586 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
587#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
588
4bb91c68 589#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
590
591#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
592
593#undef MAX
594#undef MIN
595#define MAX(a, b) ((a) > (b) ? (a) : (b))
596#define MIN(a, b) ((a) < (b) ? (a) : (b))
597
66f0296e
SM
598/* Type of source-pattern and string chars. */
599typedef const unsigned char re_char;
600
fa9a63c5
RM
601typedef char boolean;
602#define false 0
603#define true 1
604
4bb91c68
SM
605static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
606 re_char *string1, int size1,
607 re_char *string2, int size2,
608 int pos,
609 struct re_registers *regs,
610 int stop));
fa9a63c5
RM
611\f
612/* These are the command codes that appear in compiled regular
4bb91c68 613 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
614 command code can specify any interpretation whatsoever for its
615 arguments. Zero bytes may appear in the compiled regular expression. */
616
typedef enum
{
  no_op = 0,

  /* Succeed immediately; no more backtracking is done.  */
  succeed,

  /* Followed by one byte holding a count N, then by N literal bytes.  */
  exactn,

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one character that belongs to the specified set.
     The first following byte is the number of bitmap bytes; the
     bitmap itself comes next and says which characters are members.
     Bits within each byte are ordered low-bit-first, and a character
     is in the set if its bit is 1.  A character too large to have a
     bit in the map is automatically not in the set.

     If the length byte has the 0x80 bit set, the bitmap is followed
     by a range table:
	 2 bytes of flags for character sets (low 8 bits, high 8 bits);
	   see RANGE_TABLE_WORK_BITS below.
	 2 bytes, the number of pairs that follow (up to 32767)
	 the pairs, each made of 2 multibyte characters,
	   each multibyte character represented as 3 bytes.  */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is NOT one of those specified.  */
  charset_not,

  /* Begin remembering the matched text so it can be stored in a
     register.  Followed by one byte with the register number, in the
     range 0 to one less than the pattern buffer's re_nsub field.  */
  start_memory,

  /* Stop remembering the matched text and store it in a memory
     register.  Followed by one byte with the register number, in the
     range 0 to one less than `re_nsub' in the pattern buffer.  */
  stop_memory,

  /* Match a duplicate of something remembered.  Followed by one byte
     containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning of
     the string to be matched (if not).  */
  begbuf,

  /* Likewise, for end of buffer/string.  */
  endbuf,

  /* Followed by a two-byte relative address to which to jump.  */
  jump,

  /* Followed by a two-byte relative address of the place to resume
     at in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but when executed it pushes a placeholder
     instead of the current string position.  */
  on_failure_keep_string_jump,

  /* Just like `on_failure_jump', except that it checks that we don't
     get stuck in an infinite loop (matching an empty string
     indefinitely).  */
  on_failure_jump_loop,

  /* Just like `on_failure_jump_loop', except that it checks for a
     different kind of loop (the kind that shows up with non-greedy
     operators).  This operation has to be immediately preceded by a
     `no_op'.  */
  on_failure_jump_nastyloop,

  /* A smart `on_failure_jump' used for greedy * and + operators.  It
     analyses the loop before which it is put; if the loop does not
     require backtracking, it changes itself to
     `on_failure_keep_string_jump' and short-circuits the loop,
     otherwise it just defaults to changing itself into
     `on_failure_jump'.  It assumes that it is pointing to just past
     a `jump'.  */
  on_failure_jump_smart,

  /* Followed by a two-byte relative address and a two-byte number N.
     After matching N times, jump to the address upon failure.  Does
     not work if N starts at 0: use on_failure_jump_loop instead.  */
  succeed_n,

  /* Followed by a two-byte relative address and a two-byte number N.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the subsequent
     two-byte number.  The address *includes* the two bytes of
     number.  */
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound,	/* Succeeds if not at a word boundary.  */

  symbeg,	/* Succeeds if at symbol beginning.  */
  symend,	/* Succeeds if at symbol end.  */

  /* Matches any character whose syntax is the one specified.
     Followed by a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not the one specified.  */
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

  /* Matches any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
756\f
757/* Common operations on the compiled pattern. */
758
759/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
760
761#define STORE_NUMBER(destination, number) \
762 do { \
763 (destination)[0] = (number) & 0377; \
764 (destination)[1] = (number) >> 8; \
765 } while (0)
766
767/* Same as STORE_NUMBER, except increment DESTINATION to
768 the byte after where the number is stored. Therefore, DESTINATION
769 must be an lvalue. */
770
771#define STORE_NUMBER_AND_INCR(destination, number) \
772 do { \
773 STORE_NUMBER (destination, number); \
774 (destination) += 2; \
775 } while (0)
776
777/* Put into DESTINATION a number stored in two contiguous bytes starting
778 at SOURCE. */
779
/* DESTINATION receives the signed 16-bit value whose low byte is
   SOURCE[0] and whose sign-extended high byte is SOURCE[1].  SOURCE
   is evaluated twice.  The high byte is scaled by multiplying with
   256 rather than by `<< 8', because left-shifting a negative value
   is undefined behavior in ISO C.  */
#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
785
786#ifdef DEBUG
4bb91c68 787static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
788static void
789extract_number (dest, source)
790 int *dest;
01618498 791 re_char *source;
fa9a63c5 792{
5e69f11e 793 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
794 *dest = *source & 0377;
795 *dest += temp << 8;
796}
797
4bb91c68 798# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
799# undef EXTRACT_NUMBER
800# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
801# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
802
803#endif /* DEBUG */
804
805/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
806 SOURCE must be an lvalue. */
807
808#define EXTRACT_NUMBER_AND_INCR(destination, source) \
809 do { \
810 EXTRACT_NUMBER (destination, source); \
25fe55af 811 (source) += 2; \
fa9a63c5
RM
812 } while (0)
813
814#ifdef DEBUG
4bb91c68
SM
815static void extract_number_and_incr _RE_ARGS ((int *destination,
816 re_char **source));
fa9a63c5
RM
817static void
818extract_number_and_incr (destination, source)
819 int *destination;
01618498 820 re_char **source;
5e69f11e 821{
fa9a63c5
RM
822 extract_number (destination, *source);
823 *source += 2;
824}
825
0b32bf0e
SM
826# ifndef EXTRACT_MACROS
827# undef EXTRACT_NUMBER_AND_INCR
828# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 829 extract_number_and_incr (&dest, &src)
0b32bf0e 830# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
831
832#endif /* DEBUG */
833\f
b18215fc
RS
/* Store a multibyte character in three contiguous bytes starting at
   DESTINATION, and increment DESTINATION to the byte after where the
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
b18215fc
RS
837
838#define STORE_CHARACTER_AND_INCR(destination, character) \
839 do { \
840 (destination)[0] = (character) & 0377; \
841 (destination)[1] = ((character) >> 8) & 0377; \
842 (destination)[2] = (character) >> 16; \
843 (destination) += 3; \
844 } while (0)
845
846/* Put into DESTINATION a character stored in three contiguous bytes
7814e705 847 starting at SOURCE. */
b18215fc
RS
848
849#define EXTRACT_CHARACTER(destination, source) \
850 do { \
851 (destination) = ((source)[0] \
852 | ((source)[1] << 8) \
853 | ((source)[2] << 16)); \
854 } while (0)
855
856
857/* Macros for charset. */
858
859/* Size of bitmap of charset P in bytes. P is a start of charset,
860 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
861#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
862
863/* Nonzero if charset P has range table. */
25fe55af 864#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
865
/* Return the address of the range table of charset P.  This is not
   the start of the ranges themselves but the position just before
   them, where the number of ranges is stored.  `4 +' skips the
   re_opcode_t, the bitmap-size byte, and the 2 bytes of flags at the
   start of the range table.  */
870#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
871
872/* Extract the bit flags that start a range table. */
873#define CHARSET_RANGE_TABLE_BITS(p) \
874 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
875 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
876
877/* Test if C is listed in the bitmap of charset P. */
878#define CHARSET_LOOKUP_BITMAP(p, c) \
879 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
880 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
881
882/* Return the address of end of RANGE_TABLE. COUNT is number of
7814e705
JB
883 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
884 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
885 and end. */
886#define CHARSET_RANGE_TABLE_END(range_table, count) \
887 ((range_table) + (count) * 2 * 3)
888
/* Search RANGE_TABLE, which holds COUNT ranges, for a range containing
   C.  If one is found, the flag NOT is negated and the search stops;
   otherwise NOT is left alone.  The expansion declares locals named
   `range_start', `range_end', `p' and `range_table_end'.  */
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
  do \
    { \
      re_wchar_t range_start, range_end; \
      re_char *p; \
      re_char *range_table_end \
        = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
 \
      p = (range_table); \
      while (p < range_table_end) \
        { \
          /* Each entry is a 3-byte start followed by a 3-byte end.  */ \
          EXTRACT_CHARACTER (range_start, p); \
          EXTRACT_CHARACTER (range_end, p + 3); \
          p += 2 * 3; \
 \
          if (!((c) < range_start || range_end < (c))) \
            { \
              (not) = !(not); \
              break; \
            } \
        } \
    } \
  while (0)
912
/* Look C up in the range table of CHARSET and negate the flag NOT if C
   is listed there.  The number of ranges is read from the front of the
   table before the ranges themselves are searched.  */
#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
  do \
    { \
      re_char *range_table = CHARSET_RANGE_TABLE (charset); \
      /* Number of ranges in range table.  */ \
      int count; \
 \
      EXTRACT_NUMBER_AND_INCR (count, range_table); \
      CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
    } \
  while (0)
926\f
fa9a63c5
RM
927/* If DEBUG is defined, Regex prints many voluminous messages about what
928 it is doing (if the variable `debug' is nonzero). If linked with the
929 main program in `iregex.c', you can enter patterns and strings
930 interactively. And if linked with the main program in `main.c' and
4bb91c68 931 the other test files, you can run the already-written tests. */
fa9a63c5
RM
932
933#ifdef DEBUG
934
935/* We use standard I/O for debugging. */
0b32bf0e 936# include <stdio.h>
fa9a63c5
RM
937
938/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 939# include <assert.h>
fa9a63c5 940
99633e97 941static int debug = -100000;
fa9a63c5 942
0b32bf0e
SM
/* Debug-only output helpers.  Output is produced only while `debug' is
   positive.  Each printing macro is wrapped in `do ... while (0)' so it
   expands to exactly one statement: a bare `if' expansion would capture
   the `else' of an enclosing unbraced `if'.  Invoke them with a
   trailing semicolon.  */
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) \
  do { if (debug > 0) printf (x); } while (0)
# define DEBUG_PRINT2(x1, x2) \
  do { if (debug > 0) printf (x1, x2); } while (0)
# define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug > 0) printf (x1, x2, x3); } while (0)
# define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug > 0) printf (x1, x2, x3, x4); } while (0)
/* The first argument P is accepted but not used.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug > 0) print_partial_compiled_pattern (s, e); } while (0)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug > 0) print_double_string (w, s1, sz1, s2, sz2); } while (0)
fa9a63c5
RM
952
953
954/* Print the fastmap in human-readable form. */
955
956void
957print_fastmap (fastmap)
958 char *fastmap;
959{
960 unsigned was_a_range = 0;
5e69f11e
RM
961 unsigned i = 0;
962
fa9a63c5
RM
963 while (i < (1 << BYTEWIDTH))
964 {
965 if (fastmap[i++])
966 {
967 was_a_range = 0;
25fe55af
RS
968 putchar (i - 1);
969 while (i < (1 << BYTEWIDTH) && fastmap[i])
970 {
971 was_a_range = 1;
972 i++;
973 }
fa9a63c5 974 if (was_a_range)
25fe55af
RS
975 {
976 printf ("-");
977 putchar (i - 1);
978 }
979 }
fa9a63c5 980 }
5e69f11e 981 putchar ('\n');
fa9a63c5
RM
982}
983
984
985/* Print a compiled pattern string in human-readable form, starting at
986 the START pointer into it and ending just before the pointer END. */
987
988void
989print_partial_compiled_pattern (start, end)
01618498
SM
990 re_char *start;
991 re_char *end;
fa9a63c5
RM
992{
993 int mcnt, mcnt2;
01618498
SM
994 re_char *p = start;
995 re_char *pend = end;
fa9a63c5
RM
996
997 if (start == NULL)
998 {
a1a052df 999 fprintf (stderr, "(null)\n");
fa9a63c5
RM
1000 return;
1001 }
5e69f11e 1002
fa9a63c5
RM
1003 /* Loop over pattern commands. */
1004 while (p < pend)
1005 {
a1a052df 1006 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
1007
1008 switch ((re_opcode_t) *p++)
1009 {
25fe55af 1010 case no_op:
a1a052df 1011 fprintf (stderr, "/no_op");
25fe55af 1012 break;
fa9a63c5 1013
99633e97 1014 case succeed:
a1a052df 1015 fprintf (stderr, "/succeed");
99633e97
SM
1016 break;
1017
fa9a63c5
RM
1018 case exactn:
1019 mcnt = *p++;
a1a052df 1020 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 1021 do
fa9a63c5 1022 {
a1a052df 1023 fprintf (stderr, "/%c", *p++);
25fe55af
RS
1024 }
1025 while (--mcnt);
1026 break;
fa9a63c5
RM
1027
1028 case start_memory:
a1a052df 1029 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1030 break;
fa9a63c5
RM
1031
1032 case stop_memory:
a1a052df 1033 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1034 break;
fa9a63c5
RM
1035
1036 case duplicate:
a1a052df 1037 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1038 break;
1039
1040 case anychar:
a1a052df 1041 fprintf (stderr, "/anychar");
fa9a63c5
RM
1042 break;
1043
1044 case charset:
25fe55af
RS
1045 case charset_not:
1046 {
1047 register int c, last = -100;
fa9a63c5 1048 register int in_range = 0;
99633e97
SM
1049 int length = CHARSET_BITMAP_SIZE (p - 1);
1050 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1051
a1a052df 1052 fprintf (stderr, "/charset [%s",
839966f3 1053 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1054
839966f3
KH
1055 if (p + *p >= pend)
1056 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1057
25fe55af 1058 for (c = 0; c < 256; c++)
96cc36cc 1059 if (c / 8 < length
fa9a63c5
RM
1060 && (p[1 + (c/8)] & (1 << (c % 8))))
1061 {
1062 /* Are we starting a range? */
1063 if (last + 1 == c && ! in_range)
1064 {
a1a052df 1065 fprintf (stderr, "-");
fa9a63c5
RM
1066 in_range = 1;
1067 }
1068 /* Have we broken a range? */
1069 else if (last + 1 != c && in_range)
96cc36cc 1070 {
a1a052df 1071 fprintf (stderr, "%c", last);
fa9a63c5
RM
1072 in_range = 0;
1073 }
5e69f11e 1074
fa9a63c5 1075 if (! in_range)
a1a052df 1076 fprintf (stderr, "%c", c);
fa9a63c5
RM
1077
1078 last = c;
25fe55af 1079 }
fa9a63c5
RM
1080
1081 if (in_range)
a1a052df 1082 fprintf (stderr, "%c", last);
fa9a63c5 1083
a1a052df 1084 fprintf (stderr, "]");
fa9a63c5 1085
99633e97 1086 p += 1 + length;
96cc36cc 1087
96cc36cc 1088 if (has_range_table)
99633e97
SM
1089 {
1090 int count;
a1a052df 1091 fprintf (stderr, "has-range-table");
99633e97
SM
1092
1093 /* ??? Should print the range table; for now, just skip it. */
1094 p += 2; /* skip range table bits */
1095 EXTRACT_NUMBER_AND_INCR (count, p);
1096 p = CHARSET_RANGE_TABLE_END (p, count);
1097 }
fa9a63c5
RM
1098 }
1099 break;
1100
1101 case begline:
a1a052df 1102 fprintf (stderr, "/begline");
25fe55af 1103 break;
fa9a63c5
RM
1104
1105 case endline:
a1a052df 1106 fprintf (stderr, "/endline");
25fe55af 1107 break;
fa9a63c5
RM
1108
1109 case on_failure_jump:
25fe55af 1110 extract_number_and_incr (&mcnt, &p);
a1a052df 1111 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1112 break;
fa9a63c5
RM
1113
1114 case on_failure_keep_string_jump:
25fe55af 1115 extract_number_and_incr (&mcnt, &p);
a1a052df 1116 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1117 break;
fa9a63c5 1118
0683b6fa
SM
1119 case on_failure_jump_nastyloop:
1120 extract_number_and_incr (&mcnt, &p);
a1a052df 1121 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1122 break;
1123
505bde11 1124 case on_failure_jump_loop:
fa9a63c5 1125 extract_number_and_incr (&mcnt, &p);
a1a052df 1126 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1127 break;
1128
505bde11 1129 case on_failure_jump_smart:
fa9a63c5 1130 extract_number_and_incr (&mcnt, &p);
a1a052df 1131 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1132 break;
1133
25fe55af 1134 case jump:
fa9a63c5 1135 extract_number_and_incr (&mcnt, &p);
a1a052df 1136 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1137 break;
1138
25fe55af
RS
1139 case succeed_n:
1140 extract_number_and_incr (&mcnt, &p);
1141 extract_number_and_incr (&mcnt2, &p);
a1a052df 1142 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1143 break;
5e69f11e 1144
25fe55af
RS
1145 case jump_n:
1146 extract_number_and_incr (&mcnt, &p);
1147 extract_number_and_incr (&mcnt2, &p);
a1a052df 1148 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1149 break;
5e69f11e 1150
25fe55af
RS
1151 case set_number_at:
1152 extract_number_and_incr (&mcnt, &p);
1153 extract_number_and_incr (&mcnt2, &p);
a1a052df 1154 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1155 break;
5e69f11e 1156
25fe55af 1157 case wordbound:
a1a052df 1158 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1159 break;
1160
1161 case notwordbound:
a1a052df 1162 fprintf (stderr, "/notwordbound");
25fe55af 1163 break;
fa9a63c5
RM
1164
1165 case wordbeg:
a1a052df 1166 fprintf (stderr, "/wordbeg");
fa9a63c5 1167 break;
5e69f11e 1168
fa9a63c5 1169 case wordend:
a1a052df 1170 fprintf (stderr, "/wordend");
e2543b02 1171 break;
5e69f11e 1172
669fa600 1173 case symbeg:
e2543b02 1174 fprintf (stderr, "/symbeg");
669fa600
SM
1175 break;
1176
1177 case symend:
e2543b02 1178 fprintf (stderr, "/symend");
669fa600 1179 break;
5e69f11e 1180
1fb352e0 1181 case syntaxspec:
a1a052df 1182 fprintf (stderr, "/syntaxspec");
1fb352e0 1183 mcnt = *p++;
a1a052df 1184 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1185 break;
1186
1187 case notsyntaxspec:
a1a052df 1188 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1189 mcnt = *p++;
a1a052df 1190 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1191 break;
1192
0b32bf0e 1193# ifdef emacs
fa9a63c5 1194 case before_dot:
a1a052df 1195 fprintf (stderr, "/before_dot");
25fe55af 1196 break;
fa9a63c5
RM
1197
1198 case at_dot:
a1a052df 1199 fprintf (stderr, "/at_dot");
25fe55af 1200 break;
fa9a63c5
RM
1201
1202 case after_dot:
a1a052df 1203 fprintf (stderr, "/after_dot");
25fe55af 1204 break;
fa9a63c5 1205
1fb352e0 1206 case categoryspec:
a1a052df 1207 fprintf (stderr, "/categoryspec");
fa9a63c5 1208 mcnt = *p++;
a1a052df 1209 fprintf (stderr, "/%d", mcnt);
25fe55af 1210 break;
5e69f11e 1211
1fb352e0 1212 case notcategoryspec:
a1a052df 1213 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1214 mcnt = *p++;
a1a052df 1215 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1216 break;
0b32bf0e 1217# endif /* emacs */
fa9a63c5 1218
fa9a63c5 1219 case begbuf:
a1a052df 1220 fprintf (stderr, "/begbuf");
25fe55af 1221 break;
fa9a63c5
RM
1222
1223 case endbuf:
a1a052df 1224 fprintf (stderr, "/endbuf");
25fe55af 1225 break;
fa9a63c5 1226
25fe55af 1227 default:
a1a052df 1228 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1229 }
1230
a1a052df 1231 fprintf (stderr, "\n");
fa9a63c5
RM
1232 }
1233
a1a052df 1234 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1235}
1236
1237
1238void
1239print_compiled_pattern (bufp)
1240 struct re_pattern_buffer *bufp;
1241{
01618498 1242 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1243
1244 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1245 printf ("%ld bytes used/%ld bytes allocated.\n",
1246 bufp->used, bufp->allocated);
fa9a63c5
RM
1247
1248 if (bufp->fastmap_accurate && bufp->fastmap)
1249 {
1250 printf ("fastmap: ");
1251 print_fastmap (bufp->fastmap);
1252 }
1253
1254 printf ("re_nsub: %d\t", bufp->re_nsub);
1255 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1256 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1257 printf ("no_sub: %d\t", bufp->no_sub);
1258 printf ("not_bol: %d\t", bufp->not_bol);
1259 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1260 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1261 fflush (stdout);
fa9a63c5
RM
1262 /* Perhaps we should print the translate table? */
1263}
1264
1265
1266void
1267print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1268 re_char *where;
1269 re_char *string1;
1270 re_char *string2;
fa9a63c5
RM
1271 int size1;
1272 int size2;
1273{
4bb91c68 1274 int this_char;
5e69f11e 1275
fa9a63c5
RM
1276 if (where == NULL)
1277 printf ("(null)");
1278 else
1279 {
1280 if (FIRST_STRING_P (where))
25fe55af
RS
1281 {
1282 for (this_char = where - string1; this_char < size1; this_char++)
1283 putchar (string1[this_char]);
fa9a63c5 1284
25fe55af
RS
1285 where = string2;
1286 }
fa9a63c5
RM
1287
1288 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1289 putchar (string2[this_char]);
fa9a63c5
RM
1290 }
1291}
1292
1293#else /* not DEBUG */
1294
0b32bf0e
SM
1295# undef assert
1296# define assert(e)
fa9a63c5 1297
0b32bf0e
SM
1298# define DEBUG_STATEMENT(e)
1299# define DEBUG_PRINT1(x)
1300# define DEBUG_PRINT2(x1, x2)
1301# define DEBUG_PRINT3(x1, x2, x3)
1302# define DEBUG_PRINT4(x1, x2, x3, x4)
1303# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1304# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1305
1306#endif /* not DEBUG */
1307\f
1308/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1309 also be assigned to arbitrarily: each pattern buffer stores its own
1310 syntax, so it can be changed between regex compilations. */
1311/* This has no initializer because initialized variables in Emacs
1312 become read-only after dumping. */
1313reg_syntax_t re_syntax_options;
1314
1315
1316/* Specify the precise syntax of regexps for compilation. This provides
1317 for compatibility for various utilities which historically have
1318 different, incompatible syntaxes.
1319
1320 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1321 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1322
1323reg_syntax_t
1324re_set_syntax (syntax)
f9b0fd99 1325 reg_syntax_t syntax;
fa9a63c5
RM
1326{
1327 reg_syntax_t ret = re_syntax_options;
5e69f11e 1328
fa9a63c5
RM
1329 re_syntax_options = syntax;
1330 return ret;
1331}
c0f9ea08 1332WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1333
1334/* Regexp to use to replace spaces, or NULL meaning don't. */
1335static re_char *whitespace_regexp;
1336
1337void
1338re_set_whitespace_regexp (regexp)
6470ea05 1339 const char *regexp;
f9b0fd99 1340{
6470ea05 1341 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1342}
1343WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1344\f
1345/* This table gives an error message for each of the error codes listed
4bb91c68 1346 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1347 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1348 but why not be nice? */
fa9a63c5
RM
1349
1350static const char *re_error_msgid[] =
5e69f11e
RM
1351 {
1352 gettext_noop ("Success"), /* REG_NOERROR */
1353 gettext_noop ("No match"), /* REG_NOMATCH */
1354 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1355 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1356 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1357 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1358 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1359 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1360 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1361 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1362 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1363 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1364 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1365 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1366 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1367 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1368 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1369 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1370 };
1371\f
4bb91c68 1372/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1373
1374/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1375 searching and matching functions should not call alloca. On some
1376 systems, alloca is implemented in terms of malloc, and if we're
1377 using the relocating allocator routines, then malloc could cause a
1378 relocation, which might (if the strings being searched are in the
1379 ralloc heap) shift the data out from underneath the regexp
1380 routines.
1381
5e69f11e 1382 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1383 processes input from X in a signal handler; processing X input may
1384 call malloc; if input arrives while a matching routine is calling
1385 malloc, then we're scrod. But Emacs can't just block input while
1386 calling matching routines; then we don't notice interrupts when
1387 they come in. So, Emacs blocks input around all regexp calls
1388 except the matching calls, which it leaves unprotected, in the
1389 faith that they will not malloc. */
1390
1391/* Normally, this is fine. */
1392#define MATCH_MAY_ALLOCATE
1393
fa9a63c5
RM
1394/* The match routines may not allocate if (1) they would do it with malloc
1395 and (2) it's not safe for them to use malloc.
1396 Note that if REL_ALLOC is defined, matching would not use malloc for the
1397 failure stack, but we would still use it for the register vectors;
4bb91c68 1398 so REL_ALLOC should not affect this. */
b588157e 1399#if defined REGEX_MALLOC && defined emacs
0b32bf0e 1400# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1401#endif
1402
1403\f
1404/* Failure stack declarations and macros; both re_compile_fastmap and
1405 re_match_2 use a failure stack. These have to be macros because of
1406 REGEX_ALLOCATE_STACK. */
5e69f11e 1407
fa9a63c5 1408
320a2a73 1409/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1410 when matching. If this number is exceeded, we allocate more
1411 space, so it is not a hard limit. */
1412#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1413# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1414#endif
1415
1416/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1417 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1418 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1419 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1420 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1421# if defined MATCH_MAY_ALLOCATE
1422/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1423 whose default stack limit is 2mb. In order for a larger
1424 value to work reliably, you have to try to make it accord
1425 with the process stack limit. */
c0f9ea08
SM
1426size_t re_max_failures = 40000;
1427# else
1428size_t re_max_failures = 4000;
1429# endif
fa9a63c5
RM
1430
1431union fail_stack_elt
1432{
01618498 1433 re_char *pointer;
c0f9ea08
SM
1434 /* This should be the biggest `int' that's no bigger than a pointer. */
1435 long integer;
fa9a63c5
RM
1436};
1437
1438typedef union fail_stack_elt fail_stack_elt_t;
1439
1440typedef struct
1441{
1442 fail_stack_elt_t *stack;
c0f9ea08
SM
1443 size_t size;
1444 size_t avail; /* Offset of next open position. */
1445 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1446} fail_stack_type;
1447
505bde11 1448#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1449#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1450
1451
1452/* Define macros to initialize and free the failure stack.
1453 Do `return -2' if the alloc fails. */
1454
1455#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1456# define INIT_FAIL_STACK() \
fa9a63c5
RM
1457 do { \
1458 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1459 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1460 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1461 \
1462 if (fail_stack.stack == NULL) \
1463 return -2; \
1464 \
1465 fail_stack.size = INIT_FAILURE_ALLOC; \
1466 fail_stack.avail = 0; \
505bde11 1467 fail_stack.frame = 0; \
fa9a63c5
RM
1468 } while (0)
1469
0b32bf0e 1470# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1471#else
0b32bf0e 1472# define INIT_FAIL_STACK() \
fa9a63c5
RM
1473 do { \
1474 fail_stack.avail = 0; \
505bde11 1475 fail_stack.frame = 0; \
fa9a63c5
RM
1476 } while (0)
1477
0b32bf0e 1478# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1479#endif
1480
1481
320a2a73
KH
/* Grow FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a limit
   which allows approximately `re_max_failures' items.

   Return 1 if it succeeds, and 0 if it either ran out of memory
   allocating space for the stack or the stack was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */
fa9a63c5 1489
320a2a73
KH
/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
1495#define FAIL_STACK_GROWTH_FACTOR 4
1496
1497#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1498 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1499 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1500 ? 0 \
320a2a73
KH
1501 : ((fail_stack).stack \
1502 = (fail_stack_elt_t *) \
25fe55af
RS
1503 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1504 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1505 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1506 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1507 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1508 \
1509 (fail_stack).stack == NULL \
1510 ? 0 \
6453db45
KH
1511 : ((fail_stack).size \
1512 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1513 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1514 * FAIL_STACK_GROWTH_FACTOR)) \
1515 / sizeof (fail_stack_elt_t)), \
25fe55af 1516 1)))
fa9a63c5
RM
1517
1518
fa9a63c5
RM
1519/* Push a pointer value onto the failure stack.
1520 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1521 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1522#define PUSH_FAILURE_POINTER(item) \
01618498 1523 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1524
1525/* This pushes an integer-valued item onto the failure stack.
1526 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1527 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1528#define PUSH_FAILURE_INT(item) \
1529 fail_stack.stack[fail_stack.avail++].integer = (item)
1530
1531/* Push a fail_stack_elt_t value onto the failure stack.
1532 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1533 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1534#define PUSH_FAILURE_ELT(item) \
1535 fail_stack.stack[fail_stack.avail++] = (item)
1536
1537/* These three POP... operations complement the three PUSH... operations.
1538 All assume that `fail_stack' is nonempty. */
1539#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1540#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1541#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1542
505bde11
SM
1543/* Individual items aside from the registers. */
1544#define NUM_NONREG_ITEMS 3
1545
1546/* Used to examine the stack (to detect infinite loops). */
1547#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1548#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1549#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1550#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1551
1552
505bde11
SM
/* Make sure the failure stack has more than SPACE free slots, growing
   it as many times as necessary.  Does `return -2' from the enclosing
   function if the stack cannot be grown.

   The expansion is wrapped in `do ... while (0)' so that it is a single
   statement and can be followed by a semicolon anywhere, including the
   unbraced arm of an `if'.  The sizes are size_t, so they are converted
   to int to match the %d conversions in the debug output.  */
#define ENSURE_FAIL_STACK(space) \
do { \
  while (REMAINING_AVAIL_SLOTS <= (space)) \
    { \
      if (!GROW_FAIL_STACK (fail_stack)) \
        return -2; \
      DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
                    (int) (fail_stack).size); \
      DEBUG_PRINT2 (" slots available: %d\n", \
                    (int) REMAINING_AVAIL_SLOTS); \
    } \
} while (0)
1560
1561/* Push register NUM onto the stack. */
1562#define PUSH_FAILURE_REG(num) \
1563do { \
1564 char *destination; \
1565 ENSURE_FAIL_STACK(3); \
1566 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1567 num, regstart[num], regend[num]); \
1568 PUSH_FAILURE_POINTER (regstart[num]); \
1569 PUSH_FAILURE_POINTER (regend[num]); \
1570 PUSH_FAILURE_INT (num); \
1571} while (0)
1572
01618498
SM
1573/* Change the counter's value to VAL, but make sure that it will
1574 be reset when backtracking. */
1575#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1576do { \
1577 char *destination; \
1578 int c; \
1579 ENSURE_FAIL_STACK(3); \
1580 EXTRACT_NUMBER (c, ptr); \
01618498 1581 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1582 PUSH_FAILURE_INT (c); \
1583 PUSH_FAILURE_POINTER (ptr); \
1584 PUSH_FAILURE_INT (-1); \
01618498 1585 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1586} while (0)
1587
505bde11 1588/* Pop a saved register off the stack. */
dc1e502d 1589#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1590do { \
1591 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1592 if (reg == -1) \
1593 { \
1594 /* It's a counter. */ \
6dcf2d0e
SM
1595 /* Here, we discard `const', making re_match non-reentrant. */ \
1596 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1597 reg = POP_FAILURE_INT (); \
1598 STORE_NUMBER (ptr, reg); \
1599 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1600 } \
1601 else \
1602 { \
1603 regend[reg] = POP_FAILURE_POINTER (); \
1604 regstart[reg] = POP_FAILURE_POINTER (); \
1605 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1606 reg, regstart[reg], regend[reg]); \
1607 } \
505bde11
SM
1608} while (0)
1609
/* Check that we are not stuck in an infinite loop.

   Walk the chain of failure frames, newest first, for as long as each
   frame's saved string position is STRING_PLACE or NULL.  If one of
   those frames saved PAT_CUR as its pattern position, set `cycle' to 1
   and stop.  Uses the variables `fail_stack' and `cycle', and `bufp'
   in the assertion.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
do { \
  int failure = TOP_FAILURE_HANDLE (); \
  /* Check for infinite matching loops */ \
  while (failure > 0 \
         && (FAILURE_STR (failure) == string_place \
             || FAILURE_STR (failure) == NULL)) \
    { \
      assert (FAILURE_PAT (failure) >= bufp->buffer \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
      if (FAILURE_PAT (failure) == pat_cur) \
        { \
          cycle = 1; \
          break; \
        } \
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
      failure = NEXT_FAILURE_HANDLE(failure); \
    } \
  /* When the chain is exhausted FAILURE is 0 and FAILURE_STR would \
     index below the start of the stack, so report only a frame that \
     actually exists.  */ \
  if (failure > 0) \
    { \
      DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
    } \
} while (0)
6df42991 1631
/* Push the information about the state we will need
   if we ever fail back to it.

   Three items are pushed: the current frame index, STRING_PLACE and
   PATTERN, in that order; the frame index is then moved past them.

   Requires the variable `fail_stack'; the debugging output also uses
   `nfailure_points_pushed', `string1', `size1', `string2', `size2' and
   `pend'.  `destination' is declared here because GROW_FAIL_STACK
   requires it.

   Does `return -2' (through ENSURE_FAIL_STACK) if it runs out of
   memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place) \
do { \
  char *destination; \
 \
  DEBUG_STATEMENT (nfailure_points_pushed++); \
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
  /* The stack offsets are size_t; convert them to match %d.  */ \
  DEBUG_PRINT2 (" Before push, next avail: %d\n", \
                (int) (fail_stack).avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) (fail_stack).size); \
 \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
 \
  DEBUG_PRINT1 ("\n"); \
 \
  DEBUG_PRINT2 (" Push frame index: %d\n", (int) fail_stack.frame); \
  PUSH_FAILURE_INT (fail_stack.frame); \
 \
  DEBUG_PRINT2 (" Push string %p: `", string_place); \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
  PUSH_FAILURE_POINTER (string_place); \
 \
  DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
  PUSH_FAILURE_POINTER (pattern); \
 \
  /* Close the frame by moving the frame pointer past it.  */ \
  fail_stack.frame = fail_stack.avail; \
} while (0)
fa9a63c5 1671
320a2a73
KH
1672/* Estimate the size of data pushed by a typical failure stack entry.
1673 An estimate is all we need, because all we use this for
1674 is to choose a limit for how big to make the failure stack. */
ada30c0e 1675/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1676#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1677
fa9a63c5
RM
1678/* How many items can still be added to the stack without overflowing it. */
1679#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1680
1681
/* Pops what PUSH_FAILURE_POINT pushes.

   Anything saved above the current frame (registers or counters) is
   restored first with POP_FAILURE_REG_OR_COUNT.  Then we restore into
   the parameters, both of which should be lvalues:
     PAT -- the saved pattern position.
     STR -- the saved data position.
   Finally the frame index is reset to the value saved in the frame.

   Also assumes the variable `fail_stack' and (if debugging) `pend',
   `string1', `size1', `string2', `size2' and
   `nfailure_points_popped'.  */

#define POP_FAILURE_POINT(str, pat) \
do { \
  assert (!FAIL_STACK_EMPTY ()); \
 \
  /* Remove failure points and point to how many regs pushed.  */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  /* The stack offsets are size_t; convert them to match %d.  */ \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", (int) fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) fail_stack.size); \
 \
  /* Pop the saved registers.  */ \
  while (fail_stack.frame < fail_stack.avail) \
    POP_FAILURE_REG_OR_COUNT (); \
 \
  pat = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
 \
  /* NOTE(review): the historical comment here said that a NULL saved \
     string location came from an on_failure_keep_string_jump opcode \
     and should be thrown away, retaining the current position in the \
     string.  This macro stores the popped value into STR \
     unconditionally -- confirm that callers handle NULL.  */ \
  str = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
 \
  fail_stack.frame = POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping frame index: %d\n", (int) fail_stack.frame); \
 \
  /* `avail' is unsigned, so only the frame/avail relation is worth \
     asserting; a `>= 0' test on it could never fail.  */ \
  assert (fail_stack.frame <= fail_stack.avail); \
 \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1725
1726
1727\f
fa9a63c5 1728/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1729#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1730\f
1731/* Subroutine declarations and macros for regex_compile. */
1732
4bb91c68
SM
1733static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1734 reg_syntax_t syntax,
1735 struct re_pattern_buffer *bufp));
1736static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1737static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1738 int arg1, int arg2));
1739static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1740 int arg, unsigned char *end));
1741static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1742 int arg1, int arg2, unsigned char *end));
01618498
SM
1743static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1744 re_char *p,
4bb91c68 1745 reg_syntax_t syntax));
01618498
SM
1746static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1747 re_char *pend,
4bb91c68 1748 reg_syntax_t syntax));
01618498
SM
1749static re_char *skip_one_char _RE_ARGS ((re_char *p));
1750static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1751 char *fastmap, const int multibyte));
fa9a63c5 1752
/* Read the next character of the uncompiled pattern into C, with no
   translation, and advance `p' past it.  If the pattern is already
   exhausted (p == pend), make the enclosing function return REG_EEND.
   Relies on the locals `p', `pend' and `multibyte' of the caller.  */
#define PATFETCH(c) \
  do { \
    int patfetch_len; \
    if (p == pend) \
      return REG_EEND; \
    c = RE_STRING_CHAR_AND_LENGTH (p, patfetch_len, multibyte); \
    p += patfetch_len; \
  } while (0)
1762
fa9a63c5
RM
1763
/* TRANSLATE (D) yields translate[D] when `translate' is non-null and
   plain D otherwise.  The subscript to translate is cast because some
   data is declared as `char *' (to avoid warnings when a string
   constant is passed), whereas a character used as a subscript must
   be unsigned.  */
#ifndef TRANSLATE
# define TRANSLATE(d) \
  (!RE_TRANSLATE_P (translate) ? (d) : RE_TRANSLATE (translate, (d)))
#endif
fa9a63c5
RM
1772
1773
/* Macros that append to the compiled pattern held in `bufp->buffer'.
   They all rely on the caller's local `b', the current end of the
   compiled pattern.  */

/* Size given to the buffer when the caller passed in none.  */
#define INIT_BUF_SIZE 32

/* Keep extending the buffer until at least N more bytes fit after `b'.  */
#define GET_BUFFER_SPACE(n) \
    while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
      EXTEND_BUFFER ()

/* Reserve one byte and append C.  */
#define BUF_PUSH(c) \
  do { \
    GET_BUFFER_SPACE (1); \
    *b++ = (unsigned char) (c); \
  } while (0)


/* Reserve two bytes and append C1 followed by C2.  */
#define BUF_PUSH_2(c1, c2) \
  do { \
    GET_BUFFER_SPACE (2); \
    *b++ = (unsigned char) (c1); \
    *b++ = (unsigned char) (c2); \
  } while (0)


/* Reserve three bytes and append C1, C2 and C3 in that order.  */
#define BUF_PUSH_3(c1, c2, c3) \
  do { \
    GET_BUFFER_SPACE (3); \
    *b++ = (unsigned char) (c1); \
    *b++ = (unsigned char) (c2); \
    *b++ = (unsigned char) (c3); \
  } while (0)
1809
1810
/* Write a jump with opcode OP at LOC whose target is TO.  The stored
   address is relative and is offset by the three bytes that the jump
   itself occupies.  */
#define STORE_JUMP(op, loc, to) \
  store_op1 (op, loc, (to) - (loc) - 3)

/* Same, for a jump carrying the extra argument ARG.  */
#define STORE_JUMP2(op, loc, to, arg) \
  store_op2 (op, loc, (to) - (loc) - 3, arg)

/* Inserting counterpart of `STORE_JUMP'; `b' is taken to be the end
   of the buffer.  */
#define INSERT_JUMP(op, loc, to) \
  insert_op1 (op, loc, (to) - (loc) - 3, b)

/* Inserting counterpart of `STORE_JUMP2'; `b' is taken to be the end
   of the buffer.  */
#define INSERT_JUMP2(op, loc, to, arg) \
  insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1827
1828
/* Upper bound on the size of a compiled pattern.  The limit is not
   arbitrary: arguments that represent offsets into the pattern are
   two bytes long, so if 2^15 bytes turns out to be too small, many
   things would have to change.  */
#define MAX_BUF_SIZE (1L << 15)

#if 0 /* This is when we thought it could be 2^16 bytes.  */
/* Any other compiler which, like MSC, has an allocation limit below
   2^16 bytes will have to use an approach similar to what was done
   below for MSC and drop MAX_BUF_SIZE a bit.  Otherwise you may end up
   reallocating to 0 bytes, which is not going to work too well.
   You have been warned!!  */
#if defined _MSC_VER && !defined WIN32
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.  */
# define MAX_BUF_SIZE 65500L
#else
# define MAX_BUF_SIZE (1L << 16)
#endif
#endif /* 0 */
fa9a63c5
RM
1847
/* Double the size of the compiled-pattern buffer (capped at
   MAX_BUF_SIZE) with RETALLOC, then re-point every pointer that
   pointed into the old block at the corresponding place in the new
   one.  If the buffer is already MAX_BUF_SIZE bytes long, the
   enclosing function returns REG_ESIZE; if the reallocation yields a
   null buffer, it returns REG_ESPACE.

   Relies on the caller's locals `b', `begalt', `fixup_alt_jump',
   `laststart' and `pending_exact'.

   NOTE(review): both error exits are plain `return's rather than
   FREE_STACK_RETURN -- confirm that the caller's compile stack is not
   leaked on these paths.  */
#if __BOUNDED_POINTERS__
# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
# define MOVE_BUFFER_POINTER(P) \
  (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
   SET_HIGH_BOUND (P), \
   __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
  else \
    { \
      SET_HIGH_BOUND (b); \
      SET_HIGH_BOUND (begalt); \
      if (fixup_alt_jump) \
        SET_HIGH_BOUND (fixup_alt_jump); \
      if (laststart) \
        SET_HIGH_BOUND (laststart); \
      if (pending_exact) \
        SET_HIGH_BOUND (pending_exact); \
    }
#else
# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
#define EXTEND_BUFFER() \
  do { \
    unsigned char *old_buffer = bufp->buffer; \
    if (bufp->allocated == MAX_BUF_SIZE) \
      return REG_ESIZE; \
    bufp->allocated <<= 1; \
    if (bufp->allocated > MAX_BUF_SIZE) \
      bufp->allocated = MAX_BUF_SIZE; \
    RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
    if (bufp->buffer == NULL) \
      return REG_ESPACE; \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != bufp->buffer) \
      { \
        unsigned char *new_buffer = bufp->buffer; \
        MOVE_BUFFER_POINTER (b); \
        MOVE_BUFFER_POINTER (begalt); \
        if (fixup_alt_jump) \
          MOVE_BUFFER_POINTER (fixup_alt_jump); \
        if (laststart) \
          MOVE_BUFFER_POINTER (laststart); \
        if (pending_exact) \
          MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
1900
1901
/* One byte is reserved for the register number argument of
   {start,stop}_memory, so the largest group number we can report
   things about is whatever fits in that byte.  */
#define MAX_REGNUM 255

/* Patterns may nevertheless contain more than `MAX_REGNUM' registers;
   the excess is simply ignored.  */
typedef int regnum_t;


/* Types and macros for the compile stack.  */

/* Offsets can go either forwards or backwards, so this type must be
   able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
   An int may not be enough when sizeof (int) == 2.  */
typedef long pattern_offset_t;

/* One saved entry of the compile stack.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack itself.  */
typedef struct
{
  compile_stack_elt_t *stack;
  unsigned size;
  unsigned avail;               /* Offset of next open position.  */
} compile_stack_type;


#define INIT_COMPILE_STACK_SIZE 32

#define COMPILE_STACK_EMPTY (0 == compile_stack.avail)
#define COMPILE_STACK_FULL (compile_stack.size == compile_stack.avail)

/* The next available element.  */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1943
1cee1e27
SM
/* Explicit quit checking is needed only on NTemacs and whenever
   polling is used to process input events.  Everywhere else
   IMMEDIATE_QUIT_CHECK expands to a no-op expression.  */
#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
extern int immediate_quit;
# define IMMEDIATE_QUIT_CHECK \
    do { \
      if (immediate_quit) \
        QUIT; \
    } while (0)
#else
# define IMMEDIATE_QUIT_CHECK ((void) 0)
#endif
1955\f
b18215fc
RS
/* Work area in which the range table of a charset is built up.  */
struct range_table_work_area
{
  int *table;                   /* Actual work area.  */
  int allocated;                /* Allocated size for work area in bytes.  */
  int used;                     /* Actually used size in words.  */
  int bits;                     /* Flag to record character classes.  */
};

/* Make sure that WORK_AREA has room for N more table entries.
   WORK_AREA must be a `struct range_table_work_area' lvalue, not a
   pointer: its members are accessed with `.' and its address is
   handed to extend_range_table_work_area.
   If the space cannot be obtained (the table comes back null), the
   enclosing function returns REG_ESPACE.  */

#define EXTEND_RANGE_TABLE(work_area, n) \
  do { \
    if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
      { \
        extend_range_table_work_area (&(work_area)); \
        if ((work_area).table == 0) \
          return (REG_ESPACE); \
      } \
  } while (0)

/* Turn on BIT in the character-class flags of WORK_AREA.  The whole
   expansion is parenthesized so that it can be used safely inside a
   larger expression.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
  ((work_area).bits |= (bit))

/* Bits used to implement the multibyte-part of the various character classes
   such as [:alnum:] in a charset's range table.  */
#define BIT_WORD        0x1
#define BIT_LOWER       0x2
#define BIT_PUNCT       0x4
#define BIT_SPACE       0x8
#define BIT_UPPER       0x10
#define BIT_MULTIBYTE   0x20

/* Append the range (RANGE_START, RANGE_END) to WORK_AREA, growing the
   table first when needed (see EXTEND_RANGE_TABLE for the failure
   behavior).  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
  do { \
    EXTEND_RANGE_TABLE ((work_area), 2); \
    (work_area).table[(work_area).used++] = (range_start); \
    (work_area).table[(work_area).used++] = (range_end); \
  } while (0)

/* Free allocated memory for WORK_AREA.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
  do { \
    if ((work_area).table) \
      free ((work_area).table); \
  } while (0)

/* Accessors for the bookkeeping fields of WORK_AREA.  */
#define CLEAR_RANGE_TABLE_WORK_USED(work_area) \
  ((work_area).used = 0, (work_area).bits = 0)
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 2011\f
b18215fc 2012
/* Turn on the bit for character C in the bit list that starts at the
   caller's `b'.  */
#define SET_LIST_BIT(c) \
  (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
2015
2016
bf216479
KH
#ifdef emacs

/* The SETUP_*_RANGE macros store the characters FROM..TO in the
   bitmap at B (for ASCII and unibyte characters) and in WORK_AREA
   (for multibyte characters), translating them on the way and paying
   attention to the continuity of the translated characters.

   Implementation note: macros this big would be better implemented
   as functions, but that is not easy because the macros called from
   here assume that various local variables are already declared.  */

/* Both FROM and TO are ASCII characters.  */

#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
  do { \
    int C0, C1; \
    \
    for (C0 = (FROM); C0 <= (TO); C0++) \
      { \
        C1 = TRANSLATE (C0); \
        if (! ASCII_CHAR_P (C1)) \
          { \
            SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
            C1 = RE_CHAR_TO_UNIBYTE (C1); \
            /* No unibyte form: fall back to the untranslated char.  */ \
            if (C1 < 0) \
              C1 = C0; \
          } \
        SET_LIST_BIT (C1); \
      } \
  } while (0)


/* Both FROM and TO are unibyte characters (0x80..0xFF).  */

#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
  do { \
    int C0, C1, C2, I; \
    int USED = RANGE_TABLE_WORK_USED (work_area); \
    \
    for (C0 = (FROM); C0 <= (TO); C0++) \
      { \
        C1 = RE_CHAR_TO_MULTIBYTE (C0); \
        if (CHAR_BYTE8_P (C1)) \
          { \
            SET_LIST_BIT (C0); \
            continue; \
          } \
        C2 = TRANSLATE (C1); \
        if (C2 == C1) \
          C1 = C0; \
        else \
          { \
            C1 = RE_CHAR_TO_UNIBYTE (C2); \
            if (C1 < 0) \
              C1 = C0; \
          } \
        SET_LIST_BIT (C1); \
        /* Look, newest first, for a range added by this expansion \
           that contains C2 or that C2 extends by one.  */ \
        for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
          { \
            int lo = RANGE_TABLE_WORK_ELT (work_area, I); \
            int hi = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
            \
            if (C2 >= lo - 1 && C2 <= hi + 1) \
              { \
                if (C2 == lo - 1) \
                  RANGE_TABLE_WORK_ELT (work_area, I)--; \
                else if (C2 == hi + 1) \
                  RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
                break; \
              } \
          } \
        /* No such range: record C2 as a range of its own.  */ \
        if (I < USED) \
          SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
      } \
  } while (0)


/* Both FROM and TO are multibyte characters.  */

#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
  do { \
    int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
    \
    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
    for (C0 = (FROM); C0 <= (TO); C0++) \
      { \
        C1 = TRANSLATE (C0); \
        if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
            || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
          SET_LIST_BIT (C2); \
        /* A translation that stays inside FROM..TO is already \
           covered by the range recorded above.  */ \
        if (C1 >= (FROM) && C1 <= (TO)) \
          continue; \
        for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
          { \
            int lo = RANGE_TABLE_WORK_ELT (work_area, I); \
            int hi = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
            \
            if (C1 >= lo - 1 && C1 <= hi + 1) \
              { \
                if (C1 == lo - 1) \
                  RANGE_TABLE_WORK_ELT (work_area, I)--; \
                else if (C1 == hi + 1) \
                  RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
                break; \
              } \
          } \
        if (I < USED) \
          SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
      } \
  } while (0)

#endif /* emacs */
2123
/* Read the next unsigned decimal number of the uncompiled pattern
   into NUM; a negative NUM is reset to 0 before the first digit is
   accumulated.  The character that ends the number is left in the
   caller's `c'.  Running off the end of the pattern makes the
   enclosing function return REG_EBRACE through FREE_STACK_RETURN; a
   value that fails the `num / 10 != prev' check makes it return
   REG_BADBR.  */
#define GET_UNSIGNED_NUMBER(num) \
  do { \
    if (p == pend) \
      FREE_STACK_RETURN (REG_EBRACE); \
    PATFETCH (c); \
    while ('0' <= c && c <= '9') \
      { \
        int prev; \
        if (num < 0) \
          num = 0; \
        prev = num; \
        num = num * 10 + c - '0'; \
        if (num / 10 != prev) \
          FREE_STACK_RETURN (REG_BADBR); \
        if (p == pend) \
          FREE_STACK_RETURN (REG_EBRACE); \
        PATFETCH (c); \
      } \
  } while (0)
77d11aec 2147\f
1fdab503 2148#if ! WIDE_CHAR_SUPPORT
01618498 2149
14473664 2150/* Map a string to the char class it names (if any). */
1fdab503 2151re_wctype_t
ada30c0e
SM
2152re_wctype (str)
2153 re_char *str;
14473664 2154{
ada30c0e 2155 const char *string = str;
14473664
SM
2156 if (STREQ (string, "alnum")) return RECC_ALNUM;
2157 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2158 else if (STREQ (string, "word")) return RECC_WORD;
2159 else if (STREQ (string, "ascii")) return RECC_ASCII;
2160 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2161 else if (STREQ (string, "graph")) return RECC_GRAPH;
2162 else if (STREQ (string, "lower")) return RECC_LOWER;
2163 else if (STREQ (string, "print")) return RECC_PRINT;
2164 else if (STREQ (string, "punct")) return RECC_PUNCT;
2165 else if (STREQ (string, "space")) return RECC_SPACE;
2166 else if (STREQ (string, "upper")) return RECC_UPPER;
2167 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2168 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2169 else if (STREQ (string, "digit")) return RECC_DIGIT;
2170 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2171 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2172 else if (STREQ (string, "blank")) return RECC_BLANK;
2173 else return 0;
2174}
2175
e0f24100 2176/* True if CH is in the char class CC. */
1fdab503 2177boolean
14473664
SM
2178re_iswctype (ch, cc)
2179 int ch;
2180 re_wctype_t cc;
2181{
2182 switch (cc)
2183 {
0cdd06f8
SM
2184 case RECC_ALNUM: return ISALNUM (ch);
2185 case RECC_ALPHA: return ISALPHA (ch);
2186 case RECC_BLANK: return ISBLANK (ch);
2187 case RECC_CNTRL: return ISCNTRL (ch);
2188 case RECC_DIGIT: return ISDIGIT (ch);
2189 case RECC_GRAPH: return ISGRAPH (ch);
2190 case RECC_LOWER: return ISLOWER (ch);
2191 case RECC_PRINT: return ISPRINT (ch);
2192 case RECC_PUNCT: return ISPUNCT (ch);
2193 case RECC_SPACE: return ISSPACE (ch);
2194 case RECC_UPPER: return ISUPPER (ch);
2195 case RECC_XDIGIT: return ISXDIGIT (ch);
2196 case RECC_ASCII: return IS_REAL_ASCII (ch);
2197 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2198 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2199 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2200 case RECC_WORD: return ISWORD (ch);
2201 case RECC_ERROR: return false;
2202 default:
2203 abort();
14473664
SM
2204 }
2205}
fa9a63c5 2206
14473664
SM
2207/* Return a bit-pattern to use in the range-table bits to match multibyte
2208 chars of class CC. */
2209static int
2210re_wctype_to_bit (cc)
2211 re_wctype_t cc;
2212{
2213 switch (cc)
2214 {
2215 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2216 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2217 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2218 case RECC_LOWER: return BIT_LOWER;
2219 case RECC_UPPER: return BIT_UPPER;
2220 case RECC_PUNCT: return BIT_PUNCT;
2221 case RECC_SPACE: return BIT_SPACE;
14473664 2222 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2223 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2224 default:
2225 abort();
14473664
SM
2226 }
2227}
2228#endif
77d11aec
RS
2229\f
2230/* Filling in the work area of a range. */
2231
2232/* Actually extend the space in WORK_AREA. */
2233
2234static void
2235extend_range_table_work_area (work_area)
2236 struct range_table_work_area *work_area;
177c0ea7 2237{
77d11aec
RS
2238 work_area->allocated += 16 * sizeof (int);
2239 if (work_area->table)
2240 work_area->table
2241 = (int *) realloc (work_area->table, work_area->allocated);
2242 else
2243 work_area->table
2244 = (int *) malloc (work_area->allocated);
2245}
2246
8f924df7 2247#if 0
77d11aec
RS
2248#ifdef emacs
2249
2250/* Carefully find the ranges of codes that are equivalent
2251 under case conversion to the range start..end when passed through
2252 TRANSLATE. Handle the case where non-letters can come in between
2253 two upper-case letters (which happens in Latin-1).
2254 Also handle the case of groups of more than 2 case-equivalent chars.
2255
2256 The basic method is to look at consecutive characters and see
2257 if they can form a run that can be handled as one.
2258
2259 Returns -1 if successful, REG_ESPACE if ran out of space. */
2260
2261static int
2262set_image_of_range_1 (work_area, start, end, translate)
2263 RE_TRANSLATE_TYPE translate;
2264 struct range_table_work_area *work_area;
2265 re_wchar_t start, end;
2266{
2267 /* `one_case' indicates a character, or a run of characters,
2268 each of which is an isolate (no case-equivalents).
2269 This includes all ASCII non-letters.
2270
2271 `two_case' indicates a character, or a run of characters,
2272 each of which has two case-equivalent forms.
2273 This includes all ASCII letters.
2274
2275 `strange' indicates a character that has more than one
2276 case-equivalent. */
177c0ea7 2277
77d11aec
RS
2278 enum case_type {one_case, two_case, strange};
2279
2280 /* Describe the run that is in progress,
2281 which the next character can try to extend.
2282 If run_type is strange, that means there really is no run.
2283 If run_type is one_case, then run_start...run_end is the run.
2284 If run_type is two_case, then the run is run_start...run_end,
2285 and the case-equivalents end at run_eqv_end. */
2286
2287 enum case_type run_type = strange;
2288 int run_start, run_end, run_eqv_end;
2289
2290 Lisp_Object eqv_table;
2291
2292 if (!RE_TRANSLATE_P (translate))
2293 {
b7c12565 2294 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2295 work_area->table[work_area->used++] = (start);
2296 work_area->table[work_area->used++] = (end);
b7c12565 2297 return -1;
77d11aec
RS
2298 }
2299
2300 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2301
77d11aec
RS
2302 for (; start <= end; start++)
2303 {
2304 enum case_type this_type;
2305 int eqv = RE_TRANSLATE (eqv_table, start);
2306 int minchar, maxchar;
2307
2308 /* Classify this character */
2309 if (eqv == start)
2310 this_type = one_case;
2311 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2312 this_type = two_case;
2313 else
2314 this_type = strange;
2315
2316 if (start < eqv)
2317 minchar = start, maxchar = eqv;
2318 else
2319 minchar = eqv, maxchar = start;
2320
2321 /* Can this character extend the run in progress? */
2322 if (this_type == strange || this_type != run_type
2323 || !(minchar == run_end + 1
2324 && (run_type == two_case
2325 ? maxchar == run_eqv_end + 1 : 1)))
2326 {
2327 /* No, end the run.
2328 Record each of its equivalent ranges. */
2329 if (run_type == one_case)
2330 {
2331 EXTEND_RANGE_TABLE (work_area, 2);
2332 work_area->table[work_area->used++] = run_start;
2333 work_area->table[work_area->used++] = run_end;
2334 }
2335 else if (run_type == two_case)
2336 {
2337 EXTEND_RANGE_TABLE (work_area, 4);
2338 work_area->table[work_area->used++] = run_start;
2339 work_area->table[work_area->used++] = run_end;
2340 work_area->table[work_area->used++]
2341 = RE_TRANSLATE (eqv_table, run_start);
2342 work_area->table[work_area->used++]
2343 = RE_TRANSLATE (eqv_table, run_end);
2344 }
2345 run_type = strange;
2346 }
177c0ea7 2347
77d11aec
RS
2348 if (this_type == strange)
2349 {
2350 /* For a strange character, add each of its equivalents, one
2351 by one. Don't start a range. */
2352 do
2353 {
2354 EXTEND_RANGE_TABLE (work_area, 2);
2355 work_area->table[work_area->used++] = eqv;
2356 work_area->table[work_area->used++] = eqv;
2357 eqv = RE_TRANSLATE (eqv_table, eqv);
2358 }
2359 while (eqv != start);
2360 }
2361
2362 /* Add this char to the run, or start a new run. */
2363 else if (run_type == strange)
2364 {
2365 /* Initialize a new range. */
2366 run_type = this_type;
2367 run_start = start;
2368 run_end = start;
2369 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2370 }
2371 else
2372 {
2373 /* Extend a running range. */
2374 run_end = minchar;
2375 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2376 }
2377 }
2378
2379 /* If a run is still in progress at the end, finish it now
2380 by recording its equivalent ranges. */
2381 if (run_type == one_case)
2382 {
2383 EXTEND_RANGE_TABLE (work_area, 2);
2384 work_area->table[work_area->used++] = run_start;
2385 work_area->table[work_area->used++] = run_end;
2386 }
2387 else if (run_type == two_case)
2388 {
2389 EXTEND_RANGE_TABLE (work_area, 4);
2390 work_area->table[work_area->used++] = run_start;
2391 work_area->table[work_area->used++] = run_end;
2392 work_area->table[work_area->used++]
2393 = RE_TRANSLATE (eqv_table, run_start);
2394 work_area->table[work_area->used++]
2395 = RE_TRANSLATE (eqv_table, run_end);
2396 }
2397
2398 return -1;
2399}
36595814 2400
77d11aec 2401#endif /* emacs */
36595814 2402
2b34df4e 2403/* Record the image of the range start..end when passed through
36595814
SM
2404 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2405 and is not even necessarily contiguous.
b7c12565
RS
2406 Normally we approximate it with the smallest contiguous range that contains
2407 all the chars we need. However, for Latin-1 we go to extra effort
2408 to do a better job.
2409
2410 This function is not called for ASCII ranges.
77d11aec
RS
2411
2412 Returns -1 if successful, REG_ESPACE if ran out of space. */
2413
2414static int
36595814
SM
2415set_image_of_range (work_area, start, end, translate)
2416 RE_TRANSLATE_TYPE translate;
2417 struct range_table_work_area *work_area;
2418 re_wchar_t start, end;
2419{
77d11aec
RS
2420 re_wchar_t cmin, cmax;
2421
2422#ifdef emacs
2423 /* For Latin-1 ranges, use set_image_of_range_1
2424 to get proper handling of ranges that include letters and nonletters.
b7c12565 2425 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2426 For other character sets, we don't bother to get this right. */
b7c12565
RS
2427 if (RE_TRANSLATE_P (translate) && start < 04400
2428 && !(start < 04200 && end >= 04377))
77d11aec 2429 {
b7c12565 2430 int newend;
77d11aec 2431 int tem;
b7c12565
RS
2432 newend = end;
2433 if (newend > 04377)
2434 newend = 04377;
2435 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2436 if (tem > 0)
2437 return tem;
2438
2439 start = 04400;
2440 if (end < 04400)
2441 return -1;
2442 }
2443#endif
2444
b7c12565
RS
2445 EXTEND_RANGE_TABLE (work_area, 2);
2446 work_area->table[work_area->used++] = (start);
2447 work_area->table[work_area->used++] = (end);
2448
2449 cmin = -1, cmax = -1;
77d11aec 2450
36595814 2451 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2452 {
2453 int ch;
77d11aec 2454
b7c12565
RS
2455 for (ch = start; ch <= end; ch++)
2456 {
2457 re_wchar_t c = TRANSLATE (ch);
2458 if (! (start <= c && c <= end))
2459 {
2460 if (cmin == -1)
2461 cmin = c, cmax = c;
2462 else
2463 {
2464 cmin = MIN (cmin, c);
2465 cmax = MAX (cmax, c);
2466 }
2467 }
2468 }
2469
2470 if (cmin != -1)
2471 {
2472 EXTEND_RANGE_TABLE (work_area, 2);
2473 work_area->table[work_area->used++] = (cmin);
2474 work_area->table[work_area->used++] = (cmax);
2475 }
2476 }
36595814 2477
77d11aec
RS
2478 return -1;
2479}
8f924df7 2480#endif /* 0 */
fa9a63c5
RM
2481\f
2482#ifndef MATCH_MAY_ALLOCATE
2483
2484/* If we cannot allocate large objects within re_match_2_internal,
2485 we make the fail stack and register vectors global.
2486 The fail stack, we grow to the maximum size when a regexp
2487 is compiled.
2488 The register vectors, we adjust in size each time we
2489 compile a regexp, according to the number of registers it needs. */
2490
2491static fail_stack_type fail_stack;
2492
2493/* Size with which the following vectors are currently allocated.
2494 That is so we can make them bigger as needed,
4bb91c68 2495 but never make them smaller. */
fa9a63c5
RM
2496static int regs_allocated_size;
2497
66f0296e
SM
2498static re_char ** regstart, ** regend;
2499static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2500
2501/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2502 but don't make them smaller. */
fa9a63c5
RM
2503
2504static
2505regex_grow_registers (num_regs)
2506 int num_regs;
2507{
2508 if (num_regs > regs_allocated_size)
2509 {
66f0296e
SM
2510 RETALLOC_IF (regstart, num_regs, re_char *);
2511 RETALLOC_IF (regend, num_regs, re_char *);
2512 RETALLOC_IF (best_regstart, num_regs, re_char *);
2513 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2514
2515 regs_allocated_size = num_regs;
2516 }
2517}
2518
2519#endif /* not MATCH_MAY_ALLOCATE */
2520\f
99633e97
SM
2521static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2522 compile_stack,
2523 regnum_t regnum));
2524
fa9a63c5
RM
2525/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2526 Returns one of error codes defined in `regex.h', or zero for success.
2527
2528 Assumes the `allocated' (and perhaps `buffer') and `translate'
2529 fields are set in BUFP on entry.
2530
2531 If it succeeds, results are put in BUFP (if it returns an error, the
2532 contents of BUFP are undefined):
2533 `buffer' is the compiled pattern;
2534 `syntax' is set to SYNTAX;
2535 `used' is set to the length of the compiled pattern;
2536 `fastmap_accurate' is zero;
2537 `re_nsub' is the number of subexpressions in PATTERN;
2538 `not_bol' and `not_eol' are zero;
5e69f11e 2539
c0f9ea08 2540 The `fastmap' field is neither examined nor set. */
fa9a63c5 2541
505bde11
SM
/* If an alternative is waiting to be fixed up, store the `jump' that
   leads from the end of that alternative to "here" (the current `b').
   The space for the jump has already been allocated.  */
#define FIXUP_ALT_JUMP() \
  do { \
    if (fixup_alt_jump) \
      STORE_JUMP (jump, fixup_alt_jump, b); \
  } while (0)


/* Return VALUE from the enclosing function after freeing the storage
   we allocated: the range table work area and the compile stack.  */
#define FREE_STACK_RETURN(value) \
  do { \
    FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
    free (compile_stack.stack); \
    return value; \
  } while (0)
fa9a63c5
RM
2558
2559static reg_errcode_t
2560regex_compile (pattern, size, syntax, bufp)
66f0296e 2561 re_char *pattern;
4bb91c68 2562 size_t size;
fa9a63c5
RM
2563 reg_syntax_t syntax;
2564 struct re_pattern_buffer *bufp;
2565{
01618498
SM
2566 /* We fetch characters from PATTERN here. */
2567 register re_wchar_t c, c1;
5e69f11e 2568
fa9a63c5 2569 /* A random temporary spot in PATTERN. */
66f0296e 2570 re_char *p1;
fa9a63c5
RM
2571
2572 /* Points to the end of the buffer, where we should append. */
2573 register unsigned char *b;
5e69f11e 2574
fa9a63c5
RM
2575 /* Keeps track of unclosed groups. */
2576 compile_stack_type compile_stack;
2577
2578 /* Points to the current (ending) position in the pattern. */
22336245
RS
2579#ifdef AIX
2580 /* `const' makes AIX compiler fail. */
66f0296e 2581 unsigned char *p = pattern;
22336245 2582#else
66f0296e 2583 re_char *p = pattern;
22336245 2584#endif
66f0296e 2585 re_char *pend = pattern + size;
5e69f11e 2586
fa9a63c5 2587 /* How to translate the characters in the pattern. */
6676cb1c 2588 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2589
2590 /* Address of the count-byte of the most recently inserted `exactn'
2591 command. This makes it possible to tell if a new exact-match
2592 character can be added to that command or if the character requires
2593 a new `exactn' command. */
2594 unsigned char *pending_exact = 0;
2595
2596 /* Address of start of the most recently finished expression.
2597 This tells, e.g., postfix * where to find the start of its
2598 operand. Reset at the beginning of groups and alternatives. */
2599 unsigned char *laststart = 0;
2600
2601 /* Address of beginning of regexp, or inside of last group. */
2602 unsigned char *begalt;
2603
2604 /* Place in the uncompiled pattern (i.e., the {) to
2605 which to go back if the interval is invalid. */
66f0296e 2606 re_char *beg_interval;
5e69f11e 2607
fa9a63c5 2608 /* Address of the place where a forward jump should go to the end of
7814e705 2609 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2610 last -- ends with a forward jump of this sort. */
2611 unsigned char *fixup_alt_jump = 0;
2612
b18215fc
RS
2613 /* Work area for range table of charset. */
2614 struct range_table_work_area range_table_work;
2615
2d1675e4
SM
2616 /* If the object matched can contain multibyte characters. */
2617 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2618
8f924df7 2619 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2620 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2621
f9b0fd99
RS
2622 /* Nonzero if we have pushed down into a subpattern. */
2623 int in_subpattern = 0;
2624
2625 /* These hold the values of p, pattern, and pend from the main
2626 pattern when we have pushed into a subpattern. */
2627 re_char *main_p;
2628 re_char *main_pattern;
2629 re_char *main_pend;
2630
fa9a63c5 2631#ifdef DEBUG
99633e97 2632 debug++;
fa9a63c5 2633 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2634 if (debug > 0)
fa9a63c5
RM
2635 {
2636 unsigned debug_count;
5e69f11e 2637
fa9a63c5 2638 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2639 putchar (pattern[debug_count]);
fa9a63c5
RM
2640 putchar ('\n');
2641 }
2642#endif /* DEBUG */
2643
2644 /* Initialize the compile stack. */
2645 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2646 if (compile_stack.stack == NULL)
2647 return REG_ESPACE;
2648
2649 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2650 compile_stack.avail = 0;
2651
b18215fc
RS
2652 range_table_work.table = 0;
2653 range_table_work.allocated = 0;
2654
fa9a63c5
RM
2655 /* Initialize the pattern buffer. */
2656 bufp->syntax = syntax;
2657 bufp->fastmap_accurate = 0;
2658 bufp->not_bol = bufp->not_eol = 0;
6224b623 2659 bufp->used_syntax = 0;
fa9a63c5
RM
2660
2661 /* Set `used' to zero, so that if we return an error, the pattern
2662 printer (for debugging) will think there's no pattern. We reset it
2663 at the end. */
2664 bufp->used = 0;
5e69f11e 2665
fa9a63c5 2666 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2667 bufp->re_nsub = 0;
fa9a63c5 2668
0b32bf0e 2669#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2670 /* Initialize the syntax table. */
2671 init_syntax_once ();
2672#endif
2673
2674 if (bufp->allocated == 0)
2675 {
2676 if (bufp->buffer)
2677 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2678 enough space. This loses if buffer's address is bogus, but
7814e705 2679 that is the user's responsibility. */
25fe55af
RS
2680 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2681 }
fa9a63c5 2682 else
7814e705 2683 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2684 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2685 }
fa9a63c5
RM
2686 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2687
2688 bufp->allocated = INIT_BUF_SIZE;
2689 }
2690
2691 begalt = b = bufp->buffer;
2692
2693 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2694 while (1)
fa9a63c5 2695 {
f9b0fd99
RS
2696 if (p == pend)
2697 {
2698 /* If this is the end of an included regexp,
2699 pop back to the main regexp and try again. */
2700 if (in_subpattern)
2701 {
2702 in_subpattern = 0;
2703 pattern = main_pattern;
2704 p = main_p;
2705 pend = main_pend;
2706 continue;
2707 }
2708 /* If this is the end of the main regexp, we are done. */
2709 break;
2710 }
2711
fa9a63c5
RM
2712 PATFETCH (c);
2713
2714 switch (c)
25fe55af 2715 {
f9b0fd99
RS
2716 case ' ':
2717 {
2718 re_char *p1 = p;
2719
2720 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2721 spaces normally. And don't try to do this recursively. */
2722 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2723 goto normal_char;
2724
2725 /* Peek past following spaces. */
2726 while (p1 != pend)
2727 {
2728 if (*p1 != ' ')
2729 break;
2730 p1++;
2731 }
2732 /* If the spaces are followed by a repetition op,
2733 treat them normally. */
c721eee5
RS
2734 if (p1 != pend
2735 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2736 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2737 goto normal_char;
2738
2739 /* Replace the spaces with the whitespace regexp. */
2740 in_subpattern = 1;
2741 main_p = p1;
2742 main_pend = pend;
2743 main_pattern = pattern;
2744 p = pattern = whitespace_regexp;
2745 pend = p + strlen (p);
2746 break;
7814e705 2747 }
f9b0fd99 2748
25fe55af
RS
2749 case '^':
2750 {
7814e705 2751 if ( /* If at start of pattern, it's an operator. */
25fe55af 2752 p == pattern + 1
7814e705 2753 /* If context independent, it's an operator. */
25fe55af 2754 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2755 /* Otherwise, depends on what's come before. */
25fe55af 2756 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2757 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2758 else
2759 goto normal_char;
2760 }
2761 break;
2762
2763
2764 case '$':
2765 {
2766 if ( /* If at end of pattern, it's an operator. */
2767 p == pend
7814e705 2768 /* If context independent, it's an operator. */
25fe55af
RS
2769 || syntax & RE_CONTEXT_INDEP_ANCHORS
2770 /* Otherwise, depends on what's next. */
2771 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2772 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2773 else
2774 goto normal_char;
2775 }
2776 break;
fa9a63c5
RM
2777
2778
2779 case '+':
25fe55af
RS
2780 case '?':
2781 if ((syntax & RE_BK_PLUS_QM)
2782 || (syntax & RE_LIMITED_OPS))
2783 goto normal_char;
2784 handle_plus:
2785 case '*':
2786 /* If there is no previous pattern... */
2787 if (!laststart)
2788 {
2789 if (syntax & RE_CONTEXT_INVALID_OPS)
2790 FREE_STACK_RETURN (REG_BADRPT);
2791 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2792 goto normal_char;
2793 }
2794
2795 {
7814e705 2796 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2797 boolean zero_times_ok = 0, many_times_ok = 0;
2798 boolean greedy = 1;
25fe55af
RS
2799
2800 /* If there is a sequence of repetition chars, collapse it
2801 down to just one (the right one). We can't combine
2802 interval operators with these because of, e.g., `a{2}*',
7814e705 2803 which should only match an even number of `a's. */
25fe55af
RS
2804
2805 for (;;)
2806 {
0b32bf0e 2807 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2808 && c == '?' && (zero_times_ok || many_times_ok))
2809 greedy = 0;
2810 else
2811 {
2812 zero_times_ok |= c != '+';
2813 many_times_ok |= c != '?';
2814 }
25fe55af
RS
2815
2816 if (p == pend)
2817 break;
ed0767d8
SM
2818 else if (*p == '*'
2819 || (!(syntax & RE_BK_PLUS_QM)
2820 && (*p == '+' || *p == '?')))
25fe55af 2821 ;
ed0767d8 2822 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2823 {
ed0767d8
SM
2824 if (p+1 == pend)
2825 FREE_STACK_RETURN (REG_EESCAPE);
2826 if (p[1] == '+' || p[1] == '?')
2827 PATFETCH (c); /* Gobble up the backslash. */
2828 else
2829 break;
25fe55af
RS
2830 }
2831 else
ed0767d8 2832 break;
25fe55af 2833 /* If we get here, we found another repeat character. */
ed0767d8
SM
2834 PATFETCH (c);
2835 }
25fe55af
RS
2836
2837 /* Star, etc. applied to an empty pattern is equivalent
2838 to an empty pattern. */
4e8a9132 2839 if (!laststart || laststart == b)
25fe55af
RS
2840 break;
2841
2842 /* Now we know whether or not zero matches is allowed
7814e705 2843 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2844 if (greedy)
2845 {
99633e97 2846 if (many_times_ok)
4e8a9132
SM
2847 {
2848 boolean simple = skip_one_char (laststart) == b;
2849 unsigned int startoffset = 0;
f6a3f532 2850 re_opcode_t ofj =
01618498 2851 /* Check if the loop can match the empty string. */
6df42991
SM
2852 (simple || !analyse_first (laststart, b, NULL, 0))
2853 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2854 assert (skip_one_char (laststart) <= b);
177c0ea7 2855
4e8a9132
SM
2856 if (!zero_times_ok && simple)
2857 { /* Since simple * loops can be made faster by using
2858 on_failure_keep_string_jump, we turn simple P+
2859 into PP* if P is simple. */
2860 unsigned char *p1, *p2;
2861 startoffset = b - laststart;
2862 GET_BUFFER_SPACE (startoffset);
2863 p1 = b; p2 = laststart;
2864 while (p2 < p1)
2865 *b++ = *p2++;
2866 zero_times_ok = 1;
99633e97 2867 }
4e8a9132
SM
2868
2869 GET_BUFFER_SPACE (6);
2870 if (!zero_times_ok)
2871 /* A + loop. */
f6a3f532 2872 STORE_JUMP (ofj, b, b + 6);
99633e97 2873 else
4e8a9132
SM
2874 /* Simple * loops can use on_failure_keep_string_jump
2875 depending on what follows. But since we don't know
2876 that yet, we leave the decision up to
2877 on_failure_jump_smart. */
f6a3f532 2878 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2879 laststart + startoffset, b + 6);
99633e97 2880 b += 3;
4e8a9132 2881 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2882 b += 3;
2883 }
2884 else
2885 {
4e8a9132
SM
2886 /* A simple ? pattern. */
2887 assert (zero_times_ok);
2888 GET_BUFFER_SPACE (3);
2889 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2890 b += 3;
2891 }
1c8c6d39
DL
2892 }
2893 else /* not greedy */
2894 { /* I wish the greedy and non-greedy cases could be merged. */
2895
0683b6fa 2896 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2897 if (many_times_ok)
2898 {
f6a3f532
SM
2899 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2900
6df42991
SM
2901 /* The non-greedy multiple match looks like
2902 a repeat..until: we only need a conditional jump
2903 at the end of the loop. */
f6a3f532
SM
2904 if (emptyp) BUF_PUSH (no_op);
2905 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2906 : on_failure_jump, b, laststart);
1c8c6d39
DL
2907 b += 3;
2908 if (zero_times_ok)
2909 {
2910 /* The repeat...until naturally matches one or more.
2911 To also match zero times, we need to first jump to
6df42991 2912 the end of the loop (its conditional jump). */
1c8c6d39
DL
2913 INSERT_JUMP (jump, laststart, b);
2914 b += 3;
2915 }
2916 }
2917 else
2918 {
2919 /* non-greedy a?? */
1c8c6d39
DL
2920 INSERT_JUMP (jump, laststart, b + 3);
2921 b += 3;
2922 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2923 b += 3;
2924 }
2925 }
2926 }
4e8a9132 2927 pending_exact = 0;
fa9a63c5
RM
2928 break;
2929
2930
2931 case '.':
25fe55af
RS
2932 laststart = b;
2933 BUF_PUSH (anychar);
2934 break;
fa9a63c5
RM
2935
2936
25fe55af
RS
2937 case '[':
2938 {
b18215fc 2939 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2940
25fe55af 2941 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2942
25fe55af
RS
2943 /* Ensure that we have enough space to push a charset: the
2944 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2945 GET_BUFFER_SPACE (34);
2946
25fe55af 2947 laststart = b;
e318085a 2948
25fe55af 2949 /* We test `*p == '^' twice, instead of using an if
7814e705 2950 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2951 BUF_PUSH (*p == '^' ? charset_not : charset);
2952 if (*p == '^')
2953 p++;
e318085a 2954
25fe55af
RS
2955 /* Remember the first position in the bracket expression. */
2956 p1 = p;
e318085a 2957
7814e705 2958 /* Push the number of bytes in the bitmap. */
25fe55af 2959 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2960
25fe55af
RS
2961 /* Clear the whole map. */
2962 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2963
25fe55af
RS
2964 /* charset_not matches newline according to a syntax bit. */
2965 if ((re_opcode_t) b[-2] == charset_not
2966 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2967 SET_LIST_BIT ('\n');
fa9a63c5 2968
7814e705 2969 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2970 for (;;)
2971 {
b18215fc 2972 boolean escaped_char = false;
2d1675e4 2973 const unsigned char *p2 = p;
cf9c99bc 2974 re_wchar_t ch, c2;
e318085a 2975
25fe55af 2976 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2977
36595814
SM
2978 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2979 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2980 So the translation is done later in a loop. Example:
2981 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2982 PATFETCH (c);
e318085a 2983
25fe55af
RS
2984 /* \ might escape characters inside [...] and [^...]. */
2985 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2986 {
2987 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2988
2989 PATFETCH (c);
b18215fc 2990 escaped_char = true;
25fe55af 2991 }
b18215fc
RS
2992 else
2993 {
7814e705 2994 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2995 not (i.e., when the bracket expression is `[]' so
2996 far), the ']' character bit gets set way below. */
2d1675e4 2997 if (c == ']' && p2 != p1)
657fcfbd 2998 break;
25fe55af 2999 }
b18215fc 3000
25fe55af
RS
3001 /* See if we're at the beginning of a possible character
3002 class. */
b18215fc 3003
2d1675e4
SM
3004 if (!escaped_char &&
3005 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 3006 {
7814e705 3007 /* Leave room for the null. */
14473664 3008 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 3009 const unsigned char *class_beg;
b18215fc 3010
25fe55af
RS
3011 PATFETCH (c);
3012 c1 = 0;
ed0767d8 3013 class_beg = p;
b18215fc 3014
25fe55af
RS
3015 /* If pattern is `[[:'. */
3016 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 3017
25fe55af
RS
3018 for (;;)
3019 {
14473664
SM
3020 PATFETCH (c);
3021 if ((c == ':' && *p == ']') || p == pend)
3022 break;
3023 if (c1 < CHAR_CLASS_MAX_LENGTH)
3024 str[c1++] = c;
3025 else
3026 /* This is in any case an invalid class name. */
3027 str[0] = '\0';
25fe55af
RS
3028 }
3029 str[c1] = '\0';
b18215fc
RS
3030
3031 /* If it isn't a word bracketed by `[:' and `:]':
3032 undo the ending character, the letters, and
3033 leave the leading `:' and `[' (but set bits for
3034 them). */
25fe55af
RS
3035 if (c == ':' && *p == ']')
3036 {
14473664 3037 re_wctype_t cc;
8f924df7 3038 int limit;
14473664
SM
3039
3040 cc = re_wctype (str);
3041
3042 if (cc == 0)
fa9a63c5
RM
3043 FREE_STACK_RETURN (REG_ECTYPE);
3044
14473664
SM
3045 /* Throw away the ] at the end of the character
3046 class. */
3047 PATFETCH (c);
fa9a63c5 3048
14473664 3049 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 3050
cf9c99bc
KH
3051#ifndef emacs
3052 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
3053 if (re_iswctype (btowc (ch), cc))
3054 {
3055 c = TRANSLATE (ch);
ed00c2ac
KH
3056 if (c < (1 << BYTEWIDTH))
3057 SET_LIST_BIT (c);
8f924df7 3058 }
cf9c99bc
KH
3059#else /* emacs */
3060 /* Most character classes in a multibyte match
3061 just set a flag. Exceptions are is_blank,
3062 is_digit, is_cntrl, and is_xdigit, since
3063 they can only match ASCII characters. We
3064 don't need to handle them for multibyte.
3065 They are distinguished by a negative wctype. */
96cc36cc 3066
254c06a8
SM
3067 /* Setup the gl_state object to its buffer-defined
3068 value. This hardcodes the buffer-global
3069 syntax-table for ASCII chars, while the other chars
3070 will obey syntax-table properties. It's not ideal,
3071 but it's the way it's been done until now. */
d48cd3f4 3072 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 3073
cf9c99bc 3074 for (ch = 0; ch < 256; ++ch)
25fe55af 3075 {
cf9c99bc
KH
3076 c = RE_CHAR_TO_MULTIBYTE (ch);
3077 if (! CHAR_BYTE8_P (c)
3078 && re_iswctype (c, cc))
8f924df7 3079 {
cf9c99bc
KH
3080 SET_LIST_BIT (ch);
3081 c1 = TRANSLATE (c);
3082 if (c1 == c)
3083 continue;
3084 if (ASCII_CHAR_P (c1))
3085 SET_LIST_BIT (c1);
3086 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3087 SET_LIST_BIT (c1);
8f924df7 3088 }
25fe55af 3089 }
cf9c99bc
KH
3090 SET_RANGE_TABLE_WORK_AREA_BIT
3091 (range_table_work, re_wctype_to_bit (cc));
3092#endif /* emacs */
6224b623
SM
3093 /* In most cases the matching rule for char classes
3094 only uses the syntax table for multibyte chars,
3095 so that the content of the syntax-table is not
3096 hardcoded in the range_table. SPACE and WORD are
3097 the two exceptions. */
3098 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3099 bufp->used_syntax = 1;
3100
b18215fc
RS
3101 /* Repeat the loop. */
3102 continue;
25fe55af
RS
3103 }
3104 else
3105 {
ed0767d8
SM
3106 /* Go back to right after the "[:". */
3107 p = class_beg;
25fe55af 3108 SET_LIST_BIT ('[');
b18215fc
RS
3109
3110 /* Because the `:' may start a range, we
3111 can't simply set bit and repeat the loop.
7814e705 3112 Instead, just set it to C and handle below. */
b18215fc 3113 c = ':';
25fe55af
RS
3114 }
3115 }
b18215fc
RS
3116
3117 if (p < pend && p[0] == '-' && p[1] != ']')
3118 {
3119
3120 /* Discard the `-'. */
3121 PATFETCH (c1);
3122
3123 /* Fetch the character which ends the range. */
3124 PATFETCH (c1);
cf9c99bc
KH
3125#ifdef emacs
3126 if (CHAR_BYTE8_P (c1)
3127 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3128 /* Treat the range from a multibyte character to
3129 raw-byte character as empty. */
3130 c = c1 + 1;
3131#endif /* emacs */
e318085a 3132 }
25fe55af 3133 else
b18215fc
RS
3134 /* Range from C to C. */
3135 c1 = c;
3136
cf9c99bc 3137 if (c > c1)
25fe55af 3138 {
cf9c99bc
KH
3139 if (syntax & RE_NO_EMPTY_RANGES)
3140 FREE_STACK_RETURN (REG_ERANGEX);
3141 /* Else, repeat the loop. */
bf216479 3142 }
6fdd04b0 3143 else
25fe55af 3144 {
cf9c99bc
KH
3145#ifndef emacs
3146 /* Set the range into bitmap */
8f924df7 3147 for (; c <= c1; c++)
b18215fc 3148 {
cf9c99bc
KH
3149 ch = TRANSLATE (c);
3150 if (ch < (1 << BYTEWIDTH))
3151 SET_LIST_BIT (ch);
3152 }
3153#else /* emacs */
3154 if (c < 128)
3155 {
3156 ch = MIN (127, c1);
3157 SETUP_ASCII_RANGE (range_table_work, c, ch);
3158 c = ch + 1;
3159 if (CHAR_BYTE8_P (c1))
3160 c = BYTE8_TO_CHAR (128);
3161 }
3162 if (c <= c1)
3163 {
3164 if (CHAR_BYTE8_P (c))
3165 {
3166 c = CHAR_TO_BYTE8 (c);
3167 c1 = CHAR_TO_BYTE8 (c1);
3168 for (; c <= c1; c++)
3169 SET_LIST_BIT (c);
3170 }
3171 else if (multibyte)
3172 {
3173 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3174 }
3175 else
3176 {
3177 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3178 }
e934739e 3179 }
cf9c99bc 3180#endif /* emacs */
25fe55af 3181 }
e318085a
RS
3182 }
3183
25fe55af 3184 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3185 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3186 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3187 b[-1]--;
3188 b += b[-1];
fa9a63c5 3189
96cc36cc
RS
3190 /* Build real range table from work area. */
3191 if (RANGE_TABLE_WORK_USED (range_table_work)
3192 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3193 {
3194 int i;
3195 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3196
b18215fc 3197 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3198 bytes for flags, two for COUNT, and three bytes for
3199 each character. */
3200 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3201
b18215fc
RS
3202 /* Indicate the existence of range table. */
3203 laststart[1] |= 0x80;
fa9a63c5 3204
96cc36cc
RS
3205 /* Store the character class flag bits into the range table.
3206 If not in emacs, these flag bits are always 0. */
3207 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3208 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3209
b18215fc
RS
3210 STORE_NUMBER_AND_INCR (b, used / 2);
3211 for (i = 0; i < used; i++)
3212 STORE_CHARACTER_AND_INCR
3213 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3214 }
25fe55af
RS
3215 }
3216 break;
fa9a63c5
RM
3217
3218
b18215fc 3219 case '(':
25fe55af
RS
3220 if (syntax & RE_NO_BK_PARENS)
3221 goto handle_open;
3222 else
3223 goto normal_char;
fa9a63c5
RM
3224
3225
25fe55af
RS
3226 case ')':
3227 if (syntax & RE_NO_BK_PARENS)
3228 goto handle_close;
3229 else
3230 goto normal_char;
e318085a
RS
3231
3232
25fe55af
RS
3233 case '\n':
3234 if (syntax & RE_NEWLINE_ALT)
3235 goto handle_alt;
3236 else
3237 goto normal_char;
e318085a
RS
3238
3239
b18215fc 3240 case '|':
25fe55af
RS
3241 if (syntax & RE_NO_BK_VBAR)
3242 goto handle_alt;
3243 else
3244 goto normal_char;
3245
3246
3247 case '{':
3248 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3249 goto handle_interval;
3250 else
3251 goto normal_char;
3252
3253
3254 case '\\':
3255 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3256
3257 /* Do not translate the character after the \, so that we can
3258 distinguish, e.g., \B from \b, even if we normally would
3259 translate, e.g., B to b. */
36595814 3260 PATFETCH (c);
25fe55af
RS
3261
3262 switch (c)
3263 {
3264 case '(':
3265 if (syntax & RE_NO_BK_PARENS)
3266 goto normal_backslash;
3267
3268 handle_open:
505bde11
SM
3269 {
3270 int shy = 0;
c69b0314 3271 regnum_t regnum = 0;
505bde11
SM
3272 if (p+1 < pend)
3273 {
3274 /* Look for a special (?...) construct */
ed0767d8 3275 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3276 {
ed0767d8 3277 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3278 while (!shy)
505bde11 3279 {
c69b0314
SM
3280 PATFETCH (c);
3281 switch (c)
3282 {
3283 case ':': shy = 1; break;
3284 case '0':
3285 /* An explicitly specified regnum must start
3286 with non-0. */
3287 if (regnum == 0)
3288 FREE_STACK_RETURN (REG_BADPAT);
3289 case '1': case '2': case '3': case '4':
3290 case '5': case '6': case '7': case '8': case '9':
3291 regnum = 10*regnum + (c - '0'); break;
3292 default:
3293 /* Only (?:...) is supported right now. */
3294 FREE_STACK_RETURN (REG_BADPAT);
3295 }
505bde11
SM
3296 }
3297 }
505bde11
SM
3298 }
3299
3300 if (!shy)
c69b0314
SM
3301 regnum = ++bufp->re_nsub;
3302 else if (regnum)
3303 { /* It's actually not shy, but explicitly numbered. */
3304 shy = 0;
3305 if (regnum > bufp->re_nsub)
3306 bufp->re_nsub = regnum;
3307 else if (regnum > bufp->re_nsub
3308 /* Ideally, we'd want to check that the specified
3309 group can't have matched (i.e. all subgroups
3310 using the same regnum are in other branches of
3311 OR patterns), but we don't currently keep track
3312 of enough info to do that easily. */
3313 || group_in_compile_stack (compile_stack, regnum))
3314 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3315 }
c69b0314
SM
3316 else
3317 /* It's really shy. */
3318 regnum = - bufp->re_nsub;
25fe55af 3319
99633e97
SM
3320 if (COMPILE_STACK_FULL)
3321 {
3322 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3323 compile_stack_elt_t);
3324 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3325
99633e97
SM
3326 compile_stack.size <<= 1;
3327 }
25fe55af 3328
99633e97 3329 /* These are the values to restore when we hit end of this
7814e705 3330 group. They are all relative offsets, so that if the
99633e97
SM
3331 whole pattern moves because of realloc, they will still
3332 be valid. */
3333 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3334 COMPILE_STACK_TOP.fixup_alt_jump
3335 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3336 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3337 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3338
c69b0314
SM
3339 /* Do not push a start_memory for groups beyond the last one
3340 we can represent in the compiled pattern. */
3341 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3342 BUF_PUSH_2 (start_memory, regnum);
3343
3344 compile_stack.avail++;
3345
3346 fixup_alt_jump = 0;
3347 laststart = 0;
3348 begalt = b;
3349 /* If we've reached MAX_REGNUM groups, then this open
3350 won't actually generate any code, so we'll have to
3351 clear pending_exact explicitly. */
3352 pending_exact = 0;
3353 break;
505bde11 3354 }
25fe55af
RS
3355
3356 case ')':
3357 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3358
3359 if (COMPILE_STACK_EMPTY)
505bde11
SM
3360 {
3361 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3362 goto normal_backslash;
3363 else
3364 FREE_STACK_RETURN (REG_ERPAREN);
3365 }
25fe55af
RS
3366
3367 handle_close:
505bde11 3368 FIXUP_ALT_JUMP ();
25fe55af
RS
3369
3370 /* See similar code for backslashed left paren above. */
3371 if (COMPILE_STACK_EMPTY)
505bde11
SM
3372 {
3373 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3374 goto normal_char;
3375 else
3376 FREE_STACK_RETURN (REG_ERPAREN);
3377 }
25fe55af
RS
3378
3379 /* Since we just checked for an empty stack above, this
3380 ``can't happen''. */
3381 assert (compile_stack.avail != 0);
3382 {
3383 /* We don't just want to restore into `regnum', because
3384 later groups should continue to be numbered higher,
7814e705 3385 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3386 regnum_t regnum;
25fe55af
RS
3387
3388 compile_stack.avail--;
3389 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3390 fixup_alt_jump
3391 = COMPILE_STACK_TOP.fixup_alt_jump
3392 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3393 : 0;
3394 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3395 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3396 /* If we've reached MAX_REGNUM groups, then this open
3397 won't actually generate any code, so we'll have to
3398 clear pending_exact explicitly. */
3399 pending_exact = 0;
e318085a 3400
25fe55af 3401 /* We're at the end of the group, so now we know how many
7814e705 3402 groups were inside this one. */
c69b0314
SM
3403 if (regnum <= MAX_REGNUM && regnum > 0)
3404 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3405 }
3406 break;
3407
3408
3409 case '|': /* `\|'. */
3410 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3411 goto normal_backslash;
3412 handle_alt:
3413 if (syntax & RE_LIMITED_OPS)
3414 goto normal_char;
3415
3416 /* Insert before the previous alternative a jump which
7814e705 3417 jumps to this alternative if the former fails. */
25fe55af
RS
3418 GET_BUFFER_SPACE (3);
3419 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3420 pending_exact = 0;
3421 b += 3;
3422
3423 /* The alternative before this one has a jump after it
3424 which gets executed if it gets matched. Adjust that
3425 jump so it will jump to this alternative's analogous
3426 jump (put in below, which in turn will jump to the next
3427 (if any) alternative's such jump, etc.). The last such
3428 jump jumps to the correct final destination. A picture:
3429 _____ _____
3430 | | | |
3431 | v | v
3432 a | b | c
3433
3434 If we are at `b', then fixup_alt_jump right now points to a
3435 three-byte space after `a'. We'll put in the jump, set
3436 fixup_alt_jump to right after `b', and leave behind three
3437 bytes which we'll fill in when we get to after `c'. */
3438
505bde11 3439 FIXUP_ALT_JUMP ();
25fe55af
RS
3440
3441 /* Mark and leave space for a jump after this alternative,
3442 to be filled in later either by next alternative or
3443 when we know we're at the end of a series of alternatives. */
3444 fixup_alt_jump = b;
3445 GET_BUFFER_SPACE (3);
3446 b += 3;
3447
3448 laststart = 0;
3449 begalt = b;
3450 break;
3451
3452
3453 case '{':
3454 /* If \{ is a literal. */
3455 if (!(syntax & RE_INTERVALS)
3456 /* If we're at `\{' and it's not the open-interval
3457 operator. */
4bb91c68 3458 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3459 goto normal_backslash;
3460
3461 handle_interval:
3462 {
3463 /* If got here, then the syntax allows intervals. */
3464
3465 /* At least (most) this many matches must be made. */
99633e97 3466 int lower_bound = 0, upper_bound = -1;
25fe55af 3467
ed0767d8 3468 beg_interval = p;
25fe55af 3469
25fe55af
RS
3470 GET_UNSIGNED_NUMBER (lower_bound);
3471
3472 if (c == ',')
ed0767d8 3473 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3474 else
3475 /* Interval such as `{1}' => match exactly once. */
3476 upper_bound = lower_bound;
3477
3478 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3479 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3480 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3481
3482 if (!(syntax & RE_NO_BK_BRACES))
3483 {
4bb91c68
SM
3484 if (c != '\\')
3485 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3486 if (p == pend)
3487 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3488 PATFETCH (c);
3489 }
3490
3491 if (c != '}')
4bb91c68 3492 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3493
3494 /* We just parsed a valid interval. */
3495
3496 /* If it's invalid to have no preceding re. */
3497 if (!laststart)
3498 {
3499 if (syntax & RE_CONTEXT_INVALID_OPS)
3500 FREE_STACK_RETURN (REG_BADRPT);
3501 else if (syntax & RE_CONTEXT_INDEP_OPS)
3502 laststart = b;
3503 else
3504 goto unfetch_interval;
3505 }
3506
6df42991
SM
3507 if (upper_bound == 0)
3508 /* If the upper bound is zero, just drop the sub pattern
3509 altogether. */
3510 b = laststart;
3511 else if (lower_bound == 1 && upper_bound == 1)
3512 /* Just match it once: nothing to do here. */
3513 ;
3514
3515 /* Otherwise, we have a nontrivial interval. When
3516 we're all done, the pattern will look like:
3517 set_number_at <jump count> <upper bound>
3518 set_number_at <succeed_n count> <lower bound>
3519 succeed_n <after jump addr> <succeed_n count>
3520 <body of loop>
3521 jump_n <succeed_n addr> <jump count>
3522 (The upper bound and `jump_n' are omitted if
3523 `upper_bound' is 1, though.) */
3524 else
3525 { /* If the upper bound is > 1, we need to insert
3526 more at the end of the loop. */
3527 unsigned int nbytes = (upper_bound < 0 ? 3
3528 : upper_bound > 1 ? 5 : 0);
3529 unsigned int startoffset = 0;
3530
3531 GET_BUFFER_SPACE (20); /* We might use less. */
3532
3533 if (lower_bound == 0)
3534 {
3535 /* A succeed_n that starts with 0 is really a
3536 simple on_failure_jump_loop. */
3537 INSERT_JUMP (on_failure_jump_loop, laststart,
3538 b + 3 + nbytes);
3539 b += 3;
3540 }
3541 else
3542 {
3543 /* Initialize lower bound of the `succeed_n', even
3544 though it will be set during matching by its
3545 attendant `set_number_at' (inserted next),
3546 because `re_compile_fastmap' needs to know.
3547 Jump to the `jump_n' we might insert below. */
3548 INSERT_JUMP2 (succeed_n, laststart,
3549 b + 5 + nbytes,
3550 lower_bound);
3551 b += 5;
3552
3553 /* Code to initialize the lower bound. Insert
7814e705 3554 before the `succeed_n'. The `5' is the last two
6df42991
SM
3555 bytes of this `set_number_at', plus 3 bytes of
3556 the following `succeed_n'. */
3557 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3558 b += 5;
3559 startoffset += 5;
3560 }
3561
3562 if (upper_bound < 0)
3563 {
3564 /* A negative upper bound stands for infinity,
3565 in which case it degenerates to a plain jump. */
3566 STORE_JUMP (jump, b, laststart + startoffset);
3567 b += 3;
3568 }
3569 else if (upper_bound > 1)
3570 { /* More than one repetition is allowed, so
3571 append a backward jump to the `succeed_n'
3572 that starts this interval.
3573
3574 When we've reached this during matching,
3575 we'll have matched the interval once, so
3576 jump back only `upper_bound - 1' times. */
3577 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3578 upper_bound - 1);
3579 b += 5;
3580
3581 /* The location we want to set is the second
3582 parameter of the `jump_n'; that is `b-2' as
3583 an absolute address. `laststart' will be
3584 the `set_number_at' we're about to insert;
3585 `laststart+3' the number to set, the source
3586 for the relative address. But we are
3587 inserting into the middle of the pattern --
3588 so everything is getting moved up by 5.
3589 Conclusion: (b - 2) - (laststart + 3) + 5,
3590 i.e., b - laststart.
3591
3592 We insert this at the beginning of the loop
3593 so that if we fail during matching, we'll
3594 reinitialize the bounds. */
3595 insert_op2 (set_number_at, laststart, b - laststart,
3596 upper_bound - 1, b);
3597 b += 5;
3598 }
3599 }
25fe55af
RS
3600 pending_exact = 0;
3601 beg_interval = NULL;
3602 }
3603 break;
3604
3605 unfetch_interval:
3606 /* If an invalid interval, match the characters as literals. */
3607 assert (beg_interval);
3608 p = beg_interval;
3609 beg_interval = NULL;
3610
3611 /* normal_char and normal_backslash need `c'. */
ed0767d8 3612 c = '{';
25fe55af
RS
3613
3614 if (!(syntax & RE_NO_BK_BRACES))
3615 {
ed0767d8
SM
3616 assert (p > pattern && p[-1] == '\\');
3617 goto normal_backslash;
25fe55af 3618 }
ed0767d8
SM
3619 else
3620 goto normal_char;
e318085a 3621
b18215fc 3622#ifdef emacs
25fe55af 3623 /* There is no way to specify the before_dot and after_dot
7814e705 3624 operators. rms says this is ok. --karl */
25fe55af
RS
3625 case '=':
3626 BUF_PUSH (at_dot);
3627 break;
3628
3629 case 's':
3630 laststart = b;
3631 PATFETCH (c);
3632 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3633 break;
3634
3635 case 'S':
3636 laststart = b;
3637 PATFETCH (c);
3638 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3639 break;
b18215fc
RS
3640
3641 case 'c':
3642 laststart = b;
36595814 3643 PATFETCH (c);
b18215fc
RS
3644 BUF_PUSH_2 (categoryspec, c);
3645 break;
e318085a 3646
b18215fc
RS
3647 case 'C':
3648 laststart = b;
36595814 3649 PATFETCH (c);
b18215fc
RS
3650 BUF_PUSH_2 (notcategoryspec, c);
3651 break;
3652#endif /* emacs */
e318085a 3653
e318085a 3654
25fe55af 3655 case 'w':
4bb91c68
SM
3656 if (syntax & RE_NO_GNU_OPS)
3657 goto normal_char;
25fe55af 3658 laststart = b;
1fb352e0 3659 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3660 break;
e318085a 3661
e318085a 3662
25fe55af 3663 case 'W':
4bb91c68
SM
3664 if (syntax & RE_NO_GNU_OPS)
3665 goto normal_char;
25fe55af 3666 laststart = b;
1fb352e0 3667 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3668 break;
e318085a
RS
3669
3670
25fe55af 3671 case '<':
4bb91c68
SM
3672 if (syntax & RE_NO_GNU_OPS)
3673 goto normal_char;
25fe55af
RS
3674 BUF_PUSH (wordbeg);
3675 break;
e318085a 3676
25fe55af 3677 case '>':
4bb91c68
SM
3678 if (syntax & RE_NO_GNU_OPS)
3679 goto normal_char;
25fe55af
RS
3680 BUF_PUSH (wordend);
3681 break;
e318085a 3682
669fa600
SM
3683 case '_':
3684 if (syntax & RE_NO_GNU_OPS)
3685 goto normal_char;
3686 laststart = b;
3687 PATFETCH (c);
3688 if (c == '<')
3689 BUF_PUSH (symbeg);
3690 else if (c == '>')
3691 BUF_PUSH (symend);
3692 else
3693 FREE_STACK_RETURN (REG_BADPAT);
3694 break;
3695
25fe55af 3696 case 'b':
4bb91c68
SM
3697 if (syntax & RE_NO_GNU_OPS)
3698 goto normal_char;
25fe55af
RS
3699 BUF_PUSH (wordbound);
3700 break;
e318085a 3701
25fe55af 3702 case 'B':
4bb91c68
SM
3703 if (syntax & RE_NO_GNU_OPS)
3704 goto normal_char;
25fe55af
RS
3705 BUF_PUSH (notwordbound);
3706 break;
fa9a63c5 3707
25fe55af 3708 case '`':
4bb91c68
SM
3709 if (syntax & RE_NO_GNU_OPS)
3710 goto normal_char;
25fe55af
RS
3711 BUF_PUSH (begbuf);
3712 break;
e318085a 3713
25fe55af 3714 case '\'':
4bb91c68
SM
3715 if (syntax & RE_NO_GNU_OPS)
3716 goto normal_char;
25fe55af
RS
3717 BUF_PUSH (endbuf);
3718 break;
e318085a 3719
25fe55af
RS
3720 case '1': case '2': case '3': case '4': case '5':
3721 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3722 {
3723 regnum_t reg;
e318085a 3724
0cdd06f8
SM
3725 if (syntax & RE_NO_BK_REFS)
3726 goto normal_backslash;
e318085a 3727
0cdd06f8 3728 reg = c - '0';
e318085a 3729
c69b0314
SM
3730 if (reg > bufp->re_nsub || reg < 1
3731 /* Can't back reference to a subexp before its end. */
3732 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3733 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3734
0cdd06f8
SM
3735 laststart = b;
3736 BUF_PUSH_2 (duplicate, reg);
3737 }
25fe55af 3738 break;
e318085a 3739
e318085a 3740
25fe55af
RS
3741 case '+':
3742 case '?':
3743 if (syntax & RE_BK_PLUS_QM)
3744 goto handle_plus;
3745 else
3746 goto normal_backslash;
3747
3748 default:
3749 normal_backslash:
3750 /* You might think it would be useful for \ to mean
3751 not to translate; but if we don't translate it
4bb91c68 3752 it will never match anything. */
25fe55af
RS
3753 goto normal_char;
3754 }
3755 break;
fa9a63c5
RM
3756
3757
3758 default:
25fe55af 3759 /* Expects the character in `c'. */
fa9a63c5 3760 normal_char:
36595814 3761 /* If no exactn currently being built. */
25fe55af 3762 if (!pending_exact
fa9a63c5 3763
25fe55af
RS
3764 /* If last exactn not at current position. */
3765 || pending_exact + *pending_exact + 1 != b
5e69f11e 3766
25fe55af 3767 /* We have only one byte following the exactn for the count. */
2d1675e4 3768 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3769
7814e705 3770 /* If followed by a repetition operator. */
9d99031f 3771 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3772 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3773 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3774 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3775 || ((syntax & RE_INTERVALS)
25fe55af 3776 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3777 ? p != pend && *p == '{'
3778 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3779 {
3780 /* Start building a new exactn. */
5e69f11e 3781
25fe55af 3782 laststart = b;
fa9a63c5
RM
3783
3784 BUF_PUSH_2 (exactn, 0);
3785 pending_exact = b - 1;
25fe55af 3786 }
5e69f11e 3787
2d1675e4
SM
3788 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3789 {
e0277a47
KH
3790 int len;
3791
cf9c99bc 3792 if (multibyte)
6fdd04b0 3793 {
cf9c99bc 3794 c = TRANSLATE (c);
6fdd04b0
KH
3795 len = CHAR_STRING (c, b);
3796 b += len;
3797 }
e0277a47 3798 else
6fdd04b0 3799 {
cf9c99bc
KH
3800 c1 = RE_CHAR_TO_MULTIBYTE (c);
3801 if (! CHAR_BYTE8_P (c1))
3802 {
3803 re_wchar_t c2 = TRANSLATE (c1);
3804
3805 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3806 c = c1;
409f2919 3807 }
6fdd04b0
KH
3808 *b++ = c;
3809 len = 1;
3810 }
2d1675e4
SM
3811 (*pending_exact) += len;
3812 }
3813
fa9a63c5 3814 break;
25fe55af 3815 } /* switch (c) */
fa9a63c5
RM
3816 } /* while p != pend */
3817
5e69f11e 3818
fa9a63c5 3819 /* Through the pattern now. */
5e69f11e 3820
505bde11 3821 FIXUP_ALT_JUMP ();
fa9a63c5 3822
5e69f11e 3823 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3824 FREE_STACK_RETURN (REG_EPAREN);
3825
3826 /* If we don't want backtracking, force success
3827 the first time we reach the end of the compiled pattern. */
3828 if (syntax & RE_NO_POSIX_BACKTRACKING)
3829 BUF_PUSH (succeed);
3830
fa9a63c5
RM
3831 /* We have succeeded; set the length of the buffer. */
3832 bufp->used = b - bufp->buffer;
3833
3834#ifdef DEBUG
99633e97 3835 if (debug > 0)
fa9a63c5 3836 {
505bde11 3837 re_compile_fastmap (bufp);
fa9a63c5
RM
3838 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3839 print_compiled_pattern (bufp);
3840 }
99633e97 3841 debug--;
fa9a63c5
RM
3842#endif /* DEBUG */
3843
3844#ifndef MATCH_MAY_ALLOCATE
3845 /* Initialize the failure stack to the largest possible stack. This
3846 isn't necessary unless we're trying to avoid calling alloca in
3847 the search and match routines. */
3848 {
3849 int num_regs = bufp->re_nsub + 1;
3850
320a2a73 3851 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3852 {
a26f4ccd 3853 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3854
fa9a63c5
RM
3855 if (! fail_stack.stack)
3856 fail_stack.stack
5e69f11e 3857 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3858 * sizeof (fail_stack_elt_t));
3859 else
3860 fail_stack.stack
3861 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3862 (fail_stack.size
3863 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3864 }
3865
3866 regex_grow_registers (num_regs);
3867 }
3868#endif /* not MATCH_MAY_ALLOCATE */
3869
839966f3 3870 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3871} /* regex_compile */
3872\f
3873/* Subroutines for `regex_compile'. */
3874
7814e705 3875/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3876
3877static void
3878store_op1 (op, loc, arg)
3879 re_opcode_t op;
3880 unsigned char *loc;
3881 int arg;
3882{
3883 *loc = (unsigned char) op;
3884 STORE_NUMBER (loc + 1, arg);
3885}
3886
3887
3888/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3889
3890static void
3891store_op2 (op, loc, arg1, arg2)
3892 re_opcode_t op;
3893 unsigned char *loc;
3894 int arg1, arg2;
3895{
3896 *loc = (unsigned char) op;
3897 STORE_NUMBER (loc + 1, arg1);
3898 STORE_NUMBER (loc + 3, arg2);
3899}
3900
3901
3902/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3903 for OP followed by two-byte integer parameter ARG. */
3904
3905static void
3906insert_op1 (op, loc, arg, end)
3907 re_opcode_t op;
3908 unsigned char *loc;
3909 int arg;
5e69f11e 3910 unsigned char *end;
fa9a63c5
RM
3911{
3912 register unsigned char *pfrom = end;
3913 register unsigned char *pto = end + 3;
3914
3915 while (pfrom != loc)
3916 *--pto = *--pfrom;
5e69f11e 3917
fa9a63c5
RM
3918 store_op1 (op, loc, arg);
3919}
3920
3921
3922/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3923
3924static void
3925insert_op2 (op, loc, arg1, arg2, end)
3926 re_opcode_t op;
3927 unsigned char *loc;
3928 int arg1, arg2;
5e69f11e 3929 unsigned char *end;
fa9a63c5
RM
3930{
3931 register unsigned char *pfrom = end;
3932 register unsigned char *pto = end + 5;
3933
3934 while (pfrom != loc)
3935 *--pto = *--pfrom;
5e69f11e 3936
fa9a63c5
RM
3937 store_op2 (op, loc, arg1, arg2);
3938}
3939
3940
3941/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3942 after an alternative or a begin-subexpression. We assume there is at
3943 least one character before the ^. */
3944
3945static boolean
3946at_begline_loc_p (pattern, p, syntax)
01618498 3947 re_char *pattern, *p;
fa9a63c5
RM
3948 reg_syntax_t syntax;
3949{
01618498 3950 re_char *prev = p - 2;
fa9a63c5 3951 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3952
fa9a63c5
RM
3953 return
3954 /* After a subexpression? */
3955 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3956 /* After an alternative? */
d2af47df
SM
3957 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3958 /* After a shy subexpression? */
3959 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3960 && prev[-1] == '?' && prev[-2] == '('
3961 && (syntax & RE_NO_BK_PARENS
3962 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3963}
3964
3965
3966/* The dual of at_begline_loc_p. This one is for $. We assume there is
3967 at least one character after the $, i.e., `P < PEND'. */
3968
3969static boolean
3970at_endline_loc_p (p, pend, syntax)
01618498 3971 re_char *p, *pend;
99633e97 3972 reg_syntax_t syntax;
fa9a63c5 3973{
01618498 3974 re_char *next = p;
fa9a63c5 3975 boolean next_backslash = *next == '\\';
01618498 3976 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3977
fa9a63c5
RM
3978 return
3979 /* Before a subexpression? */
3980 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3981 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3982 /* Before an alternative? */
3983 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3984 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3985}
3986
3987
5e69f11e 3988/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3989 false if it's not. */
3990
3991static boolean
3992group_in_compile_stack (compile_stack, regnum)
3993 compile_stack_type compile_stack;
3994 regnum_t regnum;
3995{
3996 int this_element;
3997
5e69f11e
RM
3998 for (this_element = compile_stack.avail - 1;
3999 this_element >= 0;
fa9a63c5
RM
4000 this_element--)
4001 if (compile_stack.stack[this_element].regnum == regnum)
4002 return true;
4003
4004 return false;
4005}
fa9a63c5 4006\f
f6a3f532
SM
4007/* analyse_first.
4008 If fastmap is non-NULL, go through the pattern and fill fastmap
4009 with all the possible leading chars. If fastmap is NULL, don't
4010 bother filling it up (obviously) and only return whether the
4011 pattern could potentially match the empty string.
4012
4013 Return 1 if p..pend might match the empty string.
4014 Return 0 if p..pend matches at least one char.
01618498 4015 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
4016
4017static int
4018analyse_first (p, pend, fastmap, multibyte)
01618498 4019 re_char *p, *pend;
f6a3f532
SM
4020 char *fastmap;
4021 const int multibyte;
fa9a63c5 4022{
505bde11 4023 int j, k;
1fb352e0 4024 boolean not;
fa9a63c5 4025
b18215fc 4026 /* If all elements for base leading-codes in fastmap is set, this
7814e705 4027 flag is set true. */
b18215fc
RS
4028 boolean match_any_multibyte_characters = false;
4029
f6a3f532 4030 assert (p);
5e69f11e 4031
505bde11
SM
4032 /* The loop below works as follows:
4033 - It has a working-list kept in the PATTERN_STACK and which basically
4034 starts by only containing a pointer to the first operation.
4035 - If the opcode we're looking at is a match against some set of
4036 chars, then we add those chars to the fastmap and go on to the
4037 next work element from the worklist (done via `break').
4038 - If the opcode is a control operator on the other hand, we either
4039 ignore it (if it's meaningless at this point, such as `start_memory')
4040 or execute it (if it's a jump). If the jump has several destinations
4041 (i.e. `on_failure_jump'), then we push the other destination onto the
4042 worklist.
4043 We guarantee termination by ignoring backward jumps (more or less),
4044 so that `p' is monotonically increasing. More to the point, we
4045 never set `p' (or push) anything `<= p1'. */
4046
01618498 4047 while (p < pend)
fa9a63c5 4048 {
505bde11
SM
4049 /* `p1' is used as a marker of how far back a `on_failure_jump'
4050 can go without being ignored. It is normally equal to `p'
4051 (which prevents any backward `on_failure_jump') except right
4052 after a plain `jump', to allow patterns such as:
4053 0: jump 10
4054 3..9: <body>
4055 10: on_failure_jump 3
4056 as used for the *? operator. */
01618498 4057 re_char *p1 = p;
5e69f11e 4058
fa9a63c5
RM
4059 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4060 {
f6a3f532 4061 case succeed:
01618498 4062 return 1;
f6a3f532 4063 continue;
fa9a63c5 4064
fa9a63c5 4065 case duplicate:
505bde11
SM
4066 /* If the first character has to match a backreference, that means
4067 that the group was empty (since it already matched). Since this
4068 is the only case that interests us here, we can assume that the
4069 backreference must match the empty string. */
4070 p++;
4071 continue;
fa9a63c5
RM
4072
4073
4074 /* Following are the cases which match a character. These end
7814e705 4075 with `break'. */
fa9a63c5
RM
4076
4077 case exactn:
e0277a47 4078 if (fastmap)
cf9c99bc
KH
4079 {
4080 /* If multibyte is nonzero, the first byte of each
4081 character is an ASCII or a leading code. Otherwise,
4082 each byte is a character. Thus, this works in both
4083 cases. */
4084 fastmap[p[1]] = 1;
4085 if (! multibyte)
4086 {
4087 /* For the case of matching this unibyte regex
4088 against multibyte, we must set a leading code of
4089 the corresponding multibyte character. */
4090 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4091
86e893e3 4092 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
4093 }
4094 }
fa9a63c5
RM
4095 break;
4096
4097
1fb352e0
SM
4098 case anychar:
4099 /* We could put all the chars except for \n (and maybe \0)
4100 but we don't bother since it is generally not worth it. */
f6a3f532 4101 if (!fastmap) break;
01618498 4102 return -1;
fa9a63c5
RM
4103
4104
b18215fc 4105 case charset_not:
1fb352e0 4106 if (!fastmap) break;
bf216479
KH
4107 {
4108 /* Chars beyond end of bitmap are possible matches. */
bf216479 4109 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 4110 j < (1 << BYTEWIDTH); j++)
bf216479
KH
4111 fastmap[j] = 1;
4112 }
4113
1fb352e0
SM
4114 /* Fallthrough */
4115 case charset:
4116 if (!fastmap) break;
4117 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4118 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4119 j >= 0; j--)
1fb352e0 4120 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4121 fastmap[j] = 1;
b18215fc 4122
6482db2e
KH
4123#ifdef emacs
4124 if (/* Any leading code can possibly start a character
1fb352e0 4125 which doesn't match the specified set of characters. */
6482db2e 4126 not
409f2919 4127 ||
6482db2e
KH
4128 /* If we can match a character class, we can match any
4129 multibyte characters. */
4130 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4131 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4132
b18215fc 4133 {
b18215fc
RS
4134 if (match_any_multibyte_characters == false)
4135 {
6482db2e
KH
4136 for (j = MIN_MULTIBYTE_LEADING_CODE;
4137 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4138 fastmap[j] = 1;
b18215fc
RS
4139 match_any_multibyte_characters = true;
4140 }
4141 }
b18215fc 4142
1fb352e0
SM
4143 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4144 && match_any_multibyte_characters == false)
4145 {
bf216479 4146 /* Set fastmap[I] to 1 where I is a leading code of each
9117d724 4147 multibyte characer in the range table. */
1fb352e0 4148 int c, count;
bf216479 4149 unsigned char lc1, lc2;
b18215fc 4150
1fb352e0 4151 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4152 bits for a character class. */
1fb352e0 4153 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4154
1fb352e0
SM
4155 /* Extract the number of ranges in range table into COUNT. */
4156 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4157 for (; count > 0; count--, p += 3)
1fb352e0 4158 {
9117d724
KH
4159 /* Extract the start and end of each range. */
4160 EXTRACT_CHARACTER (c, p);
bf216479 4161 lc1 = CHAR_LEADING_CODE (c);
9117d724 4162 p += 3;
1fb352e0 4163 EXTRACT_CHARACTER (c, p);
bf216479
KH
4164 lc2 = CHAR_LEADING_CODE (c);
4165 for (j = lc1; j <= lc2; j++)
9117d724 4166 fastmap[j] = 1;
1fb352e0
SM
4167 }
4168 }
6482db2e 4169#endif
b18215fc
RS
4170 break;
4171
1fb352e0
SM
4172 case syntaxspec:
4173 case notsyntaxspec:
4174 if (!fastmap) break;
4175#ifndef emacs
4176 not = (re_opcode_t)p[-1] == notsyntaxspec;
4177 k = *p++;
4178 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4179 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4180 fastmap[j] = 1;
b18215fc 4181 break;
1fb352e0 4182#else /* emacs */
b18215fc
RS
4183 /* This match depends on text properties. These end with
4184 aborting optimizations. */
01618498 4185 return -1;
b18215fc
RS
4186
4187 case categoryspec:
b18215fc 4188 case notcategoryspec:
1fb352e0
SM
4189 if (!fastmap) break;
4190 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4191 k = *p++;
6482db2e 4192 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4193 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4194 fastmap[j] = 1;
4195
6482db2e
KH
4196 /* Any leading code can possibly start a character which
4197 has or doesn't has the specified category. */
4198 if (match_any_multibyte_characters == false)
6fdd04b0 4199 {
6482db2e
KH
4200 for (j = MIN_MULTIBYTE_LEADING_CODE;
4201 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4202 fastmap[j] = 1;
4203 match_any_multibyte_characters = true;
6fdd04b0 4204 }
b18215fc
RS
4205 break;
4206
fa9a63c5 4207 /* All cases after this match the empty string. These end with
25fe55af 4208 `continue'. */
fa9a63c5 4209
fa9a63c5
RM
4210 case before_dot:
4211 case at_dot:
4212 case after_dot:
1fb352e0 4213#endif /* !emacs */
25fe55af
RS
4214 case no_op:
4215 case begline:
4216 case endline:
fa9a63c5
RM
4217 case begbuf:
4218 case endbuf:
4219 case wordbound:
4220 case notwordbound:
4221 case wordbeg:
4222 case wordend:
669fa600
SM
4223 case symbeg:
4224 case symend:
25fe55af 4225 continue;
fa9a63c5
RM
4226
4227
fa9a63c5 4228 case jump:
25fe55af 4229 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4230 if (j < 0)
4231 /* Backward jumps can only go back to code that we've already
4232 visited. `re_compile' should make sure this is true. */
4233 break;
25fe55af 4234 p += j;
505bde11
SM
4235 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4236 {
4237 case on_failure_jump:
4238 case on_failure_keep_string_jump:
505bde11 4239 case on_failure_jump_loop:
0683b6fa 4240 case on_failure_jump_nastyloop:
505bde11
SM
4241 case on_failure_jump_smart:
4242 p++;
4243 break;
4244 default:
4245 continue;
4246 };
4247 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4248 to jump back to "just after here". */
4249 /* Fallthrough */
fa9a63c5 4250
25fe55af
RS
4251 case on_failure_jump:
4252 case on_failure_keep_string_jump:
0683b6fa 4253 case on_failure_jump_nastyloop:
505bde11
SM
4254 case on_failure_jump_loop:
4255 case on_failure_jump_smart:
25fe55af 4256 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4257 if (p + j <= p1)
ed0767d8 4258 ; /* Backward jump to be ignored. */
01618498
SM
4259 else
4260 { /* We have to look down both arms.
4261 We first go down the "straight" path so as to minimize
4262 stack usage when going through alternatives. */
4263 int r = analyse_first (p, pend, fastmap, multibyte);
4264 if (r) return r;
4265 p += j;
4266 }
25fe55af 4267 continue;
fa9a63c5
RM
4268
4269
ed0767d8
SM
4270 case jump_n:
4271 /* This code simply does not properly handle forward jump_n. */
4272 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4273 p += 4;
4274 /* jump_n can either jump or fall through. The (backward) jump
4275 case has already been handled, so we only need to look at the
4276 fallthrough case. */
4277 continue;
177c0ea7 4278
fa9a63c5 4279 case succeed_n:
ed0767d8
SM
4280 /* If N == 0, it should be an on_failure_jump_loop instead. */
4281 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4282 p += 4;
4283 /* We only care about one iteration of the loop, so we don't
4284 need to consider the case where this behaves like an
4285 on_failure_jump. */
25fe55af 4286 continue;
fa9a63c5
RM
4287
4288
4289 case set_number_at:
25fe55af
RS
4290 p += 4;
4291 continue;
fa9a63c5
RM
4292
4293
4294 case start_memory:
25fe55af 4295 case stop_memory:
505bde11 4296 p += 1;
fa9a63c5
RM
4297 continue;
4298
4299
4300 default:
25fe55af
RS
4301 abort (); /* We have listed all the cases. */
4302 } /* switch *p++ */
fa9a63c5
RM
4303
4304 /* Getting here means we have found the possible starting
25fe55af 4305 characters for one path of the pattern -- and that the empty
7814e705 4306 string does not match. We need not follow this path further. */
01618498 4307 return 0;
fa9a63c5
RM
4308 } /* while p */
4309
01618498
SM
4310 /* We reached the end without matching anything. */
4311 return 1;
4312
f6a3f532
SM
4313} /* analyse_first */
4314\f
4315/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4316 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4317 characters can start a string that matches the pattern. This fastmap
4318 is used by re_search to skip quickly over impossible starting points.
4319
4320 Character codes above (1 << BYTEWIDTH) are not represented in the
4321 fastmap, but the leading codes are represented. Thus, the fastmap
4322 indicates which character sets could start a match.
4323
4324 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4325 area as BUFP->fastmap.
4326
4327 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4328 the pattern buffer.
4329
4330 Returns 0 if we succeed, -2 if an internal error. */
4331
4332int
4333re_compile_fastmap (bufp)
4334 struct re_pattern_buffer *bufp;
4335{
4336 char *fastmap = bufp->fastmap;
4337 int analysis;
4338
4339 assert (fastmap && bufp->buffer);
4340
7814e705 4341 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4342 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4343
4344 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4345 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4346 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4347 return 0;
4348} /* re_compile_fastmap */
4349\f
4350/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4351 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4352 this memory for recording register information. STARTS and ENDS
4353 must be allocated using the malloc library routine, and must each
4354 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4355
4356 If NUM_REGS == 0, then subsequent matches should allocate their own
4357 register data.
4358
4359 Unless this function is called, the first search or match using
4360 PATTERN_BUFFER will allocate its own register data, without
4361 freeing the old data. */
4362
4363void
4364re_set_registers (bufp, regs, num_regs, starts, ends)
4365 struct re_pattern_buffer *bufp;
4366 struct re_registers *regs;
4367 unsigned num_regs;
4368 regoff_t *starts, *ends;
4369{
4370 if (num_regs)
4371 {
4372 bufp->regs_allocated = REGS_REALLOCATE;
4373 regs->num_regs = num_regs;
4374 regs->start = starts;
4375 regs->end = ends;
4376 }
4377 else
4378 {
4379 bufp->regs_allocated = REGS_UNALLOCATED;
4380 regs->num_regs = 0;
4381 regs->start = regs->end = (regoff_t *) 0;
4382 }
4383}
c0f9ea08 4384WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4385\f
7814e705 4386/* Searching routines. */
fa9a63c5
RM
4387
4388/* Like re_search_2, below, but only one string is specified, and
4389 doesn't let you say where to stop matching. */
4390
4391int
4392re_search (bufp, string, size, startpos, range, regs)
4393 struct re_pattern_buffer *bufp;
4394 const char *string;
4395 int size, startpos, range;
4396 struct re_registers *regs;
4397{
5e69f11e 4398 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4399 regs, size);
4400}
c0f9ea08 4401WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4402
70806df6
KH
/* These macros expect `string1', `size1', `string2' and `size2' to be
   in scope at the point of use.  */

/* Start address of whichever string holds position P of the virtual
   concatenation.  */
#define HEAD_ADDR_VSTRING(P)		\
  ((P) >= size1 ? string2 : string1)

/* End address of whichever string holds position P of the virtual
   concatenation.  */
#define STOP_ADDR_VSTRING(P)				\
  ((P) >= size1 ? string2 + size2 : string1 + size1)

/* Address of position POS in the virtual concatenation.  */
#define POS_ADDR_VSTRING(POS)					\
  (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4414
4415/* Using the compiled pattern in BUFP->buffer, first tries to match the
4416 virtual concatenation of STRING1 and STRING2, starting first at index
4417 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4418
fa9a63c5 4419 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4420
fa9a63c5
RM
4421 RANGE is how far to scan while trying to match. RANGE = 0 means try
4422 only at STARTPOS; in general, the last start tried is STARTPOS +
4423 RANGE.
5e69f11e 4424
fa9a63c5
RM
4425 In REGS, return the indices of the virtual concatenation of STRING1
4426 and STRING2 that matched the entire BUFP->buffer and its contained
4427 subexpressions.
5e69f11e 4428
fa9a63c5
RM
4429 Do not consider matching one past the index STOP in the virtual
4430 concatenation of STRING1 and STRING2.
4431
4432 We return either the position in the strings at which the match was
4433 found, -1 if no match, or -2 if error (such as failure
4434 stack overflow). */
4435
4436int
66f0296e 4437re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4438 struct re_pattern_buffer *bufp;
66f0296e 4439 const char *str1, *str2;
fa9a63c5
RM
4440 int size1, size2;
4441 int startpos;
4442 int range;
4443 struct re_registers *regs;
4444 int stop;
4445{
4446 int val;
66f0296e
SM
4447 re_char *string1 = (re_char*) str1;
4448 re_char *string2 = (re_char*) str2;
fa9a63c5 4449 register char *fastmap = bufp->fastmap;
6676cb1c 4450 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4451 int total_size = size1 + size2;
4452 int endpos = startpos + range;
c0f9ea08 4453 boolean anchored_start;
cf9c99bc
KH
4454 /* Nonzero if we are searching multibyte string. */
4455 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4456
fa9a63c5
RM
4457 /* Check for out-of-range STARTPOS. */
4458 if (startpos < 0 || startpos > total_size)
4459 return -1;
5e69f11e 4460
fa9a63c5 4461 /* Fix up RANGE if it might eventually take us outside
34597fa9 4462 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4463 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4464 if (endpos < 0)
4465 range = 0 - startpos;
fa9a63c5
RM
4466 else if (endpos > total_size)
4467 range = total_size - startpos;
4468
4469 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4470 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4471 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4472 {
4473 if (startpos > 0)
4474 return -1;
4475 else
7b140fd7 4476 range = 0;
fa9a63c5
RM
4477 }
4478
ae4788a8
RS
4479#ifdef emacs
4480 /* In a forward search for something that starts with \=.
4481 don't keep searching past point. */
4482 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4483 {
7b140fd7
RS
4484 range = PT_BYTE - BEGV_BYTE - startpos;
4485 if (range < 0)
ae4788a8
RS
4486 return -1;
4487 }
4488#endif /* emacs */
4489
fa9a63c5
RM
4490 /* Update the fastmap now if not correct already. */
4491 if (fastmap && !bufp->fastmap_accurate)
01618498 4492 re_compile_fastmap (bufp);
5e69f11e 4493
c8499ba5 4494 /* See whether the pattern is anchored. */
c0f9ea08 4495 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4496
b18215fc 4497#ifdef emacs
d48cd3f4 4498 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4499 {
99633e97 4500 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4501
4502 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4503 }
b18215fc
RS
4504#endif
4505
fa9a63c5
RM
4506 /* Loop through the string, looking for a place to start matching. */
4507 for (;;)
5e69f11e 4508 {
c8499ba5
RS
4509 /* If the pattern is anchored,
4510 skip quickly past places we cannot match.
4511 We don't bother to treat startpos == 0 specially
4512 because that case doesn't repeat. */
4513 if (anchored_start && startpos > 0)
4514 {
c0f9ea08
SM
4515 if (! ((startpos <= size1 ? string1[startpos - 1]
4516 : string2[startpos - size1 - 1])
4517 == '\n'))
c8499ba5
RS
4518 goto advance;
4519 }
4520
fa9a63c5 4521 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4522 cannot be the start of a match. If the pattern can match the
4523 null string, however, we don't need to skip characters; we want
7814e705 4524 the first null string. */
fa9a63c5
RM
4525 if (fastmap && startpos < total_size && !bufp->can_be_null)
4526 {
66f0296e 4527 register re_char *d;
01618498 4528 register re_wchar_t buf_ch;
e934739e
RS
4529
4530 d = POS_ADDR_VSTRING (startpos);
4531
7814e705 4532 if (range > 0) /* Searching forwards. */
fa9a63c5 4533 {
fa9a63c5
RM
4534 register int lim = 0;
4535 int irange = range;
4536
25fe55af
RS
4537 if (startpos < size1 && startpos + range >= size1)
4538 lim = range - (size1 - startpos);
fa9a63c5 4539
25fe55af
RS
4540 /* Written out as an if-else to avoid testing `translate'
4541 inside the loop. */
28ae27ae
AS
4542 if (RE_TRANSLATE_P (translate))
4543 {
e934739e
RS
4544 if (multibyte)
4545 while (range > lim)
4546 {
4547 int buf_charlen;
4548
62a6e103 4549 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4550 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4551 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4552 break;
4553
4554 range -= buf_charlen;
4555 d += buf_charlen;
4556 }
4557 else
bf216479 4558 while (range > lim)
33c46939 4559 {
cf9c99bc
KH
4560 register re_wchar_t ch, translated;
4561
bf216479 4562 buf_ch = *d;
cf9c99bc
KH
4563 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4564 translated = RE_TRANSLATE (translate, ch);
4565 if (translated != ch
4566 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4567 buf_ch = ch;
6fdd04b0 4568 if (fastmap[buf_ch])
bf216479 4569 break;
33c46939
RS
4570 d++;
4571 range--;
4572 }
e934739e 4573 }
fa9a63c5 4574 else
6fdd04b0
KH
4575 {
4576 if (multibyte)
4577 while (range > lim)
4578 {
4579 int buf_charlen;
fa9a63c5 4580
62a6e103 4581 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4582 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4583 break;
4584 range -= buf_charlen;
4585 d += buf_charlen;
4586 }
e934739e 4587 else
6fdd04b0 4588 while (range > lim && !fastmap[*d])
33c46939
RS
4589 {
4590 d++;
4591 range--;
4592 }
e934739e 4593 }
fa9a63c5
RM
4594 startpos += irange - range;
4595 }
7814e705 4596 else /* Searching backwards. */
fa9a63c5 4597 {
ba5e343c
KH
4598 if (multibyte)
4599 {
62a6e103 4600 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4601 buf_ch = TRANSLATE (buf_ch);
4602 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4603 goto advance;
4604 }
4605 else
4606 {
cf9c99bc
KH
4607 register re_wchar_t ch, translated;
4608
4609 buf_ch = *d;
4610 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4611 translated = TRANSLATE (ch);
4612 if (translated != ch
4613 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4614 buf_ch = ch;
4615 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4616 goto advance;
4617 }
fa9a63c5
RM
4618 }
4619 }
4620
4621 /* If can't match the null string, and that's all we have left, fail. */
4622 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4623 && !bufp->can_be_null)
fa9a63c5
RM
4624 return -1;
4625
4626 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4627 startpos, regs, stop);
fa9a63c5
RM
4628
4629 if (val >= 0)
4630 return startpos;
5e69f11e 4631
fa9a63c5
RM
4632 if (val == -2)
4633 return -2;
4634
4635 advance:
5e69f11e 4636 if (!range)
25fe55af 4637 break;
5e69f11e 4638 else if (range > 0)
25fe55af 4639 {
b18215fc
RS
4640 /* Update STARTPOS to the next character boundary. */
4641 if (multibyte)
4642 {
66f0296e
SM
4643 re_char *p = POS_ADDR_VSTRING (startpos);
4644 re_char *pend = STOP_ADDR_VSTRING (startpos);
aa3830c4 4645 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4646
4647 range -= len;
4648 if (range < 0)
4649 break;
4650 startpos += len;
4651 }
4652 else
4653 {
b560c397
RS
4654 range--;
4655 startpos++;
4656 }
e318085a 4657 }
fa9a63c5 4658 else
25fe55af
RS
4659 {
4660 range++;
4661 startpos--;
b18215fc
RS
4662
4663 /* Update STARTPOS to the previous character boundary. */
4664 if (multibyte)
4665 {
70806df6
KH
4666 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4667 re_char *p0 = p;
4668 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4669
4670 /* Find the head of multibyte form. */
70806df6
KH
4671 PREV_CHAR_BOUNDARY (p, phead);
4672 range += p0 - 1 - p;
4673 if (range > 0)
4674 break;
b18215fc 4675
70806df6 4676 startpos -= p0 - 1 - p;
b18215fc 4677 }
25fe55af 4678 }
fa9a63c5
RM
4679 }
4680 return -1;
4681} /* re_search_2 */
c0f9ea08 4682WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4683\f
4684/* Declarations and macros for re_match_2. */
4685
2d1675e4
SM
4686static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4687 register int len,
4688 RE_TRANSLATE_TYPE translate,
4689 const int multibyte));
fa9a63c5
RM
4690
/* Convert PTR, a pointer into one of the search strings `string1' and
   `string2', into an offset.  A pointer into `string2' has `size1'
   added, so the result is a position within the concatenation of the
   two strings.  */
#define POINTER_TO_OFFSET(ptr) \
  ((regoff_t) (FIRST_STRING_P (ptr) \
               ? (ptr) - string1 \
               : (ptr) - string2 + size1))
4697
/* Call before fetching a character with *d.  While `d' sits at `dend',
   either move on to string2 or, when `dend' is already `end_match_2',
   jump to `fail'.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH() \
  while (d == dend) \
    { \
      if (dend != end_match_2) \
        { \
          /* End of string1 => advance to string2.  */ \
          d = string2; \
          dend = end_match_2; \
        } \
      else \
        /* End of string2 => fail.  */ \
        goto fail; \
    }
4712
f1ad044f
SM
/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc.,
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e. past `stop').  Unlike PREFETCH, it never
   jumps to `fail': it only moves from the end of string1 to string2.  */
#define PREFETCH_NOLIMIT() \
  if (d == end1) \
    { \
      d = string2; \
      dend = end_match_2; \
    }
fa9a63c5
RM
4723
/* Test whether D is at the very beginning or at the very end of the
   virtual concatenation of `string1' and `string2'.  If there is only
   one string, it is `string2'.  */
#define AT_STRINGS_BEG(d) \
  ((d) == (size1 != 0 ? string1 : string2) || size2 == 0)
#define AT_STRINGS_END(d) (end2 == (d))
fa9a63c5
RM
4728
4729
/* Test whether D points to a word-constituent character.  Two special
   cases are handled: when D is `end1' (just past the end of string1),
   the first character of string2 is examined; when D is `string2 - 1'
   (just before string2), the last character of string1 is examined.  */
#define WORDCHAR_P(d) \
  (Sword == SYNTAX ((d) == end1 ? string2[0] \
                    : (d) == string2 - 1 ? end1[-1] \
                    : *(d)))
4738
/* Disabled due to a compiler bug -- see comment at case wordbound.

   The comment at case wordbound is the following one, although
   AT_WORD_BOUNDARY is no longer used, in order to support multibyte
   forms.

   The DEC Alpha C compiler 3.x generates incorrect code for the
   test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
   AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
   macro and introducing temporary variables works around the bug.  */

#if 0
/* Test if the character before D and the one at D differ with respect
   to being word-constituent.  The ends of the strings always count as
   boundaries.  */
#define AT_WORD_BOUNDARY(d) \
  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
   || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
#endif
fa9a63c5
RM
4756
/* Free everything we malloc.  */
#ifdef MATCH_MAY_ALLOCATE
/* Release VAR if it is non-null and reset it to NULL.  The trailing
   `else' absorbs the semicolon written after the macro call.  */
# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
/* Release the failure stack and the four register vectors.  */
# define FREE_VARIABLES() \
  do { \
    REGEX_FREE_STACK (fail_stack.stack); \
    FREE_VAR (regstart); \
    FREE_VAR (regend); \
    FREE_VAR (best_regstart); \
    FREE_VAR (best_regend); \
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
#endif /* not MATCH_MAY_ALLOCATE */
4771
505bde11
SM
4772\f
4773/* Optimization routines. */
4774
4e8a9132
SM
4775/* If the operation is a match against one or more chars,
4776 return a pointer to the next operation, else return NULL. */
01618498 4777static re_char *
4e8a9132 4778skip_one_char (p)
01618498 4779 re_char *p;
4e8a9132
SM
4780{
4781 switch (SWITCH_ENUM_CAST (*p++))
4782 {
4783 case anychar:
4784 break;
177c0ea7 4785
4e8a9132
SM
4786 case exactn:
4787 p += *p + 1;
4788 break;
4789
4790 case charset_not:
4791 case charset:
4792 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4793 {
4794 int mcnt;
4795 p = CHARSET_RANGE_TABLE (p - 1);
4796 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4797 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4798 }
4799 else
4800 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4801 break;
177c0ea7 4802
4e8a9132
SM
4803 case syntaxspec:
4804 case notsyntaxspec:
1fb352e0 4805#ifdef emacs
4e8a9132
SM
4806 case categoryspec:
4807 case notcategoryspec:
4808#endif /* emacs */
4809 p++;
4810 break;
4811
4812 default:
4813 p = NULL;
4814 }
4815 return p;
4816}
4817
4818
505bde11 4819/* Jump over non-matching operations. */
839966f3 4820static re_char *
4e8a9132 4821skip_noops (p, pend)
839966f3 4822 re_char *p, *pend;
505bde11
SM
4823{
4824 int mcnt;
4825 while (p < pend)
4826 {
4827 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4828 {
4829 case start_memory:
505bde11
SM
4830 case stop_memory:
4831 p += 2; break;
4832 case no_op:
4833 p += 1; break;
4834 case jump:
4835 p += 1;
4836 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4837 p += mcnt;
4838 break;
4839 default:
4840 return p;
4841 }
4842 }
4843 assert (p == pend);
4844 return p;
4845}
4846
4847/* Non-zero if "p1 matches something" implies "p2 fails". */
4848static int
4849mutually_exclusive_p (bufp, p1, p2)
4850 struct re_pattern_buffer *bufp;
839966f3 4851 re_char *p1, *p2;
505bde11 4852{
4e8a9132 4853 re_opcode_t op2;
2d1675e4 4854 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4855 unsigned char *pend = bufp->buffer + bufp->used;
4856
4e8a9132 4857 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4858 && p2 >= bufp->buffer && p2 <= pend);
4859
4860 /* Skip over open/close-group commands.
4861 If what follows this loop is a ...+ construct,
4862 look at what begins its body, since we will have to
4863 match at least one of that. */
4e8a9132
SM
4864 p2 = skip_noops (p2, pend);
4865 /* The same skip can be done for p1, except that this function
4866 is only used in the case where p1 is a simple match operator. */
4867 /* p1 = skip_noops (p1, pend); */
4868
4869 assert (p1 >= bufp->buffer && p1 < pend
4870 && p2 >= bufp->buffer && p2 <= pend);
4871
4872 op2 = p2 == pend ? succeed : *p2;
4873
4874 switch (SWITCH_ENUM_CAST (op2))
505bde11 4875 {
4e8a9132
SM
4876 case succeed:
4877 case endbuf:
4878 /* If we're at the end of the pattern, we can change. */
4879 if (skip_one_char (p1))
505bde11 4880 {
505bde11
SM
4881 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4882 return 1;
505bde11 4883 }
4e8a9132 4884 break;
177c0ea7 4885
4e8a9132 4886 case endline:
4e8a9132
SM
4887 case exactn:
4888 {
01618498 4889 register re_wchar_t c
4e8a9132 4890 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4891 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4892
4e8a9132
SM
4893 if ((re_opcode_t) *p1 == exactn)
4894 {
62a6e103 4895 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4896 {
4897 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4898 return 1;
4899 }
4900 }
505bde11 4901
4e8a9132
SM
4902 else if ((re_opcode_t) *p1 == charset
4903 || (re_opcode_t) *p1 == charset_not)
4904 {
4905 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4906
4e8a9132
SM
4907 /* Test if C is listed in charset (or charset_not)
4908 at `p1'. */
6fdd04b0 4909 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4910 {
4911 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4912 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4913 not = !not;
4914 }
4915 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4916 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4917
4e8a9132
SM
4918 /* `not' is equal to 1 if c would match, which means
4919 that we can't change to pop_failure_jump. */
4920 if (!not)
4921 {
4922 DEBUG_PRINT1 (" No match => fast loop.\n");
4923 return 1;
4924 }
4925 }
4926 else if ((re_opcode_t) *p1 == anychar
4927 && c == '\n')
4928 {
4929 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4930 return 1;
4931 }
4932 }
4933 break;
505bde11 4934
4e8a9132 4935 case charset:
4e8a9132
SM
4936 {
4937 if ((re_opcode_t) *p1 == exactn)
4938 /* Reuse the code above. */
4939 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4940
505bde11
SM
4941 /* It is hard to list up all the character in charset
4942 P2 if it includes multibyte character. Give up in
4943 such case. */
4944 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4945 {
4946 /* Now, we are sure that P2 has no range table.
4947 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4948 enough. But P1 may have range table, so the
505bde11
SM
4949 size of bitmap table of P1 is extracted by
4950 using macro `CHARSET_BITMAP_SIZE'.
4951
6fdd04b0
KH
4952 In a multibyte case, we know that all the character
4953 listed in P2 is ASCII. In a unibyte case, P1 has only a
4954 bitmap table. So, in both cases, it is enough to test
4955 only the bitmap table of P1. */
505bde11 4956
411e4203 4957 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4958 {
4959 int idx;
4960 /* We win if the charset inside the loop
4961 has no overlap with the one after the loop. */
4962 for (idx = 0;
4963 (idx < (int) p2[1]
4964 && idx < CHARSET_BITMAP_SIZE (p1));
4965 idx++)
4966 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4967 break;
4968
4969 if (idx == p2[1]
4970 || idx == CHARSET_BITMAP_SIZE (p1))
4971 {
4972 DEBUG_PRINT1 (" No match => fast loop.\n");
4973 return 1;
4974 }
4975 }
411e4203 4976 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4977 {
4978 int idx;
4979 /* We win if the charset_not inside the loop lists
7814e705 4980 every character listed in the charset after. */
505bde11
SM
4981 for (idx = 0; idx < (int) p2[1]; idx++)
4982 if (! (p2[2 + idx] == 0
4983 || (idx < CHARSET_BITMAP_SIZE (p1)
4984 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4985 break;
4986
4e8a9132
SM
4987 if (idx == p2[1])
4988 {
4989 DEBUG_PRINT1 (" No match => fast loop.\n");
4990 return 1;
4991 }
4992 }
4993 }
4994 }
609b757a 4995 break;
177c0ea7 4996
411e4203
SM
4997 case charset_not:
4998 switch (SWITCH_ENUM_CAST (*p1))
4999 {
5000 case exactn:
5001 case charset:
5002 /* Reuse the code above. */
5003 return mutually_exclusive_p (bufp, p2, p1);
5004 case charset_not:
5005 /* When we have two charset_not, it's very unlikely that
5006 they don't overlap. The union of the two sets of excluded
5007 chars should cover all possible chars, which, as a matter of
5008 fact, is virtually impossible in multibyte buffers. */
36595814 5009 break;
411e4203
SM
5010 }
5011 break;
5012
4e8a9132 5013 case wordend:
669fa600
SM
5014 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
5015 case symend:
4e8a9132 5016 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
5017 && (p1[1] == Ssymbol || p1[1] == Sword));
5018 case notsyntaxspec:
5019 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
5020
5021 case wordbeg:
669fa600
SM
5022 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
5023 case symbeg:
4e8a9132 5024 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
5025 && (p1[1] == Ssymbol || p1[1] == Sword));
5026 case syntaxspec:
5027 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
5028
5029 case wordbound:
5030 return (((re_opcode_t) *p1 == notsyntaxspec
5031 || (re_opcode_t) *p1 == syntaxspec)
5032 && p1[1] == Sword);
5033
1fb352e0 5034#ifdef emacs
4e8a9132
SM
5035 case categoryspec:
5036 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
5037 case notcategoryspec:
5038 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
5039#endif /* emacs */
5040
5041 default:
5042 ;
505bde11
SM
5043 }
5044
5045 /* Safe default. */
5046 return 0;
5047}
5048
fa9a63c5
RM
5049\f
5050/* Matching routines. */
5051
25fe55af 5052#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
5053/* re_match is like re_match_2 except it takes only a single string. */
5054
5055int
5056re_match (bufp, string, size, pos, regs)
5057 struct re_pattern_buffer *bufp;
5058 const char *string;
5059 int size, pos;
5060 struct re_registers *regs;
5061{
4bb91c68 5062 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 5063 pos, regs, size);
fa9a63c5
RM
5064 return result;
5065}
c0f9ea08 5066WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
5067#endif /* not emacs */
5068
b18215fc
RS
5069#ifdef emacs
5070/* In Emacs, this is the string or buffer in which we
7814e705 5071 are matching. It is used for looking up syntax properties. */
b18215fc
RS
5072Lisp_Object re_match_object;
5073#endif
fa9a63c5
RM
5074
5075/* re_match_2 matches the compiled pattern in BUFP against the
5076 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5077 and SIZE2, respectively). We start matching at POS, and stop
5078 matching at STOP.
5e69f11e 5079
fa9a63c5 5080 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 5081 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
5082 documentation for exactly how many groups we fill.
5083
5084 We return -1 if no match, -2 if an internal error (such as the
7814e705 5085 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
5086 matched substring. */
5087
5088int
5089re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
5090 struct re_pattern_buffer *bufp;
5091 const char *string1, *string2;
5092 int size1, size2;
5093 int pos;
5094 struct re_registers *regs;
5095 int stop;
5096{
b18215fc 5097 int result;
25fe55af 5098
b18215fc 5099#ifdef emacs
cc9b4df2 5100 int charpos;
d48cd3f4 5101 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 5102 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 5103 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
5104#endif
5105
4bb91c68
SM
5106 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5107 (re_char*) string2, size2,
cc9b4df2 5108 pos, regs, stop);
fa9a63c5
RM
5109 return result;
5110}
c0f9ea08 5111WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 5112
bf216479 5113
fa9a63c5 5114/* This is a separate function so that we can force an alloca cleanup
7814e705 5115 afterwards. */
fa9a63c5
RM
5116static int
5117re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5118 struct re_pattern_buffer *bufp;
66f0296e 5119 re_char *string1, *string2;
fa9a63c5
RM
5120 int size1, size2;
5121 int pos;
5122 struct re_registers *regs;
5123 int stop;
5124{
5125 /* General temporaries. */
5126 int mcnt;
01618498 5127 size_t reg;
66f0296e 5128 boolean not;
fa9a63c5
RM
5129
5130 /* Just past the end of the corresponding string. */
66f0296e 5131 re_char *end1, *end2;
fa9a63c5
RM
5132
5133 /* Pointers into string1 and string2, just past the last characters in
7814e705 5134 each to consider matching. */
66f0296e 5135 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
5136
5137 /* Where we are in the data, and the end of the current string. */
66f0296e 5138 re_char *d, *dend;
5e69f11e 5139
99633e97
SM
5140 /* Used sometimes to remember where we were before starting matching
5141 an operator so that we can go back in case of failure. This "atomic"
5142 behavior of matching opcodes is indispensable to the correctness
5143 of the on_failure_keep_string_jump optimization. */
5144 re_char *dfail;
5145
fa9a63c5 5146 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5147 re_char *p = bufp->buffer;
5148 re_char *pend = p + bufp->used;
fa9a63c5 5149
25fe55af 5150 /* We use this to map every character in the string. */
6676cb1c 5151 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5152
cf9c99bc 5153 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5154 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5155
cf9c99bc
KH
5156 /* Nonzero if STRING1/STRING2 are multibyte. */
5157 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5158
fa9a63c5
RM
5159 /* Failure point stack. Each place that can handle a failure further
5160 down the line pushes a failure point on this stack. It consists of
505bde11 5161 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5162 the subexpressions we're currently inside, plus the number of such
5163 registers, and, finally, two char *'s. The first char * is where
5164 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5165 scanning the strings. */
5166#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5167 fail_stack_type fail_stack;
5168#endif
5169#ifdef DEBUG
fa9a63c5
RM
5170 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5171#endif
5172
0b32bf0e 5173#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5174 /* This holds the pointer to the failure stack, when
5175 it is allocated relocatably. */
5176 fail_stack_elt_t *failure_stack_ptr;
99633e97 5177#endif
fa9a63c5
RM
5178
5179 /* We fill all the registers internally, independent of what we
7814e705 5180 return, for use in backreferences. The number here includes
fa9a63c5 5181 an element for register zero. */
4bb91c68 5182 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5183
fa9a63c5
RM
5184 /* Information on the contents of registers. These are pointers into
5185 the input strings; they record just what was matched (on this
5186 attempt) by a subexpression part of the pattern, that is, the
5187 regnum-th regstart pointer points to where in the pattern we began
5188 matching and the regnum-th regend points to right after where we
5189 stopped matching the regnum-th subexpression. (The zeroth register
5190 keeps track of what the whole pattern matches.) */
5191#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5192 re_char **regstart, **regend;
fa9a63c5
RM
5193#endif
5194
fa9a63c5 5195 /* The following record the register info as found in the above
5e69f11e 5196 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5197 This happens as we backtrack through the failure points, which in
5198 turn happens only if we have not yet matched the entire string. */
5199 unsigned best_regs_set = false;
5200#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5201 re_char **best_regstart, **best_regend;
fa9a63c5 5202#endif
5e69f11e 5203
fa9a63c5
RM
5204 /* Logically, this is `best_regend[0]'. But we don't want to have to
5205 allocate space for that if we're not allocating space for anything
7814e705 5206 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5207 any of the other register vectors, and it seems rather a kludge to
5208 treat `best_regend' differently than the rest. So we keep track of
5209 the end of the best match so far in a separate variable. We
5210 initialize this to NULL so that when we backtrack the first time
5211 and need to test it, it's not garbage. */
66f0296e 5212 re_char *match_end = NULL;
fa9a63c5 5213
fa9a63c5
RM
5214#ifdef DEBUG
5215 /* Counts the total number of registers pushed. */
5e69f11e 5216 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5217#endif
5218
5219 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5220
fa9a63c5 5221 INIT_FAIL_STACK ();
5e69f11e 5222
fa9a63c5
RM
5223#ifdef MATCH_MAY_ALLOCATE
5224 /* Do not bother to initialize all the register variables if there are
5225 no groups in the pattern, as it takes a fair amount of time. If
5226 there are groups, we include space for register 0 (the whole
5227 pattern), even though we never use it, since it simplifies the
5228 array indexing. We should fix this. */
5229 if (bufp->re_nsub)
5230 {
66f0296e
SM
5231 regstart = REGEX_TALLOC (num_regs, re_char *);
5232 regend = REGEX_TALLOC (num_regs, re_char *);
5233 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5234 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5235
505bde11 5236 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5237 {
5238 FREE_VARIABLES ();
5239 return -2;
5240 }
fa9a63c5
RM
5241 }
5242 else
5243 {
5244 /* We must initialize all our variables to NULL, so that
25fe55af 5245 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5246 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5247 }
5248#endif /* MATCH_MAY_ALLOCATE */
5249
5250 /* The starting position is bogus. */
5251 if (pos < 0 || pos > size1 + size2)
5252 {
5253 FREE_VARIABLES ();
5254 return -1;
5255 }
5e69f11e 5256
fa9a63c5
RM
5257 /* Initialize subexpression text positions to -1 to mark ones that no
5258 start_memory/stop_memory has been seen for. Also initialize the
5259 register information struct. */
01618498
SM
5260 for (reg = 1; reg < num_regs; reg++)
5261 regstart[reg] = regend[reg] = NULL;
99633e97 5262
fa9a63c5 5263 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5264 `string1' is null. */
fa9a63c5
RM
5265 if (size2 == 0 && string1 != NULL)
5266 {
5267 string2 = string1;
5268 size2 = size1;
5269 string1 = 0;
5270 size1 = 0;
5271 }
5272 end1 = string1 + size1;
5273 end2 = string2 + size2;
5274
5e69f11e 5275 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5276 `dend' is the end of the input string that `d' points within. `d'
5277 is advanced into the following input string whenever necessary, but
5278 this happens before fetching; therefore, at the beginning of the
5279 loop, `d' can be pointing at the end of a string, but it cannot
5280 equal `string2'. */
419d1c74 5281 if (pos >= size1)
fa9a63c5 5282 {
419d1c74
SM
5283 /* Only match within string2. */
5284 d = string2 + pos - size1;
5285 dend = end_match_2 = string2 + stop - size1;
5286 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5287 }
5288 else
5289 {
f1ad044f 5290 if (stop < size1)
419d1c74
SM
5291 {
5292 /* Only match within string1. */
5293 end_match_1 = string1 + stop;
5294 /* BEWARE!
5295 When we reach end_match_1, PREFETCH normally switches to string2.
5296 But in the present case, this means that just doing a PREFETCH
5297 makes us jump from `stop' to `gap' within the string.
5298 What we really want here is for the search to stop as
5299 soon as we hit end_match_1. That's why we set end_match_2
5300 to end_match_1 (since PREFETCH fails as soon as we hit
5301 end_match_2). */
5302 end_match_2 = end_match_1;
5303 }
5304 else
f1ad044f
SM
5305 { /* It's important to use this code when stop == size so that
5306 moving `d' from end1 to string2 will not prevent the d == dend
5307 check from catching the end of string. */
419d1c74
SM
5308 end_match_1 = end1;
5309 end_match_2 = string2 + stop - size1;
5310 }
5311 d = string1 + pos;
5312 dend = end_match_1;
fa9a63c5
RM
5313 }
5314
5315 DEBUG_PRINT1 ("The compiled pattern is: ");
5316 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5317 DEBUG_PRINT1 ("The string to match is: `");
5318 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5319 DEBUG_PRINT1 ("'\n");
5e69f11e 5320
7814e705 5321 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5322 function if the match is complete, or it drops through if the match
5323 fails at this starting point in the input data. */
5324 for (;;)
5325 {
505bde11 5326 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5327
5328 if (p == pend)
5329 { /* End of pattern means we might have succeeded. */
25fe55af 5330 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5331
fa9a63c5 5332 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5333 longest match, try backtracking. */
5334 if (d != end_match_2)
fa9a63c5
RM
5335 {
5336 /* 1 if this match ends in the same string (string1 or string2)
5337 as the best previous match. */
5e69f11e 5338 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5339 == FIRST_STRING_P (d));
fa9a63c5
RM
5340 /* 1 if this match is the best seen so far. */
5341 boolean best_match_p;
5342
5343 /* AIX compiler got confused when this was combined
7814e705 5344 with the previous declaration. */
fa9a63c5
RM
5345 if (same_str_p)
5346 best_match_p = d > match_end;
5347 else
99633e97 5348 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5349
25fe55af
RS
5350 DEBUG_PRINT1 ("backtracking.\n");
5351
5352 if (!FAIL_STACK_EMPTY ())
5353 { /* More failure points to try. */
5354
5355 /* If exceeds best match so far, save it. */
5356 if (!best_regs_set || best_match_p)
5357 {
5358 best_regs_set = true;
5359 match_end = d;
5360
5361 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5362
01618498 5363 for (reg = 1; reg < num_regs; reg++)
25fe55af 5364 {
01618498
SM
5365 best_regstart[reg] = regstart[reg];
5366 best_regend[reg] = regend[reg];
25fe55af
RS
5367 }
5368 }
5369 goto fail;
5370 }
5371
5372 /* If no failure points, don't restore garbage. And if
5373 last match is real best match, don't restore second
5374 best one. */
5375 else if (best_regs_set && !best_match_p)
5376 {
5377 restore_best_regs:
5378 /* Restore best match. It may happen that `dend ==
5379 end_match_1' while the restored d is in string2.
5380 For example, the pattern `x.*y.*z' against the
5381 strings `x-' and `y-z-', if the two strings are
7814e705 5382 not consecutive in memory. */
25fe55af
RS
5383 DEBUG_PRINT1 ("Restoring best registers.\n");
5384
5385 d = match_end;
5386 dend = ((d >= string1 && d <= end1)
5387 ? end_match_1 : end_match_2);
fa9a63c5 5388
01618498 5389 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5390 {
01618498
SM
5391 regstart[reg] = best_regstart[reg];
5392 regend[reg] = best_regend[reg];
fa9a63c5 5393 }
25fe55af
RS
5394 }
5395 } /* d != end_match_2 */
fa9a63c5
RM
5396
5397 succeed_label:
25fe55af 5398 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5399
25fe55af
RS
5400 /* If caller wants register contents data back, do it. */
5401 if (regs && !bufp->no_sub)
fa9a63c5 5402 {
25fe55af
RS
5403 /* Have the register data arrays been allocated? */
5404 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5405 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5406 extra element beyond `num_regs' for the `-1' marker
5407 GNU code uses. */
5408 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5409 regs->start = TALLOC (regs->num_regs, regoff_t);
5410 regs->end = TALLOC (regs->num_regs, regoff_t);
5411 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5412 {
5413 FREE_VARIABLES ();
5414 return -2;
5415 }
25fe55af
RS
5416 bufp->regs_allocated = REGS_REALLOCATE;
5417 }
5418 else if (bufp->regs_allocated == REGS_REALLOCATE)
5419 { /* Yes. If we need more elements than were already
5420 allocated, reallocate them. If we need fewer, just
5421 leave it alone. */
5422 if (regs->num_regs < num_regs + 1)
5423 {
5424 regs->num_regs = num_regs + 1;
5425 RETALLOC (regs->start, regs->num_regs, regoff_t);
5426 RETALLOC (regs->end, regs->num_regs, regoff_t);
5427 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5428 {
5429 FREE_VARIABLES ();
5430 return -2;
5431 }
25fe55af
RS
5432 }
5433 }
5434 else
fa9a63c5
RM
5435 {
5436 /* These braces fend off a "empty body in an else-statement"
7814e705 5437 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5438 assert (bufp->regs_allocated == REGS_FIXED);
5439 }
5440
25fe55af
RS
5441 /* Convert the pointer data in `regstart' and `regend' to
5442 indices. Register zero has to be set differently,
5443 since we haven't kept track of any info for it. */
5444 if (regs->num_regs > 0)
5445 {
5446 regs->start[0] = pos;
99633e97 5447 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5448 }
5e69f11e 5449
25fe55af
RS
5450 /* Go through the first `min (num_regs, regs->num_regs)'
5451 registers, since that is all we initialized. */
01618498 5452 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5453 {
01618498
SM
5454 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5455 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5456 else
5457 {
01618498
SM
5458 regs->start[reg]
5459 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5460 regs->end[reg]
5461 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5462 }
fa9a63c5 5463 }
5e69f11e 5464
25fe55af
RS
5465 /* If the regs structure we return has more elements than
5466 were in the pattern, set the extra elements to -1. If
5467 we (re)allocated the registers, this is the case,
5468 because we always allocate enough to have at least one
7814e705 5469 -1 at the end. */
01618498
SM
5470 for (reg = num_regs; reg < regs->num_regs; reg++)
5471 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5472 } /* regs && !bufp->no_sub */
5473
25fe55af
RS
5474 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5475 nfailure_points_pushed, nfailure_points_popped,
5476 nfailure_points_pushed - nfailure_points_popped);
5477 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5478
99633e97 5479 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5480
25fe55af 5481 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5482
25fe55af
RS
5483 FREE_VARIABLES ();
5484 return mcnt;
5485 }
fa9a63c5 5486
7814e705 5487 /* Otherwise match next pattern command. */
fa9a63c5
RM
5488 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5489 {
25fe55af
RS
5490 /* Ignore these. Used to ignore the n of succeed_n's which
5491 currently have n == 0. */
5492 case no_op:
5493 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5494 break;
fa9a63c5
RM
5495
5496 case succeed:
25fe55af 5497 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5498 goto succeed_label;
5499
7814e705 5500 /* Match the next n pattern characters exactly. The following
25fe55af 5501 byte in the pattern defines n, and the n bytes after that
7814e705 5502 are the characters to match. */
fa9a63c5
RM
5503 case exactn:
5504 mcnt = *p++;
25fe55af 5505 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5506
99633e97
SM
5507 /* Remember the start point to rollback upon failure. */
5508 dfail = d;
5509
6fdd04b0 5510#ifndef emacs
25fe55af
RS
5511 /* This is written out as an if-else so we don't waste time
5512 testing `translate' inside the loop. */
28703c16 5513 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5514 do
5515 {
5516 PREFETCH ();
5517 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5518 {
6fdd04b0
KH
5519 d = dfail;
5520 goto fail;
e934739e 5521 }
6fdd04b0
KH
5522 d++;
5523 }
5524 while (--mcnt);
fa9a63c5 5525 else
6fdd04b0
KH
5526 do
5527 {
5528 PREFETCH ();
5529 if (*d++ != *p++)
bf216479 5530 {
6fdd04b0
KH
5531 d = dfail;
5532 goto fail;
bf216479 5533 }
6fdd04b0
KH
5534 }
5535 while (--mcnt);
5536#else /* emacs */
5537 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5538 if (target_multibyte)
6fdd04b0
KH
5539 do
5540 {
5541 int pat_charlen, buf_charlen;
cf9c99bc 5542 int pat_ch, buf_ch;
e934739e 5543
6fdd04b0 5544 PREFETCH ();
cf9c99bc 5545 if (multibyte)
62a6e103 5546 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5547 else
5548 {
5549 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5550 pat_charlen = 1;
5551 }
62a6e103 5552 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5553
6fdd04b0 5554 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5555 {
6fdd04b0
KH
5556 d = dfail;
5557 goto fail;
e934739e 5558 }
bf216479 5559
6fdd04b0
KH
5560 p += pat_charlen;
5561 d += buf_charlen;
5562 mcnt -= pat_charlen;
5563 }
5564 while (mcnt > 0);
fa9a63c5 5565 else
6fdd04b0
KH
5566 do
5567 {
cf9c99bc
KH
5568 int pat_charlen, buf_charlen;
5569 int pat_ch, buf_ch;
bf216479 5570
6fdd04b0 5571 PREFETCH ();
cf9c99bc
KH
5572 if (multibyte)
5573 {
62a6e103 5574 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5575 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5576 }
5577 else
5578 {
5579 pat_ch = *p;
5580 pat_charlen = 1;
5581 }
5582 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5583 if (! CHAR_BYTE8_P (buf_ch))
5584 {
5585 buf_ch = TRANSLATE (buf_ch);
5586 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5587 if (buf_ch < 0)
5588 buf_ch = *d;
5589 }
0e2501ed
AS
5590 else
5591 buf_ch = *d;
cf9c99bc 5592 if (buf_ch != pat_ch)
6fdd04b0
KH
5593 {
5594 d = dfail;
5595 goto fail;
bf216479 5596 }
cf9c99bc
KH
5597 p += pat_charlen;
5598 d++;
6fdd04b0
KH
5599 }
5600 while (--mcnt);
5601#endif
25fe55af 5602 break;
fa9a63c5
RM
5603
5604
25fe55af 5605 /* Match any character except possibly a newline or a null. */
fa9a63c5 5606 case anychar:
e934739e
RS
5607 {
5608 int buf_charlen;
01618498 5609 re_wchar_t buf_ch;
fa9a63c5 5610
e934739e 5611 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5612
e934739e 5613 PREFETCH ();
62a6e103 5614 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5615 target_multibyte);
e934739e
RS
5616 buf_ch = TRANSLATE (buf_ch);
5617
5618 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5619 && buf_ch == '\n')
5620 || ((bufp->syntax & RE_DOT_NOT_NULL)
5621 && buf_ch == '\000'))
5622 goto fail;
5623
e934739e
RS
5624 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5625 d += buf_charlen;
5626 }
fa9a63c5
RM
5627 break;
5628
5629
5630 case charset:
5631 case charset_not:
5632 {
b18215fc 5633 register unsigned int c;
fa9a63c5 5634 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5635 int len;
5636
5637 /* Start of actual range_table, or end of bitmap if there is no
5638 range table. */
01618498 5639 re_char *range_table;
b18215fc 5640
96cc36cc 5641 /* Nonzero if there is a range table. */
b18215fc
RS
5642 int range_table_exists;
5643
96cc36cc
RS
5644 /* Number of ranges of range table. This is not included
5645 in the initial byte-length of the command. */
5646 int count = 0;
fa9a63c5 5647
f5020181
AS
5648 /* Whether matching against a unibyte character. */
5649 boolean unibyte_char = false;
5650
25fe55af 5651 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5652
b18215fc 5653 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5654
b18215fc 5655 if (range_table_exists)
96cc36cc
RS
5656 {
5657 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5658 EXTRACT_NUMBER_AND_INCR (count, range_table);
5659 }
b18215fc 5660
2d1675e4 5661 PREFETCH ();
62a6e103 5662 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5663 if (target_multibyte)
5664 {
5665 int c1;
b18215fc 5666
cf9c99bc
KH
5667 c = TRANSLATE (c);
5668 c1 = RE_CHAR_TO_UNIBYTE (c);
5669 if (c1 >= 0)
f5020181
AS
5670 {
5671 unibyte_char = true;
5672 c = c1;
5673 }
cf9c99bc
KH
5674 }
5675 else
5676 {
5677 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5678
5679 if (! CHAR_BYTE8_P (c1))
5680 {
5681 c1 = TRANSLATE (c1);
5682 c1 = RE_CHAR_TO_UNIBYTE (c1);
5683 if (c1 >= 0)
f5020181
AS
5684 {
5685 unibyte_char = true;
5686 c = c1;
5687 }
cf9c99bc 5688 }
0b8be006
AS
5689 else
5690 unibyte_char = true;
cf9c99bc
KH
5691 }
5692
f5020181 5693 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5694 { /* Lookup bitmap. */
b18215fc
RS
5695 /* Cast to `unsigned' instead of `unsigned char' in
5696 case the bit list is a full 32 bytes long. */
5697 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5698 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5699 not = !not;
b18215fc 5700 }
96cc36cc 5701#ifdef emacs
b18215fc 5702 else if (range_table_exists)
96cc36cc
RS
5703 {
5704 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5705
14473664
SM
5706 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5707 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5708 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5709 | (class_bits & BIT_SPACE && ISSPACE (c))
5710 | (class_bits & BIT_UPPER && ISUPPER (c))
5711 | (class_bits & BIT_WORD && ISWORD (c)))
5712 not = !not;
5713 else
5714 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5715 }
5716#endif /* emacs */
fa9a63c5 5717
96cc36cc
RS
5718 if (range_table_exists)
5719 p = CHARSET_RANGE_TABLE_END (range_table, count);
5720 else
5721 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5722
5723 if (!not) goto fail;
5e69f11e 5724
b18215fc 5725 d += len;
fa9a63c5
RM
5726 break;
5727 }
5728
5729
25fe55af 5730 /* The beginning of a group is represented by start_memory.
505bde11 5731 The argument is the register number. The text
25fe55af 5732 matched within the group is recorded (in the internal
7814e705 5733 registers data structure) under the register number. */
25fe55af 5734 case start_memory:
505bde11
SM
5735 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5736
5737 /* In case we need to undo this operation (via backtracking). */
5738 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5739
25fe55af 5740 regstart[*p] = d;
4bb91c68 5741 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5742 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5743
25fe55af 5744 /* Move past the register number and inner group count. */
505bde11 5745 p += 1;
25fe55af 5746 break;
fa9a63c5
RM
5747
5748
25fe55af 5749 /* The stop_memory opcode represents the end of a group. Its
505bde11 5750 argument is the same as start_memory's: the register number. */
fa9a63c5 5751 case stop_memory:
505bde11
SM
5752 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5753
5754 assert (!REG_UNSET (regstart[*p]));
5755 /* Strictly speaking, there should be code such as:
177c0ea7 5756
0b32bf0e 5757 assert (REG_UNSET (regend[*p]));
505bde11
SM
5758 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5759
5760 But the only info to be pushed is regend[*p] and it is known to
5761 be UNSET, so there really isn't anything to push.
5762 Not pushing anything, on the other hand deprives us from the
5763 guarantee that regend[*p] is UNSET since undoing this operation
5764 will not reset its value properly. This is not important since
5765 the value will only be read on the next start_memory or at
5766 the very end and both events can only happen if this stop_memory
5767 is *not* undone. */
fa9a63c5 5768
25fe55af 5769 regend[*p] = d;
fa9a63c5
RM
5770 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5771
25fe55af 5772 /* Move past the register number and the inner group count. */
505bde11 5773 p += 1;
25fe55af 5774 break;
fa9a63c5
RM
5775
5776
5777 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5778 followed by the numeric value of <digit> as the register number. */
5779 case duplicate:
fa9a63c5 5780 {
66f0296e 5781 register re_char *d2, *dend2;
7814e705 5782 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5783 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5784
7814e705 5785 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5786 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5787 goto fail;
5e69f11e 5788
7814e705 5789 /* Where in input to try to start matching. */
25fe55af 5790 d2 = regstart[regno];
5e69f11e 5791
99633e97
SM
5792 /* Remember the start point to rollback upon failure. */
5793 dfail = d;
5794
25fe55af
RS
5795 /* Where to stop matching; if both the place to start and
5796 the place to stop matching are in the same string, then
5797 set to the place to stop, otherwise, for now have to use
5798 the end of the first string. */
fa9a63c5 5799
25fe55af 5800 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5801 == FIRST_STRING_P (regend[regno]))
5802 ? regend[regno] : end_match_1);
5803 for (;;)
5804 {
5805 /* If necessary, advance to next segment in register
25fe55af 5806 contents. */
fa9a63c5
RM
5807 while (d2 == dend2)
5808 {
5809 if (dend2 == end_match_2) break;
5810 if (dend2 == regend[regno]) break;
5811
25fe55af
RS
5812 /* End of string1 => advance to string2. */
5813 d2 = string2;
5814 dend2 = regend[regno];
fa9a63c5
RM
5815 }
5816 /* At end of register contents => success */
5817 if (d2 == dend2) break;
5818
5819 /* If necessary, advance to next segment in data. */
5820 PREFETCH ();
5821
5822 /* How many characters left in this segment to match. */
5823 mcnt = dend - d;
5e69f11e 5824
fa9a63c5 5825 /* Want how many consecutive characters we can match in
25fe55af
RS
5826 one shot, so, if necessary, adjust the count. */
5827 if (mcnt > dend2 - d2)
fa9a63c5 5828 mcnt = dend2 - d2;
5e69f11e 5829
fa9a63c5 5830 /* Compare that many; failure if mismatch, else move
25fe55af 5831 past them. */
28703c16 5832 if (RE_TRANSLATE_P (translate)
02cb78b5 5833 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5834 : memcmp (d, d2, mcnt))
99633e97
SM
5835 {
5836 d = dfail;
5837 goto fail;
5838 }
fa9a63c5 5839 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5840 }
5841 }
5842 break;
5843
5844
25fe55af 5845 /* begline matches the empty string at the beginning of the string
c0f9ea08 5846 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5847 case begline:
25fe55af 5848 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5849
25fe55af
RS
5850 if (AT_STRINGS_BEG (d))
5851 {
5852 if (!bufp->not_bol) break;
5853 }
419d1c74 5854 else
25fe55af 5855 {
bf216479 5856 unsigned c;
419d1c74 5857 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5858 if (c == '\n')
419d1c74 5859 break;
25fe55af
RS
5860 }
5861 /* In all other cases, we fail. */
5862 goto fail;
fa9a63c5
RM
5863
5864
25fe55af 5865 /* endline is the dual of begline. */
fa9a63c5 5866 case endline:
25fe55af 5867 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5868
25fe55af
RS
5869 if (AT_STRINGS_END (d))
5870 {
5871 if (!bufp->not_eol) break;
5872 }
f1ad044f 5873 else
25fe55af 5874 {
f1ad044f 5875 PREFETCH_NOLIMIT ();
c0f9ea08 5876 if (*d == '\n')
f1ad044f 5877 break;
25fe55af
RS
5878 }
5879 goto fail;
fa9a63c5
RM
5880
5881
5882 /* Match at the very beginning of the data. */
25fe55af
RS
5883 case begbuf:
5884 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5885 if (AT_STRINGS_BEG (d))
5886 break;
5887 goto fail;
fa9a63c5
RM
5888
5889
5890 /* Match at the very end of the data. */
25fe55af
RS
5891 case endbuf:
5892 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5893 if (AT_STRINGS_END (d))
5894 break;
25fe55af 5895 goto fail;
5e69f11e 5896
5e69f11e 5897
25fe55af
RS
5898 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5899 pushes NULL as the value for the string on the stack. Then
505bde11 5900 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5901 string, instead of restoring it. To see why, consider
7814e705 5902 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5903 then the . fails against the \n. But the next thing we want
5904 to do is match the \n against the \n; if we restored the
5905 string value, we would be back at the foo.
5906
5907 Because this is used only in specific cases, we don't need to
5908 check all the things that `on_failure_jump' does, to make
5909 sure the right things get saved on the stack. Hence we don't
5910 share its code. The only reason to push anything on the
5911 stack at all is that otherwise we would have to change
5912 `anychar's code to do something besides goto fail in this
5913 case; that seems worse than this. */
5914 case on_failure_keep_string_jump:
505bde11
SM
5915 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5916 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5917 mcnt, p + mcnt);
fa9a63c5 5918
505bde11
SM
5919 PUSH_FAILURE_POINT (p - 3, NULL);
5920 break;
5921
0683b6fa
SM
5922 /* A nasty loop is introduced by the non-greedy *? and +?.
5923 With such loops, the stack only ever contains one failure point
5924 at a time, so that a plain on_failure_jump_loop kind of
5925 cycle detection cannot work. Worse yet, such a detection
5926 can not only fail to detect a cycle, but it can also wrongly
5927 detect a cycle (between different instantiations of the same
6df42991 5928 loop).
0683b6fa
SM
5929 So the method used for those nasty loops is a little different:
5930 We use a special cycle-detection-stack-frame which is pushed
5931 when the on_failure_jump_nastyloop failure-point is *popped*.
5932 This special frame thus marks the beginning of one iteration
5933 through the loop and we can hence easily check right here
5934 whether something matched between the beginning and the end of
5935 the loop. */
5936 case on_failure_jump_nastyloop:
5937 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5938 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5939 mcnt, p + mcnt);
5940
5941 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5942 {
5943 int cycle = 0;
5944 CHECK_INFINITE_LOOP (p - 4, d);
5945 if (!cycle)
5946 /* If there's a cycle, just continue without pushing
5947 this failure point. The failure point is the "try again"
5948 option, which shouldn't be tried.
5949 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5950 PUSH_FAILURE_POINT (p - 3, d);
5951 }
0683b6fa
SM
5952 break;
5953
4e8a9132
SM
5954 /* Simple loop detecting on_failure_jump: just check on the
5955 failure stack if the same spot was already hit earlier. */
505bde11
SM
5956 case on_failure_jump_loop:
5957 on_failure:
5958 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5959 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5960 mcnt, p + mcnt);
6df42991
SM
5961 {
5962 int cycle = 0;
5963 CHECK_INFINITE_LOOP (p - 3, d);
5964 if (cycle)
5965 /* If there's a cycle, get out of the loop, as if the matching
5966 had failed. We used to just `goto fail' here, but that was
5967 aborting the search a bit too early: we want to keep the
5968 empty-loop-match and keep matching after the loop.
5969 We want (x?)*y\1z to match both xxyz and xxyxz. */
5970 p += mcnt;
5971 else
5972 PUSH_FAILURE_POINT (p - 3, d);
5973 }
25fe55af 5974 break;
fa9a63c5
RM
5975
5976
5977 /* Uses of on_failure_jump:
5e69f11e 5978
25fe55af
RS
5979 Each alternative starts with an on_failure_jump that points
5980 to the beginning of the next alternative. Each alternative
5981 except the last ends with a jump that in effect jumps past
5982 the rest of the alternatives. (They really jump to the
5983 ending jump of the following alternative, because tensioning
5984 these jumps is a hassle.)
fa9a63c5 5985
25fe55af
RS
5986 Repeats start with an on_failure_jump that points past both
5987 the repetition text and either the following jump or
5988 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5989 case on_failure_jump:
25fe55af 5990 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5991 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5992 mcnt, p + mcnt);
25fe55af 5993
505bde11 5994 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5995 break;
5996
4e8a9132 5997 /* This operation is used for greedy *.
505bde11
SM
5998 Compare the beginning of the repeat with what in the
5999 pattern follows its end. If we can establish that there
6000 is nothing that they would both match, i.e., that we
6001 would have to backtrack because of (as in, e.g., `a*a')
6002 then we can use a non-backtracking loop based on
4e8a9132 6003 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 6004 case on_failure_jump_smart:
25fe55af 6005 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
6006 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
6007 mcnt, p + mcnt);
25fe55af 6008 {
01618498 6009 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
6010 /* Here, we discard `const', making re_match non-reentrant. */
6011 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
6012 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 6013
505bde11
SM
6014 p -= 3; /* Reset so that we will re-execute the
6015 instruction once it's been changed. */
fa9a63c5 6016
4e8a9132
SM
6017 EXTRACT_NUMBER (mcnt, p2 - 2);
6018
6019 /* Ensure this is indeed the trivial kind of loop
6020 we are expecting. */
6021 assert (skip_one_char (p1) == p2 - 3);
6022 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 6023 DEBUG_STATEMENT (debug += 2);
505bde11 6024 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 6025 {
505bde11 6026 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 6027 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 6028 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 6029 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 6030 }
505bde11 6031 else
fa9a63c5 6032 {
505bde11
SM
6033 /* Default to a safe `on_failure_jump' loop. */
6034 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 6035 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 6036 }
99633e97 6037 DEBUG_STATEMENT (debug -= 2);
25fe55af 6038 }
505bde11 6039 break;
25fe55af
RS
6040
6041 /* Unconditionally jump (without popping any failure points). */
6042 case jump:
fa9a63c5 6043 unconditional_jump:
5b370c2b 6044 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6045 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 6046 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 6047 p += mcnt; /* Do the jump. */
505bde11 6048 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
6049 break;
6050
6051
25fe55af
RS
6052 /* Have to succeed matching what follows at least n times.
6053 After that, handle like `on_failure_jump'. */
6054 case succeed_n:
01618498 6055 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
6056 EXTRACT_NUMBER (mcnt, p + 2);
6057 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 6058
dc1e502d
SM
6059 /* Originally, mcnt is how many times we HAVE to succeed. */
6060 if (mcnt != 0)
25fe55af 6061 {
6dcf2d0e
SM
6062 /* Here, we discard `const', making re_match non-reentrant. */
6063 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 6064 mcnt--;
01618498
SM
6065 p += 4;
6066 PUSH_NUMBER (p2, mcnt);
25fe55af 6067 }
dc1e502d
SM
6068 else
6069 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
6070 goto on_failure;
25fe55af
RS
6071 break;
6072
6073 case jump_n:
01618498 6074 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
6075 EXTRACT_NUMBER (mcnt, p + 2);
6076 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
6077
6078 /* Originally, this is how many times we CAN jump. */
dc1e502d 6079 if (mcnt != 0)
25fe55af 6080 {
6dcf2d0e
SM
6081 /* Here, we discard `const', making re_match non-reentrant. */
6082 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 6083 mcnt--;
01618498 6084 PUSH_NUMBER (p2, mcnt);
dc1e502d 6085 goto unconditional_jump;
25fe55af
RS
6086 }
6087 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
6088 else
6089 p += 4;
25fe55af 6090 break;
5e69f11e 6091
fa9a63c5
RM
6092 case set_number_at:
6093 {
01618498 6094 unsigned char *p2; /* Location of the counter. */
25fe55af 6095 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 6096
25fe55af 6097 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
6098 /* Here, we discard `const', making re_match non-reentrant. */
6099 p2 = (unsigned char*) p + mcnt;
01618498 6100 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 6101 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
6102 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6103 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
6104 break;
6105 }
9121ca40
KH
6106
6107 case wordbound:
66f0296e
SM
6108 case notwordbound:
6109 not = (re_opcode_t) *(p - 1) == notwordbound;
6110 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 6111
99633e97 6112 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 6113
b18215fc 6114 /* Case 1: D is at the beginning or the end of string. */
9121ca40 6115 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 6116 not = !not;
b18215fc
RS
6117 else
6118 {
6119 /* C1 is the character before D, S1 is the syntax of C1, C2
6120 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6121 re_wchar_t c1, c2;
6122 int s1, s2;
bf216479 6123 int dummy;
b18215fc 6124#ifdef emacs
2d1675e4
SM
6125 int offset = PTR_TO_OFFSET (d - 1);
6126 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 6127 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6128#endif
66f0296e 6129 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6130 s1 = SYNTAX (c1);
6131#ifdef emacs
5d967c7a 6132 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 6133#endif
f1ad044f 6134 PREFETCH_NOLIMIT ();
6fdd04b0 6135 GET_CHAR_AFTER (c2, d, dummy);
b18215fc
RS
6136 s2 = SYNTAX (c2);
6137
6138 if (/* Case 2: Only one of S1 and S2 is Sword. */
6139 ((s1 == Sword) != (s2 == Sword))
6140 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 6141 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 6142 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
6143 not = !not;
6144 }
6145 if (not)
9121ca40 6146 break;
b18215fc 6147 else
9121ca40 6148 goto fail;
fa9a63c5
RM
6149
6150 case wordbeg:
25fe55af 6151 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6152
b18215fc
RS
6153 /* We FAIL in one of the following cases: */
6154
7814e705 6155 /* Case 1: D is at the end of string. */
b18215fc 6156 if (AT_STRINGS_END (d))
99633e97 6157 goto fail;
b18215fc
RS
6158 else
6159 {
6160 /* C1 is the character before D, S1 is the syntax of C1, C2
6161 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6162 re_wchar_t c1, c2;
6163 int s1, s2;
bf216479 6164 int dummy;
fa9a63c5 6165#ifdef emacs
2d1675e4
SM
6166 int offset = PTR_TO_OFFSET (d);
6167 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6168 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6169#endif
99633e97 6170 PREFETCH ();
6fdd04b0 6171 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6172 s2 = SYNTAX (c2);
177c0ea7 6173
b18215fc
RS
6174 /* Case 2: S2 is not Sword. */
6175 if (s2 != Sword)
6176 goto fail;
6177
6178 /* Case 3: D is not at the beginning of string ... */
6179 if (!AT_STRINGS_BEG (d))
6180 {
6181 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6182#ifdef emacs
5d967c7a 6183 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6184#endif
b18215fc
RS
6185 s1 = SYNTAX (c1);
6186
6187 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6188 returns 0. */
b18215fc
RS
6189 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6190 goto fail;
6191 }
6192 }
e318085a
RS
6193 break;
6194
b18215fc 6195 case wordend:
25fe55af 6196 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6197
6198 /* We FAIL in one of the following cases: */
6199
6200 /* Case 1: D is at the beginning of string. */
6201 if (AT_STRINGS_BEG (d))
e318085a 6202 goto fail;
b18215fc
RS
6203 else
6204 {
6205 /* C1 is the character before D, S1 is the syntax of C1, C2
6206 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6207 re_wchar_t c1, c2;
6208 int s1, s2;
bf216479 6209 int dummy;
5d967c7a 6210#ifdef emacs
2d1675e4
SM
6211 int offset = PTR_TO_OFFSET (d) - 1;
6212 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6213 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6214#endif
99633e97 6215 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6216 s1 = SYNTAX (c1);
6217
6218 /* Case 2: S1 is not Sword. */
6219 if (s1 != Sword)
6220 goto fail;
6221
6222 /* Case 3: D is not at the end of string ... */
6223 if (!AT_STRINGS_END (d))
6224 {
f1ad044f 6225 PREFETCH_NOLIMIT ();
6fdd04b0 6226 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6227#ifdef emacs
6228 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6229#endif
b18215fc
RS
6230 s2 = SYNTAX (c2);
6231
6232 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6233 returns 0. */
b18215fc 6234 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6235 goto fail;
b18215fc
RS
6236 }
6237 }
e318085a
RS
6238 break;
6239
669fa600
SM
6240 case symbeg:
6241 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6242
6243 /* We FAIL in one of the following cases: */
6244
7814e705 6245 /* Case 1: D is at the end of string. */
669fa600
SM
6246 if (AT_STRINGS_END (d))
6247 goto fail;
6248 else
6249 {
6250 /* C1 is the character before D, S1 is the syntax of C1, C2
6251 is the character at D, and S2 is the syntax of C2. */
6252 re_wchar_t c1, c2;
6253 int s1, s2;
6254#ifdef emacs
6255 int offset = PTR_TO_OFFSET (d);
6256 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6257 UPDATE_SYNTAX_TABLE (charpos);
6258#endif
6259 PREFETCH ();
62a6e103 6260 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6261 s2 = SYNTAX (c2);
7814e705 6262
669fa600
SM
6263 /* Case 2: S2 is neither Sword nor Ssymbol. */
6264 if (s2 != Sword && s2 != Ssymbol)
6265 goto fail;
6266
6267 /* Case 3: D is not at the beginning of string ... */
6268 if (!AT_STRINGS_BEG (d))
6269 {
6270 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6271#ifdef emacs
6272 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6273#endif
6274 s1 = SYNTAX (c1);
6275
6276 /* ... and S1 is Sword or Ssymbol. */
6277 if (s1 == Sword || s1 == Ssymbol)
6278 goto fail;
6279 }
6280 }
6281 break;
6282
6283 case symend:
6284 DEBUG_PRINT1 ("EXECUTING symend.\n");
6285
6286 /* We FAIL in one of the following cases: */
6287
6288 /* Case 1: D is at the beginning of string. */
6289 if (AT_STRINGS_BEG (d))
6290 goto fail;
6291 else
6292 {
6293 /* C1 is the character before D, S1 is the syntax of C1, C2
6294 is the character at D, and S2 is the syntax of C2. */
6295 re_wchar_t c1, c2;
6296 int s1, s2;
6297#ifdef emacs
6298 int offset = PTR_TO_OFFSET (d) - 1;
6299 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6300 UPDATE_SYNTAX_TABLE (charpos);
6301#endif
6302 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6303 s1 = SYNTAX (c1);
6304
6305 /* Case 2: S1 is neither Ssymbol nor Sword. */
6306 if (s1 != Sword && s1 != Ssymbol)
6307 goto fail;
6308
6309 /* Case 3: D is not at the end of string ... */
6310 if (!AT_STRINGS_END (d))
6311 {
6312 PREFETCH_NOLIMIT ();
62a6e103 6313 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6314#ifdef emacs
134579f2 6315 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6316#endif
6317 s2 = SYNTAX (c2);
6318
6319 /* ... and S2 is Sword or Ssymbol. */
6320 if (s2 == Sword || s2 == Ssymbol)
6321 goto fail;
b18215fc
RS
6322 }
6323 }
e318085a
RS
6324 break;
6325
fa9a63c5 6326 case syntaxspec:
1fb352e0
SM
6327 case notsyntaxspec:
6328 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6329 mcnt = *p++;
1fb352e0 6330 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6331 PREFETCH ();
b18215fc
RS
6332#ifdef emacs
6333 {
2d1675e4
SM
6334 int offset = PTR_TO_OFFSET (d);
6335 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6336 UPDATE_SYNTAX_TABLE (pos1);
6337 }
25fe55af 6338#endif
b18215fc 6339 {
01618498
SM
6340 int len;
6341 re_wchar_t c;
b18215fc 6342
6fdd04b0 6343 GET_CHAR_AFTER (c, d, len);
990b2375 6344 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6345 goto fail;
b18215fc
RS
6346 d += len;
6347 }
fa9a63c5
RM
6348 break;
6349
b18215fc 6350#ifdef emacs
1fb352e0
SM
6351 case before_dot:
6352 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6353 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6354 goto fail;
b18215fc
RS
6355 break;
6356
1fb352e0
SM
6357 case at_dot:
6358 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6359 if (PTR_BYTE_POS (d) != PT_BYTE)
6360 goto fail;
6361 break;
b18215fc 6362
1fb352e0
SM
6363 case after_dot:
6364 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6365 if (PTR_BYTE_POS (d) <= PT_BYTE)
6366 goto fail;
e318085a 6367 break;
fa9a63c5 6368
1fb352e0 6369 case categoryspec:
b18215fc 6370 case notcategoryspec:
1fb352e0 6371 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6372 mcnt = *p++;
1fb352e0 6373 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6374 PREFETCH ();
6375 {
01618498
SM
6376 int len;
6377 re_wchar_t c;
6378
6fdd04b0 6379 GET_CHAR_AFTER (c, d, len);
1fb352e0 6380 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6381 goto fail;
6382 d += len;
6383 }
fa9a63c5 6384 break;
5e69f11e 6385
1fb352e0 6386#endif /* emacs */
5e69f11e 6387
0b32bf0e
SM
6388 default:
6389 abort ();
fa9a63c5 6390 }
b18215fc 6391 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6392
6393
6394 /* We goto here if a matching operation fails. */
6395 fail:
5b370c2b 6396 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6397 if (!FAIL_STACK_EMPTY ())
505bde11 6398 {
01618498 6399 re_char *str, *pat;
505bde11 6400 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6401 DEBUG_PRINT1 ("\nFAIL:\n");
6402 POP_FAILURE_POINT (str, pat);
505bde11
SM
6403 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6404 {
6405 case on_failure_keep_string_jump:
6406 assert (str == NULL);
6407 goto continue_failure_jump;
6408
0683b6fa
SM
6409 case on_failure_jump_nastyloop:
6410 assert ((re_opcode_t)pat[-2] == no_op);
6411 PUSH_FAILURE_POINT (pat - 2, str);
6412 /* Fallthrough */
6413
505bde11
SM
6414 case on_failure_jump_loop:
6415 case on_failure_jump:
6416 case succeed_n:
6417 d = str;
6418 continue_failure_jump:
6419 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6420 p = pat + mcnt;
6421 break;
b18215fc 6422
0683b6fa
SM
6423 case no_op:
6424 /* A special frame used for nastyloops. */
6425 goto fail;
6426
505bde11
SM
6427 default:
6428 abort();
6429 }
fa9a63c5 6430
505bde11 6431 assert (p >= bufp->buffer && p <= pend);
b18215fc 6432
0b32bf0e 6433 if (d >= string1 && d <= end1)
fa9a63c5 6434 dend = end_match_1;
0b32bf0e 6435 }
fa9a63c5 6436 else
0b32bf0e 6437 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6438 } /* for (;;) */
6439
6440 if (best_regs_set)
6441 goto restore_best_regs;
6442
6443 FREE_VARIABLES ();
6444
b18215fc 6445 return -1; /* Failure to match. */
fa9a63c5
RM
6446} /* re_match_2 */
6447\f
6448/* Subroutine definitions for re_match_2. */
6449
fa9a63c5
RM
6450/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6451 bytes; nonzero otherwise. */
5e69f11e 6452
fa9a63c5 6453static int
02cb78b5 6454bcmp_translate (s1, s2, len, translate, target_multibyte)
2d1675e4 6455 re_char *s1, *s2;
fa9a63c5 6456 register int len;
6676cb1c 6457 RE_TRANSLATE_TYPE translate;
02cb78b5 6458 const int target_multibyte;
fa9a63c5 6459{
2d1675e4
SM
6460 register re_char *p1 = s1, *p2 = s2;
6461 re_char *p1_end = s1 + len;
6462 re_char *p2_end = s2 + len;
e934739e 6463
4bb91c68
SM
6464 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6465 different lengths, but relying on a single `len' would break this. -sm */
6466 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6467 {
e934739e 6468 int p1_charlen, p2_charlen;
01618498 6469 re_wchar_t p1_ch, p2_ch;
e934739e 6470
6fdd04b0
KH
6471 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6472 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6473
6474 if (RE_TRANSLATE (translate, p1_ch)
6475 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6476 return 1;
e934739e
RS
6477
6478 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6479 }
e934739e
RS
6480
6481 if (p1 != p1_end || p2 != p2_end)
6482 return 1;
6483
fa9a63c5
RM
6484 return 0;
6485}
6486\f
6487/* Entry points for GNU code. */
6488
6489/* re_compile_pattern is the GNU regular expression compiler: it
6490 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6491 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6492
fa9a63c5
RM
6493 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6494 are set in BUFP on entry.
5e69f11e 6495
b18215fc 6496 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6497
6498const char *
6499re_compile_pattern (pattern, length, bufp)
6500 const char *pattern;
0b32bf0e 6501 size_t length;
fa9a63c5
RM
6502 struct re_pattern_buffer *bufp;
6503{
6504 reg_errcode_t ret;
5e69f11e 6505
fa9a63c5
RM
6506 /* GNU code is written to assume at least RE_NREGS registers will be set
6507 (and at least one extra will be -1). */
6508 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6509
fa9a63c5
RM
6510 /* And GNU code determines whether or not to get register information
6511 by passing null for the REGS argument to re_match, etc., not by
6512 setting no_sub. */
6513 bufp->no_sub = 0;
5e69f11e 6514
4bb91c68 6515 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6516
6517 if (!ret)
6518 return NULL;
6519 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6520}
c0f9ea08 6521WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6522\f
b18215fc
RS
6523/* Entry points compatible with 4.2 BSD regex library. We don't define
6524 them unless specifically requested. */
fa9a63c5 6525
0b32bf0e 6526#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6527
6528/* BSD has one and only one pattern buffer. */
6529static struct re_pattern_buffer re_comp_buf;
6530
6531char *
0b32bf0e 6532# ifdef _LIBC
48afdd44
RM
6533/* Make these definitions weak in libc, so POSIX programs can redefine
6534 these names if they don't use our functions, and still use
6535 regcomp/regexec below without link errors. */
6536weak_function
0b32bf0e 6537# endif
fa9a63c5
RM
6538re_comp (s)
6539 const char *s;
6540{
6541 reg_errcode_t ret;
5e69f11e 6542
fa9a63c5
RM
6543 if (!s)
6544 {
6545 if (!re_comp_buf.buffer)
0b32bf0e 6546 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6547 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6548 return 0;
6549 }
6550
6551 if (!re_comp_buf.buffer)
6552 {
6553 re_comp_buf.buffer = (unsigned char *) malloc (200);
6554 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6555 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6556 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6557 re_comp_buf.allocated = 200;
6558
6559 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6560 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6561 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6562 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6563 }
6564
6565 /* Since `re_exec' always passes NULL for the `regs' argument, we
6566 don't need to initialize the pattern buffer fields which affect it. */
6567
fa9a63c5 6568 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6569
fa9a63c5
RM
6570 if (!ret)
6571 return NULL;
6572
6573 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6574 return (char *) gettext (re_error_msgid[(int) ret]);
6575}
6576
6577
6578int
0b32bf0e 6579# ifdef _LIBC
48afdd44 6580weak_function
0b32bf0e 6581# endif
fa9a63c5
RM
6582re_exec (s)
6583 const char *s;
6584{
6585 const int len = strlen (s);
6586 return
6587 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6588}
6589#endif /* _REGEX_RE_COMP */
6590\f
6591/* POSIX.2 functions. Don't define these for Emacs. */
6592
6593#ifndef emacs
6594
6595/* regcomp takes a regular expression as a string and compiles it.
6596
b18215fc 6597 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6598 since POSIX says we shouldn't. Thus, we set
6599
6600 `buffer' to the compiled pattern;
6601 `used' to the length of the compiled pattern;
6602 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6603 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6604 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6605 `fastmap' to an allocated space for the fastmap;
6606 `fastmap_accurate' to zero;
fa9a63c5
RM
6607 `re_nsub' to the number of subexpressions in PATTERN.
6608
6609 PATTERN is the address of the pattern string.
6610
6611 CFLAGS is a series of bits which affect compilation.
6612
6613 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6614 use POSIX basic syntax.
6615
6616 If REG_NEWLINE is set, then . and [^...] don't match newline.
6617 Also, regexec will try a match beginning after every newline.
6618
6619 If REG_ICASE is set, then we considers upper- and lowercase
6620 versions of letters to be equivalent when matching.
6621
6622 If REG_NOSUB is set, then when PREG is passed to regexec, that
6623 routine will report only success or failure, and nothing about the
6624 registers.
6625
b18215fc 6626 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6627 the return codes and their meanings.) */
6628
6629int
6630regcomp (preg, pattern, cflags)
ada30c0e
SM
6631 regex_t *__restrict preg;
6632 const char *__restrict pattern;
fa9a63c5
RM
6633 int cflags;
6634{
6635 reg_errcode_t ret;
4bb91c68 6636 reg_syntax_t syntax
fa9a63c5
RM
6637 = (cflags & REG_EXTENDED) ?
6638 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6639
6640 /* regex_compile will allocate the space for the compiled pattern. */
6641 preg->buffer = 0;
6642 preg->allocated = 0;
6643 preg->used = 0;
5e69f11e 6644
c0f9ea08
SM
6645 /* Try to allocate space for the fastmap. */
6646 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6647
fa9a63c5
RM
6648 if (cflags & REG_ICASE)
6649 {
6650 unsigned i;
5e69f11e 6651
6676cb1c
RS
6652 preg->translate
6653 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6654 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6655 if (preg->translate == NULL)
0b32bf0e 6656 return (int) REG_ESPACE;
fa9a63c5
RM
6657
6658 /* Map uppercase characters to corresponding lowercase ones. */
6659 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6660 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6661 }
6662 else
6663 preg->translate = NULL;
6664
6665 /* If REG_NEWLINE is set, newlines are treated differently. */
6666 if (cflags & REG_NEWLINE)
6667 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6668 syntax &= ~RE_DOT_NEWLINE;
6669 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6670 }
6671 else
c0f9ea08 6672 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6673
6674 preg->no_sub = !!(cflags & REG_NOSUB);
6675
5e69f11e 6676 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6677 can use strlen here in compiling the pattern. */
4bb91c68 6678 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6679
fa9a63c5
RM
6680 /* POSIX doesn't distinguish between an unmatched open-group and an
6681 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6682 if (ret == REG_ERPAREN)
6683 ret = REG_EPAREN;
6684
6685 if (ret == REG_NOERROR && preg->fastmap)
6686 { /* Compute the fastmap now, since regexec cannot modify the pattern
6687 buffer. */
6688 re_compile_fastmap (preg);
6689 if (preg->can_be_null)
6690 { /* The fastmap can't be used anyway. */
6691 free (preg->fastmap);
6692 preg->fastmap = NULL;
6693 }
6694 }
fa9a63c5
RM
6695 return (int) ret;
6696}
c0f9ea08 6697WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6698
6699
6700/* regexec searches for a given pattern, specified by PREG, in the
6701 string STRING.
5e69f11e 6702
fa9a63c5 6703 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6704 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6705 least NMATCH elements, and we set them to the offsets of the
6706 corresponding matched substrings.
5e69f11e 6707
fa9a63c5
RM
6708 EFLAGS specifies `execution flags' which affect matching: if
6709 REG_NOTBOL is set, then ^ does not match at the beginning of the
6710 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6711
fa9a63c5
RM
6712 We return 0 if we find a match and REG_NOMATCH if not. */
6713
6714int
6715regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6716 const regex_t *__restrict preg;
6717 const char *__restrict string;
5e69f11e 6718 size_t nmatch;
9f2dbe01 6719 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6720 int eflags;
6721{
6722 int ret;
6723 struct re_registers regs;
6724 regex_t private_preg;
6725 int len = strlen (string);
c0f9ea08 6726 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6727
6728 private_preg = *preg;
5e69f11e 6729
fa9a63c5
RM
6730 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6731 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6732
fa9a63c5
RM
6733 /* The user has told us exactly how many registers to return
6734 information about, via `nmatch'. We have to pass that on to the
b18215fc 6735 matching routines. */
fa9a63c5 6736 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6737
fa9a63c5
RM
6738 if (want_reg_info)
6739 {
6740 regs.num_regs = nmatch;
4bb91c68
SM
6741 regs.start = TALLOC (nmatch * 2, regoff_t);
6742 if (regs.start == NULL)
0b32bf0e 6743 return (int) REG_NOMATCH;
4bb91c68 6744 regs.end = regs.start + nmatch;
fa9a63c5
RM
6745 }
6746
c0f9ea08
SM
6747 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6748 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6749 was a little bit longer but still only matching the real part.
6750 This works because the `endline' will check for a '\n' and will find a
6751 '\0', correctly deciding that this is not the end of a line.
6752 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6753 a convenient '\0' there. For all we know, the string could be preceded
6754 by '\n' which would throw things off. */
6755
fa9a63c5
RM
6756 /* Perform the searching operation. */
6757 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6758 /* start: */ 0, /* range: */ len,
6759 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6760
fa9a63c5
RM
6761 /* Copy the register information to the POSIX structure. */
6762 if (want_reg_info)
6763 {
6764 if (ret >= 0)
0b32bf0e
SM
6765 {
6766 unsigned r;
fa9a63c5 6767
0b32bf0e
SM
6768 for (r = 0; r < nmatch; r++)
6769 {
6770 pmatch[r].rm_so = regs.start[r];
6771 pmatch[r].rm_eo = regs.end[r];
6772 }
6773 }
fa9a63c5 6774
b18215fc 6775 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6776 free (regs.start);
fa9a63c5
RM
6777 }
6778
6779 /* We want zero return to mean success, unlike `re_search'. */
6780 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6781}
c0f9ea08 6782WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6783
6784
ec869672
JR
6785/* Returns a message corresponding to an error code, ERR_CODE, returned
6786 from either regcomp or regexec. We don't use PREG here.
6787
6788 ERR_CODE was previously called ERRCODE, but that name causes an
6789 error with msvc8 compiler. */
fa9a63c5
RM
6790
6791size_t
ec869672
JR
6792regerror (err_code, preg, errbuf, errbuf_size)
6793 int err_code;
fa9a63c5
RM
6794 const regex_t *preg;
6795 char *errbuf;
6796 size_t errbuf_size;
6797{
6798 const char *msg;
6799 size_t msg_size;
6800
ec869672
JR
6801 if (err_code < 0
6802 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6803 /* Only error codes returned by the rest of the code should be passed
b18215fc 6804 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6805 code generates an invalid error code, then the program has a bug.
6806 Dump core so we can fix it. */
6807 abort ();
6808
ec869672 6809 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6810
6811 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6812
fa9a63c5
RM
6813 if (errbuf_size != 0)
6814 {
6815 if (msg_size > errbuf_size)
0b32bf0e
SM
6816 {
6817 strncpy (errbuf, msg, errbuf_size - 1);
6818 errbuf[errbuf_size - 1] = 0;
6819 }
fa9a63c5 6820 else
0b32bf0e 6821 strcpy (errbuf, msg);
fa9a63c5
RM
6822 }
6823
6824 return msg_size;
6825}
c0f9ea08 6826WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6827
6828
6829/* Free dynamically allocated space used by PREG. */
6830
6831void
6832regfree (preg)
6833 regex_t *preg;
6834{
c2cd06e6 6835 free (preg->buffer);
fa9a63c5 6836 preg->buffer = NULL;
5e69f11e 6837
fa9a63c5
RM
6838 preg->allocated = 0;
6839 preg->used = 0;
6840
c2cd06e6 6841 free (preg->fastmap);
fa9a63c5
RM
6842 preg->fastmap = NULL;
6843 preg->fastmap_accurate = 0;
6844
c2cd06e6 6845 free (preg->translate);
fa9a63c5
RM
6846 preg->translate = NULL;
6847}
c0f9ea08 6848WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6849
6850#endif /* not emacs */
839966f3
KH
6851
6852/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6853 (do not change this comment) */