* dbus.texi (Receiving Method Calls): Add optional argument
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
0b5538bd 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
114f9c96 6 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
e468b87f 7 Free Software Foundation, Inc.
bc78d348 8
fa9a63c5
RM
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
e468b87f 11 the Free Software Foundation; either version 3, or (at your option)
fa9a63c5
RM
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
7814e705 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
4fc5845f 21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
7814e705 22 USA. */
fa9a63c5 23
6df42991 24/* TODO:
505bde11 25 - structure the opcode space into opcode+flag.
dc1e502d 26 - merge with glibc's regex.[ch].
01618498 27 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
28 need to modify the compiled regexp so that re_match can be reentrant.
29 - get rid of on_failure_jump_smart by doing the optimization in re_comp
30 rather than at run-time, so that re_match can be reentrant.
01618498 31*/
505bde11 32
fa9a63c5 33/* AIX requires this to be the first thing in the file. */
0b32bf0e 34#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
35 #pragma alloca
36#endif
37
fa9a63c5 38#ifdef HAVE_CONFIG_H
0b32bf0e 39# include <config.h>
fa9a63c5
RM
40#endif
41
4bb91c68
SM
42#if defined STDC_HEADERS && !defined emacs
43# include <stddef.h>
44#else
45/* We need this for `regex.h', and perhaps for the Emacs include files. */
46# include <sys/types.h>
47#endif
fa9a63c5 48
14473664
SM
49/* Whether to use ISO C Amendment 1 wide char functions.
50 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
51#if defined _LIBC
52#define WIDE_CHAR_SUPPORT 1
53#else
14473664 54#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
55 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
56#endif
14473664
SM
57
/* On platforms that support the ISO C Amendment 1 functionality, we
   support user-defined character classes.  */
a0ad02f7 60#if WIDE_CHAR_SUPPORT
14473664
SM
61/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
62# include <wchar.h>
63# include <wctype.h>
64#endif
65
c0f9ea08
SM
66#ifdef _LIBC
67/* We have to keep the namespace clean. */
68# define regfree(preg) __regfree (preg)
69# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
70# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
ec869672
JR
71# define regerror(err_code, preg, errbuf, errbuf_size) \
72 __regerror(err_code, preg, errbuf, errbuf_size)
c0f9ea08
SM
73# define re_set_registers(bu, re, nu, st, en) \
74 __re_set_registers (bu, re, nu, st, en)
75# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
77# define re_match(bufp, string, size, pos, regs) \
78 __re_match (bufp, string, size, pos, regs)
79# define re_search(bufp, string, size, startpos, range, regs) \
80 __re_search (bufp, string, size, startpos, range, regs)
81# define re_compile_pattern(pattern, length, bufp) \
82 __re_compile_pattern (pattern, length, bufp)
83# define re_set_syntax(syntax) __re_set_syntax (syntax)
84# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
86# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
87
14473664
SM
88/* Make sure we call libc's function even if the user overrides them. */
89# define btowc __btowc
90# define iswctype __iswctype
91# define wctype __wctype
92
c0f9ea08
SM
93# define WEAK_ALIAS(a,b) weak_alias (a, b)
94
95/* We are also using some library internals. */
96# include <locale/localeinfo.h>
97# include <locale/elem-hash.h>
98# include <langinfo.h>
99#else
100# define WEAK_ALIAS(a,b)
101#endif
102
4bb91c68 103/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 104#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
105# include <libintl.h>
106#else
107# define gettext(msgid) (msgid)
108#endif
109
5e69f11e
RM
110#ifndef gettext_noop
111/* This define is so xgettext can find the internationalizable
112 strings. */
0b32bf0e 113# define gettext_noop(String) String
5e69f11e
RM
114#endif
115
fa9a63c5
RM
116/* The `emacs' switch turns on certain matching commands
117 that make sense only in Emacs. */
118#ifdef emacs
119
d7306fe6 120# include <setjmp.h>
0b32bf0e
SM
121# include "lisp.h"
122# include "buffer.h"
b18215fc
RS
123
124/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 125# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 126
0b32bf0e 127# include "syntax.h"
9117d724 128# include "character.h"
0b32bf0e 129# include "category.h"
fa9a63c5 130
7689ef0b
EZ
131# ifdef malloc
132# undef malloc
133# endif
0b32bf0e 134# define malloc xmalloc
7689ef0b
EZ
135# ifdef realloc
136# undef realloc
137# endif
0b32bf0e 138# define realloc xrealloc
7689ef0b
EZ
139# ifdef free
140# undef free
141# endif
0b32bf0e 142# define free xfree
9abbd165 143
7814e705 144/* Converts the pointer to the char to BEG-based offset from the start. */
0b32bf0e
SM
145# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
146# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
147
148# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 149# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
62a6e103
AS
150# define RE_STRING_CHAR(p, multibyte) \
151 (multibyte ? (STRING_CHAR (p)) : (*(p)))
152# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
153 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
2d1675e4 154
4c0354d7 155# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
cf9c99bc 156
2afc21f5 157# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
cf9c99bc 158
6fdd04b0
KH
/* Set C to the character that ends just before P, converted to
   multibyte when the target is unibyte.  P points into a string that
   is the virtual concatenation of STR1 (which ends at END1) and STR2
   (which ends at END2).

   The expansion reads a variable named `target_multibyte', which must
   be in scope wherever the macro is used.  When it is nonzero, the
   macro steps backward from P (or from END1 when P is exactly STR2)
   until CHAR_HEAD_P accepts a byte or the start of the containing
   string is passed, then decodes with STRING_CHAR.  Otherwise it
   takes the single preceding byte and passes it through
   RE_CHAR_TO_MULTIBYTE.

   P, STR1, STR2, END1 and END2 may be evaluated more than once.  */
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)                     \
  do {                                                                       \
    if (target_multibyte)                                                    \
      {                                                                      \
        re_char *dtemp = (p) == (str2) ? (end1) : (p);                       \
        re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
        while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));                   \
        (c) = STRING_CHAR (dtemp);                                           \
      }                                                                      \
    else                                                                     \
      {                                                                      \
        (c) = ((p) == (str2) ? (end1) : (p))[-1];                            \
        (c) = RE_CHAR_TO_MULTIBYTE (c);                                      \
      }                                                                      \
  } while (0)
177
6fdd04b0
KH
/* Set C to the character at P, converted to multibyte when the target
   is unibyte, and set LEN to the byte length of that character.

   The expansion reads a variable named `target_multibyte', which must
   be in scope wherever the macro is used.  When it is nonzero the
   character and its length come from STRING_CHAR_AND_LENGTH;
   otherwise the single byte at P is fetched, LEN becomes 1, and the
   byte is passed through RE_CHAR_TO_MULTIBYTE.

   P and LEN are parenthesized so that an argument such as `d + 1'
   is dereferenced as a whole.  */
# define GET_CHAR_AFTER(c, p, len)              \
  do {                                          \
    if (target_multibyte)                       \
      (c) = STRING_CHAR_AND_LENGTH (p, len);    \
    else                                        \
      {                                         \
        (c) = *(p);                             \
        (len) = 1;                              \
        (c) = RE_CHAR_TO_MULTIBYTE (c);         \
      }                                         \
  } while (0)
4e8a9132 191
fa9a63c5
RM
192#else /* not emacs */
193
194/* If we are not linking with Emacs proper,
195 we can't use the relocating allocator
196 even if config.h says that we can. */
0b32bf0e 197# undef REL_ALLOC
fa9a63c5 198
0b32bf0e
SM
199# if defined STDC_HEADERS || defined _LIBC
200# include <stdlib.h>
201# else
fa9a63c5
RM
202char *malloc ();
203char *realloc ();
0b32bf0e 204# endif
fa9a63c5 205
a77f947b
CY
206/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
207
/* Allocate SIZE bytes with malloc and return the result.

   If malloc returns a null pointer for a nonzero SIZE, write
   "virtual memory exhausted" to file descriptor 2 and exit with
   status 1.  A null result for SIZE == 0 is returned to the caller
   unchanged.  */
void *
xmalloc (size_t size)
{
  static const char exhausted[] = "virtual memory exhausted\n";
  void *val = malloc (size);

  if (!val && size)
    {
      /* sizeof counts the terminating NUL, which is not written.  */
      write (2, exhausted, sizeof exhausted - 1);
      exit (1);
    }
  return val;
}
221
/* Resize BLOCK to SIZE bytes and return the (possibly moved) block.

   A null BLOCK is handled by calling malloc directly.  If the
   allocator returns a null pointer for a nonzero SIZE, write
   "virtual memory exhausted" to file descriptor 2 and exit with
   status 1.  A null result for SIZE == 0 is returned to the caller
   unchanged.  */
void *
xrealloc (void *block, size_t size)
{
  static const char exhausted[] = "virtual memory exhausted\n";
  void *val;

  /* We must call malloc explicitly when BLOCK is 0, since some
     reallocs don't do this.  */
  if (block)
    val = realloc (block, size);
  else
    val = malloc (size);

  if (!val && size)
    {
      /* sizeof counts the terminating NUL, which is not written.  */
      write (2, exhausted, sizeof exhausted - 1);
      exit (1);
    }
  return val;
}
241
a073faa6
CY
242# ifdef malloc
243# undef malloc
244# endif
245# define malloc xmalloc
246# ifdef realloc
247# undef realloc
248# endif
249# define realloc xrealloc
250
9e4ecb26 251/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 252 If nothing else has been done, use the method below. */
0b32bf0e
SM
253# ifdef INHIBIT_STRING_HEADER
254# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
255# if !defined bzero && !defined bcopy
256# undef INHIBIT_STRING_HEADER
257# endif
258# endif
259# endif
9e4ecb26 260
4bb91c68 261/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
262 This is used in most programs--a few other programs avoid this
263 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
264# ifndef INHIBIT_STRING_HEADER
265# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
266# include <string.h>
0b32bf0e 267# ifndef bzero
4bb91c68
SM
268# ifndef _LIBC
269# define bzero(s, n) (memset (s, '\0', n), (s))
270# else
271# define bzero(s, n) __bzero (s, n)
272# endif
0b32bf0e
SM
273# endif
274# else
275# include <strings.h>
4bb91c68
SM
276# ifndef memcmp
277# define memcmp(s1, s2, n) bcmp (s1, s2, n)
278# endif
279# ifndef memcpy
280# define memcpy(d, s, n) (bcopy (s, d, n), (d))
281# endif
0b32bf0e
SM
282# endif
283# endif
fa9a63c5
RM
284
285/* Define the syntax stuff for \<, \>, etc. */
286
990b2375 287/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
669fa600 288enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
fa9a63c5 289
0b32bf0e 290# define SWITCH_ENUM_CAST(x) (x)
fa9a63c5 291
e934739e 292/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
293# define CHAR_CHARSET(c) 0
294# define CHARSET_LEADING_CODE_BASE(c) 0
295# define MAX_MULTIBYTE_LENGTH 1
296# define RE_MULTIBYTE_P(x) 0
bf216479 297# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
298# define WORD_BOUNDARY_P(c1, c2) (0)
299# define CHAR_HEAD_P(p) (1)
300# define SINGLE_BYTE_CHAR_P(c) (1)
301# define SAME_CHARSET_P(c1, c2) (1)
aa3830c4 302# define BYTES_BY_CHAR_HEAD(p) (1)
70806df6 303# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
62a6e103
AS
304# define STRING_CHAR(p) (*(p))
305# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
0b32bf0e 306# define CHAR_STRING(c, s) (*(s) = (c), 1)
62a6e103
AS
307# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
308# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
cf9c99bc
KH
309# define RE_CHAR_TO_MULTIBYTE(c) (c)
310# define RE_CHAR_TO_UNIBYTE(c) (c)
0b32bf0e 311# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 312 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
/* Non-Emacs version: fetch the byte at P into C and set LEN to 1.
   P and LEN are parenthesized so that an argument such as `d + 1'
   is dereferenced as a whole.  */
# define GET_CHAR_AFTER(c, p, len)      \
  ((c) = *(p), (len) = 1)
0b32bf0e 315# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
316# define BYTE8_TO_CHAR(c) (c)
317# define CHAR_BYTE8_P(c) (0)
bf216479 318# define CHAR_LEADING_CODE(c) (c)
8f924df7 319
fa9a63c5 320#endif /* not emacs */
4e8a9132
SM
321
322#ifndef RE_TRANSLATE
0b32bf0e
SM
323# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
324# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 325#endif
fa9a63c5
RM
326\f
327/* Get the interface, including the syntax bits. */
328#include "regex.h"
329
f71b19b6
DL
330/* isalpha etc. are used for the character classes. */
331#include <ctype.h>
fa9a63c5 332
f71b19b6 333#ifdef emacs
fa9a63c5 334
f71b19b6 335/* 1 if C is an ASCII character. */
0b32bf0e 336# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 337
f71b19b6 338/* 1 if C is a unibyte character. */
0b32bf0e 339# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 340
f71b19b6 341/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 342
f71b19b6 343/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
344# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
345# define ISCNTRL(c) ((c) < ' ')
346# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
347 || ((c) >= 'a' && (c) <= 'f') \
348 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
349
350/* This is only used for single-byte characters. */
0b32bf0e 351# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
352
353/* The rest must handle multibyte characters. */
354
0b32bf0e 355# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 356 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
357 : 1)
358
14473664 359# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 360 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
361 : 1)
362
0b32bf0e 363# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
364 ? (((c) >= 'a' && (c) <= 'z') \
365 || ((c) >= 'A' && (c) <= 'Z') \
366 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
367 : SYNTAX (c) == Sword)
368
0b32bf0e 369# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
370 ? (((c) >= 'a' && (c) <= 'z') \
371 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
372 : SYNTAX (c) == Sword)
373
0b32bf0e 374# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 375
0b32bf0e 376# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
377 ? ((c) > ' ' && (c) < 0177 \
378 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
379 || ((c) >= 'A' && (c) <= 'Z') \
380 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
381 : SYNTAX (c) != Sword)
382
0b32bf0e 383# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 384
0b32bf0e 385# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 386
0b32bf0e 387# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
388
389#else /* not emacs */
390
f71b19b6
DL
391/* Jim Meyering writes:
392
393 "... Some ctype macros are valid only for character codes that
394 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
395 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 396 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
397 STDC_HEADERS is defined, then autoconf has verified that the ctype
398 macros don't need to be guarded with references to isascii. ...
399 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
400 eliminate the && through constant folding."
401 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 402
4bb91c68 403# undef ISASCII
0b32bf0e
SM
404# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
405# define ISASCII(c) 1
406# else
407# define ISASCII(c) isascii(c)
408# endif
f71b19b6
DL
409
410/* 1 if C is an ASCII character. */
0b32bf0e 411# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
412
413/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
414# define ISUNIBYTE(c) 1
415
416# ifdef isblank
417# define ISBLANK(c) (ISASCII (c) && isblank (c))
418# else
419# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
420# endif
421# ifdef isgraph
422# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
423# else
424# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
425# endif
426
4bb91c68 427# undef ISPRINT
0b32bf0e
SM
428# define ISPRINT(c) (ISASCII (c) && isprint (c))
429# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
430# define ISALNUM(c) (ISASCII (c) && isalnum (c))
431# define ISALPHA(c) (ISASCII (c) && isalpha (c))
432# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
433# define ISLOWER(c) (ISASCII (c) && islower (c))
434# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
435# define ISSPACE(c) (ISASCII (c) && isspace (c))
436# define ISUPPER(c) (ISASCII (c) && isupper (c))
437# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
438
439# define ISWORD(c) ISALPHA(c)
440
4bb91c68
SM
441# ifdef _tolower
442# define TOLOWER(c) _tolower(c)
443# else
444# define TOLOWER(c) tolower(c)
445# endif
446
447/* How many characters in the character set. */
448# define CHAR_SET_SIZE 256
449
0b32bf0e 450# ifdef SYNTAX_TABLE
f71b19b6 451
0b32bf0e 452extern char *re_syntax_table;
f71b19b6 453
0b32bf0e
SM
454# else /* not SYNTAX_TABLE */
455
0b32bf0e
SM
456static char re_syntax_table[CHAR_SET_SIZE];
457
458static void
459init_syntax_once ()
460{
461 register int c;
462 static int done = 0;
463
464 if (done)
465 return;
466
467 bzero (re_syntax_table, sizeof re_syntax_table);
468
4bb91c68
SM
469 for (c = 0; c < CHAR_SET_SIZE; ++c)
470 if (ISALNUM (c))
471 re_syntax_table[c] = Sword;
fa9a63c5 472
669fa600 473 re_syntax_table['_'] = Ssymbol;
fa9a63c5 474
0b32bf0e
SM
475 done = 1;
476}
477
478# endif /* not SYNTAX_TABLE */
96cc36cc 479
4bb91c68
SM
480# define SYNTAX(c) re_syntax_table[(c)]
481
96cc36cc
RS
482#endif /* not emacs */
483\f
fa9a63c5 484#ifndef NULL
0b32bf0e 485# define NULL (void *)0
fa9a63c5
RM
486#endif
487
488/* We remove any previous definition of `SIGN_EXTEND_CHAR',
489 since ours (we hope) works properly with all combinations of
490 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 491 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
492#undef SIGN_EXTEND_CHAR
493#if __STDC__
0b32bf0e 494# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
495#else /* not __STDC__ */
496/* As in Harbison and Steele. */
0b32bf0e 497# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
498#endif
499\f
500/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
501 use `alloca' instead of `malloc'. This is because using malloc in
502 re_search* or re_match* could cause memory leaks when C-g is used in
503 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
504 the other hand, malloc is more portable, and easier to debug.
505
fa9a63c5
RM
506 Because we sometimes use alloca, some routines have to be macros,
507 not functions -- `alloca'-allocated space disappears at the end of the
508 function it is called in. */
509
510#ifdef REGEX_MALLOC
511
0b32bf0e
SM
512# define REGEX_ALLOCATE malloc
513# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
514# define REGEX_FREE free
fa9a63c5
RM
515
516#else /* not REGEX_MALLOC */
517
518/* Emacs already defines alloca, sometimes. */
0b32bf0e 519# ifndef alloca
fa9a63c5
RM
520
521/* Make alloca work the best possible way. */
0b32bf0e
SM
522# ifdef __GNUC__
523# define alloca __builtin_alloca
524# else /* not __GNUC__ */
7f585e7a 525# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
526# include <alloca.h>
527# endif /* HAVE_ALLOCA_H */
528# endif /* not __GNUC__ */
fa9a63c5 529
0b32bf0e 530# endif /* not alloca */
fa9a63c5 531
0b32bf0e 532# define REGEX_ALLOCATE alloca
fa9a63c5
RM
533
534/* Assumes a `char *destination' variable. */
0b32bf0e 535# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 536 (destination = (char *) alloca (nsize), \
4bb91c68 537 memcpy (destination, source, osize))
fa9a63c5
RM
538
539/* No need to do anything to free, after alloca. */
0b32bf0e 540# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
541
542#endif /* not REGEX_MALLOC */
543
544/* Define how to allocate the failure stack. */
545
0b32bf0e 546#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 547
0b32bf0e 548# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 549 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 550# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 551 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 552# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
553 r_alloc_free (&failure_stack_ptr)
554
4297555e 555#else /* not using relocating allocator */
fa9a63c5 556
0b32bf0e 557# ifdef REGEX_MALLOC
fa9a63c5 558
0b32bf0e
SM
559# define REGEX_ALLOCATE_STACK malloc
560# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
561# define REGEX_FREE_STACK free
fa9a63c5 562
0b32bf0e 563# else /* not REGEX_MALLOC */
fa9a63c5 564
0b32bf0e 565# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 566
0b32bf0e 567# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 568 REGEX_REALLOCATE (source, osize, nsize)
7814e705 569/* No need to explicitly free anything. */
0b32bf0e 570# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 571
0b32bf0e 572# endif /* not REGEX_MALLOC */
4297555e 573#endif /* not using relocating allocator */
fa9a63c5
RM
574
575
/* True if `size1' is nonzero and PTR points anywhere inside
   `string1' or just past its end.  This also works when PTR is NULL,
   which is a good thing.  */
25fe55af 579#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
580 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
581
/* (Re)Allocate N items of type T.

   TALLOC returns the result of malloc cast to `T *'.
   RETALLOC resizes ADDR with realloc and stores the result back in
   ADDR, which must therefore be an lvalue.
   RETALLOC_IF does a RETALLOC when ADDR is non-null and a TALLOC
   otherwise.  It is wrapped in `do ... while (0)' so that it behaves
   as a single statement, including inside an outer if/else.
   REGEX_TALLOC is like TALLOC but allocates with REGEX_ALLOCATE.

   ADDR and N may be evaluated more than once.  */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) \
  ((addr) = (t *) realloc ((addr), (n) * sizeof (t)))
#define RETALLOC_IF(addr, n, t)                 \
  do                                            \
    {                                           \
      if (addr)                                 \
        RETALLOC ((addr), (n), t);              \
      else                                      \
        (addr) = TALLOC ((n), t);               \
    }                                           \
  while (0)
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
588
4bb91c68 589#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
590
591#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
592
593#undef MAX
594#undef MIN
595#define MAX(a, b) ((a) > (b) ? (a) : (b))
596#define MIN(a, b) ((a) < (b) ? (a) : (b))
597
66f0296e
SM
598/* Type of source-pattern and string chars. */
599typedef const unsigned char re_char;
600
fa9a63c5
RM
601typedef char boolean;
602#define false 0
603#define true 1
604
4bb91c68
SM
605static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
606 re_char *string1, int size1,
607 re_char *string2, int size2,
608 int pos,
609 struct re_registers *regs,
610 int stop));
fa9a63c5
RM
611\f
612/* These are the command codes that appear in compiled regular
4bb91c68 613 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
614 command code can specify any interpretation whatsoever for its
615 arguments. Zero bytes may appear in the compiled regular expression. */
616
typedef enum
{
  /* Enumerators are numbered consecutively from 0 in the order
     listed here.  */
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

  /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one char belonging to specified set.  First
     following byte is number of bitmap bytes.  Then come bytes
     for a bitmap saying which chars are in.  Bits in each byte
     are ordered low-bit-first.  A character is in the set if its
     bit is 1.  A character too large to have a bit in the map is
     automatically not in the set.

     If the length byte has the 0x80 bit set, then that stuff
     is followed by a range table:
         2 bytes of flags for character sets (low 8 bits, high 8 bits)
             See RANGE_TABLE_WORK_BITS below.
         2 bytes, the number of pairs that follow (up to 32767)
         pairs, each 2 multibyte characters,
             each multibyte character represented as 3 bytes.  */
  charset,

  /* Same parameters as charset, but match any character that is
     not one of those specified.  */
  charset_not,

  /* Start remembering the text that is matched, for storing in a
     register.  Followed by one byte with the register number, in
     the range 0 to one less than the pattern buffer's re_nsub
     field.  */
  start_memory,

  /* Stop remembering the text that is matched and store it in a
     memory register.  Followed by one byte with the register
     number, in the range 0 to one less than `re_nsub' in the
     pattern buffer.  */
  stop_memory,

  /* Match a duplicate of something remembered.  Followed by one
     byte containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning
     of string to be matched (if not).  */
  begbuf,

  /* Analogously, for end of buffer/string.  */
  endbuf,

  /* Followed by two-byte relative address to which to jump.  */
  jump,

  /* Followed by two-byte relative address of place to resume at
     in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but pushes a placeholder instead of the
     current string position when executed.  */
  on_failure_keep_string_jump,

  /* Just like `on_failure_jump', except that it checks that we
     don't get stuck in an infinite loop (matching an empty string
     indefinitely).  */
  on_failure_jump_loop,

  /* Just like `on_failure_jump_loop', except that it checks for
     a different kind of loop (the kind that shows up with non-greedy
     operators).  This operation has to be immediately preceded
     by a `no_op'.  */
  on_failure_jump_nastyloop,

  /* A smart `on_failure_jump' used for greedy * and + operators.
     It analyses the loop before which it is put and if the
     loop does not require backtracking, it changes itself to
     `on_failure_keep_string_jump' and short-circuits the loop,
     else it just defaults to changing itself into `on_failure_jump'.
     It assumes that it is pointing to just past a `jump'.  */
  on_failure_jump_smart,

  /* Followed by two-byte relative address and two-byte number n.
     After matching N times, jump to the address upon failure.
     Does not work if N starts at 0: use on_failure_jump_loop
     instead.  */
  succeed_n,

  /* Followed by two-byte relative address, and two-byte number n.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the
     subsequent two-byte number.  The address *includes* the two
     bytes of number.  */
  set_number_at,

  wordbeg,      /* Succeeds if at word beginning.  */
  wordend,      /* Succeeds if at word end.  */

  wordbound,    /* Succeeds if at a word boundary.  */
  notwordbound, /* Succeeds if not at a word boundary.  */

  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

  /* Matches any character whose syntax is specified.  Followed by
     a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec

#ifdef emacs
  ,before_dot,  /* Succeeds if before point.  */
  at_dot,       /* Succeeds if at point.  */
  after_dot,    /* Succeeds if after point.  */

  /* Matches any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;
756\f
757/* Common operations on the compiled pattern. */
758
/* Store NUMBER in two contiguous bytes starting at DESTINATION:
   the low-order eight bits go in the first byte and NUMBER shifted
   right by eight goes in the second.  DESTINATION and NUMBER are
   each evaluated twice.  */

#define STORE_NUMBER(destination, number)                               \
  do {                                                                  \
    (destination)[0] = (number) & 0377;                                 \
    (destination)[1] = (number) >> 8;                                   \
  } while (0)

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)                      \
  do {                                                                  \
    STORE_NUMBER (destination, number);                                 \
    (destination) += 2;                                                 \
  } while (0)
776
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte supplies the low-order eight bits; the
   second byte is sign-extended and supplies the rest, so the result
   may be negative.

   The high byte is scaled by multiplication rather than `<< 8'
   because left-shifting a negative value is undefined in ISO C.  */

#define EXTRACT_NUMBER(destination, source)                             \
  do {                                                                  \
    (destination) = *(source) & 0377;                                   \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;          \
  } while (0)
785
786#ifdef DEBUG
4bb91c68 787static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
788static void
789extract_number (dest, source)
790 int *dest;
01618498 791 re_char *source;
fa9a63c5 792{
5e69f11e 793 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
794 *dest = *source & 0377;
795 *dest += temp << 8;
796}
797
4bb91c68 798# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
799# undef EXTRACT_NUMBER
800# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
801# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
802
803#endif /* DEBUG */
804
/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number
   (two bytes further on).  SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)                    \
  do {                                                                  \
    EXTRACT_NUMBER (destination, source);                               \
    (source) += 2;                                                      \
  } while (0)
813
814#ifdef DEBUG
4bb91c68
SM
815static void extract_number_and_incr _RE_ARGS ((int *destination,
816 re_char **source));
fa9a63c5
RM
817static void
818extract_number_and_incr (destination, source)
819 int *destination;
01618498 820 re_char **source;
5e69f11e 821{
fa9a63c5
RM
822 extract_number (destination, *source);
823 *source += 2;
824}
825
0b32bf0e
SM
826# ifndef EXTRACT_MACROS
827# undef EXTRACT_NUMBER_AND_INCR
828# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 829 extract_number_and_incr (&dest, &src)
0b32bf0e 830# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
831
832#endif /* DEBUG */
833\f
b18215fc
RS
/* Store a multibyte character in three contiguous bytes starting at
   DESTINATION, lowest-order byte first, and increment DESTINATION to
   the byte after where the character is stored.  Therefore,
   DESTINATION must be an lvalue.  CHARACTER is evaluated three
   times.  */

#define STORE_CHARACTER_AND_INCR(destination, character)        \
  do {                                                          \
    (destination)[0] = (character) & 0377;                      \
    (destination)[1] = ((character) >> 8) & 0377;               \
    (destination)[2] = (character) >> 16;                       \
    (destination) += 3;                                         \
  } while (0)
845
/* Put into DESTINATION a character stored in three contiguous bytes
   starting at SOURCE, lowest-order byte first.  SOURCE is evaluated
   three times.  */

#define EXTRACT_CHARACTER(destination, source)  \
  do {                                          \
    (destination) = ((source)[0]                \
                     | ((source)[1] << 8)       \
                     | ((source)[2] << 16));    \
  } while (0)
855
856
/* Macros for charset.  */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.
   The size lives in the low 7 bits of the byte after the opcode.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table (the top bit of the size byte).  */
#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)

/* Return the address of the range table of charset P -- not the start
   of the table data itself, but the position just before it, where the
   number of ranges is stored.  The `4 +' skips the re_opcode_t, the
   byte holding the size of the bitmap, and the 2 bytes of flags at the
   start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table (two bytes, low byte
   first, located right after the bitmap).  */
#define CHARSET_RANGE_TABLE_BITS(p) \
  ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)

/* Test if C is listed in the bitmap of charset P.  Characters beyond
   the end of the bitmap are never listed.  */
#define CHARSET_LOOKUP_BITMAP(p, c) \
  ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
   && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
881
/* Return the address of the end of RANGE_TABLE.  COUNT is the number of
   ranges (each a pair of (start, end)) in RANGE_TABLE.  `* 2' accounts
   for the start and the end of each range; `* 3' is the size in bytes
   of each start and each end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count) \
  ((range_table) + (count) * 2 * 3)
888
/* Test if C is in RANGE_TABLE.  The flag NOT is negated (once) if C
   falls within one of the ranges; otherwise it is left alone.
   COUNT is the number of ranges in RANGE_TABLE.  NOT must be an lvalue.
   Each range is two 3-byte characters, start then end, both inclusive.  */
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
  do \
    { \
      re_wchar_t range_start, range_end; \
      re_char *p; \
      re_char *range_table_end \
        = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
 \
      for (p = (range_table); p < range_table_end; p += 2 * 3) \
        { \
          EXTRACT_CHARACTER (range_start, p); \
          EXTRACT_CHARACTER (range_end, p + 3); \
 \
          if (range_start <= (c) && (c) <= range_end) \
            { \
              (not) = !(not); \
              break; \
            } \
        } \
    } \
  while (0)
912
/* Test if C is in range table of CHARSET.  The flag NOT is negated if
   C is listed in it.  The number of ranges is read from the front of
   the range table with EXTRACT_NUMBER_AND_INCR, and the ranges that
   follow are then searched with CHARSET_LOOKUP_RANGE_TABLE_RAW.  */
#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
  do \
    { \
      /* Number of ranges in range table. */ \
      int count; \
      re_char *range_table = CHARSET_RANGE_TABLE (charset); \
 \
      EXTRACT_NUMBER_AND_INCR (count, range_table); \
      CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
    } \
  while (0)
926\f
fa9a63c5
RM
927/* If DEBUG is defined, Regex prints many voluminous messages about what
928 it is doing (if the variable `debug' is nonzero). If linked with the
929 main program in `iregex.c', you can enter patterns and strings
930 interactively. And if linked with the main program in `main.c' and
4bb91c68 931 the other test files, you can run the already-written tests. */
fa9a63c5
RM
932
933#ifdef DEBUG
934
935/* We use standard I/O for debugging. */
0b32bf0e 936# include <stdio.h>
fa9a63c5
RM
937
938/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 939# include <assert.h>
fa9a63c5 940
99633e97 941static int debug = -100000;
fa9a63c5 942
0b32bf0e
SM
943# define DEBUG_STATEMENT(e) e
944# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
945# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
946# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
947# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
948# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 949 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 950# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 951 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
952
953
954/* Print the fastmap in human-readable form. */
955
956void
957print_fastmap (fastmap)
958 char *fastmap;
959{
960 unsigned was_a_range = 0;
5e69f11e
RM
961 unsigned i = 0;
962
fa9a63c5
RM
963 while (i < (1 << BYTEWIDTH))
964 {
965 if (fastmap[i++])
966 {
967 was_a_range = 0;
25fe55af
RS
968 putchar (i - 1);
969 while (i < (1 << BYTEWIDTH) && fastmap[i])
970 {
971 was_a_range = 1;
972 i++;
973 }
fa9a63c5 974 if (was_a_range)
25fe55af
RS
975 {
976 printf ("-");
977 putchar (i - 1);
978 }
979 }
fa9a63c5 980 }
5e69f11e 981 putchar ('\n');
fa9a63c5
RM
982}
983
984
985/* Print a compiled pattern string in human-readable form, starting at
986 the START pointer into it and ending just before the pointer END. */
987
988void
989print_partial_compiled_pattern (start, end)
01618498
SM
990 re_char *start;
991 re_char *end;
fa9a63c5
RM
992{
993 int mcnt, mcnt2;
01618498
SM
994 re_char *p = start;
995 re_char *pend = end;
fa9a63c5
RM
996
997 if (start == NULL)
998 {
a1a052df 999 fprintf (stderr, "(null)\n");
fa9a63c5
RM
1000 return;
1001 }
5e69f11e 1002
fa9a63c5
RM
1003 /* Loop over pattern commands. */
1004 while (p < pend)
1005 {
a1a052df 1006 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
1007
1008 switch ((re_opcode_t) *p++)
1009 {
25fe55af 1010 case no_op:
a1a052df 1011 fprintf (stderr, "/no_op");
25fe55af 1012 break;
fa9a63c5 1013
99633e97 1014 case succeed:
a1a052df 1015 fprintf (stderr, "/succeed");
99633e97
SM
1016 break;
1017
fa9a63c5
RM
1018 case exactn:
1019 mcnt = *p++;
a1a052df 1020 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 1021 do
fa9a63c5 1022 {
a1a052df 1023 fprintf (stderr, "/%c", *p++);
25fe55af
RS
1024 }
1025 while (--mcnt);
1026 break;
fa9a63c5
RM
1027
1028 case start_memory:
a1a052df 1029 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 1030 break;
fa9a63c5
RM
1031
1032 case stop_memory:
a1a052df 1033 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 1034 break;
fa9a63c5
RM
1035
1036 case duplicate:
a1a052df 1037 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
1038 break;
1039
1040 case anychar:
a1a052df 1041 fprintf (stderr, "/anychar");
fa9a63c5
RM
1042 break;
1043
1044 case charset:
25fe55af
RS
1045 case charset_not:
1046 {
1047 register int c, last = -100;
fa9a63c5 1048 register int in_range = 0;
99633e97
SM
1049 int length = CHARSET_BITMAP_SIZE (p - 1);
1050 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1051
a1a052df 1052 fprintf (stderr, "/charset [%s",
839966f3 1053 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1054
839966f3
KH
1055 if (p + *p >= pend)
1056 fprintf (stderr, " !extends past end of pattern! ");
fa9a63c5 1057
25fe55af 1058 for (c = 0; c < 256; c++)
96cc36cc 1059 if (c / 8 < length
fa9a63c5
RM
1060 && (p[1 + (c/8)] & (1 << (c % 8))))
1061 {
1062 /* Are we starting a range? */
1063 if (last + 1 == c && ! in_range)
1064 {
a1a052df 1065 fprintf (stderr, "-");
fa9a63c5
RM
1066 in_range = 1;
1067 }
1068 /* Have we broken a range? */
1069 else if (last + 1 != c && in_range)
96cc36cc 1070 {
a1a052df 1071 fprintf (stderr, "%c", last);
fa9a63c5
RM
1072 in_range = 0;
1073 }
5e69f11e 1074
fa9a63c5 1075 if (! in_range)
a1a052df 1076 fprintf (stderr, "%c", c);
fa9a63c5
RM
1077
1078 last = c;
25fe55af 1079 }
fa9a63c5
RM
1080
1081 if (in_range)
a1a052df 1082 fprintf (stderr, "%c", last);
fa9a63c5 1083
a1a052df 1084 fprintf (stderr, "]");
fa9a63c5 1085
99633e97 1086 p += 1 + length;
96cc36cc 1087
96cc36cc 1088 if (has_range_table)
99633e97
SM
1089 {
1090 int count;
a1a052df 1091 fprintf (stderr, "has-range-table");
99633e97
SM
1092
1093 /* ??? Should print the range table; for now, just skip it. */
1094 p += 2; /* skip range table bits */
1095 EXTRACT_NUMBER_AND_INCR (count, p);
1096 p = CHARSET_RANGE_TABLE_END (p, count);
1097 }
fa9a63c5
RM
1098 }
1099 break;
1100
1101 case begline:
a1a052df 1102 fprintf (stderr, "/begline");
25fe55af 1103 break;
fa9a63c5
RM
1104
1105 case endline:
a1a052df 1106 fprintf (stderr, "/endline");
25fe55af 1107 break;
fa9a63c5
RM
1108
1109 case on_failure_jump:
25fe55af 1110 extract_number_and_incr (&mcnt, &p);
a1a052df 1111 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1112 break;
fa9a63c5
RM
1113
1114 case on_failure_keep_string_jump:
25fe55af 1115 extract_number_and_incr (&mcnt, &p);
a1a052df 1116 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1117 break;
fa9a63c5 1118
0683b6fa
SM
1119 case on_failure_jump_nastyloop:
1120 extract_number_and_incr (&mcnt, &p);
a1a052df 1121 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1122 break;
1123
505bde11 1124 case on_failure_jump_loop:
fa9a63c5 1125 extract_number_and_incr (&mcnt, &p);
a1a052df 1126 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1127 break;
1128
505bde11 1129 case on_failure_jump_smart:
fa9a63c5 1130 extract_number_and_incr (&mcnt, &p);
a1a052df 1131 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1132 break;
1133
25fe55af 1134 case jump:
fa9a63c5 1135 extract_number_and_incr (&mcnt, &p);
a1a052df 1136 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1137 break;
1138
25fe55af
RS
1139 case succeed_n:
1140 extract_number_and_incr (&mcnt, &p);
1141 extract_number_and_incr (&mcnt2, &p);
a1a052df 1142 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1143 break;
5e69f11e 1144
25fe55af
RS
1145 case jump_n:
1146 extract_number_and_incr (&mcnt, &p);
1147 extract_number_and_incr (&mcnt2, &p);
a1a052df 1148 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1149 break;
5e69f11e 1150
25fe55af
RS
1151 case set_number_at:
1152 extract_number_and_incr (&mcnt, &p);
1153 extract_number_and_incr (&mcnt2, &p);
a1a052df 1154 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1155 break;
5e69f11e 1156
25fe55af 1157 case wordbound:
a1a052df 1158 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1159 break;
1160
1161 case notwordbound:
a1a052df 1162 fprintf (stderr, "/notwordbound");
25fe55af 1163 break;
fa9a63c5
RM
1164
1165 case wordbeg:
a1a052df 1166 fprintf (stderr, "/wordbeg");
fa9a63c5 1167 break;
5e69f11e 1168
fa9a63c5 1169 case wordend:
a1a052df 1170 fprintf (stderr, "/wordend");
e2543b02 1171 break;
5e69f11e 1172
669fa600 1173 case symbeg:
e2543b02 1174 fprintf (stderr, "/symbeg");
669fa600
SM
1175 break;
1176
1177 case symend:
e2543b02 1178 fprintf (stderr, "/symend");
669fa600 1179 break;
5e69f11e 1180
1fb352e0 1181 case syntaxspec:
a1a052df 1182 fprintf (stderr, "/syntaxspec");
1fb352e0 1183 mcnt = *p++;
a1a052df 1184 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1185 break;
1186
1187 case notsyntaxspec:
a1a052df 1188 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1189 mcnt = *p++;
a1a052df 1190 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1191 break;
1192
0b32bf0e 1193# ifdef emacs
fa9a63c5 1194 case before_dot:
a1a052df 1195 fprintf (stderr, "/before_dot");
25fe55af 1196 break;
fa9a63c5
RM
1197
1198 case at_dot:
a1a052df 1199 fprintf (stderr, "/at_dot");
25fe55af 1200 break;
fa9a63c5
RM
1201
1202 case after_dot:
a1a052df 1203 fprintf (stderr, "/after_dot");
25fe55af 1204 break;
fa9a63c5 1205
1fb352e0 1206 case categoryspec:
a1a052df 1207 fprintf (stderr, "/categoryspec");
fa9a63c5 1208 mcnt = *p++;
a1a052df 1209 fprintf (stderr, "/%d", mcnt);
25fe55af 1210 break;
5e69f11e 1211
1fb352e0 1212 case notcategoryspec:
a1a052df 1213 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1214 mcnt = *p++;
a1a052df 1215 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1216 break;
0b32bf0e 1217# endif /* emacs */
fa9a63c5 1218
fa9a63c5 1219 case begbuf:
a1a052df 1220 fprintf (stderr, "/begbuf");
25fe55af 1221 break;
fa9a63c5
RM
1222
1223 case endbuf:
a1a052df 1224 fprintf (stderr, "/endbuf");
25fe55af 1225 break;
fa9a63c5 1226
25fe55af 1227 default:
a1a052df 1228 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1229 }
1230
a1a052df 1231 fprintf (stderr, "\n");
fa9a63c5
RM
1232 }
1233
a1a052df 1234 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1235}
1236
1237
1238void
1239print_compiled_pattern (bufp)
1240 struct re_pattern_buffer *bufp;
1241{
01618498 1242 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1243
1244 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1245 printf ("%ld bytes used/%ld bytes allocated.\n",
1246 bufp->used, bufp->allocated);
fa9a63c5
RM
1247
1248 if (bufp->fastmap_accurate && bufp->fastmap)
1249 {
1250 printf ("fastmap: ");
1251 print_fastmap (bufp->fastmap);
1252 }
1253
1254 printf ("re_nsub: %d\t", bufp->re_nsub);
1255 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1256 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1257 printf ("no_sub: %d\t", bufp->no_sub);
1258 printf ("not_bol: %d\t", bufp->not_bol);
1259 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1260 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1261 fflush (stdout);
fa9a63c5
RM
1262 /* Perhaps we should print the translate table? */
1263}
1264
1265
1266void
1267print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1268 re_char *where;
1269 re_char *string1;
1270 re_char *string2;
fa9a63c5
RM
1271 int size1;
1272 int size2;
1273{
4bb91c68 1274 int this_char;
5e69f11e 1275
fa9a63c5
RM
1276 if (where == NULL)
1277 printf ("(null)");
1278 else
1279 {
1280 if (FIRST_STRING_P (where))
25fe55af
RS
1281 {
1282 for (this_char = where - string1; this_char < size1; this_char++)
1283 putchar (string1[this_char]);
fa9a63c5 1284
25fe55af
RS
1285 where = string2;
1286 }
fa9a63c5
RM
1287
1288 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1289 putchar (string2[this_char]);
fa9a63c5
RM
1290 }
1291}
1292
1293#else /* not DEBUG */
1294
0b32bf0e
SM
1295# undef assert
1296# define assert(e)
fa9a63c5 1297
0b32bf0e
SM
1298# define DEBUG_STATEMENT(e)
1299# define DEBUG_PRINT1(x)
1300# define DEBUG_PRINT2(x1, x2)
1301# define DEBUG_PRINT3(x1, x2, x3)
1302# define DEBUG_PRINT4(x1, x2, x3, x4)
1303# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1304# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1305
1306#endif /* not DEBUG */
1307\f
1308/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1309 also be assigned to arbitrarily: each pattern buffer stores its own
1310 syntax, so it can be changed between regex compilations. */
1311/* This has no initializer because initialized variables in Emacs
1312 become read-only after dumping. */
1313reg_syntax_t re_syntax_options;
1314
1315
1316/* Specify the precise syntax of regexps for compilation. This provides
1317 for compatibility for various utilities which historically have
1318 different, incompatible syntaxes.
1319
1320 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1321 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1322
1323reg_syntax_t
971de7fb 1324re_set_syntax (reg_syntax_t syntax)
fa9a63c5
RM
1325{
1326 reg_syntax_t ret = re_syntax_options;
5e69f11e 1327
fa9a63c5
RM
1328 re_syntax_options = syntax;
1329 return ret;
1330}
c0f9ea08 1331WEAK_ALIAS (__re_set_syntax, re_set_syntax)
f9b0fd99
RS
1332
1333/* Regexp to use to replace spaces, or NULL meaning don't. */
1334static re_char *whitespace_regexp;
1335
1336void
971de7fb 1337re_set_whitespace_regexp (const char *regexp)
f9b0fd99 1338{
6470ea05 1339 whitespace_regexp = (re_char *) regexp;
f9b0fd99
RS
1340}
1341WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1342\f
1343/* This table gives an error message for each of the error codes listed
4bb91c68 1344 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1345 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1346 but why not be nice? */
fa9a63c5
RM
1347
1348static const char *re_error_msgid[] =
5e69f11e
RM
1349 {
1350 gettext_noop ("Success"), /* REG_NOERROR */
1351 gettext_noop ("No match"), /* REG_NOMATCH */
1352 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1353 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1354 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1355 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1356 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1357 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1358 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1359 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1360 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1361 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1362 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1363 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1364 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1365 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1366 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
b3e4c897 1367 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
fa9a63c5
RM
1368 };
1369\f
/* Avoiding alloca during matching, to placate r_alloc.  */

/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
   searching and matching functions should not call alloca.  On some
   systems, alloca is implemented in terms of malloc, and if we're
   using the relocating allocator routines, then malloc could cause a
   relocation, which might (if the strings being searched are in the
   ralloc heap) shift the data out from underneath the regexp
   routines.

   Here's another reason to avoid allocation: Emacs
   processes input from X in a signal handler; processing X input may
   call malloc; if input arrives while a matching routine is calling
   malloc, then we're scrod.  But Emacs can't just block input while
   calling matching routines; then we don't notice interrupts when
   they come in.  So, Emacs blocks input around all regexp calls
   except the matching calls, which it leaves unprotected, in the
   faith that they will not malloc.  */

/* Normally, this is fine.  */
#define MATCH_MAY_ALLOCATE

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
#if defined REGEX_MALLOC && defined emacs
# undef MATCH_MAY_ALLOCATE
#endif
1400
1401\f
/* Failure stack declarations and macros; both re_compile_fastmap and
   re_match_2 use a failure stack.  These have to be macros because of
   REGEX_ALLOCATE_STACK.  */


/* Approximate number of failure points for which to initially allocate space
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  */
#ifndef INIT_FAILURE_ALLOC
# define INIT_FAILURE_ALLOC 20
#endif
1413
/* Roughly the maximum number of failure points on the stack.  Would be
   exactly that if we always used TYPICAL_FAILURE_SIZE items each time
   we failed.  This is a variable only so users of regex can assign to
   it; we never change it ourselves.  We always multiply it by
   TYPICAL_FAILURE_SIZE before using it, so it should probably be a
   byte-count instead.  */
# if defined MATCH_MAY_ALLOCATE
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
size_t re_max_failures = 40000;
# else
size_t re_max_failures = 4000;
# endif
RM
1428
1429union fail_stack_elt
1430{
01618498 1431 re_char *pointer;
c0f9ea08
SM
1432 /* This should be the biggest `int' that's no bigger than a pointer. */
1433 long integer;
fa9a63c5
RM
1434};
1435
1436typedef union fail_stack_elt fail_stack_elt_t;
1437
1438typedef struct
1439{
1440 fail_stack_elt_t *stack;
c0f9ea08
SM
1441 size_t size;
1442 size_t avail; /* Offset of next open position. */
1443 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1444} fail_stack_type;
1445
505bde11 1446#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1447#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1448
1449
/* Define macros to initialize and free the failure stack.
   Do `return -2' if the alloc fails.

   Both operate on the variable `fail_stack'.  When MATCH_MAY_ALLOCATE
   is not defined, INIT_FAIL_STACK only resets the `avail' and `frame'
   offsets and RESET_FAIL_STACK does nothing.
   NOTE(review): the allocating variant requests room for
   INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE elements but records a
   size of INIT_FAILURE_ALLOC; confirm whether that is intended.  */

#ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.stack = (fail_stack_elt_t *) \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
                            * sizeof (fail_stack_elt_t)); \
 \
    if (fail_stack.stack == NULL) \
      return -2; \
 \
    fail_stack.size = INIT_FAILURE_ALLOC; \
    fail_stack.avail = 0; \
    fail_stack.frame = 0; \
  } while (0)

# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
#else
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.avail = 0; \
    fail_stack.frame = 0; \
  } while (0)

# define RESET_FAIL_STACK() ((void)0)
#endif
1478
1479
320a2a73
KH
/* Grow FAIL_STACK by FAIL_STACK_GROWTH_FACTOR, up to a limit
   which allows approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
#define FAIL_STACK_GROWTH_FACTOR 4

#define GROW_FAIL_STACK(fail_stack) \
  (((fail_stack).size * sizeof (fail_stack_elt_t) \
    >= re_max_failures * TYPICAL_FAILURE_SIZE) \
   ? 0 \
   : ((fail_stack).stack \
      = (fail_stack_elt_t *) \
        REGEX_REALLOCATE_STACK ((fail_stack).stack, \
          (fail_stack).size * sizeof (fail_stack_elt_t), \
          MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
               ((fail_stack).size * sizeof (fail_stack_elt_t) \
                * FAIL_STACK_GROWTH_FACTOR))), \
 \
      (fail_stack).stack == NULL \
      ? 0 \
      : ((fail_stack).size \
         = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
                 ((fail_stack).size * sizeof (fail_stack_elt_t) \
                  * FAIL_STACK_GROWTH_FACTOR)) \
            / sizeof (fail_stack_elt_t)), \
         1)))
fa9a63c5
RM
1515
1516
fa9a63c5
RM
/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_POINTER(item) \
  fail_stack.stack[fail_stack.avail++].pointer = (item)

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_INT(item) \
  fail_stack.stack[fail_stack.avail++].integer = (item)

/* Push a fail_stack_elt_t value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_ELT(item) \
  fail_stack.stack[fail_stack.avail++] = (item)

/* These three POP... operations complement the three PUSH... operations.
   All assume that `fail_stack' is nonempty.  */
#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1540
505bde11
SM
/* Individual items aside from the registers.  */
#define NUM_NONREG_ITEMS 3

/* Used to examine the stack (to detect infinite loops).  H is a frame
   handle, i.e. the stack offset just past a frame's three non-register
   items: the pattern pointer is at H - 1, the string pointer at H - 2,
   and the handle of the next (older) frame at H - 3.  */
#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1549
1550
505bde11
SM
/* Grow the failure stack until more than SPACE slots remain free.
   Does `return -2' from the enclosing function if GROW_FAIL_STACK
   fails.  The sizes are cast to int to match the `%d' conversions in
   the debugging output.  */
#define ENSURE_FAIL_STACK(space) \
while (REMAINING_AVAIL_SLOTS <= (space)) { \
  if (!GROW_FAIL_STACK (fail_stack)) \
    return -2; \
  DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (int) (fail_stack).size);\
  DEBUG_PRINT2 (" slots available: %d\n", (int) REMAINING_AVAIL_SLOTS);\
}
1558
/* Push register NUM onto the stack: its regstart and regend values,
   followed by NUM itself.  `destination' is declared for the benefit
   of the stack-growing macros.  */
#define PUSH_FAILURE_REG(num) \
do { \
  char *destination; \
  ENSURE_FAIL_STACK(3); \
  DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
                num, regstart[num], regend[num]); \
  PUSH_FAILURE_POINTER (regstart[num]); \
  PUSH_FAILURE_POINTER (regend[num]); \
  PUSH_FAILURE_INT (num); \
} while (0)
1570
01618498
SM
/* Change the counter's value to VAL, but make sure that it will
   be reset when backtracking.  The old value and PTR are pushed on
   the failure stack, followed by the marker -1 that lets
   POP_FAILURE_REG_OR_COUNT tell a counter from a register.  */
#define PUSH_NUMBER(ptr,val) \
do { \
  char *destination; \
  int c; \
  ENSURE_FAIL_STACK(3); \
  EXTRACT_NUMBER (c, ptr); \
  DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
  PUSH_FAILURE_INT (c); \
  PUSH_FAILURE_POINTER (ptr); \
  PUSH_FAILURE_INT (-1); \
  STORE_NUMBER (ptr, val); \
} while (0)
1585
/* Pop a saved register or a saved counter off the stack.  A top value
   of -1 marks a counter pushed by PUSH_NUMBER, whose old value is
   stored back; any other value is a register number, whose regend and
   regstart are restored.  */
#define POP_FAILURE_REG_OR_COUNT() \
do { \
  int reg = POP_FAILURE_INT (); \
  if (reg == -1) \
    { \
      /* It's a counter. */ \
      /* Here, we discard `const', making re_match non-reentrant. */ \
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
      reg = POP_FAILURE_INT (); \
      STORE_NUMBER (ptr, reg); \
      DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
    } \
  else \
    { \
      regend[reg] = POP_FAILURE_POINTER (); \
      regstart[reg] = POP_FAILURE_POINTER (); \
      DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
                    reg, regstart[reg], regend[reg]); \
    } \
} while (0)
1607
/* Check that we are not stuck in an infinite loop.  Walk the failure
   frames, newest first, for as long as their saved string position is
   STRING_PLACE or NULL; if one of them saved the pattern position
   PAT_CUR, set the variable `cycle' to 1.
   NOTE(review): the final debugging print reads FAILURE_STR (failure)
   even when the walk stopped with failure == 0; confirm this is
   harmless in DEBUG builds.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
do { \
  int failure = TOP_FAILURE_HANDLE (); \
  /* Check for infinite matching loops */ \
  while (failure > 0 \
         && (FAILURE_STR (failure) == string_place \
             || FAILURE_STR (failure) == NULL)) \
    { \
      assert (FAILURE_PAT (failure) >= bufp->buffer \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
      if (FAILURE_PAT (failure) == pat_cur) \
        { \
          cycle = 1; \
          break; \
        } \
      DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
      failure = NEXT_FAILURE_HANDLE(failure); \
    } \
  DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
} while (0)
6df42991 1629
/* Push the information about the state we will need
   if we ever fail back to it.

   Three items are pushed, in this order: the current frame index,
   STRING_PLACE and PATTERN.  The frame index is then moved past them.

   Requires variables fail_stack, regstart, regend and
   num_regs be declared.  GROW_FAIL_STACK requires `destination' be
   declared.

   Does `return FAILURE_CODE' if runs out of memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place) \
do { \
  char *destination; \
 \
  DEBUG_STATEMENT (nfailure_points_pushed++); \
  DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
  DEBUG_PRINT2 (" Before push, next avail: %d\n", (int) (fail_stack).avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) (fail_stack).size); \
 \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
 \
  DEBUG_PRINT1 ("\n"); \
 \
  DEBUG_PRINT2 (" Push frame index: %d\n", (int) fail_stack.frame); \
  PUSH_FAILURE_INT (fail_stack.frame); \
 \
  DEBUG_PRINT2 (" Push string %p: `", string_place); \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT1 ("'\n"); \
  PUSH_FAILURE_POINTER (string_place); \
 \
  DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
  PUSH_FAILURE_POINTER (pattern); \
 \
  /* Close the frame by moving the frame pointer past it. */ \
  fail_stack.frame = fail_stack.avail; \
} while (0)
fa9a63c5 1669
320a2a73
KH
/* Estimate the size of data pushed by a typical failure stack entry.
   An estimate is all we need, because all we use this for
   is to choose a limit for how big to make the failure stack.  */
/* BEWARE, the value `20' is hard-coded in emacs.c:main().  */
#define TYPICAL_FAILURE_SIZE 20

/* How many items can still be added to the stack without overflowing it.  */
#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1678
1679
/* Pops what PUSH_FAILURE_POINT pushes, after first popping any
   registers and counters saved above the current frame.

   We restore into the parameters, both of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
   The arrays REGSTART and REGEND of string positions are restored
   as well.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */

#define POP_FAILURE_POINT(str, pat) \
do { \
  assert (!FAIL_STACK_EMPTY ()); \
 \
  /* Remove failure points and point to how many regs pushed. */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", (int) fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", (int) fail_stack.size); \
 \
  /* Pop the saved registers. */ \
  while (fail_stack.frame < fail_stack.avail) \
    POP_FAILURE_REG_OR_COUNT (); \
 \
  pat = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
 \
  /* If the saved string location is NULL, it came from an \
     on_failure_keep_string_jump opcode, and we want to throw away the \
     saved NULL, thus retaining our current position in the string. */ \
  str = POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
 \
  fail_stack.frame = POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping frame index: %d\n", (int) fail_stack.frame); \
 \
  assert (fail_stack.avail >= 0); \
  assert (fail_stack.frame <= fail_stack.avail); \
 \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1723
1724
1725\f
/* Registers are set to a sentinel when they haven't yet matched.
   The sentinel is the null pointer.  */
#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1728\f
1729/* Subroutine declarations and macros for regex_compile. */
1730
4bb91c68
SM
1731static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1732 reg_syntax_t syntax,
1733 struct re_pattern_buffer *bufp));
1734static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1735static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1736 int arg1, int arg2));
1737static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1738 int arg, unsigned char *end));
1739static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1740 int arg1, int arg2, unsigned char *end));
01618498
SM
1741static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1742 re_char *p,
4bb91c68 1743 reg_syntax_t syntax));
01618498
SM
1744static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1745 re_char *pend,
4bb91c68 1746 reg_syntax_t syntax));
01618498
SM
1747static re_char *skip_one_char _RE_ARGS ((re_char *p));
1748static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1749 char *fastmap, const int multibyte));
fa9a63c5 1750
fa9a63c5 1751/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1752 translation. */
36595814 1753#define PATFETCH(c) \
2d1675e4
SM
1754 do { \
1755 int len; \
1756 if (p == pend) return REG_EEND; \
62a6e103 1757 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
2d1675e4 1758 p += len; \
fa9a63c5
RM
1759 } while (0)
1760
fa9a63c5
RM
1761
1762/* If `translate' is non-null, return translate[D], else just D. We
1763 cast the subscript to translate because some data is declared as
1764 `char *', to avoid warnings when a string constant is passed. But
1765 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1766#ifndef TRANSLATE
0b32bf0e 1767# define TRANSLATE(d) \
66f0296e 1768 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1769#endif
fa9a63c5
RM
1770
1771
1772/* Macros for outputting the compiled pattern into `buffer'. */
1773
1774/* If the buffer isn't allocated when it comes in, use this. */
1775#define INIT_BUF_SIZE 32
1776
4bb91c68 1777/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1778#define GET_BUFFER_SPACE(n) \
01618498 1779 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1780 EXTEND_BUFFER ()
1781
1782/* Make sure we have one more byte of buffer space and then add C to it. */
1783#define BUF_PUSH(c) \
1784 do { \
1785 GET_BUFFER_SPACE (1); \
1786 *b++ = (unsigned char) (c); \
1787 } while (0)
1788
1789
1790/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1791#define BUF_PUSH_2(c1, c2) \
1792 do { \
1793 GET_BUFFER_SPACE (2); \
1794 *b++ = (unsigned char) (c1); \
1795 *b++ = (unsigned char) (c2); \
1796 } while (0)
1797
1798
4bb91c68 1799/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1800#define BUF_PUSH_3(c1, c2, c3) \
1801 do { \
1802 GET_BUFFER_SPACE (3); \
1803 *b++ = (unsigned char) (c1); \
1804 *b++ = (unsigned char) (c2); \
1805 *b++ = (unsigned char) (c3); \
1806 } while (0)
1807
1808
1809/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1810 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1811#define STORE_JUMP(op, loc, to) \
1812 store_op1 (op, loc, (to) - (loc) - 3)
1813
1814/* Likewise, for a two-argument jump. */
1815#define STORE_JUMP2(op, loc, to, arg) \
1816 store_op2 (op, loc, (to) - (loc) - 3, arg)
1817
4bb91c68 1818/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1819#define INSERT_JUMP(op, loc, to) \
1820 insert_op1 (op, loc, (to) - (loc) - 3, b)
1821
1822/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1823#define INSERT_JUMP2(op, loc, to, arg) \
1824 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1825
1826
1827/* This is not an arbitrary limit: the arguments which represent offsets
839966f3 1828 into the pattern are two bytes long. So if 2^15 bytes turns out to
fa9a63c5 1829 be too small, many things would have to change. */
839966f3
KH
1830# define MAX_BUF_SIZE (1L << 15)
1831
1832#if 0 /* This is when we thought it could be 2^16 bytes. */
4bb91c68
SM
1833/* Any other compiler which, like MSC, has allocation limit below 2^16
1834 bytes will have to use approach similar to what was done below for
1835 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1836 reallocating to 0 bytes. Such thing is not going to work too well.
1837 You have been warned!! */
1838#if defined _MSC_VER && !defined WIN32
1839/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1840# define MAX_BUF_SIZE 65500L
1841#else
1842# define MAX_BUF_SIZE (1L << 16)
1843#endif
839966f3 1844#endif /* 0 */
fa9a63c5
RM
1845
/* Double the size of the compiled-pattern buffer via realloc (capping
   it at MAX_BUF_SIZE) and reset the pointers that pointed into the old
   block to point to the correct places in the new one.  If the buffer
   is already MAX_BUF_SIZE bytes long, return REG_ESIZE from the
   surrounding function; if the reallocation fails, return REG_ESPACE.  */
1850#if __BOUNDED_POINTERS__
1851# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
381880b0
CY
1852# define MOVE_BUFFER_POINTER(P) \
1853 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1854 SET_HIGH_BOUND (P), \
1855 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
4bb91c68
SM
1856# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1857 else \
1858 { \
1859 SET_HIGH_BOUND (b); \
1860 SET_HIGH_BOUND (begalt); \
1861 if (fixup_alt_jump) \
1862 SET_HIGH_BOUND (fixup_alt_jump); \
1863 if (laststart) \
1864 SET_HIGH_BOUND (laststart); \
1865 if (pending_exact) \
1866 SET_HIGH_BOUND (pending_exact); \
1867 }
1868#else
381880b0 1869# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
4bb91c68
SM
1870# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1871#endif
fa9a63c5 1872#define EXTEND_BUFFER() \
25fe55af 1873 do { \
381880b0 1874 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1875 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1876 return REG_ESIZE; \
1877 bufp->allocated <<= 1; \
1878 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1879 bufp->allocated = MAX_BUF_SIZE; \
01618498 1880 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1881 if (bufp->buffer == NULL) \
1882 return REG_ESPACE; \
1883 /* If the buffer moved, move all the pointers into it. */ \
1884 if (old_buffer != bufp->buffer) \
1885 { \
381880b0 1886 unsigned char *new_buffer = bufp->buffer; \
4bb91c68
SM
1887 MOVE_BUFFER_POINTER (b); \
1888 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1889 if (fixup_alt_jump) \
4bb91c68 1890 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1891 if (laststart) \
4bb91c68 1892 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1893 if (pending_exact) \
4bb91c68 1894 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1895 } \
4bb91c68 1896 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1897 } while (0)
1898
1899
1900/* Since we have one byte reserved for the register number argument to
1901 {start,stop}_memory, the maximum number of groups we can report
1902 things about is what fits in that byte. */
1903#define MAX_REGNUM 255
1904
1905/* But patterns can have more than `MAX_REGNUM' registers. We just
1906 ignore the excess. */
098d42af 1907typedef int regnum_t;
fa9a63c5
RM
1908
1909
1910/* Macros for the compile stack. */
1911
1912/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1913 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1914/* int may be not enough when sizeof(int) == 2. */
1915typedef long pattern_offset_t;
fa9a63c5
RM
1916
1917typedef struct
1918{
1919 pattern_offset_t begalt_offset;
1920 pattern_offset_t fixup_alt_jump;
5e69f11e 1921 pattern_offset_t laststart_offset;
fa9a63c5
RM
1922 regnum_t regnum;
1923} compile_stack_elt_t;
1924
1925
1926typedef struct
1927{
1928 compile_stack_elt_t *stack;
1929 unsigned size;
1930 unsigned avail; /* Offset of next open position. */
1931} compile_stack_type;
1932
1933
1934#define INIT_COMPILE_STACK_SIZE 32
1935
1936#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1937#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1938
4bb91c68 1939/* The next available element. */
fa9a63c5
RM
1940#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1941
1cee1e27
SM
1942/* Explicit quit checking is only used on NTemacs and whenever we
1943 use polling to process input events. */
1944#if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
77d11aec
RS
1945extern int immediate_quit;
1946# define IMMEDIATE_QUIT_CHECK \
1947 do { \
1948 if (immediate_quit) QUIT; \
1949 } while (0)
1950#else
1951# define IMMEDIATE_QUIT_CHECK ((void)0)
1952#endif
1953\f
b18215fc
RS
1954/* Work area for the range table of a charset: collects range
   endpoints (stored as consecutive START, END ints) and the
   character-class bits recorded for the charset.  */
1955struct range_table_work_area
1956{
1957 int *table; /* Heap-allocated array of range endpoints; 0 if not allocated. */
1958 int allocated; /* Allocated size of TABLE, in bytes. */
7814e705 1959 int used; /* Number of ints of TABLE in use, i.e. the next free index. */
96cc36cc 1960 int bits; /* OR of the BIT_* flags recording character classes. */
b18215fc
RS
1961};
1962
77d11aec
RS
/* Make sure that WORK_AREA has room for N more ints (range endpoints).
   WORK_AREA must be a `struct range_table_work_area' lvalue, not a
   pointer to one: its members are accessed with `.' and its address
   is passed to extend_range_table_work_area.
   The area is extended repeatedly until N more ints fit, so N may
   exceed the amount added by a single extension.
   If the space cannot be obtained (the table pointer is null after an
   extension), this returns REG_ESPACE from the surrounding function.  */

#define EXTEND_RANGE_TABLE(work_area, n)				\
  do {									\
    while (((work_area).used + (n)) * sizeof (int)			\
	   > (work_area).allocated)					\
      {									\
	extend_range_table_work_area (&(work_area));			\
	if ((work_area).table == 0)					\
	  return (REG_ESPACE);						\
      }									\
  } while (0)
1977
96cc36cc
RS
1978#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1979 (work_area).bits |= (bit)
1980
14473664
SM
1981/* Bits used to implement the multibyte-part of the various character classes
1982 such as [:alnum:] in a charset's range table. */
1983#define BIT_WORD 0x1
1984#define BIT_LOWER 0x2
1985#define BIT_PUNCT 0x4
1986#define BIT_SPACE 0x8
1987#define BIT_UPPER 0x10
1988#define BIT_MULTIBYTE 0x20
96cc36cc 1989
b18215fc
RS
1990/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1991#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
77d11aec 1992 do { \
8f924df7 1993 EXTEND_RANGE_TABLE ((work_area), 2); \
b18215fc
RS
1994 (work_area).table[(work_area).used++] = (range_start); \
1995 (work_area).table[(work_area).used++] = (range_end); \
1996 } while (0)
1997
7814e705 1998/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1999#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
2000 do { \
2001 if ((work_area).table) \
2002 free ((work_area).table); \
2003 } while (0)
2004
96cc36cc 2005#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 2006#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 2007#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 2008#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 2009\f
b18215fc 2010
fa9a63c5 2011/* Set the bit for character C in a list. */
01618498 2012#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
2013
2014
bf216479
KH
2015#ifdef emacs
2016
cf9c99bc
KH
2017/* Store characters in the range FROM to TO in the bitmap at B (for
2018 ASCII and unibyte characters) and WORK_AREA (for multibyte
2019 characters) while translating them and paying attention to the
2020 continuity of translated characters.
8f924df7 2021
cf9c99bc
KH
2022 Implementation note: It is better to implement these fairly big
2023 macros by a function, but it's not that easy because macros called
8f924df7 2024 in this macro assume various local variables already declared. */
bf216479 2025
cf9c99bc
KH
2026/* Both FROM and TO are ASCII characters. */
2027
2028#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
2029 do { \
2030 int C0, C1; \
2031 \
2032 for (C0 = (FROM); C0 <= (TO); C0++) \
2033 { \
2034 C1 = TRANSLATE (C0); \
2035 if (! ASCII_CHAR_P (C1)) \
2036 { \
2037 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2038 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
2039 C1 = C0; \
2040 } \
2041 SET_LIST_BIT (C1); \
2042 } \
2043 } while (0)
2044
2045
2046/* Both FROM and TO are unibyte characters (0x80..0xFF). */
2047
2048#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
2049 do { \
2050 int C0, C1, C2, I; \
2051 int USED = RANGE_TABLE_WORK_USED (work_area); \
2052 \
2053 for (C0 = (FROM); C0 <= (TO); C0++) \
2054 { \
2055 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
2056 if (CHAR_BYTE8_P (C1)) \
2057 SET_LIST_BIT (C0); \
2058 else \
2059 { \
2060 C2 = TRANSLATE (C1); \
2061 if (C2 == C1 \
2062 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
2063 C1 = C0; \
2064 SET_LIST_BIT (C1); \
2065 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2066 { \
2067 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2068 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2069 \
2070 if (C2 >= from - 1 && C2 <= to + 1) \
2071 { \
2072 if (C2 == from - 1) \
2073 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2074 else if (C2 == to + 1) \
2075 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2076 break; \
2077 } \
2078 } \
2079 if (I < USED) \
2080 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
2081 } \
2082 } \
2083 } while (0)
2084
2085
78edd3b7 2086/* Both FROM and TO are multibyte characters. */
cf9c99bc
KH
2087
2088#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
2089 do { \
2090 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
2091 \
2092 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
2093 for (C0 = (FROM); C0 <= (TO); C0++) \
2094 { \
2095 C1 = TRANSLATE (C0); \
2096 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
2097 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
2098 SET_LIST_BIT (C2); \
2099 if (C1 >= (FROM) && C1 <= (TO)) \
2100 continue; \
2101 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2102 { \
2103 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2104 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2105 \
2106 if (C1 >= from - 1 && C1 <= to + 1) \
2107 { \
2108 if (C1 == from - 1) \
2109 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2110 else if (C1 == to + 1) \
2111 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2112 break; \
2113 } \
2114 } \
2115 if (I < USED) \
2116 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2117 } \
bf216479
KH
2118 } while (0)
2119
2120#endif /* emacs */
2121
fa9a63c5 2122/* Get the next unsigned number in the uncompiled pattern.
   Reads decimal digits with PATFETCH and accumulates their value in
   NUM.  NUM is reset to 0 if it is negative when a digit is seen, so
   it is left untouched when no digit is present.  The first non-digit
   character fetched is left in `c'.
   Leaves the surrounding function through FREE_STACK_RETURN with
   REG_EBRACE if the pattern ends before a non-digit has been fetched,
   and with REG_BADBR if the `num / 10 != prev' check fails after a
   digit has been added.
   Uses the variables `p', `pend' and `c' of the surrounding function.
   NOTE(review): that check runs after `num * 10 + c - '0'' has already
   been computed, so it presumably serves as the overflow guard; signed
   overflow is undefined in C, so confirm it is reliable here.  */
25fe55af 2123#define GET_UNSIGNED_NUMBER(num) \
c72b0edd
SM
2124 do { \
2125 if (p == pend) \
2126 FREE_STACK_RETURN (REG_EBRACE); \
2127 else \
2128 { \
2129 PATFETCH (c); \
2130 while ('0' <= c && c <= '9') \
2131 { \
2132 int prev; \
2133 if (num < 0) \
2134 num = 0; \
2135 prev = num; \
2136 num = num * 10 + c - '0'; \
2137 if (num / 10 != prev) \
2138 FREE_STACK_RETURN (REG_BADBR); \
2139 if (p == pend) \
2140 FREE_STACK_RETURN (REG_EBRACE); \
2141 PATFETCH (c); \
2142 } \
2143 } \
2144 } while (0)
77d11aec 2145\f
1fdab503 2146#if ! WIDE_CHAR_SUPPORT
01618498 2147
14473664 2148/* Map a string to the char class it names (if any). */
1fdab503 2149re_wctype_t
971de7fb 2150re_wctype (const re_char *str)
14473664 2151{
ada30c0e 2152 const char *string = str;
14473664
SM
2153 if (STREQ (string, "alnum")) return RECC_ALNUM;
2154 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2155 else if (STREQ (string, "word")) return RECC_WORD;
2156 else if (STREQ (string, "ascii")) return RECC_ASCII;
2157 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2158 else if (STREQ (string, "graph")) return RECC_GRAPH;
2159 else if (STREQ (string, "lower")) return RECC_LOWER;
2160 else if (STREQ (string, "print")) return RECC_PRINT;
2161 else if (STREQ (string, "punct")) return RECC_PUNCT;
2162 else if (STREQ (string, "space")) return RECC_SPACE;
2163 else if (STREQ (string, "upper")) return RECC_UPPER;
2164 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2165 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2166 else if (STREQ (string, "digit")) return RECC_DIGIT;
2167 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2168 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2169 else if (STREQ (string, "blank")) return RECC_BLANK;
2170 else return 0;
2171}
2172
e0f24100 2173/* True if CH is in the char class CC. */
1fdab503 2174boolean
971de7fb 2175re_iswctype (int ch, re_wctype_t cc)
14473664
SM
2176{
2177 switch (cc)
2178 {
0cdd06f8
SM
2179 case RECC_ALNUM: return ISALNUM (ch);
2180 case RECC_ALPHA: return ISALPHA (ch);
2181 case RECC_BLANK: return ISBLANK (ch);
2182 case RECC_CNTRL: return ISCNTRL (ch);
2183 case RECC_DIGIT: return ISDIGIT (ch);
2184 case RECC_GRAPH: return ISGRAPH (ch);
2185 case RECC_LOWER: return ISLOWER (ch);
2186 case RECC_PRINT: return ISPRINT (ch);
2187 case RECC_PUNCT: return ISPUNCT (ch);
2188 case RECC_SPACE: return ISSPACE (ch);
2189 case RECC_UPPER: return ISUPPER (ch);
2190 case RECC_XDIGIT: return ISXDIGIT (ch);
2191 case RECC_ASCII: return IS_REAL_ASCII (ch);
2192 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2193 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2194 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2195 case RECC_WORD: return ISWORD (ch);
2196 case RECC_ERROR: return false;
2197 default:
2198 abort();
14473664
SM
2199 }
2200}
fa9a63c5 2201
14473664
SM
2202/* Return a bit-pattern to use in the range-table bits to match multibyte
2203 chars of class CC. */
2204static int
971de7fb 2205re_wctype_to_bit (re_wctype_t cc)
14473664
SM
2206{
2207 switch (cc)
2208 {
2209 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2210 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2211 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2212 case RECC_LOWER: return BIT_LOWER;
2213 case RECC_UPPER: return BIT_UPPER;
2214 case RECC_PUNCT: return BIT_PUNCT;
2215 case RECC_SPACE: return BIT_SPACE;
14473664 2216 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2217 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2218 default:
2219 abort();
14473664
SM
2220 }
2221}
2222#endif
77d11aec
RS
2223\f
2224/* Filling in the work area of a range. */
2225
2226/* Actually extend the space in WORK_AREA. */
2227
2228static void
971de7fb 2229extend_range_table_work_area (struct range_table_work_area *work_area)
177c0ea7 2230{
77d11aec
RS
2231 work_area->allocated += 16 * sizeof (int);
2232 if (work_area->table)
2233 work_area->table
2234 = (int *) realloc (work_area->table, work_area->allocated);
2235 else
2236 work_area->table
2237 = (int *) malloc (work_area->allocated);
2238}
2239
8f924df7 2240#if 0
77d11aec
RS
2241#ifdef emacs
2242
2243/* Carefully find the ranges of codes that are equivalent
2244 under case conversion to the range start..end when passed through
2245 TRANSLATE. Handle the case where non-letters can come in between
2246 two upper-case letters (which happens in Latin-1).
2247 Also handle the case of groups of more than 2 case-equivalent chars.
2248
2249 The basic method is to look at consecutive characters and see
2250 if they can form a run that can be handled as one.
2251
2252 Returns -1 if successful, REG_ESPACE if ran out of space. */
2253
2254static int
2255set_image_of_range_1 (work_area, start, end, translate)
2256 RE_TRANSLATE_TYPE translate;
2257 struct range_table_work_area *work_area;
2258 re_wchar_t start, end;
2259{
2260 /* `one_case' indicates a character, or a run of characters,
2261 each of which is an isolate (no case-equivalents).
2262 This includes all ASCII non-letters.
2263
2264 `two_case' indicates a character, or a run of characters,
2265 each of which has two case-equivalent forms.
2266 This includes all ASCII letters.
2267
2268 `strange' indicates a character that has more than one
2269 case-equivalent. */
177c0ea7 2270
77d11aec
RS
2271 enum case_type {one_case, two_case, strange};
2272
2273 /* Describe the run that is in progress,
2274 which the next character can try to extend.
2275 If run_type is strange, that means there really is no run.
2276 If run_type is one_case, then run_start...run_end is the run.
2277 If run_type is two_case, then the run is run_start...run_end,
2278 and the case-equivalents end at run_eqv_end. */
2279
2280 enum case_type run_type = strange;
2281 int run_start, run_end, run_eqv_end;
2282
2283 Lisp_Object eqv_table;
2284
2285 if (!RE_TRANSLATE_P (translate))
2286 {
b7c12565 2287 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2288 work_area->table[work_area->used++] = (start);
2289 work_area->table[work_area->used++] = (end);
b7c12565 2290 return -1;
77d11aec
RS
2291 }
2292
2293 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2294
77d11aec
RS
2295 for (; start <= end; start++)
2296 {
2297 enum case_type this_type;
2298 int eqv = RE_TRANSLATE (eqv_table, start);
2299 int minchar, maxchar;
2300
2301 /* Classify this character */
2302 if (eqv == start)
2303 this_type = one_case;
2304 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2305 this_type = two_case;
2306 else
2307 this_type = strange;
2308
2309 if (start < eqv)
2310 minchar = start, maxchar = eqv;
2311 else
2312 minchar = eqv, maxchar = start;
2313
2314 /* Can this character extend the run in progress? */
2315 if (this_type == strange || this_type != run_type
2316 || !(minchar == run_end + 1
2317 && (run_type == two_case
2318 ? maxchar == run_eqv_end + 1 : 1)))
2319 {
2320 /* No, end the run.
2321 Record each of its equivalent ranges. */
2322 if (run_type == one_case)
2323 {
2324 EXTEND_RANGE_TABLE (work_area, 2);
2325 work_area->table[work_area->used++] = run_start;
2326 work_area->table[work_area->used++] = run_end;
2327 }
2328 else if (run_type == two_case)
2329 {
2330 EXTEND_RANGE_TABLE (work_area, 4);
2331 work_area->table[work_area->used++] = run_start;
2332 work_area->table[work_area->used++] = run_end;
2333 work_area->table[work_area->used++]
2334 = RE_TRANSLATE (eqv_table, run_start);
2335 work_area->table[work_area->used++]
2336 = RE_TRANSLATE (eqv_table, run_end);
2337 }
2338 run_type = strange;
2339 }
177c0ea7 2340
77d11aec
RS
2341 if (this_type == strange)
2342 {
2343 /* For a strange character, add each of its equivalents, one
2344 by one. Don't start a range. */
2345 do
2346 {
2347 EXTEND_RANGE_TABLE (work_area, 2);
2348 work_area->table[work_area->used++] = eqv;
2349 work_area->table[work_area->used++] = eqv;
2350 eqv = RE_TRANSLATE (eqv_table, eqv);
2351 }
2352 while (eqv != start);
2353 }
2354
2355 /* Add this char to the run, or start a new run. */
2356 else if (run_type == strange)
2357 {
2358 /* Initialize a new range. */
2359 run_type = this_type;
2360 run_start = start;
2361 run_end = start;
2362 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2363 }
2364 else
2365 {
2366 /* Extend a running range. */
2367 run_end = minchar;
2368 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2369 }
2370 }
2371
2372 /* If a run is still in progress at the end, finish it now
2373 by recording its equivalent ranges. */
2374 if (run_type == one_case)
2375 {
2376 EXTEND_RANGE_TABLE (work_area, 2);
2377 work_area->table[work_area->used++] = run_start;
2378 work_area->table[work_area->used++] = run_end;
2379 }
2380 else if (run_type == two_case)
2381 {
2382 EXTEND_RANGE_TABLE (work_area, 4);
2383 work_area->table[work_area->used++] = run_start;
2384 work_area->table[work_area->used++] = run_end;
2385 work_area->table[work_area->used++]
2386 = RE_TRANSLATE (eqv_table, run_start);
2387 work_area->table[work_area->used++]
2388 = RE_TRANSLATE (eqv_table, run_end);
2389 }
2390
2391 return -1;
2392}
36595814 2393
77d11aec 2394#endif /* emacs */
36595814 2395
2b34df4e 2396/* Record the image of the range start..end when passed through
36595814
SM
2397 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2398 and is not even necessarily contiguous.
b7c12565
RS
2399 Normally we approximate it with the smallest contiguous range that contains
2400 all the chars we need. However, for Latin-1 we go to extra effort
2401 to do a better job.
2402
2403 This function is not called for ASCII ranges.
77d11aec
RS
2404
2405 Returns -1 if successful, REG_ESPACE if ran out of space. */
2406
2407static int
36595814
SM
2408set_image_of_range (work_area, start, end, translate)
2409 RE_TRANSLATE_TYPE translate;
2410 struct range_table_work_area *work_area;
2411 re_wchar_t start, end;
2412{
77d11aec
RS
2413 re_wchar_t cmin, cmax;
2414
2415#ifdef emacs
2416 /* For Latin-1 ranges, use set_image_of_range_1
2417 to get proper handling of ranges that include letters and nonletters.
b7c12565 2418 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2419 For other character sets, we don't bother to get this right. */
b7c12565
RS
2420 if (RE_TRANSLATE_P (translate) && start < 04400
2421 && !(start < 04200 && end >= 04377))
77d11aec 2422 {
b7c12565 2423 int newend;
77d11aec 2424 int tem;
b7c12565
RS
2425 newend = end;
2426 if (newend > 04377)
2427 newend = 04377;
2428 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2429 if (tem > 0)
2430 return tem;
2431
2432 start = 04400;
2433 if (end < 04400)
2434 return -1;
2435 }
2436#endif
2437
b7c12565
RS
2438 EXTEND_RANGE_TABLE (work_area, 2);
2439 work_area->table[work_area->used++] = (start);
2440 work_area->table[work_area->used++] = (end);
2441
2442 cmin = -1, cmax = -1;
77d11aec 2443
36595814 2444 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2445 {
2446 int ch;
77d11aec 2447
b7c12565
RS
2448 for (ch = start; ch <= end; ch++)
2449 {
2450 re_wchar_t c = TRANSLATE (ch);
2451 if (! (start <= c && c <= end))
2452 {
2453 if (cmin == -1)
2454 cmin = c, cmax = c;
2455 else
2456 {
2457 cmin = MIN (cmin, c);
2458 cmax = MAX (cmax, c);
2459 }
2460 }
2461 }
2462
2463 if (cmin != -1)
2464 {
2465 EXTEND_RANGE_TABLE (work_area, 2);
2466 work_area->table[work_area->used++] = (cmin);
2467 work_area->table[work_area->used++] = (cmax);
2468 }
2469 }
36595814 2470
77d11aec
RS
2471 return -1;
2472}
8f924df7 2473#endif /* 0 */
fa9a63c5
RM
2474\f
2475#ifndef MATCH_MAY_ALLOCATE
2476
2477/* If we cannot allocate large objects within re_match_2_internal,
2478 we make the fail stack and register vectors global.
2479 The fail stack, we grow to the maximum size when a regexp
2480 is compiled.
2481 The register vectors, we adjust in size each time we
2482 compile a regexp, according to the number of registers it needs. */
2483
2484static fail_stack_type fail_stack;
2485
2486/* Size with which the following vectors are currently allocated.
2487 That is so we can make them bigger as needed,
4bb91c68 2488 but never make them smaller. */
fa9a63c5
RM
2489static int regs_allocated_size;
2490
66f0296e
SM
2491static re_char ** regstart, ** regend;
2492static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2493
2494/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2495 but don't make them smaller. */
fa9a63c5
RM
2496
2497static
2498regex_grow_registers (num_regs)
2499 int num_regs;
2500{
2501 if (num_regs > regs_allocated_size)
2502 {
66f0296e
SM
2503 RETALLOC_IF (regstart, num_regs, re_char *);
2504 RETALLOC_IF (regend, num_regs, re_char *);
2505 RETALLOC_IF (best_regstart, num_regs, re_char *);
2506 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2507
2508 regs_allocated_size = num_regs;
2509 }
2510}
2511
2512#endif /* not MATCH_MAY_ALLOCATE */
2513\f
99633e97
SM
2514static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2515 compile_stack,
2516 regnum_t regnum));
2517
fa9a63c5
RM
2518/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2519 Returns one of error codes defined in `regex.h', or zero for success.
2520
2521 Assumes the `allocated' (and perhaps `buffer') and `translate'
2522 fields are set in BUFP on entry.
2523
2524 If it succeeds, results are put in BUFP (if it returns an error, the
2525 contents of BUFP are undefined):
2526 `buffer' is the compiled pattern;
2527 `syntax' is set to SYNTAX;
2528 `used' is set to the length of the compiled pattern;
2529 `fastmap_accurate' is zero;
2530 `re_nsub' is the number of subexpressions in PATTERN;
2531 `not_bol' and `not_eol' are zero;
5e69f11e 2532
c0f9ea08 2533 The `fastmap' field is neither examined nor set. */
fa9a63c5 2534
505bde11
SM
2535/* Insert the `jump' from the end of last alternative to "here".
2536 The space for the jump has already been allocated. */
2537#define FIXUP_ALT_JUMP() \
2538do { \
2539 if (fixup_alt_jump) \
2540 STORE_JUMP (jump, fixup_alt_jump, b); \
2541} while (0)
2542
2543
fa9a63c5
RM
2544/* Return, freeing storage we allocated. */
2545#define FREE_STACK_RETURN(value) \
b18215fc
RS
2546 do { \
2547 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2548 free (compile_stack.stack); \
2549 return value; \
2550 } while (0)
fa9a63c5
RM
2551
2552static reg_errcode_t
971de7fb 2553regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
fa9a63c5 2554{
01618498
SM
2555 /* We fetch characters from PATTERN here. */
2556 register re_wchar_t c, c1;
5e69f11e 2557
fa9a63c5 2558 /* A random temporary spot in PATTERN. */
66f0296e 2559 re_char *p1;
fa9a63c5
RM
2560
2561 /* Points to the end of the buffer, where we should append. */
2562 register unsigned char *b;
5e69f11e 2563
fa9a63c5
RM
2564 /* Keeps track of unclosed groups. */
2565 compile_stack_type compile_stack;
2566
2567 /* Points to the current (ending) position in the pattern. */
22336245
RS
2568#ifdef AIX
2569 /* `const' makes AIX compiler fail. */
66f0296e 2570 unsigned char *p = pattern;
22336245 2571#else
66f0296e 2572 re_char *p = pattern;
22336245 2573#endif
66f0296e 2574 re_char *pend = pattern + size;
5e69f11e 2575
fa9a63c5 2576 /* How to translate the characters in the pattern. */
6676cb1c 2577 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2578
2579 /* Address of the count-byte of the most recently inserted `exactn'
2580 command. This makes it possible to tell if a new exact-match
2581 character can be added to that command or if the character requires
2582 a new `exactn' command. */
2583 unsigned char *pending_exact = 0;
2584
2585 /* Address of start of the most recently finished expression.
2586 This tells, e.g., postfix * where to find the start of its
2587 operand. Reset at the beginning of groups and alternatives. */
2588 unsigned char *laststart = 0;
2589
2590 /* Address of beginning of regexp, or inside of last group. */
2591 unsigned char *begalt;
2592
2593 /* Place in the uncompiled pattern (i.e., the {) to
2594 which to go back if the interval is invalid. */
66f0296e 2595 re_char *beg_interval;
5e69f11e 2596
fa9a63c5 2597 /* Address of the place where a forward jump should go to the end of
7814e705 2598 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2599 last -- ends with a forward jump of this sort. */
2600 unsigned char *fixup_alt_jump = 0;
2601
b18215fc
RS
2602 /* Work area for range table of charset. */
2603 struct range_table_work_area range_table_work;
2604
2d1675e4
SM
2605 /* If the object matched can contain multibyte characters. */
2606 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2607
8f924df7 2608 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2609 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2610
f9b0fd99
RS
2611 /* Nonzero if we have pushed down into a subpattern. */
2612 int in_subpattern = 0;
2613
2614 /* These hold the values of p, pattern, and pend from the main
2615 pattern when we have pushed into a subpattern. */
2616 re_char *main_p;
2617 re_char *main_pattern;
2618 re_char *main_pend;
2619
fa9a63c5 2620#ifdef DEBUG
99633e97 2621 debug++;
fa9a63c5 2622 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2623 if (debug > 0)
fa9a63c5
RM
2624 {
2625 unsigned debug_count;
5e69f11e 2626
fa9a63c5 2627 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2628 putchar (pattern[debug_count]);
fa9a63c5
RM
2629 putchar ('\n');
2630 }
2631#endif /* DEBUG */
2632
2633 /* Initialize the compile stack. */
2634 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2635 if (compile_stack.stack == NULL)
2636 return REG_ESPACE;
2637
2638 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2639 compile_stack.avail = 0;
2640
b18215fc
RS
2641 range_table_work.table = 0;
2642 range_table_work.allocated = 0;
2643
fa9a63c5
RM
2644 /* Initialize the pattern buffer. */
2645 bufp->syntax = syntax;
2646 bufp->fastmap_accurate = 0;
2647 bufp->not_bol = bufp->not_eol = 0;
6224b623 2648 bufp->used_syntax = 0;
fa9a63c5
RM
2649
2650 /* Set `used' to zero, so that if we return an error, the pattern
2651 printer (for debugging) will think there's no pattern. We reset it
2652 at the end. */
2653 bufp->used = 0;
5e69f11e 2654
fa9a63c5 2655 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2656 bufp->re_nsub = 0;
fa9a63c5 2657
0b32bf0e 2658#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2659 /* Initialize the syntax table. */
2660 init_syntax_once ();
2661#endif
2662
2663 if (bufp->allocated == 0)
2664 {
2665 if (bufp->buffer)
2666 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af 2667 enough space. This loses if buffer's address is bogus, but
7814e705 2668 that is the user's responsibility. */
25fe55af
RS
2669 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2670 }
fa9a63c5 2671 else
7814e705 2672 { /* Caller did not allocate a buffer. Do it for them. */
25fe55af
RS
2673 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2674 }
fa9a63c5
RM
2675 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2676
2677 bufp->allocated = INIT_BUF_SIZE;
2678 }
2679
2680 begalt = b = bufp->buffer;
2681
2682 /* Loop through the uncompiled pattern until we're at the end. */
f9b0fd99 2683 while (1)
fa9a63c5 2684 {
f9b0fd99
RS
2685 if (p == pend)
2686 {
2687 /* If this is the end of an included regexp,
2688 pop back to the main regexp and try again. */
2689 if (in_subpattern)
2690 {
2691 in_subpattern = 0;
2692 pattern = main_pattern;
2693 p = main_p;
2694 pend = main_pend;
2695 continue;
2696 }
2697 /* If this is the end of the main regexp, we are done. */
2698 break;
2699 }
2700
fa9a63c5
RM
2701 PATFETCH (c);
2702
2703 switch (c)
25fe55af 2704 {
f9b0fd99
RS
2705 case ' ':
2706 {
2707 re_char *p1 = p;
2708
2709 /* If there's no special whitespace regexp, treat
4fb680cd
RS
2710 spaces normally. And don't try to do this recursively. */
2711 if (!whitespace_regexp || in_subpattern)
f9b0fd99
RS
2712 goto normal_char;
2713
2714 /* Peek past following spaces. */
2715 while (p1 != pend)
2716 {
2717 if (*p1 != ' ')
2718 break;
2719 p1++;
2720 }
2721 /* If the spaces are followed by a repetition op,
2722 treat them normally. */
c721eee5
RS
2723 if (p1 != pend
2724 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
f9b0fd99
RS
2725 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2726 goto normal_char;
2727
2728 /* Replace the spaces with the whitespace regexp. */
2729 in_subpattern = 1;
2730 main_p = p1;
2731 main_pend = pend;
2732 main_pattern = pattern;
2733 p = pattern = whitespace_regexp;
2734 pend = p + strlen (p);
2735 break;
7814e705 2736 }
f9b0fd99 2737
25fe55af
RS
2738 case '^':
2739 {
7814e705 2740 if ( /* If at start of pattern, it's an operator. */
25fe55af 2741 p == pattern + 1
7814e705 2742 /* If context independent, it's an operator. */
25fe55af 2743 || syntax & RE_CONTEXT_INDEP_ANCHORS
7814e705 2744 /* Otherwise, depends on what's come before. */
25fe55af 2745 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2746 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2747 else
2748 goto normal_char;
2749 }
2750 break;
2751
2752
2753 case '$':
2754 {
2755 if ( /* If at end of pattern, it's an operator. */
2756 p == pend
7814e705 2757 /* If context independent, it's an operator. */
25fe55af
RS
2758 || syntax & RE_CONTEXT_INDEP_ANCHORS
2759 /* Otherwise, depends on what's next. */
2760 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2761 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2762 else
2763 goto normal_char;
2764 }
2765 break;
fa9a63c5
RM
2766
2767
2768 case '+':
25fe55af
RS
2769 case '?':
2770 if ((syntax & RE_BK_PLUS_QM)
2771 || (syntax & RE_LIMITED_OPS))
2772 goto normal_char;
2773 handle_plus:
2774 case '*':
2775 /* If there is no previous pattern... */
2776 if (!laststart)
2777 {
2778 if (syntax & RE_CONTEXT_INVALID_OPS)
2779 FREE_STACK_RETURN (REG_BADRPT);
2780 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2781 goto normal_char;
2782 }
2783
2784 {
7814e705 2785 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2786 boolean zero_times_ok = 0, many_times_ok = 0;
2787 boolean greedy = 1;
25fe55af
RS
2788
2789 /* If there is a sequence of repetition chars, collapse it
2790 down to just one (the right one). We can't combine
2791 interval operators with these because of, e.g., `a{2}*',
7814e705 2792 which should only match an even number of `a's. */
25fe55af
RS
2793
2794 for (;;)
2795 {
0b32bf0e 2796 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2797 && c == '?' && (zero_times_ok || many_times_ok))
2798 greedy = 0;
2799 else
2800 {
2801 zero_times_ok |= c != '+';
2802 many_times_ok |= c != '?';
2803 }
25fe55af
RS
2804
2805 if (p == pend)
2806 break;
ed0767d8
SM
2807 else if (*p == '*'
2808 || (!(syntax & RE_BK_PLUS_QM)
2809 && (*p == '+' || *p == '?')))
25fe55af 2810 ;
ed0767d8 2811 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2812 {
ed0767d8
SM
2813 if (p+1 == pend)
2814 FREE_STACK_RETURN (REG_EESCAPE);
2815 if (p[1] == '+' || p[1] == '?')
2816 PATFETCH (c); /* Gobble up the backslash. */
2817 else
2818 break;
25fe55af
RS
2819 }
2820 else
ed0767d8 2821 break;
25fe55af 2822 /* If we get here, we found another repeat character. */
ed0767d8
SM
2823 PATFETCH (c);
2824 }
25fe55af
RS
2825
2826 /* Star, etc. applied to an empty pattern is equivalent
2827 to an empty pattern. */
4e8a9132 2828 if (!laststart || laststart == b)
25fe55af
RS
2829 break;
2830
2831 /* Now we know whether or not zero matches is allowed
7814e705 2832 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2833 if (greedy)
2834 {
99633e97 2835 if (many_times_ok)
4e8a9132
SM
2836 {
2837 boolean simple = skip_one_char (laststart) == b;
2838 unsigned int startoffset = 0;
f6a3f532 2839 re_opcode_t ofj =
01618498 2840 /* Check if the loop can match the empty string. */
6df42991
SM
2841 (simple || !analyse_first (laststart, b, NULL, 0))
2842 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2843 assert (skip_one_char (laststart) <= b);
177c0ea7 2844
4e8a9132
SM
2845 if (!zero_times_ok && simple)
2846 { /* Since simple * loops can be made faster by using
2847 on_failure_keep_string_jump, we turn simple P+
2848 into PP* if P is simple. */
2849 unsigned char *p1, *p2;
2850 startoffset = b - laststart;
2851 GET_BUFFER_SPACE (startoffset);
2852 p1 = b; p2 = laststart;
2853 while (p2 < p1)
2854 *b++ = *p2++;
2855 zero_times_ok = 1;
99633e97 2856 }
4e8a9132
SM
2857
2858 GET_BUFFER_SPACE (6);
2859 if (!zero_times_ok)
2860 /* A + loop. */
f6a3f532 2861 STORE_JUMP (ofj, b, b + 6);
99633e97 2862 else
4e8a9132
SM
2863 /* Simple * loops can use on_failure_keep_string_jump
2864 depending on what follows. But since we don't know
2865 that yet, we leave the decision up to
2866 on_failure_jump_smart. */
f6a3f532 2867 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2868 laststart + startoffset, b + 6);
99633e97 2869 b += 3;
4e8a9132 2870 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2871 b += 3;
2872 }
2873 else
2874 {
4e8a9132
SM
2875 /* A simple ? pattern. */
2876 assert (zero_times_ok);
2877 GET_BUFFER_SPACE (3);
2878 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2879 b += 3;
2880 }
1c8c6d39
DL
2881 }
2882 else /* not greedy */
2883 { /* I wish the greedy and non-greedy cases could be merged. */
2884
0683b6fa 2885 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2886 if (many_times_ok)
2887 {
f6a3f532
SM
2888 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2889
6df42991
SM
2890 /* The non-greedy multiple match looks like
2891 a repeat..until: we only need a conditional jump
2892 at the end of the loop. */
f6a3f532
SM
2893 if (emptyp) BUF_PUSH (no_op);
2894 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2895 : on_failure_jump, b, laststart);
1c8c6d39
DL
2896 b += 3;
2897 if (zero_times_ok)
2898 {
2899 /* The repeat...until naturally matches one or more.
2900 To also match zero times, we need to first jump to
6df42991 2901 the end of the loop (its conditional jump). */
1c8c6d39
DL
2902 INSERT_JUMP (jump, laststart, b);
2903 b += 3;
2904 }
2905 }
2906 else
2907 {
2908 /* non-greedy a?? */
1c8c6d39
DL
2909 INSERT_JUMP (jump, laststart, b + 3);
2910 b += 3;
2911 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2912 b += 3;
2913 }
2914 }
2915 }
4e8a9132 2916 pending_exact = 0;
fa9a63c5
RM
2917 break;
2918
2919
2920 case '.':
25fe55af
RS
2921 laststart = b;
2922 BUF_PUSH (anychar);
2923 break;
fa9a63c5
RM
2924
2925
25fe55af
RS
2926 case '[':
2927 {
b18215fc 2928 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2929
25fe55af 2930 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2931
25fe55af
RS
2932 /* Ensure that we have enough space to push a charset: the
2933 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2934 GET_BUFFER_SPACE (34);
2935
25fe55af 2936 laststart = b;
e318085a 2937
25fe55af 2938 /* We test `*p == '^' twice, instead of using an if
7814e705 2939 statement, so we only need one BUF_PUSH. */
25fe55af
RS
2940 BUF_PUSH (*p == '^' ? charset_not : charset);
2941 if (*p == '^')
2942 p++;
e318085a 2943
25fe55af
RS
2944 /* Remember the first position in the bracket expression. */
2945 p1 = p;
e318085a 2946
7814e705 2947 /* Push the number of bytes in the bitmap. */
25fe55af 2948 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2949
25fe55af
RS
2950 /* Clear the whole map. */
2951 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2952
25fe55af
RS
2953 /* charset_not matches newline according to a syntax bit. */
2954 if ((re_opcode_t) b[-2] == charset_not
2955 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2956 SET_LIST_BIT ('\n');
fa9a63c5 2957
7814e705 2958 /* Read in characters and ranges, setting map bits. */
25fe55af
RS
2959 for (;;)
2960 {
b18215fc 2961 boolean escaped_char = false;
2d1675e4 2962 const unsigned char *p2 = p;
cf9c99bc 2963 re_wchar_t ch, c2;
e318085a 2964
25fe55af 2965 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2966
36595814
SM
2967 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2968 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2969 So the translation is done later in a loop. Example:
2970 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2971 PATFETCH (c);
e318085a 2972
25fe55af
RS
2973 /* \ might escape characters inside [...] and [^...]. */
2974 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2975 {
2976 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2977
2978 PATFETCH (c);
b18215fc 2979 escaped_char = true;
25fe55af 2980 }
b18215fc
RS
2981 else
2982 {
7814e705 2983 /* Could be the end of the bracket expression. If it's
657fcfbd
RS
2984 not (i.e., when the bracket expression is `[]' so
2985 far), the ']' character bit gets set way below. */
2d1675e4 2986 if (c == ']' && p2 != p1)
657fcfbd 2987 break;
25fe55af 2988 }
b18215fc 2989
25fe55af
RS
2990 /* See if we're at the beginning of a possible character
2991 class. */
b18215fc 2992
2d1675e4
SM
2993 if (!escaped_char &&
2994 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd 2995 {
7814e705 2996 /* Leave room for the null. */
14473664 2997 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2998 const unsigned char *class_beg;
b18215fc 2999
25fe55af
RS
3000 PATFETCH (c);
3001 c1 = 0;
ed0767d8 3002 class_beg = p;
b18215fc 3003
25fe55af
RS
3004 /* If pattern is `[[:'. */
3005 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 3006
25fe55af
RS
3007 for (;;)
3008 {
14473664
SM
3009 PATFETCH (c);
3010 if ((c == ':' && *p == ']') || p == pend)
3011 break;
3012 if (c1 < CHAR_CLASS_MAX_LENGTH)
3013 str[c1++] = c;
3014 else
3015 /* This is in any case an invalid class name. */
3016 str[0] = '\0';
25fe55af
RS
3017 }
3018 str[c1] = '\0';
b18215fc
RS
3019
3020 /* If it isn't a word bracketed by `[:' and `:]':
3021 undo the ending character, the letters, and
3022 leave the leading `:' and `[' (but set bits for
3023 them). */
25fe55af
RS
3024 if (c == ':' && *p == ']')
3025 {
14473664 3026 re_wctype_t cc;
8f924df7 3027 int limit;
14473664
SM
3028
3029 cc = re_wctype (str);
3030
3031 if (cc == 0)
fa9a63c5
RM
3032 FREE_STACK_RETURN (REG_ECTYPE);
3033
14473664
SM
3034 /* Throw away the ] at the end of the character
3035 class. */
3036 PATFETCH (c);
fa9a63c5 3037
14473664 3038 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 3039
cf9c99bc
KH
3040#ifndef emacs
3041 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
8f924df7
KH
3042 if (re_iswctype (btowc (ch), cc))
3043 {
3044 c = TRANSLATE (ch);
ed00c2ac
KH
3045 if (c < (1 << BYTEWIDTH))
3046 SET_LIST_BIT (c);
8f924df7 3047 }
cf9c99bc
KH
3048#else /* emacs */
3049 /* Most character classes in a multibyte match
3050 just set a flag. Exceptions are is_blank,
3051 is_digit, is_cntrl, and is_xdigit, since
3052 they can only match ASCII characters. We
3053 don't need to handle them for multibyte.
3054 They are distinguished by a negative wctype. */
96cc36cc 3055
254c06a8
SM
3056 /* Setup the gl_state object to its buffer-defined
3057 value. This hardcodes the buffer-global
3058 syntax-table for ASCII chars, while the other chars
3059 will obey syntax-table properties. It's not ideal,
3060 but it's the way it's been done until now. */
d48cd3f4 3061 SETUP_BUFFER_SYNTAX_TABLE ();
254c06a8 3062
cf9c99bc 3063 for (ch = 0; ch < 256; ++ch)
25fe55af 3064 {
cf9c99bc
KH
3065 c = RE_CHAR_TO_MULTIBYTE (ch);
3066 if (! CHAR_BYTE8_P (c)
3067 && re_iswctype (c, cc))
8f924df7 3068 {
cf9c99bc
KH
3069 SET_LIST_BIT (ch);
3070 c1 = TRANSLATE (c);
3071 if (c1 == c)
3072 continue;
3073 if (ASCII_CHAR_P (c1))
3074 SET_LIST_BIT (c1);
3075 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3076 SET_LIST_BIT (c1);
8f924df7 3077 }
25fe55af 3078 }
cf9c99bc
KH
3079 SET_RANGE_TABLE_WORK_AREA_BIT
3080 (range_table_work, re_wctype_to_bit (cc));
3081#endif /* emacs */
6224b623
SM
3082 /* In most cases the matching rule for char classes
3083 only uses the syntax table for multibyte chars,
3084 so that the content of the syntax-table is not
3085 hardcoded in the range_table. SPACE and WORD are
3086 the two exceptions. */
3087 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3088 bufp->used_syntax = 1;
3089
b18215fc
RS
3090 /* Repeat the loop. */
3091 continue;
25fe55af
RS
3092 }
3093 else
3094 {
ed0767d8
SM
3095 /* Go back to right after the "[:". */
3096 p = class_beg;
25fe55af 3097 SET_LIST_BIT ('[');
b18215fc
RS
3098
3099 /* Because the `:' may start a range, we
3100 can't simply set bit and repeat the loop.
7814e705 3101 Instead, just set it to C and handle below. */
b18215fc 3102 c = ':';
25fe55af
RS
3103 }
3104 }
b18215fc
RS
3105
3106 if (p < pend && p[0] == '-' && p[1] != ']')
3107 {
3108
3109 /* Discard the `-'. */
3110 PATFETCH (c1);
3111
3112 /* Fetch the character which ends the range. */
3113 PATFETCH (c1);
cf9c99bc
KH
3114#ifdef emacs
3115 if (CHAR_BYTE8_P (c1)
3116 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3117 /* Treat the range from a multibyte character to
3118 raw-byte character as empty. */
3119 c = c1 + 1;
3120#endif /* emacs */
e318085a 3121 }
25fe55af 3122 else
b18215fc
RS
3123 /* Range from C to C. */
3124 c1 = c;
3125
cf9c99bc 3126 if (c > c1)
25fe55af 3127 {
cf9c99bc
KH
3128 if (syntax & RE_NO_EMPTY_RANGES)
3129 FREE_STACK_RETURN (REG_ERANGEX);
3130 /* Else, repeat the loop. */
bf216479 3131 }
6fdd04b0 3132 else
25fe55af 3133 {
cf9c99bc
KH
3134#ifndef emacs
3135 /* Set the range into bitmap */
8f924df7 3136 for (; c <= c1; c++)
b18215fc 3137 {
cf9c99bc
KH
3138 ch = TRANSLATE (c);
3139 if (ch < (1 << BYTEWIDTH))
3140 SET_LIST_BIT (ch);
3141 }
3142#else /* emacs */
3143 if (c < 128)
3144 {
3145 ch = MIN (127, c1);
3146 SETUP_ASCII_RANGE (range_table_work, c, ch);
3147 c = ch + 1;
3148 if (CHAR_BYTE8_P (c1))
3149 c = BYTE8_TO_CHAR (128);
3150 }
3151 if (c <= c1)
3152 {
3153 if (CHAR_BYTE8_P (c))
3154 {
3155 c = CHAR_TO_BYTE8 (c);
3156 c1 = CHAR_TO_BYTE8 (c1);
3157 for (; c <= c1; c++)
3158 SET_LIST_BIT (c);
3159 }
3160 else if (multibyte)
3161 {
3162 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3163 }
3164 else
3165 {
3166 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3167 }
e934739e 3168 }
cf9c99bc 3169#endif /* emacs */
25fe55af 3170 }
e318085a
RS
3171 }
3172
25fe55af 3173 /* Discard any (non)matching list bytes that are all 0 at the
7814e705 3174 end of the map. Decrease the map-length byte too. */
25fe55af
RS
3175 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3176 b[-1]--;
3177 b += b[-1];
fa9a63c5 3178
96cc36cc
RS
3179 /* Build real range table from work area. */
3180 if (RANGE_TABLE_WORK_USED (range_table_work)
3181 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
3182 {
3183 int i;
3184 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 3185
b18215fc 3186 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3187 bytes for flags, two for COUNT, and three bytes for
3188 each character. */
3189 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3190
b18215fc
RS
3191 /* Indicate the existence of range table. */
3192 laststart[1] |= 0x80;
fa9a63c5 3193
96cc36cc
RS
3194 /* Store the character class flag bits into the range table.
3195 If not in emacs, these flag bits are always 0. */
3196 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3197 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3198
b18215fc
RS
3199 STORE_NUMBER_AND_INCR (b, used / 2);
3200 for (i = 0; i < used; i++)
3201 STORE_CHARACTER_AND_INCR
3202 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3203 }
25fe55af
RS
3204 }
3205 break;
fa9a63c5
RM
3206
3207
b18215fc 3208 case '(':
25fe55af
RS
3209 if (syntax & RE_NO_BK_PARENS)
3210 goto handle_open;
3211 else
3212 goto normal_char;
fa9a63c5
RM
3213
3214
25fe55af
RS
3215 case ')':
3216 if (syntax & RE_NO_BK_PARENS)
3217 goto handle_close;
3218 else
3219 goto normal_char;
e318085a
RS
3220
3221
25fe55af
RS
3222 case '\n':
3223 if (syntax & RE_NEWLINE_ALT)
3224 goto handle_alt;
3225 else
3226 goto normal_char;
e318085a
RS
3227
3228
b18215fc 3229 case '|':
25fe55af
RS
3230 if (syntax & RE_NO_BK_VBAR)
3231 goto handle_alt;
3232 else
3233 goto normal_char;
3234
3235
3236 case '{':
3237 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3238 goto handle_interval;
3239 else
3240 goto normal_char;
3241
3242
3243 case '\\':
3244 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3245
3246 /* Do not translate the character after the \, so that we can
3247 distinguish, e.g., \B from \b, even if we normally would
3248 translate, e.g., B to b. */
36595814 3249 PATFETCH (c);
25fe55af
RS
3250
3251 switch (c)
3252 {
3253 case '(':
3254 if (syntax & RE_NO_BK_PARENS)
3255 goto normal_backslash;
3256
3257 handle_open:
505bde11
SM
3258 {
3259 int shy = 0;
c69b0314 3260 regnum_t regnum = 0;
505bde11
SM
3261 if (p+1 < pend)
3262 {
3263 /* Look for a special (?...) construct */
ed0767d8 3264 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3265 {
ed0767d8 3266 PATFETCH (c); /* Gobble up the '?'. */
c69b0314 3267 while (!shy)
505bde11 3268 {
c69b0314
SM
3269 PATFETCH (c);
3270 switch (c)
3271 {
3272 case ':': shy = 1; break;
3273 case '0':
3274 /* An explicitly specified regnum must start
3275 with non-0. */
3276 if (regnum == 0)
3277 FREE_STACK_RETURN (REG_BADPAT);
3278 case '1': case '2': case '3': case '4':
3279 case '5': case '6': case '7': case '8': case '9':
3280 regnum = 10*regnum + (c - '0'); break;
3281 default:
3282 /* Only (?:...) is supported right now. */
3283 FREE_STACK_RETURN (REG_BADPAT);
3284 }
505bde11
SM
3285 }
3286 }
505bde11
SM
3287 }
3288
3289 if (!shy)
c69b0314
SM
3290 regnum = ++bufp->re_nsub;
3291 else if (regnum)
3292 { /* It's actually not shy, but explicitly numbered. */
3293 shy = 0;
3294 if (regnum > bufp->re_nsub)
3295 bufp->re_nsub = regnum;
3296 else if (regnum > bufp->re_nsub
3297 /* Ideally, we'd want to check that the specified
3298 group can't have matched (i.e. all subgroups
3299 using the same regnum are in other branches of
3300 OR patterns), but we don't currently keep track
3301 of enough info to do that easily. */
3302 || group_in_compile_stack (compile_stack, regnum))
3303 FREE_STACK_RETURN (REG_BADPAT);
505bde11 3304 }
c69b0314
SM
3305 else
3306 /* It's really shy. */
3307 regnum = - bufp->re_nsub;
25fe55af 3308
99633e97
SM
3309 if (COMPILE_STACK_FULL)
3310 {
3311 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3312 compile_stack_elt_t);
3313 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3314
99633e97
SM
3315 compile_stack.size <<= 1;
3316 }
25fe55af 3317
99633e97 3318 /* These are the values to restore when we hit end of this
7814e705 3319 group. They are all relative offsets, so that if the
99633e97
SM
3320 whole pattern moves because of realloc, they will still
3321 be valid. */
3322 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3323 COMPILE_STACK_TOP.fixup_alt_jump
3324 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3325 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
c69b0314 3326 COMPILE_STACK_TOP.regnum = regnum;
99633e97 3327
c69b0314
SM
3328 /* Do not push a start_memory for groups beyond the last one
3329 we can represent in the compiled pattern. */
3330 if (regnum <= MAX_REGNUM && regnum > 0)
99633e97
SM
3331 BUF_PUSH_2 (start_memory, regnum);
3332
3333 compile_stack.avail++;
3334
3335 fixup_alt_jump = 0;
3336 laststart = 0;
3337 begalt = b;
3338 /* If we've reached MAX_REGNUM groups, then this open
3339 won't actually generate any code, so we'll have to
3340 clear pending_exact explicitly. */
3341 pending_exact = 0;
3342 break;
505bde11 3343 }
25fe55af
RS
3344
3345 case ')':
3346 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3347
3348 if (COMPILE_STACK_EMPTY)
505bde11
SM
3349 {
3350 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3351 goto normal_backslash;
3352 else
3353 FREE_STACK_RETURN (REG_ERPAREN);
3354 }
25fe55af
RS
3355
3356 handle_close:
505bde11 3357 FIXUP_ALT_JUMP ();
25fe55af
RS
3358
3359 /* See similar code for backslashed left paren above. */
3360 if (COMPILE_STACK_EMPTY)
505bde11
SM
3361 {
3362 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3363 goto normal_char;
3364 else
3365 FREE_STACK_RETURN (REG_ERPAREN);
3366 }
25fe55af
RS
3367
3368 /* Since we just checked for an empty stack above, this
3369 ``can't happen''. */
3370 assert (compile_stack.avail != 0);
3371 {
3372 /* We don't just want to restore into `regnum', because
3373 later groups should continue to be numbered higher,
7814e705 3374 as in `(ab)c(de)' -- the second group is #2. */
c69b0314 3375 regnum_t regnum;
25fe55af
RS
3376
3377 compile_stack.avail--;
3378 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3379 fixup_alt_jump
3380 = COMPILE_STACK_TOP.fixup_alt_jump
3381 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3382 : 0;
3383 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
c69b0314 3384 regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3385 /* If we've reached MAX_REGNUM groups, then this close
3386 won't actually generate any code, so we'll have to
3387 clear pending_exact explicitly. */
3388 pending_exact = 0;
e318085a 3389
25fe55af 3390 /* We're at the end of the group, so now we know how many
7814e705 3391 groups were inside this one. */
c69b0314
SM
3392 if (regnum <= MAX_REGNUM && regnum > 0)
3393 BUF_PUSH_2 (stop_memory, regnum);
25fe55af
RS
3394 }
3395 break;
3396
3397
3398 case '|': /* `\|'. */
3399 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3400 goto normal_backslash;
3401 handle_alt:
3402 if (syntax & RE_LIMITED_OPS)
3403 goto normal_char;
3404
3405 /* Insert before the previous alternative a jump which
7814e705 3406 jumps to this alternative if the former fails. */
25fe55af
RS
3407 GET_BUFFER_SPACE (3);
3408 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3409 pending_exact = 0;
3410 b += 3;
3411
3412 /* The alternative before this one has a jump after it
3413 which gets executed if it gets matched. Adjust that
3414 jump so it will jump to this alternative's analogous
3415 jump (put in below, which in turn will jump to the next
3416 (if any) alternative's such jump, etc.). The last such
3417 jump jumps to the correct final destination. A picture:
3418 _____ _____
3419 | | | |
3420 | v | v
3421 a | b | c
3422
3423 If we are at `b', then fixup_alt_jump right now points to a
3424 three-byte space after `a'. We'll put in the jump, set
3425 fixup_alt_jump to right after `b', and leave behind three
3426 bytes which we'll fill in when we get to after `c'. */
3427
505bde11 3428 FIXUP_ALT_JUMP ();
25fe55af
RS
3429
3430 /* Mark and leave space for a jump after this alternative,
3431 to be filled in later either by next alternative or
3432 when we know we're at the end of a series of alternatives. */
3433 fixup_alt_jump = b;
3434 GET_BUFFER_SPACE (3);
3435 b += 3;
3436
3437 laststart = 0;
3438 begalt = b;
3439 break;
3440
3441
3442 case '{':
3443 /* If \{ is a literal. */
3444 if (!(syntax & RE_INTERVALS)
3445 /* If we're at `\{' and it's not the open-interval
3446 operator. */
4bb91c68 3447 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3448 goto normal_backslash;
3449
3450 handle_interval:
3451 {
3452 /* If got here, then the syntax allows intervals. */
3453
3454 /* At least (most) this many matches must be made. */
99633e97 3455 int lower_bound = 0, upper_bound = -1;
25fe55af 3456
ed0767d8 3457 beg_interval = p;
25fe55af 3458
25fe55af
RS
3459 GET_UNSIGNED_NUMBER (lower_bound);
3460
3461 if (c == ',')
ed0767d8 3462 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3463 else
3464 /* Interval such as `{1}' => match exactly once. */
3465 upper_bound = lower_bound;
3466
3467 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3468 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3469 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3470
3471 if (!(syntax & RE_NO_BK_BRACES))
3472 {
4bb91c68
SM
3473 if (c != '\\')
3474 FREE_STACK_RETURN (REG_BADBR);
c72b0edd
SM
3475 if (p == pend)
3476 FREE_STACK_RETURN (REG_EESCAPE);
25fe55af
RS
3477 PATFETCH (c);
3478 }
3479
3480 if (c != '}')
4bb91c68 3481 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3482
3483 /* We just parsed a valid interval. */
3484
3485 /* If it's invalid to have no preceding re. */
3486 if (!laststart)
3487 {
3488 if (syntax & RE_CONTEXT_INVALID_OPS)
3489 FREE_STACK_RETURN (REG_BADRPT);
3490 else if (syntax & RE_CONTEXT_INDEP_OPS)
3491 laststart = b;
3492 else
3493 goto unfetch_interval;
3494 }
3495
6df42991
SM
3496 if (upper_bound == 0)
3497 /* If the upper bound is zero, just drop the sub pattern
3498 altogether. */
3499 b = laststart;
3500 else if (lower_bound == 1 && upper_bound == 1)
3501 /* Just match it once: nothing to do here. */
3502 ;
3503
3504 /* Otherwise, we have a nontrivial interval. When
3505 we're all done, the pattern will look like:
3506 set_number_at <jump count> <upper bound>
3507 set_number_at <succeed_n count> <lower bound>
3508 succeed_n <after jump addr> <succeed_n count>
3509 <body of loop>
3510 jump_n <succeed_n addr> <jump count>
3511 (The upper bound and `jump_n' are omitted if
3512 `upper_bound' is 1, though.) */
3513 else
3514 { /* If the upper bound is > 1, we need to insert
3515 more at the end of the loop. */
3516 unsigned int nbytes = (upper_bound < 0 ? 3
3517 : upper_bound > 1 ? 5 : 0);
3518 unsigned int startoffset = 0;
3519
3520 GET_BUFFER_SPACE (20); /* We might use less. */
3521
3522 if (lower_bound == 0)
3523 {
3524 /* A succeed_n that starts with 0 is really a
3525 simple on_failure_jump_loop. */
3526 INSERT_JUMP (on_failure_jump_loop, laststart,
3527 b + 3 + nbytes);
3528 b += 3;
3529 }
3530 else
3531 {
3532 /* Initialize lower bound of the `succeed_n', even
3533 though it will be set during matching by its
3534 attendant `set_number_at' (inserted next),
3535 because `re_compile_fastmap' needs to know.
3536 Jump to the `jump_n' we might insert below. */
3537 INSERT_JUMP2 (succeed_n, laststart,
3538 b + 5 + nbytes,
3539 lower_bound);
3540 b += 5;
3541
3542 /* Code to initialize the lower bound. Insert
7814e705 3543 before the `succeed_n'. The `5' is the last two
6df42991
SM
3544 bytes of this `set_number_at', plus 3 bytes of
3545 the following `succeed_n'. */
3546 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3547 b += 5;
3548 startoffset += 5;
3549 }
3550
3551 if (upper_bound < 0)
3552 {
3553 /* A negative upper bound stands for infinity,
3554 in which case it degenerates to a plain jump. */
3555 STORE_JUMP (jump, b, laststart + startoffset);
3556 b += 3;
3557 }
3558 else if (upper_bound > 1)
3559 { /* More than one repetition is allowed, so
3560 append a backward jump to the `succeed_n'
3561 that starts this interval.
3562
3563 When we've reached this during matching,
3564 we'll have matched the interval once, so
3565 jump back only `upper_bound - 1' times. */
3566 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3567 upper_bound - 1);
3568 b += 5;
3569
3570 /* The location we want to set is the second
3571 parameter of the `jump_n'; that is `b-2' as
3572 an absolute address. `laststart' will be
3573 the `set_number_at' we're about to insert;
3574 `laststart+3' the number to set, the source
3575 for the relative address. But we are
3576 inserting into the middle of the pattern --
3577 so everything is getting moved up by 5.
3578 Conclusion: (b - 2) - (laststart + 3) + 5,
3579 i.e., b - laststart.
3580
3581 We insert this at the beginning of the loop
3582 so that if we fail during matching, we'll
3583 reinitialize the bounds. */
3584 insert_op2 (set_number_at, laststart, b - laststart,
3585 upper_bound - 1, b);
3586 b += 5;
3587 }
3588 }
25fe55af
RS
3589 pending_exact = 0;
3590 beg_interval = NULL;
3591 }
3592 break;
3593
3594 unfetch_interval:
3595 /* If an invalid interval, match the characters as literals. */
3596 assert (beg_interval);
3597 p = beg_interval;
3598 beg_interval = NULL;
3599
3600 /* normal_char and normal_backslash need `c'. */
ed0767d8 3601 c = '{';
25fe55af
RS
3602
3603 if (!(syntax & RE_NO_BK_BRACES))
3604 {
ed0767d8
SM
3605 assert (p > pattern && p[-1] == '\\');
3606 goto normal_backslash;
25fe55af 3607 }
ed0767d8
SM
3608 else
3609 goto normal_char;
e318085a 3610
b18215fc 3611#ifdef emacs
25fe55af 3612 /* There is no way to specify the before_dot and after_dot
7814e705 3613 operators. rms says this is ok. --karl */
25fe55af
RS
3614 case '=':
3615 BUF_PUSH (at_dot);
3616 break;
3617
3618 case 's':
3619 laststart = b;
3620 PATFETCH (c);
3621 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3622 break;
3623
3624 case 'S':
3625 laststart = b;
3626 PATFETCH (c);
3627 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3628 break;
b18215fc
RS
3629
3630 case 'c':
3631 laststart = b;
36595814 3632 PATFETCH (c);
b18215fc
RS
3633 BUF_PUSH_2 (categoryspec, c);
3634 break;
e318085a 3635
b18215fc
RS
3636 case 'C':
3637 laststart = b;
36595814 3638 PATFETCH (c);
b18215fc
RS
3639 BUF_PUSH_2 (notcategoryspec, c);
3640 break;
3641#endif /* emacs */
e318085a 3642
e318085a 3643
25fe55af 3644 case 'w':
4bb91c68
SM
3645 if (syntax & RE_NO_GNU_OPS)
3646 goto normal_char;
25fe55af 3647 laststart = b;
1fb352e0 3648 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3649 break;
e318085a 3650
e318085a 3651
25fe55af 3652 case 'W':
4bb91c68
SM
3653 if (syntax & RE_NO_GNU_OPS)
3654 goto normal_char;
25fe55af 3655 laststart = b;
1fb352e0 3656 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3657 break;
e318085a
RS
3658
3659
25fe55af 3660 case '<':
4bb91c68
SM
3661 if (syntax & RE_NO_GNU_OPS)
3662 goto normal_char;
25fe55af
RS
3663 BUF_PUSH (wordbeg);
3664 break;
e318085a 3665
25fe55af 3666 case '>':
4bb91c68
SM
3667 if (syntax & RE_NO_GNU_OPS)
3668 goto normal_char;
25fe55af
RS
3669 BUF_PUSH (wordend);
3670 break;
e318085a 3671
669fa600
SM
3672 case '_':
3673 if (syntax & RE_NO_GNU_OPS)
3674 goto normal_char;
3675 laststart = b;
3676 PATFETCH (c);
3677 if (c == '<')
3678 BUF_PUSH (symbeg);
3679 else if (c == '>')
3680 BUF_PUSH (symend);
3681 else
3682 FREE_STACK_RETURN (REG_BADPAT);
3683 break;
3684
25fe55af 3685 case 'b':
4bb91c68
SM
3686 if (syntax & RE_NO_GNU_OPS)
3687 goto normal_char;
25fe55af
RS
3688 BUF_PUSH (wordbound);
3689 break;
e318085a 3690
25fe55af 3691 case 'B':
4bb91c68
SM
3692 if (syntax & RE_NO_GNU_OPS)
3693 goto normal_char;
25fe55af
RS
3694 BUF_PUSH (notwordbound);
3695 break;
fa9a63c5 3696
25fe55af 3697 case '`':
4bb91c68
SM
3698 if (syntax & RE_NO_GNU_OPS)
3699 goto normal_char;
25fe55af
RS
3700 BUF_PUSH (begbuf);
3701 break;
e318085a 3702
25fe55af 3703 case '\'':
4bb91c68
SM
3704 if (syntax & RE_NO_GNU_OPS)
3705 goto normal_char;
25fe55af
RS
3706 BUF_PUSH (endbuf);
3707 break;
e318085a 3708
25fe55af
RS
3709 case '1': case '2': case '3': case '4': case '5':
3710 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3711 {
3712 regnum_t reg;
e318085a 3713
0cdd06f8
SM
3714 if (syntax & RE_NO_BK_REFS)
3715 goto normal_backslash;
e318085a 3716
0cdd06f8 3717 reg = c - '0';
e318085a 3718
c69b0314
SM
3719 if (reg > bufp->re_nsub || reg < 1
3720 /* Can't back reference to a subexp before its end. */
3721 || group_in_compile_stack (compile_stack, reg))
0cdd06f8 3722 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3723
0cdd06f8
SM
3724 laststart = b;
3725 BUF_PUSH_2 (duplicate, reg);
3726 }
25fe55af 3727 break;
e318085a 3728
e318085a 3729
25fe55af
RS
3730 case '+':
3731 case '?':
3732 if (syntax & RE_BK_PLUS_QM)
3733 goto handle_plus;
3734 else
3735 goto normal_backslash;
3736
3737 default:
3738 normal_backslash:
3739 /* You might think it would be useful for \ to mean
3740 not to translate; but if we don't translate it
4bb91c68 3741 it will never match anything. */
25fe55af
RS
3742 goto normal_char;
3743 }
3744 break;
fa9a63c5
RM
3745
3746
3747 default:
25fe55af 3748 /* Expects the character in `c'. */
fa9a63c5 3749 normal_char:
36595814 3750 /* If no exactn currently being built. */
25fe55af 3751 if (!pending_exact
fa9a63c5 3752
25fe55af
RS
3753 /* If last exactn not at current position. */
3754 || pending_exact + *pending_exact + 1 != b
5e69f11e 3755
25fe55af 3756 /* We have only one byte following the exactn for the count. */
2d1675e4 3757 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3758
7814e705 3759 /* If followed by a repetition operator. */
9d99031f 3760 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3761 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3762 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3763 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3764 || ((syntax & RE_INTERVALS)
25fe55af 3765 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3766 ? p != pend && *p == '{'
3767 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3768 {
3769 /* Start building a new exactn. */
5e69f11e 3770
25fe55af 3771 laststart = b;
fa9a63c5
RM
3772
3773 BUF_PUSH_2 (exactn, 0);
3774 pending_exact = b - 1;
25fe55af 3775 }
5e69f11e 3776
2d1675e4
SM
3777 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3778 {
e0277a47
KH
3779 int len;
3780
cf9c99bc 3781 if (multibyte)
6fdd04b0 3782 {
cf9c99bc 3783 c = TRANSLATE (c);
6fdd04b0
KH
3784 len = CHAR_STRING (c, b);
3785 b += len;
3786 }
e0277a47 3787 else
6fdd04b0 3788 {
cf9c99bc
KH
3789 c1 = RE_CHAR_TO_MULTIBYTE (c);
3790 if (! CHAR_BYTE8_P (c1))
3791 {
3792 re_wchar_t c2 = TRANSLATE (c1);
3793
3794 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3795 c = c1;
409f2919 3796 }
6fdd04b0
KH
3797 *b++ = c;
3798 len = 1;
3799 }
2d1675e4
SM
3800 (*pending_exact) += len;
3801 }
3802
fa9a63c5 3803 break;
25fe55af 3804 } /* switch (c) */
fa9a63c5
RM
3805 } /* while p != pend */
3806
5e69f11e 3807
fa9a63c5 3808 /* Through the pattern now. */
5e69f11e 3809
505bde11 3810 FIXUP_ALT_JUMP ();
fa9a63c5 3811
5e69f11e 3812 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3813 FREE_STACK_RETURN (REG_EPAREN);
3814
3815 /* If we don't want backtracking, force success
3816 the first time we reach the end of the compiled pattern. */
3817 if (syntax & RE_NO_POSIX_BACKTRACKING)
3818 BUF_PUSH (succeed);
3819
fa9a63c5
RM
3820 /* We have succeeded; set the length of the buffer. */
3821 bufp->used = b - bufp->buffer;
3822
3823#ifdef DEBUG
99633e97 3824 if (debug > 0)
fa9a63c5 3825 {
505bde11 3826 re_compile_fastmap (bufp);
fa9a63c5
RM
3827 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3828 print_compiled_pattern (bufp);
3829 }
99633e97 3830 debug--;
fa9a63c5
RM
3831#endif /* DEBUG */
3832
3833#ifndef MATCH_MAY_ALLOCATE
3834 /* Initialize the failure stack to the largest possible stack. This
3835 isn't necessary unless we're trying to avoid calling alloca in
3836 the search and match routines. */
3837 {
3838 int num_regs = bufp->re_nsub + 1;
3839
320a2a73 3840 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3841 {
a26f4ccd 3842 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3843
fa9a63c5
RM
3844 if (! fail_stack.stack)
3845 fail_stack.stack
5e69f11e 3846 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3847 * sizeof (fail_stack_elt_t));
3848 else
3849 fail_stack.stack
3850 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3851 (fail_stack.size
3852 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3853 }
3854
3855 regex_grow_registers (num_regs);
3856 }
3857#endif /* not MATCH_MAY_ALLOCATE */
3858
839966f3 3859 FREE_STACK_RETURN (REG_NOERROR);
fa9a63c5
RM
3860} /* regex_compile */
3861\f
3862/* Subroutines for `regex_compile'. */
3863
7814e705 3864/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3865
3866static void
971de7fb 3867store_op1 (re_opcode_t op, unsigned char *loc, int arg)
fa9a63c5
RM
3868{
3869 *loc = (unsigned char) op;
3870 STORE_NUMBER (loc + 1, arg);
3871}
3872
3873
3874/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3875
3876static void
971de7fb 3877store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
fa9a63c5
RM
3878{
3879 *loc = (unsigned char) op;
3880 STORE_NUMBER (loc + 1, arg1);
3881 STORE_NUMBER (loc + 3, arg2);
3882}
3883
3884
3885/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3886 for OP followed by two-byte integer parameter ARG. */
3887
3888static void
971de7fb 3889insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
fa9a63c5
RM
3890{
3891 register unsigned char *pfrom = end;
3892 register unsigned char *pto = end + 3;
3893
3894 while (pfrom != loc)
3895 *--pto = *--pfrom;
5e69f11e 3896
fa9a63c5
RM
3897 store_op1 (op, loc, arg);
3898}
3899
3900
3901/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3902
3903static void
971de7fb 3904insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
fa9a63c5
RM
3905{
3906 register unsigned char *pfrom = end;
3907 register unsigned char *pto = end + 5;
3908
3909 while (pfrom != loc)
3910 *--pto = *--pfrom;
5e69f11e 3911
fa9a63c5
RM
3912 store_op2 (op, loc, arg1, arg2);
3913}
3914
3915
3916/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3917 after an alternative or a begin-subexpression. We assume there is at
3918 least one character before the ^. */
3919
3920static boolean
971de7fb 3921at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
fa9a63c5 3922{
01618498 3923 re_char *prev = p - 2;
fa9a63c5 3924 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3925
fa9a63c5
RM
3926 return
3927 /* After a subexpression? */
3928 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3929 /* After an alternative? */
d2af47df
SM
3930 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3931 /* After a shy subexpression? */
3932 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3933 && prev[-1] == '?' && prev[-2] == '('
3934 && (syntax & RE_NO_BK_PARENS
3935 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3936}
3937
3938
3939/* The dual of at_begline_loc_p. This one is for $. We assume there is
3940 at least one character after the $, i.e., `P < PEND'. */
3941
3942static boolean
971de7fb 3943at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
fa9a63c5 3944{
01618498 3945 re_char *next = p;
fa9a63c5 3946 boolean next_backslash = *next == '\\';
01618498 3947 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3948
fa9a63c5
RM
3949 return
3950 /* Before a subexpression? */
3951 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3952 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3953 /* Before an alternative? */
3954 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3955 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3956}
3957
3958
5e69f11e 3959/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3960 false if it's not. */
3961
3962static boolean
971de7fb 3963group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
fa9a63c5
RM
3964{
3965 int this_element;
3966
5e69f11e
RM
3967 for (this_element = compile_stack.avail - 1;
3968 this_element >= 0;
fa9a63c5
RM
3969 this_element--)
3970 if (compile_stack.stack[this_element].regnum == regnum)
3971 return true;
3972
3973 return false;
3974}
fa9a63c5 3975\f
f6a3f532
SM
3976/* analyse_first.
3977 If fastmap is non-NULL, go through the pattern and fill fastmap
3978 with all the possible leading chars. If fastmap is NULL, don't
3979 bother filling it up (obviously) and only return whether the
3980 pattern could potentially match the empty string.
3981
3982 Return 1 if p..pend might match the empty string.
3983 Return 0 if p..pend matches at least one char.
01618498 3984 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3985
3986static int
971de7fb 3987analyse_first (const re_char *p, const re_char *pend, char *fastmap, const const int multibyte)
fa9a63c5 3988{
505bde11 3989 int j, k;
1fb352e0 3990 boolean not;
fa9a63c5 3991
b18215fc 3992 /* If all elements for base leading-codes in fastmap is set, this
7814e705 3993 flag is set true. */
b18215fc
RS
3994 boolean match_any_multibyte_characters = false;
3995
f6a3f532 3996 assert (p);
5e69f11e 3997
505bde11
SM
3998 /* The loop below works as follows:
3999 - It has a working-list kept in the PATTERN_STACK and which basically
4000 starts by only containing a pointer to the first operation.
4001 - If the opcode we're looking at is a match against some set of
4002 chars, then we add those chars to the fastmap and go on to the
4003 next work element from the worklist (done via `break').
4004 - If the opcode is a control operator on the other hand, we either
4005 ignore it (if it's meaningless at this point, such as `start_memory')
4006 or execute it (if it's a jump). If the jump has several destinations
4007 (i.e. `on_failure_jump'), then we push the other destination onto the
4008 worklist.
4009 We guarantee termination by ignoring backward jumps (more or less),
4010 so that `p' is monotonically increasing. More to the point, we
4011 never set `p' (or push) anything `<= p1'. */
4012
01618498 4013 while (p < pend)
fa9a63c5 4014 {
505bde11
SM
4015 /* `p1' is used as a marker of how far back a `on_failure_jump'
4016 can go without being ignored. It is normally equal to `p'
4017 (which prevents any backward `on_failure_jump') except right
4018 after a plain `jump', to allow patterns such as:
4019 0: jump 10
4020 3..9: <body>
4021 10: on_failure_jump 3
4022 as used for the *? operator. */
01618498 4023 re_char *p1 = p;
5e69f11e 4024
fa9a63c5
RM
4025 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4026 {
f6a3f532 4027 case succeed:
01618498 4028 return 1;
f6a3f532 4029 continue;
fa9a63c5 4030
fa9a63c5 4031 case duplicate:
505bde11
SM
4032 /* If the first character has to match a backreference, that means
4033 that the group was empty (since it already matched). Since this
4034 is the only case that interests us here, we can assume that the
4035 backreference must match the empty string. */
4036 p++;
4037 continue;
fa9a63c5
RM
4038
4039
4040 /* Following are the cases which match a character. These end
7814e705 4041 with `break'. */
fa9a63c5
RM
4042
4043 case exactn:
e0277a47 4044 if (fastmap)
cf9c99bc
KH
4045 {
4046 /* If multibyte is nonzero, the first byte of each
4047 character is an ASCII or a leading code. Otherwise,
4048 each byte is a character. Thus, this works in both
4049 cases. */
4050 fastmap[p[1]] = 1;
4051 if (! multibyte)
4052 {
4053 /* For the case of matching this unibyte regex
4054 against multibyte, we must set a leading code of
4055 the corresponding multibyte character. */
4056 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4057
86e893e3 4058 fastmap[CHAR_LEADING_CODE (c)] = 1;
cf9c99bc
KH
4059 }
4060 }
fa9a63c5
RM
4061 break;
4062
4063
1fb352e0
SM
4064 case anychar:
4065 /* We could put all the chars except for \n (and maybe \0)
4066 but we don't bother since it is generally not worth it. */
f6a3f532 4067 if (!fastmap) break;
01618498 4068 return -1;
fa9a63c5
RM
4069
4070
b18215fc 4071 case charset_not:
1fb352e0 4072 if (!fastmap) break;
bf216479
KH
4073 {
4074 /* Chars beyond end of bitmap are possible matches. */
bf216479 4075 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
cf9c99bc 4076 j < (1 << BYTEWIDTH); j++)
bf216479
KH
4077 fastmap[j] = 1;
4078 }
4079
1fb352e0
SM
4080 /* Fallthrough */
4081 case charset:
4082 if (!fastmap) break;
4083 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4084 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4085 j >= 0; j--)
1fb352e0 4086 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
49da453b 4087 fastmap[j] = 1;
b18215fc 4088
6482db2e
KH
4089#ifdef emacs
4090 if (/* Any leading code can possibly start a character
1fb352e0 4091 which doesn't match the specified set of characters. */
6482db2e 4092 not
409f2919 4093 ||
6482db2e
KH
4094 /* If we can match a character class, we can match any
4095 multibyte characters. */
4096 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4097 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4098
b18215fc 4099 {
b18215fc
RS
4100 if (match_any_multibyte_characters == false)
4101 {
6482db2e
KH
4102 for (j = MIN_MULTIBYTE_LEADING_CODE;
4103 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
6fdd04b0 4104 fastmap[j] = 1;
b18215fc
RS
4105 match_any_multibyte_characters = true;
4106 }
4107 }
b18215fc 4108
1fb352e0
SM
4109 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4110 && match_any_multibyte_characters == false)
4111 {
bf216479 4112 /* Set fastmap[I] to 1 where I is a leading code of each
9117d724 4113 multibyte characer in the range table. */
1fb352e0 4114 int c, count;
bf216479 4115 unsigned char lc1, lc2;
b18215fc 4116
1fb352e0 4117 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 4118 bits for a character class. */
1fb352e0 4119 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 4120
1fb352e0
SM
4121 /* Extract the number of ranges in range table into COUNT. */
4122 EXTRACT_NUMBER_AND_INCR (count, p);
cf9c99bc 4123 for (; count > 0; count--, p += 3)
1fb352e0 4124 {
9117d724
KH
4125 /* Extract the start and end of each range. */
4126 EXTRACT_CHARACTER (c, p);
bf216479 4127 lc1 = CHAR_LEADING_CODE (c);
9117d724 4128 p += 3;
1fb352e0 4129 EXTRACT_CHARACTER (c, p);
bf216479
KH
4130 lc2 = CHAR_LEADING_CODE (c);
4131 for (j = lc1; j <= lc2; j++)
9117d724 4132 fastmap[j] = 1;
1fb352e0
SM
4133 }
4134 }
6482db2e 4135#endif
b18215fc
RS
4136 break;
4137
1fb352e0
SM
4138 case syntaxspec:
4139 case notsyntaxspec:
4140 if (!fastmap) break;
4141#ifndef emacs
4142 not = (re_opcode_t)p[-1] == notsyntaxspec;
4143 k = *p++;
4144 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 4145 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 4146 fastmap[j] = 1;
b18215fc 4147 break;
1fb352e0 4148#else /* emacs */
b18215fc
RS
4149 /* This match depends on text properties. These end with
4150 aborting optimizations. */
01618498 4151 return -1;
b18215fc
RS
4152
4153 case categoryspec:
b18215fc 4154 case notcategoryspec:
1fb352e0
SM
4155 if (!fastmap) break;
4156 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 4157 k = *p++;
6482db2e 4158 for (j = (1 << BYTEWIDTH); j >= 0; j--)
1fb352e0 4159 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
4160 fastmap[j] = 1;
4161
6482db2e
KH
4162 /* Any leading code can possibly start a character which
4163 has or doesn't has the specified category. */
4164 if (match_any_multibyte_characters == false)
6fdd04b0 4165 {
6482db2e
KH
4166 for (j = MIN_MULTIBYTE_LEADING_CODE;
4167 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4168 fastmap[j] = 1;
4169 match_any_multibyte_characters = true;
6fdd04b0 4170 }
b18215fc
RS
4171 break;
4172
fa9a63c5 4173 /* All cases after this match the empty string. These end with
25fe55af 4174 `continue'. */
fa9a63c5 4175
fa9a63c5
RM
4176 case before_dot:
4177 case at_dot:
4178 case after_dot:
1fb352e0 4179#endif /* !emacs */
25fe55af
RS
4180 case no_op:
4181 case begline:
4182 case endline:
fa9a63c5
RM
4183 case begbuf:
4184 case endbuf:
4185 case wordbound:
4186 case notwordbound:
4187 case wordbeg:
4188 case wordend:
669fa600
SM
4189 case symbeg:
4190 case symend:
25fe55af 4191 continue;
fa9a63c5
RM
4192
4193
fa9a63c5 4194 case jump:
25fe55af 4195 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
4196 if (j < 0)
4197 /* Backward jumps can only go back to code that we've already
4198 visited. `re_compile' should make sure this is true. */
4199 break;
25fe55af 4200 p += j;
505bde11
SM
4201 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4202 {
4203 case on_failure_jump:
4204 case on_failure_keep_string_jump:
505bde11 4205 case on_failure_jump_loop:
0683b6fa 4206 case on_failure_jump_nastyloop:
505bde11
SM
4207 case on_failure_jump_smart:
4208 p++;
4209 break;
4210 default:
4211 continue;
4212 };
4213 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4214 to jump back to "just after here". */
4215 /* Fallthrough */
fa9a63c5 4216
25fe55af
RS
4217 case on_failure_jump:
4218 case on_failure_keep_string_jump:
0683b6fa 4219 case on_failure_jump_nastyloop:
505bde11
SM
4220 case on_failure_jump_loop:
4221 case on_failure_jump_smart:
25fe55af 4222 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4223 if (p + j <= p1)
ed0767d8 4224 ; /* Backward jump to be ignored. */
01618498
SM
4225 else
4226 { /* We have to look down both arms.
4227 We first go down the "straight" path so as to minimize
4228 stack usage when going through alternatives. */
4229 int r = analyse_first (p, pend, fastmap, multibyte);
4230 if (r) return r;
4231 p += j;
4232 }
25fe55af 4233 continue;
fa9a63c5
RM
4234
4235
ed0767d8
SM
4236 case jump_n:
4237 /* This code simply does not properly handle forward jump_n. */
4238 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4239 p += 4;
4240 /* jump_n can either jump or fall through. The (backward) jump
4241 case has already been handled, so we only need to look at the
4242 fallthrough case. */
4243 continue;
177c0ea7 4244
fa9a63c5 4245 case succeed_n:
ed0767d8
SM
4246 /* If N == 0, it should be an on_failure_jump_loop instead. */
4247 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4248 p += 4;
4249 /* We only care about one iteration of the loop, so we don't
4250 need to consider the case where this behaves like an
4251 on_failure_jump. */
25fe55af 4252 continue;
fa9a63c5
RM
4253
4254
4255 case set_number_at:
25fe55af
RS
4256 p += 4;
4257 continue;
fa9a63c5
RM
4258
4259
4260 case start_memory:
25fe55af 4261 case stop_memory:
505bde11 4262 p += 1;
fa9a63c5
RM
4263 continue;
4264
4265
4266 default:
25fe55af
RS
4267 abort (); /* We have listed all the cases. */
4268 } /* switch *p++ */
fa9a63c5
RM
4269
4270 /* Getting here means we have found the possible starting
25fe55af 4271 characters for one path of the pattern -- and that the empty
7814e705 4272 string does not match. We need not follow this path further. */
01618498 4273 return 0;
fa9a63c5
RM
4274 } /* while p */
4275
01618498
SM
4276 /* We reached the end without matching anything. */
4277 return 1;
4278
f6a3f532
SM
4279} /* analyse_first */
4280\f
4281/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4282 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4283 characters can start a string that matches the pattern. This fastmap
4284 is used by re_search to skip quickly over impossible starting points.
4285
4286 Character codes above (1 << BYTEWIDTH) are not represented in the
4287 fastmap, but the leading codes are represented. Thus, the fastmap
4288 indicates which character sets could start a match.
4289
4290 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4291 area as BUFP->fastmap.
4292
4293 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4294 the pattern buffer.
4295
4296 Returns 0 if we succeed, -2 if an internal error. */
4297
4298int
971de7fb 4299re_compile_fastmap (struct re_pattern_buffer *bufp)
f6a3f532
SM
4300{
4301 char *fastmap = bufp->fastmap;
4302 int analysis;
4303
4304 assert (fastmap && bufp->buffer);
4305
7814e705 4306 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
f6a3f532
SM
4307 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4308
4309 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4310 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4311 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4312 return 0;
4313} /* re_compile_fastmap */
4314\f
4315/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4316 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4317 this memory for recording register information. STARTS and ENDS
4318 must be allocated using the malloc library routine, and must each
4319 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4320
4321 If NUM_REGS == 0, then subsequent matches should allocate their own
4322 register data.
4323
4324 Unless this function is called, the first search or match using
4325 PATTERN_BUFFER will allocate its own register data, without
4326 freeing the old data. */
4327
4328void
971de7fb 4329re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
fa9a63c5
RM
4330{
4331 if (num_regs)
4332 {
4333 bufp->regs_allocated = REGS_REALLOCATE;
4334 regs->num_regs = num_regs;
4335 regs->start = starts;
4336 regs->end = ends;
4337 }
4338 else
4339 {
4340 bufp->regs_allocated = REGS_UNALLOCATED;
4341 regs->num_regs = 0;
4342 regs->start = regs->end = (regoff_t *) 0;
4343 }
4344}
c0f9ea08 4345WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4346\f
7814e705 4347/* Searching routines. */
fa9a63c5
RM
4348
4349/* Like re_search_2, below, but only one string is specified, and
4350 doesn't let you say where to stop matching. */
4351
4352int
971de7fb 4353re_search (struct re_pattern_buffer *bufp, const char *string, int size, int startpos, int range, struct re_registers *regs)
fa9a63c5 4354{
5e69f11e 4355 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4356 regs, size);
4357}
c0f9ea08 4358WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4359
70806df6
KH
/* The three macros below address the virtual concatenation of
   `string1' (length `size1') and `string2' (length `size2'); those
   names must be in scope at the point of use.  */

/* Start of whichever string contains offset P.  */
#define HEAD_ADDR_VSTRING(P)            \
  ((P) < size1 ? string1 : string2)

/* One past the end of whichever string contains offset P.  */
#define STOP_ADDR_VSTRING(P)            \
  ((P) < size1 ? string1 + size1 : string2 + size2)

/* Address of the byte at offset POS of the virtual concatenation.  */
#define POS_ADDR_VSTRING(POS)           \
  (((POS) < size1 ? string1 : string2 - size1) + (POS))
fa9a63c5
RM
4371
4372/* Using the compiled pattern in BUFP->buffer, first tries to match the
4373 virtual concatenation of STRING1 and STRING2, starting first at index
4374 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4375
fa9a63c5 4376 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4377
fa9a63c5
RM
4378 RANGE is how far to scan while trying to match. RANGE = 0 means try
4379 only at STARTPOS; in general, the last start tried is STARTPOS +
4380 RANGE.
5e69f11e 4381
fa9a63c5
RM
4382 In REGS, return the indices of the virtual concatenation of STRING1
4383 and STRING2 that matched the entire BUFP->buffer and its contained
4384 subexpressions.
5e69f11e 4385
fa9a63c5
RM
4386 Do not consider matching one past the index STOP in the virtual
4387 concatenation of STRING1 and STRING2.
4388
4389 We return either the position in the strings at which the match was
4390 found, -1 if no match, or -2 if error (such as failure
4391 stack overflow). */
4392
4393int
971de7fb 4394re_search_2 (struct re_pattern_buffer *bufp, const char *str1, int size1, const char *str2, int size2, int startpos, int range, struct re_registers *regs, int stop)
fa9a63c5
RM
4395{
4396 int val;
66f0296e
SM
4397 re_char *string1 = (re_char*) str1;
4398 re_char *string2 = (re_char*) str2;
fa9a63c5 4399 register char *fastmap = bufp->fastmap;
6676cb1c 4400 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4401 int total_size = size1 + size2;
4402 int endpos = startpos + range;
c0f9ea08 4403 boolean anchored_start;
cf9c99bc
KH
4404 /* Nonzero if we are searching multibyte string. */
4405 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
b18215fc 4406
fa9a63c5
RM
4407 /* Check for out-of-range STARTPOS. */
4408 if (startpos < 0 || startpos > total_size)
4409 return -1;
5e69f11e 4410
fa9a63c5 4411 /* Fix up RANGE if it might eventually take us outside
34597fa9 4412 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4413 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4414 if (endpos < 0)
4415 range = 0 - startpos;
fa9a63c5
RM
4416 else if (endpos > total_size)
4417 range = total_size - startpos;
4418
4419 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4420 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4421 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4422 {
4423 if (startpos > 0)
4424 return -1;
4425 else
7b140fd7 4426 range = 0;
fa9a63c5
RM
4427 }
4428
ae4788a8
RS
4429#ifdef emacs
4430 /* In a forward search for something that starts with \=.
4431 don't keep searching past point. */
4432 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4433 {
7b140fd7
RS
4434 range = PT_BYTE - BEGV_BYTE - startpos;
4435 if (range < 0)
ae4788a8
RS
4436 return -1;
4437 }
4438#endif /* emacs */
4439
fa9a63c5
RM
4440 /* Update the fastmap now if not correct already. */
4441 if (fastmap && !bufp->fastmap_accurate)
01618498 4442 re_compile_fastmap (bufp);
5e69f11e 4443
c8499ba5 4444 /* See whether the pattern is anchored. */
c0f9ea08 4445 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4446
b18215fc 4447#ifdef emacs
d48cd3f4 4448 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
cc9b4df2 4449 {
99633e97 4450 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4451
4452 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4453 }
b18215fc
RS
4454#endif
4455
fa9a63c5
RM
4456 /* Loop through the string, looking for a place to start matching. */
4457 for (;;)
5e69f11e 4458 {
c8499ba5
RS
4459 /* If the pattern is anchored,
4460 skip quickly past places we cannot match.
4461 We don't bother to treat startpos == 0 specially
4462 because that case doesn't repeat. */
4463 if (anchored_start && startpos > 0)
4464 {
c0f9ea08
SM
4465 if (! ((startpos <= size1 ? string1[startpos - 1]
4466 : string2[startpos - size1 - 1])
4467 == '\n'))
c8499ba5
RS
4468 goto advance;
4469 }
4470
fa9a63c5 4471 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4472 cannot be the start of a match. If the pattern can match the
4473 null string, however, we don't need to skip characters; we want
7814e705 4474 the first null string. */
fa9a63c5
RM
4475 if (fastmap && startpos < total_size && !bufp->can_be_null)
4476 {
66f0296e 4477 register re_char *d;
01618498 4478 register re_wchar_t buf_ch;
e934739e
RS
4479
4480 d = POS_ADDR_VSTRING (startpos);
4481
7814e705 4482 if (range > 0) /* Searching forwards. */
fa9a63c5 4483 {
fa9a63c5
RM
4484 register int lim = 0;
4485 int irange = range;
4486
25fe55af
RS
4487 if (startpos < size1 && startpos + range >= size1)
4488 lim = range - (size1 - startpos);
fa9a63c5 4489
25fe55af
RS
4490 /* Written out as an if-else to avoid testing `translate'
4491 inside the loop. */
28ae27ae
AS
4492 if (RE_TRANSLATE_P (translate))
4493 {
e934739e
RS
4494 if (multibyte)
4495 while (range > lim)
4496 {
4497 int buf_charlen;
4498
62a6e103 4499 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 4500 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4501 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4502 break;
4503
4504 range -= buf_charlen;
4505 d += buf_charlen;
4506 }
4507 else
bf216479 4508 while (range > lim)
33c46939 4509 {
cf9c99bc
KH
4510 register re_wchar_t ch, translated;
4511
bf216479 4512 buf_ch = *d;
cf9c99bc
KH
4513 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4514 translated = RE_TRANSLATE (translate, ch);
4515 if (translated != ch
4516 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4517 buf_ch = ch;
6fdd04b0 4518 if (fastmap[buf_ch])
bf216479 4519 break;
33c46939
RS
4520 d++;
4521 range--;
4522 }
e934739e 4523 }
fa9a63c5 4524 else
6fdd04b0
KH
4525 {
4526 if (multibyte)
4527 while (range > lim)
4528 {
4529 int buf_charlen;
fa9a63c5 4530
62a6e103 4531 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
6fdd04b0
KH
4532 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4533 break;
4534 range -= buf_charlen;
4535 d += buf_charlen;
4536 }
e934739e 4537 else
6fdd04b0 4538 while (range > lim && !fastmap[*d])
33c46939
RS
4539 {
4540 d++;
4541 range--;
4542 }
e934739e 4543 }
fa9a63c5
RM
4544 startpos += irange - range;
4545 }
7814e705 4546 else /* Searching backwards. */
fa9a63c5 4547 {
ba5e343c
KH
4548 if (multibyte)
4549 {
62a6e103 4550 buf_ch = STRING_CHAR (d);
ba5e343c
KH
4551 buf_ch = TRANSLATE (buf_ch);
4552 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4553 goto advance;
4554 }
4555 else
4556 {
cf9c99bc
KH
4557 register re_wchar_t ch, translated;
4558
4559 buf_ch = *d;
4560 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4561 translated = TRANSLATE (ch);
4562 if (translated != ch
4563 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4564 buf_ch = ch;
4565 if (! fastmap[TRANSLATE (buf_ch)])
ba5e343c
KH
4566 goto advance;
4567 }
fa9a63c5
RM
4568 }
4569 }
4570
4571 /* If can't match the null string, and that's all we have left, fail. */
4572 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4573 && !bufp->can_be_null)
fa9a63c5
RM
4574 return -1;
4575
4576 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4577 startpos, regs, stop);
fa9a63c5
RM
4578
4579 if (val >= 0)
4580 return startpos;
5e69f11e 4581
fa9a63c5
RM
4582 if (val == -2)
4583 return -2;
4584
4585 advance:
5e69f11e 4586 if (!range)
25fe55af 4587 break;
5e69f11e 4588 else if (range > 0)
25fe55af 4589 {
b18215fc
RS
4590 /* Update STARTPOS to the next character boundary. */
4591 if (multibyte)
4592 {
66f0296e
SM
4593 re_char *p = POS_ADDR_VSTRING (startpos);
4594 re_char *pend = STOP_ADDR_VSTRING (startpos);
aa3830c4 4595 int len = BYTES_BY_CHAR_HEAD (*p);
b18215fc
RS
4596
4597 range -= len;
4598 if (range < 0)
4599 break;
4600 startpos += len;
4601 }
4602 else
4603 {
b560c397
RS
4604 range--;
4605 startpos++;
4606 }
e318085a 4607 }
fa9a63c5 4608 else
25fe55af
RS
4609 {
4610 range++;
4611 startpos--;
b18215fc
RS
4612
4613 /* Update STARTPOS to the previous character boundary. */
4614 if (multibyte)
4615 {
70806df6
KH
4616 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4617 re_char *p0 = p;
4618 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4619
4620 /* Find the head of multibyte form. */
70806df6
KH
4621 PREV_CHAR_BOUNDARY (p, phead);
4622 range += p0 - 1 - p;
4623 if (range > 0)
4624 break;
b18215fc 4625
70806df6 4626 startpos -= p0 - 1 - p;
b18215fc 4627 }
25fe55af 4628 }
fa9a63c5
RM
4629 }
4630 return -1;
4631} /* re_search_2 */
c0f9ea08 4632WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4633\f
4634/* Declarations and macros for re_match_2. */
4635
2d1675e4
SM
4636static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4637 register int len,
4638 RE_TRANSLATE_TYPE translate,
4639 const int multibyte));
fa9a63c5
RM
4640
/* Convert PTR, a pointer into one of the search strings `string1' or
   `string2', into an offset.  When FIRST_STRING_P (PTR) holds the
   offset is measured from `string1'; otherwise it is measured from
   `string2' and biased by `size1'.  Expands to an expression that
   refers to `string1', `string2' and `size1', so those names must be
   in scope wherever the macro is used.  */
#define POINTER_TO_OFFSET(ptr)                  \
  (FIRST_STRING_P (ptr)                         \
   ? ((regoff_t) ((ptr) - string1))             \
   : ((regoff_t) ((ptr) - string2 + size1)))
4647
/* Call before fetching a character with *d.  While `d' sits at `dend'
   this either jumps to the `fail' label (when `dend' is already
   `end_match_2') or moves `d' to the start of string2 and `dend' to
   `end_match_2'.  The expansion uses `d', `dend', `string2',
   `end_match_2' and the label `fail' from the enclosing function.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH()                                              \
  while (d == dend)                                             \
    {                                                           \
      /* End of string2 => fail.  */                            \
      if (dend == end_match_2)                                  \
        goto fail;                                              \
      /* End of string1 => advance to string2.  */              \
      d = string2;                                              \
      dend = end_match_2;                                       \
    }
4662
f1ad044f
SM
/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc..
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e past `stop').

   When `d' is at `end1' it is moved to `string2' and `dend' is set to
   `end_match_2'; otherwise nothing happens.  Wrapped in do/while (0)
   so that `PREFETCH_NOLIMIT ();' is a single statement that is safe
   inside an unbraced if/else, and so that the definition no longer
   ends with a line continuation.  */
#define PREFETCH_NOLIMIT()                                      \
  do                                                            \
    {                                                           \
      if (d == end1)                                            \
        {                                                       \
          d = string2;                                          \
          dend = end_match_2;                                   \
        }                                                       \
    }                                                           \
  while (0)
fa9a63c5
RM
4673
/* Test if at very beginning or at very end of the virtual concatenation
   of `string1' and `string2'.  If only one string, it's `string2'.
   AT_STRINGS_BEG compares D against `string1' when `size1' is nonzero
   and against `string2' otherwise; it is also true whenever `size2' is
   zero.  AT_STRINGS_END compares D against `end2'.  */
#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4678
4679
/* Test if D points to a character which is word-constituent.  We have
   two special cases to check for: if past the end of string1, look at
   the first character in string2; and if before the beginning of
   string2, look at the last character in string1.  The selected byte
   is passed to SYNTAX and the result compared with Sword.  */
#define WORDCHAR_P(d)                                           \
  (SYNTAX ((d) == end1 ? *string2                               \
           : (d) == string2 - 1 ? *(end1 - 1) : *(d))           \
   == Sword)
4688
9121ca40 4689/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4690
/* The comment at case wordbound is the one that follows, but
   AT_WORD_BOUNDARY is no longer used, in order to support the
   multibyte form.

   The DEC Alpha C compiler 3.x generates incorrect code for the
   test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
   AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
   macro and introducing temporary variables works around the bug.  */
4698
9121ca40 4699#if 0
fa9a63c5
RM
4700/* Test if the character before D and the one at D differ with respect
4701 to being word-constituent. */
4702#define AT_WORD_BOUNDARY(d) \
4703 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4704 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4705#endif
fa9a63c5
RM
4706
/* Free everything we malloc.  */
#ifdef MATCH_MAY_ALLOCATE
/* Release VAR through REGEX_FREE when it is non-null and reset it to
   NULL.  Written as do/while (0) rather than with a trailing `else',
   so that a forgotten semicolon is a compile error instead of
   silently turning the next statement into the else branch.  */
# define FREE_VAR(var)                                          \
  do                                                            \
    {                                                           \
      if (var)                                                  \
        {                                                       \
          REGEX_FREE (var);                                     \
          (var) = NULL;                                         \
        }                                                       \
    }                                                           \
  while (0)
/* Release the failure stack and the four register vectors of the
   enclosing matcher (`fail_stack', `regstart', `regend',
   `best_regstart', `best_regend' must be in scope).  */
# define FREE_VARIABLES()                                       \
  do {                                                          \
    REGEX_FREE_STACK (fail_stack.stack);                        \
    FREE_VAR (regstart);                                        \
    FREE_VAR (regend);                                          \
    FREE_VAR (best_regstart);                                   \
    FREE_VAR (best_regend);                                     \
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
#endif /* not MATCH_MAY_ALLOCATE */
4721
505bde11
SM
4722\f
4723/* Optimization routines. */
4724
4e8a9132
SM
4725/* If the operation is a match against one or more chars,
4726 return a pointer to the next operation, else return NULL. */
01618498 4727static re_char *
971de7fb 4728skip_one_char (const re_char *p)
4e8a9132
SM
4729{
4730 switch (SWITCH_ENUM_CAST (*p++))
4731 {
4732 case anychar:
4733 break;
177c0ea7 4734
4e8a9132
SM
4735 case exactn:
4736 p += *p + 1;
4737 break;
4738
4739 case charset_not:
4740 case charset:
4741 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4742 {
4743 int mcnt;
4744 p = CHARSET_RANGE_TABLE (p - 1);
4745 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4746 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4747 }
4748 else
4749 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4750 break;
177c0ea7 4751
4e8a9132
SM
4752 case syntaxspec:
4753 case notsyntaxspec:
1fb352e0 4754#ifdef emacs
4e8a9132
SM
4755 case categoryspec:
4756 case notcategoryspec:
4757#endif /* emacs */
4758 p++;
4759 break;
4760
4761 default:
4762 p = NULL;
4763 }
4764 return p;
4765}
4766
4767
505bde11 4768/* Jump over non-matching operations. */
839966f3 4769static re_char *
971de7fb 4770skip_noops (const re_char *p, const re_char *pend)
505bde11
SM
4771{
4772 int mcnt;
4773 while (p < pend)
4774 {
4775 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4776 {
4777 case start_memory:
505bde11
SM
4778 case stop_memory:
4779 p += 2; break;
4780 case no_op:
4781 p += 1; break;
4782 case jump:
4783 p += 1;
4784 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4785 p += mcnt;
4786 break;
4787 default:
4788 return p;
4789 }
4790 }
4791 assert (p == pend);
4792 return p;
4793}
4794
4795/* Non-zero if "p1 matches something" implies "p2 fails". */
4796static int
971de7fb 4797mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
505bde11 4798{
4e8a9132 4799 re_opcode_t op2;
2d1675e4 4800 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4801 unsigned char *pend = bufp->buffer + bufp->used;
4802
4e8a9132 4803 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4804 && p2 >= bufp->buffer && p2 <= pend);
4805
4806 /* Skip over open/close-group commands.
4807 If what follows this loop is a ...+ construct,
4808 look at what begins its body, since we will have to
4809 match at least one of that. */
4e8a9132
SM
4810 p2 = skip_noops (p2, pend);
4811 /* The same skip can be done for p1, except that this function
4812 is only used in the case where p1 is a simple match operator. */
4813 /* p1 = skip_noops (p1, pend); */
4814
4815 assert (p1 >= bufp->buffer && p1 < pend
4816 && p2 >= bufp->buffer && p2 <= pend);
4817
4818 op2 = p2 == pend ? succeed : *p2;
4819
4820 switch (SWITCH_ENUM_CAST (op2))
505bde11 4821 {
4e8a9132
SM
4822 case succeed:
4823 case endbuf:
4824 /* If we're at the end of the pattern, we can change. */
4825 if (skip_one_char (p1))
505bde11 4826 {
505bde11
SM
4827 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4828 return 1;
505bde11 4829 }
4e8a9132 4830 break;
177c0ea7 4831
4e8a9132 4832 case endline:
4e8a9132
SM
4833 case exactn:
4834 {
01618498 4835 register re_wchar_t c
4e8a9132 4836 = (re_opcode_t) *p2 == endline ? '\n'
62a6e103 4837 : RE_STRING_CHAR (p2 + 2, multibyte);
505bde11 4838
4e8a9132
SM
4839 if ((re_opcode_t) *p1 == exactn)
4840 {
62a6e103 4841 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4e8a9132
SM
4842 {
4843 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4844 return 1;
4845 }
4846 }
505bde11 4847
4e8a9132
SM
4848 else if ((re_opcode_t) *p1 == charset
4849 || (re_opcode_t) *p1 == charset_not)
4850 {
4851 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4852
4e8a9132
SM
4853 /* Test if C is listed in charset (or charset_not)
4854 at `p1'. */
6fdd04b0 4855 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4856 {
4857 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4858 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4859 not = !not;
4860 }
4861 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4862 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4863
4e8a9132
SM
4864 /* `not' is equal to 1 if c would match, which means
4865 that we can't change to pop_failure_jump. */
4866 if (!not)
4867 {
4868 DEBUG_PRINT1 (" No match => fast loop.\n");
4869 return 1;
4870 }
4871 }
4872 else if ((re_opcode_t) *p1 == anychar
4873 && c == '\n')
4874 {
4875 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4876 return 1;
4877 }
4878 }
4879 break;
505bde11 4880
4e8a9132 4881 case charset:
4e8a9132
SM
4882 {
4883 if ((re_opcode_t) *p1 == exactn)
4884 /* Reuse the code above. */
4885 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4886
505bde11
SM
4887 /* It is hard to list up all the character in charset
4888 P2 if it includes multibyte character. Give up in
4889 such case. */
4890 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4891 {
4892 /* Now, we are sure that P2 has no range table.
4893 So, for the size of bitmap in P2, `p2[1]' is
7814e705 4894 enough. But P1 may have range table, so the
505bde11
SM
4895 size of bitmap table of P1 is extracted by
4896 using macro `CHARSET_BITMAP_SIZE'.
4897
6fdd04b0
KH
4898 In a multibyte case, we know that all the character
4899 listed in P2 is ASCII. In a unibyte case, P1 has only a
4900 bitmap table. So, in both cases, it is enough to test
4901 only the bitmap table of P1. */
505bde11 4902
411e4203 4903 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4904 {
4905 int idx;
4906 /* We win if the charset inside the loop
4907 has no overlap with the one after the loop. */
4908 for (idx = 0;
4909 (idx < (int) p2[1]
4910 && idx < CHARSET_BITMAP_SIZE (p1));
4911 idx++)
4912 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4913 break;
4914
4915 if (idx == p2[1]
4916 || idx == CHARSET_BITMAP_SIZE (p1))
4917 {
4918 DEBUG_PRINT1 (" No match => fast loop.\n");
4919 return 1;
4920 }
4921 }
411e4203 4922 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4923 {
4924 int idx;
4925 /* We win if the charset_not inside the loop lists
7814e705 4926 every character listed in the charset after. */
505bde11
SM
4927 for (idx = 0; idx < (int) p2[1]; idx++)
4928 if (! (p2[2 + idx] == 0
4929 || (idx < CHARSET_BITMAP_SIZE (p1)
4930 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4931 break;
4932
4e8a9132
SM
4933 if (idx == p2[1])
4934 {
4935 DEBUG_PRINT1 (" No match => fast loop.\n");
4936 return 1;
4937 }
4938 }
4939 }
4940 }
609b757a 4941 break;
177c0ea7 4942
411e4203
SM
4943 case charset_not:
4944 switch (SWITCH_ENUM_CAST (*p1))
4945 {
4946 case exactn:
4947 case charset:
4948 /* Reuse the code above. */
4949 return mutually_exclusive_p (bufp, p2, p1);
4950 case charset_not:
4951 /* When we have two charset_not, it's very unlikely that
4952 they don't overlap. The union of the two sets of excluded
4953 chars should cover all possible chars, which, as a matter of
4954 fact, is virtually impossible in multibyte buffers. */
36595814 4955 break;
411e4203
SM
4956 }
4957 break;
4958
4e8a9132 4959 case wordend:
669fa600
SM
4960 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4961 case symend:
4e8a9132 4962 return ((re_opcode_t) *p1 == syntaxspec
669fa600
SM
4963 && (p1[1] == Ssymbol || p1[1] == Sword));
4964 case notsyntaxspec:
4965 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4966
4967 case wordbeg:
669fa600
SM
4968 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4969 case symbeg:
4e8a9132 4970 return ((re_opcode_t) *p1 == notsyntaxspec
669fa600
SM
4971 && (p1[1] == Ssymbol || p1[1] == Sword));
4972 case syntaxspec:
4973 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4e8a9132
SM
4974
4975 case wordbound:
4976 return (((re_opcode_t) *p1 == notsyntaxspec
4977 || (re_opcode_t) *p1 == syntaxspec)
4978 && p1[1] == Sword);
4979
1fb352e0 4980#ifdef emacs
4e8a9132
SM
4981 case categoryspec:
4982 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4983 case notcategoryspec:
4984 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4985#endif /* emacs */
4986
4987 default:
4988 ;
505bde11
SM
4989 }
4990
4991 /* Safe default. */
4992 return 0;
4993}
4994
fa9a63c5
RM
4995\f
4996/* Matching routines. */
4997
25fe55af 4998#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4999/* re_match is like re_match_2 except it takes only a single string. */
5000
5001int
5002re_match (bufp, string, size, pos, regs)
5003 struct re_pattern_buffer *bufp;
5004 const char *string;
5005 int size, pos;
5006 struct re_registers *regs;
5007{
4bb91c68 5008 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 5009 pos, regs, size);
fa9a63c5
RM
5010 return result;
5011}
c0f9ea08 5012WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
5013#endif /* not emacs */
5014
b18215fc
RS
5015#ifdef emacs
5016/* In Emacs, this is the string or buffer in which we
7814e705 5017 are matching. It is used for looking up syntax properties. */
b18215fc
RS
5018Lisp_Object re_match_object;
5019#endif
fa9a63c5
RM
5020
5021/* re_match_2 matches the compiled pattern in BUFP against the
5022 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5023 and SIZE2, respectively). We start matching at POS, and stop
5024 matching at STOP.
5e69f11e 5025
fa9a63c5 5026 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
7814e705 5027 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
5028 documentation for exactly how many groups we fill.
5029
5030 We return -1 if no match, -2 if an internal error (such as the
7814e705 5031 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
5032 matched substring. */
5033
5034int
971de7fb 5035re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1, const char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5 5036{
b18215fc 5037 int result;
25fe55af 5038
b18215fc 5039#ifdef emacs
cc9b4df2 5040 int charpos;
d48cd3f4 5041 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
99633e97 5042 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 5043 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
5044#endif
5045
4bb91c68
SM
5046 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5047 (re_char*) string2, size2,
cc9b4df2 5048 pos, regs, stop);
fa9a63c5
RM
5049 return result;
5050}
c0f9ea08 5051WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 5052
bf216479 5053
fa9a63c5 5054/* This is a separate function so that we can force an alloca cleanup
7814e705 5055 afterwards. */
fa9a63c5 5056static int
971de7fb 5057re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1, int size1, const re_char *string2, int size2, int pos, struct re_registers *regs, int stop)
fa9a63c5
RM
5058{
5059 /* General temporaries. */
5060 int mcnt;
01618498 5061 size_t reg;
66f0296e 5062 boolean not;
fa9a63c5
RM
5063
5064 /* Just past the end of the corresponding string. */
66f0296e 5065 re_char *end1, *end2;
fa9a63c5
RM
5066
5067 /* Pointers into string1 and string2, just past the last characters in
7814e705 5068 each to consider matching. */
66f0296e 5069 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
5070
5071 /* Where we are in the data, and the end of the current string. */
66f0296e 5072 re_char *d, *dend;
5e69f11e 5073
99633e97
SM
5074 /* Used sometimes to remember where we were before starting matching
5075 an operator so that we can go back in case of failure. This "atomic"
5076 behavior of matching opcodes is indispensable to the correctness
5077 of the on_failure_keep_string_jump optimization. */
5078 re_char *dfail;
5079
fa9a63c5 5080 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
5081 re_char *p = bufp->buffer;
5082 re_char *pend = p + bufp->used;
fa9a63c5 5083
25fe55af 5084 /* We use this to map every character in the string. */
6676cb1c 5085 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 5086
cf9c99bc 5087 /* Nonzero if BUFP is setup from a multibyte regex. */
2d1675e4 5088 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 5089
cf9c99bc
KH
5090 /* Nonzero if STRING1/STRING2 are multibyte. */
5091 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5092
fa9a63c5
RM
5093 /* Failure point stack. Each place that can handle a failure further
5094 down the line pushes a failure point on this stack. It consists of
505bde11 5095 regstart, and regend for all registers corresponding to
fa9a63c5
RM
5096 the subexpressions we're currently inside, plus the number of such
5097 registers, and, finally, two char *'s. The first char * is where
5098 to resume scanning the pattern; the second one is where to resume
7814e705
JB
5099 scanning the strings. */
5100#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
5101 fail_stack_type fail_stack;
5102#endif
5103#ifdef DEBUG
fa9a63c5
RM
5104 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5105#endif
5106
0b32bf0e 5107#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
5108 /* This holds the pointer to the failure stack, when
5109 it is allocated relocatably. */
5110 fail_stack_elt_t *failure_stack_ptr;
99633e97 5111#endif
fa9a63c5
RM
5112
5113 /* We fill all the registers internally, independent of what we
7814e705 5114 return, for use in backreferences. The number here includes
fa9a63c5 5115 an element for register zero. */
4bb91c68 5116 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 5117
fa9a63c5
RM
5118 /* Information on the contents of registers. These are pointers into
5119 the input strings; they record just what was matched (on this
5120 attempt) by a subexpression part of the pattern, that is, the
5121 regnum-th regstart pointer points to where in the pattern we began
5122 matching and the regnum-th regend points to right after where we
5123 stopped matching the regnum-th subexpression. (The zeroth register
5124 keeps track of what the whole pattern matches.) */
5125#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5126 re_char **regstart, **regend;
fa9a63c5
RM
5127#endif
5128
fa9a63c5 5129 /* The following record the register info as found in the above
5e69f11e 5130 variables when we find a match better than any we've seen before.
fa9a63c5
RM
5131 This happens as we backtrack through the failure points, which in
5132 turn happens only if we have not yet matched the entire string. */
5133 unsigned best_regs_set = false;
5134#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 5135 re_char **best_regstart, **best_regend;
fa9a63c5 5136#endif
5e69f11e 5137
fa9a63c5
RM
5138 /* Logically, this is `best_regend[0]'. But we don't want to have to
5139 allocate space for that if we're not allocating space for anything
7814e705 5140 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
5141 any of the other register vectors, and it seems rather a kludge to
5142 treat `best_regend' differently than the rest. So we keep track of
5143 the end of the best match so far in a separate variable. We
5144 initialize this to NULL so that when we backtrack the first time
5145 and need to test it, it's not garbage. */
66f0296e 5146 re_char *match_end = NULL;
fa9a63c5 5147
fa9a63c5
RM
5148#ifdef DEBUG
5149 /* Counts the total number of registers pushed. */
5e69f11e 5150 unsigned num_regs_pushed = 0;
fa9a63c5
RM
5151#endif
5152
5153 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 5154
fa9a63c5 5155 INIT_FAIL_STACK ();
5e69f11e 5156
fa9a63c5
RM
5157#ifdef MATCH_MAY_ALLOCATE
5158 /* Do not bother to initialize all the register variables if there are
5159 no groups in the pattern, as it takes a fair amount of time. If
5160 there are groups, we include space for register 0 (the whole
5161 pattern), even though we never use it, since it simplifies the
5162 array indexing. We should fix this. */
5163 if (bufp->re_nsub)
5164 {
66f0296e
SM
5165 regstart = REGEX_TALLOC (num_regs, re_char *);
5166 regend = REGEX_TALLOC (num_regs, re_char *);
5167 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5168 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5169
505bde11 5170 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5171 {
5172 FREE_VARIABLES ();
5173 return -2;
5174 }
fa9a63c5
RM
5175 }
5176 else
5177 {
5178 /* We must initialize all our variables to NULL, so that
25fe55af 5179 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5180 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5181 }
5182#endif /* MATCH_MAY_ALLOCATE */
5183
5184 /* The starting position is bogus. */
5185 if (pos < 0 || pos > size1 + size2)
5186 {
5187 FREE_VARIABLES ();
5188 return -1;
5189 }
5e69f11e 5190
fa9a63c5
RM
5191 /* Initialize subexpression text positions to -1 to mark ones that no
5192 start_memory/stop_memory has been seen for. Also initialize the
5193 register information struct. */
01618498
SM
5194 for (reg = 1; reg < num_regs; reg++)
5195 regstart[reg] = regend[reg] = NULL;
99633e97 5196
fa9a63c5 5197 /* We move `string1' into `string2' if the latter's empty -- but not if
7814e705 5198 `string1' is null. */
fa9a63c5
RM
5199 if (size2 == 0 && string1 != NULL)
5200 {
5201 string2 = string1;
5202 size2 = size1;
5203 string1 = 0;
5204 size1 = 0;
5205 }
5206 end1 = string1 + size1;
5207 end2 = string2 + size2;
5208
5e69f11e 5209 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5210 `dend' is the end of the input string that `d' points within. `d'
5211 is advanced into the following input string whenever necessary, but
5212 this happens before fetching; therefore, at the beginning of the
5213 loop, `d' can be pointing at the end of a string, but it cannot
5214 equal `string2'. */
419d1c74 5215 if (pos >= size1)
fa9a63c5 5216 {
419d1c74
SM
5217 /* Only match within string2. */
5218 d = string2 + pos - size1;
5219 dend = end_match_2 = string2 + stop - size1;
5220 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5221 }
5222 else
5223 {
f1ad044f 5224 if (stop < size1)
419d1c74
SM
5225 {
5226 /* Only match within string1. */
5227 end_match_1 = string1 + stop;
5228 /* BEWARE!
5229 When we reach end_match_1, PREFETCH normally switches to string2.
5230 But in the present case, this means that just doing a PREFETCH
5231 makes us jump from `stop' to `gap' within the string.
5232 What we really want here is for the search to stop as
5233 soon as we hit end_match_1. That's why we set end_match_2
5234 to end_match_1 (since PREFETCH fails as soon as we hit
5235 end_match_2). */
5236 end_match_2 = end_match_1;
5237 }
5238 else
f1ad044f
SM
5239 { /* It's important to use this code when stop == size so that
5240 moving `d' from end1 to string2 will not prevent the d == dend
5241 check from catching the end of string. */
419d1c74
SM
5242 end_match_1 = end1;
5243 end_match_2 = string2 + stop - size1;
5244 }
5245 d = string1 + pos;
5246 dend = end_match_1;
fa9a63c5
RM
5247 }
5248
5249 DEBUG_PRINT1 ("The compiled pattern is: ");
5250 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5251 DEBUG_PRINT1 ("The string to match is: `");
5252 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5253 DEBUG_PRINT1 ("'\n");
5e69f11e 5254
7814e705 5255 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5256 function if the match is complete, or it drops through if the match
5257 fails at this starting point in the input data. */
5258 for (;;)
5259 {
505bde11 5260 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5261
5262 if (p == pend)
5263 { /* End of pattern means we might have succeeded. */
25fe55af 5264 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5265
fa9a63c5 5266 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5267 longest match, try backtracking. */
5268 if (d != end_match_2)
fa9a63c5
RM
5269 {
5270 /* 1 if this match ends in the same string (string1 or string2)
5271 as the best previous match. */
5e69f11e 5272 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5273 == FIRST_STRING_P (d));
fa9a63c5
RM
5274 /* 1 if this match is the best seen so far. */
5275 boolean best_match_p;
5276
5277 /* AIX compiler got confused when this was combined
7814e705 5278 with the previous declaration. */
fa9a63c5
RM
5279 if (same_str_p)
5280 best_match_p = d > match_end;
5281 else
99633e97 5282 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5283
25fe55af
RS
5284 DEBUG_PRINT1 ("backtracking.\n");
5285
5286 if (!FAIL_STACK_EMPTY ())
5287 { /* More failure points to try. */
5288
5289 /* If exceeds best match so far, save it. */
5290 if (!best_regs_set || best_match_p)
5291 {
5292 best_regs_set = true;
5293 match_end = d;
5294
5295 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5296
01618498 5297 for (reg = 1; reg < num_regs; reg++)
25fe55af 5298 {
01618498
SM
5299 best_regstart[reg] = regstart[reg];
5300 best_regend[reg] = regend[reg];
25fe55af
RS
5301 }
5302 }
5303 goto fail;
5304 }
5305
5306 /* If no failure points, don't restore garbage. And if
5307 last match is real best match, don't restore second
5308 best one. */
5309 else if (best_regs_set && !best_match_p)
5310 {
5311 restore_best_regs:
5312 /* Restore best match. It may happen that `dend ==
5313 end_match_1' while the restored d is in string2.
5314 For example, the pattern `x.*y.*z' against the
5315 strings `x-' and `y-z-', if the two strings are
7814e705 5316 not consecutive in memory. */
25fe55af
RS
5317 DEBUG_PRINT1 ("Restoring best registers.\n");
5318
5319 d = match_end;
5320 dend = ((d >= string1 && d <= end1)
5321 ? end_match_1 : end_match_2);
fa9a63c5 5322
01618498 5323 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5324 {
01618498
SM
5325 regstart[reg] = best_regstart[reg];
5326 regend[reg] = best_regend[reg];
fa9a63c5 5327 }
25fe55af
RS
5328 }
5329 } /* d != end_match_2 */
fa9a63c5
RM
5330
5331 succeed_label:
25fe55af 5332 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5333
25fe55af
RS
5334 /* If caller wants register contents data back, do it. */
5335 if (regs && !bufp->no_sub)
fa9a63c5 5336 {
25fe55af
RS
5337 /* Have the register data arrays been allocated? */
5338 if (bufp->regs_allocated == REGS_UNALLOCATED)
7814e705 5339 { /* No. So allocate them with malloc. We need one
25fe55af
RS
5340 extra element beyond `num_regs' for the `-1' marker
5341 GNU code uses. */
5342 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5343 regs->start = TALLOC (regs->num_regs, regoff_t);
5344 regs->end = TALLOC (regs->num_regs, regoff_t);
5345 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5346 {
5347 FREE_VARIABLES ();
5348 return -2;
5349 }
25fe55af
RS
5350 bufp->regs_allocated = REGS_REALLOCATE;
5351 }
5352 else if (bufp->regs_allocated == REGS_REALLOCATE)
5353 { /* Yes. If we need more elements than were already
5354 allocated, reallocate them. If we need fewer, just
5355 leave it alone. */
5356 if (regs->num_regs < num_regs + 1)
5357 {
5358 regs->num_regs = num_regs + 1;
5359 RETALLOC (regs->start, regs->num_regs, regoff_t);
5360 RETALLOC (regs->end, regs->num_regs, regoff_t);
5361 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5362 {
5363 FREE_VARIABLES ();
5364 return -2;
5365 }
25fe55af
RS
5366 }
5367 }
5368 else
fa9a63c5
RM
5369 {
5370 /* These braces fend off a "empty body in an else-statement"
7814e705 5371 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5372 assert (bufp->regs_allocated == REGS_FIXED);
5373 }
5374
25fe55af
RS
5375 /* Convert the pointer data in `regstart' and `regend' to
5376 indices. Register zero has to be set differently,
5377 since we haven't kept track of any info for it. */
5378 if (regs->num_regs > 0)
5379 {
5380 regs->start[0] = pos;
99633e97 5381 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5382 }
5e69f11e 5383
25fe55af
RS
5384 /* Go through the first `min (num_regs, regs->num_regs)'
5385 registers, since that is all we initialized. */
01618498 5386 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5387 {
01618498
SM
5388 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5389 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5390 else
5391 {
01618498
SM
5392 regs->start[reg]
5393 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5394 regs->end[reg]
5395 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5396 }
fa9a63c5 5397 }
5e69f11e 5398
25fe55af
RS
5399 /* If the regs structure we return has more elements than
5400 were in the pattern, set the extra elements to -1. If
5401 we (re)allocated the registers, this is the case,
5402 because we always allocate enough to have at least one
7814e705 5403 -1 at the end. */
01618498
SM
5404 for (reg = num_regs; reg < regs->num_regs; reg++)
5405 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5406 } /* regs && !bufp->no_sub */
5407
25fe55af
RS
5408 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5409 nfailure_points_pushed, nfailure_points_popped,
5410 nfailure_points_pushed - nfailure_points_popped);
5411 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5412
99633e97 5413 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5414
25fe55af 5415 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5416
25fe55af
RS
5417 FREE_VARIABLES ();
5418 return mcnt;
5419 }
fa9a63c5 5420
7814e705 5421 /* Otherwise match next pattern command. */
fa9a63c5
RM
5422 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5423 {
25fe55af
RS
5424 /* Ignore these. Used to ignore the n of succeed_n's which
5425 currently have n == 0. */
5426 case no_op:
5427 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5428 break;
fa9a63c5
RM
5429
5430 case succeed:
25fe55af 5431 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5432 goto succeed_label;
5433
7814e705 5434 /* Match the next n pattern characters exactly. The following
25fe55af 5435 byte in the pattern defines n, and the n bytes after that
7814e705 5436 are the characters to match. */
fa9a63c5
RM
5437 case exactn:
5438 mcnt = *p++;
25fe55af 5439 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5440
99633e97
SM
5441 /* Remember the start point to rollback upon failure. */
5442 dfail = d;
5443
6fdd04b0 5444#ifndef emacs
25fe55af
RS
5445 /* This is written out as an if-else so we don't waste time
5446 testing `translate' inside the loop. */
28703c16 5447 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5448 do
5449 {
5450 PREFETCH ();
5451 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5452 {
6fdd04b0
KH
5453 d = dfail;
5454 goto fail;
e934739e 5455 }
6fdd04b0
KH
5456 d++;
5457 }
5458 while (--mcnt);
fa9a63c5 5459 else
6fdd04b0
KH
5460 do
5461 {
5462 PREFETCH ();
5463 if (*d++ != *p++)
bf216479 5464 {
6fdd04b0
KH
5465 d = dfail;
5466 goto fail;
bf216479 5467 }
6fdd04b0
KH
5468 }
5469 while (--mcnt);
5470#else /* emacs */
5471 /* The cost of testing `translate' is comparatively small. */
cf9c99bc 5472 if (target_multibyte)
6fdd04b0
KH
5473 do
5474 {
5475 int pat_charlen, buf_charlen;
cf9c99bc 5476 int pat_ch, buf_ch;
e934739e 5477
6fdd04b0 5478 PREFETCH ();
cf9c99bc 5479 if (multibyte)
62a6e103 5480 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
cf9c99bc
KH
5481 else
5482 {
5483 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5484 pat_charlen = 1;
5485 }
62a6e103 5486 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
e934739e 5487
6fdd04b0 5488 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5489 {
6fdd04b0
KH
5490 d = dfail;
5491 goto fail;
e934739e 5492 }
bf216479 5493
6fdd04b0
KH
5494 p += pat_charlen;
5495 d += buf_charlen;
5496 mcnt -= pat_charlen;
5497 }
5498 while (mcnt > 0);
fa9a63c5 5499 else
6fdd04b0
KH
5500 do
5501 {
cf9c99bc
KH
5502 int pat_charlen, buf_charlen;
5503 int pat_ch, buf_ch;
bf216479 5504
6fdd04b0 5505 PREFETCH ();
cf9c99bc
KH
5506 if (multibyte)
5507 {
62a6e103 5508 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
2afc21f5 5509 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
cf9c99bc
KH
5510 }
5511 else
5512 {
5513 pat_ch = *p;
5514 pat_charlen = 1;
5515 }
5516 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5517 if (! CHAR_BYTE8_P (buf_ch))
5518 {
5519 buf_ch = TRANSLATE (buf_ch);
5520 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5521 if (buf_ch < 0)
5522 buf_ch = *d;
5523 }
0e2501ed
AS
5524 else
5525 buf_ch = *d;
cf9c99bc 5526 if (buf_ch != pat_ch)
6fdd04b0
KH
5527 {
5528 d = dfail;
5529 goto fail;
bf216479 5530 }
cf9c99bc
KH
5531 p += pat_charlen;
5532 d++;
6fdd04b0
KH
5533 }
5534 while (--mcnt);
5535#endif
25fe55af 5536 break;
fa9a63c5
RM
5537
5538
25fe55af 5539 /* Match any character except possibly a newline or a null. */
fa9a63c5 5540 case anychar:
e934739e
RS
5541 {
5542 int buf_charlen;
01618498 5543 re_wchar_t buf_ch;
fa9a63c5 5544
e934739e 5545 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5546
e934739e 5547 PREFETCH ();
62a6e103 5548 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
cf9c99bc 5549 target_multibyte);
e934739e
RS
5550 buf_ch = TRANSLATE (buf_ch);
5551
5552 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5553 && buf_ch == '\n')
5554 || ((bufp->syntax & RE_DOT_NOT_NULL)
5555 && buf_ch == '\000'))
5556 goto fail;
5557
e934739e
RS
5558 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5559 d += buf_charlen;
5560 }
fa9a63c5
RM
5561 break;
5562
5563
5564 case charset:
5565 case charset_not:
5566 {
b18215fc 5567 register unsigned int c;
fa9a63c5 5568 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5569 int len;
5570
5571 /* Start of actual range_table, or end of bitmap if there is no
5572 range table. */
01618498 5573 re_char *range_table;
b18215fc 5574
96cc36cc 5575 /* Nonzero if there is a range table. */
b18215fc
RS
5576 int range_table_exists;
5577
96cc36cc
RS
5578 /* Number of ranges of range table. This is not included
5579 in the initial byte-length of the command. */
5580 int count = 0;
fa9a63c5 5581
f5020181
AS
5582 /* Whether matching against a unibyte character. */
5583 boolean unibyte_char = false;
5584
25fe55af 5585 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5586
b18215fc 5587 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5588
b18215fc 5589 if (range_table_exists)
96cc36cc
RS
5590 {
5591 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5592 EXTRACT_NUMBER_AND_INCR (count, range_table);
5593 }
b18215fc 5594
2d1675e4 5595 PREFETCH ();
62a6e103 5596 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
cf9c99bc
KH
5597 if (target_multibyte)
5598 {
5599 int c1;
b18215fc 5600
cf9c99bc
KH
5601 c = TRANSLATE (c);
5602 c1 = RE_CHAR_TO_UNIBYTE (c);
5603 if (c1 >= 0)
f5020181
AS
5604 {
5605 unibyte_char = true;
5606 c = c1;
5607 }
cf9c99bc
KH
5608 }
5609 else
5610 {
5611 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5612
5613 if (! CHAR_BYTE8_P (c1))
5614 {
5615 c1 = TRANSLATE (c1);
5616 c1 = RE_CHAR_TO_UNIBYTE (c1);
5617 if (c1 >= 0)
f5020181
AS
5618 {
5619 unibyte_char = true;
5620 c = c1;
5621 }
cf9c99bc 5622 }
0b8be006
AS
5623 else
5624 unibyte_char = true;
cf9c99bc
KH
5625 }
5626
f5020181 5627 if (unibyte_char && c < (1 << BYTEWIDTH))
b18215fc 5628 { /* Lookup bitmap. */
b18215fc
RS
5629 /* Cast to `unsigned' instead of `unsigned char' in
5630 case the bit list is a full 32 bytes long. */
5631 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5632 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5633 not = !not;
b18215fc 5634 }
96cc36cc 5635#ifdef emacs
b18215fc 5636 else if (range_table_exists)
96cc36cc
RS
5637 {
5638 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5639
14473664
SM
5640 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5641 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5642 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5643 | (class_bits & BIT_SPACE && ISSPACE (c))
5644 | (class_bits & BIT_UPPER && ISUPPER (c))
5645 | (class_bits & BIT_WORD && ISWORD (c)))
5646 not = !not;
5647 else
5648 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5649 }
5650#endif /* emacs */
fa9a63c5 5651
96cc36cc
RS
5652 if (range_table_exists)
5653 p = CHARSET_RANGE_TABLE_END (range_table, count);
5654 else
5655 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5656
5657 if (!not) goto fail;
5e69f11e 5658
b18215fc 5659 d += len;
fa9a63c5
RM
5660 break;
5661 }
5662
5663
25fe55af 5664 /* The beginning of a group is represented by start_memory.
505bde11 5665 The argument is the register number. The text
25fe55af 5666 matched within the group is recorded (in the internal
7814e705 5667 registers data structure) under the register number. */
25fe55af 5668 case start_memory:
505bde11
SM
5669 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5670
5671 /* In case we need to undo this operation (via backtracking). */
5672 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5673
25fe55af 5674 regstart[*p] = d;
4bb91c68 5675 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5676 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5677
25fe55af 5678 /* Move past the register number. */
505bde11 5679 p += 1;
25fe55af 5680 break;
fa9a63c5
RM
5681
5682
25fe55af 5683 /* The stop_memory opcode represents the end of a group. Its
505bde11 5684 argument is the same as start_memory's: the register number. */
fa9a63c5 5685 case stop_memory:
505bde11
SM
5686 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5687
5688 assert (!REG_UNSET (regstart[*p]));
5689 /* Strictly speaking, there should be code such as:
177c0ea7 5690
0b32bf0e 5691 assert (REG_UNSET (regend[*p]));
505bde11
SM
5692 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5693
5694 But the only info to be pushed is regend[*p] and it is known to
5695 be UNSET, so there really isn't anything to push.
5696 Not pushing anything, on the other hand, deprives us of the
5697 guarantee that regend[*p] is UNSET since undoing this operation
5698 will not reset its value properly. This is not important since
5699 the value will only be read on the next start_memory or at
5700 the very end and both events can only happen if this stop_memory
5701 is *not* undone. */
fa9a63c5 5702
25fe55af 5703 regend[*p] = d;
fa9a63c5
RM
5704 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5705
25fe55af 5706 /* Move past the register number. */
505bde11 5707 p += 1;
25fe55af 5708 break;
fa9a63c5
RM
5709
5710
5711 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5712 followed by the numeric value of <digit> as the register number. */
5713 case duplicate:
fa9a63c5 5714 {
66f0296e 5715 register re_char *d2, *dend2;
7814e705 5716 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5717 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5718
7814e705 5719 /* Can't back reference a group which we've never matched. */
25fe55af
RS
5720 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5721 goto fail;
5e69f11e 5722
7814e705 5723 /* Where in input to try to start matching. */
25fe55af 5724 d2 = regstart[regno];
5e69f11e 5725
99633e97
SM
5726 /* Remember the start point to rollback upon failure. */
5727 dfail = d;
5728
25fe55af
RS
5729 /* Where to stop matching; if both the place to start and
5730 the place to stop matching are in the same string, then
5731 set to the place to stop, otherwise, for now have to use
5732 the end of the first string. */
fa9a63c5 5733
25fe55af 5734 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5735 == FIRST_STRING_P (regend[regno]))
5736 ? regend[regno] : end_match_1);
5737 for (;;)
5738 {
5739 /* If necessary, advance to next segment in register
25fe55af 5740 contents. */
fa9a63c5
RM
5741 while (d2 == dend2)
5742 {
5743 if (dend2 == end_match_2) break;
5744 if (dend2 == regend[regno]) break;
5745
25fe55af
RS
5746 /* End of string1 => advance to string2. */
5747 d2 = string2;
5748 dend2 = regend[regno];
fa9a63c5
RM
5749 }
5750 /* At end of register contents => success */
5751 if (d2 == dend2) break;
5752
5753 /* If necessary, advance to next segment in data. */
5754 PREFETCH ();
5755
5756 /* How many characters left in this segment to match. */
5757 mcnt = dend - d;
5e69f11e 5758
fa9a63c5 5759 /* Want how many consecutive characters we can match in
25fe55af
RS
5760 one shot, so, if necessary, adjust the count. */
5761 if (mcnt > dend2 - d2)
fa9a63c5 5762 mcnt = dend2 - d2;
5e69f11e 5763
fa9a63c5 5764 /* Compare that many; failure if mismatch, else move
25fe55af 5765 past them. */
28703c16 5766 if (RE_TRANSLATE_P (translate)
02cb78b5 5767 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
4bb91c68 5768 : memcmp (d, d2, mcnt))
99633e97
SM
5769 {
5770 d = dfail;
5771 goto fail;
5772 }
fa9a63c5 5773 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5774 }
5775 }
5776 break;
5777
5778
25fe55af 5779 /* begline matches the empty string at the beginning of the string
c0f9ea08 5780 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5781 case begline:
25fe55af 5782 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5783
25fe55af
RS
5784 if (AT_STRINGS_BEG (d))
5785 {
5786 if (!bufp->not_bol) break;
5787 }
419d1c74 5788 else
25fe55af 5789 {
bf216479 5790 unsigned c;
419d1c74 5791 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5792 if (c == '\n')
419d1c74 5793 break;
25fe55af
RS
5794 }
5795 /* In all other cases, we fail. */
5796 goto fail;
fa9a63c5
RM
5797
5798
25fe55af 5799 /* endline is the dual of begline. */
fa9a63c5 5800 case endline:
25fe55af 5801 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5802
25fe55af
RS
5803 if (AT_STRINGS_END (d))
5804 {
5805 if (!bufp->not_eol) break;
5806 }
f1ad044f 5807 else
25fe55af 5808 {
f1ad044f 5809 PREFETCH_NOLIMIT ();
c0f9ea08 5810 if (*d == '\n')
f1ad044f 5811 break;
25fe55af
RS
5812 }
5813 goto fail;
fa9a63c5
RM
5814
5815
5816 /* Match at the very beginning of the data. */
25fe55af
RS
5817 case begbuf:
5818 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5819 if (AT_STRINGS_BEG (d))
5820 break;
5821 goto fail;
fa9a63c5
RM
5822
5823
5824 /* Match at the very end of the data. */
25fe55af
RS
5825 case endbuf:
5826 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5827 if (AT_STRINGS_END (d))
5828 break;
25fe55af 5829 goto fail;
5e69f11e 5830
5e69f11e 5831
25fe55af
RS
5832 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5833 pushes NULL as the value for the string on the stack. Then
505bde11 5834 `POP_FAILURE_POINT' will keep the current value for the
25fe55af 5835 string, instead of restoring it. To see why, consider
7814e705 5836 matching `foo\nbar' against `.*\n'. The .* matches the foo;
25fe55af
RS
5837 then the . fails against the \n. But the next thing we want
5838 to do is match the \n against the \n; if we restored the
5839 string value, we would be back at the foo.
5840
5841 Because this is used only in specific cases, we don't need to
5842 check all the things that `on_failure_jump' does, to make
5843 sure the right things get saved on the stack. Hence we don't
5844 share its code. The only reason to push anything on the
5845 stack at all is that otherwise we would have to change
5846 `anychar's code to do something besides goto fail in this
5847 case; that seems worse than this. */
5848 case on_failure_keep_string_jump:
505bde11
SM
5849 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5850 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5851 mcnt, p + mcnt);
fa9a63c5 5852
505bde11
SM
5853 PUSH_FAILURE_POINT (p - 3, NULL);
5854 break;
5855
0683b6fa
SM
5856 /* A nasty loop is introduced by the non-greedy *? and +?.
5857 With such loops, the stack only ever contains one failure point
5858 at a time, so that a plain on_failure_jump_loop kind of
5859 cycle detection cannot work. Worse yet, such a detection
5860 can not only fail to detect a cycle, but it can also wrongly
5861 detect a cycle (between different instantiations of the same
6df42991 5862 loop).
0683b6fa
SM
5863 So the method used for those nasty loops is a little different:
5864 We use a special cycle-detection-stack-frame which is pushed
5865 when the on_failure_jump_nastyloop failure-point is *popped*.
5866 This special frame thus marks the beginning of one iteration
5867 through the loop and we can hence easily check right here
5868 whether something matched between the beginning and the end of
5869 the loop. */
5870 case on_failure_jump_nastyloop:
5871 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5872 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5873 mcnt, p + mcnt);
5874
5875 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5876 {
5877 int cycle = 0;
5878 CHECK_INFINITE_LOOP (p - 4, d);
5879 if (!cycle)
5880 /* If there's a cycle, just continue without pushing
5881 this failure point. The failure point is the "try again"
5882 option, which shouldn't be tried.
5883 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5884 PUSH_FAILURE_POINT (p - 3, d);
5885 }
0683b6fa
SM
5886 break;
5887
4e8a9132
SM
5888 /* Simple loop detecting on_failure_jump: just check on the
5889 failure stack if the same spot was already hit earlier. */
505bde11
SM
5890 case on_failure_jump_loop:
5891 on_failure:
5892 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5893 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5894 mcnt, p + mcnt);
6df42991
SM
5895 {
5896 int cycle = 0;
5897 CHECK_INFINITE_LOOP (p - 3, d);
5898 if (cycle)
5899 /* If there's a cycle, get out of the loop, as if the matching
5900 had failed. We used to just `goto fail' here, but that was
5901 aborting the search a bit too early: we want to keep the
5902 empty-loop-match and keep matching after the loop.
5903 We want (x?)*y\1z to match both xxyz and xxyxz. */
5904 p += mcnt;
5905 else
5906 PUSH_FAILURE_POINT (p - 3, d);
5907 }
25fe55af 5908 break;
fa9a63c5
RM
5909
5910
5911 /* Uses of on_failure_jump:
5e69f11e 5912
25fe55af
RS
5913 Each alternative starts with an on_failure_jump that points
5914 to the beginning of the next alternative. Each alternative
5915 except the last ends with a jump that in effect jumps past
5916 the rest of the alternatives. (They really jump to the
5917 ending jump of the following alternative, because tensioning
5918 these jumps is a hassle.)
fa9a63c5 5919
25fe55af
RS
5920 Repeats start with an on_failure_jump that points past both
5921 the repetition text and either the following jump or
5922 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5923 case on_failure_jump:
25fe55af 5924 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5925 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5926 mcnt, p + mcnt);
25fe55af 5927
505bde11 5928 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5929 break;
5930
4e8a9132 5931 /* This operation is used for greedy *.
505bde11
SM
5932 Compare the beginning of the repeat with what in the
5933 pattern follows its end. If we can establish that there
5934 is nothing that they would both match, i.e., that we
5935 would have to backtrack because of (as in, e.g., `a*a')
5936 then we can use a non-backtracking loop based on
4e8a9132 5937 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5938 case on_failure_jump_smart:
25fe55af 5939 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5940 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5941 mcnt, p + mcnt);
25fe55af 5942 {
01618498 5943 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5944 /* Here, we discard `const', making re_match non-reentrant. */
5945 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5946 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5947
505bde11
SM
5948 p -= 3; /* Reset so that we will re-execute the
5949 instruction once it's been changed. */
fa9a63c5 5950
4e8a9132
SM
5951 EXTRACT_NUMBER (mcnt, p2 - 2);
5952
5953 /* Ensure this is indeed the trivial kind of loop
5954 we are expecting. */
5955 assert (skip_one_char (p1) == p2 - 3);
5956 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5957 DEBUG_STATEMENT (debug += 2);
505bde11 5958 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5959 {
505bde11 5960 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5961 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5962 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5963 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5964 }
505bde11 5965 else
fa9a63c5 5966 {
505bde11
SM
5967 /* Default to a safe `on_failure_jump' loop. */
5968 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5969 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5970 }
99633e97 5971 DEBUG_STATEMENT (debug -= 2);
25fe55af 5972 }
505bde11 5973 break;
25fe55af
RS
5974
5975 /* Unconditionally jump (without popping any failure points). */
5976 case jump:
fa9a63c5 5977 unconditional_jump:
5b370c2b 5978 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5979 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af 5980 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7814e705 5981 p += mcnt; /* Do the jump. */
505bde11 5982 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5983 break;
5984
5985
25fe55af
RS
5986 /* Have to succeed matching what follows at least n times.
5987 After that, handle like `on_failure_jump'. */
5988 case succeed_n:
01618498 5989 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5990 EXTRACT_NUMBER (mcnt, p + 2);
5991 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5992
dc1e502d
SM
5993 /* Originally, mcnt is how many times we HAVE to succeed. */
5994 if (mcnt != 0)
25fe55af 5995 {
6dcf2d0e
SM
5996 /* Here, we discard `const', making re_match non-reentrant. */
5997 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5998 mcnt--;
01618498
SM
5999 p += 4;
6000 PUSH_NUMBER (p2, mcnt);
25fe55af 6001 }
dc1e502d
SM
6002 else
6003 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
6004 goto on_failure;
25fe55af
RS
6005 break;
6006
6007 case jump_n:
01618498 6008 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
6009 EXTRACT_NUMBER (mcnt, p + 2);
6010 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
6011
6012 /* Originally, this is how many times we CAN jump. */
dc1e502d 6013 if (mcnt != 0)
25fe55af 6014 {
6dcf2d0e
SM
6015 /* Here, we discard `const', making re_match non-reentrant. */
6016 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 6017 mcnt--;
01618498 6018 PUSH_NUMBER (p2, mcnt);
dc1e502d 6019 goto unconditional_jump;
25fe55af
RS
6020 }
6021 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
6022 else
6023 p += 4;
25fe55af 6024 break;
5e69f11e 6025
fa9a63c5
RM
6026 case set_number_at:
6027 {
01618498 6028 unsigned char *p2; /* Location of the counter. */
25fe55af 6029 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 6030
25fe55af 6031 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
6032 /* Here, we discard `const', making re_match non-reentrant. */
6033 p2 = (unsigned char*) p + mcnt;
01618498 6034 /* Signedness doesn't matter since we only copy MCNT's bits. */
25fe55af 6035 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
6036 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6037 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
6038 break;
6039 }
9121ca40
KH
6040
6041 case wordbound:
66f0296e
SM
6042 case notwordbound:
6043 not = (re_opcode_t) *(p - 1) == notwordbound;
6044 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 6045
99633e97 6046 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 6047
b18215fc 6048 /* Case 1: D is at the beginning or the end of string. */
9121ca40 6049 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 6050 not = !not;
b18215fc
RS
6051 else
6052 {
6053 /* C1 is the character before D, S1 is the syntax of C1, C2
6054 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6055 re_wchar_t c1, c2;
6056 int s1, s2;
bf216479 6057 int dummy;
b18215fc 6058#ifdef emacs
2d1675e4
SM
6059 int offset = PTR_TO_OFFSET (d - 1);
6060 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 6061 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6062#endif
66f0296e 6063 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6064 s1 = SYNTAX (c1);
6065#ifdef emacs
5d967c7a 6066 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 6067#endif
f1ad044f 6068 PREFETCH_NOLIMIT ();
6fdd04b0 6069 GET_CHAR_AFTER (c2, d, dummy);
b18215fc
RS
6070 s2 = SYNTAX (c2);
6071
6072 if (/* Case 2: Only one of S1 and S2 is Sword. */
6073 ((s1 == Sword) != (s2 == Sword))
6074 /* Case 3: Both of S1 and S2 are Sword, and macro
7814e705 6075 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 6076 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
6077 not = !not;
6078 }
6079 if (not)
9121ca40 6080 break;
b18215fc 6081 else
9121ca40 6082 goto fail;
fa9a63c5
RM
6083
6084 case wordbeg:
25fe55af 6085 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 6086
b18215fc
RS
6087 /* We FAIL in one of the following cases: */
6088
7814e705 6089 /* Case 1: D is at the end of string. */
b18215fc 6090 if (AT_STRINGS_END (d))
99633e97 6091 goto fail;
b18215fc
RS
6092 else
6093 {
6094 /* C1 is the character before D, S1 is the syntax of C1, C2
6095 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6096 re_wchar_t c1, c2;
6097 int s1, s2;
bf216479 6098 int dummy;
fa9a63c5 6099#ifdef emacs
2d1675e4
SM
6100 int offset = PTR_TO_OFFSET (d);
6101 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6102 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 6103#endif
99633e97 6104 PREFETCH ();
6fdd04b0 6105 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 6106 s2 = SYNTAX (c2);
177c0ea7 6107
b18215fc
RS
6108 /* Case 2: S2 is not Sword. */
6109 if (s2 != Sword)
6110 goto fail;
6111
6112 /* Case 3: D is not at the beginning of string ... */
6113 if (!AT_STRINGS_BEG (d))
6114 {
6115 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6116#ifdef emacs
5d967c7a 6117 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 6118#endif
b18215fc
RS
6119 s1 = SYNTAX (c1);
6120
6121 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6122 returns 0. */
b18215fc
RS
6123 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6124 goto fail;
6125 }
6126 }
e318085a
RS
6127 break;
6128
b18215fc 6129 case wordend:
25fe55af 6130 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
6131
6132 /* We FAIL in one of the following cases: */
6133
6134 /* Case 1: D is at the beginning of string. */
6135 if (AT_STRINGS_BEG (d))
e318085a 6136 goto fail;
b18215fc
RS
6137 else
6138 {
6139 /* C1 is the character before D, S1 is the syntax of C1, C2
6140 is the character at D, and S2 is the syntax of C2. */
01618498
SM
6141 re_wchar_t c1, c2;
6142 int s1, s2;
bf216479 6143 int dummy;
5d967c7a 6144#ifdef emacs
2d1675e4
SM
6145 int offset = PTR_TO_OFFSET (d) - 1;
6146 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 6147 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 6148#endif
99633e97 6149 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
6150 s1 = SYNTAX (c1);
6151
6152 /* Case 2: S1 is not Sword. */
6153 if (s1 != Sword)
6154 goto fail;
6155
6156 /* Case 3: D is not at the end of string ... */
6157 if (!AT_STRINGS_END (d))
6158 {
f1ad044f 6159 PREFETCH_NOLIMIT ();
6fdd04b0 6160 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
6161#ifdef emacs
6162 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6163#endif
b18215fc
RS
6164 s2 = SYNTAX (c2);
6165
6166 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
7814e705 6167 returns 0. */
b18215fc 6168 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 6169 goto fail;
b18215fc
RS
6170 }
6171 }
e318085a
RS
6172 break;
6173
669fa600
SM
6174 case symbeg:
6175 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6176
6177 /* We FAIL in one of the following cases: */
6178
7814e705 6179 /* Case 1: D is at the end of string. */
669fa600
SM
6180 if (AT_STRINGS_END (d))
6181 goto fail;
6182 else
6183 {
6184 /* C1 is the character before D, S1 is the syntax of C1, C2
6185 is the character at D, and S2 is the syntax of C2. */
6186 re_wchar_t c1, c2;
6187 int s1, s2;
6188#ifdef emacs
6189 int offset = PTR_TO_OFFSET (d);
6190 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6191 UPDATE_SYNTAX_TABLE (charpos);
6192#endif
6193 PREFETCH ();
62a6e103 6194 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6195 s2 = SYNTAX (c2);
7814e705 6196
669fa600
SM
6197 /* Case 2: S2 is neither Sword nor Ssymbol. */
6198 if (s2 != Sword && s2 != Ssymbol)
6199 goto fail;
6200
6201 /* Case 3: D is not at the beginning of string ... */
6202 if (!AT_STRINGS_BEG (d))
6203 {
6204 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6205#ifdef emacs
6206 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6207#endif
6208 s1 = SYNTAX (c1);
6209
6210 /* ... and S1 is Sword or Ssymbol. */
6211 if (s1 == Sword || s1 == Ssymbol)
6212 goto fail;
6213 }
6214 }
6215 break;
6216
6217 case symend:
6218 DEBUG_PRINT1 ("EXECUTING symend.\n");
6219
6220 /* We FAIL in one of the following cases: */
6221
6222 /* Case 1: D is at the beginning of string. */
6223 if (AT_STRINGS_BEG (d))
6224 goto fail;
6225 else
6226 {
6227 /* C1 is the character before D, S1 is the syntax of C1, C2
6228 is the character at D, and S2 is the syntax of C2. */
6229 re_wchar_t c1, c2;
6230 int s1, s2;
6231#ifdef emacs
6232 int offset = PTR_TO_OFFSET (d) - 1;
6233 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6234 UPDATE_SYNTAX_TABLE (charpos);
6235#endif
6236 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6237 s1 = SYNTAX (c1);
6238
6239 /* Case 2: S1 is neither Ssymbol nor Sword. */
6240 if (s1 != Sword && s1 != Ssymbol)
6241 goto fail;
6242
6243 /* Case 3: D is not at the end of string ... */
6244 if (!AT_STRINGS_END (d))
6245 {
6246 PREFETCH_NOLIMIT ();
62a6e103 6247 c2 = RE_STRING_CHAR (d, target_multibyte);
669fa600 6248#ifdef emacs
134579f2 6249 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
669fa600
SM
6250#endif
6251 s2 = SYNTAX (c2);
6252
6253 /* ... and S2 is Sword or Ssymbol. */
6254 if (s2 == Sword || s2 == Ssymbol)
6255 goto fail;
b18215fc
RS
6256 }
6257 }
e318085a
RS
6258 break;
6259
fa9a63c5 6260 case syntaxspec:
1fb352e0
SM
6261 case notsyntaxspec:
6262 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 6263 mcnt = *p++;
1fb352e0 6264 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 6265 PREFETCH ();
b18215fc
RS
6266#ifdef emacs
6267 {
2d1675e4
SM
6268 int offset = PTR_TO_OFFSET (d);
6269 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
6270 UPDATE_SYNTAX_TABLE (pos1);
6271 }
25fe55af 6272#endif
b18215fc 6273 {
01618498
SM
6274 int len;
6275 re_wchar_t c;
b18215fc 6276
6fdd04b0 6277 GET_CHAR_AFTER (c, d, len);
990b2375 6278 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 6279 goto fail;
b18215fc
RS
6280 d += len;
6281 }
fa9a63c5
RM
6282 break;
6283
b18215fc 6284#ifdef emacs
1fb352e0
SM
6285 case before_dot:
6286 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6287 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 6288 goto fail;
b18215fc
RS
6289 break;
6290
1fb352e0
SM
6291 case at_dot:
6292 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6293 if (PTR_BYTE_POS (d) != PT_BYTE)
6294 goto fail;
6295 break;
b18215fc 6296
1fb352e0
SM
6297 case after_dot:
6298 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6299 if (PTR_BYTE_POS (d) <= PT_BYTE)
6300 goto fail;
e318085a 6301 break;
fa9a63c5 6302
1fb352e0 6303 case categoryspec:
b18215fc 6304 case notcategoryspec:
1fb352e0 6305 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 6306 mcnt = *p++;
1fb352e0 6307 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6308 PREFETCH ();
6309 {
01618498
SM
6310 int len;
6311 re_wchar_t c;
6312
6fdd04b0 6313 GET_CHAR_AFTER (c, d, len);
1fb352e0 6314 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6315 goto fail;
6316 d += len;
6317 }
fa9a63c5 6318 break;
5e69f11e 6319
1fb352e0 6320#endif /* emacs */
5e69f11e 6321
0b32bf0e
SM
6322 default:
6323 abort ();
fa9a63c5 6324 }
b18215fc 6325 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6326
6327
6328 /* We goto here if a matching operation fails. */
6329 fail:
5b370c2b 6330 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6331 if (!FAIL_STACK_EMPTY ())
505bde11 6332 {
01618498 6333 re_char *str, *pat;
505bde11 6334 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6335 DEBUG_PRINT1 ("\nFAIL:\n");
6336 POP_FAILURE_POINT (str, pat);
505bde11
SM
6337 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6338 {
6339 case on_failure_keep_string_jump:
6340 assert (str == NULL);
6341 goto continue_failure_jump;
6342
0683b6fa
SM
6343 case on_failure_jump_nastyloop:
6344 assert ((re_opcode_t)pat[-2] == no_op);
6345 PUSH_FAILURE_POINT (pat - 2, str);
6346 /* Fallthrough */
6347
505bde11
SM
6348 case on_failure_jump_loop:
6349 case on_failure_jump:
6350 case succeed_n:
6351 d = str;
6352 continue_failure_jump:
6353 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6354 p = pat + mcnt;
6355 break;
b18215fc 6356
0683b6fa
SM
6357 case no_op:
6358 /* A special frame used for nastyloops. */
6359 goto fail;
6360
505bde11
SM
6361 default:
6362 abort();
6363 }
fa9a63c5 6364
505bde11 6365 assert (p >= bufp->buffer && p <= pend);
b18215fc 6366
0b32bf0e 6367 if (d >= string1 && d <= end1)
fa9a63c5 6368 dend = end_match_1;
0b32bf0e 6369 }
fa9a63c5 6370 else
0b32bf0e 6371 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6372 } /* for (;;) */
6373
6374 if (best_regs_set)
6375 goto restore_best_regs;
6376
6377 FREE_VARIABLES ();
6378
b18215fc 6379 return -1; /* Failure to match. */
fa9a63c5
RM
6380} /* re_match_2 */
6381\f
6382/* Subroutine definitions for re_match_2. */
6383
fa9a63c5
RM
6384/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6385 bytes; nonzero otherwise. */
5e69f11e 6386
fa9a63c5 6387static int
971de7fb 6388bcmp_translate (const re_char *s1, const re_char *s2, register int len, Lisp_Object translate, const const int target_multibyte)
fa9a63c5 6389{
2d1675e4
SM
6390 register re_char *p1 = s1, *p2 = s2;
6391 re_char *p1_end = s1 + len;
6392 re_char *p2_end = s2 + len;
e934739e 6393
4bb91c68
SM
6394 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6395 different lengths, but relying on a single `len' would break this. -sm */
6396 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6397 {
e934739e 6398 int p1_charlen, p2_charlen;
01618498 6399 re_wchar_t p1_ch, p2_ch;
e934739e 6400
6fdd04b0
KH
6401 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6402 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6403
6404 if (RE_TRANSLATE (translate, p1_ch)
6405 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6406 return 1;
e934739e
RS
6407
6408 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6409 }
e934739e
RS
6410
6411 if (p1 != p1_end || p2 != p2_end)
6412 return 1;
6413
fa9a63c5
RM
6414 return 0;
6415}
6416\f
6417/* Entry points for GNU code. */
6418
6419/* re_compile_pattern is the GNU regular expression compiler: it
6420 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6421 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6422
fa9a63c5
RM
6423 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6424 are set in BUFP on entry.
5e69f11e 6425
b18215fc 6426 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6427
6428const char *
971de7fb 6429re_compile_pattern (const char *pattern, size_t length, struct re_pattern_buffer *bufp)
fa9a63c5
RM
6430{
6431 reg_errcode_t ret;
5e69f11e 6432
fa9a63c5
RM
6433 /* GNU code is written to assume at least RE_NREGS registers will be set
6434 (and at least one extra will be -1). */
6435 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6436
fa9a63c5
RM
6437 /* And GNU code determines whether or not to get register information
6438 by passing null for the REGS argument to re_match, etc., not by
6439 setting no_sub. */
6440 bufp->no_sub = 0;
5e69f11e 6441
4bb91c68 6442 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6443
6444 if (!ret)
6445 return NULL;
6446 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6447}
c0f9ea08 6448WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6449\f
b18215fc
RS
6450/* Entry points compatible with 4.2 BSD regex library. We don't define
6451 them unless specifically requested. */
fa9a63c5 6452
0b32bf0e 6453#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6454
6455/* BSD has one and only one pattern buffer. */
6456static struct re_pattern_buffer re_comp_buf;
6457
6458char *
0b32bf0e 6459# ifdef _LIBC
48afdd44
RM
6460/* Make these definitions weak in libc, so POSIX programs can redefine
6461 these names if they don't use our functions, and still use
6462 regcomp/regexec below without link errors. */
6463weak_function
0b32bf0e 6464# endif
fa9a63c5
RM
6465re_comp (s)
6466 const char *s;
6467{
6468 reg_errcode_t ret;
5e69f11e 6469
fa9a63c5
RM
6470 if (!s)
6471 {
6472 if (!re_comp_buf.buffer)
0b32bf0e 6473 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6474 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6475 return 0;
6476 }
6477
6478 if (!re_comp_buf.buffer)
6479 {
6480 re_comp_buf.buffer = (unsigned char *) malloc (200);
6481 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6482 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6483 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6484 re_comp_buf.allocated = 200;
6485
6486 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6487 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6488 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6489 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6490 }
6491
6492 /* Since `re_exec' always passes NULL for the `regs' argument, we
6493 don't need to initialize the pattern buffer fields which affect it. */
6494
fa9a63c5 6495 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6496
fa9a63c5
RM
6497 if (!ret)
6498 return NULL;
6499
6500 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6501 return (char *) gettext (re_error_msgid[(int) ret]);
6502}
6503
6504
6505int
0b32bf0e 6506# ifdef _LIBC
48afdd44 6507weak_function
0b32bf0e 6508# endif
fa9a63c5
RM
6509re_exec (s)
6510 const char *s;
6511{
6512 const int len = strlen (s);
6513 return
6514 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6515}
6516#endif /* _REGEX_RE_COMP */
6517\f
6518/* POSIX.2 functions. Don't define these for Emacs. */
6519
6520#ifndef emacs
6521
6522/* regcomp takes a regular expression as a string and compiles it.
6523
b18215fc 6524 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6525 since POSIX says we shouldn't. Thus, we set
6526
6527 `buffer' to the compiled pattern;
6528 `used' to the length of the compiled pattern;
6529 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6530 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6531 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6532 `fastmap' to an allocated space for the fastmap;
6533 `fastmap_accurate' to zero;
fa9a63c5
RM
6534 `re_nsub' to the number of subexpressions in PATTERN.
6535
6536 PATTERN is the address of the pattern string.
6537
6538 CFLAGS is a series of bits which affect compilation.
6539
6540 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6541 use POSIX basic syntax.
6542
6543 If REG_NEWLINE is set, then . and [^...] don't match newline.
6544 Also, regexec will try a match beginning after every newline.
6545
6546 If REG_ICASE is set, then we considers upper- and lowercase
6547 versions of letters to be equivalent when matching.
6548
6549 If REG_NOSUB is set, then when PREG is passed to regexec, that
6550 routine will report only success or failure, and nothing about the
6551 registers.
6552
b18215fc 6553 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6554 the return codes and their meanings.) */
6555
6556int
6557regcomp (preg, pattern, cflags)
ada30c0e
SM
6558 regex_t *__restrict preg;
6559 const char *__restrict pattern;
fa9a63c5
RM
6560 int cflags;
6561{
6562 reg_errcode_t ret;
4bb91c68 6563 reg_syntax_t syntax
fa9a63c5
RM
6564 = (cflags & REG_EXTENDED) ?
6565 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6566
6567 /* regex_compile will allocate the space for the compiled pattern. */
6568 preg->buffer = 0;
6569 preg->allocated = 0;
6570 preg->used = 0;
5e69f11e 6571
c0f9ea08
SM
6572 /* Try to allocate space for the fastmap. */
6573 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6574
fa9a63c5
RM
6575 if (cflags & REG_ICASE)
6576 {
6577 unsigned i;
5e69f11e 6578
6676cb1c
RS
6579 preg->translate
6580 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6581 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6582 if (preg->translate == NULL)
0b32bf0e 6583 return (int) REG_ESPACE;
fa9a63c5
RM
6584
6585 /* Map uppercase characters to corresponding lowercase ones. */
6586 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6587 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6588 }
6589 else
6590 preg->translate = NULL;
6591
6592 /* If REG_NEWLINE is set, newlines are treated differently. */
6593 if (cflags & REG_NEWLINE)
6594 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6595 syntax &= ~RE_DOT_NEWLINE;
6596 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6597 }
6598 else
c0f9ea08 6599 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6600
6601 preg->no_sub = !!(cflags & REG_NOSUB);
6602
5e69f11e 6603 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6604 can use strlen here in compiling the pattern. */
4bb91c68 6605 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6606
fa9a63c5
RM
6607 /* POSIX doesn't distinguish between an unmatched open-group and an
6608 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6609 if (ret == REG_ERPAREN)
6610 ret = REG_EPAREN;
6611
6612 if (ret == REG_NOERROR && preg->fastmap)
6613 { /* Compute the fastmap now, since regexec cannot modify the pattern
6614 buffer. */
6615 re_compile_fastmap (preg);
6616 if (preg->can_be_null)
6617 { /* The fastmap can't be used anyway. */
6618 free (preg->fastmap);
6619 preg->fastmap = NULL;
6620 }
6621 }
fa9a63c5
RM
6622 return (int) ret;
6623}
c0f9ea08 6624WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6625
6626
6627/* regexec searches for a given pattern, specified by PREG, in the
6628 string STRING.
5e69f11e 6629
fa9a63c5 6630 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6631 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6632 least NMATCH elements, and we set them to the offsets of the
6633 corresponding matched substrings.
5e69f11e 6634
fa9a63c5
RM
6635 EFLAGS specifies `execution flags' which affect matching: if
6636 REG_NOTBOL is set, then ^ does not match at the beginning of the
6637 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6638
fa9a63c5
RM
6639 We return 0 if we find a match and REG_NOMATCH if not. */
6640
6641int
6642regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6643 const regex_t *__restrict preg;
6644 const char *__restrict string;
5e69f11e 6645 size_t nmatch;
9f2dbe01 6646 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6647 int eflags;
6648{
6649 int ret;
6650 struct re_registers regs;
6651 regex_t private_preg;
6652 int len = strlen (string);
c0f9ea08 6653 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6654
6655 private_preg = *preg;
5e69f11e 6656
fa9a63c5
RM
6657 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6658 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6659
fa9a63c5
RM
6660 /* The user has told us exactly how many registers to return
6661 information about, via `nmatch'. We have to pass that on to the
b18215fc 6662 matching routines. */
fa9a63c5 6663 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6664
fa9a63c5
RM
6665 if (want_reg_info)
6666 {
6667 regs.num_regs = nmatch;
4bb91c68
SM
6668 regs.start = TALLOC (nmatch * 2, regoff_t);
6669 if (regs.start == NULL)
0b32bf0e 6670 return (int) REG_NOMATCH;
4bb91c68 6671 regs.end = regs.start + nmatch;
fa9a63c5
RM
6672 }
6673
c0f9ea08
SM
6674 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6675 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6676 was a little bit longer but still only matching the real part.
6677 This works because the `endline' will check for a '\n' and will find a
6678 '\0', correctly deciding that this is not the end of a line.
6679 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6680 a convenient '\0' there. For all we know, the string could be preceded
6681 by '\n' which would throw things off. */
6682
fa9a63c5
RM
6683 /* Perform the searching operation. */
6684 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6685 /* start: */ 0, /* range: */ len,
6686 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6687
fa9a63c5
RM
6688 /* Copy the register information to the POSIX structure. */
6689 if (want_reg_info)
6690 {
6691 if (ret >= 0)
0b32bf0e
SM
6692 {
6693 unsigned r;
fa9a63c5 6694
0b32bf0e
SM
6695 for (r = 0; r < nmatch; r++)
6696 {
6697 pmatch[r].rm_so = regs.start[r];
6698 pmatch[r].rm_eo = regs.end[r];
6699 }
6700 }
fa9a63c5 6701
b18215fc 6702 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6703 free (regs.start);
fa9a63c5
RM
6704 }
6705
6706 /* We want zero return to mean success, unlike `re_search'. */
6707 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6708}
c0f9ea08 6709WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6710
6711
ec869672
JR
6712/* Returns a message corresponding to an error code, ERR_CODE, returned
6713 from either regcomp or regexec. We don't use PREG here.
6714
6715 ERR_CODE was previously called ERRCODE, but that name causes an
6716 error with msvc8 compiler. */
fa9a63c5
RM
6717
6718size_t
ec869672
JR
6719regerror (err_code, preg, errbuf, errbuf_size)
6720 int err_code;
fa9a63c5
RM
6721 const regex_t *preg;
6722 char *errbuf;
6723 size_t errbuf_size;
6724{
6725 const char *msg;
6726 size_t msg_size;
6727
ec869672
JR
6728 if (err_code < 0
6729 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6730 /* Only error codes returned by the rest of the code should be passed
b18215fc 6731 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6732 code generates an invalid error code, then the program has a bug.
6733 Dump core so we can fix it. */
6734 abort ();
6735
ec869672 6736 msg = gettext (re_error_msgid[err_code]);
fa9a63c5
RM
6737
6738 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6739
fa9a63c5
RM
6740 if (errbuf_size != 0)
6741 {
6742 if (msg_size > errbuf_size)
0b32bf0e
SM
6743 {
6744 strncpy (errbuf, msg, errbuf_size - 1);
6745 errbuf[errbuf_size - 1] = 0;
6746 }
fa9a63c5 6747 else
0b32bf0e 6748 strcpy (errbuf, msg);
fa9a63c5
RM
6749 }
6750
6751 return msg_size;
6752}
c0f9ea08 6753WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6754
6755
6756/* Free dynamically allocated space used by PREG. */
6757
6758void
6759regfree (preg)
6760 regex_t *preg;
6761{
c2cd06e6 6762 free (preg->buffer);
fa9a63c5 6763 preg->buffer = NULL;
5e69f11e 6764
fa9a63c5
RM
6765 preg->allocated = 0;
6766 preg->used = 0;
6767
c2cd06e6 6768 free (preg->fastmap);
fa9a63c5
RM
6769 preg->fastmap = NULL;
6770 preg->fastmap_accurate = 0;
6771
c2cd06e6 6772 free (preg->translate);
fa9a63c5
RM
6773 preg->translate = NULL;
6774}
c0f9ea08 6775WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6776
6777#endif /* not emacs */
839966f3
KH
6778
6779/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
6780 (do not change this comment) */