*** empty log message ***
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
4bb91c68 5 Copyright (C) 1993,94,95,96,97,98,99,2000 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
ba4a8e51 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25fe55af 20 USA. */
fa9a63c5 21
6df42991 22/* TODO:
505bde11 23 - structure the opcode space into opcode+flag.
dc1e502d 24 - merge with glibc's regex.[ch].
01618498 25 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
26 need to modify the compiled regexp so that re_match can be reentrant.
27 - get rid of on_failure_jump_smart by doing the optimization in re_comp
28 rather than at run-time, so that re_match can be reentrant.
01618498 29*/
505bde11 30
fa9a63c5 31/* AIX requires this to be the first thing in the file. */
0b32bf0e 32#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
33 #pragma alloca
34#endif
35
fa9a63c5 36#ifdef HAVE_CONFIG_H
0b32bf0e 37# include <config.h>
fa9a63c5
RM
38#endif
39
4bb91c68
SM
40#if defined STDC_HEADERS && !defined emacs
41# include <stddef.h>
42#else
43/* We need this for `regex.h', and perhaps for the Emacs include files. */
44# include <sys/types.h>
45#endif
fa9a63c5 46
14473664
SM
47/* Whether to use ISO C Amendment 1 wide char functions.
48 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
49#if defined _LIBC
50#define WIDE_CHAR_SUPPORT 1
51#else
14473664 52#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
53 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
54#endif
14473664
SM
55
56/* For platform which support the ISO C amendement 1 functionality we
57 support user defined character classes. */
a0ad02f7 58#if WIDE_CHAR_SUPPORT
14473664
SM
59/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
60# include <wchar.h>
61# include <wctype.h>
62#endif
63
c0f9ea08
SM
64#ifdef _LIBC
65/* We have to keep the namespace clean. */
66# define regfree(preg) __regfree (preg)
67# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
68# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
69# define regerror(errcode, preg, errbuf, errbuf_size) \
70 __regerror(errcode, preg, errbuf, errbuf_size)
71# define re_set_registers(bu, re, nu, st, en) \
72 __re_set_registers (bu, re, nu, st, en)
73# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
74 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
75# define re_match(bufp, string, size, pos, regs) \
76 __re_match (bufp, string, size, pos, regs)
77# define re_search(bufp, string, size, startpos, range, regs) \
78 __re_search (bufp, string, size, startpos, range, regs)
79# define re_compile_pattern(pattern, length, bufp) \
80 __re_compile_pattern (pattern, length, bufp)
81# define re_set_syntax(syntax) __re_set_syntax (syntax)
82# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
83 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
84# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
85
14473664
SM
86/* Make sure we call libc's function even if the user overrides them. */
87# define btowc __btowc
88# define iswctype __iswctype
89# define wctype __wctype
90
c0f9ea08
SM
91# define WEAK_ALIAS(a,b) weak_alias (a, b)
92
93/* We are also using some library internals. */
94# include <locale/localeinfo.h>
95# include <locale/elem-hash.h>
96# include <langinfo.h>
97#else
98# define WEAK_ALIAS(a,b)
99#endif
100
4bb91c68 101/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 102#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
103# include <libintl.h>
104#else
105# define gettext(msgid) (msgid)
106#endif
107
5e69f11e
RM
108#ifndef gettext_noop
109/* This define is so xgettext can find the internationalizable
110 strings. */
0b32bf0e 111# define gettext_noop(String) String
5e69f11e
RM
112#endif
113
fa9a63c5
RM
114/* The `emacs' switch turns on certain matching commands
115 that make sense only in Emacs. */
116#ifdef emacs
117
0b32bf0e
SM
118# include "lisp.h"
119# include "buffer.h"
b18215fc
RS
120
121/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 122# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 123
0b32bf0e 124# include "syntax.h"
9117d724 125# include "character.h"
0b32bf0e 126# include "category.h"
fa9a63c5 127
7689ef0b
EZ
128# ifdef malloc
129# undef malloc
130# endif
0b32bf0e 131# define malloc xmalloc
7689ef0b
EZ
132# ifdef realloc
133# undef realloc
134# endif
0b32bf0e 135# define realloc xrealloc
7689ef0b
EZ
136# ifdef free
137# undef free
138# endif
0b32bf0e 139# define free xfree
9abbd165 140
0b32bf0e
SM
141/* Converts the pointer to the char to BEG-based offset from the start. */
142# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
143# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
144
145# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
bf216479 146# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
0b32bf0e 147# define RE_STRING_CHAR(p, s) \
4e8a9132 148 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
0b32bf0e 149# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
2d1675e4
SM
150 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
151
6fdd04b0
KH
152/* Set C a (possibly converted to multibyte) character before P. P
153 points into a string which is the virtual concatenation of STR1
154 (which ends at END1) or STR2 (which ends at END2). */
bf216479
KH
155# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
156 do { \
6fdd04b0 157 if (multibyte) \
bf216479
KH
158 { \
159 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
160 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
161 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
162 c = STRING_CHAR (dtemp, (p) - dtemp); \
163 } \
164 else \
165 { \
166 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
6fdd04b0 167 MAKE_CHAR_MULTIBYTE (c); \
bf216479 168 } \
2d1675e4
SM
169 } while (0)
170
6fdd04b0
KH
171/* Set C a (possibly converted to multibyte) character at P, and set
172 LEN to the byte length of that character. */
173# define GET_CHAR_AFTER(c, p, len) \
174 do { \
175 if (multibyte) \
176 c = STRING_CHAR_AND_LENGTH (p, 0, len); \
177 else \
178 { \
179 c = *p; \
180 len = 1; \
181 MAKE_CHAR_MULTIBYTE (c); \
182 } \
8f924df7 183 } while (0)
4e8a9132 184
fa9a63c5
RM
185#else /* not emacs */
186
187/* If we are not linking with Emacs proper,
188 we can't use the relocating allocator
189 even if config.h says that we can. */
0b32bf0e 190# undef REL_ALLOC
fa9a63c5 191
0b32bf0e
SM
192# if defined STDC_HEADERS || defined _LIBC
193# include <stdlib.h>
194# else
fa9a63c5
RM
195char *malloc ();
196char *realloc ();
0b32bf0e 197# endif
fa9a63c5 198
9e4ecb26 199/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 200 If nothing else has been done, use the method below. */
0b32bf0e
SM
201# ifdef INHIBIT_STRING_HEADER
202# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
203# if !defined bzero && !defined bcopy
204# undef INHIBIT_STRING_HEADER
205# endif
206# endif
207# endif
9e4ecb26 208
4bb91c68 209/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
210 This is used in most programs--a few other programs avoid this
211 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
212# ifndef INHIBIT_STRING_HEADER
213# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
214# include <string.h>
0b32bf0e 215# ifndef bzero
4bb91c68
SM
216# ifndef _LIBC
217# define bzero(s, n) (memset (s, '\0', n), (s))
218# else
219# define bzero(s, n) __bzero (s, n)
220# endif
0b32bf0e
SM
221# endif
222# else
223# include <strings.h>
4bb91c68
SM
224# ifndef memcmp
225# define memcmp(s1, s2, n) bcmp (s1, s2, n)
226# endif
227# ifndef memcpy
228# define memcpy(d, s, n) (bcopy (s, d, n), (d))
229# endif
0b32bf0e
SM
230# endif
231# endif
fa9a63c5
RM
232
233/* Define the syntax stuff for \<, \>, etc. */
234
990b2375
SM
235/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
236enum syntaxcode { Swhitespace = 0, Sword = 1 };
fa9a63c5 237
0b32bf0e
SM
238# ifdef SWITCH_ENUM_BUG
239# define SWITCH_ENUM_CAST(x) ((int)(x))
240# else
241# define SWITCH_ENUM_CAST(x) (x)
242# endif
fa9a63c5 243
e934739e 244/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
245# define BASE_LEADING_CODE_P(c) (0)
246# define CHAR_CHARSET(c) 0
247# define CHARSET_LEADING_CODE_BASE(c) 0
248# define MAX_MULTIBYTE_LENGTH 1
249# define RE_MULTIBYTE_P(x) 0
bf216479 250# define RE_TARGET_MULTIBYTE_P(x) 0
0b32bf0e
SM
251# define WORD_BOUNDARY_P(c1, c2) (0)
252# define CHAR_HEAD_P(p) (1)
253# define SINGLE_BYTE_CHAR_P(c) (1)
254# define SAME_CHARSET_P(c1, c2) (1)
255# define MULTIBYTE_FORM_LENGTH(p, s) (1)
70806df6 256# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
0b32bf0e
SM
257# define STRING_CHAR(p, s) (*(p))
258# define RE_STRING_CHAR STRING_CHAR
259# define CHAR_STRING(c, s) (*(s) = (c), 1)
260# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
261# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
262# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 263 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
6fdd04b0
KH
264# define GET_CHAR_AFTER(c, p, len) \
265 (c = *p, len = 1)
0b32bf0e 266# define MAKE_CHAR(charset, c1, c2) (c1)
9117d724
KH
267# define BYTE8_TO_CHAR(c) (c)
268# define CHAR_BYTE8_P(c) (0)
6fdd04b0
KH
269# define MAKE_CHAR_MULTIBYTE(c) (c)
270# define MAKE_CHAR_UNIBYTE(c) (c)
bf216479 271# define CHAR_LEADING_CODE(c) (c)
8f924df7 272
fa9a63c5 273#endif /* not emacs */
4e8a9132
SM
274
275#ifndef RE_TRANSLATE
0b32bf0e
SM
276# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
277# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 278#endif
fa9a63c5
RM
279\f
280/* Get the interface, including the syntax bits. */
281#include "regex.h"
282
f71b19b6
DL
283/* isalpha etc. are used for the character classes. */
284#include <ctype.h>
fa9a63c5 285
f71b19b6 286#ifdef emacs
fa9a63c5 287
f71b19b6 288/* 1 if C is an ASCII character. */
0b32bf0e 289# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 290
f71b19b6 291/* 1 if C is a unibyte character. */
0b32bf0e 292# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 293
f71b19b6 294/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 295
f71b19b6 296/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
297# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
298# define ISCNTRL(c) ((c) < ' ')
299# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
300 || ((c) >= 'a' && (c) <= 'f') \
301 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
302
303/* This is only used for single-byte characters. */
0b32bf0e 304# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
305
306/* The rest must handle multibyte characters. */
307
0b32bf0e 308# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 309 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
310 : 1)
311
14473664 312# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 313 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
314 : 1)
315
0b32bf0e 316# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
317 ? (((c) >= 'a' && (c) <= 'z') \
318 || ((c) >= 'A' && (c) <= 'Z') \
319 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
320 : SYNTAX (c) == Sword)
321
0b32bf0e 322# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
323 ? (((c) >= 'a' && (c) <= 'z') \
324 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
325 : SYNTAX (c) == Sword)
326
0b32bf0e 327# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 328
0b32bf0e 329# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
330 ? ((c) > ' ' && (c) < 0177 \
331 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
332 || ((c) >= 'A' && (c) <= 'Z') \
333 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
334 : SYNTAX (c) != Sword)
335
0b32bf0e 336# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 337
0b32bf0e 338# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 339
0b32bf0e 340# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
341
342#else /* not emacs */
343
f71b19b6
DL
344/* Jim Meyering writes:
345
346 "... Some ctype macros are valid only for character codes that
347 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
348 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 349 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
350 STDC_HEADERS is defined, then autoconf has verified that the ctype
351 macros don't need to be guarded with references to isascii. ...
352 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
353 eliminate the && through constant folding."
354 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 355
4bb91c68 356# undef ISASCII
0b32bf0e
SM
357# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
358# define ISASCII(c) 1
359# else
360# define ISASCII(c) isascii(c)
361# endif
f71b19b6
DL
362
363/* 1 if C is an ASCII character. */
0b32bf0e 364# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
365
366/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
367# define ISUNIBYTE(c) 1
368
369# ifdef isblank
370# define ISBLANK(c) (ISASCII (c) && isblank (c))
371# else
372# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
373# endif
374# ifdef isgraph
375# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
376# else
377# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
378# endif
379
4bb91c68 380# undef ISPRINT
0b32bf0e
SM
381# define ISPRINT(c) (ISASCII (c) && isprint (c))
382# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
383# define ISALNUM(c) (ISASCII (c) && isalnum (c))
384# define ISALPHA(c) (ISASCII (c) && isalpha (c))
385# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
386# define ISLOWER(c) (ISASCII (c) && islower (c))
387# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
388# define ISSPACE(c) (ISASCII (c) && isspace (c))
389# define ISUPPER(c) (ISASCII (c) && isupper (c))
390# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
391
392# define ISWORD(c) ISALPHA(c)
393
4bb91c68
SM
394# ifdef _tolower
395# define TOLOWER(c) _tolower(c)
396# else
397# define TOLOWER(c) tolower(c)
398# endif
399
400/* How many characters in the character set. */
401# define CHAR_SET_SIZE 256
402
0b32bf0e 403# ifdef SYNTAX_TABLE
f71b19b6 404
0b32bf0e 405extern char *re_syntax_table;
f71b19b6 406
0b32bf0e
SM
407# else /* not SYNTAX_TABLE */
408
0b32bf0e
SM
409static char re_syntax_table[CHAR_SET_SIZE];
410
411static void
412init_syntax_once ()
413{
414 register int c;
415 static int done = 0;
416
417 if (done)
418 return;
419
420 bzero (re_syntax_table, sizeof re_syntax_table);
421
4bb91c68
SM
422 for (c = 0; c < CHAR_SET_SIZE; ++c)
423 if (ISALNUM (c))
424 re_syntax_table[c] = Sword;
fa9a63c5 425
0b32bf0e 426 re_syntax_table['_'] = Sword;
fa9a63c5 427
0b32bf0e
SM
428 done = 1;
429}
430
431# endif /* not SYNTAX_TABLE */
96cc36cc 432
4bb91c68
SM
433# define SYNTAX(c) re_syntax_table[(c)]
434
96cc36cc
RS
435#endif /* not emacs */
436\f
fa9a63c5 437#ifndef NULL
0b32bf0e 438# define NULL (void *)0
fa9a63c5
RM
439#endif
440
441/* We remove any previous definition of `SIGN_EXTEND_CHAR',
442 since ours (we hope) works properly with all combinations of
443 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 444 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
445#undef SIGN_EXTEND_CHAR
446#if __STDC__
0b32bf0e 447# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
448#else /* not __STDC__ */
449/* As in Harbison and Steele. */
0b32bf0e 450# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
451#endif
452\f
453/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
454 use `alloca' instead of `malloc'. This is because using malloc in
455 re_search* or re_match* could cause memory leaks when C-g is used in
456 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
457 the other hand, malloc is more portable, and easier to debug.
458
fa9a63c5
RM
459 Because we sometimes use alloca, some routines have to be macros,
460 not functions -- `alloca'-allocated space disappears at the end of the
461 function it is called in. */
462
463#ifdef REGEX_MALLOC
464
0b32bf0e
SM
465# define REGEX_ALLOCATE malloc
466# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
467# define REGEX_FREE free
fa9a63c5
RM
468
469#else /* not REGEX_MALLOC */
470
471/* Emacs already defines alloca, sometimes. */
0b32bf0e 472# ifndef alloca
fa9a63c5
RM
473
474/* Make alloca work the best possible way. */
0b32bf0e
SM
475# ifdef __GNUC__
476# define alloca __builtin_alloca
477# else /* not __GNUC__ */
7f585e7a 478# ifdef HAVE_ALLOCA_H
0b32bf0e
SM
479# include <alloca.h>
480# endif /* HAVE_ALLOCA_H */
481# endif /* not __GNUC__ */
fa9a63c5 482
0b32bf0e 483# endif /* not alloca */
fa9a63c5 484
0b32bf0e 485# define REGEX_ALLOCATE alloca
fa9a63c5
RM
486
487/* Assumes a `char *destination' variable. */
0b32bf0e 488# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 489 (destination = (char *) alloca (nsize), \
4bb91c68 490 memcpy (destination, source, osize))
fa9a63c5
RM
491
492/* No need to do anything to free, after alloca. */
0b32bf0e 493# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
494
495#endif /* not REGEX_MALLOC */
496
497/* Define how to allocate the failure stack. */
498
0b32bf0e 499#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 500
0b32bf0e 501# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 502 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 503# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 504 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 505# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
506 r_alloc_free (&failure_stack_ptr)
507
4297555e 508#else /* not using relocating allocator */
fa9a63c5 509
0b32bf0e 510# ifdef REGEX_MALLOC
fa9a63c5 511
0b32bf0e
SM
512# define REGEX_ALLOCATE_STACK malloc
513# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
514# define REGEX_FREE_STACK free
fa9a63c5 515
0b32bf0e 516# else /* not REGEX_MALLOC */
fa9a63c5 517
0b32bf0e 518# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 519
0b32bf0e 520# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 521 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 522/* No need to explicitly free anything. */
0b32bf0e 523# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 524
0b32bf0e 525# endif /* not REGEX_MALLOC */
4297555e 526#endif /* not using relocating allocator */
fa9a63c5
RM
527
528
529/* True if `size1' is non-NULL and PTR is pointing anywhere inside
530 `string1' or just past its end. This works if PTR is NULL, which is
531 a good thing. */
25fe55af 532#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
533 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
534
535/* (Re)Allocate N items of type T using malloc, or fail. */
536#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
537#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
538#define RETALLOC_IF(addr, n, t) \
539 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
540#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
541
4bb91c68 542#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
543
544#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
545
546#undef MAX
547#undef MIN
548#define MAX(a, b) ((a) > (b) ? (a) : (b))
549#define MIN(a, b) ((a) < (b) ? (a) : (b))
550
66f0296e
SM
551/* Type of source-pattern and string chars. */
552typedef const unsigned char re_char;
553
fa9a63c5
RM
554typedef char boolean;
555#define false 0
556#define true 1
557
4bb91c68
SM
558static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
559 re_char *string1, int size1,
560 re_char *string2, int size2,
561 int pos,
562 struct re_registers *regs,
563 int stop));
fa9a63c5
RM
564\f
565/* These are the command codes that appear in compiled regular
4bb91c68 566 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
567 command code can specify any interpretation whatsoever for its
568 arguments. Zero bytes may appear in the compiled regular expression. */
569
570typedef enum
571{
572 no_op = 0,
573
4bb91c68 574 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
575 succeed,
576
25fe55af 577 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
578 exactn,
579
25fe55af 580 /* Matches any (more or less) character. */
fa9a63c5
RM
581 anychar,
582
25fe55af
RS
583 /* Matches any one char belonging to specified set. First
584 following byte is number of bitmap bytes. Then come bytes
585 for a bitmap saying which chars are in. Bits in each byte
586 are ordered low-bit-first. A character is in the set if its
587 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
588 automatically not in the set.
589
590 If the length byte has the 0x80 bit set, then that stuff
591 is followed by a range table:
592 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 593 See RANGE_TABLE_WORK_BITS below.
01618498 594 2 bytes, the number of pairs that follow (upto 32767)
96cc36cc 595 pairs, each 2 multibyte characters,
0b32bf0e 596 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
597 charset,
598
25fe55af 599 /* Same parameters as charset, but match any character that is
4bb91c68 600 not one of those specified. */
fa9a63c5
RM
601 charset_not,
602
25fe55af
RS
603 /* Start remembering the text that is matched, for storing in a
604 register. Followed by one byte with the register number, in
605 the range 0 to one less than the pattern buffer's re_nsub
505bde11 606 field. */
fa9a63c5
RM
607 start_memory,
608
25fe55af
RS
609 /* Stop remembering the text that is matched and store it in a
610 memory register. Followed by one byte with the register
611 number, in the range 0 to one less than `re_nsub' in the
505bde11 612 pattern buffer. */
fa9a63c5
RM
613 stop_memory,
614
25fe55af 615 /* Match a duplicate of something remembered. Followed by one
4bb91c68 616 byte containing the register number. */
fa9a63c5
RM
617 duplicate,
618
25fe55af 619 /* Fail unless at beginning of line. */
fa9a63c5
RM
620 begline,
621
4bb91c68 622 /* Fail unless at end of line. */
fa9a63c5
RM
623 endline,
624
25fe55af
RS
625 /* Succeeds if at beginning of buffer (if emacs) or at beginning
626 of string to be matched (if not). */
fa9a63c5
RM
627 begbuf,
628
25fe55af 629 /* Analogously, for end of buffer/string. */
fa9a63c5 630 endbuf,
5e69f11e 631
25fe55af 632 /* Followed by two byte relative address to which to jump. */
5e69f11e 633 jump,
fa9a63c5 634
25fe55af
RS
635 /* Followed by two-byte relative address of place to resume at
636 in case of failure. */
fa9a63c5 637 on_failure_jump,
5e69f11e 638
25fe55af
RS
639 /* Like on_failure_jump, but pushes a placeholder instead of the
640 current string position when executed. */
fa9a63c5 641 on_failure_keep_string_jump,
5e69f11e 642
505bde11
SM
643 /* Just like `on_failure_jump', except that it checks that we
644 don't get stuck in an infinite loop (matching an empty string
645 indefinitely). */
646 on_failure_jump_loop,
647
0683b6fa
SM
648 /* Just like `on_failure_jump_loop', except that it checks for
649 a different kind of loop (the kind that shows up with non-greedy
650 operators). This operation has to be immediately preceded
651 by a `no_op'. */
652 on_failure_jump_nastyloop,
653
0b32bf0e 654 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
655 It analyses the loop before which it is put and if the
656 loop does not require backtracking, it changes itself to
4e8a9132
SM
657 `on_failure_keep_string_jump' and short-circuits the loop,
658 else it just defaults to changing itself into `on_failure_jump'.
659 It assumes that it is pointing to just past a `jump'. */
505bde11 660 on_failure_jump_smart,
fa9a63c5 661
25fe55af 662 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
663 After matching N times, jump to the address upon failure.
664 Does not work if N starts at 0: use on_failure_jump_loop
665 instead. */
fa9a63c5
RM
666 succeed_n,
667
25fe55af
RS
668 /* Followed by two-byte relative address, and two-byte number n.
669 Jump to the address N times, then fail. */
fa9a63c5
RM
670 jump_n,
671
25fe55af
RS
672 /* Set the following two-byte relative address to the
673 subsequent two-byte number. The address *includes* the two
674 bytes of number. */
fa9a63c5
RM
675 set_number_at,
676
fa9a63c5
RM
677 wordbeg, /* Succeeds if at word beginning. */
678 wordend, /* Succeeds if at word end. */
679
680 wordbound, /* Succeeds if at a word boundary. */
1fb352e0 681 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5
RM
682
683 /* Matches any character whose syntax is specified. Followed by
25fe55af 684 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
685 syntaxspec,
686
687 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
688 notsyntaxspec
689
690#ifdef emacs
691 ,before_dot, /* Succeeds if before point. */
692 at_dot, /* Succeeds if at point. */
693 after_dot, /* Succeeds if after point. */
b18215fc
RS
694
695 /* Matches any character whose category-set contains the specified
25fe55af
RS
696 category. The operator is followed by a byte which contains a
697 category code (mnemonic ASCII character). */
b18215fc
RS
698 categoryspec,
699
700 /* Matches any character whose category-set does not contain the
701 specified category. The operator is followed by a byte which
702 contains the category code (mnemonic ASCII character). */
703 notcategoryspec
fa9a63c5
RM
704#endif /* emacs */
705} re_opcode_t;
706\f
707/* Common operations on the compiled pattern. */
708
709/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
710
711#define STORE_NUMBER(destination, number) \
712 do { \
713 (destination)[0] = (number) & 0377; \
714 (destination)[1] = (number) >> 8; \
715 } while (0)
716
717/* Same as STORE_NUMBER, except increment DESTINATION to
718 the byte after where the number is stored. Therefore, DESTINATION
719 must be an lvalue. */
720
721#define STORE_NUMBER_AND_INCR(destination, number) \
722 do { \
723 STORE_NUMBER (destination, number); \
724 (destination) += 2; \
725 } while (0)
726
727/* Put into DESTINATION a number stored in two contiguous bytes starting
728 at SOURCE. */
729
730#define EXTRACT_NUMBER(destination, source) \
731 do { \
732 (destination) = *(source) & 0377; \
733 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
734 } while (0)
735
736#ifdef DEBUG
4bb91c68 737static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
738static void
739extract_number (dest, source)
740 int *dest;
01618498 741 re_char *source;
fa9a63c5 742{
5e69f11e 743 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
744 *dest = *source & 0377;
745 *dest += temp << 8;
746}
747
4bb91c68 748# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
749# undef EXTRACT_NUMBER
750# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
751# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
752
753#endif /* DEBUG */
754
755/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
756 SOURCE must be an lvalue. */
757
758#define EXTRACT_NUMBER_AND_INCR(destination, source) \
759 do { \
760 EXTRACT_NUMBER (destination, source); \
25fe55af 761 (source) += 2; \
fa9a63c5
RM
762 } while (0)
763
764#ifdef DEBUG
4bb91c68
SM
765static void extract_number_and_incr _RE_ARGS ((int *destination,
766 re_char **source));
fa9a63c5
RM
767static void
768extract_number_and_incr (destination, source)
769 int *destination;
01618498 770 re_char **source;
5e69f11e 771{
fa9a63c5
RM
772 extract_number (destination, *source);
773 *source += 2;
774}
775
0b32bf0e
SM
776# ifndef EXTRACT_MACROS
777# undef EXTRACT_NUMBER_AND_INCR
778# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 779 extract_number_and_incr (&dest, &src)
0b32bf0e 780# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
781
782#endif /* DEBUG */
783\f
b18215fc
RS
784/* Store a multibyte character in three contiguous bytes starting
785 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 786 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
787
788#define STORE_CHARACTER_AND_INCR(destination, character) \
789 do { \
790 (destination)[0] = (character) & 0377; \
791 (destination)[1] = ((character) >> 8) & 0377; \
792 (destination)[2] = (character) >> 16; \
793 (destination) += 3; \
794 } while (0)
795
796/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 797 starting at SOURCE. */
b18215fc
RS
798
799#define EXTRACT_CHARACTER(destination, source) \
800 do { \
801 (destination) = ((source)[0] \
802 | ((source)[1] << 8) \
803 | ((source)[2] << 16)); \
804 } while (0)
805
806
807/* Macros for charset. */
808
809/* Size of bitmap of charset P in bytes. P is a start of charset,
810 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
811#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
812
813/* Nonzero if charset P has range table. */
25fe55af 814#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
815
816/* Return the address of range table of charset P. But not the start
817 of table itself, but the before where the number of ranges is
96cc36cc
RS
818 stored. `2 +' means to skip re_opcode_t and size of bitmap,
819 and the 2 bytes of flags at the start of the range table. */
820#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
821
822/* Extract the bit flags that start a range table. */
823#define CHARSET_RANGE_TABLE_BITS(p) \
824 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
825 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
826
827/* Test if C is listed in the bitmap of charset P. */
828#define CHARSET_LOOKUP_BITMAP(p, c) \
829 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
830 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
831
832/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
833 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
834 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
835 and end. */
836#define CHARSET_RANGE_TABLE_END(range_table, count) \
837 ((range_table) + (count) * 2 * 3)
838
25fe55af 839/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
840 COUNT is number of ranges in RANGE_TABLE. */
841#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
842 do \
843 { \
01618498
SM
844 re_wchar_t range_start, range_end; \
845 re_char *p; \
846 re_char *range_table_end \
b18215fc
RS
847 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
848 \
849 for (p = (range_table); p < range_table_end; p += 2 * 3) \
850 { \
851 EXTRACT_CHARACTER (range_start, p); \
852 EXTRACT_CHARACTER (range_end, p + 3); \
853 \
854 if (range_start <= (c) && (c) <= range_end) \
855 { \
856 (not) = !(not); \
857 break; \
858 } \
859 } \
860 } \
861 while (0)
862
863/* Test if C is in range table of CHARSET. The flag NOT is negated if
864 C is listed in it. */
865#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
866 do \
867 { \
868 /* Number of ranges in range table. */ \
869 int count; \
01618498
SM
870 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
871 \
b18215fc
RS
872 EXTRACT_NUMBER_AND_INCR (count, range_table); \
873 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
874 } \
875 while (0)
876\f
fa9a63c5
RM
877/* If DEBUG is defined, Regex prints many voluminous messages about what
878 it is doing (if the variable `debug' is nonzero). If linked with the
879 main program in `iregex.c', you can enter patterns and strings
880 interactively. And if linked with the main program in `main.c' and
4bb91c68 881 the other test files, you can run the already-written tests. */
fa9a63c5
RM
882
883#ifdef DEBUG
884
885/* We use standard I/O for debugging. */
0b32bf0e 886# include <stdio.h>
fa9a63c5
RM
887
888/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 889# include <assert.h>
fa9a63c5 890
99633e97 891static int debug = -100000;
fa9a63c5 892
0b32bf0e
SM
893# define DEBUG_STATEMENT(e) e
894# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
895# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
896# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
897# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
898# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 899 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 900# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 901 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
902
903
904/* Print the fastmap in human-readable form. */
905
906void
907print_fastmap (fastmap)
908 char *fastmap;
909{
910 unsigned was_a_range = 0;
5e69f11e
RM
911 unsigned i = 0;
912
fa9a63c5
RM
913 while (i < (1 << BYTEWIDTH))
914 {
915 if (fastmap[i++])
916 {
917 was_a_range = 0;
25fe55af
RS
918 putchar (i - 1);
919 while (i < (1 << BYTEWIDTH) && fastmap[i])
920 {
921 was_a_range = 1;
922 i++;
923 }
fa9a63c5 924 if (was_a_range)
25fe55af
RS
925 {
926 printf ("-");
927 putchar (i - 1);
928 }
929 }
fa9a63c5 930 }
5e69f11e 931 putchar ('\n');
fa9a63c5
RM
932}
933
934
935/* Print a compiled pattern string in human-readable form, starting at
936 the START pointer into it and ending just before the pointer END. */
937
938void
939print_partial_compiled_pattern (start, end)
01618498
SM
940 re_char *start;
941 re_char *end;
fa9a63c5
RM
942{
943 int mcnt, mcnt2;
01618498
SM
944 re_char *p = start;
945 re_char *pend = end;
fa9a63c5
RM
946
947 if (start == NULL)
948 {
a1a052df 949 fprintf (stderr, "(null)\n");
fa9a63c5
RM
950 return;
951 }
5e69f11e 952
fa9a63c5
RM
953 /* Loop over pattern commands. */
954 while (p < pend)
955 {
a1a052df 956 fprintf (stderr, "%d:\t", p - start);
fa9a63c5
RM
957
958 switch ((re_opcode_t) *p++)
959 {
25fe55af 960 case no_op:
a1a052df 961 fprintf (stderr, "/no_op");
25fe55af 962 break;
fa9a63c5 963
99633e97 964 case succeed:
a1a052df 965 fprintf (stderr, "/succeed");
99633e97
SM
966 break;
967
fa9a63c5
RM
968 case exactn:
969 mcnt = *p++;
a1a052df 970 fprintf (stderr, "/exactn/%d", mcnt);
25fe55af 971 do
fa9a63c5 972 {
a1a052df 973 fprintf (stderr, "/%c", *p++);
25fe55af
RS
974 }
975 while (--mcnt);
976 break;
fa9a63c5
RM
977
978 case start_memory:
a1a052df 979 fprintf (stderr, "/start_memory/%d", *p++);
25fe55af 980 break;
fa9a63c5
RM
981
982 case stop_memory:
a1a052df 983 fprintf (stderr, "/stop_memory/%d", *p++);
25fe55af 984 break;
fa9a63c5
RM
985
986 case duplicate:
a1a052df 987 fprintf (stderr, "/duplicate/%d", *p++);
fa9a63c5
RM
988 break;
989
990 case anychar:
a1a052df 991 fprintf (stderr, "/anychar");
fa9a63c5
RM
992 break;
993
994 case charset:
25fe55af
RS
995 case charset_not:
996 {
997 register int c, last = -100;
fa9a63c5 998 register int in_range = 0;
99633e97
SM
999 int length = CHARSET_BITMAP_SIZE (p - 1);
1000 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5 1001
a1a052df 1002 fprintf (stderr, "/charset [%s",
25fe55af 1003 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 1004
25fe55af 1005 assert (p + *p < pend);
fa9a63c5 1006
25fe55af 1007 for (c = 0; c < 256; c++)
96cc36cc 1008 if (c / 8 < length
fa9a63c5
RM
1009 && (p[1 + (c/8)] & (1 << (c % 8))))
1010 {
1011 /* Are we starting a range? */
1012 if (last + 1 == c && ! in_range)
1013 {
a1a052df 1014 fprintf (stderr, "-");
fa9a63c5
RM
1015 in_range = 1;
1016 }
1017 /* Have we broken a range? */
1018 else if (last + 1 != c && in_range)
96cc36cc 1019 {
a1a052df 1020 fprintf (stderr, "%c", last);
fa9a63c5
RM
1021 in_range = 0;
1022 }
5e69f11e 1023
fa9a63c5 1024 if (! in_range)
a1a052df 1025 fprintf (stderr, "%c", c);
fa9a63c5
RM
1026
1027 last = c;
25fe55af 1028 }
fa9a63c5
RM
1029
1030 if (in_range)
a1a052df 1031 fprintf (stderr, "%c", last);
fa9a63c5 1032
a1a052df 1033 fprintf (stderr, "]");
fa9a63c5 1034
99633e97 1035 p += 1 + length;
96cc36cc 1036
96cc36cc 1037 if (has_range_table)
99633e97
SM
1038 {
1039 int count;
a1a052df 1040 fprintf (stderr, "has-range-table");
99633e97
SM
1041
1042 /* ??? Should print the range table; for now, just skip it. */
1043 p += 2; /* skip range table bits */
1044 EXTRACT_NUMBER_AND_INCR (count, p);
1045 p = CHARSET_RANGE_TABLE_END (p, count);
1046 }
fa9a63c5
RM
1047 }
1048 break;
1049
1050 case begline:
a1a052df 1051 fprintf (stderr, "/begline");
25fe55af 1052 break;
fa9a63c5
RM
1053
1054 case endline:
a1a052df 1055 fprintf (stderr, "/endline");
25fe55af 1056 break;
fa9a63c5
RM
1057
1058 case on_failure_jump:
25fe55af 1059 extract_number_and_incr (&mcnt, &p);
a1a052df 1060 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
25fe55af 1061 break;
fa9a63c5
RM
1062
1063 case on_failure_keep_string_jump:
25fe55af 1064 extract_number_and_incr (&mcnt, &p);
a1a052df 1065 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
25fe55af 1066 break;
fa9a63c5 1067
0683b6fa
SM
1068 case on_failure_jump_nastyloop:
1069 extract_number_and_incr (&mcnt, &p);
a1a052df 1070 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
0683b6fa
SM
1071 break;
1072
505bde11 1073 case on_failure_jump_loop:
fa9a63c5 1074 extract_number_and_incr (&mcnt, &p);
a1a052df 1075 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1076 break;
1077
505bde11 1078 case on_failure_jump_smart:
fa9a63c5 1079 extract_number_and_incr (&mcnt, &p);
a1a052df 1080 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1081 break;
1082
25fe55af 1083 case jump:
fa9a63c5 1084 extract_number_and_incr (&mcnt, &p);
a1a052df 1085 fprintf (stderr, "/jump to %d", p + mcnt - start);
fa9a63c5
RM
1086 break;
1087
25fe55af
RS
1088 case succeed_n:
1089 extract_number_and_incr (&mcnt, &p);
1090 extract_number_and_incr (&mcnt2, &p);
a1a052df 1091 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1092 break;
5e69f11e 1093
25fe55af
RS
1094 case jump_n:
1095 extract_number_and_incr (&mcnt, &p);
1096 extract_number_and_incr (&mcnt2, &p);
a1a052df 1097 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1098 break;
5e69f11e 1099
25fe55af
RS
1100 case set_number_at:
1101 extract_number_and_incr (&mcnt, &p);
1102 extract_number_and_incr (&mcnt2, &p);
a1a052df 1103 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1104 break;
5e69f11e 1105
25fe55af 1106 case wordbound:
a1a052df 1107 fprintf (stderr, "/wordbound");
fa9a63c5
RM
1108 break;
1109
1110 case notwordbound:
a1a052df 1111 fprintf (stderr, "/notwordbound");
25fe55af 1112 break;
fa9a63c5
RM
1113
1114 case wordbeg:
a1a052df 1115 fprintf (stderr, "/wordbeg");
fa9a63c5 1116 break;
5e69f11e 1117
fa9a63c5 1118 case wordend:
a1a052df 1119 fprintf (stderr, "/wordend");
5e69f11e 1120
1fb352e0 1121 case syntaxspec:
a1a052df 1122 fprintf (stderr, "/syntaxspec");
1fb352e0 1123 mcnt = *p++;
a1a052df 1124 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1125 break;
1126
1127 case notsyntaxspec:
a1a052df 1128 fprintf (stderr, "/notsyntaxspec");
1fb352e0 1129 mcnt = *p++;
a1a052df 1130 fprintf (stderr, "/%d", mcnt);
1fb352e0
SM
1131 break;
1132
0b32bf0e 1133# ifdef emacs
fa9a63c5 1134 case before_dot:
a1a052df 1135 fprintf (stderr, "/before_dot");
25fe55af 1136 break;
fa9a63c5
RM
1137
1138 case at_dot:
a1a052df 1139 fprintf (stderr, "/at_dot");
25fe55af 1140 break;
fa9a63c5
RM
1141
1142 case after_dot:
a1a052df 1143 fprintf (stderr, "/after_dot");
25fe55af 1144 break;
fa9a63c5 1145
1fb352e0 1146 case categoryspec:
a1a052df 1147 fprintf (stderr, "/categoryspec");
fa9a63c5 1148 mcnt = *p++;
a1a052df 1149 fprintf (stderr, "/%d", mcnt);
25fe55af 1150 break;
5e69f11e 1151
1fb352e0 1152 case notcategoryspec:
a1a052df 1153 fprintf (stderr, "/notcategoryspec");
fa9a63c5 1154 mcnt = *p++;
a1a052df 1155 fprintf (stderr, "/%d", mcnt);
fa9a63c5 1156 break;
0b32bf0e 1157# endif /* emacs */
fa9a63c5 1158
fa9a63c5 1159 case begbuf:
a1a052df 1160 fprintf (stderr, "/begbuf");
25fe55af 1161 break;
fa9a63c5
RM
1162
1163 case endbuf:
a1a052df 1164 fprintf (stderr, "/endbuf");
25fe55af 1165 break;
fa9a63c5 1166
25fe55af 1167 default:
a1a052df 1168 fprintf (stderr, "?%d", *(p-1));
fa9a63c5
RM
1169 }
1170
a1a052df 1171 fprintf (stderr, "\n");
fa9a63c5
RM
1172 }
1173
a1a052df 1174 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
fa9a63c5
RM
1175}
1176
1177
1178void
1179print_compiled_pattern (bufp)
1180 struct re_pattern_buffer *bufp;
1181{
01618498 1182 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1183
1184 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1185 printf ("%ld bytes used/%ld bytes allocated.\n",
1186 bufp->used, bufp->allocated);
fa9a63c5
RM
1187
1188 if (bufp->fastmap_accurate && bufp->fastmap)
1189 {
1190 printf ("fastmap: ");
1191 print_fastmap (bufp->fastmap);
1192 }
1193
1194 printf ("re_nsub: %d\t", bufp->re_nsub);
1195 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1196 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1197 printf ("no_sub: %d\t", bufp->no_sub);
1198 printf ("not_bol: %d\t", bufp->not_bol);
1199 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1200 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1201 fflush (stdout);
fa9a63c5
RM
1202 /* Perhaps we should print the translate table? */
1203}
1204
1205
1206void
1207print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1208 re_char *where;
1209 re_char *string1;
1210 re_char *string2;
fa9a63c5
RM
1211 int size1;
1212 int size2;
1213{
4bb91c68 1214 int this_char;
5e69f11e 1215
fa9a63c5
RM
1216 if (where == NULL)
1217 printf ("(null)");
1218 else
1219 {
1220 if (FIRST_STRING_P (where))
25fe55af
RS
1221 {
1222 for (this_char = where - string1; this_char < size1; this_char++)
1223 putchar (string1[this_char]);
fa9a63c5 1224
25fe55af
RS
1225 where = string2;
1226 }
fa9a63c5
RM
1227
1228 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1229 putchar (string2[this_char]);
fa9a63c5
RM
1230 }
1231}
1232
1233#else /* not DEBUG */
1234
0b32bf0e
SM
1235# undef assert
1236# define assert(e)
fa9a63c5 1237
0b32bf0e
SM
1238# define DEBUG_STATEMENT(e)
1239# define DEBUG_PRINT1(x)
1240# define DEBUG_PRINT2(x1, x2)
1241# define DEBUG_PRINT3(x1, x2, x3)
1242# define DEBUG_PRINT4(x1, x2, x3, x4)
1243# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1244# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1245
1246#endif /* not DEBUG */
1247\f
1248/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1249 also be assigned to arbitrarily: each pattern buffer stores its own
1250 syntax, so it can be changed between regex compilations. */
1251/* This has no initializer because initialized variables in Emacs
1252 become read-only after dumping. */
1253reg_syntax_t re_syntax_options;
1254
1255
1256/* Specify the precise syntax of regexps for compilation. This provides
1257 for compatibility for various utilities which historically have
1258 different, incompatible syntaxes.
1259
1260 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1261 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1262
1263reg_syntax_t
1264re_set_syntax (syntax)
1265 reg_syntax_t syntax;
1266{
1267 reg_syntax_t ret = re_syntax_options;
5e69f11e 1268
fa9a63c5
RM
1269 re_syntax_options = syntax;
1270 return ret;
1271}
c0f9ea08 1272WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1273\f
1274/* This table gives an error message for each of the error codes listed
4bb91c68 1275 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1276 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1277 but why not be nice? */
fa9a63c5
RM
1278
1279static const char *re_error_msgid[] =
5e69f11e
RM
1280 {
1281 gettext_noop ("Success"), /* REG_NOERROR */
1282 gettext_noop ("No match"), /* REG_NOMATCH */
1283 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1284 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1285 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1286 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1287 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1288 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1289 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1290 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1291 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1292 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1293 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1294 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1295 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1296 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1297 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
fa9a63c5
RM
1298 };
1299\f
4bb91c68 1300/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1301
1302/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1303 searching and matching functions should not call alloca. On some
1304 systems, alloca is implemented in terms of malloc, and if we're
1305 using the relocating allocator routines, then malloc could cause a
1306 relocation, which might (if the strings being searched are in the
1307 ralloc heap) shift the data out from underneath the regexp
1308 routines.
1309
5e69f11e 1310 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1311 processes input from X in a signal handler; processing X input may
1312 call malloc; if input arrives while a matching routine is calling
1313 malloc, then we're scrod. But Emacs can't just block input while
1314 calling matching routines; then we don't notice interrupts when
1315 they come in. So, Emacs blocks input around all regexp calls
1316 except the matching calls, which it leaves unprotected, in the
1317 faith that they will not malloc. */
1318
1319/* Normally, this is fine. */
1320#define MATCH_MAY_ALLOCATE
1321
1322/* When using GNU C, we are not REALLY using the C alloca, no matter
1323 what config.h may say. So don't take precautions for it. */
1324#ifdef __GNUC__
0b32bf0e 1325# undef C_ALLOCA
fa9a63c5
RM
1326#endif
1327
1328/* The match routines may not allocate if (1) they would do it with malloc
1329 and (2) it's not safe for them to use malloc.
1330 Note that if REL_ALLOC is defined, matching would not use malloc for the
1331 failure stack, but we would still use it for the register vectors;
4bb91c68 1332 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1333#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1334# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1335#endif
1336
1337\f
1338/* Failure stack declarations and macros; both re_compile_fastmap and
1339 re_match_2 use a failure stack. These have to be macros because of
1340 REGEX_ALLOCATE_STACK. */
5e69f11e 1341
fa9a63c5 1342
320a2a73 1343/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1344 when matching. If this number is exceeded, we allocate more
1345 space, so it is not a hard limit. */
1346#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1347# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1348#endif
1349
1350/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1351 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1352 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1353 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1354 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1355# if defined MATCH_MAY_ALLOCATE
1356/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1357 whose default stack limit is 2mb. In order for a larger
1358 value to work reliably, you have to try to make it accord
1359 with the process stack limit. */
c0f9ea08
SM
1360size_t re_max_failures = 40000;
1361# else
1362size_t re_max_failures = 4000;
1363# endif
fa9a63c5
RM
1364
1365union fail_stack_elt
1366{
01618498 1367 re_char *pointer;
c0f9ea08
SM
1368 /* This should be the biggest `int' that's no bigger than a pointer. */
1369 long integer;
fa9a63c5
RM
1370};
1371
1372typedef union fail_stack_elt fail_stack_elt_t;
1373
1374typedef struct
1375{
1376 fail_stack_elt_t *stack;
c0f9ea08
SM
1377 size_t size;
1378 size_t avail; /* Offset of next open position. */
1379 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1380} fail_stack_type;
1381
505bde11 1382#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1383#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1384
1385
1386/* Define macros to initialize and free the failure stack.
1387 Do `return -2' if the alloc fails. */
1388
1389#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1390# define INIT_FAIL_STACK() \
fa9a63c5
RM
1391 do { \
1392 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1393 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1394 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1395 \
1396 if (fail_stack.stack == NULL) \
1397 return -2; \
1398 \
1399 fail_stack.size = INIT_FAILURE_ALLOC; \
1400 fail_stack.avail = 0; \
505bde11 1401 fail_stack.frame = 0; \
fa9a63c5
RM
1402 } while (0)
1403
0b32bf0e 1404# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1405#else
0b32bf0e 1406# define INIT_FAIL_STACK() \
fa9a63c5
RM
1407 do { \
1408 fail_stack.avail = 0; \
505bde11 1409 fail_stack.frame = 0; \
fa9a63c5
RM
1410 } while (0)
1411
0b32bf0e 1412# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1413#endif
1414
1415
320a2a73
KH
1416/* Double the size of FAIL_STACK, up to a limit
1417 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1418
1419 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1420 allocating space for it or it was already too large.
1421
4bb91c68 1422 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1423
320a2a73
KH
1424/* Factor to increase the failure stack size by
1425 when we increase it.
1426 This used to be 2, but 2 was too wasteful
1427 because the old discarded stacks added up to as much space
1428 were as ultimate, maximum-size stack. */
1429#define FAIL_STACK_GROWTH_FACTOR 4
1430
1431#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1432 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1433 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1434 ? 0 \
320a2a73
KH
1435 : ((fail_stack).stack \
1436 = (fail_stack_elt_t *) \
25fe55af
RS
1437 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1438 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1439 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1440 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1441 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1442 \
1443 (fail_stack).stack == NULL \
1444 ? 0 \
6453db45
KH
1445 : ((fail_stack).size \
1446 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1447 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1448 * FAIL_STACK_GROWTH_FACTOR)) \
1449 / sizeof (fail_stack_elt_t)), \
25fe55af 1450 1)))
fa9a63c5
RM
1451
1452
fa9a63c5
RM
1453/* Push a pointer value onto the failure stack.
1454 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1455 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1456#define PUSH_FAILURE_POINTER(item) \
01618498 1457 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1458
1459/* This pushes an integer-valued item onto the failure stack.
1460 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1461 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1462#define PUSH_FAILURE_INT(item) \
1463 fail_stack.stack[fail_stack.avail++].integer = (item)
1464
1465/* Push a fail_stack_elt_t value onto the failure stack.
1466 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1467 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1468#define PUSH_FAILURE_ELT(item) \
1469 fail_stack.stack[fail_stack.avail++] = (item)
1470
1471/* These three POP... operations complement the three PUSH... operations.
1472 All assume that `fail_stack' is nonempty. */
1473#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1474#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1475#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1476
505bde11
SM
1477/* Individual items aside from the registers. */
1478#define NUM_NONREG_ITEMS 3
1479
1480/* Used to examine the stack (to detect infinite loops). */
1481#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1482#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1483#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1484#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1485
1486
505bde11
SM
1487#define ENSURE_FAIL_STACK(space) \
1488while (REMAINING_AVAIL_SLOTS <= space) { \
1489 if (!GROW_FAIL_STACK (fail_stack)) \
1490 return -2; \
1491 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1492 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1493}
1494
1495/* Push register NUM onto the stack. */
1496#define PUSH_FAILURE_REG(num) \
1497do { \
1498 char *destination; \
1499 ENSURE_FAIL_STACK(3); \
1500 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1501 num, regstart[num], regend[num]); \
1502 PUSH_FAILURE_POINTER (regstart[num]); \
1503 PUSH_FAILURE_POINTER (regend[num]); \
1504 PUSH_FAILURE_INT (num); \
1505} while (0)
1506
01618498
SM
1507/* Change the counter's value to VAL, but make sure that it will
1508 be reset when backtracking. */
1509#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1510do { \
1511 char *destination; \
1512 int c; \
1513 ENSURE_FAIL_STACK(3); \
1514 EXTRACT_NUMBER (c, ptr); \
01618498 1515 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1516 PUSH_FAILURE_INT (c); \
1517 PUSH_FAILURE_POINTER (ptr); \
1518 PUSH_FAILURE_INT (-1); \
01618498 1519 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1520} while (0)
1521
505bde11 1522/* Pop a saved register off the stack. */
dc1e502d 1523#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1524do { \
1525 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1526 if (reg == -1) \
1527 { \
1528 /* It's a counter. */ \
6dcf2d0e
SM
1529 /* Here, we discard `const', making re_match non-reentrant. */ \
1530 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1531 reg = POP_FAILURE_INT (); \
1532 STORE_NUMBER (ptr, reg); \
1533 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1534 } \
1535 else \
1536 { \
1537 regend[reg] = POP_FAILURE_POINTER (); \
1538 regstart[reg] = POP_FAILURE_POINTER (); \
1539 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1540 reg, regstart[reg], regend[reg]); \
1541 } \
505bde11
SM
1542} while (0)
1543
1544/* Check that we are not stuck in an infinite loop. */
1545#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1546do { \
f6df485f 1547 int failure = TOP_FAILURE_HANDLE (); \
505bde11 1548 /* Check for infinite matching loops */ \
f6df485f
RS
1549 while (failure > 0 \
1550 && (FAILURE_STR (failure) == string_place \
1551 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1552 { \
1553 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1554 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1555 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f 1556 { \
6df42991
SM
1557 cycle = 1; \
1558 break; \
f6df485f 1559 } \
66f0296e 1560 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1561 failure = NEXT_FAILURE_HANDLE(failure); \
1562 } \
1563 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1564} while (0)
6df42991 1565
fa9a63c5 1566/* Push the information about the state we will need
5e69f11e
RM
1567 if we ever fail back to it.
1568
505bde11 1569 Requires variables fail_stack, regstart, regend and
320a2a73 1570 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1571 declared.
5e69f11e 1572
fa9a63c5
RM
1573 Does `return FAILURE_CODE' if runs out of memory. */
1574
505bde11
SM
1575#define PUSH_FAILURE_POINT(pattern, string_place) \
1576do { \
1577 char *destination; \
1578 /* Must be int, so when we don't save any registers, the arithmetic \
1579 of 0 + -1 isn't done as unsigned. */ \
1580 \
505bde11 1581 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1582 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1583 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1584 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1585 \
1586 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1587 \
1588 DEBUG_PRINT1 ("\n"); \
1589 \
1590 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1591 PUSH_FAILURE_INT (fail_stack.frame); \
1592 \
1593 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1594 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1595 DEBUG_PRINT1 ("'\n"); \
1596 PUSH_FAILURE_POINTER (string_place); \
1597 \
1598 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1599 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1600 PUSH_FAILURE_POINTER (pattern); \
1601 \
1602 /* Close the frame by moving the frame pointer past it. */ \
1603 fail_stack.frame = fail_stack.avail; \
1604} while (0)
fa9a63c5 1605
320a2a73
KH
1606/* Estimate the size of data pushed by a typical failure stack entry.
1607 An estimate is all we need, because all we use this for
1608 is to choose a limit for how big to make the failure stack. */
ada30c0e 1609/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1610#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1611
fa9a63c5
RM
1612/* How many items can still be added to the stack without overflowing it. */
1613#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1614
1615
1616/* Pops what PUSH_FAIL_STACK pushes.
1617
1618 We restore into the parameters, all of which should be lvalues:
1619 STR -- the saved data position.
1620 PAT -- the saved pattern position.
fa9a63c5 1621 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1622
fa9a63c5 1623 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1624 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1625
505bde11
SM
1626#define POP_FAILURE_POINT(str, pat) \
1627do { \
fa9a63c5
RM
1628 assert (!FAIL_STACK_EMPTY ()); \
1629 \
1630 /* Remove failure points and point to how many regs pushed. */ \
1631 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1632 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1633 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1634 \
505bde11
SM
1635 /* Pop the saved registers. */ \
1636 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1637 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1638 \
01618498 1639 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1640 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1641 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1642 \
1643 /* If the saved string location is NULL, it came from an \
1644 on_failure_keep_string_jump opcode, and we want to throw away the \
1645 saved NULL, thus retaining our current position in the string. */ \
01618498 1646 str = POP_FAILURE_POINTER (); \
505bde11 1647 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1648 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1649 DEBUG_PRINT1 ("'\n"); \
1650 \
505bde11
SM
1651 fail_stack.frame = POP_FAILURE_INT (); \
1652 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1653 \
505bde11
SM
1654 assert (fail_stack.avail >= 0); \
1655 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1656 \
fa9a63c5 1657 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1658} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1659
1660
1661\f
fa9a63c5 1662/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1663#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1664\f
1665/* Subroutine declarations and macros for regex_compile. */
1666
4bb91c68
SM
1667static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1668 reg_syntax_t syntax,
1669 struct re_pattern_buffer *bufp));
1670static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1671static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1672 int arg1, int arg2));
1673static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1674 int arg, unsigned char *end));
1675static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1676 int arg1, int arg2, unsigned char *end));
01618498
SM
1677static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1678 re_char *p,
4bb91c68 1679 reg_syntax_t syntax));
01618498
SM
1680static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1681 re_char *pend,
4bb91c68 1682 reg_syntax_t syntax));
01618498
SM
1683static re_char *skip_one_char _RE_ARGS ((re_char *p));
1684static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1685 char *fastmap, const int multibyte));
fa9a63c5 1686
fa9a63c5 1687/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1688 translation. */
36595814 1689#define PATFETCH(c) \
2d1675e4
SM
1690 do { \
1691 int len; \
1692 if (p == pend) return REG_EEND; \
1693 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1694 p += len; \
fa9a63c5
RM
1695 } while (0)
1696
fa9a63c5
RM
1697
1698/* If `translate' is non-null, return translate[D], else just D. We
1699 cast the subscript to translate because some data is declared as
1700 `char *', to avoid warnings when a string constant is passed. But
1701 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1702#ifndef TRANSLATE
0b32bf0e 1703# define TRANSLATE(d) \
66f0296e 1704 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1705#endif
fa9a63c5
RM
1706
1707
1708/* Macros for outputting the compiled pattern into `buffer'. */
1709
1710/* If the buffer isn't allocated when it comes in, use this. */
1711#define INIT_BUF_SIZE 32
1712
4bb91c68 1713/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1714#define GET_BUFFER_SPACE(n) \
01618498 1715 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1716 EXTEND_BUFFER ()
1717
1718/* Make sure we have one more byte of buffer space and then add C to it. */
1719#define BUF_PUSH(c) \
1720 do { \
1721 GET_BUFFER_SPACE (1); \
1722 *b++ = (unsigned char) (c); \
1723 } while (0)
1724
1725
1726/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1727#define BUF_PUSH_2(c1, c2) \
1728 do { \
1729 GET_BUFFER_SPACE (2); \
1730 *b++ = (unsigned char) (c1); \
1731 *b++ = (unsigned char) (c2); \
1732 } while (0)
1733
1734
4bb91c68 1735/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1736#define BUF_PUSH_3(c1, c2, c3) \
1737 do { \
1738 GET_BUFFER_SPACE (3); \
1739 *b++ = (unsigned char) (c1); \
1740 *b++ = (unsigned char) (c2); \
1741 *b++ = (unsigned char) (c3); \
1742 } while (0)
1743
1744
1745/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1746 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1747#define STORE_JUMP(op, loc, to) \
1748 store_op1 (op, loc, (to) - (loc) - 3)
1749
1750/* Likewise, for a two-argument jump. */
1751#define STORE_JUMP2(op, loc, to, arg) \
1752 store_op2 (op, loc, (to) - (loc) - 3, arg)
1753
4bb91c68 1754/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1755#define INSERT_JUMP(op, loc, to) \
1756 insert_op1 (op, loc, (to) - (loc) - 3, b)
1757
1758/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1759#define INSERT_JUMP2(op, loc, to, arg) \
1760 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1761
1762
1763/* This is not an arbitrary limit: the arguments which represent offsets
4bb91c68 1764 into the pattern are two bytes long. So if 2^16 bytes turns out to
fa9a63c5 1765 be too small, many things would have to change. */
4bb91c68
SM
1766/* Any other compiler which, like MSC, has allocation limit below 2^16
1767 bytes will have to use approach similar to what was done below for
1768 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1769 reallocating to 0 bytes. Such thing is not going to work too well.
1770 You have been warned!! */
1771#if defined _MSC_VER && !defined WIN32
1772/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1773# define MAX_BUF_SIZE 65500L
1774#else
1775# define MAX_BUF_SIZE (1L << 16)
1776#endif
fa9a63c5
RM
1777
1778/* Extend the buffer by twice its current size via realloc and
1779 reset the pointers that pointed into the old block to point to the
1780 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1781 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1782#if __BOUNDED_POINTERS__
1783# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1784# define MOVE_BUFFER_POINTER(P) \
1785 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1786# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1787 else \
1788 { \
1789 SET_HIGH_BOUND (b); \
1790 SET_HIGH_BOUND (begalt); \
1791 if (fixup_alt_jump) \
1792 SET_HIGH_BOUND (fixup_alt_jump); \
1793 if (laststart) \
1794 SET_HIGH_BOUND (laststart); \
1795 if (pending_exact) \
1796 SET_HIGH_BOUND (pending_exact); \
1797 }
1798#else
1799# define MOVE_BUFFER_POINTER(P) (P) += incr
1800# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1801#endif
fa9a63c5 1802#define EXTEND_BUFFER() \
25fe55af 1803 do { \
01618498 1804 re_char *old_buffer = bufp->buffer; \
25fe55af 1805 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1806 return REG_ESIZE; \
1807 bufp->allocated <<= 1; \
1808 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1809 bufp->allocated = MAX_BUF_SIZE; \
01618498 1810 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1811 if (bufp->buffer == NULL) \
1812 return REG_ESPACE; \
1813 /* If the buffer moved, move all the pointers into it. */ \
1814 if (old_buffer != bufp->buffer) \
1815 { \
4bb91c68
SM
1816 int incr = bufp->buffer - old_buffer; \
1817 MOVE_BUFFER_POINTER (b); \
1818 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1819 if (fixup_alt_jump) \
4bb91c68 1820 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1821 if (laststart) \
4bb91c68 1822 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1823 if (pending_exact) \
4bb91c68 1824 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1825 } \
4bb91c68 1826 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1827 } while (0)
1828
1829
1830/* Since we have one byte reserved for the register number argument to
1831 {start,stop}_memory, the maximum number of groups we can report
1832 things about is what fits in that byte. */
1833#define MAX_REGNUM 255
1834
1835/* But patterns can have more than `MAX_REGNUM' registers. We just
1836 ignore the excess. */
098d42af 1837typedef int regnum_t;
fa9a63c5
RM
1838
1839
1840/* Macros for the compile stack. */
1841
1842/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1843 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1844/* int may be not enough when sizeof(int) == 2. */
1845typedef long pattern_offset_t;
fa9a63c5
RM
1846
1847typedef struct
1848{
1849 pattern_offset_t begalt_offset;
1850 pattern_offset_t fixup_alt_jump;
5e69f11e 1851 pattern_offset_t laststart_offset;
fa9a63c5
RM
1852 regnum_t regnum;
1853} compile_stack_elt_t;
1854
1855
1856typedef struct
1857{
1858 compile_stack_elt_t *stack;
1859 unsigned size;
1860 unsigned avail; /* Offset of next open position. */
1861} compile_stack_type;
1862
1863
1864#define INIT_COMPILE_STACK_SIZE 32
1865
1866#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1867#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1868
4bb91c68 1869/* The next available element. */
fa9a63c5
RM
1870#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1871
77d11aec
RS
1872/* Explicit quit checking is only used on NTemacs. */
1873#if defined WINDOWSNT && defined emacs && defined QUIT
1874extern int immediate_quit;
1875# define IMMEDIATE_QUIT_CHECK \
1876 do { \
1877 if (immediate_quit) QUIT; \
1878 } while (0)
1879#else
1880# define IMMEDIATE_QUIT_CHECK ((void)0)
1881#endif
1882\f
b18215fc
RS
1883/* Structure to manage work area for range table. */
1884struct range_table_work_area
1885{
1886 int *table; /* actual work area. */
1887 int allocated; /* allocated size for work area in bytes. */
25fe55af 1888 int used; /* actually used size in words. */
96cc36cc 1889 int bits; /* flag to record character classes */
b18215fc
RS
1890};
1891
77d11aec
RS
1892/* Make sure that WORK_AREA can hold more N multibyte characters.
1893 This is used only in set_image_of_range and set_image_of_range_1.
1894 It expects WORK_AREA to be a pointer.
1895 If it can't get the space, it returns from the surrounding function. */
1896
1897#define EXTEND_RANGE_TABLE(work_area, n) \
1898 do { \
8f924df7 1899 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
77d11aec 1900 { \
8f924df7
KH
1901 extend_range_table_work_area (&work_area); \
1902 if ((work_area).table == 0) \
77d11aec
RS
1903 return (REG_ESPACE); \
1904 } \
b18215fc
RS
1905 } while (0)
1906
96cc36cc
RS
1907#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1908 (work_area).bits |= (bit)
1909
14473664
SM
1910/* Bits used to implement the multibyte-part of the various character classes
1911 such as [:alnum:] in a charset's range table. */
1912#define BIT_WORD 0x1
1913#define BIT_LOWER 0x2
1914#define BIT_PUNCT 0x4
1915#define BIT_SPACE 0x8
1916#define BIT_UPPER 0x10
1917#define BIT_MULTIBYTE 0x20
96cc36cc 1918
b18215fc
RS
1919/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1920#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
77d11aec 1921 do { \
8f924df7 1922 EXTEND_RANGE_TABLE ((work_area), 2); \
b18215fc
RS
1923 (work_area).table[(work_area).used++] = (range_start); \
1924 (work_area).table[(work_area).used++] = (range_end); \
1925 } while (0)
1926
25fe55af 1927/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1928#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1929 do { \
1930 if ((work_area).table) \
1931 free ((work_area).table); \
1932 } while (0)
1933
96cc36cc 1934#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1935#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1936#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1937#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1938\f
b18215fc 1939
fa9a63c5 1940/* Set the bit for character C in a list. */
01618498 1941#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1942
1943
bf216479
KH
1944#ifdef emacs
1945
8f924df7
KH
1946/* Store characters in the rage range C0 to C1 in WORK_AREA while
1947 translating them and paying attention to the continuity of
1948 translated characters.
1949
1950 Implementation note: It is better to implement this fairly big
1951 macro by a function, but it's not that easy because macros called
1952 in this macro assume various local variables already declared. */
bf216479 1953
6fdd04b0
KH
1954#define SETUP_MULTIBYTE_RANGE(work_area, c0, c1) \
1955 do { \
1956 re_wchar_t c, t, t_last; \
1957 int n; \
1958 \
1959 c = (c0); \
1960 t_last = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \
1961 for (c++, n = 1; c <= (c1); c++, n++) \
1962 { \
1963 t = multibyte ? TRANSLATE (c) : TRANSLATE (MAKE_CHAR_MULTIBYTE (c)); \
1964 if (t_last + n == t) \
1965 continue; \
1966 SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \
1967 t_last = t; \
8f924df7 1968 n = 0; \
6fdd04b0
KH
1969 } \
1970 if (n > 0) \
1971 SET_RANGE_TABLE_WORK_AREA ((work_area), t_last, t_last + n - 1); \
bf216479
KH
1972 } while (0)
1973
1974#endif /* emacs */
1975
fa9a63c5 1976/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1977#define GET_UNSIGNED_NUMBER(num) \
99633e97 1978 do { if (p != pend) \
fa9a63c5 1979 { \
25fe55af 1980 PATFETCH (c); \
060cc8ff
RS
1981 if (c == ' ') \
1982 FREE_STACK_RETURN (REG_BADBR); \
4bb91c68 1983 while ('0' <= c && c <= '9') \
25fe55af 1984 { \
f6df485f 1985 int prev; \
25fe55af 1986 if (num < 0) \
f6df485f
RS
1987 num = 0; \
1988 prev = num; \
25fe55af 1989 num = num * 10 + c - '0'; \
f6df485f
RS
1990 if (num / 10 != prev) \
1991 FREE_STACK_RETURN (REG_BADBR); \
25fe55af 1992 if (p == pend) \
f6df485f 1993 break; \
25fe55af
RS
1994 PATFETCH (c); \
1995 } \
060cc8ff
RS
1996 if (c == ' ') \
1997 FREE_STACK_RETURN (REG_BADBR); \
25fe55af 1998 } \
99633e97 1999 } while (0)
77d11aec 2000\f
a0ad02f7 2001#if WIDE_CHAR_SUPPORT
14473664
SM
2002/* The GNU C library provides support for user-defined character classes
2003 and the functions from ISO C amendement 1. */
2004# ifdef CHARCLASS_NAME_MAX
2005# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
2006# else
2007/* This shouldn't happen but some implementation might still have this
2008 problem. Use a reasonable default value. */
2009# define CHAR_CLASS_MAX_LENGTH 256
2010# endif
2011typedef wctype_t re_wctype_t;
01618498 2012typedef wchar_t re_wchar_t;
14473664
SM
2013# define re_wctype wctype
2014# define re_iswctype iswctype
2015# define re_wctype_to_bit(cc) 0
2016#else
2017# define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
2018# define btowc(c) c
2019
01618498 2020/* Character classes. */
14473664
SM
2021typedef enum { RECC_ERROR = 0,
2022 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
2023 RECC_GRAPH, RECC_PRINT,
2024 RECC_LOWER, RECC_UPPER,
2025 RECC_PUNCT, RECC_CNTRL,
2026 RECC_DIGIT, RECC_XDIGIT,
2027 RECC_BLANK, RECC_SPACE,
2028 RECC_MULTIBYTE, RECC_NONASCII,
2029 RECC_ASCII, RECC_UNIBYTE
2030} re_wctype_t;
2031
01618498
SM
2032typedef int re_wchar_t;
2033
14473664
SM
2034/* Map a string to the char class it names (if any). */
2035static re_wctype_t
ada30c0e
SM
2036re_wctype (str)
2037 re_char *str;
14473664 2038{
ada30c0e 2039 const char *string = str;
14473664
SM
2040 if (STREQ (string, "alnum")) return RECC_ALNUM;
2041 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2042 else if (STREQ (string, "word")) return RECC_WORD;
2043 else if (STREQ (string, "ascii")) return RECC_ASCII;
2044 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2045 else if (STREQ (string, "graph")) return RECC_GRAPH;
2046 else if (STREQ (string, "lower")) return RECC_LOWER;
2047 else if (STREQ (string, "print")) return RECC_PRINT;
2048 else if (STREQ (string, "punct")) return RECC_PUNCT;
2049 else if (STREQ (string, "space")) return RECC_SPACE;
2050 else if (STREQ (string, "upper")) return RECC_UPPER;
2051 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2052 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2053 else if (STREQ (string, "digit")) return RECC_DIGIT;
2054 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2055 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2056 else if (STREQ (string, "blank")) return RECC_BLANK;
2057 else return 0;
2058}
2059
2060/* True iff CH is in the char class CC. */
2061static boolean
2062re_iswctype (ch, cc)
2063 int ch;
2064 re_wctype_t cc;
2065{
2066 switch (cc)
2067 {
0cdd06f8
SM
2068 case RECC_ALNUM: return ISALNUM (ch);
2069 case RECC_ALPHA: return ISALPHA (ch);
2070 case RECC_BLANK: return ISBLANK (ch);
2071 case RECC_CNTRL: return ISCNTRL (ch);
2072 case RECC_DIGIT: return ISDIGIT (ch);
2073 case RECC_GRAPH: return ISGRAPH (ch);
2074 case RECC_LOWER: return ISLOWER (ch);
2075 case RECC_PRINT: return ISPRINT (ch);
2076 case RECC_PUNCT: return ISPUNCT (ch);
2077 case RECC_SPACE: return ISSPACE (ch);
2078 case RECC_UPPER: return ISUPPER (ch);
2079 case RECC_XDIGIT: return ISXDIGIT (ch);
2080 case RECC_ASCII: return IS_REAL_ASCII (ch);
2081 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2082 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2083 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2084 case RECC_WORD: return ISWORD (ch);
2085 case RECC_ERROR: return false;
2086 default:
2087 abort();
14473664
SM
2088 }
2089}
fa9a63c5 2090
14473664
SM
2091/* Return a bit-pattern to use in the range-table bits to match multibyte
2092 chars of class CC. */
2093static int
2094re_wctype_to_bit (cc)
2095 re_wctype_t cc;
2096{
2097 switch (cc)
2098 {
2099 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2100 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2101 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2102 case RECC_LOWER: return BIT_LOWER;
2103 case RECC_UPPER: return BIT_UPPER;
2104 case RECC_PUNCT: return BIT_PUNCT;
2105 case RECC_SPACE: return BIT_SPACE;
14473664 2106 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2107 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2108 default:
2109 abort();
14473664
SM
2110 }
2111}
2112#endif
77d11aec
RS
2113\f
2114/* Filling in the work area of a range. */
2115
2116/* Actually extend the space in WORK_AREA. */
2117
2118static void
2119extend_range_table_work_area (work_area)
2120 struct range_table_work_area *work_area;
177c0ea7 2121{
77d11aec
RS
2122 work_area->allocated += 16 * sizeof (int);
2123 if (work_area->table)
2124 work_area->table
2125 = (int *) realloc (work_area->table, work_area->allocated);
2126 else
2127 work_area->table
2128 = (int *) malloc (work_area->allocated);
2129}
2130
8f924df7 2131#if 0
77d11aec
RS
2132#ifdef emacs
2133
2134/* Carefully find the ranges of codes that are equivalent
2135 under case conversion to the range start..end when passed through
2136 TRANSLATE. Handle the case where non-letters can come in between
2137 two upper-case letters (which happens in Latin-1).
2138 Also handle the case of groups of more than 2 case-equivalent chars.
2139
2140 The basic method is to look at consecutive characters and see
2141 if they can form a run that can be handled as one.
2142
2143 Returns -1 if successful, REG_ESPACE if ran out of space. */
2144
2145static int
2146set_image_of_range_1 (work_area, start, end, translate)
2147 RE_TRANSLATE_TYPE translate;
2148 struct range_table_work_area *work_area;
2149 re_wchar_t start, end;
2150{
2151 /* `one_case' indicates a character, or a run of characters,
2152 each of which is an isolate (no case-equivalents).
2153 This includes all ASCII non-letters.
2154
2155 `two_case' indicates a character, or a run of characters,
2156 each of which has two case-equivalent forms.
2157 This includes all ASCII letters.
2158
2159 `strange' indicates a character that has more than one
2160 case-equivalent. */
177c0ea7 2161
77d11aec
RS
2162 enum case_type {one_case, two_case, strange};
2163
2164 /* Describe the run that is in progress,
2165 which the next character can try to extend.
2166 If run_type is strange, that means there really is no run.
2167 If run_type is one_case, then run_start...run_end is the run.
2168 If run_type is two_case, then the run is run_start...run_end,
2169 and the case-equivalents end at run_eqv_end. */
2170
2171 enum case_type run_type = strange;
2172 int run_start, run_end, run_eqv_end;
2173
2174 Lisp_Object eqv_table;
2175
2176 if (!RE_TRANSLATE_P (translate))
2177 {
b7c12565 2178 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2179 work_area->table[work_area->used++] = (start);
2180 work_area->table[work_area->used++] = (end);
b7c12565 2181 return -1;
77d11aec
RS
2182 }
2183
2184 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2185
77d11aec
RS
2186 for (; start <= end; start++)
2187 {
2188 enum case_type this_type;
2189 int eqv = RE_TRANSLATE (eqv_table, start);
2190 int minchar, maxchar;
2191
2192 /* Classify this character */
2193 if (eqv == start)
2194 this_type = one_case;
2195 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2196 this_type = two_case;
2197 else
2198 this_type = strange;
2199
2200 if (start < eqv)
2201 minchar = start, maxchar = eqv;
2202 else
2203 minchar = eqv, maxchar = start;
2204
2205 /* Can this character extend the run in progress? */
2206 if (this_type == strange || this_type != run_type
2207 || !(minchar == run_end + 1
2208 && (run_type == two_case
2209 ? maxchar == run_eqv_end + 1 : 1)))
2210 {
2211 /* No, end the run.
2212 Record each of its equivalent ranges. */
2213 if (run_type == one_case)
2214 {
2215 EXTEND_RANGE_TABLE (work_area, 2);
2216 work_area->table[work_area->used++] = run_start;
2217 work_area->table[work_area->used++] = run_end;
2218 }
2219 else if (run_type == two_case)
2220 {
2221 EXTEND_RANGE_TABLE (work_area, 4);
2222 work_area->table[work_area->used++] = run_start;
2223 work_area->table[work_area->used++] = run_end;
2224 work_area->table[work_area->used++]
2225 = RE_TRANSLATE (eqv_table, run_start);
2226 work_area->table[work_area->used++]
2227 = RE_TRANSLATE (eqv_table, run_end);
2228 }
2229 run_type = strange;
2230 }
177c0ea7 2231
77d11aec
RS
2232 if (this_type == strange)
2233 {
2234 /* For a strange character, add each of its equivalents, one
2235 by one. Don't start a range. */
2236 do
2237 {
2238 EXTEND_RANGE_TABLE (work_area, 2);
2239 work_area->table[work_area->used++] = eqv;
2240 work_area->table[work_area->used++] = eqv;
2241 eqv = RE_TRANSLATE (eqv_table, eqv);
2242 }
2243 while (eqv != start);
2244 }
2245
2246 /* Add this char to the run, or start a new run. */
2247 else if (run_type == strange)
2248 {
2249 /* Initialize a new range. */
2250 run_type = this_type;
2251 run_start = start;
2252 run_end = start;
2253 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2254 }
2255 else
2256 {
2257 /* Extend a running range. */
2258 run_end = minchar;
2259 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2260 }
2261 }
2262
2263 /* If a run is still in progress at the end, finish it now
2264 by recording its equivalent ranges. */
2265 if (run_type == one_case)
2266 {
2267 EXTEND_RANGE_TABLE (work_area, 2);
2268 work_area->table[work_area->used++] = run_start;
2269 work_area->table[work_area->used++] = run_end;
2270 }
2271 else if (run_type == two_case)
2272 {
2273 EXTEND_RANGE_TABLE (work_area, 4);
2274 work_area->table[work_area->used++] = run_start;
2275 work_area->table[work_area->used++] = run_end;
2276 work_area->table[work_area->used++]
2277 = RE_TRANSLATE (eqv_table, run_start);
2278 work_area->table[work_area->used++]
2279 = RE_TRANSLATE (eqv_table, run_end);
2280 }
2281
2282 return -1;
2283}
36595814 2284
77d11aec 2285#endif /* emacs */
36595814 2286
b7c12565 2287/* Record the the image of the range start..end when passed through
36595814
SM
2288 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2289 and is not even necessarily contiguous.
b7c12565
RS
2290 Normally we approximate it with the smallest contiguous range that contains
2291 all the chars we need. However, for Latin-1 we go to extra effort
2292 to do a better job.
2293
2294 This function is not called for ASCII ranges.
77d11aec
RS
2295
2296 Returns -1 if successful, REG_ESPACE if ran out of space. */
2297
2298static int
36595814
SM
2299set_image_of_range (work_area, start, end, translate)
2300 RE_TRANSLATE_TYPE translate;
2301 struct range_table_work_area *work_area;
2302 re_wchar_t start, end;
2303{
77d11aec
RS
2304 re_wchar_t cmin, cmax;
2305
2306#ifdef emacs
2307 /* For Latin-1 ranges, use set_image_of_range_1
2308 to get proper handling of ranges that include letters and nonletters.
b7c12565 2309 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2310 For other character sets, we don't bother to get this right. */
b7c12565
RS
2311 if (RE_TRANSLATE_P (translate) && start < 04400
2312 && !(start < 04200 && end >= 04377))
77d11aec 2313 {
b7c12565 2314 int newend;
77d11aec 2315 int tem;
b7c12565
RS
2316 newend = end;
2317 if (newend > 04377)
2318 newend = 04377;
2319 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2320 if (tem > 0)
2321 return tem;
2322
2323 start = 04400;
2324 if (end < 04400)
2325 return -1;
2326 }
2327#endif
2328
b7c12565
RS
2329 EXTEND_RANGE_TABLE (work_area, 2);
2330 work_area->table[work_area->used++] = (start);
2331 work_area->table[work_area->used++] = (end);
2332
2333 cmin = -1, cmax = -1;
77d11aec 2334
36595814 2335 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2336 {
2337 int ch;
77d11aec 2338
b7c12565
RS
2339 for (ch = start; ch <= end; ch++)
2340 {
2341 re_wchar_t c = TRANSLATE (ch);
2342 if (! (start <= c && c <= end))
2343 {
2344 if (cmin == -1)
2345 cmin = c, cmax = c;
2346 else
2347 {
2348 cmin = MIN (cmin, c);
2349 cmax = MAX (cmax, c);
2350 }
2351 }
2352 }
2353
2354 if (cmin != -1)
2355 {
2356 EXTEND_RANGE_TABLE (work_area, 2);
2357 work_area->table[work_area->used++] = (cmin);
2358 work_area->table[work_area->used++] = (cmax);
2359 }
2360 }
36595814 2361
77d11aec
RS
2362 return -1;
2363}
8f924df7 2364#endif /* 0 */
fa9a63c5
RM
2365\f
2366#ifndef MATCH_MAY_ALLOCATE
2367
2368/* If we cannot allocate large objects within re_match_2_internal,
2369 we make the fail stack and register vectors global.
2370 The fail stack, we grow to the maximum size when a regexp
2371 is compiled.
2372 The register vectors, we adjust in size each time we
2373 compile a regexp, according to the number of registers it needs. */
2374
2375static fail_stack_type fail_stack;
2376
2377/* Size with which the following vectors are currently allocated.
2378 That is so we can make them bigger as needed,
4bb91c68 2379 but never make them smaller. */
fa9a63c5
RM
2380static int regs_allocated_size;
2381
66f0296e
SM
2382static re_char ** regstart, ** regend;
2383static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2384
2385/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2386 but don't make them smaller. */
fa9a63c5
RM
2387
2388static
2389regex_grow_registers (num_regs)
2390 int num_regs;
2391{
2392 if (num_regs > regs_allocated_size)
2393 {
66f0296e
SM
2394 RETALLOC_IF (regstart, num_regs, re_char *);
2395 RETALLOC_IF (regend, num_regs, re_char *);
2396 RETALLOC_IF (best_regstart, num_regs, re_char *);
2397 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2398
2399 regs_allocated_size = num_regs;
2400 }
2401}
2402
2403#endif /* not MATCH_MAY_ALLOCATE */
2404\f
99633e97
SM
2405static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2406 compile_stack,
2407 regnum_t regnum));
2408
fa9a63c5
RM
2409/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2410 Returns one of error codes defined in `regex.h', or zero for success.
2411
2412 Assumes the `allocated' (and perhaps `buffer') and `translate'
2413 fields are set in BUFP on entry.
2414
2415 If it succeeds, results are put in BUFP (if it returns an error, the
2416 contents of BUFP are undefined):
2417 `buffer' is the compiled pattern;
2418 `syntax' is set to SYNTAX;
2419 `used' is set to the length of the compiled pattern;
2420 `fastmap_accurate' is zero;
2421 `re_nsub' is the number of subexpressions in PATTERN;
2422 `not_bol' and `not_eol' are zero;
5e69f11e 2423
c0f9ea08 2424 The `fastmap' field is neither examined nor set. */
fa9a63c5 2425
505bde11
SM
2426/* Insert the `jump' from the end of last alternative to "here".
2427 The space for the jump has already been allocated. */
2428#define FIXUP_ALT_JUMP() \
2429do { \
2430 if (fixup_alt_jump) \
2431 STORE_JUMP (jump, fixup_alt_jump, b); \
2432} while (0)
2433
2434
fa9a63c5
RM
2435/* Return, freeing storage we allocated. */
2436#define FREE_STACK_RETURN(value) \
b18215fc
RS
2437 do { \
2438 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2439 free (compile_stack.stack); \
2440 return value; \
2441 } while (0)
fa9a63c5
RM
2442
2443static reg_errcode_t
2444regex_compile (pattern, size, syntax, bufp)
66f0296e 2445 re_char *pattern;
4bb91c68 2446 size_t size;
fa9a63c5
RM
2447 reg_syntax_t syntax;
2448 struct re_pattern_buffer *bufp;
2449{
01618498
SM
2450 /* We fetch characters from PATTERN here. */
2451 register re_wchar_t c, c1;
5e69f11e 2452
fa9a63c5 2453 /* A random temporary spot in PATTERN. */
66f0296e 2454 re_char *p1;
fa9a63c5
RM
2455
2456 /* Points to the end of the buffer, where we should append. */
2457 register unsigned char *b;
5e69f11e 2458
fa9a63c5
RM
2459 /* Keeps track of unclosed groups. */
2460 compile_stack_type compile_stack;
2461
2462 /* Points to the current (ending) position in the pattern. */
22336245
RS
2463#ifdef AIX
2464 /* `const' makes AIX compiler fail. */
66f0296e 2465 unsigned char *p = pattern;
22336245 2466#else
66f0296e 2467 re_char *p = pattern;
22336245 2468#endif
66f0296e 2469 re_char *pend = pattern + size;
5e69f11e 2470
fa9a63c5 2471 /* How to translate the characters in the pattern. */
6676cb1c 2472 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2473
2474 /* Address of the count-byte of the most recently inserted `exactn'
2475 command. This makes it possible to tell if a new exact-match
2476 character can be added to that command or if the character requires
2477 a new `exactn' command. */
2478 unsigned char *pending_exact = 0;
2479
2480 /* Address of start of the most recently finished expression.
2481 This tells, e.g., postfix * where to find the start of its
2482 operand. Reset at the beginning of groups and alternatives. */
2483 unsigned char *laststart = 0;
2484
2485 /* Address of beginning of regexp, or inside of last group. */
2486 unsigned char *begalt;
2487
2488 /* Place in the uncompiled pattern (i.e., the {) to
2489 which to go back if the interval is invalid. */
66f0296e 2490 re_char *beg_interval;
5e69f11e 2491
fa9a63c5 2492 /* Address of the place where a forward jump should go to the end of
25fe55af 2493 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2494 last -- ends with a forward jump of this sort. */
2495 unsigned char *fixup_alt_jump = 0;
2496
2497 /* Counts open-groups as they are encountered. Remembered for the
2498 matching close-group on the compile stack, so the same register
2499 number is put in the stop_memory as the start_memory. */
2500 regnum_t regnum = 0;
2501
b18215fc
RS
2502 /* Work area for range table of charset. */
2503 struct range_table_work_area range_table_work;
2504
2d1675e4
SM
2505 /* If the object matched can contain multibyte characters. */
2506 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2507
8f924df7 2508 /* If a target of matching can contain multibyte characters. */
6fdd04b0
KH
2509 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2510
fa9a63c5 2511#ifdef DEBUG
99633e97 2512 debug++;
fa9a63c5 2513 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2514 if (debug > 0)
fa9a63c5
RM
2515 {
2516 unsigned debug_count;
5e69f11e 2517
fa9a63c5 2518 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2519 putchar (pattern[debug_count]);
fa9a63c5
RM
2520 putchar ('\n');
2521 }
2522#endif /* DEBUG */
2523
2524 /* Initialize the compile stack. */
2525 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2526 if (compile_stack.stack == NULL)
2527 return REG_ESPACE;
2528
2529 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2530 compile_stack.avail = 0;
2531
b18215fc
RS
2532 range_table_work.table = 0;
2533 range_table_work.allocated = 0;
2534
fa9a63c5
RM
2535 /* Initialize the pattern buffer. */
2536 bufp->syntax = syntax;
2537 bufp->fastmap_accurate = 0;
2538 bufp->not_bol = bufp->not_eol = 0;
2539
2540 /* Set `used' to zero, so that if we return an error, the pattern
2541 printer (for debugging) will think there's no pattern. We reset it
2542 at the end. */
2543 bufp->used = 0;
5e69f11e 2544
fa9a63c5 2545 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2546 bufp->re_nsub = 0;
fa9a63c5 2547
0b32bf0e 2548#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2549 /* Initialize the syntax table. */
2550 init_syntax_once ();
2551#endif
2552
2553 if (bufp->allocated == 0)
2554 {
2555 if (bufp->buffer)
2556 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2557 enough space. This loses if buffer's address is bogus, but
2558 that is the user's responsibility. */
2559 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2560 }
fa9a63c5 2561 else
25fe55af
RS
2562 { /* Caller did not allocate a buffer. Do it for them. */
2563 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2564 }
fa9a63c5
RM
2565 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2566
2567 bufp->allocated = INIT_BUF_SIZE;
2568 }
2569
2570 begalt = b = bufp->buffer;
2571
2572 /* Loop through the uncompiled pattern until we're at the end. */
2573 while (p != pend)
2574 {
2575 PATFETCH (c);
2576
2577 switch (c)
25fe55af
RS
2578 {
2579 case '^':
2580 {
2581 if ( /* If at start of pattern, it's an operator. */
2582 p == pattern + 1
2583 /* If context independent, it's an operator. */
2584 || syntax & RE_CONTEXT_INDEP_ANCHORS
2585 /* Otherwise, depends on what's come before. */
2586 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2587 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2588 else
2589 goto normal_char;
2590 }
2591 break;
2592
2593
2594 case '$':
2595 {
2596 if ( /* If at end of pattern, it's an operator. */
2597 p == pend
2598 /* If context independent, it's an operator. */
2599 || syntax & RE_CONTEXT_INDEP_ANCHORS
2600 /* Otherwise, depends on what's next. */
2601 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2602 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2603 else
2604 goto normal_char;
2605 }
2606 break;
fa9a63c5
RM
2607
2608
2609 case '+':
25fe55af
RS
2610 case '?':
2611 if ((syntax & RE_BK_PLUS_QM)
2612 || (syntax & RE_LIMITED_OPS))
2613 goto normal_char;
2614 handle_plus:
2615 case '*':
2616 /* If there is no previous pattern... */
2617 if (!laststart)
2618 {
2619 if (syntax & RE_CONTEXT_INVALID_OPS)
2620 FREE_STACK_RETURN (REG_BADRPT);
2621 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2622 goto normal_char;
2623 }
2624
2625 {
25fe55af 2626 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2627 boolean zero_times_ok = 0, many_times_ok = 0;
2628 boolean greedy = 1;
25fe55af
RS
2629
2630 /* If there is a sequence of repetition chars, collapse it
2631 down to just one (the right one). We can't combine
2632 interval operators with these because of, e.g., `a{2}*',
2633 which should only match an even number of `a's. */
2634
2635 for (;;)
2636 {
0b32bf0e 2637 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2638 && c == '?' && (zero_times_ok || many_times_ok))
2639 greedy = 0;
2640 else
2641 {
2642 zero_times_ok |= c != '+';
2643 many_times_ok |= c != '?';
2644 }
25fe55af
RS
2645
2646 if (p == pend)
2647 break;
ed0767d8
SM
2648 else if (*p == '*'
2649 || (!(syntax & RE_BK_PLUS_QM)
2650 && (*p == '+' || *p == '?')))
25fe55af 2651 ;
ed0767d8 2652 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2653 {
ed0767d8
SM
2654 if (p+1 == pend)
2655 FREE_STACK_RETURN (REG_EESCAPE);
2656 if (p[1] == '+' || p[1] == '?')
2657 PATFETCH (c); /* Gobble up the backslash. */
2658 else
2659 break;
25fe55af
RS
2660 }
2661 else
ed0767d8 2662 break;
25fe55af 2663 /* If we get here, we found another repeat character. */
ed0767d8
SM
2664 PATFETCH (c);
2665 }
25fe55af
RS
2666
2667 /* Star, etc. applied to an empty pattern is equivalent
2668 to an empty pattern. */
4e8a9132 2669 if (!laststart || laststart == b)
25fe55af
RS
2670 break;
2671
2672 /* Now we know whether or not zero matches is allowed
2673 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2674 if (greedy)
2675 {
99633e97 2676 if (many_times_ok)
4e8a9132
SM
2677 {
2678 boolean simple = skip_one_char (laststart) == b;
2679 unsigned int startoffset = 0;
f6a3f532 2680 re_opcode_t ofj =
01618498 2681 /* Check if the loop can match the empty string. */
6df42991
SM
2682 (simple || !analyse_first (laststart, b, NULL, 0))
2683 ? on_failure_jump : on_failure_jump_loop;
4e8a9132 2684 assert (skip_one_char (laststart) <= b);
177c0ea7 2685
4e8a9132
SM
2686 if (!zero_times_ok && simple)
2687 { /* Since simple * loops can be made faster by using
2688 on_failure_keep_string_jump, we turn simple P+
2689 into PP* if P is simple. */
2690 unsigned char *p1, *p2;
2691 startoffset = b - laststart;
2692 GET_BUFFER_SPACE (startoffset);
2693 p1 = b; p2 = laststart;
2694 while (p2 < p1)
2695 *b++ = *p2++;
2696 zero_times_ok = 1;
99633e97 2697 }
4e8a9132
SM
2698
2699 GET_BUFFER_SPACE (6);
2700 if (!zero_times_ok)
2701 /* A + loop. */
f6a3f532 2702 STORE_JUMP (ofj, b, b + 6);
99633e97 2703 else
4e8a9132
SM
2704 /* Simple * loops can use on_failure_keep_string_jump
2705 depending on what follows. But since we don't know
2706 that yet, we leave the decision up to
2707 on_failure_jump_smart. */
f6a3f532 2708 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2709 laststart + startoffset, b + 6);
99633e97 2710 b += 3;
4e8a9132 2711 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2712 b += 3;
2713 }
2714 else
2715 {
4e8a9132
SM
2716 /* A simple ? pattern. */
2717 assert (zero_times_ok);
2718 GET_BUFFER_SPACE (3);
2719 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2720 b += 3;
2721 }
1c8c6d39
DL
2722 }
2723 else /* not greedy */
2724 { /* I wish the greedy and non-greedy cases could be merged. */
2725
0683b6fa 2726 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2727 if (many_times_ok)
2728 {
f6a3f532
SM
2729 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2730
6df42991
SM
2731 /* The non-greedy multiple match looks like
2732 a repeat..until: we only need a conditional jump
2733 at the end of the loop. */
f6a3f532
SM
2734 if (emptyp) BUF_PUSH (no_op);
2735 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2736 : on_failure_jump, b, laststart);
1c8c6d39
DL
2737 b += 3;
2738 if (zero_times_ok)
2739 {
2740 /* The repeat...until naturally matches one or more.
2741 To also match zero times, we need to first jump to
6df42991 2742 the end of the loop (its conditional jump). */
1c8c6d39
DL
2743 INSERT_JUMP (jump, laststart, b);
2744 b += 3;
2745 }
2746 }
2747 else
2748 {
2749 /* non-greedy a?? */
1c8c6d39
DL
2750 INSERT_JUMP (jump, laststart, b + 3);
2751 b += 3;
2752 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2753 b += 3;
2754 }
2755 }
2756 }
4e8a9132 2757 pending_exact = 0;
fa9a63c5
RM
2758 break;
2759
2760
2761 case '.':
25fe55af
RS
2762 laststart = b;
2763 BUF_PUSH (anychar);
2764 break;
fa9a63c5
RM
2765
2766
25fe55af
RS
2767 case '[':
2768 {
b18215fc 2769 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2770
25fe55af 2771 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2772
25fe55af
RS
2773 /* Ensure that we have enough space to push a charset: the
2774 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2775 GET_BUFFER_SPACE (34);
2776
25fe55af 2777 laststart = b;
e318085a 2778
25fe55af
RS
2779 /* We test `*p == '^' twice, instead of using an if
2780 statement, so we only need one BUF_PUSH. */
2781 BUF_PUSH (*p == '^' ? charset_not : charset);
2782 if (*p == '^')
2783 p++;
e318085a 2784
25fe55af
RS
2785 /* Remember the first position in the bracket expression. */
2786 p1 = p;
e318085a 2787
25fe55af
RS
2788 /* Push the number of bytes in the bitmap. */
2789 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2790
25fe55af
RS
2791 /* Clear the whole map. */
2792 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2793
25fe55af
RS
2794 /* charset_not matches newline according to a syntax bit. */
2795 if ((re_opcode_t) b[-2] == charset_not
2796 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2797 SET_LIST_BIT ('\n');
fa9a63c5 2798
25fe55af
RS
2799 /* Read in characters and ranges, setting map bits. */
2800 for (;;)
2801 {
b18215fc 2802 boolean escaped_char = false;
2d1675e4 2803 const unsigned char *p2 = p;
e318085a 2804
25fe55af 2805 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2806
36595814
SM
2807 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2808 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2809 So the translation is done later in a loop. Example:
2810 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2811 PATFETCH (c);
e318085a 2812
25fe55af
RS
2813 /* \ might escape characters inside [...] and [^...]. */
2814 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2815 {
2816 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2817
2818 PATFETCH (c);
b18215fc 2819 escaped_char = true;
25fe55af 2820 }
b18215fc
RS
2821 else
2822 {
657fcfbd
RS
2823 /* Could be the end of the bracket expression. If it's
2824 not (i.e., when the bracket expression is `[]' so
2825 far), the ']' character bit gets set way below. */
2d1675e4 2826 if (c == ']' && p2 != p1)
657fcfbd 2827 break;
25fe55af 2828 }
b18215fc 2829
25fe55af
RS
2830 /* See if we're at the beginning of a possible character
2831 class. */
b18215fc 2832
2d1675e4
SM
2833 if (!escaped_char &&
2834 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2835 {
2836 /* Leave room for the null. */
14473664 2837 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2838 const unsigned char *class_beg;
b18215fc 2839
25fe55af
RS
2840 PATFETCH (c);
2841 c1 = 0;
ed0767d8 2842 class_beg = p;
b18215fc 2843
25fe55af
RS
2844 /* If pattern is `[[:'. */
2845 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2846
25fe55af
RS
2847 for (;;)
2848 {
14473664
SM
2849 PATFETCH (c);
2850 if ((c == ':' && *p == ']') || p == pend)
2851 break;
2852 if (c1 < CHAR_CLASS_MAX_LENGTH)
2853 str[c1++] = c;
2854 else
2855 /* This is in any case an invalid class name. */
2856 str[0] = '\0';
25fe55af
RS
2857 }
2858 str[c1] = '\0';
b18215fc
RS
2859
2860 /* If isn't a word bracketed by `[:' and `:]':
2861 undo the ending character, the letters, and
2862 leave the leading `:' and `[' (but set bits for
2863 them). */
25fe55af
RS
2864 if (c == ':' && *p == ']')
2865 {
36595814 2866 re_wchar_t ch;
14473664 2867 re_wctype_t cc;
8f924df7 2868 int limit;
14473664
SM
2869
2870 cc = re_wctype (str);
2871
2872 if (cc == 0)
fa9a63c5
RM
2873 FREE_STACK_RETURN (REG_ECTYPE);
2874
14473664
SM
2875 /* Throw away the ] at the end of the character
2876 class. */
2877 PATFETCH (c);
fa9a63c5 2878
14473664 2879 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2880
96cc36cc
RS
2881 /* Most character classes in a multibyte match
2882 just set a flag. Exceptions are is_blank,
2883 is_digit, is_cntrl, and is_xdigit, since
2884 they can only match ASCII characters. We
14473664
SM
2885 don't need to handle them for multibyte.
2886 They are distinguished by a negative wctype. */
96cc36cc 2887
8f924df7
KH
2888 for (ch = 0; ch < 128; ++ch)
2889 if (re_iswctype (btowc (ch), cc))
2890 {
2891 c = TRANSLATE (ch);
2892 SET_LIST_BIT (c);
2893 }
96cc36cc 2894
8f924df7 2895 if (target_multibyte)
25fe55af 2896 {
8f924df7
KH
2897 SET_RANGE_TABLE_WORK_AREA_BIT
2898 (range_table_work, re_wctype_to_bit (cc));
2899 }
2900 else
25fe55af 2901 {
8f924df7
KH
2902 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
2903 {
2904 c = ch;
2905 MAKE_CHAR_MULTIBYTE (c);
2906 if (re_iswctype (btowc (c), cc))
2907 {
2908 c = TRANSLATE (c);
2909 MAKE_CHAR_UNIBYTE (c);
2910 SET_LIST_BIT (c);
2911 }
2912 }
25fe55af 2913 }
b18215fc
RS
2914
2915 /* Repeat the loop. */
2916 continue;
25fe55af
RS
2917 }
2918 else
2919 {
ed0767d8
SM
2920 /* Go back to right after the "[:". */
2921 p = class_beg;
25fe55af 2922 SET_LIST_BIT ('[');
b18215fc
RS
2923
2924 /* Because the `:' may starts the range, we
2925 can't simply set bit and repeat the loop.
25fe55af 2926 Instead, just set it to C and handle below. */
b18215fc 2927 c = ':';
25fe55af
RS
2928 }
2929 }
b18215fc
RS
2930
2931 if (p < pend && p[0] == '-' && p[1] != ']')
2932 {
2933
2934 /* Discard the `-'. */
2935 PATFETCH (c1);
2936
2937 /* Fetch the character which ends the range. */
2938 PATFETCH (c1);
6fdd04b0 2939 if (c > c1)
e934739e 2940 {
6fdd04b0
KH
2941 if (syntax & RE_NO_EMPTY_RANGES)
2942 FREE_STACK_RETURN (REG_ERANGE);
2943 /* Else, repeat the loop. */
e934739e 2944 }
e318085a 2945 }
25fe55af 2946 else
b18215fc
RS
2947 /* Range from C to C. */
2948 c1 = c;
2949
6fdd04b0
KH
2950#ifndef emacs
2951 c = TRANSLATE (c);
2952 c1 = TRANSLATE (c1);
8f924df7
KH
2953 /* Set the range into bitmap */
2954 for (; c <= c1; c++)
2955 SET_LIST_BIT (TRANSLATE (c));
6fdd04b0
KH
2956#else /* not emacs */
2957 if (target_multibyte)
25fe55af 2958 {
8f924df7 2959 if (c1 >= 128)
b18215fc 2960 {
6fdd04b0
KH
2961 re_wchar_t c0 = MAX (c, 128);
2962
2963 SETUP_MULTIBYTE_RANGE (range_table_work, c0, c1);
8f924df7 2964 c1 = 127;
b18215fc 2965 }
8f924df7
KH
2966 for (; c <= c1; c++)
2967 SET_LIST_BIT (TRANSLATE (c));
bf216479 2968 }
6fdd04b0 2969 else
25fe55af 2970 {
8f924df7
KH
2971 re_wchar_t c0;
2972
2973 for (; c <= c1; c++)
b18215fc 2974 {
8f924df7
KH
2975 c0 = c;
2976 if (! multibyte)
2977 MAKE_CHAR_MULTIBYTE (c0);
2978 c0 = TRANSLATE (c0);
2979 MAKE_CHAR_UNIBYTE (c0);
2980 SET_LIST_BIT (c0);
e934739e 2981 }
25fe55af 2982 }
6fdd04b0 2983#endif /* not emacs */
e318085a
RS
2984 }
2985
25fe55af
RS
2986 /* Discard any (non)matching list bytes that are all 0 at the
2987 end of the map. Decrease the map-length byte too. */
2988 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2989 b[-1]--;
2990 b += b[-1];
fa9a63c5 2991
96cc36cc
RS
2992 /* Build real range table from work area. */
2993 if (RANGE_TABLE_WORK_USED (range_table_work)
2994 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
2995 {
2996 int i;
2997 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2998
b18215fc 2999 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
3000 bytes for flags, two for COUNT, and three bytes for
3001 each character. */
3002 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 3003
b18215fc
RS
3004 /* Indicate the existence of range table. */
3005 laststart[1] |= 0x80;
fa9a63c5 3006
96cc36cc
RS
3007 /* Store the character class flag bits into the range table.
3008 If not in emacs, these flag bits are always 0. */
3009 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3010 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3011
b18215fc
RS
3012 STORE_NUMBER_AND_INCR (b, used / 2);
3013 for (i = 0; i < used; i++)
3014 STORE_CHARACTER_AND_INCR
3015 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3016 }
25fe55af
RS
3017 }
3018 break;
fa9a63c5
RM
3019
3020
b18215fc 3021 case '(':
25fe55af
RS
3022 if (syntax & RE_NO_BK_PARENS)
3023 goto handle_open;
3024 else
3025 goto normal_char;
fa9a63c5
RM
3026
3027
25fe55af
RS
3028 case ')':
3029 if (syntax & RE_NO_BK_PARENS)
3030 goto handle_close;
3031 else
3032 goto normal_char;
e318085a
RS
3033
3034
25fe55af
RS
3035 case '\n':
3036 if (syntax & RE_NEWLINE_ALT)
3037 goto handle_alt;
3038 else
3039 goto normal_char;
e318085a
RS
3040
3041
b18215fc 3042 case '|':
25fe55af
RS
3043 if (syntax & RE_NO_BK_VBAR)
3044 goto handle_alt;
3045 else
3046 goto normal_char;
3047
3048
3049 case '{':
3050 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3051 goto handle_interval;
3052 else
3053 goto normal_char;
3054
3055
3056 case '\\':
3057 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3058
3059 /* Do not translate the character after the \, so that we can
3060 distinguish, e.g., \B from \b, even if we normally would
3061 translate, e.g., B to b. */
36595814 3062 PATFETCH (c);
25fe55af
RS
3063
3064 switch (c)
3065 {
3066 case '(':
3067 if (syntax & RE_NO_BK_PARENS)
3068 goto normal_backslash;
3069
3070 handle_open:
505bde11
SM
3071 {
3072 int shy = 0;
3073 if (p+1 < pend)
3074 {
3075 /* Look for a special (?...) construct */
ed0767d8 3076 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3077 {
ed0767d8 3078 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
3079 PATFETCH (c);
3080 switch (c)
3081 {
3082 case ':': shy = 1; break;
3083 default:
3084 /* Only (?:...) is supported right now. */
3085 FREE_STACK_RETURN (REG_BADPAT);
3086 }
3087 }
505bde11
SM
3088 }
3089
3090 if (!shy)
3091 {
3092 bufp->re_nsub++;
3093 regnum++;
3094 }
25fe55af 3095
99633e97
SM
3096 if (COMPILE_STACK_FULL)
3097 {
3098 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3099 compile_stack_elt_t);
3100 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3101
99633e97
SM
3102 compile_stack.size <<= 1;
3103 }
25fe55af 3104
99633e97
SM
3105 /* These are the values to restore when we hit end of this
3106 group. They are all relative offsets, so that if the
3107 whole pattern moves because of realloc, they will still
3108 be valid. */
3109 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3110 COMPILE_STACK_TOP.fixup_alt_jump
3111 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3112 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3113 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
3114
3115 /* Do not push a
3116 start_memory for groups beyond the last one we can
3117 represent in the compiled pattern. */
3118 if (regnum <= MAX_REGNUM && !shy)
3119 BUF_PUSH_2 (start_memory, regnum);
3120
3121 compile_stack.avail++;
3122
3123 fixup_alt_jump = 0;
3124 laststart = 0;
3125 begalt = b;
3126 /* If we've reached MAX_REGNUM groups, then this open
3127 won't actually generate any code, so we'll have to
3128 clear pending_exact explicitly. */
3129 pending_exact = 0;
3130 break;
505bde11 3131 }
25fe55af
RS
3132
3133 case ')':
3134 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3135
3136 if (COMPILE_STACK_EMPTY)
505bde11
SM
3137 {
3138 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3139 goto normal_backslash;
3140 else
3141 FREE_STACK_RETURN (REG_ERPAREN);
3142 }
25fe55af
RS
3143
3144 handle_close:
505bde11 3145 FIXUP_ALT_JUMP ();
25fe55af
RS
3146
3147 /* See similar code for backslashed left paren above. */
3148 if (COMPILE_STACK_EMPTY)
505bde11
SM
3149 {
3150 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3151 goto normal_char;
3152 else
3153 FREE_STACK_RETURN (REG_ERPAREN);
3154 }
25fe55af
RS
3155
3156 /* Since we just checked for an empty stack above, this
3157 ``can't happen''. */
3158 assert (compile_stack.avail != 0);
3159 {
3160 /* We don't just want to restore into `regnum', because
3161 later groups should continue to be numbered higher,
3162 as in `(ab)c(de)' -- the second group is #2. */
3163 regnum_t this_group_regnum;
3164
3165 compile_stack.avail--;
3166 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3167 fixup_alt_jump
3168 = COMPILE_STACK_TOP.fixup_alt_jump
3169 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3170 : 0;
3171 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3172 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3173 /* If we've reached MAX_REGNUM groups, then this open
3174 won't actually generate any code, so we'll have to
3175 clear pending_exact explicitly. */
3176 pending_exact = 0;
e318085a 3177
25fe55af
RS
3178 /* We're at the end of the group, so now we know how many
3179 groups were inside this one. */
505bde11
SM
3180 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
3181 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
3182 }
3183 break;
3184
3185
3186 case '|': /* `\|'. */
3187 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3188 goto normal_backslash;
3189 handle_alt:
3190 if (syntax & RE_LIMITED_OPS)
3191 goto normal_char;
3192
3193 /* Insert before the previous alternative a jump which
3194 jumps to this alternative if the former fails. */
3195 GET_BUFFER_SPACE (3);
3196 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3197 pending_exact = 0;
3198 b += 3;
3199
3200 /* The alternative before this one has a jump after it
3201 which gets executed if it gets matched. Adjust that
3202 jump so it will jump to this alternative's analogous
3203 jump (put in below, which in turn will jump to the next
3204 (if any) alternative's such jump, etc.). The last such
3205 jump jumps to the correct final destination. A picture:
3206 _____ _____
3207 | | | |
3208 | v | v
3209 a | b | c
3210
3211 If we are at `b', then fixup_alt_jump right now points to a
3212 three-byte space after `a'. We'll put in the jump, set
3213 fixup_alt_jump to right after `b', and leave behind three
3214 bytes which we'll fill in when we get to after `c'. */
3215
505bde11 3216 FIXUP_ALT_JUMP ();
25fe55af
RS
3217
3218 /* Mark and leave space for a jump after this alternative,
3219 to be filled in later either by next alternative or
3220 when know we're at the end of a series of alternatives. */
3221 fixup_alt_jump = b;
3222 GET_BUFFER_SPACE (3);
3223 b += 3;
3224
3225 laststart = 0;
3226 begalt = b;
3227 break;
3228
3229
3230 case '{':
3231 /* If \{ is a literal. */
3232 if (!(syntax & RE_INTERVALS)
3233 /* If we're at `\{' and it's not the open-interval
3234 operator. */
4bb91c68 3235 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3236 goto normal_backslash;
3237
3238 handle_interval:
3239 {
3240 /* If got here, then the syntax allows intervals. */
3241
3242 /* At least (most) this many matches must be made. */
99633e97 3243 int lower_bound = 0, upper_bound = -1;
25fe55af 3244
ed0767d8 3245 beg_interval = p;
25fe55af
RS
3246
3247 if (p == pend)
4bb91c68 3248 FREE_STACK_RETURN (REG_EBRACE);
25fe55af
RS
3249
3250 GET_UNSIGNED_NUMBER (lower_bound);
3251
3252 if (c == ',')
ed0767d8 3253 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3254 else
3255 /* Interval such as `{1}' => match exactly once. */
3256 upper_bound = lower_bound;
3257
3258 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3259 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3260 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3261
3262 if (!(syntax & RE_NO_BK_BRACES))
3263 {
4bb91c68
SM
3264 if (c != '\\')
3265 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3266
3267 PATFETCH (c);
3268 }
3269
3270 if (c != '}')
4bb91c68 3271 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3272
3273 /* We just parsed a valid interval. */
3274
3275 /* If it's invalid to have no preceding re. */
3276 if (!laststart)
3277 {
3278 if (syntax & RE_CONTEXT_INVALID_OPS)
3279 FREE_STACK_RETURN (REG_BADRPT);
3280 else if (syntax & RE_CONTEXT_INDEP_OPS)
3281 laststart = b;
3282 else
3283 goto unfetch_interval;
3284 }
3285
6df42991
SM
3286 if (upper_bound == 0)
3287 /* If the upper bound is zero, just drop the sub pattern
3288 altogether. */
3289 b = laststart;
3290 else if (lower_bound == 1 && upper_bound == 1)
3291 /* Just match it once: nothing to do here. */
3292 ;
3293
3294 /* Otherwise, we have a nontrivial interval. When
3295 we're all done, the pattern will look like:
3296 set_number_at <jump count> <upper bound>
3297 set_number_at <succeed_n count> <lower bound>
3298 succeed_n <after jump addr> <succeed_n count>
3299 <body of loop>
3300 jump_n <succeed_n addr> <jump count>
3301 (The upper bound and `jump_n' are omitted if
3302 `upper_bound' is 1, though.) */
3303 else
3304 { /* If the upper bound is > 1, we need to insert
3305 more at the end of the loop. */
3306 unsigned int nbytes = (upper_bound < 0 ? 3
3307 : upper_bound > 1 ? 5 : 0);
3308 unsigned int startoffset = 0;
3309
3310 GET_BUFFER_SPACE (20); /* We might use less. */
3311
3312 if (lower_bound == 0)
3313 {
3314 /* A succeed_n that starts with 0 is really a
3315 a simple on_failure_jump_loop. */
3316 INSERT_JUMP (on_failure_jump_loop, laststart,
3317 b + 3 + nbytes);
3318 b += 3;
3319 }
3320 else
3321 {
3322 /* Initialize lower bound of the `succeed_n', even
3323 though it will be set during matching by its
3324 attendant `set_number_at' (inserted next),
3325 because `re_compile_fastmap' needs to know.
3326 Jump to the `jump_n' we might insert below. */
3327 INSERT_JUMP2 (succeed_n, laststart,
3328 b + 5 + nbytes,
3329 lower_bound);
3330 b += 5;
3331
3332 /* Code to initialize the lower bound. Insert
3333 before the `succeed_n'. The `5' is the last two
3334 bytes of this `set_number_at', plus 3 bytes of
3335 the following `succeed_n'. */
3336 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3337 b += 5;
3338 startoffset += 5;
3339 }
3340
3341 if (upper_bound < 0)
3342 {
3343 /* A negative upper bound stands for infinity,
3344 in which case it degenerates to a plain jump. */
3345 STORE_JUMP (jump, b, laststart + startoffset);
3346 b += 3;
3347 }
3348 else if (upper_bound > 1)
3349 { /* More than one repetition is allowed, so
3350 append a backward jump to the `succeed_n'
3351 that starts this interval.
3352
3353 When we've reached this during matching,
3354 we'll have matched the interval once, so
3355 jump back only `upper_bound - 1' times. */
3356 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3357 upper_bound - 1);
3358 b += 5;
3359
3360 /* The location we want to set is the second
3361 parameter of the `jump_n'; that is `b-2' as
3362 an absolute address. `laststart' will be
3363 the `set_number_at' we're about to insert;
3364 `laststart+3' the number to set, the source
3365 for the relative address. But we are
3366 inserting into the middle of the pattern --
3367 so everything is getting moved up by 5.
3368 Conclusion: (b - 2) - (laststart + 3) + 5,
3369 i.e., b - laststart.
3370
3371 We insert this at the beginning of the loop
3372 so that if we fail during matching, we'll
3373 reinitialize the bounds. */
3374 insert_op2 (set_number_at, laststart, b - laststart,
3375 upper_bound - 1, b);
3376 b += 5;
3377 }
3378 }
25fe55af
RS
3379 pending_exact = 0;
3380 beg_interval = NULL;
3381 }
3382 break;
3383
3384 unfetch_interval:
3385 /* If an invalid interval, match the characters as literals. */
3386 assert (beg_interval);
3387 p = beg_interval;
3388 beg_interval = NULL;
3389
3390 /* normal_char and normal_backslash need `c'. */
ed0767d8 3391 c = '{';
25fe55af
RS
3392
3393 if (!(syntax & RE_NO_BK_BRACES))
3394 {
ed0767d8
SM
3395 assert (p > pattern && p[-1] == '\\');
3396 goto normal_backslash;
25fe55af 3397 }
ed0767d8
SM
3398 else
3399 goto normal_char;
e318085a 3400
b18215fc 3401#ifdef emacs
25fe55af
RS
3402 /* There is no way to specify the before_dot and after_dot
3403 operators. rms says this is ok. --karl */
3404 case '=':
3405 BUF_PUSH (at_dot);
3406 break;
3407
3408 case 's':
3409 laststart = b;
3410 PATFETCH (c);
3411 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3412 break;
3413
3414 case 'S':
3415 laststart = b;
3416 PATFETCH (c);
3417 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3418 break;
b18215fc
RS
3419
3420 case 'c':
3421 laststart = b;
36595814 3422 PATFETCH (c);
b18215fc
RS
3423 BUF_PUSH_2 (categoryspec, c);
3424 break;
e318085a 3425
b18215fc
RS
3426 case 'C':
3427 laststart = b;
36595814 3428 PATFETCH (c);
b18215fc
RS
3429 BUF_PUSH_2 (notcategoryspec, c);
3430 break;
3431#endif /* emacs */
e318085a 3432
e318085a 3433
25fe55af 3434 case 'w':
4bb91c68
SM
3435 if (syntax & RE_NO_GNU_OPS)
3436 goto normal_char;
25fe55af 3437 laststart = b;
1fb352e0 3438 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3439 break;
e318085a 3440
e318085a 3441
25fe55af 3442 case 'W':
4bb91c68
SM
3443 if (syntax & RE_NO_GNU_OPS)
3444 goto normal_char;
25fe55af 3445 laststart = b;
1fb352e0 3446 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3447 break;
e318085a
RS
3448
3449
25fe55af 3450 case '<':
4bb91c68
SM
3451 if (syntax & RE_NO_GNU_OPS)
3452 goto normal_char;
25fe55af
RS
3453 BUF_PUSH (wordbeg);
3454 break;
e318085a 3455
25fe55af 3456 case '>':
4bb91c68
SM
3457 if (syntax & RE_NO_GNU_OPS)
3458 goto normal_char;
25fe55af
RS
3459 BUF_PUSH (wordend);
3460 break;
e318085a 3461
25fe55af 3462 case 'b':
4bb91c68
SM
3463 if (syntax & RE_NO_GNU_OPS)
3464 goto normal_char;
25fe55af
RS
3465 BUF_PUSH (wordbound);
3466 break;
e318085a 3467
25fe55af 3468 case 'B':
4bb91c68
SM
3469 if (syntax & RE_NO_GNU_OPS)
3470 goto normal_char;
25fe55af
RS
3471 BUF_PUSH (notwordbound);
3472 break;
fa9a63c5 3473
25fe55af 3474 case '`':
4bb91c68
SM
3475 if (syntax & RE_NO_GNU_OPS)
3476 goto normal_char;
25fe55af
RS
3477 BUF_PUSH (begbuf);
3478 break;
e318085a 3479
25fe55af 3480 case '\'':
4bb91c68
SM
3481 if (syntax & RE_NO_GNU_OPS)
3482 goto normal_char;
25fe55af
RS
3483 BUF_PUSH (endbuf);
3484 break;
e318085a 3485
25fe55af
RS
3486 case '1': case '2': case '3': case '4': case '5':
3487 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3488 {
3489 regnum_t reg;
e318085a 3490
0cdd06f8
SM
3491 if (syntax & RE_NO_BK_REFS)
3492 goto normal_backslash;
e318085a 3493
0cdd06f8 3494 reg = c - '0';
e318085a 3495
0cdd06f8
SM
3496 /* Can't back reference to a subexpression before its end. */
3497 if (reg > regnum || group_in_compile_stack (compile_stack, reg))
3498 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3499
0cdd06f8
SM
3500 laststart = b;
3501 BUF_PUSH_2 (duplicate, reg);
3502 }
25fe55af 3503 break;
e318085a 3504
e318085a 3505
25fe55af
RS
3506 case '+':
3507 case '?':
3508 if (syntax & RE_BK_PLUS_QM)
3509 goto handle_plus;
3510 else
3511 goto normal_backslash;
3512
3513 default:
3514 normal_backslash:
3515 /* You might think it would be useful for \ to mean
3516 not to translate; but if we don't translate it
4bb91c68 3517 it will never match anything. */
25fe55af
RS
3518 goto normal_char;
3519 }
3520 break;
fa9a63c5
RM
3521
3522
3523 default:
25fe55af 3524 /* Expects the character in `c'. */
fa9a63c5 3525 normal_char:
36595814 3526 /* If no exactn currently being built. */
25fe55af 3527 if (!pending_exact
fa9a63c5 3528
25fe55af
RS
3529 /* If last exactn not at current position. */
3530 || pending_exact + *pending_exact + 1 != b
5e69f11e 3531
25fe55af 3532 /* We have only one byte following the exactn for the count. */
2d1675e4 3533 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3534
25fe55af 3535 /* If followed by a repetition operator. */
9d99031f 3536 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3537 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3538 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3539 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3540 || ((syntax & RE_INTERVALS)
25fe55af 3541 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3542 ? p != pend && *p == '{'
3543 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3544 {
3545 /* Start building a new exactn. */
5e69f11e 3546
25fe55af 3547 laststart = b;
fa9a63c5
RM
3548
3549 BUF_PUSH_2 (exactn, 0);
3550 pending_exact = b - 1;
25fe55af 3551 }
5e69f11e 3552
2d1675e4
SM
3553 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3554 {
e0277a47
KH
3555 int len;
3556
bf216479
KH
3557 if (! multibyte)
3558 MAKE_CHAR_MULTIBYTE (c);
36595814 3559 c = TRANSLATE (c);
6fdd04b0
KH
3560 if (target_multibyte)
3561 {
3562 len = CHAR_STRING (c, b);
3563 b += len;
3564 }
e0277a47 3565 else
6fdd04b0
KH
3566 {
3567 MAKE_CHAR_UNIBYTE (c);
3568 *b++ = c;
3569 len = 1;
3570 }
2d1675e4
SM
3571 (*pending_exact) += len;
3572 }
3573
fa9a63c5 3574 break;
25fe55af 3575 } /* switch (c) */
fa9a63c5
RM
3576 } /* while p != pend */
3577
5e69f11e 3578
fa9a63c5 3579 /* Through the pattern now. */
5e69f11e 3580
505bde11 3581 FIXUP_ALT_JUMP ();
fa9a63c5 3582
5e69f11e 3583 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3584 FREE_STACK_RETURN (REG_EPAREN);
3585
3586 /* If we don't want backtracking, force success
3587 the first time we reach the end of the compiled pattern. */
3588 if (syntax & RE_NO_POSIX_BACKTRACKING)
3589 BUF_PUSH (succeed);
3590
3591 free (compile_stack.stack);
3592
3593 /* We have succeeded; set the length of the buffer. */
3594 bufp->used = b - bufp->buffer;
3595
6fdd04b0
KH
3596#ifdef emacs
3597 /* Now the buffer is adjusted for the multibyteness of a target. */
3598 bufp->multibyte = bufp->target_multibyte;
3599#endif
3600
fa9a63c5 3601#ifdef DEBUG
99633e97 3602 if (debug > 0)
fa9a63c5 3603 {
505bde11 3604 re_compile_fastmap (bufp);
fa9a63c5
RM
3605 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3606 print_compiled_pattern (bufp);
3607 }
99633e97 3608 debug--;
fa9a63c5
RM
3609#endif /* DEBUG */
3610
3611#ifndef MATCH_MAY_ALLOCATE
3612 /* Initialize the failure stack to the largest possible stack. This
3613 isn't necessary unless we're trying to avoid calling alloca in
3614 the search and match routines. */
3615 {
3616 int num_regs = bufp->re_nsub + 1;
3617
320a2a73 3618 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3619 {
a26f4ccd 3620 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3621
fa9a63c5
RM
3622 if (! fail_stack.stack)
3623 fail_stack.stack
5e69f11e 3624 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3625 * sizeof (fail_stack_elt_t));
3626 else
3627 fail_stack.stack
3628 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3629 (fail_stack.size
3630 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3631 }
3632
3633 regex_grow_registers (num_regs);
3634 }
3635#endif /* not MATCH_MAY_ALLOCATE */
3636
3637 return REG_NOERROR;
3638} /* regex_compile */
3639\f
3640/* Subroutines for `regex_compile'. */
3641
25fe55af 3642/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3643
3644static void
3645store_op1 (op, loc, arg)
3646 re_opcode_t op;
3647 unsigned char *loc;
3648 int arg;
3649{
3650 *loc = (unsigned char) op;
3651 STORE_NUMBER (loc + 1, arg);
3652}
3653
3654
3655/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3656
3657static void
3658store_op2 (op, loc, arg1, arg2)
3659 re_opcode_t op;
3660 unsigned char *loc;
3661 int arg1, arg2;
3662{
3663 *loc = (unsigned char) op;
3664 STORE_NUMBER (loc + 1, arg1);
3665 STORE_NUMBER (loc + 3, arg2);
3666}
3667
3668
3669/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3670 for OP followed by two-byte integer parameter ARG. */
3671
3672static void
3673insert_op1 (op, loc, arg, end)
3674 re_opcode_t op;
3675 unsigned char *loc;
3676 int arg;
5e69f11e 3677 unsigned char *end;
fa9a63c5
RM
3678{
3679 register unsigned char *pfrom = end;
3680 register unsigned char *pto = end + 3;
3681
3682 while (pfrom != loc)
3683 *--pto = *--pfrom;
5e69f11e 3684
fa9a63c5
RM
3685 store_op1 (op, loc, arg);
3686}
3687
3688
3689/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3690
3691static void
3692insert_op2 (op, loc, arg1, arg2, end)
3693 re_opcode_t op;
3694 unsigned char *loc;
3695 int arg1, arg2;
5e69f11e 3696 unsigned char *end;
fa9a63c5
RM
3697{
3698 register unsigned char *pfrom = end;
3699 register unsigned char *pto = end + 5;
3700
3701 while (pfrom != loc)
3702 *--pto = *--pfrom;
5e69f11e 3703
fa9a63c5
RM
3704 store_op2 (op, loc, arg1, arg2);
3705}
3706
3707
3708/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3709 after an alternative or a begin-subexpression. We assume there is at
3710 least one character before the ^. */
3711
3712static boolean
3713at_begline_loc_p (pattern, p, syntax)
01618498 3714 re_char *pattern, *p;
fa9a63c5
RM
3715 reg_syntax_t syntax;
3716{
01618498 3717 re_char *prev = p - 2;
fa9a63c5 3718 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3719
fa9a63c5
RM
3720 return
3721 /* After a subexpression? */
3722 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3723 /* After an alternative? */
d2af47df
SM
3724 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3725 /* After a shy subexpression? */
3726 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3727 && prev[-1] == '?' && prev[-2] == '('
3728 && (syntax & RE_NO_BK_PARENS
3729 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3730}
3731
3732
3733/* The dual of at_begline_loc_p. This one is for $. We assume there is
3734 at least one character after the $, i.e., `P < PEND'. */
3735
3736static boolean
3737at_endline_loc_p (p, pend, syntax)
01618498 3738 re_char *p, *pend;
99633e97 3739 reg_syntax_t syntax;
fa9a63c5 3740{
01618498 3741 re_char *next = p;
fa9a63c5 3742 boolean next_backslash = *next == '\\';
01618498 3743 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3744
fa9a63c5
RM
3745 return
3746 /* Before a subexpression? */
3747 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3748 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3749 /* Before an alternative? */
3750 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3751 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3752}
3753
3754
5e69f11e 3755/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3756 false if it's not. */
3757
3758static boolean
3759group_in_compile_stack (compile_stack, regnum)
3760 compile_stack_type compile_stack;
3761 regnum_t regnum;
3762{
3763 int this_element;
3764
5e69f11e
RM
3765 for (this_element = compile_stack.avail - 1;
3766 this_element >= 0;
fa9a63c5
RM
3767 this_element--)
3768 if (compile_stack.stack[this_element].regnum == regnum)
3769 return true;
3770
3771 return false;
3772}
fa9a63c5 3773\f
f6a3f532
SM
3774/* analyse_first.
3775 If fastmap is non-NULL, go through the pattern and fill fastmap
3776 with all the possible leading chars. If fastmap is NULL, don't
3777 bother filling it up (obviously) and only return whether the
3778 pattern could potentially match the empty string.
3779
3780 Return 1 if p..pend might match the empty string.
3781 Return 0 if p..pend matches at least one char.
01618498 3782 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3783
3784static int
3785analyse_first (p, pend, fastmap, multibyte)
01618498 3786 re_char *p, *pend;
f6a3f532
SM
3787 char *fastmap;
3788 const int multibyte;
fa9a63c5 3789{
505bde11 3790 int j, k;
1fb352e0 3791 boolean not;
fa9a63c5 3792
b18215fc 3793 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3794 flag is set true. */
b18215fc
RS
3795 boolean match_any_multibyte_characters = false;
3796
f6a3f532 3797 assert (p);
5e69f11e 3798
505bde11
SM
3799 /* The loop below works as follows:
3800 - It has a working-list kept in the PATTERN_STACK and which basically
3801 starts by only containing a pointer to the first operation.
3802 - If the opcode we're looking at is a match against some set of
3803 chars, then we add those chars to the fastmap and go on to the
3804 next work element from the worklist (done via `break').
3805 - If the opcode is a control operator on the other hand, we either
3806 ignore it (if it's meaningless at this point, such as `start_memory')
3807 or execute it (if it's a jump). If the jump has several destinations
3808 (i.e. `on_failure_jump'), then we push the other destination onto the
3809 worklist.
3810 We guarantee termination by ignoring backward jumps (more or less),
3811 so that `p' is monotonically increasing. More to the point, we
3812 never set `p' (or push) anything `<= p1'. */
3813
01618498 3814 while (p < pend)
fa9a63c5 3815 {
505bde11
SM
3816 /* `p1' is used as a marker of how far back a `on_failure_jump'
3817 can go without being ignored. It is normally equal to `p'
3818 (which prevents any backward `on_failure_jump') except right
3819 after a plain `jump', to allow patterns such as:
3820 0: jump 10
3821 3..9: <body>
3822 10: on_failure_jump 3
3823 as used for the *? operator. */
01618498 3824 re_char *p1 = p;
5e69f11e 3825
fa9a63c5
RM
3826 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3827 {
f6a3f532 3828 case succeed:
01618498 3829 return 1;
f6a3f532 3830 continue;
fa9a63c5 3831
fa9a63c5 3832 case duplicate:
505bde11
SM
3833 /* If the first character has to match a backreference, that means
3834 that the group was empty (since it already matched). Since this
3835 is the only case that interests us here, we can assume that the
3836 backreference must match the empty string. */
3837 p++;
3838 continue;
fa9a63c5
RM
3839
3840
3841 /* Following are the cases which match a character. These end
25fe55af 3842 with `break'. */
fa9a63c5
RM
3843
3844 case exactn:
e0277a47 3845 if (fastmap)
bf216479
KH
3846 /* If multibyte is nonzero, the first byte of each
3847 character is an ASCII or a leading code. Otherwise,
3848 each byte is a character. Thus, this works in both
3849 cases. */
3850 fastmap[p[1]] = 1;
fa9a63c5
RM
3851 break;
3852
3853
1fb352e0
SM
3854 case anychar:
3855 /* We could put all the chars except for \n (and maybe \0)
3856 but we don't bother since it is generally not worth it. */
f6a3f532 3857 if (!fastmap) break;
01618498 3858 return -1;
fa9a63c5
RM
3859
3860
b18215fc 3861 case charset_not:
1fb352e0 3862 if (!fastmap) break;
bf216479
KH
3863 {
3864 /* Chars beyond end of bitmap are possible matches. */
6fdd04b0
KH
3865 /* In a multibyte case, the bitmap is used only for ASCII
3866 characters. */
bf216479
KH
3867 int limit = multibyte ? 128 : (1 << BYTEWIDTH);
3868
3869 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3870 j < limit; j++)
3871 fastmap[j] = 1;
3872 }
3873
1fb352e0
SM
3874 /* Fallthrough */
3875 case charset:
3876 if (!fastmap) break;
3877 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3878 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3879 j >= 0; j--)
1fb352e0 3880 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
b18215fc
RS
3881 fastmap[j] = 1;
3882
1fb352e0 3883 if ((not && multibyte)
bf216479 3884 /* Any leading code can possibly start a character
1fb352e0
SM
3885 which doesn't match the specified set of characters. */
3886 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3887 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3888 /* If we can match a character class, we can match
6fdd04b0 3889 any multibyte characters. */
b18215fc 3890 {
b18215fc
RS
3891 if (match_any_multibyte_characters == false)
3892 {
6fdd04b0
KH
3893 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
3894 fastmap[j] = 1;
b18215fc
RS
3895 match_any_multibyte_characters = true;
3896 }
3897 }
b18215fc 3898
1fb352e0
SM
3899 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3900 && match_any_multibyte_characters == false)
3901 {
bf216479 3902 /* Set fastmap[I] to 1 where I is a leading code of each
9117d724 3903 multibyte characer in the range table. */
1fb352e0 3904 int c, count;
bf216479 3905 unsigned char lc1, lc2;
b18215fc 3906
1fb352e0 3907 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3908 bits for a character class. */
1fb352e0 3909 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3910
1fb352e0
SM
3911 /* Extract the number of ranges in range table into COUNT. */
3912 EXTRACT_NUMBER_AND_INCR (count, p);
3913 for (; count > 0; count--, p += 2 * 3) /* XXX */
3914 {
9117d724
KH
3915 /* Extract the start and end of each range. */
3916 EXTRACT_CHARACTER (c, p);
bf216479 3917 lc1 = CHAR_LEADING_CODE (c);
9117d724 3918 p += 3;
1fb352e0 3919 EXTRACT_CHARACTER (c, p);
bf216479
KH
3920 lc2 = CHAR_LEADING_CODE (c);
3921 for (j = lc1; j <= lc2; j++)
9117d724 3922 fastmap[j] = 1;
1fb352e0
SM
3923 }
3924 }
b18215fc
RS
3925 break;
3926
1fb352e0
SM
3927 case syntaxspec:
3928 case notsyntaxspec:
3929 if (!fastmap) break;
3930#ifndef emacs
3931 not = (re_opcode_t)p[-1] == notsyntaxspec;
3932 k = *p++;
3933 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3934 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3935 fastmap[j] = 1;
b18215fc 3936 break;
1fb352e0 3937#else /* emacs */
b18215fc
RS
3938 /* This match depends on text properties. These end with
3939 aborting optimizations. */
01618498 3940 return -1;
b18215fc
RS
3941
3942 case categoryspec:
b18215fc 3943 case notcategoryspec:
1fb352e0
SM
3944 if (!fastmap) break;
3945 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 3946 k = *p++;
bf216479 3947 for (j = (multibyte ? 127 : (1 << BYTEWIDTH)); j >= 0; j--)
1fb352e0 3948 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
3949 fastmap[j] = 1;
3950
1fb352e0 3951 if (multibyte)
6fdd04b0
KH
3952 {
3953 /* Any character set can possibly contain a character
3954 whose category is K (or not). */
3955 if (match_any_multibyte_characters == false)
3956 {
3957 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
3958 fastmap[j] = 1;
3959 match_any_multibyte_characters = true;
3960 }
3961 }
b18215fc
RS
3962 break;
3963
fa9a63c5 3964 /* All cases after this match the empty string. These end with
25fe55af 3965 `continue'. */
fa9a63c5 3966
fa9a63c5
RM
3967 case before_dot:
3968 case at_dot:
3969 case after_dot:
1fb352e0 3970#endif /* !emacs */
25fe55af
RS
3971 case no_op:
3972 case begline:
3973 case endline:
fa9a63c5
RM
3974 case begbuf:
3975 case endbuf:
3976 case wordbound:
3977 case notwordbound:
3978 case wordbeg:
3979 case wordend:
25fe55af 3980 continue;
fa9a63c5
RM
3981
3982
fa9a63c5 3983 case jump:
25fe55af 3984 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
3985 if (j < 0)
3986 /* Backward jumps can only go back to code that we've already
3987 visited. `re_compile' should make sure this is true. */
3988 break;
25fe55af 3989 p += j;
505bde11
SM
3990 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
3991 {
3992 case on_failure_jump:
3993 case on_failure_keep_string_jump:
505bde11 3994 case on_failure_jump_loop:
0683b6fa 3995 case on_failure_jump_nastyloop:
505bde11
SM
3996 case on_failure_jump_smart:
3997 p++;
3998 break;
3999 default:
4000 continue;
4001 };
4002 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4003 to jump back to "just after here". */
4004 /* Fallthrough */
fa9a63c5 4005
25fe55af
RS
4006 case on_failure_jump:
4007 case on_failure_keep_string_jump:
0683b6fa 4008 case on_failure_jump_nastyloop:
505bde11
SM
4009 case on_failure_jump_loop:
4010 case on_failure_jump_smart:
25fe55af 4011 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 4012 if (p + j <= p1)
ed0767d8 4013 ; /* Backward jump to be ignored. */
01618498
SM
4014 else
4015 { /* We have to look down both arms.
4016 We first go down the "straight" path so as to minimize
4017 stack usage when going through alternatives. */
4018 int r = analyse_first (p, pend, fastmap, multibyte);
4019 if (r) return r;
4020 p += j;
4021 }
25fe55af 4022 continue;
fa9a63c5
RM
4023
4024
ed0767d8
SM
4025 case jump_n:
4026 /* This code simply does not properly handle forward jump_n. */
4027 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4028 p += 4;
4029 /* jump_n can either jump or fall through. The (backward) jump
4030 case has already been handled, so we only need to look at the
4031 fallthrough case. */
4032 continue;
177c0ea7 4033
fa9a63c5 4034 case succeed_n:
ed0767d8
SM
4035 /* If N == 0, it should be an on_failure_jump_loop instead. */
4036 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4037 p += 4;
4038 /* We only care about one iteration of the loop, so we don't
4039 need to consider the case where this behaves like an
4040 on_failure_jump. */
25fe55af 4041 continue;
fa9a63c5
RM
4042
4043
4044 case set_number_at:
25fe55af
RS
4045 p += 4;
4046 continue;
fa9a63c5
RM
4047
4048
4049 case start_memory:
25fe55af 4050 case stop_memory:
505bde11 4051 p += 1;
fa9a63c5
RM
4052 continue;
4053
4054
4055 default:
25fe55af
RS
4056 abort (); /* We have listed all the cases. */
4057 } /* switch *p++ */
fa9a63c5
RM
4058
4059 /* Getting here means we have found the possible starting
25fe55af 4060 characters for one path of the pattern -- and that the empty
01618498
SM
4061 string does not match. We need not follow this path further. */
4062 return 0;
fa9a63c5
RM
4063 } /* while p */
4064
01618498
SM
4065 /* We reached the end without matching anything. */
4066 return 1;
4067
f6a3f532
SM
4068} /* analyse_first */
4069\f
4070/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4071 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4072 characters can start a string that matches the pattern. This fastmap
4073 is used by re_search to skip quickly over impossible starting points.
4074
4075 Character codes above (1 << BYTEWIDTH) are not represented in the
4076 fastmap, but the leading codes are represented. Thus, the fastmap
4077 indicates which character sets could start a match.
4078
4079 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4080 area as BUFP->fastmap.
4081
4082 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4083 the pattern buffer.
4084
4085 Returns 0 if we succeed, -2 if an internal error. */
4086
4087int
4088re_compile_fastmap (bufp)
4089 struct re_pattern_buffer *bufp;
4090{
4091 char *fastmap = bufp->fastmap;
4092 int analysis;
4093
4094 assert (fastmap && bufp->buffer);
4095
4096 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4097 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4098
4099 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4100 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4101 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4102 return 0;
4103} /* re_compile_fastmap */
4104\f
4105/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4106 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4107 this memory for recording register information. STARTS and ENDS
4108 must be allocated using the malloc library routine, and must each
4109 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4110
4111 If NUM_REGS == 0, then subsequent matches should allocate their own
4112 register data.
4113
4114 Unless this function is called, the first search or match using
4115 PATTERN_BUFFER will allocate its own register data, without
4116 freeing the old data. */
4117
4118void
4119re_set_registers (bufp, regs, num_regs, starts, ends)
4120 struct re_pattern_buffer *bufp;
4121 struct re_registers *regs;
4122 unsigned num_regs;
4123 regoff_t *starts, *ends;
4124{
4125 if (num_regs)
4126 {
4127 bufp->regs_allocated = REGS_REALLOCATE;
4128 regs->num_regs = num_regs;
4129 regs->start = starts;
4130 regs->end = ends;
4131 }
4132 else
4133 {
4134 bufp->regs_allocated = REGS_UNALLOCATED;
4135 regs->num_regs = 0;
4136 regs->start = regs->end = (regoff_t *) 0;
4137 }
4138}
c0f9ea08 4139WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4140\f
25fe55af 4141/* Searching routines. */
fa9a63c5
RM
4142
4143/* Like re_search_2, below, but only one string is specified, and
4144 doesn't let you say where to stop matching. */
4145
4146int
4147re_search (bufp, string, size, startpos, range, regs)
4148 struct re_pattern_buffer *bufp;
4149 const char *string;
4150 int size, startpos, range;
4151 struct re_registers *regs;
4152{
5e69f11e 4153 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4154 regs, size);
4155}
c0f9ea08 4156WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4157
70806df6
KH
4158/* Head address of virtual concatenation of string. */
4159#define HEAD_ADDR_VSTRING(P) \
4160 (((P) >= size1 ? string2 : string1))
4161
b18215fc
RS
4162/* End address of virtual concatenation of string. */
4163#define STOP_ADDR_VSTRING(P) \
4164 (((P) >= size1 ? string2 + size2 : string1 + size1))
4165
4166/* Address of POS in the concatenation of virtual string. */
4167#define POS_ADDR_VSTRING(POS) \
4168 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4169
4170/* Using the compiled pattern in BUFP->buffer, first tries to match the
4171 virtual concatenation of STRING1 and STRING2, starting first at index
4172 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4173
fa9a63c5 4174 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4175
fa9a63c5
RM
4176 RANGE is how far to scan while trying to match. RANGE = 0 means try
4177 only at STARTPOS; in general, the last start tried is STARTPOS +
4178 RANGE.
5e69f11e 4179
fa9a63c5
RM
4180 In REGS, return the indices of the virtual concatenation of STRING1
4181 and STRING2 that matched the entire BUFP->buffer and its contained
4182 subexpressions.
5e69f11e 4183
fa9a63c5
RM
4184 Do not consider matching one past the index STOP in the virtual
4185 concatenation of STRING1 and STRING2.
4186
4187 We return either the position in the strings at which the match was
4188 found, -1 if no match, or -2 if error (such as failure
4189 stack overflow). */
4190
4191int
66f0296e 4192re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4193 struct re_pattern_buffer *bufp;
66f0296e 4194 const char *str1, *str2;
fa9a63c5
RM
4195 int size1, size2;
4196 int startpos;
4197 int range;
4198 struct re_registers *regs;
4199 int stop;
4200{
4201 int val;
66f0296e
SM
4202 re_char *string1 = (re_char*) str1;
4203 re_char *string2 = (re_char*) str2;
fa9a63c5 4204 register char *fastmap = bufp->fastmap;
6676cb1c 4205 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4206 int total_size = size1 + size2;
4207 int endpos = startpos + range;
c0f9ea08 4208 boolean anchored_start;
6fdd04b0
KH
4209 /* Nonzero if BUFP is setup for multibyte characters. We are sure
4210 that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */
2d1675e4 4211 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4212
fa9a63c5
RM
4213 /* Check for out-of-range STARTPOS. */
4214 if (startpos < 0 || startpos > total_size)
4215 return -1;
5e69f11e 4216
fa9a63c5 4217 /* Fix up RANGE if it might eventually take us outside
34597fa9 4218 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4219 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4220 if (endpos < 0)
4221 range = 0 - startpos;
fa9a63c5
RM
4222 else if (endpos > total_size)
4223 range = total_size - startpos;
4224
4225 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4226 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4227 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4228 {
4229 if (startpos > 0)
4230 return -1;
4231 else
7b140fd7 4232 range = 0;
fa9a63c5
RM
4233 }
4234
ae4788a8
RS
4235#ifdef emacs
4236 /* In a forward search for something that starts with \=.
4237 don't keep searching past point. */
4238 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4239 {
7b140fd7
RS
4240 range = PT_BYTE - BEGV_BYTE - startpos;
4241 if (range < 0)
ae4788a8
RS
4242 return -1;
4243 }
4244#endif /* emacs */
4245
fa9a63c5
RM
4246 /* Update the fastmap now if not correct already. */
4247 if (fastmap && !bufp->fastmap_accurate)
01618498 4248 re_compile_fastmap (bufp);
5e69f11e 4249
c8499ba5 4250 /* See whether the pattern is anchored. */
c0f9ea08 4251 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4252
b18215fc 4253#ifdef emacs
cc9b4df2
KH
4254 gl_state.object = re_match_object;
4255 {
99633e97 4256 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4257
4258 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4259 }
b18215fc
RS
4260#endif
4261
fa9a63c5
RM
4262 /* Loop through the string, looking for a place to start matching. */
4263 for (;;)
5e69f11e 4264 {
c8499ba5
RS
4265 /* If the pattern is anchored,
4266 skip quickly past places we cannot match.
4267 We don't bother to treat startpos == 0 specially
4268 because that case doesn't repeat. */
4269 if (anchored_start && startpos > 0)
4270 {
c0f9ea08
SM
4271 if (! ((startpos <= size1 ? string1[startpos - 1]
4272 : string2[startpos - size1 - 1])
4273 == '\n'))
c8499ba5
RS
4274 goto advance;
4275 }
4276
fa9a63c5 4277 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4278 cannot be the start of a match. If the pattern can match the
4279 null string, however, we don't need to skip characters; we want
4280 the first null string. */
fa9a63c5
RM
4281 if (fastmap && startpos < total_size && !bufp->can_be_null)
4282 {
66f0296e 4283 register re_char *d;
01618498 4284 register re_wchar_t buf_ch;
e934739e
RS
4285
4286 d = POS_ADDR_VSTRING (startpos);
4287
25fe55af 4288 if (range > 0) /* Searching forwards. */
fa9a63c5 4289 {
fa9a63c5
RM
4290 register int lim = 0;
4291 int irange = range;
4292
25fe55af
RS
4293 if (startpos < size1 && startpos + range >= size1)
4294 lim = range - (size1 - startpos);
fa9a63c5 4295
25fe55af
RS
4296 /* Written out as an if-else to avoid testing `translate'
4297 inside the loop. */
28ae27ae
AS
4298 if (RE_TRANSLATE_P (translate))
4299 {
e934739e
RS
4300 if (multibyte)
4301 while (range > lim)
4302 {
4303 int buf_charlen;
4304
4305 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4306 buf_charlen);
e934739e 4307 buf_ch = RE_TRANSLATE (translate, buf_ch);
bf216479 4308 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
e934739e
RS
4309 break;
4310
4311 range -= buf_charlen;
4312 d += buf_charlen;
4313 }
4314 else
bf216479 4315 while (range > lim)
33c46939 4316 {
bf216479
KH
4317 buf_ch = *d;
4318 MAKE_CHAR_MULTIBYTE (buf_ch);
b884eb4d 4319 buf_ch = RE_TRANSLATE (translate, buf_ch);
6fdd04b0 4320 MAKE_CHAR_UNIBYTE (buf_ch);
6fdd04b0 4321 if (fastmap[buf_ch])
bf216479 4322 break;
33c46939
RS
4323 d++;
4324 range--;
4325 }
e934739e 4326 }
fa9a63c5 4327 else
6fdd04b0
KH
4328 {
4329 if (multibyte)
4330 while (range > lim)
4331 {
4332 int buf_charlen;
fa9a63c5 4333
6fdd04b0
KH
4334 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4335 buf_charlen);
4336 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4337 break;
4338 range -= buf_charlen;
4339 d += buf_charlen;
4340 }
e934739e 4341 else
6fdd04b0 4342 while (range > lim && !fastmap[*d])
33c46939
RS
4343 {
4344 d++;
4345 range--;
4346 }
e934739e 4347 }
fa9a63c5
RM
4348 startpos += irange - range;
4349 }
25fe55af 4350 else /* Searching backwards. */
fa9a63c5 4351 {
2d1675e4
SM
4352 int room = (startpos >= size1
4353 ? size2 + size1 - startpos
4354 : size1 - startpos);
ba5e343c
KH
4355 if (multibyte)
4356 {
6fdd04b0 4357 buf_ch = STRING_CHAR (d, room);
ba5e343c
KH
4358 buf_ch = TRANSLATE (buf_ch);
4359 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4360 goto advance;
4361 }
4362 else
4363 {
4364 if (! fastmap[TRANSLATE (*d)])
4365 goto advance;
4366 }
fa9a63c5
RM
4367 }
4368 }
4369
4370 /* If can't match the null string, and that's all we have left, fail. */
4371 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4372 && !bufp->can_be_null)
fa9a63c5
RM
4373 return -1;
4374
4375 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4376 startpos, regs, stop);
4377#ifndef REGEX_MALLOC
0b32bf0e 4378# ifdef C_ALLOCA
fa9a63c5 4379 alloca (0);
0b32bf0e 4380# endif
fa9a63c5
RM
4381#endif
4382
4383 if (val >= 0)
4384 return startpos;
5e69f11e 4385
fa9a63c5
RM
4386 if (val == -2)
4387 return -2;
4388
4389 advance:
5e69f11e 4390 if (!range)
25fe55af 4391 break;
5e69f11e 4392 else if (range > 0)
25fe55af 4393 {
b18215fc
RS
4394 /* Update STARTPOS to the next character boundary. */
4395 if (multibyte)
4396 {
66f0296e
SM
4397 re_char *p = POS_ADDR_VSTRING (startpos);
4398 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4399 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4400
4401 range -= len;
4402 if (range < 0)
4403 break;
4404 startpos += len;
4405 }
4406 else
4407 {
b560c397
RS
4408 range--;
4409 startpos++;
4410 }
e318085a 4411 }
fa9a63c5 4412 else
25fe55af
RS
4413 {
4414 range++;
4415 startpos--;
b18215fc
RS
4416
4417 /* Update STARTPOS to the previous character boundary. */
4418 if (multibyte)
4419 {
70806df6
KH
4420 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4421 re_char *p0 = p;
4422 re_char *phead = HEAD_ADDR_VSTRING (startpos);
b18215fc
RS
4423
4424 /* Find the head of multibyte form. */
70806df6
KH
4425 PREV_CHAR_BOUNDARY (p, phead);
4426 range += p0 - 1 - p;
4427 if (range > 0)
4428 break;
b18215fc 4429
70806df6 4430 startpos -= p0 - 1 - p;
b18215fc 4431 }
25fe55af 4432 }
fa9a63c5
RM
4433 }
4434 return -1;
4435} /* re_search_2 */
c0f9ea08 4436WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4437\f
4438/* Declarations and macros for re_match_2. */
4439
2d1675e4
SM
4440static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4441 register int len,
4442 RE_TRANSLATE_TYPE translate,
4443 const int multibyte));
fa9a63c5
RM
4444
4445/* This converts PTR, a pointer into one of the search strings `string1'
4446 and `string2' into an offset from the beginning of that string. */
4447#define POINTER_TO_OFFSET(ptr) \
4448 (FIRST_STRING_P (ptr) \
4449 ? ((regoff_t) ((ptr) - string1)) \
4450 : ((regoff_t) ((ptr) - string2 + size1)))
4451
fa9a63c5 4452/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4453 string2 if necessary.
4454 Check re_match_2_internal for a discussion of why end_match_2 might
4455 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4456#define PREFETCH() \
25fe55af 4457 while (d == dend) \
fa9a63c5
RM
4458 { \
4459 /* End of string2 => fail. */ \
25fe55af
RS
4460 if (dend == end_match_2) \
4461 goto fail; \
4bb91c68 4462 /* End of string1 => advance to string2. */ \
25fe55af 4463 d = string2; \
fa9a63c5
RM
4464 dend = end_match_2; \
4465 }
4466
f1ad044f
SM
4467/* Call before fetching a char with *d if you already checked other limits.
4468 This is meant for use in lookahead operations like wordend, etc..
4469 where we might need to look at parts of the string that might be
4470 outside of the LIMITs (i.e past `stop'). */
4471#define PREFETCH_NOLIMIT() \
4472 if (d == end1) \
4473 { \
4474 d = string2; \
4475 dend = end_match_2; \
4476 } \
fa9a63c5
RM
4477
4478/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 4479 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4480#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4481#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4482
4483
4484/* Test if D points to a character which is word-constituent. We have
4485 two special cases to check for: if past the end of string1, look at
4486 the first character in string2; and if before the beginning of
4487 string2, look at the last character in string1. */
4488#define WORDCHAR_P(d) \
4489 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4490 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4491 == Sword)
4492
9121ca40 4493/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4494
4495/* The comment at case wordbound is following one, but we don't use
4496 AT_WORD_BOUNDARY anymore to support multibyte form.
4497
4498 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
4499 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4500 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4501 macro and introducing temporary variables works around the bug. */
4502
9121ca40 4503#if 0
fa9a63c5
RM
4504/* Test if the character before D and the one at D differ with respect
4505 to being word-constituent. */
4506#define AT_WORD_BOUNDARY(d) \
4507 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4508 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4509#endif
fa9a63c5
RM
4510
4511/* Free everything we malloc. */
4512#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4513# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4514# define FREE_VARIABLES() \
fa9a63c5
RM
4515 do { \
4516 REGEX_FREE_STACK (fail_stack.stack); \
4517 FREE_VAR (regstart); \
4518 FREE_VAR (regend); \
fa9a63c5
RM
4519 FREE_VAR (best_regstart); \
4520 FREE_VAR (best_regend); \
fa9a63c5
RM
4521 } while (0)
4522#else
0b32bf0e 4523# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4524#endif /* not MATCH_MAY_ALLOCATE */
4525
505bde11
SM
4526\f
4527/* Optimization routines. */
4528
4e8a9132
SM
4529/* If the operation is a match against one or more chars,
4530 return a pointer to the next operation, else return NULL. */
01618498 4531static re_char *
4e8a9132 4532skip_one_char (p)
01618498 4533 re_char *p;
4e8a9132
SM
4534{
4535 switch (SWITCH_ENUM_CAST (*p++))
4536 {
4537 case anychar:
4538 break;
177c0ea7 4539
4e8a9132
SM
4540 case exactn:
4541 p += *p + 1;
4542 break;
4543
4544 case charset_not:
4545 case charset:
4546 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4547 {
4548 int mcnt;
4549 p = CHARSET_RANGE_TABLE (p - 1);
4550 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4551 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4552 }
4553 else
4554 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4555 break;
177c0ea7 4556
4e8a9132
SM
4557 case syntaxspec:
4558 case notsyntaxspec:
1fb352e0 4559#ifdef emacs
4e8a9132
SM
4560 case categoryspec:
4561 case notcategoryspec:
4562#endif /* emacs */
4563 p++;
4564 break;
4565
4566 default:
4567 p = NULL;
4568 }
4569 return p;
4570}
4571
4572
505bde11
SM
4573/* Jump over non-matching operations. */
4574static unsigned char *
4e8a9132 4575skip_noops (p, pend)
505bde11 4576 unsigned char *p, *pend;
505bde11
SM
4577{
4578 int mcnt;
4579 while (p < pend)
4580 {
4581 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4582 {
4583 case start_memory:
505bde11
SM
4584 case stop_memory:
4585 p += 2; break;
4586 case no_op:
4587 p += 1; break;
4588 case jump:
4589 p += 1;
4590 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4591 p += mcnt;
4592 break;
4593 default:
4594 return p;
4595 }
4596 }
4597 assert (p == pend);
4598 return p;
4599}
4600
4601/* Non-zero if "p1 matches something" implies "p2 fails". */
4602static int
4603mutually_exclusive_p (bufp, p1, p2)
4604 struct re_pattern_buffer *bufp;
4605 unsigned char *p1, *p2;
4606{
4e8a9132 4607 re_opcode_t op2;
2d1675e4 4608 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4609 unsigned char *pend = bufp->buffer + bufp->used;
4610
4e8a9132 4611 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4612 && p2 >= bufp->buffer && p2 <= pend);
4613
4614 /* Skip over open/close-group commands.
4615 If what follows this loop is a ...+ construct,
4616 look at what begins its body, since we will have to
4617 match at least one of that. */
4e8a9132
SM
4618 p2 = skip_noops (p2, pend);
4619 /* The same skip can be done for p1, except that this function
4620 is only used in the case where p1 is a simple match operator. */
4621 /* p1 = skip_noops (p1, pend); */
4622
4623 assert (p1 >= bufp->buffer && p1 < pend
4624 && p2 >= bufp->buffer && p2 <= pend);
4625
4626 op2 = p2 == pend ? succeed : *p2;
4627
4628 switch (SWITCH_ENUM_CAST (op2))
505bde11 4629 {
4e8a9132
SM
4630 case succeed:
4631 case endbuf:
4632 /* If we're at the end of the pattern, we can change. */
4633 if (skip_one_char (p1))
505bde11 4634 {
505bde11
SM
4635 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4636 return 1;
505bde11 4637 }
4e8a9132 4638 break;
177c0ea7 4639
4e8a9132 4640 case endline:
4e8a9132
SM
4641 case exactn:
4642 {
01618498 4643 register re_wchar_t c
4e8a9132 4644 = (re_opcode_t) *p2 == endline ? '\n'
411e4203 4645 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
505bde11 4646
4e8a9132
SM
4647 if ((re_opcode_t) *p1 == exactn)
4648 {
4649 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4650 {
4651 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4652 return 1;
4653 }
4654 }
505bde11 4655
4e8a9132
SM
4656 else if ((re_opcode_t) *p1 == charset
4657 || (re_opcode_t) *p1 == charset_not)
4658 {
4659 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4660
4e8a9132
SM
4661 /* Test if C is listed in charset (or charset_not)
4662 at `p1'. */
6fdd04b0 4663 if (! multibyte || IS_REAL_ASCII (c))
4e8a9132
SM
4664 {
4665 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4666 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4667 not = !not;
4668 }
4669 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4670 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4671
4e8a9132
SM
4672 /* `not' is equal to 1 if c would match, which means
4673 that we can't change to pop_failure_jump. */
4674 if (!not)
4675 {
4676 DEBUG_PRINT1 (" No match => fast loop.\n");
4677 return 1;
4678 }
4679 }
4680 else if ((re_opcode_t) *p1 == anychar
4681 && c == '\n')
4682 {
4683 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4684 return 1;
4685 }
4686 }
4687 break;
505bde11 4688
4e8a9132 4689 case charset:
4e8a9132
SM
4690 {
4691 if ((re_opcode_t) *p1 == exactn)
4692 /* Reuse the code above. */
4693 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4694
505bde11
SM
4695 /* It is hard to list up all the character in charset
4696 P2 if it includes multibyte character. Give up in
4697 such case. */
4698 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4699 {
4700 /* Now, we are sure that P2 has no range table.
4701 So, for the size of bitmap in P2, `p2[1]' is
4702 enough. But P1 may have range table, so the
4703 size of bitmap table of P1 is extracted by
4704 using macro `CHARSET_BITMAP_SIZE'.
4705
6fdd04b0
KH
4706 In a multibyte case, we know that all the character
4707 listed in P2 is ASCII. In a unibyte case, P1 has only a
4708 bitmap table. So, in both cases, it is enough to test
4709 only the bitmap table of P1. */
505bde11 4710
411e4203 4711 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4712 {
4713 int idx;
4714 /* We win if the charset inside the loop
4715 has no overlap with the one after the loop. */
4716 for (idx = 0;
4717 (idx < (int) p2[1]
4718 && idx < CHARSET_BITMAP_SIZE (p1));
4719 idx++)
4720 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4721 break;
4722
4723 if (idx == p2[1]
4724 || idx == CHARSET_BITMAP_SIZE (p1))
4725 {
4726 DEBUG_PRINT1 (" No match => fast loop.\n");
4727 return 1;
4728 }
4729 }
411e4203 4730 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4731 {
4732 int idx;
4733 /* We win if the charset_not inside the loop lists
4734 every character listed in the charset after. */
4735 for (idx = 0; idx < (int) p2[1]; idx++)
4736 if (! (p2[2 + idx] == 0
4737 || (idx < CHARSET_BITMAP_SIZE (p1)
4738 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4739 break;
4740
4e8a9132
SM
4741 if (idx == p2[1])
4742 {
4743 DEBUG_PRINT1 (" No match => fast loop.\n");
4744 return 1;
4745 }
4746 }
4747 }
4748 }
609b757a 4749 break;
177c0ea7 4750
411e4203
SM
4751 case charset_not:
4752 switch (SWITCH_ENUM_CAST (*p1))
4753 {
4754 case exactn:
4755 case charset:
4756 /* Reuse the code above. */
4757 return mutually_exclusive_p (bufp, p2, p1);
4758 case charset_not:
4759 /* When we have two charset_not, it's very unlikely that
4760 they don't overlap. The union of the two sets of excluded
4761 chars should cover all possible chars, which, as a matter of
4762 fact, is virtually impossible in multibyte buffers. */
36595814 4763 break;
411e4203
SM
4764 }
4765 break;
4766
4e8a9132
SM
4767 case wordend:
4768 case notsyntaxspec:
4769 return ((re_opcode_t) *p1 == syntaxspec
4770 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4771
4772 case wordbeg:
4773 case syntaxspec:
4774 return ((re_opcode_t) *p1 == notsyntaxspec
4775 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4776
4777 case wordbound:
4778 return (((re_opcode_t) *p1 == notsyntaxspec
4779 || (re_opcode_t) *p1 == syntaxspec)
4780 && p1[1] == Sword);
4781
1fb352e0 4782#ifdef emacs
4e8a9132
SM
4783 case categoryspec:
4784 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4785 case notcategoryspec:
4786 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4787#endif /* emacs */
4788
4789 default:
4790 ;
505bde11
SM
4791 }
4792
4793 /* Safe default. */
4794 return 0;
4795}
4796
fa9a63c5
RM
4797\f
4798/* Matching routines. */
4799
25fe55af 4800#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4801/* re_match is like re_match_2 except it takes only a single string. */
4802
4803int
4804re_match (bufp, string, size, pos, regs)
4805 struct re_pattern_buffer *bufp;
4806 const char *string;
4807 int size, pos;
4808 struct re_registers *regs;
4809{
4bb91c68 4810 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4811 pos, regs, size);
0b32bf0e 4812# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4813 alloca (0);
0b32bf0e 4814# endif
fa9a63c5
RM
4815 return result;
4816}
c0f9ea08 4817WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4818#endif /* not emacs */
4819
b18215fc
RS
4820#ifdef emacs
4821/* In Emacs, this is the string or buffer in which we
25fe55af 4822 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4823Lisp_Object re_match_object;
4824#endif
fa9a63c5
RM
4825
4826/* re_match_2 matches the compiled pattern in BUFP against the
4827 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4828 and SIZE2, respectively). We start matching at POS, and stop
4829 matching at STOP.
5e69f11e 4830
fa9a63c5 4831 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4832 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4833 documentation for exactly how many groups we fill.
4834
4835 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4836 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4837 matched substring. */
4838
4839int
4840re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4841 struct re_pattern_buffer *bufp;
4842 const char *string1, *string2;
4843 int size1, size2;
4844 int pos;
4845 struct re_registers *regs;
4846 int stop;
4847{
b18215fc 4848 int result;
25fe55af 4849
b18215fc 4850#ifdef emacs
cc9b4df2
KH
4851 int charpos;
4852 gl_state.object = re_match_object;
99633e97 4853 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4854 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4855#endif
4856
4bb91c68
SM
4857 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4858 (re_char*) string2, size2,
cc9b4df2 4859 pos, regs, stop);
0b32bf0e 4860#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4861 alloca (0);
a60198e5 4862#endif
fa9a63c5
RM
4863 return result;
4864}
c0f9ea08 4865WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5 4866
bf216479 4867#ifdef emacs
6fdd04b0
KH
4868#define TRANSLATE_VIA_MULTIBYTE(c) \
4869 do { \
4870 if (multibyte) \
4871 (c) = TRANSLATE (c); \
4872 else \
4873 { \
4874 MAKE_CHAR_MULTIBYTE (c); \
4875 (c) = TRANSLATE (c); \
4876 MAKE_CHAR_UNIBYTE (c); \
4877 } \
4878 } while (0)
4879
bf216479 4880#else
6fdd04b0 4881#define TRANSLATE_VIA_MULTIBYTE(c) ((c) = TRANSLATE (c))
bf216479
KH
4882#endif
4883
4884
fa9a63c5 4885/* This is a separate function so that we can force an alloca cleanup
25fe55af 4886 afterwards. */
fa9a63c5
RM
4887static int
4888re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4889 struct re_pattern_buffer *bufp;
66f0296e 4890 re_char *string1, *string2;
fa9a63c5
RM
4891 int size1, size2;
4892 int pos;
4893 struct re_registers *regs;
4894 int stop;
4895{
4896 /* General temporaries. */
4897 int mcnt;
01618498 4898 size_t reg;
66f0296e 4899 boolean not;
fa9a63c5
RM
4900
4901 /* Just past the end of the corresponding string. */
66f0296e 4902 re_char *end1, *end2;
fa9a63c5
RM
4903
4904 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4905 each to consider matching. */
66f0296e 4906 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4907
4908 /* Where we are in the data, and the end of the current string. */
66f0296e 4909 re_char *d, *dend;
5e69f11e 4910
99633e97
SM
4911 /* Used sometimes to remember where we were before starting matching
4912 an operator so that we can go back in case of failure. This "atomic"
4913 behavior of matching opcodes is indispensable to the correctness
4914 of the on_failure_keep_string_jump optimization. */
4915 re_char *dfail;
4916
fa9a63c5 4917 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4918 re_char *p = bufp->buffer;
4919 re_char *pend = p + bufp->used;
fa9a63c5 4920
25fe55af 4921 /* We use this to map every character in the string. */
6676cb1c 4922 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4923
6fdd04b0
KH
4924 /* Nonzero if BUFP is setup for multibyte characters. We are sure
4925 that it is the same as RE_TARGET_MULTIBYTE_P (bufp). */
2d1675e4 4926 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4927
fa9a63c5
RM
4928 /* Failure point stack. Each place that can handle a failure further
4929 down the line pushes a failure point on this stack. It consists of
505bde11 4930 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4931 the subexpressions we're currently inside, plus the number of such
4932 registers, and, finally, two char *'s. The first char * is where
4933 to resume scanning the pattern; the second one is where to resume
505bde11 4934 scanning the strings. */
25fe55af 4935#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4936 fail_stack_type fail_stack;
4937#endif
4938#ifdef DEBUG
fa9a63c5
RM
4939 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4940#endif
4941
0b32bf0e 4942#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4943 /* This holds the pointer to the failure stack, when
4944 it is allocated relocatably. */
4945 fail_stack_elt_t *failure_stack_ptr;
99633e97 4946#endif
fa9a63c5
RM
4947
4948 /* We fill all the registers internally, independent of what we
25fe55af 4949 return, for use in backreferences. The number here includes
fa9a63c5 4950 an element for register zero. */
4bb91c68 4951 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4952
fa9a63c5
RM
4953 /* Information on the contents of registers. These are pointers into
4954 the input strings; they record just what was matched (on this
4955 attempt) by a subexpression part of the pattern, that is, the
4956 regnum-th regstart pointer points to where in the pattern we began
4957 matching and the regnum-th regend points to right after where we
4958 stopped matching the regnum-th subexpression. (The zeroth register
4959 keeps track of what the whole pattern matches.) */
4960#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4961 re_char **regstart, **regend;
fa9a63c5
RM
4962#endif
4963
fa9a63c5 4964 /* The following record the register info as found in the above
5e69f11e 4965 variables when we find a match better than any we've seen before.
fa9a63c5
RM
4966 This happens as we backtrack through the failure points, which in
4967 turn happens only if we have not yet matched the entire string. */
4968 unsigned best_regs_set = false;
4969#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4970 re_char **best_regstart, **best_regend;
fa9a63c5 4971#endif
5e69f11e 4972
fa9a63c5
RM
4973 /* Logically, this is `best_regend[0]'. But we don't want to have to
4974 allocate space for that if we're not allocating space for anything
25fe55af 4975 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
4976 any of the other register vectors, and it seems rather a kludge to
4977 treat `best_regend' differently than the rest. So we keep track of
4978 the end of the best match so far in a separate variable. We
4979 initialize this to NULL so that when we backtrack the first time
4980 and need to test it, it's not garbage. */
66f0296e 4981 re_char *match_end = NULL;
fa9a63c5 4982
fa9a63c5
RM
4983#ifdef DEBUG
4984 /* Counts the total number of registers pushed. */
5e69f11e 4985 unsigned num_regs_pushed = 0;
fa9a63c5
RM
4986#endif
4987
4988 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 4989
fa9a63c5 4990 INIT_FAIL_STACK ();
5e69f11e 4991
fa9a63c5
RM
4992#ifdef MATCH_MAY_ALLOCATE
4993 /* Do not bother to initialize all the register variables if there are
4994 no groups in the pattern, as it takes a fair amount of time. If
4995 there are groups, we include space for register 0 (the whole
4996 pattern), even though we never use it, since it simplifies the
4997 array indexing. We should fix this. */
4998 if (bufp->re_nsub)
4999 {
66f0296e
SM
5000 regstart = REGEX_TALLOC (num_regs, re_char *);
5001 regend = REGEX_TALLOC (num_regs, re_char *);
5002 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5003 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 5004
505bde11 5005 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
5006 {
5007 FREE_VARIABLES ();
5008 return -2;
5009 }
fa9a63c5
RM
5010 }
5011 else
5012 {
5013 /* We must initialize all our variables to NULL, so that
25fe55af 5014 `FREE_VARIABLES' doesn't try to free them. */
505bde11 5015 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
5016 }
5017#endif /* MATCH_MAY_ALLOCATE */
5018
5019 /* The starting position is bogus. */
5020 if (pos < 0 || pos > size1 + size2)
5021 {
5022 FREE_VARIABLES ();
5023 return -1;
5024 }
5e69f11e 5025
fa9a63c5
RM
5026 /* Initialize subexpression text positions to -1 to mark ones that no
5027 start_memory/stop_memory has been seen for. Also initialize the
5028 register information struct. */
01618498
SM
5029 for (reg = 1; reg < num_regs; reg++)
5030 regstart[reg] = regend[reg] = NULL;
99633e97 5031
fa9a63c5 5032 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 5033 `string1' is null. */
fa9a63c5
RM
5034 if (size2 == 0 && string1 != NULL)
5035 {
5036 string2 = string1;
5037 size2 = size1;
5038 string1 = 0;
5039 size1 = 0;
5040 }
5041 end1 = string1 + size1;
5042 end2 = string2 + size2;
5043
5e69f11e 5044 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
5045 `dend' is the end of the input string that `d' points within. `d'
5046 is advanced into the following input string whenever necessary, but
5047 this happens before fetching; therefore, at the beginning of the
5048 loop, `d' can be pointing at the end of a string, but it cannot
5049 equal `string2'. */
419d1c74 5050 if (pos >= size1)
fa9a63c5 5051 {
419d1c74
SM
5052 /* Only match within string2. */
5053 d = string2 + pos - size1;
5054 dend = end_match_2 = string2 + stop - size1;
5055 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
5056 }
5057 else
5058 {
f1ad044f 5059 if (stop < size1)
419d1c74
SM
5060 {
5061 /* Only match within string1. */
5062 end_match_1 = string1 + stop;
5063 /* BEWARE!
5064 When we reach end_match_1, PREFETCH normally switches to string2.
5065 But in the present case, this means that just doing a PREFETCH
5066 makes us jump from `stop' to `gap' within the string.
5067 What we really want here is for the search to stop as
5068 soon as we hit end_match_1. That's why we set end_match_2
5069 to end_match_1 (since PREFETCH fails as soon as we hit
5070 end_match_2). */
5071 end_match_2 = end_match_1;
5072 }
5073 else
f1ad044f
SM
5074 { /* It's important to use this code when stop == size so that
5075 moving `d' from end1 to string2 will not prevent the d == dend
5076 check from catching the end of string. */
419d1c74
SM
5077 end_match_1 = end1;
5078 end_match_2 = string2 + stop - size1;
5079 }
5080 d = string1 + pos;
5081 dend = end_match_1;
fa9a63c5
RM
5082 }
5083
5084 DEBUG_PRINT1 ("The compiled pattern is: ");
5085 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5086 DEBUG_PRINT1 ("The string to match is: `");
5087 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5088 DEBUG_PRINT1 ("'\n");
5e69f11e 5089
25fe55af 5090 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
5091 function if the match is complete, or it drops through if the match
5092 fails at this starting point in the input data. */
5093 for (;;)
5094 {
505bde11 5095 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
5096
5097 if (p == pend)
5098 { /* End of pattern means we might have succeeded. */
25fe55af 5099 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 5100
fa9a63c5 5101 /* If we haven't matched the entire string, and we want the
25fe55af
RS
5102 longest match, try backtracking. */
5103 if (d != end_match_2)
fa9a63c5
RM
5104 {
5105 /* 1 if this match ends in the same string (string1 or string2)
5106 as the best previous match. */
5e69f11e 5107 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5108 == FIRST_STRING_P (d));
fa9a63c5
RM
5109 /* 1 if this match is the best seen so far. */
5110 boolean best_match_p;
5111
5112 /* AIX compiler got confused when this was combined
25fe55af 5113 with the previous declaration. */
fa9a63c5
RM
5114 if (same_str_p)
5115 best_match_p = d > match_end;
5116 else
99633e97 5117 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5118
25fe55af
RS
5119 DEBUG_PRINT1 ("backtracking.\n");
5120
5121 if (!FAIL_STACK_EMPTY ())
5122 { /* More failure points to try. */
5123
5124 /* If exceeds best match so far, save it. */
5125 if (!best_regs_set || best_match_p)
5126 {
5127 best_regs_set = true;
5128 match_end = d;
5129
5130 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5131
01618498 5132 for (reg = 1; reg < num_regs; reg++)
25fe55af 5133 {
01618498
SM
5134 best_regstart[reg] = regstart[reg];
5135 best_regend[reg] = regend[reg];
25fe55af
RS
5136 }
5137 }
5138 goto fail;
5139 }
5140
5141 /* If no failure points, don't restore garbage. And if
5142 last match is real best match, don't restore second
5143 best one. */
5144 else if (best_regs_set && !best_match_p)
5145 {
5146 restore_best_regs:
5147 /* Restore best match. It may happen that `dend ==
5148 end_match_1' while the restored d is in string2.
5149 For example, the pattern `x.*y.*z' against the
5150 strings `x-' and `y-z-', if the two strings are
5151 not consecutive in memory. */
5152 DEBUG_PRINT1 ("Restoring best registers.\n");
5153
5154 d = match_end;
5155 dend = ((d >= string1 && d <= end1)
5156 ? end_match_1 : end_match_2);
fa9a63c5 5157
01618498 5158 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5159 {
01618498
SM
5160 regstart[reg] = best_regstart[reg];
5161 regend[reg] = best_regend[reg];
fa9a63c5 5162 }
25fe55af
RS
5163 }
5164 } /* d != end_match_2 */
fa9a63c5
RM
5165
5166 succeed_label:
25fe55af 5167 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5168
25fe55af
RS
5169 /* If caller wants register contents data back, do it. */
5170 if (regs && !bufp->no_sub)
fa9a63c5 5171 {
25fe55af
RS
5172 /* Have the register data arrays been allocated? */
5173 if (bufp->regs_allocated == REGS_UNALLOCATED)
5174 { /* No. So allocate them with malloc. We need one
5175 extra element beyond `num_regs' for the `-1' marker
5176 GNU code uses. */
5177 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5178 regs->start = TALLOC (regs->num_regs, regoff_t);
5179 regs->end = TALLOC (regs->num_regs, regoff_t);
5180 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5181 {
5182 FREE_VARIABLES ();
5183 return -2;
5184 }
25fe55af
RS
5185 bufp->regs_allocated = REGS_REALLOCATE;
5186 }
5187 else if (bufp->regs_allocated == REGS_REALLOCATE)
5188 { /* Yes. If we need more elements than were already
5189 allocated, reallocate them. If we need fewer, just
5190 leave it alone. */
5191 if (regs->num_regs < num_regs + 1)
5192 {
5193 regs->num_regs = num_regs + 1;
5194 RETALLOC (regs->start, regs->num_regs, regoff_t);
5195 RETALLOC (regs->end, regs->num_regs, regoff_t);
5196 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5197 {
5198 FREE_VARIABLES ();
5199 return -2;
5200 }
25fe55af
RS
5201 }
5202 }
5203 else
fa9a63c5
RM
5204 {
5205 /* These braces fend off a "empty body in an else-statement"
25fe55af 5206 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5207 assert (bufp->regs_allocated == REGS_FIXED);
5208 }
5209
25fe55af
RS
5210 /* Convert the pointer data in `regstart' and `regend' to
5211 indices. Register zero has to be set differently,
5212 since we haven't kept track of any info for it. */
5213 if (regs->num_regs > 0)
5214 {
5215 regs->start[0] = pos;
99633e97 5216 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5217 }
5e69f11e 5218
25fe55af
RS
5219 /* Go through the first `min (num_regs, regs->num_regs)'
5220 registers, since that is all we initialized. */
01618498 5221 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5222 {
01618498
SM
5223 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5224 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5225 else
5226 {
01618498
SM
5227 regs->start[reg]
5228 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5229 regs->end[reg]
5230 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5231 }
fa9a63c5 5232 }
5e69f11e 5233
25fe55af
RS
5234 /* If the regs structure we return has more elements than
5235 were in the pattern, set the extra elements to -1. If
5236 we (re)allocated the registers, this is the case,
5237 because we always allocate enough to have at least one
5238 -1 at the end. */
01618498
SM
5239 for (reg = num_regs; reg < regs->num_regs; reg++)
5240 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5241 } /* regs && !bufp->no_sub */
5242
25fe55af
RS
5243 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5244 nfailure_points_pushed, nfailure_points_popped,
5245 nfailure_points_pushed - nfailure_points_popped);
5246 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5247
99633e97 5248 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5249
25fe55af 5250 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5251
25fe55af
RS
5252 FREE_VARIABLES ();
5253 return mcnt;
5254 }
fa9a63c5 5255
25fe55af 5256 /* Otherwise match next pattern command. */
fa9a63c5
RM
5257 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5258 {
25fe55af
RS
5259 /* Ignore these. Used to ignore the n of succeed_n's which
5260 currently have n == 0. */
5261 case no_op:
5262 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5263 break;
fa9a63c5
RM
5264
5265 case succeed:
25fe55af 5266 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5267 goto succeed_label;
5268
25fe55af
RS
5269 /* Match the next n pattern characters exactly. The following
5270 byte in the pattern defines n, and the n bytes after that
5271 are the characters to match. */
fa9a63c5
RM
5272 case exactn:
5273 mcnt = *p++;
25fe55af 5274 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5275
99633e97
SM
5276 /* Remember the start point to rollback upon failure. */
5277 dfail = d;
5278
6fdd04b0 5279#ifndef emacs
25fe55af
RS
5280 /* This is written out as an if-else so we don't waste time
5281 testing `translate' inside the loop. */
28703c16 5282 if (RE_TRANSLATE_P (translate))
6fdd04b0
KH
5283 do
5284 {
5285 PREFETCH ();
5286 if (RE_TRANSLATE (translate, *d) != *p++)
e934739e 5287 {
6fdd04b0
KH
5288 d = dfail;
5289 goto fail;
e934739e 5290 }
6fdd04b0
KH
5291 d++;
5292 }
5293 while (--mcnt);
fa9a63c5 5294 else
6fdd04b0
KH
5295 do
5296 {
5297 PREFETCH ();
5298 if (*d++ != *p++)
bf216479 5299 {
6fdd04b0
KH
5300 d = dfail;
5301 goto fail;
bf216479 5302 }
6fdd04b0
KH
5303 }
5304 while (--mcnt);
5305#else /* emacs */
5306 /* The cost of testing `translate' is comparatively small. */
5307 if (multibyte)
5308 do
5309 {
5310 int pat_charlen, buf_charlen;
5311 unsigned int pat_ch, buf_ch;
e934739e 5312
6fdd04b0
KH
5313 PREFETCH ();
5314 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5315 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e 5316
6fdd04b0 5317 if (TRANSLATE (buf_ch) != pat_ch)
e934739e 5318 {
6fdd04b0
KH
5319 d = dfail;
5320 goto fail;
e934739e 5321 }
bf216479 5322
6fdd04b0
KH
5323 p += pat_charlen;
5324 d += buf_charlen;
5325 mcnt -= pat_charlen;
5326 }
5327 while (mcnt > 0);
fa9a63c5 5328 else
6fdd04b0
KH
5329 do
5330 {
5331 unsigned int buf_ch;
bf216479 5332
6fdd04b0
KH
5333 PREFETCH ();
5334 buf_ch = *d++;
5335 TRANSLATE_VIA_MULTIBYTE (buf_ch);
5336 if (buf_ch != *p++)
5337 {
5338 d = dfail;
5339 goto fail;
bf216479 5340 }
6fdd04b0
KH
5341 }
5342 while (--mcnt);
5343#endif
25fe55af 5344 break;
fa9a63c5
RM
5345
5346
25fe55af 5347 /* Match any character except possibly a newline or a null. */
fa9a63c5 5348 case anychar:
e934739e
RS
5349 {
5350 int buf_charlen;
01618498 5351 re_wchar_t buf_ch;
fa9a63c5 5352
e934739e 5353 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5354
e934739e 5355 PREFETCH ();
2d1675e4 5356 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
5357 buf_ch = TRANSLATE (buf_ch);
5358
5359 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5360 && buf_ch == '\n')
5361 || ((bufp->syntax & RE_DOT_NOT_NULL)
5362 && buf_ch == '\000'))
5363 goto fail;
5364
e934739e
RS
5365 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5366 d += buf_charlen;
5367 }
fa9a63c5
RM
5368 break;
5369
5370
5371 case charset:
5372 case charset_not:
5373 {
b18215fc 5374 register unsigned int c;
fa9a63c5 5375 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5376 int len;
5377
5378 /* Start of actual range_table, or end of bitmap if there is no
5379 range table. */
01618498 5380 re_char *range_table;
b18215fc 5381
96cc36cc 5382 /* Nonzero if there is a range table. */
b18215fc
RS
5383 int range_table_exists;
5384
96cc36cc
RS
5385 /* Number of ranges of range table. This is not included
5386 in the initial byte-length of the command. */
5387 int count = 0;
fa9a63c5 5388
25fe55af 5389 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5390
b18215fc 5391 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5392
b18215fc 5393 if (range_table_exists)
96cc36cc
RS
5394 {
5395 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5396 EXTRACT_NUMBER_AND_INCR (count, range_table);
5397 }
b18215fc 5398
2d1675e4
SM
5399 PREFETCH ();
5400 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
6fdd04b0 5401 TRANSLATE_VIA_MULTIBYTE (c); /* The character to match. */
b18215fc 5402
bf216479 5403 if (! multibyte || IS_REAL_ASCII (c))
b18215fc 5404 { /* Lookup bitmap. */
b18215fc
RS
5405 /* Cast to `unsigned' instead of `unsigned char' in
5406 case the bit list is a full 32 bytes long. */
5407 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5408 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5409 not = !not;
b18215fc 5410 }
96cc36cc 5411#ifdef emacs
b18215fc 5412 else if (range_table_exists)
96cc36cc
RS
5413 {
5414 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5415
14473664
SM
5416 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5417 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5418 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5419 | (class_bits & BIT_SPACE && ISSPACE (c))
5420 | (class_bits & BIT_UPPER && ISUPPER (c))
5421 | (class_bits & BIT_WORD && ISWORD (c)))
5422 not = !not;
5423 else
5424 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5425 }
5426#endif /* emacs */
fa9a63c5 5427
96cc36cc
RS
5428 if (range_table_exists)
5429 p = CHARSET_RANGE_TABLE_END (range_table, count);
5430 else
5431 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5432
5433 if (!not) goto fail;
5e69f11e 5434
b18215fc 5435 d += len;
fa9a63c5
RM
5436 break;
5437 }
5438
5439
25fe55af 5440 /* The beginning of a group is represented by start_memory.
505bde11 5441 The argument is the register number. The text
25fe55af
RS
5442 matched within the group is recorded (in the internal
5443 registers data structure) under the register number. */
5444 case start_memory:
505bde11
SM
5445 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5446
5447 /* In case we need to undo this operation (via backtracking). */
5448 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5449
25fe55af 5450 regstart[*p] = d;
4bb91c68 5451 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5452 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5453
25fe55af 5454 /* Move past the register number and inner group count. */
505bde11 5455 p += 1;
25fe55af 5456 break;
fa9a63c5
RM
5457
5458
25fe55af 5459 /* The stop_memory opcode represents the end of a group. Its
505bde11 5460 argument is the same as start_memory's: the register number. */
fa9a63c5 5461 case stop_memory:
505bde11
SM
5462 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5463
5464 assert (!REG_UNSET (regstart[*p]));
5465 /* Strictly speaking, there should be code such as:
177c0ea7 5466
0b32bf0e 5467 assert (REG_UNSET (regend[*p]));
505bde11
SM
5468 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5469
5470 But the only info to be pushed is regend[*p] and it is known to
5471 be UNSET, so there really isn't anything to push.
5472 Not pushing anything, on the other hand deprives us from the
5473 guarantee that regend[*p] is UNSET since undoing this operation
5474 will not reset its value properly. This is not important since
5475 the value will only be read on the next start_memory or at
5476 the very end and both events can only happen if this stop_memory
5477 is *not* undone. */
fa9a63c5 5478
25fe55af 5479 regend[*p] = d;
fa9a63c5
RM
5480 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5481
25fe55af 5482 /* Move past the register number and the inner group count. */
505bde11 5483 p += 1;
25fe55af 5484 break;
fa9a63c5
RM
5485
5486
5487 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5488 followed by the numeric value of <digit> as the register number. */
5489 case duplicate:
fa9a63c5 5490 {
66f0296e 5491 register re_char *d2, *dend2;
25fe55af 5492 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5493 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5494
25fe55af
RS
5495 /* Can't back reference a group which we've never matched. */
5496 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5497 goto fail;
5e69f11e 5498
25fe55af
RS
5499 /* Where in input to try to start matching. */
5500 d2 = regstart[regno];
5e69f11e 5501
99633e97
SM
5502 /* Remember the start point to rollback upon failure. */
5503 dfail = d;
5504
25fe55af
RS
5505 /* Where to stop matching; if both the place to start and
5506 the place to stop matching are in the same string, then
5507 set to the place to stop, otherwise, for now have to use
5508 the end of the first string. */
fa9a63c5 5509
25fe55af 5510 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5511 == FIRST_STRING_P (regend[regno]))
5512 ? regend[regno] : end_match_1);
5513 for (;;)
5514 {
5515 /* If necessary, advance to next segment in register
25fe55af 5516 contents. */
fa9a63c5
RM
5517 while (d2 == dend2)
5518 {
5519 if (dend2 == end_match_2) break;
5520 if (dend2 == regend[regno]) break;
5521
25fe55af
RS
5522 /* End of string1 => advance to string2. */
5523 d2 = string2;
5524 dend2 = regend[regno];
fa9a63c5
RM
5525 }
5526 /* At end of register contents => success */
5527 if (d2 == dend2) break;
5528
5529 /* If necessary, advance to next segment in data. */
5530 PREFETCH ();
5531
5532 /* How many characters left in this segment to match. */
5533 mcnt = dend - d;
5e69f11e 5534
fa9a63c5 5535 /* Want how many consecutive characters we can match in
25fe55af
RS
5536 one shot, so, if necessary, adjust the count. */
5537 if (mcnt > dend2 - d2)
fa9a63c5 5538 mcnt = dend2 - d2;
5e69f11e 5539
fa9a63c5 5540 /* Compare that many; failure if mismatch, else move
25fe55af 5541 past them. */
28703c16 5542 if (RE_TRANSLATE_P (translate)
2d1675e4 5543 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5544 : memcmp (d, d2, mcnt))
99633e97
SM
5545 {
5546 d = dfail;
5547 goto fail;
5548 }
fa9a63c5 5549 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5550 }
5551 }
5552 break;
5553
5554
25fe55af 5555 /* begline matches the empty string at the beginning of the string
c0f9ea08 5556 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5557 case begline:
25fe55af 5558 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5559
25fe55af
RS
5560 if (AT_STRINGS_BEG (d))
5561 {
5562 if (!bufp->not_bol) break;
5563 }
419d1c74 5564 else
25fe55af 5565 {
bf216479 5566 unsigned c;
419d1c74 5567 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5568 if (c == '\n')
419d1c74 5569 break;
25fe55af
RS
5570 }
5571 /* In all other cases, we fail. */
5572 goto fail;
fa9a63c5
RM
5573
5574
25fe55af 5575 /* endline is the dual of begline. */
fa9a63c5 5576 case endline:
25fe55af 5577 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5578
25fe55af
RS
5579 if (AT_STRINGS_END (d))
5580 {
5581 if (!bufp->not_eol) break;
5582 }
f1ad044f 5583 else
25fe55af 5584 {
f1ad044f 5585 PREFETCH_NOLIMIT ();
c0f9ea08 5586 if (*d == '\n')
f1ad044f 5587 break;
25fe55af
RS
5588 }
5589 goto fail;
fa9a63c5
RM
5590
5591
5592 /* Match at the very beginning of the data. */
25fe55af
RS
5593 case begbuf:
5594 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5595 if (AT_STRINGS_BEG (d))
5596 break;
5597 goto fail;
fa9a63c5
RM
5598
5599
5600 /* Match at the very end of the data. */
25fe55af
RS
5601 case endbuf:
5602 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5603 if (AT_STRINGS_END (d))
5604 break;
25fe55af 5605 goto fail;
5e69f11e 5606
5e69f11e 5607
25fe55af
RS
5608 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5609 pushes NULL as the value for the string on the stack. Then
505bde11 5610 `POP_FAILURE_POINT' will keep the current value for the
25fe55af
RS
5611 string, instead of restoring it. To see why, consider
5612 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5613 then the . fails against the \n. But the next thing we want
5614 to do is match the \n against the \n; if we restored the
5615 string value, we would be back at the foo.
5616
5617 Because this is used only in specific cases, we don't need to
5618 check all the things that `on_failure_jump' does, to make
5619 sure the right things get saved on the stack. Hence we don't
5620 share its code. The only reason to push anything on the
5621 stack at all is that otherwise we would have to change
5622 `anychar's code to do something besides goto fail in this
5623 case; that seems worse than this. */
5624 case on_failure_keep_string_jump:
505bde11
SM
5625 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5626 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5627 mcnt, p + mcnt);
fa9a63c5 5628
505bde11
SM
5629 PUSH_FAILURE_POINT (p - 3, NULL);
5630 break;
5631
0683b6fa
SM
5632 /* A nasty loop is introduced by the non-greedy *? and +?.
5633 With such loops, the stack only ever contains one failure point
5634 at a time, so that a plain on_failure_jump_loop kind of
5635 cycle detection cannot work. Worse yet, such a detection
5636 can not only fail to detect a cycle, but it can also wrongly
5637 detect a cycle (between different instantiations of the same
6df42991 5638 loop).
0683b6fa
SM
5639 So the method used for those nasty loops is a little different:
5640 We use a special cycle-detection-stack-frame which is pushed
5641 when the on_failure_jump_nastyloop failure-point is *popped*.
5642 This special frame thus marks the beginning of one iteration
5643 through the loop and we can hence easily check right here
5644 whether something matched between the beginning and the end of
5645 the loop. */
5646 case on_failure_jump_nastyloop:
5647 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5648 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5649 mcnt, p + mcnt);
5650
5651 assert ((re_opcode_t)p[-4] == no_op);
6df42991
SM
5652 {
5653 int cycle = 0;
5654 CHECK_INFINITE_LOOP (p - 4, d);
5655 if (!cycle)
5656 /* If there's a cycle, just continue without pushing
5657 this failure point. The failure point is the "try again"
5658 option, which shouldn't be tried.
5659 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5660 PUSH_FAILURE_POINT (p - 3, d);
5661 }
0683b6fa
SM
5662 break;
5663
4e8a9132
SM
5664 /* Simple loop detecting on_failure_jump: just check on the
5665 failure stack if the same spot was already hit earlier. */
505bde11
SM
5666 case on_failure_jump_loop:
5667 on_failure:
5668 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5669 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5670 mcnt, p + mcnt);
6df42991
SM
5671 {
5672 int cycle = 0;
5673 CHECK_INFINITE_LOOP (p - 3, d);
5674 if (cycle)
5675 /* If there's a cycle, get out of the loop, as if the matching
5676 had failed. We used to just `goto fail' here, but that was
5677 aborting the search a bit too early: we want to keep the
5678 empty-loop-match and keep matching after the loop.
5679 We want (x?)*y\1z to match both xxyz and xxyxz. */
5680 p += mcnt;
5681 else
5682 PUSH_FAILURE_POINT (p - 3, d);
5683 }
25fe55af 5684 break;
fa9a63c5
RM
5685
5686
5687 /* Uses of on_failure_jump:
5e69f11e 5688
25fe55af
RS
5689 Each alternative starts with an on_failure_jump that points
5690 to the beginning of the next alternative. Each alternative
5691 except the last ends with a jump that in effect jumps past
5692 the rest of the alternatives. (They really jump to the
5693 ending jump of the following alternative, because tensioning
5694 these jumps is a hassle.)
fa9a63c5 5695
25fe55af
RS
5696 Repeats start with an on_failure_jump that points past both
5697 the repetition text and either the following jump or
5698 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5699 case on_failure_jump:
5b370c2b 5700 IMMEDIATE_QUIT_CHECK;
25fe55af 5701 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5702 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5703 mcnt, p + mcnt);
25fe55af 5704
505bde11 5705 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5706 break;
5707
4e8a9132 5708 /* This operation is used for greedy *.
505bde11
SM
5709 Compare the beginning of the repeat with what in the
5710 pattern follows its end. If we can establish that there
5711 is nothing that they would both match, i.e., that we
5712 would have to backtrack because of (as in, e.g., `a*a')
5713 then we can use a non-backtracking loop based on
4e8a9132 5714 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5715 case on_failure_jump_smart:
5b370c2b 5716 IMMEDIATE_QUIT_CHECK;
25fe55af 5717 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5718 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5719 mcnt, p + mcnt);
25fe55af 5720 {
01618498 5721 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5722 /* Here, we discard `const', making re_match non-reentrant. */
5723 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5724 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5725
505bde11
SM
5726 p -= 3; /* Reset so that we will re-execute the
5727 instruction once it's been changed. */
fa9a63c5 5728
4e8a9132
SM
5729 EXTRACT_NUMBER (mcnt, p2 - 2);
5730
5731 /* Ensure this is a indeed the trivial kind of loop
5732 we are expecting. */
5733 assert (skip_one_char (p1) == p2 - 3);
5734 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5735 DEBUG_STATEMENT (debug += 2);
505bde11 5736 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5737 {
505bde11 5738 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5739 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5740 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5741 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5742 }
505bde11 5743 else
fa9a63c5 5744 {
505bde11
SM
5745 /* Default to a safe `on_failure_jump' loop. */
5746 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5747 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5748 }
99633e97 5749 DEBUG_STATEMENT (debug -= 2);
25fe55af 5750 }
505bde11 5751 break;
25fe55af
RS
5752
5753 /* Unconditionally jump (without popping any failure points). */
5754 case jump:
fa9a63c5 5755 unconditional_jump:
5b370c2b 5756 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5757 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5758 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5759 p += mcnt; /* Do the jump. */
505bde11 5760 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5761 break;
5762
5763
25fe55af
RS
5764 /* Have to succeed matching what follows at least n times.
5765 After that, handle like `on_failure_jump'. */
5766 case succeed_n:
01618498 5767 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5768 EXTRACT_NUMBER (mcnt, p + 2);
5769 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5770
dc1e502d
SM
5771 /* Originally, mcnt is how many times we HAVE to succeed. */
5772 if (mcnt != 0)
25fe55af 5773 {
6dcf2d0e
SM
5774 /* Here, we discard `const', making re_match non-reentrant. */
5775 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5776 mcnt--;
01618498
SM
5777 p += 4;
5778 PUSH_NUMBER (p2, mcnt);
25fe55af 5779 }
dc1e502d
SM
5780 else
5781 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5782 goto on_failure;
25fe55af
RS
5783 break;
5784
5785 case jump_n:
01618498 5786 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5787 EXTRACT_NUMBER (mcnt, p + 2);
5788 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5789
5790 /* Originally, this is how many times we CAN jump. */
dc1e502d 5791 if (mcnt != 0)
25fe55af 5792 {
6dcf2d0e
SM
5793 /* Here, we discard `const', making re_match non-reentrant. */
5794 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5795 mcnt--;
01618498 5796 PUSH_NUMBER (p2, mcnt);
dc1e502d 5797 goto unconditional_jump;
25fe55af
RS
5798 }
5799 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5800 else
5801 p += 4;
25fe55af 5802 break;
5e69f11e 5803
fa9a63c5
RM
5804 case set_number_at:
5805 {
01618498 5806 unsigned char *p2; /* Location of the counter. */
25fe55af 5807 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5808
25fe55af 5809 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5810 /* Here, we discard `const', making re_match non-reentrant. */
5811 p2 = (unsigned char*) p + mcnt;
01618498 5812 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5813 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5814 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5815 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5816 break;
5817 }
9121ca40
KH
5818
5819 case wordbound:
66f0296e
SM
5820 case notwordbound:
5821 not = (re_opcode_t) *(p - 1) == notwordbound;
5822 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5823
99633e97 5824 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5825
b18215fc 5826 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5827 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5828 not = !not;
b18215fc
RS
5829 else
5830 {
5831 /* C1 is the character before D, S1 is the syntax of C1, C2
5832 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5833 re_wchar_t c1, c2;
5834 int s1, s2;
bf216479 5835 int dummy;
b18215fc 5836#ifdef emacs
2d1675e4
SM
5837 int offset = PTR_TO_OFFSET (d - 1);
5838 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5839 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5840#endif
66f0296e 5841 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5842 s1 = SYNTAX (c1);
5843#ifdef emacs
5d967c7a 5844 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5845#endif
f1ad044f 5846 PREFETCH_NOLIMIT ();
6fdd04b0 5847 GET_CHAR_AFTER (c2, d, dummy);
b18215fc
RS
5848 s2 = SYNTAX (c2);
5849
5850 if (/* Case 2: Only one of S1 and S2 is Sword. */
5851 ((s1 == Sword) != (s2 == Sword))
5852 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5853 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5854 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5855 not = !not;
5856 }
5857 if (not)
9121ca40 5858 break;
b18215fc 5859 else
9121ca40 5860 goto fail;
fa9a63c5
RM
5861
5862 case wordbeg:
25fe55af 5863 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5864
b18215fc
RS
5865 /* We FAIL in one of the following cases: */
5866
25fe55af 5867 /* Case 1: D is at the end of string. */
b18215fc 5868 if (AT_STRINGS_END (d))
99633e97 5869 goto fail;
b18215fc
RS
5870 else
5871 {
5872 /* C1 is the character before D, S1 is the syntax of C1, C2
5873 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5874 re_wchar_t c1, c2;
5875 int s1, s2;
bf216479 5876 int dummy;
fa9a63c5 5877#ifdef emacs
2d1675e4
SM
5878 int offset = PTR_TO_OFFSET (d);
5879 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5880 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5881#endif
99633e97 5882 PREFETCH ();
6fdd04b0 5883 GET_CHAR_AFTER (c2, d, dummy);
b18215fc 5884 s2 = SYNTAX (c2);
177c0ea7 5885
b18215fc
RS
5886 /* Case 2: S2 is not Sword. */
5887 if (s2 != Sword)
5888 goto fail;
5889
5890 /* Case 3: D is not at the beginning of string ... */
5891 if (!AT_STRINGS_BEG (d))
5892 {
5893 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5894#ifdef emacs
5d967c7a 5895 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5896#endif
b18215fc
RS
5897 s1 = SYNTAX (c1);
5898
5899 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5900 returns 0. */
b18215fc
RS
5901 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5902 goto fail;
5903 }
5904 }
e318085a
RS
5905 break;
5906
b18215fc 5907 case wordend:
25fe55af 5908 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5909
5910 /* We FAIL in one of the following cases: */
5911
5912 /* Case 1: D is at the beginning of string. */
5913 if (AT_STRINGS_BEG (d))
e318085a 5914 goto fail;
b18215fc
RS
5915 else
5916 {
5917 /* C1 is the character before D, S1 is the syntax of C1, C2
5918 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5919 re_wchar_t c1, c2;
5920 int s1, s2;
bf216479 5921 int dummy;
5d967c7a 5922#ifdef emacs
2d1675e4
SM
5923 int offset = PTR_TO_OFFSET (d) - 1;
5924 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5925 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5926#endif
99633e97 5927 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5928 s1 = SYNTAX (c1);
5929
5930 /* Case 2: S1 is not Sword. */
5931 if (s1 != Sword)
5932 goto fail;
5933
5934 /* Case 3: D is not at the end of string ... */
5935 if (!AT_STRINGS_END (d))
5936 {
f1ad044f 5937 PREFETCH_NOLIMIT ();
6fdd04b0 5938 GET_CHAR_AFTER (c2, d, dummy);
5d967c7a
RS
5939#ifdef emacs
5940 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5941#endif
b18215fc
RS
5942 s2 = SYNTAX (c2);
5943
5944 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5945 returns 0. */
b18215fc 5946 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5947 goto fail;
b18215fc
RS
5948 }
5949 }
e318085a
RS
5950 break;
5951
fa9a63c5 5952 case syntaxspec:
1fb352e0
SM
5953 case notsyntaxspec:
5954 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 5955 mcnt = *p++;
1fb352e0 5956 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 5957 PREFETCH ();
b18215fc
RS
5958#ifdef emacs
5959 {
2d1675e4
SM
5960 int offset = PTR_TO_OFFSET (d);
5961 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
5962 UPDATE_SYNTAX_TABLE (pos1);
5963 }
25fe55af 5964#endif
b18215fc 5965 {
01618498
SM
5966 int len;
5967 re_wchar_t c;
b18215fc 5968
6fdd04b0 5969 GET_CHAR_AFTER (c, d, len);
990b2375 5970 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 5971 goto fail;
b18215fc
RS
5972 d += len;
5973 }
fa9a63c5
RM
5974 break;
5975
b18215fc 5976#ifdef emacs
1fb352e0
SM
5977 case before_dot:
5978 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5979 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 5980 goto fail;
b18215fc
RS
5981 break;
5982
1fb352e0
SM
5983 case at_dot:
5984 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5985 if (PTR_BYTE_POS (d) != PT_BYTE)
5986 goto fail;
5987 break;
b18215fc 5988
1fb352e0
SM
5989 case after_dot:
5990 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5991 if (PTR_BYTE_POS (d) <= PT_BYTE)
5992 goto fail;
e318085a 5993 break;
fa9a63c5 5994
1fb352e0 5995 case categoryspec:
b18215fc 5996 case notcategoryspec:
1fb352e0 5997 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 5998 mcnt = *p++;
1fb352e0 5999 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
6000 PREFETCH ();
6001 {
01618498
SM
6002 int len;
6003 re_wchar_t c;
6004
6fdd04b0 6005 GET_CHAR_AFTER (c, d, len);
1fb352e0 6006 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
6007 goto fail;
6008 d += len;
6009 }
fa9a63c5 6010 break;
5e69f11e 6011
1fb352e0 6012#endif /* emacs */
5e69f11e 6013
0b32bf0e
SM
6014 default:
6015 abort ();
fa9a63c5 6016 }
b18215fc 6017 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
6018
6019
6020 /* We goto here if a matching operation fails. */
6021 fail:
5b370c2b 6022 IMMEDIATE_QUIT_CHECK;
fa9a63c5 6023 if (!FAIL_STACK_EMPTY ())
505bde11 6024 {
01618498 6025 re_char *str, *pat;
505bde11 6026 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
6027 DEBUG_PRINT1 ("\nFAIL:\n");
6028 POP_FAILURE_POINT (str, pat);
505bde11
SM
6029 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6030 {
6031 case on_failure_keep_string_jump:
6032 assert (str == NULL);
6033 goto continue_failure_jump;
6034
0683b6fa
SM
6035 case on_failure_jump_nastyloop:
6036 assert ((re_opcode_t)pat[-2] == no_op);
6037 PUSH_FAILURE_POINT (pat - 2, str);
6038 /* Fallthrough */
6039
505bde11
SM
6040 case on_failure_jump_loop:
6041 case on_failure_jump:
6042 case succeed_n:
6043 d = str;
6044 continue_failure_jump:
6045 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6046 p = pat + mcnt;
6047 break;
b18215fc 6048
0683b6fa
SM
6049 case no_op:
6050 /* A special frame used for nastyloops. */
6051 goto fail;
6052
505bde11
SM
6053 default:
6054 abort();
6055 }
fa9a63c5 6056
505bde11 6057 assert (p >= bufp->buffer && p <= pend);
b18215fc 6058
0b32bf0e 6059 if (d >= string1 && d <= end1)
fa9a63c5 6060 dend = end_match_1;
0b32bf0e 6061 }
fa9a63c5 6062 else
0b32bf0e 6063 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
6064 } /* for (;;) */
6065
6066 if (best_regs_set)
6067 goto restore_best_regs;
6068
6069 FREE_VARIABLES ();
6070
b18215fc 6071 return -1; /* Failure to match. */
fa9a63c5
RM
6072} /* re_match_2 */
6073\f
6074/* Subroutine definitions for re_match_2. */
6075
fa9a63c5
RM
6076/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6077 bytes; nonzero otherwise. */
5e69f11e 6078
fa9a63c5 6079static int
2d1675e4
SM
6080bcmp_translate (s1, s2, len, translate, multibyte)
6081 re_char *s1, *s2;
fa9a63c5 6082 register int len;
6676cb1c 6083 RE_TRANSLATE_TYPE translate;
2d1675e4 6084 const int multibyte;
fa9a63c5 6085{
2d1675e4
SM
6086 register re_char *p1 = s1, *p2 = s2;
6087 re_char *p1_end = s1 + len;
6088 re_char *p2_end = s2 + len;
e934739e 6089
4bb91c68
SM
6090 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6091 different lengths, but relying on a single `len' would break this. -sm */
6092 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 6093 {
e934739e 6094 int p1_charlen, p2_charlen;
01618498 6095 re_wchar_t p1_ch, p2_ch;
e934739e 6096
6fdd04b0
KH
6097 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6098 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
e934739e
RS
6099
6100 if (RE_TRANSLATE (translate, p1_ch)
6101 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6102 return 1;
e934739e
RS
6103
6104 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6105 }
e934739e
RS
6106
6107 if (p1 != p1_end || p2 != p2_end)
6108 return 1;
6109
fa9a63c5
RM
6110 return 0;
6111}
6112\f
6113/* Entry points for GNU code. */
6114
6115/* re_compile_pattern is the GNU regular expression compiler: it
6116 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6117 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6118
fa9a63c5
RM
6119 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6120 are set in BUFP on entry.
5e69f11e 6121
b18215fc 6122 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6123
6124const char *
6125re_compile_pattern (pattern, length, bufp)
6126 const char *pattern;
0b32bf0e 6127 size_t length;
fa9a63c5
RM
6128 struct re_pattern_buffer *bufp;
6129{
6130 reg_errcode_t ret;
5e69f11e 6131
fa9a63c5
RM
6132 /* GNU code is written to assume at least RE_NREGS registers will be set
6133 (and at least one extra will be -1). */
6134 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6135
fa9a63c5
RM
6136 /* And GNU code determines whether or not to get register information
6137 by passing null for the REGS argument to re_match, etc., not by
6138 setting no_sub. */
6139 bufp->no_sub = 0;
5e69f11e 6140
4bb91c68 6141 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6142
6143 if (!ret)
6144 return NULL;
6145 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6146}
c0f9ea08 6147WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6148\f
b18215fc
RS
6149/* Entry points compatible with 4.2 BSD regex library. We don't define
6150 them unless specifically requested. */
fa9a63c5 6151
0b32bf0e 6152#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6153
6154/* BSD has one and only one pattern buffer. */
6155static struct re_pattern_buffer re_comp_buf;
6156
6157char *
0b32bf0e 6158# ifdef _LIBC
48afdd44
RM
6159/* Make these definitions weak in libc, so POSIX programs can redefine
6160 these names if they don't use our functions, and still use
6161 regcomp/regexec below without link errors. */
6162weak_function
0b32bf0e 6163# endif
fa9a63c5
RM
6164re_comp (s)
6165 const char *s;
6166{
6167 reg_errcode_t ret;
5e69f11e 6168
fa9a63c5
RM
6169 if (!s)
6170 {
6171 if (!re_comp_buf.buffer)
0b32bf0e 6172 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6173 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6174 return 0;
6175 }
6176
6177 if (!re_comp_buf.buffer)
6178 {
6179 re_comp_buf.buffer = (unsigned char *) malloc (200);
6180 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6181 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6182 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6183 re_comp_buf.allocated = 200;
6184
6185 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6186 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6187 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6188 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6189 }
6190
6191 /* Since `re_exec' always passes NULL for the `regs' argument, we
6192 don't need to initialize the pattern buffer fields which affect it. */
6193
fa9a63c5 6194 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6195
fa9a63c5
RM
6196 if (!ret)
6197 return NULL;
6198
6199 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6200 return (char *) gettext (re_error_msgid[(int) ret]);
6201}
6202
6203
6204int
0b32bf0e 6205# ifdef _LIBC
48afdd44 6206weak_function
0b32bf0e 6207# endif
fa9a63c5
RM
6208re_exec (s)
6209 const char *s;
6210{
6211 const int len = strlen (s);
6212 return
6213 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6214}
6215#endif /* _REGEX_RE_COMP */
6216\f
6217/* POSIX.2 functions. Don't define these for Emacs. */
6218
6219#ifndef emacs
6220
6221/* regcomp takes a regular expression as a string and compiles it.
6222
b18215fc 6223 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6224 since POSIX says we shouldn't. Thus, we set
6225
6226 `buffer' to the compiled pattern;
6227 `used' to the length of the compiled pattern;
6228 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6229 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6230 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6231 `fastmap' to an allocated space for the fastmap;
6232 `fastmap_accurate' to zero;
fa9a63c5
RM
6233 `re_nsub' to the number of subexpressions in PATTERN.
6234
6235 PATTERN is the address of the pattern string.
6236
6237 CFLAGS is a series of bits which affect compilation.
6238
6239 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6240 use POSIX basic syntax.
6241
6242 If REG_NEWLINE is set, then . and [^...] don't match newline.
6243 Also, regexec will try a match beginning after every newline.
6244
6245 If REG_ICASE is set, then we considers upper- and lowercase
6246 versions of letters to be equivalent when matching.
6247
6248 If REG_NOSUB is set, then when PREG is passed to regexec, that
6249 routine will report only success or failure, and nothing about the
6250 registers.
6251
b18215fc 6252 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6253 the return codes and their meanings.) */
6254
6255int
6256regcomp (preg, pattern, cflags)
ada30c0e
SM
6257 regex_t *__restrict preg;
6258 const char *__restrict pattern;
fa9a63c5
RM
6259 int cflags;
6260{
6261 reg_errcode_t ret;
4bb91c68 6262 reg_syntax_t syntax
fa9a63c5
RM
6263 = (cflags & REG_EXTENDED) ?
6264 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6265
6266 /* regex_compile will allocate the space for the compiled pattern. */
6267 preg->buffer = 0;
6268 preg->allocated = 0;
6269 preg->used = 0;
5e69f11e 6270
c0f9ea08
SM
6271 /* Try to allocate space for the fastmap. */
6272 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6273
fa9a63c5
RM
6274 if (cflags & REG_ICASE)
6275 {
6276 unsigned i;
5e69f11e 6277
6676cb1c
RS
6278 preg->translate
6279 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6280 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6281 if (preg->translate == NULL)
0b32bf0e 6282 return (int) REG_ESPACE;
fa9a63c5
RM
6283
6284 /* Map uppercase characters to corresponding lowercase ones. */
6285 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6286 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6287 }
6288 else
6289 preg->translate = NULL;
6290
6291 /* If REG_NEWLINE is set, newlines are treated differently. */
6292 if (cflags & REG_NEWLINE)
6293 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6294 syntax &= ~RE_DOT_NEWLINE;
6295 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6296 }
6297 else
c0f9ea08 6298 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6299
6300 preg->no_sub = !!(cflags & REG_NOSUB);
6301
5e69f11e 6302 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6303 can use strlen here in compiling the pattern. */
4bb91c68 6304 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6305
fa9a63c5
RM
6306 /* POSIX doesn't distinguish between an unmatched open-group and an
6307 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6308 if (ret == REG_ERPAREN)
6309 ret = REG_EPAREN;
6310
6311 if (ret == REG_NOERROR && preg->fastmap)
6312 { /* Compute the fastmap now, since regexec cannot modify the pattern
6313 buffer. */
6314 re_compile_fastmap (preg);
6315 if (preg->can_be_null)
6316 { /* The fastmap can't be used anyway. */
6317 free (preg->fastmap);
6318 preg->fastmap = NULL;
6319 }
6320 }
fa9a63c5
RM
6321 return (int) ret;
6322}
c0f9ea08 6323WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6324
6325
6326/* regexec searches for a given pattern, specified by PREG, in the
6327 string STRING.
5e69f11e 6328
fa9a63c5 6329 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6330 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6331 least NMATCH elements, and we set them to the offsets of the
6332 corresponding matched substrings.
5e69f11e 6333
fa9a63c5
RM
6334 EFLAGS specifies `execution flags' which affect matching: if
6335 REG_NOTBOL is set, then ^ does not match at the beginning of the
6336 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6337
fa9a63c5
RM
6338 We return 0 if we find a match and REG_NOMATCH if not. */
6339
6340int
6341regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6342 const regex_t *__restrict preg;
6343 const char *__restrict string;
5e69f11e 6344 size_t nmatch;
9f2dbe01 6345 regmatch_t pmatch[__restrict_arr];
fa9a63c5
RM
6346 int eflags;
6347{
6348 int ret;
6349 struct re_registers regs;
6350 regex_t private_preg;
6351 int len = strlen (string);
c0f9ea08 6352 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6353
6354 private_preg = *preg;
5e69f11e 6355
fa9a63c5
RM
6356 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6357 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6358
fa9a63c5
RM
6359 /* The user has told us exactly how many registers to return
6360 information about, via `nmatch'. We have to pass that on to the
b18215fc 6361 matching routines. */
fa9a63c5 6362 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6363
fa9a63c5
RM
6364 if (want_reg_info)
6365 {
6366 regs.num_regs = nmatch;
4bb91c68
SM
6367 regs.start = TALLOC (nmatch * 2, regoff_t);
6368 if (regs.start == NULL)
0b32bf0e 6369 return (int) REG_NOMATCH;
4bb91c68 6370 regs.end = regs.start + nmatch;
fa9a63c5
RM
6371 }
6372
c0f9ea08
SM
6373 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6374 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6375 was a little bit longer but still only matching the real part.
6376 This works because the `endline' will check for a '\n' and will find a
6377 '\0', correctly deciding that this is not the end of a line.
6378 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6379 a convenient '\0' there. For all we know, the string could be preceded
6380 by '\n' which would throw things off. */
6381
fa9a63c5
RM
6382 /* Perform the searching operation. */
6383 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6384 /* start: */ 0, /* range: */ len,
6385 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6386
fa9a63c5
RM
6387 /* Copy the register information to the POSIX structure. */
6388 if (want_reg_info)
6389 {
6390 if (ret >= 0)
0b32bf0e
SM
6391 {
6392 unsigned r;
fa9a63c5 6393
0b32bf0e
SM
6394 for (r = 0; r < nmatch; r++)
6395 {
6396 pmatch[r].rm_so = regs.start[r];
6397 pmatch[r].rm_eo = regs.end[r];
6398 }
6399 }
fa9a63c5 6400
b18215fc 6401 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6402 free (regs.start);
fa9a63c5
RM
6403 }
6404
6405 /* We want zero return to mean success, unlike `re_search'. */
6406 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6407}
c0f9ea08 6408WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6409
6410
6411/* Returns a message corresponding to an error code, ERRCODE, returned
6412 from either regcomp or regexec. We don't use PREG here. */
6413
6414size_t
6415regerror (errcode, preg, errbuf, errbuf_size)
6416 int errcode;
6417 const regex_t *preg;
6418 char *errbuf;
6419 size_t errbuf_size;
6420{
6421 const char *msg;
6422 size_t msg_size;
6423
6424 if (errcode < 0
6425 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6426 /* Only error codes returned by the rest of the code should be passed
b18215fc 6427 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6428 code generates an invalid error code, then the program has a bug.
6429 Dump core so we can fix it. */
6430 abort ();
6431
6432 msg = gettext (re_error_msgid[errcode]);
6433
6434 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6435
fa9a63c5
RM
6436 if (errbuf_size != 0)
6437 {
6438 if (msg_size > errbuf_size)
0b32bf0e
SM
6439 {
6440 strncpy (errbuf, msg, errbuf_size - 1);
6441 errbuf[errbuf_size - 1] = 0;
6442 }
fa9a63c5 6443 else
0b32bf0e 6444 strcpy (errbuf, msg);
fa9a63c5
RM
6445 }
6446
6447 return msg_size;
6448}
c0f9ea08 6449WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6450
6451
6452/* Free dynamically allocated space used by PREG. */
6453
6454void
6455regfree (preg)
6456 regex_t *preg;
6457{
6458 if (preg->buffer != NULL)
6459 free (preg->buffer);
6460 preg->buffer = NULL;
5e69f11e 6461
fa9a63c5
RM
6462 preg->allocated = 0;
6463 preg->used = 0;
6464
6465 if (preg->fastmap != NULL)
6466 free (preg->fastmap);
6467 preg->fastmap = NULL;
6468 preg->fastmap_accurate = 0;
6469
6470 if (preg->translate != NULL)
6471 free (preg->translate);
6472 preg->translate = NULL;
6473}
c0f9ea08 6474WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6475
6476#endif /* not emacs */