2002-09-10 Deepak Goel <deego@glue.umd.edu>
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a 1/* Extended regular expression matching and search library, version
0b32bf0e 2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
bc78d348
KB
3 internationalization features.)
4
4bb91c68 5 Copyright (C) 1993,94,95,96,97,98,99,2000 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
ba4a8e51 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25fe55af 20 USA. */
fa9a63c5 21
36595814
SM
22/* BUGS:
23 - (x?)*y\1z should match both xxxxyxz and xxxyz.
24 TODO:
505bde11 25 - structure the opcode space into opcode+flag.
dc1e502d 26 - merge with glibc's regex.[ch].
01618498 27 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
6dcf2d0e
SM
28 need to modify the compiled regexp so that re_match can be reentrant.
29 - get rid of on_failure_jump_smart by doing the optimization in re_comp
30 rather than at run-time, so that re_match can be reentrant.
01618498 31*/
505bde11 32
fa9a63c5 33/* AIX requires this to be the first thing in the file. */
0b32bf0e 34#if defined _AIX && !defined REGEX_MALLOC
fa9a63c5
RM
35 #pragma alloca
36#endif
37
68d96f02 38#undef _GNU_SOURCE
fa9a63c5
RM
39#define _GNU_SOURCE
40
41#ifdef HAVE_CONFIG_H
0b32bf0e 42# include <config.h>
fa9a63c5
RM
43#endif
44
4bb91c68
SM
45#if defined STDC_HEADERS && !defined emacs
46# include <stddef.h>
47#else
48/* We need this for `regex.h', and perhaps for the Emacs include files. */
49# include <sys/types.h>
50#endif
fa9a63c5 51
14473664
SM
52/* Whether to use ISO C Amendment 1 wide char functions.
53 Those should not be used for Emacs since it uses its own. */
5e5388f6
GM
54#if defined _LIBC
55#define WIDE_CHAR_SUPPORT 1
56#else
14473664 57#define WIDE_CHAR_SUPPORT \
5e5388f6
GM
58 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
59#endif
14473664
SM
60
61/* For platform which support the ISO C amendement 1 functionality we
62 support user defined character classes. */
a0ad02f7 63#if WIDE_CHAR_SUPPORT
14473664
SM
64/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
65# include <wchar.h>
66# include <wctype.h>
67#endif
68
c0f9ea08
SM
69#ifdef _LIBC
70/* We have to keep the namespace clean. */
71# define regfree(preg) __regfree (preg)
72# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
73# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
74# define regerror(errcode, preg, errbuf, errbuf_size) \
75 __regerror(errcode, preg, errbuf, errbuf_size)
76# define re_set_registers(bu, re, nu, st, en) \
77 __re_set_registers (bu, re, nu, st, en)
78# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
79 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
80# define re_match(bufp, string, size, pos, regs) \
81 __re_match (bufp, string, size, pos, regs)
82# define re_search(bufp, string, size, startpos, range, regs) \
83 __re_search (bufp, string, size, startpos, range, regs)
84# define re_compile_pattern(pattern, length, bufp) \
85 __re_compile_pattern (pattern, length, bufp)
86# define re_set_syntax(syntax) __re_set_syntax (syntax)
87# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
88 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
89# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
90
14473664
SM
91/* Make sure we call libc's function even if the user overrides them. */
92# define btowc __btowc
93# define iswctype __iswctype
94# define wctype __wctype
95
c0f9ea08
SM
96# define WEAK_ALIAS(a,b) weak_alias (a, b)
97
98/* We are also using some library internals. */
99# include <locale/localeinfo.h>
100# include <locale/elem-hash.h>
101# include <langinfo.h>
102#else
103# define WEAK_ALIAS(a,b)
104#endif
105
4bb91c68 106/* This is for other GNU distributions with internationalized messages. */
0b32bf0e 107#if HAVE_LIBINTL_H || defined _LIBC
fa9a63c5
RM
108# include <libintl.h>
109#else
110# define gettext(msgid) (msgid)
111#endif
112
5e69f11e
RM
113#ifndef gettext_noop
114/* This define is so xgettext can find the internationalizable
115 strings. */
0b32bf0e 116# define gettext_noop(String) String
5e69f11e
RM
117#endif
118
fa9a63c5
RM
119/* The `emacs' switch turns on certain matching commands
120 that make sense only in Emacs. */
121#ifdef emacs
122
0b32bf0e
SM
123# include "lisp.h"
124# include "buffer.h"
b18215fc
RS
125
126/* Make syntax table lookup grant data in gl_state. */
0b32bf0e 127# define SYNTAX_ENTRY_VIA_PROPERTY
b18215fc 128
0b32bf0e
SM
129# include "syntax.h"
130# include "charset.h"
131# include "category.h"
fa9a63c5 132
7689ef0b
EZ
133# ifdef malloc
134# undef malloc
135# endif
0b32bf0e 136# define malloc xmalloc
7689ef0b
EZ
137# ifdef realloc
138# undef realloc
139# endif
0b32bf0e 140# define realloc xrealloc
7689ef0b
EZ
141# ifdef free
142# undef free
143# endif
0b32bf0e 144# define free xfree
9abbd165 145
0b32bf0e
SM
146/* Converts the pointer to the char to BEG-based offset from the start. */
147# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
148# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
149
150# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
151# define RE_STRING_CHAR(p, s) \
4e8a9132 152 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
0b32bf0e 153# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
2d1675e4
SM
154 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
155
156/* Set C a (possibly multibyte) character before P. P points into a
157 string which is the virtual concatenation of STR1 (which ends at
158 END1) or STR2 (which ends at END2). */
0b32bf0e 159# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
2d1675e4
SM
160 do { \
161 if (multibyte) \
162 { \
0b32bf0e
SM
163 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
164 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
165 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
166 c = STRING_CHAR (dtemp, (p) - dtemp); \
2d1675e4
SM
167 } \
168 else \
169 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
170 } while (0)
171
4e8a9132 172
fa9a63c5
RM
173#else /* not emacs */
174
175/* If we are not linking with Emacs proper,
176 we can't use the relocating allocator
177 even if config.h says that we can. */
0b32bf0e 178# undef REL_ALLOC
fa9a63c5 179
0b32bf0e
SM
180# if defined STDC_HEADERS || defined _LIBC
181# include <stdlib.h>
182# else
fa9a63c5
RM
183char *malloc ();
184char *realloc ();
0b32bf0e 185# endif
fa9a63c5 186
9e4ecb26 187/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
4bb91c68 188 If nothing else has been done, use the method below. */
0b32bf0e
SM
189# ifdef INHIBIT_STRING_HEADER
190# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
191# if !defined bzero && !defined bcopy
192# undef INHIBIT_STRING_HEADER
193# endif
194# endif
195# endif
9e4ecb26 196
4bb91c68 197/* This is the normal way of making sure we have memcpy, memcmp and bzero.
9e4ecb26
KH
198 This is used in most programs--a few other programs avoid this
199 by defining INHIBIT_STRING_HEADER. */
0b32bf0e
SM
200# ifndef INHIBIT_STRING_HEADER
201# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
202# include <string.h>
0b32bf0e 203# ifndef bzero
4bb91c68
SM
204# ifndef _LIBC
205# define bzero(s, n) (memset (s, '\0', n), (s))
206# else
207# define bzero(s, n) __bzero (s, n)
208# endif
0b32bf0e
SM
209# endif
210# else
211# include <strings.h>
4bb91c68
SM
212# ifndef memcmp
213# define memcmp(s1, s2, n) bcmp (s1, s2, n)
214# endif
215# ifndef memcpy
216# define memcpy(d, s, n) (bcopy (s, d, n), (d))
217# endif
0b32bf0e
SM
218# endif
219# endif
fa9a63c5
RM
220
221/* Define the syntax stuff for \<, \>, etc. */
222
990b2375
SM
223/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
224enum syntaxcode { Swhitespace = 0, Sword = 1 };
fa9a63c5 225
0b32bf0e
SM
226# ifdef SWITCH_ENUM_BUG
227# define SWITCH_ENUM_CAST(x) ((int)(x))
228# else
229# define SWITCH_ENUM_CAST(x) (x)
230# endif
fa9a63c5 231
e934739e 232/* Dummy macros for non-Emacs environments. */
0b32bf0e
SM
233# define BASE_LEADING_CODE_P(c) (0)
234# define CHAR_CHARSET(c) 0
235# define CHARSET_LEADING_CODE_BASE(c) 0
236# define MAX_MULTIBYTE_LENGTH 1
237# define RE_MULTIBYTE_P(x) 0
238# define WORD_BOUNDARY_P(c1, c2) (0)
239# define CHAR_HEAD_P(p) (1)
240# define SINGLE_BYTE_CHAR_P(c) (1)
241# define SAME_CHARSET_P(c1, c2) (1)
242# define MULTIBYTE_FORM_LENGTH(p, s) (1)
243# define STRING_CHAR(p, s) (*(p))
244# define RE_STRING_CHAR STRING_CHAR
245# define CHAR_STRING(c, s) (*(s) = (c), 1)
246# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
247# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
248# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
b18215fc 249 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
0b32bf0e 250# define MAKE_CHAR(charset, c1, c2) (c1)
fa9a63c5 251#endif /* not emacs */
4e8a9132
SM
252
253#ifndef RE_TRANSLATE
0b32bf0e
SM
254# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
255# define RE_TRANSLATE_P(TBL) (TBL)
4e8a9132 256#endif
fa9a63c5
RM
257\f
258/* Get the interface, including the syntax bits. */
259#include "regex.h"
260
f71b19b6
DL
261/* isalpha etc. are used for the character classes. */
262#include <ctype.h>
fa9a63c5 263
f71b19b6 264#ifdef emacs
fa9a63c5 265
f71b19b6 266/* 1 if C is an ASCII character. */
0b32bf0e 267# define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 268
f71b19b6 269/* 1 if C is a unibyte character. */
0b32bf0e 270# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 271
f71b19b6 272/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 273
f71b19b6 274/* In Emacs, these are only used for single-byte characters. */
0b32bf0e
SM
275# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
276# define ISCNTRL(c) ((c) < ' ')
277# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
f71b19b6
DL
278 || ((c) >= 'a' && (c) <= 'f') \
279 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
280
281/* This is only used for single-byte characters. */
0b32bf0e 282# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
96cc36cc
RS
283
284/* The rest must handle multibyte characters. */
285
0b32bf0e 286# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 287 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
288 : 1)
289
14473664 290# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 291 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
292 : 1)
293
0b32bf0e 294# define ISALNUM(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
295 ? (((c) >= 'a' && (c) <= 'z') \
296 || ((c) >= 'A' && (c) <= 'Z') \
297 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
298 : SYNTAX (c) == Sword)
299
0b32bf0e 300# define ISALPHA(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
301 ? (((c) >= 'a' && (c) <= 'z') \
302 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
303 : SYNTAX (c) == Sword)
304
0b32bf0e 305# define ISLOWER(c) (LOWERCASEP (c))
96cc36cc 306
0b32bf0e 307# define ISPUNCT(c) (IS_REAL_ASCII (c) \
f71b19b6
DL
308 ? ((c) > ' ' && (c) < 0177 \
309 && !(((c) >= 'a' && (c) <= 'z') \
4bb91c68
SM
310 || ((c) >= 'A' && (c) <= 'Z') \
311 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
312 : SYNTAX (c) != Sword)
313
0b32bf0e 314# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
96cc36cc 315
0b32bf0e 316# define ISUPPER(c) (UPPERCASEP (c))
96cc36cc 317
0b32bf0e 318# define ISWORD(c) (SYNTAX (c) == Sword)
96cc36cc
RS
319
320#else /* not emacs */
321
f71b19b6
DL
322/* Jim Meyering writes:
323
324 "... Some ctype macros are valid only for character codes that
325 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
326 using /bin/cc or gcc but without giving an ansi option). So, all
4bb91c68 327 ctype uses should be through macros like ISPRINT... If
f71b19b6
DL
328 STDC_HEADERS is defined, then autoconf has verified that the ctype
329 macros don't need to be guarded with references to isascii. ...
330 Defining isascii to 1 should let any compiler worth its salt
4bb91c68
SM
331 eliminate the && through constant folding."
332 Solaris defines some of these symbols so we must undefine them first. */
f71b19b6 333
4bb91c68 334# undef ISASCII
0b32bf0e
SM
335# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
336# define ISASCII(c) 1
337# else
338# define ISASCII(c) isascii(c)
339# endif
f71b19b6
DL
340
341/* 1 if C is an ASCII character. */
0b32bf0e 342# define IS_REAL_ASCII(c) ((c) < 0200)
f71b19b6
DL
343
344/* This distinction is not meaningful, except in Emacs. */
0b32bf0e
SM
345# define ISUNIBYTE(c) 1
346
347# ifdef isblank
348# define ISBLANK(c) (ISASCII (c) && isblank (c))
349# else
350# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
351# endif
352# ifdef isgraph
353# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
354# else
355# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
356# endif
357
4bb91c68 358# undef ISPRINT
0b32bf0e
SM
359# define ISPRINT(c) (ISASCII (c) && isprint (c))
360# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
361# define ISALNUM(c) (ISASCII (c) && isalnum (c))
362# define ISALPHA(c) (ISASCII (c) && isalpha (c))
363# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
364# define ISLOWER(c) (ISASCII (c) && islower (c))
365# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
366# define ISSPACE(c) (ISASCII (c) && isspace (c))
367# define ISUPPER(c) (ISASCII (c) && isupper (c))
368# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
369
370# define ISWORD(c) ISALPHA(c)
371
4bb91c68
SM
372# ifdef _tolower
373# define TOLOWER(c) _tolower(c)
374# else
375# define TOLOWER(c) tolower(c)
376# endif
377
378/* How many characters in the character set. */
379# define CHAR_SET_SIZE 256
380
0b32bf0e 381# ifdef SYNTAX_TABLE
f71b19b6 382
0b32bf0e 383extern char *re_syntax_table;
f71b19b6 384
0b32bf0e
SM
385# else /* not SYNTAX_TABLE */
386
0b32bf0e
SM
387static char re_syntax_table[CHAR_SET_SIZE];
388
389static void
390init_syntax_once ()
391{
392 register int c;
393 static int done = 0;
394
395 if (done)
396 return;
397
398 bzero (re_syntax_table, sizeof re_syntax_table);
399
4bb91c68
SM
400 for (c = 0; c < CHAR_SET_SIZE; ++c)
401 if (ISALNUM (c))
402 re_syntax_table[c] = Sword;
fa9a63c5 403
0b32bf0e 404 re_syntax_table['_'] = Sword;
fa9a63c5 405
0b32bf0e
SM
406 done = 1;
407}
408
409# endif /* not SYNTAX_TABLE */
96cc36cc 410
4bb91c68
SM
411# define SYNTAX(c) re_syntax_table[(c)]
412
96cc36cc
RS
413#endif /* not emacs */
414\f
fa9a63c5 415#ifndef NULL
0b32bf0e 416# define NULL (void *)0
fa9a63c5
RM
417#endif
418
419/* We remove any previous definition of `SIGN_EXTEND_CHAR',
420 since ours (we hope) works properly with all combinations of
421 machines, compilers, `char' and `unsigned char' argument types.
4bb91c68 422 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
423#undef SIGN_EXTEND_CHAR
424#if __STDC__
0b32bf0e 425# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
fa9a63c5
RM
426#else /* not __STDC__ */
427/* As in Harbison and Steele. */
0b32bf0e 428# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
fa9a63c5
RM
429#endif
430\f
431/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
432 use `alloca' instead of `malloc'. This is because using malloc in
433 re_search* or re_match* could cause memory leaks when C-g is used in
434 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
435 the other hand, malloc is more portable, and easier to debug.
436
fa9a63c5
RM
437 Because we sometimes use alloca, some routines have to be macros,
438 not functions -- `alloca'-allocated space disappears at the end of the
439 function it is called in. */
440
441#ifdef REGEX_MALLOC
442
0b32bf0e
SM
443# define REGEX_ALLOCATE malloc
444# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
445# define REGEX_FREE free
fa9a63c5
RM
446
447#else /* not REGEX_MALLOC */
448
449/* Emacs already defines alloca, sometimes. */
0b32bf0e 450# ifndef alloca
fa9a63c5
RM
451
452/* Make alloca work the best possible way. */
0b32bf0e
SM
453# ifdef __GNUC__
454# define alloca __builtin_alloca
455# else /* not __GNUC__ */
456# if HAVE_ALLOCA_H
457# include <alloca.h>
458# endif /* HAVE_ALLOCA_H */
459# endif /* not __GNUC__ */
fa9a63c5 460
0b32bf0e 461# endif /* not alloca */
fa9a63c5 462
0b32bf0e 463# define REGEX_ALLOCATE alloca
fa9a63c5
RM
464
465/* Assumes a `char *destination' variable. */
0b32bf0e 466# define REGEX_REALLOCATE(source, osize, nsize) \
fa9a63c5 467 (destination = (char *) alloca (nsize), \
4bb91c68 468 memcpy (destination, source, osize))
fa9a63c5
RM
469
470/* No need to do anything to free, after alloca. */
0b32bf0e 471# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
472
473#endif /* not REGEX_MALLOC */
474
475/* Define how to allocate the failure stack. */
476
0b32bf0e 477#if defined REL_ALLOC && defined REGEX_MALLOC
4297555e 478
0b32bf0e 479# define REGEX_ALLOCATE_STACK(size) \
fa9a63c5 480 r_alloc (&failure_stack_ptr, (size))
0b32bf0e 481# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 482 r_re_alloc (&failure_stack_ptr, (nsize))
0b32bf0e 483# define REGEX_FREE_STACK(ptr) \
fa9a63c5
RM
484 r_alloc_free (&failure_stack_ptr)
485
4297555e 486#else /* not using relocating allocator */
fa9a63c5 487
0b32bf0e 488# ifdef REGEX_MALLOC
fa9a63c5 489
0b32bf0e
SM
490# define REGEX_ALLOCATE_STACK malloc
491# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
492# define REGEX_FREE_STACK free
fa9a63c5 493
0b32bf0e 494# else /* not REGEX_MALLOC */
fa9a63c5 495
0b32bf0e 496# define REGEX_ALLOCATE_STACK alloca
fa9a63c5 497
0b32bf0e 498# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
fa9a63c5 499 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 500/* No need to explicitly free anything. */
0b32bf0e 501# define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5 502
0b32bf0e 503# endif /* not REGEX_MALLOC */
4297555e 504#endif /* not using relocating allocator */
fa9a63c5
RM
505
506
507/* True if `size1' is non-NULL and PTR is pointing anywhere inside
508 `string1' or just past its end. This works if PTR is NULL, which is
509 a good thing. */
25fe55af 510#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
511 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
512
513/* (Re)Allocate N items of type T using malloc, or fail. */
514#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
515#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
516#define RETALLOC_IF(addr, n, t) \
517 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
518#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
519
4bb91c68 520#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
521
522#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
523
524#undef MAX
525#undef MIN
526#define MAX(a, b) ((a) > (b) ? (a) : (b))
527#define MIN(a, b) ((a) < (b) ? (a) : (b))
528
66f0296e
SM
529/* Type of source-pattern and string chars. */
530typedef const unsigned char re_char;
531
fa9a63c5
RM
532typedef char boolean;
533#define false 0
534#define true 1
535
4bb91c68
SM
536static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
537 re_char *string1, int size1,
538 re_char *string2, int size2,
539 int pos,
540 struct re_registers *regs,
541 int stop));
fa9a63c5
RM
542\f
543/* These are the command codes that appear in compiled regular
4bb91c68 544 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
545 command code can specify any interpretation whatsoever for its
546 arguments. Zero bytes may appear in the compiled regular expression. */
547
548typedef enum
549{
550 no_op = 0,
551
4bb91c68 552 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
553 succeed,
554
25fe55af 555 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
556 exactn,
557
25fe55af 558 /* Matches any (more or less) character. */
fa9a63c5
RM
559 anychar,
560
25fe55af
RS
561 /* Matches any one char belonging to specified set. First
562 following byte is number of bitmap bytes. Then come bytes
563 for a bitmap saying which chars are in. Bits in each byte
564 are ordered low-bit-first. A character is in the set if its
565 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
566 automatically not in the set.
567
568 If the length byte has the 0x80 bit set, then that stuff
569 is followed by a range table:
570 2 bytes of flags for character sets (low 8 bits, high 8 bits)
0b32bf0e 571 See RANGE_TABLE_WORK_BITS below.
01618498 572 2 bytes, the number of pairs that follow (upto 32767)
96cc36cc 573 pairs, each 2 multibyte characters,
0b32bf0e 574 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
575 charset,
576
25fe55af 577 /* Same parameters as charset, but match any character that is
4bb91c68 578 not one of those specified. */
fa9a63c5
RM
579 charset_not,
580
25fe55af
RS
581 /* Start remembering the text that is matched, for storing in a
582 register. Followed by one byte with the register number, in
583 the range 0 to one less than the pattern buffer's re_nsub
505bde11 584 field. */
fa9a63c5
RM
585 start_memory,
586
25fe55af
RS
587 /* Stop remembering the text that is matched and store it in a
588 memory register. Followed by one byte with the register
589 number, in the range 0 to one less than `re_nsub' in the
505bde11 590 pattern buffer. */
fa9a63c5
RM
591 stop_memory,
592
25fe55af 593 /* Match a duplicate of something remembered. Followed by one
4bb91c68 594 byte containing the register number. */
fa9a63c5
RM
595 duplicate,
596
25fe55af 597 /* Fail unless at beginning of line. */
fa9a63c5
RM
598 begline,
599
4bb91c68 600 /* Fail unless at end of line. */
fa9a63c5
RM
601 endline,
602
25fe55af
RS
603 /* Succeeds if at beginning of buffer (if emacs) or at beginning
604 of string to be matched (if not). */
fa9a63c5
RM
605 begbuf,
606
25fe55af 607 /* Analogously, for end of buffer/string. */
fa9a63c5 608 endbuf,
5e69f11e 609
25fe55af 610 /* Followed by two byte relative address to which to jump. */
5e69f11e 611 jump,
fa9a63c5 612
25fe55af
RS
613 /* Followed by two-byte relative address of place to resume at
614 in case of failure. */
fa9a63c5 615 on_failure_jump,
5e69f11e 616
25fe55af
RS
617 /* Like on_failure_jump, but pushes a placeholder instead of the
618 current string position when executed. */
fa9a63c5 619 on_failure_keep_string_jump,
5e69f11e 620
505bde11
SM
621 /* Just like `on_failure_jump', except that it checks that we
622 don't get stuck in an infinite loop (matching an empty string
623 indefinitely). */
624 on_failure_jump_loop,
625
0683b6fa
SM
626 /* Just like `on_failure_jump_loop', except that it checks for
627 a different kind of loop (the kind that shows up with non-greedy
628 operators). This operation has to be immediately preceded
629 by a `no_op'. */
630 on_failure_jump_nastyloop,
631
0b32bf0e 632 /* A smart `on_failure_jump' used for greedy * and + operators.
505bde11
SM
633 It analyses the loop before which it is put and if the
634 loop does not require backtracking, it changes itself to
4e8a9132
SM
635 `on_failure_keep_string_jump' and short-circuits the loop,
636 else it just defaults to changing itself into `on_failure_jump'.
637 It assumes that it is pointing to just past a `jump'. */
505bde11 638 on_failure_jump_smart,
fa9a63c5 639
25fe55af 640 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
641 After matching N times, jump to the address upon failure.
642 Does not work if N starts at 0: use on_failure_jump_loop
643 instead. */
fa9a63c5
RM
644 succeed_n,
645
25fe55af
RS
646 /* Followed by two-byte relative address, and two-byte number n.
647 Jump to the address N times, then fail. */
fa9a63c5
RM
648 jump_n,
649
25fe55af
RS
650 /* Set the following two-byte relative address to the
651 subsequent two-byte number. The address *includes* the two
652 bytes of number. */
fa9a63c5
RM
653 set_number_at,
654
fa9a63c5
RM
655 wordbeg, /* Succeeds if at word beginning. */
656 wordend, /* Succeeds if at word end. */
657
658 wordbound, /* Succeeds if at a word boundary. */
1fb352e0 659 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5
RM
660
661 /* Matches any character whose syntax is specified. Followed by
25fe55af 662 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
663 syntaxspec,
664
665 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
666 notsyntaxspec
667
668#ifdef emacs
669 ,before_dot, /* Succeeds if before point. */
670 at_dot, /* Succeeds if at point. */
671 after_dot, /* Succeeds if after point. */
b18215fc
RS
672
673 /* Matches any character whose category-set contains the specified
25fe55af
RS
674 category. The operator is followed by a byte which contains a
675 category code (mnemonic ASCII character). */
b18215fc
RS
676 categoryspec,
677
678 /* Matches any character whose category-set does not contain the
679 specified category. The operator is followed by a byte which
680 contains the category code (mnemonic ASCII character). */
681 notcategoryspec
fa9a63c5
RM
682#endif /* emacs */
683} re_opcode_t;
684\f
685/* Common operations on the compiled pattern. */
686
687/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
688
689#define STORE_NUMBER(destination, number) \
690 do { \
691 (destination)[0] = (number) & 0377; \
692 (destination)[1] = (number) >> 8; \
693 } while (0)
694
695/* Same as STORE_NUMBER, except increment DESTINATION to
696 the byte after where the number is stored. Therefore, DESTINATION
697 must be an lvalue. */
698
699#define STORE_NUMBER_AND_INCR(destination, number) \
700 do { \
701 STORE_NUMBER (destination, number); \
702 (destination) += 2; \
703 } while (0)
704
705/* Put into DESTINATION a number stored in two contiguous bytes starting
706 at SOURCE. */
707
708#define EXTRACT_NUMBER(destination, source) \
709 do { \
710 (destination) = *(source) & 0377; \
711 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
712 } while (0)
713
714#ifdef DEBUG
4bb91c68 715static void extract_number _RE_ARGS ((int *dest, re_char *source));
fa9a63c5
RM
716static void
717extract_number (dest, source)
718 int *dest;
01618498 719 re_char *source;
fa9a63c5 720{
5e69f11e 721 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
722 *dest = *source & 0377;
723 *dest += temp << 8;
724}
725
4bb91c68 726# ifndef EXTRACT_MACROS /* To debug the macros. */
0b32bf0e
SM
727# undef EXTRACT_NUMBER
728# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
729# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
730
731#endif /* DEBUG */
732
733/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
734 SOURCE must be an lvalue. */
735
736#define EXTRACT_NUMBER_AND_INCR(destination, source) \
737 do { \
738 EXTRACT_NUMBER (destination, source); \
25fe55af 739 (source) += 2; \
fa9a63c5
RM
740 } while (0)
741
742#ifdef DEBUG
4bb91c68
SM
743static void extract_number_and_incr _RE_ARGS ((int *destination,
744 re_char **source));
fa9a63c5
RM
745static void
746extract_number_and_incr (destination, source)
747 int *destination;
01618498 748 re_char **source;
5e69f11e 749{
fa9a63c5
RM
750 extract_number (destination, *source);
751 *source += 2;
752}
753
0b32bf0e
SM
754# ifndef EXTRACT_MACROS
755# undef EXTRACT_NUMBER_AND_INCR
756# define EXTRACT_NUMBER_AND_INCR(dest, src) \
fa9a63c5 757 extract_number_and_incr (&dest, &src)
0b32bf0e 758# endif /* not EXTRACT_MACROS */
fa9a63c5
RM
759
760#endif /* DEBUG */
761\f
b18215fc
RS
762/* Store a multibyte character in three contiguous bytes starting
763 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 764 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
765
766#define STORE_CHARACTER_AND_INCR(destination, character) \
767 do { \
768 (destination)[0] = (character) & 0377; \
769 (destination)[1] = ((character) >> 8) & 0377; \
770 (destination)[2] = (character) >> 16; \
771 (destination) += 3; \
772 } while (0)
773
774/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 775 starting at SOURCE. */
b18215fc
RS
776
777#define EXTRACT_CHARACTER(destination, source) \
778 do { \
779 (destination) = ((source)[0] \
780 | ((source)[1] << 8) \
781 | ((source)[2] << 16)); \
782 } while (0)
783
784
785/* Macros for charset. */
786
787/* Size of bitmap of charset P in bytes. P is a start of charset,
788 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
789#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
790
791/* Nonzero if charset P has range table. */
25fe55af 792#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
793
794/* Return the address of range table of charset P. But not the start
795 of table itself, but the before where the number of ranges is
96cc36cc
RS
796 stored. `2 +' means to skip re_opcode_t and size of bitmap,
797 and the 2 bytes of flags at the start of the range table. */
798#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
799
800/* Extract the bit flags that start a range table. */
801#define CHARSET_RANGE_TABLE_BITS(p) \
802 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
803 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
804
805/* Test if C is listed in the bitmap of charset P. */
806#define CHARSET_LOOKUP_BITMAP(p, c) \
807 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
808 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
809
810/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
811 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
812 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
813 and end. */
814#define CHARSET_RANGE_TABLE_END(range_table, count) \
815 ((range_table) + (count) * 2 * 3)
816
25fe55af 817/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
818 COUNT is number of ranges in RANGE_TABLE. */
819#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
820 do \
821 { \
01618498
SM
822 re_wchar_t range_start, range_end; \
823 re_char *p; \
824 re_char *range_table_end \
b18215fc
RS
825 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
826 \
827 for (p = (range_table); p < range_table_end; p += 2 * 3) \
828 { \
829 EXTRACT_CHARACTER (range_start, p); \
830 EXTRACT_CHARACTER (range_end, p + 3); \
831 \
832 if (range_start <= (c) && (c) <= range_end) \
833 { \
834 (not) = !(not); \
835 break; \
836 } \
837 } \
838 } \
839 while (0)
840
841/* Test if C is in range table of CHARSET. The flag NOT is negated if
842 C is listed in it. */
843#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
844 do \
845 { \
846 /* Number of ranges in range table. */ \
847 int count; \
01618498
SM
848 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
849 \
b18215fc
RS
850 EXTRACT_NUMBER_AND_INCR (count, range_table); \
851 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
852 } \
853 while (0)
854\f
fa9a63c5
RM
855/* If DEBUG is defined, Regex prints many voluminous messages about what
856 it is doing (if the variable `debug' is nonzero). If linked with the
857 main program in `iregex.c', you can enter patterns and strings
858 interactively. And if linked with the main program in `main.c' and
4bb91c68 859 the other test files, you can run the already-written tests. */
fa9a63c5
RM
860
861#ifdef DEBUG
862
863/* We use standard I/O for debugging. */
0b32bf0e 864# include <stdio.h>
fa9a63c5
RM
865
866/* It is useful to test things that ``must'' be true when debugging. */
0b32bf0e 867# include <assert.h>
fa9a63c5 868
99633e97 869static int debug = -100000;
fa9a63c5 870
0b32bf0e
SM
871# define DEBUG_STATEMENT(e) e
872# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
873# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
874# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
875# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
876# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 877 if (debug > 0) print_partial_compiled_pattern (s, e)
0b32bf0e 878# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 879 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
880
881
882/* Print the fastmap in human-readable form. */
883
884void
885print_fastmap (fastmap)
886 char *fastmap;
887{
888 unsigned was_a_range = 0;
5e69f11e
RM
889 unsigned i = 0;
890
fa9a63c5
RM
891 while (i < (1 << BYTEWIDTH))
892 {
893 if (fastmap[i++])
894 {
895 was_a_range = 0;
25fe55af
RS
896 putchar (i - 1);
897 while (i < (1 << BYTEWIDTH) && fastmap[i])
898 {
899 was_a_range = 1;
900 i++;
901 }
fa9a63c5 902 if (was_a_range)
25fe55af
RS
903 {
904 printf ("-");
905 putchar (i - 1);
906 }
907 }
fa9a63c5 908 }
5e69f11e 909 putchar ('\n');
fa9a63c5
RM
910}
911
912
913/* Print a compiled pattern string in human-readable form, starting at
914 the START pointer into it and ending just before the pointer END. */
915
916void
917print_partial_compiled_pattern (start, end)
01618498
SM
918 re_char *start;
919 re_char *end;
fa9a63c5
RM
920{
921 int mcnt, mcnt2;
01618498
SM
922 re_char *p = start;
923 re_char *pend = end;
fa9a63c5
RM
924
925 if (start == NULL)
926 {
927 printf ("(null)\n");
928 return;
929 }
5e69f11e 930
fa9a63c5
RM
931 /* Loop over pattern commands. */
932 while (p < pend)
933 {
934 printf ("%d:\t", p - start);
935
936 switch ((re_opcode_t) *p++)
937 {
25fe55af
RS
938 case no_op:
939 printf ("/no_op");
940 break;
fa9a63c5 941
99633e97
SM
942 case succeed:
943 printf ("/succeed");
944 break;
945
fa9a63c5
RM
946 case exactn:
947 mcnt = *p++;
25fe55af
RS
948 printf ("/exactn/%d", mcnt);
949 do
fa9a63c5 950 {
25fe55af 951 putchar ('/');
fa9a63c5 952 putchar (*p++);
25fe55af
RS
953 }
954 while (--mcnt);
955 break;
fa9a63c5
RM
956
957 case start_memory:
505bde11 958 printf ("/start_memory/%d", *p++);
25fe55af 959 break;
fa9a63c5
RM
960
961 case stop_memory:
505bde11 962 printf ("/stop_memory/%d", *p++);
25fe55af 963 break;
fa9a63c5
RM
964
965 case duplicate:
966 printf ("/duplicate/%d", *p++);
967 break;
968
969 case anychar:
970 printf ("/anychar");
971 break;
972
973 case charset:
25fe55af
RS
974 case charset_not:
975 {
976 register int c, last = -100;
fa9a63c5 977 register int in_range = 0;
99633e97
SM
978 int length = CHARSET_BITMAP_SIZE (p - 1);
979 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5
RM
980
981 printf ("/charset [%s",
25fe55af 982 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 983
25fe55af 984 assert (p + *p < pend);
fa9a63c5 985
25fe55af 986 for (c = 0; c < 256; c++)
96cc36cc 987 if (c / 8 < length
fa9a63c5
RM
988 && (p[1 + (c/8)] & (1 << (c % 8))))
989 {
990 /* Are we starting a range? */
991 if (last + 1 == c && ! in_range)
992 {
993 putchar ('-');
994 in_range = 1;
995 }
996 /* Have we broken a range? */
997 else if (last + 1 != c && in_range)
96cc36cc 998 {
fa9a63c5
RM
999 putchar (last);
1000 in_range = 0;
1001 }
5e69f11e 1002
fa9a63c5
RM
1003 if (! in_range)
1004 putchar (c);
1005
1006 last = c;
25fe55af 1007 }
fa9a63c5
RM
1008
1009 if (in_range)
1010 putchar (last);
1011
1012 putchar (']');
1013
99633e97 1014 p += 1 + length;
96cc36cc 1015
96cc36cc 1016 if (has_range_table)
99633e97
SM
1017 {
1018 int count;
1019 printf ("has-range-table");
1020
1021 /* ??? Should print the range table; for now, just skip it. */
1022 p += 2; /* skip range table bits */
1023 EXTRACT_NUMBER_AND_INCR (count, p);
1024 p = CHARSET_RANGE_TABLE_END (p, count);
1025 }
fa9a63c5
RM
1026 }
1027 break;
1028
1029 case begline:
1030 printf ("/begline");
25fe55af 1031 break;
fa9a63c5
RM
1032
1033 case endline:
25fe55af
RS
1034 printf ("/endline");
1035 break;
fa9a63c5
RM
1036
1037 case on_failure_jump:
25fe55af
RS
1038 extract_number_and_incr (&mcnt, &p);
1039 printf ("/on_failure_jump to %d", p + mcnt - start);
1040 break;
fa9a63c5
RM
1041
1042 case on_failure_keep_string_jump:
25fe55af
RS
1043 extract_number_and_incr (&mcnt, &p);
1044 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
1045 break;
fa9a63c5 1046
0683b6fa
SM
1047 case on_failure_jump_nastyloop:
1048 extract_number_and_incr (&mcnt, &p);
1049 printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
1050 break;
1051
505bde11 1052 case on_failure_jump_loop:
fa9a63c5 1053 extract_number_and_incr (&mcnt, &p);
505bde11 1054 printf ("/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
1055 break;
1056
505bde11 1057 case on_failure_jump_smart:
fa9a63c5 1058 extract_number_and_incr (&mcnt, &p);
505bde11 1059 printf ("/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
1060 break;
1061
25fe55af 1062 case jump:
fa9a63c5 1063 extract_number_and_incr (&mcnt, &p);
25fe55af 1064 printf ("/jump to %d", p + mcnt - start);
fa9a63c5
RM
1065 break;
1066
25fe55af
RS
1067 case succeed_n:
1068 extract_number_and_incr (&mcnt, &p);
1069 extract_number_and_incr (&mcnt2, &p);
99633e97 1070 printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1071 break;
5e69f11e 1072
25fe55af
RS
1073 case jump_n:
1074 extract_number_and_incr (&mcnt, &p);
1075 extract_number_and_incr (&mcnt2, &p);
99633e97 1076 printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1077 break;
5e69f11e 1078
25fe55af
RS
1079 case set_number_at:
1080 extract_number_and_incr (&mcnt, &p);
1081 extract_number_and_incr (&mcnt2, &p);
99633e97 1082 printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1083 break;
5e69f11e 1084
25fe55af 1085 case wordbound:
fa9a63c5
RM
1086 printf ("/wordbound");
1087 break;
1088
1089 case notwordbound:
1090 printf ("/notwordbound");
25fe55af 1091 break;
fa9a63c5
RM
1092
1093 case wordbeg:
1094 printf ("/wordbeg");
1095 break;
5e69f11e 1096
fa9a63c5
RM
1097 case wordend:
1098 printf ("/wordend");
5e69f11e 1099
1fb352e0
SM
1100 case syntaxspec:
1101 printf ("/syntaxspec");
1102 mcnt = *p++;
1103 printf ("/%d", mcnt);
1104 break;
1105
1106 case notsyntaxspec:
1107 printf ("/notsyntaxspec");
1108 mcnt = *p++;
1109 printf ("/%d", mcnt);
1110 break;
1111
0b32bf0e 1112# ifdef emacs
fa9a63c5
RM
1113 case before_dot:
1114 printf ("/before_dot");
25fe55af 1115 break;
fa9a63c5
RM
1116
1117 case at_dot:
1118 printf ("/at_dot");
25fe55af 1119 break;
fa9a63c5
RM
1120
1121 case after_dot:
1122 printf ("/after_dot");
25fe55af 1123 break;
fa9a63c5 1124
1fb352e0
SM
1125 case categoryspec:
1126 printf ("/categoryspec");
fa9a63c5
RM
1127 mcnt = *p++;
1128 printf ("/%d", mcnt);
25fe55af 1129 break;
5e69f11e 1130
1fb352e0
SM
1131 case notcategoryspec:
1132 printf ("/notcategoryspec");
fa9a63c5
RM
1133 mcnt = *p++;
1134 printf ("/%d", mcnt);
1135 break;
0b32bf0e 1136# endif /* emacs */
fa9a63c5 1137
fa9a63c5
RM
1138 case begbuf:
1139 printf ("/begbuf");
25fe55af 1140 break;
fa9a63c5
RM
1141
1142 case endbuf:
1143 printf ("/endbuf");
25fe55af 1144 break;
fa9a63c5 1145
25fe55af
RS
1146 default:
1147 printf ("?%d", *(p-1));
fa9a63c5
RM
1148 }
1149
1150 putchar ('\n');
1151 }
1152
1153 printf ("%d:\tend of pattern.\n", p - start);
1154}
1155
1156
1157void
1158print_compiled_pattern (bufp)
1159 struct re_pattern_buffer *bufp;
1160{
01618498 1161 re_char *buffer = bufp->buffer;
fa9a63c5
RM
1162
1163 print_partial_compiled_pattern (buffer, buffer + bufp->used);
4bb91c68
SM
1164 printf ("%ld bytes used/%ld bytes allocated.\n",
1165 bufp->used, bufp->allocated);
fa9a63c5
RM
1166
1167 if (bufp->fastmap_accurate && bufp->fastmap)
1168 {
1169 printf ("fastmap: ");
1170 print_fastmap (bufp->fastmap);
1171 }
1172
1173 printf ("re_nsub: %d\t", bufp->re_nsub);
1174 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1175 printf ("can_be_null: %d\t", bufp->can_be_null);
fa9a63c5
RM
1176 printf ("no_sub: %d\t", bufp->no_sub);
1177 printf ("not_bol: %d\t", bufp->not_bol);
1178 printf ("not_eol: %d\t", bufp->not_eol);
4bb91c68 1179 printf ("syntax: %lx\n", bufp->syntax);
505bde11 1180 fflush (stdout);
fa9a63c5
RM
1181 /* Perhaps we should print the translate table? */
1182}
1183
1184
1185void
1186print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1187 re_char *where;
1188 re_char *string1;
1189 re_char *string2;
fa9a63c5
RM
1190 int size1;
1191 int size2;
1192{
4bb91c68 1193 int this_char;
5e69f11e 1194
fa9a63c5
RM
1195 if (where == NULL)
1196 printf ("(null)");
1197 else
1198 {
1199 if (FIRST_STRING_P (where))
25fe55af
RS
1200 {
1201 for (this_char = where - string1; this_char < size1; this_char++)
1202 putchar (string1[this_char]);
fa9a63c5 1203
25fe55af
RS
1204 where = string2;
1205 }
fa9a63c5
RM
1206
1207 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1208 putchar (string2[this_char]);
fa9a63c5
RM
1209 }
1210}
1211
1212#else /* not DEBUG */
1213
0b32bf0e
SM
1214# undef assert
1215# define assert(e)
fa9a63c5 1216
0b32bf0e
SM
1217# define DEBUG_STATEMENT(e)
1218# define DEBUG_PRINT1(x)
1219# define DEBUG_PRINT2(x1, x2)
1220# define DEBUG_PRINT3(x1, x2, x3)
1221# define DEBUG_PRINT4(x1, x2, x3, x4)
1222# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1223# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
fa9a63c5
RM
1224
1225#endif /* not DEBUG */
1226\f
1227/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1228 also be assigned to arbitrarily: each pattern buffer stores its own
1229 syntax, so it can be changed between regex compilations. */
1230/* This has no initializer because initialized variables in Emacs
1231 become read-only after dumping. */
1232reg_syntax_t re_syntax_options;
1233
1234
1235/* Specify the precise syntax of regexps for compilation. This provides
1236 for compatibility for various utilities which historically have
1237 different, incompatible syntaxes.
1238
1239 The argument SYNTAX is a bit mask comprised of the various bits
4bb91c68 1240 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1241
1242reg_syntax_t
1243re_set_syntax (syntax)
1244 reg_syntax_t syntax;
1245{
1246 reg_syntax_t ret = re_syntax_options;
5e69f11e 1247
fa9a63c5
RM
1248 re_syntax_options = syntax;
1249 return ret;
1250}
c0f9ea08 1251WEAK_ALIAS (__re_set_syntax, re_set_syntax)
fa9a63c5
RM
1252\f
1253/* This table gives an error message for each of the error codes listed
4bb91c68 1254 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1255 POSIX doesn't require that we do anything for REG_NOERROR,
4bb91c68 1256 but why not be nice? */
fa9a63c5
RM
1257
1258static const char *re_error_msgid[] =
5e69f11e
RM
1259 {
1260 gettext_noop ("Success"), /* REG_NOERROR */
1261 gettext_noop ("No match"), /* REG_NOMATCH */
1262 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1263 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1264 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1265 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1266 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1267 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1268 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1269 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1270 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1271 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1272 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1273 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1274 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1275 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1276 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
fa9a63c5
RM
1277 };
1278\f
4bb91c68 1279/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1280
1281/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1282 searching and matching functions should not call alloca. On some
1283 systems, alloca is implemented in terms of malloc, and if we're
1284 using the relocating allocator routines, then malloc could cause a
1285 relocation, which might (if the strings being searched are in the
1286 ralloc heap) shift the data out from underneath the regexp
1287 routines.
1288
5e69f11e 1289 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1290 processes input from X in a signal handler; processing X input may
1291 call malloc; if input arrives while a matching routine is calling
1292 malloc, then we're scrod. But Emacs can't just block input while
1293 calling matching routines; then we don't notice interrupts when
1294 they come in. So, Emacs blocks input around all regexp calls
1295 except the matching calls, which it leaves unprotected, in the
1296 faith that they will not malloc. */
1297
1298/* Normally, this is fine. */
1299#define MATCH_MAY_ALLOCATE
1300
1301/* When using GNU C, we are not REALLY using the C alloca, no matter
1302 what config.h may say. So don't take precautions for it. */
1303#ifdef __GNUC__
0b32bf0e 1304# undef C_ALLOCA
fa9a63c5
RM
1305#endif
1306
1307/* The match routines may not allocate if (1) they would do it with malloc
1308 and (2) it's not safe for them to use malloc.
1309 Note that if REL_ALLOC is defined, matching would not use malloc for the
1310 failure stack, but we would still use it for the register vectors;
4bb91c68 1311 so REL_ALLOC should not affect this. */
0b32bf0e
SM
1312#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1313# undef MATCH_MAY_ALLOCATE
fa9a63c5
RM
1314#endif
1315
1316\f
1317/* Failure stack declarations and macros; both re_compile_fastmap and
1318 re_match_2 use a failure stack. These have to be macros because of
1319 REGEX_ALLOCATE_STACK. */
5e69f11e 1320
fa9a63c5 1321
320a2a73 1322/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1323 when matching. If this number is exceeded, we allocate more
1324 space, so it is not a hard limit. */
1325#ifndef INIT_FAILURE_ALLOC
0b32bf0e 1326# define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1327#endif
1328
1329/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1330 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1331 This is a variable only so users of regex can assign to it; we never
ada30c0e
SM
1332 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1333 before using it, so it should probably be a byte-count instead. */
c0f9ea08
SM
1334# if defined MATCH_MAY_ALLOCATE
1335/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
320a2a73
KH
1336 whose default stack limit is 2mb. In order for a larger
1337 value to work reliably, you have to try to make it accord
1338 with the process stack limit. */
c0f9ea08
SM
1339size_t re_max_failures = 40000;
1340# else
1341size_t re_max_failures = 4000;
1342# endif
fa9a63c5
RM
1343
1344union fail_stack_elt
1345{
01618498 1346 re_char *pointer;
c0f9ea08
SM
1347 /* This should be the biggest `int' that's no bigger than a pointer. */
1348 long integer;
fa9a63c5
RM
1349};
1350
1351typedef union fail_stack_elt fail_stack_elt_t;
1352
1353typedef struct
1354{
1355 fail_stack_elt_t *stack;
c0f9ea08
SM
1356 size_t size;
1357 size_t avail; /* Offset of next open position. */
1358 size_t frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1359} fail_stack_type;
1360
505bde11 1361#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1362#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1363
1364
1365/* Define macros to initialize and free the failure stack.
1366 Do `return -2' if the alloc fails. */
1367
1368#ifdef MATCH_MAY_ALLOCATE
0b32bf0e 1369# define INIT_FAIL_STACK() \
fa9a63c5
RM
1370 do { \
1371 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1372 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1373 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1374 \
1375 if (fail_stack.stack == NULL) \
1376 return -2; \
1377 \
1378 fail_stack.size = INIT_FAILURE_ALLOC; \
1379 fail_stack.avail = 0; \
505bde11 1380 fail_stack.frame = 0; \
fa9a63c5
RM
1381 } while (0)
1382
0b32bf0e 1383# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
fa9a63c5 1384#else
0b32bf0e 1385# define INIT_FAIL_STACK() \
fa9a63c5
RM
1386 do { \
1387 fail_stack.avail = 0; \
505bde11 1388 fail_stack.frame = 0; \
fa9a63c5
RM
1389 } while (0)
1390
0b32bf0e 1391# define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1392#endif
1393
1394
320a2a73
KH
1395/* Double the size of FAIL_STACK, up to a limit
1396 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1397
1398 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1399 allocating space for it or it was already too large.
1400
4bb91c68 1401 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1402
320a2a73
KH
1403/* Factor to increase the failure stack size by
1404 when we increase it.
1405 This used to be 2, but 2 was too wasteful
1406 because the old discarded stacks added up to as much space
1407 were as ultimate, maximum-size stack. */
1408#define FAIL_STACK_GROWTH_FACTOR 4
1409
1410#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1411 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1412 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1413 ? 0 \
320a2a73
KH
1414 : ((fail_stack).stack \
1415 = (fail_stack_elt_t *) \
25fe55af
RS
1416 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1417 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1418 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1419 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1420 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1421 \
1422 (fail_stack).stack == NULL \
1423 ? 0 \
6453db45
KH
1424 : ((fail_stack).size \
1425 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1426 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1427 * FAIL_STACK_GROWTH_FACTOR)) \
1428 / sizeof (fail_stack_elt_t)), \
25fe55af 1429 1)))
fa9a63c5
RM
1430
1431
fa9a63c5
RM
1432/* Push a pointer value onto the failure stack.
1433 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1434 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5 1435#define PUSH_FAILURE_POINTER(item) \
01618498 1436 fail_stack.stack[fail_stack.avail++].pointer = (item)
fa9a63c5
RM
1437
1438/* This pushes an integer-valued item onto the failure stack.
1439 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1440 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1441#define PUSH_FAILURE_INT(item) \
1442 fail_stack.stack[fail_stack.avail++].integer = (item)
1443
1444/* Push a fail_stack_elt_t value onto the failure stack.
1445 Assumes the variable `fail_stack'. Probably should only
4bb91c68 1446 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1447#define PUSH_FAILURE_ELT(item) \
1448 fail_stack.stack[fail_stack.avail++] = (item)
1449
1450/* These three POP... operations complement the three PUSH... operations.
1451 All assume that `fail_stack' is nonempty. */
1452#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1453#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1454#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1455
505bde11
SM
1456/* Individual items aside from the registers. */
1457#define NUM_NONREG_ITEMS 3
1458
1459/* Used to examine the stack (to detect infinite loops). */
1460#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1461#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1462#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1463#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1464
1465
505bde11
SM
1466#define ENSURE_FAIL_STACK(space) \
1467while (REMAINING_AVAIL_SLOTS <= space) { \
1468 if (!GROW_FAIL_STACK (fail_stack)) \
1469 return -2; \
1470 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1471 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1472}
1473
1474/* Push register NUM onto the stack. */
1475#define PUSH_FAILURE_REG(num) \
1476do { \
1477 char *destination; \
1478 ENSURE_FAIL_STACK(3); \
1479 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1480 num, regstart[num], regend[num]); \
1481 PUSH_FAILURE_POINTER (regstart[num]); \
1482 PUSH_FAILURE_POINTER (regend[num]); \
1483 PUSH_FAILURE_INT (num); \
1484} while (0)
1485
01618498
SM
1486/* Change the counter's value to VAL, but make sure that it will
1487 be reset when backtracking. */
1488#define PUSH_NUMBER(ptr,val) \
dc1e502d
SM
1489do { \
1490 char *destination; \
1491 int c; \
1492 ENSURE_FAIL_STACK(3); \
1493 EXTRACT_NUMBER (c, ptr); \
01618498 1494 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
dc1e502d
SM
1495 PUSH_FAILURE_INT (c); \
1496 PUSH_FAILURE_POINTER (ptr); \
1497 PUSH_FAILURE_INT (-1); \
01618498 1498 STORE_NUMBER (ptr, val); \
dc1e502d
SM
1499} while (0)
1500
505bde11 1501/* Pop a saved register off the stack. */
dc1e502d 1502#define POP_FAILURE_REG_OR_COUNT() \
505bde11
SM
1503do { \
1504 int reg = POP_FAILURE_INT (); \
dc1e502d
SM
1505 if (reg == -1) \
1506 { \
1507 /* It's a counter. */ \
6dcf2d0e
SM
1508 /* Here, we discard `const', making re_match non-reentrant. */ \
1509 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
dc1e502d
SM
1510 reg = POP_FAILURE_INT (); \
1511 STORE_NUMBER (ptr, reg); \
1512 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1513 } \
1514 else \
1515 { \
1516 regend[reg] = POP_FAILURE_POINTER (); \
1517 regstart[reg] = POP_FAILURE_POINTER (); \
1518 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1519 reg, regstart[reg], regend[reg]); \
1520 } \
505bde11
SM
1521} while (0)
1522
f6df485f
RS
1523/* Discard a saved register off the stack. */
1524#define DISCARD_FAILURE_REG_OR_COUNT() \
1525do { \
1526 int reg = POP_FAILURE_INT (); \
1527 if (reg == -1) \
1528 { \
1529 /* It's a counter. */ \
1530 POP_FAILURE_POINTER (); \
1531 reg = POP_FAILURE_INT (); \
1532 DEBUG_PRINT3 (" Discard counter %p = %d\n", ptr, reg); \
1533 } \
1534 else \
1535 { \
1536 POP_FAILURE_POINTER (); \
1537 POP_FAILURE_POINTER (); \
1538 DEBUG_PRINT4 (" Discard reg %d (spanning %p -> %p)\n", \
1539 reg, regstart[reg], regend[reg]); \
1540 } \
1541} while (0)
1542
505bde11
SM
1543/* Check that we are not stuck in an infinite loop. */
1544#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1545do { \
f6df485f 1546 int failure = TOP_FAILURE_HANDLE (); \
505bde11 1547 /* Check for infinite matching loops */ \
f6df485f
RS
1548 while (failure > 0 \
1549 && (FAILURE_STR (failure) == string_place \
1550 || FAILURE_STR (failure) == NULL)) \
505bde11
SM
1551 { \
1552 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1553 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11 1554 if (FAILURE_PAT (failure) == pat_cur) \
f6df485f
RS
1555 { \
1556 while (fail_stack.frame < fail_stack.avail) \
1557 DISCARD_FAILURE_REG_OR_COUNT (); \
1558 goto fail; \
1559 } \
66f0296e 1560 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1561 failure = NEXT_FAILURE_HANDLE(failure); \
1562 } \
1563 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1564} while (0)
1565
fa9a63c5 1566/* Push the information about the state we will need
5e69f11e
RM
1567 if we ever fail back to it.
1568
505bde11 1569 Requires variables fail_stack, regstart, regend and
320a2a73 1570 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1571 declared.
5e69f11e 1572
fa9a63c5
RM
1573 Does `return FAILURE_CODE' if runs out of memory. */
1574
505bde11
SM
1575#define PUSH_FAILURE_POINT(pattern, string_place) \
1576do { \
1577 char *destination; \
1578 /* Must be int, so when we don't save any registers, the arithmetic \
1579 of 0 + -1 isn't done as unsigned. */ \
1580 \
505bde11 1581 DEBUG_STATEMENT (nfailure_points_pushed++); \
4bb91c68 1582 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
505bde11
SM
1583 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1584 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1585 \
1586 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1587 \
1588 DEBUG_PRINT1 ("\n"); \
1589 \
1590 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1591 PUSH_FAILURE_INT (fail_stack.frame); \
1592 \
1593 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1594 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1595 DEBUG_PRINT1 ("'\n"); \
1596 PUSH_FAILURE_POINTER (string_place); \
1597 \
1598 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1599 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1600 PUSH_FAILURE_POINTER (pattern); \
1601 \
1602 /* Close the frame by moving the frame pointer past it. */ \
1603 fail_stack.frame = fail_stack.avail; \
1604} while (0)
fa9a63c5 1605
320a2a73
KH
1606/* Estimate the size of data pushed by a typical failure stack entry.
1607 An estimate is all we need, because all we use this for
1608 is to choose a limit for how big to make the failure stack. */
ada30c0e 1609/* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
320a2a73 1610#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1611
fa9a63c5
RM
1612/* How many items can still be added to the stack without overflowing it. */
1613#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1614
1615
1616/* Pops what PUSH_FAIL_STACK pushes.
1617
1618 We restore into the parameters, all of which should be lvalues:
1619 STR -- the saved data position.
1620 PAT -- the saved pattern position.
fa9a63c5 1621 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1622
fa9a63c5 1623 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1624 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1625
505bde11
SM
1626#define POP_FAILURE_POINT(str, pat) \
1627do { \
fa9a63c5
RM
1628 assert (!FAIL_STACK_EMPTY ()); \
1629 \
1630 /* Remove failure points and point to how many regs pushed. */ \
1631 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1632 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1633 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1634 \
505bde11
SM
1635 /* Pop the saved registers. */ \
1636 while (fail_stack.frame < fail_stack.avail) \
dc1e502d 1637 POP_FAILURE_REG_OR_COUNT (); \
fa9a63c5 1638 \
01618498 1639 pat = POP_FAILURE_POINTER (); \
505bde11
SM
1640 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1641 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1642 \
1643 /* If the saved string location is NULL, it came from an \
1644 on_failure_keep_string_jump opcode, and we want to throw away the \
1645 saved NULL, thus retaining our current position in the string. */ \
01618498 1646 str = POP_FAILURE_POINTER (); \
505bde11 1647 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1648 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1649 DEBUG_PRINT1 ("'\n"); \
1650 \
505bde11
SM
1651 fail_stack.frame = POP_FAILURE_INT (); \
1652 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1653 \
505bde11
SM
1654 assert (fail_stack.avail >= 0); \
1655 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1656 \
fa9a63c5 1657 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1658} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1659
1660
1661\f
fa9a63c5 1662/* Registers are set to a sentinel when they haven't yet matched. */
4bb91c68 1663#define REG_UNSET(e) ((e) == NULL)
fa9a63c5
RM
1664\f
1665/* Subroutine declarations and macros for regex_compile. */
1666
4bb91c68
SM
1667static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1668 reg_syntax_t syntax,
1669 struct re_pattern_buffer *bufp));
1670static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1671static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1672 int arg1, int arg2));
1673static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1674 int arg, unsigned char *end));
1675static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1676 int arg1, int arg2, unsigned char *end));
01618498
SM
1677static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1678 re_char *p,
4bb91c68 1679 reg_syntax_t syntax));
01618498
SM
1680static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1681 re_char *pend,
4bb91c68 1682 reg_syntax_t syntax));
01618498
SM
1683static re_char *skip_one_char _RE_ARGS ((re_char *p));
1684static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
4bb91c68 1685 char *fastmap, const int multibyte));
fa9a63c5 1686
fa9a63c5 1687/* Fetch the next character in the uncompiled pattern, with no
4bb91c68 1688 translation. */
36595814 1689#define PATFETCH(c) \
2d1675e4
SM
1690 do { \
1691 int len; \
1692 if (p == pend) return REG_EEND; \
1693 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1694 p += len; \
fa9a63c5
RM
1695 } while (0)
1696
fa9a63c5
RM
1697
1698/* If `translate' is non-null, return translate[D], else just D. We
1699 cast the subscript to translate because some data is declared as
1700 `char *', to avoid warnings when a string constant is passed. But
1701 when we use a character as a subscript we must make it unsigned. */
6676cb1c 1702#ifndef TRANSLATE
0b32bf0e 1703# define TRANSLATE(d) \
66f0296e 1704 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1705#endif
fa9a63c5
RM
1706
1707
1708/* Macros for outputting the compiled pattern into `buffer'. */
1709
1710/* If the buffer isn't allocated when it comes in, use this. */
1711#define INIT_BUF_SIZE 32
1712
4bb91c68 1713/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5 1714#define GET_BUFFER_SPACE(n) \
01618498 1715 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
fa9a63c5
RM
1716 EXTEND_BUFFER ()
1717
1718/* Make sure we have one more byte of buffer space and then add C to it. */
1719#define BUF_PUSH(c) \
1720 do { \
1721 GET_BUFFER_SPACE (1); \
1722 *b++ = (unsigned char) (c); \
1723 } while (0)
1724
1725
1726/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1727#define BUF_PUSH_2(c1, c2) \
1728 do { \
1729 GET_BUFFER_SPACE (2); \
1730 *b++ = (unsigned char) (c1); \
1731 *b++ = (unsigned char) (c2); \
1732 } while (0)
1733
1734
4bb91c68 1735/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1736#define BUF_PUSH_3(c1, c2, c3) \
1737 do { \
1738 GET_BUFFER_SPACE (3); \
1739 *b++ = (unsigned char) (c1); \
1740 *b++ = (unsigned char) (c2); \
1741 *b++ = (unsigned char) (c3); \
1742 } while (0)
1743
1744
1745/* Store a jump with opcode OP at LOC to location TO. We store a
4bb91c68 1746 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1747#define STORE_JUMP(op, loc, to) \
1748 store_op1 (op, loc, (to) - (loc) - 3)
1749
1750/* Likewise, for a two-argument jump. */
1751#define STORE_JUMP2(op, loc, to, arg) \
1752 store_op2 (op, loc, (to) - (loc) - 3, arg)
1753
4bb91c68 1754/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1755#define INSERT_JUMP(op, loc, to) \
1756 insert_op1 (op, loc, (to) - (loc) - 3, b)
1757
1758/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1759#define INSERT_JUMP2(op, loc, to, arg) \
1760 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1761
1762
1763/* This is not an arbitrary limit: the arguments which represent offsets
4bb91c68 1764 into the pattern are two bytes long. So if 2^16 bytes turns out to
fa9a63c5 1765 be too small, many things would have to change. */
4bb91c68
SM
1766/* Any other compiler which, like MSC, has allocation limit below 2^16
1767 bytes will have to use approach similar to what was done below for
1768 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1769 reallocating to 0 bytes. Such thing is not going to work too well.
1770 You have been warned!! */
1771#if defined _MSC_VER && !defined WIN32
1772/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1773# define MAX_BUF_SIZE 65500L
1774#else
1775# define MAX_BUF_SIZE (1L << 16)
1776#endif
fa9a63c5
RM
1777
1778/* Extend the buffer by twice its current size via realloc and
1779 reset the pointers that pointed into the old block to point to the
1780 correct places in the new one. If extending the buffer results in it
4bb91c68
SM
1781 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1782#if __BOUNDED_POINTERS__
1783# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1784# define MOVE_BUFFER_POINTER(P) \
1785 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
1786# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1787 else \
1788 { \
1789 SET_HIGH_BOUND (b); \
1790 SET_HIGH_BOUND (begalt); \
1791 if (fixup_alt_jump) \
1792 SET_HIGH_BOUND (fixup_alt_jump); \
1793 if (laststart) \
1794 SET_HIGH_BOUND (laststart); \
1795 if (pending_exact) \
1796 SET_HIGH_BOUND (pending_exact); \
1797 }
1798#else
1799# define MOVE_BUFFER_POINTER(P) (P) += incr
1800# define ELSE_EXTEND_BUFFER_HIGH_BOUND
1801#endif
fa9a63c5 1802#define EXTEND_BUFFER() \
25fe55af 1803 do { \
01618498 1804 re_char *old_buffer = bufp->buffer; \
25fe55af 1805 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1806 return REG_ESIZE; \
1807 bufp->allocated <<= 1; \
1808 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1809 bufp->allocated = MAX_BUF_SIZE; \
01618498 1810 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
fa9a63c5
RM
1811 if (bufp->buffer == NULL) \
1812 return REG_ESPACE; \
1813 /* If the buffer moved, move all the pointers into it. */ \
1814 if (old_buffer != bufp->buffer) \
1815 { \
4bb91c68
SM
1816 int incr = bufp->buffer - old_buffer; \
1817 MOVE_BUFFER_POINTER (b); \
1818 MOVE_BUFFER_POINTER (begalt); \
25fe55af 1819 if (fixup_alt_jump) \
4bb91c68 1820 MOVE_BUFFER_POINTER (fixup_alt_jump); \
25fe55af 1821 if (laststart) \
4bb91c68 1822 MOVE_BUFFER_POINTER (laststart); \
25fe55af 1823 if (pending_exact) \
4bb91c68 1824 MOVE_BUFFER_POINTER (pending_exact); \
fa9a63c5 1825 } \
4bb91c68 1826 ELSE_EXTEND_BUFFER_HIGH_BOUND \
fa9a63c5
RM
1827 } while (0)
1828
1829
1830/* Since we have one byte reserved for the register number argument to
1831 {start,stop}_memory, the maximum number of groups we can report
1832 things about is what fits in that byte. */
1833#define MAX_REGNUM 255
1834
1835/* But patterns can have more than `MAX_REGNUM' registers. We just
1836 ignore the excess. */
098d42af 1837typedef int regnum_t;
fa9a63c5
RM
1838
1839
1840/* Macros for the compile stack. */
1841
1842/* Since offsets can go either forwards or backwards, this type needs to
4bb91c68
SM
1843 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1844/* int may be not enough when sizeof(int) == 2. */
1845typedef long pattern_offset_t;
fa9a63c5
RM
1846
1847typedef struct
1848{
1849 pattern_offset_t begalt_offset;
1850 pattern_offset_t fixup_alt_jump;
5e69f11e 1851 pattern_offset_t laststart_offset;
fa9a63c5
RM
1852 regnum_t regnum;
1853} compile_stack_elt_t;
1854
1855
1856typedef struct
1857{
1858 compile_stack_elt_t *stack;
1859 unsigned size;
1860 unsigned avail; /* Offset of next open position. */
1861} compile_stack_type;
1862
1863
1864#define INIT_COMPILE_STACK_SIZE 32
1865
1866#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1867#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1868
4bb91c68 1869/* The next available element. */
fa9a63c5
RM
1870#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1871
77d11aec
RS
1872/* Explicit quit checking is only used on NTemacs. */
1873#if defined WINDOWSNT && defined emacs && defined QUIT
1874extern int immediate_quit;
1875# define IMMEDIATE_QUIT_CHECK \
1876 do { \
1877 if (immediate_quit) QUIT; \
1878 } while (0)
1879#else
1880# define IMMEDIATE_QUIT_CHECK ((void)0)
1881#endif
1882\f
b18215fc
RS
1883/* Structure to manage work area for range table. */
1884struct range_table_work_area
1885{
1886 int *table; /* actual work area. */
1887 int allocated; /* allocated size for work area in bytes. */
25fe55af 1888 int used; /* actually used size in words. */
96cc36cc 1889 int bits; /* flag to record character classes */
b18215fc
RS
1890};
1891
77d11aec
RS
1892/* Make sure that WORK_AREA can hold more N multibyte characters.
1893 This is used only in set_image_of_range and set_image_of_range_1.
1894 It expects WORK_AREA to be a pointer.
1895 If it can't get the space, it returns from the surrounding function. */
1896
1897#define EXTEND_RANGE_TABLE(work_area, n) \
1898 do { \
1899 if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
1900 { \
1901 extend_range_table_work_area (work_area); \
1902 if ((work_area)->table == 0) \
1903 return (REG_ESPACE); \
1904 } \
b18215fc
RS
1905 } while (0)
1906
96cc36cc
RS
1907#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1908 (work_area).bits |= (bit)
1909
14473664
SM
1910/* Bits used to implement the multibyte-part of the various character classes
1911 such as [:alnum:] in a charset's range table. */
1912#define BIT_WORD 0x1
1913#define BIT_LOWER 0x2
1914#define BIT_PUNCT 0x4
1915#define BIT_SPACE 0x8
1916#define BIT_UPPER 0x10
1917#define BIT_MULTIBYTE 0x20
96cc36cc 1918
36595814
SM
1919/* Set a range START..END to WORK_AREA.
1920 The range is passed through TRANSLATE, so START and END
1921 should be untranslated. */
77d11aec
RS
1922#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
1923 do { \
1924 int tem; \
1925 tem = set_image_of_range (&work_area, start, end, translate); \
1926 if (tem > 0) \
1927 FREE_STACK_RETURN (tem); \
b18215fc
RS
1928 } while (0)
1929
25fe55af 1930/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1931#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1932 do { \
1933 if ((work_area).table) \
1934 free ((work_area).table); \
1935 } while (0)
1936
96cc36cc 1937#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1938#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1939#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc 1940#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
77d11aec 1941\f
b18215fc 1942
fa9a63c5 1943/* Set the bit for character C in a list. */
01618498 1944#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
fa9a63c5
RM
1945
1946
1947/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1948#define GET_UNSIGNED_NUMBER(num) \
99633e97 1949 do { if (p != pend) \
fa9a63c5 1950 { \
25fe55af 1951 PATFETCH (c); \
060cc8ff
RS
1952 if (c == ' ') \
1953 FREE_STACK_RETURN (REG_BADBR); \
4bb91c68 1954 while ('0' <= c && c <= '9') \
25fe55af 1955 { \
f6df485f 1956 int prev; \
25fe55af 1957 if (num < 0) \
f6df485f
RS
1958 num = 0; \
1959 prev = num; \
25fe55af 1960 num = num * 10 + c - '0'; \
f6df485f
RS
1961 if (num / 10 != prev) \
1962 FREE_STACK_RETURN (REG_BADBR); \
25fe55af 1963 if (p == pend) \
f6df485f 1964 break; \
25fe55af
RS
1965 PATFETCH (c); \
1966 } \
060cc8ff
RS
1967 if (c == ' ') \
1968 FREE_STACK_RETURN (REG_BADBR); \
25fe55af 1969 } \
99633e97 1970 } while (0)
77d11aec 1971\f
a0ad02f7 1972#if WIDE_CHAR_SUPPORT
14473664
SM
1973/* The GNU C library provides support for user-defined character classes
1974 and the functions from ISO C amendement 1. */
1975# ifdef CHARCLASS_NAME_MAX
1976# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
1977# else
1978/* This shouldn't happen but some implementation might still have this
1979 problem. Use a reasonable default value. */
1980# define CHAR_CLASS_MAX_LENGTH 256
1981# endif
1982typedef wctype_t re_wctype_t;
01618498 1983typedef wchar_t re_wchar_t;
14473664
SM
1984# define re_wctype wctype
1985# define re_iswctype iswctype
1986# define re_wctype_to_bit(cc) 0
1987#else
1988# define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
1989# define btowc(c) c
1990
01618498 1991/* Character classes. */
14473664
SM
1992typedef enum { RECC_ERROR = 0,
1993 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1994 RECC_GRAPH, RECC_PRINT,
1995 RECC_LOWER, RECC_UPPER,
1996 RECC_PUNCT, RECC_CNTRL,
1997 RECC_DIGIT, RECC_XDIGIT,
1998 RECC_BLANK, RECC_SPACE,
1999 RECC_MULTIBYTE, RECC_NONASCII,
2000 RECC_ASCII, RECC_UNIBYTE
2001} re_wctype_t;
2002
01618498
SM
2003typedef int re_wchar_t;
2004
14473664
SM
2005/* Map a string to the char class it names (if any). */
2006static re_wctype_t
ada30c0e
SM
2007re_wctype (str)
2008 re_char *str;
14473664 2009{
ada30c0e 2010 const char *string = str;
14473664
SM
2011 if (STREQ (string, "alnum")) return RECC_ALNUM;
2012 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2013 else if (STREQ (string, "word")) return RECC_WORD;
2014 else if (STREQ (string, "ascii")) return RECC_ASCII;
2015 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2016 else if (STREQ (string, "graph")) return RECC_GRAPH;
2017 else if (STREQ (string, "lower")) return RECC_LOWER;
2018 else if (STREQ (string, "print")) return RECC_PRINT;
2019 else if (STREQ (string, "punct")) return RECC_PUNCT;
2020 else if (STREQ (string, "space")) return RECC_SPACE;
2021 else if (STREQ (string, "upper")) return RECC_UPPER;
2022 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2023 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2024 else if (STREQ (string, "digit")) return RECC_DIGIT;
2025 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2026 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2027 else if (STREQ (string, "blank")) return RECC_BLANK;
2028 else return 0;
2029}
2030
2031/* True iff CH is in the char class CC. */
2032static boolean
2033re_iswctype (ch, cc)
2034 int ch;
2035 re_wctype_t cc;
2036{
2037 switch (cc)
2038 {
0cdd06f8
SM
2039 case RECC_ALNUM: return ISALNUM (ch);
2040 case RECC_ALPHA: return ISALPHA (ch);
2041 case RECC_BLANK: return ISBLANK (ch);
2042 case RECC_CNTRL: return ISCNTRL (ch);
2043 case RECC_DIGIT: return ISDIGIT (ch);
2044 case RECC_GRAPH: return ISGRAPH (ch);
2045 case RECC_LOWER: return ISLOWER (ch);
2046 case RECC_PRINT: return ISPRINT (ch);
2047 case RECC_PUNCT: return ISPUNCT (ch);
2048 case RECC_SPACE: return ISSPACE (ch);
2049 case RECC_UPPER: return ISUPPER (ch);
2050 case RECC_XDIGIT: return ISXDIGIT (ch);
2051 case RECC_ASCII: return IS_REAL_ASCII (ch);
2052 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2053 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2054 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2055 case RECC_WORD: return ISWORD (ch);
2056 case RECC_ERROR: return false;
2057 default:
2058 abort();
14473664
SM
2059 }
2060}
fa9a63c5 2061
14473664
SM
2062/* Return a bit-pattern to use in the range-table bits to match multibyte
2063 chars of class CC. */
2064static int
2065re_wctype_to_bit (cc)
2066 re_wctype_t cc;
2067{
2068 switch (cc)
2069 {
2070 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
0cdd06f8
SM
2071 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2072 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2073 case RECC_LOWER: return BIT_LOWER;
2074 case RECC_UPPER: return BIT_UPPER;
2075 case RECC_PUNCT: return BIT_PUNCT;
2076 case RECC_SPACE: return BIT_SPACE;
14473664 2077 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
0cdd06f8
SM
2078 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2079 default:
2080 abort();
14473664
SM
2081 }
2082}
2083#endif
77d11aec
RS
2084\f
2085/* Filling in the work area of a range. */
2086
2087/* Actually extend the space in WORK_AREA. */
2088
2089static void
2090extend_range_table_work_area (work_area)
2091 struct range_table_work_area *work_area;
2092{
2093 work_area->allocated += 16 * sizeof (int);
2094 if (work_area->table)
2095 work_area->table
2096 = (int *) realloc (work_area->table, work_area->allocated);
2097 else
2098 work_area->table
2099 = (int *) malloc (work_area->allocated);
2100}
2101
2102#ifdef emacs
2103
2104/* Carefully find the ranges of codes that are equivalent
2105 under case conversion to the range start..end when passed through
2106 TRANSLATE. Handle the case where non-letters can come in between
2107 two upper-case letters (which happens in Latin-1).
2108 Also handle the case of groups of more than 2 case-equivalent chars.
2109
2110 The basic method is to look at consecutive characters and see
2111 if they can form a run that can be handled as one.
2112
2113 Returns -1 if successful, REG_ESPACE if ran out of space. */
2114
2115static int
2116set_image_of_range_1 (work_area, start, end, translate)
2117 RE_TRANSLATE_TYPE translate;
2118 struct range_table_work_area *work_area;
2119 re_wchar_t start, end;
2120{
2121 /* `one_case' indicates a character, or a run of characters,
2122 each of which is an isolate (no case-equivalents).
2123 This includes all ASCII non-letters.
2124
2125 `two_case' indicates a character, or a run of characters,
2126 each of which has two case-equivalent forms.
2127 This includes all ASCII letters.
2128
2129 `strange' indicates a character that has more than one
2130 case-equivalent. */
2131
2132 enum case_type {one_case, two_case, strange};
2133
2134 /* Describe the run that is in progress,
2135 which the next character can try to extend.
2136 If run_type is strange, that means there really is no run.
2137 If run_type is one_case, then run_start...run_end is the run.
2138 If run_type is two_case, then the run is run_start...run_end,
2139 and the case-equivalents end at run_eqv_end. */
2140
2141 enum case_type run_type = strange;
2142 int run_start, run_end, run_eqv_end;
2143
2144 Lisp_Object eqv_table;
2145
2146 if (!RE_TRANSLATE_P (translate))
2147 {
b7c12565 2148 EXTEND_RANGE_TABLE (work_area, 2);
77d11aec
RS
2149 work_area->table[work_area->used++] = (start);
2150 work_area->table[work_area->used++] = (end);
b7c12565 2151 return -1;
77d11aec
RS
2152 }
2153
2154 eqv_table = XCHAR_TABLE (translate)->extras[2];
99633e97 2155
77d11aec
RS
2156 for (; start <= end; start++)
2157 {
2158 enum case_type this_type;
2159 int eqv = RE_TRANSLATE (eqv_table, start);
2160 int minchar, maxchar;
2161
2162 /* Classify this character */
2163 if (eqv == start)
2164 this_type = one_case;
2165 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2166 this_type = two_case;
2167 else
2168 this_type = strange;
2169
2170 if (start < eqv)
2171 minchar = start, maxchar = eqv;
2172 else
2173 minchar = eqv, maxchar = start;
2174
2175 /* Can this character extend the run in progress? */
2176 if (this_type == strange || this_type != run_type
2177 || !(minchar == run_end + 1
2178 && (run_type == two_case
2179 ? maxchar == run_eqv_end + 1 : 1)))
2180 {
2181 /* No, end the run.
2182 Record each of its equivalent ranges. */
2183 if (run_type == one_case)
2184 {
2185 EXTEND_RANGE_TABLE (work_area, 2);
2186 work_area->table[work_area->used++] = run_start;
2187 work_area->table[work_area->used++] = run_end;
2188 }
2189 else if (run_type == two_case)
2190 {
2191 EXTEND_RANGE_TABLE (work_area, 4);
2192 work_area->table[work_area->used++] = run_start;
2193 work_area->table[work_area->used++] = run_end;
2194 work_area->table[work_area->used++]
2195 = RE_TRANSLATE (eqv_table, run_start);
2196 work_area->table[work_area->used++]
2197 = RE_TRANSLATE (eqv_table, run_end);
2198 }
2199 run_type = strange;
2200 }
2201
2202 if (this_type == strange)
2203 {
2204 /* For a strange character, add each of its equivalents, one
2205 by one. Don't start a range. */
2206 do
2207 {
2208 EXTEND_RANGE_TABLE (work_area, 2);
2209 work_area->table[work_area->used++] = eqv;
2210 work_area->table[work_area->used++] = eqv;
2211 eqv = RE_TRANSLATE (eqv_table, eqv);
2212 }
2213 while (eqv != start);
2214 }
2215
2216 /* Add this char to the run, or start a new run. */
2217 else if (run_type == strange)
2218 {
2219 /* Initialize a new range. */
2220 run_type = this_type;
2221 run_start = start;
2222 run_end = start;
2223 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2224 }
2225 else
2226 {
2227 /* Extend a running range. */
2228 run_end = minchar;
2229 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2230 }
2231 }
2232
2233 /* If a run is still in progress at the end, finish it now
2234 by recording its equivalent ranges. */
2235 if (run_type == one_case)
2236 {
2237 EXTEND_RANGE_TABLE (work_area, 2);
2238 work_area->table[work_area->used++] = run_start;
2239 work_area->table[work_area->used++] = run_end;
2240 }
2241 else if (run_type == two_case)
2242 {
2243 EXTEND_RANGE_TABLE (work_area, 4);
2244 work_area->table[work_area->used++] = run_start;
2245 work_area->table[work_area->used++] = run_end;
2246 work_area->table[work_area->used++]
2247 = RE_TRANSLATE (eqv_table, run_start);
2248 work_area->table[work_area->used++]
2249 = RE_TRANSLATE (eqv_table, run_end);
2250 }
2251
2252 return -1;
2253}
36595814 2254
77d11aec 2255#endif /* emacs */
36595814 2256
b7c12565 2257/* Record the the image of the range start..end when passed through
36595814
SM
2258 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2259 and is not even necessarily contiguous.
b7c12565
RS
2260 Normally we approximate it with the smallest contiguous range that contains
2261 all the chars we need. However, for Latin-1 we go to extra effort
2262 to do a better job.
2263
2264 This function is not called for ASCII ranges.
77d11aec
RS
2265
2266 Returns -1 if successful, REG_ESPACE if ran out of space. */
2267
2268static int
36595814
SM
2269set_image_of_range (work_area, start, end, translate)
2270 RE_TRANSLATE_TYPE translate;
2271 struct range_table_work_area *work_area;
2272 re_wchar_t start, end;
2273{
77d11aec
RS
2274 re_wchar_t cmin, cmax;
2275
2276#ifdef emacs
2277 /* For Latin-1 ranges, use set_image_of_range_1
2278 to get proper handling of ranges that include letters and nonletters.
b7c12565 2279 For a range that includes the whole of Latin-1, this is not necessary.
77d11aec 2280 For other character sets, we don't bother to get this right. */
b7c12565
RS
2281 if (RE_TRANSLATE_P (translate) && start < 04400
2282 && !(start < 04200 && end >= 04377))
77d11aec 2283 {
b7c12565 2284 int newend;
77d11aec 2285 int tem;
b7c12565
RS
2286 newend = end;
2287 if (newend > 04377)
2288 newend = 04377;
2289 tem = set_image_of_range_1 (work_area, start, newend, translate);
77d11aec
RS
2290 if (tem > 0)
2291 return tem;
2292
2293 start = 04400;
2294 if (end < 04400)
2295 return -1;
2296 }
2297#endif
2298
b7c12565
RS
2299 EXTEND_RANGE_TABLE (work_area, 2);
2300 work_area->table[work_area->used++] = (start);
2301 work_area->table[work_area->used++] = (end);
2302
2303 cmin = -1, cmax = -1;
77d11aec 2304
36595814 2305 if (RE_TRANSLATE_P (translate))
b7c12565
RS
2306 {
2307 int ch;
77d11aec 2308
b7c12565
RS
2309 for (ch = start; ch <= end; ch++)
2310 {
2311 re_wchar_t c = TRANSLATE (ch);
2312 if (! (start <= c && c <= end))
2313 {
2314 if (cmin == -1)
2315 cmin = c, cmax = c;
2316 else
2317 {
2318 cmin = MIN (cmin, c);
2319 cmax = MAX (cmax, c);
2320 }
2321 }
2322 }
2323
2324 if (cmin != -1)
2325 {
2326 EXTEND_RANGE_TABLE (work_area, 2);
2327 work_area->table[work_area->used++] = (cmin);
2328 work_area->table[work_area->used++] = (cmax);
2329 }
2330 }
36595814 2331
77d11aec
RS
2332 return -1;
2333}
fa9a63c5
RM
2334\f
2335#ifndef MATCH_MAY_ALLOCATE
2336
2337/* If we cannot allocate large objects within re_match_2_internal,
2338 we make the fail stack and register vectors global.
2339 The fail stack, we grow to the maximum size when a regexp
2340 is compiled.
2341 The register vectors, we adjust in size each time we
2342 compile a regexp, according to the number of registers it needs. */
2343
2344static fail_stack_type fail_stack;
2345
2346/* Size with which the following vectors are currently allocated.
2347 That is so we can make them bigger as needed,
4bb91c68 2348 but never make them smaller. */
fa9a63c5
RM
2349static int regs_allocated_size;
2350
66f0296e
SM
2351static re_char ** regstart, ** regend;
2352static re_char **best_regstart, **best_regend;
fa9a63c5
RM
2353
2354/* Make the register vectors big enough for NUM_REGS registers,
4bb91c68 2355 but don't make them smaller. */
fa9a63c5
RM
2356
2357static
2358regex_grow_registers (num_regs)
2359 int num_regs;
2360{
2361 if (num_regs > regs_allocated_size)
2362 {
66f0296e
SM
2363 RETALLOC_IF (regstart, num_regs, re_char *);
2364 RETALLOC_IF (regend, num_regs, re_char *);
2365 RETALLOC_IF (best_regstart, num_regs, re_char *);
2366 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
2367
2368 regs_allocated_size = num_regs;
2369 }
2370}
2371
2372#endif /* not MATCH_MAY_ALLOCATE */
2373\f
99633e97
SM
2374static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2375 compile_stack,
2376 regnum_t regnum));
2377
fa9a63c5
RM
2378/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2379 Returns one of error codes defined in `regex.h', or zero for success.
2380
2381 Assumes the `allocated' (and perhaps `buffer') and `translate'
2382 fields are set in BUFP on entry.
2383
2384 If it succeeds, results are put in BUFP (if it returns an error, the
2385 contents of BUFP are undefined):
2386 `buffer' is the compiled pattern;
2387 `syntax' is set to SYNTAX;
2388 `used' is set to the length of the compiled pattern;
2389 `fastmap_accurate' is zero;
2390 `re_nsub' is the number of subexpressions in PATTERN;
2391 `not_bol' and `not_eol' are zero;
5e69f11e 2392
c0f9ea08 2393 The `fastmap' field is neither examined nor set. */
fa9a63c5 2394
505bde11
SM
2395/* Insert the `jump' from the end of last alternative to "here".
2396 The space for the jump has already been allocated. */
2397#define FIXUP_ALT_JUMP() \
2398do { \
2399 if (fixup_alt_jump) \
2400 STORE_JUMP (jump, fixup_alt_jump, b); \
2401} while (0)
2402
2403
fa9a63c5
RM
2404/* Return, freeing storage we allocated. */
2405#define FREE_STACK_RETURN(value) \
b18215fc
RS
2406 do { \
2407 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2408 free (compile_stack.stack); \
2409 return value; \
2410 } while (0)
fa9a63c5
RM
2411
2412static reg_errcode_t
2413regex_compile (pattern, size, syntax, bufp)
66f0296e 2414 re_char *pattern;
4bb91c68 2415 size_t size;
fa9a63c5
RM
2416 reg_syntax_t syntax;
2417 struct re_pattern_buffer *bufp;
2418{
01618498
SM
2419 /* We fetch characters from PATTERN here. */
2420 register re_wchar_t c, c1;
5e69f11e 2421
fa9a63c5 2422 /* A random temporary spot in PATTERN. */
66f0296e 2423 re_char *p1;
fa9a63c5
RM
2424
2425 /* Points to the end of the buffer, where we should append. */
2426 register unsigned char *b;
5e69f11e 2427
fa9a63c5
RM
2428 /* Keeps track of unclosed groups. */
2429 compile_stack_type compile_stack;
2430
2431 /* Points to the current (ending) position in the pattern. */
22336245
RS
2432#ifdef AIX
2433 /* `const' makes AIX compiler fail. */
66f0296e 2434 unsigned char *p = pattern;
22336245 2435#else
66f0296e 2436 re_char *p = pattern;
22336245 2437#endif
66f0296e 2438 re_char *pend = pattern + size;
5e69f11e 2439
fa9a63c5 2440 /* How to translate the characters in the pattern. */
6676cb1c 2441 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2442
2443 /* Address of the count-byte of the most recently inserted `exactn'
2444 command. This makes it possible to tell if a new exact-match
2445 character can be added to that command or if the character requires
2446 a new `exactn' command. */
2447 unsigned char *pending_exact = 0;
2448
2449 /* Address of start of the most recently finished expression.
2450 This tells, e.g., postfix * where to find the start of its
2451 operand. Reset at the beginning of groups and alternatives. */
2452 unsigned char *laststart = 0;
2453
2454 /* Address of beginning of regexp, or inside of last group. */
2455 unsigned char *begalt;
2456
2457 /* Place in the uncompiled pattern (i.e., the {) to
2458 which to go back if the interval is invalid. */
66f0296e 2459 re_char *beg_interval;
5e69f11e 2460
fa9a63c5 2461 /* Address of the place where a forward jump should go to the end of
25fe55af 2462 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2463 last -- ends with a forward jump of this sort. */
2464 unsigned char *fixup_alt_jump = 0;
2465
2466 /* Counts open-groups as they are encountered. Remembered for the
2467 matching close-group on the compile stack, so the same register
2468 number is put in the stop_memory as the start_memory. */
2469 regnum_t regnum = 0;
2470
b18215fc
RS
2471 /* Work area for range table of charset. */
2472 struct range_table_work_area range_table_work;
2473
2d1675e4
SM
2474 /* If the object matched can contain multibyte characters. */
2475 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2476
fa9a63c5 2477#ifdef DEBUG
99633e97 2478 debug++;
fa9a63c5 2479 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 2480 if (debug > 0)
fa9a63c5
RM
2481 {
2482 unsigned debug_count;
5e69f11e 2483
fa9a63c5 2484 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2485 putchar (pattern[debug_count]);
fa9a63c5
RM
2486 putchar ('\n');
2487 }
2488#endif /* DEBUG */
2489
2490 /* Initialize the compile stack. */
2491 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2492 if (compile_stack.stack == NULL)
2493 return REG_ESPACE;
2494
2495 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2496 compile_stack.avail = 0;
2497
b18215fc
RS
2498 range_table_work.table = 0;
2499 range_table_work.allocated = 0;
2500
fa9a63c5
RM
2501 /* Initialize the pattern buffer. */
2502 bufp->syntax = syntax;
2503 bufp->fastmap_accurate = 0;
2504 bufp->not_bol = bufp->not_eol = 0;
2505
2506 /* Set `used' to zero, so that if we return an error, the pattern
2507 printer (for debugging) will think there's no pattern. We reset it
2508 at the end. */
2509 bufp->used = 0;
5e69f11e 2510
fa9a63c5 2511 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2512 bufp->re_nsub = 0;
fa9a63c5 2513
0b32bf0e 2514#if !defined emacs && !defined SYNTAX_TABLE
fa9a63c5
RM
2515 /* Initialize the syntax table. */
2516 init_syntax_once ();
2517#endif
2518
2519 if (bufp->allocated == 0)
2520 {
2521 if (bufp->buffer)
2522 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2523 enough space. This loses if buffer's address is bogus, but
2524 that is the user's responsibility. */
2525 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2526 }
fa9a63c5 2527 else
25fe55af
RS
2528 { /* Caller did not allocate a buffer. Do it for them. */
2529 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2530 }
fa9a63c5
RM
2531 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2532
2533 bufp->allocated = INIT_BUF_SIZE;
2534 }
2535
2536 begalt = b = bufp->buffer;
2537
2538 /* Loop through the uncompiled pattern until we're at the end. */
2539 while (p != pend)
2540 {
2541 PATFETCH (c);
2542
2543 switch (c)
25fe55af
RS
2544 {
2545 case '^':
2546 {
2547 if ( /* If at start of pattern, it's an operator. */
2548 p == pattern + 1
2549 /* If context independent, it's an operator. */
2550 || syntax & RE_CONTEXT_INDEP_ANCHORS
2551 /* Otherwise, depends on what's come before. */
2552 || at_begline_loc_p (pattern, p, syntax))
c0f9ea08 2553 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
25fe55af
RS
2554 else
2555 goto normal_char;
2556 }
2557 break;
2558
2559
2560 case '$':
2561 {
2562 if ( /* If at end of pattern, it's an operator. */
2563 p == pend
2564 /* If context independent, it's an operator. */
2565 || syntax & RE_CONTEXT_INDEP_ANCHORS
2566 /* Otherwise, depends on what's next. */
2567 || at_endline_loc_p (p, pend, syntax))
c0f9ea08 2568 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
25fe55af
RS
2569 else
2570 goto normal_char;
2571 }
2572 break;
fa9a63c5
RM
2573
2574
2575 case '+':
25fe55af
RS
2576 case '?':
2577 if ((syntax & RE_BK_PLUS_QM)
2578 || (syntax & RE_LIMITED_OPS))
2579 goto normal_char;
2580 handle_plus:
2581 case '*':
2582 /* If there is no previous pattern... */
2583 if (!laststart)
2584 {
2585 if (syntax & RE_CONTEXT_INVALID_OPS)
2586 FREE_STACK_RETURN (REG_BADRPT);
2587 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2588 goto normal_char;
2589 }
2590
2591 {
25fe55af 2592 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2593 boolean zero_times_ok = 0, many_times_ok = 0;
2594 boolean greedy = 1;
25fe55af
RS
2595
2596 /* If there is a sequence of repetition chars, collapse it
2597 down to just one (the right one). We can't combine
2598 interval operators with these because of, e.g., `a{2}*',
2599 which should only match an even number of `a's. */
2600
2601 for (;;)
2602 {
0b32bf0e 2603 if ((syntax & RE_FRUGAL)
1c8c6d39
DL
2604 && c == '?' && (zero_times_ok || many_times_ok))
2605 greedy = 0;
2606 else
2607 {
2608 zero_times_ok |= c != '+';
2609 many_times_ok |= c != '?';
2610 }
25fe55af
RS
2611
2612 if (p == pend)
2613 break;
ed0767d8
SM
2614 else if (*p == '*'
2615 || (!(syntax & RE_BK_PLUS_QM)
2616 && (*p == '+' || *p == '?')))
25fe55af 2617 ;
ed0767d8 2618 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2619 {
ed0767d8
SM
2620 if (p+1 == pend)
2621 FREE_STACK_RETURN (REG_EESCAPE);
2622 if (p[1] == '+' || p[1] == '?')
2623 PATFETCH (c); /* Gobble up the backslash. */
2624 else
2625 break;
25fe55af
RS
2626 }
2627 else
ed0767d8 2628 break;
25fe55af 2629 /* If we get here, we found another repeat character. */
ed0767d8
SM
2630 PATFETCH (c);
2631 }
25fe55af
RS
2632
2633 /* Star, etc. applied to an empty pattern is equivalent
2634 to an empty pattern. */
4e8a9132 2635 if (!laststart || laststart == b)
25fe55af
RS
2636 break;
2637
2638 /* Now we know whether or not zero matches is allowed
2639 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2640 if (greedy)
2641 {
99633e97 2642 if (many_times_ok)
4e8a9132
SM
2643 {
2644 boolean simple = skip_one_char (laststart) == b;
2645 unsigned int startoffset = 0;
f6a3f532 2646 re_opcode_t ofj =
01618498 2647 /* Check if the loop can match the empty string. */
f6a3f532
SM
2648 (simple || !analyse_first (laststart, b, NULL, 0)) ?
2649 on_failure_jump : on_failure_jump_loop;
4e8a9132
SM
2650 assert (skip_one_char (laststart) <= b);
2651
2652 if (!zero_times_ok && simple)
2653 { /* Since simple * loops can be made faster by using
2654 on_failure_keep_string_jump, we turn simple P+
2655 into PP* if P is simple. */
2656 unsigned char *p1, *p2;
2657 startoffset = b - laststart;
2658 GET_BUFFER_SPACE (startoffset);
2659 p1 = b; p2 = laststart;
2660 while (p2 < p1)
2661 *b++ = *p2++;
2662 zero_times_ok = 1;
99633e97 2663 }
4e8a9132
SM
2664
2665 GET_BUFFER_SPACE (6);
2666 if (!zero_times_ok)
2667 /* A + loop. */
f6a3f532 2668 STORE_JUMP (ofj, b, b + 6);
99633e97 2669 else
4e8a9132
SM
2670 /* Simple * loops can use on_failure_keep_string_jump
2671 depending on what follows. But since we don't know
2672 that yet, we leave the decision up to
2673 on_failure_jump_smart. */
f6a3f532 2674 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2675 laststart + startoffset, b + 6);
99633e97 2676 b += 3;
4e8a9132 2677 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2678 b += 3;
2679 }
2680 else
2681 {
4e8a9132
SM
2682 /* A simple ? pattern. */
2683 assert (zero_times_ok);
2684 GET_BUFFER_SPACE (3);
2685 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2686 b += 3;
2687 }
1c8c6d39
DL
2688 }
2689 else /* not greedy */
2690 { /* I wish the greedy and non-greedy cases could be merged. */
2691
0683b6fa 2692 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2693 if (many_times_ok)
2694 {
f6a3f532
SM
2695 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2696
505bde11 2697 /* The non-greedy multiple match looks like a repeat..until:
1c8c6d39 2698 we only need a conditional jump at the end of the loop */
f6a3f532
SM
2699 if (emptyp) BUF_PUSH (no_op);
2700 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2701 : on_failure_jump, b, laststart);
1c8c6d39
DL
2702 b += 3;
2703 if (zero_times_ok)
2704 {
2705 /* The repeat...until naturally matches one or more.
2706 To also match zero times, we need to first jump to
2707 the end of the loop (its conditional jump). */
1c8c6d39
DL
2708 INSERT_JUMP (jump, laststart, b);
2709 b += 3;
2710 }
2711 }
2712 else
2713 {
2714 /* non-greedy a?? */
1c8c6d39
DL
2715 INSERT_JUMP (jump, laststart, b + 3);
2716 b += 3;
2717 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2718 b += 3;
2719 }
2720 }
2721 }
4e8a9132 2722 pending_exact = 0;
fa9a63c5
RM
2723 break;
2724
2725
2726 case '.':
25fe55af
RS
2727 laststart = b;
2728 BUF_PUSH (anychar);
2729 break;
fa9a63c5
RM
2730
2731
25fe55af
RS
2732 case '[':
2733 {
b18215fc 2734 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2735
25fe55af 2736 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2737
25fe55af
RS
2738 /* Ensure that we have enough space to push a charset: the
2739 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2740 GET_BUFFER_SPACE (34);
2741
25fe55af 2742 laststart = b;
e318085a 2743
25fe55af
RS
2744 /* We test `*p == '^' twice, instead of using an if
2745 statement, so we only need one BUF_PUSH. */
2746 BUF_PUSH (*p == '^' ? charset_not : charset);
2747 if (*p == '^')
2748 p++;
e318085a 2749
25fe55af
RS
2750 /* Remember the first position in the bracket expression. */
2751 p1 = p;
e318085a 2752
25fe55af
RS
2753 /* Push the number of bytes in the bitmap. */
2754 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2755
25fe55af
RS
2756 /* Clear the whole map. */
2757 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2758
25fe55af
RS
2759 /* charset_not matches newline according to a syntax bit. */
2760 if ((re_opcode_t) b[-2] == charset_not
2761 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2762 SET_LIST_BIT ('\n');
fa9a63c5 2763
25fe55af
RS
2764 /* Read in characters and ranges, setting map bits. */
2765 for (;;)
2766 {
b18215fc 2767 boolean escaped_char = false;
2d1675e4 2768 const unsigned char *p2 = p;
e318085a 2769
25fe55af 2770 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2771
36595814
SM
2772 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2773 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2774 So the translation is done later in a loop. Example:
2775 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
25fe55af 2776 PATFETCH (c);
e318085a 2777
25fe55af
RS
2778 /* \ might escape characters inside [...] and [^...]. */
2779 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2780 {
2781 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2782
2783 PATFETCH (c);
b18215fc 2784 escaped_char = true;
25fe55af 2785 }
b18215fc
RS
2786 else
2787 {
657fcfbd
RS
2788 /* Could be the end of the bracket expression. If it's
2789 not (i.e., when the bracket expression is `[]' so
2790 far), the ']' character bit gets set way below. */
2d1675e4 2791 if (c == ']' && p2 != p1)
657fcfbd 2792 break;
25fe55af 2793 }
b18215fc 2794
b18215fc
RS
2795 /* What should we do for the character which is
2796 greater than 0x7F, but not BASE_LEADING_CODE_P?
2797 XXX */
2798
25fe55af
RS
2799 /* See if we're at the beginning of a possible character
2800 class. */
b18215fc 2801
2d1675e4
SM
2802 if (!escaped_char &&
2803 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2804 {
2805 /* Leave room for the null. */
14473664 2806 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2807 const unsigned char *class_beg;
b18215fc 2808
25fe55af
RS
2809 PATFETCH (c);
2810 c1 = 0;
ed0767d8 2811 class_beg = p;
b18215fc 2812
25fe55af
RS
2813 /* If pattern is `[[:'. */
2814 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2815
25fe55af
RS
2816 for (;;)
2817 {
14473664
SM
2818 PATFETCH (c);
2819 if ((c == ':' && *p == ']') || p == pend)
2820 break;
2821 if (c1 < CHAR_CLASS_MAX_LENGTH)
2822 str[c1++] = c;
2823 else
2824 /* This is in any case an invalid class name. */
2825 str[0] = '\0';
25fe55af
RS
2826 }
2827 str[c1] = '\0';
b18215fc
RS
2828
2829 /* If isn't a word bracketed by `[:' and `:]':
2830 undo the ending character, the letters, and
2831 leave the leading `:' and `[' (but set bits for
2832 them). */
25fe55af
RS
2833 if (c == ':' && *p == ']')
2834 {
36595814 2835 re_wchar_t ch;
14473664
SM
2836 re_wctype_t cc;
2837
2838 cc = re_wctype (str);
2839
2840 if (cc == 0)
fa9a63c5
RM
2841 FREE_STACK_RETURN (REG_ECTYPE);
2842
14473664
SM
2843 /* Throw away the ] at the end of the character
2844 class. */
2845 PATFETCH (c);
fa9a63c5 2846
14473664 2847 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2848
96cc36cc
RS
2849 /* Most character classes in a multibyte match
2850 just set a flag. Exceptions are is_blank,
2851 is_digit, is_cntrl, and is_xdigit, since
2852 they can only match ASCII characters. We
14473664
SM
2853 don't need to handle them for multibyte.
2854 They are distinguished by a negative wctype. */
96cc36cc 2855
2d1675e4 2856 if (multibyte)
14473664
SM
2857 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2858 re_wctype_to_bit (cc));
96cc36cc 2859
14473664 2860 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
25fe55af 2861 {
7ae68633 2862 int translated = TRANSLATE (ch);
14473664 2863 if (re_iswctype (btowc (ch), cc))
96cc36cc 2864 SET_LIST_BIT (translated);
25fe55af 2865 }
b18215fc
RS
2866
2867 /* Repeat the loop. */
2868 continue;
25fe55af
RS
2869 }
2870 else
2871 {
ed0767d8
SM
2872 /* Go back to right after the "[:". */
2873 p = class_beg;
25fe55af 2874 SET_LIST_BIT ('[');
b18215fc
RS
2875
2876 /* Because the `:' may starts the range, we
2877 can't simply set bit and repeat the loop.
25fe55af 2878 Instead, just set it to C and handle below. */
b18215fc 2879 c = ':';
25fe55af
RS
2880 }
2881 }
b18215fc
RS
2882
2883 if (p < pend && p[0] == '-' && p[1] != ']')
2884 {
2885
2886 /* Discard the `-'. */
2887 PATFETCH (c1);
2888
2889 /* Fetch the character which ends the range. */
2890 PATFETCH (c1);
b18215fc 2891
b54f61ed 2892 if (SINGLE_BYTE_CHAR_P (c))
e934739e 2893 {
b54f61ed
KH
2894 if (! SINGLE_BYTE_CHAR_P (c1))
2895 {
3ff2446d
KH
2896 /* Handle a range starting with a
2897 character of less than 256, and ending
2898 with a character of not less than 256.
2899 Split that into two ranges, the low one
2900 ending at 0377, and the high one
2901 starting at the smallest character in
2902 the charset of C1 and ending at C1. */
b54f61ed 2903 int charset = CHAR_CHARSET (c1);
36595814
SM
2904 re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
2905
b54f61ed
KH
2906 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2907 c2, c1);
333526e0 2908 c1 = 0377;
b54f61ed 2909 }
e934739e
RS
2910 }
2911 else if (!SAME_CHARSET_P (c, c1))
b18215fc 2912 FREE_STACK_RETURN (REG_ERANGE);
e318085a 2913 }
25fe55af 2914 else
b18215fc
RS
2915 /* Range from C to C. */
2916 c1 = c;
2917
2918 /* Set the range ... */
2919 if (SINGLE_BYTE_CHAR_P (c))
2920 /* ... into bitmap. */
25fe55af 2921 {
01618498 2922 re_wchar_t this_char;
36595814 2923 re_wchar_t range_start = c, range_end = c1;
b18215fc
RS
2924
2925 /* If the start is after the end, the range is empty. */
2926 if (range_start > range_end)
2927 {
2928 if (syntax & RE_NO_EMPTY_RANGES)
2929 FREE_STACK_RETURN (REG_ERANGE);
2930 /* Else, repeat the loop. */
2931 }
2932 else
2933 {
2934 for (this_char = range_start; this_char <= range_end;
2935 this_char++)
2936 SET_LIST_BIT (TRANSLATE (this_char));
e934739e 2937 }
25fe55af 2938 }
e318085a 2939 else
b18215fc
RS
2940 /* ... into range table. */
2941 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
2942 }
2943
25fe55af
RS
2944 /* Discard any (non)matching list bytes that are all 0 at the
2945 end of the map. Decrease the map-length byte too. */
2946 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2947 b[-1]--;
2948 b += b[-1];
fa9a63c5 2949
96cc36cc
RS
2950 /* Build real range table from work area. */
2951 if (RANGE_TABLE_WORK_USED (range_table_work)
2952 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
2953 {
2954 int i;
2955 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2956
b18215fc 2957 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
2958 bytes for flags, two for COUNT, and three bytes for
2959 each character. */
2960 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 2961
b18215fc
RS
2962 /* Indicate the existence of range table. */
2963 laststart[1] |= 0x80;
fa9a63c5 2964
96cc36cc
RS
2965 /* Store the character class flag bits into the range table.
2966 If not in emacs, these flag bits are always 0. */
2967 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2968 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2969
b18215fc
RS
2970 STORE_NUMBER_AND_INCR (b, used / 2);
2971 for (i = 0; i < used; i++)
2972 STORE_CHARACTER_AND_INCR
2973 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2974 }
25fe55af
RS
2975 }
2976 break;
fa9a63c5
RM
2977
2978
b18215fc 2979 case '(':
25fe55af
RS
2980 if (syntax & RE_NO_BK_PARENS)
2981 goto handle_open;
2982 else
2983 goto normal_char;
fa9a63c5
RM
2984
2985
25fe55af
RS
2986 case ')':
2987 if (syntax & RE_NO_BK_PARENS)
2988 goto handle_close;
2989 else
2990 goto normal_char;
e318085a
RS
2991
2992
25fe55af
RS
2993 case '\n':
2994 if (syntax & RE_NEWLINE_ALT)
2995 goto handle_alt;
2996 else
2997 goto normal_char;
e318085a
RS
2998
2999
b18215fc 3000 case '|':
25fe55af
RS
3001 if (syntax & RE_NO_BK_VBAR)
3002 goto handle_alt;
3003 else
3004 goto normal_char;
3005
3006
3007 case '{':
3008 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3009 goto handle_interval;
3010 else
3011 goto normal_char;
3012
3013
3014 case '\\':
3015 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3016
3017 /* Do not translate the character after the \, so that we can
3018 distinguish, e.g., \B from \b, even if we normally would
3019 translate, e.g., B to b. */
36595814 3020 PATFETCH (c);
25fe55af
RS
3021
3022 switch (c)
3023 {
3024 case '(':
3025 if (syntax & RE_NO_BK_PARENS)
3026 goto normal_backslash;
3027
3028 handle_open:
505bde11
SM
3029 {
3030 int shy = 0;
3031 if (p+1 < pend)
3032 {
3033 /* Look for a special (?...) construct */
ed0767d8 3034 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 3035 {
ed0767d8 3036 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
3037 PATFETCH (c);
3038 switch (c)
3039 {
3040 case ':': shy = 1; break;
3041 default:
3042 /* Only (?:...) is supported right now. */
3043 FREE_STACK_RETURN (REG_BADPAT);
3044 }
3045 }
505bde11
SM
3046 }
3047
3048 if (!shy)
3049 {
3050 bufp->re_nsub++;
3051 regnum++;
3052 }
25fe55af 3053
99633e97
SM
3054 if (COMPILE_STACK_FULL)
3055 {
3056 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3057 compile_stack_elt_t);
3058 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 3059
99633e97
SM
3060 compile_stack.size <<= 1;
3061 }
25fe55af 3062
99633e97
SM
3063 /* These are the values to restore when we hit end of this
3064 group. They are all relative offsets, so that if the
3065 whole pattern moves because of realloc, they will still
3066 be valid. */
3067 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3068 COMPILE_STACK_TOP.fixup_alt_jump
3069 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3070 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3071 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
3072
3073 /* Do not push a
3074 start_memory for groups beyond the last one we can
3075 represent in the compiled pattern. */
3076 if (regnum <= MAX_REGNUM && !shy)
3077 BUF_PUSH_2 (start_memory, regnum);
3078
3079 compile_stack.avail++;
3080
3081 fixup_alt_jump = 0;
3082 laststart = 0;
3083 begalt = b;
3084 /* If we've reached MAX_REGNUM groups, then this open
3085 won't actually generate any code, so we'll have to
3086 clear pending_exact explicitly. */
3087 pending_exact = 0;
3088 break;
505bde11 3089 }
25fe55af
RS
3090
3091 case ')':
3092 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3093
3094 if (COMPILE_STACK_EMPTY)
505bde11
SM
3095 {
3096 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3097 goto normal_backslash;
3098 else
3099 FREE_STACK_RETURN (REG_ERPAREN);
3100 }
25fe55af
RS
3101
3102 handle_close:
505bde11 3103 FIXUP_ALT_JUMP ();
25fe55af
RS
3104
3105 /* See similar code for backslashed left paren above. */
3106 if (COMPILE_STACK_EMPTY)
505bde11
SM
3107 {
3108 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3109 goto normal_char;
3110 else
3111 FREE_STACK_RETURN (REG_ERPAREN);
3112 }
25fe55af
RS
3113
3114 /* Since we just checked for an empty stack above, this
3115 ``can't happen''. */
3116 assert (compile_stack.avail != 0);
3117 {
3118 /* We don't just want to restore into `regnum', because
3119 later groups should continue to be numbered higher,
3120 as in `(ab)c(de)' -- the second group is #2. */
3121 regnum_t this_group_regnum;
3122
3123 compile_stack.avail--;
3124 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3125 fixup_alt_jump
3126 = COMPILE_STACK_TOP.fixup_alt_jump
3127 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3128 : 0;
3129 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3130 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
3131 /* If we've reached MAX_REGNUM groups, then this open
3132 won't actually generate any code, so we'll have to
3133 clear pending_exact explicitly. */
3134 pending_exact = 0;
e318085a 3135
25fe55af
RS
3136 /* We're at the end of the group, so now we know how many
3137 groups were inside this one. */
505bde11
SM
3138 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
3139 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
3140 }
3141 break;
3142
3143
3144 case '|': /* `\|'. */
3145 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3146 goto normal_backslash;
3147 handle_alt:
3148 if (syntax & RE_LIMITED_OPS)
3149 goto normal_char;
3150
3151 /* Insert before the previous alternative a jump which
3152 jumps to this alternative if the former fails. */
3153 GET_BUFFER_SPACE (3);
3154 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3155 pending_exact = 0;
3156 b += 3;
3157
3158 /* The alternative before this one has a jump after it
3159 which gets executed if it gets matched. Adjust that
3160 jump so it will jump to this alternative's analogous
3161 jump (put in below, which in turn will jump to the next
3162 (if any) alternative's such jump, etc.). The last such
3163 jump jumps to the correct final destination. A picture:
3164 _____ _____
3165 | | | |
3166 | v | v
3167 a | b | c
3168
3169 If we are at `b', then fixup_alt_jump right now points to a
3170 three-byte space after `a'. We'll put in the jump, set
3171 fixup_alt_jump to right after `b', and leave behind three
3172 bytes which we'll fill in when we get to after `c'. */
3173
505bde11 3174 FIXUP_ALT_JUMP ();
25fe55af
RS
3175
3176 /* Mark and leave space for a jump after this alternative,
3177 to be filled in later either by next alternative or
3178 when know we're at the end of a series of alternatives. */
3179 fixup_alt_jump = b;
3180 GET_BUFFER_SPACE (3);
3181 b += 3;
3182
3183 laststart = 0;
3184 begalt = b;
3185 break;
3186
3187
3188 case '{':
3189 /* If \{ is a literal. */
3190 if (!(syntax & RE_INTERVALS)
3191 /* If we're at `\{' and it's not the open-interval
3192 operator. */
4bb91c68 3193 || (syntax & RE_NO_BK_BRACES))
25fe55af
RS
3194 goto normal_backslash;
3195
3196 handle_interval:
3197 {
3198 /* If got here, then the syntax allows intervals. */
3199
3200 /* At least (most) this many matches must be made. */
99633e97 3201 int lower_bound = 0, upper_bound = -1;
25fe55af 3202
ed0767d8 3203 beg_interval = p;
25fe55af
RS
3204
3205 if (p == pend)
4bb91c68 3206 FREE_STACK_RETURN (REG_EBRACE);
25fe55af
RS
3207
3208 GET_UNSIGNED_NUMBER (lower_bound);
3209
3210 if (c == ',')
ed0767d8 3211 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
3212 else
3213 /* Interval such as `{1}' => match exactly once. */
3214 upper_bound = lower_bound;
3215
3216 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 3217 || (upper_bound >= 0 && lower_bound > upper_bound))
4bb91c68 3218 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3219
3220 if (!(syntax & RE_NO_BK_BRACES))
3221 {
4bb91c68
SM
3222 if (c != '\\')
3223 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3224
3225 PATFETCH (c);
3226 }
3227
3228 if (c != '}')
4bb91c68 3229 FREE_STACK_RETURN (REG_BADBR);
25fe55af
RS
3230
3231 /* We just parsed a valid interval. */
3232
3233 /* If it's invalid to have no preceding re. */
3234 if (!laststart)
3235 {
3236 if (syntax & RE_CONTEXT_INVALID_OPS)
3237 FREE_STACK_RETURN (REG_BADRPT);
3238 else if (syntax & RE_CONTEXT_INDEP_OPS)
3239 laststart = b;
3240 else
3241 goto unfetch_interval;
3242 }
3243
25fe55af 3244 if (upper_bound == 0)
ed0767d8
SM
3245 /* If the upper bound is zero, just drop the sub pattern
3246 altogether. */
3247 b = laststart;
3248 else if (lower_bound == 1 && upper_bound == 1)
3249 /* Just match it once: nothing to do here. */
3250 ;
25fe55af
RS
3251
3252 /* Otherwise, we have a nontrivial interval. When
3253 we're all done, the pattern will look like:
3254 set_number_at <jump count> <upper bound>
3255 set_number_at <succeed_n count> <lower bound>
3256 succeed_n <after jump addr> <succeed_n count>
3257 <body of loop>
3258 jump_n <succeed_n addr> <jump count>
3259 (The upper bound and `jump_n' are omitted if
3260 `upper_bound' is 1, though.) */
3261 else
3262 { /* If the upper bound is > 1, we need to insert
3263 more at the end of the loop. */
ed0767d8
SM
3264 unsigned int nbytes = (upper_bound < 0 ? 3
3265 : upper_bound > 1 ? 5 : 0);
3266 unsigned int startoffset = 0;
3267
3268 GET_BUFFER_SPACE (20); /* We might use less. */
3269
3270 if (lower_bound == 0)
3271 {
3272 /* A succeed_n that starts with 0 is really a
3273 a simple on_failure_jump_loop. */
3274 INSERT_JUMP (on_failure_jump_loop, laststart,
3275 b + 3 + nbytes);
3276 b += 3;
3277 }
3278 else
3279 {
3280 /* Initialize lower bound of the `succeed_n', even
3281 though it will be set during matching by its
3282 attendant `set_number_at' (inserted next),
3283 because `re_compile_fastmap' needs to know.
3284 Jump to the `jump_n' we might insert below. */
3285 INSERT_JUMP2 (succeed_n, laststart,
3286 b + 5 + nbytes,
3287 lower_bound);
3288 b += 5;
3289
3290 /* Code to initialize the lower bound. Insert
3291 before the `succeed_n'. The `5' is the last two
3292 bytes of this `set_number_at', plus 3 bytes of
3293 the following `succeed_n'. */
3294 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3295 b += 5;
3296 startoffset += 5;
3297 }
3298
3299 if (upper_bound < 0)
3300 {
3301 /* A negative upper bound stands for infinity,
3302 in which case it degenerates to a plain jump. */
3303 STORE_JUMP (jump, b, laststart + startoffset);
3304 b += 3;
3305 }
3306 else if (upper_bound > 1)
25fe55af
RS
3307 { /* More than one repetition is allowed, so
3308 append a backward jump to the `succeed_n'
3309 that starts this interval.
3310
3311 When we've reached this during matching,
3312 we'll have matched the interval once, so
3313 jump back only `upper_bound - 1' times. */
ed0767d8 3314 STORE_JUMP2 (jump_n, b, laststart + startoffset,
25fe55af
RS
3315 upper_bound - 1);
3316 b += 5;
3317
3318 /* The location we want to set is the second
3319 parameter of the `jump_n'; that is `b-2' as
3320 an absolute address. `laststart' will be
3321 the `set_number_at' we're about to insert;
3322 `laststart+3' the number to set, the source
3323 for the relative address. But we are
3324 inserting into the middle of the pattern --
3325 so everything is getting moved up by 5.
3326 Conclusion: (b - 2) - (laststart + 3) + 5,
3327 i.e., b - laststart.
3328
3329 We insert this at the beginning of the loop
3330 so that if we fail during matching, we'll
3331 reinitialize the bounds. */
3332 insert_op2 (set_number_at, laststart, b - laststart,
3333 upper_bound - 1, b);
3334 b += 5;
3335 }
3336 }
3337 pending_exact = 0;
3338 beg_interval = NULL;
3339 }
3340 break;
3341
3342 unfetch_interval:
3343 /* If an invalid interval, match the characters as literals. */
3344 assert (beg_interval);
3345 p = beg_interval;
3346 beg_interval = NULL;
3347
3348 /* normal_char and normal_backslash need `c'. */
ed0767d8 3349 c = '{';
25fe55af
RS
3350
3351 if (!(syntax & RE_NO_BK_BRACES))
3352 {
ed0767d8
SM
3353 assert (p > pattern && p[-1] == '\\');
3354 goto normal_backslash;
25fe55af 3355 }
ed0767d8
SM
3356 else
3357 goto normal_char;
e318085a 3358
b18215fc 3359#ifdef emacs
25fe55af
RS
3360 /* There is no way to specify the before_dot and after_dot
3361 operators. rms says this is ok. --karl */
3362 case '=':
3363 BUF_PUSH (at_dot);
3364 break;
3365
3366 case 's':
3367 laststart = b;
3368 PATFETCH (c);
3369 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3370 break;
3371
3372 case 'S':
3373 laststart = b;
3374 PATFETCH (c);
3375 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3376 break;
b18215fc
RS
3377
3378 case 'c':
3379 laststart = b;
36595814 3380 PATFETCH (c);
b18215fc
RS
3381 BUF_PUSH_2 (categoryspec, c);
3382 break;
e318085a 3383
b18215fc
RS
3384 case 'C':
3385 laststart = b;
36595814 3386 PATFETCH (c);
b18215fc
RS
3387 BUF_PUSH_2 (notcategoryspec, c);
3388 break;
3389#endif /* emacs */
e318085a 3390
e318085a 3391
25fe55af 3392 case 'w':
4bb91c68
SM
3393 if (syntax & RE_NO_GNU_OPS)
3394 goto normal_char;
25fe55af 3395 laststart = b;
1fb352e0 3396 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 3397 break;
e318085a 3398
e318085a 3399
25fe55af 3400 case 'W':
4bb91c68
SM
3401 if (syntax & RE_NO_GNU_OPS)
3402 goto normal_char;
25fe55af 3403 laststart = b;
1fb352e0 3404 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 3405 break;
e318085a
RS
3406
3407
25fe55af 3408 case '<':
4bb91c68
SM
3409 if (syntax & RE_NO_GNU_OPS)
3410 goto normal_char;
25fe55af
RS
3411 BUF_PUSH (wordbeg);
3412 break;
e318085a 3413
25fe55af 3414 case '>':
4bb91c68
SM
3415 if (syntax & RE_NO_GNU_OPS)
3416 goto normal_char;
25fe55af
RS
3417 BUF_PUSH (wordend);
3418 break;
e318085a 3419
25fe55af 3420 case 'b':
4bb91c68
SM
3421 if (syntax & RE_NO_GNU_OPS)
3422 goto normal_char;
25fe55af
RS
3423 BUF_PUSH (wordbound);
3424 break;
e318085a 3425
25fe55af 3426 case 'B':
4bb91c68
SM
3427 if (syntax & RE_NO_GNU_OPS)
3428 goto normal_char;
25fe55af
RS
3429 BUF_PUSH (notwordbound);
3430 break;
fa9a63c5 3431
25fe55af 3432 case '`':
4bb91c68
SM
3433 if (syntax & RE_NO_GNU_OPS)
3434 goto normal_char;
25fe55af
RS
3435 BUF_PUSH (begbuf);
3436 break;
e318085a 3437
25fe55af 3438 case '\'':
4bb91c68
SM
3439 if (syntax & RE_NO_GNU_OPS)
3440 goto normal_char;
25fe55af
RS
3441 BUF_PUSH (endbuf);
3442 break;
e318085a 3443
25fe55af
RS
3444 case '1': case '2': case '3': case '4': case '5':
3445 case '6': case '7': case '8': case '9':
0cdd06f8
SM
3446 {
3447 regnum_t reg;
e318085a 3448
0cdd06f8
SM
3449 if (syntax & RE_NO_BK_REFS)
3450 goto normal_backslash;
e318085a 3451
0cdd06f8 3452 reg = c - '0';
e318085a 3453
0cdd06f8
SM
3454 /* Can't back reference to a subexpression before its end. */
3455 if (reg > regnum || group_in_compile_stack (compile_stack, reg))
3456 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3457
0cdd06f8
SM
3458 laststart = b;
3459 BUF_PUSH_2 (duplicate, reg);
3460 }
25fe55af 3461 break;
e318085a 3462
e318085a 3463
25fe55af
RS
3464 case '+':
3465 case '?':
3466 if (syntax & RE_BK_PLUS_QM)
3467 goto handle_plus;
3468 else
3469 goto normal_backslash;
3470
3471 default:
3472 normal_backslash:
3473 /* You might think it would be useful for \ to mean
3474 not to translate; but if we don't translate it
4bb91c68 3475 it will never match anything. */
25fe55af
RS
3476 goto normal_char;
3477 }
3478 break;
fa9a63c5
RM
3479
3480
3481 default:
25fe55af 3482 /* Expects the character in `c'. */
fa9a63c5 3483 normal_char:
36595814 3484 /* If no exactn currently being built. */
25fe55af 3485 if (!pending_exact
fa9a63c5 3486
25fe55af
RS
3487 /* If last exactn not at current position. */
3488 || pending_exact + *pending_exact + 1 != b
5e69f11e 3489
25fe55af 3490 /* We have only one byte following the exactn for the count. */
2d1675e4 3491 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3492
25fe55af 3493 /* If followed by a repetition operator. */
9d99031f 3494 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3495 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3496 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3497 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3498 || ((syntax & RE_INTERVALS)
25fe55af 3499 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3500 ? p != pend && *p == '{'
3501 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3502 {
3503 /* Start building a new exactn. */
5e69f11e 3504
25fe55af 3505 laststart = b;
fa9a63c5
RM
3506
3507 BUF_PUSH_2 (exactn, 0);
3508 pending_exact = b - 1;
25fe55af 3509 }
5e69f11e 3510
2d1675e4
SM
3511 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3512 {
e0277a47
KH
3513 int len;
3514
36595814 3515 c = TRANSLATE (c);
e0277a47
KH
3516 if (multibyte)
3517 len = CHAR_STRING (c, b);
3518 else
3519 *b = c, len = 1;
2d1675e4
SM
3520 b += len;
3521 (*pending_exact) += len;
3522 }
3523
fa9a63c5 3524 break;
25fe55af 3525 } /* switch (c) */
fa9a63c5
RM
3526 } /* while p != pend */
3527
5e69f11e 3528
fa9a63c5 3529 /* Through the pattern now. */
5e69f11e 3530
505bde11 3531 FIXUP_ALT_JUMP ();
fa9a63c5 3532
5e69f11e 3533 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3534 FREE_STACK_RETURN (REG_EPAREN);
3535
3536 /* If we don't want backtracking, force success
3537 the first time we reach the end of the compiled pattern. */
3538 if (syntax & RE_NO_POSIX_BACKTRACKING)
3539 BUF_PUSH (succeed);
3540
3541 free (compile_stack.stack);
3542
3543 /* We have succeeded; set the length of the buffer. */
3544 bufp->used = b - bufp->buffer;
3545
3546#ifdef DEBUG
99633e97 3547 if (debug > 0)
fa9a63c5 3548 {
505bde11 3549 re_compile_fastmap (bufp);
fa9a63c5
RM
3550 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3551 print_compiled_pattern (bufp);
3552 }
99633e97 3553 debug--;
fa9a63c5
RM
3554#endif /* DEBUG */
3555
3556#ifndef MATCH_MAY_ALLOCATE
3557 /* Initialize the failure stack to the largest possible stack. This
3558 isn't necessary unless we're trying to avoid calling alloca in
3559 the search and match routines. */
3560 {
3561 int num_regs = bufp->re_nsub + 1;
3562
320a2a73 3563 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3564 {
a26f4ccd 3565 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3566
fa9a63c5
RM
3567 if (! fail_stack.stack)
3568 fail_stack.stack
5e69f11e 3569 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3570 * sizeof (fail_stack_elt_t));
3571 else
3572 fail_stack.stack
3573 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3574 (fail_stack.size
3575 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3576 }
3577
3578 regex_grow_registers (num_regs);
3579 }
3580#endif /* not MATCH_MAY_ALLOCATE */
3581
3582 return REG_NOERROR;
3583} /* regex_compile */
3584\f
3585/* Subroutines for `regex_compile'. */
3586
25fe55af 3587/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3588
3589static void
3590store_op1 (op, loc, arg)
3591 re_opcode_t op;
3592 unsigned char *loc;
3593 int arg;
3594{
3595 *loc = (unsigned char) op;
3596 STORE_NUMBER (loc + 1, arg);
3597}
3598
3599
3600/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3601
3602static void
3603store_op2 (op, loc, arg1, arg2)
3604 re_opcode_t op;
3605 unsigned char *loc;
3606 int arg1, arg2;
3607{
3608 *loc = (unsigned char) op;
3609 STORE_NUMBER (loc + 1, arg1);
3610 STORE_NUMBER (loc + 3, arg2);
3611}
3612
3613
3614/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3615 for OP followed by two-byte integer parameter ARG. */
3616
3617static void
3618insert_op1 (op, loc, arg, end)
3619 re_opcode_t op;
3620 unsigned char *loc;
3621 int arg;
5e69f11e 3622 unsigned char *end;
fa9a63c5
RM
3623{
3624 register unsigned char *pfrom = end;
3625 register unsigned char *pto = end + 3;
3626
3627 while (pfrom != loc)
3628 *--pto = *--pfrom;
5e69f11e 3629
fa9a63c5
RM
3630 store_op1 (op, loc, arg);
3631}
3632
3633
3634/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3635
3636static void
3637insert_op2 (op, loc, arg1, arg2, end)
3638 re_opcode_t op;
3639 unsigned char *loc;
3640 int arg1, arg2;
5e69f11e 3641 unsigned char *end;
fa9a63c5
RM
3642{
3643 register unsigned char *pfrom = end;
3644 register unsigned char *pto = end + 5;
3645
3646 while (pfrom != loc)
3647 *--pto = *--pfrom;
5e69f11e 3648
fa9a63c5
RM
3649 store_op2 (op, loc, arg1, arg2);
3650}
3651
3652
3653/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3654 after an alternative or a begin-subexpression. We assume there is at
3655 least one character before the ^. */
3656
3657static boolean
3658at_begline_loc_p (pattern, p, syntax)
01618498 3659 re_char *pattern, *p;
fa9a63c5
RM
3660 reg_syntax_t syntax;
3661{
01618498 3662 re_char *prev = p - 2;
fa9a63c5 3663 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3664
fa9a63c5
RM
3665 return
3666 /* After a subexpression? */
3667 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3668 /* After an alternative? */
d2af47df
SM
3669 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3670 /* After a shy subexpression? */
3671 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3672 && prev[-1] == '?' && prev[-2] == '('
3673 && (syntax & RE_NO_BK_PARENS
3674 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3675}
3676
3677
3678/* The dual of at_begline_loc_p. This one is for $. We assume there is
3679 at least one character after the $, i.e., `P < PEND'. */
3680
3681static boolean
3682at_endline_loc_p (p, pend, syntax)
01618498 3683 re_char *p, *pend;
99633e97 3684 reg_syntax_t syntax;
fa9a63c5 3685{
01618498 3686 re_char *next = p;
fa9a63c5 3687 boolean next_backslash = *next == '\\';
01618498 3688 re_char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3689
fa9a63c5
RM
3690 return
3691 /* Before a subexpression? */
3692 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3693 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3694 /* Before an alternative? */
3695 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3696 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3697}
3698
3699
5e69f11e 3700/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3701 false if it's not. */
3702
3703static boolean
3704group_in_compile_stack (compile_stack, regnum)
3705 compile_stack_type compile_stack;
3706 regnum_t regnum;
3707{
3708 int this_element;
3709
5e69f11e
RM
3710 for (this_element = compile_stack.avail - 1;
3711 this_element >= 0;
fa9a63c5
RM
3712 this_element--)
3713 if (compile_stack.stack[this_element].regnum == regnum)
3714 return true;
3715
3716 return false;
3717}
fa9a63c5 3718\f
f6a3f532
SM
3719/* analyse_first.
3720 If fastmap is non-NULL, go through the pattern and fill fastmap
3721 with all the possible leading chars. If fastmap is NULL, don't
3722 bother filling it up (obviously) and only return whether the
3723 pattern could potentially match the empty string.
3724
3725 Return 1 if p..pend might match the empty string.
3726 Return 0 if p..pend matches at least one char.
01618498 3727 Return -1 if fastmap was not updated accurately. */
f6a3f532
SM
3728
3729static int
3730analyse_first (p, pend, fastmap, multibyte)
01618498 3731 re_char *p, *pend;
f6a3f532
SM
3732 char *fastmap;
3733 const int multibyte;
fa9a63c5 3734{
505bde11 3735 int j, k;
1fb352e0 3736 boolean not;
fa9a63c5 3737
b18215fc 3738 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3739 flag is set true. */
b18215fc
RS
3740 boolean match_any_multibyte_characters = false;
3741
f6a3f532 3742 assert (p);
5e69f11e 3743
505bde11
SM
3744 /* The loop below works as follows:
3745 - It has a working-list kept in the PATTERN_STACK and which basically
3746 starts by only containing a pointer to the first operation.
3747 - If the opcode we're looking at is a match against some set of
3748 chars, then we add those chars to the fastmap and go on to the
3749 next work element from the worklist (done via `break').
3750 - If the opcode is a control operator on the other hand, we either
3751 ignore it (if it's meaningless at this point, such as `start_memory')
3752 or execute it (if it's a jump). If the jump has several destinations
3753 (i.e. `on_failure_jump'), then we push the other destination onto the
3754 worklist.
3755 We guarantee termination by ignoring backward jumps (more or less),
3756 so that `p' is monotonically increasing. More to the point, we
3757 never set `p' (or push) anything `<= p1'. */
3758
01618498 3759 while (p < pend)
fa9a63c5 3760 {
505bde11
SM
3761 /* `p1' is used as a marker of how far back a `on_failure_jump'
3762 can go without being ignored. It is normally equal to `p'
3763 (which prevents any backward `on_failure_jump') except right
3764 after a plain `jump', to allow patterns such as:
3765 0: jump 10
3766 3..9: <body>
3767 10: on_failure_jump 3
3768 as used for the *? operator. */
01618498 3769 re_char *p1 = p;
5e69f11e 3770
fa9a63c5
RM
3771 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3772 {
f6a3f532 3773 case succeed:
01618498 3774 return 1;
f6a3f532 3775 continue;
fa9a63c5 3776
fa9a63c5 3777 case duplicate:
505bde11
SM
3778 /* If the first character has to match a backreference, that means
3779 that the group was empty (since it already matched). Since this
3780 is the only case that interests us here, we can assume that the
3781 backreference must match the empty string. */
3782 p++;
3783 continue;
fa9a63c5
RM
3784
3785
3786 /* Following are the cases which match a character. These end
25fe55af 3787 with `break'. */
fa9a63c5
RM
3788
3789 case exactn:
e0277a47
KH
3790 if (fastmap)
3791 {
3792 int c = RE_STRING_CHAR (p + 1, pend - p);
3793
3794 if (SINGLE_BYTE_CHAR_P (c))
3795 fastmap[c] = 1;
3796 else
3797 fastmap[p[1]] = 1;
3798 }
fa9a63c5
RM
3799 break;
3800
3801
1fb352e0
SM
3802 case anychar:
3803 /* We could put all the chars except for \n (and maybe \0)
3804 but we don't bother since it is generally not worth it. */
f6a3f532 3805 if (!fastmap) break;
01618498 3806 return -1;
fa9a63c5
RM
3807
3808
b18215fc 3809 case charset_not:
ba5c004d
RS
3810 /* Chars beyond end of bitmap are possible matches.
3811 All the single-byte codes can occur in multibyte buffers.
3812 So any that are not listed in the charset
3813 are possible matches, even in multibyte buffers. */
1fb352e0 3814 if (!fastmap) break;
b18215fc 3815 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3816 j < (1 << BYTEWIDTH); j++)
b18215fc 3817 fastmap[j] = 1;
1fb352e0
SM
3818 /* Fallthrough */
3819 case charset:
3820 if (!fastmap) break;
3821 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3822 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3823 j >= 0; j--)
1fb352e0 3824 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
b18215fc
RS
3825 fastmap[j] = 1;
3826
1fb352e0
SM
3827 if ((not && multibyte)
3828 /* Any character set can possibly contain a character
3829 which doesn't match the specified set of characters. */
3830 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3831 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3832 /* If we can match a character class, we can match
3833 any character set. */
b18215fc
RS
3834 {
3835 set_fastmap_for_multibyte_characters:
3836 if (match_any_multibyte_characters == false)
3837 {
3838 for (j = 0x80; j < 0xA0; j++) /* XXX */
3839 if (BASE_LEADING_CODE_P (j))
3840 fastmap[j] = 1;
3841 match_any_multibyte_characters = true;
3842 }
3843 }
b18215fc 3844
1fb352e0
SM
3845 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3846 && match_any_multibyte_characters == false)
3847 {
3848 /* Set fastmap[I] 1 where I is a base leading code of each
3849 multibyte character in the range table. */
3850 int c, count;
b18215fc 3851
1fb352e0 3852 /* Make P points the range table. `+ 2' is to skip flag
0b32bf0e 3853 bits for a character class. */
1fb352e0 3854 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3855
1fb352e0
SM
3856 /* Extract the number of ranges in range table into COUNT. */
3857 EXTRACT_NUMBER_AND_INCR (count, p);
3858 for (; count > 0; count--, p += 2 * 3) /* XXX */
3859 {
3860 /* Extract the start of each range. */
3861 EXTRACT_CHARACTER (c, p);
3862 j = CHAR_CHARSET (c);
3863 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3864 }
3865 }
b18215fc
RS
3866 break;
3867
1fb352e0
SM
3868 case syntaxspec:
3869 case notsyntaxspec:
3870 if (!fastmap) break;
3871#ifndef emacs
3872 not = (re_opcode_t)p[-1] == notsyntaxspec;
3873 k = *p++;
3874 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3875 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3876 fastmap[j] = 1;
b18215fc 3877 break;
1fb352e0 3878#else /* emacs */
b18215fc
RS
3879 /* This match depends on text properties. These end with
3880 aborting optimizations. */
01618498 3881 return -1;
b18215fc
RS
3882
3883 case categoryspec:
b18215fc 3884 case notcategoryspec:
1fb352e0
SM
3885 if (!fastmap) break;
3886 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 3887 k = *p++;
1fb352e0
SM
3888 for (j = 0; j < (1 << BYTEWIDTH); j++)
3889 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
3890 fastmap[j] = 1;
3891
1fb352e0 3892 if (multibyte)
b18215fc 3893 /* Any character set can possibly contain a character
1fb352e0 3894 whose category is K (or not). */
b18215fc
RS
3895 goto set_fastmap_for_multibyte_characters;
3896 break;
3897
fa9a63c5 3898 /* All cases after this match the empty string. These end with
25fe55af 3899 `continue'. */
fa9a63c5 3900
fa9a63c5
RM
3901 case before_dot:
3902 case at_dot:
3903 case after_dot:
1fb352e0 3904#endif /* !emacs */
25fe55af
RS
3905 case no_op:
3906 case begline:
3907 case endline:
fa9a63c5
RM
3908 case begbuf:
3909 case endbuf:
3910 case wordbound:
3911 case notwordbound:
3912 case wordbeg:
3913 case wordend:
25fe55af 3914 continue;
fa9a63c5
RM
3915
3916
fa9a63c5 3917 case jump:
25fe55af 3918 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
3919 if (j < 0)
3920 /* Backward jumps can only go back to code that we've already
3921 visited. `re_compile' should make sure this is true. */
3922 break;
25fe55af 3923 p += j;
505bde11
SM
3924 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
3925 {
3926 case on_failure_jump:
3927 case on_failure_keep_string_jump:
505bde11 3928 case on_failure_jump_loop:
0683b6fa 3929 case on_failure_jump_nastyloop:
505bde11
SM
3930 case on_failure_jump_smart:
3931 p++;
3932 break;
3933 default:
3934 continue;
3935 };
3936 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3937 to jump back to "just after here". */
3938 /* Fallthrough */
fa9a63c5 3939
25fe55af
RS
3940 case on_failure_jump:
3941 case on_failure_keep_string_jump:
0683b6fa 3942 case on_failure_jump_nastyloop:
505bde11
SM
3943 case on_failure_jump_loop:
3944 case on_failure_jump_smart:
25fe55af 3945 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 3946 if (p + j <= p1)
ed0767d8 3947 ; /* Backward jump to be ignored. */
01618498
SM
3948 else
3949 { /* We have to look down both arms.
3950 We first go down the "straight" path so as to minimize
3951 stack usage when going through alternatives. */
3952 int r = analyse_first (p, pend, fastmap, multibyte);
3953 if (r) return r;
3954 p += j;
3955 }
25fe55af 3956 continue;
fa9a63c5
RM
3957
3958
ed0767d8
SM
3959 case jump_n:
3960 /* This code simply does not properly handle forward jump_n. */
3961 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
3962 p += 4;
3963 /* jump_n can either jump or fall through. The (backward) jump
3964 case has already been handled, so we only need to look at the
3965 fallthrough case. */
3966 continue;
3967
fa9a63c5 3968 case succeed_n:
ed0767d8
SM
3969 /* If N == 0, it should be an on_failure_jump_loop instead. */
3970 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
3971 p += 4;
3972 /* We only care about one iteration of the loop, so we don't
3973 need to consider the case where this behaves like an
3974 on_failure_jump. */
25fe55af 3975 continue;
fa9a63c5
RM
3976
3977
3978 case set_number_at:
25fe55af
RS
3979 p += 4;
3980 continue;
fa9a63c5
RM
3981
3982
3983 case start_memory:
25fe55af 3984 case stop_memory:
505bde11 3985 p += 1;
fa9a63c5
RM
3986 continue;
3987
3988
3989 default:
25fe55af
RS
3990 abort (); /* We have listed all the cases. */
3991 } /* switch *p++ */
fa9a63c5
RM
3992
3993 /* Getting here means we have found the possible starting
25fe55af 3994 characters for one path of the pattern -- and that the empty
01618498
SM
3995 string does not match. We need not follow this path further. */
3996 return 0;
fa9a63c5
RM
3997 } /* while p */
3998
01618498
SM
3999 /* We reached the end without matching anything. */
4000 return 1;
4001
f6a3f532
SM
4002} /* analyse_first */
4003\f
4004/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4005 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4006 characters can start a string that matches the pattern. This fastmap
4007 is used by re_search to skip quickly over impossible starting points.
4008
4009 Character codes above (1 << BYTEWIDTH) are not represented in the
4010 fastmap, but the leading codes are represented. Thus, the fastmap
4011 indicates which character sets could start a match.
4012
4013 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4014 area as BUFP->fastmap.
4015
4016 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4017 the pattern buffer.
4018
4019 Returns 0 if we succeed, -2 if an internal error. */
4020
4021int
4022re_compile_fastmap (bufp)
4023 struct re_pattern_buffer *bufp;
4024{
4025 char *fastmap = bufp->fastmap;
4026 int analysis;
4027
4028 assert (fastmap && bufp->buffer);
4029
4030 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4031 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4032
4033 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 4034 fastmap, RE_MULTIBYTE_P (bufp));
c0f9ea08 4035 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
4036 return 0;
4037} /* re_compile_fastmap */
4038\f
4039/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4040 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4041 this memory for recording register information. STARTS and ENDS
4042 must be allocated using the malloc library routine, and must each
4043 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4044
4045 If NUM_REGS == 0, then subsequent matches should allocate their own
4046 register data.
4047
4048 Unless this function is called, the first search or match using
4049 PATTERN_BUFFER will allocate its own register data, without
4050 freeing the old data. */
4051
4052void
4053re_set_registers (bufp, regs, num_regs, starts, ends)
4054 struct re_pattern_buffer *bufp;
4055 struct re_registers *regs;
4056 unsigned num_regs;
4057 regoff_t *starts, *ends;
4058{
4059 if (num_regs)
4060 {
4061 bufp->regs_allocated = REGS_REALLOCATE;
4062 regs->num_regs = num_regs;
4063 regs->start = starts;
4064 regs->end = ends;
4065 }
4066 else
4067 {
4068 bufp->regs_allocated = REGS_UNALLOCATED;
4069 regs->num_regs = 0;
4070 regs->start = regs->end = (regoff_t *) 0;
4071 }
4072}
c0f9ea08 4073WEAK_ALIAS (__re_set_registers, re_set_registers)
fa9a63c5 4074\f
25fe55af 4075/* Searching routines. */
fa9a63c5
RM
4076
4077/* Like re_search_2, below, but only one string is specified, and
4078 doesn't let you say where to stop matching. */
4079
4080int
4081re_search (bufp, string, size, startpos, range, regs)
4082 struct re_pattern_buffer *bufp;
4083 const char *string;
4084 int size, startpos, range;
4085 struct re_registers *regs;
4086{
5e69f11e 4087 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
4088 regs, size);
4089}
c0f9ea08 4090WEAK_ALIAS (__re_search, re_search)
fa9a63c5 4091
b18215fc
RS
4092/* End address of virtual concatenation of string. */
4093#define STOP_ADDR_VSTRING(P) \
4094 (((P) >= size1 ? string2 + size2 : string1 + size1))
4095
4096/* Address of POS in the concatenation of virtual string. */
4097#define POS_ADDR_VSTRING(POS) \
4098 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
4099
4100/* Using the compiled pattern in BUFP->buffer, first tries to match the
4101 virtual concatenation of STRING1 and STRING2, starting first at index
4102 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 4103
fa9a63c5 4104 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 4105
fa9a63c5
RM
4106 RANGE is how far to scan while trying to match. RANGE = 0 means try
4107 only at STARTPOS; in general, the last start tried is STARTPOS +
4108 RANGE.
5e69f11e 4109
fa9a63c5
RM
4110 In REGS, return the indices of the virtual concatenation of STRING1
4111 and STRING2 that matched the entire BUFP->buffer and its contained
4112 subexpressions.
5e69f11e 4113
fa9a63c5
RM
4114 Do not consider matching one past the index STOP in the virtual
4115 concatenation of STRING1 and STRING2.
4116
4117 We return either the position in the strings at which the match was
4118 found, -1 if no match, or -2 if error (such as failure
4119 stack overflow). */
4120
4121int
66f0296e 4122re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 4123 struct re_pattern_buffer *bufp;
66f0296e 4124 const char *str1, *str2;
fa9a63c5
RM
4125 int size1, size2;
4126 int startpos;
4127 int range;
4128 struct re_registers *regs;
4129 int stop;
4130{
4131 int val;
66f0296e
SM
4132 re_char *string1 = (re_char*) str1;
4133 re_char *string2 = (re_char*) str2;
fa9a63c5 4134 register char *fastmap = bufp->fastmap;
6676cb1c 4135 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
4136 int total_size = size1 + size2;
4137 int endpos = startpos + range;
c0f9ea08 4138 boolean anchored_start;
fa9a63c5 4139
25fe55af 4140 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4141 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4142
fa9a63c5
RM
4143 /* Check for out-of-range STARTPOS. */
4144 if (startpos < 0 || startpos > total_size)
4145 return -1;
5e69f11e 4146
fa9a63c5 4147 /* Fix up RANGE if it might eventually take us outside
34597fa9 4148 the virtual concatenation of STRING1 and STRING2.
5e69f11e 4149 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
4150 if (endpos < 0)
4151 range = 0 - startpos;
fa9a63c5
RM
4152 else if (endpos > total_size)
4153 range = total_size - startpos;
4154
4155 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 4156 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
4157 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4158 {
4159 if (startpos > 0)
4160 return -1;
4161 else
7b140fd7 4162 range = 0;
fa9a63c5
RM
4163 }
4164
ae4788a8
RS
4165#ifdef emacs
4166 /* In a forward search for something that starts with \=.
4167 don't keep searching past point. */
4168 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4169 {
7b140fd7
RS
4170 range = PT_BYTE - BEGV_BYTE - startpos;
4171 if (range < 0)
ae4788a8
RS
4172 return -1;
4173 }
4174#endif /* emacs */
4175
fa9a63c5
RM
4176 /* Update the fastmap now if not correct already. */
4177 if (fastmap && !bufp->fastmap_accurate)
01618498 4178 re_compile_fastmap (bufp);
5e69f11e 4179
c8499ba5 4180 /* See whether the pattern is anchored. */
c0f9ea08 4181 anchored_start = (bufp->buffer[0] == begline);
c8499ba5 4182
b18215fc 4183#ifdef emacs
cc9b4df2
KH
4184 gl_state.object = re_match_object;
4185 {
99633e97 4186 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
4187
4188 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4189 }
b18215fc
RS
4190#endif
4191
fa9a63c5
RM
4192 /* Loop through the string, looking for a place to start matching. */
4193 for (;;)
5e69f11e 4194 {
c8499ba5
RS
4195 /* If the pattern is anchored,
4196 skip quickly past places we cannot match.
4197 We don't bother to treat startpos == 0 specially
4198 because that case doesn't repeat. */
4199 if (anchored_start && startpos > 0)
4200 {
c0f9ea08
SM
4201 if (! ((startpos <= size1 ? string1[startpos - 1]
4202 : string2[startpos - size1 - 1])
4203 == '\n'))
c8499ba5
RS
4204 goto advance;
4205 }
4206
fa9a63c5 4207 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
4208 cannot be the start of a match. If the pattern can match the
4209 null string, however, we don't need to skip characters; we want
4210 the first null string. */
fa9a63c5
RM
4211 if (fastmap && startpos < total_size && !bufp->can_be_null)
4212 {
66f0296e 4213 register re_char *d;
01618498 4214 register re_wchar_t buf_ch;
e934739e
RS
4215
4216 d = POS_ADDR_VSTRING (startpos);
4217
25fe55af 4218 if (range > 0) /* Searching forwards. */
fa9a63c5 4219 {
fa9a63c5
RM
4220 register int lim = 0;
4221 int irange = range;
4222
25fe55af
RS
4223 if (startpos < size1 && startpos + range >= size1)
4224 lim = range - (size1 - startpos);
fa9a63c5 4225
25fe55af
RS
4226 /* Written out as an if-else to avoid testing `translate'
4227 inside the loop. */
28ae27ae
AS
4228 if (RE_TRANSLATE_P (translate))
4229 {
e934739e
RS
4230 if (multibyte)
4231 while (range > lim)
4232 {
4233 int buf_charlen;
4234
4235 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4236 buf_charlen);
4237
4238 buf_ch = RE_TRANSLATE (translate, buf_ch);
4239 if (buf_ch >= 0400
4240 || fastmap[buf_ch])
4241 break;
4242
4243 range -= buf_charlen;
4244 d += buf_charlen;
4245 }
4246 else
4247 while (range > lim
66f0296e 4248 && !fastmap[RE_TRANSLATE (translate, *d)])
33c46939
RS
4249 {
4250 d++;
4251 range--;
4252 }
e934739e 4253 }
fa9a63c5 4254 else
66f0296e 4255 while (range > lim && !fastmap[*d])
33c46939
RS
4256 {
4257 d++;
4258 range--;
4259 }
fa9a63c5
RM
4260
4261 startpos += irange - range;
4262 }
25fe55af 4263 else /* Searching backwards. */
fa9a63c5 4264 {
2d1675e4
SM
4265 int room = (startpos >= size1
4266 ? size2 + size1 - startpos
4267 : size1 - startpos);
4268 buf_ch = RE_STRING_CHAR (d, room);
4269 buf_ch = TRANSLATE (buf_ch);
fa9a63c5 4270
e934739e
RS
4271 if (! (buf_ch >= 0400
4272 || fastmap[buf_ch]))
fa9a63c5
RM
4273 goto advance;
4274 }
4275 }
4276
4277 /* If can't match the null string, and that's all we have left, fail. */
4278 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4279 && !bufp->can_be_null)
fa9a63c5
RM
4280 return -1;
4281
4282 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4283 startpos, regs, stop);
4284#ifndef REGEX_MALLOC
0b32bf0e 4285# ifdef C_ALLOCA
fa9a63c5 4286 alloca (0);
0b32bf0e 4287# endif
fa9a63c5
RM
4288#endif
4289
4290 if (val >= 0)
4291 return startpos;
5e69f11e 4292
fa9a63c5
RM
4293 if (val == -2)
4294 return -2;
4295
4296 advance:
5e69f11e 4297 if (!range)
25fe55af 4298 break;
5e69f11e 4299 else if (range > 0)
25fe55af 4300 {
b18215fc
RS
4301 /* Update STARTPOS to the next character boundary. */
4302 if (multibyte)
4303 {
66f0296e
SM
4304 re_char *p = POS_ADDR_VSTRING (startpos);
4305 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4306 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4307
4308 range -= len;
4309 if (range < 0)
4310 break;
4311 startpos += len;
4312 }
4313 else
4314 {
b560c397
RS
4315 range--;
4316 startpos++;
4317 }
e318085a 4318 }
fa9a63c5 4319 else
25fe55af
RS
4320 {
4321 range++;
4322 startpos--;
b18215fc
RS
4323
4324 /* Update STARTPOS to the previous character boundary. */
4325 if (multibyte)
4326 {
66f0296e 4327 re_char *p = POS_ADDR_VSTRING (startpos);
b18215fc
RS
4328 int len = 0;
4329
4330 /* Find the head of multibyte form. */
5d967c7a 4331 while (!CHAR_HEAD_P (*p))
b18215fc
RS
4332 p--, len++;
4333
4334 /* Adjust it. */
4335#if 0 /* XXX */
4336 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
4337 ;
4338 else
4339#endif
4340 {
4341 range += len;
4342 if (range > 0)
4343 break;
4344
4345 startpos -= len;
4346 }
4347 }
25fe55af 4348 }
fa9a63c5
RM
4349 }
4350 return -1;
4351} /* re_search_2 */
c0f9ea08 4352WEAK_ALIAS (__re_search_2, re_search_2)
fa9a63c5
RM
4353\f
4354/* Declarations and macros for re_match_2. */
4355
2d1675e4
SM
4356static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4357 register int len,
4358 RE_TRANSLATE_TYPE translate,
4359 const int multibyte));
fa9a63c5
RM
4360
4361/* This converts PTR, a pointer into one of the search strings `string1'
4362 and `string2' into an offset from the beginning of that string. */
4363#define POINTER_TO_OFFSET(ptr) \
4364 (FIRST_STRING_P (ptr) \
4365 ? ((regoff_t) ((ptr) - string1)) \
4366 : ((regoff_t) ((ptr) - string2 + size1)))
4367
fa9a63c5 4368/* Call before fetching a character with *d. This switches over to
419d1c74
SM
4369 string2 if necessary.
4370 Check re_match_2_internal for a discussion of why end_match_2 might
4371 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 4372#define PREFETCH() \
25fe55af 4373 while (d == dend) \
fa9a63c5
RM
4374 { \
4375 /* End of string2 => fail. */ \
25fe55af
RS
4376 if (dend == end_match_2) \
4377 goto fail; \
4bb91c68 4378 /* End of string1 => advance to string2. */ \
25fe55af 4379 d = string2; \
fa9a63c5
RM
4380 dend = end_match_2; \
4381 }
4382
f1ad044f
SM
4383/* Call before fetching a char with *d if you already checked other limits.
4384 This is meant for use in lookahead operations like wordend, etc..
4385 where we might need to look at parts of the string that might be
4386 outside of the LIMITs (i.e past `stop'). */
4387#define PREFETCH_NOLIMIT() \
4388 if (d == end1) \
4389 { \
4390 d = string2; \
4391 dend = end_match_2; \
4392 } \
fa9a63c5
RM
4393
4394/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 4395 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4396#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4397#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4398
4399
4400/* Test if D points to a character which is word-constituent. We have
4401 two special cases to check for: if past the end of string1, look at
4402 the first character in string2; and if before the beginning of
4403 string2, look at the last character in string1. */
4404#define WORDCHAR_P(d) \
4405 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4406 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4407 == Sword)
4408
9121ca40 4409/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4410
4411/* The comment at case wordbound is following one, but we don't use
4412 AT_WORD_BOUNDARY anymore to support multibyte form.
4413
4414 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
4415 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4416 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4417 macro and introducing temporary variables works around the bug. */
4418
9121ca40 4419#if 0
fa9a63c5
RM
4420/* Test if the character before D and the one at D differ with respect
4421 to being word-constituent. */
4422#define AT_WORD_BOUNDARY(d) \
4423 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4424 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4425#endif
fa9a63c5
RM
4426
4427/* Free everything we malloc. */
4428#ifdef MATCH_MAY_ALLOCATE
0b32bf0e
SM
4429# define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4430# define FREE_VARIABLES() \
fa9a63c5
RM
4431 do { \
4432 REGEX_FREE_STACK (fail_stack.stack); \
4433 FREE_VAR (regstart); \
4434 FREE_VAR (regend); \
fa9a63c5
RM
4435 FREE_VAR (best_regstart); \
4436 FREE_VAR (best_regend); \
fa9a63c5
RM
4437 } while (0)
4438#else
0b32bf0e 4439# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
4440#endif /* not MATCH_MAY_ALLOCATE */
4441
505bde11
SM
4442\f
4443/* Optimization routines. */
4444
4e8a9132
SM
4445/* If the operation is a match against one or more chars,
4446 return a pointer to the next operation, else return NULL. */
01618498 4447static re_char *
4e8a9132 4448skip_one_char (p)
01618498 4449 re_char *p;
4e8a9132
SM
4450{
4451 switch (SWITCH_ENUM_CAST (*p++))
4452 {
4453 case anychar:
4454 break;
4455
4456 case exactn:
4457 p += *p + 1;
4458 break;
4459
4460 case charset_not:
4461 case charset:
4462 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4463 {
4464 int mcnt;
4465 p = CHARSET_RANGE_TABLE (p - 1);
4466 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4467 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4468 }
4469 else
4470 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4471 break;
4472
4e8a9132
SM
4473 case syntaxspec:
4474 case notsyntaxspec:
1fb352e0 4475#ifdef emacs
4e8a9132
SM
4476 case categoryspec:
4477 case notcategoryspec:
4478#endif /* emacs */
4479 p++;
4480 break;
4481
4482 default:
4483 p = NULL;
4484 }
4485 return p;
4486}
4487
4488
505bde11
SM
4489/* Jump over non-matching operations. */
4490static unsigned char *
4e8a9132 4491skip_noops (p, pend)
505bde11 4492 unsigned char *p, *pend;
505bde11
SM
4493{
4494 int mcnt;
4495 while (p < pend)
4496 {
4497 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4498 {
4499 case start_memory:
505bde11
SM
4500 case stop_memory:
4501 p += 2; break;
4502 case no_op:
4503 p += 1; break;
4504 case jump:
4505 p += 1;
4506 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4507 p += mcnt;
4508 break;
4509 default:
4510 return p;
4511 }
4512 }
4513 assert (p == pend);
4514 return p;
4515}
4516
4517/* Non-zero if "p1 matches something" implies "p2 fails". */
4518static int
4519mutually_exclusive_p (bufp, p1, p2)
4520 struct re_pattern_buffer *bufp;
4521 unsigned char *p1, *p2;
4522{
4e8a9132 4523 re_opcode_t op2;
2d1675e4 4524 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4525 unsigned char *pend = bufp->buffer + bufp->used;
4526
4e8a9132 4527 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4528 && p2 >= bufp->buffer && p2 <= pend);
4529
4530 /* Skip over open/close-group commands.
4531 If what follows this loop is a ...+ construct,
4532 look at what begins its body, since we will have to
4533 match at least one of that. */
4e8a9132
SM
4534 p2 = skip_noops (p2, pend);
4535 /* The same skip can be done for p1, except that this function
4536 is only used in the case where p1 is a simple match operator. */
4537 /* p1 = skip_noops (p1, pend); */
4538
4539 assert (p1 >= bufp->buffer && p1 < pend
4540 && p2 >= bufp->buffer && p2 <= pend);
4541
4542 op2 = p2 == pend ? succeed : *p2;
4543
4544 switch (SWITCH_ENUM_CAST (op2))
505bde11 4545 {
4e8a9132
SM
4546 case succeed:
4547 case endbuf:
4548 /* If we're at the end of the pattern, we can change. */
4549 if (skip_one_char (p1))
505bde11 4550 {
505bde11
SM
4551 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4552 return 1;
505bde11 4553 }
4e8a9132
SM
4554 break;
4555
4556 case endline:
4e8a9132
SM
4557 case exactn:
4558 {
01618498 4559 register re_wchar_t c
4e8a9132 4560 = (re_opcode_t) *p2 == endline ? '\n'
411e4203 4561 : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
505bde11 4562
4e8a9132
SM
4563 if ((re_opcode_t) *p1 == exactn)
4564 {
4565 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4566 {
4567 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4568 return 1;
4569 }
4570 }
505bde11 4571
4e8a9132
SM
4572 else if ((re_opcode_t) *p1 == charset
4573 || (re_opcode_t) *p1 == charset_not)
4574 {
4575 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4576
4e8a9132
SM
4577 /* Test if C is listed in charset (or charset_not)
4578 at `p1'. */
4579 if (SINGLE_BYTE_CHAR_P (c))
4580 {
4581 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4582 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4583 not = !not;
4584 }
4585 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4586 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4587
4e8a9132
SM
4588 /* `not' is equal to 1 if c would match, which means
4589 that we can't change to pop_failure_jump. */
4590 if (!not)
4591 {
4592 DEBUG_PRINT1 (" No match => fast loop.\n");
4593 return 1;
4594 }
4595 }
4596 else if ((re_opcode_t) *p1 == anychar
4597 && c == '\n')
4598 {
4599 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4600 return 1;
4601 }
4602 }
4603 break;
505bde11 4604
4e8a9132 4605 case charset:
4e8a9132
SM
4606 {
4607 if ((re_opcode_t) *p1 == exactn)
4608 /* Reuse the code above. */
4609 return mutually_exclusive_p (bufp, p2, p1);
505bde11 4610
505bde11
SM
4611 /* It is hard to list up all the character in charset
4612 P2 if it includes multibyte character. Give up in
4613 such case. */
4614 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4615 {
4616 /* Now, we are sure that P2 has no range table.
4617 So, for the size of bitmap in P2, `p2[1]' is
4618 enough. But P1 may have range table, so the
4619 size of bitmap table of P1 is extracted by
4620 using macro `CHARSET_BITMAP_SIZE'.
4621
4622 Since we know that all the character listed in
4623 P2 is ASCII, it is enough to test only bitmap
4624 table of P1. */
4625
411e4203 4626 if ((re_opcode_t) *p1 == charset)
505bde11
SM
4627 {
4628 int idx;
4629 /* We win if the charset inside the loop
4630 has no overlap with the one after the loop. */
4631 for (idx = 0;
4632 (idx < (int) p2[1]
4633 && idx < CHARSET_BITMAP_SIZE (p1));
4634 idx++)
4635 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4636 break;
4637
4638 if (idx == p2[1]
4639 || idx == CHARSET_BITMAP_SIZE (p1))
4640 {
4641 DEBUG_PRINT1 (" No match => fast loop.\n");
4642 return 1;
4643 }
4644 }
411e4203 4645 else if ((re_opcode_t) *p1 == charset_not)
505bde11
SM
4646 {
4647 int idx;
4648 /* We win if the charset_not inside the loop lists
4649 every character listed in the charset after. */
4650 for (idx = 0; idx < (int) p2[1]; idx++)
4651 if (! (p2[2 + idx] == 0
4652 || (idx < CHARSET_BITMAP_SIZE (p1)
4653 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4654 break;
4655
4e8a9132
SM
4656 if (idx == p2[1])
4657 {
4658 DEBUG_PRINT1 (" No match => fast loop.\n");
4659 return 1;
4660 }
4661 }
4662 }
4663 }
609b757a 4664 break;
4e8a9132 4665
411e4203
SM
4666 case charset_not:
4667 switch (SWITCH_ENUM_CAST (*p1))
4668 {
4669 case exactn:
4670 case charset:
4671 /* Reuse the code above. */
4672 return mutually_exclusive_p (bufp, p2, p1);
4673 case charset_not:
4674 /* When we have two charset_not, it's very unlikely that
4675 they don't overlap. The union of the two sets of excluded
4676 chars should cover all possible chars, which, as a matter of
4677 fact, is virtually impossible in multibyte buffers. */
36595814 4678 break;
411e4203
SM
4679 }
4680 break;
4681
4e8a9132
SM
4682 case wordend:
4683 case notsyntaxspec:
4684 return ((re_opcode_t) *p1 == syntaxspec
4685 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4686
4687 case wordbeg:
4688 case syntaxspec:
4689 return ((re_opcode_t) *p1 == notsyntaxspec
4690 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4691
4692 case wordbound:
4693 return (((re_opcode_t) *p1 == notsyntaxspec
4694 || (re_opcode_t) *p1 == syntaxspec)
4695 && p1[1] == Sword);
4696
1fb352e0 4697#ifdef emacs
4e8a9132
SM
4698 case categoryspec:
4699 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4700 case notcategoryspec:
4701 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4702#endif /* emacs */
4703
4704 default:
4705 ;
505bde11
SM
4706 }
4707
4708 /* Safe default. */
4709 return 0;
4710}
4711
fa9a63c5
RM
4712\f
4713/* Matching routines. */
4714
25fe55af 4715#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4716/* re_match is like re_match_2 except it takes only a single string. */
4717
4718int
4719re_match (bufp, string, size, pos, regs)
4720 struct re_pattern_buffer *bufp;
4721 const char *string;
4722 int size, pos;
4723 struct re_registers *regs;
4724{
4bb91c68 4725 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
fa9a63c5 4726 pos, regs, size);
0b32bf0e 4727# if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4728 alloca (0);
0b32bf0e 4729# endif
fa9a63c5
RM
4730 return result;
4731}
c0f9ea08 4732WEAK_ALIAS (__re_match, re_match)
fa9a63c5
RM
4733#endif /* not emacs */
4734
b18215fc
RS
4735#ifdef emacs
4736/* In Emacs, this is the string or buffer in which we
25fe55af 4737 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4738Lisp_Object re_match_object;
4739#endif
fa9a63c5
RM
4740
4741/* re_match_2 matches the compiled pattern in BUFP against the
4742 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4743 and SIZE2, respectively). We start matching at POS, and stop
4744 matching at STOP.
5e69f11e 4745
fa9a63c5 4746 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4747 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4748 documentation for exactly how many groups we fill.
4749
4750 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4751 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4752 matched substring. */
4753
4754int
4755re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4756 struct re_pattern_buffer *bufp;
4757 const char *string1, *string2;
4758 int size1, size2;
4759 int pos;
4760 struct re_registers *regs;
4761 int stop;
4762{
b18215fc 4763 int result;
25fe55af 4764
b18215fc 4765#ifdef emacs
cc9b4df2
KH
4766 int charpos;
4767 gl_state.object = re_match_object;
99633e97 4768 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4769 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4770#endif
4771
4bb91c68
SM
4772 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4773 (re_char*) string2, size2,
cc9b4df2 4774 pos, regs, stop);
0b32bf0e 4775#if defined C_ALLOCA && !defined REGEX_MALLOC
fa9a63c5 4776 alloca (0);
a60198e5 4777#endif
fa9a63c5
RM
4778 return result;
4779}
c0f9ea08 4780WEAK_ALIAS (__re_match_2, re_match_2)
fa9a63c5
RM
4781
4782/* This is a separate function so that we can force an alloca cleanup
25fe55af 4783 afterwards. */
fa9a63c5
RM
4784static int
4785re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4786 struct re_pattern_buffer *bufp;
66f0296e 4787 re_char *string1, *string2;
fa9a63c5
RM
4788 int size1, size2;
4789 int pos;
4790 struct re_registers *regs;
4791 int stop;
4792{
4793 /* General temporaries. */
4794 int mcnt;
01618498 4795 size_t reg;
66f0296e 4796 boolean not;
fa9a63c5
RM
4797
4798 /* Just past the end of the corresponding string. */
66f0296e 4799 re_char *end1, *end2;
fa9a63c5
RM
4800
4801 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4802 each to consider matching. */
66f0296e 4803 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4804
4805 /* Where we are in the data, and the end of the current string. */
66f0296e 4806 re_char *d, *dend;
5e69f11e 4807
99633e97
SM
4808 /* Used sometimes to remember where we were before starting matching
4809 an operator so that we can go back in case of failure. This "atomic"
4810 behavior of matching opcodes is indispensable to the correctness
4811 of the on_failure_keep_string_jump optimization. */
4812 re_char *dfail;
4813
fa9a63c5 4814 /* Where we are in the pattern, and the end of the pattern. */
01618498
SM
4815 re_char *p = bufp->buffer;
4816 re_char *pend = p + bufp->used;
fa9a63c5 4817
25fe55af 4818 /* We use this to map every character in the string. */
6676cb1c 4819 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4820
25fe55af 4821 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4822 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4823
fa9a63c5
RM
4824 /* Failure point stack. Each place that can handle a failure further
4825 down the line pushes a failure point on this stack. It consists of
505bde11 4826 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4827 the subexpressions we're currently inside, plus the number of such
4828 registers, and, finally, two char *'s. The first char * is where
4829 to resume scanning the pattern; the second one is where to resume
505bde11 4830 scanning the strings. */
25fe55af 4831#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4832 fail_stack_type fail_stack;
4833#endif
4834#ifdef DEBUG
fa9a63c5
RM
4835 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4836#endif
4837
0b32bf0e 4838#if defined REL_ALLOC && defined REGEX_MALLOC
fa9a63c5
RM
4839 /* This holds the pointer to the failure stack, when
4840 it is allocated relocatably. */
4841 fail_stack_elt_t *failure_stack_ptr;
99633e97 4842#endif
fa9a63c5
RM
4843
4844 /* We fill all the registers internally, independent of what we
25fe55af 4845 return, for use in backreferences. The number here includes
fa9a63c5 4846 an element for register zero. */
4bb91c68 4847 size_t num_regs = bufp->re_nsub + 1;
5e69f11e 4848
fa9a63c5
RM
4849 /* Information on the contents of registers. These are pointers into
4850 the input strings; they record just what was matched (on this
4851 attempt) by a subexpression part of the pattern, that is, the
4852 regnum-th regstart pointer points to where in the pattern we began
4853 matching and the regnum-th regend points to right after where we
4854 stopped matching the regnum-th subexpression. (The zeroth register
4855 keeps track of what the whole pattern matches.) */
4856#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4857 re_char **regstart, **regend;
fa9a63c5
RM
4858#endif
4859
fa9a63c5 4860 /* The following record the register info as found in the above
5e69f11e 4861 variables when we find a match better than any we've seen before.
fa9a63c5
RM
4862 This happens as we backtrack through the failure points, which in
4863 turn happens only if we have not yet matched the entire string. */
4864 unsigned best_regs_set = false;
4865#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4866 re_char **best_regstart, **best_regend;
fa9a63c5 4867#endif
5e69f11e 4868
fa9a63c5
RM
4869 /* Logically, this is `best_regend[0]'. But we don't want to have to
4870 allocate space for that if we're not allocating space for anything
25fe55af 4871 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
4872 any of the other register vectors, and it seems rather a kludge to
4873 treat `best_regend' differently than the rest. So we keep track of
4874 the end of the best match so far in a separate variable. We
4875 initialize this to NULL so that when we backtrack the first time
4876 and need to test it, it's not garbage. */
66f0296e 4877 re_char *match_end = NULL;
fa9a63c5 4878
fa9a63c5
RM
4879#ifdef DEBUG
4880 /* Counts the total number of registers pushed. */
5e69f11e 4881 unsigned num_regs_pushed = 0;
fa9a63c5
RM
4882#endif
4883
4884 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 4885
fa9a63c5 4886 INIT_FAIL_STACK ();
5e69f11e 4887
fa9a63c5
RM
4888#ifdef MATCH_MAY_ALLOCATE
4889 /* Do not bother to initialize all the register variables if there are
4890 no groups in the pattern, as it takes a fair amount of time. If
4891 there are groups, we include space for register 0 (the whole
4892 pattern), even though we never use it, since it simplifies the
4893 array indexing. We should fix this. */
4894 if (bufp->re_nsub)
4895 {
66f0296e
SM
4896 regstart = REGEX_TALLOC (num_regs, re_char *);
4897 regend = REGEX_TALLOC (num_regs, re_char *);
4898 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4899 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 4900
505bde11 4901 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
4902 {
4903 FREE_VARIABLES ();
4904 return -2;
4905 }
fa9a63c5
RM
4906 }
4907 else
4908 {
4909 /* We must initialize all our variables to NULL, so that
25fe55af 4910 `FREE_VARIABLES' doesn't try to free them. */
505bde11 4911 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
4912 }
4913#endif /* MATCH_MAY_ALLOCATE */
4914
4915 /* The starting position is bogus. */
4916 if (pos < 0 || pos > size1 + size2)
4917 {
4918 FREE_VARIABLES ();
4919 return -1;
4920 }
5e69f11e 4921
fa9a63c5
RM
4922 /* Initialize subexpression text positions to -1 to mark ones that no
4923 start_memory/stop_memory has been seen for. Also initialize the
4924 register information struct. */
01618498
SM
4925 for (reg = 1; reg < num_regs; reg++)
4926 regstart[reg] = regend[reg] = NULL;
99633e97 4927
fa9a63c5 4928 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 4929 `string1' is null. */
fa9a63c5
RM
4930 if (size2 == 0 && string1 != NULL)
4931 {
4932 string2 = string1;
4933 size2 = size1;
4934 string1 = 0;
4935 size1 = 0;
4936 }
4937 end1 = string1 + size1;
4938 end2 = string2 + size2;
4939
5e69f11e 4940 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
4941 `dend' is the end of the input string that `d' points within. `d'
4942 is advanced into the following input string whenever necessary, but
4943 this happens before fetching; therefore, at the beginning of the
4944 loop, `d' can be pointing at the end of a string, but it cannot
4945 equal `string2'. */
419d1c74 4946 if (pos >= size1)
fa9a63c5 4947 {
419d1c74
SM
4948 /* Only match within string2. */
4949 d = string2 + pos - size1;
4950 dend = end_match_2 = string2 + stop - size1;
4951 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
4952 }
4953 else
4954 {
f1ad044f 4955 if (stop < size1)
419d1c74
SM
4956 {
4957 /* Only match within string1. */
4958 end_match_1 = string1 + stop;
4959 /* BEWARE!
4960 When we reach end_match_1, PREFETCH normally switches to string2.
4961 But in the present case, this means that just doing a PREFETCH
4962 makes us jump from `stop' to `gap' within the string.
4963 What we really want here is for the search to stop as
4964 soon as we hit end_match_1. That's why we set end_match_2
4965 to end_match_1 (since PREFETCH fails as soon as we hit
4966 end_match_2). */
4967 end_match_2 = end_match_1;
4968 }
4969 else
f1ad044f
SM
4970 { /* It's important to use this code when stop == size so that
4971 moving `d' from end1 to string2 will not prevent the d == dend
4972 check from catching the end of string. */
419d1c74
SM
4973 end_match_1 = end1;
4974 end_match_2 = string2 + stop - size1;
4975 }
4976 d = string1 + pos;
4977 dend = end_match_1;
fa9a63c5
RM
4978 }
4979
4980 DEBUG_PRINT1 ("The compiled pattern is: ");
4981 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4982 DEBUG_PRINT1 ("The string to match is: `");
4983 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4984 DEBUG_PRINT1 ("'\n");
5e69f11e 4985
25fe55af 4986 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
4987 function if the match is complete, or it drops through if the match
4988 fails at this starting point in the input data. */
4989 for (;;)
4990 {
505bde11 4991 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
4992
4993 if (p == pend)
4994 { /* End of pattern means we might have succeeded. */
25fe55af 4995 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 4996
fa9a63c5 4997 /* If we haven't matched the entire string, and we want the
25fe55af
RS
4998 longest match, try backtracking. */
4999 if (d != end_match_2)
fa9a63c5
RM
5000 {
5001 /* 1 if this match ends in the same string (string1 or string2)
5002 as the best previous match. */
5e69f11e 5003 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 5004 == FIRST_STRING_P (d));
fa9a63c5
RM
5005 /* 1 if this match is the best seen so far. */
5006 boolean best_match_p;
5007
5008 /* AIX compiler got confused when this was combined
25fe55af 5009 with the previous declaration. */
fa9a63c5
RM
5010 if (same_str_p)
5011 best_match_p = d > match_end;
5012 else
99633e97 5013 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 5014
25fe55af
RS
5015 DEBUG_PRINT1 ("backtracking.\n");
5016
5017 if (!FAIL_STACK_EMPTY ())
5018 { /* More failure points to try. */
5019
5020 /* If exceeds best match so far, save it. */
5021 if (!best_regs_set || best_match_p)
5022 {
5023 best_regs_set = true;
5024 match_end = d;
5025
5026 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5027
01618498 5028 for (reg = 1; reg < num_regs; reg++)
25fe55af 5029 {
01618498
SM
5030 best_regstart[reg] = regstart[reg];
5031 best_regend[reg] = regend[reg];
25fe55af
RS
5032 }
5033 }
5034 goto fail;
5035 }
5036
5037 /* If no failure points, don't restore garbage. And if
5038 last match is real best match, don't restore second
5039 best one. */
5040 else if (best_regs_set && !best_match_p)
5041 {
5042 restore_best_regs:
5043 /* Restore best match. It may happen that `dend ==
5044 end_match_1' while the restored d is in string2.
5045 For example, the pattern `x.*y.*z' against the
5046 strings `x-' and `y-z-', if the two strings are
5047 not consecutive in memory. */
5048 DEBUG_PRINT1 ("Restoring best registers.\n");
5049
5050 d = match_end;
5051 dend = ((d >= string1 && d <= end1)
5052 ? end_match_1 : end_match_2);
fa9a63c5 5053
01618498 5054 for (reg = 1; reg < num_regs; reg++)
fa9a63c5 5055 {
01618498
SM
5056 regstart[reg] = best_regstart[reg];
5057 regend[reg] = best_regend[reg];
fa9a63c5 5058 }
25fe55af
RS
5059 }
5060 } /* d != end_match_2 */
fa9a63c5
RM
5061
5062 succeed_label:
25fe55af 5063 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 5064
25fe55af
RS
5065 /* If caller wants register contents data back, do it. */
5066 if (regs && !bufp->no_sub)
fa9a63c5 5067 {
25fe55af
RS
5068 /* Have the register data arrays been allocated? */
5069 if (bufp->regs_allocated == REGS_UNALLOCATED)
5070 { /* No. So allocate them with malloc. We need one
5071 extra element beyond `num_regs' for the `-1' marker
5072 GNU code uses. */
5073 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5074 regs->start = TALLOC (regs->num_regs, regoff_t);
5075 regs->end = TALLOC (regs->num_regs, regoff_t);
5076 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5077 {
5078 FREE_VARIABLES ();
5079 return -2;
5080 }
25fe55af
RS
5081 bufp->regs_allocated = REGS_REALLOCATE;
5082 }
5083 else if (bufp->regs_allocated == REGS_REALLOCATE)
5084 { /* Yes. If we need more elements than were already
5085 allocated, reallocate them. If we need fewer, just
5086 leave it alone. */
5087 if (regs->num_regs < num_regs + 1)
5088 {
5089 regs->num_regs = num_regs + 1;
5090 RETALLOC (regs->start, regs->num_regs, regoff_t);
5091 RETALLOC (regs->end, regs->num_regs, regoff_t);
5092 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
5093 {
5094 FREE_VARIABLES ();
5095 return -2;
5096 }
25fe55af
RS
5097 }
5098 }
5099 else
fa9a63c5
RM
5100 {
5101 /* These braces fend off a "empty body in an else-statement"
25fe55af 5102 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
5103 assert (bufp->regs_allocated == REGS_FIXED);
5104 }
5105
25fe55af
RS
5106 /* Convert the pointer data in `regstart' and `regend' to
5107 indices. Register zero has to be set differently,
5108 since we haven't kept track of any info for it. */
5109 if (regs->num_regs > 0)
5110 {
5111 regs->start[0] = pos;
99633e97 5112 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 5113 }
5e69f11e 5114
25fe55af
RS
5115 /* Go through the first `min (num_regs, regs->num_regs)'
5116 registers, since that is all we initialized. */
01618498 5117 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
fa9a63c5 5118 {
01618498
SM
5119 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5120 regs->start[reg] = regs->end[reg] = -1;
25fe55af
RS
5121 else
5122 {
01618498
SM
5123 regs->start[reg]
5124 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5125 regs->end[reg]
5126 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
25fe55af 5127 }
fa9a63c5 5128 }
5e69f11e 5129
25fe55af
RS
5130 /* If the regs structure we return has more elements than
5131 were in the pattern, set the extra elements to -1. If
5132 we (re)allocated the registers, this is the case,
5133 because we always allocate enough to have at least one
5134 -1 at the end. */
01618498
SM
5135 for (reg = num_regs; reg < regs->num_regs; reg++)
5136 regs->start[reg] = regs->end[reg] = -1;
fa9a63c5
RM
5137 } /* regs && !bufp->no_sub */
5138
25fe55af
RS
5139 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5140 nfailure_points_pushed, nfailure_points_popped,
5141 nfailure_points_pushed - nfailure_points_popped);
5142 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 5143
99633e97 5144 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 5145
25fe55af 5146 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 5147
25fe55af
RS
5148 FREE_VARIABLES ();
5149 return mcnt;
5150 }
fa9a63c5 5151
25fe55af 5152 /* Otherwise match next pattern command. */
fa9a63c5
RM
5153 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5154 {
25fe55af
RS
5155 /* Ignore these. Used to ignore the n of succeed_n's which
5156 currently have n == 0. */
5157 case no_op:
5158 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5159 break;
fa9a63c5
RM
5160
5161 case succeed:
25fe55af 5162 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
5163 goto succeed_label;
5164
25fe55af
RS
5165 /* Match the next n pattern characters exactly. The following
5166 byte in the pattern defines n, and the n bytes after that
5167 are the characters to match. */
fa9a63c5
RM
5168 case exactn:
5169 mcnt = *p++;
25fe55af 5170 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 5171
99633e97
SM
5172 /* Remember the start point to rollback upon failure. */
5173 dfail = d;
5174
25fe55af
RS
5175 /* This is written out as an if-else so we don't waste time
5176 testing `translate' inside the loop. */
28703c16 5177 if (RE_TRANSLATE_P (translate))
fa9a63c5 5178 {
e934739e
RS
5179 if (multibyte)
5180 do
5181 {
5182 int pat_charlen, buf_charlen;
e71b1971 5183 unsigned int pat_ch, buf_ch;
e934739e
RS
5184
5185 PREFETCH ();
5186 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
5187 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
5188
5189 if (RE_TRANSLATE (translate, buf_ch)
5190 != pat_ch)
99633e97
SM
5191 {
5192 d = dfail;
5193 goto fail;
5194 }
e934739e
RS
5195
5196 p += pat_charlen;
5197 d += buf_charlen;
5198 mcnt -= pat_charlen;
5199 }
5200 while (mcnt > 0);
5201 else
e934739e
RS
5202 do
5203 {
5204 PREFETCH ();
66f0296e 5205 if (RE_TRANSLATE (translate, *d) != *p++)
99633e97
SM
5206 {
5207 d = dfail;
5208 goto fail;
5209 }
33c46939 5210 d++;
e934739e
RS
5211 }
5212 while (--mcnt);
fa9a63c5
RM
5213 }
5214 else
5215 {
5216 do
5217 {
5218 PREFETCH ();
99633e97
SM
5219 if (*d++ != *p++)
5220 {
5221 d = dfail;
5222 goto fail;
5223 }
fa9a63c5
RM
5224 }
5225 while (--mcnt);
5226 }
25fe55af 5227 break;
fa9a63c5
RM
5228
5229
25fe55af 5230 /* Match any character except possibly a newline or a null. */
fa9a63c5 5231 case anychar:
e934739e
RS
5232 {
5233 int buf_charlen;
01618498 5234 re_wchar_t buf_ch;
fa9a63c5 5235
e934739e 5236 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 5237
e934739e 5238 PREFETCH ();
2d1675e4 5239 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
5240 buf_ch = TRANSLATE (buf_ch);
5241
5242 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5243 && buf_ch == '\n')
5244 || ((bufp->syntax & RE_DOT_NOT_NULL)
5245 && buf_ch == '\000'))
5246 goto fail;
5247
e934739e
RS
5248 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5249 d += buf_charlen;
5250 }
fa9a63c5
RM
5251 break;
5252
5253
5254 case charset:
5255 case charset_not:
5256 {
b18215fc 5257 register unsigned int c;
fa9a63c5 5258 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
5259 int len;
5260
5261 /* Start of actual range_table, or end of bitmap if there is no
5262 range table. */
01618498 5263 re_char *range_table;
b18215fc 5264
96cc36cc 5265 /* Nonzero if there is a range table. */
b18215fc
RS
5266 int range_table_exists;
5267
96cc36cc
RS
5268 /* Number of ranges of range table. This is not included
5269 in the initial byte-length of the command. */
5270 int count = 0;
fa9a63c5 5271
25fe55af 5272 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 5273
b18215fc 5274 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 5275
b18215fc 5276 if (range_table_exists)
96cc36cc
RS
5277 {
5278 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5279 EXTRACT_NUMBER_AND_INCR (count, range_table);
5280 }
b18215fc 5281
2d1675e4
SM
5282 PREFETCH ();
5283 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5284 c = TRANSLATE (c); /* The character to match. */
b18215fc
RS
5285
5286 if (SINGLE_BYTE_CHAR_P (c))
5287 { /* Lookup bitmap. */
b18215fc
RS
5288 /* Cast to `unsigned' instead of `unsigned char' in
5289 case the bit list is a full 32 bytes long. */
5290 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
5291 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5292 not = !not;
b18215fc 5293 }
96cc36cc 5294#ifdef emacs
b18215fc 5295 else if (range_table_exists)
96cc36cc
RS
5296 {
5297 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5298
14473664
SM
5299 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5300 | (class_bits & BIT_MULTIBYTE)
96cc36cc
RS
5301 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5302 | (class_bits & BIT_SPACE && ISSPACE (c))
5303 | (class_bits & BIT_UPPER && ISUPPER (c))
5304 | (class_bits & BIT_WORD && ISWORD (c)))
5305 not = !not;
5306 else
5307 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5308 }
5309#endif /* emacs */
fa9a63c5 5310
96cc36cc
RS
5311 if (range_table_exists)
5312 p = CHARSET_RANGE_TABLE_END (range_table, count);
5313 else
5314 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
5315
5316 if (!not) goto fail;
5e69f11e 5317
b18215fc 5318 d += len;
fa9a63c5
RM
5319 break;
5320 }
5321
5322
25fe55af 5323 /* The beginning of a group is represented by start_memory.
505bde11 5324 The argument is the register number. The text
25fe55af
RS
5325 matched within the group is recorded (in the internal
5326 registers data structure) under the register number. */
5327 case start_memory:
505bde11
SM
5328 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5329
5330 /* In case we need to undo this operation (via backtracking). */
5331 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 5332
25fe55af 5333 regstart[*p] = d;
4bb91c68 5334 regend[*p] = NULL; /* probably unnecessary. -sm */
fa9a63c5
RM
5335 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5336
25fe55af 5337 /* Move past the register number and inner group count. */
505bde11 5338 p += 1;
25fe55af 5339 break;
fa9a63c5
RM
5340
5341
25fe55af 5342 /* The stop_memory opcode represents the end of a group. Its
505bde11 5343 argument is the same as start_memory's: the register number. */
fa9a63c5 5344 case stop_memory:
505bde11
SM
5345 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5346
5347 assert (!REG_UNSET (regstart[*p]));
5348 /* Strictly speaking, there should be code such as:
5349
0b32bf0e 5350 assert (REG_UNSET (regend[*p]));
505bde11
SM
5351 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5352
5353 But the only info to be pushed is regend[*p] and it is known to
5354 be UNSET, so there really isn't anything to push.
5355 Not pushing anything, on the other hand deprives us from the
5356 guarantee that regend[*p] is UNSET since undoing this operation
5357 will not reset its value properly. This is not important since
5358 the value will only be read on the next start_memory or at
5359 the very end and both events can only happen if this stop_memory
5360 is *not* undone. */
fa9a63c5 5361
25fe55af 5362 regend[*p] = d;
fa9a63c5
RM
5363 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5364
25fe55af 5365 /* Move past the register number and the inner group count. */
505bde11 5366 p += 1;
25fe55af 5367 break;
fa9a63c5
RM
5368
5369
5370 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5371 followed by the numeric value of <digit> as the register number. */
5372 case duplicate:
fa9a63c5 5373 {
66f0296e 5374 register re_char *d2, *dend2;
25fe55af 5375 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5376 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5377
25fe55af
RS
5378 /* Can't back reference a group which we've never matched. */
5379 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5380 goto fail;
5e69f11e 5381
25fe55af
RS
5382 /* Where in input to try to start matching. */
5383 d2 = regstart[regno];
5e69f11e 5384
99633e97
SM
5385 /* Remember the start point to rollback upon failure. */
5386 dfail = d;
5387
25fe55af
RS
5388 /* Where to stop matching; if both the place to start and
5389 the place to stop matching are in the same string, then
5390 set to the place to stop, otherwise, for now have to use
5391 the end of the first string. */
fa9a63c5 5392
25fe55af 5393 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5394 == FIRST_STRING_P (regend[regno]))
5395 ? regend[regno] : end_match_1);
5396 for (;;)
5397 {
5398 /* If necessary, advance to next segment in register
25fe55af 5399 contents. */
fa9a63c5
RM
5400 while (d2 == dend2)
5401 {
5402 if (dend2 == end_match_2) break;
5403 if (dend2 == regend[regno]) break;
5404
25fe55af
RS
5405 /* End of string1 => advance to string2. */
5406 d2 = string2;
5407 dend2 = regend[regno];
fa9a63c5
RM
5408 }
5409 /* At end of register contents => success */
5410 if (d2 == dend2) break;
5411
5412 /* If necessary, advance to next segment in data. */
5413 PREFETCH ();
5414
5415 /* How many characters left in this segment to match. */
5416 mcnt = dend - d;
5e69f11e 5417
fa9a63c5 5418 /* Want how many consecutive characters we can match in
25fe55af
RS
5419 one shot, so, if necessary, adjust the count. */
5420 if (mcnt > dend2 - d2)
fa9a63c5 5421 mcnt = dend2 - d2;
5e69f11e 5422
fa9a63c5 5423 /* Compare that many; failure if mismatch, else move
25fe55af 5424 past them. */
28703c16 5425 if (RE_TRANSLATE_P (translate)
2d1675e4 5426 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
4bb91c68 5427 : memcmp (d, d2, mcnt))
99633e97
SM
5428 {
5429 d = dfail;
5430 goto fail;
5431 }
fa9a63c5 5432 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5433 }
5434 }
5435 break;
5436
5437
25fe55af 5438 /* begline matches the empty string at the beginning of the string
c0f9ea08 5439 (unless `not_bol' is set in `bufp'), and after newlines. */
fa9a63c5 5440 case begline:
25fe55af 5441 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5442
25fe55af
RS
5443 if (AT_STRINGS_BEG (d))
5444 {
5445 if (!bufp->not_bol) break;
5446 }
419d1c74 5447 else
25fe55af 5448 {
419d1c74
SM
5449 unsigned char c;
5450 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
c0f9ea08 5451 if (c == '\n')
419d1c74 5452 break;
25fe55af
RS
5453 }
5454 /* In all other cases, we fail. */
5455 goto fail;
fa9a63c5
RM
5456
5457
25fe55af 5458 /* endline is the dual of begline. */
fa9a63c5 5459 case endline:
25fe55af 5460 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5461
25fe55af
RS
5462 if (AT_STRINGS_END (d))
5463 {
5464 if (!bufp->not_eol) break;
5465 }
f1ad044f 5466 else
25fe55af 5467 {
f1ad044f 5468 PREFETCH_NOLIMIT ();
c0f9ea08 5469 if (*d == '\n')
f1ad044f 5470 break;
25fe55af
RS
5471 }
5472 goto fail;
fa9a63c5
RM
5473
5474
5475 /* Match at the very beginning of the data. */
25fe55af
RS
5476 case begbuf:
5477 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5478 if (AT_STRINGS_BEG (d))
5479 break;
5480 goto fail;
fa9a63c5
RM
5481
5482
5483 /* Match at the very end of the data. */
25fe55af
RS
5484 case endbuf:
5485 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5486 if (AT_STRINGS_END (d))
5487 break;
25fe55af 5488 goto fail;
5e69f11e 5489
5e69f11e 5490
25fe55af
RS
5491 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5492 pushes NULL as the value for the string on the stack. Then
505bde11 5493 `POP_FAILURE_POINT' will keep the current value for the
25fe55af
RS
5494 string, instead of restoring it. To see why, consider
5495 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5496 then the . fails against the \n. But the next thing we want
5497 to do is match the \n against the \n; if we restored the
5498 string value, we would be back at the foo.
5499
5500 Because this is used only in specific cases, we don't need to
5501 check all the things that `on_failure_jump' does, to make
5502 sure the right things get saved on the stack. Hence we don't
5503 share its code. The only reason to push anything on the
5504 stack at all is that otherwise we would have to change
5505 `anychar's code to do something besides goto fail in this
5506 case; that seems worse than this. */
5507 case on_failure_keep_string_jump:
505bde11
SM
5508 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5509 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5510 mcnt, p + mcnt);
fa9a63c5 5511
505bde11
SM
5512 PUSH_FAILURE_POINT (p - 3, NULL);
5513 break;
5514
0683b6fa
SM
5515 /* A nasty loop is introduced by the non-greedy *? and +?.
5516 With such loops, the stack only ever contains one failure point
5517 at a time, so that a plain on_failure_jump_loop kind of
5518 cycle detection cannot work. Worse yet, such a detection
5519 can not only fail to detect a cycle, but it can also wrongly
5520 detect a cycle (between different instantiations of the same
5521 loop.
5522 So the method used for those nasty loops is a little different:
5523 We use a special cycle-detection-stack-frame which is pushed
5524 when the on_failure_jump_nastyloop failure-point is *popped*.
5525 This special frame thus marks the beginning of one iteration
5526 through the loop and we can hence easily check right here
5527 whether something matched between the beginning and the end of
5528 the loop. */
5529 case on_failure_jump_nastyloop:
5530 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5531 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5532 mcnt, p + mcnt);
5533
5534 assert ((re_opcode_t)p[-4] == no_op);
5535 CHECK_INFINITE_LOOP (p - 4, d);
5536 PUSH_FAILURE_POINT (p - 3, d);
5537 break;
5538
505bde11 5539
4e8a9132
SM
5540 /* Simple loop detecting on_failure_jump: just check on the
5541 failure stack if the same spot was already hit earlier. */
505bde11
SM
5542 case on_failure_jump_loop:
5543 on_failure:
5544 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5545 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5546 mcnt, p + mcnt);
5547
5548 CHECK_INFINITE_LOOP (p - 3, d);
5549 PUSH_FAILURE_POINT (p - 3, d);
25fe55af 5550 break;
fa9a63c5
RM
5551
5552
5553 /* Uses of on_failure_jump:
5e69f11e 5554
25fe55af
RS
5555 Each alternative starts with an on_failure_jump that points
5556 to the beginning of the next alternative. Each alternative
5557 except the last ends with a jump that in effect jumps past
5558 the rest of the alternatives. (They really jump to the
5559 ending jump of the following alternative, because tensioning
5560 these jumps is a hassle.)
fa9a63c5 5561
25fe55af
RS
5562 Repeats start with an on_failure_jump that points past both
5563 the repetition text and either the following jump or
5564 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5565 case on_failure_jump:
5b370c2b 5566 IMMEDIATE_QUIT_CHECK;
25fe55af 5567 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5568 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5569 mcnt, p + mcnt);
25fe55af 5570
505bde11 5571 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5572 break;
5573
4e8a9132 5574 /* This operation is used for greedy *.
505bde11
SM
5575 Compare the beginning of the repeat with what in the
5576 pattern follows its end. If we can establish that there
5577 is nothing that they would both match, i.e., that we
5578 would have to backtrack because of (as in, e.g., `a*a')
5579 then we can use a non-backtracking loop based on
4e8a9132 5580 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5581 case on_failure_jump_smart:
5b370c2b 5582 IMMEDIATE_QUIT_CHECK;
25fe55af 5583 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5584 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5585 mcnt, p + mcnt);
25fe55af 5586 {
01618498 5587 re_char *p1 = p; /* Next operation. */
6dcf2d0e
SM
5588 /* Here, we discard `const', making re_match non-reentrant. */
5589 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5590 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
fa9a63c5 5591
505bde11
SM
5592 p -= 3; /* Reset so that we will re-execute the
5593 instruction once it's been changed. */
fa9a63c5 5594
4e8a9132
SM
5595 EXTRACT_NUMBER (mcnt, p2 - 2);
5596
5597 /* Ensure this is a indeed the trivial kind of loop
5598 we are expecting. */
5599 assert (skip_one_char (p1) == p2 - 3);
5600 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5601 DEBUG_STATEMENT (debug += 2);
505bde11 5602 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5603 {
505bde11 5604 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132 5605 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
01618498 5606 *p3 = (unsigned char) on_failure_keep_string_jump;
4e8a9132 5607 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5608 }
505bde11 5609 else
fa9a63c5 5610 {
505bde11
SM
5611 /* Default to a safe `on_failure_jump' loop. */
5612 DEBUG_PRINT1 (" smart default => slow loop.\n");
01618498 5613 *p3 = (unsigned char) on_failure_jump;
fa9a63c5 5614 }
99633e97 5615 DEBUG_STATEMENT (debug -= 2);
25fe55af 5616 }
505bde11 5617 break;
25fe55af
RS
5618
5619 /* Unconditionally jump (without popping any failure points). */
5620 case jump:
fa9a63c5 5621 unconditional_jump:
5b370c2b 5622 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5623 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5624 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5625 p += mcnt; /* Do the jump. */
505bde11 5626 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5627 break;
5628
5629
25fe55af
RS
5630 /* Have to succeed matching what follows at least n times.
5631 After that, handle like `on_failure_jump'. */
5632 case succeed_n:
01618498 5633 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5634 EXTRACT_NUMBER (mcnt, p + 2);
5635 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5636
dc1e502d
SM
5637 /* Originally, mcnt is how many times we HAVE to succeed. */
5638 if (mcnt != 0)
25fe55af 5639 {
6dcf2d0e
SM
5640 /* Here, we discard `const', making re_match non-reentrant. */
5641 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5642 mcnt--;
01618498
SM
5643 p += 4;
5644 PUSH_NUMBER (p2, mcnt);
25fe55af 5645 }
dc1e502d
SM
5646 else
5647 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5648 goto on_failure;
25fe55af
RS
5649 break;
5650
5651 case jump_n:
01618498 5652 /* Signedness doesn't matter since we only compare MCNT to 0. */
25fe55af
RS
5653 EXTRACT_NUMBER (mcnt, p + 2);
5654 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5655
5656 /* Originally, this is how many times we CAN jump. */
dc1e502d 5657 if (mcnt != 0)
25fe55af 5658 {
6dcf2d0e
SM
5659 /* Here, we discard `const', making re_match non-reentrant. */
5660 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
dc1e502d 5661 mcnt--;
01618498 5662 PUSH_NUMBER (p2, mcnt);
dc1e502d 5663 goto unconditional_jump;
25fe55af
RS
5664 }
5665 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5666 else
5667 p += 4;
25fe55af 5668 break;
5e69f11e 5669
fa9a63c5
RM
5670 case set_number_at:
5671 {
01618498 5672 unsigned char *p2; /* Location of the counter. */
25fe55af 5673 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5674
25fe55af 5675 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6dcf2d0e
SM
5676 /* Here, we discard `const', making re_match non-reentrant. */
5677 p2 = (unsigned char*) p + mcnt;
01618498 5678 /* Signedness doesn't matter since we only copy MCNT's bits . */
25fe55af 5679 EXTRACT_NUMBER_AND_INCR (mcnt, p);
01618498
SM
5680 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5681 PUSH_NUMBER (p2, mcnt);
25fe55af
RS
5682 break;
5683 }
9121ca40
KH
5684
5685 case wordbound:
66f0296e
SM
5686 case notwordbound:
5687 not = (re_opcode_t) *(p - 1) == notwordbound;
5688 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5689
99633e97 5690 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5691
b18215fc 5692 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5693 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5694 not = !not;
b18215fc
RS
5695 else
5696 {
5697 /* C1 is the character before D, S1 is the syntax of C1, C2
5698 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5699 re_wchar_t c1, c2;
5700 int s1, s2;
b18215fc 5701#ifdef emacs
2d1675e4
SM
5702 int offset = PTR_TO_OFFSET (d - 1);
5703 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5704 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5705#endif
66f0296e 5706 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5707 s1 = SYNTAX (c1);
5708#ifdef emacs
5d967c7a 5709 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5710#endif
f1ad044f 5711 PREFETCH_NOLIMIT ();
2d1675e4 5712 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc
RS
5713 s2 = SYNTAX (c2);
5714
5715 if (/* Case 2: Only one of S1 and S2 is Sword. */
5716 ((s1 == Sword) != (s2 == Sword))
5717 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5718 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5719 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5720 not = !not;
5721 }
5722 if (not)
9121ca40 5723 break;
b18215fc 5724 else
9121ca40 5725 goto fail;
fa9a63c5
RM
5726
5727 case wordbeg:
25fe55af 5728 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5729
b18215fc
RS
5730 /* We FAIL in one of the following cases: */
5731
25fe55af 5732 /* Case 1: D is at the end of string. */
b18215fc 5733 if (AT_STRINGS_END (d))
99633e97 5734 goto fail;
b18215fc
RS
5735 else
5736 {
5737 /* C1 is the character before D, S1 is the syntax of C1, C2
5738 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5739 re_wchar_t c1, c2;
5740 int s1, s2;
fa9a63c5 5741#ifdef emacs
2d1675e4
SM
5742 int offset = PTR_TO_OFFSET (d);
5743 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5744 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5745#endif
99633e97 5746 PREFETCH ();
2d1675e4 5747 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc 5748 s2 = SYNTAX (c2);
25fe55af 5749
b18215fc
RS
5750 /* Case 2: S2 is not Sword. */
5751 if (s2 != Sword)
5752 goto fail;
5753
5754 /* Case 3: D is not at the beginning of string ... */
5755 if (!AT_STRINGS_BEG (d))
5756 {
5757 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5758#ifdef emacs
5d967c7a 5759 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5760#endif
b18215fc
RS
5761 s1 = SYNTAX (c1);
5762
5763 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5764 returns 0. */
b18215fc
RS
5765 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5766 goto fail;
5767 }
5768 }
e318085a
RS
5769 break;
5770
b18215fc 5771 case wordend:
25fe55af 5772 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5773
5774 /* We FAIL in one of the following cases: */
5775
5776 /* Case 1: D is at the beginning of string. */
5777 if (AT_STRINGS_BEG (d))
e318085a 5778 goto fail;
b18215fc
RS
5779 else
5780 {
5781 /* C1 is the character before D, S1 is the syntax of C1, C2
5782 is the character at D, and S2 is the syntax of C2. */
01618498
SM
5783 re_wchar_t c1, c2;
5784 int s1, s2;
5d967c7a 5785#ifdef emacs
2d1675e4
SM
5786 int offset = PTR_TO_OFFSET (d) - 1;
5787 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5788 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5789#endif
99633e97 5790 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5791 s1 = SYNTAX (c1);
5792
5793 /* Case 2: S1 is not Sword. */
5794 if (s1 != Sword)
5795 goto fail;
5796
5797 /* Case 3: D is not at the end of string ... */
5798 if (!AT_STRINGS_END (d))
5799 {
f1ad044f 5800 PREFETCH_NOLIMIT ();
2d1675e4 5801 c2 = RE_STRING_CHAR (d, dend - d);
5d967c7a
RS
5802#ifdef emacs
5803 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5804#endif
b18215fc
RS
5805 s2 = SYNTAX (c2);
5806
5807 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5808 returns 0. */
b18215fc 5809 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5810 goto fail;
b18215fc
RS
5811 }
5812 }
e318085a
RS
5813 break;
5814
fa9a63c5 5815 case syntaxspec:
1fb352e0
SM
5816 case notsyntaxspec:
5817 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 5818 mcnt = *p++;
1fb352e0 5819 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 5820 PREFETCH ();
b18215fc
RS
5821#ifdef emacs
5822 {
2d1675e4
SM
5823 int offset = PTR_TO_OFFSET (d);
5824 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
5825 UPDATE_SYNTAX_TABLE (pos1);
5826 }
25fe55af 5827#endif
b18215fc 5828 {
01618498
SM
5829 int len;
5830 re_wchar_t c;
b18215fc 5831
2d1675e4 5832 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 5833
990b2375 5834 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 5835 goto fail;
b18215fc
RS
5836 d += len;
5837 }
fa9a63c5
RM
5838 break;
5839
b18215fc 5840#ifdef emacs
1fb352e0
SM
5841 case before_dot:
5842 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5843 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 5844 goto fail;
b18215fc
RS
5845 break;
5846
1fb352e0
SM
5847 case at_dot:
5848 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5849 if (PTR_BYTE_POS (d) != PT_BYTE)
5850 goto fail;
5851 break;
b18215fc 5852
1fb352e0
SM
5853 case after_dot:
5854 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5855 if (PTR_BYTE_POS (d) <= PT_BYTE)
5856 goto fail;
e318085a 5857 break;
fa9a63c5 5858
1fb352e0 5859 case categoryspec:
b18215fc 5860 case notcategoryspec:
1fb352e0 5861 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 5862 mcnt = *p++;
1fb352e0 5863 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
5864 PREFETCH ();
5865 {
01618498
SM
5866 int len;
5867 re_wchar_t c;
5868
2d1675e4 5869 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 5870
1fb352e0 5871 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
5872 goto fail;
5873 d += len;
5874 }
fa9a63c5 5875 break;
5e69f11e 5876
1fb352e0 5877#endif /* emacs */
5e69f11e 5878
0b32bf0e
SM
5879 default:
5880 abort ();
fa9a63c5 5881 }
b18215fc 5882 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
5883
5884
5885 /* We goto here if a matching operation fails. */
5886 fail:
5b370c2b 5887 IMMEDIATE_QUIT_CHECK;
fa9a63c5 5888 if (!FAIL_STACK_EMPTY ())
505bde11 5889 {
01618498 5890 re_char *str, *pat;
505bde11 5891 /* A restart point is known. Restore to that state. */
0b32bf0e
SM
5892 DEBUG_PRINT1 ("\nFAIL:\n");
5893 POP_FAILURE_POINT (str, pat);
505bde11
SM
5894 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
5895 {
5896 case on_failure_keep_string_jump:
5897 assert (str == NULL);
5898 goto continue_failure_jump;
5899
0683b6fa
SM
5900 case on_failure_jump_nastyloop:
5901 assert ((re_opcode_t)pat[-2] == no_op);
5902 PUSH_FAILURE_POINT (pat - 2, str);
5903 /* Fallthrough */
5904
505bde11
SM
5905 case on_failure_jump_loop:
5906 case on_failure_jump:
5907 case succeed_n:
5908 d = str;
5909 continue_failure_jump:
5910 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5911 p = pat + mcnt;
5912 break;
b18215fc 5913
0683b6fa
SM
5914 case no_op:
5915 /* A special frame used for nastyloops. */
5916 goto fail;
5917
505bde11
SM
5918 default:
5919 abort();
5920 }
fa9a63c5 5921
505bde11 5922 assert (p >= bufp->buffer && p <= pend);
b18215fc 5923
0b32bf0e 5924 if (d >= string1 && d <= end1)
fa9a63c5 5925 dend = end_match_1;
0b32bf0e 5926 }
fa9a63c5 5927 else
0b32bf0e 5928 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
5929 } /* for (;;) */
5930
5931 if (best_regs_set)
5932 goto restore_best_regs;
5933
5934 FREE_VARIABLES ();
5935
b18215fc 5936 return -1; /* Failure to match. */
fa9a63c5
RM
5937} /* re_match_2 */
5938\f
5939/* Subroutine definitions for re_match_2. */
5940
fa9a63c5
RM
5941/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5942 bytes; nonzero otherwise. */
5e69f11e 5943
fa9a63c5 5944static int
2d1675e4
SM
5945bcmp_translate (s1, s2, len, translate, multibyte)
5946 re_char *s1, *s2;
fa9a63c5 5947 register int len;
6676cb1c 5948 RE_TRANSLATE_TYPE translate;
2d1675e4 5949 const int multibyte;
fa9a63c5 5950{
2d1675e4
SM
5951 register re_char *p1 = s1, *p2 = s2;
5952 re_char *p1_end = s1 + len;
5953 re_char *p2_end = s2 + len;
e934739e 5954
4bb91c68
SM
5955 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
5956 different lengths, but relying on a single `len' would break this. -sm */
5957 while (p1 < p1_end && p2 < p2_end)
fa9a63c5 5958 {
e934739e 5959 int p1_charlen, p2_charlen;
01618498 5960 re_wchar_t p1_ch, p2_ch;
e934739e 5961
2d1675e4
SM
5962 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
5963 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
e934739e
RS
5964
5965 if (RE_TRANSLATE (translate, p1_ch)
5966 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 5967 return 1;
e934739e
RS
5968
5969 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 5970 }
e934739e
RS
5971
5972 if (p1 != p1_end || p2 != p2_end)
5973 return 1;
5974
fa9a63c5
RM
5975 return 0;
5976}
5977\f
5978/* Entry points for GNU code. */
5979
5980/* re_compile_pattern is the GNU regular expression compiler: it
5981 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5982 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 5983
fa9a63c5
RM
5984 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5985 are set in BUFP on entry.
5e69f11e 5986
b18215fc 5987 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
5988
5989const char *
5990re_compile_pattern (pattern, length, bufp)
5991 const char *pattern;
0b32bf0e 5992 size_t length;
fa9a63c5
RM
5993 struct re_pattern_buffer *bufp;
5994{
5995 reg_errcode_t ret;
5e69f11e 5996
fa9a63c5
RM
5997 /* GNU code is written to assume at least RE_NREGS registers will be set
5998 (and at least one extra will be -1). */
5999 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6000
fa9a63c5
RM
6001 /* And GNU code determines whether or not to get register information
6002 by passing null for the REGS argument to re_match, etc., not by
6003 setting no_sub. */
6004 bufp->no_sub = 0;
5e69f11e 6005
4bb91c68 6006 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
fa9a63c5
RM
6007
6008 if (!ret)
6009 return NULL;
6010 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6011}
c0f9ea08 6012WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
fa9a63c5 6013\f
b18215fc
RS
6014/* Entry points compatible with 4.2 BSD regex library. We don't define
6015 them unless specifically requested. */
fa9a63c5 6016
0b32bf0e 6017#if defined _REGEX_RE_COMP || defined _LIBC
fa9a63c5
RM
6018
6019/* BSD has one and only one pattern buffer. */
6020static struct re_pattern_buffer re_comp_buf;
6021
6022char *
0b32bf0e 6023# ifdef _LIBC
48afdd44
RM
6024/* Make these definitions weak in libc, so POSIX programs can redefine
6025 these names if they don't use our functions, and still use
6026 regcomp/regexec below without link errors. */
6027weak_function
0b32bf0e 6028# endif
fa9a63c5
RM
6029re_comp (s)
6030 const char *s;
6031{
6032 reg_errcode_t ret;
5e69f11e 6033
fa9a63c5
RM
6034 if (!s)
6035 {
6036 if (!re_comp_buf.buffer)
0b32bf0e 6037 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
a60198e5 6038 return (char *) gettext ("No previous regular expression");
fa9a63c5
RM
6039 return 0;
6040 }
6041
6042 if (!re_comp_buf.buffer)
6043 {
6044 re_comp_buf.buffer = (unsigned char *) malloc (200);
6045 if (re_comp_buf.buffer == NULL)
0b32bf0e
SM
6046 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6047 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6048 re_comp_buf.allocated = 200;
6049
6050 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6051 if (re_comp_buf.fastmap == NULL)
a60198e5
SM
6052 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6053 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6054 }
6055
6056 /* Since `re_exec' always passes NULL for the `regs' argument, we
6057 don't need to initialize the pattern buffer fields which affect it. */
6058
fa9a63c5 6059 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6060
fa9a63c5
RM
6061 if (!ret)
6062 return NULL;
6063
6064 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6065 return (char *) gettext (re_error_msgid[(int) ret]);
6066}
6067
6068
6069int
0b32bf0e 6070# ifdef _LIBC
48afdd44 6071weak_function
0b32bf0e 6072# endif
fa9a63c5
RM
6073re_exec (s)
6074 const char *s;
6075{
6076 const int len = strlen (s);
6077 return
6078 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6079}
6080#endif /* _REGEX_RE_COMP */
6081\f
6082/* POSIX.2 functions. Don't define these for Emacs. */
6083
6084#ifndef emacs
6085
6086/* regcomp takes a regular expression as a string and compiles it.
6087
b18215fc 6088 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6089 since POSIX says we shouldn't. Thus, we set
6090
6091 `buffer' to the compiled pattern;
6092 `used' to the length of the compiled pattern;
6093 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6094 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6095 RE_SYNTAX_POSIX_BASIC;
c0f9ea08
SM
6096 `fastmap' to an allocated space for the fastmap;
6097 `fastmap_accurate' to zero;
fa9a63c5
RM
6098 `re_nsub' to the number of subexpressions in PATTERN.
6099
6100 PATTERN is the address of the pattern string.
6101
6102 CFLAGS is a series of bits which affect compilation.
6103
6104 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6105 use POSIX basic syntax.
6106
6107 If REG_NEWLINE is set, then . and [^...] don't match newline.
6108 Also, regexec will try a match beginning after every newline.
6109
6110 If REG_ICASE is set, then we considers upper- and lowercase
6111 versions of letters to be equivalent when matching.
6112
6113 If REG_NOSUB is set, then when PREG is passed to regexec, that
6114 routine will report only success or failure, and nothing about the
6115 registers.
6116
b18215fc 6117 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6118 the return codes and their meanings.) */
6119
6120int
6121regcomp (preg, pattern, cflags)
ada30c0e
SM
6122 regex_t *__restrict preg;
6123 const char *__restrict pattern;
fa9a63c5
RM
6124 int cflags;
6125{
6126 reg_errcode_t ret;
4bb91c68 6127 reg_syntax_t syntax
fa9a63c5
RM
6128 = (cflags & REG_EXTENDED) ?
6129 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6130
6131 /* regex_compile will allocate the space for the compiled pattern. */
6132 preg->buffer = 0;
6133 preg->allocated = 0;
6134 preg->used = 0;
5e69f11e 6135
c0f9ea08
SM
6136 /* Try to allocate space for the fastmap. */
6137 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
5e69f11e 6138
fa9a63c5
RM
6139 if (cflags & REG_ICASE)
6140 {
6141 unsigned i;
5e69f11e 6142
6676cb1c
RS
6143 preg->translate
6144 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6145 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6146 if (preg->translate == NULL)
0b32bf0e 6147 return (int) REG_ESPACE;
fa9a63c5
RM
6148
6149 /* Map uppercase characters to corresponding lowercase ones. */
6150 for (i = 0; i < CHAR_SET_SIZE; i++)
4bb91c68 6151 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
fa9a63c5
RM
6152 }
6153 else
6154 preg->translate = NULL;
6155
6156 /* If REG_NEWLINE is set, newlines are treated differently. */
6157 if (cflags & REG_NEWLINE)
6158 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6159 syntax &= ~RE_DOT_NEWLINE;
6160 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
fa9a63c5
RM
6161 }
6162 else
c0f9ea08 6163 syntax |= RE_NO_NEWLINE_ANCHOR;
fa9a63c5
RM
6164
6165 preg->no_sub = !!(cflags & REG_NOSUB);
6166
5e69f11e 6167 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5 6168 can use strlen here in compiling the pattern. */
4bb91c68 6169 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
5e69f11e 6170
fa9a63c5
RM
6171 /* POSIX doesn't distinguish between an unmatched open-group and an
6172 unmatched close-group: both are REG_EPAREN. */
c0f9ea08
SM
6173 if (ret == REG_ERPAREN)
6174 ret = REG_EPAREN;
6175
6176 if (ret == REG_NOERROR && preg->fastmap)
6177 { /* Compute the fastmap now, since regexec cannot modify the pattern
6178 buffer. */
6179 re_compile_fastmap (preg);
6180 if (preg->can_be_null)
6181 { /* The fastmap can't be used anyway. */
6182 free (preg->fastmap);
6183 preg->fastmap = NULL;
6184 }
6185 }
fa9a63c5
RM
6186 return (int) ret;
6187}
c0f9ea08 6188WEAK_ALIAS (__regcomp, regcomp)
fa9a63c5
RM
6189
6190
6191/* regexec searches for a given pattern, specified by PREG, in the
6192 string STRING.
5e69f11e 6193
fa9a63c5 6194 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6195 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6196 least NMATCH elements, and we set them to the offsets of the
6197 corresponding matched substrings.
5e69f11e 6198
fa9a63c5
RM
6199 EFLAGS specifies `execution flags' which affect matching: if
6200 REG_NOTBOL is set, then ^ does not match at the beginning of the
6201 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6202
fa9a63c5
RM
6203 We return 0 if we find a match and REG_NOMATCH if not. */
6204
6205int
6206regexec (preg, string, nmatch, pmatch, eflags)
ada30c0e
SM
6207 const regex_t *__restrict preg;
6208 const char *__restrict string;
5e69f11e
RM
6209 size_t nmatch;
6210 regmatch_t pmatch[];
fa9a63c5
RM
6211 int eflags;
6212{
6213 int ret;
6214 struct re_registers regs;
6215 regex_t private_preg;
6216 int len = strlen (string);
c0f9ea08 6217 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
fa9a63c5
RM
6218
6219 private_preg = *preg;
5e69f11e 6220
fa9a63c5
RM
6221 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6222 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6223
fa9a63c5
RM
6224 /* The user has told us exactly how many registers to return
6225 information about, via `nmatch'. We have to pass that on to the
b18215fc 6226 matching routines. */
fa9a63c5 6227 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6228
fa9a63c5
RM
6229 if (want_reg_info)
6230 {
6231 regs.num_regs = nmatch;
4bb91c68
SM
6232 regs.start = TALLOC (nmatch * 2, regoff_t);
6233 if (regs.start == NULL)
0b32bf0e 6234 return (int) REG_NOMATCH;
4bb91c68 6235 regs.end = regs.start + nmatch;
fa9a63c5
RM
6236 }
6237
c0f9ea08
SM
6238 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6239 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6240 was a little bit longer but still only matching the real part.
6241 This works because the `endline' will check for a '\n' and will find a
6242 '\0', correctly deciding that this is not the end of a line.
6243 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6244 a convenient '\0' there. For all we know, the string could be preceded
6245 by '\n' which would throw things off. */
6246
fa9a63c5
RM
6247 /* Perform the searching operation. */
6248 ret = re_search (&private_preg, string, len,
0b32bf0e
SM
6249 /* start: */ 0, /* range: */ len,
6250 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6251
fa9a63c5
RM
6252 /* Copy the register information to the POSIX structure. */
6253 if (want_reg_info)
6254 {
6255 if (ret >= 0)
0b32bf0e
SM
6256 {
6257 unsigned r;
fa9a63c5 6258
0b32bf0e
SM
6259 for (r = 0; r < nmatch; r++)
6260 {
6261 pmatch[r].rm_so = regs.start[r];
6262 pmatch[r].rm_eo = regs.end[r];
6263 }
6264 }
fa9a63c5 6265
b18215fc 6266 /* If we needed the temporary register info, free the space now. */
fa9a63c5 6267 free (regs.start);
fa9a63c5
RM
6268 }
6269
6270 /* We want zero return to mean success, unlike `re_search'. */
6271 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6272}
c0f9ea08 6273WEAK_ALIAS (__regexec, regexec)
fa9a63c5
RM
6274
6275
6276/* Returns a message corresponding to an error code, ERRCODE, returned
6277 from either regcomp or regexec. We don't use PREG here. */
6278
6279size_t
6280regerror (errcode, preg, errbuf, errbuf_size)
6281 int errcode;
6282 const regex_t *preg;
6283 char *errbuf;
6284 size_t errbuf_size;
6285{
6286 const char *msg;
6287 size_t msg_size;
6288
6289 if (errcode < 0
6290 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6291 /* Only error codes returned by the rest of the code should be passed
b18215fc 6292 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6293 code generates an invalid error code, then the program has a bug.
6294 Dump core so we can fix it. */
6295 abort ();
6296
6297 msg = gettext (re_error_msgid[errcode]);
6298
6299 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6300
fa9a63c5
RM
6301 if (errbuf_size != 0)
6302 {
6303 if (msg_size > errbuf_size)
0b32bf0e
SM
6304 {
6305 strncpy (errbuf, msg, errbuf_size - 1);
6306 errbuf[errbuf_size - 1] = 0;
6307 }
fa9a63c5 6308 else
0b32bf0e 6309 strcpy (errbuf, msg);
fa9a63c5
RM
6310 }
6311
6312 return msg_size;
6313}
c0f9ea08 6314WEAK_ALIAS (__regerror, regerror)
fa9a63c5
RM
6315
6316
6317/* Free dynamically allocated space used by PREG. */
6318
6319void
6320regfree (preg)
6321 regex_t *preg;
6322{
6323 if (preg->buffer != NULL)
6324 free (preg->buffer);
6325 preg->buffer = NULL;
5e69f11e 6326
fa9a63c5
RM
6327 preg->allocated = 0;
6328 preg->used = 0;
6329
6330 if (preg->fastmap != NULL)
6331 free (preg->fastmap);
6332 preg->fastmap = NULL;
6333 preg->fastmap_accurate = 0;
6334
6335 if (preg->translate != NULL)
6336 free (preg->translate);
6337 preg->translate = NULL;
6338}
c0f9ea08 6339WEAK_ALIAS (__regfree, regfree)
fa9a63c5
RM
6340
6341#endif /* not emacs */