(JIT Lock Mode): jit-lock-mode is not a command any more.
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a
RS
1/* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
bc78d348
KB
3 internationalization features.)
4
505bde11 5 Copyright (C) 1993,94,95,96,97,98,2000 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
ba4a8e51 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25fe55af 20 USA. */
fa9a63c5 21
505bde11 22/* TODO:
505bde11 23 - structure the opcode space into opcode+flag.
f6a3f532
SM
24 - merge with glibc's regex.[ch]
25 */
505bde11 26
fa9a63c5
RM
27/* AIX requires this to be the first thing in the file. */
28#if defined (_AIX) && !defined (REGEX_MALLOC)
29 #pragma alloca
30#endif
31
68d96f02 32#undef _GNU_SOURCE
fa9a63c5
RM
33#define _GNU_SOURCE
34
51352796 35#ifdef emacs
25fe55af 36/* Converts the pointer to the char to BEG-based offset from the start. */
99633e97 37#define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
7e95234e 38#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
51352796 39#endif
b18215fc 40
fa9a63c5
RM
41#ifdef HAVE_CONFIG_H
42#include <config.h>
43#endif
44
25fe55af 45/* We need this for `regex.h', and perhaps for the Emacs include files. */
fa9a63c5
RM
46#include <sys/types.h>
47
25fe55af 48/* This is for other GNU distributions with internationalized messages. */
fa9a63c5
RM
49#if HAVE_LIBINTL_H || defined (_LIBC)
50# include <libintl.h>
51#else
52# define gettext(msgid) (msgid)
53#endif
54
5e69f11e
RM
55#ifndef gettext_noop
56/* This define is so xgettext can find the internationalizable
57 strings. */
58#define gettext_noop(String) String
59#endif
60
fa9a63c5
RM
61/* The `emacs' switch turns on certain matching commands
62 that make sense only in Emacs. */
63#ifdef emacs
64
65#include "lisp.h"
66#include "buffer.h"
b18215fc
RS
67
68/* Make syntax table lookup grant data in gl_state. */
69#define SYNTAX_ENTRY_VIA_PROPERTY
70
fa9a63c5 71#include "syntax.h"
b18215fc
RS
72#include "charset.h"
73#include "category.h"
fa9a63c5 74
9abbd165 75#define malloc xmalloc
64e3c718 76#define realloc xrealloc
9abbd165
RS
77#define free xfree
78
2d1675e4 79#define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
4e8a9132
SM
80#define RE_STRING_CHAR(p, s) \
81 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
2d1675e4
SM
82#define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
83 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
84
85/* Set C a (possibly multibyte) character before P. P points into a
86 string which is the virtual concatenation of STR1 (which ends at
87 END1) or STR2 (which ends at END2). */
88#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
89 do { \
90 if (multibyte) \
91 { \
92 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
93 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
94 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
95 c = STRING_CHAR (dtemp, (p) - dtemp); \
96 } \
97 else \
98 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
99 } while (0)
100
4e8a9132 101
fa9a63c5
RM
102#else /* not emacs */
103
104/* If we are not linking with Emacs proper,
105 we can't use the relocating allocator
106 even if config.h says that we can. */
107#undef REL_ALLOC
108
109#if defined (STDC_HEADERS) || defined (_LIBC)
110#include <stdlib.h>
111#else
112char *malloc ();
113char *realloc ();
114#endif
115
9e4ecb26 116/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
25fe55af 117 If nothing else has been done, use the method below. */
9e4ecb26
KH
118#ifdef INHIBIT_STRING_HEADER
119#if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
120#if !defined (bzero) && !defined (bcopy)
121#undef INHIBIT_STRING_HEADER
122#endif
123#endif
124#endif
125
126/* This is the normal way of making sure we have a bcopy and a bzero.
127 This is used in most programs--a few other programs avoid this
128 by defining INHIBIT_STRING_HEADER. */
fa9a63c5 129#ifndef INHIBIT_STRING_HEADER
7f998252 130#if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
fa9a63c5
RM
131#include <string.h>
132#ifndef bcmp
133#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
134#endif
135#ifndef bcopy
136#define bcopy(s, d, n) memcpy ((d), (s), (n))
137#endif
138#ifndef bzero
139#define bzero(s, n) memset ((s), 0, (n))
140#endif
141#else
142#include <strings.h>
143#endif
144#endif
145
146/* Define the syntax stuff for \<, \>, etc. */
147
990b2375
SM
148/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
149enum syntaxcode { Swhitespace = 0, Sword = 1 };
fa9a63c5
RM
150
151#ifdef SWITCH_ENUM_BUG
152#define SWITCH_ENUM_CAST(x) ((int)(x))
153#else
154#define SWITCH_ENUM_CAST(x) (x)
155#endif
156
157#ifdef SYNTAX_TABLE
158
159extern char *re_syntax_table;
160
161#else /* not SYNTAX_TABLE */
162
163/* How many characters in the character set. */
164#define CHAR_SET_SIZE 256
165
166static char re_syntax_table[CHAR_SET_SIZE];
167
168static void
169init_syntax_once ()
170{
171 register int c;
172 static int done = 0;
173
174 if (done)
175 return;
176
177 bzero (re_syntax_table, sizeof re_syntax_table);
178
179 for (c = 'a'; c <= 'z'; c++)
180 re_syntax_table[c] = Sword;
181
182 for (c = 'A'; c <= 'Z'; c++)
183 re_syntax_table[c] = Sword;
184
185 for (c = '0'; c <= '9'; c++)
186 re_syntax_table[c] = Sword;
187
188 re_syntax_table['_'] = Sword;
189
190 done = 1;
191}
192
193#endif /* not SYNTAX_TABLE */
194
195#define SYNTAX(c) re_syntax_table[c]
196
e934739e 197/* Dummy macros for non-Emacs environments. */
b18215fc 198#define BASE_LEADING_CODE_P(c) (0)
1fb352e0
SM
199#define CHAR_CHARSET(c) 0
200#define CHARSET_LEADING_CODE_BASE(c) 0
2d1675e4
SM
201#define MAX_MULTIBYTE_LENGTH 1
202#define RE_MULTIBYTE_P(x) 0
b18215fc
RS
203#define WORD_BOUNDARY_P(c1, c2) (0)
204#define CHAR_HEAD_P(p) (1)
205#define SINGLE_BYTE_CHAR_P(c) (1)
206#define SAME_CHARSET_P(c1, c2) (1)
207#define MULTIBYTE_FORM_LENGTH(p, s) (1)
208#define STRING_CHAR(p, s) (*(p))
4e8a9132 209#define RE_STRING_CHAR STRING_CHAR
2d1675e4 210#define CHAR_STRING(c, s) (*(s) = (c), 1)
b18215fc 211#define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
2d1675e4 212#define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
b18215fc
RS
213#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
214 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
b54f61ed 215#define MAKE_CHAR(charset, c1, c2) (c1)
fa9a63c5 216#endif /* not emacs */
4e8a9132
SM
217
218#ifndef RE_TRANSLATE
219#define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
220#define RE_TRANSLATE_P(TBL) (TBL)
221#endif
fa9a63c5
RM
222\f
223/* Get the interface, including the syntax bits. */
224#include "regex.h"
225
f71b19b6
DL
226/* isalpha etc. are used for the character classes. */
227#include <ctype.h>
fa9a63c5 228
f71b19b6 229#ifdef emacs
fa9a63c5 230
f71b19b6
DL
231/* 1 if C is an ASCII character. */
232#define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 233
f71b19b6
DL
234/* 1 if C is a unibyte character. */
235#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 236
f71b19b6 237/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 238
f71b19b6
DL
239/* In Emacs, these are only used for single-byte characters. */
240#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
241#define ISCNTRL(c) ((c) < ' ')
242#define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
243 || ((c) >= 'a' && (c) <= 'f') \
244 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
245
246/* This is only used for single-byte characters. */
247#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
248
249/* The rest must handle multibyte characters. */
250
251#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 252 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
253 : 1)
254
255#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 256 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
257 : 1)
258
f71b19b6
DL
259#define ISALNUM(c) (IS_REAL_ASCII (c) \
260 ? (((c) >= 'a' && (c) <= 'z') \
261 || ((c) >= 'A' && (c) <= 'Z') \
262 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
263 : SYNTAX (c) == Sword)
264
f71b19b6
DL
265#define ISALPHA(c) (IS_REAL_ASCII (c) \
266 ? (((c) >= 'a' && (c) <= 'z') \
267 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
268 : SYNTAX (c) == Sword)
269
270#define ISLOWER(c) (LOWERCASEP (c))
271
f71b19b6
DL
272#define ISPUNCT(c) (IS_REAL_ASCII (c) \
273 ? ((c) > ' ' && (c) < 0177 \
274 && !(((c) >= 'a' && (c) <= 'z') \
275 || ((c) >= 'A' && (c) <= 'Z') \
276 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
277 : SYNTAX (c) != Sword)
278
279#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
280
281#define ISUPPER(c) (UPPERCASEP (c))
282
283#define ISWORD(c) (SYNTAX (c) == Sword)
284
285#else /* not emacs */
286
f71b19b6
DL
287/* Jim Meyering writes:
288
289 "... Some ctype macros are valid only for character codes that
290 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
291 using /bin/cc or gcc but without giving an ansi option). So, all
292 ctype uses should be through macros like ISPRINT... If
293 STDC_HEADERS is defined, then autoconf has verified that the ctype
294 macros don't need to be guarded with references to isascii. ...
295 Defining isascii to 1 should let any compiler worth its salt
296 eliminate the && through constant folding." */
297
298#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
299#define ISASCII(c) 1
300#else
301#define ISASCII(c) isascii(c)
302#endif
303
304/* 1 if C is an ASCII character. */
305#define IS_REAL_ASCII(c) ((c) < 0200)
306
307/* This distinction is not meaningful, except in Emacs. */
308#define ISUNIBYTE(c) 1
309
310#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
311#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
312#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
313
fa9a63c5
RM
314#ifdef isblank
315#define ISBLANK(c) (ISASCII (c) && isblank (c))
316#else
317#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
318#endif
319#ifdef isgraph
320#define ISGRAPH(c) (ISASCII (c) && isgraph (c))
321#else
322#define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
323#endif
324
325#define ISPRINT(c) (ISASCII (c) && isprint (c))
326#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
327#define ISALNUM(c) (ISASCII (c) && isalnum (c))
328#define ISALPHA(c) (ISASCII (c) && isalpha (c))
329#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
330#define ISLOWER(c) (ISASCII (c) && islower (c))
331#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
332#define ISSPACE(c) (ISASCII (c) && isspace (c))
333#define ISUPPER(c) (ISASCII (c) && isupper (c))
334#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
335
96cc36cc
RS
336#define ISWORD(c) ISALPHA(c)
337
338#endif /* not emacs */
339\f
fa9a63c5 340#ifndef NULL
075f06ec 341#define NULL (void *)0
fa9a63c5
RM
342#endif
343
344/* We remove any previous definition of `SIGN_EXTEND_CHAR',
345 since ours (we hope) works properly with all combinations of
346 machines, compilers, `char' and `unsigned char' argument types.
25fe55af 347 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
348#undef SIGN_EXTEND_CHAR
349#if __STDC__
350#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
351#else /* not __STDC__ */
352/* As in Harbison and Steele. */
353#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
354#endif
355\f
356/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
357 use `alloca' instead of `malloc'. This is because using malloc in
358 re_search* or re_match* could cause memory leaks when C-g is used in
359 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
360 the other hand, malloc is more portable, and easier to debug.
361
fa9a63c5
RM
362 Because we sometimes use alloca, some routines have to be macros,
363 not functions -- `alloca'-allocated space disappears at the end of the
364 function it is called in. */
365
366#ifdef REGEX_MALLOC
367
368#define REGEX_ALLOCATE malloc
369#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
370#define REGEX_FREE free
371
372#else /* not REGEX_MALLOC */
373
374/* Emacs already defines alloca, sometimes. */
375#ifndef alloca
376
377/* Make alloca work the best possible way. */
378#ifdef __GNUC__
379#define alloca __builtin_alloca
380#else /* not __GNUC__ */
381#if HAVE_ALLOCA_H
382#include <alloca.h>
383#else /* not __GNUC__ or HAVE_ALLOCA_H */
f3c4387f 384#if 0 /* It is a bad idea to declare alloca. We always cast the result. */
25fe55af 385#ifndef _AIX /* Already did AIX, up at the top. */
fa9a63c5
RM
386char *alloca ();
387#endif /* not _AIX */
f3c4387f 388#endif
5e69f11e 389#endif /* not HAVE_ALLOCA_H */
fa9a63c5
RM
390#endif /* not __GNUC__ */
391
392#endif /* not alloca */
393
394#define REGEX_ALLOCATE alloca
395
396/* Assumes a `char *destination' variable. */
397#define REGEX_REALLOCATE(source, osize, nsize) \
398 (destination = (char *) alloca (nsize), \
399 bcopy (source, destination, osize), \
400 destination)
401
402/* No need to do anything to free, after alloca. */
c2e1680a 403#define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
404
405#endif /* not REGEX_MALLOC */
406
407/* Define how to allocate the failure stack. */
408
33487cc8 409#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
4297555e 410
fa9a63c5
RM
411#define REGEX_ALLOCATE_STACK(size) \
412 r_alloc (&failure_stack_ptr, (size))
413#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
414 r_re_alloc (&failure_stack_ptr, (nsize))
415#define REGEX_FREE_STACK(ptr) \
416 r_alloc_free (&failure_stack_ptr)
417
4297555e 418#else /* not using relocating allocator */
fa9a63c5
RM
419
420#ifdef REGEX_MALLOC
421
422#define REGEX_ALLOCATE_STACK malloc
423#define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
424#define REGEX_FREE_STACK free
425
426#else /* not REGEX_MALLOC */
427
428#define REGEX_ALLOCATE_STACK alloca
429
430#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
431 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 432/* No need to explicitly free anything. */
ed0767d8 433#define REGEX_FREE_STACK(arg) ((void)0)
fa9a63c5
RM
434
435#endif /* not REGEX_MALLOC */
4297555e 436#endif /* not using relocating allocator */
fa9a63c5
RM
437
438
439/* True if `size1' is non-NULL and PTR is pointing anywhere inside
440 `string1' or just past its end. This works if PTR is NULL, which is
441 a good thing. */
25fe55af 442#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
443 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
444
445/* (Re)Allocate N items of type T using malloc, or fail. */
446#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
447#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
448#define RETALLOC_IF(addr, n, t) \
449 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
450#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
451
25fe55af 452#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
453
454#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
455
456#undef MAX
457#undef MIN
458#define MAX(a, b) ((a) > (b) ? (a) : (b))
459#define MIN(a, b) ((a) < (b) ? (a) : (b))
460
66f0296e
SM
461/* Type of source-pattern and string chars. */
462typedef const unsigned char re_char;
463
fa9a63c5
RM
464typedef char boolean;
465#define false 0
466#define true 1
467
468static int re_match_2_internal ();
469\f
470/* These are the command codes that appear in compiled regular
25fe55af 471 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
472 command code can specify any interpretation whatsoever for its
473 arguments. Zero bytes may appear in the compiled regular expression. */
474
475typedef enum
476{
477 no_op = 0,
478
25fe55af 479 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
480 succeed,
481
25fe55af 482 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
483 exactn,
484
25fe55af 485 /* Matches any (more or less) character. */
fa9a63c5
RM
486 anychar,
487
25fe55af
RS
488 /* Matches any one char belonging to specified set. First
489 following byte is number of bitmap bytes. Then come bytes
490 for a bitmap saying which chars are in. Bits in each byte
491 are ordered low-bit-first. A character is in the set if its
492 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
493 automatically not in the set.
494
495 If the length byte has the 0x80 bit set, then that stuff
496 is followed by a range table:
497 2 bytes of flags for character sets (low 8 bits, high 8 bits)
498 See RANGE_TABLE_WORK_BITS below.
499 2 bytes, the number of pairs that follow
500 pairs, each 2 multibyte characters,
501 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
502 charset,
503
25fe55af
RS
504 /* Same parameters as charset, but match any character that is
505 not one of those specified. */
fa9a63c5
RM
506 charset_not,
507
25fe55af
RS
508 /* Start remembering the text that is matched, for storing in a
509 register. Followed by one byte with the register number, in
510 the range 0 to one less than the pattern buffer's re_nsub
505bde11 511 field. */
fa9a63c5
RM
512 start_memory,
513
25fe55af
RS
514 /* Stop remembering the text that is matched and store it in a
515 memory register. Followed by one byte with the register
516 number, in the range 0 to one less than `re_nsub' in the
505bde11 517 pattern buffer. */
fa9a63c5
RM
518 stop_memory,
519
25fe55af
RS
520 /* Match a duplicate of something remembered. Followed by one
521 byte containing the register number. */
fa9a63c5
RM
522 duplicate,
523
25fe55af 524 /* Fail unless at beginning of line. */
fa9a63c5
RM
525 begline,
526
25fe55af 527 /* Fail unless at end of line. */
fa9a63c5
RM
528 endline,
529
25fe55af
RS
530 /* Succeeds if at beginning of buffer (if emacs) or at beginning
531 of string to be matched (if not). */
fa9a63c5
RM
532 begbuf,
533
25fe55af 534 /* Analogously, for end of buffer/string. */
fa9a63c5 535 endbuf,
5e69f11e 536
25fe55af 537 /* Followed by two byte relative address to which to jump. */
5e69f11e 538 jump,
fa9a63c5 539
25fe55af
RS
540 /* Followed by two-byte relative address of place to resume at
541 in case of failure. */
fa9a63c5 542 on_failure_jump,
5e69f11e 543
25fe55af
RS
544 /* Like on_failure_jump, but pushes a placeholder instead of the
545 current string position when executed. */
fa9a63c5 546 on_failure_keep_string_jump,
5e69f11e 547
505bde11
SM
548 /* Just like `on_failure_jump', except that it checks that we
549 don't get stuck in an infinite loop (matching an empty string
550 indefinitely). */
551 on_failure_jump_loop,
552
0683b6fa
SM
553 /* Just like `on_failure_jump_loop', except that it checks for
554 a different kind of loop (the kind that shows up with non-greedy
555 operators). This operation has to be immediately preceded
556 by a `no_op'. */
557 on_failure_jump_nastyloop,
558
505bde11
SM
559 /* A smart `on_failure_jump' used for greedy * and + operators.
560 It analyses the loop before which it is put and if the
561 loop does not require backtracking, it changes itself to
4e8a9132
SM
562 `on_failure_keep_string_jump' and short-circuits the loop,
563 else it just defaults to changing itself into `on_failure_jump'.
564 It assumes that it is pointing to just past a `jump'. */
505bde11 565 on_failure_jump_smart,
fa9a63c5 566
25fe55af 567 /* Followed by two-byte relative address and two-byte number n.
ed0767d8
SM
568 After matching N times, jump to the address upon failure.
569 Does not work if N starts at 0: use on_failure_jump_loop
570 instead. */
fa9a63c5
RM
571 succeed_n,
572
25fe55af
RS
573 /* Followed by two-byte relative address, and two-byte number n.
574 Jump to the address N times, then fail. */
fa9a63c5
RM
575 jump_n,
576
25fe55af
RS
577 /* Set the following two-byte relative address to the
578 subsequent two-byte number. The address *includes* the two
579 bytes of number. */
fa9a63c5
RM
580 set_number_at,
581
fa9a63c5
RM
582 wordbeg, /* Succeeds if at word beginning. */
583 wordend, /* Succeeds if at word end. */
584
585 wordbound, /* Succeeds if at a word boundary. */
1fb352e0 586 notwordbound, /* Succeeds if not at a word boundary. */
fa9a63c5
RM
587
588 /* Matches any character whose syntax is specified. Followed by
25fe55af 589 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
590 syntaxspec,
591
592 /* Matches any character whose syntax is not that specified. */
1fb352e0
SM
593 notsyntaxspec
594
595#ifdef emacs
596 ,before_dot, /* Succeeds if before point. */
597 at_dot, /* Succeeds if at point. */
598 after_dot, /* Succeeds if after point. */
b18215fc
RS
599
600 /* Matches any character whose category-set contains the specified
25fe55af
RS
601 category. The operator is followed by a byte which contains a
602 category code (mnemonic ASCII character). */
b18215fc
RS
603 categoryspec,
604
605 /* Matches any character whose category-set does not contain the
606 specified category. The operator is followed by a byte which
607 contains the category code (mnemonic ASCII character). */
608 notcategoryspec
fa9a63c5
RM
609#endif /* emacs */
610} re_opcode_t;
611\f
612/* Common operations on the compiled pattern. */
613
614/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
615
616#define STORE_NUMBER(destination, number) \
617 do { \
618 (destination)[0] = (number) & 0377; \
619 (destination)[1] = (number) >> 8; \
620 } while (0)
621
622/* Same as STORE_NUMBER, except increment DESTINATION to
623 the byte after where the number is stored. Therefore, DESTINATION
624 must be an lvalue. */
625
626#define STORE_NUMBER_AND_INCR(destination, number) \
627 do { \
628 STORE_NUMBER (destination, number); \
629 (destination) += 2; \
630 } while (0)
631
632/* Put into DESTINATION a number stored in two contiguous bytes starting
633 at SOURCE. */
634
635#define EXTRACT_NUMBER(destination, source) \
636 do { \
637 (destination) = *(source) & 0377; \
638 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
639 } while (0)
640
641#ifdef DEBUG
642static void
643extract_number (dest, source)
644 int *dest;
645 unsigned char *source;
646{
5e69f11e 647 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
648 *dest = *source & 0377;
649 *dest += temp << 8;
650}
651
25fe55af 652#ifndef EXTRACT_MACROS /* To debug the macros. */
fa9a63c5
RM
653#undef EXTRACT_NUMBER
654#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
655#endif /* not EXTRACT_MACROS */
656
657#endif /* DEBUG */
658
659/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
660 SOURCE must be an lvalue. */
661
662#define EXTRACT_NUMBER_AND_INCR(destination, source) \
663 do { \
664 EXTRACT_NUMBER (destination, source); \
25fe55af 665 (source) += 2; \
fa9a63c5
RM
666 } while (0)
667
668#ifdef DEBUG
669static void
670extract_number_and_incr (destination, source)
671 int *destination;
672 unsigned char **source;
5e69f11e 673{
fa9a63c5
RM
674 extract_number (destination, *source);
675 *source += 2;
676}
677
678#ifndef EXTRACT_MACROS
679#undef EXTRACT_NUMBER_AND_INCR
680#define EXTRACT_NUMBER_AND_INCR(dest, src) \
681 extract_number_and_incr (&dest, &src)
682#endif /* not EXTRACT_MACROS */
683
684#endif /* DEBUG */
685\f
b18215fc
RS
686/* Store a multibyte character in three contiguous bytes starting
687 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 688 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
689
690#define STORE_CHARACTER_AND_INCR(destination, character) \
691 do { \
692 (destination)[0] = (character) & 0377; \
693 (destination)[1] = ((character) >> 8) & 0377; \
694 (destination)[2] = (character) >> 16; \
695 (destination) += 3; \
696 } while (0)
697
698/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 699 starting at SOURCE. */
b18215fc
RS
700
701#define EXTRACT_CHARACTER(destination, source) \
702 do { \
703 (destination) = ((source)[0] \
704 | ((source)[1] << 8) \
705 | ((source)[2] << 16)); \
706 } while (0)
707
708
709/* Macros for charset. */
710
711/* Size of bitmap of charset P in bytes. P is a start of charset,
712 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
713#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
714
715/* Nonzero if charset P has range table. */
25fe55af 716#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
717
718/* Return the address of range table of charset P. But not the start
719 of table itself, but the before where the number of ranges is
96cc36cc
RS
720 stored. `2 +' means to skip re_opcode_t and size of bitmap,
721 and the 2 bytes of flags at the start of the range table. */
722#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
723
724/* Extract the bit flags that start a range table. */
725#define CHARSET_RANGE_TABLE_BITS(p) \
726 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
727 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
728
729/* Test if C is listed in the bitmap of charset P. */
730#define CHARSET_LOOKUP_BITMAP(p, c) \
731 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
732 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
733
734/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
735 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
736 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
737 and end. */
738#define CHARSET_RANGE_TABLE_END(range_table, count) \
739 ((range_table) + (count) * 2 * 3)
740
25fe55af 741/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
742 COUNT is number of ranges in RANGE_TABLE. */
743#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
744 do \
745 { \
746 int range_start, range_end; \
747 unsigned char *p; \
748 unsigned char *range_table_end \
749 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
750 \
751 for (p = (range_table); p < range_table_end; p += 2 * 3) \
752 { \
753 EXTRACT_CHARACTER (range_start, p); \
754 EXTRACT_CHARACTER (range_end, p + 3); \
755 \
756 if (range_start <= (c) && (c) <= range_end) \
757 { \
758 (not) = !(not); \
759 break; \
760 } \
761 } \
762 } \
763 while (0)
764
765/* Test if C is in range table of CHARSET. The flag NOT is negated if
766 C is listed in it. */
767#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
768 do \
769 { \
770 /* Number of ranges in range table. */ \
771 int count; \
772 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
773 \
774 EXTRACT_NUMBER_AND_INCR (count, range_table); \
775 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
776 } \
777 while (0)
778\f
fa9a63c5
RM
779/* If DEBUG is defined, Regex prints many voluminous messages about what
780 it is doing (if the variable `debug' is nonzero). If linked with the
781 main program in `iregex.c', you can enter patterns and strings
782 interactively. And if linked with the main program in `main.c' and
25fe55af 783 the other test files, you can run the already-written tests. */
fa9a63c5
RM
784
785#ifdef DEBUG
786
787/* We use standard I/O for debugging. */
788#include <stdio.h>
789
790/* It is useful to test things that ``must'' be true when debugging. */
791#include <assert.h>
792
99633e97 793static int debug = -100000;
fa9a63c5
RM
794
795#define DEBUG_STATEMENT(e) e
99633e97
SM
796#define DEBUG_PRINT1(x) if (debug > 0) printf (x)
797#define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
798#define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
799#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
25fe55af 800#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
99633e97 801 if (debug > 0) print_partial_compiled_pattern (s, e)
fa9a63c5 802#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
99633e97 803 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
fa9a63c5
RM
804
805
806/* Print the fastmap in human-readable form. */
807
808void
809print_fastmap (fastmap)
810 char *fastmap;
811{
812 unsigned was_a_range = 0;
5e69f11e
RM
813 unsigned i = 0;
814
fa9a63c5
RM
815 while (i < (1 << BYTEWIDTH))
816 {
817 if (fastmap[i++])
818 {
819 was_a_range = 0;
25fe55af
RS
820 putchar (i - 1);
821 while (i < (1 << BYTEWIDTH) && fastmap[i])
822 {
823 was_a_range = 1;
824 i++;
825 }
fa9a63c5 826 if (was_a_range)
25fe55af
RS
827 {
828 printf ("-");
829 putchar (i - 1);
830 }
831 }
fa9a63c5 832 }
5e69f11e 833 putchar ('\n');
fa9a63c5
RM
834}
835
836
837/* Print a compiled pattern string in human-readable form, starting at
838 the START pointer into it and ending just before the pointer END. */
839
840void
841print_partial_compiled_pattern (start, end)
842 unsigned char *start;
843 unsigned char *end;
844{
845 int mcnt, mcnt2;
846 unsigned char *p = start;
847 unsigned char *pend = end;
848
849 if (start == NULL)
850 {
851 printf ("(null)\n");
852 return;
853 }
5e69f11e 854
fa9a63c5
RM
855 /* Loop over pattern commands. */
856 while (p < pend)
857 {
858 printf ("%d:\t", p - start);
859
860 switch ((re_opcode_t) *p++)
861 {
25fe55af
RS
862 case no_op:
863 printf ("/no_op");
864 break;
fa9a63c5 865
99633e97
SM
866 case succeed:
867 printf ("/succeed");
868 break;
869
fa9a63c5
RM
870 case exactn:
871 mcnt = *p++;
25fe55af
RS
872 printf ("/exactn/%d", mcnt);
873 do
fa9a63c5 874 {
25fe55af 875 putchar ('/');
fa9a63c5 876 putchar (*p++);
25fe55af
RS
877 }
878 while (--mcnt);
879 break;
fa9a63c5
RM
880
881 case start_memory:
505bde11 882 printf ("/start_memory/%d", *p++);
25fe55af 883 break;
fa9a63c5
RM
884
885 case stop_memory:
505bde11 886 printf ("/stop_memory/%d", *p++);
25fe55af 887 break;
fa9a63c5
RM
888
889 case duplicate:
890 printf ("/duplicate/%d", *p++);
891 break;
892
893 case anychar:
894 printf ("/anychar");
895 break;
896
897 case charset:
25fe55af
RS
898 case charset_not:
899 {
900 register int c, last = -100;
fa9a63c5 901 register int in_range = 0;
99633e97
SM
902 int length = CHARSET_BITMAP_SIZE (p - 1);
903 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
fa9a63c5
RM
904
905 printf ("/charset [%s",
25fe55af 906 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 907
25fe55af 908 assert (p + *p < pend);
fa9a63c5 909
25fe55af 910 for (c = 0; c < 256; c++)
96cc36cc 911 if (c / 8 < length
fa9a63c5
RM
912 && (p[1 + (c/8)] & (1 << (c % 8))))
913 {
914 /* Are we starting a range? */
915 if (last + 1 == c && ! in_range)
916 {
917 putchar ('-');
918 in_range = 1;
919 }
920 /* Have we broken a range? */
921 else if (last + 1 != c && in_range)
96cc36cc 922 {
fa9a63c5
RM
923 putchar (last);
924 in_range = 0;
925 }
5e69f11e 926
fa9a63c5
RM
927 if (! in_range)
928 putchar (c);
929
930 last = c;
25fe55af 931 }
fa9a63c5
RM
932
933 if (in_range)
934 putchar (last);
935
936 putchar (']');
937
99633e97 938 p += 1 + length;
96cc36cc 939
96cc36cc 940 if (has_range_table)
99633e97
SM
941 {
942 int count;
943 printf ("has-range-table");
944
945 /* ??? Should print the range table; for now, just skip it. */
946 p += 2; /* skip range table bits */
947 EXTRACT_NUMBER_AND_INCR (count, p);
948 p = CHARSET_RANGE_TABLE_END (p, count);
949 }
fa9a63c5
RM
950 }
951 break;
952
953 case begline:
954 printf ("/begline");
25fe55af 955 break;
fa9a63c5
RM
956
957 case endline:
25fe55af
RS
958 printf ("/endline");
959 break;
fa9a63c5
RM
960
961 case on_failure_jump:
25fe55af
RS
962 extract_number_and_incr (&mcnt, &p);
963 printf ("/on_failure_jump to %d", p + mcnt - start);
964 break;
fa9a63c5
RM
965
966 case on_failure_keep_string_jump:
25fe55af
RS
967 extract_number_and_incr (&mcnt, &p);
968 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
969 break;
fa9a63c5 970
0683b6fa
SM
971 case on_failure_jump_nastyloop:
972 extract_number_and_incr (&mcnt, &p);
973 printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
974 break;
975
505bde11 976 case on_failure_jump_loop:
fa9a63c5 977 extract_number_and_incr (&mcnt, &p);
505bde11 978 printf ("/on_failure_jump_loop to %d", p + mcnt - start);
5e69f11e
RM
979 break;
980
505bde11 981 case on_failure_jump_smart:
fa9a63c5 982 extract_number_and_incr (&mcnt, &p);
505bde11 983 printf ("/on_failure_jump_smart to %d", p + mcnt - start);
5e69f11e
RM
984 break;
985
25fe55af 986 case jump:
fa9a63c5 987 extract_number_and_incr (&mcnt, &p);
25fe55af 988 printf ("/jump to %d", p + mcnt - start);
fa9a63c5
RM
989 break;
990
25fe55af
RS
991 case succeed_n:
992 extract_number_and_incr (&mcnt, &p);
993 extract_number_and_incr (&mcnt2, &p);
99633e97 994 printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 995 break;
5e69f11e 996
25fe55af
RS
997 case jump_n:
998 extract_number_and_incr (&mcnt, &p);
999 extract_number_and_incr (&mcnt2, &p);
99633e97 1000 printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
25fe55af 1001 break;
5e69f11e 1002
25fe55af
RS
1003 case set_number_at:
1004 extract_number_and_incr (&mcnt, &p);
1005 extract_number_and_incr (&mcnt2, &p);
99633e97 1006 printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
25fe55af 1007 break;
5e69f11e 1008
25fe55af 1009 case wordbound:
fa9a63c5
RM
1010 printf ("/wordbound");
1011 break;
1012
1013 case notwordbound:
1014 printf ("/notwordbound");
25fe55af 1015 break;
fa9a63c5
RM
1016
1017 case wordbeg:
1018 printf ("/wordbeg");
1019 break;
5e69f11e 1020
fa9a63c5
RM
1021 case wordend:
1022 printf ("/wordend");
5e69f11e 1023
1fb352e0
SM
1024 case syntaxspec:
1025 printf ("/syntaxspec");
1026 mcnt = *p++;
1027 printf ("/%d", mcnt);
1028 break;
1029
1030 case notsyntaxspec:
1031 printf ("/notsyntaxspec");
1032 mcnt = *p++;
1033 printf ("/%d", mcnt);
1034 break;
1035
fa9a63c5
RM
1036#ifdef emacs
1037 case before_dot:
1038 printf ("/before_dot");
25fe55af 1039 break;
fa9a63c5
RM
1040
1041 case at_dot:
1042 printf ("/at_dot");
25fe55af 1043 break;
fa9a63c5
RM
1044
1045 case after_dot:
1046 printf ("/after_dot");
25fe55af 1047 break;
fa9a63c5 1048
1fb352e0
SM
1049 case categoryspec:
1050 printf ("/categoryspec");
fa9a63c5
RM
1051 mcnt = *p++;
1052 printf ("/%d", mcnt);
25fe55af 1053 break;
5e69f11e 1054
1fb352e0
SM
1055 case notcategoryspec:
1056 printf ("/notcategoryspec");
fa9a63c5
RM
1057 mcnt = *p++;
1058 printf ("/%d", mcnt);
1059 break;
1060#endif /* emacs */
1061
fa9a63c5
RM
1062 case begbuf:
1063 printf ("/begbuf");
25fe55af 1064 break;
fa9a63c5
RM
1065
1066 case endbuf:
1067 printf ("/endbuf");
25fe55af 1068 break;
fa9a63c5 1069
25fe55af
RS
1070 default:
1071 printf ("?%d", *(p-1));
fa9a63c5
RM
1072 }
1073
1074 putchar ('\n');
1075 }
1076
1077 printf ("%d:\tend of pattern.\n", p - start);
1078}
1079
1080
1081void
1082print_compiled_pattern (bufp)
1083 struct re_pattern_buffer *bufp;
1084{
1085 unsigned char *buffer = bufp->buffer;
1086
1087 print_partial_compiled_pattern (buffer, buffer + bufp->used);
505bde11 1088 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, bufp->allocated);
fa9a63c5
RM
1089
1090 if (bufp->fastmap_accurate && bufp->fastmap)
1091 {
1092 printf ("fastmap: ");
1093 print_fastmap (bufp->fastmap);
1094 }
1095
1096 printf ("re_nsub: %d\t", bufp->re_nsub);
1097 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1098 printf ("can_be_null: %d\t", bufp->can_be_null);
1099 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1100 printf ("no_sub: %d\t", bufp->no_sub);
1101 printf ("not_bol: %d\t", bufp->not_bol);
1102 printf ("not_eol: %d\t", bufp->not_eol);
1103 printf ("syntax: %d\n", bufp->syntax);
505bde11 1104 fflush (stdout);
fa9a63c5
RM
1105 /* Perhaps we should print the translate table? */
1106}
1107
1108
1109void
1110print_double_string (where, string1, size1, string2, size2)
66f0296e
SM
1111 re_char *where;
1112 re_char *string1;
1113 re_char *string2;
fa9a63c5
RM
1114 int size1;
1115 int size2;
1116{
1117 unsigned this_char;
5e69f11e 1118
fa9a63c5
RM
1119 if (where == NULL)
1120 printf ("(null)");
1121 else
1122 {
1123 if (FIRST_STRING_P (where))
25fe55af
RS
1124 {
1125 for (this_char = where - string1; this_char < size1; this_char++)
1126 putchar (string1[this_char]);
fa9a63c5 1127
25fe55af
RS
1128 where = string2;
1129 }
fa9a63c5
RM
1130
1131 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1132 putchar (string2[this_char]);
fa9a63c5
RM
1133 }
1134}
1135
1136#else /* not DEBUG */
1137
1138#undef assert
1139#define assert(e)
1140
1141#define DEBUG_STATEMENT(e)
1142#define DEBUG_PRINT1(x)
1143#define DEBUG_PRINT2(x1, x2)
1144#define DEBUG_PRINT3(x1, x2, x3)
1145#define DEBUG_PRINT4(x1, x2, x3, x4)
1146#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1147#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1148
1149#endif /* not DEBUG */
1150\f
1151/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1152 also be assigned to arbitrarily: each pattern buffer stores its own
1153 syntax, so it can be changed between regex compilations. */
1154/* This has no initializer because initialized variables in Emacs
1155 become read-only after dumping. */
1156reg_syntax_t re_syntax_options;
1157
1158
1159/* Specify the precise syntax of regexps for compilation. This provides
1160 for compatibility for various utilities which historically have
1161 different, incompatible syntaxes.
1162
1163 The argument SYNTAX is a bit mask comprised of the various bits
25fe55af 1164 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1165
1166reg_syntax_t
1167re_set_syntax (syntax)
1168 reg_syntax_t syntax;
1169{
1170 reg_syntax_t ret = re_syntax_options;
5e69f11e 1171
fa9a63c5
RM
1172 re_syntax_options = syntax;
1173 return ret;
1174}
1175\f
1176/* This table gives an error message for each of the error codes listed
25fe55af 1177 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1178 POSIX doesn't require that we do anything for REG_NOERROR,
25fe55af 1179 but why not be nice? */
fa9a63c5
RM
1180
1181static const char *re_error_msgid[] =
5e69f11e
RM
1182 {
1183 gettext_noop ("Success"), /* REG_NOERROR */
1184 gettext_noop ("No match"), /* REG_NOMATCH */
1185 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1186 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1187 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1188 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1189 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1190 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1191 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1192 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1193 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1194 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1195 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1196 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1197 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1198 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1199 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
fa9a63c5
RM
1200 };
1201\f
25fe55af 1202/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1203
1204/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1205 searching and matching functions should not call alloca. On some
1206 systems, alloca is implemented in terms of malloc, and if we're
1207 using the relocating allocator routines, then malloc could cause a
1208 relocation, which might (if the strings being searched are in the
1209 ralloc heap) shift the data out from underneath the regexp
1210 routines.
1211
5e69f11e 1212 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1213 processes input from X in a signal handler; processing X input may
1214 call malloc; if input arrives while a matching routine is calling
1215 malloc, then we're scrod. But Emacs can't just block input while
1216 calling matching routines; then we don't notice interrupts when
1217 they come in. So, Emacs blocks input around all regexp calls
1218 except the matching calls, which it leaves unprotected, in the
1219 faith that they will not malloc. */
1220
1221/* Normally, this is fine. */
1222#define MATCH_MAY_ALLOCATE
1223
1224/* When using GNU C, we are not REALLY using the C alloca, no matter
1225 what config.h may say. So don't take precautions for it. */
1226#ifdef __GNUC__
1227#undef C_ALLOCA
1228#endif
1229
1230/* The match routines may not allocate if (1) they would do it with malloc
1231 and (2) it's not safe for them to use malloc.
1232 Note that if REL_ALLOC is defined, matching would not use malloc for the
1233 failure stack, but we would still use it for the register vectors;
25fe55af 1234 so REL_ALLOC should not affect this. */
fa9a63c5
RM
1235#if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1236#undef MATCH_MAY_ALLOCATE
1237#endif
1238
1239\f
1240/* Failure stack declarations and macros; both re_compile_fastmap and
1241 re_match_2 use a failure stack. These have to be macros because of
1242 REGEX_ALLOCATE_STACK. */
5e69f11e 1243
fa9a63c5 1244
320a2a73 1245/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1246 when matching. If this number is exceeded, we allocate more
1247 space, so it is not a hard limit. */
1248#ifndef INIT_FAILURE_ALLOC
320a2a73 1249#define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1250#endif
1251
1252/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1253 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1254 This is a variable only so users of regex can assign to it; we never
25fe55af 1255 change it ourselves. */
fa9a63c5 1256#if defined (MATCH_MAY_ALLOCATE)
320a2a73
KH
1257/* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1258 whose default stack limit is 2mb. In order for a larger
1259 value to work reliably, you have to try to make it accord
1260 with the process stack limit. */
1261int re_max_failures = 40000;
fa9a63c5 1262#else
320a2a73 1263int re_max_failures = 4000;
fa9a63c5
RM
1264#endif
1265
1266union fail_stack_elt
1267{
66f0296e 1268 const unsigned char *pointer;
505bde11 1269 unsigned int integer;
fa9a63c5
RM
1270};
1271
1272typedef union fail_stack_elt fail_stack_elt_t;
1273
1274typedef struct
1275{
1276 fail_stack_elt_t *stack;
1277 unsigned size;
505bde11
SM
1278 unsigned avail; /* Offset of next open position. */
1279 unsigned frame; /* Offset of the cur constructed frame. */
fa9a63c5
RM
1280} fail_stack_type;
1281
505bde11
SM
1282#define PATTERN_STACK_EMPTY() (fail_stack.avail == 0)
1283#define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
fa9a63c5
RM
1284#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1285
1286
1287/* Define macros to initialize and free the failure stack.
1288 Do `return -2' if the alloc fails. */
1289
1290#ifdef MATCH_MAY_ALLOCATE
1291#define INIT_FAIL_STACK() \
1292 do { \
1293 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1294 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1295 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1296 \
1297 if (fail_stack.stack == NULL) \
1298 return -2; \
1299 \
1300 fail_stack.size = INIT_FAILURE_ALLOC; \
1301 fail_stack.avail = 0; \
505bde11 1302 fail_stack.frame = 0; \
fa9a63c5
RM
1303 } while (0)
1304
1305#define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1306#else
1307#define INIT_FAIL_STACK() \
1308 do { \
1309 fail_stack.avail = 0; \
505bde11 1310 fail_stack.frame = 0; \
fa9a63c5
RM
1311 } while (0)
1312
ed0767d8 1313#define RESET_FAIL_STACK() ((void)0)
fa9a63c5
RM
1314#endif
1315
1316
320a2a73
KH
1317/* Double the size of FAIL_STACK, up to a limit
1318 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1319
1320 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1321 allocating space for it or it was already too large.
1322
25fe55af 1323 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1324
320a2a73
KH
1325/* Factor to increase the failure stack size by
1326 when we increase it.
1327 This used to be 2, but 2 was too wasteful
1328 because the old discarded stacks added up to as much space
1329 were as ultimate, maximum-size stack. */
1330#define FAIL_STACK_GROWTH_FACTOR 4
1331
1332#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1333 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1334 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1335 ? 0 \
320a2a73
KH
1336 : ((fail_stack).stack \
1337 = (fail_stack_elt_t *) \
25fe55af
RS
1338 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1339 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1340 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1341 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1342 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1343 \
1344 (fail_stack).stack == NULL \
1345 ? 0 \
6453db45
KH
1346 : ((fail_stack).size \
1347 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1348 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1349 * FAIL_STACK_GROWTH_FACTOR)) \
1350 / sizeof (fail_stack_elt_t)), \
25fe55af 1351 1)))
fa9a63c5
RM
1352
1353
5e69f11e 1354/* Push pointer POINTER on FAIL_STACK.
fa9a63c5
RM
1355 Return 1 if was able to do so and 0 if ran out of memory allocating
1356 space to do so. */
1357#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1358 ((FAIL_STACK_FULL () \
320a2a73 1359 && !GROW_FAIL_STACK (FAIL_STACK)) \
fa9a63c5
RM
1360 ? 0 \
1361 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1362 1))
505bde11 1363#define POP_PATTERN_OP() POP_FAILURE_POINTER ()
fa9a63c5
RM
1364
1365/* Push a pointer value onto the failure stack.
1366 Assumes the variable `fail_stack'. Probably should only
25fe55af 1367 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1368#define PUSH_FAILURE_POINTER(item) \
1369 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1370
1371/* This pushes an integer-valued item onto the failure stack.
1372 Assumes the variable `fail_stack'. Probably should only
25fe55af 1373 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1374#define PUSH_FAILURE_INT(item) \
1375 fail_stack.stack[fail_stack.avail++].integer = (item)
1376
1377/* Push a fail_stack_elt_t value onto the failure stack.
1378 Assumes the variable `fail_stack'. Probably should only
25fe55af 1379 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1380#define PUSH_FAILURE_ELT(item) \
1381 fail_stack.stack[fail_stack.avail++] = (item)
1382
1383/* These three POP... operations complement the three PUSH... operations.
1384 All assume that `fail_stack' is nonempty. */
1385#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1386#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1387#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1388
505bde11
SM
1389/* Individual items aside from the registers. */
1390#define NUM_NONREG_ITEMS 3
1391
1392/* Used to examine the stack (to detect infinite loops). */
1393#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
66f0296e 1394#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
505bde11
SM
1395#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1396#define TOP_FAILURE_HANDLE() fail_stack.frame
fa9a63c5
RM
1397
1398
505bde11
SM
1399#define ENSURE_FAIL_STACK(space) \
1400while (REMAINING_AVAIL_SLOTS <= space) { \
1401 if (!GROW_FAIL_STACK (fail_stack)) \
1402 return -2; \
1403 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1404 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1405}
1406
1407/* Push register NUM onto the stack. */
1408#define PUSH_FAILURE_REG(num) \
1409do { \
1410 char *destination; \
1411 ENSURE_FAIL_STACK(3); \
1412 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1413 num, regstart[num], regend[num]); \
1414 PUSH_FAILURE_POINTER (regstart[num]); \
1415 PUSH_FAILURE_POINTER (regend[num]); \
1416 PUSH_FAILURE_INT (num); \
1417} while (0)
1418
1419/* Pop a saved register off the stack. */
1420#define POP_FAILURE_REG() \
1421do { \
1422 int reg = POP_FAILURE_INT (); \
1423 regend[reg] = POP_FAILURE_POINTER (); \
1424 regstart[reg] = POP_FAILURE_POINTER (); \
1425 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1426 reg, regstart[reg], regend[reg]); \
1427} while (0)
1428
1429/* Check that we are not stuck in an infinite loop. */
1430#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1431do { \
1432 int failure = TOP_FAILURE_HANDLE(); \
1433 /* Check for infinite matching loops */ \
1434 while (failure > 0 && \
1435 (FAILURE_STR (failure) == string_place \
1436 || FAILURE_STR (failure) == NULL)) \
1437 { \
1438 assert (FAILURE_PAT (failure) >= bufp->buffer \
66f0296e 1439 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
505bde11
SM
1440 if (FAILURE_PAT (failure) == pat_cur) \
1441 goto fail; \
66f0296e 1442 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
505bde11
SM
1443 failure = NEXT_FAILURE_HANDLE(failure); \
1444 } \
1445 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1446} while (0)
1447
fa9a63c5 1448/* Push the information about the state we will need
5e69f11e
RM
1449 if we ever fail back to it.
1450
505bde11 1451 Requires variables fail_stack, regstart, regend and
320a2a73 1452 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1453 declared.
5e69f11e 1454
fa9a63c5
RM
1455 Does `return FAILURE_CODE' if runs out of memory. */
1456
505bde11
SM
1457#define PUSH_FAILURE_POINT(pattern, string_place) \
1458do { \
1459 char *destination; \
1460 /* Must be int, so when we don't save any registers, the arithmetic \
1461 of 0 + -1 isn't done as unsigned. */ \
1462 \
1463 DEBUG_STATEMENT (failure_id++); \
1464 DEBUG_STATEMENT (nfailure_points_pushed++); \
1465 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1466 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1467 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1468 \
1469 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1470 \
1471 DEBUG_PRINT1 ("\n"); \
1472 \
1473 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1474 PUSH_FAILURE_INT (fail_stack.frame); \
1475 \
1476 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1477 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1478 DEBUG_PRINT1 ("'\n"); \
1479 PUSH_FAILURE_POINTER (string_place); \
1480 \
1481 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1482 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1483 PUSH_FAILURE_POINTER (pattern); \
1484 \
1485 /* Close the frame by moving the frame pointer past it. */ \
1486 fail_stack.frame = fail_stack.avail; \
1487} while (0)
fa9a63c5 1488
320a2a73
KH
1489/* Estimate the size of data pushed by a typical failure stack entry.
1490 An estimate is all we need, because all we use this for
1491 is to choose a limit for how big to make the failure stack. */
1492
1493#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1494
fa9a63c5
RM
1495/* How many items can still be added to the stack without overflowing it. */
1496#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1497
1498
1499/* Pops what PUSH_FAIL_STACK pushes.
1500
1501 We restore into the parameters, all of which should be lvalues:
1502 STR -- the saved data position.
1503 PAT -- the saved pattern position.
fa9a63c5 1504 REGSTART, REGEND -- arrays of string positions.
5e69f11e 1505
fa9a63c5 1506 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1507 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5 1508
505bde11
SM
1509#define POP_FAILURE_POINT(str, pat) \
1510do { \
fa9a63c5
RM
1511 assert (!FAIL_STACK_EMPTY ()); \
1512 \
1513 /* Remove failure points and point to how many regs pushed. */ \
1514 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1515 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1516 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5 1517 \
505bde11
SM
1518 /* Pop the saved registers. */ \
1519 while (fail_stack.frame < fail_stack.avail) \
1520 POP_FAILURE_REG (); \
fa9a63c5 1521 \
505bde11
SM
1522 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1523 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1524 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
fa9a63c5
RM
1525 \
1526 /* If the saved string location is NULL, it came from an \
1527 on_failure_keep_string_jump opcode, and we want to throw away the \
1528 saved NULL, thus retaining our current position in the string. */ \
66f0296e 1529 str = (re_char *) POP_FAILURE_POINTER (); \
505bde11 1530 DEBUG_PRINT2 (" Popping string %p: `", str); \
fa9a63c5
RM
1531 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1532 DEBUG_PRINT1 ("'\n"); \
1533 \
505bde11
SM
1534 fail_stack.frame = POP_FAILURE_INT (); \
1535 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
fa9a63c5 1536 \
505bde11
SM
1537 assert (fail_stack.avail >= 0); \
1538 assert (fail_stack.frame <= fail_stack.avail); \
fa9a63c5 1539 \
fa9a63c5 1540 DEBUG_STATEMENT (nfailure_points_popped++); \
505bde11 1541} while (0) /* POP_FAILURE_POINT */
fa9a63c5
RM
1542
1543
1544\f
fa9a63c5 1545/* Registers are set to a sentinel when they haven't yet matched. */
505bde11 1546#define REG_UNSET_VALUE NULL
fa9a63c5
RM
1547#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1548\f
1549/* Subroutine declarations and macros for regex_compile. */
1550
99633e97
SM
1551static void store_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc, int arg));
1552static void store_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1553 int arg1, int arg2));
1554static void insert_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1555 int arg, unsigned char *end));
1556static void insert_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1557 int arg1, int arg2, unsigned char *end));
1558static boolean at_begline_loc_p _RE_ARGS((const unsigned char *pattern,
1559 const unsigned char *p,
1560 reg_syntax_t syntax));
1561static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p,
1562 const unsigned char *pend,
1563 reg_syntax_t syntax));
4e8a9132 1564static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
f6a3f532
SM
1565static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend,
1566 char *fastmap, const int multibyte));
fa9a63c5 1567
5e69f11e 1568/* Fetch the next character in the uncompiled pattern---translating it
fa9a63c5
RM
1569 if necessary. Also cast from a signed character in the constant
1570 string passed to us by the user to an unsigned char that we can use
1571 as an array index (in, e.g., `translate'). */
1572#define PATFETCH(c) \
99633e97
SM
1573 do { \
1574 PATFETCH_RAW (c); \
2d1675e4 1575 c = TRANSLATE (c); \
fa9a63c5
RM
1576 } while (0)
1577
1578/* Fetch the next character in the uncompiled pattern, with no
25fe55af 1579 translation. */
fa9a63c5 1580#define PATFETCH_RAW(c) \
2d1675e4
SM
1581 do { \
1582 int len; \
1583 if (p == pend) return REG_EEND; \
1584 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1585 p += len; \
fa9a63c5
RM
1586 } while (0)
1587
fa9a63c5
RM
1588
1589/* If `translate' is non-null, return translate[D], else just D. We
1590 cast the subscript to translate because some data is declared as
1591 `char *', to avoid warnings when a string constant is passed. But
1592 when we use a character as a subscript we must make it unsigned. */
6676cb1c
RS
1593#ifndef TRANSLATE
1594#define TRANSLATE(d) \
66f0296e 1595 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
6676cb1c 1596#endif
fa9a63c5
RM
1597
1598
1599/* Macros for outputting the compiled pattern into `buffer'. */
1600
1601/* If the buffer isn't allocated when it comes in, use this. */
1602#define INIT_BUF_SIZE 32
1603
25fe55af 1604/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5
RM
1605#define GET_BUFFER_SPACE(n) \
1606 while (b - bufp->buffer + (n) > bufp->allocated) \
1607 EXTEND_BUFFER ()
1608
1609/* Make sure we have one more byte of buffer space and then add C to it. */
1610#define BUF_PUSH(c) \
1611 do { \
1612 GET_BUFFER_SPACE (1); \
1613 *b++ = (unsigned char) (c); \
1614 } while (0)
1615
1616
1617/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1618#define BUF_PUSH_2(c1, c2) \
1619 do { \
1620 GET_BUFFER_SPACE (2); \
1621 *b++ = (unsigned char) (c1); \
1622 *b++ = (unsigned char) (c2); \
1623 } while (0)
1624
1625
25fe55af 1626/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1627#define BUF_PUSH_3(c1, c2, c3) \
1628 do { \
1629 GET_BUFFER_SPACE (3); \
1630 *b++ = (unsigned char) (c1); \
1631 *b++ = (unsigned char) (c2); \
1632 *b++ = (unsigned char) (c3); \
1633 } while (0)
1634
1635
1636/* Store a jump with opcode OP at LOC to location TO. We store a
25fe55af 1637 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1638#define STORE_JUMP(op, loc, to) \
1639 store_op1 (op, loc, (to) - (loc) - 3)
1640
1641/* Likewise, for a two-argument jump. */
1642#define STORE_JUMP2(op, loc, to, arg) \
1643 store_op2 (op, loc, (to) - (loc) - 3, arg)
1644
25fe55af 1645/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1646#define INSERT_JUMP(op, loc, to) \
1647 insert_op1 (op, loc, (to) - (loc) - 3, b)
1648
1649/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1650#define INSERT_JUMP2(op, loc, to, arg) \
1651 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1652
1653
1654/* This is not an arbitrary limit: the arguments which represent offsets
25fe55af 1655 into the pattern are two bytes long. So if 2^16 bytes turns out to
fa9a63c5
RM
1656 be too small, many things would have to change. */
1657#define MAX_BUF_SIZE (1L << 16)
1658
1659
1660/* Extend the buffer by twice its current size via realloc and
1661 reset the pointers that pointed into the old block to point to the
1662 correct places in the new one. If extending the buffer results in it
25fe55af 1663 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
fa9a63c5 1664#define EXTEND_BUFFER() \
25fe55af 1665 do { \
fa9a63c5 1666 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1667 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1668 return REG_ESIZE; \
1669 bufp->allocated <<= 1; \
1670 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1671 bufp->allocated = MAX_BUF_SIZE; \
fa9a63c5
RM
1672 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1673 if (bufp->buffer == NULL) \
1674 return REG_ESPACE; \
1675 /* If the buffer moved, move all the pointers into it. */ \
1676 if (old_buffer != bufp->buffer) \
1677 { \
25fe55af
RS
1678 b = (b - old_buffer) + bufp->buffer; \
1679 begalt = (begalt - old_buffer) + bufp->buffer; \
1680 if (fixup_alt_jump) \
1681 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1682 if (laststart) \
1683 laststart = (laststart - old_buffer) + bufp->buffer; \
1684 if (pending_exact) \
1685 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
fa9a63c5
RM
1686 } \
1687 } while (0)
1688
1689
1690/* Since we have one byte reserved for the register number argument to
1691 {start,stop}_memory, the maximum number of groups we can report
1692 things about is what fits in that byte. */
1693#define MAX_REGNUM 255
1694
1695/* But patterns can have more than `MAX_REGNUM' registers. We just
1696 ignore the excess. */
1697typedef unsigned regnum_t;
1698
1699
1700/* Macros for the compile stack. */
1701
1702/* Since offsets can go either forwards or backwards, this type needs to
25fe55af 1703 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
fa9a63c5
RM
1704typedef int pattern_offset_t;
1705
1706typedef struct
1707{
1708 pattern_offset_t begalt_offset;
1709 pattern_offset_t fixup_alt_jump;
5e69f11e 1710 pattern_offset_t laststart_offset;
fa9a63c5
RM
1711 regnum_t regnum;
1712} compile_stack_elt_t;
1713
1714
1715typedef struct
1716{
1717 compile_stack_elt_t *stack;
1718 unsigned size;
1719 unsigned avail; /* Offset of next open position. */
1720} compile_stack_type;
1721
1722
1723#define INIT_COMPILE_STACK_SIZE 32
1724
1725#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1726#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1727
25fe55af 1728/* The next available element. */
fa9a63c5
RM
1729#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1730
1731
b18215fc
RS
1732/* Structure to manage work area for range table. */
1733struct range_table_work_area
1734{
1735 int *table; /* actual work area. */
1736 int allocated; /* allocated size for work area in bytes. */
25fe55af 1737 int used; /* actually used size in words. */
96cc36cc 1738 int bits; /* flag to record character classes */
b18215fc
RS
1739};
1740
1741/* Make sure that WORK_AREA can hold more N multibyte characters. */
1742#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1743 do { \
1744 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1745 { \
1746 (work_area).allocated += 16 * sizeof (int); \
1747 if ((work_area).table) \
1748 (work_area).table \
1749 = (int *) realloc ((work_area).table, (work_area).allocated); \
1750 else \
1751 (work_area).table \
1752 = (int *) malloc ((work_area).allocated); \
1753 if ((work_area).table == 0) \
1754 FREE_STACK_RETURN (REG_ESPACE); \
1755 } \
1756 } while (0)
1757
96cc36cc
RS
1758#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1759 (work_area).bits |= (bit)
1760
1761/* These bits represent the various character classes such as [:alnum:]
1762 in a charset's range table. */
1763#define BIT_ALNUM 0x1
1764#define BIT_ALPHA 0x2
1765#define BIT_WORD 0x4
f71b19b6
DL
1766#define BIT_ASCII 0x8
1767#define BIT_NONASCII 0x10
96cc36cc
RS
1768#define BIT_GRAPH 0x20
1769#define BIT_LOWER 0x40
1770#define BIT_PRINT 0x80
1771#define BIT_PUNCT 0x100
1772#define BIT_SPACE 0x200
1773#define BIT_UPPER 0x400
f71b19b6
DL
1774#define BIT_UNIBYTE 0x800
1775#define BIT_MULTIBYTE 0x1000
96cc36cc 1776
b18215fc
RS
1777/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1778#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1779 do { \
1780 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1781 (work_area).table[(work_area).used++] = (range_start); \
1782 (work_area).table[(work_area).used++] = (range_end); \
1783 } while (0)
1784
25fe55af 1785/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1786#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1787 do { \
1788 if ((work_area).table) \
1789 free ((work_area).table); \
1790 } while (0)
1791
96cc36cc 1792#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1793#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1794#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc
RS
1795#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1796
1797
fa9a63c5 1798/* Set the bit for character C in a list. */
25fe55af
RS
1799#define SET_LIST_BIT(c) \
1800 (b[((unsigned char) (c)) / BYTEWIDTH] \
fa9a63c5
RM
1801 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1802
1803
1804/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1805#define GET_UNSIGNED_NUMBER(num) \
99633e97 1806 do { if (p != pend) \
fa9a63c5 1807 { \
25fe55af
RS
1808 PATFETCH (c); \
1809 while (ISDIGIT (c)) \
1810 { \
1811 if (num < 0) \
1812 num = 0; \
1813 num = num * 10 + c - '0'; \
1814 if (p == pend) \
1815 break; \
1816 PATFETCH (c); \
1817 } \
1818 } \
99633e97 1819 } while (0)
fa9a63c5
RM
1820
1821#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1822
1823#define IS_CHAR_CLASS(string) \
1824 (STREQ (string, "alpha") || STREQ (string, "upper") \
1825 || STREQ (string, "lower") || STREQ (string, "digit") \
1826 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1827 || STREQ (string, "space") || STREQ (string, "print") \
1828 || STREQ (string, "punct") || STREQ (string, "graph") \
96cc36cc 1829 || STREQ (string, "cntrl") || STREQ (string, "blank") \
f71b19b6
DL
1830 || STREQ (string, "word") \
1831 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1832 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
99633e97
SM
1833
1834/* QUIT is only used on NTemacs. */
1835#if !defined (WINDOWSNT) || !defined (emacs)
1836#undef QUIT
1837#define QUIT
1838#endif
fa9a63c5
RM
1839\f
1840#ifndef MATCH_MAY_ALLOCATE
1841
1842/* If we cannot allocate large objects within re_match_2_internal,
1843 we make the fail stack and register vectors global.
1844 The fail stack, we grow to the maximum size when a regexp
1845 is compiled.
1846 The register vectors, we adjust in size each time we
1847 compile a regexp, according to the number of registers it needs. */
1848
1849static fail_stack_type fail_stack;
1850
1851/* Size with which the following vectors are currently allocated.
1852 That is so we can make them bigger as needed,
25fe55af 1853 but never make them smaller. */
fa9a63c5
RM
1854static int regs_allocated_size;
1855
66f0296e
SM
1856static re_char ** regstart, ** regend;
1857static re_char **best_regstart, **best_regend;
fa9a63c5
RM
1858
1859/* Make the register vectors big enough for NUM_REGS registers,
25fe55af 1860 but don't make them smaller. */
fa9a63c5
RM
1861
1862static
1863regex_grow_registers (num_regs)
1864 int num_regs;
1865{
1866 if (num_regs > regs_allocated_size)
1867 {
66f0296e
SM
1868 RETALLOC_IF (regstart, num_regs, re_char *);
1869 RETALLOC_IF (regend, num_regs, re_char *);
1870 RETALLOC_IF (best_regstart, num_regs, re_char *);
1871 RETALLOC_IF (best_regend, num_regs, re_char *);
fa9a63c5
RM
1872
1873 regs_allocated_size = num_regs;
1874 }
1875}
1876
1877#endif /* not MATCH_MAY_ALLOCATE */
1878\f
99633e97
SM
1879static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1880 compile_stack,
1881 regnum_t regnum));
1882
fa9a63c5
RM
1883/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1884 Returns one of error codes defined in `regex.h', or zero for success.
1885
1886 Assumes the `allocated' (and perhaps `buffer') and `translate'
1887 fields are set in BUFP on entry.
1888
1889 If it succeeds, results are put in BUFP (if it returns an error, the
1890 contents of BUFP are undefined):
1891 `buffer' is the compiled pattern;
1892 `syntax' is set to SYNTAX;
1893 `used' is set to the length of the compiled pattern;
1894 `fastmap_accurate' is zero;
1895 `re_nsub' is the number of subexpressions in PATTERN;
1896 `not_bol' and `not_eol' are zero;
5e69f11e 1897
fa9a63c5
RM
1898 The `fastmap' and `newline_anchor' fields are neither
1899 examined nor set. */
1900
505bde11
SM
1901/* Insert the `jump' from the end of last alternative to "here".
1902 The space for the jump has already been allocated. */
1903#define FIXUP_ALT_JUMP() \
1904do { \
1905 if (fixup_alt_jump) \
1906 STORE_JUMP (jump, fixup_alt_jump, b); \
1907} while (0)
1908
1909
fa9a63c5
RM
1910/* Return, freeing storage we allocated. */
1911#define FREE_STACK_RETURN(value) \
b18215fc
RS
1912 do { \
1913 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1914 free (compile_stack.stack); \
1915 return value; \
1916 } while (0)
fa9a63c5
RM
1917
1918static reg_errcode_t
1919regex_compile (pattern, size, syntax, bufp)
66f0296e 1920 re_char *pattern;
fa9a63c5
RM
1921 int size;
1922 reg_syntax_t syntax;
1923 struct re_pattern_buffer *bufp;
1924{
1925 /* We fetch characters from PATTERN here. Even though PATTERN is
1926 `char *' (i.e., signed), we declare these variables as unsigned, so
1927 they can be reliably used as array indices. */
b18215fc 1928 register unsigned int c, c1;
5e69f11e 1929
fa9a63c5 1930 /* A random temporary spot in PATTERN. */
66f0296e 1931 re_char *p1;
fa9a63c5
RM
1932
1933 /* Points to the end of the buffer, where we should append. */
1934 register unsigned char *b;
5e69f11e 1935
fa9a63c5
RM
1936 /* Keeps track of unclosed groups. */
1937 compile_stack_type compile_stack;
1938
1939 /* Points to the current (ending) position in the pattern. */
22336245
RS
1940#ifdef AIX
1941 /* `const' makes AIX compiler fail. */
66f0296e 1942 unsigned char *p = pattern;
22336245 1943#else
66f0296e 1944 re_char *p = pattern;
22336245 1945#endif
66f0296e 1946 re_char *pend = pattern + size;
5e69f11e 1947
fa9a63c5 1948 /* How to translate the characters in the pattern. */
6676cb1c 1949 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
1950
1951 /* Address of the count-byte of the most recently inserted `exactn'
1952 command. This makes it possible to tell if a new exact-match
1953 character can be added to that command or if the character requires
1954 a new `exactn' command. */
1955 unsigned char *pending_exact = 0;
1956
1957 /* Address of start of the most recently finished expression.
1958 This tells, e.g., postfix * where to find the start of its
1959 operand. Reset at the beginning of groups and alternatives. */
1960 unsigned char *laststart = 0;
1961
1962 /* Address of beginning of regexp, or inside of last group. */
1963 unsigned char *begalt;
1964
1965 /* Place in the uncompiled pattern (i.e., the {) to
1966 which to go back if the interval is invalid. */
66f0296e 1967 re_char *beg_interval;
5e69f11e 1968
fa9a63c5 1969 /* Address of the place where a forward jump should go to the end of
25fe55af 1970 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
1971 last -- ends with a forward jump of this sort. */
1972 unsigned char *fixup_alt_jump = 0;
1973
1974 /* Counts open-groups as they are encountered. Remembered for the
1975 matching close-group on the compile stack, so the same register
1976 number is put in the stop_memory as the start_memory. */
1977 regnum_t regnum = 0;
1978
b18215fc
RS
1979 /* Work area for range table of charset. */
1980 struct range_table_work_area range_table_work;
1981
2d1675e4
SM
1982 /* If the object matched can contain multibyte characters. */
1983 const boolean multibyte = RE_MULTIBYTE_P (bufp);
1984
fa9a63c5 1985#ifdef DEBUG
99633e97 1986 debug++;
fa9a63c5 1987 DEBUG_PRINT1 ("\nCompiling pattern: ");
99633e97 1988 if (debug > 0)
fa9a63c5
RM
1989 {
1990 unsigned debug_count;
5e69f11e 1991
fa9a63c5 1992 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 1993 putchar (pattern[debug_count]);
fa9a63c5
RM
1994 putchar ('\n');
1995 }
1996#endif /* DEBUG */
1997
1998 /* Initialize the compile stack. */
1999 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2000 if (compile_stack.stack == NULL)
2001 return REG_ESPACE;
2002
2003 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2004 compile_stack.avail = 0;
2005
b18215fc
RS
2006 range_table_work.table = 0;
2007 range_table_work.allocated = 0;
2008
fa9a63c5
RM
2009 /* Initialize the pattern buffer. */
2010 bufp->syntax = syntax;
2011 bufp->fastmap_accurate = 0;
2012 bufp->not_bol = bufp->not_eol = 0;
2013
2014 /* Set `used' to zero, so that if we return an error, the pattern
2015 printer (for debugging) will think there's no pattern. We reset it
2016 at the end. */
2017 bufp->used = 0;
5e69f11e 2018
fa9a63c5 2019 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2020 bufp->re_nsub = 0;
fa9a63c5
RM
2021
2022#if !defined (emacs) && !defined (SYNTAX_TABLE)
2023 /* Initialize the syntax table. */
2024 init_syntax_once ();
2025#endif
2026
2027 if (bufp->allocated == 0)
2028 {
2029 if (bufp->buffer)
2030 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2031 enough space. This loses if buffer's address is bogus, but
2032 that is the user's responsibility. */
2033 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2034 }
fa9a63c5 2035 else
25fe55af
RS
2036 { /* Caller did not allocate a buffer. Do it for them. */
2037 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2038 }
fa9a63c5
RM
2039 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2040
2041 bufp->allocated = INIT_BUF_SIZE;
2042 }
2043
2044 begalt = b = bufp->buffer;
2045
2046 /* Loop through the uncompiled pattern until we're at the end. */
2047 while (p != pend)
2048 {
2049 PATFETCH (c);
2050
2051 switch (c)
25fe55af
RS
2052 {
2053 case '^':
2054 {
2055 if ( /* If at start of pattern, it's an operator. */
2056 p == pattern + 1
2057 /* If context independent, it's an operator. */
2058 || syntax & RE_CONTEXT_INDEP_ANCHORS
2059 /* Otherwise, depends on what's come before. */
2060 || at_begline_loc_p (pattern, p, syntax))
2061 BUF_PUSH (begline);
2062 else
2063 goto normal_char;
2064 }
2065 break;
2066
2067
2068 case '$':
2069 {
2070 if ( /* If at end of pattern, it's an operator. */
2071 p == pend
2072 /* If context independent, it's an operator. */
2073 || syntax & RE_CONTEXT_INDEP_ANCHORS
2074 /* Otherwise, depends on what's next. */
2075 || at_endline_loc_p (p, pend, syntax))
2076 BUF_PUSH (endline);
2077 else
2078 goto normal_char;
2079 }
2080 break;
fa9a63c5
RM
2081
2082
2083 case '+':
25fe55af
RS
2084 case '?':
2085 if ((syntax & RE_BK_PLUS_QM)
2086 || (syntax & RE_LIMITED_OPS))
2087 goto normal_char;
2088 handle_plus:
2089 case '*':
2090 /* If there is no previous pattern... */
2091 if (!laststart)
2092 {
2093 if (syntax & RE_CONTEXT_INVALID_OPS)
2094 FREE_STACK_RETURN (REG_BADRPT);
2095 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2096 goto normal_char;
2097 }
2098
2099 {
25fe55af 2100 /* 1 means zero (many) matches is allowed. */
66f0296e
SM
2101 boolean zero_times_ok = 0, many_times_ok = 0;
2102 boolean greedy = 1;
25fe55af
RS
2103
2104 /* If there is a sequence of repetition chars, collapse it
2105 down to just one (the right one). We can't combine
2106 interval operators with these because of, e.g., `a{2}*',
2107 which should only match an even number of `a's. */
2108
2109 for (;;)
2110 {
1c8c6d39
DL
2111 if (!(syntax & RE_ALL_GREEDY)
2112 && c == '?' && (zero_times_ok || many_times_ok))
2113 greedy = 0;
2114 else
2115 {
2116 zero_times_ok |= c != '+';
2117 many_times_ok |= c != '?';
2118 }
25fe55af
RS
2119
2120 if (p == pend)
2121 break;
ed0767d8
SM
2122 else if (*p == '*'
2123 || (!(syntax & RE_BK_PLUS_QM)
2124 && (*p == '+' || *p == '?')))
25fe55af 2125 ;
ed0767d8 2126 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
25fe55af 2127 {
ed0767d8
SM
2128 if (p+1 == pend)
2129 FREE_STACK_RETURN (REG_EESCAPE);
2130 if (p[1] == '+' || p[1] == '?')
2131 PATFETCH (c); /* Gobble up the backslash. */
2132 else
2133 break;
25fe55af
RS
2134 }
2135 else
ed0767d8 2136 break;
25fe55af 2137 /* If we get here, we found another repeat character. */
ed0767d8
SM
2138 PATFETCH (c);
2139 }
25fe55af
RS
2140
2141 /* Star, etc. applied to an empty pattern is equivalent
2142 to an empty pattern. */
4e8a9132 2143 if (!laststart || laststart == b)
25fe55af
RS
2144 break;
2145
2146 /* Now we know whether or not zero matches is allowed
2147 and also whether or not two or more matches is allowed. */
1c8c6d39
DL
2148 if (greedy)
2149 {
99633e97 2150 if (many_times_ok)
4e8a9132
SM
2151 {
2152 boolean simple = skip_one_char (laststart) == b;
2153 unsigned int startoffset = 0;
f6a3f532
SM
2154 re_opcode_t ofj =
2155 (simple || !analyse_first (laststart, b, NULL, 0)) ?
2156 on_failure_jump : on_failure_jump_loop;
4e8a9132
SM
2157 assert (skip_one_char (laststart) <= b);
2158
2159 if (!zero_times_ok && simple)
2160 { /* Since simple * loops can be made faster by using
2161 on_failure_keep_string_jump, we turn simple P+
2162 into PP* if P is simple. */
2163 unsigned char *p1, *p2;
2164 startoffset = b - laststart;
2165 GET_BUFFER_SPACE (startoffset);
2166 p1 = b; p2 = laststart;
2167 while (p2 < p1)
2168 *b++ = *p2++;
2169 zero_times_ok = 1;
99633e97 2170 }
4e8a9132
SM
2171
2172 GET_BUFFER_SPACE (6);
2173 if (!zero_times_ok)
2174 /* A + loop. */
f6a3f532 2175 STORE_JUMP (ofj, b, b + 6);
99633e97 2176 else
4e8a9132
SM
2177 /* Simple * loops can use on_failure_keep_string_jump
2178 depending on what follows. But since we don't know
2179 that yet, we leave the decision up to
2180 on_failure_jump_smart. */
f6a3f532 2181 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
4e8a9132 2182 laststart + startoffset, b + 6);
99633e97 2183 b += 3;
4e8a9132 2184 STORE_JUMP (jump, b, laststart + startoffset);
99633e97
SM
2185 b += 3;
2186 }
2187 else
2188 {
4e8a9132
SM
2189 /* A simple ? pattern. */
2190 assert (zero_times_ok);
2191 GET_BUFFER_SPACE (3);
2192 INSERT_JUMP (on_failure_jump, laststart, b + 3);
99633e97
SM
2193 b += 3;
2194 }
1c8c6d39
DL
2195 }
2196 else /* not greedy */
2197 { /* I wish the greedy and non-greedy cases could be merged. */
2198
0683b6fa 2199 GET_BUFFER_SPACE (7); /* We might use less. */
1c8c6d39
DL
2200 if (many_times_ok)
2201 {
f6a3f532
SM
2202 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2203
505bde11 2204 /* The non-greedy multiple match looks like a repeat..until:
1c8c6d39 2205 we only need a conditional jump at the end of the loop */
f6a3f532
SM
2206 if (emptyp) BUF_PUSH (no_op);
2207 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2208 : on_failure_jump, b, laststart);
1c8c6d39
DL
2209 b += 3;
2210 if (zero_times_ok)
2211 {
2212 /* The repeat...until naturally matches one or more.
2213 To also match zero times, we need to first jump to
2214 the end of the loop (its conditional jump). */
1c8c6d39
DL
2215 INSERT_JUMP (jump, laststart, b);
2216 b += 3;
2217 }
2218 }
2219 else
2220 {
2221 /* non-greedy a?? */
1c8c6d39
DL
2222 INSERT_JUMP (jump, laststart, b + 3);
2223 b += 3;
2224 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2225 b += 3;
2226 }
2227 }
2228 }
4e8a9132 2229 pending_exact = 0;
fa9a63c5
RM
2230 break;
2231
2232
2233 case '.':
25fe55af
RS
2234 laststart = b;
2235 BUF_PUSH (anychar);
2236 break;
fa9a63c5
RM
2237
2238
25fe55af
RS
2239 case '[':
2240 {
b18215fc 2241 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2242
25fe55af 2243 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2244
25fe55af
RS
2245 /* Ensure that we have enough space to push a charset: the
2246 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2247 GET_BUFFER_SPACE (34);
2248
25fe55af 2249 laststart = b;
e318085a 2250
25fe55af
RS
2251 /* We test `*p == '^' twice, instead of using an if
2252 statement, so we only need one BUF_PUSH. */
2253 BUF_PUSH (*p == '^' ? charset_not : charset);
2254 if (*p == '^')
2255 p++;
e318085a 2256
25fe55af
RS
2257 /* Remember the first position in the bracket expression. */
2258 p1 = p;
e318085a 2259
25fe55af
RS
2260 /* Push the number of bytes in the bitmap. */
2261 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2262
25fe55af
RS
2263 /* Clear the whole map. */
2264 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2265
25fe55af
RS
2266 /* charset_not matches newline according to a syntax bit. */
2267 if ((re_opcode_t) b[-2] == charset_not
2268 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2269 SET_LIST_BIT ('\n');
fa9a63c5 2270
25fe55af
RS
2271 /* Read in characters and ranges, setting map bits. */
2272 for (;;)
2273 {
b18215fc 2274 boolean escaped_char = false;
2d1675e4 2275 const unsigned char *p2 = p;
e318085a 2276
25fe55af 2277 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2278
25fe55af 2279 PATFETCH (c);
e318085a 2280
25fe55af
RS
2281 /* \ might escape characters inside [...] and [^...]. */
2282 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2283 {
2284 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2285
2286 PATFETCH (c);
b18215fc 2287 escaped_char = true;
25fe55af 2288 }
b18215fc
RS
2289 else
2290 {
657fcfbd
RS
2291 /* Could be the end of the bracket expression. If it's
2292 not (i.e., when the bracket expression is `[]' so
2293 far), the ']' character bit gets set way below. */
2d1675e4 2294 if (c == ']' && p2 != p1)
657fcfbd 2295 break;
25fe55af 2296 }
b18215fc 2297
b18215fc
RS
2298 /* What should we do for the character which is
2299 greater than 0x7F, but not BASE_LEADING_CODE_P?
2300 XXX */
2301
25fe55af
RS
2302 /* See if we're at the beginning of a possible character
2303 class. */
b18215fc 2304
2d1675e4
SM
2305 if (!escaped_char &&
2306 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2307 {
2308 /* Leave room for the null. */
25fe55af 2309 char str[CHAR_CLASS_MAX_LENGTH + 1];
ed0767d8 2310 const unsigned char *class_beg;
b18215fc 2311
25fe55af
RS
2312 PATFETCH (c);
2313 c1 = 0;
ed0767d8 2314 class_beg = p;
b18215fc 2315
25fe55af
RS
2316 /* If pattern is `[[:'. */
2317 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2318
25fe55af
RS
2319 for (;;)
2320 {
2321 PATFETCH (c);
2322 if (c == ':' || c == ']' || p == pend
2323 || c1 == CHAR_CLASS_MAX_LENGTH)
2324 break;
2325 str[c1++] = c;
2326 }
2327 str[c1] = '\0';
b18215fc
RS
2328
2329 /* If isn't a word bracketed by `[:' and `:]':
2330 undo the ending character, the letters, and
2331 leave the leading `:' and `[' (but set bits for
2332 them). */
25fe55af
RS
2333 if (c == ':' && *p == ']')
2334 {
2335 int ch;
2336 boolean is_alnum = STREQ (str, "alnum");
2337 boolean is_alpha = STREQ (str, "alpha");
f71b19b6 2338 boolean is_ascii = STREQ (str, "ascii");
25fe55af
RS
2339 boolean is_blank = STREQ (str, "blank");
2340 boolean is_cntrl = STREQ (str, "cntrl");
2341 boolean is_digit = STREQ (str, "digit");
2342 boolean is_graph = STREQ (str, "graph");
2343 boolean is_lower = STREQ (str, "lower");
f71b19b6
DL
2344 boolean is_multibyte = STREQ (str, "multibyte");
2345 boolean is_nonascii = STREQ (str, "nonascii");
25fe55af
RS
2346 boolean is_print = STREQ (str, "print");
2347 boolean is_punct = STREQ (str, "punct");
2348 boolean is_space = STREQ (str, "space");
f71b19b6 2349 boolean is_unibyte = STREQ (str, "unibyte");
25fe55af 2350 boolean is_upper = STREQ (str, "upper");
96cc36cc 2351 boolean is_word = STREQ (str, "word");
f71b19b6 2352 boolean is_xdigit = STREQ (str, "xdigit");
25fe55af
RS
2353
2354 if (!IS_CHAR_CLASS (str))
fa9a63c5
RM
2355 FREE_STACK_RETURN (REG_ECTYPE);
2356
25fe55af
RS
2357 /* Throw away the ] at the end of the character
2358 class. */
2359 PATFETCH (c);
fa9a63c5 2360
25fe55af 2361 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2362
96cc36cc
RS
2363 /* Most character classes in a multibyte match
2364 just set a flag. Exceptions are is_blank,
2365 is_digit, is_cntrl, and is_xdigit, since
2366 they can only match ASCII characters. We
2367 don't need to handle them for multibyte. */
2368
2d1675e4 2369 if (multibyte)
96cc36cc
RS
2370 {
2371 int bit = 0;
2372
2373 if (is_alnum) bit = BIT_ALNUM;
2374 if (is_alpha) bit = BIT_ALPHA;
f71b19b6 2375 if (is_ascii) bit = BIT_ASCII;
96cc36cc
RS
2376 if (is_graph) bit = BIT_GRAPH;
2377 if (is_lower) bit = BIT_LOWER;
f71b19b6
DL
2378 if (is_multibyte) bit = BIT_MULTIBYTE;
2379 if (is_nonascii) bit = BIT_NONASCII;
96cc36cc
RS
2380 if (is_print) bit = BIT_PRINT;
2381 if (is_punct) bit = BIT_PUNCT;
2382 if (is_space) bit = BIT_SPACE;
f71b19b6 2383 if (is_unibyte) bit = BIT_UNIBYTE;
96cc36cc
RS
2384 if (is_upper) bit = BIT_UPPER;
2385 if (is_word) bit = BIT_WORD;
2386 if (bit)
2387 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2388 bit);
2389 }
2390
2391 /* Handle character classes for ASCII characters. */
25fe55af
RS
2392 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2393 {
7ae68633 2394 int translated = TRANSLATE (ch);
fa9a63c5
RM
2395 /* This was split into 3 if's to
2396 avoid an arbitrary limit in some compiler. */
25fe55af
RS
2397 if ( (is_alnum && ISALNUM (ch))
2398 || (is_alpha && ISALPHA (ch))
2399 || (is_blank && ISBLANK (ch))
2400 || (is_cntrl && ISCNTRL (ch)))
7ae68633 2401 SET_LIST_BIT (translated);
fa9a63c5 2402 if ( (is_digit && ISDIGIT (ch))
25fe55af
RS
2403 || (is_graph && ISGRAPH (ch))
2404 || (is_lower && ISLOWER (ch))
2405 || (is_print && ISPRINT (ch)))
7ae68633 2406 SET_LIST_BIT (translated);
fa9a63c5 2407 if ( (is_punct && ISPUNCT (ch))
25fe55af
RS
2408 || (is_space && ISSPACE (ch))
2409 || (is_upper && ISUPPER (ch))
2410 || (is_xdigit && ISXDIGIT (ch)))
7ae68633 2411 SET_LIST_BIT (translated);
f71b19b6
DL
2412 if ( (is_ascii && IS_REAL_ASCII (ch))
2413 || (is_nonascii && !IS_REAL_ASCII (ch))
2414 || (is_unibyte && ISUNIBYTE (ch))
2415 || (is_multibyte && !ISUNIBYTE (ch)))
2416 SET_LIST_BIT (translated);
2417
96cc36cc
RS
2418 if ( (is_word && ISWORD (ch)))
2419 SET_LIST_BIT (translated);
25fe55af 2420 }
b18215fc
RS
2421
2422 /* Repeat the loop. */
2423 continue;
25fe55af
RS
2424 }
2425 else
2426 {
ed0767d8
SM
2427 /* Go back to right after the "[:". */
2428 p = class_beg;
25fe55af 2429 SET_LIST_BIT ('[');
b18215fc
RS
2430
2431 /* Because the `:' may starts the range, we
2432 can't simply set bit and repeat the loop.
25fe55af 2433 Instead, just set it to C and handle below. */
b18215fc 2434 c = ':';
25fe55af
RS
2435 }
2436 }
b18215fc
RS
2437
2438 if (p < pend && p[0] == '-' && p[1] != ']')
2439 {
2440
2441 /* Discard the `-'. */
2442 PATFETCH (c1);
2443
2444 /* Fetch the character which ends the range. */
2445 PATFETCH (c1);
b18215fc 2446
b54f61ed 2447 if (SINGLE_BYTE_CHAR_P (c))
e934739e 2448 {
b54f61ed
KH
2449 if (! SINGLE_BYTE_CHAR_P (c1))
2450 {
2451 /* Handle a range such as \177-\377 in
2452 multibyte mode. Split that into two
2453 ranges, the low one ending at 0237, and
2454 the high one starting at the smallest
2455 character in the charset of C1 and
2456 ending at C1. */
2457 int charset = CHAR_CHARSET (c1);
2458 int c2 = MAKE_CHAR (charset, 0, 0);
2459
2460 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2461 c2, c1);
2462 c1 = 0237;
2463 }
e934739e
RS
2464 }
2465 else if (!SAME_CHARSET_P (c, c1))
b18215fc 2466 FREE_STACK_RETURN (REG_ERANGE);
e318085a 2467 }
25fe55af 2468 else
b18215fc
RS
2469 /* Range from C to C. */
2470 c1 = c;
2471
2472 /* Set the range ... */
2473 if (SINGLE_BYTE_CHAR_P (c))
2474 /* ... into bitmap. */
25fe55af 2475 {
b18215fc
RS
2476 unsigned this_char;
2477 int range_start = c, range_end = c1;
2478
2479 /* If the start is after the end, the range is empty. */
2480 if (range_start > range_end)
2481 {
2482 if (syntax & RE_NO_EMPTY_RANGES)
2483 FREE_STACK_RETURN (REG_ERANGE);
2484 /* Else, repeat the loop. */
2485 }
2486 else
2487 {
2488 for (this_char = range_start; this_char <= range_end;
2489 this_char++)
2490 SET_LIST_BIT (TRANSLATE (this_char));
e934739e 2491 }
25fe55af 2492 }
e318085a 2493 else
b18215fc
RS
2494 /* ... into range table. */
2495 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
2496 }
2497
25fe55af
RS
2498 /* Discard any (non)matching list bytes that are all 0 at the
2499 end of the map. Decrease the map-length byte too. */
2500 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2501 b[-1]--;
2502 b += b[-1];
fa9a63c5 2503
96cc36cc
RS
2504 /* Build real range table from work area. */
2505 if (RANGE_TABLE_WORK_USED (range_table_work)
2506 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
2507 {
2508 int i;
2509 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2510
b18215fc 2511 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
2512 bytes for flags, two for COUNT, and three bytes for
2513 each character. */
2514 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 2515
b18215fc
RS
2516 /* Indicate the existence of range table. */
2517 laststart[1] |= 0x80;
fa9a63c5 2518
96cc36cc
RS
2519 /* Store the character class flag bits into the range table.
2520 If not in emacs, these flag bits are always 0. */
2521 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2522 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2523
b18215fc
RS
2524 STORE_NUMBER_AND_INCR (b, used / 2);
2525 for (i = 0; i < used; i++)
2526 STORE_CHARACTER_AND_INCR
2527 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2528 }
25fe55af
RS
2529 }
2530 break;
fa9a63c5
RM
2531
2532
b18215fc 2533 case '(':
25fe55af
RS
2534 if (syntax & RE_NO_BK_PARENS)
2535 goto handle_open;
2536 else
2537 goto normal_char;
fa9a63c5
RM
2538
2539
25fe55af
RS
2540 case ')':
2541 if (syntax & RE_NO_BK_PARENS)
2542 goto handle_close;
2543 else
2544 goto normal_char;
e318085a
RS
2545
2546
25fe55af
RS
2547 case '\n':
2548 if (syntax & RE_NEWLINE_ALT)
2549 goto handle_alt;
2550 else
2551 goto normal_char;
e318085a
RS
2552
2553
b18215fc 2554 case '|':
25fe55af
RS
2555 if (syntax & RE_NO_BK_VBAR)
2556 goto handle_alt;
2557 else
2558 goto normal_char;
2559
2560
2561 case '{':
2562 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2563 goto handle_interval;
2564 else
2565 goto normal_char;
2566
2567
2568 case '\\':
2569 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2570
2571 /* Do not translate the character after the \, so that we can
2572 distinguish, e.g., \B from \b, even if we normally would
2573 translate, e.g., B to b. */
2574 PATFETCH_RAW (c);
2575
2576 switch (c)
2577 {
2578 case '(':
2579 if (syntax & RE_NO_BK_PARENS)
2580 goto normal_backslash;
2581
2582 handle_open:
505bde11
SM
2583 {
2584 int shy = 0;
2585 if (p+1 < pend)
2586 {
2587 /* Look for a special (?...) construct */
ed0767d8 2588 if ((syntax & RE_SHY_GROUPS) && *p == '?')
505bde11 2589 {
ed0767d8 2590 PATFETCH (c); /* Gobble up the '?'. */
505bde11
SM
2591 PATFETCH (c);
2592 switch (c)
2593 {
2594 case ':': shy = 1; break;
2595 default:
2596 /* Only (?:...) is supported right now. */
2597 FREE_STACK_RETURN (REG_BADPAT);
2598 }
2599 }
505bde11
SM
2600 }
2601
2602 if (!shy)
2603 {
2604 bufp->re_nsub++;
2605 regnum++;
2606 }
25fe55af 2607
99633e97
SM
2608 if (COMPILE_STACK_FULL)
2609 {
2610 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2611 compile_stack_elt_t);
2612 if (compile_stack.stack == NULL) return REG_ESPACE;
25fe55af 2613
99633e97
SM
2614 compile_stack.size <<= 1;
2615 }
25fe55af 2616
99633e97
SM
2617 /* These are the values to restore when we hit end of this
2618 group. They are all relative offsets, so that if the
2619 whole pattern moves because of realloc, they will still
2620 be valid. */
2621 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2622 COMPILE_STACK_TOP.fixup_alt_jump
2623 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2624 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2625 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
2626
2627 /* Do not push a
2628 start_memory for groups beyond the last one we can
2629 represent in the compiled pattern. */
2630 if (regnum <= MAX_REGNUM && !shy)
2631 BUF_PUSH_2 (start_memory, regnum);
2632
2633 compile_stack.avail++;
2634
2635 fixup_alt_jump = 0;
2636 laststart = 0;
2637 begalt = b;
2638 /* If we've reached MAX_REGNUM groups, then this open
2639 won't actually generate any code, so we'll have to
2640 clear pending_exact explicitly. */
2641 pending_exact = 0;
2642 break;
505bde11 2643 }
25fe55af
RS
2644
2645 case ')':
2646 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2647
2648 if (COMPILE_STACK_EMPTY)
505bde11
SM
2649 {
2650 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2651 goto normal_backslash;
2652 else
2653 FREE_STACK_RETURN (REG_ERPAREN);
2654 }
25fe55af
RS
2655
2656 handle_close:
505bde11 2657 FIXUP_ALT_JUMP ();
25fe55af
RS
2658
2659 /* See similar code for backslashed left paren above. */
2660 if (COMPILE_STACK_EMPTY)
505bde11
SM
2661 {
2662 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2663 goto normal_char;
2664 else
2665 FREE_STACK_RETURN (REG_ERPAREN);
2666 }
25fe55af
RS
2667
2668 /* Since we just checked for an empty stack above, this
2669 ``can't happen''. */
2670 assert (compile_stack.avail != 0);
2671 {
2672 /* We don't just want to restore into `regnum', because
2673 later groups should continue to be numbered higher,
2674 as in `(ab)c(de)' -- the second group is #2. */
2675 regnum_t this_group_regnum;
2676
2677 compile_stack.avail--;
2678 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2679 fixup_alt_jump
2680 = COMPILE_STACK_TOP.fixup_alt_jump
2681 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2682 : 0;
2683 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2684 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
2685 /* If we've reached MAX_REGNUM groups, then this open
2686 won't actually generate any code, so we'll have to
2687 clear pending_exact explicitly. */
2688 pending_exact = 0;
e318085a 2689
25fe55af
RS
2690 /* We're at the end of the group, so now we know how many
2691 groups were inside this one. */
505bde11
SM
2692 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
2693 BUF_PUSH_2 (stop_memory, this_group_regnum);
25fe55af
RS
2694 }
2695 break;
2696
2697
2698 case '|': /* `\|'. */
2699 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2700 goto normal_backslash;
2701 handle_alt:
2702 if (syntax & RE_LIMITED_OPS)
2703 goto normal_char;
2704
2705 /* Insert before the previous alternative a jump which
2706 jumps to this alternative if the former fails. */
2707 GET_BUFFER_SPACE (3);
2708 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2709 pending_exact = 0;
2710 b += 3;
2711
2712 /* The alternative before this one has a jump after it
2713 which gets executed if it gets matched. Adjust that
2714 jump so it will jump to this alternative's analogous
2715 jump (put in below, which in turn will jump to the next
2716 (if any) alternative's such jump, etc.). The last such
2717 jump jumps to the correct final destination. A picture:
2718 _____ _____
2719 | | | |
2720 | v | v
2721 a | b | c
2722
2723 If we are at `b', then fixup_alt_jump right now points to a
2724 three-byte space after `a'. We'll put in the jump, set
2725 fixup_alt_jump to right after `b', and leave behind three
2726 bytes which we'll fill in when we get to after `c'. */
2727
505bde11 2728 FIXUP_ALT_JUMP ();
25fe55af
RS
2729
2730 /* Mark and leave space for a jump after this alternative,
2731 to be filled in later either by next alternative or
2732 when know we're at the end of a series of alternatives. */
2733 fixup_alt_jump = b;
2734 GET_BUFFER_SPACE (3);
2735 b += 3;
2736
2737 laststart = 0;
2738 begalt = b;
2739 break;
2740
2741
2742 case '{':
2743 /* If \{ is a literal. */
2744 if (!(syntax & RE_INTERVALS)
2745 /* If we're at `\{' and it's not the open-interval
2746 operator. */
ed0767d8
SM
2747 || (syntax & RE_NO_BK_BRACES)
2748 /* What is that? -sm */
2749 /* || (p - 2 == pattern && p == pend) */)
25fe55af
RS
2750 goto normal_backslash;
2751
2752 handle_interval:
2753 {
2754 /* If got here, then the syntax allows intervals. */
2755
2756 /* At least (most) this many matches must be made. */
99633e97 2757 int lower_bound = 0, upper_bound = -1;
25fe55af 2758
ed0767d8 2759 beg_interval = p;
25fe55af
RS
2760
2761 if (p == pend)
2762 {
2763 if (syntax & RE_NO_BK_BRACES)
2764 goto unfetch_interval;
2765 else
2766 FREE_STACK_RETURN (REG_EBRACE);
2767 }
2768
2769 GET_UNSIGNED_NUMBER (lower_bound);
2770
2771 if (c == ',')
ed0767d8 2772 GET_UNSIGNED_NUMBER (upper_bound);
25fe55af
RS
2773 else
2774 /* Interval such as `{1}' => match exactly once. */
2775 upper_bound = lower_bound;
2776
2777 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
ed0767d8 2778 || (upper_bound >= 0 && lower_bound > upper_bound))
25fe55af
RS
2779 {
2780 if (syntax & RE_NO_BK_BRACES)
2781 goto unfetch_interval;
2782 else
2783 FREE_STACK_RETURN (REG_BADBR);
2784 }
2785
2786 if (!(syntax & RE_NO_BK_BRACES))
2787 {
2788 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2789
2790 PATFETCH (c);
2791 }
2792
2793 if (c != '}')
2794 {
2795 if (syntax & RE_NO_BK_BRACES)
2796 goto unfetch_interval;
2797 else
2798 FREE_STACK_RETURN (REG_BADBR);
2799 }
2800
2801 /* We just parsed a valid interval. */
2802
2803 /* If it's invalid to have no preceding re. */
2804 if (!laststart)
2805 {
2806 if (syntax & RE_CONTEXT_INVALID_OPS)
2807 FREE_STACK_RETURN (REG_BADRPT);
2808 else if (syntax & RE_CONTEXT_INDEP_OPS)
2809 laststart = b;
2810 else
2811 goto unfetch_interval;
2812 }
2813
25fe55af 2814 if (upper_bound == 0)
ed0767d8
SM
2815 /* If the upper bound is zero, just drop the sub pattern
2816 altogether. */
2817 b = laststart;
2818 else if (lower_bound == 1 && upper_bound == 1)
2819 /* Just match it once: nothing to do here. */
2820 ;
25fe55af
RS
2821
2822 /* Otherwise, we have a nontrivial interval. When
2823 we're all done, the pattern will look like:
2824 set_number_at <jump count> <upper bound>
2825 set_number_at <succeed_n count> <lower bound>
2826 succeed_n <after jump addr> <succeed_n count>
2827 <body of loop>
2828 jump_n <succeed_n addr> <jump count>
2829 (The upper bound and `jump_n' are omitted if
2830 `upper_bound' is 1, though.) */
2831 else
2832 { /* If the upper bound is > 1, we need to insert
2833 more at the end of the loop. */
ed0767d8
SM
2834 unsigned int nbytes = (upper_bound < 0 ? 3
2835 : upper_bound > 1 ? 5 : 0);
2836 unsigned int startoffset = 0;
2837
2838 GET_BUFFER_SPACE (20); /* We might use less. */
2839
2840 if (lower_bound == 0)
2841 {
2842 /* A succeed_n that starts with 0 is really a
2843 a simple on_failure_jump_loop. */
2844 INSERT_JUMP (on_failure_jump_loop, laststart,
2845 b + 3 + nbytes);
2846 b += 3;
2847 }
2848 else
2849 {
2850 /* Initialize lower bound of the `succeed_n', even
2851 though it will be set during matching by its
2852 attendant `set_number_at' (inserted next),
2853 because `re_compile_fastmap' needs to know.
2854 Jump to the `jump_n' we might insert below. */
2855 INSERT_JUMP2 (succeed_n, laststart,
2856 b + 5 + nbytes,
2857 lower_bound);
2858 b += 5;
2859
2860 /* Code to initialize the lower bound. Insert
2861 before the `succeed_n'. The `5' is the last two
2862 bytes of this `set_number_at', plus 3 bytes of
2863 the following `succeed_n'. */
2864 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2865 b += 5;
2866 startoffset += 5;
2867 }
2868
2869 if (upper_bound < 0)
2870 {
2871 /* A negative upper bound stands for infinity,
2872 in which case it degenerates to a plain jump. */
2873 STORE_JUMP (jump, b, laststart + startoffset);
2874 b += 3;
2875 }
2876 else if (upper_bound > 1)
25fe55af
RS
2877 { /* More than one repetition is allowed, so
2878 append a backward jump to the `succeed_n'
2879 that starts this interval.
2880
2881 When we've reached this during matching,
2882 we'll have matched the interval once, so
2883 jump back only `upper_bound - 1' times. */
ed0767d8 2884 STORE_JUMP2 (jump_n, b, laststart + startoffset,
25fe55af
RS
2885 upper_bound - 1);
2886 b += 5;
2887
2888 /* The location we want to set is the second
2889 parameter of the `jump_n'; that is `b-2' as
2890 an absolute address. `laststart' will be
2891 the `set_number_at' we're about to insert;
2892 `laststart+3' the number to set, the source
2893 for the relative address. But we are
2894 inserting into the middle of the pattern --
2895 so everything is getting moved up by 5.
2896 Conclusion: (b - 2) - (laststart + 3) + 5,
2897 i.e., b - laststart.
2898
2899 We insert this at the beginning of the loop
2900 so that if we fail during matching, we'll
2901 reinitialize the bounds. */
2902 insert_op2 (set_number_at, laststart, b - laststart,
2903 upper_bound - 1, b);
2904 b += 5;
2905 }
2906 }
2907 pending_exact = 0;
2908 beg_interval = NULL;
2909 }
2910 break;
2911
2912 unfetch_interval:
2913 /* If an invalid interval, match the characters as literals. */
2914 assert (beg_interval);
2915 p = beg_interval;
2916 beg_interval = NULL;
2917
2918 /* normal_char and normal_backslash need `c'. */
ed0767d8 2919 c = '{';
25fe55af
RS
2920
2921 if (!(syntax & RE_NO_BK_BRACES))
2922 {
ed0767d8
SM
2923 assert (p > pattern && p[-1] == '\\');
2924 goto normal_backslash;
25fe55af 2925 }
ed0767d8
SM
2926 else
2927 goto normal_char;
e318085a 2928
b18215fc 2929#ifdef emacs
25fe55af
RS
2930 /* There is no way to specify the before_dot and after_dot
2931 operators. rms says this is ok. --karl */
2932 case '=':
2933 BUF_PUSH (at_dot);
2934 break;
2935
2936 case 's':
2937 laststart = b;
2938 PATFETCH (c);
2939 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2940 break;
2941
2942 case 'S':
2943 laststart = b;
2944 PATFETCH (c);
2945 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2946 break;
b18215fc
RS
2947
2948 case 'c':
2949 laststart = b;
2950 PATFETCH_RAW (c);
2951 BUF_PUSH_2 (categoryspec, c);
2952 break;
e318085a 2953
b18215fc
RS
2954 case 'C':
2955 laststart = b;
2956 PATFETCH_RAW (c);
2957 BUF_PUSH_2 (notcategoryspec, c);
2958 break;
2959#endif /* emacs */
e318085a 2960
e318085a 2961
25fe55af
RS
2962 case 'w':
2963 laststart = b;
1fb352e0 2964 BUF_PUSH_2 (syntaxspec, Sword);
25fe55af 2965 break;
e318085a 2966
e318085a 2967
25fe55af
RS
2968 case 'W':
2969 laststart = b;
1fb352e0 2970 BUF_PUSH_2 (notsyntaxspec, Sword);
25fe55af 2971 break;
e318085a
RS
2972
2973
25fe55af
RS
2974 case '<':
2975 BUF_PUSH (wordbeg);
2976 break;
e318085a 2977
25fe55af
RS
2978 case '>':
2979 BUF_PUSH (wordend);
2980 break;
e318085a 2981
25fe55af
RS
2982 case 'b':
2983 BUF_PUSH (wordbound);
2984 break;
e318085a 2985
25fe55af
RS
2986 case 'B':
2987 BUF_PUSH (notwordbound);
2988 break;
fa9a63c5 2989
25fe55af
RS
2990 case '`':
2991 BUF_PUSH (begbuf);
2992 break;
e318085a 2993
25fe55af
RS
2994 case '\'':
2995 BUF_PUSH (endbuf);
2996 break;
e318085a 2997
25fe55af
RS
2998 case '1': case '2': case '3': case '4': case '5':
2999 case '6': case '7': case '8': case '9':
3000 if (syntax & RE_NO_BK_REFS)
3001 goto normal_char;
e318085a 3002
25fe55af 3003 c1 = c - '0';
e318085a 3004
25fe55af
RS
3005 if (c1 > regnum)
3006 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3007
25fe55af
RS
3008 /* Can't back reference to a subexpression if inside of it. */
3009 if (group_in_compile_stack (compile_stack, c1))
3010 goto normal_char;
e318085a 3011
25fe55af
RS
3012 laststart = b;
3013 BUF_PUSH_2 (duplicate, c1);
3014 break;
e318085a 3015
e318085a 3016
25fe55af
RS
3017 case '+':
3018 case '?':
3019 if (syntax & RE_BK_PLUS_QM)
3020 goto handle_plus;
3021 else
3022 goto normal_backslash;
3023
3024 default:
3025 normal_backslash:
3026 /* You might think it would be useful for \ to mean
3027 not to translate; but if we don't translate it
3028 it will never match anything. */
3029 c = TRANSLATE (c);
3030 goto normal_char;
3031 }
3032 break;
fa9a63c5
RM
3033
3034
3035 default:
25fe55af 3036 /* Expects the character in `c'. */
fa9a63c5
RM
3037 normal_char:
3038 /* If no exactn currently being built. */
25fe55af 3039 if (!pending_exact
fa9a63c5 3040
25fe55af
RS
3041 /* If last exactn not at current position. */
3042 || pending_exact + *pending_exact + 1 != b
5e69f11e 3043
25fe55af 3044 /* We have only one byte following the exactn for the count. */
2d1675e4 3045 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
fa9a63c5 3046
25fe55af 3047 /* If followed by a repetition operator. */
9d99031f 3048 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3049 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3050 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3051 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3052 || ((syntax & RE_INTERVALS)
25fe55af 3053 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3054 ? p != pend && *p == '{'
3055 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3056 {
3057 /* Start building a new exactn. */
5e69f11e 3058
25fe55af 3059 laststart = b;
fa9a63c5
RM
3060
3061 BUF_PUSH_2 (exactn, 0);
3062 pending_exact = b - 1;
25fe55af 3063 }
5e69f11e 3064
2d1675e4
SM
3065 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3066 {
3067 int len = CHAR_STRING (c, b);
3068 b += len;
3069 (*pending_exact) += len;
3070 }
3071
fa9a63c5 3072 break;
25fe55af 3073 } /* switch (c) */
fa9a63c5
RM
3074 } /* while p != pend */
3075
5e69f11e 3076
fa9a63c5 3077 /* Through the pattern now. */
5e69f11e 3078
505bde11 3079 FIXUP_ALT_JUMP ();
fa9a63c5 3080
5e69f11e 3081 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3082 FREE_STACK_RETURN (REG_EPAREN);
3083
3084 /* If we don't want backtracking, force success
3085 the first time we reach the end of the compiled pattern. */
3086 if (syntax & RE_NO_POSIX_BACKTRACKING)
3087 BUF_PUSH (succeed);
3088
3089 free (compile_stack.stack);
3090
3091 /* We have succeeded; set the length of the buffer. */
3092 bufp->used = b - bufp->buffer;
3093
3094#ifdef DEBUG
99633e97 3095 if (debug > 0)
fa9a63c5 3096 {
505bde11 3097 re_compile_fastmap (bufp);
fa9a63c5
RM
3098 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3099 print_compiled_pattern (bufp);
3100 }
99633e97 3101 debug--;
fa9a63c5
RM
3102#endif /* DEBUG */
3103
3104#ifndef MATCH_MAY_ALLOCATE
3105 /* Initialize the failure stack to the largest possible stack. This
3106 isn't necessary unless we're trying to avoid calling alloca in
3107 the search and match routines. */
3108 {
3109 int num_regs = bufp->re_nsub + 1;
3110
320a2a73 3111 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3112 {
a26f4ccd 3113 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5 3114
fa9a63c5
RM
3115 if (! fail_stack.stack)
3116 fail_stack.stack
5e69f11e 3117 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3118 * sizeof (fail_stack_elt_t));
3119 else
3120 fail_stack.stack
3121 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3122 (fail_stack.size
3123 * sizeof (fail_stack_elt_t)));
fa9a63c5
RM
3124 }
3125
3126 regex_grow_registers (num_regs);
3127 }
3128#endif /* not MATCH_MAY_ALLOCATE */
3129
3130 return REG_NOERROR;
3131} /* regex_compile */
3132\f
3133/* Subroutines for `regex_compile'. */
3134
25fe55af 3135/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3136
3137static void
3138store_op1 (op, loc, arg)
3139 re_opcode_t op;
3140 unsigned char *loc;
3141 int arg;
3142{
3143 *loc = (unsigned char) op;
3144 STORE_NUMBER (loc + 1, arg);
3145}
3146
3147
3148/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3149
3150static void
3151store_op2 (op, loc, arg1, arg2)
3152 re_opcode_t op;
3153 unsigned char *loc;
3154 int arg1, arg2;
3155{
3156 *loc = (unsigned char) op;
3157 STORE_NUMBER (loc + 1, arg1);
3158 STORE_NUMBER (loc + 3, arg2);
3159}
3160
3161
3162/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3163 for OP followed by two-byte integer parameter ARG. */
3164
3165static void
3166insert_op1 (op, loc, arg, end)
3167 re_opcode_t op;
3168 unsigned char *loc;
3169 int arg;
5e69f11e 3170 unsigned char *end;
fa9a63c5
RM
3171{
3172 register unsigned char *pfrom = end;
3173 register unsigned char *pto = end + 3;
3174
3175 while (pfrom != loc)
3176 *--pto = *--pfrom;
5e69f11e 3177
fa9a63c5
RM
3178 store_op1 (op, loc, arg);
3179}
3180
3181
3182/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3183
3184static void
3185insert_op2 (op, loc, arg1, arg2, end)
3186 re_opcode_t op;
3187 unsigned char *loc;
3188 int arg1, arg2;
5e69f11e 3189 unsigned char *end;
fa9a63c5
RM
3190{
3191 register unsigned char *pfrom = end;
3192 register unsigned char *pto = end + 5;
3193
3194 while (pfrom != loc)
3195 *--pto = *--pfrom;
5e69f11e 3196
fa9a63c5
RM
3197 store_op2 (op, loc, arg1, arg2);
3198}
3199
3200
3201/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3202 after an alternative or a begin-subexpression. We assume there is at
3203 least one character before the ^. */
3204
3205static boolean
3206at_begline_loc_p (pattern, p, syntax)
99633e97 3207 const unsigned char *pattern, *p;
fa9a63c5
RM
3208 reg_syntax_t syntax;
3209{
4e8a9132 3210 const unsigned char *prev = p - 2;
fa9a63c5 3211 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3212
fa9a63c5
RM
3213 return
3214 /* After a subexpression? */
3215 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3216 /* After an alternative? */
d2af47df
SM
3217 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3218 /* After a shy subexpression? */
3219 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3220 && prev[-1] == '?' && prev[-2] == '('
3221 && (syntax & RE_NO_BK_PARENS
3222 || (prev - 3 >= pattern && prev[-3] == '\\')));
fa9a63c5
RM
3223}
3224
3225
3226/* The dual of at_begline_loc_p. This one is for $. We assume there is
3227 at least one character after the $, i.e., `P < PEND'. */
3228
3229static boolean
3230at_endline_loc_p (p, pend, syntax)
99633e97
SM
3231 const unsigned char *p, *pend;
3232 reg_syntax_t syntax;
fa9a63c5 3233{
4e8a9132 3234 const unsigned char *next = p;
fa9a63c5 3235 boolean next_backslash = *next == '\\';
4e8a9132 3236 const unsigned char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3237
fa9a63c5
RM
3238 return
3239 /* Before a subexpression? */
3240 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3241 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3242 /* Before an alternative? */
3243 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3244 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3245}
3246
3247
5e69f11e 3248/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3249 false if it's not. */
3250
3251static boolean
3252group_in_compile_stack (compile_stack, regnum)
3253 compile_stack_type compile_stack;
3254 regnum_t regnum;
3255{
3256 int this_element;
3257
5e69f11e
RM
3258 for (this_element = compile_stack.avail - 1;
3259 this_element >= 0;
fa9a63c5
RM
3260 this_element--)
3261 if (compile_stack.stack[this_element].regnum == regnum)
3262 return true;
3263
3264 return false;
3265}
fa9a63c5 3266\f
f6a3f532
SM
3267/* analyse_first.
3268 If fastmap is non-NULL, go through the pattern and fill fastmap
3269 with all the possible leading chars. If fastmap is NULL, don't
3270 bother filling it up (obviously) and only return whether the
3271 pattern could potentially match the empty string.
3272
3273 Return 1 if p..pend might match the empty string.
3274 Return 0 if p..pend matches at least one char.
3275 Return -1 if p..pend matches at least one char, but fastmap was not
3276 updated accurately.
3277 Return -2 if an error occurred. */
3278
3279static int
3280analyse_first (p, pend, fastmap, multibyte)
3281 unsigned char *p, *pend;
3282 char *fastmap;
3283 const int multibyte;
fa9a63c5 3284{
505bde11 3285 int j, k;
1fb352e0 3286 boolean not;
fa9a63c5
RM
3287#ifdef MATCH_MAY_ALLOCATE
3288 fail_stack_type fail_stack;
3289#endif
3290#ifndef REGEX_MALLOC
3291 char *destination;
3292#endif
5e69f11e 3293
99633e97 3294#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
fa9a63c5
RM
3295 /* This holds the pointer to the failure stack, when
3296 it is allocated relocatably. */
3297 fail_stack_elt_t *failure_stack_ptr;
99633e97 3298#endif
fa9a63c5
RM
3299
3300 /* Assume that each path through the pattern can be null until
25fe55af 3301 proven otherwise. We set this false at the bottom of switch
fa9a63c5
RM
3302 statement, to which we get only if a particular path doesn't
3303 match the empty string. */
3304 boolean path_can_be_null = true;
3305
b18215fc 3306 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3307 flag is set true. */
b18215fc
RS
3308 boolean match_any_multibyte_characters = false;
3309
f6a3f532 3310 assert (p);
5e69f11e 3311
fa9a63c5 3312 INIT_FAIL_STACK ();
5e69f11e 3313
505bde11
SM
3314 /* The loop below works as follows:
3315 - It has a working-list kept in the PATTERN_STACK and which basically
3316 starts by only containing a pointer to the first operation.
3317 - If the opcode we're looking at is a match against some set of
3318 chars, then we add those chars to the fastmap and go on to the
3319 next work element from the worklist (done via `break').
3320 - If the opcode is a control operator on the other hand, we either
3321 ignore it (if it's meaningless at this point, such as `start_memory')
3322 or execute it (if it's a jump). If the jump has several destinations
3323 (i.e. `on_failure_jump'), then we push the other destination onto the
3324 worklist.
3325 We guarantee termination by ignoring backward jumps (more or less),
3326 so that `p' is monotonically increasing. More to the point, we
3327 never set `p' (or push) anything `<= p1'. */
3328
3329 /* If can_be_null is set, then the fastmap will not be used anyway. */
f6a3f532 3330 while (1)
fa9a63c5 3331 {
505bde11
SM
3332 /* `p1' is used as a marker of how far back a `on_failure_jump'
3333 can go without being ignored. It is normally equal to `p'
3334 (which prevents any backward `on_failure_jump') except right
3335 after a plain `jump', to allow patterns such as:
3336 0: jump 10
3337 3..9: <body>
3338 10: on_failure_jump 3
3339 as used for the *? operator. */
3340 unsigned char *p1 = p;
3341
f6a3f532 3342 if (p >= pend)
fa9a63c5 3343 {
f6a3f532
SM
3344 if (path_can_be_null)
3345 return (RESET_FAIL_STACK (), 1);
fa9a63c5 3346
f6a3f532
SM
3347 /* We have reached the (effective) end of pattern. */
3348 if (PATTERN_STACK_EMPTY ())
3349 return (RESET_FAIL_STACK (), 0);
fa9a63c5 3350
f6a3f532
SM
3351 p = (unsigned char*) POP_PATTERN_OP ();
3352 path_can_be_null = true;
3353 continue;
fa9a63c5
RM
3354 }
3355
25fe55af 3356 /* We should never be about to go beyond the end of the pattern. */
fa9a63c5 3357 assert (p < pend);
5e69f11e 3358
fa9a63c5
RM
3359 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3360 {
f6a3f532
SM
3361 case succeed:
3362 p = pend;
3363 continue;
fa9a63c5 3364
fa9a63c5 3365 case duplicate:
505bde11
SM
3366 /* If the first character has to match a backreference, that means
3367 that the group was empty (since it already matched). Since this
3368 is the only case that interests us here, we can assume that the
3369 backreference must match the empty string. */
3370 p++;
3371 continue;
fa9a63c5
RM
3372
3373
3374 /* Following are the cases which match a character. These end
25fe55af 3375 with `break'. */
fa9a63c5
RM
3376
3377 case exactn:
f6a3f532 3378 if (fastmap) fastmap[p[1]] = 1;
fa9a63c5
RM
3379 break;
3380
3381
1fb352e0
SM
3382 case anychar:
3383 /* We could put all the chars except for \n (and maybe \0)
3384 but we don't bother since it is generally not worth it. */
f6a3f532
SM
3385 if (!fastmap) break;
3386 return (RESET_FAIL_STACK (), -1);
fa9a63c5
RM
3387
3388
b18215fc 3389 case charset_not:
ba5c004d
RS
3390 /* Chars beyond end of bitmap are possible matches.
3391 All the single-byte codes can occur in multibyte buffers.
3392 So any that are not listed in the charset
3393 are possible matches, even in multibyte buffers. */
1fb352e0 3394 if (!fastmap) break;
b18215fc 3395 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
1fb352e0 3396 j < (1 << BYTEWIDTH); j++)
b18215fc 3397 fastmap[j] = 1;
1fb352e0
SM
3398 /* Fallthrough */
3399 case charset:
3400 if (!fastmap) break;
3401 not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
3402 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3403 j >= 0; j--)
1fb352e0 3404 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
b18215fc
RS
3405 fastmap[j] = 1;
3406
1fb352e0
SM
3407 if ((not && multibyte)
3408 /* Any character set can possibly contain a character
3409 which doesn't match the specified set of characters. */
3410 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3411 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3412 /* If we can match a character class, we can match
3413 any character set. */
b18215fc
RS
3414 {
3415 set_fastmap_for_multibyte_characters:
3416 if (match_any_multibyte_characters == false)
3417 {
3418 for (j = 0x80; j < 0xA0; j++) /* XXX */
3419 if (BASE_LEADING_CODE_P (j))
3420 fastmap[j] = 1;
3421 match_any_multibyte_characters = true;
3422 }
3423 }
b18215fc 3424
1fb352e0
SM
3425 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3426 && match_any_multibyte_characters == false)
3427 {
3428 /* Set fastmap[I] 1 where I is a base leading code of each
3429 multibyte character in the range table. */
3430 int c, count;
b18215fc 3431
1fb352e0
SM
3432 /* Make P points the range table. `+ 2' is to skip flag
3433 bits for a character class. */
3434 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
b18215fc 3435
1fb352e0
SM
3436 /* Extract the number of ranges in range table into COUNT. */
3437 EXTRACT_NUMBER_AND_INCR (count, p);
3438 for (; count > 0; count--, p += 2 * 3) /* XXX */
3439 {
3440 /* Extract the start of each range. */
3441 EXTRACT_CHARACTER (c, p);
3442 j = CHAR_CHARSET (c);
3443 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3444 }
3445 }
b18215fc
RS
3446 break;
3447
1fb352e0
SM
3448 case syntaxspec:
3449 case notsyntaxspec:
3450 if (!fastmap) break;
3451#ifndef emacs
3452 not = (re_opcode_t)p[-1] == notsyntaxspec;
3453 k = *p++;
3454 for (j = 0; j < (1 << BYTEWIDTH); j++)
990b2375 3455 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
b18215fc 3456 fastmap[j] = 1;
b18215fc 3457 break;
1fb352e0 3458#else /* emacs */
b18215fc
RS
3459 /* This match depends on text properties. These end with
3460 aborting optimizations. */
f6a3f532 3461 return (RESET_FAIL_STACK (), -1);
b18215fc
RS
3462
3463 case categoryspec:
b18215fc 3464 case notcategoryspec:
1fb352e0
SM
3465 if (!fastmap) break;
3466 not = (re_opcode_t)p[-1] == notcategoryspec;
b18215fc 3467 k = *p++;
1fb352e0
SM
3468 for (j = 0; j < (1 << BYTEWIDTH); j++)
3469 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
b18215fc
RS
3470 fastmap[j] = 1;
3471
1fb352e0 3472 if (multibyte)
b18215fc 3473 /* Any character set can possibly contain a character
1fb352e0 3474 whose category is K (or not). */
b18215fc
RS
3475 goto set_fastmap_for_multibyte_characters;
3476 break;
3477
fa9a63c5 3478 /* All cases after this match the empty string. These end with
25fe55af 3479 `continue'. */
fa9a63c5 3480
fa9a63c5
RM
3481 case before_dot:
3482 case at_dot:
3483 case after_dot:
1fb352e0 3484#endif /* !emacs */
25fe55af
RS
3485 case no_op:
3486 case begline:
3487 case endline:
fa9a63c5
RM
3488 case begbuf:
3489 case endbuf:
3490 case wordbound:
3491 case notwordbound:
3492 case wordbeg:
3493 case wordend:
25fe55af 3494 continue;
fa9a63c5
RM
3495
3496
fa9a63c5 3497 case jump:
25fe55af 3498 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11
SM
3499 if (j < 0)
3500 /* Backward jumps can only go back to code that we've already
3501 visited. `re_compile' should make sure this is true. */
3502 break;
25fe55af 3503 p += j;
505bde11
SM
3504 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
3505 {
3506 case on_failure_jump:
3507 case on_failure_keep_string_jump:
505bde11 3508 case on_failure_jump_loop:
0683b6fa 3509 case on_failure_jump_nastyloop:
505bde11
SM
3510 case on_failure_jump_smart:
3511 p++;
3512 break;
3513 default:
3514 continue;
3515 };
3516 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3517 to jump back to "just after here". */
3518 /* Fallthrough */
fa9a63c5 3519
25fe55af
RS
3520 case on_failure_jump:
3521 case on_failure_keep_string_jump:
0683b6fa 3522 case on_failure_jump_nastyloop:
505bde11
SM
3523 case on_failure_jump_loop:
3524 case on_failure_jump_smart:
25fe55af 3525 EXTRACT_NUMBER_AND_INCR (j, p);
505bde11 3526 if (p + j <= p1)
ed0767d8
SM
3527 ; /* Backward jump to be ignored. */
3528 else if (!PUSH_PATTERN_OP (p + j, fail_stack))
3529 return (RESET_FAIL_STACK (), -2);
25fe55af 3530 continue;
fa9a63c5
RM
3531
3532
ed0767d8
SM
3533 case jump_n:
3534 /* This code simply does not properly handle forward jump_n. */
3535 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
3536 p += 4;
3537 /* jump_n can either jump or fall through. The (backward) jump
3538 case has already been handled, so we only need to look at the
3539 fallthrough case. */
3540 continue;
3541
fa9a63c5 3542 case succeed_n:
ed0767d8
SM
3543 /* If N == 0, it should be an on_failure_jump_loop instead. */
3544 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
3545 p += 4;
3546 /* We only care about one iteration of the loop, so we don't
3547 need to consider the case where this behaves like an
3548 on_failure_jump. */
25fe55af 3549 continue;
fa9a63c5
RM
3550
3551
3552 case set_number_at:
25fe55af
RS
3553 p += 4;
3554 continue;
fa9a63c5
RM
3555
3556
3557 case start_memory:
25fe55af 3558 case stop_memory:
505bde11 3559 p += 1;
fa9a63c5
RM
3560 continue;
3561
3562
3563 default:
25fe55af
RS
3564 abort (); /* We have listed all the cases. */
3565 } /* switch *p++ */
fa9a63c5
RM
3566
3567 /* Getting here means we have found the possible starting
25fe55af
RS
3568 characters for one path of the pattern -- and that the empty
3569 string does not match. We need not follow this path further.
3570 Instead, look at the next alternative (remembered on the
3571 stack), or quit if no more. The test at the top of the loop
3572 does these things. */
fa9a63c5
RM
3573 path_can_be_null = false;
3574 p = pend;
3575 } /* while p */
3576
f6a3f532
SM
3577 return (RESET_FAIL_STACK (), 0);
3578} /* analyse_first */
3579\f
3580/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3581 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3582 characters can start a string that matches the pattern. This fastmap
3583 is used by re_search to skip quickly over impossible starting points.
3584
3585 Character codes above (1 << BYTEWIDTH) are not represented in the
3586 fastmap, but the leading codes are represented. Thus, the fastmap
3587 indicates which character sets could start a match.
3588
3589 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3590 area as BUFP->fastmap.
3591
3592 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3593 the pattern buffer.
3594
3595 Returns 0 if we succeed, -2 if an internal error. */
3596
3597int
3598re_compile_fastmap (bufp)
3599 struct re_pattern_buffer *bufp;
3600{
3601 char *fastmap = bufp->fastmap;
3602 int analysis;
3603
3604 assert (fastmap && bufp->buffer);
3605
3606 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3607 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3608
3609 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
2d1675e4 3610 fastmap, RE_MULTIBYTE_P (bufp));
f6a3f532
SM
3611 if (analysis < -1)
3612 return analysis;
3613 bufp->can_be_null = (analysis != 0);
fa9a63c5
RM
3614 return 0;
3615} /* re_compile_fastmap */
3616\f
3617/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3618 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3619 this memory for recording register information. STARTS and ENDS
3620 must be allocated using the malloc library routine, and must each
3621 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3622
3623 If NUM_REGS == 0, then subsequent matches should allocate their own
3624 register data.
3625
3626 Unless this function is called, the first search or match using
3627 PATTERN_BUFFER will allocate its own register data, without
3628 freeing the old data. */
3629
3630void
3631re_set_registers (bufp, regs, num_regs, starts, ends)
3632 struct re_pattern_buffer *bufp;
3633 struct re_registers *regs;
3634 unsigned num_regs;
3635 regoff_t *starts, *ends;
3636{
3637 if (num_regs)
3638 {
3639 bufp->regs_allocated = REGS_REALLOCATE;
3640 regs->num_regs = num_regs;
3641 regs->start = starts;
3642 regs->end = ends;
3643 }
3644 else
3645 {
3646 bufp->regs_allocated = REGS_UNALLOCATED;
3647 regs->num_regs = 0;
3648 regs->start = regs->end = (regoff_t *) 0;
3649 }
3650}
3651\f
25fe55af 3652/* Searching routines. */
fa9a63c5
RM
3653
3654/* Like re_search_2, below, but only one string is specified, and
3655 doesn't let you say where to stop matching. */
3656
3657int
3658re_search (bufp, string, size, startpos, range, regs)
3659 struct re_pattern_buffer *bufp;
3660 const char *string;
3661 int size, startpos, range;
3662 struct re_registers *regs;
3663{
5e69f11e 3664 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
3665 regs, size);
3666}
3667
b18215fc
RS
3668/* End address of virtual concatenation of string. */
3669#define STOP_ADDR_VSTRING(P) \
3670 (((P) >= size1 ? string2 + size2 : string1 + size1))
3671
3672/* Address of POS in the concatenation of virtual string. */
3673#define POS_ADDR_VSTRING(POS) \
3674 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
3675
3676/* Using the compiled pattern in BUFP->buffer, first tries to match the
3677 virtual concatenation of STRING1 and STRING2, starting first at index
3678 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 3679
fa9a63c5 3680 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 3681
fa9a63c5
RM
3682 RANGE is how far to scan while trying to match. RANGE = 0 means try
3683 only at STARTPOS; in general, the last start tried is STARTPOS +
3684 RANGE.
5e69f11e 3685
fa9a63c5
RM
3686 In REGS, return the indices of the virtual concatenation of STRING1
3687 and STRING2 that matched the entire BUFP->buffer and its contained
3688 subexpressions.
5e69f11e 3689
fa9a63c5
RM
3690 Do not consider matching one past the index STOP in the virtual
3691 concatenation of STRING1 and STRING2.
3692
3693 We return either the position in the strings at which the match was
3694 found, -1 if no match, or -2 if error (such as failure
3695 stack overflow). */
3696
3697int
66f0296e 3698re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
fa9a63c5 3699 struct re_pattern_buffer *bufp;
66f0296e 3700 const char *str1, *str2;
fa9a63c5
RM
3701 int size1, size2;
3702 int startpos;
3703 int range;
3704 struct re_registers *regs;
3705 int stop;
3706{
3707 int val;
66f0296e
SM
3708 re_char *string1 = (re_char*) str1;
3709 re_char *string2 = (re_char*) str2;
fa9a63c5 3710 register char *fastmap = bufp->fastmap;
6676cb1c 3711 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
3712 int total_size = size1 + size2;
3713 int endpos = startpos + range;
c8499ba5 3714 int anchored_start = 0;
fa9a63c5 3715
25fe55af 3716 /* Nonzero if we have to concern multibyte character. */
2d1675e4 3717 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 3718
fa9a63c5
RM
3719 /* Check for out-of-range STARTPOS. */
3720 if (startpos < 0 || startpos > total_size)
3721 return -1;
5e69f11e 3722
fa9a63c5 3723 /* Fix up RANGE if it might eventually take us outside
34597fa9 3724 the virtual concatenation of STRING1 and STRING2.
5e69f11e 3725 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
3726 if (endpos < 0)
3727 range = 0 - startpos;
fa9a63c5
RM
3728 else if (endpos > total_size)
3729 range = total_size - startpos;
3730
3731 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 3732 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
3733 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3734 {
3735 if (startpos > 0)
3736 return -1;
3737 else
7b140fd7 3738 range = 0;
fa9a63c5
RM
3739 }
3740
ae4788a8
RS
3741#ifdef emacs
3742 /* In a forward search for something that starts with \=.
3743 don't keep searching past point. */
3744 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3745 {
7b140fd7
RS
3746 range = PT_BYTE - BEGV_BYTE - startpos;
3747 if (range < 0)
ae4788a8
RS
3748 return -1;
3749 }
3750#endif /* emacs */
3751
fa9a63c5
RM
3752 /* Update the fastmap now if not correct already. */
3753 if (fastmap && !bufp->fastmap_accurate)
3754 if (re_compile_fastmap (bufp) == -2)
3755 return -2;
5e69f11e 3756
c8499ba5
RS
3757 /* See whether the pattern is anchored. */
3758 if (bufp->buffer[0] == begline)
3759 anchored_start = 1;
3760
b18215fc 3761#ifdef emacs
cc9b4df2
KH
3762 gl_state.object = re_match_object;
3763 {
99633e97 3764 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
cc9b4df2
KH
3765
3766 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3767 }
b18215fc
RS
3768#endif
3769
fa9a63c5
RM
3770 /* Loop through the string, looking for a place to start matching. */
3771 for (;;)
5e69f11e 3772 {
c8499ba5
RS
3773 /* If the pattern is anchored,
3774 skip quickly past places we cannot match.
3775 We don't bother to treat startpos == 0 specially
3776 because that case doesn't repeat. */
3777 if (anchored_start && startpos > 0)
3778 {
3779 if (! (bufp->newline_anchor
3780 && ((startpos <= size1 ? string1[startpos - 1]
3781 : string2[startpos - size1 - 1])
3782 == '\n')))
3783 goto advance;
3784 }
3785
fa9a63c5 3786 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
3787 cannot be the start of a match. If the pattern can match the
3788 null string, however, we don't need to skip characters; we want
3789 the first null string. */
fa9a63c5
RM
3790 if (fastmap && startpos < total_size && !bufp->can_be_null)
3791 {
66f0296e 3792 register re_char *d;
e934739e
RS
3793 register unsigned int buf_ch;
3794
3795 d = POS_ADDR_VSTRING (startpos);
3796
25fe55af 3797 if (range > 0) /* Searching forwards. */
fa9a63c5 3798 {
fa9a63c5
RM
3799 register int lim = 0;
3800 int irange = range;
3801
25fe55af
RS
3802 if (startpos < size1 && startpos + range >= size1)
3803 lim = range - (size1 - startpos);
fa9a63c5 3804
25fe55af
RS
3805 /* Written out as an if-else to avoid testing `translate'
3806 inside the loop. */
28ae27ae
AS
3807 if (RE_TRANSLATE_P (translate))
3808 {
e934739e
RS
3809 if (multibyte)
3810 while (range > lim)
3811 {
3812 int buf_charlen;
3813
3814 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
3815 buf_charlen);
3816
3817 buf_ch = RE_TRANSLATE (translate, buf_ch);
3818 if (buf_ch >= 0400
3819 || fastmap[buf_ch])
3820 break;
3821
3822 range -= buf_charlen;
3823 d += buf_charlen;
3824 }
3825 else
3826 while (range > lim
66f0296e 3827 && !fastmap[RE_TRANSLATE (translate, *d)])
33c46939
RS
3828 {
3829 d++;
3830 range--;
3831 }
e934739e 3832 }
fa9a63c5 3833 else
66f0296e 3834 while (range > lim && !fastmap[*d])
33c46939
RS
3835 {
3836 d++;
3837 range--;
3838 }
fa9a63c5
RM
3839
3840 startpos += irange - range;
3841 }
25fe55af 3842 else /* Searching backwards. */
fa9a63c5 3843 {
2d1675e4
SM
3844 int room = (startpos >= size1
3845 ? size2 + size1 - startpos
3846 : size1 - startpos);
3847 buf_ch = RE_STRING_CHAR (d, room);
3848 buf_ch = TRANSLATE (buf_ch);
fa9a63c5 3849
e934739e
RS
3850 if (! (buf_ch >= 0400
3851 || fastmap[buf_ch]))
fa9a63c5
RM
3852 goto advance;
3853 }
3854 }
3855
3856 /* If can't match the null string, and that's all we have left, fail. */
3857 if (range >= 0 && startpos == total_size && fastmap
25fe55af 3858 && !bufp->can_be_null)
fa9a63c5
RM
3859 return -1;
3860
3861 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3862 startpos, regs, stop);
3863#ifndef REGEX_MALLOC
3864#ifdef C_ALLOCA
3865 alloca (0);
3866#endif
3867#endif
3868
3869 if (val >= 0)
3870 return startpos;
5e69f11e 3871
fa9a63c5
RM
3872 if (val == -2)
3873 return -2;
3874
3875 advance:
5e69f11e 3876 if (!range)
25fe55af 3877 break;
5e69f11e 3878 else if (range > 0)
25fe55af 3879 {
b18215fc
RS
3880 /* Update STARTPOS to the next character boundary. */
3881 if (multibyte)
3882 {
66f0296e
SM
3883 re_char *p = POS_ADDR_VSTRING (startpos);
3884 re_char *pend = STOP_ADDR_VSTRING (startpos);
b18215fc
RS
3885 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
3886
3887 range -= len;
3888 if (range < 0)
3889 break;
3890 startpos += len;
3891 }
3892 else
3893 {
b560c397
RS
3894 range--;
3895 startpos++;
3896 }
e318085a 3897 }
fa9a63c5 3898 else
25fe55af
RS
3899 {
3900 range++;
3901 startpos--;
b18215fc
RS
3902
3903 /* Update STARTPOS to the previous character boundary. */
3904 if (multibyte)
3905 {
66f0296e 3906 re_char *p = POS_ADDR_VSTRING (startpos);
b18215fc
RS
3907 int len = 0;
3908
3909 /* Find the head of multibyte form. */
5d967c7a 3910 while (!CHAR_HEAD_P (*p))
b18215fc
RS
3911 p--, len++;
3912
3913 /* Adjust it. */
3914#if 0 /* XXX */
3915 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
3916 ;
3917 else
3918#endif
3919 {
3920 range += len;
3921 if (range > 0)
3922 break;
3923
3924 startpos -= len;
3925 }
3926 }
25fe55af 3927 }
fa9a63c5
RM
3928 }
3929 return -1;
3930} /* re_search_2 */
3931\f
3932/* Declarations and macros for re_match_2. */
3933
2d1675e4
SM
3934static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
3935 register int len,
3936 RE_TRANSLATE_TYPE translate,
3937 const int multibyte));
fa9a63c5
RM
3938
3939/* This converts PTR, a pointer into one of the search strings `string1'
3940 and `string2' into an offset from the beginning of that string. */
3941#define POINTER_TO_OFFSET(ptr) \
3942 (FIRST_STRING_P (ptr) \
3943 ? ((regoff_t) ((ptr) - string1)) \
3944 : ((regoff_t) ((ptr) - string2 + size1)))
3945
fa9a63c5 3946/* Call before fetching a character with *d. This switches over to
419d1c74
SM
3947 string2 if necessary.
3948 Check re_match_2_internal for a discussion of why end_match_2 might
3949 not be within string2 (but be equal to end_match_1 instead). */
fa9a63c5 3950#define PREFETCH() \
25fe55af 3951 while (d == dend) \
fa9a63c5
RM
3952 { \
3953 /* End of string2 => fail. */ \
25fe55af
RS
3954 if (dend == end_match_2) \
3955 goto fail; \
3956 /* End of string1 => advance to string2. */ \
3957 d = string2; \
fa9a63c5
RM
3958 dend = end_match_2; \
3959 }
3960
f1ad044f
SM
3961/* Call before fetching a char with *d if you already checked other limits.
3962 This is meant for use in lookahead operations like wordend, etc..
3963 where we might need to look at parts of the string that might be
3964 outside of the LIMITs (i.e past `stop'). */
3965#define PREFETCH_NOLIMIT() \
3966 if (d == end1) \
3967 { \
3968 d = string2; \
3969 dend = end_match_2; \
3970 } \
fa9a63c5
RM
3971
3972/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 3973 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 3974#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 3975#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
3976
3977
3978/* Test if D points to a character which is word-constituent. We have
3979 two special cases to check for: if past the end of string1, look at
3980 the first character in string2; and if before the beginning of
3981 string2, look at the last character in string1. */
3982#define WORDCHAR_P(d) \
3983 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 3984 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
3985 == Sword)
3986
9121ca40 3987/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
3988
3989/* The comment at case wordbound is following one, but we don't use
3990 AT_WORD_BOUNDARY anymore to support multibyte form.
3991
3992 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
3993 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
3994 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
3995 macro and introducing temporary variables works around the bug. */
3996
9121ca40 3997#if 0
fa9a63c5
RM
3998/* Test if the character before D and the one at D differ with respect
3999 to being word-constituent. */
4000#define AT_WORD_BOUNDARY(d) \
4001 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4002 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4003#endif
fa9a63c5
RM
4004
4005/* Free everything we malloc. */
4006#ifdef MATCH_MAY_ALLOCATE
00049484 4007#define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
fa9a63c5
RM
4008#define FREE_VARIABLES() \
4009 do { \
4010 REGEX_FREE_STACK (fail_stack.stack); \
4011 FREE_VAR (regstart); \
4012 FREE_VAR (regend); \
fa9a63c5
RM
4013 FREE_VAR (best_regstart); \
4014 FREE_VAR (best_regend); \
fa9a63c5
RM
4015 } while (0)
4016#else
4017#define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4018#endif /* not MATCH_MAY_ALLOCATE */
4019
505bde11
SM
4020\f
4021/* Optimization routines. */
4022
4e8a9132
SM
4023/* If the operation is a match against one or more chars,
4024 return a pointer to the next operation, else return NULL. */
4025static unsigned char *
4026skip_one_char (p)
4027 unsigned char *p;
4028{
4029 switch (SWITCH_ENUM_CAST (*p++))
4030 {
4031 case anychar:
4032 break;
4033
4034 case exactn:
4035 p += *p + 1;
4036 break;
4037
4038 case charset_not:
4039 case charset:
4040 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4041 {
4042 int mcnt;
4043 p = CHARSET_RANGE_TABLE (p - 1);
4044 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4045 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4046 }
4047 else
4048 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4049 break;
4050
4e8a9132
SM
4051 case syntaxspec:
4052 case notsyntaxspec:
1fb352e0 4053#ifdef emacs
4e8a9132
SM
4054 case categoryspec:
4055 case notcategoryspec:
4056#endif /* emacs */
4057 p++;
4058 break;
4059
4060 default:
4061 p = NULL;
4062 }
4063 return p;
4064}
4065
4066
505bde11
SM
4067/* Jump over non-matching operations. */
4068static unsigned char *
4e8a9132 4069skip_noops (p, pend)
505bde11 4070 unsigned char *p, *pend;
505bde11
SM
4071{
4072 int mcnt;
4073 while (p < pend)
4074 {
4075 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4076 {
4077 case start_memory:
505bde11
SM
4078 case stop_memory:
4079 p += 2; break;
4080 case no_op:
4081 p += 1; break;
4082 case jump:
4083 p += 1;
4084 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4085 p += mcnt;
4086 break;
4087 default:
4088 return p;
4089 }
4090 }
4091 assert (p == pend);
4092 return p;
4093}
4094
4095/* Non-zero if "p1 matches something" implies "p2 fails". */
4096static int
4097mutually_exclusive_p (bufp, p1, p2)
4098 struct re_pattern_buffer *bufp;
4099 unsigned char *p1, *p2;
4100{
4e8a9132 4101 re_opcode_t op2;
2d1675e4 4102 const boolean multibyte = RE_MULTIBYTE_P (bufp);
505bde11
SM
4103 unsigned char *pend = bufp->buffer + bufp->used;
4104
4e8a9132 4105 assert (p1 >= bufp->buffer && p1 < pend
505bde11
SM
4106 && p2 >= bufp->buffer && p2 <= pend);
4107
4108 /* Skip over open/close-group commands.
4109 If what follows this loop is a ...+ construct,
4110 look at what begins its body, since we will have to
4111 match at least one of that. */
4e8a9132
SM
4112 p2 = skip_noops (p2, pend);
4113 /* The same skip can be done for p1, except that this function
4114 is only used in the case where p1 is a simple match operator. */
4115 /* p1 = skip_noops (p1, pend); */
4116
4117 assert (p1 >= bufp->buffer && p1 < pend
4118 && p2 >= bufp->buffer && p2 <= pend);
4119
4120 op2 = p2 == pend ? succeed : *p2;
4121
4122 switch (SWITCH_ENUM_CAST (op2))
505bde11 4123 {
4e8a9132
SM
4124 case succeed:
4125 case endbuf:
4126 /* If we're at the end of the pattern, we can change. */
4127 if (skip_one_char (p1))
505bde11 4128 {
505bde11
SM
4129 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4130 return 1;
505bde11 4131 }
4e8a9132
SM
4132 break;
4133
4134 case endline:
4135 if (!bufp->newline_anchor)
4136 break;
4137 /* Fallthrough */
4138 case exactn:
4139 {
4140 register unsigned int c
4141 = (re_opcode_t) *p2 == endline ? '\n'
4142 : RE_STRING_CHAR(p2 + 2, pend - p2 - 2);
505bde11 4143
4e8a9132
SM
4144 if ((re_opcode_t) *p1 == exactn)
4145 {
4146 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4147 {
4148 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4149 return 1;
4150 }
4151 }
505bde11 4152
4e8a9132
SM
4153 else if ((re_opcode_t) *p1 == charset
4154 || (re_opcode_t) *p1 == charset_not)
4155 {
4156 int not = (re_opcode_t) *p1 == charset_not;
505bde11 4157
4e8a9132
SM
4158 /* Test if C is listed in charset (or charset_not)
4159 at `p1'. */
4160 if (SINGLE_BYTE_CHAR_P (c))
4161 {
4162 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4163 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4164 not = !not;
4165 }
4166 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4167 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
505bde11 4168
4e8a9132
SM
4169 /* `not' is equal to 1 if c would match, which means
4170 that we can't change to pop_failure_jump. */
4171 if (!not)
4172 {
4173 DEBUG_PRINT1 (" No match => fast loop.\n");
4174 return 1;
4175 }
4176 }
4177 else if ((re_opcode_t) *p1 == anychar
4178 && c == '\n')
4179 {
4180 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4181 return 1;
4182 }
4183 }
4184 break;
505bde11 4185
4e8a9132
SM
4186 case charset:
4187 case charset_not:
4188 {
4189 if ((re_opcode_t) *p1 == exactn)
4190 /* Reuse the code above. */
4191 return mutually_exclusive_p (bufp, p2, p1);
505bde11
SM
4192
4193
4194 /* It is hard to list up all the character in charset
4195 P2 if it includes multibyte character. Give up in
4196 such case. */
4197 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4198 {
4199 /* Now, we are sure that P2 has no range table.
4200 So, for the size of bitmap in P2, `p2[1]' is
4201 enough. But P1 may have range table, so the
4202 size of bitmap table of P1 is extracted by
4203 using macro `CHARSET_BITMAP_SIZE'.
4204
4205 Since we know that all the character listed in
4206 P2 is ASCII, it is enough to test only bitmap
4207 table of P1. */
4208
4209 if (*p1 == *p2)
4210 {
4211 int idx;
4212 /* We win if the charset inside the loop
4213 has no overlap with the one after the loop. */
4214 for (idx = 0;
4215 (idx < (int) p2[1]
4216 && idx < CHARSET_BITMAP_SIZE (p1));
4217 idx++)
4218 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4219 break;
4220
4221 if (idx == p2[1]
4222 || idx == CHARSET_BITMAP_SIZE (p1))
4223 {
4224 DEBUG_PRINT1 (" No match => fast loop.\n");
4225 return 1;
4226 }
4227 }
4228 else if ((re_opcode_t) *p1 == charset
4229 || (re_opcode_t) *p1 == charset_not)
4230 {
4231 int idx;
4232 /* We win if the charset_not inside the loop lists
4233 every character listed in the charset after. */
4234 for (idx = 0; idx < (int) p2[1]; idx++)
4235 if (! (p2[2 + idx] == 0
4236 || (idx < CHARSET_BITMAP_SIZE (p1)
4237 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4238 break;
4239
4e8a9132
SM
4240 if (idx == p2[1])
4241 {
4242 DEBUG_PRINT1 (" No match => fast loop.\n");
4243 return 1;
4244 }
4245 }
4246 }
4247 }
4248
4e8a9132
SM
4249 case wordend:
4250 case notsyntaxspec:
4251 return ((re_opcode_t) *p1 == syntaxspec
4252 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4253
4254 case wordbeg:
4255 case syntaxspec:
4256 return ((re_opcode_t) *p1 == notsyntaxspec
4257 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4258
4259 case wordbound:
4260 return (((re_opcode_t) *p1 == notsyntaxspec
4261 || (re_opcode_t) *p1 == syntaxspec)
4262 && p1[1] == Sword);
4263
1fb352e0 4264#ifdef emacs
4e8a9132
SM
4265 case categoryspec:
4266 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4267 case notcategoryspec:
4268 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4269#endif /* emacs */
4270
4271 default:
4272 ;
505bde11
SM
4273 }
4274
4275 /* Safe default. */
4276 return 0;
4277}
4278
fa9a63c5
RM
4279\f
4280/* Matching routines. */
4281
25fe55af 4282#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4283/* re_match is like re_match_2 except it takes only a single string. */
4284
4285int
4286re_match (bufp, string, size, pos, regs)
4287 struct re_pattern_buffer *bufp;
4288 const char *string;
4289 int size, pos;
4290 struct re_registers *regs;
4291{
4292 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4293 pos, regs, size);
4294 alloca (0);
4295 return result;
4296}
4297#endif /* not emacs */
4298
b18215fc
RS
4299#ifdef emacs
4300/* In Emacs, this is the string or buffer in which we
25fe55af 4301 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4302Lisp_Object re_match_object;
4303#endif
fa9a63c5
RM
4304
4305/* re_match_2 matches the compiled pattern in BUFP against the
4306 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4307 and SIZE2, respectively). We start matching at POS, and stop
4308 matching at STOP.
5e69f11e 4309
fa9a63c5 4310 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4311 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4312 documentation for exactly how many groups we fill.
4313
4314 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4315 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4316 matched substring. */
4317
4318int
4319re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4320 struct re_pattern_buffer *bufp;
4321 const char *string1, *string2;
4322 int size1, size2;
4323 int pos;
4324 struct re_registers *regs;
4325 int stop;
4326{
b18215fc 4327 int result;
25fe55af 4328
b18215fc 4329#ifdef emacs
cc9b4df2
KH
4330 int charpos;
4331 gl_state.object = re_match_object;
99633e97 4332 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
cc9b4df2 4333 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4334#endif
4335
4336 result = re_match_2_internal (bufp, string1, size1, string2, size2,
cc9b4df2 4337 pos, regs, stop);
fa9a63c5
RM
4338 alloca (0);
4339 return result;
4340}
4341
4342/* This is a separate function so that we can force an alloca cleanup
25fe55af 4343 afterwards. */
fa9a63c5
RM
4344static int
4345re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4346 struct re_pattern_buffer *bufp;
66f0296e 4347 re_char *string1, *string2;
fa9a63c5
RM
4348 int size1, size2;
4349 int pos;
4350 struct re_registers *regs;
4351 int stop;
4352{
4353 /* General temporaries. */
4354 int mcnt;
66f0296e 4355 boolean not;
fa9a63c5
RM
4356 unsigned char *p1;
4357
4358 /* Just past the end of the corresponding string. */
66f0296e 4359 re_char *end1, *end2;
fa9a63c5
RM
4360
4361 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4362 each to consider matching. */
66f0296e 4363 re_char *end_match_1, *end_match_2;
fa9a63c5
RM
4364
4365 /* Where we are in the data, and the end of the current string. */
66f0296e 4366 re_char *d, *dend;
5e69f11e 4367
99633e97
SM
4368 /* Used sometimes to remember where we were before starting matching
4369 an operator so that we can go back in case of failure. This "atomic"
4370 behavior of matching opcodes is indispensable to the correctness
4371 of the on_failure_keep_string_jump optimization. */
4372 re_char *dfail;
4373
fa9a63c5
RM
4374 /* Where we are in the pattern, and the end of the pattern. */
4375 unsigned char *p = bufp->buffer;
4376 register unsigned char *pend = p + bufp->used;
4377
25fe55af 4378 /* We use this to map every character in the string. */
6676cb1c 4379 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4380
25fe55af 4381 /* Nonzero if we have to concern multibyte character. */
2d1675e4 4382 const boolean multibyte = RE_MULTIBYTE_P (bufp);
b18215fc 4383
fa9a63c5
RM
4384 /* Failure point stack. Each place that can handle a failure further
4385 down the line pushes a failure point on this stack. It consists of
505bde11 4386 regstart, and regend for all registers corresponding to
fa9a63c5
RM
4387 the subexpressions we're currently inside, plus the number of such
4388 registers, and, finally, two char *'s. The first char * is where
4389 to resume scanning the pattern; the second one is where to resume
505bde11 4390 scanning the strings. */
25fe55af 4391#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4392 fail_stack_type fail_stack;
4393#endif
4394#ifdef DEBUG
4395 static unsigned failure_id = 0;
4396 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4397#endif
4398
99633e97 4399#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
fa9a63c5
RM
4400 /* This holds the pointer to the failure stack, when
4401 it is allocated relocatably. */
4402 fail_stack_elt_t *failure_stack_ptr;
99633e97 4403#endif
fa9a63c5
RM
4404
4405 /* We fill all the registers internally, independent of what we
25fe55af 4406 return, for use in backreferences. The number here includes
fa9a63c5
RM
4407 an element for register zero. */
4408 unsigned num_regs = bufp->re_nsub + 1;
5e69f11e 4409
fa9a63c5
RM
4410 /* Information on the contents of registers. These are pointers into
4411 the input strings; they record just what was matched (on this
4412 attempt) by a subexpression part of the pattern, that is, the
4413 regnum-th regstart pointer points to where in the pattern we began
4414 matching and the regnum-th regend points to right after where we
4415 stopped matching the regnum-th subexpression. (The zeroth register
4416 keeps track of what the whole pattern matches.) */
4417#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4418 re_char **regstart, **regend;
fa9a63c5
RM
4419#endif
4420
fa9a63c5 4421 /* The following record the register info as found in the above
5e69f11e 4422 variables when we find a match better than any we've seen before.
fa9a63c5
RM
4423 This happens as we backtrack through the failure points, which in
4424 turn happens only if we have not yet matched the entire string. */
4425 unsigned best_regs_set = false;
4426#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
66f0296e 4427 re_char **best_regstart, **best_regend;
fa9a63c5 4428#endif
5e69f11e 4429
fa9a63c5
RM
4430 /* Logically, this is `best_regend[0]'. But we don't want to have to
4431 allocate space for that if we're not allocating space for anything
25fe55af 4432 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
4433 any of the other register vectors, and it seems rather a kludge to
4434 treat `best_regend' differently than the rest. So we keep track of
4435 the end of the best match so far in a separate variable. We
4436 initialize this to NULL so that when we backtrack the first time
4437 and need to test it, it's not garbage. */
66f0296e 4438 re_char *match_end = NULL;
fa9a63c5 4439
fa9a63c5
RM
4440#ifdef DEBUG
4441 /* Counts the total number of registers pushed. */
5e69f11e 4442 unsigned num_regs_pushed = 0;
fa9a63c5
RM
4443#endif
4444
4445 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 4446
fa9a63c5 4447 INIT_FAIL_STACK ();
5e69f11e 4448
fa9a63c5
RM
4449#ifdef MATCH_MAY_ALLOCATE
4450 /* Do not bother to initialize all the register variables if there are
4451 no groups in the pattern, as it takes a fair amount of time. If
4452 there are groups, we include space for register 0 (the whole
4453 pattern), even though we never use it, since it simplifies the
4454 array indexing. We should fix this. */
4455 if (bufp->re_nsub)
4456 {
66f0296e
SM
4457 regstart = REGEX_TALLOC (num_regs, re_char *);
4458 regend = REGEX_TALLOC (num_regs, re_char *);
4459 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4460 best_regend = REGEX_TALLOC (num_regs, re_char *);
fa9a63c5 4461
505bde11 4462 if (!(regstart && regend && best_regstart && best_regend))
25fe55af
RS
4463 {
4464 FREE_VARIABLES ();
4465 return -2;
4466 }
fa9a63c5
RM
4467 }
4468 else
4469 {
4470 /* We must initialize all our variables to NULL, so that
25fe55af 4471 `FREE_VARIABLES' doesn't try to free them. */
505bde11 4472 regstart = regend = best_regstart = best_regend = NULL;
fa9a63c5
RM
4473 }
4474#endif /* MATCH_MAY_ALLOCATE */
4475
4476 /* The starting position is bogus. */
4477 if (pos < 0 || pos > size1 + size2)
4478 {
4479 FREE_VARIABLES ();
4480 return -1;
4481 }
5e69f11e 4482
fa9a63c5
RM
4483 /* Initialize subexpression text positions to -1 to mark ones that no
4484 start_memory/stop_memory has been seen for. Also initialize the
4485 register information struct. */
4486 for (mcnt = 1; mcnt < num_regs; mcnt++)
99633e97
SM
4487 regstart[mcnt] = regend[mcnt] = REG_UNSET_VALUE;
4488
fa9a63c5 4489 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 4490 `string1' is null. */
fa9a63c5
RM
4491 if (size2 == 0 && string1 != NULL)
4492 {
4493 string2 = string1;
4494 size2 = size1;
4495 string1 = 0;
4496 size1 = 0;
4497 }
4498 end1 = string1 + size1;
4499 end2 = string2 + size2;
4500
5e69f11e 4501 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
4502 `dend' is the end of the input string that `d' points within. `d'
4503 is advanced into the following input string whenever necessary, but
4504 this happens before fetching; therefore, at the beginning of the
4505 loop, `d' can be pointing at the end of a string, but it cannot
4506 equal `string2'. */
419d1c74 4507 if (pos >= size1)
fa9a63c5 4508 {
419d1c74
SM
4509 /* Only match within string2. */
4510 d = string2 + pos - size1;
4511 dend = end_match_2 = string2 + stop - size1;
4512 end_match_1 = end1; /* Just to give it a value. */
fa9a63c5
RM
4513 }
4514 else
4515 {
f1ad044f 4516 if (stop < size1)
419d1c74
SM
4517 {
4518 /* Only match within string1. */
4519 end_match_1 = string1 + stop;
4520 /* BEWARE!
4521 When we reach end_match_1, PREFETCH normally switches to string2.
4522 But in the present case, this means that just doing a PREFETCH
4523 makes us jump from `stop' to `gap' within the string.
4524 What we really want here is for the search to stop as
4525 soon as we hit end_match_1. That's why we set end_match_2
4526 to end_match_1 (since PREFETCH fails as soon as we hit
4527 end_match_2). */
4528 end_match_2 = end_match_1;
4529 }
4530 else
f1ad044f
SM
4531 { /* It's important to use this code when stop == size so that
4532 moving `d' from end1 to string2 will not prevent the d == dend
4533 check from catching the end of string. */
419d1c74
SM
4534 end_match_1 = end1;
4535 end_match_2 = string2 + stop - size1;
4536 }
4537 d = string1 + pos;
4538 dend = end_match_1;
fa9a63c5
RM
4539 }
4540
4541 DEBUG_PRINT1 ("The compiled pattern is: ");
4542 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4543 DEBUG_PRINT1 ("The string to match is: `");
4544 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4545 DEBUG_PRINT1 ("'\n");
5e69f11e 4546
25fe55af 4547 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
4548 function if the match is complete, or it drops through if the match
4549 fails at this starting point in the input data. */
4550 for (;;)
4551 {
505bde11 4552 DEBUG_PRINT2 ("\n%p: ", p);
fa9a63c5
RM
4553
4554 if (p == pend)
4555 { /* End of pattern means we might have succeeded. */
25fe55af 4556 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 4557
fa9a63c5 4558 /* If we haven't matched the entire string, and we want the
25fe55af
RS
4559 longest match, try backtracking. */
4560 if (d != end_match_2)
fa9a63c5
RM
4561 {
4562 /* 1 if this match ends in the same string (string1 or string2)
4563 as the best previous match. */
5e69f11e 4564 boolean same_str_p = (FIRST_STRING_P (match_end)
99633e97 4565 == FIRST_STRING_P (d));
fa9a63c5
RM
4566 /* 1 if this match is the best seen so far. */
4567 boolean best_match_p;
4568
4569 /* AIX compiler got confused when this was combined
25fe55af 4570 with the previous declaration. */
fa9a63c5
RM
4571 if (same_str_p)
4572 best_match_p = d > match_end;
4573 else
99633e97 4574 best_match_p = !FIRST_STRING_P (d);
fa9a63c5 4575
25fe55af
RS
4576 DEBUG_PRINT1 ("backtracking.\n");
4577
4578 if (!FAIL_STACK_EMPTY ())
4579 { /* More failure points to try. */
4580
4581 /* If exceeds best match so far, save it. */
4582 if (!best_regs_set || best_match_p)
4583 {
4584 best_regs_set = true;
4585 match_end = d;
4586
4587 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4588
4589 for (mcnt = 1; mcnt < num_regs; mcnt++)
4590 {
4591 best_regstart[mcnt] = regstart[mcnt];
4592 best_regend[mcnt] = regend[mcnt];
4593 }
4594 }
4595 goto fail;
4596 }
4597
4598 /* If no failure points, don't restore garbage. And if
4599 last match is real best match, don't restore second
4600 best one. */
4601 else if (best_regs_set && !best_match_p)
4602 {
4603 restore_best_regs:
4604 /* Restore best match. It may happen that `dend ==
4605 end_match_1' while the restored d is in string2.
4606 For example, the pattern `x.*y.*z' against the
4607 strings `x-' and `y-z-', if the two strings are
4608 not consecutive in memory. */
4609 DEBUG_PRINT1 ("Restoring best registers.\n");
4610
4611 d = match_end;
4612 dend = ((d >= string1 && d <= end1)
4613 ? end_match_1 : end_match_2);
fa9a63c5
RM
4614
4615 for (mcnt = 1; mcnt < num_regs; mcnt++)
4616 {
4617 regstart[mcnt] = best_regstart[mcnt];
4618 regend[mcnt] = best_regend[mcnt];
4619 }
25fe55af
RS
4620 }
4621 } /* d != end_match_2 */
fa9a63c5
RM
4622
4623 succeed_label:
25fe55af 4624 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 4625
25fe55af
RS
4626 /* If caller wants register contents data back, do it. */
4627 if (regs && !bufp->no_sub)
fa9a63c5 4628 {
25fe55af
RS
4629 /* Have the register data arrays been allocated? */
4630 if (bufp->regs_allocated == REGS_UNALLOCATED)
4631 { /* No. So allocate them with malloc. We need one
4632 extra element beyond `num_regs' for the `-1' marker
4633 GNU code uses. */
4634 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4635 regs->start = TALLOC (regs->num_regs, regoff_t);
4636 regs->end = TALLOC (regs->num_regs, regoff_t);
4637 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
4638 {
4639 FREE_VARIABLES ();
4640 return -2;
4641 }
25fe55af
RS
4642 bufp->regs_allocated = REGS_REALLOCATE;
4643 }
4644 else if (bufp->regs_allocated == REGS_REALLOCATE)
4645 { /* Yes. If we need more elements than were already
4646 allocated, reallocate them. If we need fewer, just
4647 leave it alone. */
4648 if (regs->num_regs < num_regs + 1)
4649 {
4650 regs->num_regs = num_regs + 1;
4651 RETALLOC (regs->start, regs->num_regs, regoff_t);
4652 RETALLOC (regs->end, regs->num_regs, regoff_t);
4653 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
4654 {
4655 FREE_VARIABLES ();
4656 return -2;
4657 }
25fe55af
RS
4658 }
4659 }
4660 else
fa9a63c5
RM
4661 {
4662 /* These braces fend off a "empty body in an else-statement"
25fe55af 4663 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
4664 assert (bufp->regs_allocated == REGS_FIXED);
4665 }
4666
25fe55af
RS
4667 /* Convert the pointer data in `regstart' and `regend' to
4668 indices. Register zero has to be set differently,
4669 since we haven't kept track of any info for it. */
4670 if (regs->num_regs > 0)
4671 {
4672 regs->start[0] = pos;
99633e97 4673 regs->end[0] = POINTER_TO_OFFSET (d);
25fe55af 4674 }
5e69f11e 4675
25fe55af
RS
4676 /* Go through the first `min (num_regs, regs->num_regs)'
4677 registers, since that is all we initialized. */
fa9a63c5
RM
4678 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4679 {
25fe55af
RS
4680 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4681 regs->start[mcnt] = regs->end[mcnt] = -1;
4682 else
4683 {
fa9a63c5
RM
4684 regs->start[mcnt]
4685 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
25fe55af 4686 regs->end[mcnt]
fa9a63c5 4687 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
25fe55af 4688 }
fa9a63c5 4689 }
5e69f11e 4690
25fe55af
RS
4691 /* If the regs structure we return has more elements than
4692 were in the pattern, set the extra elements to -1. If
4693 we (re)allocated the registers, this is the case,
4694 because we always allocate enough to have at least one
4695 -1 at the end. */
4696 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4697 regs->start[mcnt] = regs->end[mcnt] = -1;
fa9a63c5
RM
4698 } /* regs && !bufp->no_sub */
4699
25fe55af
RS
4700 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4701 nfailure_points_pushed, nfailure_points_popped,
4702 nfailure_points_pushed - nfailure_points_popped);
4703 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 4704
99633e97 4705 mcnt = POINTER_TO_OFFSET (d) - pos;
fa9a63c5 4706
25fe55af 4707 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 4708
25fe55af
RS
4709 FREE_VARIABLES ();
4710 return mcnt;
4711 }
fa9a63c5 4712
25fe55af 4713 /* Otherwise match next pattern command. */
fa9a63c5
RM
4714 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4715 {
25fe55af
RS
4716 /* Ignore these. Used to ignore the n of succeed_n's which
4717 currently have n == 0. */
4718 case no_op:
4719 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4720 break;
fa9a63c5
RM
4721
4722 case succeed:
25fe55af 4723 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
4724 goto succeed_label;
4725
25fe55af
RS
4726 /* Match the next n pattern characters exactly. The following
4727 byte in the pattern defines n, and the n bytes after that
4728 are the characters to match. */
fa9a63c5
RM
4729 case exactn:
4730 mcnt = *p++;
25fe55af 4731 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 4732
99633e97
SM
4733 /* Remember the start point to rollback upon failure. */
4734 dfail = d;
4735
25fe55af
RS
4736 /* This is written out as an if-else so we don't waste time
4737 testing `translate' inside the loop. */
28703c16 4738 if (RE_TRANSLATE_P (translate))
fa9a63c5 4739 {
e934739e
RS
4740 if (multibyte)
4741 do
4742 {
4743 int pat_charlen, buf_charlen;
e71b1971 4744 unsigned int pat_ch, buf_ch;
e934739e
RS
4745
4746 PREFETCH ();
4747 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4748 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4749
4750 if (RE_TRANSLATE (translate, buf_ch)
4751 != pat_ch)
99633e97
SM
4752 {
4753 d = dfail;
4754 goto fail;
4755 }
e934739e
RS
4756
4757 p += pat_charlen;
4758 d += buf_charlen;
4759 mcnt -= pat_charlen;
4760 }
4761 while (mcnt > 0);
4762 else
e934739e
RS
4763 do
4764 {
4765 PREFETCH ();
66f0296e 4766 if (RE_TRANSLATE (translate, *d) != *p++)
99633e97
SM
4767 {
4768 d = dfail;
4769 goto fail;
4770 }
33c46939 4771 d++;
e934739e
RS
4772 }
4773 while (--mcnt);
fa9a63c5
RM
4774 }
4775 else
4776 {
4777 do
4778 {
4779 PREFETCH ();
99633e97
SM
4780 if (*d++ != *p++)
4781 {
4782 d = dfail;
4783 goto fail;
4784 }
fa9a63c5
RM
4785 }
4786 while (--mcnt);
4787 }
25fe55af 4788 break;
fa9a63c5
RM
4789
4790
25fe55af 4791 /* Match any character except possibly a newline or a null. */
fa9a63c5 4792 case anychar:
e934739e
RS
4793 {
4794 int buf_charlen;
e71b1971 4795 unsigned int buf_ch;
fa9a63c5 4796
e934739e 4797 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 4798
e934739e 4799 PREFETCH ();
2d1675e4 4800 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
e934739e
RS
4801 buf_ch = TRANSLATE (buf_ch);
4802
4803 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4804 && buf_ch == '\n')
4805 || ((bufp->syntax & RE_DOT_NOT_NULL)
4806 && buf_ch == '\000'))
4807 goto fail;
4808
e934739e
RS
4809 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4810 d += buf_charlen;
4811 }
fa9a63c5
RM
4812 break;
4813
4814
4815 case charset:
4816 case charset_not:
4817 {
b18215fc 4818 register unsigned int c;
fa9a63c5 4819 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4820 int len;
4821
4822 /* Start of actual range_table, or end of bitmap if there is no
4823 range table. */
4824 unsigned char *range_table;
4825
96cc36cc 4826 /* Nonzero if there is a range table. */
b18215fc
RS
4827 int range_table_exists;
4828
96cc36cc
RS
4829 /* Number of ranges of range table. This is not included
4830 in the initial byte-length of the command. */
4831 int count = 0;
fa9a63c5 4832
25fe55af 4833 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5 4834
b18215fc 4835 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc 4836
b18215fc 4837 if (range_table_exists)
96cc36cc
RS
4838 {
4839 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4840 EXTRACT_NUMBER_AND_INCR (count, range_table);
4841 }
b18215fc 4842
2d1675e4
SM
4843 PREFETCH ();
4844 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
4845 c = TRANSLATE (c); /* The character to match. */
b18215fc
RS
4846
4847 if (SINGLE_BYTE_CHAR_P (c))
4848 { /* Lookup bitmap. */
b18215fc
RS
4849 /* Cast to `unsigned' instead of `unsigned char' in
4850 case the bit list is a full 32 bytes long. */
4851 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
4852 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4853 not = !not;
b18215fc 4854 }
96cc36cc 4855#ifdef emacs
b18215fc 4856 else if (range_table_exists)
96cc36cc
RS
4857 {
4858 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
4859
4860 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4861 | (class_bits & BIT_ALPHA && ISALPHA (c))
f71b19b6 4862 | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
96cc36cc
RS
4863 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4864 | (class_bits & BIT_LOWER && ISLOWER (c))
f71b19b6
DL
4865 | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
4866 | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
96cc36cc
RS
4867 | (class_bits & BIT_PRINT && ISPRINT (c))
4868 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4869 | (class_bits & BIT_SPACE && ISSPACE (c))
f71b19b6 4870 | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
96cc36cc
RS
4871 | (class_bits & BIT_UPPER && ISUPPER (c))
4872 | (class_bits & BIT_WORD && ISWORD (c)))
4873 not = !not;
4874 else
4875 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4876 }
4877#endif /* emacs */
fa9a63c5 4878
96cc36cc
RS
4879 if (range_table_exists)
4880 p = CHARSET_RANGE_TABLE_END (range_table, count);
4881 else
4882 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
4883
4884 if (!not) goto fail;
5e69f11e 4885
b18215fc 4886 d += len;
fa9a63c5
RM
4887 break;
4888 }
4889
4890
25fe55af 4891 /* The beginning of a group is represented by start_memory.
505bde11 4892 The argument is the register number. The text
25fe55af
RS
4893 matched within the group is recorded (in the internal
4894 registers data structure) under the register number. */
4895 case start_memory:
505bde11
SM
4896 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
4897
4898 /* In case we need to undo this operation (via backtracking). */
4899 PUSH_FAILURE_REG ((unsigned int)*p);
fa9a63c5 4900
25fe55af 4901 regstart[*p] = d;
505bde11 4902 regend[*p] = REG_UNSET_VALUE; /* probably unnecessary. -sm */
fa9a63c5
RM
4903 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4904
25fe55af 4905 /* Move past the register number and inner group count. */
505bde11 4906 p += 1;
25fe55af 4907 break;
fa9a63c5
RM
4908
4909
25fe55af 4910 /* The stop_memory opcode represents the end of a group. Its
505bde11 4911 argument is the same as start_memory's: the register number. */
fa9a63c5 4912 case stop_memory:
505bde11
SM
4913 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
4914
4915 assert (!REG_UNSET (regstart[*p]));
4916 /* Strictly speaking, there should be code such as:
4917
4918 assert (REG_UNSET (regend[*p]));
4919 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
4920
4921 But the only info to be pushed is regend[*p] and it is known to
4922 be UNSET, so there really isn't anything to push.
4923 Not pushing anything, on the other hand deprives us from the
4924 guarantee that regend[*p] is UNSET since undoing this operation
4925 will not reset its value properly. This is not important since
4926 the value will only be read on the next start_memory or at
4927 the very end and both events can only happen if this stop_memory
4928 is *not* undone. */
fa9a63c5 4929
25fe55af 4930 regend[*p] = d;
fa9a63c5
RM
4931 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4932
25fe55af 4933 /* Move past the register number and the inner group count. */
505bde11 4934 p += 1;
25fe55af 4935 break;
fa9a63c5
RM
4936
4937
4938 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
4939 followed by the numeric value of <digit> as the register number. */
4940 case duplicate:
fa9a63c5 4941 {
66f0296e 4942 register re_char *d2, *dend2;
25fe55af 4943 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
4944 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4945
25fe55af
RS
4946 /* Can't back reference a group which we've never matched. */
4947 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4948 goto fail;
5e69f11e 4949
25fe55af
RS
4950 /* Where in input to try to start matching. */
4951 d2 = regstart[regno];
5e69f11e 4952
99633e97
SM
4953 /* Remember the start point to rollback upon failure. */
4954 dfail = d;
4955
25fe55af
RS
4956 /* Where to stop matching; if both the place to start and
4957 the place to stop matching are in the same string, then
4958 set to the place to stop, otherwise, for now have to use
4959 the end of the first string. */
fa9a63c5 4960
25fe55af 4961 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
4962 == FIRST_STRING_P (regend[regno]))
4963 ? regend[regno] : end_match_1);
4964 for (;;)
4965 {
4966 /* If necessary, advance to next segment in register
25fe55af 4967 contents. */
fa9a63c5
RM
4968 while (d2 == dend2)
4969 {
4970 if (dend2 == end_match_2) break;
4971 if (dend2 == regend[regno]) break;
4972
25fe55af
RS
4973 /* End of string1 => advance to string2. */
4974 d2 = string2;
4975 dend2 = regend[regno];
fa9a63c5
RM
4976 }
4977 /* At end of register contents => success */
4978 if (d2 == dend2) break;
4979
4980 /* If necessary, advance to next segment in data. */
4981 PREFETCH ();
4982
4983 /* How many characters left in this segment to match. */
4984 mcnt = dend - d;
5e69f11e 4985
fa9a63c5 4986 /* Want how many consecutive characters we can match in
25fe55af
RS
4987 one shot, so, if necessary, adjust the count. */
4988 if (mcnt > dend2 - d2)
fa9a63c5 4989 mcnt = dend2 - d2;
5e69f11e 4990
fa9a63c5 4991 /* Compare that many; failure if mismatch, else move
25fe55af 4992 past them. */
28703c16 4993 if (RE_TRANSLATE_P (translate)
2d1675e4 4994 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
25fe55af 4995 : bcmp (d, d2, mcnt))
99633e97
SM
4996 {
4997 d = dfail;
4998 goto fail;
4999 }
fa9a63c5 5000 d += mcnt, d2 += mcnt;
fa9a63c5
RM
5001 }
5002 }
5003 break;
5004
5005
25fe55af
RS
5006 /* begline matches the empty string at the beginning of the string
5007 (unless `not_bol' is set in `bufp'), and, if
5008 `newline_anchor' is set, after newlines. */
fa9a63c5 5009 case begline:
25fe55af 5010 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5011
25fe55af
RS
5012 if (AT_STRINGS_BEG (d))
5013 {
5014 if (!bufp->not_bol) break;
5015 }
419d1c74 5016 else
25fe55af 5017 {
419d1c74
SM
5018 unsigned char c;
5019 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5020 if (c == '\n' && bufp->newline_anchor)
5021 break;
25fe55af
RS
5022 }
5023 /* In all other cases, we fail. */
5024 goto fail;
fa9a63c5
RM
5025
5026
25fe55af 5027 /* endline is the dual of begline. */
fa9a63c5 5028 case endline:
25fe55af 5029 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5030
25fe55af
RS
5031 if (AT_STRINGS_END (d))
5032 {
5033 if (!bufp->not_eol) break;
5034 }
f1ad044f 5035 else
25fe55af 5036 {
f1ad044f
SM
5037 PREFETCH_NOLIMIT ();
5038 if (*d == '\n' && bufp->newline_anchor)
5039 break;
25fe55af
RS
5040 }
5041 goto fail;
fa9a63c5
RM
5042
5043
5044 /* Match at the very beginning of the data. */
25fe55af
RS
5045 case begbuf:
5046 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5047 if (AT_STRINGS_BEG (d))
5048 break;
5049 goto fail;
fa9a63c5
RM
5050
5051
5052 /* Match at the very end of the data. */
25fe55af
RS
5053 case endbuf:
5054 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5055 if (AT_STRINGS_END (d))
5056 break;
25fe55af 5057 goto fail;
5e69f11e 5058
5e69f11e 5059
25fe55af
RS
5060 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5061 pushes NULL as the value for the string on the stack. Then
505bde11 5062 `POP_FAILURE_POINT' will keep the current value for the
25fe55af
RS
5063 string, instead of restoring it. To see why, consider
5064 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5065 then the . fails against the \n. But the next thing we want
5066 to do is match the \n against the \n; if we restored the
5067 string value, we would be back at the foo.
5068
5069 Because this is used only in specific cases, we don't need to
5070 check all the things that `on_failure_jump' does, to make
5071 sure the right things get saved on the stack. Hence we don't
5072 share its code. The only reason to push anything on the
5073 stack at all is that otherwise we would have to change
5074 `anychar's code to do something besides goto fail in this
5075 case; that seems worse than this. */
5076 case on_failure_keep_string_jump:
505bde11
SM
5077 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5078 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5079 mcnt, p + mcnt);
fa9a63c5 5080
505bde11
SM
5081 PUSH_FAILURE_POINT (p - 3, NULL);
5082 break;
5083
0683b6fa
SM
5084 /* A nasty loop is introduced by the non-greedy *? and +?.
5085 With such loops, the stack only ever contains one failure point
5086 at a time, so that a plain on_failure_jump_loop kind of
5087 cycle detection cannot work. Worse yet, such a detection
5088 can not only fail to detect a cycle, but it can also wrongly
5089 detect a cycle (between different instantiations of the same
5090 loop.
5091 So the method used for those nasty loops is a little different:
5092 We use a special cycle-detection-stack-frame which is pushed
5093 when the on_failure_jump_nastyloop failure-point is *popped*.
5094 This special frame thus marks the beginning of one iteration
5095 through the loop and we can hence easily check right here
5096 whether something matched between the beginning and the end of
5097 the loop. */
5098 case on_failure_jump_nastyloop:
5099 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5100 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5101 mcnt, p + mcnt);
5102
5103 assert ((re_opcode_t)p[-4] == no_op);
5104 CHECK_INFINITE_LOOP (p - 4, d);
5105 PUSH_FAILURE_POINT (p - 3, d);
5106 break;
5107
505bde11 5108
4e8a9132
SM
5109 /* Simple loop detecting on_failure_jump: just check on the
5110 failure stack if the same spot was already hit earlier. */
505bde11
SM
5111 case on_failure_jump_loop:
5112 on_failure:
5113 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5114 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5115 mcnt, p + mcnt);
5116
5117 CHECK_INFINITE_LOOP (p - 3, d);
5118 PUSH_FAILURE_POINT (p - 3, d);
25fe55af 5119 break;
fa9a63c5
RM
5120
5121
5122 /* Uses of on_failure_jump:
5e69f11e 5123
25fe55af
RS
5124 Each alternative starts with an on_failure_jump that points
5125 to the beginning of the next alternative. Each alternative
5126 except the last ends with a jump that in effect jumps past
5127 the rest of the alternatives. (They really jump to the
5128 ending jump of the following alternative, because tensioning
5129 these jumps is a hassle.)
fa9a63c5 5130
25fe55af
RS
5131 Repeats start with an on_failure_jump that points past both
5132 the repetition text and either the following jump or
5133 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5134 case on_failure_jump:
27c3b45d 5135 QUIT;
25fe55af 5136 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5137 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5138 mcnt, p + mcnt);
25fe55af 5139
505bde11 5140 PUSH_FAILURE_POINT (p -3, d);
25fe55af
RS
5141 break;
5142
4e8a9132 5143 /* This operation is used for greedy *.
505bde11
SM
5144 Compare the beginning of the repeat with what in the
5145 pattern follows its end. If we can establish that there
5146 is nothing that they would both match, i.e., that we
5147 would have to backtrack because of (as in, e.g., `a*a')
5148 then we can use a non-backtracking loop based on
4e8a9132 5149 on_failure_keep_string_jump instead of on_failure_jump. */
505bde11 5150 case on_failure_jump_smart:
27c3b45d 5151 QUIT;
25fe55af 5152 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11
SM
5153 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5154 mcnt, p + mcnt);
25fe55af 5155 {
505bde11
SM
5156 unsigned char *p1 = p; /* Next operation. */
5157 unsigned char *p2 = p + mcnt; /* Destination of the jump. */
fa9a63c5 5158
505bde11
SM
5159 p -= 3; /* Reset so that we will re-execute the
5160 instruction once it's been changed. */
fa9a63c5 5161
4e8a9132
SM
5162 EXTRACT_NUMBER (mcnt, p2 - 2);
5163
5164 /* Ensure this is a indeed the trivial kind of loop
5165 we are expecting. */
5166 assert (skip_one_char (p1) == p2 - 3);
5167 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
99633e97 5168 DEBUG_STATEMENT (debug += 2);
505bde11 5169 if (mutually_exclusive_p (bufp, p1, p2))
fa9a63c5 5170 {
505bde11 5171 /* Use a fast `on_failure_keep_string_jump' loop. */
4e8a9132
SM
5172 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
5173 *p = (unsigned char) on_failure_keep_string_jump;
5174 STORE_NUMBER (p2 - 2, mcnt + 3);
25fe55af 5175 }
505bde11 5176 else
fa9a63c5 5177 {
505bde11
SM
5178 /* Default to a safe `on_failure_jump' loop. */
5179 DEBUG_PRINT1 (" smart default => slow loop.\n");
4e8a9132 5180 *p = (unsigned char) on_failure_jump;
fa9a63c5 5181 }
99633e97 5182 DEBUG_STATEMENT (debug -= 2);
25fe55af 5183 }
505bde11 5184 break;
25fe55af
RS
5185
5186 /* Unconditionally jump (without popping any failure points). */
5187 case jump:
fa9a63c5 5188 unconditional_jump:
27c3b45d 5189 QUIT;
fa9a63c5 5190 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5191 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5192 p += mcnt; /* Do the jump. */
505bde11 5193 DEBUG_PRINT2 ("(to %p).\n", p);
25fe55af
RS
5194 break;
5195
5196
25fe55af
RS
5197 /* Have to succeed matching what follows at least n times.
5198 After that, handle like `on_failure_jump'. */
5199 case succeed_n:
5200 EXTRACT_NUMBER (mcnt, p + 2);
5201 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5202
25fe55af
RS
5203 assert (mcnt >= 0);
5204 /* Originally, this is how many times we HAVE to succeed. */
5205 if (mcnt > 0)
5206 {
5207 mcnt--;
fa9a63c5 5208 p += 2;
25fe55af 5209 STORE_NUMBER_AND_INCR (p, mcnt);
505bde11 5210 DEBUG_PRINT3 (" Setting %p to %d.\n", p, mcnt);
25fe55af 5211 }
fa9a63c5 5212 else if (mcnt == 0)
25fe55af 5213 {
505bde11 5214 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
fa9a63c5 5215 p[2] = (unsigned char) no_op;
25fe55af
RS
5216 p[3] = (unsigned char) no_op;
5217 goto on_failure;
5218 }
5219 break;
5220
5221 case jump_n:
5222 EXTRACT_NUMBER (mcnt, p + 2);
5223 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5224
5225 /* Originally, this is how many times we CAN jump. */
5226 if (mcnt)
5227 {
5228 mcnt--;
5229 STORE_NUMBER (p + 2, mcnt);
5e69f11e 5230 goto unconditional_jump;
25fe55af
RS
5231 }
5232 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5233 else
5234 p += 4;
25fe55af 5235 break;
5e69f11e 5236
fa9a63c5
RM
5237 case set_number_at:
5238 {
25fe55af 5239 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5240
25fe55af
RS
5241 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5242 p1 = p + mcnt;
5243 EXTRACT_NUMBER_AND_INCR (mcnt, p);
505bde11 5244 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
fa9a63c5 5245 STORE_NUMBER (p1, mcnt);
25fe55af
RS
5246 break;
5247 }
9121ca40
KH
5248
5249 case wordbound:
66f0296e
SM
5250 case notwordbound:
5251 not = (re_opcode_t) *(p - 1) == notwordbound;
5252 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
fa9a63c5 5253
99633e97 5254 /* We SUCCEED (or FAIL) in one of the following cases: */
9121ca40 5255
b18215fc 5256 /* Case 1: D is at the beginning or the end of string. */
9121ca40 5257 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
66f0296e 5258 not = !not;
b18215fc
RS
5259 else
5260 {
5261 /* C1 is the character before D, S1 is the syntax of C1, C2
5262 is the character at D, and S2 is the syntax of C2. */
5263 int c1, c2, s1, s2;
b18215fc 5264#ifdef emacs
2d1675e4
SM
5265 int offset = PTR_TO_OFFSET (d - 1);
5266 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5d967c7a 5267 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5268#endif
66f0296e 5269 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5270 s1 = SYNTAX (c1);
5271#ifdef emacs
5d967c7a 5272 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5273#endif
f1ad044f 5274 PREFETCH_NOLIMIT ();
2d1675e4 5275 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc
RS
5276 s2 = SYNTAX (c2);
5277
5278 if (/* Case 2: Only one of S1 and S2 is Sword. */
5279 ((s1 == Sword) != (s2 == Sword))
5280 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5281 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5282 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
66f0296e
SM
5283 not = !not;
5284 }
5285 if (not)
9121ca40 5286 break;
b18215fc 5287 else
9121ca40 5288 goto fail;
fa9a63c5
RM
5289
5290 case wordbeg:
25fe55af 5291 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5292
b18215fc
RS
5293 /* We FAIL in one of the following cases: */
5294
25fe55af 5295 /* Case 1: D is at the end of string. */
b18215fc 5296 if (AT_STRINGS_END (d))
99633e97 5297 goto fail;
b18215fc
RS
5298 else
5299 {
5300 /* C1 is the character before D, S1 is the syntax of C1, C2
5301 is the character at D, and S2 is the syntax of C2. */
5302 int c1, c2, s1, s2;
fa9a63c5 5303#ifdef emacs
2d1675e4
SM
5304 int offset = PTR_TO_OFFSET (d);
5305 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5306 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5307#endif
99633e97 5308 PREFETCH ();
2d1675e4 5309 c2 = RE_STRING_CHAR (d, dend - d);
b18215fc 5310 s2 = SYNTAX (c2);
25fe55af 5311
b18215fc
RS
5312 /* Case 2: S2 is not Sword. */
5313 if (s2 != Sword)
5314 goto fail;
5315
5316 /* Case 3: D is not at the beginning of string ... */
5317 if (!AT_STRINGS_BEG (d))
5318 {
5319 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5320#ifdef emacs
5d967c7a 5321 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5322#endif
b18215fc
RS
5323 s1 = SYNTAX (c1);
5324
5325 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5326 returns 0. */
b18215fc
RS
5327 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5328 goto fail;
5329 }
5330 }
e318085a
RS
5331 break;
5332
b18215fc 5333 case wordend:
25fe55af 5334 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5335
5336 /* We FAIL in one of the following cases: */
5337
5338 /* Case 1: D is at the beginning of string. */
5339 if (AT_STRINGS_BEG (d))
e318085a 5340 goto fail;
b18215fc
RS
5341 else
5342 {
5343 /* C1 is the character before D, S1 is the syntax of C1, C2
5344 is the character at D, and S2 is the syntax of C2. */
5345 int c1, c2, s1, s2;
5d967c7a 5346#ifdef emacs
2d1675e4
SM
5347 int offset = PTR_TO_OFFSET (d) - 1;
5348 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
92432794 5349 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5350#endif
99633e97 5351 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
b18215fc
RS
5352 s1 = SYNTAX (c1);
5353
5354 /* Case 2: S1 is not Sword. */
5355 if (s1 != Sword)
5356 goto fail;
5357
5358 /* Case 3: D is not at the end of string ... */
5359 if (!AT_STRINGS_END (d))
5360 {
f1ad044f 5361 PREFETCH_NOLIMIT ();
2d1675e4 5362 c2 = RE_STRING_CHAR (d, dend - d);
5d967c7a
RS
5363#ifdef emacs
5364 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5365#endif
b18215fc
RS
5366 s2 = SYNTAX (c2);
5367
5368 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5369 returns 0. */
b18215fc 5370 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5371 goto fail;
b18215fc
RS
5372 }
5373 }
e318085a
RS
5374 break;
5375
fa9a63c5 5376 case syntaxspec:
1fb352e0
SM
5377 case notsyntaxspec:
5378 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
fa9a63c5 5379 mcnt = *p++;
1fb352e0 5380 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
fa9a63c5 5381 PREFETCH ();
b18215fc
RS
5382#ifdef emacs
5383 {
2d1675e4
SM
5384 int offset = PTR_TO_OFFSET (d);
5385 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
b18215fc
RS
5386 UPDATE_SYNTAX_TABLE (pos1);
5387 }
25fe55af 5388#endif
b18215fc
RS
5389 {
5390 int c, len;
5391
2d1675e4 5392 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 5393
990b2375 5394 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
1fb352e0 5395 goto fail;
b18215fc
RS
5396 d += len;
5397 }
fa9a63c5
RM
5398 break;
5399
b18215fc 5400#ifdef emacs
1fb352e0
SM
5401 case before_dot:
5402 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5403 if (PTR_BYTE_POS (d) >= PT_BYTE)
fa9a63c5 5404 goto fail;
b18215fc
RS
5405 break;
5406
1fb352e0
SM
5407 case at_dot:
5408 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5409 if (PTR_BYTE_POS (d) != PT_BYTE)
5410 goto fail;
5411 break;
b18215fc 5412
1fb352e0
SM
5413 case after_dot:
5414 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5415 if (PTR_BYTE_POS (d) <= PT_BYTE)
5416 goto fail;
e318085a 5417 break;
fa9a63c5 5418
1fb352e0 5419 case categoryspec:
b18215fc 5420 case notcategoryspec:
1fb352e0 5421 not = (re_opcode_t) *(p - 1) == notcategoryspec;
b18215fc 5422 mcnt = *p++;
1fb352e0 5423 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
b18215fc
RS
5424 PREFETCH ();
5425 {
5426 int c, len;
2d1675e4 5427 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
b18215fc 5428
1fb352e0 5429 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
b18215fc
RS
5430 goto fail;
5431 d += len;
5432 }
fa9a63c5 5433 break;
5e69f11e 5434
1fb352e0 5435#endif /* emacs */
5e69f11e 5436
b18215fc
RS
5437 default:
5438 abort ();
fa9a63c5 5439 }
b18215fc 5440 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
5441
5442
5443 /* We goto here if a matching operation fails. */
5444 fail:
27c3b45d 5445 QUIT;
fa9a63c5 5446 if (!FAIL_STACK_EMPTY ())
505bde11 5447 {
66f0296e 5448 re_char *str;
505bde11
SM
5449 unsigned char *pat;
5450 /* A restart point is known. Restore to that state. */
b18215fc 5451 DEBUG_PRINT1 ("\nFAIL:\n");
505bde11
SM
5452 POP_FAILURE_POINT (str, pat);
5453 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
5454 {
5455 case on_failure_keep_string_jump:
5456 assert (str == NULL);
5457 goto continue_failure_jump;
5458
0683b6fa
SM
5459 case on_failure_jump_nastyloop:
5460 assert ((re_opcode_t)pat[-2] == no_op);
5461 PUSH_FAILURE_POINT (pat - 2, str);
5462 /* Fallthrough */
5463
505bde11
SM
5464 case on_failure_jump_loop:
5465 case on_failure_jump:
5466 case succeed_n:
5467 d = str;
5468 continue_failure_jump:
5469 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5470 p = pat + mcnt;
5471 break;
b18215fc 5472
0683b6fa
SM
5473 case no_op:
5474 /* A special frame used for nastyloops. */
5475 goto fail;
5476
505bde11
SM
5477 default:
5478 abort();
5479 }
fa9a63c5 5480
505bde11 5481 assert (p >= bufp->buffer && p <= pend);
b18215fc
RS
5482
5483 if (d >= string1 && d <= end1)
fa9a63c5 5484 dend = end_match_1;
b18215fc 5485 }
fa9a63c5 5486 else
b18215fc 5487 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
5488 } /* for (;;) */
5489
5490 if (best_regs_set)
5491 goto restore_best_regs;
5492
5493 FREE_VARIABLES ();
5494
b18215fc 5495 return -1; /* Failure to match. */
fa9a63c5
RM
5496} /* re_match_2 */
5497\f
5498/* Subroutine definitions for re_match_2. */
5499
fa9a63c5
RM
5500/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5501 bytes; nonzero otherwise. */
5e69f11e 5502
fa9a63c5 5503static int
2d1675e4
SM
5504bcmp_translate (s1, s2, len, translate, multibyte)
5505 re_char *s1, *s2;
fa9a63c5 5506 register int len;
6676cb1c 5507 RE_TRANSLATE_TYPE translate;
2d1675e4 5508 const int multibyte;
fa9a63c5 5509{
2d1675e4
SM
5510 register re_char *p1 = s1, *p2 = s2;
5511 re_char *p1_end = s1 + len;
5512 re_char *p2_end = s2 + len;
e934739e
RS
5513
5514 while (p1 != p1_end && p2 != p2_end)
fa9a63c5 5515 {
e934739e
RS
5516 int p1_charlen, p2_charlen;
5517 int p1_ch, p2_ch;
5518
2d1675e4
SM
5519 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
5520 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
e934739e
RS
5521
5522 if (RE_TRANSLATE (translate, p1_ch)
5523 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 5524 return 1;
e934739e
RS
5525
5526 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 5527 }
e934739e
RS
5528
5529 if (p1 != p1_end || p2 != p2_end)
5530 return 1;
5531
fa9a63c5
RM
5532 return 0;
5533}
5534\f
5535/* Entry points for GNU code. */
5536
5537/* re_compile_pattern is the GNU regular expression compiler: it
5538 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5539 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 5540
fa9a63c5
RM
5541 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5542 are set in BUFP on entry.
5e69f11e 5543
b18215fc 5544 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
5545
5546const char *
5547re_compile_pattern (pattern, length, bufp)
5548 const char *pattern;
5549 int length;
5550 struct re_pattern_buffer *bufp;
5551{
5552 reg_errcode_t ret;
5e69f11e 5553
fa9a63c5
RM
5554 /* GNU code is written to assume at least RE_NREGS registers will be set
5555 (and at least one extra will be -1). */
5556 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 5557
fa9a63c5
RM
5558 /* And GNU code determines whether or not to get register information
5559 by passing null for the REGS argument to re_match, etc., not by
5560 setting no_sub. */
5561 bufp->no_sub = 0;
5e69f11e 5562
b18215fc 5563 /* Match anchors at newline. */
fa9a63c5 5564 bufp->newline_anchor = 1;
5e69f11e 5565
fa9a63c5
RM
5566 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5567
5568 if (!ret)
5569 return NULL;
5570 return gettext (re_error_msgid[(int) ret]);
5e69f11e 5571}
fa9a63c5 5572\f
b18215fc
RS
5573/* Entry points compatible with 4.2 BSD regex library. We don't define
5574 them unless specifically requested. */
fa9a63c5 5575
0c085854 5576#if defined (_REGEX_RE_COMP) || defined (_LIBC)
fa9a63c5
RM
5577
5578/* BSD has one and only one pattern buffer. */
5579static struct re_pattern_buffer re_comp_buf;
5580
5581char *
48afdd44
RM
5582#ifdef _LIBC
5583/* Make these definitions weak in libc, so POSIX programs can redefine
5584 these names if they don't use our functions, and still use
5585 regcomp/regexec below without link errors. */
5586weak_function
5587#endif
fa9a63c5
RM
5588re_comp (s)
5589 const char *s;
5590{
5591 reg_errcode_t ret;
5e69f11e 5592
fa9a63c5
RM
5593 if (!s)
5594 {
5595 if (!re_comp_buf.buffer)
5596 return gettext ("No previous regular expression");
5597 return 0;
5598 }
5599
5600 if (!re_comp_buf.buffer)
5601 {
5602 re_comp_buf.buffer = (unsigned char *) malloc (200);
5603 if (re_comp_buf.buffer == NULL)
b18215fc 5604 return gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
5605 re_comp_buf.allocated = 200;
5606
5607 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5608 if (re_comp_buf.fastmap == NULL)
5609 return gettext (re_error_msgid[(int) REG_ESPACE]);
5610 }
5611
5612 /* Since `re_exec' always passes NULL for the `regs' argument, we
5613 don't need to initialize the pattern buffer fields which affect it. */
5614
b18215fc 5615 /* Match anchors at newlines. */
fa9a63c5
RM
5616 re_comp_buf.newline_anchor = 1;
5617
5618 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 5619
fa9a63c5
RM
5620 if (!ret)
5621 return NULL;
5622
5623 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5624 return (char *) gettext (re_error_msgid[(int) ret]);
5625}
5626
5627
5628int
48afdd44
RM
5629#ifdef _LIBC
5630weak_function
5631#endif
fa9a63c5
RM
5632re_exec (s)
5633 const char *s;
5634{
5635 const int len = strlen (s);
5636 return
5637 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5638}
5639#endif /* _REGEX_RE_COMP */
5640\f
5641/* POSIX.2 functions. Don't define these for Emacs. */
5642
5643#ifndef emacs
5644
5645/* regcomp takes a regular expression as a string and compiles it.
5646
b18215fc 5647 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
5648 since POSIX says we shouldn't. Thus, we set
5649
5650 `buffer' to the compiled pattern;
5651 `used' to the length of the compiled pattern;
5652 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5653 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5654 RE_SYNTAX_POSIX_BASIC;
5655 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5656 `fastmap' and `fastmap_accurate' to zero;
5657 `re_nsub' to the number of subexpressions in PATTERN.
5658
5659 PATTERN is the address of the pattern string.
5660
5661 CFLAGS is a series of bits which affect compilation.
5662
5663 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5664 use POSIX basic syntax.
5665
5666 If REG_NEWLINE is set, then . and [^...] don't match newline.
5667 Also, regexec will try a match beginning after every newline.
5668
5669 If REG_ICASE is set, then we considers upper- and lowercase
5670 versions of letters to be equivalent when matching.
5671
5672 If REG_NOSUB is set, then when PREG is passed to regexec, that
5673 routine will report only success or failure, and nothing about the
5674 registers.
5675
b18215fc 5676 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
5677 the return codes and their meanings.) */
5678
5679int
5680regcomp (preg, pattern, cflags)
5681 regex_t *preg;
5e69f11e 5682 const char *pattern;
fa9a63c5
RM
5683 int cflags;
5684{
5685 reg_errcode_t ret;
5686 unsigned syntax
5687 = (cflags & REG_EXTENDED) ?
5688 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5689
5690 /* regex_compile will allocate the space for the compiled pattern. */
5691 preg->buffer = 0;
5692 preg->allocated = 0;
5693 preg->used = 0;
5e69f11e 5694
fa9a63c5
RM
5695 /* Don't bother to use a fastmap when searching. This simplifies the
5696 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5697 characters after newlines into the fastmap. This way, we just try
5698 every character. */
5699 preg->fastmap = 0;
5e69f11e 5700
fa9a63c5
RM
5701 if (cflags & REG_ICASE)
5702 {
5703 unsigned i;
5e69f11e 5704
6676cb1c
RS
5705 preg->translate
5706 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5707 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 5708 if (preg->translate == NULL)
b18215fc 5709 return (int) REG_ESPACE;
fa9a63c5
RM
5710
5711 /* Map uppercase characters to corresponding lowercase ones. */
5712 for (i = 0; i < CHAR_SET_SIZE; i++)
b18215fc 5713 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
fa9a63c5
RM
5714 }
5715 else
5716 preg->translate = NULL;
5717
5718 /* If REG_NEWLINE is set, newlines are treated differently. */
5719 if (cflags & REG_NEWLINE)
5720 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5721 syntax &= ~RE_DOT_NEWLINE;
5722 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
b18215fc 5723 /* It also changes the matching behavior. */
fa9a63c5
RM
5724 preg->newline_anchor = 1;
5725 }
5726 else
5727 preg->newline_anchor = 0;
5728
5729 preg->no_sub = !!(cflags & REG_NOSUB);
5730
5e69f11e 5731 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5
RM
5732 can use strlen here in compiling the pattern. */
5733 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5e69f11e 5734
fa9a63c5
RM
5735 /* POSIX doesn't distinguish between an unmatched open-group and an
5736 unmatched close-group: both are REG_EPAREN. */
5737 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5e69f11e 5738
fa9a63c5
RM
5739 return (int) ret;
5740}
5741
5742
5743/* regexec searches for a given pattern, specified by PREG, in the
5744 string STRING.
5e69f11e 5745
fa9a63c5 5746 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 5747 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
5748 least NMATCH elements, and we set them to the offsets of the
5749 corresponding matched substrings.
5e69f11e 5750
fa9a63c5
RM
5751 EFLAGS specifies `execution flags' which affect matching: if
5752 REG_NOTBOL is set, then ^ does not match at the beginning of the
5753 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 5754
fa9a63c5
RM
5755 We return 0 if we find a match and REG_NOMATCH if not. */
5756
5757int
5758regexec (preg, string, nmatch, pmatch, eflags)
5759 const regex_t *preg;
5e69f11e
RM
5760 const char *string;
5761 size_t nmatch;
5762 regmatch_t pmatch[];
fa9a63c5
RM
5763 int eflags;
5764{
5765 int ret;
5766 struct re_registers regs;
5767 regex_t private_preg;
5768 int len = strlen (string);
5769 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5770
5771 private_preg = *preg;
5e69f11e 5772
fa9a63c5
RM
5773 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5774 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 5775
fa9a63c5
RM
5776 /* The user has told us exactly how many registers to return
5777 information about, via `nmatch'. We have to pass that on to the
b18215fc 5778 matching routines. */
fa9a63c5 5779 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 5780
fa9a63c5
RM
5781 if (want_reg_info)
5782 {
5783 regs.num_regs = nmatch;
5784 regs.start = TALLOC (nmatch, regoff_t);
5785 regs.end = TALLOC (nmatch, regoff_t);
5786 if (regs.start == NULL || regs.end == NULL)
b18215fc 5787 return (int) REG_NOMATCH;
fa9a63c5
RM
5788 }
5789
5790 /* Perform the searching operation. */
5791 ret = re_search (&private_preg, string, len,
b18215fc
RS
5792 /* start: */ 0, /* range: */ len,
5793 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 5794
fa9a63c5
RM
5795 /* Copy the register information to the POSIX structure. */
5796 if (want_reg_info)
5797 {
5798 if (ret >= 0)
b18215fc
RS
5799 {
5800 unsigned r;
fa9a63c5 5801
b18215fc
RS
5802 for (r = 0; r < nmatch; r++)
5803 {
5804 pmatch[r].rm_so = regs.start[r];
5805 pmatch[r].rm_eo = regs.end[r];
5806 }
5807 }
fa9a63c5 5808
b18215fc 5809 /* If we needed the temporary register info, free the space now. */
fa9a63c5
RM
5810 free (regs.start);
5811 free (regs.end);
5812 }
5813
5814 /* We want zero return to mean success, unlike `re_search'. */
5815 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5816}
5817
5818
5819/* Returns a message corresponding to an error code, ERRCODE, returned
5820 from either regcomp or regexec. We don't use PREG here. */
5821
5822size_t
5823regerror (errcode, preg, errbuf, errbuf_size)
5824 int errcode;
5825 const regex_t *preg;
5826 char *errbuf;
5827 size_t errbuf_size;
5828{
5829 const char *msg;
5830 size_t msg_size;
5831
5832 if (errcode < 0
5833 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 5834 /* Only error codes returned by the rest of the code should be passed
b18215fc 5835 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
5836 code generates an invalid error code, then the program has a bug.
5837 Dump core so we can fix it. */
5838 abort ();
5839
5840 msg = gettext (re_error_msgid[errcode]);
5841
5842 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 5843
fa9a63c5
RM
5844 if (errbuf_size != 0)
5845 {
5846 if (msg_size > errbuf_size)
b18215fc
RS
5847 {
5848 strncpy (errbuf, msg, errbuf_size - 1);
5849 errbuf[errbuf_size - 1] = 0;
5850 }
fa9a63c5 5851 else
b18215fc 5852 strcpy (errbuf, msg);
fa9a63c5
RM
5853 }
5854
5855 return msg_size;
5856}
5857
5858
5859/* Free dynamically allocated space used by PREG. */
5860
5861void
5862regfree (preg)
5863 regex_t *preg;
5864{
5865 if (preg->buffer != NULL)
5866 free (preg->buffer);
5867 preg->buffer = NULL;
5e69f11e 5868
fa9a63c5
RM
5869 preg->allocated = 0;
5870 preg->used = 0;
5871
5872 if (preg->fastmap != NULL)
5873 free (preg->fastmap);
5874 preg->fastmap = NULL;
5875 preg->fastmap_accurate = 0;
5876
5877 if (preg->translate != NULL)
5878 free (preg->translate);
5879 preg->translate = NULL;
5880}
5881
5882#endif /* not emacs */