Explain that unused input methods can be removed from the installation
[bpt/emacs.git] / src / regex.c
CommitLineData
e318085a
RS
1/* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
bc78d348
KB
3 internationalization features.)
4
9d99031f 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998 Free Software Foundation, Inc.
bc78d348 6
fa9a63c5
RM
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
25fe55af 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
fa9a63c5
RM
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
ba4a8e51 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25fe55af 20 USA. */
fa9a63c5
RM
21
22/* AIX requires this to be the first thing in the file. */
23#if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25#endif
26
68d96f02 27#undef _GNU_SOURCE
fa9a63c5
RM
28#define _GNU_SOURCE
29
51352796 30#ifdef emacs
25fe55af
RS
31/* Converts the pointer to the char to BEG-based offset from the start. */
32#define PTR_TO_OFFSET(d) \
b18215fc
RS
33 POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \
34 ? (d) - string1 : (d) - (string2 - size1))
7e95234e 35#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
51352796
RS
36#else
37#define PTR_TO_OFFSET(d) 0
38#endif
b18215fc 39
fa9a63c5
RM
40#ifdef HAVE_CONFIG_H
41#include <config.h>
42#endif
43
25fe55af 44/* We need this for `regex.h', and perhaps for the Emacs include files. */
fa9a63c5
RM
45#include <sys/types.h>
46
25fe55af 47/* This is for other GNU distributions with internationalized messages. */
fa9a63c5
RM
48#if HAVE_LIBINTL_H || defined (_LIBC)
49# include <libintl.h>
50#else
51# define gettext(msgid) (msgid)
52#endif
53
5e69f11e
RM
54#ifndef gettext_noop
55/* This define is so xgettext can find the internationalizable
56 strings. */
57#define gettext_noop(String) String
58#endif
59
fa9a63c5
RM
60/* The `emacs' switch turns on certain matching commands
61 that make sense only in Emacs. */
62#ifdef emacs
63
64#include "lisp.h"
65#include "buffer.h"
b18215fc
RS
66
67/* Make syntax table lookup grant data in gl_state. */
68#define SYNTAX_ENTRY_VIA_PROPERTY
69
fa9a63c5 70#include "syntax.h"
b18215fc
RS
71#include "charset.h"
72#include "category.h"
fa9a63c5 73
9abbd165 74#define malloc xmalloc
64e3c718 75#define realloc xrealloc
9abbd165
RS
76#define free xfree
77
fa9a63c5
RM
78#else /* not emacs */
79
80/* If we are not linking with Emacs proper,
81 we can't use the relocating allocator
82 even if config.h says that we can. */
83#undef REL_ALLOC
84
85#if defined (STDC_HEADERS) || defined (_LIBC)
86#include <stdlib.h>
87#else
88char *malloc ();
89char *realloc ();
90#endif
91
9e4ecb26 92/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
25fe55af 93 If nothing else has been done, use the method below. */
9e4ecb26
KH
94#ifdef INHIBIT_STRING_HEADER
95#if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
96#if !defined (bzero) && !defined (bcopy)
97#undef INHIBIT_STRING_HEADER
98#endif
99#endif
100#endif
101
102/* This is the normal way of making sure we have a bcopy and a bzero.
103 This is used in most programs--a few other programs avoid this
104 by defining INHIBIT_STRING_HEADER. */
fa9a63c5 105#ifndef INHIBIT_STRING_HEADER
7f998252 106#if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
fa9a63c5
RM
107#include <string.h>
108#ifndef bcmp
109#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
110#endif
111#ifndef bcopy
112#define bcopy(s, d, n) memcpy ((d), (s), (n))
113#endif
114#ifndef bzero
115#define bzero(s, n) memset ((s), 0, (n))
116#endif
117#else
118#include <strings.h>
119#endif
120#endif
121
122/* Define the syntax stuff for \<, \>, etc. */
123
124/* This must be nonzero for the wordchar and notwordchar pattern
125 commands in re_match_2. */
5e69f11e 126#ifndef Sword
fa9a63c5
RM
127#define Sword 1
128#endif
129
130#ifdef SWITCH_ENUM_BUG
131#define SWITCH_ENUM_CAST(x) ((int)(x))
132#else
133#define SWITCH_ENUM_CAST(x) (x)
134#endif
135
136#ifdef SYNTAX_TABLE
137
138extern char *re_syntax_table;
139
140#else /* not SYNTAX_TABLE */
141
142/* How many characters in the character set. */
143#define CHAR_SET_SIZE 256
144
145static char re_syntax_table[CHAR_SET_SIZE];
146
147static void
148init_syntax_once ()
149{
150 register int c;
151 static int done = 0;
152
153 if (done)
154 return;
155
156 bzero (re_syntax_table, sizeof re_syntax_table);
157
158 for (c = 'a'; c <= 'z'; c++)
159 re_syntax_table[c] = Sword;
160
161 for (c = 'A'; c <= 'Z'; c++)
162 re_syntax_table[c] = Sword;
163
164 for (c = '0'; c <= '9'; c++)
165 re_syntax_table[c] = Sword;
166
167 re_syntax_table['_'] = Sword;
168
169 done = 1;
170}
171
172#endif /* not SYNTAX_TABLE */
173
174#define SYNTAX(c) re_syntax_table[c]
175
e934739e 176/* Dummy macros for non-Emacs environments. */
b18215fc
RS
177#define BASE_LEADING_CODE_P(c) (0)
178#define WORD_BOUNDARY_P(c1, c2) (0)
179#define CHAR_HEAD_P(p) (1)
180#define SINGLE_BYTE_CHAR_P(c) (1)
181#define SAME_CHARSET_P(c1, c2) (1)
182#define MULTIBYTE_FORM_LENGTH(p, s) (1)
183#define STRING_CHAR(p, s) (*(p))
184#define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
185#define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \
186 (c = ((p) == (end1) ? *(str2) : *(p)))
187#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
188 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
fa9a63c5
RM
189#endif /* not emacs */
190\f
191/* Get the interface, including the syntax bits. */
192#include "regex.h"
193
f71b19b6
DL
194/* isalpha etc. are used for the character classes. */
195#include <ctype.h>
fa9a63c5 196
f71b19b6 197#ifdef emacs
fa9a63c5 198
f71b19b6
DL
199/* 1 if C is an ASCII character. */
200#define IS_REAL_ASCII(c) ((c) < 0200)
fa9a63c5 201
f71b19b6
DL
202/* 1 if C is a unibyte character. */
203#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
96cc36cc 204
f71b19b6 205/* The Emacs definitions should not be directly affected by locales. */
96cc36cc 206
f71b19b6
DL
207/* In Emacs, these are only used for single-byte characters. */
208#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
209#define ISCNTRL(c) ((c) < ' ')
210#define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
211 || ((c) >= 'a' && (c) <= 'f') \
212 || ((c) >= 'A' && (c) <= 'F'))
96cc36cc
RS
213
214/* This is only used for single-byte characters. */
215#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
216
217/* The rest must handle multibyte characters. */
218
219#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 220 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
221 : 1)
222
223#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
f71b19b6 224 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
96cc36cc
RS
225 : 1)
226
f71b19b6
DL
227#define ISALNUM(c) (IS_REAL_ASCII (c) \
228 ? (((c) >= 'a' && (c) <= 'z') \
229 || ((c) >= 'A' && (c) <= 'Z') \
230 || ((c) >= '0' && (c) <= '9')) \
96cc36cc
RS
231 : SYNTAX (c) == Sword)
232
f71b19b6
DL
233#define ISALPHA(c) (IS_REAL_ASCII (c) \
234 ? (((c) >= 'a' && (c) <= 'z') \
235 || ((c) >= 'A' && (c) <= 'Z')) \
96cc36cc
RS
236 : SYNTAX (c) == Sword)
237
238#define ISLOWER(c) (LOWERCASEP (c))
239
f71b19b6
DL
240#define ISPUNCT(c) (IS_REAL_ASCII (c) \
241 ? ((c) > ' ' && (c) < 0177 \
242 && !(((c) >= 'a' && (c) <= 'z') \
243 || ((c) >= 'A' && (c) <= 'Z') \
244 || ((c) >= '0' && (c) <= '9'))) \
96cc36cc
RS
245 : SYNTAX (c) != Sword)
246
247#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
248
249#define ISUPPER(c) (UPPERCASEP (c))
250
251#define ISWORD(c) (SYNTAX (c) == Sword)
252
253#else /* not emacs */
254
f71b19b6
DL
255/* Jim Meyering writes:
256
257 "... Some ctype macros are valid only for character codes that
258 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
259 using /bin/cc or gcc but without giving an ansi option). So, all
260 ctype uses should be through macros like ISPRINT... If
261 STDC_HEADERS is defined, then autoconf has verified that the ctype
262 macros don't need to be guarded with references to isascii. ...
263 Defining isascii to 1 should let any compiler worth its salt
264 eliminate the && through constant folding." */
265
266#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
267#define ISASCII(c) 1
268#else
269#define ISASCII(c) isascii(c)
270#endif
271
272/* 1 if C is an ASCII character. */
273#define IS_REAL_ASCII(c) ((c) < 0200)
274
275/* This distinction is not meaningful, except in Emacs. */
276#define ISUNIBYTE(c) 1
277
278#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
279#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
280#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
281
fa9a63c5
RM
282#ifdef isblank
283#define ISBLANK(c) (ISASCII (c) && isblank (c))
284#else
285#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
286#endif
287#ifdef isgraph
288#define ISGRAPH(c) (ISASCII (c) && isgraph (c))
289#else
290#define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
291#endif
292
293#define ISPRINT(c) (ISASCII (c) && isprint (c))
294#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
295#define ISALNUM(c) (ISASCII (c) && isalnum (c))
296#define ISALPHA(c) (ISASCII (c) && isalpha (c))
297#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
298#define ISLOWER(c) (ISASCII (c) && islower (c))
299#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
300#define ISSPACE(c) (ISASCII (c) && isspace (c))
301#define ISUPPER(c) (ISASCII (c) && isupper (c))
302#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
303
96cc36cc
RS
304#define ISWORD(c) ISALPHA(c)
305
306#endif /* not emacs */
307\f
fa9a63c5 308#ifndef NULL
075f06ec 309#define NULL (void *)0
fa9a63c5
RM
310#endif
311
312/* We remove any previous definition of `SIGN_EXTEND_CHAR',
313 since ours (we hope) works properly with all combinations of
314 machines, compilers, `char' and `unsigned char' argument types.
25fe55af 315 (Per Bothner suggested the basic approach.) */
fa9a63c5
RM
316#undef SIGN_EXTEND_CHAR
317#if __STDC__
318#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
319#else /* not __STDC__ */
320/* As in Harbison and Steele. */
321#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
322#endif
323\f
324/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
325 use `alloca' instead of `malloc'. This is because using malloc in
326 re_search* or re_match* could cause memory leaks when C-g is used in
327 Emacs; also, malloc is slower and causes storage fragmentation. On
5e69f11e
RM
328 the other hand, malloc is more portable, and easier to debug.
329
fa9a63c5
RM
330 Because we sometimes use alloca, some routines have to be macros,
331 not functions -- `alloca'-allocated space disappears at the end of the
332 function it is called in. */
333
334#ifdef REGEX_MALLOC
335
336#define REGEX_ALLOCATE malloc
337#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
338#define REGEX_FREE free
339
340#else /* not REGEX_MALLOC */
341
342/* Emacs already defines alloca, sometimes. */
343#ifndef alloca
344
345/* Make alloca work the best possible way. */
346#ifdef __GNUC__
347#define alloca __builtin_alloca
348#else /* not __GNUC__ */
349#if HAVE_ALLOCA_H
350#include <alloca.h>
351#else /* not __GNUC__ or HAVE_ALLOCA_H */
f3c4387f 352#if 0 /* It is a bad idea to declare alloca. We always cast the result. */
25fe55af 353#ifndef _AIX /* Already did AIX, up at the top. */
fa9a63c5
RM
354char *alloca ();
355#endif /* not _AIX */
f3c4387f 356#endif
5e69f11e 357#endif /* not HAVE_ALLOCA_H */
fa9a63c5
RM
358#endif /* not __GNUC__ */
359
360#endif /* not alloca */
361
362#define REGEX_ALLOCATE alloca
363
364/* Assumes a `char *destination' variable. */
365#define REGEX_REALLOCATE(source, osize, nsize) \
366 (destination = (char *) alloca (nsize), \
367 bcopy (source, destination, osize), \
368 destination)
369
370/* No need to do anything to free, after alloca. */
c2e1680a 371#define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
fa9a63c5
RM
372
373#endif /* not REGEX_MALLOC */
374
375/* Define how to allocate the failure stack. */
376
33487cc8 377#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
4297555e 378
fa9a63c5
RM
379#define REGEX_ALLOCATE_STACK(size) \
380 r_alloc (&failure_stack_ptr, (size))
381#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
382 r_re_alloc (&failure_stack_ptr, (nsize))
383#define REGEX_FREE_STACK(ptr) \
384 r_alloc_free (&failure_stack_ptr)
385
4297555e 386#else /* not using relocating allocator */
fa9a63c5
RM
387
388#ifdef REGEX_MALLOC
389
390#define REGEX_ALLOCATE_STACK malloc
391#define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
392#define REGEX_FREE_STACK free
393
394#else /* not REGEX_MALLOC */
395
396#define REGEX_ALLOCATE_STACK alloca
397
398#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
399 REGEX_REALLOCATE (source, osize, nsize)
25fe55af 400/* No need to explicitly free anything. */
fa9a63c5
RM
401#define REGEX_FREE_STACK(arg)
402
403#endif /* not REGEX_MALLOC */
4297555e 404#endif /* not using relocating allocator */
fa9a63c5
RM
405
406
407/* True if `size1' is non-NULL and PTR is pointing anywhere inside
408 `string1' or just past its end. This works if PTR is NULL, which is
409 a good thing. */
25fe55af 410#define FIRST_STRING_P(ptr) \
fa9a63c5
RM
411 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
412
413/* (Re)Allocate N items of type T using malloc, or fail. */
414#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
415#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
416#define RETALLOC_IF(addr, n, t) \
417 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
418#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
419
25fe55af 420#define BYTEWIDTH 8 /* In bits. */
fa9a63c5
RM
421
422#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
423
424#undef MAX
425#undef MIN
426#define MAX(a, b) ((a) > (b) ? (a) : (b))
427#define MIN(a, b) ((a) < (b) ? (a) : (b))
428
429typedef char boolean;
430#define false 0
431#define true 1
432
433static int re_match_2_internal ();
434\f
435/* These are the command codes that appear in compiled regular
25fe55af 436 expressions. Some opcodes are followed by argument bytes. A
fa9a63c5
RM
437 command code can specify any interpretation whatsoever for its
438 arguments. Zero bytes may appear in the compiled regular expression. */
439
440typedef enum
441{
442 no_op = 0,
443
25fe55af 444 /* Succeed right away--no more backtracking. */
fa9a63c5
RM
445 succeed,
446
25fe55af 447 /* Followed by one byte giving n, then by n literal bytes. */
fa9a63c5
RM
448 exactn,
449
25fe55af 450 /* Matches any (more or less) character. */
fa9a63c5
RM
451 anychar,
452
25fe55af
RS
453 /* Matches any one char belonging to specified set. First
454 following byte is number of bitmap bytes. Then come bytes
455 for a bitmap saying which chars are in. Bits in each byte
456 are ordered low-bit-first. A character is in the set if its
457 bit is 1. A character too large to have a bit in the map is
96cc36cc
RS
458 automatically not in the set.
459
460 If the length byte has the 0x80 bit set, then that stuff
461 is followed by a range table:
462 2 bytes of flags for character sets (low 8 bits, high 8 bits)
463 See RANGE_TABLE_WORK_BITS below.
464 2 bytes, the number of pairs that follow
465 pairs, each 2 multibyte characters,
466 each multibyte character represented as 3 bytes. */
fa9a63c5
RM
467 charset,
468
25fe55af
RS
469 /* Same parameters as charset, but match any character that is
470 not one of those specified. */
fa9a63c5
RM
471 charset_not,
472
25fe55af
RS
473 /* Start remembering the text that is matched, for storing in a
474 register. Followed by one byte with the register number, in
475 the range 0 to one less than the pattern buffer's re_nsub
476 field. Then followed by one byte with the number of groups
477 inner to this one. (This last has to be part of the
478 start_memory only because we need it in the on_failure_jump
479 of re_match_2.) */
fa9a63c5
RM
480 start_memory,
481
25fe55af
RS
482 /* Stop remembering the text that is matched and store it in a
483 memory register. Followed by one byte with the register
484 number, in the range 0 to one less than `re_nsub' in the
485 pattern buffer, and one byte with the number of inner groups,
486 just like `start_memory'. (We need the number of inner
487 groups here because we don't have any easy way of finding the
488 corresponding start_memory when we're at a stop_memory.) */
fa9a63c5
RM
489 stop_memory,
490
25fe55af
RS
491 /* Match a duplicate of something remembered. Followed by one
492 byte containing the register number. */
fa9a63c5
RM
493 duplicate,
494
25fe55af 495 /* Fail unless at beginning of line. */
fa9a63c5
RM
496 begline,
497
25fe55af 498 /* Fail unless at end of line. */
fa9a63c5
RM
499 endline,
500
25fe55af
RS
501 /* Succeeds if at beginning of buffer (if emacs) or at beginning
502 of string to be matched (if not). */
fa9a63c5
RM
503 begbuf,
504
25fe55af 505 /* Analogously, for end of buffer/string. */
fa9a63c5 506 endbuf,
5e69f11e 507
25fe55af 508 /* Followed by two byte relative address to which to jump. */
5e69f11e 509 jump,
fa9a63c5
RM
510
511 /* Same as jump, but marks the end of an alternative. */
512 jump_past_alt,
513
25fe55af
RS
514 /* Followed by two-byte relative address of place to resume at
515 in case of failure. */
fa9a63c5 516 on_failure_jump,
5e69f11e 517
25fe55af
RS
518 /* Like on_failure_jump, but pushes a placeholder instead of the
519 current string position when executed. */
fa9a63c5 520 on_failure_keep_string_jump,
5e69f11e 521
25fe55af
RS
522 /* Throw away latest failure point and then jump to following
523 two-byte relative address. */
fa9a63c5
RM
524 pop_failure_jump,
525
25fe55af
RS
526 /* Change to pop_failure_jump if know won't have to backtrack to
527 match; otherwise change to jump. This is used to jump
528 back to the beginning of a repeat. If what follows this jump
529 clearly won't match what the repeat does, such that we can be
530 sure that there is no use backtracking out of repetitions
531 already matched, then we change it to a pop_failure_jump.
532 Followed by two-byte address. */
fa9a63c5
RM
533 maybe_pop_jump,
534
25fe55af
RS
535 /* Jump to following two-byte address, and push a dummy failure
536 point. This failure point will be thrown away if an attempt
537 is made to use it for a failure. A `+' construct makes this
538 before the first repeat. Also used as an intermediary kind
539 of jump when compiling an alternative. */
fa9a63c5
RM
540 dummy_failure_jump,
541
542 /* Push a dummy failure point and continue. Used at the end of
543 alternatives. */
544 push_dummy_failure,
545
25fe55af
RS
546 /* Followed by two-byte relative address and two-byte number n.
547 After matching N times, jump to the address upon failure. */
fa9a63c5
RM
548 succeed_n,
549
25fe55af
RS
550 /* Followed by two-byte relative address, and two-byte number n.
551 Jump to the address N times, then fail. */
fa9a63c5
RM
552 jump_n,
553
25fe55af
RS
554 /* Set the following two-byte relative address to the
555 subsequent two-byte number. The address *includes* the two
556 bytes of number. */
fa9a63c5
RM
557 set_number_at,
558
559 wordchar, /* Matches any word-constituent character. */
560 notwordchar, /* Matches any char that is not a word-constituent. */
561
562 wordbeg, /* Succeeds if at word beginning. */
563 wordend, /* Succeeds if at word end. */
564
565 wordbound, /* Succeeds if at a word boundary. */
25fe55af 566 notwordbound /* Succeeds if not at a word boundary. */
fa9a63c5
RM
567
568#ifdef emacs
569 ,before_dot, /* Succeeds if before point. */
570 at_dot, /* Succeeds if at point. */
571 after_dot, /* Succeeds if after point. */
572
573 /* Matches any character whose syntax is specified. Followed by
25fe55af 574 a byte which contains a syntax code, e.g., Sword. */
fa9a63c5
RM
575 syntaxspec,
576
577 /* Matches any character whose syntax is not that specified. */
b18215fc
RS
578 notsyntaxspec,
579
580 /* Matches any character whose category-set contains the specified
25fe55af
RS
581 category. The operator is followed by a byte which contains a
582 category code (mnemonic ASCII character). */
b18215fc
RS
583 categoryspec,
584
585 /* Matches any character whose category-set does not contain the
586 specified category. The operator is followed by a byte which
587 contains the category code (mnemonic ASCII character). */
588 notcategoryspec
fa9a63c5
RM
589#endif /* emacs */
590} re_opcode_t;
591\f
592/* Common operations on the compiled pattern. */
593
594/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
595
596#define STORE_NUMBER(destination, number) \
597 do { \
598 (destination)[0] = (number) & 0377; \
599 (destination)[1] = (number) >> 8; \
600 } while (0)
601
602/* Same as STORE_NUMBER, except increment DESTINATION to
603 the byte after where the number is stored. Therefore, DESTINATION
604 must be an lvalue. */
605
606#define STORE_NUMBER_AND_INCR(destination, number) \
607 do { \
608 STORE_NUMBER (destination, number); \
609 (destination) += 2; \
610 } while (0)
611
612/* Put into DESTINATION a number stored in two contiguous bytes starting
613 at SOURCE. */
614
615#define EXTRACT_NUMBER(destination, source) \
616 do { \
617 (destination) = *(source) & 0377; \
618 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
619 } while (0)
620
621#ifdef DEBUG
622static void
623extract_number (dest, source)
624 int *dest;
625 unsigned char *source;
626{
5e69f11e 627 int temp = SIGN_EXTEND_CHAR (*(source + 1));
fa9a63c5
RM
628 *dest = *source & 0377;
629 *dest += temp << 8;
630}
631
25fe55af 632#ifndef EXTRACT_MACROS /* To debug the macros. */
fa9a63c5
RM
633#undef EXTRACT_NUMBER
634#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
635#endif /* not EXTRACT_MACROS */
636
637#endif /* DEBUG */
638
639/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
640 SOURCE must be an lvalue. */
641
642#define EXTRACT_NUMBER_AND_INCR(destination, source) \
643 do { \
644 EXTRACT_NUMBER (destination, source); \
25fe55af 645 (source) += 2; \
fa9a63c5
RM
646 } while (0)
647
648#ifdef DEBUG
649static void
650extract_number_and_incr (destination, source)
651 int *destination;
652 unsigned char **source;
5e69f11e 653{
fa9a63c5
RM
654 extract_number (destination, *source);
655 *source += 2;
656}
657
658#ifndef EXTRACT_MACROS
659#undef EXTRACT_NUMBER_AND_INCR
660#define EXTRACT_NUMBER_AND_INCR(dest, src) \
661 extract_number_and_incr (&dest, &src)
662#endif /* not EXTRACT_MACROS */
663
664#endif /* DEBUG */
665\f
b18215fc
RS
666/* Store a multibyte character in three contiguous bytes starting
667 DESTINATION, and increment DESTINATION to the byte after where the
25fe55af 668 character is stored. Therefore, DESTINATION must be an lvalue. */
b18215fc
RS
669
670#define STORE_CHARACTER_AND_INCR(destination, character) \
671 do { \
672 (destination)[0] = (character) & 0377; \
673 (destination)[1] = ((character) >> 8) & 0377; \
674 (destination)[2] = (character) >> 16; \
675 (destination) += 3; \
676 } while (0)
677
678/* Put into DESTINATION a character stored in three contiguous bytes
25fe55af 679 starting at SOURCE. */
b18215fc
RS
680
681#define EXTRACT_CHARACTER(destination, source) \
682 do { \
683 (destination) = ((source)[0] \
684 | ((source)[1] << 8) \
685 | ((source)[2] << 16)); \
686 } while (0)
687
688
689/* Macros for charset. */
690
691/* Size of bitmap of charset P in bytes. P is a start of charset,
692 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
693#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
694
695/* Nonzero if charset P has range table. */
25fe55af 696#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
b18215fc
RS
697
698/* Return the address of range table of charset P. But not the start
699 of table itself, but the before where the number of ranges is
96cc36cc
RS
700 stored. `2 +' means to skip re_opcode_t and size of bitmap,
701 and the 2 bytes of flags at the start of the range table. */
702#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
703
704/* Extract the bit flags that start a range table. */
705#define CHARSET_RANGE_TABLE_BITS(p) \
706 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
707 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
b18215fc
RS
708
709/* Test if C is listed in the bitmap of charset P. */
710#define CHARSET_LOOKUP_BITMAP(p, c) \
711 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
712 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
713
714/* Return the address of end of RANGE_TABLE. COUNT is number of
25fe55af
RS
715 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
716 is start of range and end of range. `* 3' is size of each start
b18215fc
RS
717 and end. */
718#define CHARSET_RANGE_TABLE_END(range_table, count) \
719 ((range_table) + (count) * 2 * 3)
720
25fe55af 721/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
b18215fc
RS
722 COUNT is number of ranges in RANGE_TABLE. */
723#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
724 do \
725 { \
726 int range_start, range_end; \
727 unsigned char *p; \
728 unsigned char *range_table_end \
729 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
730 \
731 for (p = (range_table); p < range_table_end; p += 2 * 3) \
732 { \
733 EXTRACT_CHARACTER (range_start, p); \
734 EXTRACT_CHARACTER (range_end, p + 3); \
735 \
736 if (range_start <= (c) && (c) <= range_end) \
737 { \
738 (not) = !(not); \
739 break; \
740 } \
741 } \
742 } \
743 while (0)
744
745/* Test if C is in range table of CHARSET. The flag NOT is negated if
746 C is listed in it. */
747#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
748 do \
749 { \
750 /* Number of ranges in range table. */ \
751 int count; \
752 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
753 \
754 EXTRACT_NUMBER_AND_INCR (count, range_table); \
755 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
756 } \
757 while (0)
758\f
fa9a63c5
RM
759/* If DEBUG is defined, Regex prints many voluminous messages about what
760 it is doing (if the variable `debug' is nonzero). If linked with the
761 main program in `iregex.c', you can enter patterns and strings
762 interactively. And if linked with the main program in `main.c' and
25fe55af 763 the other test files, you can run the already-written tests. */
fa9a63c5
RM
764
765#ifdef DEBUG
766
767/* We use standard I/O for debugging. */
768#include <stdio.h>
769
770/* It is useful to test things that ``must'' be true when debugging. */
771#include <assert.h>
772
773static int debug = 0;
774
775#define DEBUG_STATEMENT(e) e
776#define DEBUG_PRINT1(x) if (debug) printf (x)
777#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
778#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
779#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
25fe55af 780#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
fa9a63c5
RM
781 if (debug) print_partial_compiled_pattern (s, e)
782#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
783 if (debug) print_double_string (w, s1, sz1, s2, sz2)
784
785
786/* Print the fastmap in human-readable form. */
787
788void
789print_fastmap (fastmap)
790 char *fastmap;
791{
792 unsigned was_a_range = 0;
5e69f11e
RM
793 unsigned i = 0;
794
fa9a63c5
RM
795 while (i < (1 << BYTEWIDTH))
796 {
797 if (fastmap[i++])
798 {
799 was_a_range = 0;
25fe55af
RS
800 putchar (i - 1);
801 while (i < (1 << BYTEWIDTH) && fastmap[i])
802 {
803 was_a_range = 1;
804 i++;
805 }
fa9a63c5 806 if (was_a_range)
25fe55af
RS
807 {
808 printf ("-");
809 putchar (i - 1);
810 }
811 }
fa9a63c5 812 }
5e69f11e 813 putchar ('\n');
fa9a63c5
RM
814}
815
816
817/* Print a compiled pattern string in human-readable form, starting at
818 the START pointer into it and ending just before the pointer END. */
819
820void
821print_partial_compiled_pattern (start, end)
822 unsigned char *start;
823 unsigned char *end;
824{
825 int mcnt, mcnt2;
826 unsigned char *p = start;
827 unsigned char *pend = end;
828
829 if (start == NULL)
830 {
831 printf ("(null)\n");
832 return;
833 }
5e69f11e 834
fa9a63c5
RM
835 /* Loop over pattern commands. */
836 while (p < pend)
837 {
838 printf ("%d:\t", p - start);
839
840 switch ((re_opcode_t) *p++)
841 {
25fe55af
RS
842 case no_op:
843 printf ("/no_op");
844 break;
fa9a63c5
RM
845
846 case exactn:
847 mcnt = *p++;
25fe55af
RS
848 printf ("/exactn/%d", mcnt);
849 do
fa9a63c5 850 {
25fe55af 851 putchar ('/');
fa9a63c5 852 putchar (*p++);
25fe55af
RS
853 }
854 while (--mcnt);
855 break;
fa9a63c5
RM
856
857 case start_memory:
25fe55af
RS
858 mcnt = *p++;
859 printf ("/start_memory/%d/%d", mcnt, *p++);
860 break;
fa9a63c5
RM
861
862 case stop_memory:
25fe55af 863 mcnt = *p++;
fa9a63c5 864 printf ("/stop_memory/%d/%d", mcnt, *p++);
25fe55af 865 break;
fa9a63c5
RM
866
867 case duplicate:
868 printf ("/duplicate/%d", *p++);
869 break;
870
871 case anychar:
872 printf ("/anychar");
873 break;
874
875 case charset:
25fe55af
RS
876 case charset_not:
877 {
878 register int c, last = -100;
fa9a63c5 879 register int in_range = 0;
96cc36cc
RS
880 int length = *p & 0x7f;
881 int has_range_table = *p & 0x80;
882 int range_length = p[length + 2] + p[length + 3] * 0x100;
fa9a63c5
RM
883
884 printf ("/charset [%s",
25fe55af 885 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
5e69f11e 886
25fe55af 887 assert (p + *p < pend);
fa9a63c5 888
25fe55af 889 for (c = 0; c < 256; c++)
96cc36cc 890 if (c / 8 < length
fa9a63c5
RM
891 && (p[1 + (c/8)] & (1 << (c % 8))))
892 {
893 /* Are we starting a range? */
894 if (last + 1 == c && ! in_range)
895 {
896 putchar ('-');
897 in_range = 1;
898 }
899 /* Have we broken a range? */
900 else if (last + 1 != c && in_range)
96cc36cc 901 {
fa9a63c5
RM
902 putchar (last);
903 in_range = 0;
904 }
5e69f11e 905
fa9a63c5
RM
906 if (! in_range)
907 putchar (c);
908
909 last = c;
25fe55af 910 }
fa9a63c5 911
96cc36cc
RS
912 p += 1 + length;
913
fa9a63c5
RM
914 if (in_range)
915 putchar (last);
916
917 putchar (']');
918
96cc36cc
RS
919 if (has_range_table)
920 printf ("has-range-table");
921
922 /* ??? Should print the range table; for now,
923 just skip it. */
924 if (has_range_table)
925 p += 4 + 6 * range_length;
fa9a63c5
RM
926 }
927 break;
928
929 case begline:
930 printf ("/begline");
25fe55af 931 break;
fa9a63c5
RM
932
933 case endline:
25fe55af
RS
934 printf ("/endline");
935 break;
fa9a63c5
RM
936
937 case on_failure_jump:
25fe55af
RS
938 extract_number_and_incr (&mcnt, &p);
939 printf ("/on_failure_jump to %d", p + mcnt - start);
940 break;
fa9a63c5
RM
941
942 case on_failure_keep_string_jump:
25fe55af
RS
943 extract_number_and_incr (&mcnt, &p);
944 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
945 break;
fa9a63c5
RM
946
947 case dummy_failure_jump:
25fe55af
RS
948 extract_number_and_incr (&mcnt, &p);
949 printf ("/dummy_failure_jump to %d", p + mcnt - start);
950 break;
fa9a63c5
RM
951
952 case push_dummy_failure:
25fe55af
RS
953 printf ("/push_dummy_failure");
954 break;
5e69f11e 955
25fe55af
RS
956 case maybe_pop_jump:
957 extract_number_and_incr (&mcnt, &p);
958 printf ("/maybe_pop_jump to %d", p + mcnt - start);
fa9a63c5
RM
959 break;
960
25fe55af 961 case pop_failure_jump:
fa9a63c5 962 extract_number_and_incr (&mcnt, &p);
25fe55af 963 printf ("/pop_failure_jump to %d", p + mcnt - start);
5e69f11e
RM
964 break;
965
25fe55af 966 case jump_past_alt:
fa9a63c5 967 extract_number_and_incr (&mcnt, &p);
25fe55af 968 printf ("/jump_past_alt to %d", p + mcnt - start);
5e69f11e
RM
969 break;
970
25fe55af 971 case jump:
fa9a63c5 972 extract_number_and_incr (&mcnt, &p);
25fe55af 973 printf ("/jump to %d", p + mcnt - start);
fa9a63c5
RM
974 break;
975
25fe55af
RS
976 case succeed_n:
977 extract_number_and_incr (&mcnt, &p);
978 extract_number_and_incr (&mcnt2, &p);
fa9a63c5 979 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2);
25fe55af 980 break;
5e69f11e 981
25fe55af
RS
982 case jump_n:
983 extract_number_and_incr (&mcnt, &p);
984 extract_number_and_incr (&mcnt2, &p);
fa9a63c5 985 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2);
25fe55af 986 break;
5e69f11e 987
25fe55af
RS
988 case set_number_at:
989 extract_number_and_incr (&mcnt, &p);
990 extract_number_and_incr (&mcnt2, &p);
fa9a63c5 991 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2);
25fe55af 992 break;
5e69f11e 993
25fe55af 994 case wordbound:
fa9a63c5
RM
995 printf ("/wordbound");
996 break;
997
998 case notwordbound:
999 printf ("/notwordbound");
25fe55af 1000 break;
fa9a63c5
RM
1001
1002 case wordbeg:
1003 printf ("/wordbeg");
1004 break;
5e69f11e 1005
fa9a63c5
RM
1006 case wordend:
1007 printf ("/wordend");
5e69f11e 1008
fa9a63c5
RM
1009#ifdef emacs
1010 case before_dot:
1011 printf ("/before_dot");
25fe55af 1012 break;
fa9a63c5
RM
1013
1014 case at_dot:
1015 printf ("/at_dot");
25fe55af 1016 break;
fa9a63c5
RM
1017
1018 case after_dot:
1019 printf ("/after_dot");
25fe55af 1020 break;
fa9a63c5
RM
1021
1022 case syntaxspec:
25fe55af 1023 printf ("/syntaxspec");
fa9a63c5
RM
1024 mcnt = *p++;
1025 printf ("/%d", mcnt);
25fe55af 1026 break;
5e69f11e 1027
fa9a63c5 1028 case notsyntaxspec:
25fe55af 1029 printf ("/notsyntaxspec");
fa9a63c5
RM
1030 mcnt = *p++;
1031 printf ("/%d", mcnt);
1032 break;
1033#endif /* emacs */
1034
1035 case wordchar:
1036 printf ("/wordchar");
25fe55af 1037 break;
5e69f11e 1038
fa9a63c5
RM
1039 case notwordchar:
1040 printf ("/notwordchar");
25fe55af 1041 break;
fa9a63c5
RM
1042
1043 case begbuf:
1044 printf ("/begbuf");
25fe55af 1045 break;
fa9a63c5
RM
1046
1047 case endbuf:
1048 printf ("/endbuf");
25fe55af 1049 break;
fa9a63c5 1050
25fe55af
RS
1051 default:
1052 printf ("?%d", *(p-1));
fa9a63c5
RM
1053 }
1054
1055 putchar ('\n');
1056 }
1057
1058 printf ("%d:\tend of pattern.\n", p - start);
1059}
1060
1061
1062void
1063print_compiled_pattern (bufp)
1064 struct re_pattern_buffer *bufp;
1065{
1066 unsigned char *buffer = bufp->buffer;
1067
1068 print_partial_compiled_pattern (buffer, buffer + bufp->used);
1069 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
1070
1071 if (bufp->fastmap_accurate && bufp->fastmap)
1072 {
1073 printf ("fastmap: ");
1074 print_fastmap (bufp->fastmap);
1075 }
1076
1077 printf ("re_nsub: %d\t", bufp->re_nsub);
1078 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1079 printf ("can_be_null: %d\t", bufp->can_be_null);
1080 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1081 printf ("no_sub: %d\t", bufp->no_sub);
1082 printf ("not_bol: %d\t", bufp->not_bol);
1083 printf ("not_eol: %d\t", bufp->not_eol);
1084 printf ("syntax: %d\n", bufp->syntax);
1085 /* Perhaps we should print the translate table? */
1086}
1087
1088
1089void
1090print_double_string (where, string1, size1, string2, size2)
1091 const char *where;
1092 const char *string1;
1093 const char *string2;
1094 int size1;
1095 int size2;
1096{
1097 unsigned this_char;
5e69f11e 1098
fa9a63c5
RM
1099 if (where == NULL)
1100 printf ("(null)");
1101 else
1102 {
1103 if (FIRST_STRING_P (where))
25fe55af
RS
1104 {
1105 for (this_char = where - string1; this_char < size1; this_char++)
1106 putchar (string1[this_char]);
fa9a63c5 1107
25fe55af
RS
1108 where = string2;
1109 }
fa9a63c5
RM
1110
1111 for (this_char = where - string2; this_char < size2; this_char++)
25fe55af 1112 putchar (string2[this_char]);
fa9a63c5
RM
1113 }
1114}
1115
1116#else /* not DEBUG */
1117
1118#undef assert
1119#define assert(e)
1120
1121#define DEBUG_STATEMENT(e)
1122#define DEBUG_PRINT1(x)
1123#define DEBUG_PRINT2(x1, x2)
1124#define DEBUG_PRINT3(x1, x2, x3)
1125#define DEBUG_PRINT4(x1, x2, x3, x4)
1126#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1127#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1128
1129#endif /* not DEBUG */
1130\f
1131/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1132 also be assigned to arbitrarily: each pattern buffer stores its own
1133 syntax, so it can be changed between regex compilations. */
1134/* This has no initializer because initialized variables in Emacs
1135 become read-only after dumping. */
1136reg_syntax_t re_syntax_options;
1137
1138
1139/* Specify the precise syntax of regexps for compilation. This provides
1140 for compatibility for various utilities which historically have
1141 different, incompatible syntaxes.
1142
1143 The argument SYNTAX is a bit mask comprised of the various bits
25fe55af 1144 defined in regex.h. We return the old syntax. */
fa9a63c5
RM
1145
1146reg_syntax_t
1147re_set_syntax (syntax)
1148 reg_syntax_t syntax;
1149{
1150 reg_syntax_t ret = re_syntax_options;
5e69f11e 1151
fa9a63c5
RM
1152 re_syntax_options = syntax;
1153 return ret;
1154}
1155\f
1156/* This table gives an error message for each of the error codes listed
25fe55af 1157 in regex.h. Obviously the order here has to be same as there.
fa9a63c5 1158 POSIX doesn't require that we do anything for REG_NOERROR,
25fe55af 1159 but why not be nice? */
fa9a63c5
RM
1160
1161static const char *re_error_msgid[] =
5e69f11e
RM
1162 {
1163 gettext_noop ("Success"), /* REG_NOERROR */
1164 gettext_noop ("No match"), /* REG_NOMATCH */
1165 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1166 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1167 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1168 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1169 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1170 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1171 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1172 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1173 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1174 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1175 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1176 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1177 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1178 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1179 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
fa9a63c5
RM
1180 };
1181\f
25fe55af 1182/* Avoiding alloca during matching, to placate r_alloc. */
fa9a63c5
RM
1183
1184/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1185 searching and matching functions should not call alloca. On some
1186 systems, alloca is implemented in terms of malloc, and if we're
1187 using the relocating allocator routines, then malloc could cause a
1188 relocation, which might (if the strings being searched are in the
1189 ralloc heap) shift the data out from underneath the regexp
1190 routines.
1191
5e69f11e 1192 Here's another reason to avoid allocation: Emacs
fa9a63c5
RM
1193 processes input from X in a signal handler; processing X input may
1194 call malloc; if input arrives while a matching routine is calling
1195 malloc, then we're scrod. But Emacs can't just block input while
1196 calling matching routines; then we don't notice interrupts when
1197 they come in. So, Emacs blocks input around all regexp calls
1198 except the matching calls, which it leaves unprotected, in the
1199 faith that they will not malloc. */
1200
1201/* Normally, this is fine. */
1202#define MATCH_MAY_ALLOCATE
1203
1204/* When using GNU C, we are not REALLY using the C alloca, no matter
1205 what config.h may say. So don't take precautions for it. */
1206#ifdef __GNUC__
1207#undef C_ALLOCA
1208#endif
1209
1210/* The match routines may not allocate if (1) they would do it with malloc
1211 and (2) it's not safe for them to use malloc.
1212 Note that if REL_ALLOC is defined, matching would not use malloc for the
1213 failure stack, but we would still use it for the register vectors;
25fe55af 1214 so REL_ALLOC should not affect this. */
fa9a63c5
RM
1215#if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1216#undef MATCH_MAY_ALLOCATE
1217#endif
1218
1219\f
1220/* Failure stack declarations and macros; both re_compile_fastmap and
1221 re_match_2 use a failure stack. These have to be macros because of
1222 REGEX_ALLOCATE_STACK. */
5e69f11e 1223
fa9a63c5 1224
320a2a73 1225/* Approximate number of failure points for which to initially allocate space
fa9a63c5
RM
1226 when matching. If this number is exceeded, we allocate more
1227 space, so it is not a hard limit. */
1228#ifndef INIT_FAILURE_ALLOC
320a2a73 1229#define INIT_FAILURE_ALLOC 20
fa9a63c5
RM
1230#endif
1231
1232/* Roughly the maximum number of failure points on the stack. Would be
320a2a73 1233 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
fa9a63c5 1234 This is a variable only so users of regex can assign to it; we never
25fe55af 1235 change it ourselves. */
fa9a63c5 1236#if defined (MATCH_MAY_ALLOCATE)
320a2a73
KH
1237/* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1238 whose default stack limit is 2mb. In order for a larger
1239 value to work reliably, you have to try to make it accord
1240 with the process stack limit. */
1241int re_max_failures = 40000;
fa9a63c5 1242#else
320a2a73 1243int re_max_failures = 4000;
fa9a63c5
RM
1244#endif
1245
1246union fail_stack_elt
1247{
1248 unsigned char *pointer;
1249 int integer;
1250};
1251
1252typedef union fail_stack_elt fail_stack_elt_t;
1253
1254typedef struct
1255{
1256 fail_stack_elt_t *stack;
1257 unsigned size;
1258 unsigned avail; /* Offset of next open position. */
1259} fail_stack_type;
1260
1261#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1262#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1263#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1264
1265
1266/* Define macros to initialize and free the failure stack.
1267 Do `return -2' if the alloc fails. */
1268
1269#ifdef MATCH_MAY_ALLOCATE
1270#define INIT_FAIL_STACK() \
1271 do { \
1272 fail_stack.stack = (fail_stack_elt_t *) \
320a2a73
KH
1273 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1274 * sizeof (fail_stack_elt_t)); \
fa9a63c5
RM
1275 \
1276 if (fail_stack.stack == NULL) \
1277 return -2; \
1278 \
1279 fail_stack.size = INIT_FAILURE_ALLOC; \
1280 fail_stack.avail = 0; \
1281 } while (0)
1282
1283#define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1284#else
1285#define INIT_FAIL_STACK() \
1286 do { \
1287 fail_stack.avail = 0; \
1288 } while (0)
1289
1290#define RESET_FAIL_STACK()
1291#endif
1292
1293
320a2a73
KH
1294/* Double the size of FAIL_STACK, up to a limit
1295 which allows approximately `re_max_failures' items.
fa9a63c5
RM
1296
1297 Return 1 if succeeds, and 0 if either ran out of memory
5e69f11e
RM
1298 allocating space for it or it was already too large.
1299
25fe55af 1300 REGEX_REALLOCATE_STACK requires `destination' be declared. */
fa9a63c5 1301
320a2a73
KH
1302/* Factor to increase the failure stack size by
1303 when we increase it.
1304 This used to be 2, but 2 was too wasteful
1305 because the old discarded stacks added up to as much space
1306 were as ultimate, maximum-size stack. */
1307#define FAIL_STACK_GROWTH_FACTOR 4
1308
1309#define GROW_FAIL_STACK(fail_stack) \
eead07d6
KH
1310 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1311 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
fa9a63c5 1312 ? 0 \
320a2a73
KH
1313 : ((fail_stack).stack \
1314 = (fail_stack_elt_t *) \
25fe55af
RS
1315 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1316 (fail_stack).size * sizeof (fail_stack_elt_t), \
320a2a73
KH
1317 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1318 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1319 * FAIL_STACK_GROWTH_FACTOR))), \
fa9a63c5
RM
1320 \
1321 (fail_stack).stack == NULL \
1322 ? 0 \
6453db45
KH
1323 : ((fail_stack).size \
1324 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1325 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1326 * FAIL_STACK_GROWTH_FACTOR)) \
1327 / sizeof (fail_stack_elt_t)), \
25fe55af 1328 1)))
fa9a63c5
RM
1329
1330
5e69f11e 1331/* Push pointer POINTER on FAIL_STACK.
fa9a63c5
RM
1332 Return 1 if was able to do so and 0 if ran out of memory allocating
1333 space to do so. */
1334#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1335 ((FAIL_STACK_FULL () \
320a2a73 1336 && !GROW_FAIL_STACK (FAIL_STACK)) \
fa9a63c5
RM
1337 ? 0 \
1338 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1339 1))
1340
1341/* Push a pointer value onto the failure stack.
1342 Assumes the variable `fail_stack'. Probably should only
25fe55af 1343 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1344#define PUSH_FAILURE_POINTER(item) \
1345 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1346
1347/* This pushes an integer-valued item onto the failure stack.
1348 Assumes the variable `fail_stack'. Probably should only
25fe55af 1349 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1350#define PUSH_FAILURE_INT(item) \
1351 fail_stack.stack[fail_stack.avail++].integer = (item)
1352
1353/* Push a fail_stack_elt_t value onto the failure stack.
1354 Assumes the variable `fail_stack'. Probably should only
25fe55af 1355 be called from within `PUSH_FAILURE_POINT'. */
fa9a63c5
RM
1356#define PUSH_FAILURE_ELT(item) \
1357 fail_stack.stack[fail_stack.avail++] = (item)
1358
1359/* These three POP... operations complement the three PUSH... operations.
1360 All assume that `fail_stack' is nonempty. */
1361#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1362#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1363#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1364
1365/* Used to omit pushing failure point id's when we're not debugging. */
1366#ifdef DEBUG
1367#define DEBUG_PUSH PUSH_FAILURE_INT
1368#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1369#else
1370#define DEBUG_PUSH(item)
1371#define DEBUG_POP(item_addr)
1372#endif
1373
1374
1375/* Push the information about the state we will need
5e69f11e
RM
1376 if we ever fail back to it.
1377
fa9a63c5 1378 Requires variables fail_stack, regstart, regend, reg_info, and
320a2a73 1379 num_regs be declared. GROW_FAIL_STACK requires `destination' be
fa9a63c5 1380 declared.
5e69f11e 1381
fa9a63c5
RM
1382 Does `return FAILURE_CODE' if runs out of memory. */
1383
1384#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1385 do { \
1386 char *destination; \
1387 /* Must be int, so when we don't save any registers, the arithmetic \
1388 of 0 + -1 isn't done as unsigned. */ \
1389 int this_reg; \
25fe55af 1390 \
fa9a63c5
RM
1391 DEBUG_STATEMENT (failure_id++); \
1392 DEBUG_STATEMENT (nfailure_points_pushed++); \
1393 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1394 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
25fe55af 1395 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
fa9a63c5
RM
1396 \
1397 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
25fe55af 1398 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
fa9a63c5
RM
1399 \
1400 /* Ensure we have enough space allocated for what we will push. */ \
1401 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1402 { \
320a2a73 1403 if (!GROW_FAIL_STACK (fail_stack)) \
25fe55af 1404 return failure_code; \
fa9a63c5 1405 \
25fe55af 1406 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
fa9a63c5 1407 (fail_stack).size); \
25fe55af 1408 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
fa9a63c5
RM
1409 } \
1410 \
1411 /* Push the info, starting with the registers. */ \
1412 DEBUG_PRINT1 ("\n"); \
1413 \
68d96f02 1414 if (1) \
faec11db
RS
1415 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1416 this_reg++) \
1417 { \
1418 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1419 DEBUG_STATEMENT (num_regs_pushed++); \
fa9a63c5 1420 \
faec11db
RS
1421 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1422 PUSH_FAILURE_POINTER (regstart[this_reg]); \
fa9a63c5 1423 \
faec11db
RS
1424 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1425 PUSH_FAILURE_POINTER (regend[this_reg]); \
1426 \
1427 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
1428 DEBUG_PRINT2 (" match_null=%d", \
1429 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1430 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1431 DEBUG_PRINT2 (" matched_something=%d", \
1432 MATCHED_SOMETHING (reg_info[this_reg])); \
1433 DEBUG_PRINT2 (" ever_matched=%d", \
1434 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1435 DEBUG_PRINT1 ("\n"); \
1436 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1437 } \
fa9a63c5
RM
1438 \
1439 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1440 PUSH_FAILURE_INT (lowest_active_reg); \
1441 \
1442 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1443 PUSH_FAILURE_INT (highest_active_reg); \
1444 \
1445 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
1446 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1447 PUSH_FAILURE_POINTER (pattern_place); \
1448 \
1449 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
25fe55af 1450 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
fa9a63c5
RM
1451 size2); \
1452 DEBUG_PRINT1 ("'\n"); \
1453 PUSH_FAILURE_POINTER (string_place); \
1454 \
1455 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1456 DEBUG_PUSH (failure_id); \
1457 } while (0)
1458
1459/* This is the number of items that are pushed and popped on the stack
1460 for each register. */
1461#define NUM_REG_ITEMS 3
1462
1463/* Individual items aside from the registers. */
1464#ifdef DEBUG
1465#define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1466#else
1467#define NUM_NONREG_ITEMS 4
1468#endif
1469
320a2a73
KH
1470/* Estimate the size of data pushed by a typical failure stack entry.
1471 An estimate is all we need, because all we use this for
1472 is to choose a limit for how big to make the failure stack. */
1473
1474#define TYPICAL_FAILURE_SIZE 20
fa9a63c5 1475
320a2a73
KH
1476/* This is how many items we actually use for a failure point.
1477 It depends on the regexp. */
faec11db 1478#define NUM_FAILURE_ITEMS \
68d96f02 1479 (((0 \
faec11db
RS
1480 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1481 * NUM_REG_ITEMS) \
1482 + NUM_NONREG_ITEMS)
fa9a63c5
RM
1483
1484/* How many items can still be added to the stack without overflowing it. */
1485#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1486
1487
1488/* Pops what PUSH_FAIL_STACK pushes.
1489
1490 We restore into the parameters, all of which should be lvalues:
1491 STR -- the saved data position.
1492 PAT -- the saved pattern position.
1493 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1494 REGSTART, REGEND -- arrays of string positions.
1495 REG_INFO -- array of information about each subexpression.
5e69f11e 1496
fa9a63c5 1497 Also assumes the variables `fail_stack' and (if debugging), `bufp',
25fe55af 1498 `pend', `string1', `size1', `string2', and `size2'. */
fa9a63c5
RM
1499
1500#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1501{ \
1502 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
1503 int this_reg; \
1504 const unsigned char *string_temp; \
1505 \
1506 assert (!FAIL_STACK_EMPTY ()); \
1507 \
1508 /* Remove failure points and point to how many regs pushed. */ \
1509 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1510 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
25fe55af 1511 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
fa9a63c5
RM
1512 \
1513 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1514 \
d9e455fb 1515 DEBUG_POP (&failure_id.integer); \
7d0ff466 1516 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id.integer); \
fa9a63c5
RM
1517 \
1518 /* If the saved string location is NULL, it came from an \
1519 on_failure_keep_string_jump opcode, and we want to throw away the \
1520 saved NULL, thus retaining our current position in the string. */ \
1521 string_temp = POP_FAILURE_POINTER (); \
1522 if (string_temp != NULL) \
1523 str = (const char *) string_temp; \
1524 \
1525 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
1526 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1527 DEBUG_PRINT1 ("'\n"); \
1528 \
1529 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1530 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
1531 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1532 \
1533 /* Restore register info. */ \
1534 high_reg = (unsigned) POP_FAILURE_INT (); \
1535 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1536 \
1537 low_reg = (unsigned) POP_FAILURE_INT (); \
1538 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1539 \
68d96f02 1540 if (1) \
faec11db
RS
1541 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1542 { \
25fe55af 1543 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
fa9a63c5 1544 \
faec11db 1545 reg_info[this_reg].word = POP_FAILURE_ELT (); \
25fe55af 1546 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
fa9a63c5 1547 \
faec11db 1548 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
25fe55af 1549 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
fa9a63c5 1550 \
faec11db 1551 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
25fe55af 1552 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
faec11db 1553 } \
6676cb1c
RS
1554 else \
1555 { \
1556 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1557 { \
4fcecab1 1558 reg_info[this_reg].word.integer = 0; \
6676cb1c
RS
1559 regend[this_reg] = 0; \
1560 regstart[this_reg] = 0; \
1561 } \
1562 highest_active_reg = high_reg; \
1563 } \
fa9a63c5
RM
1564 \
1565 set_regs_matched_done = 0; \
1566 DEBUG_STATEMENT (nfailure_points_popped++); \
1567} /* POP_FAILURE_POINT */
1568
1569
1570\f
1571/* Structure for per-register (a.k.a. per-group) information.
1572 Other register information, such as the
1573 starting and ending positions (which are addresses), and the list of
1574 inner groups (which is a bits list) are maintained in separate
5e69f11e
RM
1575 variables.
1576
fa9a63c5
RM
1577 We are making a (strictly speaking) nonportable assumption here: that
1578 the compiler will pack our bit fields into something that fits into
1579 the type of `word', i.e., is something that fits into one item on the
1580 failure stack. */
1581
1582typedef union
1583{
1584 fail_stack_elt_t word;
1585 struct
1586 {
1587 /* This field is one if this group can match the empty string,
25fe55af 1588 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
fa9a63c5
RM
1589#define MATCH_NULL_UNSET_VALUE 3
1590 unsigned match_null_string_p : 2;
1591 unsigned is_active : 1;
1592 unsigned matched_something : 1;
1593 unsigned ever_matched_something : 1;
1594 } bits;
1595} register_info_type;
1596
1597#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1598#define IS_ACTIVE(R) ((R).bits.is_active)
1599#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1600#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1601
1602
1603/* Call this when have matched a real character; it sets `matched' flags
1604 for the subexpressions which we are currently inside. Also records
1605 that those subexprs have matched. */
1606#define SET_REGS_MATCHED() \
1607 do \
1608 { \
1609 if (!set_regs_matched_done) \
1610 { \
1611 unsigned r; \
1612 set_regs_matched_done = 1; \
1613 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1614 { \
1615 MATCHED_SOMETHING (reg_info[r]) \
1616 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1617 = 1; \
1618 } \
1619 } \
1620 } \
1621 while (0)
1622
1623/* Registers are set to a sentinel when they haven't yet matched. */
1624static char reg_unset_dummy;
1625#define REG_UNSET_VALUE (&reg_unset_dummy)
1626#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1627\f
1628/* Subroutine declarations and macros for regex_compile. */
1629
1630static void store_op1 (), store_op2 ();
1631static void insert_op1 (), insert_op2 ();
1632static boolean at_begline_loc_p (), at_endline_loc_p ();
1633static boolean group_in_compile_stack ();
1634static reg_errcode_t compile_range ();
1635
5e69f11e 1636/* Fetch the next character in the uncompiled pattern---translating it
fa9a63c5
RM
1637 if necessary. Also cast from a signed character in the constant
1638 string passed to us by the user to an unsigned char that we can use
1639 as an array index (in, e.g., `translate'). */
6676cb1c 1640#ifndef PATFETCH
fa9a63c5
RM
1641#define PATFETCH(c) \
1642 do {if (p == pend) return REG_EEND; \
1643 c = (unsigned char) *p++; \
28703c16 1644 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
fa9a63c5 1645 } while (0)
6676cb1c 1646#endif
fa9a63c5
RM
1647
1648/* Fetch the next character in the uncompiled pattern, with no
25fe55af 1649 translation. */
fa9a63c5
RM
1650#define PATFETCH_RAW(c) \
1651 do {if (p == pend) return REG_EEND; \
25fe55af 1652 c = (unsigned char) *p++; \
fa9a63c5
RM
1653 } while (0)
1654
1655/* Go backwards one character in the pattern. */
1656#define PATUNFETCH p--
1657
1658
1659/* If `translate' is non-null, return translate[D], else just D. We
1660 cast the subscript to translate because some data is declared as
1661 `char *', to avoid warnings when a string constant is passed. But
1662 when we use a character as a subscript we must make it unsigned. */
6676cb1c
RS
1663#ifndef TRANSLATE
1664#define TRANSLATE(d) \
28703c16
AS
1665 (RE_TRANSLATE_P (translate) \
1666 ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d))
6676cb1c 1667#endif
fa9a63c5
RM
1668
1669
1670/* Macros for outputting the compiled pattern into `buffer'. */
1671
1672/* If the buffer isn't allocated when it comes in, use this. */
1673#define INIT_BUF_SIZE 32
1674
25fe55af 1675/* Make sure we have at least N more bytes of space in buffer. */
fa9a63c5
RM
1676#define GET_BUFFER_SPACE(n) \
1677 while (b - bufp->buffer + (n) > bufp->allocated) \
1678 EXTEND_BUFFER ()
1679
1680/* Make sure we have one more byte of buffer space and then add C to it. */
1681#define BUF_PUSH(c) \
1682 do { \
1683 GET_BUFFER_SPACE (1); \
1684 *b++ = (unsigned char) (c); \
1685 } while (0)
1686
1687
1688/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1689#define BUF_PUSH_2(c1, c2) \
1690 do { \
1691 GET_BUFFER_SPACE (2); \
1692 *b++ = (unsigned char) (c1); \
1693 *b++ = (unsigned char) (c2); \
1694 } while (0)
1695
1696
25fe55af 1697/* As with BUF_PUSH_2, except for three bytes. */
fa9a63c5
RM
1698#define BUF_PUSH_3(c1, c2, c3) \
1699 do { \
1700 GET_BUFFER_SPACE (3); \
1701 *b++ = (unsigned char) (c1); \
1702 *b++ = (unsigned char) (c2); \
1703 *b++ = (unsigned char) (c3); \
1704 } while (0)
1705
1706
1707/* Store a jump with opcode OP at LOC to location TO. We store a
25fe55af 1708 relative address offset by the three bytes the jump itself occupies. */
fa9a63c5
RM
1709#define STORE_JUMP(op, loc, to) \
1710 store_op1 (op, loc, (to) - (loc) - 3)
1711
1712/* Likewise, for a two-argument jump. */
1713#define STORE_JUMP2(op, loc, to, arg) \
1714 store_op2 (op, loc, (to) - (loc) - 3, arg)
1715
25fe55af 1716/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
fa9a63c5
RM
1717#define INSERT_JUMP(op, loc, to) \
1718 insert_op1 (op, loc, (to) - (loc) - 3, b)
1719
1720/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1721#define INSERT_JUMP2(op, loc, to, arg) \
1722 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1723
1724
1725/* This is not an arbitrary limit: the arguments which represent offsets
25fe55af 1726 into the pattern are two bytes long. So if 2^16 bytes turns out to
fa9a63c5
RM
1727 be too small, many things would have to change. */
1728#define MAX_BUF_SIZE (1L << 16)
1729
1730
1731/* Extend the buffer by twice its current size via realloc and
1732 reset the pointers that pointed into the old block to point to the
1733 correct places in the new one. If extending the buffer results in it
25fe55af 1734 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
fa9a63c5 1735#define EXTEND_BUFFER() \
25fe55af 1736 do { \
fa9a63c5 1737 unsigned char *old_buffer = bufp->buffer; \
25fe55af 1738 if (bufp->allocated == MAX_BUF_SIZE) \
fa9a63c5
RM
1739 return REG_ESIZE; \
1740 bufp->allocated <<= 1; \
1741 if (bufp->allocated > MAX_BUF_SIZE) \
25fe55af 1742 bufp->allocated = MAX_BUF_SIZE; \
fa9a63c5
RM
1743 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1744 if (bufp->buffer == NULL) \
1745 return REG_ESPACE; \
1746 /* If the buffer moved, move all the pointers into it. */ \
1747 if (old_buffer != bufp->buffer) \
1748 { \
25fe55af
RS
1749 b = (b - old_buffer) + bufp->buffer; \
1750 begalt = (begalt - old_buffer) + bufp->buffer; \
1751 if (fixup_alt_jump) \
1752 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1753 if (laststart) \
1754 laststart = (laststart - old_buffer) + bufp->buffer; \
1755 if (pending_exact) \
1756 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
fa9a63c5
RM
1757 } \
1758 } while (0)
1759
1760
1761/* Since we have one byte reserved for the register number argument to
1762 {start,stop}_memory, the maximum number of groups we can report
1763 things about is what fits in that byte. */
1764#define MAX_REGNUM 255
1765
1766/* But patterns can have more than `MAX_REGNUM' registers. We just
1767 ignore the excess. */
1768typedef unsigned regnum_t;
1769
1770
1771/* Macros for the compile stack. */
1772
1773/* Since offsets can go either forwards or backwards, this type needs to
25fe55af 1774 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
fa9a63c5
RM
1775typedef int pattern_offset_t;
1776
1777typedef struct
1778{
1779 pattern_offset_t begalt_offset;
1780 pattern_offset_t fixup_alt_jump;
1781 pattern_offset_t inner_group_offset;
5e69f11e 1782 pattern_offset_t laststart_offset;
fa9a63c5
RM
1783 regnum_t regnum;
1784} compile_stack_elt_t;
1785
1786
1787typedef struct
1788{
1789 compile_stack_elt_t *stack;
1790 unsigned size;
1791 unsigned avail; /* Offset of next open position. */
1792} compile_stack_type;
1793
1794
1795#define INIT_COMPILE_STACK_SIZE 32
1796
1797#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1798#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1799
25fe55af 1800/* The next available element. */
fa9a63c5
RM
1801#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1802
1803
b18215fc
RS
1804/* Structure to manage work area for range table. */
1805struct range_table_work_area
1806{
1807 int *table; /* actual work area. */
1808 int allocated; /* allocated size for work area in bytes. */
25fe55af 1809 int used; /* actually used size in words. */
96cc36cc 1810 int bits; /* flag to record character classes */
b18215fc
RS
1811};
1812
1813/* Make sure that WORK_AREA can hold more N multibyte characters. */
1814#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1815 do { \
1816 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1817 { \
1818 (work_area).allocated += 16 * sizeof (int); \
1819 if ((work_area).table) \
1820 (work_area).table \
1821 = (int *) realloc ((work_area).table, (work_area).allocated); \
1822 else \
1823 (work_area).table \
1824 = (int *) malloc ((work_area).allocated); \
1825 if ((work_area).table == 0) \
1826 FREE_STACK_RETURN (REG_ESPACE); \
1827 } \
1828 } while (0)
1829
96cc36cc
RS
1830#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1831 (work_area).bits |= (bit)
1832
1833/* These bits represent the various character classes such as [:alnum:]
1834 in a charset's range table. */
1835#define BIT_ALNUM 0x1
1836#define BIT_ALPHA 0x2
1837#define BIT_WORD 0x4
f71b19b6
DL
1838#define BIT_ASCII 0x8
1839#define BIT_NONASCII 0x10
96cc36cc
RS
1840#define BIT_GRAPH 0x20
1841#define BIT_LOWER 0x40
1842#define BIT_PRINT 0x80
1843#define BIT_PUNCT 0x100
1844#define BIT_SPACE 0x200
1845#define BIT_UPPER 0x400
f71b19b6
DL
1846#define BIT_UNIBYTE 0x800
1847#define BIT_MULTIBYTE 0x1000
96cc36cc 1848
b18215fc
RS
1849/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1850#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1851 do { \
1852 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1853 (work_area).table[(work_area).used++] = (range_start); \
1854 (work_area).table[(work_area).used++] = (range_end); \
1855 } while (0)
1856
25fe55af 1857/* Free allocated memory for WORK_AREA. */
b18215fc
RS
1858#define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1859 do { \
1860 if ((work_area).table) \
1861 free ((work_area).table); \
1862 } while (0)
1863
96cc36cc 1864#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
b18215fc 1865#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
96cc36cc 1866#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
b18215fc
RS
1867#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1868
1869
fa9a63c5 1870/* Set the bit for character C in a list. */
25fe55af
RS
1871#define SET_LIST_BIT(c) \
1872 (b[((unsigned char) (c)) / BYTEWIDTH] \
fa9a63c5
RM
1873 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1874
1875
1876/* Get the next unsigned number in the uncompiled pattern. */
25fe55af 1877#define GET_UNSIGNED_NUMBER(num) \
fa9a63c5
RM
1878 { if (p != pend) \
1879 { \
25fe55af
RS
1880 PATFETCH (c); \
1881 while (ISDIGIT (c)) \
1882 { \
1883 if (num < 0) \
1884 num = 0; \
1885 num = num * 10 + c - '0'; \
1886 if (p == pend) \
1887 break; \
1888 PATFETCH (c); \
1889 } \
1890 } \
5e69f11e 1891 }
fa9a63c5
RM
1892
1893#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1894
1895#define IS_CHAR_CLASS(string) \
1896 (STREQ (string, "alpha") || STREQ (string, "upper") \
1897 || STREQ (string, "lower") || STREQ (string, "digit") \
1898 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1899 || STREQ (string, "space") || STREQ (string, "print") \
1900 || STREQ (string, "punct") || STREQ (string, "graph") \
96cc36cc 1901 || STREQ (string, "cntrl") || STREQ (string, "blank") \
f71b19b6
DL
1902 || STREQ (string, "word") \
1903 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1904 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
fa9a63c5
RM
1905\f
1906#ifndef MATCH_MAY_ALLOCATE
1907
1908/* If we cannot allocate large objects within re_match_2_internal,
1909 we make the fail stack and register vectors global.
1910 The fail stack, we grow to the maximum size when a regexp
1911 is compiled.
1912 The register vectors, we adjust in size each time we
1913 compile a regexp, according to the number of registers it needs. */
1914
1915static fail_stack_type fail_stack;
1916
1917/* Size with which the following vectors are currently allocated.
1918 That is so we can make them bigger as needed,
25fe55af 1919 but never make them smaller. */
fa9a63c5
RM
1920static int regs_allocated_size;
1921
25fe55af 1922static const char ** regstart, ** regend;
fa9a63c5
RM
1923static const char ** old_regstart, ** old_regend;
1924static const char **best_regstart, **best_regend;
5e69f11e 1925static register_info_type *reg_info;
fa9a63c5
RM
1926static const char **reg_dummy;
1927static register_info_type *reg_info_dummy;
1928
1929/* Make the register vectors big enough for NUM_REGS registers,
25fe55af 1930 but don't make them smaller. */
fa9a63c5
RM
1931
1932static
1933regex_grow_registers (num_regs)
1934 int num_regs;
1935{
1936 if (num_regs > regs_allocated_size)
1937 {
1938 RETALLOC_IF (regstart, num_regs, const char *);
1939 RETALLOC_IF (regend, num_regs, const char *);
1940 RETALLOC_IF (old_regstart, num_regs, const char *);
1941 RETALLOC_IF (old_regend, num_regs, const char *);
1942 RETALLOC_IF (best_regstart, num_regs, const char *);
1943 RETALLOC_IF (best_regend, num_regs, const char *);
1944 RETALLOC_IF (reg_info, num_regs, register_info_type);
1945 RETALLOC_IF (reg_dummy, num_regs, const char *);
1946 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1947
1948 regs_allocated_size = num_regs;
1949 }
1950}
1951
1952#endif /* not MATCH_MAY_ALLOCATE */
1953\f
1954/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1955 Returns one of error codes defined in `regex.h', or zero for success.
1956
1957 Assumes the `allocated' (and perhaps `buffer') and `translate'
1958 fields are set in BUFP on entry.
1959
1960 If it succeeds, results are put in BUFP (if it returns an error, the
1961 contents of BUFP are undefined):
1962 `buffer' is the compiled pattern;
1963 `syntax' is set to SYNTAX;
1964 `used' is set to the length of the compiled pattern;
1965 `fastmap_accurate' is zero;
1966 `re_nsub' is the number of subexpressions in PATTERN;
1967 `not_bol' and `not_eol' are zero;
5e69f11e 1968
fa9a63c5
RM
1969 The `fastmap' and `newline_anchor' fields are neither
1970 examined nor set. */
1971
1972/* Return, freeing storage we allocated. */
1973#define FREE_STACK_RETURN(value) \
b18215fc
RS
1974 do { \
1975 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1976 free (compile_stack.stack); \
1977 return value; \
1978 } while (0)
fa9a63c5
RM
1979
1980static reg_errcode_t
1981regex_compile (pattern, size, syntax, bufp)
1982 const char *pattern;
1983 int size;
1984 reg_syntax_t syntax;
1985 struct re_pattern_buffer *bufp;
1986{
1987 /* We fetch characters from PATTERN here. Even though PATTERN is
1988 `char *' (i.e., signed), we declare these variables as unsigned, so
1989 they can be reliably used as array indices. */
b18215fc 1990 register unsigned int c, c1;
5e69f11e 1991
fa9a63c5
RM
1992 /* A random temporary spot in PATTERN. */
1993 const char *p1;
1994
1995 /* Points to the end of the buffer, where we should append. */
1996 register unsigned char *b;
5e69f11e 1997
fa9a63c5
RM
1998 /* Keeps track of unclosed groups. */
1999 compile_stack_type compile_stack;
2000
2001 /* Points to the current (ending) position in the pattern. */
22336245
RS
2002#ifdef AIX
2003 /* `const' makes AIX compiler fail. */
2004 char *p = pattern;
2005#else
fa9a63c5 2006 const char *p = pattern;
22336245 2007#endif
fa9a63c5 2008 const char *pend = pattern + size;
5e69f11e 2009
fa9a63c5 2010 /* How to translate the characters in the pattern. */
6676cb1c 2011 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
2012
2013 /* Address of the count-byte of the most recently inserted `exactn'
2014 command. This makes it possible to tell if a new exact-match
2015 character can be added to that command or if the character requires
2016 a new `exactn' command. */
2017 unsigned char *pending_exact = 0;
2018
2019 /* Address of start of the most recently finished expression.
2020 This tells, e.g., postfix * where to find the start of its
2021 operand. Reset at the beginning of groups and alternatives. */
2022 unsigned char *laststart = 0;
2023
2024 /* Address of beginning of regexp, or inside of last group. */
2025 unsigned char *begalt;
2026
2027 /* Place in the uncompiled pattern (i.e., the {) to
2028 which to go back if the interval is invalid. */
2029 const char *beg_interval;
5e69f11e 2030
fa9a63c5 2031 /* Address of the place where a forward jump should go to the end of
25fe55af 2032 the containing expression. Each alternative of an `or' -- except the
fa9a63c5
RM
2033 last -- ends with a forward jump of this sort. */
2034 unsigned char *fixup_alt_jump = 0;
2035
2036 /* Counts open-groups as they are encountered. Remembered for the
2037 matching close-group on the compile stack, so the same register
2038 number is put in the stop_memory as the start_memory. */
2039 regnum_t regnum = 0;
2040
b18215fc
RS
2041 /* Work area for range table of charset. */
2042 struct range_table_work_area range_table_work;
2043
fa9a63c5
RM
2044#ifdef DEBUG
2045 DEBUG_PRINT1 ("\nCompiling pattern: ");
2046 if (debug)
2047 {
2048 unsigned debug_count;
5e69f11e 2049
fa9a63c5 2050 for (debug_count = 0; debug_count < size; debug_count++)
25fe55af 2051 putchar (pattern[debug_count]);
fa9a63c5
RM
2052 putchar ('\n');
2053 }
2054#endif /* DEBUG */
2055
2056 /* Initialize the compile stack. */
2057 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2058 if (compile_stack.stack == NULL)
2059 return REG_ESPACE;
2060
2061 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2062 compile_stack.avail = 0;
2063
b18215fc
RS
2064 range_table_work.table = 0;
2065 range_table_work.allocated = 0;
2066
fa9a63c5
RM
2067 /* Initialize the pattern buffer. */
2068 bufp->syntax = syntax;
2069 bufp->fastmap_accurate = 0;
2070 bufp->not_bol = bufp->not_eol = 0;
2071
2072 /* Set `used' to zero, so that if we return an error, the pattern
2073 printer (for debugging) will think there's no pattern. We reset it
2074 at the end. */
2075 bufp->used = 0;
5e69f11e 2076
fa9a63c5 2077 /* Always count groups, whether or not bufp->no_sub is set. */
5e69f11e 2078 bufp->re_nsub = 0;
fa9a63c5 2079
b18215fc
RS
2080#ifdef emacs
2081 /* bufp->multibyte is set before regex_compile is called, so don't alter
2082 it. */
2083#else /* not emacs */
2084 /* Nothing is recognized as a multibyte character. */
2085 bufp->multibyte = 0;
2086#endif
2087
fa9a63c5
RM
2088#if !defined (emacs) && !defined (SYNTAX_TABLE)
2089 /* Initialize the syntax table. */
2090 init_syntax_once ();
2091#endif
2092
2093 if (bufp->allocated == 0)
2094 {
2095 if (bufp->buffer)
2096 { /* If zero allocated, but buffer is non-null, try to realloc
25fe55af
RS
2097 enough space. This loses if buffer's address is bogus, but
2098 that is the user's responsibility. */
2099 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2100 }
fa9a63c5 2101 else
25fe55af
RS
2102 { /* Caller did not allocate a buffer. Do it for them. */
2103 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2104 }
fa9a63c5
RM
2105 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2106
2107 bufp->allocated = INIT_BUF_SIZE;
2108 }
2109
2110 begalt = b = bufp->buffer;
2111
2112 /* Loop through the uncompiled pattern until we're at the end. */
2113 while (p != pend)
2114 {
2115 PATFETCH (c);
2116
2117 switch (c)
25fe55af
RS
2118 {
2119 case '^':
2120 {
2121 if ( /* If at start of pattern, it's an operator. */
2122 p == pattern + 1
2123 /* If context independent, it's an operator. */
2124 || syntax & RE_CONTEXT_INDEP_ANCHORS
2125 /* Otherwise, depends on what's come before. */
2126 || at_begline_loc_p (pattern, p, syntax))
2127 BUF_PUSH (begline);
2128 else
2129 goto normal_char;
2130 }
2131 break;
2132
2133
2134 case '$':
2135 {
2136 if ( /* If at end of pattern, it's an operator. */
2137 p == pend
2138 /* If context independent, it's an operator. */
2139 || syntax & RE_CONTEXT_INDEP_ANCHORS
2140 /* Otherwise, depends on what's next. */
2141 || at_endline_loc_p (p, pend, syntax))
2142 BUF_PUSH (endline);
2143 else
2144 goto normal_char;
2145 }
2146 break;
fa9a63c5
RM
2147
2148
2149 case '+':
25fe55af
RS
2150 case '?':
2151 if ((syntax & RE_BK_PLUS_QM)
2152 || (syntax & RE_LIMITED_OPS))
2153 goto normal_char;
2154 handle_plus:
2155 case '*':
2156 /* If there is no previous pattern... */
2157 if (!laststart)
2158 {
2159 if (syntax & RE_CONTEXT_INVALID_OPS)
2160 FREE_STACK_RETURN (REG_BADRPT);
2161 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2162 goto normal_char;
2163 }
2164
2165 {
2166 /* Are we optimizing this jump? */
2167 boolean keep_string_p = false;
2168
2169 /* 1 means zero (many) matches is allowed. */
2170 char zero_times_ok = 0, many_times_ok = 0;
2171
2172 /* If there is a sequence of repetition chars, collapse it
2173 down to just one (the right one). We can't combine
2174 interval operators with these because of, e.g., `a{2}*',
2175 which should only match an even number of `a's. */
2176
2177 for (;;)
2178 {
2179 zero_times_ok |= c != '+';
2180 many_times_ok |= c != '?';
2181
2182 if (p == pend)
2183 break;
2184
2185 PATFETCH (c);
2186
2187 if (c == '*'
2188 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2189 ;
2190
2191 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2192 {
2193 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2194
2195 PATFETCH (c1);
2196 if (!(c1 == '+' || c1 == '?'))
2197 {
2198 PATUNFETCH;
2199 PATUNFETCH;
2200 break;
2201 }
2202
2203 c = c1;
2204 }
2205 else
2206 {
2207 PATUNFETCH;
2208 break;
2209 }
2210
2211 /* If we get here, we found another repeat character. */
2212 }
2213
2214 /* Star, etc. applied to an empty pattern is equivalent
2215 to an empty pattern. */
2216 if (!laststart)
2217 break;
2218
2219 /* Now we know whether or not zero matches is allowed
2220 and also whether or not two or more matches is allowed. */
2221 if (many_times_ok)
2222 { /* More than one repetition is allowed, so put in at the
2223 end a backward relative jump from `b' to before the next
2224 jump we're going to put in below (which jumps from
2225 laststart to after this jump).
2226
2227 But if we are at the `*' in the exact sequence `.*\n',
2228 insert an unconditional jump backwards to the .,
2229 instead of the beginning of the loop. This way we only
2230 push a failure point once, instead of every time
2231 through the loop. */
2232 assert (p - 1 > pattern);
2233
2234 /* Allocate the space for the jump. */
2235 GET_BUFFER_SPACE (3);
2236
2237 /* We know we are not at the first character of the pattern,
2238 because laststart was nonzero. And we've already
2239 incremented `p', by the way, to be the character after
2240 the `*'. Do we have to do something analogous here
2241 for null bytes, because of RE_DOT_NOT_NULL? */
e934739e 2242 if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.')
fa9a63c5 2243 && zero_times_ok
e934739e
RS
2244 && p < pend
2245 && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n')
25fe55af
RS
2246 && !(syntax & RE_DOT_NEWLINE))
2247 { /* We have .*\n. */
2248 STORE_JUMP (jump, b, laststart);
2249 keep_string_p = true;
2250 }
2251 else
2252 /* Anything else. */
2253 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2254
2255 /* We've added more stuff to the buffer. */
2256 b += 3;
2257 }
2258
2259 /* On failure, jump from laststart to b + 3, which will be the
2260 end of the buffer after this jump is inserted. */
2261 GET_BUFFER_SPACE (3);
2262 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2263 : on_failure_jump,
2264 laststart, b + 3);
2265 pending_exact = 0;
2266 b += 3;
2267
2268 if (!zero_times_ok)
2269 {
2270 /* At least one repetition is required, so insert a
2271 `dummy_failure_jump' before the initial
2272 `on_failure_jump' instruction of the loop. This
2273 effects a skip over that instruction the first time
2274 we hit that loop. */
2275 GET_BUFFER_SPACE (3);
2276 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2277 b += 3;
2278 }
2279 }
fa9a63c5
RM
2280 break;
2281
2282
2283 case '.':
25fe55af
RS
2284 laststart = b;
2285 BUF_PUSH (anychar);
2286 break;
fa9a63c5
RM
2287
2288
25fe55af
RS
2289 case '[':
2290 {
b18215fc 2291 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2292
25fe55af 2293 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2294
25fe55af
RS
2295 /* Ensure that we have enough space to push a charset: the
2296 opcode, the length count, and the bitset; 34 bytes in all. */
fa9a63c5
RM
2297 GET_BUFFER_SPACE (34);
2298
25fe55af 2299 laststart = b;
e318085a 2300
25fe55af
RS
2301 /* We test `*p == '^' twice, instead of using an if
2302 statement, so we only need one BUF_PUSH. */
2303 BUF_PUSH (*p == '^' ? charset_not : charset);
2304 if (*p == '^')
2305 p++;
e318085a 2306
25fe55af
RS
2307 /* Remember the first position in the bracket expression. */
2308 p1 = p;
e318085a 2309
25fe55af
RS
2310 /* Push the number of bytes in the bitmap. */
2311 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2312
25fe55af
RS
2313 /* Clear the whole map. */
2314 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
e318085a 2315
25fe55af
RS
2316 /* charset_not matches newline according to a syntax bit. */
2317 if ((re_opcode_t) b[-2] == charset_not
2318 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2319 SET_LIST_BIT ('\n');
fa9a63c5 2320
25fe55af
RS
2321 /* Read in characters and ranges, setting map bits. */
2322 for (;;)
2323 {
b18215fc
RS
2324 int len;
2325 boolean escaped_char = false;
e318085a 2326
25fe55af 2327 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
e318085a 2328
25fe55af 2329 PATFETCH (c);
e318085a 2330
25fe55af
RS
2331 /* \ might escape characters inside [...] and [^...]. */
2332 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2333 {
2334 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
e318085a
RS
2335
2336 PATFETCH (c);
b18215fc 2337 escaped_char = true;
25fe55af 2338 }
b18215fc
RS
2339 else
2340 {
657fcfbd
RS
2341 /* Could be the end of the bracket expression. If it's
2342 not (i.e., when the bracket expression is `[]' so
2343 far), the ']' character bit gets set way below. */
2344 if (c == ']' && p != p1 + 1)
2345 break;
25fe55af 2346 }
b18215fc
RS
2347
2348 /* If C indicates start of multibyte char, get the
2349 actual character code in C, and set the pattern
2350 pointer P to the next character boundary. */
2351 if (bufp->multibyte && BASE_LEADING_CODE_P (c))
2352 {
2353 PATUNFETCH;
2354 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2355 p += len;
25fe55af 2356 }
b18215fc
RS
2357 /* What should we do for the character which is
2358 greater than 0x7F, but not BASE_LEADING_CODE_P?
2359 XXX */
2360
25fe55af
RS
2361 /* See if we're at the beginning of a possible character
2362 class. */
b18215fc
RS
2363
2364 else if (!escaped_char &&
2365 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
657fcfbd
RS
2366 {
2367 /* Leave room for the null. */
25fe55af 2368 char str[CHAR_CLASS_MAX_LENGTH + 1];
b18215fc 2369
25fe55af
RS
2370 PATFETCH (c);
2371 c1 = 0;
b18215fc 2372
25fe55af
RS
2373 /* If pattern is `[[:'. */
2374 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
b18215fc 2375
25fe55af
RS
2376 for (;;)
2377 {
2378 PATFETCH (c);
2379 if (c == ':' || c == ']' || p == pend
2380 || c1 == CHAR_CLASS_MAX_LENGTH)
2381 break;
2382 str[c1++] = c;
2383 }
2384 str[c1] = '\0';
b18215fc
RS
2385
2386 /* If isn't a word bracketed by `[:' and `:]':
2387 undo the ending character, the letters, and
2388 leave the leading `:' and `[' (but set bits for
2389 them). */
25fe55af
RS
2390 if (c == ':' && *p == ']')
2391 {
2392 int ch;
2393 boolean is_alnum = STREQ (str, "alnum");
2394 boolean is_alpha = STREQ (str, "alpha");
f71b19b6 2395 boolean is_ascii = STREQ (str, "ascii");
25fe55af
RS
2396 boolean is_blank = STREQ (str, "blank");
2397 boolean is_cntrl = STREQ (str, "cntrl");
2398 boolean is_digit = STREQ (str, "digit");
2399 boolean is_graph = STREQ (str, "graph");
2400 boolean is_lower = STREQ (str, "lower");
f71b19b6
DL
2401 boolean is_multibyte = STREQ (str, "multibyte");
2402 boolean is_nonascii = STREQ (str, "nonascii");
25fe55af
RS
2403 boolean is_print = STREQ (str, "print");
2404 boolean is_punct = STREQ (str, "punct");
2405 boolean is_space = STREQ (str, "space");
f71b19b6 2406 boolean is_unibyte = STREQ (str, "unibyte");
25fe55af 2407 boolean is_upper = STREQ (str, "upper");
96cc36cc 2408 boolean is_word = STREQ (str, "word");
f71b19b6 2409 boolean is_xdigit = STREQ (str, "xdigit");
25fe55af
RS
2410
2411 if (!IS_CHAR_CLASS (str))
fa9a63c5
RM
2412 FREE_STACK_RETURN (REG_ECTYPE);
2413
25fe55af
RS
2414 /* Throw away the ] at the end of the character
2415 class. */
2416 PATFETCH (c);
fa9a63c5 2417
25fe55af 2418 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
fa9a63c5 2419
96cc36cc
RS
2420 /* Most character classes in a multibyte match
2421 just set a flag. Exceptions are is_blank,
2422 is_digit, is_cntrl, and is_xdigit, since
2423 they can only match ASCII characters. We
2424 don't need to handle them for multibyte. */
2425
2426 if (bufp->multibyte)
2427 {
2428 int bit = 0;
2429
2430 if (is_alnum) bit = BIT_ALNUM;
2431 if (is_alpha) bit = BIT_ALPHA;
f71b19b6 2432 if (is_ascii) bit = BIT_ASCII;
96cc36cc
RS
2433 if (is_graph) bit = BIT_GRAPH;
2434 if (is_lower) bit = BIT_LOWER;
f71b19b6
DL
2435 if (is_multibyte) bit = BIT_MULTIBYTE;
2436 if (is_nonascii) bit = BIT_NONASCII;
96cc36cc
RS
2437 if (is_print) bit = BIT_PRINT;
2438 if (is_punct) bit = BIT_PUNCT;
2439 if (is_space) bit = BIT_SPACE;
f71b19b6 2440 if (is_unibyte) bit = BIT_UNIBYTE;
96cc36cc
RS
2441 if (is_upper) bit = BIT_UPPER;
2442 if (is_word) bit = BIT_WORD;
2443 if (bit)
2444 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2445 bit);
2446 }
2447
2448 /* Handle character classes for ASCII characters. */
25fe55af
RS
2449 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2450 {
7ae68633 2451 int translated = TRANSLATE (ch);
fa9a63c5
RM
2452 /* This was split into 3 if's to
2453 avoid an arbitrary limit in some compiler. */
25fe55af
RS
2454 if ( (is_alnum && ISALNUM (ch))
2455 || (is_alpha && ISALPHA (ch))
2456 || (is_blank && ISBLANK (ch))
2457 || (is_cntrl && ISCNTRL (ch)))
7ae68633 2458 SET_LIST_BIT (translated);
fa9a63c5 2459 if ( (is_digit && ISDIGIT (ch))
25fe55af
RS
2460 || (is_graph && ISGRAPH (ch))
2461 || (is_lower && ISLOWER (ch))
2462 || (is_print && ISPRINT (ch)))
7ae68633 2463 SET_LIST_BIT (translated);
fa9a63c5 2464 if ( (is_punct && ISPUNCT (ch))
25fe55af
RS
2465 || (is_space && ISSPACE (ch))
2466 || (is_upper && ISUPPER (ch))
2467 || (is_xdigit && ISXDIGIT (ch)))
7ae68633 2468 SET_LIST_BIT (translated);
f71b19b6
DL
2469 if ( (is_ascii && IS_REAL_ASCII (ch))
2470 || (is_nonascii && !IS_REAL_ASCII (ch))
2471 || (is_unibyte && ISUNIBYTE (ch))
2472 || (is_multibyte && !ISUNIBYTE (ch)))
2473 SET_LIST_BIT (translated);
2474
96cc36cc
RS
2475 if ( (is_word && ISWORD (ch)))
2476 SET_LIST_BIT (translated);
25fe55af 2477 }
b18215fc
RS
2478
2479 /* Repeat the loop. */
2480 continue;
25fe55af
RS
2481 }
2482 else
2483 {
2484 c1++;
2485 while (c1--)
2486 PATUNFETCH;
2487 SET_LIST_BIT ('[');
b18215fc
RS
2488
2489 /* Because the `:' may starts the range, we
2490 can't simply set bit and repeat the loop.
25fe55af 2491 Instead, just set it to C and handle below. */
b18215fc 2492 c = ':';
25fe55af
RS
2493 }
2494 }
b18215fc
RS
2495
2496 if (p < pend && p[0] == '-' && p[1] != ']')
2497 {
2498
2499 /* Discard the `-'. */
2500 PATFETCH (c1);
2501
2502 /* Fetch the character which ends the range. */
2503 PATFETCH (c1);
2504 if (bufp->multibyte && BASE_LEADING_CODE_P (c1))
e318085a 2505 {
b18215fc
RS
2506 PATUNFETCH;
2507 c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2508 p += len;
e318085a 2509 }
b18215fc 2510
e934739e
RS
2511 if (SINGLE_BYTE_CHAR_P (c)
2512 && ! SINGLE_BYTE_CHAR_P (c1))
2513 {
2514 /* Handle a range such as \177-\377 in multibyte mode.
2515 Split that into two ranges,,
2516 the low one ending at 0237, and the high one
2517 starting at ...040. */
2518 int c1_base = (c1 & ~0177) | 040;
2519 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2520 c1 = 0237;
2521 }
2522 else if (!SAME_CHARSET_P (c, c1))
b18215fc 2523 FREE_STACK_RETURN (REG_ERANGE);
e318085a 2524 }
25fe55af 2525 else
b18215fc
RS
2526 /* Range from C to C. */
2527 c1 = c;
2528
2529 /* Set the range ... */
2530 if (SINGLE_BYTE_CHAR_P (c))
2531 /* ... into bitmap. */
25fe55af 2532 {
b18215fc
RS
2533 unsigned this_char;
2534 int range_start = c, range_end = c1;
2535
2536 /* If the start is after the end, the range is empty. */
2537 if (range_start > range_end)
2538 {
2539 if (syntax & RE_NO_EMPTY_RANGES)
2540 FREE_STACK_RETURN (REG_ERANGE);
2541 /* Else, repeat the loop. */
2542 }
2543 else
2544 {
2545 for (this_char = range_start; this_char <= range_end;
2546 this_char++)
2547 SET_LIST_BIT (TRANSLATE (this_char));
e934739e 2548 }
25fe55af 2549 }
e318085a 2550 else
b18215fc
RS
2551 /* ... into range table. */
2552 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
e318085a
RS
2553 }
2554
25fe55af
RS
2555 /* Discard any (non)matching list bytes that are all 0 at the
2556 end of the map. Decrease the map-length byte too. */
2557 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2558 b[-1]--;
2559 b += b[-1];
fa9a63c5 2560
96cc36cc
RS
2561 /* Build real range table from work area. */
2562 if (RANGE_TABLE_WORK_USED (range_table_work)
2563 || RANGE_TABLE_WORK_BITS (range_table_work))
b18215fc
RS
2564 {
2565 int i;
2566 int used = RANGE_TABLE_WORK_USED (range_table_work);
fa9a63c5 2567
b18215fc 2568 /* Allocate space for COUNT + RANGE_TABLE. Needs two
96cc36cc
RS
2569 bytes for flags, two for COUNT, and three bytes for
2570 each character. */
2571 GET_BUFFER_SPACE (4 + used * 3);
fa9a63c5 2572
b18215fc
RS
2573 /* Indicate the existence of range table. */
2574 laststart[1] |= 0x80;
fa9a63c5 2575
96cc36cc
RS
2576 /* Store the character class flag bits into the range table.
2577 If not in emacs, these flag bits are always 0. */
2578 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2579 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2580
b18215fc
RS
2581 STORE_NUMBER_AND_INCR (b, used / 2);
2582 for (i = 0; i < used; i++)
2583 STORE_CHARACTER_AND_INCR
2584 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2585 }
25fe55af
RS
2586 }
2587 break;
fa9a63c5
RM
2588
2589
b18215fc 2590 case '(':
25fe55af
RS
2591 if (syntax & RE_NO_BK_PARENS)
2592 goto handle_open;
2593 else
2594 goto normal_char;
fa9a63c5
RM
2595
2596
25fe55af
RS
2597 case ')':
2598 if (syntax & RE_NO_BK_PARENS)
2599 goto handle_close;
2600 else
2601 goto normal_char;
e318085a
RS
2602
2603
25fe55af
RS
2604 case '\n':
2605 if (syntax & RE_NEWLINE_ALT)
2606 goto handle_alt;
2607 else
2608 goto normal_char;
e318085a
RS
2609
2610
b18215fc 2611 case '|':
25fe55af
RS
2612 if (syntax & RE_NO_BK_VBAR)
2613 goto handle_alt;
2614 else
2615 goto normal_char;
2616
2617
2618 case '{':
2619 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2620 goto handle_interval;
2621 else
2622 goto normal_char;
2623
2624
2625 case '\\':
2626 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2627
2628 /* Do not translate the character after the \, so that we can
2629 distinguish, e.g., \B from \b, even if we normally would
2630 translate, e.g., B to b. */
2631 PATFETCH_RAW (c);
2632
2633 switch (c)
2634 {
2635 case '(':
2636 if (syntax & RE_NO_BK_PARENS)
2637 goto normal_backslash;
2638
2639 handle_open:
2640 bufp->re_nsub++;
2641 regnum++;
2642
2643 if (COMPILE_STACK_FULL)
2644 {
2645 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2646 compile_stack_elt_t);
2647 if (compile_stack.stack == NULL) return REG_ESPACE;
2648
2649 compile_stack.size <<= 1;
2650 }
2651
2652 /* These are the values to restore when we hit end of this
2653 group. They are all relative offsets, so that if the
2654 whole pattern moves because of realloc, they will still
2655 be valid. */
2656 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2657 COMPILE_STACK_TOP.fixup_alt_jump
2658 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2659 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2660 COMPILE_STACK_TOP.regnum = regnum;
2661
2662 /* We will eventually replace the 0 with the number of
2663 groups inner to this one. But do not push a
2664 start_memory for groups beyond the last one we can
2665 represent in the compiled pattern. */
2666 if (regnum <= MAX_REGNUM)
2667 {
2668 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2669 BUF_PUSH_3 (start_memory, regnum, 0);
2670 }
2671
2672 compile_stack.avail++;
2673
2674 fixup_alt_jump = 0;
2675 laststart = 0;
2676 begalt = b;
b18215fc
RS
2677 /* If we've reached MAX_REGNUM groups, then this open
2678 won't actually generate any code, so we'll have to
2679 clear pending_exact explicitly. */
2680 pending_exact = 0;
25fe55af
RS
2681 break;
2682
2683
2684 case ')':
2685 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2686
2687 if (COMPILE_STACK_EMPTY)
2688 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2689 goto normal_backslash;
2690 else
2691 FREE_STACK_RETURN (REG_ERPAREN);
2692
2693 handle_close:
2694 if (fixup_alt_jump)
2695 { /* Push a dummy failure point at the end of the
2696 alternative for a possible future
2697 `pop_failure_jump' to pop. See comments at
2698 `push_dummy_failure' in `re_match_2'. */
2699 BUF_PUSH (push_dummy_failure);
2700
2701 /* We allocated space for this jump when we assigned
2702 to `fixup_alt_jump', in the `handle_alt' case below. */
2703 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2704 }
2705
2706 /* See similar code for backslashed left paren above. */
2707 if (COMPILE_STACK_EMPTY)
2708 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2709 goto normal_char;
2710 else
2711 FREE_STACK_RETURN (REG_ERPAREN);
2712
2713 /* Since we just checked for an empty stack above, this
2714 ``can't happen''. */
2715 assert (compile_stack.avail != 0);
2716 {
2717 /* We don't just want to restore into `regnum', because
2718 later groups should continue to be numbered higher,
2719 as in `(ab)c(de)' -- the second group is #2. */
2720 regnum_t this_group_regnum;
2721
2722 compile_stack.avail--;
2723 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2724 fixup_alt_jump
2725 = COMPILE_STACK_TOP.fixup_alt_jump
2726 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2727 : 0;
2728 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2729 this_group_regnum = COMPILE_STACK_TOP.regnum;
b18215fc
RS
2730 /* If we've reached MAX_REGNUM groups, then this open
2731 won't actually generate any code, so we'll have to
2732 clear pending_exact explicitly. */
2733 pending_exact = 0;
e318085a 2734
25fe55af
RS
2735 /* We're at the end of the group, so now we know how many
2736 groups were inside this one. */
2737 if (this_group_regnum <= MAX_REGNUM)
2738 {
2739 unsigned char *inner_group_loc
2740 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2741
2742 *inner_group_loc = regnum - this_group_regnum;
2743 BUF_PUSH_3 (stop_memory, this_group_regnum,
2744 regnum - this_group_regnum);
2745 }
2746 }
2747 break;
2748
2749
2750 case '|': /* `\|'. */
2751 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2752 goto normal_backslash;
2753 handle_alt:
2754 if (syntax & RE_LIMITED_OPS)
2755 goto normal_char;
2756
2757 /* Insert before the previous alternative a jump which
2758 jumps to this alternative if the former fails. */
2759 GET_BUFFER_SPACE (3);
2760 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2761 pending_exact = 0;
2762 b += 3;
2763
2764 /* The alternative before this one has a jump after it
2765 which gets executed if it gets matched. Adjust that
2766 jump so it will jump to this alternative's analogous
2767 jump (put in below, which in turn will jump to the next
2768 (if any) alternative's such jump, etc.). The last such
2769 jump jumps to the correct final destination. A picture:
2770 _____ _____
2771 | | | |
2772 | v | v
2773 a | b | c
2774
2775 If we are at `b', then fixup_alt_jump right now points to a
2776 three-byte space after `a'. We'll put in the jump, set
2777 fixup_alt_jump to right after `b', and leave behind three
2778 bytes which we'll fill in when we get to after `c'. */
2779
2780 if (fixup_alt_jump)
2781 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2782
2783 /* Mark and leave space for a jump after this alternative,
2784 to be filled in later either by next alternative or
2785 when know we're at the end of a series of alternatives. */
2786 fixup_alt_jump = b;
2787 GET_BUFFER_SPACE (3);
2788 b += 3;
2789
2790 laststart = 0;
2791 begalt = b;
2792 break;
2793
2794
2795 case '{':
2796 /* If \{ is a literal. */
2797 if (!(syntax & RE_INTERVALS)
2798 /* If we're at `\{' and it's not the open-interval
2799 operator. */
2800 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2801 || (p - 2 == pattern && p == pend))
2802 goto normal_backslash;
2803
2804 handle_interval:
2805 {
2806 /* If got here, then the syntax allows intervals. */
2807
2808 /* At least (most) this many matches must be made. */
2809 int lower_bound = -1, upper_bound = -1;
2810
2811 beg_interval = p - 1;
2812
2813 if (p == pend)
2814 {
2815 if (syntax & RE_NO_BK_BRACES)
2816 goto unfetch_interval;
2817 else
2818 FREE_STACK_RETURN (REG_EBRACE);
2819 }
2820
2821 GET_UNSIGNED_NUMBER (lower_bound);
2822
2823 if (c == ',')
2824 {
2825 GET_UNSIGNED_NUMBER (upper_bound);
2826 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2827 }
2828 else
2829 /* Interval such as `{1}' => match exactly once. */
2830 upper_bound = lower_bound;
2831
2832 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2833 || lower_bound > upper_bound)
2834 {
2835 if (syntax & RE_NO_BK_BRACES)
2836 goto unfetch_interval;
2837 else
2838 FREE_STACK_RETURN (REG_BADBR);
2839 }
2840
2841 if (!(syntax & RE_NO_BK_BRACES))
2842 {
2843 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2844
2845 PATFETCH (c);
2846 }
2847
2848 if (c != '}')
2849 {
2850 if (syntax & RE_NO_BK_BRACES)
2851 goto unfetch_interval;
2852 else
2853 FREE_STACK_RETURN (REG_BADBR);
2854 }
2855
2856 /* We just parsed a valid interval. */
2857
2858 /* If it's invalid to have no preceding re. */
2859 if (!laststart)
2860 {
2861 if (syntax & RE_CONTEXT_INVALID_OPS)
2862 FREE_STACK_RETURN (REG_BADRPT);
2863 else if (syntax & RE_CONTEXT_INDEP_OPS)
2864 laststart = b;
2865 else
2866 goto unfetch_interval;
2867 }
2868
2869 /* If the upper bound is zero, don't want to succeed at
2870 all; jump from `laststart' to `b + 3', which will be
2871 the end of the buffer after we insert the jump. */
2872 if (upper_bound == 0)
2873 {
2874 GET_BUFFER_SPACE (3);
2875 INSERT_JUMP (jump, laststart, b + 3);
2876 b += 3;
2877 }
2878
2879 /* Otherwise, we have a nontrivial interval. When
2880 we're all done, the pattern will look like:
2881 set_number_at <jump count> <upper bound>
2882 set_number_at <succeed_n count> <lower bound>
2883 succeed_n <after jump addr> <succeed_n count>
2884 <body of loop>
2885 jump_n <succeed_n addr> <jump count>
2886 (The upper bound and `jump_n' are omitted if
2887 `upper_bound' is 1, though.) */
2888 else
2889 { /* If the upper bound is > 1, we need to insert
2890 more at the end of the loop. */
2891 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2892
2893 GET_BUFFER_SPACE (nbytes);
2894
2895 /* Initialize lower bound of the `succeed_n', even
2896 though it will be set during matching by its
2897 attendant `set_number_at' (inserted next),
2898 because `re_compile_fastmap' needs to know.
2899 Jump to the `jump_n' we might insert below. */
2900 INSERT_JUMP2 (succeed_n, laststart,
2901 b + 5 + (upper_bound > 1) * 5,
2902 lower_bound);
2903 b += 5;
2904
2905 /* Code to initialize the lower bound. Insert
2906 before the `succeed_n'. The `5' is the last two
2907 bytes of this `set_number_at', plus 3 bytes of
2908 the following `succeed_n'. */
2909 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2910 b += 5;
2911
2912 if (upper_bound > 1)
2913 { /* More than one repetition is allowed, so
2914 append a backward jump to the `succeed_n'
2915 that starts this interval.
2916
2917 When we've reached this during matching,
2918 we'll have matched the interval once, so
2919 jump back only `upper_bound - 1' times. */
2920 STORE_JUMP2 (jump_n, b, laststart + 5,
2921 upper_bound - 1);
2922 b += 5;
2923
2924 /* The location we want to set is the second
2925 parameter of the `jump_n'; that is `b-2' as
2926 an absolute address. `laststart' will be
2927 the `set_number_at' we're about to insert;
2928 `laststart+3' the number to set, the source
2929 for the relative address. But we are
2930 inserting into the middle of the pattern --
2931 so everything is getting moved up by 5.
2932 Conclusion: (b - 2) - (laststart + 3) + 5,
2933 i.e., b - laststart.
2934
2935 We insert this at the beginning of the loop
2936 so that if we fail during matching, we'll
2937 reinitialize the bounds. */
2938 insert_op2 (set_number_at, laststart, b - laststart,
2939 upper_bound - 1, b);
2940 b += 5;
2941 }
2942 }
2943 pending_exact = 0;
2944 beg_interval = NULL;
2945 }
2946 break;
2947
2948 unfetch_interval:
2949 /* If an invalid interval, match the characters as literals. */
2950 assert (beg_interval);
2951 p = beg_interval;
2952 beg_interval = NULL;
2953
2954 /* normal_char and normal_backslash need `c'. */
2955 PATFETCH (c);
2956
2957 if (!(syntax & RE_NO_BK_BRACES))
2958 {
2959 if (p > pattern && p[-1] == '\\')
2960 goto normal_backslash;
2961 }
2962 goto normal_char;
e318085a 2963
b18215fc 2964#ifdef emacs
25fe55af
RS
2965 /* There is no way to specify the before_dot and after_dot
2966 operators. rms says this is ok. --karl */
2967 case '=':
2968 BUF_PUSH (at_dot);
2969 break;
2970
2971 case 's':
2972 laststart = b;
2973 PATFETCH (c);
2974 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2975 break;
2976
2977 case 'S':
2978 laststart = b;
2979 PATFETCH (c);
2980 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2981 break;
b18215fc
RS
2982
2983 case 'c':
2984 laststart = b;
2985 PATFETCH_RAW (c);
2986 BUF_PUSH_2 (categoryspec, c);
2987 break;
e318085a 2988
b18215fc
RS
2989 case 'C':
2990 laststart = b;
2991 PATFETCH_RAW (c);
2992 BUF_PUSH_2 (notcategoryspec, c);
2993 break;
2994#endif /* emacs */
e318085a 2995
e318085a 2996
25fe55af
RS
2997 case 'w':
2998 laststart = b;
2999 BUF_PUSH (wordchar);
3000 break;
e318085a 3001
e318085a 3002
25fe55af
RS
3003 case 'W':
3004 laststart = b;
3005 BUF_PUSH (notwordchar);
3006 break;
e318085a
RS
3007
3008
25fe55af
RS
3009 case '<':
3010 BUF_PUSH (wordbeg);
3011 break;
e318085a 3012
25fe55af
RS
3013 case '>':
3014 BUF_PUSH (wordend);
3015 break;
e318085a 3016
25fe55af
RS
3017 case 'b':
3018 BUF_PUSH (wordbound);
3019 break;
e318085a 3020
25fe55af
RS
3021 case 'B':
3022 BUF_PUSH (notwordbound);
3023 break;
fa9a63c5 3024
25fe55af
RS
3025 case '`':
3026 BUF_PUSH (begbuf);
3027 break;
e318085a 3028
25fe55af
RS
3029 case '\'':
3030 BUF_PUSH (endbuf);
3031 break;
e318085a 3032
25fe55af
RS
3033 case '1': case '2': case '3': case '4': case '5':
3034 case '6': case '7': case '8': case '9':
3035 if (syntax & RE_NO_BK_REFS)
3036 goto normal_char;
e318085a 3037
25fe55af 3038 c1 = c - '0';
e318085a 3039
25fe55af
RS
3040 if (c1 > regnum)
3041 FREE_STACK_RETURN (REG_ESUBREG);
e318085a 3042
25fe55af
RS
3043 /* Can't back reference to a subexpression if inside of it. */
3044 if (group_in_compile_stack (compile_stack, c1))
3045 goto normal_char;
e318085a 3046
25fe55af
RS
3047 laststart = b;
3048 BUF_PUSH_2 (duplicate, c1);
3049 break;
e318085a 3050
e318085a 3051
25fe55af
RS
3052 case '+':
3053 case '?':
3054 if (syntax & RE_BK_PLUS_QM)
3055 goto handle_plus;
3056 else
3057 goto normal_backslash;
3058
3059 default:
3060 normal_backslash:
3061 /* You might think it would be useful for \ to mean
3062 not to translate; but if we don't translate it
3063 it will never match anything. */
3064 c = TRANSLATE (c);
3065 goto normal_char;
3066 }
3067 break;
fa9a63c5
RM
3068
3069
3070 default:
25fe55af 3071 /* Expects the character in `c'. */
fa9a63c5 3072 normal_char:
b18215fc
RS
3073 p1 = p - 1; /* P1 points the head of C. */
3074#ifdef emacs
3075 if (bufp->multibyte)
3583e969
KH
3076 {
3077 c = STRING_CHAR (p1, pend - p1);
3078 c = TRANSLATE (c);
3079 /* Set P to the next character boundary. */
3080 p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
3081 }
b18215fc 3082#endif
fa9a63c5 3083 /* If no exactn currently being built. */
25fe55af 3084 if (!pending_exact
fa9a63c5 3085
25fe55af
RS
3086 /* If last exactn not at current position. */
3087 || pending_exact + *pending_exact + 1 != b
5e69f11e 3088
25fe55af 3089 /* We have only one byte following the exactn for the count. */
b18215fc 3090 || *pending_exact >= (1 << BYTEWIDTH) - (p - p1)
fa9a63c5 3091
25fe55af 3092 /* If followed by a repetition operator. */
9d99031f 3093 || (p != pend && (*p == '*' || *p == '^'))
fa9a63c5 3094 || ((syntax & RE_BK_PLUS_QM)
9d99031f
RS
3095 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3096 : p != pend && (*p == '+' || *p == '?'))
fa9a63c5 3097 || ((syntax & RE_INTERVALS)
25fe55af 3098 && ((syntax & RE_NO_BK_BRACES)
9d99031f
RS
3099 ? p != pend && *p == '{'
3100 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
fa9a63c5
RM
3101 {
3102 /* Start building a new exactn. */
5e69f11e 3103
25fe55af 3104 laststart = b;
fa9a63c5
RM
3105
3106 BUF_PUSH_2 (exactn, 0);
3107 pending_exact = b - 1;
25fe55af 3108 }
5e69f11e 3109
3583e969
KH
3110#ifdef emacs
3111 if (! SINGLE_BYTE_CHAR_P (c))
3112 {
3113 unsigned char work[4], *str;
3114 int i = CHAR_STRING (c, work, str);
3115 int j;
3116 for (j = 0; j < i; j++)
3117 {
3118 BUF_PUSH (str[j]);
3119 (*pending_exact)++;
3120 }
3121 }
3122 else
3123#endif
b18215fc 3124 {
e934739e
RS
3125 BUF_PUSH (c);
3126 (*pending_exact)++;
b18215fc 3127 }
fa9a63c5 3128 break;
25fe55af 3129 } /* switch (c) */
fa9a63c5
RM
3130 } /* while p != pend */
3131
5e69f11e 3132
fa9a63c5 3133 /* Through the pattern now. */
5e69f11e 3134
fa9a63c5
RM
3135 if (fixup_alt_jump)
3136 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3137
5e69f11e 3138 if (!COMPILE_STACK_EMPTY)
fa9a63c5
RM
3139 FREE_STACK_RETURN (REG_EPAREN);
3140
3141 /* If we don't want backtracking, force success
3142 the first time we reach the end of the compiled pattern. */
3143 if (syntax & RE_NO_POSIX_BACKTRACKING)
3144 BUF_PUSH (succeed);
3145
3146 free (compile_stack.stack);
3147
3148 /* We have succeeded; set the length of the buffer. */
3149 bufp->used = b - bufp->buffer;
3150
3151#ifdef DEBUG
3152 if (debug)
3153 {
3154 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3155 print_compiled_pattern (bufp);
3156 }
3157#endif /* DEBUG */
3158
3159#ifndef MATCH_MAY_ALLOCATE
3160 /* Initialize the failure stack to the largest possible stack. This
3161 isn't necessary unless we're trying to avoid calling alloca in
3162 the search and match routines. */
3163 {
3164 int num_regs = bufp->re_nsub + 1;
3165
320a2a73 3166 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
fa9a63c5 3167 {
a26f4ccd 3168 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
fa9a63c5
RM
3169
3170#ifdef emacs
3171 if (! fail_stack.stack)
3172 fail_stack.stack
5e69f11e 3173 = (fail_stack_elt_t *) xmalloc (fail_stack.size
fa9a63c5
RM
3174 * sizeof (fail_stack_elt_t));
3175 else
3176 fail_stack.stack
3177 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3178 (fail_stack.size
3179 * sizeof (fail_stack_elt_t)));
3180#else /* not emacs */
3181 if (! fail_stack.stack)
3182 fail_stack.stack
5e69f11e 3183 = (fail_stack_elt_t *) malloc (fail_stack.size
fa9a63c5
RM
3184 * sizeof (fail_stack_elt_t));
3185 else
3186 fail_stack.stack
3187 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3188 (fail_stack.size
3189 * sizeof (fail_stack_elt_t)));
3190#endif /* not emacs */
3191 }
3192
3193 regex_grow_registers (num_regs);
3194 }
3195#endif /* not MATCH_MAY_ALLOCATE */
3196
3197 return REG_NOERROR;
3198} /* regex_compile */
3199\f
3200/* Subroutines for `regex_compile'. */
3201
25fe55af 3202/* Store OP at LOC followed by two-byte integer parameter ARG. */
fa9a63c5
RM
3203
3204static void
3205store_op1 (op, loc, arg)
3206 re_opcode_t op;
3207 unsigned char *loc;
3208 int arg;
3209{
3210 *loc = (unsigned char) op;
3211 STORE_NUMBER (loc + 1, arg);
3212}
3213
3214
3215/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3216
3217static void
3218store_op2 (op, loc, arg1, arg2)
3219 re_opcode_t op;
3220 unsigned char *loc;
3221 int arg1, arg2;
3222{
3223 *loc = (unsigned char) op;
3224 STORE_NUMBER (loc + 1, arg1);
3225 STORE_NUMBER (loc + 3, arg2);
3226}
3227
3228
3229/* Copy the bytes from LOC to END to open up three bytes of space at LOC
3230 for OP followed by two-byte integer parameter ARG. */
3231
3232static void
3233insert_op1 (op, loc, arg, end)
3234 re_opcode_t op;
3235 unsigned char *loc;
3236 int arg;
5e69f11e 3237 unsigned char *end;
fa9a63c5
RM
3238{
3239 register unsigned char *pfrom = end;
3240 register unsigned char *pto = end + 3;
3241
3242 while (pfrom != loc)
3243 *--pto = *--pfrom;
5e69f11e 3244
fa9a63c5
RM
3245 store_op1 (op, loc, arg);
3246}
3247
3248
3249/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3250
3251static void
3252insert_op2 (op, loc, arg1, arg2, end)
3253 re_opcode_t op;
3254 unsigned char *loc;
3255 int arg1, arg2;
5e69f11e 3256 unsigned char *end;
fa9a63c5
RM
3257{
3258 register unsigned char *pfrom = end;
3259 register unsigned char *pto = end + 5;
3260
3261 while (pfrom != loc)
3262 *--pto = *--pfrom;
5e69f11e 3263
fa9a63c5
RM
3264 store_op2 (op, loc, arg1, arg2);
3265}
3266
3267
3268/* P points to just after a ^ in PATTERN. Return true if that ^ comes
3269 after an alternative or a begin-subexpression. We assume there is at
3270 least one character before the ^. */
3271
3272static boolean
3273at_begline_loc_p (pattern, p, syntax)
3274 const char *pattern, *p;
3275 reg_syntax_t syntax;
3276{
3277 const char *prev = p - 2;
3278 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
5e69f11e 3279
fa9a63c5
RM
3280 return
3281 /* After a subexpression? */
3282 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
25fe55af 3283 /* After an alternative? */
fa9a63c5
RM
3284 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3285}
3286
3287
3288/* The dual of at_begline_loc_p. This one is for $. We assume there is
3289 at least one character after the $, i.e., `P < PEND'. */
3290
3291static boolean
3292at_endline_loc_p (p, pend, syntax)
3293 const char *p, *pend;
3294 int syntax;
3295{
3296 const char *next = p;
3297 boolean next_backslash = *next == '\\';
5bb52971 3298 const char *next_next = p + 1 < pend ? p + 1 : 0;
5e69f11e 3299
fa9a63c5
RM
3300 return
3301 /* Before a subexpression? */
3302 (syntax & RE_NO_BK_PARENS ? *next == ')'
25fe55af 3303 : next_backslash && next_next && *next_next == ')')
fa9a63c5
RM
3304 /* Before an alternative? */
3305 || (syntax & RE_NO_BK_VBAR ? *next == '|'
25fe55af 3306 : next_backslash && next_next && *next_next == '|');
fa9a63c5
RM
3307}
3308
3309
5e69f11e 3310/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
fa9a63c5
RM
3311 false if it's not. */
3312
3313static boolean
3314group_in_compile_stack (compile_stack, regnum)
3315 compile_stack_type compile_stack;
3316 regnum_t regnum;
3317{
3318 int this_element;
3319
5e69f11e
RM
3320 for (this_element = compile_stack.avail - 1;
3321 this_element >= 0;
fa9a63c5
RM
3322 this_element--)
3323 if (compile_stack.stack[this_element].regnum == regnum)
3324 return true;
3325
3326 return false;
3327}
fa9a63c5
RM
3328\f
3329/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3330 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3331 characters can start a string that matches the pattern. This fastmap
3332 is used by re_search to skip quickly over impossible starting points.
3333
96cc36cc
RS
3334 Character codes above (1 << BYTEWIDTH) are not represented in the
3335 fastmap, but the leading codes are represented. Thus, the fastmap
3336 indicates which character sets could start a match.
3337
fa9a63c5
RM
3338 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3339 area as BUFP->fastmap.
5e69f11e 3340
fa9a63c5
RM
3341 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3342 the pattern buffer.
3343
3344 Returns 0 if we succeed, -2 if an internal error. */
3345
3346int
3347re_compile_fastmap (bufp)
3348 struct re_pattern_buffer *bufp;
3349{
b18215fc 3350 int i, j, k;
fa9a63c5
RM
3351#ifdef MATCH_MAY_ALLOCATE
3352 fail_stack_type fail_stack;
3353#endif
3354#ifndef REGEX_MALLOC
3355 char *destination;
3356#endif
3357 /* We don't push any register information onto the failure stack. */
3358 unsigned num_regs = 0;
5e69f11e 3359
fa9a63c5
RM
3360 register char *fastmap = bufp->fastmap;
3361 unsigned char *pattern = bufp->buffer;
3362 unsigned long size = bufp->used;
3363 unsigned char *p = pattern;
3364 register unsigned char *pend = pattern + size;
3365
3366 /* This holds the pointer to the failure stack, when
3367 it is allocated relocatably. */
3368 fail_stack_elt_t *failure_stack_ptr;
3369
3370 /* Assume that each path through the pattern can be null until
25fe55af 3371 proven otherwise. We set this false at the bottom of switch
fa9a63c5
RM
3372 statement, to which we get only if a particular path doesn't
3373 match the empty string. */
3374 boolean path_can_be_null = true;
3375
3376 /* We aren't doing a `succeed_n' to begin with. */
3377 boolean succeed_n_p = false;
3378
b18215fc 3379 /* If all elements for base leading-codes in fastmap is set, this
25fe55af 3380 flag is set true. */
b18215fc
RS
3381 boolean match_any_multibyte_characters = false;
3382
3383 /* Maximum code of simple (single byte) character. */
3384 int simple_char_max;
3385
fa9a63c5 3386 assert (fastmap != NULL && p != NULL);
5e69f11e 3387
fa9a63c5 3388 INIT_FAIL_STACK ();
25fe55af 3389 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
fa9a63c5
RM
3390 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3391 bufp->can_be_null = 0;
5e69f11e 3392
fa9a63c5
RM
3393 while (1)
3394 {
3395 if (p == pend || *p == succeed)
3396 {
3397 /* We have reached the (effective) end of pattern. */
3398 if (!FAIL_STACK_EMPTY ())
3399 {
3400 bufp->can_be_null |= path_can_be_null;
3401
3402 /* Reset for next path. */
3403 path_can_be_null = true;
3404
3405 p = fail_stack.stack[--fail_stack.avail].pointer;
3406
3407 continue;
3408 }
3409 else
3410 break;
3411 }
3412
25fe55af 3413 /* We should never be about to go beyond the end of the pattern. */
fa9a63c5 3414 assert (p < pend);
5e69f11e 3415
fa9a63c5
RM
3416 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3417 {
3418
25fe55af
RS
3419 /* I guess the idea here is to simply not bother with a fastmap
3420 if a backreference is used, since it's too hard to figure out
3421 the fastmap for the corresponding group. Setting
3422 `can_be_null' stops `re_search_2' from using the fastmap, so
3423 that is all we do. */
fa9a63c5
RM
3424 case duplicate:
3425 bufp->can_be_null = 1;
25fe55af 3426 goto done;
fa9a63c5
RM
3427
3428
3429 /* Following are the cases which match a character. These end
25fe55af 3430 with `break'. */
fa9a63c5
RM
3431
3432 case exactn:
25fe55af 3433 fastmap[p[1]] = 1;
fa9a63c5
RM
3434 break;
3435
3436
b18215fc 3437#ifndef emacs
25fe55af 3438 case charset:
96cc36cc
RS
3439 {
3440 int length = (*p & 0x7f);;
3441 p++;
fa9a63c5 3442
96cc36cc
RS
3443 for (j = length * BYTEWIDTH - 1; j >= 0; j--)
3444 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3445 fastmap[j] = 1;
3446 }
3447 break;
fa9a63c5
RM
3448
3449 case charset_not:
3450 /* Chars beyond end of map must be allowed. */
96cc36cc
RS
3451 {
3452 int length = (*p & 0x7f);;
3453 p++;
fa9a63c5 3454
96cc36cc 3455 for (j = length * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
25fe55af 3456 fastmap[j] = 1;
fa9a63c5 3457
96cc36cc
RS
3458 for (j = length * BYTEWIDTH - 1; j >= 0; j--)
3459 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3460 fastmap[j] = 1;
3461 }
3462 break;
fa9a63c5
RM
3463
3464 case wordchar:
3465 for (j = 0; j < (1 << BYTEWIDTH); j++)
3466 if (SYNTAX (j) == Sword)
3467 fastmap[j] = 1;
3468 break;
3469
3470
3471 case notwordchar:
3472 for (j = 0; j < (1 << BYTEWIDTH); j++)
3473 if (SYNTAX (j) != Sword)
3474 fastmap[j] = 1;
3475 break;
b18215fc
RS
3476#else /* emacs */
3477 case charset:
3478 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3479 j >= 0; j--)
3480 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3481 fastmap[j] = 1;
3482
f71b19b6 3483 /* If we can match a character class, we can match
96cc36cc
RS
3484 any character set. */
3485 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3486 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)
3487 goto set_fastmap_for_multibyte_characters;
3488
b18215fc
RS
3489 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3490 && match_any_multibyte_characters == false)
3491 {
3492 /* Set fastmap[I] 1 where I is a base leading code of each
3493 multibyte character in the range table. */
3494 int c, count;
3495
3496 /* Make P points the range table. */
3497 p += CHARSET_BITMAP_SIZE (&p[-2]);
3498
f71b19b6 3499 /* Extract the number of ranges in range table into COUNT. */
b18215fc
RS
3500 EXTRACT_NUMBER_AND_INCR (count, p);
3501 for (; count > 0; count--, p += 2 * 3) /* XXX */
3502 {
3503 /* Extract the start of each range. */
3504 EXTRACT_CHARACTER (c, p);
3505 j = CHAR_CHARSET (c);
3506 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3507 }
3508 }
3509 break;
fa9a63c5
RM
3510
3511
b18215fc 3512 case charset_not:
ba5c004d
RS
3513 /* Chars beyond end of bitmap are possible matches.
3514 All the single-byte codes can occur in multibyte buffers.
3515 So any that are not listed in the charset
3516 are possible matches, even in multibyte buffers. */
3517 simple_char_max = (1 << BYTEWIDTH);
b18215fc
RS
3518 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3519 j < simple_char_max; j++)
3520 fastmap[j] = 1;
3521
3522 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3523 j >= 0; j--)
3524 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3525 fastmap[j] = 1;
3526
3527 if (bufp->multibyte)
3528 /* Any character set can possibly contain a character
3529 which doesn't match the specified set of characters. */
3530 {
3531 set_fastmap_for_multibyte_characters:
3532 if (match_any_multibyte_characters == false)
3533 {
3534 for (j = 0x80; j < 0xA0; j++) /* XXX */
3535 if (BASE_LEADING_CODE_P (j))
3536 fastmap[j] = 1;
3537 match_any_multibyte_characters = true;
3538 }
3539 }
3540 break;
3541
3542
3543 case wordchar:
ba5c004d
RS
3544 /* All the single-byte codes can occur in multibyte buffers,
3545 and they may have word syntax. So do consider them. */
3546 simple_char_max = (1 << BYTEWIDTH);
b18215fc
RS
3547 for (j = 0; j < simple_char_max; j++)
3548 if (SYNTAX (j) == Sword)
3549 fastmap[j] = 1;
3550
3551 if (bufp->multibyte)
3552 /* Any character set can possibly contain a character
25fe55af 3553 whose syntax is `Sword'. */
b18215fc
RS
3554 goto set_fastmap_for_multibyte_characters;
3555 break;
3556
3557
3558 case notwordchar:
ba5c004d
RS
3559 /* All the single-byte codes can occur in multibyte buffers,
3560 and they may not have word syntax. So do consider them. */
3561 simple_char_max = (1 << BYTEWIDTH);
b18215fc
RS
3562 for (j = 0; j < simple_char_max; j++)
3563 if (SYNTAX (j) != Sword)
3564 fastmap[j] = 1;
3565
3566 if (bufp->multibyte)
3567 /* Any character set can possibly contain a character
3568 whose syntax is not `Sword'. */
3569 goto set_fastmap_for_multibyte_characters;
3570 break;
3571#endif
3572
25fe55af 3573 case anychar:
fa9a63c5
RM
3574 {
3575 int fastmap_newline = fastmap['\n'];
3576
d18b62f2
KH
3577 /* `.' matches anything, except perhaps newline.
3578 Even in a multibyte buffer, it should match any
3579 conceivable byte value for the fastmap. */
b18215fc 3580 if (bufp->multibyte)
d18b62f2 3581 match_any_multibyte_characters = true;
b18215fc 3582
d18b62f2 3583 simple_char_max = (1 << BYTEWIDTH);
b18215fc 3584 for (j = 0; j < simple_char_max; j++)
fa9a63c5
RM
3585 fastmap[j] = 1;
3586
3587 /* ... except perhaps newline. */
3588 if (!(bufp->syntax & RE_DOT_NEWLINE))
3589 fastmap['\n'] = fastmap_newline;
3590
3591 /* Return if we have already set `can_be_null'; if we have,
25fe55af 3592 then the fastmap is irrelevant. Something's wrong here. */
fa9a63c5
RM
3593 else if (bufp->can_be_null)
3594 goto done;
3595
3596 /* Otherwise, have to check alternative paths. */
3597 break;
3598 }
3599
3600#ifdef emacs
b18215fc
RS
3601 case wordbound:
3602 case notwordbound:
3603 case wordbeg:
3604 case wordend:
3605 case notsyntaxspec:
25fe55af 3606 case syntaxspec:
b18215fc
RS
3607 /* This match depends on text properties. These end with
3608 aborting optimizations. */
3609 bufp->can_be_null = 1;
25fe55af 3610 goto done;
b18215fc 3611#if 0
fa9a63c5 3612 k = *p++;
b18215fc
RS
3613 simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
3614 for (j = 0; j < simple_char_max; j++)
fa9a63c5
RM
3615 if (SYNTAX (j) == (enum syntaxcode) k)
3616 fastmap[j] = 1;
fa9a63c5 3617
b18215fc
RS
3618 if (bufp->multibyte)
3619 /* Any character set can possibly contain a character
3620 whose syntax is K. */
3621 goto set_fastmap_for_multibyte_characters;
3622 break;
fa9a63c5
RM
3623
3624 case notsyntaxspec:
3625 k = *p++;
b18215fc
RS
3626 simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
3627 for (j = 0; j < simple_char_max; j++)
fa9a63c5
RM
3628 if (SYNTAX (j) != (enum syntaxcode) k)
3629 fastmap[j] = 1;
b18215fc
RS
3630
3631 if (bufp->multibyte)
3632 /* Any character set can possibly contain a character
3633 whose syntax is not K. */
3634 goto set_fastmap_for_multibyte_characters;
3635 break;
3636#endif
3637
3638
3639 case categoryspec:
3640 k = *p++;
ba5c004d 3641 simple_char_max = (1 << BYTEWIDTH);
b18215fc
RS
3642 for (j = 0; j < simple_char_max; j++)
3643 if (CHAR_HAS_CATEGORY (j, k))
3644 fastmap[j] = 1;
3645
3646 if (bufp->multibyte)
3647 /* Any character set can possibly contain a character
3648 whose category is K. */
3649 goto set_fastmap_for_multibyte_characters;
fa9a63c5
RM
3650 break;
3651
3652
b18215fc
RS
3653 case notcategoryspec:
3654 k = *p++;
ba5c004d 3655 simple_char_max = (1 << BYTEWIDTH);
b18215fc
RS
3656 for (j = 0; j < simple_char_max; j++)
3657 if (!CHAR_HAS_CATEGORY (j, k))
3658 fastmap[j] = 1;
3659
3660 if (bufp->multibyte)
3661 /* Any character set can possibly contain a character
25fe55af 3662 whose category is not K. */
b18215fc
RS
3663 goto set_fastmap_for_multibyte_characters;
3664 break;
3665
fa9a63c5 3666 /* All cases after this match the empty string. These end with
25fe55af 3667 `continue'. */
fa9a63c5
RM
3668
3669
3670 case before_dot:
3671 case at_dot:
3672 case after_dot:
25fe55af 3673 continue;
ae4788a8 3674#endif /* emacs */
fa9a63c5
RM
3675
3676
25fe55af
RS
3677 case no_op:
3678 case begline:
3679 case endline:
fa9a63c5
RM
3680 case begbuf:
3681 case endbuf:
b18215fc 3682#ifndef emacs
fa9a63c5
RM
3683 case wordbound:
3684 case notwordbound:
3685 case wordbeg:
3686 case wordend:
25fe55af
RS
3687#endif
3688 case push_dummy_failure:
3689 continue;
fa9a63c5
RM
3690
3691
3692 case jump_n:
25fe55af 3693 case pop_failure_jump:
fa9a63c5
RM
3694 case maybe_pop_jump:
3695 case jump:
25fe55af 3696 case jump_past_alt:
fa9a63c5 3697 case dummy_failure_jump:
25fe55af 3698 EXTRACT_NUMBER_AND_INCR (j, p);
5e69f11e 3699 p += j;
fa9a63c5
RM
3700 if (j > 0)
3701 continue;
5e69f11e 3702
25fe55af
RS
3703 /* Jump backward implies we just went through the body of a
3704 loop and matched nothing. Opcode jumped to should be
3705 `on_failure_jump' or `succeed_n'. Just treat it like an
3706 ordinary jump. For a * loop, it has pushed its failure
3707 point already; if so, discard that as redundant. */
3708 if ((re_opcode_t) *p != on_failure_jump
fa9a63c5
RM
3709 && (re_opcode_t) *p != succeed_n)
3710 continue;
3711
25fe55af
RS
3712 p++;
3713 EXTRACT_NUMBER_AND_INCR (j, p);
3714 p += j;
5e69f11e 3715
25fe55af
RS
3716 /* If what's on the stack is where we are now, pop it. */
3717 if (!FAIL_STACK_EMPTY ()
fa9a63c5 3718 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
25fe55af 3719 fail_stack.avail--;
fa9a63c5 3720
25fe55af 3721 continue;
fa9a63c5
RM
3722
3723
25fe55af
RS
3724 case on_failure_jump:
3725 case on_failure_keep_string_jump:
fa9a63c5 3726 handle_on_failure_jump:
25fe55af
RS
3727 EXTRACT_NUMBER_AND_INCR (j, p);
3728
3729 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3730 end of the pattern. We don't want to push such a point,
3731 since when we restore it above, entering the switch will
3732 increment `p' past the end of the pattern. We don't need
3733 to push such a point since we obviously won't find any more
3734 fastmap entries beyond `pend'. Such a pattern can match
3735 the null string, though. */
3736 if (p + j < pend)
3737 {
3738 if (!PUSH_PATTERN_OP (p + j, fail_stack))
fa9a63c5
RM
3739 {
3740 RESET_FAIL_STACK ();
3741 return -2;
3742 }
fa9a63c5 3743 }
25fe55af
RS
3744 else
3745 bufp->can_be_null = 1;
fa9a63c5 3746
25fe55af
RS
3747 if (succeed_n_p)
3748 {
3749 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3750 succeed_n_p = false;
3751 }
3752
3753 continue;
fa9a63c5
RM
3754
3755
3756 case succeed_n:
25fe55af
RS
3757 /* Get to the number of times to succeed. */
3758 p += 2;
fa9a63c5 3759
25fe55af
RS
3760 /* Increment p past the n for when k != 0. */
3761 EXTRACT_NUMBER_AND_INCR (k, p);
3762 if (k == 0)
fa9a63c5 3763 {
25fe55af
RS
3764 p -= 4;
3765 succeed_n_p = true; /* Spaghetti code alert. */
3766 goto handle_on_failure_jump;
3767 }
3768 continue;
fa9a63c5
RM
3769
3770
3771 case set_number_at:
25fe55af
RS
3772 p += 4;
3773 continue;
fa9a63c5
RM
3774
3775
3776 case start_memory:
25fe55af 3777 case stop_memory:
fa9a63c5
RM
3778 p += 2;
3779 continue;
3780
3781
3782 default:
25fe55af
RS
3783 abort (); /* We have listed all the cases. */
3784 } /* switch *p++ */
fa9a63c5
RM
3785
3786 /* Getting here means we have found the possible starting
25fe55af
RS
3787 characters for one path of the pattern -- and that the empty
3788 string does not match. We need not follow this path further.
3789 Instead, look at the next alternative (remembered on the
3790 stack), or quit if no more. The test at the top of the loop
3791 does these things. */
fa9a63c5
RM
3792 path_can_be_null = false;
3793 p = pend;
3794 } /* while p */
3795
3796 /* Set `can_be_null' for the last path (also the first path, if the
25fe55af 3797 pattern is empty). */
fa9a63c5
RM
3798 bufp->can_be_null |= path_can_be_null;
3799
3800 done:
3801 RESET_FAIL_STACK ();
3802 return 0;
3803} /* re_compile_fastmap */
3804\f
3805/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3806 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3807 this memory for recording register information. STARTS and ENDS
3808 must be allocated using the malloc library routine, and must each
3809 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3810
3811 If NUM_REGS == 0, then subsequent matches should allocate their own
3812 register data.
3813
3814 Unless this function is called, the first search or match using
3815 PATTERN_BUFFER will allocate its own register data, without
3816 freeing the old data. */
3817
3818void
3819re_set_registers (bufp, regs, num_regs, starts, ends)
3820 struct re_pattern_buffer *bufp;
3821 struct re_registers *regs;
3822 unsigned num_regs;
3823 regoff_t *starts, *ends;
3824{
3825 if (num_regs)
3826 {
3827 bufp->regs_allocated = REGS_REALLOCATE;
3828 regs->num_regs = num_regs;
3829 regs->start = starts;
3830 regs->end = ends;
3831 }
3832 else
3833 {
3834 bufp->regs_allocated = REGS_UNALLOCATED;
3835 regs->num_regs = 0;
3836 regs->start = regs->end = (regoff_t *) 0;
3837 }
3838}
3839\f
25fe55af 3840/* Searching routines. */
fa9a63c5
RM
3841
3842/* Like re_search_2, below, but only one string is specified, and
3843 doesn't let you say where to stop matching. */
3844
3845int
3846re_search (bufp, string, size, startpos, range, regs)
3847 struct re_pattern_buffer *bufp;
3848 const char *string;
3849 int size, startpos, range;
3850 struct re_registers *regs;
3851{
5e69f11e 3852 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
fa9a63c5
RM
3853 regs, size);
3854}
3855
b18215fc
RS
3856/* End address of virtual concatenation of string. */
3857#define STOP_ADDR_VSTRING(P) \
3858 (((P) >= size1 ? string2 + size2 : string1 + size1))
3859
3860/* Address of POS in the concatenation of virtual string. */
3861#define POS_ADDR_VSTRING(POS) \
3862 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
fa9a63c5
RM
3863
3864/* Using the compiled pattern in BUFP->buffer, first tries to match the
3865 virtual concatenation of STRING1 and STRING2, starting first at index
3866 STARTPOS, then at STARTPOS + 1, and so on.
5e69f11e 3867
fa9a63c5 3868 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
5e69f11e 3869
fa9a63c5
RM
3870 RANGE is how far to scan while trying to match. RANGE = 0 means try
3871 only at STARTPOS; in general, the last start tried is STARTPOS +
3872 RANGE.
5e69f11e 3873
fa9a63c5
RM
3874 In REGS, return the indices of the virtual concatenation of STRING1
3875 and STRING2 that matched the entire BUFP->buffer and its contained
3876 subexpressions.
5e69f11e 3877
fa9a63c5
RM
3878 Do not consider matching one past the index STOP in the virtual
3879 concatenation of STRING1 and STRING2.
3880
3881 We return either the position in the strings at which the match was
3882 found, -1 if no match, or -2 if error (such as failure
3883 stack overflow). */
3884
3885int
3886re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3887 struct re_pattern_buffer *bufp;
3888 const char *string1, *string2;
3889 int size1, size2;
3890 int startpos;
3891 int range;
3892 struct re_registers *regs;
3893 int stop;
3894{
3895 int val;
3896 register char *fastmap = bufp->fastmap;
6676cb1c 3897 register RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5
RM
3898 int total_size = size1 + size2;
3899 int endpos = startpos + range;
c8499ba5 3900 int anchored_start = 0;
fa9a63c5 3901
25fe55af 3902 /* Nonzero if we have to concern multibyte character. */
b18215fc
RS
3903 int multibyte = bufp->multibyte;
3904
fa9a63c5
RM
3905 /* Check for out-of-range STARTPOS. */
3906 if (startpos < 0 || startpos > total_size)
3907 return -1;
5e69f11e 3908
fa9a63c5 3909 /* Fix up RANGE if it might eventually take us outside
34597fa9 3910 the virtual concatenation of STRING1 and STRING2.
5e69f11e 3911 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
34597fa9
RS
3912 if (endpos < 0)
3913 range = 0 - startpos;
fa9a63c5
RM
3914 else if (endpos > total_size)
3915 range = total_size - startpos;
3916
3917 /* If the search isn't to be a backwards one, don't waste time in a
7b140fd7 3918 search for a pattern anchored at beginning of buffer. */
fa9a63c5
RM
3919 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3920 {
3921 if (startpos > 0)
3922 return -1;
3923 else
7b140fd7 3924 range = 0;
fa9a63c5
RM
3925 }
3926
ae4788a8
RS
3927#ifdef emacs
3928 /* In a forward search for something that starts with \=.
3929 don't keep searching past point. */
3930 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3931 {
7b140fd7
RS
3932 range = PT_BYTE - BEGV_BYTE - startpos;
3933 if (range < 0)
ae4788a8
RS
3934 return -1;
3935 }
3936#endif /* emacs */
3937
fa9a63c5
RM
3938 /* Update the fastmap now if not correct already. */
3939 if (fastmap && !bufp->fastmap_accurate)
3940 if (re_compile_fastmap (bufp) == -2)
3941 return -2;
5e69f11e 3942
c8499ba5
RS
3943 /* See whether the pattern is anchored. */
3944 if (bufp->buffer[0] == begline)
3945 anchored_start = 1;
3946
b18215fc 3947#ifdef emacs
cc9b4df2
KH
3948 gl_state.object = re_match_object;
3949 {
7e95234e
RS
3950 int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
3951 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (startpos + adjpos);
cc9b4df2
KH
3952
3953 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3954 }
b18215fc
RS
3955#endif
3956
fa9a63c5
RM
3957 /* Loop through the string, looking for a place to start matching. */
3958 for (;;)
5e69f11e 3959 {
c8499ba5
RS
3960 /* If the pattern is anchored,
3961 skip quickly past places we cannot match.
3962 We don't bother to treat startpos == 0 specially
3963 because that case doesn't repeat. */
3964 if (anchored_start && startpos > 0)
3965 {
3966 if (! (bufp->newline_anchor
3967 && ((startpos <= size1 ? string1[startpos - 1]
3968 : string2[startpos - size1 - 1])
3969 == '\n')))
3970 goto advance;
3971 }
3972
fa9a63c5 3973 /* If a fastmap is supplied, skip quickly over characters that
25fe55af
RS
3974 cannot be the start of a match. If the pattern can match the
3975 null string, however, we don't need to skip characters; we want
3976 the first null string. */
fa9a63c5
RM
3977 if (fastmap && startpos < total_size && !bufp->can_be_null)
3978 {
e934739e
RS
3979 register const char *d;
3980 register unsigned int buf_ch;
3981
3982 d = POS_ADDR_VSTRING (startpos);
3983
25fe55af 3984 if (range > 0) /* Searching forwards. */
fa9a63c5 3985 {
fa9a63c5
RM
3986 register int lim = 0;
3987 int irange = range;
3988
25fe55af
RS
3989 if (startpos < size1 && startpos + range >= size1)
3990 lim = range - (size1 - startpos);
fa9a63c5 3991
25fe55af
RS
3992 /* Written out as an if-else to avoid testing `translate'
3993 inside the loop. */
28ae27ae
AS
3994 if (RE_TRANSLATE_P (translate))
3995 {
e934739e
RS
3996 if (multibyte)
3997 while (range > lim)
3998 {
3999 int buf_charlen;
4000
4001 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
4002 buf_charlen);
4003
4004 buf_ch = RE_TRANSLATE (translate, buf_ch);
4005 if (buf_ch >= 0400
4006 || fastmap[buf_ch])
4007 break;
4008
4009 range -= buf_charlen;
4010 d += buf_charlen;
4011 }
4012 else
4013 while (range > lim
4014 && !fastmap[(unsigned char)
33c46939
RS
4015 RE_TRANSLATE (translate, (unsigned char) *d)])
4016 {
4017 d++;
4018 range--;
4019 }
e934739e 4020 }
fa9a63c5 4021 else
33c46939
RS
4022 while (range > lim && !fastmap[(unsigned char) *d])
4023 {
4024 d++;
4025 range--;
4026 }
fa9a63c5
RM
4027
4028 startpos += irange - range;
4029 }
25fe55af 4030 else /* Searching backwards. */
fa9a63c5 4031 {
e934739e
RS
4032 int room = (size1 == 0 || startpos >= size1
4033 ? size2 + size1 - startpos
4034 : size1 - startpos);
4035
4036 buf_ch = STRING_CHAR (d, room);
28703c16 4037 if (RE_TRANSLATE_P (translate))
e934739e 4038 buf_ch = RE_TRANSLATE (translate, buf_ch);
fa9a63c5 4039
e934739e
RS
4040 if (! (buf_ch >= 0400
4041 || fastmap[buf_ch]))
fa9a63c5
RM
4042 goto advance;
4043 }
4044 }
4045
4046 /* If can't match the null string, and that's all we have left, fail. */
4047 if (range >= 0 && startpos == total_size && fastmap
25fe55af 4048 && !bufp->can_be_null)
fa9a63c5
RM
4049 return -1;
4050
4051 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4052 startpos, regs, stop);
4053#ifndef REGEX_MALLOC
4054#ifdef C_ALLOCA
4055 alloca (0);
4056#endif
4057#endif
4058
4059 if (val >= 0)
4060 return startpos;
5e69f11e 4061
fa9a63c5
RM
4062 if (val == -2)
4063 return -2;
4064
4065 advance:
5e69f11e 4066 if (!range)
25fe55af 4067 break;
5e69f11e 4068 else if (range > 0)
25fe55af 4069 {
b18215fc
RS
4070 /* Update STARTPOS to the next character boundary. */
4071 if (multibyte)
4072 {
b560c397
RS
4073 const unsigned char *p
4074 = (const unsigned char *) POS_ADDR_VSTRING (startpos);
4075 const unsigned char *pend
4076 = (const unsigned char *) STOP_ADDR_VSTRING (startpos);
b18215fc
RS
4077 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
4078
4079 range -= len;
4080 if (range < 0)
4081 break;
4082 startpos += len;
4083 }
4084 else
4085 {
b560c397
RS
4086 range--;
4087 startpos++;
4088 }
e318085a 4089 }
fa9a63c5 4090 else
25fe55af
RS
4091 {
4092 range++;
4093 startpos--;
b18215fc
RS
4094
4095 /* Update STARTPOS to the previous character boundary. */
4096 if (multibyte)
4097 {
b560c397
RS
4098 const unsigned char *p
4099 = (const unsigned char *) POS_ADDR_VSTRING (startpos);
b18215fc
RS
4100 int len = 0;
4101
4102 /* Find the head of multibyte form. */
5d967c7a 4103 while (!CHAR_HEAD_P (*p))
b18215fc
RS
4104 p--, len++;
4105
4106 /* Adjust it. */
4107#if 0 /* XXX */
4108 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
4109 ;
4110 else
4111#endif
4112 {
4113 range += len;
4114 if (range > 0)
4115 break;
4116
4117 startpos -= len;
4118 }
4119 }
25fe55af 4120 }
fa9a63c5
RM
4121 }
4122 return -1;
4123} /* re_search_2 */
4124\f
4125/* Declarations and macros for re_match_2. */
4126
4127static int bcmp_translate ();
4128static boolean alt_match_null_string_p (),
25fe55af
RS
4129 common_op_match_null_string_p (),
4130 group_match_null_string_p ();
fa9a63c5
RM
4131
4132/* This converts PTR, a pointer into one of the search strings `string1'
4133 and `string2' into an offset from the beginning of that string. */
4134#define POINTER_TO_OFFSET(ptr) \
4135 (FIRST_STRING_P (ptr) \
4136 ? ((regoff_t) ((ptr) - string1)) \
4137 : ((regoff_t) ((ptr) - string2 + size1)))
4138
4139/* Macros for dealing with the split strings in re_match_2. */
4140
4141#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4142
4143/* Call before fetching a character with *d. This switches over to
4144 string2 if necessary. */
4145#define PREFETCH() \
25fe55af 4146 while (d == dend) \
fa9a63c5
RM
4147 { \
4148 /* End of string2 => fail. */ \
25fe55af
RS
4149 if (dend == end_match_2) \
4150 goto fail; \
4151 /* End of string1 => advance to string2. */ \
4152 d = string2; \
fa9a63c5
RM
4153 dend = end_match_2; \
4154 }
4155
4156
4157/* Test if at very beginning or at very end of the virtual concatenation
25fe55af 4158 of `string1' and `string2'. If only one string, it's `string2'. */
fa9a63c5 4159#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5e69f11e 4160#define AT_STRINGS_END(d) ((d) == end2)
fa9a63c5
RM
4161
4162
4163/* Test if D points to a character which is word-constituent. We have
4164 two special cases to check for: if past the end of string1, look at
4165 the first character in string2; and if before the beginning of
4166 string2, look at the last character in string1. */
4167#define WORDCHAR_P(d) \
4168 (SYNTAX ((d) == end1 ? *string2 \
25fe55af 4169 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
fa9a63c5
RM
4170 == Sword)
4171
9121ca40 4172/* Disabled due to a compiler bug -- see comment at case wordbound */
b18215fc
RS
4173
4174/* The comment at case wordbound is following one, but we don't use
4175 AT_WORD_BOUNDARY anymore to support multibyte form.
4176
4177 The DEC Alpha C compiler 3.x generates incorrect code for the
25fe55af
RS
4178 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4179 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
b18215fc
RS
4180 macro and introducing temporary variables works around the bug. */
4181
9121ca40 4182#if 0
fa9a63c5
RM
4183/* Test if the character before D and the one at D differ with respect
4184 to being word-constituent. */
4185#define AT_WORD_BOUNDARY(d) \
4186 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4187 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
9121ca40 4188#endif
fa9a63c5
RM
4189
4190/* Free everything we malloc. */
4191#ifdef MATCH_MAY_ALLOCATE
00049484 4192#define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
fa9a63c5
RM
4193#define FREE_VARIABLES() \
4194 do { \
4195 REGEX_FREE_STACK (fail_stack.stack); \
4196 FREE_VAR (regstart); \
4197 FREE_VAR (regend); \
4198 FREE_VAR (old_regstart); \
4199 FREE_VAR (old_regend); \
4200 FREE_VAR (best_regstart); \
4201 FREE_VAR (best_regend); \
4202 FREE_VAR (reg_info); \
4203 FREE_VAR (reg_dummy); \
4204 FREE_VAR (reg_info_dummy); \
4205 } while (0)
4206#else
4207#define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4208#endif /* not MATCH_MAY_ALLOCATE */
4209
25fe55af 4210/* These values must meet several constraints. They must not be valid
fa9a63c5
RM
4211 register values; since we have a limit of 255 registers (because
4212 we use only one byte in the pattern for the register number), we can
25fe55af 4213 use numbers larger than 255. They must differ by 1, because of
fa9a63c5
RM
4214 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4215 be larger than the value for the highest register, so we do not try
25fe55af 4216 to actually save any registers when none are active. */
fa9a63c5
RM
4217#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4218#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4219\f
4220/* Matching routines. */
4221
25fe55af 4222#ifndef emacs /* Emacs never uses this. */
fa9a63c5
RM
4223/* re_match is like re_match_2 except it takes only a single string. */
4224
4225int
4226re_match (bufp, string, size, pos, regs)
4227 struct re_pattern_buffer *bufp;
4228 const char *string;
4229 int size, pos;
4230 struct re_registers *regs;
4231{
4232 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4233 pos, regs, size);
4234 alloca (0);
4235 return result;
4236}
4237#endif /* not emacs */
4238
b18215fc
RS
4239#ifdef emacs
4240/* In Emacs, this is the string or buffer in which we
25fe55af 4241 are matching. It is used for looking up syntax properties. */
b18215fc
RS
4242Lisp_Object re_match_object;
4243#endif
fa9a63c5
RM
4244
4245/* re_match_2 matches the compiled pattern in BUFP against the
4246 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4247 and SIZE2, respectively). We start matching at POS, and stop
4248 matching at STOP.
5e69f11e 4249
fa9a63c5 4250 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
25fe55af 4251 store offsets for the substring each group matched in REGS. See the
fa9a63c5
RM
4252 documentation for exactly how many groups we fill.
4253
4254 We return -1 if no match, -2 if an internal error (such as the
25fe55af 4255 failure stack overflowing). Otherwise, we return the length of the
fa9a63c5
RM
4256 matched substring. */
4257
4258int
4259re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4260 struct re_pattern_buffer *bufp;
4261 const char *string1, *string2;
4262 int size1, size2;
4263 int pos;
4264 struct re_registers *regs;
4265 int stop;
4266{
b18215fc 4267 int result;
25fe55af 4268
b18215fc 4269#ifdef emacs
cc9b4df2 4270 int charpos;
7e95234e 4271 int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
cc9b4df2 4272 gl_state.object = re_match_object;
7e95234e 4273 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos + adjpos);
cc9b4df2 4274 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
b18215fc
RS
4275#endif
4276
4277 result = re_match_2_internal (bufp, string1, size1, string2, size2,
cc9b4df2 4278 pos, regs, stop);
fa9a63c5
RM
4279 alloca (0);
4280 return result;
4281}
4282
4283/* This is a separate function so that we can force an alloca cleanup
25fe55af 4284 afterwards. */
fa9a63c5
RM
4285static int
4286re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4287 struct re_pattern_buffer *bufp;
4288 const char *string1, *string2;
4289 int size1, size2;
4290 int pos;
4291 struct re_registers *regs;
4292 int stop;
4293{
4294 /* General temporaries. */
4295 int mcnt;
4296 unsigned char *p1;
4297
4298 /* Just past the end of the corresponding string. */
4299 const char *end1, *end2;
4300
4301 /* Pointers into string1 and string2, just past the last characters in
25fe55af 4302 each to consider matching. */
fa9a63c5
RM
4303 const char *end_match_1, *end_match_2;
4304
4305 /* Where we are in the data, and the end of the current string. */
4306 const char *d, *dend;
5e69f11e 4307
fa9a63c5
RM
4308 /* Where we are in the pattern, and the end of the pattern. */
4309 unsigned char *p = bufp->buffer;
4310 register unsigned char *pend = p + bufp->used;
4311
4312 /* Mark the opcode just after a start_memory, so we can test for an
4313 empty subpattern when we get to the stop_memory. */
4314 unsigned char *just_past_start_mem = 0;
4315
25fe55af 4316 /* We use this to map every character in the string. */
6676cb1c 4317 RE_TRANSLATE_TYPE translate = bufp->translate;
fa9a63c5 4318
25fe55af 4319 /* Nonzero if we have to concern multibyte character. */
b18215fc
RS
4320 int multibyte = bufp->multibyte;
4321
fa9a63c5
RM
4322 /* Failure point stack. Each place that can handle a failure further
4323 down the line pushes a failure point on this stack. It consists of
4324 restart, regend, and reg_info for all registers corresponding to
4325 the subexpressions we're currently inside, plus the number of such
4326 registers, and, finally, two char *'s. The first char * is where
4327 to resume scanning the pattern; the second one is where to resume
4328 scanning the strings. If the latter is zero, the failure point is
4329 a ``dummy''; if a failure happens and the failure point is a dummy,
25fe55af
RS
4330 it gets discarded and the next next one is tried. */
4331#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fa9a63c5
RM
4332 fail_stack_type fail_stack;
4333#endif
4334#ifdef DEBUG
4335 static unsigned failure_id = 0;
4336 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4337#endif
4338
4339 /* This holds the pointer to the failure stack, when
4340 it is allocated relocatably. */
4341 fail_stack_elt_t *failure_stack_ptr;
4342
4343 /* We fill all the registers internally, independent of what we
25fe55af 4344 return, for use in backreferences. The number here includes
fa9a63c5
RM
4345 an element for register zero. */
4346 unsigned num_regs = bufp->re_nsub + 1;
5e69f11e 4347
fa9a63c5
RM
4348 /* The currently active registers. */
4349 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4350 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4351
4352 /* Information on the contents of registers. These are pointers into
4353 the input strings; they record just what was matched (on this
4354 attempt) by a subexpression part of the pattern, that is, the
4355 regnum-th regstart pointer points to where in the pattern we began
4356 matching and the regnum-th regend points to right after where we
4357 stopped matching the regnum-th subexpression. (The zeroth register
4358 keeps track of what the whole pattern matches.) */
4359#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4360 const char **regstart, **regend;
4361#endif
4362
4363 /* If a group that's operated upon by a repetition operator fails to
4364 match anything, then the register for its start will need to be
4365 restored because it will have been set to wherever in the string we
4366 are when we last see its open-group operator. Similarly for a
4367 register's end. */
4368#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4369 const char **old_regstart, **old_regend;
4370#endif
4371
4372 /* The is_active field of reg_info helps us keep track of which (possibly
4373 nested) subexpressions we are currently in. The matched_something
4374 field of reg_info[reg_num] helps us tell whether or not we have
4375 matched any of the pattern so far this time through the reg_num-th
4376 subexpression. These two fields get reset each time through any
25fe55af
RS
4377 loop their register is in. */
4378#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5e69f11e 4379 register_info_type *reg_info;
fa9a63c5
RM
4380#endif
4381
4382 /* The following record the register info as found in the above
5e69f11e 4383 variables when we find a match better than any we've seen before.
fa9a63c5
RM
4384 This happens as we backtrack through the failure points, which in
4385 turn happens only if we have not yet matched the entire string. */
4386 unsigned best_regs_set = false;
4387#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4388 const char **best_regstart, **best_regend;
4389#endif
5e69f11e 4390
fa9a63c5
RM
4391 /* Logically, this is `best_regend[0]'. But we don't want to have to
4392 allocate space for that if we're not allocating space for anything
25fe55af 4393 else (see below). Also, we never need info about register 0 for
fa9a63c5
RM
4394 any of the other register vectors, and it seems rather a kludge to
4395 treat `best_regend' differently than the rest. So we keep track of
4396 the end of the best match so far in a separate variable. We
4397 initialize this to NULL so that when we backtrack the first time
4398 and need to test it, it's not garbage. */
4399 const char *match_end = NULL;
4400
4401 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4402 int set_regs_matched_done = 0;
4403
4404 /* Used when we pop values we don't care about. */
4405#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4406 const char **reg_dummy;
4407 register_info_type *reg_info_dummy;
4408#endif
4409
4410#ifdef DEBUG
4411 /* Counts the total number of registers pushed. */
5e69f11e 4412 unsigned num_regs_pushed = 0;
fa9a63c5
RM
4413#endif
4414
4415 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5e69f11e 4416
fa9a63c5 4417 INIT_FAIL_STACK ();
5e69f11e 4418
fa9a63c5
RM
4419#ifdef MATCH_MAY_ALLOCATE
4420 /* Do not bother to initialize all the register variables if there are
4421 no groups in the pattern, as it takes a fair amount of time. If
4422 there are groups, we include space for register 0 (the whole
4423 pattern), even though we never use it, since it simplifies the
4424 array indexing. We should fix this. */
4425 if (bufp->re_nsub)
4426 {
4427 regstart = REGEX_TALLOC (num_regs, const char *);
4428 regend = REGEX_TALLOC (num_regs, const char *);
4429 old_regstart = REGEX_TALLOC (num_regs, const char *);
4430 old_regend = REGEX_TALLOC (num_regs, const char *);
4431 best_regstart = REGEX_TALLOC (num_regs, const char *);
4432 best_regend = REGEX_TALLOC (num_regs, const char *);
4433 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4434 reg_dummy = REGEX_TALLOC (num_regs, const char *);
4435 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4436
5e69f11e 4437 if (!(regstart && regend && old_regstart && old_regend && reg_info
25fe55af
RS
4438 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4439 {
4440 FREE_VARIABLES ();
4441 return -2;
4442 }
fa9a63c5
RM
4443 }
4444 else
4445 {
4446 /* We must initialize all our variables to NULL, so that
25fe55af 4447 `FREE_VARIABLES' doesn't try to free them. */
fa9a63c5 4448 regstart = regend = old_regstart = old_regend = best_regstart
25fe55af 4449 = best_regend = reg_dummy = NULL;
fa9a63c5
RM
4450 reg_info = reg_info_dummy = (register_info_type *) NULL;
4451 }
4452#endif /* MATCH_MAY_ALLOCATE */
4453
4454 /* The starting position is bogus. */
4455 if (pos < 0 || pos > size1 + size2)
4456 {
4457 FREE_VARIABLES ();
4458 return -1;
4459 }
5e69f11e 4460
fa9a63c5
RM
4461 /* Initialize subexpression text positions to -1 to mark ones that no
4462 start_memory/stop_memory has been seen for. Also initialize the
4463 register information struct. */
4464 for (mcnt = 1; mcnt < num_regs; mcnt++)
4465 {
5e69f11e 4466 regstart[mcnt] = regend[mcnt]
25fe55af 4467 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5e69f11e 4468
fa9a63c5
RM
4469 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4470 IS_ACTIVE (reg_info[mcnt]) = 0;
4471 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4472 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4473 }
5e69f11e 4474
fa9a63c5 4475 /* We move `string1' into `string2' if the latter's empty -- but not if
25fe55af 4476 `string1' is null. */
fa9a63c5
RM
4477 if (size2 == 0 && string1 != NULL)
4478 {
4479 string2 = string1;
4480 size2 = size1;
4481 string1 = 0;
4482 size1 = 0;
4483 }
4484 end1 = string1 + size1;
4485 end2 = string2 + size2;
4486
4487 /* Compute where to stop matching, within the two strings. */
4488 if (stop <= size1)
4489 {
4490 end_match_1 = string1 + stop;
4491 end_match_2 = string2;
4492 }
4493 else
4494 {
4495 end_match_1 = end1;
4496 end_match_2 = string2 + stop - size1;
4497 }
4498
5e69f11e 4499 /* `p' scans through the pattern as `d' scans through the data.
fa9a63c5
RM
4500 `dend' is the end of the input string that `d' points within. `d'
4501 is advanced into the following input string whenever necessary, but
4502 this happens before fetching; therefore, at the beginning of the
4503 loop, `d' can be pointing at the end of a string, but it cannot
4504 equal `string2'. */
4505 if (size1 > 0 && pos <= size1)
4506 {
4507 d = string1 + pos;
4508 dend = end_match_1;
4509 }
4510 else
4511 {
4512 d = string2 + pos - size1;
4513 dend = end_match_2;
4514 }
4515
4516 DEBUG_PRINT1 ("The compiled pattern is: ");
4517 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4518 DEBUG_PRINT1 ("The string to match is: `");
4519 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4520 DEBUG_PRINT1 ("'\n");
5e69f11e 4521
25fe55af 4522 /* This loops over pattern commands. It exits by returning from the
fa9a63c5
RM
4523 function if the match is complete, or it drops through if the match
4524 fails at this starting point in the input data. */
4525 for (;;)
4526 {
4527 DEBUG_PRINT2 ("\n0x%x: ", p);
4528
4529 if (p == pend)
4530 { /* End of pattern means we might have succeeded. */
25fe55af 4531 DEBUG_PRINT1 ("end of pattern ... ");
5e69f11e 4532
fa9a63c5 4533 /* If we haven't matched the entire string, and we want the
25fe55af
RS
4534 longest match, try backtracking. */
4535 if (d != end_match_2)
fa9a63c5
RM
4536 {
4537 /* 1 if this match ends in the same string (string1 or string2)
4538 as the best previous match. */
5e69f11e 4539 boolean same_str_p = (FIRST_STRING_P (match_end)
fa9a63c5
RM
4540 == MATCHING_IN_FIRST_STRING);
4541 /* 1 if this match is the best seen so far. */
4542 boolean best_match_p;
4543
4544 /* AIX compiler got confused when this was combined
25fe55af 4545 with the previous declaration. */
fa9a63c5
RM
4546 if (same_str_p)
4547 best_match_p = d > match_end;
4548 else
4549 best_match_p = !MATCHING_IN_FIRST_STRING;
4550
25fe55af
RS
4551 DEBUG_PRINT1 ("backtracking.\n");
4552
4553 if (!FAIL_STACK_EMPTY ())
4554 { /* More failure points to try. */
4555
4556 /* If exceeds best match so far, save it. */
4557 if (!best_regs_set || best_match_p)
4558 {
4559 best_regs_set = true;
4560 match_end = d;
4561
4562 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4563
4564 for (mcnt = 1; mcnt < num_regs; mcnt++)
4565 {
4566 best_regstart[mcnt] = regstart[mcnt];
4567 best_regend[mcnt] = regend[mcnt];
4568 }
4569 }
4570 goto fail;
4571 }
4572
4573 /* If no failure points, don't restore garbage. And if
4574 last match is real best match, don't restore second
4575 best one. */
4576 else if (best_regs_set && !best_match_p)
4577 {
4578 restore_best_regs:
4579 /* Restore best match. It may happen that `dend ==
4580 end_match_1' while the restored d is in string2.
4581 For example, the pattern `x.*y.*z' against the
4582 strings `x-' and `y-z-', if the two strings are
4583 not consecutive in memory. */
4584 DEBUG_PRINT1 ("Restoring best registers.\n");
4585
4586 d = match_end;
4587 dend = ((d >= string1 && d <= end1)
4588 ? end_match_1 : end_match_2);
fa9a63c5
RM
4589
4590 for (mcnt = 1; mcnt < num_regs; mcnt++)
4591 {
4592 regstart[mcnt] = best_regstart[mcnt];
4593 regend[mcnt] = best_regend[mcnt];
4594 }
25fe55af
RS
4595 }
4596 } /* d != end_match_2 */
fa9a63c5
RM
4597
4598 succeed_label:
25fe55af 4599 DEBUG_PRINT1 ("Accepting match.\n");
fa9a63c5 4600
25fe55af
RS
4601 /* If caller wants register contents data back, do it. */
4602 if (regs && !bufp->no_sub)
fa9a63c5 4603 {
25fe55af
RS
4604 /* Have the register data arrays been allocated? */
4605 if (bufp->regs_allocated == REGS_UNALLOCATED)
4606 { /* No. So allocate them with malloc. We need one
4607 extra element beyond `num_regs' for the `-1' marker
4608 GNU code uses. */
4609 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4610 regs->start = TALLOC (regs->num_regs, regoff_t);
4611 regs->end = TALLOC (regs->num_regs, regoff_t);
4612 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
4613 {
4614 FREE_VARIABLES ();
4615 return -2;
4616 }
25fe55af
RS
4617 bufp->regs_allocated = REGS_REALLOCATE;
4618 }
4619 else if (bufp->regs_allocated == REGS_REALLOCATE)
4620 { /* Yes. If we need more elements than were already
4621 allocated, reallocate them. If we need fewer, just
4622 leave it alone. */
4623 if (regs->num_regs < num_regs + 1)
4624 {
4625 regs->num_regs = num_regs + 1;
4626 RETALLOC (regs->start, regs->num_regs, regoff_t);
4627 RETALLOC (regs->end, regs->num_regs, regoff_t);
4628 if (regs->start == NULL || regs->end == NULL)
fa9a63c5
RM
4629 {
4630 FREE_VARIABLES ();
4631 return -2;
4632 }
25fe55af
RS
4633 }
4634 }
4635 else
fa9a63c5
RM
4636 {
4637 /* These braces fend off a "empty body in an else-statement"
25fe55af 4638 warning under GCC when assert expands to nothing. */
fa9a63c5
RM
4639 assert (bufp->regs_allocated == REGS_FIXED);
4640 }
4641
25fe55af
RS
4642 /* Convert the pointer data in `regstart' and `regend' to
4643 indices. Register zero has to be set differently,
4644 since we haven't kept track of any info for it. */
4645 if (regs->num_regs > 0)
4646 {
4647 regs->start[0] = pos;
4648 regs->end[0] = (MATCHING_IN_FIRST_STRING
fa9a63c5 4649 ? ((regoff_t) (d - string1))
25fe55af
RS
4650 : ((regoff_t) (d - string2 + size1)));
4651 }
5e69f11e 4652
25fe55af
RS
4653 /* Go through the first `min (num_regs, regs->num_regs)'
4654 registers, since that is all we initialized. */
fa9a63c5
RM
4655 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4656 {
25fe55af
RS
4657 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4658 regs->start[mcnt] = regs->end[mcnt] = -1;
4659 else
4660 {
fa9a63c5
RM
4661 regs->start[mcnt]
4662 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
25fe55af 4663 regs->end[mcnt]
fa9a63c5 4664 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
25fe55af 4665 }
fa9a63c5 4666 }
5e69f11e 4667
25fe55af
RS
4668 /* If the regs structure we return has more elements than
4669 were in the pattern, set the extra elements to -1. If
4670 we (re)allocated the registers, this is the case,
4671 because we always allocate enough to have at least one
4672 -1 at the end. */
4673 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4674 regs->start[mcnt] = regs->end[mcnt] = -1;
fa9a63c5
RM
4675 } /* regs && !bufp->no_sub */
4676
25fe55af
RS
4677 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4678 nfailure_points_pushed, nfailure_points_popped,
4679 nfailure_points_pushed - nfailure_points_popped);
4680 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
fa9a63c5 4681
25fe55af 4682 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
5e69f11e 4683 ? string1
fa9a63c5
RM
4684 : string2 - size1);
4685
25fe55af 4686 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
fa9a63c5 4687
25fe55af
RS
4688 FREE_VARIABLES ();
4689 return mcnt;
4690 }
fa9a63c5 4691
25fe55af 4692 /* Otherwise match next pattern command. */
fa9a63c5
RM
4693 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4694 {
25fe55af
RS
4695 /* Ignore these. Used to ignore the n of succeed_n's which
4696 currently have n == 0. */
4697 case no_op:
4698 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4699 break;
fa9a63c5
RM
4700
4701 case succeed:
25fe55af 4702 DEBUG_PRINT1 ("EXECUTING succeed.\n");
fa9a63c5
RM
4703 goto succeed_label;
4704
25fe55af
RS
4705 /* Match the next n pattern characters exactly. The following
4706 byte in the pattern defines n, and the n bytes after that
4707 are the characters to match. */
fa9a63c5
RM
4708 case exactn:
4709 mcnt = *p++;
25fe55af 4710 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
fa9a63c5 4711
25fe55af
RS
4712 /* This is written out as an if-else so we don't waste time
4713 testing `translate' inside the loop. */
28703c16 4714 if (RE_TRANSLATE_P (translate))
fa9a63c5 4715 {
e934739e
RS
4716#ifdef emacs
4717 if (multibyte)
4718 do
4719 {
4720 int pat_charlen, buf_charlen;
e71b1971 4721 unsigned int pat_ch, buf_ch;
e934739e
RS
4722
4723 PREFETCH ();
4724 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4725 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4726
4727 if (RE_TRANSLATE (translate, buf_ch)
4728 != pat_ch)
4729 goto fail;
4730
4731 p += pat_charlen;
4732 d += buf_charlen;
4733 mcnt -= pat_charlen;
4734 }
4735 while (mcnt > 0);
4736 else
4737#endif /* not emacs */
4738 do
4739 {
4740 PREFETCH ();
33c46939 4741 if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d)
e934739e
RS
4742 != (unsigned char) *p++)
4743 goto fail;
33c46939 4744 d++;
e934739e
RS
4745 }
4746 while (--mcnt);
fa9a63c5
RM
4747 }
4748 else
4749 {
4750 do
4751 {
4752 PREFETCH ();
4753 if (*d++ != (char) *p++) goto fail;
4754 }
4755 while (--mcnt);
4756 }
4757 SET_REGS_MATCHED ();
25fe55af 4758 break;
fa9a63c5
RM
4759
4760
25fe55af 4761 /* Match any character except possibly a newline or a null. */
fa9a63c5 4762 case anychar:
e934739e
RS
4763 {
4764 int buf_charlen;
e71b1971 4765 unsigned int buf_ch;
fa9a63c5 4766
e934739e 4767 DEBUG_PRINT1 ("EXECUTING anychar.\n");
fa9a63c5 4768
e934739e 4769 PREFETCH ();
fa9a63c5 4770
e934739e
RS
4771#ifdef emacs
4772 if (multibyte)
4773 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4774 else
4775#endif /* not emacs */
4776 {
85bc1d4e 4777 buf_ch = (unsigned char) *d;
e934739e
RS
4778 buf_charlen = 1;
4779 }
4780
4781 buf_ch = TRANSLATE (buf_ch);
4782
4783 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4784 && buf_ch == '\n')
4785 || ((bufp->syntax & RE_DOT_NOT_NULL)
4786 && buf_ch == '\000'))
4787 goto fail;
4788
4789 SET_REGS_MATCHED ();
4790 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4791 d += buf_charlen;
4792 }
fa9a63c5
RM
4793 break;
4794
4795
4796 case charset:
4797 case charset_not:
4798 {
b18215fc 4799 register unsigned int c;
fa9a63c5 4800 boolean not = (re_opcode_t) *(p - 1) == charset_not;
b18215fc
RS
4801 int len;
4802
4803 /* Start of actual range_table, or end of bitmap if there is no
4804 range table. */
4805 unsigned char *range_table;
4806
96cc36cc 4807 /* Nonzero if there is a range table. */
b18215fc
RS
4808 int range_table_exists;
4809
96cc36cc
RS
4810 /* Number of ranges of range table. This is not included
4811 in the initial byte-length of the command. */
4812 int count = 0;
fa9a63c5 4813
25fe55af 4814 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
fa9a63c5
RM
4815
4816 PREFETCH ();
b18215fc 4817 c = (unsigned char) *d;
fa9a63c5 4818
b18215fc 4819 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
96cc36cc
RS
4820
4821#ifdef emacs
b18215fc 4822 if (range_table_exists)
96cc36cc
RS
4823 {
4824 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4825 EXTRACT_NUMBER_AND_INCR (count, range_table);
4826 }
b18215fc
RS
4827
4828 if (multibyte && BASE_LEADING_CODE_P (c))
4829 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
96cc36cc 4830#endif /* emacs */
b18215fc
RS
4831
4832 if (SINGLE_BYTE_CHAR_P (c))
4833 { /* Lookup bitmap. */
4834 c = TRANSLATE (c); /* The character to match. */
4835 len = 1;
4836
4837 /* Cast to `unsigned' instead of `unsigned char' in
4838 case the bit list is a full 32 bytes long. */
4839 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
96cc36cc
RS
4840 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4841 not = !not;
b18215fc 4842 }
96cc36cc 4843#ifdef emacs
b18215fc 4844 else if (range_table_exists)
96cc36cc
RS
4845 {
4846 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
4847
4848 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4849 | (class_bits & BIT_ALPHA && ISALPHA (c))
f71b19b6 4850 | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
96cc36cc
RS
4851 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4852 | (class_bits & BIT_LOWER && ISLOWER (c))
f71b19b6
DL
4853 | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
4854 | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
96cc36cc
RS
4855 | (class_bits & BIT_PRINT && ISPRINT (c))
4856 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4857 | (class_bits & BIT_SPACE && ISSPACE (c))
f71b19b6 4858 | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
96cc36cc
RS
4859 | (class_bits & BIT_UPPER && ISUPPER (c))
4860 | (class_bits & BIT_WORD && ISWORD (c)))
4861 not = !not;
4862 else
4863 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4864 }
4865#endif /* emacs */
fa9a63c5 4866
96cc36cc
RS
4867 if (range_table_exists)
4868 p = CHARSET_RANGE_TABLE_END (range_table, count);
4869 else
4870 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
fa9a63c5
RM
4871
4872 if (!not) goto fail;
5e69f11e 4873
fa9a63c5 4874 SET_REGS_MATCHED ();
b18215fc 4875 d += len;
fa9a63c5
RM
4876 break;
4877 }
4878
4879
25fe55af
RS
4880 /* The beginning of a group is represented by start_memory.
4881 The arguments are the register number in the next byte, and the
4882 number of groups inner to this one in the next. The text
4883 matched within the group is recorded (in the internal
4884 registers data structure) under the register number. */
4885 case start_memory:
fa9a63c5
RM
4886 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4887
25fe55af 4888 /* Find out if this group can match the empty string. */
fa9a63c5 4889 p1 = p; /* To send to group_match_null_string_p. */
5e69f11e 4890
25fe55af
RS
4891 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4892 REG_MATCH_NULL_STRING_P (reg_info[*p])
4893 = group_match_null_string_p (&p1, pend, reg_info);
4894
4895 /* Save the position in the string where we were the last time
4896 we were at this open-group operator in case the group is
4897 operated upon by a repetition operator, e.g., with `(a*)*b'
4898 against `ab'; then we want to ignore where we are now in
4899 the string in case this attempt to match fails. */
4900 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4901 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4902 : regstart[*p];
5e69f11e 4903 DEBUG_PRINT2 (" old_regstart: %d\n",
fa9a63c5
RM
4904 POINTER_TO_OFFSET (old_regstart[*p]));
4905
25fe55af 4906 regstart[*p] = d;
fa9a63c5
RM
4907 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4908
25fe55af
RS
4909 IS_ACTIVE (reg_info[*p]) = 1;
4910 MATCHED_SOMETHING (reg_info[*p]) = 0;
fa9a63c5
RM
4911
4912 /* Clear this whenever we change the register activity status. */
4913 set_regs_matched_done = 0;
5e69f11e 4914
25fe55af
RS
4915 /* This is the new highest active register. */
4916 highest_active_reg = *p;
5e69f11e 4917
25fe55af
RS
4918 /* If nothing was active before, this is the new lowest active
4919 register. */
4920 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4921 lowest_active_reg = *p;
fa9a63c5 4922
25fe55af
RS
4923 /* Move past the register number and inner group count. */
4924 p += 2;
fa9a63c5
RM
4925 just_past_start_mem = p;
4926
25fe55af 4927 break;
fa9a63c5
RM
4928
4929
25fe55af
RS
4930 /* The stop_memory opcode represents the end of a group. Its
4931 arguments are the same as start_memory's: the register
4932 number, and the number of inner groups. */
fa9a63c5
RM
4933 case stop_memory:
4934 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
5e69f11e 4935
25fe55af
RS
4936 /* We need to save the string position the last time we were at
4937 this close-group operator in case the group is operated
4938 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4939 against `aba'; then we want to ignore where we are now in
4940 the string in case this attempt to match fails. */
4941 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4942 ? REG_UNSET (regend[*p]) ? d : regend[*p]
fa9a63c5 4943 : regend[*p];
5e69f11e 4944 DEBUG_PRINT2 (" old_regend: %d\n",
fa9a63c5
RM
4945 POINTER_TO_OFFSET (old_regend[*p]));
4946
25fe55af 4947 regend[*p] = d;
fa9a63c5
RM
4948 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4949
25fe55af
RS
4950 /* This register isn't active anymore. */
4951 IS_ACTIVE (reg_info[*p]) = 0;
fa9a63c5
RM
4952
4953 /* Clear this whenever we change the register activity status. */
4954 set_regs_matched_done = 0;
4955
25fe55af
RS
4956 /* If this was the only register active, nothing is active
4957 anymore. */
4958 if (lowest_active_reg == highest_active_reg)
4959 {
4960 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4961 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4962 }
4963 else
4964 { /* We must scan for the new highest active register, since
4965 it isn't necessarily one less than now: consider
4966 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4967 new highest active register is 1. */
4968 unsigned char r = *p - 1;
4969 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4970 r--;
4971
4972 /* If we end up at register zero, that means that we saved
4973 the registers as the result of an `on_failure_jump', not
4974 a `start_memory', and we jumped to past the innermost
4975 `stop_memory'. For example, in ((.)*) we save
4976 registers 1 and 2 as a result of the *, but when we pop
4977 back to the second ), we are at the stop_memory 1.
4978 Thus, nothing is active. */
fa9a63c5 4979 if (r == 0)
25fe55af
RS
4980 {
4981 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4982 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4983 }
4984 else
4985 highest_active_reg = r;
4986 }
4987
4988 /* If just failed to match something this time around with a
4989 group that's operated on by a repetition operator, try to
4990 force exit from the ``loop'', and restore the register
4991 information for this group that we had before trying this
4992 last match. */
4993 if ((!MATCHED_SOMETHING (reg_info[*p])
4994 || just_past_start_mem == p - 1)
5e69f11e 4995 && (p + 2) < pend)
25fe55af
RS
4996 {
4997 boolean is_a_jump_n = false;
4998
4999 p1 = p + 2;
5000 mcnt = 0;
5001 switch ((re_opcode_t) *p1++)
5002 {
5003 case jump_n:
fa9a63c5 5004 is_a_jump_n = true;
25fe55af 5005 case pop_failure_jump:
fa9a63c5
RM
5006 case maybe_pop_jump:
5007 case jump:
5008 case dummy_failure_jump:
25fe55af 5009 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
fa9a63c5
RM
5010 if (is_a_jump_n)
5011 p1 += 2;
25fe55af 5012 break;
5e69f11e 5013
25fe55af
RS
5014 default:
5015 /* do nothing */ ;
5016 }
fa9a63c5 5017 p1 += mcnt;
5e69f11e 5018
25fe55af
RS
5019 /* If the next operation is a jump backwards in the pattern
5020 to an on_failure_jump right before the start_memory
5021 corresponding to this stop_memory, exit from the loop
5022 by forcing a failure after pushing on the stack the
5023 on_failure_jump's jump in the pattern, and d. */
5024 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
5025 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
fa9a63c5 5026 {
25fe55af
RS
5027 /* If this group ever matched anything, then restore
5028 what its registers were before trying this last
5029 failed match, e.g., with `(a*)*b' against `ab' for
5030 regstart[1], and, e.g., with `((a*)*(b*)*)*'
5031 against `aba' for regend[3].
5e69f11e 5032
25fe55af
RS
5033 Also restore the registers for inner groups for,
5034 e.g., `((a*)(b*))*' against `aba' (register 3 would
5035 otherwise get trashed). */
5e69f11e 5036
25fe55af 5037 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
fa9a63c5 5038 {
5e69f11e
RM
5039 unsigned r;
5040
25fe55af 5041 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
5e69f11e 5042
fa9a63c5 5043 /* Restore this and inner groups' (if any) registers. */
25fe55af
RS
5044 for (r = *p; r < *p + *(p + 1); r++)
5045 {
5046 regstart[r] = old_regstart[r];
5047
5048 /* xx why this test? */
5049 if (old_regend[r] >= regstart[r])
5050 regend[r] = old_regend[r];
5051 }
5052 }
fa9a63c5 5053 p1++;
25fe55af
RS
5054 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5055 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
fa9a63c5 5056
25fe55af
RS
5057 goto fail;
5058 }
5059 }
5e69f11e 5060
25fe55af
RS
5061 /* Move past the register number and the inner group count. */
5062 p += 2;
5063 break;
fa9a63c5
RM
5064
5065
5066 /* \<digit> has been turned into a `duplicate' command which is
25fe55af
RS
5067 followed by the numeric value of <digit> as the register number. */
5068 case duplicate:
fa9a63c5
RM
5069 {
5070 register const char *d2, *dend2;
25fe55af 5071 int regno = *p++; /* Get which register to match against. */
fa9a63c5
RM
5072 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5073
25fe55af
RS
5074 /* Can't back reference a group which we've never matched. */
5075 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5076 goto fail;
5e69f11e 5077
25fe55af
RS
5078 /* Where in input to try to start matching. */
5079 d2 = regstart[regno];
5e69f11e 5080
25fe55af
RS
5081 /* Where to stop matching; if both the place to start and
5082 the place to stop matching are in the same string, then
5083 set to the place to stop, otherwise, for now have to use
5084 the end of the first string. */
fa9a63c5 5085
25fe55af 5086 dend2 = ((FIRST_STRING_P (regstart[regno])
fa9a63c5
RM
5087 == FIRST_STRING_P (regend[regno]))
5088 ? regend[regno] : end_match_1);
5089 for (;;)
5090 {
5091 /* If necessary, advance to next segment in register
25fe55af 5092 contents. */
fa9a63c5
RM
5093 while (d2 == dend2)
5094 {
5095 if (dend2 == end_match_2) break;
5096 if (dend2 == regend[regno]) break;
5097
25fe55af
RS
5098 /* End of string1 => advance to string2. */
5099 d2 = string2;
5100 dend2 = regend[regno];
fa9a63c5
RM
5101 }
5102 /* At end of register contents => success */
5103 if (d2 == dend2) break;
5104
5105 /* If necessary, advance to next segment in data. */
5106 PREFETCH ();
5107
5108 /* How many characters left in this segment to match. */
5109 mcnt = dend - d;
5e69f11e 5110
fa9a63c5 5111 /* Want how many consecutive characters we can match in
25fe55af
RS
5112 one shot, so, if necessary, adjust the count. */
5113 if (mcnt > dend2 - d2)
fa9a63c5 5114 mcnt = dend2 - d2;
5e69f11e 5115
fa9a63c5 5116 /* Compare that many; failure if mismatch, else move
25fe55af 5117 past them. */
28703c16 5118 if (RE_TRANSLATE_P (translate)
25fe55af
RS
5119 ? bcmp_translate (d, d2, mcnt, translate)
5120 : bcmp (d, d2, mcnt))
fa9a63c5
RM
5121 goto fail;
5122 d += mcnt, d2 += mcnt;
5123
25fe55af 5124 /* Do this because we've match some characters. */
fa9a63c5
RM
5125 SET_REGS_MATCHED ();
5126 }
5127 }
5128 break;
5129
5130
25fe55af
RS
5131 /* begline matches the empty string at the beginning of the string
5132 (unless `not_bol' is set in `bufp'), and, if
5133 `newline_anchor' is set, after newlines. */
fa9a63c5 5134 case begline:
25fe55af 5135 DEBUG_PRINT1 ("EXECUTING begline.\n");
5e69f11e 5136
25fe55af
RS
5137 if (AT_STRINGS_BEG (d))
5138 {
5139 if (!bufp->not_bol) break;
5140 }
5141 else if (d[-1] == '\n' && bufp->newline_anchor)
5142 {
5143 break;
5144 }
5145 /* In all other cases, we fail. */
5146 goto fail;
fa9a63c5
RM
5147
5148
25fe55af 5149 /* endline is the dual of begline. */
fa9a63c5 5150 case endline:
25fe55af 5151 DEBUG_PRINT1 ("EXECUTING endline.\n");
fa9a63c5 5152
25fe55af
RS
5153 if (AT_STRINGS_END (d))
5154 {
5155 if (!bufp->not_eol) break;
5156 }
5e69f11e 5157
25fe55af
RS
5158 /* We have to ``prefetch'' the next character. */
5159 else if ((d == end1 ? *string2 : *d) == '\n'
5160 && bufp->newline_anchor)
5161 {
5162 break;
5163 }
5164 goto fail;
fa9a63c5
RM
5165
5166
5167 /* Match at the very beginning of the data. */
25fe55af
RS
5168 case begbuf:
5169 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5170 if (AT_STRINGS_BEG (d))
5171 break;
5172 goto fail;
fa9a63c5
RM
5173
5174
5175 /* Match at the very end of the data. */
25fe55af
RS
5176 case endbuf:
5177 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
fa9a63c5
RM
5178 if (AT_STRINGS_END (d))
5179 break;
25fe55af 5180 goto fail;
5e69f11e 5181
5e69f11e 5182
25fe55af
RS
5183 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5184 pushes NULL as the value for the string on the stack. Then
5185 `pop_failure_point' will keep the current value for the
5186 string, instead of restoring it. To see why, consider
5187 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5188 then the . fails against the \n. But the next thing we want
5189 to do is match the \n against the \n; if we restored the
5190 string value, we would be back at the foo.
5191
5192 Because this is used only in specific cases, we don't need to
5193 check all the things that `on_failure_jump' does, to make
5194 sure the right things get saved on the stack. Hence we don't
5195 share its code. The only reason to push anything on the
5196 stack at all is that otherwise we would have to change
5197 `anychar's code to do something besides goto fail in this
5198 case; that seems worse than this. */
5199 case on_failure_keep_string_jump:
5200 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
fa9a63c5 5201
25fe55af
RS
5202 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5203 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
b18215fc 5204
25fe55af
RS
5205 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
5206 break;
fa9a63c5
RM
5207
5208
5209 /* Uses of on_failure_jump:
5e69f11e 5210
25fe55af
RS
5211 Each alternative starts with an on_failure_jump that points
5212 to the beginning of the next alternative. Each alternative
5213 except the last ends with a jump that in effect jumps past
5214 the rest of the alternatives. (They really jump to the
5215 ending jump of the following alternative, because tensioning
5216 these jumps is a hassle.)
fa9a63c5 5217
25fe55af
RS
5218 Repeats start with an on_failure_jump that points past both
5219 the repetition text and either the following jump or
5220 pop_failure_jump back to this on_failure_jump. */
fa9a63c5 5221 case on_failure_jump:
25fe55af
RS
5222 on_failure:
5223 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5224
27c3b45d
RS
5225#if defined (WINDOWSNT) && defined (emacs)
5226 QUIT;
5227#endif
5228
25fe55af
RS
5229 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5230 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
5231
5232 /* If this on_failure_jump comes right before a group (i.e.,
5233 the original * applied to a group), save the information
5234 for that group and all inner ones, so that if we fail back
5235 to this point, the group's information will be correct.
5236 For example, in \(a*\)*\1, we need the preceding group,
5237 and in \(zz\(a*\)b*\)\2, we need the inner group. */
5238
5239 /* We can't use `p' to check ahead because we push
5240 a failure point to `p + mcnt' after we do this. */
5241 p1 = p;
5242
5243 /* We need to skip no_op's before we look for the
5244 start_memory in case this on_failure_jump is happening as
5245 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5246 against aba. */
5247 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5248 p1++;
5249
5250 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5251 {
5252 /* We have a new highest active register now. This will
5253 get reset at the start_memory we are about to get to,
5254 but we will have saved all the registers relevant to
5255 this repetition op, as described above. */
5256 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5257 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5258 lowest_active_reg = *(p1 + 1);
5259 }
5260
5261 DEBUG_PRINT1 (":\n");
5262 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5263 break;
5264
5265
5266 /* A smart repeat ends with `maybe_pop_jump'.
5267 We change it to either `pop_failure_jump' or `jump'. */
5268 case maybe_pop_jump:
27c3b45d
RS
5269#if defined (WINDOWSNT) && defined (emacs)
5270 QUIT;
5271#endif
25fe55af
RS
5272 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5273 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5274 {
fa9a63c5
RM
5275 register unsigned char *p2 = p;
5276
25fe55af
RS
5277 /* Compare the beginning of the repeat with what in the
5278 pattern follows its end. If we can establish that there
5279 is nothing that they would both match, i.e., that we
5280 would have to backtrack because of (as in, e.g., `a*a')
5281 then we can change to pop_failure_jump, because we'll
5282 never have to backtrack.
5e69f11e 5283
25fe55af
RS
5284 This is not true in the case of alternatives: in
5285 `(a|ab)*' we do need to backtrack to the `ab' alternative
5286 (e.g., if the string was `ab'). But instead of trying to
5287 detect that here, the alternative has put on a dummy
5288 failure point which is what we will end up popping. */
fa9a63c5
RM
5289
5290 /* Skip over open/close-group commands.
5291 If what follows this loop is a ...+ construct,
5292 look at what begins its body, since we will have to
5293 match at least one of that. */
5294 while (1)
5295 {
5296 if (p2 + 2 < pend
5297 && ((re_opcode_t) *p2 == stop_memory
5298 || (re_opcode_t) *p2 == start_memory))
5299 p2 += 3;
5300 else if (p2 + 6 < pend
5301 && (re_opcode_t) *p2 == dummy_failure_jump)
5302 p2 += 6;
5303 else
5304 break;
5305 }
5306
5307 p1 = p + mcnt;
5308 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5e69f11e 5309 to the `maybe_finalize_jump' of this case. Examine what
25fe55af 5310 follows. */
fa9a63c5 5311
25fe55af
RS
5312 /* If we're at the end of the pattern, we can change. */
5313 if (p2 == pend)
fa9a63c5
RM
5314 {
5315 /* Consider what happens when matching ":\(.*\)"
5316 against ":/". I don't really understand this code
25fe55af
RS
5317 yet. */
5318 p[-3] = (unsigned char) pop_failure_jump;
5319 DEBUG_PRINT1
5320 (" End of pattern: change to `pop_failure_jump'.\n");
5321 }
fa9a63c5 5322
25fe55af 5323 else if ((re_opcode_t) *p2 == exactn
fa9a63c5
RM
5324 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5325 {
b18215fc 5326 register unsigned int c
25fe55af 5327 = *p2 == (unsigned char) endline ? '\n' : p2[2];
fa9a63c5 5328
b18215fc 5329 if ((re_opcode_t) p1[3] == exactn)
e318085a 5330 {
b18215fc
RS
5331 if (!(multibyte /* && (c != '\n') */
5332 && BASE_LEADING_CODE_P (c))
5333 ? c != p1[5]
5334 : (STRING_CHAR (&p2[2], pend - &p2[2])
5335 != STRING_CHAR (&p1[5], pend - &p1[5])))
25fe55af
RS
5336 {
5337 p[-3] = (unsigned char) pop_failure_jump;
5338 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5339 c, p1[5]);
5340 }
e318085a 5341 }
5e69f11e 5342
fa9a63c5
RM
5343 else if ((re_opcode_t) p1[3] == charset
5344 || (re_opcode_t) p1[3] == charset_not)
5345 {
5346 int not = (re_opcode_t) p1[3] == charset_not;
5e69f11e 5347
b18215fc
RS
5348 if (multibyte /* && (c != '\n') */
5349 && BASE_LEADING_CODE_P (c))
5350 c = STRING_CHAR (&p2[2], pend - &p2[2]);
5351
5352 /* Test if C is listed in charset (or charset_not)
5353 at `&p1[3]'. */
5354 if (SINGLE_BYTE_CHAR_P (c))
5355 {
5356 if (c < CHARSET_BITMAP_SIZE (&p1[3]) * BYTEWIDTH
fa9a63c5
RM
5357 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5358 not = !not;
b18215fc
RS
5359 }
5360 else if (CHARSET_RANGE_TABLE_EXISTS_P (&p1[3]))
5361 CHARSET_LOOKUP_RANGE_TABLE (not, c, &p1[3]);
fa9a63c5 5362
25fe55af
RS
5363 /* `not' is equal to 1 if c would match, which means
5364 that we can't change to pop_failure_jump. */
fa9a63c5 5365 if (!not)
25fe55af
RS
5366 {
5367 p[-3] = (unsigned char) pop_failure_jump;
5368 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5369 }
fa9a63c5
RM
5370 }
5371 }
25fe55af 5372 else if ((re_opcode_t) *p2 == charset)
fa9a63c5 5373 {
b18215fc 5374 if ((re_opcode_t) p1[3] == exactn)
e318085a 5375 {
b18215fc
RS
5376 register unsigned int c = p1[5];
5377 int not = 0;
5378
5379 if (multibyte && BASE_LEADING_CODE_P (c))
5380 c = STRING_CHAR (&p1[5], pend - &p1[5]);
5381
25fe55af 5382 /* Test if C is listed in charset at `p2'. */
b18215fc
RS
5383 if (SINGLE_BYTE_CHAR_P (c))
5384 {
5385 if (c < CHARSET_BITMAP_SIZE (p2) * BYTEWIDTH
5386 && (p2[2 + c / BYTEWIDTH]
5387 & (1 << (c % BYTEWIDTH))))
5388 not = !not;
5389 }
5390 else if (CHARSET_RANGE_TABLE_EXISTS_P (p2))
5391 CHARSET_LOOKUP_RANGE_TABLE (not, c, p2);
5392
5393 if (!not)
25fe55af
RS
5394 {
5395 p[-3] = (unsigned char) pop_failure_jump;
5396 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
b18215fc 5397 }
25fe55af 5398 }
5e69f11e 5399
b18215fc
RS
5400 /* It is hard to list up all the character in charset
5401 P2 if it includes multibyte character. Give up in
5402 such case. */
5403 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
5404 {
5405 /* Now, we are sure that P2 has no range table.
5406 So, for the size of bitmap in P2, `p2[1]' is
25fe55af 5407 enough. But P1 may have range table, so the
b18215fc
RS
5408 size of bitmap table of P1 is extracted by
5409 using macro `CHARSET_BITMAP_SIZE'.
5410
5411 Since we know that all the character listed in
5412 P2 is ASCII, it is enough to test only bitmap
5413 table of P1. */
25fe55af 5414
b18215fc 5415 if ((re_opcode_t) p1[3] == charset_not)
fa9a63c5
RM
5416 {
5417 int idx;
b18215fc 5418 /* We win if the charset_not inside the loop lists
25fe55af 5419 every character listed in the charset after. */
fa9a63c5
RM
5420 for (idx = 0; idx < (int) p2[1]; idx++)
5421 if (! (p2[2 + idx] == 0
b18215fc 5422 || (idx < CHARSET_BITMAP_SIZE (&p1[3])
fa9a63c5
RM
5423 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5424 break;
5425
5426 if (idx == p2[1])
25fe55af
RS
5427 {
5428 p[-3] = (unsigned char) pop_failure_jump;
5429 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5430 }
fa9a63c5
RM
5431 }
5432 else if ((re_opcode_t) p1[3] == charset)
5433 {
5434 int idx;
5435 /* We win if the charset inside the loop
5436 has no overlap with the one after the loop. */
5437 for (idx = 0;
b18215fc
RS
5438 (idx < (int) p2[1]
5439 && idx < CHARSET_BITMAP_SIZE (&p1[3]));
fa9a63c5
RM
5440 idx++)
5441 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5442 break;
5443
b18215fc
RS
5444 if (idx == p2[1]
5445 || idx == CHARSET_BITMAP_SIZE (&p1[3]))
25fe55af
RS
5446 {
5447 p[-3] = (unsigned char) pop_failure_jump;
5448 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5449 }
fa9a63c5
RM
5450 }
5451 }
5452 }
b18215fc 5453 }
fa9a63c5
RM
5454 p -= 2; /* Point at relative address again. */
5455 if ((re_opcode_t) p[-1] != pop_failure_jump)
5456 {
5457 p[-1] = (unsigned char) jump;
25fe55af 5458 DEBUG_PRINT1 (" Match => jump.\n");
fa9a63c5
RM
5459 goto unconditional_jump;
5460 }
25fe55af 5461 /* Note fall through. */
fa9a63c5
RM
5462
5463
5464 /* The end of a simple repeat has a pop_failure_jump back to
25fe55af
RS
5465 its matching on_failure_jump, where the latter will push a
5466 failure point. The pop_failure_jump takes off failure
5467 points put on by this pop_failure_jump's matching
5468 on_failure_jump; we got through the pattern to here from the
5469 matching on_failure_jump, so didn't fail. */
5470 case pop_failure_jump:
5471 {
5472 /* We need to pass separate storage for the lowest and
5473 highest registers, even though we don't care about the
5474 actual values. Otherwise, we will restore only one
5475 register from the stack, since lowest will == highest in
5476 `pop_failure_point'. */
5477 unsigned dummy_low_reg, dummy_high_reg;
5478 unsigned char *pdummy;
5479 const char *sdummy;
5480
5481 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5482 POP_FAILURE_POINT (sdummy, pdummy,
5483 dummy_low_reg, dummy_high_reg,
5484 reg_dummy, reg_dummy, reg_info_dummy);
5485 }
5486 /* Note fall through. */
5487
5488
5489 /* Unconditionally jump (without popping any failure points). */
5490 case jump:
fa9a63c5 5491 unconditional_jump:
27c3b45d
RS
5492#if defined (WINDOWSNT) && defined (emacs)
5493 QUIT;
5494#endif
fa9a63c5 5495 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
25fe55af
RS
5496 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5497 p += mcnt; /* Do the jump. */
5498 DEBUG_PRINT2 ("(to 0x%x).\n", p);
5499 break;
5500
5501
5502 /* We need this opcode so we can detect where alternatives end
5503 in `group_match_null_string_p' et al. */
5504 case jump_past_alt:
5505 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5506 goto unconditional_jump;
5507
5508
5509 /* Normally, the on_failure_jump pushes a failure point, which
5510 then gets popped at pop_failure_jump. We will end up at
5511 pop_failure_jump, also, and with a pattern of, say, `a+', we
5512 are skipping over the on_failure_jump, so we have to push
5513 something meaningless for pop_failure_jump to pop. */
5514 case dummy_failure_jump:
5515 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5516 /* It doesn't matter what we push for the string here. What
5517 the code at `fail' tests is the value for the pattern. */
5518 PUSH_FAILURE_POINT (0, 0, -2);
5519 goto unconditional_jump;
5520
5521
5522 /* At the end of an alternative, we need to push a dummy failure
5523 point in case we are followed by a `pop_failure_jump', because
5524 we don't want the failure point for the alternative to be
5525 popped. For example, matching `(a|ab)*' against `aab'
5526 requires that we match the `ab' alternative. */
5527 case push_dummy_failure:
5528 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5529 /* See comments just above at `dummy_failure_jump' about the
5530 two zeroes. */
5531 PUSH_FAILURE_POINT (0, 0, -2);
fa9a63c5
RM
5532 break;
5533
25fe55af
RS
5534 /* Have to succeed matching what follows at least n times.
5535 After that, handle like `on_failure_jump'. */
5536 case succeed_n:
5537 EXTRACT_NUMBER (mcnt, p + 2);
5538 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5e69f11e 5539
25fe55af
RS
5540 assert (mcnt >= 0);
5541 /* Originally, this is how many times we HAVE to succeed. */
5542 if (mcnt > 0)
5543 {
5544 mcnt--;
fa9a63c5 5545 p += 2;
25fe55af
RS
5546 STORE_NUMBER_AND_INCR (p, mcnt);
5547 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
5548 }
fa9a63c5 5549 else if (mcnt == 0)
25fe55af
RS
5550 {
5551 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
fa9a63c5 5552 p[2] = (unsigned char) no_op;
25fe55af
RS
5553 p[3] = (unsigned char) no_op;
5554 goto on_failure;
5555 }
5556 break;
5557
5558 case jump_n:
5559 EXTRACT_NUMBER (mcnt, p + 2);
5560 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5561
5562 /* Originally, this is how many times we CAN jump. */
5563 if (mcnt)
5564 {
5565 mcnt--;
5566 STORE_NUMBER (p + 2, mcnt);
5e69f11e 5567 goto unconditional_jump;
25fe55af
RS
5568 }
5569 /* If don't have to jump any more, skip over the rest of command. */
5e69f11e
RM
5570 else
5571 p += 4;
25fe55af 5572 break;
5e69f11e 5573
fa9a63c5
RM
5574 case set_number_at:
5575 {
25fe55af 5576 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
fa9a63c5 5577
25fe55af
RS
5578 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5579 p1 = p + mcnt;
5580 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5581 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
fa9a63c5 5582 STORE_NUMBER (p1, mcnt);
25fe55af
RS
5583 break;
5584 }
9121ca40
KH
5585
5586 case wordbound:
5587 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
fa9a63c5 5588
b18215fc 5589 /* We SUCCEED in one of the following cases: */
9121ca40 5590
b18215fc 5591 /* Case 1: D is at the beginning or the end of string. */
9121ca40
KH
5592 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5593 break;
b18215fc
RS
5594 else
5595 {
5596 /* C1 is the character before D, S1 is the syntax of C1, C2
5597 is the character at D, and S2 is the syntax of C2. */
5598 int c1, c2, s1, s2;
5599 int pos1 = PTR_TO_OFFSET (d - 1);
5d967c7a 5600 int charpos;
9121ca40 5601
b18215fc
RS
5602 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5603 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5604#ifdef emacs
7e95234e 5605 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5d967c7a 5606 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5607#endif
b18215fc
RS
5608 s1 = SYNTAX (c1);
5609#ifdef emacs
5d967c7a 5610 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5611#endif
b18215fc
RS
5612 s2 = SYNTAX (c2);
5613
5614 if (/* Case 2: Only one of S1 and S2 is Sword. */
5615 ((s1 == Sword) != (s2 == Sword))
5616 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5617 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5618 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
9121ca40 5619 break;
9121ca40 5620 }
b18215fc 5621 goto fail;
9121ca40
KH
5622
5623 case notwordbound:
9121ca40 5624 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
b18215fc
RS
5625
5626 /* We FAIL in one of the following cases: */
5627
5628 /* Case 1: D is at the beginning or the end of string. */
9121ca40
KH
5629 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5630 goto fail;
b18215fc
RS
5631 else
5632 {
5633 /* C1 is the character before D, S1 is the syntax of C1, C2
5634 is the character at D, and S2 is the syntax of C2. */
5635 int c1, c2, s1, s2;
5636 int pos1 = PTR_TO_OFFSET (d - 1);
5d967c7a 5637 int charpos;
9121ca40 5638
b18215fc
RS
5639 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5640 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5641#ifdef emacs
92432794 5642 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5d967c7a 5643 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5644#endif
b18215fc
RS
5645 s1 = SYNTAX (c1);
5646#ifdef emacs
5d967c7a 5647 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
25fe55af 5648#endif
b18215fc
RS
5649 s2 = SYNTAX (c2);
5650
5651 if (/* Case 2: Only one of S1 and S2 is Sword. */
5652 ((s1 == Sword) != (s2 == Sword))
5653 /* Case 3: Both of S1 and S2 are Sword, and macro
25fe55af 5654 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
b18215fc 5655 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
9121ca40 5656 goto fail;
9121ca40 5657 }
b18215fc 5658 break;
fa9a63c5
RM
5659
5660 case wordbeg:
25fe55af 5661 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
fa9a63c5 5662
b18215fc
RS
5663 /* We FAIL in one of the following cases: */
5664
25fe55af 5665 /* Case 1: D is at the end of string. */
b18215fc 5666 if (AT_STRINGS_END (d))
25fe55af 5667 goto fail;
b18215fc
RS
5668 else
5669 {
5670 /* C1 is the character before D, S1 is the syntax of C1, C2
5671 is the character at D, and S2 is the syntax of C2. */
5672 int c1, c2, s1, s2;
5673 int pos1 = PTR_TO_OFFSET (d);
5d967c7a 5674 int charpos;
fa9a63c5 5675
b18215fc 5676 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
fa9a63c5 5677#ifdef emacs
92432794
RS
5678 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5679 UPDATE_SYNTAX_TABLE (charpos);
25fe55af 5680#endif
b18215fc 5681 s2 = SYNTAX (c2);
25fe55af 5682
b18215fc
RS
5683 /* Case 2: S2 is not Sword. */
5684 if (s2 != Sword)
5685 goto fail;
5686
5687 /* Case 3: D is not at the beginning of string ... */
5688 if (!AT_STRINGS_BEG (d))
5689 {
5690 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5691#ifdef emacs
5d967c7a 5692 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
25fe55af 5693#endif
b18215fc
RS
5694 s1 = SYNTAX (c1);
5695
5696 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5697 returns 0. */
b18215fc
RS
5698 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5699 goto fail;
5700 }
5701 }
e318085a
RS
5702 break;
5703
b18215fc 5704 case wordend:
25fe55af 5705 DEBUG_PRINT1 ("EXECUTING wordend.\n");
b18215fc
RS
5706
5707 /* We FAIL in one of the following cases: */
5708
5709 /* Case 1: D is at the beginning of string. */
5710 if (AT_STRINGS_BEG (d))
e318085a 5711 goto fail;
b18215fc
RS
5712 else
5713 {
5714 /* C1 is the character before D, S1 is the syntax of C1, C2
5715 is the character at D, and S2 is the syntax of C2. */
5716 int c1, c2, s1, s2;
5d967c7a
RS
5717 int pos1 = PTR_TO_OFFSET (d);
5718 int charpos;
b18215fc
RS
5719
5720 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5d967c7a 5721#ifdef emacs
92432794
RS
5722 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1 - 1);
5723 UPDATE_SYNTAX_TABLE (charpos);
5d967c7a 5724#endif
b18215fc
RS
5725 s1 = SYNTAX (c1);
5726
5727 /* Case 2: S1 is not Sword. */
5728 if (s1 != Sword)
5729 goto fail;
5730
5731 /* Case 3: D is not at the end of string ... */
5732 if (!AT_STRINGS_END (d))
5733 {
5734 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5d967c7a
RS
5735#ifdef emacs
5736 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5737#endif
b18215fc
RS
5738 s2 = SYNTAX (c2);
5739
5740 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
25fe55af 5741 returns 0. */
b18215fc 5742 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
25fe55af 5743 goto fail;
b18215fc
RS
5744 }
5745 }
e318085a
RS
5746 break;
5747
b18215fc 5748#ifdef emacs
25fe55af
RS
5749 case before_dot:
5750 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5d967c7a 5751 if (PTR_BYTE_POS ((unsigned char *) d) >= PT_BYTE)
25fe55af
RS
5752 goto fail;
5753 break;
b18215fc 5754
25fe55af
RS
5755 case at_dot:
5756 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5d967c7a 5757 if (PTR_BYTE_POS ((unsigned char *) d) != PT_BYTE)
25fe55af
RS
5758 goto fail;
5759 break;
b18215fc 5760
25fe55af
RS
5761 case after_dot:
5762 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5d967c7a 5763 if (PTR_BYTE_POS ((unsigned char *) d) <= PT_BYTE)
25fe55af
RS
5764 goto fail;
5765 break;
fa9a63c5
RM
5766
5767 case syntaxspec:
25fe55af 5768 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
fa9a63c5
RM
5769 mcnt = *p++;
5770 goto matchsyntax;
5771
25fe55af
RS
5772 case wordchar:
5773 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
fa9a63c5 5774 mcnt = (int) Sword;
25fe55af 5775 matchsyntax:
fa9a63c5 5776 PREFETCH ();
b18215fc
RS
5777#ifdef emacs
5778 {
92432794 5779 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
b18215fc
RS
5780 UPDATE_SYNTAX_TABLE (pos1);
5781 }
25fe55af 5782#endif
b18215fc
RS
5783 {
5784 int c, len;
5785
5786 if (multibyte)
5787 /* we must concern about multibyte form, ... */
5788 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5789 else
5790 /* everything should be handled as ASCII, even though it
5791 looks like multibyte form. */
5792 c = *d, len = 1;
5793
5794 if (SYNTAX (c) != (enum syntaxcode) mcnt)
fa9a63c5 5795 goto fail;
b18215fc
RS
5796 d += len;
5797 }
25fe55af 5798 SET_REGS_MATCHED ();
fa9a63c5
RM
5799 break;
5800
5801 case notsyntaxspec:
25fe55af 5802 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
fa9a63c5
RM
5803 mcnt = *p++;
5804 goto matchnotsyntax;
5805
25fe55af
RS
5806 case notwordchar:
5807 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
fa9a63c5 5808 mcnt = (int) Sword;
25fe55af 5809 matchnotsyntax:
fa9a63c5 5810 PREFETCH ();
b18215fc
RS
5811#ifdef emacs
5812 {
92432794 5813 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
b18215fc
RS
5814 UPDATE_SYNTAX_TABLE (pos1);
5815 }
25fe55af 5816#endif
b18215fc
RS
5817 {
5818 int c, len;
5819
5820 if (multibyte)
5821 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5822 else
5823 c = *d, len = 1;
5824
5825 if (SYNTAX (c) == (enum syntaxcode) mcnt)
fa9a63c5 5826 goto fail;
b18215fc
RS
5827 d += len;
5828 }
5829 SET_REGS_MATCHED ();
5830 break;
5831
5832 case categoryspec:
5833 DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p);
5834 mcnt = *p++;
5835 PREFETCH ();
5836 {
5837 int c, len;
5838
5839 if (multibyte)
5840 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5841 else
5842 c = *d, len = 1;
5843
5844 if (!CHAR_HAS_CATEGORY (c, mcnt))
5845 goto fail;
5846 d += len;
5847 }
fa9a63c5 5848 SET_REGS_MATCHED ();
e318085a 5849 break;
fa9a63c5 5850
b18215fc
RS
5851 case notcategoryspec:
5852 DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p);
5853 mcnt = *p++;
5854 PREFETCH ();
5855 {
5856 int c, len;
5857
5858 if (multibyte)
5859 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5860 else
5861 c = *d, len = 1;
5862
5863 if (CHAR_HAS_CATEGORY (c, mcnt))
5864 goto fail;
5865 d += len;
5866 }
5867 SET_REGS_MATCHED ();
5868 break;
5869
fa9a63c5
RM
5870#else /* not emacs */
5871 case wordchar:
b18215fc 5872 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
fa9a63c5 5873 PREFETCH ();
b18215fc
RS
5874 if (!WORDCHAR_P (d))
5875 goto fail;
fa9a63c5 5876 SET_REGS_MATCHED ();
b18215fc 5877 d++;
fa9a63c5 5878 break;
5e69f11e 5879
fa9a63c5 5880 case notwordchar:
b18215fc 5881 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
fa9a63c5
RM
5882 PREFETCH ();
5883 if (WORDCHAR_P (d))
b18215fc
RS
5884 goto fail;
5885 SET_REGS_MATCHED ();
5886 d++;
fa9a63c5
RM
5887 break;
5888#endif /* not emacs */
5e69f11e 5889
b18215fc
RS
5890 default:
5891 abort ();
fa9a63c5 5892 }
b18215fc 5893 continue; /* Successfully executed one pattern command; keep going. */
fa9a63c5
RM
5894
5895
5896 /* We goto here if a matching operation fails. */
5897 fail:
27c3b45d
RS
5898#if defined (WINDOWSNT) && defined (emacs)
5899 QUIT;
5900#endif
fa9a63c5 5901 if (!FAIL_STACK_EMPTY ())
b18215fc
RS
5902 { /* A restart point is known. Restore to that state. */
5903 DEBUG_PRINT1 ("\nFAIL:\n");
5904 POP_FAILURE_POINT (d, p,
5905 lowest_active_reg, highest_active_reg,
5906 regstart, regend, reg_info);
5907
5908 /* If this failure point is a dummy, try the next one. */
5909 if (!p)
fa9a63c5
RM
5910 goto fail;
5911
b18215fc 5912 /* If we failed to the end of the pattern, don't examine *p. */
fa9a63c5 5913 assert (p <= pend);
b18215fc
RS
5914 if (p < pend)
5915 {
5916 boolean is_a_jump_n = false;
5917
5918 /* If failed to a backwards jump that's part of a repetition
5919 loop, need to pop this failure point and use the next one. */
5920 switch ((re_opcode_t) *p)
5921 {
5922 case jump_n:
5923 is_a_jump_n = true;
5924 case maybe_pop_jump:
5925 case pop_failure_jump:
5926 case jump:
5927 p1 = p + 1;
5928 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5929 p1 += mcnt;
5930
5931 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5932 || (!is_a_jump_n
5933 && (re_opcode_t) *p1 == on_failure_jump))
5934 goto fail;
5935 break;
5936 default:
5937 /* do nothing */ ;
5938 }
5939 }
5940
5941 if (d >= string1 && d <= end1)
fa9a63c5 5942 dend = end_match_1;
b18215fc 5943 }
fa9a63c5 5944 else
b18215fc 5945 break; /* Matching at this starting point really fails. */
fa9a63c5
RM
5946 } /* for (;;) */
5947
5948 if (best_regs_set)
5949 goto restore_best_regs;
5950
5951 FREE_VARIABLES ();
5952
b18215fc 5953 return -1; /* Failure to match. */
fa9a63c5
RM
5954} /* re_match_2 */
5955\f
5956/* Subroutine definitions for re_match_2. */
5957
5958
5959/* We are passed P pointing to a register number after a start_memory.
5e69f11e 5960
fa9a63c5
RM
5961 Return true if the pattern up to the corresponding stop_memory can
5962 match the empty string, and false otherwise.
5e69f11e 5963
fa9a63c5
RM
5964 If we find the matching stop_memory, sets P to point to one past its number.
5965 Otherwise, sets P to an undefined byte less than or equal to END.
5966
5967 We don't handle duplicates properly (yet). */
5968
5969static boolean
5970group_match_null_string_p (p, end, reg_info)
5971 unsigned char **p, *end;
5972 register_info_type *reg_info;
5973{
5974 int mcnt;
5975 /* Point to after the args to the start_memory. */
5976 unsigned char *p1 = *p + 2;
5e69f11e 5977
fa9a63c5
RM
5978 while (p1 < end)
5979 {
5980 /* Skip over opcodes that can match nothing, and return true or
5981 false, as appropriate, when we get to one that can't, or to the
b18215fc 5982 matching stop_memory. */
5e69f11e 5983
fa9a63c5 5984 switch ((re_opcode_t) *p1)
b18215fc
RS
5985 {
5986 /* Could be either a loop or a series of alternatives. */
5987 case on_failure_jump:
5988 p1++;
5989 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5e69f11e 5990
b18215fc 5991 /* If the next operation is not a jump backwards in the
fa9a63c5
RM
5992 pattern. */
5993
5994 if (mcnt >= 0)
5995 {
b18215fc
RS
5996 /* Go through the on_failure_jumps of the alternatives,
5997 seeing if any of the alternatives cannot match nothing.
5998 The last alternative starts with only a jump,
5999 whereas the rest start with on_failure_jump and end
6000 with a jump, e.g., here is the pattern for `a|b|c':
fa9a63c5 6001
b18215fc
RS
6002 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
6003 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6004 /exactn/1/c
fa9a63c5 6005
b18215fc
RS
6006 So, we have to first go through the first (n-1)
6007 alternatives and then deal with the last one separately. */
fa9a63c5
RM
6008
6009
b18215fc
RS
6010 /* Deal with the first (n-1) alternatives, which start
6011 with an on_failure_jump (see above) that jumps to right
6012 past a jump_past_alt. */
fa9a63c5 6013
b18215fc
RS
6014 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
6015 {
6016 /* `mcnt' holds how many bytes long the alternative
6017 is, including the ending `jump_past_alt' and
6018 its number. */
fa9a63c5 6019
b18215fc
RS
6020 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
6021 reg_info))
6022 return false;
fa9a63c5 6023
b18215fc 6024 /* Move to right after this alternative, including the
fa9a63c5 6025 jump_past_alt. */
b18215fc 6026 p1 += mcnt;
fa9a63c5 6027
b18215fc
RS
6028 /* Break if it's the beginning of an n-th alternative
6029 that doesn't begin with an on_failure_jump. */
6030 if ((re_opcode_t) *p1 != on_failure_jump)
6031 break;
5e69f11e 6032
fa9a63c5
RM
6033 /* Still have to check that it's not an n-th
6034 alternative that starts with an on_failure_jump. */
6035 p1++;
b18215fc
RS
6036 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6037 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
6038 {
6039 /* Get to the beginning of the n-th alternative. */
6040 p1 -= 3;
6041 break;
6042 }
6043 }
fa9a63c5 6044
b18215fc
RS
6045 /* Deal with the last alternative: go back and get number
6046 of the `jump_past_alt' just before it. `mcnt' contains
6047 the length of the alternative. */
6048 EXTRACT_NUMBER (mcnt, p1 - 2);
fa9a63c5 6049
b18215fc
RS
6050 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
6051 return false;
fa9a63c5 6052
b18215fc
RS
6053 p1 += mcnt; /* Get past the n-th alternative. */
6054 } /* if mcnt > 0 */
6055 break;
fa9a63c5 6056
5e69f11e 6057
b18215fc 6058 case stop_memory:
fa9a63c5 6059 assert (p1[1] == **p);
b18215fc
RS
6060 *p = p1 + 2;
6061 return true;
fa9a63c5 6062
5e69f11e 6063
b18215fc
RS
6064 default:
6065 if (!common_op_match_null_string_p (&p1, end, reg_info))
6066 return false;
6067 }
fa9a63c5
RM
6068 } /* while p1 < end */
6069
6070 return false;
6071} /* group_match_null_string_p */
6072
6073
6074/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6075 It expects P to be the first byte of a single alternative and END one
6076 byte past the last. The alternative can contain groups. */
5e69f11e 6077
fa9a63c5
RM
6078static boolean
6079alt_match_null_string_p (p, end, reg_info)
6080 unsigned char *p, *end;
6081 register_info_type *reg_info;
6082{
6083 int mcnt;
6084 unsigned char *p1 = p;
5e69f11e 6085
fa9a63c5
RM
6086 while (p1 < end)
6087 {
5e69f11e 6088 /* Skip over opcodes that can match nothing, and break when we get
b18215fc 6089 to one that can't. */
5e69f11e 6090
fa9a63c5 6091 switch ((re_opcode_t) *p1)
b18215fc
RS
6092 {
6093 /* It's a loop. */
6094 case on_failure_jump:
6095 p1++;
6096 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6097 p1 += mcnt;
6098 break;
5e69f11e
RM
6099
6100 default:
b18215fc
RS
6101 if (!common_op_match_null_string_p (&p1, end, reg_info))
6102 return false;
6103 }
fa9a63c5
RM
6104 } /* while p1 < end */
6105
6106 return true;
6107} /* alt_match_null_string_p */
6108
6109
6110/* Deals with the ops common to group_match_null_string_p and
5e69f11e
RM
6111 alt_match_null_string_p.
6112
fa9a63c5
RM
6113 Sets P to one after the op and its arguments, if any. */
6114
6115static boolean
6116common_op_match_null_string_p (p, end, reg_info)
6117 unsigned char **p, *end;
6118 register_info_type *reg_info;
6119{
6120 int mcnt;
6121 boolean ret;
6122 int reg_no;
6123 unsigned char *p1 = *p;
6124
6125 switch ((re_opcode_t) *p1++)
6126 {
6127 case no_op:
6128 case begline:
6129 case endline:
6130 case begbuf:
6131 case endbuf:
6132 case wordbeg:
6133 case wordend:
6134 case wordbound:
6135 case notwordbound:
6136#ifdef emacs
6137 case before_dot:
6138 case at_dot:
6139 case after_dot:
6140#endif
6141 break;
6142
6143 case start_memory:
6144 reg_no = *p1;
6145 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
6146 ret = group_match_null_string_p (&p1, end, reg_info);
5e69f11e 6147
fa9a63c5 6148 /* Have to set this here in case we're checking a group which
b18215fc 6149 contains a group and a back reference to it. */
fa9a63c5
RM
6150
6151 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
b18215fc 6152 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
fa9a63c5
RM
6153
6154 if (!ret)
b18215fc 6155 return false;
fa9a63c5 6156 break;
5e69f11e 6157
b18215fc 6158 /* If this is an optimized succeed_n for zero times, make the jump. */
fa9a63c5
RM
6159 case jump:
6160 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6161 if (mcnt >= 0)
b18215fc 6162 p1 += mcnt;
fa9a63c5 6163 else
b18215fc 6164 return false;
fa9a63c5
RM
6165 break;
6166
6167 case succeed_n:
b18215fc 6168 /* Get to the number of times to succeed. */
5e69f11e 6169 p1 += 2;
fa9a63c5
RM
6170 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6171
6172 if (mcnt == 0)
b18215fc
RS
6173 {
6174 p1 -= 4;
6175 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6176 p1 += mcnt;
6177 }
fa9a63c5 6178 else
b18215fc 6179 return false;
fa9a63c5
RM
6180 break;
6181
5e69f11e 6182 case duplicate:
fa9a63c5 6183 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
b18215fc 6184 return false;
fa9a63c5
RM
6185 break;
6186
6187 case set_number_at:
6188 p1 += 4;
6189
6190 default:
6191 /* All other opcodes mean we cannot match the empty string. */
6192 return false;
6193 }
6194
6195 *p = p1;
6196 return true;
6197} /* common_op_match_null_string_p */
6198
6199
6200/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6201 bytes; nonzero otherwise. */
5e69f11e 6202
fa9a63c5
RM
6203static int
6204bcmp_translate (s1, s2, len, translate)
6205 unsigned char *s1, *s2;
6206 register int len;
6676cb1c 6207 RE_TRANSLATE_TYPE translate;
fa9a63c5
RM
6208{
6209 register unsigned char *p1 = s1, *p2 = s2;
e934739e
RS
6210 unsigned char *p1_end = s1 + len;
6211 unsigned char *p2_end = s2 + len;
6212
6213 while (p1 != p1_end && p2 != p2_end)
fa9a63c5 6214 {
e934739e
RS
6215 int p1_charlen, p2_charlen;
6216 int p1_ch, p2_ch;
6217
6218 p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
6219 p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
6220
6221 if (RE_TRANSLATE (translate, p1_ch)
6222 != RE_TRANSLATE (translate, p2_ch))
bc192b5b 6223 return 1;
e934739e
RS
6224
6225 p1 += p1_charlen, p2 += p2_charlen;
fa9a63c5 6226 }
e934739e
RS
6227
6228 if (p1 != p1_end || p2 != p2_end)
6229 return 1;
6230
fa9a63c5
RM
6231 return 0;
6232}
6233\f
6234/* Entry points for GNU code. */
6235
6236/* re_compile_pattern is the GNU regular expression compiler: it
6237 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6238 Returns 0 if the pattern was valid, otherwise an error string.
5e69f11e 6239
fa9a63c5
RM
6240 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6241 are set in BUFP on entry.
5e69f11e 6242
b18215fc 6243 We call regex_compile to do the actual compilation. */
fa9a63c5
RM
6244
6245const char *
6246re_compile_pattern (pattern, length, bufp)
6247 const char *pattern;
6248 int length;
6249 struct re_pattern_buffer *bufp;
6250{
6251 reg_errcode_t ret;
5e69f11e 6252
fa9a63c5
RM
6253 /* GNU code is written to assume at least RE_NREGS registers will be set
6254 (and at least one extra will be -1). */
6255 bufp->regs_allocated = REGS_UNALLOCATED;
5e69f11e 6256
fa9a63c5
RM
6257 /* And GNU code determines whether or not to get register information
6258 by passing null for the REGS argument to re_match, etc., not by
6259 setting no_sub. */
6260 bufp->no_sub = 0;
5e69f11e 6261
b18215fc 6262 /* Match anchors at newline. */
fa9a63c5 6263 bufp->newline_anchor = 1;
5e69f11e 6264
fa9a63c5
RM
6265 ret = regex_compile (pattern, length, re_syntax_options, bufp);
6266
6267 if (!ret)
6268 return NULL;
6269 return gettext (re_error_msgid[(int) ret]);
5e69f11e 6270}
fa9a63c5 6271\f
b18215fc
RS
6272/* Entry points compatible with 4.2 BSD regex library. We don't define
6273 them unless specifically requested. */
fa9a63c5 6274
0c085854 6275#if defined (_REGEX_RE_COMP) || defined (_LIBC)
fa9a63c5
RM
6276
6277/* BSD has one and only one pattern buffer. */
6278static struct re_pattern_buffer re_comp_buf;
6279
6280char *
48afdd44
RM
6281#ifdef _LIBC
6282/* Make these definitions weak in libc, so POSIX programs can redefine
6283 these names if they don't use our functions, and still use
6284 regcomp/regexec below without link errors. */
6285weak_function
6286#endif
fa9a63c5
RM
6287re_comp (s)
6288 const char *s;
6289{
6290 reg_errcode_t ret;
5e69f11e 6291
fa9a63c5
RM
6292 if (!s)
6293 {
6294 if (!re_comp_buf.buffer)
6295 return gettext ("No previous regular expression");
6296 return 0;
6297 }
6298
6299 if (!re_comp_buf.buffer)
6300 {
6301 re_comp_buf.buffer = (unsigned char *) malloc (200);
6302 if (re_comp_buf.buffer == NULL)
b18215fc 6303 return gettext (re_error_msgid[(int) REG_ESPACE]);
fa9a63c5
RM
6304 re_comp_buf.allocated = 200;
6305
6306 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6307 if (re_comp_buf.fastmap == NULL)
6308 return gettext (re_error_msgid[(int) REG_ESPACE]);
6309 }
6310
6311 /* Since `re_exec' always passes NULL for the `regs' argument, we
6312 don't need to initialize the pattern buffer fields which affect it. */
6313
b18215fc 6314 /* Match anchors at newlines. */
fa9a63c5
RM
6315 re_comp_buf.newline_anchor = 1;
6316
6317 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5e69f11e 6318
fa9a63c5
RM
6319 if (!ret)
6320 return NULL;
6321
6322 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6323 return (char *) gettext (re_error_msgid[(int) ret]);
6324}
6325
6326
6327int
48afdd44
RM
6328#ifdef _LIBC
6329weak_function
6330#endif
fa9a63c5
RM
6331re_exec (s)
6332 const char *s;
6333{
6334 const int len = strlen (s);
6335 return
6336 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6337}
6338#endif /* _REGEX_RE_COMP */
6339\f
6340/* POSIX.2 functions. Don't define these for Emacs. */
6341
6342#ifndef emacs
6343
6344/* regcomp takes a regular expression as a string and compiles it.
6345
b18215fc 6346 PREG is a regex_t *. We do not expect any fields to be initialized,
fa9a63c5
RM
6347 since POSIX says we shouldn't. Thus, we set
6348
6349 `buffer' to the compiled pattern;
6350 `used' to the length of the compiled pattern;
6351 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6352 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6353 RE_SYNTAX_POSIX_BASIC;
6354 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6355 `fastmap' and `fastmap_accurate' to zero;
6356 `re_nsub' to the number of subexpressions in PATTERN.
6357
6358 PATTERN is the address of the pattern string.
6359
6360 CFLAGS is a series of bits which affect compilation.
6361
6362 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6363 use POSIX basic syntax.
6364
6365 If REG_NEWLINE is set, then . and [^...] don't match newline.
6366 Also, regexec will try a match beginning after every newline.
6367
6368 If REG_ICASE is set, then we considers upper- and lowercase
6369 versions of letters to be equivalent when matching.
6370
6371 If REG_NOSUB is set, then when PREG is passed to regexec, that
6372 routine will report only success or failure, and nothing about the
6373 registers.
6374
b18215fc 6375 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
fa9a63c5
RM
6376 the return codes and their meanings.) */
6377
6378int
6379regcomp (preg, pattern, cflags)
6380 regex_t *preg;
5e69f11e 6381 const char *pattern;
fa9a63c5
RM
6382 int cflags;
6383{
6384 reg_errcode_t ret;
6385 unsigned syntax
6386 = (cflags & REG_EXTENDED) ?
6387 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6388
6389 /* regex_compile will allocate the space for the compiled pattern. */
6390 preg->buffer = 0;
6391 preg->allocated = 0;
6392 preg->used = 0;
5e69f11e 6393
fa9a63c5
RM
6394 /* Don't bother to use a fastmap when searching. This simplifies the
6395 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6396 characters after newlines into the fastmap. This way, we just try
6397 every character. */
6398 preg->fastmap = 0;
5e69f11e 6399
fa9a63c5
RM
6400 if (cflags & REG_ICASE)
6401 {
6402 unsigned i;
5e69f11e 6403
6676cb1c
RS
6404 preg->translate
6405 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6406 * sizeof (*(RE_TRANSLATE_TYPE)0));
fa9a63c5 6407 if (preg->translate == NULL)
b18215fc 6408 return (int) REG_ESPACE;
fa9a63c5
RM
6409
6410 /* Map uppercase characters to corresponding lowercase ones. */
6411 for (i = 0; i < CHAR_SET_SIZE; i++)
b18215fc 6412 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
fa9a63c5
RM
6413 }
6414 else
6415 preg->translate = NULL;
6416
6417 /* If REG_NEWLINE is set, newlines are treated differently. */
6418 if (cflags & REG_NEWLINE)
6419 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6420 syntax &= ~RE_DOT_NEWLINE;
6421 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
b18215fc 6422 /* It also changes the matching behavior. */
fa9a63c5
RM
6423 preg->newline_anchor = 1;
6424 }
6425 else
6426 preg->newline_anchor = 0;
6427
6428 preg->no_sub = !!(cflags & REG_NOSUB);
6429
5e69f11e 6430 /* POSIX says a null character in the pattern terminates it, so we
fa9a63c5
RM
6431 can use strlen here in compiling the pattern. */
6432 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5e69f11e 6433
fa9a63c5
RM
6434 /* POSIX doesn't distinguish between an unmatched open-group and an
6435 unmatched close-group: both are REG_EPAREN. */
6436 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5e69f11e 6437
fa9a63c5
RM
6438 return (int) ret;
6439}
6440
6441
6442/* regexec searches for a given pattern, specified by PREG, in the
6443 string STRING.
5e69f11e 6444
fa9a63c5 6445 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
b18215fc 6446 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
fa9a63c5
RM
6447 least NMATCH elements, and we set them to the offsets of the
6448 corresponding matched substrings.
5e69f11e 6449
fa9a63c5
RM
6450 EFLAGS specifies `execution flags' which affect matching: if
6451 REG_NOTBOL is set, then ^ does not match at the beginning of the
6452 string; if REG_NOTEOL is set, then $ does not match at the end.
5e69f11e 6453
fa9a63c5
RM
6454 We return 0 if we find a match and REG_NOMATCH if not. */
6455
6456int
6457regexec (preg, string, nmatch, pmatch, eflags)
6458 const regex_t *preg;
5e69f11e
RM
6459 const char *string;
6460 size_t nmatch;
6461 regmatch_t pmatch[];
fa9a63c5
RM
6462 int eflags;
6463{
6464 int ret;
6465 struct re_registers regs;
6466 regex_t private_preg;
6467 int len = strlen (string);
6468 boolean want_reg_info = !preg->no_sub && nmatch > 0;
6469
6470 private_preg = *preg;
5e69f11e 6471
fa9a63c5
RM
6472 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6473 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5e69f11e 6474
fa9a63c5
RM
6475 /* The user has told us exactly how many registers to return
6476 information about, via `nmatch'. We have to pass that on to the
b18215fc 6477 matching routines. */
fa9a63c5 6478 private_preg.regs_allocated = REGS_FIXED;
5e69f11e 6479
fa9a63c5
RM
6480 if (want_reg_info)
6481 {
6482 regs.num_regs = nmatch;
6483 regs.start = TALLOC (nmatch, regoff_t);
6484 regs.end = TALLOC (nmatch, regoff_t);
6485 if (regs.start == NULL || regs.end == NULL)
b18215fc 6486 return (int) REG_NOMATCH;
fa9a63c5
RM
6487 }
6488
6489 /* Perform the searching operation. */
6490 ret = re_search (&private_preg, string, len,
b18215fc
RS
6491 /* start: */ 0, /* range: */ len,
6492 want_reg_info ? &regs : (struct re_registers *) 0);
5e69f11e 6493
fa9a63c5
RM
6494 /* Copy the register information to the POSIX structure. */
6495 if (want_reg_info)
6496 {
6497 if (ret >= 0)
b18215fc
RS
6498 {
6499 unsigned r;
fa9a63c5 6500
b18215fc
RS
6501 for (r = 0; r < nmatch; r++)
6502 {
6503 pmatch[r].rm_so = regs.start[r];
6504 pmatch[r].rm_eo = regs.end[r];
6505 }
6506 }
fa9a63c5 6507
b18215fc 6508 /* If we needed the temporary register info, free the space now. */
fa9a63c5
RM
6509 free (regs.start);
6510 free (regs.end);
6511 }
6512
6513 /* We want zero return to mean success, unlike `re_search'. */
6514 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6515}
6516
6517
6518/* Returns a message corresponding to an error code, ERRCODE, returned
6519 from either regcomp or regexec. We don't use PREG here. */
6520
6521size_t
6522regerror (errcode, preg, errbuf, errbuf_size)
6523 int errcode;
6524 const regex_t *preg;
6525 char *errbuf;
6526 size_t errbuf_size;
6527{
6528 const char *msg;
6529 size_t msg_size;
6530
6531 if (errcode < 0
6532 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5e69f11e 6533 /* Only error codes returned by the rest of the code should be passed
b18215fc 6534 to this routine. If we are given anything else, or if other regex
fa9a63c5
RM
6535 code generates an invalid error code, then the program has a bug.
6536 Dump core so we can fix it. */
6537 abort ();
6538
6539 msg = gettext (re_error_msgid[errcode]);
6540
6541 msg_size = strlen (msg) + 1; /* Includes the null. */
5e69f11e 6542
fa9a63c5
RM
6543 if (errbuf_size != 0)
6544 {
6545 if (msg_size > errbuf_size)
b18215fc
RS
6546 {
6547 strncpy (errbuf, msg, errbuf_size - 1);
6548 errbuf[errbuf_size - 1] = 0;
6549 }
fa9a63c5 6550 else
b18215fc 6551 strcpy (errbuf, msg);
fa9a63c5
RM
6552 }
6553
6554 return msg_size;
6555}
6556
6557
6558/* Free dynamically allocated space used by PREG. */
6559
6560void
6561regfree (preg)
6562 regex_t *preg;
6563{
6564 if (preg->buffer != NULL)
6565 free (preg->buffer);
6566 preg->buffer = NULL;
5e69f11e 6567
fa9a63c5
RM
6568 preg->allocated = 0;
6569 preg->used = 0;
6570
6571 if (preg->fastmap != NULL)
6572 free (preg->fastmap);
6573 preg->fastmap = NULL;
6574 preg->fastmap_accurate = 0;
6575
6576 if (preg->translate != NULL)
6577 free (preg->translate);
6578 preg->translate = NULL;
6579}
6580
6581#endif /* not emacs */