New macro with-temp-buffer-window and related fixes.
[bpt/emacs.git] / src / regex.c
1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
3 internationalization features.)
4
5 Copyright (C) 1993-2012 Free Software Foundation, Inc.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
20 USA. */
21
22 /* TODO:
23 - structure the opcode space into opcode+flag.
24 - merge with glibc's regex.[ch].
25 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
26 need to modify the compiled regexp so that re_match can be reentrant.
27 - get rid of on_failure_jump_smart by doing the optimization in re_comp
28 rather than at run-time, so that re_match can be reentrant.
29 */
30
31 /* AIX requires this to be the first thing in the file. */
32 #if defined _AIX && !defined REGEX_MALLOC
33 #pragma alloca
34 #endif
35
36 /* Ignore some GCC warnings for now. This section should go away
37 once the Emacs and Gnulib regex code is merged. */
38 #if (__GNUC__ == 4 && 5 <= __GNUC_MINOR__) || 4 < __GNUC__
39 # pragma GCC diagnostic ignored "-Wstrict-overflow"
40 # ifndef emacs
41 # pragma GCC diagnostic ignored "-Wunused-but-set-variable"
42 # pragma GCC diagnostic ignored "-Wunused-function"
43 # pragma GCC diagnostic ignored "-Wunused-macros"
44 # pragma GCC diagnostic ignored "-Wunused-result"
45 # pragma GCC diagnostic ignored "-Wunused-variable"
46 # endif
47 #endif
48
49 #include <config.h>
50
51 #include <stddef.h>
52
53 #ifdef emacs
54 /* We need this for `regex.h', and perhaps for the Emacs include files. */
55 # include <sys/types.h>
56 #endif
57
58 /* Whether to use ISO C Amendment 1 wide char functions.
59 Those should not be used for Emacs since it uses its own. */
60 #if defined _LIBC
61 #define WIDE_CHAR_SUPPORT 1
62 #else
63 #define WIDE_CHAR_SUPPORT \
64 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
65 #endif
66
67 /* For platform which support the ISO C amendment 1 functionality we
68 support user defined character classes. */
69 #if WIDE_CHAR_SUPPORT
70 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
71 # include <wchar.h>
72 # include <wctype.h>
73 #endif
74
75 #ifdef _LIBC
76 /* We have to keep the namespace clean. */
77 # define regfree(preg) __regfree (preg)
78 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
79 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
80 # define regerror(err_code, preg, errbuf, errbuf_size) \
81 __regerror (err_code, preg, errbuf, errbuf_size)
82 # define re_set_registers(bu, re, nu, st, en) \
83 __re_set_registers (bu, re, nu, st, en)
84 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
85 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
86 # define re_match(bufp, string, size, pos, regs) \
87 __re_match (bufp, string, size, pos, regs)
88 # define re_search(bufp, string, size, startpos, range, regs) \
89 __re_search (bufp, string, size, startpos, range, regs)
90 # define re_compile_pattern(pattern, length, bufp) \
91 __re_compile_pattern (pattern, length, bufp)
92 # define re_set_syntax(syntax) __re_set_syntax (syntax)
93 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
94 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
95 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
96
97 /* Make sure we call libc's function even if the user overrides them. */
98 # define btowc __btowc
99 # define iswctype __iswctype
100 # define wctype __wctype
101
102 # define WEAK_ALIAS(a,b) weak_alias (a, b)
103
104 /* We are also using some library internals. */
105 # include <locale/localeinfo.h>
106 # include <locale/elem-hash.h>
107 # include <langinfo.h>
108 #else
109 # define WEAK_ALIAS(a,b)
110 #endif
111
112 /* This is for other GNU distributions with internationalized messages. */
113 #if HAVE_LIBINTL_H || defined _LIBC
114 # include <libintl.h>
115 #else
116 # define gettext(msgid) (msgid)
117 #endif
118
119 #ifndef gettext_noop
120 /* This define is so xgettext can find the internationalizable
121 strings. */
122 # define gettext_noop(String) String
123 #endif
124
125 /* The `emacs' switch turns on certain matching commands
126 that make sense only in Emacs. */
127 #ifdef emacs
128
129 # include <setjmp.h>
130 # include "lisp.h"
131 # include "character.h"
132 # include "buffer.h"
133
134 /* Make syntax table lookup grant data in gl_state. */
135 # define SYNTAX_ENTRY_VIA_PROPERTY
136
137 # include "syntax.h"
138 # include "category.h"
139
140 # ifdef malloc
141 # undef malloc
142 # endif
143 # define malloc xmalloc
144 # ifdef realloc
145 # undef realloc
146 # endif
147 # define realloc xrealloc
148 # ifdef free
149 # undef free
150 # endif
151 # define free xfree
152
153 /* Converts the pointer to the char to BEG-based offset from the start. */
154 # define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
155 # define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
156
157 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
158 # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
159 # define RE_STRING_CHAR(p, multibyte) \
160 (multibyte ? (STRING_CHAR (p)) : (*(p)))
161 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
162 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
163
164 # define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
165
166 # define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
167
168 /* Set C a (possibly converted to multibyte) character before P. P
169 points into a string which is the virtual concatenation of STR1
170 (which ends at END1) or STR2 (which ends at END2). */
171 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
172 do { \
173 if (target_multibyte) \
174 { \
175 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
176 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
177 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
178 c = STRING_CHAR (dtemp); \
179 } \
180 else \
181 { \
182 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
183 (c) = RE_CHAR_TO_MULTIBYTE (c); \
184 } \
185 } while (0)
186
187 /* Set C a (possibly converted to multibyte) character at P, and set
188 LEN to the byte length of that character. */
189 # define GET_CHAR_AFTER(c, p, len) \
190 do { \
191 if (target_multibyte) \
192 (c) = STRING_CHAR_AND_LENGTH (p, len); \
193 else \
194 { \
195 (c) = *p; \
196 len = 1; \
197 (c) = RE_CHAR_TO_MULTIBYTE (c); \
198 } \
199 } while (0)
200
201 #else /* not emacs */
202
203 /* If we are not linking with Emacs proper,
204 we can't use the relocating allocator
205 even if config.h says that we can. */
206 # undef REL_ALLOC
207
208 # include <unistd.h>
209
210 /* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
211
212 static void *
213 xmalloc (size_t size)
214 {
215 void *val = malloc (size);
216 if (!val && size)
217 {
218 write (2, "virtual memory exhausted\n", 25);
219 exit (1);
220 }
221 return val;
222 }
223
224 static void *
225 xrealloc (void *block, size_t size)
226 {
227 void *val;
228 /* We must call malloc explicitly when BLOCK is 0, since some
229 reallocs don't do this. */
230 if (! block)
231 val = malloc (size);
232 else
233 val = realloc (block, size);
234 if (!val && size)
235 {
236 write (2, "virtual memory exhausted\n", 25);
237 exit (1);
238 }
239 return val;
240 }
241
242 # ifdef malloc
243 # undef malloc
244 # endif
245 # define malloc xmalloc
246 # ifdef realloc
247 # undef realloc
248 # endif
249 # define realloc xrealloc
250
251 # include <stdbool.h>
252 # include <string.h>
253
254 /* Define the syntax stuff for \<, \>, etc. */
255
256 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
257 enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
258
259 /* Dummy macros for non-Emacs environments. */
260 # define CHAR_CHARSET(c) 0
261 # define CHARSET_LEADING_CODE_BASE(c) 0
262 # define MAX_MULTIBYTE_LENGTH 1
263 # define RE_MULTIBYTE_P(x) 0
264 # define RE_TARGET_MULTIBYTE_P(x) 0
265 # define WORD_BOUNDARY_P(c1, c2) (0)
266 # define CHAR_HEAD_P(p) (1)
267 # define SINGLE_BYTE_CHAR_P(c) (1)
268 # define SAME_CHARSET_P(c1, c2) (1)
269 # define BYTES_BY_CHAR_HEAD(p) (1)
270 # define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
271 # define STRING_CHAR(p) (*(p))
272 # define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
273 # define CHAR_STRING(c, s) (*(s) = (c), 1)
274 # define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
275 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
276 # define RE_CHAR_TO_MULTIBYTE(c) (c)
277 # define RE_CHAR_TO_UNIBYTE(c) (c)
278 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
279 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
280 # define GET_CHAR_AFTER(c, p, len) \
281 (c = *p, len = 1)
282 # define MAKE_CHAR(charset, c1, c2) (c1)
283 # define BYTE8_TO_CHAR(c) (c)
284 # define CHAR_BYTE8_P(c) (0)
285 # define CHAR_LEADING_CODE(c) (c)
286
287 #endif /* not emacs */
288
289 #ifndef RE_TRANSLATE
290 # define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
291 # define RE_TRANSLATE_P(TBL) (TBL)
292 #endif
293 \f
294 /* Get the interface, including the syntax bits. */
295 #include "regex.h"
296
297 /* isalpha etc. are used for the character classes. */
298 #include <ctype.h>
299
300 #ifdef emacs
301
302 /* 1 if C is an ASCII character. */
303 # define IS_REAL_ASCII(c) ((c) < 0200)
304
305 /* 1 if C is a unibyte character. */
306 # define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
307
308 /* The Emacs definitions should not be directly affected by locales. */
309
310 /* In Emacs, these are only used for single-byte characters. */
311 # define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
312 # define ISCNTRL(c) ((c) < ' ')
313 # define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
314 || ((c) >= 'a' && (c) <= 'f') \
315 || ((c) >= 'A' && (c) <= 'F'))
316
317 /* This is only used for single-byte characters. */
318 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
319
320 /* The rest must handle multibyte characters. */
321
322 # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
323 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
324 : 1)
325
326 # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
327 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
328 : 1)
329
330 # define ISALNUM(c) (IS_REAL_ASCII (c) \
331 ? (((c) >= 'a' && (c) <= 'z') \
332 || ((c) >= 'A' && (c) <= 'Z') \
333 || ((c) >= '0' && (c) <= '9')) \
334 : SYNTAX (c) == Sword)
335
336 # define ISALPHA(c) (IS_REAL_ASCII (c) \
337 ? (((c) >= 'a' && (c) <= 'z') \
338 || ((c) >= 'A' && (c) <= 'Z')) \
339 : SYNTAX (c) == Sword)
340
341 # define ISLOWER(c) lowercasep (c)
342
343 # define ISPUNCT(c) (IS_REAL_ASCII (c) \
344 ? ((c) > ' ' && (c) < 0177 \
345 && !(((c) >= 'a' && (c) <= 'z') \
346 || ((c) >= 'A' && (c) <= 'Z') \
347 || ((c) >= '0' && (c) <= '9'))) \
348 : SYNTAX (c) != Sword)
349
350 # define ISSPACE(c) (SYNTAX (c) == Swhitespace)
351
352 # define ISUPPER(c) uppercasep (c)
353
354 # define ISWORD(c) (SYNTAX (c) == Sword)
355
356 #else /* not emacs */
357
358 /* 1 if C is an ASCII character. */
359 # define IS_REAL_ASCII(c) ((c) < 0200)
360
361 /* This distinction is not meaningful, except in Emacs. */
362 # define ISUNIBYTE(c) 1
363
364 # ifdef isblank
365 # define ISBLANK(c) isblank (c)
366 # else
367 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
368 # endif
369 # ifdef isgraph
370 # define ISGRAPH(c) isgraph (c)
371 # else
372 # define ISGRAPH(c) (isprint (c) && !isspace (c))
373 # endif
374
375 /* Solaris defines ISPRINT so we must undefine it first. */
376 # undef ISPRINT
377 # define ISPRINT(c) isprint (c)
378 # define ISDIGIT(c) isdigit (c)
379 # define ISALNUM(c) isalnum (c)
380 # define ISALPHA(c) isalpha (c)
381 # define ISCNTRL(c) iscntrl (c)
382 # define ISLOWER(c) islower (c)
383 # define ISPUNCT(c) ispunct (c)
384 # define ISSPACE(c) isspace (c)
385 # define ISUPPER(c) isupper (c)
386 # define ISXDIGIT(c) isxdigit (c)
387
388 # define ISWORD(c) ISALPHA (c)
389
390 # ifdef _tolower
391 # define TOLOWER(c) _tolower (c)
392 # else
393 # define TOLOWER(c) tolower (c)
394 # endif
395
396 /* How many characters in the character set. */
397 # define CHAR_SET_SIZE 256
398
399 # ifdef SYNTAX_TABLE
400
401 extern char *re_syntax_table;
402
403 # else /* not SYNTAX_TABLE */
404
405 static char re_syntax_table[CHAR_SET_SIZE];
406
407 static void
408 init_syntax_once (void)
409 {
410 register int c;
411 static int done = 0;
412
413 if (done)
414 return;
415
416 memset (re_syntax_table, 0, sizeof re_syntax_table);
417
418 for (c = 0; c < CHAR_SET_SIZE; ++c)
419 if (ISALNUM (c))
420 re_syntax_table[c] = Sword;
421
422 re_syntax_table['_'] = Ssymbol;
423
424 done = 1;
425 }
426
427 # endif /* not SYNTAX_TABLE */
428
429 # define SYNTAX(c) re_syntax_table[(c)]
430
431 #endif /* not emacs */
432 \f
433 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
434 \f
435 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
436 use `alloca' instead of `malloc'. This is because using malloc in
437 re_search* or re_match* could cause memory leaks when C-g is used in
438 Emacs; also, malloc is slower and causes storage fragmentation. On
439 the other hand, malloc is more portable, and easier to debug.
440
441 Because we sometimes use alloca, some routines have to be macros,
442 not functions -- `alloca'-allocated space disappears at the end of the
443 function it is called in. */
444
445 #ifdef REGEX_MALLOC
446
447 # define REGEX_ALLOCATE malloc
448 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
449 # define REGEX_FREE free
450
451 #else /* not REGEX_MALLOC */
452
453 /* Emacs already defines alloca, sometimes. */
454 # ifndef alloca
455
456 /* Make alloca work the best possible way. */
457 # ifdef __GNUC__
458 # define alloca __builtin_alloca
459 # else /* not __GNUC__ */
460 # ifdef HAVE_ALLOCA_H
461 # include <alloca.h>
462 # endif /* HAVE_ALLOCA_H */
463 # endif /* not __GNUC__ */
464
465 # endif /* not alloca */
466
467 # define REGEX_ALLOCATE alloca
468
469 /* Assumes a `char *destination' variable. */
470 # define REGEX_REALLOCATE(source, osize, nsize) \
471 (destination = (char *) alloca (nsize), \
472 memcpy (destination, source, osize))
473
474 /* No need to do anything to free, after alloca. */
475 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
476
477 #endif /* not REGEX_MALLOC */
478
479 /* Define how to allocate the failure stack. */
480
481 #if defined REL_ALLOC && defined REGEX_MALLOC
482
483 # define REGEX_ALLOCATE_STACK(size) \
484 r_alloc (&failure_stack_ptr, (size))
485 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
486 r_re_alloc (&failure_stack_ptr, (nsize))
487 # define REGEX_FREE_STACK(ptr) \
488 r_alloc_free (&failure_stack_ptr)
489
490 #else /* not using relocating allocator */
491
492 # ifdef REGEX_MALLOC
493
494 # define REGEX_ALLOCATE_STACK malloc
495 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
496 # define REGEX_FREE_STACK free
497
498 # else /* not REGEX_MALLOC */
499
500 # define REGEX_ALLOCATE_STACK alloca
501
502 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
503 REGEX_REALLOCATE (source, osize, nsize)
504 /* No need to explicitly free anything. */
505 # define REGEX_FREE_STACK(arg) ((void)0)
506
507 # endif /* not REGEX_MALLOC */
508 #endif /* not using relocating allocator */
509
510
511 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
512 `string1' or just past its end. This works if PTR is NULL, which is
513 a good thing. */
514 #define FIRST_STRING_P(ptr) \
515 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
516
517 /* (Re)Allocate N items of type T using malloc, or fail. */
518 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
519 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
520 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
521
522 #define BYTEWIDTH 8 /* In bits. */
523
524 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
525
526 #undef MAX
527 #undef MIN
528 #define MAX(a, b) ((a) > (b) ? (a) : (b))
529 #define MIN(a, b) ((a) < (b) ? (a) : (b))
530
531 /* Type of source-pattern and string chars. */
532 #ifdef _MSC_VER
533 typedef unsigned char re_char;
534 #else
535 typedef const unsigned char re_char;
536 #endif
537
538 typedef char boolean;
539
540 static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
541 re_char *string1, size_t size1,
542 re_char *string2, size_t size2,
543 ssize_t pos,
544 struct re_registers *regs,
545 ssize_t stop);
546 \f
547 /* These are the command codes that appear in compiled regular
548 expressions. Some opcodes are followed by argument bytes. A
549 command code can specify any interpretation whatsoever for its
550 arguments. Zero bytes may appear in the compiled regular expression. */
551
552 typedef enum
553 {
554 no_op = 0,
555
556 /* Succeed right away--no more backtracking. */
557 succeed,
558
559 /* Followed by one byte giving n, then by n literal bytes. */
560 exactn,
561
562 /* Matches any (more or less) character. */
563 anychar,
564
565 /* Matches any one char belonging to specified set. First
566 following byte is number of bitmap bytes. Then come bytes
567 for a bitmap saying which chars are in. Bits in each byte
568 are ordered low-bit-first. A character is in the set if its
569 bit is 1. A character too large to have a bit in the map is
570 automatically not in the set.
571
572 If the length byte has the 0x80 bit set, then that stuff
573 is followed by a range table:
574 2 bytes of flags for character sets (low 8 bits, high 8 bits)
575 See RANGE_TABLE_WORK_BITS below.
576 2 bytes, the number of pairs that follow (upto 32767)
577 pairs, each 2 multibyte characters,
578 each multibyte character represented as 3 bytes. */
579 charset,
580
581 /* Same parameters as charset, but match any character that is
582 not one of those specified. */
583 charset_not,
584
585 /* Start remembering the text that is matched, for storing in a
586 register. Followed by one byte with the register number, in
587 the range 0 to one less than the pattern buffer's re_nsub
588 field. */
589 start_memory,
590
591 /* Stop remembering the text that is matched and store it in a
592 memory register. Followed by one byte with the register
593 number, in the range 0 to one less than `re_nsub' in the
594 pattern buffer. */
595 stop_memory,
596
597 /* Match a duplicate of something remembered. Followed by one
598 byte containing the register number. */
599 duplicate,
600
601 /* Fail unless at beginning of line. */
602 begline,
603
604 /* Fail unless at end of line. */
605 endline,
606
607 /* Succeeds if at beginning of buffer (if emacs) or at beginning
608 of string to be matched (if not). */
609 begbuf,
610
611 /* Analogously, for end of buffer/string. */
612 endbuf,
613
614 /* Followed by two byte relative address to which to jump. */
615 jump,
616
617 /* Followed by two-byte relative address of place to resume at
618 in case of failure. */
619 on_failure_jump,
620
621 /* Like on_failure_jump, but pushes a placeholder instead of the
622 current string position when executed. */
623 on_failure_keep_string_jump,
624
625 /* Just like `on_failure_jump', except that it checks that we
626 don't get stuck in an infinite loop (matching an empty string
627 indefinitely). */
628 on_failure_jump_loop,
629
630 /* Just like `on_failure_jump_loop', except that it checks for
631 a different kind of loop (the kind that shows up with non-greedy
632 operators). This operation has to be immediately preceded
633 by a `no_op'. */
634 on_failure_jump_nastyloop,
635
636 /* A smart `on_failure_jump' used for greedy * and + operators.
637 It analyzes the loop before which it is put and if the
638 loop does not require backtracking, it changes itself to
639 `on_failure_keep_string_jump' and short-circuits the loop,
640 else it just defaults to changing itself into `on_failure_jump'.
641 It assumes that it is pointing to just past a `jump'. */
642 on_failure_jump_smart,
643
644 /* Followed by two-byte relative address and two-byte number n.
645 After matching N times, jump to the address upon failure.
646 Does not work if N starts at 0: use on_failure_jump_loop
647 instead. */
648 succeed_n,
649
650 /* Followed by two-byte relative address, and two-byte number n.
651 Jump to the address N times, then fail. */
652 jump_n,
653
654 /* Set the following two-byte relative address to the
655 subsequent two-byte number. The address *includes* the two
656 bytes of number. */
657 set_number_at,
658
659 wordbeg, /* Succeeds if at word beginning. */
660 wordend, /* Succeeds if at word end. */
661
662 wordbound, /* Succeeds if at a word boundary. */
663 notwordbound, /* Succeeds if not at a word boundary. */
664
665 symbeg, /* Succeeds if at symbol beginning. */
666 symend, /* Succeeds if at symbol end. */
667
668 /* Matches any character whose syntax is specified. Followed by
669 a byte which contains a syntax code, e.g., Sword. */
670 syntaxspec,
671
672 /* Matches any character whose syntax is not that specified. */
673 notsyntaxspec
674
675 #ifdef emacs
676 ,before_dot, /* Succeeds if before point. */
677 at_dot, /* Succeeds if at point. */
678 after_dot, /* Succeeds if after point. */
679
680 /* Matches any character whose category-set contains the specified
681 category. The operator is followed by a byte which contains a
682 category code (mnemonic ASCII character). */
683 categoryspec,
684
685 /* Matches any character whose category-set does not contain the
686 specified category. The operator is followed by a byte which
687 contains the category code (mnemonic ASCII character). */
688 notcategoryspec
689 #endif /* emacs */
690 } re_opcode_t;
691 \f
692 /* Common operations on the compiled pattern. */
693
694 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
695
696 #define STORE_NUMBER(destination, number) \
697 do { \
698 (destination)[0] = (number) & 0377; \
699 (destination)[1] = (number) >> 8; \
700 } while (0)
701
702 /* Same as STORE_NUMBER, except increment DESTINATION to
703 the byte after where the number is stored. Therefore, DESTINATION
704 must be an lvalue. */
705
706 #define STORE_NUMBER_AND_INCR(destination, number) \
707 do { \
708 STORE_NUMBER (destination, number); \
709 (destination) += 2; \
710 } while (0)
711
712 /* Put into DESTINATION a number stored in two contiguous bytes starting
713 at SOURCE. */
714
715 #define EXTRACT_NUMBER(destination, source) \
716 do { \
717 (destination) = *(source) & 0377; \
718 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
719 } while (0)
720
721 #ifdef DEBUG
722 static void
723 extract_number (int *dest, re_char *source)
724 {
725 int temp = SIGN_EXTEND_CHAR (*(source + 1));
726 *dest = *source & 0377;
727 *dest += temp << 8;
728 }
729
730 # ifndef EXTRACT_MACROS /* To debug the macros. */
731 # undef EXTRACT_NUMBER
732 # define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
733 # endif /* not EXTRACT_MACROS */
734
735 #endif /* DEBUG */
736
737 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
738 SOURCE must be an lvalue. */
739
740 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
741 do { \
742 EXTRACT_NUMBER (destination, source); \
743 (source) += 2; \
744 } while (0)
745
746 #ifdef DEBUG
747 static void
748 extract_number_and_incr (int *destination, re_char **source)
749 {
750 extract_number (destination, *source);
751 *source += 2;
752 }
753
754 # ifndef EXTRACT_MACROS
755 # undef EXTRACT_NUMBER_AND_INCR
756 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
757 extract_number_and_incr (&dest, &src)
758 # endif /* not EXTRACT_MACROS */
759
760 #endif /* DEBUG */
761 \f
762 /* Store a multibyte character in three contiguous bytes starting
763 DESTINATION, and increment DESTINATION to the byte after where the
764 character is stored. Therefore, DESTINATION must be an lvalue. */
765
766 #define STORE_CHARACTER_AND_INCR(destination, character) \
767 do { \
768 (destination)[0] = (character) & 0377; \
769 (destination)[1] = ((character) >> 8) & 0377; \
770 (destination)[2] = (character) >> 16; \
771 (destination) += 3; \
772 } while (0)
773
774 /* Put into DESTINATION a character stored in three contiguous bytes
775 starting at SOURCE. */
776
777 #define EXTRACT_CHARACTER(destination, source) \
778 do { \
779 (destination) = ((source)[0] \
780 | ((source)[1] << 8) \
781 | ((source)[2] << 16)); \
782 } while (0)
783
784
785 /* Macros for charset. */
786
787 /* Size of bitmap of charset P in bytes. P is a start of charset,
788 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
789 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
790
791 /* Nonzero if charset P has range table. */
792 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
793
794 /* Return the address of range table of charset P. But not the start
795 of table itself, but the before where the number of ranges is
796 stored. `2 +' means to skip re_opcode_t and size of bitmap,
797 and the 2 bytes of flags at the start of the range table. */
798 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
799
800 /* Extract the bit flags that start a range table. */
801 #define CHARSET_RANGE_TABLE_BITS(p) \
802 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
803 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
804
805 /* Return the address of end of RANGE_TABLE. COUNT is number of
806 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
807 is start of range and end of range. `* 3' is size of each start
808 and end. */
809 #define CHARSET_RANGE_TABLE_END(range_table, count) \
810 ((range_table) + (count) * 2 * 3)
811
812 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
813 COUNT is number of ranges in RANGE_TABLE. */
814 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
815 do \
816 { \
817 re_wchar_t range_start, range_end; \
818 re_char *rtp; \
819 re_char *range_table_end \
820 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
821 \
822 for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3) \
823 { \
824 EXTRACT_CHARACTER (range_start, rtp); \
825 EXTRACT_CHARACTER (range_end, rtp + 3); \
826 \
827 if (range_start <= (c) && (c) <= range_end) \
828 { \
829 (not) = !(not); \
830 break; \
831 } \
832 } \
833 } \
834 while (0)
835
836 /* Test if C is in range table of CHARSET. The flag NOT is negated if
837 C is listed in it. */
838 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
839 do \
840 { \
841 /* Number of ranges in range table. */ \
842 int count; \
843 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
844 \
845 EXTRACT_NUMBER_AND_INCR (count, range_table); \
846 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
847 } \
848 while (0)
849 \f
850 /* If DEBUG is defined, Regex prints many voluminous messages about what
851 it is doing (if the variable `debug' is nonzero). If linked with the
852 main program in `iregex.c', you can enter patterns and strings
853 interactively. And if linked with the main program in `main.c' and
854 the other test files, you can run the already-written tests. */
855
856 #ifdef DEBUG
857
858 /* We use standard I/O for debugging. */
859 # include <stdio.h>
860
861 /* It is useful to test things that ``must'' be true when debugging. */
862 # include <assert.h>
863
864 static int debug = -100000;
865
866 # define DEBUG_STATEMENT(e) e
867 # define DEBUG_PRINT1(x) if (debug > 0) printf (x)
868 # define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
869 # define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
870 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
871 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
872 if (debug > 0) print_partial_compiled_pattern (s, e)
873 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
874 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
875
876
877 /* Print the fastmap in human-readable form. */
878
879 void
880 print_fastmap (fastmap)
881 char *fastmap;
882 {
883 unsigned was_a_range = 0;
884 unsigned i = 0;
885
886 while (i < (1 << BYTEWIDTH))
887 {
888 if (fastmap[i++])
889 {
890 was_a_range = 0;
891 putchar (i - 1);
892 while (i < (1 << BYTEWIDTH) && fastmap[i])
893 {
894 was_a_range = 1;
895 i++;
896 }
897 if (was_a_range)
898 {
899 printf ("-");
900 putchar (i - 1);
901 }
902 }
903 }
904 putchar ('\n');
905 }
906
907
908 /* Print a compiled pattern string in human-readable form, starting at
909 the START pointer into it and ending just before the pointer END. */
910
911 void
912 print_partial_compiled_pattern (start, end)
913 re_char *start;
914 re_char *end;
915 {
916 int mcnt, mcnt2;
917 re_char *p = start;
918 re_char *pend = end;
919
920 if (start == NULL)
921 {
922 fprintf (stderr, "(null)\n");
923 return;
924 }
925
926 /* Loop over pattern commands. */
927 while (p < pend)
928 {
929 fprintf (stderr, "%d:\t", p - start);
930
931 switch ((re_opcode_t) *p++)
932 {
933 case no_op:
934 fprintf (stderr, "/no_op");
935 break;
936
937 case succeed:
938 fprintf (stderr, "/succeed");
939 break;
940
941 case exactn:
942 mcnt = *p++;
943 fprintf (stderr, "/exactn/%d", mcnt);
944 do
945 {
946 fprintf (stderr, "/%c", *p++);
947 }
948 while (--mcnt);
949 break;
950
951 case start_memory:
952 fprintf (stderr, "/start_memory/%d", *p++);
953 break;
954
955 case stop_memory:
956 fprintf (stderr, "/stop_memory/%d", *p++);
957 break;
958
959 case duplicate:
960 fprintf (stderr, "/duplicate/%d", *p++);
961 break;
962
963 case anychar:
964 fprintf (stderr, "/anychar");
965 break;
966
967 case charset:
968 case charset_not:
969 {
970 register int c, last = -100;
971 register int in_range = 0;
972 int length = CHARSET_BITMAP_SIZE (p - 1);
973 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
974
975 fprintf (stderr, "/charset [%s",
976 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
977
978 if (p + *p >= pend)
979 fprintf (stderr, " !extends past end of pattern! ");
980
981 for (c = 0; c < 256; c++)
982 if (c / 8 < length
983 && (p[1 + (c/8)] & (1 << (c % 8))))
984 {
985 /* Are we starting a range? */
986 if (last + 1 == c && ! in_range)
987 {
988 fprintf (stderr, "-");
989 in_range = 1;
990 }
991 /* Have we broken a range? */
992 else if (last + 1 != c && in_range)
993 {
994 fprintf (stderr, "%c", last);
995 in_range = 0;
996 }
997
998 if (! in_range)
999 fprintf (stderr, "%c", c);
1000
1001 last = c;
1002 }
1003
1004 if (in_range)
1005 fprintf (stderr, "%c", last);
1006
1007 fprintf (stderr, "]");
1008
1009 p += 1 + length;
1010
1011 if (has_range_table)
1012 {
1013 int count;
1014 fprintf (stderr, "has-range-table");
1015
1016 /* ??? Should print the range table; for now, just skip it. */
1017 p += 2; /* skip range table bits */
1018 EXTRACT_NUMBER_AND_INCR (count, p);
1019 p = CHARSET_RANGE_TABLE_END (p, count);
1020 }
1021 }
1022 break;
1023
1024 case begline:
1025 fprintf (stderr, "/begline");
1026 break;
1027
1028 case endline:
1029 fprintf (stderr, "/endline");
1030 break;
1031
1032 case on_failure_jump:
1033 extract_number_and_incr (&mcnt, &p);
1034 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
1035 break;
1036
1037 case on_failure_keep_string_jump:
1038 extract_number_and_incr (&mcnt, &p);
1039 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
1040 break;
1041
1042 case on_failure_jump_nastyloop:
1043 extract_number_and_incr (&mcnt, &p);
1044 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
1045 break;
1046
1047 case on_failure_jump_loop:
1048 extract_number_and_incr (&mcnt, &p);
1049 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
1050 break;
1051
1052 case on_failure_jump_smart:
1053 extract_number_and_incr (&mcnt, &p);
1054 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
1055 break;
1056
1057 case jump:
1058 extract_number_and_incr (&mcnt, &p);
1059 fprintf (stderr, "/jump to %d", p + mcnt - start);
1060 break;
1061
1062 case succeed_n:
1063 extract_number_and_incr (&mcnt, &p);
1064 extract_number_and_incr (&mcnt2, &p);
1065 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
1066 break;
1067
1068 case jump_n:
1069 extract_number_and_incr (&mcnt, &p);
1070 extract_number_and_incr (&mcnt2, &p);
1071 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
1072 break;
1073
1074 case set_number_at:
1075 extract_number_and_incr (&mcnt, &p);
1076 extract_number_and_incr (&mcnt2, &p);
1077 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
1078 break;
1079
1080 case wordbound:
1081 fprintf (stderr, "/wordbound");
1082 break;
1083
1084 case notwordbound:
1085 fprintf (stderr, "/notwordbound");
1086 break;
1087
1088 case wordbeg:
1089 fprintf (stderr, "/wordbeg");
1090 break;
1091
1092 case wordend:
1093 fprintf (stderr, "/wordend");
1094 break;
1095
1096 case symbeg:
1097 fprintf (stderr, "/symbeg");
1098 break;
1099
1100 case symend:
1101 fprintf (stderr, "/symend");
1102 break;
1103
1104 case syntaxspec:
1105 fprintf (stderr, "/syntaxspec");
1106 mcnt = *p++;
1107 fprintf (stderr, "/%d", mcnt);
1108 break;
1109
1110 case notsyntaxspec:
1111 fprintf (stderr, "/notsyntaxspec");
1112 mcnt = *p++;
1113 fprintf (stderr, "/%d", mcnt);
1114 break;
1115
1116 # ifdef emacs
1117 case before_dot:
1118 fprintf (stderr, "/before_dot");
1119 break;
1120
1121 case at_dot:
1122 fprintf (stderr, "/at_dot");
1123 break;
1124
1125 case after_dot:
1126 fprintf (stderr, "/after_dot");
1127 break;
1128
1129 case categoryspec:
1130 fprintf (stderr, "/categoryspec");
1131 mcnt = *p++;
1132 fprintf (stderr, "/%d", mcnt);
1133 break;
1134
1135 case notcategoryspec:
1136 fprintf (stderr, "/notcategoryspec");
1137 mcnt = *p++;
1138 fprintf (stderr, "/%d", mcnt);
1139 break;
1140 # endif /* emacs */
1141
1142 case begbuf:
1143 fprintf (stderr, "/begbuf");
1144 break;
1145
1146 case endbuf:
1147 fprintf (stderr, "/endbuf");
1148 break;
1149
1150 default:
1151 fprintf (stderr, "?%d", *(p-1));
1152 }
1153
1154 fprintf (stderr, "\n");
1155 }
1156
1157 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
1158 }
1159
1160
1161 void
1162 print_compiled_pattern (bufp)
1163 struct re_pattern_buffer *bufp;
1164 {
1165 re_char *buffer = bufp->buffer;
1166
1167 print_partial_compiled_pattern (buffer, buffer + bufp->used);
1168 printf ("%ld bytes used/%ld bytes allocated.\n",
1169 bufp->used, bufp->allocated);
1170
1171 if (bufp->fastmap_accurate && bufp->fastmap)
1172 {
1173 printf ("fastmap: ");
1174 print_fastmap (bufp->fastmap);
1175 }
1176
1177 printf ("re_nsub: %d\t", bufp->re_nsub);
1178 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1179 printf ("can_be_null: %d\t", bufp->can_be_null);
1180 printf ("no_sub: %d\t", bufp->no_sub);
1181 printf ("not_bol: %d\t", bufp->not_bol);
1182 printf ("not_eol: %d\t", bufp->not_eol);
1183 printf ("syntax: %lx\n", bufp->syntax);
1184 fflush (stdout);
1185 /* Perhaps we should print the translate table? */
1186 }
1187
1188
1189 void
1190 print_double_string (where, string1, size1, string2, size2)
1191 re_char *where;
1192 re_char *string1;
1193 re_char *string2;
1194 ssize_t size1;
1195 ssize_t size2;
1196 {
1197 ssize_t this_char;
1198
1199 if (where == NULL)
1200 printf ("(null)");
1201 else
1202 {
1203 if (FIRST_STRING_P (where))
1204 {
1205 for (this_char = where - string1; this_char < size1; this_char++)
1206 putchar (string1[this_char]);
1207
1208 where = string2;
1209 }
1210
1211 for (this_char = where - string2; this_char < size2; this_char++)
1212 putchar (string2[this_char]);
1213 }
1214 }
1215
1216 #else /* not DEBUG */
1217
1218 # undef assert
1219 # define assert(e)
1220
1221 # define DEBUG_STATEMENT(e)
1222 # define DEBUG_PRINT1(x)
1223 # define DEBUG_PRINT2(x1, x2)
1224 # define DEBUG_PRINT3(x1, x2, x3)
1225 # define DEBUG_PRINT4(x1, x2, x3, x4)
1226 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1227 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1228
1229 #endif /* not DEBUG */
1230 \f
1231 /* Use this to suppress gcc's `...may be used before initialized' warnings. */
1232 #ifdef lint
1233 # define IF_LINT(Code) Code
1234 #else
1235 # define IF_LINT(Code) /* empty */
1236 #endif
1237 \f
1238 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1239 also be assigned to arbitrarily: each pattern buffer stores its own
1240 syntax, so it can be changed between regex compilations. */
1241 /* This has no initializer because initialized variables in Emacs
1242 become read-only after dumping. */
1243 reg_syntax_t re_syntax_options;
1244
1245
1246 /* Specify the precise syntax of regexps for compilation. This provides
1247 for compatibility for various utilities which historically have
1248 different, incompatible syntaxes.
1249
1250 The argument SYNTAX is a bit mask comprised of the various bits
1251 defined in regex.h. We return the old syntax. */
1252
1253 reg_syntax_t
1254 re_set_syntax (reg_syntax_t syntax)
1255 {
1256 reg_syntax_t ret = re_syntax_options;
1257
1258 re_syntax_options = syntax;
1259 return ret;
1260 }
1261 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1262
1263 /* Regexp to use to replace spaces, or NULL meaning don't. */
1264 static re_char *whitespace_regexp;
1265
1266 void
1267 re_set_whitespace_regexp (const char *regexp)
1268 {
1269 whitespace_regexp = (re_char *) regexp;
1270 }
1271 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1272 \f
1273 /* This table gives an error message for each of the error codes listed
1274 in regex.h. Obviously the order here has to be same as there.
1275 POSIX doesn't require that we do anything for REG_NOERROR,
1276 but why not be nice? */
1277
1278 static const char *re_error_msgid[] =
1279 {
1280 gettext_noop ("Success"), /* REG_NOERROR */
1281 gettext_noop ("No match"), /* REG_NOMATCH */
1282 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1283 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1284 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1285 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1286 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1287 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1288 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1289 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1290 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1291 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1292 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1293 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1294 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1295 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1296 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1297 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
1298 };
1299 \f
1300 /* Avoiding alloca during matching, to placate r_alloc. */
1301
1302 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1303 searching and matching functions should not call alloca. On some
1304 systems, alloca is implemented in terms of malloc, and if we're
1305 using the relocating allocator routines, then malloc could cause a
1306 relocation, which might (if the strings being searched are in the
1307 ralloc heap) shift the data out from underneath the regexp
1308 routines.
1309
1310 Here's another reason to avoid allocation: Emacs
1311 processes input from X in a signal handler; processing X input may
1312 call malloc; if input arrives while a matching routine is calling
1313 malloc, then we're scrod. But Emacs can't just block input while
1314 calling matching routines; then we don't notice interrupts when
1315 they come in. So, Emacs blocks input around all regexp calls
1316 except the matching calls, which it leaves unprotected, in the
1317 faith that they will not malloc. */
1318
1319 /* Normally, this is fine. */
1320 #define MATCH_MAY_ALLOCATE
1321
1322 /* The match routines may not allocate if (1) they would do it with malloc
1323 and (2) it's not safe for them to use malloc.
1324 Note that if REL_ALLOC is defined, matching would not use malloc for the
1325 failure stack, but we would still use it for the register vectors;
1326 so REL_ALLOC should not affect this. */
1327 #if defined REGEX_MALLOC && defined emacs
1328 # undef MATCH_MAY_ALLOCATE
1329 #endif
1330
1331 \f
1332 /* Failure stack declarations and macros; both re_compile_fastmap and
1333 re_match_2 use a failure stack. These have to be macros because of
1334 REGEX_ALLOCATE_STACK. */
1335
1336
1337 /* Approximate number of failure points for which to initially allocate space
1338 when matching. If this number is exceeded, we allocate more
1339 space, so it is not a hard limit. */
1340 #ifndef INIT_FAILURE_ALLOC
1341 # define INIT_FAILURE_ALLOC 20
1342 #endif
1343
1344 /* Roughly the maximum number of failure points on the stack. Would be
1345 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1346 This is a variable only so users of regex can assign to it; we never
1347 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1348 before using it, so it should probably be a byte-count instead. */
1349 # if defined MATCH_MAY_ALLOCATE
1350 /* Note that 4400 was enough to cause a crash on Alpha OSF/1,
1351 whose default stack limit is 2mb. In order for a larger
1352 value to work reliably, you have to try to make it accord
1353 with the process stack limit. */
1354 size_t re_max_failures = 40000;
1355 # else
1356 size_t re_max_failures = 4000;
1357 # endif
1358
1359 union fail_stack_elt
1360 {
1361 re_char *pointer;
1362 /* This should be the biggest `int' that's no bigger than a pointer. */
1363 long integer;
1364 };
1365
1366 typedef union fail_stack_elt fail_stack_elt_t;
1367
1368 typedef struct
1369 {
1370 fail_stack_elt_t *stack;
1371 size_t size;
1372 size_t avail; /* Offset of next open position. */
1373 size_t frame; /* Offset of the cur constructed frame. */
1374 } fail_stack_type;
1375
1376 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
1377
1378
1379 /* Define macros to initialize and free the failure stack.
1380 Do `return -2' if the alloc fails. */
1381
1382 #ifdef MATCH_MAY_ALLOCATE
1383 # define INIT_FAIL_STACK() \
1384 do { \
1385 fail_stack.stack = \
1386 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1387 * sizeof (fail_stack_elt_t)); \
1388 \
1389 if (fail_stack.stack == NULL) \
1390 return -2; \
1391 \
1392 fail_stack.size = INIT_FAILURE_ALLOC; \
1393 fail_stack.avail = 0; \
1394 fail_stack.frame = 0; \
1395 } while (0)
1396 #else
1397 # define INIT_FAIL_STACK() \
1398 do { \
1399 fail_stack.avail = 0; \
1400 fail_stack.frame = 0; \
1401 } while (0)
1402
1403 # define RETALLOC_IF(addr, n, t) \
1404 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
1405 #endif
1406
1407
1408 /* Double the size of FAIL_STACK, up to a limit
1409 which allows approximately `re_max_failures' items.
1410
1411 Return 1 if succeeds, and 0 if either ran out of memory
1412 allocating space for it or it was already too large.
1413
1414 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1415
1416 /* Factor to increase the failure stack size by
1417 when we increase it.
1418 This used to be 2, but 2 was too wasteful
1419 because the old discarded stacks added up to as much space
1420 were as ultimate, maximum-size stack. */
1421 #define FAIL_STACK_GROWTH_FACTOR 4
1422
1423 #define GROW_FAIL_STACK(fail_stack) \
1424 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1425 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1426 ? 0 \
1427 : ((fail_stack).stack \
1428 = REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1429 (fail_stack).size * sizeof (fail_stack_elt_t), \
1430 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1431 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1432 * FAIL_STACK_GROWTH_FACTOR))), \
1433 \
1434 (fail_stack).stack == NULL \
1435 ? 0 \
1436 : ((fail_stack).size \
1437 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1438 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1439 * FAIL_STACK_GROWTH_FACTOR)) \
1440 / sizeof (fail_stack_elt_t)), \
1441 1)))
1442
1443
1444 /* Push a pointer value onto the failure stack.
1445 Assumes the variable `fail_stack'. Probably should only
1446 be called from within `PUSH_FAILURE_POINT'. */
1447 #define PUSH_FAILURE_POINTER(item) \
1448 fail_stack.stack[fail_stack.avail++].pointer = (item)
1449
1450 /* This pushes an integer-valued item onto the failure stack.
1451 Assumes the variable `fail_stack'. Probably should only
1452 be called from within `PUSH_FAILURE_POINT'. */
1453 #define PUSH_FAILURE_INT(item) \
1454 fail_stack.stack[fail_stack.avail++].integer = (item)
1455
1456 /* These POP... operations complement the PUSH... operations.
1457 All assume that `fail_stack' is nonempty. */
1458 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1459 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1460
1461 /* Individual items aside from the registers. */
1462 #define NUM_NONREG_ITEMS 3
1463
1464 /* Used to examine the stack (to detect infinite loops). */
1465 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
1466 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
1467 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1468 #define TOP_FAILURE_HANDLE() fail_stack.frame
1469
1470
1471 #define ENSURE_FAIL_STACK(space) \
1472 while (REMAINING_AVAIL_SLOTS <= space) { \
1473 if (!GROW_FAIL_STACK (fail_stack)) \
1474 return -2; \
1475 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1476 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1477 }
1478
1479 /* Push register NUM onto the stack. */
1480 #define PUSH_FAILURE_REG(num) \
1481 do { \
1482 char *destination; \
1483 ENSURE_FAIL_STACK(3); \
1484 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1485 num, regstart[num], regend[num]); \
1486 PUSH_FAILURE_POINTER (regstart[num]); \
1487 PUSH_FAILURE_POINTER (regend[num]); \
1488 PUSH_FAILURE_INT (num); \
1489 } while (0)
1490
1491 /* Change the counter's value to VAL, but make sure that it will
1492 be reset when backtracking. */
1493 #define PUSH_NUMBER(ptr,val) \
1494 do { \
1495 char *destination; \
1496 int c; \
1497 ENSURE_FAIL_STACK(3); \
1498 EXTRACT_NUMBER (c, ptr); \
1499 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
1500 PUSH_FAILURE_INT (c); \
1501 PUSH_FAILURE_POINTER (ptr); \
1502 PUSH_FAILURE_INT (-1); \
1503 STORE_NUMBER (ptr, val); \
1504 } while (0)
1505
1506 /* Pop a saved register off the stack. */
1507 #define POP_FAILURE_REG_OR_COUNT() \
1508 do { \
1509 long pfreg = POP_FAILURE_INT (); \
1510 if (pfreg == -1) \
1511 { \
1512 /* It's a counter. */ \
1513 /* Here, we discard `const', making re_match non-reentrant. */ \
1514 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
1515 pfreg = POP_FAILURE_INT (); \
1516 STORE_NUMBER (ptr, pfreg); \
1517 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, pfreg); \
1518 } \
1519 else \
1520 { \
1521 regend[pfreg] = POP_FAILURE_POINTER (); \
1522 regstart[pfreg] = POP_FAILURE_POINTER (); \
1523 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1524 pfreg, regstart[pfreg], regend[pfreg]); \
1525 } \
1526 } while (0)
1527
1528 /* Check that we are not stuck in an infinite loop. */
1529 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1530 do { \
1531 ssize_t failure = TOP_FAILURE_HANDLE (); \
1532 /* Check for infinite matching loops */ \
1533 while (failure > 0 \
1534 && (FAILURE_STR (failure) == string_place \
1535 || FAILURE_STR (failure) == NULL)) \
1536 { \
1537 assert (FAILURE_PAT (failure) >= bufp->buffer \
1538 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1539 if (FAILURE_PAT (failure) == pat_cur) \
1540 { \
1541 cycle = 1; \
1542 break; \
1543 } \
1544 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1545 failure = NEXT_FAILURE_HANDLE(failure); \
1546 } \
1547 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1548 } while (0)
1549
1550 /* Push the information about the state we will need
1551 if we ever fail back to it.
1552
1553 Requires variables fail_stack, regstart, regend and
1554 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1555 declared.
1556
1557 Does `return FAILURE_CODE' if runs out of memory. */
1558
1559 #define PUSH_FAILURE_POINT(pattern, string_place) \
1560 do { \
1561 char *destination; \
1562 /* Must be int, so when we don't save any registers, the arithmetic \
1563 of 0 + -1 isn't done as unsigned. */ \
1564 \
1565 DEBUG_STATEMENT (nfailure_points_pushed++); \
1566 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
1567 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1568 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1569 \
1570 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1571 \
1572 DEBUG_PRINT1 ("\n"); \
1573 \
1574 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1575 PUSH_FAILURE_INT (fail_stack.frame); \
1576 \
1577 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1578 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1579 DEBUG_PRINT1 ("'\n"); \
1580 PUSH_FAILURE_POINTER (string_place); \
1581 \
1582 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1583 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1584 PUSH_FAILURE_POINTER (pattern); \
1585 \
1586 /* Close the frame by moving the frame pointer past it. */ \
1587 fail_stack.frame = fail_stack.avail; \
1588 } while (0)
1589
1590 /* Estimate the size of data pushed by a typical failure stack entry.
1591 An estimate is all we need, because all we use this for
1592 is to choose a limit for how big to make the failure stack. */
1593 /* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
1594 #define TYPICAL_FAILURE_SIZE 20
1595
1596 /* How many items can still be added to the stack without overflowing it. */
1597 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1598
1599
1600 /* Pops what PUSH_FAIL_STACK pushes.
1601
1602 We restore into the parameters, all of which should be lvalues:
1603 STR -- the saved data position.
1604 PAT -- the saved pattern position.
1605 REGSTART, REGEND -- arrays of string positions.
1606
1607 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1608 `pend', `string1', `size1', `string2', and `size2'. */
1609
1610 #define POP_FAILURE_POINT(str, pat) \
1611 do { \
1612 assert (!FAIL_STACK_EMPTY ()); \
1613 \
1614 /* Remove failure points and point to how many regs pushed. */ \
1615 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1616 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1617 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1618 \
1619 /* Pop the saved registers. */ \
1620 while (fail_stack.frame < fail_stack.avail) \
1621 POP_FAILURE_REG_OR_COUNT (); \
1622 \
1623 pat = POP_FAILURE_POINTER (); \
1624 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1625 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1626 \
1627 /* If the saved string location is NULL, it came from an \
1628 on_failure_keep_string_jump opcode, and we want to throw away the \
1629 saved NULL, thus retaining our current position in the string. */ \
1630 str = POP_FAILURE_POINTER (); \
1631 DEBUG_PRINT2 (" Popping string %p: `", str); \
1632 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1633 DEBUG_PRINT1 ("'\n"); \
1634 \
1635 fail_stack.frame = POP_FAILURE_INT (); \
1636 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
1637 \
1638 assert (fail_stack.avail >= 0); \
1639 assert (fail_stack.frame <= fail_stack.avail); \
1640 \
1641 DEBUG_STATEMENT (nfailure_points_popped++); \
1642 } while (0) /* POP_FAILURE_POINT */
1643
1644
1645 \f
1646 /* Registers are set to a sentinel when they haven't yet matched. */
1647 #define REG_UNSET(e) ((e) == NULL)
1648 \f
1649 /* Subroutine declarations and macros for regex_compile. */
1650
1651 static reg_errcode_t regex_compile (re_char *pattern, size_t size,
1652 reg_syntax_t syntax,
1653 struct re_pattern_buffer *bufp);
1654 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1655 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1656 static void insert_op1 (re_opcode_t op, unsigned char *loc,
1657 int arg, unsigned char *end);
1658 static void insert_op2 (re_opcode_t op, unsigned char *loc,
1659 int arg1, int arg2, unsigned char *end);
1660 static boolean at_begline_loc_p (re_char *pattern, re_char *p,
1661 reg_syntax_t syntax);
1662 static boolean at_endline_loc_p (re_char *p, re_char *pend,
1663 reg_syntax_t syntax);
1664 static re_char *skip_one_char (re_char *p);
1665 static int analyse_first (re_char *p, re_char *pend,
1666 char *fastmap, const int multibyte);
1667
1668 /* Fetch the next character in the uncompiled pattern, with no
1669 translation. */
1670 #define PATFETCH(c) \
1671 do { \
1672 int len; \
1673 if (p == pend) return REG_EEND; \
1674 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
1675 p += len; \
1676 } while (0)
1677
1678
1679 /* If `translate' is non-null, return translate[D], else just D. We
1680 cast the subscript to translate because some data is declared as
1681 `char *', to avoid warnings when a string constant is passed. But
1682 when we use a character as a subscript we must make it unsigned. */
1683 #ifndef TRANSLATE
1684 # define TRANSLATE(d) \
1685 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
1686 #endif
1687
1688
1689 /* Macros for outputting the compiled pattern into `buffer'. */
1690
1691 /* If the buffer isn't allocated when it comes in, use this. */
1692 #define INIT_BUF_SIZE 32
1693
1694 /* Make sure we have at least N more bytes of space in buffer. */
1695 #define GET_BUFFER_SPACE(n) \
1696 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
1697 EXTEND_BUFFER ()
1698
1699 /* Make sure we have one more byte of buffer space and then add C to it. */
1700 #define BUF_PUSH(c) \
1701 do { \
1702 GET_BUFFER_SPACE (1); \
1703 *b++ = (unsigned char) (c); \
1704 } while (0)
1705
1706
1707 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1708 #define BUF_PUSH_2(c1, c2) \
1709 do { \
1710 GET_BUFFER_SPACE (2); \
1711 *b++ = (unsigned char) (c1); \
1712 *b++ = (unsigned char) (c2); \
1713 } while (0)
1714
1715
1716 /* Store a jump with opcode OP at LOC to location TO. We store a
1717 relative address offset by the three bytes the jump itself occupies. */
1718 #define STORE_JUMP(op, loc, to) \
1719 store_op1 (op, loc, (to) - (loc) - 3)
1720
1721 /* Likewise, for a two-argument jump. */
1722 #define STORE_JUMP2(op, loc, to, arg) \
1723 store_op2 (op, loc, (to) - (loc) - 3, arg)
1724
1725 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1726 #define INSERT_JUMP(op, loc, to) \
1727 insert_op1 (op, loc, (to) - (loc) - 3, b)
1728
1729 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1730 #define INSERT_JUMP2(op, loc, to, arg) \
1731 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1732
1733
1734 /* This is not an arbitrary limit: the arguments which represent offsets
1735 into the pattern are two bytes long. So if 2^15 bytes turns out to
1736 be too small, many things would have to change. */
1737 # define MAX_BUF_SIZE (1L << 15)
1738
1739 /* Extend the buffer by twice its current size via realloc and
1740 reset the pointers that pointed into the old block to point to the
1741 correct places in the new one. If extending the buffer results in it
1742 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1743 #if __BOUNDED_POINTERS__
1744 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1745 # define MOVE_BUFFER_POINTER(P) \
1746 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1747 SET_HIGH_BOUND (P), \
1748 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
1749 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1750 else \
1751 { \
1752 SET_HIGH_BOUND (b); \
1753 SET_HIGH_BOUND (begalt); \
1754 if (fixup_alt_jump) \
1755 SET_HIGH_BOUND (fixup_alt_jump); \
1756 if (laststart) \
1757 SET_HIGH_BOUND (laststart); \
1758 if (pending_exact) \
1759 SET_HIGH_BOUND (pending_exact); \
1760 }
1761 #else
1762 # define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
1763 # define ELSE_EXTEND_BUFFER_HIGH_BOUND
1764 #endif
1765 #define EXTEND_BUFFER() \
1766 do { \
1767 unsigned char *old_buffer = bufp->buffer; \
1768 if (bufp->allocated == MAX_BUF_SIZE) \
1769 return REG_ESIZE; \
1770 bufp->allocated <<= 1; \
1771 if (bufp->allocated > MAX_BUF_SIZE) \
1772 bufp->allocated = MAX_BUF_SIZE; \
1773 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
1774 if (bufp->buffer == NULL) \
1775 return REG_ESPACE; \
1776 /* If the buffer moved, move all the pointers into it. */ \
1777 if (old_buffer != bufp->buffer) \
1778 { \
1779 unsigned char *new_buffer = bufp->buffer; \
1780 MOVE_BUFFER_POINTER (b); \
1781 MOVE_BUFFER_POINTER (begalt); \
1782 if (fixup_alt_jump) \
1783 MOVE_BUFFER_POINTER (fixup_alt_jump); \
1784 if (laststart) \
1785 MOVE_BUFFER_POINTER (laststart); \
1786 if (pending_exact) \
1787 MOVE_BUFFER_POINTER (pending_exact); \
1788 } \
1789 ELSE_EXTEND_BUFFER_HIGH_BOUND \
1790 } while (0)
1791
1792
1793 /* Since we have one byte reserved for the register number argument to
1794 {start,stop}_memory, the maximum number of groups we can report
1795 things about is what fits in that byte. */
1796 #define MAX_REGNUM 255
1797
1798 /* But patterns can have more than `MAX_REGNUM' registers. We just
1799 ignore the excess. */
1800 typedef int regnum_t;
1801
1802
1803 /* Macros for the compile stack. */
1804
1805 /* Since offsets can go either forwards or backwards, this type needs to
1806 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1807 /* int may be not enough when sizeof(int) == 2. */
1808 typedef long pattern_offset_t;
1809
1810 typedef struct
1811 {
1812 pattern_offset_t begalt_offset;
1813 pattern_offset_t fixup_alt_jump;
1814 pattern_offset_t laststart_offset;
1815 regnum_t regnum;
1816 } compile_stack_elt_t;
1817
1818
1819 typedef struct
1820 {
1821 compile_stack_elt_t *stack;
1822 size_t size;
1823 size_t avail; /* Offset of next open position. */
1824 } compile_stack_type;
1825
1826
1827 #define INIT_COMPILE_STACK_SIZE 32
1828
1829 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1830 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1831
1832 /* The next available element. */
1833 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1834
1835 /* Explicit quit checking is only used on NTemacs and whenever we
1836 use polling to process input events. */
1837 #if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
1838 extern int immediate_quit;
1839 # define IMMEDIATE_QUIT_CHECK \
1840 do { \
1841 if (immediate_quit) QUIT; \
1842 } while (0)
1843 #else
1844 # define IMMEDIATE_QUIT_CHECK ((void)0)
1845 #endif
1846 \f
1847 /* Structure to manage work area for range table. */
1848 struct range_table_work_area
1849 {
1850 int *table; /* actual work area. */
1851 int allocated; /* allocated size for work area in bytes. */
1852 int used; /* actually used size in words. */
1853 int bits; /* flag to record character classes */
1854 };
1855
1856 /* Make sure that WORK_AREA can hold more N multibyte characters.
1857 This is used only in set_image_of_range and set_image_of_range_1.
1858 It expects WORK_AREA to be a pointer.
1859 If it can't get the space, it returns from the surrounding function. */
1860
1861 #define EXTEND_RANGE_TABLE(work_area, n) \
1862 do { \
1863 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1864 { \
1865 extend_range_table_work_area (&work_area); \
1866 if ((work_area).table == 0) \
1867 return (REG_ESPACE); \
1868 } \
1869 } while (0)
1870
1871 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1872 (work_area).bits |= (bit)
1873
1874 /* Bits used to implement the multibyte-part of the various character classes
1875 such as [:alnum:] in a charset's range table. */
1876 #define BIT_WORD 0x1
1877 #define BIT_LOWER 0x2
1878 #define BIT_PUNCT 0x4
1879 #define BIT_SPACE 0x8
1880 #define BIT_UPPER 0x10
1881 #define BIT_MULTIBYTE 0x20
1882
1883 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1884 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1885 do { \
1886 EXTEND_RANGE_TABLE ((work_area), 2); \
1887 (work_area).table[(work_area).used++] = (range_start); \
1888 (work_area).table[(work_area).used++] = (range_end); \
1889 } while (0)
1890
1891 /* Free allocated memory for WORK_AREA. */
1892 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1893 do { \
1894 if ((work_area).table) \
1895 free ((work_area).table); \
1896 } while (0)
1897
1898 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1899 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1900 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1901 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1902 \f
1903
1904 /* Set the bit for character C in a list. */
1905 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1906
1907
1908 #ifdef emacs
1909
1910 /* Store characters in the range FROM to TO in the bitmap at B (for
1911 ASCII and unibyte characters) and WORK_AREA (for multibyte
1912 characters) while translating them and paying attention to the
1913 continuity of translated characters.
1914
1915 Implementation note: It is better to implement these fairly big
1916 macros by a function, but it's not that easy because macros called
1917 in this macro assume various local variables already declared. */
1918
1919 /* Both FROM and TO are ASCII characters. */
1920
1921 #define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1922 do { \
1923 int C0, C1; \
1924 \
1925 for (C0 = (FROM); C0 <= (TO); C0++) \
1926 { \
1927 C1 = TRANSLATE (C0); \
1928 if (! ASCII_CHAR_P (C1)) \
1929 { \
1930 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
1931 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
1932 C1 = C0; \
1933 } \
1934 SET_LIST_BIT (C1); \
1935 } \
1936 } while (0)
1937
1938
1939 /* Both FROM and TO are unibyte characters (0x80..0xFF). */
1940
1941 #define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
1942 do { \
1943 int C0, C1, C2, I; \
1944 int USED = RANGE_TABLE_WORK_USED (work_area); \
1945 \
1946 for (C0 = (FROM); C0 <= (TO); C0++) \
1947 { \
1948 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
1949 if (CHAR_BYTE8_P (C1)) \
1950 SET_LIST_BIT (C0); \
1951 else \
1952 { \
1953 C2 = TRANSLATE (C1); \
1954 if (C2 == C1 \
1955 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
1956 C1 = C0; \
1957 SET_LIST_BIT (C1); \
1958 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1959 { \
1960 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1961 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1962 \
1963 if (C2 >= from - 1 && C2 <= to + 1) \
1964 { \
1965 if (C2 == from - 1) \
1966 RANGE_TABLE_WORK_ELT (work_area, I)--; \
1967 else if (C2 == to + 1) \
1968 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
1969 break; \
1970 } \
1971 } \
1972 if (I < USED) \
1973 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
1974 } \
1975 } \
1976 } while (0)
1977
1978
1979 /* Both FROM and TO are multibyte characters. */
1980
1981 #define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
1982 do { \
1983 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
1984 \
1985 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
1986 for (C0 = (FROM); C0 <= (TO); C0++) \
1987 { \
1988 C1 = TRANSLATE (C0); \
1989 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
1990 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
1991 SET_LIST_BIT (C2); \
1992 if (C1 >= (FROM) && C1 <= (TO)) \
1993 continue; \
1994 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1995 { \
1996 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
1997 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
1998 \
1999 if (C1 >= from - 1 && C1 <= to + 1) \
2000 { \
2001 if (C1 == from - 1) \
2002 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2003 else if (C1 == to + 1) \
2004 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2005 break; \
2006 } \
2007 } \
2008 if (I < USED) \
2009 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2010 } \
2011 } while (0)
2012
2013 #endif /* emacs */
2014
2015 /* Get the next unsigned number in the uncompiled pattern. */
2016 #define GET_UNSIGNED_NUMBER(num) \
2017 do { \
2018 if (p == pend) \
2019 FREE_STACK_RETURN (REG_EBRACE); \
2020 else \
2021 { \
2022 PATFETCH (c); \
2023 while ('0' <= c && c <= '9') \
2024 { \
2025 int prev; \
2026 if (num < 0) \
2027 num = 0; \
2028 prev = num; \
2029 num = num * 10 + c - '0'; \
2030 if (num / 10 != prev) \
2031 FREE_STACK_RETURN (REG_BADBR); \
2032 if (p == pend) \
2033 FREE_STACK_RETURN (REG_EBRACE); \
2034 PATFETCH (c); \
2035 } \
2036 } \
2037 } while (0)
2038 \f
2039 #if ! WIDE_CHAR_SUPPORT
2040
2041 /* Map a string to the char class it names (if any). */
2042 re_wctype_t
2043 re_wctype (const re_char *str)
2044 {
2045 const char *string = (const char *) str;
2046 if (STREQ (string, "alnum")) return RECC_ALNUM;
2047 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2048 else if (STREQ (string, "word")) return RECC_WORD;
2049 else if (STREQ (string, "ascii")) return RECC_ASCII;
2050 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2051 else if (STREQ (string, "graph")) return RECC_GRAPH;
2052 else if (STREQ (string, "lower")) return RECC_LOWER;
2053 else if (STREQ (string, "print")) return RECC_PRINT;
2054 else if (STREQ (string, "punct")) return RECC_PUNCT;
2055 else if (STREQ (string, "space")) return RECC_SPACE;
2056 else if (STREQ (string, "upper")) return RECC_UPPER;
2057 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2058 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2059 else if (STREQ (string, "digit")) return RECC_DIGIT;
2060 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2061 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2062 else if (STREQ (string, "blank")) return RECC_BLANK;
2063 else return 0;
2064 }
2065
2066 /* True if CH is in the char class CC. */
2067 boolean
2068 re_iswctype (int ch, re_wctype_t cc)
2069 {
2070 switch (cc)
2071 {
2072 case RECC_ALNUM: return ISALNUM (ch) != 0;
2073 case RECC_ALPHA: return ISALPHA (ch) != 0;
2074 case RECC_BLANK: return ISBLANK (ch) != 0;
2075 case RECC_CNTRL: return ISCNTRL (ch) != 0;
2076 case RECC_DIGIT: return ISDIGIT (ch) != 0;
2077 case RECC_GRAPH: return ISGRAPH (ch) != 0;
2078 case RECC_LOWER: return ISLOWER (ch) != 0;
2079 case RECC_PRINT: return ISPRINT (ch) != 0;
2080 case RECC_PUNCT: return ISPUNCT (ch) != 0;
2081 case RECC_SPACE: return ISSPACE (ch) != 0;
2082 case RECC_UPPER: return ISUPPER (ch) != 0;
2083 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2084 case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
2085 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2086 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
2087 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2088 case RECC_WORD: return ISWORD (ch) != 0;
2089 case RECC_ERROR: return false;
2090 default:
2091 abort ();
2092 }
2093 }
2094
2095 /* Return a bit-pattern to use in the range-table bits to match multibyte
2096 chars of class CC. */
2097 static int
2098 re_wctype_to_bit (re_wctype_t cc)
2099 {
2100 switch (cc)
2101 {
2102 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
2103 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2104 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2105 case RECC_LOWER: return BIT_LOWER;
2106 case RECC_UPPER: return BIT_UPPER;
2107 case RECC_PUNCT: return BIT_PUNCT;
2108 case RECC_SPACE: return BIT_SPACE;
2109 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2110 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2111 default:
2112 abort ();
2113 }
2114 }
2115 #endif
2116 \f
2117 /* Filling in the work area of a range. */
2118
2119 /* Actually extend the space in WORK_AREA. */
2120
2121 static void
2122 extend_range_table_work_area (struct range_table_work_area *work_area)
2123 {
2124 work_area->allocated += 16 * sizeof (int);
2125 work_area->table = realloc (work_area->table, work_area->allocated);
2126 }
2127
2128 #if 0
2129 #ifdef emacs
2130
2131 /* Carefully find the ranges of codes that are equivalent
2132 under case conversion to the range start..end when passed through
2133 TRANSLATE. Handle the case where non-letters can come in between
2134 two upper-case letters (which happens in Latin-1).
2135 Also handle the case of groups of more than 2 case-equivalent chars.
2136
2137 The basic method is to look at consecutive characters and see
2138 if they can form a run that can be handled as one.
2139
2140 Returns -1 if successful, REG_ESPACE if ran out of space. */
2141
2142 static int
2143 set_image_of_range_1 (struct range_table_work_area *work_area,
2144 re_wchar_t start, re_wchar_t end,
2145 RE_TRANSLATE_TYPE translate)
2146 {
2147 /* `one_case' indicates a character, or a run of characters,
2148 each of which is an isolate (no case-equivalents).
2149 This includes all ASCII non-letters.
2150
2151 `two_case' indicates a character, or a run of characters,
2152 each of which has two case-equivalent forms.
2153 This includes all ASCII letters.
2154
2155 `strange' indicates a character that has more than one
2156 case-equivalent. */
2157
2158 enum case_type {one_case, two_case, strange};
2159
2160 /* Describe the run that is in progress,
2161 which the next character can try to extend.
2162 If run_type is strange, that means there really is no run.
2163 If run_type is one_case, then run_start...run_end is the run.
2164 If run_type is two_case, then the run is run_start...run_end,
2165 and the case-equivalents end at run_eqv_end. */
2166
2167 enum case_type run_type = strange;
2168 int run_start, run_end, run_eqv_end;
2169
2170 Lisp_Object eqv_table;
2171
2172 if (!RE_TRANSLATE_P (translate))
2173 {
2174 EXTEND_RANGE_TABLE (work_area, 2);
2175 work_area->table[work_area->used++] = (start);
2176 work_area->table[work_area->used++] = (end);
2177 return -1;
2178 }
2179
2180 eqv_table = XCHAR_TABLE (translate)->extras[2];
2181
2182 for (; start <= end; start++)
2183 {
2184 enum case_type this_type;
2185 int eqv = RE_TRANSLATE (eqv_table, start);
2186 int minchar, maxchar;
2187
2188 /* Classify this character */
2189 if (eqv == start)
2190 this_type = one_case;
2191 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2192 this_type = two_case;
2193 else
2194 this_type = strange;
2195
2196 if (start < eqv)
2197 minchar = start, maxchar = eqv;
2198 else
2199 minchar = eqv, maxchar = start;
2200
2201 /* Can this character extend the run in progress? */
2202 if (this_type == strange || this_type != run_type
2203 || !(minchar == run_end + 1
2204 && (run_type == two_case
2205 ? maxchar == run_eqv_end + 1 : 1)))
2206 {
2207 /* No, end the run.
2208 Record each of its equivalent ranges. */
2209 if (run_type == one_case)
2210 {
2211 EXTEND_RANGE_TABLE (work_area, 2);
2212 work_area->table[work_area->used++] = run_start;
2213 work_area->table[work_area->used++] = run_end;
2214 }
2215 else if (run_type == two_case)
2216 {
2217 EXTEND_RANGE_TABLE (work_area, 4);
2218 work_area->table[work_area->used++] = run_start;
2219 work_area->table[work_area->used++] = run_end;
2220 work_area->table[work_area->used++]
2221 = RE_TRANSLATE (eqv_table, run_start);
2222 work_area->table[work_area->used++]
2223 = RE_TRANSLATE (eqv_table, run_end);
2224 }
2225 run_type = strange;
2226 }
2227
2228 if (this_type == strange)
2229 {
2230 /* For a strange character, add each of its equivalents, one
2231 by one. Don't start a range. */
2232 do
2233 {
2234 EXTEND_RANGE_TABLE (work_area, 2);
2235 work_area->table[work_area->used++] = eqv;
2236 work_area->table[work_area->used++] = eqv;
2237 eqv = RE_TRANSLATE (eqv_table, eqv);
2238 }
2239 while (eqv != start);
2240 }
2241
2242 /* Add this char to the run, or start a new run. */
2243 else if (run_type == strange)
2244 {
2245 /* Initialize a new range. */
2246 run_type = this_type;
2247 run_start = start;
2248 run_end = start;
2249 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2250 }
2251 else
2252 {
2253 /* Extend a running range. */
2254 run_end = minchar;
2255 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2256 }
2257 }
2258
2259 /* If a run is still in progress at the end, finish it now
2260 by recording its equivalent ranges. */
2261 if (run_type == one_case)
2262 {
2263 EXTEND_RANGE_TABLE (work_area, 2);
2264 work_area->table[work_area->used++] = run_start;
2265 work_area->table[work_area->used++] = run_end;
2266 }
2267 else if (run_type == two_case)
2268 {
2269 EXTEND_RANGE_TABLE (work_area, 4);
2270 work_area->table[work_area->used++] = run_start;
2271 work_area->table[work_area->used++] = run_end;
2272 work_area->table[work_area->used++]
2273 = RE_TRANSLATE (eqv_table, run_start);
2274 work_area->table[work_area->used++]
2275 = RE_TRANSLATE (eqv_table, run_end);
2276 }
2277
2278 return -1;
2279 }
2280
2281 #endif /* emacs */
2282
2283 /* Record the image of the range start..end when passed through
2284 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2285 and is not even necessarily contiguous.
2286 Normally we approximate it with the smallest contiguous range that contains
2287 all the chars we need. However, for Latin-1 we go to extra effort
2288 to do a better job.
2289
2290 This function is not called for ASCII ranges.
2291
2292 Returns -1 if successful, REG_ESPACE if ran out of space. */
2293
2294 static int
2295 set_image_of_range (struct range_table_work_area *work_area,
2296 re_wchar_t start, re_wchar_t end,
2297 RE_TRANSLATE_TYPE translate)
2298 {
2299 re_wchar_t cmin, cmax;
2300
2301 #ifdef emacs
2302 /* For Latin-1 ranges, use set_image_of_range_1
2303 to get proper handling of ranges that include letters and nonletters.
2304 For a range that includes the whole of Latin-1, this is not necessary.
2305 For other character sets, we don't bother to get this right. */
2306 if (RE_TRANSLATE_P (translate) && start < 04400
2307 && !(start < 04200 && end >= 04377))
2308 {
2309 int newend;
2310 int tem;
2311 newend = end;
2312 if (newend > 04377)
2313 newend = 04377;
2314 tem = set_image_of_range_1 (work_area, start, newend, translate);
2315 if (tem > 0)
2316 return tem;
2317
2318 start = 04400;
2319 if (end < 04400)
2320 return -1;
2321 }
2322 #endif
2323
2324 EXTEND_RANGE_TABLE (work_area, 2);
2325 work_area->table[work_area->used++] = (start);
2326 work_area->table[work_area->used++] = (end);
2327
2328 cmin = -1, cmax = -1;
2329
2330 if (RE_TRANSLATE_P (translate))
2331 {
2332 int ch;
2333
2334 for (ch = start; ch <= end; ch++)
2335 {
2336 re_wchar_t c = TRANSLATE (ch);
2337 if (! (start <= c && c <= end))
2338 {
2339 if (cmin == -1)
2340 cmin = c, cmax = c;
2341 else
2342 {
2343 cmin = MIN (cmin, c);
2344 cmax = MAX (cmax, c);
2345 }
2346 }
2347 }
2348
2349 if (cmin != -1)
2350 {
2351 EXTEND_RANGE_TABLE (work_area, 2);
2352 work_area->table[work_area->used++] = (cmin);
2353 work_area->table[work_area->used++] = (cmax);
2354 }
2355 }
2356
2357 return -1;
2358 }
2359 #endif /* 0 */
2360 \f
2361 #ifndef MATCH_MAY_ALLOCATE
2362
2363 /* If we cannot allocate large objects within re_match_2_internal,
2364 we make the fail stack and register vectors global.
2365 The fail stack, we grow to the maximum size when a regexp
2366 is compiled.
2367 The register vectors, we adjust in size each time we
2368 compile a regexp, according to the number of registers it needs. */
2369
2370 static fail_stack_type fail_stack;
2371
2372 /* Size with which the following vectors are currently allocated.
2373 That is so we can make them bigger as needed,
2374 but never make them smaller. */
2375 static int regs_allocated_size;
2376
2377 static re_char ** regstart, ** regend;
2378 static re_char **best_regstart, **best_regend;
2379
2380 /* Make the register vectors big enough for NUM_REGS registers,
2381 but don't make them smaller. */
2382
2383 static
2384 regex_grow_registers (int num_regs)
2385 {
2386 if (num_regs > regs_allocated_size)
2387 {
2388 RETALLOC_IF (regstart, num_regs, re_char *);
2389 RETALLOC_IF (regend, num_regs, re_char *);
2390 RETALLOC_IF (best_regstart, num_regs, re_char *);
2391 RETALLOC_IF (best_regend, num_regs, re_char *);
2392
2393 regs_allocated_size = num_regs;
2394 }
2395 }
2396
2397 #endif /* not MATCH_MAY_ALLOCATE */
2398 \f
2399 static boolean group_in_compile_stack (compile_stack_type compile_stack,
2400 regnum_t regnum);
2401
2402 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2403 Returns one of error codes defined in `regex.h', or zero for success.
2404
2405 Assumes the `allocated' (and perhaps `buffer') and `translate'
2406 fields are set in BUFP on entry.
2407
2408 If it succeeds, results are put in BUFP (if it returns an error, the
2409 contents of BUFP are undefined):
2410 `buffer' is the compiled pattern;
2411 `syntax' is set to SYNTAX;
2412 `used' is set to the length of the compiled pattern;
2413 `fastmap_accurate' is zero;
2414 `re_nsub' is the number of subexpressions in PATTERN;
2415 `not_bol' and `not_eol' are zero;
2416
2417 The `fastmap' field is neither examined nor set. */
2418
2419 /* Insert the `jump' from the end of last alternative to "here".
2420 The space for the jump has already been allocated. */
2421 #define FIXUP_ALT_JUMP() \
2422 do { \
2423 if (fixup_alt_jump) \
2424 STORE_JUMP (jump, fixup_alt_jump, b); \
2425 } while (0)
2426
2427
2428 /* Return, freeing storage we allocated. */
2429 #define FREE_STACK_RETURN(value) \
2430 do { \
2431 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2432 free (compile_stack.stack); \
2433 return value; \
2434 } while (0)
2435
2436 static reg_errcode_t
2437 regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
2438 {
2439 /* We fetch characters from PATTERN here. */
2440 register re_wchar_t c, c1;
2441
2442 /* Points to the end of the buffer, where we should append. */
2443 register unsigned char *b;
2444
2445 /* Keeps track of unclosed groups. */
2446 compile_stack_type compile_stack;
2447
2448 /* Points to the current (ending) position in the pattern. */
2449 #ifdef AIX
2450 /* `const' makes AIX compiler fail. */
2451 unsigned char *p = pattern;
2452 #else
2453 re_char *p = pattern;
2454 #endif
2455 re_char *pend = pattern + size;
2456
2457 /* How to translate the characters in the pattern. */
2458 RE_TRANSLATE_TYPE translate = bufp->translate;
2459
2460 /* Address of the count-byte of the most recently inserted `exactn'
2461 command. This makes it possible to tell if a new exact-match
2462 character can be added to that command or if the character requires
2463 a new `exactn' command. */
2464 unsigned char *pending_exact = 0;
2465
2466 /* Address of start of the most recently finished expression.
2467 This tells, e.g., postfix * where to find the start of its
2468 operand. Reset at the beginning of groups and alternatives. */
2469 unsigned char *laststart = 0;
2470
2471 /* Address of beginning of regexp, or inside of last group. */
2472 unsigned char *begalt;
2473
2474 /* Place in the uncompiled pattern (i.e., the {) to
2475 which to go back if the interval is invalid. */
2476 re_char *beg_interval;
2477
2478 /* Address of the place where a forward jump should go to the end of
2479 the containing expression. Each alternative of an `or' -- except the
2480 last -- ends with a forward jump of this sort. */
2481 unsigned char *fixup_alt_jump = 0;
2482
2483 /* Work area for range table of charset. */
2484 struct range_table_work_area range_table_work;
2485
2486 /* If the object matched can contain multibyte characters. */
2487 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2488
2489 /* Nonzero if we have pushed down into a subpattern. */
2490 int in_subpattern = 0;
2491
2492 /* These hold the values of p, pattern, and pend from the main
2493 pattern when we have pushed into a subpattern. */
2494 re_char *main_p IF_LINT (= NULL);
2495 re_char *main_pattern IF_LINT (= NULL);
2496 re_char *main_pend IF_LINT (= NULL);
2497
2498 #ifdef DEBUG
2499 debug++;
2500 DEBUG_PRINT1 ("\nCompiling pattern: ");
2501 if (debug > 0)
2502 {
2503 unsigned debug_count;
2504
2505 for (debug_count = 0; debug_count < size; debug_count++)
2506 putchar (pattern[debug_count]);
2507 putchar ('\n');
2508 }
2509 #endif /* DEBUG */
2510
2511 /* Initialize the compile stack. */
2512 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2513 if (compile_stack.stack == NULL)
2514 return REG_ESPACE;
2515
2516 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2517 compile_stack.avail = 0;
2518
2519 range_table_work.table = 0;
2520 range_table_work.allocated = 0;
2521
2522 /* Initialize the pattern buffer. */
2523 bufp->syntax = syntax;
2524 bufp->fastmap_accurate = 0;
2525 bufp->not_bol = bufp->not_eol = 0;
2526 bufp->used_syntax = 0;
2527
2528 /* Set `used' to zero, so that if we return an error, the pattern
2529 printer (for debugging) will think there's no pattern. We reset it
2530 at the end. */
2531 bufp->used = 0;
2532
2533 /* Always count groups, whether or not bufp->no_sub is set. */
2534 bufp->re_nsub = 0;
2535
2536 #if !defined emacs && !defined SYNTAX_TABLE
2537 /* Initialize the syntax table. */
2538 init_syntax_once ();
2539 #endif
2540
2541 if (bufp->allocated == 0)
2542 {
2543 if (bufp->buffer)
2544 { /* If zero allocated, but buffer is non-null, try to realloc
2545 enough space. This loses if buffer's address is bogus, but
2546 that is the user's responsibility. */
2547 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2548 }
2549 else
2550 { /* Caller did not allocate a buffer. Do it for them. */
2551 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2552 }
2553 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2554
2555 bufp->allocated = INIT_BUF_SIZE;
2556 }
2557
2558 begalt = b = bufp->buffer;
2559
2560 /* Loop through the uncompiled pattern until we're at the end. */
2561 while (1)
2562 {
2563 if (p == pend)
2564 {
2565 /* If this is the end of an included regexp,
2566 pop back to the main regexp and try again. */
2567 if (in_subpattern)
2568 {
2569 in_subpattern = 0;
2570 pattern = main_pattern;
2571 p = main_p;
2572 pend = main_pend;
2573 continue;
2574 }
2575 /* If this is the end of the main regexp, we are done. */
2576 break;
2577 }
2578
2579 PATFETCH (c);
2580
2581 switch (c)
2582 {
2583 case ' ':
2584 {
2585 re_char *p1 = p;
2586
2587 /* If there's no special whitespace regexp, treat
2588 spaces normally. And don't try to do this recursively. */
2589 if (!whitespace_regexp || in_subpattern)
2590 goto normal_char;
2591
2592 /* Peek past following spaces. */
2593 while (p1 != pend)
2594 {
2595 if (*p1 != ' ')
2596 break;
2597 p1++;
2598 }
2599 /* If the spaces are followed by a repetition op,
2600 treat them normally. */
2601 if (p1 != pend
2602 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
2603 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2604 goto normal_char;
2605
2606 /* Replace the spaces with the whitespace regexp. */
2607 in_subpattern = 1;
2608 main_p = p1;
2609 main_pend = pend;
2610 main_pattern = pattern;
2611 p = pattern = whitespace_regexp;
2612 pend = p + strlen ((const char *) p);
2613 break;
2614 }
2615
2616 case '^':
2617 {
2618 if ( /* If at start of pattern, it's an operator. */
2619 p == pattern + 1
2620 /* If context independent, it's an operator. */
2621 || syntax & RE_CONTEXT_INDEP_ANCHORS
2622 /* Otherwise, depends on what's come before. */
2623 || at_begline_loc_p (pattern, p, syntax))
2624 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
2625 else
2626 goto normal_char;
2627 }
2628 break;
2629
2630
2631 case '$':
2632 {
2633 if ( /* If at end of pattern, it's an operator. */
2634 p == pend
2635 /* If context independent, it's an operator. */
2636 || syntax & RE_CONTEXT_INDEP_ANCHORS
2637 /* Otherwise, depends on what's next. */
2638 || at_endline_loc_p (p, pend, syntax))
2639 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
2640 else
2641 goto normal_char;
2642 }
2643 break;
2644
2645
2646 case '+':
2647 case '?':
2648 if ((syntax & RE_BK_PLUS_QM)
2649 || (syntax & RE_LIMITED_OPS))
2650 goto normal_char;
2651 handle_plus:
2652 case '*':
2653 /* If there is no previous pattern... */
2654 if (!laststart)
2655 {
2656 if (syntax & RE_CONTEXT_INVALID_OPS)
2657 FREE_STACK_RETURN (REG_BADRPT);
2658 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2659 goto normal_char;
2660 }
2661
2662 {
2663 /* 1 means zero (many) matches is allowed. */
2664 boolean zero_times_ok = 0, many_times_ok = 0;
2665 boolean greedy = 1;
2666
2667 /* If there is a sequence of repetition chars, collapse it
2668 down to just one (the right one). We can't combine
2669 interval operators with these because of, e.g., `a{2}*',
2670 which should only match an even number of `a's. */
2671
2672 for (;;)
2673 {
2674 if ((syntax & RE_FRUGAL)
2675 && c == '?' && (zero_times_ok || many_times_ok))
2676 greedy = 0;
2677 else
2678 {
2679 zero_times_ok |= c != '+';
2680 many_times_ok |= c != '?';
2681 }
2682
2683 if (p == pend)
2684 break;
2685 else if (*p == '*'
2686 || (!(syntax & RE_BK_PLUS_QM)
2687 && (*p == '+' || *p == '?')))
2688 ;
2689 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
2690 {
2691 if (p+1 == pend)
2692 FREE_STACK_RETURN (REG_EESCAPE);
2693 if (p[1] == '+' || p[1] == '?')
2694 PATFETCH (c); /* Gobble up the backslash. */
2695 else
2696 break;
2697 }
2698 else
2699 break;
2700 /* If we get here, we found another repeat character. */
2701 PATFETCH (c);
2702 }
2703
2704 /* Star, etc. applied to an empty pattern is equivalent
2705 to an empty pattern. */
2706 if (!laststart || laststart == b)
2707 break;
2708
2709 /* Now we know whether or not zero matches is allowed
2710 and also whether or not two or more matches is allowed. */
2711 if (greedy)
2712 {
2713 if (many_times_ok)
2714 {
2715 boolean simple = skip_one_char (laststart) == b;
2716 size_t startoffset = 0;
2717 re_opcode_t ofj =
2718 /* Check if the loop can match the empty string. */
2719 (simple || !analyse_first (laststart, b, NULL, 0))
2720 ? on_failure_jump : on_failure_jump_loop;
2721 assert (skip_one_char (laststart) <= b);
2722
2723 if (!zero_times_ok && simple)
2724 { /* Since simple * loops can be made faster by using
2725 on_failure_keep_string_jump, we turn simple P+
2726 into PP* if P is simple. */
2727 unsigned char *p1, *p2;
2728 startoffset = b - laststart;
2729 GET_BUFFER_SPACE (startoffset);
2730 p1 = b; p2 = laststart;
2731 while (p2 < p1)
2732 *b++ = *p2++;
2733 zero_times_ok = 1;
2734 }
2735
2736 GET_BUFFER_SPACE (6);
2737 if (!zero_times_ok)
2738 /* A + loop. */
2739 STORE_JUMP (ofj, b, b + 6);
2740 else
2741 /* Simple * loops can use on_failure_keep_string_jump
2742 depending on what follows. But since we don't know
2743 that yet, we leave the decision up to
2744 on_failure_jump_smart. */
2745 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2746 laststart + startoffset, b + 6);
2747 b += 3;
2748 STORE_JUMP (jump, b, laststart + startoffset);
2749 b += 3;
2750 }
2751 else
2752 {
2753 /* A simple ? pattern. */
2754 assert (zero_times_ok);
2755 GET_BUFFER_SPACE (3);
2756 INSERT_JUMP (on_failure_jump, laststart, b + 3);
2757 b += 3;
2758 }
2759 }
2760 else /* not greedy */
2761 { /* I wish the greedy and non-greedy cases could be merged. */
2762
2763 GET_BUFFER_SPACE (7); /* We might use less. */
2764 if (many_times_ok)
2765 {
2766 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2767
2768 /* The non-greedy multiple match looks like
2769 a repeat..until: we only need a conditional jump
2770 at the end of the loop. */
2771 if (emptyp) BUF_PUSH (no_op);
2772 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2773 : on_failure_jump, b, laststart);
2774 b += 3;
2775 if (zero_times_ok)
2776 {
2777 /* The repeat...until naturally matches one or more.
2778 To also match zero times, we need to first jump to
2779 the end of the loop (its conditional jump). */
2780 INSERT_JUMP (jump, laststart, b);
2781 b += 3;
2782 }
2783 }
2784 else
2785 {
2786 /* non-greedy a?? */
2787 INSERT_JUMP (jump, laststart, b + 3);
2788 b += 3;
2789 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2790 b += 3;
2791 }
2792 }
2793 }
2794 pending_exact = 0;
2795 break;
2796
2797
2798 case '.':
2799 laststart = b;
2800 BUF_PUSH (anychar);
2801 break;
2802
2803
2804 case '[':
2805 {
2806 re_char *p1;
2807
2808 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2809
2810 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2811
2812 /* Ensure that we have enough space to push a charset: the
2813 opcode, the length count, and the bitset; 34 bytes in all. */
2814 GET_BUFFER_SPACE (34);
2815
2816 laststart = b;
2817
2818 /* We test `*p == '^' twice, instead of using an if
2819 statement, so we only need one BUF_PUSH. */
2820 BUF_PUSH (*p == '^' ? charset_not : charset);
2821 if (*p == '^')
2822 p++;
2823
2824 /* Remember the first position in the bracket expression. */
2825 p1 = p;
2826
2827 /* Push the number of bytes in the bitmap. */
2828 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2829
2830 /* Clear the whole map. */
2831 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2832
2833 /* charset_not matches newline according to a syntax bit. */
2834 if ((re_opcode_t) b[-2] == charset_not
2835 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2836 SET_LIST_BIT ('\n');
2837
2838 /* Read in characters and ranges, setting map bits. */
2839 for (;;)
2840 {
2841 boolean escaped_char = false;
2842 const unsigned char *p2 = p;
2843 re_wchar_t ch;
2844
2845 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2846
2847 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2848 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2849 So the translation is done later in a loop. Example:
2850 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
2851 PATFETCH (c);
2852
2853 /* \ might escape characters inside [...] and [^...]. */
2854 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2855 {
2856 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2857
2858 PATFETCH (c);
2859 escaped_char = true;
2860 }
2861 else
2862 {
2863 /* Could be the end of the bracket expression. If it's
2864 not (i.e., when the bracket expression is `[]' so
2865 far), the ']' character bit gets set way below. */
2866 if (c == ']' && p2 != p1)
2867 break;
2868 }
2869
2870 /* See if we're at the beginning of a possible character
2871 class. */
2872
2873 if (!escaped_char &&
2874 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2875 {
2876 /* Leave room for the null. */
2877 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
2878 const unsigned char *class_beg;
2879
2880 PATFETCH (c);
2881 c1 = 0;
2882 class_beg = p;
2883
2884 /* If pattern is `[[:'. */
2885 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2886
2887 for (;;)
2888 {
2889 PATFETCH (c);
2890 if ((c == ':' && *p == ']') || p == pend)
2891 break;
2892 if (c1 < CHAR_CLASS_MAX_LENGTH)
2893 str[c1++] = c;
2894 else
2895 /* This is in any case an invalid class name. */
2896 str[0] = '\0';
2897 }
2898 str[c1] = '\0';
2899
2900 /* If isn't a word bracketed by `[:' and `:]':
2901 undo the ending character, the letters, and
2902 leave the leading `:' and `[' (but set bits for
2903 them). */
2904 if (c == ':' && *p == ']')
2905 {
2906 re_wctype_t cc = re_wctype (str);
2907
2908 if (cc == 0)
2909 FREE_STACK_RETURN (REG_ECTYPE);
2910
2911 /* Throw away the ] at the end of the character
2912 class. */
2913 PATFETCH (c);
2914
2915 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2916
2917 #ifndef emacs
2918 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
2919 if (re_iswctype (btowc (ch), cc))
2920 {
2921 c = TRANSLATE (ch);
2922 if (c < (1 << BYTEWIDTH))
2923 SET_LIST_BIT (c);
2924 }
2925 #else /* emacs */
2926 /* Most character classes in a multibyte match
2927 just set a flag. Exceptions are is_blank,
2928 is_digit, is_cntrl, and is_xdigit, since
2929 they can only match ASCII characters. We
2930 don't need to handle them for multibyte.
2931 They are distinguished by a negative wctype. */
2932
2933 /* Setup the gl_state object to its buffer-defined
2934 value. This hardcodes the buffer-global
2935 syntax-table for ASCII chars, while the other chars
2936 will obey syntax-table properties. It's not ideal,
2937 but it's the way it's been done until now. */
2938 SETUP_BUFFER_SYNTAX_TABLE ();
2939
2940 for (ch = 0; ch < 256; ++ch)
2941 {
2942 c = RE_CHAR_TO_MULTIBYTE (ch);
2943 if (! CHAR_BYTE8_P (c)
2944 && re_iswctype (c, cc))
2945 {
2946 SET_LIST_BIT (ch);
2947 c1 = TRANSLATE (c);
2948 if (c1 == c)
2949 continue;
2950 if (ASCII_CHAR_P (c1))
2951 SET_LIST_BIT (c1);
2952 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2953 SET_LIST_BIT (c1);
2954 }
2955 }
2956 SET_RANGE_TABLE_WORK_AREA_BIT
2957 (range_table_work, re_wctype_to_bit (cc));
2958 #endif /* emacs */
2959 /* In most cases the matching rule for char classes
2960 only uses the syntax table for multibyte chars,
2961 so that the content of the syntax-table it is not
2962 hardcoded in the range_table. SPACE and WORD are
2963 the two exceptions. */
2964 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2965 bufp->used_syntax = 1;
2966
2967 /* Repeat the loop. */
2968 continue;
2969 }
2970 else
2971 {
2972 /* Go back to right after the "[:". */
2973 p = class_beg;
2974 SET_LIST_BIT ('[');
2975
2976 /* Because the `:' may starts the range, we
2977 can't simply set bit and repeat the loop.
2978 Instead, just set it to C and handle below. */
2979 c = ':';
2980 }
2981 }
2982
2983 if (p < pend && p[0] == '-' && p[1] != ']')
2984 {
2985
2986 /* Discard the `-'. */
2987 PATFETCH (c1);
2988
2989 /* Fetch the character which ends the range. */
2990 PATFETCH (c1);
2991 #ifdef emacs
2992 if (CHAR_BYTE8_P (c1)
2993 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2994 /* Treat the range from a multibyte character to
2995 raw-byte character as empty. */
2996 c = c1 + 1;
2997 #endif /* emacs */
2998 }
2999 else
3000 /* Range from C to C. */
3001 c1 = c;
3002
3003 if (c > c1)
3004 {
3005 if (syntax & RE_NO_EMPTY_RANGES)
3006 FREE_STACK_RETURN (REG_ERANGEX);
3007 /* Else, repeat the loop. */
3008 }
3009 else
3010 {
3011 #ifndef emacs
3012 /* Set the range into bitmap */
3013 for (; c <= c1; c++)
3014 {
3015 ch = TRANSLATE (c);
3016 if (ch < (1 << BYTEWIDTH))
3017 SET_LIST_BIT (ch);
3018 }
3019 #else /* emacs */
3020 if (c < 128)
3021 {
3022 ch = MIN (127, c1);
3023 SETUP_ASCII_RANGE (range_table_work, c, ch);
3024 c = ch + 1;
3025 if (CHAR_BYTE8_P (c1))
3026 c = BYTE8_TO_CHAR (128);
3027 }
3028 if (c <= c1)
3029 {
3030 if (CHAR_BYTE8_P (c))
3031 {
3032 c = CHAR_TO_BYTE8 (c);
3033 c1 = CHAR_TO_BYTE8 (c1);
3034 for (; c <= c1; c++)
3035 SET_LIST_BIT (c);
3036 }
3037 else if (multibyte)
3038 {
3039 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3040 }
3041 else
3042 {
3043 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3044 }
3045 }
3046 #endif /* emacs */
3047 }
3048 }
3049
3050 /* Discard any (non)matching list bytes that are all 0 at the
3051 end of the map. Decrease the map-length byte too. */
3052 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3053 b[-1]--;
3054 b += b[-1];
3055
3056 /* Build real range table from work area. */
3057 if (RANGE_TABLE_WORK_USED (range_table_work)
3058 || RANGE_TABLE_WORK_BITS (range_table_work))
3059 {
3060 int i;
3061 int used = RANGE_TABLE_WORK_USED (range_table_work);
3062
3063 /* Allocate space for COUNT + RANGE_TABLE. Needs two
3064 bytes for flags, two for COUNT, and three bytes for
3065 each character. */
3066 GET_BUFFER_SPACE (4 + used * 3);
3067
3068 /* Indicate the existence of range table. */
3069 laststart[1] |= 0x80;
3070
3071 /* Store the character class flag bits into the range table.
3072 If not in emacs, these flag bits are always 0. */
3073 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3074 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3075
3076 STORE_NUMBER_AND_INCR (b, used / 2);
3077 for (i = 0; i < used; i++)
3078 STORE_CHARACTER_AND_INCR
3079 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3080 }
3081 }
3082 break;
3083
3084
3085 case '(':
3086 if (syntax & RE_NO_BK_PARENS)
3087 goto handle_open;
3088 else
3089 goto normal_char;
3090
3091
3092 case ')':
3093 if (syntax & RE_NO_BK_PARENS)
3094 goto handle_close;
3095 else
3096 goto normal_char;
3097
3098
3099 case '\n':
3100 if (syntax & RE_NEWLINE_ALT)
3101 goto handle_alt;
3102 else
3103 goto normal_char;
3104
3105
3106 case '|':
3107 if (syntax & RE_NO_BK_VBAR)
3108 goto handle_alt;
3109 else
3110 goto normal_char;
3111
3112
3113 case '{':
3114 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3115 goto handle_interval;
3116 else
3117 goto normal_char;
3118
3119
3120 case '\\':
3121 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3122
3123 /* Do not translate the character after the \, so that we can
3124 distinguish, e.g., \B from \b, even if we normally would
3125 translate, e.g., B to b. */
3126 PATFETCH (c);
3127
3128 switch (c)
3129 {
3130 case '(':
3131 if (syntax & RE_NO_BK_PARENS)
3132 goto normal_backslash;
3133
3134 handle_open:
3135 {
3136 int shy = 0;
3137 regnum_t regnum = 0;
3138 if (p+1 < pend)
3139 {
3140 /* Look for a special (?...) construct */
3141 if ((syntax & RE_SHY_GROUPS) && *p == '?')
3142 {
3143 PATFETCH (c); /* Gobble up the '?'. */
3144 while (!shy)
3145 {
3146 PATFETCH (c);
3147 switch (c)
3148 {
3149 case ':': shy = 1; break;
3150 case '0':
3151 /* An explicitly specified regnum must start
3152 with non-0. */
3153 if (regnum == 0)
3154 FREE_STACK_RETURN (REG_BADPAT);
3155 case '1': case '2': case '3': case '4':
3156 case '5': case '6': case '7': case '8': case '9':
3157 regnum = 10*regnum + (c - '0'); break;
3158 default:
3159 /* Only (?:...) is supported right now. */
3160 FREE_STACK_RETURN (REG_BADPAT);
3161 }
3162 }
3163 }
3164 }
3165
3166 if (!shy)
3167 regnum = ++bufp->re_nsub;
3168 else if (regnum)
3169 { /* It's actually not shy, but explicitly numbered. */
3170 shy = 0;
3171 if (regnum > bufp->re_nsub)
3172 bufp->re_nsub = regnum;
3173 else if (regnum > bufp->re_nsub
3174 /* Ideally, we'd want to check that the specified
3175 group can't have matched (i.e. all subgroups
3176 using the same regnum are in other branches of
3177 OR patterns), but we don't currently keep track
3178 of enough info to do that easily. */
3179 || group_in_compile_stack (compile_stack, regnum))
3180 FREE_STACK_RETURN (REG_BADPAT);
3181 }
3182 else
3183 /* It's really shy. */
3184 regnum = - bufp->re_nsub;
3185
3186 if (COMPILE_STACK_FULL)
3187 {
3188 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3189 compile_stack_elt_t);
3190 if (compile_stack.stack == NULL) return REG_ESPACE;
3191
3192 compile_stack.size <<= 1;
3193 }
3194
3195 /* These are the values to restore when we hit end of this
3196 group. They are all relative offsets, so that if the
3197 whole pattern moves because of realloc, they will still
3198 be valid. */
3199 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3200 COMPILE_STACK_TOP.fixup_alt_jump
3201 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3202 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3203 COMPILE_STACK_TOP.regnum = regnum;
3204
3205 /* Do not push a start_memory for groups beyond the last one
3206 we can represent in the compiled pattern. */
3207 if (regnum <= MAX_REGNUM && regnum > 0)
3208 BUF_PUSH_2 (start_memory, regnum);
3209
3210 compile_stack.avail++;
3211
3212 fixup_alt_jump = 0;
3213 laststart = 0;
3214 begalt = b;
3215 /* If we've reached MAX_REGNUM groups, then this open
3216 won't actually generate any code, so we'll have to
3217 clear pending_exact explicitly. */
3218 pending_exact = 0;
3219 break;
3220 }
3221
3222 case ')':
3223 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3224
3225 if (COMPILE_STACK_EMPTY)
3226 {
3227 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3228 goto normal_backslash;
3229 else
3230 FREE_STACK_RETURN (REG_ERPAREN);
3231 }
3232
3233 handle_close:
3234 FIXUP_ALT_JUMP ();
3235
3236 /* See similar code for backslashed left paren above. */
3237 if (COMPILE_STACK_EMPTY)
3238 {
3239 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3240 goto normal_char;
3241 else
3242 FREE_STACK_RETURN (REG_ERPAREN);
3243 }
3244
3245 /* Since we just checked for an empty stack above, this
3246 ``can't happen''. */
3247 assert (compile_stack.avail != 0);
3248 {
3249 /* We don't just want to restore into `regnum', because
3250 later groups should continue to be numbered higher,
3251 as in `(ab)c(de)' -- the second group is #2. */
3252 regnum_t regnum;
3253
3254 compile_stack.avail--;
3255 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3256 fixup_alt_jump
3257 = COMPILE_STACK_TOP.fixup_alt_jump
3258 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3259 : 0;
3260 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3261 regnum = COMPILE_STACK_TOP.regnum;
3262 /* If we've reached MAX_REGNUM groups, then this open
3263 won't actually generate any code, so we'll have to
3264 clear pending_exact explicitly. */
3265 pending_exact = 0;
3266
3267 /* We're at the end of the group, so now we know how many
3268 groups were inside this one. */
3269 if (regnum <= MAX_REGNUM && regnum > 0)
3270 BUF_PUSH_2 (stop_memory, regnum);
3271 }
3272 break;
3273
3274
3275 case '|': /* `\|'. */
3276 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3277 goto normal_backslash;
3278 handle_alt:
3279 if (syntax & RE_LIMITED_OPS)
3280 goto normal_char;
3281
3282 /* Insert before the previous alternative a jump which
3283 jumps to this alternative if the former fails. */
3284 GET_BUFFER_SPACE (3);
3285 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3286 pending_exact = 0;
3287 b += 3;
3288
3289 /* The alternative before this one has a jump after it
3290 which gets executed if it gets matched. Adjust that
3291 jump so it will jump to this alternative's analogous
3292 jump (put in below, which in turn will jump to the next
3293 (if any) alternative's such jump, etc.). The last such
3294 jump jumps to the correct final destination. A picture:
3295 _____ _____
3296 | | | |
3297 | v | v
3298 a | b | c
3299
3300 If we are at `b', then fixup_alt_jump right now points to a
3301 three-byte space after `a'. We'll put in the jump, set
3302 fixup_alt_jump to right after `b', and leave behind three
3303 bytes which we'll fill in when we get to after `c'. */
3304
3305 FIXUP_ALT_JUMP ();
3306
3307 /* Mark and leave space for a jump after this alternative,
3308 to be filled in later either by next alternative or
3309 when know we're at the end of a series of alternatives. */
3310 fixup_alt_jump = b;
3311 GET_BUFFER_SPACE (3);
3312 b += 3;
3313
3314 laststart = 0;
3315 begalt = b;
3316 break;
3317
3318
3319 case '{':
3320 /* If \{ is a literal. */
3321 if (!(syntax & RE_INTERVALS)
3322 /* If we're at `\{' and it's not the open-interval
3323 operator. */
3324 || (syntax & RE_NO_BK_BRACES))
3325 goto normal_backslash;
3326
3327 handle_interval:
3328 {
3329 /* If got here, then the syntax allows intervals. */
3330
3331 /* At least (most) this many matches must be made. */
3332 int lower_bound = 0, upper_bound = -1;
3333
3334 beg_interval = p;
3335
3336 GET_UNSIGNED_NUMBER (lower_bound);
3337
3338 if (c == ',')
3339 GET_UNSIGNED_NUMBER (upper_bound);
3340 else
3341 /* Interval such as `{1}' => match exactly once. */
3342 upper_bound = lower_bound;
3343
3344 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
3345 || (upper_bound >= 0 && lower_bound > upper_bound))
3346 FREE_STACK_RETURN (REG_BADBR);
3347
3348 if (!(syntax & RE_NO_BK_BRACES))
3349 {
3350 if (c != '\\')
3351 FREE_STACK_RETURN (REG_BADBR);
3352 if (p == pend)
3353 FREE_STACK_RETURN (REG_EESCAPE);
3354 PATFETCH (c);
3355 }
3356
3357 if (c != '}')
3358 FREE_STACK_RETURN (REG_BADBR);
3359
3360 /* We just parsed a valid interval. */
3361
3362 /* If it's invalid to have no preceding re. */
3363 if (!laststart)
3364 {
3365 if (syntax & RE_CONTEXT_INVALID_OPS)
3366 FREE_STACK_RETURN (REG_BADRPT);
3367 else if (syntax & RE_CONTEXT_INDEP_OPS)
3368 laststart = b;
3369 else
3370 goto unfetch_interval;
3371 }
3372
3373 if (upper_bound == 0)
3374 /* If the upper bound is zero, just drop the sub pattern
3375 altogether. */
3376 b = laststart;
3377 else if (lower_bound == 1 && upper_bound == 1)
3378 /* Just match it once: nothing to do here. */
3379 ;
3380
3381 /* Otherwise, we have a nontrivial interval. When
3382 we're all done, the pattern will look like:
3383 set_number_at <jump count> <upper bound>
3384 set_number_at <succeed_n count> <lower bound>
3385 succeed_n <after jump addr> <succeed_n count>
3386 <body of loop>
3387 jump_n <succeed_n addr> <jump count>
3388 (The upper bound and `jump_n' are omitted if
3389 `upper_bound' is 1, though.) */
3390 else
3391 { /* If the upper bound is > 1, we need to insert
3392 more at the end of the loop. */
3393 unsigned int nbytes = (upper_bound < 0 ? 3
3394 : upper_bound > 1 ? 5 : 0);
3395 unsigned int startoffset = 0;
3396
3397 GET_BUFFER_SPACE (20); /* We might use less. */
3398
3399 if (lower_bound == 0)
3400 {
3401 /* A succeed_n that starts with 0 is really a
3402 a simple on_failure_jump_loop. */
3403 INSERT_JUMP (on_failure_jump_loop, laststart,
3404 b + 3 + nbytes);
3405 b += 3;
3406 }
3407 else
3408 {
3409 /* Initialize lower bound of the `succeed_n', even
3410 though it will be set during matching by its
3411 attendant `set_number_at' (inserted next),
3412 because `re_compile_fastmap' needs to know.
3413 Jump to the `jump_n' we might insert below. */
3414 INSERT_JUMP2 (succeed_n, laststart,
3415 b + 5 + nbytes,
3416 lower_bound);
3417 b += 5;
3418
3419 /* Code to initialize the lower bound. Insert
3420 before the `succeed_n'. The `5' is the last two
3421 bytes of this `set_number_at', plus 3 bytes of
3422 the following `succeed_n'. */
3423 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3424 b += 5;
3425 startoffset += 5;
3426 }
3427
3428 if (upper_bound < 0)
3429 {
3430 /* A negative upper bound stands for infinity,
3431 in which case it degenerates to a plain jump. */
3432 STORE_JUMP (jump, b, laststart + startoffset);
3433 b += 3;
3434 }
3435 else if (upper_bound > 1)
3436 { /* More than one repetition is allowed, so
3437 append a backward jump to the `succeed_n'
3438 that starts this interval.
3439
3440 When we've reached this during matching,
3441 we'll have matched the interval once, so
3442 jump back only `upper_bound - 1' times. */
3443 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3444 upper_bound - 1);
3445 b += 5;
3446
3447 /* The location we want to set is the second
3448 parameter of the `jump_n'; that is `b-2' as
3449 an absolute address. `laststart' will be
3450 the `set_number_at' we're about to insert;
3451 `laststart+3' the number to set, the source
3452 for the relative address. But we are
3453 inserting into the middle of the pattern --
3454 so everything is getting moved up by 5.
3455 Conclusion: (b - 2) - (laststart + 3) + 5,
3456 i.e., b - laststart.
3457
3458 We insert this at the beginning of the loop
3459 so that if we fail during matching, we'll
3460 reinitialize the bounds. */
3461 insert_op2 (set_number_at, laststart, b - laststart,
3462 upper_bound - 1, b);
3463 b += 5;
3464 }
3465 }
3466 pending_exact = 0;
3467 beg_interval = NULL;
3468 }
3469 break;
3470
3471 unfetch_interval:
3472 /* If an invalid interval, match the characters as literals. */
3473 assert (beg_interval);
3474 p = beg_interval;
3475 beg_interval = NULL;
3476
3477 /* normal_char and normal_backslash need `c'. */
3478 c = '{';
3479
3480 if (!(syntax & RE_NO_BK_BRACES))
3481 {
3482 assert (p > pattern && p[-1] == '\\');
3483 goto normal_backslash;
3484 }
3485 else
3486 goto normal_char;
3487
3488 #ifdef emacs
3489 /* There is no way to specify the before_dot and after_dot
3490 operators. rms says this is ok. --karl */
3491 case '=':
3492 BUF_PUSH (at_dot);
3493 break;
3494
3495 case 's':
3496 laststart = b;
3497 PATFETCH (c);
3498 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3499 break;
3500
3501 case 'S':
3502 laststart = b;
3503 PATFETCH (c);
3504 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3505 break;
3506
3507 case 'c':
3508 laststart = b;
3509 PATFETCH (c);
3510 BUF_PUSH_2 (categoryspec, c);
3511 break;
3512
3513 case 'C':
3514 laststart = b;
3515 PATFETCH (c);
3516 BUF_PUSH_2 (notcategoryspec, c);
3517 break;
3518 #endif /* emacs */
3519
3520
3521 case 'w':
3522 if (syntax & RE_NO_GNU_OPS)
3523 goto normal_char;
3524 laststart = b;
3525 BUF_PUSH_2 (syntaxspec, Sword);
3526 break;
3527
3528
3529 case 'W':
3530 if (syntax & RE_NO_GNU_OPS)
3531 goto normal_char;
3532 laststart = b;
3533 BUF_PUSH_2 (notsyntaxspec, Sword);
3534 break;
3535
3536
3537 case '<':
3538 if (syntax & RE_NO_GNU_OPS)
3539 goto normal_char;
3540 BUF_PUSH (wordbeg);
3541 break;
3542
3543 case '>':
3544 if (syntax & RE_NO_GNU_OPS)
3545 goto normal_char;
3546 BUF_PUSH (wordend);
3547 break;
3548
3549 case '_':
3550 if (syntax & RE_NO_GNU_OPS)
3551 goto normal_char;
3552 laststart = b;
3553 PATFETCH (c);
3554 if (c == '<')
3555 BUF_PUSH (symbeg);
3556 else if (c == '>')
3557 BUF_PUSH (symend);
3558 else
3559 FREE_STACK_RETURN (REG_BADPAT);
3560 break;
3561
3562 case 'b':
3563 if (syntax & RE_NO_GNU_OPS)
3564 goto normal_char;
3565 BUF_PUSH (wordbound);
3566 break;
3567
3568 case 'B':
3569 if (syntax & RE_NO_GNU_OPS)
3570 goto normal_char;
3571 BUF_PUSH (notwordbound);
3572 break;
3573
3574 case '`':
3575 if (syntax & RE_NO_GNU_OPS)
3576 goto normal_char;
3577 BUF_PUSH (begbuf);
3578 break;
3579
3580 case '\'':
3581 if (syntax & RE_NO_GNU_OPS)
3582 goto normal_char;
3583 BUF_PUSH (endbuf);
3584 break;
3585
3586 case '1': case '2': case '3': case '4': case '5':
3587 case '6': case '7': case '8': case '9':
3588 {
3589 regnum_t reg;
3590
3591 if (syntax & RE_NO_BK_REFS)
3592 goto normal_backslash;
3593
3594 reg = c - '0';
3595
3596 if (reg > bufp->re_nsub || reg < 1
3597 /* Can't back reference to a subexp before its end. */
3598 || group_in_compile_stack (compile_stack, reg))
3599 FREE_STACK_RETURN (REG_ESUBREG);
3600
3601 laststart = b;
3602 BUF_PUSH_2 (duplicate, reg);
3603 }
3604 break;
3605
3606
3607 case '+':
3608 case '?':
3609 if (syntax & RE_BK_PLUS_QM)
3610 goto handle_plus;
3611 else
3612 goto normal_backslash;
3613
3614 default:
3615 normal_backslash:
3616 /* You might think it would be useful for \ to mean
3617 not to translate; but if we don't translate it
3618 it will never match anything. */
3619 goto normal_char;
3620 }
3621 break;
3622
3623
3624 default:
3625 /* Expects the character in `c'. */
3626 normal_char:
3627 /* If no exactn currently being built. */
3628 if (!pending_exact
3629
3630 /* If last exactn not at current position. */
3631 || pending_exact + *pending_exact + 1 != b
3632
3633 /* We have only one byte following the exactn for the count. */
3634 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
3635
3636 /* If followed by a repetition operator. */
3637 || (p != pend && (*p == '*' || *p == '^'))
3638 || ((syntax & RE_BK_PLUS_QM)
3639 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3640 : p != pend && (*p == '+' || *p == '?'))
3641 || ((syntax & RE_INTERVALS)
3642 && ((syntax & RE_NO_BK_BRACES)
3643 ? p != pend && *p == '{'
3644 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3645 {
3646 /* Start building a new exactn. */
3647
3648 laststart = b;
3649
3650 BUF_PUSH_2 (exactn, 0);
3651 pending_exact = b - 1;
3652 }
3653
3654 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3655 {
3656 int len;
3657
3658 if (multibyte)
3659 {
3660 c = TRANSLATE (c);
3661 len = CHAR_STRING (c, b);
3662 b += len;
3663 }
3664 else
3665 {
3666 c1 = RE_CHAR_TO_MULTIBYTE (c);
3667 if (! CHAR_BYTE8_P (c1))
3668 {
3669 re_wchar_t c2 = TRANSLATE (c1);
3670
3671 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3672 c = c1;
3673 }
3674 *b++ = c;
3675 len = 1;
3676 }
3677 (*pending_exact) += len;
3678 }
3679
3680 break;
3681 } /* switch (c) */
3682 } /* while p != pend */
3683
3684
3685 /* Through the pattern now. */
3686
3687 FIXUP_ALT_JUMP ();
3688
3689 if (!COMPILE_STACK_EMPTY)
3690 FREE_STACK_RETURN (REG_EPAREN);
3691
3692 /* If we don't want backtracking, force success
3693 the first time we reach the end of the compiled pattern. */
3694 if (syntax & RE_NO_POSIX_BACKTRACKING)
3695 BUF_PUSH (succeed);
3696
3697 /* We have succeeded; set the length of the buffer. */
3698 bufp->used = b - bufp->buffer;
3699
3700 #ifdef DEBUG
3701 if (debug > 0)
3702 {
3703 re_compile_fastmap (bufp);
3704 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3705 print_compiled_pattern (bufp);
3706 }
3707 debug--;
3708 #endif /* DEBUG */
3709
3710 #ifndef MATCH_MAY_ALLOCATE
3711 /* Initialize the failure stack to the largest possible stack. This
3712 isn't necessary unless we're trying to avoid calling alloca in
3713 the search and match routines. */
3714 {
3715 int num_regs = bufp->re_nsub + 1;
3716
3717 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
3718 {
3719 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
3720 falk_stack.stack = realloc (fail_stack.stack,
3721 fail_stack.size * sizeof *falk_stack.stack);
3722 }
3723
3724 regex_grow_registers (num_regs);
3725 }
3726 #endif /* not MATCH_MAY_ALLOCATE */
3727
3728 FREE_STACK_RETURN (REG_NOERROR);
3729 } /* regex_compile */
3730 \f
3731 /* Subroutines for `regex_compile'. */
3732
3733 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3734
3735 static void
3736 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3737 {
3738 *loc = (unsigned char) op;
3739 STORE_NUMBER (loc + 1, arg);
3740 }
3741
3742
3743 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3744
3745 static void
3746 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3747 {
3748 *loc = (unsigned char) op;
3749 STORE_NUMBER (loc + 1, arg1);
3750 STORE_NUMBER (loc + 3, arg2);
3751 }
3752
3753
3754 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3755 for OP followed by two-byte integer parameter ARG. */
3756
3757 static void
3758 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3759 {
3760 register unsigned char *pfrom = end;
3761 register unsigned char *pto = end + 3;
3762
3763 while (pfrom != loc)
3764 *--pto = *--pfrom;
3765
3766 store_op1 (op, loc, arg);
3767 }
3768
3769
3770 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3771
3772 static void
3773 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
3774 {
3775 register unsigned char *pfrom = end;
3776 register unsigned char *pto = end + 5;
3777
3778 while (pfrom != loc)
3779 *--pto = *--pfrom;
3780
3781 store_op2 (op, loc, arg1, arg2);
3782 }
3783
3784
3785 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3786 after an alternative or a begin-subexpression. We assume there is at
3787 least one character before the ^. */
3788
3789 static boolean
3790 at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
3791 {
3792 re_char *prev = p - 2;
3793 boolean odd_backslashes;
3794
3795 /* After a subexpression? */
3796 if (*prev == '(')
3797 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3798
3799 /* After an alternative? */
3800 else if (*prev == '|')
3801 odd_backslashes = (syntax & RE_NO_BK_VBAR) == 0;
3802
3803 /* After a shy subexpression? */
3804 else if (*prev == ':' && (syntax & RE_SHY_GROUPS))
3805 {
3806 /* Skip over optional regnum. */
3807 while (prev - 1 >= pattern && prev[-1] >= '0' && prev[-1] <= '9')
3808 --prev;
3809
3810 if (!(prev - 2 >= pattern
3811 && prev[-1] == '?' && prev[-2] == '('))
3812 return false;
3813 prev -= 2;
3814 odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3815 }
3816 else
3817 return false;
3818
3819 /* Count the number of preceding backslashes. */
3820 p = prev;
3821 while (prev - 1 >= pattern && prev[-1] == '\\')
3822 --prev;
3823 return (p - prev) & odd_backslashes;
3824 }
3825
3826
3827 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3828 at least one character after the $, i.e., `P < PEND'. */
3829
3830 static boolean
3831 at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
3832 {
3833 re_char *next = p;
3834 boolean next_backslash = *next == '\\';
3835 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3836
3837 return
3838 /* Before a subexpression? */
3839 (syntax & RE_NO_BK_PARENS ? *next == ')'
3840 : next_backslash && next_next && *next_next == ')')
3841 /* Before an alternative? */
3842 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3843 : next_backslash && next_next && *next_next == '|');
3844 }
3845
3846
3847 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3848 false if it's not. */
3849
3850 static boolean
3851 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3852 {
3853 ssize_t this_element;
3854
3855 for (this_element = compile_stack.avail - 1;
3856 this_element >= 0;
3857 this_element--)
3858 if (compile_stack.stack[this_element].regnum == regnum)
3859 return true;
3860
3861 return false;
3862 }
3863 \f
3864 /* analyse_first.
3865 If fastmap is non-NULL, go through the pattern and fill fastmap
3866 with all the possible leading chars. If fastmap is NULL, don't
3867 bother filling it up (obviously) and only return whether the
3868 pattern could potentially match the empty string.
3869
3870 Return 1 if p..pend might match the empty string.
3871 Return 0 if p..pend matches at least one char.
3872 Return -1 if fastmap was not updated accurately. */
3873
3874 static int
3875 analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
3876 {
3877 int j, k;
3878 boolean not;
3879
3880 /* If all elements for base leading-codes in fastmap is set, this
3881 flag is set true. */
3882 boolean match_any_multibyte_characters = false;
3883
3884 assert (p);
3885
3886 /* The loop below works as follows:
3887 - It has a working-list kept in the PATTERN_STACK and which basically
3888 starts by only containing a pointer to the first operation.
3889 - If the opcode we're looking at is a match against some set of
3890 chars, then we add those chars to the fastmap and go on to the
3891 next work element from the worklist (done via `break').
3892 - If the opcode is a control operator on the other hand, we either
3893 ignore it (if it's meaningless at this point, such as `start_memory')
3894 or execute it (if it's a jump). If the jump has several destinations
3895 (i.e. `on_failure_jump'), then we push the other destination onto the
3896 worklist.
3897 We guarantee termination by ignoring backward jumps (more or less),
3898 so that `p' is monotonically increasing. More to the point, we
3899 never set `p' (or push) anything `<= p1'. */
3900
3901 while (p < pend)
3902 {
3903 /* `p1' is used as a marker of how far back a `on_failure_jump'
3904 can go without being ignored. It is normally equal to `p'
3905 (which prevents any backward `on_failure_jump') except right
3906 after a plain `jump', to allow patterns such as:
3907 0: jump 10
3908 3..9: <body>
3909 10: on_failure_jump 3
3910 as used for the *? operator. */
3911 re_char *p1 = p;
3912
3913 switch (*p++)
3914 {
3915 case succeed:
3916 return 1;
3917
3918 case duplicate:
3919 /* If the first character has to match a backreference, that means
3920 that the group was empty (since it already matched). Since this
3921 is the only case that interests us here, we can assume that the
3922 backreference must match the empty string. */
3923 p++;
3924 continue;
3925
3926
3927 /* Following are the cases which match a character. These end
3928 with `break'. */
3929
3930 case exactn:
3931 if (fastmap)
3932 {
3933 /* If multibyte is nonzero, the first byte of each
3934 character is an ASCII or a leading code. Otherwise,
3935 each byte is a character. Thus, this works in both
3936 cases. */
3937 fastmap[p[1]] = 1;
3938 if (! multibyte)
3939 {
3940 /* For the case of matching this unibyte regex
3941 against multibyte, we must set a leading code of
3942 the corresponding multibyte character. */
3943 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3944
3945 fastmap[CHAR_LEADING_CODE (c)] = 1;
3946 }
3947 }
3948 break;
3949
3950
3951 case anychar:
3952 /* We could put all the chars except for \n (and maybe \0)
3953 but we don't bother since it is generally not worth it. */
3954 if (!fastmap) break;
3955 return -1;
3956
3957
3958 case charset_not:
3959 if (!fastmap) break;
3960 {
3961 /* Chars beyond end of bitmap are possible matches. */
3962 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3963 j < (1 << BYTEWIDTH); j++)
3964 fastmap[j] = 1;
3965 }
3966
3967 /* Fallthrough */
3968 case charset:
3969 if (!fastmap) break;
3970 not = (re_opcode_t) *(p - 1) == charset_not;
3971 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3972 j >= 0; j--)
3973 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
3974 fastmap[j] = 1;
3975
3976 #ifdef emacs
3977 if (/* Any leading code can possibly start a character
3978 which doesn't match the specified set of characters. */
3979 not
3980 ||
3981 /* If we can match a character class, we can match any
3982 multibyte characters. */
3983 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3984 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3985
3986 {
3987 if (match_any_multibyte_characters == false)
3988 {
3989 for (j = MIN_MULTIBYTE_LEADING_CODE;
3990 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
3991 fastmap[j] = 1;
3992 match_any_multibyte_characters = true;
3993 }
3994 }
3995
3996 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3997 && match_any_multibyte_characters == false)
3998 {
3999 /* Set fastmap[I] to 1 where I is a leading code of each
4000 multibyte character in the range table. */
4001 int c, count;
4002 unsigned char lc1, lc2;
4003
4004 /* Make P points the range table. `+ 2' is to skip flag
4005 bits for a character class. */
4006 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
4007
4008 /* Extract the number of ranges in range table into COUNT. */
4009 EXTRACT_NUMBER_AND_INCR (count, p);
4010 for (; count > 0; count--, p += 3)
4011 {
4012 /* Extract the start and end of each range. */
4013 EXTRACT_CHARACTER (c, p);
4014 lc1 = CHAR_LEADING_CODE (c);
4015 p += 3;
4016 EXTRACT_CHARACTER (c, p);
4017 lc2 = CHAR_LEADING_CODE (c);
4018 for (j = lc1; j <= lc2; j++)
4019 fastmap[j] = 1;
4020 }
4021 }
4022 #endif
4023 break;
4024
4025 case syntaxspec:
4026 case notsyntaxspec:
4027 if (!fastmap) break;
4028 #ifndef emacs
4029 not = (re_opcode_t)p[-1] == notsyntaxspec;
4030 k = *p++;
4031 for (j = 0; j < (1 << BYTEWIDTH); j++)
4032 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
4033 fastmap[j] = 1;
4034 break;
4035 #else /* emacs */
4036 /* This match depends on text properties. These end with
4037 aborting optimizations. */
4038 return -1;
4039
4040 case categoryspec:
4041 case notcategoryspec:
4042 if (!fastmap) break;
4043 not = (re_opcode_t)p[-1] == notcategoryspec;
4044 k = *p++;
4045 for (j = (1 << BYTEWIDTH); j >= 0; j--)
4046 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
4047 fastmap[j] = 1;
4048
4049 /* Any leading code can possibly start a character which
4050 has or doesn't has the specified category. */
4051 if (match_any_multibyte_characters == false)
4052 {
4053 for (j = MIN_MULTIBYTE_LEADING_CODE;
4054 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4055 fastmap[j] = 1;
4056 match_any_multibyte_characters = true;
4057 }
4058 break;
4059
4060 /* All cases after this match the empty string. These end with
4061 `continue'. */
4062
4063 case before_dot:
4064 case at_dot:
4065 case after_dot:
4066 #endif /* !emacs */
4067 case no_op:
4068 case begline:
4069 case endline:
4070 case begbuf:
4071 case endbuf:
4072 case wordbound:
4073 case notwordbound:
4074 case wordbeg:
4075 case wordend:
4076 case symbeg:
4077 case symend:
4078 continue;
4079
4080
4081 case jump:
4082 EXTRACT_NUMBER_AND_INCR (j, p);
4083 if (j < 0)
4084 /* Backward jumps can only go back to code that we've already
4085 visited. `re_compile' should make sure this is true. */
4086 break;
4087 p += j;
4088 switch (*p)
4089 {
4090 case on_failure_jump:
4091 case on_failure_keep_string_jump:
4092 case on_failure_jump_loop:
4093 case on_failure_jump_nastyloop:
4094 case on_failure_jump_smart:
4095 p++;
4096 break;
4097 default:
4098 continue;
4099 };
4100 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4101 to jump back to "just after here". */
4102 /* Fallthrough */
4103
4104 case on_failure_jump:
4105 case on_failure_keep_string_jump:
4106 case on_failure_jump_nastyloop:
4107 case on_failure_jump_loop:
4108 case on_failure_jump_smart:
4109 EXTRACT_NUMBER_AND_INCR (j, p);
4110 if (p + j <= p1)
4111 ; /* Backward jump to be ignored. */
4112 else
4113 { /* We have to look down both arms.
4114 We first go down the "straight" path so as to minimize
4115 stack usage when going through alternatives. */
4116 int r = analyse_first (p, pend, fastmap, multibyte);
4117 if (r) return r;
4118 p += j;
4119 }
4120 continue;
4121
4122
4123 case jump_n:
4124 /* This code simply does not properly handle forward jump_n. */
4125 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4126 p += 4;
4127 /* jump_n can either jump or fall through. The (backward) jump
4128 case has already been handled, so we only need to look at the
4129 fallthrough case. */
4130 continue;
4131
4132 case succeed_n:
4133 /* If N == 0, it should be an on_failure_jump_loop instead. */
4134 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4135 p += 4;
4136 /* We only care about one iteration of the loop, so we don't
4137 need to consider the case where this behaves like an
4138 on_failure_jump. */
4139 continue;
4140
4141
4142 case set_number_at:
4143 p += 4;
4144 continue;
4145
4146
4147 case start_memory:
4148 case stop_memory:
4149 p += 1;
4150 continue;
4151
4152
4153 default:
4154 abort (); /* We have listed all the cases. */
4155 } /* switch *p++ */
4156
4157 /* Getting here means we have found the possible starting
4158 characters for one path of the pattern -- and that the empty
4159 string does not match. We need not follow this path further. */
4160 return 0;
4161 } /* while p */
4162
4163 /* We reached the end without matching anything. */
4164 return 1;
4165
4166 } /* analyse_first */
4167 \f
4168 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4169 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4170 characters can start a string that matches the pattern. This fastmap
4171 is used by re_search to skip quickly over impossible starting points.
4172
4173 Character codes above (1 << BYTEWIDTH) are not represented in the
4174 fastmap, but the leading codes are represented. Thus, the fastmap
4175 indicates which character sets could start a match.
4176
4177 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4178 area as BUFP->fastmap.
4179
4180 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4181 the pattern buffer.
4182
4183 Returns 0 if we succeed, -2 if an internal error. */
4184
4185 int
4186 re_compile_fastmap (struct re_pattern_buffer *bufp)
4187 {
4188 char *fastmap = bufp->fastmap;
4189 int analysis;
4190
4191 assert (fastmap && bufp->buffer);
4192
4193 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4194 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4195
4196 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
4197 fastmap, RE_MULTIBYTE_P (bufp));
4198 bufp->can_be_null = (analysis != 0);
4199 return 0;
4200 } /* re_compile_fastmap */
4201 \f
4202 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4203 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4204 this memory for recording register information. STARTS and ENDS
4205 must be allocated using the malloc library routine, and must each
4206 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4207
4208 If NUM_REGS == 0, then subsequent matches should allocate their own
4209 register data.
4210
4211 Unless this function is called, the first search or match using
4212 PATTERN_BUFFER will allocate its own register data, without
4213 freeing the old data. */
4214
4215 void
4216 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
4217 {
4218 if (num_regs)
4219 {
4220 bufp->regs_allocated = REGS_REALLOCATE;
4221 regs->num_regs = num_regs;
4222 regs->start = starts;
4223 regs->end = ends;
4224 }
4225 else
4226 {
4227 bufp->regs_allocated = REGS_UNALLOCATED;
4228 regs->num_regs = 0;
4229 regs->start = regs->end = (regoff_t *) 0;
4230 }
4231 }
4232 WEAK_ALIAS (__re_set_registers, re_set_registers)
4233 \f
4234 /* Searching routines. */
4235
4236 /* Like re_search_2, below, but only one string is specified, and
4237 doesn't let you say where to stop matching. */
4238
4239 regoff_t
4240 re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4241 ssize_t startpos, ssize_t range, struct re_registers *regs)
4242 {
4243 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
4244 regs, size);
4245 }
4246 WEAK_ALIAS (__re_search, re_search)
4247
4248 /* Head address of virtual concatenation of string. */
4249 #define HEAD_ADDR_VSTRING(P) \
4250 (((P) >= size1 ? string2 : string1))
4251
4252 /* Address of POS in the concatenation of virtual string. */
4253 #define POS_ADDR_VSTRING(POS) \
4254 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
4255
4256 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4257 virtual concatenation of STRING1 and STRING2, starting first at index
4258 STARTPOS, then at STARTPOS + 1, and so on.
4259
4260 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4261
4262 RANGE is how far to scan while trying to match. RANGE = 0 means try
4263 only at STARTPOS; in general, the last start tried is STARTPOS +
4264 RANGE.
4265
4266 In REGS, return the indices of the virtual concatenation of STRING1
4267 and STRING2 that matched the entire BUFP->buffer and its contained
4268 subexpressions.
4269
4270 Do not consider matching one past the index STOP in the virtual
4271 concatenation of STRING1 and STRING2.
4272
4273 We return either the position in the strings at which the match was
4274 found, -1 if no match, or -2 if error (such as failure
4275 stack overflow). */
4276
4277 regoff_t
4278 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4279 const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4280 struct re_registers *regs, ssize_t stop)
4281 {
4282 regoff_t val;
4283 re_char *string1 = (re_char*) str1;
4284 re_char *string2 = (re_char*) str2;
4285 register char *fastmap = bufp->fastmap;
4286 register RE_TRANSLATE_TYPE translate = bufp->translate;
4287 size_t total_size = size1 + size2;
4288 ssize_t endpos = startpos + range;
4289 boolean anchored_start;
4290 /* Nonzero if we are searching multibyte string. */
4291 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4292
4293 /* Check for out-of-range STARTPOS. */
4294 if (startpos < 0 || startpos > total_size)
4295 return -1;
4296
4297 /* Fix up RANGE if it might eventually take us outside
4298 the virtual concatenation of STRING1 and STRING2.
4299 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
4300 if (endpos < 0)
4301 range = 0 - startpos;
4302 else if (endpos > total_size)
4303 range = total_size - startpos;
4304
4305 /* If the search isn't to be a backwards one, don't waste time in a
4306 search for a pattern anchored at beginning of buffer. */
4307 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4308 {
4309 if (startpos > 0)
4310 return -1;
4311 else
4312 range = 0;
4313 }
4314
4315 #ifdef emacs
4316 /* In a forward search for something that starts with \=.
4317 don't keep searching past point. */
4318 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4319 {
4320 range = PT_BYTE - BEGV_BYTE - startpos;
4321 if (range < 0)
4322 return -1;
4323 }
4324 #endif /* emacs */
4325
4326 /* Update the fastmap now if not correct already. */
4327 if (fastmap && !bufp->fastmap_accurate)
4328 re_compile_fastmap (bufp);
4329
4330 /* See whether the pattern is anchored. */
4331 anchored_start = (bufp->buffer[0] == begline);
4332
4333 #ifdef emacs
4334 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4335 {
4336 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
4337
4338 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4339 }
4340 #endif
4341
4342 /* Loop through the string, looking for a place to start matching. */
4343 for (;;)
4344 {
4345 /* If the pattern is anchored,
4346 skip quickly past places we cannot match.
4347 We don't bother to treat startpos == 0 specially
4348 because that case doesn't repeat. */
4349 if (anchored_start && startpos > 0)
4350 {
4351 if (! ((startpos <= size1 ? string1[startpos - 1]
4352 : string2[startpos - size1 - 1])
4353 == '\n'))
4354 goto advance;
4355 }
4356
4357 /* If a fastmap is supplied, skip quickly over characters that
4358 cannot be the start of a match. If the pattern can match the
4359 null string, however, we don't need to skip characters; we want
4360 the first null string. */
4361 if (fastmap && startpos < total_size && !bufp->can_be_null)
4362 {
4363 register re_char *d;
4364 register re_wchar_t buf_ch;
4365
4366 d = POS_ADDR_VSTRING (startpos);
4367
4368 if (range > 0) /* Searching forwards. */
4369 {
4370 register int lim = 0;
4371 ssize_t irange = range;
4372
4373 if (startpos < size1 && startpos + range >= size1)
4374 lim = range - (size1 - startpos);
4375
4376 /* Written out as an if-else to avoid testing `translate'
4377 inside the loop. */
4378 if (RE_TRANSLATE_P (translate))
4379 {
4380 if (multibyte)
4381 while (range > lim)
4382 {
4383 int buf_charlen;
4384
4385 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4386 buf_ch = RE_TRANSLATE (translate, buf_ch);
4387 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4388 break;
4389
4390 range -= buf_charlen;
4391 d += buf_charlen;
4392 }
4393 else
4394 while (range > lim)
4395 {
4396 register re_wchar_t ch, translated;
4397
4398 buf_ch = *d;
4399 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4400 translated = RE_TRANSLATE (translate, ch);
4401 if (translated != ch
4402 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4403 buf_ch = ch;
4404 if (fastmap[buf_ch])
4405 break;
4406 d++;
4407 range--;
4408 }
4409 }
4410 else
4411 {
4412 if (multibyte)
4413 while (range > lim)
4414 {
4415 int buf_charlen;
4416
4417 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4418 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4419 break;
4420 range -= buf_charlen;
4421 d += buf_charlen;
4422 }
4423 else
4424 while (range > lim && !fastmap[*d])
4425 {
4426 d++;
4427 range--;
4428 }
4429 }
4430 startpos += irange - range;
4431 }
4432 else /* Searching backwards. */
4433 {
4434 if (multibyte)
4435 {
4436 buf_ch = STRING_CHAR (d);
4437 buf_ch = TRANSLATE (buf_ch);
4438 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4439 goto advance;
4440 }
4441 else
4442 {
4443 register re_wchar_t ch, translated;
4444
4445 buf_ch = *d;
4446 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4447 translated = TRANSLATE (ch);
4448 if (translated != ch
4449 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4450 buf_ch = ch;
4451 if (! fastmap[TRANSLATE (buf_ch)])
4452 goto advance;
4453 }
4454 }
4455 }
4456
4457 /* If can't match the null string, and that's all we have left, fail. */
4458 if (range >= 0 && startpos == total_size && fastmap
4459 && !bufp->can_be_null)
4460 return -1;
4461
4462 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4463 startpos, regs, stop);
4464
4465 if (val >= 0)
4466 return startpos;
4467
4468 if (val == -2)
4469 return -2;
4470
4471 advance:
4472 if (!range)
4473 break;
4474 else if (range > 0)
4475 {
4476 /* Update STARTPOS to the next character boundary. */
4477 if (multibyte)
4478 {
4479 re_char *p = POS_ADDR_VSTRING (startpos);
4480 int len = BYTES_BY_CHAR_HEAD (*p);
4481
4482 range -= len;
4483 if (range < 0)
4484 break;
4485 startpos += len;
4486 }
4487 else
4488 {
4489 range--;
4490 startpos++;
4491 }
4492 }
4493 else
4494 {
4495 range++;
4496 startpos--;
4497
4498 /* Update STARTPOS to the previous character boundary. */
4499 if (multibyte)
4500 {
4501 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4502 re_char *p0 = p;
4503 re_char *phead = HEAD_ADDR_VSTRING (startpos);
4504
4505 /* Find the head of multibyte form. */
4506 PREV_CHAR_BOUNDARY (p, phead);
4507 range += p0 - 1 - p;
4508 if (range > 0)
4509 break;
4510
4511 startpos -= p0 - 1 - p;
4512 }
4513 }
4514 }
4515 return -1;
4516 } /* re_search_2 */
4517 WEAK_ALIAS (__re_search_2, re_search_2)
4518 \f
4519 /* Declarations and macros for re_match_2. */
4520
4521 static int bcmp_translate (re_char *s1, re_char *s2,
4522 register ssize_t len,
4523 RE_TRANSLATE_TYPE translate,
4524 const int multibyte);
4525
4526 /* This converts PTR, a pointer into one of the search strings `string1'
4527 and `string2' into an offset from the beginning of that string. */
4528 #define POINTER_TO_OFFSET(ptr) \
4529 (FIRST_STRING_P (ptr) \
4530 ? ((regoff_t) ((ptr) - string1)) \
4531 : ((regoff_t) ((ptr) - string2 + size1)))
4532
4533 /* Call before fetching a character with *d. This switches over to
4534 string2 if necessary.
4535 Check re_match_2_internal for a discussion of why end_match_2 might
4536 not be within string2 (but be equal to end_match_1 instead). */
4537 #define PREFETCH() \
4538 while (d == dend) \
4539 { \
4540 /* End of string2 => fail. */ \
4541 if (dend == end_match_2) \
4542 goto fail; \
4543 /* End of string1 => advance to string2. */ \
4544 d = string2; \
4545 dend = end_match_2; \
4546 }
4547
4548 /* Call before fetching a char with *d if you already checked other limits.
4549 This is meant for use in lookahead operations like wordend, etc..
4550 where we might need to look at parts of the string that might be
4551 outside of the LIMITs (i.e past `stop'). */
4552 #define PREFETCH_NOLIMIT() \
4553 if (d == end1) \
4554 { \
4555 d = string2; \
4556 dend = end_match_2; \
4557 } \
4558
4559 /* Test if at very beginning or at very end of the virtual concatenation
4560 of `string1' and `string2'. If only one string, it's `string2'. */
4561 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4562 #define AT_STRINGS_END(d) ((d) == end2)
4563
4564 /* Disabled due to a compiler bug -- see comment at case wordbound */
4565
4566 /* The comment at case wordbound is following one, but we don't use
4567 AT_WORD_BOUNDARY anymore to support multibyte form.
4568
4569 The DEC Alpha C compiler 3.x generates incorrect code for the
4570 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4571 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
4572 macro and introducing temporary variables works around the bug. */
4573
4574 #if 0
4575 /* Test if D points to a character which is word-constituent. We have
4576 two special cases to check for: if past the end of string1, look at
4577 the first character in string2; and if before the beginning of
4578 string2, look at the last character in string1. */
4579 #define WORDCHAR_P(d) \
4580 (SYNTAX ((d) == end1 ? *string2 \
4581 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4582 == Sword)
4583
4584 /* Test if the character before D and the one at D differ with respect
4585 to being word-constituent. */
4586 #define AT_WORD_BOUNDARY(d) \
4587 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4588 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4589 #endif
4590
4591 /* Free everything we malloc. */
4592 #ifdef MATCH_MAY_ALLOCATE
4593 # define FREE_VAR(var) \
4594 do { \
4595 if (var) \
4596 { \
4597 REGEX_FREE (var); \
4598 var = NULL; \
4599 } \
4600 } while (0)
4601 # define FREE_VARIABLES() \
4602 do { \
4603 REGEX_FREE_STACK (fail_stack.stack); \
4604 FREE_VAR (regstart); \
4605 FREE_VAR (regend); \
4606 FREE_VAR (best_regstart); \
4607 FREE_VAR (best_regend); \
4608 } while (0)
4609 #else
4610 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4611 #endif /* not MATCH_MAY_ALLOCATE */
4612
4613 \f
4614 /* Optimization routines. */
4615
4616 /* If the operation is a match against one or more chars,
4617 return a pointer to the next operation, else return NULL. */
4618 static re_char *
4619 skip_one_char (const re_char *p)
4620 {
4621 switch (*p++)
4622 {
4623 case anychar:
4624 break;
4625
4626 case exactn:
4627 p += *p + 1;
4628 break;
4629
4630 case charset_not:
4631 case charset:
4632 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4633 {
4634 int mcnt;
4635 p = CHARSET_RANGE_TABLE (p - 1);
4636 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4637 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4638 }
4639 else
4640 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4641 break;
4642
4643 case syntaxspec:
4644 case notsyntaxspec:
4645 #ifdef emacs
4646 case categoryspec:
4647 case notcategoryspec:
4648 #endif /* emacs */
4649 p++;
4650 break;
4651
4652 default:
4653 p = NULL;
4654 }
4655 return p;
4656 }
4657
4658
4659 /* Jump over non-matching operations. */
4660 static re_char *
4661 skip_noops (const re_char *p, const re_char *pend)
4662 {
4663 int mcnt;
4664 while (p < pend)
4665 {
4666 switch (*p)
4667 {
4668 case start_memory:
4669 case stop_memory:
4670 p += 2; break;
4671 case no_op:
4672 p += 1; break;
4673 case jump:
4674 p += 1;
4675 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4676 p += mcnt;
4677 break;
4678 default:
4679 return p;
4680 }
4681 }
4682 assert (p == pend);
4683 return p;
4684 }
4685
4686 /* Non-zero if "p1 matches something" implies "p2 fails". */
4687 static int
4688 mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
4689 {
4690 re_opcode_t op2;
4691 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4692 unsigned char *pend = bufp->buffer + bufp->used;
4693
4694 assert (p1 >= bufp->buffer && p1 < pend
4695 && p2 >= bufp->buffer && p2 <= pend);
4696
4697 /* Skip over open/close-group commands.
4698 If what follows this loop is a ...+ construct,
4699 look at what begins its body, since we will have to
4700 match at least one of that. */
4701 p2 = skip_noops (p2, pend);
4702 /* The same skip can be done for p1, except that this function
4703 is only used in the case where p1 is a simple match operator. */
4704 /* p1 = skip_noops (p1, pend); */
4705
4706 assert (p1 >= bufp->buffer && p1 < pend
4707 && p2 >= bufp->buffer && p2 <= pend);
4708
4709 op2 = p2 == pend ? succeed : *p2;
4710
4711 switch (op2)
4712 {
4713 case succeed:
4714 case endbuf:
4715 /* If we're at the end of the pattern, we can change. */
4716 if (skip_one_char (p1))
4717 {
4718 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4719 return 1;
4720 }
4721 break;
4722
4723 case endline:
4724 case exactn:
4725 {
4726 register re_wchar_t c
4727 = (re_opcode_t) *p2 == endline ? '\n'
4728 : RE_STRING_CHAR (p2 + 2, multibyte);
4729
4730 if ((re_opcode_t) *p1 == exactn)
4731 {
4732 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4733 {
4734 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4735 return 1;
4736 }
4737 }
4738
4739 else if ((re_opcode_t) *p1 == charset
4740 || (re_opcode_t) *p1 == charset_not)
4741 {
4742 int not = (re_opcode_t) *p1 == charset_not;
4743
4744 /* Test if C is listed in charset (or charset_not)
4745 at `p1'. */
4746 if (! multibyte || IS_REAL_ASCII (c))
4747 {
4748 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4749 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4750 not = !not;
4751 }
4752 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4753 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4754
4755 /* `not' is equal to 1 if c would match, which means
4756 that we can't change to pop_failure_jump. */
4757 if (!not)
4758 {
4759 DEBUG_PRINT1 (" No match => fast loop.\n");
4760 return 1;
4761 }
4762 }
4763 else if ((re_opcode_t) *p1 == anychar
4764 && c == '\n')
4765 {
4766 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4767 return 1;
4768 }
4769 }
4770 break;
4771
4772 case charset:
4773 {
4774 if ((re_opcode_t) *p1 == exactn)
4775 /* Reuse the code above. */
4776 return mutually_exclusive_p (bufp, p2, p1);
4777
4778 /* It is hard to list up all the character in charset
4779 P2 if it includes multibyte character. Give up in
4780 such case. */
4781 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4782 {
4783 /* Now, we are sure that P2 has no range table.
4784 So, for the size of bitmap in P2, `p2[1]' is
4785 enough. But P1 may have range table, so the
4786 size of bitmap table of P1 is extracted by
4787 using macro `CHARSET_BITMAP_SIZE'.
4788
4789 In a multibyte case, we know that all the character
4790 listed in P2 is ASCII. In a unibyte case, P1 has only a
4791 bitmap table. So, in both cases, it is enough to test
4792 only the bitmap table of P1. */
4793
4794 if ((re_opcode_t) *p1 == charset)
4795 {
4796 int idx;
4797 /* We win if the charset inside the loop
4798 has no overlap with the one after the loop. */
4799 for (idx = 0;
4800 (idx < (int) p2[1]
4801 && idx < CHARSET_BITMAP_SIZE (p1));
4802 idx++)
4803 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4804 break;
4805
4806 if (idx == p2[1]
4807 || idx == CHARSET_BITMAP_SIZE (p1))
4808 {
4809 DEBUG_PRINT1 (" No match => fast loop.\n");
4810 return 1;
4811 }
4812 }
4813 else if ((re_opcode_t) *p1 == charset_not)
4814 {
4815 int idx;
4816 /* We win if the charset_not inside the loop lists
4817 every character listed in the charset after. */
4818 for (idx = 0; idx < (int) p2[1]; idx++)
4819 if (! (p2[2 + idx] == 0
4820 || (idx < CHARSET_BITMAP_SIZE (p1)
4821 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4822 break;
4823
4824 if (idx == p2[1])
4825 {
4826 DEBUG_PRINT1 (" No match => fast loop.\n");
4827 return 1;
4828 }
4829 }
4830 }
4831 }
4832 break;
4833
4834 case charset_not:
4835 switch (*p1)
4836 {
4837 case exactn:
4838 case charset:
4839 /* Reuse the code above. */
4840 return mutually_exclusive_p (bufp, p2, p1);
4841 case charset_not:
4842 /* When we have two charset_not, it's very unlikely that
4843 they don't overlap. The union of the two sets of excluded
4844 chars should cover all possible chars, which, as a matter of
4845 fact, is virtually impossible in multibyte buffers. */
4846 break;
4847 }
4848 break;
4849
4850 case wordend:
4851 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4852 case symend:
4853 return ((re_opcode_t) *p1 == syntaxspec
4854 && (p1[1] == Ssymbol || p1[1] == Sword));
4855 case notsyntaxspec:
4856 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4857
4858 case wordbeg:
4859 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4860 case symbeg:
4861 return ((re_opcode_t) *p1 == notsyntaxspec
4862 && (p1[1] == Ssymbol || p1[1] == Sword));
4863 case syntaxspec:
4864 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4865
4866 case wordbound:
4867 return (((re_opcode_t) *p1 == notsyntaxspec
4868 || (re_opcode_t) *p1 == syntaxspec)
4869 && p1[1] == Sword);
4870
4871 #ifdef emacs
4872 case categoryspec:
4873 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4874 case notcategoryspec:
4875 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4876 #endif /* emacs */
4877
4878 default:
4879 ;
4880 }
4881
4882 /* Safe default. */
4883 return 0;
4884 }
4885
4886 \f
4887 /* Matching routines. */
4888
4889 #ifndef emacs /* Emacs never uses this. */
4890 /* re_match is like re_match_2 except it takes only a single string. */
4891
4892 regoff_t
4893 re_match (struct re_pattern_buffer *bufp, const char *string,
4894 size_t size, ssize_t pos, struct re_registers *regs)
4895 {
4896 regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4897 size, pos, regs, size);
4898 return result;
4899 }
4900 WEAK_ALIAS (__re_match, re_match)
4901 #endif /* not emacs */
4902
4903 #ifdef emacs
4904 /* In Emacs, this is the string or buffer in which we
4905 are matching. It is used for looking up syntax properties. */
4906 Lisp_Object re_match_object;
4907 #endif
4908
4909 /* re_match_2 matches the compiled pattern in BUFP against the
4910 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4911 and SIZE2, respectively). We start matching at POS, and stop
4912 matching at STOP.
4913
4914 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4915 store offsets for the substring each group matched in REGS. See the
4916 documentation for exactly how many groups we fill.
4917
4918 We return -1 if no match, -2 if an internal error (such as the
4919 failure stack overflowing). Otherwise, we return the length of the
4920 matched substring. */
4921
4922 regoff_t
4923 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4924 size_t size1, const char *string2, size_t size2, ssize_t pos,
4925 struct re_registers *regs, ssize_t stop)
4926 {
4927 regoff_t result;
4928
4929 #ifdef emacs
4930 ssize_t charpos;
4931 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4932 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
4933 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4934 #endif
4935
4936 result = re_match_2_internal (bufp, (re_char*) string1, size1,
4937 (re_char*) string2, size2,
4938 pos, regs, stop);
4939 return result;
4940 }
4941 WEAK_ALIAS (__re_match_2, re_match_2)
4942
4943
4944 /* This is a separate function so that we can force an alloca cleanup
4945 afterwards. */
4946 static regoff_t
4947 re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1,
4948 size_t size1, const re_char *string2, size_t size2,
4949 ssize_t pos, struct re_registers *regs, ssize_t stop)
4950 {
4951 /* General temporaries. */
4952 ssize_t mcnt;
4953 size_t reg;
4954
4955 /* Just past the end of the corresponding string. */
4956 re_char *end1, *end2;
4957
4958 /* Pointers into string1 and string2, just past the last characters in
4959 each to consider matching. */
4960 re_char *end_match_1, *end_match_2;
4961
4962 /* Where we are in the data, and the end of the current string. */
4963 re_char *d, *dend;
4964
4965 /* Used sometimes to remember where we were before starting matching
4966 an operator so that we can go back in case of failure. This "atomic"
4967 behavior of matching opcodes is indispensable to the correctness
4968 of the on_failure_keep_string_jump optimization. */
4969 re_char *dfail;
4970
4971 /* Where we are in the pattern, and the end of the pattern. */
4972 re_char *p = bufp->buffer;
4973 re_char *pend = p + bufp->used;
4974
4975 /* We use this to map every character in the string. */
4976 RE_TRANSLATE_TYPE translate = bufp->translate;
4977
4978 /* Nonzero if BUFP is setup from a multibyte regex. */
4979 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4980
4981 /* Nonzero if STRING1/STRING2 are multibyte. */
4982 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4983
4984 /* Failure point stack. Each place that can handle a failure further
4985 down the line pushes a failure point on this stack. It consists of
4986 regstart, and regend for all registers corresponding to
4987 the subexpressions we're currently inside, plus the number of such
4988 registers, and, finally, two char *'s. The first char * is where
4989 to resume scanning the pattern; the second one is where to resume
4990 scanning the strings. */
4991 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4992 fail_stack_type fail_stack;
4993 #endif
4994 #ifdef DEBUG
4995 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4996 #endif
4997
4998 #if defined REL_ALLOC && defined REGEX_MALLOC
4999 /* This holds the pointer to the failure stack, when
5000 it is allocated relocatably. */
5001 fail_stack_elt_t *failure_stack_ptr;
5002 #endif
5003
5004 /* We fill all the registers internally, independent of what we
5005 return, for use in backreferences. The number here includes
5006 an element for register zero. */
5007 size_t num_regs = bufp->re_nsub + 1;
5008
5009 /* Information on the contents of registers. These are pointers into
5010 the input strings; they record just what was matched (on this
5011 attempt) by a subexpression part of the pattern, that is, the
5012 regnum-th regstart pointer points to where in the pattern we began
5013 matching and the regnum-th regend points to right after where we
5014 stopped matching the regnum-th subexpression. (The zeroth register
5015 keeps track of what the whole pattern matches.) */
5016 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5017 re_char **regstart, **regend;
5018 #endif
5019
5020 /* The following record the register info as found in the above
5021 variables when we find a match better than any we've seen before.
5022 This happens as we backtrack through the failure points, which in
5023 turn happens only if we have not yet matched the entire string. */
5024 unsigned best_regs_set = false;
5025 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5026 re_char **best_regstart, **best_regend;
5027 #endif
5028
5029 /* Logically, this is `best_regend[0]'. But we don't want to have to
5030 allocate space for that if we're not allocating space for anything
5031 else (see below). Also, we never need info about register 0 for
5032 any of the other register vectors, and it seems rather a kludge to
5033 treat `best_regend' differently than the rest. So we keep track of
5034 the end of the best match so far in a separate variable. We
5035 initialize this to NULL so that when we backtrack the first time
5036 and need to test it, it's not garbage. */
5037 re_char *match_end = NULL;
5038
5039 #ifdef DEBUG
5040 /* Counts the total number of registers pushed. */
5041 unsigned num_regs_pushed = 0;
5042 #endif
5043
5044 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5045
5046 INIT_FAIL_STACK ();
5047
5048 #ifdef MATCH_MAY_ALLOCATE
5049 /* Do not bother to initialize all the register variables if there are
5050 no groups in the pattern, as it takes a fair amount of time. If
5051 there are groups, we include space for register 0 (the whole
5052 pattern), even though we never use it, since it simplifies the
5053 array indexing. We should fix this. */
5054 if (bufp->re_nsub)
5055 {
5056 regstart = REGEX_TALLOC (num_regs, re_char *);
5057 regend = REGEX_TALLOC (num_regs, re_char *);
5058 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5059 best_regend = REGEX_TALLOC (num_regs, re_char *);
5060
5061 if (!(regstart && regend && best_regstart && best_regend))
5062 {
5063 FREE_VARIABLES ();
5064 return -2;
5065 }
5066 }
5067 else
5068 {
5069 /* We must initialize all our variables to NULL, so that
5070 `FREE_VARIABLES' doesn't try to free them. */
5071 regstart = regend = best_regstart = best_regend = NULL;
5072 }
5073 #endif /* MATCH_MAY_ALLOCATE */
5074
5075 /* The starting position is bogus. */
5076 if (pos < 0 || pos > size1 + size2)
5077 {
5078 FREE_VARIABLES ();
5079 return -1;
5080 }
5081
5082 /* Initialize subexpression text positions to -1 to mark ones that no
5083 start_memory/stop_memory has been seen for. Also initialize the
5084 register information struct. */
5085 for (reg = 1; reg < num_regs; reg++)
5086 regstart[reg] = regend[reg] = NULL;
5087
5088 /* We move `string1' into `string2' if the latter's empty -- but not if
5089 `string1' is null. */
5090 if (size2 == 0 && string1 != NULL)
5091 {
5092 string2 = string1;
5093 size2 = size1;
5094 string1 = 0;
5095 size1 = 0;
5096 }
5097 end1 = string1 + size1;
5098 end2 = string2 + size2;
5099
5100 /* `p' scans through the pattern as `d' scans through the data.
5101 `dend' is the end of the input string that `d' points within. `d'
5102 is advanced into the following input string whenever necessary, but
5103 this happens before fetching; therefore, at the beginning of the
5104 loop, `d' can be pointing at the end of a string, but it cannot
5105 equal `string2'. */
5106 if (pos >= size1)
5107 {
5108 /* Only match within string2. */
5109 d = string2 + pos - size1;
5110 dend = end_match_2 = string2 + stop - size1;
5111 end_match_1 = end1; /* Just to give it a value. */
5112 }
5113 else
5114 {
5115 if (stop < size1)
5116 {
5117 /* Only match within string1. */
5118 end_match_1 = string1 + stop;
5119 /* BEWARE!
5120 When we reach end_match_1, PREFETCH normally switches to string2.
5121 But in the present case, this means that just doing a PREFETCH
5122 makes us jump from `stop' to `gap' within the string.
5123 What we really want here is for the search to stop as
5124 soon as we hit end_match_1. That's why we set end_match_2
5125 to end_match_1 (since PREFETCH fails as soon as we hit
5126 end_match_2). */
5127 end_match_2 = end_match_1;
5128 }
5129 else
5130 { /* It's important to use this code when stop == size so that
5131 moving `d' from end1 to string2 will not prevent the d == dend
5132 check from catching the end of string. */
5133 end_match_1 = end1;
5134 end_match_2 = string2 + stop - size1;
5135 }
5136 d = string1 + pos;
5137 dend = end_match_1;
5138 }
5139
5140 DEBUG_PRINT1 ("The compiled pattern is: ");
5141 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5142 DEBUG_PRINT1 ("The string to match is: `");
5143 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5144 DEBUG_PRINT1 ("'\n");
5145
5146 /* This loops over pattern commands. It exits by returning from the
5147 function if the match is complete, or it drops through if the match
5148 fails at this starting point in the input data. */
5149 for (;;)
5150 {
5151 DEBUG_PRINT2 ("\n%p: ", p);
5152
5153 if (p == pend)
5154 { /* End of pattern means we might have succeeded. */
5155 DEBUG_PRINT1 ("end of pattern ... ");
5156
5157 /* If we haven't matched the entire string, and we want the
5158 longest match, try backtracking. */
5159 if (d != end_match_2)
5160 {
5161 /* 1 if this match ends in the same string (string1 or string2)
5162 as the best previous match. */
5163 boolean same_str_p = (FIRST_STRING_P (match_end)
5164 == FIRST_STRING_P (d));
5165 /* 1 if this match is the best seen so far. */
5166 boolean best_match_p;
5167
5168 /* AIX compiler got confused when this was combined
5169 with the previous declaration. */
5170 if (same_str_p)
5171 best_match_p = d > match_end;
5172 else
5173 best_match_p = !FIRST_STRING_P (d);
5174
5175 DEBUG_PRINT1 ("backtracking.\n");
5176
5177 if (!FAIL_STACK_EMPTY ())
5178 { /* More failure points to try. */
5179
5180 /* If exceeds best match so far, save it. */
5181 if (!best_regs_set || best_match_p)
5182 {
5183 best_regs_set = true;
5184 match_end = d;
5185
5186 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5187
5188 for (reg = 1; reg < num_regs; reg++)
5189 {
5190 best_regstart[reg] = regstart[reg];
5191 best_regend[reg] = regend[reg];
5192 }
5193 }
5194 goto fail;
5195 }
5196
5197 /* If no failure points, don't restore garbage. And if
5198 last match is real best match, don't restore second
5199 best one. */
5200 else if (best_regs_set && !best_match_p)
5201 {
5202 restore_best_regs:
5203 /* Restore best match. It may happen that `dend ==
5204 end_match_1' while the restored d is in string2.
5205 For example, the pattern `x.*y.*z' against the
5206 strings `x-' and `y-z-', if the two strings are
5207 not consecutive in memory. */
5208 DEBUG_PRINT1 ("Restoring best registers.\n");
5209
5210 d = match_end;
5211 dend = ((d >= string1 && d <= end1)
5212 ? end_match_1 : end_match_2);
5213
5214 for (reg = 1; reg < num_regs; reg++)
5215 {
5216 regstart[reg] = best_regstart[reg];
5217 regend[reg] = best_regend[reg];
5218 }
5219 }
5220 } /* d != end_match_2 */
5221
5222 succeed_label:
5223 DEBUG_PRINT1 ("Accepting match.\n");
5224
5225 /* If caller wants register contents data back, do it. */
5226 if (regs && !bufp->no_sub)
5227 {
5228 /* Have the register data arrays been allocated? */
5229 if (bufp->regs_allocated == REGS_UNALLOCATED)
5230 { /* No. So allocate them with malloc. We need one
5231 extra element beyond `num_regs' for the `-1' marker
5232 GNU code uses. */
5233 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5234 regs->start = TALLOC (regs->num_regs, regoff_t);
5235 regs->end = TALLOC (regs->num_regs, regoff_t);
5236 if (regs->start == NULL || regs->end == NULL)
5237 {
5238 FREE_VARIABLES ();
5239 return -2;
5240 }
5241 bufp->regs_allocated = REGS_REALLOCATE;
5242 }
5243 else if (bufp->regs_allocated == REGS_REALLOCATE)
5244 { /* Yes. If we need more elements than were already
5245 allocated, reallocate them. If we need fewer, just
5246 leave it alone. */
5247 if (regs->num_regs < num_regs + 1)
5248 {
5249 regs->num_regs = num_regs + 1;
5250 RETALLOC (regs->start, regs->num_regs, regoff_t);
5251 RETALLOC (regs->end, regs->num_regs, regoff_t);
5252 if (regs->start == NULL || regs->end == NULL)
5253 {
5254 FREE_VARIABLES ();
5255 return -2;
5256 }
5257 }
5258 }
5259 else
5260 {
5261 /* These braces fend off a "empty body in an else-statement"
5262 warning under GCC when assert expands to nothing. */
5263 assert (bufp->regs_allocated == REGS_FIXED);
5264 }
5265
5266 /* Convert the pointer data in `regstart' and `regend' to
5267 indices. Register zero has to be set differently,
5268 since we haven't kept track of any info for it. */
5269 if (regs->num_regs > 0)
5270 {
5271 regs->start[0] = pos;
5272 regs->end[0] = POINTER_TO_OFFSET (d);
5273 }
5274
5275 /* Go through the first `min (num_regs, regs->num_regs)'
5276 registers, since that is all we initialized. */
5277 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
5278 {
5279 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5280 regs->start[reg] = regs->end[reg] = -1;
5281 else
5282 {
5283 regs->start[reg]
5284 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5285 regs->end[reg]
5286 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
5287 }
5288 }
5289
5290 /* If the regs structure we return has more elements than
5291 were in the pattern, set the extra elements to -1. If
5292 we (re)allocated the registers, this is the case,
5293 because we always allocate enough to have at least one
5294 -1 at the end. */
5295 for (reg = num_regs; reg < regs->num_regs; reg++)
5296 regs->start[reg] = regs->end[reg] = -1;
5297 } /* regs && !bufp->no_sub */
5298
5299 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5300 nfailure_points_pushed, nfailure_points_popped,
5301 nfailure_points_pushed - nfailure_points_popped);
5302 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
5303
5304 mcnt = POINTER_TO_OFFSET (d) - pos;
5305
5306 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
5307
5308 FREE_VARIABLES ();
5309 return mcnt;
5310 }
5311
5312 /* Otherwise match next pattern command. */
5313 switch (*p++)
5314 {
5315 /* Ignore these. Used to ignore the n of succeed_n's which
5316 currently have n == 0. */
5317 case no_op:
5318 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5319 break;
5320
5321 case succeed:
5322 DEBUG_PRINT1 ("EXECUTING succeed.\n");
5323 goto succeed_label;
5324
5325 /* Match the next n pattern characters exactly. The following
5326 byte in the pattern defines n, and the n bytes after that
5327 are the characters to match. */
5328 case exactn:
5329 mcnt = *p++;
5330 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
5331
5332 /* Remember the start point to rollback upon failure. */
5333 dfail = d;
5334
5335 #ifndef emacs
5336 /* This is written out as an if-else so we don't waste time
5337 testing `translate' inside the loop. */
5338 if (RE_TRANSLATE_P (translate))
5339 do
5340 {
5341 PREFETCH ();
5342 if (RE_TRANSLATE (translate, *d) != *p++)
5343 {
5344 d = dfail;
5345 goto fail;
5346 }
5347 d++;
5348 }
5349 while (--mcnt);
5350 else
5351 do
5352 {
5353 PREFETCH ();
5354 if (*d++ != *p++)
5355 {
5356 d = dfail;
5357 goto fail;
5358 }
5359 }
5360 while (--mcnt);
5361 #else /* emacs */
5362 /* The cost of testing `translate' is comparatively small. */
5363 if (target_multibyte)
5364 do
5365 {
5366 int pat_charlen, buf_charlen;
5367 int pat_ch, buf_ch;
5368
5369 PREFETCH ();
5370 if (multibyte)
5371 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5372 else
5373 {
5374 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5375 pat_charlen = 1;
5376 }
5377 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
5378
5379 if (TRANSLATE (buf_ch) != pat_ch)
5380 {
5381 d = dfail;
5382 goto fail;
5383 }
5384
5385 p += pat_charlen;
5386 d += buf_charlen;
5387 mcnt -= pat_charlen;
5388 }
5389 while (mcnt > 0);
5390 else
5391 do
5392 {
5393 int pat_charlen;
5394 int pat_ch, buf_ch;
5395
5396 PREFETCH ();
5397 if (multibyte)
5398 {
5399 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5400 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
5401 }
5402 else
5403 {
5404 pat_ch = *p;
5405 pat_charlen = 1;
5406 }
5407 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5408 if (! CHAR_BYTE8_P (buf_ch))
5409 {
5410 buf_ch = TRANSLATE (buf_ch);
5411 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5412 if (buf_ch < 0)
5413 buf_ch = *d;
5414 }
5415 else
5416 buf_ch = *d;
5417 if (buf_ch != pat_ch)
5418 {
5419 d = dfail;
5420 goto fail;
5421 }
5422 p += pat_charlen;
5423 d++;
5424 }
5425 while (--mcnt);
5426 #endif
5427 break;
5428
5429
5430 /* Match any character except possibly a newline or a null. */
5431 case anychar:
5432 {
5433 int buf_charlen;
5434 re_wchar_t buf_ch;
5435
5436 DEBUG_PRINT1 ("EXECUTING anychar.\n");
5437
5438 PREFETCH ();
5439 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
5440 target_multibyte);
5441 buf_ch = TRANSLATE (buf_ch);
5442
5443 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5444 && buf_ch == '\n')
5445 || ((bufp->syntax & RE_DOT_NOT_NULL)
5446 && buf_ch == '\000'))
5447 goto fail;
5448
5449 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5450 d += buf_charlen;
5451 }
5452 break;
5453
5454
5455 case charset:
5456 case charset_not:
5457 {
5458 register unsigned int c;
5459 boolean not = (re_opcode_t) *(p - 1) == charset_not;
5460 int len;
5461
5462 /* Start of actual range_table, or end of bitmap if there is no
5463 range table. */
5464 re_char *range_table IF_LINT (= NULL);
5465
5466 /* Nonzero if there is a range table. */
5467 int range_table_exists;
5468
5469 /* Number of ranges of range table. This is not included
5470 in the initial byte-length of the command. */
5471 int count = 0;
5472
5473 /* Whether matching against a unibyte character. */
5474 boolean unibyte_char = false;
5475
5476 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
5477
5478 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
5479
5480 if (range_table_exists)
5481 {
5482 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5483 EXTRACT_NUMBER_AND_INCR (count, range_table);
5484 }
5485
5486 PREFETCH ();
5487 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
5488 if (target_multibyte)
5489 {
5490 int c1;
5491
5492 c = TRANSLATE (c);
5493 c1 = RE_CHAR_TO_UNIBYTE (c);
5494 if (c1 >= 0)
5495 {
5496 unibyte_char = true;
5497 c = c1;
5498 }
5499 }
5500 else
5501 {
5502 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5503
5504 if (! CHAR_BYTE8_P (c1))
5505 {
5506 c1 = TRANSLATE (c1);
5507 c1 = RE_CHAR_TO_UNIBYTE (c1);
5508 if (c1 >= 0)
5509 {
5510 unibyte_char = true;
5511 c = c1;
5512 }
5513 }
5514 else
5515 unibyte_char = true;
5516 }
5517
5518 if (unibyte_char && c < (1 << BYTEWIDTH))
5519 { /* Lookup bitmap. */
5520 /* Cast to `unsigned' instead of `unsigned char' in
5521 case the bit list is a full 32 bytes long. */
5522 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
5523 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5524 not = !not;
5525 }
5526 #ifdef emacs
5527 else if (range_table_exists)
5528 {
5529 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5530
5531 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5532 | (class_bits & BIT_MULTIBYTE)
5533 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5534 | (class_bits & BIT_SPACE && ISSPACE (c))
5535 | (class_bits & BIT_UPPER && ISUPPER (c))
5536 | (class_bits & BIT_WORD && ISWORD (c)))
5537 not = !not;
5538 else
5539 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5540 }
5541 #endif /* emacs */
5542
5543 if (range_table_exists)
5544 p = CHARSET_RANGE_TABLE_END (range_table, count);
5545 else
5546 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
5547
5548 if (!not) goto fail;
5549
5550 d += len;
5551 }
5552 break;
5553
5554
5555 /* The beginning of a group is represented by start_memory.
5556 The argument is the register number. The text
5557 matched within the group is recorded (in the internal
5558 registers data structure) under the register number. */
5559 case start_memory:
5560 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5561
5562 /* In case we need to undo this operation (via backtracking). */
5563 PUSH_FAILURE_REG ((unsigned int)*p);
5564
5565 regstart[*p] = d;
5566 regend[*p] = NULL; /* probably unnecessary. -sm */
5567 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5568
5569 /* Move past the register number and inner group count. */
5570 p += 1;
5571 break;
5572
5573
5574 /* The stop_memory opcode represents the end of a group. Its
5575 argument is the same as start_memory's: the register number. */
5576 case stop_memory:
5577 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5578
5579 assert (!REG_UNSET (regstart[*p]));
5580 /* Strictly speaking, there should be code such as:
5581
5582 assert (REG_UNSET (regend[*p]));
5583 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5584
5585 But the only info to be pushed is regend[*p] and it is known to
5586 be UNSET, so there really isn't anything to push.
5587 Not pushing anything, on the other hand deprives us from the
5588 guarantee that regend[*p] is UNSET since undoing this operation
5589 will not reset its value properly. This is not important since
5590 the value will only be read on the next start_memory or at
5591 the very end and both events can only happen if this stop_memory
5592 is *not* undone. */
5593
5594 regend[*p] = d;
5595 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5596
5597 /* Move past the register number and the inner group count. */
5598 p += 1;
5599 break;
5600
5601
5602 /* \<digit> has been turned into a `duplicate' command which is
5603 followed by the numeric value of <digit> as the register number. */
5604 case duplicate:
5605 {
5606 register re_char *d2, *dend2;
5607 int regno = *p++; /* Get which register to match against. */
5608 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5609
5610 /* Can't back reference a group which we've never matched. */
5611 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5612 goto fail;
5613
5614 /* Where in input to try to start matching. */
5615 d2 = regstart[regno];
5616
5617 /* Remember the start point to rollback upon failure. */
5618 dfail = d;
5619
5620 /* Where to stop matching; if both the place to start and
5621 the place to stop matching are in the same string, then
5622 set to the place to stop, otherwise, for now have to use
5623 the end of the first string. */
5624
5625 dend2 = ((FIRST_STRING_P (regstart[regno])
5626 == FIRST_STRING_P (regend[regno]))
5627 ? regend[regno] : end_match_1);
5628 for (;;)
5629 {
5630 /* If necessary, advance to next segment in register
5631 contents. */
5632 while (d2 == dend2)
5633 {
5634 if (dend2 == end_match_2) break;
5635 if (dend2 == regend[regno]) break;
5636
5637 /* End of string1 => advance to string2. */
5638 d2 = string2;
5639 dend2 = regend[regno];
5640 }
5641 /* At end of register contents => success */
5642 if (d2 == dend2) break;
5643
5644 /* If necessary, advance to next segment in data. */
5645 PREFETCH ();
5646
5647 /* How many characters left in this segment to match. */
5648 mcnt = dend - d;
5649
5650 /* Want how many consecutive characters we can match in
5651 one shot, so, if necessary, adjust the count. */
5652 if (mcnt > dend2 - d2)
5653 mcnt = dend2 - d2;
5654
5655 /* Compare that many; failure if mismatch, else move
5656 past them. */
5657 if (RE_TRANSLATE_P (translate)
5658 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
5659 : memcmp (d, d2, mcnt))
5660 {
5661 d = dfail;
5662 goto fail;
5663 }
5664 d += mcnt, d2 += mcnt;
5665 }
5666 }
5667 break;
5668
5669
5670 /* begline matches the empty string at the beginning of the string
5671 (unless `not_bol' is set in `bufp'), and after newlines. */
5672 case begline:
5673 DEBUG_PRINT1 ("EXECUTING begline.\n");
5674
5675 if (AT_STRINGS_BEG (d))
5676 {
5677 if (!bufp->not_bol) break;
5678 }
5679 else
5680 {
5681 unsigned c;
5682 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5683 if (c == '\n')
5684 break;
5685 }
5686 /* In all other cases, we fail. */
5687 goto fail;
5688
5689
5690 /* endline is the dual of begline. */
5691 case endline:
5692 DEBUG_PRINT1 ("EXECUTING endline.\n");
5693
5694 if (AT_STRINGS_END (d))
5695 {
5696 if (!bufp->not_eol) break;
5697 }
5698 else
5699 {
5700 PREFETCH_NOLIMIT ();
5701 if (*d == '\n')
5702 break;
5703 }
5704 goto fail;
5705
5706
5707 /* Match at the very beginning of the data. */
5708 case begbuf:
5709 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5710 if (AT_STRINGS_BEG (d))
5711 break;
5712 goto fail;
5713
5714
5715 /* Match at the very end of the data. */
5716 case endbuf:
5717 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5718 if (AT_STRINGS_END (d))
5719 break;
5720 goto fail;
5721
5722
5723 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5724 pushes NULL as the value for the string on the stack. Then
5725 `POP_FAILURE_POINT' will keep the current value for the
5726 string, instead of restoring it. To see why, consider
5727 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5728 then the . fails against the \n. But the next thing we want
5729 to do is match the \n against the \n; if we restored the
5730 string value, we would be back at the foo.
5731
5732 Because this is used only in specific cases, we don't need to
5733 check all the things that `on_failure_jump' does, to make
5734 sure the right things get saved on the stack. Hence we don't
5735 share its code. The only reason to push anything on the
5736 stack at all is that otherwise we would have to change
5737 `anychar's code to do something besides goto fail in this
5738 case; that seems worse than this. */
5739 case on_failure_keep_string_jump:
5740 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5741 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5742 mcnt, p + mcnt);
5743
5744 PUSH_FAILURE_POINT (p - 3, NULL);
5745 break;
5746
5747 /* A nasty loop is introduced by the non-greedy *? and +?.
5748 With such loops, the stack only ever contains one failure point
5749 at a time, so that a plain on_failure_jump_loop kind of
5750 cycle detection cannot work. Worse yet, such a detection
5751 can not only fail to detect a cycle, but it can also wrongly
5752 detect a cycle (between different instantiations of the same
5753 loop).
5754 So the method used for those nasty loops is a little different:
5755 We use a special cycle-detection-stack-frame which is pushed
5756 when the on_failure_jump_nastyloop failure-point is *popped*.
5757 This special frame thus marks the beginning of one iteration
5758 through the loop and we can hence easily check right here
5759 whether something matched between the beginning and the end of
5760 the loop. */
5761 case on_failure_jump_nastyloop:
5762 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5763 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5764 mcnt, p + mcnt);
5765
5766 assert ((re_opcode_t)p[-4] == no_op);
5767 {
5768 int cycle = 0;
5769 CHECK_INFINITE_LOOP (p - 4, d);
5770 if (!cycle)
5771 /* If there's a cycle, just continue without pushing
5772 this failure point. The failure point is the "try again"
5773 option, which shouldn't be tried.
5774 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5775 PUSH_FAILURE_POINT (p - 3, d);
5776 }
5777 break;
5778
5779 /* Simple loop detecting on_failure_jump: just check on the
5780 failure stack if the same spot was already hit earlier. */
5781 case on_failure_jump_loop:
5782 on_failure:
5783 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5784 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5785 mcnt, p + mcnt);
5786 {
5787 int cycle = 0;
5788 CHECK_INFINITE_LOOP (p - 3, d);
5789 if (cycle)
5790 /* If there's a cycle, get out of the loop, as if the matching
5791 had failed. We used to just `goto fail' here, but that was
5792 aborting the search a bit too early: we want to keep the
5793 empty-loop-match and keep matching after the loop.
5794 We want (x?)*y\1z to match both xxyz and xxyxz. */
5795 p += mcnt;
5796 else
5797 PUSH_FAILURE_POINT (p - 3, d);
5798 }
5799 break;
5800
5801
5802 /* Uses of on_failure_jump:
5803
5804 Each alternative starts with an on_failure_jump that points
5805 to the beginning of the next alternative. Each alternative
5806 except the last ends with a jump that in effect jumps past
5807 the rest of the alternatives. (They really jump to the
5808 ending jump of the following alternative, because tensioning
5809 these jumps is a hassle.)
5810
5811 Repeats start with an on_failure_jump that points past both
5812 the repetition text and either the following jump or
5813 pop_failure_jump back to this on_failure_jump. */
5814 case on_failure_jump:
5815 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5816 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5817 mcnt, p + mcnt);
5818
5819 PUSH_FAILURE_POINT (p -3, d);
5820 break;
5821
5822 /* This operation is used for greedy *.
5823 Compare the beginning of the repeat with what in the
5824 pattern follows its end. If we can establish that there
5825 is nothing that they would both match, i.e., that we
5826 would have to backtrack because of (as in, e.g., `a*a')
5827 then we can use a non-backtracking loop based on
5828 on_failure_keep_string_jump instead of on_failure_jump. */
5829 case on_failure_jump_smart:
5830 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5831 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5832 mcnt, p + mcnt);
5833 {
5834 re_char *p1 = p; /* Next operation. */
5835 /* Here, we discard `const', making re_match non-reentrant. */
5836 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5837 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
5838
5839 p -= 3; /* Reset so that we will re-execute the
5840 instruction once it's been changed. */
5841
5842 EXTRACT_NUMBER (mcnt, p2 - 2);
5843
5844 /* Ensure this is a indeed the trivial kind of loop
5845 we are expecting. */
5846 assert (skip_one_char (p1) == p2 - 3);
5847 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
5848 DEBUG_STATEMENT (debug += 2);
5849 if (mutually_exclusive_p (bufp, p1, p2))
5850 {
5851 /* Use a fast `on_failure_keep_string_jump' loop. */
5852 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
5853 *p3 = (unsigned char) on_failure_keep_string_jump;
5854 STORE_NUMBER (p2 - 2, mcnt + 3);
5855 }
5856 else
5857 {
5858 /* Default to a safe `on_failure_jump' loop. */
5859 DEBUG_PRINT1 (" smart default => slow loop.\n");
5860 *p3 = (unsigned char) on_failure_jump;
5861 }
5862 DEBUG_STATEMENT (debug -= 2);
5863 }
5864 break;
5865
5866 /* Unconditionally jump (without popping any failure points). */
5867 case jump:
5868 unconditional_jump:
5869 IMMEDIATE_QUIT_CHECK;
5870 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5871 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5872 p += mcnt; /* Do the jump. */
5873 DEBUG_PRINT2 ("(to %p).\n", p);
5874 break;
5875
5876
5877 /* Have to succeed matching what follows at least n times.
5878 After that, handle like `on_failure_jump'. */
5879 case succeed_n:
5880 /* Signedness doesn't matter since we only compare MCNT to 0. */
5881 EXTRACT_NUMBER (mcnt, p + 2);
5882 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5883
5884 /* Originally, mcnt is how many times we HAVE to succeed. */
5885 if (mcnt != 0)
5886 {
5887 /* Here, we discard `const', making re_match non-reentrant. */
5888 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
5889 mcnt--;
5890 p += 4;
5891 PUSH_NUMBER (p2, mcnt);
5892 }
5893 else
5894 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5895 goto on_failure;
5896 break;
5897
5898 case jump_n:
5899 /* Signedness doesn't matter since we only compare MCNT to 0. */
5900 EXTRACT_NUMBER (mcnt, p + 2);
5901 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5902
5903 /* Originally, this is how many times we CAN jump. */
5904 if (mcnt != 0)
5905 {
5906 /* Here, we discard `const', making re_match non-reentrant. */
5907 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
5908 mcnt--;
5909 PUSH_NUMBER (p2, mcnt);
5910 goto unconditional_jump;
5911 }
5912 /* If don't have to jump any more, skip over the rest of command. */
5913 else
5914 p += 4;
5915 break;
5916
5917 case set_number_at:
5918 {
5919 unsigned char *p2; /* Location of the counter. */
5920 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5921
5922 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5923 /* Here, we discard `const', making re_match non-reentrant. */
5924 p2 = (unsigned char*) p + mcnt;
5925 /* Signedness doesn't matter since we only copy MCNT's bits . */
5926 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5927 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
5928 PUSH_NUMBER (p2, mcnt);
5929 break;
5930 }
5931
5932 case wordbound:
5933 case notwordbound:
5934 {
5935 boolean not = (re_opcode_t) *(p - 1) == notwordbound;
5936 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
5937
5938 /* We SUCCEED (or FAIL) in one of the following cases: */
5939
5940 /* Case 1: D is at the beginning or the end of string. */
5941 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5942 not = !not;
5943 else
5944 {
5945 /* C1 is the character before D, S1 is the syntax of C1, C2
5946 is the character at D, and S2 is the syntax of C2. */
5947 re_wchar_t c1, c2;
5948 int s1, s2;
5949 int dummy;
5950 #ifdef emacs
5951 ssize_t offset = PTR_TO_OFFSET (d - 1);
5952 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5953 UPDATE_SYNTAX_TABLE (charpos);
5954 #endif
5955 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5956 s1 = SYNTAX (c1);
5957 #ifdef emacs
5958 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5959 #endif
5960 PREFETCH_NOLIMIT ();
5961 GET_CHAR_AFTER (c2, d, dummy);
5962 s2 = SYNTAX (c2);
5963
5964 if (/* Case 2: Only one of S1 and S2 is Sword. */
5965 ((s1 == Sword) != (s2 == Sword))
5966 /* Case 3: Both of S1 and S2 are Sword, and macro
5967 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5968 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5969 not = !not;
5970 }
5971 if (not)
5972 break;
5973 else
5974 goto fail;
5975 }
5976
5977 case wordbeg:
5978 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5979
5980 /* We FAIL in one of the following cases: */
5981
5982 /* Case 1: D is at the end of string. */
5983 if (AT_STRINGS_END (d))
5984 goto fail;
5985 else
5986 {
5987 /* C1 is the character before D, S1 is the syntax of C1, C2
5988 is the character at D, and S2 is the syntax of C2. */
5989 re_wchar_t c1, c2;
5990 int s1, s2;
5991 int dummy;
5992 #ifdef emacs
5993 ssize_t offset = PTR_TO_OFFSET (d);
5994 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5995 UPDATE_SYNTAX_TABLE (charpos);
5996 #endif
5997 PREFETCH ();
5998 GET_CHAR_AFTER (c2, d, dummy);
5999 s2 = SYNTAX (c2);
6000
6001 /* Case 2: S2 is not Sword. */
6002 if (s2 != Sword)
6003 goto fail;
6004
6005 /* Case 3: D is not at the beginning of string ... */
6006 if (!AT_STRINGS_BEG (d))
6007 {
6008 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6009 #ifdef emacs
6010 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6011 #endif
6012 s1 = SYNTAX (c1);
6013
6014 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
6015 returns 0. */
6016 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6017 goto fail;
6018 }
6019 }
6020 break;
6021
6022 case wordend:
6023 DEBUG_PRINT1 ("EXECUTING wordend.\n");
6024
6025 /* We FAIL in one of the following cases: */
6026
6027 /* Case 1: D is at the beginning of string. */
6028 if (AT_STRINGS_BEG (d))
6029 goto fail;
6030 else
6031 {
6032 /* C1 is the character before D, S1 is the syntax of C1, C2
6033 is the character at D, and S2 is the syntax of C2. */
6034 re_wchar_t c1, c2;
6035 int s1, s2;
6036 int dummy;
6037 #ifdef emacs
6038 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6039 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6040 UPDATE_SYNTAX_TABLE (charpos);
6041 #endif
6042 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6043 s1 = SYNTAX (c1);
6044
6045 /* Case 2: S1 is not Sword. */
6046 if (s1 != Sword)
6047 goto fail;
6048
6049 /* Case 3: D is not at the end of string ... */
6050 if (!AT_STRINGS_END (d))
6051 {
6052 PREFETCH_NOLIMIT ();
6053 GET_CHAR_AFTER (c2, d, dummy);
6054 #ifdef emacs
6055 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6056 #endif
6057 s2 = SYNTAX (c2);
6058
6059 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
6060 returns 0. */
6061 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6062 goto fail;
6063 }
6064 }
6065 break;
6066
6067 case symbeg:
6068 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6069
6070 /* We FAIL in one of the following cases: */
6071
6072 /* Case 1: D is at the end of string. */
6073 if (AT_STRINGS_END (d))
6074 goto fail;
6075 else
6076 {
6077 /* C1 is the character before D, S1 is the syntax of C1, C2
6078 is the character at D, and S2 is the syntax of C2. */
6079 re_wchar_t c1, c2;
6080 int s1, s2;
6081 #ifdef emacs
6082 ssize_t offset = PTR_TO_OFFSET (d);
6083 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6084 UPDATE_SYNTAX_TABLE (charpos);
6085 #endif
6086 PREFETCH ();
6087 c2 = RE_STRING_CHAR (d, target_multibyte);
6088 s2 = SYNTAX (c2);
6089
6090 /* Case 2: S2 is neither Sword nor Ssymbol. */
6091 if (s2 != Sword && s2 != Ssymbol)
6092 goto fail;
6093
6094 /* Case 3: D is not at the beginning of string ... */
6095 if (!AT_STRINGS_BEG (d))
6096 {
6097 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6098 #ifdef emacs
6099 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6100 #endif
6101 s1 = SYNTAX (c1);
6102
6103 /* ... and S1 is Sword or Ssymbol. */
6104 if (s1 == Sword || s1 == Ssymbol)
6105 goto fail;
6106 }
6107 }
6108 break;
6109
6110 case symend:
6111 DEBUG_PRINT1 ("EXECUTING symend.\n");
6112
6113 /* We FAIL in one of the following cases: */
6114
6115 /* Case 1: D is at the beginning of string. */
6116 if (AT_STRINGS_BEG (d))
6117 goto fail;
6118 else
6119 {
6120 /* C1 is the character before D, S1 is the syntax of C1, C2
6121 is the character at D, and S2 is the syntax of C2. */
6122 re_wchar_t c1, c2;
6123 int s1, s2;
6124 #ifdef emacs
6125 ssize_t offset = PTR_TO_OFFSET (d) - 1;
6126 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6127 UPDATE_SYNTAX_TABLE (charpos);
6128 #endif
6129 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6130 s1 = SYNTAX (c1);
6131
6132 /* Case 2: S1 is neither Ssymbol nor Sword. */
6133 if (s1 != Sword && s1 != Ssymbol)
6134 goto fail;
6135
6136 /* Case 3: D is not at the end of string ... */
6137 if (!AT_STRINGS_END (d))
6138 {
6139 PREFETCH_NOLIMIT ();
6140 c2 = RE_STRING_CHAR (d, target_multibyte);
6141 #ifdef emacs
6142 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
6143 #endif
6144 s2 = SYNTAX (c2);
6145
6146 /* ... and S2 is Sword or Ssymbol. */
6147 if (s2 == Sword || s2 == Ssymbol)
6148 goto fail;
6149 }
6150 }
6151 break;
6152
6153 case syntaxspec:
6154 case notsyntaxspec:
6155 {
6156 boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6157 mcnt = *p++;
6158 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
6159 PREFETCH ();
6160 #ifdef emacs
6161 {
6162 ssize_t offset = PTR_TO_OFFSET (d);
6163 ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6164 UPDATE_SYNTAX_TABLE (pos1);
6165 }
6166 #endif
6167 {
6168 int len;
6169 re_wchar_t c;
6170
6171 GET_CHAR_AFTER (c, d, len);
6172 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6173 goto fail;
6174 d += len;
6175 }
6176 }
6177 break;
6178
6179 #ifdef emacs
6180 case before_dot:
6181 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6182 if (PTR_BYTE_POS (d) >= PT_BYTE)
6183 goto fail;
6184 break;
6185
6186 case at_dot:
6187 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6188 if (PTR_BYTE_POS (d) != PT_BYTE)
6189 goto fail;
6190 break;
6191
6192 case after_dot:
6193 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6194 if (PTR_BYTE_POS (d) <= PT_BYTE)
6195 goto fail;
6196 break;
6197
6198 case categoryspec:
6199 case notcategoryspec:
6200 {
6201 boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6202 mcnt = *p++;
6203 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n",
6204 not?"not":"", mcnt);
6205 PREFETCH ();
6206
6207 {
6208 int len;
6209 re_wchar_t c;
6210 GET_CHAR_AFTER (c, d, len);
6211 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6212 goto fail;
6213 d += len;
6214 }
6215 }
6216 break;
6217
6218 #endif /* emacs */
6219
6220 default:
6221 abort ();
6222 }
6223 continue; /* Successfully executed one pattern command; keep going. */
6224
6225
6226 /* We goto here if a matching operation fails. */
6227 fail:
6228 IMMEDIATE_QUIT_CHECK;
6229 if (!FAIL_STACK_EMPTY ())
6230 {
6231 re_char *str, *pat;
6232 /* A restart point is known. Restore to that state. */
6233 DEBUG_PRINT1 ("\nFAIL:\n");
6234 POP_FAILURE_POINT (str, pat);
6235 switch (*pat++)
6236 {
6237 case on_failure_keep_string_jump:
6238 assert (str == NULL);
6239 goto continue_failure_jump;
6240
6241 case on_failure_jump_nastyloop:
6242 assert ((re_opcode_t)pat[-2] == no_op);
6243 PUSH_FAILURE_POINT (pat - 2, str);
6244 /* Fallthrough */
6245
6246 case on_failure_jump_loop:
6247 case on_failure_jump:
6248 case succeed_n:
6249 d = str;
6250 continue_failure_jump:
6251 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6252 p = pat + mcnt;
6253 break;
6254
6255 case no_op:
6256 /* A special frame used for nastyloops. */
6257 goto fail;
6258
6259 default:
6260 abort ();
6261 }
6262
6263 assert (p >= bufp->buffer && p <= pend);
6264
6265 if (d >= string1 && d <= end1)
6266 dend = end_match_1;
6267 }
6268 else
6269 break; /* Matching at this starting point really fails. */
6270 } /* for (;;) */
6271
6272 if (best_regs_set)
6273 goto restore_best_regs;
6274
6275 FREE_VARIABLES ();
6276
6277 return -1; /* Failure to match. */
6278 } /* re_match_2 */
6279 \f
6280 /* Subroutine definitions for re_match_2. */
6281
6282 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6283 bytes; nonzero otherwise. */
6284
6285 static int
6286 bcmp_translate (const re_char *s1, const re_char *s2, register ssize_t len,
6287 RE_TRANSLATE_TYPE translate, const int target_multibyte)
6288 {
6289 register re_char *p1 = s1, *p2 = s2;
6290 re_char *p1_end = s1 + len;
6291 re_char *p2_end = s2 + len;
6292
6293 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6294 different lengths, but relying on a single `len' would break this. -sm */
6295 while (p1 < p1_end && p2 < p2_end)
6296 {
6297 int p1_charlen, p2_charlen;
6298 re_wchar_t p1_ch, p2_ch;
6299
6300 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6301 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
6302
6303 if (RE_TRANSLATE (translate, p1_ch)
6304 != RE_TRANSLATE (translate, p2_ch))
6305 return 1;
6306
6307 p1 += p1_charlen, p2 += p2_charlen;
6308 }
6309
6310 if (p1 != p1_end || p2 != p2_end)
6311 return 1;
6312
6313 return 0;
6314 }
6315 \f
6316 /* Entry points for GNU code. */
6317
6318 /* re_compile_pattern is the GNU regular expression compiler: it
6319 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6320 Returns 0 if the pattern was valid, otherwise an error string.
6321
6322 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6323 are set in BUFP on entry.
6324
6325 We call regex_compile to do the actual compilation. */
6326
6327 const char *
6328 re_compile_pattern (const char *pattern, size_t length,
6329 struct re_pattern_buffer *bufp)
6330 {
6331 reg_errcode_t ret;
6332
6333 /* GNU code is written to assume at least RE_NREGS registers will be set
6334 (and at least one extra will be -1). */
6335 bufp->regs_allocated = REGS_UNALLOCATED;
6336
6337 /* And GNU code determines whether or not to get register information
6338 by passing null for the REGS argument to re_match, etc., not by
6339 setting no_sub. */
6340 bufp->no_sub = 0;
6341
6342 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
6343
6344 if (!ret)
6345 return NULL;
6346 return gettext (re_error_msgid[(int) ret]);
6347 }
6348 WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
6349 \f
6350 /* Entry points compatible with 4.2 BSD regex library. We don't define
6351 them unless specifically requested. */
6352
6353 #if defined _REGEX_RE_COMP || defined _LIBC
6354
6355 /* BSD has one and only one pattern buffer. */
6356 static struct re_pattern_buffer re_comp_buf;
6357
6358 char *
6359 # ifdef _LIBC
6360 /* Make these definitions weak in libc, so POSIX programs can redefine
6361 these names if they don't use our functions, and still use
6362 regcomp/regexec below without link errors. */
6363 weak_function
6364 # endif
6365 re_comp (const char *s)
6366 {
6367 reg_errcode_t ret;
6368
6369 if (!s)
6370 {
6371 if (!re_comp_buf.buffer)
6372 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6373 return (char *) gettext ("No previous regular expression");
6374 return 0;
6375 }
6376
6377 if (!re_comp_buf.buffer)
6378 {
6379 re_comp_buf.buffer = malloc (200);
6380 if (re_comp_buf.buffer == NULL)
6381 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6382 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6383 re_comp_buf.allocated = 200;
6384
6385 re_comp_buf.fastmap = malloc (1 << BYTEWIDTH);
6386 if (re_comp_buf.fastmap == NULL)
6387 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6388 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6389 }
6390
6391 /* Since `re_exec' always passes NULL for the `regs' argument, we
6392 don't need to initialize the pattern buffer fields which affect it. */
6393
6394 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6395
6396 if (!ret)
6397 return NULL;
6398
6399 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6400 return (char *) gettext (re_error_msgid[(int) ret]);
6401 }
6402
6403
6404 int
6405 # ifdef _LIBC
6406 weak_function
6407 # endif
6408 re_exec (const char *s)
6409 {
6410 const size_t len = strlen (s);
6411 return
6412 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6413 }
6414 #endif /* _REGEX_RE_COMP */
6415 \f
6416 /* POSIX.2 functions. Don't define these for Emacs. */
6417
6418 #ifndef emacs
6419
6420 /* regcomp takes a regular expression as a string and compiles it.
6421
6422 PREG is a regex_t *. We do not expect any fields to be initialized,
6423 since POSIX says we shouldn't. Thus, we set
6424
6425 `buffer' to the compiled pattern;
6426 `used' to the length of the compiled pattern;
6427 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6428 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6429 RE_SYNTAX_POSIX_BASIC;
6430 `fastmap' to an allocated space for the fastmap;
6431 `fastmap_accurate' to zero;
6432 `re_nsub' to the number of subexpressions in PATTERN.
6433
6434 PATTERN is the address of the pattern string.
6435
6436 CFLAGS is a series of bits which affect compilation.
6437
6438 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6439 use POSIX basic syntax.
6440
6441 If REG_NEWLINE is set, then . and [^...] don't match newline.
6442 Also, regexec will try a match beginning after every newline.
6443
6444 If REG_ICASE is set, then we considers upper- and lowercase
6445 versions of letters to be equivalent when matching.
6446
6447 If REG_NOSUB is set, then when PREG is passed to regexec, that
6448 routine will report only success or failure, and nothing about the
6449 registers.
6450
6451 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6452 the return codes and their meanings.) */
6453
6454 reg_errcode_t
6455 regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6456 int cflags)
6457 {
6458 reg_errcode_t ret;
6459 reg_syntax_t syntax
6460 = (cflags & REG_EXTENDED) ?
6461 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6462
6463 /* regex_compile will allocate the space for the compiled pattern. */
6464 preg->buffer = 0;
6465 preg->allocated = 0;
6466 preg->used = 0;
6467
6468 /* Try to allocate space for the fastmap. */
6469 preg->fastmap = malloc (1 << BYTEWIDTH);
6470
6471 if (cflags & REG_ICASE)
6472 {
6473 unsigned i;
6474
6475 preg->translate = malloc (CHAR_SET_SIZE * sizeof *preg->translate);
6476 if (preg->translate == NULL)
6477 return (int) REG_ESPACE;
6478
6479 /* Map uppercase characters to corresponding lowercase ones. */
6480 for (i = 0; i < CHAR_SET_SIZE; i++)
6481 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
6482 }
6483 else
6484 preg->translate = NULL;
6485
6486 /* If REG_NEWLINE is set, newlines are treated differently. */
6487 if (cflags & REG_NEWLINE)
6488 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6489 syntax &= ~RE_DOT_NEWLINE;
6490 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6491 }
6492 else
6493 syntax |= RE_NO_NEWLINE_ANCHOR;
6494
6495 preg->no_sub = !!(cflags & REG_NOSUB);
6496
6497 /* POSIX says a null character in the pattern terminates it, so we
6498 can use strlen here in compiling the pattern. */
6499 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
6500
6501 /* POSIX doesn't distinguish between an unmatched open-group and an
6502 unmatched close-group: both are REG_EPAREN. */
6503 if (ret == REG_ERPAREN)
6504 ret = REG_EPAREN;
6505
6506 if (ret == REG_NOERROR && preg->fastmap)
6507 { /* Compute the fastmap now, since regexec cannot modify the pattern
6508 buffer. */
6509 re_compile_fastmap (preg);
6510 if (preg->can_be_null)
6511 { /* The fastmap can't be used anyway. */
6512 free (preg->fastmap);
6513 preg->fastmap = NULL;
6514 }
6515 }
6516 return ret;
6517 }
6518 WEAK_ALIAS (__regcomp, regcomp)
6519
6520
6521 /* regexec searches for a given pattern, specified by PREG, in the
6522 string STRING.
6523
6524 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6525 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6526 least NMATCH elements, and we set them to the offsets of the
6527 corresponding matched substrings.
6528
6529 EFLAGS specifies `execution flags' which affect matching: if
6530 REG_NOTBOL is set, then ^ does not match at the beginning of the
6531 string; if REG_NOTEOL is set, then $ does not match at the end.
6532
6533 We return 0 if we find a match and REG_NOMATCH if not. */
6534
6535 reg_errcode_t
6536 regexec (const regex_t *__restrict preg, const char *__restrict string,
6537 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
6538 {
6539 regoff_t ret;
6540 struct re_registers regs;
6541 regex_t private_preg;
6542 size_t len = strlen (string);
6543 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
6544
6545 private_preg = *preg;
6546
6547 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6548 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6549
6550 /* The user has told us exactly how many registers to return
6551 information about, via `nmatch'. We have to pass that on to the
6552 matching routines. */
6553 private_preg.regs_allocated = REGS_FIXED;
6554
6555 if (want_reg_info)
6556 {
6557 regs.num_regs = nmatch;
6558 regs.start = TALLOC (nmatch * 2, regoff_t);
6559 if (regs.start == NULL)
6560 return REG_NOMATCH;
6561 regs.end = regs.start + nmatch;
6562 }
6563
6564 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6565 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6566 was a little bit longer but still only matching the real part.
6567 This works because the `endline' will check for a '\n' and will find a
6568 '\0', correctly deciding that this is not the end of a line.
6569 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6570 a convenient '\0' there. For all we know, the string could be preceded
6571 by '\n' which would throw things off. */
6572
6573 /* Perform the searching operation. */
6574 ret = re_search (&private_preg, string, len,
6575 /* start: */ 0, /* range: */ len,
6576 want_reg_info ? &regs : (struct re_registers *) 0);
6577
6578 /* Copy the register information to the POSIX structure. */
6579 if (want_reg_info)
6580 {
6581 if (ret >= 0)
6582 {
6583 unsigned r;
6584
6585 for (r = 0; r < nmatch; r++)
6586 {
6587 pmatch[r].rm_so = regs.start[r];
6588 pmatch[r].rm_eo = regs.end[r];
6589 }
6590 }
6591
6592 /* If we needed the temporary register info, free the space now. */
6593 free (regs.start);
6594 }
6595
6596 /* We want zero return to mean success, unlike `re_search'. */
6597 return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
6598 }
6599 WEAK_ALIAS (__regexec, regexec)
6600
6601
6602 /* Returns a message corresponding to an error code, ERR_CODE, returned
6603 from either regcomp or regexec. We don't use PREG here.
6604
6605 ERR_CODE was previously called ERRCODE, but that name causes an
6606 error with msvc8 compiler. */
6607
6608 size_t
6609 regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
6610 {
6611 const char *msg;
6612 size_t msg_size;
6613
6614 if (err_code < 0
6615 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6616 /* Only error codes returned by the rest of the code should be passed
6617 to this routine. If we are given anything else, or if other regex
6618 code generates an invalid error code, then the program has a bug.
6619 Dump core so we can fix it. */
6620 abort ();
6621
6622 msg = gettext (re_error_msgid[err_code]);
6623
6624 msg_size = strlen (msg) + 1; /* Includes the null. */
6625
6626 if (errbuf_size != 0)
6627 {
6628 if (msg_size > errbuf_size)
6629 {
6630 memcpy (errbuf, msg, errbuf_size - 1);
6631 errbuf[errbuf_size - 1] = 0;
6632 }
6633 else
6634 strcpy (errbuf, msg);
6635 }
6636
6637 return msg_size;
6638 }
6639 WEAK_ALIAS (__regerror, regerror)
6640
6641
6642 /* Free dynamically allocated space used by PREG. */
6643
6644 void
6645 regfree (regex_t *preg)
6646 {
6647 free (preg->buffer);
6648 preg->buffer = NULL;
6649
6650 preg->allocated = 0;
6651 preg->used = 0;
6652
6653 free (preg->fastmap);
6654 preg->fastmap = NULL;
6655 preg->fastmap_accurate = 0;
6656
6657 free (preg->translate);
6658 preg->translate = NULL;
6659 }
6660 WEAK_ALIAS (__regfree, regfree)
6661
6662 #endif /* not emacs */