X-Git-Url: https://git.hcoop.net/bpt/emacs.git/blobdiff_plain/4bb91c680ffdd961ce5b149157b9e891ae748114..5fbf2e842d97a96635a1ab1947ee59045c3fb76c:/src/regex.c diff --git a/src/regex.c b/src/regex.c index 71c9dfe450..38fc80437f 100644 --- a/src/regex.c +++ b/src/regex.c @@ -22,11 +22,11 @@ /* TODO: - structure the opcode space into opcode+flag. - merge with glibc's regex.[ch]. - - replace succeed_n + jump_n with a combined operation so that the counter - can simply be decremented when popping the failure_point without having - to stack up failure_count entries. - - get rid of `newline_anchor'. - */ + - replace (succeed_n + jump_n + set_number_at) with something that doesn't + need to modify the compiled regexp so that re_match can be reentrant. + - get rid of on_failure_jump_smart by doing the optimization in re_comp + rather than at run-time, so that re_match can be reentrant. +*/ /* AIX requires this to be the first thing in the file. */ #if defined _AIX && !defined REGEX_MALLOC @@ -47,6 +47,60 @@ # include #endif +/* Whether to use ISO C Amendment 1 wide char functions. + Those should not be used for Emacs since it uses its own. */ +#if defined _LIBC +#define WIDE_CHAR_SUPPORT 1 +#else +#define WIDE_CHAR_SUPPORT \ + (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs) +#endif + +/* For platform which support the ISO C amendement 1 functionality we + support user defined character classes. */ +#if WIDE_CHAR_SUPPORT +/* Solaris 2.5 has a bug: must be included before . */ +# include +# include +#endif + +#ifdef _LIBC +/* We have to keep the namespace clean. */ +# define regfree(preg) __regfree (preg) +# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) +# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) +# define regerror(errcode, preg, errbuf, errbuf_size) \ + __regerror(errcode, preg, errbuf, errbuf_size) +# define re_set_registers(bu, re, nu, st, en) \ + __re_set_registers (bu, re, nu, st, en) +# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ + __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) +# define re_match(bufp, string, size, pos, regs) \ + __re_match (bufp, string, size, pos, regs) +# define re_search(bufp, string, size, startpos, range, regs) \ + __re_search (bufp, string, size, startpos, range, regs) +# define re_compile_pattern(pattern, length, bufp) \ + __re_compile_pattern (pattern, length, bufp) +# define re_set_syntax(syntax) __re_set_syntax (syntax) +# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ + __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) +# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) + +/* Make sure we call libc's function even if the user overrides them. */ +# define btowc __btowc +# define iswctype __iswctype +# define wctype __wctype + +# define WEAK_ALIAS(a,b) weak_alias (a, b) + +/* We are also using some library internals. */ +# include +# include +# include +#else +# define WEAK_ALIAS(a,b) +#endif + /* This is for other GNU distributions with internationalized messages. */ #if HAVE_LIBINTL_H || defined _LIBC # include @@ -74,8 +128,17 @@ # include "charset.h" # include "category.h" +# ifdef malloc +# undef malloc +# endif # define malloc xmalloc +# ifdef realloc +# undef realloc +# endif # define realloc xrealloc +# ifdef free +# undef free +# endif # define free xfree /* Converts the pointer to the char to BEG-based offset from the start. */ @@ -222,7 +285,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 }; ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \ : 1) -# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ +# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ : 1) @@ -504,7 +567,7 @@ typedef enum is followed by a range table: 2 bytes of flags for character sets (low 8 bits, high 8 bits) See RANGE_TABLE_WORK_BITS below. - 2 bytes, the number of pairs that follow + 2 bytes, the number of pairs that follow (upto 32767) pairs, each 2 multibyte characters, each multibyte character represented as 3 bytes. */ charset, @@ -651,7 +714,7 @@ static void extract_number _RE_ARGS ((int *dest, re_char *source)); static void extract_number (dest, source) int *dest; - unsigned char *source; + re_char *source; { int temp = SIGN_EXTEND_CHAR (*(source + 1)); *dest = *source & 0377; @@ -680,7 +743,7 @@ static void extract_number_and_incr _RE_ARGS ((int *destination, static void extract_number_and_incr (destination, source) int *destination; - unsigned char **source; + re_char **source; { extract_number (destination, *source); *source += 2; @@ -754,9 +817,9 @@ extract_number_and_incr (destination, source) #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \ do \ { \ - int range_start, range_end; \ - unsigned char *p; \ - unsigned char *range_table_end \ + re_wchar_t range_start, range_end; \ + re_char *p; \ + re_char *range_table_end \ = CHARSET_RANGE_TABLE_END ((range_table), (count)); \ \ for (p = (range_table); p < range_table_end; p += 2 * 3) \ @@ -780,8 +843,8 @@ extract_number_and_incr (destination, source) { \ /* Number of ranges in range table. */ \ int count; \ - unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \ - \ + re_char *range_table = CHARSET_RANGE_TABLE (charset); \ + \ EXTRACT_NUMBER_AND_INCR (count, range_table); \ CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \ } \ @@ -850,12 +913,12 @@ print_fastmap (fastmap) void print_partial_compiled_pattern (start, end) - unsigned char *start; - unsigned char *end; + re_char *start; + re_char *end; { int mcnt, mcnt2; - unsigned char *p = start; - unsigned char *pend = end; + re_char *p = start; + re_char *pend = end; if (start == NULL) { @@ -1093,7 +1156,7 @@ void print_compiled_pattern (bufp) struct re_pattern_buffer *bufp; { - unsigned char *buffer = bufp->buffer; + re_char *buffer = bufp->buffer; print_partial_compiled_pattern (buffer, buffer + bufp->used); printf ("%ld bytes used/%ld bytes allocated.\n", @@ -1108,7 +1171,6 @@ print_compiled_pattern (bufp) printf ("re_nsub: %d\t", bufp->re_nsub); printf ("regs_alloc: %d\t", bufp->regs_allocated); printf ("can_be_null: %d\t", bufp->can_be_null); - printf ("newline_anchor: %d\n", bufp->newline_anchor); printf ("no_sub: %d\t", bufp->no_sub); printf ("not_bol: %d\t", bufp->not_bol); printf ("not_eol: %d\t", bufp->not_eol); @@ -1184,6 +1246,7 @@ re_set_syntax (syntax) re_syntax_options = syntax; return ret; } +WEAK_ALIAS (__re_set_syntax, re_set_syntax) /* This table gives an error message for each of the error codes listed in regex.h. Obviously the order here has to be same as there. @@ -1264,21 +1327,22 @@ static const char *re_error_msgid[] = /* Roughly the maximum number of failure points on the stack. Would be exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed. This is a variable only so users of regex can assign to it; we never - change it ourselves. */ -#if defined MATCH_MAY_ALLOCATE -/* Note that 4400 is enough to cause a crash on Alpha OSF/1, + change it ourselves. */ +# if defined MATCH_MAY_ALLOCATE +/* Note that 4400 was enough to cause a crash on Alpha OSF/1, whose default stack limit is 2mb. In order for a larger value to work reliably, you have to try to make it accord with the process stack limit. */ -int re_max_failures = 40000; -#else -int re_max_failures = 4000; -#endif +size_t re_max_failures = 40000; +# else +size_t re_max_failures = 4000; +# endif union fail_stack_elt { - const unsigned char *pointer; - unsigned int integer; + re_char *pointer; + /* This should be the biggest `int' that's no bigger than a pointer. */ + long integer; }; typedef union fail_stack_elt fail_stack_elt_t; @@ -1286,12 +1350,11 @@ typedef union fail_stack_elt fail_stack_elt_t; typedef struct { fail_stack_elt_t *stack; - unsigned size; - unsigned avail; /* Offset of next open position. */ - unsigned frame; /* Offset of the cur constructed frame. */ + size_t size; + size_t avail; /* Offset of next open position. */ + size_t frame; /* Offset of the cur constructed frame. */ } fail_stack_type; -#define PATTERN_STACK_EMPTY() (fail_stack.avail == 0) #define FAIL_STACK_EMPTY() (fail_stack.frame == 0) #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) @@ -1363,22 +1426,11 @@ typedef struct 1))) -/* Push pointer POINTER on FAIL_STACK. - Return 1 if was able to do so and 0 if ran out of memory allocating - space to do so. */ -#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ - ((FAIL_STACK_FULL () \ - && !GROW_FAIL_STACK (FAIL_STACK)) \ - ? 0 \ - : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ - 1)) -#define POP_PATTERN_OP() POP_FAILURE_POINTER () - /* Push a pointer value onto the failure stack. Assumes the variable `fail_stack'. Probably should only be called from within `PUSH_FAILURE_POINT'. */ #define PUSH_FAILURE_POINTER(item) \ - fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) + fail_stack.stack[fail_stack.avail++].pointer = (item) /* This pushes an integer-valued item onto the failure stack. Assumes the variable `fail_stack'. Probably should only @@ -1428,16 +1480,19 @@ do { \ PUSH_FAILURE_INT (num); \ } while (0) -#define PUSH_FAILURE_COUNT(ptr) \ +/* Change the counter's value to VAL, but make sure that it will + be reset when backtracking. */ +#define PUSH_NUMBER(ptr,val) \ do { \ char *destination; \ int c; \ ENSURE_FAIL_STACK(3); \ EXTRACT_NUMBER (c, ptr); \ - DEBUG_PRINT3 (" Push counter %p = %d\n", ptr, c); \ + DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \ PUSH_FAILURE_INT (c); \ PUSH_FAILURE_POINTER (ptr); \ PUSH_FAILURE_INT (-1); \ + STORE_NUMBER (ptr, val); \ } while (0) /* Pop a saved register off the stack. */ @@ -1447,6 +1502,7 @@ do { \ if (reg == -1) \ { \ /* It's a counter. */ \ + /* Here, we discard `const', making re_match non-reentrant. */ \ unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \ reg = POP_FAILURE_INT (); \ STORE_NUMBER (ptr, reg); \ @@ -1553,14 +1609,14 @@ do { \ while (fail_stack.frame < fail_stack.avail) \ POP_FAILURE_REG_OR_COUNT (); \ \ - pat = (unsigned char *) POP_FAILURE_POINTER (); \ + pat = POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" Popping pattern %p: ", pat); \ DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ \ /* If the saved string location is NULL, it came from an \ on_failure_keep_string_jump opcode, and we want to throw away the \ saved NULL, thus retaining our current position in the string. */ \ - str = (re_char *) POP_FAILURE_POINTER (); \ + str = POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" Popping string %p: `", str); \ DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ DEBUG_PRINT1 ("'\n"); \ @@ -1591,20 +1647,18 @@ static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)); static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)); -static boolean at_begline_loc_p _RE_ARGS ((const unsigned char *pattern, - const unsigned char *p, +static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern, + re_char *p, reg_syntax_t syntax)); -static boolean at_endline_loc_p _RE_ARGS ((const unsigned char *p, - const unsigned char *pend, +static boolean at_endline_loc_p _RE_ARGS ((re_char *p, + re_char *pend, reg_syntax_t syntax)); -static unsigned char *skip_one_char _RE_ARGS ((unsigned char *p)); -static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend, +static re_char *skip_one_char _RE_ARGS ((re_char *p)); +static int analyse_first _RE_ARGS ((re_char *p, re_char *pend, char *fastmap, const int multibyte)); /* Fetch the next character in the uncompiled pattern---translating it - if necessary. Also cast from a signed character in the constant - string passed to us by the user to an unsigned char that we can use - as an array index (in, e.g., `translate'). */ + if necessary. */ #define PATFETCH(c) \ do { \ PATFETCH_RAW (c); \ @@ -1639,7 +1693,7 @@ static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend, /* Make sure we have at least N more bytes of space in buffer. */ #define GET_BUFFER_SPACE(n) \ - while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ + while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \ EXTEND_BUFFER () /* Make sure we have one more byte of buffer space and then add C to it. */ @@ -1728,13 +1782,13 @@ static int analyse_first _RE_ARGS ((unsigned char *p, unsigned char *pend, #endif #define EXTEND_BUFFER() \ do { \ - unsigned char *old_buffer = bufp->buffer; \ + re_char *old_buffer = bufp->buffer; \ if (bufp->allocated == MAX_BUF_SIZE) \ return REG_ESIZE; \ bufp->allocated <<= 1; \ if (bufp->allocated > MAX_BUF_SIZE) \ bufp->allocated = MAX_BUF_SIZE; \ - bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ + RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \ if (bufp->buffer == NULL) \ return REG_ESPACE; \ /* If the buffer moved, move all the pointers into it. */ \ @@ -1826,21 +1880,14 @@ struct range_table_work_area #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \ (work_area).bits |= (bit) -/* These bits represent the various character classes such as [:alnum:] - in a charset's range table. */ -#define BIT_ALNUM 0x1 -#define BIT_ALPHA 0x2 -#define BIT_WORD 0x4 -#define BIT_ASCII 0x8 -#define BIT_NONASCII 0x10 -#define BIT_GRAPH 0x20 -#define BIT_LOWER 0x40 -#define BIT_PRINT 0x80 -#define BIT_PUNCT 0x100 -#define BIT_SPACE 0x200 -#define BIT_UPPER 0x400 -#define BIT_UNIBYTE 0x800 -#define BIT_MULTIBYTE 0x1000 +/* Bits used to implement the multibyte-part of the various character classes + such as [:alnum:] in a charset's range table. */ +#define BIT_WORD 0x1 +#define BIT_LOWER 0x2 +#define BIT_PUNCT 0x4 +#define BIT_SPACE 0x8 +#define BIT_UPPER 0x10 +#define BIT_MULTIBYTE 0x20 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ @@ -1864,9 +1911,7 @@ struct range_table_work_area /* Set the bit for character C in a list. */ -#define SET_LIST_BIT(c) \ - (b[((unsigned char) (c)) / BYTEWIDTH] \ - |= 1 << (((unsigned char) c) % BYTEWIDTH)) +#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH)) /* Get the next unsigned number in the uncompiled pattern. */ @@ -1886,23 +1931,127 @@ struct range_table_work_area } \ } while (0) -#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ - -#define IS_CHAR_CLASS(string) \ - (STREQ (string, "alpha") || STREQ (string, "upper") \ - || STREQ (string, "lower") || STREQ (string, "digit") \ - || STREQ (string, "alnum") || STREQ (string, "xdigit") \ - || STREQ (string, "space") || STREQ (string, "print") \ - || STREQ (string, "punct") || STREQ (string, "graph") \ - || STREQ (string, "cntrl") || STREQ (string, "blank") \ - || STREQ (string, "word") \ - || STREQ (string, "ascii") || STREQ (string, "nonascii") \ - || STREQ (string, "unibyte") || STREQ (string, "multibyte")) - -/* QUIT is only used on NTemacs. */ -#if !defined WINDOWSNT || !defined emacs || !defined QUIT -# undef QUIT -# define QUIT +#if WIDE_CHAR_SUPPORT +/* The GNU C library provides support for user-defined character classes + and the functions from ISO C amendement 1. */ +# ifdef CHARCLASS_NAME_MAX +# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX +# else +/* This shouldn't happen but some implementation might still have this + problem. Use a reasonable default value. */ +# define CHAR_CLASS_MAX_LENGTH 256 +# endif +typedef wctype_t re_wctype_t; +typedef wchar_t re_wchar_t; +# define re_wctype wctype +# define re_iswctype iswctype +# define re_wctype_to_bit(cc) 0 +#else +# define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */ +# define btowc(c) c + +/* Character classes. */ +typedef enum { RECC_ERROR = 0, + RECC_ALNUM, RECC_ALPHA, RECC_WORD, + RECC_GRAPH, RECC_PRINT, + RECC_LOWER, RECC_UPPER, + RECC_PUNCT, RECC_CNTRL, + RECC_DIGIT, RECC_XDIGIT, + RECC_BLANK, RECC_SPACE, + RECC_MULTIBYTE, RECC_NONASCII, + RECC_ASCII, RECC_UNIBYTE +} re_wctype_t; + +typedef int re_wchar_t; + +/* Map a string to the char class it names (if any). */ +static re_wctype_t +re_wctype (string) + re_char *string; +{ + if (STREQ (string, "alnum")) return RECC_ALNUM; + else if (STREQ (string, "alpha")) return RECC_ALPHA; + else if (STREQ (string, "word")) return RECC_WORD; + else if (STREQ (string, "ascii")) return RECC_ASCII; + else if (STREQ (string, "nonascii")) return RECC_NONASCII; + else if (STREQ (string, "graph")) return RECC_GRAPH; + else if (STREQ (string, "lower")) return RECC_LOWER; + else if (STREQ (string, "print")) return RECC_PRINT; + else if (STREQ (string, "punct")) return RECC_PUNCT; + else if (STREQ (string, "space")) return RECC_SPACE; + else if (STREQ (string, "upper")) return RECC_UPPER; + else if (STREQ (string, "unibyte")) return RECC_UNIBYTE; + else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE; + else if (STREQ (string, "digit")) return RECC_DIGIT; + else if (STREQ (string, "xdigit")) return RECC_XDIGIT; + else if (STREQ (string, "cntrl")) return RECC_CNTRL; + else if (STREQ (string, "blank")) return RECC_BLANK; + else return 0; +} + +/* True iff CH is in the char class CC. */ +static boolean +re_iswctype (ch, cc) + int ch; + re_wctype_t cc; +{ + switch (cc) + { + case RECC_ALNUM: return ISALNUM (ch); + case RECC_ALPHA: return ISALPHA (ch); + case RECC_BLANK: return ISBLANK (ch); + case RECC_CNTRL: return ISCNTRL (ch); + case RECC_DIGIT: return ISDIGIT (ch); + case RECC_GRAPH: return ISGRAPH (ch); + case RECC_LOWER: return ISLOWER (ch); + case RECC_PRINT: return ISPRINT (ch); + case RECC_PUNCT: return ISPUNCT (ch); + case RECC_SPACE: return ISSPACE (ch); + case RECC_UPPER: return ISUPPER (ch); + case RECC_XDIGIT: return ISXDIGIT (ch); + case RECC_ASCII: return IS_REAL_ASCII (ch); + case RECC_NONASCII: return !IS_REAL_ASCII (ch); + case RECC_UNIBYTE: return ISUNIBYTE (ch); + case RECC_MULTIBYTE: return !ISUNIBYTE (ch); + case RECC_WORD: return ISWORD (ch); + case RECC_ERROR: return false; + default: + abort(); + } +} + +/* Return a bit-pattern to use in the range-table bits to match multibyte + chars of class CC. */ +static int +re_wctype_to_bit (cc) + re_wctype_t cc; +{ + switch (cc) + { + case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: + case RECC_MULTIBYTE: return BIT_MULTIBYTE; + case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD; + case RECC_LOWER: return BIT_LOWER; + case RECC_UPPER: return BIT_UPPER; + case RECC_PUNCT: return BIT_PUNCT; + case RECC_SPACE: return BIT_SPACE; + case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: + case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; + default: + abort(); + } +} +#endif + +/* Explicit quit checking is only used on NTemacs. */ +#if defined WINDOWSNT && defined emacs && defined QUIT +extern int immediate_quit; +# define IMMEDIATE_QUIT_CHECK \ + do { \ + if (immediate_quit) QUIT; \ + } while (0) +#else +# define IMMEDIATE_QUIT_CHECK ((void)0) #endif #ifndef MATCH_MAY_ALLOCATE @@ -1963,8 +2112,7 @@ static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type `re_nsub' is the number of subexpressions in PATTERN; `not_bol' and `not_eol' are zero; - The `fastmap' and `newline_anchor' fields are neither - examined nor set. */ + The `fastmap' field is neither examined nor set. */ /* Insert the `jump' from the end of last alternative to "here". The space for the jump has already been allocated. */ @@ -1990,10 +2138,8 @@ regex_compile (pattern, size, syntax, bufp) reg_syntax_t syntax; struct re_pattern_buffer *bufp; { - /* We fetch characters from PATTERN here. Even though PATTERN is - `char *' (i.e., signed), we declare these variables as unsigned, so - they can be reliably used as array indices. */ - register unsigned int c, c1; + /* We fetch characters from PATTERN here. */ + register re_wchar_t c, c1; /* A random temporary spot in PATTERN. */ re_char *p1; @@ -2126,7 +2272,7 @@ regex_compile (pattern, size, syntax, bufp) || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's come before. */ || at_begline_loc_p (pattern, p, syntax)) - BUF_PUSH (begline); + BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline); else goto normal_char; } @@ -2141,7 +2287,7 @@ regex_compile (pattern, size, syntax, bufp) || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's next. */ || at_endline_loc_p (p, pend, syntax)) - BUF_PUSH (endline); + BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline); else goto normal_char; } @@ -2220,6 +2366,7 @@ regex_compile (pattern, size, syntax, bufp) boolean simple = skip_one_char (laststart) == b; unsigned int startoffset = 0; re_opcode_t ofj = + /* Check if the loop can match the empty string. */ (simple || !analyse_first (laststart, b, NULL, 0)) ? on_failure_jump : on_failure_jump_loop; assert (skip_one_char (laststart) <= b); @@ -2374,7 +2521,7 @@ regex_compile (pattern, size, syntax, bufp) syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') { /* Leave room for the null. */ - char str[CHAR_CLASS_MAX_LENGTH + 1]; + unsigned char str[CHAR_CLASS_MAX_LENGTH + 1]; const unsigned char *class_beg; PATFETCH (c); @@ -2386,11 +2533,14 @@ regex_compile (pattern, size, syntax, bufp) for (;;) { - PATFETCH (c); - if (c == ':' || c == ']' || p == pend - || c1 == CHAR_CLASS_MAX_LENGTH) - break; - str[c1++] = c; + PATFETCH (c); + if ((c == ':' && *p == ']') || p == pend) + break; + if (c1 < CHAR_CLASS_MAX_LENGTH) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; } str[c1] = '\0'; @@ -2401,89 +2551,34 @@ regex_compile (pattern, size, syntax, bufp) if (c == ':' && *p == ']') { int ch; - boolean is_alnum = STREQ (str, "alnum"); - boolean is_alpha = STREQ (str, "alpha"); - boolean is_ascii = STREQ (str, "ascii"); - boolean is_blank = STREQ (str, "blank"); - boolean is_cntrl = STREQ (str, "cntrl"); - boolean is_digit = STREQ (str, "digit"); - boolean is_graph = STREQ (str, "graph"); - boolean is_lower = STREQ (str, "lower"); - boolean is_multibyte = STREQ (str, "multibyte"); - boolean is_nonascii = STREQ (str, "nonascii"); - boolean is_print = STREQ (str, "print"); - boolean is_punct = STREQ (str, "punct"); - boolean is_space = STREQ (str, "space"); - boolean is_unibyte = STREQ (str, "unibyte"); - boolean is_upper = STREQ (str, "upper"); - boolean is_word = STREQ (str, "word"); - boolean is_xdigit = STREQ (str, "xdigit"); - - if (!IS_CHAR_CLASS (str)) + re_wctype_t cc; + + cc = re_wctype (str); + + if (cc == 0) FREE_STACK_RETURN (REG_ECTYPE); - /* Throw away the ] at the end of the character - class. */ - PATFETCH (c); + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); - if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); /* Most character classes in a multibyte match just set a flag. Exceptions are is_blank, is_digit, is_cntrl, and is_xdigit, since they can only match ASCII characters. We - don't need to handle them for multibyte. */ + don't need to handle them for multibyte. + They are distinguished by a negative wctype. */ if (multibyte) - { - int bit = 0; - - if (is_alnum) bit = BIT_ALNUM; - if (is_alpha) bit = BIT_ALPHA; - if (is_ascii) bit = BIT_ASCII; - if (is_graph) bit = BIT_GRAPH; - if (is_lower) bit = BIT_LOWER; - if (is_multibyte) bit = BIT_MULTIBYTE; - if (is_nonascii) bit = BIT_NONASCII; - if (is_print) bit = BIT_PRINT; - if (is_punct) bit = BIT_PUNCT; - if (is_space) bit = BIT_SPACE; - if (is_unibyte) bit = BIT_UNIBYTE; - if (is_upper) bit = BIT_UPPER; - if (is_word) bit = BIT_WORD; - if (bit) - SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, - bit); - } + SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, + re_wctype_to_bit (cc)); - /* Handle character classes for ASCII characters. */ - for (ch = 0; ch < 1 << BYTEWIDTH; ch++) + for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) { int translated = TRANSLATE (ch); - /* This was split into 3 if's to - avoid an arbitrary limit in some compiler. */ - if ( (is_alnum && ISALNUM (ch)) - || (is_alpha && ISALPHA (ch)) - || (is_blank && ISBLANK (ch)) - || (is_cntrl && ISCNTRL (ch))) - SET_LIST_BIT (translated); - if ( (is_digit && ISDIGIT (ch)) - || (is_graph && ISGRAPH (ch)) - || (is_lower && ISLOWER (ch)) - || (is_print && ISPRINT (ch))) - SET_LIST_BIT (translated); - if ( (is_punct && ISPUNCT (ch)) - || (is_space && ISSPACE (ch)) - || (is_upper && ISUPPER (ch)) - || (is_xdigit && ISXDIGIT (ch))) - SET_LIST_BIT (translated); - if ( (is_ascii && IS_REAL_ASCII (ch)) - || (is_nonascii && !IS_REAL_ASCII (ch)) - || (is_unibyte && ISUNIBYTE (ch)) - || (is_multibyte && !ISUNIBYTE (ch))) - SET_LIST_BIT (translated); - - if ( (is_word && ISWORD (ch))) + if (re_iswctype (btowc (ch), cc)) SET_LIST_BIT (translated); } @@ -2516,18 +2611,19 @@ regex_compile (pattern, size, syntax, bufp) { if (! SINGLE_BYTE_CHAR_P (c1)) { - /* Handle a range such as \177-\377 in - multibyte mode. Split that into two - ranges, the low one ending at 0237, and - the high one starting at the smallest - character in the charset of C1 and - ending at C1. */ + /* Handle a range starting with a + character of less than 256, and ending + with a character of not less than 256. + Split that into two ranges, the low one + ending at 0377, and the high one + starting at the smallest character in + the charset of C1 and ending at C1. */ int charset = CHAR_CHARSET (c1); int c2 = MAKE_CHAR (charset, 0, 0); SET_RANGE_TABLE_WORK_AREA (range_table_work, c2, c1); - c1 = 0237; + c1 = 0377; } } else if (!SAME_CHARSET_P (c, c1)) @@ -2541,7 +2637,7 @@ regex_compile (pattern, size, syntax, bufp) if (SINGLE_BYTE_CHAR_P (c)) /* ... into bitmap. */ { - unsigned this_char; + re_wchar_t this_char; int range_start = c, range_end = c1; /* If the start is after the end, the range is empty. */ @@ -3065,20 +3161,21 @@ regex_compile (pattern, size, syntax, bufp) case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - if (syntax & RE_NO_BK_REFS) - goto normal_char; + { + regnum_t reg; - c1 = c - '0'; + if (syntax & RE_NO_BK_REFS) + goto normal_backslash; - if (c1 > regnum) - FREE_STACK_RETURN (REG_ESUBREG); + reg = c - '0'; - /* Can't back reference to a subexpression if inside of it. */ - if (group_in_compile_stack (compile_stack, (regnum_t) c1)) - goto normal_char; + /* Can't back reference to a subexpression before its end. */ + if (reg > regnum || group_in_compile_stack (compile_stack, reg)) + FREE_STACK_RETURN (REG_ESUBREG); - laststart = b; - BUF_PUSH_2 (duplicate, c1); + laststart = b; + BUF_PUSH_2 (duplicate, reg); + } break; @@ -3277,10 +3374,10 @@ insert_op2 (op, loc, arg1, arg2, end) static boolean at_begline_loc_p (pattern, p, syntax) - const unsigned char *pattern, *p; + re_char *pattern, *p; reg_syntax_t syntax; { - const unsigned char *prev = p - 2; + re_char *prev = p - 2; boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; return @@ -3301,12 +3398,12 @@ at_begline_loc_p (pattern, p, syntax) static boolean at_endline_loc_p (p, pend, syntax) - const unsigned char *p, *pend; + re_char *p, *pend; reg_syntax_t syntax; { - const unsigned char *next = p; + re_char *next = p; boolean next_backslash = *next == '\\'; - const unsigned char *next_next = p + 1 < pend ? p + 1 : 0; + re_char *next_next = p + 1 < pend ? p + 1 : 0; return /* Before a subexpression? */ @@ -3345,36 +3442,16 @@ group_in_compile_stack (compile_stack, regnum) Return 1 if p..pend might match the empty string. Return 0 if p..pend matches at least one char. - Return -1 if p..pend matches at least one char, but fastmap was not - updated accurately. - Return -2 if an error occurred. */ + Return -1 if fastmap was not updated accurately. */ static int analyse_first (p, pend, fastmap, multibyte) - unsigned char *p, *pend; + re_char *p, *pend; char *fastmap; const int multibyte; { int j, k; boolean not; -#ifdef MATCH_MAY_ALLOCATE - fail_stack_type fail_stack; -#endif -#ifndef REGEX_MALLOC - char *destination; -#endif - -#if defined REL_ALLOC && defined REGEX_MALLOC - /* This holds the pointer to the failure stack, when - it is allocated relocatably. */ - fail_stack_elt_t *failure_stack_ptr; -#endif - - /* Assume that each path through the pattern can be null until - proven otherwise. We set this false at the bottom of switch - statement, to which we get only if a particular path doesn't - match the empty string. */ - boolean path_can_be_null = true; /* If all elements for base leading-codes in fastmap is set, this flag is set true. */ @@ -3382,8 +3459,6 @@ analyse_first (p, pend, fastmap, multibyte) assert (p); - INIT_FAIL_STACK (); - /* The loop below works as follows: - It has a working-list kept in the PATTERN_STACK and which basically starts by only containing a pointer to the first operation. @@ -3399,8 +3474,7 @@ analyse_first (p, pend, fastmap, multibyte) so that `p' is monotonically increasing. More to the point, we never set `p' (or push) anything `<= p1'. */ - /* If can_be_null is set, then the fastmap will not be used anyway. */ - while (1) + while (p < pend) { /* `p1' is used as a marker of how far back a `on_failure_jump' can go without being ignored. It is normally equal to `p' @@ -3410,29 +3484,12 @@ analyse_first (p, pend, fastmap, multibyte) 3..9: 10: on_failure_jump 3 as used for the *? operator. */ - unsigned char *p1 = p; - - if (p >= pend) - { - if (path_can_be_null) - return (RESET_FAIL_STACK (), 1); - - /* We have reached the (effective) end of pattern. */ - if (PATTERN_STACK_EMPTY ()) - return (RESET_FAIL_STACK (), 0); - - p = (unsigned char*) POP_PATTERN_OP (); - path_can_be_null = true; - continue; - } - - /* We should never be about to go beyond the end of the pattern. */ - assert (p < pend); + re_char *p1 = p; switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) { case succeed: - p = pend; + return 1; continue; case duplicate: @@ -3464,7 +3521,7 @@ analyse_first (p, pend, fastmap, multibyte) /* We could put all the chars except for \n (and maybe \0) but we don't bother since it is generally not worth it. */ if (!fastmap) break; - return (RESET_FAIL_STACK (), -1); + return -1; case charset_not: @@ -3539,7 +3596,7 @@ analyse_first (p, pend, fastmap, multibyte) #else /* emacs */ /* This match depends on text properties. These end with aborting optimizations. */ - return (RESET_FAIL_STACK (), -1); + return -1; case categoryspec: case notcategoryspec: @@ -3606,8 +3663,14 @@ analyse_first (p, pend, fastmap, multibyte) EXTRACT_NUMBER_AND_INCR (j, p); if (p + j <= p1) ; /* Backward jump to be ignored. */ - else if (!PUSH_PATTERN_OP (p + j, fail_stack)) - return (RESET_FAIL_STACK (), -2); + else + { /* We have to look down both arms. + We first go down the "straight" path so as to minimize + stack usage when going through alternatives. */ + int r = analyse_first (p, pend, fastmap, multibyte); + if (r) return r; + p += j; + } continue; @@ -3647,15 +3710,13 @@ analyse_first (p, pend, fastmap, multibyte) /* Getting here means we have found the possible starting characters for one path of the pattern -- and that the empty - string does not match. We need not follow this path further. - Instead, look at the next alternative (remembered on the - stack), or quit if no more. The test at the top of the loop - does these things. */ - path_can_be_null = false; - p = pend; + string does not match. We need not follow this path further. */ + return 0; } /* while p */ - return (RESET_FAIL_STACK (), 0); + /* We reached the end without matching anything. */ + return 1; + } /* analyse_first */ /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in @@ -3689,8 +3750,6 @@ re_compile_fastmap (bufp) analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used, fastmap, RE_MULTIBYTE_P (bufp)); - if (analysis < -1) - return analysis; bufp->can_be_null = (analysis != 0); return 0; } /* re_compile_fastmap */ @@ -3729,6 +3788,7 @@ re_set_registers (bufp, regs, num_regs, starts, ends) regs->start = regs->end = (regoff_t *) 0; } } +WEAK_ALIAS (__re_set_registers, re_set_registers) /* Searching routines. */ @@ -3745,6 +3805,7 @@ re_search (bufp, string, size, startpos, range, regs) return re_search_2 (bufp, NULL, 0, string, size, startpos, range, regs, size); } +WEAK_ALIAS (__re_search, re_search) /* End address of virtual concatenation of string. */ #define STOP_ADDR_VSTRING(P) \ @@ -3792,7 +3853,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) register RE_TRANSLATE_TYPE translate = bufp->translate; int total_size = size1 + size2; int endpos = startpos + range; - int anchored_start = 0; + boolean anchored_start; /* Nonzero if we have to concern multibyte character. */ const boolean multibyte = RE_MULTIBYTE_P (bufp); @@ -3832,12 +3893,10 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) /* Update the fastmap now if not correct already. */ if (fastmap && !bufp->fastmap_accurate) - if (re_compile_fastmap (bufp) == -2) - return -2; + re_compile_fastmap (bufp); /* See whether the pattern is anchored. */ - if (bufp->buffer[0] == begline) - anchored_start = 1; + anchored_start = (bufp->buffer[0] == begline); #ifdef emacs gl_state.object = re_match_object; @@ -3857,10 +3916,9 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) because that case doesn't repeat. */ if (anchored_start && startpos > 0) { - if (! (bufp->newline_anchor - && ((startpos <= size1 ? string1[startpos - 1] - : string2[startpos - size1 - 1]) - == '\n'))) + if (! ((startpos <= size1 ? string1[startpos - 1] + : string2[startpos - size1 - 1]) + == '\n')) goto advance; } @@ -3871,7 +3929,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) if (fastmap && startpos < total_size && !bufp->can_be_null) { register re_char *d; - register unsigned int buf_ch; + register re_wchar_t buf_ch; d = POS_ADDR_VSTRING (startpos); @@ -4009,6 +4067,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) } return -1; } /* re_search_2 */ +WEAK_ALIAS (__re_search_2, re_search_2) /* Declarations and macros for re_match_2. */ @@ -4103,9 +4162,9 @@ static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2, /* If the operation is a match against one or more chars, return a pointer to the next operation, else return NULL. */ -static unsigned char * +static re_char * skip_one_char (p) - unsigned char *p; + re_char *p; { switch (SWITCH_ENUM_CAST (*p++)) { @@ -4213,14 +4272,11 @@ mutually_exclusive_p (bufp, p1, p2) break; case endline: - if (!bufp->newline_anchor) - break; - /* Fallthrough */ case exactn: { - register unsigned int c + register re_wchar_t c = (re_opcode_t) *p2 == endline ? '\n' - : RE_STRING_CHAR(p2 + 2, pend - p2 - 2); + : RE_STRING_CHAR (p2 + 2, pend - p2 - 2); if ((re_opcode_t) *p1 == exactn) { @@ -4265,13 +4321,11 @@ mutually_exclusive_p (bufp, p1, p2) break; case charset: - case charset_not: { if ((re_opcode_t) *p1 == exactn) /* Reuse the code above. */ return mutually_exclusive_p (bufp, p2, p1); - /* It is hard to list up all the character in charset P2 if it includes multibyte character. Give up in such case. */ @@ -4287,7 +4341,7 @@ mutually_exclusive_p (bufp, p1, p2) P2 is ASCII, it is enough to test only bitmap table of P1. */ - if (*p1 == *p2) + if ((re_opcode_t) *p1 == charset) { int idx; /* We win if the charset inside the loop @@ -4306,8 +4360,7 @@ mutually_exclusive_p (bufp, p1, p2) return 1; } } - else if ((re_opcode_t) *p1 == charset - || (re_opcode_t) *p1 == charset_not) + else if ((re_opcode_t) *p1 == charset_not) { int idx; /* We win if the charset_not inside the loop lists @@ -4326,7 +4379,24 @@ mutually_exclusive_p (bufp, p1, p2) } } } + break; + case charset_not: + switch (SWITCH_ENUM_CAST (*p1)) + { + case exactn: + case charset: + /* Reuse the code above. */ + return mutually_exclusive_p (bufp, p2, p1); + case charset_not: + /* When we have two charset_not, it's very unlikely that + they don't overlap. The union of the two sets of excluded + chars should cover all possible chars, which, as a matter of + fact, is virtually impossible in multibyte buffers. */ + ; + } + break; + case wordend: case notsyntaxspec: return ((re_opcode_t) *p1 == syntaxspec @@ -4377,6 +4447,7 @@ re_match (bufp, string, size, pos, regs) # endif return result; } +WEAK_ALIAS (__re_match, re_match) #endif /* not emacs */ #ifdef emacs @@ -4424,6 +4495,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) #endif return result; } +WEAK_ALIAS (__re_match_2, re_match_2) /* This is a separate function so that we can force an alloca cleanup afterwards. */ @@ -4438,8 +4510,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* General temporaries. */ int mcnt; + size_t reg; boolean not; - unsigned char *p1; /* Just past the end of the corresponding string. */ re_char *end1, *end2; @@ -4458,8 +4530,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) re_char *dfail; /* Where we are in the pattern, and the end of the pattern. */ - unsigned char *p = bufp->buffer; - register unsigned char *pend = p + bufp->used; + re_char *p = bufp->buffer; + re_char *pend = p + bufp->used; /* We use this to map every character in the string. */ RE_TRANSLATE_TYPE translate = bufp->translate; @@ -4568,8 +4640,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Initialize subexpression text positions to -1 to mark ones that no start_memory/stop_memory has been seen for. Also initialize the register information struct. */ - for (mcnt = 1; mcnt < num_regs; mcnt++) - regstart[mcnt] = regend[mcnt] = NULL; + for (reg = 1; reg < num_regs; reg++) + regstart[reg] = regend[reg] = NULL; /* We move `string1' into `string2' if the latter's empty -- but not if `string1' is null. */ @@ -4671,10 +4743,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); - for (mcnt = 1; mcnt < num_regs; mcnt++) + for (reg = 1; reg < num_regs; reg++) { - best_regstart[mcnt] = regstart[mcnt]; - best_regend[mcnt] = regend[mcnt]; + best_regstart[reg] = regstart[reg]; + best_regend[reg] = regend[reg]; } } goto fail; @@ -4697,10 +4769,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) dend = ((d >= string1 && d <= end1) ? end_match_1 : end_match_2); - for (mcnt = 1; mcnt < num_regs; mcnt++) + for (reg = 1; reg < num_regs; reg++) { - regstart[mcnt] = best_regstart[mcnt]; - regend[mcnt] = best_regend[mcnt]; + regstart[reg] = best_regstart[reg]; + regend[reg] = best_regend[reg]; } } } /* d != end_match_2 */ @@ -4760,16 +4832,16 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Go through the first `min (num_regs, regs->num_regs)' registers, since that is all we initialized. */ - for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) + for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++) { - if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) - regs->start[mcnt] = regs->end[mcnt] = -1; + if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg])) + regs->start[reg] = regs->end[reg] = -1; else { - regs->start[mcnt] - = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]); - regs->end[mcnt] - = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); + regs->start[reg] + = (regoff_t) POINTER_TO_OFFSET (regstart[reg]); + regs->end[reg] + = (regoff_t) POINTER_TO_OFFSET (regend[reg]); } } @@ -4778,8 +4850,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) we (re)allocated the registers, this is the case, because we always allocate enough to have at least one -1 at the end. */ - for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) - regs->start[mcnt] = regs->end[mcnt] = -1; + for (reg = num_regs; reg < regs->num_regs; reg++) + regs->start[reg] = regs->end[reg] = -1; } /* regs && !bufp->no_sub */ DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", @@ -4877,7 +4949,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case anychar: { int buf_charlen; - unsigned int buf_ch; + re_wchar_t buf_ch; DEBUG_PRINT1 ("EXECUTING anychar.\n"); @@ -4906,7 +4978,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Start of actual range_table, or end of bitmap if there is no range table. */ - unsigned char *range_table; + re_char *range_table; /* Nonzero if there is a range table. */ int range_table_exists; @@ -4942,17 +5014,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]); - if ( (class_bits & BIT_ALNUM && ISALNUM (c)) - | (class_bits & BIT_ALPHA && ISALPHA (c)) - | (class_bits & BIT_ASCII && IS_REAL_ASCII (c)) - | (class_bits & BIT_GRAPH && ISGRAPH (c)) - | (class_bits & BIT_LOWER && ISLOWER (c)) - | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c)) - | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c)) - | (class_bits & BIT_PRINT && ISPRINT (c)) + if ( (class_bits & BIT_LOWER && ISLOWER (c)) + | (class_bits & BIT_MULTIBYTE) | (class_bits & BIT_PUNCT && ISPUNCT (c)) | (class_bits & BIT_SPACE && ISSPACE (c)) - | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c)) | (class_bits & BIT_UPPER && ISUPPER (c)) | (class_bits & BIT_WORD && ISWORD (c))) not = !not; @@ -5089,8 +5154,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* begline matches the empty string at the beginning of the string - (unless `not_bol' is set in `bufp'), and, if - `newline_anchor' is set, after newlines. */ + (unless `not_bol' is set in `bufp'), and after newlines. */ case begline: DEBUG_PRINT1 ("EXECUTING begline.\n"); @@ -5102,7 +5166,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { unsigned char c; GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2); - if (c == '\n' && bufp->newline_anchor) + if (c == '\n') break; } /* In all other cases, we fail. */ @@ -5120,7 +5184,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) else { PREFETCH_NOLIMIT (); - if (*d == '\n' && bufp->newline_anchor) + if (*d == '\n') break; } goto fail; @@ -5217,7 +5281,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) the repetition text and either the following jump or pop_failure_jump back to this on_failure_jump. */ case on_failure_jump: - QUIT; + IMMEDIATE_QUIT_CHECK; EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n", mcnt, p + mcnt); @@ -5233,13 +5297,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) then we can use a non-backtracking loop based on on_failure_keep_string_jump instead of on_failure_jump. */ case on_failure_jump_smart: - QUIT; + IMMEDIATE_QUIT_CHECK; EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n", mcnt, p + mcnt); { - unsigned char *p1 = p; /* Next operation. */ - unsigned char *p2 = p + mcnt; /* Destination of the jump. */ + re_char *p1 = p; /* Next operation. */ + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */ + unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */ p -= 3; /* Reset so that we will re-execute the instruction once it's been changed. */ @@ -5255,14 +5321,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* Use a fast `on_failure_keep_string_jump' loop. */ DEBUG_PRINT1 (" smart exclusive => fast loop.\n"); - *p = (unsigned char) on_failure_keep_string_jump; + *p3 = (unsigned char) on_failure_keep_string_jump; STORE_NUMBER (p2 - 2, mcnt + 3); } else { /* Default to a safe `on_failure_jump' loop. */ DEBUG_PRINT1 (" smart default => slow loop.\n"); - *p = (unsigned char) on_failure_jump; + *p3 = (unsigned char) on_failure_jump; } DEBUG_STATEMENT (debug -= 2); } @@ -5271,7 +5337,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Unconditionally jump (without popping any failure points). */ case jump: unconditional_jump: - QUIT; + IMMEDIATE_QUIT_CHECK; EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); p += mcnt; /* Do the jump. */ @@ -5282,17 +5348,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Have to succeed matching what follows at least n times. After that, handle like `on_failure_jump'. */ case succeed_n: + /* Signedness doesn't matter since we only compare MCNT to 0. */ EXTRACT_NUMBER (mcnt, p + 2); DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); /* Originally, mcnt is how many times we HAVE to succeed. */ if (mcnt != 0) { + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */ mcnt--; - p += 2; - PUSH_FAILURE_COUNT (p); - DEBUG_PRINT3 (" Setting %p to %d.\n", p, mcnt); - STORE_NUMBER_AND_INCR (p, mcnt); + p += 4; + PUSH_NUMBER (p2, mcnt); } else /* The two bytes encoding mcnt == 0 are two no_op opcodes. */ @@ -5300,15 +5367,17 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) break; case jump_n: + /* Signedness doesn't matter since we only compare MCNT to 0. */ EXTRACT_NUMBER (mcnt, p + 2); DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); /* Originally, this is how many times we CAN jump. */ if (mcnt != 0) { + /* Here, we discard `const', making re_match non-reentrant. */ + unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */ mcnt--; - PUSH_FAILURE_COUNT (p + 2); - STORE_NUMBER (p + 2, mcnt); + PUSH_NUMBER (p2, mcnt); goto unconditional_jump; } /* If don't have to jump any more, skip over the rest of command. */ @@ -5318,14 +5387,16 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case set_number_at: { + unsigned char *p2; /* Location of the counter. */ DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); EXTRACT_NUMBER_AND_INCR (mcnt, p); - p1 = p + mcnt; + /* Here, we discard `const', making re_match non-reentrant. */ + p2 = (unsigned char*) p + mcnt; + /* Signedness doesn't matter since we only copy MCNT's bits . */ EXTRACT_NUMBER_AND_INCR (mcnt, p); - DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt); - PUSH_FAILURE_COUNT (p1); - STORE_NUMBER (p1, mcnt); + DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt); + PUSH_NUMBER (p2, mcnt); break; } @@ -5343,7 +5414,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* C1 is the character before D, S1 is the syntax of C1, C2 is the character at D, and S2 is the syntax of C2. */ - int c1, c2, s1, s2; + re_wchar_t c1, c2; + int s1, s2; #ifdef emacs int offset = PTR_TO_OFFSET (d - 1); int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); @@ -5382,7 +5454,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* C1 is the character before D, S1 is the syntax of C1, C2 is the character at D, and S2 is the syntax of C2. */ - int c1, c2, s1, s2; + re_wchar_t c1, c2; + int s1, s2; #ifdef emacs int offset = PTR_TO_OFFSET (d); int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); @@ -5425,7 +5498,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) { /* C1 is the character before D, S1 is the syntax of C1, C2 is the character at D, and S2 is the syntax of C2. */ - int c1, c2, s1, s2; + re_wchar_t c1, c2; + int s1, s2; #ifdef emacs int offset = PTR_TO_OFFSET (d) - 1; int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); @@ -5470,7 +5544,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } #endif { - int c, len; + int len; + re_wchar_t c; c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); @@ -5506,7 +5581,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt); PREFETCH (); { - int c, len; + int len; + re_wchar_t c; + c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) @@ -5525,11 +5602,10 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* We goto here if a matching operation fails. */ fail: - QUIT; + IMMEDIATE_QUIT_CHECK; if (!FAIL_STACK_EMPTY ()) { - re_char *str; - unsigned char *pat; + re_char *str, *pat; /* A restart point is known. Restore to that state. */ DEBUG_PRINT1 ("\nFAIL:\n"); POP_FAILURE_POINT (str, pat); @@ -5599,7 +5675,7 @@ bcmp_translate (s1, s2, len, translate, multibyte) while (p1 < p1_end && p2 < p2_end) { int p1_charlen, p2_charlen; - int p1_ch, p2_ch; + re_wchar_t p1_ch, p2_ch; p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); @@ -5645,15 +5721,13 @@ re_compile_pattern (pattern, length, bufp) setting no_sub. */ bufp->no_sub = 0; - /* Match anchors at newline. */ - bufp->newline_anchor = 1; - ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp); if (!ret) return NULL; return gettext (re_error_msgid[(int) ret]); } +WEAK_ALIAS (__re_compile_pattern, re_compile_pattern) /* Entry points compatible with 4.2 BSD regex library. We don't define them unless specifically requested. */ @@ -5700,9 +5774,6 @@ re_comp (s) /* Since `re_exec' always passes NULL for the `regs' argument, we don't need to initialize the pattern buffer fields which affect it. */ - /* Match anchors at newlines. */ - re_comp_buf.newline_anchor = 1; - ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); if (!ret) @@ -5740,8 +5811,8 @@ re_exec (s) `syntax' to RE_SYNTAX_POSIX_EXTENDED if the REG_EXTENDED bit in CFLAGS is set; otherwise, to RE_SYNTAX_POSIX_BASIC; - `newline_anchor' to REG_NEWLINE being set in CFLAGS; - `fastmap' and `fastmap_accurate' to zero; + `fastmap' to an allocated space for the fastmap; + `fastmap_accurate' to zero; `re_nsub' to the number of subexpressions in PATTERN. PATTERN is the address of the pattern string. @@ -5780,11 +5851,8 @@ regcomp (preg, pattern, cflags) preg->allocated = 0; preg->used = 0; - /* Don't bother to use a fastmap when searching. This simplifies the - REG_NEWLINE case: if we used a fastmap, we'd have to put all the - characters after newlines into the fastmap. This way, we just try - every character. */ - preg->fastmap = 0; + /* Try to allocate space for the fastmap. */ + preg->fastmap = (char *) malloc (1 << BYTEWIDTH); if (cflags & REG_ICASE) { @@ -5808,11 +5876,9 @@ regcomp (preg, pattern, cflags) { /* REG_NEWLINE implies neither . nor [^...] match newline. */ syntax &= ~RE_DOT_NEWLINE; syntax |= RE_HAT_LISTS_NOT_NEWLINE; - /* It also changes the matching behavior. */ - preg->newline_anchor = 1; } else - preg->newline_anchor = 0; + syntax |= RE_NO_NEWLINE_ANCHOR; preg->no_sub = !!(cflags & REG_NOSUB); @@ -5822,10 +5888,22 @@ regcomp (preg, pattern, cflags) /* POSIX doesn't distinguish between an unmatched open-group and an unmatched close-group: both are REG_EPAREN. */ - if (ret == REG_ERPAREN) ret = REG_EPAREN; - + if (ret == REG_ERPAREN) + ret = REG_EPAREN; + + if (ret == REG_NOERROR && preg->fastmap) + { /* Compute the fastmap now, since regexec cannot modify the pattern + buffer. */ + re_compile_fastmap (preg); + if (preg->can_be_null) + { /* The fastmap can't be used anyway. */ + free (preg->fastmap); + preg->fastmap = NULL; + } + } return (int) ret; } +WEAK_ALIAS (__regcomp, regcomp) /* regexec searches for a given pattern, specified by PREG, in the @@ -5854,7 +5932,7 @@ regexec (preg, string, nmatch, pmatch, eflags) struct re_registers regs; regex_t private_preg; int len = strlen (string); - boolean want_reg_info = !preg->no_sub && nmatch > 0; + boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch; private_preg = *preg; @@ -5875,6 +5953,15 @@ regexec (preg, string, nmatch, pmatch, eflags) regs.end = regs.start + nmatch; } + /* Instead of using not_eol to implement REG_NOTEOL, we could simply + pass (&private_preg, string, len + 1, 0, len, ...) pretending the string + was a little bit longer but still only matching the real part. + This works because the `endline' will check for a '\n' and will find a + '\0', correctly deciding that this is not the end of a line. + But it doesn't work out so nicely for REG_NOTBOL, since we don't have + a convenient '\0' there. For all we know, the string could be preceded + by '\n' which would throw things off. */ + /* Perform the searching operation. */ ret = re_search (&private_preg, string, len, /* start: */ 0, /* range: */ len, @@ -5901,6 +5988,7 @@ regexec (preg, string, nmatch, pmatch, eflags) /* We want zero return to mean success, unlike `re_search'. */ return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; } +WEAK_ALIAS (__regexec, regexec) /* Returns a message corresponding to an error code, ERRCODE, returned @@ -5941,6 +6029,7 @@ regerror (errcode, preg, errbuf, errbuf_size) return msg_size; } +WEAK_ALIAS (__regerror, regerror) /* Free dynamically allocated space used by PREG. */ @@ -5965,5 +6054,6 @@ regfree (preg) free (preg->translate); preg->translate = NULL; } +WEAK_ALIAS (__regfree, regfree) #endif /* not emacs */