src/regex.c

   1 /* Extended regular expression matching and search library, version
   2    0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
   3    internationalization features.)
   4
   5    Copyright (C) 1993-2014 Free Software Foundation, Inc.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* TODO:
  21    - structure the opcode space into opcode+flag.
  22    - merge with glibc's regex.[ch].
  23    - replace (succeed_n + jump_n + set_number_at) with something that doesn't
  24      need to modify the compiled regexp so that re_match can be reentrant.
  25    - get rid of on_failure_jump_smart by doing the optimization in re_comp
  26      rather than at run-time, so that re_match can be reentrant.
  27 */
  28
  29 /* AIX requires this to be the first thing in the file.  It is only requested when alloca is in use, i.e. when REGEX_MALLOC is not defined.  */
  30 #if defined _AIX && !defined REGEX_MALLOC
  31   #pragma alloca
  32 #endif
  33
  34 /* Ignore some GCC warnings for now.  This section should go away
  35    once the Emacs and Gnulib regex code is merged.  The version test below holds for GCC 4.5 and later (and for Clang).  */
  36 #if 4 < __GNUC__ + (5 <= __GNUC_MINOR__) || defined __clang__
  37 # pragma GCC diagnostic ignored "-Wstrict-overflow"
  38 # ifndef emacs
  39 #  pragma GCC diagnostic ignored "-Wunused-function"
  40 #  pragma GCC diagnostic ignored "-Wunused-macros"
  41 #  pragma GCC diagnostic ignored "-Wunused-result"
  42 #  pragma GCC diagnostic ignored "-Wunused-variable"
  43 # endif
  44 #endif
  45 /* The next warning is silenced only for GCC 4.6 and later, and never for Clang.  */
  46 #if 4 < __GNUC__ + (6 <= __GNUC_MINOR__) && ! defined __clang__
  47 # pragma GCC diagnostic ignored "-Wunused-but-set-variable"
  48 #endif
  49
  50 #include <config.h>
  51
  52 #include <stddef.h>
  53
  54 #ifdef emacs
  55 /* We need this for `regex.h', and perhaps for the Emacs include files.  */
  56 # include <sys/types.h>
  57 #endif
  58
  59 /* Whether to use the ISO C Amendment 1 wide-character functions.  Always on inside glibc; elsewhere it needs <wctype.h>, <wchar.h> and btowc.
  60    Those should not be used for Emacs since it uses its own.  */
  61 #if defined _LIBC
  62 #define WIDE_CHAR_SUPPORT 1
  63 #else
  64 #define WIDE_CHAR_SUPPORT \
  65         (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
  66 #endif
  67
  68 /* For platforms which support the ISO C Amendment 1 functionality we
  69    support user-defined character classes.  */
  70 #if WIDE_CHAR_SUPPORT
  71 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
  72 # include <wchar.h>
  73 # include <wctype.h>
  74 #endif
  75
  76 #ifdef _LIBC
  77 /* We have to keep the namespace clean: inside glibc each public entry point is redirected to its double-underscore name.  */
  78 # define regfree(preg) __regfree (preg)
  79 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
  80 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
  81 # define regerror(err_code, preg, errbuf, errbuf_size) \
  82         __regerror (err_code, preg, errbuf, errbuf_size)
  83 # define re_set_registers(bu, re, nu, st, en) \
  84         __re_set_registers (bu, re, nu, st, en)
  85 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
  86         __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
  87 # define re_match(bufp, string, size, pos, regs) \
  88         __re_match (bufp, string, size, pos, regs)
  89 # define re_search(bufp, string, size, startpos, range, regs) \
  90         __re_search (bufp, string, size, startpos, range, regs)
  91 # define re_compile_pattern(pattern, length, bufp) \
  92         __re_compile_pattern (pattern, length, bufp)
  93 # define re_set_syntax(syntax) __re_set_syntax (syntax)
  94 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
  95         __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
  96 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
  97
  98 /* Make sure we call libc's functions even if the user overrides them.  */
  99 # define btowc __btowc
 100 # define iswctype __iswctype
 101 # define wctype __wctype
 102
 103 # define WEAK_ALIAS(a,b) weak_alias (a, b)
 104
 105 /* We are also using some library internals.  */
 106 # include <locale/localeinfo.h>
 107 # include <locale/elem-hash.h>
 108 # include <langinfo.h>
 109 #else /* not _LIBC: WEAK_ALIAS expands to nothing */
 110 # define WEAK_ALIAS(a,b)
 111 #endif
 112
 113 /* This is for other GNU distributions with internationalized messages.  Without <libintl.h>, gettext simply yields its argument.  */
 114 #if HAVE_LIBINTL_H || defined _LIBC
 115 # include <libintl.h>
 116 #else
 117 # define gettext(msgid) (msgid)
 118 #endif
 119
 120 #ifndef gettext_noop
 121 /* This define is so xgettext can find the internationalizable
 122    strings.  It expands to the string itself, unchanged.  */
 123 # define gettext_noop(String) String
 124 #endif
 125
 126 /* The `emacs' switch turns on certain matching commands
 127    that make sense only in Emacs. */
 128 #ifdef emacs
 129
 130 # include "lisp.h"
 131 # include "character.h"
 132 # include "buffer.h"
 133
 134 # include "syntax.h"
 135 # include "category.h"
 136
 137 /* Make syntax table lookup grant data in gl_state.  */
 138 # define SYNTAX(c) syntax_property (c, 1)
 139 /* In Emacs, route this file's malloc, realloc and free calls to xmalloc, xrealloc and xfree.  */
 140 # ifdef malloc
 141 #  undef malloc
 142 # endif
 143 # define malloc xmalloc
 144 # ifdef realloc
 145 #  undef realloc
 146 # endif
 147 # define realloc xrealloc
 148 # ifdef free
 149 #  undef free
 150 # endif
 151 # define free xfree
 152
 153 /* Convert the pointer D to an offset from the start; POS_AS_IN_BUFFER then adds 1 when re_match_object is nil or a buffer.  */
 154 # define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
 155 # define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
 156 /* Accessors for the pattern buffer's multibyte flags, and character fetchers that read a single byte when MULTIBYTE is zero.  */
 157 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
 158 # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
 159 # define RE_STRING_CHAR(p, multibyte) \
 160   (multibyte ? (STRING_CHAR (p)) : (*(p)))
 161 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
 162   (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
 163
 164 # define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
 165
 166 # define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
 167
 168 /* Set C to the (possibly converted to multibyte) character before P.  P
 169    points into a string which is the virtual concatenation of STR1
 170    (which ends at END1) and STR2 (which ends at END2).  Reads the variable `target_multibyte' from the place where the macro is expanded.  */
 171 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)                     \
 172   do {                                                                       \
 173     if (target_multibyte)                                                    \
 174       {                                                                      \
 175         re_char *dtemp = (p) == (str2) ? (end1) : (p);                       \
 176         re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
 177         while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));                   \
 178         c = STRING_CHAR (dtemp);                                             \
 179       }                                                                      \
 180     else                                                                     \
 181       {                                                                      \
 182         (c = ((p) == (str2) ? (end1) : (p))[-1]);                            \
 183         (c) = RE_CHAR_TO_MULTIBYTE (c);                                      \
 184       }                                                                      \
 185   } while (0)
 186
 187 /* Set C to the (possibly converted to multibyte) character at P, and set
 188    LEN to the byte length of that character.  Like GET_CHAR_BEFORE_2, this reads `target_multibyte' from the expansion site.  */
 189 # define GET_CHAR_AFTER(c, p, len)              \
 190   do {                                          \
 191     if (target_multibyte)                       \
 192       (c) = STRING_CHAR_AND_LENGTH (p, len);    \
 193     else                                        \
 194       {                                         \
 195         (c) = *p;                               \
 196         len = 1;                                \
 197         (c) = RE_CHAR_TO_MULTIBYTE (c);         \
 198       }                                         \
 199    } while (0)
 200
 201 #else  /* not emacs */
 202
 203 /* If we are not linking with Emacs proper,
 204    we can't use the relocating allocator
 205    even if config.h says that we can.  */
 206 # undef REL_ALLOC
 207
 208 # include <unistd.h>
 209
 210 /* When used in Emacs's lib-src, we need xmalloc and xrealloc.
 211    xmalloc returns SIZE bytes from malloc; if a nonzero request fails it writes a message to descriptor 2 and exits with status 1.  */
 212 static void *
 213 xmalloc (size_t size)
 214 {
 215   static const char msg[] = "virtual memory exhausted\n";
 216   void *block = malloc (size);
 217   if (size && !block)   /* A null result for a zero-byte request is not an error.  */
 218     {
 219       write (2, msg, sizeof msg - 1);
 220       exit (1);
 221     }
 222   return block;
 223 }
 224 static void *
 225 xrealloc (void *block, size_t size)
 226 {
 227   static const char msg[] = "virtual memory exhausted\n";
 228
 229   /* Some reallocs do not accept a null BLOCK, so that case is
 230      routed through malloc explicitly.  */
 231   void *resized = block ? realloc (block, size) : malloc (size);
 232
 233   /* A null result for a zero-byte request is not an error.  */
 234   if (size && !resized)
 235     {
 236       write (2, msg, sizeof msg - 1);
 237       exit (1);
 238     }
 239   return resized;
 240 }
 241 /* From here on, malloc and realloc calls in this file expand to the checking wrappers xmalloc and xrealloc.  */
 242 # ifdef malloc
 243 #  undef malloc
 244 # endif
 245 # define malloc xmalloc
 246 # ifdef realloc
 247 #  undef realloc
 248 # endif
 249 # define realloc xrealloc
 250
 251 # include <stdbool.h>
 252 # include <string.h>
 253
 254 /* Define the syntax stuff for \<, \>, etc.  */
 255
 256 /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
 257 enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
 258
 259 /* Dummy macros for non-Emacs environments: every character is one byte, so the multibyte helpers reduce to plain byte accesses.  */
 260 # define MAX_MULTIBYTE_LENGTH 1
 261 # define RE_MULTIBYTE_P(x) 0
 262 # define RE_TARGET_MULTIBYTE_P(x) 0
 263 # define WORD_BOUNDARY_P(c1, c2) (0)
 264 # define BYTES_BY_CHAR_HEAD(p) (1)
 265 # define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
 266 # define STRING_CHAR(p) (*(p))
 267 # define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
 268 # define CHAR_STRING(c, s) (*(s) = (c), 1)
 269 # define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
 270 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
 271 # define RE_CHAR_TO_MULTIBYTE(c) (c)
 272 # define RE_CHAR_TO_UNIBYTE(c) (c)
 273 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
 274   (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
 275 # define GET_CHAR_AFTER(c, p, len)      \
 276   (c = *p, len = 1)
 277 # define CHAR_BYTE8_P(c) (0)
 278 # define CHAR_LEADING_CODE(c) (c)
 279
 280 #endif /* not emacs */
 281
 282 #ifndef RE_TRANSLATE /* Defaults: look C up in TBL and take the result as an unsigned char; RE_TRANSLATE_P is true when TBL is non-null.  */
 283 # define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
 284 # define RE_TRANSLATE_P(TBL) (TBL)
 285 #endif
 286 \f
 287 /* Get the interface, including the syntax bits.  */
 288 #include "regex.h"
 289
 290 /* isalpha etc. are used for the character classes.  */
 291 #include <ctype.h>
 292
 293 #ifdef emacs
 294
 295 /* 1 if C is an ASCII character.  */
 296 # define IS_REAL_ASCII(c) ((c) < 0200)
 297
 298 /* 1 if C is a unibyte character.  */
 299 # define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
 300
 301 /* The Emacs definitions should not be directly affected by locales.  */
 302
 303 /* In Emacs, these are only used for single-byte characters.  Note that ISCNTRL only tests for codes below space, so 0177 is not classed as a control character.  */
 304 # define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
 305 # define ISCNTRL(c) ((c) < ' ')
 306 # define ISXDIGIT(c) (((c) >= '0' && (c) <= '9')                \
 307                      || ((c) >= 'a' && (c) <= 'f')      \
 308                      || ((c) >= 'A' && (c) <= 'F'))
 309
 310 /* This is only used for single-byte characters.  */
 311 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
 312
 313 /* The rest must handle multibyte characters.  Characters that are not single-byte always count as graphic and printable; non-ASCII characters are classified by their syntax class.  */
 314
 315 # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)                             \
 316                     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237)        \
 317                     : 1)
 318
 319 # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)                             \
 320                     ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)       \
 321                     : 1)
 322
 323 # define ISALNUM(c) (IS_REAL_ASCII (c)                  \
 324                     ? (((c) >= 'a' && (c) <= 'z')       \
 325                        || ((c) >= 'A' && (c) <= 'Z')    \
 326                        || ((c) >= '0' && (c) <= '9'))   \
 327                     : SYNTAX (c) == Sword)
 328
 329 # define ISALPHA(c) (IS_REAL_ASCII (c)                  \
 330                     ? (((c) >= 'a' && (c) <= 'z')       \
 331                        || ((c) >= 'A' && (c) <= 'Z'))   \
 332                     : SYNTAX (c) == Sword)
 333
 334 # define ISLOWER(c) lowercasep (c)
 335
 336 # define ISPUNCT(c) (IS_REAL_ASCII (c)                          \
 337                     ? ((c) > ' ' && (c) < 0177                  \
 338                        && !(((c) >= 'a' && (c) <= 'z')          \
 339                             || ((c) >= 'A' && (c) <= 'Z')       \
 340                             || ((c) >= '0' && (c) <= '9')))     \
 341                     : SYNTAX (c) != Sword)
 342
 343 # define ISSPACE(c) (SYNTAX (c) == Swhitespace)
 344
 345 # define ISUPPER(c) uppercasep (c)
 346
 347 # define ISWORD(c) (SYNTAX (c) == Sword)
 348
 349 #else /* not emacs */
 350
 351 /* 1 if C is an ASCII character.  */
 352 # define IS_REAL_ASCII(c) ((c) < 0200)
 353
 354 /* This distinction is not meaningful, except in Emacs, so every character counts as unibyte here.  */
 355 # define ISUNIBYTE(c) 1
 356 /* Use the <ctype.h> macros isblank and isgraph when they are defined, else fall back on equivalents.  */
 357 # ifdef isblank
 358 #  define ISBLANK(c) isblank (c)
 359 # else
 360 #  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
 361 # endif
 362 # ifdef isgraph
 363 #  define ISGRAPH(c) isgraph (c)
 364 # else
 365 #  define ISGRAPH(c) (isprint (c) && !isspace (c))
 366 # endif
 367
 368 /* Solaris defines ISPRINT so we must undefine it first.  */
 369 # undef ISPRINT
 370 # define ISPRINT(c) isprint (c)
 371 # define ISDIGIT(c) isdigit (c)
 372 # define ISALNUM(c) isalnum (c)
 373 # define ISALPHA(c) isalpha (c)
 374 # define ISCNTRL(c) iscntrl (c)
 375 # define ISLOWER(c) islower (c)
 376 # define ISPUNCT(c) ispunct (c)
 377 # define ISSPACE(c) isspace (c)
 378 # define ISUPPER(c) isupper (c)
 379 # define ISXDIGIT(c) isxdigit (c)
 380 /* Outside Emacs, ISWORD is simply the alphabetic test.  */
 381 # define ISWORD(c) ISALPHA (c)
 382 /* Lower-case conversion: use _tolower when it is available as a macro.  */
 383 # ifdef _tolower
 384 #  define TOLOWER(c) _tolower (c)
 385 # else
 386 #  define TOLOWER(c) tolower (c)
 387 # endif
 388
 389 /* How many characters in the character set; also the size of re_syntax_table below.  */
 390 # define CHAR_SET_SIZE 256
 391
 392 # ifdef SYNTAX_TABLE
 393 /* With SYNTAX_TABLE defined, the table is declared here but defined elsewhere.  */
 394 extern char *re_syntax_table;
 395
 396 # else /* not SYNTAX_TABLE */
 397 /* Otherwise the table lives in this file and is filled in by init_syntax_once.  */
 398 static char re_syntax_table[CHAR_SET_SIZE];
 399
 400 static void
 401 init_syntax_once (void)
 402 {
 403   /* Nonzero once the table has been filled in; later calls do nothing.  */
 404   static int initialized;
 405   int ch;
 406
 407   if (initialized)
 408     return;
 409
 410   /* Alphanumerics are word constituents; every other entry starts
 411      out as 0, which is Swhitespace.  */
 412   for (ch = 0; ch < CHAR_SET_SIZE; ch++)
 413     re_syntax_table[ch] = ISALNUM (ch) ? Sword : 0;
 414
 415   re_syntax_table['_'] = Ssymbol;
 416
 417   initialized = 1;
 418 }
 419
 420 # endif /* not SYNTAX_TABLE */
 421
 422 # define SYNTAX(c) re_syntax_table[(c)] /* Outside Emacs the syntax code of C is read straight from the table.  */
 423
 424 #endif /* not emacs */
 425 \f
 426 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) /* Convert C to signed char, so a byte with its high bit set is presumably meant to come out negative.  */
 427 \f
 428 /* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
 429    use `alloca' instead of `malloc'.  This is because using malloc in
 430    re_search* or re_match* could cause memory leaks when C-g is used in
 431    Emacs; also, malloc is slower and causes storage fragmentation.  On
 432    the other hand, malloc is more portable, and easier to debug.
 433
 434    Because we sometimes use alloca, some routines have to be macros,
 435    not functions -- `alloca'-allocated space disappears at the end of the
 436    function it is called in.  */
 437
 438 #ifdef REGEX_MALLOC
 439 /* Heap variant: OSIZE is ignored because realloc preserves the old contents itself.  */
 440 # define REGEX_ALLOCATE malloc
 441 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
 442 # define REGEX_FREE free
 443
 444 #else /* not REGEX_MALLOC  */
 445
 446 /* Emacs already defines alloca, sometimes.  */
 447 # ifndef alloca
 448
 449 /* Make alloca work the best possible way.  */
 450 #  ifdef __GNUC__
 451 #   define alloca __builtin_alloca
 452 #  else /* not __GNUC__ */
 453 #   ifdef HAVE_ALLOCA_H
 454 #    include <alloca.h>
 455 #   endif /* HAVE_ALLOCA_H */
 456 #  endif /* not __GNUC__ */
 457
 458 # endif /* not alloca */
 459
 460 # define REGEX_ALLOCATE alloca
 461
 462 /* Assumes a `char *destination' variable.  A fresh block of NSIZE bytes is obtained and the first OSIZE bytes of SOURCE are copied into it.  */
 463 # define REGEX_REALLOCATE(source, osize, nsize)                         \
 464   (destination = alloca (nsize),                                        \
 465    memcpy (destination, source, osize))
 466
 467 /* No need to do anything to free, after alloca.  */
 468 # define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
 469
 470 #endif /* not REGEX_MALLOC */
 471
 472 /* Define how to allocate the failure stack: with the relocating allocator, with malloc, or with alloca.  */
 473
 474 #if defined REL_ALLOC && defined REGEX_MALLOC
 475 /* These variants operate on a variable named `failure_stack_ptr' and ignore their SOURCE, OSIZE and PTR arguments.  */
 476 # define REGEX_ALLOCATE_STACK(size)                             \
 477   r_alloc (&failure_stack_ptr, (size))
 478 # define REGEX_REALLOCATE_STACK(source, osize, nsize)           \
 479   r_re_alloc (&failure_stack_ptr, (nsize))
 480 # define REGEX_FREE_STACK(ptr)                                  \
 481   r_alloc_free (&failure_stack_ptr)
 482
 483 #else /* not using relocating allocator */
 484
 485 # ifdef REGEX_MALLOC
 486
 487 #  define REGEX_ALLOCATE_STACK malloc
 488 #  define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
 489 #  define REGEX_FREE_STACK free
 490
 491 # else /* not REGEX_MALLOC */
 492
 493 #  define REGEX_ALLOCATE_STACK alloca
 494 /* Reuse the alloca-based REGEX_REALLOCATE (allocate a new block, copy OSIZE bytes).  */
 495 #  define REGEX_REALLOCATE_STACK(source, osize, nsize)                  \
 496    REGEX_REALLOCATE (source, osize, nsize)
 497 /* No need to explicitly free anything.  */
 498 #  define REGEX_FREE_STACK(arg) ((void)0)
 499
 500 # endif /* not REGEX_MALLOC */
 501 #endif /* not using relocating allocator */
 502
 503
 504 /* True if `size1' is nonzero and PTR is pointing anywhere inside
 505    `string1' or just past its end.  This works if PTR is NULL, which is
 506    a good thing.  Uses the variables `string1' and `size1' of the expansion site.  */
 507 #define FIRST_STRING_P(ptr)                                     \
 508   (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
 509
 510 /* (Re)Allocate N items of type T using malloc, or fail.  REGEX_TALLOC does the same through REGEX_ALLOCATE.  */
 511 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
 512 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
 513 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
 514
 515 #define BYTEWIDTH 8 /* In bits.  */
 516
 517 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
 518 /* Note that MAX and MIN evaluate one of their arguments twice.  */
 519 #undef MAX
 520 #undef MIN
 521 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 522 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 523
 524 /* Type of source-pattern and string chars.  With MSVC, re_char is unqualified and only const_re_char is const; elsewhere both are const unsigned char.  */
 525 #ifdef _MSC_VER
 526 typedef unsigned char re_char;
 527 typedef const re_char const_re_char;
 528 #else
 529 typedef const unsigned char re_char;
 530 typedef re_char const_re_char;
 531 #endif
 532
 533 typedef char boolean;
 534 /* Forward declaration; the definition is not in this part of the file.  */
 535 static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
 536                                      re_char *string1, size_t size1,
 537                                      re_char *string2, size_t size2,
 538                                      ssize_t pos,
 539                                      struct re_registers *regs,
 540                                      ssize_t stop);
 541 \f
 542 /* These are the command codes that appear in compiled regular
 543    expressions.  Some opcodes are followed by argument bytes.  A
 544    command code can specify any interpretation whatsoever for its
 545    arguments.  Zero bytes may appear in the compiled regular expression.  */
 546
 547 typedef enum
 548 {
 549   no_op = 0,
 550
 551   /* Succeed right away--no more backtracking.  */
 552   succeed,
 553
 554         /* Followed by one byte giving n, then by n literal bytes.  */
 555   exactn,
 556
 557         /* Matches any (more or less) character.  */
 558   anychar,
 559
 560         /* Matches any one char belonging to specified set.  First
 561            following byte is number of bitmap bytes.  Then come bytes
 562            for a bitmap saying which chars are in.  Bits in each byte
 563            are ordered low-bit-first.  A character is in the set if its
 564            bit is 1.  A character too large to have a bit in the map is
 565            automatically not in the set.
 566
 567            If the length byte has the 0x80 bit set, then that stuff
 568            is followed by a range table:
 569                2 bytes of flags for character sets (low 8 bits, high 8 bits)
 570                    See RANGE_TABLE_WORK_BITS below.
 571                2 bytes, the number of pairs that follow (up to 32767)
 572                pairs, each 2 multibyte characters,
 573                    each multibyte character represented as 3 bytes.  */
 574   charset,
 575
 576         /* Same parameters as charset, but match any character that is
 577            not one of those specified.  */
 578   charset_not,
 579
 580         /* Start remembering the text that is matched, for storing in a
 581            register.  Followed by one byte with the register number, in
 582            the range 0 to one less than the pattern buffer's re_nsub
 583            field.  */
 584   start_memory,
 585
 586         /* Stop remembering the text that is matched and store it in a
 587            memory register.  Followed by one byte with the register
 588            number, in the range 0 to one less than `re_nsub' in the
 589            pattern buffer.  */
 590   stop_memory,
 591
 592         /* Match a duplicate of something remembered. Followed by one
 593            byte containing the register number.  */
 594   duplicate,
 595
 596         /* Fail unless at beginning of line.  */
 597   begline,
 598
 599         /* Fail unless at end of line.  */
 600   endline,
 601
 602         /* Succeeds if at beginning of buffer (if emacs) or at beginning
 603            of string to be matched (if not).  */
 604   begbuf,
 605
 606         /* Analogously, for end of buffer/string.  */
 607   endbuf,
 608
 609         /* Followed by two byte relative address to which to jump.  */
 610   jump,
 611
 612         /* Followed by two-byte relative address of place to resume at
 613            in case of failure.  */
 614   on_failure_jump,
 615
 616         /* Like on_failure_jump, but pushes a placeholder instead of the
 617            current string position when executed.  */
 618   on_failure_keep_string_jump,
 619
 620         /* Just like `on_failure_jump', except that it checks that we
 621            don't get stuck in an infinite loop (matching an empty string
 622            indefinitely).  */
 623   on_failure_jump_loop,
 624
 625         /* Just like `on_failure_jump_loop', except that it checks for
 626            a different kind of loop (the kind that shows up with non-greedy
 627            operators).  This operation has to be immediately preceded
 628            by a `no_op'.  */
 629   on_failure_jump_nastyloop,
 630
 631         /* A smart `on_failure_jump' used for greedy * and + operators.
 632            It analyzes the loop before which it is put and if the
 633            loop does not require backtracking, it changes itself to
 634            `on_failure_keep_string_jump' and short-circuits the loop,
 635            else it just defaults to changing itself into `on_failure_jump'.
 636            It assumes that it is pointing to just past a `jump'.  */
 637   on_failure_jump_smart,
 638
 639         /* Followed by two-byte relative address and two-byte number n.
 640            After matching N times, jump to the address upon failure.
 641            Does not work if N starts at 0: use on_failure_jump_loop
 642            instead.  */
 643   succeed_n,
 644
 645         /* Followed by two-byte relative address, and two-byte number n.
 646            Jump to the address N times, then fail.  */
 647   jump_n,
 648
 649         /* Set the following two-byte relative address to the
 650            subsequent two-byte number.  The address *includes* the two
 651            bytes of number.  */
 652   set_number_at,
 653
 654   wordbeg,      /* Succeeds if at word beginning.  */
 655   wordend,      /* Succeeds if at word end.  */
 656
 657   wordbound,    /* Succeeds if at a word boundary.  */
 658   notwordbound, /* Succeeds if not at a word boundary.  */
 659
 660   symbeg,       /* Succeeds if at symbol beginning.  */
 661   symend,       /* Succeeds if at symbol end.  */
 662
 663         /* Matches any character whose syntax is specified.  Followed by
 664            a byte which contains a syntax code, e.g., Sword.  */
 665   syntaxspec,
 666
 667         /* Matches any character whose syntax is not that specified.  */
 668   notsyntaxspec
 669
 670 #ifdef emacs
 671   ,before_dot,  /* Succeeds if before point.  */
 672   at_dot,       /* Succeeds if at point.  */
 673   after_dot,    /* Succeeds if after point.  */
 674
 675   /* Matches any character whose category-set contains the specified
 676      category.  The operator is followed by a byte which contains a
 677      category code (mnemonic ASCII character).  */
 678   categoryspec,
 679
 680   /* Matches any character whose category-set does not contain the
 681      specified category.  The operator is followed by a byte which
 682      contains the category code (mnemonic ASCII character).  */
 683   notcategoryspec
 684 #endif /* emacs */
 685 } re_opcode_t;
 686 \f
 687 /* Common operations on the compiled pattern.  */
 688
 689 /* Store NUMBER in two contiguous bytes starting at DESTINATION, low-order byte first.  NUMBER is evaluated twice.  */
 690
 691 #define STORE_NUMBER(destination, number)                               \
 692   do {                                                                  \
 693     (destination)[0] = (number) & 0377;                                 \
 694     (destination)[1] = (number) >> 8;                                   \
 695   } while (0)
 696
 697 /* Same as STORE_NUMBER, except increment DESTINATION to
 698    the byte after where the number is stored.  Therefore, DESTINATION
 699    must be an lvalue.  */
 700
 701 #define STORE_NUMBER_AND_INCR(destination, number)                      \
 702   do {                                                                  \
 703     STORE_NUMBER (destination, number);                                 \
 704     (destination) += 2;                                                 \
 705   } while (0)
 706
 707 /* Put into DESTINATION a number stored in two contiguous bytes starting
 708    at SOURCE.  The low-order byte comes first; the second byte is sign-extended, and the shift is done in unsigned arithmetic so it stays well defined when that byte is negative.  */
 709
 710 #define EXTRACT_NUMBER(destination, source)                             \
 711   ((destination) = extract_number (source))
 712
 713 static int
 714 extract_number (re_char *source)
 715 {
 716   unsigned high = SIGN_EXTEND_CHAR (source[1]), low = source[0];
 717   return (high << 8) + low;
 718 }
 719
 720 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
 721    SOURCE must be an lvalue, because the macro passes its address to the function.  */
 722
 723 #define EXTRACT_NUMBER_AND_INCR(destination, source)                    \
 724   ((destination) = extract_number_and_incr (&source))
 725
 726 static int
 727 extract_number_and_incr (re_char **source)
 728 {
 729   re_char *start = *source;   /* Where the two-byte number begins.  */
 730   *source = start + 2;        /* Step the caller's pointer past it.  */
 731   return extract_number (start);
 732 }
 733 \f
 734 /* Store a multibyte character in three contiguous bytes starting at
 735    DESTINATION (low-order byte first), and increment DESTINATION to the byte after where the
 736    character is stored.  Therefore, DESTINATION must be an lvalue.  */
 737
 738 #define STORE_CHARACTER_AND_INCR(destination, character)        \
 739   do {                                                          \
 740     (destination)[0] = (character) & 0377;                      \
 741     (destination)[1] = ((character) >> 8) & 0377;               \
 742     (destination)[2] = (character) >> 16;                       \
 743     (destination) += 3;                                         \
 744   } while (0)
 745
 746 /* Put into DESTINATION a character stored in three contiguous bytes
 747    starting at SOURCE, low-order byte first.  SOURCE is not advanced.  */
 748
 749 #define EXTRACT_CHARACTER(destination, source)  \
 750   do {                                          \
 751     (destination) = ((source)[0]                \
 752                      | ((source)[1] << 8)       \
 753                      | ((source)[2] << 16));    \
 754   } while (0)
 755
 756
 757 /* Macros for charset. */
 758
 759 /* Size of bitmap of charset P in bytes.  P is a start of charset,
 760    i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
 761 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
 762
 763 /* Nonzero if charset P has range table.  */
 764 #define CHARSET_RANGE_TABLE_EXISTS_P(p)  ((p)[1] & 0x80)
 765
 766 /* Return the address of the range table of charset P -- not the
 767    start of the ranges themselves, but the place just before them
 768    where the number of ranges is stored.  `4 +' skips the opcode
 769    byte, the bitmap-size byte, and the 2 bytes of flags at the start of the range table.  */
 770 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
 771
 772 #ifdef emacs
 773 /* Extract the bit flags that start a range table (two bytes, low-order byte first).  */
 774 #define CHARSET_RANGE_TABLE_BITS(p)             \
 775   ((p)[2 + CHARSET_BITMAP_SIZE (p)]             \
 776    + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
 777 #endif
 778
 779 /* Return the address of the end of RANGE_TABLE.  COUNT is the number of
 780    ranges (each a pair of (start, end)) in RANGE_TABLE.  `* 2'
 781    accounts for the start and the end of each range; `* 3' is the size in bytes of each start
 782    and end.  */
 783 #define CHARSET_RANGE_TABLE_END(range_table, count)     \
 784   ((range_table) + (count) * 2 * 3)
 785
 786 /* Test if C is in RANGE_TABLE.  The flag NOT (an lvalue) is negated if C is in; the scan stops at the first range containing C.
 787    COUNT is the number of ranges in RANGE_TABLE.  C is evaluated once per comparison.  */
 788 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count)      \
 789   do                                                                    \
 790     {                                                                   \
 791       re_wchar_t range_start, range_end;                                \
 792       re_char *rtp;                                                     \
 793       re_char *range_table_end                                          \
 794         = CHARSET_RANGE_TABLE_END ((range_table), (count));             \
 795                                                                         \
 796       for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3)    \
 797         {                                                               \
 798           EXTRACT_CHARACTER (range_start, rtp);                         \
 799           EXTRACT_CHARACTER (range_end, rtp + 3);                       \
 800                                                                         \
 801           if (range_start <= (c) && (c) <= range_end)                   \
 802             {                                                           \
 803               (not) = !(not);                                           \
 804               break;                                                    \
 805             }                                                           \
 806         }                                                               \
 807     }                                                                   \
 808   while (0)
 809
 810 /* Test if C is in the range table of CHARSET.  The flag NOT is negated if
 811    C is listed in it.  The range count is read from the two bytes at CHARSET_RANGE_TABLE (CHARSET); the ranges follow it.  */
 812 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset)                     \
 813   do                                                                    \
 814     {                                                                   \
 815       /* Number of ranges in range table. */                            \
 816       int count;                                                        \
 817       re_char *range_table = CHARSET_RANGE_TABLE (charset);             \
 818                                                                         \
 819       EXTRACT_NUMBER_AND_INCR (count, range_table);                     \
 820       CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count);  \
 821     }                                                                   \
 822   while (0)
 823 \f
 824 /* If DEBUG is defined, Regex prints many voluminous messages about what
 825    it is doing (if the variable `debug' is nonzero).  If linked with the
 826    main program in `iregex.c', you can enter patterns and strings
 827    interactively.  And if linked with the main program in `main.c' and
 828    the other test files, you can run the already-written tests.  */
 829
 830 #ifdef DEBUG
 831
 832 /* We use standard I/O for debugging.  */
 833 # include <stdio.h>
 834
 835 /* It is useful to test things that ``must'' be true when debugging.  */
 836 # include <assert.h>
 837
 838 static int debug = -100000;
 839
 840 # define DEBUG_STATEMENT(e) e
 841 # define DEBUG_PRINT(...) if (debug > 0) printf (__VA_ARGS__)
 842 # define DEBUG_COMPILES_ARGUMENTS
 843 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                          \
 844   if (debug > 0) print_partial_compiled_pattern (s, e)
 845 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                 \
 846   if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
 847
 848
 849 /* Print the fastmap in human-readable form.  */
 850
 851 static void
 852 print_fastmap (char *fastmap)
 853 {
 854   unsigned was_a_range = 0;
 855   unsigned i = 0;
 856
 857   while (i < (1 << BYTEWIDTH))
 858     {
 859       if (fastmap[i++])
 860         {
 861           was_a_range = 0;
 862           putchar (i - 1);
 863           while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
 864             {
 865               was_a_range = 1;
 866               i++;
 867             }
 868           if (was_a_range)
 869             {
 870               printf ("-");
 871               putchar (i - 1);
 872             }
 873         }
 874     }
 875   putchar ('\n');
 876 }
 877
 878
 879 /* Print a compiled pattern string in human-readable form, starting at
 880    the START pointer into it and ending just before the pointer END.  */
 881
 882 static void
 883 print_partial_compiled_pattern (re_char *start, re_char *end)
 884 {
 885   int mcnt, mcnt2;
 886   re_char *p = start;
 887   re_char *pend = end;
 888
 889   if (start == NULL)
 890     {
 891       fprintf (stderr, "(null)\n");
 892       return;
 893     }
 894
 895   /* Loop over pattern commands.  */
 896   while (p < pend)
 897     {
 898       fprintf (stderr, "%td:\t", p - start);
 899
 900       switch ((re_opcode_t) *p++)
 901         {
 902         case no_op:
 903           fprintf (stderr, "/no_op");
 904           break;
 905
 906         case succeed:
 907           fprintf (stderr, "/succeed");
 908           break;
 909
 910         case exactn:
 911           mcnt = *p++;
 912           fprintf (stderr, "/exactn/%d", mcnt);
 913           do
 914             {
 915               fprintf (stderr, "/%c", *p++);
 916             }
 917           while (--mcnt);
 918           break;
 919
 920         case start_memory:
 921           fprintf (stderr, "/start_memory/%d", *p++);
 922           break;
 923
 924         case stop_memory:
 925           fprintf (stderr, "/stop_memory/%d", *p++);
 926           break;
 927
 928         case duplicate:
 929           fprintf (stderr, "/duplicate/%d", *p++);
 930           break;
 931
 932         case anychar:
 933           fprintf (stderr, "/anychar");
 934           break;
 935
 936         case charset:
 937         case charset_not:
 938           {
 939             register int c, last = -100;
 940             register int in_range = 0;
 941             int length = CHARSET_BITMAP_SIZE (p - 1);
 942             int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
 943
 944             fprintf (stderr, "/charset [%s",
 945                      (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
 946
 947             if (p + *p >= pend)
 948               fprintf (stderr, " !extends past end of pattern! ");
 949
 950             for (c = 0; c < 256; c++)
 951               if (c / 8 < length
 952                   && (p[1 + (c/8)] & (1 << (c % 8))))
 953                 {
 954                   /* Are we starting a range?  */
 955                   if (last + 1 == c && ! in_range)
 956                     {
 957                       fprintf (stderr, "-");
 958                       in_range = 1;
 959                     }
 960                   /* Have we broken a range?  */
 961                   else if (last + 1 != c && in_range)
 962                     {
 963                       fprintf (stderr, "%c", last);
 964                       in_range = 0;
 965                     }
 966
 967                   if (! in_range)
 968                     fprintf (stderr, "%c", c);
 969
 970                   last = c;
 971               }
 972
 973             if (in_range)
 974               fprintf (stderr, "%c", last);
 975
 976             fprintf (stderr, "]");
 977
 978             p += 1 + length;
 979
 980             if (has_range_table)
 981               {
 982                 int count;
 983                 fprintf (stderr, "has-range-table");
 984
 985                 /* ??? Should print the range table; for now, just skip it.  */
 986                 p += 2;         /* skip range table bits */
 987                 EXTRACT_NUMBER_AND_INCR (count, p);
 988                 p = CHARSET_RANGE_TABLE_END (p, count);
 989               }
 990           }
 991           break;
 992
 993         case begline:
 994           fprintf (stderr, "/begline");
 995           break;
 996
 997         case endline:
 998           fprintf (stderr, "/endline");
 999           break;
1000
1001         case on_failure_jump:
1002           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1003           fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
1004           break;
1005
1006         case on_failure_keep_string_jump:
1007           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1008           fprintf (stderr, "/on_failure_keep_string_jump to %td",
1009                    p + mcnt - start);
1010           break;
1011
1012         case on_failure_jump_nastyloop:
1013           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1014           fprintf (stderr, "/on_failure_jump_nastyloop to %td",
1015                    p + mcnt - start);
1016           break;
1017
1018         case on_failure_jump_loop:
1019           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1020           fprintf (stderr, "/on_failure_jump_loop to %td",
1021                    p + mcnt - start);
1022           break;
1023
1024         case on_failure_jump_smart:
1025           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1026           fprintf (stderr, "/on_failure_jump_smart to %td",
1027                    p + mcnt - start);
1028           break;
1029
1030         case jump:
1031           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1032           fprintf (stderr, "/jump to %td", p + mcnt - start);
1033           break;
1034
1035         case succeed_n:
1036           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1037           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1038           fprintf (stderr, "/succeed_n to %td, %d times",
1039                    p - 2 + mcnt - start, mcnt2);
1040           break;
1041
1042         case jump_n:
1043           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1044           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1045           fprintf (stderr, "/jump_n to %td, %d times",
1046                    p - 2 + mcnt - start, mcnt2);
1047           break;
1048
1049         case set_number_at:
1050           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1051           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1052           fprintf (stderr, "/set_number_at location %td to %d",
1053                    p - 2 + mcnt - start, mcnt2);
1054           break;
1055
1056         case wordbound:
1057           fprintf (stderr, "/wordbound");
1058           break;
1059
1060         case notwordbound:
1061           fprintf (stderr, "/notwordbound");
1062           break;
1063
1064         case wordbeg:
1065           fprintf (stderr, "/wordbeg");
1066           break;
1067
1068         case wordend:
1069           fprintf (stderr, "/wordend");
1070           break;
1071
1072         case symbeg:
1073           fprintf (stderr, "/symbeg");
1074           break;
1075
1076         case symend:
1077           fprintf (stderr, "/symend");
1078           break;
1079
1080         case syntaxspec:
1081           fprintf (stderr, "/syntaxspec");
1082           mcnt = *p++;
1083           fprintf (stderr, "/%d", mcnt);
1084           break;
1085
1086         case notsyntaxspec:
1087           fprintf (stderr, "/notsyntaxspec");
1088           mcnt = *p++;
1089           fprintf (stderr, "/%d", mcnt);
1090           break;
1091
1092 # ifdef emacs
1093         case before_dot:
1094           fprintf (stderr, "/before_dot");
1095           break;
1096
1097         case at_dot:
1098           fprintf (stderr, "/at_dot");
1099           break;
1100
1101         case after_dot:
1102           fprintf (stderr, "/after_dot");
1103           break;
1104
1105         case categoryspec:
1106           fprintf (stderr, "/categoryspec");
1107           mcnt = *p++;
1108           fprintf (stderr, "/%d", mcnt);
1109           break;
1110
1111         case notcategoryspec:
1112           fprintf (stderr, "/notcategoryspec");
1113           mcnt = *p++;
1114           fprintf (stderr, "/%d", mcnt);
1115           break;
1116 # endif /* emacs */
1117
1118         case begbuf:
1119           fprintf (stderr, "/begbuf");
1120           break;
1121
1122         case endbuf:
1123           fprintf (stderr, "/endbuf");
1124           break;
1125
1126         default:
1127           fprintf (stderr, "?%d", *(p-1));
1128         }
1129
1130       fprintf (stderr, "\n");
1131     }
1132
1133   fprintf (stderr, "%td:\tend of pattern.\n", p - start);
1134 }
1135
1136
1137 static void
1138 print_compiled_pattern (struct re_pattern_buffer *bufp)
1139 {
1140   re_char *buffer = bufp->buffer;
1141
1142   print_partial_compiled_pattern (buffer, buffer + bufp->used);
1143   printf ("%ld bytes used/%ld bytes allocated.\n",
1144           bufp->used, bufp->allocated);
1145
1146   if (bufp->fastmap_accurate && bufp->fastmap)
1147     {
1148       printf ("fastmap: ");
1149       print_fastmap (bufp->fastmap);
1150     }
1151
1152   printf ("re_nsub: %zu\t", bufp->re_nsub);
1153   printf ("regs_alloc: %d\t", bufp->regs_allocated);
1154   printf ("can_be_null: %d\t", bufp->can_be_null);
1155   printf ("no_sub: %d\t", bufp->no_sub);
1156   printf ("not_bol: %d\t", bufp->not_bol);
1157   printf ("not_eol: %d\t", bufp->not_eol);
1158   printf ("syntax: %lx\n", bufp->syntax);
1159   fflush (stdout);
1160   /* Perhaps we should print the translate table?  */
1161 }
1162
1163
1164 static void
1165 print_double_string (re_char *where, re_char *string1, ssize_t size1,
1166                      re_char *string2, ssize_t size2)
1167 {
1168   ssize_t this_char;
1169
1170   if (where == NULL)
1171     printf ("(null)");
1172   else
1173     {
1174       if (FIRST_STRING_P (where))
1175         {
1176           for (this_char = where - string1; this_char < size1; this_char++)
1177             putchar (string1[this_char]);
1178
1179           where = string2;
1180         }
1181
1182       for (this_char = where - string2; this_char < size2; this_char++)
1183         putchar (string2[this_char]);
1184     }
1185 }
1186
1187 #else /* not DEBUG */
1188
/* Without DEBUG every debugging hook expands to nothing.  */
# define DEBUG_STATEMENT(e)
# define DEBUG_PRINT(...)
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)

/* Assertions are compiled out as well; any earlier definition of
   `assert' is dropped first.  */
# undef assert
# define assert(e)
1196
1197 #endif /* not DEBUG */
1198 \f
/* IF_LINT (CODE) expands to CODE only when `lint' is defined, and to
   nothing otherwise.  Use this to suppress gcc's `...may be used
   before initialized' warnings.  */
#ifndef lint
# define IF_LINT(Code) /* empty */
#else
# define IF_LINT(Code) Code
#endif
1205 \f
1206 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
1207    also be assigned to arbitrarily: each pattern buffer stores its own
1208    syntax, so it can be changed between regex compilations.  */
1209 /* This has no initializer because initialized variables in Emacs
1210    become read-only after dumping.  */
1211 reg_syntax_t re_syntax_options;
1212
1213
1214 /* Specify the precise syntax of regexps for compilation.  This provides
1215    for compatibility for various utilities which historically have
1216    different, incompatible syntaxes.
1217
1218    The argument SYNTAX is a bit mask comprised of the various bits
1219    defined in regex.h.  We return the old syntax.  */
1220
1221 reg_syntax_t
1222 re_set_syntax (reg_syntax_t syntax)
1223 {
1224   reg_syntax_t ret = re_syntax_options;
1225
1226   re_syntax_options = syntax;
1227   return ret;
1228 }
1229 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1230
1231 /* Regexp to use to replace spaces, or NULL meaning don't.  */
1232 static const_re_char *whitespace_regexp;
1233
1234 void
1235 re_set_whitespace_regexp (const char *regexp)
1236 {
1237   whitespace_regexp = (const_re_char *) regexp;
1238 }
1239 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1240 \f
1241 /* This table gives an error message for each of the error codes listed
1242    in regex.h.  Obviously the order here has to be same as there.
1243    POSIX doesn't require that we do anything for REG_NOERROR,
1244    but why not be nice?  */
1245
1246 static const char *re_error_msgid[] =
1247   {
1248     gettext_noop ("Success"),   /* REG_NOERROR */
1249     gettext_noop ("No match"),  /* REG_NOMATCH */
1250     gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1251     gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1252     gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1253     gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1254     gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1255     gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1256     gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1257     gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1258     gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1259     gettext_noop ("Invalid range end"), /* REG_ERANGE */
1260     gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1261     gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1262     gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1263     gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1264     gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1265     gettext_noop ("Range striding over charsets") /* REG_ERANGEX  */
1266   };
1267 \f
1268 /* Avoiding alloca during matching, to placate r_alloc.  */
1269
1270 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1271    searching and matching functions should not call alloca.  On some
1272    systems, alloca is implemented in terms of malloc, and if we're
1273    using the relocating allocator routines, then malloc could cause a
1274    relocation, which might (if the strings being searched are in the
1275    ralloc heap) shift the data out from underneath the regexp
1276    routines.
1277
1278    Here's another reason to avoid allocation: Emacs
1279    processes input from X in a signal handler; processing X input may
1280    call malloc; if input arrives while a matching routine is calling
1281    malloc, then we're scrod.  But Emacs can't just block input while
1282    calling matching routines; then we don't notice interrupts when
1283    they come in.  So, Emacs blocks input around all regexp calls
1284    except the matching calls, which it leaves unprotected, in the
1285    faith that they will not malloc.  */
1286
/* Normally, allocating while matching is fine.

   The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
#if defined REGEX_MALLOC && defined emacs
# undef MATCH_MAY_ALLOCATE
#else
# define MATCH_MAY_ALLOCATE
#endif

1298
1299 \f
1300 /* Failure stack declarations and macros; both re_compile_fastmap and
1301    re_match_2 use a failure stack.  These have to be macros because of
1302    REGEX_ALLOCATE_STACK.  */
1303
1304
/* Approximate number of failure points to allocate space for when
   matching starts.  It is not a hard limit: more space is allocated
   if this number is exceeded.  A definition supplied before this
   point takes precedence.  */
#if !defined INIT_FAILURE_ALLOC
# define INIT_FAILURE_ALLOC 20
#endif
1311
/* Roughly the maximum number of failure points on the stack.  It would
   be exactly that if every failure used TYPICAL_FAILURE_SIZE items.
   This is a variable only so users of regex can assign to it; we never
   change it ourselves.  We always multiply it by TYPICAL_FAILURE_SIZE
   before using it, so it should probably be a byte-count instead.  */
#ifdef MATCH_MAY_ALLOCATE
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
size_t re_max_failures = 40000;
#else
size_t re_max_failures = 4000;
#endif
1326
1327 union fail_stack_elt
1328 {
1329   re_char *pointer;
1330   /* This should be the biggest `int' that's no bigger than a pointer.  */
1331   long integer;
1332 };
1333
1334 typedef union fail_stack_elt fail_stack_elt_t;
1335
1336 typedef struct
1337 {
1338   fail_stack_elt_t *stack;
1339   size_t size;
1340   size_t avail; /* Offset of next open position.  */
1341   size_t frame; /* Offset of the cur constructed frame.  */
1342 } fail_stack_type;
1343
1344 #define FAIL_STACK_EMPTY()     (fail_stack.frame == 0)
1345
1346
/* Macros that set up the failure stack `fail_stack'.
   When matching may allocate, the stack storage is obtained here and
   the enclosing function does `return -2' if that allocation fails;
   otherwise only the offsets are reset.  */

#ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.stack =                                                  \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE   \
                            * sizeof (fail_stack_elt_t));               \
                                                                        \
    if (!fail_stack.stack)                                              \
      return -2;                                                        \
                                                                        \
    fail_stack.frame = 0;                                               \
    fail_stack.avail = 0;                                               \
    fail_stack.size = INIT_FAILURE_ALLOC;                               \
  } while (0)
#else
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.frame = 0;                                               \
    fail_stack.avail = 0;                                               \
  } while (0)

/* Resize ADDR to N elements of type T, or allocate it afresh when
   ADDR is still null.  */
# define RETALLOC_IF(addr, n, t) \
  if (!(addr)) (addr) = TALLOC ((n), t); else RETALLOC((addr), (n), t)
#endif
1374
1375
/* Enlarge FAIL_STACK by FAIL_STACK_GROWTH_FACTOR, up to a limit
   which allows approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.   */

/* Factor to increase the failure stack size by
   when we increase it.
   This used to be 2, but 2 was too wasteful
   because the old discarded stacks added up to as much space
   as the ultimate, maximum-size stack.  */
#define FAIL_STACK_GROWTH_FACTOR 4

/* The expansion is a single expression:
   - if the stack already occupies `re_max_failures
     * TYPICAL_FAILURE_SIZE' bytes or more, yield 0 without touching it;
   - otherwise reallocate it to the smaller of that byte limit and the
     current byte size times FAIL_STACK_GROWTH_FACTOR;
   - if the reallocation returned NULL yield 0, else store the new
     element count (the same byte size divided by the element size)
     in `size' and yield 1.  */
#define GROW_FAIL_STACK(fail_stack)                                     \
  (((fail_stack).size * sizeof (fail_stack_elt_t)                       \
    >= re_max_failures * TYPICAL_FAILURE_SIZE)                          \
   ? 0                                                                  \
   : ((fail_stack).stack                                                \
      = REGEX_REALLOCATE_STACK ((fail_stack).stack,                     \
          (fail_stack).size * sizeof (fail_stack_elt_t),                \
          MIN (re_max_failures * TYPICAL_FAILURE_SIZE,                  \
               ((fail_stack).size * sizeof (fail_stack_elt_t)           \
                * FAIL_STACK_GROWTH_FACTOR))),                          \
                                                                        \
      (fail_stack).stack == NULL                                        \
      ? 0                                                               \
      : ((fail_stack).size                                              \
         = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE,                \
                 ((fail_stack).size * sizeof (fail_stack_elt_t)         \
                  * FAIL_STACK_GROWTH_FACTOR))                          \
            / sizeof (fail_stack_elt_t)),                               \
         1)))
1410
1411
/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.
   No room check is done here; see ENSURE_FAIL_STACK.  */
#define PUSH_FAILURE_POINTER(item)                                      \
  fail_stack.stack[fail_stack.avail++].pointer = (item)

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.
   No room check is done here either.  */
#define PUSH_FAILURE_INT(item)                                  \
  fail_stack.stack[fail_stack.avail++].integer = (item)

/* These POP... operations complement the PUSH... operations.
   All assume that `fail_stack' is nonempty.  Each one decrements
   `avail' and yields the slot that was on top.  */
#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer

/* Individual items aside from the registers.  */
#define NUM_NONREG_ITEMS 3

/* Used to examine the stack (to detect infinite loops).
   A handle H is the stack offset just past a frame's three
   non-register items: the pattern pointer is at H - 1, the string
   pointer at H - 2 and the handle of the next frame to examine at
   H - 3.  TOP_FAILURE_HANDLE yields the handle stored in
   `fail_stack.frame'.  */
#define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
#define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
#define TOP_FAILURE_HANDLE() fail_stack.frame
1437
1438
/* Make sure more than SPACE free slots remain on the failure stack,
   growing it as often as needed.  Does `return -2' from the enclosing
   function if the stack cannot be grown.  GROW_FAIL_STACK requires
   `destination' be declared.  */
#define ENSURE_FAIL_STACK(space)                                        \
while (REMAINING_AVAIL_SLOTS <= (space)) {                              \
  if (!GROW_FAIL_STACK (fail_stack))                                    \
    return -2;                                                          \
  /* Both values are size_t, hence `%zu'.  */                           \
  DEBUG_PRINT ("\n  Doubled stack; size now: %zu\n", (fail_stack).size);\
  DEBUG_PRINT ("         slots available: %zu\n", REMAINING_AVAIL_SLOTS);\
}
1446
/* Push register NUM onto the stack: its start pointer, its end
   pointer and finally its number, which is what the popping code
   reads first.  Assumes the variables `fail_stack', `regstart' and
   `regend'; does `return -2' if the stack cannot be grown.  */
#define PUSH_FAILURE_REG(num)                                           \
do {                                                                    \
  char *destination;                                                    \
  /* The local has a name callers are unlikely to use, so that NUM      \
     cannot accidentally refer to it.  */                               \
  long pushed_reg_ = (num);                                             \
  ENSURE_FAIL_STACK(3);                                                 \
  DEBUG_PRINT ("    Push reg %ld (spanning %p -> %p)\n",                \
               pushed_reg_, regstart[pushed_reg_], regend[pushed_reg_]);\
  PUSH_FAILURE_POINTER (regstart[pushed_reg_]);                         \
  PUSH_FAILURE_POINTER (regend[pushed_reg_]);                           \
  PUSH_FAILURE_INT (pushed_reg_);                                       \
} while (0)
1459
/* Change the counter's value to VAL, but make sure that it will
   be reset when backtracking: the old value, the counter's address
   PTR and the marker -1 are pushed on the failure stack before VAL is
   stored.  Does `return -2' if the stack cannot be grown.  */
#define PUSH_NUMBER(ptr,val)                                            \
do {                                                                    \
  char *destination;                                                    \
  /* The local has a name callers are unlikely to use, so that PTR      \
     and VAL cannot accidentally refer to it.  */                       \
  int old_count_;                                                       \
  ENSURE_FAIL_STACK(3);                                                 \
  EXTRACT_NUMBER (old_count_, ptr);                                     \
  DEBUG_PRINT ("    Push number %p = %d -> %d\n", ptr, old_count_, val);\
  PUSH_FAILURE_INT (old_count_);                                        \
  PUSH_FAILURE_POINTER (ptr);                                           \
  PUSH_FAILURE_INT (-1);                                                \
  STORE_NUMBER (ptr, val);                                              \
} while (0)
1474
/* Pop one saved item off the stack and restore it.  The integer on
   top tells what it is: -1 marks a saved counter, any other value is
   the number of a saved register.  */
#define POP_FAILURE_REG_OR_COUNT()                                      \
do {                                                                    \
  long pfreg = POP_FAILURE_INT ();                                      \
  if (pfreg != -1)                                                      \
    {                                                                   \
      /* It's a register: the end was pushed last, so it comes off      \
         first.  */                                                     \
      regend[pfreg] = POP_FAILURE_POINTER ();                           \
      regstart[pfreg] = POP_FAILURE_POINTER ();                         \
      DEBUG_PRINT ("     Pop reg %ld (spanning %p -> %p)\n",            \
                   pfreg, regstart[pfreg], regend[pfreg]);              \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      /* It's a counter.  */                                            \
      /* Here, we discard `const', making re_match non-reentrant.  */   \
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER ();     \
      pfreg = POP_FAILURE_INT ();                                       \
      STORE_NUMBER (ptr, pfreg);                                        \
      DEBUG_PRINT ("     Pop counter %p = %ld\n", ptr, pfreg);          \
    }                                                                   \
} while (0)
1496
/* Check that we are not stuck in an infinite loop: walk the chain of
   failure frames whose saved string position is STRING_PLACE (or
   null) and set `cycle' to 1 if one of them saved the pattern
   position PAT_CUR.  Assumes the variables `fail_stack' and `cycle',
   plus `bufp' for the assertion.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)                      \
do {                                                                    \
  ssize_t failure = TOP_FAILURE_HANDLE ();                              \
  /* Check for infinite matching loops */                               \
  while (failure > 0                                                    \
         && (FAILURE_STR (failure) == (string_place)                    \
             || FAILURE_STR (failure) == NULL))                         \
    {                                                                   \
      assert (FAILURE_PAT (failure) >= bufp->buffer                     \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
      if (FAILURE_PAT (failure) == (pat_cur))                           \
        {                                                               \
          cycle = 1;                                                    \
          break;                                                        \
        }                                                               \
      DEBUG_PRINT ("  Other pattern: %p\n", FAILURE_PAT (failure));     \
      failure = NEXT_FAILURE_HANDLE(failure);                           \
    }                                                                   \
  /* FAILURE is 0 once the chain is exhausted; FAILURE_STR would then   \
     index below the stack, so print a null pointer instead.  */        \
  DEBUG_PRINT ("  Other string: %p\n",                                  \
               failure > 0 ? FAILURE_STR (failure) : NULL);             \
} while (0)
1518
/* Push the information about the state we will need
   if we ever fail back to it: the current frame index, the string
   position STRING_PLACE and the pattern position PATTERN, in that
   order.  The frame index is then moved past the new frame.

   Requires the variable `fail_stack'; debugging builds also use
   `nfailure_points_pushed', `bufp', `pend', `string1', `size1',
   `string2' and `size2'.  GROW_FAIL_STACK requires `destination' be
   declared.

   Does `return -2' (through ENSURE_FAIL_STACK) if it runs out of
   memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place)                       \
do {                                                                    \
  char *destination;                                                    \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_pushed++);                           \
  DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n");                              \
  /* The stack offsets are size_t, hence `%zu'.  */                     \
  DEBUG_PRINT ("  Before push, next avail: %zu\n", (fail_stack).avail); \
  DEBUG_PRINT ("                        size: %zu\n", (fail_stack).size);\
                                                                        \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);                                 \
                                                                        \
  DEBUG_PRINT ("\n");                                                   \
                                                                        \
  DEBUG_PRINT ("  Push frame index: %zu\n", fail_stack.frame);          \
  PUSH_FAILURE_INT (fail_stack.frame);                                  \
                                                                        \
  DEBUG_PRINT ("  Push string %p: `", string_place);                    \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT ("'\n");                                                  \
  PUSH_FAILURE_POINTER (string_place);                                  \
                                                                        \
  DEBUG_PRINT ("  Push pattern %p: ", pattern);                         \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);                   \
  PUSH_FAILURE_POINTER (pattern);                                       \
                                                                        \
  /* Close the frame by moving the frame pointer past it.  */           \
  fail_stack.frame = fail_stack.avail;                                  \
} while (0)
1558
1559 /* Estimate the size of data pushed by a typical failure stack entry.
1560    An estimate is all we need, because all we use this for
1561    is to choose a limit for how big to make the failure stack.  */
1562 /* BEWARE, the value `20' is also hard-coded in emacs.c:main(), so the
   two have to be changed together.  */
1563 #define TYPICAL_FAILURE_SIZE 20
1564
1565 /* How many items can still be added to the stack without overflowing it.
   Computed as size minus avail of the `fail_stack' variable that is in
   scope wherever the macro is expanded.  */
1566 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1567
1568
1569 /* Pops what PUSH_FAILURE_POINT pushes.
1570
1571    We restore into the parameters, both of which should be lvalues:
1572      STR -- the saved data position.
1573      PAT -- the saved pattern position.
1574    Entries sitting above the frame are removed first, one
   POP_FAILURE_REG_OR_COUNT call at a time (that macro is defined
   elsewhere); the saved frame index then goes back into
   fail_stack.frame.
1575
1576    Also assumes the variables `fail_stack' and (if debugging), `bufp',
1577    `pend', `string1', `size1', `string2', and `size2'.  */
1578
1579 #define POP_FAILURE_POINT(str, pat)                                     \
1580 do {                                                                    \
1581   assert (!FAIL_STACK_EMPTY ());                                        \
1582                                                                         \
1583   /* Trace the stack state before anything is popped.  */               \
1584   DEBUG_PRINT ("POP_FAILURE_POINT:\n");                                 \
1585   DEBUG_PRINT ("  Before pop, next avail: %zd\n", fail_stack.avail);    \
1586   DEBUG_PRINT ("                     size: %zd\n", fail_stack.size);    \
1587                                                                         \
1588   /* Pop the saved registers.  */                                       \
1589   while (fail_stack.frame < fail_stack.avail)                           \
1590     POP_FAILURE_REG_OR_COUNT ();                                        \
1591                                                                         \
1592   pat = POP_FAILURE_POINTER ();                                         \
1593   DEBUG_PRINT ("  Popping pattern %p: ", pat);                          \
1594   DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
1595                                                                         \
1596   /* A saved NULL string location is said to come from an               \
1597      on_failure_keep_string_jump opcode.  NOTE(review): this macro      \
1598      stores whatever was saved, NULL included, straight into STR;       \
     keeping the current position instead is left to the caller --      \
     confirm at the call sites.  */                                     \
1599   str = POP_FAILURE_POINTER ();                                         \
1600   DEBUG_PRINT ("  Popping string %p: `", str);                          \
1601   DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
1602   DEBUG_PRINT ("'\n");                                                  \
1603                                                                         \
1604   fail_stack.frame = POP_FAILURE_INT ();                                \
1605   DEBUG_PRINT ("  Popping  frame index: %zd\n", fail_stack.frame);      \
1606   /* Sanity-check the restored indices.  */                             \
1607   assert (fail_stack.avail >= 0);                                       \
1608   assert (fail_stack.frame <= fail_stack.avail);                        \
1609                                                                         \
1610   DEBUG_STATEMENT (nfailure_points_popped++);                           \
1611 } while (0) /* POP_FAILURE_POINT */
1612
1613
1614 \f
1615 /* Registers are set to a sentinel when they haven't yet matched.
   The sentinel is NULL; this tests whether E still holds it.  */
1616 #define REG_UNSET(e) ((e) == NULL)
1617 \f
1618 /* Subroutine declarations and macros for regex_compile.
   Everything declared here is static, i.e. private to this file, and
   only the prototypes appear at this point.  store_op1, store_op2,
   insert_op1 and insert_op2 are what the STORE_JUMP, STORE_JUMP2,
   INSERT_JUMP and INSERT_JUMP2 macros below expand to.  */
1619
1620 static reg_errcode_t regex_compile (re_char *pattern, size_t size,
1621                                     reg_syntax_t syntax,
1622                                     struct re_pattern_buffer *bufp);
1623 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1624 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1625 static void insert_op1 (re_opcode_t op, unsigned char *loc,
1626                         int arg, unsigned char *end);
1627 static void insert_op2 (re_opcode_t op, unsigned char *loc,
1628                         int arg1, int arg2, unsigned char *end);
1629 static boolean at_begline_loc_p (re_char *pattern, re_char *p,
1630                                  reg_syntax_t syntax);
1631 static boolean at_endline_loc_p (re_char *p, re_char *pend,
1632                                  reg_syntax_t syntax);
1633 static re_char *skip_one_char (re_char *p);
1634 static int analyse_first (re_char *p, re_char *pend,
1635                           char *fastmap, const int multibyte);
1636
1637 /* Fetch the next character in the uncompiled pattern, with no
1638    translation.  The character is stored in C and `p' is advanced by
   the length that RE_STRING_CHAR_AND_LENGTH reports.  If the pattern
   is already exhausted (p == pend) this returns REG_EEND from the
   enclosing function instead.  Uses `p', `pend' and `multibyte' from
   the scope in which it is expanded.  */
1639 #define PATFETCH(c)                                                     \
1640   do {                                                                  \
1641     int len;                                                            \
1642     if (p == pend) return REG_EEND;                                     \
1643     c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte);                  \
1644     p += len;                                                           \
1645   } while (0)
1646
1647
1648 /* If RE_TRANSLATE_P (translate) holds, yield RE_TRANSLATE (translate, D),
1649    else just D.  The definition is skipped when TRANSLATE already exists.
1650    NOTE(review): no cast of the subscript is visible in this definition;
1651    if one is still applied (to cope with `char *' data and to keep a
   character used as a subscript unsigned) it must happen inside
   RE_TRANSLATE, which is defined elsewhere -- confirm.  */
1652 #ifndef TRANSLATE
1653 # define TRANSLATE(d) \
1654   (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
1655 #endif
1656
1657
1658 /* Macros for outputting the compiled pattern into `buffer'.  */
1659
1660 /* If the buffer isn't allocated when it comes in, use this as its
   initial size (presumably in bytes, like bufp->allocated -- the use
   site is not in this part of the file).  */
1661 #define INIT_BUF_SIZE  32
1662
1663 /* Make sure we have at least N more bytes of space in buffer.
   EXTEND_BUFFER is repeated until the space fits, and it returns from
   the enclosing function when the buffer cannot grow.  Uses `b' (the
   current write position) and `bufp' from the expanding scope.  */
1664 #define GET_BUFFER_SPACE(n)                                             \
1665     while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated)         \
1666       EXTEND_BUFFER ()
1667
1668 /* Make sure we have one more byte of buffer space and then add C to it.
   C is truncated to unsigned char and `b' ends up one byte further on.  */
1669 #define BUF_PUSH(c)                                                     \
1670   do {                                                                  \
1671     GET_BUFFER_SPACE (1);                                               \
1672     *b++ = (unsigned char) (c);                                         \
1673   } while (0)
1674
1675
1676 /* Ensure we have two more bytes of buffer space and then append C1 and C2,
   in that order, each truncated to unsigned char.  The space for both
   bytes is reserved up front, before either is written.  */
1677 #define BUF_PUSH_2(c1, c2)                                              \
1678   do {                                                                  \
1679     GET_BUFFER_SPACE (2);                                               \
1680     *b++ = (unsigned char) (c1);                                        \
1681     *b++ = (unsigned char) (c2);                                        \
1682   } while (0)
1683
1684
1685 /* Store a jump with opcode OP at LOC to location TO.  We store a
1686    relative address offset by the three bytes the jump itself occupies,
   i.e. TO - LOC - 3 is what gets handed to store_op1.  LOC is
   evaluated twice.  */
1687 #define STORE_JUMP(op, loc, to) \
1688   store_op1 (op, loc, (to) - (loc) - 3)
1689
1690 /* Likewise, for a two-argument jump: the relative offset TO - LOC - 3
   is passed to store_op2 as its first argument and ARG as its second.  */
1691 #define STORE_JUMP2(op, loc, to, arg) \
1692   store_op2 (op, loc, (to) - (loc) - 3, arg)
1693
1694 /* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end;
   it is passed to insert_op1 as the END argument.  */
1695 #define INSERT_JUMP(op, loc, to) \
1696   insert_op1 (op, loc, (to) - (loc) - 3, b)
1697
1698 /* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end;
   it is passed to insert_op2 as the END argument.  */
1699 #define INSERT_JUMP2(op, loc, to, arg) \
1700   insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1701
1702
1703 /* This is not an arbitrary limit: the arguments which represent offsets
1704    into the pattern are two bytes long.  So if 2^15 bytes turns out to
1705    be too small, many things would have to change.  (Presumably 2^15
   rather than 2^16 because those offsets are signed -- confirm against
   the code that stores and extracts them.)  */
1706 # define MAX_BUF_SIZE (1L << 15)
1707
1708 /* EXTEND_BUFFER doubles bufp->allocated (capped at MAX_BUF_SIZE),
1709    reallocates bufp->buffer through RETALLOC and, if the block moved,
1710    resets the pointers that pointed into the old block -- b, begalt and,
1711    when non-null, fixup_alt_jump, laststart and pending_exact -- to the
   corresponding places in the new one.  It returns from the enclosing
   function with REG_ESIZE if the buffer is already MAX_BUF_SIZE, or
   with REG_ESPACE if bufp->buffer is null after RETALLOC.
   NOTE(review): on that REG_ESPACE path old_buffer is not freed and
   bufp->allocated has already been doubled; whether the old block
   leaks depends on RETALLOC, which is defined elsewhere -- confirm.
   Under __BOUNDED_POINTERS__ the helper macros also update the low
   and high bound recorded for each pointer; otherwise
   ELSE_EXTEND_BUFFER_HIGH_BOUND expands to nothing.  */
1712 #if __BOUNDED_POINTERS__
1713 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1714 # define MOVE_BUFFER_POINTER(P)                                 \
1715   (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer),     \
1716    SET_HIGH_BOUND (P),                                          \
1717    __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
1718 # define ELSE_EXTEND_BUFFER_HIGH_BOUND          \
1719   else                                          \
1720     {                                           \
1721       SET_HIGH_BOUND (b);                       \
1722       SET_HIGH_BOUND (begalt);                  \
1723       if (fixup_alt_jump)                       \
1724         SET_HIGH_BOUND (fixup_alt_jump);        \
1725       if (laststart)                            \
1726         SET_HIGH_BOUND (laststart);             \
1727       if (pending_exact)                        \
1728         SET_HIGH_BOUND (pending_exact);         \
1729     }
1730 #else
1731 # define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
1732 # define ELSE_EXTEND_BUFFER_HIGH_BOUND
1733 #endif
1734 #define EXTEND_BUFFER()                                                 \
1735   do {                                                                  \
1736     unsigned char *old_buffer = bufp->buffer;                           \
1737     if (bufp->allocated == MAX_BUF_SIZE)                                \
1738       return REG_ESIZE;                                                 \
1739     bufp->allocated <<= 1;                                              \
1740     if (bufp->allocated > MAX_BUF_SIZE)                                 \
1741       bufp->allocated = MAX_BUF_SIZE;                                   \
1742     RETALLOC (bufp->buffer, bufp->allocated, unsigned char);            \
1743     if (bufp->buffer == NULL)                                           \
1744       return REG_ESPACE;                                                \
1745     /* If the buffer moved, move all the pointers into it.  */          \
1746     if (old_buffer != bufp->buffer)                                     \
1747       {                                                                 \
1748         unsigned char *new_buffer = bufp->buffer;                       \
1749         MOVE_BUFFER_POINTER (b);                                        \
1750         MOVE_BUFFER_POINTER (begalt);                                   \
1751         if (fixup_alt_jump)                                             \
1752           MOVE_BUFFER_POINTER (fixup_alt_jump);                         \
1753         if (laststart)                                                  \
1754           MOVE_BUFFER_POINTER (laststart);                              \
1755         if (pending_exact)                                              \
1756           MOVE_BUFFER_POINTER (pending_exact);                          \
1757       }                                                                 \
1758     ELSE_EXTEND_BUFFER_HIGH_BOUND                                       \
1759   } while (0)
1760
1761
1762 /* Since we have one byte reserved for the register number argument to
1763    {start,stop}_memory, the maximum number of groups we can report
1764    things about is what fits in that byte, namely 255.  */
1765 #define MAX_REGNUM 255
1766
1767 /* But patterns can have more than `MAX_REGNUM' registers.  We just
1768    ignore the excess.  The counter type is therefore a plain int
   rather than a single byte.  */
1769 typedef int regnum_t;
1770
1771
1772 /* Macros for the compile stack.  */
1773
1774 /* Since offsets can go either forwards or backwards, this type needs to
1775    be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.  */
1776 /* int may be not enough when sizeof(int) == 2, hence long.  */
1777 typedef long pattern_offset_t;
1778
/* One entry of the compile stack.  The first three members share their
   names with the buffer pointers that EXTEND_BUFFER relocates (begalt,
   fixup_alt_jump, laststart); presumably they hold those positions as
   offsets so that they stay valid when the buffer moves -- confirm in
   regex_compile.  */
1779 typedef struct
1780 {
1781   pattern_offset_t begalt_offset;
1782   pattern_offset_t fixup_alt_jump;
1783   pattern_offset_t laststart_offset;
1784   regnum_t regnum;
1785 } compile_stack_elt_t;
1786
1787
/* The compile stack itself: an array of entries plus two counters.
   COMPILE_STACK_FULL compares `avail' against `size', so `size' is the
   number of entries the array can hold.  */
1788 typedef struct
1789 {
1790   compile_stack_elt_t *stack;
1791   size_t size;
1792   size_t avail;                 /* Offset of next open position.  */
1793 } compile_stack_type;
1794
1795
/* Initial size of the compile stack (presumably a count of entries --
   the allocation is not in this part of the file).  */
1796 #define INIT_COMPILE_STACK_SIZE 32
1797
/* Both tests read the `compile_stack' variable of the expanding scope.  */
1798 #define COMPILE_STACK_EMPTY  (compile_stack.avail == 0)
1799 #define COMPILE_STACK_FULL  (compile_stack.avail == compile_stack.size)
1800
1801 /* The next available element, i.e. the slot at index `avail' that a
   push would fill -- not the most recently pushed entry.  */
1802 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1803
1804 /* Explicit quit checking is needed for Emacs, which uses polling to
1805    process input events.  In an Emacs build the macro runs QUIT when
   `immediate_quit' is set; otherwise it expands to a no-op expression.  */
1806 #ifdef emacs
1807 # define IMMEDIATE_QUIT_CHECK                   \
1808     do {                                        \
1809       if (immediate_quit) QUIT;                 \
1810     } while (0)
1811 #else
1812 # define IMMEDIATE_QUIT_CHECK    ((void)0)
1813 #endif
1814 \f
1815 /* Structure to manage work area for range table.  Ranges are stored
   in `table' as consecutive (start, end) pairs of ints; see
   SET_RANGE_TABLE_WORK_AREA.  */
1816 struct range_table_work_area
1817 {
1818   int *table;                   /* actual work area.  */
1819   int allocated;                /* allocated size for work area in bytes.  */
1820   int used;                     /* number of ints of `table' in use.  */
1821   int bits;                     /* flags OR-ed in to record character classes */
1822 };
1823
1824 #ifdef emacs
1825
1826 /* Make sure that WORK_AREA can hold N more table entries (ints).
1827    WORK_AREA must be a struct range_table_work_area lvalue, not a
1828    pointer: its members are reached with `.' and its address is taken.
1829    If it can't get the space, it returns from the surrounding function
   with REG_ESPACE.  Only one extension step is attempted, so N must
   not exceed what a single call to extend_range_table_work_area adds.
   SET_RANGE_TABLE_WORK_AREA below is a user of this macro.  */
1830
1831 #define EXTEND_RANGE_TABLE(work_area, n)                                \
1832   do {                                                                  \
1833     if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1834       {                                                                 \
1835         extend_range_table_work_area (&work_area);                      \
1836         if ((work_area).table == 0)                                     \
1837           return (REG_ESPACE);                                          \
1838       }                                                                 \
1839   } while (0)
1840
/* OR the flag(s) BIT into WORK_AREA's `bits' member.  The expansion is
   not wrapped in parentheses, so use it as a statement only.  */
1841 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)           \
1842   (work_area).bits |= (bit)
1843
1844 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA: room for two more
   ints is ensured first, then the two values are appended in that
   order and `used' grows by two.  Because of EXTEND_RANGE_TABLE this
   may return REG_ESPACE from the surrounding function.  */
1845 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)    \
1846   do {                                                                  \
1847     EXTEND_RANGE_TABLE ((work_area), 2);                                \
1848     (work_area).table[(work_area).used++] = (range_start);              \
1849     (work_area).table[(work_area).used++] = (range_end);                \
1850   } while (0)
1851
1852 #endif /* emacs */
1853
1854 /* Free allocated memory for WORK_AREA.  Nothing is done when the table
   pointer is null, and the pointer itself is left unchanged afterwards,
   so it must not be used again without being reassigned.  */
1855 #define FREE_RANGE_TABLE_WORK_AREA(work_area)   \
1856   do {                                          \
1857     if ((work_area).table)                      \
1858       free ((work_area).table);                 \
1859   } while (0)
1860
/* Accessors for a struct range_table_work_area lvalue.  CLEAR_... resets
   both the entry count and the class bits but keeps the allocation;
   the others yield the entry count, the class bits and the I-th int of
   the table respectively (the last two forms are also assignable).  */
1861 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1862 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1863 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1864 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1865
1866 /* Bits used to implement the multibyte-part of the various character classes
1867    such as [:alnum:] in a charset's range table.  Each is a distinct
   single bit, so they can be OR-ed together; re_wctype_to_bit maps a
   character class to one of them (or to 0).  */
1868 #define BIT_WORD        0x1
1869 #define BIT_LOWER       0x2
1870 #define BIT_PUNCT       0x4
1871 #define BIT_SPACE       0x8
1872 #define BIT_UPPER       0x10
1873 #define BIT_MULTIBYTE   0x20
1874 \f
1875
1876 /* Set the bit for character C in a list.  The list is the bitmap that
   `b' points at in the expanding scope: byte C / BYTEWIDTH, bit
   C % BYTEWIDTH.  C is evaluated twice.  */
1877 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1878
1879
1880 #ifdef emacs
1881
1882 /* Store characters in the range FROM to TO in the bitmap at B (for
1883    ASCII and unibyte characters) and WORK_AREA (for multibyte
1884    characters) while translating them and paying attention to the
1885    continuity of translated characters.
1886
1887    Implementation note: It is better to implement these fairly big
1888    macros by a function, but it's not that easy because macros called
1889    in this macro assume various local variables already declared.  */
1890
1891 /* Both FROM and TO are ASCII characters.
   Each character is translated.  A translation that is not ASCII is
   recorded in WORK_AREA as a one-character range and then converted
   with RE_CHAR_TO_UNIBYTE; if that yields a negative value the
   untranslated character is used instead.  The bitmap bit of the
   resulting character is set.  FROM and TO are evaluated more than
   once, and the macro may return REG_ESPACE from the surrounding
   function through SET_RANGE_TABLE_WORK_AREA.  */
1892
1893 #define SETUP_ASCII_RANGE(work_area, FROM, TO)                  \
1894   do {                                                          \
1895     int C0, C1;                                                 \
1896                                                                 \
1897     for (C0 = (FROM); C0 <= (TO); C0++)                         \
1898       {                                                         \
1899         C1 = TRANSLATE (C0);                                    \
1900         if (! ASCII_CHAR_P (C1))                                \
1901           {                                                     \
1902             SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);    \
1903             if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0)             \
1904               C1 = C0;                                          \
1905           }                                                     \
1906         SET_LIST_BIT (C1);                                      \
1907       }                                                         \
1908   } while (0)
1909
1910
1911 /* Both FROM and TO are unibyte characters (0x80..0xFF).
   Each character C0 is converted with RE_CHAR_TO_MULTIBYTE.  If the
   result satisfies CHAR_BYTE8_P, only the bitmap bit of C0 is set.
   Otherwise the multibyte form is translated to C2; the bitmap bit
   set is that of C2's unibyte form, or of C0 when the translation
   changed nothing or has no unibyte form.  C2 is then folded into the
   ranges this invocation has added to WORK_AREA (those at index USED
   and above): a range that already contains C2 is left alone, a range
   that C2 touches at either end is widened by one, and if neither is
   found a new one-character range is appended.  */
1912
1913 #define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)                               \
1914   do {                                                                         \
1915     int C0, C1, C2, I;                                                         \
1916     int USED = RANGE_TABLE_WORK_USED (work_area);                              \
1917                                                                                \
1918     for (C0 = (FROM); C0 <= (TO); C0++)                                        \
1919       {                                                                        \
1920         C1 = RE_CHAR_TO_MULTIBYTE (C0);                                        \
1921         if (CHAR_BYTE8_P (C1))                                                 \
1922           SET_LIST_BIT (C0);                                                   \
1923         else                                                                   \
1924           {                                                                    \
1925             C2 = TRANSLATE (C1);                                               \
1926             if (C2 == C1                                                       \
1927                 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0)                         \
1928               C1 = C0;                                                         \
1929             SET_LIST_BIT (C1);                                                 \
1930             for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1931               {                                                                \
1932                 int from = RANGE_TABLE_WORK_ELT (work_area, I);                \
1933                 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
1934                                                                                \
1935                 if (C2 >= from - 1 && C2 <= to + 1)                            \
1936                   {                                                            \
1937                     if (C2 == from - 1)                                        \
1938                       RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
1939                     else if (C2 == to + 1)                                     \
1940                       RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
1941                     break;                                                     \
1942                   }                                                            \
1943               }                                                                \
1944             if (I < USED)                                                      \
1945               SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);                 \
1946           }                                                                    \
1947       }                                                                        \
1948   } while (0)
1949
1950
1951 /* Both FROM and TO are multibyte characters.
   The range (FROM, TO) itself is recorded in WORK_AREA first.  Then
   every character C0 in it is translated to C1.  If C1 has a unibyte
   form -- or, failing that, if the translation changed the character
   and C0 has one -- the bitmap bit of that unibyte form is set.  A C1
   that lies inside FROM..TO needs nothing more.  Any other C1 is
   folded into the ranges this invocation has added (index USED and
   above, which includes the FROM..TO pair): a range containing it is
   left alone, a range it touches at either end is widened by one, and
   otherwise a new one-character range is appended.  */
1952
1953 #define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)                         \
1954   do {                                                                     \
1955     int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area);           \
1956                                                                            \
1957     SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));                 \
1958     for (C0 = (FROM); C0 <= (TO); C0++)                                    \
1959       {                                                                    \
1960         C1 = TRANSLATE (C0);                                               \
1961         if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0                            \
1962             || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))          \
1963           SET_LIST_BIT (C2);                                               \
1964         if (C1 >= (FROM) && C1 <= (TO))                                    \
1965           continue;                                                        \
1966         for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
1967           {                                                                \
1968             int from = RANGE_TABLE_WORK_ELT (work_area, I);                \
1969             int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
1970                                                                            \
1971             if (C1 >= from - 1 && C1 <= to + 1)                            \
1972               {                                                            \
1973                 if (C1 == from - 1)                                        \
1974                   RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
1975                 else if (C1 == to + 1)                                     \
1976                   RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
1977                 break;                                                     \
1978               }                                                            \
1979           }                                                                \
1980         if (I < USED)                                                      \
1981           SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);                 \
1982       }                                                                    \
1983   } while (0)
1984
1985 #endif /* emacs */
1986
1987 /* Get the next unsigned number in the uncompiled pattern.
   Decimal digits are accumulated into NUM.  A negative NUM is reset
   to 0 when the first digit is seen, so NUM keeps its incoming value
   if no digit follows.  On normal exit `c' holds the first non-digit
   character that was fetched.  Leaves through FREE_STACK_RETURN with
   REG_EBRACE if the pattern ends before a non-digit is found, or with
   REG_BADBR if appending the next digit would take NUM above
   RE_DUP_MAX.  Uses `p', `pend' and `c' from the expanding scope.  */
1988 #define GET_INTERVAL_COUNT(num)                                 \
1989   do {                                                                  \
1990     if (p == pend)                                                      \
1991       FREE_STACK_RETURN (REG_EBRACE);                                   \
1992     else                                                                \
1993       {                                                                 \
1994         PATFETCH (c);                                                   \
1995         while ('0' <= c && c <= '9')                                    \
1996           {                                                             \
1997             if (num < 0)                                                \
1998               num = 0;                                                  \
1999             if (RE_DUP_MAX / 10 - (RE_DUP_MAX % 10 < c - '0') < num)    \
2000               FREE_STACK_RETURN (REG_BADBR);                            \
2001             num = num * 10 + c - '0';                                   \
2002             if (p == pend)                                              \
2003               FREE_STACK_RETURN (REG_EBRACE);                           \
2004             PATFETCH (c);                                               \
2005           }                                                             \
2006       }                                                                 \
2007   } while (0)
2008 \f
2009 #if ! WIDE_CHAR_SUPPORT
2010
2011 /* Map a string to the char class it names (if any).  */
2012 re_wctype_t
2013 re_wctype (const_re_char *str)
2014 {
2015   const char *string = (const char *) str;
2016   if      (STREQ (string, "alnum"))     return RECC_ALNUM;
2017   else if (STREQ (string, "alpha"))     return RECC_ALPHA;
2018   else if (STREQ (string, "word"))      return RECC_WORD;
2019   else if (STREQ (string, "ascii"))     return RECC_ASCII;
2020   else if (STREQ (string, "nonascii"))  return RECC_NONASCII;
2021   else if (STREQ (string, "graph"))     return RECC_GRAPH;
2022   else if (STREQ (string, "lower"))     return RECC_LOWER;
2023   else if (STREQ (string, "print"))     return RECC_PRINT;
2024   else if (STREQ (string, "punct"))     return RECC_PUNCT;
2025   else if (STREQ (string, "space"))     return RECC_SPACE;
2026   else if (STREQ (string, "upper"))     return RECC_UPPER;
2027   else if (STREQ (string, "unibyte"))   return RECC_UNIBYTE;
2028   else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2029   else if (STREQ (string, "digit"))     return RECC_DIGIT;
2030   else if (STREQ (string, "xdigit"))    return RECC_XDIGIT;
2031   else if (STREQ (string, "cntrl"))     return RECC_CNTRL;
2032   else if (STREQ (string, "blank"))     return RECC_BLANK;
2033   else return 0;
2034 }
2035
2036 /* True if CH is in the char class CC.  */
2037 boolean
2038 re_iswctype (int ch, re_wctype_t cc)
2039 {
2040   switch (cc)
2041     {
2042     case RECC_ALNUM: return ISALNUM (ch) != 0;
2043     case RECC_ALPHA: return ISALPHA (ch) != 0;
2044     case RECC_BLANK: return ISBLANK (ch) != 0;
2045     case RECC_CNTRL: return ISCNTRL (ch) != 0;
2046     case RECC_DIGIT: return ISDIGIT (ch) != 0;
2047     case RECC_GRAPH: return ISGRAPH (ch) != 0;
2048     case RECC_LOWER: return ISLOWER (ch) != 0;
2049     case RECC_PRINT: return ISPRINT (ch) != 0;
2050     case RECC_PUNCT: return ISPUNCT (ch) != 0;
2051     case RECC_SPACE: return ISSPACE (ch) != 0;
2052     case RECC_UPPER: return ISUPPER (ch) != 0;
2053     case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2054     case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
2055     case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2056     case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
2057     case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2058     case RECC_WORD: return ISWORD (ch) != 0;
2059     case RECC_ERROR: return false;
2060     default:
2061       abort ();
2062     }
2063 }
2064
2065 /* Return a bit-pattern to use in the range-table bits to match multibyte
2066    chars of class CC.  */
2067 static int
2068 re_wctype_to_bit (re_wctype_t cc)
2069 {
2070   switch (cc)
2071     {
2072     case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
2073     case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2074     case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2075     case RECC_LOWER: return BIT_LOWER;
2076     case RECC_UPPER: return BIT_UPPER;
2077     case RECC_PUNCT: return BIT_PUNCT;
2078     case RECC_SPACE: return BIT_SPACE;
2079     case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2080     case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2081     default:
2082       abort ();
2083     }
2084 }
2085 #endif
2086 \f
2087 /* Filling in the work area of a range.  */
2088
2089 /* Actually extend the space in WORK_AREA.  */
2090
2091 static void
2092 extend_range_table_work_area (struct range_table_work_area *work_area)
2093 {
2094   work_area->allocated += 16 * sizeof (int);
2095   work_area->table = realloc (work_area->table, work_area->allocated);
2096 }
2097
2098 #if 0
2099 #ifdef emacs
2100
2101 /* Carefully find the ranges of codes that are equivalent
2102    under case conversion to the range start..end when passed through
2103    TRANSLATE.  Handle the case where non-letters can come in between
2104    two upper-case letters (which happens in Latin-1).
2105    Also handle the case of groups of more than 2 case-equivalent chars.
2106
2107    The basic method is to look at consecutive characters and see
2108    if they can form a run that can be handled as one.
2109
2110    Returns -1 if successful, REG_ESPACE if ran out of space.  */
2111
2112 static int
2113 set_image_of_range_1 (struct range_table_work_area *work_area,
2114                       re_wchar_t start, re_wchar_t end,
2115                       RE_TRANSLATE_TYPE translate)
2116 {
2117   /* `one_case' indicates a character, or a run of characters,
2118      each of which is an isolate (no case-equivalents).
2119      This includes all ASCII non-letters.
2120
2121      `two_case' indicates a character, or a run of characters,
2122      each of which has two case-equivalent forms.
2123      This includes all ASCII letters.
2124
2125      `strange' indicates a character that has more than one
2126      case-equivalent.  */
2127
2128   enum case_type {one_case, two_case, strange};
2129
2130   /* Describe the run that is in progress,
2131      which the next character can try to extend.
2132      If run_type is strange, that means there really is no run.
2133      If run_type is one_case, then run_start...run_end is the run.
2134      If run_type is two_case, then the run is run_start...run_end,
2135      and the case-equivalents end at run_eqv_end.  */
2136
2137   enum case_type run_type = strange;
2138   int run_start, run_end, run_eqv_end;
2139
2140   Lisp_Object eqv_table;
2141
2142   if (!RE_TRANSLATE_P (translate))
2143     {
2144       EXTEND_RANGE_TABLE (work_area, 2);
2145       work_area->table[work_area->used++] = (start);
2146       work_area->table[work_area->used++] = (end);
2147       return -1;
2148     }
2149
2150   eqv_table = XCHAR_TABLE (translate)->extras[2];
2151
2152   for (; start <= end; start++)
2153     {
2154       enum case_type this_type;
2155       int eqv = RE_TRANSLATE (eqv_table, start);
2156       int minchar, maxchar;
2157
2158       /* Classify this character */
2159       if (eqv == start)
2160         this_type = one_case;
2161       else if (RE_TRANSLATE (eqv_table, eqv) == start)
2162         this_type = two_case;
2163       else
2164         this_type = strange;
2165
2166       if (start < eqv)
2167         minchar = start, maxchar = eqv;
2168       else
2169         minchar = eqv, maxchar = start;
2170
2171       /* Can this character extend the run in progress?  */
2172       if (this_type == strange || this_type != run_type
2173           || !(minchar == run_end + 1
2174                && (run_type == two_case
2175                    ? maxchar == run_eqv_end + 1 : 1)))
2176         {
2177           /* No, end the run.
2178              Record each of its equivalent ranges.  */
2179           if (run_type == one_case)
2180             {
2181               EXTEND_RANGE_TABLE (work_area, 2);
2182               work_area->table[work_area->used++] = run_start;
2183               work_area->table[work_area->used++] = run_end;
2184             }
2185           else if (run_type == two_case)
2186             {
2187               EXTEND_RANGE_TABLE (work_area, 4);
2188               work_area->table[work_area->used++] = run_start;
2189               work_area->table[work_area->used++] = run_end;
2190               work_area->table[work_area->used++]
2191                 = RE_TRANSLATE (eqv_table, run_start);
2192               work_area->table[work_area->used++]
2193                 = RE_TRANSLATE (eqv_table, run_end);
2194             }
2195           run_type = strange;
2196         }
2197
2198       if (this_type == strange)
2199         {
2200           /* For a strange character, add each of its equivalents, one
2201              by one.  Don't start a range.  */
2202           do
2203             {
2204               EXTEND_RANGE_TABLE (work_area, 2);
2205               work_area->table[work_area->used++] = eqv;
2206               work_area->table[work_area->used++] = eqv;
2207               eqv = RE_TRANSLATE (eqv_table, eqv);
2208             }
2209           while (eqv != start);
2210         }
2211
2212       /* Add this char to the run, or start a new run.  */
2213       else if (run_type == strange)
2214         {
2215           /* Initialize a new range.  */
2216           run_type = this_type;
2217           run_start = start;
2218           run_end = start;
2219           run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2220         }
2221       else
2222         {
2223           /* Extend a running range.  */
2224           run_end = minchar;
2225           run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2226         }
2227     }
2228
2229   /* If a run is still in progress at the end, finish it now
2230      by recording its equivalent ranges.  */
2231   if (run_type == one_case)
2232     {
2233       EXTEND_RANGE_TABLE (work_area, 2);
2234       work_area->table[work_area->used++] = run_start;
2235       work_area->table[work_area->used++] = run_end;
2236     }
2237   else if (run_type == two_case)
2238     {
2239       EXTEND_RANGE_TABLE (work_area, 4);
2240       work_area->table[work_area->used++] = run_start;
2241       work_area->table[work_area->used++] = run_end;
2242       work_area->table[work_area->used++]
2243         = RE_TRANSLATE (eqv_table, run_start);
2244       work_area->table[work_area->used++]
2245         = RE_TRANSLATE (eqv_table, run_end);
2246     }
2247
2248   return -1;
2249 }
2250
2251 #endif /* emacs */
2252
2253 /* Record the image of the range start..end when passed through
2254    TRANSLATE.  This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2255    and is not even necessarily contiguous.
2256    Normally we approximate it with the smallest contiguous range that contains
2257    all the chars we need.  However, for Latin-1 we go to extra effort
2258    to do a better job.
2259
2260    This function is not called for ASCII ranges.
2261
2262    Returns -1 if successful, REG_ESPACE if ran out of space.  */
2263
2264 static int
2265 set_image_of_range (struct range_table_work_area *work_area,
2266                     re_wchar_t start, re_wchar_t end,
2267                     RE_TRANSLATE_TYPE translate)
2268 {
2269   re_wchar_t cmin, cmax;
2270
2271 #ifdef emacs
2272   /* For Latin-1 ranges, use set_image_of_range_1
2273      to get proper handling of ranges that include letters and nonletters.
2274      For a range that includes the whole of Latin-1, this is not necessary.
2275      For other character sets, we don't bother to get this right.  */
2276   if (RE_TRANSLATE_P (translate) && start < 04400
2277       && !(start < 04200 && end >= 04377))
2278     {
2279       int newend;
2280       int tem;
2281       newend = end;
2282       if (newend > 04377)
2283         newend = 04377;
2284       tem = set_image_of_range_1 (work_area, start, newend, translate);
2285       if (tem > 0)
2286         return tem;
2287
2288       start = 04400;
2289       if (end < 04400)
2290         return -1;
2291     }
2292 #endif
2293
2294   EXTEND_RANGE_TABLE (work_area, 2);
2295   work_area->table[work_area->used++] = (start);
2296   work_area->table[work_area->used++] = (end);
2297
2298   cmin = -1, cmax = -1;
2299
2300   if (RE_TRANSLATE_P (translate))
2301     {
2302       int ch;
2303
2304       for (ch = start; ch <= end; ch++)
2305         {
2306           re_wchar_t c = TRANSLATE (ch);
2307           if (! (start <= c && c <= end))
2308             {
2309               if (cmin == -1)
2310                 cmin = c, cmax = c;
2311               else
2312                 {
2313                   cmin = MIN (cmin, c);
2314                   cmax = MAX (cmax, c);
2315                 }
2316             }
2317         }
2318
2319       if (cmin != -1)
2320         {
2321           EXTEND_RANGE_TABLE (work_area, 2);
2322           work_area->table[work_area->used++] = (cmin);
2323           work_area->table[work_area->used++] = (cmax);
2324         }
2325     }
2326
2327   return -1;
2328 }
2329 #endif  /* 0 */
2330 \f
2331 #ifndef MATCH_MAY_ALLOCATE
2332
2333 /* If we cannot allocate large objects within re_match_2_internal,
2334    we make the fail stack and register vectors global.
2335    The fail stack, we grow to the maximum size when a regexp
2336    is compiled.
2337    The register vectors, we adjust in size each time we
2338    compile a regexp, according to the number of registers it needs.  */
2339
2340 static fail_stack_type fail_stack;
2341
2342 /* Size with which the following vectors are currently allocated.
2343    That is so we can make them bigger as needed,
2344    but never make them smaller.  */
2345 static int regs_allocated_size;
2346
2347 static re_char **     regstart, **     regend;
2348 static re_char **best_regstart, **best_regend;
2349
2350 /* Make the register vectors big enough for NUM_REGS registers,
2351    but don't make them smaller.  */
2352
2353 static
2354 regex_grow_registers (int num_regs)
2355 {
2356   if (num_regs > regs_allocated_size)
2357     {
2358       RETALLOC_IF (regstart,     num_regs, re_char *);
2359       RETALLOC_IF (regend,       num_regs, re_char *);
2360       RETALLOC_IF (best_regstart, num_regs, re_char *);
2361       RETALLOC_IF (best_regend,  num_regs, re_char *);
2362
2363       regs_allocated_size = num_regs;
2364     }
2365 }
2366
2367 #endif /* not MATCH_MAY_ALLOCATE */
2368 \f
2369 static boolean group_in_compile_stack (compile_stack_type compile_stack,
2370                                        regnum_t regnum);
2371
2372 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
   Returns one of the error codes defined in `regex.h', or zero for success.
2374
2375    Assumes the `allocated' (and perhaps `buffer') and `translate'
2376    fields are set in BUFP on entry.
2377
2378    If it succeeds, results are put in BUFP (if it returns an error, the
2379    contents of BUFP are undefined):
2380      `buffer' is the compiled pattern;
2381      `syntax' is set to SYNTAX;
2382      `used' is set to the length of the compiled pattern;
2383      `fastmap_accurate' is zero;
2384      `re_nsub' is the number of subexpressions in PATTERN;
2385      `not_bol' and `not_eol' are zero;
2386
2387    The `fastmap' field is neither examined nor set.  */
2388
/* If the previous alternative left a forward jump to be filled in
   (fixup_alt_jump is non-null), store a `jump' at that location whose
   target is the current end of the buffer, B.  The space for the jump
   has already been allocated.  Expands in terms of the enclosing
   function's `fixup_alt_jump' and `b'.  */
#define FIXUP_ALT_JUMP()                                                \
  do                                                                    \
    {                                                                   \
      if (fixup_alt_jump != NULL)                                       \
        STORE_JUMP (jump, fixup_alt_jump, b);                           \
    }                                                                   \
  while (0)
2396
2397
/* Release the storage we allocated -- the range table work area and
   the compile stack -- and then return VALUE from the enclosing
   function.  Expands in terms of the enclosing function's
   `range_table_work' and `compile_stack'.  */
#define FREE_STACK_RETURN(value)                                        \
  do                                                                    \
    {                                                                   \
      FREE_RANGE_TABLE_WORK_AREA (range_table_work);                    \
      free (compile_stack.stack);                                       \
      return (value);                                                   \
    }                                                                   \
  while (0)
2405
2406 static reg_errcode_t
2407 regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
2408                struct re_pattern_buffer *bufp)
2409 {
2410   /* We fetch characters from PATTERN here.  */
2411   register re_wchar_t c, c1;
2412
2413   /* Points to the end of the buffer, where we should append.  */
2414   register unsigned char *b;
2415
2416   /* Keeps track of unclosed groups.  */
2417   compile_stack_type compile_stack;
2418
2419   /* Points to the current (ending) position in the pattern.  */
2420 #ifdef AIX
2421   /* `const' makes AIX compiler fail.  */
2422   unsigned char *p = pattern;
2423 #else
2424   re_char *p = pattern;
2425 #endif
2426   re_char *pend = pattern + size;
2427
2428   /* How to translate the characters in the pattern.  */
2429   RE_TRANSLATE_TYPE translate = bufp->translate;
2430
2431   /* Address of the count-byte of the most recently inserted `exactn'
2432      command.  This makes it possible to tell if a new exact-match
2433      character can be added to that command or if the character requires
2434      a new `exactn' command.  */
2435   unsigned char *pending_exact = 0;
2436
2437   /* Address of start of the most recently finished expression.
2438      This tells, e.g., postfix * where to find the start of its
2439      operand.  Reset at the beginning of groups and alternatives.  */
2440   unsigned char *laststart = 0;
2441
2442   /* Address of beginning of regexp, or inside of last group.  */
2443   unsigned char *begalt;
2444
2445   /* Place in the uncompiled pattern (i.e., the {) to
2446      which to go back if the interval is invalid.  */
2447   re_char *beg_interval;
2448
2449   /* Address of the place where a forward jump should go to the end of
2450      the containing expression.  Each alternative of an `or' -- except the
2451      last -- ends with a forward jump of this sort.  */
2452   unsigned char *fixup_alt_jump = 0;
2453
2454   /* Work area for range table of charset.  */
2455   struct range_table_work_area range_table_work;
2456
2457   /* If the object matched can contain multibyte characters.  */
2458   const boolean multibyte = RE_MULTIBYTE_P (bufp);
2459
2460   /* Nonzero if we have pushed down into a subpattern.  */
2461   int in_subpattern = 0;
2462
2463   /* These hold the values of p, pattern, and pend from the main
2464      pattern when we have pushed into a subpattern.  */
2465   re_char *main_p IF_LINT (= NULL);
2466   re_char *main_pattern IF_LINT (= NULL);
2467   re_char *main_pend IF_LINT (= NULL);
2468
2469 #ifdef DEBUG
2470   debug++;
2471   DEBUG_PRINT ("\nCompiling pattern: ");
2472   if (debug > 0)
2473     {
2474       unsigned debug_count;
2475
2476       for (debug_count = 0; debug_count < size; debug_count++)
2477         putchar (pattern[debug_count]);
2478       putchar ('\n');
2479     }
2480 #endif /* DEBUG */
2481
2482   /* Initialize the compile stack.  */
2483   compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2484   if (compile_stack.stack == NULL)
2485     return REG_ESPACE;
2486
2487   compile_stack.size = INIT_COMPILE_STACK_SIZE;
2488   compile_stack.avail = 0;
2489
2490   range_table_work.table = 0;
2491   range_table_work.allocated = 0;
2492
2493   /* Initialize the pattern buffer.  */
2494   bufp->syntax = syntax;
2495   bufp->fastmap_accurate = 0;
2496   bufp->not_bol = bufp->not_eol = 0;
2497   bufp->used_syntax = 0;
2498
2499   /* Set `used' to zero, so that if we return an error, the pattern
2500      printer (for debugging) will think there's no pattern.  We reset it
2501      at the end.  */
2502   bufp->used = 0;
2503
2504   /* Always count groups, whether or not bufp->no_sub is set.  */
2505   bufp->re_nsub = 0;
2506
2507 #if !defined emacs && !defined SYNTAX_TABLE
2508   /* Initialize the syntax table.  */
2509    init_syntax_once ();
2510 #endif
2511
2512   if (bufp->allocated == 0)
2513     {
2514       if (bufp->buffer)
2515         { /* If zero allocated, but buffer is non-null, try to realloc
2516              enough space.  This loses if buffer's address is bogus, but
2517              that is the user's responsibility.  */
2518           RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2519         }
2520       else
2521         { /* Caller did not allocate a buffer.  Do it for them.  */
2522           bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2523         }
2524       if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2525
2526       bufp->allocated = INIT_BUF_SIZE;
2527     }
2528
2529   begalt = b = bufp->buffer;
2530
2531   /* Loop through the uncompiled pattern until we're at the end.  */
2532   while (1)
2533     {
2534       if (p == pend)
2535         {
2536           /* If this is the end of an included regexp,
2537              pop back to the main regexp and try again.  */
2538           if (in_subpattern)
2539             {
2540               in_subpattern = 0;
2541               pattern = main_pattern;
2542               p = main_p;
2543               pend = main_pend;
2544               continue;
2545             }
2546           /* If this is the end of the main regexp, we are done.  */
2547           break;
2548         }
2549
2550       PATFETCH (c);
2551
2552       switch (c)
2553         {
2554         case ' ':
2555           {
2556             re_char *p1 = p;
2557
2558             /* If there's no special whitespace regexp, treat
2559                spaces normally.  And don't try to do this recursively.  */
2560             if (!whitespace_regexp || in_subpattern)
2561               goto normal_char;
2562
2563             /* Peek past following spaces.  */
2564             while (p1 != pend)
2565               {
2566                 if (*p1 != ' ')
2567                   break;
2568                 p1++;
2569               }
2570             /* If the spaces are followed by a repetition op,
2571                treat them normally.  */
2572             if (p1 != pend
2573                 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
2574                     || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2575               goto normal_char;
2576
2577             /* Replace the spaces with the whitespace regexp.  */
2578             in_subpattern = 1;
2579             main_p = p1;
2580             main_pend = pend;
2581             main_pattern = pattern;
2582             p = pattern = whitespace_regexp;
2583             pend = p + strlen ((const char *) p);
2584             break;
2585           }
2586
2587         case '^':
2588           {
2589             if (   /* If at start of pattern, it's an operator.  */
2590                    p == pattern + 1
2591                    /* If context independent, it's an operator.  */
2592                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2593                    /* Otherwise, depends on what's come before.  */
2594                 || at_begline_loc_p (pattern, p, syntax))
2595               BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
2596             else
2597               goto normal_char;
2598           }
2599           break;
2600
2601
2602         case '$':
2603           {
2604             if (   /* If at end of pattern, it's an operator.  */
2605                    p == pend
2606                    /* If context independent, it's an operator.  */
2607                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2608                    /* Otherwise, depends on what's next.  */
2609                 || at_endline_loc_p (p, pend, syntax))
2610                BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
2611              else
2612                goto normal_char;
2613            }
2614            break;
2615
2616
2617         case '+':
2618         case '?':
2619           if ((syntax & RE_BK_PLUS_QM)
2620               || (syntax & RE_LIMITED_OPS))
2621             goto normal_char;
2622         handle_plus:
2623         case '*':
2624           /* If there is no previous pattern...  */
2625           if (!laststart)
2626             {
2627               if (syntax & RE_CONTEXT_INVALID_OPS)
2628                 FREE_STACK_RETURN (REG_BADRPT);
2629               else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2630                 goto normal_char;
2631             }
2632
2633           {
2634             /* 1 means zero (many) matches is allowed.  */
2635             boolean zero_times_ok = 0, many_times_ok = 0;
2636             boolean greedy = 1;
2637
2638             /* If there is a sequence of repetition chars, collapse it
2639                down to just one (the right one).  We can't combine
2640                interval operators with these because of, e.g., `a{2}*',
2641                which should only match an even number of `a's.  */
2642
2643             for (;;)
2644               {
2645                 if ((syntax & RE_FRUGAL)
2646                     && c == '?' && (zero_times_ok || many_times_ok))
2647                   greedy = 0;
2648                 else
2649                   {
2650                     zero_times_ok |= c != '+';
2651                     many_times_ok |= c != '?';
2652                   }
2653
2654                 if (p == pend)
2655                   break;
2656                 else if (*p == '*'
2657                          || (!(syntax & RE_BK_PLUS_QM)
2658                              && (*p == '+' || *p == '?')))
2659                   ;
2660                 else if (syntax & RE_BK_PLUS_QM  && *p == '\\')
2661                   {
2662                     if (p+1 == pend)
2663                       FREE_STACK_RETURN (REG_EESCAPE);
2664                     if (p[1] == '+' || p[1] == '?')
2665                       PATFETCH (c); /* Gobble up the backslash.  */
2666                     else
2667                       break;
2668                   }
2669                 else
2670                   break;
2671                 /* If we get here, we found another repeat character.  */
2672                 PATFETCH (c);
2673                }
2674
2675             /* Star, etc. applied to an empty pattern is equivalent
2676                to an empty pattern.  */
2677             if (!laststart || laststart == b)
2678               break;
2679
2680             /* Now we know whether or not zero matches is allowed
2681                and also whether or not two or more matches is allowed.  */
2682             if (greedy)
2683               {
2684                 if (many_times_ok)
2685                   {
2686                     boolean simple = skip_one_char (laststart) == b;
2687                     size_t startoffset = 0;
2688                     re_opcode_t ofj =
2689                       /* Check if the loop can match the empty string.  */
2690                       (simple || !analyse_first (laststart, b, NULL, 0))
2691                       ? on_failure_jump : on_failure_jump_loop;
2692                     assert (skip_one_char (laststart) <= b);
2693
2694                     if (!zero_times_ok && simple)
2695                       { /* Since simple * loops can be made faster by using
2696                            on_failure_keep_string_jump, we turn simple P+
2697                            into PP* if P is simple.  */
2698                         unsigned char *p1, *p2;
2699                         startoffset = b - laststart;
2700                         GET_BUFFER_SPACE (startoffset);
2701                         p1 = b; p2 = laststart;
2702                         while (p2 < p1)
2703                           *b++ = *p2++;
2704                         zero_times_ok = 1;
2705                       }
2706
2707                     GET_BUFFER_SPACE (6);
2708                     if (!zero_times_ok)
2709                       /* A + loop.  */
2710                       STORE_JUMP (ofj, b, b + 6);
2711                     else
2712                       /* Simple * loops can use on_failure_keep_string_jump
2713                          depending on what follows.  But since we don't know
2714                          that yet, we leave the decision up to
2715                          on_failure_jump_smart.  */
2716                       INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2717                                    laststart + startoffset, b + 6);
2718                     b += 3;
2719                     STORE_JUMP (jump, b, laststart + startoffset);
2720                     b += 3;
2721                   }
2722                 else
2723                   {
2724                     /* A simple ? pattern.  */
2725                     assert (zero_times_ok);
2726                     GET_BUFFER_SPACE (3);
2727                     INSERT_JUMP (on_failure_jump, laststart, b + 3);
2728                     b += 3;
2729                   }
2730               }
2731             else                /* not greedy */
2732               { /* I wish the greedy and non-greedy cases could be merged.  */
2733
2734                 GET_BUFFER_SPACE (7); /* We might use less.  */
2735                 if (many_times_ok)
2736                   {
2737                     boolean emptyp = analyse_first (laststart, b, NULL, 0);
2738
2739                     /* The non-greedy multiple match looks like
2740                        a repeat..until: we only need a conditional jump
2741                        at the end of the loop.  */
2742                     if (emptyp) BUF_PUSH (no_op);
2743                     STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2744                                 : on_failure_jump, b, laststart);
2745                     b += 3;
2746                     if (zero_times_ok)
2747                       {
2748                         /* The repeat...until naturally matches one or more.
2749                            To also match zero times, we need to first jump to
2750                            the end of the loop (its conditional jump).  */
2751                         INSERT_JUMP (jump, laststart, b);
2752                         b += 3;
2753                       }
2754                   }
2755                 else
2756                   {
2757                     /* non-greedy a?? */
2758                     INSERT_JUMP (jump, laststart, b + 3);
2759                     b += 3;
2760                     INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2761                     b += 3;
2762                   }
2763               }
2764           }
2765           pending_exact = 0;
2766           break;
2767
2768
2769         case '.':
2770           laststart = b;
2771           BUF_PUSH (anychar);
2772           break;
2773
2774
2775         case '[':
2776           {
2777             re_char *p1;
2778
2779             CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2780
2781             if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2782
2783             /* Ensure that we have enough space to push a charset: the
2784                opcode, the length count, and the bitset; 34 bytes in all.  */
2785             GET_BUFFER_SPACE (34);
2786
2787             laststart = b;
2788
2789             /* We test `*p == '^' twice, instead of using an if
2790                statement, so we only need one BUF_PUSH.  */
2791             BUF_PUSH (*p == '^' ? charset_not : charset);
2792             if (*p == '^')
2793               p++;
2794
2795             /* Remember the first position in the bracket expression.  */
2796             p1 = p;
2797
2798             /* Push the number of bytes in the bitmap.  */
2799             BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2800
2801             /* Clear the whole map.  */
2802             memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2803
2804             /* charset_not matches newline according to a syntax bit.  */
2805             if ((re_opcode_t) b[-2] == charset_not
2806                 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2807               SET_LIST_BIT ('\n');
2808
2809             /* Read in characters and ranges, setting map bits.  */
2810             for (;;)
2811               {
2812                 boolean escaped_char = false;
2813                 const unsigned char *p2 = p;
2814                 re_wchar_t ch;
2815
2816                 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2817
2818                 /* Don't translate yet.  The range TRANSLATE(X..Y) cannot
2819                    always be determined from TRANSLATE(X) and TRANSLATE(Y)
2820                    So the translation is done later in a loop.  Example:
2821                    (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
2822                 PATFETCH (c);
2823
2824                 /* \ might escape characters inside [...] and [^...].  */
2825                 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2826                   {
2827                     if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2828
2829                     PATFETCH (c);
2830                     escaped_char = true;
2831                   }
2832                 else
2833                   {
2834                     /* Could be the end of the bracket expression.  If it's
2835                        not (i.e., when the bracket expression is `[]' so
2836                        far), the ']' character bit gets set way below.  */
2837                     if (c == ']' && p2 != p1)
2838                       break;
2839                   }
2840
2841                 /* See if we're at the beginning of a possible character
2842                    class.  */
2843
2844                 if (!escaped_char &&
2845                     syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2846                   {
2847                     /* Leave room for the null.  */
2848                     unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
2849                     const unsigned char *class_beg;
2850
2851                     PATFETCH (c);
2852                     c1 = 0;
2853                     class_beg = p;
2854
2855                     /* If pattern is `[[:'.  */
2856                     if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2857
2858                     for (;;)
2859                       {
2860                         PATFETCH (c);
2861                         if ((c == ':' && *p == ']') || p == pend)
2862                           break;
2863                         if (c1 < CHAR_CLASS_MAX_LENGTH)
2864                           str[c1++] = c;
2865                         else
2866                           /* This is in any case an invalid class name.  */
2867                           str[0] = '\0';
2868                       }
2869                     str[c1] = '\0';
2870
2871                     /* If isn't a word bracketed by `[:' and `:]':
2872                        undo the ending character, the letters, and
2873                        leave the leading `:' and `[' (but set bits for
2874                        them).  */
2875                     if (c == ':' && *p == ']')
2876                       {
2877                         re_wctype_t cc = re_wctype (str);
2878
2879                         if (cc == 0)
2880                           FREE_STACK_RETURN (REG_ECTYPE);
2881
2882                         /* Throw away the ] at the end of the character
2883                            class.  */
2884                         PATFETCH (c);
2885
2886                         if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2887
2888 #ifndef emacs
2889                         for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
2890                           if (re_iswctype (btowc (ch), cc))
2891                             {
2892                               c = TRANSLATE (ch);
2893                               if (c < (1 << BYTEWIDTH))
2894                                 SET_LIST_BIT (c);
2895                             }
2896 #else  /* emacs */
2897                         /* Most character classes in a multibyte match
2898                            just set a flag.  Exceptions are is_blank,
2899                            is_digit, is_cntrl, and is_xdigit, since
2900                            they can only match ASCII characters.  We
2901                            don't need to handle them for multibyte.
2902                            They are distinguished by a negative wctype.  */
2903
2904                         /* Setup the gl_state object to its buffer-defined
2905                            value.  This hardcodes the buffer-global
2906                            syntax-table for ASCII chars, while the other chars
2907                            will obey syntax-table properties.  It's not ideal,
2908                            but it's the way it's been done until now.  */
2909                         SETUP_BUFFER_SYNTAX_TABLE ();
2910
2911                         for (ch = 0; ch < 256; ++ch)
2912                           {
2913                             c = RE_CHAR_TO_MULTIBYTE (ch);
2914                             if (! CHAR_BYTE8_P (c)
2915                                 && re_iswctype (c, cc))
2916                               {
2917                                 SET_LIST_BIT (ch);
2918                                 c1 = TRANSLATE (c);
2919                                 if (c1 == c)
2920                                   continue;
2921                                 if (ASCII_CHAR_P (c1))
2922                                   SET_LIST_BIT (c1);
2923                                 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2924                                   SET_LIST_BIT (c1);
2925                               }
2926                           }
2927                         SET_RANGE_TABLE_WORK_AREA_BIT
2928                           (range_table_work, re_wctype_to_bit (cc));
2929 #endif  /* emacs */
2930                         /* In most cases the matching rule for char classes
2931                            only uses the syntax table for multibyte chars,
                           so that the content of the syntax-table is not
2933                            hardcoded in the range_table.  SPACE and WORD are
2934                            the two exceptions.  */
2935                         if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2936                           bufp->used_syntax = 1;
2937
2938                         /* Repeat the loop. */
2939                         continue;
2940                       }
2941                     else
2942                       {
2943                         /* Go back to right after the "[:".  */
2944                         p = class_beg;
2945                         SET_LIST_BIT ('[');
2946
                        /* Because the `:' may start a range, we
2948                            can't simply set bit and repeat the loop.
2949                            Instead, just set it to C and handle below.  */
2950                         c = ':';
2951                       }
2952                   }
2953
2954                 if (p < pend && p[0] == '-' && p[1] != ']')
2955                   {
2956
2957                     /* Discard the `-'. */
2958                     PATFETCH (c1);
2959
2960                     /* Fetch the character which ends the range. */
2961                     PATFETCH (c1);
2962 #ifdef emacs
2963                     if (CHAR_BYTE8_P (c1)
2964                         && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2965                       /* Treat the range from a multibyte character to
2966                          raw-byte character as empty.  */
2967                       c = c1 + 1;
2968 #endif  /* emacs */
2969                   }
2970                 else
2971                   /* Range from C to C. */
2972                   c1 = c;
2973
2974                 if (c > c1)
2975                   {
2976                     if (syntax & RE_NO_EMPTY_RANGES)
2977                       FREE_STACK_RETURN (REG_ERANGEX);
2978                     /* Else, repeat the loop.  */
2979                   }
2980                 else
2981                   {
2982 #ifndef emacs
2983                     /* Set the range into bitmap */
2984                     for (; c <= c1; c++)
2985                       {
2986                         ch = TRANSLATE (c);
2987                         if (ch < (1 << BYTEWIDTH))
2988                           SET_LIST_BIT (ch);
2989                       }
2990 #else  /* emacs */
2991                     if (c < 128)
2992                       {
2993                         ch = MIN (127, c1);
2994                         SETUP_ASCII_RANGE (range_table_work, c, ch);
2995                         c = ch + 1;
2996                         if (CHAR_BYTE8_P (c1))
2997                           c = BYTE8_TO_CHAR (128);
2998                       }
2999                     if (c <= c1)
3000                       {
3001                         if (CHAR_BYTE8_P (c))
3002                           {
3003                             c = CHAR_TO_BYTE8 (c);
3004                             c1 = CHAR_TO_BYTE8 (c1);
3005                             for (; c <= c1; c++)
3006                               SET_LIST_BIT (c);
3007                           }
3008                         else if (multibyte)
3009                           {
3010                             SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3011                           }
3012                         else
3013                           {
3014                             SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3015                           }
3016                       }
3017 #endif /* emacs */
3018                   }
3019               }
3020
3021             /* Discard any (non)matching list bytes that are all 0 at the
3022                end of the map.  Decrease the map-length byte too.  */
3023             while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3024               b[-1]--;
3025             b += b[-1];
3026
3027             /* Build real range table from work area.  */
3028             if (RANGE_TABLE_WORK_USED (range_table_work)
3029                 || RANGE_TABLE_WORK_BITS (range_table_work))
3030               {
3031                 int i;
3032                 int used = RANGE_TABLE_WORK_USED (range_table_work);
3033
3034                 /* Allocate space for COUNT + RANGE_TABLE.  Needs two
3035                    bytes for flags, two for COUNT, and three bytes for
3036                    each character.  */
3037                 GET_BUFFER_SPACE (4 + used * 3);
3038
3039                 /* Indicate the existence of range table.  */
3040                 laststart[1] |= 0x80;
3041
3042                 /* Store the character class flag bits into the range table.
3043                    If not in emacs, these flag bits are always 0.  */
3044                 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3045                 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3046
3047                 STORE_NUMBER_AND_INCR (b, used / 2);
3048                 for (i = 0; i < used; i++)
3049                   STORE_CHARACTER_AND_INCR
3050                     (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3051               }
3052           }
3053           break;
3054
3055
3056         case '(':
3057           if (syntax & RE_NO_BK_PARENS)
3058             goto handle_open;
3059           else
3060             goto normal_char;
3061
3062
3063         case ')':
3064           if (syntax & RE_NO_BK_PARENS)
3065             goto handle_close;
3066           else
3067             goto normal_char;
3068
3069
3070         case '\n':
3071           if (syntax & RE_NEWLINE_ALT)
3072             goto handle_alt;
3073           else
3074             goto normal_char;
3075
3076
3077         case '|':
3078           if (syntax & RE_NO_BK_VBAR)
3079             goto handle_alt;
3080           else
3081             goto normal_char;
3082
3083
3084         case '{':
3085            if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3086              goto handle_interval;
3087            else
3088              goto normal_char;
3089
3090
3091         case '\\':
3092           if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3093
3094           /* Do not translate the character after the \, so that we can
3095              distinguish, e.g., \B from \b, even if we normally would
3096              translate, e.g., B to b.  */
3097           PATFETCH (c);
3098
3099           switch (c)
3100             {
3101             case '(':
3102               if (syntax & RE_NO_BK_PARENS)
3103                 goto normal_backslash;
3104
3105             handle_open:
3106               {
3107                 int shy = 0;
3108                 regnum_t regnum = 0;
3109                 if (p+1 < pend)
3110                   {
3111                     /* Look for a special (?...) construct */
3112                     if ((syntax & RE_SHY_GROUPS) && *p == '?')
3113                       {
3114                         PATFETCH (c); /* Gobble up the '?'.  */
3115                         while (!shy)
3116                           {
3117                             PATFETCH (c);
3118                             switch (c)
3119                               {
3120                               case ':': shy = 1; break;
3121                               case '0':
3122                                 /* An explicitly specified regnum must start
3123                                    with non-0. */
3124                                 if (regnum == 0)
3125                                   FREE_STACK_RETURN (REG_BADPAT);
3126                               case '1': case '2': case '3': case '4':
3127                               case '5': case '6': case '7': case '8': case '9':
3128                                 regnum = 10*regnum + (c - '0'); break;
3129                               default:
3130                                 /* Only (?:...) is supported right now. */
3131                                 FREE_STACK_RETURN (REG_BADPAT);
3132                               }
3133                           }
3134                       }
3135                   }
3136
3137                 if (!shy)
3138                   regnum = ++bufp->re_nsub;
3139                 else if (regnum)
3140                   { /* It's actually not shy, but explicitly numbered.  */
3141                     shy = 0;
3142                     if (regnum > bufp->re_nsub)
3143                       bufp->re_nsub = regnum;
3144                     else if (regnum > bufp->re_nsub
3145                              /* Ideally, we'd want to check that the specified
3146                                 group can't have matched (i.e. all subgroups
3147                                 using the same regnum are in other branches of
3148                                 OR patterns), but we don't currently keep track
3149                                 of enough info to do that easily.  */
3150                              || group_in_compile_stack (compile_stack, regnum))
3151                       FREE_STACK_RETURN (REG_BADPAT);
3152                   }
3153                 else
3154                   /* It's really shy.  */
3155                   regnum = - bufp->re_nsub;
3156
3157                 if (COMPILE_STACK_FULL)
3158                   {
3159                     RETALLOC (compile_stack.stack, compile_stack.size << 1,
3160                               compile_stack_elt_t);
3161                     if (compile_stack.stack == NULL) return REG_ESPACE;
3162
3163                     compile_stack.size <<= 1;
3164                   }
3165
3166                 /* These are the values to restore when we hit end of this
3167                    group.  They are all relative offsets, so that if the
3168                    whole pattern moves because of realloc, they will still
3169                    be valid.  */
3170                 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3171                 COMPILE_STACK_TOP.fixup_alt_jump
3172                   = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3173                 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3174                 COMPILE_STACK_TOP.regnum = regnum;
3175
3176                 /* Do not push a start_memory for groups beyond the last one
3177                    we can represent in the compiled pattern.  */
3178                 if (regnum <= MAX_REGNUM && regnum > 0)
3179                   BUF_PUSH_2 (start_memory, regnum);
3180
3181                 compile_stack.avail++;
3182
3183                 fixup_alt_jump = 0;
3184                 laststart = 0;
3185                 begalt = b;
3186                 /* If we've reached MAX_REGNUM groups, then this open
3187                    won't actually generate any code, so we'll have to
3188                    clear pending_exact explicitly.  */
3189                 pending_exact = 0;
3190                 break;
3191               }
3192
3193             case ')':
3194               if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3195
3196               if (COMPILE_STACK_EMPTY)
3197                 {
3198                   if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3199                     goto normal_backslash;
3200                   else
3201                     FREE_STACK_RETURN (REG_ERPAREN);
3202                 }
3203
3204             handle_close:
3205               FIXUP_ALT_JUMP ();
3206
3207               /* See similar code for backslashed left paren above.  */
3208               if (COMPILE_STACK_EMPTY)
3209                 {
3210                   if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3211                     goto normal_char;
3212                   else
3213                     FREE_STACK_RETURN (REG_ERPAREN);
3214                 }
3215
3216               /* Since we just checked for an empty stack above, this
3217                  ``can't happen''.  */
3218               assert (compile_stack.avail != 0);
3219               {
3220                 /* We don't just want to restore into `regnum', because
3221                    later groups should continue to be numbered higher,
3222                    as in `(ab)c(de)' -- the second group is #2.  */
3223                 regnum_t regnum;
3224
3225                 compile_stack.avail--;
3226                 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3227                 fixup_alt_jump
3228                   = COMPILE_STACK_TOP.fixup_alt_jump
3229                     ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3230                     : 0;
3231                 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3232                 regnum = COMPILE_STACK_TOP.regnum;
3233                 /* If we've reached MAX_REGNUM groups, then this close
3234                    won't actually generate any code, so we'll have to
3235                    clear pending_exact explicitly.  */
3236                 pending_exact = 0;
3237
3238                 /* We're at the end of the group, so now we know how many
3239                    groups were inside this one.  */
3240                 if (regnum <= MAX_REGNUM && regnum > 0)
3241                   BUF_PUSH_2 (stop_memory, regnum);
3242               }
3243               break;
3244
3245
3246             case '|':                                   /* `\|'.  */
3247               if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3248                 goto normal_backslash;
3249             handle_alt:
3250               if (syntax & RE_LIMITED_OPS)
3251                 goto normal_char;
3252
3253               /* Insert before the previous alternative a jump which
3254                  jumps to this alternative if the former fails.  */
3255               GET_BUFFER_SPACE (3);
3256               INSERT_JUMP (on_failure_jump, begalt, b + 6);
3257               pending_exact = 0;
3258               b += 3;
3259
3260               /* The alternative before this one has a jump after it
3261                  which gets executed if it gets matched.  Adjust that
3262                  jump so it will jump to this alternative's analogous
3263                  jump (put in below, which in turn will jump to the next
3264                  (if any) alternative's such jump, etc.).  The last such
3265                  jump jumps to the correct final destination.  A picture:
3266                           _____ _____
3267                           |   | |   |
3268                           |   v |   v
3269                         a | b   | c
3270
3271                  If we are at `b', then fixup_alt_jump right now points to a
3272                  three-byte space after `a'.  We'll put in the jump, set
3273                  fixup_alt_jump to right after `b', and leave behind three
3274                  bytes which we'll fill in when we get to after `c'.  */
3275
3276               FIXUP_ALT_JUMP ();
3277
3278               /* Mark and leave space for a jump after this alternative,
3279                  to be filled in later either by the next alternative or
3280                  when we know we're at the end of a series of alternatives.  */
3281               fixup_alt_jump = b;
3282               GET_BUFFER_SPACE (3);
3283               b += 3;
3284
3285               laststart = 0;
3286               begalt = b;
3287               break;
3288
3289
3290             case '{':
3291               /* If \{ is a literal.  */
3292               if (!(syntax & RE_INTERVALS)
3293                      /* If we're at `\{' and it's not the open-interval
3294                         operator.  */
3295                   || (syntax & RE_NO_BK_BRACES))
3296                 goto normal_backslash;
3297
3298             handle_interval:
3299               {
3300                 /* If got here, then the syntax allows intervals.  */
3301
3302                 /* At least (most) this many matches must be made.  */
3303                 int lower_bound = 0, upper_bound = -1;
3304
3305                 beg_interval = p;
3306
3307                 GET_INTERVAL_COUNT (lower_bound);
3308
3309                 if (c == ',')
3310                   GET_INTERVAL_COUNT (upper_bound);
3311                 else
3312                   /* Interval such as `{1}' => match exactly once. */
3313                   upper_bound = lower_bound;
3314
3315                 if (lower_bound < 0
3316                     || (0 <= upper_bound && upper_bound < lower_bound))
3317                   FREE_STACK_RETURN (REG_BADBR);
3318
3319                 if (!(syntax & RE_NO_BK_BRACES))
3320                   {
3321                     if (c != '\\')
3322                       FREE_STACK_RETURN (REG_BADBR);
3323                     if (p == pend)
3324                       FREE_STACK_RETURN (REG_EESCAPE);
3325                     PATFETCH (c);
3326                   }
3327
3328                 if (c != '}')
3329                   FREE_STACK_RETURN (REG_BADBR);
3330
3331                 /* We just parsed a valid interval.  */
3332
3333                 /* If it's invalid to have no preceding re.  */
3334                 if (!laststart)
3335                   {
3336                     if (syntax & RE_CONTEXT_INVALID_OPS)
3337                       FREE_STACK_RETURN (REG_BADRPT);
3338                     else if (syntax & RE_CONTEXT_INDEP_OPS)
3339                       laststart = b;
3340                     else
3341                       goto unfetch_interval;
3342                   }
3343
3344                 if (upper_bound == 0)
3345                   /* If the upper bound is zero, just drop the sub pattern
3346                      altogether.  */
3347                   b = laststart;
3348                 else if (lower_bound == 1 && upper_bound == 1)
3349                   /* Just match it once: nothing to do here.  */
3350                   ;
3351
3352                 /* Otherwise, we have a nontrivial interval.  When
3353                    we're all done, the pattern will look like:
3354                    set_number_at <jump count> <upper bound>
3355                    set_number_at <succeed_n count> <lower bound>
3356                    succeed_n <after jump addr> <succeed_n count>
3357                    <body of loop>
3358                    jump_n <succeed_n addr> <jump count>
3359                    (The upper bound and `jump_n' are omitted if
3360                    `upper_bound' is 1, though.)  */
3361                 else
3362                   { /* If the upper bound is > 1, we need to insert
3363                        more at the end of the loop.  */
3364                     unsigned int nbytes = (upper_bound < 0 ? 3
3365                                            : upper_bound > 1 ? 5 : 0);
3366                     unsigned int startoffset = 0;
3367
3368                     GET_BUFFER_SPACE (20); /* We might use less.  */
3369
3370                     if (lower_bound == 0)
3371                       {
3372                         /* A succeed_n that starts with 0 is really
3373                            a simple on_failure_jump_loop.  */
3374                         INSERT_JUMP (on_failure_jump_loop, laststart,
3375                                      b + 3 + nbytes);
3376                         b += 3;
3377                       }
3378                     else
3379                       {
3380                         /* Initialize lower bound of the `succeed_n', even
3381                            though it will be set during matching by its
3382                            attendant `set_number_at' (inserted next),
3383                            because `re_compile_fastmap' needs to know.
3384                            Jump to the `jump_n' we might insert below.  */
3385                         INSERT_JUMP2 (succeed_n, laststart,
3386                                       b + 5 + nbytes,
3387                                       lower_bound);
3388                         b += 5;
3389
3390                         /* Code to initialize the lower bound.  Insert
3391                            before the `succeed_n'.  The `5' is the last two
3392                            bytes of this `set_number_at', plus 3 bytes of
3393                            the following `succeed_n'.  */
3394                         insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3395                         b += 5;
3396                         startoffset += 5;
3397                       }
3398
3399                     if (upper_bound < 0)
3400                       {
3401                         /* A negative upper bound stands for infinity,
3402                            in which case it degenerates to a plain jump.  */
3403                         STORE_JUMP (jump, b, laststart + startoffset);
3404                         b += 3;
3405                       }
3406                     else if (upper_bound > 1)
3407                       { /* More than one repetition is allowed, so
3408                            append a backward jump to the `succeed_n'
3409                            that starts this interval.
3410
3411                            When we've reached this during matching,
3412                            we'll have matched the interval once, so
3413                            jump back only `upper_bound - 1' times.  */
3414                         STORE_JUMP2 (jump_n, b, laststart + startoffset,
3415                                      upper_bound - 1);
3416                         b += 5;
3417
3418                         /* The location we want to set is the second
3419                            parameter of the `jump_n'; that is `b-2' as
3420                            an absolute address.  `laststart' will be
3421                            the `set_number_at' we're about to insert;
3422                            `laststart+3' the number to set, the source
3423                            for the relative address.  But we are
3424                            inserting into the middle of the pattern --
3425                            so everything is getting moved up by 5.
3426                            Conclusion: (b - 2) - (laststart + 3) + 5,
3427                            i.e., b - laststart.
3428
3429                            We insert this at the beginning of the loop
3430                            so that if we fail during matching, we'll
3431                            reinitialize the bounds.  */
3432                         insert_op2 (set_number_at, laststart, b - laststart,
3433                                     upper_bound - 1, b);
3434                         b += 5;
3435                       }
3436                   }
3437                 pending_exact = 0;
3438                 beg_interval = NULL;
3439               }
3440               break;
3441
3442             unfetch_interval:
3443               /* If an invalid interval, match the characters as literals.  */
3444                assert (beg_interval);
3445                p = beg_interval;
3446                beg_interval = NULL;
3447
3448                /* normal_char and normal_backslash need `c'.  */
3449                c = '{';
3450
3451                if (!(syntax & RE_NO_BK_BRACES))
3452                  {
3453                    assert (p > pattern && p[-1] == '\\');
3454                    goto normal_backslash;
3455                  }
3456                else
3457                  goto normal_char;
3458
3459 #ifdef emacs
3460             /* There is no way to specify the before_dot and after_dot
3461                operators.  rms says this is ok.  --karl  */
3462             case '=':
3463               laststart = b;
3464               BUF_PUSH (at_dot);
3465               break;
3466
3467             case 's':
3468               laststart = b;
3469               PATFETCH (c);
3470               BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3471               break;
3472
3473             case 'S':
3474               laststart = b;
3475               PATFETCH (c);
3476               BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3477               break;
3478
3479             case 'c':
3480               laststart = b;
3481               PATFETCH (c);
3482               BUF_PUSH_2 (categoryspec, c);
3483               break;
3484
3485             case 'C':
3486               laststart = b;
3487               PATFETCH (c);
3488               BUF_PUSH_2 (notcategoryspec, c);
3489               break;
3490 #endif /* emacs */
3491
3492
3493             case 'w':
3494               if (syntax & RE_NO_GNU_OPS)
3495                 goto normal_char;
3496               laststart = b;
3497               BUF_PUSH_2 (syntaxspec, Sword);
3498               break;
3499
3500
3501             case 'W':
3502               if (syntax & RE_NO_GNU_OPS)
3503                 goto normal_char;
3504               laststart = b;
3505               BUF_PUSH_2 (notsyntaxspec, Sword);
3506               break;
3507
3508
3509             case '<':
3510               if (syntax & RE_NO_GNU_OPS)
3511                 goto normal_char;
3512               laststart = b;
3513               BUF_PUSH (wordbeg);
3514               break;
3515
3516             case '>':
3517               if (syntax & RE_NO_GNU_OPS)
3518                 goto normal_char;
3519               laststart = b;
3520               BUF_PUSH (wordend);
3521               break;
3522
3523             case '_':
3524               if (syntax & RE_NO_GNU_OPS)
3525                 goto normal_char;
3526               laststart = b;
3527               PATFETCH (c);
3528               if (c == '<')
3529                 BUF_PUSH (symbeg);
3530               else if (c == '>')
3531                 BUF_PUSH (symend);
3532               else
3533                 FREE_STACK_RETURN (REG_BADPAT);
3534               break;
3535
3536             case 'b':
3537               if (syntax & RE_NO_GNU_OPS)
3538                 goto normal_char;
3539               BUF_PUSH (wordbound);
3540               break;
3541
3542             case 'B':
3543               if (syntax & RE_NO_GNU_OPS)
3544                 goto normal_char;
3545               BUF_PUSH (notwordbound);
3546               break;
3547
3548             case '`':
3549               if (syntax & RE_NO_GNU_OPS)
3550                 goto normal_char;
3551               BUF_PUSH (begbuf);
3552               break;
3553
3554             case '\'':
3555               if (syntax & RE_NO_GNU_OPS)
3556                 goto normal_char;
3557               BUF_PUSH (endbuf);
3558               break;
3559
3560             case '1': case '2': case '3': case '4': case '5':
3561             case '6': case '7': case '8': case '9':
3562               {
3563                 regnum_t reg;
3564
3565                 if (syntax & RE_NO_BK_REFS)
3566                   goto normal_backslash;
3567
3568                 reg = c - '0';
3569
3570                 if (reg > bufp->re_nsub || reg < 1
3571                     /* Can't back reference to a subexp before its end.  */
3572                     || group_in_compile_stack (compile_stack, reg))
3573                   FREE_STACK_RETURN (REG_ESUBREG);
3574
3575                 laststart = b;
3576                 BUF_PUSH_2 (duplicate, reg);
3577               }
3578               break;
3579
3580
3581             case '+':
3582             case '?':
3583               if (syntax & RE_BK_PLUS_QM)
3584                 goto handle_plus;
3585               else
3586                 goto normal_backslash;
3587
3588             default:
3589             normal_backslash:
3590               /* You might think it would be useful for \ to mean
3591                  not to translate; but if we don't translate it
3592                  it will never match anything.  */
3593               goto normal_char;
3594             }
3595           break;
3596
3597
3598         default:
3599         /* Expects the character in `c'.  */
3600         normal_char:
3601           /* If no exactn currently being built.  */
3602           if (!pending_exact
3603
3604               /* If last exactn not at current position.  */
3605               || pending_exact + *pending_exact + 1 != b
3606
3607               /* We have only one byte following the exactn for the count.  */
3608               || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
3609
3610               /* If followed by a repetition operator.  */
3611               || (p != pend && (*p == '*' || *p == '^'))
3612               || ((syntax & RE_BK_PLUS_QM)
3613                   ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3614                   : p != pend && (*p == '+' || *p == '?'))
3615               || ((syntax & RE_INTERVALS)
3616                   && ((syntax & RE_NO_BK_BRACES)
3617                       ? p != pend && *p == '{'
3618                       : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3619             {
3620               /* Start building a new exactn.  */
3621
3622               laststart = b;
3623
3624               BUF_PUSH_2 (exactn, 0);
3625               pending_exact = b - 1;
3626             }
3627
3628           GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3629           {
3630             int len;
3631
3632             if (multibyte)
3633               {
3634                 c = TRANSLATE (c);
3635                 len = CHAR_STRING (c, b);
3636                 b += len;
3637               }
3638             else
3639               {
3640                 c1 = RE_CHAR_TO_MULTIBYTE (c);
3641                 if (! CHAR_BYTE8_P (c1))
3642                   {
3643                     re_wchar_t c2 = TRANSLATE (c1);
3644
3645                     if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3646                       c = c1;
3647                   }
3648                 *b++ = c;
3649                 len = 1;
3650               }
3651             (*pending_exact) += len;
3652           }
3653
3654           break;
3655         } /* switch (c) */
3656     } /* while p != pend */
3657
3658
3659   /* Through the pattern now.  */
3660
3661   FIXUP_ALT_JUMP ();
3662
3663   if (!COMPILE_STACK_EMPTY)
3664     FREE_STACK_RETURN (REG_EPAREN);
3665
3666   /* If we don't want backtracking, force success
3667      the first time we reach the end of the compiled pattern.  */
3668   if (syntax & RE_NO_POSIX_BACKTRACKING)
3669     BUF_PUSH (succeed);
3670
3671   /* We have succeeded; set the length of the buffer.  */
3672   bufp->used = b - bufp->buffer;
3673
3674 #ifdef DEBUG
3675   if (debug > 0)
3676     {
3677       re_compile_fastmap (bufp);
3678       DEBUG_PRINT ("\nCompiled pattern: \n");
3679       print_compiled_pattern (bufp);
3680     }
3681   debug--;
3682 #endif /* DEBUG */
3683
3684 #ifndef MATCH_MAY_ALLOCATE
3685   /* Initialize the failure stack to the largest possible stack.  This
3686      isn't necessary unless we're trying to avoid calling alloca in
3687      the search and match routines.  */
3688   {
3689     int num_regs = bufp->re_nsub + 1;
3690
3691     if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
3692       {
3693         fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
3694         falk_stack.stack = realloc (fail_stack.stack,
3695                                     fail_stack.size * sizeof *falk_stack.stack);
3696       }
3697
3698     regex_grow_registers (num_regs);
3699   }
3700 #endif /* not MATCH_MAY_ALLOCATE */
3701
3702   FREE_STACK_RETURN (REG_NOERROR);
3703 } /* regex_compile */
3704 \f
3705 /* Subroutines for `regex_compile'.  */
3706
3707 /* Store OP at LOC followed by two-byte integer parameter ARG.  */
3708
3709 static void
3710 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3711 {
3712   *loc = (unsigned char) op;
3713   STORE_NUMBER (loc + 1, arg);
3714 }
3715
3716
3717 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2.  */
3718
3719 static void
3720 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3721 {
3722   *loc = (unsigned char) op;
3723   STORE_NUMBER (loc + 1, arg1);
3724   STORE_NUMBER (loc + 3, arg2);
3725 }
3726
3727
3728 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3729    for OP followed by two-byte integer parameter ARG.  */
3730
3731 static void
3732 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3733 {
3734   register unsigned char *pfrom = end;
3735   register unsigned char *pto = end + 3;
3736
3737   while (pfrom != loc)
3738     *--pto = *--pfrom;
3739
3740   store_op1 (op, loc, arg);
3741 }
3742
3743
3744 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2.  */
3745
3746 static void
3747 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
3748 {
3749   register unsigned char *pfrom = end;
3750   register unsigned char *pto = end + 5;
3751
3752   while (pfrom != loc)
3753     *--pto = *--pfrom;
3754
3755   store_op2 (op, loc, arg1, arg2);
3756 }
3757
3758
3759 /* P points to just after a ^ in PATTERN.  Return true if that ^ comes
3760    after an alternative or a begin-subexpression.  We assume there is at
3761    least one character before the ^.  */
3762
3763 static boolean
3764 at_begline_loc_p (const_re_char *pattern, const_re_char *p, reg_syntax_t syntax)
3765 {
3766   re_char *prev = p - 2;
3767   boolean odd_backslashes;
3768
3769   /* After a subexpression?  */
3770   if (*prev == '(')
3771     odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3772
3773   /* After an alternative?  */
3774   else if (*prev == '|')
3775     odd_backslashes = (syntax & RE_NO_BK_VBAR) == 0;
3776
3777   /* After a shy subexpression?  */
3778   else if (*prev == ':' && (syntax & RE_SHY_GROUPS))
3779     {
3780       /* Skip over optional regnum.  */
3781       while (prev - 1 >= pattern && prev[-1] >= '0' && prev[-1] <= '9')
3782         --prev;
3783
3784       if (!(prev - 2 >= pattern
3785             && prev[-1] == '?' && prev[-2] == '('))
3786         return false;
3787       prev -= 2;
3788       odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3789     }
3790   else
3791     return false;
3792
3793   /* Count the number of preceding backslashes.  */
3794   p = prev;
3795   while (prev - 1 >= pattern && prev[-1] == '\\')
3796     --prev;
3797   return (p - prev) & odd_backslashes;
3798 }
3799
3800
3801 /* The dual of at_begline_loc_p.  This one is for $.  We assume there is
3802    at least one character after the $, i.e., `P < PEND'.  */
3803
3804 static boolean
3805 at_endline_loc_p (const_re_char *p, const_re_char *pend, reg_syntax_t syntax)
3806 {
3807   re_char *next = p;
3808   boolean next_backslash = *next == '\\';
3809   re_char *next_next = p + 1 < pend ? p + 1 : 0;
3810
3811   return
3812        /* Before a subexpression?  */
3813        (syntax & RE_NO_BK_PARENS ? *next == ')'
3814         : next_backslash && next_next && *next_next == ')')
3815        /* Before an alternative?  */
3816     || (syntax & RE_NO_BK_VBAR ? *next == '|'
3817         : next_backslash && next_next && *next_next == '|');
3818 }
3819
3820
3821 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3822    false if it's not.  */
3823
3824 static boolean
3825 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3826 {
3827   ssize_t this_element;
3828
3829   for (this_element = compile_stack.avail - 1;
3830        this_element >= 0;
3831        this_element--)
3832     if (compile_stack.stack[this_element].regnum == regnum)
3833       return true;
3834
3835   return false;
3836 }
3837 \f
3838 /* analyse_first.
3839    If fastmap is non-NULL, go through the pattern and fill fastmap
3840    with all the possible leading chars.  If fastmap is NULL, don't
3841    bother filling it up (obviously) and only return whether the
3842    pattern could potentially match the empty string.
3843
3844    Return 1  if p..pend might match the empty string.
3845    Return 0  if p..pend matches at least one char.
3846    Return -1 if fastmap was not updated accurately.  */
3847
3848 static int
3849 analyse_first (const_re_char *p, const_re_char *pend, char *fastmap,
3850                const int multibyte)
3851 {
3852   int j, k;
3853   boolean not;
3854
3855   /* If all elements for base leading-codes in fastmap is set, this
3856      flag is set true.  */
3857   boolean match_any_multibyte_characters = false;
3858
3859   assert (p);
3860
3861   /* The loop below works as follows:
3862      - It has a working-list kept in the PATTERN_STACK and which basically
3863        starts by only containing a pointer to the first operation.
3864      - If the opcode we're looking at is a match against some set of
3865        chars, then we add those chars to the fastmap and go on to the
3866        next work element from the worklist (done via `break').
3867      - If the opcode is a control operator on the other hand, we either
3868        ignore it (if it's meaningless at this point, such as `start_memory')
3869        or execute it (if it's a jump).  If the jump has several destinations
3870        (i.e. `on_failure_jump'), then we push the other destination onto the
3871        worklist.
3872      We guarantee termination by ignoring backward jumps (more or less),
3873      so that `p' is monotonically increasing.  More to the point, we
3874      never set `p' (or push) anything `<= p1'.  */
3875
3876   while (p < pend)
3877     {
3878       /* `p1' is used as a marker of how far back a `on_failure_jump'
3879          can go without being ignored.  It is normally equal to `p'
3880          (which prevents any backward `on_failure_jump') except right
3881          after a plain `jump', to allow patterns such as:
3882             0: jump 10
3883             3..9: <body>
3884             10: on_failure_jump 3
3885          as used for the *? operator.  */
3886       re_char *p1 = p;
3887
3888       switch (*p++)
3889         {
3890         case succeed:
3891           return 1;
3892
3893         case duplicate:
3894           /* If the first character has to match a backreference, that means
3895              that the group was empty (since it already matched).  Since this
3896              is the only case that interests us here, we can assume that the
3897              backreference must match the empty string.  */
3898           p++;
3899           continue;
3900
3901
3902       /* Following are the cases which match a character.  These end
3903          with `break'.  */
3904
3905         case exactn:
3906           if (fastmap)
3907             {
3908               /* If multibyte is nonzero, the first byte of each
3909                  character is an ASCII or a leading code.  Otherwise,
3910                  each byte is a character.  Thus, this works in both
3911                  cases. */
3912               fastmap[p[1]] = 1;
3913               if (! multibyte)
3914                 {
3915                   /* For the case of matching this unibyte regex
3916                      against multibyte, we must set a leading code of
3917                      the corresponding multibyte character.  */
3918                   int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3919
3920                   fastmap[CHAR_LEADING_CODE (c)] = 1;
3921                 }
3922             }
3923           break;
3924
3925
3926         case anychar:
3927           /* We could put all the chars except for \n (and maybe \0)
3928              but we don't bother since it is generally not worth it.  */
3929           if (!fastmap) break;
3930           return -1;
3931
3932
3933         case charset_not:
3934           if (!fastmap) break;
3935           {
3936             /* Chars beyond end of bitmap are possible matches.  */
3937             for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3938                  j < (1 << BYTEWIDTH); j++)
3939               fastmap[j] = 1;
3940           }
3941
3942           /* Fallthrough */
3943         case charset:
3944           if (!fastmap) break;
3945           not = (re_opcode_t) *(p - 1) == charset_not;
3946           for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3947                j >= 0; j--)
3948             if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
3949               fastmap[j] = 1;
3950
3951 #ifdef emacs
3952           if (/* Any leading code can possibly start a character
3953                  which doesn't match the specified set of characters.  */
3954               not
3955               ||
3956               /* If we can match a character class, we can match any
3957                  multibyte characters.  */
3958               (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3959                && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3960
3961             {
3962               if (match_any_multibyte_characters == false)
3963                 {
3964                   for (j = MIN_MULTIBYTE_LEADING_CODE;
3965                        j <= MAX_MULTIBYTE_LEADING_CODE; j++)
3966                     fastmap[j] = 1;
3967                   match_any_multibyte_characters = true;
3968                 }
3969             }
3970
3971           else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3972                    && match_any_multibyte_characters == false)
3973             {
3974               /* Set fastmap[I] to 1 where I is a leading code of each
3975                  multibyte character in the range table. */
3976               int c, count;
3977               unsigned char lc1, lc2;
3978
3979               /* Make P points the range table.  `+ 2' is to skip flag
3980                  bits for a character class.  */
3981               p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
3982
3983               /* Extract the number of ranges in range table into COUNT.  */
3984               EXTRACT_NUMBER_AND_INCR (count, p);
3985               for (; count > 0; count--, p += 3)
3986                 {
3987                   /* Extract the start and end of each range.  */
3988                   EXTRACT_CHARACTER (c, p);
3989                   lc1 = CHAR_LEADING_CODE (c);
3990                   p += 3;
3991                   EXTRACT_CHARACTER (c, p);
3992                   lc2 = CHAR_LEADING_CODE (c);
3993                   for (j = lc1; j <= lc2; j++)
3994                     fastmap[j] = 1;
3995                 }
3996             }
3997 #endif
3998           break;
3999
4000         case syntaxspec:
4001         case notsyntaxspec:
4002           if (!fastmap) break;
4003 #ifndef emacs
4004           not = (re_opcode_t)p[-1] == notsyntaxspec;
4005           k = *p++;
4006           for (j = 0; j < (1 << BYTEWIDTH); j++)
4007             if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
4008               fastmap[j] = 1;
4009           break;
4010 #else  /* emacs */
4011           /* This match depends on text properties.  These end with
4012              aborting optimizations.  */
4013           return -1;
4014
4015         case categoryspec:
4016         case notcategoryspec:
4017           if (!fastmap) break;
4018           not = (re_opcode_t)p[-1] == notcategoryspec;
4019           k = *p++;
4020           for (j = (1 << BYTEWIDTH); j >= 0; j--)
4021             if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
4022               fastmap[j] = 1;
4023
4024           /* Any leading code can possibly start a character which
4025              has or doesn't has the specified category.  */
4026           if (match_any_multibyte_characters == false)
4027             {
4028               for (j = MIN_MULTIBYTE_LEADING_CODE;
4029                    j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4030                 fastmap[j] = 1;
4031               match_any_multibyte_characters = true;
4032             }
4033           break;
4034
4035       /* All cases after this match the empty string.  These end with
4036          `continue'.  */
4037
4038         case before_dot:
4039         case at_dot:
4040         case after_dot:
4041 #endif /* !emacs */
4042         case no_op:
4043         case begline:
4044         case endline:
4045         case begbuf:
4046         case endbuf:
4047         case wordbound:
4048         case notwordbound:
4049         case wordbeg:
4050         case wordend:
4051         case symbeg:
4052         case symend:
4053           continue;
4054
4055
4056         case jump:
4057           EXTRACT_NUMBER_AND_INCR (j, p);
4058           if (j < 0)
4059             /* Backward jumps can only go back to code that we've already
4060                visited.  `re_compile' should make sure this is true.  */
4061             break;
4062           p += j;
4063           switch (*p)
4064             {
4065             case on_failure_jump:
4066             case on_failure_keep_string_jump:
4067             case on_failure_jump_loop:
4068             case on_failure_jump_nastyloop:
4069             case on_failure_jump_smart:
4070               p++;
4071               break;
4072             default:
4073               continue;
4074             };
4075           /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4076              to jump back to "just after here".  */
4077           /* Fallthrough */
4078
4079         case on_failure_jump:
4080         case on_failure_keep_string_jump:
4081         case on_failure_jump_nastyloop:
4082         case on_failure_jump_loop:
4083         case on_failure_jump_smart:
4084           EXTRACT_NUMBER_AND_INCR (j, p);
4085           if (p + j <= p1)
4086             ; /* Backward jump to be ignored.  */
4087           else
4088             { /* We have to look down both arms.
4089                  We first go down the "straight" path so as to minimize
4090                  stack usage when going through alternatives.  */
4091               int r = analyse_first (p, pend, fastmap, multibyte);
4092               if (r) return r;
4093               p += j;
4094             }
4095           continue;
4096
4097
4098         case jump_n:
4099           /* This code simply does not properly handle forward jump_n.  */
4100           DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4101           p += 4;
4102           /* jump_n can either jump or fall through.  The (backward) jump
4103              case has already been handled, so we only need to look at the
4104              fallthrough case.  */
4105           continue;
4106
4107         case succeed_n:
4108           /* If N == 0, it should be an on_failure_jump_loop instead.  */
4109           DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4110           p += 4;
4111           /* We only care about one iteration of the loop, so we don't
4112              need to consider the case where this behaves like an
4113              on_failure_jump.  */
4114           continue;
4115
4116
4117         case set_number_at:
4118           p += 4;
4119           continue;
4120
4121
4122         case start_memory:
4123         case stop_memory:
4124           p += 1;
4125           continue;
4126
4127
4128         default:
4129           abort (); /* We have listed all the cases.  */
4130         } /* switch *p++ */
4131
4132       /* Getting here means we have found the possible starting
4133          characters for one path of the pattern -- and that the empty
4134          string does not match.  We need not follow this path further.  */
4135       return 0;
4136     } /* while p */
4137
4138   /* We reached the end without matching anything.  */
4139   return 1;
4140
4141 } /* analyse_first */
4142 \f
4143 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4144    BUFP.  A fastmap records which of the (1 << BYTEWIDTH) possible
4145    characters can start a string that matches the pattern.  This fastmap
4146    is used by re_search to skip quickly over impossible starting points.
4147
4148    Character codes above (1 << BYTEWIDTH) are not represented in the
4149    fastmap, but the leading codes are represented.  Thus, the fastmap
4150    indicates which character sets could start a match.
4151
4152    The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4153    area as BUFP->fastmap.
4154
4155    We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4156    the pattern buffer.
4157
4158    Returns 0 if we succeed, -2 if an internal error.   */
4159
4160 int
4161 re_compile_fastmap (struct re_pattern_buffer *bufp)
4162 {
4163   char *fastmap = bufp->fastmap;
4164   int analysis;
4165
4166   assert (fastmap && bufp->buffer);
4167
4168   memset (fastmap, 0, 1 << BYTEWIDTH);  /* Assume nothing's valid.  */
4169   bufp->fastmap_accurate = 1;       /* It will be when we're done.  */
4170
4171   analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
4172                             fastmap, RE_MULTIBYTE_P (bufp));
4173   bufp->can_be_null = (analysis != 0);
4174   return 0;
4175 } /* re_compile_fastmap */
4176 \f
4177 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4178    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
4179    this memory for recording register information.  STARTS and ENDS
4180    must be allocated using the malloc library routine, and must each
4181    be at least NUM_REGS * sizeof (regoff_t) bytes long.
4182
4183    If NUM_REGS == 0, then subsequent matches should allocate their own
4184    register data.
4185
4186    Unless this function is called, the first search or match using
4187    PATTERN_BUFFER will allocate its own register data, without
4188    freeing the old data.  */
4189
4190 void
4191 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
4192 {
4193   if (num_regs)
4194     {
4195       bufp->regs_allocated = REGS_REALLOCATE;
4196       regs->num_regs = num_regs;
4197       regs->start = starts;
4198       regs->end = ends;
4199     }
4200   else
4201     {
4202       bufp->regs_allocated = REGS_UNALLOCATED;
4203       regs->num_regs = 0;
4204       regs->start = regs->end = 0;
4205     }
4206 }
4207 WEAK_ALIAS (__re_set_registers, re_set_registers)
4208 \f
4209 /* Searching routines.  */
4210
4211 /* Like re_search_2, below, but only one string is specified, and
4212    doesn't let you say where to stop matching. */
4213
4214 regoff_t
4215 re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4216            ssize_t startpos, ssize_t range, struct re_registers *regs)
4217 {
4218   return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
4219                       regs, size);
4220 }
4221 WEAK_ALIAS (__re_search, re_search)
4222
/* Start address of whichever string of the virtual concatenation
   holds position P: `string1' for positions below `size1', `string2'
   otherwise.  Uses the `string1', `string2' and `size1' that are in
   scope where the macro is expanded.  */
#define HEAD_ADDR_VSTRING(P)            \
  ((P) < size1 ? string1 : string2)
4226
/* Address of position POS of the virtual concatenation: positions
   below `size1' lie in `string1', the others in `string2' at offset
   POS - size1.  Uses the `string1', `string2' and `size1' that are
   in scope where the macro is expanded.  */
#define POS_ADDR_VSTRING(POS)                                   \
  (((POS) < size1 ? string1 : string2 - size1) + (POS))
4230
4231 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4232    virtual concatenation of STRING1 and STRING2, starting first at index
4233    STARTPOS, then at STARTPOS + 1, and so on.
4234
4235    STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4236
4237    RANGE is how far to scan while trying to match.  RANGE = 0 means try
4238    only at STARTPOS; in general, the last start tried is STARTPOS +
4239    RANGE.
4240
4241    In REGS, return the indices of the virtual concatenation of STRING1
4242    and STRING2 that matched the entire BUFP->buffer and its contained
4243    subexpressions.
4244
4245    Do not consider matching one past the index STOP in the virtual
4246    concatenation of STRING1 and STRING2.
4247
4248    We return either the position in the strings at which the match was
4249    found, -1 if no match, or -2 if error (such as failure
4250    stack overflow).  */
4251
4252 regoff_t
4253 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4254              const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4255              struct re_registers *regs, ssize_t stop)
4256 {
4257   regoff_t val;
4258   re_char *string1 = (re_char*) str1;
4259   re_char *string2 = (re_char*) str2;
4260   register char *fastmap = bufp->fastmap;
4261   register RE_TRANSLATE_TYPE translate = bufp->translate;
4262   size_t total_size = size1 + size2;
4263   ssize_t endpos = startpos + range;
4264   boolean anchored_start;
4265   /* Nonzero if we are searching multibyte string.  */
4266   const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4267
4268   /* Check for out-of-range STARTPOS.  */
4269   if (startpos < 0 || startpos > total_size)
4270     return -1;
4271
4272   /* Fix up RANGE if it might eventually take us outside
4273      the virtual concatenation of STRING1 and STRING2.
4274      Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE.  */
4275   if (endpos < 0)
4276     range = 0 - startpos;
4277   else if (endpos > total_size)
4278     range = total_size - startpos;
4279
4280   /* If the search isn't to be a backwards one, don't waste time in a
4281      search for a pattern anchored at beginning of buffer.  */
4282   if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4283     {
4284       if (startpos > 0)
4285         return -1;
4286       else
4287         range = 0;
4288     }
4289
4290 #ifdef emacs
4291   /* In a forward search for something that starts with \=.
4292      don't keep searching past point.  */
4293   if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4294     {
4295       range = PT_BYTE - BEGV_BYTE - startpos;
4296       if (range < 0)
4297         return -1;
4298     }
4299 #endif /* emacs */
4300
4301   /* Update the fastmap now if not correct already.  */
4302   if (fastmap && !bufp->fastmap_accurate)
4303     re_compile_fastmap (bufp);
4304
4305   /* See whether the pattern is anchored.  */
4306   anchored_start = (bufp->buffer[0] == begline);
4307
4308 #ifdef emacs
4309   gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4310   {
4311     ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
4312
4313     SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4314   }
4315 #endif
4316
4317   /* Loop through the string, looking for a place to start matching.  */
4318   for (;;)
4319     {
4320       /* If the pattern is anchored,
4321          skip quickly past places we cannot match.
4322          We don't bother to treat startpos == 0 specially
4323          because that case doesn't repeat.  */
4324       if (anchored_start && startpos > 0)
4325         {
4326           if (! ((startpos <= size1 ? string1[startpos - 1]
4327                   : string2[startpos - size1 - 1])
4328                  == '\n'))
4329             goto advance;
4330         }
4331
4332       /* If a fastmap is supplied, skip quickly over characters that
4333          cannot be the start of a match.  If the pattern can match the
4334          null string, however, we don't need to skip characters; we want
4335          the first null string.  */
4336       if (fastmap && startpos < total_size && !bufp->can_be_null)
4337         {
4338           register re_char *d;
4339           register re_wchar_t buf_ch;
4340
4341           d = POS_ADDR_VSTRING (startpos);
4342
4343           if (range > 0)        /* Searching forwards.  */
4344             {
4345               register int lim = 0;
4346               ssize_t irange = range;
4347
4348               if (startpos < size1 && startpos + range >= size1)
4349                 lim = range - (size1 - startpos);
4350
4351               /* Written out as an if-else to avoid testing `translate'
4352                  inside the loop.  */
4353               if (RE_TRANSLATE_P (translate))
4354                 {
4355                   if (multibyte)
4356                     while (range > lim)
4357                       {
4358                         int buf_charlen;
4359
4360                         buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4361                         buf_ch = RE_TRANSLATE (translate, buf_ch);
4362                         if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4363                           break;
4364
4365                         range -= buf_charlen;
4366                         d += buf_charlen;
4367                       }
4368                   else
4369                     while (range > lim)
4370                       {
4371                         register re_wchar_t ch, translated;
4372
4373                         buf_ch = *d;
4374                         ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4375                         translated = RE_TRANSLATE (translate, ch);
4376                         if (translated != ch
4377                             && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4378                           buf_ch = ch;
4379                         if (fastmap[buf_ch])
4380                           break;
4381                         d++;
4382                         range--;
4383                       }
4384                 }
4385               else
4386                 {
4387                   if (multibyte)
4388                     while (range > lim)
4389                       {
4390                         int buf_charlen;
4391
4392                         buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4393                         if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4394                           break;
4395                         range -= buf_charlen;
4396                         d += buf_charlen;
4397                       }
4398                   else
4399                     while (range > lim && !fastmap[*d])
4400                       {
4401                         d++;
4402                         range--;
4403                       }
4404                 }
4405               startpos += irange - range;
4406             }
4407           else                          /* Searching backwards.  */
4408             {
4409               if (multibyte)
4410                 {
4411                   buf_ch = STRING_CHAR (d);
4412                   buf_ch = TRANSLATE (buf_ch);
4413                   if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4414                     goto advance;
4415                 }
4416               else
4417                 {
4418                   register re_wchar_t ch, translated;
4419
4420                   buf_ch = *d;
4421                   ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4422                   translated = TRANSLATE (ch);
4423                   if (translated != ch
4424                       && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4425                     buf_ch = ch;
4426                   if (! fastmap[TRANSLATE (buf_ch)])
4427                     goto advance;
4428                 }
4429             }
4430         }
4431
4432       /* If can't match the null string, and that's all we have left, fail.  */
4433       if (range >= 0 && startpos == total_size && fastmap
4434           && !bufp->can_be_null)
4435         return -1;
4436
4437       val = re_match_2_internal (bufp, string1, size1, string2, size2,
4438                                  startpos, regs, stop);
4439
4440       if (val >= 0)
4441         return startpos;
4442
4443       if (val == -2)
4444         return -2;
4445
4446     advance:
4447       if (!range)
4448         break;
4449       else if (range > 0)
4450         {
4451           /* Update STARTPOS to the next character boundary.  */
4452           if (multibyte)
4453             {
4454               re_char *p = POS_ADDR_VSTRING (startpos);
4455               int len = BYTES_BY_CHAR_HEAD (*p);
4456
4457               range -= len;
4458               if (range < 0)
4459                 break;
4460               startpos += len;
4461             }
4462           else
4463             {
4464               range--;
4465               startpos++;
4466             }
4467         }
4468       else
4469         {
4470           range++;
4471           startpos--;
4472
4473           /* Update STARTPOS to the previous character boundary.  */
4474           if (multibyte)
4475             {
4476               re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4477               re_char *p0 = p;
4478               re_char *phead = HEAD_ADDR_VSTRING (startpos);
4479
4480               /* Find the head of multibyte form.  */
4481               PREV_CHAR_BOUNDARY (p, phead);
4482               range += p0 - 1 - p;
4483               if (range > 0)
4484                 break;
4485
4486               startpos -= p0 - 1 - p;
4487             }
4488         }
4489     }
4490   return -1;
4491 } /* re_search_2 */
4492 WEAK_ALIAS (__re_search_2, re_search_2)
4493 \f
4494 /* Declarations and macros for re_match_2.  */
4495
/* Prototype only; the definition is not in this part of the file.
   NOTE(review): presumably compares LEN bytes of S1 and S2 after
   applying TRANSLATE -- confirm against the definition.  */
static int bcmp_translate (re_char *s1, re_char *s2,
                           register ssize_t len,
                           RE_TRANSLATE_TYPE translate,
                           const int multibyte);
4500
/* This converts PTR, a pointer into one of the search strings `string1'
   and `string2', into an offset.  When FIRST_STRING_P (PTR) holds the
   result is PTR's distance from `string1'; otherwise it is PTR's
   distance from `string2' plus `size1', i.e. an offset into the
   virtual concatenation of the two strings.  Uses the `string1',
   `string2' and `size1' in scope where the macro is expanded.  */
#define POINTER_TO_OFFSET(ptr)                  \
  (FIRST_STRING_P (ptr)                         \
   ? (ptr) - string1                            \
   : (ptr) - string2 + (ptrdiff_t) size1)
4507
/* Call before fetching a character with *d.  This switches over to
   string2 if necessary.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).

   The expansion reads and assigns the `d' and `dend' of the enclosing
   function, reads its `string2' and `end_match_2', and jumps to its
   `fail' label when there is nothing left to fetch.  It is a loop
   rather than a single test, so the check is repeated after the
   switch to string2.  */
#define PREFETCH()                                                      \
  while (d == dend)                                                     \
    {                                                                   \
      /* End of string2 => fail.  */                                    \
      if (dend == end_match_2)                                          \
        goto fail;                                                      \
      /* End of string1 => advance to string2.  */                      \
      d = string2;                                                      \
      dend = end_match_2;                                               \
    }
4522
/* Call before fetching a char with *d if you already checked other limits.
   This is meant for use in lookahead operations like wordend, etc..
   where we might need to look at parts of the string that might be
   outside of the LIMITs (i.e past `stop').

   Unlike PREFETCH this never jumps to `fail': it only moves `d' to
   `string2' (and sets `dend' to `end_match_2') when `d' is at `end1'.
   The expansion is a bare `if' statement with no `else'.  Its last
   line ends in a backslash, so the line that follows the definition
   is part of the macro and must stay blank.  */
#define PREFETCH_NOLIMIT()                                              \
  if (d == end1)                                                        \
     {                                                                  \
       d = string2;                                                     \
       dend = end_match_2;                                              \
     }                                                                  \
4533
/* Test if at very beginning or at very end of the virtual concatenation
   of `string1' and `string2'.  If only one string, it's `string2'.

   AT_STRINGS_BEG (d) compares D with `string1' when `size1' is
   nonzero and with `string2' otherwise; it is also true for any D
   when `size2' is zero.  AT_STRINGS_END (d) compares D with `end2'.  */
#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
#define AT_STRINGS_END(d) ((d) == end2)
4538
4539 /* Disabled due to a compiler bug -- see comment at case wordbound */
4540
/* The comment that belongs at case wordbound is the one reproduced
   below.  Note that AT_WORD_BOUNDARY is no longer used at all, so
   that multibyte forms can be supported.
4543
4544    The DEC Alpha C compiler 3.x generates incorrect code for the
4545    test  WORDCHAR_P (d - 1) != WORDCHAR_P (d)  in the expansion of
4546    AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
4547    macro and introducing temporary variables works around the bug.  */
4548
4549 #if 0
4550 /* Test if D points to a character which is word-constituent.  We have
4551    two special cases to check for: if past the end of string1, look at
4552    the first character in string2; and if before the beginning of
4553    string2, look at the last character in string1.  */
4554 #define WORDCHAR_P(d)                                                   \
4555   (SYNTAX ((d) == end1 ? *string2                                       \
4556            : (d) == string2 - 1 ? *(end1 - 1) : *(d))                   \
4557    == Sword)
4558
4559 /* Test if the character before D and the one at D differ with respect
4560    to being word-constituent.  */
4561 #define AT_WORD_BOUNDARY(d)                                             \
4562   (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)                             \
4563    || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4564 #endif
4565
/* Free everything we malloc.

   FREE_VAR (var) releases VAR with REGEX_FREE if it is non-null and
   then resets it to NULL, so it can be expanded again on the same
   variable.  FREE_VARIABLES () releases the failure stack with
   REGEX_FREE_STACK and then the `regstart', `regend', `best_regstart'
   and `best_regend' variables of the enclosing function.

   When MATCH_MAY_ALLOCATE is not defined, FREE_VARIABLES () expands
   to a no-op expression and FREE_VAR is not defined at all.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)                                                  \
  do {                                                                  \
    if (var)                                                            \
      {                                                                 \
        REGEX_FREE (var);                                               \
        var = NULL;                                                     \
      }                                                                 \
  } while (0)
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning.  */
#endif /* not MATCH_MAY_ALLOCATE */
4587
4588 \f
4589 /* Optimization routines.  */
4590
4591 /* If the operation is a match against one or more chars,
4592    return a pointer to the next operation, else return NULL.  */
4593 static re_char *
4594 skip_one_char (const_re_char *p)
4595 {
4596   switch (*p++)
4597     {
4598     case anychar:
4599       break;
4600
4601     case exactn:
4602       p += *p + 1;
4603       break;
4604
4605     case charset_not:
4606     case charset:
4607       if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4608         {
4609           int mcnt;
4610           p = CHARSET_RANGE_TABLE (p - 1);
4611           EXTRACT_NUMBER_AND_INCR (mcnt, p);
4612           p = CHARSET_RANGE_TABLE_END (p, mcnt);
4613         }
4614       else
4615         p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4616       break;
4617
4618     case syntaxspec:
4619     case notsyntaxspec:
4620 #ifdef emacs
4621     case categoryspec:
4622     case notcategoryspec:
4623 #endif /* emacs */
4624       p++;
4625       break;
4626
4627     default:
4628       p = NULL;
4629     }
4630   return p;
4631 }
4632
4633
4634 /* Jump over non-matching operations.  */
4635 static re_char *
4636 skip_noops (const_re_char *p, const_re_char *pend)
4637 {
4638   int mcnt;
4639   while (p < pend)
4640     {
4641       switch (*p)
4642         {
4643         case start_memory:
4644         case stop_memory:
4645           p += 2; break;
4646         case no_op:
4647           p += 1; break;
4648         case jump:
4649           p += 1;
4650           EXTRACT_NUMBER_AND_INCR (mcnt, p);
4651           p += mcnt;
4652           break;
4653         default:
4654           return p;
4655         }
4656     }
4657   assert (p == pend);
4658   return p;
4659 }
4660
4661 /* Non-zero if "p1 matches something" implies "p2 fails".  */
4662 static int
4663 mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
4664                       const_re_char *p2)
4665 {
4666   re_opcode_t op2;
4667   const boolean multibyte = RE_MULTIBYTE_P (bufp);
4668   unsigned char *pend = bufp->buffer + bufp->used;
4669
4670   assert (p1 >= bufp->buffer && p1 < pend
4671           && p2 >= bufp->buffer && p2 <= pend);
4672
4673   /* Skip over open/close-group commands.
4674      If what follows this loop is a ...+ construct,
4675      look at what begins its body, since we will have to
4676      match at least one of that.  */
4677   p2 = skip_noops (p2, pend);
4678   /* The same skip can be done for p1, except that this function
4679      is only used in the case where p1 is a simple match operator.  */
4680   /* p1 = skip_noops (p1, pend); */
4681
4682   assert (p1 >= bufp->buffer && p1 < pend
4683           && p2 >= bufp->buffer && p2 <= pend);
4684
4685   op2 = p2 == pend ? succeed : *p2;
4686
4687   switch (op2)
4688     {
4689     case succeed:
4690     case endbuf:
4691       /* If we're at the end of the pattern, we can change.  */
4692       if (skip_one_char (p1))
4693         {
4694           DEBUG_PRINT ("  End of pattern: fast loop.\n");
4695           return 1;
4696         }
4697       break;
4698
4699     case endline:
4700     case exactn:
4701       {
4702         register re_wchar_t c
4703           = (re_opcode_t) *p2 == endline ? '\n'
4704           : RE_STRING_CHAR (p2 + 2, multibyte);
4705
4706         if ((re_opcode_t) *p1 == exactn)
4707           {
4708             if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4709               {
4710                 DEBUG_PRINT ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
4711                 return 1;
4712               }
4713           }
4714
4715         else if ((re_opcode_t) *p1 == charset
4716                  || (re_opcode_t) *p1 == charset_not)
4717           {
4718             int not = (re_opcode_t) *p1 == charset_not;
4719
4720             /* Test if C is listed in charset (or charset_not)
4721                at `p1'.  */
4722             if (! multibyte || IS_REAL_ASCII (c))
4723               {
4724                 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4725                     && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4726                   not = !not;
4727               }
4728             else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4729               CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4730
4731             /* `not' is equal to 1 if c would match, which means
4732                that we can't change to pop_failure_jump.  */
4733             if (!not)
4734               {
4735                 DEBUG_PRINT ("   No match => fast loop.\n");
4736                 return 1;
4737               }
4738           }
4739         else if ((re_opcode_t) *p1 == anychar
4740                  && c == '\n')
4741           {
4742             DEBUG_PRINT ("   . != \\n => fast loop.\n");
4743             return 1;
4744           }
4745       }
4746       break;
4747
4748     case charset:
4749       {
4750         if ((re_opcode_t) *p1 == exactn)
4751           /* Reuse the code above.  */
4752           return mutually_exclusive_p (bufp, p2, p1);
4753
4754       /* It is hard to list up all the character in charset
4755          P2 if it includes multibyte character.  Give up in
4756          such case.  */
4757       else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4758         {
4759           /* Now, we are sure that P2 has no range table.
4760              So, for the size of bitmap in P2, `p2[1]' is
4761              enough.  But P1 may have range table, so the
4762              size of bitmap table of P1 is extracted by
4763              using macro `CHARSET_BITMAP_SIZE'.
4764
4765              In a multibyte case, we know that all the character
4766              listed in P2 is ASCII.  In a unibyte case, P1 has only a
4767              bitmap table.  So, in both cases, it is enough to test
4768              only the bitmap table of P1.  */
4769
4770           if ((re_opcode_t) *p1 == charset)
4771             {
4772               int idx;
4773               /* We win if the charset inside the loop
4774                  has no overlap with the one after the loop.  */
4775               for (idx = 0;
4776                    (idx < (int) p2[1]
4777                     && idx < CHARSET_BITMAP_SIZE (p1));
4778                    idx++)
4779                 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4780                   break;
4781
4782               if (idx == p2[1]
4783                   || idx == CHARSET_BITMAP_SIZE (p1))
4784                 {
4785                   DEBUG_PRINT ("         No match => fast loop.\n");
4786                   return 1;
4787                 }
4788             }
4789           else if ((re_opcode_t) *p1 == charset_not)
4790             {
4791               int idx;
4792               /* We win if the charset_not inside the loop lists
4793                  every character listed in the charset after.  */
4794               for (idx = 0; idx < (int) p2[1]; idx++)
4795                 if (! (p2[2 + idx] == 0
4796                        || (idx < CHARSET_BITMAP_SIZE (p1)
4797                            && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4798                   break;
4799
4800               if (idx == p2[1])
4801                 {
4802                   DEBUG_PRINT ("         No match => fast loop.\n");
4803                   return 1;
4804                 }
4805               }
4806           }
4807       }
4808       break;
4809
4810     case charset_not:
4811       switch (*p1)
4812         {
4813         case exactn:
4814         case charset:
4815           /* Reuse the code above.  */
4816           return mutually_exclusive_p (bufp, p2, p1);
4817         case charset_not:
4818           /* When we have two charset_not, it's very unlikely that
4819              they don't overlap.  The union of the two sets of excluded
4820              chars should cover all possible chars, which, as a matter of
4821              fact, is virtually impossible in multibyte buffers.  */
4822           break;
4823         }
4824       break;
4825
4826     case wordend:
4827       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4828     case symend:
4829       return ((re_opcode_t) *p1 == syntaxspec
4830               && (p1[1] == Ssymbol || p1[1] == Sword));
4831     case notsyntaxspec:
4832       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4833
4834     case wordbeg:
4835       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4836     case symbeg:
4837       return ((re_opcode_t) *p1 == notsyntaxspec
4838               && (p1[1] == Ssymbol || p1[1] == Sword));
4839     case syntaxspec:
4840       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4841
4842     case wordbound:
4843       return (((re_opcode_t) *p1 == notsyntaxspec
4844                || (re_opcode_t) *p1 == syntaxspec)
4845               && p1[1] == Sword);
4846
4847 #ifdef emacs
4848     case categoryspec:
4849       return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4850     case notcategoryspec:
4851       return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4852 #endif /* emacs */
4853
4854     default:
4855       ;
4856     }
4857
4858   /* Safe default.  */
4859   return 0;
4860 }
4861
4862 \f
4863 /* Matching routines.  */
4864
4865 #ifndef emacs   /* Emacs never uses this.  */
4866 /* re_match is like re_match_2 except it takes only a single string.  */
4867
4868 regoff_t
4869 re_match (struct re_pattern_buffer *bufp, const char *string,
4870           size_t size, ssize_t pos, struct re_registers *regs)
4871 {
4872   regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4873                                          size, pos, regs, size);
4874   return result;
4875 }
4876 WEAK_ALIAS (__re_match, re_match)
4877 #endif /* not emacs */
4878
#ifdef emacs
/* The string or buffer that is currently being matched against, when
   running inside Emacs.  The matcher consults it to look up syntax
   properties.  */
Lisp_Object re_match_object;
#endif /* emacs */
4884
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.

   We return -1 if no match, -2 if an internal error (such as the
   failure stack overflowing).  Otherwise, we return the length of the
   matched substring.  */
4897
4898 regoff_t
4899 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4900             size_t size1, const char *string2, size_t size2, ssize_t pos,
4901             struct re_registers *regs, ssize_t stop)
4902 {
4903   regoff_t result;
4904
4905 #ifdef emacs
4906   ssize_t charpos;
4907   gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4908   charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
4909   SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4910 #endif
4911
4912   result = re_match_2_internal (bufp, (re_char*) string1, size1,
4913                                 (re_char*) string2, size2,
4914                                 pos, regs, stop);
4915   return result;
4916 }
4917 WEAK_ALIAS (__re_match_2, re_match_2)
4918
4919
4920 /* This is a separate function so that we can force an alloca cleanup
4921    afterwards.  */
4922 static regoff_t
4923 re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
4924                      size_t size1, const_re_char *string2, size_t size2,
4925                      ssize_t pos, struct re_registers *regs, ssize_t stop)
4926 {
4927   /* General temporaries.  */
4928   int mcnt;
4929   size_t reg;
4930
4931   /* Just past the end of the corresponding string.  */
4932   re_char *end1, *end2;
4933
4934   /* Pointers into string1 and string2, just past the last characters in
4935      each to consider matching.  */
4936   re_char *end_match_1, *end_match_2;
4937
4938   /* Where we are in the data, and the end of the current string.  */
4939   re_char *d, *dend;
4940
4941   /* Used sometimes to remember where we were before starting matching
4942      an operator so that we can go back in case of failure.  This "atomic"
4943      behavior of matching opcodes is indispensable to the correctness
4944      of the on_failure_keep_string_jump optimization.  */
4945   re_char *dfail;
4946
4947   /* Where we are in the pattern, and the end of the pattern.  */
4948   re_char *p = bufp->buffer;
4949   re_char *pend = p + bufp->used;
4950
4951   /* We use this to map every character in the string.  */
4952   RE_TRANSLATE_TYPE translate = bufp->translate;
4953
4954   /* Nonzero if BUFP is setup from a multibyte regex.  */
4955   const boolean multibyte = RE_MULTIBYTE_P (bufp);
4956
4957   /* Nonzero if STRING1/STRING2 are multibyte.  */
4958   const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4959
4960   /* Failure point stack.  Each place that can handle a failure further
4961      down the line pushes a failure point on this stack.  It consists of
4962      regstart, and regend for all registers corresponding to
4963      the subexpressions we're currently inside, plus the number of such
4964      registers, and, finally, two char *'s.  The first char * is where
4965      to resume scanning the pattern; the second one is where to resume
4966      scanning the strings.  */
4967 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global.  */
4968   fail_stack_type fail_stack;
4969 #endif
4970 #ifdef DEBUG_COMPILES_ARGUMENTS
4971   unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4972 #endif
4973
4974 #if defined REL_ALLOC && defined REGEX_MALLOC
4975   /* This holds the pointer to the failure stack, when
4976      it is allocated relocatably.  */
4977   fail_stack_elt_t *failure_stack_ptr;
4978 #endif
4979
4980   /* We fill all the registers internally, independent of what we
4981      return, for use in backreferences.  The number here includes
4982      an element for register zero.  */
4983   size_t num_regs = bufp->re_nsub + 1;
4984
4985   /* Information on the contents of registers. These are pointers into
4986      the input strings; they record just what was matched (on this
4987      attempt) by a subexpression part of the pattern, that is, the
4988      regnum-th regstart pointer points to where in the pattern we began
4989      matching and the regnum-th regend points to right after where we
4990      stopped matching the regnum-th subexpression.  (The zeroth register
4991      keeps track of what the whole pattern matches.)  */
4992 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
4993   re_char **regstart, **regend;
4994 #endif
4995
4996   /* The following record the register info as found in the above
4997      variables when we find a match better than any we've seen before.
4998      This happens as we backtrack through the failure points, which in
4999      turn happens only if we have not yet matched the entire string. */
5000   unsigned best_regs_set = false;
5001 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
5002   re_char **best_regstart, **best_regend;
5003 #endif
5004
5005   /* Logically, this is `best_regend[0]'.  But we don't want to have to
5006      allocate space for that if we're not allocating space for anything
5007      else (see below).  Also, we never need info about register 0 for
5008      any of the other register vectors, and it seems rather a kludge to
5009      treat `best_regend' differently than the rest.  So we keep track of
5010      the end of the best match so far in a separate variable.  We
5011      initialize this to NULL so that when we backtrack the first time
5012      and need to test it, it's not garbage.  */
5013   re_char *match_end = NULL;
5014
5015 #ifdef DEBUG_COMPILES_ARGUMENTS
5016   /* Counts the total number of registers pushed.  */
5017   unsigned num_regs_pushed = 0;
5018 #endif
5019
5020   DEBUG_PRINT ("\n\nEntering re_match_2.\n");
5021
5022   INIT_FAIL_STACK ();
5023
5024 #ifdef MATCH_MAY_ALLOCATE
5025   /* Do not bother to initialize all the register variables if there are
5026      no groups in the pattern, as it takes a fair amount of time.  If
5027      there are groups, we include space for register 0 (the whole
5028      pattern), even though we never use it, since it simplifies the
5029      array indexing.  We should fix this.  */
5030   if (bufp->re_nsub)
5031     {
5032       regstart = REGEX_TALLOC (num_regs, re_char *);
5033       regend = REGEX_TALLOC (num_regs, re_char *);
5034       best_regstart = REGEX_TALLOC (num_regs, re_char *);
5035       best_regend = REGEX_TALLOC (num_regs, re_char *);
5036
5037       if (!(regstart && regend && best_regstart && best_regend))
5038         {
5039           FREE_VARIABLES ();
5040           return -2;
5041         }
5042     }
5043   else
5044     {
5045       /* We must initialize all our variables to NULL, so that
5046          `FREE_VARIABLES' doesn't try to free them.  */
5047       regstart = regend = best_regstart = best_regend = NULL;
5048     }
5049 #endif /* MATCH_MAY_ALLOCATE */
5050
5051   /* The starting position is bogus.  */
5052   if (pos < 0 || pos > size1 + size2)
5053     {
5054       FREE_VARIABLES ();
5055       return -1;
5056     }
5057
5058   /* Initialize subexpression text positions to -1 to mark ones that no
5059      start_memory/stop_memory has been seen for. Also initialize the
5060      register information struct.  */
5061   for (reg = 1; reg < num_regs; reg++)
5062     regstart[reg] = regend[reg] = NULL;
5063
5064   /* We move `string1' into `string2' if the latter's empty -- but not if
5065      `string1' is null.  */
5066   if (size2 == 0 && string1 != NULL)
5067     {
5068       string2 = string1;
5069       size2 = size1;
5070       string1 = 0;
5071       size1 = 0;
5072     }
5073   end1 = string1 + size1;
5074   end2 = string2 + size2;
5075
5076   /* `p' scans through the pattern as `d' scans through the data.
5077      `dend' is the end of the input string that `d' points within.  `d'
5078      is advanced into the following input string whenever necessary, but
5079      this happens before fetching; therefore, at the beginning of the
5080      loop, `d' can be pointing at the end of a string, but it cannot
5081      equal `string2'.  */
5082   if (pos >= size1)
5083     {
5084       /* Only match within string2.  */
5085       d = string2 + pos - size1;
5086       dend = end_match_2 = string2 + stop - size1;
5087       end_match_1 = end1;       /* Just to give it a value.  */
5088     }
5089   else
5090     {
5091       if (stop < size1)
5092         {
5093           /* Only match within string1.  */
5094           end_match_1 = string1 + stop;
5095           /* BEWARE!
5096              When we reach end_match_1, PREFETCH normally switches to string2.
5097              But in the present case, this means that just doing a PREFETCH
5098              makes us jump from `stop' to `gap' within the string.
5099              What we really want here is for the search to stop as
5100              soon as we hit end_match_1.  That's why we set end_match_2
5101              to end_match_1 (since PREFETCH fails as soon as we hit
5102              end_match_2).  */
5103           end_match_2 = end_match_1;
5104         }
5105       else
5106         { /* It's important to use this code when stop == size so that
5107              moving `d' from end1 to string2 will not prevent the d == dend
5108              check from catching the end of string.  */
5109           end_match_1 = end1;
5110           end_match_2 = string2 + stop - size1;
5111         }
5112       d = string1 + pos;
5113       dend = end_match_1;
5114     }
5115
5116   DEBUG_PRINT ("The compiled pattern is: ");
5117   DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5118   DEBUG_PRINT ("The string to match is: `");
5119   DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5120   DEBUG_PRINT ("'\n");
5121
5122   /* This loops over pattern commands.  It exits by returning from the
5123      function if the match is complete, or it drops through if the match
5124      fails at this starting point in the input data.  */
5125   for (;;)
5126     {
5127       DEBUG_PRINT ("\n%p: ", p);
5128
5129       if (p == pend)
5130         {
5131           ptrdiff_t dcnt;
5132
5133           /* End of pattern means we might have succeeded.  */
5134           DEBUG_PRINT ("end of pattern ... ");
5135
5136           /* If we haven't matched the entire string, and we want the
5137              longest match, try backtracking.  */
5138           if (d != end_match_2)
5139             {
5140               /* 1 if this match ends in the same string (string1 or string2)
5141                  as the best previous match.  */
5142               boolean same_str_p = (FIRST_STRING_P (match_end)
5143                                     == FIRST_STRING_P (d));
5144               /* 1 if this match is the best seen so far.  */
5145               boolean best_match_p;
5146
5147               /* AIX compiler got confused when this was combined
5148                  with the previous declaration.  */
5149               if (same_str_p)
5150                 best_match_p = d > match_end;
5151               else
5152                 best_match_p = !FIRST_STRING_P (d);
5153
5154               DEBUG_PRINT ("backtracking.\n");
5155
5156               if (!FAIL_STACK_EMPTY ())
5157                 { /* More failure points to try.  */
5158
5159                   /* If exceeds best match so far, save it.  */
5160                   if (!best_regs_set || best_match_p)
5161                     {
5162                       best_regs_set = true;
5163                       match_end = d;
5164
5165                       DEBUG_PRINT ("\nSAVING match as best so far.\n");
5166
5167                       for (reg = 1; reg < num_regs; reg++)
5168                         {
5169                           best_regstart[reg] = regstart[reg];
5170                           best_regend[reg] = regend[reg];
5171                         }
5172                     }
5173                   goto fail;
5174                 }
5175
5176               /* If no failure points, don't restore garbage.  And if
5177                  last match is real best match, don't restore second
5178                  best one. */
5179               else if (best_regs_set && !best_match_p)
5180                 {
5181                 restore_best_regs:
5182                   /* Restore best match.  It may happen that `dend ==
5183                      end_match_1' while the restored d is in string2.
5184                      For example, the pattern `x.*y.*z' against the
5185                      strings `x-' and `y-z-', if the two strings are
5186                      not consecutive in memory.  */
5187                   DEBUG_PRINT ("Restoring best registers.\n");
5188
5189                   d = match_end;
5190                   dend = ((d >= string1 && d <= end1)
5191                            ? end_match_1 : end_match_2);
5192
5193                   for (reg = 1; reg < num_regs; reg++)
5194                     {
5195                       regstart[reg] = best_regstart[reg];
5196                       regend[reg] = best_regend[reg];
5197                     }
5198                 }
5199             } /* d != end_match_2 */
5200
5201         succeed_label:
5202           DEBUG_PRINT ("Accepting match.\n");
5203
5204           /* If caller wants register contents data back, do it.  */
5205           if (regs && !bufp->no_sub)
5206             {
5207               /* Have the register data arrays been allocated?  */
5208               if (bufp->regs_allocated == REGS_UNALLOCATED)
5209                 { /* No.  So allocate them with malloc.  We need one
5210                      extra element beyond `num_regs' for the `-1' marker
5211                      GNU code uses.  */
5212                   regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5213                   regs->start = TALLOC (regs->num_regs, regoff_t);
5214                   regs->end = TALLOC (regs->num_regs, regoff_t);
5215                   if (regs->start == NULL || regs->end == NULL)
5216                     {
5217                       FREE_VARIABLES ();
5218                       return -2;
5219                     }
5220                   bufp->regs_allocated = REGS_REALLOCATE;
5221                 }
5222               else if (bufp->regs_allocated == REGS_REALLOCATE)
5223                 { /* Yes.  If we need more elements than were already
5224                      allocated, reallocate them.  If we need fewer, just
5225                      leave it alone.  */
5226                   if (regs->num_regs < num_regs + 1)
5227                     {
5228                       regs->num_regs = num_regs + 1;
5229                       RETALLOC (regs->start, regs->num_regs, regoff_t);
5230                       RETALLOC (regs->end, regs->num_regs, regoff_t);
5231                       if (regs->start == NULL || regs->end == NULL)
5232                         {
5233                           FREE_VARIABLES ();
5234                           return -2;
5235                         }
5236                     }
5237                 }
5238               else
5239                 {
5240                   /* These braces fend off a "empty body in an else-statement"
5241                      warning under GCC when assert expands to nothing.  */
5242                   assert (bufp->regs_allocated == REGS_FIXED);
5243                 }
5244
5245               /* Convert the pointer data in `regstart' and `regend' to
5246                  indices.  Register zero has to be set differently,
5247                  since we haven't kept track of any info for it.  */
5248               if (regs->num_regs > 0)
5249                 {
5250                   regs->start[0] = pos;
5251                   regs->end[0] = POINTER_TO_OFFSET (d);
5252                 }
5253
5254               /* Go through the first `min (num_regs, regs->num_regs)'
5255                  registers, since that is all we initialized.  */
5256               for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
5257                 {
5258                   if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5259                     regs->start[reg] = regs->end[reg] = -1;
5260                   else
5261                     {
5262                       regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
5263                       regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
5264                     }
5265                 }
5266
5267               /* If the regs structure we return has more elements than
5268                  were in the pattern, set the extra elements to -1.  If
5269                  we (re)allocated the registers, this is the case,
5270                  because we always allocate enough to have at least one
5271                  -1 at the end.  */
5272               for (reg = num_regs; reg < regs->num_regs; reg++)
5273                 regs->start[reg] = regs->end[reg] = -1;
5274             } /* regs && !bufp->no_sub */
5275
5276           DEBUG_PRINT ("%u failure points pushed, %u popped (%u remain).\n",
5277                        nfailure_points_pushed, nfailure_points_popped,
5278                        nfailure_points_pushed - nfailure_points_popped);
5279           DEBUG_PRINT ("%u registers pushed.\n", num_regs_pushed);
5280
5281           dcnt = POINTER_TO_OFFSET (d) - pos;
5282
5283           DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
5284
5285           FREE_VARIABLES ();
5286           return dcnt;
5287         }
5288
5289       /* Otherwise match next pattern command.  */
5290       switch (*p++)
5291         {
5292         /* Ignore these.  Used to ignore the n of succeed_n's which
5293            currently have n == 0.  */
5294         case no_op:
5295           DEBUG_PRINT ("EXECUTING no_op.\n");
5296           break;
5297
5298         case succeed:
5299           DEBUG_PRINT ("EXECUTING succeed.\n");
5300           goto succeed_label;
5301
5302         /* Match the next n pattern characters exactly.  The following
5303            byte in the pattern defines n, and the n bytes after that
5304            are the characters to match.  */
5305         case exactn:
5306           mcnt = *p++;
5307           DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
5308
5309           /* Remember the start point to rollback upon failure.  */
5310           dfail = d;
5311
5312 #ifndef emacs
5313           /* This is written out as an if-else so we don't waste time
5314              testing `translate' inside the loop.  */
5315           if (RE_TRANSLATE_P (translate))
5316             do
5317               {
5318                 PREFETCH ();
5319                 if (RE_TRANSLATE (translate, *d) != *p++)
5320                   {
5321                     d = dfail;
5322                     goto fail;
5323                   }
5324                 d++;
5325               }
5326             while (--mcnt);
5327           else
5328             do
5329               {
5330                 PREFETCH ();
5331                 if (*d++ != *p++)
5332                   {
5333                     d = dfail;
5334                     goto fail;
5335                   }
5336               }
5337             while (--mcnt);
5338 #else  /* emacs */
5339           /* The cost of testing `translate' is comparatively small.  */
5340           if (target_multibyte)
5341             do
5342               {
5343                 int pat_charlen, buf_charlen;
5344                 int pat_ch, buf_ch;
5345
5346                 PREFETCH ();
5347                 if (multibyte)
5348                   pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5349                 else
5350                   {
5351                     pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5352                     pat_charlen = 1;
5353                   }
5354                 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
5355
5356                 if (TRANSLATE (buf_ch) != pat_ch)
5357                   {
5358                     d = dfail;
5359                     goto fail;
5360                   }
5361
5362                 p += pat_charlen;
5363                 d += buf_charlen;
5364                 mcnt -= pat_charlen;
5365               }
5366             while (mcnt > 0);
5367           else
5368             do
5369               {
5370                 int pat_charlen;
5371                 int pat_ch, buf_ch;
5372
5373                 PREFETCH ();
5374                 if (multibyte)
5375                   {
5376                     pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5377                     pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
5378                   }
5379                 else
5380                   {
5381                     pat_ch = *p;
5382                     pat_charlen = 1;
5383                   }
5384                 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5385                 if (! CHAR_BYTE8_P (buf_ch))
5386                   {
5387                     buf_ch = TRANSLATE (buf_ch);
5388                     buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5389                     if (buf_ch < 0)
5390                       buf_ch = *d;
5391                   }
5392                 else
5393                   buf_ch = *d;
5394                 if (buf_ch != pat_ch)
5395                   {
5396                     d = dfail;
5397                     goto fail;
5398                   }
5399                 p += pat_charlen;
5400                 d++;
5401               }
5402             while (--mcnt);
5403 #endif
5404           break;
5405
5406
5407         /* Match any character except possibly a newline or a null.  */
5408         case anychar:
5409           {
5410             int buf_charlen;
5411             re_wchar_t buf_ch;
5412
5413             DEBUG_PRINT ("EXECUTING anychar.\n");
5414
5415             PREFETCH ();
5416             buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
5417                                                 target_multibyte);
5418             buf_ch = TRANSLATE (buf_ch);
5419
5420             if ((!(bufp->syntax & RE_DOT_NEWLINE)
5421                  && buf_ch == '\n')
5422                 || ((bufp->syntax & RE_DOT_NOT_NULL)
5423                     && buf_ch == '\000'))
5424               goto fail;
5425
5426             DEBUG_PRINT ("  Matched `%d'.\n", *d);
5427             d += buf_charlen;
5428           }
5429           break;
5430
5431
5432         case charset:
5433         case charset_not:
5434           {
5435             register unsigned int c;
5436             boolean not = (re_opcode_t) *(p - 1) == charset_not;
5437             int len;
5438
5439             /* Start of actual range_table, or end of bitmap if there is no
5440                range table.  */
5441             re_char *range_table IF_LINT (= NULL);
5442
5443             /* Nonzero if there is a range table.  */
5444             int range_table_exists;
5445
5446             /* Number of ranges of range table.  This is not included
5447                in the initial byte-length of the command.  */
5448             int count = 0;
5449
5450             /* Whether matching against a unibyte character.  */
5451             boolean unibyte_char = false;
5452
5453             DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : "");
5454
5455             range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
5456
5457             if (range_table_exists)
5458               {
5459                 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap.  */
5460                 EXTRACT_NUMBER_AND_INCR (count, range_table);
5461               }
5462
5463             PREFETCH ();
5464             c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
5465             if (target_multibyte)
5466               {
5467                 int c1;
5468
5469                 c = TRANSLATE (c);
5470                 c1 = RE_CHAR_TO_UNIBYTE (c);
5471                 if (c1 >= 0)
5472                   {
5473                     unibyte_char = true;
5474                     c = c1;
5475                   }
5476               }
5477             else
5478               {
5479                 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5480
5481                 if (! CHAR_BYTE8_P (c1))
5482                   {
5483                     c1 = TRANSLATE (c1);
5484                     c1 = RE_CHAR_TO_UNIBYTE (c1);
5485                     if (c1 >= 0)
5486                       {
5487                         unibyte_char = true;
5488                         c = c1;
5489                       }
5490                   }
5491                 else
5492                   unibyte_char = true;
5493               }
5494
5495             if (unibyte_char && c < (1 << BYTEWIDTH))
5496               {                 /* Lookup bitmap.  */
5497                 /* Cast to `unsigned' instead of `unsigned char' in
5498                    case the bit list is a full 32 bytes long.  */
5499                 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
5500                     && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5501                   not = !not;
5502               }
5503 #ifdef emacs
5504             else if (range_table_exists)
5505               {
5506                 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5507
5508                 if (  (class_bits & BIT_LOWER && ISLOWER (c))
5509                     | (class_bits & BIT_MULTIBYTE)
5510                     | (class_bits & BIT_PUNCT && ISPUNCT (c))
5511                     | (class_bits & BIT_SPACE && ISSPACE (c))
5512                     | (class_bits & BIT_UPPER && ISUPPER (c))
5513                     | (class_bits & BIT_WORD  && ISWORD (c)))
5514                   not = !not;
5515                 else
5516                   CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5517               }
5518 #endif /* emacs */
5519
5520             if (range_table_exists)
5521               p = CHARSET_RANGE_TABLE_END (range_table, count);
5522             else
5523               p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
5524
5525             if (!not) goto fail;
5526
5527             d += len;
5528           }
5529           break;
5530
5531
5532         /* The beginning of a group is represented by start_memory.
5533            The argument is the register number.  The text
5534            matched within the group is recorded (in the internal
5535            registers data structure) under the register number.  */
5536         case start_memory:
5537           DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
5538
5539           /* In case we need to undo this operation (via backtracking).  */
5540           PUSH_FAILURE_REG (*p);
5541
5542           regstart[*p] = d;
5543           regend[*p] = NULL;    /* probably unnecessary.  -sm  */
5544           DEBUG_PRINT ("  regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
5545
5546           /* Move past the register number.  */
5547           p += 1;
5548           break;
5549
5550
5551         /* The stop_memory opcode represents the end of a group.  Its
5552            argument is the same as start_memory's: the register number.  */
5553         case stop_memory:
5554           DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
5555
5556           assert (!REG_UNSET (regstart[*p]));
5557           /* Strictly speaking, there should be code such as:
5558
5559                 assert (REG_UNSET (regend[*p]));
5560                 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5561
5562              But the only info to be pushed is regend[*p] and it is known to
5563              be UNSET, so there really isn't anything to push.
5564              Not pushing anything, on the other hand deprives us from the
5565              guarantee that regend[*p] is UNSET since undoing this operation
5566              will not reset its value properly.  This is not important since
5567              the value will only be read on the next start_memory or at
5568              the very end and both events can only happen if this stop_memory
5569              is *not* undone.  */
5570
5571           regend[*p] = d;
5572           DEBUG_PRINT ("      regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
5573
5574           /* Move past the register number.  */
5575           p += 1;
5576           break;
5577
5578
5579         /* \<digit> has been turned into a `duplicate' command which is
5580            followed by the numeric value of <digit> as the register number.  */
5581         case duplicate:
5582           {
5583             register re_char *d2, *dend2;
5584             int regno = *p++;   /* Get which register to match against.  */
5585             DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
5586
5587             /* Can't back reference a group which we've never matched.  */
5588             if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5589               goto fail;
5590
5591             /* Where in input to try to start matching.  */
5592             d2 = regstart[regno];
5593
5594             /* Remember the start point to rollback upon failure.  */
5595             dfail = d;
5596
5597             /* Where to stop matching; if both the place to start and
5598                the place to stop matching are in the same string, then
5599                set to the place to stop, otherwise, for now have to use
5600                the end of the first string.  */
5601
5602             dend2 = ((FIRST_STRING_P (regstart[regno])
5603                       == FIRST_STRING_P (regend[regno]))
5604                      ? regend[regno] : end_match_1);
5605             for (;;)
5606               {
5607                 ptrdiff_t dcnt;
5608
5609                 /* If necessary, advance to next segment in register
5610                    contents.  */
5611                 while (d2 == dend2)
5612                   {
5613                     if (dend2 == end_match_2) break;
5614                     if (dend2 == regend[regno]) break;
5615
5616                     /* End of string1 => advance to string2. */
5617                     d2 = string2;
5618                     dend2 = regend[regno];
5619                   }
5620                 /* At end of register contents => success */
5621                 if (d2 == dend2) break;
5622
5623                 /* If necessary, advance to next segment in data.  */
5624                 PREFETCH ();
5625
5626                 /* How many characters left in this segment to match.  */
5627                 dcnt = dend - d;
5628
5629                 /* Want how many consecutive characters we can match in
5630                    one shot, so, if necessary, adjust the count.  */
5631                 if (dcnt > dend2 - d2)
5632                   dcnt = dend2 - d2;
5633
5634                 /* Compare that many; failure if mismatch, else move
5635                    past them.  */
5636                 if (RE_TRANSLATE_P (translate)
5637                     ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
5638                     : memcmp (d, d2, dcnt))
5639                   {
5640                     d = dfail;
5641                     goto fail;
5642                   }
5643                 d += dcnt, d2 += dcnt;
5644               }
5645           }
5646           break;
5647
5648
5649         /* begline matches the empty string at the beginning of the string
5650            (unless `not_bol' is set in `bufp'), and after newlines.  */
5651         case begline:
5652           DEBUG_PRINT ("EXECUTING begline.\n");
5653
5654           if (AT_STRINGS_BEG (d))
5655             {
5656               if (!bufp->not_bol) break;
5657             }
5658           else
5659             {
5660               unsigned c;
5661               GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5662               if (c == '\n')
5663                 break;
5664             }
5665           /* In all other cases, we fail.  */
5666           goto fail;
5667
5668
5669         /* endline is the dual of begline.  */
5670         case endline:
5671           DEBUG_PRINT ("EXECUTING endline.\n");
5672
5673           if (AT_STRINGS_END (d))
5674             {
5675               if (!bufp->not_eol) break;
5676             }
5677           else
5678             {
5679               PREFETCH_NOLIMIT ();
5680               if (*d == '\n')
5681                 break;
5682             }
5683           goto fail;
5684
5685
5686         /* Match at the very beginning of the data.  */
5687         case begbuf:
5688           DEBUG_PRINT ("EXECUTING begbuf.\n");
5689           if (AT_STRINGS_BEG (d))
5690             break;
5691           goto fail;
5692
5693
5694         /* Match at the very end of the data.  */
5695         case endbuf:
5696           DEBUG_PRINT ("EXECUTING endbuf.\n");
5697           if (AT_STRINGS_END (d))
5698             break;
5699           goto fail;
5700
5701
5702         /* on_failure_keep_string_jump is used to optimize `.*\n'.  It
5703            pushes NULL as the value for the string on the stack.  Then
5704            `POP_FAILURE_POINT' will keep the current value for the
5705            string, instead of restoring it.  To see why, consider
5706            matching `foo\nbar' against `.*\n'.  The .* matches the foo;
5707            then the . fails against the \n.  But the next thing we want
5708            to do is match the \n against the \n; if we restored the
5709            string value, we would be back at the foo.
5710
5711            Because this is used only in specific cases, we don't need to
5712            check all the things that `on_failure_jump' does, to make
5713            sure the right things get saved on the stack.  Hence we don't
5714            share its code.  The only reason to push anything on the
5715            stack at all is that otherwise we would have to change
5716            `anychar's code to do something besides goto fail in this
5717            case; that seems worse than this.  */
5718         case on_failure_keep_string_jump:
5719           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5720           DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5721                        mcnt, p + mcnt);
5722
5723           PUSH_FAILURE_POINT (p - 3, NULL);
5724           break;
5725
5726           /* A nasty loop is introduced by the non-greedy *? and +?.
5727              With such loops, the stack only ever contains one failure point
5728              at a time, so that a plain on_failure_jump_loop kind of
5729              cycle detection cannot work.  Worse yet, such a detection
5730              can not only fail to detect a cycle, but it can also wrongly
5731              detect a cycle (between different instantiations of the same
5732              loop).
5733              So the method used for those nasty loops is a little different:
5734              We use a special cycle-detection-stack-frame which is pushed
5735              when the on_failure_jump_nastyloop failure-point is *popped*.
5736              This special frame thus marks the beginning of one iteration
5737              through the loop and we can hence easily check right here
5738              whether something matched between the beginning and the end of
5739              the loop.  */
5740         case on_failure_jump_nastyloop:
5741           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5742           DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5743                        mcnt, p + mcnt);
5744
5745           assert ((re_opcode_t)p[-4] == no_op);
5746           {
5747             int cycle = 0;
5748             CHECK_INFINITE_LOOP (p - 4, d);
5749             if (!cycle)
5750               /* If there's a cycle, just continue without pushing
5751                  this failure point.  The failure point is the "try again"
5752                  option, which shouldn't be tried.
5753                  We want (x?)*?y\1z to match both xxyz and xxyxz.  */
5754               PUSH_FAILURE_POINT (p - 3, d);
5755           }
5756           break;
5757
5758           /* Simple loop detecting on_failure_jump:  just check on the
5759              failure stack if the same spot was already hit earlier.  */
5760         case on_failure_jump_loop:
5761         on_failure:
5762           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5763           DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5764                        mcnt, p + mcnt);
5765           {
5766             int cycle = 0;
5767             CHECK_INFINITE_LOOP (p - 3, d);
5768             if (cycle)
5769               /* If there's a cycle, get out of the loop, as if the matching
5770                  had failed.  We used to just `goto fail' here, but that was
5771                  aborting the search a bit too early: we want to keep the
5772                  empty-loop-match and keep matching after the loop.
5773                  We want (x?)*y\1z to match both xxyz and xxyxz.  */
5774               p += mcnt;
5775             else
5776               PUSH_FAILURE_POINT (p - 3, d);
5777           }
5778           break;
5779
5780
5781         /* Uses of on_failure_jump:
5782
5783            Each alternative starts with an on_failure_jump that points
5784            to the beginning of the next alternative.  Each alternative
5785            except the last ends with a jump that in effect jumps past
5786            the rest of the alternatives.  (They really jump to the
5787            ending jump of the following alternative, because tensioning
5788            these jumps is a hassle.)
5789
5790            Repeats start with an on_failure_jump that points past both
5791            the repetition text and either the following jump or
5792            pop_failure_jump back to this on_failure_jump.  */
5793         case on_failure_jump:
5794           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5795           DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
5796                        mcnt, p + mcnt);
5797
5798           PUSH_FAILURE_POINT (p -3, d);
5799           break;
5800
5801         /* This operation is used for greedy *.
5802            Compare the beginning of the repeat with what in the
5803            pattern follows its end. If we can establish that there
5804            is nothing that they would both match, i.e., that we
5805            would never have to backtrack (unlike in, e.g., `a*a'),
5806            then we can use a non-backtracking loop based on
5807            on_failure_keep_string_jump instead of on_failure_jump.  */
5808         case on_failure_jump_smart:
5809           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5810           DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5811                        mcnt, p + mcnt);
5812           {
5813             re_char *p1 = p; /* Next operation.  */
5814             /* Here, we discard `const', making re_match non-reentrant.  */
5815             unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest.  */
5816             unsigned char *p3 = (unsigned char*) p - 3; /* opcode location.  */
5817
5818             p -= 3;             /* Reset so that we will re-execute the
5819                                    instruction once it's been changed. */
5820
5821             EXTRACT_NUMBER (mcnt, p2 - 2);
5822
5823             /* Ensure this is indeed the trivial kind of loop
5824                we are expecting.  */
5825             assert (skip_one_char (p1) == p2 - 3);
5826             assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
5827             DEBUG_STATEMENT (debug += 2);
5828             if (mutually_exclusive_p (bufp, p1, p2))
5829               {
5830                 /* Use a fast `on_failure_keep_string_jump' loop.  */
5831                 DEBUG_PRINT ("  smart exclusive => fast loop.\n");
5832                 *p3 = (unsigned char) on_failure_keep_string_jump;
5833                 STORE_NUMBER (p2 - 2, mcnt + 3);
5834               }
5835             else
5836               {
5837                 /* Default to a safe `on_failure_jump' loop.  */
5838                 DEBUG_PRINT ("  smart default => slow loop.\n");
5839                 *p3 = (unsigned char) on_failure_jump;
5840               }
5841             DEBUG_STATEMENT (debug -= 2);
5842           }
5843           break;
5844
5845         /* Unconditionally jump (without popping any failure points).  */
5846         case jump:
5847         unconditional_jump:
5848           IMMEDIATE_QUIT_CHECK;
5849           EXTRACT_NUMBER_AND_INCR (mcnt, p);    /* Get the amount to jump.  */
5850           DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
5851           p += mcnt;                            /* Do the jump.  */
5852           DEBUG_PRINT ("(to %p).\n", p);
5853           break;
5854
5855
5856         /* Have to succeed matching what follows at least n times.
5857            After that, handle like `on_failure_jump'.  */
5858         case succeed_n:
5859           /* Signedness doesn't matter since we only compare MCNT to 0.  */
5860           EXTRACT_NUMBER (mcnt, p + 2);
5861           DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
5862
5863           /* Originally, mcnt is how many times we HAVE to succeed.  */
5864           if (mcnt != 0)
5865             {
5866               /* Here, we discard `const', making re_match non-reentrant.  */
5867               unsigned char *p2 = (unsigned char*) p + 2; /* counter loc.  */
5868               mcnt--;
5869               p += 4;
5870               PUSH_NUMBER (p2, mcnt);
5871             }
5872           else
5873             /* The two bytes encoding mcnt == 0 are two no_op opcodes.  */
5874             goto on_failure;
5875           break;
5876
5877         case jump_n:
5878           /* Signedness doesn't matter since we only compare MCNT to 0.  */
5879           EXTRACT_NUMBER (mcnt, p + 2);
5880           DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
5881
5882           /* Originally, this is how many times we CAN jump.  */
5883           if (mcnt != 0)
5884             {
5885                /* Here, we discard `const', making re_match non-reentrant.  */
5886               unsigned char *p2 = (unsigned char*) p + 2; /* counter loc.  */
5887               mcnt--;
5888               PUSH_NUMBER (p2, mcnt);
5889               goto unconditional_jump;
5890             }
5891           /* If we don't have to jump any more, skip over the rest of the command.  */
5892           else
5893             p += 4;
5894           break;
5895
5896         case set_number_at:
5897           {
5898             unsigned char *p2;  /* Location of the counter.  */
5899             DEBUG_PRINT ("EXECUTING set_number_at.\n");
5900
5901             EXTRACT_NUMBER_AND_INCR (mcnt, p);
5902             /* Here, we discard `const', making re_match non-reentrant.  */
5903             p2 = (unsigned char*) p + mcnt;
5904             /* Signedness doesn't matter since we only copy MCNT's bits.  */
5905             EXTRACT_NUMBER_AND_INCR (mcnt, p);
5906             DEBUG_PRINT ("  Setting %p to %d.\n", p2, mcnt);
5907             PUSH_NUMBER (p2, mcnt);
5908             break;
5909           }
5910
5911         case wordbound:
5912         case notwordbound:
5913           {
5914             boolean not = (re_opcode_t) *(p - 1) == notwordbound;
5915             DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
5916
5917             /* We SUCCEED (or FAIL) in one of the following cases: */
5918
5919             /* Case 1: D is at the beginning or the end of string.  */
5920             if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5921               not = !not;
5922             else
5923               {
5924                 /* C1 is the character before D, S1 is the syntax of C1, C2
5925                    is the character at D, and S2 is the syntax of C2.  */
5926                 re_wchar_t c1, c2;
5927                 int s1, s2;
5928                 int dummy;
5929 #ifdef emacs
5930                 ssize_t offset = PTR_TO_OFFSET (d - 1);
5931                 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5932                 UPDATE_SYNTAX_TABLE (charpos);
5933 #endif
5934                 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5935                 s1 = SYNTAX (c1);
5936 #ifdef emacs
5937                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5938 #endif
5939                 PREFETCH_NOLIMIT ();
5940                 GET_CHAR_AFTER (c2, d, dummy);
5941                 s2 = SYNTAX (c2);
5942
5943                 if (/* Case 2: Only one of S1 and S2 is Sword.  */
5944                     ((s1 == Sword) != (s2 == Sword))
5945                     /* Case 3: Both of S1 and S2 are Sword, and macro
5946                        WORD_BOUNDARY_P (C1, C2) returns nonzero.  */
5947                     || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5948                   not = !not;
5949               }
5950             if (not)
5951               break;
5952             else
5953               goto fail;
5954           }
5955
5956         case wordbeg:
5957           DEBUG_PRINT ("EXECUTING wordbeg.\n");
5958
5959           /* We FAIL in one of the following cases: */
5960
5961           /* Case 1: D is at the end of string.  */
5962           if (AT_STRINGS_END (d))
5963             goto fail;
5964           else
5965             {
5966               /* C1 is the character before D, S1 is the syntax of C1, C2
5967                  is the character at D, and S2 is the syntax of C2.  */
5968               re_wchar_t c1, c2;
5969               int s1, s2;
5970               int dummy;
5971 #ifdef emacs
5972               ssize_t offset = PTR_TO_OFFSET (d);
5973               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5974               UPDATE_SYNTAX_TABLE (charpos);
5975 #endif
5976               PREFETCH ();
5977               GET_CHAR_AFTER (c2, d, dummy);
5978               s2 = SYNTAX (c2);
5979
5980               /* Case 2: S2 is not Sword. */
5981               if (s2 != Sword)
5982                 goto fail;
5983
5984               /* Case 3: D is not at the beginning of string ... */
5985               if (!AT_STRINGS_BEG (d))
5986                 {
5987                   GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5988 #ifdef emacs
5989                   UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5990 #endif
5991                   s1 = SYNTAX (c1);
5992
5993                   /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5994                      returns 0.  */
5995                   if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5996                     goto fail;
5997                 }
5998             }
5999           break;
6000
6001         case wordend:
6002           DEBUG_PRINT ("EXECUTING wordend.\n");
6003
6004           /* We FAIL in one of the following cases: */
6005
6006           /* Case 1: D is at the beginning of string.  */
6007           if (AT_STRINGS_BEG (d))
6008             goto fail;
6009           else
6010             {
6011               /* C1 is the character before D, S1 is the syntax of C1, C2
6012                  is the character at D, and S2 is the syntax of C2.  */
6013               re_wchar_t c1, c2;
6014               int s1, s2;
6015               int dummy;
6016 #ifdef emacs
6017               ssize_t offset = PTR_TO_OFFSET (d) - 1;
6018               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6019               UPDATE_SYNTAX_TABLE (charpos);
6020 #endif
6021               GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6022               s1 = SYNTAX (c1);
6023
6024               /* Case 2: S1 is not Sword.  */
6025               if (s1 != Sword)
6026                 goto fail;
6027
6028               /* Case 3: D is not at the end of string ... */
6029               if (!AT_STRINGS_END (d))
6030                 {
6031                   PREFETCH_NOLIMIT ();
6032                   GET_CHAR_AFTER (c2, d, dummy);
6033 #ifdef emacs
6034                   UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6035 #endif
6036                   s2 = SYNTAX (c2);
6037
6038                   /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
6039                      returns 0.  */
6040                   if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6041           goto fail;
6042                 }
6043             }
6044           break;
6045
6046         case symbeg:
6047           DEBUG_PRINT ("EXECUTING symbeg.\n");
6048
6049           /* We FAIL in one of the following cases: */
6050
6051           /* Case 1: D is at the end of string.  */
6052           if (AT_STRINGS_END (d))
6053             goto fail;
6054           else
6055             {
6056               /* C1 is the character before D, S1 is the syntax of C1, C2
6057                  is the character at D, and S2 is the syntax of C2.  */
6058               re_wchar_t c1, c2;
6059               int s1, s2;
6060 #ifdef emacs
6061               ssize_t offset = PTR_TO_OFFSET (d);
6062               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6063               UPDATE_SYNTAX_TABLE (charpos);
6064 #endif
6065               PREFETCH ();
6066               c2 = RE_STRING_CHAR (d, target_multibyte);
6067               s2 = SYNTAX (c2);
6068
6069               /* Case 2: S2 is neither Sword nor Ssymbol. */
6070               if (s2 != Sword && s2 != Ssymbol)
6071                 goto fail;
6072
6073               /* Case 3: D is not at the beginning of string ... */
6074               if (!AT_STRINGS_BEG (d))
6075                 {
6076                   GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6077 #ifdef emacs
6078                   UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6079 #endif
6080                   s1 = SYNTAX (c1);
6081
6082                   /* ... and S1 is Sword or Ssymbol.  */
6083                   if (s1 == Sword || s1 == Ssymbol)
6084                     goto fail;
6085                 }
6086             }
6087           break;
6088
6089         case symend:
6090           DEBUG_PRINT ("EXECUTING symend.\n");
6091
6092           /* We FAIL in one of the following cases: */
6093
6094           /* Case 1: D is at the beginning of string.  */
6095           if (AT_STRINGS_BEG (d))
6096             goto fail;
6097           else
6098             {
6099               /* C1 is the character before D, S1 is the syntax of C1, C2
6100                  is the character at D, and S2 is the syntax of C2.  */
6101               re_wchar_t c1, c2;
6102               int s1, s2;
6103 #ifdef emacs
6104               ssize_t offset = PTR_TO_OFFSET (d) - 1;
6105               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6106               UPDATE_SYNTAX_TABLE (charpos);
6107 #endif
6108               GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6109               s1 = SYNTAX (c1);
6110
6111               /* Case 2: S1 is neither Ssymbol nor Sword.  */
6112               if (s1 != Sword && s1 != Ssymbol)
6113                 goto fail;
6114
6115               /* Case 3: D is not at the end of string ... */
6116               if (!AT_STRINGS_END (d))
6117                 {
6118                   PREFETCH_NOLIMIT ();
6119                   c2 = RE_STRING_CHAR (d, target_multibyte);
6120 #ifdef emacs
6121                   UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
6122 #endif
6123                   s2 = SYNTAX (c2);
6124
6125                   /* ... and S2 is Sword or Ssymbol.  */
6126                   if (s2 == Sword || s2 == Ssymbol)
6127                     goto fail;
6128                 }
6129             }
6130           break;
6131
6132         case syntaxspec:
6133         case notsyntaxspec:
6134           {
6135             boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6136             mcnt = *p++;
6137             DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
6138                          mcnt);
6139             PREFETCH ();
6140 #ifdef emacs
6141             {
6142               ssize_t offset = PTR_TO_OFFSET (d);
6143               ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6144               UPDATE_SYNTAX_TABLE (pos1);
6145             }
6146 #endif
6147             {
6148               int len;
6149               re_wchar_t c;
6150
6151               GET_CHAR_AFTER (c, d, len);
6152               if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6153                 goto fail;
6154               d += len;
6155             }
6156           }
6157           break;
6158
6159 #ifdef emacs
6160         case before_dot:
6161           DEBUG_PRINT ("EXECUTING before_dot.\n");
6162           if (PTR_BYTE_POS (d) >= PT_BYTE)
6163             goto fail;
6164           break;
6165
6166         case at_dot:
6167           DEBUG_PRINT ("EXECUTING at_dot.\n");
6168           if (PTR_BYTE_POS (d) != PT_BYTE)
6169             goto fail;
6170           break;
6171
6172         case after_dot:
6173           DEBUG_PRINT ("EXECUTING after_dot.\n");
6174           if (PTR_BYTE_POS (d) <= PT_BYTE)
6175             goto fail;
6176           break;
6177
6178         case categoryspec:
6179         case notcategoryspec:
6180           {
6181             boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6182             mcnt = *p++;
6183             DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
6184                          not ? "not" : "", mcnt);
6185             PREFETCH ();
6186
6187             {
6188               int len;
6189               re_wchar_t c;
6190               GET_CHAR_AFTER (c, d, len);
6191               if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6192                 goto fail;
6193               d += len;
6194             }
6195           }
6196           break;
6197
6198 #endif /* emacs */
6199
6200         default:
6201           abort ();
6202         }
6203       continue;  /* Successfully executed one pattern command; keep going.  */
6204
6205
6206     /* We goto here if a matching operation fails. */
6207     fail:
6208       IMMEDIATE_QUIT_CHECK;
6209       if (!FAIL_STACK_EMPTY ())
6210         {
6211           re_char *str, *pat;
6212           /* A restart point is known.  Restore to that state.  */
6213           DEBUG_PRINT ("\nFAIL:\n");
6214           POP_FAILURE_POINT (str, pat);
6215           switch (*pat++)
6216             {
6217             case on_failure_keep_string_jump:
6218               assert (str == NULL);
6219               goto continue_failure_jump;
6220
6221             case on_failure_jump_nastyloop:
6222               assert ((re_opcode_t)pat[-2] == no_op);
6223               PUSH_FAILURE_POINT (pat - 2, str);
6224               /* Fallthrough */
6225
6226             case on_failure_jump_loop:
6227             case on_failure_jump:
6228             case succeed_n:
6229               d = str;
6230             continue_failure_jump:
6231               EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6232               p = pat + mcnt;
6233               break;
6234
6235             case no_op:
6236               /* A special frame used for nastyloops. */
6237               goto fail;
6238
6239             default:
6240               abort ();
6241             }
6242
6243           assert (p >= bufp->buffer && p <= pend);
6244
6245           if (d >= string1 && d <= end1)
6246             dend = end_match_1;
6247         }
6248       else
6249         break;   /* Matching at this starting point really fails.  */
6250     } /* for (;;) */
6251
6252   if (best_regs_set)
6253     goto restore_best_regs;
6254
6255   FREE_VARIABLES ();
6256
6257   return -1;                            /* Failure to match.  */
6258 }
6259 \f
6260 /* Subroutine definitions for re_match_2.  */
6261
6262 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6263    bytes; nonzero otherwise.  */
6264
6265 static int
6266 bcmp_translate (const_re_char *s1, const_re_char *s2, register ssize_t len,
6267                 RE_TRANSLATE_TYPE translate, const int target_multibyte)
6268 {
6269   register re_char *p1 = s1, *p2 = s2;
6270   re_char *p1_end = s1 + len;
6271   re_char *p2_end = s2 + len;
6272
6273   /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6274      different lengths, but relying on a single `len' would break this. -sm  */
6275   while (p1 < p1_end && p2 < p2_end)
6276     {
6277       int p1_charlen, p2_charlen;
6278       re_wchar_t p1_ch, p2_ch;
6279
6280       GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6281       GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
6282
6283       if (RE_TRANSLATE (translate, p1_ch)
6284           != RE_TRANSLATE (translate, p2_ch))
6285         return 1;
6286
6287       p1 += p1_charlen, p2 += p2_charlen;
6288     }
6289
6290   if (p1 != p1_end || p2 != p2_end)
6291     return 1;
6292
6293   return 0;
6294 }
6295 \f
6296 /* Entry points for GNU code.  */
6297
6298 /* re_compile_pattern is the GNU regular expression compiler: it
6299    compiles PATTERN (of length SIZE) and puts the result in BUFP.
6300    Returns 0 if the pattern was valid, otherwise an error string.
6301
6302    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6303    are set in BUFP on entry.
6304
6305    We call regex_compile to do the actual compilation.  */
6306
6307 const char *
6308 re_compile_pattern (const char *pattern, size_t length,
6309                     struct re_pattern_buffer *bufp)
6310 {
6311   reg_errcode_t ret;
6312
6313   /* GNU code is written to assume at least RE_NREGS registers will be set
6314      (and at least one extra will be -1).  */
6315   bufp->regs_allocated = REGS_UNALLOCATED;
6316
6317   /* And GNU code determines whether or not to get register information
6318      by passing null for the REGS argument to re_match, etc., not by
6319      setting no_sub.  */
6320   bufp->no_sub = 0;
6321
6322   ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
6323
6324   if (!ret)
6325     return NULL;
6326   return gettext (re_error_msgid[(int) ret]);
6327 }
6328 WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
6329 \f
6330 /* Entry points compatible with 4.2 BSD regex library.  We don't define
6331    them unless specifically requested.  */
6332
6333 #if defined _REGEX_RE_COMP || defined _LIBC
6334
/* BSD has one and only one pattern buffer: re_comp compiles into this
   file-scope buffer and re_exec searches with it.  */
static struct re_pattern_buffer re_comp_buf;
6337
6338 char *
6339 # ifdef _LIBC
6340 /* Make these definitions weak in libc, so POSIX programs can redefine
6341    these names if they don't use our functions, and still use
6342    regcomp/regexec below without link errors.  */
6343 weak_function
6344 # endif
6345 re_comp (const char *s)
6346 {
6347   reg_errcode_t ret;
6348
6349   if (!s)
6350     {
6351       if (!re_comp_buf.buffer)
6352         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6353         return (char *) gettext ("No previous regular expression");
6354       return 0;
6355     }
6356
6357   if (!re_comp_buf.buffer)
6358     {
6359       re_comp_buf.buffer = malloc (200);
6360       if (re_comp_buf.buffer == NULL)
6361         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6362         return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6363       re_comp_buf.allocated = 200;
6364
6365       re_comp_buf.fastmap = malloc (1 << BYTEWIDTH);
6366       if (re_comp_buf.fastmap == NULL)
6367         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6368         return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6369     }
6370
6371   /* Since `re_exec' always passes NULL for the `regs' argument, we
6372      don't need to initialize the pattern buffer fields which affect it.  */
6373
6374   ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6375
6376   if (!ret)
6377     return NULL;
6378
6379   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6380   return (char *) gettext (re_error_msgid[(int) ret]);
6381 }
6382
6383
6384 int
6385 # ifdef _LIBC
6386 weak_function
6387 # endif
6388 re_exec (const char *s)
6389 {
6390   const size_t len = strlen (s);
6391   return re_search (&re_comp_buf, s, len, 0, len, 0) >= 0;
6392 }
6393 #endif /* _REGEX_RE_COMP */
6394 \f
6395 /* POSIX.2 functions.  Don't define these for Emacs.  */
6396
6397 #ifndef emacs
6398
6399 /* regcomp takes a regular expression as a string and compiles it.
6400
6401    PREG is a regex_t *.  We do not expect any fields to be initialized,
6402    since POSIX says we shouldn't.  Thus, we set
6403
6404      `buffer' to the compiled pattern;
6405      `used' to the length of the compiled pattern;
6406      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6407        REG_EXTENDED bit in CFLAGS is set; otherwise, to
6408        RE_SYNTAX_POSIX_BASIC;
6409      `fastmap' to an allocated space for the fastmap;
6410      `fastmap_accurate' to zero;
6411      `re_nsub' to the number of subexpressions in PATTERN.
6412
6413    PATTERN is the address of the pattern string.
6414
6415    CFLAGS is a series of bits which affect compilation.
6416
6417      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6418      use POSIX basic syntax.
6419
6420      If REG_NEWLINE is set, then . and [^...] don't match newline.
6421      Also, regexec will try a match beginning after every newline.
6422
6423      If REG_ICASE is set, then we considers upper- and lowercase
6424      versions of letters to be equivalent when matching.
6425
6426      If REG_NOSUB is set, then when PREG is passed to regexec, that
6427      routine will report only success or failure, and nothing about the
6428      registers.
6429
6430    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
6431    the return codes and their meanings.)  */
6432
6433 reg_errcode_t
6434 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern,
6435          int cflags)
6436 {
6437   reg_errcode_t ret;
6438   reg_syntax_t syntax
6439     = (cflags & REG_EXTENDED) ?
6440       RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6441
6442   /* regex_compile will allocate the space for the compiled pattern.  */
6443   preg->buffer = 0;
6444   preg->allocated = 0;
6445   preg->used = 0;
6446
6447   /* Try to allocate space for the fastmap.  */
6448   preg->fastmap = malloc (1 << BYTEWIDTH);
6449
6450   if (cflags & REG_ICASE)
6451     {
6452       unsigned i;
6453
6454       preg->translate = malloc (CHAR_SET_SIZE * sizeof *preg->translate);
6455       if (preg->translate == NULL)
6456         return (int) REG_ESPACE;
6457
6458       /* Map uppercase characters to corresponding lowercase ones.  */
6459       for (i = 0; i < CHAR_SET_SIZE; i++)
6460         preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
6461     }
6462   else
6463     preg->translate = NULL;
6464
6465   /* If REG_NEWLINE is set, newlines are treated differently.  */
6466   if (cflags & REG_NEWLINE)
6467     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
6468       syntax &= ~RE_DOT_NEWLINE;
6469       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6470     }
6471   else
6472     syntax |= RE_NO_NEWLINE_ANCHOR;
6473
6474   preg->no_sub = !!(cflags & REG_NOSUB);
6475
6476   /* POSIX says a null character in the pattern terminates it, so we
6477      can use strlen here in compiling the pattern.  */
6478   ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
6479
6480   /* POSIX doesn't distinguish between an unmatched open-group and an
6481      unmatched close-group: both are REG_EPAREN.  */
6482   if (ret == REG_ERPAREN)
6483     ret = REG_EPAREN;
6484
6485   if (ret == REG_NOERROR && preg->fastmap)
6486     { /* Compute the fastmap now, since regexec cannot modify the pattern
6487          buffer.  */
6488       re_compile_fastmap (preg);
6489       if (preg->can_be_null)
6490         { /* The fastmap can't be used anyway.  */
6491           free (preg->fastmap);
6492           preg->fastmap = NULL;
6493         }
6494     }
6495   return ret;
6496 }
6497 WEAK_ALIAS (__regcomp, regcomp)
6498
6499
6500 /* regexec searches for a given pattern, specified by PREG, in the
6501    string STRING.
6502
6503    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6504    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
6505    least NMATCH elements, and we set them to the offsets of the
6506    corresponding matched substrings.
6507
6508    EFLAGS specifies `execution flags' which affect matching: if
6509    REG_NOTBOL is set, then ^ does not match at the beginning of the
6510    string; if REG_NOTEOL is set, then $ does not match at the end.
6511
6512    We return 0 if we find a match and REG_NOMATCH if not.  */
6513
6514 reg_errcode_t
6515 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
6516          size_t nmatch, regmatch_t pmatch[_Restrict_arr_], int eflags)
6517 {
6518   regoff_t ret;
6519   struct re_registers regs;
6520   regex_t private_preg;
6521   size_t len = strlen (string);
6522   boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
6523
6524   private_preg = *preg;
6525
6526   private_preg.not_bol = !!(eflags & REG_NOTBOL);
6527   private_preg.not_eol = !!(eflags & REG_NOTEOL);
6528
6529   /* The user has told us exactly how many registers to return
6530      information about, via `nmatch'.  We have to pass that on to the
6531      matching routines.  */
6532   private_preg.regs_allocated = REGS_FIXED;
6533
6534   if (want_reg_info)
6535     {
6536       regs.num_regs = nmatch;
6537       regs.start = TALLOC (nmatch * 2, regoff_t);
6538       if (regs.start == NULL)
6539         return REG_NOMATCH;
6540       regs.end = regs.start + nmatch;
6541     }
6542
6543   /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6544      pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6545      was a little bit longer but still only matching the real part.
6546      This works because the `endline' will check for a '\n' and will find a
6547      '\0', correctly deciding that this is not the end of a line.
6548      But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6549      a convenient '\0' there.  For all we know, the string could be preceded
6550      by '\n' which would throw things off.  */
6551
6552   /* Perform the searching operation.  */
6553   ret = re_search (&private_preg, string, len,
6554                    /* start: */ 0, /* range: */ len,
6555                    want_reg_info ? &regs : 0);
6556
6557   /* Copy the register information to the POSIX structure.  */
6558   if (want_reg_info)
6559     {
6560       if (ret >= 0)
6561         {
6562           unsigned r;
6563
6564           for (r = 0; r < nmatch; r++)
6565             {
6566               pmatch[r].rm_so = regs.start[r];
6567               pmatch[r].rm_eo = regs.end[r];
6568             }
6569         }
6570
6571       /* If we needed the temporary register info, free the space now.  */
6572       free (regs.start);
6573     }
6574
6575   /* We want zero return to mean success, unlike `re_search'.  */
6576   return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
6577 }
6578 WEAK_ALIAS (__regexec, regexec)
6579
6580
6581 /* Returns a message corresponding to an error code, ERR_CODE, returned
6582    from either regcomp or regexec.   We don't use PREG here.
6583
6584    ERR_CODE was previously called ERRCODE, but that name causes an
6585    error with msvc8 compiler.  */
6586
6587 size_t
6588 regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
6589 {
6590   const char *msg;
6591   size_t msg_size;
6592
6593   if (err_code < 0
6594       || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6595     /* Only error codes returned by the rest of the code should be passed
6596        to this routine.  If we are given anything else, or if other regex
6597        code generates an invalid error code, then the program has a bug.
6598        Dump core so we can fix it.  */
6599     abort ();
6600
6601   msg = gettext (re_error_msgid[err_code]);
6602
6603   msg_size = strlen (msg) + 1; /* Includes the null.  */
6604
6605   if (errbuf_size != 0)
6606     {
6607       if (msg_size > errbuf_size)
6608         {
6609           memcpy (errbuf, msg, errbuf_size - 1);
6610           errbuf[errbuf_size - 1] = 0;
6611         }
6612       else
6613         strcpy (errbuf, msg);
6614     }
6615
6616   return msg_size;
6617 }
6618 WEAK_ALIAS (__regerror, regerror)
6619
6620
6621 /* Free dynamically allocated space used by PREG.  */
6622
6623 void
6624 regfree (regex_t *preg)
6625 {
6626   free (preg->buffer);
6627   preg->buffer = NULL;
6628
6629   preg->allocated = 0;
6630   preg->used = 0;
6631
6632   free (preg->fastmap);
6633   preg->fastmap = NULL;
6634   preg->fastmap_accurate = 0;
6635
6636   free (preg->translate);
6637   preg->translate = NULL;
6638 }
6639 WEAK_ALIAS (__regfree, regfree)
6640
6641 #endif /* not emacs  */