src/regex.c

   1 /* Extended regular expression matching and search library, version
   2    0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
   3    internationalization features.)
   4
   5    Copyright (C) 1993-2014 Free Software Foundation, Inc.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* TODO:
  21    - structure the opcode space into opcode+flag.
  22    - merge with glibc's regex.[ch].
  23    - replace (succeed_n + jump_n + set_number_at) with something that doesn't
  24      need to modify the compiled regexp so that re_match can be reentrant.
  25    - get rid of on_failure_jump_smart by doing the optimization in re_comp
  26      rather than at run-time, so that re_match can be reentrant.
  27 */
  28
  29 /* AIX requires this to be the first thing in the file.  */
  30 #if defined _AIX && !defined REGEX_MALLOC
  31   #pragma alloca
  32 #endif
  33
  34 /* Ignore some GCC warnings for now.  This section should go away
  35    once the Emacs and Gnulib regex code is merged.  */
  36 #if 4 < __GNUC__ + (5 <= __GNUC_MINOR__) || defined __clang__
  37 # pragma GCC diagnostic ignored "-Wstrict-overflow"
  38 # ifndef emacs
  39 #  pragma GCC diagnostic ignored "-Wunused-function"
  40 #  pragma GCC diagnostic ignored "-Wunused-macros"
  41 #  pragma GCC diagnostic ignored "-Wunused-result"
  42 #  pragma GCC diagnostic ignored "-Wunused-variable"
  43 # endif
  44 #endif
  45
  46 #if 4 < __GNUC__ + (6 <= __GNUC_MINOR__) && ! defined __clang__
  47 # pragma GCC diagnostic ignored "-Wunused-but-set-variable"
  48 #endif
  49
  50 #include <config.h>
  51
  52 #include <stddef.h>
  53
  54 #ifdef emacs
  55 /* We need this for `regex.h', and perhaps for the Emacs include files.  */
  56 # include <sys/types.h>
  57 #endif
  58
  59 /* Whether to use ISO C Amendment 1 wide char functions.
  60    Those should not be used for Emacs since it uses its own.  */
  61 #if defined _LIBC
  62 #define WIDE_CHAR_SUPPORT 1
  63 #else
  64 #define WIDE_CHAR_SUPPORT \
  65         (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
  66 #endif
  67
  68 /* For platforms which support the ISO C amendment 1 functionality we
  69    support user defined character classes.  */
  70 #if WIDE_CHAR_SUPPORT
  71 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
  72 # include <wchar.h>
  73 # include <wctype.h>
  74 #endif
  75
  76 #ifdef _LIBC
  77 /* We have to keep the namespace clean.  */
  78 # define regfree(preg) __regfree (preg)
  79 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
  80 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
  81 # define regerror(err_code, preg, errbuf, errbuf_size) \
  82         __regerror (err_code, preg, errbuf, errbuf_size)
  83 # define re_set_registers(bu, re, nu, st, en) \
  84         __re_set_registers (bu, re, nu, st, en)
  85 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
  86         __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
  87 # define re_match(bufp, string, size, pos, regs) \
  88         __re_match (bufp, string, size, pos, regs)
  89 # define re_search(bufp, string, size, startpos, range, regs) \
  90         __re_search (bufp, string, size, startpos, range, regs)
  91 # define re_compile_pattern(pattern, length, bufp) \
  92         __re_compile_pattern (pattern, length, bufp)
  93 # define re_set_syntax(syntax) __re_set_syntax (syntax)
  94 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
  95         __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
  96 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
  97
  98 /* Make sure we call libc's function even if the user overrides them.  */
  99 # define btowc __btowc
 100 # define iswctype __iswctype
 101 # define wctype __wctype
 102
 103 # define WEAK_ALIAS(a,b) weak_alias (a, b)
 104
 105 /* We are also using some library internals.  */
 106 # include <locale/localeinfo.h>
 107 # include <locale/elem-hash.h>
 108 # include <langinfo.h>
 109 #else
 110 # define WEAK_ALIAS(a,b)
 111 #endif
 112
 113 /* This is for other GNU distributions with internationalized messages.  */
 114 #if HAVE_LIBINTL_H || defined _LIBC
 115 # include <libintl.h>
 116 #else
 117 # define gettext(msgid) (msgid)
 118 #endif
 119
 120 #ifndef gettext_noop
 121 /* This define is so xgettext can find the internationalizable
 122    strings.  */
 123 # define gettext_noop(String) String
 124 #endif
 125
 126 /* The `emacs' switch turns on certain matching commands
 127    that make sense only in Emacs. */
 128 #ifdef emacs
 129
 130 # include "lisp.h"
 131 # include "character.h"
 132 # include "buffer.h"
 133
 134 # include "syntax.h"
 135 # include "category.h"
 136
 137 /* Make syntax table lookup grant data in gl_state.  */
 138 # define SYNTAX(c) syntax_property (c, 1)
 139
 140 # ifdef malloc
 141 #  undef malloc
 142 # endif
 143 # define malloc xmalloc
 144 # ifdef realloc
 145 #  undef realloc
 146 # endif
 147 # define realloc xrealloc
 148 # ifdef free
 149 #  undef free
 150 # endif
 151 # define free xfree
 152
 153 /* Converts the pointer to the char to BEG-based offset from the start.  */
 154 # define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
 155 # define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
 156
 157 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
 158 # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
 159 # define RE_STRING_CHAR(p, multibyte) \
 160   (multibyte ? (STRING_CHAR (p)) : (*(p)))
 161 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
 162   (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
 163
 164 # define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
 165
 166 # define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
 167
 168 /* Set C to the (possibly converted to multibyte) character before P.
 169    P points into a string which is the virtual concatenation of STR1
 170    (which ends at END1) and STR2 (which ends at END2).  */
 171 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)                     \
 172   do {                                                                       \
 173     if (target_multibyte)                                                    \
 174       {                                                                      \
 175         re_char *dtemp = (p) == (str2) ? (end1) : (p);                       \
 176         re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
 177         while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));                   \
 178         c = STRING_CHAR (dtemp);                                             \
 179       }                                                                      \
 180     else                                                                     \
 181       {                                                                      \
 182         (c = ((p) == (str2) ? (end1) : (p))[-1]);                            \
 183         (c) = RE_CHAR_TO_MULTIBYTE (c);                                      \
 184       }                                                                      \
 185   } while (0)
 186
 /* Set C to the (possibly converted to multibyte) character at P, and
    set LEN to the byte length of that character.

    When `target_multibyte' (a variable that must be in scope where the
    macro is used) is nonzero, the character and its length come from
    STRING_CHAR_AND_LENGTH; otherwise the single byte at P is read, LEN
    becomes 1, and the byte is passed through RE_CHAR_TO_MULTIBYTE.

    P and LEN are parenthesized in the expansion so that expression
    arguments such as `d + 1' are dereferenced/assigned as a whole.  */
 # define GET_CHAR_AFTER(c, p, len)              \
   do {                                          \
     if (target_multibyte)                       \
       (c) = STRING_CHAR_AND_LENGTH (p, len);    \
     else                                        \
       {                                         \
         (c) = *(p);                             \
         (len) = 1;                              \
         (c) = RE_CHAR_TO_MULTIBYTE (c);         \
       }                                         \
    } while (0)
 200
 201 #else  /* not emacs */
 202
 203 /* If we are not linking with Emacs proper,
 204    we can't use the relocating allocator
 205    even if config.h says that we can.  */
 206 # undef REL_ALLOC
 207
 208 # include <unistd.h>
 209
 210 /* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
 211
 /* Allocate SIZE bytes with malloc and return the result.  If malloc
    returns null for a nonzero SIZE, write "virtual memory exhausted"
    to file descriptor 2 and exit with status 1.  A null result for a
    zero SIZE is returned to the caller unchanged.  */
 static void *
 xmalloc (size_t size)
 {
   static const char message[] = "virtual memory exhausted\n";
   void *result = malloc (size);
   int out_of_memory = result == NULL && size != 0;

   if (out_of_memory)
     {
       /* sizeof counts the terminating NUL, which is not written.  */
       write (2, message, sizeof message - 1);
       exit (1);
     }

   return result;
 }
 223
 /* Resize BLOCK to SIZE bytes and return the (possibly moved) block.
    A null BLOCK is treated as a request for a fresh allocation.  If
    the allocator returns null for a nonzero SIZE, write "virtual
    memory exhausted" to file descriptor 2 and exit with status 1.  */
 static void *
 xrealloc (void *block, size_t size)
 {
   static const char message[] = "virtual memory exhausted\n";

   /* Some reallocs do not behave like malloc when handed a null
      pointer, so that case is routed to malloc explicitly.  */
   void *result = block ? realloc (block, size) : malloc (size);
   int out_of_memory = result == NULL && size != 0;

   if (out_of_memory)
     {
       /* sizeof counts the terminating NUL, which is not written.  */
       write (2, message, sizeof message - 1);
       exit (1);
     }

   return result;
 }
 241
 242 # ifdef malloc
 243 #  undef malloc
 244 # endif
 245 # define malloc xmalloc
 246 # ifdef realloc
 247 #  undef realloc
 248 # endif
 249 # define realloc xrealloc
 250
 251 # include <stdbool.h>
 252 # include <string.h>
 253
 254 /* Define the syntax stuff for \<, \>, etc.  */
 255
 256 /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
 257 enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
 258
 259 /* Dummy macros for non-Emacs environments.  */
 260 # define MAX_MULTIBYTE_LENGTH 1
 261 # define RE_MULTIBYTE_P(x) 0
 262 # define RE_TARGET_MULTIBYTE_P(x) 0
 263 # define WORD_BOUNDARY_P(c1, c2) (0)
 264 # define BYTES_BY_CHAR_HEAD(p) (1)
 265 # define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
 266 # define STRING_CHAR(p) (*(p))
 267 # define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
 268 # define CHAR_STRING(c, s) (*(s) = (c), 1)
 269 # define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
 270 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
 271 # define RE_CHAR_TO_MULTIBYTE(c) (c)
 272 # define RE_CHAR_TO_UNIBYTE(c) (c)
 /* Single-byte versions of the character accessors.

    GET_CHAR_BEFORE_2 sets C to the byte just before P; when P is the
    very start of STR2 that byte is the last one of the first string,
    i.e. the one just before END1.  STR1 and END2 are accepted but not
    used here.

    GET_CHAR_AFTER sets C to the byte at P and LEN to 1.

    Every argument is parenthesized in the expansion so that expression
    arguments such as `d + 1' are handled as a whole.  */
 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
   ((c) = ((p) == (str2) ? (end1)[-1] : (p)[-1]))
 # define GET_CHAR_AFTER(c, p, len)      \
   ((c) = *(p), (len) = 1)
 277 # define CHAR_BYTE8_P(c) (0)
 278 # define CHAR_LEADING_CODE(c) (c)
 279
 280 #endif /* not emacs */
 281
 282 #ifndef RE_TRANSLATE
 283 # define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
 284 # define RE_TRANSLATE_P(TBL) (TBL)
 285 #endif
 286 \f
 287 /* Get the interface, including the syntax bits.  */
 288 #include "regex.h"
 289
 290 /* isalpha etc. are used for the character classes.  */
 291 #include <ctype.h>
 292
 293 #ifdef emacs
 294
 295 /* 1 if C is an ASCII character.  */
 296 # define IS_REAL_ASCII(c) ((c) < 0200)
 297
 298 /* 1 if C is a unibyte character.  */
 299 # define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
 300
 301 /* The Emacs definitions should not be directly affected by locales.  */
 302
 303 /* In Emacs, these are only used for single-byte characters.  */
 304 # define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
 305 # define ISCNTRL(c) ((c) < ' ')
 306 # define ISXDIGIT(c) (((c) >= '0' && (c) <= '9')                \
 307                      || ((c) >= 'a' && (c) <= 'f')      \
 308                      || ((c) >= 'A' && (c) <= 'F'))
 309
 310 /* This is only used for single-byte characters.  */
 311 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
 312
 313 /* The rest must handle multibyte characters.  */
 314
 315 # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)                             \
 316                     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237)        \
 317                     : 1)
 318
 319 # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)                             \
 320                     ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)       \
 321                     : 1)
 322
 323 # define ISALNUM(c) (IS_REAL_ASCII (c)                  \
 324                     ? (((c) >= 'a' && (c) <= 'z')       \
 325                        || ((c) >= 'A' && (c) <= 'Z')    \
 326                        || ((c) >= '0' && (c) <= '9'))   \
 327                     : SYNTAX (c) == Sword)
 328
 329 # define ISALPHA(c) (IS_REAL_ASCII (c)                  \
 330                     ? (((c) >= 'a' && (c) <= 'z')       \
 331                        || ((c) >= 'A' && (c) <= 'Z'))   \
 332                     : SYNTAX (c) == Sword)
 333
 334 # define ISLOWER(c) lowercasep (c)
 335
 336 # define ISPUNCT(c) (IS_REAL_ASCII (c)                          \
 337                     ? ((c) > ' ' && (c) < 0177                  \
 338                        && !(((c) >= 'a' && (c) <= 'z')          \
 339                             || ((c) >= 'A' && (c) <= 'Z')       \
 340                             || ((c) >= '0' && (c) <= '9')))     \
 341                     : SYNTAX (c) != Sword)
 342
 343 # define ISSPACE(c) (SYNTAX (c) == Swhitespace)
 344
 345 # define ISUPPER(c) uppercasep (c)
 346
 347 # define ISWORD(c) (SYNTAX (c) == Sword)
 348
 349 #else /* not emacs */
 350
 351 /* 1 if C is an ASCII character.  */
 352 # define IS_REAL_ASCII(c) ((c) < 0200)
 353
 354 /* This distinction is not meaningful, except in Emacs.  */
 355 # define ISUNIBYTE(c) 1
 356
 357 # ifdef isblank
 358 #  define ISBLANK(c) isblank (c)
 359 # else
 360 #  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
 361 # endif
 362 # ifdef isgraph
 363 #  define ISGRAPH(c) isgraph (c)
 364 # else
 365 #  define ISGRAPH(c) (isprint (c) && !isspace (c))
 366 # endif
 367
 368 /* Solaris defines ISPRINT so we must undefine it first.  */
 369 # undef ISPRINT
 370 # define ISPRINT(c) isprint (c)
 371 # define ISDIGIT(c) isdigit (c)
 372 # define ISALNUM(c) isalnum (c)
 373 # define ISALPHA(c) isalpha (c)
 374 # define ISCNTRL(c) iscntrl (c)
 375 # define ISLOWER(c) islower (c)
 376 # define ISPUNCT(c) ispunct (c)
 377 # define ISSPACE(c) isspace (c)
 378 # define ISUPPER(c) isupper (c)
 379 # define ISXDIGIT(c) isxdigit (c)
 380
 381 # define ISWORD(c) ISALPHA (c)
 382
 383 # ifdef _tolower
 384 #  define TOLOWER(c) _tolower (c)
 385 # else
 386 #  define TOLOWER(c) tolower (c)
 387 # endif
 388
 389 /* How many characters in the character set.  */
 390 # define CHAR_SET_SIZE 256
 391
 392 # ifdef SYNTAX_TABLE
 393
 394 extern char *re_syntax_table;
 395
 396 # else /* not SYNTAX_TABLE */
 397
 398 static char re_syntax_table[CHAR_SET_SIZE];
 399
 400 static void
 401 init_syntax_once (void)
 402 {
 403    register int c;
 404    static int done = 0;
 405
 406    if (done)
 407      return;
 408
 409    memset (re_syntax_table, 0, sizeof re_syntax_table);
 410
 411    for (c = 0; c < CHAR_SET_SIZE; ++c)
 412      if (ISALNUM (c))
 413         re_syntax_table[c] = Sword;
 414
 415    re_syntax_table['_'] = Ssymbol;
 416
 417    done = 1;
 418 }
 419
 420 # endif /* not SYNTAX_TABLE */
 421
 422 # define SYNTAX(c) re_syntax_table[(c)]
 423
 424 #endif /* not emacs */
 425 \f
 426 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
 427 \f
 428 /* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
 429    use `alloca' instead of `malloc'.  This is because using malloc in
 430    re_search* or re_match* could cause memory leaks when C-g is used in
 431    Emacs; also, malloc is slower and causes storage fragmentation.  On
 432    the other hand, malloc is more portable, and easier to debug.
 433
 434    Because we sometimes use alloca, some routines have to be macros,
 435    not functions -- `alloca'-allocated space disappears at the end of the
 436    function it is called in.  */
 437
 438 #ifdef REGEX_MALLOC
 439
 440 # define REGEX_ALLOCATE malloc
 441 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
 442 # define REGEX_FREE free
 443
 444 #else /* not REGEX_MALLOC  */
 445
 446 /* Emacs already defines alloca, sometimes.  */
 447 # ifndef alloca
 448
 449 /* Make alloca work the best possible way.  */
 450 #  ifdef __GNUC__
 451 #   define alloca __builtin_alloca
 452 #  else /* not __GNUC__ */
 453 #   ifdef HAVE_ALLOCA_H
 454 #    include <alloca.h>
 455 #   endif /* HAVE_ALLOCA_H */
 456 #  endif /* not __GNUC__ */
 457
 458 # endif /* not alloca */
 459
 460 # define REGEX_ALLOCATE alloca
 461
 462 /* Assumes a `char *destination' variable.  */
 463 # define REGEX_REALLOCATE(source, osize, nsize)                         \
 464   (destination = alloca (nsize),                                        \
 465    memcpy (destination, source, osize))
 466
 467 /* No need to do anything to free, after alloca.  */
 468 # define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
 469
 470 #endif /* not REGEX_MALLOC */
 471
 472 /* Define how to allocate the failure stack.  */
 473
 474 #if defined REL_ALLOC && defined REGEX_MALLOC
 475
 476 # define REGEX_ALLOCATE_STACK(size)                             \
 477   r_alloc (&failure_stack_ptr, (size))
 478 # define REGEX_REALLOCATE_STACK(source, osize, nsize)           \
 479   r_re_alloc (&failure_stack_ptr, (nsize))
 480 # define REGEX_FREE_STACK(ptr)                                  \
 481   r_alloc_free (&failure_stack_ptr)
 482
 483 #else /* not using relocating allocator */
 484
 485 # ifdef REGEX_MALLOC
 486
 487 #  define REGEX_ALLOCATE_STACK malloc
 488 #  define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
 489 #  define REGEX_FREE_STACK free
 490
 491 # else /* not REGEX_MALLOC */
 492
 493 #  define REGEX_ALLOCATE_STACK alloca
 494
 495 #  define REGEX_REALLOCATE_STACK(source, osize, nsize)                  \
 496    REGEX_REALLOCATE (source, osize, nsize)
 497 /* No need to explicitly free anything.  */
 498 #  define REGEX_FREE_STACK(arg) ((void)0)
 499
 500 # endif /* not REGEX_MALLOC */
 501 #endif /* not using relocating allocator */
 502
 503
 504 /* True if `size1' is nonzero and PTR is pointing anywhere inside
 505    `string1' or just past its end.  This works if PTR is NULL, which is
 506    a good thing.  */
 507 #define FIRST_STRING_P(ptr)                                     \
 508   (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
 509
 510 /* (Re)Allocate N items of type T using malloc, or fail.  */
 511 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
 512 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
 513 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
 514
 515 #define BYTEWIDTH 8 /* In bits.  */
 516
 517 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
 518
 519 #undef MAX
 520 #undef MIN
 521 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 522 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 523
 524 /* Type of source-pattern and string chars.  */
 525 #ifdef _MSC_VER
 526 typedef unsigned char re_char;
 527 typedef const re_char const_re_char;
 528 #else
 529 typedef const unsigned char re_char;
 530 typedef re_char const_re_char;
 531 #endif
 532
 533 typedef char boolean;
 534
 535 static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
 536                                      re_char *string1, size_t size1,
 537                                      re_char *string2, size_t size2,
 538                                      ssize_t pos,
 539                                      struct re_registers *regs,
 540                                      ssize_t stop);
 541 \f
 542 /* These are the command codes that appear in compiled regular
 543    expressions.  Some opcodes are followed by argument bytes.  A
 544    command code can specify any interpretation whatsoever for its
 545    arguments.  Zero bytes may appear in the compiled regular expression.  */
 546
 typedef enum
 {
   no_op = 0,

   /* Succeed immediately; no more backtracking is done.  */
   succeed,

   /* Operands: one byte giving N, then N literal bytes.  */
   exactn,

   /* Match any (more or less) character.  */
   anychar,

   /* Match any one character belonging to the specified set.

      Operands: the first following byte is the number of bitmap
      bytes; then come the bytes of a bitmap saying which characters
      are in the set.  Bits in each byte are ordered low-bit-first.  A
      character is in the set if its bit is 1.  A character too large
      to have a bit in the map is automatically not in the set.

      If the length byte has the 0x80 bit set, the bitmap is followed
      by a range table:
          2 bytes of flags for character sets (low 8 bits, high 8 bits)
              See RANGE_TABLE_WORK_BITS below.
          2 bytes, the number of pairs that follow (up to 32767)
          pairs, each 2 multibyte characters,
              each multibyte character represented as 3 bytes.  */
   charset,

   /* Takes the same operands as charset, but matches any character
      that is NOT one of those specified.  */
   charset_not,

   /* Begin remembering the matched text so that it can be stored in
      a register.  Operand: one byte with the register number, in the
      range 0 to one less than the pattern buffer's re_nsub field.  */
   start_memory,

   /* Stop remembering the matched text and store it in a memory
      register.  Operand: one byte with the register number, in the
      range 0 to one less than `re_nsub' in the pattern buffer.  */
   stop_memory,

   /* Match a duplicate of something remembered.  Operand: one byte
      containing the register number.  */
   duplicate,

   /* Fail unless at beginning of line.  */
   begline,

   /* Fail unless at end of line.  */
   endline,

   /* Succeed if at beginning of buffer (if emacs) or at beginning of
      the string to be matched (if not).  */
   begbuf,

   /* The analogous test for end of buffer/string.  */
   endbuf,

   /* Operand: two-byte relative address to which to jump.  */
   jump,

   /* Operand: two-byte relative address of the place to resume at in
      case of failure.  */
   on_failure_jump,

   /* Like on_failure_jump, but when executed it pushes a placeholder
      instead of the current string position.  */
   on_failure_keep_string_jump,

   /* Just like `on_failure_jump', except that it checks that we do
      not get stuck in an infinite loop (matching an empty string
      indefinitely).  */
   on_failure_jump_loop,

   /* Just like `on_failure_jump_loop', except that it checks for a
      different kind of loop (the kind that shows up with non-greedy
      operators).  This operation has to be immediately preceded by a
      `no_op'.  */
   on_failure_jump_nastyloop,

   /* A smart `on_failure_jump' used for greedy * and + operators.
      It analyzes the loop before which it is put; if that loop does
      not require backtracking, it changes itself to
      `on_failure_keep_string_jump' and short-circuits the loop,
      otherwise it just defaults to changing itself into
      `on_failure_jump'.  It assumes that it is pointing to just past
      a `jump'.  */
   on_failure_jump_smart,

   /* Operands: two-byte relative address, then two-byte number N.
      After matching N times, jump to the address upon failure.
      Does not work if N starts at 0: use on_failure_jump_loop
      instead.  */
   succeed_n,

   /* Operands: two-byte relative address, then two-byte number N.
      Jump to the address N times, then fail.  */
   jump_n,

   /* Set the number at the following two-byte relative address to
      the subsequent two-byte number.  The address *includes* the two
      bytes of number.  */
   set_number_at,

   /* Succeed if at word beginning.  */
   wordbeg,

   /* Succeed if at word end.  */
   wordend,

   /* Succeed if at a word boundary.  */
   wordbound,

   /* Succeed if not at a word boundary.  */
   notwordbound,

   /* Succeed if at symbol beginning.  */
   symbeg,

   /* Succeed if at symbol end.  */
   symend,

   /* Match any character whose syntax is the one specified.
      Operand: one byte containing a syntax code, e.g., Sword.  */
   syntaxspec,

   /* Match any character whose syntax is not the one specified.  */
   notsyntaxspec

 #ifdef emacs
   ,
   /* Succeed if before point.  */
   before_dot,

   /* Succeed if at point.  */
   at_dot,

   /* Succeed if after point.  */
   after_dot,

   /* Match any character whose category-set contains the specified
      category.  Operand: one byte containing a category code
      (mnemonic ASCII character).  */
   categoryspec,

   /* Match any character whose category-set does not contain the
      specified category.  Operand: one byte containing the category
      code (mnemonic ASCII character).  */
   notcategoryspec
 #endif /* emacs */
 } re_opcode_t;
 686 \f
 687 /* Common operations on the compiled pattern.  */
 688
 /* Store NUMBER in the two contiguous bytes starting at DESTINATION:
    the low-order eight bits of NUMBER go to DESTINATION[0] and
    NUMBER >> 8 goes to DESTINATION[1].  Each argument is evaluated
    twice.  */

 #define STORE_NUMBER(destination, number)                               \
   do                                                                    \
     {                                                                   \
       (destination)[0] = (number) & 0xFF;                               \
       (destination)[1] = (number) >> 8;                                 \
     }                                                                   \
   while (0)

 /* Like STORE_NUMBER, but afterwards advance DESTINATION by two so
    that it addresses the byte following the stored number.
    DESTINATION must therefore be an lvalue.  */

 #define STORE_NUMBER_AND_INCR(destination, number)                      \
   do                                                                    \
     {                                                                   \
       STORE_NUMBER (destination, number);                               \
       (destination) += 2;                                               \
     }                                                                   \
   while (0)
 706
 707 /* Put into DESTINATION a number stored in two contiguous bytes starting
 708    at SOURCE.  */
 709
 710 #define EXTRACT_NUMBER(destination, source)                             \
 711   ((destination) = extract_number (source))
 712
 713 static int
 714 extract_number (re_char *source)
 715 {
 716   unsigned leading_byte = SIGN_EXTEND_CHAR (source[1]);
 717   return (leading_byte << 8) + source[0];
 718 }
 719
 720 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
 721    SOURCE must be an lvalue.  */
 722
 723 #define EXTRACT_NUMBER_AND_INCR(destination, source)                    \
 724   ((destination) = extract_number_and_incr (&source))
 725
 726 static int
 727 extract_number_and_incr (re_char **source)
 728 {
 729   int num = extract_number (*source);
 730   *source += 2;
 731   return num;
 732 }
 733 \f
 /* Store the multibyte character CHARACTER in the three contiguous
    bytes starting at DESTINATION, lowest-order byte first, and then
    advance DESTINATION by three so that it addresses the byte after
    the stored character.  DESTINATION must therefore be an lvalue.  */

 #define STORE_CHARACTER_AND_INCR(destination, character)        \
   do                                                            \
     {                                                           \
       (destination)[0] = (character) & 0xFF;                    \
       (destination)[1] = ((character) >> 8) & 0xFF;             \
       (destination)[2] = (character) >> 16;                     \
       (destination) += 3;                                       \
     }                                                           \
   while (0)

 /* Set DESTINATION to the character stored in the three contiguous
    bytes starting at SOURCE, lowest-order byte first.  */

 #define EXTRACT_CHARACTER(destination, source)  \
   do                                            \
     {                                           \
       (destination) = ((source)[0]              \
                        | ((source)[1] << 8)     \
                        | ((source)[2] << 16));  \
     }                                           \
   while (0)
 755
 756
 757 /* Macros for charset. */
 758
 759 /* Size of bitmap of charset P in bytes.  P is a start of charset,
 760    i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
 761 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
 762
 763 /* Nonzero if charset P has range table.  */
 764 #define CHARSET_RANGE_TABLE_EXISTS_P(p)  ((p)[1] & 0x80)
 765
 766 /* Return the address of the range table of charset P -- not the start
 767    of the table's pairs, but the place just before them where the number
 768    of ranges is stored.  `4 +' means to skip re_opcode_t, the size of the
 769    bitmap, and the 2 bytes of flags at the start of the range table.  */
 770 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
 771
 772 #ifdef emacs
 773 /* Extract the bit flags that start a range table.  */
 774 #define CHARSET_RANGE_TABLE_BITS(p)             \
 775   ((p)[2 + CHARSET_BITMAP_SIZE (p)]             \
 776    + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
 777 #endif
 778
 779 /* Return the address of the end of RANGE_TABLE.  COUNT is the number
 780    of ranges (each a (start, end) pair) in RANGE_TABLE.  `* 2' accounts
 781    for the start and the end of each range; `* 3' is the size in bytes
 782    of each start and each end.  */
 783 #define CHARSET_RANGE_TABLE_END(range_table, count)     \
 784   ((range_table) + (count) * 2 * 3)
 785
 786 /* Test if C is in RANGE_TABLE.  The flag NOT is negated if C is in it.
 787    COUNT is the number of ranges in RANGE_TABLE.  */
 788 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count)      \
 789   do                                                                    \
 790     {                                                                   \
 791       re_wchar_t range_start, range_end;                                \
 792       re_char *rtp;                                                     \
 793       re_char *range_table_end                                          \
 794         = CHARSET_RANGE_TABLE_END ((range_table), (count));             \
 795                                                                         \
 796       for (rtp = (range_table); rtp < range_table_end; rtp += 2 * 3)    \
 797         {                                                               \
 798           EXTRACT_CHARACTER (range_start, rtp);                         \
 799           EXTRACT_CHARACTER (range_end, rtp + 3);                       \
 800                                                                         \
 801           if (range_start <= (c) && (c) <= range_end)                   \
 802             {                                                           \
 803               (not) = !(not);                                           \
 804               break;                                                    \
 805             }                                                           \
 806         }                                                               \
 807     }                                                                   \
 808   while (0)
 809
 810 /* Test if C is in range table of CHARSET.  The flag NOT is negated if
 811    C is listed in it.  */
 812 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset)                     \
 813   do                                                                    \
 814     {                                                                   \
 815       /* Number of ranges in range table. */                            \
 816       int count;                                                        \
 817       re_char *range_table = CHARSET_RANGE_TABLE (charset);             \
 818                                                                         \
 819       EXTRACT_NUMBER_AND_INCR (count, range_table);                     \
 820       CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count);  \
 821     }                                                                   \
 822   while (0)
 823 \f
 824 /* If DEBUG is defined, Regex prints many voluminous messages about what
 825    it is doing (if the variable `debug' is nonzero).  If linked with the
 826    main program in `iregex.c', you can enter patterns and strings
 827    interactively.  And if linked with the main program in `main.c' and
 828    the other test files, you can run the already-written tests.  */
 829
 830 #ifdef DEBUG
 831
 832 /* We use standard I/O for debugging.  */
 833 # include <stdio.h>
 834
 835 /* It is useful to test things that ``must'' be true when debugging.  */
 836 # include <assert.h>
 837
 838 static int debug = -100000;
 839
 840 # define DEBUG_STATEMENT(e) e
 841 # define DEBUG_PRINT(...) if (debug > 0) printf (__VA_ARGS__)
 842 # define DEBUG_COMPILES_ARGUMENTS
 843 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                          \
 844   if (debug > 0) print_partial_compiled_pattern (s, e)
 845 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                 \
 846   if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
 847
 848
 849 /* Print the fastmap in human-readable form.  */
 850
 851 static void
 852 print_fastmap (char *fastmap)
 853 {
 854   unsigned was_a_range = 0;
 855   unsigned i = 0;
 856
 857   while (i < (1 << BYTEWIDTH))
 858     {
 859       if (fastmap[i++])
 860         {
 861           was_a_range = 0;
 862           putchar (i - 1);
 863           while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
 864             {
 865               was_a_range = 1;
 866               i++;
 867             }
 868           if (was_a_range)
 869             {
 870               printf ("-");
 871               putchar (i - 1);
 872             }
 873         }
 874     }
 875   putchar ('\n');
 876 }
 877
 878
 879 /* Print a compiled pattern string in human-readable form, starting at
 880    the START pointer into it and ending just before the pointer END.  */
 881
 882 static void
 883 print_partial_compiled_pattern (re_char *start, re_char *end)
 884 {
 885   int mcnt, mcnt2;
 886   re_char *p = start;
 887   re_char *pend = end;
 888
 889   if (start == NULL)
 890     {
 891       fprintf (stderr, "(null)\n");
 892       return;
 893     }
 894
 895   /* Loop over pattern commands.  */
 896   while (p < pend)
 897     {
 898       fprintf (stderr, "%td:\t", p - start);
 899
 900       switch ((re_opcode_t) *p++)
 901         {
 902         case no_op:
 903           fprintf (stderr, "/no_op");
 904           break;
 905
 906         case succeed:
 907           fprintf (stderr, "/succeed");
 908           break;
 909
 910         case exactn:
 911           mcnt = *p++;
 912           fprintf (stderr, "/exactn/%d", mcnt);
 913           do
 914             {
 915               fprintf (stderr, "/%c", *p++);
 916             }
 917           while (--mcnt);
 918           break;
 919
 920         case start_memory:
 921           fprintf (stderr, "/start_memory/%d", *p++);
 922           break;
 923
 924         case stop_memory:
 925           fprintf (stderr, "/stop_memory/%d", *p++);
 926           break;
 927
 928         case duplicate:
 929           fprintf (stderr, "/duplicate/%d", *p++);
 930           break;
 931
 932         case anychar:
 933           fprintf (stderr, "/anychar");
 934           break;
 935
 936         case charset:
 937         case charset_not:
 938           {
 939             register int c, last = -100;
 940             register int in_range = 0;
 941             int length = CHARSET_BITMAP_SIZE (p - 1);
 942             int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
 943
 944             fprintf (stderr, "/charset [%s",
 945                      (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
 946
 947             if (p + *p >= pend)
 948               fprintf (stderr, " !extends past end of pattern! ");
 949
 950             for (c = 0; c < 256; c++)
 951               if (c / 8 < length
 952                   && (p[1 + (c/8)] & (1 << (c % 8))))
 953                 {
 954                   /* Are we starting a range?  */
 955                   if (last + 1 == c && ! in_range)
 956                     {
 957                       fprintf (stderr, "-");
 958                       in_range = 1;
 959                     }
 960                   /* Have we broken a range?  */
 961                   else if (last + 1 != c && in_range)
 962                     {
 963                       fprintf (stderr, "%c", last);
 964                       in_range = 0;
 965                     }
 966
 967                   if (! in_range)
 968                     fprintf (stderr, "%c", c);
 969
 970                   last = c;
 971               }
 972
 973             if (in_range)
 974               fprintf (stderr, "%c", last);
 975
 976             fprintf (stderr, "]");
 977
 978             p += 1 + length;
 979
 980             if (has_range_table)
 981               {
 982                 int count;
 983                 fprintf (stderr, "has-range-table");
 984
 985                 /* ??? Should print the range table; for now, just skip it.  */
 986                 p += 2;         /* skip range table bits */
 987                 EXTRACT_NUMBER_AND_INCR (count, p);
 988                 p = CHARSET_RANGE_TABLE_END (p, count);
 989               }
 990           }
 991           break;
 992
 993         case begline:
 994           fprintf (stderr, "/begline");
 995           break;
 996
 997         case endline:
 998           fprintf (stderr, "/endline");
 999           break;
1000
1001         case on_failure_jump:
1002           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1003           fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
1004           break;
1005
1006         case on_failure_keep_string_jump:
1007           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1008           fprintf (stderr, "/on_failure_keep_string_jump to %td",
1009                    p + mcnt - start);
1010           break;
1011
1012         case on_failure_jump_nastyloop:
1013           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1014           fprintf (stderr, "/on_failure_jump_nastyloop to %td",
1015                    p + mcnt - start);
1016           break;
1017
1018         case on_failure_jump_loop:
1019           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1020           fprintf (stderr, "/on_failure_jump_loop to %td",
1021                    p + mcnt - start);
1022           break;
1023
1024         case on_failure_jump_smart:
1025           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1026           fprintf (stderr, "/on_failure_jump_smart to %td",
1027                    p + mcnt - start);
1028           break;
1029
1030         case jump:
1031           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1032           fprintf (stderr, "/jump to %td", p + mcnt - start);
1033           break;
1034
1035         case succeed_n:
1036           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1037           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1038           fprintf (stderr, "/succeed_n to %td, %d times",
1039                    p - 2 + mcnt - start, mcnt2);
1040           break;
1041
1042         case jump_n:
1043           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1044           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1045           fprintf (stderr, "/jump_n to %td, %d times",
1046                    p - 2 + mcnt - start, mcnt2);
1047           break;
1048
1049         case set_number_at:
1050           EXTRACT_NUMBER_AND_INCR (mcnt, p);
1051           EXTRACT_NUMBER_AND_INCR (mcnt2, p);
1052           fprintf (stderr, "/set_number_at location %td to %d",
1053                    p - 2 + mcnt - start, mcnt2);
1054           break;
1055
1056         case wordbound:
1057           fprintf (stderr, "/wordbound");
1058           break;
1059
1060         case notwordbound:
1061           fprintf (stderr, "/notwordbound");
1062           break;
1063
1064         case wordbeg:
1065           fprintf (stderr, "/wordbeg");
1066           break;
1067
1068         case wordend:
1069           fprintf (stderr, "/wordend");
1070           break;
1071
1072         case symbeg:
1073           fprintf (stderr, "/symbeg");
1074           break;
1075
1076         case symend:
1077           fprintf (stderr, "/symend");
1078           break;
1079
1080         case syntaxspec:
1081           fprintf (stderr, "/syntaxspec");
1082           mcnt = *p++;
1083           fprintf (stderr, "/%d", mcnt);
1084           break;
1085
1086         case notsyntaxspec:
1087           fprintf (stderr, "/notsyntaxspec");
1088           mcnt = *p++;
1089           fprintf (stderr, "/%d", mcnt);
1090           break;
1091
1092 # ifdef emacs
1093         case before_dot:
1094           fprintf (stderr, "/before_dot");
1095           break;
1096
1097         case at_dot:
1098           fprintf (stderr, "/at_dot");
1099           break;
1100
1101         case after_dot:
1102           fprintf (stderr, "/after_dot");
1103           break;
1104
1105         case categoryspec:
1106           fprintf (stderr, "/categoryspec");
1107           mcnt = *p++;
1108           fprintf (stderr, "/%d", mcnt);
1109           break;
1110
1111         case notcategoryspec:
1112           fprintf (stderr, "/notcategoryspec");
1113           mcnt = *p++;
1114           fprintf (stderr, "/%d", mcnt);
1115           break;
1116 # endif /* emacs */
1117
1118         case begbuf:
1119           fprintf (stderr, "/begbuf");
1120           break;
1121
1122         case endbuf:
1123           fprintf (stderr, "/endbuf");
1124           break;
1125
1126         default:
1127           fprintf (stderr, "?%d", *(p-1));
1128         }
1129
1130       fprintf (stderr, "\n");
1131     }
1132
1133   fprintf (stderr, "%td:\tend of pattern.\n", p - start);
1134 }
1135
1136
1137 static void
1138 print_compiled_pattern (struct re_pattern_buffer *bufp)
1139 {
1140   re_char *buffer = bufp->buffer;
1141
1142   print_partial_compiled_pattern (buffer, buffer + bufp->used);
1143   printf ("%ld bytes used/%ld bytes allocated.\n",
1144           bufp->used, bufp->allocated);
1145
1146   if (bufp->fastmap_accurate && bufp->fastmap)
1147     {
1148       printf ("fastmap: ");
1149       print_fastmap (bufp->fastmap);
1150     }
1151
1152   printf ("re_nsub: %zu\t", bufp->re_nsub);
1153   printf ("regs_alloc: %d\t", bufp->regs_allocated);
1154   printf ("can_be_null: %d\t", bufp->can_be_null);
1155   printf ("no_sub: %d\t", bufp->no_sub);
1156   printf ("not_bol: %d\t", bufp->not_bol);
1157   printf ("not_eol: %d\t", bufp->not_eol);
1158   printf ("syntax: %lx\n", bufp->syntax);
1159   fflush (stdout);
1160   /* Perhaps we should print the translate table?  */
1161 }
1162
1163
1164 static void
1165 print_double_string (re_char *where, re_char *string1, ssize_t size1,
1166                      re_char *string2, ssize_t size2)
1167 {
1168   ssize_t this_char;
1169
1170   if (where == NULL)
1171     printf ("(null)");
1172   else
1173     {
1174       if (FIRST_STRING_P (where))
1175         {
1176           for (this_char = where - string1; this_char < size1; this_char++)
1177             putchar (string1[this_char]);
1178
1179           where = string2;
1180         }
1181
1182       for (this_char = where - string2; this_char < size2; this_char++)
1183         putchar (string2[this_char]);
1184     }
1185 }
1186
1187 #else /* not DEBUG */
1188
1189 # undef assert
1190 # define assert(e)
1191
1192 # define DEBUG_STATEMENT(e)
1193 # if __STDC_VERSION__ < 199901L
1194 #  define DEBUG_COMPILES_ARGUMENTS
1195 #  define DEBUG_PRINT /* 'DEBUG_PRINT (x, y)' discards X and Y.  */ (void)
1196 # else
1197 #  define DEBUG_PRINT(...)
1198 # endif
1199 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1200 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1201
1202 #endif /* not DEBUG */
1203 \f
/* IF_LINT (CODE) expands to CODE when `lint' is defined and to nothing
   otherwise.  Use it to suppress gcc's `...may be used before
   initialized' warnings.  */
#ifndef lint
# define IF_LINT(Code) /* empty */
#else
# define IF_LINT(Code) Code
#endif
1210 \f
1211 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
1212    also be assigned to arbitrarily: each pattern buffer stores its own
1213    syntax, so it can be changed between regex compilations.  */
1214 /* This has no initializer because initialized variables in Emacs
1215    become read-only after dumping.  */
1216 reg_syntax_t re_syntax_options;
1217
1218
1219 /* Specify the precise syntax of regexps for compilation.  This provides
1220    for compatibility for various utilities which historically have
1221    different, incompatible syntaxes.
1222
1223    The argument SYNTAX is a bit mask comprised of the various bits
1224    defined in regex.h.  We return the old syntax.  */
1225
1226 reg_syntax_t
1227 re_set_syntax (reg_syntax_t syntax)
1228 {
1229   reg_syntax_t ret = re_syntax_options;
1230
1231   re_syntax_options = syntax;
1232   return ret;
1233 }
1234 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1235
1236 /* Regexp to use to replace spaces, or NULL meaning don't.  */
1237 static const_re_char *whitespace_regexp;
1238
1239 void
1240 re_set_whitespace_regexp (const char *regexp)
1241 {
1242   whitespace_regexp = (const_re_char *) regexp;
1243 }
1244 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1245 \f
1246 /* This table gives an error message for each of the error codes listed
1247    in regex.h.  Obviously the order here has to be same as there.
1248    POSIX doesn't require that we do anything for REG_NOERROR,
1249    but why not be nice?  */
1250
1251 static const char *re_error_msgid[] =
1252   {
1253     gettext_noop ("Success"),   /* REG_NOERROR */
1254     gettext_noop ("No match"),  /* REG_NOMATCH */
1255     gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1256     gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1257     gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1258     gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1259     gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1260     gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1261     gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1262     gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1263     gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1264     gettext_noop ("Invalid range end"), /* REG_ERANGE */
1265     gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1266     gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1267     gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1268     gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1269     gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1270     gettext_noop ("Range striding over charsets") /* REG_ERANGEX  */
1271   };
1272 \f
1273 /* Avoiding alloca during matching, to placate r_alloc.  */
1274
1275 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1276    searching and matching functions should not call alloca.  On some
1277    systems, alloca is implemented in terms of malloc, and if we're
1278    using the relocating allocator routines, then malloc could cause a
1279    relocation, which might (if the strings being searched are in the
1280    ralloc heap) shift the data out from underneath the regexp
1281    routines.
1282
1283    Here's another reason to avoid allocation: Emacs
1284    processes input from X in a signal handler; processing X input may
1285    call malloc; if input arrives while a matching routine is calling
1286    malloc, then we're scrod.  But Emacs can't just block input while
1287    calling matching routines; then we don't notice interrupts when
1288    they come in.  So, Emacs blocks input around all regexp calls
1289    except the matching calls, which it leaves unprotected, in the
1290    faith that they will not malloc.  */
1291
/* By default the match routines are allowed to allocate.  */
#define MATCH_MAY_ALLOCATE

/* They may not, however, when (1) the allocation would go through
   malloc and (2) malloc is not safe for them to call.
   Note that if REL_ALLOC is defined, matching would not use malloc for
   the failure stack, but we would still use it for the register
   vectors; so REL_ALLOC should not affect this.  */
#if defined emacs && defined REGEX_MALLOC
# undef MATCH_MAY_ALLOCATE
#endif
1303
1304 \f
1305 /* Failure stack declarations and macros; both re_compile_fastmap and
1306    re_match_2 use a failure stack.  These have to be macros because of
1307    REGEX_ALLOCATE_STACK.  */
1308
1309
/* Approximate number of failure points to make room for when matching
   starts.  The stack is enlarged whenever this turns out to be too
   few, so it is not a hard limit.  A definition made before this
   point takes precedence.  */
#if !defined INIT_FAILURE_ALLOC
# define INIT_FAILURE_ALLOC 20
#endif
1316
/* Roughly the maximum number of failure points on the stack.  It would
   be exactly that if every failure used TYPICAL_FAILURE_SIZE items.
   This is a variable only so users of regex can assign to it; we never
   change it ourselves.  We always multiply it by TYPICAL_FAILURE_SIZE
   before using it, so it should probably be a byte-count instead.  */
#ifndef MATCH_MAY_ALLOCATE
size_t re_max_failures = 4000;
#else
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
size_t re_max_failures = 40000;
#endif
1331
1332 union fail_stack_elt
1333 {
1334   re_char *pointer;
1335   /* This should be the biggest `int' that's no bigger than a pointer.  */
1336   long integer;
1337 };
1338
1339 typedef union fail_stack_elt fail_stack_elt_t;
1340
1341 typedef struct
1342 {
1343   fail_stack_elt_t *stack;
1344   size_t size;
1345   size_t avail; /* Offset of next open position.  */
1346   size_t frame; /* Offset of the cur constructed frame.  */
1347 } fail_stack_type;
1348
/* Nonzero when `fail_stack' holds no frame, i.e. its frame offset
   is 0.  */
#define FAIL_STACK_EMPTY()     (!fail_stack.frame)
1350
1351
/* INIT_FAIL_STACK () readies the variable `fail_stack' for use.

   When the match routines may allocate, it obtains room for
   INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE slots and does `return -2'
   if that allocation fails.  Otherwise it merely empties a stack whose
   storage is managed elsewhere.  */

#ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.stack =                                                  \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE   \
                            * sizeof (fail_stack_elt_t));               \
                                                                        \
    if (!fail_stack.stack)                                              \
      return -2;                                                        \
                                                                        \
    fail_stack.frame = 0;                                               \
    fail_stack.avail = 0;                                               \
    fail_stack.size = INIT_FAILURE_ALLOC;                               \
  } while (0)
#else
# define INIT_FAIL_STACK()                                              \
  do {                                                                  \
    fail_stack.frame = 0;                                               \
    fail_stack.avail = 0;                                               \
  } while (0)

/* Resize ADDR to N elements of type T if it is already set, else
   allocate it afresh.  */
# define RETALLOC_IF(addr, n, t) \
  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
#endif
1379
1380
/* Enlarge FAIL_STACK by a factor of FAIL_STACK_GROWTH_FACTOR, up to a
   limit which allows approximately `re_max_failures' items.

   Evaluates to 1 on success, and to 0 if either the stack was already
   at that limit or the reallocation came back NULL.

   REGEX_REALLOCATE_STACK requires `destination' be declared.   */

/* Factor to increase the failure stack size by when we increase it.
   This used to be 2, but 2 was too wasteful because the old discarded
   stacks added up to as much space as the ultimate, maximum-size
   stack.  */
#define FAIL_STACK_GROWTH_FACTOR 4

/* Present size of FAIL_STACK in bytes.  */
#define FAIL_STACK_BYTES(fail_stack)                                    \
  ((fail_stack).size * sizeof (fail_stack_elt_t))

/* Size in bytes FAIL_STACK is to have after growing: the present size
   times the growth factor, capped at the overall limit.  */
#define FAIL_STACK_GROWN_BYTES(fail_stack)                              \
  (MIN (re_max_failures * TYPICAL_FAILURE_SIZE,                         \
        FAIL_STACK_BYTES (fail_stack) * FAIL_STACK_GROWTH_FACTOR))

#define GROW_FAIL_STACK(fail_stack)                                     \
  ((FAIL_STACK_BYTES (fail_stack)                                       \
    >= re_max_failures * TYPICAL_FAILURE_SIZE)                          \
   ? 0                                                                  \
   : ((fail_stack).stack                                                \
      = REGEX_REALLOCATE_STACK ((fail_stack).stack,                     \
                                FAIL_STACK_BYTES (fail_stack),          \
                                FAIL_STACK_GROWN_BYTES (fail_stack)),   \
                                                                        \
      (fail_stack).stack == NULL                                        \
      ? 0                                                               \
      : ((fail_stack).size                                              \
         = (FAIL_STACK_GROWN_BYTES (fail_stack)                         \
            / sizeof (fail_stack_elt_t)),                               \
         1)))
1415
1416
/* Store the pointer ITEM in the next open slot of the failure stack
   and move `avail' past it.  Assumes the variable `fail_stack'.
   Probably should only be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_POINTER(item)                                      \
  (fail_stack.stack[fail_stack.avail++].pointer = (item))

/* Likewise for the integer-valued ITEM.  Assumes the variable
   `fail_stack'.  Probably should only be called from within
   `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_INT(item)                                  \
  (fail_stack.stack[fail_stack.avail++].integer = (item))

/* These POP... operations complement the PUSH... operations: each one
   steps `avail' back and yields the slot it lands on.  All assume that
   `fail_stack' is nonempty.  */
#define POP_FAILURE_POINTER() (fail_stack.stack[--fail_stack.avail].pointer)
#define POP_FAILURE_INT() (fail_stack.stack[--fail_stack.avail].integer)

/* Individual items aside from the registers.  */
#define NUM_NONREG_ITEMS 3

/* Used to examine the stack (to detect infinite loops).  H is a frame
   handle; the three slots just below it hold, from the top down, the
   pattern position, the string position and the next handle.  */
#define FAILURE_PAT(h) (fail_stack.stack[(h) - 1].pointer)
#define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
#define NEXT_FAILURE_HANDLE(h) (fail_stack.stack[(h) - 3].integer)
#define TOP_FAILURE_HANDLE() (fail_stack.frame)
1442
1443
/* Make sure that more than SPACE slots remain open on `fail_stack',
   growing it as many times as that takes.  Does `return -2' from the
   enclosing function if the stack cannot be grown.  GROW_FAIL_STACK
   needs `destination' to be declared wherever this is used.

   SPACE is parenthesized so that an argument such as `a ? b : c' is
   compared as a whole, and the counts, being size_t, are printed with
   `%zu'.  */
#define ENSURE_FAIL_STACK(space)                                        \
while (REMAINING_AVAIL_SLOTS <= (space)) {                              \
  if (!GROW_FAIL_STACK (fail_stack))                                    \
    return -2;                                                          \
  DEBUG_PRINT ("\n  Doubled stack; size now: %zu\n", (fail_stack).size);\
  DEBUG_PRINT ("         slots available: %zu\n", REMAINING_AVAIL_SLOTS);\
}
1451
/* Save register NUM on the failure stack as three slots: its start
   pointer, its end pointer and finally the register number itself.
   Room for them is secured first, which may `return -2'.  */
#define PUSH_FAILURE_REG(num)                                           \
do {                                                                    \
  long n = num;                                                         \
  /* Wanted by the stack-growing code.  */                              \
  char *destination;                                                    \
                                                                        \
  ENSURE_FAIL_STACK(3);                                                 \
  DEBUG_PRINT ("    Push reg %ld (spanning %p -> %p)\n",                \
               n, regstart[n], regend[n]);                              \
                                                                        \
  PUSH_FAILURE_POINTER (regstart[n]);                                   \
  PUSH_FAILURE_POINTER (regend[n]);                                     \
  PUSH_FAILURE_INT (n);                                                 \
} while (0)
1464
/* Set the counter stored at PTR to VAL, first saving its old value on
   the failure stack so that it will be reset when backtracking.  The
   saved entry is three slots: the old value, PTR, and a -1 that marks
   the entry as a counter.  Securing room for it may `return -2'.  */
#define PUSH_NUMBER(ptr,val)                                            \
do {                                                                    \
  int c;                                                                \
  /* Wanted by the stack-growing code.  */                              \
  char *destination;                                                    \
                                                                        \
  ENSURE_FAIL_STACK(3);                                                 \
  EXTRACT_NUMBER (c, ptr);                                              \
  DEBUG_PRINT ("    Push number %p = %d -> %d\n", ptr, c, val);         \
                                                                        \
  PUSH_FAILURE_INT (c);                                                 \
  PUSH_FAILURE_POINTER (ptr);                                           \
  PUSH_FAILURE_INT (-1);                                                \
                                                                        \
  STORE_NUMBER (ptr, val);                                              \
} while (0)
1479
/* Pop one saved entry off the failure stack and put it back in force.
   The top slot says what the entry is: -1 marks a saved counter, any
   other value is the number of a saved register.  */
#define POP_FAILURE_REG_OR_COUNT()                                      \
do {                                                                    \
  long pfreg = POP_FAILURE_INT ();                                      \
  if (pfreg != -1)                                                      \
    {                                                                   \
      /* A register: the end was pushed last, so it comes off first.  */\
      regend[pfreg] = POP_FAILURE_POINTER ();                           \
      regstart[pfreg] = POP_FAILURE_POINTER ();                         \
      DEBUG_PRINT ("     Pop reg %ld (spanning %p -> %p)\n",            \
                   pfreg, regstart[pfreg], regend[pfreg]);              \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      /* It's a counter.  */                                            \
      /* Here, we discard `const', making re_match non-reentrant.  */   \
      unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER ();     \
      pfreg = POP_FAILURE_INT ();                                       \
      STORE_NUMBER (ptr, pfreg);                                        \
      DEBUG_PRINT ("     Pop counter %p = %ld\n", ptr, pfreg);          \
    }                                                                   \
} while (0)
1501
/* Check that we are not stuck in an infinite loop.

   Starting from the top failure frame, follow the chain of frames for
   as long as their saved string position is STRING_PLACE or NULL.  If
   one of those frames saved the pattern position PAT_CUR, set the
   variable `cycle' to 1 and stop.

   Assumes the variables `fail_stack' and `cycle', and, where assert
   is active, `bufp'.  */
#define CHECK_INFINITE_LOOP(pat_cur, string_place)                      \
do {                                                                    \
  ssize_t failure = TOP_FAILURE_HANDLE ();                              \
  /* Check for infinite matching loops */                               \
  while (failure > 0                                                    \
         && (FAILURE_STR (failure) == (string_place)                    \
             || FAILURE_STR (failure) == NULL))                         \
    {                                                                   \
      assert (FAILURE_PAT (failure) >= bufp->buffer                     \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
      if (FAILURE_PAT (failure) == (pat_cur))                           \
        {                                                               \
          cycle = 1;                                                    \
          break;                                                        \
        }                                                               \
      DEBUG_PRINT ("  Other pattern: %p\n", FAILURE_PAT (failure));     \
      failure = NEXT_FAILURE_HANDLE(failure);                           \
    }                                                                   \
  /* Once the chain is used up FAILURE is 0, and handle 0 has no       \
     slots below it, so there is no saved string to report.  */         \
  if (failure > 0)                                                      \
    {                                                                   \
      DEBUG_PRINT ("  Other string: %p\n", FAILURE_STR (failure));      \
    }                                                                   \
} while (0)
1523
/* Push the information about the state we will need
   if we ever fail back to it.

   A frame of NUM_NONREG_ITEMS slots is pushed: the offset of the
   previous frame, then STRING_PLACE, then PATTERN.  Afterwards
   `fail_stack.frame' designates the position just past the new frame.

   Requires the variable `fail_stack'; the debugging output also names
   `bufp', `pend', `string1', `size1', `string2' and `size2'.

   Does `return -2' (by way of ENSURE_FAIL_STACK) if it runs out of
   memory.  */

#define PUSH_FAILURE_POINT(pattern, string_place)                       \
do {                                                                    \
  /* GROW_FAIL_STACK, reached through ENSURE_FAIL_STACK, needs this.  */\
  char *destination;                                                    \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_pushed++);                           \
  DEBUG_PRINT ("\nPUSH_FAILURE_POINT:\n");                              \
  DEBUG_PRINT ("  Before push, next avail: %zu\n", (fail_stack).avail); \
  DEBUG_PRINT ("                        size: %zu\n", (fail_stack).size);\
                                                                        \
  ENSURE_FAIL_STACK (NUM_NONREG_ITEMS);                                 \
                                                                        \
  DEBUG_PRINT ("\n");                                                   \
                                                                        \
  DEBUG_PRINT ("  Push frame index: %zu\n", fail_stack.frame);          \
  PUSH_FAILURE_INT (fail_stack.frame);                                  \
                                                                        \
  DEBUG_PRINT ("  Push string %p: `", string_place);                    \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
  DEBUG_PRINT ("'\n");                                                  \
  PUSH_FAILURE_POINTER (string_place);                                  \
                                                                        \
  DEBUG_PRINT ("  Push pattern %p: ", pattern);                         \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend);                   \
  PUSH_FAILURE_POINTER (pattern);                                       \
                                                                        \
  /* Close the frame by moving the frame pointer past it.  */           \
  fail_stack.frame = fail_stack.avail;                                  \
} while (0)
1563
1564 /* Estimate the size of data pushed by a typical failure stack entry.
1565    An estimate is all we need, because all we use this for
1566    is to choose a limit for how big to make the failure stack.  */
1567 /* BEWARE, the value `20' is hard-coded in emacs.c:main().  */
#define TYPICAL_FAILURE_SIZE 20

/* How many items can still be added to the stack without overflowing it.
   Expects a variable `fail_stack' in scope at the expansion site.  */
#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1572
1573
/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, both of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
   Any registers or counters saved on top of the failure point are
   restored first, by POP_FAILURE_REG_OR_COUNT.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */
1583
#define POP_FAILURE_POINT(str, pat)                                     \
do {                                                                    \
  assert (!FAIL_STACK_EMPTY ());                                        \
                                                                        \
  /* Trace the state of the stack before anything is popped.  */        \
  DEBUG_PRINT ("POP_FAILURE_POINT:\n");                                 \
  DEBUG_PRINT ("  Before pop, next avail: %zd\n", fail_stack.avail);    \
  DEBUG_PRINT ("                     size: %zd\n", fail_stack.size);    \
                                                                        \
  /* Pop the saved registers: everything that was pushed after the      \
     current frame was closed.  */                                      \
  while (fail_stack.frame < fail_stack.avail)                           \
    POP_FAILURE_REG_OR_COUNT ();                                        \
                                                                        \
  pat = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT ("  Popping pattern %p: ", pat);                          \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);                       \
                                                                        \
  /* Restore the saved string position.                                 \
     NOTE(review): an older comment here said that a NULL saved         \
     position (pushed for on_failure_keep_string_jump) is thrown away   \
     so that the current position is kept, but this code assigns STR    \
     unconditionally -- confirm how callers treat a NULL result.  */    \
  str = POP_FAILURE_POINTER ();                                         \
  DEBUG_PRINT ("  Popping string %p: `", str);                          \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);      \
  DEBUG_PRINT ("'\n");                                                  \
                                                                        \
  /* Re-open the enclosing frame.  */                                   \
  fail_stack.frame = POP_FAILURE_INT ();                                \
  DEBUG_PRINT ("  Popping  frame index: %zd\n", fail_stack.frame);      \
                                                                        \
  assert (fail_stack.avail >= 0);                                       \
  assert (fail_stack.frame <= fail_stack.avail);                        \
                                                                        \
  DEBUG_STATEMENT (nfailure_points_popped++);                           \
} while (0) /* POP_FAILURE_POINT */
1617
1618
1619 \f
/* Registers are set to a sentinel when they haven't yet matched.
   The sentinel is the null pointer, so E must be a pointer-valued
   register entry; the result is nonzero when E is still unset.  */
#define REG_UNSET(e) ((e) == NULL)
1622 \f
1623 /* Subroutine declarations and macros for regex_compile.  */
1624
1625 static reg_errcode_t regex_compile (re_char *pattern, size_t size,
1626                                     reg_syntax_t syntax,
1627                                     struct re_pattern_buffer *bufp);
1628 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1629 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1630 static void insert_op1 (re_opcode_t op, unsigned char *loc,
1631                         int arg, unsigned char *end);
1632 static void insert_op2 (re_opcode_t op, unsigned char *loc,
1633                         int arg1, int arg2, unsigned char *end);
1634 static boolean at_begline_loc_p (re_char *pattern, re_char *p,
1635                                  reg_syntax_t syntax);
1636 static boolean at_endline_loc_p (re_char *p, re_char *pend,
1637                                  reg_syntax_t syntax);
1638 static re_char *skip_one_char (re_char *p);
1639 static int analyse_first (re_char *p, re_char *pend,
1640                           char *fastmap, const int multibyte);
1641
/* Fetch the next character in the uncompiled pattern, with no
   translation.  The character is stored in C and `p' is advanced past
   it.  Expects `p', `pend' and `multibyte' in scope at the expansion
   site.

   If the pattern is exhausted (p == pend) this returns REG_EEND
   straight from the enclosing function, without any cleanup.
   GET_INTERVAL_COUNT, for example, tests p == pend itself first and
   fails through FREE_STACK_RETURN instead.  */
#define PATFETCH(c)                                                     \
  do {                                                                  \
    int len;                                                            \
    if (p == pend) return REG_EEND;                                     \
    c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte);                  \
    p += len;                                                           \
  } while (0)
1651
1652
/* If `translate' is non-null, return translate[D], else just D.  We
   cast the subscript to translate because some data is declared as
   `char *', to avoid warnings when a string constant is passed.  But
   when we use a character as a subscript we must make it unsigned.

   Expects a variable `translate' in scope.  Only defined here when no
   TRANSLATE macro exists already.
   NOTE(review): no cast is visible in this definition; the casting
   described above presumably happens inside RE_TRANSLATE -- confirm.  */
#ifndef TRANSLATE
# define TRANSLATE(d) \
  (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
#endif
1661
1662
1663 /* Macros for outputting the compiled pattern into `buffer'.  */
1664
1665 /* If the buffer isn't allocated when it comes in, use this.  */
1666 #define INIT_BUF_SIZE  32
1667
/* Make sure we have at least N more bytes of space in buffer.
   Expects `b' (the current write position) and `bufp' in scope.  The
   buffer is grown with EXTEND_BUFFER until the request fits, so this can
   return an error code from the enclosing function just as
   EXTEND_BUFFER does.  */
#define GET_BUFFER_SPACE(n)                                             \
    while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated)         \
      EXTEND_BUFFER ()

/* Make sure we have one more byte of buffer space and then add C to it.
   C is truncated to an unsigned char and `b' is advanced by one.  */
#define BUF_PUSH(c)                                                     \
  do {                                                                  \
    GET_BUFFER_SPACE (1);                                               \
    *b++ = (unsigned char) (c);                                         \
  } while (0)


/* Ensure we have two more bytes of buffer space and then append C1 and C2.
   Space for both bytes is reserved before either one is stored.  */
#define BUF_PUSH_2(c1, c2)                                              \
  do {                                                                  \
    GET_BUFFER_SPACE (2);                                               \
    *b++ = (unsigned char) (c1);                                        \
    *b++ = (unsigned char) (c2);                                        \
  } while (0)
1688
1689
/* Store a jump with opcode OP at LOC to location TO.  We store a
   relative address offset by the three bytes the jump itself occupies.
   LOC and TO are pointers into the same buffer; both are evaluated more
   than once.  */
#define STORE_JUMP(op, loc, to) \
  store_op1 (op, loc, (to) - (loc) - 3)

/* Likewise, for a two-argument jump.  ARG is passed through unchanged as
   the second operand.  */
#define STORE_JUMP2(op, loc, to, arg) \
  store_op2 (op, loc, (to) - (loc) - 3, arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.  */
#define INSERT_JUMP(op, loc, to) \
  insert_op1 (op, loc, (to) - (loc) - 3, b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
#define INSERT_JUMP2(op, loc, to, arg) \
  insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1706
1707
1708 /* This is not an arbitrary limit: the arguments which represent offsets
1709    into the pattern are two bytes long.  So if 2^15 bytes turns out to
1710    be too small, many things would have to change.  */
1711 # define MAX_BUF_SIZE (1L << 15)
1712
/* Extend the buffer to twice its current size (but never beyond
   MAX_BUF_SIZE) via realloc and reset the pointers that pointed into
   the old block to point to the correct places in the new one.  If the
   buffer is already MAX_BUF_SIZE bytes long, fail with REG_ESIZE; if
   the reallocation fails, fail with REG_ESPACE.  */
#if __BOUNDED_POINTERS__
/* Set P's high bound to its low bound plus the buffer's allocated size.  */
# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
/* Rebase the bounded pointer P from `old_buffer' to `new_buffer': its low
   bound and its value keep their offsets, and its high bound is
   recomputed.  */
# define MOVE_BUFFER_POINTER(P)                                 \
  (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer),     \
   SET_HIGH_BOUND (P),                                          \
   __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
/* `else' branch spliced in after EXTEND_BUFFER's "did the buffer move?"
   test: when the block stayed in place only the high bounds need
   refreshing.  */
# define ELSE_EXTEND_BUFFER_HIGH_BOUND          \
  else                                          \
    {                                           \
      SET_HIGH_BOUND (b);                       \
      SET_HIGH_BOUND (begalt);                  \
      if (fixup_alt_jump)                       \
        SET_HIGH_BOUND (fixup_alt_jump);        \
      if (laststart)                            \
        SET_HIGH_BOUND (laststart);             \
      if (pending_exact)                        \
        SET_HIGH_BOUND (pending_exact);         \
    }
#else
/* Rebase P from `old_buffer' to `new_buffer', keeping its offset.  Both
   variables must be in scope at the expansion site.  */
# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
/* Nothing to refresh when pointers carry no bounds.  */
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
/* Double the size of the compiled-pattern buffer (capped at MAX_BUF_SIZE)
   and, if the block moved, relocate the compiler's pointers into it.
   Expects `bufp', `b', `begalt', `fixup_alt_jump', `laststart' and
   `pending_exact' in scope at the expansion site.

   Returns REG_ESIZE from the enclosing function if the buffer is already
   MAX_BUF_SIZE bytes long, and REG_ESPACE if the reallocation fails.
   BUFP is only updated once the reallocation has succeeded: a failed
   realloc leaves the old block intact, so on REG_ESPACE the pattern
   buffer still owns its old, valid storage with a matching `allocated'
   instead of a null pointer paired with an already enlarged size.  */
#define EXTEND_BUFFER()                                                 \
  do {                                                                  \
    unsigned char *old_buffer = bufp->buffer;                           \
    unsigned char *new_buffer = old_buffer;                             \
    size_t new_allocated;                                               \
    if (bufp->allocated == MAX_BUF_SIZE)                                \
      return REG_ESIZE;                                                 \
    new_allocated = (size_t) bufp->allocated << 1;                      \
    if (new_allocated > MAX_BUF_SIZE)                                   \
      new_allocated = MAX_BUF_SIZE;                                     \
    /* Reallocate through a temporary so that a failure cannot clobber  \
       the only pointer to the old block.  */                           \
    RETALLOC (new_buffer, new_allocated, unsigned char);                \
    if (new_buffer == NULL)                                             \
      return REG_ESPACE;                                                \
    bufp->buffer = new_buffer;                                          \
    bufp->allocated = new_allocated;                                    \
    /* If the buffer moved, move all the pointers into it.  */          \
    if (old_buffer != new_buffer)                                       \
      {                                                                 \
        MOVE_BUFFER_POINTER (b);                                        \
        MOVE_BUFFER_POINTER (begalt);                                   \
        if (fixup_alt_jump)                                             \
          MOVE_BUFFER_POINTER (fixup_alt_jump);                         \
        if (laststart)                                                  \
          MOVE_BUFFER_POINTER (laststart);                              \
        if (pending_exact)                                              \
          MOVE_BUFFER_POINTER (pending_exact);                          \
      }                                                                 \
    ELSE_EXTEND_BUFFER_HIGH_BOUND                                       \
  } while (0)
1765
1766
1767 /* Since we have one byte reserved for the register number argument to
1768    {start,stop}_memory, the maximum number of groups we can report
1769    things about is what fits in that byte.  */
1770 #define MAX_REGNUM 255
1771
1772 /* But patterns can have more than `MAX_REGNUM' registers.  We just
1773    ignore the excess.  */
1774 typedef int regnum_t;
1775
1776
1777 /* Macros for the compile stack.  */
1778
/* Since offsets can go either forwards or backwards, this type needs to
   be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.  */
/* int may be not enough when sizeof(int) == 2.  */
typedef long pattern_offset_t;

/* One entry of the compile stack.  The three offsets are stored as
   offsets rather than pointers; presumably they are the positions of the
   compiler's `begalt', `fixup_alt_jump' and `laststart' pointers within
   the pattern buffer, which can be reallocated (see EXTEND_BUFFER) --
   confirm against the code that pushes and pops entries.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t laststart_offset;
  regnum_t regnum;              /* Register (group) number of this entry.  */
} compile_stack_elt_t;


/* The compile stack itself: an array of SIZE entries of which the first
   AVAIL are in use.  */
typedef struct
{
  compile_stack_elt_t *stack;
  size_t size;
  size_t avail;                 /* Offset of next open position.  */
} compile_stack_type;
1799
1800
#define INIT_COMPILE_STACK_SIZE 32

/* State tests; both expect a variable `compile_stack' in scope.  */
#define COMPILE_STACK_EMPTY  (compile_stack.avail == 0)
#define COMPILE_STACK_FULL  (compile_stack.avail == compile_stack.size)

/* The next available element.  Despite the name this is the first free
   slot (index `avail'), not the most recently pushed entry.  */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1808
/* Explicit quit checking is needed for Emacs, which uses polling to
   process input events.  Inside Emacs this runs QUIT when
   `immediate_quit' is set; elsewhere it expands to a no-op.  Note that
   the Emacs form is a statement while the fallback is an expression, so
   use it only as a full statement.  */
#ifdef emacs
# define IMMEDIATE_QUIT_CHECK                   \
    do {                                        \
      if (immediate_quit) QUIT;                 \
    } while (0)
#else
# define IMMEDIATE_QUIT_CHECK    ((void)0)
#endif
1819 \f
/* Structure to manage work area for range table.
   Beware of the units: `allocated' counts bytes whereas `used' counts
   ints, which is why EXTEND_RANGE_TABLE scales `used' by sizeof (int)
   before comparing the two.  */
struct range_table_work_area
{
  int *table;                   /* actual work area.  */
  int allocated;                /* allocated size for work area in bytes.  */
  int used;                     /* actually used size in words.  */
  int bits;                     /* flag to record character classes */
};
1828
1829 #ifdef emacs
1830
/* Make sure that WORK_AREA can hold N more ints (range-table entries).
   WORK_AREA must be an lvalue of type `struct range_table_work_area'
   (not a pointer to one): its members are accessed directly and its
   address is passed to extend_range_table_work_area.  The area is grown
   repeatedly until the request fits, so N is not limited to one growth
   step.
   If it can't get the space, it returns REG_ESPACE from the surrounding
   function.  */

#define EXTEND_RANGE_TABLE(work_area, n)                                \
  do {                                                                  \
    while (((work_area).used + (n)) * sizeof (int)                      \
           > (work_area).allocated)                                     \
      {                                                                 \
        extend_range_table_work_area (&(work_area));                    \
        if ((work_area).table == 0)                                     \
          return (REG_ESPACE);                                          \
      }                                                                 \
  } while (0)
1845
/* Turn on BIT in the character-class flags of WORK_AREA.  The expansion
   is fully parenthesized so that it stays a single operand when used
   inside a larger expression; its value is the updated flag word.  */
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)           \
  ((work_area).bits |= (bit))
1848
/* Set a range (RANGE_START, RANGE_END) to WORK_AREA.
   The pair is appended as two consecutive ints after room for them has
   been reserved with EXTEND_RANGE_TABLE, so this too can return
   REG_ESPACE from the surrounding function.  WORK_AREA is evaluated
   several times.  */
#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)    \
  do {                                                                  \
    EXTEND_RANGE_TABLE ((work_area), 2);                                \
    (work_area).table[(work_area).used++] = (range_start);              \
    (work_area).table[(work_area).used++] = (range_end);                \
  } while (0)
1856
1857 #endif /* emacs */
1858
/* Release the table owned by WORK_AREA.  A null table is fine: free
   accepts a null pointer and does nothing.  The `table' member itself is
   left unchanged.  */
#define FREE_RANGE_TABLE_WORK_AREA(work_area)   \
  do {                                          \
    free ((work_area).table);                   \
  } while (0)

/* Forget every range and class flag recorded so far; the storage is kept.  */
#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
/* Number of ints currently stored in the table.  */
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
/* Character-class flags recorded so far.  */
#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
/* The I-th int of the table, as an lvalue.  */
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1870
1871 /* Bits used to implement the multibyte-part of the various character classes
1872    such as [:alnum:] in a charset's range table.  */
1873 #define BIT_WORD        0x1
1874 #define BIT_LOWER       0x2
1875 #define BIT_PUNCT       0x4
1876 #define BIT_SPACE       0x8
1877 #define BIT_UPPER       0x10
1878 #define BIT_MULTIBYTE   0x20
1879 \f
1880
/* Set the bit for character C in a list.  The list is the bitmap that
   the variable `b' points at, which must be in scope; C is evaluated
   twice.  */
#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1883
1884
1885 #ifdef emacs
1886
/* Store characters in the range FROM to TO in the bitmap at B (for
   ASCII and unibyte characters) and WORK_AREA (for multibyte
   characters) while translating them and paying attention to the
   continuity of translated characters.

   Implementation note: It is better to implement these fairly big
   macros by a function, but it's not that easy because macros called
   in this macro assume various local variables already declared.  */

/* Both FROM and TO are ASCII characters.

   Each character C0 of the range is translated to C1.  If C1 is not
   ASCII it is recorded in WORK_AREA as the one-character range (C1, C1)
   and then converted to unibyte, falling back to C0 when that conversion
   yields a negative value.  The bitmap bit of the resulting character is
   set in every case.  Can return REG_ESPACE from the surrounding
   function through SET_RANGE_TABLE_WORK_AREA.  */

#define SETUP_ASCII_RANGE(work_area, FROM, TO)                  \
  do {                                                          \
    int C0, C1;                                                 \
                                                                \
    for (C0 = (FROM); C0 <= (TO); C0++)                         \
      {                                                         \
        C1 = TRANSLATE (C0);                                    \
        if (! ASCII_CHAR_P (C1))                                \
          {                                                     \
            SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);    \
            if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0)             \
              C1 = C0;                                          \
          }                                                     \
        SET_LIST_BIT (C1);                                      \
      }                                                         \
  } while (0)
1914
1915
/* Both FROM and TO are unibyte characters (0x80..0xFF).

   Each byte C0 of the range is converted to its multibyte form C1.  If
   CHAR_BYTE8_P holds for C1, only the bitmap bit of C0 is set.
   Otherwise C1 is translated to C2 and the bitmap bit is set for the
   unibyte form of C2, or for C0 when the translation changed nothing or
   C2 has no unibyte form.  C2 is then recorded in WORK_AREA: the ranges
   appended since this macro started (entries from index USED upwards)
   are searched from the newest backwards for one that contains C2 or
   borders it; a bordering range is widened by one, and if no range
   qualifies the one-character range (C2, C2) is appended.  Can return
   REG_ESPACE from the surrounding function through
   SET_RANGE_TABLE_WORK_AREA.  */

#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO)                               \
  do {                                                                         \
    int C0, C1, C2, I;                                                         \
    int USED = RANGE_TABLE_WORK_USED (work_area);                              \
                                                                               \
    for (C0 = (FROM); C0 <= (TO); C0++)                                        \
      {                                                                        \
        C1 = RE_CHAR_TO_MULTIBYTE (C0);                                        \
        if (CHAR_BYTE8_P (C1))                                                 \
          SET_LIST_BIT (C0);                                                   \
        else                                                                   \
          {                                                                    \
            C2 = TRANSLATE (C1);                                               \
            if (C2 == C1                                                       \
                || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0)                         \
              C1 = C0;                                                         \
            SET_LIST_BIT (C1);                                                 \
            for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
              {                                                                \
                int from = RANGE_TABLE_WORK_ELT (work_area, I);                \
                int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
                                                                               \
                if (C2 >= from - 1 && C2 <= to + 1)                            \
                  {                                                            \
                    if (C2 == from - 1)                                        \
                      RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
                    else if (C2 == to + 1)                                     \
                      RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
                    break;                                                     \
                  }                                                            \
              }                                                                \
            if (I < USED)                                                      \
              SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2);                 \
          }                                                                    \
      }                                                                        \
  } while (0)
1954
1955
/* Both FROM and TO are multibyte characters.

   The untranslated range (FROM, TO) is appended to WORK_AREA first.
   Then every character C0 of the range is translated to C1.  If C1 has
   a unibyte form -- or, when the translation changed the character, if
   C0 has one -- the corresponding bitmap bit is set.  A translation that
   stays inside FROM..TO needs no further work.  Otherwise C1 is merged
   into the ranges appended since this macro started (USED is sampled
   before the initial range is appended, so that range takes part in the
   search): a range that borders C1 is widened by one, and if no range
   contains or borders C1 the one-character range (C1, C1) is appended.
   Can return REG_ESPACE from the surrounding function through
   SET_RANGE_TABLE_WORK_AREA.  */

#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO)                         \
  do {                                                                     \
    int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area);           \
                                                                           \
    SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO));                 \
    for (C0 = (FROM); C0 <= (TO); C0++)                                    \
      {                                                                    \
        C1 = TRANSLATE (C0);                                               \
        if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0                            \
            || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0))          \
          SET_LIST_BIT (C2);                                               \
        if (C1 >= (FROM) && C1 <= (TO))                                    \
          continue;                                                        \
        for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
          {                                                                \
            int from = RANGE_TABLE_WORK_ELT (work_area, I);                \
            int to = RANGE_TABLE_WORK_ELT (work_area, I + 1);              \
                                                                           \
            if (C1 >= from - 1 && C1 <= to + 1)                            \
              {                                                            \
                if (C1 == from - 1)                                        \
                  RANGE_TABLE_WORK_ELT (work_area, I)--;                   \
                else if (C1 == to + 1)                                     \
                  RANGE_TABLE_WORK_ELT (work_area, I + 1)++;               \
                break;                                                     \
              }                                                            \
          }                                                                \
        if (I < USED)                                                      \
          SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1);                 \
      }                                                                    \
  } while (0)
1989
1990 #endif /* emacs */
1991
/* Get the next unsigned number in the uncompiled pattern.

   NUM must be an lvalue.  A negative NUM on entry is reset to 0 when the
   first digit is read and is left untouched when no digit is present, so
   the caller can tell "no number" from 0.  On exit `c' holds the first
   non-digit character that was fetched.  Expects `p', `pend' and `c' in
   scope.

   Fails through FREE_STACK_RETURN with REG_EBRACE if the pattern ends
   before a non-digit is seen, and with REG_BADBR if appending the next
   digit would make the number exceed RE_DUP_MAX; that test is arranged
   so that NUM * 10 + digit is never computed when it would be too
   large.  */
#define GET_INTERVAL_COUNT(num)                                 \
  do {                                                                  \
    if (p == pend)                                                      \
      FREE_STACK_RETURN (REG_EBRACE);                                   \
    else                                                                \
      {                                                                 \
        PATFETCH (c);                                                   \
        while ('0' <= c && c <= '9')                                    \
          {                                                             \
            if (num < 0)                                                \
              num = 0;                                                  \
            if (RE_DUP_MAX / 10 - (RE_DUP_MAX % 10 < c - '0') < num)    \
              FREE_STACK_RETURN (REG_BADBR);                            \
            num = num * 10 + c - '0';                                   \
            if (p == pend)                                              \
              FREE_STACK_RETURN (REG_EBRACE);                           \
            PATFETCH (c);                                               \
          }                                                             \
      }                                                                 \
  } while (0)
2013 \f
2014 #if ! WIDE_CHAR_SUPPORT
2015
2016 /* Map a string to the char class it names (if any).  */
2017 re_wctype_t
2018 re_wctype (const_re_char *str)
2019 {
2020   const char *string = (const char *) str;
2021   if      (STREQ (string, "alnum"))     return RECC_ALNUM;
2022   else if (STREQ (string, "alpha"))     return RECC_ALPHA;
2023   else if (STREQ (string, "word"))      return RECC_WORD;
2024   else if (STREQ (string, "ascii"))     return RECC_ASCII;
2025   else if (STREQ (string, "nonascii"))  return RECC_NONASCII;
2026   else if (STREQ (string, "graph"))     return RECC_GRAPH;
2027   else if (STREQ (string, "lower"))     return RECC_LOWER;
2028   else if (STREQ (string, "print"))     return RECC_PRINT;
2029   else if (STREQ (string, "punct"))     return RECC_PUNCT;
2030   else if (STREQ (string, "space"))     return RECC_SPACE;
2031   else if (STREQ (string, "upper"))     return RECC_UPPER;
2032   else if (STREQ (string, "unibyte"))   return RECC_UNIBYTE;
2033   else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2034   else if (STREQ (string, "digit"))     return RECC_DIGIT;
2035   else if (STREQ (string, "xdigit"))    return RECC_XDIGIT;
2036   else if (STREQ (string, "cntrl"))     return RECC_CNTRL;
2037   else if (STREQ (string, "blank"))     return RECC_BLANK;
2038   else return 0;
2039 }
2040
2041 /* True if CH is in the char class CC.  */
2042 boolean
2043 re_iswctype (int ch, re_wctype_t cc)
2044 {
2045   switch (cc)
2046     {
2047     case RECC_ALNUM: return ISALNUM (ch) != 0;
2048     case RECC_ALPHA: return ISALPHA (ch) != 0;
2049     case RECC_BLANK: return ISBLANK (ch) != 0;
2050     case RECC_CNTRL: return ISCNTRL (ch) != 0;
2051     case RECC_DIGIT: return ISDIGIT (ch) != 0;
2052     case RECC_GRAPH: return ISGRAPH (ch) != 0;
2053     case RECC_LOWER: return ISLOWER (ch) != 0;
2054     case RECC_PRINT: return ISPRINT (ch) != 0;
2055     case RECC_PUNCT: return ISPUNCT (ch) != 0;
2056     case RECC_SPACE: return ISSPACE (ch) != 0;
2057     case RECC_UPPER: return ISUPPER (ch) != 0;
2058     case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2059     case RECC_ASCII: return IS_REAL_ASCII (ch) != 0;
2060     case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2061     case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
2062     case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2063     case RECC_WORD: return ISWORD (ch) != 0;
2064     case RECC_ERROR: return false;
2065     default:
2066       abort ();
2067     }
2068 }
2069
2070 /* Return a bit-pattern to use in the range-table bits to match multibyte
2071    chars of class CC.  */
2072 static int
2073 re_wctype_to_bit (re_wctype_t cc)
2074 {
2075   switch (cc)
2076     {
2077     case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
2078     case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2079     case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2080     case RECC_LOWER: return BIT_LOWER;
2081     case RECC_UPPER: return BIT_UPPER;
2082     case RECC_PUNCT: return BIT_PUNCT;
2083     case RECC_SPACE: return BIT_SPACE;
2084     case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2085     case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2086     default:
2087       abort ();
2088     }
2089 }
2090 #endif
2091 \f
2092 /* Filling in the work area of a range.  */
2093
2094 /* Actually extend the space in WORK_AREA.  */
2095
2096 static void
2097 extend_range_table_work_area (struct range_table_work_area *work_area)
2098 {
2099   work_area->allocated += 16 * sizeof (int);
2100   work_area->table = realloc (work_area->table, work_area->allocated);
2101 }
2102
2103 #if 0
2104 #ifdef emacs
2105
2106 /* Carefully find the ranges of codes that are equivalent
2107    under case conversion to the range start..end when passed through
2108    TRANSLATE.  Handle the case where non-letters can come in between
2109    two upper-case letters (which happens in Latin-1).
2110    Also handle the case of groups of more than 2 case-equivalent chars.
2111
2112    The basic method is to look at consecutive characters and see
2113    if they can form a run that can be handled as one.
2114
2115    Returns -1 if successful, REG_ESPACE if ran out of space.  */
2116
2117 static int
2118 set_image_of_range_1 (struct range_table_work_area *work_area,
2119                       re_wchar_t start, re_wchar_t end,
2120                       RE_TRANSLATE_TYPE translate)
2121 {
2122   /* `one_case' indicates a character, or a run of characters,
2123      each of which is an isolate (no case-equivalents).
2124      This includes all ASCII non-letters.
2125
2126      `two_case' indicates a character, or a run of characters,
2127      each of which has two case-equivalent forms.
2128      This includes all ASCII letters.
2129
2130      `strange' indicates a character that has more than one
2131      case-equivalent.  */
2132
2133   enum case_type {one_case, two_case, strange};
2134
2135   /* Describe the run that is in progress,
2136      which the next character can try to extend.
2137      If run_type is strange, that means there really is no run.
2138      If run_type is one_case, then run_start...run_end is the run.
2139      If run_type is two_case, then the run is run_start...run_end,
2140      and the case-equivalents end at run_eqv_end.  */
2141
2142   enum case_type run_type = strange;
2143   int run_start, run_end, run_eqv_end;
2144
2145   Lisp_Object eqv_table;
2146
2147   if (!RE_TRANSLATE_P (translate))
2148     {
2149       EXTEND_RANGE_TABLE (work_area, 2);
2150       work_area->table[work_area->used++] = (start);
2151       work_area->table[work_area->used++] = (end);
2152       return -1;
2153     }
2154
2155   eqv_table = XCHAR_TABLE (translate)->extras[2];
2156
2157   for (; start <= end; start++)
2158     {
2159       enum case_type this_type;
2160       int eqv = RE_TRANSLATE (eqv_table, start);
2161       int minchar, maxchar;
2162
2163       /* Classify this character */
2164       if (eqv == start)
2165         this_type = one_case;
2166       else if (RE_TRANSLATE (eqv_table, eqv) == start)
2167         this_type = two_case;
2168       else
2169         this_type = strange;
2170
2171       if (start < eqv)
2172         minchar = start, maxchar = eqv;
2173       else
2174         minchar = eqv, maxchar = start;
2175
2176       /* Can this character extend the run in progress?  */
2177       if (this_type == strange || this_type != run_type
2178           || !(minchar == run_end + 1
2179                && (run_type == two_case
2180                    ? maxchar == run_eqv_end + 1 : 1)))
2181         {
2182           /* No, end the run.
2183              Record each of its equivalent ranges.  */
2184           if (run_type == one_case)
2185             {
2186               EXTEND_RANGE_TABLE (work_area, 2);
2187               work_area->table[work_area->used++] = run_start;
2188               work_area->table[work_area->used++] = run_end;
2189             }
2190           else if (run_type == two_case)
2191             {
2192               EXTEND_RANGE_TABLE (work_area, 4);
2193               work_area->table[work_area->used++] = run_start;
2194               work_area->table[work_area->used++] = run_end;
2195               work_area->table[work_area->used++]
2196                 = RE_TRANSLATE (eqv_table, run_start);
2197               work_area->table[work_area->used++]
2198                 = RE_TRANSLATE (eqv_table, run_end);
2199             }
2200           run_type = strange;
2201         }
2202
2203       if (this_type == strange)
2204         {
2205           /* For a strange character, add each of its equivalents, one
2206              by one.  Don't start a range.  */
2207           do
2208             {
2209               EXTEND_RANGE_TABLE (work_area, 2);
2210               work_area->table[work_area->used++] = eqv;
2211               work_area->table[work_area->used++] = eqv;
2212               eqv = RE_TRANSLATE (eqv_table, eqv);
2213             }
2214           while (eqv != start);
2215         }
2216
2217       /* Add this char to the run, or start a new run.  */
2218       else if (run_type == strange)
2219         {
2220           /* Initialize a new range.  */
2221           run_type = this_type;
2222           run_start = start;
2223           run_end = start;
2224           run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2225         }
2226       else
2227         {
2228           /* Extend a running range.  */
2229           run_end = minchar;
2230           run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2231         }
2232     }
2233
2234   /* If a run is still in progress at the end, finish it now
2235      by recording its equivalent ranges.  */
2236   if (run_type == one_case)
2237     {
2238       EXTEND_RANGE_TABLE (work_area, 2);
2239       work_area->table[work_area->used++] = run_start;
2240       work_area->table[work_area->used++] = run_end;
2241     }
2242   else if (run_type == two_case)
2243     {
2244       EXTEND_RANGE_TABLE (work_area, 4);
2245       work_area->table[work_area->used++] = run_start;
2246       work_area->table[work_area->used++] = run_end;
2247       work_area->table[work_area->used++]
2248         = RE_TRANSLATE (eqv_table, run_start);
2249       work_area->table[work_area->used++]
2250         = RE_TRANSLATE (eqv_table, run_end);
2251     }
2252
2253   return -1;
2254 }
2255
2256 #endif /* emacs */
2257
2258 /* Record the image of the range start..end when passed through
2259    TRANSLATE.  This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2260    and is not even necessarily contiguous.
2261    Normally we approximate it with the smallest contiguous range that contains
2262    all the chars we need.  However, for Latin-1 we go to extra effort
2263    to do a better job.
2264
2265    This function is not called for ASCII ranges.
2266
2267    Returns -1 if successful, REG_ESPACE if ran out of space.  */
2268
2269 static int
2270 set_image_of_range (struct range_table_work_area *work_area,
2271                     re_wchar_t start, re_wchar_t end,
2272                     RE_TRANSLATE_TYPE translate)
2273 {
2274   re_wchar_t cmin, cmax;
2275
2276 #ifdef emacs
2277   /* For Latin-1 ranges, use set_image_of_range_1
2278      to get proper handling of ranges that include letters and nonletters.
2279      For a range that includes the whole of Latin-1, this is not necessary.
2280      For other character sets, we don't bother to get this right.  */
2281   if (RE_TRANSLATE_P (translate) && start < 04400
2282       && !(start < 04200 && end >= 04377))
2283     {
2284       int newend;
2285       int tem;
2286       newend = end;
2287       if (newend > 04377)
2288         newend = 04377;
2289       tem = set_image_of_range_1 (work_area, start, newend, translate);
2290       if (tem > 0)
2291         return tem;
2292
2293       start = 04400;
2294       if (end < 04400)
2295         return -1;
2296     }
2297 #endif
2298
2299   EXTEND_RANGE_TABLE (work_area, 2);
2300   work_area->table[work_area->used++] = (start);
2301   work_area->table[work_area->used++] = (end);
2302
2303   cmin = -1, cmax = -1;
2304
2305   if (RE_TRANSLATE_P (translate))
2306     {
2307       int ch;
2308
2309       for (ch = start; ch <= end; ch++)
2310         {
2311           re_wchar_t c = TRANSLATE (ch);
2312           if (! (start <= c && c <= end))
2313             {
2314               if (cmin == -1)
2315                 cmin = c, cmax = c;
2316               else
2317                 {
2318                   cmin = MIN (cmin, c);
2319                   cmax = MAX (cmax, c);
2320                 }
2321             }
2322         }
2323
2324       if (cmin != -1)
2325         {
2326           EXTEND_RANGE_TABLE (work_area, 2);
2327           work_area->table[work_area->used++] = (cmin);
2328           work_area->table[work_area->used++] = (cmax);
2329         }
2330     }
2331
2332   return -1;
2333 }
2334 #endif  /* 0 */
2335 \f
2336 #ifndef MATCH_MAY_ALLOCATE
2337
2338 /* If we cannot allocate large objects within re_match_2_internal,
2339    we make the fail stack and register vectors global.
2340    The fail stack, we grow to the maximum size when a regexp
2341    is compiled.
2342    The register vectors, we adjust in size each time we
2343    compile a regexp, according to the number of registers it needs.  */
2344
2345 static fail_stack_type fail_stack;
2346
2347 /* Size with which the following vectors are currently allocated.
2348    That is so we can make them bigger as needed,
2349    but never make them smaller.  */
2350 static int regs_allocated_size;
2351
2352 static re_char **     regstart, **     regend;
2353 static re_char **best_regstart, **best_regend;
2354
2355 /* Make the register vectors big enough for NUM_REGS registers,
2356    but don't make them smaller.  */
2357
2358 static
2359 regex_grow_registers (int num_regs)
2360 {
2361   if (num_regs > regs_allocated_size)
2362     {
2363       RETALLOC_IF (regstart,     num_regs, re_char *);
2364       RETALLOC_IF (regend,       num_regs, re_char *);
2365       RETALLOC_IF (best_regstart, num_regs, re_char *);
2366       RETALLOC_IF (best_regend,  num_regs, re_char *);
2367
2368       regs_allocated_size = num_regs;
2369     }
2370 }
2371
2372 #endif /* not MATCH_MAY_ALLOCATE */
2373 \f
2374 static boolean group_in_compile_stack (compile_stack_type compile_stack,
2375                                        regnum_t regnum);
2376
2377 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2378    Returns one of error codes defined in `regex.h', or zero for success.
2379
2380    Assumes the `allocated' (and perhaps `buffer') and `translate'
2381    fields are set in BUFP on entry.
2382
2383    If it succeeds, results are put in BUFP (if it returns an error, the
2384    contents of BUFP are undefined):
2385      `buffer' is the compiled pattern;
2386      `syntax' is set to SYNTAX;
2387      `used' is set to the length of the compiled pattern;
2388      `fastmap_accurate' is zero;
2389      `re_nsub' is the number of subexpressions in PATTERN;
2390      `not_bol' and `not_eol' are zero;
2391
2392    The `fastmap' field is neither examined nor set.  */
2393
/* If a previous alternative left a jump pending (fixup_alt_jump is
   non-null), store that `jump' now, going from the end of that
   alternative to the current position B.  The room for the jump was
   reserved beforehand, so nothing is allocated here.  */
#define FIXUP_ALT_JUMP() \
  do \
    { \
      if (fixup_alt_jump != 0) \
        STORE_JUMP (jump, fixup_alt_jump, b); \
    } \
  while (0)
2401
2402
/* Leave the enclosing function with VALUE, first freeing the storage
   it allocated: the range-table work area (through
   FREE_RANGE_TABLE_WORK_AREA) and the compile stack.  */
#define FREE_STACK_RETURN(value) \
  do \
    { \
      FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
      free (compile_stack.stack); \
      return (value); \
    } \
  while (0)
2410
2411 static reg_errcode_t
2412 regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
2413                struct re_pattern_buffer *bufp)
2414 {
2415   /* We fetch characters from PATTERN here.  */
2416   register re_wchar_t c, c1;
2417
2418   /* Points to the end of the buffer, where we should append.  */
2419   register unsigned char *b;
2420
2421   /* Keeps track of unclosed groups.  */
2422   compile_stack_type compile_stack;
2423
2424   /* Points to the current (ending) position in the pattern.  */
2425 #ifdef AIX
2426   /* `const' makes AIX compiler fail.  */
2427   unsigned char *p = pattern;
2428 #else
2429   re_char *p = pattern;
2430 #endif
2431   re_char *pend = pattern + size;
2432
2433   /* How to translate the characters in the pattern.  */
2434   RE_TRANSLATE_TYPE translate = bufp->translate;
2435
2436   /* Address of the count-byte of the most recently inserted `exactn'
2437      command.  This makes it possible to tell if a new exact-match
2438      character can be added to that command or if the character requires
2439      a new `exactn' command.  */
2440   unsigned char *pending_exact = 0;
2441
2442   /* Address of start of the most recently finished expression.
2443      This tells, e.g., postfix * where to find the start of its
2444      operand.  Reset at the beginning of groups and alternatives.  */
2445   unsigned char *laststart = 0;
2446
2447   /* Address of beginning of regexp, or inside of last group.  */
2448   unsigned char *begalt;
2449
2450   /* Place in the uncompiled pattern (i.e., the {) to
2451      which to go back if the interval is invalid.  */
2452   re_char *beg_interval;
2453
2454   /* Address of the place where a forward jump should go to the end of
2455      the containing expression.  Each alternative of an `or' -- except the
2456      last -- ends with a forward jump of this sort.  */
2457   unsigned char *fixup_alt_jump = 0;
2458
2459   /* Work area for range table of charset.  */
2460   struct range_table_work_area range_table_work;
2461
2462   /* If the object matched can contain multibyte characters.  */
2463   const boolean multibyte = RE_MULTIBYTE_P (bufp);
2464
2465   /* Nonzero if we have pushed down into a subpattern.  */
2466   int in_subpattern = 0;
2467
2468   /* These hold the values of p, pattern, and pend from the main
2469      pattern when we have pushed into a subpattern.  */
2470   re_char *main_p IF_LINT (= NULL);
2471   re_char *main_pattern IF_LINT (= NULL);
2472   re_char *main_pend IF_LINT (= NULL);
2473
2474 #ifdef DEBUG
2475   debug++;
2476   DEBUG_PRINT ("\nCompiling pattern: ");
2477   if (debug > 0)
2478     {
2479       unsigned debug_count;
2480
2481       for (debug_count = 0; debug_count < size; debug_count++)
2482         putchar (pattern[debug_count]);
2483       putchar ('\n');
2484     }
2485 #endif /* DEBUG */
2486
2487   /* Initialize the compile stack.  */
2488   compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2489   if (compile_stack.stack == NULL)
2490     return REG_ESPACE;
2491
2492   compile_stack.size = INIT_COMPILE_STACK_SIZE;
2493   compile_stack.avail = 0;
2494
2495   range_table_work.table = 0;
2496   range_table_work.allocated = 0;
2497
2498   /* Initialize the pattern buffer.  */
2499   bufp->syntax = syntax;
2500   bufp->fastmap_accurate = 0;
2501   bufp->not_bol = bufp->not_eol = 0;
2502   bufp->used_syntax = 0;
2503
2504   /* Set `used' to zero, so that if we return an error, the pattern
2505      printer (for debugging) will think there's no pattern.  We reset it
2506      at the end.  */
2507   bufp->used = 0;
2508
2509   /* Always count groups, whether or not bufp->no_sub is set.  */
2510   bufp->re_nsub = 0;
2511
2512 #if !defined emacs && !defined SYNTAX_TABLE
2513   /* Initialize the syntax table.  */
2514    init_syntax_once ();
2515 #endif
2516
2517   if (bufp->allocated == 0)
2518     {
2519       if (bufp->buffer)
2520         { /* If zero allocated, but buffer is non-null, try to realloc
2521              enough space.  This loses if buffer's address is bogus, but
2522              that is the user's responsibility.  */
2523           RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2524         }
2525       else
2526         { /* Caller did not allocate a buffer.  Do it for them.  */
2527           bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2528         }
2529       if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2530
2531       bufp->allocated = INIT_BUF_SIZE;
2532     }
2533
2534   begalt = b = bufp->buffer;
2535
2536   /* Loop through the uncompiled pattern until we're at the end.  */
2537   while (1)
2538     {
2539       if (p == pend)
2540         {
2541           /* If this is the end of an included regexp,
2542              pop back to the main regexp and try again.  */
2543           if (in_subpattern)
2544             {
2545               in_subpattern = 0;
2546               pattern = main_pattern;
2547               p = main_p;
2548               pend = main_pend;
2549               continue;
2550             }
2551           /* If this is the end of the main regexp, we are done.  */
2552           break;
2553         }
2554
2555       PATFETCH (c);
2556
2557       switch (c)
2558         {
2559         case ' ':
2560           {
2561             re_char *p1 = p;
2562
2563             /* If there's no special whitespace regexp, treat
2564                spaces normally.  And don't try to do this recursively.  */
2565             if (!whitespace_regexp || in_subpattern)
2566               goto normal_char;
2567
2568             /* Peek past following spaces.  */
2569             while (p1 != pend)
2570               {
2571                 if (*p1 != ' ')
2572                   break;
2573                 p1++;
2574               }
2575             /* If the spaces are followed by a repetition op,
2576                treat them normally.  */
2577             if (p1 != pend
2578                 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
2579                     || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2580               goto normal_char;
2581
2582             /* Replace the spaces with the whitespace regexp.  */
2583             in_subpattern = 1;
2584             main_p = p1;
2585             main_pend = pend;
2586             main_pattern = pattern;
2587             p = pattern = whitespace_regexp;
2588             pend = p + strlen ((const char *) p);
2589             break;
2590           }
2591
2592         case '^':
2593           {
2594             if (   /* If at start of pattern, it's an operator.  */
2595                    p == pattern + 1
2596                    /* If context independent, it's an operator.  */
2597                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2598                    /* Otherwise, depends on what's come before.  */
2599                 || at_begline_loc_p (pattern, p, syntax))
2600               BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
2601             else
2602               goto normal_char;
2603           }
2604           break;
2605
2606
2607         case '$':
2608           {
2609             if (   /* If at end of pattern, it's an operator.  */
2610                    p == pend
2611                    /* If context independent, it's an operator.  */
2612                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2613                    /* Otherwise, depends on what's next.  */
2614                 || at_endline_loc_p (p, pend, syntax))
2615                BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
2616              else
2617                goto normal_char;
2618            }
2619            break;
2620
2621
2622         case '+':
2623         case '?':
2624           if ((syntax & RE_BK_PLUS_QM)
2625               || (syntax & RE_LIMITED_OPS))
2626             goto normal_char;
2627         handle_plus:
2628         case '*':
2629           /* If there is no previous pattern...  */
2630           if (!laststart)
2631             {
2632               if (syntax & RE_CONTEXT_INVALID_OPS)
2633                 FREE_STACK_RETURN (REG_BADRPT);
2634               else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2635                 goto normal_char;
2636             }
2637
2638           {
2639             /* 1 means zero (many) matches is allowed.  */
2640             boolean zero_times_ok = 0, many_times_ok = 0;
2641             boolean greedy = 1;
2642
2643             /* If there is a sequence of repetition chars, collapse it
2644                down to just one (the right one).  We can't combine
2645                interval operators with these because of, e.g., `a{2}*',
2646                which should only match an even number of `a's.  */
2647
2648             for (;;)
2649               {
2650                 if ((syntax & RE_FRUGAL)
2651                     && c == '?' && (zero_times_ok || many_times_ok))
2652                   greedy = 0;
2653                 else
2654                   {
2655                     zero_times_ok |= c != '+';
2656                     many_times_ok |= c != '?';
2657                   }
2658
2659                 if (p == pend)
2660                   break;
2661                 else if (*p == '*'
2662                          || (!(syntax & RE_BK_PLUS_QM)
2663                              && (*p == '+' || *p == '?')))
2664                   ;
2665                 else if (syntax & RE_BK_PLUS_QM  && *p == '\\')
2666                   {
2667                     if (p+1 == pend)
2668                       FREE_STACK_RETURN (REG_EESCAPE);
2669                     if (p[1] == '+' || p[1] == '?')
2670                       PATFETCH (c); /* Gobble up the backslash.  */
2671                     else
2672                       break;
2673                   }
2674                 else
2675                   break;
2676                 /* If we get here, we found another repeat character.  */
2677                 PATFETCH (c);
2678                }
2679
2680             /* Star, etc. applied to an empty pattern is equivalent
2681                to an empty pattern.  */
2682             if (!laststart || laststart == b)
2683               break;
2684
2685             /* Now we know whether or not zero matches is allowed
2686                and also whether or not two or more matches is allowed.  */
2687             if (greedy)
2688               {
2689                 if (many_times_ok)
2690                   {
2691                     boolean simple = skip_one_char (laststart) == b;
2692                     size_t startoffset = 0;
2693                     re_opcode_t ofj =
2694                       /* Check if the loop can match the empty string.  */
2695                       (simple || !analyse_first (laststart, b, NULL, 0))
2696                       ? on_failure_jump : on_failure_jump_loop;
2697                     assert (skip_one_char (laststart) <= b);
2698
2699                     if (!zero_times_ok && simple)
2700                       { /* Since simple * loops can be made faster by using
2701                            on_failure_keep_string_jump, we turn simple P+
2702                            into PP* if P is simple.  */
2703                         unsigned char *p1, *p2;
2704                         startoffset = b - laststart;
2705                         GET_BUFFER_SPACE (startoffset);
2706                         p1 = b; p2 = laststart;
2707                         while (p2 < p1)
2708                           *b++ = *p2++;
2709                         zero_times_ok = 1;
2710                       }
2711
2712                     GET_BUFFER_SPACE (6);
2713                     if (!zero_times_ok)
2714                       /* A + loop.  */
2715                       STORE_JUMP (ofj, b, b + 6);
2716                     else
2717                       /* Simple * loops can use on_failure_keep_string_jump
2718                          depending on what follows.  But since we don't know
2719                          that yet, we leave the decision up to
2720                          on_failure_jump_smart.  */
2721                       INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2722                                    laststart + startoffset, b + 6);
2723                     b += 3;
2724                     STORE_JUMP (jump, b, laststart + startoffset);
2725                     b += 3;
2726                   }
2727                 else
2728                   {
2729                     /* A simple ? pattern.  */
2730                     assert (zero_times_ok);
2731                     GET_BUFFER_SPACE (3);
2732                     INSERT_JUMP (on_failure_jump, laststart, b + 3);
2733                     b += 3;
2734                   }
2735               }
2736             else                /* not greedy */
2737               { /* I wish the greedy and non-greedy cases could be merged.  */
2738
2739                 GET_BUFFER_SPACE (7); /* We might use less.  */
2740                 if (many_times_ok)
2741                   {
2742                     boolean emptyp = analyse_first (laststart, b, NULL, 0);
2743
2744                     /* The non-greedy multiple match looks like
2745                        a repeat..until: we only need a conditional jump
2746                        at the end of the loop.  */
2747                     if (emptyp) BUF_PUSH (no_op);
2748                     STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2749                                 : on_failure_jump, b, laststart);
2750                     b += 3;
2751                     if (zero_times_ok)
2752                       {
2753                         /* The repeat...until naturally matches one or more.
2754                            To also match zero times, we need to first jump to
2755                            the end of the loop (its conditional jump).  */
2756                         INSERT_JUMP (jump, laststart, b);
2757                         b += 3;
2758                       }
2759                   }
2760                 else
2761                   {
2762                     /* non-greedy a?? */
2763                     INSERT_JUMP (jump, laststart, b + 3);
2764                     b += 3;
2765                     INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2766                     b += 3;
2767                   }
2768               }
2769           }
2770           pending_exact = 0;
2771           break;
2772
2773
2774         case '.':
2775           laststart = b;
2776           BUF_PUSH (anychar);
2777           break;
2778
2779
2780         case '[':
2781           {
2782             re_char *p1;
2783
2784             CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2785
2786             if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2787
2788             /* Ensure that we have enough space to push a charset: the
2789                opcode, the length count, and the bitset; 34 bytes in all.  */
2790             GET_BUFFER_SPACE (34);
2791
2792             laststart = b;
2793
2794             /* We test `*p == '^' twice, instead of using an if
2795                statement, so we only need one BUF_PUSH.  */
2796             BUF_PUSH (*p == '^' ? charset_not : charset);
2797             if (*p == '^')
2798               p++;
2799
2800             /* Remember the first position in the bracket expression.  */
2801             p1 = p;
2802
2803             /* Push the number of bytes in the bitmap.  */
2804             BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2805
2806             /* Clear the whole map.  */
2807             memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2808
2809             /* charset_not matches newline according to a syntax bit.  */
2810             if ((re_opcode_t) b[-2] == charset_not
2811                 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2812               SET_LIST_BIT ('\n');
2813
2814             /* Read in characters and ranges, setting map bits.  */
2815             for (;;)
2816               {
2817                 boolean escaped_char = false;
2818                 const unsigned char *p2 = p;
2819                 re_wchar_t ch;
2820
2821                 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2822
2823                 /* Don't translate yet.  The range TRANSLATE(X..Y) cannot
2824                    always be determined from TRANSLATE(X) and TRANSLATE(Y)
2825                    So the translation is done later in a loop.  Example:
2826                    (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
2827                 PATFETCH (c);
2828
2829                 /* \ might escape characters inside [...] and [^...].  */
2830                 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2831                   {
2832                     if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2833
2834                     PATFETCH (c);
2835                     escaped_char = true;
2836                   }
2837                 else
2838                   {
2839                     /* Could be the end of the bracket expression.  If it's
2840                        not (i.e., when the bracket expression is `[]' so
2841                        far), the ']' character bit gets set way below.  */
2842                     if (c == ']' && p2 != p1)
2843                       break;
2844                   }
2845
2846                 /* See if we're at the beginning of a possible character
2847                    class.  */
2848
2849                 if (!escaped_char &&
2850                     syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2851                   {
2852                     /* Leave room for the null.  */
2853                     unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
2854                     const unsigned char *class_beg;
2855
2856                     PATFETCH (c);
2857                     c1 = 0;
2858                     class_beg = p;
2859
2860                     /* If pattern is `[[:'.  */
2861                     if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2862
2863                     for (;;)
2864                       {
2865                         PATFETCH (c);
2866                         if ((c == ':' && *p == ']') || p == pend)
2867                           break;
2868                         if (c1 < CHAR_CLASS_MAX_LENGTH)
2869                           str[c1++] = c;
2870                         else
2871                           /* This is in any case an invalid class name.  */
2872                           str[0] = '\0';
2873                       }
2874                     str[c1] = '\0';
2875
2876                     /* If isn't a word bracketed by `[:' and `:]':
2877                        undo the ending character, the letters, and
2878                        leave the leading `:' and `[' (but set bits for
2879                        them).  */
2880                     if (c == ':' && *p == ']')
2881                       {
2882                         re_wctype_t cc = re_wctype (str);
2883
2884                         if (cc == 0)
2885                           FREE_STACK_RETURN (REG_ECTYPE);
2886
2887                         /* Throw away the ] at the end of the character
2888                            class.  */
2889                         PATFETCH (c);
2890
2891                         if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2892
2893 #ifndef emacs
2894                         for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
2895                           if (re_iswctype (btowc (ch), cc))
2896                             {
2897                               c = TRANSLATE (ch);
2898                               if (c < (1 << BYTEWIDTH))
2899                                 SET_LIST_BIT (c);
2900                             }
2901 #else  /* emacs */
2902                         /* Most character classes in a multibyte match
2903                            just set a flag.  Exceptions are is_blank,
2904                            is_digit, is_cntrl, and is_xdigit, since
2905                            they can only match ASCII characters.  We
2906                            don't need to handle them for multibyte.
2907                            They are distinguished by a negative wctype.  */
2908
2909                         /* Setup the gl_state object to its buffer-defined
2910                            value.  This hardcodes the buffer-global
2911                            syntax-table for ASCII chars, while the other chars
2912                            will obey syntax-table properties.  It's not ideal,
2913                            but it's the way it's been done until now.  */
2914                         SETUP_BUFFER_SYNTAX_TABLE ();
2915
2916                         for (ch = 0; ch < 256; ++ch)
2917                           {
2918                             c = RE_CHAR_TO_MULTIBYTE (ch);
2919                             if (! CHAR_BYTE8_P (c)
2920                                 && re_iswctype (c, cc))
2921                               {
2922                                 SET_LIST_BIT (ch);
2923                                 c1 = TRANSLATE (c);
2924                                 if (c1 == c)
2925                                   continue;
2926                                 if (ASCII_CHAR_P (c1))
2927                                   SET_LIST_BIT (c1);
2928                                 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
2929                                   SET_LIST_BIT (c1);
2930                               }
2931                           }
2932                         SET_RANGE_TABLE_WORK_AREA_BIT
2933                           (range_table_work, re_wctype_to_bit (cc));
2934 #endif  /* emacs */
2935                         /* In most cases the matching rule for char classes
2936                            only uses the syntax table for multibyte chars,
2937                            so that the content of the syntax-table it is not
2938                            hardcoded in the range_table.  SPACE and WORD are
2939                            the two exceptions.  */
2940                         if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
2941                           bufp->used_syntax = 1;
2942
2943                         /* Repeat the loop. */
2944                         continue;
2945                       }
2946                     else
2947                       {
2948                         /* Go back to right after the "[:".  */
2949                         p = class_beg;
2950                         SET_LIST_BIT ('[');
2951
2952                         /* Because the `:' may starts the range, we
2953                            can't simply set bit and repeat the loop.
2954                            Instead, just set it to C and handle below.  */
2955                         c = ':';
2956                       }
2957                   }
2958
2959                 if (p < pend && p[0] == '-' && p[1] != ']')
2960                   {
2961
2962                     /* Discard the `-'. */
2963                     PATFETCH (c1);
2964
2965                     /* Fetch the character which ends the range. */
2966                     PATFETCH (c1);
2967 #ifdef emacs
2968                     if (CHAR_BYTE8_P (c1)
2969                         && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
2970                       /* Treat the range from a multibyte character to
2971                          raw-byte character as empty.  */
2972                       c = c1 + 1;
2973 #endif  /* emacs */
2974                   }
2975                 else
2976                   /* Range from C to C. */
2977                   c1 = c;
2978
2979                 if (c > c1)
2980                   {
2981                     if (syntax & RE_NO_EMPTY_RANGES)
2982                       FREE_STACK_RETURN (REG_ERANGEX);
2983                     /* Else, repeat the loop.  */
2984                   }
2985                 else
2986                   {
2987 #ifndef emacs
2988                     /* Set the range into bitmap */
2989                     for (; c <= c1; c++)
2990                       {
2991                         ch = TRANSLATE (c);
2992                         if (ch < (1 << BYTEWIDTH))
2993                           SET_LIST_BIT (ch);
2994                       }
2995 #else  /* emacs */
2996                     if (c < 128)
2997                       {
2998                         ch = MIN (127, c1);
2999                         SETUP_ASCII_RANGE (range_table_work, c, ch);
3000                         c = ch + 1;
3001                         if (CHAR_BYTE8_P (c1))
3002                           c = BYTE8_TO_CHAR (128);
3003                       }
3004                     if (c <= c1)
3005                       {
3006                         if (CHAR_BYTE8_P (c))
3007                           {
3008                             c = CHAR_TO_BYTE8 (c);
3009                             c1 = CHAR_TO_BYTE8 (c1);
3010                             for (; c <= c1; c++)
3011                               SET_LIST_BIT (c);
3012                           }
3013                         else if (multibyte)
3014                           {
3015                             SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3016                           }
3017                         else
3018                           {
3019                             SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3020                           }
3021                       }
3022 #endif /* emacs */
3023                   }
3024               }
3025
3026             /* Discard any (non)matching list bytes that are all 0 at the
3027                end of the map.  Decrease the map-length byte too.  */
3028             while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3029               b[-1]--;
3030             b += b[-1];
3031
3032             /* Build real range table from work area.  */
3033             if (RANGE_TABLE_WORK_USED (range_table_work)
3034                 || RANGE_TABLE_WORK_BITS (range_table_work))
3035               {
3036                 int i;
3037                 int used = RANGE_TABLE_WORK_USED (range_table_work);
3038
3039                 /* Allocate space for COUNT + RANGE_TABLE.  Needs two
3040                    bytes for flags, two for COUNT, and three bytes for
3041                    each character.  */
3042                 GET_BUFFER_SPACE (4 + used * 3);
3043
3044                 /* Indicate the existence of range table.  */
3045                 laststart[1] |= 0x80;
3046
3047                 /* Store the character class flag bits into the range table.
3048                    If not in emacs, these flag bits are always 0.  */
3049                 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3050                 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3051
3052                 STORE_NUMBER_AND_INCR (b, used / 2);
3053                 for (i = 0; i < used; i++)
3054                   STORE_CHARACTER_AND_INCR
3055                     (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3056               }
3057           }
3058           break;
3059
3060
3061         case '(':
3062           if (syntax & RE_NO_BK_PARENS)
3063             goto handle_open;
3064           else
3065             goto normal_char;
3066
3067
3068         case ')':
3069           if (syntax & RE_NO_BK_PARENS)
3070             goto handle_close;
3071           else
3072             goto normal_char;
3073
3074
3075         case '\n':
3076           if (syntax & RE_NEWLINE_ALT)
3077             goto handle_alt;
3078           else
3079             goto normal_char;
3080
3081
3082         case '|':
3083           if (syntax & RE_NO_BK_VBAR)
3084             goto handle_alt;
3085           else
3086             goto normal_char;
3087
3088
3089         case '{':
3090            if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3091              goto handle_interval;
3092            else
3093              goto normal_char;
3094
3095
3096         case '\\':
3097           if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3098
3099           /* Do not translate the character after the \, so that we can
3100              distinguish, e.g., \B from \b, even if we normally would
3101              translate, e.g., B to b.  */
3102           PATFETCH (c);
3103
3104           switch (c)
3105             {
3106             case '(':
3107               if (syntax & RE_NO_BK_PARENS)
3108                 goto normal_backslash;
3109
3110             handle_open:
3111               {
3112                 int shy = 0;
3113                 regnum_t regnum = 0;
3114                 if (p+1 < pend)
3115                   {
3116                     /* Look for a special (?...) construct */
3117                     if ((syntax & RE_SHY_GROUPS) && *p == '?')
3118                       {
3119                         PATFETCH (c); /* Gobble up the '?'.  */
3120                         while (!shy)
3121                           {
3122                             PATFETCH (c);
3123                             switch (c)
3124                               {
3125                               case ':': shy = 1; break;
3126                               case '0':
3127                                 /* An explicitly specified regnum must start
3128                                    with non-0. */
3129                                 if (regnum == 0)
3130                                   FREE_STACK_RETURN (REG_BADPAT);
3131                               case '1': case '2': case '3': case '4':
3132                               case '5': case '6': case '7': case '8': case '9':
3133                                 regnum = 10*regnum + (c - '0'); break;
3134                               default:
3135                                 /* Only (?:...) is supported right now. */
3136                                 FREE_STACK_RETURN (REG_BADPAT);
3137                               }
3138                           }
3139                       }
3140                   }
3141
3142                 if (!shy)
3143                   regnum = ++bufp->re_nsub;
3144                 else if (regnum)
3145                   { /* It's actually not shy, but explicitly numbered.  */
3146                     shy = 0;
3147                     if (regnum > bufp->re_nsub)
3148                       bufp->re_nsub = regnum;
3149                     else if (regnum > bufp->re_nsub
3150                              /* Ideally, we'd want to check that the specified
3151                                 group can't have matched (i.e. all subgroups
3152                                 using the same regnum are in other branches of
3153                                 OR patterns), but we don't currently keep track
3154                                 of enough info to do that easily.  */
3155                              || group_in_compile_stack (compile_stack, regnum))
3156                       FREE_STACK_RETURN (REG_BADPAT);
3157                   }
3158                 else
3159                   /* It's really shy.  */
3160                   regnum = - bufp->re_nsub;
3161
3162                 if (COMPILE_STACK_FULL)
3163                   {
3164                     RETALLOC (compile_stack.stack, compile_stack.size << 1,
3165                               compile_stack_elt_t);
3166                     if (compile_stack.stack == NULL) return REG_ESPACE;
3167
3168                     compile_stack.size <<= 1;
3169                   }
3170
3171                 /* These are the values to restore when we hit end of this
3172                    group.  They are all relative offsets, so that if the
3173                    whole pattern moves because of realloc, they will still
3174                    be valid.  */
3175                 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3176                 COMPILE_STACK_TOP.fixup_alt_jump
3177                   = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3178                 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3179                 COMPILE_STACK_TOP.regnum = regnum;
3180
3181                 /* Do not push a start_memory for groups beyond the last one
3182                    we can represent in the compiled pattern.  */
3183                 if (regnum <= MAX_REGNUM && regnum > 0)
3184                   BUF_PUSH_2 (start_memory, regnum);
3185
3186                 compile_stack.avail++;
3187
3188                 fixup_alt_jump = 0;
3189                 laststart = 0;
3190                 begalt = b;
3191                 /* If we've reached MAX_REGNUM groups, then this open
3192                    won't actually generate any code, so we'll have to
3193                    clear pending_exact explicitly.  */
3194                 pending_exact = 0;
3195                 break;
3196               }
3197
3198             case ')':
3199               if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3200
3201               if (COMPILE_STACK_EMPTY)
3202                 {
3203                   if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3204                     goto normal_backslash;
3205                   else
3206                     FREE_STACK_RETURN (REG_ERPAREN);
3207                 }
3208
3209             handle_close:
3210               FIXUP_ALT_JUMP ();
3211
3212               /* See similar code for backslashed left paren above.  */
3213               if (COMPILE_STACK_EMPTY)
3214                 {
3215                   if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3216                     goto normal_char;
3217                   else
3218                     FREE_STACK_RETURN (REG_ERPAREN);
3219                 }
3220
3221               /* Since we just checked for an empty stack above, this
3222                  ``can't happen''.  */
3223               assert (compile_stack.avail != 0);
3224               {
3225                 /* We don't just want to restore into `regnum', because
3226                    later groups should continue to be numbered higher,
3227                    as in `(ab)c(de)' -- the second group is #2.  */
3228                 regnum_t regnum;
3229
3230                 compile_stack.avail--;
3231                 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3232                 fixup_alt_jump
3233                   = COMPILE_STACK_TOP.fixup_alt_jump
3234                     ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3235                     : 0;
3236                 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3237                 regnum = COMPILE_STACK_TOP.regnum;
3238                 /* If we've reached MAX_REGNUM groups, then this open
3239                    won't actually generate any code, so we'll have to
3240                    clear pending_exact explicitly.  */
3241                 pending_exact = 0;
3242
3243                 /* We're at the end of the group, so now we know how many
3244                    groups were inside this one.  */
3245                 if (regnum <= MAX_REGNUM && regnum > 0)
3246                   BUF_PUSH_2 (stop_memory, regnum);
3247               }
3248               break;
3249
3250
3251             case '|':                                   /* `\|'.  */
3252               if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3253                 goto normal_backslash;
3254             handle_alt:
3255               if (syntax & RE_LIMITED_OPS)
3256                 goto normal_char;
3257
3258               /* Insert before the previous alternative a jump which
3259                  jumps to this alternative if the former fails.  */
3260               GET_BUFFER_SPACE (3);
3261               INSERT_JUMP (on_failure_jump, begalt, b + 6);
3262               pending_exact = 0;
3263               b += 3;
3264
3265               /* The alternative before this one has a jump after it
3266                  which gets executed if it gets matched.  Adjust that
3267                  jump so it will jump to this alternative's analogous
3268                  jump (put in below, which in turn will jump to the next
3269                  (if any) alternative's such jump, etc.).  The last such
3270                  jump jumps to the correct final destination.  A picture:
3271                           _____ _____
3272                           |   | |   |
3273                           |   v |   v
3274                         a | b   | c
3275
3276                  If we are at `b', then fixup_alt_jump right now points to a
3277                  three-byte space after `a'.  We'll put in the jump, set
3278                  fixup_alt_jump to right after `b', and leave behind three
3279                  bytes which we'll fill in when we get to after `c'.  */
3280
3281               FIXUP_ALT_JUMP ();
3282
3283               /* Mark and leave space for a jump after this alternative,
3284                  to be filled in later either by next alternative or
3285                  when know we're at the end of a series of alternatives.  */
3286               fixup_alt_jump = b;
3287               GET_BUFFER_SPACE (3);
3288               b += 3;
3289
3290               laststart = 0;
3291               begalt = b;
3292               break;
3293
3294
3295             case '{':
3296               /* If \{ is a literal.  */
3297               if (!(syntax & RE_INTERVALS)
3298                      /* If we're at `\{' and it's not the open-interval
3299                         operator.  */
3300                   || (syntax & RE_NO_BK_BRACES))
3301                 goto normal_backslash;
3302
3303             handle_interval:
3304               {
3305                 /* If got here, then the syntax allows intervals.  */
3306
3307                 /* At least (most) this many matches must be made.  */
3308                 int lower_bound = 0, upper_bound = -1;
3309
3310                 beg_interval = p;
3311
3312                 GET_INTERVAL_COUNT (lower_bound);
3313
3314                 if (c == ',')
3315                   GET_INTERVAL_COUNT (upper_bound);
3316                 else
3317                   /* Interval such as `{1}' => match exactly once. */
3318                   upper_bound = lower_bound;
3319
3320                 if (lower_bound < 0
3321                     || (0 <= upper_bound && upper_bound < lower_bound))
3322                   FREE_STACK_RETURN (REG_BADBR);
3323
3324                 if (!(syntax & RE_NO_BK_BRACES))
3325                   {
3326                     if (c != '\\')
3327                       FREE_STACK_RETURN (REG_BADBR);
3328                     if (p == pend)
3329                       FREE_STACK_RETURN (REG_EESCAPE);
3330                     PATFETCH (c);
3331                   }
3332
3333                 if (c != '}')
3334                   FREE_STACK_RETURN (REG_BADBR);
3335
3336                 /* We just parsed a valid interval.  */
3337
3338                 /* If it's invalid to have no preceding re.  */
3339                 if (!laststart)
3340                   {
3341                     if (syntax & RE_CONTEXT_INVALID_OPS)
3342                       FREE_STACK_RETURN (REG_BADRPT);
3343                     else if (syntax & RE_CONTEXT_INDEP_OPS)
3344                       laststart = b;
3345                     else
3346                       goto unfetch_interval;
3347                   }
3348
3349                 if (upper_bound == 0)
3350                   /* If the upper bound is zero, just drop the sub pattern
3351                      altogether.  */
3352                   b = laststart;
3353                 else if (lower_bound == 1 && upper_bound == 1)
3354                   /* Just match it once: nothing to do here.  */
3355                   ;
3356
3357                 /* Otherwise, we have a nontrivial interval.  When
3358                    we're all done, the pattern will look like:
3359                    set_number_at <jump count> <upper bound>
3360                    set_number_at <succeed_n count> <lower bound>
3361                    succeed_n <after jump addr> <succeed_n count>
3362                    <body of loop>
3363                    jump_n <succeed_n addr> <jump count>
3364                    (The upper bound and `jump_n' are omitted if
3365                    `upper_bound' is 1, though.)  */
3366                 else
3367                   { /* If the upper bound is > 1, we need to insert
3368                        more at the end of the loop.  */
3369                     unsigned int nbytes = (upper_bound < 0 ? 3
3370                                            : upper_bound > 1 ? 5 : 0);
3371                     unsigned int startoffset = 0;
3372
3373                     GET_BUFFER_SPACE (20); /* We might use less.  */
3374
3375                     if (lower_bound == 0)
3376                       {
3377                         /* A succeed_n that starts with 0 is really a
3378                            a simple on_failure_jump_loop.  */
3379                         INSERT_JUMP (on_failure_jump_loop, laststart,
3380                                      b + 3 + nbytes);
3381                         b += 3;
3382                       }
3383                     else
3384                       {
3385                         /* Initialize lower bound of the `succeed_n', even
3386                            though it will be set during matching by its
3387                            attendant `set_number_at' (inserted next),
3388                            because `re_compile_fastmap' needs to know.
3389                            Jump to the `jump_n' we might insert below.  */
3390                         INSERT_JUMP2 (succeed_n, laststart,
3391                                       b + 5 + nbytes,
3392                                       lower_bound);
3393                         b += 5;
3394
3395                         /* Code to initialize the lower bound.  Insert
3396                            before the `succeed_n'.  The `5' is the last two
3397                            bytes of this `set_number_at', plus 3 bytes of
3398                            the following `succeed_n'.  */
3399                         insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3400                         b += 5;
3401                         startoffset += 5;
3402                       }
3403
3404                     if (upper_bound < 0)
3405                       {
3406                         /* A negative upper bound stands for infinity,
3407                            in which case it degenerates to a plain jump.  */
3408                         STORE_JUMP (jump, b, laststart + startoffset);
3409                         b += 3;
3410                       }
3411                     else if (upper_bound > 1)
3412                       { /* More than one repetition is allowed, so
3413                            append a backward jump to the `succeed_n'
3414                            that starts this interval.
3415
3416                            When we've reached this during matching,
3417                            we'll have matched the interval once, so
3418                            jump back only `upper_bound - 1' times.  */
3419                         STORE_JUMP2 (jump_n, b, laststart + startoffset,
3420                                      upper_bound - 1);
3421                         b += 5;
3422
3423                         /* The location we want to set is the second
3424                            parameter of the `jump_n'; that is `b-2' as
3425                            an absolute address.  `laststart' will be
3426                            the `set_number_at' we're about to insert;
3427                            `laststart+3' the number to set, the source
3428                            for the relative address.  But we are
3429                            inserting into the middle of the pattern --
3430                            so everything is getting moved up by 5.
3431                            Conclusion: (b - 2) - (laststart + 3) + 5,
3432                            i.e., b - laststart.
3433
3434                            We insert this at the beginning of the loop
3435                            so that if we fail during matching, we'll
3436                            reinitialize the bounds.  */
3437                         insert_op2 (set_number_at, laststart, b - laststart,
3438                                     upper_bound - 1, b);
3439                         b += 5;
3440                       }
3441                   }
3442                 pending_exact = 0;
3443                 beg_interval = NULL;
3444               }
3445               break;
3446
3447             unfetch_interval:
3448               /* If an invalid interval, match the characters as literals.  */
3449                assert (beg_interval);
3450                p = beg_interval;
3451                beg_interval = NULL;
3452
3453                /* normal_char and normal_backslash need `c'.  */
3454                c = '{';
3455
3456                if (!(syntax & RE_NO_BK_BRACES))
3457                  {
3458                    assert (p > pattern && p[-1] == '\\');
3459                    goto normal_backslash;
3460                  }
3461                else
3462                  goto normal_char;
3463
3464 #ifdef emacs
3465             /* There is no way to specify the before_dot and after_dot
3466                operators.  rms says this is ok.  --karl  */
3467             case '=':
3468               laststart = b;
3469               BUF_PUSH (at_dot);
3470               break;
3471
3472             case 's':
3473               laststart = b;
3474               PATFETCH (c);
3475               BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3476               break;
3477
3478             case 'S':
3479               laststart = b;
3480               PATFETCH (c);
3481               BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3482               break;
3483
3484             case 'c':
3485               laststart = b;
3486               PATFETCH (c);
3487               BUF_PUSH_2 (categoryspec, c);
3488               break;
3489
3490             case 'C':
3491               laststart = b;
3492               PATFETCH (c);
3493               BUF_PUSH_2 (notcategoryspec, c);
3494               break;
3495 #endif /* emacs */
3496
3497
3498             case 'w':
3499               if (syntax & RE_NO_GNU_OPS)
3500                 goto normal_char;
3501               laststart = b;
3502               BUF_PUSH_2 (syntaxspec, Sword);
3503               break;
3504
3505
3506             case 'W':
3507               if (syntax & RE_NO_GNU_OPS)
3508                 goto normal_char;
3509               laststart = b;
3510               BUF_PUSH_2 (notsyntaxspec, Sword);
3511               break;
3512
3513
3514             case '<':
3515               if (syntax & RE_NO_GNU_OPS)
3516                 goto normal_char;
3517               laststart = b;
3518               BUF_PUSH (wordbeg);
3519               break;
3520
3521             case '>':
3522               if (syntax & RE_NO_GNU_OPS)
3523                 goto normal_char;
3524               laststart = b;
3525               BUF_PUSH (wordend);
3526               break;
3527
3528             case '_':
3529               if (syntax & RE_NO_GNU_OPS)
3530                 goto normal_char;
3531               laststart = b;
3532               PATFETCH (c);
3533               if (c == '<')
3534                 BUF_PUSH (symbeg);
3535               else if (c == '>')
3536                 BUF_PUSH (symend);
3537               else
3538                 FREE_STACK_RETURN (REG_BADPAT);
3539               break;
3540
3541             case 'b':
3542               if (syntax & RE_NO_GNU_OPS)
3543                 goto normal_char;
3544               BUF_PUSH (wordbound);
3545               break;
3546
3547             case 'B':
3548               if (syntax & RE_NO_GNU_OPS)
3549                 goto normal_char;
3550               BUF_PUSH (notwordbound);
3551               break;
3552
3553             case '`':
3554               if (syntax & RE_NO_GNU_OPS)
3555                 goto normal_char;
3556               BUF_PUSH (begbuf);
3557               break;
3558
3559             case '\'':
3560               if (syntax & RE_NO_GNU_OPS)
3561                 goto normal_char;
3562               BUF_PUSH (endbuf);
3563               break;
3564
3565             case '1': case '2': case '3': case '4': case '5':
3566             case '6': case '7': case '8': case '9':
3567               {
3568                 regnum_t reg;
3569
3570                 if (syntax & RE_NO_BK_REFS)
3571                   goto normal_backslash;
3572
3573                 reg = c - '0';
3574
3575                 if (reg > bufp->re_nsub || reg < 1
3576                     /* Can't back reference to a subexp before its end.  */
3577                     || group_in_compile_stack (compile_stack, reg))
3578                   FREE_STACK_RETURN (REG_ESUBREG);
3579
3580                 laststart = b;
3581                 BUF_PUSH_2 (duplicate, reg);
3582               }
3583               break;
3584
3585
3586             case '+':
3587             case '?':
3588               if (syntax & RE_BK_PLUS_QM)
3589                 goto handle_plus;
3590               else
3591                 goto normal_backslash;
3592
3593             default:
3594             normal_backslash:
3595               /* You might think it would be useful for \ to mean
3596                  not to translate; but if we don't translate it
3597                  it will never match anything.  */
3598               goto normal_char;
3599             }
3600           break;
3601
3602
3603         default:
3604         /* Expects the character in `c'.  */
3605         normal_char:
3606           /* If no exactn currently being built.  */
3607           if (!pending_exact
3608
3609               /* If last exactn not at current position.  */
3610               || pending_exact + *pending_exact + 1 != b
3611
3612               /* We have only one byte following the exactn for the count.  */
3613               || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
3614
3615               /* If followed by a repetition operator.  */
3616               || (p != pend && (*p == '*' || *p == '^'))
3617               || ((syntax & RE_BK_PLUS_QM)
3618                   ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3619                   : p != pend && (*p == '+' || *p == '?'))
3620               || ((syntax & RE_INTERVALS)
3621                   && ((syntax & RE_NO_BK_BRACES)
3622                       ? p != pend && *p == '{'
3623                       : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3624             {
3625               /* Start building a new exactn.  */
3626
3627               laststart = b;
3628
3629               BUF_PUSH_2 (exactn, 0);
3630               pending_exact = b - 1;
3631             }
3632
3633           GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3634           {
3635             int len;
3636
3637             if (multibyte)
3638               {
3639                 c = TRANSLATE (c);
3640                 len = CHAR_STRING (c, b);
3641                 b += len;
3642               }
3643             else
3644               {
3645                 c1 = RE_CHAR_TO_MULTIBYTE (c);
3646                 if (! CHAR_BYTE8_P (c1))
3647                   {
3648                     re_wchar_t c2 = TRANSLATE (c1);
3649
3650                     if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3651                       c = c1;
3652                   }
3653                 *b++ = c;
3654                 len = 1;
3655               }
3656             (*pending_exact) += len;
3657           }
3658
3659           break;
3660         } /* switch (c) */
3661     } /* while p != pend */
3662
3663
3664   /* Through the pattern now.  */
3665
3666   FIXUP_ALT_JUMP ();
3667
3668   if (!COMPILE_STACK_EMPTY)
3669     FREE_STACK_RETURN (REG_EPAREN);
3670
3671   /* If we don't want backtracking, force success
3672      the first time we reach the end of the compiled pattern.  */
3673   if (syntax & RE_NO_POSIX_BACKTRACKING)
3674     BUF_PUSH (succeed);
3675
3676   /* We have succeeded; set the length of the buffer.  */
3677   bufp->used = b - bufp->buffer;
3678
3679 #ifdef DEBUG
3680   if (debug > 0)
3681     {
3682       re_compile_fastmap (bufp);
3683       DEBUG_PRINT ("\nCompiled pattern: \n");
3684       print_compiled_pattern (bufp);
3685     }
3686   debug--;
3687 #endif /* DEBUG */
3688
3689 #ifndef MATCH_MAY_ALLOCATE
3690   /* Initialize the failure stack to the largest possible stack.  This
3691      isn't necessary unless we're trying to avoid calling alloca in
3692      the search and match routines.  */
3693   {
3694     int num_regs = bufp->re_nsub + 1;
3695
3696     if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
3697       {
3698         fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
3699         falk_stack.stack = realloc (fail_stack.stack,
3700                                     fail_stack.size * sizeof *falk_stack.stack);
3701       }
3702
3703     regex_grow_registers (num_regs);
3704   }
3705 #endif /* not MATCH_MAY_ALLOCATE */
3706
3707   FREE_STACK_RETURN (REG_NOERROR);
3708 } /* regex_compile */
3709 \f
3710 /* Subroutines for `regex_compile'.  */
3711
3712 /* Store OP at LOC followed by two-byte integer parameter ARG.  */
3713
3714 static void
3715 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3716 {
3717   *loc = (unsigned char) op;
3718   STORE_NUMBER (loc + 1, arg);
3719 }
3720
3721
3722 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2.  */
3723
3724 static void
3725 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3726 {
3727   *loc = (unsigned char) op;
3728   STORE_NUMBER (loc + 1, arg1);
3729   STORE_NUMBER (loc + 3, arg2);
3730 }
3731
3732
3733 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3734    for OP followed by two-byte integer parameter ARG.  */
3735
3736 static void
3737 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3738 {
3739   register unsigned char *pfrom = end;
3740   register unsigned char *pto = end + 3;
3741
3742   while (pfrom != loc)
3743     *--pto = *--pfrom;
3744
3745   store_op1 (op, loc, arg);
3746 }
3747
3748
3749 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2.  */
3750
3751 static void
3752 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
3753 {
3754   register unsigned char *pfrom = end;
3755   register unsigned char *pto = end + 5;
3756
3757   while (pfrom != loc)
3758     *--pto = *--pfrom;
3759
3760   store_op2 (op, loc, arg1, arg2);
3761 }
3762
3763
3764 /* P points to just after a ^ in PATTERN.  Return true if that ^ comes
3765    after an alternative or a begin-subexpression.  We assume there is at
3766    least one character before the ^.  */
3767
3768 static boolean
3769 at_begline_loc_p (const_re_char *pattern, const_re_char *p, reg_syntax_t syntax)
3770 {
3771   re_char *prev = p - 2;
3772   boolean odd_backslashes;
3773
3774   /* After a subexpression?  */
3775   if (*prev == '(')
3776     odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3777
3778   /* After an alternative?  */
3779   else if (*prev == '|')
3780     odd_backslashes = (syntax & RE_NO_BK_VBAR) == 0;
3781
3782   /* After a shy subexpression?  */
3783   else if (*prev == ':' && (syntax & RE_SHY_GROUPS))
3784     {
3785       /* Skip over optional regnum.  */
3786       while (prev - 1 >= pattern && prev[-1] >= '0' && prev[-1] <= '9')
3787         --prev;
3788
3789       if (!(prev - 2 >= pattern
3790             && prev[-1] == '?' && prev[-2] == '('))
3791         return false;
3792       prev -= 2;
3793       odd_backslashes = (syntax & RE_NO_BK_PARENS) == 0;
3794     }
3795   else
3796     return false;
3797
3798   /* Count the number of preceding backslashes.  */
3799   p = prev;
3800   while (prev - 1 >= pattern && prev[-1] == '\\')
3801     --prev;
3802   return (p - prev) & odd_backslashes;
3803 }
3804
3805
3806 /* The dual of at_begline_loc_p.  This one is for $.  We assume there is
3807    at least one character after the $, i.e., `P < PEND'.  */
3808
3809 static boolean
3810 at_endline_loc_p (const_re_char *p, const_re_char *pend, reg_syntax_t syntax)
3811 {
3812   re_char *next = p;
3813   boolean next_backslash = *next == '\\';
3814   re_char *next_next = p + 1 < pend ? p + 1 : 0;
3815
3816   return
3817        /* Before a subexpression?  */
3818        (syntax & RE_NO_BK_PARENS ? *next == ')'
3819         : next_backslash && next_next && *next_next == ')')
3820        /* Before an alternative?  */
3821     || (syntax & RE_NO_BK_VBAR ? *next == '|'
3822         : next_backslash && next_next && *next_next == '|');
3823 }
3824
3825
3826 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3827    false if it's not.  */
3828
3829 static boolean
3830 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3831 {
3832   ssize_t this_element;
3833
3834   for (this_element = compile_stack.avail - 1;
3835        this_element >= 0;
3836        this_element--)
3837     if (compile_stack.stack[this_element].regnum == regnum)
3838       return true;
3839
3840   return false;
3841 }
3842 \f
3843 /* analyse_first.
3844    If fastmap is non-NULL, go through the pattern and fill fastmap
3845    with all the possible leading chars.  If fastmap is NULL, don't
3846    bother filling it up (obviously) and only return whether the
3847    pattern could potentially match the empty string.
3848
3849    Return 1  if p..pend might match the empty string.
3850    Return 0  if p..pend matches at least one char.
3851    Return -1 if fastmap was not updated accurately.  */
3852
3853 static int
3854 analyse_first (const_re_char *p, const_re_char *pend, char *fastmap,
3855                const int multibyte)
3856 {
3857   int j, k;
3858   boolean not;
3859
3860   /* If all elements for base leading-codes in fastmap is set, this
3861      flag is set true.  */
3862   boolean match_any_multibyte_characters = false;
3863
3864   assert (p);
3865
3866   /* The loop below works as follows:
3867      - It has a working-list kept in the PATTERN_STACK and which basically
3868        starts by only containing a pointer to the first operation.
3869      - If the opcode we're looking at is a match against some set of
3870        chars, then we add those chars to the fastmap and go on to the
3871        next work element from the worklist (done via `break').
3872      - If the opcode is a control operator on the other hand, we either
3873        ignore it (if it's meaningless at this point, such as `start_memory')
3874        or execute it (if it's a jump).  If the jump has several destinations
3875        (i.e. `on_failure_jump'), then we push the other destination onto the
3876        worklist.
3877      We guarantee termination by ignoring backward jumps (more or less),
3878      so that `p' is monotonically increasing.  More to the point, we
3879      never set `p' (or push) anything `<= p1'.  */
3880
3881   while (p < pend)
3882     {
3883       /* `p1' is used as a marker of how far back a `on_failure_jump'
3884          can go without being ignored.  It is normally equal to `p'
3885          (which prevents any backward `on_failure_jump') except right
3886          after a plain `jump', to allow patterns such as:
3887             0: jump 10
3888             3..9: <body>
3889             10: on_failure_jump 3
3890          as used for the *? operator.  */
3891       re_char *p1 = p;
3892
3893       switch (*p++)
3894         {
3895         case succeed:
3896           return 1;
3897
3898         case duplicate:
3899           /* If the first character has to match a backreference, that means
3900              that the group was empty (since it already matched).  Since this
3901              is the only case that interests us here, we can assume that the
3902              backreference must match the empty string.  */
3903           p++;
3904           continue;
3905
3906
3907       /* Following are the cases which match a character.  These end
3908          with `break'.  */
3909
3910         case exactn:
3911           if (fastmap)
3912             {
3913               /* If multibyte is nonzero, the first byte of each
3914                  character is an ASCII or a leading code.  Otherwise,
3915                  each byte is a character.  Thus, this works in both
3916                  cases. */
3917               fastmap[p[1]] = 1;
3918               if (! multibyte)
3919                 {
3920                   /* For the case of matching this unibyte regex
3921                      against multibyte, we must set a leading code of
3922                      the corresponding multibyte character.  */
3923                   int c = RE_CHAR_TO_MULTIBYTE (p[1]);
3924
3925                   fastmap[CHAR_LEADING_CODE (c)] = 1;
3926                 }
3927             }
3928           break;
3929
3930
3931         case anychar:
3932           /* We could put all the chars except for \n (and maybe \0)
3933              but we don't bother since it is generally not worth it.  */
3934           if (!fastmap) break;
3935           return -1;
3936
3937
3938         case charset_not:
3939           if (!fastmap) break;
3940           {
3941             /* Chars beyond end of bitmap are possible matches.  */
3942             for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3943                  j < (1 << BYTEWIDTH); j++)
3944               fastmap[j] = 1;
3945           }
3946
3947           /* Fallthrough */
3948         case charset:
3949           if (!fastmap) break;
3950           not = (re_opcode_t) *(p - 1) == charset_not;
3951           for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3952                j >= 0; j--)
3953             if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
3954               fastmap[j] = 1;
3955
3956 #ifdef emacs
3957           if (/* Any leading code can possibly start a character
3958                  which doesn't match the specified set of characters.  */
3959               not
3960               ||
3961               /* If we can match a character class, we can match any
3962                  multibyte characters.  */
3963               (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3964                && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3965
3966             {
3967               if (match_any_multibyte_characters == false)
3968                 {
3969                   for (j = MIN_MULTIBYTE_LEADING_CODE;
3970                        j <= MAX_MULTIBYTE_LEADING_CODE; j++)
3971                     fastmap[j] = 1;
3972                   match_any_multibyte_characters = true;
3973                 }
3974             }
3975
3976           else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3977                    && match_any_multibyte_characters == false)
3978             {
3979               /* Set fastmap[I] to 1 where I is a leading code of each
3980                  multibyte character in the range table. */
3981               int c, count;
3982               unsigned char lc1, lc2;
3983
3984               /* Make P points the range table.  `+ 2' is to skip flag
3985                  bits for a character class.  */
3986               p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
3987
3988               /* Extract the number of ranges in range table into COUNT.  */
3989               EXTRACT_NUMBER_AND_INCR (count, p);
3990               for (; count > 0; count--, p += 3)
3991                 {
3992                   /* Extract the start and end of each range.  */
3993                   EXTRACT_CHARACTER (c, p);
3994                   lc1 = CHAR_LEADING_CODE (c);
3995                   p += 3;
3996                   EXTRACT_CHARACTER (c, p);
3997                   lc2 = CHAR_LEADING_CODE (c);
3998                   for (j = lc1; j <= lc2; j++)
3999                     fastmap[j] = 1;
4000                 }
4001             }
4002 #endif
4003           break;
4004
4005         case syntaxspec:
4006         case notsyntaxspec:
4007           if (!fastmap) break;
4008 #ifndef emacs
4009           not = (re_opcode_t)p[-1] == notsyntaxspec;
4010           k = *p++;
4011           for (j = 0; j < (1 << BYTEWIDTH); j++)
4012             if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
4013               fastmap[j] = 1;
4014           break;
4015 #else  /* emacs */
4016           /* This match depends on text properties.  These end with
4017              aborting optimizations.  */
4018           return -1;
4019
4020         case categoryspec:
4021         case notcategoryspec:
4022           if (!fastmap) break;
4023           not = (re_opcode_t)p[-1] == notcategoryspec;
4024           k = *p++;
4025           for (j = (1 << BYTEWIDTH); j >= 0; j--)
4026             if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
4027               fastmap[j] = 1;
4028
4029           /* Any leading code can possibly start a character which
4030              has or doesn't has the specified category.  */
4031           if (match_any_multibyte_characters == false)
4032             {
4033               for (j = MIN_MULTIBYTE_LEADING_CODE;
4034                    j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4035                 fastmap[j] = 1;
4036               match_any_multibyte_characters = true;
4037             }
4038           break;
4039
4040       /* All cases after this match the empty string.  These end with
4041          `continue'.  */
4042
4043         case before_dot:
4044         case at_dot:
4045         case after_dot:
4046 #endif /* !emacs */
4047         case no_op:
4048         case begline:
4049         case endline:
4050         case begbuf:
4051         case endbuf:
4052         case wordbound:
4053         case notwordbound:
4054         case wordbeg:
4055         case wordend:
4056         case symbeg:
4057         case symend:
4058           continue;
4059
4060
4061         case jump:
4062           EXTRACT_NUMBER_AND_INCR (j, p);
4063           if (j < 0)
4064             /* Backward jumps can only go back to code that we've already
4065                visited.  `re_compile' should make sure this is true.  */
4066             break;
4067           p += j;
4068           switch (*p)
4069             {
4070             case on_failure_jump:
4071             case on_failure_keep_string_jump:
4072             case on_failure_jump_loop:
4073             case on_failure_jump_nastyloop:
4074             case on_failure_jump_smart:
4075               p++;
4076               break;
4077             default:
4078               continue;
4079             };
4080           /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4081              to jump back to "just after here".  */
4082           /* Fallthrough */
4083
4084         case on_failure_jump:
4085         case on_failure_keep_string_jump:
4086         case on_failure_jump_nastyloop:
4087         case on_failure_jump_loop:
4088         case on_failure_jump_smart:
4089           EXTRACT_NUMBER_AND_INCR (j, p);
4090           if (p + j <= p1)
4091             ; /* Backward jump to be ignored.  */
4092           else
4093             { /* We have to look down both arms.
4094                  We first go down the "straight" path so as to minimize
4095                  stack usage when going through alternatives.  */
4096               int r = analyse_first (p, pend, fastmap, multibyte);
4097               if (r) return r;
4098               p += j;
4099             }
4100           continue;
4101
4102
4103         case jump_n:
4104           /* This code simply does not properly handle forward jump_n.  */
4105           DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4106           p += 4;
4107           /* jump_n can either jump or fall through.  The (backward) jump
4108              case has already been handled, so we only need to look at the
4109              fallthrough case.  */
4110           continue;
4111
4112         case succeed_n:
4113           /* If N == 0, it should be an on_failure_jump_loop instead.  */
4114           DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4115           p += 4;
4116           /* We only care about one iteration of the loop, so we don't
4117              need to consider the case where this behaves like an
4118              on_failure_jump.  */
4119           continue;
4120
4121
4122         case set_number_at:
4123           p += 4;
4124           continue;
4125
4126
4127         case start_memory:
4128         case stop_memory:
4129           p += 1;
4130           continue;
4131
4132
4133         default:
4134           abort (); /* We have listed all the cases.  */
4135         } /* switch *p++ */
4136
4137       /* Getting here means we have found the possible starting
4138          characters for one path of the pattern -- and that the empty
4139          string does not match.  We need not follow this path further.  */
4140       return 0;
4141     } /* while p */
4142
4143   /* We reached the end without matching anything.  */
4144   return 1;
4145
4146 } /* analyse_first */
4147 \f
4148 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4149    BUFP.  A fastmap records which of the (1 << BYTEWIDTH) possible
4150    characters can start a string that matches the pattern.  This fastmap
4151    is used by re_search to skip quickly over impossible starting points.
4152
4153    Character codes above (1 << BYTEWIDTH) are not represented in the
4154    fastmap, but the leading codes are represented.  Thus, the fastmap
4155    indicates which character sets could start a match.
4156
4157    The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4158    area as BUFP->fastmap.
4159
4160    We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4161    the pattern buffer.
4162
4163    Returns 0 if we succeed, -2 if an internal error.   */
4164
4165 int
4166 re_compile_fastmap (struct re_pattern_buffer *bufp)
4167 {
4168   char *fastmap = bufp->fastmap;
4169   int analysis;
4170
4171   assert (fastmap && bufp->buffer);
4172
4173   memset (fastmap, 0, 1 << BYTEWIDTH);  /* Assume nothing's valid.  */
4174   bufp->fastmap_accurate = 1;       /* It will be when we're done.  */
4175
4176   analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
4177                             fastmap, RE_MULTIBYTE_P (bufp));
4178   bufp->can_be_null = (analysis != 0);
4179   return 0;
4180 } /* re_compile_fastmap */
4181 \f
4182 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4183    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
4184    this memory for recording register information.  STARTS and ENDS
4185    must be allocated using the malloc library routine, and must each
4186    be at least NUM_REGS * sizeof (regoff_t) bytes long.
4187
4188    If NUM_REGS == 0, then subsequent matches should allocate their own
4189    register data.
4190
4191    Unless this function is called, the first search or match using
4192    PATTERN_BUFFER will allocate its own register data, without
4193    freeing the old data.  */
4194
4195 void
4196 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
4197 {
4198   if (num_regs)
4199     {
4200       bufp->regs_allocated = REGS_REALLOCATE;
4201       regs->num_regs = num_regs;
4202       regs->start = starts;
4203       regs->end = ends;
4204     }
4205   else
4206     {
4207       bufp->regs_allocated = REGS_UNALLOCATED;
4208       regs->num_regs = 0;
4209       regs->start = regs->end = 0;
4210     }
4211 }
4212 WEAK_ALIAS (__re_set_registers, re_set_registers)
4213 \f
4214 /* Searching routines.  */
4215
4216 /* Like re_search_2, below, but only one string is specified, and
4217    doesn't let you say where to stop matching. */
4218
4219 regoff_t
4220 re_search (struct re_pattern_buffer *bufp, const char *string, size_t size,
4221            ssize_t startpos, ssize_t range, struct re_registers *regs)
4222 {
4223   return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
4224                       regs, size);
4225 }
4226 WEAK_ALIAS (__re_search, re_search)
4227
/* Start address of whichever string (string1 or string2) contains
   offset P of the virtual concatenation.  */
#define HEAD_ADDR_VSTRING(P)            \
  ((P) < size1 ? string1 : string2)
4231
/* Address of the byte at offset POS of the virtual concatenation of
   string1 and string2.  */
#define POS_ADDR_VSTRING(POS)                                   \
  (((POS) < size1 ? string1 : string2 - size1) + (POS))
4235
4236 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4237    virtual concatenation of STRING1 and STRING2, starting first at index
4238    STARTPOS, then at STARTPOS + 1, and so on.
4239
4240    STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4241
4242    RANGE is how far to scan while trying to match.  RANGE = 0 means try
4243    only at STARTPOS; in general, the last start tried is STARTPOS +
4244    RANGE.
4245
4246    In REGS, return the indices of the virtual concatenation of STRING1
4247    and STRING2 that matched the entire BUFP->buffer and its contained
4248    subexpressions.
4249
4250    Do not consider matching one past the index STOP in the virtual
4251    concatenation of STRING1 and STRING2.
4252
4253    We return either the position in the strings at which the match was
4254    found, -1 if no match, or -2 if error (such as failure
4255    stack overflow).  */
4256
4257 regoff_t
4258 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, size_t size1,
4259              const char *str2, size_t size2, ssize_t startpos, ssize_t range,
4260              struct re_registers *regs, ssize_t stop)
4261 {
4262   regoff_t val;
4263   re_char *string1 = (re_char*) str1;
4264   re_char *string2 = (re_char*) str2;
4265   register char *fastmap = bufp->fastmap;
4266   register RE_TRANSLATE_TYPE translate = bufp->translate;
4267   size_t total_size = size1 + size2;
4268   ssize_t endpos = startpos + range;
4269   boolean anchored_start;
4270   /* Nonzero if we are searching multibyte string.  */
4271   const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4272
4273   /* Check for out-of-range STARTPOS.  */
4274   if (startpos < 0 || startpos > total_size)
4275     return -1;
4276
4277   /* Fix up RANGE if it might eventually take us outside
4278      the virtual concatenation of STRING1 and STRING2.
4279      Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE.  */
4280   if (endpos < 0)
4281     range = 0 - startpos;
4282   else if (endpos > total_size)
4283     range = total_size - startpos;
4284
4285   /* If the search isn't to be a backwards one, don't waste time in a
4286      search for a pattern anchored at beginning of buffer.  */
4287   if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4288     {
4289       if (startpos > 0)
4290         return -1;
4291       else
4292         range = 0;
4293     }
4294
4295 #ifdef emacs
4296   /* In a forward search for something that starts with \=.
4297      don't keep searching past point.  */
4298   if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4299     {
4300       range = PT_BYTE - BEGV_BYTE - startpos;
4301       if (range < 0)
4302         return -1;
4303     }
4304 #endif /* emacs */
4305
4306   /* Update the fastmap now if not correct already.  */
4307   if (fastmap && !bufp->fastmap_accurate)
4308     re_compile_fastmap (bufp);
4309
4310   /* See whether the pattern is anchored.  */
4311   anchored_start = (bufp->buffer[0] == begline);
4312
4313 #ifdef emacs
4314   gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4315   {
4316     ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
4317
4318     SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4319   }
4320 #endif
4321
4322   /* Loop through the string, looking for a place to start matching.  */
4323   for (;;)
4324     {
4325       /* If the pattern is anchored,
4326          skip quickly past places we cannot match.
4327          We don't bother to treat startpos == 0 specially
4328          because that case doesn't repeat.  */
4329       if (anchored_start && startpos > 0)
4330         {
4331           if (! ((startpos <= size1 ? string1[startpos - 1]
4332                   : string2[startpos - size1 - 1])
4333                  == '\n'))
4334             goto advance;
4335         }
4336
4337       /* If a fastmap is supplied, skip quickly over characters that
4338          cannot be the start of a match.  If the pattern can match the
4339          null string, however, we don't need to skip characters; we want
4340          the first null string.  */
4341       if (fastmap && startpos < total_size && !bufp->can_be_null)
4342         {
4343           register re_char *d;
4344           register re_wchar_t buf_ch;
4345
4346           d = POS_ADDR_VSTRING (startpos);
4347
4348           if (range > 0)        /* Searching forwards.  */
4349             {
4350               register int lim = 0;
4351               ssize_t irange = range;
4352
4353               if (startpos < size1 && startpos + range >= size1)
4354                 lim = range - (size1 - startpos);
4355
4356               /* Written out as an if-else to avoid testing `translate'
4357                  inside the loop.  */
4358               if (RE_TRANSLATE_P (translate))
4359                 {
4360                   if (multibyte)
4361                     while (range > lim)
4362                       {
4363                         int buf_charlen;
4364
4365                         buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4366                         buf_ch = RE_TRANSLATE (translate, buf_ch);
4367                         if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4368                           break;
4369
4370                         range -= buf_charlen;
4371                         d += buf_charlen;
4372                       }
4373                   else
4374                     while (range > lim)
4375                       {
4376                         register re_wchar_t ch, translated;
4377
4378                         buf_ch = *d;
4379                         ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4380                         translated = RE_TRANSLATE (translate, ch);
4381                         if (translated != ch
4382                             && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4383                           buf_ch = ch;
4384                         if (fastmap[buf_ch])
4385                           break;
4386                         d++;
4387                         range--;
4388                       }
4389                 }
4390               else
4391                 {
4392                   if (multibyte)
4393                     while (range > lim)
4394                       {
4395                         int buf_charlen;
4396
4397                         buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4398                         if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4399                           break;
4400                         range -= buf_charlen;
4401                         d += buf_charlen;
4402                       }
4403                   else
4404                     while (range > lim && !fastmap[*d])
4405                       {
4406                         d++;
4407                         range--;
4408                       }
4409                 }
4410               startpos += irange - range;
4411             }
4412           else                          /* Searching backwards.  */
4413             {
4414               if (multibyte)
4415                 {
4416                   buf_ch = STRING_CHAR (d);
4417                   buf_ch = TRANSLATE (buf_ch);
4418                   if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4419                     goto advance;
4420                 }
4421               else
4422                 {
4423                   register re_wchar_t ch, translated;
4424
4425                   buf_ch = *d;
4426                   ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4427                   translated = TRANSLATE (ch);
4428                   if (translated != ch
4429                       && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4430                     buf_ch = ch;
4431                   if (! fastmap[TRANSLATE (buf_ch)])
4432                     goto advance;
4433                 }
4434             }
4435         }
4436
4437       /* If can't match the null string, and that's all we have left, fail.  */
4438       if (range >= 0 && startpos == total_size && fastmap
4439           && !bufp->can_be_null)
4440         return -1;
4441
4442       val = re_match_2_internal (bufp, string1, size1, string2, size2,
4443                                  startpos, regs, stop);
4444
4445       if (val >= 0)
4446         return startpos;
4447
4448       if (val == -2)
4449         return -2;
4450
4451     advance:
4452       if (!range)
4453         break;
4454       else if (range > 0)
4455         {
4456           /* Update STARTPOS to the next character boundary.  */
4457           if (multibyte)
4458             {
4459               re_char *p = POS_ADDR_VSTRING (startpos);
4460               int len = BYTES_BY_CHAR_HEAD (*p);
4461
4462               range -= len;
4463               if (range < 0)
4464                 break;
4465               startpos += len;
4466             }
4467           else
4468             {
4469               range--;
4470               startpos++;
4471             }
4472         }
4473       else
4474         {
4475           range++;
4476           startpos--;
4477
4478           /* Update STARTPOS to the previous character boundary.  */
4479           if (multibyte)
4480             {
4481               re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4482               re_char *p0 = p;
4483               re_char *phead = HEAD_ADDR_VSTRING (startpos);
4484
4485               /* Find the head of multibyte form.  */
4486               PREV_CHAR_BOUNDARY (p, phead);
4487               range += p0 - 1 - p;
4488               if (range > 0)
4489                 break;
4490
4491               startpos -= p0 - 1 - p;
4492             }
4493         }
4494     }
4495   return -1;
4496 } /* re_search_2 */
4497 WEAK_ALIAS (__re_search_2, re_search_2)
4498 \f
4499 /* Declarations and macros for re_match_2.  */
4500
4501 static int bcmp_translate (re_char *s1, re_char *s2,
4502                            register ssize_t len,
4503                            RE_TRANSLATE_TYPE translate,
4504                            const int multibyte);
4505
/* Convert PTR, a pointer into one of the search strings `string1' and
   `string2', into an offset within the virtual concatenation of the
   two: pointers into string2 are biased by `size1'.  */
#define POINTER_TO_OFFSET(ptr)                  \
  (!FIRST_STRING_P (ptr)                        \
   ? (ptr) - string2 + (ptrdiff_t) size1        \
   : (ptr) - string1)
4512
/* Call before fetching a character with *d.  While `d' sits at the end
   of the current string (`dend'), either fail -- when that end is
   `end_match_2' -- or switch over to string2.
   Check re_match_2_internal for a discussion of why end_match_2 might
   not be within string2 (but be equal to end_match_1 instead).  */
#define PREFETCH()                                                      \
  while (d == dend)                                                     \
    {                                                                   \
      if (dend != end_match_2)                                          \
        {                                                               \
          /* End of string1 => advance to string2.  */                  \
          d = string2;                                                  \
          dend = end_match_2;                                           \
        }                                                               \
      else                                                              \
        /* End of string2 => fail.  */                                  \
        goto fail;                                                      \
    }
4527
/* Variant of PREFETCH for lookahead operations such as wordend, which
   may need to look at parts of the string outside of the LIMITs (i.e.
   past `stop').  Call it before fetching a char with *d once the other
   limits have been checked: if `d' is at the end of string1 (`end1'),
   it moves `d' to the start of string2 and never fails.  */
#define PREFETCH_NOLIMIT()                                              \
  if (d == end1)                                                        \
    {                                                                   \
      d = string2;                                                      \
      dend = end_match_2;                                               \
    }
4538
/* Predicates for the two extremes of the virtual concatenation of
   `string1' and `string2'.  When string1 is empty the concatenation
   begins at `string2'.  AT_STRINGS_BEG is also true for any D whenever
   `size2' is zero.  */
#define AT_STRINGS_BEG(d) \
  ((d) == (size1 != 0 ? string1 : string2) || size2 == 0)
#define AT_STRINGS_END(d) (end2 == (d))
4543
4544 /* Disabled due to a compiler bug -- see comment at case wordbound */
4545
4546 /* The comment at case wordbound is following one, but we don't use
4547    AT_WORD_BOUNDARY anymore to support multibyte form.
4548
4549    The DEC Alpha C compiler 3.x generates incorrect code for the
4550    test  WORDCHAR_P (d - 1) != WORDCHAR_P (d)  in the expansion of
4551    AT_WORD_BOUNDARY, so this code is disabled.  Expanding the
4552    macro and introducing temporary variables works around the bug.  */
4553
4554 #if 0
4555 /* Test if D points to a character which is word-constituent.  We have
4556    two special cases to check for: if past the end of string1, look at
4557    the first character in string2; and if before the beginning of
4558    string2, look at the last character in string1.  */
4559 #define WORDCHAR_P(d)                                                   \
4560   (SYNTAX ((d) == end1 ? *string2                                       \
4561            : (d) == string2 - 1 ? *(end1 - 1) : *(d))                   \
4562    == Sword)
4563
4564 /* Test if the character before D and the one at D differ with respect
4565    to being word-constituent.  */
4566 #define AT_WORD_BOUNDARY(d)                                             \
4567   (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)                             \
4568    || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4569 #endif
4570
/* Cleanup helpers for the matcher.  When MATCH_MAY_ALLOCATE is
   defined, FREE_VAR releases one non-null variable through REGEX_FREE
   and resets it to NULL, and FREE_VARIABLES releases the failure stack
   and the four register arrays.  Otherwise FREE_VARIABLES expands to a
   no-op expression.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)                                                  \
  do {                                                                  \
    if (var)                                                            \
      {                                                                 \
        REGEX_FREE (var);                                               \
        var = NULL;                                                     \
      }                                                                 \
  } while (0)
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
  } while (0)
#else
/* Nothing to release; the cast to void inhibits a gcc warning.  */
# define FREE_VARIABLES() ((void)0)
#endif /* not MATCH_MAY_ALLOCATE */
4592
4593 \f
4594 /* Optimization routines.  */
4595
4596 /* If the operation is a match against one or more chars,
4597    return a pointer to the next operation, else return NULL.  */
4598 static re_char *
4599 skip_one_char (const_re_char *p)
4600 {
4601   switch (*p++)
4602     {
4603     case anychar:
4604       break;
4605
4606     case exactn:
4607       p += *p + 1;
4608       break;
4609
4610     case charset_not:
4611     case charset:
4612       if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4613         {
4614           int mcnt;
4615           p = CHARSET_RANGE_TABLE (p - 1);
4616           EXTRACT_NUMBER_AND_INCR (mcnt, p);
4617           p = CHARSET_RANGE_TABLE_END (p, mcnt);
4618         }
4619       else
4620         p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4621       break;
4622
4623     case syntaxspec:
4624     case notsyntaxspec:
4625 #ifdef emacs
4626     case categoryspec:
4627     case notcategoryspec:
4628 #endif /* emacs */
4629       p++;
4630       break;
4631
4632     default:
4633       p = NULL;
4634     }
4635   return p;
4636 }
4637
4638
4639 /* Jump over non-matching operations.  */
4640 static re_char *
4641 skip_noops (const_re_char *p, const_re_char *pend)
4642 {
4643   int mcnt;
4644   while (p < pend)
4645     {
4646       switch (*p)
4647         {
4648         case start_memory:
4649         case stop_memory:
4650           p += 2; break;
4651         case no_op:
4652           p += 1; break;
4653         case jump:
4654           p += 1;
4655           EXTRACT_NUMBER_AND_INCR (mcnt, p);
4656           p += mcnt;
4657           break;
4658         default:
4659           return p;
4660         }
4661     }
4662   assert (p == pend);
4663   return p;
4664 }
4665
4666 /* Non-zero if "p1 matches something" implies "p2 fails".  */
4667 static int
4668 mutually_exclusive_p (struct re_pattern_buffer *bufp, const_re_char *p1,
4669                       const_re_char *p2)
4670 {
4671   re_opcode_t op2;
4672   const boolean multibyte = RE_MULTIBYTE_P (bufp);
4673   unsigned char *pend = bufp->buffer + bufp->used;
4674
4675   assert (p1 >= bufp->buffer && p1 < pend
4676           && p2 >= bufp->buffer && p2 <= pend);
4677
4678   /* Skip over open/close-group commands.
4679      If what follows this loop is a ...+ construct,
4680      look at what begins its body, since we will have to
4681      match at least one of that.  */
4682   p2 = skip_noops (p2, pend);
4683   /* The same skip can be done for p1, except that this function
4684      is only used in the case where p1 is a simple match operator.  */
4685   /* p1 = skip_noops (p1, pend); */
4686
4687   assert (p1 >= bufp->buffer && p1 < pend
4688           && p2 >= bufp->buffer && p2 <= pend);
4689
4690   op2 = p2 == pend ? succeed : *p2;
4691
4692   switch (op2)
4693     {
4694     case succeed:
4695     case endbuf:
4696       /* If we're at the end of the pattern, we can change.  */
4697       if (skip_one_char (p1))
4698         {
4699           DEBUG_PRINT ("  End of pattern: fast loop.\n");
4700           return 1;
4701         }
4702       break;
4703
4704     case endline:
4705     case exactn:
4706       {
4707         register re_wchar_t c
4708           = (re_opcode_t) *p2 == endline ? '\n'
4709           : RE_STRING_CHAR (p2 + 2, multibyte);
4710
4711         if ((re_opcode_t) *p1 == exactn)
4712           {
4713             if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4714               {
4715                 DEBUG_PRINT ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
4716                 return 1;
4717               }
4718           }
4719
4720         else if ((re_opcode_t) *p1 == charset
4721                  || (re_opcode_t) *p1 == charset_not)
4722           {
4723             int not = (re_opcode_t) *p1 == charset_not;
4724
4725             /* Test if C is listed in charset (or charset_not)
4726                at `p1'.  */
4727             if (! multibyte || IS_REAL_ASCII (c))
4728               {
4729                 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4730                     && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4731                   not = !not;
4732               }
4733             else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4734               CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4735
4736             /* `not' is equal to 1 if c would match, which means
4737                that we can't change to pop_failure_jump.  */
4738             if (!not)
4739               {
4740                 DEBUG_PRINT ("   No match => fast loop.\n");
4741                 return 1;
4742               }
4743           }
4744         else if ((re_opcode_t) *p1 == anychar
4745                  && c == '\n')
4746           {
4747             DEBUG_PRINT ("   . != \\n => fast loop.\n");
4748             return 1;
4749           }
4750       }
4751       break;
4752
4753     case charset:
4754       {
4755         if ((re_opcode_t) *p1 == exactn)
4756           /* Reuse the code above.  */
4757           return mutually_exclusive_p (bufp, p2, p1);
4758
4759       /* It is hard to list up all the character in charset
4760          P2 if it includes multibyte character.  Give up in
4761          such case.  */
4762       else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4763         {
4764           /* Now, we are sure that P2 has no range table.
4765              So, for the size of bitmap in P2, `p2[1]' is
4766              enough.  But P1 may have range table, so the
4767              size of bitmap table of P1 is extracted by
4768              using macro `CHARSET_BITMAP_SIZE'.
4769
4770              In a multibyte case, we know that all the character
4771              listed in P2 is ASCII.  In a unibyte case, P1 has only a
4772              bitmap table.  So, in both cases, it is enough to test
4773              only the bitmap table of P1.  */
4774
4775           if ((re_opcode_t) *p1 == charset)
4776             {
4777               int idx;
4778               /* We win if the charset inside the loop
4779                  has no overlap with the one after the loop.  */
4780               for (idx = 0;
4781                    (idx < (int) p2[1]
4782                     && idx < CHARSET_BITMAP_SIZE (p1));
4783                    idx++)
4784                 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4785                   break;
4786
4787               if (idx == p2[1]
4788                   || idx == CHARSET_BITMAP_SIZE (p1))
4789                 {
4790                   DEBUG_PRINT ("         No match => fast loop.\n");
4791                   return 1;
4792                 }
4793             }
4794           else if ((re_opcode_t) *p1 == charset_not)
4795             {
4796               int idx;
4797               /* We win if the charset_not inside the loop lists
4798                  every character listed in the charset after.  */
4799               for (idx = 0; idx < (int) p2[1]; idx++)
4800                 if (! (p2[2 + idx] == 0
4801                        || (idx < CHARSET_BITMAP_SIZE (p1)
4802                            && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4803                   break;
4804
4805               if (idx == p2[1])
4806                 {
4807                   DEBUG_PRINT ("         No match => fast loop.\n");
4808                   return 1;
4809                 }
4810               }
4811           }
4812       }
4813       break;
4814
4815     case charset_not:
4816       switch (*p1)
4817         {
4818         case exactn:
4819         case charset:
4820           /* Reuse the code above.  */
4821           return mutually_exclusive_p (bufp, p2, p1);
4822         case charset_not:
4823           /* When we have two charset_not, it's very unlikely that
4824              they don't overlap.  The union of the two sets of excluded
4825              chars should cover all possible chars, which, as a matter of
4826              fact, is virtually impossible in multibyte buffers.  */
4827           break;
4828         }
4829       break;
4830
4831     case wordend:
4832       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4833     case symend:
4834       return ((re_opcode_t) *p1 == syntaxspec
4835               && (p1[1] == Ssymbol || p1[1] == Sword));
4836     case notsyntaxspec:
4837       return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4838
4839     case wordbeg:
4840       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4841     case symbeg:
4842       return ((re_opcode_t) *p1 == notsyntaxspec
4843               && (p1[1] == Ssymbol || p1[1] == Sword));
4844     case syntaxspec:
4845       return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4846
4847     case wordbound:
4848       return (((re_opcode_t) *p1 == notsyntaxspec
4849                || (re_opcode_t) *p1 == syntaxspec)
4850               && p1[1] == Sword);
4851
4852 #ifdef emacs
4853     case categoryspec:
4854       return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4855     case notcategoryspec:
4856       return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4857 #endif /* emacs */
4858
4859     default:
4860       ;
4861     }
4862
4863   /* Safe default.  */
4864   return 0;
4865 }
4866
4867 \f
4868 /* Matching routines.  */
4869
4870 #ifndef emacs   /* Emacs never uses this.  */
4871 /* re_match is like re_match_2 except it takes only a single string.  */
4872
4873 regoff_t
4874 re_match (struct re_pattern_buffer *bufp, const char *string,
4875           size_t size, ssize_t pos, struct re_registers *regs)
4876 {
4877   regoff_t result = re_match_2_internal (bufp, NULL, 0, (re_char*) string,
4878                                          size, pos, regs, size);
4879   return result;
4880 }
4881 WEAK_ALIAS (__re_match, re_match)
4882 #endif /* not emacs */
4883
#ifdef emacs
/* In Emacs, this is the string or buffer in which we
   are matching.  It is used for looking up syntax properties:
   re_match_2 stores it in gl_state.object and passes it to
   SETUP_SYNTAX_TABLE_FOR_OBJECT before matching starts.  */
Lisp_Object re_match_object;
#endif
4889
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.

   We return -1 if no match, -2 if an internal error (such as the
   failure stack overflowing).  Otherwise, we return the length of the
   matched substring.  */
4902
4903 regoff_t
4904 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
4905             size_t size1, const char *string2, size_t size2, ssize_t pos,
4906             struct re_registers *regs, ssize_t stop)
4907 {
4908   regoff_t result;
4909
4910 #ifdef emacs
4911   ssize_t charpos;
4912   gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4913   charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
4914   SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4915 #endif
4916
4917   result = re_match_2_internal (bufp, (re_char*) string1, size1,
4918                                 (re_char*) string2, size2,
4919                                 pos, regs, stop);
4920   return result;
4921 }
4922 WEAK_ALIAS (__re_match_2, re_match_2)
4923
4924
4925 /* This is a separate function so that we can force an alloca cleanup
4926    afterwards.  */
4927 static regoff_t
4928 re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
4929                      size_t size1, const_re_char *string2, size_t size2,
4930                      ssize_t pos, struct re_registers *regs, ssize_t stop)
4931 {
4932   /* General temporaries.  */
4933   int mcnt;
4934   size_t reg;
4935
4936   /* Just past the end of the corresponding string.  */
4937   re_char *end1, *end2;
4938
4939   /* Pointers into string1 and string2, just past the last characters in
4940      each to consider matching.  */
4941   re_char *end_match_1, *end_match_2;
4942
4943   /* Where we are in the data, and the end of the current string.  */
4944   re_char *d, *dend;
4945
4946   /* Used sometimes to remember where we were before starting matching
4947      an operator so that we can go back in case of failure.  This "atomic"
4948      behavior of matching opcodes is indispensable to the correctness
4949      of the on_failure_keep_string_jump optimization.  */
4950   re_char *dfail;
4951
4952   /* Where we are in the pattern, and the end of the pattern.  */
4953   re_char *p = bufp->buffer;
4954   re_char *pend = p + bufp->used;
4955
4956   /* We use this to map every character in the string.  */
4957   RE_TRANSLATE_TYPE translate = bufp->translate;
4958
4959   /* Nonzero if BUFP is setup from a multibyte regex.  */
4960   const boolean multibyte = RE_MULTIBYTE_P (bufp);
4961
4962   /* Nonzero if STRING1/STRING2 are multibyte.  */
4963   const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4964
4965   /* Failure point stack.  Each place that can handle a failure further
4966      down the line pushes a failure point on this stack.  It consists of
4967      regstart, and regend for all registers corresponding to
4968      the subexpressions we're currently inside, plus the number of such
4969      registers, and, finally, two char *'s.  The first char * is where
4970      to resume scanning the pattern; the second one is where to resume
4971      scanning the strings.  */
4972 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global.  */
4973   fail_stack_type fail_stack;
4974 #endif
4975 #ifdef DEBUG_COMPILES_ARGUMENTS
4976   unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4977 #endif
4978
4979 #if defined REL_ALLOC && defined REGEX_MALLOC
4980   /* This holds the pointer to the failure stack, when
4981      it is allocated relocatably.  */
4982   fail_stack_elt_t *failure_stack_ptr;
4983 #endif
4984
4985   /* We fill all the registers internally, independent of what we
4986      return, for use in backreferences.  The number here includes
4987      an element for register zero.  */
4988   size_t num_regs = bufp->re_nsub + 1;
4989
4990   /* Information on the contents of registers. These are pointers into
4991      the input strings; they record just what was matched (on this
4992      attempt) by a subexpression part of the pattern, that is, the
4993      regnum-th regstart pointer points to where in the pattern we began
4994      matching and the regnum-th regend points to right after where we
4995      stopped matching the regnum-th subexpression.  (The zeroth register
4996      keeps track of what the whole pattern matches.)  */
4997 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
4998   re_char **regstart, **regend;
4999 #endif
5000
5001   /* The following record the register info as found in the above
5002      variables when we find a match better than any we've seen before.
5003      This happens as we backtrack through the failure points, which in
5004      turn happens only if we have not yet matched the entire string. */
5005   unsigned best_regs_set = false;
5006 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
5007   re_char **best_regstart, **best_regend;
5008 #endif
5009
5010   /* Logically, this is `best_regend[0]'.  But we don't want to have to
5011      allocate space for that if we're not allocating space for anything
5012      else (see below).  Also, we never need info about register 0 for
5013      any of the other register vectors, and it seems rather a kludge to
5014      treat `best_regend' differently than the rest.  So we keep track of
5015      the end of the best match so far in a separate variable.  We
5016      initialize this to NULL so that when we backtrack the first time
5017      and need to test it, it's not garbage.  */
5018   re_char *match_end = NULL;
5019
5020 #ifdef DEBUG_COMPILES_ARGUMENTS
5021   /* Counts the total number of registers pushed.  */
5022   unsigned num_regs_pushed = 0;
5023 #endif
5024
5025   DEBUG_PRINT ("\n\nEntering re_match_2.\n");
5026
5027   INIT_FAIL_STACK ();
5028
5029 #ifdef MATCH_MAY_ALLOCATE
5030   /* Do not bother to initialize all the register variables if there are
5031      no groups in the pattern, as it takes a fair amount of time.  If
5032      there are groups, we include space for register 0 (the whole
5033      pattern), even though we never use it, since it simplifies the
5034      array indexing.  We should fix this.  */
5035   if (bufp->re_nsub)
5036     {
5037       regstart = REGEX_TALLOC (num_regs, re_char *);
5038       regend = REGEX_TALLOC (num_regs, re_char *);
5039       best_regstart = REGEX_TALLOC (num_regs, re_char *);
5040       best_regend = REGEX_TALLOC (num_regs, re_char *);
5041
5042       if (!(regstart && regend && best_regstart && best_regend))
5043         {
5044           FREE_VARIABLES ();
5045           return -2;
5046         }
5047     }
5048   else
5049     {
5050       /* We must initialize all our variables to NULL, so that
5051          `FREE_VARIABLES' doesn't try to free them.  */
5052       regstart = regend = best_regstart = best_regend = NULL;
5053     }
5054 #endif /* MATCH_MAY_ALLOCATE */
5055
5056   /* The starting position is bogus.  */
5057   if (pos < 0 || pos > size1 + size2)
5058     {
5059       FREE_VARIABLES ();
5060       return -1;
5061     }
5062
5063   /* Initialize subexpression text positions to -1 to mark ones that no
5064      start_memory/stop_memory has been seen for. Also initialize the
5065      register information struct.  */
5066   for (reg = 1; reg < num_regs; reg++)
5067     regstart[reg] = regend[reg] = NULL;
5068
5069   /* We move `string1' into `string2' if the latter's empty -- but not if
5070      `string1' is null.  */
5071   if (size2 == 0 && string1 != NULL)
5072     {
5073       string2 = string1;
5074       size2 = size1;
5075       string1 = 0;
5076       size1 = 0;
5077     }
5078   end1 = string1 + size1;
5079   end2 = string2 + size2;
5080
5081   /* `p' scans through the pattern as `d' scans through the data.
5082      `dend' is the end of the input string that `d' points within.  `d'
5083      is advanced into the following input string whenever necessary, but
5084      this happens before fetching; therefore, at the beginning of the
5085      loop, `d' can be pointing at the end of a string, but it cannot
5086      equal `string2'.  */
5087   if (pos >= size1)
5088     {
5089       /* Only match within string2.  */
5090       d = string2 + pos - size1;
5091       dend = end_match_2 = string2 + stop - size1;
5092       end_match_1 = end1;       /* Just to give it a value.  */
5093     }
5094   else
5095     {
5096       if (stop < size1)
5097         {
5098           /* Only match within string1.  */
5099           end_match_1 = string1 + stop;
5100           /* BEWARE!
5101              When we reach end_match_1, PREFETCH normally switches to string2.
5102              But in the present case, this means that just doing a PREFETCH
5103              makes us jump from `stop' to `gap' within the string.
5104              What we really want here is for the search to stop as
5105              soon as we hit end_match_1.  That's why we set end_match_2
5106              to end_match_1 (since PREFETCH fails as soon as we hit
5107              end_match_2).  */
5108           end_match_2 = end_match_1;
5109         }
5110       else
5111         { /* It's important to use this code when stop == size so that
5112              moving `d' from end1 to string2 will not prevent the d == dend
5113              check from catching the end of string.  */
5114           end_match_1 = end1;
5115           end_match_2 = string2 + stop - size1;
5116         }
5117       d = string1 + pos;
5118       dend = end_match_1;
5119     }
5120
5121   DEBUG_PRINT ("The compiled pattern is: ");
5122   DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5123   DEBUG_PRINT ("The string to match is: `");
5124   DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5125   DEBUG_PRINT ("'\n");
5126
5127   /* This loops over pattern commands.  It exits by returning from the
5128      function if the match is complete, or it drops through if the match
5129      fails at this starting point in the input data.  */
5130   for (;;)
5131     {
5132       DEBUG_PRINT ("\n%p: ", p);
5133
5134       if (p == pend)
5135         {
5136           ptrdiff_t dcnt;
5137
5138           /* End of pattern means we might have succeeded.  */
5139           DEBUG_PRINT ("end of pattern ... ");
5140
5141           /* If we haven't matched the entire string, and we want the
5142              longest match, try backtracking.  */
5143           if (d != end_match_2)
5144             {
5145               /* 1 if this match ends in the same string (string1 or string2)
5146                  as the best previous match.  */
5147               boolean same_str_p = (FIRST_STRING_P (match_end)
5148                                     == FIRST_STRING_P (d));
5149               /* 1 if this match is the best seen so far.  */
5150               boolean best_match_p;
5151
5152               /* AIX compiler got confused when this was combined
5153                  with the previous declaration.  */
5154               if (same_str_p)
5155                 best_match_p = d > match_end;
5156               else
5157                 best_match_p = !FIRST_STRING_P (d);
5158
5159               DEBUG_PRINT ("backtracking.\n");
5160
5161               if (!FAIL_STACK_EMPTY ())
5162                 { /* More failure points to try.  */
5163
5164                   /* If exceeds best match so far, save it.  */
5165                   if (!best_regs_set || best_match_p)
5166                     {
5167                       best_regs_set = true;
5168                       match_end = d;
5169
5170                       DEBUG_PRINT ("\nSAVING match as best so far.\n");
5171
5172                       for (reg = 1; reg < num_regs; reg++)
5173                         {
5174                           best_regstart[reg] = regstart[reg];
5175                           best_regend[reg] = regend[reg];
5176                         }
5177                     }
5178                   goto fail;
5179                 }
5180
5181               /* If no failure points, don't restore garbage.  And if
5182                  last match is real best match, don't restore second
5183                  best one. */
5184               else if (best_regs_set && !best_match_p)
5185                 {
5186                 restore_best_regs:
5187                   /* Restore best match.  It may happen that `dend ==
5188                      end_match_1' while the restored d is in string2.
5189                      For example, the pattern `x.*y.*z' against the
5190                      strings `x-' and `y-z-', if the two strings are
5191                      not consecutive in memory.  */
5192                   DEBUG_PRINT ("Restoring best registers.\n");
5193
5194                   d = match_end;
5195                   dend = ((d >= string1 && d <= end1)
5196                            ? end_match_1 : end_match_2);
5197
5198                   for (reg = 1; reg < num_regs; reg++)
5199                     {
5200                       regstart[reg] = best_regstart[reg];
5201                       regend[reg] = best_regend[reg];
5202                     }
5203                 }
5204             } /* d != end_match_2 */
5205
5206         succeed_label:
5207           DEBUG_PRINT ("Accepting match.\n");
5208
5209           /* If caller wants register contents data back, do it.  */
5210           if (regs && !bufp->no_sub)
5211             {
5212               /* Have the register data arrays been allocated?  */
5213               if (bufp->regs_allocated == REGS_UNALLOCATED)
5214                 { /* No.  So allocate them with malloc.  We need one
5215                      extra element beyond `num_regs' for the `-1' marker
5216                      GNU code uses.  */
5217                   regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5218                   regs->start = TALLOC (regs->num_regs, regoff_t);
5219                   regs->end = TALLOC (regs->num_regs, regoff_t);
5220                   if (regs->start == NULL || regs->end == NULL)
5221                     {
5222                       FREE_VARIABLES ();
5223                       return -2;
5224                     }
5225                   bufp->regs_allocated = REGS_REALLOCATE;
5226                 }
5227               else if (bufp->regs_allocated == REGS_REALLOCATE)
5228                 { /* Yes.  If we need more elements than were already
5229                      allocated, reallocate them.  If we need fewer, just
5230                      leave it alone.  */
5231                   if (regs->num_regs < num_regs + 1)
5232                     {
5233                       regs->num_regs = num_regs + 1;
5234                       RETALLOC (regs->start, regs->num_regs, regoff_t);
5235                       RETALLOC (regs->end, regs->num_regs, regoff_t);
5236                       if (regs->start == NULL || regs->end == NULL)
5237                         {
5238                           FREE_VARIABLES ();
5239                           return -2;
5240                         }
5241                     }
5242                 }
5243               else
5244                 {
5245                   /* These braces fend off a "empty body in an else-statement"
5246                      warning under GCC when assert expands to nothing.  */
5247                   assert (bufp->regs_allocated == REGS_FIXED);
5248                 }
5249
5250               /* Convert the pointer data in `regstart' and `regend' to
5251                  indices.  Register zero has to be set differently,
5252                  since we haven't kept track of any info for it.  */
5253               if (regs->num_regs > 0)
5254                 {
5255                   regs->start[0] = pos;
5256                   regs->end[0] = POINTER_TO_OFFSET (d);
5257                 }
5258
5259               /* Go through the first `min (num_regs, regs->num_regs)'
5260                  registers, since that is all we initialized.  */
5261               for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
5262                 {
5263                   if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5264                     regs->start[reg] = regs->end[reg] = -1;
5265                   else
5266                     {
5267                       regs->start[reg] = POINTER_TO_OFFSET (regstart[reg]);
5268                       regs->end[reg] = POINTER_TO_OFFSET (regend[reg]);
5269                     }
5270                 }
5271
5272               /* If the regs structure we return has more elements than
5273                  were in the pattern, set the extra elements to -1.  If
5274                  we (re)allocated the registers, this is the case,
5275                  because we always allocate enough to have at least one
5276                  -1 at the end.  */
5277               for (reg = num_regs; reg < regs->num_regs; reg++)
5278                 regs->start[reg] = regs->end[reg] = -1;
5279             } /* regs && !bufp->no_sub */
5280
5281           DEBUG_PRINT ("%u failure points pushed, %u popped (%u remain).\n",
5282                        nfailure_points_pushed, nfailure_points_popped,
5283                        nfailure_points_pushed - nfailure_points_popped);
5284           DEBUG_PRINT ("%u registers pushed.\n", num_regs_pushed);
5285
5286           dcnt = POINTER_TO_OFFSET (d) - pos;
5287
5288           DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt);
5289
5290           FREE_VARIABLES ();
5291           return dcnt;
5292         }
5293
5294       /* Otherwise match next pattern command.  */
5295       switch (*p++)
5296         {
5297         /* Ignore these.  Used to ignore the n of succeed_n's which
5298            currently have n == 0.  */
5299         case no_op:
5300           DEBUG_PRINT ("EXECUTING no_op.\n");
5301           break;
5302
5303         case succeed:
5304           DEBUG_PRINT ("EXECUTING succeed.\n");
5305           goto succeed_label;
5306
5307         /* Match the next n pattern characters exactly.  The following
5308            byte in the pattern defines n, and the n bytes after that
5309            are the characters to match.  */
5310         case exactn:
5311           mcnt = *p++;
5312           DEBUG_PRINT ("EXECUTING exactn %d.\n", mcnt);
5313
5314           /* Remember the start point to rollback upon failure.  */
5315           dfail = d;
5316
5317 #ifndef emacs
5318           /* This is written out as an if-else so we don't waste time
5319              testing `translate' inside the loop.  */
5320           if (RE_TRANSLATE_P (translate))
5321             do
5322               {
5323                 PREFETCH ();
5324                 if (RE_TRANSLATE (translate, *d) != *p++)
5325                   {
5326                     d = dfail;
5327                     goto fail;
5328                   }
5329                 d++;
5330               }
5331             while (--mcnt);
5332           else
5333             do
5334               {
5335                 PREFETCH ();
5336                 if (*d++ != *p++)
5337                   {
5338                     d = dfail;
5339                     goto fail;
5340                   }
5341               }
5342             while (--mcnt);
5343 #else  /* emacs */
5344           /* The cost of testing `translate' is comparatively small.  */
5345           if (target_multibyte)
5346             do
5347               {
5348                 int pat_charlen, buf_charlen;
5349                 int pat_ch, buf_ch;
5350
5351                 PREFETCH ();
5352                 if (multibyte)
5353                   pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5354                 else
5355                   {
5356                     pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5357                     pat_charlen = 1;
5358                   }
5359                 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
5360
5361                 if (TRANSLATE (buf_ch) != pat_ch)
5362                   {
5363                     d = dfail;
5364                     goto fail;
5365                   }
5366
5367                 p += pat_charlen;
5368                 d += buf_charlen;
5369                 mcnt -= pat_charlen;
5370               }
5371             while (mcnt > 0);
5372           else
5373             do
5374               {
5375                 int pat_charlen;
5376                 int pat_ch, buf_ch;
5377
5378                 PREFETCH ();
5379                 if (multibyte)
5380                   {
5381                     pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5382                     pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
5383                   }
5384                 else
5385                   {
5386                     pat_ch = *p;
5387                     pat_charlen = 1;
5388                   }
5389                 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5390                 if (! CHAR_BYTE8_P (buf_ch))
5391                   {
5392                     buf_ch = TRANSLATE (buf_ch);
5393                     buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5394                     if (buf_ch < 0)
5395                       buf_ch = *d;
5396                   }
5397                 else
5398                   buf_ch = *d;
5399                 if (buf_ch != pat_ch)
5400                   {
5401                     d = dfail;
5402                     goto fail;
5403                   }
5404                 p += pat_charlen;
5405                 d++;
5406               }
5407             while (--mcnt);
5408 #endif
5409           break;
5410
5411
5412         /* Match any character except possibly a newline or a null.  */
5413         case anychar:
5414           {
5415             int buf_charlen;
5416             re_wchar_t buf_ch;
5417
5418             DEBUG_PRINT ("EXECUTING anychar.\n");
5419
5420             PREFETCH ();
5421             buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
5422                                                 target_multibyte);
5423             buf_ch = TRANSLATE (buf_ch);
5424
5425             if ((!(bufp->syntax & RE_DOT_NEWLINE)
5426                  && buf_ch == '\n')
5427                 || ((bufp->syntax & RE_DOT_NOT_NULL)
5428                     && buf_ch == '\000'))
5429               goto fail;
5430
5431             DEBUG_PRINT ("  Matched `%d'.\n", *d);
5432             d += buf_charlen;
5433           }
5434           break;
5435
5436
5437         case charset:
5438         case charset_not:
5439           {
5440             register unsigned int c;
5441             boolean not = (re_opcode_t) *(p - 1) == charset_not;
5442             int len;
5443
5444             /* Start of actual range_table, or end of bitmap if there is no
5445                range table.  */
5446             re_char *range_table IF_LINT (= NULL);
5447
5448             /* Nonzero if there is a range table.  */
5449             int range_table_exists;
5450
5451             /* Number of ranges of range table.  This is not included
5452                in the initial byte-length of the command.  */
5453             int count = 0;
5454
5455             /* Whether matching against a unibyte character.  */
5456             boolean unibyte_char = false;
5457
5458             DEBUG_PRINT ("EXECUTING charset%s.\n", not ? "_not" : "");
5459
5460             range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
5461
5462             if (range_table_exists)
5463               {
5464                 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap.  */
5465                 EXTRACT_NUMBER_AND_INCR (count, range_table);
5466               }
5467
5468             PREFETCH ();
5469             c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
5470             if (target_multibyte)
5471               {
5472                 int c1;
5473
5474                 c = TRANSLATE (c);
5475                 c1 = RE_CHAR_TO_UNIBYTE (c);
5476                 if (c1 >= 0)
5477                   {
5478                     unibyte_char = true;
5479                     c = c1;
5480                   }
5481               }
5482             else
5483               {
5484                 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5485
5486                 if (! CHAR_BYTE8_P (c1))
5487                   {
5488                     c1 = TRANSLATE (c1);
5489                     c1 = RE_CHAR_TO_UNIBYTE (c1);
5490                     if (c1 >= 0)
5491                       {
5492                         unibyte_char = true;
5493                         c = c1;
5494                       }
5495                   }
5496                 else
5497                   unibyte_char = true;
5498               }
5499
5500             if (unibyte_char && c < (1 << BYTEWIDTH))
5501               {                 /* Lookup bitmap.  */
5502                 /* Cast to `unsigned' instead of `unsigned char' in
5503                    case the bit list is a full 32 bytes long.  */
5504                 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
5505                     && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5506                   not = !not;
5507               }
5508 #ifdef emacs
5509             else if (range_table_exists)
5510               {
5511                 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5512
5513                 if (  (class_bits & BIT_LOWER && ISLOWER (c))
5514                     | (class_bits & BIT_MULTIBYTE)
5515                     | (class_bits & BIT_PUNCT && ISPUNCT (c))
5516                     | (class_bits & BIT_SPACE && ISSPACE (c))
5517                     | (class_bits & BIT_UPPER && ISUPPER (c))
5518                     | (class_bits & BIT_WORD  && ISWORD (c)))
5519                   not = !not;
5520                 else
5521                   CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5522               }
5523 #endif /* emacs */
5524
5525             if (range_table_exists)
5526               p = CHARSET_RANGE_TABLE_END (range_table, count);
5527             else
5528               p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
5529
5530             if (!not) goto fail;
5531
5532             d += len;
5533           }
5534           break;
5535
5536
5537         /* The beginning of a group is represented by start_memory.
5538            The argument is the register number.  The text
5539            matched within the group is recorded (in the internal
5540            registers data structure) under the register number.  */
5541         case start_memory:
5542           DEBUG_PRINT ("EXECUTING start_memory %d:\n", *p);
5543
5544           /* In case we need to undo this operation (via backtracking).  */
5545           PUSH_FAILURE_REG (*p);
5546
5547           regstart[*p] = d;
5548           regend[*p] = NULL;    /* probably unnecessary.  -sm  */
5549           DEBUG_PRINT ("  regstart: %td\n", POINTER_TO_OFFSET (regstart[*p]));
5550
5551           /* Move past the register number.  */
5552           p += 1;
5553           break;
5554
5555
5556         /* The stop_memory opcode represents the end of a group.  Its
5557            argument is the same as start_memory's: the register number.  */
5558         case stop_memory:
5559           DEBUG_PRINT ("EXECUTING stop_memory %d:\n", *p);
5560
5561           assert (!REG_UNSET (regstart[*p]));
5562           /* Strictly speaking, there should be code such as:
5563
5564                 assert (REG_UNSET (regend[*p]));
5565                 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5566
5567              But the only info to be pushed is regend[*p] and it is known to
5568              be UNSET, so there really isn't anything to push.
5569              Not pushing anything, on the other hand, deprives us of the
5570              guarantee that regend[*p] is UNSET since undoing this operation
5571              will not reset its value properly.  This is not important since
5572              the value will only be read on the next start_memory or at
5573              the very end and both events can only happen if this stop_memory
5574              is *not* undone.  */
5575
5576           regend[*p] = d;
5577           DEBUG_PRINT ("      regend: %td\n", POINTER_TO_OFFSET (regend[*p]));
5578
5579           /* Move past the register number.  */
5580           p += 1;
5581           break;
5582
5583
5584         /* \<digit> has been turned into a `duplicate' command which is
5585            followed by the numeric value of <digit> as the register number.  */
5586         case duplicate:
5587           {
5588             register re_char *d2, *dend2;
5589             int regno = *p++;   /* Get which register to match against.  */
5590             DEBUG_PRINT ("EXECUTING duplicate %d.\n", regno);
5591
5592             /* Can't back reference a group which we've never matched.  */
5593             if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5594               goto fail;
5595
5596             /* Where in input to try to start matching.  */
5597             d2 = regstart[regno];
5598
5599             /* Remember the start point to rollback upon failure.  */
5600             dfail = d;
5601
5602             /* Where to stop matching; if both the place to start and
5603                the place to stop matching are in the same string, then
5604                set to the place to stop, otherwise, for now have to use
5605                the end of the first string.  */
5606
5607             dend2 = ((FIRST_STRING_P (regstart[regno])
5608                       == FIRST_STRING_P (regend[regno]))
5609                      ? regend[regno] : end_match_1);
5610             for (;;)
5611               {
5612                 ptrdiff_t dcnt;
5613
5614                 /* If necessary, advance to next segment in register
5615                    contents.  */
5616                 while (d2 == dend2)
5617                   {
5618                     if (dend2 == end_match_2) break;
5619                     if (dend2 == regend[regno]) break;
5620
5621                     /* End of string1 => advance to string2. */
5622                     d2 = string2;
5623                     dend2 = regend[regno];
5624                   }
5625                 /* At end of register contents => success */
5626                 if (d2 == dend2) break;
5627
5628                 /* If necessary, advance to next segment in data.  */
5629                 PREFETCH ();
5630
5631                 /* How many characters left in this segment to match.  */
5632                 dcnt = dend - d;
5633
5634                 /* Want how many consecutive characters we can match in
5635                    one shot, so, if necessary, adjust the count.  */
5636                 if (dcnt > dend2 - d2)
5637                   dcnt = dend2 - d2;
5638
5639                 /* Compare that many; failure if mismatch, else move
5640                    past them.  */
5641                 if (RE_TRANSLATE_P (translate)
5642                     ? bcmp_translate (d, d2, dcnt, translate, target_multibyte)
5643                     : memcmp (d, d2, dcnt))
5644                   {
5645                     d = dfail;
5646                     goto fail;
5647                   }
5648                 d += dcnt, d2 += dcnt;
5649               }
5650           }
5651           break;
5652
5653
5654         /* begline matches the empty string at the beginning of the string
5655            (unless `not_bol' is set in `bufp'), and after newlines.  */
5656         case begline:
5657           DEBUG_PRINT ("EXECUTING begline.\n");
5658
5659           if (AT_STRINGS_BEG (d))
5660             {
5661               if (!bufp->not_bol) break;
5662             }
5663           else
5664             {
5665               unsigned c;
5666               GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5667               if (c == '\n')
5668                 break;
5669             }
5670           /* In all other cases, we fail.  */
5671           goto fail;
5672
5673
5674         /* endline is the dual of begline.  */
5675         case endline:
5676           DEBUG_PRINT ("EXECUTING endline.\n");
5677
5678           if (AT_STRINGS_END (d))
5679             {
5680               if (!bufp->not_eol) break;
5681             }
5682           else
5683             {
5684               PREFETCH_NOLIMIT ();
5685               if (*d == '\n')
5686                 break;
5687             }
5688           goto fail;
5689
5690
5691         /* Match at the very beginning of the data.  */
5692         case begbuf:
5693           DEBUG_PRINT ("EXECUTING begbuf.\n");
5694           if (AT_STRINGS_BEG (d))
5695             break;
5696           goto fail;
5697
5698
5699         /* Match at the very end of the data.  */
5700         case endbuf:
5701           DEBUG_PRINT ("EXECUTING endbuf.\n");
5702           if (AT_STRINGS_END (d))
5703             break;
5704           goto fail;
5705
5706
5707         /* on_failure_keep_string_jump is used to optimize `.*\n'.  It
5708            pushes NULL as the value for the string on the stack.  Then
5709            `POP_FAILURE_POINT' will keep the current value for the
5710            string, instead of restoring it.  To see why, consider
5711            matching `foo\nbar' against `.*\n'.  The .* matches the foo;
5712            then the . fails against the \n.  But the next thing we want
5713            to do is match the \n against the \n; if we restored the
5714            string value, we would be back at the foo.
5715
5716            Because this is used only in specific cases, we don't need to
5717            check all the things that `on_failure_jump' does, to make
5718            sure the right things get saved on the stack.  Hence we don't
5719            share its code.  The only reason to push anything on the
5720            stack at all is that otherwise we would have to change
5721            `anychar's code to do something besides goto fail in this
5722            case; that seems worse than this.  */
5723         case on_failure_keep_string_jump:
5724           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5725           DEBUG_PRINT ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5726                        mcnt, p + mcnt);
5727
5728           PUSH_FAILURE_POINT (p - 3, NULL);
5729           break;
5730
5731           /* A nasty loop is introduced by the non-greedy *? and +?.
5732              With such loops, the stack only ever contains one failure point
5733              at a time, so that a plain on_failure_jump_loop kind of
5734              cycle detection cannot work.  Worse yet, such a detection
5735              can not only fail to detect a cycle, but it can also wrongly
5736              detect a cycle (between different instantiations of the same
5737              loop).
5738              So the method used for those nasty loops is a little different:
5739              We use a special cycle-detection-stack-frame which is pushed
5740              when the on_failure_jump_nastyloop failure-point is *popped*.
5741              This special frame thus marks the beginning of one iteration
5742              through the loop and we can hence easily check right here
5743              whether something matched between the beginning and the end of
5744              the loop.  */
5745         case on_failure_jump_nastyloop:
5746           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5747           DEBUG_PRINT ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5748                        mcnt, p + mcnt);
5749
5750           assert ((re_opcode_t)p[-4] == no_op);
5751           {
5752             int cycle = 0;
5753             CHECK_INFINITE_LOOP (p - 4, d);
5754             if (!cycle)
5755               /* If there's a cycle, just continue without pushing
5756                  this failure point.  The failure point is the "try again"
5757                  option, which shouldn't be tried.
5758                  We want (x?)*?y\1z to match both xxyz and xxyxz.  */
5759               PUSH_FAILURE_POINT (p - 3, d);
5760           }
5761           break;
5762
5763           /* Simple loop detecting on_failure_jump:  just check on the
5764              failure stack if the same spot was already hit earlier.  */
5765         case on_failure_jump_loop:
5766         on_failure:
5767           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5768           DEBUG_PRINT ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5769                        mcnt, p + mcnt);
5770           {
5771             int cycle = 0;
5772             CHECK_INFINITE_LOOP (p - 3, d);
5773             if (cycle)
5774               /* If there's a cycle, get out of the loop, as if the matching
5775                  had failed.  We used to just `goto fail' here, but that was
5776                  aborting the search a bit too early: we want to keep the
5777                  empty-loop-match and keep matching after the loop.
5778                  We want (x?)*y\1z to match both xxyz and xxyxz.  */
5779               p += mcnt;
5780             else
5781               PUSH_FAILURE_POINT (p - 3, d);
5782           }
5783           break;
5784
5785
5786         /* Uses of on_failure_jump:
5787
5788            Each alternative starts with an on_failure_jump that points
5789            to the beginning of the next alternative.  Each alternative
5790            except the last ends with a jump that in effect jumps past
5791            the rest of the alternatives.  (They really jump to the
5792            ending jump of the following alternative, because tensioning
5793            these jumps is a hassle.)
5794
5795            Repeats start with an on_failure_jump that points past both
5796            the repetition text and either the following jump or
5797            pop_failure_jump back to this on_failure_jump.  */
5798         case on_failure_jump:
5799           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5800           DEBUG_PRINT ("EXECUTING on_failure_jump %d (to %p):\n",
5801                        mcnt, p + mcnt);
5802
5803           PUSH_FAILURE_POINT (p -3, d);
5804           break;
5805
5806         /* This operation is used for greedy *.
5807            Compare the beginning of the repeat with what follows its
5808            end in the pattern.  If we can establish that there is
5809            nothing that they would both match, i.e., that we would
5810            never have to backtrack into the loop (unlike, e.g., `a*a'),
5811            then we can use a non-backtracking loop based on
5812            on_failure_keep_string_jump instead of on_failure_jump.  */
5813         case on_failure_jump_smart:
5814           EXTRACT_NUMBER_AND_INCR (mcnt, p);
5815           DEBUG_PRINT ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5816                        mcnt, p + mcnt);
5817           {
5818             re_char *p1 = p; /* Next operation.  */
5819             /* Here, we discard `const', making re_match non-reentrant.  */
5820             unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest.  */
5821             unsigned char *p3 = (unsigned char*) p - 3; /* opcode location.  */
5822
5823             p -= 3;             /* Reset so that we will re-execute the
5824                                    instruction once it's been changed. */
5825
5826             EXTRACT_NUMBER (mcnt, p2 - 2);
5827
5828             /* Ensure this is indeed the trivial kind of loop
5829                we are expecting.  */
5830             assert (skip_one_char (p1) == p2 - 3);
5831             assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
5832             DEBUG_STATEMENT (debug += 2);
5833             if (mutually_exclusive_p (bufp, p1, p2))
5834               {
5835                 /* Use a fast `on_failure_keep_string_jump' loop.  */
5836                 DEBUG_PRINT ("  smart exclusive => fast loop.\n");
5837                 *p3 = (unsigned char) on_failure_keep_string_jump;
5838                 STORE_NUMBER (p2 - 2, mcnt + 3);
5839               }
5840             else
5841               {
5842                 /* Default to a safe `on_failure_jump' loop.  */
5843                 DEBUG_PRINT ("  smart default => slow loop.\n");
5844                 *p3 = (unsigned char) on_failure_jump;
5845               }
5846             DEBUG_STATEMENT (debug -= 2);
5847           }
5848           break;
5849
5850         /* Unconditionally jump (without popping any failure points).  */
5851         case jump:
5852         unconditional_jump:
5853           IMMEDIATE_QUIT_CHECK;
5854           EXTRACT_NUMBER_AND_INCR (mcnt, p);    /* Get the amount to jump.  */
5855           DEBUG_PRINT ("EXECUTING jump %d ", mcnt);
5856           p += mcnt;                            /* Do the jump.  */
5857           DEBUG_PRINT ("(to %p).\n", p);
5858           break;
5859
5860
5861         /* Have to succeed matching what follows at least n times.
5862            After that, handle like `on_failure_jump'.  */
5863         case succeed_n:
5864           /* Signedness doesn't matter since we only compare MCNT to 0.  */
5865           EXTRACT_NUMBER (mcnt, p + 2);
5866           DEBUG_PRINT ("EXECUTING succeed_n %d.\n", mcnt);
5867
5868           /* Originally, mcnt is how many times we HAVE to succeed.  */
5869           if (mcnt != 0)
5870             {
5871               /* Here, we discard `const', making re_match non-reentrant.  */
5872               unsigned char *p2 = (unsigned char*) p + 2; /* counter loc.  */
5873               mcnt--;
5874               p += 4;
5875               PUSH_NUMBER (p2, mcnt);
5876             }
5877           else
5878             /* The two bytes encoding mcnt == 0 are two no_op opcodes.  */
5879             goto on_failure;
5880           break;
5881
5882         case jump_n:
5883           /* Signedness doesn't matter since we only compare MCNT to 0.  */
5884           EXTRACT_NUMBER (mcnt, p + 2);
5885           DEBUG_PRINT ("EXECUTING jump_n %d.\n", mcnt);
5886
5887           /* Originally, this is how many times we CAN jump.  */
5888           if (mcnt != 0)
5889             {
5890                /* Here, we discard `const', making re_match non-reentrant.  */
5891               unsigned char *p2 = (unsigned char*) p + 2; /* counter loc.  */
5892               mcnt--;
5893               PUSH_NUMBER (p2, mcnt);
5894               goto unconditional_jump;
5895             }
5896           /* If we don't have to jump any more, skip over the rest of the command.  */
5897           else
5898             p += 4;
5899           break;
5900
5901         case set_number_at:
5902           {
5903             unsigned char *p2;  /* Location of the counter.  */
5904             DEBUG_PRINT ("EXECUTING set_number_at.\n");
5905
5906             EXTRACT_NUMBER_AND_INCR (mcnt, p);
5907             /* Here, we discard `const', making re_match non-reentrant.  */
5908             p2 = (unsigned char*) p + mcnt;
5909             /* Signedness doesn't matter since we only copy MCNT's bits.  */
5910             EXTRACT_NUMBER_AND_INCR (mcnt, p);
5911             DEBUG_PRINT ("  Setting %p to %d.\n", p2, mcnt);
5912             PUSH_NUMBER (p2, mcnt);
5913             break;
5914           }
5915
5916         case wordbound:
5917         case notwordbound:
5918           {
5919             boolean not = (re_opcode_t) *(p - 1) == notwordbound;
5920             DEBUG_PRINT ("EXECUTING %swordbound.\n", not ? "not" : "");
5921
5922             /* We SUCCEED (or FAIL) in one of the following cases: */
5923
5924             /* Case 1: D is at the beginning or the end of string.  */
5925             if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5926               not = !not;
5927             else
5928               {
5929                 /* C1 is the character before D, S1 is the syntax of C1, C2
5930                    is the character at D, and S2 is the syntax of C2.  */
5931                 re_wchar_t c1, c2;
5932                 int s1, s2;
5933                 int dummy;
5934 #ifdef emacs
5935                 ssize_t offset = PTR_TO_OFFSET (d - 1);
5936                 ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5937                 UPDATE_SYNTAX_TABLE (charpos);
5938 #endif
5939                 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5940                 s1 = SYNTAX (c1);
5941 #ifdef emacs
5942                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5943 #endif
5944                 PREFETCH_NOLIMIT ();
5945                 GET_CHAR_AFTER (c2, d, dummy);
5946                 s2 = SYNTAX (c2);
5947
5948                 if (/* Case 2: Only one of S1 and S2 is Sword.  */
5949                     ((s1 == Sword) != (s2 == Sword))
5950                     /* Case 3: Both of S1 and S2 are Sword, and macro
5951                        WORD_BOUNDARY_P (C1, C2) returns nonzero.  */
5952                     || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5953                   not = !not;
5954               }
5955             if (not)
5956               break;
5957             else
5958               goto fail;
5959           }
5960
5961         case wordbeg:
5962           DEBUG_PRINT ("EXECUTING wordbeg.\n");
5963
5964           /* We FAIL in one of the following cases: */
5965
5966           /* Case 1: D is at the end of string.  */
5967           if (AT_STRINGS_END (d))
5968             goto fail;
5969           else
5970             {
5971               /* C1 is the character before D, S1 is the syntax of C1, C2
5972                  is the character at D, and S2 is the syntax of C2.  */
5973               re_wchar_t c1, c2;
5974               int s1, s2;
5975               int dummy;
5976 #ifdef emacs
5977               ssize_t offset = PTR_TO_OFFSET (d);
5978               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5979               UPDATE_SYNTAX_TABLE (charpos);
5980 #endif
5981               PREFETCH ();
5982               GET_CHAR_AFTER (c2, d, dummy);
5983               s2 = SYNTAX (c2);
5984
5985               /* Case 2: S2 is not Sword. */
5986               if (s2 != Sword)
5987                 goto fail;
5988
5989               /* Case 3: D is not at the beginning of string ... */
5990               if (!AT_STRINGS_BEG (d))
5991                 {
5992                   GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5993 #ifdef emacs
5994                   UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5995 #endif
5996                   s1 = SYNTAX (c1);
5997
5998                   /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5999                      returns 0.  */
6000                   if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6001                     goto fail;
6002                 }
6003             }
6004           break;
6005
6006         case wordend:
6007           DEBUG_PRINT ("EXECUTING wordend.\n");
6008
6009           /* We FAIL in one of the following cases: */
6010
6011           /* Case 1: D is at the beginning of string.  */
6012           if (AT_STRINGS_BEG (d))
6013             goto fail;
6014           else
6015             {
6016               /* C1 is the character before D, S1 is the syntax of C1, C2
6017                  is the character at D, and S2 is the syntax of C2.  */
6018               re_wchar_t c1, c2;
6019               int s1, s2;
6020               int dummy;
6021 #ifdef emacs
6022               ssize_t offset = PTR_TO_OFFSET (d) - 1;
6023               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6024               UPDATE_SYNTAX_TABLE (charpos);
6025 #endif
6026               GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6027               s1 = SYNTAX (c1);
6028
6029               /* Case 2: S1 is not Sword.  */
6030               if (s1 != Sword)
6031                 goto fail;
6032
6033               /* Case 3: D is not at the end of string ... */
6034               if (!AT_STRINGS_END (d))
6035                 {
6036                   PREFETCH_NOLIMIT ();
6037                   GET_CHAR_AFTER (c2, d, dummy);
6038 #ifdef emacs
6039                   UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6040 #endif
6041                   s2 = SYNTAX (c2);
6042
6043                   /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
6044                      returns 0.  */
6045                   if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6046           goto fail;
6047                 }
6048             }
6049           break;
6050
6051         case symbeg:
6052           DEBUG_PRINT ("EXECUTING symbeg.\n");
6053
6054           /* We FAIL in one of the following cases: */
6055
6056           /* Case 1: D is at the end of string.  */
6057           if (AT_STRINGS_END (d))
6058             goto fail;
6059           else
6060             {
6061               /* C1 is the character before D, S1 is the syntax of C1, C2
6062                  is the character at D, and S2 is the syntax of C2.  */
6063               re_wchar_t c1, c2;
6064               int s1, s2;
6065 #ifdef emacs
6066               ssize_t offset = PTR_TO_OFFSET (d);
6067               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6068               UPDATE_SYNTAX_TABLE (charpos);
6069 #endif
6070               PREFETCH ();
6071               c2 = RE_STRING_CHAR (d, target_multibyte);
6072               s2 = SYNTAX (c2);
6073
6074               /* Case 2: S2 is neither Sword nor Ssymbol. */
6075               if (s2 != Sword && s2 != Ssymbol)
6076                 goto fail;
6077
6078               /* Case 3: D is not at the beginning of string ... */
6079               if (!AT_STRINGS_BEG (d))
6080                 {
6081                   GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6082 #ifdef emacs
6083                   UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6084 #endif
6085                   s1 = SYNTAX (c1);
6086
6087                   /* ... and S1 is Sword or Ssymbol.  */
6088                   if (s1 == Sword || s1 == Ssymbol)
6089                     goto fail;
6090                 }
6091             }
6092           break;
6093
6094         case symend:
6095           DEBUG_PRINT ("EXECUTING symend.\n");
6096
6097           /* We FAIL in one of the following cases: */
6098
6099           /* Case 1: D is at the beginning of string.  */
6100           if (AT_STRINGS_BEG (d))
6101             goto fail;
6102           else
6103             {
6104               /* C1 is the character before D, S1 is the syntax of C1, C2
6105                  is the character at D, and S2 is the syntax of C2.  */
6106               re_wchar_t c1, c2;
6107               int s1, s2;
6108 #ifdef emacs
6109               ssize_t offset = PTR_TO_OFFSET (d) - 1;
6110               ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6111               UPDATE_SYNTAX_TABLE (charpos);
6112 #endif
6113               GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6114               s1 = SYNTAX (c1);
6115
6116               /* Case 2: S1 is neither Ssymbol nor Sword.  */
6117               if (s1 != Sword && s1 != Ssymbol)
6118                 goto fail;
6119
6120               /* Case 3: D is not at the end of string ... */
6121               if (!AT_STRINGS_END (d))
6122                 {
6123                   PREFETCH_NOLIMIT ();
6124                   c2 = RE_STRING_CHAR (d, target_multibyte);
6125 #ifdef emacs
6126                   UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
6127 #endif
6128                   s2 = SYNTAX (c2);
6129
6130                   /* ... and S2 is Sword or Ssymbol.  */
6131                   if (s2 == Sword || s2 == Ssymbol)
6132                     goto fail;
6133                 }
6134             }
6135           break;
6136
6137         case syntaxspec:
6138         case notsyntaxspec:
6139           {
6140             boolean not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6141             mcnt = *p++;
6142             DEBUG_PRINT ("EXECUTING %ssyntaxspec %d.\n", not ? "not" : "",
6143                          mcnt);
6144             PREFETCH ();
6145 #ifdef emacs
6146             {
6147               ssize_t offset = PTR_TO_OFFSET (d);
6148               ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6149               UPDATE_SYNTAX_TABLE (pos1);
6150             }
6151 #endif
6152             {
6153               int len;
6154               re_wchar_t c;
6155
6156               GET_CHAR_AFTER (c, d, len);
6157               if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6158                 goto fail;
6159               d += len;
6160             }
6161           }
6162           break;
6163
6164 #ifdef emacs
6165         case before_dot:
6166           DEBUG_PRINT ("EXECUTING before_dot.\n");
6167           if (PTR_BYTE_POS (d) >= PT_BYTE)
6168             goto fail;
6169           break;
6170
6171         case at_dot:
6172           DEBUG_PRINT ("EXECUTING at_dot.\n");
6173           if (PTR_BYTE_POS (d) != PT_BYTE)
6174             goto fail;
6175           break;
6176
6177         case after_dot:
6178           DEBUG_PRINT ("EXECUTING after_dot.\n");
6179           if (PTR_BYTE_POS (d) <= PT_BYTE)
6180             goto fail;
6181           break;
6182
6183         case categoryspec:
6184         case notcategoryspec:
6185           {
6186             boolean not = (re_opcode_t) *(p - 1) == notcategoryspec;
6187             mcnt = *p++;
6188             DEBUG_PRINT ("EXECUTING %scategoryspec %d.\n",
6189                          not ? "not" : "", mcnt);
6190             PREFETCH ();
6191
6192             {
6193               int len;
6194               re_wchar_t c;
6195               GET_CHAR_AFTER (c, d, len);
6196               if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6197                 goto fail;
6198               d += len;
6199             }
6200           }
6201           break;
6202
6203 #endif /* emacs */
6204
6205         default:
6206           abort ();
6207         }
6208       continue;  /* Successfully executed one pattern command; keep going.  */
6209
6210
6211     /* We goto here if a matching operation fails. */
6212     fail:
6213       IMMEDIATE_QUIT_CHECK;
6214       if (!FAIL_STACK_EMPTY ())
6215         {
6216           re_char *str, *pat;
6217           /* A restart point is known.  Restore to that state.  */
6218           DEBUG_PRINT ("\nFAIL:\n");
6219           POP_FAILURE_POINT (str, pat);
6220           switch (*pat++)
6221             {
6222             case on_failure_keep_string_jump:
6223               assert (str == NULL);
6224               goto continue_failure_jump;
6225
6226             case on_failure_jump_nastyloop:
6227               assert ((re_opcode_t)pat[-2] == no_op);
6228               PUSH_FAILURE_POINT (pat - 2, str);
6229               /* Fallthrough */
6230
6231             case on_failure_jump_loop:
6232             case on_failure_jump:
6233             case succeed_n:
6234               d = str;
6235             continue_failure_jump:
6236               EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6237               p = pat + mcnt;
6238               break;
6239
6240             case no_op:
6241               /* A special frame used for nastyloops. */
6242               goto fail;
6243
6244             default:
6245               abort ();
6246             }
6247
6248           assert (p >= bufp->buffer && p <= pend);
6249
6250           if (d >= string1 && d <= end1)
6251             dend = end_match_1;
6252         }
6253       else
6254         break;   /* Matching at this starting point really fails.  */
6255     } /* for (;;) */
6256
6257   if (best_regs_set)
6258     goto restore_best_regs;
6259
6260   FREE_VARIABLES ();
6261
6262   return -1;                            /* Failure to match.  */
6263 }
6264 \f
6265 /* Subroutine definitions for re_match_2.  */
6266
6267 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6268    bytes; nonzero otherwise.  */
6269
6270 static int
6271 bcmp_translate (const_re_char *s1, const_re_char *s2, register ssize_t len,
6272                 RE_TRANSLATE_TYPE translate, const int target_multibyte)
6273 {
6274   register re_char *p1 = s1, *p2 = s2;
6275   re_char *p1_end = s1 + len;
6276   re_char *p2_end = s2 + len;
6277
6278   /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6279      different lengths, but relying on a single `len' would break this. -sm  */
6280   while (p1 < p1_end && p2 < p2_end)
6281     {
6282       int p1_charlen, p2_charlen;
6283       re_wchar_t p1_ch, p2_ch;
6284
6285       GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6286       GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
6287
6288       if (RE_TRANSLATE (translate, p1_ch)
6289           != RE_TRANSLATE (translate, p2_ch))
6290         return 1;
6291
6292       p1 += p1_charlen, p2 += p2_charlen;
6293     }
6294
6295   if (p1 != p1_end || p2 != p2_end)
6296     return 1;
6297
6298   return 0;
6299 }
6300 \f
6301 /* Entry points for GNU code.  */
6302
6303 /* re_compile_pattern is the GNU regular expression compiler: it
6304    compiles PATTERN (of length SIZE) and puts the result in BUFP.
6305    Returns 0 if the pattern was valid, otherwise an error string.
6306
6307    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6308    are set in BUFP on entry.
6309
6310    We call regex_compile to do the actual compilation.  */
6311
6312 const char *
6313 re_compile_pattern (const char *pattern, size_t length,
6314                     struct re_pattern_buffer *bufp)
6315 {
6316   reg_errcode_t ret;
6317
6318   /* GNU code is written to assume at least RE_NREGS registers will be set
6319      (and at least one extra will be -1).  */
6320   bufp->regs_allocated = REGS_UNALLOCATED;
6321
6322   /* And GNU code determines whether or not to get register information
6323      by passing null for the REGS argument to re_match, etc., not by
6324      setting no_sub.  */
6325   bufp->no_sub = 0;
6326
6327   ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
6328
6329   if (!ret)
6330     return NULL;
6331   return gettext (re_error_msgid[(int) ret]);
6332 }
6333 WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
6334 \f
6335 /* Entry points compatible with 4.2 BSD regex library.  We don't define
6336    them unless specifically requested.  */
6337
6338 #if defined _REGEX_RE_COMP || defined _LIBC
6339
6340 /* BSD has one and only one pattern buffer.  */
6341 static struct re_pattern_buffer re_comp_buf;
6342
6343 char *
6344 # ifdef _LIBC
6345 /* Make these definitions weak in libc, so POSIX programs can redefine
6346    these names if they don't use our functions, and still use
6347    regcomp/regexec below without link errors.  */
6348 weak_function
6349 # endif
6350 re_comp (const char *s)
6351 {
6352   reg_errcode_t ret;
6353
6354   if (!s)
6355     {
6356       if (!re_comp_buf.buffer)
6357         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6358         return (char *) gettext ("No previous regular expression");
6359       return 0;
6360     }
6361
6362   if (!re_comp_buf.buffer)
6363     {
6364       re_comp_buf.buffer = malloc (200);
6365       if (re_comp_buf.buffer == NULL)
6366         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6367         return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6368       re_comp_buf.allocated = 200;
6369
6370       re_comp_buf.fastmap = malloc (1 << BYTEWIDTH);
6371       if (re_comp_buf.fastmap == NULL)
6372         /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6373         return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6374     }
6375
6376   /* Since `re_exec' always passes NULL for the `regs' argument, we
6377      don't need to initialize the pattern buffer fields which affect it.  */
6378
6379   ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6380
6381   if (!ret)
6382     return NULL;
6383
6384   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
6385   return (char *) gettext (re_error_msgid[(int) ret]);
6386 }
6387
6388
6389 int
6390 # ifdef _LIBC
6391 weak_function
6392 # endif
6393 re_exec (const char *s)
6394 {
6395   const size_t len = strlen (s);
6396   return re_search (&re_comp_buf, s, len, 0, len, 0) >= 0;
6397 }
6398 #endif /* _REGEX_RE_COMP */
6399 \f
6400 /* POSIX.2 functions.  Don't define these for Emacs.  */
6401
6402 #ifndef emacs
6403
6404 /* regcomp takes a regular expression as a string and compiles it.
6405
6406    PREG is a regex_t *.  We do not expect any fields to be initialized,
6407    since POSIX says we shouldn't.  Thus, we set
6408
6409      `buffer' to the compiled pattern;
6410      `used' to the length of the compiled pattern;
6411      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6412        REG_EXTENDED bit in CFLAGS is set; otherwise, to
6413        RE_SYNTAX_POSIX_BASIC;
6414      `fastmap' to an allocated space for the fastmap;
6415      `fastmap_accurate' to zero;
6416      `re_nsub' to the number of subexpressions in PATTERN.
6417
6418    PATTERN is the address of the pattern string.
6419
6420    CFLAGS is a series of bits which affect compilation.
6421
6422      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6423      use POSIX basic syntax.
6424
6425      If REG_NEWLINE is set, then . and [^...] don't match newline.
6426      Also, regexec will try a match beginning after every newline.
6427
6428      If REG_ICASE is set, then we considers upper- and lowercase
6429      versions of letters to be equivalent when matching.
6430
6431      If REG_NOSUB is set, then when PREG is passed to regexec, that
6432      routine will report only success or failure, and nothing about the
6433      registers.
6434
6435    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
6436    the return codes and their meanings.)  */
6437
6438 reg_errcode_t
6439 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern,
6440          int cflags)
6441 {
6442   reg_errcode_t ret;
6443   reg_syntax_t syntax
6444     = (cflags & REG_EXTENDED) ?
6445       RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6446
6447   /* regex_compile will allocate the space for the compiled pattern.  */
6448   preg->buffer = 0;
6449   preg->allocated = 0;
6450   preg->used = 0;
6451
6452   /* Try to allocate space for the fastmap.  */
6453   preg->fastmap = malloc (1 << BYTEWIDTH);
6454
6455   if (cflags & REG_ICASE)
6456     {
6457       unsigned i;
6458
6459       preg->translate = malloc (CHAR_SET_SIZE * sizeof *preg->translate);
6460       if (preg->translate == NULL)
6461         return (int) REG_ESPACE;
6462
6463       /* Map uppercase characters to corresponding lowercase ones.  */
6464       for (i = 0; i < CHAR_SET_SIZE; i++)
6465         preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
6466     }
6467   else
6468     preg->translate = NULL;
6469
6470   /* If REG_NEWLINE is set, newlines are treated differently.  */
6471   if (cflags & REG_NEWLINE)
6472     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
6473       syntax &= ~RE_DOT_NEWLINE;
6474       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6475     }
6476   else
6477     syntax |= RE_NO_NEWLINE_ANCHOR;
6478
6479   preg->no_sub = !!(cflags & REG_NOSUB);
6480
6481   /* POSIX says a null character in the pattern terminates it, so we
6482      can use strlen here in compiling the pattern.  */
6483   ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
6484
6485   /* POSIX doesn't distinguish between an unmatched open-group and an
6486      unmatched close-group: both are REG_EPAREN.  */
6487   if (ret == REG_ERPAREN)
6488     ret = REG_EPAREN;
6489
6490   if (ret == REG_NOERROR && preg->fastmap)
6491     { /* Compute the fastmap now, since regexec cannot modify the pattern
6492          buffer.  */
6493       re_compile_fastmap (preg);
6494       if (preg->can_be_null)
6495         { /* The fastmap can't be used anyway.  */
6496           free (preg->fastmap);
6497           preg->fastmap = NULL;
6498         }
6499     }
6500   return ret;
6501 }
6502 WEAK_ALIAS (__regcomp, regcomp)
6503
6504
6505 /* regexec searches for a given pattern, specified by PREG, in the
6506    string STRING.
6507
6508    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6509    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
6510    least NMATCH elements, and we set them to the offsets of the
6511    corresponding matched substrings.
6512
6513    EFLAGS specifies `execution flags' which affect matching: if
6514    REG_NOTBOL is set, then ^ does not match at the beginning of the
6515    string; if REG_NOTEOL is set, then $ does not match at the end.
6516
6517    We return 0 if we find a match and REG_NOMATCH if not.  */
6518
6519 reg_errcode_t
6520 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
6521          size_t nmatch, regmatch_t pmatch[_Restrict_arr_], int eflags)
6522 {
6523   regoff_t ret;
6524   struct re_registers regs;
6525   regex_t private_preg;
6526   size_t len = strlen (string);
6527   boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
6528
6529   private_preg = *preg;
6530
6531   private_preg.not_bol = !!(eflags & REG_NOTBOL);
6532   private_preg.not_eol = !!(eflags & REG_NOTEOL);
6533
6534   /* The user has told us exactly how many registers to return
6535      information about, via `nmatch'.  We have to pass that on to the
6536      matching routines.  */
6537   private_preg.regs_allocated = REGS_FIXED;
6538
6539   if (want_reg_info)
6540     {
6541       regs.num_regs = nmatch;
6542       regs.start = TALLOC (nmatch * 2, regoff_t);
6543       if (regs.start == NULL)
6544         return REG_NOMATCH;
6545       regs.end = regs.start + nmatch;
6546     }
6547
6548   /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6549      pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6550      was a little bit longer but still only matching the real part.
6551      This works because the `endline' will check for a '\n' and will find a
6552      '\0', correctly deciding that this is not the end of a line.
6553      But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6554      a convenient '\0' there.  For all we know, the string could be preceded
6555      by '\n' which would throw things off.  */
6556
6557   /* Perform the searching operation.  */
6558   ret = re_search (&private_preg, string, len,
6559                    /* start: */ 0, /* range: */ len,
6560                    want_reg_info ? &regs : 0);
6561
6562   /* Copy the register information to the POSIX structure.  */
6563   if (want_reg_info)
6564     {
6565       if (ret >= 0)
6566         {
6567           unsigned r;
6568
6569           for (r = 0; r < nmatch; r++)
6570             {
6571               pmatch[r].rm_so = regs.start[r];
6572               pmatch[r].rm_eo = regs.end[r];
6573             }
6574         }
6575
6576       /* If we needed the temporary register info, free the space now.  */
6577       free (regs.start);
6578     }
6579
6580   /* We want zero return to mean success, unlike `re_search'.  */
6581   return ret >= 0 ? REG_NOERROR : REG_NOMATCH;
6582 }
6583 WEAK_ALIAS (__regexec, regexec)
6584
6585
6586 /* Returns a message corresponding to an error code, ERR_CODE, returned
6587    from either regcomp or regexec.   We don't use PREG here.
6588
6589    ERR_CODE was previously called ERRCODE, but that name causes an
6590    error with msvc8 compiler.  */
6591
6592 size_t
6593 regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
6594 {
6595   const char *msg;
6596   size_t msg_size;
6597
6598   if (err_code < 0
6599       || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6600     /* Only error codes returned by the rest of the code should be passed
6601        to this routine.  If we are given anything else, or if other regex
6602        code generates an invalid error code, then the program has a bug.
6603        Dump core so we can fix it.  */
6604     abort ();
6605
6606   msg = gettext (re_error_msgid[err_code]);
6607
6608   msg_size = strlen (msg) + 1; /* Includes the null.  */
6609
6610   if (errbuf_size != 0)
6611     {
6612       if (msg_size > errbuf_size)
6613         {
6614           memcpy (errbuf, msg, errbuf_size - 1);
6615           errbuf[errbuf_size - 1] = 0;
6616         }
6617       else
6618         strcpy (errbuf, msg);
6619     }
6620
6621   return msg_size;
6622 }
6623 WEAK_ALIAS (__regerror, regerror)
6624
6625
6626 /* Free dynamically allocated space used by PREG.  */
6627
6628 void
6629 regfree (regex_t *preg)
6630 {
6631   free (preg->buffer);
6632   preg->buffer = NULL;
6633
6634   preg->allocated = 0;
6635   preg->used = 0;
6636
6637   free (preg->fastmap);
6638   preg->fastmap = NULL;
6639   preg->fastmap_accurate = 0;
6640
6641   free (preg->translate);
6642   preg->translate = NULL;
6643 }
6644 WEAK_ALIAS (__regfree, regfree)
6645
6646 #endif /* not emacs  */