src/search.c

   1 /* String search routines for GNU Emacs.
   2    Copyright (C) 1985-1987, 1993-1994, 1997-1999, 2001-2011
   3                  Free Software Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20
  21 #include <config.h>
  22 #include <setjmp.h>
  23 #include "lisp.h"
  24 #include "syntax.h"
  25 #include "category.h"
  26 #include "buffer.h"
  27 #include "character.h"
  28 #include "charset.h"
  29 #include "region-cache.h"
  30 #include "commands.h"
  31 #include "blockinput.h"
  32 #include "intervals.h"
  33
  34 #include <sys/types.h>
  35 #include "regex.h"
  36
  37 #define REGEXP_CACHE_SIZE 20
  38
  39 /* If the regexp is non-nil, then the buffer contains the compiled form
  40    of that regexp, suitable for searching.  */
  41 struct regexp_cache
  42 {
  43   struct regexp_cache *next;
  44   Lisp_Object regexp, whitespace_regexp;
  45   /* Syntax table for which the regexp applies.  We need this because
  46      of character classes.  If this is t, then the compiled pattern is valid
  47      for any syntax-table.  */
  48   Lisp_Object syntax_table;
  49   struct re_pattern_buffer buf;
  50   char fastmap[0400];
  51   /* Nonzero means regexp was compiled to do full POSIX backtracking.  */
  52   char posix;
  53 };
  54
  55 /* The instances of that struct.  */
  56 struct regexp_cache searchbufs[REGEXP_CACHE_SIZE];
  57
  58 /* The head of the linked list; points to the most recently used buffer.  */
  59 struct regexp_cache *searchbuf_head;
  60
  61
  62 /* Every call to re_match, etc., must pass &search_regs as the regs
  63    argument unless you can show it is unnecessary (i.e., if re_match
  64    is certainly going to be called again before region-around-match
  65    can be called).
  66
  67    Since the registers are now dynamically allocated, we need to make
  68    sure not to refer to the Nth register before checking that it has
  69    been allocated by checking search_regs.num_regs.
  70
  71    The regex code keeps track of whether it has allocated the search
  72    buffer using bits in the re_pattern_buffer.  This means that whenever
  73    you compile a new pattern, it completely forgets whether it has
  74    allocated any registers, and will allocate new registers the next
  75    time you call a searching or matching function.  Therefore, we need
  76    to call re_set_registers after compiling a new pattern or after
  77    setting the match registers, so that the regex functions will be
  78    able to free or re-allocate it properly.  */
  79 static struct re_registers search_regs;
  80
  81 /* The buffer in which the last search was performed, or
  82    Qt if the last search was done in a string;
  83    Qnil if no searching has been done yet.  */
  84 static Lisp_Object last_thing_searched;
  85
  86 /* error condition signaled when regexp compile_pattern fails */
  87
  88 Lisp_Object Qinvalid_regexp;
  89
  90 /* Error condition used for failing searches */
  91 Lisp_Object Qsearch_failed;
  92
  93 static void set_search_regs (EMACS_INT, EMACS_INT);
  94 static void save_search_regs (void);
  95 static EMACS_INT simple_search (EMACS_INT, unsigned char *, EMACS_INT,
  96                                 EMACS_INT, Lisp_Object, EMACS_INT, EMACS_INT,
  97                                 EMACS_INT, EMACS_INT);
  98 static EMACS_INT boyer_moore (EMACS_INT, unsigned char *, EMACS_INT, EMACS_INT,
  99                               Lisp_Object, Lisp_Object,
 100                               EMACS_INT, EMACS_INT,
 101                               EMACS_INT, EMACS_INT, int);
 102 static EMACS_INT search_buffer (Lisp_Object, EMACS_INT, EMACS_INT,
 103                                 EMACS_INT, EMACS_INT, EMACS_INT, int,
 104                                 Lisp_Object, Lisp_Object, int);
 105 static void matcher_overflow (void) NO_RETURN;
 106
 107 static void
 108 matcher_overflow (void)
 109 {
 110   error ("Stack overflow in regexp matcher");
 111 }
 112
 113 /* Compile a regexp and signal a Lisp error if anything goes wrong.
 114    PATTERN is the pattern to compile.
 115    CP is the place to put the result.
 116    TRANSLATE is a translation table for ignoring case, or nil for none.
 117    POSIX is nonzero if we want full backtracking (POSIX style)
 118    for this pattern.  0 means backtrack only enough to get a valid match.
 119
 120    The behavior also depends on Vsearch_spaces_regexp.  */
 121
 122 static void
 123 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, Lisp_Object translate, int posix)
 124 {
 125   char *val;
 126   reg_syntax_t old;
 127
 128   cp->regexp = Qnil;
 129   cp->buf.translate = (! NILP (translate) ? translate : make_number (0));
 130   cp->posix = posix;
 131   cp->buf.multibyte = STRING_MULTIBYTE (pattern);
 132   cp->buf.charset_unibyte = charset_unibyte;
 133   if (STRINGP (Vsearch_spaces_regexp))
 134     cp->whitespace_regexp = Vsearch_spaces_regexp;
 135   else
 136     cp->whitespace_regexp = Qnil;
 137
 138   /* rms: I think BLOCK_INPUT is not needed here any more,
 139      because regex.c defines malloc to call xmalloc.
 140      Using BLOCK_INPUT here means the debugger won't run if an error occurs.
 141      So let's turn it off.  */
 142   /*  BLOCK_INPUT;  */
 143   old = re_set_syntax (RE_SYNTAX_EMACS
 144                        | (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
 145
 146   if (STRINGP (Vsearch_spaces_regexp))
 147     re_set_whitespace_regexp (SSDATA (Vsearch_spaces_regexp));
 148   else
 149     re_set_whitespace_regexp (NULL);
 150
 151   val = (char *) re_compile_pattern (SSDATA (pattern),
 152                                      SBYTES (pattern), &cp->buf);
 153
 154   /* If the compiled pattern hard codes some of the contents of the
 155      syntax-table, it can only be reused with *this* syntax table.  */
 156   cp->syntax_table = cp->buf.used_syntax ? BVAR (current_buffer, syntax_table) : Qt;
 157
 158   re_set_whitespace_regexp (NULL);
 159
 160   re_set_syntax (old);
 161   /* UNBLOCK_INPUT;  */
 162   if (val)
 163     xsignal1 (Qinvalid_regexp, build_string (val));
 164
 165   cp->regexp = Fcopy_sequence (pattern);
 166 }
 167
 168 /* Shrink each compiled regexp buffer in the cache
 169    to the size actually used right now.
 170    This is called from garbage collection.  */
 171
 172 void
 173 shrink_regexp_cache (void)
 174 {
 175   struct regexp_cache *cp;
 176
 177   for (cp = searchbuf_head; cp != 0; cp = cp->next)
 178     {
 179       cp->buf.allocated = cp->buf.used;
 180       cp->buf.buffer
 181         = (unsigned char *) xrealloc (cp->buf.buffer, cp->buf.used);
 182     }
 183 }
 184
 185 /* Clear the regexp cache w.r.t. a particular syntax table,
 186    because it was changed.
 187    There is no danger of memory leak here because re_compile_pattern
 188    automagically manages the memory in each re_pattern_buffer struct,
 189    based on its `allocated' and `buffer' values.  */
 190 void
 191 clear_regexp_cache (void)
 192 {
 193   int i;
 194
 195   for (i = 0; i < REGEXP_CACHE_SIZE; ++i)
 196     /* It's tempting to compare with the syntax-table we've actually changed,
 197        but it's not sufficient because char-table inheritance means that
 198        modifying one syntax-table can change others at the same time.  */
 199     if (!EQ (searchbufs[i].syntax_table, Qt))
 200       searchbufs[i].regexp = Qnil;
 201 }
 202
 203 /* Compile a regexp if necessary, but first check to see if there's one in
 204    the cache.
 205    PATTERN is the pattern to compile.
 206    TRANSLATE is a translation table for ignoring case, or nil for none.
 207    REGP is the structure that says where to store the "register"
 208    values that will result from matching this pattern.
 209    If it is 0, we should compile the pattern not to record any
 210    subexpression bounds.
 211    POSIX is nonzero if we want full backtracking (POSIX style)
 212    for this pattern.  0 means backtrack only enough to get a valid match.  */
 213
 214 struct re_pattern_buffer *
 215 compile_pattern (Lisp_Object pattern, struct re_registers *regp, Lisp_Object translate, int posix, int multibyte)
 216 {
 217   struct regexp_cache *cp, **cpp;
 218
 219   for (cpp = &searchbuf_head; ; cpp = &cp->next)
 220     {
 221       cp = *cpp;
 222       /* Entries are initialized to nil, and may be set to nil by
 223          compile_pattern_1 if the pattern isn't valid.  Don't apply
 224          string accessors in those cases.  However, compile_pattern_1
 225          is only applied to the cache entry we pick here to reuse.  So
 226          nil should never appear before a non-nil entry.  */
 227       if (NILP (cp->regexp))
 228         goto compile_it;
 229       if (SCHARS (cp->regexp) == SCHARS (pattern)
 230           && STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern)
 231           && !NILP (Fstring_equal (cp->regexp, pattern))
 232           && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
 233           && cp->posix == posix
 234           && (EQ (cp->syntax_table, Qt)
 235               || EQ (cp->syntax_table, BVAR (current_buffer, syntax_table)))
 236           && !NILP (Fequal (cp->whitespace_regexp, Vsearch_spaces_regexp))
 237           && cp->buf.charset_unibyte == charset_unibyte)
 238         break;
 239
 240       /* If we're at the end of the cache, compile into the nil cell
 241          we found, or the last (least recently used) cell with a
 242          string value.  */
 243       if (cp->next == 0)
 244         {
 245         compile_it:
 246           compile_pattern_1 (cp, pattern, translate, posix);
 247           break;
 248         }
 249     }
 250
 251   /* When we get here, cp (aka *cpp) contains the compiled pattern,
 252      either because we found it in the cache or because we just compiled it.
 253      Move it to the front of the queue to mark it as most recently used.  */
 254   *cpp = cp->next;
 255   cp->next = searchbuf_head;
 256   searchbuf_head = cp;
 257
 258   /* Advise the searching functions about the space we have allocated
 259      for register data.  */
 260   if (regp)
 261     re_set_registers (&cp->buf, regp, regp->num_regs, regp->start, regp->end);
 262
 263   /* The compiled pattern can be used both for multibyte and unibyte
 264      target.  But, we have to tell which the pattern is used for. */
 265   cp->buf.target_multibyte = multibyte;
 266
 267   return &cp->buf;
 268 }
 269
 270 \f
 271 static Lisp_Object
 272 looking_at_1 (Lisp_Object string, int posix)
 273 {
 274   Lisp_Object val;
 275   unsigned char *p1, *p2;
 276   EMACS_INT s1, s2;
 277   register EMACS_INT i;
 278   struct re_pattern_buffer *bufp;
 279
 280   if (running_asynch_code)
 281     save_search_regs ();
 282
 283   /* This is so set_image_of_range_1 in regex.c can find the EQV table.  */
 284   XCHAR_TABLE (BVAR (current_buffer, case_canon_table))->extras[2]
 285     = BVAR (current_buffer, case_eqv_table);
 286
 287   CHECK_STRING (string);
 288   bufp = compile_pattern (string,
 289                           (NILP (Vinhibit_changing_match_data)
 290                            ? &search_regs : NULL),
 291                           (!NILP (BVAR (current_buffer, case_fold_search))
 292                            ? BVAR (current_buffer, case_canon_table) : Qnil),
 293                           posix,
 294                           !NILP (BVAR (current_buffer, enable_multibyte_characters)));
 295
 296   immediate_quit = 1;
 297   QUIT;                 /* Do a pending quit right away, to avoid paradoxical behavior */
 298
 299   /* Get pointers and sizes of the two strings
 300      that make up the visible portion of the buffer. */
 301
 302   p1 = BEGV_ADDR;
 303   s1 = GPT_BYTE - BEGV_BYTE;
 304   p2 = GAP_END_ADDR;
 305   s2 = ZV_BYTE - GPT_BYTE;
 306   if (s1 < 0)
 307     {
 308       p2 = p1;
 309       s2 = ZV_BYTE - BEGV_BYTE;
 310       s1 = 0;
 311     }
 312   if (s2 < 0)
 313     {
 314       s1 = ZV_BYTE - BEGV_BYTE;
 315       s2 = 0;
 316     }
 317
 318   re_match_object = Qnil;
 319
 320   i = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2,
 321                   PT_BYTE - BEGV_BYTE,
 322                   (NILP (Vinhibit_changing_match_data)
 323                    ? &search_regs : NULL),
 324                   ZV_BYTE - BEGV_BYTE);
 325   immediate_quit = 0;
 326
 327   if (i == -2)
 328     matcher_overflow ();
 329
 330   val = (0 <= i ? Qt : Qnil);
 331   if (NILP (Vinhibit_changing_match_data) && i >= 0)
 332     for (i = 0; i < search_regs.num_regs; i++)
 333       if (search_regs.start[i] >= 0)
 334         {
 335           search_regs.start[i]
 336             = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
 337           search_regs.end[i]
 338             = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
 339         }
 340
 341   /* Set last_thing_searched only when match data is changed.  */
 342   if (NILP (Vinhibit_changing_match_data))
 343     XSETBUFFER (last_thing_searched, current_buffer);
 344
 345   return val;
 346 }
 347
 348 DEFUN ("looking-at", Flooking_at, Slooking_at, 1, 1, 0,
 349        doc: /* Return t if text after point matches regular expression REGEXP.
 350 This function modifies the match data that `match-beginning',
 351 `match-end' and `match-data' access; save and restore the match
 352 data if you want to preserve them.  */)
 353   (Lisp_Object regexp)
 354 {
 355   return looking_at_1 (regexp, 0);
 356 }
 357
 358 DEFUN ("posix-looking-at", Fposix_looking_at, Sposix_looking_at, 1, 1, 0,
 359        doc: /* Return t if text after point matches regular expression REGEXP.
 360 Find the longest match, in accord with Posix regular expression rules.
 361 This function modifies the match data that `match-beginning',
 362 `match-end' and `match-data' access; save and restore the match
 363 data if you want to preserve them.  */)
 364   (Lisp_Object regexp)
 365 {
 366   return looking_at_1 (regexp, 1);
 367 }
 368 \f
 369 static Lisp_Object
 370 string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, int posix)
 371 {
 372   int val;
 373   struct re_pattern_buffer *bufp;
 374   EMACS_INT pos, pos_byte;
 375   int i;
 376
 377   if (running_asynch_code)
 378     save_search_regs ();
 379
 380   CHECK_STRING (regexp);
 381   CHECK_STRING (string);
 382
 383   if (NILP (start))
 384     pos = 0, pos_byte = 0;
 385   else
 386     {
 387       EMACS_INT len = SCHARS (string);
 388
 389       CHECK_NUMBER (start);
 390       pos = XINT (start);
 391       if (pos < 0 && -pos <= len)
 392         pos = len + pos;
 393       else if (0 > pos || pos > len)
 394         args_out_of_range (string, start);
 395       pos_byte = string_char_to_byte (string, pos);
 396     }
 397
 398   /* This is so set_image_of_range_1 in regex.c can find the EQV table.  */
 399   XCHAR_TABLE (BVAR (current_buffer, case_canon_table))->extras[2]
 400     = BVAR (current_buffer, case_eqv_table);
 401
 402   bufp = compile_pattern (regexp,
 403                           (NILP (Vinhibit_changing_match_data)
 404                            ? &search_regs : NULL),
 405                           (!NILP (BVAR (current_buffer, case_fold_search))
 406                            ? BVAR (current_buffer, case_canon_table) : Qnil),
 407                           posix,
 408                           STRING_MULTIBYTE (string));
 409   immediate_quit = 1;
 410   re_match_object = string;
 411
 412   val = re_search (bufp, SSDATA (string),
 413                    SBYTES (string), pos_byte,
 414                    SBYTES (string) - pos_byte,
 415                    (NILP (Vinhibit_changing_match_data)
 416                     ? &search_regs : NULL));
 417   immediate_quit = 0;
 418
 419   /* Set last_thing_searched only when match data is changed.  */
 420   if (NILP (Vinhibit_changing_match_data))
 421     last_thing_searched = Qt;
 422
 423   if (val == -2)
 424     matcher_overflow ();
 425   if (val < 0) return Qnil;
 426
 427   if (NILP (Vinhibit_changing_match_data))
 428     for (i = 0; i < search_regs.num_regs; i++)
 429       if (search_regs.start[i] >= 0)
 430         {
 431           search_regs.start[i]
 432             = string_byte_to_char (string, search_regs.start[i]);
 433           search_regs.end[i]
 434             = string_byte_to_char (string, search_regs.end[i]);
 435         }
 436
 437   return make_number (string_byte_to_char (string, val));
 438 }
 439
 440 DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0,
 441        doc: /* Return index of start of first match for REGEXP in STRING, or nil.
 442 Matching ignores case if `case-fold-search' is non-nil.
 443 If third arg START is non-nil, start search at that index in STRING.
 444 For index of first char beyond the match, do (match-end 0).
 445 `match-end' and `match-beginning' also give indices of substrings
 446 matched by parenthesis constructs in the pattern.
 447
 448 You can use the function `match-string' to extract the substrings
 449 matched by the parenthesis constructions in REGEXP. */)
 450   (Lisp_Object regexp, Lisp_Object string, Lisp_Object start)
 451 {
 452   return string_match_1 (regexp, string, start, 0);
 453 }
 454
 455 DEFUN ("posix-string-match", Fposix_string_match, Sposix_string_match, 2, 3, 0,
 456        doc: /* Return index of start of first match for REGEXP in STRING, or nil.
 457 Find the longest match, in accord with Posix regular expression rules.
 458 Case is ignored if `case-fold-search' is non-nil in the current buffer.
 459 If third arg START is non-nil, start search at that index in STRING.
 460 For index of first char beyond the match, do (match-end 0).
 461 `match-end' and `match-beginning' also give indices of substrings
 462 matched by parenthesis constructs in the pattern.  */)
 463   (Lisp_Object regexp, Lisp_Object string, Lisp_Object start)
 464 {
 465   return string_match_1 (regexp, string, start, 1);
 466 }
 467
 468 /* Match REGEXP against STRING, searching all of STRING,
 469    and return the index of the match, or negative on failure.
 470    This does not clobber the match data.  */
 471
 472 int
 473 fast_string_match (Lisp_Object regexp, Lisp_Object string)
 474 {
 475   int val;
 476   struct re_pattern_buffer *bufp;
 477
 478   bufp = compile_pattern (regexp, 0, Qnil,
 479                           0, STRING_MULTIBYTE (string));
 480   immediate_quit = 1;
 481   re_match_object = string;
 482
 483   val = re_search (bufp, SSDATA (string),
 484                    SBYTES (string), 0,
 485                    SBYTES (string), 0);
 486   immediate_quit = 0;
 487   return val;
 488 }
 489
 490 /* Match REGEXP against STRING, searching all of STRING ignoring case,
 491    and return the index of the match, or negative on failure.
 492    This does not clobber the match data.
 493    We assume that STRING contains single-byte characters.  */
 494
 495 int
 496 fast_c_string_match_ignore_case (Lisp_Object regexp, const char *string)
 497 {
 498   int val;
 499   struct re_pattern_buffer *bufp;
 500   size_t len = strlen (string);
 501
 502   regexp = string_make_unibyte (regexp);
 503   re_match_object = Qt;
 504   bufp = compile_pattern (regexp, 0,
 505                           Vascii_canon_table, 0,
 506                           0);
 507   immediate_quit = 1;
 508   val = re_search (bufp, string, len, 0, len, 0);
 509   immediate_quit = 0;
 510   return val;
 511 }
 512
 513 /* Like fast_string_match but ignore case.  */
 514
 515 int
 516 fast_string_match_ignore_case (Lisp_Object regexp, Lisp_Object string)
 517 {
 518   int val;
 519   struct re_pattern_buffer *bufp;
 520
 521   bufp = compile_pattern (regexp, 0, Vascii_canon_table,
 522                           0, STRING_MULTIBYTE (string));
 523   immediate_quit = 1;
 524   re_match_object = string;
 525
 526   val = re_search (bufp, SSDATA (string),
 527                    SBYTES (string), 0,
 528                    SBYTES (string), 0);
 529   immediate_quit = 0;
 530   return val;
 531 }
 532 \f
 533 /* Match REGEXP against the characters after POS to LIMIT, and return
 534    the number of matched characters.  If STRING is non-nil, match
 535    against the characters in it.  In that case, POS and LIMIT are
 536    indices into the string.  This function doesn't modify the match
 537    data.  */
 538
 539 EMACS_INT
 540 fast_looking_at (Lisp_Object regexp, EMACS_INT pos, EMACS_INT pos_byte, EMACS_INT limit, EMACS_INT limit_byte, Lisp_Object string)
 541 {
 542   int multibyte;
 543   struct re_pattern_buffer *buf;
 544   unsigned char *p1, *p2;
 545   EMACS_INT s1, s2;
 546   EMACS_INT len;
 547
 548   if (STRINGP (string))
 549     {
 550       if (pos_byte < 0)
 551         pos_byte = string_char_to_byte (string, pos);
 552       if (limit_byte < 0)
 553         limit_byte = string_char_to_byte (string, limit);
 554       p1 = NULL;
 555       s1 = 0;
 556       p2 = SDATA (string);
 557       s2 = SBYTES (string);
 558       re_match_object = string;
 559       multibyte = STRING_MULTIBYTE (string);
 560     }
 561   else
 562     {
 563       if (pos_byte < 0)
 564         pos_byte = CHAR_TO_BYTE (pos);
 565       if (limit_byte < 0)
 566         limit_byte = CHAR_TO_BYTE (limit);
 567       pos_byte -= BEGV_BYTE;
 568       limit_byte -= BEGV_BYTE;
 569       p1 = BEGV_ADDR;
 570       s1 = GPT_BYTE - BEGV_BYTE;
 571       p2 = GAP_END_ADDR;
 572       s2 = ZV_BYTE - GPT_BYTE;
 573       if (s1 < 0)
 574         {
 575           p2 = p1;
 576           s2 = ZV_BYTE - BEGV_BYTE;
 577           s1 = 0;
 578         }
 579       if (s2 < 0)
 580         {
 581           s1 = ZV_BYTE - BEGV_BYTE;
 582           s2 = 0;
 583         }
 584       re_match_object = Qnil;
 585       multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
 586     }
 587
 588   buf = compile_pattern (regexp, 0, Qnil, 0, multibyte);
 589   immediate_quit = 1;
 590   len = re_match_2 (buf, (char *) p1, s1, (char *) p2, s2,
 591                     pos_byte, NULL, limit_byte);
 592   immediate_quit = 0;
 593
 594   return len;
 595 }
 596
 597 \f
 598 /* The newline cache: remembering which sections of text have no newlines.  */
 599
 600 /* If the user has requested newline caching, make sure it's on.
 601    Otherwise, make sure it's off.
 602    This is our cheezy way of associating an action with the change of
 603    state of a buffer-local variable.  */
 604 static void
 605 newline_cache_on_off (struct buffer *buf)
 606 {
 607   if (NILP (BVAR (buf, cache_long_line_scans)))
 608     {
 609       /* It should be off.  */
 610       if (buf->newline_cache)
 611         {
 612           free_region_cache (buf->newline_cache);
 613           buf->newline_cache = 0;
 614         }
 615     }
 616   else
 617     {
 618       /* It should be on.  */
 619       if (buf->newline_cache == 0)
 620         buf->newline_cache = new_region_cache ();
 621     }
 622 }
 623
 624 \f
/* Search for COUNT instances of the character TARGET between START and END.

   If COUNT is positive, search forwards; END must be >= START.
   If COUNT is negative, search backwards for the -COUNTth instance;
      END must be <= START.
   If COUNT is zero, do anything you please; run rogue, for all I care.

   If END is zero, use BEGV or ZV instead, as appropriate for the
   direction indicated by COUNT.

   If we find COUNT instances, set *SHORTAGE to zero, and return the
   position past the COUNTth match.  Note that for reverse motion
   this is not the same as the usual convention for Emacs motion commands.

   If we don't find COUNT instances before reaching END, set *SHORTAGE
   to the number of TARGETs left unfound, and return END.

   If ALLOW_QUIT is non-zero, set immediate_quit.  That's good to do
   except when inside redisplay.

   START, END and the return value are character positions; they are
   converted to and from byte positions internally with CHAR_TO_BYTE
   and BYTE_TO_CHAR.  SHORTAGE may be a null pointer, in which case
   nothing is stored through it.  immediate_quit is reset to 0 before
   returning.

   When TARGET is '\n' and the current buffer has a newline cache,
   that cache is consulted to skip text already known to contain no
   newline, and it is updated with what the scan learns.  */

EMACS_INT
scan_buffer (register int target, EMACS_INT start, EMACS_INT end,
             EMACS_INT count, int *shortage, int allow_quit)
{
  struct region_cache *newline_cache;
  /* +1 when scanning forwards, -1 when scanning backwards.  Only used
     at the end to make the reported shortage non-negative.  */
  int direction;

  if (count > 0)
    {
      direction = 1;
      if (! end) end = ZV;
    }
  else
    {
      direction = -1;
      if (! end) end = BEGV;
    }

  /* Create or free the newline cache as the buffer's settings demand
     before looking at it.  */
  newline_cache_on_off (current_buffer);
  newline_cache = current_buffer->newline_cache;

  if (shortage != 0)
    *shortage = 0;

  immediate_quit = allow_quit;

  if (count > 0)
    while (start != end)
      {
        /* Our innermost scanning loop is very simple; it doesn't know
           about gaps, buffer ends, or the newline cache.  ceiling is
           the position of the last character before the next such
           obstacle --- the last character the dumb search loop should
           examine.  */
        EMACS_INT ceiling_byte = CHAR_TO_BYTE (end) - 1;
        EMACS_INT start_byte = CHAR_TO_BYTE (start);
        EMACS_INT tem;

        /* If we're looking for a newline, consult the newline cache
           to see where we can avoid some scanning.  */
        if (target == '\n' && newline_cache)
          {
            EMACS_INT next_change;
            /* immediate_quit is cleared while the cache is consulted
               and restored right afterwards.  */
            immediate_quit = 0;
            while (region_cache_forward
                   (current_buffer, newline_cache, start_byte, &next_change))
              start_byte = next_change;
            immediate_quit = allow_quit;

            /* START should never be after END.  */
            if (start_byte > ceiling_byte)
              start_byte = ceiling_byte;

            /* Now the text after start is an unknown region, and
               next_change is the position of the next known region. */
            ceiling_byte = min (next_change - 1, ceiling_byte);
          }

        /* The dumb loop can only scan text stored in contiguous
           bytes. BUFFER_CEILING_OF returns the last character
           position that is contiguous, so the ceiling is the
           position after that.  */
        tem = BUFFER_CEILING_OF (start_byte);
        ceiling_byte = min (tem, ceiling_byte);

        {
          /* The termination address of the dumb loop.  */
          register unsigned char *ceiling_addr
            = BYTE_POS_ADDR (ceiling_byte) + 1;
          register unsigned char *cursor
            = BYTE_POS_ADDR (start_byte);
          /* Address corresponding to start_byte; the byte position of
             CURSOR is start_byte + cursor - base.  */
          unsigned char *base = cursor;

          while (cursor < ceiling_addr)
            {
              unsigned char *scan_start = cursor;

              /* The dumb loop.  */
              while (*cursor != target && ++cursor < ceiling_addr)
                ;

              /* If we're looking for newlines, cache the fact that
                 the region from start to cursor is free of them. */
              if (target == '\n' && newline_cache)
                know_region_cache (current_buffer, newline_cache,
                                   start_byte + scan_start - base,
                                   start_byte + cursor - base);

              /* Did we find the target character?  */
              if (cursor < ceiling_addr)
                {
                  if (--count == 0)
                    {
                      immediate_quit = 0;
                      /* The + 1 yields the position just past the
                         match.  */
                      return BYTE_TO_CHAR (start_byte + cursor - base + 1);
                    }
                  cursor++;
                }
            }

          /* Resume the outer loop where this stretch ended.  */
          start = BYTE_TO_CHAR (start_byte + cursor - base);
        }
      }
  else
    while (start > end)
      {
        /* The last character to check before the next obstacle.  */
        EMACS_INT ceiling_byte = CHAR_TO_BYTE (end);
        EMACS_INT start_byte = CHAR_TO_BYTE (start);
        EMACS_INT tem;

        /* Consult the newline cache, if appropriate.  */
        if (target == '\n' && newline_cache)
          {
            EMACS_INT next_change;
            /* As in the forward case, immediate_quit is cleared while
               the cache is consulted.  */
            immediate_quit = 0;
            while (region_cache_backward
                   (current_buffer, newline_cache, start_byte, &next_change))
              start_byte = next_change;
            immediate_quit = allow_quit;

            /* Start should never be at or before end.  */
            if (start_byte <= ceiling_byte)
              start_byte = ceiling_byte + 1;

            /* Now the text before start is an unknown region, and
               next_change is the position of the next known region. */
            ceiling_byte = max (next_change, ceiling_byte);
          }

        /* Stop scanning before the gap.  */
        tem = BUFFER_FLOOR_OF (start_byte - 1);
        ceiling_byte = max (tem, ceiling_byte);

        {
          /* The termination address of the dumb loop.  */
          register unsigned char *ceiling_addr = BYTE_POS_ADDR (ceiling_byte);
          register unsigned char *cursor = BYTE_POS_ADDR (start_byte - 1);
          /* Address corresponding to start_byte - 1, so that
             start_byte + cursor - base is the byte position just past
             the character CURSOR points at.  */
          unsigned char *base = cursor;

          while (cursor >= ceiling_addr)
            {
              unsigned char *scan_start = cursor;

              /* The dumb loop, running backwards.  */
              while (*cursor != target && --cursor >= ceiling_addr)
                ;

              /* If we're looking for newlines, cache the fact that
                 the region from after the cursor to start is free of them.  */
              if (target == '\n' && newline_cache)
                know_region_cache (current_buffer, newline_cache,
                                   start_byte + cursor - base,
                                   start_byte + scan_start - base);

              /* Did we find the target character?  */
              if (cursor >= ceiling_addr)
                {
                  /* COUNT is negative here and counts up towards zero.  */
                  if (++count >= 0)
                    {
                      immediate_quit = 0;
                      return BYTE_TO_CHAR (start_byte + cursor - base);
                    }
                  cursor--;
                }
            }

          /* Resume the outer loop where this stretch ended.  */
          start = BYTE_TO_CHAR (start_byte + cursor - base);
        }
      }

  /* Not enough instances found.  COUNT holds what is left, with the
     sign of the scan direction.  */
  immediate_quit = 0;
  if (shortage != 0)
    *shortage = count * direction;
  return start;
}
 820 \f
 821 /* Search for COUNT instances of a line boundary, which means either a
 822    newline or (if selective display enabled) a carriage return.
 823    Start at START.  If COUNT is negative, search backwards.
 824
 825    We report the resulting position by calling TEMP_SET_PT_BOTH.
 826
  827    If we find COUNT instances, we position after (always after,
 828    even if scanning backwards) the COUNTth match, and return 0.
 829
 830    If we don't find COUNT instances before reaching the end of the
 831    buffer (or the beginning, if scanning backwards), we return
 832    the number of line boundaries left unfound, and position at
 833    the limit we bumped up against.
 834
 835    If ALLOW_QUIT is non-zero, set immediate_quit.  That's good to do
 836    except in special cases.  */
 837
 838 EMACS_INT
 839 scan_newline (EMACS_INT start, EMACS_INT start_byte,
 840               EMACS_INT limit, EMACS_INT limit_byte,
 841               register EMACS_INT count, int allow_quit)
 842 {
 843   int direction = ((count > 0) ? 1 : -1);
 844
 845   register unsigned char *cursor;
 846   unsigned char *base;
 847
 848   EMACS_INT ceiling;
 849   register unsigned char *ceiling_addr;
 850
 851   int old_immediate_quit = immediate_quit;
 852
 853   /* The code that follows is like scan_buffer
 854      but checks for either newline or carriage return.  */
 855
 856   if (allow_quit)
 857     immediate_quit++;
 858
 859   start_byte = CHAR_TO_BYTE (start);
 860
 861   if (count > 0)
 862     {
 863       while (start_byte < limit_byte)
 864         {
 865           ceiling =  BUFFER_CEILING_OF (start_byte);
 866           ceiling = min (limit_byte - 1, ceiling);
 867           ceiling_addr = BYTE_POS_ADDR (ceiling) + 1;
 868           base = (cursor = BYTE_POS_ADDR (start_byte));
 869           while (1)
 870             {
 871               while (*cursor != '\n' && ++cursor != ceiling_addr)
 872                 ;
 873
 874               if (cursor != ceiling_addr)
 875                 {
 876                   if (--count == 0)
 877                     {
 878                       immediate_quit = old_immediate_quit;
 879                       start_byte = start_byte + cursor - base + 1;
 880                       start = BYTE_TO_CHAR (start_byte);
 881                       TEMP_SET_PT_BOTH (start, start_byte);
 882                       return 0;
 883                     }
 884                   else
 885                     if (++cursor == ceiling_addr)
 886                       break;
 887                 }
 888               else
 889                 break;
 890             }
 891           start_byte += cursor - base;
 892         }
 893     }
 894   else
 895     {
 896       while (start_byte > limit_byte)
 897         {
 898           ceiling = BUFFER_FLOOR_OF (start_byte - 1);
 899           ceiling = max (limit_byte, ceiling);
 900           ceiling_addr = BYTE_POS_ADDR (ceiling) - 1;
 901           base = (cursor = BYTE_POS_ADDR (start_byte - 1) + 1);
 902           while (1)
 903             {
 904               while (--cursor != ceiling_addr && *cursor != '\n')
 905                 ;
 906
 907               if (cursor != ceiling_addr)
 908                 {
 909                   if (++count == 0)
 910                     {
 911                       immediate_quit = old_immediate_quit;
 912                       /* Return the position AFTER the match we found.  */
 913                       start_byte = start_byte + cursor - base + 1;
 914                       start = BYTE_TO_CHAR (start_byte);
 915                       TEMP_SET_PT_BOTH (start, start_byte);
 916                       return 0;
 917                     }
 918                 }
 919               else
 920                 break;
 921             }
 922           /* Here we add 1 to compensate for the last decrement
 923              of CURSOR, which took it past the valid range.  */
 924           start_byte += cursor - base + 1;
 925         }
 926     }
 927
 928   TEMP_SET_PT_BOTH (limit, limit_byte);
 929   immediate_quit = old_immediate_quit;
 930
 931   return count * direction;
 932 }
 933
 934 EMACS_INT
 935 find_next_newline_no_quit (EMACS_INT from, EMACS_INT cnt)
 936 {
 937   return scan_buffer ('\n', from, 0, cnt, (int *) 0, 0);
 938 }
 939
 940 /* Like find_next_newline, but returns position before the newline,
 941    not after, and only search up to TO.  This isn't just
 942    find_next_newline (...)-1, because you might hit TO.  */
 943
 944 EMACS_INT
 945 find_before_next_newline (EMACS_INT from, EMACS_INT to, EMACS_INT cnt)
 946 {
 947   int shortage;
 948   EMACS_INT pos = scan_buffer ('\n', from, to, cnt, &shortage, 1);
 949
 950   if (shortage == 0)
 951     pos--;
 952
 953   return pos;
 954 }
 955 \f
 956 /* Subroutines of Lisp buffer search functions. */
 957
 958 static Lisp_Object
 959 search_command (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror,
 960                 Lisp_Object count, int direction, int RE, int posix)
 961 {
 962   register int np;
 963   EMACS_INT lim, lim_byte;
 964   int n = direction;
 965
 966   if (!NILP (count))
 967     {
 968       CHECK_NUMBER (count);
 969       n *= XINT (count);
 970     }
 971
 972   CHECK_STRING (string);
 973   if (NILP (bound))
 974     {
 975       if (n > 0)
 976         lim = ZV, lim_byte = ZV_BYTE;
 977       else
 978         lim = BEGV, lim_byte = BEGV_BYTE;
 979     }
 980   else
 981     {
 982       CHECK_NUMBER_COERCE_MARKER (bound);
 983       lim = XINT (bound);
 984       if (n > 0 ? lim < PT : lim > PT)
 985         error ("Invalid search bound (wrong side of point)");
 986       if (lim > ZV)
 987         lim = ZV, lim_byte = ZV_BYTE;
 988       else if (lim < BEGV)
 989         lim = BEGV, lim_byte = BEGV_BYTE;
 990       else
 991         lim_byte = CHAR_TO_BYTE (lim);
 992     }
 993
 994   /* This is so set_image_of_range_1 in regex.c can find the EQV table.  */
 995   XCHAR_TABLE (BVAR (current_buffer, case_canon_table))->extras[2]
 996     = BVAR (current_buffer, case_eqv_table);
 997
 998   np = search_buffer (string, PT, PT_BYTE, lim, lim_byte, n, RE,
 999                       (!NILP (BVAR (current_buffer, case_fold_search))
1000                        ? BVAR (current_buffer, case_canon_table)
1001                        : Qnil),
1002                       (!NILP (BVAR (current_buffer, case_fold_search))
1003                        ? BVAR (current_buffer, case_eqv_table)
1004                        : Qnil),
1005                       posix);
1006   if (np <= 0)
1007     {
1008       if (NILP (noerror))
1009         xsignal1 (Qsearch_failed, string);
1010
1011       if (!EQ (noerror, Qt))
1012         {
1013           if (lim < BEGV || lim > ZV)
1014             abort ();
1015           SET_PT_BOTH (lim, lim_byte);
1016           return Qnil;
1017 #if 0 /* This would be clean, but maybe programs depend on
1018          a value of nil here.  */
1019           np = lim;
1020 #endif
1021         }
1022       else
1023         return Qnil;
1024     }
1025
1026   if (np < BEGV || np > ZV)
1027     abort ();
1028
1029   SET_PT (np);
1030
1031   return make_number (np);
1032 }
1033 \f
1034 /* Return 1 if REGEXP it matches just one constant string.  */
1035
1036 static int
1037 trivial_regexp_p (Lisp_Object regexp)
1038 {
1039   EMACS_INT len = SBYTES (regexp);
1040   unsigned char *s = SDATA (regexp);
1041   while (--len >= 0)
1042     {
1043       switch (*s++)
1044         {
1045         case '.': case '*': case '+': case '?': case '[': case '^': case '$':
1046           return 0;
1047         case '\\':
1048           if (--len < 0)
1049             return 0;
1050           switch (*s++)
1051             {
1052             case '|': case '(': case ')': case '`': case '\'': case 'b':
1053             case 'B': case '<': case '>': case 'w': case 'W': case 's':
1054             case 'S': case '=': case '{': case '}': case '_':
1055             case 'c': case 'C': /* for categoryspec and notcategoryspec */
1056             case '1': case '2': case '3': case '4': case '5':
1057             case '6': case '7': case '8': case '9':
1058               return 0;
1059             }
1060         }
1061     }
1062   return 1;
1063 }
1064
1065 /* Search for the n'th occurrence of STRING in the current buffer,
1066    starting at position POS and stopping at position LIM,
1067    treating STRING as a literal string if RE is false or as
1068    a regular expression if RE is true.
1069
1070    If N is positive, searching is forward and LIM must be greater than POS.
1071    If N is negative, searching is backward and LIM must be less than POS.
1072
1073    Returns -x if x occurrences remain to be found (x > 0),
1074    or else the position at the beginning of the Nth occurrence
1075    (if searching backward) or the end (if searching forward).
1076
1077    POSIX is nonzero if we want full backtracking (POSIX style)
1078    for this pattern.  0 means backtrack only enough to get a valid match.  */
1079
/* Set OUT to the translation of character D according to table TRT.
   If TRT is nil, or if its entry for D is not an integer, OUT is set
   to D itself.  TRT and D may each be evaluated more than once.  */
#define TRANSLATE(out, trt, d)                                  \
do                                                              \
  {                                                             \
    if (NILP (trt))                                             \
      out = d;                                                  \
    else                                                        \
      {                                                         \
        Lisp_Object trt_entry_ = Faref (trt, make_number (d));  \
        if (INTEGERP (trt_entry_))                              \
          out = XINT (trt_entry_);                              \
        else                                                    \
          out = d;                                              \
      }                                                         \
  }                                                             \
while (0)
1096
1097 /* Only used in search_buffer, to record the end position of the match
1098    when searching regexps and SEARCH_REGS should not be changed
1099    (i.e. Vinhibit_changing_match_data is non-nil).  */
1100 static struct re_registers search_regs_1;
1101
1102 static EMACS_INT
1103 search_buffer (Lisp_Object string, EMACS_INT pos, EMACS_INT pos_byte,
1104                EMACS_INT lim, EMACS_INT lim_byte, EMACS_INT n,
1105                int RE, Lisp_Object trt, Lisp_Object inverse_trt, int posix)
1106 {
1107   EMACS_INT len = SCHARS (string);
1108   EMACS_INT len_byte = SBYTES (string);
1109   register int i;
1110
1111   if (running_asynch_code)
1112     save_search_regs ();
1113
1114   /* Searching 0 times means don't move.  */
1115   /* Null string is found at starting position.  */
1116   if (len == 0 || n == 0)
1117     {
1118       set_search_regs (pos_byte, 0);
1119       return pos;
1120     }
1121
1122   if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_spaces_regexp)))
1123     {
1124       unsigned char *p1, *p2;
1125       EMACS_INT s1, s2;
1126       struct re_pattern_buffer *bufp;
1127
1128       bufp = compile_pattern (string,
1129                               (NILP (Vinhibit_changing_match_data)
1130                                ? &search_regs : &search_regs_1),
1131                               trt, posix,
1132                               !NILP (BVAR (current_buffer, enable_multibyte_characters)));
1133
1134       immediate_quit = 1;       /* Quit immediately if user types ^G,
1135                                    because letting this function finish
1136                                    can take too long. */
1137       QUIT;                     /* Do a pending quit right away,
1138                                    to avoid paradoxical behavior */
1139       /* Get pointers and sizes of the two strings
1140          that make up the visible portion of the buffer. */
1141
1142       p1 = BEGV_ADDR;
1143       s1 = GPT_BYTE - BEGV_BYTE;
1144       p2 = GAP_END_ADDR;
1145       s2 = ZV_BYTE - GPT_BYTE;
1146       if (s1 < 0)
1147         {
1148           p2 = p1;
1149           s2 = ZV_BYTE - BEGV_BYTE;
1150           s1 = 0;
1151         }
1152       if (s2 < 0)
1153         {
1154           s1 = ZV_BYTE - BEGV_BYTE;
1155           s2 = 0;
1156         }
1157       re_match_object = Qnil;
1158
1159       while (n < 0)
1160         {
1161           EMACS_INT val;
1162           val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
1163                              pos_byte - BEGV_BYTE, lim_byte - pos_byte,
1164                              (NILP (Vinhibit_changing_match_data)
1165                               ? &search_regs : &search_regs_1),
1166                              /* Don't allow match past current point */
1167                              pos_byte - BEGV_BYTE);
1168           if (val == -2)
1169             {
1170               matcher_overflow ();
1171             }
1172           if (val >= 0)
1173             {
1174               if (NILP (Vinhibit_changing_match_data))
1175                 {
1176                   pos_byte = search_regs.start[0] + BEGV_BYTE;
1177                   for (i = 0; i < search_regs.num_regs; i++)
1178                     if (search_regs.start[i] >= 0)
1179                       {
1180                         search_regs.start[i]
1181                           = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
1182                         search_regs.end[i]
1183                           = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
1184                       }
1185                   XSETBUFFER (last_thing_searched, current_buffer);
1186                   /* Set pos to the new position. */
1187                   pos = search_regs.start[0];
1188                 }
1189               else
1190                 {
1191                   pos_byte = search_regs_1.start[0] + BEGV_BYTE;
1192                   /* Set pos to the new position.  */
1193                   pos = BYTE_TO_CHAR (search_regs_1.start[0] + BEGV_BYTE);
1194                 }
1195             }
1196           else
1197             {
1198               immediate_quit = 0;
1199               return (n);
1200             }
1201           n++;
1202         }
1203       while (n > 0)
1204         {
1205           EMACS_INT val;
1206           val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
1207                              pos_byte - BEGV_BYTE, lim_byte - pos_byte,
1208                              (NILP (Vinhibit_changing_match_data)
1209                               ? &search_regs : &search_regs_1),
1210                              lim_byte - BEGV_BYTE);
1211           if (val == -2)
1212             {
1213               matcher_overflow ();
1214             }
1215           if (val >= 0)
1216             {
1217               if (NILP (Vinhibit_changing_match_data))
1218                 {
1219                   pos_byte = search_regs.end[0] + BEGV_BYTE;
1220                   for (i = 0; i < search_regs.num_regs; i++)
1221                     if (search_regs.start[i] >= 0)
1222                       {
1223                         search_regs.start[i]
1224                           = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
1225                         search_regs.end[i]
1226                           = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
1227                       }
1228                   XSETBUFFER (last_thing_searched, current_buffer);
1229                   pos = search_regs.end[0];
1230                 }
1231               else
1232                 {
1233                   pos_byte = search_regs_1.end[0] + BEGV_BYTE;
1234                   pos = BYTE_TO_CHAR (search_regs_1.end[0] + BEGV_BYTE);
1235                 }
1236             }
1237           else
1238             {
1239               immediate_quit = 0;
1240               return (0 - n);
1241             }
1242           n--;
1243         }
1244       immediate_quit = 0;
1245       return (pos);
1246     }
1247   else                          /* non-RE case */
1248     {
1249       unsigned char *raw_pattern, *pat;
1250       EMACS_INT raw_pattern_size;
1251       EMACS_INT raw_pattern_size_byte;
1252       unsigned char *patbuf;
1253       int multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters));
1254       unsigned char *base_pat;
1255       /* Set to positive if we find a non-ASCII char that need
1256          translation.  Otherwise set to zero later.  */
1257       int char_base = -1;
1258       int boyer_moore_ok = 1;
1259
1260       /* MULTIBYTE says whether the text to be searched is multibyte.
1261          We must convert PATTERN to match that, or we will not really
1262          find things right.  */
1263
1264       if (multibyte == STRING_MULTIBYTE (string))
1265         {
1266           raw_pattern = SDATA (string);
1267           raw_pattern_size = SCHARS (string);
1268           raw_pattern_size_byte = SBYTES (string);
1269         }
1270       else if (multibyte)
1271         {
1272           raw_pattern_size = SCHARS (string);
1273           raw_pattern_size_byte
1274             = count_size_as_multibyte (SDATA (string),
1275                                        raw_pattern_size);
1276           raw_pattern = (unsigned char *) alloca (raw_pattern_size_byte + 1);
1277           copy_text (SDATA (string), raw_pattern,
1278                      SCHARS (string), 0, 1);
1279         }
1280       else
1281         {
1282           /* Converting multibyte to single-byte.
1283
1284              ??? Perhaps this conversion should be done in a special way
1285              by subtracting nonascii-insert-offset from each non-ASCII char,
1286              so that only the multibyte chars which really correspond to
1287              the chosen single-byte character set can possibly match.  */
1288           raw_pattern_size = SCHARS (string);
1289           raw_pattern_size_byte = SCHARS (string);
1290           raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
1291           copy_text (SDATA (string), raw_pattern,
1292                      SBYTES (string), 1, 0);
1293         }
1294
1295       /* Copy and optionally translate the pattern.  */
1296       len = raw_pattern_size;
1297       len_byte = raw_pattern_size_byte;
1298       patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH);
1299       pat = patbuf;
1300       base_pat = raw_pattern;
1301       if (multibyte)
1302         {
1303           /* Fill patbuf by translated characters in STRING while
1304              checking if we can use boyer-moore search.  If TRT is
1305              non-nil, we can use boyer-moore search only if TRT can be
1306              represented by the byte array of 256 elements.  For that,
1307              all non-ASCII case-equivalents of all case-senstive
1308              characters in STRING must belong to the same charset and
1309              row.  */
1310
1311           while (--len >= 0)
1312             {
1313               unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
1314               int c, translated, inverse;
1315               int in_charlen, charlen;
1316
1317               /* If we got here and the RE flag is set, it's because we're
1318                  dealing with a regexp known to be trivial, so the backslash
1319                  just quotes the next character.  */
1320               if (RE && *base_pat == '\\')
1321                 {
1322                   len--;
1323                   raw_pattern_size--;
1324                   len_byte--;
1325                   base_pat++;
1326                 }
1327
1328               c = STRING_CHAR_AND_LENGTH (base_pat, in_charlen);
1329
1330               if (NILP (trt))
1331                 {
1332                   str = base_pat;
1333                   charlen = in_charlen;
1334                 }
1335               else
1336                 {
1337                   /* Translate the character.  */
1338                   TRANSLATE (translated, trt, c);
1339                   charlen = CHAR_STRING (translated, str_base);
1340                   str = str_base;
1341
1342                   /* Check if C has any other case-equivalents.  */
1343                   TRANSLATE (inverse, inverse_trt, c);
1344                   /* If so, check if we can use boyer-moore.  */
1345                   if (c != inverse && boyer_moore_ok)
1346                     {
1347                       /* Check if all equivalents belong to the same
1348                          group of characters.  Note that the check of C
1349                          itself is done by the last iteration.  */
1350                       int this_char_base = -1;
1351
1352                       while (boyer_moore_ok)
1353                         {
1354                           if (ASCII_BYTE_P (inverse))
1355                             {
1356                               if (this_char_base > 0)
1357                                 boyer_moore_ok = 0;
1358                               else
1359                                 this_char_base = 0;
1360                             }
1361                           else if (CHAR_BYTE8_P (inverse))
1362                             /* Boyer-moore search can't handle a
1363                                translation of an eight-bit
1364                                character.  */
1365                             boyer_moore_ok = 0;
1366                           else if (this_char_base < 0)
1367                             {
1368                               this_char_base = inverse & ~0x3F;
1369                               if (char_base < 0)
1370                                 char_base = this_char_base;
1371                               else if (this_char_base != char_base)
1372                                 boyer_moore_ok = 0;
1373                             }
1374                           else if ((inverse & ~0x3F) != this_char_base)
1375                             boyer_moore_ok = 0;
1376                           if (c == inverse)
1377                             break;
1378                           TRANSLATE (inverse, inverse_trt, inverse);
1379                         }
1380                     }
1381                 }
1382
1383               /* Store this character into the translated pattern.  */
1384               memcpy (pat, str, charlen);
1385               pat += charlen;
1386               base_pat += in_charlen;
1387               len_byte -= in_charlen;
1388             }
1389
1390           /* If char_base is still negative we didn't find any translated
1391              non-ASCII characters.  */
1392           if (char_base < 0)
1393             char_base = 0;
1394         }
1395       else
1396         {
1397           /* Unibyte buffer.  */
1398           char_base = 0;
1399           while (--len >= 0)
1400             {
1401               int c, translated;
1402
1403               /* If we got here and the RE flag is set, it's because we're
1404                  dealing with a regexp known to be trivial, so the backslash
1405                  just quotes the next character.  */
1406               if (RE && *base_pat == '\\')
1407                 {
1408                   len--;
1409                   raw_pattern_size--;
1410                   base_pat++;
1411                 }
1412               c = *base_pat++;
1413               TRANSLATE (translated, trt, c);
1414               *pat++ = translated;
1415             }
1416         }
1417
1418       len_byte = pat - patbuf;
1419       len = raw_pattern_size;
1420       pat = base_pat = patbuf;
1421
1422       if (boyer_moore_ok)
1423         return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
1424                             pos, pos_byte, lim, lim_byte,
1425                             char_base);
1426       else
1427         return simple_search (n, pat, len, len_byte, trt,
1428                               pos, pos_byte, lim, lim_byte);
1429     }
1430 }
1431 \f
1432 /* Do a simple string search N times for the string PAT,
1433    whose length is LEN/LEN_BYTE,
1434    from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1435    TRT is the translation table.
1436
1437    Return the character position where the match is found.
1438    Otherwise, if M matches remained to be found, return -M.
1439
1440    This kind of search works regardless of what is in PAT and
1441    regardless of what is in TRT.  It is used in cases where
1442    boyer_moore cannot work.  */
1443
1444 static EMACS_INT
1445 simple_search (EMACS_INT n, unsigned char *pat,
1446                EMACS_INT len, EMACS_INT len_byte, Lisp_Object trt,
1447                EMACS_INT pos, EMACS_INT pos_byte,
1448                EMACS_INT lim, EMACS_INT lim_byte)
1449 {
1450   int multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1451   int forward = n > 0;
1452   /* Number of buffer bytes matched.  Note that this may be different
1453      from len_byte in a multibyte buffer.  */
1454   EMACS_INT match_byte;
1455
1456   if (lim > pos && multibyte)
1457     while (n > 0)
1458       {
1459         while (1)
1460           {
1461             /* Try matching at position POS.  */
1462             EMACS_INT this_pos = pos;
1463             EMACS_INT this_pos_byte = pos_byte;
1464             EMACS_INT this_len = len;
1465             unsigned char *p = pat;
1466             if (pos + len > lim || pos_byte + len_byte > lim_byte)
1467               goto stop;
1468
1469             while (this_len > 0)
1470               {
1471                 int charlen, buf_charlen;
1472                 int pat_ch, buf_ch;
1473
1474                 pat_ch = STRING_CHAR_AND_LENGTH (p, charlen);
1475                 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte),
1476                                                  buf_charlen);
1477                 TRANSLATE (buf_ch, trt, buf_ch);
1478
1479                 if (buf_ch != pat_ch)
1480                   break;
1481
1482                 this_len--;
1483                 p += charlen;
1484
1485                 this_pos_byte += buf_charlen;
1486                 this_pos++;
1487               }
1488
1489             if (this_len == 0)
1490               {
1491                 match_byte = this_pos_byte - pos_byte;
1492                 pos += len;
1493                 pos_byte += match_byte;
1494                 break;
1495               }
1496
1497             INC_BOTH (pos, pos_byte);
1498           }
1499
1500         n--;
1501       }
1502   else if (lim > pos)
1503     while (n > 0)
1504       {
1505         while (1)
1506           {
1507             /* Try matching at position POS.  */
1508             EMACS_INT this_pos = pos;
1509             EMACS_INT this_len = len;
1510             unsigned char *p = pat;
1511
1512             if (pos + len > lim)
1513               goto stop;
1514
1515             while (this_len > 0)
1516               {
1517                 int pat_ch = *p++;
1518                 int buf_ch = FETCH_BYTE (this_pos);
1519                 TRANSLATE (buf_ch, trt, buf_ch);
1520
1521                 if (buf_ch != pat_ch)
1522                   break;
1523
1524                 this_len--;
1525                 this_pos++;
1526               }
1527
1528             if (this_len == 0)
1529               {
1530                 match_byte = len;
1531                 pos += len;
1532                 break;
1533               }
1534
1535             pos++;
1536           }
1537
1538         n--;
1539       }
1540   /* Backwards search.  */
1541   else if (lim < pos && multibyte)
1542     while (n < 0)
1543       {
1544         while (1)
1545           {
1546             /* Try matching at position POS.  */
1547             EMACS_INT this_pos = pos;
1548             EMACS_INT this_pos_byte = pos_byte;
1549             EMACS_INT this_len = len;
1550             const unsigned char *p = pat + len_byte;
1551
1552             if (this_pos - len < lim || (pos_byte - len_byte) < lim_byte)
1553               goto stop;
1554
1555             while (this_len > 0)
1556               {
1557                 int pat_ch, buf_ch;
1558
1559                 DEC_BOTH (this_pos, this_pos_byte);
1560                 PREV_CHAR_BOUNDARY (p, pat);
1561                 pat_ch = STRING_CHAR (p);
1562                 buf_ch = STRING_CHAR (BYTE_POS_ADDR (this_pos_byte));
1563                 TRANSLATE (buf_ch, trt, buf_ch);
1564
1565                 if (buf_ch != pat_ch)
1566                   break;
1567
1568                 this_len--;
1569               }
1570
1571             if (this_len == 0)
1572               {
1573                 match_byte = pos_byte - this_pos_byte;
1574                 pos = this_pos;
1575                 pos_byte = this_pos_byte;
1576                 break;
1577               }
1578
1579             DEC_BOTH (pos, pos_byte);
1580           }
1581
1582         n++;
1583       }
1584   else if (lim < pos)
1585     while (n < 0)
1586       {
1587         while (1)
1588           {
1589             /* Try matching at position POS.  */
1590             EMACS_INT this_pos = pos - len;
1591             EMACS_INT this_len = len;
1592             unsigned char *p = pat;
1593
1594             if (this_pos < lim)
1595               goto stop;
1596
1597             while (this_len > 0)
1598               {
1599                 int pat_ch = *p++;
1600                 int buf_ch = FETCH_BYTE (this_pos);
1601                 TRANSLATE (buf_ch, trt, buf_ch);
1602
1603                 if (buf_ch != pat_ch)
1604                   break;
1605                 this_len--;
1606                 this_pos++;
1607               }
1608
1609             if (this_len == 0)
1610               {
1611                 match_byte = len;
1612                 pos -= len;
1613                 break;
1614               }
1615
1616             pos--;
1617           }
1618
1619         n++;
1620       }
1621
1622  stop:
1623   if (n == 0)
1624     {
1625       if (forward)
1626         set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte);
1627       else
1628         set_search_regs (multibyte ? pos_byte : pos, match_byte);
1629
1630       return pos;
1631     }
1632   else if (n > 0)
1633     return -n;
1634   else
1635     return n;
1636 }
1637 \f
1638 /* Do Boyer-Moore search N times for the string BASE_PAT,
1639    whose length is LEN/LEN_BYTE,
1640    from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1641    DIRECTION says which direction we search in.
1642    TRT and INVERSE_TRT are translation tables.
1643    Characters in PAT are already translated by TRT.
1644
1645    This kind of search works if all the characters in BASE_PAT that
1646    have nontrivial translation are the same aside from the last byte.
1647    This makes it possible to translate just the last byte of a
1648    character, and do so after just a simple test of the context.
1649    CHAR_BASE is nonzero if there is such a non-ASCII character.
1650
1651    If that criterion is not satisfied, do not call this function.  */
1652
1653 static EMACS_INT
1654 boyer_moore (EMACS_INT n, unsigned char *base_pat,
1655              EMACS_INT len, EMACS_INT len_byte,
1656              Lisp_Object trt, Lisp_Object inverse_trt,
1657              EMACS_INT pos, EMACS_INT pos_byte,
1658              EMACS_INT lim, EMACS_INT lim_byte, int char_base)
1659 {
1660   int direction = ((n > 0) ? 1 : -1);
1661   register EMACS_INT dirlen;
1662   EMACS_INT limit;
1663   int stride_for_teases = 0;
1664   int BM_tab[0400];
1665   register unsigned char *cursor, *p_limit;
1666   register EMACS_INT i;
1667   register int j;
1668   unsigned char *pat, *pat_end;
1669   int multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1670
1671   unsigned char simple_translate[0400];
1672   /* These are set to the preceding bytes of a byte to be translated
1673      if char_base is nonzero.  As the maximum byte length of a
1674      multibyte character is 5, we have to check at most four previous
1675      bytes.  */
1676   int translate_prev_byte1 = 0;
1677   int translate_prev_byte2 = 0;
1678   int translate_prev_byte3 = 0;
1679   int translate_prev_byte4 = 0;
1680
1681   /* The general approach is that we are going to maintain that we know
1682      the first (closest to the present position, in whatever direction
1683      we're searching) character that could possibly be the last
1684      (furthest from present position) character of a valid match.  We
1685      advance the state of our knowledge by looking at that character
1686      and seeing whether it indeed matches the last character of the
1687      pattern.  If it does, we take a closer look.  If it does not, we
1688      move our pointer (to putative last characters) as far as is
1689      logically possible.  This amount of movement, which I call a
1690      stride, will be the length of the pattern if the actual character
1691      appears nowhere in the pattern, otherwise it will be the distance
1692      from the last occurrence of that character to the end of the
1693      pattern.  If the amount is zero we have a possible match.  */
1694
1695   /* Here we make a "mickey mouse" BM table.  The stride of the search
1696      is determined only by the last character of the putative match.
1697      If that character does not match, we will stride the proper
1698      distance to propose a match that superimposes it on the last
1699      instance of a character that matches it (per trt), or misses
1700      it entirely if there is none. */
1701
1702   dirlen = len_byte * direction;
1703
1704   /* Record position after the end of the pattern.  */
1705   pat_end = base_pat + len_byte;
1706   /* BASE_PAT points to a character that we start scanning from.
1707      It is the first character in a forward search,
1708      the last character in a backward search.  */
1709   if (direction < 0)
1710     base_pat = pat_end - 1;
1711
1712   /* A character that does not appear in the pattern induces a
1713      stride equal to the pattern length.  */
1714   for (i = 0; i < 0400; i++)
1715     BM_tab[i] = dirlen;
1716
1717   /* We use this for translation, instead of TRT itself.
1718      We fill this in to handle the characters that actually
1719      occur in the pattern.  Others don't matter anyway!  */
1720   for (i = 0; i < 0400; i++)
1721     simple_translate[i] = i;
1722
1723   if (char_base)
1724     {
1725       /* Setup translate_prev_byte1/2/3/4 from CHAR_BASE.  Only a
1726          byte following them are the target of translation.  */
1727       unsigned char str[MAX_MULTIBYTE_LENGTH];
1728       int cblen = CHAR_STRING (char_base, str);
1729
1730       translate_prev_byte1 = str[cblen - 2];
1731       if (cblen > 2)
1732         {
1733           translate_prev_byte2 = str[cblen - 3];
1734           if (cblen > 3)
1735             {
1736               translate_prev_byte3 = str[cblen - 4];
1737               if (cblen > 4)
1738                 translate_prev_byte4 = str[cblen - 5];
1739             }
1740         }
1741     }
1742
1743   i = 0;
1744   while (i != dirlen)
1745     {
1746       unsigned char *ptr = base_pat + i;
1747       i += direction;
1748       if (! NILP (trt))
1749         {
1750           /* If the byte currently looking at is the last of a
1751              character to check case-equivalents, set CH to that
1752              character.  An ASCII character and a non-ASCII character
1753              matching with CHAR_BASE are to be checked.  */
1754           int ch = -1;
1755
1756           if (ASCII_BYTE_P (*ptr) || ! multibyte)
1757             ch = *ptr;
1758           else if (char_base
1759                    && ((pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1])))
1760             {
1761               unsigned char *charstart = ptr - 1;
1762
1763               while (! (CHAR_HEAD_P (*charstart)))
1764                 charstart--;
1765               ch = STRING_CHAR (charstart);
1766               if (char_base != (ch & ~0x3F))
1767                 ch = -1;
1768             }
1769
1770           if (ch >= 0200)
1771             j = (ch & 0x3F) | 0200;
1772           else
1773             j = *ptr;
1774
1775           if (i == dirlen)
1776             stride_for_teases = BM_tab[j];
1777
1778           BM_tab[j] = dirlen - i;
1779           /* A translation table is accompanied by its inverse -- see */
1780           /* comment following downcase_table for details */
1781           if (ch >= 0)
1782             {
1783               int starting_ch = ch;
1784               int starting_j = j;
1785
1786               while (1)
1787                 {
1788                   TRANSLATE (ch, inverse_trt, ch);
1789                   if (ch >= 0200)
1790                     j = (ch & 0x3F) | 0200;
1791                   else
1792                     j = ch;
1793
1794                   /* For all the characters that map into CH,
1795                      set up simple_translate to map the last byte
1796                      into STARTING_J.  */
1797                   simple_translate[j] = starting_j;
1798                   if (ch == starting_ch)
1799                     break;
1800                   BM_tab[j] = dirlen - i;
1801                 }
1802             }
1803         }
1804       else
1805         {
1806           j = *ptr;
1807
1808           if (i == dirlen)
1809             stride_for_teases = BM_tab[j];
1810           BM_tab[j] = dirlen - i;
1811         }
1812       /* stride_for_teases tells how much to stride if we get a
1813          match on the far character but are subsequently
1814          disappointed, by recording what the stride would have been
1815          for that character if the last character had been
1816          different.  */
1817     }
1818   pos_byte += dirlen - ((direction > 0) ? direction : 0);
1819   /* loop invariant - POS_BYTE points at where last char (first
1820      char if reverse) of pattern would align in a possible match.  */
1821   while (n != 0)
1822     {
1823       EMACS_INT tail_end;
1824       unsigned char *tail_end_ptr;
1825
1826       /* It's been reported that some (broken) compiler thinks that
1827          Boolean expressions in an arithmetic context are unsigned.
1828          Using an explicit ?1:0 prevents this.  */
1829       if ((lim_byte - pos_byte - ((direction > 0) ? 1 : 0)) * direction
1830           < 0)
1831         return (n * (0 - direction));
1832       /* First we do the part we can by pointers (maybe nothing) */
1833       QUIT;
1834       pat = base_pat;
1835       limit = pos_byte - dirlen + direction;
1836       if (direction > 0)
1837         {
1838           limit = BUFFER_CEILING_OF (limit);
1839           /* LIMIT is now the last (not beyond-last!) value POS_BYTE
1840              can take on without hitting edge of buffer or the gap.  */
1841           limit = min (limit, pos_byte + 20000);
1842           limit = min (limit, lim_byte - 1);
1843         }
1844       else
1845         {
1846           limit = BUFFER_FLOOR_OF (limit);
1847           /* LIMIT is now the last (not beyond-last!) value POS_BYTE
1848              can take on without hitting edge of buffer or the gap.  */
1849           limit = max (limit, pos_byte - 20000);
1850           limit = max (limit, lim_byte);
1851         }
1852       tail_end = BUFFER_CEILING_OF (pos_byte) + 1;
1853       tail_end_ptr = BYTE_POS_ADDR (tail_end);
1854
1855       if ((limit - pos_byte) * direction > 20)
1856         {
1857           unsigned char *p2;
1858
1859           p_limit = BYTE_POS_ADDR (limit);
1860           p2 = (cursor = BYTE_POS_ADDR (pos_byte));
1861           /* In this loop, pos + cursor - p2 is the surrogate for pos.  */
1862           while (1)             /* use one cursor setting as long as i can */
1863             {
1864               if (direction > 0) /* worth duplicating */
1865                 {
1866                   while (cursor <= p_limit)
1867                     {
1868                       if (BM_tab[*cursor] == 0)
1869                         goto hit;
1870                       cursor += BM_tab[*cursor];
1871                     }
1872                 }
1873               else
1874                 {
1875                   while (cursor >= p_limit)
1876                     {
1877                       if (BM_tab[*cursor] == 0)
1878                         goto hit;
1879                       cursor += BM_tab[*cursor];
1880                     }
1881                 }
1882               /* If you are here, cursor is beyond the end of the
1883                  searched region.  You fail to match within the
1884                  permitted region and would otherwise try a character
1885                  beyond that region.  */
1886               break;
1887
1888             hit:
1889               i = dirlen - direction;
1890               if (! NILP (trt))
1891                 {
1892                   while ((i -= direction) + direction != 0)
1893                     {
1894                       int ch;
1895                       cursor -= direction;
1896                       /* Translate only the last byte of a character.  */
1897                       if (! multibyte
1898                           || ((cursor == tail_end_ptr
1899                                || CHAR_HEAD_P (cursor[1]))
1900                               && (CHAR_HEAD_P (cursor[0])
1901                                   /* Check if this is the last byte of
1902                                      a translable character.  */
1903                                   || (translate_prev_byte1 == cursor[-1]
1904                                       && (CHAR_HEAD_P (translate_prev_byte1)
1905                                           || (translate_prev_byte2 == cursor[-2]
1906                                               && (CHAR_HEAD_P (translate_prev_byte2)
1907                                                   || (translate_prev_byte3 == cursor[-3]))))))))
1908                         ch = simple_translate[*cursor];
1909                       else
1910                         ch = *cursor;
1911                       if (pat[i] != ch)
1912                         break;
1913                     }
1914                 }
1915               else
1916                 {
1917                   while ((i -= direction) + direction != 0)
1918                     {
1919                       cursor -= direction;
1920                       if (pat[i] != *cursor)
1921                         break;
1922                     }
1923                 }
1924               cursor += dirlen - i - direction; /* fix cursor */
1925               if (i + direction == 0)
1926                 {
1927                   EMACS_INT position, start, end;
1928
1929                   cursor -= direction;
1930
1931                   position = pos_byte + cursor - p2 + ((direction > 0)
1932                                                        ? 1 - len_byte : 0);
1933                   set_search_regs (position, len_byte);
1934
1935                   if (NILP (Vinhibit_changing_match_data))
1936                     {
1937                       start = search_regs.start[0];
1938                       end = search_regs.end[0];
1939                     }
1940                   else
1941                     /* If Vinhibit_changing_match_data is non-nil,
1942                        search_regs will not be changed.  So let's
1943                        compute start and end here.  */
1944                     {
1945                       start = BYTE_TO_CHAR (position);
1946                       end = BYTE_TO_CHAR (position + len_byte);
1947                     }
1948
1949                   if ((n -= direction) != 0)
1950                     cursor += dirlen; /* to resume search */
1951                   else
1952                     return direction > 0 ? end : start;
1953                 }
1954               else
1955                 cursor += stride_for_teases; /* <sigh> we lose -  */
1956             }
1957           pos_byte += cursor - p2;
1958         }
1959       else
1960         /* Now we'll pick up a clump that has to be done the hard
1961            way because it covers a discontinuity.  */
1962         {
1963           limit = ((direction > 0)
1964                    ? BUFFER_CEILING_OF (pos_byte - dirlen + 1)
1965                    : BUFFER_FLOOR_OF (pos_byte - dirlen - 1));
1966           limit = ((direction > 0)
1967                    ? min (limit + len_byte, lim_byte - 1)
1968                    : max (limit - len_byte, lim_byte));
1969           /* LIMIT is now the last value POS_BYTE can have
1970              and still be valid for a possible match.  */
1971           while (1)
1972             {
1973               /* This loop can be coded for space rather than
1974                  speed because it will usually run only once.
1975                  (the reach is at most len + 21, and typically
1976                  does not exceed len).  */
1977               while ((limit - pos_byte) * direction >= 0)
1978                 {
1979                   int ch = FETCH_BYTE (pos_byte);
1980                   if (BM_tab[ch] == 0)
1981                     goto hit2;
1982                   pos_byte += BM_tab[ch];
1983                 }
1984               break;    /* ran off the end */
1985
1986             hit2:
1987               /* Found what might be a match.  */
1988               i = dirlen - direction;
1989               while ((i -= direction) + direction != 0)
1990                 {
1991                   int ch;
1992                   unsigned char *ptr;
1993                   pos_byte -= direction;
1994                   ptr = BYTE_POS_ADDR (pos_byte);
1995                   /* Translate only the last byte of a character.  */
1996                   if (! multibyte
1997                       || ((ptr == tail_end_ptr
1998                            || CHAR_HEAD_P (ptr[1]))
1999                           && (CHAR_HEAD_P (ptr[0])
2000                               /* Check if this is the last byte of a
2001                                  translable character.  */
2002                               || (translate_prev_byte1 == ptr[-1]
2003                                   && (CHAR_HEAD_P (translate_prev_byte1)
2004                                       || (translate_prev_byte2 == ptr[-2]
2005                                           && (CHAR_HEAD_P (translate_prev_byte2)
2006                                               || translate_prev_byte3 == ptr[-3])))))))
2007                     ch = simple_translate[*ptr];
2008                   else
2009                     ch = *ptr;
2010                   if (pat[i] != ch)
2011                     break;
2012                 }
2013               /* Above loop has moved POS_BYTE part or all the way
2014                  back to the first pos (last pos if reverse).
2015                  Set it once again at the last (first if reverse) char.  */
2016               pos_byte += dirlen - i - direction;
2017               if (i + direction == 0)
2018                 {
2019                   EMACS_INT position, start, end;
2020                   pos_byte -= direction;
2021
2022                   position = pos_byte + ((direction > 0) ? 1 - len_byte : 0);
2023                   set_search_regs (position, len_byte);
2024
2025                   if (NILP (Vinhibit_changing_match_data))
2026                     {
2027                       start = search_regs.start[0];
2028                       end = search_regs.end[0];
2029                     }
2030                   else
2031                     /* If Vinhibit_changing_match_data is non-nil,
2032                        search_regs will not be changed.  So let's
2033                        compute start and end here.  */
2034                     {
2035                       start = BYTE_TO_CHAR (position);
2036                       end = BYTE_TO_CHAR (position + len_byte);
2037                     }
2038
2039                   if ((n -= direction) != 0)
2040                     pos_byte += dirlen; /* to resume search */
2041                   else
2042                     return direction > 0 ? end : start;
2043                 }
2044               else
2045                 pos_byte += stride_for_teases;
2046             }
2047           }
2048       /* We have done one clump.  Can we continue? */
2049       if ((lim_byte - pos_byte) * direction < 0)
2050         return ((0 - n) * direction);
2051     }
2052   return BYTE_TO_CHAR (pos_byte);
2053 }
2054
2055 /* Record beginning BEG_BYTE and end BEG_BYTE + NBYTES
2056    for the overall match just found in the current buffer.
2057    Also clear out the match data for registers 1 and up.  */
2058
2059 static void
2060 set_search_regs (EMACS_INT beg_byte, EMACS_INT nbytes)
2061 {
2062   int i;
2063
2064   if (!NILP (Vinhibit_changing_match_data))
2065     return;
2066
2067   /* Make sure we have registers in which to store
2068      the match position.  */
2069   if (search_regs.num_regs == 0)
2070     {
2071       search_regs.start = (regoff_t *) xmalloc (2 * sizeof (regoff_t));
2072       search_regs.end = (regoff_t *) xmalloc (2 * sizeof (regoff_t));
2073       search_regs.num_regs = 2;
2074     }
2075
2076   /* Clear out the other registers.  */
2077   for (i = 1; i < search_regs.num_regs; i++)
2078     {
2079       search_regs.start[i] = -1;
2080       search_regs.end[i] = -1;
2081     }
2082
2083   search_regs.start[0] = BYTE_TO_CHAR (beg_byte);
2084   search_regs.end[0] = BYTE_TO_CHAR (beg_byte + nbytes);
2085   XSETBUFFER (last_thing_searched, current_buffer);
2086 }
2087 \f
2088 /* Given STRING, a string of words separated by word delimiters,
2089    compute a regexp that matches those exact words separated by
2090    arbitrary punctuation.  If LAX is nonzero, the end of the string
2091    need not match a word boundary unless it ends in whitespace.  */
2092
2093 static Lisp_Object
2094 wordify (Lisp_Object string, int lax)
2095 {
2096   register unsigned char *p, *o;
2097   register EMACS_INT i, i_byte, len, punct_count = 0, word_count = 0;
2098   Lisp_Object val;
2099   int prev_c = 0;
2100   EMACS_INT adjust;
2101   int whitespace_at_end;
2102
2103   CHECK_STRING (string);
2104   p = SDATA (string);
2105   len = SCHARS (string);
2106
2107   for (i = 0, i_byte = 0; i < len; )
2108     {
2109       int c;
2110
2111       FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
2112
2113       if (SYNTAX (c) != Sword)
2114         {
2115           punct_count++;
2116           if (i > 0 && SYNTAX (prev_c) == Sword)
2117             word_count++;
2118         }
2119
2120       prev_c = c;
2121     }
2122
2123   if (SYNTAX (prev_c) == Sword)
2124     {
2125       word_count++;
2126       whitespace_at_end = 0;
2127     }
2128   else
2129     whitespace_at_end = 1;
2130
2131   if (!word_count)
2132     return empty_unibyte_string;
2133
2134   adjust = - punct_count + 5 * (word_count - 1)
2135     + ((lax && !whitespace_at_end) ? 2 : 4);
2136   if (STRING_MULTIBYTE (string))
2137     val = make_uninit_multibyte_string (len + adjust,
2138                                         SBYTES (string)
2139                                         + adjust);
2140   else
2141     val = make_uninit_string (len + adjust);
2142
2143   o = SDATA (val);
2144   *o++ = '\\';
2145   *o++ = 'b';
2146   prev_c = 0;
2147
2148   for (i = 0, i_byte = 0; i < len; )
2149     {
2150       int c;
2151       EMACS_INT i_byte_orig = i_byte;
2152
2153       FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
2154
2155       if (SYNTAX (c) == Sword)
2156         {
2157           memcpy (o, SDATA (string) + i_byte_orig, i_byte - i_byte_orig);
2158           o += i_byte - i_byte_orig;
2159         }
2160       else if (i > 0 && SYNTAX (prev_c) == Sword && --word_count)
2161         {
2162           *o++ = '\\';
2163           *o++ = 'W';
2164           *o++ = '\\';
2165           *o++ = 'W';
2166           *o++ = '*';
2167         }
2168
2169       prev_c = c;
2170     }
2171
2172   if (!lax || whitespace_at_end)
2173     {
2174       *o++ = '\\';
2175       *o++ = 'b';
2176     }
2177
2178   return val;
2179 }
2180 \f
2181 DEFUN ("search-backward", Fsearch_backward, Ssearch_backward, 1, 4,
2182        "MSearch backward: ",
2183        doc: /* Search backward from point for STRING.
2184 Set point to the beginning of the occurrence found, and return point.
2185 An optional second argument bounds the search; it is a buffer position.
2186 The match found must not extend before that position.
2187 Optional third argument, if t, means if fail just return nil (no error).
2188  If not nil and not t, position at limit of search and return nil.
2189 Optional fourth argument is repeat count--search for successive occurrences.
2190
2191 Search case-sensitivity is determined by the value of the variable
2192 `case-fold-search', which see.
2193
2194 See also the functions `match-beginning', `match-end' and `replace-match'.  */)
2195   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2196 {
2197   return search_command (string, bound, noerror, count, -1, 0, 0);
2198 }
2199
2200 DEFUN ("search-forward", Fsearch_forward, Ssearch_forward, 1, 4, "MSearch: ",
2201        doc: /* Search forward from point for STRING.
2202 Set point to the end of the occurrence found, and return point.
2203 An optional second argument bounds the search; it is a buffer position.
2204 The match found must not extend after that position.  A value of nil is
2205   equivalent to (point-max).
2206 Optional third argument, if t, means if fail just return nil (no error).
2207   If not nil and not t, move to limit of search and return nil.
2208 Optional fourth argument is repeat count--search for successive occurrences.
2209
2210 Search case-sensitivity is determined by the value of the variable
2211 `case-fold-search', which see.
2212
2213 See also the functions `match-beginning', `match-end' and `replace-match'.  */)
2214   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2215 {
2216   return search_command (string, bound, noerror, count, 1, 0, 0);
2217 }
2218
2219 DEFUN ("word-search-backward", Fword_search_backward, Sword_search_backward, 1, 4,
2220        "sWord search backward: ",
2221        doc: /* Search backward from point for STRING, ignoring differences in punctuation.
2222 Set point to the beginning of the occurrence found, and return point.
2223 An optional second argument bounds the search; it is a buffer position.
2224 The match found must not extend before that position.
2225 Optional third argument, if t, means if fail just return nil (no error).
2226   If not nil and not t, move to limit of search and return nil.
2227 Optional fourth argument is repeat count--search for successive occurrences.  */)
2228   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2229 {
2230   return search_command (wordify (string, 0), bound, noerror, count, -1, 1, 0);
2231 }
2232
2233 DEFUN ("word-search-forward", Fword_search_forward, Sword_search_forward, 1, 4,
2234        "sWord search: ",
2235        doc: /* Search forward from point for STRING, ignoring differences in punctuation.
2236 Set point to the end of the occurrence found, and return point.
2237 An optional second argument bounds the search; it is a buffer position.
2238 The match found must not extend after that position.
2239 Optional third argument, if t, means if fail just return nil (no error).
2240   If not nil and not t, move to limit of search and return nil.
2241 Optional fourth argument is repeat count--search for successive occurrences.  */)
2242   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2243 {
2244   return search_command (wordify (string, 0), bound, noerror, count, 1, 1, 0);
2245 }
2246
2247 DEFUN ("word-search-backward-lax", Fword_search_backward_lax, Sword_search_backward_lax, 1, 4,
2248        "sWord search backward: ",
2249        doc: /* Search backward from point for STRING, ignoring differences in punctuation.
2250 Set point to the beginning of the occurrence found, and return point.
2251
2252 Unlike `word-search-backward', the end of STRING need not match a word
2253 boundary unless it ends in whitespace.
2254
2255 An optional second argument bounds the search; it is a buffer position.
2256 The match found must not extend before that position.
2257 Optional third argument, if t, means if fail just return nil (no error).
2258   If not nil and not t, move to limit of search and return nil.
2259 Optional fourth argument is repeat count--search for successive occurrences.  */)
2260   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2261 {
2262   return search_command (wordify (string, 1), bound, noerror, count, -1, 1, 0);
2263 }
2264
2265 DEFUN ("word-search-forward-lax", Fword_search_forward_lax, Sword_search_forward_lax, 1, 4,
2266        "sWord search: ",
2267        doc: /* Search forward from point for STRING, ignoring differences in punctuation.
2268 Set point to the end of the occurrence found, and return point.
2269
2270 Unlike `word-search-forward', the end of STRING need not match a word
2271 boundary unless it ends in whitespace.
2272
2273 An optional second argument bounds the search; it is a buffer position.
2274 The match found must not extend after that position.
2275 Optional third argument, if t, means if fail just return nil (no error).
2276   If not nil and not t, move to limit of search and return nil.
2277 Optional fourth argument is repeat count--search for successive occurrences.  */)
2278   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2279 {
2280   return search_command (wordify (string, 1), bound, noerror, count, 1, 1, 0);
2281 }
2282
2283 DEFUN ("re-search-backward", Fre_search_backward, Sre_search_backward, 1, 4,
2284        "sRE search backward: ",
2285        doc: /* Search backward from point for match for regular expression REGEXP.
2286 Set point to the beginning of the match, and return point.
2287 The match found is the one starting last in the buffer
2288 and yet ending before the origin of the search.
2289 An optional second argument bounds the search; it is a buffer position.
2290 The match found must start at or after that position.
2291 Optional third argument, if t, means if fail just return nil (no error).
2292   If not nil and not t, move to limit of search and return nil.
2293 Optional fourth argument is repeat count--search for successive occurrences.
2294 See also the functions `match-beginning', `match-end', `match-string',
2295 and `replace-match'.  */)
2296   (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2297 {
2298   return search_command (regexp, bound, noerror, count, -1, 1, 0);
2299 }
2300
2301 DEFUN ("re-search-forward", Fre_search_forward, Sre_search_forward, 1, 4,
2302        "sRE search: ",
2303        doc: /* Search forward from point for regular expression REGEXP.
2304 Set point to the end of the occurrence found, and return point.
2305 An optional second argument bounds the search; it is a buffer position.
2306 The match found must not extend after that position.
2307 Optional third argument, if t, means if fail just return nil (no error).
2308   If not nil and not t, move to limit of search and return nil.
2309 Optional fourth argument is repeat count--search for successive occurrences.
2310 See also the functions `match-beginning', `match-end', `match-string',
2311 and `replace-match'.  */)
2312   (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2313 {
2314   return search_command (regexp, bound, noerror, count, 1, 1, 0);
2315 }
2316
2317 DEFUN ("posix-search-backward", Fposix_search_backward, Sposix_search_backward, 1, 4,
2318        "sPosix search backward: ",
2319        doc: /* Search backward from point for match for regular expression REGEXP.
2320 Find the longest match in accord with Posix regular expression rules.
2321 Set point to the beginning of the match, and return point.
2322 The match found is the one starting last in the buffer
2323 and yet ending before the origin of the search.
2324 An optional second argument bounds the search; it is a buffer position.
2325 The match found must start at or after that position.
2326 Optional third argument, if t, means if fail just return nil (no error).
2327   If not nil and not t, move to limit of search and return nil.
2328 Optional fourth argument is repeat count--search for successive occurrences.
2329 See also the functions `match-beginning', `match-end', `match-string',
2330 and `replace-match'.  */)
2331   (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2332 {
2333   return search_command (regexp, bound, noerror, count, -1, 1, 1);
2334 }
2335
2336 DEFUN ("posix-search-forward", Fposix_search_forward, Sposix_search_forward, 1, 4,
2337        "sPosix search: ",
2338        doc: /* Search forward from point for regular expression REGEXP.
2339 Find the longest match in accord with Posix regular expression rules.
2340 Set point to the end of the occurrence found, and return point.
2341 An optional second argument bounds the search; it is a buffer position.
2342 The match found must not extend after that position.
2343 Optional third argument, if t, means if fail just return nil (no error).
2344   If not nil and not t, move to limit of search and return nil.
2345 Optional fourth argument is repeat count--search for successive occurrences.
2346 See also the functions `match-beginning', `match-end', `match-string',
2347 and `replace-match'.  */)
2348   (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2349 {
2350   return search_command (regexp, bound, noerror, count, 1, 1, 1);
2351 }
2352 \f
2353 DEFUN ("replace-match", Freplace_match, Sreplace_match, 1, 5, 0,
2354        doc: /* Replace text matched by last search with NEWTEXT.
2355 Leave point at the end of the replacement text.
2356
2357 If second arg FIXEDCASE is non-nil, do not alter case of replacement text.
2358 Otherwise maybe capitalize the whole text, or maybe just word initials,
2359 based on the replaced text.
2360 If the replaced text has only capital letters
2361 and has at least one multiletter word, convert NEWTEXT to all caps.
2362 Otherwise if all words are capitalized in the replaced text,
2363 capitalize each word in NEWTEXT.
2364
2365 If third arg LITERAL is non-nil, insert NEWTEXT literally.
2366 Otherwise treat `\\' as special:
2367   `\\&' in NEWTEXT means substitute original matched text.
2368   `\\N' means substitute what matched the Nth `\\(...\\)'.
2369        If Nth parens didn't match, substitute nothing.
2370   `\\\\' means insert one `\\'.
2371 Case conversion does not apply to these substitutions.
2372
2373 FIXEDCASE and LITERAL are optional arguments.
2374
2375 The optional fourth argument STRING can be a string to modify.
2376 This is meaningful when the previous match was done against STRING,
2377 using `string-match'.  When used this way, `replace-match'
2378 creates and returns a new string made by copying STRING and replacing
2379 the part of STRING that was matched.
2380
2381 The optional fifth argument SUBEXP specifies a subexpression;
2382 it says to replace just that subexpression with NEWTEXT,
2383 rather than replacing the entire matched text.
2384 This is, in a vague sense, the inverse of using `\\N' in NEWTEXT;
2385 `\\N' copies subexp N into NEWTEXT, but using N as SUBEXP puts
2386 NEWTEXT in place of subexp N.
2387 This is useful only after a regular expression search or match,
2388 since only regular expressions have distinguished subexpressions.  */)
2389   (Lisp_Object newtext, Lisp_Object fixedcase, Lisp_Object literal, Lisp_Object string, Lisp_Object subexp)
2390 {
2391   enum { nochange, all_caps, cap_initial } case_action;
2392   register EMACS_INT pos, pos_byte;
2393   int some_multiletter_word;
2394   int some_lowercase;
2395   int some_uppercase;
2396   int some_nonuppercase_initial;
2397   register int c, prevc;
2398   int sub;
2399   EMACS_INT opoint, newpoint;
2400
2401   CHECK_STRING (newtext);
2402
2403   if (! NILP (string))
2404     CHECK_STRING (string);
2405
2406   case_action = nochange;       /* We tried an initialization */
2407                                 /* but some C compilers blew it */
2408
2409   if (search_regs.num_regs <= 0)
2410     error ("`replace-match' called before any match found");
2411
2412   if (NILP (subexp))
2413     sub = 0;
2414   else
2415     {
2416       CHECK_NUMBER (subexp);
2417       sub = XINT (subexp);
2418       if (sub < 0 || sub >= search_regs.num_regs)
2419         args_out_of_range (subexp, make_number (search_regs.num_regs));
2420     }
2421
2422   if (NILP (string))
2423     {
2424       if (search_regs.start[sub] < BEGV
2425           || search_regs.start[sub] > search_regs.end[sub]
2426           || search_regs.end[sub] > ZV)
2427         args_out_of_range (make_number (search_regs.start[sub]),
2428                            make_number (search_regs.end[sub]));
2429     }
2430   else
2431     {
2432       if (search_regs.start[sub] < 0
2433           || search_regs.start[sub] > search_regs.end[sub]
2434           || search_regs.end[sub] > SCHARS (string))
2435         args_out_of_range (make_number (search_regs.start[sub]),
2436                            make_number (search_regs.end[sub]));
2437     }
2438
2439   if (NILP (fixedcase))
2440     {
2441       /* Decide how to casify by examining the matched text. */
2442       EMACS_INT last;
2443
2444       pos = search_regs.start[sub];
2445       last = search_regs.end[sub];
2446
2447       if (NILP (string))
2448         pos_byte = CHAR_TO_BYTE (pos);
2449       else
2450         pos_byte = string_char_to_byte (string, pos);
2451
2452       prevc = '\n';
2453       case_action = all_caps;
2454
2455       /* some_multiletter_word is set nonzero if any original word
2456          is more than one letter long. */
2457       some_multiletter_word = 0;
2458       some_lowercase = 0;
2459       some_nonuppercase_initial = 0;
2460       some_uppercase = 0;
2461
2462       while (pos < last)
2463         {
2464           if (NILP (string))
2465             {
2466               c = FETCH_CHAR_AS_MULTIBYTE (pos_byte);
2467               INC_BOTH (pos, pos_byte);
2468             }
2469           else
2470             FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte);
2471
2472           if (lowercasep (c))
2473             {
2474               /* Cannot be all caps if any original char is lower case */
2475
2476               some_lowercase = 1;
2477               if (SYNTAX (prevc) != Sword)
2478                 some_nonuppercase_initial = 1;
2479               else
2480                 some_multiletter_word = 1;
2481             }
2482           else if (uppercasep (c))
2483             {
2484               some_uppercase = 1;
2485               if (SYNTAX (prevc) != Sword)
2486                 ;
2487               else
2488                 some_multiletter_word = 1;
2489             }
2490           else
2491             {
2492               /* If the initial is a caseless word constituent,
2493                  treat that like a lowercase initial.  */
2494               if (SYNTAX (prevc) != Sword)
2495                 some_nonuppercase_initial = 1;
2496             }
2497
2498           prevc = c;
2499         }
2500
2501       /* Convert to all caps if the old text is all caps
2502          and has at least one multiletter word.  */
2503       if (! some_lowercase && some_multiletter_word)
2504         case_action = all_caps;
2505       /* Capitalize each word, if the old text has all capitalized words.  */
2506       else if (!some_nonuppercase_initial && some_multiletter_word)
2507         case_action = cap_initial;
2508       else if (!some_nonuppercase_initial && some_uppercase)
2509         /* Should x -> yz, operating on X, give Yz or YZ?
2510            We'll assume the latter.  */
2511         case_action = all_caps;
2512       else
2513         case_action = nochange;
2514     }
2515
2516   /* Do replacement in a string.  */
2517   if (!NILP (string))
2518     {
2519       Lisp_Object before, after;
2520
2521       before = Fsubstring (string, make_number (0),
2522                            make_number (search_regs.start[sub]));
2523       after = Fsubstring (string, make_number (search_regs.end[sub]), Qnil);
2524
2525       /* Substitute parts of the match into NEWTEXT
2526          if desired.  */
2527       if (NILP (literal))
2528         {
2529           EMACS_INT lastpos = 0;
2530           EMACS_INT lastpos_byte = 0;
2531           /* We build up the substituted string in ACCUM.  */
2532           Lisp_Object accum;
2533           Lisp_Object middle;
2534           int length = SBYTES (newtext);
2535
2536           accum = Qnil;
2537
2538           for (pos_byte = 0, pos = 0; pos_byte < length;)
2539             {
2540               EMACS_INT substart = -1;
2541               EMACS_INT subend = 0;
2542               int delbackslash = 0;
2543
2544               FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
2545
2546               if (c == '\\')
2547                 {
2548                   FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
2549
2550                   if (c == '&')
2551                     {
2552                       substart = search_regs.start[sub];
2553                       subend = search_regs.end[sub];
2554                     }
2555                   else if (c >= '1' && c <= '9')
2556                     {
2557                       if (search_regs.start[c - '0'] >= 0
2558                           && c <= search_regs.num_regs + '0')
2559                         {
2560                           substart = search_regs.start[c - '0'];
2561                           subend = search_regs.end[c - '0'];
2562                         }
2563                       else
2564                         {
2565                           /* If that subexp did not match,
2566                              replace \\N with nothing.  */
2567                           substart = 0;
2568                           subend = 0;
2569                         }
2570                     }
2571                   else if (c == '\\')
2572                     delbackslash = 1;
2573                   else
2574                     error ("Invalid use of `\\' in replacement text");
2575                 }
2576               if (substart >= 0)
2577                 {
2578                   if (pos - 2 != lastpos)
2579                     middle = substring_both (newtext, lastpos,
2580                                              lastpos_byte,
2581                                              pos - 2, pos_byte - 2);
2582                   else
2583                     middle = Qnil;
2584                   accum = concat3 (accum, middle,
2585                                    Fsubstring (string,
2586                                                make_number (substart),
2587                                                make_number (subend)));
2588                   lastpos = pos;
2589                   lastpos_byte = pos_byte;
2590                 }
2591               else if (delbackslash)
2592                 {
2593                   middle = substring_both (newtext, lastpos,
2594                                            lastpos_byte,
2595                                            pos - 1, pos_byte - 1);
2596
2597                   accum = concat2 (accum, middle);
2598                   lastpos = pos;
2599                   lastpos_byte = pos_byte;
2600                 }
2601             }
2602
2603           if (pos != lastpos)
2604             middle = substring_both (newtext, lastpos,
2605                                      lastpos_byte,
2606                                      pos, pos_byte);
2607           else
2608             middle = Qnil;
2609
2610           newtext = concat2 (accum, middle);
2611         }
2612
2613       /* Do case substitution in NEWTEXT if desired.  */
2614       if (case_action == all_caps)
2615         newtext = Fupcase (newtext);
2616       else if (case_action == cap_initial)
2617         newtext = Fupcase_initials (newtext);
2618
2619       return concat3 (before, newtext, after);
2620     }
2621
2622   /* Record point, then move (quietly) to the start of the match.  */
2623   if (PT >= search_regs.end[sub])
2624     opoint = PT - ZV;
2625   else if (PT > search_regs.start[sub])
2626     opoint = search_regs.end[sub] - ZV;
2627   else
2628     opoint = PT;
2629
2630   /* If we want non-literal replacement,
2631      perform substitution on the replacement string.  */
2632   if (NILP (literal))
2633     {
2634       EMACS_INT length = SBYTES (newtext);
2635       unsigned char *substed;
2636       EMACS_INT substed_alloc_size, substed_len;
2637       int buf_multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters));
2638       int str_multibyte = STRING_MULTIBYTE (newtext);
2639       Lisp_Object rev_tbl;
2640       int really_changed = 0;
2641
2642       rev_tbl = Qnil;
2643
2644       substed_alloc_size = length * 2 + 100;
2645       substed = (unsigned char *) xmalloc (substed_alloc_size + 1);
2646       substed_len = 0;
2647
2648       /* Go thru NEWTEXT, producing the actual text to insert in
2649          SUBSTED while adjusting multibyteness to that of the current
2650          buffer.  */
2651
2652       for (pos_byte = 0, pos = 0; pos_byte < length;)
2653         {
2654           unsigned char str[MAX_MULTIBYTE_LENGTH];
2655           const unsigned char *add_stuff = NULL;
2656           EMACS_INT add_len = 0;
2657           int idx = -1;
2658
2659           if (str_multibyte)
2660             {
2661               FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext, pos, pos_byte);
2662               if (!buf_multibyte)
2663                 c = multibyte_char_to_unibyte (c, rev_tbl);
2664             }
2665           else
2666             {
2667               /* Note that we don't have to increment POS.  */
2668               c = SREF (newtext, pos_byte++);
2669               if (buf_multibyte)
2670                 MAKE_CHAR_MULTIBYTE (c);
2671             }
2672
2673           /* Either set ADD_STUFF and ADD_LEN to the text to put in SUBSTED,
2674              or set IDX to a match index, which means put that part
2675              of the buffer text into SUBSTED.  */
2676
2677           if (c == '\\')
2678             {
2679               really_changed = 1;
2680
2681               if (str_multibyte)
2682                 {
2683                   FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext,
2684                                                       pos, pos_byte);
2685                   if (!buf_multibyte && !ASCII_CHAR_P (c))
2686                     c = multibyte_char_to_unibyte (c, rev_tbl);
2687                 }
2688               else
2689                 {
2690                   c = SREF (newtext, pos_byte++);
2691                   if (buf_multibyte)
2692                     MAKE_CHAR_MULTIBYTE (c);
2693                 }
2694
2695               if (c == '&')
2696                 idx = sub;
2697               else if (c >= '1' && c <= '9' && c <= search_regs.num_regs + '0')
2698                 {
2699                   if (search_regs.start[c - '0'] >= 1)
2700                     idx = c - '0';
2701                 }
2702               else if (c == '\\')
2703                 add_len = 1, add_stuff = (unsigned char *) "\\";
2704               else
2705                 {
2706                   xfree (substed);
2707                   error ("Invalid use of `\\' in replacement text");
2708                 }
2709             }
2710           else
2711             {
2712               add_len = CHAR_STRING (c, str);
2713               add_stuff = str;
2714             }
2715
2716           /* If we want to copy part of a previous match,
2717              set up ADD_STUFF and ADD_LEN to point to it.  */
2718           if (idx >= 0)
2719             {
2720               EMACS_INT begbyte = CHAR_TO_BYTE (search_regs.start[idx]);
2721               add_len = CHAR_TO_BYTE (search_regs.end[idx]) - begbyte;
2722               if (search_regs.start[idx] < GPT && GPT < search_regs.end[idx])
2723                 move_gap (search_regs.start[idx]);
2724               add_stuff = BYTE_POS_ADDR (begbyte);
2725             }
2726
2727           /* Now the stuff we want to add to SUBSTED
2728              is invariably ADD_LEN bytes starting at ADD_STUFF.  */
2729
2730           /* Make sure SUBSTED is big enough.  */
2731           if (substed_len + add_len >= substed_alloc_size)
2732             {
2733               substed_alloc_size = substed_len + add_len + 500;
2734               substed = (unsigned char *) xrealloc (substed,
2735                                                     substed_alloc_size + 1);
2736             }
2737
2738           /* Now add to the end of SUBSTED.  */
2739           if (add_stuff)
2740             {
2741               memcpy (substed + substed_len, add_stuff, add_len);
2742               substed_len += add_len;
2743             }
2744         }
2745
2746       if (really_changed)
2747         {
2748           if (buf_multibyte)
2749             {
2750               EMACS_INT nchars =
2751                 multibyte_chars_in_text (substed, substed_len);
2752
2753               newtext = make_multibyte_string ((char *) substed, nchars,
2754                                                substed_len);
2755             }
2756           else
2757             newtext = make_unibyte_string ((char *) substed, substed_len);
2758         }
2759       xfree (substed);
2760     }
2761
2762   /* Replace the old text with the new in the cleanest possible way.  */
2763   replace_range (search_regs.start[sub], search_regs.end[sub],
2764                  newtext, 1, 0, 1);
2765   newpoint = search_regs.start[sub] + SCHARS (newtext);
2766
2767   if (case_action == all_caps)
2768     Fupcase_region (make_number (search_regs.start[sub]),
2769                     make_number (newpoint));
2770   else if (case_action == cap_initial)
2771     Fupcase_initials_region (make_number (search_regs.start[sub]),
2772                              make_number (newpoint));
2773
2774   /* Adjust search data for this change.  */
2775   {
2776     EMACS_INT oldend = search_regs.end[sub];
2777     EMACS_INT oldstart = search_regs.start[sub];
2778     EMACS_INT change = newpoint - search_regs.end[sub];
2779     int i;
2780
2781     for (i = 0; i < search_regs.num_regs; i++)
2782       {
2783         if (search_regs.start[i] >= oldend)
2784           search_regs.start[i] += change;
2785         else if (search_regs.start[i] > oldstart)
2786           search_regs.start[i] = oldstart;
2787         if (search_regs.end[i] >= oldend)
2788           search_regs.end[i] += change;
2789         else if (search_regs.end[i] > oldstart)
2790           search_regs.end[i] = oldstart;
2791       }
2792   }
2793
2794   /* Put point back where it was in the text.  */
2795   if (opoint <= 0)
2796     TEMP_SET_PT (opoint + ZV);
2797   else
2798     TEMP_SET_PT (opoint);
2799
2800   /* Now move point "officially" to the start of the inserted replacement.  */
2801   move_if_not_intangible (newpoint);
2802
2803   return Qnil;
2804 }
2805 \f
2806 static Lisp_Object
2807 match_limit (Lisp_Object num, int beginningp)
2808 {
2809   register int n;
2810
2811   CHECK_NUMBER (num);
2812   n = XINT (num);
2813   if (n < 0)
2814     args_out_of_range (num, make_number (0));
2815   if (search_regs.num_regs <= 0)
2816     error ("No match data, because no search succeeded");
2817   if (n >= search_regs.num_regs
2818       || search_regs.start[n] < 0)
2819     return Qnil;
2820   return (make_number ((beginningp) ? search_regs.start[n]
2821                                     : search_regs.end[n]));
2822 }
2823
2824 DEFUN ("match-beginning", Fmatch_beginning, Smatch_beginning, 1, 1, 0,
2825        doc: /* Return position of start of text matched by last search.
2826 SUBEXP, a number, specifies which parenthesized expression in the last
2827   regexp.
2828 Value is nil if SUBEXPth pair didn't match, or there were less than
2829   SUBEXP pairs.
2830 Zero means the entire text matched by the whole regexp or whole string.  */)
2831   (Lisp_Object subexp)
2832 {
2833   return match_limit (subexp, 1);
2834 }
2835
2836 DEFUN ("match-end", Fmatch_end, Smatch_end, 1, 1, 0,
2837        doc: /* Return position of end of text matched by last search.
2838 SUBEXP, a number, specifies which parenthesized expression in the last
2839   regexp.
2840 Value is nil if SUBEXPth pair didn't match, or there were less than
2841   SUBEXP pairs.
2842 Zero means the entire text matched by the whole regexp or whole string.  */)
2843   (Lisp_Object subexp)
2844 {
2845   return match_limit (subexp, 0);
2846 }
2847
2848 DEFUN ("match-data", Fmatch_data, Smatch_data, 0, 3, 0,
2849        doc: /* Return a list containing all info on what the last search matched.
2850 Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'.
2851 All the elements are markers or nil (nil if the Nth pair didn't match)
2852 if the last match was on a buffer; integers or nil if a string was matched.
2853 Use `set-match-data' to reinstate the data in this list.
2854
2855 If INTEGERS (the optional first argument) is non-nil, always use
2856 integers \(rather than markers) to represent buffer positions.  In
2857 this case, and if the last match was in a buffer, the buffer will get
2858 stored as one additional element at the end of the list.
2859
2860 If REUSE is a list, reuse it as part of the value.  If REUSE is long
2861 enough to hold all the values, and if INTEGERS is non-nil, no consing
2862 is done.
2863
2864 If optional third arg RESEAT is non-nil, any previous markers on the
2865 REUSE list will be modified to point to nowhere.
2866
2867 Return value is undefined if the last search failed.  */)
2868   (Lisp_Object integers, Lisp_Object reuse, Lisp_Object reseat)
2869 {
2870   Lisp_Object tail, prev;
2871   Lisp_Object *data;
2872   int i, len;
2873
2874   if (!NILP (reseat))
2875     for (tail = reuse; CONSP (tail); tail = XCDR (tail))
2876       if (MARKERP (XCAR (tail)))
2877         {
2878           unchain_marker (XMARKER (XCAR (tail)));
2879           XSETCAR (tail, Qnil);
2880         }
2881
2882   if (NILP (last_thing_searched))
2883     return Qnil;
2884
2885   prev = Qnil;
2886
2887   data = (Lisp_Object *) alloca ((2 * search_regs.num_regs + 1)
2888                                  * sizeof (Lisp_Object));
2889
2890   len = 0;
2891   for (i = 0; i < search_regs.num_regs; i++)
2892     {
2893       int start = search_regs.start[i];
2894       if (start >= 0)
2895         {
2896           if (EQ (last_thing_searched, Qt)
2897               || ! NILP (integers))
2898             {
2899               XSETFASTINT (data[2 * i], start);
2900               XSETFASTINT (data[2 * i + 1], search_regs.end[i]);
2901             }
2902           else if (BUFFERP (last_thing_searched))
2903             {
2904               data[2 * i] = Fmake_marker ();
2905               Fset_marker (data[2 * i],
2906                            make_number (start),
2907                            last_thing_searched);
2908               data[2 * i + 1] = Fmake_marker ();
2909               Fset_marker (data[2 * i + 1],
2910                            make_number (search_regs.end[i]),
2911                            last_thing_searched);
2912             }
2913           else
2914             /* last_thing_searched must always be Qt, a buffer, or Qnil.  */
2915             abort ();
2916
2917           len = 2 * i + 2;
2918         }
2919       else
2920         data[2 * i] = data[2 * i + 1] = Qnil;
2921     }
2922
2923   if (BUFFERP (last_thing_searched) && !NILP (integers))
2924     {
2925       data[len] = last_thing_searched;
2926       len++;
2927     }
2928
2929   /* If REUSE is not usable, cons up the values and return them.  */
2930   if (! CONSP (reuse))
2931     return Flist (len, data);
2932
2933   /* If REUSE is a list, store as many value elements as will fit
2934      into the elements of REUSE.  */
2935   for (i = 0, tail = reuse; CONSP (tail);
2936        i++, tail = XCDR (tail))
2937     {
2938       if (i < len)
2939         XSETCAR (tail, data[i]);
2940       else
2941         XSETCAR (tail, Qnil);
2942       prev = tail;
2943     }
2944
2945   /* If we couldn't fit all value elements into REUSE,
2946      cons up the rest of them and add them to the end of REUSE.  */
2947   if (i < len)
2948     XSETCDR (prev, Flist (len - i, data + i));
2949
2950   return reuse;
2951 }
2952
2953 /* We used to have an internal use variant of `reseat' described as:
2954
2955       If RESEAT is `evaporate', put the markers back on the free list
2956       immediately.  No other references to the markers must exist in this
2957       case, so it is used only internally on the unwind stack and
2958       save-match-data from Lisp.
2959
2960    But it was ill-conceived: those supposedly-internal markers get exposed via
2961    the undo-list, so freeing them here is unsafe.  */
2962
DEFUN ("set-match-data", Fset_match_data, Sset_match_data, 1, 2, 0,
       doc: /* Set internal data on last search match from elements of LIST.
LIST should have been created by calling `match-data' previously.

If optional arg RESEAT is non-nil, make markers on LIST point nowhere.  */)
  (register Lisp_Object list, Lisp_Object reseat)
{
  register int i;
  register Lisp_Object marker;

  /* While asynchronous code is running, stash the existing match data
     before overwriting it.  */
  if (running_asynch_code)
    save_search_regs ();

  CHECK_LIST (list);

  /* Unless we find a marker with a buffer or an explicit buffer
     in LIST, assume that this match data came from a string.  */
  last_thing_searched = Qt;

  /* Allocate registers if they don't already exist.  */
  {
    /* Number of (start, end) pairs that LIST can describe.  */
    int length = XFASTINT (Flength (list)) / 2;

    if (length > search_regs.num_regs)
      {
        if (search_regs.num_regs == 0)
          {
            search_regs.start
              = (regoff_t *) xmalloc (length * sizeof (regoff_t));
            search_regs.end
              = (regoff_t *) xmalloc (length * sizeof (regoff_t));
          }
        else
          {
            search_regs.start
              = (regoff_t *) xrealloc (search_regs.start,
                                       length * sizeof (regoff_t));
            search_regs.end
              = (regoff_t *) xrealloc (search_regs.end,
                                       length * sizeof (regoff_t));
          }

        /* Mark the newly added registers as unmatched.  */
        for (i = search_regs.num_regs; i < length; i++)
          search_regs.start[i] = -1;

        search_regs.num_regs = length;
      }

    /* Each iteration consumes one (start, end) pair of LIST and fills
       register I.  */
    for (i = 0; CONSP (list); i++)
      {
        marker = XCAR (list);
        /* A buffer element names the buffer that was searched and
           ends the processing of LIST.  */
        if (BUFFERP (marker))
          {
            last_thing_searched = marker;
            break;
          }
        if (i >= length)
          break;
        if (NILP (marker))
          {
            /* A nil start means this pair did not match.  Step over the
               start here; the end element is skipped by the XCDR at
               the bottom of the loop.  */
            search_regs.start[i] = -1;
            list = XCDR (list);
          }
        else
          {
            EMACS_INT from;
            Lisp_Object m;

            /* M keeps the original element so that it can still be
               recognized as a marker after MARKER has been coerced to
               a number.  */
            m = marker;
            if (MARKERP (marker))
              {
                /* A marker that points nowhere is treated as position 0;
                   any other marker tells us which buffer was searched.  */
                if (XMARKER (marker)->buffer == 0)
                  XSETFASTINT (marker, 0);
                else
                  XSETBUFFER (last_thing_searched, XMARKER (marker)->buffer);
              }

            CHECK_NUMBER_COERCE_MARKER (marker);
            from = XINT (marker);

            /* With RESEAT, detach the start marker and clear its slot
               in LIST.  */
            if (!NILP (reseat) && MARKERP (m))
              {
                unchain_marker (XMARKER (m));
                XSETCAR (list, Qnil);
              }

            /* Advance to the end element; give up on this pair if LIST
               stops here.  */
            if ((list = XCDR (list), !CONSP (list)))
              break;

            m = marker = XCAR (list);

            if (MARKERP (marker) && XMARKER (marker)->buffer == 0)
              XSETFASTINT (marker, 0);

            CHECK_NUMBER_COERCE_MARKER (marker);
            /* Both ends are known to be valid now; record the pair.  */
            search_regs.start[i] = from;
            search_regs.end[i] = XINT (marker);

            /* With RESEAT, detach the end marker as well.  */
            if (!NILP (reseat) && MARKERP (m))
              {
                unchain_marker (XMARKER (m));
                XSETCAR (list, Qnil);
              }
          }
        list = XCDR (list);
      }

    /* Registers that LIST did not describe are unmatched.  */
    for (; i < search_regs.num_regs; i++)
      search_regs.start[i] = -1;
  }

  return Qnil;
}
3076
/* If non-zero the match data have been saved in saved_search_regs
   during the execution of a sentinel or filter. */
static int search_regs_saved;
/* The register count and the start/end array pointers taken over from
   search_regs while a save is outstanding.  */
static struct re_registers saved_search_regs;
/* The value last_thing_searched had when the match data were saved.  */
static Lisp_Object saved_last_thing_searched;
3082
3083 /* Called from Flooking_at, Fstring_match, search_buffer, Fstore_match_data
3084    if asynchronous code (filter or sentinel) is running. */
3085 static void
3086 save_search_regs (void)
3087 {
3088   if (!search_regs_saved)
3089     {
3090       saved_search_regs.num_regs = search_regs.num_regs;
3091       saved_search_regs.start = search_regs.start;
3092       saved_search_regs.end = search_regs.end;
3093       saved_last_thing_searched = last_thing_searched;
3094       last_thing_searched = Qnil;
3095       search_regs.num_regs = 0;
3096       search_regs.start = 0;
3097       search_regs.end = 0;
3098
3099       search_regs_saved = 1;
3100     }
3101 }
3102
3103 /* Called upon exit from filters and sentinels. */
3104 void
3105 restore_search_regs (void)
3106 {
3107   if (search_regs_saved)
3108     {
3109       if (search_regs.num_regs > 0)
3110         {
3111           xfree (search_regs.start);
3112           xfree (search_regs.end);
3113         }
3114       search_regs.num_regs = saved_search_regs.num_regs;
3115       search_regs.start = saved_search_regs.start;
3116       search_regs.end = saved_search_regs.end;
3117       last_thing_searched = saved_last_thing_searched;
3118       saved_last_thing_searched = Qnil;
3119       search_regs_saved = 0;
3120     }
3121 }
3122
3123 static Lisp_Object
3124 unwind_set_match_data (Lisp_Object list)
3125 {
3126   /* It is NOT ALWAYS safe to free (evaporate) the markers immediately.  */
3127   return Fset_match_data (list, Qt);
3128 }
3129
3130 /* Called to unwind protect the match data.  */
3131 void
3132 record_unwind_save_match_data (void)
3133 {
3134   record_unwind_protect (unwind_set_match_data,
3135                          Fmatch_data (Qnil, Qnil, Qnil));
3136 }
3137
3138 /* Quote a string to inactivate reg-expr chars */
3139
3140 DEFUN ("regexp-quote", Fregexp_quote, Sregexp_quote, 1, 1, 0,
3141        doc: /* Return a regexp string which matches exactly STRING and nothing else.  */)
3142   (Lisp_Object string)
3143 {
3144   register char *in, *out, *end;
3145   register char *temp;
3146   int backslashes_added = 0;
3147
3148   CHECK_STRING (string);
3149
3150   temp = (char *) alloca (SBYTES (string) * 2);
3151
3152   /* Now copy the data into the new string, inserting escapes. */
3153
3154   in = SSDATA (string);
3155   end = in + SBYTES (string);
3156   out = temp;
3157
3158   for (; in != end; in++)
3159     {
3160       if (*in == '['
3161           || *in == '*' || *in == '.' || *in == '\\'
3162           || *in == '?' || *in == '+'
3163           || *in == '^' || *in == '$')
3164         *out++ = '\\', backslashes_added++;
3165       *out++ = *in;
3166     }
3167
3168   return make_specified_string (temp,
3169                                 SCHARS (string) + backslashes_added,
3170                                 out - temp,
3171                                 STRING_MULTIBYTE (string));
3172 }
3173 \f
3174 void
3175 syms_of_search (void)
3176 {
3177   register int i;
3178
3179   for (i = 0; i < REGEXP_CACHE_SIZE; ++i)
3180     {
3181       searchbufs[i].buf.allocated = 100;
3182       searchbufs[i].buf.buffer = (unsigned char *) xmalloc (100);
3183       searchbufs[i].buf.fastmap = searchbufs[i].fastmap;
3184       searchbufs[i].regexp = Qnil;
3185       searchbufs[i].whitespace_regexp = Qnil;
3186       searchbufs[i].syntax_table = Qnil;
3187       staticpro (&searchbufs[i].regexp);
3188       staticpro (&searchbufs[i].whitespace_regexp);
3189       staticpro (&searchbufs[i].syntax_table);
3190       searchbufs[i].next = (i == REGEXP_CACHE_SIZE-1 ? 0 : &searchbufs[i+1]);
3191     }
3192   searchbuf_head = &searchbufs[0];
3193
3194   Qsearch_failed = intern_c_string ("search-failed");
3195   staticpro (&Qsearch_failed);
3196   Qinvalid_regexp = intern_c_string ("invalid-regexp");
3197   staticpro (&Qinvalid_regexp);
3198
3199   Fput (Qsearch_failed, Qerror_conditions,
3200         pure_cons (Qsearch_failed, pure_cons (Qerror, Qnil)));
3201   Fput (Qsearch_failed, Qerror_message,
3202         make_pure_c_string ("Search failed"));
3203
3204   Fput (Qinvalid_regexp, Qerror_conditions,
3205         pure_cons (Qinvalid_regexp, pure_cons (Qerror, Qnil)));
3206   Fput (Qinvalid_regexp, Qerror_message,
3207         make_pure_c_string ("Invalid regexp"));
3208
3209   last_thing_searched = Qnil;
3210   staticpro (&last_thing_searched);
3211
3212   saved_last_thing_searched = Qnil;
3213   staticpro (&saved_last_thing_searched);
3214
3215   DEFVAR_LISP ("search-spaces-regexp", Vsearch_spaces_regexp,
3216       doc: /* Regexp to substitute for bunches of spaces in regexp search.
3217 Some commands use this for user-specified regexps.
3218 Spaces that occur inside character classes or repetition operators
3219 or other such regexp constructs are not replaced with this.
3220 A value of nil (which is the normal value) means treat spaces literally.  */);
3221   Vsearch_spaces_regexp = Qnil;
3222
3223   DEFVAR_LISP ("inhibit-changing-match-data", Vinhibit_changing_match_data,
3224       doc: /* Internal use only.
3225 If non-nil, the primitive searching and matching functions
3226 such as `looking-at', `string-match', `re-search-forward', etc.,
3227 do not set the match data.  The proper way to use this variable
3228 is to bind it with `let' around a small expression.  */);
3229   Vinhibit_changing_match_data = Qnil;
3230
3231   defsubr (&Slooking_at);
3232   defsubr (&Sposix_looking_at);
3233   defsubr (&Sstring_match);
3234   defsubr (&Sposix_string_match);
3235   defsubr (&Ssearch_forward);
3236   defsubr (&Ssearch_backward);
3237   defsubr (&Sword_search_forward);
3238   defsubr (&Sword_search_backward);
3239   defsubr (&Sword_search_forward_lax);
3240   defsubr (&Sword_search_backward_lax);
3241   defsubr (&Sre_search_forward);
3242   defsubr (&Sre_search_backward);
3243   defsubr (&Sposix_search_forward);
3244   defsubr (&Sposix_search_backward);
3245   defsubr (&Sreplace_match);
3246   defsubr (&Smatch_beginning);
3247   defsubr (&Smatch_end);
3248   defsubr (&Smatch_data);
3249   defsubr (&Sset_match_data);
3250   defsubr (&Sregexp_quote);
3251 }