(timezone-parse-date): Use < 69 not < 70 to distinguish 20YY from 19YY.

[bpt/emacs.git] / src / search.c
diff --git a/src/search.c b/src/search.c

index 3ee17bf..da4f635 100644 (file)
--- a/src/search.c
+++ b/src/search.c
@@ -170,7 +170,7 @@ compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
    cp->posix = posix;
    cp->buf.multibyte = multibyte;
    BLOCK_INPUT;
-  old = re_set_syntax (RE_SYNTAX_EMACS
+  old = re_set_syntax (RE_SYNTAX_EMACS | RE_CHAR_CLASSES
                        | (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
    val = (char *) re_compile_pattern ((char *)raw_pattern,
                                      raw_pattern_size, &cp->buf);
@@ -182,6 +182,39 @@ compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
    cp->regexp = Fcopy_sequence (pattern);
  }
  
+/* Shrink each compiled regexp buffer in the cache
+   to the size actually used right now.
+   This is called from garbage collection.
+
+   A buffer whose used size is zero is skipped, and a buffer that
+   realloc fails to shrink keeps its old storage and its old
+   `allocated' count, so every cache entry stays usable.  */
+
+void
+shrink_regexp_cache ()
+{
+  struct regexp_cache *cp;
+
+  for (cp = searchbuf_head; cp != 0; cp = cp->next)
+    {
+      unsigned char *shrunk;
+
+      /* realloc with a size of zero may free the block and return a
+         null pointer; there is nothing worth reclaiming here anyway.  */
+      if (cp->buf.used == 0)
+        continue;
+
+      /* Keep the result in a temporary: if realloc fails, the original
+         block is still valid and must not be overwritten with null.  */
+      shrunk = (unsigned char *) realloc (cp->buf.buffer, cp->buf.used);
+      if (shrunk == 0)
+        continue;
+
+      cp->buf.buffer = shrunk;
+      cp->buf.allocated = cp->buf.used;
+    }
+}
+
  /* Compile a regexp if necessary, but first check to see if there's one in
     the cache.
     PATTERN is the pattern to compile.
@@ -398,6 +415,7 @@ string_match_1 (regexp, string, start, posix)
  
  DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0,
    "Return index of start of first match for REGEXP in STRING, or nil.\n\
+Case is ignored if `case-fold-search' is non-nil in the current buffer.\n\
  If third arg START is non-nil, start search at that index in STRING.\n\
  For index of first char beyond the match, do (match-end 0).\n\
  `match-end' and `match-beginning' also give indices of substrings\n\
@@ -411,6 +429,7 @@ matched by parenthesis constructs in the pattern.")
  DEFUN ("posix-string-match", Fposix_string_match, Sposix_string_match, 2, 3, 0,
    "Return index of start of first match for REGEXP in STRING, or nil.\n\
  Find the longest match, in accord with Posix regular expression rules.\n\
+Case is ignored if `case-fold-search' is non-nil in the current buffer.\n\
  If third arg START is non-nil, start search at that index in STRING.\n\
  For index of first char beyond the match, do (match-end 0).\n\
  `match-end' and `match-beginning' also give indices of substrings\n\
@@ -1005,17 +1024,14 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
    if (running_asynch_code)
      save_search_regs ();
  
+  /* Searching 0 times means don't move.  */
    /* Null string is found at starting position.  */
-  if (len == 0)
+  if (len == 0 || n == 0)
      {
        set_search_regs (pos, 0);
        return pos;
      }
  
-  /* Searching 0 times means don't move.  */
-  if (n == 0)
-    return pos;
-
    if (RE && !trivial_regexp_p (string))
      {
        unsigned char *p1, *p2;
@@ -1128,7 +1144,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
        int multibyte = !NILP (current_buffer->enable_multibyte_characters);
        unsigned char *base_pat = XSTRING (string)->data;
        int charset_base = -1;
-      int simple = 1;
+      int boyer_moore_ok = 1;
  
        /* MULTIBYTE says whether the text to be searched is multibyte.
          We must convert PATTERN to match that, or we will not really
@@ -1190,6 +1206,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
                 }
  
               c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
+
               /* Translate the character, if requested.  */
               TRANSLATE (translated, trt, c);
               /* If translation changed the byte-length, go back
@@ -1201,6 +1218,14 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
                   charlen = CHAR_STRING (c, workbuf, str);
                 }
  
+             /* If we are searching for something strange,
+                an invalid multibyte code, don't use boyer-moore.  */
+             if (! ASCII_BYTE_P (translated)
+                 && (charlen == 1 /* 8bit code */
+                     || charlen != in_charlen /* invalid multibyte code */
+                     ))
+               boyer_moore_ok = 0;
+
               TRANSLATE (inverse, inverse_trt, c);
  
               /* Did this char actually get translated?
@@ -1209,14 +1234,14 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
                 {
                   /* Keep track of which character set row
                      contains the characters that need translation.  */
-                 int charset_base_code = c & ~0xff;
+                 int charset_base_code = c & ~CHAR_FIELD3_MASK;
                   if (charset_base == -1)
                     charset_base = charset_base_code;
                   else if (charset_base != charset_base_code)
                     /* If two different rows appear, needing translation,
                        then we cannot use boyer_moore search.  */
-                   simple = 0;
-                   /* ??? Handa: this must do simple = 0
+                   boyer_moore_ok = 0;
+                   /* ??? Handa: this must do boyer_moore_ok = 0
                        if c is a composite character.  */
                 }
  
@@ -1229,9 +1254,11 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
         }
        else
         {
+         /* Unibyte buffer.  */
+         charset_base = 0;
           while (--len >= 0)
             {
-             int c, translated, inverse;
+             int c, translated;
  
               /* If we got here and the RE flag is set, it's because we're
                  dealing with a regexp known to be trivial, so the backslash
@@ -1243,22 +1270,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
                 }
               c = *base_pat++;
               TRANSLATE (translated, trt, c);
-             TRANSLATE (inverse, inverse_trt, c);
-
-             /* Did this char actually get translated?
-                Would any other char get translated into it?  */
-             if (translated != c || inverse != c)
-               {
-                 /* Keep track of which character set row
-                    contains the characters that need translation.  */
-                 int charset_base_code = c & ~0xff;
-                 if (charset_base == -1)
-                   charset_base = charset_base_code;
-                 else if (charset_base != charset_base_code)
-                   /* If two different rows appear, needing translation,
-                      then we cannot use boyer_moore search.  */
-                   simple = 0;
-               }
               *pat++ = translated;
             }
         }
@@ -1267,7 +1278,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
        len = raw_pattern_size;
        pat = base_pat = patbuf;
  
-      if (simple)
+      if (boyer_moore_ok)
         return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
                             pos, pos_byte, lim, lim_byte,
                             charset_base);
@@ -1612,7 +1623,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
               while (! CHAR_HEAD_P (*charstart))
                 charstart--;
               untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
-             if (charset_base == (untranslated & ~0xff))
+             if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
                 {
                   TRANSLATE (ch, trt, untranslated);
                   if (! CHAR_HEAD_P (*ptr))
@@ -1896,12 +1907,15 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
  }
  
  /* Record beginning BEG_BYTE and end BEG_BYTE + NBYTES
-   for a match just found in the current buffer.  */
+   for the overall match just found in the current buffer.
+   Also clear out the match data for registers 1 and up.  */
  
  static void
  set_search_regs (beg_byte, nbytes)
       int beg_byte, nbytes;
  {
+  int i;
+
    /* Make sure we have registers in which to store
       the match position.  */
    if (search_regs.num_regs == 0)
@@ -1911,6 +1925,13 @@ set_search_regs (beg_byte, nbytes)
        search_regs.num_regs = 2;
      }
  
+  /* Clear out the other registers.  */
+  for (i = 1; i < search_regs.num_regs; i++)
+    {
+      search_regs.start[i] = -1;
+      search_regs.end[i] = -1;
+    }
+
    search_regs.start[0] = BYTE_TO_CHAR (beg_byte);
    search_regs.end[0] = BYTE_TO_CHAR (beg_byte + nbytes);
    XSETBUFFER (last_thing_searched, current_buffer);
@@ -1959,8 +1980,12 @@ wordify (string)
      return build_string ("");
  
    adjust = - punct_count + 5 * (word_count - 1) + 4;
-  val = make_uninit_multibyte_string (len + adjust,
-                                     STRING_BYTES (XSTRING (string)) + adjust);
+  if (STRING_MULTIBYTE (string))
+    val = make_uninit_multibyte_string (len + adjust,
+                                       STRING_BYTES (XSTRING (string))
+                                       + adjust);
+  else
+    val = make_uninit_string (len + adjust);
  
    o = XSTRING (val)->data;
    *o++ = '\\';
@@ -1975,7 +2000,10 @@ wordify (string)
        if (STRING_MULTIBYTE (string))
         FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte);
        else
-       c = XSTRING (string)->data[i++];
+       {
+         c = XSTRING (string)->data[i++];
+         i_byte++;
+       }
  
        if (SYNTAX (c) == Sword)
         {
@@ -2163,7 +2191,7 @@ since only regular expressions have distinguished subexpressions.")
       Lisp_Object newtext, fixedcase, literal, string, subexp;
  {
    enum { nochange, all_caps, cap_initial } case_action;
-  register int pos, last;
+  register int pos, pos_byte;
    int some_multiletter_word;
    int some_lowercase;
    int some_uppercase;
@@ -2213,18 +2241,16 @@ since only regular expressions have distinguished subexpressions.")
  
    if (NILP (fixedcase))
      {
-      int beg;
        /* Decide how to casify by examining the matched text. */
+      int last;
  
-      if (NILP (string))
-       last = CHAR_TO_BYTE (search_regs.end[sub]);
-      else
-       last = search_regs.end[sub];
+      pos = search_regs.start[sub];
+      last = search_regs.end[sub];
  
        if (NILP (string))
-       beg = CHAR_TO_BYTE (search_regs.start[sub]);
+       pos_byte = CHAR_TO_BYTE (pos);
        else
-       beg = search_regs.start[sub];
+       pos_byte = string_char_to_byte (string, pos);
  
        prevc = '\n';
        case_action = all_caps;
@@ -2236,12 +2262,15 @@ since only regular expressions have distinguished subexpressions.")
        some_nonuppercase_initial = 0;
        some_uppercase = 0;
  
-      for (pos = beg; pos < last; pos++)
+      while (pos < last)
         {
           if (NILP (string))
-           c = FETCH_BYTE (pos);
+           {
+             c = FETCH_CHAR (pos_byte);
+             INC_BOTH (pos, pos_byte);
+           }
           else
-           c = XSTRING (string)->data[pos];
+           FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte);
  
           if (LOWERCASEP (c))
             {
@@ -2305,11 +2334,11 @@ since only regular expressions have distinguished subexpressions.")
           /* We build up the substituted string in ACCUM.  */
           Lisp_Object accum;
           Lisp_Object middle;
-         int pos_byte;
+         int length = STRING_BYTES (XSTRING (newtext));
  
           accum = Qnil;
  
-         for (pos_byte = 0, pos = 0; pos_byte < STRING_BYTES (XSTRING (newtext));)
+         for (pos_byte = 0, pos = 0; pos_byte < length;)
             {
               int substart = -1;
               int subend;
@@ -2385,8 +2414,10 @@ since only regular expressions have distinguished subexpressions.")
      }
  
    /* Record point, the move (quietly) to the start of the match.  */
-  if (PT > search_regs.start[sub])
+  if (PT >= search_regs.end[sub])
      opoint = PT - ZV;
+  else if (PT > search_regs.start[sub])
+    opoint = search_regs.end[sub] - ZV;
    else
      opoint = PT;
  
@@ -2401,16 +2432,19 @@ since only regular expressions have distinguished subexpressions.")
    else
      {
        struct gcpro gcpro1;
+      int length = STRING_BYTES (XSTRING (newtext));
+
        GCPRO1 (newtext);
  
-      for (pos = 0; pos < XSTRING (newtext)->size; pos++)
+      for (pos_byte = 0, pos = 0; pos_byte < length;)
         {
           int offset = PT - search_regs.start[sub];
  
-         c = XSTRING (newtext)->data[pos];
+         FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
+
           if (c == '\\')
             {
-             c = XSTRING (newtext)->data[++pos];
+             FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
               if (c == '&')
                 Finsert_buffer_substring
                   (Fcurrent_buffer (),