/* String search routines for GNU Emacs.
- Copyright (C) 1985, 86,87,93,94,97,98, 1999, 2004
- Free Software Foundation, Inc.
+ Copyright (C) 1985, 1986, 1987, 1993, 1994, 1997, 1998, 1999, 2002, 2003,
+ 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GNU Emacs.
You should have received a copy of the GNU General Public License
along with GNU Emacs; see the file COPYING. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA. */
+the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA. */
#include <config.h>
Lisp_Object Qinvalid_regexp;
-Lisp_Object Vsearch_whitespace_regexp;
+/* Error condition used for failing searches */
+Lisp_Object Qsearch_failed;
+
+Lisp_Object Vsearch_spaces_regexp;
static void set_search_regs ();
static void save_search_regs ();
static int simple_search ();
static int boyer_moore ();
static int search_buffer ();
+static void matcher_overflow () NO_RETURN;
static void
matcher_overflow ()
PATTERN. 0 means all multibyte characters are recognized just as
sequences of binary data.
- The behavior also depends on Vsearch_whitespace_regexp. */
+ The behavior also depends on Vsearch_spaces_regexp. */
static void
compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
cp->buf.translate = (! NILP (translate) ? translate : make_number (0));
cp->posix = posix;
cp->buf.multibyte = multibyte;
- cp->whitespace_regexp = Vsearch_whitespace_regexp;
+ cp->whitespace_regexp = Vsearch_spaces_regexp;
BLOCK_INPUT;
old = re_set_syntax (RE_SYNTAX_EMACS
| (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
- re_set_whitespace_regexp (NILP (Vsearch_whitespace_regexp) ? NULL
- : SDATA (Vsearch_whitespace_regexp));
+ re_set_whitespace_regexp (NILP (Vsearch_spaces_regexp) ? NULL
+ : SDATA (Vsearch_spaces_regexp));
val = (char *) re_compile_pattern ((char *)raw_pattern,
raw_pattern_size, &cp->buf);
re_set_syntax (old);
UNBLOCK_INPUT;
if (val)
- Fsignal (Qinvalid_regexp, Fcons (build_string (val), Qnil));
+ xsignal1 (Qinvalid_regexp, build_string (val));
cp->regexp = Fcopy_sequence (pattern);
}
&& EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
&& cp->posix == posix
&& cp->buf.multibyte == multibyte
- && !NILP (Fequal (cp->whitespace_regexp, Vsearch_whitespace_regexp)))
+ && !NILP (Fequal (cp->whitespace_regexp, Vsearch_spaces_regexp)))
break;
/* If we're at the end of the cache, compile into the nil cell
return &cp->buf;
}
-/* Error condition used for failing searches */
-Lisp_Object Qsearch_failed;
-
-Lisp_Object
-signal_failure (arg)
- Lisp_Object arg;
-{
- Fsignal (Qsearch_failed, Fcons (arg, Qnil));
- return Qnil;
-}
\f
static Lisp_Object
looking_at_1 (string, posix)
CHECK_STRING (string);
bufp = compile_pattern (string, &search_regs,
(!NILP (current_buffer->case_fold_search)
- ? DOWNCASE_TABLE : Qnil),
+ ? current_buffer->case_canon_table : Qnil),
posix,
!NILP (current_buffer->enable_multibyte_characters));
bufp = compile_pattern (regexp, &search_regs,
(!NILP (current_buffer->case_fold_search)
- ? DOWNCASE_TABLE : Qnil),
+ ? current_buffer->case_canon_table : Qnil),
posix,
STRING_MULTIBYTE (string));
immediate_quit = 1;
DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0,
doc: /* Return index of start of first match for REGEXP in STRING, or nil.
-Case is ignored if `case-fold-search' is non-nil in the current buffer.
+Matching ignores case if `case-fold-search' is non-nil.
If third arg START is non-nil, start search at that index in STRING.
For index of first char beyond the match, do (match-end 0).
`match-end' and `match-beginning' also give indices of substrings
regexp = string_make_unibyte (regexp);
re_match_object = Qt;
bufp = compile_pattern (regexp, 0,
- Vascii_downcase_table, 0,
+ Vascii_canon_table, 0,
0);
immediate_quit = 1;
val = re_search (bufp, string, len, 0, len, 0);
int val;
struct re_pattern_buffer *bufp;
- bufp = compile_pattern (regexp, 0, Vascii_downcase_table,
+ bufp = compile_pattern (regexp, 0, Vascii_canon_table,
0, STRING_MULTIBYTE (string));
immediate_quit = 1;
re_match_object = string;
if (np <= 0)
{
if (NILP (noerror))
- return signal_failure (string);
+ xsignal1 (Qsearch_failed, string);
+
if (!EQ (noerror, Qt))
{
if (lim < BEGV || lim > ZV)
return pos;
}
- if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_whitespace_regexp)))
+ if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_spaces_regexp)))
{
unsigned char *p1, *p2;
int s1, s2;
int raw_pattern_size_byte;
unsigned char *patbuf;
int multibyte = !NILP (current_buffer->enable_multibyte_characters);
- unsigned char *base_pat = SDATA (string);
+ unsigned char *base_pat;
+ /* Set to positive if we find a non-ASCII char that needs
+ translation. Otherwise set to zero later. */
int charset_base = -1;
int boyer_moore_ok = 1;
base_pat = raw_pattern;
if (multibyte)
{
+ /* Fill patbuf with translated characters in STRING while
+ checking if we can use boyer-moore search. If TRT is
+ non-nil, we can use boyer-moore search only if TRT can be
+ represented by a byte array of 256 elements. For that,
+ all non-ASCII case-equivalents of all case-sensitive
+ characters in STRING must belong to the same charset and
+ row. */
+
while (--len >= 0)
{
- unsigned char str[MAX_MULTIBYTE_LENGTH];
+ unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
int c, translated, inverse;
int in_charlen, charlen;
if (RE && *base_pat == '\\')
{
len--;
+ raw_pattern_size--;
len_byte--;
base_pat++;
}
c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
- /* Translate the character, if requested. */
- TRANSLATE (translated, trt, c);
- /* If translation changed the byte-length, go back
- to the original character. */
- charlen = CHAR_STRING (translated, str);
- if (in_charlen != charlen)
+ if (NILP (trt))
{
- translated = c;
- charlen = CHAR_STRING (c, str);
+ str = base_pat;
+ charlen = in_charlen;
}
-
- /* If we are searching for something strange,
- an invalid multibyte code, don't use boyer-moore. */
- if (! ASCII_BYTE_P (translated)
- && (charlen == 1 /* 8bit code */
- || charlen != in_charlen /* invalid multibyte code */
- ))
- boyer_moore_ok = 0;
-
- TRANSLATE (inverse, inverse_trt, c);
-
- /* Did this char actually get translated?
- Would any other char get translated into it? */
- if (translated != c || inverse != c)
+ else
{
- /* Keep track of which character set row
- contains the characters that need translation. */
- int charset_base_code = c & ~CHAR_FIELD3_MASK;
- int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK;
-
- if (charset_base_code != inverse_charset_base)
- boyer_moore_ok = 0;
- else if (charset_base == -1)
- charset_base = charset_base_code;
- else if (charset_base != charset_base_code)
- /* If two different rows appear, needing translation,
- then we cannot use boyer_moore search. */
- boyer_moore_ok = 0;
+ /* Translate the character. */
+ TRANSLATE (translated, trt, c);
+ charlen = CHAR_STRING (translated, str_base);
+ str = str_base;
+
+ /* Check if C has any other case-equivalents. */
+ TRANSLATE (inverse, inverse_trt, c);
+ /* If so, check if we can use boyer-moore. */
+ if (c != inverse && boyer_moore_ok)
+ {
+ /* Check if all equivalents belong to the same
+ charset & row. Note that the check of C
+ itself is done by the last iteration. Note
+ also that we don't have to check ASCII
+ characters because boyer-moore search can
+ always handle their translation. */
+ while (1)
+ {
+ if (ASCII_BYTE_P (inverse))
+ {
+ if (charset_base > 0)
+ {
+ boyer_moore_ok = 0;
+ break;
+ }
+ charset_base = 0;
+ }
+ else if (SINGLE_BYTE_CHAR_P (inverse))
+ {
+ /* Boyer-moore search can't handle a
+ translation of an eight-bit
+ character. */
+ boyer_moore_ok = 0;
+ break;
+ }
+ else if (charset_base < 0)
+ charset_base = inverse & ~CHAR_FIELD3_MASK;
+ else if ((inverse & ~CHAR_FIELD3_MASK)
+ != charset_base)
+ {
+ boyer_moore_ok = 0;
+ break;
+ }
+ if (c == inverse)
+ break;
+ TRANSLATE (inverse, inverse_trt, inverse);
+ }
+ }
}
+ if (charset_base < 0)
+ charset_base = 0;
/* Store this character into the translated pattern. */
bcopy (str, pat, charlen);
if (RE && *base_pat == '\\')
{
len--;
+ raw_pattern_size--;
base_pat++;
}
c = *base_pat++;
return n;
}
\f
-/* Do Boyer-Moore search N times for the string PAT,
+/* Do Boyer-Moore search N times for the string BASE_PAT,
whose length is LEN/LEN_BYTE,
from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
DIRECTION says which direction we search in.
TRT and INVERSE_TRT are translation tables.
+ Characters in PAT are already translated by TRT.
- This kind of search works if all the characters in PAT that have
- nontrivial translation are the same aside from the last byte. This
- makes it possible to translate just the last byte of a character,
- and do so after just a simple test of the context.
+ This kind of search works if all the characters in BASE_PAT that
+ have nontrivial translation are the same aside from the last byte.
+ This makes it possible to translate just the last byte of a
+ character, and do so after just a simple test of the context.
+ CHARSET_BASE is nonzero iff there is such a non-ASCII character.
If that criterion is not satisfied, do not call this function. */
int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
unsigned char simple_translate[0400];
- int translate_prev_byte = 0;
- int translate_anteprev_byte = 0;
+ /* These are set to the preceding bytes of a byte to be translated
+ if charset_base is nonzero. As the maximum byte length of a
+ multibyte character is 4, we have to check at most three previous
+ bytes. */
+ int translate_prev_byte1 = 0;
+ int translate_prev_byte2 = 0;
+ int translate_prev_byte3 = 0;
#ifdef C_ALLOCA
int BM_tab_space[0400];
for (i = 0; i < 0400; i++)
simple_translate[i] = i;
+ if (charset_base)
+ {
+ /* Set up translate_prev_byte1/2/3 from CHARSET_BASE. Only a
+ byte following them is a target of translation. */
+ int sample_char = charset_base | 0x20;
+ unsigned char str[MAX_MULTIBYTE_LENGTH];
+ int len = CHAR_STRING (sample_char, str);
+
+ translate_prev_byte1 = str[len - 2];
+ if (len > 2)
+ {
+ translate_prev_byte2 = str[len - 3];
+ if (len > 3)
+ translate_prev_byte3 = str[len - 4];
+ }
+ }
+
i = 0;
while (i != infinity)
{
i = infinity;
if (! NILP (trt))
{
- int ch;
- int untranslated;
- int this_translated = 1;
-
- if (multibyte
- /* Is *PTR the last byte of a character? */
- && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1])))
+ /* If the byte currently being looked at is the last byte of
+ a character whose case-equivalents must be checked, set CH
+ to that character. ASCII characters and non-ASCII
+ characters matching CHARSET_BASE are the ones to check. */
+ int ch = -1;
+
+ if (ASCII_BYTE_P (*ptr) || ! multibyte)
+ ch = *ptr;
+ else if (charset_base
+ && ((pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1])))
{
- unsigned char *charstart = ptr;
- while (! CHAR_HEAD_P (*charstart))
+ unsigned char *charstart = ptr - 1;
+
+ while (! (CHAR_HEAD_P (*charstart)))
charstart--;
- untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
- if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
- {
- TRANSLATE (ch, trt, untranslated);
- if (! CHAR_HEAD_P (*ptr))
- {
- translate_prev_byte = ptr[-1];
- if (! CHAR_HEAD_P (translate_prev_byte))
- translate_anteprev_byte = ptr[-2];
- }
- }
- else
- {
- this_translated = 0;
- ch = *ptr;
- }
- }
- else if (!multibyte)
- TRANSLATE (ch, trt, *ptr);
- else
- {
- ch = *ptr;
- this_translated = 0;
+ ch = STRING_CHAR (charstart, ptr - charstart + 1);
+ if (charset_base != (ch & ~CHAR_FIELD3_MASK))
+ ch = -1;
}
- if (ch > 0400)
+ if (ch >= 0400)
j = ((unsigned char) ch) | 0200;
else
- j = (unsigned char) ch;
+ j = *ptr;
if (i == infinity)
stride_for_teases = BM_tab[j];
BM_tab[j] = dirlen - i;
/* A translation table is accompanied by its inverse -- see */
/* comment following downcase_table for details */
- if (this_translated)
+ if (ch >= 0)
{
int starting_ch = ch;
int starting_j = j;
+
while (1)
{
TRANSLATE (ch, inverse_trt, ch);
- if (ch > 0400)
+ if (ch >= 0400)
j = ((unsigned char) ch) | 0200;
else
j = (unsigned char) ch;
|| ((cursor == tail_end_ptr
|| CHAR_HEAD_P (cursor[1]))
&& (CHAR_HEAD_P (cursor[0])
- || (translate_prev_byte == cursor[-1]
- && (CHAR_HEAD_P (translate_prev_byte)
- || translate_anteprev_byte == cursor[-2])))))
+ /* Check if this is the last byte of
+ a translatable character. */
+ || (translate_prev_byte1 == cursor[-1]
+ && (CHAR_HEAD_P (translate_prev_byte1)
+ || (translate_prev_byte2 == cursor[-2]
+ && (CHAR_HEAD_P (translate_prev_byte2)
+ || (translate_prev_byte3 == cursor[-3]))))))))
ch = simple_translate[*cursor];
else
ch = *cursor;
|| ((ptr == tail_end_ptr
|| CHAR_HEAD_P (ptr[1]))
&& (CHAR_HEAD_P (ptr[0])
- || (translate_prev_byte == ptr[-1]
- && (CHAR_HEAD_P (translate_prev_byte)
- || translate_anteprev_byte == ptr[-2])))))
+ /* Check if this is the last byte of a
+ translatable character. */
+ || (translate_prev_byte1 == ptr[-1]
+ && (CHAR_HEAD_P (translate_prev_byte1)
+ || (translate_prev_byte2 == ptr[-2]
+ && (CHAR_HEAD_P (translate_prev_byte2)
+ || translate_prev_byte3 == ptr[-3])))))))
ch = simple_translate[*ptr];
else
ch = *ptr;
/* but some C compilers blew it */
if (search_regs.num_regs <= 0)
- error ("replace-match called before any match found");
+ error ("`replace-match' called before any match found");
if (NILP (subexp))
sub = 0;
else
some_multiletter_word = 1;
}
- else if (!NOCASEP (c))
+ else if (UPPERCASEP (c))
{
some_uppercase = 1;
if (SYNTAX (prevc) != Sword)
return match_limit (subexp, 0);
}
-DEFUN ("match-data", Fmatch_data, Smatch_data, 0, 2, 0,
+DEFUN ("match-data", Fmatch_data, Smatch_data, 0, 3, 0,
doc: /* Return a list containing all info on what the last search matched.
Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'.
All the elements are markers or nil (nil if the Nth pair didn't match)
this case, and if the last match was in a buffer, the buffer will get
stored as one additional element at the end of the list.
-If REUSE is a list, reuse it as part of the value. If REUSE is long enough
-to hold all the values, and if INTEGERS is non-nil, no consing is done.
+If REUSE is a list, reuse it as part of the value. If REUSE is long
+enough to hold all the values, and if INTEGERS is non-nil, no consing
+is done.
+
+If optional third arg RESEAT is non-nil, any previous markers on the
+REUSE list will be modified to point to nowhere.
Return value is undefined if the last search failed. */)
- (integers, reuse)
- Lisp_Object integers, reuse;
+ (integers, reuse, reseat)
+ Lisp_Object integers, reuse, reseat;
{
Lisp_Object tail, prev;
Lisp_Object *data;
int i, len;
+ if (!NILP (reseat))
+ for (tail = reuse; CONSP (tail); tail = XCDR (tail))
+ if (MARKERP (XCAR (tail)))
+ {
+ unchain_marker (XMARKER (XCAR (tail)));
+ XSETCAR (tail, Qnil);
+ }
+
if (NILP (last_thing_searched))
return Qnil;
/* last_thing_searched must always be Qt, a buffer, or Qnil. */
abort ();
- len = 2*(i+1);
+ len = 2 * i + 2;
}
else
- data[2 * i] = data [2 * i + 1] = Qnil;
+ data[2 * i] = data[2 * i + 1] = Qnil;
}
if (BUFFERP (last_thing_searched) && !NILP (integers))
return reuse;
}
+/* Internal usage only:
+ If RESEAT is `evaporate', put the markers back on the free list
+ immediately. No other references to the markers must exist in this case,
+ so it is used only internally on the unwind stack and save-match-data from
+ Lisp. */
-DEFUN ("set-match-data", Fset_match_data, Sset_match_data, 1, 1, 0,
+DEFUN ("set-match-data", Fset_match_data, Sset_match_data, 1, 2, 0,
doc: /* Set internal data on last search match from elements of LIST.
-LIST should have been created by calling `match-data' previously. */)
- (list)
- register Lisp_Object list;
+LIST should have been created by calling `match-data' previously.
+
+If optional arg RESEAT is non-nil, make markers on LIST point nowhere. */)
+ (list, reseat)
+ register Lisp_Object list, reseat;
{
register int i;
register Lisp_Object marker;
if (running_asynch_code)
save_search_regs ();
- if (!CONSP (list) && !NILP (list))
- list = wrong_type_argument (Qconsp, list);
+ CHECK_LIST (list);
/* Unless we find a marker with a buffer or an explicit buffer
in LIST, assume that this match data came from a string. */
search_regs.num_regs = length;
}
- for (i = 0;; i++)
+ for (i = 0; CONSP (list); i++)
{
- marker = Fcar (list);
+ marker = XCAR (list);
if (BUFFERP (marker))
{
last_thing_searched = marker;
if (NILP (marker))
{
search_regs.start[i] = -1;
- list = Fcdr (list);
+ list = XCDR (list);
}
else
{
int from;
-
+ Lisp_Object m;
+
+ m = marker;
if (MARKERP (marker))
{
if (XMARKER (marker)->buffer == 0)
else
XSETBUFFER (last_thing_searched, XMARKER (marker)->buffer);
}
-
+
CHECK_NUMBER_COERCE_MARKER (marker);
from = XINT (marker);
- list = Fcdr (list);
-
- marker = Fcar (list);
+
+ if (!NILP (reseat) && MARKERP (m))
+ {
+ if (EQ (reseat, Qevaporate))
+ free_marker (m);
+ else
+ unchain_marker (XMARKER (m));
+ XSETCAR (list, Qnil);
+ }
+
+ if ((list = XCDR (list), !CONSP (list)))
+ break;
+
+ m = marker = XCAR (list);
+
if (MARKERP (marker) && XMARKER (marker)->buffer == 0)
XSETFASTINT (marker, 0);
-
+
CHECK_NUMBER_COERCE_MARKER (marker);
search_regs.start[i] = from;
search_regs.end[i] = XINT (marker);
+
+ if (!NILP (reseat) && MARKERP (m))
+ {
+ if (EQ (reseat, Qevaporate))
+ free_marker (m);
+ else
+ unchain_marker (XMARKER (m));
+ XSETCAR (list, Qnil);
+ }
}
- list = Fcdr (list);
+ list = XCDR (list);
}
for (; i < search_regs.num_regs; i++)
/* Called upon exit from filters and sentinels. */
void
-restore_match_data ()
+restore_search_regs ()
{
if (search_regs_saved)
{
}
}
+static Lisp_Object
+unwind_set_match_data (list)
+ Lisp_Object list;
+{
+ /* Unwind handler: reinstall LIST as the match data and return
+    whatever Fset_match_data returns.  Passing Qevaporate as RESEAT
+    makes Fset_match_data call free_marker on each marker it reads
+    from LIST and set the list slot that held it to nil.
+    It is safe to free (evaporate) the markers immediately. */
+ return Fset_match_data (list, Qevaporate);
+}
+
+/* Called to unwind protect the match data.
+   Takes a snapshot of the current match data with
+   Fmatch_data (Qnil, Qnil, Qnil) -- INTEGERS, REUSE and RESEAT all
+   nil -- and registers unwind_set_match_data through
+   record_unwind_protect with that snapshot as its argument, so the
+   handler can hand the saved list back to Fset_match_data. */
+void
+record_unwind_save_match_data ()
+{
+ record_unwind_protect (unwind_set_match_data,
+ Fmatch_data (Qnil, Qnil, Qnil));
+}
+
/* Quote a string to inactivate reg-expr chars */
DEFUN ("regexp-quote", Fregexp_quote, Sregexp_quote, 1, 1, 0,
for (; in != end; in++)
{
- if (*in == '[' || *in == ']'
+ if (*in == '['
|| *in == '*' || *in == '.' || *in == '\\'
|| *in == '?' || *in == '+'
|| *in == '^' || *in == '$')
searchbufs[i].regexp = Qnil;
searchbufs[i].whitespace_regexp = Qnil;
staticpro (&searchbufs[i].regexp);
+ staticpro (&searchbufs[i].whitespace_regexp);
searchbufs[i].next = (i == REGEXP_CACHE_SIZE-1 ? 0 : &searchbufs[i+1]);
}
searchbuf_head = &searchbufs[0];
saved_last_thing_searched = Qnil;
staticpro (&saved_last_thing_searched);
- DEFVAR_LISP ("search-whitespace-regexp", &Vsearch_whitespace_regexp,
- /* doc: Regexp to substitute for bunches of spaces in regexp search.
+ DEFVAR_LISP ("search-spaces-regexp", &Vsearch_spaces_regexp,
+ doc: /* Regexp to substitute for bunches of spaces in regexp search.
Some commands use this for user-specified regexps.
Spaces that occur inside character classes or repetition operators
or other such regexp constructs are not replaced with this.
A value of nil (which is the normal value) means treat spaces literally. */);
- Vsearch_whitespace_regexp = Qnil;
+ Vsearch_spaces_regexp = Qnil;
defsubr (&Slooking_at);
defsubr (&Sposix_looking_at);