-/* Copyright (C) 1997, 1998, 1999, 2000, 2001, 2004 Free Software Foundation, Inc.
+/* Copyright (C) 1997, 1998, 1999, 2000, 2001, 2004, 2006, 2007, 2010, 2011, 2012 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 3 of
+ * the License, or (at your option) any later version.
*
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
*/
libraries which do not agree with the Spencer implementation may
produce varying behavior. Sigh. */
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "libguile/_scm.h"
-/* Supposedly, this file is never compiled unless we know we have
- POSIX regular expressions. But we still put this in an #ifdef so
- the file is CPP'able (for dependency scanning) even on systems that
- don't have a <regex.h> header. */
-#ifdef HAVE_REGCOMP
-#ifdef HAVE_REGEX_H
#include <regex.h>
-#else
-#ifdef HAVE_RXPOSIX_H
-#include <rxposix.h> /* GNU Rx library */
-#else
-#ifdef HAVE_RX_RXPOSIX_H
-#include <rx/rxposix.h> /* GNU Rx library on Linux */
-#endif
-#endif
-#endif
+
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
#endif
+#include "libguile/async.h"
#include "libguile/smob.h"
#include "libguile/symbols.h"
#include "libguile/vectors.h"
flag = SCM_CDR (flag);
}
- rx = scm_gc_malloc (sizeof(regex_t), "regex");
+ rx = scm_gc_malloc_pointerless (sizeof (regex_t), "regex");
c_pat = scm_to_locale_string (pat);
status = regcomp (rx, c_pat,
/* Make sure they're not passing REG_NOSUB;
scm_from_locale_string (FUNC_NAME),
errmsg,
SCM_BOOL_F,
- SCM_BOOL_F);
+ scm_list_1 (pat));
+
/* never returns */
}
SCM_RETURN_NEWSMOB (scm_tc16_regex, rx);
}
#undef FUNC_NAME
+#ifdef HAVE_WCHAR_H
+/*
+ * While regexec does respect the current locale, it returns byte
+ * offsets instead of character offsets.  This routine rewrites the
+ * regmatch_t structures in place so that they refer to characters
+ * instead.  See "Converting a Character" in the libc manual for more
+ * details.
+ *
+ * MATCHES is an array of NMATCHES entries as filled in by regexec.
+ * STR is the NUL-terminated multibyte string that was matched; it is
+ * decoded with mbrlen, i.e. according to the current locale.
+ *
+ * Entries with a negative offset (regexec stores -1 for subexpressions
+ * that did not take part in the match) are left untouched, as are
+ * offsets that do not fall on a character boundary.
+ *
+ * Only the prefix of STR up to the largest offset found in MATCHES is
+ * decoded; whatever follows the last match cannot affect the result,
+ * so it is neither scanned nor validated.
+ *
+ * Calls abort () if mbrlen reports an invalid or incomplete sequence
+ * inside the scanned prefix.
+ */
+static void
+fixup_multibyte_match (regmatch_t *matches, int nmatches, char *str)
+{
+  mbstate_t state;
+  int i;
+  size_t char_idx = 0, byte_idx = 0;
+  size_t last_byte = 0;   /* largest byte offset that needs translating */
+
+  /* Work out how far into STR we have to decode.  This is computed up
+     front because the offsets are overwritten in place below.  */
+  for (i = 0; i < nmatches; ++i)
+    {
+      if (matches[i].rm_so >= 0 && (size_t) matches[i].rm_so > last_byte)
+        last_byte = (size_t) matches[i].rm_so;
+      if (matches[i].rm_eo >= 0 && (size_t) matches[i].rm_eo > last_byte)
+        last_byte = (size_t) matches[i].rm_eo;
+    }
+
+  memset (&state, '\0', sizeof (state));
+
+  for (;;)
+    {
+      size_t nbytes;
+
+      /* BYTE_IDX is on a character boundary here: translate every
+         offset that points at it.  A translated value is never larger
+         than the byte offset it replaces, so it cannot be matched again
+         by a later (strictly larger) BYTE_IDX.  */
+      for (i = 0; i < nmatches; ++i)
+        {
+          if (matches[i].rm_so >= 0
+              && (size_t) matches[i].rm_so == byte_idx)
+            matches[i].rm_so = (regoff_t) char_idx;
+          if (matches[i].rm_eo >= 0
+              && (size_t) matches[i].rm_eo == byte_idx)
+            matches[i].rm_eo = (regoff_t) char_idx;
+        }
+
+      /* Nothing at or beyond this point still needs translating.  */
+      if (byte_idx >= last_byte)
+        break;
+
+      nbytes = mbrlen (str + byte_idx, MB_LEN_MAX, &state);
+      if (nbytes == (size_t) -2 || nbytes == (size_t) -1)
+        /* Something is wrong.  Shouldn't be possible, as the regex match
+           succeeded.  */
+        abort ();
+      if (nbytes == 0)
+        /* Reached the terminating NUL before LAST_BYTE.  */
+        break;
+
+      char_idx++;
+      byte_idx += nbytes;
+    }
+}
+#endif
+
SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0,
(SCM rx, SCM str, SCM start, SCM flags),
"Match the compiled regular expression @var{rx} against\n"
"@end table")
#define FUNC_NAME s_scm_regexp_exec
{
+ /* We used to have an SCM_DEFER_INTS, and then later an
+ SCM_CRITICAL_SECTION_START, around the regexec() call. We can't quite
+ remember what deferring ints was for, but a critical section would only
+ be wanted now if we thought regexec() were not thread-safe. The POSIX spec
+
+ http://www.opengroup.org/onlinepubs/009695399/functions/regcomp.html
+
+ reads as though regexec is meant to be both thread-safe and reentrant
+ (it mentions simultaneous use in threads, and in signal handlers). So
+ for now we believe no protection is needed. */
+
int status, nmatches, offset;
regmatch_t *matches;
char *c_str;
/* re_nsub doesn't account for the `subexpression' representing the
whole regexp, so add 1 to nmatches. */
+ c_str = scm_to_locale_string (substr);
+
nmatches = SCM_RGX(rx)->re_nsub + 1;
- SCM_DEFER_INTS;
matches = scm_malloc (sizeof (regmatch_t) * nmatches);
- c_str = scm_to_locale_string (substr);
status = regexec (SCM_RGX (rx), c_str, nmatches, matches,
scm_to_int (flags));
+
+#ifdef HAVE_WCHAR_H
+ if (!status)
+ fixup_multibyte_match (matches, nmatches, c_str);
+#endif
+
free (c_str);
if (!status)
/* The match vector must include a cell for the string that was matched,
so add 1. */
mvec = scm_c_make_vector (nmatches + 1, SCM_UNSPECIFIED);
- SCM_VECTOR_SET(mvec,0, str);
+ SCM_SIMPLE_VECTOR_SET(mvec,0, str);
for (i = 0; i < nmatches; ++i)
if (matches[i].rm_so == -1)
- SCM_VECTOR_SET(mvec, i+1,
+ SCM_SIMPLE_VECTOR_SET(mvec, i+1,
scm_cons (scm_from_int (-1), scm_from_int (-1)));
else
- SCM_VECTOR_SET(mvec, i+1,
+ SCM_SIMPLE_VECTOR_SET(mvec, i+1,
scm_cons (scm_from_long (matches[i].rm_so + offset),
scm_from_long (matches[i].rm_eo + offset)));
}
free (matches);
- SCM_ALLOW_INTS;
if (status != 0 && status != REG_NOMATCH)
scm_error_scm (scm_regexp_error_key,
scm_from_locale_string (FUNC_NAME),
scm_regexp_error_msg (status, SCM_RGX (rx)),
- SCM_BOOL_F,
- SCM_BOOL_F);
+ SCM_BOOL_F, SCM_BOOL_F);
return mvec;
}
#undef FUNC_NAME
scm_set_smob_free (scm_tc16_regex, regex_free);
/* Compilation flags. */
- scm_c_define ("regexp/basic", scm_from_long (REG_BASIC));
- scm_c_define ("regexp/extended", scm_from_long (REG_EXTENDED));
- scm_c_define ("regexp/icase", scm_from_long (REG_ICASE));
- scm_c_define ("regexp/newline", scm_from_long (REG_NEWLINE));
+ scm_c_define ("regexp/basic", scm_from_int (REG_BASIC));
+ scm_c_define ("regexp/extended", scm_from_int (REG_EXTENDED));
+ scm_c_define ("regexp/icase", scm_from_int (REG_ICASE));
+ scm_c_define ("regexp/newline", scm_from_int (REG_NEWLINE));
/* Execution flags. */
- scm_c_define ("regexp/notbol", scm_from_long (REG_NOTBOL));
- scm_c_define ("regexp/noteol", scm_from_long (REG_NOTEOL));
+ scm_c_define ("regexp/notbol", scm_from_int (REG_NOTBOL));
+ scm_c_define ("regexp/noteol", scm_from_int (REG_NOTEOL));
#include "libguile/regex-posix.x"