X-Git-Url: http://git.hcoop.net/bpt/guile.git/blobdiff_plain/a06e3a75b298d2335d134cfec97ee893b29aea33..f4af36aca47f7d0653b997986e8be9894bbd87ff:/libguile/regex-posix.c diff --git a/libguile/regex-posix.c b/libguile/regex-posix.c index 1a232d59f..bec0f89fb 100644 --- a/libguile/regex-posix.c +++ b/libguile/regex-posix.c @@ -1,43 +1,19 @@ -/* Copyright (C) 1997, 1998, 1999, 2000, 2001 Free Software Foundation, Inc. +/* Copyright (C) 1997, 1998, 1999, 2000, 2001, 2004, 2006, 2007, 2010, 2011, 2012 Free Software Foundation, Inc. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 3 of + * the License, or (at your option) any later version. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this software; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 59 Temple Place, Suite 330, - * Boston, MA 02111-1307 USA - * - * As a special exception, the Free Software Foundation gives permission - * for additional uses of the text contained in its release of GUILE. 
- * - * The exception is that, if you link the GUILE library with other files - * to produce an executable, this does not by itself cause the - * resulting executable to be covered by the GNU General Public License. - * Your use of that executable is in no way restricted on account of - * linking the GUILE library code into it. - * - * This exception does not however invalidate any other reasons why - * the executable file might be covered by the GNU General Public License. - * - * This exception applies only to the code released by the - * Free Software Foundation under the name GUILE. If you copy - * code from other Free Software Foundation releases into a copy of - * GUILE, as the General Public License permits, the exception does - * not apply to the code that you add in this way. To avoid misleading - * anyone as to the status of such modified files, you must delete - * this exception notice from them. - * - * If you write modifications of your own for GUILE, it is your choice - * whether to permit this exception to apply to your modifications. - * If you do not wish that, delete this exception notice. + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA */ @@ -51,28 +27,21 @@ libraries which do not agree with the Spencer implementation may produce varying behavior. Sigh. */ +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + #include <sys/types.h> #include "libguile/_scm.h" -/* Supposedly, this file is never compiled unless we know we have - POSIX regular expressions. But we still put this in an #ifdef so - the file is CPP'able (for dependency scanning) even on systems that - don't have a <regex.h> header.
*/ -#ifdef HAVE_REGCOMP -#ifdef HAVE_REGEX_H #include <regex.h> -#else -#ifdef HAVE_RXPOSIX_H -#include <rxposix.h> /* GNU Rx library */ -#else -#ifdef HAVE_RX_RXPOSIX_H -#include <rx/rxposix.h> /* GNU Rx library on Linux */ -#endif -#endif -#endif + +#ifdef HAVE_WCHAR_H +#include <wchar.h> #endif +#include "libguile/async.h" #include "libguile/smob.h" #include "libguile/symbols.h" #include "libguile/vectors.h" @@ -95,40 +64,29 @@ static size_t regex_free (SCM obj) { regfree (SCM_RGX (obj)); - free (SCM_RGX (obj)); - return sizeof(regex_t); + scm_gc_free (SCM_RGX (obj), sizeof(regex_t), "regex"); + return 0; } SCM_SYMBOL (scm_regexp_error_key, "regular-expression-syntax"); -static char * +static SCM scm_regexp_error_msg (int regerrno, regex_t *rx) { - SCM errmsg; + char *errmsg; int l; - /* FIXME: must we wrap any external calls in SCM_DEFER_INTS...SCM_ALLOW_INTS? - Or are these only necessary when a SCM object may be left in an - undetermined state (half-formed)? If the latter then I believe we - may do without the critical section code. -twp */ - - /* We could simply make errmsg a char pointer, and allocate space with - malloc. But since we are about to pass the pointer to scm_error, which - never returns, we would never have the opportunity to free it. Creating - it as a SCM object means that the system will GC it at some point.
*/ - - errmsg = scm_make_string (SCM_MAKINUM (80), SCM_UNDEFINED); - SCM_DEFER_INTS; - l = regerror (regerrno, rx, SCM_STRING_CHARS (errmsg), 80); + errmsg = scm_malloc (80); + l = regerror (regerrno, rx, errmsg, 80); if (l > 80) { - errmsg = scm_make_string (SCM_MAKINUM (l), SCM_UNDEFINED); - regerror (regerrno, rx, SCM_STRING_CHARS (errmsg), l); + free (errmsg); + errmsg = scm_malloc (l); + regerror (regerrno, rx, errmsg, l); } - SCM_ALLOW_INTS; - return SCM_STRING_CHARS (errmsg); + return scm_take_locale_string (errmsg); } SCM_DEFINE (scm_regexp_p, "regexp?", 1, 0, 0, @@ -137,7 +95,7 @@ SCM_DEFINE (scm_regexp_p, "regexp?", 1, 0, 0, "or @code{#f} otherwise.") #define FUNC_NAME s_scm_regexp_p { - return SCM_BOOL(SCM_RGXP (obj)); + return scm_from_bool(SCM_RGXP (obj)); } #undef FUNC_NAME @@ -185,42 +143,84 @@ SCM_DEFINE (scm_make_regexp, "make-regexp", 1, 0, 1, SCM flag; regex_t *rx; int status, cflags; + char *c_pat; SCM_VALIDATE_STRING (1, pat); SCM_VALIDATE_REST_ARGUMENT (flags); - SCM_STRING_COERCE_0TERMINATION_X (pat); /* Examine list of regexp flags. If REG_BASIC is supplied, then turn off REG_EXTENDED flag (on by default). */ cflags = REG_EXTENDED; flag = flags; - while (!SCM_NULLP (flag)) + while (!scm_is_null (flag)) { - if (SCM_INUM (SCM_CAR (flag)) == REG_BASIC) + if (scm_to_int (SCM_CAR (flag)) == REG_BASIC) cflags &= ~REG_EXTENDED; else - cflags |= SCM_INUM (SCM_CAR (flag)); + cflags |= scm_to_int (SCM_CAR (flag)); flag = SCM_CDR (flag); } - rx = SCM_MUST_MALLOC_TYPE (regex_t); - status = regcomp (rx, SCM_STRING_CHARS (pat), + rx = scm_gc_malloc_pointerless (sizeof (regex_t), "regex"); + c_pat = scm_to_locale_string (pat); + status = regcomp (rx, c_pat, /* Make sure they're not passing REG_NOSUB; regexp-exec assumes we're getting match data. 
*/ cflags & ~REG_NOSUB); + free (c_pat); if (status != 0) { - scm_error (scm_regexp_error_key, - FUNC_NAME, - scm_regexp_error_msg (status, rx), - SCM_BOOL_F, - SCM_BOOL_F); + SCM errmsg = scm_regexp_error_msg (status, rx); + scm_gc_free (rx, sizeof(regex_t), "regex"); + scm_error_scm (scm_regexp_error_key, + scm_from_locale_string (FUNC_NAME), + errmsg, + SCM_BOOL_F, + scm_list_1 (pat)); + /* never returns */ } SCM_RETURN_NEWSMOB (scm_tc16_regex, rx); } #undef FUNC_NAME +#ifdef HAVE_WCHAR_H +/* + * While regexec does respect the current locale, it returns byte + * offsets instead of character offsets. This routine fixes up the + * regmatch_t structures to refer to characters instead. See "Converting + * a Character" in the libc manual, for more details. + */ +static void +fixup_multibyte_match (regmatch_t *matches, int nmatches, char *str) +{ + mbstate_t state; + int i; + size_t char_idx, byte_idx; + size_t nbytes = 1; /* just to kick off the for loop */ + + memset (&state, '\0', sizeof (state)); + + for (char_idx = byte_idx = 0; nbytes > 0; char_idx++, byte_idx += nbytes) + { + for (i = 0; i < nmatches; ++i) + { + if (matches[i].rm_so == byte_idx) + matches[i].rm_so = char_idx; + if (matches[i].rm_eo == byte_idx) + matches[i].rm_eo = char_idx; + } + + nbytes = mbrlen (str + byte_idx, MB_LEN_MAX, &state); + if (nbytes == (size_t) -2 || nbytes == (size_t) -1) + /* Something is wrong. Shouldn't be possible, as the regex match + succeeded. */ + abort (); + } + +} +#endif + SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0, (SCM rx, SCM str, SCM start, SCM flags), "Match the compiled regular expression @var{rx} against\n" @@ -244,52 +244,80 @@ SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0, "@end table") #define FUNC_NAME s_scm_regexp_exec { + /* We used to have an SCM_DEFER_INTS, and then later an + SCM_CRITICAL_SECTION_START, around the regexec() call. 
Can't quite + remember what defer ints was for, but a critical section would only be + wanted now if we think regexec() is not thread-safe. The posix spec + + http://www.opengroup.org/onlinepubs/009695399/functions/regcomp.html + + reads like regexec is meant to be both thread safe and reentrant + (mentioning simultaneous use in threads, and in signal handlers). So + for now believe no protection needed. */ + int status, nmatches, offset; regmatch_t *matches; + char *c_str; SCM mvec = SCM_BOOL_F; + SCM substr; - SCM_VALIDATE_RGXP (1,rx); + SCM_VALIDATE_RGXP (1, rx); SCM_VALIDATE_STRING (2, str); - SCM_VALIDATE_INUM_DEF_COPY (3,start,0,offset); - SCM_ASSERT_RANGE (3,start, offset >= 0 && offset <= SCM_STRING_LENGTH (str)); + + if (SCM_UNBNDP (start)) + { + substr = str; + offset = 0; + } + else + { + substr = scm_substring (str, start, SCM_UNDEFINED); + offset = scm_to_int (start); + } + if (SCM_UNBNDP (flags)) flags = SCM_INUM0; - SCM_VALIDATE_INUM (4,flags); - SCM_STRING_COERCE_0TERMINATION_X (str); /* re_nsub doesn't account for the `subexpression' representing the whole regexp, so add 1 to nmatches. */ + c_str = scm_to_locale_string (substr); + nmatches = SCM_RGX(rx)->re_nsub + 1; - SCM_DEFER_INTS; - matches = SCM_MUST_MALLOC_TYPE_NUM (regmatch_t,nmatches); - status = regexec (SCM_RGX (rx), SCM_STRING_CHARS (str) + offset, - nmatches, matches, - SCM_INUM (flags)); + matches = scm_malloc (sizeof (regmatch_t) * nmatches); + status = regexec (SCM_RGX (rx), c_str, nmatches, matches, + scm_to_int (flags)); + +#ifdef HAVE_WCHAR_H + if (!status) + fixup_multibyte_match (matches, nmatches, c_str); +#endif + + free (c_str); + if (!status) { int i; /* The match vector must include a cell for the string that was matched, so add 1. 
*/ mvec = scm_c_make_vector (nmatches + 1, SCM_UNSPECIFIED); - SCM_VELTS(mvec)[0] = str; + SCM_SIMPLE_VECTOR_SET(mvec,0, str); for (i = 0; i < nmatches; ++i) if (matches[i].rm_so == -1) - SCM_VELTS(mvec)[i+1] = scm_cons (SCM_MAKINUM (-1), SCM_MAKINUM (-1)); + SCM_SIMPLE_VECTOR_SET(mvec, i+1, + scm_cons (scm_from_int (-1), scm_from_int (-1))); else - SCM_VELTS(mvec)[i+1] - = scm_cons(SCM_MAKINUM(matches[i].rm_so + offset), - SCM_MAKINUM(matches[i].rm_eo + offset)); + SCM_SIMPLE_VECTOR_SET(mvec, i+1, + scm_cons (scm_from_long (matches[i].rm_so + offset), + scm_from_long (matches[i].rm_eo + offset))); } - scm_must_free ((char *) matches); - SCM_ALLOW_INTS; + free (matches); if (status != 0 && status != REG_NOMATCH) - scm_error (scm_regexp_error_key, - FUNC_NAME, - scm_regexp_error_msg (status, SCM_RGX (rx)), - SCM_BOOL_F, - SCM_BOOL_F); + scm_error_scm (scm_regexp_error_key, + scm_from_locale_string (FUNC_NAME), + scm_regexp_error_msg (status, SCM_RGX (rx)), + SCM_BOOL_F, SCM_BOOL_F); return mvec; } #undef FUNC_NAME @@ -301,18 +329,16 @@ scm_init_regex_posix () scm_set_smob_free (scm_tc16_regex, regex_free); /* Compilation flags. */ - scm_c_define ("regexp/basic", scm_long2num (REG_BASIC)); - scm_c_define ("regexp/extended", scm_long2num (REG_EXTENDED)); - scm_c_define ("regexp/icase", scm_long2num (REG_ICASE)); - scm_c_define ("regexp/newline", scm_long2num (REG_NEWLINE)); + scm_c_define ("regexp/basic", scm_from_int (REG_BASIC)); + scm_c_define ("regexp/extended", scm_from_int (REG_EXTENDED)); + scm_c_define ("regexp/icase", scm_from_int (REG_ICASE)); + scm_c_define ("regexp/newline", scm_from_int (REG_NEWLINE)); /* Execution flags. 
*/ - scm_c_define ("regexp/notbol", scm_long2num (REG_NOTBOL)); - scm_c_define ("regexp/noteol", scm_long2num (REG_NOTEOL)); + scm_c_define ("regexp/notbol", scm_from_int (REG_NOTBOL)); + scm_c_define ("regexp/noteol", scm_from_int (REG_NOTEOL)); -#ifndef SCM_MAGIC_SNARFER #include "libguile/regex-posix.x" -#endif scm_add_feature ("regex"); }