X-Git-Url: https://git.hcoop.net/bpt/guile.git/blobdiff_plain/e610dc3851da716e6ee4568f94f5f7cace84d2d9..4fa65b903bd0ad5ed62dca92df71325c0a110809:/libguile/regex-posix.c diff --git a/libguile/regex-posix.c b/libguile/regex-posix.c index d280c82b6..bec0f89fb 100644 --- a/libguile/regex-posix.c +++ b/libguile/regex-posix.c @@ -1,18 +1,19 @@ -/* Copyright (C) 1997, 1998, 1999, 2000, 2001, 2004, 2006, 2007 Free Software Foundation, Inc. +/* Copyright (C) 1997, 1998, 1999, 2000, 2001, 2004, 2006, 2007, 2010, 2011, 2012 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 3 of + * the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA */ @@ -26,7 +27,7 @@ libraries which do not agree with the Spencer implementation may produce varying behavior. Sigh. 
*/ -#if HAVE_CONFIG_H +#ifdef HAVE_CONFIG_H # include <config.h> #endif @@ -34,22 +35,10 @@ #include "libguile/_scm.h" -/* Supposedly, this file is never compiled unless we know we have - POSIX regular expressions. But we still put this in an #ifdef so - the file is CPP'able (for dependency scanning) even on systems that - don't have a <regex.h> header. */ -#ifdef HAVE_REGCOMP -#ifdef HAVE_REGEX_H #include <regex.h> -#else -#ifdef HAVE_RXPOSIX_H -#include <rxposix.h> /* GNU Rx library */ -#else -#ifdef HAVE_RX_RXPOSIX_H -#include <rx/rxposix.h> /* GNU Rx library on Linux */ -#endif -#endif -#endif + +#ifdef HAVE_WCHAR_H +#include <wchar.h> #endif #include "libguile/async.h" @@ -172,7 +161,7 @@ SCM_DEFINE (scm_make_regexp, "make-regexp", 1, 0, 1, flag = SCM_CDR (flag); } - rx = scm_gc_malloc (sizeof(regex_t), "regex"); + rx = scm_gc_malloc_pointerless (sizeof (regex_t), "regex"); c_pat = scm_to_locale_string (pat); status = regcomp (rx, c_pat, /* Make sure they're not passing REG_NOSUB; @@ -195,6 +184,43 @@ SCM_DEFINE (scm_make_regexp, "make-regexp", 1, 0, 1, } #undef FUNC_NAME +#ifdef HAVE_WCHAR_H +/* + * While regexec does respect the current locale, it returns byte + * offsets instead of character offsets. This routine fixes up the + * regmatch_t structures to refer to characters instead. See "Converting + * a Character" in the libc manual, for more details. + */ +static void +fixup_multibyte_match (regmatch_t *matches, int nmatches, char *str) +{ + mbstate_t state; + int i; + size_t char_idx, byte_idx; + size_t nbytes = 1; /* just to kick off the for loop */ + + memset (&state, '\0', sizeof (state)); + + for (char_idx = byte_idx = 0; nbytes > 0; char_idx++, byte_idx += nbytes) + { + for (i = 0; i < nmatches; ++i) + { + if (matches[i].rm_so == byte_idx) + matches[i].rm_so = char_idx; + if (matches[i].rm_eo == byte_idx) + matches[i].rm_eo = char_idx; + } + + nbytes = mbrlen (str + byte_idx, MB_LEN_MAX, &state); + if (nbytes == (size_t) -2 || nbytes == (size_t) -1) + /* Something is wrong. 
Shouldn't be possible, as the regex match + succeeded. */ + abort (); + } + +} +#endif + SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0, (SCM rx, SCM str, SCM start, SCM flags), "Match the compiled regular expression @var{rx} against\n" @@ -255,11 +281,18 @@ SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0, /* re_nsub doesn't account for the `subexpression' representing the whole regexp, so add 1 to nmatches. */ + c_str = scm_to_locale_string (substr); + nmatches = SCM_RGX(rx)->re_nsub + 1; matches = scm_malloc (sizeof (regmatch_t) * nmatches); - c_str = scm_to_locale_string (substr); status = regexec (SCM_RGX (rx), c_str, nmatches, matches, scm_to_int (flags)); + +#ifdef HAVE_WCHAR_H + if (!status) + fixup_multibyte_match (matches, nmatches, c_str); +#endif + free (c_str); if (!status)