| 1 | /* Copyright (C) 1997, 1998, 1999, 2000 Free Software Foundation, Inc. |
| 2 | * |
| 3 | * This program is free software; you can redistribute it and/or modify |
| 4 | * it under the terms of the GNU General Public License as published by |
| 5 | * the Free Software Foundation; either version 2, or (at your option) |
| 6 | * any later version. |
| 7 | * |
| 8 | * This program is distributed in the hope that it will be useful, |
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | * GNU General Public License for more details. |
| 12 | * |
| 13 | * You should have received a copy of the GNU General Public License |
| 14 | * along with this software; see the file COPYING. If not, write to |
| 15 | * the Free Software Foundation, Inc., 59 Temple Place, Suite 330, |
| 16 | * Boston, MA 02111-1307 USA |
| 17 | * |
| 18 | * As a special exception, the Free Software Foundation gives permission |
| 19 | * for additional uses of the text contained in its release of GUILE. |
| 20 | * |
| 21 | * The exception is that, if you link the GUILE library with other files |
| 22 | * to produce an executable, this does not by itself cause the |
| 23 | * resulting executable to be covered by the GNU General Public License. |
| 24 | * Your use of that executable is in no way restricted on account of |
| 25 | * linking the GUILE library code into it. |
| 26 | * |
| 27 | * This exception does not however invalidate any other reasons why |
| 28 | * the executable file might be covered by the GNU General Public License. |
| 29 | * |
| 30 | * This exception applies only to the code released by the |
| 31 | * Free Software Foundation under the name GUILE. If you copy |
| 32 | * code from other Free Software Foundation releases into a copy of |
| 33 | * GUILE, as the General Public License permits, the exception does |
| 34 | * not apply to the code that you add in this way. To avoid misleading |
| 35 | * anyone as to the status of such modified files, you must delete |
| 36 | * this exception notice from them. |
| 37 | * |
| 38 | * If you write modifications of your own for GUILE, it is your choice |
| 39 | * whether to permit this exception to apply to your modifications. |
| 40 | * If you do not wish that, delete this exception notice. |
| 41 | */ |
| 42 | |
| 43 | /* Software engineering face-lift by Greg J. Badros, 11-Dec-1999, |
| 44 | gjb@cs.washington.edu, http://www.cs.washington.edu/homes/gjb */ |
| 45 | |
| 46 | \f |
| 47 | |
| 48 | /* regex-posix.c -- POSIX regular expression support. |
| 49 | |
| 50 | This code was written against Henry Spencer's famous regex package. |
| 51 | The principal reference for POSIX behavior was the man page for this |
| 52 | library, not the 1003.2 document itself. Ergo, other `POSIX' |
| 53 | libraries which do not agree with the Spencer implementation may |
| 54 | produce varying behavior. Sigh. */ |
| 55 | |
| 56 | #include <stdio.h> |
| 57 | #include <sys/types.h> |
| 58 | |
| 59 | #include "libguile/_scm.h" |
| 60 | |
| 61 | /* Supposedly, this file is never compiled unless we know we have |
| 62 | POSIX regular expressions. But we still put this in an #ifdef so |
| 63 | the file is CPP'able (for dependency scanning) even on systems that |
| 64 | don't have a <regex.h> header. */ |
| 65 | #ifdef HAVE_REGCOMP |
| 66 | #ifdef HAVE_REGEX_H |
| 67 | #include <regex.h> |
| 68 | #else |
| 69 | #ifdef HAVE_RXPOSIX_H |
| 70 | #include <rxposix.h> /* GNU Rx library */ |
| 71 | #else |
| 72 | #ifdef HAVE_RX_RXPOSIX_H |
| 73 | #include <rx/rxposix.h> /* GNU Rx library on Linux */ |
| 74 | #endif |
| 75 | #endif |
| 76 | #endif |
| 77 | #endif |
| 78 | |
| 79 | #include "libguile/smob.h" |
| 80 | #include "libguile/symbols.h" |
| 81 | #include "libguile/vectors.h" |
| 82 | #include "libguile/strports.h" |
| 83 | #include "libguile/ports.h" |
| 84 | #include "libguile/feature.h" |
| 85 | #include "libguile/strings.h" |
| 86 | |
| 87 | #include "libguile/validate.h" |
| 88 | #include "libguile/regex-posix.h" |
| 89 | |
| 90 | /* This is defined by some regex libraries and omitted by others. */ |
| 91 | #ifndef REG_BASIC |
| 92 | #define REG_BASIC 0 |
| 93 | #endif |
| 94 | |
| 95 | long scm_tc16_regex; |
| 96 | |
| 97 | static scm_sizet |
| 98 | free_regex (SCM obj) |
| 99 | { |
| 100 | regfree (SCM_RGX (obj)); |
| 101 | free (SCM_RGX (obj)); |
| 102 | return sizeof(regex_t); |
| 103 | } |
| 104 | |
| 105 | \f |
| 106 | |
| 107 | SCM_SYMBOL (scm_regexp_error_key, "regular-expression-syntax"); |
| 108 | |
| 109 | static char * |
| 110 | scm_regexp_error_msg (int regerrno, regex_t *rx) |
| 111 | { |
| 112 | SCM errmsg; |
| 113 | int l; |
| 114 | |
| 115 | /* FIXME: must we wrap any external calls in SCM_DEFER_INTS...SCM_ALLOW_INTS? |
| 116 | Or are these only necessary when a SCM object may be left in an |
| 117 | undetermined state (half-formed)? If the latter then I believe we |
| 118 | may do without the critical section code. -twp */ |
| 119 | |
| 120 | /* We could simply make errmsg a char pointer, and allocate space with |
| 121 | malloc. But since we are about to pass the pointer to scm_error, which |
| 122 | never returns, we would never have the opportunity to free it. Creating |
| 123 | it as a SCM object means that the system will GC it at some point. */ |
| 124 | |
| 125 | errmsg = scm_make_string (SCM_MAKINUM (80), SCM_UNDEFINED); |
| 126 | SCM_DEFER_INTS; |
| 127 | l = regerror (regerrno, rx, SCM_CHARS (errmsg), 80); |
| 128 | if (l > 80) |
| 129 | { |
| 130 | errmsg = scm_make_string (SCM_MAKINUM (l), SCM_UNDEFINED); |
| 131 | regerror (regerrno, rx, SCM_CHARS (errmsg), l); |
| 132 | } |
| 133 | SCM_ALLOW_INTS; |
| 134 | return SCM_CHARS (errmsg); |
| 135 | } |
| 136 | |
| 137 | SCM_DEFINE (scm_regexp_p, "regexp?", 1, 0, 0, |
| 138 | (SCM x), |
| 139 | "Return @code{#t} if @var{obj} is a compiled regular expression, or\n" |
| 140 | "@code{#f} otherwise.") |
| 141 | #define FUNC_NAME s_scm_regexp_p |
| 142 | { |
| 143 | return SCM_BOOL(SCM_RGXP (x)); |
| 144 | } |
| 145 | #undef FUNC_NAME |
| 146 | |
| 147 | SCM_DEFINE (scm_make_regexp, "make-regexp", 1, 0, 1, |
| 148 | (SCM pat, SCM flags), |
| 149 | "Compile the regular expression described by @var{str}, and return the\n" |
| 150 | "compiled regexp structure. If @var{str} does not describe a legal\n" |
| 151 | "regular expression, @code{make-regexp} throws a\n" |
| 152 | "@code{regular-expression-syntax} error.\n\n" |
| 153 | "The @var{flag} arguments change the behavior of the compiled regexp.\n" |
| 154 | "The following flags may be supplied:\n\n" |
| 155 | "@table @code\n" |
| 156 | "@item regexp/icase\n" |
| 157 | "Consider uppercase and lowercase letters to be the same when matching.\n\n" |
| 158 | "@item regexp/newline\n" |
| 159 | "If a newline appears in the target string, then permit the @samp{^} and\n" |
| 160 | "@samp{$} operators to match immediately after or immediately before the\n" |
| 161 | "newline, respectively. Also, the @samp{.} and @samp{[^...]} operators\n" |
| 162 | "will never match a newline character. The intent of this flag is to\n" |
| 163 | "treat the target string as a buffer containing many lines of text, and\n" |
| 164 | "the regular expression as a pattern that may match a single one of those\n" |
| 165 | "lines.\n\n" |
| 166 | "@item regexp/basic\n" |
| 167 | "Compile a basic (``obsolete'') regexp instead of the extended\n" |
| 168 | "(``modern'') regexps that are the default. Basic regexps do not\n" |
| 169 | "consider @samp{|}, @samp{+} or @samp{?} to be special characters, and\n" |
| 170 | "require the @samp{@{...@}} and @samp{(...)} metacharacters to be\n" |
| 171 | "backslash-escaped (@pxref{Backslash Escapes}). There are several other\n" |
| 172 | "differences between basic and extended regular expressions, but these\n" |
| 173 | "are the most significant.\n\n" |
| 174 | "@item regexp/extended\n" |
| 175 | "Compile an extended regular expression rather than a basic regexp. This\n" |
| 176 | "is the default behavior; this flag will not usually be needed. If a\n" |
| 177 | "call to @code{make-regexp} includes both @code{regexp/basic} and\n" |
| 178 | "@code{regexp/extended} flags, the one which comes last will override\n" |
| 179 | "the earlier one.\n" |
| 180 | "@end table\n") |
| 181 | #define FUNC_NAME s_scm_make_regexp |
| 182 | { |
| 183 | SCM flag; |
| 184 | regex_t *rx; |
| 185 | int status, cflags; |
| 186 | |
| 187 | SCM_VALIDATE_ROSTRING (1,pat); |
| 188 | SCM_VALIDATE_REST_ARGUMENT (flags); |
| 189 | SCM_COERCE_SUBSTR (pat); |
| 190 | |
| 191 | /* Examine list of regexp flags. If REG_BASIC is supplied, then |
| 192 | turn off REG_EXTENDED flag (on by default). */ |
| 193 | cflags = REG_EXTENDED; |
| 194 | flag = flags; |
| 195 | while (!SCM_NULLP (flag)) |
| 196 | { |
| 197 | if (SCM_INUM (SCM_CAR (flag)) == REG_BASIC) |
| 198 | cflags &= ~REG_EXTENDED; |
| 199 | else |
| 200 | cflags |= SCM_INUM (SCM_CAR (flag)); |
| 201 | flag = SCM_CDR (flag); |
| 202 | } |
| 203 | |
| 204 | rx = SCM_MUST_MALLOC_TYPE (regex_t); |
| 205 | status = regcomp (rx, SCM_ROCHARS (pat), |
| 206 | /* Make sure they're not passing REG_NOSUB; |
| 207 | regexp-exec assumes we're getting match data. */ |
| 208 | cflags & ~REG_NOSUB); |
| 209 | if (status != 0) |
| 210 | { |
| 211 | scm_error (scm_regexp_error_key, |
| 212 | FUNC_NAME, |
| 213 | scm_regexp_error_msg (status, rx), |
| 214 | SCM_BOOL_F, |
| 215 | SCM_BOOL_F); |
| 216 | /* never returns */ |
| 217 | } |
| 218 | SCM_RETURN_NEWSMOB (scm_tc16_regex, rx); |
| 219 | } |
| 220 | #undef FUNC_NAME |
| 221 | |
| 222 | SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0, |
| 223 | (SCM rx, SCM str, SCM start, SCM flags), |
| 224 | "Match the compiled regular expression @var{regexp} against @code{str}.\n" |
| 225 | "If the optional integer @var{start} argument is provided, begin matching\n" |
| 226 | "from that position in the string. Return a match structure describing\n" |
| 227 | "the results of the match, or @code{#f} if no match could be found.") |
| 228 | #define FUNC_NAME s_scm_regexp_exec |
| 229 | { |
| 230 | int status, nmatches, offset; |
| 231 | regmatch_t *matches; |
| 232 | SCM mvec = SCM_BOOL_F; |
| 233 | |
| 234 | SCM_VALIDATE_RGXP (1,rx); |
| 235 | SCM_VALIDATE_ROSTRING (2,str); |
| 236 | SCM_VALIDATE_INUM_DEF_COPY (3,start,0,offset); |
| 237 | SCM_ASSERT_RANGE (3,start,offset >= 0 && (unsigned) offset <= SCM_LENGTH (str)); |
| 238 | if (SCM_UNBNDP (flags)) |
| 239 | flags = SCM_INUM0; |
| 240 | SCM_VALIDATE_INUM (4,flags); |
| 241 | SCM_COERCE_SUBSTR (str); |
| 242 | |
| 243 | /* re_nsub doesn't account for the `subexpression' representing the |
| 244 | whole regexp, so add 1 to nmatches. */ |
| 245 | |
| 246 | nmatches = SCM_RGX(rx)->re_nsub + 1; |
| 247 | SCM_DEFER_INTS; |
| 248 | matches = SCM_MUST_MALLOC_TYPE_NUM (regmatch_t,nmatches); |
| 249 | status = regexec (SCM_RGX (rx), SCM_ROCHARS (str) + offset, |
| 250 | nmatches, matches, |
| 251 | SCM_INUM (flags)); |
| 252 | if (!status) |
| 253 | { |
| 254 | int i; |
| 255 | /* The match vector must include a cell for the string that was matched, |
| 256 | so add 1. */ |
| 257 | mvec = scm_make_vector (SCM_MAKINUM (nmatches + 1), SCM_UNSPECIFIED); |
| 258 | SCM_VELTS(mvec)[0] = str; |
| 259 | for (i = 0; i < nmatches; ++i) |
| 260 | if (matches[i].rm_so == -1) |
| 261 | SCM_VELTS(mvec)[i+1] = scm_cons (SCM_MAKINUM (-1), SCM_MAKINUM (-1)); |
| 262 | else |
| 263 | SCM_VELTS(mvec)[i+1] |
| 264 | = scm_cons(SCM_MAKINUM(matches[i].rm_so + offset), |
| 265 | SCM_MAKINUM(matches[i].rm_eo + offset)); |
| 266 | } |
| 267 | scm_must_free ((char *) matches); |
| 268 | SCM_ALLOW_INTS; |
| 269 | |
| 270 | if (status != 0 && status != REG_NOMATCH) |
| 271 | scm_error (scm_regexp_error_key, |
| 272 | FUNC_NAME, |
| 273 | scm_regexp_error_msg (status, SCM_RGX (rx)), |
| 274 | SCM_BOOL_F, |
| 275 | SCM_BOOL_F); |
| 276 | return mvec; |
| 277 | } |
| 278 | #undef FUNC_NAME |
| 279 | |
| 280 | void |
| 281 | scm_init_regex_posix () |
| 282 | { |
| 283 | scm_tc16_regex = scm_make_smob_type_mfpe ("regexp", sizeof (regex_t), |
| 284 | NULL, free_regex, NULL, NULL); |
| 285 | |
| 286 | /* Compilation flags. */ |
| 287 | scm_sysintern ("regexp/basic", scm_long2num (REG_BASIC)); |
| 288 | scm_sysintern ("regexp/extended", scm_long2num (REG_EXTENDED)); |
| 289 | scm_sysintern ("regexp/icase", scm_long2num (REG_ICASE)); |
| 290 | scm_sysintern ("regexp/newline", scm_long2num (REG_NEWLINE)); |
| 291 | |
| 292 | /* Execution flags. */ |
| 293 | scm_sysintern ("regexp/notbol", scm_long2num (REG_NOTBOL)); |
| 294 | scm_sysintern ("regexp/noteol", scm_long2num (REG_NOTEOL)); |
| 295 | |
| 296 | #include "libguile/regex-posix.x" |
| 297 | |
| 298 | scm_add_feature ("regex"); |
| 299 | } |
| 300 | |
| 301 | /* |
| 302 | Local Variables: |
| 303 | c-file-style: "gnu" |
| 304 | End: |
| 305 | */ |