-/* Copyright (C) 1997 Free Software Foundation, Inc.
+/* Copyright (C) 1997, 1998, 1999 Free Software Foundation, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
*
* You should have received a copy of the GNU General Public License
* along with this software; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
*
* As a special exception, the Free Software Foundation gives permission
* for additional uses of the text contained in its release of GUILE.
* whether to permit this exception to apply to your modifications.
* If you do not wish that, delete this exception notice.
*/
+
+/* Software engineering face-lift by Greg J. Badros, 11-Dec-1999,
+ gjb@cs.washington.edu, http://www.cs.washington.edu/homes/gjb */
+
\f
/* regex-posix.c -- POSIX regular expression support.
the file is CPP'able (for dependency scanning) even on systems that
don't have a <regex.h> header. */
#ifdef HAVE_REGCOMP
+#ifdef HAVE_REGEX_H
#include <regex.h>
-#endif
+#else
+#ifdef HAVE_RXPOSIX_H
+#include <rxposix.h> /* GNU Rx library */
+#else
+#ifdef HAVE_RX_RXPOSIX_H
+#include <rx/rxposix.h> /* GNU Rx library on Linux */
+#endif
+#endif
+#endif
+#endif
#include "smob.h"
#include "symbols.h"
#include "ports.h"
#include "feature.h"
+#include "scm_validate.h"
#include "regex-posix.h"
-long scm_tc16_regex_t;
+/* This is defined by some regex libraries and omitted by others. */
+#ifndef REG_BASIC
+#define REG_BASIC 0
+#endif
+
+long scm_tc16_regex;
-static size_t
-scm_free_regex_t (obj)
- SCM obj;
+/* Free function for regexp smobs; it is handed to
+ scm_make_smob_type_mfpe in scm_init_regex_posix. Releases the
+ compiled pattern state with regfree, frees the regex_t storage
+ itself, and returns sizeof (regex_t).
+ NOTE(review): the return value is presumably the number of bytes
+ released, for the collector's bookkeeping -- confirm against the
+ smob API. */
+static scm_sizet
+free_regex (SCM obj)
{
regfree (SCM_RGX (obj));
free (SCM_RGX (obj));
- return 0;
-}
-
-static int
-scm_print_regex_t (obj, port, pstate)
- SCM obj;
- SCM port;
- scm_print_state *pstate;
-{
- regex_t *r;
- r = SCM_RGX (obj);
- scm_gen_puts (scm_regular_string, "#<rgx ", port);
- scm_intprint (obj, 16, port);
- scm_gen_puts (scm_regular_string, ">", port);
- return 1;
+ return sizeof(regex_t);
}
-
-static scm_smobfuns regex_t_smob =
-{ scm_mark0, scm_free_regex_t, scm_print_regex_t, 0 };
\f
SCM_SYMBOL (scm_regexp_error_key, "regular-expression-syntax");
-char *
-scm_regexp_error_msg (regerrno, rx)
- int regerrno;
- SCM rx;
+/* Return the regerror text for error code REGERRNO, as reported for
+ the regex_t RX. The text is written into a newly made Scheme string
+ of 80 characters; if regerror reports that it needs more than 80, a
+ string of that size is made and regerror is called again. The value
+ returned is the character data of that string.
+ NOTE(review): ERRMSG is referenced only from this function's locals,
+ so the lifetime of the returned pointer appears to depend on the
+ garbage collector -- confirm that callers consume it immediately. */
+static char *
+scm_regexp_error_msg (int regerrno, regex_t *rx)
{
SCM errmsg;
int l;
errmsg = scm_make_string (SCM_MAKINUM (80), SCM_UNDEFINED);
SCM_DEFER_INTS;
- l = regerror (regerrno, SCM_RGX (rx), SCM_CHARS (errmsg), 80);
+ l = regerror (regerrno, rx, SCM_CHARS (errmsg), 80);
if (l > 80)
{
+ /* First buffer was too small: retry with the size regerror reported. */
errmsg = scm_make_string (SCM_MAKINUM (l), SCM_UNDEFINED);
- regerror (regerrno, SCM_RGX (rx), SCM_CHARS (errmsg), l);
+ regerror (regerrno, rx, SCM_CHARS (errmsg), l);
}
SCM_ALLOW_INTS;
return SCM_CHARS (errmsg);
}
-SCM_PROC (s_regexp_p, "regexp?", 1, 0, 0, scm_regexp_p);
-
-SCM
-scm_regexp_p (x)
- SCM x;
+SCM_DEFINE (scm_regexp_p, "regexp?", 1, 0, 0,
+ (SCM x),
+"Return @code{#t} if @var{x} is a compiled regular expression, or
+@code{#f} otherwise.")
+#define FUNC_NAME s_scm_regexp_p
{
- return (SCM_NIMP (x) && SCM_RGXP (x) ? SCM_BOOL_T : SCM_BOOL_F);
+ /* NOTE(review): the old code guarded SCM_RGXP with SCM_NIMP (x);
+ presumably SCM_RGXP now performs that check itself -- confirm. */
+ return SCM_BOOL(SCM_RGXP (x));
}
-
-/* FIXME: make-regexp should support flags like
- * REG_BASIC and REG_ICASE. Maybe these could be optional symbols
- * in the command args: e.g.:
- * (make-regexp "foo.*bar" 'basic
- * 'ignore-case
- * 'multi-line)
- */
-
-SCM_PROC (s_make_regexp, "make-regexp", 1, 0, 0, scm_make_regexp);
-
-SCM
-scm_make_regexp (pat)
- SCM pat;
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_make_regexp, "make-regexp", 1, 0, 1,
+ (SCM pat, SCM flags),
+"Compile the regular expression described by @var{pat}, and return the
+compiled regexp structure. If @var{pat} does not describe a legal
+regular expression, @code{make-regexp} throws a
+@code{regular-expression-syntax} error.
+
+The @var{flags} arguments change the behavior of the compiled regexp.
+The following flags may be supplied:
+
+@table @code
+@item regexp/icase
+Consider uppercase and lowercase letters to be the same when matching.
+
+@item regexp/newline
+If a newline appears in the target string, then permit the @samp{^} and
+@samp{$} operators to match immediately after or immediately before the
+newline, respectively. Also, the @samp{.} and @samp{[^...]} operators
+will never match a newline character. The intent of this flag is to
+treat the target string as a buffer containing many lines of text, and
+the regular expression as a pattern that may match a single one of those
+lines.
+
+@item regexp/basic
+Compile a basic (``obsolete'') regexp instead of the extended
+(``modern'') regexps that are the default. Basic regexps do not
+consider @samp{|}, @samp{+} or @samp{?} to be special characters, and
+require the @samp{@{...@}} and @samp{(...)} metacharacters to be
+backslash-escaped (@pxref{Backslash Escapes}). There are several other
+differences between basic and extended regular expressions, but these
+are the most significant.
+
+@item regexp/extended
+Compile an extended regular expression rather than a basic regexp. This
+is the default behavior; this flag will not usually be needed. If a
+call to @code{make-regexp} includes both @code{regexp/basic} and
+@code{regexp/extended} flags, the one which comes last will override
+the earlier one.
+@end table
+")
+#define FUNC_NAME s_scm_make_regexp
{
- SCM result;
+ SCM flag;
regex_t *rx;
- int status;
+ int status, cflags;
- SCM_ASSERT (SCM_NIMP(pat) && SCM_ROSTRINGP(pat), pat, SCM_ARG1,
- s_make_regexp);
+ SCM_VALIDATE_ROSTRING (1,pat);
SCM_COERCE_SUBSTR (pat);
- SCM_DEFER_INTS;
- rx = (regex_t *) scm_must_malloc (sizeof (regex_t), s_make_regexp);
- status = regcomp (rx, SCM_ROCHARS (pat), REG_EXTENDED);
+ /* Examine list of regexp flags. If REG_BASIC is supplied, then
+ turn off REG_EXTENDED flag (on by default).
+ NOTE(review): the list elements are not type-checked before
+ SCM_INUM is applied to them. */
+ cflags = REG_EXTENDED;
+ flag = flags;
+ while (SCM_NNULLP (flag))
+ {
+ if (SCM_INUM (SCM_CAR (flag)) == REG_BASIC)
+ cflags &= ~REG_EXTENDED;
+ else
+ cflags |= SCM_INUM (SCM_CAR (flag));
+ flag = SCM_CDR (flag);
+ }
+
+ rx = SCM_MUST_MALLOC_TYPE(regex_t);
+ status = regcomp (rx, SCM_ROCHARS (pat),
+ /* Make sure they're not passing REG_NOSUB;
+ regexp-exec assumes we're getting match data. */
+ cflags & ~REG_NOSUB);
if (status != 0)
{
- SCM_ALLOW_INTS;
+ /* Fetch the message while RX is still allocated, then release RX:
+ scm_error never returns, so nothing later could free it. regfree
+ is not called because the compilation did not succeed. */
+ char *errmsg = scm_regexp_error_msg (status, rx);
+ scm_must_free ((char *) rx);
scm_error (scm_regexp_error_key,
- s_make_regexp,
+ FUNC_NAME,
- scm_regexp_error_msg (status, rx),
+ errmsg,
SCM_BOOL_F,
SCM_BOOL_F);
/* never returns */
}
- SCM_NEWCELL (result);
- SCM_SETCAR (result, scm_tc16_regex_t);
- SCM_SETCDR (result, rx);
- SCM_ALLOW_INTS;
- return result;
+ SCM_RETURN_NEWSMOB (scm_tc16_regex, rx);
}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0,
+ (SCM rx, SCM str, SCM start, SCM flags),
+"Match the compiled regular expression @var{rx} against @var{str}.
+If the optional integer @var{start} argument is provided, begin matching
+from that position in the string. Return a match structure describing
+the results of the match, or @code{#f} if no match could be found.
+
+The optional @var{flags} argument is an integer that is passed to the
+matcher as its execution flags. The following flags may be supplied:
+
+@table @code
+@item regexp/notbol
+Do not treat the beginning of the string as the beginning of a line.
+
+@item regexp/noteol
+Do not treat the end of the string as the end of a line.
+@end table
+")
+#define FUNC_NAME s_scm_regexp_exec
{
- int status, nmatches;
+ int status, nmatches, offset;
regmatch_t *matches;
SCM mvec = SCM_BOOL_F;
- SCM_ASSERT (SCM_NIMP (rx) && SCM_RGXP (rx), rx, SCM_ARG1, s_regexp_exec);
- SCM_ASSERT (SCM_NIMP (str) && SCM_ROSTRINGP (str), str, SCM_ARG2,
- s_regexp_exec);
+ SCM_VALIDATE_RGXP (1,rx);
+ SCM_VALIDATE_ROSTRING (2,str);
+ SCM_VALIDATE_INUM_DEF_COPY (3,start,0,offset);
+ SCM_ASSERT_RANGE (3,start,offset >= 0 && (unsigned) offset <= SCM_LENGTH (str));
+ if (SCM_UNBNDP (flags))
+ flags = SCM_INUM0;
+ SCM_VALIDATE_INUM (4,flags);
SCM_COERCE_SUBSTR (str);
/* re_nsub doesn't account for the `subexpression' representing the
whole regexp, so add 1 to nmatches. */
nmatches = SCM_RGX(rx)->re_nsub + 1;
SCM_DEFER_INTS;
- matches = (regmatch_t *) scm_must_malloc (sizeof (regmatch_t) * nmatches,
- s_regexp_exec);
- status = regexec (SCM_RGX (rx), SCM_ROCHARS (str), nmatches, matches, 0);
+ matches = SCM_MUST_MALLOC_TYPE_NUM (regmatch_t,nmatches);
+ status = regexec (SCM_RGX (rx), SCM_ROCHARS (str) + offset,
+ nmatches, matches,
+ SCM_INUM (flags));
if (!status)
{
int i;
/* The match vector must include a cell for the string that was matched,
so add 1. */
- mvec = scm_make_vector (SCM_MAKINUM (nmatches + 1), SCM_UNSPECIFIED,
- SCM_UNDEFINED);
+ mvec = scm_make_vector (SCM_MAKINUM (nmatches + 1), SCM_UNSPECIFIED);
SCM_VELTS(mvec)[0] = str;
for (i = 0; i < nmatches; ++i)
- SCM_VELTS(mvec)[i+1] = scm_cons (SCM_MAKINUM (matches[i].rm_so),
- SCM_MAKINUM (matches[i].rm_eo));
+ /* An rm_so of -1 is stored as (-1 . -1). Other offsets were
+ measured from STR + OFFSET, so OFFSET is added back to make
+ them positions in STR. */
+ if (matches[i].rm_so == -1)
+ SCM_VELTS(mvec)[i+1] = scm_cons (SCM_MAKINUM (-1), SCM_MAKINUM (-1));
+ else
+ SCM_VELTS(mvec)[i+1]
+ = scm_cons(SCM_MAKINUM(matches[i].rm_so + offset),
+ SCM_MAKINUM(matches[i].rm_eo + offset));
}
+ scm_must_free ((char *) matches);
SCM_ALLOW_INTS;
if (status != 0 && status != REG_NOMATCH)
scm_error (scm_regexp_error_key,
- s_regexp_exec,
- scm_regexp_error_msg (status),
+ FUNC_NAME,
+ scm_regexp_error_msg (status, SCM_RGX (rx)),
SCM_BOOL_F,
SCM_BOOL_F);
return mvec;
}
+#undef FUNC_NAME
void
scm_init_regex_posix ()
{
- scm_tc16_regex_t = scm_newsmob (&regex_t_smob);
+ scm_tc16_regex = scm_make_smob_type_mfpe ("regexp", sizeof (regex_t),
+ NULL, free_regex, NULL, NULL);
+
+ /* Compilation flags. */
+ scm_sysintern ("regexp/basic", scm_long2num (REG_BASIC));
+ scm_sysintern ("regexp/extended", scm_long2num (REG_EXTENDED));
+ scm_sysintern ("regexp/icase", scm_long2num (REG_ICASE));
+ scm_sysintern ("regexp/newline", scm_long2num (REG_NEWLINE));
+
+ /* Execution flags. */
+ scm_sysintern ("regexp/notbol", scm_long2num (REG_NOTBOL));
+ scm_sysintern ("regexp/noteol", scm_long2num (REG_NOTEOL));
+
#include "regex-posix.x"
scm_add_feature ("regex");