/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
- * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
#include <unistd.h>
#include <unicase.h>
#include <unictype.h>
+#include <c-strcase.h>
+#include <c-ctype.h>
#include "libguile/_scm.h"
#include "libguile/bytevectors.h"
#include "libguile/hashtab.h"
#include "libguile/hash.h"
#include "libguile/ports.h"
+#include "libguile/ports-internal.h"
#include "libguile/fports.h"
#include "libguile/root.h"
#include "libguile/strings.h"
"In strings, consume leading whitespace after an escaped end-of-line."},
{ SCM_OPTION_BOOLEAN, "curly-infix", 0,
"Support SRFI-105 curly infix expressions."},
+ { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
+ "Support R7RS |...| symbol notation."},
{ 0, },
};
unsigned int hungry_eol_escapes_p : 1;
unsigned int curly_infix_p : 1;
unsigned int neoteric_p : 1;
+ unsigned int r7rs_symbols_p : 1;
};
typedef struct t_read_opts scm_t_read_opts;
scm_ungetc_unlocked (c, port);
}
+/* Read either a double-quoted string or an R7RS-style symbol delimited
+ by vertical lines, depending on the value of 'chr' ('"' or '|').
+ Regardless, the result is always returned as a string. */
static SCM
-scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
+scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
#define FUNC_NAME "scm_lreadr"
{
/* For strings smaller than C_STR, this function creates only one Scheme
long line = SCM_LINUM (port);
int column = SCM_COL (port) - 1;
- while ('"' != (c = scm_getc_unlocked (port)))
+ while (chr != (c = scm_getc_unlocked (port)))
{
if (c == EOF)
{
str_eof:
scm_i_input_error (FUNC_NAME, port,
- "end of file in string constant", SCM_EOL);
+ (chr == '|'
+ ? "end of file in symbol"
+ : "end of file in string constant"),
+ SCM_EOL);
}
if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
{
case EOF:
goto str_eof;
- case '"':
+ case '|':
case '\\':
break;
case '\n':
c = '\010';
break;
case 'x':
- if (opts->r6rs_escapes_p)
+ if (opts->r6rs_escapes_p || chr == '|')
SCM_READ_HEX_ESCAPE (10, ';');
else
SCM_READ_HEX_ESCAPE (2, '\0');
break;
}
default:
+ if (c == chr)
+ break;
bad_escaped:
scm_i_input_error (FUNC_NAME, port,
"illegal character in escape sequence: ~S",
}
#undef FUNC_NAME
+static SCM
+scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
+{
+ return scm_read_string_like_syntax (chr, port, opts);
+}
+
+static SCM
+scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
+{
+ return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
+}
static SCM
scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
return SCM_UNSPECIFIED;
}
+/* If the EXPECTED_CHARS are the next ones available from PORT, then
+ consume them and return 1. Otherwise leave the port position where
+ it was and return 0. EXPECTED_CHARS should be all lowercase, and
+ will be matched case-insensitively against the characters read from
+ PORT. */
+static int
+try_read_ci_chars (SCM port, const char *expected_chars)
+{
+ int num_chars_wanted = strlen (expected_chars);
+ int num_chars_read = 0;
+ char *chars_read = alloca (num_chars_wanted);
+ int c;
+
+ while (num_chars_read < num_chars_wanted)
+ {
+ c = scm_getc_unlocked (port);
+ if (c == EOF)
+ break;
+ else if (c_tolower (c) != expected_chars[num_chars_read])
+ {
+ scm_ungetc_unlocked (c, port);
+ break;
+ }
+ else
+ chars_read[num_chars_read++] = c;
+ }
+
+ if (num_chars_read == num_chars_wanted)
+ return 1;
+ else
+ {
+ while (num_chars_read > 0)
+ scm_ungetc_unlocked (chars_read[--num_chars_read], port);
+ return 0;
+ }
+}
+
\f
/* Sharp readers, i.e. readers called after a `#' sign has been read. */
{
case 't':
case 'T':
+ try_read_ci_chars (port, "rue");
return SCM_BOOL_T;
case 'f':
case 'F':
+ try_read_ci_chars (port, "alse");
return SCM_BOOL_F;
}
size_t charname_len, bytes_read;
scm_t_wchar cp;
int overflow;
- scm_t_port *pt;
+ scm_t_port_internal *pti;
overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
&bytes_read);
return (SCM_MAKE_CHAR (chr));
}
- pt = SCM_PTAB_ENTRY (port);
+ pti = SCM_PORT_GET_INTERNAL (port);
/* Simple ASCII characters can be processed immediately. Also, simple
ISO-8859-1 characters can be processed immediately if the encoding for this
port is ISO-8859-1. */
if (bytes_read == 1 &&
((unsigned char) buffer[0] <= 127
- || pt->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
+ || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
{
SCM_COL (port) += 1;
return SCM_MAKE_CHAR (buffer[0]);
while ('0' <= c && c <= '9')
{
+ if (((SSIZE_MAX - (c-'0')) / 10) <= res)
+ scm_i_input_error ("read_decimal_integer", port,
+ "number too large", SCM_EOL);
res = 10*res + c-'0';
got_it = 1;
c = scm_getc_unlocked (port);
c = scm_getc_unlocked (port);
if (c != '3' && c != '6')
{
- if (c != EOF)
- scm_ungetc_unlocked (c, port);
+ if (c == 'a' && try_read_ci_chars (port, "lse"))
+ return SCM_BOOL_F;
+ else if (c != EOF)
+ scm_ungetc_unlocked (c, port);
return SCM_BOOL_F;
}
rank = 1;
return (scm_read_sexp (chr, port, opts));
case '"':
return (scm_read_string (chr, port, opts));
+ case '|':
+ if (opts->r7rs_symbols_p)
+ return scm_read_r7rs_symbol (chr, port, opts);
+ else
+ return scm_read_mixed_case_symbol (chr, port, opts);
case '\'':
case '`':
case ',':
}
}
-#define SCM_ENCODING_SEARCH_SIZE (500)
-
static int
is_encoding_char (char c)
{
return strchr ("_-.:/,+=()", c) != NULL;
}
-/* Search the first few hundred characters of a file for an Emacs-like coding
- declaration. Returns either NULL or a string whose storage has been
- allocated with `scm_gc_malloc ()'. */
+/* Maximum size of an encoding name. This is a bit more than the
+ longest name listed at
+ <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
+ characters.) */
+#define ENCODING_NAME_MAX_SIZE 20
+
+/* Number of bytes at the beginning or end of a file that are scanned
+ for a "coding:" declaration. */
+#define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
+
+
+/* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
+ coding declaration. Returns either NULL or a string whose storage
+ has been allocated with `scm_gc_malloc'. */
char *
scm_i_scan_for_encoding (SCM port)
{
char header[SCM_ENCODING_SEARCH_SIZE+1];
size_t bytes_read, encoding_length, i;
char *encoding = NULL;
- int utf8_bom = 0;
char *pos, *encoding_start;
int in_comment;
scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
}
- if (bytes_read > 3
- && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
- utf8_bom = 1;
-
/* search past "coding[:=]" */
pos = header;
while (1)
if ((pos = strstr(pos, "coding")) == NULL)
return NULL;
- pos += strlen("coding");
- if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
+ pos += strlen ("coding");
+ if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
(*pos == ':' || *pos == '='))
{
pos ++;
}
/* skip spaces */
- while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
+ while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
(*pos == ' ' || *pos == '\t'))
pos ++;
+ if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
+ /* We found the "coding:" string, but there is probably not enough
+ room to store an encoding name in its entirety, so ignore it.
+ This makes sure we do not end up returning a truncated encoding
+ name. */
+ return NULL;
+
/* grab the next token */
encoding_start = pos;
i = 0;
/* This wasn't in a comment */
return NULL;
- if (utf8_bom && strcasecmp (encoding, "UTF-8"))
- scm_misc_error (NULL,
- "the port input declares the encoding ~s but is encoded as UTF-8",
- scm_list_1 (scm_from_locale_string (encoding)));
-
return encoding;
}
return SCM_BOOL_F;
else
{
- s_enc = scm_from_locale_string (enc);
+ s_enc = scm_string_upcase (scm_from_locale_string (enc));
return s_enc;
}
\f
/* Per-port read options.
- We store per-port read options in the 'port-read-options' key of the
- port's alist. The value stored in the alist is a single integer that
- contains a two-bit field for each read option.
+ We store per-port read options in the 'port-read-options' port
+ property, which is stored in the internal port structure. The value
+ stored is a single integer that contains a two-bit field for each
+ read option.
If a bit field contains READ_OPTION_INHERIT (3), that indicates that
the applicable value should be inherited from the corresponding
read option has been set per-port, its possible values are those in
'enum t_keyword_style'. */
-/* Key to read options in per-port alists. */
+/* Key to read options in port properties. */
SCM_SYMBOL (sym_port_read_options, "port-read-options");
/* Offsets of bit fields for each per-port override */
#define READ_OPTION_SQUARE_BRACKETS_P 10
#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
#define READ_OPTION_CURLY_INFIX_P 14
+#define READ_OPTION_R7RS_SYMBOLS_P 16
/* The total width in bits of the per-port overrides */
-#define READ_OPTIONS_NUM_BITS 16
+#define READ_OPTIONS_NUM_BITS 18
#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
unsigned int read_options;
new_value &= READ_OPTION_MASK;
- scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
- sym_port_read_options);
+
+ scm_dynwind_begin (0);
+ scm_dynwind_lock_port (port);
+
+ scm_read_options = scm_i_port_property (port, sym_port_read_options);
if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
read_options = scm_to_uint (scm_read_options);
else
read_options &= ~(READ_OPTION_MASK << option);
read_options |= new_value << option;
scm_read_options = scm_from_uint (read_options);
- SCM_PTAB_ENTRY(port)->alist = scm_assq_set_x (SCM_PTAB_ENTRY(port)->alist,
- sym_port_read_options,
- scm_read_options);
+ scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
+
+ scm_dynwind_end ();
}
/* Set OPTS and PORT's case-insensitivity according to VALUE. */
SCM val, scm_read_options;
unsigned int read_options, x;
- scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
- sym_port_read_options);
+ scm_read_options = scm_i_port_property (port, sym_port_read_options);
if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
read_options = scm_to_uint (scm_read_options);
RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
+ RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
#undef RESOLVE_BOOLEAN_OPTION