-/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008 Free Software
+/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
* Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 3 of
+ * the License, or (at your option) any later version.
*
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
+#include <unistd.h>
+#include <unicase.h>
#include "libguile/_scm.h"
+#include "libguile/bytevectors.h"
#include "libguile/chars.h"
#include "libguile/eval.h"
-#include "libguile/unif.h"
+#include "libguile/arrays.h"
+#include "libguile/bitvectors.h"
#include "libguile/keywords.h"
#include "libguile/alist.h"
#include "libguile/srcprop.h"
#include "libguile/hashtab.h"
#include "libguile/hash.h"
#include "libguile/ports.h"
+#include "libguile/fports.h"
#include "libguile/root.h"
#include "libguile/strings.h"
#include "libguile/strports.h"
SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
SCM_SYMBOL (scm_keyword_prefix, "prefix");
SCM_SYMBOL (scm_keyword_postfix, "postfix");
+SCM_SYMBOL (sym_nil, "nil");
scm_t_option scm_read_opts[] = {
{ SCM_OPTION_BOOLEAN, "copy", 0,
"Record positions of source code expressions." },
{ SCM_OPTION_BOOLEAN, "case-insensitive", 0,
"Convert symbols to lower case."},
- { SCM_OPTION_SCM, "keywords", SCM_UNPACK (SCM_BOOL_F),
+ { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F,
"Style of keyword recognition: #f, 'prefix or 'postfix."},
-#if SCM_ENABLE_ELISP
- { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
- "Support Elisp vector syntax, namely `[...]'."},
- { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
- "Support `\\(' and `\\)' in strings."},
-#endif
+ { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
+ "Use R6RS variable-length character and string hex escapes."},
+ { SCM_OPTION_BOOLEAN, "square-brackets", 1,
+ "Treat `[' and `]' as parentheses, for R6RS compatibility."},
{ 0, },
};
}
#undef FUNC_NAME
-/* An association list mapping extra hash characters to procedures. */
-static SCM *scm_read_hash_procedures;
+/* A fluid referring to an association list mapping extra hash
+ characters to procedures. */
+static SCM *scm_i_read_hash_procedures;
+static inline SCM
+scm_i_read_hash_procedures_ref (void)
+{
+ return scm_fluid_ref (*scm_i_read_hash_procedures);
+}
+
+static inline void
+scm_i_read_hash_procedures_set_x (SCM value)
+{
+ scm_fluid_set_x (*scm_i_read_hash_procedures, value);
+}
\f
/* Token readers. */
structure''). */
#define CHAR_IS_R5RS_DELIMITER(c) \
(CHAR_IS_BLANK (c) \
- || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
+ || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
+ || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
(((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
|| ((_chr) == 'd') || ((_chr) == 'l'))
-/* An inlinable version of `scm_c_downcase ()'. */
-#define CHAR_DOWNCASE(_chr) \
- (((_chr) <= UCHAR_MAX) ? tolower (_chr) : (_chr))
-
-
/* Read an SCSH block comment. */
-static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
-
-/* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
- zero if the whole token fits in BUF, non-zero otherwise. */
+static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
+static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
+static SCM scm_read_commented_expression (scm_t_wchar, SCM);
+static SCM scm_read_shebang (scm_t_wchar, SCM);
+static SCM scm_get_hash_procedure (int);
+
+/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
+ result in the pre-allocated buffer BUF. Return zero if the whole token has
+ fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
+ bytes actually read. */
static inline int
-read_token (SCM port, char *buf, size_t buf_size, size_t *read)
-{
- *read = 0;
+read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
+ {
+ *read = 0;
- while (*read < buf_size)
- {
- int chr;
+ while (*read < buf_size)
+ {
+ int chr;
- chr = scm_getc (port);
- chr = (SCM_CASE_INSENSITIVE_P ? CHAR_DOWNCASE (chr) : chr);
+ chr = scm_get_byte_or_eof (port);
- if (chr == EOF)
- return 0;
+ if (chr == EOF)
+ return 0;
else if (CHAR_IS_DELIMITER (chr))
- {
- scm_ungetc (chr, port);
- return 0;
- }
+ {
+ scm_unget_byte (chr, port);
+ return 0;
+ }
else
- {
- *buf = (char) chr;
- buf++, (*read)++;
- }
+ {
+ *buf = (char) chr;
+ buf++, (*read)++;
+ }
+ }
+
+ return 1;
+ }
+
+/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
+ result in the pre-allocated buffer BUFFER, if the whole token has fewer than
+ BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
+ caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
+ will be set the number of bytes actually read. */
+static int
+read_complete_token (SCM port, char *buffer, const size_t buffer_size,
+ char **overflow_buffer, size_t *read)
+{
+ int overflow = 0;
+ size_t bytes_read, overflow_size;
+
+ *overflow_buffer = NULL;
+ overflow_size = 0;
+
+ do
+ {
+ overflow = read_token (port, buffer, buffer_size, &bytes_read);
+ if (bytes_read == 0)
+ break;
+ if (overflow || overflow_size != 0)
+ {
+ if (overflow_size == 0)
+ {
+ *overflow_buffer = scm_malloc (bytes_read);
+ memcpy (*overflow_buffer, buffer, bytes_read);
+ overflow_size = bytes_read;
+ }
+ else
+ {
+ *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
+ memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
+ overflow_size += bytes_read;
+ }
+ }
}
+ while (overflow);
- return 1;
-}
+ if (overflow_size)
+ *read = overflow_size;
+ else
+ *read = bytes_read;
+ return (overflow_size != 0);
+}
/* Skip whitespace from PORT and return the first non-whitespace character
read. Raise an error on end-of-file. */
static int
flush_ws (SCM port, const char *eoferr)
{
- register int c;
+ register scm_t_wchar c;
while (1)
switch (c = scm_getc (port))
{
eoferr = "read_sharp";
goto goteof;
case '!':
- scm_read_scsh_block_comment (c, port);
+ scm_read_shebang (c, port);
+ break;
+ case ';':
+ scm_read_commented_expression (c, port);
break;
+ case '|':
+ if (scm_is_false (scm_get_hash_procedure (c)))
+ {
+ scm_read_r6rs_block_comment (c, port);
+ break;
+ }
+ /* fall through */
default:
scm_ungetc (c, port);
return '#';
static SCM scm_read_expression (SCM port);
static SCM scm_read_sharp (int chr, SCM port);
-static SCM scm_get_hash_procedure (int c);
static SCM recsexpr (SCM obj, long line, int column, SCM filename);
static SCM
-scm_read_sexp (int chr, SCM port)
+scm_read_sexp (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_i_lreadparen"
{
register int c;
register SCM tmp;
register SCM tl, ans = SCM_EOL;
SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
- static const int terminating_char = ')';
+ const int terminating_char = ((chr == '[') ? ']' : ')');
/* Need to capture line and column numbers here. */
long line = SCM_LINUM (port);
{
SCM new_tail;
+ if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
+ scm_i_input_error (FUNC_NAME, port,
+ "in pair: mismatched close paren: ~A",
+ scm_list_1 (SCM_MAKE_CHAR (c)));
+
scm_ungetc (c, port);
- if (scm_is_eq (scm_sym_dot,
- (tmp = scm_read_expression (port))))
+ tmp = scm_read_expression (port);
+
+ if (scm_is_eq (scm_sym_dot, tmp))
{
SCM_SETCDR (tl, tmp = scm_read_expression (port));
}
#undef FUNC_NAME
+
+/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
+ C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
+ found. */
+#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
+ do \
+ { \
+ scm_t_wchar a; \
+ size_t i = 0; \
+ c = 0; \
+ while (i < ndigits) \
+ { \
+ a = scm_getc (port); \
+ if (a == EOF) \
+ goto str_eof; \
+ if (terminator \
+ && (a == (scm_t_wchar) terminator) \
+ && (i > 0)) \
+ break; \
+ if ('0' <= a && a <= '9') \
+ a -= '0'; \
+ else if ('A' <= a && a <= 'F') \
+ a = a - 'A' + 10; \
+ else if ('a' <= a && a <= 'f') \
+ a = a - 'a' + 10; \
+ else \
+ { \
+ c = a; \
+ goto bad_escaped; \
+ } \
+ c = c * 16 + a; \
+ i ++; \
+ } \
+ } while (0)
+
static SCM
scm_read_string (int chr, SCM port)
#define FUNC_NAME "scm_lreadr"
object (the string returned). */
SCM str = SCM_BOOL_F;
- char c_str[READER_STRING_BUFFER_SIZE];
unsigned c_str_len = 0;
- int c;
+ scm_t_wchar c;
+ str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
while ('"' != (c = scm_getc (port)))
{
if (c == EOF)
- str_eof: scm_i_input_error (FUNC_NAME, port,
- "end of file in string constant",
- SCM_EOL);
+ {
+ str_eof:
+ scm_i_input_error (FUNC_NAME, port,
+ "end of file in string constant", SCM_EOL);
+ }
- if (c_str_len + 1 >= sizeof (c_str))
- {
- /* Flush the C buffer onto a Scheme string. */
- SCM addy;
+ if (c_str_len + 1 >= scm_i_string_length (str))
+ {
+ SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
- if (str == SCM_BOOL_F)
- str = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
-
- addy = scm_from_locale_stringn (c_str, c_str_len);
- str = scm_string_append_shared (scm_list_2 (str, addy));
-
- c_str_len = 0;
- }
+ str = scm_string_append (scm_list_2 (str, addy));
+ }
if (c == '\\')
- switch (c = scm_getc (port))
- {
- case EOF:
- goto str_eof;
- case '"':
- case '\\':
- break;
-#if SCM_ENABLE_ELISP
- case '(':
- case ')':
- if (SCM_ESCAPED_PARENS_P)
- break;
- goto bad_escaped;
-#endif
- case '\n':
- continue;
- case '0':
- c = '\0';
- break;
- case 'f':
- c = '\f';
- break;
- case 'n':
- c = '\n';
- break;
- case 'r':
- c = '\r';
- break;
- case 't':
- c = '\t';
- break;
- case 'a':
- c = '\007';
- break;
- case 'v':
- c = '\v';
- break;
- case 'x':
- {
- int a, b;
- a = scm_getc (port);
- if (a == EOF) goto str_eof;
- b = scm_getc (port);
- if (b == EOF) goto str_eof;
- if ('0' <= a && a <= '9') a -= '0';
- else if ('A' <= a && a <= 'F') a = a - 'A' + 10;
- else if ('a' <= a && a <= 'f') a = a - 'a' + 10;
- else goto bad_escaped;
- if ('0' <= b && b <= '9') b -= '0';
- else if ('A' <= b && b <= 'F') b = b - 'A' + 10;
- else if ('a' <= b && b <= 'f') b = b - 'a' + 10;
- else goto bad_escaped;
- c = a * 16 + b;
- break;
- }
- default:
- bad_escaped:
- scm_i_input_error (FUNC_NAME, port,
- "illegal character in escape sequence: ~S",
- scm_list_1 (SCM_MAKE_CHAR (c)));
- }
- c_str[c_str_len++] = c;
+ {
+ switch (c = scm_getc (port))
+ {
+ case EOF:
+ goto str_eof;
+ case '"':
+ case '\\':
+ break;
+ case '\n':
+ continue;
+ case '0':
+ c = '\0';
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 'n':
+ c = '\n';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ case 'a':
+ c = '\007';
+ break;
+ case 'v':
+ c = '\v';
+ break;
+ case 'b':
+ c = '\010';
+ break;
+ case 'x':
+ if (SCM_R6RS_ESCAPES_P)
+ SCM_READ_HEX_ESCAPE (10, ';');
+ else
+ SCM_READ_HEX_ESCAPE (2, '\0');
+ break;
+ case 'u':
+ if (!SCM_R6RS_ESCAPES_P)
+ {
+ SCM_READ_HEX_ESCAPE (4, '\0');
+ break;
+ }
+ case 'U':
+ if (!SCM_R6RS_ESCAPES_P)
+ {
+ SCM_READ_HEX_ESCAPE (6, '\0');
+ break;
+ }
+ default:
+ bad_escaped:
+ scm_i_input_error (FUNC_NAME, port,
+ "illegal character in escape sequence: ~S",
+ scm_list_1 (SCM_MAKE_CHAR (c)));
+ }
+ }
+ str = scm_i_string_start_writing (str);
+ scm_i_string_set_x (str, c_str_len++, c);
+ scm_i_string_stop_writing ();
}
if (c_str_len > 0)
{
- SCM addy;
-
- addy = scm_from_locale_stringn (c_str, c_str_len);
- if (str == SCM_BOOL_F)
- str = addy;
- else
- str = scm_string_append_shared (scm_list_2 (str, addy));
+ return scm_i_substring_copy (str, 0, c_str_len);
}
- else
- str = (str == SCM_BOOL_F) ? scm_nullstr : str;
- return scm_i_make_read_only_string (str);
+ return scm_nullstr;
}
#undef FUNC_NAME
static SCM
-scm_read_number (int chr, SCM port)
+scm_read_number (scm_t_wchar chr, SCM port)
{
SCM result, str = SCM_EOL;
char buffer[READER_BUFFER_SIZE];
- size_t read;
- int overflow = 0;
+ char *overflow_buffer = NULL;
+ size_t bytes_read;
+ int overflow;
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
scm_ungetc (chr, port);
- do
- {
- overflow = read_token (port, buffer, sizeof (buffer), &read);
+ overflow = read_complete_token (port, buffer, sizeof (buffer),
+ &overflow_buffer, &bytes_read);
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
- }
- while (overflow);
-
- if (scm_is_pair (str))
- {
- /* The slow path. */
-
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- result = scm_string_to_number (str, SCM_UNDEFINED);
- if (!scm_is_true (result))
- /* Return a symbol instead of a number. */
- result = scm_string_to_symbol (str);
- }
+ if (!overflow)
+ str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
else
+ str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
+ pt->ilseq_handler);
+
+ result = scm_string_to_number (str, SCM_UNDEFINED);
+ if (!scm_is_true (result))
{
- result = scm_c_locale_stringn_to_number (buffer, read, 10);
- if (!scm_is_true (result))
- /* Return a symbol instead of a number. */
- result = scm_from_locale_symboln (buffer, read);
+ /* Return a symbol instead of a number */
+ if (SCM_CASE_INSENSITIVE_P)
+ str = scm_string_downcase_x (str);
+ result = scm_string_to_symbol (str);
}
+ if (overflow)
+ free (overflow_buffer);
+ SCM_COL (port) += scm_i_string_length (str);
return result;
}
static SCM
-scm_read_mixed_case_symbol (int chr, SCM port)
+scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
{
- SCM result, str = SCM_EOL;
- int overflow = 0, ends_with_colon = 0;
- char buffer[READER_BUFFER_SIZE];
- size_t read = 0;
+ SCM result;
+ int ends_with_colon = 0;
+ size_t bytes_read;
int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
+ int overflow;
+ char buffer[READER_BUFFER_SIZE], *overflow_buffer;
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ SCM str;
scm_ungetc (chr, port);
- do
+ overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
+ &overflow_buffer, &bytes_read);
+ if (bytes_read > 0)
{
- overflow = read_token (port, buffer, sizeof (buffer), &read);
-
- if (read > 0)
- ends_with_colon = (buffer[read - 1] == ':');
-
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
+ if (!overflow)
+ ends_with_colon = buffer[bytes_read - 1] == ':';
+ else
+ ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
}
- while (overflow);
- if (scm_is_pair (str))
+ if (postfix && ends_with_colon && (bytes_read > 1))
{
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- result = scm_string_to_symbol (str);
+ if (!overflow)
+ str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
+ else
+ str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
+ pt->ilseq_handler);
- /* Per SRFI-88, `:' alone is an identifier, not a keyword. */
- if (postfix && ends_with_colon && (scm_c_string_length (result) > 1))
- result = scm_symbol_to_keyword (result);
+ if (SCM_CASE_INSENSITIVE_P)
+ str = scm_string_downcase_x (str);
+ result = scm_symbol_to_keyword (scm_string_to_symbol (str));
}
else
{
- /* For symbols smaller than `sizeof (buffer)', we don't need to recur
- to Scheme strings. Therefore, we only create one Scheme object (a
- symbol) per symbol read. */
- if (postfix && ends_with_colon && (read > 1))
- result = scm_from_locale_keywordn (buffer, read - 1);
+ if (!overflow)
+ str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
else
- result = scm_from_locale_symboln (buffer, read);
+ str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
+ pt->ilseq_handler);
+
+ if (SCM_CASE_INSENSITIVE_P)
+ str = scm_string_downcase_x (str);
+ result = scm_string_to_symbol (str);
}
+ if (overflow)
+ free (overflow_buffer);
+ SCM_COL (port) += scm_i_string_length (str);
return result;
}
static SCM
-scm_read_number_and_radix (int chr, SCM port)
+scm_read_number_and_radix (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
- SCM result, str = SCM_EOL;
+ SCM result;
size_t read;
- char buffer[READER_BUFFER_SIZE];
+ char buffer[READER_BUFFER_SIZE], *overflow_buffer;
+ int overflow;
unsigned int radix;
- int overflow = 0;
+ SCM str;
+ scm_t_port *pt;
switch (chr)
{
radix = 10;
}
- do
- {
- overflow = read_token (port, buffer, sizeof (buffer), &read);
+ overflow = read_complete_token (port, buffer, sizeof (buffer),
+ &overflow_buffer, &read);
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
- }
- while (overflow);
-
- if (scm_is_pair (str))
- {
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- result = scm_string_to_number (str, scm_from_uint (radix));
- }
+ pt = SCM_PTAB_ENTRY (port);
+ if (!overflow)
+ str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
else
- result = scm_c_locale_stringn_to_number (buffer, read, radix);
+ str = scm_from_stringn (overflow_buffer, read, pt->encoding,
+ pt->ilseq_handler);
+
+ result = scm_string_to_number (str, scm_from_uint (radix));
+
+ if (overflow)
+ free (overflow_buffer);
+
+ SCM_COL (port) += scm_i_string_length (str);
if (scm_is_true (result))
return result;
case ',':
{
- int c;
+ scm_t_wchar c;
c = scm_getc (port);
if ('@' == c)
return p;
}
+SCM_SYMBOL (sym_syntax, "syntax");
+SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
+SCM_SYMBOL (sym_unsyntax, "unsyntax");
+SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
+
+static SCM
+scm_read_syntax (int chr, SCM port)
+{
+ SCM p;
+ long line = SCM_LINUM (port);
+ int column = SCM_COL (port) - 1;
+
+ switch (chr)
+ {
+ case '`':
+ p = sym_quasisyntax;
+ break;
+
+ case '\'':
+ p = sym_syntax;
+ break;
+
+ case ',':
+ {
+ int c;
+
+ c = scm_getc (port);
+ if ('@' == c)
+ p = sym_unsyntax_splicing;
+ else
+ {
+ scm_ungetc (c, port);
+ p = sym_unsyntax;
+ }
+ break;
+ }
+
+ default:
+ fprintf (stderr, "%s: unhandled syntax character (%i)\n",
+ "scm_read_syntax", chr);
+ abort ();
+ }
+
+ p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
+ if (SCM_RECORD_POSITIONS_P)
+ scm_whash_insert (scm_source_whash, p,
+ scm_make_srcprops (line, column,
+ SCM_FILENAME (port),
+ SCM_COPY_SOURCE_P
+ ? (scm_cons2 (SCM_CAR (p),
+ SCM_CAR (SCM_CDR (p)),
+ SCM_EOL))
+ : SCM_UNDEFINED,
+ SCM_EOL));
+
+
+ return p;
+}
+
+static inline SCM
+scm_read_nil (int chr, SCM port)
+{
+ SCM id = scm_read_mixed_case_symbol (chr, port);
+
+ if (!scm_is_eq (id, sym_nil))
+ scm_i_input_error ("scm_read_nil", port,
+ "unexpected input while reading #nil: ~a",
+ scm_list_1 (id));
+
+ return SCM_ELISP_NIL;
+}
+
static inline SCM
scm_read_semicolon_comment (int chr, SCM port)
{
int c;
- for (c = scm_getc (port);
+ /* We use the get_byte here because there is no need to get the
+ locale correct with comment input. This presumes that newline
+ always represents itself no matter what the encoding is. */
+ for (c = scm_get_byte_or_eof (port);
(c != EOF) && (c != '\n');
- c = scm_getc (port));
+ c = scm_get_byte_or_eof (port));
return SCM_UNSPECIFIED;
}
}
static SCM
-scm_read_character (int chr, SCM port)
+scm_read_character (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
- unsigned c;
- char charname[READER_CHAR_NAME_MAX_SIZE];
- size_t charname_len;
-
- if (read_token (port, charname, sizeof (charname), &charname_len))
- goto char_error;
-
- if (charname_len == 0)
+ char buffer[READER_CHAR_NAME_MAX_SIZE];
+ SCM charname;
+ size_t charname_len, bytes_read;
+ scm_t_wchar cp;
+ int overflow;
+ scm_t_port *pt;
+
+ overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
+ if (overflow)
+ scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
+
+ if (bytes_read == 0)
{
chr = scm_getc (port);
if (chr == EOF)
return (SCM_MAKE_CHAR (chr));
}
+ pt = SCM_PTAB_ENTRY (port);
+
+ /* Simple ASCII characters can be processed immediately. Also, simple
+ ISO-8859-1 characters can be processed immediately if the encoding for this
+ port is ISO-8859-1. */
+ if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
+ {
+ SCM_COL (port) += 1;
+ return SCM_MAKE_CHAR (buffer[0]);
+ }
+
+ /* Otherwise, convert the buffer into a proper scheme string for
+ processing. */
+ charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
+ pt->ilseq_handler);
+ charname_len = scm_i_string_length (charname);
+ SCM_COL (port) += charname_len;
+ cp = scm_i_string_ref (charname, 0);
if (charname_len == 1)
- return SCM_MAKE_CHAR (charname[0]);
+ return SCM_MAKE_CHAR (cp);
+
+ /* Ignore dotted circles, which may be used to keep combining characters from
+ combining with the backslash in #\charname. */
+ if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
+ return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
- if (*charname >= '0' && *charname < '8')
+ if (cp >= '0' && cp < '8')
{
/* Dirk:FIXME:: This type of character syntax is not R5RS
* compliant. Further, it should be verified that the constant
- * does only consist of octal digits. Finally, it should be
- * checked whether the resulting fixnum is in the range of
- * characters. */
- SCM p = scm_c_locale_stringn_to_number (charname, charname_len, 8);
+ * does only consist of octal digits. */
+ SCM p = scm_string_to_number (charname, scm_from_uint (8));
if (SCM_I_INUMP (p))
- return SCM_MAKE_CHAR (SCM_I_INUM (p));
+ {
+ scm_t_wchar c = scm_to_uint32 (p);
+ if (SCM_IS_UNICODE_CHAR (c))
+ return SCM_MAKE_CHAR (c);
+ else
+ scm_i_input_error (FUNC_NAME, port,
+ "out-of-range octal character escape: ~a",
+ scm_list_1 (charname));
+ }
}
- for (c = 0; c < scm_n_charnames; c++)
- if (scm_charnames[c]
- && (!strncasecmp (scm_charnames[c], charname, charname_len)))
- return SCM_MAKE_CHAR (scm_charnums[c]);
+ if (cp == 'x' && (charname_len > 1))
+ {
+ SCM p;
+
+ /* Convert from hex, skipping the initial 'x' character in CHARNAME */
+ p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
+ scm_from_uint (16));
+ if (SCM_I_INUMP (p))
+ {
+ scm_t_wchar c = scm_to_uint32 (p);
+ if (SCM_IS_UNICODE_CHAR (c))
+ return SCM_MAKE_CHAR (c);
+ else
+ scm_i_input_error (FUNC_NAME, port,
+ "out-of-range hex character escape: ~a",
+ scm_list_1 (charname));
+ }
+ }
+
+ /* The names of characters should never have non-Latin1
+ characters. */
+ if (scm_i_is_narrow_string (charname)
+ || scm_i_try_narrow_string (charname))
+ { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
+ charname_len);
+ if (scm_is_true (ch))
+ return ch;
+ }
- char_error:
scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
- scm_list_1 (scm_from_locale_stringn (charname,
- charname_len)));
+ scm_list_1 (charname));
return SCM_UNSPECIFIED;
}
}
static SCM
-scm_read_guile_bit_vector (int chr, SCM port)
+scm_read_bytevector (scm_t_wchar chr, SCM port)
+{
+ chr = scm_getc (port);
+ if (chr != 'u')
+ goto syntax;
+
+ chr = scm_getc (port);
+ if (chr != '8')
+ goto syntax;
+
+ chr = scm_getc (port);
+ if (chr != '(')
+ goto syntax;
+
+ return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
+
+ syntax:
+ scm_i_input_error ("read_bytevector", port,
+ "invalid bytevector prefix",
+ SCM_MAKE_CHAR (chr));
+ return SCM_UNSPECIFIED;
+}
+
+static SCM
+scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
{
/* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
terribly inefficient but who cares? */
}
static inline SCM
-scm_read_scsh_block_comment (int chr, SCM port)
+scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
{
int bang_seen = 0;
+ /* We can use the get_byte here because there is no need to get the
+ locale correct when reading comments. This presumes that
+ hash and exclamation points always represent themselves no
+ matter what the source encoding is.*/
for (;;)
{
- int c = scm_getc (port);
+ int c = scm_get_byte_or_eof (port);
if (c == EOF)
scm_i_input_error ("skip_block_comment", port,
return SCM_UNSPECIFIED;
}
+static inline SCM
+scm_read_shebang (scm_t_wchar chr, SCM port)
+{
+ int c = 0;
+ if ((c = scm_get_byte_or_eof (port)) != 'r')
+ {
+ scm_ungetc (c, port);
+ return scm_read_scsh_block_comment (chr, port);
+ }
+ if ((c = scm_get_byte_or_eof (port)) != '6')
+ {
+ scm_ungetc (c, port);
+ scm_ungetc ('r', port);
+ return scm_read_scsh_block_comment (chr, port);
+ }
+ if ((c = scm_get_byte_or_eof (port)) != 'r')
+ {
+ scm_ungetc (c, port);
+ scm_ungetc ('6', port);
+ scm_ungetc ('r', port);
+ return scm_read_scsh_block_comment (chr, port);
+ }
+ if ((c = scm_get_byte_or_eof (port)) != 's')
+ {
+ scm_ungetc (c, port);
+ scm_ungetc ('r', port);
+ scm_ungetc ('6', port);
+ scm_ungetc ('r', port);
+ return scm_read_scsh_block_comment (chr, port);
+ }
+
+ return SCM_UNSPECIFIED;
+}
+
+static SCM
+scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
+{
+ /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
+ nested. So care must be taken. */
+ int nesting_level = 1;
+ int opening_seen = 0, closing_seen = 0;
+
+ while (nesting_level > 0)
+ {
+ int c = scm_getc (port);
+
+ if (c == EOF)
+ scm_i_input_error ("scm_read_r6rs_block_comment", port,
+ "unterminated `#| ... |#' comment", SCM_EOL);
+
+ if (opening_seen)
+ {
+ if (c == '|')
+ nesting_level++;
+ opening_seen = 0;
+ }
+ else if (closing_seen)
+ {
+ if (c == '#')
+ nesting_level--;
+ closing_seen = 0;
+ }
+ else if (c == '|')
+ closing_seen = 1;
+ else if (c == '#')
+ opening_seen = 1;
+ else
+ opening_seen = closing_seen = 0;
+ }
+
+ return SCM_UNSPECIFIED;
+}
+
static SCM
-scm_read_extended_symbol (int chr, SCM port)
+scm_read_commented_expression (scm_t_wchar chr, SCM port)
+{
+ scm_t_wchar c;
+
+ c = flush_ws (port, (char *) NULL);
+ if (EOF == c)
+ scm_i_input_error ("read_commented_expression", port,
+ "no expression after #; comment", SCM_EOL);
+ scm_ungetc (c, port);
+ scm_read_expression (port);
+ return SCM_UNSPECIFIED;
+}
+
+static SCM
+scm_read_extended_symbol (scm_t_wchar chr, SCM port)
{
/* Guile's extended symbol read syntax looks like this:
#{This is all a symbol name}#
So here, CHR is expected to be `{'. */
- SCM result;
int saw_brace = 0, finished = 0;
size_t len = 0;
- char buf[1024];
+ SCM buf = scm_i_make_string (1024, NULL);
- result = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
+ buf = scm_i_string_start_writing (buf);
while ((chr = scm_getc (port)) != EOF)
{
else
{
saw_brace = 0;
- buf[len++] = '}';
- buf[len++] = chr;
+ scm_i_string_set_x (buf, len++, '}');
+ scm_i_string_set_x (buf, len++, chr);
}
}
else if (chr == '}')
saw_brace = 1;
else
- buf[len++] = chr;
+ scm_i_string_set_x (buf, len++, chr);
- if (len >= sizeof (buf) - 2)
+ if (len >= scm_i_string_length (buf) - 2)
{
- scm_string_append (scm_list_2 (result,
- scm_from_locale_stringn (buf, len)));
+ SCM addy;
+
+ scm_i_string_stop_writing ();
+ addy = scm_i_make_string (1024, NULL);
+ buf = scm_string_append (scm_list_2 (buf, addy));
len = 0;
+ buf = scm_i_string_start_writing (buf);
}
if (finished)
break;
}
+ scm_i_string_stop_writing ();
- if (len)
- result = scm_string_append (scm_list_2
- (result,
- scm_from_locale_stringn (buf, len)));
-
- return (scm_string_to_symbol (result));
+ return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
}
/* The reader for the sharp `#' character. It basically dispatches reads
among the above token readers. */
static SCM
-scm_read_sharp (int chr, SCM port)
+scm_read_sharp (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
SCM result;
case 'f':
/* This one may return either a boolean or an SRFI-4 vector. */
return (scm_read_srfi4_vector (chr, port));
+ case 'v':
+ return (scm_read_bytevector (chr, port));
case '*':
return (scm_read_guile_bit_vector (chr, port));
case 't':
{
/* When next char is '(', it really is an old-style
uniform array. */
- int next_c = scm_getc (port);
+ scm_t_wchar next_c = scm_getc (port);
if (next_c != EOF)
scm_ungetc (next_c, port);
if (next_c == '(')
case '{':
return (scm_read_extended_symbol (chr, port));
case '!':
- return (scm_read_scsh_block_comment (chr, port));
+ return (scm_read_shebang (chr, port));
+ case ';':
+ return (scm_read_commented_expression (chr, port));
+ case '`':
+ case '\'':
+ case ',':
+ return (scm_read_syntax (chr, port));
+ case 'n':
+ return (scm_read_nil (chr, port));
default:
result = scm_read_sharp_extension (chr, port);
if (scm_is_eq (result, SCM_UNSPECIFIED))
- scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
- scm_list_1 (SCM_MAKE_CHAR (chr)));
+ {
+ /* To remain compatible with 1.8 and earlier, the following
+ characters have lower precedence than `read-hash-extend'
+ characters. */
+ switch (chr)
+ {
+ case '|':
+ return scm_read_r6rs_block_comment (chr, port);
+ default:
+ scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
+ scm_list_1 (SCM_MAKE_CHAR (chr)));
+ }
+ }
else
return result;
}
{
while (1)
{
- register int chr;
+ register scm_t_wchar chr;
chr = scm_getc (port);
case ';':
(void) scm_read_semicolon_comment (chr, port);
break;
+ case '[':
+ if (!SCM_SQUARE_BRACKETS_P)
+ return (scm_read_mixed_case_symbol (chr, port));
+ /* otherwise fall through */
case '(':
return (scm_read_sexp (chr, port));
case '"':
case ')':
scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
break;
+ case ']':
+ if (SCM_SQUARE_BRACKETS_P)
+ scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
+ /* otherwise fall through */
case EOF:
return SCM_EOF_VAL;
case ':':
proc, SCM_ARG2, FUNC_NAME);
/* Check if chr is already in the alist. */
- this = *scm_read_hash_procedures;
+ this = scm_i_read_hash_procedures_ref ();
prev = SCM_BOOL_F;
while (1)
{
/* not found, so add it to the beginning. */
if (scm_is_true (proc))
{
- *scm_read_hash_procedures =
- scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
+ SCM new = scm_cons (scm_cons (chr, proc),
+ scm_i_read_hash_procedures_ref ());
+ scm_i_read_hash_procedures_set_x (new);
}
break;
}
/* remove it. */
if (scm_is_false (prev))
{
- *scm_read_hash_procedures =
- SCM_CDR (*scm_read_hash_procedures);
+ SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
+ scm_i_read_hash_procedures_set_x (rest);
}
else
scm_set_cdr_x (prev, SCM_CDR (this));
static SCM
scm_get_hash_procedure (int c)
{
- SCM rest = *scm_read_hash_procedures;
+ SCM rest = scm_i_read_hash_procedures_ref ();
while (1)
{
}
}
+#define SCM_ENCODING_SEARCH_SIZE (500)
+
+/* Search the first few hundred characters of a file for an Emacs-like coding
+ declaration. Returns either NULL or a string whose storage has been
+ allocated with `scm_gc_malloc ()'. */
+char *
+scm_i_scan_for_encoding (SCM port)
+{
+ char header[SCM_ENCODING_SEARCH_SIZE+1];
+ size_t bytes_read, encoding_length, i;
+ char *encoding = NULL;
+ int utf8_bom = 0;
+ char *pos, *encoding_start;
+ int in_comment;
+
+ if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
+ /* PORT is a non-seekable file port (e.g., as created by Bash when using
+ "guile <(echo '(display "hello")')") so bail out. */
+ return NULL;
+
+ bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
+ header[bytes_read] = '\0';
+
+ scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
+
+ if (bytes_read > 3
+ && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
+ utf8_bom = 1;
+
+ /* search past "coding[:=]" */
+ pos = header;
+ while (1)
+ {
+ if ((pos = strstr(pos, "coding")) == NULL)
+ return NULL;
+
+ pos += strlen("coding");
+ if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
+ (*pos == ':' || *pos == '='))
+ {
+ pos ++;
+ break;
+ }
+ }
+
+ /* skip spaces */
+ while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
+ (*pos == ' ' || *pos == '\t'))
+ pos ++;
+
+ /* grab the next token */
+ encoding_start = pos;
+ i = 0;
+ while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
+ && encoding_start + i - header < bytes_read
+ && (isalnum ((int) encoding_start[i])
+ || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
+ i++;
+
+ encoding_length = i;
+ if (encoding_length == 0)
+ return NULL;
+
+ encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
+ for (i = 0; i < encoding_length; i++)
+ encoding[i] = toupper ((int) encoding[i]);
+
+ /* push backwards to make sure we were in a comment */
+ in_comment = 0;
+ pos = encoding_start;
+ while (pos >= header)
+ {
+ if (*pos == '\n')
+ {
+ /* This wasn't in a semicolon comment. Check for a
+ hash-bang comment. */
+ char *beg = strstr (header, "#!");
+ char *end = strstr (header, "!#");
+ if (beg < encoding_start && encoding_start + encoding_length < end)
+ in_comment = 1;
+ break;
+ }
+ if (*pos == ';')
+ {
+ in_comment = 1;
+ break;
+ }
+ pos --;
+ }
+ if (!in_comment)
+ /* This wasn't in a comment */
+ return NULL;
+
+ if (utf8_bom && strcmp(encoding, "UTF-8"))
+ scm_misc_error (NULL,
+ "the port input declares the encoding ~s but is encoded as UTF-8",
+ scm_list_1 (scm_from_locale_string (encoding)));
+
+ return encoding;
+}
+
+SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
+ (SCM port),
+ "Scans the port for an Emacs-like character coding declaration\n"
+ "near the top of the contents of a port with random-acessible contents.\n"
+ "The coding declaration is of the form\n"
+ "@code{coding: XXXXX} and must appear in a scheme comment.\n"
+ "\n"
+ "Returns a string containing the character encoding of the file\n"
+ "if a declaration was found, or @code{#f} otherwise.\n")
+#define FUNC_NAME s_scm_file_encoding
+{
+ char *enc;
+ SCM s_enc;
+
+ enc = scm_i_scan_for_encoding (port);
+ if (enc == NULL)
+ return SCM_BOOL_F;
+ else
+ {
+ s_enc = scm_from_locale_string (enc);
+ return s_enc;
+ }
+
+ return SCM_BOOL_F;
+}
+#undef FUNC_NAME
+
void
scm_init_read ()
{
- scm_read_hash_procedures =
- SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
+ SCM read_hash_procs;
+
+ read_hash_procs = scm_make_fluid ();
+ scm_fluid_set_x (read_hash_procs, SCM_EOL);
+
+ scm_i_read_hash_procedures =
+ SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
scm_init_opts (scm_read_options, scm_read_opts);
#include "libguile/read.x"