HCoop Git - bpt/guile.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
	2	* 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
	3	*
	4	* This library is free software; you can redistribute it and/or
	5	* modify it under the terms of the GNU Lesser General Public License
	6	* as published by the Free Software Foundation; either version 3 of
	7	* the License, or (at your option) any later version.
	8	*
	9	* This library is distributed in the hope that it will be useful, but
	10	* WITHOUT ANY WARRANTY; without even the implied warranty of
	11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	12	* Lesser General Public License for more details.
	13	*
	14	* You should have received a copy of the GNU Lesser General Public
	15	* License along with this library; if not, write to the Free Software
	16	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
	17	* 02110-1301 USA
	18	*/
	19
	20
	21	\f
	22
	23	#ifdef HAVE_CONFIG_H
	24	# include <config.h>
	25	#endif
	26
	27	#include <stdio.h>
	28	#include <ctype.h>
	29	#include <string.h>
	30	#include <unistd.h>
	31	#include <unicase.h>
	32	#include <unictype.h>
	33
	34	#include "libguile/_scm.h"
	35	#include "libguile/bytevectors.h"
	36	#include "libguile/chars.h"
	37	#include "libguile/eval.h"
	38	#include "libguile/arrays.h"
	39	#include "libguile/bitvectors.h"
	40	#include "libguile/keywords.h"
	41	#include "libguile/alist.h"
	42	#include "libguile/srcprop.h"
	43	#include "libguile/hashtab.h"
	44	#include "libguile/hash.h"
	45	#include "libguile/ports.h"
	46	#include "libguile/fports.h"
	47	#include "libguile/root.h"
	48	#include "libguile/strings.h"
	49	#include "libguile/strports.h"
	50	#include "libguile/vectors.h"
	51	#include "libguile/validate.h"
	52	#include "libguile/srfi-4.h"
	53	#include "libguile/srfi-13.h"
	54
	55	#include "libguile/read.h"
	56	#include "libguile/private-options.h"
	57
	58
	59	\f
	60
	61	SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
	62	SCM_SYMBOL (scm_keyword_prefix, "prefix");
	63	SCM_SYMBOL (scm_keyword_postfix, "postfix");
	64	SCM_SYMBOL (sym_nil, "nil");
	65
	66	scm_t_option scm_read_opts[] = {
	67	{ SCM_OPTION_BOOLEAN, "copy", 0,
	68	"Copy source code expressions." },
	69	{ SCM_OPTION_BOOLEAN, "positions", 1,
	70	"Record positions of source code expressions." },
	71	{ SCM_OPTION_BOOLEAN, "case-insensitive", 0,
	72	"Convert symbols to lower case."},
	73	{ SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
	74	"Style of keyword recognition: #f, 'prefix or 'postfix."},
	75	{ SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
	76	"Use R6RS variable-length character and string hex escapes."},
	77	{ SCM_OPTION_BOOLEAN, "square-brackets", 1,
	78	"Treat `[' and `]' as parentheses, for R6RS compatibility."},
	79	{ SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
	80	"In strings, consume leading whitespace after an escaped end-of-line."},
	81	{ 0, },
	82	};
	83
	84	/*
	85	Give meaningful error messages for errors
	86
	87	We use the format
	88
	89	FILE:LINE:COL: MESSAGE
	90	This happened in ....
	91
	92	This is not standard GNU format, but the test-suite likes the real
	93	message to be in front.
	94
	95	*/
	96
	97
	98	void
	99	scm_i_input_error (char const *function,
	100	SCM port, const char *message, SCM arg)
	101	{
	102	SCM fn = (scm_is_string (SCM_FILENAME(port))
	103	? SCM_FILENAME(port)
	104	: scm_from_locale_string ("#<unknown port>"));
	105
	106	SCM string_port = scm_open_output_string ();
	107	SCM string = SCM_EOL;
	108	scm_simple_format (string_port,
	109	scm_from_locale_string ("~A:~S:~S: ~A"),
	110	scm_list_4 (fn,
	111	scm_from_long (SCM_LINUM (port) + 1),
	112	scm_from_int (SCM_COL (port) + 1),
	113	scm_from_locale_string (message)));
	114
	115	string = scm_get_output_string (string_port);
	116	scm_close_output_port (string_port);
	117	scm_error_scm (scm_from_latin1_symbol ("read-error"),
	118	function? scm_from_locale_string (function) : SCM_BOOL_F,
	119	string,
	120	arg,
	121	SCM_BOOL_F);
	122	}
	123
	124
	125	SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
	126	(SCM setting),
	127	"Option interface for the read options. Instead of using\n"
	128	"this procedure directly, use the procedures @code{read-enable},\n"
	129	"@code{read-disable}, @code{read-set!} and @code{read-options}.")
	130	#define FUNC_NAME s_scm_read_options
	131	{
	132	SCM ans = scm_options (setting,
	133	scm_read_opts,
	134	FUNC_NAME);
	135	if (SCM_COPY_SOURCE_P)
	136	SCM_RECORD_POSITIONS_P = 1;
	137	return ans;
	138	}
	139	#undef FUNC_NAME
	140
	141	/* A fluid referring to an association list mapping extra hash
	142	characters to procedures. */
	143	static SCM *scm_i_read_hash_procedures;
	144
	145	static SCM
	146	scm_i_read_hash_procedures_ref (void)
	147	{
	148	return scm_fluid_ref (*scm_i_read_hash_procedures);
	149	}
	150
	151	static void
	152	scm_i_read_hash_procedures_set_x (SCM value)
	153	{
	154	scm_fluid_set_x (*scm_i_read_hash_procedures, value);
	155	}
	156
	157	\f
	158	/* Token readers. */
	159
	160
	161	/* Size of the C buffer used to read symbols and numbers. */
	162	#define READER_BUFFER_SIZE 128
	163
	164	/* Number of 32-bit codepoints in the buffer used to read strings. */
	165	#define READER_STRING_BUFFER_SIZE 128
	166
	167	/* The maximum size of Scheme character names. */
	168	#define READER_CHAR_NAME_MAX_SIZE 50
	169
	170
	171	/* `isblank' is only in C99. */
	172	#define CHAR_IS_BLANK_(_chr) \
	173	(((_chr) == ' ') \|\| ((_chr) == '\t') \|\| ((_chr) == '\n') \
	174	\|\| ((_chr) == '\f') \|\| ((_chr) == '\r'))
	175
	176	#ifdef MSDOS
	177	# define CHAR_IS_BLANK(_chr) \
	178	((CHAR_IS_BLANK_ (chr)) \|\| ((_chr) == 26))
	179	#else
	180	# define CHAR_IS_BLANK CHAR_IS_BLANK_
	181	#endif
	182
	183
	184	/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
	185	structure''). */
	186	#define CHAR_IS_R5RS_DELIMITER(c) \
	187	(CHAR_IS_BLANK (c) \
	188	\|\| (c == ')') \|\| (c == '(') \|\| (c == ';') \|\| (c == '"') \
	189	\|\| (SCM_SQUARE_BRACKETS_P && ((c == '[') \|\| (c == ']'))))
	190
	191	#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
	192
	193	/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
	194	Structure''. */
	195	#define CHAR_IS_EXPONENT_MARKER(_chr) \
	196	(((_chr) == 'e') \|\| ((_chr) == 's') \|\| ((_chr) == 'f') \
	197	\|\| ((_chr) == 'd') \|\| ((_chr) == 'l'))
	198
	199	/* Read an SCSH block comment. */
	200	static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
	201	static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
	202	static SCM scm_read_commented_expression (scm_t_wchar, SCM);
	203	static SCM scm_read_shebang (scm_t_wchar, SCM);
	204	static SCM scm_get_hash_procedure (int);
	205
	206	/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
	207	result in the pre-allocated buffer BUF. Return zero if the whole token has
	208	fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
	209	bytes actually read. */
	210	static int
	211	read_token (SCM port, char buf, size_t buf_size, size_t read)
	212	{
	213	*read = 0;
	214
	215	while (*read < buf_size)
	216	{
	217	int chr;
	218
	219	chr = scm_get_byte_or_eof (port);
	220
	221	if (chr == EOF)
	222	return 0;
	223	else if (CHAR_IS_DELIMITER (chr))
	224	{
	225	scm_unget_byte (chr, port);
	226	return 0;
	227	}
	228	else
	229	{
	230	*buf = (char) chr;
	231	buf++, (*read)++;
	232	}
	233	}
	234
	235	return 1;
	236	}
	237
	238	/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
	239	if the token doesn't fit in BUFFER_SIZE bytes. */
	240	static char *
	241	read_complete_token (SCM port, char *buffer, size_t buffer_size,
	242	size_t *read)
	243	{
	244	int overflow = 0;
	245	size_t bytes_read, overflow_size = 0;
	246	char *overflow_buffer = NULL;
	247
	248	do
	249	{
	250	overflow = read_token (port, buffer, buffer_size, &bytes_read);
	251	if (bytes_read == 0)
	252	break;
	253	if (overflow \|\| overflow_size != 0)
	254	{
	255	if (overflow_size == 0)
	256	{
	257	overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
	258	memcpy (overflow_buffer, buffer, bytes_read);
	259	overflow_size = bytes_read;
	260	}
	261	else
	262	{
	263	char *new_buf =
	264	scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
	265
	266	memcpy (new_buf, overflow_buffer, overflow_size);
	267	memcpy (new_buf + overflow_size, buffer, bytes_read);
	268
	269	overflow_buffer = new_buf;
	270	overflow_size += bytes_read;
	271	}
	272	}
	273	}
	274	while (overflow);
	275
	276	if (overflow_size)
	277	*read = overflow_size;
	278	else
	279	*read = bytes_read;
	280
	281	return (overflow_size > 0 ? overflow_buffer : buffer);
	282	}
	283
	284	/* Skip whitespace from PORT and return the first non-whitespace character
	285	read. Raise an error on end-of-file. */
	286	static int
	287	flush_ws (SCM port, const char *eoferr)
	288	{
	289	scm_t_wchar c;
	290	while (1)
	291	switch (c = scm_getc (port))
	292	{
	293	case EOF:
	294	goteof:
	295	if (eoferr)
	296	{
	297	scm_i_input_error (eoferr,
	298	port,
	299	"end of file",
	300	SCM_EOL);
	301	}
	302	return c;
	303
	304	case ';':
	305	lp:
	306	switch (c = scm_getc (port))
	307	{
	308	case EOF:
	309	goto goteof;
	310	default:
	311	goto lp;
	312	case SCM_LINE_INCREMENTORS:
	313	break;
	314	}
	315	break;
	316
	317	case '#':
	318	switch (c = scm_getc (port))
	319	{
	320	case EOF:
	321	eoferr = "read_sharp";
	322	goto goteof;
	323	case '!':
	324	scm_read_shebang (c, port);
	325	break;
	326	case ';':
	327	scm_read_commented_expression (c, port);
	328	break;
	329	case '\|':
	330	if (scm_is_false (scm_get_hash_procedure (c)))
	331	{
	332	scm_read_r6rs_block_comment (c, port);
	333	break;
	334	}
	335	/* fall through */
	336	default:
	337	scm_ungetc (c, port);
	338	return '#';
	339	}
	340	break;
	341
	342	case SCM_LINE_INCREMENTORS:
	343	case SCM_SINGLE_SPACES:
	344	case '\t':
	345	break;
	346
	347	default:
	348	return c;
	349	}
	350
	351	return 0;
	352	}
	353
	354
	355	\f
	356	/* Token readers. */
	357
	358	static SCM scm_read_expression (SCM port);
	359	static SCM scm_read_sharp (int chr, SCM port, long line, int column);
	360
	361
	362	static SCM
	363	maybe_annotate_source (SCM x, SCM port, long line, int column)
	364	{
	365	if (SCM_RECORD_POSITIONS_P)
	366	scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
	367	return x;
	368	}
	369
	370	static SCM
	371	scm_read_sexp (scm_t_wchar chr, SCM port)
	372	#define FUNC_NAME "scm_i_lreadparen"
	373	{
	374	int c;
	375	SCM tmp, tl, ans = SCM_EOL;
	376	const int terminating_char = ((chr == '[') ? ']' : ')');
	377
	378	/* Need to capture line and column numbers here. */
	379	long line = SCM_LINUM (port);
	380	int column = SCM_COL (port) - 1;
	381
	382	c = flush_ws (port, FUNC_NAME);
	383	if (terminating_char == c)
	384	return SCM_EOL;
	385
	386	scm_ungetc (c, port);
	387	tmp = scm_read_expression (port);
	388
	389	/* Note that it is possible for scm_read_expression to return
	390	scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
	391	check that it's a real dot by checking `c'. */
	392	if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
	393	{
	394	ans = scm_read_expression (port);
	395	if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
	396	scm_i_input_error (FUNC_NAME, port, "missing close paren",
	397	SCM_EOL);
	398	return ans;
	399	}
	400
	401	/* Build the head of the list structure. */
	402	ans = tl = scm_cons (tmp, SCM_EOL);
	403
	404	while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
	405	{
	406	SCM new_tail;
	407
	408	if (c == ')' \|\| (SCM_SQUARE_BRACKETS_P && c == ']'))
	409	scm_i_input_error (FUNC_NAME, port,
	410	"in pair: mismatched close paren: ~A",
	411	scm_list_1 (SCM_MAKE_CHAR (c)));
	412
	413	scm_ungetc (c, port);
	414	tmp = scm_read_expression (port);
	415
	416	/* See above note about scm_sym_dot. */
	417	if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
	418	{
	419	SCM_SETCDR (tl, scm_read_expression (port));
	420
	421	c = flush_ws (port, FUNC_NAME);
	422	if (terminating_char != c)
	423	scm_i_input_error (FUNC_NAME, port,
	424	"in pair: missing close paren", SCM_EOL);
	425	goto exit;
	426	}
	427
	428	new_tail = scm_cons (tmp, SCM_EOL);
	429	SCM_SETCDR (tl, new_tail);
	430	tl = new_tail;
	431	}
	432
	433	exit:
	434	return maybe_annotate_source (ans, port, line, column);
	435	}
	436	#undef FUNC_NAME
	437
	438
	439	/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
	440	C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
	441	found. */
	442	#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
	443	do \
	444	{ \
	445	scm_t_wchar a; \
	446	size_t i = 0; \
	447	c = 0; \
	448	while (i < ndigits) \
	449	{ \
	450	a = scm_getc (port); \
	451	if (a == EOF) \
	452	goto str_eof; \
	453	if (terminator \
	454	&& (a == (scm_t_wchar) terminator) \
	455	&& (i > 0)) \
	456	break; \
	457	if ('0' <= a && a <= '9') \
	458	a -= '0'; \
	459	else if ('A' <= a && a <= 'F') \
	460	a = a - 'A' + 10; \
	461	else if ('a' <= a && a <= 'f') \
	462	a = a - 'a' + 10; \
	463	else \
	464	{ \
	465	c = a; \
	466	goto bad_escaped; \
	467	} \
	468	c = c * 16 + a; \
	469	i ++; \
	470	} \
	471	} while (0)
	472
	473	static void
	474	skip_intraline_whitespace (SCM port)
	475	{
	476	scm_t_wchar c;
	477
	478	do
	479	{
	480	c = scm_getc (port);
	481	if (c == EOF)
	482	return;
	483	}
	484	while (c == '\t' \|\| uc_is_general_category (c, UC_SPACE_SEPARATOR));
	485
	486	scm_ungetc (c, port);
	487	}
	488
	489	static SCM
	490	scm_read_string (int chr, SCM port)
	491	#define FUNC_NAME "scm_lreadr"
	492	{
	493	/* For strings smaller than C_STR, this function creates only one Scheme
	494	object (the string returned). */
	495
	496	SCM str = SCM_EOL;
	497	size_t c_str_len = 0;
	498	scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
	499
	500	/* Need to capture line and column numbers here. */
	501	long line = SCM_LINUM (port);
	502	int column = SCM_COL (port) - 1;
	503
	504	while ('"' != (c = scm_getc (port)))
	505	{
	506	if (c == EOF)
	507	{
	508	str_eof:
	509	scm_i_input_error (FUNC_NAME, port,
	510	"end of file in string constant", SCM_EOL);
	511	}
	512
	513	if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
	514	{
	515	str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
	516	c_str_len = 0;
	517	}
	518
	519	if (c == '\\')
	520	{
	521	switch (c = scm_getc (port))
	522	{
	523	case EOF:
	524	goto str_eof;
	525	case '"':
	526	case '\\':
	527	break;
	528	case '\n':
	529	if (SCM_HUNGRY_EOL_ESCAPES_P)
	530	skip_intraline_whitespace (port);
	531	continue;
	532	case '0':
	533	c = '\0';
	534	break;
	535	case 'f':
	536	c = '\f';
	537	break;
	538	case 'n':
	539	c = '\n';
	540	break;
	541	case 'r':
	542	c = '\r';
	543	break;
	544	case 't':
	545	c = '\t';
	546	break;
	547	case 'a':
	548	c = '\007';
	549	break;
	550	case 'v':
	551	c = '\v';
	552	break;
	553	case 'b':
	554	c = '\010';
	555	break;
	556	case 'x':
	557	if (SCM_R6RS_ESCAPES_P)
	558	SCM_READ_HEX_ESCAPE (10, ';');
	559	else
	560	SCM_READ_HEX_ESCAPE (2, '\0');
	561	break;
	562	case 'u':
	563	if (!SCM_R6RS_ESCAPES_P)
	564	{
	565	SCM_READ_HEX_ESCAPE (4, '\0');
	566	break;
	567	}
	568	case 'U':
	569	if (!SCM_R6RS_ESCAPES_P)
	570	{
	571	SCM_READ_HEX_ESCAPE (6, '\0');
	572	break;
	573	}
	574	default:
	575	bad_escaped:
	576	scm_i_input_error (FUNC_NAME, port,
	577	"illegal character in escape sequence: ~S",
	578	scm_list_1 (SCM_MAKE_CHAR (c)));
	579	}
	580	}
	581
	582	c_str[c_str_len++] = c;
	583	}
	584
	585	if (scm_is_null (str))
	586	/* Fast path: we got a string that fits in C_STR. */
	587	str = scm_from_utf32_stringn (c_str, c_str_len);
	588	else
	589	{
	590	if (c_str_len > 0)
	591	str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
	592
	593	str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
	594	}
	595
	596	return maybe_annotate_source (str, port, line, column);
	597	}
	598	#undef FUNC_NAME
	599
	600
	601	static SCM
	602	scm_read_number (scm_t_wchar chr, SCM port)
	603	{
	604	SCM result, str = SCM_EOL;
	605	char local_buffer[READER_BUFFER_SIZE], *buffer;
	606	size_t bytes_read;
	607	scm_t_port *pt = SCM_PTAB_ENTRY (port);
	608
	609	/* Need to capture line and column numbers here. */
	610	long line = SCM_LINUM (port);
	611	int column = SCM_COL (port) - 1;
	612
	613	scm_ungetc (chr, port);
	614	buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
	615	&bytes_read);
	616
	617	str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
	618
	619	result = scm_string_to_number (str, SCM_UNDEFINED);
	620	if (scm_is_false (result))
	621	{
	622	/* Return a symbol instead of a number */
	623	if (SCM_CASE_INSENSITIVE_P)
	624	str = scm_string_downcase_x (str);
	625	result = scm_string_to_symbol (str);
	626	}
	627	else if (SCM_NIMP (result))
	628	result = maybe_annotate_source (result, port, line, column);
	629
	630	SCM_COL (port) += scm_i_string_length (str);
	631	return result;
	632	}
	633
	634	static SCM
	635	scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
	636	{
	637	SCM result;
	638	int ends_with_colon = 0;
	639	size_t bytes_read;
	640	int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
	641	char local_buffer[READER_BUFFER_SIZE], *buffer;
	642	scm_t_port *pt = SCM_PTAB_ENTRY (port);
	643	SCM str;
	644
	645	scm_ungetc (chr, port);
	646	buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
	647	&bytes_read);
	648	if (bytes_read > 0)
	649	ends_with_colon = buffer[bytes_read - 1] == ':';
	650
	651	if (postfix && ends_with_colon && (bytes_read > 1))
	652	{
	653	str = scm_from_stringn (buffer, bytes_read - 1,
	654	pt->encoding, pt->ilseq_handler);
	655
	656	if (SCM_CASE_INSENSITIVE_P)
	657	str = scm_string_downcase_x (str);
	658	result = scm_symbol_to_keyword (scm_string_to_symbol (str));
	659	}
	660	else
	661	{
	662	str = scm_from_stringn (buffer, bytes_read,
	663	pt->encoding, pt->ilseq_handler);
	664
	665	if (SCM_CASE_INSENSITIVE_P)
	666	str = scm_string_downcase_x (str);
	667	result = scm_string_to_symbol (str);
	668	}
	669
	670	SCM_COL (port) += scm_i_string_length (str);
	671	return result;
	672	}
	673
	674	static SCM
	675	scm_read_number_and_radix (scm_t_wchar chr, SCM port)
	676	#define FUNC_NAME "scm_lreadr"
	677	{
	678	SCM result;
	679	size_t read;
	680	char local_buffer[READER_BUFFER_SIZE], *buffer;
	681	unsigned int radix;
	682	SCM str;
	683	scm_t_port *pt;
	684
	685	switch (chr)
	686	{
	687	case 'B':
	688	case 'b':
	689	radix = 2;
	690	break;
	691
	692	case 'o':
	693	case 'O':
	694	radix = 8;
	695	break;
	696
	697	case 'd':
	698	case 'D':
	699	radix = 10;
	700	break;
	701
	702	case 'x':
	703	case 'X':
	704	radix = 16;
	705	break;
	706
	707	default:
	708	scm_ungetc (chr, port);
	709	scm_ungetc ('#', port);
	710	radix = 10;
	711	}
	712
	713	buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
	714	&read);
	715
	716	pt = SCM_PTAB_ENTRY (port);
	717	str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
	718
	719	result = scm_string_to_number (str, scm_from_uint (radix));
	720
	721	SCM_COL (port) += scm_i_string_length (str);
	722
	723	if (scm_is_true (result))
	724	return result;
	725
	726	scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
	727
	728	return SCM_BOOL_F;
	729	}
	730	#undef FUNC_NAME
	731
	732	static SCM
	733	scm_read_quote (int chr, SCM port)
	734	{
	735	SCM p;
	736	long line = SCM_LINUM (port);
	737	int column = SCM_COL (port) - 1;
	738
	739	switch (chr)
	740	{
	741	case '`':
	742	p = scm_sym_quasiquote;
	743	break;
	744
	745	case '\'':
	746	p = scm_sym_quote;
	747	break;
	748
	749	case ',':
	750	{
	751	scm_t_wchar c;
	752
	753	c = scm_getc (port);
	754	if ('@' == c)
	755	p = scm_sym_uq_splicing;
	756	else
	757	{
	758	scm_ungetc (c, port);
	759	p = scm_sym_unquote;
	760	}
	761	break;
	762	}
	763
	764	default:
	765	fprintf (stderr, "%s: unhandled quote character (%i)\n",
	766	"scm_read_quote", chr);
	767	abort ();
	768	}
	769
	770	p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
	771	return maybe_annotate_source (p, port, line, column);
	772	}
	773
	774	SCM_SYMBOL (sym_syntax, "syntax");
	775	SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
	776	SCM_SYMBOL (sym_unsyntax, "unsyntax");
	777	SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
	778
	779	static SCM
	780	scm_read_syntax (int chr, SCM port)
	781	{
	782	SCM p;
	783	long line = SCM_LINUM (port);
	784	int column = SCM_COL (port) - 1;
	785
	786	switch (chr)
	787	{
	788	case '`':
	789	p = sym_quasisyntax;
	790	break;
	791
	792	case '\'':
	793	p = sym_syntax;
	794	break;
	795
	796	case ',':
	797	{
	798	int c;
	799
	800	c = scm_getc (port);
	801	if ('@' == c)
	802	p = sym_unsyntax_splicing;
	803	else
	804	{
	805	scm_ungetc (c, port);
	806	p = sym_unsyntax;
	807	}
	808	break;
	809	}
	810
	811	default:
	812	fprintf (stderr, "%s: unhandled syntax character (%i)\n",
	813	"scm_read_syntax", chr);
	814	abort ();
	815	}
	816
	817	p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
	818	return maybe_annotate_source (p, port, line, column);
	819	}
	820
	821	static SCM
	822	scm_read_nil (int chr, SCM port)
	823	{
	824	SCM id = scm_read_mixed_case_symbol (chr, port);
	825
	826	if (!scm_is_eq (id, sym_nil))
	827	scm_i_input_error ("scm_read_nil", port,
	828	"unexpected input while reading #nil: ~a",
	829	scm_list_1 (id));
	830
	831	return SCM_ELISP_NIL;
	832	}
	833
	834	static SCM
	835	scm_read_semicolon_comment (int chr, SCM port)
	836	{
	837	int c;
	838
	839	/* We use the get_byte here because there is no need to get the
	840	locale correct with comment input. This presumes that newline
	841	always represents itself no matter what the encoding is. */
	842	for (c = scm_get_byte_or_eof (port);
	843	(c != EOF) && (c != '\n');
	844	c = scm_get_byte_or_eof (port));
	845
	846	return SCM_UNSPECIFIED;
	847	}
	848
	849	\f
	850	/* Sharp readers, i.e. readers called after a `#' sign has been read. */
	851
	852	static SCM
	853	scm_read_boolean (int chr, SCM port)
	854	{
	855	switch (chr)
	856	{
	857	case 't':
	858	case 'T':
	859	return SCM_BOOL_T;
	860
	861	case 'f':
	862	case 'F':
	863	return SCM_BOOL_F;
	864	}
	865
	866	return SCM_UNSPECIFIED;
	867	}
	868
	869	static SCM
	870	scm_read_character (scm_t_wchar chr, SCM port)
	871	#define FUNC_NAME "scm_lreadr"
	872	{
	873	char buffer[READER_CHAR_NAME_MAX_SIZE];
	874	SCM charname;
	875	size_t charname_len, bytes_read;
	876	scm_t_wchar cp;
	877	int overflow;
	878	scm_t_port *pt;
	879
	880	overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
	881	if (overflow)
	882	scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
	883
	884	if (bytes_read == 0)
	885	{
	886	chr = scm_getc (port);
	887	if (chr == EOF)
	888	scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
	889	"while reading character", SCM_EOL);
	890
	891	/* CHR must be a token delimiter, like a whitespace. */
	892	return (SCM_MAKE_CHAR (chr));
	893	}
	894
	895	pt = SCM_PTAB_ENTRY (port);
	896
	897	/* Simple ASCII characters can be processed immediately. Also, simple
	898	ISO-8859-1 characters can be processed immediately if the encoding for this
	899	port is ISO-8859-1. */
	900	if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 \|\| pt->encoding == NULL))
	901	{
	902	SCM_COL (port) += 1;
	903	return SCM_MAKE_CHAR (buffer[0]);
	904	}
	905
	906	/* Otherwise, convert the buffer into a proper scheme string for
	907	processing. */
	908	charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
	909	pt->ilseq_handler);
	910	charname_len = scm_i_string_length (charname);
	911	SCM_COL (port) += charname_len;
	912	cp = scm_i_string_ref (charname, 0);
	913	if (charname_len == 1)
	914	return SCM_MAKE_CHAR (cp);
	915
	916	/* Ignore dotted circles, which may be used to keep combining characters from
	917	combining with the backslash in #\charname. */
	918	if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
	919	return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
	920
	921	if (cp >= '0' && cp < '8')
	922	{
	923	/* Dirk:FIXME:: This type of character syntax is not R5RS
	924	* compliant. Further, it should be verified that the constant
	925	* does only consist of octal digits. */
	926	SCM p = scm_string_to_number (charname, scm_from_uint (8));
	927	if (SCM_I_INUMP (p))
	928	{
	929	scm_t_wchar c = scm_to_uint32 (p);
	930	if (SCM_IS_UNICODE_CHAR (c))
	931	return SCM_MAKE_CHAR (c);
	932	else
	933	scm_i_input_error (FUNC_NAME, port,
	934	"out-of-range octal character escape: ~a",
	935	scm_list_1 (charname));
	936	}
	937	}
	938
	939	if (cp == 'x' && (charname_len > 1))
	940	{
	941	SCM p;
	942
	943	/* Convert from hex, skipping the initial 'x' character in CHARNAME */
	944	p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
	945	scm_from_uint (16));
	946	if (SCM_I_INUMP (p))
	947	{
	948	scm_t_wchar c = scm_to_uint32 (p);
	949	if (SCM_IS_UNICODE_CHAR (c))
	950	return SCM_MAKE_CHAR (c);
	951	else
	952	scm_i_input_error (FUNC_NAME, port,
	953	"out-of-range hex character escape: ~a",
	954	scm_list_1 (charname));
	955	}
	956	}
	957
	958	/* The names of characters should never have non-Latin1
	959	characters. */
	960	if (scm_i_is_narrow_string (charname)
	961	\|\| scm_i_try_narrow_string (charname))
	962	{ SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
	963	charname_len);
	964	if (scm_is_true (ch))
	965	return ch;
	966	}
	967
	968	scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
	969	scm_list_1 (charname));
	970
	971	return SCM_UNSPECIFIED;
	972	}
	973	#undef FUNC_NAME
	974
	975	static SCM
	976	scm_read_keyword (int chr, SCM port)
	977	{
	978	SCM symbol;
	979
	980	/* Read the symbol that comprises the keyword. Doing this instead of
	981	invoking a specific symbol reader function allows `scm_read_keyword ()'
	982	to adapt to the delimiters currently valid of symbols.
	983
	984	XXX: This implementation allows sloppy syntaxes like `#: key'. */
	985	symbol = scm_read_expression (port);
	986	if (!scm_is_symbol (symbol))
	987	scm_i_input_error ("scm_read_keyword", port,
	988	"keyword prefix `~a' not followed by a symbol: ~s",
	989	scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
	990
	991	return (scm_symbol_to_keyword (symbol));
	992	}
	993
	994	static SCM
	995	scm_read_vector (int chr, SCM port, long line, int column)
	996	{
	997	/* Note: We call `scm_read_sexp ()' rather than READER here in order to
	998	guarantee that it's going to do what we want. After all, this is an
	999	implementation detail of `scm_read_vector ()', not a desirable
	1000	property. */
	1001	return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
	1002	port, line, column);
	1003	}
	1004
	1005	static SCM
	1006	scm_read_array (int chr, SCM port, long line, int column)
	1007	{
	1008	SCM result = scm_i_read_array (port, chr);
	1009	if (scm_is_false (result))
	1010	return result;
	1011	else
	1012	return maybe_annotate_source (result, port, line, column);
	1013	}
	1014
	1015	static SCM
	1016	scm_read_srfi4_vector (int chr, SCM port, long line, int column)
	1017	{
	1018	return scm_read_array (chr, port, line, column);
	1019	}
	1020
	1021	static SCM
	1022	scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
	1023	{
	1024	chr = scm_getc (port);
	1025	if (chr != 'u')
	1026	goto syntax;
	1027
	1028	chr = scm_getc (port);
	1029	if (chr != '8')
	1030	goto syntax;
	1031
	1032	chr = scm_getc (port);
	1033	if (chr != '(')
	1034	goto syntax;
	1035
	1036	return maybe_annotate_source
	1037	(scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
	1038	port, line, column);
	1039
	1040	syntax:
	1041	scm_i_input_error ("read_bytevector", port,
	1042	"invalid bytevector prefix",
	1043	SCM_MAKE_CHAR (chr));
	1044	return SCM_UNSPECIFIED;
	1045	}
	1046
	1047	static SCM
	1048	scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
	1049	{
	1050	/* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
	1051	terribly inefficient but who cares? */
	1052	SCM s_bits = SCM_EOL;
	1053
	1054	for (chr = scm_getc (port);
	1055	(chr != EOF) && ((chr == '0') \|\| (chr == '1'));
	1056	chr = scm_getc (port))
	1057	{
	1058	s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
	1059	}
	1060
	1061	if (chr != EOF)
	1062	scm_ungetc (chr, port);
	1063
	1064	return maybe_annotate_source
	1065	(scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
	1066	port, line, column);
	1067	}
	1068
	1069	static SCM
	1070	scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
	1071	{
	1072	int bang_seen = 0;
	1073
	1074	for (;;)
	1075	{
	1076	int c = scm_getc (port);
	1077
	1078	if (c == EOF)
	1079	scm_i_input_error ("skip_block_comment", port,
	1080	"unterminated `#! ... !#' comment", SCM_EOL);
	1081
	1082	if (c == '!')
	1083	bang_seen = 1;
	1084	else if (c == '#' && bang_seen)
	1085	break;
	1086	else
	1087	bang_seen = 0;
	1088	}
	1089
	1090	return SCM_UNSPECIFIED;
	1091	}
	1092
	1093	static SCM
	1094	scm_read_shebang (scm_t_wchar chr, SCM port)
	1095	{
	1096	int c = 0;
	1097	if ((c = scm_get_byte_or_eof (port)) != 'r')
	1098	{
	1099	scm_ungetc (c, port);
	1100	return scm_read_scsh_block_comment (chr, port);
	1101	}
	1102	if ((c = scm_get_byte_or_eof (port)) != '6')
	1103	{
	1104	scm_ungetc (c, port);
	1105	scm_ungetc ('r', port);
	1106	return scm_read_scsh_block_comment (chr, port);
	1107	}
	1108	if ((c = scm_get_byte_or_eof (port)) != 'r')
	1109	{
	1110	scm_ungetc (c, port);
	1111	scm_ungetc ('6', port);
	1112	scm_ungetc ('r', port);
	1113	return scm_read_scsh_block_comment (chr, port);
	1114	}
	1115	if ((c = scm_get_byte_or_eof (port)) != 's')
	1116	{
	1117	scm_ungetc (c, port);
	1118	scm_ungetc ('r', port);
	1119	scm_ungetc ('6', port);
	1120	scm_ungetc ('r', port);
	1121	return scm_read_scsh_block_comment (chr, port);
	1122	}
	1123
	1124	return SCM_UNSPECIFIED;
	1125	}
	1126
	1127	static SCM
	1128	scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
	1129	{
	1130	/* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
	1131	nested. So care must be taken. */
	1132	int nesting_level = 1;
	1133
	1134	int a = scm_getc (port);
	1135
	1136	if (a == EOF)
	1137	scm_i_input_error ("scm_read_r6rs_block_comment", port,
	1138	"unterminated `#\| ... \|#' comment", SCM_EOL);
	1139
	1140	while (nesting_level > 0)
	1141	{
	1142	int b = scm_getc (port);
	1143
	1144	if (b == EOF)
	1145	scm_i_input_error ("scm_read_r6rs_block_comment", port,
	1146	"unterminated `#\| ... \|#' comment", SCM_EOL);
	1147
	1148	if (a == '\|' && b == '#')
	1149	{
	1150	nesting_level--;
	1151	b = EOF;
	1152	}
	1153	else if (a == '#' && b == '\|')
	1154	{
	1155	nesting_level++;
	1156	b = EOF;
	1157	}
	1158
	1159	a = b;
	1160	}
	1161
	1162	return SCM_UNSPECIFIED;
	1163	}
	1164
	1165	static SCM
	1166	scm_read_commented_expression (scm_t_wchar chr, SCM port)
	1167	{
	1168	scm_t_wchar c;
	1169
	1170	c = flush_ws (port, (char *) NULL);
	1171	if (EOF == c)
	1172	scm_i_input_error ("read_commented_expression", port,
	1173	"no expression after #; comment", SCM_EOL);
	1174	scm_ungetc (c, port);
	1175	scm_read_expression (port);
	1176	return SCM_UNSPECIFIED;
	1177	}
	1178
	1179	static SCM
	1180	scm_read_extended_symbol (scm_t_wchar chr, SCM port)
	1181	{
	1182	/* Guile's extended symbol read syntax looks like this:
	1183
	1184	#{This is all a symbol name}#
	1185
	1186	So here, CHR is expected to be `{'. */
	1187	int saw_brace = 0;
	1188	size_t len = 0;
	1189	SCM buf = scm_i_make_string (1024, NULL, 0);
	1190
	1191	buf = scm_i_string_start_writing (buf);
	1192
	1193	while ((chr = scm_getc (port)) != EOF)
	1194	{
	1195	if (saw_brace)
	1196	{
	1197	if (chr == '#')
	1198	{
	1199	break;
	1200	}
	1201	else
	1202	{
	1203	saw_brace = 0;
	1204	scm_i_string_set_x (buf, len++, '}');
	1205	}
	1206	}
	1207
	1208	if (chr == '}')
	1209	saw_brace = 1;
	1210	else if (chr == '\\')
	1211	{
	1212	/* It used to be that print.c would print extended-read-syntax
	1213	symbols with backslashes before "non-standard" chars, but
	1214	this routine wouldn't do anything with those escapes.
	1215	Bummer. What we've done is to change print.c to output
	1216	R6RS hex escapes for those characters, relying on the fact
	1217	that the extended read syntax would never put a `\' before
	1218	an `x'. For now, we just ignore other instances of
	1219	backslash in the string. */
	1220	switch ((chr = scm_getc (port)))
	1221	{
	1222	case EOF:
	1223	goto done;
	1224	case 'x':
	1225	{
	1226	scm_t_wchar c;
	1227
	1228	SCM_READ_HEX_ESCAPE (10, ';');
	1229	scm_i_string_set_x (buf, len++, c);
	1230	break;
	1231
	1232	str_eof:
	1233	chr = EOF;
	1234	goto done;
	1235
	1236	bad_escaped:
	1237	scm_i_string_stop_writing ();
	1238	scm_i_input_error ("scm_read_extended_symbol", port,
	1239	"illegal character in escape sequence: ~S",
	1240	scm_list_1 (SCM_MAKE_CHAR (c)));
	1241	break;
	1242	}
	1243	default:
	1244	scm_i_string_set_x (buf, len++, chr);
	1245	break;
	1246	}
	1247	}
	1248	else
	1249	scm_i_string_set_x (buf, len++, chr);
	1250
	1251	if (len >= scm_i_string_length (buf) - 2)
	1252	{
	1253	SCM addy;
	1254
	1255	scm_i_string_stop_writing ();
	1256	addy = scm_i_make_string (1024, NULL, 0);
	1257	buf = scm_string_append (scm_list_2 (buf, addy));
	1258	len = 0;
	1259	buf = scm_i_string_start_writing (buf);
	1260	}
	1261	}
	1262
	1263	done:
	1264	scm_i_string_stop_writing ();
	1265	if (chr == EOF)
	1266	scm_i_input_error ("scm_read_extended_symbol", port,
	1267	"end of file while reading symbol", SCM_EOL);
	1268
	1269	return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
	1270	}
	1271
	1272
	1273	\f
	1274	/* Top-level token readers, i.e., dispatchers. */
	1275
	1276	static SCM
	1277	scm_read_sharp_extension (int chr, SCM port)
	1278	{
	1279	SCM proc;
	1280
	1281	proc = scm_get_hash_procedure (chr);
	1282	if (scm_is_true (scm_procedure_p (proc)))
	1283	{
	1284	long line = SCM_LINUM (port);
	1285	int column = SCM_COL (port) - 2;
	1286	SCM got;
	1287
	1288	got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
	1289
	1290	if (scm_is_pair (got) && !scm_i_has_source_properties (got))
	1291	scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
	1292
	1293	return got;
	1294	}
	1295
	1296	return SCM_UNSPECIFIED;
	1297	}
	1298
	1299	/* The reader for the sharp `#' character. It basically dispatches reads
	1300	among the above token readers. */
	1301	static SCM
	1302	scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
	1303	#define FUNC_NAME "scm_lreadr"
	1304	{
	1305	SCM result;
	1306
	1307	chr = scm_getc (port);
	1308
	1309	result = scm_read_sharp_extension (chr, port);
	1310	if (!scm_is_eq (result, SCM_UNSPECIFIED))
	1311	return result;
	1312
	1313	switch (chr)
	1314	{
	1315	case '\\':
	1316	return (scm_read_character (chr, port));
	1317	case '(':
	1318	return (scm_read_vector (chr, port, line, column));
	1319	case 's':
	1320	case 'u':
	1321	case 'f':
	1322	case 'c':
	1323	/* This one may return either a boolean or an SRFI-4 vector. */
	1324	return (scm_read_srfi4_vector (chr, port, line, column));
	1325	case 'v':
	1326	return (scm_read_bytevector (chr, port, line, column));
	1327	case '*':
	1328	return (scm_read_guile_bit_vector (chr, port, line, column));
	1329	case 't':
	1330	case 'T':
	1331	case 'F':
	1332	return (scm_read_boolean (chr, port));
	1333	case ':':
	1334	return (scm_read_keyword (chr, port));
	1335	case '0': case '1': case '2': case '3': case '4':
	1336	case '5': case '6': case '7': case '8': case '9':
	1337	case '@':
	1338	#if SCM_ENABLE_DEPRECATED
	1339	/* See below for 'i' and 'e'. */
	1340	case 'a':
	1341	case 'y':
	1342	case 'h':
	1343	case 'l':
	1344	#endif
	1345	return (scm_read_array (chr, port, line, column));
	1346
	1347	case 'i':
	1348	case 'e':
	1349	#if SCM_ENABLE_DEPRECATED
	1350	{
	1351	/* When next char is '(', it really is an old-style
	1352	uniform array. */
	1353	scm_t_wchar next_c = scm_getc (port);
	1354	if (next_c != EOF)
	1355	scm_ungetc (next_c, port);
	1356	if (next_c == '(')
	1357	return scm_read_array (chr, port, line, column);
	1358	/* Fall through. */
	1359	}
	1360	#endif
	1361	case 'b':
	1362	case 'B':
	1363	case 'o':
	1364	case 'O':
	1365	case 'd':
	1366	case 'D':
	1367	case 'x':
	1368	case 'X':
	1369	case 'I':
	1370	case 'E':
	1371	return (scm_read_number_and_radix (chr, port));
	1372	case '{':
	1373	return (scm_read_extended_symbol (chr, port));
	1374	case '!':
	1375	return (scm_read_shebang (chr, port));
	1376	case ';':
	1377	return (scm_read_commented_expression (chr, port));
	1378	case '`':
	1379	case '\'':
	1380	case ',':
	1381	return (scm_read_syntax (chr, port));
	1382	case 'n':
	1383	return (scm_read_nil (chr, port));
	1384	default:
	1385	result = scm_read_sharp_extension (chr, port);
	1386	if (scm_is_eq (result, SCM_UNSPECIFIED))
	1387	{
	1388	/* To remain compatible with 1.8 and earlier, the following
	1389	characters have lower precedence than `read-hash-extend'
	1390	characters. */
	1391	switch (chr)
	1392	{
	1393	case '\|':
	1394	return scm_read_r6rs_block_comment (chr, port);
	1395	default:
	1396	scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
	1397	scm_list_1 (SCM_MAKE_CHAR (chr)));
	1398	}
	1399	}
	1400	else
	1401	return result;
	1402	}
	1403
	1404	return SCM_UNSPECIFIED;
	1405	}
	1406	#undef FUNC_NAME
	1407
	1408	static SCM
	1409	scm_read_expression (SCM port)
	1410	#define FUNC_NAME "scm_read_expression"
	1411	{
	1412	while (1)
	1413	{
	1414	scm_t_wchar chr;
	1415
	1416	chr = scm_getc (port);
	1417
	1418	switch (chr)
	1419	{
	1420	case SCM_WHITE_SPACES:
	1421	case SCM_LINE_INCREMENTORS:
	1422	break;
	1423	case ';':
	1424	(void) scm_read_semicolon_comment (chr, port);
	1425	break;
	1426	case '[':
	1427	if (!SCM_SQUARE_BRACKETS_P)
	1428	return (scm_read_mixed_case_symbol (chr, port));
	1429	/* otherwise fall through */
	1430	case '(':
	1431	return (scm_read_sexp (chr, port));
	1432	case '"':
	1433	return (scm_read_string (chr, port));
	1434	case '\'':
	1435	case '`':
	1436	case ',':
	1437	return (scm_read_quote (chr, port));
	1438	case '#':
	1439	{
	1440	long line = SCM_LINUM (port);
	1441	int column = SCM_COL (port) - 1;
	1442	SCM result = scm_read_sharp (chr, port, line, column);
	1443	if (scm_is_eq (result, SCM_UNSPECIFIED))
	1444	/* We read a comment or some such. */
	1445	break;
	1446	else
	1447	return result;
	1448	}
	1449	case ')':
	1450	scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
	1451	break;
	1452	case ']':
	1453	if (SCM_SQUARE_BRACKETS_P)
	1454	scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
	1455	/* otherwise fall through */
	1456	case EOF:
	1457	return SCM_EOF_VAL;
	1458	case ':':
	1459	if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
	1460	return scm_symbol_to_keyword (scm_read_expression (port));
	1461	/* Fall through. */
	1462
	1463	default:
	1464	{
	1465	if (((chr >= '0') && (chr <= '9'))
	1466	\|\| (strchr ("+-.", chr)))
	1467	return (scm_read_number (chr, port));
	1468	else
	1469	return (scm_read_mixed_case_symbol (chr, port));
	1470	}
	1471	}
	1472	}
	1473	}
	1474	#undef FUNC_NAME
	1475
	1476	\f
	1477	/* Actual reader. */
	1478
	1479	SCM_DEFINE (scm_read, "read", 0, 1, 0,
	1480	(SCM port),
	1481	"Read an s-expression from the input port @var{port}, or from\n"
	1482	"the current input port if @var{port} is not specified.\n"
	1483	"Any whitespace before the next token is discarded.")
	1484	#define FUNC_NAME s_scm_read
	1485	{
	1486	int c;
	1487
	1488	if (SCM_UNBNDP (port))
	1489	port = scm_current_input_port ();
	1490	SCM_VALIDATE_OPINPORT (1, port);
	1491
	1492	c = flush_ws (port, (char *) NULL);
	1493	if (EOF == c)
	1494	return SCM_EOF_VAL;
	1495	scm_ungetc (c, port);
	1496
	1497	return (scm_read_expression (port));
	1498	}
	1499	#undef FUNC_NAME
	1500
	1501
	1502	\f
	1503
	1504	/* Manipulate the read-hash-procedures alist. This could be written in
	1505	Scheme, but maybe it will also be used by C code during initialisation. */
	1506	SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
	1507	(SCM chr, SCM proc),
	1508	"Install the procedure @var{proc} for reading expressions\n"
	1509	"starting with the character sequence @code{#} and @var{chr}.\n"
	1510	"@var{proc} will be called with two arguments: the character\n"
	1511	"@var{chr} and the port to read further data from. The object\n"
	1512	"returned will be the return value of @code{read}. \n"
	1513	"Passing @code{#f} for @var{proc} will remove a previous setting. \n"
	1514	)
	1515	#define FUNC_NAME s_scm_read_hash_extend
	1516	{
	1517	SCM this;
	1518	SCM prev;
	1519
	1520	SCM_VALIDATE_CHAR (1, chr);
	1521	SCM_ASSERT (scm_is_false (proc)
	1522	\|\| scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
	1523	proc, SCM_ARG2, FUNC_NAME);
	1524
	1525	/* Check if chr is already in the alist. */
	1526	this = scm_i_read_hash_procedures_ref ();
	1527	prev = SCM_BOOL_F;
	1528	while (1)
	1529	{
	1530	if (scm_is_null (this))
	1531	{
	1532	/* not found, so add it to the beginning. */
	1533	if (scm_is_true (proc))
	1534	{
	1535	SCM new = scm_cons (scm_cons (chr, proc),
	1536	scm_i_read_hash_procedures_ref ());
	1537	scm_i_read_hash_procedures_set_x (new);
	1538	}
	1539	break;
	1540	}
	1541	if (scm_is_eq (chr, SCM_CAAR (this)))
	1542	{
	1543	/* already in the alist. */
	1544	if (scm_is_false (proc))
	1545	{
	1546	/* remove it. */
	1547	if (scm_is_false (prev))
	1548	{
	1549	SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
	1550	scm_i_read_hash_procedures_set_x (rest);
	1551	}
	1552	else
	1553	scm_set_cdr_x (prev, SCM_CDR (this));
	1554	}
	1555	else
	1556	{
	1557	/* replace it. */
	1558	scm_set_cdr_x (SCM_CAR (this), proc);
	1559	}
	1560	break;
	1561	}
	1562	prev = this;
	1563	this = SCM_CDR (this);
	1564	}
	1565
	1566	return SCM_UNSPECIFIED;
	1567	}
	1568	#undef FUNC_NAME
	1569
	1570	/* Recover the read-hash procedure corresponding to char c. */
	1571	static SCM
	1572	scm_get_hash_procedure (int c)
	1573	{
	1574	SCM rest = scm_i_read_hash_procedures_ref ();
	1575
	1576	while (1)
	1577	{
	1578	if (scm_is_null (rest))
	1579	return SCM_BOOL_F;
	1580
	1581	if (SCM_CHAR (SCM_CAAR (rest)) == c)
	1582	return SCM_CDAR (rest);
	1583
	1584	rest = SCM_CDR (rest);
	1585	}
	1586	}
	1587
	1588	#define SCM_ENCODING_SEARCH_SIZE (500)
	1589
	1590	/* Search the first few hundred characters of a file for an Emacs-like coding
	1591	declaration. Returns either NULL or a string whose storage has been
	1592	allocated with `scm_gc_malloc ()'. */
	1593	char *
	1594	scm_i_scan_for_encoding (SCM port)
	1595	{
	1596	scm_t_port *pt;
	1597	char header[SCM_ENCODING_SEARCH_SIZE+1];
	1598	size_t bytes_read, encoding_length, i;
	1599	char *encoding = NULL;
	1600	int utf8_bom = 0;
	1601	char pos, encoding_start;
	1602	int in_comment;
	1603
	1604	pt = SCM_PTAB_ENTRY (port);
	1605
	1606	if (pt->rw_active == SCM_PORT_WRITE)
	1607	scm_flush (port);
	1608
	1609	if (pt->rw_random)
	1610	pt->rw_active = SCM_PORT_READ;
	1611
	1612	if (pt->read_pos == pt->read_end)
	1613	{
	1614	/* We can use the read buffer, and thus avoid a seek. */
	1615	if (scm_fill_input (port) == EOF)
	1616	return NULL;
	1617
	1618	bytes_read = pt->read_end - pt->read_pos;
	1619	if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
	1620	bytes_read = SCM_ENCODING_SEARCH_SIZE;
	1621
	1622	if (bytes_read <= 1)
	1623	/* An unbuffered port -- don't scan. */
	1624	return NULL;
	1625
	1626	memcpy (header, pt->read_pos, bytes_read);
	1627	header[bytes_read] = '\0';
	1628	}
	1629	else
	1630	{
	1631	/* Try to read some bytes and then seek back. Not all ports
	1632	support seeking back; and indeed some file ports (like
	1633	/dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
	1634	check performed by SCM_FPORT_FDES---but fail to seek
	1635	backwards. Hence this block comes second. We prefer to use
	1636	the read buffer in-place. */
	1637	if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
	1638	return NULL;
	1639
	1640	bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
	1641	header[bytes_read] = '\0';
	1642	scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
	1643	}
	1644
	1645	if (bytes_read > 3
	1646	&& header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
	1647	utf8_bom = 1;
	1648
	1649	/* search past "coding[:=]" */
	1650	pos = header;
	1651	while (1)
	1652	{
	1653	if ((pos = strstr(pos, "coding")) == NULL)
	1654	return NULL;
	1655
	1656	pos += strlen("coding");
	1657	if (pos - header >= SCM_ENCODING_SEARCH_SIZE \|\|
	1658	(pos == ':' \|\| pos == '='))
	1659	{
	1660	pos ++;
	1661	break;
	1662	}
	1663	}
	1664
	1665	/* skip spaces */
	1666	while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
	1667	(pos == ' ' \|\| pos == '\t'))
	1668	pos ++;
	1669
	1670	/* grab the next token */
	1671	encoding_start = pos;
	1672	i = 0;
	1673	while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
	1674	&& encoding_start + i - header < bytes_read
	1675	&& (isalnum ((int) encoding_start[i])
	1676	\|\| strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
	1677	i++;
	1678
	1679	encoding_length = i;
	1680	if (encoding_length == 0)
	1681	return NULL;
	1682
	1683	encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
	1684	for (i = 0; i < encoding_length; i++)
	1685	encoding[i] = toupper ((int) encoding[i]);
	1686
	1687	/* push backwards to make sure we were in a comment */
	1688	in_comment = 0;
	1689	pos = encoding_start;
	1690	while (pos >= header)
	1691	{
	1692	if (*pos == ';')
	1693	{
	1694	in_comment = 1;
	1695	break;
	1696	}
	1697	else if (*pos == '\n' \|\| pos == header)
	1698	{
	1699	/* This wasn't in a semicolon comment. Check for a
	1700	hash-bang comment. */
	1701	char *beg = strstr (header, "#!");
	1702	char *end = strstr (header, "!#");
	1703	if (beg < encoding_start && encoding_start + encoding_length <= end)
	1704	in_comment = 1;
	1705	break;
	1706	}
	1707	else
	1708	{
	1709	pos --;
	1710	continue;
	1711	}
	1712	}
	1713	if (!in_comment)
	1714	/* This wasn't in a comment */
	1715	return NULL;
	1716
	1717	if (utf8_bom && strcmp(encoding, "UTF-8"))
	1718	scm_misc_error (NULL,
	1719	"the port input declares the encoding ~s but is encoded as UTF-8",
	1720	scm_list_1 (scm_from_locale_string (encoding)));
	1721
	1722	return encoding;
	1723	}
	1724
	1725	SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
	1726	(SCM port),
	1727	"Scans the port for an Emacs-like character coding declaration\n"
	1728	"near the top of the contents of a port with random-accessible contents.\n"
	1729	"The coding declaration is of the form\n"
	1730	"@code{coding: XXXXX} and must appear in a scheme comment.\n"
	1731	"\n"
	1732	"Returns a string containing the character encoding of the file\n"
	1733	"if a declaration was found, or @code{#f} otherwise.\n")
	1734	#define FUNC_NAME s_scm_file_encoding
	1735	{
	1736	char *enc;
	1737	SCM s_enc;
	1738
	1739	SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
	1740
	1741	enc = scm_i_scan_for_encoding (port);
	1742	if (enc == NULL)
	1743	return SCM_BOOL_F;
	1744	else
	1745	{
	1746	s_enc = scm_from_locale_string (enc);
	1747	return s_enc;
	1748	}
	1749
	1750	return SCM_BOOL_F;
	1751	}
	1752	#undef FUNC_NAME
	1753
	1754	void
	1755	scm_init_read ()
	1756	{
	1757	SCM read_hash_procs;
	1758
	1759	read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
	1760
	1761	scm_i_read_hash_procedures =
	1762	SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
	1763
	1764	scm_init_opts (scm_read_options, scm_read_opts);
	1765	#include "libguile/read.x"
	1766	}
	1767
	1768	/*
	1769	Local Variables:
	1770	c-file-style: "gnu"
	1771	End:
	1772	*/