read: Avoid signed integer overflow in 'read_decimal_integer'.
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37 1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
419c8736 2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
96965a6e 33#include <c-strcase.h>
7337d56d 34
a0599745 35#include "libguile/_scm.h"
0ba0b384 36#include "libguile/bytevectors.h"
a0599745
MD
37#include "libguile/chars.h"
38#include "libguile/eval.h"
2fa901a5 39#include "libguile/arrays.h"
cf396142 40#include "libguile/bitvectors.h"
a0599745
MD
41#include "libguile/keywords.h"
42#include "libguile/alist.h"
43#include "libguile/srcprop.h"
44#include "libguile/hashtab.h"
45#include "libguile/hash.h"
46#include "libguile/ports.h"
49bb5bd3 47#include "libguile/fports.h"
a0599745
MD
48#include "libguile/root.h"
49#include "libguile/strings.h"
ba1b2226 50#include "libguile/strports.h"
a0599745 51#include "libguile/vectors.h"
a0599745 52#include "libguile/validate.h"
a4022e69 53#include "libguile/srfi-4.h"
7337d56d 54#include "libguile/srfi-13.h"
ba1b2226 55
a0599745 56#include "libguile/read.h"
22fc179a
HWN
57#include "libguile/private-options.h"
58
0f2d19dd
JB
59
60\f
61
5bf6a6f0 62SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 63SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 64SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 65SCM_SYMBOL (sym_nil, "nil");
c7733771 66
bf9eb54a
MW
67/* SRFI-105 curly infix expression support */
68SCM_SYMBOL (sym_nfx, "$nfx$");
69SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
70SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
71
72scm_t_option scm_read_opts[] =
73 {
74 { SCM_OPTION_BOOLEAN, "copy", 0,
75 "Copy source code expressions." },
76 { SCM_OPTION_BOOLEAN, "positions", 1,
77 "Record positions of source code expressions." },
78 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
79 "Convert symbols to lower case."},
80 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
81 "Style of keyword recognition: #f, 'prefix or 'postfix."},
82 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
83 "Use R6RS variable-length character and string hex escapes."},
84 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
85 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
87 "In strings, consume leading whitespace after an escaped end-of-line."},
88 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
89 "Support SRFI-105 curly infix expressions."},
90 { 0, },
91 };
b1b5433d
MW
92
93/* Internal read options structure. This is initialized by 'scm_read'
ea8c9761
MW
94 from the global and per-port read options, and a pointer is passed
95 down to all helper functions. */
96
97enum t_keyword_style
98 {
99 KEYWORD_STYLE_HASH_PREFIX,
100 KEYWORD_STYLE_PREFIX,
101 KEYWORD_STYLE_POSTFIX
102 };
b1b5433d 103
ea8c9761
MW
104struct t_read_opts
105{
b1b5433d
MW
106 enum t_keyword_style keyword_style;
107 unsigned int copy_source_p : 1;
108 unsigned int record_positions_p : 1;
109 unsigned int case_insensitive_p : 1;
110 unsigned int r6rs_escapes_p : 1;
111 unsigned int square_brackets_p : 1;
112 unsigned int hungry_eol_escapes_p : 1;
bf9eb54a
MW
113 unsigned int curly_infix_p : 1;
114 unsigned int neoteric_p : 1;
b1b5433d
MW
115};
116
117typedef struct t_read_opts scm_t_read_opts;
118
a16f6fe7 119
39e8f371
HWN
120/*
121 Give meaningful error messages for errors
122
123 We use the format
124
ba1b2226 125 FILE:LINE:COL: MESSAGE
39e8f371
HWN
126 This happened in ....
127
128 This is not standard GNU format, but the test-suite likes the real
129 message to be in front.
130
39e8f371
HWN
131 */
132
133
a4022e69
MV
134void
135scm_i_input_error (char const *function,
136 SCM port, const char *message, SCM arg)
ba1b2226 137{
29a837fd
MV
138 SCM fn = (scm_is_string (SCM_FILENAME(port))
139 ? SCM_FILENAME(port)
140 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 141
29a837fd 142 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
143 SCM string = SCM_EOL;
144 scm_simple_format (string_port,
272632a6 145 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 146 scm_list_4 (fn,
b3aa4626 147 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 148 scm_from_int (SCM_COL (port) + 1),
272632a6 149 scm_from_locale_string (message)));
ba1b2226
HWN
150
151 string = scm_get_output_string (string_port);
152 scm_close_output_port (string_port);
4a655e50 153 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 154 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 155 string,
dd72382c 156 arg,
ba1b2226
HWN
157 SCM_BOOL_F);
158}
39e8f371
HWN
159
160
a1ec6916 161SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 162 (SCM setting),
dc7fa443
MG
163 "Option interface for the read options. Instead of using\n"
164 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 165 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 166#define FUNC_NAME s_scm_read_options
a16f6fe7 167{
b7ff98dd
MD
168 SCM ans = scm_options (setting,
169 scm_read_opts,
1bbd0b84 170 FUNC_NAME);
b7ff98dd
MD
171 if (SCM_COPY_SOURCE_P)
172 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
173 return ans;
174}
1bbd0b84 175#undef FUNC_NAME
a16f6fe7 176
d458073b
AR
177/* A fluid referring to an association list mapping extra hash
178 characters to procedures. */
179static SCM *scm_i_read_hash_procedures;
deca31e1 180
cfd15439 181static SCM
d458073b
AR
182scm_i_read_hash_procedures_ref (void)
183{
184 return scm_fluid_ref (*scm_i_read_hash_procedures);
185}
186
cfd15439 187static void
d458073b
AR
188scm_i_read_hash_procedures_set_x (SCM value)
189{
190 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
191}
0f2d19dd 192
7337d56d
LC
193\f
194/* Token readers. */
0f2d19dd 195
0f2d19dd 196
7337d56d
LC
197/* Size of the C buffer used to read symbols and numbers. */
198#define READER_BUFFER_SIZE 128
0f2d19dd 199
ff4d3672
LC
200/* Number of 32-bit codepoints in the buffer used to read strings. */
201#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 202
7337d56d
LC
203/* The maximum size of Scheme character names. */
204#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 205
02327c0c
MW
206/* The maximum size of reader directive names. */
207#define READER_DIRECTIVE_NAME_MAX_SIZE 50
208
94115ae3 209
7337d56d
LC
210/* `isblank' is only in C99. */
211#define CHAR_IS_BLANK_(_chr) \
212 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 213 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
214
215#ifdef MSDOS
216# define CHAR_IS_BLANK(_chr) \
217 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
218#else
219# define CHAR_IS_BLANK CHAR_IS_BLANK_
220#endif
221
222
223/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
224 structure''). */
225#define CHAR_IS_R5RS_DELIMITER(c) \
226 (CHAR_IS_BLANK (c) \
603234c6 227 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
7337d56d 228
603234c6
MW
229#define CHAR_IS_DELIMITER(c) \
230 (CHAR_IS_R5RS_DELIMITER (c) \
bf9eb54a
MW
231 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
232 || opts->curly_infix_p)) \
233 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
7337d56d
LC
234
235/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
236 Structure''. */
237#define CHAR_IS_EXPONENT_MARKER(_chr) \
238 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
239 || ((_chr) == 'd') || ((_chr) == 'l'))
240
454866e0 241/* Read an SCSH block comment. */
cfd15439 242static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965 243static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
b1b5433d
MW
244static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
245static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
620c8965 246static SCM scm_get_hash_procedure (int);
454866e0 247
69f90b0b
MG
248/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
249 result in the pre-allocated buffer BUF. Return zero if the whole token has
250 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
251 bytes actually read. */
cfd15439 252static int
b1b5433d
MW
253read_token (SCM port, scm_t_read_opts *opts,
254 char *buf, size_t buf_size, size_t *read)
b662b7e9 255{
69f90b0b 256 *read = 0;
0520c320 257
69f90b0b
MG
258 while (*read < buf_size)
259 {
260 int chr;
7337d56d 261
69f90b0b 262 chr = scm_get_byte_or_eof (port);
889975e5 263
69f90b0b
MG
264 if (chr == EOF)
265 return 0;
266 else if (CHAR_IS_DELIMITER (chr))
267 {
268 scm_unget_byte (chr, port);
269 return 0;
270 }
271 else
272 {
273 *buf = (char) chr;
274 buf++, (*read)++;
275 }
276 }
889975e5 277
69f90b0b
MG
278 return 1;
279 }
889975e5 280
b662b7e9
LC
281/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
282 if the token doesn't fit in BUFFER_SIZE bytes. */
283static char *
b1b5433d
MW
284read_complete_token (SCM port, scm_t_read_opts *opts,
285 char *buffer, size_t buffer_size, size_t *read)
69f90b0b
MG
286{
287 int overflow = 0;
b662b7e9
LC
288 size_t bytes_read, overflow_size = 0;
289 char *overflow_buffer = NULL;
1cc91f1b 290
69f90b0b
MG
291 do
292 {
b1b5433d 293 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
69f90b0b
MG
294 if (bytes_read == 0)
295 break;
296 if (overflow || overflow_size != 0)
297 {
298 if (overflow_size == 0)
299 {
b662b7e9
LC
300 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
301 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
302 overflow_size = bytes_read;
303 }
304 else
305 {
7be3c2fc 306 char *new_buf =
b662b7e9
LC
307 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
308
309 memcpy (new_buf, overflow_buffer, overflow_size);
310 memcpy (new_buf + overflow_size, buffer, bytes_read);
311
312 overflow_buffer = new_buf;
69f90b0b
MG
313 overflow_size += bytes_read;
314 }
315 }
889975e5 316 }
69f90b0b 317 while (overflow);
889975e5 318
69f90b0b
MG
319 if (overflow_size)
320 *read = overflow_size;
b521d265 321 else
69f90b0b
MG
322 *read = bytes_read;
323
b662b7e9 324 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 325}
7337d56d
LC
326
327/* Skip whitespace from PORT and return the first non-whitespace character
328 read. Raise an error on end-of-file. */
329static int
b1b5433d 330flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
0f2d19dd 331{
cfd15439 332 scm_t_wchar c;
0f2d19dd 333 while (1)
b7f3516f 334 switch (c = scm_getc (port))
0f2d19dd
JB
335 {
336 case EOF:
337 goteof:
338 if (eoferr)
d156d3b7 339 {
a4022e69
MV
340 scm_i_input_error (eoferr,
341 port,
342 "end of file",
343 SCM_EOL);
d156d3b7 344 }
0f2d19dd 345 return c;
7337d56d 346
0f2d19dd
JB
347 case ';':
348 lp:
b7f3516f 349 switch (c = scm_getc (port))
0f2d19dd
JB
350 {
351 case EOF:
352 goto goteof;
353 default:
354 goto lp;
355 case SCM_LINE_INCREMENTORS:
356 break;
357 }
358 break;
7337d56d 359
454866e0
LC
360 case '#':
361 switch (c = scm_getc (port))
362 {
363 case EOF:
364 eoferr = "read_sharp";
365 goto goteof;
366 case '!':
b1b5433d 367 scm_read_shebang (c, port, opts);
454866e0 368 break;
34f3d47d 369 case ';':
b1b5433d 370 scm_read_commented_expression (c, port, opts);
34f3d47d 371 break;
620c8965
LC
372 case '|':
373 if (scm_is_false (scm_get_hash_procedure (c)))
374 {
375 scm_read_r6rs_block_comment (c, port);
376 break;
377 }
378 /* fall through */
454866e0
LC
379 default:
380 scm_ungetc (c, port);
381 return '#';
382 }
383 break;
384
0f2d19dd 385 case SCM_LINE_INCREMENTORS:
0f2d19dd 386 case SCM_SINGLE_SPACES:
0f2d19dd 387 case '\t':
0f2d19dd 388 break;
7337d56d 389
0f2d19dd
JB
390 default:
391 return c;
392 }
7337d56d
LC
393
394 return 0;
0f2d19dd
JB
395}
396
397
7337d56d
LC
398\f
399/* Token readers. */
1cc91f1b 400
b1b5433d
MW
401static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
402static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
403 long line, int column);
0f2d19dd
JB
404
405
d6cb0203 406static SCM
b1b5433d
MW
407maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
408 long line, int column)
d6cb0203 409{
b1b5433d 410 if (opts->record_positions_p)
d6cb0203
MW
411 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
412 return x;
413}
414
09a4f039 415static SCM
b1b5433d 416scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 417#define FUNC_NAME "scm_i_lreadparen"
09a4f039 418{
26c8cc14
AW
419 int c;
420 SCM tmp, tl, ans = SCM_EOL;
bf9eb54a
MW
421 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
422 const int terminating_char = ((chr == '{') ? '}'
423 : ((chr == '[') ? ']'
424 : ')'));
7337d56d
LC
425
426 /* Need to capture line and column numbers here. */
427 long line = SCM_LINUM (port);
428 int column = SCM_COL (port) - 1;
f9c68a47 429
b1b5433d 430 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
431 if (terminating_char == c)
432 return SCM_EOL;
f9c68a47 433
7337d56d 434 scm_ungetc (c, port);
b1b5433d 435 tmp = scm_read_expression (port, opts);
1f7945a7
AW
436
437 /* Note that it is possible for scm_read_expression to return
438 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
439 check that it's a real dot by checking `c'. */
440 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d 441 {
b1b5433d
MW
442 ans = scm_read_expression (port, opts);
443 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
7337d56d
LC
444 scm_i_input_error (FUNC_NAME, port, "missing close paren",
445 SCM_EOL);
446 return ans;
447 }
1cc91f1b 448
7337d56d
LC
449 /* Build the head of the list structure. */
450 ans = tl = scm_cons (tmp, SCM_EOL);
451
b1b5433d 452 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
0f2d19dd 453 {
7337d56d 454 SCM new_tail;
0f2d19dd 455
bf9eb54a
MW
456 if (c == ')' || (c == ']' && opts->square_brackets_p)
457 || ((c == '}' || c == ']') && opts->curly_infix_p))
5b69315e
AW
458 scm_i_input_error (FUNC_NAME, port,
459 "in pair: mismatched close paren: ~A",
460 scm_list_1 (SCM_MAKE_CHAR (c)));
461
7337d56d 462 scm_ungetc (c, port);
b1b5433d 463 tmp = scm_read_expression (port, opts);
5b69315e 464
1f7945a7
AW
465 /* See above note about scm_sym_dot. */
466 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 467 {
b1b5433d 468 SCM_SETCDR (tl, scm_read_expression (port, opts));
7337d56d 469
b1b5433d 470 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
471 if (terminating_char != c)
472 scm_i_input_error (FUNC_NAME, port,
473 "in pair: missing close paren", SCM_EOL);
bf9eb54a 474 break;
0f2d19dd 475 }
b858464a 476
7337d56d
LC
477 new_tail = scm_cons (tmp, SCM_EOL);
478 SCM_SETCDR (tl, new_tail);
479 tl = new_tail;
7337d56d 480 }
0f2d19dd 481
bf9eb54a
MW
482 if (curly_list_p)
483 {
484 /* In addition to finding the length, 'scm_ilength' checks for
485 improper or circular lists, in which case it returns -1. */
486 int len = scm_ilength (ans);
487
488 /* The (len == 0) case is handled above */
489 if (len == 1)
490 /* Return directly to avoid re-annotating the element's source
491 location with the position of the outer brace. Also, it
492 might not be possible to annotate the element. */
493 return scm_car (ans); /* {e} => e */
494 else if (len == 2)
495 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
496 else if (len >= 3 && (len & 1))
497 {
498 /* It's a proper list whose length is odd and at least 3. If
499 the elements at odd indices (the infix operator positions)
500 are all 'equal?', then it's a simple curly-infix list.
501 Otherwise it's a mixed curly-infix list. */
502 SCM op = scm_cadr (ans);
503
504 /* Check to see if the elements at odd indices are 'equal?' */
505 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
506 {
507 if (scm_is_null (tl))
508 {
509 /* Convert simple curly-infix list to prefix:
510 {a <op> b <op> ...} => (<op> a b ...) */
511 tl = ans;
512 while (scm_is_pair (scm_cdr (tl)))
513 {
514 tmp = scm_cddr (tl);
515 SCM_SETCDR (tl, tmp);
516 tl = tmp;
517 }
518 ans = scm_cons (op, ans);
519 break;
520 }
521 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
522 {
523 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
524 ans = scm_cons (sym_nfx, ans);
525 break;
526 }
527 }
528 }
529 else
530 /* Mixed curly-infix (possibly improper) list:
531 {e . tail} => ($nfx$ e . tail) */
532 ans = scm_cons (sym_nfx, ans);
533 }
534
b1b5433d 535 return maybe_annotate_source (ans, port, opts, line, column);
7337d56d
LC
536}
537#undef FUNC_NAME
a4022e69 538
c5661d28
MG
539
540/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
541 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
542 found. */
543#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
544 do \
545 { \
546 scm_t_wchar a; \
547 size_t i = 0; \
548 c = 0; \
549 while (i < ndigits) \
550 { \
551 a = scm_getc (port); \
552 if (a == EOF) \
553 goto str_eof; \
554 if (terminator \
555 && (a == (scm_t_wchar) terminator) \
556 && (i > 0)) \
557 break; \
558 if ('0' <= a && a <= '9') \
559 a -= '0'; \
560 else if ('A' <= a && a <= 'F') \
561 a = a - 'A' + 10; \
562 else if ('a' <= a && a <= 'f') \
563 a = a - 'a' + 10; \
564 else \
565 { \
566 c = a; \
567 goto bad_escaped; \
568 } \
569 c = c * 16 + a; \
570 i ++; \
571 } \
c5661d28
MG
572 } while (0)
573
684d664e
AW
574static void
575skip_intraline_whitespace (SCM port)
576{
577 scm_t_wchar c;
578
579 do
580 {
581 c = scm_getc (port);
582 if (c == EOF)
583 return;
584 }
585 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
586
587 scm_ungetc (c, port);
588}
589
7337d56d 590static SCM
b1b5433d 591scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
592#define FUNC_NAME "scm_lreadr"
593{
594 /* For strings smaller than C_STR, this function creates only one Scheme
595 object (the string returned). */
0f2d19dd 596
ff4d3672
LC
597 SCM str = SCM_EOL;
598 size_t c_str_len = 0;
599 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 600
b131b233
MW
601 /* Need to capture line and column numbers here. */
602 long line = SCM_LINUM (port);
603 int column = SCM_COL (port) - 1;
604
7337d56d
LC
605 while ('"' != (c = scm_getc (port)))
606 {
607 if (c == EOF)
9c44cd45
MG
608 {
609 str_eof:
610 scm_i_input_error (FUNC_NAME, port,
611 "end of file in string constant", SCM_EOL);
612 }
0f2d19dd 613
ff4d3672
LC
614 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
615 {
616 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
617 c_str_len = 0;
618 }
0f2d19dd 619
7337d56d 620 if (c == '\\')
9c44cd45
MG
621 {
622 switch (c = scm_getc (port))
623 {
624 case EOF:
625 goto str_eof;
626 case '"':
627 case '\\':
628 break;
9c44cd45 629 case '\n':
b1b5433d 630 if (opts->hungry_eol_escapes_p)
684d664e 631 skip_intraline_whitespace (port);
9c44cd45
MG
632 continue;
633 case '0':
634 c = '\0';
635 break;
636 case 'f':
637 c = '\f';
638 break;
639 case 'n':
640 c = '\n';
641 break;
642 case 'r':
643 c = '\r';
644 break;
645 case 't':
646 c = '\t';
647 break;
648 case 'a':
649 c = '\007';
650 break;
651 case 'v':
652 c = '\v';
653 break;
67a4a16d
MG
654 case 'b':
655 c = '\010';
656 break;
9c44cd45 657 case 'x':
b1b5433d 658 if (opts->r6rs_escapes_p)
dea901d6
MG
659 SCM_READ_HEX_ESCAPE (10, ';');
660 else
661 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 662 break;
9c44cd45 663 case 'u':
b1b5433d 664 if (!opts->r6rs_escapes_p)
898a0b5a
MG
665 {
666 SCM_READ_HEX_ESCAPE (4, '\0');
667 break;
668 }
9c44cd45 669 case 'U':
b1b5433d 670 if (!opts->r6rs_escapes_p)
898a0b5a
MG
671 {
672 SCM_READ_HEX_ESCAPE (6, '\0');
673 break;
674 }
9c44cd45
MG
675 default:
676 bad_escaped:
677 scm_i_input_error (FUNC_NAME, port,
678 "illegal character in escape sequence: ~S",
679 scm_list_1 (SCM_MAKE_CHAR (c)));
680 }
681 }
ff4d3672
LC
682
683 c_str[c_str_len++] = c;
7337d56d 684 }
ff4d3672
LC
685
686 if (scm_is_null (str))
687 /* Fast path: we got a string that fits in C_STR. */
688 str = scm_from_utf32_stringn (c_str, c_str_len);
689 else
690 {
691 if (c_str_len > 0)
692 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
693
694 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
695 }
696
b1b5433d 697 return maybe_annotate_source (str, port, opts, line, column);
0f2d19dd 698}
db4b4ca6
DH
699#undef FUNC_NAME
700
0f2d19dd 701
7337d56d 702static SCM
b1b5433d 703scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
0f2d19dd 704{
69f90b0b 705 SCM result, str = SCM_EOL;
b662b7e9 706 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 707 size_t bytes_read;
69f90b0b 708 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 709
38f19074
MW
710 /* Need to capture line and column numbers here. */
711 long line = SCM_LINUM (port);
712 int column = SCM_COL (port) - 1;
713
7337d56d 714 scm_ungetc (chr, port);
b1b5433d 715 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 716 &bytes_read);
69f90b0b 717
b662b7e9 718 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
719
720 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 721 if (scm_is_false (result))
69f90b0b
MG
722 {
723 /* Return a symbol instead of a number */
b1b5433d 724 if (opts->case_insensitive_p)
69f90b0b
MG
725 str = scm_string_downcase_x (str);
726 result = scm_string_to_symbol (str);
727 }
38f19074 728 else if (SCM_NIMP (result))
b1b5433d 729 result = maybe_annotate_source (result, port, opts, line, column);
0f2d19dd 730
69f90b0b 731 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
732 return result;
733}
0f2d19dd 734
7337d56d 735static SCM
b1b5433d 736scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 737{
889975e5
MG
738 SCM result;
739 int ends_with_colon = 0;
69f90b0b 740 size_t bytes_read;
b1b5433d 741 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
b662b7e9 742 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b
MG
743 scm_t_port *pt = SCM_PTAB_ENTRY (port);
744 SCM str;
7337d56d
LC
745
746 scm_ungetc (chr, port);
b1b5433d 747 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 748 &bytes_read);
69f90b0b 749 if (bytes_read > 0)
b662b7e9 750 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 751
69f90b0b
MG
752 if (postfix && ends_with_colon && (bytes_read > 1))
753 {
b662b7e9
LC
754 str = scm_from_stringn (buffer, bytes_read - 1,
755 pt->encoding, pt->ilseq_handler);
69f90b0b 756
b1b5433d 757 if (opts->case_insensitive_p)
69f90b0b
MG
758 str = scm_string_downcase_x (str);
759 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
760 }
7337d56d 761 else
69f90b0b 762 {
b662b7e9
LC
763 str = scm_from_stringn (buffer, bytes_read,
764 pt->encoding, pt->ilseq_handler);
69f90b0b 765
b1b5433d 766 if (opts->case_insensitive_p)
69f90b0b
MG
767 str = scm_string_downcase_x (str);
768 result = scm_string_to_symbol (str);
769 }
7337d56d 770
69f90b0b 771 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
772 return result;
773}
774
775static SCM
b1b5433d 776scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
777#define FUNC_NAME "scm_lreadr"
778{
889975e5 779 SCM result;
7337d56d 780 size_t read;
b662b7e9 781 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 782 unsigned int radix;
69f90b0b
MG
783 SCM str;
784 scm_t_port *pt;
7337d56d
LC
785
786 switch (chr)
787 {
788 case 'B':
789 case 'b':
790 radix = 2;
791 break;
792
793 case 'o':
794 case 'O':
795 radix = 8;
796 break;
797
798 case 'd':
799 case 'D':
800 radix = 10;
801 break;
802
803 case 'x':
804 case 'X':
805 radix = 16;
806 break;
807
808 default:
809 scm_ungetc (chr, port);
810 scm_ungetc ('#', port);
811 radix = 10;
812 }
813
b1b5433d 814 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 815 &read);
69f90b0b
MG
816
817 pt = SCM_PTAB_ENTRY (port);
b662b7e9 818 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
819
820 result = scm_string_to_number (str, scm_from_uint (radix));
821
69f90b0b 822 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
823
824 if (scm_is_true (result))
825 return result;
826
827 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
828
829 return SCM_BOOL_F;
830}
831#undef FUNC_NAME
832
833static SCM
b1b5433d 834scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
835{
836 SCM p;
492faee1
LC
837 long line = SCM_LINUM (port);
838 int column = SCM_COL (port) - 1;
7337d56d
LC
839
840 switch (chr)
841 {
842 case '`':
843 p = scm_sym_quasiquote;
844 break;
845
846 case '\'':
847 p = scm_sym_quote;
848 break;
849
850 case ',':
851 {
889975e5 852 scm_t_wchar c;
7337d56d
LC
853
854 c = scm_getc (port);
855 if ('@' == c)
856 p = scm_sym_uq_splicing;
857 else
0f2d19dd 858 {
7337d56d
LC
859 scm_ungetc (c, port);
860 p = scm_sym_unquote;
0f2d19dd 861 }
7337d56d
LC
862 break;
863 }
0f2d19dd 864
7337d56d
LC
865 default:
866 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 867 "scm_read_quote", chr);
7337d56d 868 abort ();
0f2d19dd 869 }
1cc91f1b 870
b1b5433d
MW
871 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
872 return maybe_annotate_source (p, port, opts, line, column);
7337d56d
LC
873}
874
34f3d47d
AW
875SCM_SYMBOL (sym_syntax, "syntax");
876SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
877SCM_SYMBOL (sym_unsyntax, "unsyntax");
878SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
879
880static SCM
b1b5433d 881scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
34f3d47d
AW
882{
883 SCM p;
884 long line = SCM_LINUM (port);
885 int column = SCM_COL (port) - 1;
886
887 switch (chr)
888 {
889 case '`':
890 p = sym_quasisyntax;
891 break;
892
893 case '\'':
894 p = sym_syntax;
895 break;
896
897 case ',':
898 {
899 int c;
900
901 c = scm_getc (port);
902 if ('@' == c)
903 p = sym_unsyntax_splicing;
904 else
905 {
906 scm_ungetc (c, port);
907 p = sym_unsyntax;
908 }
909 break;
910 }
911
912 default:
913 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
914 "scm_read_syntax", chr);
915 abort ();
916 }
917
b1b5433d
MW
918 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
919 return maybe_annotate_source (p, port, opts, line, column);
34f3d47d
AW
920}
921
cfd15439 922static SCM
b1b5433d 923scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
7c4aad9c 924{
b1b5433d 925 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
7c4aad9c
AW
926
927 if (!scm_is_eq (id, sym_nil))
928 scm_i_input_error ("scm_read_nil", port,
929 "unexpected input while reading #nil: ~a",
930 scm_list_1 (id));
931
932 return SCM_ELISP_NIL;
933}
934
cfd15439 935static SCM
7337d56d 936scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 937{
0f2d19dd
JB
938 int c;
939
889975e5
MG
940 /* We use the get_byte here because there is no need to get the
941 locale correct with comment input. This presumes that newline
942 always represents itself no matter what the encoding is. */
943 for (c = scm_get_byte_or_eof (port);
7337d56d 944 (c != EOF) && (c != '\n');
8a8da78d 945 c = scm_get_byte_or_eof (port));
7337d56d
LC
946
947 return SCM_UNSPECIFIED;
948}
949
950\f
951/* Sharp readers, i.e. readers called after a `#' sign has been read. */
952
953static SCM
954scm_read_boolean (int chr, SCM port)
955{
956 switch (chr)
0f2d19dd 957 {
7337d56d
LC
958 case 't':
959 case 'T':
960 return SCM_BOOL_T;
961
962 case 'f':
963 case 'F':
964 return SCM_BOOL_F;
0f2d19dd 965 }
7337d56d
LC
966
967 return SCM_UNSPECIFIED;
968}
969
970static SCM
b1b5433d 971scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
972#define FUNC_NAME "scm_lreadr"
973{
69f90b0b
MG
974 char buffer[READER_CHAR_NAME_MAX_SIZE];
975 SCM charname;
976 size_t charname_len, bytes_read;
889975e5
MG
977 scm_t_wchar cp;
978 int overflow;
69f90b0b 979 scm_t_port *pt;
889975e5 980
b1b5433d
MW
981 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
982 &bytes_read);
889975e5 983 if (overflow)
c1a0ba1c 984 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 985
69f90b0b 986 if (bytes_read == 0)
0f2d19dd 987 {
7337d56d
LC
988 chr = scm_getc (port);
989 if (chr == EOF)
990 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
991 "while reading character", SCM_EOL);
992
993 /* CHR must be a token delimiter, like a whitespace. */
994 return (SCM_MAKE_CHAR (chr));
0f2d19dd 995 }
7337d56d 996
69f90b0b 997 pt = SCM_PTAB_ENTRY (port);
7337d56d 998
69f90b0b
MG
999 /* Simple ASCII characters can be processed immediately. Also, simple
1000 ISO-8859-1 characters can be processed immediately if the encoding for this
1001 port is ISO-8859-1. */
1002 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1003 {
1004 SCM_COL (port) += 1;
1005 return SCM_MAKE_CHAR (buffer[0]);
1006 }
1007
1008 /* Otherwise, convert the buffer into a proper scheme string for
1009 processing. */
1010 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1011 pt->ilseq_handler);
1012 charname_len = scm_i_string_length (charname);
1013 SCM_COL (port) += charname_len;
889975e5 1014 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
1015 if (charname_len == 1)
1016 return SCM_MAKE_CHAR (cp);
1017
1018 /* Ignore dotted circles, which may be used to keep combining characters from
1019 combining with the backslash in #\charname. */
0dcd7e61
MG
1020 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1021 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1022
889975e5 1023 if (cp >= '0' && cp < '8')
7337d56d
LC
1024 {
1025 /* Dirk:FIXME:: This type of character syntax is not R5RS
1026 * compliant. Further, it should be verified that the constant
0ffc78e3 1027 * does only consist of octal digits. */
889975e5 1028 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 1029 if (SCM_I_INUMP (p))
0ffc78e3 1030 {
e25f3727 1031 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
1032 if (SCM_IS_UNICODE_CHAR (c))
1033 return SCM_MAKE_CHAR (c);
1034 else
0f3a70cf 1035 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
1036 "out-of-range octal character escape: ~a",
1037 scm_list_1 (charname));
1038 }
7337d56d
LC
1039 }
1040
0f3a70cf 1041 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1042 {
1043 SCM p;
0f3a70cf 1044
dea901d6
MG
1045 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1046 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1047 scm_from_uint (16));
1048 if (SCM_I_INUMP (p))
1049 {
e25f3727 1050 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1051 if (SCM_IS_UNICODE_CHAR (c))
1052 return SCM_MAKE_CHAR (c);
1053 else
1054 scm_i_input_error (FUNC_NAME, port,
1055 "out-of-range hex character escape: ~a",
1056 scm_list_1 (charname));
1057 }
1058 }
1059
889975e5
MG
1060 /* The names of characters should never have non-Latin1
1061 characters. */
1062 if (scm_i_is_narrow_string (charname)
1063 || scm_i_try_narrow_string (charname))
4769c9db
AW
1064 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1065 charname_len);
1066 if (scm_is_true (ch))
1067 return ch;
1068 }
7337d56d 1069
7337d56d 1070 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1071 scm_list_1 (charname));
7337d56d
LC
1072
1073 return SCM_UNSPECIFIED;
0f2d19dd 1074}
db4b4ca6 1075#undef FUNC_NAME
0f2d19dd 1076
cfd15439 1077static SCM
b1b5433d 1078scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1079{
1080 SCM symbol;
1081
1082 /* Read the symbol that comprises the keyword. Doing this instead of
1083 invoking a specific symbol reader function allows `scm_read_keyword ()'
1084 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1085
7337d56d 1086 XXX: This implementation allows sloppy syntaxes like `#: key'. */
b1b5433d 1087 symbol = scm_read_expression (port, opts);
7337d56d 1088 if (!scm_is_symbol (symbol))
7f74cf9a 1089 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1090 "keyword prefix `~a' not followed by a symbol: ~s",
1091 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1092
1093 return (scm_symbol_to_keyword (symbol));
1094}
1095
cfd15439 1096static SCM
b1b5433d
MW
1097scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1098 long line, int column)
09a4f039 1099{
7337d56d
LC
1100 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1101 guarantee that it's going to do what we want. After all, this is an
1102 implementation detail of `scm_read_vector ()', not a desirable
1103 property. */
b1b5433d
MW
1104 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1105 port, opts, line, column);
b131b233
MW
1106}
1107
493ceb99
MW
1108/* Helper used by scm_read_array */
1109static int
1110read_decimal_integer (SCM port, int c, ssize_t *resp)
1111{
1112 ssize_t sign = 1;
1113 ssize_t res = 0;
1114 int got_it = 0;
1115
1116 if (c == '-')
1117 {
1118 sign = -1;
1119 c = scm_getc (port);
1120 }
1121
1122 while ('0' <= c && c <= '9')
1123 {
a662686a
MW
1124 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1125 scm_i_input_error ("read_decimal_integer", port,
1126 "number too large", SCM_EOL);
493ceb99
MW
1127 res = 10*res + c-'0';
1128 got_it = 1;
1129 c = scm_getc (port);
1130 }
1131
1132 if (got_it)
1133 *resp = sign * res;
1134 return c;
1135}
1136
1137/* Read an array. This function can also read vectors and uniform
1138 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1139 handled here.
1140
10744b7c 1141 C is the first character read after the '#'. */
b131b233 1142static SCM
b1b5433d 1143scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
b131b233 1144{
493ceb99
MW
1145 ssize_t rank;
1146 scm_t_wchar tag_buf[8];
1147 int tag_len;
1148
1149 SCM tag, shape = SCM_BOOL_F, elements, array;
1150
1151 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1152 the array code can not deal with zero-length dimensions yet, and
10744b7c 1153 we want to allow zero-length vectors, of course. */
493ceb99 1154 if (c == '(')
b1b5433d 1155 return scm_read_vector (c, port, opts, line, column);
493ceb99 1156
10744b7c 1157 /* Disambiguate between '#f' and uniform floating point vectors. */
493ceb99
MW
1158 if (c == 'f')
1159 {
1160 c = scm_getc (port);
1161 if (c != '3' && c != '6')
1162 {
1163 if (c != EOF)
1164 scm_ungetc (c, port);
1165 return SCM_BOOL_F;
1166 }
1167 rank = 1;
1168 tag_buf[0] = 'f';
1169 tag_len = 1;
1170 goto continue_reading_tag;
1171 }
1172
1173 /* Read rank. */
1174 rank = 1;
1175 c = read_decimal_integer (port, c, &rank);
1176 if (rank < 0)
1177 scm_i_input_error (NULL, port, "array rank must be non-negative",
1178 SCM_EOL);
1179
1180 /* Read tag. */
1181 tag_len = 0;
1182 continue_reading_tag:
1183 while (c != EOF && c != '(' && c != '@' && c != ':'
1184 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1185 {
1186 tag_buf[tag_len++] = c;
1187 c = scm_getc (port);
1188 }
1189 if (tag_len == 0)
1190 tag = SCM_BOOL_T;
b131b233 1191 else
493ceb99
MW
1192 {
1193 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1194 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1195 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1196 scm_list_1 (tag));
1197 }
1198
1199 /* Read shape. */
1200 if (c == '@' || c == ':')
1201 {
1202 shape = SCM_EOL;
1203
1204 do
1205 {
1206 ssize_t lbnd = 0, len = 0;
1207 SCM s;
1208
1209 if (c == '@')
1210 {
1211 c = scm_getc (port);
1212 c = read_decimal_integer (port, c, &lbnd);
1213 }
1214
1215 s = scm_from_ssize_t (lbnd);
1216
1217 if (c == ':')
1218 {
1219 c = scm_getc (port);
1220 c = read_decimal_integer (port, c, &len);
1221 if (len < 0)
1222 scm_i_input_error (NULL, port,
1223 "array length must be non-negative",
1224 SCM_EOL);
1225
1226 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1227 }
1228
1229 shape = scm_cons (s, shape);
1230 } while (c == '@' || c == ':');
1231
1232 shape = scm_reverse_x (shape, SCM_EOL);
1233 }
1234
1235 /* Read nested lists of elements. */
1236 if (c != '(')
1237 scm_i_input_error (NULL, port,
1238 "missing '(' in vector or array literal",
1239 SCM_EOL);
b1b5433d 1240 elements = scm_read_sexp (c, port, opts);
493ceb99
MW
1241
1242 if (scm_is_false (shape))
1243 shape = scm_from_ssize_t (rank);
1244 else if (scm_ilength (shape) != rank)
1245 scm_i_input_error
1246 (NULL, port,
1247 "the number of shape specifications must match the array rank",
1248 SCM_EOL);
1249
1250 /* Handle special print syntax of rank zero arrays; see
1251 scm_i_print_array for a rationale. */
1252 if (rank == 0)
1253 {
1254 if (!scm_is_pair (elements))
1255 scm_i_input_error (NULL, port,
1256 "too few elements in array literal, need 1",
1257 SCM_EOL);
1258 if (!scm_is_null (SCM_CDR (elements)))
1259 scm_i_input_error (NULL, port,
1260 "too many elements in array literal, want 1",
1261 SCM_EOL);
1262 elements = SCM_CAR (elements);
1263 }
1264
1265 /* Construct array, annotate with source location, and return. */
1266 array = scm_list_to_typed_array (tag, shape, elements);
b1b5433d 1267 return maybe_annotate_source (array, port, opts, line, column);
7337d56d 1268}
09a4f039 1269
cfd15439 1270static SCM
b1b5433d
MW
1271scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1272 long line, int column)
7337d56d 1273{
b1b5433d 1274 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1275}
1276
0ba0b384 1277static SCM
b1b5433d
MW
1278scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1279 long line, int column)
0ba0b384
LC
1280{
1281 chr = scm_getc (port);
1282 if (chr != 'u')
1283 goto syntax;
1284
1285 chr = scm_getc (port);
1286 if (chr != '8')
1287 goto syntax;
1288
1289 chr = scm_getc (port);
1290 if (chr != '(')
1291 goto syntax;
1292
b131b233 1293 return maybe_annotate_source
b1b5433d
MW
1294 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1295 port, opts, line, column);
0ba0b384
LC
1296
1297 syntax:
1298 scm_i_input_error ("read_bytevector", port,
1299 "invalid bytevector prefix",
1300 SCM_MAKE_CHAR (chr));
1301 return SCM_UNSPECIFIED;
1302}
1303
7337d56d 1304static SCM
b1b5433d
MW
1305scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1306 long line, int column)
7337d56d
LC
1307{
1308 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1309 terribly inefficient but who cares? */
1310 SCM s_bits = SCM_EOL;
1311
1312 for (chr = scm_getc (port);
1313 (chr != EOF) && ((chr == '0') || (chr == '1'));
1314 chr = scm_getc (port))
09a4f039 1315 {
7337d56d 1316 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1317 }
7337d56d
LC
1318
1319 if (chr != EOF)
1320 scm_ungetc (chr, port);
1321
b131b233
MW
1322 return maybe_annotate_source
1323 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
b1b5433d 1324 port, opts, line, column);
7337d56d
LC
1325}
1326
cfd15439 1327static SCM
889975e5 1328scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1329{
1330 int bang_seen = 0;
1331
1332 for (;;)
09a4f039 1333 {
58b1db5f 1334 int c = scm_getc (port);
62850ef3 1335
7337d56d
LC
1336 if (c == EOF)
1337 scm_i_input_error ("skip_block_comment", port,
1338 "unterminated `#! ... !#' comment", SCM_EOL);
1339
1340 if (c == '!')
1341 bang_seen = 1;
1342 else if (c == '#' && bang_seen)
1343 break;
1344 else
1345 bang_seen = 0;
1346 }
1347
1348 return SCM_UNSPECIFIED;
1349}
1350
9331ffd8
MW
1351static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1352 int value);
bf9eb54a
MW
1353static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1354 int value);
1355static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1356 int value);
9331ffd8 1357
d7fcaec3 1358static SCM
b1b5433d 1359scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
911b03b2 1360{
02327c0c
MW
1361 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1362 int c;
1363 int i = 0;
1364
1365 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
911b03b2 1366 {
02327c0c
MW
1367 c = scm_getc (port);
1368 if (c == EOF)
1369 scm_i_input_error ("skip_block_comment", port,
1370 "unterminated `#! ... !#' comment", SCM_EOL);
1371 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1372 name[i++] = c;
1373 else if (CHAR_IS_DELIMITER (c))
1374 {
1375 scm_ungetc (c, port);
1376 name[i] = '\0';
1377 if (0 == strcmp ("r6rs", name))
1378 ; /* Silently ignore */
9331ffd8
MW
1379 else if (0 == strcmp ("fold-case", name))
1380 set_port_case_insensitive_p (port, opts, 1);
1381 else if (0 == strcmp ("no-fold-case", name))
1382 set_port_case_insensitive_p (port, opts, 0);
bf9eb54a
MW
1383 else if (0 == strcmp ("curly-infix", name))
1384 set_port_curly_infix_p (port, opts, 1);
1385 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1386 {
1387 set_port_curly_infix_p (port, opts, 1);
1388 set_port_square_brackets_p (port, opts, 0);
1389 }
02327c0c
MW
1390 else
1391 break;
1392
1393 return SCM_UNSPECIFIED;
1394 }
fa746547
MW
1395 else
1396 {
1397 scm_ungetc (c, port);
1398 break;
1399 }
911b03b2 1400 }
02327c0c
MW
1401 while (i > 0)
1402 scm_ungetc (name[--i], port);
1403 return scm_read_scsh_block_comment (chr, port);
911b03b2
JG
1404}
1405
620c8965
LC
1406static SCM
1407scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1408{
1409 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1410 nested. So care must be taken. */
1411 int nesting_level = 1;
6d5f8c32
AW
1412
1413 int a = scm_getc (port);
1414
1415 if (a == EOF)
1416 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1417 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1418
1419 while (nesting_level > 0)
1420 {
6d5f8c32 1421 int b = scm_getc (port);
620c8965 1422
6d5f8c32 1423 if (b == EOF)
cd169c5a 1424 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1425 "unterminated `#| ... |#' comment", SCM_EOL);
1426
6d5f8c32
AW
1427 if (a == '|' && b == '#')
1428 {
1429 nesting_level--;
1430 b = EOF;
1431 }
1432 else if (a == '#' && b == '|')
1433 {
1434 nesting_level++;
1435 b = EOF;
1436 }
1437
1438 a = b;
620c8965
LC
1439 }
1440
1441 return SCM_UNSPECIFIED;
1442}
1443
34f3d47d 1444static SCM
b1b5433d
MW
1445scm_read_commented_expression (scm_t_wchar chr, SCM port,
1446 scm_t_read_opts *opts)
34f3d47d 1447{
889975e5 1448 scm_t_wchar c;
34f3d47d 1449
b1b5433d 1450 c = flush_ws (port, opts, (char *) NULL);
34f3d47d
AW
1451 if (EOF == c)
1452 scm_i_input_error ("read_commented_expression", port,
1453 "no expression after #; comment", SCM_EOL);
1454 scm_ungetc (c, port);
b1b5433d 1455 scm_read_expression (port, opts);
34f3d47d
AW
1456 return SCM_UNSPECIFIED;
1457}
1458
7337d56d 1459static SCM
889975e5 1460scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1461{
1462 /* Guile's extended symbol read syntax looks like this:
1463
1464 #{This is all a symbol name}#
1465
1466 So here, CHR is expected to be `{'. */
d9527cfa 1467 int saw_brace = 0;
7337d56d 1468 size_t len = 0;
190d4b0d 1469 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1470
889975e5 1471 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1472
1473 while ((chr = scm_getc (port)) != EOF)
1474 {
1475 if (saw_brace)
09a4f039 1476 {
7337d56d
LC
1477 if (chr == '#')
1478 {
7337d56d
LC
1479 break;
1480 }
1481 else
1482 {
1483 saw_brace = 0;
889975e5 1484 scm_i_string_set_x (buf, len++, '}');
7337d56d 1485 }
09a4f039 1486 }
d9527cfa
AW
1487
1488 if (chr == '}')
7337d56d 1489 saw_brace = 1;
d9527cfa
AW
1490 else if (chr == '\\')
1491 {
1492 /* It used to be that print.c would print extended-read-syntax
1493 symbols with backslashes before "non-standard" chars, but
1494 this routine wouldn't do anything with those escapes.
1495 Bummer. What we've done is to change print.c to output
1496 R6RS hex escapes for those characters, relying on the fact
1497 that the extended read syntax would never put a `\' before
1498 an `x'. For now, we just ignore other instances of
1499 backslash in the string. */
1500 switch ((chr = scm_getc (port)))
1501 {
1502 case EOF:
1503 goto done;
1504 case 'x':
1505 {
1506 scm_t_wchar c;
1507
1508 SCM_READ_HEX_ESCAPE (10, ';');
1509 scm_i_string_set_x (buf, len++, c);
1510 break;
1511
1512 str_eof:
1513 chr = EOF;
1514 goto done;
1515
1516 bad_escaped:
1517 scm_i_string_stop_writing ();
1518 scm_i_input_error ("scm_read_extended_symbol", port,
1519 "illegal character in escape sequence: ~S",
1520 scm_list_1 (SCM_MAKE_CHAR (c)));
1521 break;
1522 }
1523 default:
1524 scm_i_string_set_x (buf, len++, chr);
1525 break;
1526 }
1527 }
7337d56d 1528 else
d9527cfa 1529 scm_i_string_set_x (buf, len++, chr);
62850ef3 1530
889975e5 1531 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1532 {
7f991c7d
LC
1533 SCM addy;
1534
889975e5 1535 scm_i_string_stop_writing ();
190d4b0d 1536 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1537 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1538 len = 0;
889975e5 1539 buf = scm_i_string_start_writing (buf);
7337d56d 1540 }
7337d56d 1541 }
d9527cfa
AW
1542
1543 done:
889975e5 1544 scm_i_string_stop_writing ();
d9527cfa
AW
1545 if (chr == EOF)
1546 scm_i_input_error ("scm_read_extended_symbol", port,
1547 "end of file while reading symbol", SCM_EOL);
7337d56d 1548
889975e5 1549 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1550}
1551
1552
1553\f
1554/* Top-level token readers, i.e., dispatchers. */
1555
1556static SCM
b1b5433d 1557scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1558{
1559 SCM proc;
1560
1561 proc = scm_get_hash_procedure (chr);
1562 if (scm_is_true (scm_procedure_p (proc)))
1563 {
1564 long line = SCM_LINUM (port);
1565 int column = SCM_COL (port) - 2;
1566 SCM got;
1567
1568 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14 1569
3655ed89
MW
1570 if (opts->record_positions_p && SCM_NIMP (got)
1571 && !scm_i_has_source_properties (got))
26c8cc14
AW
1572 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1573
1574 return got;
09a4f039 1575 }
7337d56d
LC
1576
1577 return SCM_UNSPECIFIED;
1578}
1579
1580/* The reader for the sharp `#' character. It basically dispatches reads
1581 among the above token readers. */
1582static SCM
b1b5433d
MW
1583scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1584 long line, int column)
7337d56d
LC
1585#define FUNC_NAME "scm_lreadr"
1586{
1587 SCM result;
1588
1589 chr = scm_getc (port);
1590
b1b5433d 1591 result = scm_read_sharp_extension (chr, port, opts);
7337d56d
LC
1592 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1593 return result;
1594
1595 switch (chr)
1596 {
1597 case '\\':
b1b5433d 1598 return (scm_read_character (chr, port, opts));
7337d56d 1599 case '(':
b1b5433d 1600 return (scm_read_vector (chr, port, opts, line, column));
7337d56d
LC
1601 case 's':
1602 case 'u':
1603 case 'f':
df941b5b 1604 case 'c':
7337d56d 1605 /* This one may return either a boolean or an SRFI-4 vector. */
b1b5433d 1606 return (scm_read_srfi4_vector (chr, port, opts, line, column));
0ba0b384 1607 case 'v':
b1b5433d 1608 return (scm_read_bytevector (chr, port, opts, line, column));
7337d56d 1609 case '*':
b1b5433d 1610 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
7337d56d
LC
1611 case 't':
1612 case 'T':
1613 case 'F':
7337d56d
LC
1614 return (scm_read_boolean (chr, port));
1615 case ':':
b1b5433d 1616 return (scm_read_keyword (chr, port, opts));
7337d56d
LC
1617 case '0': case '1': case '2': case '3': case '4':
1618 case '5': case '6': case '7': case '8': case '9':
1619 case '@':
1620#if SCM_ENABLE_DEPRECATED
1621 /* See below for 'i' and 'e'. */
1622 case 'a':
7337d56d
LC
1623 case 'y':
1624 case 'h':
1625 case 'l':
1626#endif
b1b5433d 1627 return (scm_read_array (chr, port, opts, line, column));
7337d56d
LC
1628
1629 case 'i':
1630 case 'e':
1631#if SCM_ENABLE_DEPRECATED
1632 {
1633 /* When next char is '(', it really is an old-style
1634 uniform array. */
889975e5 1635 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1636 if (next_c != EOF)
1637 scm_ungetc (next_c, port);
1638 if (next_c == '(')
b1b5433d 1639 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1640 /* Fall through. */
1641 }
1642#endif
1643 case 'b':
1644 case 'B':
1645 case 'o':
1646 case 'O':
1647 case 'd':
1648 case 'D':
1649 case 'x':
1650 case 'X':
1651 case 'I':
1652 case 'E':
b1b5433d 1653 return (scm_read_number_and_radix (chr, port, opts));
7337d56d
LC
1654 case '{':
1655 return (scm_read_extended_symbol (chr, port));
1656 case '!':
b1b5433d 1657 return (scm_read_shebang (chr, port, opts));
34f3d47d 1658 case ';':
b1b5433d 1659 return (scm_read_commented_expression (chr, port, opts));
34f3d47d
AW
1660 case '`':
1661 case '\'':
1662 case ',':
b1b5433d 1663 return (scm_read_syntax (chr, port, opts));
7c4aad9c 1664 case 'n':
b1b5433d 1665 return (scm_read_nil (chr, port, opts));
7337d56d 1666 default:
b1b5433d 1667 result = scm_read_sharp_extension (chr, port, opts);
7337d56d 1668 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1669 {
1670 /* To remain compatible with 1.8 and earlier, the following
1671 characters have lower precedence than `read-hash-extend'
1672 characters. */
1673 switch (chr)
1674 {
1675 case '|':
1676 return scm_read_r6rs_block_comment (chr, port);
1677 default:
1678 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1679 scm_list_1 (SCM_MAKE_CHAR (chr)));
1680 }
1681 }
7337d56d
LC
1682 else
1683 return result;
1684 }
1685
1686 return SCM_UNSPECIFIED;
1687}
1688#undef FUNC_NAME
1689
1690static SCM
bf9eb54a
MW
1691read_inner_expression (SCM port, scm_t_read_opts *opts)
1692#define FUNC_NAME "read_inner_expression"
7337d56d
LC
1693{
1694 while (1)
1695 {
cfd15439 1696 scm_t_wchar chr;
7337d56d
LC
1697
1698 chr = scm_getc (port);
1699
1700 switch (chr)
1701 {
1702 case SCM_WHITE_SPACES:
1703 case SCM_LINE_INCREMENTORS:
1704 break;
1705 case ';':
1706 (void) scm_read_semicolon_comment (chr, port);
1707 break;
bf9eb54a
MW
1708 case '{':
1709 if (opts->curly_infix_p)
1710 {
1711 if (opts->neoteric_p)
1712 return scm_read_sexp (chr, port, opts);
1713 else
1714 {
1715 SCM expr;
1716
1717 /* Enable neoteric expressions within curly braces */
1718 opts->neoteric_p = 1;
1719 expr = scm_read_sexp (chr, port, opts);
1720 opts->neoteric_p = 0;
1721 return expr;
1722 }
1723 }
1724 else
1725 return scm_read_mixed_case_symbol (chr, port, opts);
5afa815c 1726 case '[':
bf9eb54a
MW
1727 if (opts->square_brackets_p)
1728 return scm_read_sexp (chr, port, opts);
1729 else if (opts->curly_infix_p)
1730 {
1731 /* The syntax of neoteric expressions requires that '[' be
1732 a delimiter when curly-infix is enabled, so it cannot
1733 be part of an unescaped symbol. We might as well do
1734 something useful with it, so we adopt Kawa's convention:
1735 [...] => ($bracket-list$ ...) */
1736 long line = SCM_LINUM (port);
1737 int column = SCM_COL (port) - 1;
1738 return maybe_annotate_source
1739 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1740 port, opts, line, column);
1741 }
1742 else
1743 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d 1744 case '(':
b1b5433d 1745 return (scm_read_sexp (chr, port, opts));
7337d56d 1746 case '"':
b1b5433d 1747 return (scm_read_string (chr, port, opts));
7337d56d
LC
1748 case '\'':
1749 case '`':
1750 case ',':
b1b5433d 1751 return (scm_read_quote (chr, port, opts));
7337d56d
LC
1752 case '#':
1753 {
b131b233
MW
1754 long line = SCM_LINUM (port);
1755 int column = SCM_COL (port) - 1;
b1b5433d 1756 SCM result = scm_read_sharp (chr, port, opts, line, column);
7337d56d
LC
1757 if (scm_is_eq (result, SCM_UNSPECIFIED))
1758 /* We read a comment or some such. */
1759 break;
1760 else
1761 return result;
1762 }
1763 case ')':
1764 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1765 break;
bf9eb54a
MW
1766 case '}':
1767 if (opts->curly_infix_p)
1768 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1769 else
1770 return scm_read_mixed_case_symbol (chr, port, opts);
a4e47229 1771 case ']':
b1b5433d 1772 if (opts->square_brackets_p)
a4e47229
MG
1773 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1774 /* otherwise fall through */
7337d56d
LC
1775 case EOF:
1776 return SCM_EOF_VAL;
1777 case ':':
b1b5433d
MW
1778 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1779 return scm_symbol_to_keyword (scm_read_expression (port, opts));
7337d56d
LC
1780 /* Fall through. */
1781
1782 default:
1783 {
1784 if (((chr >= '0') && (chr <= '9'))
1785 || (strchr ("+-.", chr)))
b1b5433d 1786 return (scm_read_number (chr, port, opts));
7337d56d 1787 else
b1b5433d 1788 return (scm_read_mixed_case_symbol (chr, port, opts));
7337d56d
LC
1789 }
1790 }
1791 }
1792}
1793#undef FUNC_NAME
1794
bf9eb54a
MW
1795static SCM
1796scm_read_expression (SCM port, scm_t_read_opts *opts)
1797#define FUNC_NAME "scm_read_expression"
1798{
1799 if (!opts->neoteric_p)
1800 return read_inner_expression (port, opts);
1801 else
1802 {
1803 long line = 0;
1804 int column = 0;
1805 SCM expr;
1806
1807 if (opts->record_positions_p)
1808 {
1809 /* We need to get the position of the first non-whitespace
1810 character in order to correctly annotate neoteric
1811 expressions. For example, for the expression 'f(x)', the
1812 first call to 'read_inner_expression' reads the 'f' (which
1813 cannot be annotated), and then we later read the '(x)' and
1814 use it to construct the new list (f x). */
1815 int c = flush_ws (port, opts, (char *) NULL);
1816 if (c == EOF)
1817 return SCM_EOF_VAL;
1818 scm_ungetc (c, port);
1819 line = SCM_LINUM (port);
1820 column = SCM_COL (port);
1821 }
1822
1823 expr = read_inner_expression (port, opts);
1824
1825 /* 'expr' is the first component of the neoteric expression. Now
1826 we loop, and as long as the next character is '(', '[', or '{',
1827 (without any intervening whitespace), we use it to construct a
1828 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1829 for (;;)
1830 {
1831 int chr = scm_getc (port);
1832
1833 if (chr == '(')
1834 /* e(...) => (e ...) */
1835 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1836 else if (chr == '[')
1837 /* e[...] => ($bracket-apply$ e ...) */
1838 expr = scm_cons (sym_bracket_apply,
1839 scm_cons (expr,
1840 scm_read_sexp (chr, port, opts)));
1841 else if (chr == '{')
1842 {
1843 SCM arg = scm_read_sexp (chr, port, opts);
1844
1845 if (scm_is_null (arg))
1846 expr = scm_list_1 (expr); /* e{} => (e) */
1847 else
1848 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1849 }
1850 else
1851 {
1852 if (chr != EOF)
1853 scm_ungetc (chr, port);
1854 break;
1855 }
1856 maybe_annotate_source (expr, port, opts, line, column);
1857 }
1858 return expr;
1859 }
1860}
1861#undef FUNC_NAME
1862
7337d56d
LC
1863\f
1864/* Actual reader. */
1865
ea8c9761
MW
1866static void init_read_options (SCM port, scm_t_read_opts *opts);
1867
7337d56d
LC
1868SCM_DEFINE (scm_read, "read", 0, 1, 0,
1869 (SCM port),
1870 "Read an s-expression from the input port @var{port}, or from\n"
1871 "the current input port if @var{port} is not specified.\n"
1872 "Any whitespace before the next token is discarded.")
1873#define FUNC_NAME s_scm_read
1874{
b1b5433d 1875 scm_t_read_opts opts;
7337d56d
LC
1876 int c;
1877
1878 if (SCM_UNBNDP (port))
1879 port = scm_current_input_port ();
1880 SCM_VALIDATE_OPINPORT (1, port);
1881
ea8c9761 1882 init_read_options (port, &opts);
b1b5433d
MW
1883
1884 c = flush_ws (port, &opts, (char *) NULL);
7337d56d
LC
1885 if (EOF == c)
1886 return SCM_EOF_VAL;
1887 scm_ungetc (c, port);
1888
b1b5433d 1889 return (scm_read_expression (port, &opts));
09a4f039 1890}
db4b4ca6 1891#undef FUNC_NAME
09a4f039 1892
0f2d19dd
JB
1893
1894\f
1895
14de3b42
GH
1896/* Manipulate the read-hash-procedures alist. This could be written in
1897 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1898SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1899 (SCM chr, SCM proc),
dc7fa443
MG
1900 "Install the procedure @var{proc} for reading expressions\n"
1901 "starting with the character sequence @code{#} and @var{chr}.\n"
1902 "@var{proc} will be called with two arguments: the character\n"
1903 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1904 "returned will be the return value of @code{read}. \n"
1905 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1906 )
1bbd0b84 1907#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1908{
fed9c9a2
GH
1909 SCM this;
1910 SCM prev;
1911
36284627 1912 SCM_VALIDATE_CHAR (1, chr);
7888309b 1913 SCM_ASSERT (scm_is_false (proc)
bc36d050 1914 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1915 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1916
14de3b42 1917 /* Check if chr is already in the alist. */
d458073b 1918 this = scm_i_read_hash_procedures_ref ();
14de3b42 1919 prev = SCM_BOOL_F;
fed9c9a2
GH
1920 while (1)
1921 {
d2e53ed6 1922 if (scm_is_null (this))
fed9c9a2
GH
1923 {
1924 /* not found, so add it to the beginning. */
7888309b 1925 if (scm_is_true (proc))
fed9c9a2 1926 {
d458073b
AR
1927 SCM new = scm_cons (scm_cons (chr, proc),
1928 scm_i_read_hash_procedures_ref ());
1929 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1930 }
1931 break;
1932 }
bc36d050 1933 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1934 {
1935 /* already in the alist. */
7888309b 1936 if (scm_is_false (proc))
14de3b42
GH
1937 {
1938 /* remove it. */
7888309b 1939 if (scm_is_false (prev))
14de3b42 1940 {
d458073b
AR
1941 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1942 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1943 }
1944 else
1945 scm_set_cdr_x (prev, SCM_CDR (this));
1946 }
fed9c9a2 1947 else
14de3b42
GH
1948 {
1949 /* replace it. */
1950 scm_set_cdr_x (SCM_CAR (this), proc);
1951 }
fed9c9a2
GH
1952 break;
1953 }
1954 prev = this;
1955 this = SCM_CDR (this);
1956 }
deca31e1 1957
deca31e1
GH
1958 return SCM_UNSPECIFIED;
1959}
1bbd0b84 1960#undef FUNC_NAME
0f2d19dd 1961
deca31e1
GH
1962/* Recover the read-hash procedure corresponding to char c. */
1963static SCM
6e8d25a6 1964scm_get_hash_procedure (int c)
deca31e1 1965{
d458073b 1966 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1967
deca31e1
GH
1968 while (1)
1969 {
d2e53ed6 1970 if (scm_is_null (rest))
deca31e1
GH
1971 return SCM_BOOL_F;
1972
7866a09b 1973 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1974 return SCM_CDAR (rest);
1975
1976 rest = SCM_CDR (rest);
1977 }
1978}
1cc91f1b 1979
889975e5
MG
1980#define SCM_ENCODING_SEARCH_SIZE (500)
1981
f8a1c9a8
LC
1982/* Search the first few hundred characters of a file for an Emacs-like coding
1983 declaration. Returns either NULL or a string whose storage has been
1984 allocated with `scm_gc_malloc ()'. */
889975e5 1985char *
f8a1c9a8 1986scm_i_scan_for_encoding (SCM port)
889975e5 1987{
d900843c 1988 scm_t_port *pt;
889975e5 1989 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1990 size_t bytes_read, encoding_length, i;
889975e5 1991 char *encoding = NULL;
daedbca7 1992 char *pos, *encoding_start;
889975e5
MG
1993 int in_comment;
1994
d900843c 1995 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1996
d900843c
AW
1997 if (pt->rw_active == SCM_PORT_WRITE)
1998 scm_flush (port);
49bb5bd3 1999
d900843c
AW
2000 if (pt->rw_random)
2001 pt->rw_active = SCM_PORT_READ;
2002
2003 if (pt->read_pos == pt->read_end)
2004 {
2005 /* We can use the read buffer, and thus avoid a seek. */
2006 if (scm_fill_input (port) == EOF)
2007 return NULL;
2008
2009 bytes_read = pt->read_end - pt->read_pos;
2010 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2011 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2012
2013 if (bytes_read <= 1)
2014 /* An unbuffered port -- don't scan. */
2015 return NULL;
2016
2017 memcpy (header, pt->read_pos, bytes_read);
2018 header[bytes_read] = '\0';
2019 }
2020 else
2021 {
2022 /* Try to read some bytes and then seek back. Not all ports
2023 support seeking back; and indeed some file ports (like
2024 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2025 check performed by SCM_FPORT_FDES---but fail to seek
2026 backwards. Hence this block comes second. We prefer to use
2027 the read buffer in-place. */
2028 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2029 return NULL;
2030
2031 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2032 header[bytes_read] = '\0';
419c8736 2033 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
d900843c 2034 }
889975e5 2035
889975e5
MG
2036 /* search past "coding[:=]" */
2037 pos = header;
2038 while (1)
2039 {
2040 if ((pos = strstr(pos, "coding")) == NULL)
2041 return NULL;
2042
2043 pos += strlen("coding");
2044 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2045 (*pos == ':' || *pos == '='))
2046 {
2047 pos ++;
2048 break;
2049 }
2050 }
2051
2052 /* skip spaces */
2053 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2054 (*pos == ' ' || *pos == '\t'))
2055 pos ++;
2056
2057 /* grab the next token */
daedbca7 2058 encoding_start = pos;
889975e5 2059 i = 0;
daedbca7
MG
2060 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2061 && encoding_start + i - header < bytes_read
2062 && (isalnum ((int) encoding_start[i])
2063 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
2064 i++;
2065
daedbca7
MG
2066 encoding_length = i;
2067 if (encoding_length == 0)
889975e5
MG
2068 return NULL;
2069
daedbca7
MG
2070 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2071 for (i = 0; i < encoding_length; i++)
026ed239 2072 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
2073
2074 /* push backwards to make sure we were in a comment */
2075 in_comment = 0;
daedbca7
MG
2076 pos = encoding_start;
2077 while (pos >= header)
889975e5 2078 {
8a12aeb9
AW
2079 if (*pos == ';')
2080 {
2081 in_comment = 1;
2082 break;
2083 }
2084 else if (*pos == '\n' || pos == header)
889975e5
MG
2085 {
2086 /* This wasn't in a semicolon comment. Check for a
2087 hash-bang comment. */
2088 char *beg = strstr (header, "#!");
2089 char *end = strstr (header, "!#");
8a12aeb9 2090 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
2091 in_comment = 1;
2092 break;
2093 }
8a12aeb9
AW
2094 else
2095 {
2096 pos --;
2097 continue;
2098 }
889975e5
MG
2099 }
2100 if (!in_comment)
f8a1c9a8
LC
2101 /* This wasn't in a comment */
2102 return NULL;
2103
889975e5
MG
2104 return encoding;
2105}
2106
2107SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2108 (SCM port),
a270e133 2109 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 2110 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
2111 "The coding declaration is of the form\n"
2112 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2113 "\n"
2114 "Returns a string containing the character encoding of the file\n"
2115 "if a declaration was found, or @code{#f} otherwise.\n")
2116#define FUNC_NAME s_scm_file_encoding
2117{
2118 char *enc;
2119 SCM s_enc;
f8a1c9a8 2120
d900843c
AW
2121 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2122
f8a1c9a8 2123 enc = scm_i_scan_for_encoding (port);
889975e5
MG
2124 if (enc == NULL)
2125 return SCM_BOOL_F;
2126 else
2127 {
2128 s_enc = scm_from_locale_string (enc);
889975e5
MG
2129 return s_enc;
2130 }
f8a1c9a8 2131
889975e5
MG
2132 return SCM_BOOL_F;
2133}
2134#undef FUNC_NAME
2135
ea8c9761
MW
2136\f
2137/* Per-port read options.
2138
a38024ba
MW
2139 We store per-port read options in the 'port-read-options' port
2140 property, which is stored in the internal port structure. The value
2141 stored is a single integer that contains a two-bit field for each
2142 read option.
ea8c9761
MW
2143
2144 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2145 the applicable value should be inherited from the corresponding
2146 global read option. Otherwise, the bit field contains the value of
2147 the read option. For boolean read options that have been set
2148 per-port, the possible values are 0 or 1. If the 'keyword_style'
2149 read option has been set per-port, its possible values are those in
2150 'enum t_keyword_style'. */
2151
a38024ba 2152/* Key to read options in port properties. */
ea8c9761
MW
2153SCM_SYMBOL (sym_port_read_options, "port-read-options");
2154
2155/* Offsets of bit fields for each per-port override */
2156#define READ_OPTION_COPY_SOURCE_P 0
2157#define READ_OPTION_RECORD_POSITIONS_P 2
2158#define READ_OPTION_CASE_INSENSITIVE_P 4
2159#define READ_OPTION_KEYWORD_STYLE 6
2160#define READ_OPTION_R6RS_ESCAPES_P 8
2161#define READ_OPTION_SQUARE_BRACKETS_P 10
2162#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
bf9eb54a 2163#define READ_OPTION_CURLY_INFIX_P 14
ea8c9761 2164
bf9eb54a
MW
2165/* The total width in bits of the per-port overrides */
2166#define READ_OPTIONS_NUM_BITS 16
ea8c9761
MW
2167
2168#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2169#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2170
2171#define READ_OPTION_MASK 3
2172#define READ_OPTION_INHERIT 3
2173
2174static void
2175set_port_read_option (SCM port, int option, int new_value)
2176{
a38024ba 2177 SCM scm_read_options;
ea8c9761
MW
2178 unsigned int read_options;
2179
2180 new_value &= READ_OPTION_MASK;
a38024ba 2181 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2182 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2183 read_options = scm_to_uint (scm_read_options);
2184 else
2185 read_options = READ_OPTIONS_INHERIT_ALL;
2186 read_options &= ~(READ_OPTION_MASK << option);
2187 read_options |= new_value << option;
2188 scm_read_options = scm_from_uint (read_options);
a38024ba 2189 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
ea8c9761
MW
2190}
2191
9331ffd8
MW
2192/* Set OPTS and PORT's case-insensitivity according to VALUE. */
2193static void
2194set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2195{
2196 value = !!value;
2197 opts->case_insensitive_p = value;
2198 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2199}
2200
bf9eb54a
MW
2201/* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2202static void
2203set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2204{
2205 value = !!value;
2206 opts->square_brackets_p = value;
2207 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2208}
2209
2210/* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2211static void
2212set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2213{
2214 value = !!value;
2215 opts->curly_infix_p = value;
2216 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2217}
2218
ea8c9761
MW
2219/* Initialize OPTS based on PORT's read options and the global read
2220 options. */
2221static void
2222init_read_options (SCM port, scm_t_read_opts *opts)
2223{
a38024ba 2224 SCM val, scm_read_options;
ea8c9761
MW
2225 unsigned int read_options, x;
2226
a38024ba 2227 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2228
2229 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2230 read_options = scm_to_uint (scm_read_options);
2231 else
2232 read_options = READ_OPTIONS_INHERIT_ALL;
2233
2234 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2235 if (x == READ_OPTION_INHERIT)
2236 {
2237 val = SCM_PACK (SCM_KEYWORD_STYLE);
2238 if (scm_is_eq (val, scm_keyword_prefix))
2239 x = KEYWORD_STYLE_PREFIX;
2240 else if (scm_is_eq (val, scm_keyword_postfix))
2241 x = KEYWORD_STYLE_POSTFIX;
2242 else
2243 x = KEYWORD_STYLE_HASH_PREFIX;
2244 }
2245 opts->keyword_style = x;
2246
2247#define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2248 do \
2249 { \
2250 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2251 if (x == READ_OPTION_INHERIT) \
2252 x = !!SCM_ ## NAME; \
2253 opts->name = x; \
2254 } \
2255 while (0)
2256
2257 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2258 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2259 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2260 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2261 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2262 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
bf9eb54a 2263 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
ea8c9761
MW
2264
2265#undef RESOLVE_BOOLEAN_OPTION
bf9eb54a
MW
2266
2267 opts->neoteric_p = 0;
ea8c9761
MW
2268}
2269
0f2d19dd
JB
2270void
2271scm_init_read ()
0f2d19dd 2272{
d458073b
AR
2273 SCM read_hash_procs;
2274
c81c2ad3 2275 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
2276
2277 scm_i_read_hash_procedures =
2278 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 2279
62560650 2280 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 2281#include "libguile/read.x"
0f2d19dd 2282}
89e00824
ML
2283
2284/*
2285 Local Variables:
2286 c-file-style: "gnu"
2287 End:
2288*/