read: Accept "\|" in string literals.
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37 1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
7a329029 2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
96965a6e 33#include <c-strcase.h>
7337d56d 34
a0599745 35#include "libguile/_scm.h"
0ba0b384 36#include "libguile/bytevectors.h"
a0599745
MD
37#include "libguile/chars.h"
38#include "libguile/eval.h"
2fa901a5 39#include "libguile/arrays.h"
cf396142 40#include "libguile/bitvectors.h"
a0599745
MD
41#include "libguile/keywords.h"
42#include "libguile/alist.h"
43#include "libguile/srcprop.h"
44#include "libguile/hashtab.h"
45#include "libguile/hash.h"
46#include "libguile/ports.h"
49bb5bd3 47#include "libguile/fports.h"
a0599745
MD
48#include "libguile/root.h"
49#include "libguile/strings.h"
ba1b2226 50#include "libguile/strports.h"
a0599745 51#include "libguile/vectors.h"
a0599745 52#include "libguile/validate.h"
a4022e69 53#include "libguile/srfi-4.h"
7337d56d 54#include "libguile/srfi-13.h"
ba1b2226 55
a0599745 56#include "libguile/read.h"
22fc179a
HWN
57#include "libguile/private-options.h"
58
0f2d19dd
JB
59
60\f
61
5bf6a6f0 62SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 63SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 64SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 65SCM_SYMBOL (sym_nil, "nil");
c7733771 66
bf9eb54a
MW
67/* SRFI-105 curly infix expression support */
68SCM_SYMBOL (sym_nfx, "$nfx$");
69SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
70SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
71
72scm_t_option scm_read_opts[] =
73 {
74 { SCM_OPTION_BOOLEAN, "copy", 0,
75 "Copy source code expressions." },
76 { SCM_OPTION_BOOLEAN, "positions", 1,
77 "Record positions of source code expressions." },
78 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
79 "Convert symbols to lower case."},
80 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
81 "Style of keyword recognition: #f, 'prefix or 'postfix."},
82 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
83 "Use R6RS variable-length character and string hex escapes."},
84 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
85 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
87 "In strings, consume leading whitespace after an escaped end-of-line."},
88 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
89 "Support SRFI-105 curly infix expressions."},
90 { 0, },
91 };
b1b5433d
MW
92
93/* Internal read options structure. This is initialized by 'scm_read'
ea8c9761
MW
94 from the global and per-port read options, and a pointer is passed
95 down to all helper functions. */
96
97enum t_keyword_style
98 {
99 KEYWORD_STYLE_HASH_PREFIX,
100 KEYWORD_STYLE_PREFIX,
101 KEYWORD_STYLE_POSTFIX
102 };
b1b5433d 103
ea8c9761
MW
104struct t_read_opts
105{
b1b5433d
MW
106 enum t_keyword_style keyword_style;
107 unsigned int copy_source_p : 1;
108 unsigned int record_positions_p : 1;
109 unsigned int case_insensitive_p : 1;
110 unsigned int r6rs_escapes_p : 1;
111 unsigned int square_brackets_p : 1;
112 unsigned int hungry_eol_escapes_p : 1;
bf9eb54a
MW
113 unsigned int curly_infix_p : 1;
114 unsigned int neoteric_p : 1;
b1b5433d
MW
115};
116
117typedef struct t_read_opts scm_t_read_opts;
118
a16f6fe7 119
39e8f371
HWN
120/*
121 Give meaningful error messages for errors
122
123 We use the format
124
ba1b2226 125 FILE:LINE:COL: MESSAGE
39e8f371
HWN
126 This happened in ....
127
128 This is not standard GNU format, but the test-suite likes the real
129 message to be in front.
130
39e8f371
HWN
131 */
132
133
a4022e69
MV
134void
135scm_i_input_error (char const *function,
136 SCM port, const char *message, SCM arg)
ba1b2226 137{
29a837fd
MV
138 SCM fn = (scm_is_string (SCM_FILENAME(port))
139 ? SCM_FILENAME(port)
140 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 141
29a837fd 142 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
143 SCM string = SCM_EOL;
144 scm_simple_format (string_port,
272632a6 145 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 146 scm_list_4 (fn,
b3aa4626 147 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 148 scm_from_int (SCM_COL (port) + 1),
272632a6 149 scm_from_locale_string (message)));
ba1b2226
HWN
150
151 string = scm_get_output_string (string_port);
152 scm_close_output_port (string_port);
4a655e50 153 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 154 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 155 string,
dd72382c 156 arg,
ba1b2226
HWN
157 SCM_BOOL_F);
158}
39e8f371
HWN
159
160
a1ec6916 161SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 162 (SCM setting),
dc7fa443
MG
163 "Option interface for the read options. Instead of using\n"
164 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 165 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 166#define FUNC_NAME s_scm_read_options
a16f6fe7 167{
b7ff98dd
MD
168 SCM ans = scm_options (setting,
169 scm_read_opts,
1bbd0b84 170 FUNC_NAME);
b7ff98dd
MD
171 if (SCM_COPY_SOURCE_P)
172 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
173 return ans;
174}
1bbd0b84 175#undef FUNC_NAME
a16f6fe7 176
d458073b
AR
177/* A fluid referring to an association list mapping extra hash
178 characters to procedures. */
179static SCM *scm_i_read_hash_procedures;
deca31e1 180
cfd15439 181static SCM
d458073b
AR
182scm_i_read_hash_procedures_ref (void)
183{
184 return scm_fluid_ref (*scm_i_read_hash_procedures);
185}
186
cfd15439 187static void
d458073b
AR
188scm_i_read_hash_procedures_set_x (SCM value)
189{
190 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
191}
0f2d19dd 192
7337d56d
LC
193\f
194/* Token readers. */
0f2d19dd 195
0f2d19dd 196
7337d56d
LC
197/* Size of the C buffer used to read symbols and numbers. */
198#define READER_BUFFER_SIZE 128
0f2d19dd 199
ff4d3672
LC
200/* Number of 32-bit codepoints in the buffer used to read strings. */
201#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 202
7337d56d
LC
203/* The maximum size of Scheme character names. */
204#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 205
02327c0c
MW
206/* The maximum size of reader directive names. */
207#define READER_DIRECTIVE_NAME_MAX_SIZE 50
208
94115ae3 209
7337d56d
LC
210/* `isblank' is only in C99. */
211#define CHAR_IS_BLANK_(_chr) \
212 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 213 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
214
215#ifdef MSDOS
216# define CHAR_IS_BLANK(_chr) \
217 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
218#else
219# define CHAR_IS_BLANK CHAR_IS_BLANK_
220#endif
221
222
223/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
224 structure''). */
225#define CHAR_IS_R5RS_DELIMITER(c) \
226 (CHAR_IS_BLANK (c) \
603234c6 227 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
7337d56d 228
603234c6
MW
229#define CHAR_IS_DELIMITER(c) \
230 (CHAR_IS_R5RS_DELIMITER (c) \
bf9eb54a
MW
231 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
232 || opts->curly_infix_p)) \
233 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
7337d56d
LC
234
235/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
236 Structure''. */
237#define CHAR_IS_EXPONENT_MARKER(_chr) \
238 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
239 || ((_chr) == 'd') || ((_chr) == 'l'))
240
454866e0 241/* Read an SCSH block comment. */
cfd15439 242static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965 243static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
b1b5433d
MW
244static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
245static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
620c8965 246static SCM scm_get_hash_procedure (int);
454866e0 247
69f90b0b
MG
248/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
249 result in the pre-allocated buffer BUF. Return zero if the whole token has
250 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
251 bytes actually read. */
cfd15439 252static int
b1b5433d
MW
253read_token (SCM port, scm_t_read_opts *opts,
254 char *buf, size_t buf_size, size_t *read)
b662b7e9 255{
69f90b0b 256 *read = 0;
0520c320 257
69f90b0b
MG
258 while (*read < buf_size)
259 {
260 int chr;
7337d56d 261
69f90b0b 262 chr = scm_get_byte_or_eof (port);
889975e5 263
69f90b0b
MG
264 if (chr == EOF)
265 return 0;
266 else if (CHAR_IS_DELIMITER (chr))
267 {
268 scm_unget_byte (chr, port);
269 return 0;
270 }
271 else
272 {
273 *buf = (char) chr;
274 buf++, (*read)++;
275 }
276 }
889975e5 277
69f90b0b
MG
278 return 1;
279 }
889975e5 280
b662b7e9
LC
281/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
282 if the token doesn't fit in BUFFER_SIZE bytes. */
283static char *
b1b5433d
MW
284read_complete_token (SCM port, scm_t_read_opts *opts,
285 char *buffer, size_t buffer_size, size_t *read)
69f90b0b
MG
286{
287 int overflow = 0;
b662b7e9
LC
288 size_t bytes_read, overflow_size = 0;
289 char *overflow_buffer = NULL;
1cc91f1b 290
69f90b0b
MG
291 do
292 {
b1b5433d 293 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
69f90b0b
MG
294 if (bytes_read == 0)
295 break;
296 if (overflow || overflow_size != 0)
297 {
298 if (overflow_size == 0)
299 {
b662b7e9
LC
300 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
301 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
302 overflow_size = bytes_read;
303 }
304 else
305 {
7be3c2fc 306 char *new_buf =
b662b7e9
LC
307 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
308
309 memcpy (new_buf, overflow_buffer, overflow_size);
310 memcpy (new_buf + overflow_size, buffer, bytes_read);
311
312 overflow_buffer = new_buf;
69f90b0b
MG
313 overflow_size += bytes_read;
314 }
315 }
889975e5 316 }
69f90b0b 317 while (overflow);
889975e5 318
69f90b0b
MG
319 if (overflow_size)
320 *read = overflow_size;
b521d265 321 else
69f90b0b
MG
322 *read = bytes_read;
323
b662b7e9 324 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 325}
7337d56d
LC
326
327/* Skip whitespace from PORT and return the first non-whitespace character
328 read. Raise an error on end-of-file. */
329static int
b1b5433d 330flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
0f2d19dd 331{
cfd15439 332 scm_t_wchar c;
0f2d19dd 333 while (1)
b7f3516f 334 switch (c = scm_getc (port))
0f2d19dd
JB
335 {
336 case EOF:
337 goteof:
338 if (eoferr)
d156d3b7 339 {
a4022e69
MV
340 scm_i_input_error (eoferr,
341 port,
342 "end of file",
343 SCM_EOL);
d156d3b7 344 }
0f2d19dd 345 return c;
7337d56d 346
0f2d19dd
JB
347 case ';':
348 lp:
b7f3516f 349 switch (c = scm_getc (port))
0f2d19dd
JB
350 {
351 case EOF:
352 goto goteof;
353 default:
354 goto lp;
355 case SCM_LINE_INCREMENTORS:
356 break;
357 }
358 break;
7337d56d 359
454866e0
LC
360 case '#':
361 switch (c = scm_getc (port))
362 {
363 case EOF:
364 eoferr = "read_sharp";
365 goto goteof;
366 case '!':
b1b5433d 367 scm_read_shebang (c, port, opts);
454866e0 368 break;
34f3d47d 369 case ';':
b1b5433d 370 scm_read_commented_expression (c, port, opts);
34f3d47d 371 break;
620c8965
LC
372 case '|':
373 if (scm_is_false (scm_get_hash_procedure (c)))
374 {
375 scm_read_r6rs_block_comment (c, port);
376 break;
377 }
378 /* fall through */
454866e0
LC
379 default:
380 scm_ungetc (c, port);
381 return '#';
382 }
383 break;
384
0f2d19dd 385 case SCM_LINE_INCREMENTORS:
0f2d19dd 386 case SCM_SINGLE_SPACES:
0f2d19dd 387 case '\t':
0f2d19dd 388 break;
7337d56d 389
0f2d19dd
JB
390 default:
391 return c;
392 }
7337d56d
LC
393
394 return 0;
0f2d19dd
JB
395}
396
397
7337d56d
LC
398\f
399/* Token readers. */
1cc91f1b 400
b1b5433d
MW
401static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
402static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
403 long line, int column);
0f2d19dd
JB
404
405
d6cb0203 406static SCM
b1b5433d
MW
407maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
408 long line, int column)
d6cb0203 409{
b1b5433d 410 if (opts->record_positions_p)
d6cb0203
MW
411 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
412 return x;
413}
414
09a4f039 415static SCM
b1b5433d 416scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 417#define FUNC_NAME "scm_i_lreadparen"
09a4f039 418{
26c8cc14
AW
419 int c;
420 SCM tmp, tl, ans = SCM_EOL;
bf9eb54a
MW
421 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
422 const int terminating_char = ((chr == '{') ? '}'
423 : ((chr == '[') ? ']'
424 : ')'));
7337d56d
LC
425
426 /* Need to capture line and column numbers here. */
427 long line = SCM_LINUM (port);
428 int column = SCM_COL (port) - 1;
f9c68a47 429
b1b5433d 430 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
431 if (terminating_char == c)
432 return SCM_EOL;
f9c68a47 433
7337d56d 434 scm_ungetc (c, port);
b1b5433d 435 tmp = scm_read_expression (port, opts);
1f7945a7
AW
436
437 /* Note that it is possible for scm_read_expression to return
438 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
439 check that it's a real dot by checking `c'. */
440 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d 441 {
b1b5433d
MW
442 ans = scm_read_expression (port, opts);
443 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
7337d56d
LC
444 scm_i_input_error (FUNC_NAME, port, "missing close paren",
445 SCM_EOL);
446 return ans;
447 }
1cc91f1b 448
7337d56d
LC
449 /* Build the head of the list structure. */
450 ans = tl = scm_cons (tmp, SCM_EOL);
451
b1b5433d 452 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
0f2d19dd 453 {
7337d56d 454 SCM new_tail;
0f2d19dd 455
bf9eb54a
MW
456 if (c == ')' || (c == ']' && opts->square_brackets_p)
457 || ((c == '}' || c == ']') && opts->curly_infix_p))
5b69315e
AW
458 scm_i_input_error (FUNC_NAME, port,
459 "in pair: mismatched close paren: ~A",
460 scm_list_1 (SCM_MAKE_CHAR (c)));
461
7337d56d 462 scm_ungetc (c, port);
b1b5433d 463 tmp = scm_read_expression (port, opts);
5b69315e 464
1f7945a7
AW
465 /* See above note about scm_sym_dot. */
466 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 467 {
b1b5433d 468 SCM_SETCDR (tl, scm_read_expression (port, opts));
7337d56d 469
b1b5433d 470 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
471 if (terminating_char != c)
472 scm_i_input_error (FUNC_NAME, port,
473 "in pair: missing close paren", SCM_EOL);
bf9eb54a 474 break;
0f2d19dd 475 }
b858464a 476
7337d56d
LC
477 new_tail = scm_cons (tmp, SCM_EOL);
478 SCM_SETCDR (tl, new_tail);
479 tl = new_tail;
7337d56d 480 }
0f2d19dd 481
bf9eb54a
MW
482 if (curly_list_p)
483 {
484 /* In addition to finding the length, 'scm_ilength' checks for
485 improper or circular lists, in which case it returns -1. */
486 int len = scm_ilength (ans);
487
488 /* The (len == 0) case is handled above */
489 if (len == 1)
490 /* Return directly to avoid re-annotating the element's source
491 location with the position of the outer brace. Also, it
492 might not be possible to annotate the element. */
493 return scm_car (ans); /* {e} => e */
494 else if (len == 2)
495 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
496 else if (len >= 3 && (len & 1))
497 {
498 /* It's a proper list whose length is odd and at least 3. If
499 the elements at odd indices (the infix operator positions)
500 are all 'equal?', then it's a simple curly-infix list.
501 Otherwise it's a mixed curly-infix list. */
502 SCM op = scm_cadr (ans);
503
504 /* Check to see if the elements at odd indices are 'equal?' */
505 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
506 {
507 if (scm_is_null (tl))
508 {
509 /* Convert simple curly-infix list to prefix:
510 {a <op> b <op> ...} => (<op> a b ...) */
511 tl = ans;
512 while (scm_is_pair (scm_cdr (tl)))
513 {
514 tmp = scm_cddr (tl);
515 SCM_SETCDR (tl, tmp);
516 tl = tmp;
517 }
518 ans = scm_cons (op, ans);
519 break;
520 }
521 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
522 {
523 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
524 ans = scm_cons (sym_nfx, ans);
525 break;
526 }
527 }
528 }
529 else
530 /* Mixed curly-infix (possibly improper) list:
531 {e . tail} => ($nfx$ e . tail) */
532 ans = scm_cons (sym_nfx, ans);
533 }
534
b1b5433d 535 return maybe_annotate_source (ans, port, opts, line, column);
7337d56d
LC
536}
537#undef FUNC_NAME
a4022e69 538
c5661d28
MG
539
540/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
541 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
542 found. */
543#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
544 do \
545 { \
546 scm_t_wchar a; \
547 size_t i = 0; \
548 c = 0; \
549 while (i < ndigits) \
550 { \
551 a = scm_getc (port); \
552 if (a == EOF) \
553 goto str_eof; \
554 if (terminator \
555 && (a == (scm_t_wchar) terminator) \
556 && (i > 0)) \
557 break; \
558 if ('0' <= a && a <= '9') \
559 a -= '0'; \
560 else if ('A' <= a && a <= 'F') \
561 a = a - 'A' + 10; \
562 else if ('a' <= a && a <= 'f') \
563 a = a - 'a' + 10; \
564 else \
565 { \
566 c = a; \
567 goto bad_escaped; \
568 } \
569 c = c * 16 + a; \
570 i ++; \
571 } \
c5661d28
MG
572 } while (0)
573
684d664e
AW
574static void
575skip_intraline_whitespace (SCM port)
576{
577 scm_t_wchar c;
578
579 do
580 {
581 c = scm_getc (port);
582 if (c == EOF)
583 return;
584 }
585 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
586
587 scm_ungetc (c, port);
588}
589
7337d56d 590static SCM
b1b5433d 591scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
592#define FUNC_NAME "scm_lreadr"
593{
594 /* For strings smaller than C_STR, this function creates only one Scheme
595 object (the string returned). */
0f2d19dd 596
ff4d3672
LC
597 SCM str = SCM_EOL;
598 size_t c_str_len = 0;
599 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 600
b131b233
MW
601 /* Need to capture line and column numbers here. */
602 long line = SCM_LINUM (port);
603 int column = SCM_COL (port) - 1;
604
7337d56d
LC
605 while ('"' != (c = scm_getc (port)))
606 {
607 if (c == EOF)
9c44cd45
MG
608 {
609 str_eof:
610 scm_i_input_error (FUNC_NAME, port,
611 "end of file in string constant", SCM_EOL);
612 }
0f2d19dd 613
ff4d3672
LC
614 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
615 {
616 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
617 c_str_len = 0;
618 }
0f2d19dd 619
7337d56d 620 if (c == '\\')
9c44cd45
MG
621 {
622 switch (c = scm_getc (port))
623 {
624 case EOF:
625 goto str_eof;
626 case '"':
6579c330 627 case '|':
9c44cd45
MG
628 case '\\':
629 break;
9c44cd45 630 case '\n':
b1b5433d 631 if (opts->hungry_eol_escapes_p)
684d664e 632 skip_intraline_whitespace (port);
9c44cd45
MG
633 continue;
634 case '0':
635 c = '\0';
636 break;
637 case 'f':
638 c = '\f';
639 break;
640 case 'n':
641 c = '\n';
642 break;
643 case 'r':
644 c = '\r';
645 break;
646 case 't':
647 c = '\t';
648 break;
649 case 'a':
650 c = '\007';
651 break;
652 case 'v':
653 c = '\v';
654 break;
67a4a16d
MG
655 case 'b':
656 c = '\010';
657 break;
9c44cd45 658 case 'x':
b1b5433d 659 if (opts->r6rs_escapes_p)
dea901d6
MG
660 SCM_READ_HEX_ESCAPE (10, ';');
661 else
662 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 663 break;
9c44cd45 664 case 'u':
b1b5433d 665 if (!opts->r6rs_escapes_p)
898a0b5a
MG
666 {
667 SCM_READ_HEX_ESCAPE (4, '\0');
668 break;
669 }
9c44cd45 670 case 'U':
b1b5433d 671 if (!opts->r6rs_escapes_p)
898a0b5a
MG
672 {
673 SCM_READ_HEX_ESCAPE (6, '\0');
674 break;
675 }
9c44cd45
MG
676 default:
677 bad_escaped:
678 scm_i_input_error (FUNC_NAME, port,
679 "illegal character in escape sequence: ~S",
680 scm_list_1 (SCM_MAKE_CHAR (c)));
681 }
682 }
ff4d3672
LC
683
684 c_str[c_str_len++] = c;
7337d56d 685 }
ff4d3672
LC
686
687 if (scm_is_null (str))
688 /* Fast path: we got a string that fits in C_STR. */
689 str = scm_from_utf32_stringn (c_str, c_str_len);
690 else
691 {
692 if (c_str_len > 0)
693 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
694
695 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
696 }
697
b1b5433d 698 return maybe_annotate_source (str, port, opts, line, column);
0f2d19dd 699}
db4b4ca6
DH
700#undef FUNC_NAME
701
0f2d19dd 702
7337d56d 703static SCM
b1b5433d 704scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
0f2d19dd 705{
69f90b0b 706 SCM result, str = SCM_EOL;
b662b7e9 707 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 708 size_t bytes_read;
69f90b0b 709 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 710
38f19074
MW
711 /* Need to capture line and column numbers here. */
712 long line = SCM_LINUM (port);
713 int column = SCM_COL (port) - 1;
714
7337d56d 715 scm_ungetc (chr, port);
b1b5433d 716 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 717 &bytes_read);
69f90b0b 718
b662b7e9 719 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
720
721 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 722 if (scm_is_false (result))
69f90b0b
MG
723 {
724 /* Return a symbol instead of a number */
b1b5433d 725 if (opts->case_insensitive_p)
69f90b0b
MG
726 str = scm_string_downcase_x (str);
727 result = scm_string_to_symbol (str);
728 }
38f19074 729 else if (SCM_NIMP (result))
b1b5433d 730 result = maybe_annotate_source (result, port, opts, line, column);
0f2d19dd 731
69f90b0b 732 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
733 return result;
734}
0f2d19dd 735
7337d56d 736static SCM
b1b5433d 737scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 738{
889975e5
MG
739 SCM result;
740 int ends_with_colon = 0;
69f90b0b 741 size_t bytes_read;
b1b5433d 742 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
b662b7e9 743 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b
MG
744 scm_t_port *pt = SCM_PTAB_ENTRY (port);
745 SCM str;
7337d56d
LC
746
747 scm_ungetc (chr, port);
b1b5433d 748 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 749 &bytes_read);
69f90b0b 750 if (bytes_read > 0)
b662b7e9 751 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 752
69f90b0b
MG
753 if (postfix && ends_with_colon && (bytes_read > 1))
754 {
b662b7e9
LC
755 str = scm_from_stringn (buffer, bytes_read - 1,
756 pt->encoding, pt->ilseq_handler);
69f90b0b 757
b1b5433d 758 if (opts->case_insensitive_p)
69f90b0b
MG
759 str = scm_string_downcase_x (str);
760 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
761 }
7337d56d 762 else
69f90b0b 763 {
b662b7e9
LC
764 str = scm_from_stringn (buffer, bytes_read,
765 pt->encoding, pt->ilseq_handler);
69f90b0b 766
b1b5433d 767 if (opts->case_insensitive_p)
69f90b0b
MG
768 str = scm_string_downcase_x (str);
769 result = scm_string_to_symbol (str);
770 }
7337d56d 771
69f90b0b 772 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
773 return result;
774}
775
776static SCM
b1b5433d 777scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
778#define FUNC_NAME "scm_lreadr"
779{
889975e5 780 SCM result;
7337d56d 781 size_t read;
b662b7e9 782 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 783 unsigned int radix;
69f90b0b
MG
784 SCM str;
785 scm_t_port *pt;
7337d56d
LC
786
787 switch (chr)
788 {
789 case 'B':
790 case 'b':
791 radix = 2;
792 break;
793
794 case 'o':
795 case 'O':
796 radix = 8;
797 break;
798
799 case 'd':
800 case 'D':
801 radix = 10;
802 break;
803
804 case 'x':
805 case 'X':
806 radix = 16;
807 break;
808
809 default:
810 scm_ungetc (chr, port);
811 scm_ungetc ('#', port);
812 radix = 10;
813 }
814
b1b5433d 815 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 816 &read);
69f90b0b
MG
817
818 pt = SCM_PTAB_ENTRY (port);
b662b7e9 819 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
820
821 result = scm_string_to_number (str, scm_from_uint (radix));
822
69f90b0b 823 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
824
825 if (scm_is_true (result))
826 return result;
827
828 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
829
830 return SCM_BOOL_F;
831}
832#undef FUNC_NAME
833
834static SCM
b1b5433d 835scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
836{
837 SCM p;
492faee1
LC
838 long line = SCM_LINUM (port);
839 int column = SCM_COL (port) - 1;
7337d56d
LC
840
841 switch (chr)
842 {
843 case '`':
844 p = scm_sym_quasiquote;
845 break;
846
847 case '\'':
848 p = scm_sym_quote;
849 break;
850
851 case ',':
852 {
889975e5 853 scm_t_wchar c;
7337d56d
LC
854
855 c = scm_getc (port);
856 if ('@' == c)
857 p = scm_sym_uq_splicing;
858 else
0f2d19dd 859 {
7337d56d
LC
860 scm_ungetc (c, port);
861 p = scm_sym_unquote;
0f2d19dd 862 }
7337d56d
LC
863 break;
864 }
0f2d19dd 865
7337d56d
LC
866 default:
867 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 868 "scm_read_quote", chr);
7337d56d 869 abort ();
0f2d19dd 870 }
1cc91f1b 871
b1b5433d
MW
872 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
873 return maybe_annotate_source (p, port, opts, line, column);
7337d56d
LC
874}
875
34f3d47d
AW
876SCM_SYMBOL (sym_syntax, "syntax");
877SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
878SCM_SYMBOL (sym_unsyntax, "unsyntax");
879SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
880
881static SCM
b1b5433d 882scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
34f3d47d
AW
883{
884 SCM p;
885 long line = SCM_LINUM (port);
886 int column = SCM_COL (port) - 1;
887
888 switch (chr)
889 {
890 case '`':
891 p = sym_quasisyntax;
892 break;
893
894 case '\'':
895 p = sym_syntax;
896 break;
897
898 case ',':
899 {
900 int c;
901
902 c = scm_getc (port);
903 if ('@' == c)
904 p = sym_unsyntax_splicing;
905 else
906 {
907 scm_ungetc (c, port);
908 p = sym_unsyntax;
909 }
910 break;
911 }
912
913 default:
914 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
915 "scm_read_syntax", chr);
916 abort ();
917 }
918
b1b5433d
MW
919 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
920 return maybe_annotate_source (p, port, opts, line, column);
34f3d47d
AW
921}
922
cfd15439 923static SCM
b1b5433d 924scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
7c4aad9c 925{
b1b5433d 926 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
7c4aad9c
AW
927
928 if (!scm_is_eq (id, sym_nil))
929 scm_i_input_error ("scm_read_nil", port,
930 "unexpected input while reading #nil: ~a",
931 scm_list_1 (id));
932
933 return SCM_ELISP_NIL;
934}
935
cfd15439 936static SCM
7337d56d 937scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 938{
0f2d19dd
JB
939 int c;
940
889975e5
MG
941 /* We use the get_byte here because there is no need to get the
942 locale correct with comment input. This presumes that newline
943 always represents itself no matter what the encoding is. */
944 for (c = scm_get_byte_or_eof (port);
7337d56d 945 (c != EOF) && (c != '\n');
8a8da78d 946 c = scm_get_byte_or_eof (port));
7337d56d
LC
947
948 return SCM_UNSPECIFIED;
949}
950
7a329029
MW
951/* If the EXPECTED_CHARS are the next ones available from PORT, then
952 consume them and return 1. Otherwise leave the port position where
953 it was and return 0. EXPECTED_CHARS should be all lowercase, and
954 will be matched case-insensitively against the characters read from
955 PORT. */
956static int
957try_read_ci_chars (SCM port, const char *expected_chars)
958{
959 int num_chars_wanted = strlen (expected_chars);
960 int num_chars_read = 0;
961 char *chars_read = alloca (num_chars_wanted);
962 int c;
963
964 while (num_chars_read < num_chars_wanted)
965 {
966 c = scm_getc (port);
967 if (c == EOF)
968 break;
969 else if (tolower (c) != expected_chars[num_chars_read])
970 {
971 scm_ungetc (c, port);
972 break;
973 }
974 else
975 chars_read[num_chars_read++] = c;
976 }
977
978 if (num_chars_read == num_chars_wanted)
979 return 1;
980 else
981 {
982 while (num_chars_read > 0)
983 scm_ungetc (chars_read[--num_chars_read], port);
984 return 0;
985 }
986}
987
7337d56d
LC
988\f
989/* Sharp readers, i.e. readers called after a `#' sign has been read. */
990
991static SCM
992scm_read_boolean (int chr, SCM port)
993{
994 switch (chr)
0f2d19dd 995 {
7337d56d
LC
996 case 't':
997 case 'T':
7a329029 998 try_read_ci_chars (port, "rue");
7337d56d
LC
999 return SCM_BOOL_T;
1000
1001 case 'f':
1002 case 'F':
7a329029 1003 try_read_ci_chars (port, "alse");
7337d56d 1004 return SCM_BOOL_F;
0f2d19dd 1005 }
7337d56d
LC
1006
1007 return SCM_UNSPECIFIED;
1008}
1009
1010static SCM
b1b5433d 1011scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1012#define FUNC_NAME "scm_lreadr"
1013{
69f90b0b
MG
1014 char buffer[READER_CHAR_NAME_MAX_SIZE];
1015 SCM charname;
1016 size_t charname_len, bytes_read;
889975e5
MG
1017 scm_t_wchar cp;
1018 int overflow;
69f90b0b 1019 scm_t_port *pt;
889975e5 1020
b1b5433d
MW
1021 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1022 &bytes_read);
889975e5 1023 if (overflow)
c1a0ba1c 1024 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 1025
69f90b0b 1026 if (bytes_read == 0)
0f2d19dd 1027 {
7337d56d
LC
1028 chr = scm_getc (port);
1029 if (chr == EOF)
1030 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1031 "while reading character", SCM_EOL);
1032
1033 /* CHR must be a token delimiter, like a whitespace. */
1034 return (SCM_MAKE_CHAR (chr));
0f2d19dd 1035 }
7337d56d 1036
69f90b0b 1037 pt = SCM_PTAB_ENTRY (port);
7337d56d 1038
69f90b0b
MG
1039 /* Simple ASCII characters can be processed immediately. Also, simple
1040 ISO-8859-1 characters can be processed immediately if the encoding for this
1041 port is ISO-8859-1. */
1042 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1043 {
1044 SCM_COL (port) += 1;
1045 return SCM_MAKE_CHAR (buffer[0]);
1046 }
1047
1048 /* Otherwise, convert the buffer into a proper scheme string for
1049 processing. */
1050 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1051 pt->ilseq_handler);
1052 charname_len = scm_i_string_length (charname);
1053 SCM_COL (port) += charname_len;
889975e5 1054 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
1055 if (charname_len == 1)
1056 return SCM_MAKE_CHAR (cp);
1057
1058 /* Ignore dotted circles, which may be used to keep combining characters from
1059 combining with the backslash in #\charname. */
0dcd7e61
MG
1060 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1061 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1062
889975e5 1063 if (cp >= '0' && cp < '8')
7337d56d
LC
1064 {
1065 /* Dirk:FIXME:: This type of character syntax is not R5RS
1066 * compliant. Further, it should be verified that the constant
0ffc78e3 1067 * does only consist of octal digits. */
889975e5 1068 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 1069 if (SCM_I_INUMP (p))
0ffc78e3 1070 {
e25f3727 1071 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
1072 if (SCM_IS_UNICODE_CHAR (c))
1073 return SCM_MAKE_CHAR (c);
1074 else
0f3a70cf 1075 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
1076 "out-of-range octal character escape: ~a",
1077 scm_list_1 (charname));
1078 }
7337d56d
LC
1079 }
1080
0f3a70cf 1081 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1082 {
1083 SCM p;
0f3a70cf 1084
dea901d6
MG
1085 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1086 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1087 scm_from_uint (16));
1088 if (SCM_I_INUMP (p))
1089 {
e25f3727 1090 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1091 if (SCM_IS_UNICODE_CHAR (c))
1092 return SCM_MAKE_CHAR (c);
1093 else
1094 scm_i_input_error (FUNC_NAME, port,
1095 "out-of-range hex character escape: ~a",
1096 scm_list_1 (charname));
1097 }
1098 }
1099
889975e5
MG
1100 /* The names of characters should never have non-Latin1
1101 characters. */
1102 if (scm_i_is_narrow_string (charname)
1103 || scm_i_try_narrow_string (charname))
4769c9db
AW
1104 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1105 charname_len);
1106 if (scm_is_true (ch))
1107 return ch;
1108 }
7337d56d 1109
7337d56d 1110 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1111 scm_list_1 (charname));
7337d56d
LC
1112
1113 return SCM_UNSPECIFIED;
0f2d19dd 1114}
db4b4ca6 1115#undef FUNC_NAME
0f2d19dd 1116
cfd15439 1117static SCM
b1b5433d 1118scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1119{
1120 SCM symbol;
1121
1122 /* Read the symbol that comprises the keyword. Doing this instead of
1123 invoking a specific symbol reader function allows `scm_read_keyword ()'
1124 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1125
7337d56d 1126 XXX: This implementation allows sloppy syntaxes like `#: key'. */
b1b5433d 1127 symbol = scm_read_expression (port, opts);
7337d56d 1128 if (!scm_is_symbol (symbol))
7f74cf9a 1129 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1130 "keyword prefix `~a' not followed by a symbol: ~s",
1131 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1132
1133 return (scm_symbol_to_keyword (symbol));
1134}
1135
cfd15439 1136static SCM
b1b5433d
MW
1137scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1138 long line, int column)
09a4f039 1139{
7337d56d
LC
1140 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1141 guarantee that it's going to do what we want. After all, this is an
1142 implementation detail of `scm_read_vector ()', not a desirable
1143 property. */
b1b5433d
MW
1144 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1145 port, opts, line, column);
b131b233
MW
1146}
1147
493ceb99
MW
1148/* Helper used by scm_read_array */
1149static int
1150read_decimal_integer (SCM port, int c, ssize_t *resp)
1151{
1152 ssize_t sign = 1;
1153 ssize_t res = 0;
1154 int got_it = 0;
1155
1156 if (c == '-')
1157 {
1158 sign = -1;
1159 c = scm_getc (port);
1160 }
1161
1162 while ('0' <= c && c <= '9')
1163 {
a662686a
MW
1164 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1165 scm_i_input_error ("read_decimal_integer", port,
1166 "number too large", SCM_EOL);
493ceb99
MW
1167 res = 10*res + c-'0';
1168 got_it = 1;
1169 c = scm_getc (port);
1170 }
1171
1172 if (got_it)
1173 *resp = sign * res;
1174 return c;
1175}
1176
1177/* Read an array. This function can also read vectors and uniform
1178 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1179 handled here.
1180
10744b7c 1181 C is the first character read after the '#'. */
b131b233 1182static SCM
b1b5433d 1183scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
b131b233 1184{
493ceb99
MW
1185 ssize_t rank;
1186 scm_t_wchar tag_buf[8];
1187 int tag_len;
1188
1189 SCM tag, shape = SCM_BOOL_F, elements, array;
1190
1191 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1192 the array code can not deal with zero-length dimensions yet, and
10744b7c 1193 we want to allow zero-length vectors, of course. */
493ceb99 1194 if (c == '(')
b1b5433d 1195 return scm_read_vector (c, port, opts, line, column);
493ceb99 1196
10744b7c 1197 /* Disambiguate between '#f' and uniform floating point vectors. */
493ceb99
MW
1198 if (c == 'f')
1199 {
1200 c = scm_getc (port);
1201 if (c != '3' && c != '6')
1202 {
7a329029
MW
1203 if (c == 'a' && try_read_ci_chars (port, "lse"))
1204 return SCM_BOOL_F;
1205 else if (c != EOF)
1206 scm_ungetc (c, port);
493ceb99
MW
1207 return SCM_BOOL_F;
1208 }
1209 rank = 1;
1210 tag_buf[0] = 'f';
1211 tag_len = 1;
1212 goto continue_reading_tag;
1213 }
1214
1215 /* Read rank. */
1216 rank = 1;
1217 c = read_decimal_integer (port, c, &rank);
1218 if (rank < 0)
1219 scm_i_input_error (NULL, port, "array rank must be non-negative",
1220 SCM_EOL);
1221
1222 /* Read tag. */
1223 tag_len = 0;
1224 continue_reading_tag:
1225 while (c != EOF && c != '(' && c != '@' && c != ':'
1226 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1227 {
1228 tag_buf[tag_len++] = c;
1229 c = scm_getc (port);
1230 }
1231 if (tag_len == 0)
1232 tag = SCM_BOOL_T;
b131b233 1233 else
493ceb99
MW
1234 {
1235 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1236 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1237 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1238 scm_list_1 (tag));
1239 }
1240
1241 /* Read shape. */
1242 if (c == '@' || c == ':')
1243 {
1244 shape = SCM_EOL;
1245
1246 do
1247 {
1248 ssize_t lbnd = 0, len = 0;
1249 SCM s;
1250
1251 if (c == '@')
1252 {
1253 c = scm_getc (port);
1254 c = read_decimal_integer (port, c, &lbnd);
1255 }
1256
1257 s = scm_from_ssize_t (lbnd);
1258
1259 if (c == ':')
1260 {
1261 c = scm_getc (port);
1262 c = read_decimal_integer (port, c, &len);
1263 if (len < 0)
1264 scm_i_input_error (NULL, port,
1265 "array length must be non-negative",
1266 SCM_EOL);
1267
1268 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1269 }
1270
1271 shape = scm_cons (s, shape);
1272 } while (c == '@' || c == ':');
1273
1274 shape = scm_reverse_x (shape, SCM_EOL);
1275 }
1276
1277 /* Read nested lists of elements. */
1278 if (c != '(')
1279 scm_i_input_error (NULL, port,
1280 "missing '(' in vector or array literal",
1281 SCM_EOL);
b1b5433d 1282 elements = scm_read_sexp (c, port, opts);
493ceb99
MW
1283
1284 if (scm_is_false (shape))
1285 shape = scm_from_ssize_t (rank);
1286 else if (scm_ilength (shape) != rank)
1287 scm_i_input_error
1288 (NULL, port,
1289 "the number of shape specifications must match the array rank",
1290 SCM_EOL);
1291
1292 /* Handle special print syntax of rank zero arrays; see
1293 scm_i_print_array for a rationale. */
1294 if (rank == 0)
1295 {
1296 if (!scm_is_pair (elements))
1297 scm_i_input_error (NULL, port,
1298 "too few elements in array literal, need 1",
1299 SCM_EOL);
1300 if (!scm_is_null (SCM_CDR (elements)))
1301 scm_i_input_error (NULL, port,
1302 "too many elements in array literal, want 1",
1303 SCM_EOL);
1304 elements = SCM_CAR (elements);
1305 }
1306
1307 /* Construct array, annotate with source location, and return. */
1308 array = scm_list_to_typed_array (tag, shape, elements);
b1b5433d 1309 return maybe_annotate_source (array, port, opts, line, column);
7337d56d 1310}
09a4f039 1311
cfd15439 1312static SCM
b1b5433d
MW
1313scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1314 long line, int column)
7337d56d 1315{
b1b5433d 1316 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1317}
1318
0ba0b384 1319static SCM
b1b5433d
MW
1320scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1321 long line, int column)
0ba0b384
LC
1322{
1323 chr = scm_getc (port);
1324 if (chr != 'u')
1325 goto syntax;
1326
1327 chr = scm_getc (port);
1328 if (chr != '8')
1329 goto syntax;
1330
1331 chr = scm_getc (port);
1332 if (chr != '(')
1333 goto syntax;
1334
b131b233 1335 return maybe_annotate_source
b1b5433d
MW
1336 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1337 port, opts, line, column);
0ba0b384
LC
1338
1339 syntax:
1340 scm_i_input_error ("read_bytevector", port,
1341 "invalid bytevector prefix",
1342 SCM_MAKE_CHAR (chr));
1343 return SCM_UNSPECIFIED;
1344}
1345
7337d56d 1346static SCM
b1b5433d
MW
1347scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1348 long line, int column)
7337d56d
LC
1349{
1350 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1351 terribly inefficient but who cares? */
1352 SCM s_bits = SCM_EOL;
1353
1354 for (chr = scm_getc (port);
1355 (chr != EOF) && ((chr == '0') || (chr == '1'));
1356 chr = scm_getc (port))
09a4f039 1357 {
7337d56d 1358 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1359 }
7337d56d
LC
1360
1361 if (chr != EOF)
1362 scm_ungetc (chr, port);
1363
b131b233
MW
1364 return maybe_annotate_source
1365 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
b1b5433d 1366 port, opts, line, column);
7337d56d
LC
1367}
1368
cfd15439 1369static SCM
889975e5 1370scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1371{
1372 int bang_seen = 0;
1373
1374 for (;;)
09a4f039 1375 {
58b1db5f 1376 int c = scm_getc (port);
62850ef3 1377
7337d56d
LC
1378 if (c == EOF)
1379 scm_i_input_error ("skip_block_comment", port,
1380 "unterminated `#! ... !#' comment", SCM_EOL);
1381
1382 if (c == '!')
1383 bang_seen = 1;
1384 else if (c == '#' && bang_seen)
1385 break;
1386 else
1387 bang_seen = 0;
1388 }
1389
1390 return SCM_UNSPECIFIED;
1391}
1392
9331ffd8
MW
1393static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1394 int value);
bf9eb54a
MW
1395static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1396 int value);
1397static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1398 int value);
9331ffd8 1399
d7fcaec3 1400static SCM
b1b5433d 1401scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
911b03b2 1402{
02327c0c
MW
1403 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1404 int c;
1405 int i = 0;
1406
1407 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
911b03b2 1408 {
02327c0c
MW
1409 c = scm_getc (port);
1410 if (c == EOF)
1411 scm_i_input_error ("skip_block_comment", port,
1412 "unterminated `#! ... !#' comment", SCM_EOL);
1413 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1414 name[i++] = c;
1415 else if (CHAR_IS_DELIMITER (c))
1416 {
1417 scm_ungetc (c, port);
1418 name[i] = '\0';
1419 if (0 == strcmp ("r6rs", name))
1420 ; /* Silently ignore */
9331ffd8
MW
1421 else if (0 == strcmp ("fold-case", name))
1422 set_port_case_insensitive_p (port, opts, 1);
1423 else if (0 == strcmp ("no-fold-case", name))
1424 set_port_case_insensitive_p (port, opts, 0);
bf9eb54a
MW
1425 else if (0 == strcmp ("curly-infix", name))
1426 set_port_curly_infix_p (port, opts, 1);
1427 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1428 {
1429 set_port_curly_infix_p (port, opts, 1);
1430 set_port_square_brackets_p (port, opts, 0);
1431 }
02327c0c
MW
1432 else
1433 break;
1434
1435 return SCM_UNSPECIFIED;
1436 }
fa746547
MW
1437 else
1438 {
1439 scm_ungetc (c, port);
1440 break;
1441 }
911b03b2 1442 }
02327c0c
MW
1443 while (i > 0)
1444 scm_ungetc (name[--i], port);
1445 return scm_read_scsh_block_comment (chr, port);
911b03b2
JG
1446}
1447
620c8965
LC
1448static SCM
1449scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1450{
1451 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1452 nested. So care must be taken. */
1453 int nesting_level = 1;
6d5f8c32
AW
1454
1455 int a = scm_getc (port);
1456
1457 if (a == EOF)
1458 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1459 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1460
1461 while (nesting_level > 0)
1462 {
6d5f8c32 1463 int b = scm_getc (port);
620c8965 1464
6d5f8c32 1465 if (b == EOF)
cd169c5a 1466 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1467 "unterminated `#| ... |#' comment", SCM_EOL);
1468
6d5f8c32
AW
1469 if (a == '|' && b == '#')
1470 {
1471 nesting_level--;
1472 b = EOF;
1473 }
1474 else if (a == '#' && b == '|')
1475 {
1476 nesting_level++;
1477 b = EOF;
1478 }
1479
1480 a = b;
620c8965
LC
1481 }
1482
1483 return SCM_UNSPECIFIED;
1484}
1485
34f3d47d 1486static SCM
b1b5433d
MW
1487scm_read_commented_expression (scm_t_wchar chr, SCM port,
1488 scm_t_read_opts *opts)
34f3d47d 1489{
889975e5 1490 scm_t_wchar c;
34f3d47d 1491
b1b5433d 1492 c = flush_ws (port, opts, (char *) NULL);
34f3d47d
AW
1493 if (EOF == c)
1494 scm_i_input_error ("read_commented_expression", port,
1495 "no expression after #; comment", SCM_EOL);
1496 scm_ungetc (c, port);
b1b5433d 1497 scm_read_expression (port, opts);
34f3d47d
AW
1498 return SCM_UNSPECIFIED;
1499}
1500
7337d56d 1501static SCM
889975e5 1502scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1503{
1504 /* Guile's extended symbol read syntax looks like this:
1505
1506 #{This is all a symbol name}#
1507
1508 So here, CHR is expected to be `{'. */
d9527cfa 1509 int saw_brace = 0;
7337d56d 1510 size_t len = 0;
190d4b0d 1511 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1512
889975e5 1513 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1514
1515 while ((chr = scm_getc (port)) != EOF)
1516 {
1517 if (saw_brace)
09a4f039 1518 {
7337d56d
LC
1519 if (chr == '#')
1520 {
7337d56d
LC
1521 break;
1522 }
1523 else
1524 {
1525 saw_brace = 0;
889975e5 1526 scm_i_string_set_x (buf, len++, '}');
7337d56d 1527 }
09a4f039 1528 }
d9527cfa
AW
1529
1530 if (chr == '}')
7337d56d 1531 saw_brace = 1;
d9527cfa
AW
1532 else if (chr == '\\')
1533 {
1534 /* It used to be that print.c would print extended-read-syntax
1535 symbols with backslashes before "non-standard" chars, but
1536 this routine wouldn't do anything with those escapes.
1537 Bummer. What we've done is to change print.c to output
1538 R6RS hex escapes for those characters, relying on the fact
1539 that the extended read syntax would never put a `\' before
1540 an `x'. For now, we just ignore other instances of
1541 backslash in the string. */
1542 switch ((chr = scm_getc (port)))
1543 {
1544 case EOF:
1545 goto done;
1546 case 'x':
1547 {
1548 scm_t_wchar c;
1549
1550 SCM_READ_HEX_ESCAPE (10, ';');
1551 scm_i_string_set_x (buf, len++, c);
1552 break;
1553
1554 str_eof:
1555 chr = EOF;
1556 goto done;
1557
1558 bad_escaped:
1559 scm_i_string_stop_writing ();
1560 scm_i_input_error ("scm_read_extended_symbol", port,
1561 "illegal character in escape sequence: ~S",
1562 scm_list_1 (SCM_MAKE_CHAR (c)));
1563 break;
1564 }
1565 default:
1566 scm_i_string_set_x (buf, len++, chr);
1567 break;
1568 }
1569 }
7337d56d 1570 else
d9527cfa 1571 scm_i_string_set_x (buf, len++, chr);
62850ef3 1572
889975e5 1573 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1574 {
7f991c7d
LC
1575 SCM addy;
1576
889975e5 1577 scm_i_string_stop_writing ();
190d4b0d 1578 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1579 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1580 len = 0;
889975e5 1581 buf = scm_i_string_start_writing (buf);
7337d56d 1582 }
7337d56d 1583 }
d9527cfa
AW
1584
1585 done:
889975e5 1586 scm_i_string_stop_writing ();
d9527cfa
AW
1587 if (chr == EOF)
1588 scm_i_input_error ("scm_read_extended_symbol", port,
1589 "end of file while reading symbol", SCM_EOL);
7337d56d 1590
889975e5 1591 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1592}
1593
1594
1595\f
1596/* Top-level token readers, i.e., dispatchers. */
1597
1598static SCM
b1b5433d 1599scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1600{
1601 SCM proc;
1602
1603 proc = scm_get_hash_procedure (chr);
1604 if (scm_is_true (scm_procedure_p (proc)))
1605 {
1606 long line = SCM_LINUM (port);
1607 int column = SCM_COL (port) - 2;
1608 SCM got;
1609
1610 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14 1611
3655ed89
MW
1612 if (opts->record_positions_p && SCM_NIMP (got)
1613 && !scm_i_has_source_properties (got))
26c8cc14
AW
1614 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1615
1616 return got;
09a4f039 1617 }
7337d56d
LC
1618
1619 return SCM_UNSPECIFIED;
1620}
1621
1622/* The reader for the sharp `#' character. It basically dispatches reads
1623 among the above token readers. */
1624static SCM
b1b5433d
MW
1625scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1626 long line, int column)
7337d56d
LC
1627#define FUNC_NAME "scm_lreadr"
1628{
1629 SCM result;
1630
1631 chr = scm_getc (port);
1632
b1b5433d 1633 result = scm_read_sharp_extension (chr, port, opts);
7337d56d
LC
1634 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1635 return result;
1636
1637 switch (chr)
1638 {
1639 case '\\':
b1b5433d 1640 return (scm_read_character (chr, port, opts));
7337d56d 1641 case '(':
b1b5433d 1642 return (scm_read_vector (chr, port, opts, line, column));
7337d56d
LC
1643 case 's':
1644 case 'u':
1645 case 'f':
df941b5b 1646 case 'c':
7337d56d 1647 /* This one may return either a boolean or an SRFI-4 vector. */
b1b5433d 1648 return (scm_read_srfi4_vector (chr, port, opts, line, column));
0ba0b384 1649 case 'v':
b1b5433d 1650 return (scm_read_bytevector (chr, port, opts, line, column));
7337d56d 1651 case '*':
b1b5433d 1652 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
7337d56d
LC
1653 case 't':
1654 case 'T':
1655 case 'F':
7337d56d
LC
1656 return (scm_read_boolean (chr, port));
1657 case ':':
b1b5433d 1658 return (scm_read_keyword (chr, port, opts));
7337d56d
LC
1659 case '0': case '1': case '2': case '3': case '4':
1660 case '5': case '6': case '7': case '8': case '9':
1661 case '@':
1662#if SCM_ENABLE_DEPRECATED
1663 /* See below for 'i' and 'e'. */
1664 case 'a':
7337d56d
LC
1665 case 'y':
1666 case 'h':
1667 case 'l':
1668#endif
b1b5433d 1669 return (scm_read_array (chr, port, opts, line, column));
7337d56d
LC
1670
1671 case 'i':
1672 case 'e':
1673#if SCM_ENABLE_DEPRECATED
1674 {
1675 /* When next char is '(', it really is an old-style
1676 uniform array. */
889975e5 1677 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1678 if (next_c != EOF)
1679 scm_ungetc (next_c, port);
1680 if (next_c == '(')
b1b5433d 1681 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1682 /* Fall through. */
1683 }
1684#endif
1685 case 'b':
1686 case 'B':
1687 case 'o':
1688 case 'O':
1689 case 'd':
1690 case 'D':
1691 case 'x':
1692 case 'X':
1693 case 'I':
1694 case 'E':
b1b5433d 1695 return (scm_read_number_and_radix (chr, port, opts));
7337d56d
LC
1696 case '{':
1697 return (scm_read_extended_symbol (chr, port));
1698 case '!':
b1b5433d 1699 return (scm_read_shebang (chr, port, opts));
34f3d47d 1700 case ';':
b1b5433d 1701 return (scm_read_commented_expression (chr, port, opts));
34f3d47d
AW
1702 case '`':
1703 case '\'':
1704 case ',':
b1b5433d 1705 return (scm_read_syntax (chr, port, opts));
7c4aad9c 1706 case 'n':
b1b5433d 1707 return (scm_read_nil (chr, port, opts));
7337d56d 1708 default:
b1b5433d 1709 result = scm_read_sharp_extension (chr, port, opts);
7337d56d 1710 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1711 {
1712 /* To remain compatible with 1.8 and earlier, the following
1713 characters have lower precedence than `read-hash-extend'
1714 characters. */
1715 switch (chr)
1716 {
1717 case '|':
1718 return scm_read_r6rs_block_comment (chr, port);
1719 default:
1720 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1721 scm_list_1 (SCM_MAKE_CHAR (chr)));
1722 }
1723 }
7337d56d
LC
1724 else
1725 return result;
1726 }
1727
1728 return SCM_UNSPECIFIED;
1729}
1730#undef FUNC_NAME
1731
1732static SCM
bf9eb54a
MW
1733read_inner_expression (SCM port, scm_t_read_opts *opts)
1734#define FUNC_NAME "read_inner_expression"
7337d56d
LC
1735{
1736 while (1)
1737 {
cfd15439 1738 scm_t_wchar chr;
7337d56d
LC
1739
1740 chr = scm_getc (port);
1741
1742 switch (chr)
1743 {
1744 case SCM_WHITE_SPACES:
1745 case SCM_LINE_INCREMENTORS:
1746 break;
1747 case ';':
1748 (void) scm_read_semicolon_comment (chr, port);
1749 break;
bf9eb54a
MW
1750 case '{':
1751 if (opts->curly_infix_p)
1752 {
1753 if (opts->neoteric_p)
1754 return scm_read_sexp (chr, port, opts);
1755 else
1756 {
1757 SCM expr;
1758
1759 /* Enable neoteric expressions within curly braces */
1760 opts->neoteric_p = 1;
1761 expr = scm_read_sexp (chr, port, opts);
1762 opts->neoteric_p = 0;
1763 return expr;
1764 }
1765 }
1766 else
1767 return scm_read_mixed_case_symbol (chr, port, opts);
5afa815c 1768 case '[':
bf9eb54a
MW
1769 if (opts->square_brackets_p)
1770 return scm_read_sexp (chr, port, opts);
1771 else if (opts->curly_infix_p)
1772 {
1773 /* The syntax of neoteric expressions requires that '[' be
1774 a delimiter when curly-infix is enabled, so it cannot
1775 be part of an unescaped symbol. We might as well do
1776 something useful with it, so we adopt Kawa's convention:
1777 [...] => ($bracket-list$ ...) */
1778 long line = SCM_LINUM (port);
1779 int column = SCM_COL (port) - 1;
1780 return maybe_annotate_source
1781 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1782 port, opts, line, column);
1783 }
1784 else
1785 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d 1786 case '(':
b1b5433d 1787 return (scm_read_sexp (chr, port, opts));
7337d56d 1788 case '"':
b1b5433d 1789 return (scm_read_string (chr, port, opts));
7337d56d
LC
1790 case '\'':
1791 case '`':
1792 case ',':
b1b5433d 1793 return (scm_read_quote (chr, port, opts));
7337d56d
LC
1794 case '#':
1795 {
b131b233
MW
1796 long line = SCM_LINUM (port);
1797 int column = SCM_COL (port) - 1;
b1b5433d 1798 SCM result = scm_read_sharp (chr, port, opts, line, column);
7337d56d
LC
1799 if (scm_is_eq (result, SCM_UNSPECIFIED))
1800 /* We read a comment or some such. */
1801 break;
1802 else
1803 return result;
1804 }
1805 case ')':
1806 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1807 break;
bf9eb54a
MW
1808 case '}':
1809 if (opts->curly_infix_p)
1810 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1811 else
1812 return scm_read_mixed_case_symbol (chr, port, opts);
a4e47229 1813 case ']':
b1b5433d 1814 if (opts->square_brackets_p)
a4e47229
MG
1815 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1816 /* otherwise fall through */
7337d56d
LC
1817 case EOF:
1818 return SCM_EOF_VAL;
1819 case ':':
b1b5433d
MW
1820 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1821 return scm_symbol_to_keyword (scm_read_expression (port, opts));
7337d56d
LC
1822 /* Fall through. */
1823
1824 default:
1825 {
1826 if (((chr >= '0') && (chr <= '9'))
1827 || (strchr ("+-.", chr)))
b1b5433d 1828 return (scm_read_number (chr, port, opts));
7337d56d 1829 else
b1b5433d 1830 return (scm_read_mixed_case_symbol (chr, port, opts));
7337d56d
LC
1831 }
1832 }
1833 }
1834}
1835#undef FUNC_NAME
1836
bf9eb54a
MW
1837static SCM
1838scm_read_expression (SCM port, scm_t_read_opts *opts)
1839#define FUNC_NAME "scm_read_expression"
1840{
1841 if (!opts->neoteric_p)
1842 return read_inner_expression (port, opts);
1843 else
1844 {
1845 long line = 0;
1846 int column = 0;
1847 SCM expr;
1848
1849 if (opts->record_positions_p)
1850 {
1851 /* We need to get the position of the first non-whitespace
1852 character in order to correctly annotate neoteric
1853 expressions. For example, for the expression 'f(x)', the
1854 first call to 'read_inner_expression' reads the 'f' (which
1855 cannot be annotated), and then we later read the '(x)' and
1856 use it to construct the new list (f x). */
1857 int c = flush_ws (port, opts, (char *) NULL);
1858 if (c == EOF)
1859 return SCM_EOF_VAL;
1860 scm_ungetc (c, port);
1861 line = SCM_LINUM (port);
1862 column = SCM_COL (port);
1863 }
1864
1865 expr = read_inner_expression (port, opts);
1866
1867 /* 'expr' is the first component of the neoteric expression. Now
1868 we loop, and as long as the next character is '(', '[', or '{',
1869 (without any intervening whitespace), we use it to construct a
1870 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1871 for (;;)
1872 {
1873 int chr = scm_getc (port);
1874
1875 if (chr == '(')
1876 /* e(...) => (e ...) */
1877 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1878 else if (chr == '[')
1879 /* e[...] => ($bracket-apply$ e ...) */
1880 expr = scm_cons (sym_bracket_apply,
1881 scm_cons (expr,
1882 scm_read_sexp (chr, port, opts)));
1883 else if (chr == '{')
1884 {
1885 SCM arg = scm_read_sexp (chr, port, opts);
1886
1887 if (scm_is_null (arg))
1888 expr = scm_list_1 (expr); /* e{} => (e) */
1889 else
1890 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1891 }
1892 else
1893 {
1894 if (chr != EOF)
1895 scm_ungetc (chr, port);
1896 break;
1897 }
1898 maybe_annotate_source (expr, port, opts, line, column);
1899 }
1900 return expr;
1901 }
1902}
1903#undef FUNC_NAME
1904
7337d56d
LC
1905\f
1906/* Actual reader. */
1907
ea8c9761
MW
1908static void init_read_options (SCM port, scm_t_read_opts *opts);
1909
7337d56d
LC
1910SCM_DEFINE (scm_read, "read", 0, 1, 0,
1911 (SCM port),
1912 "Read an s-expression from the input port @var{port}, or from\n"
1913 "the current input port if @var{port} is not specified.\n"
1914 "Any whitespace before the next token is discarded.")
1915#define FUNC_NAME s_scm_read
1916{
b1b5433d 1917 scm_t_read_opts opts;
7337d56d
LC
1918 int c;
1919
1920 if (SCM_UNBNDP (port))
1921 port = scm_current_input_port ();
1922 SCM_VALIDATE_OPINPORT (1, port);
1923
ea8c9761 1924 init_read_options (port, &opts);
b1b5433d
MW
1925
1926 c = flush_ws (port, &opts, (char *) NULL);
7337d56d
LC
1927 if (EOF == c)
1928 return SCM_EOF_VAL;
1929 scm_ungetc (c, port);
1930
b1b5433d 1931 return (scm_read_expression (port, &opts));
09a4f039 1932}
db4b4ca6 1933#undef FUNC_NAME
09a4f039 1934
0f2d19dd
JB
1935
1936\f
1937
14de3b42
GH
1938/* Manipulate the read-hash-procedures alist. This could be written in
1939 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1940SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1941 (SCM chr, SCM proc),
dc7fa443
MG
1942 "Install the procedure @var{proc} for reading expressions\n"
1943 "starting with the character sequence @code{#} and @var{chr}.\n"
1944 "@var{proc} will be called with two arguments: the character\n"
1945 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1946 "returned will be the return value of @code{read}. \n"
1947 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1948 )
1bbd0b84 1949#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1950{
fed9c9a2
GH
1951 SCM this;
1952 SCM prev;
1953
36284627 1954 SCM_VALIDATE_CHAR (1, chr);
7888309b 1955 SCM_ASSERT (scm_is_false (proc)
bc36d050 1956 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1957 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1958
14de3b42 1959 /* Check if chr is already in the alist. */
d458073b 1960 this = scm_i_read_hash_procedures_ref ();
14de3b42 1961 prev = SCM_BOOL_F;
fed9c9a2
GH
1962 while (1)
1963 {
d2e53ed6 1964 if (scm_is_null (this))
fed9c9a2
GH
1965 {
1966 /* not found, so add it to the beginning. */
7888309b 1967 if (scm_is_true (proc))
fed9c9a2 1968 {
d458073b
AR
1969 SCM new = scm_cons (scm_cons (chr, proc),
1970 scm_i_read_hash_procedures_ref ());
1971 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1972 }
1973 break;
1974 }
bc36d050 1975 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1976 {
1977 /* already in the alist. */
7888309b 1978 if (scm_is_false (proc))
14de3b42
GH
1979 {
1980 /* remove it. */
7888309b 1981 if (scm_is_false (prev))
14de3b42 1982 {
d458073b
AR
1983 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1984 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1985 }
1986 else
1987 scm_set_cdr_x (prev, SCM_CDR (this));
1988 }
fed9c9a2 1989 else
14de3b42
GH
1990 {
1991 /* replace it. */
1992 scm_set_cdr_x (SCM_CAR (this), proc);
1993 }
fed9c9a2
GH
1994 break;
1995 }
1996 prev = this;
1997 this = SCM_CDR (this);
1998 }
deca31e1 1999
deca31e1
GH
2000 return SCM_UNSPECIFIED;
2001}
1bbd0b84 2002#undef FUNC_NAME
0f2d19dd 2003
deca31e1
GH
2004/* Recover the read-hash procedure corresponding to char c. */
2005static SCM
6e8d25a6 2006scm_get_hash_procedure (int c)
deca31e1 2007{
d458073b 2008 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 2009
deca31e1
GH
2010 while (1)
2011 {
d2e53ed6 2012 if (scm_is_null (rest))
deca31e1
GH
2013 return SCM_BOOL_F;
2014
7866a09b 2015 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
2016 return SCM_CDAR (rest);
2017
2018 rest = SCM_CDR (rest);
2019 }
2020}
1cc91f1b 2021
889975e5
MG
2022#define SCM_ENCODING_SEARCH_SIZE (500)
2023
f8a1c9a8
LC
2024/* Search the first few hundred characters of a file for an Emacs-like coding
2025 declaration. Returns either NULL or a string whose storage has been
2026 allocated with `scm_gc_malloc ()'. */
889975e5 2027char *
f8a1c9a8 2028scm_i_scan_for_encoding (SCM port)
889975e5 2029{
d900843c 2030 scm_t_port *pt;
889975e5 2031 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 2032 size_t bytes_read, encoding_length, i;
889975e5 2033 char *encoding = NULL;
daedbca7 2034 char *pos, *encoding_start;
889975e5
MG
2035 int in_comment;
2036
d900843c 2037 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 2038
d900843c
AW
2039 if (pt->rw_active == SCM_PORT_WRITE)
2040 scm_flush (port);
49bb5bd3 2041
d900843c
AW
2042 if (pt->rw_random)
2043 pt->rw_active = SCM_PORT_READ;
2044
2045 if (pt->read_pos == pt->read_end)
2046 {
2047 /* We can use the read buffer, and thus avoid a seek. */
2048 if (scm_fill_input (port) == EOF)
2049 return NULL;
2050
2051 bytes_read = pt->read_end - pt->read_pos;
2052 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2053 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2054
2055 if (bytes_read <= 1)
2056 /* An unbuffered port -- don't scan. */
2057 return NULL;
2058
2059 memcpy (header, pt->read_pos, bytes_read);
2060 header[bytes_read] = '\0';
2061 }
2062 else
2063 {
2064 /* Try to read some bytes and then seek back. Not all ports
2065 support seeking back; and indeed some file ports (like
2066 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2067 check performed by SCM_FPORT_FDES---but fail to seek
2068 backwards. Hence this block comes second. We prefer to use
2069 the read buffer in-place. */
2070 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2071 return NULL;
2072
2073 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2074 header[bytes_read] = '\0';
419c8736 2075 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
d900843c 2076 }
889975e5 2077
889975e5
MG
2078 /* search past "coding[:=]" */
2079 pos = header;
2080 while (1)
2081 {
2082 if ((pos = strstr(pos, "coding")) == NULL)
2083 return NULL;
2084
2085 pos += strlen("coding");
2086 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2087 (*pos == ':' || *pos == '='))
2088 {
2089 pos ++;
2090 break;
2091 }
2092 }
2093
2094 /* skip spaces */
2095 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2096 (*pos == ' ' || *pos == '\t'))
2097 pos ++;
2098
2099 /* grab the next token */
daedbca7 2100 encoding_start = pos;
889975e5 2101 i = 0;
daedbca7
MG
2102 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2103 && encoding_start + i - header < bytes_read
2104 && (isalnum ((int) encoding_start[i])
2105 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
2106 i++;
2107
daedbca7
MG
2108 encoding_length = i;
2109 if (encoding_length == 0)
889975e5
MG
2110 return NULL;
2111
daedbca7
MG
2112 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2113 for (i = 0; i < encoding_length; i++)
026ed239 2114 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
2115
2116 /* push backwards to make sure we were in a comment */
2117 in_comment = 0;
daedbca7
MG
2118 pos = encoding_start;
2119 while (pos >= header)
889975e5 2120 {
8a12aeb9
AW
2121 if (*pos == ';')
2122 {
2123 in_comment = 1;
2124 break;
2125 }
2126 else if (*pos == '\n' || pos == header)
889975e5
MG
2127 {
2128 /* This wasn't in a semicolon comment. Check for a
2129 hash-bang comment. */
2130 char *beg = strstr (header, "#!");
2131 char *end = strstr (header, "!#");
8a12aeb9 2132 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
2133 in_comment = 1;
2134 break;
2135 }
8a12aeb9
AW
2136 else
2137 {
2138 pos --;
2139 continue;
2140 }
889975e5
MG
2141 }
2142 if (!in_comment)
f8a1c9a8
LC
2143 /* This wasn't in a comment */
2144 return NULL;
2145
889975e5
MG
2146 return encoding;
2147}
2148
2149SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2150 (SCM port),
a270e133 2151 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 2152 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
2153 "The coding declaration is of the form\n"
2154 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2155 "\n"
2156 "Returns a string containing the character encoding of the file\n"
2157 "if a declaration was found, or @code{#f} otherwise.\n")
2158#define FUNC_NAME s_scm_file_encoding
2159{
2160 char *enc;
2161 SCM s_enc;
f8a1c9a8 2162
d900843c
AW
2163 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2164
f8a1c9a8 2165 enc = scm_i_scan_for_encoding (port);
889975e5
MG
2166 if (enc == NULL)
2167 return SCM_BOOL_F;
2168 else
2169 {
2170 s_enc = scm_from_locale_string (enc);
889975e5
MG
2171 return s_enc;
2172 }
f8a1c9a8 2173
889975e5
MG
2174 return SCM_BOOL_F;
2175}
2176#undef FUNC_NAME
2177
ea8c9761
MW
2178\f
2179/* Per-port read options.
2180
a38024ba
MW
2181 We store per-port read options in the 'port-read-options' port
2182 property, which is stored in the internal port structure. The value
2183 stored is a single integer that contains a two-bit field for each
2184 read option.
ea8c9761
MW
2185
2186 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2187 the applicable value should be inherited from the corresponding
2188 global read option. Otherwise, the bit field contains the value of
2189 the read option. For boolean read options that have been set
2190 per-port, the possible values are 0 or 1. If the 'keyword_style'
2191 read option has been set per-port, its possible values are those in
2192 'enum t_keyword_style'. */
2193
a38024ba 2194/* Key to read options in port properties. */
ea8c9761
MW
2195SCM_SYMBOL (sym_port_read_options, "port-read-options");
2196
2197/* Offsets of bit fields for each per-port override */
2198#define READ_OPTION_COPY_SOURCE_P 0
2199#define READ_OPTION_RECORD_POSITIONS_P 2
2200#define READ_OPTION_CASE_INSENSITIVE_P 4
2201#define READ_OPTION_KEYWORD_STYLE 6
2202#define READ_OPTION_R6RS_ESCAPES_P 8
2203#define READ_OPTION_SQUARE_BRACKETS_P 10
2204#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
bf9eb54a 2205#define READ_OPTION_CURLY_INFIX_P 14
ea8c9761 2206
bf9eb54a
MW
2207/* The total width in bits of the per-port overrides */
2208#define READ_OPTIONS_NUM_BITS 16
ea8c9761
MW
2209
2210#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2211#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2212
2213#define READ_OPTION_MASK 3
2214#define READ_OPTION_INHERIT 3
2215
2216static void
2217set_port_read_option (SCM port, int option, int new_value)
2218{
a38024ba 2219 SCM scm_read_options;
ea8c9761
MW
2220 unsigned int read_options;
2221
2222 new_value &= READ_OPTION_MASK;
a38024ba 2223 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2224 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2225 read_options = scm_to_uint (scm_read_options);
2226 else
2227 read_options = READ_OPTIONS_INHERIT_ALL;
2228 read_options &= ~(READ_OPTION_MASK << option);
2229 read_options |= new_value << option;
2230 scm_read_options = scm_from_uint (read_options);
a38024ba 2231 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
ea8c9761
MW
2232}
2233
9331ffd8
MW
2234/* Set OPTS and PORT's case-insensitivity according to VALUE. */
2235static void
2236set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2237{
2238 value = !!value;
2239 opts->case_insensitive_p = value;
2240 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2241}
2242
bf9eb54a
MW
2243/* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2244static void
2245set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2246{
2247 value = !!value;
2248 opts->square_brackets_p = value;
2249 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2250}
2251
2252/* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2253static void
2254set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2255{
2256 value = !!value;
2257 opts->curly_infix_p = value;
2258 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2259}
2260
ea8c9761
MW
2261/* Initialize OPTS based on PORT's read options and the global read
2262 options. */
2263static void
2264init_read_options (SCM port, scm_t_read_opts *opts)
2265{
a38024ba 2266 SCM val, scm_read_options;
ea8c9761
MW
2267 unsigned int read_options, x;
2268
a38024ba 2269 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2270
2271 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2272 read_options = scm_to_uint (scm_read_options);
2273 else
2274 read_options = READ_OPTIONS_INHERIT_ALL;
2275
2276 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2277 if (x == READ_OPTION_INHERIT)
2278 {
2279 val = SCM_PACK (SCM_KEYWORD_STYLE);
2280 if (scm_is_eq (val, scm_keyword_prefix))
2281 x = KEYWORD_STYLE_PREFIX;
2282 else if (scm_is_eq (val, scm_keyword_postfix))
2283 x = KEYWORD_STYLE_POSTFIX;
2284 else
2285 x = KEYWORD_STYLE_HASH_PREFIX;
2286 }
2287 opts->keyword_style = x;
2288
2289#define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2290 do \
2291 { \
2292 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2293 if (x == READ_OPTION_INHERIT) \
2294 x = !!SCM_ ## NAME; \
2295 opts->name = x; \
2296 } \
2297 while (0)
2298
2299 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2300 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2301 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2302 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2303 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2304 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
bf9eb54a 2305 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
ea8c9761
MW
2306
2307#undef RESOLVE_BOOLEAN_OPTION
bf9eb54a
MW
2308
2309 opts->neoteric_p = 0;
ea8c9761
MW
2310}
2311
0f2d19dd
JB
2312void
2313scm_init_read ()
0f2d19dd 2314{
d458073b
AR
2315 SCM read_hash_procs;
2316
c81c2ad3 2317 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
2318
2319 scm_i_read_hash_procedures =
2320 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 2321
62560650 2322 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 2323#include "libguile/read.x"
0f2d19dd 2324}
89e00824
ML
2325
2326/*
2327 Local Variables:
2328 c-file-style: "gnu"
2329 End:
2330*/