Merge remote-tracking branch 'origin/stable-2.0'
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37
MW
1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
49bb5bd3 46#include "libguile/fports.h"
a0599745
MD
47#include "libguile/root.h"
48#include "libguile/strings.h"
ba1b2226 49#include "libguile/strports.h"
a0599745 50#include "libguile/vectors.h"
a0599745 51#include "libguile/validate.h"
a4022e69 52#include "libguile/srfi-4.h"
7337d56d 53#include "libguile/srfi-13.h"
ba1b2226 54
a0599745 55#include "libguile/read.h"
22fc179a
HWN
56#include "libguile/private-options.h"
57
0f2d19dd
JB
58
59\f
60
5bf6a6f0 61SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 62SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 63SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 64SCM_SYMBOL (sym_nil, "nil");
c7733771 65
bf9eb54a
MW
66/* SRFI-105 curly infix expression support */
67SCM_SYMBOL (sym_nfx, "$nfx$");
68SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
69SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
70
71scm_t_option scm_read_opts[] =
72 {
73 { SCM_OPTION_BOOLEAN, "copy", 0,
74 "Copy source code expressions." },
75 { SCM_OPTION_BOOLEAN, "positions", 1,
76 "Record positions of source code expressions." },
77 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
78 "Convert symbols to lower case."},
79 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
80 "Style of keyword recognition: #f, 'prefix or 'postfix."},
81 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
82 "Use R6RS variable-length character and string hex escapes."},
83 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
84 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
85 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
86 "In strings, consume leading whitespace after an escaped end-of-line."},
87 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
88 "Support SRFI-105 curly infix expressions."},
89 { 0, },
90 };
b1b5433d
MW
91
92/* Internal read options structure. This is initialized by 'scm_read'
ea8c9761
MW
93 from the global and per-port read options, and a pointer is passed
94 down to all helper functions. */
95
96enum t_keyword_style
97 {
98 KEYWORD_STYLE_HASH_PREFIX,
99 KEYWORD_STYLE_PREFIX,
100 KEYWORD_STYLE_POSTFIX
101 };
b1b5433d 102
ea8c9761
MW
103struct t_read_opts
104{
b1b5433d
MW
105 enum t_keyword_style keyword_style;
106 unsigned int copy_source_p : 1;
107 unsigned int record_positions_p : 1;
108 unsigned int case_insensitive_p : 1;
109 unsigned int r6rs_escapes_p : 1;
110 unsigned int square_brackets_p : 1;
111 unsigned int hungry_eol_escapes_p : 1;
bf9eb54a
MW
112 unsigned int curly_infix_p : 1;
113 unsigned int neoteric_p : 1;
a16f6fe7
MD
114};
115
b1b5433d
MW
116typedef struct t_read_opts scm_t_read_opts;
117
a16f6fe7 118
39e8f371
HWN
119/*
120 Give meaningful error messages for errors
121
122 We use the format
123
ba1b2226 124 FILE:LINE:COL: MESSAGE
39e8f371
HWN
125 This happened in ....
126
127 This is not standard GNU format, but the test-suite likes the real
128 message to be in front.
129
39e8f371
HWN
130 */
131
132
a4022e69
MV
133void
134scm_i_input_error (char const *function,
135 SCM port, const char *message, SCM arg)
ba1b2226 136{
29a837fd
MV
137 SCM fn = (scm_is_string (SCM_FILENAME(port))
138 ? SCM_FILENAME(port)
139 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 140
29a837fd 141 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
142 SCM string = SCM_EOL;
143 scm_simple_format (string_port,
272632a6 144 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 145 scm_list_4 (fn,
b3aa4626 146 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 147 scm_from_int (SCM_COL (port) + 1),
272632a6 148 scm_from_locale_string (message)));
ba1b2226
HWN
149
150 string = scm_get_output_string (string_port);
151 scm_close_output_port (string_port);
4a655e50 152 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 153 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 154 string,
dd72382c 155 arg,
ba1b2226
HWN
156 SCM_BOOL_F);
157}
39e8f371
HWN
158
159
a1ec6916 160SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 161 (SCM setting),
dc7fa443
MG
162 "Option interface for the read options. Instead of using\n"
163 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 164 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 165#define FUNC_NAME s_scm_read_options
a16f6fe7 166{
b7ff98dd
MD
167 SCM ans = scm_options (setting,
168 scm_read_opts,
1bbd0b84 169 FUNC_NAME);
b7ff98dd
MD
170 if (SCM_COPY_SOURCE_P)
171 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
172 return ans;
173}
1bbd0b84 174#undef FUNC_NAME
a16f6fe7 175
d458073b
AR
176/* A fluid referring to an association list mapping extra hash
177 characters to procedures. */
178static SCM *scm_i_read_hash_procedures;
deca31e1 179
cfd15439 180static SCM
d458073b
AR
181scm_i_read_hash_procedures_ref (void)
182{
183 return scm_fluid_ref (*scm_i_read_hash_procedures);
184}
185
cfd15439 186static void
d458073b
AR
187scm_i_read_hash_procedures_set_x (SCM value)
188{
189 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
190}
0f2d19dd 191
7337d56d
LC
192\f
193/* Token readers. */
0f2d19dd 194
0f2d19dd 195
7337d56d
LC
196/* Size of the C buffer used to read symbols and numbers. */
197#define READER_BUFFER_SIZE 128
0f2d19dd 198
ff4d3672
LC
199/* Number of 32-bit codepoints in the buffer used to read strings. */
200#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 201
7337d56d
LC
202/* The maximum size of Scheme character names. */
203#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 204
02327c0c
MW
205/* The maximum size of reader directive names. */
206#define READER_DIRECTIVE_NAME_MAX_SIZE 50
207
94115ae3 208
7337d56d
LC
209/* `isblank' is only in C99. */
210#define CHAR_IS_BLANK_(_chr) \
211 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 212 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
213
214#ifdef MSDOS
215# define CHAR_IS_BLANK(_chr) \
216 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
217#else
218# define CHAR_IS_BLANK CHAR_IS_BLANK_
219#endif
220
221
222/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
223 structure''). */
224#define CHAR_IS_R5RS_DELIMITER(c) \
225 (CHAR_IS_BLANK (c) \
603234c6 226 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
7337d56d 227
603234c6
MW
228#define CHAR_IS_DELIMITER(c) \
229 (CHAR_IS_R5RS_DELIMITER (c) \
bf9eb54a
MW
230 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
231 || opts->curly_infix_p)) \
232 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
7337d56d
LC
233
234/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
235 Structure''. */
236#define CHAR_IS_EXPONENT_MARKER(_chr) \
237 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
238 || ((_chr) == 'd') || ((_chr) == 'l'))
239
454866e0 240/* Read an SCSH block comment. */
cfd15439 241static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965 242static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
b1b5433d
MW
243static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
244static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
620c8965 245static SCM scm_get_hash_procedure (int);
454866e0 246
69f90b0b
MG
247/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
248 result in the pre-allocated buffer BUF. Return zero if the whole token has
249 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
250 bytes actually read. */
cfd15439 251static int
b1b5433d
MW
252read_token (SCM port, scm_t_read_opts *opts,
253 char *buf, size_t buf_size, size_t *read)
b662b7e9 254{
69f90b0b 255 *read = 0;
0520c320 256
69f90b0b
MG
257 while (*read < buf_size)
258 {
259 int chr;
7337d56d 260
0d959103 261 chr = scm_get_byte_or_eof_unlocked (port);
889975e5 262
69f90b0b
MG
263 if (chr == EOF)
264 return 0;
265 else if (CHAR_IS_DELIMITER (chr))
266 {
c932ce0b 267 scm_unget_byte_unlocked (chr, port);
69f90b0b
MG
268 return 0;
269 }
270 else
271 {
272 *buf = (char) chr;
273 buf++, (*read)++;
274 }
275 }
889975e5 276
69f90b0b
MG
277 return 1;
278 }
889975e5 279
b662b7e9
LC
280/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
281 if the token doesn't fit in BUFFER_SIZE bytes. */
282static char *
b1b5433d
MW
283read_complete_token (SCM port, scm_t_read_opts *opts,
284 char *buffer, size_t buffer_size, size_t *read)
69f90b0b
MG
285{
286 int overflow = 0;
b662b7e9
LC
287 size_t bytes_read, overflow_size = 0;
288 char *overflow_buffer = NULL;
1cc91f1b 289
69f90b0b
MG
290 do
291 {
b1b5433d 292 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
69f90b0b
MG
293 if (bytes_read == 0)
294 break;
295 if (overflow || overflow_size != 0)
296 {
297 if (overflow_size == 0)
298 {
b662b7e9
LC
299 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
300 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
301 overflow_size = bytes_read;
302 }
303 else
304 {
7be3c2fc 305 char *new_buf =
b662b7e9
LC
306 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
307
308 memcpy (new_buf, overflow_buffer, overflow_size);
309 memcpy (new_buf + overflow_size, buffer, bytes_read);
310
311 overflow_buffer = new_buf;
69f90b0b
MG
312 overflow_size += bytes_read;
313 }
314 }
889975e5 315 }
69f90b0b 316 while (overflow);
889975e5 317
69f90b0b
MG
318 if (overflow_size)
319 *read = overflow_size;
b521d265 320 else
69f90b0b
MG
321 *read = bytes_read;
322
b662b7e9 323 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 324}
7337d56d
LC
325
326/* Skip whitespace from PORT and return the first non-whitespace character
327 read. Raise an error on end-of-file. */
328static int
b1b5433d 329flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
0f2d19dd 330{
cfd15439 331 scm_t_wchar c;
0f2d19dd 332 while (1)
be632904 333 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
334 {
335 case EOF:
336 goteof:
337 if (eoferr)
d156d3b7 338 {
a4022e69
MV
339 scm_i_input_error (eoferr,
340 port,
341 "end of file",
342 SCM_EOL);
d156d3b7 343 }
0f2d19dd 344 return c;
7337d56d 345
0f2d19dd
JB
346 case ';':
347 lp:
be632904 348 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
349 {
350 case EOF:
351 goto goteof;
352 default:
353 goto lp;
354 case SCM_LINE_INCREMENTORS:
355 break;
356 }
357 break;
7337d56d 358
454866e0 359 case '#':
be632904 360 switch (c = scm_getc_unlocked (port))
454866e0
LC
361 {
362 case EOF:
363 eoferr = "read_sharp";
364 goto goteof;
365 case '!':
b1b5433d 366 scm_read_shebang (c, port, opts);
454866e0 367 break;
34f3d47d 368 case ';':
b1b5433d 369 scm_read_commented_expression (c, port, opts);
34f3d47d 370 break;
620c8965
LC
371 case '|':
372 if (scm_is_false (scm_get_hash_procedure (c)))
373 {
374 scm_read_r6rs_block_comment (c, port);
375 break;
376 }
377 /* fall through */
454866e0 378 default:
c932ce0b 379 scm_ungetc_unlocked (c, port);
454866e0
LC
380 return '#';
381 }
382 break;
383
0f2d19dd 384 case SCM_LINE_INCREMENTORS:
0f2d19dd 385 case SCM_SINGLE_SPACES:
0f2d19dd 386 case '\t':
0f2d19dd 387 break;
7337d56d 388
0f2d19dd
JB
389 default:
390 return c;
391 }
7337d56d
LC
392
393 return 0;
0f2d19dd
JB
394}
395
396
7337d56d
LC
397\f
398/* Token readers. */
1cc91f1b 399
b1b5433d
MW
400static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
401static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
402 long line, int column);
0f2d19dd 403
0f2d19dd 404
d6cb0203 405static SCM
b1b5433d
MW
406maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
407 long line, int column)
d6cb0203 408{
b1b5433d 409 if (opts->record_positions_p)
d6cb0203
MW
410 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
411 return x;
412}
0f2d19dd 413
09a4f039 414static SCM
b1b5433d 415scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 416#define FUNC_NAME "scm_i_lreadparen"
09a4f039 417{
26c8cc14
AW
418 int c;
419 SCM tmp, tl, ans = SCM_EOL;
bf9eb54a
MW
420 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
421 const int terminating_char = ((chr == '{') ? '}'
422 : ((chr == '[') ? ']'
423 : ')'));
7337d56d
LC
424
425 /* Need to capture line and column numbers here. */
426 long line = SCM_LINUM (port);
427 int column = SCM_COL (port) - 1;
f9c68a47 428
b1b5433d 429 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
430 if (terminating_char == c)
431 return SCM_EOL;
f9c68a47 432
c932ce0b 433 scm_ungetc_unlocked (c, port);
b1b5433d 434 tmp = scm_read_expression (port, opts);
1f7945a7
AW
435
436 /* Note that it is possible for scm_read_expression to return
437 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
438 check that it's a real dot by checking `c'. */
439 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d 440 {
b1b5433d
MW
441 ans = scm_read_expression (port, opts);
442 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
7337d56d
LC
443 scm_i_input_error (FUNC_NAME, port, "missing close paren",
444 SCM_EOL);
445 return ans;
446 }
1cc91f1b 447
7337d56d
LC
448 /* Build the head of the list structure. */
449 ans = tl = scm_cons (tmp, SCM_EOL);
450
b1b5433d 451 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
0f2d19dd 452 {
7337d56d 453 SCM new_tail;
0f2d19dd 454
bf9eb54a
MW
455 if (c == ')' || (c == ']' && opts->square_brackets_p)
456 || ((c == '}' || c == ']') && opts->curly_infix_p))
5b69315e
AW
457 scm_i_input_error (FUNC_NAME, port,
458 "in pair: mismatched close paren: ~A",
459 scm_list_1 (SCM_MAKE_CHAR (c)));
460
c932ce0b 461 scm_ungetc_unlocked (c, port);
b1b5433d 462 tmp = scm_read_expression (port, opts);
5b69315e 463
1f7945a7
AW
464 /* See above note about scm_sym_dot. */
465 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 466 {
b1b5433d 467 SCM_SETCDR (tl, scm_read_expression (port, opts));
7337d56d 468
b1b5433d 469 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
470 if (terminating_char != c)
471 scm_i_input_error (FUNC_NAME, port,
472 "in pair: missing close paren", SCM_EOL);
bf9eb54a 473 break;
0f2d19dd 474 }
b858464a 475
7337d56d
LC
476 new_tail = scm_cons (tmp, SCM_EOL);
477 SCM_SETCDR (tl, new_tail);
478 tl = new_tail;
7337d56d 479 }
0f2d19dd 480
bf9eb54a
MW
481 if (curly_list_p)
482 {
483 /* In addition to finding the length, 'scm_ilength' checks for
484 improper or circular lists, in which case it returns -1. */
485 int len = scm_ilength (ans);
486
487 /* The (len == 0) case is handled above */
488 if (len == 1)
489 /* Return directly to avoid re-annotating the element's source
490 location with the position of the outer brace. Also, it
491 might not be possible to annotate the element. */
492 return scm_car (ans); /* {e} => e */
493 else if (len == 2)
494 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
495 else if (len >= 3 && (len & 1))
496 {
497 /* It's a proper list whose length is odd and at least 3. If
498 the elements at odd indices (the infix operator positions)
499 are all 'equal?', then it's a simple curly-infix list.
500 Otherwise it's a mixed curly-infix list. */
501 SCM op = scm_cadr (ans);
502
503 /* Check to see if the elements at odd indices are 'equal?' */
504 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
505 {
506 if (scm_is_null (tl))
507 {
508 /* Convert simple curly-infix list to prefix:
509 {a <op> b <op> ...} => (<op> a b ...) */
510 tl = ans;
511 while (scm_is_pair (scm_cdr (tl)))
512 {
513 tmp = scm_cddr (tl);
514 SCM_SETCDR (tl, tmp);
515 tl = tmp;
516 }
517 ans = scm_cons (op, ans);
518 break;
519 }
520 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
521 {
522 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
523 ans = scm_cons (sym_nfx, ans);
524 break;
525 }
526 }
527 }
528 else
529 /* Mixed curly-infix (possibly improper) list:
530 {e . tail} => ($nfx$ e . tail) */
531 ans = scm_cons (sym_nfx, ans);
532 }
533
b1b5433d 534 return maybe_annotate_source (ans, port, opts, line, column);
7337d56d
LC
535}
536#undef FUNC_NAME
a4022e69 537
c5661d28
MG
538
539/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
540 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
541 found. */
542#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
543 do \
544 { \
545 scm_t_wchar a; \
546 size_t i = 0; \
547 c = 0; \
548 while (i < ndigits) \
549 { \
be632904 550 a = scm_getc_unlocked (port); \
dea901d6
MG
551 if (a == EOF) \
552 goto str_eof; \
553 if (terminator \
554 && (a == (scm_t_wchar) terminator) \
555 && (i > 0)) \
556 break; \
557 if ('0' <= a && a <= '9') \
558 a -= '0'; \
559 else if ('A' <= a && a <= 'F') \
560 a = a - 'A' + 10; \
561 else if ('a' <= a && a <= 'f') \
562 a = a - 'a' + 10; \
563 else \
564 { \
565 c = a; \
566 goto bad_escaped; \
567 } \
568 c = c * 16 + a; \
569 i ++; \
570 } \
c5661d28
MG
571 } while (0)
572
684d664e
AW
573static void
574skip_intraline_whitespace (SCM port)
575{
576 scm_t_wchar c;
577
578 do
579 {
be632904 580 c = scm_getc_unlocked (port);
684d664e
AW
581 if (c == EOF)
582 return;
583 }
584 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
585
c932ce0b 586 scm_ungetc_unlocked (c, port);
684d664e
AW
587}
588
7337d56d 589static SCM
b1b5433d 590scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
591#define FUNC_NAME "scm_lreadr"
592{
593 /* For strings smaller than C_STR, this function creates only one Scheme
594 object (the string returned). */
0f2d19dd 595
ff4d3672
LC
596 SCM str = SCM_EOL;
597 size_t c_str_len = 0;
598 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 599
b131b233
MW
600 /* Need to capture line and column numbers here. */
601 long line = SCM_LINUM (port);
602 int column = SCM_COL (port) - 1;
603
be632904 604 while ('"' != (c = scm_getc_unlocked (port)))
7337d56d
LC
605 {
606 if (c == EOF)
9c44cd45
MG
607 {
608 str_eof:
609 scm_i_input_error (FUNC_NAME, port,
610 "end of file in string constant", SCM_EOL);
611 }
0f2d19dd 612
ff4d3672
LC
613 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
614 {
615 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
616 c_str_len = 0;
617 }
0f2d19dd 618
7337d56d 619 if (c == '\\')
9c44cd45 620 {
be632904 621 switch (c = scm_getc_unlocked (port))
9c44cd45
MG
622 {
623 case EOF:
624 goto str_eof;
625 case '"':
626 case '\\':
627 break;
9c44cd45 628 case '\n':
b1b5433d 629 if (opts->hungry_eol_escapes_p)
684d664e 630 skip_intraline_whitespace (port);
9c44cd45
MG
631 continue;
632 case '0':
633 c = '\0';
634 break;
635 case 'f':
636 c = '\f';
637 break;
638 case 'n':
639 c = '\n';
640 break;
641 case 'r':
642 c = '\r';
643 break;
644 case 't':
645 c = '\t';
646 break;
647 case 'a':
648 c = '\007';
649 break;
650 case 'v':
651 c = '\v';
652 break;
67a4a16d
MG
653 case 'b':
654 c = '\010';
655 break;
9c44cd45 656 case 'x':
b1b5433d 657 if (opts->r6rs_escapes_p)
dea901d6
MG
658 SCM_READ_HEX_ESCAPE (10, ';');
659 else
660 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 661 break;
9c44cd45 662 case 'u':
b1b5433d 663 if (!opts->r6rs_escapes_p)
898a0b5a
MG
664 {
665 SCM_READ_HEX_ESCAPE (4, '\0');
666 break;
667 }
9c44cd45 668 case 'U':
b1b5433d 669 if (!opts->r6rs_escapes_p)
898a0b5a
MG
670 {
671 SCM_READ_HEX_ESCAPE (6, '\0');
672 break;
673 }
9c44cd45
MG
674 default:
675 bad_escaped:
676 scm_i_input_error (FUNC_NAME, port,
677 "illegal character in escape sequence: ~S",
678 scm_list_1 (SCM_MAKE_CHAR (c)));
679 }
680 }
ff4d3672
LC
681
682 c_str[c_str_len++] = c;
7337d56d 683 }
ff4d3672
LC
684
685 if (scm_is_null (str))
686 /* Fast path: we got a string that fits in C_STR. */
687 str = scm_from_utf32_stringn (c_str, c_str_len);
688 else
689 {
690 if (c_str_len > 0)
691 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
692
693 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
694 }
695
b1b5433d 696 return maybe_annotate_source (str, port, opts, line, column);
0f2d19dd 697}
db4b4ca6
DH
698#undef FUNC_NAME
699
0f2d19dd 700
7337d56d 701static SCM
b1b5433d 702scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
0f2d19dd 703{
69f90b0b 704 SCM result, str = SCM_EOL;
b662b7e9 705 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 706 size_t bytes_read;
69f90b0b 707 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 708
38f19074
MW
709 /* Need to capture line and column numbers here. */
710 long line = SCM_LINUM (port);
711 int column = SCM_COL (port) - 1;
712
c932ce0b 713 scm_ungetc_unlocked (chr, port);
b1b5433d 714 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 715 &bytes_read);
69f90b0b 716
b662b7e9 717 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
718
719 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 720 if (scm_is_false (result))
69f90b0b
MG
721 {
722 /* Return a symbol instead of a number */
b1b5433d 723 if (opts->case_insensitive_p)
69f90b0b
MG
724 str = scm_string_downcase_x (str);
725 result = scm_string_to_symbol (str);
726 }
38f19074 727 else if (SCM_NIMP (result))
b1b5433d 728 result = maybe_annotate_source (result, port, opts, line, column);
0f2d19dd 729
69f90b0b 730 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
731 return result;
732}
0f2d19dd 733
7337d56d 734static SCM
b1b5433d 735scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 736{
889975e5
MG
737 SCM result;
738 int ends_with_colon = 0;
69f90b0b 739 size_t bytes_read;
b1b5433d 740 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
b662b7e9 741 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b
MG
742 scm_t_port *pt = SCM_PTAB_ENTRY (port);
743 SCM str;
7337d56d 744
c932ce0b 745 scm_ungetc_unlocked (chr, port);
b1b5433d 746 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 747 &bytes_read);
69f90b0b 748 if (bytes_read > 0)
b662b7e9 749 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 750
69f90b0b
MG
751 if (postfix && ends_with_colon && (bytes_read > 1))
752 {
b662b7e9
LC
753 str = scm_from_stringn (buffer, bytes_read - 1,
754 pt->encoding, pt->ilseq_handler);
69f90b0b 755
b1b5433d 756 if (opts->case_insensitive_p)
69f90b0b
MG
757 str = scm_string_downcase_x (str);
758 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
759 }
7337d56d 760 else
69f90b0b 761 {
b662b7e9
LC
762 str = scm_from_stringn (buffer, bytes_read,
763 pt->encoding, pt->ilseq_handler);
69f90b0b 764
b1b5433d 765 if (opts->case_insensitive_p)
69f90b0b
MG
766 str = scm_string_downcase_x (str);
767 result = scm_string_to_symbol (str);
768 }
7337d56d 769
69f90b0b 770 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
771 return result;
772}
773
774static SCM
b1b5433d 775scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
776#define FUNC_NAME "scm_lreadr"
777{
889975e5 778 SCM result;
7337d56d 779 size_t read;
b662b7e9 780 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 781 unsigned int radix;
69f90b0b
MG
782 SCM str;
783 scm_t_port *pt;
7337d56d
LC
784
785 switch (chr)
786 {
787 case 'B':
788 case 'b':
789 radix = 2;
790 break;
791
792 case 'o':
793 case 'O':
794 radix = 8;
795 break;
796
797 case 'd':
798 case 'D':
799 radix = 10;
800 break;
801
802 case 'x':
803 case 'X':
804 radix = 16;
805 break;
806
807 default:
c932ce0b
AW
808 scm_ungetc_unlocked (chr, port);
809 scm_ungetc_unlocked ('#', port);
7337d56d
LC
810 radix = 10;
811 }
812
b1b5433d 813 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 814 &read);
69f90b0b
MG
815
816 pt = SCM_PTAB_ENTRY (port);
b662b7e9 817 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
818
819 result = scm_string_to_number (str, scm_from_uint (radix));
820
69f90b0b 821 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
822
823 if (scm_is_true (result))
824 return result;
825
826 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
827
828 return SCM_BOOL_F;
829}
830#undef FUNC_NAME
831
832static SCM
b1b5433d 833scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
834{
835 SCM p;
492faee1
LC
836 long line = SCM_LINUM (port);
837 int column = SCM_COL (port) - 1;
7337d56d
LC
838
839 switch (chr)
840 {
841 case '`':
842 p = scm_sym_quasiquote;
843 break;
844
845 case '\'':
846 p = scm_sym_quote;
847 break;
848
849 case ',':
850 {
889975e5 851 scm_t_wchar c;
7337d56d 852
be632904 853 c = scm_getc_unlocked (port);
7337d56d
LC
854 if ('@' == c)
855 p = scm_sym_uq_splicing;
856 else
0f2d19dd 857 {
c932ce0b 858 scm_ungetc_unlocked (c, port);
7337d56d 859 p = scm_sym_unquote;
0f2d19dd 860 }
7337d56d
LC
861 break;
862 }
0f2d19dd 863
7337d56d
LC
864 default:
865 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 866 "scm_read_quote", chr);
7337d56d 867 abort ();
0f2d19dd 868 }
1cc91f1b 869
b1b5433d
MW
870 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
871 return maybe_annotate_source (p, port, opts, line, column);
7337d56d
LC
872}
873
34f3d47d
AW
874SCM_SYMBOL (sym_syntax, "syntax");
875SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
876SCM_SYMBOL (sym_unsyntax, "unsyntax");
877SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
878
879static SCM
b1b5433d 880scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
34f3d47d
AW
881{
882 SCM p;
883 long line = SCM_LINUM (port);
884 int column = SCM_COL (port) - 1;
885
886 switch (chr)
887 {
888 case '`':
889 p = sym_quasisyntax;
890 break;
891
892 case '\'':
893 p = sym_syntax;
894 break;
895
896 case ',':
897 {
898 int c;
899
be632904 900 c = scm_getc_unlocked (port);
34f3d47d
AW
901 if ('@' == c)
902 p = sym_unsyntax_splicing;
903 else
904 {
c932ce0b 905 scm_ungetc_unlocked (c, port);
34f3d47d
AW
906 p = sym_unsyntax;
907 }
908 break;
909 }
910
911 default:
912 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
913 "scm_read_syntax", chr);
914 abort ();
915 }
916
b1b5433d
MW
917 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
918 return maybe_annotate_source (p, port, opts, line, column);
34f3d47d
AW
919}
920
cfd15439 921static SCM
b1b5433d 922scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
7c4aad9c 923{
b1b5433d 924 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
7c4aad9c
AW
925
926 if (!scm_is_eq (id, sym_nil))
927 scm_i_input_error ("scm_read_nil", port,
928 "unexpected input while reading #nil: ~a",
929 scm_list_1 (id));
930
931 return SCM_ELISP_NIL;
932}
933
cfd15439 934static SCM
7337d56d 935scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 936{
0f2d19dd
JB
937 int c;
938
889975e5
MG
939 /* We use the get_byte here because there is no need to get the
940 locale correct with comment input. This presumes that newline
941 always represents itself no matter what the encoding is. */
0d959103 942 for (c = scm_get_byte_or_eof_unlocked (port);
7337d56d 943 (c != EOF) && (c != '\n');
0d959103 944 c = scm_get_byte_or_eof_unlocked (port));
7337d56d
LC
945
946 return SCM_UNSPECIFIED;
947}
948
949\f
950/* Sharp readers, i.e. readers called after a `#' sign has been read. */
951
952static SCM
953scm_read_boolean (int chr, SCM port)
954{
955 switch (chr)
0f2d19dd 956 {
7337d56d
LC
957 case 't':
958 case 'T':
959 return SCM_BOOL_T;
960
961 case 'f':
962 case 'F':
963 return SCM_BOOL_F;
0f2d19dd 964 }
7337d56d
LC
965
966 return SCM_UNSPECIFIED;
967}
968
969static SCM
b1b5433d 970scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
971#define FUNC_NAME "scm_lreadr"
972{
69f90b0b
MG
973 char buffer[READER_CHAR_NAME_MAX_SIZE];
974 SCM charname;
975 size_t charname_len, bytes_read;
889975e5
MG
976 scm_t_wchar cp;
977 int overflow;
69f90b0b 978 scm_t_port *pt;
889975e5 979
b1b5433d
MW
980 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
981 &bytes_read);
889975e5 982 if (overflow)
c1a0ba1c 983 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 984
69f90b0b 985 if (bytes_read == 0)
0f2d19dd 986 {
be632904 987 chr = scm_getc_unlocked (port);
7337d56d
LC
988 if (chr == EOF)
989 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
990 "while reading character", SCM_EOL);
991
992 /* CHR must be a token delimiter, like a whitespace. */
993 return (SCM_MAKE_CHAR (chr));
0f2d19dd 994 }
7337d56d 995
69f90b0b 996 pt = SCM_PTAB_ENTRY (port);
7337d56d 997
69f90b0b
MG
998 /* Simple ASCII characters can be processed immediately. Also, simple
999 ISO-8859-1 characters can be processed immediately if the encoding for this
1000 port is ISO-8859-1. */
1001 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1002 {
1003 SCM_COL (port) += 1;
1004 return SCM_MAKE_CHAR (buffer[0]);
1005 }
1006
1007 /* Otherwise, convert the buffer into a proper scheme string for
1008 processing. */
1009 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1010 pt->ilseq_handler);
1011 charname_len = scm_i_string_length (charname);
1012 SCM_COL (port) += charname_len;
889975e5 1013 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
1014 if (charname_len == 1)
1015 return SCM_MAKE_CHAR (cp);
1016
1017 /* Ignore dotted circles, which may be used to keep combining characters from
1018 combining with the backslash in #\charname. */
0dcd7e61
MG
1019 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1020 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1021
889975e5 1022 if (cp >= '0' && cp < '8')
7337d56d
LC
1023 {
1024 /* Dirk:FIXME:: This type of character syntax is not R5RS
1025 * compliant. Further, it should be verified that the constant
0ffc78e3 1026 * does only consist of octal digits. */
889975e5 1027 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 1028 if (SCM_I_INUMP (p))
0ffc78e3 1029 {
e25f3727 1030 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
1031 if (SCM_IS_UNICODE_CHAR (c))
1032 return SCM_MAKE_CHAR (c);
1033 else
0f3a70cf 1034 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
1035 "out-of-range octal character escape: ~a",
1036 scm_list_1 (charname));
1037 }
7337d56d
LC
1038 }
1039
0f3a70cf 1040 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1041 {
1042 SCM p;
0f3a70cf 1043
dea901d6
MG
1044 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1045 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1046 scm_from_uint (16));
1047 if (SCM_I_INUMP (p))
1048 {
e25f3727 1049 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1050 if (SCM_IS_UNICODE_CHAR (c))
1051 return SCM_MAKE_CHAR (c);
1052 else
1053 scm_i_input_error (FUNC_NAME, port,
1054 "out-of-range hex character escape: ~a",
1055 scm_list_1 (charname));
1056 }
1057 }
1058
889975e5
MG
1059 /* The names of characters should never have non-Latin1
1060 characters. */
1061 if (scm_i_is_narrow_string (charname)
1062 || scm_i_try_narrow_string (charname))
4769c9db
AW
1063 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1064 charname_len);
1065 if (scm_is_true (ch))
1066 return ch;
1067 }
7337d56d 1068
7337d56d 1069 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1070 scm_list_1 (charname));
7337d56d
LC
1071
1072 return SCM_UNSPECIFIED;
0f2d19dd 1073}
db4b4ca6 1074#undef FUNC_NAME
0f2d19dd 1075
cfd15439 1076static SCM
b1b5433d 1077scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1078{
1079 SCM symbol;
1080
1081 /* Read the symbol that comprises the keyword. Doing this instead of
1082 invoking a specific symbol reader function allows `scm_read_keyword ()'
1083 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1084
7337d56d 1085 XXX: This implementation allows sloppy syntaxes like `#: key'. */
b1b5433d 1086 symbol = scm_read_expression (port, opts);
7337d56d 1087 if (!scm_is_symbol (symbol))
7f74cf9a 1088 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1089 "keyword prefix `~a' not followed by a symbol: ~s",
1090 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1091
1092 return (scm_symbol_to_keyword (symbol));
1093}
1094
cfd15439 1095static SCM
b1b5433d
MW
1096scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1097 long line, int column)
09a4f039 1098{
7337d56d
LC
1099 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1100 guarantee that it's going to do what we want. After all, this is an
1101 implementation detail of `scm_read_vector ()', not a desirable
1102 property. */
b1b5433d
MW
1103 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1104 port, opts, line, column);
b131b233
MW
1105}
1106
493ceb99
MW
1107/* Helper used by scm_read_array */
1108static int
1109read_decimal_integer (SCM port, int c, ssize_t *resp)
1110{
1111 ssize_t sign = 1;
1112 ssize_t res = 0;
1113 int got_it = 0;
1114
1115 if (c == '-')
1116 {
1117 sign = -1;
fa980bcc 1118 c = scm_getc_unlocked (port);
493ceb99
MW
1119 }
1120
1121 while ('0' <= c && c <= '9')
1122 {
1123 res = 10*res + c-'0';
1124 got_it = 1;
fa980bcc 1125 c = scm_getc_unlocked (port);
493ceb99
MW
1126 }
1127
1128 if (got_it)
1129 *resp = sign * res;
1130 return c;
7337d56d 1131}
09a4f039 1132
493ceb99
MW
1133/* Read an array. This function can also read vectors and uniform
1134 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1135 handled here.
1136
10744b7c 1137 C is the first character read after the '#'. */
b131b233 1138static SCM
b1b5433d 1139scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
7337d56d 1140{
493ceb99
MW
1141 ssize_t rank;
1142 scm_t_wchar tag_buf[8];
1143 int tag_len;
1144
1145 SCM tag, shape = SCM_BOOL_F, elements, array;
1146
1147 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1148 the array code can not deal with zero-length dimensions yet, and
10744b7c 1149 we want to allow zero-length vectors, of course. */
493ceb99 1150 if (c == '(')
b1b5433d 1151 return scm_read_vector (c, port, opts, line, column);
493ceb99 1152
10744b7c 1153 /* Disambiguate between '#f' and uniform floating point vectors. */
493ceb99
MW
1154 if (c == 'f')
1155 {
fa980bcc 1156 c = scm_getc_unlocked (port);
493ceb99
MW
1157 if (c != '3' && c != '6')
1158 {
1159 if (c != EOF)
fa980bcc 1160 scm_ungetc_unlocked (c, port);
493ceb99
MW
1161 return SCM_BOOL_F;
1162 }
1163 rank = 1;
1164 tag_buf[0] = 'f';
1165 tag_len = 1;
1166 goto continue_reading_tag;
1167 }
1168
1169 /* Read rank. */
1170 rank = 1;
1171 c = read_decimal_integer (port, c, &rank);
1172 if (rank < 0)
1173 scm_i_input_error (NULL, port, "array rank must be non-negative",
1174 SCM_EOL);
1175
1176 /* Read tag. */
1177 tag_len = 0;
1178 continue_reading_tag:
1179 while (c != EOF && c != '(' && c != '@' && c != ':'
1180 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1181 {
1182 tag_buf[tag_len++] = c;
fa980bcc 1183 c = scm_getc_unlocked (port);
493ceb99
MW
1184 }
1185 if (tag_len == 0)
1186 tag = SCM_BOOL_T;
b131b233 1187 else
493ceb99
MW
1188 {
1189 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1190 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1191 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1192 scm_list_1 (tag));
1193 }
1194
1195 /* Read shape. */
1196 if (c == '@' || c == ':')
1197 {
1198 shape = SCM_EOL;
1199
1200 do
1201 {
1202 ssize_t lbnd = 0, len = 0;
1203 SCM s;
1204
1205 if (c == '@')
1206 {
fa980bcc 1207 c = scm_getc_unlocked (port);
493ceb99
MW
1208 c = read_decimal_integer (port, c, &lbnd);
1209 }
1210
1211 s = scm_from_ssize_t (lbnd);
1212
1213 if (c == ':')
1214 {
fa980bcc 1215 c = scm_getc_unlocked (port);
493ceb99
MW
1216 c = read_decimal_integer (port, c, &len);
1217 if (len < 0)
1218 scm_i_input_error (NULL, port,
1219 "array length must be non-negative",
1220 SCM_EOL);
1221
1222 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1223 }
1224
1225 shape = scm_cons (s, shape);
1226 } while (c == '@' || c == ':');
1227
1228 shape = scm_reverse_x (shape, SCM_EOL);
1229 }
1230
1231 /* Read nested lists of elements. */
1232 if (c != '(')
1233 scm_i_input_error (NULL, port,
1234 "missing '(' in vector or array literal",
1235 SCM_EOL);
b1b5433d 1236 elements = scm_read_sexp (c, port, opts);
493ceb99
MW
1237
1238 if (scm_is_false (shape))
1239 shape = scm_from_ssize_t (rank);
1240 else if (scm_ilength (shape) != rank)
1241 scm_i_input_error
1242 (NULL, port,
1243 "the number of shape specifications must match the array rank",
1244 SCM_EOL);
1245
1246 /* Handle special print syntax of rank zero arrays; see
1247 scm_i_print_array for a rationale. */
1248 if (rank == 0)
1249 {
1250 if (!scm_is_pair (elements))
1251 scm_i_input_error (NULL, port,
1252 "too few elements in array literal, need 1",
1253 SCM_EOL);
1254 if (!scm_is_null (SCM_CDR (elements)))
1255 scm_i_input_error (NULL, port,
1256 "too many elements in array literal, want 1",
1257 SCM_EOL);
1258 elements = SCM_CAR (elements);
1259 }
1260
1261 /* Construct array, annotate with source location, and return. */
1262 array = scm_list_to_typed_array (tag, shape, elements);
b1b5433d 1263 return maybe_annotate_source (array, port, opts, line, column);
7337d56d
LC
1264}
1265
0ba0b384 1266static SCM
b1b5433d
MW
1267scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1268 long line, int column)
7337d56d 1269{
b1b5433d 1270 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1271}
1272
0ba0b384 1273static SCM
b1b5433d
MW
1274scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1275 long line, int column)
0ba0b384 1276{
be632904 1277 chr = scm_getc_unlocked (port);
0ba0b384
LC
1278 if (chr != 'u')
1279 goto syntax;
1280
be632904 1281 chr = scm_getc_unlocked (port);
0ba0b384
LC
1282 if (chr != '8')
1283 goto syntax;
1284
be632904 1285 chr = scm_getc_unlocked (port);
0ba0b384
LC
1286 if (chr != '(')
1287 goto syntax;
1288
b131b233 1289 return maybe_annotate_source
b1b5433d
MW
1290 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1291 port, opts, line, column);
0ba0b384
LC
1292
1293 syntax:
1294 scm_i_input_error ("read_bytevector", port,
1295 "invalid bytevector prefix",
1296 SCM_MAKE_CHAR (chr));
1297 return SCM_UNSPECIFIED;
1298}
1299
7337d56d 1300static SCM
b1b5433d
MW
1301scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1302 long line, int column)
7337d56d
LC
1303{
1304 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1305 terribly inefficient but who cares? */
1306 SCM s_bits = SCM_EOL;
1307
be632904 1308 for (chr = scm_getc_unlocked (port);
7337d56d 1309 (chr != EOF) && ((chr == '0') || (chr == '1'));
be632904 1310 chr = scm_getc_unlocked (port))
09a4f039 1311 {
7337d56d 1312 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1313 }
7337d56d
LC
1314
1315 if (chr != EOF)
c932ce0b 1316 scm_ungetc_unlocked (chr, port);
7337d56d 1317
b131b233
MW
1318 return maybe_annotate_source
1319 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
b1b5433d 1320 port, opts, line, column);
7337d56d
LC
1321}
1322
cfd15439 1323static SCM
889975e5 1324scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1325{
1326 int bang_seen = 0;
1327
1328 for (;;)
09a4f039 1329 {
be632904 1330 int c = scm_getc_unlocked (port);
62850ef3 1331
7337d56d
LC
1332 if (c == EOF)
1333 scm_i_input_error ("skip_block_comment", port,
1334 "unterminated `#! ... !#' comment", SCM_EOL);
1335
1336 if (c == '!')
1337 bang_seen = 1;
1338 else if (c == '#' && bang_seen)
1339 break;
1340 else
1341 bang_seen = 0;
1342 }
1343
1344 return SCM_UNSPECIFIED;
1345}
1346
9331ffd8
MW
1347static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1348 int value);
bf9eb54a
MW
1349static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1350 int value);
1351static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1352 int value);
9331ffd8 1353
d7fcaec3 1354static SCM
b1b5433d 1355scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
911b03b2 1356{
02327c0c
MW
1357 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1358 int c;
1359 int i = 0;
1360
1361 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
911b03b2 1362 {
fa980bcc 1363 c = scm_getc_unlocked (port);
02327c0c
MW
1364 if (c == EOF)
1365 scm_i_input_error ("skip_block_comment", port,
1366 "unterminated `#! ... !#' comment", SCM_EOL);
1367 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1368 name[i++] = c;
1369 else if (CHAR_IS_DELIMITER (c))
1370 {
fa980bcc 1371 scm_ungetc_unlocked (c, port);
02327c0c
MW
1372 name[i] = '\0';
1373 if (0 == strcmp ("r6rs", name))
1374 ; /* Silently ignore */
9331ffd8
MW
1375 else if (0 == strcmp ("fold-case", name))
1376 set_port_case_insensitive_p (port, opts, 1);
1377 else if (0 == strcmp ("no-fold-case", name))
1378 set_port_case_insensitive_p (port, opts, 0);
bf9eb54a
MW
1379 else if (0 == strcmp ("curly-infix", name))
1380 set_port_curly_infix_p (port, opts, 1);
1381 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1382 {
1383 set_port_curly_infix_p (port, opts, 1);
1384 set_port_square_brackets_p (port, opts, 0);
1385 }
02327c0c
MW
1386 else
1387 break;
1388
1389 return SCM_UNSPECIFIED;
1390 }
fa746547
MW
1391 else
1392 {
fa980bcc 1393 scm_ungetc_unlocked (c, port);
fa746547
MW
1394 break;
1395 }
911b03b2 1396 }
02327c0c 1397 while (i > 0)
fa980bcc 1398 scm_ungetc_unlocked (name[--i], port);
02327c0c 1399 return scm_read_scsh_block_comment (chr, port);
911b03b2
JG
1400}
1401
620c8965
LC
1402static SCM
1403scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1404{
1405 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1406 nested. So care must be taken. */
1407 int nesting_level = 1;
6d5f8c32 1408
be632904 1409 int a = scm_getc_unlocked (port);
6d5f8c32
AW
1410
1411 if (a == EOF)
1412 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1413 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1414
1415 while (nesting_level > 0)
1416 {
be632904 1417 int b = scm_getc_unlocked (port);
620c8965 1418
6d5f8c32 1419 if (b == EOF)
cd169c5a 1420 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1421 "unterminated `#| ... |#' comment", SCM_EOL);
1422
6d5f8c32
AW
1423 if (a == '|' && b == '#')
1424 {
1425 nesting_level--;
1426 b = EOF;
1427 }
1428 else if (a == '#' && b == '|')
1429 {
1430 nesting_level++;
1431 b = EOF;
1432 }
1433
1434 a = b;
620c8965
LC
1435 }
1436
1437 return SCM_UNSPECIFIED;
1438}
1439
34f3d47d 1440static SCM
b1b5433d
MW
1441scm_read_commented_expression (scm_t_wchar chr, SCM port,
1442 scm_t_read_opts *opts)
34f3d47d 1443{
889975e5 1444 scm_t_wchar c;
34f3d47d 1445
b1b5433d 1446 c = flush_ws (port, opts, (char *) NULL);
34f3d47d
AW
1447 if (EOF == c)
1448 scm_i_input_error ("read_commented_expression", port,
1449 "no expression after #; comment", SCM_EOL);
c932ce0b 1450 scm_ungetc_unlocked (c, port);
b1b5433d 1451 scm_read_expression (port, opts);
34f3d47d
AW
1452 return SCM_UNSPECIFIED;
1453}
1454
7337d56d 1455static SCM
889975e5 1456scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1457{
1458 /* Guile's extended symbol read syntax looks like this:
1459
1460 #{This is all a symbol name}#
1461
1462 So here, CHR is expected to be `{'. */
d9527cfa 1463 int saw_brace = 0;
7337d56d 1464 size_t len = 0;
190d4b0d 1465 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1466
889975e5 1467 buf = scm_i_string_start_writing (buf);
7337d56d 1468
be632904 1469 while ((chr = scm_getc_unlocked (port)) != EOF)
7337d56d
LC
1470 {
1471 if (saw_brace)
09a4f039 1472 {
7337d56d
LC
1473 if (chr == '#')
1474 {
7337d56d
LC
1475 break;
1476 }
1477 else
1478 {
1479 saw_brace = 0;
889975e5 1480 scm_i_string_set_x (buf, len++, '}');
7337d56d 1481 }
09a4f039 1482 }
d9527cfa
AW
1483
1484 if (chr == '}')
7337d56d 1485 saw_brace = 1;
d9527cfa
AW
1486 else if (chr == '\\')
1487 {
1488 /* It used to be that print.c would print extended-read-syntax
1489 symbols with backslashes before "non-standard" chars, but
1490 this routine wouldn't do anything with those escapes.
1491 Bummer. What we've done is to change print.c to output
1492 R6RS hex escapes for those characters, relying on the fact
1493 that the extended read syntax would never put a `\' before
1494 an `x'. For now, we just ignore other instances of
1495 backslash in the string. */
be632904 1496 switch ((chr = scm_getc_unlocked (port)))
d9527cfa
AW
1497 {
1498 case EOF:
1499 goto done;
1500 case 'x':
1501 {
1502 scm_t_wchar c;
1503
1504 SCM_READ_HEX_ESCAPE (10, ';');
1505 scm_i_string_set_x (buf, len++, c);
1506 break;
1507
1508 str_eof:
1509 chr = EOF;
1510 goto done;
1511
1512 bad_escaped:
1513 scm_i_string_stop_writing ();
1514 scm_i_input_error ("scm_read_extended_symbol", port,
1515 "illegal character in escape sequence: ~S",
1516 scm_list_1 (SCM_MAKE_CHAR (c)));
1517 break;
1518 }
1519 default:
1520 scm_i_string_set_x (buf, len++, chr);
1521 break;
1522 }
1523 }
7337d56d 1524 else
d9527cfa 1525 scm_i_string_set_x (buf, len++, chr);
62850ef3 1526
889975e5 1527 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1528 {
7f991c7d
LC
1529 SCM addy;
1530
889975e5 1531 scm_i_string_stop_writing ();
190d4b0d 1532 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1533 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1534 len = 0;
889975e5 1535 buf = scm_i_string_start_writing (buf);
7337d56d 1536 }
7337d56d 1537 }
d9527cfa
AW
1538
1539 done:
889975e5 1540 scm_i_string_stop_writing ();
d9527cfa
AW
1541 if (chr == EOF)
1542 scm_i_input_error ("scm_read_extended_symbol", port,
1543 "end of file while reading symbol", SCM_EOL);
7337d56d 1544
889975e5 1545 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1546}
1547
1548
1549\f
1550/* Top-level token readers, i.e., dispatchers. */
1551
1552static SCM
b1b5433d 1553scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1554{
1555 SCM proc;
1556
1557 proc = scm_get_hash_procedure (chr);
1558 if (scm_is_true (scm_procedure_p (proc)))
1559 {
1560 long line = SCM_LINUM (port);
1561 int column = SCM_COL (port) - 2;
1562 SCM got;
1563
1564 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14 1565
3655ed89
MW
1566 if (opts->record_positions_p && SCM_NIMP (got)
1567 && !scm_i_has_source_properties (got))
26c8cc14
AW
1568 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1569
1570 return got;
09a4f039 1571 }
7337d56d
LC
1572
1573 return SCM_UNSPECIFIED;
1574}
1575
1576/* The reader for the sharp `#' character. It basically dispatches reads
1577 among the above token readers. */
1578static SCM
b1b5433d
MW
1579scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1580 long line, int column)
7337d56d
LC
1581#define FUNC_NAME "scm_lreadr"
1582{
1583 SCM result;
1584
be632904 1585 chr = scm_getc_unlocked (port);
7337d56d 1586
b1b5433d 1587 result = scm_read_sharp_extension (chr, port, opts);
7337d56d
LC
1588 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1589 return result;
1590
1591 switch (chr)
1592 {
1593 case '\\':
b1b5433d 1594 return (scm_read_character (chr, port, opts));
7337d56d 1595 case '(':
b1b5433d 1596 return (scm_read_vector (chr, port, opts, line, column));
7337d56d
LC
1597 case 's':
1598 case 'u':
1599 case 'f':
df941b5b 1600 case 'c':
7337d56d 1601 /* This one may return either a boolean or an SRFI-4 vector. */
b1b5433d 1602 return (scm_read_srfi4_vector (chr, port, opts, line, column));
0ba0b384 1603 case 'v':
b1b5433d 1604 return (scm_read_bytevector (chr, port, opts, line, column));
7337d56d 1605 case '*':
b1b5433d 1606 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
7337d56d
LC
1607 case 't':
1608 case 'T':
1609 case 'F':
7337d56d
LC
1610 return (scm_read_boolean (chr, port));
1611 case ':':
b1b5433d 1612 return (scm_read_keyword (chr, port, opts));
7337d56d
LC
1613 case '0': case '1': case '2': case '3': case '4':
1614 case '5': case '6': case '7': case '8': case '9':
1615 case '@':
b1b5433d 1616 return (scm_read_array (chr, port, opts, line, column));
7337d56d
LC
1617
1618 case 'i':
1619 case 'e':
7337d56d
LC
1620 case 'b':
1621 case 'B':
1622 case 'o':
1623 case 'O':
1624 case 'd':
1625 case 'D':
1626 case 'x':
1627 case 'X':
1628 case 'I':
1629 case 'E':
b1b5433d 1630 return (scm_read_number_and_radix (chr, port, opts));
7337d56d
LC
1631 case '{':
1632 return (scm_read_extended_symbol (chr, port));
1633 case '!':
b1b5433d 1634 return (scm_read_shebang (chr, port, opts));
34f3d47d 1635 case ';':
b1b5433d 1636 return (scm_read_commented_expression (chr, port, opts));
34f3d47d
AW
1637 case '`':
1638 case '\'':
1639 case ',':
b1b5433d 1640 return (scm_read_syntax (chr, port, opts));
7c4aad9c 1641 case 'n':
b1b5433d 1642 return (scm_read_nil (chr, port, opts));
7337d56d 1643 default:
b1b5433d 1644 result = scm_read_sharp_extension (chr, port, opts);
7337d56d 1645 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1646 {
1647 /* To remain compatible with 1.8 and earlier, the following
1648 characters have lower precedence than `read-hash-extend'
1649 characters. */
1650 switch (chr)
1651 {
1652 case '|':
1653 return scm_read_r6rs_block_comment (chr, port);
1654 default:
1655 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1656 scm_list_1 (SCM_MAKE_CHAR (chr)));
1657 }
1658 }
7337d56d
LC
1659 else
1660 return result;
1661 }
1662
1663 return SCM_UNSPECIFIED;
1664}
1665#undef FUNC_NAME
1666
1667static SCM
bf9eb54a
MW
1668read_inner_expression (SCM port, scm_t_read_opts *opts)
1669#define FUNC_NAME "read_inner_expression"
7337d56d
LC
1670{
1671 while (1)
1672 {
cfd15439 1673 scm_t_wchar chr;
7337d56d 1674
be632904 1675 chr = scm_getc_unlocked (port);
7337d56d
LC
1676
1677 switch (chr)
1678 {
1679 case SCM_WHITE_SPACES:
1680 case SCM_LINE_INCREMENTORS:
1681 break;
1682 case ';':
1683 (void) scm_read_semicolon_comment (chr, port);
1684 break;
bf9eb54a
MW
1685 case '{':
1686 if (opts->curly_infix_p)
1687 {
1688 if (opts->neoteric_p)
1689 return scm_read_sexp (chr, port, opts);
1690 else
1691 {
1692 SCM expr;
1693
1694 /* Enable neoteric expressions within curly braces */
1695 opts->neoteric_p = 1;
1696 expr = scm_read_sexp (chr, port, opts);
1697 opts->neoteric_p = 0;
1698 return expr;
1699 }
1700 }
1701 else
1702 return scm_read_mixed_case_symbol (chr, port, opts);
5afa815c 1703 case '[':
bf9eb54a
MW
1704 if (opts->square_brackets_p)
1705 return scm_read_sexp (chr, port, opts);
1706 else if (opts->curly_infix_p)
1707 {
1708 /* The syntax of neoteric expressions requires that '[' be
1709 a delimiter when curly-infix is enabled, so it cannot
1710 be part of an unescaped symbol. We might as well do
1711 something useful with it, so we adopt Kawa's convention:
1712 [...] => ($bracket-list$ ...) */
1713 long line = SCM_LINUM (port);
1714 int column = SCM_COL (port) - 1;
1715 return maybe_annotate_source
1716 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1717 port, opts, line, column);
1718 }
1719 else
1720 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d 1721 case '(':
b1b5433d 1722 return (scm_read_sexp (chr, port, opts));
7337d56d 1723 case '"':
b1b5433d 1724 return (scm_read_string (chr, port, opts));
7337d56d
LC
1725 case '\'':
1726 case '`':
1727 case ',':
b1b5433d 1728 return (scm_read_quote (chr, port, opts));
7337d56d
LC
1729 case '#':
1730 {
b131b233
MW
1731 long line = SCM_LINUM (port);
1732 int column = SCM_COL (port) - 1;
b1b5433d 1733 SCM result = scm_read_sharp (chr, port, opts, line, column);
7337d56d
LC
1734 if (scm_is_eq (result, SCM_UNSPECIFIED))
1735 /* We read a comment or some such. */
1736 break;
1737 else
1738 return result;
1739 }
1740 case ')':
1741 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1742 break;
bf9eb54a
MW
1743 case '}':
1744 if (opts->curly_infix_p)
1745 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1746 else
1747 return scm_read_mixed_case_symbol (chr, port, opts);
a4e47229 1748 case ']':
b1b5433d 1749 if (opts->square_brackets_p)
a4e47229
MG
1750 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1751 /* otherwise fall through */
7337d56d
LC
1752 case EOF:
1753 return SCM_EOF_VAL;
1754 case ':':
b1b5433d
MW
1755 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1756 return scm_symbol_to_keyword (scm_read_expression (port, opts));
7337d56d
LC
1757 /* Fall through. */
1758
1759 default:
1760 {
1761 if (((chr >= '0') && (chr <= '9'))
1762 || (strchr ("+-.", chr)))
b1b5433d 1763 return (scm_read_number (chr, port, opts));
7337d56d 1764 else
b1b5433d 1765 return (scm_read_mixed_case_symbol (chr, port, opts));
7337d56d
LC
1766 }
1767 }
1768 }
1769}
1770#undef FUNC_NAME
1771
bf9eb54a
MW
1772static SCM
1773scm_read_expression (SCM port, scm_t_read_opts *opts)
1774#define FUNC_NAME "scm_read_expression"
1775{
1776 if (!opts->neoteric_p)
1777 return read_inner_expression (port, opts);
1778 else
1779 {
1780 long line = 0;
1781 int column = 0;
1782 SCM expr;
1783
1784 if (opts->record_positions_p)
1785 {
1786 /* We need to get the position of the first non-whitespace
1787 character in order to correctly annotate neoteric
1788 expressions. For example, for the expression 'f(x)', the
1789 first call to 'read_inner_expression' reads the 'f' (which
1790 cannot be annotated), and then we later read the '(x)' and
1791 use it to construct the new list (f x). */
1792 int c = flush_ws (port, opts, (char *) NULL);
1793 if (c == EOF)
1794 return SCM_EOF_VAL;
fa980bcc 1795 scm_ungetc_unlocked (c, port);
bf9eb54a
MW
1796 line = SCM_LINUM (port);
1797 column = SCM_COL (port);
1798 }
1799
1800 expr = read_inner_expression (port, opts);
1801
1802 /* 'expr' is the first component of the neoteric expression. Now
1803 we loop, and as long as the next character is '(', '[', or '{',
1804 (without any intervening whitespace), we use it to construct a
1805 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1806 for (;;)
1807 {
fa980bcc 1808 int chr = scm_getc_unlocked (port);
bf9eb54a
MW
1809
1810 if (chr == '(')
1811 /* e(...) => (e ...) */
1812 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1813 else if (chr == '[')
1814 /* e[...] => ($bracket-apply$ e ...) */
1815 expr = scm_cons (sym_bracket_apply,
1816 scm_cons (expr,
1817 scm_read_sexp (chr, port, opts)));
1818 else if (chr == '{')
1819 {
1820 SCM arg = scm_read_sexp (chr, port, opts);
1821
1822 if (scm_is_null (arg))
1823 expr = scm_list_1 (expr); /* e{} => (e) */
1824 else
1825 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1826 }
1827 else
1828 {
1829 if (chr != EOF)
fa980bcc 1830 scm_ungetc_unlocked (chr, port);
bf9eb54a
MW
1831 break;
1832 }
1833 maybe_annotate_source (expr, port, opts, line, column);
1834 }
1835 return expr;
1836 }
1837}
1838#undef FUNC_NAME
1839
7337d56d
LC
1840\f
1841/* Actual reader. */
1842
ea8c9761
MW
1843static void init_read_options (SCM port, scm_t_read_opts *opts);
1844
7337d56d
LC
1845SCM_DEFINE (scm_read, "read", 0, 1, 0,
1846 (SCM port),
1847 "Read an s-expression from the input port @var{port}, or from\n"
1848 "the current input port if @var{port} is not specified.\n"
1849 "Any whitespace before the next token is discarded.")
1850#define FUNC_NAME s_scm_read
1851{
b1b5433d 1852 scm_t_read_opts opts;
7337d56d
LC
1853 int c;
1854
1855 if (SCM_UNBNDP (port))
1856 port = scm_current_input_port ();
1857 SCM_VALIDATE_OPINPORT (1, port);
1858
ea8c9761 1859 init_read_options (port, &opts);
b1b5433d
MW
1860
1861 c = flush_ws (port, &opts, (char *) NULL);
7337d56d
LC
1862 if (EOF == c)
1863 return SCM_EOF_VAL;
c932ce0b 1864 scm_ungetc_unlocked (c, port);
7337d56d 1865
b1b5433d 1866 return (scm_read_expression (port, &opts));
09a4f039 1867}
db4b4ca6 1868#undef FUNC_NAME
09a4f039 1869
0f2d19dd
JB
1870
1871\f
1872
14de3b42
GH
1873/* Manipulate the read-hash-procedures alist. This could be written in
1874 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1875SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1876 (SCM chr, SCM proc),
dc7fa443
MG
1877 "Install the procedure @var{proc} for reading expressions\n"
1878 "starting with the character sequence @code{#} and @var{chr}.\n"
1879 "@var{proc} will be called with two arguments: the character\n"
1880 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1881 "returned will be the return value of @code{read}. \n"
1882 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1883 )
1bbd0b84 1884#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1885{
fed9c9a2
GH
1886 SCM this;
1887 SCM prev;
1888
36284627 1889 SCM_VALIDATE_CHAR (1, chr);
7888309b 1890 SCM_ASSERT (scm_is_false (proc)
bc36d050 1891 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1892 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1893
14de3b42 1894 /* Check if chr is already in the alist. */
d458073b 1895 this = scm_i_read_hash_procedures_ref ();
14de3b42 1896 prev = SCM_BOOL_F;
fed9c9a2
GH
1897 while (1)
1898 {
d2e53ed6 1899 if (scm_is_null (this))
fed9c9a2
GH
1900 {
1901 /* not found, so add it to the beginning. */
7888309b 1902 if (scm_is_true (proc))
fed9c9a2 1903 {
d458073b
AR
1904 SCM new = scm_cons (scm_cons (chr, proc),
1905 scm_i_read_hash_procedures_ref ());
1906 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1907 }
1908 break;
1909 }
bc36d050 1910 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1911 {
1912 /* already in the alist. */
7888309b 1913 if (scm_is_false (proc))
14de3b42
GH
1914 {
1915 /* remove it. */
7888309b 1916 if (scm_is_false (prev))
14de3b42 1917 {
d458073b
AR
1918 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1919 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1920 }
1921 else
1922 scm_set_cdr_x (prev, SCM_CDR (this));
1923 }
fed9c9a2 1924 else
14de3b42
GH
1925 {
1926 /* replace it. */
1927 scm_set_cdr_x (SCM_CAR (this), proc);
1928 }
fed9c9a2
GH
1929 break;
1930 }
1931 prev = this;
1932 this = SCM_CDR (this);
1933 }
deca31e1 1934
deca31e1
GH
1935 return SCM_UNSPECIFIED;
1936}
1bbd0b84 1937#undef FUNC_NAME
0f2d19dd 1938
deca31e1
GH
1939/* Recover the read-hash procedure corresponding to char c. */
1940static SCM
6e8d25a6 1941scm_get_hash_procedure (int c)
deca31e1 1942{
d458073b 1943 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1944
deca31e1
GH
1945 while (1)
1946 {
d2e53ed6 1947 if (scm_is_null (rest))
deca31e1
GH
1948 return SCM_BOOL_F;
1949
7866a09b 1950 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1951 return SCM_CDAR (rest);
1952
1953 rest = SCM_CDR (rest);
1954 }
1955}
1cc91f1b 1956
889975e5
MG
1957#define SCM_ENCODING_SEARCH_SIZE (500)
1958
f8a1c9a8
LC
1959/* Search the first few hundred characters of a file for an Emacs-like coding
1960 declaration. Returns either NULL or a string whose storage has been
1961 allocated with `scm_gc_malloc ()'. */
889975e5 1962char *
f8a1c9a8 1963scm_i_scan_for_encoding (SCM port)
889975e5 1964{
d900843c 1965 scm_t_port *pt;
889975e5 1966 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1967 size_t bytes_read, encoding_length, i;
889975e5
MG
1968 char *encoding = NULL;
1969 int utf8_bom = 0;
daedbca7 1970 char *pos, *encoding_start;
889975e5
MG
1971 int in_comment;
1972
d900843c 1973 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1974
d900843c 1975 if (pt->rw_active == SCM_PORT_WRITE)
4251ae2e 1976 scm_flush_unlocked (port);
49bb5bd3 1977
d900843c
AW
1978 if (pt->rw_random)
1979 pt->rw_active = SCM_PORT_READ;
1980
1981 if (pt->read_pos == pt->read_end)
1982 {
1983 /* We can use the read buffer, and thus avoid a seek. */
4251ae2e 1984 if (scm_fill_input_unlocked (port) == EOF)
d900843c
AW
1985 return NULL;
1986
1987 bytes_read = pt->read_end - pt->read_pos;
1988 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1989 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1990
1991 if (bytes_read <= 1)
1992 /* An unbuffered port -- don't scan. */
1993 return NULL;
1994
1995 memcpy (header, pt->read_pos, bytes_read);
1996 header[bytes_read] = '\0';
1997 }
1998 else
1999 {
2000 /* Try to read some bytes and then seek back. Not all ports
2001 support seeking back; and indeed some file ports (like
2002 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2003 check performed by SCM_FPORT_FDES---but fail to seek
2004 backwards. Hence this block comes second. We prefer to use
2005 the read buffer in-place. */
2006 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2007 return NULL;
2008
be632904 2009 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
d900843c
AW
2010 header[bytes_read] = '\0';
2011 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2012 }
889975e5
MG
2013
2014 if (bytes_read > 3
2015 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
2016 utf8_bom = 1;
2017
2018 /* search past "coding[:=]" */
2019 pos = header;
2020 while (1)
2021 {
2022 if ((pos = strstr(pos, "coding")) == NULL)
2023 return NULL;
2024
2025 pos += strlen("coding");
2026 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2027 (*pos == ':' || *pos == '='))
2028 {
2029 pos ++;
2030 break;
2031 }
2032 }
2033
2034 /* skip spaces */
2035 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2036 (*pos == ' ' || *pos == '\t'))
2037 pos ++;
2038
2039 /* grab the next token */
daedbca7 2040 encoding_start = pos;
889975e5 2041 i = 0;
daedbca7
MG
2042 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2043 && encoding_start + i - header < bytes_read
2044 && (isalnum ((int) encoding_start[i])
2045 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
2046 i++;
2047
daedbca7
MG
2048 encoding_length = i;
2049 if (encoding_length == 0)
889975e5
MG
2050 return NULL;
2051
daedbca7
MG
2052 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2053 for (i = 0; i < encoding_length; i++)
026ed239 2054 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
2055
2056 /* push backwards to make sure we were in a comment */
2057 in_comment = 0;
daedbca7
MG
2058 pos = encoding_start;
2059 while (pos >= header)
889975e5 2060 {
8a12aeb9
AW
2061 if (*pos == ';')
2062 {
2063 in_comment = 1;
2064 break;
2065 }
2066 else if (*pos == '\n' || pos == header)
889975e5
MG
2067 {
2068 /* This wasn't in a semicolon comment. Check for a
2069 hash-bang comment. */
2070 char *beg = strstr (header, "#!");
2071 char *end = strstr (header, "!#");
8a12aeb9 2072 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
2073 in_comment = 1;
2074 break;
2075 }
8a12aeb9
AW
2076 else
2077 {
2078 pos --;
2079 continue;
2080 }
889975e5
MG
2081 }
2082 if (!in_comment)
f8a1c9a8
LC
2083 /* This wasn't in a comment */
2084 return NULL;
2085
889975e5 2086 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 2087 scm_misc_error (NULL,
889975e5
MG
2088 "the port input declares the encoding ~s but is encoded as UTF-8",
2089 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 2090
889975e5
MG
2091 return encoding;
2092}
2093
2094SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2095 (SCM port),
a270e133 2096 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 2097 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
2098 "The coding declaration is of the form\n"
2099 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2100 "\n"
2101 "Returns a string containing the character encoding of the file\n"
2102 "if a declaration was found, or @code{#f} otherwise.\n")
2103#define FUNC_NAME s_scm_file_encoding
2104{
2105 char *enc;
2106 SCM s_enc;
f8a1c9a8 2107
d900843c
AW
2108 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2109
f8a1c9a8 2110 enc = scm_i_scan_for_encoding (port);
889975e5
MG
2111 if (enc == NULL)
2112 return SCM_BOOL_F;
2113 else
2114 {
2115 s_enc = scm_from_locale_string (enc);
889975e5
MG
2116 return s_enc;
2117 }
f8a1c9a8 2118
889975e5
MG
2119 return SCM_BOOL_F;
2120}
2121#undef FUNC_NAME
2122
ea8c9761
MW
2123\f
2124/* Per-port read options.
2125
2126 We store per-port read options in the 'port-read-options' key of the
fa980bcc
MW
2127 port's alist. The value stored in the alist is a single integer that
2128 contains a two-bit field for each read option.
ea8c9761
MW
2129
2130 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2131 the applicable value should be inherited from the corresponding
2132 global read option. Otherwise, the bit field contains the value of
2133 the read option. For boolean read options that have been set
2134 per-port, the possible values are 0 or 1. If the 'keyword_style'
2135 read option has been set per-port, its possible values are those in
2136 'enum t_keyword_style'. */
2137
2138/* Key to read options in per-port alists. */
2139SCM_SYMBOL (sym_port_read_options, "port-read-options");
2140
2141/* Offsets of bit fields for each per-port override */
2142#define READ_OPTION_COPY_SOURCE_P 0
2143#define READ_OPTION_RECORD_POSITIONS_P 2
2144#define READ_OPTION_CASE_INSENSITIVE_P 4
2145#define READ_OPTION_KEYWORD_STYLE 6
2146#define READ_OPTION_R6RS_ESCAPES_P 8
2147#define READ_OPTION_SQUARE_BRACKETS_P 10
2148#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
bf9eb54a 2149#define READ_OPTION_CURLY_INFIX_P 14
ea8c9761 2150
bf9eb54a
MW
2151/* The total width in bits of the per-port overrides */
2152#define READ_OPTIONS_NUM_BITS 16
ea8c9761
MW
2153
2154#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2155#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2156
2157#define READ_OPTION_MASK 3
2158#define READ_OPTION_INHERIT 3
2159
2160static void
2161set_port_read_option (SCM port, int option, int new_value)
2162{
fa980bcc 2163 SCM scm_read_options;
ea8c9761
MW
2164 unsigned int read_options;
2165
2166 new_value &= READ_OPTION_MASK;
fa980bcc
MW
2167 scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
2168 sym_port_read_options);
ea8c9761
MW
2169 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2170 read_options = scm_to_uint (scm_read_options);
2171 else
2172 read_options = READ_OPTIONS_INHERIT_ALL;
2173 read_options &= ~(READ_OPTION_MASK << option);
2174 read_options |= new_value << option;
2175 scm_read_options = scm_from_uint (read_options);
fa980bcc
MW
2176 SCM_PTAB_ENTRY(port)->alist = scm_assq_set_x (SCM_PTAB_ENTRY(port)->alist,
2177 sym_port_read_options,
2178 scm_read_options);
ea8c9761
MW
2179}
2180
9331ffd8
MW
2181/* Set OPTS and PORT's case-insensitivity according to VALUE. */
2182static void
2183set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2184{
2185 value = !!value;
2186 opts->case_insensitive_p = value;
2187 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2188}
2189
bf9eb54a
MW
2190/* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2191static void
2192set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2193{
2194 value = !!value;
2195 opts->square_brackets_p = value;
2196 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2197}
2198
2199/* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2200static void
2201set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2202{
2203 value = !!value;
2204 opts->curly_infix_p = value;
2205 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2206}
2207
ea8c9761
MW
2208/* Initialize OPTS based on PORT's read options and the global read
2209 options. */
2210static void
2211init_read_options (SCM port, scm_t_read_opts *opts)
2212{
fa980bcc 2213 SCM val, scm_read_options;
ea8c9761
MW
2214 unsigned int read_options, x;
2215
fa980bcc
MW
2216 scm_read_options = scm_assq_ref (SCM_PTAB_ENTRY(port)->alist,
2217 sym_port_read_options);
ea8c9761
MW
2218
2219 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2220 read_options = scm_to_uint (scm_read_options);
2221 else
2222 read_options = READ_OPTIONS_INHERIT_ALL;
2223
2224 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2225 if (x == READ_OPTION_INHERIT)
2226 {
2227 val = SCM_PACK (SCM_KEYWORD_STYLE);
2228 if (scm_is_eq (val, scm_keyword_prefix))
2229 x = KEYWORD_STYLE_PREFIX;
2230 else if (scm_is_eq (val, scm_keyword_postfix))
2231 x = KEYWORD_STYLE_POSTFIX;
2232 else
2233 x = KEYWORD_STYLE_HASH_PREFIX;
2234 }
2235 opts->keyword_style = x;
2236
2237#define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2238 do \
2239 { \
2240 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2241 if (x == READ_OPTION_INHERIT) \
2242 x = !!SCM_ ## NAME; \
2243 opts->name = x; \
2244 } \
2245 while (0)
2246
2247 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2248 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2249 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2250 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2251 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2252 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
bf9eb54a 2253 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
ea8c9761
MW
2254
2255#undef RESOLVE_BOOLEAN_OPTION
bf9eb54a
MW
2256
2257 opts->neoteric_p = 0;
ea8c9761
MW
2258}
2259
0f2d19dd
JB
2260void
2261scm_init_read ()
0f2d19dd 2262{
d458073b
AR
2263 SCM read_hash_procs;
2264
c81c2ad3 2265 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
2266
2267 scm_i_read_hash_procedures =
2268 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 2269
62560650 2270 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 2271#include "libguile/read.x"
0f2d19dd 2272}
89e00824
ML
2273
2274/*
2275 Local Variables:
2276 c-file-style: "gnu"
2277 End:
2278*/