Block system asyncs while 'overrides_lock' is held.
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37 1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
419c8736 2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
96965a6e 33#include <c-strcase.h>
7337d56d 34
a0599745 35#include "libguile/_scm.h"
0ba0b384 36#include "libguile/bytevectors.h"
a0599745
MD
37#include "libguile/chars.h"
38#include "libguile/eval.h"
2fa901a5 39#include "libguile/arrays.h"
cf396142 40#include "libguile/bitvectors.h"
a0599745
MD
41#include "libguile/keywords.h"
42#include "libguile/alist.h"
43#include "libguile/srcprop.h"
44#include "libguile/hashtab.h"
45#include "libguile/hash.h"
46#include "libguile/ports.h"
49bb5bd3 47#include "libguile/fports.h"
a0599745
MD
48#include "libguile/root.h"
49#include "libguile/strings.h"
ba1b2226 50#include "libguile/strports.h"
a0599745 51#include "libguile/vectors.h"
a0599745 52#include "libguile/validate.h"
a4022e69 53#include "libguile/srfi-4.h"
7337d56d 54#include "libguile/srfi-13.h"
ba1b2226 55
a0599745 56#include "libguile/read.h"
22fc179a
HWN
57#include "libguile/private-options.h"
58
0f2d19dd
JB
59
60\f
61
5bf6a6f0 62SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 63SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 64SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 65SCM_SYMBOL (sym_nil, "nil");
c7733771 66
bf9eb54a
MW
67/* SRFI-105 curly infix expression support */
68SCM_SYMBOL (sym_nfx, "$nfx$");
69SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
70SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
71
72scm_t_option scm_read_opts[] =
73 {
74 { SCM_OPTION_BOOLEAN, "copy", 0,
75 "Copy source code expressions." },
76 { SCM_OPTION_BOOLEAN, "positions", 1,
77 "Record positions of source code expressions." },
78 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
79 "Convert symbols to lower case."},
80 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
81 "Style of keyword recognition: #f, 'prefix or 'postfix."},
82 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
83 "Use R6RS variable-length character and string hex escapes."},
84 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
85 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
87 "In strings, consume leading whitespace after an escaped end-of-line."},
88 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
89 "Support SRFI-105 curly infix expressions."},
90 { 0, },
91 };
b1b5433d
MW
92
93/* Internal read options structure. This is initialized by 'scm_read'
ea8c9761
MW
94 from the global and per-port read options, and a pointer is passed
95 down to all helper functions. */
96
97enum t_keyword_style
98 {
99 KEYWORD_STYLE_HASH_PREFIX,
100 KEYWORD_STYLE_PREFIX,
101 KEYWORD_STYLE_POSTFIX
102 };
b1b5433d 103
ea8c9761
MW
104struct t_read_opts
105{
b1b5433d
MW
106 enum t_keyword_style keyword_style;
107 unsigned int copy_source_p : 1;
108 unsigned int record_positions_p : 1;
109 unsigned int case_insensitive_p : 1;
110 unsigned int r6rs_escapes_p : 1;
111 unsigned int square_brackets_p : 1;
112 unsigned int hungry_eol_escapes_p : 1;
bf9eb54a
MW
113 unsigned int curly_infix_p : 1;
114 unsigned int neoteric_p : 1;
b1b5433d
MW
115};
116
117typedef struct t_read_opts scm_t_read_opts;
118
a16f6fe7 119
39e8f371
HWN
120/*
121 Give meaningful error messages for errors
122
123 We use the format
124
ba1b2226 125 FILE:LINE:COL: MESSAGE
39e8f371
HWN
126 This happened in ....
127
128 This is not standard GNU format, but the test-suite likes the real
129 message to be in front.
130
39e8f371
HWN
131 */
132
133
a4022e69
MV
134void
135scm_i_input_error (char const *function,
136 SCM port, const char *message, SCM arg)
ba1b2226 137{
29a837fd
MV
138 SCM fn = (scm_is_string (SCM_FILENAME(port))
139 ? SCM_FILENAME(port)
140 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 141
29a837fd 142 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
143 SCM string = SCM_EOL;
144 scm_simple_format (string_port,
272632a6 145 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 146 scm_list_4 (fn,
b3aa4626 147 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 148 scm_from_int (SCM_COL (port) + 1),
272632a6 149 scm_from_locale_string (message)));
ba1b2226
HWN
150
151 string = scm_get_output_string (string_port);
152 scm_close_output_port (string_port);
4a655e50 153 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 154 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 155 string,
dd72382c 156 arg,
ba1b2226
HWN
157 SCM_BOOL_F);
158}
39e8f371
HWN
159
160
a1ec6916 161SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 162 (SCM setting),
dc7fa443
MG
163 "Option interface for the read options. Instead of using\n"
164 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 165 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 166#define FUNC_NAME s_scm_read_options
a16f6fe7 167{
b7ff98dd
MD
168 SCM ans = scm_options (setting,
169 scm_read_opts,
1bbd0b84 170 FUNC_NAME);
b7ff98dd
MD
171 if (SCM_COPY_SOURCE_P)
172 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
173 return ans;
174}
1bbd0b84 175#undef FUNC_NAME
a16f6fe7 176
d458073b
AR
177/* A fluid referring to an association list mapping extra hash
178 characters to procedures. */
179static SCM *scm_i_read_hash_procedures;
deca31e1 180
cfd15439 181static SCM
d458073b
AR
182scm_i_read_hash_procedures_ref (void)
183{
184 return scm_fluid_ref (*scm_i_read_hash_procedures);
185}
186
cfd15439 187static void
d458073b
AR
188scm_i_read_hash_procedures_set_x (SCM value)
189{
190 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
191}
0f2d19dd 192
7337d56d
LC
193\f
194/* Token readers. */
0f2d19dd 195
0f2d19dd 196
7337d56d
LC
197/* Size of the C buffer used to read symbols and numbers. */
198#define READER_BUFFER_SIZE 128
0f2d19dd 199
ff4d3672
LC
200/* Number of 32-bit codepoints in the buffer used to read strings. */
201#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 202
7337d56d
LC
203/* The maximum size of Scheme character names. */
204#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 205
02327c0c
MW
206/* The maximum size of reader directive names. */
207#define READER_DIRECTIVE_NAME_MAX_SIZE 50
208
94115ae3 209
7337d56d
LC
210/* `isblank' is only in C99. */
211#define CHAR_IS_BLANK_(_chr) \
212 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 213 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
214
215#ifdef MSDOS
216# define CHAR_IS_BLANK(_chr) \
217 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
218#else
219# define CHAR_IS_BLANK CHAR_IS_BLANK_
220#endif
221
222
223/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
224 structure''). */
225#define CHAR_IS_R5RS_DELIMITER(c) \
226 (CHAR_IS_BLANK (c) \
603234c6 227 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
7337d56d 228
603234c6
MW
229#define CHAR_IS_DELIMITER(c) \
230 (CHAR_IS_R5RS_DELIMITER (c) \
bf9eb54a
MW
231 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
232 || opts->curly_infix_p)) \
233 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
7337d56d
LC
234
235/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
236 Structure''. */
237#define CHAR_IS_EXPONENT_MARKER(_chr) \
238 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
239 || ((_chr) == 'd') || ((_chr) == 'l'))
240
454866e0 241/* Read an SCSH block comment. */
cfd15439 242static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965 243static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
b1b5433d
MW
244static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
245static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
620c8965 246static SCM scm_get_hash_procedure (int);
454866e0 247
69f90b0b
MG
248/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
249 result in the pre-allocated buffer BUF. Return zero if the whole token has
250 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
251 bytes actually read. */
cfd15439 252static int
b1b5433d
MW
253read_token (SCM port, scm_t_read_opts *opts,
254 char *buf, size_t buf_size, size_t *read)
b662b7e9 255{
69f90b0b 256 *read = 0;
0520c320 257
69f90b0b
MG
258 while (*read < buf_size)
259 {
260 int chr;
7337d56d 261
69f90b0b 262 chr = scm_get_byte_or_eof (port);
889975e5 263
69f90b0b
MG
264 if (chr == EOF)
265 return 0;
266 else if (CHAR_IS_DELIMITER (chr))
267 {
268 scm_unget_byte (chr, port);
269 return 0;
270 }
271 else
272 {
273 *buf = (char) chr;
274 buf++, (*read)++;
275 }
276 }
889975e5 277
69f90b0b
MG
278 return 1;
279 }
889975e5 280
b662b7e9
LC
281/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
282 if the token doesn't fit in BUFFER_SIZE bytes. */
283static char *
b1b5433d
MW
284read_complete_token (SCM port, scm_t_read_opts *opts,
285 char *buffer, size_t buffer_size, size_t *read)
69f90b0b
MG
286{
287 int overflow = 0;
b662b7e9
LC
288 size_t bytes_read, overflow_size = 0;
289 char *overflow_buffer = NULL;
1cc91f1b 290
69f90b0b
MG
291 do
292 {
b1b5433d 293 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
69f90b0b
MG
294 if (bytes_read == 0)
295 break;
296 if (overflow || overflow_size != 0)
297 {
298 if (overflow_size == 0)
299 {
b662b7e9
LC
300 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
301 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
302 overflow_size = bytes_read;
303 }
304 else
305 {
7be3c2fc 306 char *new_buf =
b662b7e9
LC
307 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
308
309 memcpy (new_buf, overflow_buffer, overflow_size);
310 memcpy (new_buf + overflow_size, buffer, bytes_read);
311
312 overflow_buffer = new_buf;
69f90b0b
MG
313 overflow_size += bytes_read;
314 }
315 }
889975e5 316 }
69f90b0b 317 while (overflow);
889975e5 318
69f90b0b
MG
319 if (overflow_size)
320 *read = overflow_size;
b521d265 321 else
69f90b0b
MG
322 *read = bytes_read;
323
b662b7e9 324 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 325}
7337d56d
LC
326
327/* Skip whitespace from PORT and return the first non-whitespace character
328 read. Raise an error on end-of-file. */
329static int
b1b5433d 330flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
0f2d19dd 331{
cfd15439 332 scm_t_wchar c;
0f2d19dd 333 while (1)
b7f3516f 334 switch (c = scm_getc (port))
0f2d19dd
JB
335 {
336 case EOF:
337 goteof:
338 if (eoferr)
d156d3b7 339 {
a4022e69
MV
340 scm_i_input_error (eoferr,
341 port,
342 "end of file",
343 SCM_EOL);
d156d3b7 344 }
0f2d19dd 345 return c;
7337d56d 346
0f2d19dd
JB
347 case ';':
348 lp:
b7f3516f 349 switch (c = scm_getc (port))
0f2d19dd
JB
350 {
351 case EOF:
352 goto goteof;
353 default:
354 goto lp;
355 case SCM_LINE_INCREMENTORS:
356 break;
357 }
358 break;
7337d56d 359
454866e0
LC
360 case '#':
361 switch (c = scm_getc (port))
362 {
363 case EOF:
364 eoferr = "read_sharp";
365 goto goteof;
366 case '!':
b1b5433d 367 scm_read_shebang (c, port, opts);
454866e0 368 break;
34f3d47d 369 case ';':
b1b5433d 370 scm_read_commented_expression (c, port, opts);
34f3d47d 371 break;
620c8965
LC
372 case '|':
373 if (scm_is_false (scm_get_hash_procedure (c)))
374 {
375 scm_read_r6rs_block_comment (c, port);
376 break;
377 }
378 /* fall through */
454866e0
LC
379 default:
380 scm_ungetc (c, port);
381 return '#';
382 }
383 break;
384
0f2d19dd 385 case SCM_LINE_INCREMENTORS:
0f2d19dd 386 case SCM_SINGLE_SPACES:
0f2d19dd 387 case '\t':
0f2d19dd 388 break;
7337d56d 389
0f2d19dd
JB
390 default:
391 return c;
392 }
7337d56d
LC
393
394 return 0;
0f2d19dd
JB
395}
396
397
7337d56d
LC
398\f
399/* Token readers. */
1cc91f1b 400
b1b5433d
MW
401static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
402static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
403 long line, int column);
0f2d19dd
JB
404
405
d6cb0203 406static SCM
b1b5433d
MW
407maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
408 long line, int column)
d6cb0203 409{
b1b5433d 410 if (opts->record_positions_p)
d6cb0203
MW
411 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
412 return x;
413}
414
09a4f039 415static SCM
b1b5433d 416scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 417#define FUNC_NAME "scm_i_lreadparen"
09a4f039 418{
26c8cc14
AW
419 int c;
420 SCM tmp, tl, ans = SCM_EOL;
bf9eb54a
MW
421 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
422 const int terminating_char = ((chr == '{') ? '}'
423 : ((chr == '[') ? ']'
424 : ')'));
7337d56d
LC
425
426 /* Need to capture line and column numbers here. */
427 long line = SCM_LINUM (port);
428 int column = SCM_COL (port) - 1;
f9c68a47 429
b1b5433d 430 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
431 if (terminating_char == c)
432 return SCM_EOL;
f9c68a47 433
7337d56d 434 scm_ungetc (c, port);
b1b5433d 435 tmp = scm_read_expression (port, opts);
1f7945a7
AW
436
437 /* Note that it is possible for scm_read_expression to return
438 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
439 check that it's a real dot by checking `c'. */
440 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d 441 {
b1b5433d
MW
442 ans = scm_read_expression (port, opts);
443 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
7337d56d
LC
444 scm_i_input_error (FUNC_NAME, port, "missing close paren",
445 SCM_EOL);
446 return ans;
447 }
1cc91f1b 448
7337d56d
LC
449 /* Build the head of the list structure. */
450 ans = tl = scm_cons (tmp, SCM_EOL);
451
b1b5433d 452 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
0f2d19dd 453 {
7337d56d 454 SCM new_tail;
0f2d19dd 455
bf9eb54a
MW
456 if (c == ')' || (c == ']' && opts->square_brackets_p)
457 || ((c == '}' || c == ']') && opts->curly_infix_p))
5b69315e
AW
458 scm_i_input_error (FUNC_NAME, port,
459 "in pair: mismatched close paren: ~A",
460 scm_list_1 (SCM_MAKE_CHAR (c)));
461
7337d56d 462 scm_ungetc (c, port);
b1b5433d 463 tmp = scm_read_expression (port, opts);
5b69315e 464
1f7945a7
AW
465 /* See above note about scm_sym_dot. */
466 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 467 {
b1b5433d 468 SCM_SETCDR (tl, scm_read_expression (port, opts));
7337d56d 469
b1b5433d 470 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
471 if (terminating_char != c)
472 scm_i_input_error (FUNC_NAME, port,
473 "in pair: missing close paren", SCM_EOL);
bf9eb54a 474 break;
0f2d19dd 475 }
b858464a 476
7337d56d
LC
477 new_tail = scm_cons (tmp, SCM_EOL);
478 SCM_SETCDR (tl, new_tail);
479 tl = new_tail;
7337d56d 480 }
0f2d19dd 481
bf9eb54a
MW
482 if (curly_list_p)
483 {
484 /* In addition to finding the length, 'scm_ilength' checks for
485 improper or circular lists, in which case it returns -1. */
486 int len = scm_ilength (ans);
487
488 /* The (len == 0) case is handled above */
489 if (len == 1)
490 /* Return directly to avoid re-annotating the element's source
491 location with the position of the outer brace. Also, it
492 might not be possible to annotate the element. */
493 return scm_car (ans); /* {e} => e */
494 else if (len == 2)
495 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
496 else if (len >= 3 && (len & 1))
497 {
498 /* It's a proper list whose length is odd and at least 3. If
499 the elements at odd indices (the infix operator positions)
500 are all 'equal?', then it's a simple curly-infix list.
501 Otherwise it's a mixed curly-infix list. */
502 SCM op = scm_cadr (ans);
503
504 /* Check to see if the elements at odd indices are 'equal?' */
505 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
506 {
507 if (scm_is_null (tl))
508 {
509 /* Convert simple curly-infix list to prefix:
510 {a <op> b <op> ...} => (<op> a b ...) */
511 tl = ans;
512 while (scm_is_pair (scm_cdr (tl)))
513 {
514 tmp = scm_cddr (tl);
515 SCM_SETCDR (tl, tmp);
516 tl = tmp;
517 }
518 ans = scm_cons (op, ans);
519 break;
520 }
521 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
522 {
523 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
524 ans = scm_cons (sym_nfx, ans);
525 break;
526 }
527 }
528 }
529 else
530 /* Mixed curly-infix (possibly improper) list:
531 {e . tail} => ($nfx$ e . tail) */
532 ans = scm_cons (sym_nfx, ans);
533 }
534
b1b5433d 535 return maybe_annotate_source (ans, port, opts, line, column);
7337d56d
LC
536}
537#undef FUNC_NAME
a4022e69 538
c5661d28
MG
539
540/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
541 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
542 found. */
543#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
544 do \
545 { \
546 scm_t_wchar a; \
547 size_t i = 0; \
548 c = 0; \
549 while (i < ndigits) \
550 { \
551 a = scm_getc (port); \
552 if (a == EOF) \
553 goto str_eof; \
554 if (terminator \
555 && (a == (scm_t_wchar) terminator) \
556 && (i > 0)) \
557 break; \
558 if ('0' <= a && a <= '9') \
559 a -= '0'; \
560 else if ('A' <= a && a <= 'F') \
561 a = a - 'A' + 10; \
562 else if ('a' <= a && a <= 'f') \
563 a = a - 'a' + 10; \
564 else \
565 { \
566 c = a; \
567 goto bad_escaped; \
568 } \
569 c = c * 16 + a; \
570 i ++; \
571 } \
c5661d28
MG
572 } while (0)
573
684d664e
AW
574static void
575skip_intraline_whitespace (SCM port)
576{
577 scm_t_wchar c;
578
579 do
580 {
581 c = scm_getc (port);
582 if (c == EOF)
583 return;
584 }
585 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
586
587 scm_ungetc (c, port);
588}
589
7337d56d 590static SCM
b1b5433d 591scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
592#define FUNC_NAME "scm_lreadr"
593{
594 /* For strings smaller than C_STR, this function creates only one Scheme
595 object (the string returned). */
0f2d19dd 596
ff4d3672
LC
597 SCM str = SCM_EOL;
598 size_t c_str_len = 0;
599 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 600
b131b233
MW
601 /* Need to capture line and column numbers here. */
602 long line = SCM_LINUM (port);
603 int column = SCM_COL (port) - 1;
604
7337d56d
LC
605 while ('"' != (c = scm_getc (port)))
606 {
607 if (c == EOF)
9c44cd45
MG
608 {
609 str_eof:
610 scm_i_input_error (FUNC_NAME, port,
611 "end of file in string constant", SCM_EOL);
612 }
0f2d19dd 613
ff4d3672
LC
614 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
615 {
616 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
617 c_str_len = 0;
618 }
0f2d19dd 619
7337d56d 620 if (c == '\\')
9c44cd45
MG
621 {
622 switch (c = scm_getc (port))
623 {
624 case EOF:
625 goto str_eof;
626 case '"':
627 case '\\':
628 break;
9c44cd45 629 case '\n':
b1b5433d 630 if (opts->hungry_eol_escapes_p)
684d664e 631 skip_intraline_whitespace (port);
9c44cd45
MG
632 continue;
633 case '0':
634 c = '\0';
635 break;
636 case 'f':
637 c = '\f';
638 break;
639 case 'n':
640 c = '\n';
641 break;
642 case 'r':
643 c = '\r';
644 break;
645 case 't':
646 c = '\t';
647 break;
648 case 'a':
649 c = '\007';
650 break;
651 case 'v':
652 c = '\v';
653 break;
67a4a16d
MG
654 case 'b':
655 c = '\010';
656 break;
9c44cd45 657 case 'x':
b1b5433d 658 if (opts->r6rs_escapes_p)
dea901d6
MG
659 SCM_READ_HEX_ESCAPE (10, ';');
660 else
661 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 662 break;
9c44cd45 663 case 'u':
b1b5433d 664 if (!opts->r6rs_escapes_p)
898a0b5a
MG
665 {
666 SCM_READ_HEX_ESCAPE (4, '\0');
667 break;
668 }
9c44cd45 669 case 'U':
b1b5433d 670 if (!opts->r6rs_escapes_p)
898a0b5a
MG
671 {
672 SCM_READ_HEX_ESCAPE (6, '\0');
673 break;
674 }
9c44cd45
MG
675 default:
676 bad_escaped:
677 scm_i_input_error (FUNC_NAME, port,
678 "illegal character in escape sequence: ~S",
679 scm_list_1 (SCM_MAKE_CHAR (c)));
680 }
681 }
ff4d3672
LC
682
683 c_str[c_str_len++] = c;
7337d56d 684 }
ff4d3672
LC
685
686 if (scm_is_null (str))
687 /* Fast path: we got a string that fits in C_STR. */
688 str = scm_from_utf32_stringn (c_str, c_str_len);
689 else
690 {
691 if (c_str_len > 0)
692 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
693
694 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
695 }
696
b1b5433d 697 return maybe_annotate_source (str, port, opts, line, column);
0f2d19dd 698}
db4b4ca6
DH
699#undef FUNC_NAME
700
0f2d19dd 701
7337d56d 702static SCM
b1b5433d 703scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
0f2d19dd 704{
69f90b0b 705 SCM result, str = SCM_EOL;
b662b7e9 706 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 707 size_t bytes_read;
69f90b0b 708 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 709
38f19074
MW
710 /* Need to capture line and column numbers here. */
711 long line = SCM_LINUM (port);
712 int column = SCM_COL (port) - 1;
713
7337d56d 714 scm_ungetc (chr, port);
b1b5433d 715 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 716 &bytes_read);
69f90b0b 717
b662b7e9 718 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
719
720 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 721 if (scm_is_false (result))
69f90b0b
MG
722 {
723 /* Return a symbol instead of a number */
b1b5433d 724 if (opts->case_insensitive_p)
69f90b0b
MG
725 str = scm_string_downcase_x (str);
726 result = scm_string_to_symbol (str);
727 }
38f19074 728 else if (SCM_NIMP (result))
b1b5433d 729 result = maybe_annotate_source (result, port, opts, line, column);
0f2d19dd 730
69f90b0b 731 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
732 return result;
733}
0f2d19dd 734
7337d56d 735static SCM
b1b5433d 736scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 737{
889975e5
MG
738 SCM result;
739 int ends_with_colon = 0;
69f90b0b 740 size_t bytes_read;
b1b5433d 741 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
b662b7e9 742 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b
MG
743 scm_t_port *pt = SCM_PTAB_ENTRY (port);
744 SCM str;
7337d56d
LC
745
746 scm_ungetc (chr, port);
b1b5433d 747 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 748 &bytes_read);
69f90b0b 749 if (bytes_read > 0)
b662b7e9 750 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 751
69f90b0b
MG
752 if (postfix && ends_with_colon && (bytes_read > 1))
753 {
b662b7e9
LC
754 str = scm_from_stringn (buffer, bytes_read - 1,
755 pt->encoding, pt->ilseq_handler);
69f90b0b 756
b1b5433d 757 if (opts->case_insensitive_p)
69f90b0b
MG
758 str = scm_string_downcase_x (str);
759 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
760 }
7337d56d 761 else
69f90b0b 762 {
b662b7e9
LC
763 str = scm_from_stringn (buffer, bytes_read,
764 pt->encoding, pt->ilseq_handler);
69f90b0b 765
b1b5433d 766 if (opts->case_insensitive_p)
69f90b0b
MG
767 str = scm_string_downcase_x (str);
768 result = scm_string_to_symbol (str);
769 }
7337d56d 770
69f90b0b 771 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
772 return result;
773}
774
775static SCM
b1b5433d 776scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
777#define FUNC_NAME "scm_lreadr"
778{
889975e5 779 SCM result;
7337d56d 780 size_t read;
b662b7e9 781 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 782 unsigned int radix;
69f90b0b
MG
783 SCM str;
784 scm_t_port *pt;
7337d56d
LC
785
786 switch (chr)
787 {
788 case 'B':
789 case 'b':
790 radix = 2;
791 break;
792
793 case 'o':
794 case 'O':
795 radix = 8;
796 break;
797
798 case 'd':
799 case 'D':
800 radix = 10;
801 break;
802
803 case 'x':
804 case 'X':
805 radix = 16;
806 break;
807
808 default:
809 scm_ungetc (chr, port);
810 scm_ungetc ('#', port);
811 radix = 10;
812 }
813
b1b5433d 814 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 815 &read);
69f90b0b
MG
816
817 pt = SCM_PTAB_ENTRY (port);
b662b7e9 818 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
819
820 result = scm_string_to_number (str, scm_from_uint (radix));
821
69f90b0b 822 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
823
824 if (scm_is_true (result))
825 return result;
826
827 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
828
829 return SCM_BOOL_F;
830}
831#undef FUNC_NAME
832
833static SCM
b1b5433d 834scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
835{
836 SCM p;
492faee1
LC
837 long line = SCM_LINUM (port);
838 int column = SCM_COL (port) - 1;
7337d56d
LC
839
840 switch (chr)
841 {
842 case '`':
843 p = scm_sym_quasiquote;
844 break;
845
846 case '\'':
847 p = scm_sym_quote;
848 break;
849
850 case ',':
851 {
889975e5 852 scm_t_wchar c;
7337d56d
LC
853
854 c = scm_getc (port);
855 if ('@' == c)
856 p = scm_sym_uq_splicing;
857 else
0f2d19dd 858 {
7337d56d
LC
859 scm_ungetc (c, port);
860 p = scm_sym_unquote;
0f2d19dd 861 }
7337d56d
LC
862 break;
863 }
0f2d19dd 864
7337d56d
LC
865 default:
866 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 867 "scm_read_quote", chr);
7337d56d 868 abort ();
0f2d19dd 869 }
1cc91f1b 870
b1b5433d
MW
871 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
872 return maybe_annotate_source (p, port, opts, line, column);
7337d56d
LC
873}
874
34f3d47d
AW
875SCM_SYMBOL (sym_syntax, "syntax");
876SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
877SCM_SYMBOL (sym_unsyntax, "unsyntax");
878SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
879
880static SCM
b1b5433d 881scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
34f3d47d
AW
882{
883 SCM p;
884 long line = SCM_LINUM (port);
885 int column = SCM_COL (port) - 1;
886
887 switch (chr)
888 {
889 case '`':
890 p = sym_quasisyntax;
891 break;
892
893 case '\'':
894 p = sym_syntax;
895 break;
896
897 case ',':
898 {
899 int c;
900
901 c = scm_getc (port);
902 if ('@' == c)
903 p = sym_unsyntax_splicing;
904 else
905 {
906 scm_ungetc (c, port);
907 p = sym_unsyntax;
908 }
909 break;
910 }
911
912 default:
913 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
914 "scm_read_syntax", chr);
915 abort ();
916 }
917
b1b5433d
MW
918 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
919 return maybe_annotate_source (p, port, opts, line, column);
34f3d47d
AW
920}
921
cfd15439 922static SCM
b1b5433d 923scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
7c4aad9c 924{
b1b5433d 925 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
7c4aad9c
AW
926
927 if (!scm_is_eq (id, sym_nil))
928 scm_i_input_error ("scm_read_nil", port,
929 "unexpected input while reading #nil: ~a",
930 scm_list_1 (id));
931
932 return SCM_ELISP_NIL;
933}
934
cfd15439 935static SCM
7337d56d 936scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 937{
0f2d19dd
JB
938 int c;
939
889975e5
MG
940 /* We use the get_byte here because there is no need to get the
941 locale correct with comment input. This presumes that newline
942 always represents itself no matter what the encoding is. */
943 for (c = scm_get_byte_or_eof (port);
7337d56d 944 (c != EOF) && (c != '\n');
8a8da78d 945 c = scm_get_byte_or_eof (port));
7337d56d
LC
946
947 return SCM_UNSPECIFIED;
948}
949
950\f
951/* Sharp readers, i.e. readers called after a `#' sign has been read. */
952
953static SCM
954scm_read_boolean (int chr, SCM port)
955{
956 switch (chr)
0f2d19dd 957 {
7337d56d
LC
958 case 't':
959 case 'T':
960 return SCM_BOOL_T;
961
962 case 'f':
963 case 'F':
964 return SCM_BOOL_F;
0f2d19dd 965 }
7337d56d
LC
966
967 return SCM_UNSPECIFIED;
968}
969
970static SCM
b1b5433d 971scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
972#define FUNC_NAME "scm_lreadr"
973{
69f90b0b
MG
974 char buffer[READER_CHAR_NAME_MAX_SIZE];
975 SCM charname;
976 size_t charname_len, bytes_read;
889975e5
MG
977 scm_t_wchar cp;
978 int overflow;
69f90b0b 979 scm_t_port *pt;
889975e5 980
b1b5433d
MW
981 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
982 &bytes_read);
889975e5 983 if (overflow)
c1a0ba1c 984 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 985
69f90b0b 986 if (bytes_read == 0)
0f2d19dd 987 {
7337d56d
LC
988 chr = scm_getc (port);
989 if (chr == EOF)
990 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
991 "while reading character", SCM_EOL);
992
993 /* CHR must be a token delimiter, like a whitespace. */
994 return (SCM_MAKE_CHAR (chr));
0f2d19dd 995 }
7337d56d 996
69f90b0b 997 pt = SCM_PTAB_ENTRY (port);
7337d56d 998
69f90b0b
MG
999 /* Simple ASCII characters can be processed immediately. Also, simple
1000 ISO-8859-1 characters can be processed immediately if the encoding for this
1001 port is ISO-8859-1. */
1002 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1003 {
1004 SCM_COL (port) += 1;
1005 return SCM_MAKE_CHAR (buffer[0]);
1006 }
1007
1008 /* Otherwise, convert the buffer into a proper scheme string for
1009 processing. */
1010 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1011 pt->ilseq_handler);
1012 charname_len = scm_i_string_length (charname);
1013 SCM_COL (port) += charname_len;
889975e5 1014 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
1015 if (charname_len == 1)
1016 return SCM_MAKE_CHAR (cp);
1017
1018 /* Ignore dotted circles, which may be used to keep combining characters from
1019 combining with the backslash in #\charname. */
0dcd7e61
MG
1020 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1021 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1022
889975e5 1023 if (cp >= '0' && cp < '8')
7337d56d
LC
1024 {
1025 /* Dirk:FIXME:: This type of character syntax is not R5RS
1026 * compliant. Further, it should be verified that the constant
0ffc78e3 1027 * does only consist of octal digits. */
889975e5 1028 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 1029 if (SCM_I_INUMP (p))
0ffc78e3 1030 {
e25f3727 1031 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
1032 if (SCM_IS_UNICODE_CHAR (c))
1033 return SCM_MAKE_CHAR (c);
1034 else
0f3a70cf 1035 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
1036 "out-of-range octal character escape: ~a",
1037 scm_list_1 (charname));
1038 }
7337d56d
LC
1039 }
1040
0f3a70cf 1041 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1042 {
1043 SCM p;
0f3a70cf 1044
dea901d6
MG
1045 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1046 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1047 scm_from_uint (16));
1048 if (SCM_I_INUMP (p))
1049 {
e25f3727 1050 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1051 if (SCM_IS_UNICODE_CHAR (c))
1052 return SCM_MAKE_CHAR (c);
1053 else
1054 scm_i_input_error (FUNC_NAME, port,
1055 "out-of-range hex character escape: ~a",
1056 scm_list_1 (charname));
1057 }
1058 }
1059
889975e5
MG
1060 /* The names of characters should never have non-Latin1
1061 characters. */
1062 if (scm_i_is_narrow_string (charname)
1063 || scm_i_try_narrow_string (charname))
4769c9db
AW
1064 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1065 charname_len);
1066 if (scm_is_true (ch))
1067 return ch;
1068 }
7337d56d 1069
7337d56d 1070 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1071 scm_list_1 (charname));
7337d56d
LC
1072
1073 return SCM_UNSPECIFIED;
0f2d19dd 1074}
db4b4ca6 1075#undef FUNC_NAME
0f2d19dd 1076
cfd15439 1077static SCM
b1b5433d 1078scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1079{
1080 SCM symbol;
1081
1082 /* Read the symbol that comprises the keyword. Doing this instead of
1083 invoking a specific symbol reader function allows `scm_read_keyword ()'
1084 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1085
7337d56d 1086 XXX: This implementation allows sloppy syntaxes like `#: key'. */
b1b5433d 1087 symbol = scm_read_expression (port, opts);
7337d56d 1088 if (!scm_is_symbol (symbol))
7f74cf9a 1089 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1090 "keyword prefix `~a' not followed by a symbol: ~s",
1091 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1092
1093 return (scm_symbol_to_keyword (symbol));
1094}
1095
cfd15439 1096static SCM
b1b5433d
MW
1097scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1098 long line, int column)
09a4f039 1099{
7337d56d
LC
1100 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1101 guarantee that it's going to do what we want. After all, this is an
1102 implementation detail of `scm_read_vector ()', not a desirable
1103 property. */
b1b5433d
MW
1104 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1105 port, opts, line, column);
b131b233
MW
1106}
1107
493ceb99
MW
1108/* Helper used by scm_read_array */
1109static int
1110read_decimal_integer (SCM port, int c, ssize_t *resp)
1111{
1112 ssize_t sign = 1;
1113 ssize_t res = 0;
1114 int got_it = 0;
1115
1116 if (c == '-')
1117 {
1118 sign = -1;
1119 c = scm_getc (port);
1120 }
1121
1122 while ('0' <= c && c <= '9')
1123 {
1124 res = 10*res + c-'0';
1125 got_it = 1;
1126 c = scm_getc (port);
1127 }
1128
1129 if (got_it)
1130 *resp = sign * res;
1131 return c;
1132}
1133
1134/* Read an array. This function can also read vectors and uniform
1135 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1136 handled here.
1137
10744b7c 1138 C is the first character read after the '#'. */
b131b233 1139static SCM
b1b5433d 1140scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
b131b233 1141{
493ceb99
MW
1142 ssize_t rank;
1143 scm_t_wchar tag_buf[8];
1144 int tag_len;
1145
1146 SCM tag, shape = SCM_BOOL_F, elements, array;
1147
1148 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1149 the array code can not deal with zero-length dimensions yet, and
10744b7c 1150 we want to allow zero-length vectors, of course. */
493ceb99 1151 if (c == '(')
b1b5433d 1152 return scm_read_vector (c, port, opts, line, column);
493ceb99 1153
10744b7c 1154 /* Disambiguate between '#f' and uniform floating point vectors. */
493ceb99
MW
1155 if (c == 'f')
1156 {
1157 c = scm_getc (port);
1158 if (c != '3' && c != '6')
1159 {
1160 if (c != EOF)
1161 scm_ungetc (c, port);
1162 return SCM_BOOL_F;
1163 }
1164 rank = 1;
1165 tag_buf[0] = 'f';
1166 tag_len = 1;
1167 goto continue_reading_tag;
1168 }
1169
1170 /* Read rank. */
1171 rank = 1;
1172 c = read_decimal_integer (port, c, &rank);
1173 if (rank < 0)
1174 scm_i_input_error (NULL, port, "array rank must be non-negative",
1175 SCM_EOL);
1176
1177 /* Read tag. */
1178 tag_len = 0;
1179 continue_reading_tag:
1180 while (c != EOF && c != '(' && c != '@' && c != ':'
1181 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1182 {
1183 tag_buf[tag_len++] = c;
1184 c = scm_getc (port);
1185 }
1186 if (tag_len == 0)
1187 tag = SCM_BOOL_T;
b131b233 1188 else
493ceb99
MW
1189 {
1190 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1191 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1192 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1193 scm_list_1 (tag));
1194 }
1195
1196 /* Read shape. */
1197 if (c == '@' || c == ':')
1198 {
1199 shape = SCM_EOL;
1200
1201 do
1202 {
1203 ssize_t lbnd = 0, len = 0;
1204 SCM s;
1205
1206 if (c == '@')
1207 {
1208 c = scm_getc (port);
1209 c = read_decimal_integer (port, c, &lbnd);
1210 }
1211
1212 s = scm_from_ssize_t (lbnd);
1213
1214 if (c == ':')
1215 {
1216 c = scm_getc (port);
1217 c = read_decimal_integer (port, c, &len);
1218 if (len < 0)
1219 scm_i_input_error (NULL, port,
1220 "array length must be non-negative",
1221 SCM_EOL);
1222
1223 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1224 }
1225
1226 shape = scm_cons (s, shape);
1227 } while (c == '@' || c == ':');
1228
1229 shape = scm_reverse_x (shape, SCM_EOL);
1230 }
1231
1232 /* Read nested lists of elements. */
1233 if (c != '(')
1234 scm_i_input_error (NULL, port,
1235 "missing '(' in vector or array literal",
1236 SCM_EOL);
b1b5433d 1237 elements = scm_read_sexp (c, port, opts);
493ceb99
MW
1238
1239 if (scm_is_false (shape))
1240 shape = scm_from_ssize_t (rank);
1241 else if (scm_ilength (shape) != rank)
1242 scm_i_input_error
1243 (NULL, port,
1244 "the number of shape specifications must match the array rank",
1245 SCM_EOL);
1246
1247 /* Handle special print syntax of rank zero arrays; see
1248 scm_i_print_array for a rationale. */
1249 if (rank == 0)
1250 {
1251 if (!scm_is_pair (elements))
1252 scm_i_input_error (NULL, port,
1253 "too few elements in array literal, need 1",
1254 SCM_EOL);
1255 if (!scm_is_null (SCM_CDR (elements)))
1256 scm_i_input_error (NULL, port,
1257 "too many elements in array literal, want 1",
1258 SCM_EOL);
1259 elements = SCM_CAR (elements);
1260 }
1261
1262 /* Construct array, annotate with source location, and return. */
1263 array = scm_list_to_typed_array (tag, shape, elements);
b1b5433d 1264 return maybe_annotate_source (array, port, opts, line, column);
7337d56d 1265}
09a4f039 1266
cfd15439 1267static SCM
b1b5433d
MW
1268scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1269 long line, int column)
7337d56d 1270{
b1b5433d 1271 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1272}
1273
0ba0b384 1274static SCM
b1b5433d
MW
1275scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1276 long line, int column)
0ba0b384
LC
1277{
1278 chr = scm_getc (port);
1279 if (chr != 'u')
1280 goto syntax;
1281
1282 chr = scm_getc (port);
1283 if (chr != '8')
1284 goto syntax;
1285
1286 chr = scm_getc (port);
1287 if (chr != '(')
1288 goto syntax;
1289
b131b233 1290 return maybe_annotate_source
b1b5433d
MW
1291 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1292 port, opts, line, column);
0ba0b384
LC
1293
1294 syntax:
1295 scm_i_input_error ("read_bytevector", port,
1296 "invalid bytevector prefix",
1297 SCM_MAKE_CHAR (chr));
1298 return SCM_UNSPECIFIED;
1299}
1300
7337d56d 1301static SCM
b1b5433d
MW
1302scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1303 long line, int column)
7337d56d
LC
1304{
1305 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1306 terribly inefficient but who cares? */
1307 SCM s_bits = SCM_EOL;
1308
1309 for (chr = scm_getc (port);
1310 (chr != EOF) && ((chr == '0') || (chr == '1'));
1311 chr = scm_getc (port))
09a4f039 1312 {
7337d56d 1313 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1314 }
7337d56d
LC
1315
1316 if (chr != EOF)
1317 scm_ungetc (chr, port);
1318
b131b233
MW
1319 return maybe_annotate_source
1320 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
b1b5433d 1321 port, opts, line, column);
7337d56d
LC
1322}
1323
cfd15439 1324static SCM
889975e5 1325scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1326{
1327 int bang_seen = 0;
1328
1329 for (;;)
09a4f039 1330 {
58b1db5f 1331 int c = scm_getc (port);
62850ef3 1332
7337d56d
LC
1333 if (c == EOF)
1334 scm_i_input_error ("skip_block_comment", port,
1335 "unterminated `#! ... !#' comment", SCM_EOL);
1336
1337 if (c == '!')
1338 bang_seen = 1;
1339 else if (c == '#' && bang_seen)
1340 break;
1341 else
1342 bang_seen = 0;
1343 }
1344
1345 return SCM_UNSPECIFIED;
1346}
1347
9331ffd8
MW
1348static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1349 int value);
bf9eb54a
MW
1350static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1351 int value);
1352static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1353 int value);
9331ffd8 1354
d7fcaec3 1355static SCM
b1b5433d 1356scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
911b03b2 1357{
02327c0c
MW
1358 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1359 int c;
1360 int i = 0;
1361
1362 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
911b03b2 1363 {
02327c0c
MW
1364 c = scm_getc (port);
1365 if (c == EOF)
1366 scm_i_input_error ("skip_block_comment", port,
1367 "unterminated `#! ... !#' comment", SCM_EOL);
1368 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1369 name[i++] = c;
1370 else if (CHAR_IS_DELIMITER (c))
1371 {
1372 scm_ungetc (c, port);
1373 name[i] = '\0';
1374 if (0 == strcmp ("r6rs", name))
1375 ; /* Silently ignore */
9331ffd8
MW
1376 else if (0 == strcmp ("fold-case", name))
1377 set_port_case_insensitive_p (port, opts, 1);
1378 else if (0 == strcmp ("no-fold-case", name))
1379 set_port_case_insensitive_p (port, opts, 0);
bf9eb54a
MW
1380 else if (0 == strcmp ("curly-infix", name))
1381 set_port_curly_infix_p (port, opts, 1);
1382 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1383 {
1384 set_port_curly_infix_p (port, opts, 1);
1385 set_port_square_brackets_p (port, opts, 0);
1386 }
02327c0c
MW
1387 else
1388 break;
1389
1390 return SCM_UNSPECIFIED;
1391 }
fa746547
MW
1392 else
1393 {
1394 scm_ungetc (c, port);
1395 break;
1396 }
911b03b2 1397 }
02327c0c
MW
1398 while (i > 0)
1399 scm_ungetc (name[--i], port);
1400 return scm_read_scsh_block_comment (chr, port);
911b03b2
JG
1401}
1402
620c8965
LC
1403static SCM
1404scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1405{
1406 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1407 nested. So care must be taken. */
1408 int nesting_level = 1;
6d5f8c32
AW
1409
1410 int a = scm_getc (port);
1411
1412 if (a == EOF)
1413 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1414 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1415
1416 while (nesting_level > 0)
1417 {
6d5f8c32 1418 int b = scm_getc (port);
620c8965 1419
6d5f8c32 1420 if (b == EOF)
cd169c5a 1421 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1422 "unterminated `#| ... |#' comment", SCM_EOL);
1423
6d5f8c32
AW
1424 if (a == '|' && b == '#')
1425 {
1426 nesting_level--;
1427 b = EOF;
1428 }
1429 else if (a == '#' && b == '|')
1430 {
1431 nesting_level++;
1432 b = EOF;
1433 }
1434
1435 a = b;
620c8965
LC
1436 }
1437
1438 return SCM_UNSPECIFIED;
1439}
1440
34f3d47d 1441static SCM
b1b5433d
MW
1442scm_read_commented_expression (scm_t_wchar chr, SCM port,
1443 scm_t_read_opts *opts)
34f3d47d 1444{
889975e5 1445 scm_t_wchar c;
34f3d47d 1446
b1b5433d 1447 c = flush_ws (port, opts, (char *) NULL);
34f3d47d
AW
1448 if (EOF == c)
1449 scm_i_input_error ("read_commented_expression", port,
1450 "no expression after #; comment", SCM_EOL);
1451 scm_ungetc (c, port);
b1b5433d 1452 scm_read_expression (port, opts);
34f3d47d
AW
1453 return SCM_UNSPECIFIED;
1454}
1455
7337d56d 1456static SCM
889975e5 1457scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1458{
1459 /* Guile's extended symbol read syntax looks like this:
1460
1461 #{This is all a symbol name}#
1462
1463 So here, CHR is expected to be `{'. */
d9527cfa 1464 int saw_brace = 0;
7337d56d 1465 size_t len = 0;
190d4b0d 1466 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1467
889975e5 1468 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1469
1470 while ((chr = scm_getc (port)) != EOF)
1471 {
1472 if (saw_brace)
09a4f039 1473 {
7337d56d
LC
1474 if (chr == '#')
1475 {
7337d56d
LC
1476 break;
1477 }
1478 else
1479 {
1480 saw_brace = 0;
889975e5 1481 scm_i_string_set_x (buf, len++, '}');
7337d56d 1482 }
09a4f039 1483 }
d9527cfa
AW
1484
1485 if (chr == '}')
7337d56d 1486 saw_brace = 1;
d9527cfa
AW
1487 else if (chr == '\\')
1488 {
1489 /* It used to be that print.c would print extended-read-syntax
1490 symbols with backslashes before "non-standard" chars, but
1491 this routine wouldn't do anything with those escapes.
1492 Bummer. What we've done is to change print.c to output
1493 R6RS hex escapes for those characters, relying on the fact
1494 that the extended read syntax would never put a `\' before
1495 an `x'. For now, we just ignore other instances of
1496 backslash in the string. */
1497 switch ((chr = scm_getc (port)))
1498 {
1499 case EOF:
1500 goto done;
1501 case 'x':
1502 {
1503 scm_t_wchar c;
1504
1505 SCM_READ_HEX_ESCAPE (10, ';');
1506 scm_i_string_set_x (buf, len++, c);
1507 break;
1508
1509 str_eof:
1510 chr = EOF;
1511 goto done;
1512
1513 bad_escaped:
1514 scm_i_string_stop_writing ();
1515 scm_i_input_error ("scm_read_extended_symbol", port,
1516 "illegal character in escape sequence: ~S",
1517 scm_list_1 (SCM_MAKE_CHAR (c)));
1518 break;
1519 }
1520 default:
1521 scm_i_string_set_x (buf, len++, chr);
1522 break;
1523 }
1524 }
7337d56d 1525 else
d9527cfa 1526 scm_i_string_set_x (buf, len++, chr);
62850ef3 1527
889975e5 1528 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1529 {
7f991c7d
LC
1530 SCM addy;
1531
889975e5 1532 scm_i_string_stop_writing ();
190d4b0d 1533 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1534 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1535 len = 0;
889975e5 1536 buf = scm_i_string_start_writing (buf);
7337d56d 1537 }
7337d56d 1538 }
d9527cfa
AW
1539
1540 done:
889975e5 1541 scm_i_string_stop_writing ();
d9527cfa
AW
1542 if (chr == EOF)
1543 scm_i_input_error ("scm_read_extended_symbol", port,
1544 "end of file while reading symbol", SCM_EOL);
7337d56d 1545
889975e5 1546 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1547}
1548
1549
1550\f
1551/* Top-level token readers, i.e., dispatchers. */
1552
1553static SCM
b1b5433d 1554scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1555{
1556 SCM proc;
1557
1558 proc = scm_get_hash_procedure (chr);
1559 if (scm_is_true (scm_procedure_p (proc)))
1560 {
1561 long line = SCM_LINUM (port);
1562 int column = SCM_COL (port) - 2;
1563 SCM got;
1564
1565 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14 1566
3655ed89
MW
1567 if (opts->record_positions_p && SCM_NIMP (got)
1568 && !scm_i_has_source_properties (got))
26c8cc14
AW
1569 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1570
1571 return got;
09a4f039 1572 }
7337d56d
LC
1573
1574 return SCM_UNSPECIFIED;
1575}
1576
1577/* The reader for the sharp `#' character. It basically dispatches reads
1578 among the above token readers. */
1579static SCM
b1b5433d
MW
1580scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1581 long line, int column)
7337d56d
LC
1582#define FUNC_NAME "scm_lreadr"
1583{
1584 SCM result;
1585
1586 chr = scm_getc (port);
1587
b1b5433d 1588 result = scm_read_sharp_extension (chr, port, opts);
7337d56d
LC
1589 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1590 return result;
1591
1592 switch (chr)
1593 {
1594 case '\\':
b1b5433d 1595 return (scm_read_character (chr, port, opts));
7337d56d 1596 case '(':
b1b5433d 1597 return (scm_read_vector (chr, port, opts, line, column));
7337d56d
LC
1598 case 's':
1599 case 'u':
1600 case 'f':
df941b5b 1601 case 'c':
7337d56d 1602 /* This one may return either a boolean or an SRFI-4 vector. */
b1b5433d 1603 return (scm_read_srfi4_vector (chr, port, opts, line, column));
0ba0b384 1604 case 'v':
b1b5433d 1605 return (scm_read_bytevector (chr, port, opts, line, column));
7337d56d 1606 case '*':
b1b5433d 1607 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
7337d56d
LC
1608 case 't':
1609 case 'T':
1610 case 'F':
7337d56d
LC
1611 return (scm_read_boolean (chr, port));
1612 case ':':
b1b5433d 1613 return (scm_read_keyword (chr, port, opts));
7337d56d
LC
1614 case '0': case '1': case '2': case '3': case '4':
1615 case '5': case '6': case '7': case '8': case '9':
1616 case '@':
1617#if SCM_ENABLE_DEPRECATED
1618 /* See below for 'i' and 'e'. */
1619 case 'a':
7337d56d
LC
1620 case 'y':
1621 case 'h':
1622 case 'l':
1623#endif
b1b5433d 1624 return (scm_read_array (chr, port, opts, line, column));
7337d56d
LC
1625
1626 case 'i':
1627 case 'e':
1628#if SCM_ENABLE_DEPRECATED
1629 {
1630 /* When next char is '(', it really is an old-style
1631 uniform array. */
889975e5 1632 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1633 if (next_c != EOF)
1634 scm_ungetc (next_c, port);
1635 if (next_c == '(')
b1b5433d 1636 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1637 /* Fall through. */
1638 }
1639#endif
1640 case 'b':
1641 case 'B':
1642 case 'o':
1643 case 'O':
1644 case 'd':
1645 case 'D':
1646 case 'x':
1647 case 'X':
1648 case 'I':
1649 case 'E':
b1b5433d 1650 return (scm_read_number_and_radix (chr, port, opts));
7337d56d
LC
1651 case '{':
1652 return (scm_read_extended_symbol (chr, port));
1653 case '!':
b1b5433d 1654 return (scm_read_shebang (chr, port, opts));
34f3d47d 1655 case ';':
b1b5433d 1656 return (scm_read_commented_expression (chr, port, opts));
34f3d47d
AW
1657 case '`':
1658 case '\'':
1659 case ',':
b1b5433d 1660 return (scm_read_syntax (chr, port, opts));
7c4aad9c 1661 case 'n':
b1b5433d 1662 return (scm_read_nil (chr, port, opts));
7337d56d 1663 default:
b1b5433d 1664 result = scm_read_sharp_extension (chr, port, opts);
7337d56d 1665 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1666 {
1667 /* To remain compatible with 1.8 and earlier, the following
1668 characters have lower precedence than `read-hash-extend'
1669 characters. */
1670 switch (chr)
1671 {
1672 case '|':
1673 return scm_read_r6rs_block_comment (chr, port);
1674 default:
1675 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1676 scm_list_1 (SCM_MAKE_CHAR (chr)));
1677 }
1678 }
7337d56d
LC
1679 else
1680 return result;
1681 }
1682
1683 return SCM_UNSPECIFIED;
1684}
1685#undef FUNC_NAME
1686
1687static SCM
bf9eb54a
MW
1688read_inner_expression (SCM port, scm_t_read_opts *opts)
1689#define FUNC_NAME "read_inner_expression"
7337d56d
LC
1690{
1691 while (1)
1692 {
cfd15439 1693 scm_t_wchar chr;
7337d56d
LC
1694
1695 chr = scm_getc (port);
1696
1697 switch (chr)
1698 {
1699 case SCM_WHITE_SPACES:
1700 case SCM_LINE_INCREMENTORS:
1701 break;
1702 case ';':
1703 (void) scm_read_semicolon_comment (chr, port);
1704 break;
bf9eb54a
MW
1705 case '{':
1706 if (opts->curly_infix_p)
1707 {
1708 if (opts->neoteric_p)
1709 return scm_read_sexp (chr, port, opts);
1710 else
1711 {
1712 SCM expr;
1713
1714 /* Enable neoteric expressions within curly braces */
1715 opts->neoteric_p = 1;
1716 expr = scm_read_sexp (chr, port, opts);
1717 opts->neoteric_p = 0;
1718 return expr;
1719 }
1720 }
1721 else
1722 return scm_read_mixed_case_symbol (chr, port, opts);
5afa815c 1723 case '[':
bf9eb54a
MW
1724 if (opts->square_brackets_p)
1725 return scm_read_sexp (chr, port, opts);
1726 else if (opts->curly_infix_p)
1727 {
1728 /* The syntax of neoteric expressions requires that '[' be
1729 a delimiter when curly-infix is enabled, so it cannot
1730 be part of an unescaped symbol. We might as well do
1731 something useful with it, so we adopt Kawa's convention:
1732 [...] => ($bracket-list$ ...) */
1733 long line = SCM_LINUM (port);
1734 int column = SCM_COL (port) - 1;
1735 return maybe_annotate_source
1736 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1737 port, opts, line, column);
1738 }
1739 else
1740 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d 1741 case '(':
b1b5433d 1742 return (scm_read_sexp (chr, port, opts));
7337d56d 1743 case '"':
b1b5433d 1744 return (scm_read_string (chr, port, opts));
7337d56d
LC
1745 case '\'':
1746 case '`':
1747 case ',':
b1b5433d 1748 return (scm_read_quote (chr, port, opts));
7337d56d
LC
1749 case '#':
1750 {
b131b233
MW
1751 long line = SCM_LINUM (port);
1752 int column = SCM_COL (port) - 1;
b1b5433d 1753 SCM result = scm_read_sharp (chr, port, opts, line, column);
7337d56d
LC
1754 if (scm_is_eq (result, SCM_UNSPECIFIED))
1755 /* We read a comment or some such. */
1756 break;
1757 else
1758 return result;
1759 }
1760 case ')':
1761 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1762 break;
bf9eb54a
MW
1763 case '}':
1764 if (opts->curly_infix_p)
1765 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1766 else
1767 return scm_read_mixed_case_symbol (chr, port, opts);
a4e47229 1768 case ']':
b1b5433d 1769 if (opts->square_brackets_p)
a4e47229
MG
1770 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1771 /* otherwise fall through */
7337d56d
LC
1772 case EOF:
1773 return SCM_EOF_VAL;
1774 case ':':
b1b5433d
MW
1775 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1776 return scm_symbol_to_keyword (scm_read_expression (port, opts));
7337d56d
LC
1777 /* Fall through. */
1778
1779 default:
1780 {
1781 if (((chr >= '0') && (chr <= '9'))
1782 || (strchr ("+-.", chr)))
b1b5433d 1783 return (scm_read_number (chr, port, opts));
7337d56d 1784 else
b1b5433d 1785 return (scm_read_mixed_case_symbol (chr, port, opts));
7337d56d
LC
1786 }
1787 }
1788 }
1789}
1790#undef FUNC_NAME
1791
bf9eb54a
MW
1792static SCM
1793scm_read_expression (SCM port, scm_t_read_opts *opts)
1794#define FUNC_NAME "scm_read_expression"
1795{
1796 if (!opts->neoteric_p)
1797 return read_inner_expression (port, opts);
1798 else
1799 {
1800 long line = 0;
1801 int column = 0;
1802 SCM expr;
1803
1804 if (opts->record_positions_p)
1805 {
1806 /* We need to get the position of the first non-whitespace
1807 character in order to correctly annotate neoteric
1808 expressions. For example, for the expression 'f(x)', the
1809 first call to 'read_inner_expression' reads the 'f' (which
1810 cannot be annotated), and then we later read the '(x)' and
1811 use it to construct the new list (f x). */
1812 int c = flush_ws (port, opts, (char *) NULL);
1813 if (c == EOF)
1814 return SCM_EOF_VAL;
1815 scm_ungetc (c, port);
1816 line = SCM_LINUM (port);
1817 column = SCM_COL (port);
1818 }
1819
1820 expr = read_inner_expression (port, opts);
1821
1822 /* 'expr' is the first component of the neoteric expression. Now
1823 we loop, and as long as the next character is '(', '[', or '{',
1824 (without any intervening whitespace), we use it to construct a
1825 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1826 for (;;)
1827 {
1828 int chr = scm_getc (port);
1829
1830 if (chr == '(')
1831 /* e(...) => (e ...) */
1832 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1833 else if (chr == '[')
1834 /* e[...] => ($bracket-apply$ e ...) */
1835 expr = scm_cons (sym_bracket_apply,
1836 scm_cons (expr,
1837 scm_read_sexp (chr, port, opts)));
1838 else if (chr == '{')
1839 {
1840 SCM arg = scm_read_sexp (chr, port, opts);
1841
1842 if (scm_is_null (arg))
1843 expr = scm_list_1 (expr); /* e{} => (e) */
1844 else
1845 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1846 }
1847 else
1848 {
1849 if (chr != EOF)
1850 scm_ungetc (chr, port);
1851 break;
1852 }
1853 maybe_annotate_source (expr, port, opts, line, column);
1854 }
1855 return expr;
1856 }
1857}
1858#undef FUNC_NAME
1859
7337d56d
LC
1860\f
1861/* Actual reader. */
1862
ea8c9761
MW
1863static void init_read_options (SCM port, scm_t_read_opts *opts);
1864
7337d56d
LC
1865SCM_DEFINE (scm_read, "read", 0, 1, 0,
1866 (SCM port),
1867 "Read an s-expression from the input port @var{port}, or from\n"
1868 "the current input port if @var{port} is not specified.\n"
1869 "Any whitespace before the next token is discarded.")
1870#define FUNC_NAME s_scm_read
1871{
b1b5433d 1872 scm_t_read_opts opts;
7337d56d
LC
1873 int c;
1874
1875 if (SCM_UNBNDP (port))
1876 port = scm_current_input_port ();
1877 SCM_VALIDATE_OPINPORT (1, port);
1878
ea8c9761 1879 init_read_options (port, &opts);
b1b5433d
MW
1880
1881 c = flush_ws (port, &opts, (char *) NULL);
7337d56d
LC
1882 if (EOF == c)
1883 return SCM_EOF_VAL;
1884 scm_ungetc (c, port);
1885
b1b5433d 1886 return (scm_read_expression (port, &opts));
09a4f039 1887}
db4b4ca6 1888#undef FUNC_NAME
09a4f039 1889
0f2d19dd
JB
1890
1891\f
1892
14de3b42
GH
1893/* Manipulate the read-hash-procedures alist. This could be written in
1894 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1895SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1896 (SCM chr, SCM proc),
dc7fa443
MG
1897 "Install the procedure @var{proc} for reading expressions\n"
1898 "starting with the character sequence @code{#} and @var{chr}.\n"
1899 "@var{proc} will be called with two arguments: the character\n"
1900 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1901 "returned will be the return value of @code{read}. \n"
1902 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1903 )
1bbd0b84 1904#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1905{
fed9c9a2
GH
1906 SCM this;
1907 SCM prev;
1908
36284627 1909 SCM_VALIDATE_CHAR (1, chr);
7888309b 1910 SCM_ASSERT (scm_is_false (proc)
bc36d050 1911 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1912 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1913
14de3b42 1914 /* Check if chr is already in the alist. */
d458073b 1915 this = scm_i_read_hash_procedures_ref ();
14de3b42 1916 prev = SCM_BOOL_F;
fed9c9a2
GH
1917 while (1)
1918 {
d2e53ed6 1919 if (scm_is_null (this))
fed9c9a2
GH
1920 {
1921 /* not found, so add it to the beginning. */
7888309b 1922 if (scm_is_true (proc))
fed9c9a2 1923 {
d458073b
AR
1924 SCM new = scm_cons (scm_cons (chr, proc),
1925 scm_i_read_hash_procedures_ref ());
1926 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1927 }
1928 break;
1929 }
bc36d050 1930 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1931 {
1932 /* already in the alist. */
7888309b 1933 if (scm_is_false (proc))
14de3b42
GH
1934 {
1935 /* remove it. */
7888309b 1936 if (scm_is_false (prev))
14de3b42 1937 {
d458073b
AR
1938 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1939 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1940 }
1941 else
1942 scm_set_cdr_x (prev, SCM_CDR (this));
1943 }
fed9c9a2 1944 else
14de3b42
GH
1945 {
1946 /* replace it. */
1947 scm_set_cdr_x (SCM_CAR (this), proc);
1948 }
fed9c9a2
GH
1949 break;
1950 }
1951 prev = this;
1952 this = SCM_CDR (this);
1953 }
deca31e1 1954
deca31e1
GH
1955 return SCM_UNSPECIFIED;
1956}
1bbd0b84 1957#undef FUNC_NAME
0f2d19dd 1958
deca31e1
GH
1959/* Recover the read-hash procedure corresponding to char c. */
1960static SCM
6e8d25a6 1961scm_get_hash_procedure (int c)
deca31e1 1962{
d458073b 1963 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1964
deca31e1
GH
1965 while (1)
1966 {
d2e53ed6 1967 if (scm_is_null (rest))
deca31e1
GH
1968 return SCM_BOOL_F;
1969
7866a09b 1970 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1971 return SCM_CDAR (rest);
1972
1973 rest = SCM_CDR (rest);
1974 }
1975}
1cc91f1b 1976
889975e5
MG
1977#define SCM_ENCODING_SEARCH_SIZE (500)
1978
f8a1c9a8
LC
1979/* Search the first few hundred characters of a file for an Emacs-like coding
1980 declaration. Returns either NULL or a string whose storage has been
1981 allocated with `scm_gc_malloc ()'. */
889975e5 1982char *
f8a1c9a8 1983scm_i_scan_for_encoding (SCM port)
889975e5 1984{
d900843c 1985 scm_t_port *pt;
889975e5 1986 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1987 size_t bytes_read, encoding_length, i;
889975e5 1988 char *encoding = NULL;
daedbca7 1989 char *pos, *encoding_start;
889975e5
MG
1990 int in_comment;
1991
d900843c 1992 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1993
d900843c
AW
1994 if (pt->rw_active == SCM_PORT_WRITE)
1995 scm_flush (port);
49bb5bd3 1996
d900843c
AW
1997 if (pt->rw_random)
1998 pt->rw_active = SCM_PORT_READ;
1999
2000 if (pt->read_pos == pt->read_end)
2001 {
2002 /* We can use the read buffer, and thus avoid a seek. */
2003 if (scm_fill_input (port) == EOF)
2004 return NULL;
2005
2006 bytes_read = pt->read_end - pt->read_pos;
2007 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2008 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2009
2010 if (bytes_read <= 1)
2011 /* An unbuffered port -- don't scan. */
2012 return NULL;
2013
2014 memcpy (header, pt->read_pos, bytes_read);
2015 header[bytes_read] = '\0';
2016 }
2017 else
2018 {
2019 /* Try to read some bytes and then seek back. Not all ports
2020 support seeking back; and indeed some file ports (like
2021 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2022 check performed by SCM_FPORT_FDES---but fail to seek
2023 backwards. Hence this block comes second. We prefer to use
2024 the read buffer in-place. */
2025 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2026 return NULL;
2027
2028 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2029 header[bytes_read] = '\0';
419c8736 2030 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
d900843c 2031 }
889975e5 2032
889975e5
MG
2033 /* search past "coding[:=]" */
2034 pos = header;
2035 while (1)
2036 {
2037 if ((pos = strstr(pos, "coding")) == NULL)
2038 return NULL;
2039
2040 pos += strlen("coding");
2041 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2042 (*pos == ':' || *pos == '='))
2043 {
2044 pos ++;
2045 break;
2046 }
2047 }
2048
2049 /* skip spaces */
2050 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2051 (*pos == ' ' || *pos == '\t'))
2052 pos ++;
2053
2054 /* grab the next token */
daedbca7 2055 encoding_start = pos;
889975e5 2056 i = 0;
daedbca7
MG
2057 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2058 && encoding_start + i - header < bytes_read
2059 && (isalnum ((int) encoding_start[i])
2060 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
2061 i++;
2062
daedbca7
MG
2063 encoding_length = i;
2064 if (encoding_length == 0)
889975e5
MG
2065 return NULL;
2066
daedbca7
MG
2067 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2068 for (i = 0; i < encoding_length; i++)
026ed239 2069 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
2070
2071 /* push backwards to make sure we were in a comment */
2072 in_comment = 0;
daedbca7
MG
2073 pos = encoding_start;
2074 while (pos >= header)
889975e5 2075 {
8a12aeb9
AW
2076 if (*pos == ';')
2077 {
2078 in_comment = 1;
2079 break;
2080 }
2081 else if (*pos == '\n' || pos == header)
889975e5
MG
2082 {
2083 /* This wasn't in a semicolon comment. Check for a
2084 hash-bang comment. */
2085 char *beg = strstr (header, "#!");
2086 char *end = strstr (header, "!#");
8a12aeb9 2087 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
2088 in_comment = 1;
2089 break;
2090 }
8a12aeb9
AW
2091 else
2092 {
2093 pos --;
2094 continue;
2095 }
889975e5
MG
2096 }
2097 if (!in_comment)
f8a1c9a8
LC
2098 /* This wasn't in a comment */
2099 return NULL;
2100
889975e5
MG
2101 return encoding;
2102}
2103
2104SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2105 (SCM port),
a270e133 2106 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 2107 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
2108 "The coding declaration is of the form\n"
2109 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2110 "\n"
2111 "Returns a string containing the character encoding of the file\n"
2112 "if a declaration was found, or @code{#f} otherwise.\n")
2113#define FUNC_NAME s_scm_file_encoding
2114{
2115 char *enc;
2116 SCM s_enc;
f8a1c9a8 2117
d900843c
AW
2118 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2119
f8a1c9a8 2120 enc = scm_i_scan_for_encoding (port);
889975e5
MG
2121 if (enc == NULL)
2122 return SCM_BOOL_F;
2123 else
2124 {
2125 s_enc = scm_from_locale_string (enc);
889975e5
MG
2126 return s_enc;
2127 }
f8a1c9a8 2128
889975e5
MG
2129 return SCM_BOOL_F;
2130}
2131#undef FUNC_NAME
2132
ea8c9761
MW
2133\f
2134/* Per-port read options.
2135
2136 We store per-port read options in the 'port-read-options' key of the
05d7f762
MW
2137 port's alist, which is stored in the internal port structure. The
2138 value stored in the alist is a single integer that contains a two-bit
2139 field for each read option.
ea8c9761
MW
2140
2141 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2142 the applicable value should be inherited from the corresponding
2143 global read option. Otherwise, the bit field contains the value of
2144 the read option. For boolean read options that have been set
2145 per-port, the possible values are 0 or 1. If the 'keyword_style'
2146 read option has been set per-port, its possible values are those in
2147 'enum t_keyword_style'. */
2148
2149/* Key to read options in per-port alists. */
2150SCM_SYMBOL (sym_port_read_options, "port-read-options");
2151
2152/* Offsets of bit fields for each per-port override */
2153#define READ_OPTION_COPY_SOURCE_P 0
2154#define READ_OPTION_RECORD_POSITIONS_P 2
2155#define READ_OPTION_CASE_INSENSITIVE_P 4
2156#define READ_OPTION_KEYWORD_STYLE 6
2157#define READ_OPTION_R6RS_ESCAPES_P 8
2158#define READ_OPTION_SQUARE_BRACKETS_P 10
2159#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
bf9eb54a 2160#define READ_OPTION_CURLY_INFIX_P 14
ea8c9761 2161
bf9eb54a
MW
2162/* The total width in bits of the per-port overrides */
2163#define READ_OPTIONS_NUM_BITS 16
ea8c9761
MW
2164
2165#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2166#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2167
2168#define READ_OPTION_MASK 3
2169#define READ_OPTION_INHERIT 3
2170
2171static void
2172set_port_read_option (SCM port, int option, int new_value)
2173{
2174 SCM alist, scm_read_options;
2175 unsigned int read_options;
2176
2177 new_value &= READ_OPTION_MASK;
05d7f762 2178 alist = scm_i_port_alist (port);
ea8c9761
MW
2179 scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2180 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2181 read_options = scm_to_uint (scm_read_options);
2182 else
2183 read_options = READ_OPTIONS_INHERIT_ALL;
2184 read_options &= ~(READ_OPTION_MASK << option);
2185 read_options |= new_value << option;
2186 scm_read_options = scm_from_uint (read_options);
2187 alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
05d7f762 2188 scm_i_set_port_alist_x (port, alist);
ea8c9761
MW
2189}
2190
9331ffd8
MW
2191/* Set OPTS and PORT's case-insensitivity according to VALUE. */
2192static void
2193set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2194{
2195 value = !!value;
2196 opts->case_insensitive_p = value;
2197 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2198}
2199
bf9eb54a
MW
2200/* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2201static void
2202set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2203{
2204 value = !!value;
2205 opts->square_brackets_p = value;
2206 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2207}
2208
2209/* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2210static void
2211set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2212{
2213 value = !!value;
2214 opts->curly_infix_p = value;
2215 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2216}
2217
ea8c9761
MW
2218/* Initialize OPTS based on PORT's read options and the global read
2219 options. */
2220static void
2221init_read_options (SCM port, scm_t_read_opts *opts)
2222{
2223 SCM alist, val, scm_read_options;
2224 unsigned int read_options, x;
2225
05d7f762 2226 alist = scm_i_port_alist (port);
ea8c9761 2227 scm_read_options = scm_assq_ref (alist, sym_port_read_options);
ea8c9761
MW
2228
2229 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2230 read_options = scm_to_uint (scm_read_options);
2231 else
2232 read_options = READ_OPTIONS_INHERIT_ALL;
2233
2234 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2235 if (x == READ_OPTION_INHERIT)
2236 {
2237 val = SCM_PACK (SCM_KEYWORD_STYLE);
2238 if (scm_is_eq (val, scm_keyword_prefix))
2239 x = KEYWORD_STYLE_PREFIX;
2240 else if (scm_is_eq (val, scm_keyword_postfix))
2241 x = KEYWORD_STYLE_POSTFIX;
2242 else
2243 x = KEYWORD_STYLE_HASH_PREFIX;
2244 }
2245 opts->keyword_style = x;
2246
2247#define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2248 do \
2249 { \
2250 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2251 if (x == READ_OPTION_INHERIT) \
2252 x = !!SCM_ ## NAME; \
2253 opts->name = x; \
2254 } \
2255 while (0)
2256
2257 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2258 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2259 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2260 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2261 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2262 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
bf9eb54a 2263 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
ea8c9761
MW
2264
2265#undef RESOLVE_BOOLEAN_OPTION
bf9eb54a
MW
2266
2267 opts->neoteric_p = 0;
ea8c9761
MW
2268}
2269
0f2d19dd
JB
2270void
2271scm_init_read ()
0f2d19dd 2272{
d458073b
AR
2273 SCM read_hash_procs;
2274
c81c2ad3 2275 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
2276
2277 scm_i_read_hash_procedures =
2278 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 2279
62560650 2280 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 2281#include "libguile/read.x"
0f2d19dd 2282}
89e00824
ML
2283
2284/*
2285 Local Variables:
2286 c-file-style: "gnu"
2287 End:
2288*/