Merge remote-tracking branch 'origin/stable-2.0'
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37
MW
1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d 28#include <string.h>
889975e5
MG
29#include <unistd.h>
30#include <unicase.h>
684d664e 31#include <unictype.h>
96965a6e 32#include <c-strcase.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
f6f4feb0 46#include "libguile/ports-internal.h"
49bb5bd3 47#include "libguile/fports.h"
a0599745
MD
48#include "libguile/root.h"
49#include "libguile/strings.h"
ba1b2226 50#include "libguile/strports.h"
a0599745 51#include "libguile/vectors.h"
a0599745 52#include "libguile/validate.h"
a4022e69 53#include "libguile/srfi-4.h"
7337d56d 54#include "libguile/srfi-13.h"
ba1b2226 55
a0599745 56#include "libguile/read.h"
22fc179a
HWN
57#include "libguile/private-options.h"
58
0f2d19dd
JB
59
60\f
61
5bf6a6f0 62SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 63SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 64SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 65SCM_SYMBOL (sym_nil, "nil");
c7733771 66
bf9eb54a
MW
67/* SRFI-105 curly infix expression support */
68SCM_SYMBOL (sym_nfx, "$nfx$");
69SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
70SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
71
72scm_t_option scm_read_opts[] =
73 {
74 { SCM_OPTION_BOOLEAN, "copy", 0,
75 "Copy source code expressions." },
76 { SCM_OPTION_BOOLEAN, "positions", 1,
77 "Record positions of source code expressions." },
78 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
79 "Convert symbols to lower case."},
80 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
81 "Style of keyword recognition: #f, 'prefix or 'postfix."},
82 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
83 "Use R6RS variable-length character and string hex escapes."},
84 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
85 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
86 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
87 "In strings, consume leading whitespace after an escaped end-of-line."},
88 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
89 "Support SRFI-105 curly infix expressions."},
90 { 0, },
91 };
b1b5433d
MW
92
93/* Internal read options structure. This is initialized by 'scm_read'
ea8c9761
MW
94 from the global and per-port read options, and a pointer is passed
95 down to all helper functions. */
96
97enum t_keyword_style
98 {
99 KEYWORD_STYLE_HASH_PREFIX,
100 KEYWORD_STYLE_PREFIX,
101 KEYWORD_STYLE_POSTFIX
102 };
b1b5433d 103
ea8c9761
MW
104struct t_read_opts
105{
b1b5433d
MW
106 enum t_keyword_style keyword_style;
107 unsigned int copy_source_p : 1;
108 unsigned int record_positions_p : 1;
109 unsigned int case_insensitive_p : 1;
110 unsigned int r6rs_escapes_p : 1;
111 unsigned int square_brackets_p : 1;
112 unsigned int hungry_eol_escapes_p : 1;
bf9eb54a
MW
113 unsigned int curly_infix_p : 1;
114 unsigned int neoteric_p : 1;
a16f6fe7
MD
115};
116
b1b5433d
MW
117typedef struct t_read_opts scm_t_read_opts;
118
a16f6fe7 119
39e8f371
HWN
120/*
121 Give meaningful error messages for errors
122
123 We use the format
124
ba1b2226 125 FILE:LINE:COL: MESSAGE
39e8f371
HWN
126 This happened in ....
127
128 This is not standard GNU format, but the test-suite likes the real
129 message to be in front.
130
39e8f371
HWN
131 */
132
133
a4022e69
MV
134void
135scm_i_input_error (char const *function,
136 SCM port, const char *message, SCM arg)
ba1b2226 137{
29a837fd
MV
138 SCM fn = (scm_is_string (SCM_FILENAME(port))
139 ? SCM_FILENAME(port)
140 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 141
29a837fd 142 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
143 SCM string = SCM_EOL;
144 scm_simple_format (string_port,
272632a6 145 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 146 scm_list_4 (fn,
b3aa4626 147 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 148 scm_from_int (SCM_COL (port) + 1),
272632a6 149 scm_from_locale_string (message)));
ba1b2226
HWN
150
151 string = scm_get_output_string (string_port);
152 scm_close_output_port (string_port);
4a655e50 153 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 154 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 155 string,
dd72382c 156 arg,
ba1b2226
HWN
157 SCM_BOOL_F);
158}
39e8f371
HWN
159
160
a1ec6916 161SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 162 (SCM setting),
dc7fa443
MG
163 "Option interface for the read options. Instead of using\n"
164 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 165 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 166#define FUNC_NAME s_scm_read_options
a16f6fe7 167{
b7ff98dd
MD
168 SCM ans = scm_options (setting,
169 scm_read_opts,
1bbd0b84 170 FUNC_NAME);
b7ff98dd
MD
171 if (SCM_COPY_SOURCE_P)
172 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
173 return ans;
174}
1bbd0b84 175#undef FUNC_NAME
a16f6fe7 176
d458073b
AR
177/* A fluid referring to an association list mapping extra hash
178 characters to procedures. */
179static SCM *scm_i_read_hash_procedures;
deca31e1 180
cfd15439 181static SCM
d458073b
AR
182scm_i_read_hash_procedures_ref (void)
183{
184 return scm_fluid_ref (*scm_i_read_hash_procedures);
185}
186
cfd15439 187static void
d458073b
AR
188scm_i_read_hash_procedures_set_x (SCM value)
189{
190 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
191}
0f2d19dd 192
7337d56d
LC
193\f
194/* Token readers. */
0f2d19dd 195
0f2d19dd 196
7337d56d
LC
197/* Size of the C buffer used to read symbols and numbers. */
198#define READER_BUFFER_SIZE 128
0f2d19dd 199
ff4d3672
LC
200/* Number of 32-bit codepoints in the buffer used to read strings. */
201#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 202
7337d56d
LC
203/* The maximum size of Scheme character names. */
204#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 205
02327c0c
MW
206/* The maximum size of reader directive names. */
207#define READER_DIRECTIVE_NAME_MAX_SIZE 50
208
94115ae3 209
7337d56d
LC
210/* `isblank' is only in C99. */
211#define CHAR_IS_BLANK_(_chr) \
212 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 213 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
214
215#ifdef MSDOS
216# define CHAR_IS_BLANK(_chr) \
217 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
218#else
219# define CHAR_IS_BLANK CHAR_IS_BLANK_
220#endif
221
222
223/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
224 structure''). */
225#define CHAR_IS_R5RS_DELIMITER(c) \
226 (CHAR_IS_BLANK (c) \
603234c6 227 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
7337d56d 228
603234c6
MW
229#define CHAR_IS_DELIMITER(c) \
230 (CHAR_IS_R5RS_DELIMITER (c) \
bf9eb54a
MW
231 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
232 || opts->curly_infix_p)) \
233 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
7337d56d
LC
234
235/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
236 Structure''. */
237#define CHAR_IS_EXPONENT_MARKER(_chr) \
238 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
239 || ((_chr) == 'd') || ((_chr) == 'l'))
240
454866e0 241/* Read an SCSH block comment. */
cfd15439 242static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965 243static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
b1b5433d
MW
244static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
245static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
620c8965 246static SCM scm_get_hash_procedure (int);
454866e0 247
69f90b0b
MG
248/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
249 result in the pre-allocated buffer BUF. Return zero if the whole token has
250 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
251 bytes actually read. */
cfd15439 252static int
b1b5433d
MW
253read_token (SCM port, scm_t_read_opts *opts,
254 char *buf, size_t buf_size, size_t *read)
b662b7e9 255{
69f90b0b 256 *read = 0;
0520c320 257
69f90b0b
MG
258 while (*read < buf_size)
259 {
260 int chr;
7337d56d 261
0d959103 262 chr = scm_get_byte_or_eof_unlocked (port);
889975e5 263
69f90b0b
MG
264 if (chr == EOF)
265 return 0;
266 else if (CHAR_IS_DELIMITER (chr))
267 {
c932ce0b 268 scm_unget_byte_unlocked (chr, port);
69f90b0b
MG
269 return 0;
270 }
271 else
272 {
273 *buf = (char) chr;
274 buf++, (*read)++;
275 }
276 }
889975e5 277
69f90b0b
MG
278 return 1;
279 }
889975e5 280
b662b7e9
LC
281/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
282 if the token doesn't fit in BUFFER_SIZE bytes. */
283static char *
b1b5433d
MW
284read_complete_token (SCM port, scm_t_read_opts *opts,
285 char *buffer, size_t buffer_size, size_t *read)
69f90b0b
MG
286{
287 int overflow = 0;
b662b7e9
LC
288 size_t bytes_read, overflow_size = 0;
289 char *overflow_buffer = NULL;
1cc91f1b 290
69f90b0b
MG
291 do
292 {
b1b5433d 293 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
69f90b0b
MG
294 if (bytes_read == 0)
295 break;
296 if (overflow || overflow_size != 0)
297 {
298 if (overflow_size == 0)
299 {
b662b7e9
LC
300 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
301 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
302 overflow_size = bytes_read;
303 }
304 else
305 {
7be3c2fc 306 char *new_buf =
b662b7e9
LC
307 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
308
309 memcpy (new_buf, overflow_buffer, overflow_size);
310 memcpy (new_buf + overflow_size, buffer, bytes_read);
311
312 overflow_buffer = new_buf;
69f90b0b
MG
313 overflow_size += bytes_read;
314 }
315 }
889975e5 316 }
69f90b0b 317 while (overflow);
889975e5 318
69f90b0b
MG
319 if (overflow_size)
320 *read = overflow_size;
b521d265 321 else
69f90b0b
MG
322 *read = bytes_read;
323
b662b7e9 324 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 325}
7337d56d
LC
326
327/* Skip whitespace from PORT and return the first non-whitespace character
328 read. Raise an error on end-of-file. */
329static int
b1b5433d 330flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
0f2d19dd 331{
cfd15439 332 scm_t_wchar c;
0f2d19dd 333 while (1)
be632904 334 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
335 {
336 case EOF:
337 goteof:
338 if (eoferr)
d156d3b7 339 {
a4022e69
MV
340 scm_i_input_error (eoferr,
341 port,
342 "end of file",
343 SCM_EOL);
d156d3b7 344 }
0f2d19dd 345 return c;
7337d56d 346
0f2d19dd
JB
347 case ';':
348 lp:
be632904 349 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
350 {
351 case EOF:
352 goto goteof;
353 default:
354 goto lp;
355 case SCM_LINE_INCREMENTORS:
356 break;
357 }
358 break;
7337d56d 359
454866e0 360 case '#':
be632904 361 switch (c = scm_getc_unlocked (port))
454866e0
LC
362 {
363 case EOF:
364 eoferr = "read_sharp";
365 goto goteof;
366 case '!':
b1b5433d 367 scm_read_shebang (c, port, opts);
454866e0 368 break;
34f3d47d 369 case ';':
b1b5433d 370 scm_read_commented_expression (c, port, opts);
34f3d47d 371 break;
620c8965
LC
372 case '|':
373 if (scm_is_false (scm_get_hash_procedure (c)))
374 {
375 scm_read_r6rs_block_comment (c, port);
376 break;
377 }
378 /* fall through */
454866e0 379 default:
c932ce0b 380 scm_ungetc_unlocked (c, port);
454866e0
LC
381 return '#';
382 }
383 break;
384
0f2d19dd 385 case SCM_LINE_INCREMENTORS:
0f2d19dd 386 case SCM_SINGLE_SPACES:
0f2d19dd 387 case '\t':
0f2d19dd 388 break;
7337d56d 389
0f2d19dd
JB
390 default:
391 return c;
392 }
7337d56d
LC
393
394 return 0;
0f2d19dd
JB
395}
396
397
7337d56d
LC
398\f
399/* Token readers. */
1cc91f1b 400
b1b5433d
MW
401static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
402static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
403 long line, int column);
0f2d19dd 404
0f2d19dd 405
d6cb0203 406static SCM
b1b5433d
MW
407maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
408 long line, int column)
d6cb0203 409{
b1b5433d 410 if (opts->record_positions_p)
d6cb0203
MW
411 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
412 return x;
413}
0f2d19dd 414
09a4f039 415static SCM
b1b5433d 416scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 417#define FUNC_NAME "scm_i_lreadparen"
09a4f039 418{
26c8cc14
AW
419 int c;
420 SCM tmp, tl, ans = SCM_EOL;
bf9eb54a
MW
421 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
422 const int terminating_char = ((chr == '{') ? '}'
423 : ((chr == '[') ? ']'
424 : ')'));
7337d56d
LC
425
426 /* Need to capture line and column numbers here. */
427 long line = SCM_LINUM (port);
428 int column = SCM_COL (port) - 1;
f9c68a47 429
b1b5433d 430 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
431 if (terminating_char == c)
432 return SCM_EOL;
f9c68a47 433
c932ce0b 434 scm_ungetc_unlocked (c, port);
b1b5433d 435 tmp = scm_read_expression (port, opts);
1f7945a7
AW
436
437 /* Note that it is possible for scm_read_expression to return
438 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
439 check that it's a real dot by checking `c'. */
440 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d 441 {
b1b5433d
MW
442 ans = scm_read_expression (port, opts);
443 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
7337d56d
LC
444 scm_i_input_error (FUNC_NAME, port, "missing close paren",
445 SCM_EOL);
446 return ans;
447 }
1cc91f1b 448
7337d56d
LC
449 /* Build the head of the list structure. */
450 ans = tl = scm_cons (tmp, SCM_EOL);
451
b1b5433d 452 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
0f2d19dd 453 {
7337d56d 454 SCM new_tail;
0f2d19dd 455
bf9eb54a
MW
456 if (c == ')' || (c == ']' && opts->square_brackets_p)
457 || ((c == '}' || c == ']') && opts->curly_infix_p))
5b69315e
AW
458 scm_i_input_error (FUNC_NAME, port,
459 "in pair: mismatched close paren: ~A",
460 scm_list_1 (SCM_MAKE_CHAR (c)));
461
c932ce0b 462 scm_ungetc_unlocked (c, port);
b1b5433d 463 tmp = scm_read_expression (port, opts);
5b69315e 464
1f7945a7
AW
465 /* See above note about scm_sym_dot. */
466 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 467 {
b1b5433d 468 SCM_SETCDR (tl, scm_read_expression (port, opts));
7337d56d 469
b1b5433d 470 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
471 if (terminating_char != c)
472 scm_i_input_error (FUNC_NAME, port,
473 "in pair: missing close paren", SCM_EOL);
bf9eb54a 474 break;
0f2d19dd 475 }
b858464a 476
7337d56d
LC
477 new_tail = scm_cons (tmp, SCM_EOL);
478 SCM_SETCDR (tl, new_tail);
479 tl = new_tail;
7337d56d 480 }
0f2d19dd 481
bf9eb54a
MW
482 if (curly_list_p)
483 {
484 /* In addition to finding the length, 'scm_ilength' checks for
485 improper or circular lists, in which case it returns -1. */
486 int len = scm_ilength (ans);
487
488 /* The (len == 0) case is handled above */
489 if (len == 1)
490 /* Return directly to avoid re-annotating the element's source
491 location with the position of the outer brace. Also, it
492 might not be possible to annotate the element. */
493 return scm_car (ans); /* {e} => e */
494 else if (len == 2)
495 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
496 else if (len >= 3 && (len & 1))
497 {
498 /* It's a proper list whose length is odd and at least 3. If
499 the elements at odd indices (the infix operator positions)
500 are all 'equal?', then it's a simple curly-infix list.
501 Otherwise it's a mixed curly-infix list. */
502 SCM op = scm_cadr (ans);
503
504 /* Check to see if the elements at odd indices are 'equal?' */
505 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
506 {
507 if (scm_is_null (tl))
508 {
509 /* Convert simple curly-infix list to prefix:
510 {a <op> b <op> ...} => (<op> a b ...) */
511 tl = ans;
512 while (scm_is_pair (scm_cdr (tl)))
513 {
514 tmp = scm_cddr (tl);
515 SCM_SETCDR (tl, tmp);
516 tl = tmp;
517 }
518 ans = scm_cons (op, ans);
519 break;
520 }
521 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
522 {
523 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
524 ans = scm_cons (sym_nfx, ans);
525 break;
526 }
527 }
528 }
529 else
530 /* Mixed curly-infix (possibly improper) list:
531 {e . tail} => ($nfx$ e . tail) */
532 ans = scm_cons (sym_nfx, ans);
533 }
534
b1b5433d 535 return maybe_annotate_source (ans, port, opts, line, column);
7337d56d
LC
536}
537#undef FUNC_NAME
a4022e69 538
c5661d28
MG
539
540/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
541 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
542 found. */
543#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
544 do \
545 { \
546 scm_t_wchar a; \
547 size_t i = 0; \
548 c = 0; \
549 while (i < ndigits) \
550 { \
be632904 551 a = scm_getc_unlocked (port); \
dea901d6
MG
552 if (a == EOF) \
553 goto str_eof; \
554 if (terminator \
555 && (a == (scm_t_wchar) terminator) \
556 && (i > 0)) \
557 break; \
558 if ('0' <= a && a <= '9') \
559 a -= '0'; \
560 else if ('A' <= a && a <= 'F') \
561 a = a - 'A' + 10; \
562 else if ('a' <= a && a <= 'f') \
563 a = a - 'a' + 10; \
564 else \
565 { \
566 c = a; \
567 goto bad_escaped; \
568 } \
569 c = c * 16 + a; \
570 i ++; \
571 } \
c5661d28
MG
572 } while (0)
573
684d664e
AW
574static void
575skip_intraline_whitespace (SCM port)
576{
577 scm_t_wchar c;
578
579 do
580 {
be632904 581 c = scm_getc_unlocked (port);
684d664e
AW
582 if (c == EOF)
583 return;
584 }
585 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
586
c932ce0b 587 scm_ungetc_unlocked (c, port);
684d664e
AW
588}
589
7337d56d 590static SCM
b1b5433d 591scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
592#define FUNC_NAME "scm_lreadr"
593{
594 /* For strings smaller than C_STR, this function creates only one Scheme
595 object (the string returned). */
0f2d19dd 596
ff4d3672
LC
597 SCM str = SCM_EOL;
598 size_t c_str_len = 0;
599 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 600
b131b233
MW
601 /* Need to capture line and column numbers here. */
602 long line = SCM_LINUM (port);
603 int column = SCM_COL (port) - 1;
604
be632904 605 while ('"' != (c = scm_getc_unlocked (port)))
7337d56d
LC
606 {
607 if (c == EOF)
9c44cd45
MG
608 {
609 str_eof:
610 scm_i_input_error (FUNC_NAME, port,
611 "end of file in string constant", SCM_EOL);
612 }
0f2d19dd 613
ff4d3672
LC
614 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
615 {
616 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
617 c_str_len = 0;
618 }
0f2d19dd 619
7337d56d 620 if (c == '\\')
9c44cd45 621 {
be632904 622 switch (c = scm_getc_unlocked (port))
9c44cd45
MG
623 {
624 case EOF:
625 goto str_eof;
626 case '"':
627 case '\\':
628 break;
9c44cd45 629 case '\n':
b1b5433d 630 if (opts->hungry_eol_escapes_p)
684d664e 631 skip_intraline_whitespace (port);
9c44cd45
MG
632 continue;
633 case '0':
634 c = '\0';
635 break;
636 case 'f':
637 c = '\f';
638 break;
639 case 'n':
640 c = '\n';
641 break;
642 case 'r':
643 c = '\r';
644 break;
645 case 't':
646 c = '\t';
647 break;
648 case 'a':
649 c = '\007';
650 break;
651 case 'v':
652 c = '\v';
653 break;
67a4a16d
MG
654 case 'b':
655 c = '\010';
656 break;
9c44cd45 657 case 'x':
b1b5433d 658 if (opts->r6rs_escapes_p)
dea901d6
MG
659 SCM_READ_HEX_ESCAPE (10, ';');
660 else
661 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 662 break;
9c44cd45 663 case 'u':
b1b5433d 664 if (!opts->r6rs_escapes_p)
898a0b5a
MG
665 {
666 SCM_READ_HEX_ESCAPE (4, '\0');
667 break;
668 }
9c44cd45 669 case 'U':
b1b5433d 670 if (!opts->r6rs_escapes_p)
898a0b5a
MG
671 {
672 SCM_READ_HEX_ESCAPE (6, '\0');
673 break;
674 }
9c44cd45
MG
675 default:
676 bad_escaped:
677 scm_i_input_error (FUNC_NAME, port,
678 "illegal character in escape sequence: ~S",
679 scm_list_1 (SCM_MAKE_CHAR (c)));
680 }
681 }
ff4d3672
LC
682
683 c_str[c_str_len++] = c;
7337d56d 684 }
ff4d3672
LC
685
686 if (scm_is_null (str))
687 /* Fast path: we got a string that fits in C_STR. */
688 str = scm_from_utf32_stringn (c_str, c_str_len);
689 else
690 {
691 if (c_str_len > 0)
692 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
693
694 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
695 }
696
b1b5433d 697 return maybe_annotate_source (str, port, opts, line, column);
0f2d19dd 698}
db4b4ca6
DH
699#undef FUNC_NAME
700
0f2d19dd 701
7337d56d 702static SCM
b1b5433d 703scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
0f2d19dd 704{
69f90b0b 705 SCM result, str = SCM_EOL;
b662b7e9 706 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 707 size_t bytes_read;
0f2d19dd 708
38f19074
MW
709 /* Need to capture line and column numbers here. */
710 long line = SCM_LINUM (port);
711 int column = SCM_COL (port) - 1;
712
c932ce0b 713 scm_ungetc_unlocked (chr, port);
b1b5433d 714 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 715 &bytes_read);
69f90b0b 716
08467a7e 717 str = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b
MG
718
719 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 720 if (scm_is_false (result))
69f90b0b
MG
721 {
722 /* Return a symbol instead of a number */
b1b5433d 723 if (opts->case_insensitive_p)
69f90b0b
MG
724 str = scm_string_downcase_x (str);
725 result = scm_string_to_symbol (str);
726 }
38f19074 727 else if (SCM_NIMP (result))
b1b5433d 728 result = maybe_annotate_source (result, port, opts, line, column);
0f2d19dd 729
69f90b0b 730 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
731 return result;
732}
0f2d19dd 733
7337d56d 734static SCM
b1b5433d 735scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 736{
889975e5
MG
737 SCM result;
738 int ends_with_colon = 0;
69f90b0b 739 size_t bytes_read;
b1b5433d 740 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
b662b7e9 741 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 742 SCM str;
7337d56d 743
c932ce0b 744 scm_ungetc_unlocked (chr, port);
b1b5433d 745 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 746 &bytes_read);
69f90b0b 747 if (bytes_read > 0)
b662b7e9 748 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 749
69f90b0b
MG
750 if (postfix && ends_with_colon && (bytes_read > 1))
751 {
08467a7e 752 str = scm_from_port_stringn (buffer, bytes_read - 1, port);
69f90b0b 753
b1b5433d 754 if (opts->case_insensitive_p)
69f90b0b
MG
755 str = scm_string_downcase_x (str);
756 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
757 }
7337d56d 758 else
69f90b0b 759 {
08467a7e 760 str = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b 761
b1b5433d 762 if (opts->case_insensitive_p)
69f90b0b
MG
763 str = scm_string_downcase_x (str);
764 result = scm_string_to_symbol (str);
765 }
7337d56d 766
69f90b0b 767 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
768 return result;
769}
770
771static SCM
b1b5433d 772scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
773#define FUNC_NAME "scm_lreadr"
774{
889975e5 775 SCM result;
7337d56d 776 size_t read;
b662b7e9 777 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 778 unsigned int radix;
69f90b0b 779 SCM str;
7337d56d
LC
780
781 switch (chr)
782 {
783 case 'B':
784 case 'b':
785 radix = 2;
786 break;
787
788 case 'o':
789 case 'O':
790 radix = 8;
791 break;
792
793 case 'd':
794 case 'D':
795 radix = 10;
796 break;
797
798 case 'x':
799 case 'X':
800 radix = 16;
801 break;
802
803 default:
c932ce0b
AW
804 scm_ungetc_unlocked (chr, port);
805 scm_ungetc_unlocked ('#', port);
7337d56d
LC
806 radix = 10;
807 }
808
b1b5433d 809 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 810 &read);
69f90b0b 811
08467a7e 812 str = scm_from_port_stringn (buffer, read, port);
69f90b0b
MG
813
814 result = scm_string_to_number (str, scm_from_uint (radix));
815
69f90b0b 816 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
817
818 if (scm_is_true (result))
819 return result;
820
821 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
822
823 return SCM_BOOL_F;
824}
825#undef FUNC_NAME
826
827static SCM
b1b5433d 828scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
829{
830 SCM p;
492faee1
LC
831 long line = SCM_LINUM (port);
832 int column = SCM_COL (port) - 1;
7337d56d
LC
833
834 switch (chr)
835 {
836 case '`':
837 p = scm_sym_quasiquote;
838 break;
839
840 case '\'':
841 p = scm_sym_quote;
842 break;
843
844 case ',':
845 {
889975e5 846 scm_t_wchar c;
7337d56d 847
be632904 848 c = scm_getc_unlocked (port);
7337d56d
LC
849 if ('@' == c)
850 p = scm_sym_uq_splicing;
851 else
0f2d19dd 852 {
c932ce0b 853 scm_ungetc_unlocked (c, port);
7337d56d 854 p = scm_sym_unquote;
0f2d19dd 855 }
7337d56d
LC
856 break;
857 }
0f2d19dd 858
7337d56d
LC
859 default:
860 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 861 "scm_read_quote", chr);
7337d56d 862 abort ();
0f2d19dd 863 }
1cc91f1b 864
b1b5433d
MW
865 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
866 return maybe_annotate_source (p, port, opts, line, column);
7337d56d
LC
867}
868
34f3d47d
AW
869SCM_SYMBOL (sym_syntax, "syntax");
870SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
871SCM_SYMBOL (sym_unsyntax, "unsyntax");
872SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
873
874static SCM
b1b5433d 875scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
34f3d47d
AW
876{
877 SCM p;
878 long line = SCM_LINUM (port);
879 int column = SCM_COL (port) - 1;
880
881 switch (chr)
882 {
883 case '`':
884 p = sym_quasisyntax;
885 break;
886
887 case '\'':
888 p = sym_syntax;
889 break;
890
891 case ',':
892 {
893 int c;
894
be632904 895 c = scm_getc_unlocked (port);
34f3d47d
AW
896 if ('@' == c)
897 p = sym_unsyntax_splicing;
898 else
899 {
c932ce0b 900 scm_ungetc_unlocked (c, port);
34f3d47d
AW
901 p = sym_unsyntax;
902 }
903 break;
904 }
905
906 default:
907 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
908 "scm_read_syntax", chr);
909 abort ();
910 }
911
b1b5433d
MW
912 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
913 return maybe_annotate_source (p, port, opts, line, column);
34f3d47d
AW
914}
915
cfd15439 916static SCM
b1b5433d 917scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
7c4aad9c 918{
b1b5433d 919 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
7c4aad9c
AW
920
921 if (!scm_is_eq (id, sym_nil))
922 scm_i_input_error ("scm_read_nil", port,
923 "unexpected input while reading #nil: ~a",
924 scm_list_1 (id));
925
926 return SCM_ELISP_NIL;
927}
928
cfd15439 929static SCM
7337d56d 930scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 931{
0f2d19dd
JB
932 int c;
933
889975e5
MG
934 /* We use the get_byte here because there is no need to get the
935 locale correct with comment input. This presumes that newline
936 always represents itself no matter what the encoding is. */
0d959103 937 for (c = scm_get_byte_or_eof_unlocked (port);
7337d56d 938 (c != EOF) && (c != '\n');
0d959103 939 c = scm_get_byte_or_eof_unlocked (port));
7337d56d
LC
940
941 return SCM_UNSPECIFIED;
942}
943
944\f
945/* Sharp readers, i.e. readers called after a `#' sign has been read. */
946
947static SCM
948scm_read_boolean (int chr, SCM port)
949{
950 switch (chr)
0f2d19dd 951 {
7337d56d
LC
952 case 't':
953 case 'T':
954 return SCM_BOOL_T;
955
956 case 'f':
957 case 'F':
958 return SCM_BOOL_F;
0f2d19dd 959 }
7337d56d
LC
960
961 return SCM_UNSPECIFIED;
962}
963
964static SCM
b1b5433d 965scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
966#define FUNC_NAME "scm_lreadr"
967{
69f90b0b
MG
968 char buffer[READER_CHAR_NAME_MAX_SIZE];
969 SCM charname;
970 size_t charname_len, bytes_read;
889975e5
MG
971 scm_t_wchar cp;
972 int overflow;
f6f4feb0 973 scm_t_port_internal *pti;
889975e5 974
b1b5433d
MW
975 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
976 &bytes_read);
889975e5 977 if (overflow)
c1a0ba1c 978 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 979
69f90b0b 980 if (bytes_read == 0)
0f2d19dd 981 {
be632904 982 chr = scm_getc_unlocked (port);
7337d56d
LC
983 if (chr == EOF)
984 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
985 "while reading character", SCM_EOL);
986
987 /* CHR must be a token delimiter, like a whitespace. */
988 return (SCM_MAKE_CHAR (chr));
0f2d19dd 989 }
7337d56d 990
f6f4feb0 991 pti = SCM_PORT_GET_INTERNAL (port);
7337d56d 992
69f90b0b
MG
993 /* Simple ASCII characters can be processed immediately. Also, simple
994 ISO-8859-1 characters can be processed immediately if the encoding for this
995 port is ISO-8859-1. */
d1b9f8ac
AW
996 if (bytes_read == 1 &&
997 ((unsigned char) buffer[0] <= 127
f6f4feb0 998 || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
69f90b0b
MG
999 {
1000 SCM_COL (port) += 1;
1001 return SCM_MAKE_CHAR (buffer[0]);
1002 }
1003
1004 /* Otherwise, convert the buffer into a proper scheme string for
1005 processing. */
08467a7e 1006 charname = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b
MG
1007 charname_len = scm_i_string_length (charname);
1008 SCM_COL (port) += charname_len;
889975e5 1009 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
1010 if (charname_len == 1)
1011 return SCM_MAKE_CHAR (cp);
1012
1013 /* Ignore dotted circles, which may be used to keep combining characters from
1014 combining with the backslash in #\charname. */
0dcd7e61
MG
1015 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1016 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1017
889975e5 1018 if (cp >= '0' && cp < '8')
7337d56d
LC
1019 {
1020 /* Dirk:FIXME:: This type of character syntax is not R5RS
1021 * compliant. Further, it should be verified that the constant
0ffc78e3 1022 * does only consist of octal digits. */
889975e5 1023 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 1024 if (SCM_I_INUMP (p))
0ffc78e3 1025 {
e25f3727 1026 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
1027 if (SCM_IS_UNICODE_CHAR (c))
1028 return SCM_MAKE_CHAR (c);
1029 else
0f3a70cf 1030 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
1031 "out-of-range octal character escape: ~a",
1032 scm_list_1 (charname));
1033 }
7337d56d
LC
1034 }
1035
0f3a70cf 1036 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1037 {
1038 SCM p;
0f3a70cf 1039
dea901d6
MG
1040 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1041 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1042 scm_from_uint (16));
1043 if (SCM_I_INUMP (p))
1044 {
e25f3727 1045 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1046 if (SCM_IS_UNICODE_CHAR (c))
1047 return SCM_MAKE_CHAR (c);
1048 else
1049 scm_i_input_error (FUNC_NAME, port,
1050 "out-of-range hex character escape: ~a",
1051 scm_list_1 (charname));
1052 }
1053 }
1054
889975e5
MG
1055 /* The names of characters should never have non-Latin1
1056 characters. */
1057 if (scm_i_is_narrow_string (charname)
1058 || scm_i_try_narrow_string (charname))
4769c9db
AW
1059 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1060 charname_len);
1061 if (scm_is_true (ch))
1062 return ch;
1063 }
7337d56d 1064
7337d56d 1065 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1066 scm_list_1 (charname));
7337d56d
LC
1067
1068 return SCM_UNSPECIFIED;
0f2d19dd 1069}
db4b4ca6 1070#undef FUNC_NAME
0f2d19dd 1071
cfd15439 1072static SCM
b1b5433d 1073scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1074{
1075 SCM symbol;
1076
1077 /* Read the symbol that comprises the keyword. Doing this instead of
1078 invoking a specific symbol reader function allows `scm_read_keyword ()'
1079 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1080
7337d56d 1081 XXX: This implementation allows sloppy syntaxes like `#: key'. */
b1b5433d 1082 symbol = scm_read_expression (port, opts);
7337d56d 1083 if (!scm_is_symbol (symbol))
7f74cf9a 1084 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1085 "keyword prefix `~a' not followed by a symbol: ~s",
1086 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1087
1088 return (scm_symbol_to_keyword (symbol));
1089}
1090
cfd15439 1091static SCM
b1b5433d
MW
1092scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1093 long line, int column)
09a4f039 1094{
7337d56d
LC
1095 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1096 guarantee that it's going to do what we want. After all, this is an
1097 implementation detail of `scm_read_vector ()', not a desirable
1098 property. */
b1b5433d
MW
1099 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1100 port, opts, line, column);
b131b233
MW
1101}
1102
493ceb99
MW
1103/* Helper used by scm_read_array */
1104static int
1105read_decimal_integer (SCM port, int c, ssize_t *resp)
1106{
1107 ssize_t sign = 1;
1108 ssize_t res = 0;
1109 int got_it = 0;
1110
1111 if (c == '-')
1112 {
1113 sign = -1;
fa980bcc 1114 c = scm_getc_unlocked (port);
493ceb99
MW
1115 }
1116
1117 while ('0' <= c && c <= '9')
1118 {
1119 res = 10*res + c-'0';
1120 got_it = 1;
fa980bcc 1121 c = scm_getc_unlocked (port);
493ceb99
MW
1122 }
1123
1124 if (got_it)
1125 *resp = sign * res;
1126 return c;
7337d56d 1127}
09a4f039 1128
493ceb99
MW
1129/* Read an array. This function can also read vectors and uniform
1130 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1131 handled here.
1132
10744b7c 1133 C is the first character read after the '#'. */
b131b233 1134static SCM
b1b5433d 1135scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
7337d56d 1136{
493ceb99
MW
1137 ssize_t rank;
1138 scm_t_wchar tag_buf[8];
1139 int tag_len;
1140
1141 SCM tag, shape = SCM_BOOL_F, elements, array;
1142
1143 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1144 the array code can not deal with zero-length dimensions yet, and
10744b7c 1145 we want to allow zero-length vectors, of course. */
493ceb99 1146 if (c == '(')
b1b5433d 1147 return scm_read_vector (c, port, opts, line, column);
493ceb99 1148
10744b7c 1149 /* Disambiguate between '#f' and uniform floating point vectors. */
493ceb99
MW
1150 if (c == 'f')
1151 {
fa980bcc 1152 c = scm_getc_unlocked (port);
493ceb99
MW
1153 if (c != '3' && c != '6')
1154 {
1155 if (c != EOF)
fa980bcc 1156 scm_ungetc_unlocked (c, port);
493ceb99
MW
1157 return SCM_BOOL_F;
1158 }
1159 rank = 1;
1160 tag_buf[0] = 'f';
1161 tag_len = 1;
1162 goto continue_reading_tag;
1163 }
1164
1165 /* Read rank. */
1166 rank = 1;
1167 c = read_decimal_integer (port, c, &rank);
1168 if (rank < 0)
1169 scm_i_input_error (NULL, port, "array rank must be non-negative",
1170 SCM_EOL);
1171
1172 /* Read tag. */
1173 tag_len = 0;
1174 continue_reading_tag:
1175 while (c != EOF && c != '(' && c != '@' && c != ':'
1176 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1177 {
1178 tag_buf[tag_len++] = c;
fa980bcc 1179 c = scm_getc_unlocked (port);
493ceb99
MW
1180 }
1181 if (tag_len == 0)
1182 tag = SCM_BOOL_T;
b131b233 1183 else
493ceb99
MW
1184 {
1185 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1186 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1187 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1188 scm_list_1 (tag));
1189 }
1190
1191 /* Read shape. */
1192 if (c == '@' || c == ':')
1193 {
1194 shape = SCM_EOL;
1195
1196 do
1197 {
1198 ssize_t lbnd = 0, len = 0;
1199 SCM s;
1200
1201 if (c == '@')
1202 {
fa980bcc 1203 c = scm_getc_unlocked (port);
493ceb99
MW
1204 c = read_decimal_integer (port, c, &lbnd);
1205 }
1206
1207 s = scm_from_ssize_t (lbnd);
1208
1209 if (c == ':')
1210 {
fa980bcc 1211 c = scm_getc_unlocked (port);
493ceb99
MW
1212 c = read_decimal_integer (port, c, &len);
1213 if (len < 0)
1214 scm_i_input_error (NULL, port,
1215 "array length must be non-negative",
1216 SCM_EOL);
1217
1218 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1219 }
1220
1221 shape = scm_cons (s, shape);
1222 } while (c == '@' || c == ':');
1223
1224 shape = scm_reverse_x (shape, SCM_EOL);
1225 }
1226
1227 /* Read nested lists of elements. */
1228 if (c != '(')
1229 scm_i_input_error (NULL, port,
1230 "missing '(' in vector or array literal",
1231 SCM_EOL);
b1b5433d 1232 elements = scm_read_sexp (c, port, opts);
493ceb99
MW
1233
1234 if (scm_is_false (shape))
1235 shape = scm_from_ssize_t (rank);
1236 else if (scm_ilength (shape) != rank)
1237 scm_i_input_error
1238 (NULL, port,
1239 "the number of shape specifications must match the array rank",
1240 SCM_EOL);
1241
1242 /* Handle special print syntax of rank zero arrays; see
1243 scm_i_print_array for a rationale. */
1244 if (rank == 0)
1245 {
1246 if (!scm_is_pair (elements))
1247 scm_i_input_error (NULL, port,
1248 "too few elements in array literal, need 1",
1249 SCM_EOL);
1250 if (!scm_is_null (SCM_CDR (elements)))
1251 scm_i_input_error (NULL, port,
1252 "too many elements in array literal, want 1",
1253 SCM_EOL);
1254 elements = SCM_CAR (elements);
1255 }
1256
1257 /* Construct array, annotate with source location, and return. */
1258 array = scm_list_to_typed_array (tag, shape, elements);
b1b5433d 1259 return maybe_annotate_source (array, port, opts, line, column);
7337d56d
LC
1260}
1261
0ba0b384 1262static SCM
b1b5433d
MW
1263scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1264 long line, int column)
7337d56d 1265{
b1b5433d 1266 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1267}
1268
0ba0b384 1269static SCM
b1b5433d
MW
1270scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1271 long line, int column)
0ba0b384 1272{
be632904 1273 chr = scm_getc_unlocked (port);
0ba0b384
LC
1274 if (chr != 'u')
1275 goto syntax;
1276
be632904 1277 chr = scm_getc_unlocked (port);
0ba0b384
LC
1278 if (chr != '8')
1279 goto syntax;
1280
be632904 1281 chr = scm_getc_unlocked (port);
0ba0b384
LC
1282 if (chr != '(')
1283 goto syntax;
1284
b131b233 1285 return maybe_annotate_source
b1b5433d
MW
1286 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1287 port, opts, line, column);
0ba0b384
LC
1288
1289 syntax:
1290 scm_i_input_error ("read_bytevector", port,
1291 "invalid bytevector prefix",
1292 SCM_MAKE_CHAR (chr));
1293 return SCM_UNSPECIFIED;
1294}
1295
7337d56d 1296static SCM
b1b5433d
MW
1297scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1298 long line, int column)
7337d56d
LC
1299{
1300 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1301 terribly inefficient but who cares? */
1302 SCM s_bits = SCM_EOL;
1303
be632904 1304 for (chr = scm_getc_unlocked (port);
7337d56d 1305 (chr != EOF) && ((chr == '0') || (chr == '1'));
be632904 1306 chr = scm_getc_unlocked (port))
09a4f039 1307 {
7337d56d 1308 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1309 }
7337d56d
LC
1310
1311 if (chr != EOF)
c932ce0b 1312 scm_ungetc_unlocked (chr, port);
7337d56d 1313
b131b233
MW
1314 return maybe_annotate_source
1315 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
b1b5433d 1316 port, opts, line, column);
7337d56d
LC
1317}
1318
cfd15439 1319static SCM
889975e5 1320scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1321{
1322 int bang_seen = 0;
1323
1324 for (;;)
09a4f039 1325 {
be632904 1326 int c = scm_getc_unlocked (port);
62850ef3 1327
7337d56d
LC
1328 if (c == EOF)
1329 scm_i_input_error ("skip_block_comment", port,
1330 "unterminated `#! ... !#' comment", SCM_EOL);
1331
1332 if (c == '!')
1333 bang_seen = 1;
1334 else if (c == '#' && bang_seen)
1335 break;
1336 else
1337 bang_seen = 0;
1338 }
1339
1340 return SCM_UNSPECIFIED;
1341}
1342
9331ffd8
MW
1343static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1344 int value);
bf9eb54a
MW
1345static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1346 int value);
1347static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1348 int value);
9331ffd8 1349
d7fcaec3 1350static SCM
b1b5433d 1351scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
911b03b2 1352{
02327c0c
MW
1353 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1354 int c;
1355 int i = 0;
1356
1357 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
911b03b2 1358 {
fa980bcc 1359 c = scm_getc_unlocked (port);
02327c0c
MW
1360 if (c == EOF)
1361 scm_i_input_error ("skip_block_comment", port,
1362 "unterminated `#! ... !#' comment", SCM_EOL);
1363 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1364 name[i++] = c;
1365 else if (CHAR_IS_DELIMITER (c))
1366 {
fa980bcc 1367 scm_ungetc_unlocked (c, port);
02327c0c
MW
1368 name[i] = '\0';
1369 if (0 == strcmp ("r6rs", name))
1370 ; /* Silently ignore */
9331ffd8
MW
1371 else if (0 == strcmp ("fold-case", name))
1372 set_port_case_insensitive_p (port, opts, 1);
1373 else if (0 == strcmp ("no-fold-case", name))
1374 set_port_case_insensitive_p (port, opts, 0);
bf9eb54a
MW
1375 else if (0 == strcmp ("curly-infix", name))
1376 set_port_curly_infix_p (port, opts, 1);
1377 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1378 {
1379 set_port_curly_infix_p (port, opts, 1);
1380 set_port_square_brackets_p (port, opts, 0);
1381 }
02327c0c
MW
1382 else
1383 break;
1384
1385 return SCM_UNSPECIFIED;
1386 }
fa746547
MW
1387 else
1388 {
fa980bcc 1389 scm_ungetc_unlocked (c, port);
fa746547
MW
1390 break;
1391 }
911b03b2 1392 }
02327c0c 1393 while (i > 0)
fa980bcc 1394 scm_ungetc_unlocked (name[--i], port);
02327c0c 1395 return scm_read_scsh_block_comment (chr, port);
911b03b2
JG
1396}
1397
620c8965
LC
1398static SCM
1399scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1400{
1401 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1402 nested. So care must be taken. */
1403 int nesting_level = 1;
6d5f8c32 1404
be632904 1405 int a = scm_getc_unlocked (port);
6d5f8c32
AW
1406
1407 if (a == EOF)
1408 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1409 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1410
1411 while (nesting_level > 0)
1412 {
be632904 1413 int b = scm_getc_unlocked (port);
620c8965 1414
6d5f8c32 1415 if (b == EOF)
cd169c5a 1416 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1417 "unterminated `#| ... |#' comment", SCM_EOL);
1418
6d5f8c32
AW
1419 if (a == '|' && b == '#')
1420 {
1421 nesting_level--;
1422 b = EOF;
1423 }
1424 else if (a == '#' && b == '|')
1425 {
1426 nesting_level++;
1427 b = EOF;
1428 }
1429
1430 a = b;
620c8965
LC
1431 }
1432
1433 return SCM_UNSPECIFIED;
1434}
1435
34f3d47d 1436static SCM
b1b5433d
MW
1437scm_read_commented_expression (scm_t_wchar chr, SCM port,
1438 scm_t_read_opts *opts)
34f3d47d 1439{
889975e5 1440 scm_t_wchar c;
34f3d47d 1441
b1b5433d 1442 c = flush_ws (port, opts, (char *) NULL);
34f3d47d
AW
1443 if (EOF == c)
1444 scm_i_input_error ("read_commented_expression", port,
1445 "no expression after #; comment", SCM_EOL);
c932ce0b 1446 scm_ungetc_unlocked (c, port);
b1b5433d 1447 scm_read_expression (port, opts);
34f3d47d
AW
1448 return SCM_UNSPECIFIED;
1449}
1450
7337d56d 1451static SCM
889975e5 1452scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1453{
1454 /* Guile's extended symbol read syntax looks like this:
1455
1456 #{This is all a symbol name}#
1457
1458 So here, CHR is expected to be `{'. */
d9527cfa 1459 int saw_brace = 0;
7337d56d 1460 size_t len = 0;
190d4b0d 1461 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1462
889975e5 1463 buf = scm_i_string_start_writing (buf);
7337d56d 1464
be632904 1465 while ((chr = scm_getc_unlocked (port)) != EOF)
7337d56d
LC
1466 {
1467 if (saw_brace)
09a4f039 1468 {
7337d56d
LC
1469 if (chr == '#')
1470 {
7337d56d
LC
1471 break;
1472 }
1473 else
1474 {
1475 saw_brace = 0;
889975e5 1476 scm_i_string_set_x (buf, len++, '}');
7337d56d 1477 }
09a4f039 1478 }
d9527cfa
AW
1479
1480 if (chr == '}')
7337d56d 1481 saw_brace = 1;
d9527cfa
AW
1482 else if (chr == '\\')
1483 {
1484 /* It used to be that print.c would print extended-read-syntax
1485 symbols with backslashes before "non-standard" chars, but
1486 this routine wouldn't do anything with those escapes.
1487 Bummer. What we've done is to change print.c to output
1488 R6RS hex escapes for those characters, relying on the fact
1489 that the extended read syntax would never put a `\' before
1490 an `x'. For now, we just ignore other instances of
1491 backslash in the string. */
be632904 1492 switch ((chr = scm_getc_unlocked (port)))
d9527cfa
AW
1493 {
1494 case EOF:
1495 goto done;
1496 case 'x':
1497 {
1498 scm_t_wchar c;
1499
1500 SCM_READ_HEX_ESCAPE (10, ';');
1501 scm_i_string_set_x (buf, len++, c);
1502 break;
1503
1504 str_eof:
1505 chr = EOF;
1506 goto done;
1507
1508 bad_escaped:
1509 scm_i_string_stop_writing ();
1510 scm_i_input_error ("scm_read_extended_symbol", port,
1511 "illegal character in escape sequence: ~S",
1512 scm_list_1 (SCM_MAKE_CHAR (c)));
1513 break;
1514 }
1515 default:
1516 scm_i_string_set_x (buf, len++, chr);
1517 break;
1518 }
1519 }
7337d56d 1520 else
d9527cfa 1521 scm_i_string_set_x (buf, len++, chr);
62850ef3 1522
889975e5 1523 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1524 {
7f991c7d
LC
1525 SCM addy;
1526
889975e5 1527 scm_i_string_stop_writing ();
190d4b0d 1528 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1529 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1530 len = 0;
889975e5 1531 buf = scm_i_string_start_writing (buf);
7337d56d 1532 }
7337d56d 1533 }
d9527cfa
AW
1534
1535 done:
889975e5 1536 scm_i_string_stop_writing ();
d9527cfa
AW
1537 if (chr == EOF)
1538 scm_i_input_error ("scm_read_extended_symbol", port,
1539 "end of file while reading symbol", SCM_EOL);
7337d56d 1540
889975e5 1541 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1542}
1543
1544
1545\f
1546/* Top-level token readers, i.e., dispatchers. */
1547
1548static SCM
b1b5433d 1549scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1550{
1551 SCM proc;
1552
1553 proc = scm_get_hash_procedure (chr);
1554 if (scm_is_true (scm_procedure_p (proc)))
1555 {
1556 long line = SCM_LINUM (port);
1557 int column = SCM_COL (port) - 2;
1558 SCM got;
1559
1560 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14 1561
3655ed89
MW
1562 if (opts->record_positions_p && SCM_NIMP (got)
1563 && !scm_i_has_source_properties (got))
26c8cc14
AW
1564 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1565
1566 return got;
09a4f039 1567 }
7337d56d
LC
1568
1569 return SCM_UNSPECIFIED;
1570}
1571
1572/* The reader for the sharp `#' character. It basically dispatches reads
1573 among the above token readers. */
1574static SCM
b1b5433d
MW
1575scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1576 long line, int column)
7337d56d
LC
1577#define FUNC_NAME "scm_lreadr"
1578{
1579 SCM result;
1580
be632904 1581 chr = scm_getc_unlocked (port);
7337d56d 1582
b1b5433d 1583 result = scm_read_sharp_extension (chr, port, opts);
7337d56d
LC
1584 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1585 return result;
1586
1587 switch (chr)
1588 {
1589 case '\\':
b1b5433d 1590 return (scm_read_character (chr, port, opts));
7337d56d 1591 case '(':
b1b5433d 1592 return (scm_read_vector (chr, port, opts, line, column));
7337d56d
LC
1593 case 's':
1594 case 'u':
1595 case 'f':
df941b5b 1596 case 'c':
7337d56d 1597 /* This one may return either a boolean or an SRFI-4 vector. */
b1b5433d 1598 return (scm_read_srfi4_vector (chr, port, opts, line, column));
0ba0b384 1599 case 'v':
b1b5433d 1600 return (scm_read_bytevector (chr, port, opts, line, column));
7337d56d 1601 case '*':
b1b5433d 1602 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
7337d56d
LC
1603 case 't':
1604 case 'T':
1605 case 'F':
7337d56d
LC
1606 return (scm_read_boolean (chr, port));
1607 case ':':
b1b5433d 1608 return (scm_read_keyword (chr, port, opts));
7337d56d
LC
1609 case '0': case '1': case '2': case '3': case '4':
1610 case '5': case '6': case '7': case '8': case '9':
1611 case '@':
b1b5433d 1612 return (scm_read_array (chr, port, opts, line, column));
7337d56d
LC
1613
1614 case 'i':
1615 case 'e':
7337d56d
LC
1616 case 'b':
1617 case 'B':
1618 case 'o':
1619 case 'O':
1620 case 'd':
1621 case 'D':
1622 case 'x':
1623 case 'X':
1624 case 'I':
1625 case 'E':
b1b5433d 1626 return (scm_read_number_and_radix (chr, port, opts));
7337d56d
LC
1627 case '{':
1628 return (scm_read_extended_symbol (chr, port));
1629 case '!':
b1b5433d 1630 return (scm_read_shebang (chr, port, opts));
34f3d47d 1631 case ';':
b1b5433d 1632 return (scm_read_commented_expression (chr, port, opts));
34f3d47d
AW
1633 case '`':
1634 case '\'':
1635 case ',':
b1b5433d 1636 return (scm_read_syntax (chr, port, opts));
7c4aad9c 1637 case 'n':
b1b5433d 1638 return (scm_read_nil (chr, port, opts));
7337d56d 1639 default:
b1b5433d 1640 result = scm_read_sharp_extension (chr, port, opts);
7337d56d 1641 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1642 {
1643 /* To remain compatible with 1.8 and earlier, the following
1644 characters have lower precedence than `read-hash-extend'
1645 characters. */
1646 switch (chr)
1647 {
1648 case '|':
1649 return scm_read_r6rs_block_comment (chr, port);
1650 default:
1651 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1652 scm_list_1 (SCM_MAKE_CHAR (chr)));
1653 }
1654 }
7337d56d
LC
1655 else
1656 return result;
1657 }
1658
1659 return SCM_UNSPECIFIED;
1660}
1661#undef FUNC_NAME
1662
1663static SCM
bf9eb54a
MW
1664read_inner_expression (SCM port, scm_t_read_opts *opts)
1665#define FUNC_NAME "read_inner_expression"
7337d56d
LC
1666{
1667 while (1)
1668 {
cfd15439 1669 scm_t_wchar chr;
7337d56d 1670
be632904 1671 chr = scm_getc_unlocked (port);
7337d56d
LC
1672
1673 switch (chr)
1674 {
1675 case SCM_WHITE_SPACES:
1676 case SCM_LINE_INCREMENTORS:
1677 break;
1678 case ';':
1679 (void) scm_read_semicolon_comment (chr, port);
1680 break;
bf9eb54a
MW
1681 case '{':
1682 if (opts->curly_infix_p)
1683 {
1684 if (opts->neoteric_p)
1685 return scm_read_sexp (chr, port, opts);
1686 else
1687 {
1688 SCM expr;
1689
1690 /* Enable neoteric expressions within curly braces */
1691 opts->neoteric_p = 1;
1692 expr = scm_read_sexp (chr, port, opts);
1693 opts->neoteric_p = 0;
1694 return expr;
1695 }
1696 }
1697 else
1698 return scm_read_mixed_case_symbol (chr, port, opts);
5afa815c 1699 case '[':
bf9eb54a
MW
1700 if (opts->square_brackets_p)
1701 return scm_read_sexp (chr, port, opts);
1702 else if (opts->curly_infix_p)
1703 {
1704 /* The syntax of neoteric expressions requires that '[' be
1705 a delimiter when curly-infix is enabled, so it cannot
1706 be part of an unescaped symbol. We might as well do
1707 something useful with it, so we adopt Kawa's convention:
1708 [...] => ($bracket-list$ ...) */
1709 long line = SCM_LINUM (port);
1710 int column = SCM_COL (port) - 1;
1711 return maybe_annotate_source
1712 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1713 port, opts, line, column);
1714 }
1715 else
1716 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d 1717 case '(':
b1b5433d 1718 return (scm_read_sexp (chr, port, opts));
7337d56d 1719 case '"':
b1b5433d 1720 return (scm_read_string (chr, port, opts));
7337d56d
LC
1721 case '\'':
1722 case '`':
1723 case ',':
b1b5433d 1724 return (scm_read_quote (chr, port, opts));
7337d56d
LC
1725 case '#':
1726 {
b131b233
MW
1727 long line = SCM_LINUM (port);
1728 int column = SCM_COL (port) - 1;
b1b5433d 1729 SCM result = scm_read_sharp (chr, port, opts, line, column);
7337d56d
LC
1730 if (scm_is_eq (result, SCM_UNSPECIFIED))
1731 /* We read a comment or some such. */
1732 break;
1733 else
1734 return result;
1735 }
1736 case ')':
1737 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1738 break;
bf9eb54a
MW
1739 case '}':
1740 if (opts->curly_infix_p)
1741 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1742 else
1743 return scm_read_mixed_case_symbol (chr, port, opts);
a4e47229 1744 case ']':
b1b5433d 1745 if (opts->square_brackets_p)
a4e47229
MG
1746 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1747 /* otherwise fall through */
7337d56d
LC
1748 case EOF:
1749 return SCM_EOF_VAL;
1750 case ':':
b1b5433d
MW
1751 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1752 return scm_symbol_to_keyword (scm_read_expression (port, opts));
7337d56d
LC
1753 /* Fall through. */
1754
1755 default:
1756 {
1757 if (((chr >= '0') && (chr <= '9'))
1758 || (strchr ("+-.", chr)))
b1b5433d 1759 return (scm_read_number (chr, port, opts));
7337d56d 1760 else
b1b5433d 1761 return (scm_read_mixed_case_symbol (chr, port, opts));
7337d56d
LC
1762 }
1763 }
1764 }
1765}
1766#undef FUNC_NAME
1767
bf9eb54a
MW
1768static SCM
1769scm_read_expression (SCM port, scm_t_read_opts *opts)
1770#define FUNC_NAME "scm_read_expression"
1771{
1772 if (!opts->neoteric_p)
1773 return read_inner_expression (port, opts);
1774 else
1775 {
1776 long line = 0;
1777 int column = 0;
1778 SCM expr;
1779
1780 if (opts->record_positions_p)
1781 {
1782 /* We need to get the position of the first non-whitespace
1783 character in order to correctly annotate neoteric
1784 expressions. For example, for the expression 'f(x)', the
1785 first call to 'read_inner_expression' reads the 'f' (which
1786 cannot be annotated), and then we later read the '(x)' and
1787 use it to construct the new list (f x). */
1788 int c = flush_ws (port, opts, (char *) NULL);
1789 if (c == EOF)
1790 return SCM_EOF_VAL;
fa980bcc 1791 scm_ungetc_unlocked (c, port);
bf9eb54a
MW
1792 line = SCM_LINUM (port);
1793 column = SCM_COL (port);
1794 }
1795
1796 expr = read_inner_expression (port, opts);
1797
1798 /* 'expr' is the first component of the neoteric expression. Now
1799 we loop, and as long as the next character is '(', '[', or '{',
1800 (without any intervening whitespace), we use it to construct a
1801 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1802 for (;;)
1803 {
fa980bcc 1804 int chr = scm_getc_unlocked (port);
bf9eb54a
MW
1805
1806 if (chr == '(')
1807 /* e(...) => (e ...) */
1808 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1809 else if (chr == '[')
1810 /* e[...] => ($bracket-apply$ e ...) */
1811 expr = scm_cons (sym_bracket_apply,
1812 scm_cons (expr,
1813 scm_read_sexp (chr, port, opts)));
1814 else if (chr == '{')
1815 {
1816 SCM arg = scm_read_sexp (chr, port, opts);
1817
1818 if (scm_is_null (arg))
1819 expr = scm_list_1 (expr); /* e{} => (e) */
1820 else
1821 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1822 }
1823 else
1824 {
1825 if (chr != EOF)
fa980bcc 1826 scm_ungetc_unlocked (chr, port);
bf9eb54a
MW
1827 break;
1828 }
1829 maybe_annotate_source (expr, port, opts, line, column);
1830 }
1831 return expr;
1832 }
1833}
1834#undef FUNC_NAME
1835
7337d56d
LC
1836\f
1837/* Actual reader. */
1838
ea8c9761
MW
1839static void init_read_options (SCM port, scm_t_read_opts *opts);
1840
7337d56d
LC
1841SCM_DEFINE (scm_read, "read", 0, 1, 0,
1842 (SCM port),
1843 "Read an s-expression from the input port @var{port}, or from\n"
1844 "the current input port if @var{port} is not specified.\n"
1845 "Any whitespace before the next token is discarded.")
1846#define FUNC_NAME s_scm_read
1847{
b1b5433d 1848 scm_t_read_opts opts;
7337d56d
LC
1849 int c;
1850
1851 if (SCM_UNBNDP (port))
1852 port = scm_current_input_port ();
1853 SCM_VALIDATE_OPINPORT (1, port);
1854
ea8c9761 1855 init_read_options (port, &opts);
b1b5433d
MW
1856
1857 c = flush_ws (port, &opts, (char *) NULL);
7337d56d
LC
1858 if (EOF == c)
1859 return SCM_EOF_VAL;
c932ce0b 1860 scm_ungetc_unlocked (c, port);
7337d56d 1861
b1b5433d 1862 return (scm_read_expression (port, &opts));
09a4f039 1863}
db4b4ca6 1864#undef FUNC_NAME
09a4f039 1865
0f2d19dd
JB
1866
1867\f
1868
14de3b42
GH
1869/* Manipulate the read-hash-procedures alist. This could be written in
1870 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1871SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1872 (SCM chr, SCM proc),
dc7fa443
MG
1873 "Install the procedure @var{proc} for reading expressions\n"
1874 "starting with the character sequence @code{#} and @var{chr}.\n"
1875 "@var{proc} will be called with two arguments: the character\n"
1876 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1877 "returned will be the return value of @code{read}. \n"
1878 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1879 )
1bbd0b84 1880#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1881{
fed9c9a2
GH
1882 SCM this;
1883 SCM prev;
1884
36284627 1885 SCM_VALIDATE_CHAR (1, chr);
7888309b 1886 SCM_ASSERT (scm_is_false (proc)
bc36d050 1887 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1888 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1889
14de3b42 1890 /* Check if chr is already in the alist. */
d458073b 1891 this = scm_i_read_hash_procedures_ref ();
14de3b42 1892 prev = SCM_BOOL_F;
fed9c9a2
GH
1893 while (1)
1894 {
d2e53ed6 1895 if (scm_is_null (this))
fed9c9a2
GH
1896 {
1897 /* not found, so add it to the beginning. */
7888309b 1898 if (scm_is_true (proc))
fed9c9a2 1899 {
d458073b
AR
1900 SCM new = scm_cons (scm_cons (chr, proc),
1901 scm_i_read_hash_procedures_ref ());
1902 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1903 }
1904 break;
1905 }
bc36d050 1906 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1907 {
1908 /* already in the alist. */
7888309b 1909 if (scm_is_false (proc))
14de3b42
GH
1910 {
1911 /* remove it. */
7888309b 1912 if (scm_is_false (prev))
14de3b42 1913 {
d458073b
AR
1914 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1915 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1916 }
1917 else
1918 scm_set_cdr_x (prev, SCM_CDR (this));
1919 }
fed9c9a2 1920 else
14de3b42
GH
1921 {
1922 /* replace it. */
1923 scm_set_cdr_x (SCM_CAR (this), proc);
1924 }
fed9c9a2
GH
1925 break;
1926 }
1927 prev = this;
1928 this = SCM_CDR (this);
1929 }
deca31e1 1930
deca31e1
GH
1931 return SCM_UNSPECIFIED;
1932}
1bbd0b84 1933#undef FUNC_NAME
0f2d19dd 1934
deca31e1
GH
1935/* Recover the read-hash procedure corresponding to char c. */
1936static SCM
6e8d25a6 1937scm_get_hash_procedure (int c)
deca31e1 1938{
d458073b 1939 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1940
deca31e1
GH
1941 while (1)
1942 {
d2e53ed6 1943 if (scm_is_null (rest))
deca31e1
GH
1944 return SCM_BOOL_F;
1945
7866a09b 1946 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1947 return SCM_CDAR (rest);
1948
1949 rest = SCM_CDR (rest);
1950 }
1951}
1cc91f1b 1952
889975e5
MG
1953#define SCM_ENCODING_SEARCH_SIZE (500)
1954
93c4fa21
AW
1955static int
1956is_encoding_char (char c)
1957{
1958 if (c >= 'a' && c <= 'z') return 1;
1959 if (c >= 'A' && c <= 'Z') return 1;
1960 if (c >= '0' && c <= '9') return 1;
1961 return strchr ("_-.:/,+=()", c) != NULL;
1962}
1963
f8a1c9a8
LC
1964/* Search the first few hundred characters of a file for an Emacs-like coding
1965 declaration. Returns either NULL or a string whose storage has been
1966 allocated with `scm_gc_malloc ()'. */
889975e5 1967char *
f8a1c9a8 1968scm_i_scan_for_encoding (SCM port)
889975e5 1969{
d900843c 1970 scm_t_port *pt;
889975e5 1971 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1972 size_t bytes_read, encoding_length, i;
889975e5 1973 char *encoding = NULL;
daedbca7 1974 char *pos, *encoding_start;
889975e5
MG
1975 int in_comment;
1976
d900843c 1977 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1978
d900843c 1979 if (pt->rw_active == SCM_PORT_WRITE)
4251ae2e 1980 scm_flush_unlocked (port);
49bb5bd3 1981
d900843c
AW
1982 if (pt->rw_random)
1983 pt->rw_active = SCM_PORT_READ;
1984
1985 if (pt->read_pos == pt->read_end)
1986 {
1987 /* We can use the read buffer, and thus avoid a seek. */
4251ae2e 1988 if (scm_fill_input_unlocked (port) == EOF)
d900843c
AW
1989 return NULL;
1990
1991 bytes_read = pt->read_end - pt->read_pos;
1992 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1993 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1994
1995 if (bytes_read <= 1)
1996 /* An unbuffered port -- don't scan. */
1997 return NULL;
1998
1999 memcpy (header, pt->read_pos, bytes_read);
2000 header[bytes_read] = '\0';
2001 }
2002 else
2003 {
2004 /* Try to read some bytes and then seek back. Not all ports
2005 support seeking back; and indeed some file ports (like
2006 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2007 check performed by SCM_FPORT_FDES---but fail to seek
2008 backwards. Hence this block comes second. We prefer to use
2009 the read buffer in-place. */
2010 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2011 return NULL;
2012
be632904 2013 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
d900843c
AW
2014 header[bytes_read] = '\0';
2015 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2016 }
889975e5 2017
889975e5
MG
2018 /* search past "coding[:=]" */
2019 pos = header;
2020 while (1)
2021 {
2022 if ((pos = strstr(pos, "coding")) == NULL)
2023 return NULL;
2024
2025 pos += strlen("coding");
2026 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2027 (*pos == ':' || *pos == '='))
2028 {
2029 pos ++;
2030 break;
2031 }
2032 }
2033
2034 /* skip spaces */
2035 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2036 (*pos == ' ' || *pos == '\t'))
2037 pos ++;
2038
2039 /* grab the next token */
daedbca7 2040 encoding_start = pos;
889975e5 2041 i = 0;
daedbca7
MG
2042 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2043 && encoding_start + i - header < bytes_read
93c4fa21 2044 && is_encoding_char (encoding_start[i]))
889975e5
MG
2045 i++;
2046
daedbca7
MG
2047 encoding_length = i;
2048 if (encoding_length == 0)
889975e5
MG
2049 return NULL;
2050
daedbca7 2051 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
889975e5
MG
2052
2053 /* push backwards to make sure we were in a comment */
2054 in_comment = 0;
daedbca7
MG
2055 pos = encoding_start;
2056 while (pos >= header)
889975e5 2057 {
8a12aeb9
AW
2058 if (*pos == ';')
2059 {
2060 in_comment = 1;
2061 break;
2062 }
2063 else if (*pos == '\n' || pos == header)
889975e5
MG
2064 {
2065 /* This wasn't in a semicolon comment. Check for a
2066 hash-bang comment. */
2067 char *beg = strstr (header, "#!");
2068 char *end = strstr (header, "!#");
8a12aeb9 2069 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
2070 in_comment = 1;
2071 break;
2072 }
8a12aeb9
AW
2073 else
2074 {
2075 pos --;
2076 continue;
2077 }
889975e5
MG
2078 }
2079 if (!in_comment)
f8a1c9a8
LC
2080 /* This wasn't in a comment */
2081 return NULL;
2082
889975e5
MG
2083 return encoding;
2084}
2085
2086SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2087 (SCM port),
a270e133 2088 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 2089 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
2090 "The coding declaration is of the form\n"
2091 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2092 "\n"
2093 "Returns a string containing the character encoding of the file\n"
2094 "if a declaration was found, or @code{#f} otherwise.\n")
2095#define FUNC_NAME s_scm_file_encoding
2096{
2097 char *enc;
2098 SCM s_enc;
f8a1c9a8 2099
d900843c
AW
2100 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2101
f8a1c9a8 2102 enc = scm_i_scan_for_encoding (port);
889975e5
MG
2103 if (enc == NULL)
2104 return SCM_BOOL_F;
2105 else
2106 {
f6f4feb0 2107 s_enc = scm_string_upcase (scm_from_locale_string (enc));
889975e5
MG
2108 return s_enc;
2109 }
f8a1c9a8 2110
889975e5
MG
2111 return SCM_BOOL_F;
2112}
2113#undef FUNC_NAME
2114
ea8c9761
MW
2115\f
2116/* Per-port read options.
2117
2118 We store per-port read options in the 'port-read-options' key of the
05d7f762
MW
2119 port's alist, which is stored in the internal port structure. The
2120 value stored in the alist is a single integer that contains a two-bit
2121 field for each read option.
ea8c9761
MW
2122
2123 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2124 the applicable value should be inherited from the corresponding
2125 global read option. Otherwise, the bit field contains the value of
2126 the read option. For boolean read options that have been set
2127 per-port, the possible values are 0 or 1. If the 'keyword_style'
2128 read option has been set per-port, its possible values are those in
2129 'enum t_keyword_style'. */
2130
2131/* Key to read options in per-port alists. */
2132SCM_SYMBOL (sym_port_read_options, "port-read-options");
2133
2134/* Offsets of bit fields for each per-port override */
2135#define READ_OPTION_COPY_SOURCE_P 0
2136#define READ_OPTION_RECORD_POSITIONS_P 2
2137#define READ_OPTION_CASE_INSENSITIVE_P 4
2138#define READ_OPTION_KEYWORD_STYLE 6
2139#define READ_OPTION_R6RS_ESCAPES_P 8
2140#define READ_OPTION_SQUARE_BRACKETS_P 10
2141#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
bf9eb54a 2142#define READ_OPTION_CURLY_INFIX_P 14
ea8c9761 2143
bf9eb54a
MW
2144/* The total width in bits of the per-port overrides */
2145#define READ_OPTIONS_NUM_BITS 16
ea8c9761
MW
2146
2147#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2148#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2149
2150#define READ_OPTION_MASK 3
2151#define READ_OPTION_INHERIT 3
2152
2153static void
2154set_port_read_option (SCM port, int option, int new_value)
2155{
2156 SCM alist, scm_read_options;
2157 unsigned int read_options;
2158
2159 new_value &= READ_OPTION_MASK;
05d7f762 2160 alist = scm_i_port_alist (port);
ea8c9761
MW
2161 scm_read_options = scm_assq_ref (alist, sym_port_read_options);
2162 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2163 read_options = scm_to_uint (scm_read_options);
2164 else
2165 read_options = READ_OPTIONS_INHERIT_ALL;
2166 read_options &= ~(READ_OPTION_MASK << option);
2167 read_options |= new_value << option;
2168 scm_read_options = scm_from_uint (read_options);
2169 alist = scm_assq_set_x (alist, sym_port_read_options, scm_read_options);
05d7f762 2170 scm_i_set_port_alist_x (port, alist);
ea8c9761
MW
2171}
2172
9331ffd8
MW
2173/* Set OPTS and PORT's case-insensitivity according to VALUE. */
2174static void
2175set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2176{
2177 value = !!value;
2178 opts->case_insensitive_p = value;
2179 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2180}
2181
bf9eb54a
MW
2182/* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2183static void
2184set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2185{
2186 value = !!value;
2187 opts->square_brackets_p = value;
2188 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2189}
2190
2191/* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2192static void
2193set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2194{
2195 value = !!value;
2196 opts->curly_infix_p = value;
2197 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2198}
2199
ea8c9761
MW
2200/* Initialize OPTS based on PORT's read options and the global read
2201 options. */
2202static void
2203init_read_options (SCM port, scm_t_read_opts *opts)
2204{
2205 SCM alist, val, scm_read_options;
2206 unsigned int read_options, x;
2207
05d7f762 2208 alist = scm_i_port_alist (port);
ea8c9761 2209 scm_read_options = scm_assq_ref (alist, sym_port_read_options);
ea8c9761
MW
2210
2211 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2212 read_options = scm_to_uint (scm_read_options);
2213 else
2214 read_options = READ_OPTIONS_INHERIT_ALL;
2215
2216 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2217 if (x == READ_OPTION_INHERIT)
2218 {
2219 val = SCM_PACK (SCM_KEYWORD_STYLE);
2220 if (scm_is_eq (val, scm_keyword_prefix))
2221 x = KEYWORD_STYLE_PREFIX;
2222 else if (scm_is_eq (val, scm_keyword_postfix))
2223 x = KEYWORD_STYLE_POSTFIX;
2224 else
2225 x = KEYWORD_STYLE_HASH_PREFIX;
2226 }
2227 opts->keyword_style = x;
2228
2229#define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2230 do \
2231 { \
2232 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2233 if (x == READ_OPTION_INHERIT) \
2234 x = !!SCM_ ## NAME; \
2235 opts->name = x; \
2236 } \
2237 while (0)
2238
2239 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2240 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2241 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2242 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2243 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2244 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
bf9eb54a 2245 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
ea8c9761
MW
2246
2247#undef RESOLVE_BOOLEAN_OPTION
bf9eb54a
MW
2248
2249 opts->neoteric_p = 0;
ea8c9761
MW
2250}
2251
0f2d19dd
JB
2252void
2253scm_init_read ()
0f2d19dd 2254{
d458073b
AR
2255 SCM read_hash_procs;
2256
c81c2ad3 2257 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
2258
2259 scm_i_read_hash_procedures =
2260 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 2261
62560650 2262 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 2263#include "libguile/read.x"
0f2d19dd 2264}
89e00824
ML
2265
2266/*
2267 Local Variables:
2268 c-file-style: "gnu"
2269 End:
2270*/