web: Don't throw if a response is longer than its Content-Length says.
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37 1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
7a329029 2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
96965a6e 33#include <c-strcase.h>
0fc54828 34#include <c-ctype.h>
7337d56d 35
a0599745 36#include "libguile/_scm.h"
0ba0b384 37#include "libguile/bytevectors.h"
a0599745
MD
38#include "libguile/chars.h"
39#include "libguile/eval.h"
2fa901a5 40#include "libguile/arrays.h"
cf396142 41#include "libguile/bitvectors.h"
a0599745
MD
42#include "libguile/keywords.h"
43#include "libguile/alist.h"
44#include "libguile/srcprop.h"
45#include "libguile/hashtab.h"
46#include "libguile/hash.h"
47#include "libguile/ports.h"
49bb5bd3 48#include "libguile/fports.h"
a0599745
MD
49#include "libguile/root.h"
50#include "libguile/strings.h"
ba1b2226 51#include "libguile/strports.h"
a0599745 52#include "libguile/vectors.h"
a0599745 53#include "libguile/validate.h"
a4022e69 54#include "libguile/srfi-4.h"
7337d56d 55#include "libguile/srfi-13.h"
ba1b2226 56
a0599745 57#include "libguile/read.h"
22fc179a
HWN
58#include "libguile/private-options.h"
59
0f2d19dd
JB
60
61\f
62
5bf6a6f0 63SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 64SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 65SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 66SCM_SYMBOL (sym_nil, "nil");
c7733771 67
bf9eb54a
MW
68/* SRFI-105 curly infix expression support */
69SCM_SYMBOL (sym_nfx, "$nfx$");
70SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
71SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
72
73scm_t_option scm_read_opts[] =
74 {
75 { SCM_OPTION_BOOLEAN, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
dc59631d
MW
91 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
bf9eb54a
MW
93 { 0, },
94 };
b1b5433d
MW
95
96/* Internal read options structure. This is initialized by 'scm_read'
ea8c9761
MW
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
99
100enum t_keyword_style
101 {
102 KEYWORD_STYLE_HASH_PREFIX,
103 KEYWORD_STYLE_PREFIX,
104 KEYWORD_STYLE_POSTFIX
105 };
b1b5433d 106
ea8c9761
MW
107struct t_read_opts
108{
b1b5433d
MW
109 enum t_keyword_style keyword_style;
110 unsigned int copy_source_p : 1;
111 unsigned int record_positions_p : 1;
112 unsigned int case_insensitive_p : 1;
113 unsigned int r6rs_escapes_p : 1;
114 unsigned int square_brackets_p : 1;
115 unsigned int hungry_eol_escapes_p : 1;
bf9eb54a
MW
116 unsigned int curly_infix_p : 1;
117 unsigned int neoteric_p : 1;
dc59631d 118 unsigned int r7rs_symbols_p : 1;
b1b5433d
MW
119};
120
121typedef struct t_read_opts scm_t_read_opts;
122
a16f6fe7 123
39e8f371
HWN
124/*
125 Give meaningful error messages for errors
126
127 We use the format
128
ba1b2226 129 FILE:LINE:COL: MESSAGE
39e8f371
HWN
130 This happened in ....
131
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
134
39e8f371
HWN
135 */
136
137
a4022e69
MV
138void
139scm_i_input_error (char const *function,
140 SCM port, const char *message, SCM arg)
ba1b2226 141{
29a837fd
MV
142 SCM fn = (scm_is_string (SCM_FILENAME(port))
143 ? SCM_FILENAME(port)
144 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 145
29a837fd 146 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
147 SCM string = SCM_EOL;
148 scm_simple_format (string_port,
272632a6 149 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 150 scm_list_4 (fn,
b3aa4626 151 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 152 scm_from_int (SCM_COL (port) + 1),
272632a6 153 scm_from_locale_string (message)));
ba1b2226
HWN
154
155 string = scm_get_output_string (string_port);
156 scm_close_output_port (string_port);
4a655e50 157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 158 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 159 string,
dd72382c 160 arg,
ba1b2226
HWN
161 SCM_BOOL_F);
162}
39e8f371
HWN
163
164
a1ec6916 165SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 166 (SCM setting),
dc7fa443
MG
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 170#define FUNC_NAME s_scm_read_options
a16f6fe7 171{
b7ff98dd
MD
172 SCM ans = scm_options (setting,
173 scm_read_opts,
1bbd0b84 174 FUNC_NAME);
b7ff98dd
MD
175 if (SCM_COPY_SOURCE_P)
176 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
177 return ans;
178}
1bbd0b84 179#undef FUNC_NAME
a16f6fe7 180
d458073b
AR
181/* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183static SCM *scm_i_read_hash_procedures;
deca31e1 184
cfd15439 185static SCM
d458073b
AR
186scm_i_read_hash_procedures_ref (void)
187{
188 return scm_fluid_ref (*scm_i_read_hash_procedures);
189}
190
cfd15439 191static void
d458073b
AR
192scm_i_read_hash_procedures_set_x (SCM value)
193{
194 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
195}
0f2d19dd 196
7337d56d
LC
197\f
198/* Token readers. */
0f2d19dd 199
0f2d19dd 200
7337d56d
LC
201/* Size of the C buffer used to read symbols and numbers. */
202#define READER_BUFFER_SIZE 128
0f2d19dd 203
ff4d3672
LC
204/* Number of 32-bit codepoints in the buffer used to read strings. */
205#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 206
7337d56d
LC
207/* The maximum size of Scheme character names. */
208#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 209
02327c0c
MW
210/* The maximum size of reader directive names. */
211#define READER_DIRECTIVE_NAME_MAX_SIZE 50
212
94115ae3 213
7337d56d
LC
214/* `isblank' is only in C99. */
215#define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 217 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
218
219#ifdef MSDOS
220# define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
222#else
223# define CHAR_IS_BLANK CHAR_IS_BLANK_
224#endif
225
226
227/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
228 structure''). */
229#define CHAR_IS_R5RS_DELIMITER(c) \
230 (CHAR_IS_BLANK (c) \
603234c6 231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
7337d56d 232
603234c6
MW
233#define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
bf9eb54a
MW
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
7337d56d
LC
238
239/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
240 Structure''. */
241#define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
244
454866e0 245/* Read an SCSH block comment. */
cfd15439 246static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965 247static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
b1b5433d
MW
248static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
249static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
620c8965 250static SCM scm_get_hash_procedure (int);
454866e0 251
69f90b0b
MG
252/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
cfd15439 256static int
b1b5433d
MW
257read_token (SCM port, scm_t_read_opts *opts,
258 char *buf, size_t buf_size, size_t *read)
b662b7e9 259{
69f90b0b 260 *read = 0;
0520c320 261
69f90b0b
MG
262 while (*read < buf_size)
263 {
264 int chr;
7337d56d 265
69f90b0b 266 chr = scm_get_byte_or_eof (port);
889975e5 267
69f90b0b
MG
268 if (chr == EOF)
269 return 0;
270 else if (CHAR_IS_DELIMITER (chr))
271 {
272 scm_unget_byte (chr, port);
273 return 0;
274 }
275 else
276 {
277 *buf = (char) chr;
278 buf++, (*read)++;
279 }
280 }
889975e5 281
69f90b0b
MG
282 return 1;
283 }
889975e5 284
b662b7e9
LC
285/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
287static char *
b1b5433d
MW
288read_complete_token (SCM port, scm_t_read_opts *opts,
289 char *buffer, size_t buffer_size, size_t *read)
69f90b0b
MG
290{
291 int overflow = 0;
b662b7e9
LC
292 size_t bytes_read, overflow_size = 0;
293 char *overflow_buffer = NULL;
1cc91f1b 294
69f90b0b
MG
295 do
296 {
b1b5433d 297 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
69f90b0b
MG
298 if (bytes_read == 0)
299 break;
300 if (overflow || overflow_size != 0)
301 {
302 if (overflow_size == 0)
303 {
b662b7e9
LC
304 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
305 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
306 overflow_size = bytes_read;
307 }
308 else
309 {
7be3c2fc 310 char *new_buf =
b662b7e9
LC
311 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
312
313 memcpy (new_buf, overflow_buffer, overflow_size);
314 memcpy (new_buf + overflow_size, buffer, bytes_read);
315
316 overflow_buffer = new_buf;
69f90b0b
MG
317 overflow_size += bytes_read;
318 }
319 }
889975e5 320 }
69f90b0b 321 while (overflow);
889975e5 322
69f90b0b
MG
323 if (overflow_size)
324 *read = overflow_size;
b521d265 325 else
69f90b0b
MG
326 *read = bytes_read;
327
b662b7e9 328 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 329}
7337d56d
LC
330
331/* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
333static int
b1b5433d 334flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
0f2d19dd 335{
cfd15439 336 scm_t_wchar c;
0f2d19dd 337 while (1)
b7f3516f 338 switch (c = scm_getc (port))
0f2d19dd
JB
339 {
340 case EOF:
341 goteof:
342 if (eoferr)
d156d3b7 343 {
a4022e69
MV
344 scm_i_input_error (eoferr,
345 port,
346 "end of file",
347 SCM_EOL);
d156d3b7 348 }
0f2d19dd 349 return c;
7337d56d 350
0f2d19dd
JB
351 case ';':
352 lp:
b7f3516f 353 switch (c = scm_getc (port))
0f2d19dd
JB
354 {
355 case EOF:
356 goto goteof;
357 default:
358 goto lp;
359 case SCM_LINE_INCREMENTORS:
360 break;
361 }
362 break;
7337d56d 363
454866e0
LC
364 case '#':
365 switch (c = scm_getc (port))
366 {
367 case EOF:
368 eoferr = "read_sharp";
369 goto goteof;
370 case '!':
b1b5433d 371 scm_read_shebang (c, port, opts);
454866e0 372 break;
34f3d47d 373 case ';':
b1b5433d 374 scm_read_commented_expression (c, port, opts);
34f3d47d 375 break;
620c8965
LC
376 case '|':
377 if (scm_is_false (scm_get_hash_procedure (c)))
378 {
379 scm_read_r6rs_block_comment (c, port);
380 break;
381 }
382 /* fall through */
454866e0
LC
383 default:
384 scm_ungetc (c, port);
385 return '#';
386 }
387 break;
388
0f2d19dd 389 case SCM_LINE_INCREMENTORS:
0f2d19dd 390 case SCM_SINGLE_SPACES:
0f2d19dd 391 case '\t':
0f2d19dd 392 break;
7337d56d 393
0f2d19dd
JB
394 default:
395 return c;
396 }
7337d56d
LC
397
398 return 0;
0f2d19dd
JB
399}
400
401
7337d56d
LC
402\f
403/* Token readers. */
1cc91f1b 404
b1b5433d
MW
405static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
406static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
407 long line, int column);
0f2d19dd
JB
408
409
d6cb0203 410static SCM
b1b5433d
MW
411maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
412 long line, int column)
d6cb0203 413{
b1b5433d 414 if (opts->record_positions_p)
d6cb0203
MW
415 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
416 return x;
417}
418
09a4f039 419static SCM
b1b5433d 420scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 421#define FUNC_NAME "scm_i_lreadparen"
09a4f039 422{
26c8cc14
AW
423 int c;
424 SCM tmp, tl, ans = SCM_EOL;
bf9eb54a
MW
425 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
426 const int terminating_char = ((chr == '{') ? '}'
427 : ((chr == '[') ? ']'
428 : ')'));
7337d56d
LC
429
430 /* Need to capture line and column numbers here. */
431 long line = SCM_LINUM (port);
432 int column = SCM_COL (port) - 1;
f9c68a47 433
b1b5433d 434 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
435 if (terminating_char == c)
436 return SCM_EOL;
f9c68a47 437
7337d56d 438 scm_ungetc (c, port);
b1b5433d 439 tmp = scm_read_expression (port, opts);
1f7945a7
AW
440
441 /* Note that it is possible for scm_read_expression to return
442 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
443 check that it's a real dot by checking `c'. */
444 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d 445 {
b1b5433d
MW
446 ans = scm_read_expression (port, opts);
447 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
7337d56d
LC
448 scm_i_input_error (FUNC_NAME, port, "missing close paren",
449 SCM_EOL);
450 return ans;
451 }
1cc91f1b 452
7337d56d
LC
453 /* Build the head of the list structure. */
454 ans = tl = scm_cons (tmp, SCM_EOL);
455
b1b5433d 456 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
0f2d19dd 457 {
7337d56d 458 SCM new_tail;
0f2d19dd 459
bf9eb54a
MW
460 if (c == ')' || (c == ']' && opts->square_brackets_p)
461 || ((c == '}' || c == ']') && opts->curly_infix_p))
5b69315e
AW
462 scm_i_input_error (FUNC_NAME, port,
463 "in pair: mismatched close paren: ~A",
464 scm_list_1 (SCM_MAKE_CHAR (c)));
465
7337d56d 466 scm_ungetc (c, port);
b1b5433d 467 tmp = scm_read_expression (port, opts);
5b69315e 468
1f7945a7
AW
469 /* See above note about scm_sym_dot. */
470 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 471 {
b1b5433d 472 SCM_SETCDR (tl, scm_read_expression (port, opts));
7337d56d 473
b1b5433d 474 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
475 if (terminating_char != c)
476 scm_i_input_error (FUNC_NAME, port,
477 "in pair: missing close paren", SCM_EOL);
bf9eb54a 478 break;
0f2d19dd 479 }
b858464a 480
7337d56d
LC
481 new_tail = scm_cons (tmp, SCM_EOL);
482 SCM_SETCDR (tl, new_tail);
483 tl = new_tail;
7337d56d 484 }
0f2d19dd 485
bf9eb54a
MW
486 if (curly_list_p)
487 {
488 /* In addition to finding the length, 'scm_ilength' checks for
489 improper or circular lists, in which case it returns -1. */
490 int len = scm_ilength (ans);
491
492 /* The (len == 0) case is handled above */
493 if (len == 1)
494 /* Return directly to avoid re-annotating the element's source
495 location with the position of the outer brace. Also, it
496 might not be possible to annotate the element. */
497 return scm_car (ans); /* {e} => e */
498 else if (len == 2)
499 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
500 else if (len >= 3 && (len & 1))
501 {
502 /* It's a proper list whose length is odd and at least 3. If
503 the elements at odd indices (the infix operator positions)
504 are all 'equal?', then it's a simple curly-infix list.
505 Otherwise it's a mixed curly-infix list. */
506 SCM op = scm_cadr (ans);
507
508 /* Check to see if the elements at odd indices are 'equal?' */
509 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
510 {
511 if (scm_is_null (tl))
512 {
513 /* Convert simple curly-infix list to prefix:
514 {a <op> b <op> ...} => (<op> a b ...) */
515 tl = ans;
516 while (scm_is_pair (scm_cdr (tl)))
517 {
518 tmp = scm_cddr (tl);
519 SCM_SETCDR (tl, tmp);
520 tl = tmp;
521 }
522 ans = scm_cons (op, ans);
523 break;
524 }
525 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
526 {
527 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
528 ans = scm_cons (sym_nfx, ans);
529 break;
530 }
531 }
532 }
533 else
534 /* Mixed curly-infix (possibly improper) list:
535 {e . tail} => ($nfx$ e . tail) */
536 ans = scm_cons (sym_nfx, ans);
537 }
538
b1b5433d 539 return maybe_annotate_source (ans, port, opts, line, column);
7337d56d
LC
540}
541#undef FUNC_NAME
a4022e69 542
c5661d28
MG
543
544/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
545 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
546 found. */
547#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
548 do \
549 { \
550 scm_t_wchar a; \
551 size_t i = 0; \
552 c = 0; \
553 while (i < ndigits) \
554 { \
555 a = scm_getc (port); \
556 if (a == EOF) \
557 goto str_eof; \
558 if (terminator \
559 && (a == (scm_t_wchar) terminator) \
560 && (i > 0)) \
561 break; \
562 if ('0' <= a && a <= '9') \
563 a -= '0'; \
564 else if ('A' <= a && a <= 'F') \
565 a = a - 'A' + 10; \
566 else if ('a' <= a && a <= 'f') \
567 a = a - 'a' + 10; \
568 else \
569 { \
570 c = a; \
571 goto bad_escaped; \
572 } \
573 c = c * 16 + a; \
574 i ++; \
575 } \
c5661d28
MG
576 } while (0)
577
684d664e
AW
578static void
579skip_intraline_whitespace (SCM port)
580{
581 scm_t_wchar c;
582
583 do
584 {
585 c = scm_getc (port);
586 if (c == EOF)
587 return;
588 }
589 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
590
591 scm_ungetc (c, port);
592}
593
dc59631d
MW
594/* Read either a double-quoted string or an R7RS-style symbol delimited
595 by vertical lines, depending on the value of 'chr' ('"' or '|').
596 Regardless, the result is always returned as a string. */
7337d56d 597static SCM
dc59631d 598scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
599#define FUNC_NAME "scm_lreadr"
600{
601 /* For strings smaller than C_STR, this function creates only one Scheme
602 object (the string returned). */
0f2d19dd 603
ff4d3672
LC
604 SCM str = SCM_EOL;
605 size_t c_str_len = 0;
606 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 607
b131b233
MW
608 /* Need to capture line and column numbers here. */
609 long line = SCM_LINUM (port);
610 int column = SCM_COL (port) - 1;
611
dc59631d 612 while (chr != (c = scm_getc (port)))
7337d56d
LC
613 {
614 if (c == EOF)
9c44cd45
MG
615 {
616 str_eof:
617 scm_i_input_error (FUNC_NAME, port,
dc59631d
MW
618 (chr == '|'
619 ? "end of file in symbol"
620 : "end of file in string constant"),
621 SCM_EOL);
9c44cd45 622 }
0f2d19dd 623
ff4d3672
LC
624 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
625 {
626 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
627 c_str_len = 0;
628 }
0f2d19dd 629
7337d56d 630 if (c == '\\')
9c44cd45
MG
631 {
632 switch (c = scm_getc (port))
633 {
634 case EOF:
635 goto str_eof;
6579c330 636 case '|':
9c44cd45
MG
637 case '\\':
638 break;
9c44cd45 639 case '\n':
b1b5433d 640 if (opts->hungry_eol_escapes_p)
684d664e 641 skip_intraline_whitespace (port);
9c44cd45
MG
642 continue;
643 case '0':
644 c = '\0';
645 break;
646 case 'f':
647 c = '\f';
648 break;
649 case 'n':
650 c = '\n';
651 break;
652 case 'r':
653 c = '\r';
654 break;
655 case 't':
656 c = '\t';
657 break;
658 case 'a':
659 c = '\007';
660 break;
661 case 'v':
662 c = '\v';
663 break;
67a4a16d
MG
664 case 'b':
665 c = '\010';
666 break;
9c44cd45 667 case 'x':
dc59631d 668 if (opts->r6rs_escapes_p || chr == '|')
dea901d6
MG
669 SCM_READ_HEX_ESCAPE (10, ';');
670 else
671 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 672 break;
9c44cd45 673 case 'u':
b1b5433d 674 if (!opts->r6rs_escapes_p)
898a0b5a
MG
675 {
676 SCM_READ_HEX_ESCAPE (4, '\0');
677 break;
678 }
9c44cd45 679 case 'U':
b1b5433d 680 if (!opts->r6rs_escapes_p)
898a0b5a
MG
681 {
682 SCM_READ_HEX_ESCAPE (6, '\0');
683 break;
684 }
9c44cd45 685 default:
dc59631d
MW
686 if (c == chr)
687 break;
9c44cd45
MG
688 bad_escaped:
689 scm_i_input_error (FUNC_NAME, port,
690 "illegal character in escape sequence: ~S",
691 scm_list_1 (SCM_MAKE_CHAR (c)));
692 }
693 }
ff4d3672
LC
694
695 c_str[c_str_len++] = c;
7337d56d 696 }
ff4d3672
LC
697
698 if (scm_is_null (str))
699 /* Fast path: we got a string that fits in C_STR. */
700 str = scm_from_utf32_stringn (c_str, c_str_len);
701 else
702 {
703 if (c_str_len > 0)
704 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
705
706 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
707 }
708
b1b5433d 709 return maybe_annotate_source (str, port, opts, line, column);
0f2d19dd 710}
db4b4ca6
DH
711#undef FUNC_NAME
712
dc59631d
MW
713static SCM
714scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
715{
716 return scm_read_string_like_syntax (chr, port, opts);
717}
718
719static SCM
720scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
721{
722 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
723}
0f2d19dd 724
7337d56d 725static SCM
b1b5433d 726scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
0f2d19dd 727{
69f90b0b 728 SCM result, str = SCM_EOL;
b662b7e9 729 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 730 size_t bytes_read;
69f90b0b 731 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 732
38f19074
MW
733 /* Need to capture line and column numbers here. */
734 long line = SCM_LINUM (port);
735 int column = SCM_COL (port) - 1;
736
7337d56d 737 scm_ungetc (chr, port);
b1b5433d 738 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 739 &bytes_read);
69f90b0b 740
b662b7e9 741 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
742
743 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 744 if (scm_is_false (result))
69f90b0b
MG
745 {
746 /* Return a symbol instead of a number */
b1b5433d 747 if (opts->case_insensitive_p)
69f90b0b
MG
748 str = scm_string_downcase_x (str);
749 result = scm_string_to_symbol (str);
750 }
38f19074 751 else if (SCM_NIMP (result))
b1b5433d 752 result = maybe_annotate_source (result, port, opts, line, column);
0f2d19dd 753
69f90b0b 754 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
755 return result;
756}
0f2d19dd 757
7337d56d 758static SCM
b1b5433d 759scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 760{
889975e5
MG
761 SCM result;
762 int ends_with_colon = 0;
69f90b0b 763 size_t bytes_read;
b1b5433d 764 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
b662b7e9 765 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b
MG
766 scm_t_port *pt = SCM_PTAB_ENTRY (port);
767 SCM str;
7337d56d
LC
768
769 scm_ungetc (chr, port);
b1b5433d 770 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 771 &bytes_read);
69f90b0b 772 if (bytes_read > 0)
b662b7e9 773 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 774
69f90b0b
MG
775 if (postfix && ends_with_colon && (bytes_read > 1))
776 {
b662b7e9
LC
777 str = scm_from_stringn (buffer, bytes_read - 1,
778 pt->encoding, pt->ilseq_handler);
69f90b0b 779
b1b5433d 780 if (opts->case_insensitive_p)
69f90b0b
MG
781 str = scm_string_downcase_x (str);
782 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
783 }
7337d56d 784 else
69f90b0b 785 {
b662b7e9
LC
786 str = scm_from_stringn (buffer, bytes_read,
787 pt->encoding, pt->ilseq_handler);
69f90b0b 788
b1b5433d 789 if (opts->case_insensitive_p)
69f90b0b
MG
790 str = scm_string_downcase_x (str);
791 result = scm_string_to_symbol (str);
792 }
7337d56d 793
69f90b0b 794 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
795 return result;
796}
797
798static SCM
b1b5433d 799scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
800#define FUNC_NAME "scm_lreadr"
801{
889975e5 802 SCM result;
7337d56d 803 size_t read;
b662b7e9 804 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 805 unsigned int radix;
69f90b0b
MG
806 SCM str;
807 scm_t_port *pt;
7337d56d
LC
808
809 switch (chr)
810 {
811 case 'B':
812 case 'b':
813 radix = 2;
814 break;
815
816 case 'o':
817 case 'O':
818 radix = 8;
819 break;
820
821 case 'd':
822 case 'D':
823 radix = 10;
824 break;
825
826 case 'x':
827 case 'X':
828 radix = 16;
829 break;
830
831 default:
832 scm_ungetc (chr, port);
833 scm_ungetc ('#', port);
834 radix = 10;
835 }
836
b1b5433d 837 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 838 &read);
69f90b0b
MG
839
840 pt = SCM_PTAB_ENTRY (port);
b662b7e9 841 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
842
843 result = scm_string_to_number (str, scm_from_uint (radix));
844
69f90b0b 845 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
846
847 if (scm_is_true (result))
848 return result;
849
850 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
851
852 return SCM_BOOL_F;
853}
854#undef FUNC_NAME
855
856static SCM
b1b5433d 857scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
858{
859 SCM p;
492faee1
LC
860 long line = SCM_LINUM (port);
861 int column = SCM_COL (port) - 1;
7337d56d
LC
862
863 switch (chr)
864 {
865 case '`':
866 p = scm_sym_quasiquote;
867 break;
868
869 case '\'':
870 p = scm_sym_quote;
871 break;
872
873 case ',':
874 {
889975e5 875 scm_t_wchar c;
7337d56d
LC
876
877 c = scm_getc (port);
878 if ('@' == c)
879 p = scm_sym_uq_splicing;
880 else
0f2d19dd 881 {
7337d56d
LC
882 scm_ungetc (c, port);
883 p = scm_sym_unquote;
0f2d19dd 884 }
7337d56d
LC
885 break;
886 }
0f2d19dd 887
7337d56d
LC
888 default:
889 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 890 "scm_read_quote", chr);
7337d56d 891 abort ();
0f2d19dd 892 }
1cc91f1b 893
b1b5433d
MW
894 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
895 return maybe_annotate_source (p, port, opts, line, column);
7337d56d
LC
896}
897
34f3d47d
AW
898SCM_SYMBOL (sym_syntax, "syntax");
899SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
900SCM_SYMBOL (sym_unsyntax, "unsyntax");
901SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
902
903static SCM
b1b5433d 904scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
34f3d47d
AW
905{
906 SCM p;
907 long line = SCM_LINUM (port);
908 int column = SCM_COL (port) - 1;
909
910 switch (chr)
911 {
912 case '`':
913 p = sym_quasisyntax;
914 break;
915
916 case '\'':
917 p = sym_syntax;
918 break;
919
920 case ',':
921 {
922 int c;
923
924 c = scm_getc (port);
925 if ('@' == c)
926 p = sym_unsyntax_splicing;
927 else
928 {
929 scm_ungetc (c, port);
930 p = sym_unsyntax;
931 }
932 break;
933 }
934
935 default:
936 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
937 "scm_read_syntax", chr);
938 abort ();
939 }
940
b1b5433d
MW
941 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
942 return maybe_annotate_source (p, port, opts, line, column);
34f3d47d
AW
943}
944
cfd15439 945static SCM
b1b5433d 946scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
7c4aad9c 947{
b1b5433d 948 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
7c4aad9c
AW
949
950 if (!scm_is_eq (id, sym_nil))
951 scm_i_input_error ("scm_read_nil", port,
952 "unexpected input while reading #nil: ~a",
953 scm_list_1 (id));
954
955 return SCM_ELISP_NIL;
956}
957
cfd15439 958static SCM
7337d56d 959scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 960{
0f2d19dd
JB
961 int c;
962
889975e5
MG
963 /* We use the get_byte here because there is no need to get the
964 locale correct with comment input. This presumes that newline
965 always represents itself no matter what the encoding is. */
966 for (c = scm_get_byte_or_eof (port);
7337d56d 967 (c != EOF) && (c != '\n');
8a8da78d 968 c = scm_get_byte_or_eof (port));
7337d56d
LC
969
970 return SCM_UNSPECIFIED;
971}
972
7a329029
MW
973/* If the EXPECTED_CHARS are the next ones available from PORT, then
974 consume them and return 1. Otherwise leave the port position where
975 it was and return 0. EXPECTED_CHARS should be all lowercase, and
976 will be matched case-insensitively against the characters read from
977 PORT. */
978static int
979try_read_ci_chars (SCM port, const char *expected_chars)
980{
981 int num_chars_wanted = strlen (expected_chars);
982 int num_chars_read = 0;
983 char *chars_read = alloca (num_chars_wanted);
984 int c;
985
986 while (num_chars_read < num_chars_wanted)
987 {
988 c = scm_getc (port);
989 if (c == EOF)
990 break;
0fc54828 991 else if (c_tolower (c) != expected_chars[num_chars_read])
7a329029
MW
992 {
993 scm_ungetc (c, port);
994 break;
995 }
996 else
997 chars_read[num_chars_read++] = c;
998 }
999
1000 if (num_chars_read == num_chars_wanted)
1001 return 1;
1002 else
1003 {
1004 while (num_chars_read > 0)
1005 scm_ungetc (chars_read[--num_chars_read], port);
1006 return 0;
1007 }
1008}
1009
7337d56d
LC
1010\f
1011/* Sharp readers, i.e. readers called after a `#' sign has been read. */
1012
1013static SCM
1014scm_read_boolean (int chr, SCM port)
1015{
1016 switch (chr)
0f2d19dd 1017 {
7337d56d
LC
1018 case 't':
1019 case 'T':
7a329029 1020 try_read_ci_chars (port, "rue");
7337d56d
LC
1021 return SCM_BOOL_T;
1022
1023 case 'f':
1024 case 'F':
7a329029 1025 try_read_ci_chars (port, "alse");
7337d56d 1026 return SCM_BOOL_F;
0f2d19dd 1027 }
7337d56d
LC
1028
1029 return SCM_UNSPECIFIED;
1030}
1031
1032static SCM
b1b5433d 1033scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1034#define FUNC_NAME "scm_lreadr"
1035{
69f90b0b
MG
1036 char buffer[READER_CHAR_NAME_MAX_SIZE];
1037 SCM charname;
1038 size_t charname_len, bytes_read;
889975e5
MG
1039 scm_t_wchar cp;
1040 int overflow;
69f90b0b 1041 scm_t_port *pt;
889975e5 1042
b1b5433d
MW
1043 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1044 &bytes_read);
889975e5 1045 if (overflow)
c1a0ba1c 1046 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 1047
69f90b0b 1048 if (bytes_read == 0)
0f2d19dd 1049 {
7337d56d
LC
1050 chr = scm_getc (port);
1051 if (chr == EOF)
1052 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1053 "while reading character", SCM_EOL);
1054
1055 /* CHR must be a token delimiter, like a whitespace. */
1056 return (SCM_MAKE_CHAR (chr));
0f2d19dd 1057 }
7337d56d 1058
69f90b0b 1059 pt = SCM_PTAB_ENTRY (port);
7337d56d 1060
69f90b0b
MG
1061 /* Simple ASCII characters can be processed immediately. Also, simple
1062 ISO-8859-1 characters can be processed immediately if the encoding for this
1063 port is ISO-8859-1. */
1064 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1065 {
1066 SCM_COL (port) += 1;
1067 return SCM_MAKE_CHAR (buffer[0]);
1068 }
1069
1070 /* Otherwise, convert the buffer into a proper scheme string for
1071 processing. */
1072 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1073 pt->ilseq_handler);
1074 charname_len = scm_i_string_length (charname);
1075 SCM_COL (port) += charname_len;
889975e5 1076 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
1077 if (charname_len == 1)
1078 return SCM_MAKE_CHAR (cp);
1079
1080 /* Ignore dotted circles, which may be used to keep combining characters from
1081 combining with the backslash in #\charname. */
0dcd7e61
MG
1082 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1083 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1084
889975e5 1085 if (cp >= '0' && cp < '8')
7337d56d
LC
1086 {
1087 /* Dirk:FIXME:: This type of character syntax is not R5RS
1088 * compliant. Further, it should be verified that the constant
0ffc78e3 1089 * does only consist of octal digits. */
889975e5 1090 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 1091 if (SCM_I_INUMP (p))
0ffc78e3 1092 {
e25f3727 1093 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
1094 if (SCM_IS_UNICODE_CHAR (c))
1095 return SCM_MAKE_CHAR (c);
1096 else
0f3a70cf 1097 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
1098 "out-of-range octal character escape: ~a",
1099 scm_list_1 (charname));
1100 }
7337d56d
LC
1101 }
1102
0f3a70cf 1103 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1104 {
1105 SCM p;
0f3a70cf 1106
dea901d6
MG
1107 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1108 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1109 scm_from_uint (16));
1110 if (SCM_I_INUMP (p))
1111 {
e25f3727 1112 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1113 if (SCM_IS_UNICODE_CHAR (c))
1114 return SCM_MAKE_CHAR (c);
1115 else
1116 scm_i_input_error (FUNC_NAME, port,
1117 "out-of-range hex character escape: ~a",
1118 scm_list_1 (charname));
1119 }
1120 }
1121
889975e5
MG
1122 /* The names of characters should never have non-Latin1
1123 characters. */
1124 if (scm_i_is_narrow_string (charname)
1125 || scm_i_try_narrow_string (charname))
4769c9db
AW
1126 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1127 charname_len);
1128 if (scm_is_true (ch))
1129 return ch;
1130 }
7337d56d 1131
7337d56d 1132 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1133 scm_list_1 (charname));
7337d56d
LC
1134
1135 return SCM_UNSPECIFIED;
0f2d19dd 1136}
db4b4ca6 1137#undef FUNC_NAME
0f2d19dd 1138
cfd15439 1139static SCM
b1b5433d 1140scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1141{
1142 SCM symbol;
1143
1144 /* Read the symbol that comprises the keyword. Doing this instead of
1145 invoking a specific symbol reader function allows `scm_read_keyword ()'
1146 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1147
7337d56d 1148 XXX: This implementation allows sloppy syntaxes like `#: key'. */
b1b5433d 1149 symbol = scm_read_expression (port, opts);
7337d56d 1150 if (!scm_is_symbol (symbol))
7f74cf9a 1151 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1152 "keyword prefix `~a' not followed by a symbol: ~s",
1153 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1154
1155 return (scm_symbol_to_keyword (symbol));
1156}
1157
cfd15439 1158static SCM
b1b5433d
MW
1159scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1160 long line, int column)
09a4f039 1161{
7337d56d
LC
1162 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1163 guarantee that it's going to do what we want. After all, this is an
1164 implementation detail of `scm_read_vector ()', not a desirable
1165 property. */
b1b5433d
MW
1166 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1167 port, opts, line, column);
b131b233
MW
1168}
1169
493ceb99
MW
1170/* Helper used by scm_read_array */
1171static int
1172read_decimal_integer (SCM port, int c, ssize_t *resp)
1173{
1174 ssize_t sign = 1;
1175 ssize_t res = 0;
1176 int got_it = 0;
1177
1178 if (c == '-')
1179 {
1180 sign = -1;
1181 c = scm_getc (port);
1182 }
1183
1184 while ('0' <= c && c <= '9')
1185 {
a662686a
MW
1186 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1187 scm_i_input_error ("read_decimal_integer", port,
1188 "number too large", SCM_EOL);
493ceb99
MW
1189 res = 10*res + c-'0';
1190 got_it = 1;
1191 c = scm_getc (port);
1192 }
1193
1194 if (got_it)
1195 *resp = sign * res;
1196 return c;
1197}
1198
1199/* Read an array. This function can also read vectors and uniform
1200 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1201 handled here.
1202
10744b7c 1203 C is the first character read after the '#'. */
b131b233 1204static SCM
b1b5433d 1205scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
b131b233 1206{
493ceb99
MW
1207 ssize_t rank;
1208 scm_t_wchar tag_buf[8];
1209 int tag_len;
1210
1211 SCM tag, shape = SCM_BOOL_F, elements, array;
1212
1213 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1214 the array code can not deal with zero-length dimensions yet, and
10744b7c 1215 we want to allow zero-length vectors, of course. */
493ceb99 1216 if (c == '(')
b1b5433d 1217 return scm_read_vector (c, port, opts, line, column);
493ceb99 1218
10744b7c 1219 /* Disambiguate between '#f' and uniform floating point vectors. */
493ceb99
MW
1220 if (c == 'f')
1221 {
1222 c = scm_getc (port);
1223 if (c != '3' && c != '6')
1224 {
7a329029
MW
1225 if (c == 'a' && try_read_ci_chars (port, "lse"))
1226 return SCM_BOOL_F;
1227 else if (c != EOF)
1228 scm_ungetc (c, port);
493ceb99
MW
1229 return SCM_BOOL_F;
1230 }
1231 rank = 1;
1232 tag_buf[0] = 'f';
1233 tag_len = 1;
1234 goto continue_reading_tag;
1235 }
1236
1237 /* Read rank. */
1238 rank = 1;
1239 c = read_decimal_integer (port, c, &rank);
1240 if (rank < 0)
1241 scm_i_input_error (NULL, port, "array rank must be non-negative",
1242 SCM_EOL);
1243
1244 /* Read tag. */
1245 tag_len = 0;
1246 continue_reading_tag:
1247 while (c != EOF && c != '(' && c != '@' && c != ':'
1248 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1249 {
1250 tag_buf[tag_len++] = c;
1251 c = scm_getc (port);
1252 }
1253 if (tag_len == 0)
1254 tag = SCM_BOOL_T;
b131b233 1255 else
493ceb99
MW
1256 {
1257 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1258 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1259 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1260 scm_list_1 (tag));
1261 }
1262
1263 /* Read shape. */
1264 if (c == '@' || c == ':')
1265 {
1266 shape = SCM_EOL;
1267
1268 do
1269 {
1270 ssize_t lbnd = 0, len = 0;
1271 SCM s;
1272
1273 if (c == '@')
1274 {
1275 c = scm_getc (port);
1276 c = read_decimal_integer (port, c, &lbnd);
1277 }
1278
1279 s = scm_from_ssize_t (lbnd);
1280
1281 if (c == ':')
1282 {
1283 c = scm_getc (port);
1284 c = read_decimal_integer (port, c, &len);
1285 if (len < 0)
1286 scm_i_input_error (NULL, port,
1287 "array length must be non-negative",
1288 SCM_EOL);
1289
1290 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1291 }
1292
1293 shape = scm_cons (s, shape);
1294 } while (c == '@' || c == ':');
1295
1296 shape = scm_reverse_x (shape, SCM_EOL);
1297 }
1298
1299 /* Read nested lists of elements. */
1300 if (c != '(')
1301 scm_i_input_error (NULL, port,
1302 "missing '(' in vector or array literal",
1303 SCM_EOL);
b1b5433d 1304 elements = scm_read_sexp (c, port, opts);
493ceb99
MW
1305
1306 if (scm_is_false (shape))
1307 shape = scm_from_ssize_t (rank);
1308 else if (scm_ilength (shape) != rank)
1309 scm_i_input_error
1310 (NULL, port,
1311 "the number of shape specifications must match the array rank",
1312 SCM_EOL);
1313
1314 /* Handle special print syntax of rank zero arrays; see
1315 scm_i_print_array for a rationale. */
1316 if (rank == 0)
1317 {
1318 if (!scm_is_pair (elements))
1319 scm_i_input_error (NULL, port,
1320 "too few elements in array literal, need 1",
1321 SCM_EOL);
1322 if (!scm_is_null (SCM_CDR (elements)))
1323 scm_i_input_error (NULL, port,
1324 "too many elements in array literal, want 1",
1325 SCM_EOL);
1326 elements = SCM_CAR (elements);
1327 }
1328
1329 /* Construct array, annotate with source location, and return. */
1330 array = scm_list_to_typed_array (tag, shape, elements);
b1b5433d 1331 return maybe_annotate_source (array, port, opts, line, column);
7337d56d 1332}
09a4f039 1333
cfd15439 1334static SCM
b1b5433d
MW
1335scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1336 long line, int column)
7337d56d 1337{
b1b5433d 1338 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1339}
1340
0ba0b384 1341static SCM
b1b5433d
MW
1342scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1343 long line, int column)
0ba0b384
LC
1344{
1345 chr = scm_getc (port);
1346 if (chr != 'u')
1347 goto syntax;
1348
1349 chr = scm_getc (port);
1350 if (chr != '8')
1351 goto syntax;
1352
1353 chr = scm_getc (port);
1354 if (chr != '(')
1355 goto syntax;
1356
b131b233 1357 return maybe_annotate_source
b1b5433d
MW
1358 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1359 port, opts, line, column);
0ba0b384
LC
1360
1361 syntax:
1362 scm_i_input_error ("read_bytevector", port,
1363 "invalid bytevector prefix",
1364 SCM_MAKE_CHAR (chr));
1365 return SCM_UNSPECIFIED;
1366}
1367
7337d56d 1368static SCM
b1b5433d
MW
1369scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1370 long line, int column)
7337d56d
LC
1371{
1372 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1373 terribly inefficient but who cares? */
1374 SCM s_bits = SCM_EOL;
1375
1376 for (chr = scm_getc (port);
1377 (chr != EOF) && ((chr == '0') || (chr == '1'));
1378 chr = scm_getc (port))
09a4f039 1379 {
7337d56d 1380 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1381 }
7337d56d
LC
1382
1383 if (chr != EOF)
1384 scm_ungetc (chr, port);
1385
b131b233
MW
1386 return maybe_annotate_source
1387 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
b1b5433d 1388 port, opts, line, column);
7337d56d
LC
1389}
1390
cfd15439 1391static SCM
889975e5 1392scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1393{
1394 int bang_seen = 0;
1395
1396 for (;;)
09a4f039 1397 {
58b1db5f 1398 int c = scm_getc (port);
62850ef3 1399
7337d56d
LC
1400 if (c == EOF)
1401 scm_i_input_error ("skip_block_comment", port,
1402 "unterminated `#! ... !#' comment", SCM_EOL);
1403
1404 if (c == '!')
1405 bang_seen = 1;
1406 else if (c == '#' && bang_seen)
1407 break;
1408 else
1409 bang_seen = 0;
1410 }
1411
1412 return SCM_UNSPECIFIED;
1413}
1414
9331ffd8
MW
1415static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1416 int value);
bf9eb54a
MW
1417static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1418 int value);
1419static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1420 int value);
9331ffd8 1421
d7fcaec3 1422static SCM
b1b5433d 1423scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
911b03b2 1424{
02327c0c
MW
1425 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1426 int c;
1427 int i = 0;
1428
1429 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
911b03b2 1430 {
02327c0c
MW
1431 c = scm_getc (port);
1432 if (c == EOF)
1433 scm_i_input_error ("skip_block_comment", port,
1434 "unterminated `#! ... !#' comment", SCM_EOL);
1435 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1436 name[i++] = c;
1437 else if (CHAR_IS_DELIMITER (c))
1438 {
1439 scm_ungetc (c, port);
1440 name[i] = '\0';
1441 if (0 == strcmp ("r6rs", name))
1442 ; /* Silently ignore */
9331ffd8
MW
1443 else if (0 == strcmp ("fold-case", name))
1444 set_port_case_insensitive_p (port, opts, 1);
1445 else if (0 == strcmp ("no-fold-case", name))
1446 set_port_case_insensitive_p (port, opts, 0);
bf9eb54a
MW
1447 else if (0 == strcmp ("curly-infix", name))
1448 set_port_curly_infix_p (port, opts, 1);
1449 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1450 {
1451 set_port_curly_infix_p (port, opts, 1);
1452 set_port_square_brackets_p (port, opts, 0);
1453 }
02327c0c
MW
1454 else
1455 break;
1456
1457 return SCM_UNSPECIFIED;
1458 }
fa746547
MW
1459 else
1460 {
1461 scm_ungetc (c, port);
1462 break;
1463 }
911b03b2 1464 }
02327c0c
MW
1465 while (i > 0)
1466 scm_ungetc (name[--i], port);
1467 return scm_read_scsh_block_comment (chr, port);
911b03b2
JG
1468}
1469
620c8965
LC
1470static SCM
1471scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1472{
1473 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1474 nested. So care must be taken. */
1475 int nesting_level = 1;
6d5f8c32
AW
1476
1477 int a = scm_getc (port);
1478
1479 if (a == EOF)
1480 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1481 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1482
1483 while (nesting_level > 0)
1484 {
6d5f8c32 1485 int b = scm_getc (port);
620c8965 1486
6d5f8c32 1487 if (b == EOF)
cd169c5a 1488 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1489 "unterminated `#| ... |#' comment", SCM_EOL);
1490
6d5f8c32
AW
1491 if (a == '|' && b == '#')
1492 {
1493 nesting_level--;
1494 b = EOF;
1495 }
1496 else if (a == '#' && b == '|')
1497 {
1498 nesting_level++;
1499 b = EOF;
1500 }
1501
1502 a = b;
620c8965
LC
1503 }
1504
1505 return SCM_UNSPECIFIED;
1506}
1507
34f3d47d 1508static SCM
b1b5433d
MW
1509scm_read_commented_expression (scm_t_wchar chr, SCM port,
1510 scm_t_read_opts *opts)
34f3d47d 1511{
889975e5 1512 scm_t_wchar c;
34f3d47d 1513
b1b5433d 1514 c = flush_ws (port, opts, (char *) NULL);
34f3d47d
AW
1515 if (EOF == c)
1516 scm_i_input_error ("read_commented_expression", port,
1517 "no expression after #; comment", SCM_EOL);
1518 scm_ungetc (c, port);
b1b5433d 1519 scm_read_expression (port, opts);
34f3d47d
AW
1520 return SCM_UNSPECIFIED;
1521}
1522
7337d56d 1523static SCM
889975e5 1524scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1525{
1526 /* Guile's extended symbol read syntax looks like this:
1527
1528 #{This is all a symbol name}#
1529
1530 So here, CHR is expected to be `{'. */
d9527cfa 1531 int saw_brace = 0;
7337d56d 1532 size_t len = 0;
190d4b0d 1533 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1534
889975e5 1535 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1536
1537 while ((chr = scm_getc (port)) != EOF)
1538 {
1539 if (saw_brace)
09a4f039 1540 {
7337d56d
LC
1541 if (chr == '#')
1542 {
7337d56d
LC
1543 break;
1544 }
1545 else
1546 {
1547 saw_brace = 0;
889975e5 1548 scm_i_string_set_x (buf, len++, '}');
7337d56d 1549 }
09a4f039 1550 }
d9527cfa
AW
1551
1552 if (chr == '}')
7337d56d 1553 saw_brace = 1;
d9527cfa
AW
1554 else if (chr == '\\')
1555 {
1556 /* It used to be that print.c would print extended-read-syntax
1557 symbols with backslashes before "non-standard" chars, but
1558 this routine wouldn't do anything with those escapes.
1559 Bummer. What we've done is to change print.c to output
1560 R6RS hex escapes for those characters, relying on the fact
1561 that the extended read syntax would never put a `\' before
1562 an `x'. For now, we just ignore other instances of
1563 backslash in the string. */
1564 switch ((chr = scm_getc (port)))
1565 {
1566 case EOF:
1567 goto done;
1568 case 'x':
1569 {
1570 scm_t_wchar c;
1571
1572 SCM_READ_HEX_ESCAPE (10, ';');
1573 scm_i_string_set_x (buf, len++, c);
1574 break;
1575
1576 str_eof:
1577 chr = EOF;
1578 goto done;
1579
1580 bad_escaped:
1581 scm_i_string_stop_writing ();
1582 scm_i_input_error ("scm_read_extended_symbol", port,
1583 "illegal character in escape sequence: ~S",
1584 scm_list_1 (SCM_MAKE_CHAR (c)));
1585 break;
1586 }
1587 default:
1588 scm_i_string_set_x (buf, len++, chr);
1589 break;
1590 }
1591 }
7337d56d 1592 else
d9527cfa 1593 scm_i_string_set_x (buf, len++, chr);
62850ef3 1594
889975e5 1595 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1596 {
7f991c7d
LC
1597 SCM addy;
1598
889975e5 1599 scm_i_string_stop_writing ();
190d4b0d 1600 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1601 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1602 len = 0;
889975e5 1603 buf = scm_i_string_start_writing (buf);
7337d56d 1604 }
7337d56d 1605 }
d9527cfa
AW
1606
1607 done:
889975e5 1608 scm_i_string_stop_writing ();
d9527cfa
AW
1609 if (chr == EOF)
1610 scm_i_input_error ("scm_read_extended_symbol", port,
1611 "end of file while reading symbol", SCM_EOL);
7337d56d 1612
889975e5 1613 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1614}
1615
1616
1617\f
1618/* Top-level token readers, i.e., dispatchers. */
1619
1620static SCM
b1b5433d 1621scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1622{
1623 SCM proc;
1624
1625 proc = scm_get_hash_procedure (chr);
1626 if (scm_is_true (scm_procedure_p (proc)))
1627 {
1628 long line = SCM_LINUM (port);
1629 int column = SCM_COL (port) - 2;
1630 SCM got;
1631
1632 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14 1633
3655ed89
MW
1634 if (opts->record_positions_p && SCM_NIMP (got)
1635 && !scm_i_has_source_properties (got))
26c8cc14
AW
1636 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1637
1638 return got;
09a4f039 1639 }
7337d56d
LC
1640
1641 return SCM_UNSPECIFIED;
1642}
1643
1644/* The reader for the sharp `#' character. It basically dispatches reads
1645 among the above token readers. */
1646static SCM
b1b5433d
MW
1647scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1648 long line, int column)
7337d56d
LC
1649#define FUNC_NAME "scm_lreadr"
1650{
1651 SCM result;
1652
1653 chr = scm_getc (port);
1654
b1b5433d 1655 result = scm_read_sharp_extension (chr, port, opts);
7337d56d
LC
1656 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1657 return result;
1658
1659 switch (chr)
1660 {
1661 case '\\':
b1b5433d 1662 return (scm_read_character (chr, port, opts));
7337d56d 1663 case '(':
b1b5433d 1664 return (scm_read_vector (chr, port, opts, line, column));
7337d56d
LC
1665 case 's':
1666 case 'u':
1667 case 'f':
df941b5b 1668 case 'c':
7337d56d 1669 /* This one may return either a boolean or an SRFI-4 vector. */
b1b5433d 1670 return (scm_read_srfi4_vector (chr, port, opts, line, column));
0ba0b384 1671 case 'v':
b1b5433d 1672 return (scm_read_bytevector (chr, port, opts, line, column));
7337d56d 1673 case '*':
b1b5433d 1674 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
7337d56d
LC
1675 case 't':
1676 case 'T':
1677 case 'F':
7337d56d
LC
1678 return (scm_read_boolean (chr, port));
1679 case ':':
b1b5433d 1680 return (scm_read_keyword (chr, port, opts));
7337d56d
LC
1681 case '0': case '1': case '2': case '3': case '4':
1682 case '5': case '6': case '7': case '8': case '9':
1683 case '@':
1684#if SCM_ENABLE_DEPRECATED
1685 /* See below for 'i' and 'e'. */
1686 case 'a':
7337d56d
LC
1687 case 'y':
1688 case 'h':
1689 case 'l':
1690#endif
b1b5433d 1691 return (scm_read_array (chr, port, opts, line, column));
7337d56d
LC
1692
1693 case 'i':
1694 case 'e':
1695#if SCM_ENABLE_DEPRECATED
1696 {
1697 /* When next char is '(', it really is an old-style
1698 uniform array. */
889975e5 1699 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1700 if (next_c != EOF)
1701 scm_ungetc (next_c, port);
1702 if (next_c == '(')
b1b5433d 1703 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1704 /* Fall through. */
1705 }
1706#endif
1707 case 'b':
1708 case 'B':
1709 case 'o':
1710 case 'O':
1711 case 'd':
1712 case 'D':
1713 case 'x':
1714 case 'X':
1715 case 'I':
1716 case 'E':
b1b5433d 1717 return (scm_read_number_and_radix (chr, port, opts));
7337d56d
LC
1718 case '{':
1719 return (scm_read_extended_symbol (chr, port));
1720 case '!':
b1b5433d 1721 return (scm_read_shebang (chr, port, opts));
34f3d47d 1722 case ';':
b1b5433d 1723 return (scm_read_commented_expression (chr, port, opts));
34f3d47d
AW
1724 case '`':
1725 case '\'':
1726 case ',':
b1b5433d 1727 return (scm_read_syntax (chr, port, opts));
7c4aad9c 1728 case 'n':
b1b5433d 1729 return (scm_read_nil (chr, port, opts));
7337d56d 1730 default:
b1b5433d 1731 result = scm_read_sharp_extension (chr, port, opts);
7337d56d 1732 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1733 {
1734 /* To remain compatible with 1.8 and earlier, the following
1735 characters have lower precedence than `read-hash-extend'
1736 characters. */
1737 switch (chr)
1738 {
1739 case '|':
1740 return scm_read_r6rs_block_comment (chr, port);
1741 default:
1742 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1743 scm_list_1 (SCM_MAKE_CHAR (chr)));
1744 }
1745 }
7337d56d
LC
1746 else
1747 return result;
1748 }
1749
1750 return SCM_UNSPECIFIED;
1751}
1752#undef FUNC_NAME
1753
1754static SCM
bf9eb54a
MW
1755read_inner_expression (SCM port, scm_t_read_opts *opts)
1756#define FUNC_NAME "read_inner_expression"
7337d56d
LC
1757{
1758 while (1)
1759 {
cfd15439 1760 scm_t_wchar chr;
7337d56d
LC
1761
1762 chr = scm_getc (port);
1763
1764 switch (chr)
1765 {
1766 case SCM_WHITE_SPACES:
1767 case SCM_LINE_INCREMENTORS:
1768 break;
1769 case ';':
1770 (void) scm_read_semicolon_comment (chr, port);
1771 break;
bf9eb54a
MW
1772 case '{':
1773 if (opts->curly_infix_p)
1774 {
1775 if (opts->neoteric_p)
1776 return scm_read_sexp (chr, port, opts);
1777 else
1778 {
1779 SCM expr;
1780
1781 /* Enable neoteric expressions within curly braces */
1782 opts->neoteric_p = 1;
1783 expr = scm_read_sexp (chr, port, opts);
1784 opts->neoteric_p = 0;
1785 return expr;
1786 }
1787 }
1788 else
1789 return scm_read_mixed_case_symbol (chr, port, opts);
5afa815c 1790 case '[':
bf9eb54a
MW
1791 if (opts->square_brackets_p)
1792 return scm_read_sexp (chr, port, opts);
1793 else if (opts->curly_infix_p)
1794 {
1795 /* The syntax of neoteric expressions requires that '[' be
1796 a delimiter when curly-infix is enabled, so it cannot
1797 be part of an unescaped symbol. We might as well do
1798 something useful with it, so we adopt Kawa's convention:
1799 [...] => ($bracket-list$ ...) */
1800 long line = SCM_LINUM (port);
1801 int column = SCM_COL (port) - 1;
1802 return maybe_annotate_source
1803 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1804 port, opts, line, column);
1805 }
1806 else
1807 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d 1808 case '(':
b1b5433d 1809 return (scm_read_sexp (chr, port, opts));
7337d56d 1810 case '"':
b1b5433d 1811 return (scm_read_string (chr, port, opts));
dc59631d
MW
1812 case '|':
1813 if (opts->r7rs_symbols_p)
1814 return scm_read_r7rs_symbol (chr, port, opts);
1815 else
1816 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d
LC
1817 case '\'':
1818 case '`':
1819 case ',':
b1b5433d 1820 return (scm_read_quote (chr, port, opts));
7337d56d
LC
1821 case '#':
1822 {
b131b233
MW
1823 long line = SCM_LINUM (port);
1824 int column = SCM_COL (port) - 1;
b1b5433d 1825 SCM result = scm_read_sharp (chr, port, opts, line, column);
7337d56d
LC
1826 if (scm_is_eq (result, SCM_UNSPECIFIED))
1827 /* We read a comment or some such. */
1828 break;
1829 else
1830 return result;
1831 }
1832 case ')':
1833 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1834 break;
bf9eb54a
MW
1835 case '}':
1836 if (opts->curly_infix_p)
1837 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1838 else
1839 return scm_read_mixed_case_symbol (chr, port, opts);
a4e47229 1840 case ']':
b1b5433d 1841 if (opts->square_brackets_p)
a4e47229
MG
1842 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1843 /* otherwise fall through */
7337d56d
LC
1844 case EOF:
1845 return SCM_EOF_VAL;
1846 case ':':
b1b5433d
MW
1847 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1848 return scm_symbol_to_keyword (scm_read_expression (port, opts));
7337d56d
LC
1849 /* Fall through. */
1850
1851 default:
1852 {
1853 if (((chr >= '0') && (chr <= '9'))
1854 || (strchr ("+-.", chr)))
b1b5433d 1855 return (scm_read_number (chr, port, opts));
7337d56d 1856 else
b1b5433d 1857 return (scm_read_mixed_case_symbol (chr, port, opts));
7337d56d
LC
1858 }
1859 }
1860 }
1861}
1862#undef FUNC_NAME
1863
bf9eb54a
MW
1864static SCM
1865scm_read_expression (SCM port, scm_t_read_opts *opts)
1866#define FUNC_NAME "scm_read_expression"
1867{
1868 if (!opts->neoteric_p)
1869 return read_inner_expression (port, opts);
1870 else
1871 {
1872 long line = 0;
1873 int column = 0;
1874 SCM expr;
1875
1876 if (opts->record_positions_p)
1877 {
1878 /* We need to get the position of the first non-whitespace
1879 character in order to correctly annotate neoteric
1880 expressions. For example, for the expression 'f(x)', the
1881 first call to 'read_inner_expression' reads the 'f' (which
1882 cannot be annotated), and then we later read the '(x)' and
1883 use it to construct the new list (f x). */
1884 int c = flush_ws (port, opts, (char *) NULL);
1885 if (c == EOF)
1886 return SCM_EOF_VAL;
1887 scm_ungetc (c, port);
1888 line = SCM_LINUM (port);
1889 column = SCM_COL (port);
1890 }
1891
1892 expr = read_inner_expression (port, opts);
1893
1894 /* 'expr' is the first component of the neoteric expression. Now
1895 we loop, and as long as the next character is '(', '[', or '{',
1896 (without any intervening whitespace), we use it to construct a
1897 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1898 for (;;)
1899 {
1900 int chr = scm_getc (port);
1901
1902 if (chr == '(')
1903 /* e(...) => (e ...) */
1904 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1905 else if (chr == '[')
1906 /* e[...] => ($bracket-apply$ e ...) */
1907 expr = scm_cons (sym_bracket_apply,
1908 scm_cons (expr,
1909 scm_read_sexp (chr, port, opts)));
1910 else if (chr == '{')
1911 {
1912 SCM arg = scm_read_sexp (chr, port, opts);
1913
1914 if (scm_is_null (arg))
1915 expr = scm_list_1 (expr); /* e{} => (e) */
1916 else
1917 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1918 }
1919 else
1920 {
1921 if (chr != EOF)
1922 scm_ungetc (chr, port);
1923 break;
1924 }
1925 maybe_annotate_source (expr, port, opts, line, column);
1926 }
1927 return expr;
1928 }
1929}
1930#undef FUNC_NAME
1931
7337d56d
LC
1932\f
1933/* Actual reader. */
1934
ea8c9761
MW
1935static void init_read_options (SCM port, scm_t_read_opts *opts);
1936
7337d56d
LC
1937SCM_DEFINE (scm_read, "read", 0, 1, 0,
1938 (SCM port),
1939 "Read an s-expression from the input port @var{port}, or from\n"
1940 "the current input port if @var{port} is not specified.\n"
1941 "Any whitespace before the next token is discarded.")
1942#define FUNC_NAME s_scm_read
1943{
b1b5433d 1944 scm_t_read_opts opts;
7337d56d
LC
1945 int c;
1946
1947 if (SCM_UNBNDP (port))
1948 port = scm_current_input_port ();
1949 SCM_VALIDATE_OPINPORT (1, port);
1950
ea8c9761 1951 init_read_options (port, &opts);
b1b5433d
MW
1952
1953 c = flush_ws (port, &opts, (char *) NULL);
7337d56d
LC
1954 if (EOF == c)
1955 return SCM_EOF_VAL;
1956 scm_ungetc (c, port);
1957
b1b5433d 1958 return (scm_read_expression (port, &opts));
09a4f039 1959}
db4b4ca6 1960#undef FUNC_NAME
09a4f039 1961
0f2d19dd
JB
1962
1963\f
1964
14de3b42
GH
1965/* Manipulate the read-hash-procedures alist. This could be written in
1966 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1967SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1968 (SCM chr, SCM proc),
dc7fa443
MG
1969 "Install the procedure @var{proc} for reading expressions\n"
1970 "starting with the character sequence @code{#} and @var{chr}.\n"
1971 "@var{proc} will be called with two arguments: the character\n"
1972 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1973 "returned will be the return value of @code{read}. \n"
1974 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1975 )
1bbd0b84 1976#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1977{
fed9c9a2
GH
1978 SCM this;
1979 SCM prev;
1980
36284627 1981 SCM_VALIDATE_CHAR (1, chr);
7888309b 1982 SCM_ASSERT (scm_is_false (proc)
bc36d050 1983 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1984 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1985
14de3b42 1986 /* Check if chr is already in the alist. */
d458073b 1987 this = scm_i_read_hash_procedures_ref ();
14de3b42 1988 prev = SCM_BOOL_F;
fed9c9a2
GH
1989 while (1)
1990 {
d2e53ed6 1991 if (scm_is_null (this))
fed9c9a2
GH
1992 {
1993 /* not found, so add it to the beginning. */
7888309b 1994 if (scm_is_true (proc))
fed9c9a2 1995 {
d458073b
AR
1996 SCM new = scm_cons (scm_cons (chr, proc),
1997 scm_i_read_hash_procedures_ref ());
1998 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1999 }
2000 break;
2001 }
bc36d050 2002 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
2003 {
2004 /* already in the alist. */
7888309b 2005 if (scm_is_false (proc))
14de3b42
GH
2006 {
2007 /* remove it. */
7888309b 2008 if (scm_is_false (prev))
14de3b42 2009 {
d458073b
AR
2010 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
2011 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
2012 }
2013 else
2014 scm_set_cdr_x (prev, SCM_CDR (this));
2015 }
fed9c9a2 2016 else
14de3b42
GH
2017 {
2018 /* replace it. */
2019 scm_set_cdr_x (SCM_CAR (this), proc);
2020 }
fed9c9a2
GH
2021 break;
2022 }
2023 prev = this;
2024 this = SCM_CDR (this);
2025 }
deca31e1 2026
deca31e1
GH
2027 return SCM_UNSPECIFIED;
2028}
1bbd0b84 2029#undef FUNC_NAME
0f2d19dd 2030
deca31e1
GH
2031/* Recover the read-hash procedure corresponding to char c. */
2032static SCM
6e8d25a6 2033scm_get_hash_procedure (int c)
deca31e1 2034{
d458073b 2035 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 2036
deca31e1
GH
2037 while (1)
2038 {
d2e53ed6 2039 if (scm_is_null (rest))
deca31e1
GH
2040 return SCM_BOOL_F;
2041
7866a09b 2042 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
2043 return SCM_CDAR (rest);
2044
2045 rest = SCM_CDR (rest);
2046 }
2047}
1cc91f1b 2048
889975e5
MG
2049#define SCM_ENCODING_SEARCH_SIZE (500)
2050
f8a1c9a8
LC
2051/* Search the first few hundred characters of a file for an Emacs-like coding
2052 declaration. Returns either NULL or a string whose storage has been
2053 allocated with `scm_gc_malloc ()'. */
889975e5 2054char *
f8a1c9a8 2055scm_i_scan_for_encoding (SCM port)
889975e5 2056{
d900843c 2057 scm_t_port *pt;
889975e5 2058 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 2059 size_t bytes_read, encoding_length, i;
889975e5 2060 char *encoding = NULL;
daedbca7 2061 char *pos, *encoding_start;
889975e5
MG
2062 int in_comment;
2063
d900843c 2064 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 2065
d900843c
AW
2066 if (pt->rw_active == SCM_PORT_WRITE)
2067 scm_flush (port);
49bb5bd3 2068
d900843c
AW
2069 if (pt->rw_random)
2070 pt->rw_active = SCM_PORT_READ;
2071
2072 if (pt->read_pos == pt->read_end)
2073 {
2074 /* We can use the read buffer, and thus avoid a seek. */
2075 if (scm_fill_input (port) == EOF)
2076 return NULL;
2077
2078 bytes_read = pt->read_end - pt->read_pos;
2079 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2080 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2081
2082 if (bytes_read <= 1)
2083 /* An unbuffered port -- don't scan. */
2084 return NULL;
2085
2086 memcpy (header, pt->read_pos, bytes_read);
2087 header[bytes_read] = '\0';
2088 }
2089 else
2090 {
2091 /* Try to read some bytes and then seek back. Not all ports
2092 support seeking back; and indeed some file ports (like
2093 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2094 check performed by SCM_FPORT_FDES---but fail to seek
2095 backwards. Hence this block comes second. We prefer to use
2096 the read buffer in-place. */
2097 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2098 return NULL;
2099
2100 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2101 header[bytes_read] = '\0';
419c8736 2102 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
d900843c 2103 }
889975e5 2104
889975e5
MG
2105 /* search past "coding[:=]" */
2106 pos = header;
2107 while (1)
2108 {
2109 if ((pos = strstr(pos, "coding")) == NULL)
2110 return NULL;
2111
2112 pos += strlen("coding");
2113 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2114 (*pos == ':' || *pos == '='))
2115 {
2116 pos ++;
2117 break;
2118 }
2119 }
2120
2121 /* skip spaces */
2122 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2123 (*pos == ' ' || *pos == '\t'))
2124 pos ++;
2125
2126 /* grab the next token */
daedbca7 2127 encoding_start = pos;
889975e5 2128 i = 0;
daedbca7
MG
2129 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2130 && encoding_start + i - header < bytes_read
2131 && (isalnum ((int) encoding_start[i])
2132 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
2133 i++;
2134
daedbca7
MG
2135 encoding_length = i;
2136 if (encoding_length == 0)
889975e5
MG
2137 return NULL;
2138
daedbca7
MG
2139 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2140 for (i = 0; i < encoding_length; i++)
026ed239 2141 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
2142
2143 /* push backwards to make sure we were in a comment */
2144 in_comment = 0;
daedbca7
MG
2145 pos = encoding_start;
2146 while (pos >= header)
889975e5 2147 {
8a12aeb9
AW
2148 if (*pos == ';')
2149 {
2150 in_comment = 1;
2151 break;
2152 }
2153 else if (*pos == '\n' || pos == header)
889975e5
MG
2154 {
2155 /* This wasn't in a semicolon comment. Check for a
2156 hash-bang comment. */
2157 char *beg = strstr (header, "#!");
2158 char *end = strstr (header, "!#");
8a12aeb9 2159 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
2160 in_comment = 1;
2161 break;
2162 }
8a12aeb9
AW
2163 else
2164 {
2165 pos --;
2166 continue;
2167 }
889975e5
MG
2168 }
2169 if (!in_comment)
f8a1c9a8
LC
2170 /* This wasn't in a comment */
2171 return NULL;
2172
889975e5
MG
2173 return encoding;
2174}
2175
2176SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2177 (SCM port),
a270e133 2178 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 2179 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
2180 "The coding declaration is of the form\n"
2181 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2182 "\n"
2183 "Returns a string containing the character encoding of the file\n"
2184 "if a declaration was found, or @code{#f} otherwise.\n")
2185#define FUNC_NAME s_scm_file_encoding
2186{
2187 char *enc;
2188 SCM s_enc;
f8a1c9a8 2189
d900843c
AW
2190 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2191
f8a1c9a8 2192 enc = scm_i_scan_for_encoding (port);
889975e5
MG
2193 if (enc == NULL)
2194 return SCM_BOOL_F;
2195 else
2196 {
2197 s_enc = scm_from_locale_string (enc);
889975e5
MG
2198 return s_enc;
2199 }
f8a1c9a8 2200
889975e5
MG
2201 return SCM_BOOL_F;
2202}
2203#undef FUNC_NAME
2204
ea8c9761
MW
2205\f
2206/* Per-port read options.
2207
a38024ba
MW
2208 We store per-port read options in the 'port-read-options' port
2209 property, which is stored in the internal port structure. The value
2210 stored is a single integer that contains a two-bit field for each
2211 read option.
ea8c9761
MW
2212
2213 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2214 the applicable value should be inherited from the corresponding
2215 global read option. Otherwise, the bit field contains the value of
2216 the read option. For boolean read options that have been set
2217 per-port, the possible values are 0 or 1. If the 'keyword_style'
2218 read option has been set per-port, its possible values are those in
2219 'enum t_keyword_style'. */
2220
a38024ba 2221/* Key to read options in port properties. */
ea8c9761
MW
2222SCM_SYMBOL (sym_port_read_options, "port-read-options");
2223
2224/* Offsets of bit fields for each per-port override */
2225#define READ_OPTION_COPY_SOURCE_P 0
2226#define READ_OPTION_RECORD_POSITIONS_P 2
2227#define READ_OPTION_CASE_INSENSITIVE_P 4
2228#define READ_OPTION_KEYWORD_STYLE 6
2229#define READ_OPTION_R6RS_ESCAPES_P 8
2230#define READ_OPTION_SQUARE_BRACKETS_P 10
2231#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
bf9eb54a 2232#define READ_OPTION_CURLY_INFIX_P 14
dc59631d 2233#define READ_OPTION_R7RS_SYMBOLS_P 16
ea8c9761 2234
bf9eb54a 2235/* The total width in bits of the per-port overrides */
dc59631d 2236#define READ_OPTIONS_NUM_BITS 18
ea8c9761
MW
2237
2238#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2239#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2240
2241#define READ_OPTION_MASK 3
2242#define READ_OPTION_INHERIT 3
2243
2244static void
2245set_port_read_option (SCM port, int option, int new_value)
2246{
a38024ba 2247 SCM scm_read_options;
ea8c9761
MW
2248 unsigned int read_options;
2249
2250 new_value &= READ_OPTION_MASK;
a38024ba 2251 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2252 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2253 read_options = scm_to_uint (scm_read_options);
2254 else
2255 read_options = READ_OPTIONS_INHERIT_ALL;
2256 read_options &= ~(READ_OPTION_MASK << option);
2257 read_options |= new_value << option;
2258 scm_read_options = scm_from_uint (read_options);
a38024ba 2259 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
ea8c9761
MW
2260}
2261
9331ffd8
MW
2262/* Set OPTS and PORT's case-insensitivity according to VALUE. */
2263static void
2264set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2265{
2266 value = !!value;
2267 opts->case_insensitive_p = value;
2268 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2269}
2270
bf9eb54a
MW
2271/* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2272static void
2273set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2274{
2275 value = !!value;
2276 opts->square_brackets_p = value;
2277 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2278}
2279
2280/* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2281static void
2282set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2283{
2284 value = !!value;
2285 opts->curly_infix_p = value;
2286 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2287}
2288
ea8c9761
MW
2289/* Initialize OPTS based on PORT's read options and the global read
2290 options. */
2291static void
2292init_read_options (SCM port, scm_t_read_opts *opts)
2293{
a38024ba 2294 SCM val, scm_read_options;
ea8c9761
MW
2295 unsigned int read_options, x;
2296
a38024ba 2297 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2298
2299 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2300 read_options = scm_to_uint (scm_read_options);
2301 else
2302 read_options = READ_OPTIONS_INHERIT_ALL;
2303
2304 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2305 if (x == READ_OPTION_INHERIT)
2306 {
2307 val = SCM_PACK (SCM_KEYWORD_STYLE);
2308 if (scm_is_eq (val, scm_keyword_prefix))
2309 x = KEYWORD_STYLE_PREFIX;
2310 else if (scm_is_eq (val, scm_keyword_postfix))
2311 x = KEYWORD_STYLE_POSTFIX;
2312 else
2313 x = KEYWORD_STYLE_HASH_PREFIX;
2314 }
2315 opts->keyword_style = x;
2316
2317#define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2318 do \
2319 { \
2320 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2321 if (x == READ_OPTION_INHERIT) \
2322 x = !!SCM_ ## NAME; \
2323 opts->name = x; \
2324 } \
2325 while (0)
2326
2327 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2328 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2329 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2330 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2331 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2332 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
bf9eb54a 2333 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
dc59631d 2334 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
ea8c9761
MW
2335
2336#undef RESOLVE_BOOLEAN_OPTION
bf9eb54a
MW
2337
2338 opts->neoteric_p = 0;
ea8c9761
MW
2339}
2340
0f2d19dd
JB
2341void
2342scm_init_read ()
0f2d19dd 2343{
d458073b
AR
2344 SCM read_hash_procs;
2345
c81c2ad3 2346 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
2347
2348 scm_i_read_hash_procedures =
2349 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 2350
62560650 2351 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 2352#include "libguile/read.x"
0f2d19dd 2353}
89e00824
ML
2354
2355/*
2356 Local Variables:
2357 c-file-style: "gnu"
2358 End:
2359*/