Merge commit 'ca5e0414e96886177d883a249edd957d2331db65'
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37 1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
7a329029 2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d 28#include <string.h>
889975e5
MG
29#include <unistd.h>
30#include <unicase.h>
684d664e 31#include <unictype.h>
96965a6e 32#include <c-strcase.h>
0fc54828 33#include <c-ctype.h>
7337d56d 34
a0599745 35#include "libguile/_scm.h"
0ba0b384 36#include "libguile/bytevectors.h"
a0599745
MD
37#include "libguile/chars.h"
38#include "libguile/eval.h"
2fa901a5 39#include "libguile/arrays.h"
cf396142 40#include "libguile/bitvectors.h"
a0599745
MD
41#include "libguile/keywords.h"
42#include "libguile/alist.h"
43#include "libguile/srcprop.h"
44#include "libguile/hashtab.h"
45#include "libguile/hash.h"
46#include "libguile/ports.h"
f6f4feb0 47#include "libguile/ports-internal.h"
49bb5bd3 48#include "libguile/fports.h"
a0599745
MD
49#include "libguile/root.h"
50#include "libguile/strings.h"
ba1b2226 51#include "libguile/strports.h"
a0599745 52#include "libguile/vectors.h"
a0599745 53#include "libguile/validate.h"
a4022e69 54#include "libguile/srfi-4.h"
7337d56d 55#include "libguile/srfi-13.h"
ba1b2226 56
a0599745 57#include "libguile/read.h"
22fc179a
HWN
58#include "libguile/private-options.h"
59
0f2d19dd
JB
60
61\f
62
5bf6a6f0 63SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 64SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 65SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 66SCM_SYMBOL (sym_nil, "nil");
c7733771 67
bf9eb54a
MW
68/* SRFI-105 curly infix expression support */
69SCM_SYMBOL (sym_nfx, "$nfx$");
70SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
71SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
72
73scm_t_option scm_read_opts[] =
74 {
75 { SCM_OPTION_BOOLEAN, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
dc59631d
MW
91 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
bf9eb54a
MW
93 { 0, },
94 };
b1b5433d
MW
95
96/* Internal read options structure. This is initialized by 'scm_read'
ea8c9761
MW
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
99
100enum t_keyword_style
101 {
102 KEYWORD_STYLE_HASH_PREFIX,
103 KEYWORD_STYLE_PREFIX,
104 KEYWORD_STYLE_POSTFIX
105 };
b1b5433d 106
ea8c9761
MW
107struct t_read_opts
108{
b1b5433d
MW
109 enum t_keyword_style keyword_style;
110 unsigned int copy_source_p : 1;
111 unsigned int record_positions_p : 1;
112 unsigned int case_insensitive_p : 1;
113 unsigned int r6rs_escapes_p : 1;
114 unsigned int square_brackets_p : 1;
115 unsigned int hungry_eol_escapes_p : 1;
bf9eb54a
MW
116 unsigned int curly_infix_p : 1;
117 unsigned int neoteric_p : 1;
dc59631d 118 unsigned int r7rs_symbols_p : 1;
a16f6fe7
MD
119};
120
b1b5433d
MW
121typedef struct t_read_opts scm_t_read_opts;
122
a16f6fe7 123
39e8f371
HWN
124/*
125 Give meaningful error messages for errors
126
127 We use the format
128
ba1b2226 129 FILE:LINE:COL: MESSAGE
39e8f371
HWN
130 This happened in ....
131
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
134
39e8f371
HWN
135 */
136
137
a4022e69
MV
138void
139scm_i_input_error (char const *function,
140 SCM port, const char *message, SCM arg)
ba1b2226 141{
29a837fd
MV
142 SCM fn = (scm_is_string (SCM_FILENAME(port))
143 ? SCM_FILENAME(port)
144 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 145
29a837fd 146 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
147 SCM string = SCM_EOL;
148 scm_simple_format (string_port,
272632a6 149 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 150 scm_list_4 (fn,
b3aa4626 151 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 152 scm_from_int (SCM_COL (port) + 1),
272632a6 153 scm_from_locale_string (message)));
ba1b2226
HWN
154
155 string = scm_get_output_string (string_port);
156 scm_close_output_port (string_port);
4a655e50 157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 158 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 159 string,
dd72382c 160 arg,
ba1b2226
HWN
161 SCM_BOOL_F);
162}
39e8f371
HWN
163
164
a1ec6916 165SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 166 (SCM setting),
dc7fa443
MG
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 170#define FUNC_NAME s_scm_read_options
a16f6fe7 171{
b7ff98dd
MD
172 SCM ans = scm_options (setting,
173 scm_read_opts,
1bbd0b84 174 FUNC_NAME);
b7ff98dd
MD
175 if (SCM_COPY_SOURCE_P)
176 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
177 return ans;
178}
1bbd0b84 179#undef FUNC_NAME
a16f6fe7 180
d458073b
AR
181/* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183static SCM *scm_i_read_hash_procedures;
deca31e1 184
cfd15439 185static SCM
d458073b
AR
186scm_i_read_hash_procedures_ref (void)
187{
188 return scm_fluid_ref (*scm_i_read_hash_procedures);
189}
190
cfd15439 191static void
d458073b
AR
192scm_i_read_hash_procedures_set_x (SCM value)
193{
194 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
195}
0f2d19dd 196
7337d56d
LC
197\f
198/* Token readers. */
0f2d19dd 199
0f2d19dd 200
7337d56d
LC
201/* Size of the C buffer used to read symbols and numbers. */
202#define READER_BUFFER_SIZE 128
0f2d19dd 203
ff4d3672
LC
204/* Number of 32-bit codepoints in the buffer used to read strings. */
205#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 206
7337d56d
LC
207/* The maximum size of Scheme character names. */
208#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 209
02327c0c
MW
210/* The maximum size of reader directive names. */
211#define READER_DIRECTIVE_NAME_MAX_SIZE 50
212
94115ae3 213
7337d56d
LC
214/* `isblank' is only in C99. */
215#define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 217 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
218
219#ifdef MSDOS
220# define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
222#else
223# define CHAR_IS_BLANK CHAR_IS_BLANK_
224#endif
225
226
227/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
228 structure''). */
229#define CHAR_IS_R5RS_DELIMITER(c) \
230 (CHAR_IS_BLANK (c) \
603234c6 231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
7337d56d 232
603234c6
MW
233#define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
bf9eb54a
MW
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
7337d56d
LC
238
239/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
240 Structure''. */
241#define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
244
454866e0 245/* Read an SCSH block comment. */
cfd15439 246static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965 247static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
b1b5433d
MW
248static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
249static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
620c8965 250static SCM scm_get_hash_procedure (int);
454866e0 251
69f90b0b
MG
252/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
cfd15439 256static int
b1b5433d
MW
257read_token (SCM port, scm_t_read_opts *opts,
258 char *buf, size_t buf_size, size_t *read)
b662b7e9 259{
69f90b0b 260 *read = 0;
0520c320 261
69f90b0b
MG
262 while (*read < buf_size)
263 {
264 int chr;
7337d56d 265
0d959103 266 chr = scm_get_byte_or_eof_unlocked (port);
889975e5 267
69f90b0b
MG
268 if (chr == EOF)
269 return 0;
270 else if (CHAR_IS_DELIMITER (chr))
271 {
c932ce0b 272 scm_unget_byte_unlocked (chr, port);
69f90b0b
MG
273 return 0;
274 }
275 else
276 {
277 *buf = (char) chr;
278 buf++, (*read)++;
279 }
280 }
889975e5 281
69f90b0b
MG
282 return 1;
283 }
889975e5 284
b662b7e9
LC
285/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
287static char *
b1b5433d
MW
288read_complete_token (SCM port, scm_t_read_opts *opts,
289 char *buffer, size_t buffer_size, size_t *read)
69f90b0b
MG
290{
291 int overflow = 0;
b662b7e9
LC
292 size_t bytes_read, overflow_size = 0;
293 char *overflow_buffer = NULL;
1cc91f1b 294
69f90b0b
MG
295 do
296 {
b1b5433d 297 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
69f90b0b
MG
298 if (bytes_read == 0)
299 break;
300 if (overflow || overflow_size != 0)
301 {
302 if (overflow_size == 0)
303 {
b662b7e9
LC
304 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
305 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
306 overflow_size = bytes_read;
307 }
308 else
309 {
7be3c2fc 310 char *new_buf =
b662b7e9
LC
311 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
312
313 memcpy (new_buf, overflow_buffer, overflow_size);
314 memcpy (new_buf + overflow_size, buffer, bytes_read);
315
316 overflow_buffer = new_buf;
69f90b0b
MG
317 overflow_size += bytes_read;
318 }
319 }
889975e5 320 }
69f90b0b 321 while (overflow);
889975e5 322
69f90b0b
MG
323 if (overflow_size)
324 *read = overflow_size;
b521d265 325 else
69f90b0b
MG
326 *read = bytes_read;
327
b662b7e9 328 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 329}
7337d56d
LC
330
331/* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
333static int
b1b5433d 334flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
0f2d19dd 335{
cfd15439 336 scm_t_wchar c;
0f2d19dd 337 while (1)
be632904 338 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
339 {
340 case EOF:
341 goteof:
342 if (eoferr)
d156d3b7 343 {
a4022e69
MV
344 scm_i_input_error (eoferr,
345 port,
346 "end of file",
347 SCM_EOL);
d156d3b7 348 }
0f2d19dd 349 return c;
7337d56d 350
0f2d19dd
JB
351 case ';':
352 lp:
be632904 353 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
354 {
355 case EOF:
356 goto goteof;
357 default:
358 goto lp;
359 case SCM_LINE_INCREMENTORS:
360 break;
361 }
362 break;
7337d56d 363
454866e0 364 case '#':
be632904 365 switch (c = scm_getc_unlocked (port))
454866e0
LC
366 {
367 case EOF:
368 eoferr = "read_sharp";
369 goto goteof;
370 case '!':
b1b5433d 371 scm_read_shebang (c, port, opts);
454866e0 372 break;
34f3d47d 373 case ';':
b1b5433d 374 scm_read_commented_expression (c, port, opts);
34f3d47d 375 break;
620c8965
LC
376 case '|':
377 if (scm_is_false (scm_get_hash_procedure (c)))
378 {
379 scm_read_r6rs_block_comment (c, port);
380 break;
381 }
382 /* fall through */
454866e0 383 default:
c932ce0b 384 scm_ungetc_unlocked (c, port);
454866e0
LC
385 return '#';
386 }
387 break;
388
0f2d19dd 389 case SCM_LINE_INCREMENTORS:
0f2d19dd 390 case SCM_SINGLE_SPACES:
0f2d19dd 391 case '\t':
0f2d19dd 392 break;
7337d56d 393
0f2d19dd
JB
394 default:
395 return c;
396 }
7337d56d
LC
397
398 return 0;
0f2d19dd
JB
399}
400
401
7337d56d
LC
402\f
403/* Token readers. */
1cc91f1b 404
b1b5433d
MW
405static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
406static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
407 long line, int column);
0f2d19dd 408
0f2d19dd 409
d6cb0203 410static SCM
b1b5433d
MW
411maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
412 long line, int column)
d6cb0203 413{
b1b5433d 414 if (opts->record_positions_p)
d6cb0203
MW
415 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
416 return x;
417}
0f2d19dd 418
09a4f039 419static SCM
b1b5433d 420scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 421#define FUNC_NAME "scm_i_lreadparen"
09a4f039 422{
26c8cc14
AW
423 int c;
424 SCM tmp, tl, ans = SCM_EOL;
bf9eb54a
MW
425 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
426 const int terminating_char = ((chr == '{') ? '}'
427 : ((chr == '[') ? ']'
428 : ')'));
7337d56d
LC
429
430 /* Need to capture line and column numbers here. */
431 long line = SCM_LINUM (port);
432 int column = SCM_COL (port) - 1;
f9c68a47 433
b1b5433d 434 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
435 if (terminating_char == c)
436 return SCM_EOL;
f9c68a47 437
c932ce0b 438 scm_ungetc_unlocked (c, port);
b1b5433d 439 tmp = scm_read_expression (port, opts);
1f7945a7
AW
440
441 /* Note that it is possible for scm_read_expression to return
442 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
443 check that it's a real dot by checking `c'. */
444 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d 445 {
b1b5433d
MW
446 ans = scm_read_expression (port, opts);
447 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
7337d56d
LC
448 scm_i_input_error (FUNC_NAME, port, "missing close paren",
449 SCM_EOL);
450 return ans;
451 }
1cc91f1b 452
7337d56d
LC
453 /* Build the head of the list structure. */
454 ans = tl = scm_cons (tmp, SCM_EOL);
455
b1b5433d 456 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
0f2d19dd 457 {
7337d56d 458 SCM new_tail;
0f2d19dd 459
bf9eb54a
MW
460 if (c == ')' || (c == ']' && opts->square_brackets_p)
461 || ((c == '}' || c == ']') && opts->curly_infix_p))
5b69315e
AW
462 scm_i_input_error (FUNC_NAME, port,
463 "in pair: mismatched close paren: ~A",
464 scm_list_1 (SCM_MAKE_CHAR (c)));
465
c932ce0b 466 scm_ungetc_unlocked (c, port);
b1b5433d 467 tmp = scm_read_expression (port, opts);
5b69315e 468
1f7945a7
AW
469 /* See above note about scm_sym_dot. */
470 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 471 {
b1b5433d 472 SCM_SETCDR (tl, scm_read_expression (port, opts));
7337d56d 473
b1b5433d 474 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
475 if (terminating_char != c)
476 scm_i_input_error (FUNC_NAME, port,
477 "in pair: missing close paren", SCM_EOL);
bf9eb54a 478 break;
0f2d19dd 479 }
b858464a 480
7337d56d
LC
481 new_tail = scm_cons (tmp, SCM_EOL);
482 SCM_SETCDR (tl, new_tail);
483 tl = new_tail;
7337d56d 484 }
0f2d19dd 485
bf9eb54a
MW
486 if (curly_list_p)
487 {
488 /* In addition to finding the length, 'scm_ilength' checks for
489 improper or circular lists, in which case it returns -1. */
490 int len = scm_ilength (ans);
491
492 /* The (len == 0) case is handled above */
493 if (len == 1)
494 /* Return directly to avoid re-annotating the element's source
495 location with the position of the outer brace. Also, it
496 might not be possible to annotate the element. */
497 return scm_car (ans); /* {e} => e */
498 else if (len == 2)
499 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
500 else if (len >= 3 && (len & 1))
501 {
502 /* It's a proper list whose length is odd and at least 3. If
503 the elements at odd indices (the infix operator positions)
504 are all 'equal?', then it's a simple curly-infix list.
505 Otherwise it's a mixed curly-infix list. */
506 SCM op = scm_cadr (ans);
507
508 /* Check to see if the elements at odd indices are 'equal?' */
509 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
510 {
511 if (scm_is_null (tl))
512 {
513 /* Convert simple curly-infix list to prefix:
514 {a <op> b <op> ...} => (<op> a b ...) */
515 tl = ans;
516 while (scm_is_pair (scm_cdr (tl)))
517 {
518 tmp = scm_cddr (tl);
519 SCM_SETCDR (tl, tmp);
520 tl = tmp;
521 }
522 ans = scm_cons (op, ans);
523 break;
524 }
525 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
526 {
527 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
528 ans = scm_cons (sym_nfx, ans);
529 break;
530 }
531 }
532 }
533 else
534 /* Mixed curly-infix (possibly improper) list:
535 {e . tail} => ($nfx$ e . tail) */
536 ans = scm_cons (sym_nfx, ans);
537 }
538
b1b5433d 539 return maybe_annotate_source (ans, port, opts, line, column);
7337d56d
LC
540}
541#undef FUNC_NAME
a4022e69 542
c5661d28
MG
543
544/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
545 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
546 found. */
547#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
548 do \
549 { \
550 scm_t_wchar a; \
551 size_t i = 0; \
552 c = 0; \
553 while (i < ndigits) \
554 { \
be632904 555 a = scm_getc_unlocked (port); \
dea901d6
MG
556 if (a == EOF) \
557 goto str_eof; \
558 if (terminator \
559 && (a == (scm_t_wchar) terminator) \
560 && (i > 0)) \
561 break; \
562 if ('0' <= a && a <= '9') \
563 a -= '0'; \
564 else if ('A' <= a && a <= 'F') \
565 a = a - 'A' + 10; \
566 else if ('a' <= a && a <= 'f') \
567 a = a - 'a' + 10; \
568 else \
569 { \
570 c = a; \
571 goto bad_escaped; \
572 } \
573 c = c * 16 + a; \
574 i ++; \
575 } \
c5661d28
MG
576 } while (0)
577
684d664e
AW
578static void
579skip_intraline_whitespace (SCM port)
580{
581 scm_t_wchar c;
582
583 do
584 {
be632904 585 c = scm_getc_unlocked (port);
684d664e
AW
586 if (c == EOF)
587 return;
588 }
589 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
590
c932ce0b 591 scm_ungetc_unlocked (c, port);
684d664e
AW
592}
593
dc59631d
MW
594/* Read either a double-quoted string or an R7RS-style symbol delimited
595 by vertical lines, depending on the value of 'chr' ('"' or '|').
596 Regardless, the result is always returned as a string. */
7337d56d 597static SCM
dc59631d 598scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
599#define FUNC_NAME "scm_lreadr"
600{
601 /* For strings smaller than C_STR, this function creates only one Scheme
602 object (the string returned). */
0f2d19dd 603
ff4d3672
LC
604 SCM str = SCM_EOL;
605 size_t c_str_len = 0;
606 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 607
b131b233
MW
608 /* Need to capture line and column numbers here. */
609 long line = SCM_LINUM (port);
610 int column = SCM_COL (port) - 1;
611
c92ee2b3 612 while (chr != (c = scm_getc_unlocked (port)))
7337d56d
LC
613 {
614 if (c == EOF)
9c44cd45
MG
615 {
616 str_eof:
617 scm_i_input_error (FUNC_NAME, port,
dc59631d
MW
618 (chr == '|'
619 ? "end of file in symbol"
620 : "end of file in string constant"),
621 SCM_EOL);
9c44cd45 622 }
0f2d19dd 623
ff4d3672
LC
624 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
625 {
626 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
627 c_str_len = 0;
628 }
0f2d19dd 629
7337d56d 630 if (c == '\\')
9c44cd45 631 {
be632904 632 switch (c = scm_getc_unlocked (port))
9c44cd45
MG
633 {
634 case EOF:
635 goto str_eof;
6579c330 636 case '|':
9c44cd45
MG
637 case '\\':
638 break;
9c44cd45 639 case '\n':
b1b5433d 640 if (opts->hungry_eol_escapes_p)
684d664e 641 skip_intraline_whitespace (port);
9c44cd45
MG
642 continue;
643 case '0':
644 c = '\0';
645 break;
646 case 'f':
647 c = '\f';
648 break;
649 case 'n':
650 c = '\n';
651 break;
652 case 'r':
653 c = '\r';
654 break;
655 case 't':
656 c = '\t';
657 break;
658 case 'a':
659 c = '\007';
660 break;
661 case 'v':
662 c = '\v';
663 break;
67a4a16d
MG
664 case 'b':
665 c = '\010';
666 break;
9c44cd45 667 case 'x':
dc59631d 668 if (opts->r6rs_escapes_p || chr == '|')
dea901d6
MG
669 SCM_READ_HEX_ESCAPE (10, ';');
670 else
671 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 672 break;
9c44cd45 673 case 'u':
b1b5433d 674 if (!opts->r6rs_escapes_p)
898a0b5a
MG
675 {
676 SCM_READ_HEX_ESCAPE (4, '\0');
677 break;
678 }
9c44cd45 679 case 'U':
b1b5433d 680 if (!opts->r6rs_escapes_p)
898a0b5a
MG
681 {
682 SCM_READ_HEX_ESCAPE (6, '\0');
683 break;
684 }
9c44cd45 685 default:
dc59631d
MW
686 if (c == chr)
687 break;
9c44cd45
MG
688 bad_escaped:
689 scm_i_input_error (FUNC_NAME, port,
690 "illegal character in escape sequence: ~S",
691 scm_list_1 (SCM_MAKE_CHAR (c)));
692 }
693 }
ff4d3672
LC
694
695 c_str[c_str_len++] = c;
7337d56d 696 }
ff4d3672
LC
697
698 if (scm_is_null (str))
699 /* Fast path: we got a string that fits in C_STR. */
700 str = scm_from_utf32_stringn (c_str, c_str_len);
701 else
702 {
703 if (c_str_len > 0)
704 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
705
706 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
707 }
708
b1b5433d 709 return maybe_annotate_source (str, port, opts, line, column);
0f2d19dd 710}
db4b4ca6
DH
711#undef FUNC_NAME
712
dc59631d
MW
713static SCM
714scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
715{
716 return scm_read_string_like_syntax (chr, port, opts);
717}
718
719static SCM
720scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
721{
722 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
723}
0f2d19dd 724
7337d56d 725static SCM
b1b5433d 726scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
0f2d19dd 727{
69f90b0b 728 SCM result, str = SCM_EOL;
b662b7e9 729 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 730 size_t bytes_read;
0f2d19dd 731
38f19074
MW
732 /* Need to capture line and column numbers here. */
733 long line = SCM_LINUM (port);
734 int column = SCM_COL (port) - 1;
735
c932ce0b 736 scm_ungetc_unlocked (chr, port);
b1b5433d 737 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 738 &bytes_read);
69f90b0b 739
08467a7e 740 str = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b
MG
741
742 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 743 if (scm_is_false (result))
69f90b0b
MG
744 {
745 /* Return a symbol instead of a number */
b1b5433d 746 if (opts->case_insensitive_p)
69f90b0b
MG
747 str = scm_string_downcase_x (str);
748 result = scm_string_to_symbol (str);
749 }
38f19074 750 else if (SCM_NIMP (result))
b1b5433d 751 result = maybe_annotate_source (result, port, opts, line, column);
0f2d19dd 752
69f90b0b 753 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
754 return result;
755}
0f2d19dd 756
7337d56d 757static SCM
b1b5433d 758scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 759{
889975e5
MG
760 SCM result;
761 int ends_with_colon = 0;
69f90b0b 762 size_t bytes_read;
b1b5433d 763 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
b662b7e9 764 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 765 SCM str;
7337d56d 766
c932ce0b 767 scm_ungetc_unlocked (chr, port);
b1b5433d 768 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 769 &bytes_read);
69f90b0b 770 if (bytes_read > 0)
b662b7e9 771 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 772
69f90b0b
MG
773 if (postfix && ends_with_colon && (bytes_read > 1))
774 {
08467a7e 775 str = scm_from_port_stringn (buffer, bytes_read - 1, port);
69f90b0b 776
b1b5433d 777 if (opts->case_insensitive_p)
69f90b0b
MG
778 str = scm_string_downcase_x (str);
779 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
780 }
7337d56d 781 else
69f90b0b 782 {
08467a7e 783 str = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b 784
b1b5433d 785 if (opts->case_insensitive_p)
69f90b0b
MG
786 str = scm_string_downcase_x (str);
787 result = scm_string_to_symbol (str);
788 }
7337d56d 789
69f90b0b 790 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
791 return result;
792}
793
794static SCM
b1b5433d 795scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
796#define FUNC_NAME "scm_lreadr"
797{
889975e5 798 SCM result;
7337d56d 799 size_t read;
b662b7e9 800 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 801 unsigned int radix;
69f90b0b 802 SCM str;
7337d56d
LC
803
804 switch (chr)
805 {
806 case 'B':
807 case 'b':
808 radix = 2;
809 break;
810
811 case 'o':
812 case 'O':
813 radix = 8;
814 break;
815
816 case 'd':
817 case 'D':
818 radix = 10;
819 break;
820
821 case 'x':
822 case 'X':
823 radix = 16;
824 break;
825
826 default:
c932ce0b
AW
827 scm_ungetc_unlocked (chr, port);
828 scm_ungetc_unlocked ('#', port);
7337d56d
LC
829 radix = 10;
830 }
831
b1b5433d 832 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 833 &read);
69f90b0b 834
08467a7e 835 str = scm_from_port_stringn (buffer, read, port);
69f90b0b
MG
836
837 result = scm_string_to_number (str, scm_from_uint (radix));
838
69f90b0b 839 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
840
841 if (scm_is_true (result))
842 return result;
843
844 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
845
846 return SCM_BOOL_F;
847}
848#undef FUNC_NAME
849
850static SCM
b1b5433d 851scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
852{
853 SCM p;
492faee1
LC
854 long line = SCM_LINUM (port);
855 int column = SCM_COL (port) - 1;
7337d56d
LC
856
857 switch (chr)
858 {
859 case '`':
860 p = scm_sym_quasiquote;
861 break;
862
863 case '\'':
864 p = scm_sym_quote;
865 break;
866
867 case ',':
868 {
889975e5 869 scm_t_wchar c;
7337d56d 870
be632904 871 c = scm_getc_unlocked (port);
7337d56d
LC
872 if ('@' == c)
873 p = scm_sym_uq_splicing;
874 else
0f2d19dd 875 {
c932ce0b 876 scm_ungetc_unlocked (c, port);
7337d56d 877 p = scm_sym_unquote;
0f2d19dd 878 }
7337d56d
LC
879 break;
880 }
0f2d19dd 881
7337d56d
LC
882 default:
883 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 884 "scm_read_quote", chr);
7337d56d 885 abort ();
0f2d19dd 886 }
1cc91f1b 887
b1b5433d
MW
888 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
889 return maybe_annotate_source (p, port, opts, line, column);
7337d56d
LC
890}
891
34f3d47d
AW
892SCM_SYMBOL (sym_syntax, "syntax");
893SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
894SCM_SYMBOL (sym_unsyntax, "unsyntax");
895SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
896
897static SCM
b1b5433d 898scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
34f3d47d
AW
899{
900 SCM p;
901 long line = SCM_LINUM (port);
902 int column = SCM_COL (port) - 1;
903
904 switch (chr)
905 {
906 case '`':
907 p = sym_quasisyntax;
908 break;
909
910 case '\'':
911 p = sym_syntax;
912 break;
913
914 case ',':
915 {
916 int c;
917
be632904 918 c = scm_getc_unlocked (port);
34f3d47d
AW
919 if ('@' == c)
920 p = sym_unsyntax_splicing;
921 else
922 {
c932ce0b 923 scm_ungetc_unlocked (c, port);
34f3d47d
AW
924 p = sym_unsyntax;
925 }
926 break;
927 }
928
929 default:
930 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
931 "scm_read_syntax", chr);
932 abort ();
933 }
934
b1b5433d
MW
935 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
936 return maybe_annotate_source (p, port, opts, line, column);
34f3d47d
AW
937}
938
cfd15439 939static SCM
b1b5433d 940scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
7c4aad9c 941{
b1b5433d 942 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
7c4aad9c
AW
943
944 if (!scm_is_eq (id, sym_nil))
945 scm_i_input_error ("scm_read_nil", port,
946 "unexpected input while reading #nil: ~a",
947 scm_list_1 (id));
948
949 return SCM_ELISP_NIL;
950}
951
cfd15439 952static SCM
7337d56d 953scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 954{
0f2d19dd
JB
955 int c;
956
889975e5
MG
957 /* We use the get_byte here because there is no need to get the
958 locale correct with comment input. This presumes that newline
959 always represents itself no matter what the encoding is. */
0d959103 960 for (c = scm_get_byte_or_eof_unlocked (port);
7337d56d 961 (c != EOF) && (c != '\n');
0d959103 962 c = scm_get_byte_or_eof_unlocked (port));
7337d56d
LC
963
964 return SCM_UNSPECIFIED;
965}
966
7a329029
MW
967/* If the EXPECTED_CHARS are the next ones available from PORT, then
968 consume them and return 1. Otherwise leave the port position where
969 it was and return 0. EXPECTED_CHARS should be all lowercase, and
970 will be matched case-insensitively against the characters read from
971 PORT. */
972static int
973try_read_ci_chars (SCM port, const char *expected_chars)
974{
975 int num_chars_wanted = strlen (expected_chars);
976 int num_chars_read = 0;
977 char *chars_read = alloca (num_chars_wanted);
978 int c;
979
980 while (num_chars_read < num_chars_wanted)
981 {
cb8aaef4 982 c = scm_getc_unlocked (port);
7a329029
MW
983 if (c == EOF)
984 break;
0fc54828 985 else if (c_tolower (c) != expected_chars[num_chars_read])
7a329029 986 {
cb8aaef4 987 scm_ungetc_unlocked (c, port);
7a329029
MW
988 break;
989 }
990 else
991 chars_read[num_chars_read++] = c;
992 }
993
994 if (num_chars_read == num_chars_wanted)
995 return 1;
996 else
997 {
998 while (num_chars_read > 0)
cb8aaef4 999 scm_ungetc_unlocked (chars_read[--num_chars_read], port);
7a329029
MW
1000 return 0;
1001 }
1002}
1003
7337d56d
LC
1004\f
1005/* Sharp readers, i.e. readers called after a `#' sign has been read. */
1006
1007static SCM
1008scm_read_boolean (int chr, SCM port)
1009{
1010 switch (chr)
0f2d19dd 1011 {
7337d56d
LC
1012 case 't':
1013 case 'T':
7a329029 1014 try_read_ci_chars (port, "rue");
7337d56d
LC
1015 return SCM_BOOL_T;
1016
1017 case 'f':
1018 case 'F':
7a329029 1019 try_read_ci_chars (port, "alse");
7337d56d 1020 return SCM_BOOL_F;
0f2d19dd 1021 }
7337d56d
LC
1022
1023 return SCM_UNSPECIFIED;
1024}
1025
1026static SCM
b1b5433d 1027scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1028#define FUNC_NAME "scm_lreadr"
1029{
69f90b0b
MG
1030 char buffer[READER_CHAR_NAME_MAX_SIZE];
1031 SCM charname;
1032 size_t charname_len, bytes_read;
889975e5
MG
1033 scm_t_wchar cp;
1034 int overflow;
f6f4feb0 1035 scm_t_port_internal *pti;
889975e5 1036
b1b5433d
MW
1037 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1038 &bytes_read);
889975e5 1039 if (overflow)
c1a0ba1c 1040 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 1041
69f90b0b 1042 if (bytes_read == 0)
0f2d19dd 1043 {
be632904 1044 chr = scm_getc_unlocked (port);
7337d56d
LC
1045 if (chr == EOF)
1046 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1047 "while reading character", SCM_EOL);
1048
1049 /* CHR must be a token delimiter, like a whitespace. */
1050 return (SCM_MAKE_CHAR (chr));
0f2d19dd 1051 }
7337d56d 1052
f6f4feb0 1053 pti = SCM_PORT_GET_INTERNAL (port);
7337d56d 1054
69f90b0b
MG
1055 /* Simple ASCII characters can be processed immediately. Also, simple
1056 ISO-8859-1 characters can be processed immediately if the encoding for this
1057 port is ISO-8859-1. */
d1b9f8ac
AW
1058 if (bytes_read == 1 &&
1059 ((unsigned char) buffer[0] <= 127
f6f4feb0 1060 || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
69f90b0b
MG
1061 {
1062 SCM_COL (port) += 1;
1063 return SCM_MAKE_CHAR (buffer[0]);
1064 }
1065
1066 /* Otherwise, convert the buffer into a proper scheme string for
1067 processing. */
08467a7e 1068 charname = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b
MG
1069 charname_len = scm_i_string_length (charname);
1070 SCM_COL (port) += charname_len;
889975e5 1071 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
1072 if (charname_len == 1)
1073 return SCM_MAKE_CHAR (cp);
1074
1075 /* Ignore dotted circles, which may be used to keep combining characters from
1076 combining with the backslash in #\charname. */
0dcd7e61
MG
1077 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1078 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1079
889975e5 1080 if (cp >= '0' && cp < '8')
7337d56d
LC
1081 {
1082 /* Dirk:FIXME:: This type of character syntax is not R5RS
1083 * compliant. Further, it should be verified that the constant
0ffc78e3 1084 * does only consist of octal digits. */
889975e5 1085 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 1086 if (SCM_I_INUMP (p))
0ffc78e3 1087 {
e25f3727 1088 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
1089 if (SCM_IS_UNICODE_CHAR (c))
1090 return SCM_MAKE_CHAR (c);
1091 else
0f3a70cf 1092 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
1093 "out-of-range octal character escape: ~a",
1094 scm_list_1 (charname));
1095 }
7337d56d
LC
1096 }
1097
0f3a70cf 1098 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1099 {
1100 SCM p;
0f3a70cf 1101
dea901d6
MG
1102 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1103 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1104 scm_from_uint (16));
1105 if (SCM_I_INUMP (p))
1106 {
e25f3727 1107 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1108 if (SCM_IS_UNICODE_CHAR (c))
1109 return SCM_MAKE_CHAR (c);
1110 else
1111 scm_i_input_error (FUNC_NAME, port,
1112 "out-of-range hex character escape: ~a",
1113 scm_list_1 (charname));
1114 }
1115 }
1116
889975e5
MG
1117 /* The names of characters should never have non-Latin1
1118 characters. */
1119 if (scm_i_is_narrow_string (charname)
1120 || scm_i_try_narrow_string (charname))
4769c9db
AW
1121 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1122 charname_len);
1123 if (scm_is_true (ch))
1124 return ch;
1125 }
7337d56d 1126
7337d56d 1127 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1128 scm_list_1 (charname));
7337d56d
LC
1129
1130 return SCM_UNSPECIFIED;
0f2d19dd 1131}
db4b4ca6 1132#undef FUNC_NAME
0f2d19dd 1133
cfd15439 1134static SCM
b1b5433d 1135scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1136{
1137 SCM symbol;
1138
1139 /* Read the symbol that comprises the keyword. Doing this instead of
1140 invoking a specific symbol reader function allows `scm_read_keyword ()'
1141 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1142
7337d56d 1143 XXX: This implementation allows sloppy syntaxes like `#: key'. */
b1b5433d 1144 symbol = scm_read_expression (port, opts);
7337d56d 1145 if (!scm_is_symbol (symbol))
7f74cf9a 1146 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1147 "keyword prefix `~a' not followed by a symbol: ~s",
1148 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1149
1150 return (scm_symbol_to_keyword (symbol));
1151}
1152
cfd15439 1153static SCM
b1b5433d
MW
1154scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1155 long line, int column)
09a4f039 1156{
7337d56d
LC
1157 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1158 guarantee that it's going to do what we want. After all, this is an
1159 implementation detail of `scm_read_vector ()', not a desirable
1160 property. */
b1b5433d
MW
1161 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1162 port, opts, line, column);
b131b233
MW
1163}
1164
493ceb99
MW
1165/* Helper used by scm_read_array */
1166static int
1167read_decimal_integer (SCM port, int c, ssize_t *resp)
1168{
1169 ssize_t sign = 1;
1170 ssize_t res = 0;
1171 int got_it = 0;
1172
1173 if (c == '-')
1174 {
1175 sign = -1;
fa980bcc 1176 c = scm_getc_unlocked (port);
493ceb99
MW
1177 }
1178
1179 while ('0' <= c && c <= '9')
1180 {
a662686a
MW
1181 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1182 scm_i_input_error ("read_decimal_integer", port,
1183 "number too large", SCM_EOL);
493ceb99
MW
1184 res = 10*res + c-'0';
1185 got_it = 1;
fa980bcc 1186 c = scm_getc_unlocked (port);
493ceb99
MW
1187 }
1188
1189 if (got_it)
1190 *resp = sign * res;
1191 return c;
7337d56d 1192}
09a4f039 1193
493ceb99
MW
1194/* Read an array. This function can also read vectors and uniform
1195 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1196 handled here.
1197
10744b7c 1198 C is the first character read after the '#'. */
b131b233 1199static SCM
b1b5433d 1200scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
7337d56d 1201{
493ceb99
MW
1202 ssize_t rank;
1203 scm_t_wchar tag_buf[8];
1204 int tag_len;
1205
1206 SCM tag, shape = SCM_BOOL_F, elements, array;
1207
1208 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1209 the array code can not deal with zero-length dimensions yet, and
10744b7c 1210 we want to allow zero-length vectors, of course. */
493ceb99 1211 if (c == '(')
b1b5433d 1212 return scm_read_vector (c, port, opts, line, column);
493ceb99 1213
10744b7c 1214 /* Disambiguate between '#f' and uniform floating point vectors. */
493ceb99
MW
1215 if (c == 'f')
1216 {
fa980bcc 1217 c = scm_getc_unlocked (port);
493ceb99
MW
1218 if (c != '3' && c != '6')
1219 {
7a329029
MW
1220 if (c == 'a' && try_read_ci_chars (port, "lse"))
1221 return SCM_BOOL_F;
1222 else if (c != EOF)
cb8aaef4 1223 scm_ungetc_unlocked (c, port);
493ceb99
MW
1224 return SCM_BOOL_F;
1225 }
1226 rank = 1;
1227 tag_buf[0] = 'f';
1228 tag_len = 1;
1229 goto continue_reading_tag;
1230 }
1231
1232 /* Read rank. */
1233 rank = 1;
1234 c = read_decimal_integer (port, c, &rank);
1235 if (rank < 0)
1236 scm_i_input_error (NULL, port, "array rank must be non-negative",
1237 SCM_EOL);
1238
1239 /* Read tag. */
1240 tag_len = 0;
1241 continue_reading_tag:
1242 while (c != EOF && c != '(' && c != '@' && c != ':'
1243 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1244 {
1245 tag_buf[tag_len++] = c;
fa980bcc 1246 c = scm_getc_unlocked (port);
493ceb99
MW
1247 }
1248 if (tag_len == 0)
1249 tag = SCM_BOOL_T;
b131b233 1250 else
493ceb99
MW
1251 {
1252 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1253 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1254 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1255 scm_list_1 (tag));
1256 }
1257
1258 /* Read shape. */
1259 if (c == '@' || c == ':')
1260 {
1261 shape = SCM_EOL;
1262
1263 do
1264 {
1265 ssize_t lbnd = 0, len = 0;
1266 SCM s;
1267
1268 if (c == '@')
1269 {
fa980bcc 1270 c = scm_getc_unlocked (port);
493ceb99
MW
1271 c = read_decimal_integer (port, c, &lbnd);
1272 }
1273
1274 s = scm_from_ssize_t (lbnd);
1275
1276 if (c == ':')
1277 {
fa980bcc 1278 c = scm_getc_unlocked (port);
493ceb99
MW
1279 c = read_decimal_integer (port, c, &len);
1280 if (len < 0)
1281 scm_i_input_error (NULL, port,
1282 "array length must be non-negative",
1283 SCM_EOL);
1284
1285 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1286 }
1287
1288 shape = scm_cons (s, shape);
1289 } while (c == '@' || c == ':');
1290
1291 shape = scm_reverse_x (shape, SCM_EOL);
1292 }
1293
1294 /* Read nested lists of elements. */
1295 if (c != '(')
1296 scm_i_input_error (NULL, port,
1297 "missing '(' in vector or array literal",
1298 SCM_EOL);
b1b5433d 1299 elements = scm_read_sexp (c, port, opts);
493ceb99
MW
1300
1301 if (scm_is_false (shape))
1302 shape = scm_from_ssize_t (rank);
1303 else if (scm_ilength (shape) != rank)
1304 scm_i_input_error
1305 (NULL, port,
1306 "the number of shape specifications must match the array rank",
1307 SCM_EOL);
1308
1309 /* Handle special print syntax of rank zero arrays; see
1310 scm_i_print_array for a rationale. */
1311 if (rank == 0)
1312 {
1313 if (!scm_is_pair (elements))
1314 scm_i_input_error (NULL, port,
1315 "too few elements in array literal, need 1",
1316 SCM_EOL);
1317 if (!scm_is_null (SCM_CDR (elements)))
1318 scm_i_input_error (NULL, port,
1319 "too many elements in array literal, want 1",
1320 SCM_EOL);
1321 elements = SCM_CAR (elements);
1322 }
1323
1324 /* Construct array, annotate with source location, and return. */
1325 array = scm_list_to_typed_array (tag, shape, elements);
b1b5433d 1326 return maybe_annotate_source (array, port, opts, line, column);
7337d56d
LC
1327}
1328
0ba0b384 1329static SCM
b1b5433d
MW
1330scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1331 long line, int column)
7337d56d 1332{
b1b5433d 1333 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1334}
1335
0ba0b384 1336static SCM
b1b5433d
MW
1337scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1338 long line, int column)
0ba0b384 1339{
be632904 1340 chr = scm_getc_unlocked (port);
0ba0b384
LC
1341 if (chr != 'u')
1342 goto syntax;
1343
be632904 1344 chr = scm_getc_unlocked (port);
0ba0b384
LC
1345 if (chr != '8')
1346 goto syntax;
1347
be632904 1348 chr = scm_getc_unlocked (port);
0ba0b384
LC
1349 if (chr != '(')
1350 goto syntax;
1351
b131b233 1352 return maybe_annotate_source
b1b5433d
MW
1353 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1354 port, opts, line, column);
0ba0b384
LC
1355
1356 syntax:
1357 scm_i_input_error ("read_bytevector", port,
1358 "invalid bytevector prefix",
1359 SCM_MAKE_CHAR (chr));
1360 return SCM_UNSPECIFIED;
1361}
1362
7337d56d 1363static SCM
b1b5433d
MW
1364scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1365 long line, int column)
7337d56d
LC
1366{
1367 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1368 terribly inefficient but who cares? */
1369 SCM s_bits = SCM_EOL;
1370
be632904 1371 for (chr = scm_getc_unlocked (port);
7337d56d 1372 (chr != EOF) && ((chr == '0') || (chr == '1'));
be632904 1373 chr = scm_getc_unlocked (port))
09a4f039 1374 {
7337d56d 1375 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1376 }
7337d56d
LC
1377
1378 if (chr != EOF)
c932ce0b 1379 scm_ungetc_unlocked (chr, port);
7337d56d 1380
b131b233
MW
1381 return maybe_annotate_source
1382 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
b1b5433d 1383 port, opts, line, column);
7337d56d
LC
1384}
1385
cfd15439 1386static SCM
889975e5 1387scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1388{
1389 int bang_seen = 0;
1390
1391 for (;;)
09a4f039 1392 {
be632904 1393 int c = scm_getc_unlocked (port);
62850ef3 1394
7337d56d
LC
1395 if (c == EOF)
1396 scm_i_input_error ("skip_block_comment", port,
1397 "unterminated `#! ... !#' comment", SCM_EOL);
1398
1399 if (c == '!')
1400 bang_seen = 1;
1401 else if (c == '#' && bang_seen)
1402 break;
1403 else
1404 bang_seen = 0;
1405 }
1406
1407 return SCM_UNSPECIFIED;
1408}
1409
9331ffd8
MW
1410static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1411 int value);
bf9eb54a
MW
1412static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1413 int value);
1414static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1415 int value);
9331ffd8 1416
d7fcaec3 1417static SCM
b1b5433d 1418scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
911b03b2 1419{
02327c0c
MW
1420 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1421 int c;
1422 int i = 0;
1423
1424 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
911b03b2 1425 {
fa980bcc 1426 c = scm_getc_unlocked (port);
02327c0c
MW
1427 if (c == EOF)
1428 scm_i_input_error ("skip_block_comment", port,
1429 "unterminated `#! ... !#' comment", SCM_EOL);
1430 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1431 name[i++] = c;
1432 else if (CHAR_IS_DELIMITER (c))
1433 {
fa980bcc 1434 scm_ungetc_unlocked (c, port);
02327c0c
MW
1435 name[i] = '\0';
1436 if (0 == strcmp ("r6rs", name))
1437 ; /* Silently ignore */
9331ffd8
MW
1438 else if (0 == strcmp ("fold-case", name))
1439 set_port_case_insensitive_p (port, opts, 1);
1440 else if (0 == strcmp ("no-fold-case", name))
1441 set_port_case_insensitive_p (port, opts, 0);
bf9eb54a
MW
1442 else if (0 == strcmp ("curly-infix", name))
1443 set_port_curly_infix_p (port, opts, 1);
1444 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1445 {
1446 set_port_curly_infix_p (port, opts, 1);
1447 set_port_square_brackets_p (port, opts, 0);
1448 }
02327c0c
MW
1449 else
1450 break;
1451
1452 return SCM_UNSPECIFIED;
1453 }
fa746547
MW
1454 else
1455 {
fa980bcc 1456 scm_ungetc_unlocked (c, port);
fa746547
MW
1457 break;
1458 }
911b03b2 1459 }
02327c0c 1460 while (i > 0)
fa980bcc 1461 scm_ungetc_unlocked (name[--i], port);
02327c0c 1462 return scm_read_scsh_block_comment (chr, port);
911b03b2
JG
1463}
1464
620c8965
LC
1465static SCM
1466scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1467{
1468 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1469 nested. So care must be taken. */
1470 int nesting_level = 1;
6d5f8c32 1471
be632904 1472 int a = scm_getc_unlocked (port);
6d5f8c32
AW
1473
1474 if (a == EOF)
1475 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1476 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1477
1478 while (nesting_level > 0)
1479 {
be632904 1480 int b = scm_getc_unlocked (port);
620c8965 1481
6d5f8c32 1482 if (b == EOF)
cd169c5a 1483 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1484 "unterminated `#| ... |#' comment", SCM_EOL);
1485
6d5f8c32
AW
1486 if (a == '|' && b == '#')
1487 {
1488 nesting_level--;
1489 b = EOF;
1490 }
1491 else if (a == '#' && b == '|')
1492 {
1493 nesting_level++;
1494 b = EOF;
1495 }
1496
1497 a = b;
620c8965
LC
1498 }
1499
1500 return SCM_UNSPECIFIED;
1501}
1502
34f3d47d 1503static SCM
b1b5433d
MW
1504scm_read_commented_expression (scm_t_wchar chr, SCM port,
1505 scm_t_read_opts *opts)
34f3d47d 1506{
889975e5 1507 scm_t_wchar c;
34f3d47d 1508
b1b5433d 1509 c = flush_ws (port, opts, (char *) NULL);
34f3d47d
AW
1510 if (EOF == c)
1511 scm_i_input_error ("read_commented_expression", port,
1512 "no expression after #; comment", SCM_EOL);
c932ce0b 1513 scm_ungetc_unlocked (c, port);
b1b5433d 1514 scm_read_expression (port, opts);
34f3d47d
AW
1515 return SCM_UNSPECIFIED;
1516}
1517
7337d56d 1518static SCM
889975e5 1519scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1520{
1521 /* Guile's extended symbol read syntax looks like this:
1522
1523 #{This is all a symbol name}#
1524
1525 So here, CHR is expected to be `{'. */
d9527cfa 1526 int saw_brace = 0;
7337d56d 1527 size_t len = 0;
190d4b0d 1528 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1529
889975e5 1530 buf = scm_i_string_start_writing (buf);
7337d56d 1531
be632904 1532 while ((chr = scm_getc_unlocked (port)) != EOF)
7337d56d
LC
1533 {
1534 if (saw_brace)
09a4f039 1535 {
7337d56d
LC
1536 if (chr == '#')
1537 {
7337d56d
LC
1538 break;
1539 }
1540 else
1541 {
1542 saw_brace = 0;
889975e5 1543 scm_i_string_set_x (buf, len++, '}');
7337d56d 1544 }
09a4f039 1545 }
d9527cfa
AW
1546
1547 if (chr == '}')
7337d56d 1548 saw_brace = 1;
d9527cfa
AW
1549 else if (chr == '\\')
1550 {
1551 /* It used to be that print.c would print extended-read-syntax
1552 symbols with backslashes before "non-standard" chars, but
1553 this routine wouldn't do anything with those escapes.
1554 Bummer. What we've done is to change print.c to output
1555 R6RS hex escapes for those characters, relying on the fact
1556 that the extended read syntax would never put a `\' before
1557 an `x'. For now, we just ignore other instances of
1558 backslash in the string. */
be632904 1559 switch ((chr = scm_getc_unlocked (port)))
d9527cfa
AW
1560 {
1561 case EOF:
1562 goto done;
1563 case 'x':
1564 {
1565 scm_t_wchar c;
1566
1567 SCM_READ_HEX_ESCAPE (10, ';');
1568 scm_i_string_set_x (buf, len++, c);
1569 break;
1570
1571 str_eof:
1572 chr = EOF;
1573 goto done;
1574
1575 bad_escaped:
1576 scm_i_string_stop_writing ();
1577 scm_i_input_error ("scm_read_extended_symbol", port,
1578 "illegal character in escape sequence: ~S",
1579 scm_list_1 (SCM_MAKE_CHAR (c)));
1580 break;
1581 }
1582 default:
1583 scm_i_string_set_x (buf, len++, chr);
1584 break;
1585 }
1586 }
7337d56d 1587 else
d9527cfa 1588 scm_i_string_set_x (buf, len++, chr);
62850ef3 1589
889975e5 1590 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1591 {
7f991c7d
LC
1592 SCM addy;
1593
889975e5 1594 scm_i_string_stop_writing ();
190d4b0d 1595 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1596 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1597 len = 0;
889975e5 1598 buf = scm_i_string_start_writing (buf);
7337d56d 1599 }
7337d56d 1600 }
d9527cfa
AW
1601
1602 done:
889975e5 1603 scm_i_string_stop_writing ();
d9527cfa
AW
1604 if (chr == EOF)
1605 scm_i_input_error ("scm_read_extended_symbol", port,
1606 "end of file while reading symbol", SCM_EOL);
7337d56d 1607
889975e5 1608 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1609}
1610
1611
1612\f
1613/* Top-level token readers, i.e., dispatchers. */
1614
1615static SCM
b1b5433d 1616scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1617{
1618 SCM proc;
1619
1620 proc = scm_get_hash_procedure (chr);
1621 if (scm_is_true (scm_procedure_p (proc)))
1622 {
1623 long line = SCM_LINUM (port);
1624 int column = SCM_COL (port) - 2;
1625 SCM got;
1626
1627 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14 1628
3655ed89
MW
1629 if (opts->record_positions_p && SCM_NIMP (got)
1630 && !scm_i_has_source_properties (got))
26c8cc14
AW
1631 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1632
1633 return got;
09a4f039 1634 }
7337d56d
LC
1635
1636 return SCM_UNSPECIFIED;
1637}
1638
1639/* The reader for the sharp `#' character. It basically dispatches reads
1640 among the above token readers. */
1641static SCM
b1b5433d
MW
1642scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1643 long line, int column)
7337d56d
LC
1644#define FUNC_NAME "scm_lreadr"
1645{
1646 SCM result;
1647
be632904 1648 chr = scm_getc_unlocked (port);
7337d56d 1649
b1b5433d 1650 result = scm_read_sharp_extension (chr, port, opts);
7337d56d
LC
1651 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1652 return result;
1653
1654 switch (chr)
1655 {
1656 case '\\':
b1b5433d 1657 return (scm_read_character (chr, port, opts));
7337d56d 1658 case '(':
b1b5433d 1659 return (scm_read_vector (chr, port, opts, line, column));
7337d56d
LC
1660 case 's':
1661 case 'u':
1662 case 'f':
df941b5b 1663 case 'c':
7337d56d 1664 /* This one may return either a boolean or an SRFI-4 vector. */
b1b5433d 1665 return (scm_read_srfi4_vector (chr, port, opts, line, column));
0ba0b384 1666 case 'v':
b1b5433d 1667 return (scm_read_bytevector (chr, port, opts, line, column));
7337d56d 1668 case '*':
b1b5433d 1669 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
7337d56d
LC
1670 case 't':
1671 case 'T':
1672 case 'F':
7337d56d
LC
1673 return (scm_read_boolean (chr, port));
1674 case ':':
b1b5433d 1675 return (scm_read_keyword (chr, port, opts));
7337d56d
LC
1676 case '0': case '1': case '2': case '3': case '4':
1677 case '5': case '6': case '7': case '8': case '9':
1678 case '@':
b1b5433d 1679 return (scm_read_array (chr, port, opts, line, column));
7337d56d
LC
1680
1681 case 'i':
1682 case 'e':
7337d56d
LC
1683 case 'b':
1684 case 'B':
1685 case 'o':
1686 case 'O':
1687 case 'd':
1688 case 'D':
1689 case 'x':
1690 case 'X':
1691 case 'I':
1692 case 'E':
b1b5433d 1693 return (scm_read_number_and_radix (chr, port, opts));
7337d56d
LC
1694 case '{':
1695 return (scm_read_extended_symbol (chr, port));
1696 case '!':
b1b5433d 1697 return (scm_read_shebang (chr, port, opts));
34f3d47d 1698 case ';':
b1b5433d 1699 return (scm_read_commented_expression (chr, port, opts));
34f3d47d
AW
1700 case '`':
1701 case '\'':
1702 case ',':
b1b5433d 1703 return (scm_read_syntax (chr, port, opts));
7c4aad9c 1704 case 'n':
b1b5433d 1705 return (scm_read_nil (chr, port, opts));
7337d56d 1706 default:
b1b5433d 1707 result = scm_read_sharp_extension (chr, port, opts);
7337d56d 1708 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1709 {
1710 /* To remain compatible with 1.8 and earlier, the following
1711 characters have lower precedence than `read-hash-extend'
1712 characters. */
1713 switch (chr)
1714 {
1715 case '|':
1716 return scm_read_r6rs_block_comment (chr, port);
1717 default:
1718 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1719 scm_list_1 (SCM_MAKE_CHAR (chr)));
1720 }
1721 }
7337d56d
LC
1722 else
1723 return result;
1724 }
1725
1726 return SCM_UNSPECIFIED;
1727}
1728#undef FUNC_NAME
1729
1730static SCM
bf9eb54a
MW
1731read_inner_expression (SCM port, scm_t_read_opts *opts)
1732#define FUNC_NAME "read_inner_expression"
7337d56d
LC
1733{
1734 while (1)
1735 {
cfd15439 1736 scm_t_wchar chr;
7337d56d 1737
be632904 1738 chr = scm_getc_unlocked (port);
7337d56d
LC
1739
1740 switch (chr)
1741 {
1742 case SCM_WHITE_SPACES:
1743 case SCM_LINE_INCREMENTORS:
1744 break;
1745 case ';':
1746 (void) scm_read_semicolon_comment (chr, port);
1747 break;
bf9eb54a
MW
1748 case '{':
1749 if (opts->curly_infix_p)
1750 {
1751 if (opts->neoteric_p)
1752 return scm_read_sexp (chr, port, opts);
1753 else
1754 {
1755 SCM expr;
1756
1757 /* Enable neoteric expressions within curly braces */
1758 opts->neoteric_p = 1;
1759 expr = scm_read_sexp (chr, port, opts);
1760 opts->neoteric_p = 0;
1761 return expr;
1762 }
1763 }
1764 else
1765 return scm_read_mixed_case_symbol (chr, port, opts);
5afa815c 1766 case '[':
bf9eb54a
MW
1767 if (opts->square_brackets_p)
1768 return scm_read_sexp (chr, port, opts);
1769 else if (opts->curly_infix_p)
1770 {
1771 /* The syntax of neoteric expressions requires that '[' be
1772 a delimiter when curly-infix is enabled, so it cannot
1773 be part of an unescaped symbol. We might as well do
1774 something useful with it, so we adopt Kawa's convention:
1775 [...] => ($bracket-list$ ...) */
1776 long line = SCM_LINUM (port);
1777 int column = SCM_COL (port) - 1;
1778 return maybe_annotate_source
1779 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1780 port, opts, line, column);
1781 }
1782 else
1783 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d 1784 case '(':
b1b5433d 1785 return (scm_read_sexp (chr, port, opts));
7337d56d 1786 case '"':
b1b5433d 1787 return (scm_read_string (chr, port, opts));
dc59631d
MW
1788 case '|':
1789 if (opts->r7rs_symbols_p)
1790 return scm_read_r7rs_symbol (chr, port, opts);
1791 else
1792 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d
LC
1793 case '\'':
1794 case '`':
1795 case ',':
b1b5433d 1796 return (scm_read_quote (chr, port, opts));
7337d56d
LC
1797 case '#':
1798 {
b131b233
MW
1799 long line = SCM_LINUM (port);
1800 int column = SCM_COL (port) - 1;
b1b5433d 1801 SCM result = scm_read_sharp (chr, port, opts, line, column);
7337d56d
LC
1802 if (scm_is_eq (result, SCM_UNSPECIFIED))
1803 /* We read a comment or some such. */
1804 break;
1805 else
1806 return result;
1807 }
1808 case ')':
1809 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1810 break;
bf9eb54a
MW
1811 case '}':
1812 if (opts->curly_infix_p)
1813 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1814 else
1815 return scm_read_mixed_case_symbol (chr, port, opts);
a4e47229 1816 case ']':
b1b5433d 1817 if (opts->square_brackets_p)
a4e47229
MG
1818 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1819 /* otherwise fall through */
7337d56d
LC
1820 case EOF:
1821 return SCM_EOF_VAL;
1822 case ':':
b1b5433d
MW
1823 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1824 return scm_symbol_to_keyword (scm_read_expression (port, opts));
7337d56d
LC
1825 /* Fall through. */
1826
1827 default:
1828 {
1829 if (((chr >= '0') && (chr <= '9'))
1830 || (strchr ("+-.", chr)))
b1b5433d 1831 return (scm_read_number (chr, port, opts));
7337d56d 1832 else
b1b5433d 1833 return (scm_read_mixed_case_symbol (chr, port, opts));
7337d56d
LC
1834 }
1835 }
1836 }
1837}
1838#undef FUNC_NAME
1839
bf9eb54a
MW
1840static SCM
1841scm_read_expression (SCM port, scm_t_read_opts *opts)
1842#define FUNC_NAME "scm_read_expression"
1843{
1844 if (!opts->neoteric_p)
1845 return read_inner_expression (port, opts);
1846 else
1847 {
1848 long line = 0;
1849 int column = 0;
1850 SCM expr;
1851
1852 if (opts->record_positions_p)
1853 {
1854 /* We need to get the position of the first non-whitespace
1855 character in order to correctly annotate neoteric
1856 expressions. For example, for the expression 'f(x)', the
1857 first call to 'read_inner_expression' reads the 'f' (which
1858 cannot be annotated), and then we later read the '(x)' and
1859 use it to construct the new list (f x). */
1860 int c = flush_ws (port, opts, (char *) NULL);
1861 if (c == EOF)
1862 return SCM_EOF_VAL;
fa980bcc 1863 scm_ungetc_unlocked (c, port);
bf9eb54a
MW
1864 line = SCM_LINUM (port);
1865 column = SCM_COL (port);
1866 }
1867
1868 expr = read_inner_expression (port, opts);
1869
1870 /* 'expr' is the first component of the neoteric expression. Now
1871 we loop, and as long as the next character is '(', '[', or '{',
1872 (without any intervening whitespace), we use it to construct a
1873 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1874 for (;;)
1875 {
fa980bcc 1876 int chr = scm_getc_unlocked (port);
bf9eb54a
MW
1877
1878 if (chr == '(')
1879 /* e(...) => (e ...) */
1880 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1881 else if (chr == '[')
1882 /* e[...] => ($bracket-apply$ e ...) */
1883 expr = scm_cons (sym_bracket_apply,
1884 scm_cons (expr,
1885 scm_read_sexp (chr, port, opts)));
1886 else if (chr == '{')
1887 {
1888 SCM arg = scm_read_sexp (chr, port, opts);
1889
1890 if (scm_is_null (arg))
1891 expr = scm_list_1 (expr); /* e{} => (e) */
1892 else
1893 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1894 }
1895 else
1896 {
1897 if (chr != EOF)
fa980bcc 1898 scm_ungetc_unlocked (chr, port);
bf9eb54a
MW
1899 break;
1900 }
1901 maybe_annotate_source (expr, port, opts, line, column);
1902 }
1903 return expr;
1904 }
1905}
1906#undef FUNC_NAME
1907
7337d56d
LC
1908\f
1909/* Actual reader. */
1910
ea8c9761
MW
1911static void init_read_options (SCM port, scm_t_read_opts *opts);
1912
7337d56d
LC
1913SCM_DEFINE (scm_read, "read", 0, 1, 0,
1914 (SCM port),
1915 "Read an s-expression from the input port @var{port}, or from\n"
1916 "the current input port if @var{port} is not specified.\n"
1917 "Any whitespace before the next token is discarded.")
1918#define FUNC_NAME s_scm_read
1919{
b1b5433d 1920 scm_t_read_opts opts;
7337d56d
LC
1921 int c;
1922
1923 if (SCM_UNBNDP (port))
1924 port = scm_current_input_port ();
1925 SCM_VALIDATE_OPINPORT (1, port);
1926
ea8c9761 1927 init_read_options (port, &opts);
b1b5433d
MW
1928
1929 c = flush_ws (port, &opts, (char *) NULL);
7337d56d
LC
1930 if (EOF == c)
1931 return SCM_EOF_VAL;
c932ce0b 1932 scm_ungetc_unlocked (c, port);
7337d56d 1933
b1b5433d 1934 return (scm_read_expression (port, &opts));
09a4f039 1935}
db4b4ca6 1936#undef FUNC_NAME
09a4f039 1937
0f2d19dd
JB
1938
1939\f
1940
14de3b42
GH
1941/* Manipulate the read-hash-procedures alist. This could be written in
1942 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1943SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1944 (SCM chr, SCM proc),
dc7fa443
MG
1945 "Install the procedure @var{proc} for reading expressions\n"
1946 "starting with the character sequence @code{#} and @var{chr}.\n"
1947 "@var{proc} will be called with two arguments: the character\n"
1948 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1949 "returned will be the return value of @code{read}. \n"
1950 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1951 )
1bbd0b84 1952#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1953{
fed9c9a2
GH
1954 SCM this;
1955 SCM prev;
1956
36284627 1957 SCM_VALIDATE_CHAR (1, chr);
7888309b 1958 SCM_ASSERT (scm_is_false (proc)
bc36d050 1959 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1960 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1961
14de3b42 1962 /* Check if chr is already in the alist. */
d458073b 1963 this = scm_i_read_hash_procedures_ref ();
14de3b42 1964 prev = SCM_BOOL_F;
fed9c9a2
GH
1965 while (1)
1966 {
d2e53ed6 1967 if (scm_is_null (this))
fed9c9a2
GH
1968 {
1969 /* not found, so add it to the beginning. */
7888309b 1970 if (scm_is_true (proc))
fed9c9a2 1971 {
d458073b
AR
1972 SCM new = scm_cons (scm_cons (chr, proc),
1973 scm_i_read_hash_procedures_ref ());
1974 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1975 }
1976 break;
1977 }
bc36d050 1978 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1979 {
1980 /* already in the alist. */
7888309b 1981 if (scm_is_false (proc))
14de3b42
GH
1982 {
1983 /* remove it. */
7888309b 1984 if (scm_is_false (prev))
14de3b42 1985 {
d458073b
AR
1986 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1987 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1988 }
1989 else
1990 scm_set_cdr_x (prev, SCM_CDR (this));
1991 }
fed9c9a2 1992 else
14de3b42
GH
1993 {
1994 /* replace it. */
1995 scm_set_cdr_x (SCM_CAR (this), proc);
1996 }
fed9c9a2
GH
1997 break;
1998 }
1999 prev = this;
2000 this = SCM_CDR (this);
2001 }
deca31e1 2002
deca31e1
GH
2003 return SCM_UNSPECIFIED;
2004}
1bbd0b84 2005#undef FUNC_NAME
0f2d19dd 2006
deca31e1
GH
2007/* Recover the read-hash procedure corresponding to char c. */
2008static SCM
6e8d25a6 2009scm_get_hash_procedure (int c)
deca31e1 2010{
d458073b 2011 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 2012
deca31e1
GH
2013 while (1)
2014 {
d2e53ed6 2015 if (scm_is_null (rest))
deca31e1
GH
2016 return SCM_BOOL_F;
2017
7866a09b 2018 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
2019 return SCM_CDAR (rest);
2020
2021 rest = SCM_CDR (rest);
2022 }
2023}
1cc91f1b 2024
93c4fa21
AW
2025static int
2026is_encoding_char (char c)
2027{
2028 if (c >= 'a' && c <= 'z') return 1;
2029 if (c >= 'A' && c <= 'Z') return 1;
2030 if (c >= '0' && c <= '9') return 1;
2031 return strchr ("_-.:/,+=()", c) != NULL;
2032}
2033
3ff8a9d6
LC
2034/* Maximum size of an encoding name. This is a bit more than the
2035 longest name listed at
2036 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2037 characters.) */
2038#define ENCODING_NAME_MAX_SIZE 20
889975e5 2039
3ff8a9d6
LC
2040/* Number of bytes at the beginning or end of a file that are scanned
2041 for a "coding:" declaration. */
2042#define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2043
2044
2045/* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2046 coding declaration. Returns either NULL or a string whose storage
2047 has been allocated with `scm_gc_malloc'. */
889975e5 2048char *
f8a1c9a8 2049scm_i_scan_for_encoding (SCM port)
889975e5 2050{
d900843c 2051 scm_t_port *pt;
889975e5 2052 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 2053 size_t bytes_read, encoding_length, i;
889975e5 2054 char *encoding = NULL;
daedbca7 2055 char *pos, *encoding_start;
889975e5
MG
2056 int in_comment;
2057
d900843c 2058 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 2059
d900843c 2060 if (pt->rw_active == SCM_PORT_WRITE)
4251ae2e 2061 scm_flush_unlocked (port);
49bb5bd3 2062
d900843c
AW
2063 if (pt->rw_random)
2064 pt->rw_active = SCM_PORT_READ;
2065
2066 if (pt->read_pos == pt->read_end)
2067 {
2068 /* We can use the read buffer, and thus avoid a seek. */
4251ae2e 2069 if (scm_fill_input_unlocked (port) == EOF)
d900843c
AW
2070 return NULL;
2071
2072 bytes_read = pt->read_end - pt->read_pos;
2073 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2074 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2075
2076 if (bytes_read <= 1)
2077 /* An unbuffered port -- don't scan. */
2078 return NULL;
2079
2080 memcpy (header, pt->read_pos, bytes_read);
2081 header[bytes_read] = '\0';
2082 }
2083 else
2084 {
2085 /* Try to read some bytes and then seek back. Not all ports
2086 support seeking back; and indeed some file ports (like
2087 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2088 check performed by SCM_FPORT_FDES---but fail to seek
2089 backwards. Hence this block comes second. We prefer to use
2090 the read buffer in-place. */
2091 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2092 return NULL;
2093
be632904 2094 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
d900843c
AW
2095 header[bytes_read] = '\0';
2096 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2097 }
889975e5 2098
889975e5
MG
2099 /* search past "coding[:=]" */
2100 pos = header;
2101 while (1)
2102 {
2103 if ((pos = strstr(pos, "coding")) == NULL)
2104 return NULL;
2105
3ff8a9d6
LC
2106 pos += strlen ("coding");
2107 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
889975e5
MG
2108 (*pos == ':' || *pos == '='))
2109 {
2110 pos ++;
2111 break;
2112 }
2113 }
2114
2115 /* skip spaces */
3ff8a9d6 2116 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
889975e5
MG
2117 (*pos == ' ' || *pos == '\t'))
2118 pos ++;
2119
3ff8a9d6
LC
2120 if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2121 /* We found the "coding:" string, but there is probably not enough
2122 room to store an encoding name in its entirety, so ignore it.
2123 This makes sure we do not end up returning a truncated encoding
2124 name. */
2125 return NULL;
2126
889975e5 2127 /* grab the next token */
daedbca7 2128 encoding_start = pos;
889975e5 2129 i = 0;
daedbca7
MG
2130 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2131 && encoding_start + i - header < bytes_read
93c4fa21 2132 && is_encoding_char (encoding_start[i]))
889975e5
MG
2133 i++;
2134
daedbca7
MG
2135 encoding_length = i;
2136 if (encoding_length == 0)
889975e5
MG
2137 return NULL;
2138
daedbca7 2139 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
889975e5
MG
2140
2141 /* push backwards to make sure we were in a comment */
2142 in_comment = 0;
daedbca7
MG
2143 pos = encoding_start;
2144 while (pos >= header)
889975e5 2145 {
8a12aeb9
AW
2146 if (*pos == ';')
2147 {
2148 in_comment = 1;
2149 break;
2150 }
2151 else if (*pos == '\n' || pos == header)
889975e5
MG
2152 {
2153 /* This wasn't in a semicolon comment. Check for a
2154 hash-bang comment. */
2155 char *beg = strstr (header, "#!");
2156 char *end = strstr (header, "!#");
8a12aeb9 2157 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
2158 in_comment = 1;
2159 break;
2160 }
8a12aeb9
AW
2161 else
2162 {
2163 pos --;
2164 continue;
2165 }
889975e5
MG
2166 }
2167 if (!in_comment)
f8a1c9a8
LC
2168 /* This wasn't in a comment */
2169 return NULL;
2170
889975e5
MG
2171 return encoding;
2172}
2173
2174SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2175 (SCM port),
a270e133 2176 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 2177 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
2178 "The coding declaration is of the form\n"
2179 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2180 "\n"
2181 "Returns a string containing the character encoding of the file\n"
2182 "if a declaration was found, or @code{#f} otherwise.\n")
2183#define FUNC_NAME s_scm_file_encoding
2184{
2185 char *enc;
2186 SCM s_enc;
f8a1c9a8 2187
d900843c
AW
2188 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2189
f8a1c9a8 2190 enc = scm_i_scan_for_encoding (port);
889975e5
MG
2191 if (enc == NULL)
2192 return SCM_BOOL_F;
2193 else
2194 {
f6f4feb0 2195 s_enc = scm_string_upcase (scm_from_locale_string (enc));
889975e5
MG
2196 return s_enc;
2197 }
f8a1c9a8 2198
889975e5
MG
2199 return SCM_BOOL_F;
2200}
2201#undef FUNC_NAME
2202
ea8c9761
MW
2203\f
2204/* Per-port read options.
2205
a38024ba
MW
2206 We store per-port read options in the 'port-read-options' port
2207 property, which is stored in the internal port structure. The value
2208 stored is a single integer that contains a two-bit field for each
2209 read option.
ea8c9761
MW
2210
2211 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2212 the applicable value should be inherited from the corresponding
2213 global read option. Otherwise, the bit field contains the value of
2214 the read option. For boolean read options that have been set
2215 per-port, the possible values are 0 or 1. If the 'keyword_style'
2216 read option has been set per-port, its possible values are those in
2217 'enum t_keyword_style'. */
2218
a38024ba 2219/* Key to read options in port properties. */
ea8c9761
MW
2220SCM_SYMBOL (sym_port_read_options, "port-read-options");
2221
2222/* Offsets of bit fields for each per-port override */
2223#define READ_OPTION_COPY_SOURCE_P 0
2224#define READ_OPTION_RECORD_POSITIONS_P 2
2225#define READ_OPTION_CASE_INSENSITIVE_P 4
2226#define READ_OPTION_KEYWORD_STYLE 6
2227#define READ_OPTION_R6RS_ESCAPES_P 8
2228#define READ_OPTION_SQUARE_BRACKETS_P 10
2229#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
bf9eb54a 2230#define READ_OPTION_CURLY_INFIX_P 14
dc59631d 2231#define READ_OPTION_R7RS_SYMBOLS_P 16
ea8c9761 2232
bf9eb54a 2233/* The total width in bits of the per-port overrides */
dc59631d 2234#define READ_OPTIONS_NUM_BITS 18
ea8c9761
MW
2235
2236#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2237#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2238
2239#define READ_OPTION_MASK 3
2240#define READ_OPTION_INHERIT 3
2241
2242static void
2243set_port_read_option (SCM port, int option, int new_value)
2244{
a38024ba 2245 SCM scm_read_options;
ea8c9761
MW
2246 unsigned int read_options;
2247
2248 new_value &= READ_OPTION_MASK;
79657fd3
MW
2249
2250 scm_dynwind_begin (0);
2251 scm_dynwind_lock_port (port);
2252
a38024ba 2253 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2254 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2255 read_options = scm_to_uint (scm_read_options);
2256 else
2257 read_options = READ_OPTIONS_INHERIT_ALL;
2258 read_options &= ~(READ_OPTION_MASK << option);
2259 read_options |= new_value << option;
2260 scm_read_options = scm_from_uint (read_options);
a38024ba 2261 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
79657fd3
MW
2262
2263 scm_dynwind_end ();
ea8c9761
MW
2264}
2265
9331ffd8
MW
2266/* Set OPTS and PORT's case-insensitivity according to VALUE. */
2267static void
2268set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2269{
2270 value = !!value;
2271 opts->case_insensitive_p = value;
2272 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2273}
2274
bf9eb54a
MW
2275/* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2276static void
2277set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2278{
2279 value = !!value;
2280 opts->square_brackets_p = value;
2281 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2282}
2283
2284/* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2285static void
2286set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2287{
2288 value = !!value;
2289 opts->curly_infix_p = value;
2290 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2291}
2292
ea8c9761
MW
2293/* Initialize OPTS based on PORT's read options and the global read
2294 options. */
2295static void
2296init_read_options (SCM port, scm_t_read_opts *opts)
2297{
a38024ba 2298 SCM val, scm_read_options;
ea8c9761
MW
2299 unsigned int read_options, x;
2300
a38024ba 2301 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2302
2303 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2304 read_options = scm_to_uint (scm_read_options);
2305 else
2306 read_options = READ_OPTIONS_INHERIT_ALL;
2307
2308 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2309 if (x == READ_OPTION_INHERIT)
2310 {
2311 val = SCM_PACK (SCM_KEYWORD_STYLE);
2312 if (scm_is_eq (val, scm_keyword_prefix))
2313 x = KEYWORD_STYLE_PREFIX;
2314 else if (scm_is_eq (val, scm_keyword_postfix))
2315 x = KEYWORD_STYLE_POSTFIX;
2316 else
2317 x = KEYWORD_STYLE_HASH_PREFIX;
2318 }
2319 opts->keyword_style = x;
2320
2321#define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2322 do \
2323 { \
2324 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2325 if (x == READ_OPTION_INHERIT) \
2326 x = !!SCM_ ## NAME; \
2327 opts->name = x; \
2328 } \
2329 while (0)
2330
2331 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2332 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2333 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2334 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2335 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2336 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
bf9eb54a 2337 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
dc59631d 2338 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
ea8c9761
MW
2339
2340#undef RESOLVE_BOOLEAN_OPTION
bf9eb54a
MW
2341
2342 opts->neoteric_p = 0;
ea8c9761
MW
2343}
2344
0f2d19dd
JB
2345void
2346scm_init_read ()
0f2d19dd 2347{
d458073b
AR
2348 SCM read_hash_procs;
2349
c81c2ad3 2350 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
2351
2352 scm_i_read_hash_procedures =
2353 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 2354
62560650 2355 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 2356#include "libguile/read.x"
0f2d19dd 2357}
89e00824
ML
2358
2359/*
2360 Local Variables:
2361 c-file-style: "gnu"
2362 End:
2363*/