temporarily disable elisp exception tests
[bpt/guile.git] / libguile / read.c
CommitLineData
5af307de
MW
1/* Copyright (C) 1995-1997, 1999-2001, 2003, 2004, 2006-2012, 2014
2 * Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d 28#include <string.h>
889975e5
MG
29#include <unistd.h>
30#include <unicase.h>
684d664e 31#include <unictype.h>
96965a6e 32#include <c-strcase.h>
0fc54828 33#include <c-ctype.h>
7337d56d 34
a0599745 35#include "libguile/_scm.h"
0ba0b384 36#include "libguile/bytevectors.h"
a0599745
MD
37#include "libguile/chars.h"
38#include "libguile/eval.h"
2fa901a5 39#include "libguile/arrays.h"
cf396142 40#include "libguile/bitvectors.h"
a0599745
MD
41#include "libguile/keywords.h"
42#include "libguile/alist.h"
43#include "libguile/srcprop.h"
44#include "libguile/hashtab.h"
45#include "libguile/hash.h"
46#include "libguile/ports.h"
f6f4feb0 47#include "libguile/ports-internal.h"
49bb5bd3 48#include "libguile/fports.h"
a0599745
MD
49#include "libguile/root.h"
50#include "libguile/strings.h"
ba1b2226 51#include "libguile/strports.h"
a0599745 52#include "libguile/vectors.h"
a0599745 53#include "libguile/validate.h"
a4022e69 54#include "libguile/srfi-4.h"
7337d56d 55#include "libguile/srfi-13.h"
ba1b2226 56
a0599745 57#include "libguile/read.h"
22fc179a
HWN
58#include "libguile/private-options.h"
59
0f2d19dd
JB
60
61\f
62
5bf6a6f0 63SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 64SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 65SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 66SCM_SYMBOL (sym_nil, "nil");
c7733771 67
bf9eb54a
MW
68/* SRFI-105 curly infix expression support */
69SCM_SYMBOL (sym_nfx, "$nfx$");
70SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
71SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
72
73scm_t_option scm_read_opts[] =
74 {
75 { SCM_OPTION_BOOLEAN, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
dc59631d
MW
91 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
bf9eb54a
MW
93 { 0, },
94 };
b1b5433d
MW
95
96/* Internal read options structure. This is initialized by 'scm_read'
ea8c9761
MW
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
99
100enum t_keyword_style
101 {
102 KEYWORD_STYLE_HASH_PREFIX,
103 KEYWORD_STYLE_PREFIX,
104 KEYWORD_STYLE_POSTFIX
105 };
b1b5433d 106
ea8c9761
MW
107struct t_read_opts
108{
b1b5433d
MW
109 enum t_keyword_style keyword_style;
110 unsigned int copy_source_p : 1;
111 unsigned int record_positions_p : 1;
112 unsigned int case_insensitive_p : 1;
113 unsigned int r6rs_escapes_p : 1;
114 unsigned int square_brackets_p : 1;
115 unsigned int hungry_eol_escapes_p : 1;
bf9eb54a
MW
116 unsigned int curly_infix_p : 1;
117 unsigned int neoteric_p : 1;
dc59631d 118 unsigned int r7rs_symbols_p : 1;
a16f6fe7
MD
119};
120
b1b5433d
MW
121typedef struct t_read_opts scm_t_read_opts;
122
a16f6fe7 123
39e8f371
HWN
124/*
125 Give meaningful error messages for errors
126
127 We use the format
128
ba1b2226 129 FILE:LINE:COL: MESSAGE
39e8f371
HWN
130 This happened in ....
131
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
134
39e8f371
HWN
135 */
136
137
a4022e69
MV
138void
139scm_i_input_error (char const *function,
140 SCM port, const char *message, SCM arg)
ba1b2226 141{
29a837fd
MV
142 SCM fn = (scm_is_string (SCM_FILENAME(port))
143 ? SCM_FILENAME(port)
144 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 145
29a837fd 146 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
147 SCM string = SCM_EOL;
148 scm_simple_format (string_port,
272632a6 149 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 150 scm_list_4 (fn,
b3aa4626 151 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 152 scm_from_int (SCM_COL (port) + 1),
272632a6 153 scm_from_locale_string (message)));
ba1b2226
HWN
154
155 string = scm_get_output_string (string_port);
156 scm_close_output_port (string_port);
4a655e50 157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 158 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 159 string,
dd72382c 160 arg,
ba1b2226
HWN
161 SCM_BOOL_F);
162}
39e8f371
HWN
163
164
a1ec6916 165SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 166 (SCM setting),
dc7fa443
MG
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 170#define FUNC_NAME s_scm_read_options
a16f6fe7 171{
b7ff98dd
MD
172 SCM ans = scm_options (setting,
173 scm_read_opts,
1bbd0b84 174 FUNC_NAME);
b7ff98dd
MD
175 if (SCM_COPY_SOURCE_P)
176 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
177 return ans;
178}
1bbd0b84 179#undef FUNC_NAME
a16f6fe7 180
d458073b
AR
181/* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183static SCM *scm_i_read_hash_procedures;
deca31e1 184
cfd15439 185static SCM
d458073b
AR
186scm_i_read_hash_procedures_ref (void)
187{
188 return scm_fluid_ref (*scm_i_read_hash_procedures);
189}
190
cfd15439 191static void
d458073b
AR
192scm_i_read_hash_procedures_set_x (SCM value)
193{
194 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
195}
0f2d19dd 196
7337d56d
LC
197\f
198/* Token readers. */
0f2d19dd 199
0f2d19dd 200
7337d56d
LC
201/* Size of the C buffer used to read symbols and numbers. */
202#define READER_BUFFER_SIZE 128
0f2d19dd 203
ff4d3672
LC
204/* Number of 32-bit codepoints in the buffer used to read strings. */
205#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 206
7337d56d
LC
207/* The maximum size of Scheme character names. */
208#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 209
02327c0c
MW
210/* The maximum size of reader directive names. */
211#define READER_DIRECTIVE_NAME_MAX_SIZE 50
212
94115ae3 213
7337d56d
LC
214/* `isblank' is only in C99. */
215#define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 217 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
218
219#ifdef MSDOS
220# define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
222#else
223# define CHAR_IS_BLANK CHAR_IS_BLANK_
224#endif
225
226
227/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
228 structure''). */
229#define CHAR_IS_R5RS_DELIMITER(c) \
230 (CHAR_IS_BLANK (c) \
603234c6 231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
7337d56d 232
603234c6
MW
233#define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
bf9eb54a
MW
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
7337d56d
LC
238
239/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
240 Structure''. */
241#define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
244
454866e0 245/* Read an SCSH block comment. */
cfd15439 246static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965 247static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
b1b5433d
MW
248static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
249static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
620c8965 250static SCM scm_get_hash_procedure (int);
454866e0 251
69f90b0b
MG
252/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
cfd15439 256static int
b1b5433d
MW
257read_token (SCM port, scm_t_read_opts *opts,
258 char *buf, size_t buf_size, size_t *read)
b662b7e9 259{
69f90b0b 260 *read = 0;
0520c320 261
69f90b0b
MG
262 while (*read < buf_size)
263 {
264 int chr;
7337d56d 265
0d959103 266 chr = scm_get_byte_or_eof_unlocked (port);
889975e5 267
69f90b0b
MG
268 if (chr == EOF)
269 return 0;
270 else if (CHAR_IS_DELIMITER (chr))
271 {
c932ce0b 272 scm_unget_byte_unlocked (chr, port);
69f90b0b
MG
273 return 0;
274 }
275 else
276 {
277 *buf = (char) chr;
278 buf++, (*read)++;
279 }
280 }
889975e5 281
69f90b0b
MG
282 return 1;
283 }
889975e5 284
b662b7e9
LC
285/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
287static char *
b1b5433d
MW
288read_complete_token (SCM port, scm_t_read_opts *opts,
289 char *buffer, size_t buffer_size, size_t *read)
69f90b0b
MG
290{
291 int overflow = 0;
b662b7e9
LC
292 size_t bytes_read, overflow_size = 0;
293 char *overflow_buffer = NULL;
1cc91f1b 294
69f90b0b
MG
295 do
296 {
b1b5433d 297 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
69f90b0b
MG
298 if (bytes_read == 0)
299 break;
300 if (overflow || overflow_size != 0)
301 {
302 if (overflow_size == 0)
303 {
b662b7e9
LC
304 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
305 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
306 overflow_size = bytes_read;
307 }
308 else
309 {
7be3c2fc 310 char *new_buf =
b662b7e9
LC
311 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
312
313 memcpy (new_buf, overflow_buffer, overflow_size);
314 memcpy (new_buf + overflow_size, buffer, bytes_read);
315
316 overflow_buffer = new_buf;
69f90b0b
MG
317 overflow_size += bytes_read;
318 }
319 }
889975e5 320 }
69f90b0b 321 while (overflow);
889975e5 322
69f90b0b
MG
323 if (overflow_size)
324 *read = overflow_size;
b521d265 325 else
69f90b0b
MG
326 *read = bytes_read;
327
b662b7e9 328 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 329}
7337d56d
LC
330
331/* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
333static int
b1b5433d 334flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
0f2d19dd 335{
cfd15439 336 scm_t_wchar c;
0f2d19dd 337 while (1)
be632904 338 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
339 {
340 case EOF:
341 goteof:
342 if (eoferr)
d156d3b7 343 {
a4022e69
MV
344 scm_i_input_error (eoferr,
345 port,
346 "end of file",
347 SCM_EOL);
d156d3b7 348 }
0f2d19dd 349 return c;
7337d56d 350
0f2d19dd
JB
351 case ';':
352 lp:
be632904 353 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
354 {
355 case EOF:
356 goto goteof;
357 default:
358 goto lp;
359 case SCM_LINE_INCREMENTORS:
360 break;
361 }
362 break;
7337d56d 363
454866e0 364 case '#':
be632904 365 switch (c = scm_getc_unlocked (port))
454866e0
LC
366 {
367 case EOF:
368 eoferr = "read_sharp";
369 goto goteof;
370 case '!':
b1b5433d 371 scm_read_shebang (c, port, opts);
454866e0 372 break;
34f3d47d 373 case ';':
b1b5433d 374 scm_read_commented_expression (c, port, opts);
34f3d47d 375 break;
620c8965
LC
376 case '|':
377 if (scm_is_false (scm_get_hash_procedure (c)))
378 {
379 scm_read_r6rs_block_comment (c, port);
380 break;
381 }
382 /* fall through */
454866e0 383 default:
c932ce0b 384 scm_ungetc_unlocked (c, port);
454866e0
LC
385 return '#';
386 }
387 break;
388
0f2d19dd 389 case SCM_LINE_INCREMENTORS:
0f2d19dd 390 case SCM_SINGLE_SPACES:
0f2d19dd 391 case '\t':
0f2d19dd 392 break;
7337d56d 393
0f2d19dd
JB
394 default:
395 return c;
396 }
7337d56d
LC
397
398 return 0;
0f2d19dd
JB
399}
400
401
7337d56d
LC
402\f
403/* Token readers. */
1cc91f1b 404
b1b5433d
MW
405static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
406static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
407 long line, int column);
0f2d19dd 408
0f2d19dd 409
d6cb0203 410static SCM
b1b5433d
MW
411maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
412 long line, int column)
d6cb0203 413{
3ddd4381
AW
414 /* This condition can be caused by a user calling
415 set-port-column!. */
416 if (line < 0 || column < 0)
417 return x;
418
b1b5433d 419 if (opts->record_positions_p)
d6cb0203
MW
420 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
421 return x;
422}
0f2d19dd 423
09a4f039 424static SCM
b1b5433d 425scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 426#define FUNC_NAME "scm_i_lreadparen"
09a4f039 427{
26c8cc14
AW
428 int c;
429 SCM tmp, tl, ans = SCM_EOL;
bf9eb54a
MW
430 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
431 const int terminating_char = ((chr == '{') ? '}'
432 : ((chr == '[') ? ']'
433 : ')'));
7337d56d
LC
434
435 /* Need to capture line and column numbers here. */
436 long line = SCM_LINUM (port);
437 int column = SCM_COL (port) - 1;
f9c68a47 438
b1b5433d 439 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
440 if (terminating_char == c)
441 return SCM_EOL;
f9c68a47 442
c932ce0b 443 scm_ungetc_unlocked (c, port);
b1b5433d 444 tmp = scm_read_expression (port, opts);
1f7945a7
AW
445
446 /* Note that it is possible for scm_read_expression to return
447 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
448 check that it's a real dot by checking `c'. */
449 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d 450 {
b1b5433d
MW
451 ans = scm_read_expression (port, opts);
452 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
7337d56d
LC
453 scm_i_input_error (FUNC_NAME, port, "missing close paren",
454 SCM_EOL);
455 return ans;
456 }
1cc91f1b 457
7337d56d
LC
458 /* Build the head of the list structure. */
459 ans = tl = scm_cons (tmp, SCM_EOL);
460
b1b5433d 461 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
0f2d19dd 462 {
7337d56d 463 SCM new_tail;
0f2d19dd 464
bf9eb54a
MW
465 if (c == ')' || (c == ']' && opts->square_brackets_p)
466 || ((c == '}' || c == ']') && opts->curly_infix_p))
5b69315e
AW
467 scm_i_input_error (FUNC_NAME, port,
468 "in pair: mismatched close paren: ~A",
469 scm_list_1 (SCM_MAKE_CHAR (c)));
470
c932ce0b 471 scm_ungetc_unlocked (c, port);
b1b5433d 472 tmp = scm_read_expression (port, opts);
5b69315e 473
1f7945a7
AW
474 /* See above note about scm_sym_dot. */
475 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 476 {
b1b5433d 477 SCM_SETCDR (tl, scm_read_expression (port, opts));
7337d56d 478
b1b5433d 479 c = flush_ws (port, opts, FUNC_NAME);
7337d56d
LC
480 if (terminating_char != c)
481 scm_i_input_error (FUNC_NAME, port,
482 "in pair: missing close paren", SCM_EOL);
bf9eb54a 483 break;
0f2d19dd 484 }
b858464a 485
7337d56d
LC
486 new_tail = scm_cons (tmp, SCM_EOL);
487 SCM_SETCDR (tl, new_tail);
488 tl = new_tail;
7337d56d 489 }
0f2d19dd 490
bf9eb54a
MW
491 if (curly_list_p)
492 {
493 /* In addition to finding the length, 'scm_ilength' checks for
494 improper or circular lists, in which case it returns -1. */
495 int len = scm_ilength (ans);
496
497 /* The (len == 0) case is handled above */
498 if (len == 1)
499 /* Return directly to avoid re-annotating the element's source
500 location with the position of the outer brace. Also, it
501 might not be possible to annotate the element. */
502 return scm_car (ans); /* {e} => e */
503 else if (len == 2)
504 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
505 else if (len >= 3 && (len & 1))
506 {
507 /* It's a proper list whose length is odd and at least 3. If
508 the elements at odd indices (the infix operator positions)
509 are all 'equal?', then it's a simple curly-infix list.
510 Otherwise it's a mixed curly-infix list. */
511 SCM op = scm_cadr (ans);
512
513 /* Check to see if the elements at odd indices are 'equal?' */
514 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
515 {
516 if (scm_is_null (tl))
517 {
518 /* Convert simple curly-infix list to prefix:
519 {a <op> b <op> ...} => (<op> a b ...) */
520 tl = ans;
521 while (scm_is_pair (scm_cdr (tl)))
522 {
523 tmp = scm_cddr (tl);
524 SCM_SETCDR (tl, tmp);
525 tl = tmp;
526 }
527 ans = scm_cons (op, ans);
528 break;
529 }
530 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
531 {
532 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
533 ans = scm_cons (sym_nfx, ans);
534 break;
535 }
536 }
537 }
538 else
539 /* Mixed curly-infix (possibly improper) list:
540 {e . tail} => ($nfx$ e . tail) */
541 ans = scm_cons (sym_nfx, ans);
542 }
543
b1b5433d 544 return maybe_annotate_source (ans, port, opts, line, column);
7337d56d
LC
545}
546#undef FUNC_NAME
a4022e69 547
c5661d28
MG
548
549/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
550 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
551 found. */
552#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
553 do \
554 { \
555 scm_t_wchar a; \
556 size_t i = 0; \
557 c = 0; \
558 while (i < ndigits) \
559 { \
be632904 560 a = scm_getc_unlocked (port); \
dea901d6
MG
561 if (a == EOF) \
562 goto str_eof; \
563 if (terminator \
564 && (a == (scm_t_wchar) terminator) \
565 && (i > 0)) \
566 break; \
567 if ('0' <= a && a <= '9') \
568 a -= '0'; \
569 else if ('A' <= a && a <= 'F') \
570 a = a - 'A' + 10; \
571 else if ('a' <= a && a <= 'f') \
572 a = a - 'a' + 10; \
573 else \
574 { \
575 c = a; \
576 goto bad_escaped; \
577 } \
578 c = c * 16 + a; \
579 i ++; \
580 } \
c5661d28
MG
581 } while (0)
582
684d664e
AW
583static void
584skip_intraline_whitespace (SCM port)
585{
586 scm_t_wchar c;
587
588 do
589 {
be632904 590 c = scm_getc_unlocked (port);
684d664e
AW
591 if (c == EOF)
592 return;
593 }
594 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
595
c932ce0b 596 scm_ungetc_unlocked (c, port);
684d664e
AW
597}
598
dc59631d
MW
599/* Read either a double-quoted string or an R7RS-style symbol delimited
600 by vertical lines, depending on the value of 'chr' ('"' or '|').
601 Regardless, the result is always returned as a string. */
7337d56d 602static SCM
dc59631d 603scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
604#define FUNC_NAME "scm_lreadr"
605{
606 /* For strings smaller than C_STR, this function creates only one Scheme
607 object (the string returned). */
0f2d19dd 608
ff4d3672
LC
609 SCM str = SCM_EOL;
610 size_t c_str_len = 0;
611 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 612
b131b233
MW
613 /* Need to capture line and column numbers here. */
614 long line = SCM_LINUM (port);
615 int column = SCM_COL (port) - 1;
616
c92ee2b3 617 while (chr != (c = scm_getc_unlocked (port)))
7337d56d
LC
618 {
619 if (c == EOF)
9c44cd45
MG
620 {
621 str_eof:
622 scm_i_input_error (FUNC_NAME, port,
dc59631d
MW
623 (chr == '|'
624 ? "end of file in symbol"
625 : "end of file in string constant"),
626 SCM_EOL);
9c44cd45 627 }
0f2d19dd 628
ff4d3672
LC
629 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
630 {
631 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
632 c_str_len = 0;
633 }
0f2d19dd 634
7337d56d 635 if (c == '\\')
9c44cd45 636 {
be632904 637 switch (c = scm_getc_unlocked (port))
9c44cd45
MG
638 {
639 case EOF:
640 goto str_eof;
6579c330 641 case '|':
9c44cd45 642 case '\\':
5af307de
MW
643 case '(': /* Accept "\(" for use at the beginning of lines
644 in multiline strings to avoid confusing emacs
645 lisp modes. */
9c44cd45 646 break;
9c44cd45 647 case '\n':
b1b5433d 648 if (opts->hungry_eol_escapes_p)
684d664e 649 skip_intraline_whitespace (port);
9c44cd45
MG
650 continue;
651 case '0':
652 c = '\0';
653 break;
654 case 'f':
655 c = '\f';
656 break;
657 case 'n':
658 c = '\n';
659 break;
660 case 'r':
661 c = '\r';
662 break;
663 case 't':
664 c = '\t';
665 break;
666 case 'a':
667 c = '\007';
668 break;
669 case 'v':
670 c = '\v';
671 break;
67a4a16d
MG
672 case 'b':
673 c = '\010';
674 break;
9c44cd45 675 case 'x':
dc59631d 676 if (opts->r6rs_escapes_p || chr == '|')
dea901d6
MG
677 SCM_READ_HEX_ESCAPE (10, ';');
678 else
679 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 680 break;
9c44cd45 681 case 'u':
b1b5433d 682 if (!opts->r6rs_escapes_p)
898a0b5a
MG
683 {
684 SCM_READ_HEX_ESCAPE (4, '\0');
685 break;
686 }
9c44cd45 687 case 'U':
b1b5433d 688 if (!opts->r6rs_escapes_p)
898a0b5a
MG
689 {
690 SCM_READ_HEX_ESCAPE (6, '\0');
691 break;
692 }
9c44cd45 693 default:
dc59631d
MW
694 if (c == chr)
695 break;
9c44cd45
MG
696 bad_escaped:
697 scm_i_input_error (FUNC_NAME, port,
698 "illegal character in escape sequence: ~S",
699 scm_list_1 (SCM_MAKE_CHAR (c)));
700 }
701 }
ff4d3672
LC
702
703 c_str[c_str_len++] = c;
7337d56d 704 }
ff4d3672
LC
705
706 if (scm_is_null (str))
707 /* Fast path: we got a string that fits in C_STR. */
708 str = scm_from_utf32_stringn (c_str, c_str_len);
709 else
710 {
711 if (c_str_len > 0)
712 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
713
714 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
715 }
716
b1b5433d 717 return maybe_annotate_source (str, port, opts, line, column);
0f2d19dd 718}
db4b4ca6
DH
719#undef FUNC_NAME
720
dc59631d
MW
721static SCM
722scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
723{
724 return scm_read_string_like_syntax (chr, port, opts);
725}
726
727static SCM
728scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
729{
730 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
731}
0f2d19dd 732
7337d56d 733static SCM
b1b5433d 734scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
0f2d19dd 735{
69f90b0b 736 SCM result, str = SCM_EOL;
b662b7e9 737 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 738 size_t bytes_read;
0f2d19dd 739
38f19074
MW
740 /* Need to capture line and column numbers here. */
741 long line = SCM_LINUM (port);
742 int column = SCM_COL (port) - 1;
743
c932ce0b 744 scm_ungetc_unlocked (chr, port);
b1b5433d 745 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 746 &bytes_read);
69f90b0b 747
08467a7e 748 str = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b
MG
749
750 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 751 if (scm_is_false (result))
69f90b0b
MG
752 {
753 /* Return a symbol instead of a number */
b1b5433d 754 if (opts->case_insensitive_p)
69f90b0b
MG
755 str = scm_string_downcase_x (str);
756 result = scm_string_to_symbol (str);
757 }
38f19074 758 else if (SCM_NIMP (result))
b1b5433d 759 result = maybe_annotate_source (result, port, opts, line, column);
0f2d19dd 760
69f90b0b 761 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
762 return result;
763}
0f2d19dd 764
7337d56d 765static SCM
b1b5433d 766scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d 767{
889975e5
MG
768 SCM result;
769 int ends_with_colon = 0;
69f90b0b 770 size_t bytes_read;
b1b5433d 771 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
b662b7e9 772 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 773 SCM str;
7337d56d 774
c932ce0b 775 scm_ungetc_unlocked (chr, port);
b1b5433d 776 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 777 &bytes_read);
69f90b0b 778 if (bytes_read > 0)
b662b7e9 779 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 780
69f90b0b
MG
781 if (postfix && ends_with_colon && (bytes_read > 1))
782 {
08467a7e 783 str = scm_from_port_stringn (buffer, bytes_read - 1, port);
69f90b0b 784
b1b5433d 785 if (opts->case_insensitive_p)
69f90b0b
MG
786 str = scm_string_downcase_x (str);
787 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
788 }
7337d56d 789 else
69f90b0b 790 {
08467a7e 791 str = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b 792
b1b5433d 793 if (opts->case_insensitive_p)
69f90b0b
MG
794 str = scm_string_downcase_x (str);
795 result = scm_string_to_symbol (str);
796 }
7337d56d 797
69f90b0b 798 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
799 return result;
800}
801
802static SCM
b1b5433d 803scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
804#define FUNC_NAME "scm_lreadr"
805{
889975e5 806 SCM result;
7337d56d 807 size_t read;
b662b7e9 808 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 809 unsigned int radix;
69f90b0b 810 SCM str;
7337d56d
LC
811
812 switch (chr)
813 {
814 case 'B':
815 case 'b':
816 radix = 2;
817 break;
818
819 case 'o':
820 case 'O':
821 radix = 8;
822 break;
823
824 case 'd':
825 case 'D':
826 radix = 10;
827 break;
828
829 case 'x':
830 case 'X':
831 radix = 16;
832 break;
833
834 default:
c932ce0b
AW
835 scm_ungetc_unlocked (chr, port);
836 scm_ungetc_unlocked ('#', port);
7337d56d
LC
837 radix = 10;
838 }
839
b1b5433d 840 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
b662b7e9 841 &read);
69f90b0b 842
08467a7e 843 str = scm_from_port_stringn (buffer, read, port);
69f90b0b
MG
844
845 result = scm_string_to_number (str, scm_from_uint (radix));
846
69f90b0b 847 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
848
849 if (scm_is_true (result))
850 return result;
851
852 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
853
854 return SCM_BOOL_F;
855}
856#undef FUNC_NAME
857
858static SCM
b1b5433d 859scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
860{
861 SCM p;
492faee1
LC
862 long line = SCM_LINUM (port);
863 int column = SCM_COL (port) - 1;
7337d56d
LC
864
865 switch (chr)
866 {
867 case '`':
868 p = scm_sym_quasiquote;
869 break;
870
871 case '\'':
872 p = scm_sym_quote;
873 break;
874
875 case ',':
876 {
889975e5 877 scm_t_wchar c;
7337d56d 878
be632904 879 c = scm_getc_unlocked (port);
7337d56d
LC
880 if ('@' == c)
881 p = scm_sym_uq_splicing;
882 else
0f2d19dd 883 {
c932ce0b 884 scm_ungetc_unlocked (c, port);
7337d56d 885 p = scm_sym_unquote;
0f2d19dd 886 }
7337d56d
LC
887 break;
888 }
0f2d19dd 889
7337d56d
LC
890 default:
891 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 892 "scm_read_quote", chr);
7337d56d 893 abort ();
0f2d19dd 894 }
1cc91f1b 895
b1b5433d
MW
896 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
897 return maybe_annotate_source (p, port, opts, line, column);
7337d56d
LC
898}
899
34f3d47d
AW
900SCM_SYMBOL (sym_syntax, "syntax");
901SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
902SCM_SYMBOL (sym_unsyntax, "unsyntax");
903SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
904
905static SCM
b1b5433d 906scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
34f3d47d
AW
907{
908 SCM p;
909 long line = SCM_LINUM (port);
910 int column = SCM_COL (port) - 1;
911
912 switch (chr)
913 {
914 case '`':
915 p = sym_quasisyntax;
916 break;
917
918 case '\'':
919 p = sym_syntax;
920 break;
921
922 case ',':
923 {
924 int c;
925
be632904 926 c = scm_getc_unlocked (port);
34f3d47d
AW
927 if ('@' == c)
928 p = sym_unsyntax_splicing;
929 else
930 {
c932ce0b 931 scm_ungetc_unlocked (c, port);
34f3d47d
AW
932 p = sym_unsyntax;
933 }
934 break;
935 }
936
937 default:
938 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
939 "scm_read_syntax", chr);
940 abort ();
941 }
942
b1b5433d
MW
943 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
944 return maybe_annotate_source (p, port, opts, line, column);
34f3d47d
AW
945}
946
cfd15439 947static SCM
b1b5433d 948scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
7c4aad9c 949{
b1b5433d 950 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
7c4aad9c
AW
951
952 if (!scm_is_eq (id, sym_nil))
953 scm_i_input_error ("scm_read_nil", port,
954 "unexpected input while reading #nil: ~a",
955 scm_list_1 (id));
956
957 return SCM_ELISP_NIL;
958}
959
cfd15439 960static SCM
7337d56d 961scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 962{
0f2d19dd
JB
963 int c;
964
889975e5
MG
965 /* We use the get_byte here because there is no need to get the
966 locale correct with comment input. This presumes that newline
967 always represents itself no matter what the encoding is. */
0d959103 968 for (c = scm_get_byte_or_eof_unlocked (port);
7337d56d 969 (c != EOF) && (c != '\n');
0d959103 970 c = scm_get_byte_or_eof_unlocked (port));
7337d56d
LC
971
972 return SCM_UNSPECIFIED;
973}
974
7a329029
MW
975/* If the EXPECTED_CHARS are the next ones available from PORT, then
976 consume them and return 1. Otherwise leave the port position where
977 it was and return 0. EXPECTED_CHARS should be all lowercase, and
978 will be matched case-insensitively against the characters read from
979 PORT. */
980static int
981try_read_ci_chars (SCM port, const char *expected_chars)
982{
983 int num_chars_wanted = strlen (expected_chars);
984 int num_chars_read = 0;
985 char *chars_read = alloca (num_chars_wanted);
986 int c;
987
988 while (num_chars_read < num_chars_wanted)
989 {
cb8aaef4 990 c = scm_getc_unlocked (port);
7a329029
MW
991 if (c == EOF)
992 break;
0fc54828 993 else if (c_tolower (c) != expected_chars[num_chars_read])
7a329029 994 {
cb8aaef4 995 scm_ungetc_unlocked (c, port);
7a329029
MW
996 break;
997 }
998 else
999 chars_read[num_chars_read++] = c;
1000 }
1001
1002 if (num_chars_read == num_chars_wanted)
1003 return 1;
1004 else
1005 {
1006 while (num_chars_read > 0)
cb8aaef4 1007 scm_ungetc_unlocked (chars_read[--num_chars_read], port);
7a329029
MW
1008 return 0;
1009 }
1010}
1011
7337d56d
LC
1012\f
1013/* Sharp readers, i.e. readers called after a `#' sign has been read. */
1014
1015static SCM
1016scm_read_boolean (int chr, SCM port)
1017{
1018 switch (chr)
0f2d19dd 1019 {
7337d56d
LC
1020 case 't':
1021 case 'T':
7a329029 1022 try_read_ci_chars (port, "rue");
7337d56d
LC
1023 return SCM_BOOL_T;
1024
1025 case 'f':
1026 case 'F':
7a329029 1027 try_read_ci_chars (port, "alse");
7337d56d 1028 return SCM_BOOL_F;
0f2d19dd 1029 }
7337d56d
LC
1030
1031 return SCM_UNSPECIFIED;
1032}
1033
1034static SCM
b1b5433d 1035scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1036#define FUNC_NAME "scm_lreadr"
1037{
69f90b0b
MG
1038 char buffer[READER_CHAR_NAME_MAX_SIZE];
1039 SCM charname;
1040 size_t charname_len, bytes_read;
889975e5
MG
1041 scm_t_wchar cp;
1042 int overflow;
f6f4feb0 1043 scm_t_port_internal *pti;
889975e5 1044
b1b5433d
MW
1045 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1046 &bytes_read);
889975e5 1047 if (overflow)
c1a0ba1c 1048 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 1049
69f90b0b 1050 if (bytes_read == 0)
0f2d19dd 1051 {
be632904 1052 chr = scm_getc_unlocked (port);
7337d56d
LC
1053 if (chr == EOF)
1054 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1055 "while reading character", SCM_EOL);
1056
1057 /* CHR must be a token delimiter, like a whitespace. */
1058 return (SCM_MAKE_CHAR (chr));
0f2d19dd 1059 }
7337d56d 1060
f6f4feb0 1061 pti = SCM_PORT_GET_INTERNAL (port);
7337d56d 1062
69f90b0b
MG
1063 /* Simple ASCII characters can be processed immediately. Also, simple
1064 ISO-8859-1 characters can be processed immediately if the encoding for this
1065 port is ISO-8859-1. */
d1b9f8ac
AW
1066 if (bytes_read == 1 &&
1067 ((unsigned char) buffer[0] <= 127
f6f4feb0 1068 || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
69f90b0b
MG
1069 {
1070 SCM_COL (port) += 1;
1071 return SCM_MAKE_CHAR (buffer[0]);
1072 }
1073
1074 /* Otherwise, convert the buffer into a proper scheme string for
1075 processing. */
08467a7e 1076 charname = scm_from_port_stringn (buffer, bytes_read, port);
69f90b0b
MG
1077 charname_len = scm_i_string_length (charname);
1078 SCM_COL (port) += charname_len;
889975e5 1079 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
1080 if (charname_len == 1)
1081 return SCM_MAKE_CHAR (cp);
1082
1083 /* Ignore dotted circles, which may be used to keep combining characters from
1084 combining with the backslash in #\charname. */
0dcd7e61
MG
1085 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1086 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1087
889975e5 1088 if (cp >= '0' && cp < '8')
7337d56d
LC
1089 {
1090 /* Dirk:FIXME:: This type of character syntax is not R5RS
1091 * compliant. Further, it should be verified that the constant
0ffc78e3 1092 * does only consist of octal digits. */
889975e5 1093 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 1094 if (SCM_I_INUMP (p))
0ffc78e3 1095 {
e25f3727 1096 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
1097 if (SCM_IS_UNICODE_CHAR (c))
1098 return SCM_MAKE_CHAR (c);
1099 else
0f3a70cf 1100 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
1101 "out-of-range octal character escape: ~a",
1102 scm_list_1 (charname));
1103 }
7337d56d
LC
1104 }
1105
0f3a70cf 1106 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1107 {
1108 SCM p;
0f3a70cf 1109
dea901d6
MG
1110 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1111 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1112 scm_from_uint (16));
1113 if (SCM_I_INUMP (p))
1114 {
e25f3727 1115 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1116 if (SCM_IS_UNICODE_CHAR (c))
1117 return SCM_MAKE_CHAR (c);
1118 else
1119 scm_i_input_error (FUNC_NAME, port,
1120 "out-of-range hex character escape: ~a",
1121 scm_list_1 (charname));
1122 }
1123 }
1124
889975e5
MG
1125 /* The names of characters should never have non-Latin1
1126 characters. */
1127 if (scm_i_is_narrow_string (charname)
1128 || scm_i_try_narrow_string (charname))
4769c9db
AW
1129 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1130 charname_len);
1131 if (scm_is_true (ch))
1132 return ch;
1133 }
7337d56d 1134
7337d56d 1135 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1136 scm_list_1 (charname));
7337d56d
LC
1137
1138 return SCM_UNSPECIFIED;
0f2d19dd 1139}
db4b4ca6 1140#undef FUNC_NAME
0f2d19dd 1141
cfd15439 1142static SCM
b1b5433d 1143scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1144{
1145 SCM symbol;
1146
1147 /* Read the symbol that comprises the keyword. Doing this instead of
1148 invoking a specific symbol reader function allows `scm_read_keyword ()'
1149 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1150
7337d56d 1151 XXX: This implementation allows sloppy syntaxes like `#: key'. */
b1b5433d 1152 symbol = scm_read_expression (port, opts);
7337d56d 1153 if (!scm_is_symbol (symbol))
7f74cf9a 1154 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1155 "keyword prefix `~a' not followed by a symbol: ~s",
1156 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1157
1158 return (scm_symbol_to_keyword (symbol));
1159}
1160
cfd15439 1161static SCM
b1b5433d
MW
1162scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1163 long line, int column)
09a4f039 1164{
7337d56d
LC
1165 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1166 guarantee that it's going to do what we want. After all, this is an
1167 implementation detail of `scm_read_vector ()', not a desirable
1168 property. */
b1b5433d
MW
1169 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1170 port, opts, line, column);
b131b233
MW
1171}
1172
493ceb99
MW
1173/* Helper used by scm_read_array */
1174static int
1175read_decimal_integer (SCM port, int c, ssize_t *resp)
1176{
1177 ssize_t sign = 1;
1178 ssize_t res = 0;
1179 int got_it = 0;
1180
1181 if (c == '-')
1182 {
1183 sign = -1;
fa980bcc 1184 c = scm_getc_unlocked (port);
493ceb99
MW
1185 }
1186
1187 while ('0' <= c && c <= '9')
1188 {
a662686a
MW
1189 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1190 scm_i_input_error ("read_decimal_integer", port,
1191 "number too large", SCM_EOL);
493ceb99
MW
1192 res = 10*res + c-'0';
1193 got_it = 1;
fa980bcc 1194 c = scm_getc_unlocked (port);
493ceb99
MW
1195 }
1196
1197 if (got_it)
1198 *resp = sign * res;
1199 return c;
7337d56d 1200}
09a4f039 1201
493ceb99
MW
1202/* Read an array. This function can also read vectors and uniform
1203 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1204 handled here.
1205
10744b7c 1206 C is the first character read after the '#'. */
b131b233 1207static SCM
b1b5433d 1208scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
7337d56d 1209{
493ceb99
MW
1210 ssize_t rank;
1211 scm_t_wchar tag_buf[8];
1212 int tag_len;
1213
1214 SCM tag, shape = SCM_BOOL_F, elements, array;
1215
1216 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1217 the array code can not deal with zero-length dimensions yet, and
10744b7c 1218 we want to allow zero-length vectors, of course. */
493ceb99 1219 if (c == '(')
b1b5433d 1220 return scm_read_vector (c, port, opts, line, column);
493ceb99 1221
10744b7c 1222 /* Disambiguate between '#f' and uniform floating point vectors. */
493ceb99
MW
1223 if (c == 'f')
1224 {
fa980bcc 1225 c = scm_getc_unlocked (port);
493ceb99
MW
1226 if (c != '3' && c != '6')
1227 {
7a329029
MW
1228 if (c == 'a' && try_read_ci_chars (port, "lse"))
1229 return SCM_BOOL_F;
1230 else if (c != EOF)
cb8aaef4 1231 scm_ungetc_unlocked (c, port);
493ceb99
MW
1232 return SCM_BOOL_F;
1233 }
1234 rank = 1;
1235 tag_buf[0] = 'f';
1236 tag_len = 1;
1237 goto continue_reading_tag;
1238 }
1239
1240 /* Read rank. */
1241 rank = 1;
1242 c = read_decimal_integer (port, c, &rank);
1243 if (rank < 0)
1244 scm_i_input_error (NULL, port, "array rank must be non-negative",
1245 SCM_EOL);
1246
1247 /* Read tag. */
1248 tag_len = 0;
1249 continue_reading_tag:
1250 while (c != EOF && c != '(' && c != '@' && c != ':'
1251 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1252 {
1253 tag_buf[tag_len++] = c;
fa980bcc 1254 c = scm_getc_unlocked (port);
493ceb99
MW
1255 }
1256 if (tag_len == 0)
1257 tag = SCM_BOOL_T;
b131b233 1258 else
493ceb99
MW
1259 {
1260 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1261 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1262 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1263 scm_list_1 (tag));
1264 }
1265
1266 /* Read shape. */
1267 if (c == '@' || c == ':')
1268 {
1269 shape = SCM_EOL;
1270
1271 do
1272 {
1273 ssize_t lbnd = 0, len = 0;
1274 SCM s;
1275
1276 if (c == '@')
1277 {
fa980bcc 1278 c = scm_getc_unlocked (port);
493ceb99
MW
1279 c = read_decimal_integer (port, c, &lbnd);
1280 }
1281
1282 s = scm_from_ssize_t (lbnd);
1283
1284 if (c == ':')
1285 {
fa980bcc 1286 c = scm_getc_unlocked (port);
493ceb99
MW
1287 c = read_decimal_integer (port, c, &len);
1288 if (len < 0)
1289 scm_i_input_error (NULL, port,
1290 "array length must be non-negative",
1291 SCM_EOL);
1292
1293 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1294 }
1295
1296 shape = scm_cons (s, shape);
1297 } while (c == '@' || c == ':');
1298
1299 shape = scm_reverse_x (shape, SCM_EOL);
1300 }
1301
1302 /* Read nested lists of elements. */
1303 if (c != '(')
1304 scm_i_input_error (NULL, port,
1305 "missing '(' in vector or array literal",
1306 SCM_EOL);
b1b5433d 1307 elements = scm_read_sexp (c, port, opts);
493ceb99
MW
1308
1309 if (scm_is_false (shape))
1310 shape = scm_from_ssize_t (rank);
1311 else if (scm_ilength (shape) != rank)
1312 scm_i_input_error
1313 (NULL, port,
1314 "the number of shape specifications must match the array rank",
1315 SCM_EOL);
1316
1317 /* Handle special print syntax of rank zero arrays; see
1318 scm_i_print_array for a rationale. */
1319 if (rank == 0)
1320 {
1321 if (!scm_is_pair (elements))
1322 scm_i_input_error (NULL, port,
1323 "too few elements in array literal, need 1",
1324 SCM_EOL);
1325 if (!scm_is_null (SCM_CDR (elements)))
1326 scm_i_input_error (NULL, port,
1327 "too many elements in array literal, want 1",
1328 SCM_EOL);
1329 elements = SCM_CAR (elements);
1330 }
1331
1332 /* Construct array, annotate with source location, and return. */
1333 array = scm_list_to_typed_array (tag, shape, elements);
b1b5433d 1334 return maybe_annotate_source (array, port, opts, line, column);
7337d56d
LC
1335}
1336
0ba0b384 1337static SCM
b1b5433d
MW
1338scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1339 long line, int column)
7337d56d 1340{
b1b5433d 1341 return scm_read_array (chr, port, opts, line, column);
7337d56d
LC
1342}
1343
0ba0b384 1344static SCM
b1b5433d
MW
1345scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1346 long line, int column)
0ba0b384 1347{
be632904 1348 chr = scm_getc_unlocked (port);
0ba0b384
LC
1349 if (chr != 'u')
1350 goto syntax;
1351
be632904 1352 chr = scm_getc_unlocked (port);
0ba0b384
LC
1353 if (chr != '8')
1354 goto syntax;
1355
be632904 1356 chr = scm_getc_unlocked (port);
0ba0b384
LC
1357 if (chr != '(')
1358 goto syntax;
1359
b131b233 1360 return maybe_annotate_source
b1b5433d
MW
1361 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1362 port, opts, line, column);
0ba0b384
LC
1363
1364 syntax:
1365 scm_i_input_error ("read_bytevector", port,
1366 "invalid bytevector prefix",
1367 SCM_MAKE_CHAR (chr));
1368 return SCM_UNSPECIFIED;
1369}
1370
7337d56d 1371static SCM
b1b5433d
MW
1372scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1373 long line, int column)
7337d56d
LC
1374{
1375 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1376 terribly inefficient but who cares? */
1377 SCM s_bits = SCM_EOL;
1378
be632904 1379 for (chr = scm_getc_unlocked (port);
7337d56d 1380 (chr != EOF) && ((chr == '0') || (chr == '1'));
be632904 1381 chr = scm_getc_unlocked (port))
09a4f039 1382 {
7337d56d 1383 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1384 }
7337d56d
LC
1385
1386 if (chr != EOF)
c932ce0b 1387 scm_ungetc_unlocked (chr, port);
7337d56d 1388
b131b233
MW
1389 return maybe_annotate_source
1390 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
b1b5433d 1391 port, opts, line, column);
7337d56d
LC
1392}
1393
cfd15439 1394static SCM
889975e5 1395scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1396{
1397 int bang_seen = 0;
1398
1399 for (;;)
09a4f039 1400 {
be632904 1401 int c = scm_getc_unlocked (port);
62850ef3 1402
7337d56d
LC
1403 if (c == EOF)
1404 scm_i_input_error ("skip_block_comment", port,
1405 "unterminated `#! ... !#' comment", SCM_EOL);
1406
1407 if (c == '!')
1408 bang_seen = 1;
1409 else if (c == '#' && bang_seen)
1410 break;
1411 else
1412 bang_seen = 0;
1413 }
1414
1415 return SCM_UNSPECIFIED;
1416}
1417
9331ffd8
MW
1418static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1419 int value);
bf9eb54a
MW
1420static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1421 int value);
1422static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1423 int value);
9331ffd8 1424
d7fcaec3 1425static SCM
b1b5433d 1426scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
911b03b2 1427{
02327c0c
MW
1428 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1429 int c;
1430 int i = 0;
1431
1432 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
911b03b2 1433 {
fa980bcc 1434 c = scm_getc_unlocked (port);
02327c0c
MW
1435 if (c == EOF)
1436 scm_i_input_error ("skip_block_comment", port,
1437 "unterminated `#! ... !#' comment", SCM_EOL);
1438 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1439 name[i++] = c;
1440 else if (CHAR_IS_DELIMITER (c))
1441 {
fa980bcc 1442 scm_ungetc_unlocked (c, port);
02327c0c
MW
1443 name[i] = '\0';
1444 if (0 == strcmp ("r6rs", name))
1445 ; /* Silently ignore */
9331ffd8
MW
1446 else if (0 == strcmp ("fold-case", name))
1447 set_port_case_insensitive_p (port, opts, 1);
1448 else if (0 == strcmp ("no-fold-case", name))
1449 set_port_case_insensitive_p (port, opts, 0);
bf9eb54a
MW
1450 else if (0 == strcmp ("curly-infix", name))
1451 set_port_curly_infix_p (port, opts, 1);
1452 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1453 {
1454 set_port_curly_infix_p (port, opts, 1);
1455 set_port_square_brackets_p (port, opts, 0);
1456 }
02327c0c
MW
1457 else
1458 break;
1459
1460 return SCM_UNSPECIFIED;
1461 }
fa746547
MW
1462 else
1463 {
fa980bcc 1464 scm_ungetc_unlocked (c, port);
fa746547
MW
1465 break;
1466 }
911b03b2 1467 }
02327c0c 1468 while (i > 0)
fa980bcc 1469 scm_ungetc_unlocked (name[--i], port);
02327c0c 1470 return scm_read_scsh_block_comment (chr, port);
911b03b2
JG
1471}
1472
620c8965
LC
1473static SCM
1474scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1475{
1476 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1477 nested. So care must be taken. */
1478 int nesting_level = 1;
6d5f8c32 1479
be632904 1480 int a = scm_getc_unlocked (port);
6d5f8c32
AW
1481
1482 if (a == EOF)
1483 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1484 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1485
1486 while (nesting_level > 0)
1487 {
be632904 1488 int b = scm_getc_unlocked (port);
620c8965 1489
6d5f8c32 1490 if (b == EOF)
cd169c5a 1491 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1492 "unterminated `#| ... |#' comment", SCM_EOL);
1493
6d5f8c32
AW
1494 if (a == '|' && b == '#')
1495 {
1496 nesting_level--;
1497 b = EOF;
1498 }
1499 else if (a == '#' && b == '|')
1500 {
1501 nesting_level++;
1502 b = EOF;
1503 }
1504
1505 a = b;
620c8965
LC
1506 }
1507
1508 return SCM_UNSPECIFIED;
1509}
1510
34f3d47d 1511static SCM
b1b5433d
MW
1512scm_read_commented_expression (scm_t_wchar chr, SCM port,
1513 scm_t_read_opts *opts)
34f3d47d 1514{
889975e5 1515 scm_t_wchar c;
34f3d47d 1516
b1b5433d 1517 c = flush_ws (port, opts, (char *) NULL);
34f3d47d
AW
1518 if (EOF == c)
1519 scm_i_input_error ("read_commented_expression", port,
1520 "no expression after #; comment", SCM_EOL);
c932ce0b 1521 scm_ungetc_unlocked (c, port);
b1b5433d 1522 scm_read_expression (port, opts);
34f3d47d
AW
1523 return SCM_UNSPECIFIED;
1524}
1525
7337d56d 1526static SCM
889975e5 1527scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1528{
1529 /* Guile's extended symbol read syntax looks like this:
1530
1531 #{This is all a symbol name}#
1532
1533 So here, CHR is expected to be `{'. */
d9527cfa 1534 int saw_brace = 0;
7337d56d 1535 size_t len = 0;
190d4b0d 1536 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1537
889975e5 1538 buf = scm_i_string_start_writing (buf);
7337d56d 1539
be632904 1540 while ((chr = scm_getc_unlocked (port)) != EOF)
7337d56d
LC
1541 {
1542 if (saw_brace)
09a4f039 1543 {
7337d56d
LC
1544 if (chr == '#')
1545 {
7337d56d
LC
1546 break;
1547 }
1548 else
1549 {
1550 saw_brace = 0;
889975e5 1551 scm_i_string_set_x (buf, len++, '}');
7337d56d 1552 }
09a4f039 1553 }
d9527cfa
AW
1554
1555 if (chr == '}')
7337d56d 1556 saw_brace = 1;
d9527cfa
AW
1557 else if (chr == '\\')
1558 {
1559 /* It used to be that print.c would print extended-read-syntax
1560 symbols with backslashes before "non-standard" chars, but
1561 this routine wouldn't do anything with those escapes.
1562 Bummer. What we've done is to change print.c to output
1563 R6RS hex escapes for those characters, relying on the fact
1564 that the extended read syntax would never put a `\' before
1565 an `x'. For now, we just ignore other instances of
1566 backslash in the string. */
be632904 1567 switch ((chr = scm_getc_unlocked (port)))
d9527cfa
AW
1568 {
1569 case EOF:
1570 goto done;
1571 case 'x':
1572 {
1573 scm_t_wchar c;
1574
1575 SCM_READ_HEX_ESCAPE (10, ';');
1576 scm_i_string_set_x (buf, len++, c);
1577 break;
1578
1579 str_eof:
1580 chr = EOF;
1581 goto done;
1582
1583 bad_escaped:
1584 scm_i_string_stop_writing ();
1585 scm_i_input_error ("scm_read_extended_symbol", port,
1586 "illegal character in escape sequence: ~S",
1587 scm_list_1 (SCM_MAKE_CHAR (c)));
1588 break;
1589 }
1590 default:
1591 scm_i_string_set_x (buf, len++, chr);
1592 break;
1593 }
1594 }
7337d56d 1595 else
d9527cfa 1596 scm_i_string_set_x (buf, len++, chr);
62850ef3 1597
889975e5 1598 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1599 {
7f991c7d
LC
1600 SCM addy;
1601
889975e5 1602 scm_i_string_stop_writing ();
190d4b0d 1603 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1604 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1605 len = 0;
889975e5 1606 buf = scm_i_string_start_writing (buf);
7337d56d 1607 }
7337d56d 1608 }
d9527cfa
AW
1609
1610 done:
889975e5 1611 scm_i_string_stop_writing ();
d9527cfa
AW
1612 if (chr == EOF)
1613 scm_i_input_error ("scm_read_extended_symbol", port,
1614 "end of file while reading symbol", SCM_EOL);
7337d56d 1615
889975e5 1616 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1617}
1618
1619
1620\f
1621/* Top-level token readers, i.e., dispatchers. */
1622
1623static SCM
b1b5433d 1624scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
7337d56d
LC
1625{
1626 SCM proc;
1627
1628 proc = scm_get_hash_procedure (chr);
1629 if (scm_is_true (scm_procedure_p (proc)))
1630 {
1631 long line = SCM_LINUM (port);
1632 int column = SCM_COL (port) - 2;
1633 SCM got;
1634
1635 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14 1636
3655ed89
MW
1637 if (opts->record_positions_p && SCM_NIMP (got)
1638 && !scm_i_has_source_properties (got))
26c8cc14
AW
1639 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1640
1641 return got;
09a4f039 1642 }
7337d56d
LC
1643
1644 return SCM_UNSPECIFIED;
1645}
1646
1647/* The reader for the sharp `#' character. It basically dispatches reads
1648 among the above token readers. */
1649static SCM
b1b5433d
MW
1650scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1651 long line, int column)
7337d56d
LC
1652#define FUNC_NAME "scm_lreadr"
1653{
1654 SCM result;
1655
be632904 1656 chr = scm_getc_unlocked (port);
7337d56d 1657
b1b5433d 1658 result = scm_read_sharp_extension (chr, port, opts);
7337d56d
LC
1659 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1660 return result;
1661
1662 switch (chr)
1663 {
1664 case '\\':
b1b5433d 1665 return (scm_read_character (chr, port, opts));
7337d56d 1666 case '(':
b1b5433d 1667 return (scm_read_vector (chr, port, opts, line, column));
7337d56d
LC
1668 case 's':
1669 case 'u':
1670 case 'f':
df941b5b 1671 case 'c':
7337d56d 1672 /* This one may return either a boolean or an SRFI-4 vector. */
b1b5433d 1673 return (scm_read_srfi4_vector (chr, port, opts, line, column));
0ba0b384 1674 case 'v':
b1b5433d 1675 return (scm_read_bytevector (chr, port, opts, line, column));
7337d56d 1676 case '*':
b1b5433d 1677 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
7337d56d
LC
1678 case 't':
1679 case 'T':
1680 case 'F':
7337d56d
LC
1681 return (scm_read_boolean (chr, port));
1682 case ':':
b1b5433d 1683 return (scm_read_keyword (chr, port, opts));
7337d56d
LC
1684 case '0': case '1': case '2': case '3': case '4':
1685 case '5': case '6': case '7': case '8': case '9':
1686 case '@':
b1b5433d 1687 return (scm_read_array (chr, port, opts, line, column));
7337d56d
LC
1688
1689 case 'i':
1690 case 'e':
7337d56d
LC
1691 case 'b':
1692 case 'B':
1693 case 'o':
1694 case 'O':
1695 case 'd':
1696 case 'D':
1697 case 'x':
1698 case 'X':
1699 case 'I':
1700 case 'E':
b1b5433d 1701 return (scm_read_number_and_radix (chr, port, opts));
7337d56d
LC
1702 case '{':
1703 return (scm_read_extended_symbol (chr, port));
1704 case '!':
b1b5433d 1705 return (scm_read_shebang (chr, port, opts));
34f3d47d 1706 case ';':
b1b5433d 1707 return (scm_read_commented_expression (chr, port, opts));
34f3d47d
AW
1708 case '`':
1709 case '\'':
1710 case ',':
b1b5433d 1711 return (scm_read_syntax (chr, port, opts));
7c4aad9c 1712 case 'n':
b1b5433d 1713 return (scm_read_nil (chr, port, opts));
7337d56d 1714 default:
b1b5433d 1715 result = scm_read_sharp_extension (chr, port, opts);
7337d56d 1716 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1717 {
1718 /* To remain compatible with 1.8 and earlier, the following
1719 characters have lower precedence than `read-hash-extend'
1720 characters. */
1721 switch (chr)
1722 {
1723 case '|':
1724 return scm_read_r6rs_block_comment (chr, port);
1725 default:
1726 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1727 scm_list_1 (SCM_MAKE_CHAR (chr)));
1728 }
1729 }
7337d56d
LC
1730 else
1731 return result;
1732 }
1733
1734 return SCM_UNSPECIFIED;
1735}
1736#undef FUNC_NAME
1737
1738static SCM
bf9eb54a
MW
1739read_inner_expression (SCM port, scm_t_read_opts *opts)
1740#define FUNC_NAME "read_inner_expression"
7337d56d
LC
1741{
1742 while (1)
1743 {
cfd15439 1744 scm_t_wchar chr;
7337d56d 1745
be632904 1746 chr = scm_getc_unlocked (port);
7337d56d
LC
1747
1748 switch (chr)
1749 {
1750 case SCM_WHITE_SPACES:
1751 case SCM_LINE_INCREMENTORS:
1752 break;
1753 case ';':
1754 (void) scm_read_semicolon_comment (chr, port);
1755 break;
bf9eb54a
MW
1756 case '{':
1757 if (opts->curly_infix_p)
1758 {
1759 if (opts->neoteric_p)
1760 return scm_read_sexp (chr, port, opts);
1761 else
1762 {
1763 SCM expr;
1764
1765 /* Enable neoteric expressions within curly braces */
1766 opts->neoteric_p = 1;
1767 expr = scm_read_sexp (chr, port, opts);
1768 opts->neoteric_p = 0;
1769 return expr;
1770 }
1771 }
1772 else
1773 return scm_read_mixed_case_symbol (chr, port, opts);
5afa815c 1774 case '[':
bf9eb54a
MW
1775 if (opts->square_brackets_p)
1776 return scm_read_sexp (chr, port, opts);
1777 else if (opts->curly_infix_p)
1778 {
1779 /* The syntax of neoteric expressions requires that '[' be
1780 a delimiter when curly-infix is enabled, so it cannot
1781 be part of an unescaped symbol. We might as well do
1782 something useful with it, so we adopt Kawa's convention:
1783 [...] => ($bracket-list$ ...) */
1784 long line = SCM_LINUM (port);
1785 int column = SCM_COL (port) - 1;
1786 return maybe_annotate_source
1787 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1788 port, opts, line, column);
1789 }
1790 else
1791 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d 1792 case '(':
b1b5433d 1793 return (scm_read_sexp (chr, port, opts));
7337d56d 1794 case '"':
b1b5433d 1795 return (scm_read_string (chr, port, opts));
dc59631d
MW
1796 case '|':
1797 if (opts->r7rs_symbols_p)
1798 return scm_read_r7rs_symbol (chr, port, opts);
1799 else
1800 return scm_read_mixed_case_symbol (chr, port, opts);
7337d56d
LC
1801 case '\'':
1802 case '`':
1803 case ',':
b1b5433d 1804 return (scm_read_quote (chr, port, opts));
7337d56d
LC
1805 case '#':
1806 {
b131b233
MW
1807 long line = SCM_LINUM (port);
1808 int column = SCM_COL (port) - 1;
b1b5433d 1809 SCM result = scm_read_sharp (chr, port, opts, line, column);
7337d56d
LC
1810 if (scm_is_eq (result, SCM_UNSPECIFIED))
1811 /* We read a comment or some such. */
1812 break;
1813 else
1814 return result;
1815 }
1816 case ')':
1817 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1818 break;
bf9eb54a
MW
1819 case '}':
1820 if (opts->curly_infix_p)
1821 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1822 else
1823 return scm_read_mixed_case_symbol (chr, port, opts);
a4e47229 1824 case ']':
b1b5433d 1825 if (opts->square_brackets_p)
a4e47229
MG
1826 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1827 /* otherwise fall through */
7337d56d
LC
1828 case EOF:
1829 return SCM_EOF_VAL;
1830 case ':':
b1b5433d
MW
1831 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1832 return scm_symbol_to_keyword (scm_read_expression (port, opts));
7337d56d
LC
1833 /* Fall through. */
1834
1835 default:
1836 {
1837 if (((chr >= '0') && (chr <= '9'))
1838 || (strchr ("+-.", chr)))
b1b5433d 1839 return (scm_read_number (chr, port, opts));
7337d56d 1840 else
b1b5433d 1841 return (scm_read_mixed_case_symbol (chr, port, opts));
7337d56d
LC
1842 }
1843 }
1844 }
1845}
1846#undef FUNC_NAME
1847
bf9eb54a
MW
1848static SCM
1849scm_read_expression (SCM port, scm_t_read_opts *opts)
1850#define FUNC_NAME "scm_read_expression"
1851{
1852 if (!opts->neoteric_p)
1853 return read_inner_expression (port, opts);
1854 else
1855 {
1856 long line = 0;
1857 int column = 0;
1858 SCM expr;
1859
1860 if (opts->record_positions_p)
1861 {
1862 /* We need to get the position of the first non-whitespace
1863 character in order to correctly annotate neoteric
1864 expressions. For example, for the expression 'f(x)', the
1865 first call to 'read_inner_expression' reads the 'f' (which
1866 cannot be annotated), and then we later read the '(x)' and
1867 use it to construct the new list (f x). */
1868 int c = flush_ws (port, opts, (char *) NULL);
1869 if (c == EOF)
1870 return SCM_EOF_VAL;
fa980bcc 1871 scm_ungetc_unlocked (c, port);
bf9eb54a
MW
1872 line = SCM_LINUM (port);
1873 column = SCM_COL (port);
1874 }
1875
1876 expr = read_inner_expression (port, opts);
1877
1878 /* 'expr' is the first component of the neoteric expression. Now
1879 we loop, and as long as the next character is '(', '[', or '{',
1880 (without any intervening whitespace), we use it to construct a
1881 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1882 for (;;)
1883 {
fa980bcc 1884 int chr = scm_getc_unlocked (port);
bf9eb54a
MW
1885
1886 if (chr == '(')
1887 /* e(...) => (e ...) */
1888 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1889 else if (chr == '[')
1890 /* e[...] => ($bracket-apply$ e ...) */
1891 expr = scm_cons (sym_bracket_apply,
1892 scm_cons (expr,
1893 scm_read_sexp (chr, port, opts)));
1894 else if (chr == '{')
1895 {
1896 SCM arg = scm_read_sexp (chr, port, opts);
1897
1898 if (scm_is_null (arg))
1899 expr = scm_list_1 (expr); /* e{} => (e) */
1900 else
1901 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1902 }
1903 else
1904 {
1905 if (chr != EOF)
fa980bcc 1906 scm_ungetc_unlocked (chr, port);
bf9eb54a
MW
1907 break;
1908 }
1909 maybe_annotate_source (expr, port, opts, line, column);
1910 }
1911 return expr;
1912 }
1913}
1914#undef FUNC_NAME
1915
7337d56d
LC
1916\f
1917/* Actual reader. */
1918
ea8c9761
MW
1919static void init_read_options (SCM port, scm_t_read_opts *opts);
1920
7337d56d
LC
1921SCM_DEFINE (scm_read, "read", 0, 1, 0,
1922 (SCM port),
1923 "Read an s-expression from the input port @var{port}, or from\n"
1924 "the current input port if @var{port} is not specified.\n"
1925 "Any whitespace before the next token is discarded.")
1926#define FUNC_NAME s_scm_read
1927{
b1b5433d 1928 scm_t_read_opts opts;
7337d56d
LC
1929 int c;
1930
1931 if (SCM_UNBNDP (port))
1932 port = scm_current_input_port ();
1933 SCM_VALIDATE_OPINPORT (1, port);
1934
ea8c9761 1935 init_read_options (port, &opts);
b1b5433d
MW
1936
1937 c = flush_ws (port, &opts, (char *) NULL);
7337d56d
LC
1938 if (EOF == c)
1939 return SCM_EOF_VAL;
c932ce0b 1940 scm_ungetc_unlocked (c, port);
7337d56d 1941
b1b5433d 1942 return (scm_read_expression (port, &opts));
09a4f039 1943}
db4b4ca6 1944#undef FUNC_NAME
09a4f039 1945
0f2d19dd
JB
1946
1947\f
1948
14de3b42
GH
1949/* Manipulate the read-hash-procedures alist. This could be written in
1950 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1951SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1952 (SCM chr, SCM proc),
dc7fa443
MG
1953 "Install the procedure @var{proc} for reading expressions\n"
1954 "starting with the character sequence @code{#} and @var{chr}.\n"
1955 "@var{proc} will be called with two arguments: the character\n"
1956 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1957 "returned will be the return value of @code{read}. \n"
1958 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1959 )
1bbd0b84 1960#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1961{
fed9c9a2
GH
1962 SCM this;
1963 SCM prev;
1964
36284627 1965 SCM_VALIDATE_CHAR (1, chr);
7888309b 1966 SCM_ASSERT (scm_is_false (proc)
bc36d050 1967 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1968 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1969
14de3b42 1970 /* Check if chr is already in the alist. */
d458073b 1971 this = scm_i_read_hash_procedures_ref ();
14de3b42 1972 prev = SCM_BOOL_F;
fed9c9a2
GH
1973 while (1)
1974 {
d2e53ed6 1975 if (scm_is_null (this))
fed9c9a2
GH
1976 {
1977 /* not found, so add it to the beginning. */
7888309b 1978 if (scm_is_true (proc))
fed9c9a2 1979 {
d458073b
AR
1980 SCM new = scm_cons (scm_cons (chr, proc),
1981 scm_i_read_hash_procedures_ref ());
1982 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1983 }
1984 break;
1985 }
bc36d050 1986 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1987 {
1988 /* already in the alist. */
7888309b 1989 if (scm_is_false (proc))
14de3b42
GH
1990 {
1991 /* remove it. */
7888309b 1992 if (scm_is_false (prev))
14de3b42 1993 {
d458073b
AR
1994 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1995 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1996 }
1997 else
1998 scm_set_cdr_x (prev, SCM_CDR (this));
1999 }
fed9c9a2 2000 else
14de3b42
GH
2001 {
2002 /* replace it. */
2003 scm_set_cdr_x (SCM_CAR (this), proc);
2004 }
fed9c9a2
GH
2005 break;
2006 }
2007 prev = this;
2008 this = SCM_CDR (this);
2009 }
deca31e1 2010
deca31e1
GH
2011 return SCM_UNSPECIFIED;
2012}
1bbd0b84 2013#undef FUNC_NAME
0f2d19dd 2014
deca31e1
GH
2015/* Recover the read-hash procedure corresponding to char c. */
2016static SCM
6e8d25a6 2017scm_get_hash_procedure (int c)
deca31e1 2018{
d458073b 2019 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 2020
deca31e1
GH
2021 while (1)
2022 {
d2e53ed6 2023 if (scm_is_null (rest))
deca31e1
GH
2024 return SCM_BOOL_F;
2025
7866a09b 2026 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
2027 return SCM_CDAR (rest);
2028
2029 rest = SCM_CDR (rest);
2030 }
2031}
1cc91f1b 2032
93c4fa21
AW
2033static int
2034is_encoding_char (char c)
2035{
2036 if (c >= 'a' && c <= 'z') return 1;
2037 if (c >= 'A' && c <= 'Z') return 1;
2038 if (c >= '0' && c <= '9') return 1;
2039 return strchr ("_-.:/,+=()", c) != NULL;
2040}
2041
3ff8a9d6
LC
2042/* Maximum size of an encoding name. This is a bit more than the
2043 longest name listed at
2044 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2045 characters.) */
2046#define ENCODING_NAME_MAX_SIZE 20
889975e5 2047
3ff8a9d6
LC
2048/* Number of bytes at the beginning or end of a file that are scanned
2049 for a "coding:" declaration. */
2050#define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2051
2052
2053/* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2054 coding declaration. Returns either NULL or a string whose storage
2055 has been allocated with `scm_gc_malloc'. */
889975e5 2056char *
f8a1c9a8 2057scm_i_scan_for_encoding (SCM port)
889975e5 2058{
d900843c 2059 scm_t_port *pt;
889975e5 2060 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 2061 size_t bytes_read, encoding_length, i;
889975e5 2062 char *encoding = NULL;
daedbca7 2063 char *pos, *encoding_start;
889975e5
MG
2064 int in_comment;
2065
d900843c 2066 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 2067
d900843c 2068 if (pt->rw_active == SCM_PORT_WRITE)
4251ae2e 2069 scm_flush_unlocked (port);
49bb5bd3 2070
d900843c
AW
2071 if (pt->rw_random)
2072 pt->rw_active = SCM_PORT_READ;
2073
2074 if (pt->read_pos == pt->read_end)
2075 {
2076 /* We can use the read buffer, and thus avoid a seek. */
4251ae2e 2077 if (scm_fill_input_unlocked (port) == EOF)
d900843c
AW
2078 return NULL;
2079
2080 bytes_read = pt->read_end - pt->read_pos;
2081 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2082 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2083
2084 if (bytes_read <= 1)
2085 /* An unbuffered port -- don't scan. */
2086 return NULL;
2087
2088 memcpy (header, pt->read_pos, bytes_read);
2089 header[bytes_read] = '\0';
2090 }
2091 else
2092 {
2093 /* Try to read some bytes and then seek back. Not all ports
2094 support seeking back; and indeed some file ports (like
2095 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2096 check performed by SCM_FPORT_FDES---but fail to seek
2097 backwards. Hence this block comes second. We prefer to use
2098 the read buffer in-place. */
2099 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2100 return NULL;
2101
be632904 2102 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
d900843c
AW
2103 header[bytes_read] = '\0';
2104 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2105 }
889975e5 2106
889975e5
MG
2107 /* search past "coding[:=]" */
2108 pos = header;
2109 while (1)
2110 {
2111 if ((pos = strstr(pos, "coding")) == NULL)
2112 return NULL;
2113
3ff8a9d6
LC
2114 pos += strlen ("coding");
2115 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
889975e5
MG
2116 (*pos == ':' || *pos == '='))
2117 {
2118 pos ++;
2119 break;
2120 }
2121 }
2122
2123 /* skip spaces */
3ff8a9d6 2124 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
889975e5
MG
2125 (*pos == ' ' || *pos == '\t'))
2126 pos ++;
2127
3ff8a9d6
LC
2128 if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2129 /* We found the "coding:" string, but there is probably not enough
2130 room to store an encoding name in its entirety, so ignore it.
2131 This makes sure we do not end up returning a truncated encoding
2132 name. */
2133 return NULL;
2134
889975e5 2135 /* grab the next token */
daedbca7 2136 encoding_start = pos;
889975e5 2137 i = 0;
daedbca7
MG
2138 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2139 && encoding_start + i - header < bytes_read
93c4fa21 2140 && is_encoding_char (encoding_start[i]))
889975e5
MG
2141 i++;
2142
daedbca7
MG
2143 encoding_length = i;
2144 if (encoding_length == 0)
889975e5
MG
2145 return NULL;
2146
daedbca7 2147 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
889975e5
MG
2148
2149 /* push backwards to make sure we were in a comment */
2150 in_comment = 0;
daedbca7
MG
2151 pos = encoding_start;
2152 while (pos >= header)
889975e5 2153 {
8a12aeb9
AW
2154 if (*pos == ';')
2155 {
2156 in_comment = 1;
2157 break;
2158 }
2159 else if (*pos == '\n' || pos == header)
889975e5
MG
2160 {
2161 /* This wasn't in a semicolon comment. Check for a
2162 hash-bang comment. */
2163 char *beg = strstr (header, "#!");
2164 char *end = strstr (header, "!#");
8a12aeb9 2165 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
2166 in_comment = 1;
2167 break;
2168 }
8a12aeb9
AW
2169 else
2170 {
2171 pos --;
2172 continue;
2173 }
889975e5
MG
2174 }
2175 if (!in_comment)
f8a1c9a8
LC
2176 /* This wasn't in a comment */
2177 return NULL;
2178
889975e5
MG
2179 return encoding;
2180}
2181
2182SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2183 (SCM port),
a270e133 2184 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 2185 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
2186 "The coding declaration is of the form\n"
2187 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2188 "\n"
2189 "Returns a string containing the character encoding of the file\n"
2190 "if a declaration was found, or @code{#f} otherwise.\n")
2191#define FUNC_NAME s_scm_file_encoding
2192{
2193 char *enc;
2194 SCM s_enc;
f8a1c9a8 2195
d900843c
AW
2196 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2197
f8a1c9a8 2198 enc = scm_i_scan_for_encoding (port);
889975e5
MG
2199 if (enc == NULL)
2200 return SCM_BOOL_F;
2201 else
2202 {
f6f4feb0 2203 s_enc = scm_string_upcase (scm_from_locale_string (enc));
889975e5
MG
2204 return s_enc;
2205 }
f8a1c9a8 2206
889975e5
MG
2207 return SCM_BOOL_F;
2208}
2209#undef FUNC_NAME
2210
ea8c9761
MW
2211\f
2212/* Per-port read options.
2213
a38024ba
MW
2214 We store per-port read options in the 'port-read-options' port
2215 property, which is stored in the internal port structure. The value
2216 stored is a single integer that contains a two-bit field for each
2217 read option.
ea8c9761
MW
2218
2219 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2220 the applicable value should be inherited from the corresponding
2221 global read option. Otherwise, the bit field contains the value of
2222 the read option. For boolean read options that have been set
2223 per-port, the possible values are 0 or 1. If the 'keyword_style'
2224 read option has been set per-port, its possible values are those in
2225 'enum t_keyword_style'. */
2226
a38024ba 2227/* Key to read options in port properties. */
ea8c9761
MW
2228SCM_SYMBOL (sym_port_read_options, "port-read-options");
2229
2230/* Offsets of bit fields for each per-port override */
2231#define READ_OPTION_COPY_SOURCE_P 0
2232#define READ_OPTION_RECORD_POSITIONS_P 2
2233#define READ_OPTION_CASE_INSENSITIVE_P 4
2234#define READ_OPTION_KEYWORD_STYLE 6
2235#define READ_OPTION_R6RS_ESCAPES_P 8
2236#define READ_OPTION_SQUARE_BRACKETS_P 10
2237#define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
bf9eb54a 2238#define READ_OPTION_CURLY_INFIX_P 14
dc59631d 2239#define READ_OPTION_R7RS_SYMBOLS_P 16
ea8c9761 2240
bf9eb54a 2241/* The total width in bits of the per-port overrides */
dc59631d 2242#define READ_OPTIONS_NUM_BITS 18
ea8c9761
MW
2243
2244#define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2245#define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2246
2247#define READ_OPTION_MASK 3
2248#define READ_OPTION_INHERIT 3
2249
2250static void
2251set_port_read_option (SCM port, int option, int new_value)
2252{
a38024ba 2253 SCM scm_read_options;
ea8c9761
MW
2254 unsigned int read_options;
2255
2256 new_value &= READ_OPTION_MASK;
79657fd3
MW
2257
2258 scm_dynwind_begin (0);
2259 scm_dynwind_lock_port (port);
2260
a38024ba 2261 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2262 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2263 read_options = scm_to_uint (scm_read_options);
2264 else
2265 read_options = READ_OPTIONS_INHERIT_ALL;
2266 read_options &= ~(READ_OPTION_MASK << option);
2267 read_options |= new_value << option;
2268 scm_read_options = scm_from_uint (read_options);
a38024ba 2269 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
79657fd3
MW
2270
2271 scm_dynwind_end ();
ea8c9761
MW
2272}
2273
9331ffd8
MW
2274/* Set OPTS and PORT's case-insensitivity according to VALUE. */
2275static void
2276set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2277{
2278 value = !!value;
2279 opts->case_insensitive_p = value;
2280 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2281}
2282
bf9eb54a
MW
2283/* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2284static void
2285set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2286{
2287 value = !!value;
2288 opts->square_brackets_p = value;
2289 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2290}
2291
2292/* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2293static void
2294set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2295{
2296 value = !!value;
2297 opts->curly_infix_p = value;
2298 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2299}
2300
ea8c9761
MW
2301/* Initialize OPTS based on PORT's read options and the global read
2302 options. */
2303static void
2304init_read_options (SCM port, scm_t_read_opts *opts)
2305{
a38024ba 2306 SCM val, scm_read_options;
ea8c9761
MW
2307 unsigned int read_options, x;
2308
a38024ba 2309 scm_read_options = scm_i_port_property (port, sym_port_read_options);
ea8c9761
MW
2310
2311 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2312 read_options = scm_to_uint (scm_read_options);
2313 else
2314 read_options = READ_OPTIONS_INHERIT_ALL;
2315
2316 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2317 if (x == READ_OPTION_INHERIT)
2318 {
2319 val = SCM_PACK (SCM_KEYWORD_STYLE);
2320 if (scm_is_eq (val, scm_keyword_prefix))
2321 x = KEYWORD_STYLE_PREFIX;
2322 else if (scm_is_eq (val, scm_keyword_postfix))
2323 x = KEYWORD_STYLE_POSTFIX;
2324 else
2325 x = KEYWORD_STYLE_HASH_PREFIX;
2326 }
2327 opts->keyword_style = x;
2328
2329#define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2330 do \
2331 { \
2332 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2333 if (x == READ_OPTION_INHERIT) \
2334 x = !!SCM_ ## NAME; \
2335 opts->name = x; \
2336 } \
2337 while (0)
2338
2339 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2340 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2341 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2342 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2343 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2344 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
bf9eb54a 2345 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
dc59631d 2346 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
ea8c9761
MW
2347
2348#undef RESOLVE_BOOLEAN_OPTION
bf9eb54a
MW
2349
2350 opts->neoteric_p = 0;
ea8c9761
MW
2351}
2352
0f2d19dd
JB
2353void
2354scm_init_read ()
0f2d19dd 2355{
d458073b
AR
2356 SCM read_hash_procs;
2357
c81c2ad3 2358 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
2359
2360 scm_i_read_hash_procedures =
2361 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 2362
62560650 2363 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 2364#include "libguile/read.x"
0f2d19dd 2365}
89e00824
ML
2366
2367/*
2368 Local Variables:
2369 c-file-style: "gnu"
2370 End:
2371*/