Move array reader from arrays.c to read.c
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37
MW
1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
49bb5bd3 46#include "libguile/fports.h"
a0599745
MD
47#include "libguile/root.h"
48#include "libguile/strings.h"
ba1b2226 49#include "libguile/strports.h"
a0599745 50#include "libguile/vectors.h"
a0599745 51#include "libguile/validate.h"
a4022e69 52#include "libguile/srfi-4.h"
7337d56d 53#include "libguile/srfi-13.h"
ba1b2226 54
a0599745 55#include "libguile/read.h"
22fc179a
HWN
56#include "libguile/private-options.h"
57
0f2d19dd
JB
58
59\f
60
5bf6a6f0 61SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 62SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 63SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 64SCM_SYMBOL (sym_nil, "nil");
c7733771 65
92c2555f 66scm_t_option scm_read_opts[] = {
b7ff98dd
MD
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
6c51a40a 69 { SCM_OPTION_BOOLEAN, "positions", 1,
deca31e1
GH
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 72 "Convert symbols to lower case."},
210c0325 73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
904fabb6 74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
684d664e
AW
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
62560650 81 { 0, },
a16f6fe7
MD
82};
83
39e8f371
HWN
84/*
85 Give meaningful error messages for errors
86
87 We use the format
88
ba1b2226 89 FILE:LINE:COL: MESSAGE
39e8f371
HWN
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
39e8f371
HWN
95 */
96
97
a4022e69
MV
98void
99scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
ba1b2226 101{
29a837fd
MV
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 105
29a837fd 106 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
272632a6 109 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 110 scm_list_4 (fn,
b3aa4626 111 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 112 scm_from_int (SCM_COL (port) + 1),
272632a6 113 scm_from_locale_string (message)));
ba1b2226
HWN
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
4a655e50 117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 118 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 119 string,
dd72382c 120 arg,
ba1b2226
HWN
121 SCM_BOOL_F);
122}
39e8f371
HWN
123
124
a1ec6916 125SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 126 (SCM setting),
dc7fa443
MG
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 130#define FUNC_NAME s_scm_read_options
a16f6fe7 131{
b7ff98dd
MD
132 SCM ans = scm_options (setting,
133 scm_read_opts,
1bbd0b84 134 FUNC_NAME);
b7ff98dd
MD
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
137 return ans;
138}
1bbd0b84 139#undef FUNC_NAME
a16f6fe7 140
d458073b
AR
141/* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143static SCM *scm_i_read_hash_procedures;
deca31e1 144
cfd15439 145static SCM
d458073b
AR
146scm_i_read_hash_procedures_ref (void)
147{
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149}
150
cfd15439 151static void
d458073b
AR
152scm_i_read_hash_procedures_set_x (SCM value)
153{
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155}
0f2d19dd 156
7337d56d
LC
157\f
158/* Token readers. */
0f2d19dd 159
0f2d19dd 160
7337d56d
LC
161/* Size of the C buffer used to read symbols and numbers. */
162#define READER_BUFFER_SIZE 128
0f2d19dd 163
ff4d3672
LC
164/* Number of 32-bit codepoints in the buffer used to read strings. */
165#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 166
7337d56d
LC
167/* The maximum size of Scheme character names. */
168#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 169
94115ae3 170
7337d56d
LC
171/* `isblank' is only in C99. */
172#define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 174 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
175
176#ifdef MSDOS
177# define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179#else
180# define CHAR_IS_BLANK CHAR_IS_BLANK_
181#endif
182
183
184/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186#define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
5afa815c
AW
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
190
191#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195#define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
454866e0 199/* Read an SCSH block comment. */
cfd15439 200static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965
LC
201static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 203static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 204static SCM scm_get_hash_procedure (int);
454866e0 205
69f90b0b
MG
206/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
cfd15439 210static int
b662b7e9
LC
211read_token (SCM port, char *buf, size_t buf_size, size_t *read)
212{
69f90b0b 213 *read = 0;
0520c320 214
69f90b0b
MG
215 while (*read < buf_size)
216 {
217 int chr;
7337d56d 218
69f90b0b 219 chr = scm_get_byte_or_eof (port);
889975e5 220
69f90b0b
MG
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
889975e5 234
69f90b0b
MG
235 return 1;
236 }
889975e5 237
b662b7e9
LC
238/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
239 if the token doesn't fit in BUFFER_SIZE bytes. */
240static char *
241read_complete_token (SCM port, char *buffer, size_t buffer_size,
242 size_t *read)
69f90b0b
MG
243{
244 int overflow = 0;
b662b7e9
LC
245 size_t bytes_read, overflow_size = 0;
246 char *overflow_buffer = NULL;
1cc91f1b 247
69f90b0b
MG
248 do
249 {
250 overflow = read_token (port, buffer, buffer_size, &bytes_read);
251 if (bytes_read == 0)
252 break;
253 if (overflow || overflow_size != 0)
254 {
255 if (overflow_size == 0)
256 {
b662b7e9
LC
257 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
258 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
259 overflow_size = bytes_read;
260 }
261 else
262 {
7be3c2fc 263 char *new_buf =
b662b7e9
LC
264 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
265
266 memcpy (new_buf, overflow_buffer, overflow_size);
267 memcpy (new_buf + overflow_size, buffer, bytes_read);
268
269 overflow_buffer = new_buf;
69f90b0b
MG
270 overflow_size += bytes_read;
271 }
272 }
889975e5 273 }
69f90b0b 274 while (overflow);
889975e5 275
69f90b0b
MG
276 if (overflow_size)
277 *read = overflow_size;
b521d265 278 else
69f90b0b
MG
279 *read = bytes_read;
280
b662b7e9 281 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 282}
7337d56d
LC
283
284/* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286static int
287flush_ws (SCM port, const char *eoferr)
0f2d19dd 288{
cfd15439 289 scm_t_wchar c;
0f2d19dd 290 while (1)
b7f3516f 291 switch (c = scm_getc (port))
0f2d19dd
JB
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
d156d3b7 296 {
a4022e69
MV
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
d156d3b7 301 }
0f2d19dd 302 return c;
7337d56d 303
0f2d19dd
JB
304 case ';':
305 lp:
b7f3516f 306 switch (c = scm_getc (port))
0f2d19dd
JB
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
7337d56d 316
454866e0
LC
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
911b03b2 324 scm_read_shebang (c, port);
454866e0 325 break;
34f3d47d
AW
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
620c8965
LC
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
454866e0
LC
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
0f2d19dd 342 case SCM_LINE_INCREMENTORS:
0f2d19dd 343 case SCM_SINGLE_SPACES:
0f2d19dd 344 case '\t':
0f2d19dd 345 break;
7337d56d 346
0f2d19dd
JB
347 default:
348 return c;
349 }
7337d56d
LC
350
351 return 0;
0f2d19dd
JB
352}
353
354
7337d56d
LC
355\f
356/* Token readers. */
1cc91f1b 357
7337d56d 358static SCM scm_read_expression (SCM port);
b131b233 359static SCM scm_read_sharp (int chr, SCM port, long line, int column);
0f2d19dd
JB
360
361
d6cb0203
MW
362static SCM
363maybe_annotate_source (SCM x, SCM port, long line, int column)
364{
365 if (SCM_RECORD_POSITIONS_P)
366 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
367 return x;
368}
369
09a4f039 370static SCM
889975e5 371scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 372#define FUNC_NAME "scm_i_lreadparen"
09a4f039 373{
26c8cc14
AW
374 int c;
375 SCM tmp, tl, ans = SCM_EOL;
5afa815c 376 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
377
378 /* Need to capture line and column numbers here. */
379 long line = SCM_LINUM (port);
380 int column = SCM_COL (port) - 1;
f9c68a47 381
7337d56d
LC
382 c = flush_ws (port, FUNC_NAME);
383 if (terminating_char == c)
384 return SCM_EOL;
f9c68a47 385
7337d56d 386 scm_ungetc (c, port);
1f7945a7
AW
387 tmp = scm_read_expression (port);
388
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d
LC
393 {
394 ans = scm_read_expression (port);
395 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
396 scm_i_input_error (FUNC_NAME, port, "missing close paren",
397 SCM_EOL);
398 return ans;
399 }
1cc91f1b 400
7337d56d
LC
401 /* Build the head of the list structure. */
402 ans = tl = scm_cons (tmp, SCM_EOL);
403
7337d56d 404 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 405 {
7337d56d 406 SCM new_tail;
0f2d19dd 407
5b69315e
AW
408 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
409 scm_i_input_error (FUNC_NAME, port,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c)));
412
7337d56d 413 scm_ungetc (c, port);
5b69315e
AW
414 tmp = scm_read_expression (port);
415
1f7945a7
AW
416 /* See above note about scm_sym_dot. */
417 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 418 {
89f88612 419 SCM_SETCDR (tl, scm_read_expression (port));
7337d56d 420
7337d56d
LC
421 c = flush_ws (port, FUNC_NAME);
422 if (terminating_char != c)
423 scm_i_input_error (FUNC_NAME, port,
424 "in pair: missing close paren", SCM_EOL);
425 goto exit;
0f2d19dd 426 }
b858464a 427
7337d56d
LC
428 new_tail = scm_cons (tmp, SCM_EOL);
429 SCM_SETCDR (tl, new_tail);
430 tl = new_tail;
7337d56d 431 }
0f2d19dd 432
7337d56d 433 exit:
d6cb0203 434 return maybe_annotate_source (ans, port, line, column);
7337d56d
LC
435}
436#undef FUNC_NAME
a4022e69 437
c5661d28
MG
438
439/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
441 found. */
442#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 do \
444 { \
445 scm_t_wchar a; \
446 size_t i = 0; \
447 c = 0; \
448 while (i < ndigits) \
449 { \
450 a = scm_getc (port); \
451 if (a == EOF) \
452 goto str_eof; \
453 if (terminator \
454 && (a == (scm_t_wchar) terminator) \
455 && (i > 0)) \
456 break; \
457 if ('0' <= a && a <= '9') \
458 a -= '0'; \
459 else if ('A' <= a && a <= 'F') \
460 a = a - 'A' + 10; \
461 else if ('a' <= a && a <= 'f') \
462 a = a - 'a' + 10; \
463 else \
464 { \
465 c = a; \
466 goto bad_escaped; \
467 } \
468 c = c * 16 + a; \
469 i ++; \
470 } \
c5661d28
MG
471 } while (0)
472
684d664e
AW
473static void
474skip_intraline_whitespace (SCM port)
475{
476 scm_t_wchar c;
477
478 do
479 {
480 c = scm_getc (port);
481 if (c == EOF)
482 return;
483 }
484 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
485
486 scm_ungetc (c, port);
487}
488
7337d56d
LC
489static SCM
490scm_read_string (int chr, SCM port)
491#define FUNC_NAME "scm_lreadr"
492{
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
0f2d19dd 495
ff4d3672
LC
496 SCM str = SCM_EOL;
497 size_t c_str_len = 0;
498 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 499
b131b233
MW
500 /* Need to capture line and column numbers here. */
501 long line = SCM_LINUM (port);
502 int column = SCM_COL (port) - 1;
503
7337d56d
LC
504 while ('"' != (c = scm_getc (port)))
505 {
506 if (c == EOF)
9c44cd45
MG
507 {
508 str_eof:
509 scm_i_input_error (FUNC_NAME, port,
510 "end of file in string constant", SCM_EOL);
511 }
0f2d19dd 512
ff4d3672
LC
513 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
514 {
515 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
516 c_str_len = 0;
517 }
0f2d19dd 518
7337d56d 519 if (c == '\\')
9c44cd45
MG
520 {
521 switch (c = scm_getc (port))
522 {
523 case EOF:
524 goto str_eof;
525 case '"':
526 case '\\':
527 break;
9c44cd45 528 case '\n':
684d664e
AW
529 if (SCM_HUNGRY_EOL_ESCAPES_P)
530 skip_intraline_whitespace (port);
9c44cd45
MG
531 continue;
532 case '0':
533 c = '\0';
534 break;
535 case 'f':
536 c = '\f';
537 break;
538 case 'n':
539 c = '\n';
540 break;
541 case 'r':
542 c = '\r';
543 break;
544 case 't':
545 c = '\t';
546 break;
547 case 'a':
548 c = '\007';
549 break;
550 case 'v':
551 c = '\v';
552 break;
67a4a16d
MG
553 case 'b':
554 c = '\010';
555 break;
9c44cd45 556 case 'x':
dea901d6
MG
557 if (SCM_R6RS_ESCAPES_P)
558 SCM_READ_HEX_ESCAPE (10, ';');
559 else
560 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 561 break;
9c44cd45 562 case 'u':
898a0b5a
MG
563 if (!SCM_R6RS_ESCAPES_P)
564 {
565 SCM_READ_HEX_ESCAPE (4, '\0');
566 break;
567 }
9c44cd45 568 case 'U':
898a0b5a
MG
569 if (!SCM_R6RS_ESCAPES_P)
570 {
571 SCM_READ_HEX_ESCAPE (6, '\0');
572 break;
573 }
9c44cd45
MG
574 default:
575 bad_escaped:
576 scm_i_input_error (FUNC_NAME, port,
577 "illegal character in escape sequence: ~S",
578 scm_list_1 (SCM_MAKE_CHAR (c)));
579 }
580 }
ff4d3672
LC
581
582 c_str[c_str_len++] = c;
7337d56d 583 }
ff4d3672
LC
584
585 if (scm_is_null (str))
586 /* Fast path: we got a string that fits in C_STR. */
587 str = scm_from_utf32_stringn (c_str, c_str_len);
588 else
589 {
590 if (c_str_len > 0)
591 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
592
593 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
594 }
595
596 return maybe_annotate_source (str, port, line, column);
0f2d19dd 597}
db4b4ca6
DH
598#undef FUNC_NAME
599
0f2d19dd 600
7337d56d 601static SCM
889975e5 602scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 603{
69f90b0b 604 SCM result, str = SCM_EOL;
b662b7e9 605 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 606 size_t bytes_read;
69f90b0b 607 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 608
38f19074
MW
609 /* Need to capture line and column numbers here. */
610 long line = SCM_LINUM (port);
611 int column = SCM_COL (port) - 1;
612
7337d56d 613 scm_ungetc (chr, port);
b662b7e9
LC
614 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
615 &bytes_read);
69f90b0b 616
b662b7e9 617 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
618
619 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 620 if (scm_is_false (result))
69f90b0b
MG
621 {
622 /* Return a symbol instead of a number */
623 if (SCM_CASE_INSENSITIVE_P)
624 str = scm_string_downcase_x (str);
625 result = scm_string_to_symbol (str);
626 }
38f19074
MW
627 else if (SCM_NIMP (result))
628 result = maybe_annotate_source (result, port, line, column);
0f2d19dd 629
69f90b0b 630 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
631 return result;
632}
0f2d19dd 633
7337d56d 634static SCM
889975e5 635scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 636{
889975e5
MG
637 SCM result;
638 int ends_with_colon = 0;
69f90b0b 639 size_t bytes_read;
ef4cbc08 640 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
b662b7e9 641 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b
MG
642 scm_t_port *pt = SCM_PTAB_ENTRY (port);
643 SCM str;
7337d56d
LC
644
645 scm_ungetc (chr, port);
b662b7e9
LC
646 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
647 &bytes_read);
69f90b0b 648 if (bytes_read > 0)
b662b7e9 649 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 650
69f90b0b
MG
651 if (postfix && ends_with_colon && (bytes_read > 1))
652 {
b662b7e9
LC
653 str = scm_from_stringn (buffer, bytes_read - 1,
654 pt->encoding, pt->ilseq_handler);
69f90b0b
MG
655
656 if (SCM_CASE_INSENSITIVE_P)
657 str = scm_string_downcase_x (str);
658 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
659 }
7337d56d 660 else
69f90b0b 661 {
b662b7e9
LC
662 str = scm_from_stringn (buffer, bytes_read,
663 pt->encoding, pt->ilseq_handler);
69f90b0b
MG
664
665 if (SCM_CASE_INSENSITIVE_P)
666 str = scm_string_downcase_x (str);
667 result = scm_string_to_symbol (str);
668 }
7337d56d 669
69f90b0b 670 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
671 return result;
672}
673
674static SCM
889975e5 675scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
676#define FUNC_NAME "scm_lreadr"
677{
889975e5 678 SCM result;
7337d56d 679 size_t read;
b662b7e9 680 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 681 unsigned int radix;
69f90b0b
MG
682 SCM str;
683 scm_t_port *pt;
7337d56d
LC
684
685 switch (chr)
686 {
687 case 'B':
688 case 'b':
689 radix = 2;
690 break;
691
692 case 'o':
693 case 'O':
694 radix = 8;
695 break;
696
697 case 'd':
698 case 'D':
699 radix = 10;
700 break;
701
702 case 'x':
703 case 'X':
704 radix = 16;
705 break;
706
707 default:
708 scm_ungetc (chr, port);
709 scm_ungetc ('#', port);
710 radix = 10;
711 }
712
b662b7e9
LC
713 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
714 &read);
69f90b0b
MG
715
716 pt = SCM_PTAB_ENTRY (port);
b662b7e9 717 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
718
719 result = scm_string_to_number (str, scm_from_uint (radix));
720
69f90b0b 721 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
722
723 if (scm_is_true (result))
724 return result;
725
726 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
727
728 return SCM_BOOL_F;
729}
730#undef FUNC_NAME
731
732static SCM
733scm_read_quote (int chr, SCM port)
734{
735 SCM p;
492faee1
LC
736 long line = SCM_LINUM (port);
737 int column = SCM_COL (port) - 1;
7337d56d
LC
738
739 switch (chr)
740 {
741 case '`':
742 p = scm_sym_quasiquote;
743 break;
744
745 case '\'':
746 p = scm_sym_quote;
747 break;
748
749 case ',':
750 {
889975e5 751 scm_t_wchar c;
7337d56d
LC
752
753 c = scm_getc (port);
754 if ('@' == c)
755 p = scm_sym_uq_splicing;
756 else
0f2d19dd 757 {
7337d56d
LC
758 scm_ungetc (c, port);
759 p = scm_sym_unquote;
0f2d19dd 760 }
7337d56d
LC
761 break;
762 }
0f2d19dd 763
7337d56d
LC
764 default:
765 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 766 "scm_read_quote", chr);
7337d56d 767 abort ();
0f2d19dd 768 }
1cc91f1b 769
7337d56d 770 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
d6cb0203 771 return maybe_annotate_source (p, port, line, column);
7337d56d
LC
772}
773
34f3d47d
AW
774SCM_SYMBOL (sym_syntax, "syntax");
775SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
776SCM_SYMBOL (sym_unsyntax, "unsyntax");
777SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
778
779static SCM
780scm_read_syntax (int chr, SCM port)
781{
782 SCM p;
783 long line = SCM_LINUM (port);
784 int column = SCM_COL (port) - 1;
785
786 switch (chr)
787 {
788 case '`':
789 p = sym_quasisyntax;
790 break;
791
792 case '\'':
793 p = sym_syntax;
794 break;
795
796 case ',':
797 {
798 int c;
799
800 c = scm_getc (port);
801 if ('@' == c)
802 p = sym_unsyntax_splicing;
803 else
804 {
805 scm_ungetc (c, port);
806 p = sym_unsyntax;
807 }
808 break;
809 }
810
811 default:
812 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
813 "scm_read_syntax", chr);
814 abort ();
815 }
816
817 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
d6cb0203 818 return maybe_annotate_source (p, port, line, column);
34f3d47d
AW
819}
820
cfd15439 821static SCM
7c4aad9c
AW
822scm_read_nil (int chr, SCM port)
823{
824 SCM id = scm_read_mixed_case_symbol (chr, port);
825
826 if (!scm_is_eq (id, sym_nil))
827 scm_i_input_error ("scm_read_nil", port,
828 "unexpected input while reading #nil: ~a",
829 scm_list_1 (id));
830
831 return SCM_ELISP_NIL;
832}
833
cfd15439 834static SCM
7337d56d 835scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 836{
0f2d19dd
JB
837 int c;
838
889975e5
MG
839 /* We use the get_byte here because there is no need to get the
840 locale correct with comment input. This presumes that newline
841 always represents itself no matter what the encoding is. */
842 for (c = scm_get_byte_or_eof (port);
7337d56d 843 (c != EOF) && (c != '\n');
8a8da78d 844 c = scm_get_byte_or_eof (port));
7337d56d
LC
845
846 return SCM_UNSPECIFIED;
847}
848
849\f
850/* Sharp readers, i.e. readers called after a `#' sign has been read. */
851
852static SCM
853scm_read_boolean (int chr, SCM port)
854{
855 switch (chr)
0f2d19dd 856 {
7337d56d
LC
857 case 't':
858 case 'T':
859 return SCM_BOOL_T;
860
861 case 'f':
862 case 'F':
863 return SCM_BOOL_F;
0f2d19dd 864 }
7337d56d
LC
865
866 return SCM_UNSPECIFIED;
867}
868
869static SCM
889975e5 870scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
871#define FUNC_NAME "scm_lreadr"
872{
69f90b0b
MG
873 char buffer[READER_CHAR_NAME_MAX_SIZE];
874 SCM charname;
875 size_t charname_len, bytes_read;
889975e5
MG
876 scm_t_wchar cp;
877 int overflow;
69f90b0b 878 scm_t_port *pt;
889975e5 879
69f90b0b 880 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 881 if (overflow)
c1a0ba1c 882 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 883
69f90b0b 884 if (bytes_read == 0)
0f2d19dd 885 {
7337d56d
LC
886 chr = scm_getc (port);
887 if (chr == EOF)
888 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
889 "while reading character", SCM_EOL);
890
891 /* CHR must be a token delimiter, like a whitespace. */
892 return (SCM_MAKE_CHAR (chr));
0f2d19dd 893 }
7337d56d 894
69f90b0b 895 pt = SCM_PTAB_ENTRY (port);
7337d56d 896
69f90b0b
MG
897 /* Simple ASCII characters can be processed immediately. Also, simple
898 ISO-8859-1 characters can be processed immediately if the encoding for this
899 port is ISO-8859-1. */
900 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
901 {
902 SCM_COL (port) += 1;
903 return SCM_MAKE_CHAR (buffer[0]);
904 }
905
906 /* Otherwise, convert the buffer into a proper scheme string for
907 processing. */
908 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
909 pt->ilseq_handler);
910 charname_len = scm_i_string_length (charname);
911 SCM_COL (port) += charname_len;
889975e5 912 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
913 if (charname_len == 1)
914 return SCM_MAKE_CHAR (cp);
915
916 /* Ignore dotted circles, which may be used to keep combining characters from
917 combining with the backslash in #\charname. */
0dcd7e61
MG
918 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
919 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
920
889975e5 921 if (cp >= '0' && cp < '8')
7337d56d
LC
922 {
923 /* Dirk:FIXME:: This type of character syntax is not R5RS
924 * compliant. Further, it should be verified that the constant
0ffc78e3 925 * does only consist of octal digits. */
889975e5 926 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 927 if (SCM_I_INUMP (p))
0ffc78e3 928 {
e25f3727 929 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
930 if (SCM_IS_UNICODE_CHAR (c))
931 return SCM_MAKE_CHAR (c);
932 else
0f3a70cf 933 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
934 "out-of-range octal character escape: ~a",
935 scm_list_1 (charname));
936 }
7337d56d
LC
937 }
938
0f3a70cf 939 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
940 {
941 SCM p;
0f3a70cf 942
dea901d6
MG
943 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
944 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
945 scm_from_uint (16));
946 if (SCM_I_INUMP (p))
947 {
e25f3727 948 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
949 if (SCM_IS_UNICODE_CHAR (c))
950 return SCM_MAKE_CHAR (c);
951 else
952 scm_i_input_error (FUNC_NAME, port,
953 "out-of-range hex character escape: ~a",
954 scm_list_1 (charname));
955 }
956 }
957
889975e5
MG
958 /* The names of characters should never have non-Latin1
959 characters. */
960 if (scm_i_is_narrow_string (charname)
961 || scm_i_try_narrow_string (charname))
4769c9db
AW
962 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
963 charname_len);
964 if (scm_is_true (ch))
965 return ch;
966 }
7337d56d 967
7337d56d 968 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 969 scm_list_1 (charname));
7337d56d
LC
970
971 return SCM_UNSPECIFIED;
0f2d19dd 972}
db4b4ca6 973#undef FUNC_NAME
0f2d19dd 974
cfd15439 975static SCM
7337d56d
LC
976scm_read_keyword (int chr, SCM port)
977{
978 SCM symbol;
979
980 /* Read the symbol that comprises the keyword. Doing this instead of
981 invoking a specific symbol reader function allows `scm_read_keyword ()'
982 to adapt to the delimiters currently valid of symbols.
1cc91f1b 983
7337d56d
LC
984 XXX: This implementation allows sloppy syntaxes like `#: key'. */
985 symbol = scm_read_expression (port);
986 if (!scm_is_symbol (symbol))
7f74cf9a 987 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
988 "keyword prefix `~a' not followed by a symbol: ~s",
989 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
990
991 return (scm_symbol_to_keyword (symbol));
992}
993
cfd15439 994static SCM
b131b233 995scm_read_vector (int chr, SCM port, long line, int column)
09a4f039 996{
7337d56d
LC
997 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
998 guarantee that it's going to do what we want. After all, this is an
999 implementation detail of `scm_read_vector ()', not a desirable
1000 property. */
b131b233
MW
1001 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
1002 port, line, column);
1003}
1004
493ceb99
MW
1005/* Helper used by scm_read_array */
1006static int
1007read_decimal_integer (SCM port, int c, ssize_t *resp)
1008{
1009 ssize_t sign = 1;
1010 ssize_t res = 0;
1011 int got_it = 0;
1012
1013 if (c == '-')
1014 {
1015 sign = -1;
1016 c = scm_getc (port);
1017 }
1018
1019 while ('0' <= c && c <= '9')
1020 {
1021 res = 10*res + c-'0';
1022 got_it = 1;
1023 c = scm_getc (port);
1024 }
1025
1026 if (got_it)
1027 *resp = sign * res;
1028 return c;
1029}
1030
1031/* Read an array. This function can also read vectors and uniform
1032 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1033 handled here.
1034
1035 C is the first character read after the '#'. */
b131b233 1036static SCM
493ceb99 1037scm_read_array (int c, SCM port, long line, int column)
b131b233 1038{
493ceb99
MW
1039 ssize_t rank;
1040 scm_t_wchar tag_buf[8];
1041 int tag_len;
1042
1043 SCM tag, shape = SCM_BOOL_F, elements, array;
1044
1045 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1046 the array code can not deal with zero-length dimensions yet, and
1047 we want to allow zero-length vectors, of course. */
1048 if (c == '(')
1049 return scm_read_vector (c, port, line, column);
1050
1051 /* Disambiguate between '#f' and uniform floating point vectors. */
1052 if (c == 'f')
1053 {
1054 c = scm_getc (port);
1055 if (c != '3' && c != '6')
1056 {
1057 if (c != EOF)
1058 scm_ungetc (c, port);
1059 return SCM_BOOL_F;
1060 }
1061 rank = 1;
1062 tag_buf[0] = 'f';
1063 tag_len = 1;
1064 goto continue_reading_tag;
1065 }
1066
1067 /* Read rank. */
1068 rank = 1;
1069 c = read_decimal_integer (port, c, &rank);
1070 if (rank < 0)
1071 scm_i_input_error (NULL, port, "array rank must be non-negative",
1072 SCM_EOL);
1073
1074 /* Read tag. */
1075 tag_len = 0;
1076 continue_reading_tag:
1077 while (c != EOF && c != '(' && c != '@' && c != ':'
1078 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1079 {
1080 tag_buf[tag_len++] = c;
1081 c = scm_getc (port);
1082 }
1083 if (tag_len == 0)
1084 tag = SCM_BOOL_T;
b131b233 1085 else
493ceb99
MW
1086 {
1087 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1088 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1089 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1090 scm_list_1 (tag));
1091 }
1092
1093 /* Read shape. */
1094 if (c == '@' || c == ':')
1095 {
1096 shape = SCM_EOL;
1097
1098 do
1099 {
1100 ssize_t lbnd = 0, len = 0;
1101 SCM s;
1102
1103 if (c == '@')
1104 {
1105 c = scm_getc (port);
1106 c = read_decimal_integer (port, c, &lbnd);
1107 }
1108
1109 s = scm_from_ssize_t (lbnd);
1110
1111 if (c == ':')
1112 {
1113 c = scm_getc (port);
1114 c = read_decimal_integer (port, c, &len);
1115 if (len < 0)
1116 scm_i_input_error (NULL, port,
1117 "array length must be non-negative",
1118 SCM_EOL);
1119
1120 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1121 }
1122
1123 shape = scm_cons (s, shape);
1124 } while (c == '@' || c == ':');
1125
1126 shape = scm_reverse_x (shape, SCM_EOL);
1127 }
1128
1129 /* Read nested lists of elements. */
1130 if (c != '(')
1131 scm_i_input_error (NULL, port,
1132 "missing '(' in vector or array literal",
1133 SCM_EOL);
1134 elements = scm_read_sexp (c, port);
1135
1136 if (scm_is_false (shape))
1137 shape = scm_from_ssize_t (rank);
1138 else if (scm_ilength (shape) != rank)
1139 scm_i_input_error
1140 (NULL, port,
1141 "the number of shape specifications must match the array rank",
1142 SCM_EOL);
1143
1144 /* Handle special print syntax of rank zero arrays; see
1145 scm_i_print_array for a rationale. */
1146 if (rank == 0)
1147 {
1148 if (!scm_is_pair (elements))
1149 scm_i_input_error (NULL, port,
1150 "too few elements in array literal, need 1",
1151 SCM_EOL);
1152 if (!scm_is_null (SCM_CDR (elements)))
1153 scm_i_input_error (NULL, port,
1154 "too many elements in array literal, want 1",
1155 SCM_EOL);
1156 elements = SCM_CAR (elements);
1157 }
1158
1159 /* Construct array, annotate with source location, and return. */
1160 array = scm_list_to_typed_array (tag, shape, elements);
1161 return maybe_annotate_source (array, port, line, column);
7337d56d 1162}
09a4f039 1163
cfd15439 1164static SCM
b131b233 1165scm_read_srfi4_vector (int chr, SCM port, long line, int column)
7337d56d 1166{
b131b233 1167 return scm_read_array (chr, port, line, column);
7337d56d
LC
1168}
1169
0ba0b384 1170static SCM
b131b233 1171scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
0ba0b384
LC
1172{
1173 chr = scm_getc (port);
1174 if (chr != 'u')
1175 goto syntax;
1176
1177 chr = scm_getc (port);
1178 if (chr != '8')
1179 goto syntax;
1180
1181 chr = scm_getc (port);
1182 if (chr != '(')
1183 goto syntax;
1184
b131b233
MW
1185 return maybe_annotate_source
1186 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1187 port, line, column);
0ba0b384
LC
1188
1189 syntax:
1190 scm_i_input_error ("read_bytevector", port,
1191 "invalid bytevector prefix",
1192 SCM_MAKE_CHAR (chr));
1193 return SCM_UNSPECIFIED;
1194}
1195
7337d56d 1196static SCM
b131b233 1197scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
7337d56d
LC
1198{
1199 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1200 terribly inefficient but who cares? */
1201 SCM s_bits = SCM_EOL;
1202
1203 for (chr = scm_getc (port);
1204 (chr != EOF) && ((chr == '0') || (chr == '1'));
1205 chr = scm_getc (port))
09a4f039 1206 {
7337d56d 1207 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1208 }
7337d56d
LC
1209
1210 if (chr != EOF)
1211 scm_ungetc (chr, port);
1212
b131b233
MW
1213 return maybe_annotate_source
1214 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1215 port, line, column);
7337d56d
LC
1216}
1217
cfd15439 1218static SCM
889975e5 1219scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1220{
1221 int bang_seen = 0;
1222
1223 for (;;)
09a4f039 1224 {
58b1db5f 1225 int c = scm_getc (port);
62850ef3 1226
7337d56d
LC
1227 if (c == EOF)
1228 scm_i_input_error ("skip_block_comment", port,
1229 "unterminated `#! ... !#' comment", SCM_EOL);
1230
1231 if (c == '!')
1232 bang_seen = 1;
1233 else if (c == '#' && bang_seen)
1234 break;
1235 else
1236 bang_seen = 0;
1237 }
1238
1239 return SCM_UNSPECIFIED;
1240}
1241
d7fcaec3 1242static SCM
911b03b2
JG
1243scm_read_shebang (scm_t_wchar chr, SCM port)
1244{
1245 int c = 0;
1246 if ((c = scm_get_byte_or_eof (port)) != 'r')
1247 {
1248 scm_ungetc (c, port);
1249 return scm_read_scsh_block_comment (chr, port);
1250 }
1251 if ((c = scm_get_byte_or_eof (port)) != '6')
1252 {
1253 scm_ungetc (c, port);
1254 scm_ungetc ('r', port);
1255 return scm_read_scsh_block_comment (chr, port);
1256 }
1257 if ((c = scm_get_byte_or_eof (port)) != 'r')
1258 {
1259 scm_ungetc (c, port);
1260 scm_ungetc ('6', port);
1261 scm_ungetc ('r', port);
1262 return scm_read_scsh_block_comment (chr, port);
1263 }
1264 if ((c = scm_get_byte_or_eof (port)) != 's')
1265 {
1266 scm_ungetc (c, port);
1267 scm_ungetc ('r', port);
1268 scm_ungetc ('6', port);
1269 scm_ungetc ('r', port);
1270 return scm_read_scsh_block_comment (chr, port);
1271 }
1272
1273 return SCM_UNSPECIFIED;
1274}
1275
620c8965
LC
1276static SCM
1277scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1278{
1279 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1280 nested. So care must be taken. */
1281 int nesting_level = 1;
6d5f8c32
AW
1282
1283 int a = scm_getc (port);
1284
1285 if (a == EOF)
1286 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1287 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1288
1289 while (nesting_level > 0)
1290 {
6d5f8c32 1291 int b = scm_getc (port);
620c8965 1292
6d5f8c32 1293 if (b == EOF)
cd169c5a 1294 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1295 "unterminated `#| ... |#' comment", SCM_EOL);
1296
6d5f8c32
AW
1297 if (a == '|' && b == '#')
1298 {
1299 nesting_level--;
1300 b = EOF;
1301 }
1302 else if (a == '#' && b == '|')
1303 {
1304 nesting_level++;
1305 b = EOF;
1306 }
1307
1308 a = b;
620c8965
LC
1309 }
1310
1311 return SCM_UNSPECIFIED;
1312}
1313
34f3d47d 1314static SCM
889975e5 1315scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1316{
889975e5 1317 scm_t_wchar c;
34f3d47d
AW
1318
1319 c = flush_ws (port, (char *) NULL);
1320 if (EOF == c)
1321 scm_i_input_error ("read_commented_expression", port,
1322 "no expression after #; comment", SCM_EOL);
1323 scm_ungetc (c, port);
1324 scm_read_expression (port);
1325 return SCM_UNSPECIFIED;
1326}
1327
7337d56d 1328static SCM
889975e5 1329scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1330{
1331 /* Guile's extended symbol read syntax looks like this:
1332
1333 #{This is all a symbol name}#
1334
1335 So here, CHR is expected to be `{'. */
d9527cfa 1336 int saw_brace = 0;
7337d56d 1337 size_t len = 0;
190d4b0d 1338 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1339
889975e5 1340 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1341
1342 while ((chr = scm_getc (port)) != EOF)
1343 {
1344 if (saw_brace)
09a4f039 1345 {
7337d56d
LC
1346 if (chr == '#')
1347 {
7337d56d
LC
1348 break;
1349 }
1350 else
1351 {
1352 saw_brace = 0;
889975e5 1353 scm_i_string_set_x (buf, len++, '}');
7337d56d 1354 }
09a4f039 1355 }
d9527cfa
AW
1356
1357 if (chr == '}')
7337d56d 1358 saw_brace = 1;
d9527cfa
AW
1359 else if (chr == '\\')
1360 {
1361 /* It used to be that print.c would print extended-read-syntax
1362 symbols with backslashes before "non-standard" chars, but
1363 this routine wouldn't do anything with those escapes.
1364 Bummer. What we've done is to change print.c to output
1365 R6RS hex escapes for those characters, relying on the fact
1366 that the extended read syntax would never put a `\' before
1367 an `x'. For now, we just ignore other instances of
1368 backslash in the string. */
1369 switch ((chr = scm_getc (port)))
1370 {
1371 case EOF:
1372 goto done;
1373 case 'x':
1374 {
1375 scm_t_wchar c;
1376
1377 SCM_READ_HEX_ESCAPE (10, ';');
1378 scm_i_string_set_x (buf, len++, c);
1379 break;
1380
1381 str_eof:
1382 chr = EOF;
1383 goto done;
1384
1385 bad_escaped:
1386 scm_i_string_stop_writing ();
1387 scm_i_input_error ("scm_read_extended_symbol", port,
1388 "illegal character in escape sequence: ~S",
1389 scm_list_1 (SCM_MAKE_CHAR (c)));
1390 break;
1391 }
1392 default:
1393 scm_i_string_set_x (buf, len++, chr);
1394 break;
1395 }
1396 }
7337d56d 1397 else
d9527cfa 1398 scm_i_string_set_x (buf, len++, chr);
62850ef3 1399
889975e5 1400 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1401 {
7f991c7d
LC
1402 SCM addy;
1403
889975e5 1404 scm_i_string_stop_writing ();
190d4b0d 1405 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1406 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1407 len = 0;
889975e5 1408 buf = scm_i_string_start_writing (buf);
7337d56d 1409 }
7337d56d 1410 }
d9527cfa
AW
1411
1412 done:
889975e5 1413 scm_i_string_stop_writing ();
d9527cfa
AW
1414 if (chr == EOF)
1415 scm_i_input_error ("scm_read_extended_symbol", port,
1416 "end of file while reading symbol", SCM_EOL);
7337d56d 1417
889975e5 1418 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1419}
1420
1421
1422\f
1423/* Top-level token readers, i.e., dispatchers. */
1424
1425static SCM
1426scm_read_sharp_extension (int chr, SCM port)
1427{
1428 SCM proc;
1429
1430 proc = scm_get_hash_procedure (chr);
1431 if (scm_is_true (scm_procedure_p (proc)))
1432 {
1433 long line = SCM_LINUM (port);
1434 int column = SCM_COL (port) - 2;
1435 SCM got;
1436
1437 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14
AW
1438
1439 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1440 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1441
1442 return got;
09a4f039 1443 }
7337d56d
LC
1444
1445 return SCM_UNSPECIFIED;
1446}
1447
1448/* The reader for the sharp `#' character. It basically dispatches reads
1449 among the above token readers. */
1450static SCM
b131b233 1451scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
7337d56d
LC
1452#define FUNC_NAME "scm_lreadr"
1453{
1454 SCM result;
1455
1456 chr = scm_getc (port);
1457
1458 result = scm_read_sharp_extension (chr, port);
1459 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1460 return result;
1461
1462 switch (chr)
1463 {
1464 case '\\':
1465 return (scm_read_character (chr, port));
1466 case '(':
b131b233 1467 return (scm_read_vector (chr, port, line, column));
7337d56d
LC
1468 case 's':
1469 case 'u':
1470 case 'f':
df941b5b 1471 case 'c':
7337d56d 1472 /* This one may return either a boolean or an SRFI-4 vector. */
b131b233 1473 return (scm_read_srfi4_vector (chr, port, line, column));
0ba0b384 1474 case 'v':
b131b233 1475 return (scm_read_bytevector (chr, port, line, column));
7337d56d 1476 case '*':
b131b233 1477 return (scm_read_guile_bit_vector (chr, port, line, column));
7337d56d
LC
1478 case 't':
1479 case 'T':
1480 case 'F':
7337d56d
LC
1481 return (scm_read_boolean (chr, port));
1482 case ':':
1483 return (scm_read_keyword (chr, port));
1484 case '0': case '1': case '2': case '3': case '4':
1485 case '5': case '6': case '7': case '8': case '9':
1486 case '@':
1487#if SCM_ENABLE_DEPRECATED
1488 /* See below for 'i' and 'e'. */
1489 case 'a':
7337d56d
LC
1490 case 'y':
1491 case 'h':
1492 case 'l':
1493#endif
b131b233 1494 return (scm_read_array (chr, port, line, column));
7337d56d
LC
1495
1496 case 'i':
1497 case 'e':
1498#if SCM_ENABLE_DEPRECATED
1499 {
1500 /* When next char is '(', it really is an old-style
1501 uniform array. */
889975e5 1502 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1503 if (next_c != EOF)
1504 scm_ungetc (next_c, port);
1505 if (next_c == '(')
b131b233 1506 return scm_read_array (chr, port, line, column);
7337d56d
LC
1507 /* Fall through. */
1508 }
1509#endif
1510 case 'b':
1511 case 'B':
1512 case 'o':
1513 case 'O':
1514 case 'd':
1515 case 'D':
1516 case 'x':
1517 case 'X':
1518 case 'I':
1519 case 'E':
1520 return (scm_read_number_and_radix (chr, port));
1521 case '{':
1522 return (scm_read_extended_symbol (chr, port));
1523 case '!':
911b03b2 1524 return (scm_read_shebang (chr, port));
34f3d47d
AW
1525 case ';':
1526 return (scm_read_commented_expression (chr, port));
1527 case '`':
1528 case '\'':
1529 case ',':
1530 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1531 case 'n':
1532 return (scm_read_nil (chr, port));
7337d56d
LC
1533 default:
1534 result = scm_read_sharp_extension (chr, port);
1535 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1536 {
1537 /* To remain compatible with 1.8 and earlier, the following
1538 characters have lower precedence than `read-hash-extend'
1539 characters. */
1540 switch (chr)
1541 {
1542 case '|':
1543 return scm_read_r6rs_block_comment (chr, port);
1544 default:
1545 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1546 scm_list_1 (SCM_MAKE_CHAR (chr)));
1547 }
1548 }
7337d56d
LC
1549 else
1550 return result;
1551 }
1552
1553 return SCM_UNSPECIFIED;
1554}
1555#undef FUNC_NAME
1556
1557static SCM
1558scm_read_expression (SCM port)
1559#define FUNC_NAME "scm_read_expression"
1560{
1561 while (1)
1562 {
cfd15439 1563 scm_t_wchar chr;
7337d56d
LC
1564
1565 chr = scm_getc (port);
1566
1567 switch (chr)
1568 {
1569 case SCM_WHITE_SPACES:
1570 case SCM_LINE_INCREMENTORS:
1571 break;
1572 case ';':
1573 (void) scm_read_semicolon_comment (chr, port);
1574 break;
5afa815c
AW
1575 case '[':
1576 if (!SCM_SQUARE_BRACKETS_P)
1577 return (scm_read_mixed_case_symbol (chr, port));
1578 /* otherwise fall through */
7337d56d
LC
1579 case '(':
1580 return (scm_read_sexp (chr, port));
1581 case '"':
1582 return (scm_read_string (chr, port));
1583 case '\'':
1584 case '`':
1585 case ',':
1586 return (scm_read_quote (chr, port));
1587 case '#':
1588 {
b131b233
MW
1589 long line = SCM_LINUM (port);
1590 int column = SCM_COL (port) - 1;
1591 SCM result = scm_read_sharp (chr, port, line, column);
7337d56d
LC
1592 if (scm_is_eq (result, SCM_UNSPECIFIED))
1593 /* We read a comment or some such. */
1594 break;
1595 else
1596 return result;
1597 }
1598 case ')':
1599 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1600 break;
a4e47229
MG
1601 case ']':
1602 if (SCM_SQUARE_BRACKETS_P)
1603 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1604 /* otherwise fall through */
7337d56d
LC
1605 case EOF:
1606 return SCM_EOF_VAL;
1607 case ':':
1608 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1609 return scm_symbol_to_keyword (scm_read_expression (port));
1610 /* Fall through. */
1611
1612 default:
1613 {
1614 if (((chr >= '0') && (chr <= '9'))
1615 || (strchr ("+-.", chr)))
1616 return (scm_read_number (chr, port));
1617 else
1618 return (scm_read_mixed_case_symbol (chr, port));
1619 }
1620 }
1621 }
1622}
1623#undef FUNC_NAME
1624
1625\f
1626/* Actual reader. */
1627
1628SCM_DEFINE (scm_read, "read", 0, 1, 0,
1629 (SCM port),
1630 "Read an s-expression from the input port @var{port}, or from\n"
1631 "the current input port if @var{port} is not specified.\n"
1632 "Any whitespace before the next token is discarded.")
1633#define FUNC_NAME s_scm_read
1634{
1635 int c;
1636
1637 if (SCM_UNBNDP (port))
1638 port = scm_current_input_port ();
1639 SCM_VALIDATE_OPINPORT (1, port);
1640
1641 c = flush_ws (port, (char *) NULL);
1642 if (EOF == c)
1643 return SCM_EOF_VAL;
1644 scm_ungetc (c, port);
1645
1646 return (scm_read_expression (port));
09a4f039 1647}
db4b4ca6 1648#undef FUNC_NAME
09a4f039 1649
0f2d19dd
JB
1650
1651\f
1652
14de3b42
GH
1653/* Manipulate the read-hash-procedures alist. This could be written in
1654 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1655SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1656 (SCM chr, SCM proc),
dc7fa443
MG
1657 "Install the procedure @var{proc} for reading expressions\n"
1658 "starting with the character sequence @code{#} and @var{chr}.\n"
1659 "@var{proc} will be called with two arguments: the character\n"
1660 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1661 "returned will be the return value of @code{read}. \n"
1662 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1663 )
1bbd0b84 1664#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1665{
fed9c9a2
GH
1666 SCM this;
1667 SCM prev;
1668
36284627 1669 SCM_VALIDATE_CHAR (1, chr);
7888309b 1670 SCM_ASSERT (scm_is_false (proc)
bc36d050 1671 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1672 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1673
14de3b42 1674 /* Check if chr is already in the alist. */
d458073b 1675 this = scm_i_read_hash_procedures_ref ();
14de3b42 1676 prev = SCM_BOOL_F;
fed9c9a2
GH
1677 while (1)
1678 {
d2e53ed6 1679 if (scm_is_null (this))
fed9c9a2
GH
1680 {
1681 /* not found, so add it to the beginning. */
7888309b 1682 if (scm_is_true (proc))
fed9c9a2 1683 {
d458073b
AR
1684 SCM new = scm_cons (scm_cons (chr, proc),
1685 scm_i_read_hash_procedures_ref ());
1686 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1687 }
1688 break;
1689 }
bc36d050 1690 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1691 {
1692 /* already in the alist. */
7888309b 1693 if (scm_is_false (proc))
14de3b42
GH
1694 {
1695 /* remove it. */
7888309b 1696 if (scm_is_false (prev))
14de3b42 1697 {
d458073b
AR
1698 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1699 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1700 }
1701 else
1702 scm_set_cdr_x (prev, SCM_CDR (this));
1703 }
fed9c9a2 1704 else
14de3b42
GH
1705 {
1706 /* replace it. */
1707 scm_set_cdr_x (SCM_CAR (this), proc);
1708 }
fed9c9a2
GH
1709 break;
1710 }
1711 prev = this;
1712 this = SCM_CDR (this);
1713 }
deca31e1 1714
deca31e1
GH
1715 return SCM_UNSPECIFIED;
1716}
1bbd0b84 1717#undef FUNC_NAME
0f2d19dd 1718
deca31e1
GH
1719/* Recover the read-hash procedure corresponding to char c. */
1720static SCM
6e8d25a6 1721scm_get_hash_procedure (int c)
deca31e1 1722{
d458073b 1723 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1724
deca31e1
GH
1725 while (1)
1726 {
d2e53ed6 1727 if (scm_is_null (rest))
deca31e1
GH
1728 return SCM_BOOL_F;
1729
7866a09b 1730 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1731 return SCM_CDAR (rest);
1732
1733 rest = SCM_CDR (rest);
1734 }
1735}
1cc91f1b 1736
889975e5
MG
1737#define SCM_ENCODING_SEARCH_SIZE (500)
1738
f8a1c9a8
LC
1739/* Search the first few hundred characters of a file for an Emacs-like coding
1740 declaration. Returns either NULL or a string whose storage has been
1741 allocated with `scm_gc_malloc ()'. */
889975e5 1742char *
f8a1c9a8 1743scm_i_scan_for_encoding (SCM port)
889975e5 1744{
d900843c 1745 scm_t_port *pt;
889975e5 1746 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1747 size_t bytes_read, encoding_length, i;
889975e5
MG
1748 char *encoding = NULL;
1749 int utf8_bom = 0;
daedbca7 1750 char *pos, *encoding_start;
889975e5
MG
1751 int in_comment;
1752
d900843c 1753 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1754
d900843c
AW
1755 if (pt->rw_active == SCM_PORT_WRITE)
1756 scm_flush (port);
49bb5bd3 1757
d900843c
AW
1758 if (pt->rw_random)
1759 pt->rw_active = SCM_PORT_READ;
1760
1761 if (pt->read_pos == pt->read_end)
1762 {
1763 /* We can use the read buffer, and thus avoid a seek. */
1764 if (scm_fill_input (port) == EOF)
1765 return NULL;
1766
1767 bytes_read = pt->read_end - pt->read_pos;
1768 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1769 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1770
1771 if (bytes_read <= 1)
1772 /* An unbuffered port -- don't scan. */
1773 return NULL;
1774
1775 memcpy (header, pt->read_pos, bytes_read);
1776 header[bytes_read] = '\0';
1777 }
1778 else
1779 {
1780 /* Try to read some bytes and then seek back. Not all ports
1781 support seeking back; and indeed some file ports (like
1782 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1783 check performed by SCM_FPORT_FDES---but fail to seek
1784 backwards. Hence this block comes second. We prefer to use
1785 the read buffer in-place. */
1786 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1787 return NULL;
1788
1789 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1790 header[bytes_read] = '\0';
1791 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1792 }
889975e5
MG
1793
1794 if (bytes_read > 3
1795 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1796 utf8_bom = 1;
1797
1798 /* search past "coding[:=]" */
1799 pos = header;
1800 while (1)
1801 {
1802 if ((pos = strstr(pos, "coding")) == NULL)
1803 return NULL;
1804
1805 pos += strlen("coding");
1806 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1807 (*pos == ':' || *pos == '='))
1808 {
1809 pos ++;
1810 break;
1811 }
1812 }
1813
1814 /* skip spaces */
1815 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1816 (*pos == ' ' || *pos == '\t'))
1817 pos ++;
1818
1819 /* grab the next token */
daedbca7 1820 encoding_start = pos;
889975e5 1821 i = 0;
daedbca7
MG
1822 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1823 && encoding_start + i - header < bytes_read
1824 && (isalnum ((int) encoding_start[i])
1825 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1826 i++;
1827
daedbca7
MG
1828 encoding_length = i;
1829 if (encoding_length == 0)
889975e5
MG
1830 return NULL;
1831
daedbca7
MG
1832 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1833 for (i = 0; i < encoding_length; i++)
026ed239 1834 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1835
1836 /* push backwards to make sure we were in a comment */
1837 in_comment = 0;
daedbca7
MG
1838 pos = encoding_start;
1839 while (pos >= header)
889975e5 1840 {
8a12aeb9
AW
1841 if (*pos == ';')
1842 {
1843 in_comment = 1;
1844 break;
1845 }
1846 else if (*pos == '\n' || pos == header)
889975e5
MG
1847 {
1848 /* This wasn't in a semicolon comment. Check for a
1849 hash-bang comment. */
1850 char *beg = strstr (header, "#!");
1851 char *end = strstr (header, "!#");
8a12aeb9 1852 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
1853 in_comment = 1;
1854 break;
1855 }
8a12aeb9
AW
1856 else
1857 {
1858 pos --;
1859 continue;
1860 }
889975e5
MG
1861 }
1862 if (!in_comment)
f8a1c9a8
LC
1863 /* This wasn't in a comment */
1864 return NULL;
1865
889975e5 1866 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1867 scm_misc_error (NULL,
889975e5
MG
1868 "the port input declares the encoding ~s but is encoded as UTF-8",
1869 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1870
889975e5
MG
1871 return encoding;
1872}
1873
1874SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1875 (SCM port),
a270e133 1876 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 1877 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
1878 "The coding declaration is of the form\n"
1879 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1880 "\n"
1881 "Returns a string containing the character encoding of the file\n"
1882 "if a declaration was found, or @code{#f} otherwise.\n")
1883#define FUNC_NAME s_scm_file_encoding
1884{
1885 char *enc;
1886 SCM s_enc;
f8a1c9a8 1887
d900843c
AW
1888 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1889
f8a1c9a8 1890 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1891 if (enc == NULL)
1892 return SCM_BOOL_F;
1893 else
1894 {
1895 s_enc = scm_from_locale_string (enc);
889975e5
MG
1896 return s_enc;
1897 }
f8a1c9a8 1898
889975e5
MG
1899 return SCM_BOOL_F;
1900}
1901#undef FUNC_NAME
1902
0f2d19dd
JB
1903void
1904scm_init_read ()
0f2d19dd 1905{
d458073b
AR
1906 SCM read_hash_procs;
1907
c81c2ad3 1908 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
1909
1910 scm_i_read_hash_procedures =
1911 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 1912
62560650 1913 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1914#include "libguile/read.x"
0f2d19dd 1915}
89e00824
ML
1916
1917/*
1918 Local Variables:
1919 c-file-style: "gnu"
1920 End:
1921*/