fix static allocation with debug_typing_strictness==2
[bpt/guile.git] / libguile / read.c
CommitLineData
4a655e50 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
49bb5bd3 46#include "libguile/fports.h"
a0599745
MD
47#include "libguile/root.h"
48#include "libguile/strings.h"
ba1b2226 49#include "libguile/strports.h"
a0599745 50#include "libguile/vectors.h"
a0599745 51#include "libguile/validate.h"
a4022e69 52#include "libguile/srfi-4.h"
7337d56d 53#include "libguile/srfi-13.h"
ba1b2226 54
a0599745 55#include "libguile/read.h"
22fc179a
HWN
56#include "libguile/private-options.h"
57
0f2d19dd
JB
58
59\f
60
5bf6a6f0 61SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 62SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 63SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 64SCM_SYMBOL (sym_nil, "nil");
c7733771 65
92c2555f 66scm_t_option scm_read_opts[] = {
b7ff98dd
MD
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
6c51a40a 69 { SCM_OPTION_BOOLEAN, "positions", 1,
deca31e1
GH
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 72 "Convert symbols to lower case."},
3d27ef4b 73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F,
904fabb6 74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
684d664e
AW
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
62560650 81 { 0, },
a16f6fe7
MD
82};
83
39e8f371
HWN
84/*
85 Give meaningful error messages for errors
86
87 We use the format
88
ba1b2226 89 FILE:LINE:COL: MESSAGE
39e8f371
HWN
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
39e8f371
HWN
95 */
96
97
a4022e69
MV
98void
99scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
ba1b2226 101{
29a837fd
MV
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 105
29a837fd 106 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
272632a6 109 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 110 scm_list_4 (fn,
b3aa4626 111 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 112 scm_from_int (SCM_COL (port) + 1),
272632a6 113 scm_from_locale_string (message)));
ba1b2226
HWN
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
4a655e50 117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 118 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 119 string,
dd72382c 120 arg,
ba1b2226
HWN
121 SCM_BOOL_F);
122}
39e8f371
HWN
123
124
a1ec6916 125SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 126 (SCM setting),
dc7fa443
MG
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 130#define FUNC_NAME s_scm_read_options
a16f6fe7 131{
b7ff98dd
MD
132 SCM ans = scm_options (setting,
133 scm_read_opts,
1bbd0b84 134 FUNC_NAME);
b7ff98dd
MD
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
137 return ans;
138}
1bbd0b84 139#undef FUNC_NAME
a16f6fe7 140
d458073b
AR
141/* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143static SCM *scm_i_read_hash_procedures;
deca31e1 144
d458073b
AR
145static inline SCM
146scm_i_read_hash_procedures_ref (void)
147{
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149}
150
151static inline void
152scm_i_read_hash_procedures_set_x (SCM value)
153{
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155}
0f2d19dd 156
7337d56d
LC
157\f
158/* Token readers. */
0f2d19dd 159
0f2d19dd 160
7337d56d
LC
161/* Size of the C buffer used to read symbols and numbers. */
162#define READER_BUFFER_SIZE 128
0f2d19dd 163
7337d56d
LC
164/* Size of the C buffer used to read strings. */
165#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 166
7337d56d
LC
167/* The maximum size of Scheme character names. */
168#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 169
94115ae3 170
7337d56d
LC
171/* `isblank' is only in C99. */
172#define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 174 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
175
176#ifdef MSDOS
177# define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179#else
180# define CHAR_IS_BLANK CHAR_IS_BLANK_
181#endif
182
183
184/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186#define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
5afa815c
AW
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
190
191#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195#define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
454866e0 199/* Read an SCSH block comment. */
620c8965
LC
200static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 203static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 204static SCM scm_get_hash_procedure (int);
454866e0 205
69f90b0b
MG
206/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
7337d56d 210static inline int
69f90b0b
MG
211read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
0520c320 214
69f90b0b
MG
215 while (*read < buf_size)
216 {
217 int chr;
7337d56d 218
69f90b0b 219 chr = scm_get_byte_or_eof (port);
889975e5 220
69f90b0b
MG
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
889975e5 234
69f90b0b
MG
235 return 1;
236 }
889975e5 237
69f90b0b
MG
238/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243static int
244read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246{
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
7337d56d 249
69f90b0b
MG
250 *overflow_buffer = NULL;
251 overflow_size = 0;
1cc91f1b 252
69f90b0b
MG
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
889975e5 273 }
69f90b0b 274 while (overflow);
889975e5 275
69f90b0b
MG
276 if (overflow_size)
277 *read = overflow_size;
b521d265 278 else
69f90b0b
MG
279 *read = bytes_read;
280
281 return (overflow_size != 0);
889975e5 282}
7337d56d
LC
283
284/* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286static int
287flush_ws (SCM port, const char *eoferr)
0f2d19dd 288{
889975e5 289 register scm_t_wchar c;
0f2d19dd 290 while (1)
b7f3516f 291 switch (c = scm_getc (port))
0f2d19dd
JB
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
d156d3b7 296 {
a4022e69
MV
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
d156d3b7 301 }
0f2d19dd 302 return c;
7337d56d 303
0f2d19dd
JB
304 case ';':
305 lp:
b7f3516f 306 switch (c = scm_getc (port))
0f2d19dd
JB
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
7337d56d 316
454866e0
LC
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
911b03b2 324 scm_read_shebang (c, port);
454866e0 325 break;
34f3d47d
AW
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
620c8965
LC
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
454866e0
LC
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
0f2d19dd 342 case SCM_LINE_INCREMENTORS:
0f2d19dd 343 case SCM_SINGLE_SPACES:
0f2d19dd 344 case '\t':
0f2d19dd 345 break;
7337d56d 346
0f2d19dd
JB
347 default:
348 return c;
349 }
7337d56d
LC
350
351 return 0;
0f2d19dd
JB
352}
353
354
7337d56d
LC
355\f
356/* Token readers. */
1cc91f1b 357
7337d56d
LC
358static SCM scm_read_expression (SCM port);
359static SCM scm_read_sharp (int chr, SCM port);
7337d56d 360static SCM recsexpr (SCM obj, long line, int column, SCM filename);
0f2d19dd
JB
361
362
09a4f039 363static SCM
889975e5 364scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 365#define FUNC_NAME "scm_i_lreadparen"
09a4f039 366{
7337d56d
LC
367 register int c;
368 register SCM tmp;
369 register SCM tl, ans = SCM_EOL;
bd22f1c7 370 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
5afa815c 371 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
372
373 /* Need to capture line and column numbers here. */
374 long line = SCM_LINUM (port);
375 int column = SCM_COL (port) - 1;
f9c68a47 376
f9c68a47 377
7337d56d
LC
378 c = flush_ws (port, FUNC_NAME);
379 if (terminating_char == c)
380 return SCM_EOL;
f9c68a47 381
7337d56d
LC
382 scm_ungetc (c, port);
383 if (scm_is_eq (scm_sym_dot,
384 (tmp = scm_read_expression (port))))
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
1cc91f1b 392
7337d56d
LC
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
396 if (SCM_COPY_SOURCE_P)
397 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
398 ? copy
399 : tmp,
400 SCM_EOL);
401
402 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 403 {
7337d56d 404 SCM new_tail;
0f2d19dd 405
5b69315e
AW
406 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
407 scm_i_input_error (FUNC_NAME, port,
408 "in pair: mismatched close paren: ~A",
409 scm_list_1 (SCM_MAKE_CHAR (c)));
410
7337d56d 411 scm_ungetc (c, port);
5b69315e
AW
412 tmp = scm_read_expression (port);
413
414 if (scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 415 {
7337d56d
LC
416 SCM_SETCDR (tl, tmp = scm_read_expression (port));
417
418 if (SCM_COPY_SOURCE_P)
419 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
420 SCM_EOL));
421
422 c = flush_ws (port, FUNC_NAME);
423 if (terminating_char != c)
424 scm_i_input_error (FUNC_NAME, port,
425 "in pair: missing close paren", SCM_EOL);
426 goto exit;
0f2d19dd 427 }
b858464a 428
7337d56d
LC
429 new_tail = scm_cons (tmp, SCM_EOL);
430 SCM_SETCDR (tl, new_tail);
431 tl = new_tail;
432
433 if (SCM_COPY_SOURCE_P)
0f2d19dd 434 {
7337d56d
LC
435 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
436 ? copy
437 : tmp, SCM_EOL);
438 SCM_SETCDR (tl2, new_tail2);
439 tl2 = new_tail2;
440 }
441 }
0f2d19dd 442
7337d56d
LC
443 exit:
444 if (SCM_RECORD_POSITIONS_P)
d1c4720c
AW
445 scm_hashq_set_x (scm_source_whash,
446 ans,
447 scm_make_srcprops (line, column,
448 SCM_FILENAME (port),
449 SCM_COPY_SOURCE_P
450 ? ans2
451 : SCM_UNDEFINED,
452 SCM_EOL));
7337d56d
LC
453 return ans;
454}
455#undef FUNC_NAME
a4022e69 456
c5661d28
MG
457
458/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
459 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
460 found. */
461#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
462 do \
463 { \
464 scm_t_wchar a; \
465 size_t i = 0; \
466 c = 0; \
467 while (i < ndigits) \
468 { \
469 a = scm_getc (port); \
470 if (a == EOF) \
471 goto str_eof; \
472 if (terminator \
473 && (a == (scm_t_wchar) terminator) \
474 && (i > 0)) \
475 break; \
476 if ('0' <= a && a <= '9') \
477 a -= '0'; \
478 else if ('A' <= a && a <= 'F') \
479 a = a - 'A' + 10; \
480 else if ('a' <= a && a <= 'f') \
481 a = a - 'a' + 10; \
482 else \
483 { \
484 c = a; \
485 goto bad_escaped; \
486 } \
487 c = c * 16 + a; \
488 i ++; \
489 } \
c5661d28
MG
490 } while (0)
491
684d664e
AW
492static void
493skip_intraline_whitespace (SCM port)
494{
495 scm_t_wchar c;
496
497 do
498 {
499 c = scm_getc (port);
500 if (c == EOF)
501 return;
502 }
503 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
504
505 scm_ungetc (c, port);
506}
507
7337d56d
LC
508static SCM
509scm_read_string (int chr, SCM port)
510#define FUNC_NAME "scm_lreadr"
511{
512 /* For strings smaller than C_STR, this function creates only one Scheme
513 object (the string returned). */
0f2d19dd 514
7337d56d 515 SCM str = SCM_BOOL_F;
7337d56d 516 unsigned c_str_len = 0;
9c44cd45 517 scm_t_wchar c;
eb42ff25 518
190d4b0d 519 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
7337d56d
LC
520 while ('"' != (c = scm_getc (port)))
521 {
522 if (c == EOF)
9c44cd45
MG
523 {
524 str_eof:
525 scm_i_input_error (FUNC_NAME, port,
526 "end of file in string constant", SCM_EOL);
527 }
0f2d19dd 528
9c44cd45
MG
529 if (c_str_len + 1 >= scm_i_string_length (str))
530 {
190d4b0d 531 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
0f2d19dd 532
9c44cd45
MG
533 str = scm_string_append (scm_list_2 (str, addy));
534 }
0f2d19dd 535
7337d56d 536 if (c == '\\')
9c44cd45
MG
537 {
538 switch (c = scm_getc (port))
539 {
540 case EOF:
541 goto str_eof;
542 case '"':
543 case '\\':
544 break;
9c44cd45 545 case '\n':
684d664e
AW
546 if (SCM_HUNGRY_EOL_ESCAPES_P)
547 skip_intraline_whitespace (port);
9c44cd45
MG
548 continue;
549 case '0':
550 c = '\0';
551 break;
552 case 'f':
553 c = '\f';
554 break;
555 case 'n':
556 c = '\n';
557 break;
558 case 'r':
559 c = '\r';
560 break;
561 case 't':
562 c = '\t';
563 break;
564 case 'a':
565 c = '\007';
566 break;
567 case 'v':
568 c = '\v';
569 break;
67a4a16d
MG
570 case 'b':
571 c = '\010';
572 break;
9c44cd45 573 case 'x':
dea901d6
MG
574 if (SCM_R6RS_ESCAPES_P)
575 SCM_READ_HEX_ESCAPE (10, ';');
576 else
577 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 578 break;
9c44cd45 579 case 'u':
898a0b5a
MG
580 if (!SCM_R6RS_ESCAPES_P)
581 {
582 SCM_READ_HEX_ESCAPE (4, '\0');
583 break;
584 }
9c44cd45 585 case 'U':
898a0b5a
MG
586 if (!SCM_R6RS_ESCAPES_P)
587 {
588 SCM_READ_HEX_ESCAPE (6, '\0');
589 break;
590 }
9c44cd45
MG
591 default:
592 bad_escaped:
593 scm_i_input_error (FUNC_NAME, port,
594 "illegal character in escape sequence: ~S",
595 scm_list_1 (SCM_MAKE_CHAR (c)));
596 }
597 }
598 str = scm_i_string_start_writing (str);
599 scm_i_string_set_x (str, c_str_len++, c);
600 scm_i_string_stop_writing ();
7337d56d 601 }
f13b4400 602
7337d56d
LC
603 if (c_str_len > 0)
604 {
9c44cd45 605 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 606 }
69f90b0b 607
9c44cd45 608 return scm_nullstr;
0f2d19dd 609}
db4b4ca6
DH
610#undef FUNC_NAME
611
0f2d19dd 612
7337d56d 613static SCM
889975e5 614scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 615{
69f90b0b
MG
616 SCM result, str = SCM_EOL;
617 char buffer[READER_BUFFER_SIZE];
618 char *overflow_buffer = NULL;
619 size_t bytes_read;
620 int overflow;
621 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 622
7337d56d 623 scm_ungetc (chr, port);
69f90b0b
MG
624 overflow = read_complete_token (port, buffer, sizeof (buffer),
625 &overflow_buffer, &bytes_read);
626
627 if (!overflow)
628 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
629 else
630 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
631 pt->ilseq_handler);
632
633 result = scm_string_to_number (str, SCM_UNDEFINED);
889975e5 634 if (!scm_is_true (result))
69f90b0b
MG
635 {
636 /* Return a symbol instead of a number */
637 if (SCM_CASE_INSENSITIVE_P)
638 str = scm_string_downcase_x (str);
639 result = scm_string_to_symbol (str);
640 }
0f2d19dd 641
69f90b0b
MG
642 if (overflow)
643 free (overflow_buffer);
644 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
645 return result;
646}
0f2d19dd 647
7337d56d 648static SCM
889975e5 649scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 650{
889975e5
MG
651 SCM result;
652 int ends_with_colon = 0;
69f90b0b 653 size_t bytes_read;
ef4cbc08 654 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
69f90b0b
MG
655 int overflow;
656 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
657 scm_t_port *pt = SCM_PTAB_ENTRY (port);
658 SCM str;
7337d56d
LC
659
660 scm_ungetc (chr, port);
69f90b0b
MG
661 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
662 &overflow_buffer, &bytes_read);
663 if (bytes_read > 0)
664 {
665 if (!overflow)
666 ends_with_colon = buffer[bytes_read - 1] == ':';
667 else
668 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
669 }
ef4cbc08 670
69f90b0b
MG
671 if (postfix && ends_with_colon && (bytes_read > 1))
672 {
673 if (!overflow)
674 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
675 else
676 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
677 pt->ilseq_handler);
678
679 if (SCM_CASE_INSENSITIVE_P)
680 str = scm_string_downcase_x (str);
681 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
682 }
7337d56d 683 else
69f90b0b
MG
684 {
685 if (!overflow)
686 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
687 else
688 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
689 pt->ilseq_handler);
690
691 if (SCM_CASE_INSENSITIVE_P)
692 str = scm_string_downcase_x (str);
693 result = scm_string_to_symbol (str);
694 }
7337d56d 695
69f90b0b
MG
696 if (overflow)
697 free (overflow_buffer);
698 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
699 return result;
700}
701
702static SCM
889975e5 703scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
704#define FUNC_NAME "scm_lreadr"
705{
889975e5 706 SCM result;
7337d56d 707 size_t read;
69f90b0b
MG
708 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
709 int overflow;
7337d56d 710 unsigned int radix;
69f90b0b
MG
711 SCM str;
712 scm_t_port *pt;
7337d56d
LC
713
714 switch (chr)
715 {
716 case 'B':
717 case 'b':
718 radix = 2;
719 break;
720
721 case 'o':
722 case 'O':
723 radix = 8;
724 break;
725
726 case 'd':
727 case 'D':
728 radix = 10;
729 break;
730
731 case 'x':
732 case 'X':
733 radix = 16;
734 break;
735
736 default:
737 scm_ungetc (chr, port);
738 scm_ungetc ('#', port);
739 radix = 10;
740 }
741
69f90b0b
MG
742 overflow = read_complete_token (port, buffer, sizeof (buffer),
743 &overflow_buffer, &read);
744
745 pt = SCM_PTAB_ENTRY (port);
746 if (!overflow)
747 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
748 else
749 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
750 pt->ilseq_handler);
751
752 result = scm_string_to_number (str, scm_from_uint (radix));
753
754 if (overflow)
755 free (overflow_buffer);
756
757 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
758
759 if (scm_is_true (result))
760 return result;
761
762 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
763
764 return SCM_BOOL_F;
765}
766#undef FUNC_NAME
767
768static SCM
769scm_read_quote (int chr, SCM port)
770{
771 SCM p;
492faee1
LC
772 long line = SCM_LINUM (port);
773 int column = SCM_COL (port) - 1;
7337d56d
LC
774
775 switch (chr)
776 {
777 case '`':
778 p = scm_sym_quasiquote;
779 break;
780
781 case '\'':
782 p = scm_sym_quote;
783 break;
784
785 case ',':
786 {
889975e5 787 scm_t_wchar c;
7337d56d
LC
788
789 c = scm_getc (port);
790 if ('@' == c)
791 p = scm_sym_uq_splicing;
792 else
0f2d19dd 793 {
7337d56d
LC
794 scm_ungetc (c, port);
795 p = scm_sym_unquote;
0f2d19dd 796 }
7337d56d
LC
797 break;
798 }
0f2d19dd 799
7337d56d
LC
800 default:
801 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 802 "scm_read_quote", chr);
7337d56d 803 abort ();
0f2d19dd 804 }
1cc91f1b 805
7337d56d 806 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1 807 if (SCM_RECORD_POSITIONS_P)
d1c4720c
AW
808 scm_hashq_set_x (scm_source_whash, p,
809 scm_make_srcprops (line, column,
810 SCM_FILENAME (port),
811 SCM_COPY_SOURCE_P
812 ? (scm_cons2 (SCM_CAR (p),
813 SCM_CAR (SCM_CDR (p)),
814 SCM_EOL))
815 : SCM_UNDEFINED,
816 SCM_EOL));
492faee1 817
0f2d19dd 818
7337d56d
LC
819 return p;
820}
821
34f3d47d
AW
822SCM_SYMBOL (sym_syntax, "syntax");
823SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
824SCM_SYMBOL (sym_unsyntax, "unsyntax");
825SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
826
827static SCM
828scm_read_syntax (int chr, SCM port)
829{
830 SCM p;
831 long line = SCM_LINUM (port);
832 int column = SCM_COL (port) - 1;
833
834 switch (chr)
835 {
836 case '`':
837 p = sym_quasisyntax;
838 break;
839
840 case '\'':
841 p = sym_syntax;
842 break;
843
844 case ',':
845 {
846 int c;
847
848 c = scm_getc (port);
849 if ('@' == c)
850 p = sym_unsyntax_splicing;
851 else
852 {
853 scm_ungetc (c, port);
854 p = sym_unsyntax;
855 }
856 break;
857 }
858
859 default:
860 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
861 "scm_read_syntax", chr);
862 abort ();
863 }
864
865 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
866 if (SCM_RECORD_POSITIONS_P)
d1c4720c
AW
867 scm_hashq_set_x (scm_source_whash, p,
868 scm_make_srcprops (line, column,
869 SCM_FILENAME (port),
870 SCM_COPY_SOURCE_P
871 ? (scm_cons2 (SCM_CAR (p),
872 SCM_CAR (SCM_CDR (p)),
873 SCM_EOL))
874 : SCM_UNDEFINED,
875 SCM_EOL));
34f3d47d
AW
876
877
878 return p;
879}
880
7c4aad9c
AW
881static inline SCM
882scm_read_nil (int chr, SCM port)
883{
884 SCM id = scm_read_mixed_case_symbol (chr, port);
885
886 if (!scm_is_eq (id, sym_nil))
887 scm_i_input_error ("scm_read_nil", port,
888 "unexpected input while reading #nil: ~a",
889 scm_list_1 (id));
890
891 return SCM_ELISP_NIL;
892}
893
7337d56d
LC
894static inline SCM
895scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 896{
0f2d19dd
JB
897 int c;
898
889975e5
MG
899 /* We use the get_byte here because there is no need to get the
900 locale correct with comment input. This presumes that newline
901 always represents itself no matter what the encoding is. */
902 for (c = scm_get_byte_or_eof (port);
7337d56d 903 (c != EOF) && (c != '\n');
8a8da78d 904 c = scm_get_byte_or_eof (port));
7337d56d
LC
905
906 return SCM_UNSPECIFIED;
907}
908
909\f
910/* Sharp readers, i.e. readers called after a `#' sign has been read. */
911
912static SCM
913scm_read_boolean (int chr, SCM port)
914{
915 switch (chr)
0f2d19dd 916 {
7337d56d
LC
917 case 't':
918 case 'T':
919 return SCM_BOOL_T;
920
921 case 'f':
922 case 'F':
923 return SCM_BOOL_F;
0f2d19dd 924 }
7337d56d
LC
925
926 return SCM_UNSPECIFIED;
927}
928
929static SCM
889975e5 930scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
931#define FUNC_NAME "scm_lreadr"
932{
69f90b0b
MG
933 char buffer[READER_CHAR_NAME_MAX_SIZE];
934 SCM charname;
935 size_t charname_len, bytes_read;
889975e5
MG
936 scm_t_wchar cp;
937 int overflow;
69f90b0b 938 scm_t_port *pt;
889975e5 939
69f90b0b 940 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 941 if (overflow)
c1a0ba1c 942 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 943
69f90b0b 944 if (bytes_read == 0)
0f2d19dd 945 {
7337d56d
LC
946 chr = scm_getc (port);
947 if (chr == EOF)
948 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
949 "while reading character", SCM_EOL);
950
951 /* CHR must be a token delimiter, like a whitespace. */
952 return (SCM_MAKE_CHAR (chr));
0f2d19dd 953 }
7337d56d 954
69f90b0b 955 pt = SCM_PTAB_ENTRY (port);
7337d56d 956
69f90b0b
MG
957 /* Simple ASCII characters can be processed immediately. Also, simple
958 ISO-8859-1 characters can be processed immediately if the encoding for this
959 port is ISO-8859-1. */
960 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
961 {
962 SCM_COL (port) += 1;
963 return SCM_MAKE_CHAR (buffer[0]);
964 }
965
966 /* Otherwise, convert the buffer into a proper scheme string for
967 processing. */
968 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
969 pt->ilseq_handler);
970 charname_len = scm_i_string_length (charname);
971 SCM_COL (port) += charname_len;
889975e5 972 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
973 if (charname_len == 1)
974 return SCM_MAKE_CHAR (cp);
975
976 /* Ignore dotted circles, which may be used to keep combining characters from
977 combining with the backslash in #\charname. */
0dcd7e61
MG
978 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
979 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
980
889975e5 981 if (cp >= '0' && cp < '8')
7337d56d
LC
982 {
983 /* Dirk:FIXME:: This type of character syntax is not R5RS
984 * compliant. Further, it should be verified that the constant
0ffc78e3 985 * does only consist of octal digits. */
889975e5 986 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 987 if (SCM_I_INUMP (p))
0ffc78e3 988 {
e25f3727 989 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
990 if (SCM_IS_UNICODE_CHAR (c))
991 return SCM_MAKE_CHAR (c);
992 else
0f3a70cf 993 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
994 "out-of-range octal character escape: ~a",
995 scm_list_1 (charname));
996 }
7337d56d
LC
997 }
998
0f3a70cf 999 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
1000 {
1001 SCM p;
0f3a70cf 1002
dea901d6
MG
1003 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1004 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1005 scm_from_uint (16));
1006 if (SCM_I_INUMP (p))
1007 {
e25f3727 1008 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
1009 if (SCM_IS_UNICODE_CHAR (c))
1010 return SCM_MAKE_CHAR (c);
1011 else
1012 scm_i_input_error (FUNC_NAME, port,
1013 "out-of-range hex character escape: ~a",
1014 scm_list_1 (charname));
1015 }
1016 }
1017
889975e5
MG
1018 /* The names of characters should never have non-Latin1
1019 characters. */
1020 if (scm_i_is_narrow_string (charname)
1021 || scm_i_try_narrow_string (charname))
4769c9db
AW
1022 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1023 charname_len);
1024 if (scm_is_true (ch))
1025 return ch;
1026 }
7337d56d 1027
7337d56d 1028 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1029 scm_list_1 (charname));
7337d56d
LC
1030
1031 return SCM_UNSPECIFIED;
0f2d19dd 1032}
db4b4ca6 1033#undef FUNC_NAME
0f2d19dd 1034
7337d56d
LC
1035static inline SCM
1036scm_read_keyword (int chr, SCM port)
1037{
1038 SCM symbol;
1039
1040 /* Read the symbol that comprises the keyword. Doing this instead of
1041 invoking a specific symbol reader function allows `scm_read_keyword ()'
1042 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1043
7337d56d
LC
1044 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1045 symbol = scm_read_expression (port);
1046 if (!scm_is_symbol (symbol))
7f74cf9a 1047 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1048 "keyword prefix `~a' not followed by a symbol: ~s",
1049 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1050
1051 return (scm_symbol_to_keyword (symbol));
1052}
1053
1054static inline SCM
1055scm_read_vector (int chr, SCM port)
09a4f039 1056{
7337d56d
LC
1057 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1058 guarantee that it's going to do what we want. After all, this is an
1059 implementation detail of `scm_read_vector ()', not a desirable
1060 property. */
1061 return (scm_vector (scm_read_sexp (chr, port)));
1062}
09a4f039 1063
7337d56d
LC
1064static inline SCM
1065scm_read_srfi4_vector (int chr, SCM port)
1066{
1067 return scm_i_read_array (port, chr);
1068}
1069
0ba0b384 1070static SCM
889975e5 1071scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
1072{
1073 chr = scm_getc (port);
1074 if (chr != 'u')
1075 goto syntax;
1076
1077 chr = scm_getc (port);
1078 if (chr != '8')
1079 goto syntax;
1080
1081 chr = scm_getc (port);
1082 if (chr != '(')
1083 goto syntax;
1084
1085 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1086
1087 syntax:
1088 scm_i_input_error ("read_bytevector", port,
1089 "invalid bytevector prefix",
1090 SCM_MAKE_CHAR (chr));
1091 return SCM_UNSPECIFIED;
1092}
1093
7337d56d 1094static SCM
889975e5 1095scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
1096{
1097 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1098 terribly inefficient but who cares? */
1099 SCM s_bits = SCM_EOL;
1100
1101 for (chr = scm_getc (port);
1102 (chr != EOF) && ((chr == '0') || (chr == '1'));
1103 chr = scm_getc (port))
09a4f039 1104 {
7337d56d 1105 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1106 }
7337d56d
LC
1107
1108 if (chr != EOF)
1109 scm_ungetc (chr, port);
1110
1111 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1112}
1113
1114static inline SCM
889975e5 1115scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1116{
1117 int bang_seen = 0;
1118
1119 for (;;)
09a4f039 1120 {
58b1db5f 1121 int c = scm_getc (port);
62850ef3 1122
7337d56d
LC
1123 if (c == EOF)
1124 scm_i_input_error ("skip_block_comment", port,
1125 "unterminated `#! ... !#' comment", SCM_EOL);
1126
1127 if (c == '!')
1128 bang_seen = 1;
1129 else if (c == '#' && bang_seen)
1130 break;
1131 else
1132 bang_seen = 0;
1133 }
1134
1135 return SCM_UNSPECIFIED;
1136}
1137
d7fcaec3 1138static SCM
911b03b2
JG
1139scm_read_shebang (scm_t_wchar chr, SCM port)
1140{
1141 int c = 0;
1142 if ((c = scm_get_byte_or_eof (port)) != 'r')
1143 {
1144 scm_ungetc (c, port);
1145 return scm_read_scsh_block_comment (chr, port);
1146 }
1147 if ((c = scm_get_byte_or_eof (port)) != '6')
1148 {
1149 scm_ungetc (c, port);
1150 scm_ungetc ('r', port);
1151 return scm_read_scsh_block_comment (chr, port);
1152 }
1153 if ((c = scm_get_byte_or_eof (port)) != 'r')
1154 {
1155 scm_ungetc (c, port);
1156 scm_ungetc ('6', port);
1157 scm_ungetc ('r', port);
1158 return scm_read_scsh_block_comment (chr, port);
1159 }
1160 if ((c = scm_get_byte_or_eof (port)) != 's')
1161 {
1162 scm_ungetc (c, port);
1163 scm_ungetc ('r', port);
1164 scm_ungetc ('6', port);
1165 scm_ungetc ('r', port);
1166 return scm_read_scsh_block_comment (chr, port);
1167 }
1168
1169 return SCM_UNSPECIFIED;
1170}
1171
620c8965
LC
1172static SCM
1173scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1174{
1175 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1176 nested. So care must be taken. */
1177 int nesting_level = 1;
1178 int opening_seen = 0, closing_seen = 0;
1179
1180 while (nesting_level > 0)
1181 {
1182 int c = scm_getc (port);
1183
1184 if (c == EOF)
cd169c5a 1185 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1186 "unterminated `#| ... |#' comment", SCM_EOL);
1187
1188 if (opening_seen)
1189 {
1190 if (c == '|')
1191 nesting_level++;
1192 opening_seen = 0;
1193 }
1194 else if (closing_seen)
1195 {
1196 if (c == '#')
1197 nesting_level--;
1198 closing_seen = 0;
1199 }
1200 else if (c == '|')
1201 closing_seen = 1;
1202 else if (c == '#')
1203 opening_seen = 1;
1204 else
1205 opening_seen = closing_seen = 0;
1206 }
1207
1208 return SCM_UNSPECIFIED;
1209}
1210
34f3d47d 1211static SCM
889975e5 1212scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1213{
889975e5 1214 scm_t_wchar c;
34f3d47d
AW
1215
1216 c = flush_ws (port, (char *) NULL);
1217 if (EOF == c)
1218 scm_i_input_error ("read_commented_expression", port,
1219 "no expression after #; comment", SCM_EOL);
1220 scm_ungetc (c, port);
1221 scm_read_expression (port);
1222 return SCM_UNSPECIFIED;
1223}
1224
7337d56d 1225static SCM
889975e5 1226scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1227{
1228 /* Guile's extended symbol read syntax looks like this:
1229
1230 #{This is all a symbol name}#
1231
1232 So here, CHR is expected to be `{'. */
d9527cfa 1233 int saw_brace = 0;
7337d56d 1234 size_t len = 0;
190d4b0d 1235 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1236
889975e5 1237 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1238
1239 while ((chr = scm_getc (port)) != EOF)
1240 {
1241 if (saw_brace)
09a4f039 1242 {
7337d56d
LC
1243 if (chr == '#')
1244 {
7337d56d
LC
1245 break;
1246 }
1247 else
1248 {
1249 saw_brace = 0;
889975e5 1250 scm_i_string_set_x (buf, len++, '}');
7337d56d 1251 }
09a4f039 1252 }
d9527cfa
AW
1253
1254 if (chr == '}')
7337d56d 1255 saw_brace = 1;
d9527cfa
AW
1256 else if (chr == '\\')
1257 {
1258 /* It used to be that print.c would print extended-read-syntax
1259 symbols with backslashes before "non-standard" chars, but
1260 this routine wouldn't do anything with those escapes.
1261 Bummer. What we've done is to change print.c to output
1262 R6RS hex escapes for those characters, relying on the fact
1263 that the extended read syntax would never put a `\' before
1264 an `x'. For now, we just ignore other instances of
1265 backslash in the string. */
1266 switch ((chr = scm_getc (port)))
1267 {
1268 case EOF:
1269 goto done;
1270 case 'x':
1271 {
1272 scm_t_wchar c;
1273
1274 SCM_READ_HEX_ESCAPE (10, ';');
1275 scm_i_string_set_x (buf, len++, c);
1276 break;
1277
1278 str_eof:
1279 chr = EOF;
1280 goto done;
1281
1282 bad_escaped:
1283 scm_i_string_stop_writing ();
1284 scm_i_input_error ("scm_read_extended_symbol", port,
1285 "illegal character in escape sequence: ~S",
1286 scm_list_1 (SCM_MAKE_CHAR (c)));
1287 break;
1288 }
1289 default:
1290 scm_i_string_set_x (buf, len++, chr);
1291 break;
1292 }
1293 }
7337d56d 1294 else
d9527cfa 1295 scm_i_string_set_x (buf, len++, chr);
62850ef3 1296
889975e5 1297 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1298 {
7f991c7d
LC
1299 SCM addy;
1300
889975e5 1301 scm_i_string_stop_writing ();
190d4b0d 1302 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1303 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1304 len = 0;
889975e5 1305 buf = scm_i_string_start_writing (buf);
7337d56d 1306 }
7337d56d 1307 }
d9527cfa
AW
1308
1309 done:
889975e5 1310 scm_i_string_stop_writing ();
d9527cfa
AW
1311 if (chr == EOF)
1312 scm_i_input_error ("scm_read_extended_symbol", port,
1313 "end of file while reading symbol", SCM_EOL);
7337d56d 1314
889975e5 1315 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1316}
1317
1318
1319\f
1320/* Top-level token readers, i.e., dispatchers. */
1321
1322static SCM
1323scm_read_sharp_extension (int chr, SCM port)
1324{
1325 SCM proc;
1326
1327 proc = scm_get_hash_procedure (chr);
1328 if (scm_is_true (scm_procedure_p (proc)))
1329 {
1330 long line = SCM_LINUM (port);
1331 int column = SCM_COL (port) - 2;
1332 SCM got;
1333
1334 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1335 if (!scm_is_eq (got, SCM_UNSPECIFIED))
62850ef3 1336 {
7337d56d
LC
1337 if (SCM_RECORD_POSITIONS_P)
1338 return (recsexpr (got, line, column,
1339 SCM_FILENAME (port)));
1340 else
1341 return got;
62850ef3 1342 }
09a4f039 1343 }
7337d56d
LC
1344
1345 return SCM_UNSPECIFIED;
1346}
1347
1348/* The reader for the sharp `#' character. It basically dispatches reads
1349 among the above token readers. */
1350static SCM
889975e5 1351scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1352#define FUNC_NAME "scm_lreadr"
1353{
1354 SCM result;
1355
1356 chr = scm_getc (port);
1357
1358 result = scm_read_sharp_extension (chr, port);
1359 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1360 return result;
1361
1362 switch (chr)
1363 {
1364 case '\\':
1365 return (scm_read_character (chr, port));
1366 case '(':
1367 return (scm_read_vector (chr, port));
1368 case 's':
1369 case 'u':
1370 case 'f':
df941b5b 1371 case 'c':
7337d56d
LC
1372 /* This one may return either a boolean or an SRFI-4 vector. */
1373 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1374 case 'v':
1375 return (scm_read_bytevector (chr, port));
7337d56d
LC
1376 case '*':
1377 return (scm_read_guile_bit_vector (chr, port));
1378 case 't':
1379 case 'T':
1380 case 'F':
1381 /* This one may return either a boolean or an SRFI-4 vector. */
1382 return (scm_read_boolean (chr, port));
1383 case ':':
1384 return (scm_read_keyword (chr, port));
1385 case '0': case '1': case '2': case '3': case '4':
1386 case '5': case '6': case '7': case '8': case '9':
1387 case '@':
1388#if SCM_ENABLE_DEPRECATED
1389 /* See below for 'i' and 'e'. */
1390 case 'a':
7337d56d
LC
1391 case 'y':
1392 case 'h':
1393 case 'l':
1394#endif
1395 return (scm_i_read_array (port, chr));
1396
1397 case 'i':
1398 case 'e':
1399#if SCM_ENABLE_DEPRECATED
1400 {
1401 /* When next char is '(', it really is an old-style
1402 uniform array. */
889975e5 1403 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1404 if (next_c != EOF)
1405 scm_ungetc (next_c, port);
1406 if (next_c == '(')
1407 return scm_i_read_array (port, chr);
1408 /* Fall through. */
1409 }
1410#endif
1411 case 'b':
1412 case 'B':
1413 case 'o':
1414 case 'O':
1415 case 'd':
1416 case 'D':
1417 case 'x':
1418 case 'X':
1419 case 'I':
1420 case 'E':
1421 return (scm_read_number_and_radix (chr, port));
1422 case '{':
1423 return (scm_read_extended_symbol (chr, port));
1424 case '!':
911b03b2 1425 return (scm_read_shebang (chr, port));
34f3d47d
AW
1426 case ';':
1427 return (scm_read_commented_expression (chr, port));
1428 case '`':
1429 case '\'':
1430 case ',':
1431 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1432 case 'n':
1433 return (scm_read_nil (chr, port));
7337d56d
LC
1434 default:
1435 result = scm_read_sharp_extension (chr, port);
1436 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1437 {
1438 /* To remain compatible with 1.8 and earlier, the following
1439 characters have lower precedence than `read-hash-extend'
1440 characters. */
1441 switch (chr)
1442 {
1443 case '|':
1444 return scm_read_r6rs_block_comment (chr, port);
1445 default:
1446 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1447 scm_list_1 (SCM_MAKE_CHAR (chr)));
1448 }
1449 }
7337d56d
LC
1450 else
1451 return result;
1452 }
1453
1454 return SCM_UNSPECIFIED;
1455}
1456#undef FUNC_NAME
1457
1458static SCM
1459scm_read_expression (SCM port)
1460#define FUNC_NAME "scm_read_expression"
1461{
1462 while (1)
1463 {
889975e5 1464 register scm_t_wchar chr;
7337d56d
LC
1465
1466 chr = scm_getc (port);
1467
1468 switch (chr)
1469 {
1470 case SCM_WHITE_SPACES:
1471 case SCM_LINE_INCREMENTORS:
1472 break;
1473 case ';':
1474 (void) scm_read_semicolon_comment (chr, port);
1475 break;
5afa815c
AW
1476 case '[':
1477 if (!SCM_SQUARE_BRACKETS_P)
1478 return (scm_read_mixed_case_symbol (chr, port));
1479 /* otherwise fall through */
7337d56d
LC
1480 case '(':
1481 return (scm_read_sexp (chr, port));
1482 case '"':
1483 return (scm_read_string (chr, port));
1484 case '\'':
1485 case '`':
1486 case ',':
1487 return (scm_read_quote (chr, port));
1488 case '#':
1489 {
1490 SCM result;
1491 result = scm_read_sharp (chr, port);
1492 if (scm_is_eq (result, SCM_UNSPECIFIED))
1493 /* We read a comment or some such. */
1494 break;
1495 else
1496 return result;
1497 }
1498 case ')':
1499 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1500 break;
a4e47229
MG
1501 case ']':
1502 if (SCM_SQUARE_BRACKETS_P)
1503 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1504 /* otherwise fall through */
7337d56d
LC
1505 case EOF:
1506 return SCM_EOF_VAL;
1507 case ':':
1508 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1509 return scm_symbol_to_keyword (scm_read_expression (port));
1510 /* Fall through. */
1511
1512 default:
1513 {
1514 if (((chr >= '0') && (chr <= '9'))
1515 || (strchr ("+-.", chr)))
1516 return (scm_read_number (chr, port));
1517 else
1518 return (scm_read_mixed_case_symbol (chr, port));
1519 }
1520 }
1521 }
1522}
1523#undef FUNC_NAME
1524
1525\f
1526/* Actual reader. */
1527
1528SCM_DEFINE (scm_read, "read", 0, 1, 0,
1529 (SCM port),
1530 "Read an s-expression from the input port @var{port}, or from\n"
1531 "the current input port if @var{port} is not specified.\n"
1532 "Any whitespace before the next token is discarded.")
1533#define FUNC_NAME s_scm_read
1534{
1535 int c;
1536
1537 if (SCM_UNBNDP (port))
1538 port = scm_current_input_port ();
1539 SCM_VALIDATE_OPINPORT (1, port);
1540
1541 c = flush_ws (port, (char *) NULL);
1542 if (EOF == c)
1543 return SCM_EOF_VAL;
1544 scm_ungetc (c, port);
1545
1546 return (scm_read_expression (port));
09a4f039 1547}
db4b4ca6 1548#undef FUNC_NAME
09a4f039 1549
0f2d19dd
JB
1550
1551\f
1552
7337d56d
LC
1553/* Used when recording expressions constructed by `scm_read_sharp ()'. */
1554static SCM
1555recsexpr (SCM obj, long line, int column, SCM filename)
1556{
1557 if (!scm_is_pair(obj)) {
1558 return obj;
1559 } else {
1560 SCM tmp = obj, copy;
1561 /* If this sexpr is visible in the read:sharp source, we want to
1562 keep that information, so only record non-constant cons cells
1563 which haven't previously been read by the reader. */
d1c4720c 1564 if (scm_is_false (scm_hashq_ref (scm_source_whash, obj, SCM_BOOL_F)))
7337d56d
LC
1565 {
1566 if (SCM_COPY_SOURCE_P)
1567 {
1568 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1569 SCM_UNDEFINED);
1570 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1571 {
1572 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1573 line,
1574 column,
1575 filename),
1576 SCM_UNDEFINED));
1577 copy = SCM_CDR (copy);
1578 }
1579 SCM_SETCDR (copy, tmp);
1580 }
1581 else
1582 {
1583 recsexpr (SCM_CAR (obj), line, column, filename);
1584 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1585 recsexpr (SCM_CAR (tmp), line, column, filename);
1586 copy = SCM_UNDEFINED;
1587 }
d1c4720c
AW
1588 scm_hashq_set_x (scm_source_whash,
1589 obj,
1590 scm_make_srcprops (line,
1591 column,
1592 filename,
1593 copy,
1594 SCM_EOL));
7337d56d
LC
1595 }
1596 return obj;
1597 }
1598}
1599
14de3b42
GH
1600/* Manipulate the read-hash-procedures alist. This could be written in
1601 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1602SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1603 (SCM chr, SCM proc),
dc7fa443
MG
1604 "Install the procedure @var{proc} for reading expressions\n"
1605 "starting with the character sequence @code{#} and @var{chr}.\n"
1606 "@var{proc} will be called with two arguments: the character\n"
1607 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1608 "returned will be the return value of @code{read}. \n"
1609 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1610 )
1bbd0b84 1611#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1612{
fed9c9a2
GH
1613 SCM this;
1614 SCM prev;
1615
36284627 1616 SCM_VALIDATE_CHAR (1, chr);
7888309b 1617 SCM_ASSERT (scm_is_false (proc)
bc36d050 1618 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1619 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1620
14de3b42 1621 /* Check if chr is already in the alist. */
d458073b 1622 this = scm_i_read_hash_procedures_ref ();
14de3b42 1623 prev = SCM_BOOL_F;
fed9c9a2
GH
1624 while (1)
1625 {
d2e53ed6 1626 if (scm_is_null (this))
fed9c9a2
GH
1627 {
1628 /* not found, so add it to the beginning. */
7888309b 1629 if (scm_is_true (proc))
fed9c9a2 1630 {
d458073b
AR
1631 SCM new = scm_cons (scm_cons (chr, proc),
1632 scm_i_read_hash_procedures_ref ());
1633 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1634 }
1635 break;
1636 }
bc36d050 1637 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1638 {
1639 /* already in the alist. */
7888309b 1640 if (scm_is_false (proc))
14de3b42
GH
1641 {
1642 /* remove it. */
7888309b 1643 if (scm_is_false (prev))
14de3b42 1644 {
d458073b
AR
1645 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1646 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1647 }
1648 else
1649 scm_set_cdr_x (prev, SCM_CDR (this));
1650 }
fed9c9a2 1651 else
14de3b42
GH
1652 {
1653 /* replace it. */
1654 scm_set_cdr_x (SCM_CAR (this), proc);
1655 }
fed9c9a2
GH
1656 break;
1657 }
1658 prev = this;
1659 this = SCM_CDR (this);
1660 }
deca31e1 1661
deca31e1
GH
1662 return SCM_UNSPECIFIED;
1663}
1bbd0b84 1664#undef FUNC_NAME
0f2d19dd 1665
deca31e1
GH
1666/* Recover the read-hash procedure corresponding to char c. */
1667static SCM
6e8d25a6 1668scm_get_hash_procedure (int c)
deca31e1 1669{
d458073b 1670 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1671
deca31e1
GH
1672 while (1)
1673 {
d2e53ed6 1674 if (scm_is_null (rest))
deca31e1
GH
1675 return SCM_BOOL_F;
1676
7866a09b 1677 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1678 return SCM_CDAR (rest);
1679
1680 rest = SCM_CDR (rest);
1681 }
1682}
1cc91f1b 1683
889975e5
MG
1684#define SCM_ENCODING_SEARCH_SIZE (500)
1685
f8a1c9a8
LC
1686/* Search the first few hundred characters of a file for an Emacs-like coding
1687 declaration. Returns either NULL or a string whose storage has been
1688 allocated with `scm_gc_malloc ()'. */
889975e5 1689char *
f8a1c9a8 1690scm_i_scan_for_encoding (SCM port)
889975e5 1691{
d900843c 1692 scm_t_port *pt;
889975e5 1693 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1694 size_t bytes_read, encoding_length, i;
889975e5
MG
1695 char *encoding = NULL;
1696 int utf8_bom = 0;
daedbca7 1697 char *pos, *encoding_start;
889975e5
MG
1698 int in_comment;
1699
d900843c 1700 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1701
d900843c
AW
1702 if (pt->rw_active == SCM_PORT_WRITE)
1703 scm_flush (port);
49bb5bd3 1704
d900843c
AW
1705 if (pt->rw_random)
1706 pt->rw_active = SCM_PORT_READ;
1707
1708 if (pt->read_pos == pt->read_end)
1709 {
1710 /* We can use the read buffer, and thus avoid a seek. */
1711 if (scm_fill_input (port) == EOF)
1712 return NULL;
1713
1714 bytes_read = pt->read_end - pt->read_pos;
1715 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1716 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1717
1718 if (bytes_read <= 1)
1719 /* An unbuffered port -- don't scan. */
1720 return NULL;
1721
1722 memcpy (header, pt->read_pos, bytes_read);
1723 header[bytes_read] = '\0';
1724 }
1725 else
1726 {
1727 /* Try to read some bytes and then seek back. Not all ports
1728 support seeking back; and indeed some file ports (like
1729 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1730 check performed by SCM_FPORT_FDES---but fail to seek
1731 backwards. Hence this block comes second. We prefer to use
1732 the read buffer in-place. */
1733 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1734 return NULL;
1735
1736 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1737 header[bytes_read] = '\0';
1738 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1739 }
889975e5
MG
1740
1741 if (bytes_read > 3
1742 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1743 utf8_bom = 1;
1744
1745 /* search past "coding[:=]" */
1746 pos = header;
1747 while (1)
1748 {
1749 if ((pos = strstr(pos, "coding")) == NULL)
1750 return NULL;
1751
1752 pos += strlen("coding");
1753 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1754 (*pos == ':' || *pos == '='))
1755 {
1756 pos ++;
1757 break;
1758 }
1759 }
1760
1761 /* skip spaces */
1762 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1763 (*pos == ' ' || *pos == '\t'))
1764 pos ++;
1765
1766 /* grab the next token */
daedbca7 1767 encoding_start = pos;
889975e5 1768 i = 0;
daedbca7
MG
1769 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1770 && encoding_start + i - header < bytes_read
1771 && (isalnum ((int) encoding_start[i])
1772 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1773 i++;
1774
daedbca7
MG
1775 encoding_length = i;
1776 if (encoding_length == 0)
889975e5
MG
1777 return NULL;
1778
daedbca7
MG
1779 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1780 for (i = 0; i < encoding_length; i++)
026ed239 1781 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1782
1783 /* push backwards to make sure we were in a comment */
1784 in_comment = 0;
daedbca7
MG
1785 pos = encoding_start;
1786 while (pos >= header)
889975e5 1787 {
8a12aeb9
AW
1788 if (*pos == ';')
1789 {
1790 in_comment = 1;
1791 break;
1792 }
1793 else if (*pos == '\n' || pos == header)
889975e5
MG
1794 {
1795 /* This wasn't in a semicolon comment. Check for a
1796 hash-bang comment. */
1797 char *beg = strstr (header, "#!");
1798 char *end = strstr (header, "!#");
8a12aeb9 1799 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
1800 in_comment = 1;
1801 break;
1802 }
8a12aeb9
AW
1803 else
1804 {
1805 pos --;
1806 continue;
1807 }
889975e5
MG
1808 }
1809 if (!in_comment)
f8a1c9a8
LC
1810 /* This wasn't in a comment */
1811 return NULL;
1812
889975e5 1813 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1814 scm_misc_error (NULL,
889975e5
MG
1815 "the port input declares the encoding ~s but is encoded as UTF-8",
1816 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1817
889975e5
MG
1818 return encoding;
1819}
1820
1821SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1822 (SCM port),
a270e133 1823 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 1824 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
1825 "The coding declaration is of the form\n"
1826 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1827 "\n"
1828 "Returns a string containing the character encoding of the file\n"
1829 "if a declaration was found, or @code{#f} otherwise.\n")
1830#define FUNC_NAME s_scm_file_encoding
1831{
1832 char *enc;
1833 SCM s_enc;
f8a1c9a8 1834
d900843c
AW
1835 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1836
f8a1c9a8 1837 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1838 if (enc == NULL)
1839 return SCM_BOOL_F;
1840 else
1841 {
1842 s_enc = scm_from_locale_string (enc);
889975e5
MG
1843 return s_enc;
1844 }
f8a1c9a8 1845
889975e5
MG
1846 return SCM_BOOL_F;
1847}
1848#undef FUNC_NAME
1849
0f2d19dd
JB
1850void
1851scm_init_read ()
0f2d19dd 1852{
d458073b
AR
1853 SCM read_hash_procs;
1854
1855 read_hash_procs = scm_make_fluid ();
1856 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1857
1858 scm_i_read_hash_procedures =
1859 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 1860
62560650 1861 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1862#include "libguile/read.x"
0f2d19dd 1863}
89e00824
ML
1864
1865/*
1866 Local Variables:
1867 c-file-style: "gnu"
1868 End:
1869*/