remove elisp-strings and elisp-vectors read options
[bpt/guile.git] / libguile / read.c
CommitLineData
a8fc3852 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
7337d56d 32
a0599745 33#include "libguile/_scm.h"
0ba0b384 34#include "libguile/bytevectors.h"
a0599745
MD
35#include "libguile/chars.h"
36#include "libguile/eval.h"
2fa901a5 37#include "libguile/arrays.h"
cf396142 38#include "libguile/bitvectors.h"
a0599745
MD
39#include "libguile/keywords.h"
40#include "libguile/alist.h"
41#include "libguile/srcprop.h"
42#include "libguile/hashtab.h"
43#include "libguile/hash.h"
44#include "libguile/ports.h"
49bb5bd3 45#include "libguile/fports.h"
a0599745
MD
46#include "libguile/root.h"
47#include "libguile/strings.h"
ba1b2226 48#include "libguile/strports.h"
a0599745 49#include "libguile/vectors.h"
a0599745 50#include "libguile/validate.h"
a4022e69 51#include "libguile/srfi-4.h"
7337d56d 52#include "libguile/srfi-13.h"
ba1b2226 53
a0599745 54#include "libguile/read.h"
22fc179a
HWN
55#include "libguile/private-options.h"
56
0f2d19dd
JB
57
58\f
59
5bf6a6f0 60SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 61SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 62SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 63SCM_SYMBOL (sym_nil, "nil");
c7733771 64
92c2555f 65scm_t_option scm_read_opts[] = {
b7ff98dd
MD
66 { SCM_OPTION_BOOLEAN, "copy", 0,
67 "Copy source code expressions." },
ac74fc22 68 { SCM_OPTION_BOOLEAN, "positions", 0,
deca31e1
GH
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 71 "Convert symbols to lower case."},
62316c7f 72 { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
904fabb6 73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
74 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
75 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
76 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
77 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
62560650 78 { 0, },
a16f6fe7
MD
79};
80
39e8f371
HWN
81/*
82 Give meaningful error messages for errors
83
84 We use the format
85
ba1b2226 86 FILE:LINE:COL: MESSAGE
39e8f371
HWN
87 This happened in ....
88
89 This is not standard GNU format, but the test-suite likes the real
90 message to be in front.
91
39e8f371
HWN
92 */
93
94
a4022e69
MV
95void
96scm_i_input_error (char const *function,
97 SCM port, const char *message, SCM arg)
ba1b2226 98{
29a837fd
MV
99 SCM fn = (scm_is_string (SCM_FILENAME(port))
100 ? SCM_FILENAME(port)
101 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 102
29a837fd 103 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
104 SCM string = SCM_EOL;
105 scm_simple_format (string_port,
272632a6 106 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 107 scm_list_4 (fn,
b3aa4626 108 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 109 scm_from_int (SCM_COL (port) + 1),
272632a6 110 scm_from_locale_string (message)));
ba1b2226
HWN
111
112 string = scm_get_output_string (string_port);
113 scm_close_output_port (string_port);
272632a6 114 scm_error_scm (scm_from_locale_symbol ("read-error"),
a4022e69 115 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 116 string,
dd72382c 117 arg,
ba1b2226
HWN
118 SCM_BOOL_F);
119}
39e8f371
HWN
120
121
a1ec6916 122SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 123 (SCM setting),
dc7fa443
MG
124 "Option interface for the read options. Instead of using\n"
125 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 126 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 127#define FUNC_NAME s_scm_read_options
a16f6fe7 128{
b7ff98dd
MD
129 SCM ans = scm_options (setting,
130 scm_read_opts,
1bbd0b84 131 FUNC_NAME);
b7ff98dd
MD
132 if (SCM_COPY_SOURCE_P)
133 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
134 return ans;
135}
1bbd0b84 136#undef FUNC_NAME
a16f6fe7 137
14de3b42
GH
138/* An association list mapping extra hash characters to procedures. */
139static SCM *scm_read_hash_procedures;
deca31e1 140
0f2d19dd 141
7337d56d
LC
142\f
143/* Token readers. */
0f2d19dd 144
0f2d19dd 145
7337d56d
LC
146/* Size of the C buffer used to read symbols and numbers. */
147#define READER_BUFFER_SIZE 128
0f2d19dd 148
7337d56d
LC
149/* Size of the C buffer used to read strings. */
150#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 151
7337d56d
LC
152/* The maximum size of Scheme character names. */
153#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 154
94115ae3 155
7337d56d
LC
156/* `isblank' is only in C99. */
157#define CHAR_IS_BLANK_(_chr) \
158 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 159 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
160
161#ifdef MSDOS
162# define CHAR_IS_BLANK(_chr) \
163 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
164#else
165# define CHAR_IS_BLANK CHAR_IS_BLANK_
166#endif
167
168
169/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
170 structure''). */
171#define CHAR_IS_R5RS_DELIMITER(c) \
172 (CHAR_IS_BLANK (c) \
5afa815c
AW
173 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
174 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
175
176#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
177
178/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
179 Structure''. */
180#define CHAR_IS_EXPONENT_MARKER(_chr) \
181 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
182 || ((_chr) == 'd') || ((_chr) == 'l'))
183
454866e0 184/* Read an SCSH block comment. */
620c8965
LC
185static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
186static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
187static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 188static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 189static SCM scm_get_hash_procedure (int);
454866e0 190
69f90b0b
MG
191/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
192 result in the pre-allocated buffer BUF. Return zero if the whole token has
193 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
194 bytes actually read. */
7337d56d 195static inline int
69f90b0b
MG
196read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
197 {
198 *read = 0;
0520c320 199
69f90b0b
MG
200 while (*read < buf_size)
201 {
202 int chr;
7337d56d 203
69f90b0b 204 chr = scm_get_byte_or_eof (port);
889975e5 205
69f90b0b
MG
206 if (chr == EOF)
207 return 0;
208 else if (CHAR_IS_DELIMITER (chr))
209 {
210 scm_unget_byte (chr, port);
211 return 0;
212 }
213 else
214 {
215 *buf = (char) chr;
216 buf++, (*read)++;
217 }
218 }
889975e5 219
69f90b0b
MG
220 return 1;
221 }
889975e5 222
69f90b0b
MG
223/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
224 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
225 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
226 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
227 will be set the number of bytes actually read. */
228static int
229read_complete_token (SCM port, char *buffer, const size_t buffer_size,
230 char **overflow_buffer, size_t *read)
231{
232 int overflow = 0;
233 size_t bytes_read, overflow_size;
7337d56d 234
69f90b0b
MG
235 *overflow_buffer = NULL;
236 overflow_size = 0;
1cc91f1b 237
69f90b0b
MG
238 do
239 {
240 overflow = read_token (port, buffer, buffer_size, &bytes_read);
241 if (bytes_read == 0)
242 break;
243 if (overflow || overflow_size != 0)
244 {
245 if (overflow_size == 0)
246 {
247 *overflow_buffer = scm_malloc (bytes_read);
248 memcpy (*overflow_buffer, buffer, bytes_read);
249 overflow_size = bytes_read;
250 }
251 else
252 {
253 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
254 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
255 overflow_size += bytes_read;
256 }
257 }
889975e5 258 }
69f90b0b 259 while (overflow);
889975e5 260
69f90b0b
MG
261 if (overflow_size)
262 *read = overflow_size;
b521d265 263 else
69f90b0b
MG
264 *read = bytes_read;
265
266 return (overflow_size != 0);
889975e5 267}
7337d56d
LC
268
269/* Skip whitespace from PORT and return the first non-whitespace character
270 read. Raise an error on end-of-file. */
271static int
272flush_ws (SCM port, const char *eoferr)
0f2d19dd 273{
889975e5 274 register scm_t_wchar c;
0f2d19dd 275 while (1)
b7f3516f 276 switch (c = scm_getc (port))
0f2d19dd
JB
277 {
278 case EOF:
279 goteof:
280 if (eoferr)
d156d3b7 281 {
a4022e69
MV
282 scm_i_input_error (eoferr,
283 port,
284 "end of file",
285 SCM_EOL);
d156d3b7 286 }
0f2d19dd 287 return c;
7337d56d 288
0f2d19dd
JB
289 case ';':
290 lp:
b7f3516f 291 switch (c = scm_getc (port))
0f2d19dd
JB
292 {
293 case EOF:
294 goto goteof;
295 default:
296 goto lp;
297 case SCM_LINE_INCREMENTORS:
298 break;
299 }
300 break;
7337d56d 301
454866e0
LC
302 case '#':
303 switch (c = scm_getc (port))
304 {
305 case EOF:
306 eoferr = "read_sharp";
307 goto goteof;
308 case '!':
911b03b2 309 scm_read_shebang (c, port);
454866e0 310 break;
34f3d47d
AW
311 case ';':
312 scm_read_commented_expression (c, port);
313 break;
620c8965
LC
314 case '|':
315 if (scm_is_false (scm_get_hash_procedure (c)))
316 {
317 scm_read_r6rs_block_comment (c, port);
318 break;
319 }
320 /* fall through */
454866e0
LC
321 default:
322 scm_ungetc (c, port);
323 return '#';
324 }
325 break;
326
0f2d19dd 327 case SCM_LINE_INCREMENTORS:
0f2d19dd 328 case SCM_SINGLE_SPACES:
0f2d19dd 329 case '\t':
0f2d19dd 330 break;
7337d56d 331
0f2d19dd
JB
332 default:
333 return c;
334 }
7337d56d
LC
335
336 return 0;
0f2d19dd
JB
337}
338
339
7337d56d
LC
340\f
341/* Token readers. */
1cc91f1b 342
7337d56d
LC
343static SCM scm_read_expression (SCM port);
344static SCM scm_read_sharp (int chr, SCM port);
7337d56d 345static SCM recsexpr (SCM obj, long line, int column, SCM filename);
0f2d19dd
JB
346
347
09a4f039 348static SCM
889975e5 349scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 350#define FUNC_NAME "scm_i_lreadparen"
09a4f039 351{
7337d56d
LC
352 register int c;
353 register SCM tmp;
354 register SCM tl, ans = SCM_EOL;
bd22f1c7 355 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
5afa815c 356 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
357
358 /* Need to capture line and column numbers here. */
359 long line = SCM_LINUM (port);
360 int column = SCM_COL (port) - 1;
f9c68a47 361
f9c68a47 362
7337d56d
LC
363 c = flush_ws (port, FUNC_NAME);
364 if (terminating_char == c)
365 return SCM_EOL;
f9c68a47 366
7337d56d
LC
367 scm_ungetc (c, port);
368 if (scm_is_eq (scm_sym_dot,
369 (tmp = scm_read_expression (port))))
370 {
371 ans = scm_read_expression (port);
372 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
373 scm_i_input_error (FUNC_NAME, port, "missing close paren",
374 SCM_EOL);
375 return ans;
376 }
1cc91f1b 377
7337d56d
LC
378 /* Build the head of the list structure. */
379 ans = tl = scm_cons (tmp, SCM_EOL);
380
381 if (SCM_COPY_SOURCE_P)
382 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
383 ? copy
384 : tmp,
385 SCM_EOL);
386
387 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 388 {
7337d56d 389 SCM new_tail;
0f2d19dd 390
5b69315e
AW
391 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
392 scm_i_input_error (FUNC_NAME, port,
393 "in pair: mismatched close paren: ~A",
394 scm_list_1 (SCM_MAKE_CHAR (c)));
395
7337d56d 396 scm_ungetc (c, port);
5b69315e
AW
397 tmp = scm_read_expression (port);
398
399 if (scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 400 {
7337d56d
LC
401 SCM_SETCDR (tl, tmp = scm_read_expression (port));
402
403 if (SCM_COPY_SOURCE_P)
404 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
405 SCM_EOL));
406
407 c = flush_ws (port, FUNC_NAME);
408 if (terminating_char != c)
409 scm_i_input_error (FUNC_NAME, port,
410 "in pair: missing close paren", SCM_EOL);
411 goto exit;
0f2d19dd 412 }
b858464a 413
7337d56d
LC
414 new_tail = scm_cons (tmp, SCM_EOL);
415 SCM_SETCDR (tl, new_tail);
416 tl = new_tail;
417
418 if (SCM_COPY_SOURCE_P)
0f2d19dd 419 {
7337d56d
LC
420 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
421 ? copy
422 : tmp, SCM_EOL);
423 SCM_SETCDR (tl2, new_tail2);
424 tl2 = new_tail2;
425 }
426 }
0f2d19dd 427
7337d56d
LC
428 exit:
429 if (SCM_RECORD_POSITIONS_P)
430 scm_whash_insert (scm_source_whash,
431 ans,
432 scm_make_srcprops (line, column,
433 SCM_FILENAME (port),
434 SCM_COPY_SOURCE_P
435 ? ans2
436 : SCM_UNDEFINED,
437 SCM_EOL));
438 return ans;
439}
440#undef FUNC_NAME
a4022e69 441
c5661d28
MG
442
443/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
444 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
445 found. */
446#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
447 do \
448 { \
449 scm_t_wchar a; \
450 size_t i = 0; \
451 c = 0; \
452 while (i < ndigits) \
453 { \
454 a = scm_getc (port); \
455 if (a == EOF) \
456 goto str_eof; \
457 if (terminator \
458 && (a == (scm_t_wchar) terminator) \
459 && (i > 0)) \
460 break; \
461 if ('0' <= a && a <= '9') \
462 a -= '0'; \
463 else if ('A' <= a && a <= 'F') \
464 a = a - 'A' + 10; \
465 else if ('a' <= a && a <= 'f') \
466 a = a - 'a' + 10; \
467 else \
468 { \
469 c = a; \
470 goto bad_escaped; \
471 } \
472 c = c * 16 + a; \
473 i ++; \
474 } \
c5661d28
MG
475 } while (0)
476
7337d56d
LC
477static SCM
478scm_read_string (int chr, SCM port)
479#define FUNC_NAME "scm_lreadr"
480{
481 /* For strings smaller than C_STR, this function creates only one Scheme
482 object (the string returned). */
0f2d19dd 483
7337d56d 484 SCM str = SCM_BOOL_F;
7337d56d 485 unsigned c_str_len = 0;
9c44cd45 486 scm_t_wchar c;
eb42ff25 487
9c44cd45 488 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
7337d56d
LC
489 while ('"' != (c = scm_getc (port)))
490 {
491 if (c == EOF)
9c44cd45
MG
492 {
493 str_eof:
494 scm_i_input_error (FUNC_NAME, port,
495 "end of file in string constant", SCM_EOL);
496 }
0f2d19dd 497
9c44cd45
MG
498 if (c_str_len + 1 >= scm_i_string_length (str))
499 {
500 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
0f2d19dd 501
9c44cd45
MG
502 str = scm_string_append (scm_list_2 (str, addy));
503 }
0f2d19dd 504
7337d56d 505 if (c == '\\')
9c44cd45
MG
506 {
507 switch (c = scm_getc (port))
508 {
509 case EOF:
510 goto str_eof;
511 case '"':
512 case '\\':
513 break;
9c44cd45
MG
514 case '\n':
515 continue;
516 case '0':
517 c = '\0';
518 break;
519 case 'f':
520 c = '\f';
521 break;
522 case 'n':
523 c = '\n';
524 break;
525 case 'r':
526 c = '\r';
527 break;
528 case 't':
529 c = '\t';
530 break;
531 case 'a':
532 c = '\007';
533 break;
534 case 'v':
535 c = '\v';
536 break;
67a4a16d
MG
537 case 'b':
538 c = '\010';
539 break;
9c44cd45 540 case 'x':
dea901d6
MG
541 if (SCM_R6RS_ESCAPES_P)
542 SCM_READ_HEX_ESCAPE (10, ';');
543 else
544 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 545 break;
9c44cd45 546 case 'u':
898a0b5a
MG
547 if (!SCM_R6RS_ESCAPES_P)
548 {
549 SCM_READ_HEX_ESCAPE (4, '\0');
550 break;
551 }
9c44cd45 552 case 'U':
898a0b5a
MG
553 if (!SCM_R6RS_ESCAPES_P)
554 {
555 SCM_READ_HEX_ESCAPE (6, '\0');
556 break;
557 }
9c44cd45
MG
558 default:
559 bad_escaped:
560 scm_i_input_error (FUNC_NAME, port,
561 "illegal character in escape sequence: ~S",
562 scm_list_1 (SCM_MAKE_CHAR (c)));
563 }
564 }
565 str = scm_i_string_start_writing (str);
566 scm_i_string_set_x (str, c_str_len++, c);
567 scm_i_string_stop_writing ();
7337d56d 568 }
f13b4400 569
7337d56d
LC
570 if (c_str_len > 0)
571 {
9c44cd45 572 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 573 }
69f90b0b 574
9c44cd45 575 return scm_nullstr;
0f2d19dd 576}
db4b4ca6
DH
577#undef FUNC_NAME
578
0f2d19dd 579
7337d56d 580static SCM
889975e5 581scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 582{
69f90b0b
MG
583 SCM result, str = SCM_EOL;
584 char buffer[READER_BUFFER_SIZE];
585 char *overflow_buffer = NULL;
586 size_t bytes_read;
587 int overflow;
588 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 589
7337d56d 590 scm_ungetc (chr, port);
69f90b0b
MG
591 overflow = read_complete_token (port, buffer, sizeof (buffer),
592 &overflow_buffer, &bytes_read);
593
594 if (!overflow)
595 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
596 else
597 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
598 pt->ilseq_handler);
599
600 result = scm_string_to_number (str, SCM_UNDEFINED);
889975e5 601 if (!scm_is_true (result))
69f90b0b
MG
602 {
603 /* Return a symbol instead of a number */
604 if (SCM_CASE_INSENSITIVE_P)
605 str = scm_string_downcase_x (str);
606 result = scm_string_to_symbol (str);
607 }
0f2d19dd 608
69f90b0b
MG
609 if (overflow)
610 free (overflow_buffer);
611 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
612 return result;
613}
0f2d19dd 614
7337d56d 615static SCM
889975e5 616scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 617{
889975e5
MG
618 SCM result;
619 int ends_with_colon = 0;
69f90b0b 620 size_t bytes_read;
ef4cbc08 621 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
69f90b0b
MG
622 int overflow;
623 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
624 scm_t_port *pt = SCM_PTAB_ENTRY (port);
625 SCM str;
7337d56d
LC
626
627 scm_ungetc (chr, port);
69f90b0b
MG
628 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
629 &overflow_buffer, &bytes_read);
630 if (bytes_read > 0)
631 {
632 if (!overflow)
633 ends_with_colon = buffer[bytes_read - 1] == ':';
634 else
635 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
636 }
ef4cbc08 637
69f90b0b
MG
638 if (postfix && ends_with_colon && (bytes_read > 1))
639 {
640 if (!overflow)
641 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
642 else
643 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
644 pt->ilseq_handler);
645
646 if (SCM_CASE_INSENSITIVE_P)
647 str = scm_string_downcase_x (str);
648 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
649 }
7337d56d 650 else
69f90b0b
MG
651 {
652 if (!overflow)
653 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
654 else
655 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
656 pt->ilseq_handler);
657
658 if (SCM_CASE_INSENSITIVE_P)
659 str = scm_string_downcase_x (str);
660 result = scm_string_to_symbol (str);
661 }
7337d56d 662
69f90b0b
MG
663 if (overflow)
664 free (overflow_buffer);
665 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
666 return result;
667}
668
669static SCM
889975e5 670scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
671#define FUNC_NAME "scm_lreadr"
672{
889975e5 673 SCM result;
7337d56d 674 size_t read;
69f90b0b
MG
675 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
676 int overflow;
7337d56d 677 unsigned int radix;
69f90b0b
MG
678 SCM str;
679 scm_t_port *pt;
7337d56d
LC
680
681 switch (chr)
682 {
683 case 'B':
684 case 'b':
685 radix = 2;
686 break;
687
688 case 'o':
689 case 'O':
690 radix = 8;
691 break;
692
693 case 'd':
694 case 'D':
695 radix = 10;
696 break;
697
698 case 'x':
699 case 'X':
700 radix = 16;
701 break;
702
703 default:
704 scm_ungetc (chr, port);
705 scm_ungetc ('#', port);
706 radix = 10;
707 }
708
69f90b0b
MG
709 overflow = read_complete_token (port, buffer, sizeof (buffer),
710 &overflow_buffer, &read);
711
712 pt = SCM_PTAB_ENTRY (port);
713 if (!overflow)
714 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
715 else
716 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
717 pt->ilseq_handler);
718
719 result = scm_string_to_number (str, scm_from_uint (radix));
720
721 if (overflow)
722 free (overflow_buffer);
723
724 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
725
726 if (scm_is_true (result))
727 return result;
728
729 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
730
731 return SCM_BOOL_F;
732}
733#undef FUNC_NAME
734
735static SCM
736scm_read_quote (int chr, SCM port)
737{
738 SCM p;
492faee1
LC
739 long line = SCM_LINUM (port);
740 int column = SCM_COL (port) - 1;
7337d56d
LC
741
742 switch (chr)
743 {
744 case '`':
745 p = scm_sym_quasiquote;
746 break;
747
748 case '\'':
749 p = scm_sym_quote;
750 break;
751
752 case ',':
753 {
889975e5 754 scm_t_wchar c;
7337d56d
LC
755
756 c = scm_getc (port);
757 if ('@' == c)
758 p = scm_sym_uq_splicing;
759 else
0f2d19dd 760 {
7337d56d
LC
761 scm_ungetc (c, port);
762 p = scm_sym_unquote;
0f2d19dd 763 }
7337d56d
LC
764 break;
765 }
0f2d19dd 766
7337d56d
LC
767 default:
768 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 769 "scm_read_quote", chr);
7337d56d 770 abort ();
0f2d19dd 771 }
1cc91f1b 772
7337d56d 773 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1
LC
774 if (SCM_RECORD_POSITIONS_P)
775 scm_whash_insert (scm_source_whash, p,
776 scm_make_srcprops (line, column,
777 SCM_FILENAME (port),
778 SCM_COPY_SOURCE_P
779 ? (scm_cons2 (SCM_CAR (p),
780 SCM_CAR (SCM_CDR (p)),
781 SCM_EOL))
782 : SCM_UNDEFINED,
783 SCM_EOL));
784
0f2d19dd 785
7337d56d
LC
786 return p;
787}
788
34f3d47d
AW
789SCM_SYMBOL (sym_syntax, "syntax");
790SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
791SCM_SYMBOL (sym_unsyntax, "unsyntax");
792SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
793
794static SCM
795scm_read_syntax (int chr, SCM port)
796{
797 SCM p;
798 long line = SCM_LINUM (port);
799 int column = SCM_COL (port) - 1;
800
801 switch (chr)
802 {
803 case '`':
804 p = sym_quasisyntax;
805 break;
806
807 case '\'':
808 p = sym_syntax;
809 break;
810
811 case ',':
812 {
813 int c;
814
815 c = scm_getc (port);
816 if ('@' == c)
817 p = sym_unsyntax_splicing;
818 else
819 {
820 scm_ungetc (c, port);
821 p = sym_unsyntax;
822 }
823 break;
824 }
825
826 default:
827 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
828 "scm_read_syntax", chr);
829 abort ();
830 }
831
832 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
833 if (SCM_RECORD_POSITIONS_P)
834 scm_whash_insert (scm_source_whash, p,
835 scm_make_srcprops (line, column,
836 SCM_FILENAME (port),
837 SCM_COPY_SOURCE_P
838 ? (scm_cons2 (SCM_CAR (p),
839 SCM_CAR (SCM_CDR (p)),
840 SCM_EOL))
841 : SCM_UNDEFINED,
842 SCM_EOL));
843
844
845 return p;
846}
847
7c4aad9c
AW
848static inline SCM
849scm_read_nil (int chr, SCM port)
850{
851 SCM id = scm_read_mixed_case_symbol (chr, port);
852
853 if (!scm_is_eq (id, sym_nil))
854 scm_i_input_error ("scm_read_nil", port,
855 "unexpected input while reading #nil: ~a",
856 scm_list_1 (id));
857
858 return SCM_ELISP_NIL;
859}
860
7337d56d
LC
861static inline SCM
862scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 863{
0f2d19dd
JB
864 int c;
865
889975e5
MG
866 /* We use the get_byte here because there is no need to get the
867 locale correct with comment input. This presumes that newline
868 always represents itself no matter what the encoding is. */
869 for (c = scm_get_byte_or_eof (port);
7337d56d 870 (c != EOF) && (c != '\n');
8a8da78d 871 c = scm_get_byte_or_eof (port));
7337d56d
LC
872
873 return SCM_UNSPECIFIED;
874}
875
876\f
877/* Sharp readers, i.e. readers called after a `#' sign has been read. */
878
879static SCM
880scm_read_boolean (int chr, SCM port)
881{
882 switch (chr)
0f2d19dd 883 {
7337d56d
LC
884 case 't':
885 case 'T':
886 return SCM_BOOL_T;
887
888 case 'f':
889 case 'F':
890 return SCM_BOOL_F;
0f2d19dd 891 }
7337d56d
LC
892
893 return SCM_UNSPECIFIED;
894}
895
896static SCM
889975e5 897scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
898#define FUNC_NAME "scm_lreadr"
899{
69f90b0b
MG
900 char buffer[READER_CHAR_NAME_MAX_SIZE];
901 SCM charname;
902 size_t charname_len, bytes_read;
889975e5
MG
903 scm_t_wchar cp;
904 int overflow;
69f90b0b 905 scm_t_port *pt;
889975e5 906
69f90b0b 907 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 908 if (overflow)
7337d56d
LC
909 goto char_error;
910
69f90b0b 911 if (bytes_read == 0)
0f2d19dd 912 {
7337d56d
LC
913 chr = scm_getc (port);
914 if (chr == EOF)
915 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
916 "while reading character", SCM_EOL);
917
918 /* CHR must be a token delimiter, like a whitespace. */
919 return (SCM_MAKE_CHAR (chr));
0f2d19dd 920 }
7337d56d 921
69f90b0b 922 pt = SCM_PTAB_ENTRY (port);
7337d56d 923
69f90b0b
MG
924 /* Simple ASCII characters can be processed immediately. Also, simple
925 ISO-8859-1 characters can be processed immediately if the encoding for this
926 port is ISO-8859-1. */
927 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
928 {
929 SCM_COL (port) += 1;
930 return SCM_MAKE_CHAR (buffer[0]);
931 }
932
933 /* Otherwise, convert the buffer into a proper scheme string for
934 processing. */
935 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
936 pt->ilseq_handler);
937 charname_len = scm_i_string_length (charname);
938 SCM_COL (port) += charname_len;
889975e5 939 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
940 if (charname_len == 1)
941 return SCM_MAKE_CHAR (cp);
942
943 /* Ignore dotted circles, which may be used to keep combining characters from
944 combining with the backslash in #\charname. */
0dcd7e61
MG
945 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
946 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
947
889975e5 948 if (cp >= '0' && cp < '8')
7337d56d
LC
949 {
950 /* Dirk:FIXME:: This type of character syntax is not R5RS
951 * compliant. Further, it should be verified that the constant
0ffc78e3 952 * does only consist of octal digits. */
889975e5 953 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 954 if (SCM_I_INUMP (p))
0ffc78e3
MG
955 {
956 scm_t_wchar c = SCM_I_INUM (p);
957 if (SCM_IS_UNICODE_CHAR (c))
958 return SCM_MAKE_CHAR (c);
959 else
0f3a70cf 960 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
961 "out-of-range octal character escape: ~a",
962 scm_list_1 (charname));
963 }
7337d56d
LC
964 }
965
0f3a70cf 966 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
967 {
968 SCM p;
0f3a70cf 969
dea901d6
MG
970 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
971 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
972 scm_from_uint (16));
973 if (SCM_I_INUMP (p))
974 {
975 scm_t_wchar c = SCM_I_INUM (p);
976 if (SCM_IS_UNICODE_CHAR (c))
977 return SCM_MAKE_CHAR (c);
978 else
979 scm_i_input_error (FUNC_NAME, port,
980 "out-of-range hex character escape: ~a",
981 scm_list_1 (charname));
982 }
983 }
984
889975e5
MG
985 /* The names of characters should never have non-Latin1
986 characters. */
987 if (scm_i_is_narrow_string (charname)
988 || scm_i_try_narrow_string (charname))
4769c9db
AW
989 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
990 charname_len);
991 if (scm_is_true (ch))
992 return ch;
993 }
7337d56d
LC
994
995 char_error:
996 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 997 scm_list_1 (charname));
7337d56d
LC
998
999 return SCM_UNSPECIFIED;
0f2d19dd 1000}
db4b4ca6 1001#undef FUNC_NAME
0f2d19dd 1002
7337d56d
LC
1003static inline SCM
1004scm_read_keyword (int chr, SCM port)
1005{
1006 SCM symbol;
1007
1008 /* Read the symbol that comprises the keyword. Doing this instead of
1009 invoking a specific symbol reader function allows `scm_read_keyword ()'
1010 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1011
7337d56d
LC
1012 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1013 symbol = scm_read_expression (port);
1014 if (!scm_is_symbol (symbol))
7f74cf9a 1015 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1016 "keyword prefix `~a' not followed by a symbol: ~s",
1017 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1018
1019 return (scm_symbol_to_keyword (symbol));
1020}
1021
1022static inline SCM
1023scm_read_vector (int chr, SCM port)
09a4f039 1024{
7337d56d
LC
1025 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1026 guarantee that it's going to do what we want. After all, this is an
1027 implementation detail of `scm_read_vector ()', not a desirable
1028 property. */
1029 return (scm_vector (scm_read_sexp (chr, port)));
1030}
09a4f039 1031
7337d56d
LC
1032static inline SCM
1033scm_read_srfi4_vector (int chr, SCM port)
1034{
1035 return scm_i_read_array (port, chr);
1036}
1037
0ba0b384 1038static SCM
889975e5 1039scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
1040{
1041 chr = scm_getc (port);
1042 if (chr != 'u')
1043 goto syntax;
1044
1045 chr = scm_getc (port);
1046 if (chr != '8')
1047 goto syntax;
1048
1049 chr = scm_getc (port);
1050 if (chr != '(')
1051 goto syntax;
1052
1053 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1054
1055 syntax:
1056 scm_i_input_error ("read_bytevector", port,
1057 "invalid bytevector prefix",
1058 SCM_MAKE_CHAR (chr));
1059 return SCM_UNSPECIFIED;
1060}
1061
7337d56d 1062static SCM
889975e5 1063scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
1064{
1065 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1066 terribly inefficient but who cares? */
1067 SCM s_bits = SCM_EOL;
1068
1069 for (chr = scm_getc (port);
1070 (chr != EOF) && ((chr == '0') || (chr == '1'));
1071 chr = scm_getc (port))
09a4f039 1072 {
7337d56d 1073 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1074 }
7337d56d
LC
1075
1076 if (chr != EOF)
1077 scm_ungetc (chr, port);
1078
1079 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1080}
1081
1082static inline SCM
889975e5 1083scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1084{
1085 int bang_seen = 0;
1086
889975e5
MG
1087 /* We can use the get_byte here because there is no need to get the
1088 locale correct when reading comments. This presumes that
1089 hash and exclamation points always represent themselves no
1090 matter what the source encoding is.*/
7337d56d 1091 for (;;)
09a4f039 1092 {
889975e5 1093 int c = scm_get_byte_or_eof (port);
62850ef3 1094
7337d56d
LC
1095 if (c == EOF)
1096 scm_i_input_error ("skip_block_comment", port,
1097 "unterminated `#! ... !#' comment", SCM_EOL);
1098
1099 if (c == '!')
1100 bang_seen = 1;
1101 else if (c == '#' && bang_seen)
1102 break;
1103 else
1104 bang_seen = 0;
1105 }
1106
1107 return SCM_UNSPECIFIED;
1108}
1109
911b03b2
JG
1110static inline SCM
1111scm_read_shebang (scm_t_wchar chr, SCM port)
1112{
1113 int c = 0;
1114 if ((c = scm_get_byte_or_eof (port)) != 'r')
1115 {
1116 scm_ungetc (c, port);
1117 return scm_read_scsh_block_comment (chr, port);
1118 }
1119 if ((c = scm_get_byte_or_eof (port)) != '6')
1120 {
1121 scm_ungetc (c, port);
1122 scm_ungetc ('r', port);
1123 return scm_read_scsh_block_comment (chr, port);
1124 }
1125 if ((c = scm_get_byte_or_eof (port)) != 'r')
1126 {
1127 scm_ungetc (c, port);
1128 scm_ungetc ('6', port);
1129 scm_ungetc ('r', port);
1130 return scm_read_scsh_block_comment (chr, port);
1131 }
1132 if ((c = scm_get_byte_or_eof (port)) != 's')
1133 {
1134 scm_ungetc (c, port);
1135 scm_ungetc ('r', port);
1136 scm_ungetc ('6', port);
1137 scm_ungetc ('r', port);
1138 return scm_read_scsh_block_comment (chr, port);
1139 }
1140
1141 return SCM_UNSPECIFIED;
1142}
1143
620c8965
LC
1144static SCM
1145scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1146{
1147 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1148 nested. So care must be taken. */
1149 int nesting_level = 1;
1150 int opening_seen = 0, closing_seen = 0;
1151
1152 while (nesting_level > 0)
1153 {
1154 int c = scm_getc (port);
1155
1156 if (c == EOF)
cd169c5a 1157 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1158 "unterminated `#| ... |#' comment", SCM_EOL);
1159
1160 if (opening_seen)
1161 {
1162 if (c == '|')
1163 nesting_level++;
1164 opening_seen = 0;
1165 }
1166 else if (closing_seen)
1167 {
1168 if (c == '#')
1169 nesting_level--;
1170 closing_seen = 0;
1171 }
1172 else if (c == '|')
1173 closing_seen = 1;
1174 else if (c == '#')
1175 opening_seen = 1;
1176 else
1177 opening_seen = closing_seen = 0;
1178 }
1179
1180 return SCM_UNSPECIFIED;
1181}
1182
34f3d47d 1183static SCM
889975e5 1184scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1185{
889975e5 1186 scm_t_wchar c;
34f3d47d
AW
1187
1188 c = flush_ws (port, (char *) NULL);
1189 if (EOF == c)
1190 scm_i_input_error ("read_commented_expression", port,
1191 "no expression after #; comment", SCM_EOL);
1192 scm_ungetc (c, port);
1193 scm_read_expression (port);
1194 return SCM_UNSPECIFIED;
1195}
1196
7337d56d 1197static SCM
889975e5 1198scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1199{
1200 /* Guile's extended symbol read syntax looks like this:
1201
1202 #{This is all a symbol name}#
1203
1204 So here, CHR is expected to be `{'. */
7337d56d
LC
1205 int saw_brace = 0, finished = 0;
1206 size_t len = 0;
889975e5 1207 SCM buf = scm_i_make_string (1024, NULL);
7337d56d 1208
889975e5 1209 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1210
1211 while ((chr = scm_getc (port)) != EOF)
1212 {
1213 if (saw_brace)
09a4f039 1214 {
7337d56d
LC
1215 if (chr == '#')
1216 {
1217 finished = 1;
1218 break;
1219 }
1220 else
1221 {
1222 saw_brace = 0;
889975e5
MG
1223 scm_i_string_set_x (buf, len++, '}');
1224 scm_i_string_set_x (buf, len++, chr);
7337d56d 1225 }
09a4f039 1226 }
7337d56d
LC
1227 else if (chr == '}')
1228 saw_brace = 1;
1229 else
889975e5 1230 scm_i_string_set_x (buf, len++, chr);
62850ef3 1231
889975e5 1232 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1233 {
7f991c7d
LC
1234 SCM addy;
1235
889975e5 1236 scm_i_string_stop_writing ();
7f991c7d 1237 addy = scm_i_make_string (1024, NULL);
889975e5 1238 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1239 len = 0;
889975e5 1240 buf = scm_i_string_start_writing (buf);
7337d56d 1241 }
62850ef3 1242
7337d56d
LC
1243 if (finished)
1244 break;
1245 }
889975e5 1246 scm_i_string_stop_writing ();
7337d56d 1247
889975e5 1248 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1249}
1250
1251
1252\f
1253/* Top-level token readers, i.e., dispatchers. */
1254
1255static SCM
1256scm_read_sharp_extension (int chr, SCM port)
1257{
1258 SCM proc;
1259
1260 proc = scm_get_hash_procedure (chr);
1261 if (scm_is_true (scm_procedure_p (proc)))
1262 {
1263 long line = SCM_LINUM (port);
1264 int column = SCM_COL (port) - 2;
1265 SCM got;
1266
1267 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1268 if (!scm_is_eq (got, SCM_UNSPECIFIED))
62850ef3 1269 {
7337d56d
LC
1270 if (SCM_RECORD_POSITIONS_P)
1271 return (recsexpr (got, line, column,
1272 SCM_FILENAME (port)));
1273 else
1274 return got;
62850ef3 1275 }
09a4f039 1276 }
7337d56d
LC
1277
1278 return SCM_UNSPECIFIED;
1279}
1280
1281/* The reader for the sharp `#' character. It basically dispatches reads
1282 among the above token readers. */
1283static SCM
889975e5 1284scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1285#define FUNC_NAME "scm_lreadr"
1286{
1287 SCM result;
1288
1289 chr = scm_getc (port);
1290
1291 result = scm_read_sharp_extension (chr, port);
1292 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1293 return result;
1294
1295 switch (chr)
1296 {
1297 case '\\':
1298 return (scm_read_character (chr, port));
1299 case '(':
1300 return (scm_read_vector (chr, port));
1301 case 's':
1302 case 'u':
1303 case 'f':
1304 /* This one may return either a boolean or an SRFI-4 vector. */
1305 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1306 case 'v':
1307 return (scm_read_bytevector (chr, port));
7337d56d
LC
1308 case '*':
1309 return (scm_read_guile_bit_vector (chr, port));
1310 case 't':
1311 case 'T':
1312 case 'F':
1313 /* This one may return either a boolean or an SRFI-4 vector. */
1314 return (scm_read_boolean (chr, port));
1315 case ':':
1316 return (scm_read_keyword (chr, port));
1317 case '0': case '1': case '2': case '3': case '4':
1318 case '5': case '6': case '7': case '8': case '9':
1319 case '@':
1320#if SCM_ENABLE_DEPRECATED
1321 /* See below for 'i' and 'e'. */
1322 case 'a':
1323 case 'c':
1324 case 'y':
1325 case 'h':
1326 case 'l':
1327#endif
1328 return (scm_i_read_array (port, chr));
1329
1330 case 'i':
1331 case 'e':
1332#if SCM_ENABLE_DEPRECATED
1333 {
1334 /* When next char is '(', it really is an old-style
1335 uniform array. */
889975e5 1336 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1337 if (next_c != EOF)
1338 scm_ungetc (next_c, port);
1339 if (next_c == '(')
1340 return scm_i_read_array (port, chr);
1341 /* Fall through. */
1342 }
1343#endif
1344 case 'b':
1345 case 'B':
1346 case 'o':
1347 case 'O':
1348 case 'd':
1349 case 'D':
1350 case 'x':
1351 case 'X':
1352 case 'I':
1353 case 'E':
1354 return (scm_read_number_and_radix (chr, port));
1355 case '{':
1356 return (scm_read_extended_symbol (chr, port));
1357 case '!':
911b03b2 1358 return (scm_read_shebang (chr, port));
34f3d47d
AW
1359 case ';':
1360 return (scm_read_commented_expression (chr, port));
1361 case '`':
1362 case '\'':
1363 case ',':
1364 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1365 case 'n':
1366 return (scm_read_nil (chr, port));
7337d56d
LC
1367 default:
1368 result = scm_read_sharp_extension (chr, port);
1369 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1370 {
1371 /* To remain compatible with 1.8 and earlier, the following
1372 characters have lower precedence than `read-hash-extend'
1373 characters. */
1374 switch (chr)
1375 {
1376 case '|':
1377 return scm_read_r6rs_block_comment (chr, port);
1378 default:
1379 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1380 scm_list_1 (SCM_MAKE_CHAR (chr)));
1381 }
1382 }
7337d56d
LC
1383 else
1384 return result;
1385 }
1386
1387 return SCM_UNSPECIFIED;
1388}
1389#undef FUNC_NAME
1390
1391static SCM
1392scm_read_expression (SCM port)
1393#define FUNC_NAME "scm_read_expression"
1394{
1395 while (1)
1396 {
889975e5 1397 register scm_t_wchar chr;
7337d56d
LC
1398
1399 chr = scm_getc (port);
1400
1401 switch (chr)
1402 {
1403 case SCM_WHITE_SPACES:
1404 case SCM_LINE_INCREMENTORS:
1405 break;
1406 case ';':
1407 (void) scm_read_semicolon_comment (chr, port);
1408 break;
5afa815c
AW
1409 case '[':
1410 if (!SCM_SQUARE_BRACKETS_P)
1411 return (scm_read_mixed_case_symbol (chr, port));
1412 /* otherwise fall through */
7337d56d
LC
1413 case '(':
1414 return (scm_read_sexp (chr, port));
1415 case '"':
1416 return (scm_read_string (chr, port));
1417 case '\'':
1418 case '`':
1419 case ',':
1420 return (scm_read_quote (chr, port));
1421 case '#':
1422 {
1423 SCM result;
1424 result = scm_read_sharp (chr, port);
1425 if (scm_is_eq (result, SCM_UNSPECIFIED))
1426 /* We read a comment or some such. */
1427 break;
1428 else
1429 return result;
1430 }
1431 case ')':
1432 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1433 break;
1434 case EOF:
1435 return SCM_EOF_VAL;
1436 case ':':
1437 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1438 return scm_symbol_to_keyword (scm_read_expression (port));
1439 /* Fall through. */
1440
1441 default:
1442 {
1443 if (((chr >= '0') && (chr <= '9'))
1444 || (strchr ("+-.", chr)))
1445 return (scm_read_number (chr, port));
1446 else
1447 return (scm_read_mixed_case_symbol (chr, port));
1448 }
1449 }
1450 }
1451}
1452#undef FUNC_NAME
1453
1454\f
1455/* Actual reader. */
1456
1457SCM_DEFINE (scm_read, "read", 0, 1, 0,
1458 (SCM port),
1459 "Read an s-expression from the input port @var{port}, or from\n"
1460 "the current input port if @var{port} is not specified.\n"
1461 "Any whitespace before the next token is discarded.")
1462#define FUNC_NAME s_scm_read
1463{
1464 int c;
1465
1466 if (SCM_UNBNDP (port))
1467 port = scm_current_input_port ();
1468 SCM_VALIDATE_OPINPORT (1, port);
1469
1470 c = flush_ws (port, (char *) NULL);
1471 if (EOF == c)
1472 return SCM_EOF_VAL;
1473 scm_ungetc (c, port);
1474
1475 return (scm_read_expression (port));
09a4f039 1476}
db4b4ca6 1477#undef FUNC_NAME
09a4f039 1478
0f2d19dd
JB
1479
1480\f
1481
7337d56d
LC
1482/* Used when recording expressions constructed by `scm_read_sharp ()'. */
1483static SCM
1484recsexpr (SCM obj, long line, int column, SCM filename)
1485{
1486 if (!scm_is_pair(obj)) {
1487 return obj;
1488 } else {
1489 SCM tmp = obj, copy;
1490 /* If this sexpr is visible in the read:sharp source, we want to
1491 keep that information, so only record non-constant cons cells
1492 which haven't previously been read by the reader. */
1493 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1494 {
1495 if (SCM_COPY_SOURCE_P)
1496 {
1497 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1498 SCM_UNDEFINED);
1499 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1500 {
1501 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1502 line,
1503 column,
1504 filename),
1505 SCM_UNDEFINED));
1506 copy = SCM_CDR (copy);
1507 }
1508 SCM_SETCDR (copy, tmp);
1509 }
1510 else
1511 {
1512 recsexpr (SCM_CAR (obj), line, column, filename);
1513 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1514 recsexpr (SCM_CAR (tmp), line, column, filename);
1515 copy = SCM_UNDEFINED;
1516 }
1517 scm_whash_insert (scm_source_whash,
1518 obj,
1519 scm_make_srcprops (line,
1520 column,
1521 filename,
1522 copy,
1523 SCM_EOL));
1524 }
1525 return obj;
1526 }
1527}
1528
14de3b42
GH
1529/* Manipulate the read-hash-procedures alist. This could be written in
1530 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1531SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1532 (SCM chr, SCM proc),
dc7fa443
MG
1533 "Install the procedure @var{proc} for reading expressions\n"
1534 "starting with the character sequence @code{#} and @var{chr}.\n"
1535 "@var{proc} will be called with two arguments: the character\n"
1536 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1537 "returned will be the return value of @code{read}. \n"
1538 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1539 )
1bbd0b84 1540#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1541{
fed9c9a2
GH
1542 SCM this;
1543 SCM prev;
1544
36284627 1545 SCM_VALIDATE_CHAR (1, chr);
7888309b 1546 SCM_ASSERT (scm_is_false (proc)
bc36d050 1547 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1548 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1549
14de3b42
GH
1550 /* Check if chr is already in the alist. */
1551 this = *scm_read_hash_procedures;
1552 prev = SCM_BOOL_F;
fed9c9a2
GH
1553 while (1)
1554 {
d2e53ed6 1555 if (scm_is_null (this))
fed9c9a2
GH
1556 {
1557 /* not found, so add it to the beginning. */
7888309b 1558 if (scm_is_true (proc))
fed9c9a2 1559 {
14de3b42
GH
1560 *scm_read_hash_procedures =
1561 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
fed9c9a2
GH
1562 }
1563 break;
1564 }
bc36d050 1565 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1566 {
1567 /* already in the alist. */
7888309b 1568 if (scm_is_false (proc))
14de3b42
GH
1569 {
1570 /* remove it. */
7888309b 1571 if (scm_is_false (prev))
14de3b42
GH
1572 {
1573 *scm_read_hash_procedures =
1574 SCM_CDR (*scm_read_hash_procedures);
1575 }
1576 else
1577 scm_set_cdr_x (prev, SCM_CDR (this));
1578 }
fed9c9a2 1579 else
14de3b42
GH
1580 {
1581 /* replace it. */
1582 scm_set_cdr_x (SCM_CAR (this), proc);
1583 }
fed9c9a2
GH
1584 break;
1585 }
1586 prev = this;
1587 this = SCM_CDR (this);
1588 }
deca31e1 1589
deca31e1
GH
1590 return SCM_UNSPECIFIED;
1591}
1bbd0b84 1592#undef FUNC_NAME
0f2d19dd 1593
deca31e1
GH
1594/* Recover the read-hash procedure corresponding to char c. */
1595static SCM
6e8d25a6 1596scm_get_hash_procedure (int c)
deca31e1 1597{
14de3b42 1598 SCM rest = *scm_read_hash_procedures;
fed9c9a2 1599
deca31e1
GH
1600 while (1)
1601 {
d2e53ed6 1602 if (scm_is_null (rest))
deca31e1
GH
1603 return SCM_BOOL_F;
1604
7866a09b 1605 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1606 return SCM_CDAR (rest);
1607
1608 rest = SCM_CDR (rest);
1609 }
1610}
1cc91f1b 1611
889975e5
MG
1612#define SCM_ENCODING_SEARCH_SIZE (500)
1613
f8a1c9a8
LC
1614/* Search the first few hundred characters of a file for an Emacs-like coding
1615 declaration. Returns either NULL or a string whose storage has been
1616 allocated with `scm_gc_malloc ()'. */
889975e5 1617char *
f8a1c9a8 1618scm_i_scan_for_encoding (SCM port)
889975e5
MG
1619{
1620 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1621 size_t bytes_read, encoding_length, i;
889975e5
MG
1622 char *encoding = NULL;
1623 int utf8_bom = 0;
daedbca7 1624 char *pos, *encoding_start;
889975e5
MG
1625 int in_comment;
1626
49bb5bd3
LC
1627 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1628 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1629 "guile <(echo '(display "hello")')") so bail out. */
1630 return NULL;
1631
1632 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1633
889975e5
MG
1634 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1635
1636 if (bytes_read > 3
1637 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1638 utf8_bom = 1;
1639
1640 /* search past "coding[:=]" */
1641 pos = header;
1642 while (1)
1643 {
1644 if ((pos = strstr(pos, "coding")) == NULL)
1645 return NULL;
1646
1647 pos += strlen("coding");
1648 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1649 (*pos == ':' || *pos == '='))
1650 {
1651 pos ++;
1652 break;
1653 }
1654 }
1655
1656 /* skip spaces */
1657 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1658 (*pos == ' ' || *pos == '\t'))
1659 pos ++;
1660
1661 /* grab the next token */
daedbca7 1662 encoding_start = pos;
889975e5 1663 i = 0;
daedbca7
MG
1664 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1665 && encoding_start + i - header < bytes_read
1666 && (isalnum ((int) encoding_start[i])
1667 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1668 i++;
1669
daedbca7
MG
1670 encoding_length = i;
1671 if (encoding_length == 0)
889975e5
MG
1672 return NULL;
1673
daedbca7
MG
1674 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1675 for (i = 0; i < encoding_length; i++)
026ed239 1676 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1677
1678 /* push backwards to make sure we were in a comment */
1679 in_comment = 0;
daedbca7
MG
1680 pos = encoding_start;
1681 while (pos >= header)
889975e5 1682 {
daedbca7 1683 if (*pos == '\n')
889975e5
MG
1684 {
1685 /* This wasn't in a semicolon comment. Check for a
1686 hash-bang comment. */
1687 char *beg = strstr (header, "#!");
1688 char *end = strstr (header, "!#");
daedbca7 1689 if (beg < encoding_start && encoding_start + encoding_length < end)
889975e5
MG
1690 in_comment = 1;
1691 break;
1692 }
daedbca7 1693 if (*pos == ';')
889975e5
MG
1694 {
1695 in_comment = 1;
1696 break;
1697 }
daedbca7 1698 pos --;
889975e5
MG
1699 }
1700 if (!in_comment)
f8a1c9a8
LC
1701 /* This wasn't in a comment */
1702 return NULL;
1703
889975e5 1704 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1705 scm_misc_error (NULL,
889975e5
MG
1706 "the port input declares the encoding ~s but is encoded as UTF-8",
1707 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1708
889975e5
MG
1709 return encoding;
1710}
1711
1712SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1713 (SCM port),
a270e133 1714 "Scans the port for an Emacs-like character coding declaration\n"
889975e5
MG
1715 "near the top of the contents of a port with random-acessible contents.\n"
1716 "The coding declaration is of the form\n"
1717 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1718 "\n"
1719 "Returns a string containing the character encoding of the file\n"
1720 "if a declaration was found, or @code{#f} otherwise.\n")
1721#define FUNC_NAME s_scm_file_encoding
1722{
1723 char *enc;
1724 SCM s_enc;
f8a1c9a8
LC
1725
1726 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1727 if (enc == NULL)
1728 return SCM_BOOL_F;
1729 else
1730 {
1731 s_enc = scm_from_locale_string (enc);
889975e5
MG
1732 return s_enc;
1733 }
f8a1c9a8 1734
889975e5
MG
1735 return SCM_BOOL_F;
1736}
1737#undef FUNC_NAME
1738
0f2d19dd
JB
1739void
1740scm_init_read ()
0f2d19dd 1741{
14de3b42 1742 scm_read_hash_procedures =
86d31dfe 1743 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
fed9c9a2 1744
62560650 1745 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1746#include "libguile/read.x"
0f2d19dd 1747}
89e00824
ML
1748
1749/*
1750 Local Variables:
1751 c-file-style: "gnu"
1752 End:
1753*/