locking on unget_byte, ungetc, ungets
[bpt/guile.git] / libguile / read.c
CommitLineData
4a655e50 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
49bb5bd3 46#include "libguile/fports.h"
a0599745
MD
47#include "libguile/root.h"
48#include "libguile/strings.h"
ba1b2226 49#include "libguile/strports.h"
a0599745 50#include "libguile/vectors.h"
a0599745 51#include "libguile/validate.h"
a4022e69 52#include "libguile/srfi-4.h"
7337d56d 53#include "libguile/srfi-13.h"
ba1b2226 54
a0599745 55#include "libguile/read.h"
22fc179a
HWN
56#include "libguile/private-options.h"
57
0f2d19dd
JB
58
59\f
60
5bf6a6f0 61SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 62SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 63SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 64SCM_SYMBOL (sym_nil, "nil");
c7733771 65
92c2555f 66scm_t_option scm_read_opts[] = {
b7ff98dd
MD
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
6c51a40a 69 { SCM_OPTION_BOOLEAN, "positions", 1,
deca31e1
GH
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 72 "Convert symbols to lower case."},
210c0325 73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
904fabb6 74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
684d664e
AW
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
62560650 81 { 0, },
a16f6fe7
MD
82};
83
39e8f371
HWN
84/*
85 Give meaningful error messages for errors
86
87 We use the format
88
ba1b2226 89 FILE:LINE:COL: MESSAGE
39e8f371
HWN
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
39e8f371
HWN
95 */
96
97
a4022e69
MV
98void
99scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
ba1b2226 101{
29a837fd
MV
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 105
29a837fd 106 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
272632a6 109 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 110 scm_list_4 (fn,
b3aa4626 111 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 112 scm_from_int (SCM_COL (port) + 1),
272632a6 113 scm_from_locale_string (message)));
ba1b2226
HWN
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
4a655e50 117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 118 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 119 string,
dd72382c 120 arg,
ba1b2226
HWN
121 SCM_BOOL_F);
122}
39e8f371
HWN
123
124
a1ec6916 125SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 126 (SCM setting),
dc7fa443
MG
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 130#define FUNC_NAME s_scm_read_options
a16f6fe7 131{
b7ff98dd
MD
132 SCM ans = scm_options (setting,
133 scm_read_opts,
1bbd0b84 134 FUNC_NAME);
b7ff98dd
MD
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
137 return ans;
138}
1bbd0b84 139#undef FUNC_NAME
a16f6fe7 140
d458073b
AR
141/* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143static SCM *scm_i_read_hash_procedures;
deca31e1 144
d458073b
AR
145static inline SCM
146scm_i_read_hash_procedures_ref (void)
147{
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149}
150
151static inline void
152scm_i_read_hash_procedures_set_x (SCM value)
153{
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155}
0f2d19dd 156
7337d56d
LC
157\f
158/* Token readers. */
0f2d19dd 159
0f2d19dd 160
7337d56d
LC
161/* Size of the C buffer used to read symbols and numbers. */
162#define READER_BUFFER_SIZE 128
0f2d19dd 163
7337d56d
LC
164/* Size of the C buffer used to read strings. */
165#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 166
7337d56d
LC
167/* The maximum size of Scheme character names. */
168#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 169
94115ae3 170
7337d56d
LC
171/* `isblank' is only in C99. */
172#define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 174 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
175
176#ifdef MSDOS
177# define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179#else
180# define CHAR_IS_BLANK CHAR_IS_BLANK_
181#endif
182
183
184/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186#define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
5afa815c
AW
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
190
191#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195#define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
454866e0 199/* Read an SCSH block comment. */
620c8965
LC
200static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 203static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 204static SCM scm_get_hash_procedure (int);
454866e0 205
69f90b0b
MG
206/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
7337d56d 210static inline int
69f90b0b
MG
211read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
0520c320 214
69f90b0b
MG
215 while (*read < buf_size)
216 {
217 int chr;
7337d56d 218
0d959103 219 chr = scm_get_byte_or_eof_unlocked (port);
889975e5 220
69f90b0b
MG
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
c932ce0b 225 scm_unget_byte_unlocked (chr, port);
69f90b0b
MG
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
889975e5 234
69f90b0b
MG
235 return 1;
236 }
889975e5 237
69f90b0b
MG
238/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243static int
244read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246{
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
7337d56d 249
69f90b0b
MG
250 *overflow_buffer = NULL;
251 overflow_size = 0;
1cc91f1b 252
69f90b0b
MG
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
889975e5 273 }
69f90b0b 274 while (overflow);
889975e5 275
69f90b0b
MG
276 if (overflow_size)
277 *read = overflow_size;
b521d265 278 else
69f90b0b
MG
279 *read = bytes_read;
280
281 return (overflow_size != 0);
889975e5 282}
7337d56d
LC
283
284/* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286static int
287flush_ws (SCM port, const char *eoferr)
0f2d19dd 288{
889975e5 289 register scm_t_wchar c;
0f2d19dd 290 while (1)
be632904 291 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
d156d3b7 296 {
a4022e69
MV
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
d156d3b7 301 }
0f2d19dd 302 return c;
7337d56d 303
0f2d19dd
JB
304 case ';':
305 lp:
be632904 306 switch (c = scm_getc_unlocked (port))
0f2d19dd
JB
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
7337d56d 316
454866e0 317 case '#':
be632904 318 switch (c = scm_getc_unlocked (port))
454866e0
LC
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
911b03b2 324 scm_read_shebang (c, port);
454866e0 325 break;
34f3d47d
AW
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
620c8965
LC
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
454866e0 336 default:
c932ce0b 337 scm_ungetc_unlocked (c, port);
454866e0
LC
338 return '#';
339 }
340 break;
341
0f2d19dd 342 case SCM_LINE_INCREMENTORS:
0f2d19dd 343 case SCM_SINGLE_SPACES:
0f2d19dd 344 case '\t':
0f2d19dd 345 break;
7337d56d 346
0f2d19dd
JB
347 default:
348 return c;
349 }
7337d56d
LC
350
351 return 0;
0f2d19dd
JB
352}
353
354
7337d56d
LC
355\f
356/* Token readers. */
1cc91f1b 357
7337d56d
LC
358static SCM scm_read_expression (SCM port);
359static SCM scm_read_sharp (int chr, SCM port);
0f2d19dd
JB
360
361
09a4f039 362static SCM
889975e5 363scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 364#define FUNC_NAME "scm_i_lreadparen"
09a4f039 365{
26c8cc14
AW
366 int c;
367 SCM tmp, tl, ans = SCM_EOL;
5afa815c 368 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
369
370 /* Need to capture line and column numbers here. */
371 long line = SCM_LINUM (port);
372 int column = SCM_COL (port) - 1;
f9c68a47 373
7337d56d
LC
374 c = flush_ws (port, FUNC_NAME);
375 if (terminating_char == c)
376 return SCM_EOL;
f9c68a47 377
c932ce0b 378 scm_ungetc_unlocked (c, port);
1f7945a7
AW
379 tmp = scm_read_expression (port);
380
381 /* Note that it is possible for scm_read_expression to return
382 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
383 check that it's a real dot by checking `c'. */
384 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d
LC
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
1cc91f1b 392
7337d56d
LC
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
7337d56d 396 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 397 {
7337d56d 398 SCM new_tail;
0f2d19dd 399
5b69315e
AW
400 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
401 scm_i_input_error (FUNC_NAME, port,
402 "in pair: mismatched close paren: ~A",
403 scm_list_1 (SCM_MAKE_CHAR (c)));
404
c932ce0b 405 scm_ungetc_unlocked (c, port);
5b69315e
AW
406 tmp = scm_read_expression (port);
407
1f7945a7
AW
408 /* See above note about scm_sym_dot. */
409 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 410 {
89f88612 411 SCM_SETCDR (tl, scm_read_expression (port));
7337d56d 412
7337d56d
LC
413 c = flush_ws (port, FUNC_NAME);
414 if (terminating_char != c)
415 scm_i_input_error (FUNC_NAME, port,
416 "in pair: missing close paren", SCM_EOL);
417 goto exit;
0f2d19dd 418 }
b858464a 419
7337d56d
LC
420 new_tail = scm_cons (tmp, SCM_EOL);
421 SCM_SETCDR (tl, new_tail);
422 tl = new_tail;
7337d56d 423 }
0f2d19dd 424
7337d56d
LC
425 exit:
426 if (SCM_RECORD_POSITIONS_P)
26c8cc14
AW
427 scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
428
7337d56d
LC
429 return ans;
430}
431#undef FUNC_NAME
a4022e69 432
c5661d28
MG
433
434/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
435 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
436 found. */
437#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
438 do \
439 { \
440 scm_t_wchar a; \
441 size_t i = 0; \
442 c = 0; \
443 while (i < ndigits) \
444 { \
be632904 445 a = scm_getc_unlocked (port); \
dea901d6
MG
446 if (a == EOF) \
447 goto str_eof; \
448 if (terminator \
449 && (a == (scm_t_wchar) terminator) \
450 && (i > 0)) \
451 break; \
452 if ('0' <= a && a <= '9') \
453 a -= '0'; \
454 else if ('A' <= a && a <= 'F') \
455 a = a - 'A' + 10; \
456 else if ('a' <= a && a <= 'f') \
457 a = a - 'a' + 10; \
458 else \
459 { \
460 c = a; \
461 goto bad_escaped; \
462 } \
463 c = c * 16 + a; \
464 i ++; \
465 } \
c5661d28
MG
466 } while (0)
467
684d664e
AW
468static void
469skip_intraline_whitespace (SCM port)
470{
471 scm_t_wchar c;
472
473 do
474 {
be632904 475 c = scm_getc_unlocked (port);
684d664e
AW
476 if (c == EOF)
477 return;
478 }
479 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
480
c932ce0b 481 scm_ungetc_unlocked (c, port);
684d664e
AW
482}
483
7337d56d
LC
484static SCM
485scm_read_string (int chr, SCM port)
486#define FUNC_NAME "scm_lreadr"
487{
488 /* For strings smaller than C_STR, this function creates only one Scheme
489 object (the string returned). */
0f2d19dd 490
7337d56d 491 SCM str = SCM_BOOL_F;
7337d56d 492 unsigned c_str_len = 0;
9c44cd45 493 scm_t_wchar c;
eb42ff25 494
190d4b0d 495 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
be632904 496 while ('"' != (c = scm_getc_unlocked (port)))
7337d56d
LC
497 {
498 if (c == EOF)
9c44cd45
MG
499 {
500 str_eof:
501 scm_i_input_error (FUNC_NAME, port,
502 "end of file in string constant", SCM_EOL);
503 }
0f2d19dd 504
9c44cd45
MG
505 if (c_str_len + 1 >= scm_i_string_length (str))
506 {
190d4b0d 507 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
0f2d19dd 508
9c44cd45
MG
509 str = scm_string_append (scm_list_2 (str, addy));
510 }
0f2d19dd 511
7337d56d 512 if (c == '\\')
9c44cd45 513 {
be632904 514 switch (c = scm_getc_unlocked (port))
9c44cd45
MG
515 {
516 case EOF:
517 goto str_eof;
518 case '"':
519 case '\\':
520 break;
9c44cd45 521 case '\n':
684d664e
AW
522 if (SCM_HUNGRY_EOL_ESCAPES_P)
523 skip_intraline_whitespace (port);
9c44cd45
MG
524 continue;
525 case '0':
526 c = '\0';
527 break;
528 case 'f':
529 c = '\f';
530 break;
531 case 'n':
532 c = '\n';
533 break;
534 case 'r':
535 c = '\r';
536 break;
537 case 't':
538 c = '\t';
539 break;
540 case 'a':
541 c = '\007';
542 break;
543 case 'v':
544 c = '\v';
545 break;
67a4a16d
MG
546 case 'b':
547 c = '\010';
548 break;
9c44cd45 549 case 'x':
dea901d6
MG
550 if (SCM_R6RS_ESCAPES_P)
551 SCM_READ_HEX_ESCAPE (10, ';');
552 else
553 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 554 break;
9c44cd45 555 case 'u':
898a0b5a
MG
556 if (!SCM_R6RS_ESCAPES_P)
557 {
558 SCM_READ_HEX_ESCAPE (4, '\0');
559 break;
560 }
9c44cd45 561 case 'U':
898a0b5a
MG
562 if (!SCM_R6RS_ESCAPES_P)
563 {
564 SCM_READ_HEX_ESCAPE (6, '\0');
565 break;
566 }
9c44cd45
MG
567 default:
568 bad_escaped:
569 scm_i_input_error (FUNC_NAME, port,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c)));
572 }
573 }
574 str = scm_i_string_start_writing (str);
575 scm_i_string_set_x (str, c_str_len++, c);
576 scm_i_string_stop_writing ();
7337d56d 577 }
f13b4400 578
7337d56d
LC
579 if (c_str_len > 0)
580 {
9c44cd45 581 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 582 }
69f90b0b 583
9c44cd45 584 return scm_nullstr;
0f2d19dd 585}
db4b4ca6
DH
586#undef FUNC_NAME
587
0f2d19dd 588
7337d56d 589static SCM
889975e5 590scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 591{
69f90b0b
MG
592 SCM result, str = SCM_EOL;
593 char buffer[READER_BUFFER_SIZE];
594 char *overflow_buffer = NULL;
595 size_t bytes_read;
596 int overflow;
597 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 598
c932ce0b 599 scm_ungetc_unlocked (chr, port);
69f90b0b
MG
600 overflow = read_complete_token (port, buffer, sizeof (buffer),
601 &overflow_buffer, &bytes_read);
602
603 if (!overflow)
604 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
605 else
606 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
607 pt->ilseq_handler);
608
609 result = scm_string_to_number (str, SCM_UNDEFINED);
889975e5 610 if (!scm_is_true (result))
69f90b0b
MG
611 {
612 /* Return a symbol instead of a number */
613 if (SCM_CASE_INSENSITIVE_P)
614 str = scm_string_downcase_x (str);
615 result = scm_string_to_symbol (str);
616 }
0f2d19dd 617
69f90b0b
MG
618 if (overflow)
619 free (overflow_buffer);
620 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
621 return result;
622}
0f2d19dd 623
7337d56d 624static SCM
889975e5 625scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 626{
889975e5
MG
627 SCM result;
628 int ends_with_colon = 0;
69f90b0b 629 size_t bytes_read;
ef4cbc08 630 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
69f90b0b
MG
631 int overflow;
632 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
633 scm_t_port *pt = SCM_PTAB_ENTRY (port);
634 SCM str;
7337d56d 635
c932ce0b 636 scm_ungetc_unlocked (chr, port);
69f90b0b
MG
637 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
638 &overflow_buffer, &bytes_read);
639 if (bytes_read > 0)
640 {
641 if (!overflow)
642 ends_with_colon = buffer[bytes_read - 1] == ':';
643 else
644 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
645 }
ef4cbc08 646
69f90b0b
MG
647 if (postfix && ends_with_colon && (bytes_read > 1))
648 {
649 if (!overflow)
650 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
651 else
652 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
653 pt->ilseq_handler);
654
655 if (SCM_CASE_INSENSITIVE_P)
656 str = scm_string_downcase_x (str);
657 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
658 }
7337d56d 659 else
69f90b0b
MG
660 {
661 if (!overflow)
662 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
663 else
664 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
665 pt->ilseq_handler);
666
667 if (SCM_CASE_INSENSITIVE_P)
668 str = scm_string_downcase_x (str);
669 result = scm_string_to_symbol (str);
670 }
7337d56d 671
69f90b0b
MG
672 if (overflow)
673 free (overflow_buffer);
674 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
675 return result;
676}
677
678static SCM
889975e5 679scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
680#define FUNC_NAME "scm_lreadr"
681{
889975e5 682 SCM result;
7337d56d 683 size_t read;
69f90b0b
MG
684 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
685 int overflow;
7337d56d 686 unsigned int radix;
69f90b0b
MG
687 SCM str;
688 scm_t_port *pt;
7337d56d
LC
689
690 switch (chr)
691 {
692 case 'B':
693 case 'b':
694 radix = 2;
695 break;
696
697 case 'o':
698 case 'O':
699 radix = 8;
700 break;
701
702 case 'd':
703 case 'D':
704 radix = 10;
705 break;
706
707 case 'x':
708 case 'X':
709 radix = 16;
710 break;
711
712 default:
c932ce0b
AW
713 scm_ungetc_unlocked (chr, port);
714 scm_ungetc_unlocked ('#', port);
7337d56d
LC
715 radix = 10;
716 }
717
69f90b0b
MG
718 overflow = read_complete_token (port, buffer, sizeof (buffer),
719 &overflow_buffer, &read);
720
721 pt = SCM_PTAB_ENTRY (port);
722 if (!overflow)
723 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
724 else
725 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
726 pt->ilseq_handler);
727
728 result = scm_string_to_number (str, scm_from_uint (radix));
729
730 if (overflow)
731 free (overflow_buffer);
732
733 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
734
735 if (scm_is_true (result))
736 return result;
737
738 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
739
740 return SCM_BOOL_F;
741}
742#undef FUNC_NAME
743
744static SCM
745scm_read_quote (int chr, SCM port)
746{
747 SCM p;
492faee1
LC
748 long line = SCM_LINUM (port);
749 int column = SCM_COL (port) - 1;
7337d56d
LC
750
751 switch (chr)
752 {
753 case '`':
754 p = scm_sym_quasiquote;
755 break;
756
757 case '\'':
758 p = scm_sym_quote;
759 break;
760
761 case ',':
762 {
889975e5 763 scm_t_wchar c;
7337d56d 764
be632904 765 c = scm_getc_unlocked (port);
7337d56d
LC
766 if ('@' == c)
767 p = scm_sym_uq_splicing;
768 else
0f2d19dd 769 {
c932ce0b 770 scm_ungetc_unlocked (c, port);
7337d56d 771 p = scm_sym_unquote;
0f2d19dd 772 }
7337d56d
LC
773 break;
774 }
0f2d19dd 775
7337d56d
LC
776 default:
777 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 778 "scm_read_quote", chr);
7337d56d 779 abort ();
0f2d19dd 780 }
1cc91f1b 781
7337d56d 782 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1 783 if (SCM_RECORD_POSITIONS_P)
26c8cc14 784 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
0f2d19dd 785
7337d56d
LC
786 return p;
787}
788
34f3d47d
AW
789SCM_SYMBOL (sym_syntax, "syntax");
790SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
791SCM_SYMBOL (sym_unsyntax, "unsyntax");
792SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
793
794static SCM
795scm_read_syntax (int chr, SCM port)
796{
797 SCM p;
798 long line = SCM_LINUM (port);
799 int column = SCM_COL (port) - 1;
800
801 switch (chr)
802 {
803 case '`':
804 p = sym_quasisyntax;
805 break;
806
807 case '\'':
808 p = sym_syntax;
809 break;
810
811 case ',':
812 {
813 int c;
814
be632904 815 c = scm_getc_unlocked (port);
34f3d47d
AW
816 if ('@' == c)
817 p = sym_unsyntax_splicing;
818 else
819 {
c932ce0b 820 scm_ungetc_unlocked (c, port);
34f3d47d
AW
821 p = sym_unsyntax;
822 }
823 break;
824 }
825
826 default:
827 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
828 "scm_read_syntax", chr);
829 abort ();
830 }
831
832 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
833 if (SCM_RECORD_POSITIONS_P)
26c8cc14 834 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
34f3d47d
AW
835
836 return p;
837}
838
7c4aad9c
AW
839static inline SCM
840scm_read_nil (int chr, SCM port)
841{
842 SCM id = scm_read_mixed_case_symbol (chr, port);
843
844 if (!scm_is_eq (id, sym_nil))
845 scm_i_input_error ("scm_read_nil", port,
846 "unexpected input while reading #nil: ~a",
847 scm_list_1 (id));
848
849 return SCM_ELISP_NIL;
850}
851
7337d56d
LC
852static inline SCM
853scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 854{
0f2d19dd
JB
855 int c;
856
889975e5
MG
857 /* We use the get_byte here because there is no need to get the
858 locale correct with comment input. This presumes that newline
859 always represents itself no matter what the encoding is. */
0d959103 860 for (c = scm_get_byte_or_eof_unlocked (port);
7337d56d 861 (c != EOF) && (c != '\n');
0d959103 862 c = scm_get_byte_or_eof_unlocked (port));
7337d56d
LC
863
864 return SCM_UNSPECIFIED;
865}
866
867\f
868/* Sharp readers, i.e. readers called after a `#' sign has been read. */
869
870static SCM
871scm_read_boolean (int chr, SCM port)
872{
873 switch (chr)
0f2d19dd 874 {
7337d56d
LC
875 case 't':
876 case 'T':
877 return SCM_BOOL_T;
878
879 case 'f':
880 case 'F':
881 return SCM_BOOL_F;
0f2d19dd 882 }
7337d56d
LC
883
884 return SCM_UNSPECIFIED;
885}
886
887static SCM
889975e5 888scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
889#define FUNC_NAME "scm_lreadr"
890{
69f90b0b
MG
891 char buffer[READER_CHAR_NAME_MAX_SIZE];
892 SCM charname;
893 size_t charname_len, bytes_read;
889975e5
MG
894 scm_t_wchar cp;
895 int overflow;
69f90b0b 896 scm_t_port *pt;
889975e5 897
69f90b0b 898 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 899 if (overflow)
c1a0ba1c 900 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 901
69f90b0b 902 if (bytes_read == 0)
0f2d19dd 903 {
be632904 904 chr = scm_getc_unlocked (port);
7337d56d
LC
905 if (chr == EOF)
906 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
907 "while reading character", SCM_EOL);
908
909 /* CHR must be a token delimiter, like a whitespace. */
910 return (SCM_MAKE_CHAR (chr));
0f2d19dd 911 }
7337d56d 912
69f90b0b 913 pt = SCM_PTAB_ENTRY (port);
7337d56d 914
69f90b0b
MG
915 /* Simple ASCII characters can be processed immediately. Also, simple
916 ISO-8859-1 characters can be processed immediately if the encoding for this
917 port is ISO-8859-1. */
918 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
919 {
920 SCM_COL (port) += 1;
921 return SCM_MAKE_CHAR (buffer[0]);
922 }
923
924 /* Otherwise, convert the buffer into a proper scheme string for
925 processing. */
926 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
927 pt->ilseq_handler);
928 charname_len = scm_i_string_length (charname);
929 SCM_COL (port) += charname_len;
889975e5 930 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
931 if (charname_len == 1)
932 return SCM_MAKE_CHAR (cp);
933
934 /* Ignore dotted circles, which may be used to keep combining characters from
935 combining with the backslash in #\charname. */
0dcd7e61
MG
936 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
937 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
938
889975e5 939 if (cp >= '0' && cp < '8')
7337d56d
LC
940 {
941 /* Dirk:FIXME:: This type of character syntax is not R5RS
942 * compliant. Further, it should be verified that the constant
0ffc78e3 943 * does only consist of octal digits. */
889975e5 944 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 945 if (SCM_I_INUMP (p))
0ffc78e3 946 {
e25f3727 947 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
948 if (SCM_IS_UNICODE_CHAR (c))
949 return SCM_MAKE_CHAR (c);
950 else
0f3a70cf 951 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
952 "out-of-range octal character escape: ~a",
953 scm_list_1 (charname));
954 }
7337d56d
LC
955 }
956
0f3a70cf 957 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
958 {
959 SCM p;
0f3a70cf 960
dea901d6
MG
961 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
962 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
963 scm_from_uint (16));
964 if (SCM_I_INUMP (p))
965 {
e25f3727 966 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
967 if (SCM_IS_UNICODE_CHAR (c))
968 return SCM_MAKE_CHAR (c);
969 else
970 scm_i_input_error (FUNC_NAME, port,
971 "out-of-range hex character escape: ~a",
972 scm_list_1 (charname));
973 }
974 }
975
889975e5
MG
976 /* The names of characters should never have non-Latin1
977 characters. */
978 if (scm_i_is_narrow_string (charname)
979 || scm_i_try_narrow_string (charname))
4769c9db
AW
980 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
981 charname_len);
982 if (scm_is_true (ch))
983 return ch;
984 }
7337d56d 985
7337d56d 986 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 987 scm_list_1 (charname));
7337d56d
LC
988
989 return SCM_UNSPECIFIED;
0f2d19dd 990}
db4b4ca6 991#undef FUNC_NAME
0f2d19dd 992
7337d56d
LC
993static inline SCM
994scm_read_keyword (int chr, SCM port)
995{
996 SCM symbol;
997
998 /* Read the symbol that comprises the keyword. Doing this instead of
999 invoking a specific symbol reader function allows `scm_read_keyword ()'
1000 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1001
7337d56d
LC
1002 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1003 symbol = scm_read_expression (port);
1004 if (!scm_is_symbol (symbol))
7f74cf9a 1005 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1006 "keyword prefix `~a' not followed by a symbol: ~s",
1007 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1008
1009 return (scm_symbol_to_keyword (symbol));
1010}
1011
1012static inline SCM
1013scm_read_vector (int chr, SCM port)
09a4f039 1014{
7337d56d
LC
1015 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016 guarantee that it's going to do what we want. After all, this is an
1017 implementation detail of `scm_read_vector ()', not a desirable
1018 property. */
1019 return (scm_vector (scm_read_sexp (chr, port)));
1020}
09a4f039 1021
7337d56d
LC
1022static inline SCM
1023scm_read_srfi4_vector (int chr, SCM port)
1024{
1025 return scm_i_read_array (port, chr);
1026}
1027
0ba0b384 1028static SCM
889975e5 1029scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384 1030{
be632904 1031 chr = scm_getc_unlocked (port);
0ba0b384
LC
1032 if (chr != 'u')
1033 goto syntax;
1034
be632904 1035 chr = scm_getc_unlocked (port);
0ba0b384
LC
1036 if (chr != '8')
1037 goto syntax;
1038
be632904 1039 chr = scm_getc_unlocked (port);
0ba0b384
LC
1040 if (chr != '(')
1041 goto syntax;
1042
1043 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1044
1045 syntax:
1046 scm_i_input_error ("read_bytevector", port,
1047 "invalid bytevector prefix",
1048 SCM_MAKE_CHAR (chr));
1049 return SCM_UNSPECIFIED;
1050}
1051
7337d56d 1052static SCM
889975e5 1053scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
1054{
1055 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1056 terribly inefficient but who cares? */
1057 SCM s_bits = SCM_EOL;
1058
be632904 1059 for (chr = scm_getc_unlocked (port);
7337d56d 1060 (chr != EOF) && ((chr == '0') || (chr == '1'));
be632904 1061 chr = scm_getc_unlocked (port))
09a4f039 1062 {
7337d56d 1063 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1064 }
7337d56d
LC
1065
1066 if (chr != EOF)
c932ce0b 1067 scm_ungetc_unlocked (chr, port);
7337d56d
LC
1068
1069 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1070}
1071
1072static inline SCM
889975e5 1073scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1074{
1075 int bang_seen = 0;
1076
1077 for (;;)
09a4f039 1078 {
be632904 1079 int c = scm_getc_unlocked (port);
62850ef3 1080
7337d56d
LC
1081 if (c == EOF)
1082 scm_i_input_error ("skip_block_comment", port,
1083 "unterminated `#! ... !#' comment", SCM_EOL);
1084
1085 if (c == '!')
1086 bang_seen = 1;
1087 else if (c == '#' && bang_seen)
1088 break;
1089 else
1090 bang_seen = 0;
1091 }
1092
1093 return SCM_UNSPECIFIED;
1094}
1095
d7fcaec3 1096static SCM
911b03b2
JG
1097scm_read_shebang (scm_t_wchar chr, SCM port)
1098{
1099 int c = 0;
0d959103 1100 if ((c = scm_get_byte_or_eof_unlocked (port)) != 'r')
911b03b2 1101 {
c932ce0b 1102 scm_ungetc_unlocked (c, port);
911b03b2
JG
1103 return scm_read_scsh_block_comment (chr, port);
1104 }
0d959103 1105 if ((c = scm_get_byte_or_eof_unlocked (port)) != '6')
911b03b2 1106 {
c932ce0b
AW
1107 scm_ungetc_unlocked (c, port);
1108 scm_ungetc_unlocked ('r', port);
911b03b2
JG
1109 return scm_read_scsh_block_comment (chr, port);
1110 }
0d959103 1111 if ((c = scm_get_byte_or_eof_unlocked (port)) != 'r')
911b03b2 1112 {
c932ce0b
AW
1113 scm_ungetc_unlocked (c, port);
1114 scm_ungetc_unlocked ('6', port);
1115 scm_ungetc_unlocked ('r', port);
911b03b2
JG
1116 return scm_read_scsh_block_comment (chr, port);
1117 }
0d959103 1118 if ((c = scm_get_byte_or_eof_unlocked (port)) != 's')
911b03b2 1119 {
c932ce0b
AW
1120 scm_ungetc_unlocked (c, port);
1121 scm_ungetc_unlocked ('r', port);
1122 scm_ungetc_unlocked ('6', port);
1123 scm_ungetc_unlocked ('r', port);
911b03b2
JG
1124 return scm_read_scsh_block_comment (chr, port);
1125 }
1126
1127 return SCM_UNSPECIFIED;
1128}
1129
620c8965
LC
1130static SCM
1131scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1132{
1133 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134 nested. So care must be taken. */
1135 int nesting_level = 1;
6d5f8c32 1136
be632904 1137 int a = scm_getc_unlocked (port);
6d5f8c32
AW
1138
1139 if (a == EOF)
1140 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1141 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1142
1143 while (nesting_level > 0)
1144 {
be632904 1145 int b = scm_getc_unlocked (port);
620c8965 1146
6d5f8c32 1147 if (b == EOF)
cd169c5a 1148 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1149 "unterminated `#| ... |#' comment", SCM_EOL);
1150
6d5f8c32
AW
1151 if (a == '|' && b == '#')
1152 {
1153 nesting_level--;
1154 b = EOF;
1155 }
1156 else if (a == '#' && b == '|')
1157 {
1158 nesting_level++;
1159 b = EOF;
1160 }
1161
1162 a = b;
620c8965
LC
1163 }
1164
1165 return SCM_UNSPECIFIED;
1166}
1167
34f3d47d 1168static SCM
889975e5 1169scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1170{
889975e5 1171 scm_t_wchar c;
34f3d47d
AW
1172
1173 c = flush_ws (port, (char *) NULL);
1174 if (EOF == c)
1175 scm_i_input_error ("read_commented_expression", port,
1176 "no expression after #; comment", SCM_EOL);
c932ce0b 1177 scm_ungetc_unlocked (c, port);
34f3d47d
AW
1178 scm_read_expression (port);
1179 return SCM_UNSPECIFIED;
1180}
1181
7337d56d 1182static SCM
889975e5 1183scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1184{
1185 /* Guile's extended symbol read syntax looks like this:
1186
1187 #{This is all a symbol name}#
1188
1189 So here, CHR is expected to be `{'. */
d9527cfa 1190 int saw_brace = 0;
7337d56d 1191 size_t len = 0;
190d4b0d 1192 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1193
889975e5 1194 buf = scm_i_string_start_writing (buf);
7337d56d 1195
be632904 1196 while ((chr = scm_getc_unlocked (port)) != EOF)
7337d56d
LC
1197 {
1198 if (saw_brace)
09a4f039 1199 {
7337d56d
LC
1200 if (chr == '#')
1201 {
7337d56d
LC
1202 break;
1203 }
1204 else
1205 {
1206 saw_brace = 0;
889975e5 1207 scm_i_string_set_x (buf, len++, '}');
7337d56d 1208 }
09a4f039 1209 }
d9527cfa
AW
1210
1211 if (chr == '}')
7337d56d 1212 saw_brace = 1;
d9527cfa
AW
1213 else if (chr == '\\')
1214 {
1215 /* It used to be that print.c would print extended-read-syntax
1216 symbols with backslashes before "non-standard" chars, but
1217 this routine wouldn't do anything with those escapes.
1218 Bummer. What we've done is to change print.c to output
1219 R6RS hex escapes for those characters, relying on the fact
1220 that the extended read syntax would never put a `\' before
1221 an `x'. For now, we just ignore other instances of
1222 backslash in the string. */
be632904 1223 switch ((chr = scm_getc_unlocked (port)))
d9527cfa
AW
1224 {
1225 case EOF:
1226 goto done;
1227 case 'x':
1228 {
1229 scm_t_wchar c;
1230
1231 SCM_READ_HEX_ESCAPE (10, ';');
1232 scm_i_string_set_x (buf, len++, c);
1233 break;
1234
1235 str_eof:
1236 chr = EOF;
1237 goto done;
1238
1239 bad_escaped:
1240 scm_i_string_stop_writing ();
1241 scm_i_input_error ("scm_read_extended_symbol", port,
1242 "illegal character in escape sequence: ~S",
1243 scm_list_1 (SCM_MAKE_CHAR (c)));
1244 break;
1245 }
1246 default:
1247 scm_i_string_set_x (buf, len++, chr);
1248 break;
1249 }
1250 }
7337d56d 1251 else
d9527cfa 1252 scm_i_string_set_x (buf, len++, chr);
62850ef3 1253
889975e5 1254 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1255 {
7f991c7d
LC
1256 SCM addy;
1257
889975e5 1258 scm_i_string_stop_writing ();
190d4b0d 1259 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1260 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1261 len = 0;
889975e5 1262 buf = scm_i_string_start_writing (buf);
7337d56d 1263 }
7337d56d 1264 }
d9527cfa
AW
1265
1266 done:
889975e5 1267 scm_i_string_stop_writing ();
d9527cfa
AW
1268 if (chr == EOF)
1269 scm_i_input_error ("scm_read_extended_symbol", port,
1270 "end of file while reading symbol", SCM_EOL);
7337d56d 1271
889975e5 1272 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1273}
1274
1275
1276\f
1277/* Top-level token readers, i.e., dispatchers. */
1278
1279static SCM
1280scm_read_sharp_extension (int chr, SCM port)
1281{
1282 SCM proc;
1283
1284 proc = scm_get_hash_procedure (chr);
1285 if (scm_is_true (scm_procedure_p (proc)))
1286 {
1287 long line = SCM_LINUM (port);
1288 int column = SCM_COL (port) - 2;
1289 SCM got;
1290
1291 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14
AW
1292
1293 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1294 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1295
1296 return got;
09a4f039 1297 }
7337d56d
LC
1298
1299 return SCM_UNSPECIFIED;
1300}
1301
1302/* The reader for the sharp `#' character. It basically dispatches reads
1303 among the above token readers. */
1304static SCM
889975e5 1305scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1306#define FUNC_NAME "scm_lreadr"
1307{
1308 SCM result;
1309
be632904 1310 chr = scm_getc_unlocked (port);
7337d56d
LC
1311
1312 result = scm_read_sharp_extension (chr, port);
1313 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1314 return result;
1315
1316 switch (chr)
1317 {
1318 case '\\':
1319 return (scm_read_character (chr, port));
1320 case '(':
1321 return (scm_read_vector (chr, port));
1322 case 's':
1323 case 'u':
1324 case 'f':
df941b5b 1325 case 'c':
7337d56d
LC
1326 /* This one may return either a boolean or an SRFI-4 vector. */
1327 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1328 case 'v':
1329 return (scm_read_bytevector (chr, port));
7337d56d
LC
1330 case '*':
1331 return (scm_read_guile_bit_vector (chr, port));
1332 case 't':
1333 case 'T':
1334 case 'F':
1335 /* This one may return either a boolean or an SRFI-4 vector. */
1336 return (scm_read_boolean (chr, port));
1337 case ':':
1338 return (scm_read_keyword (chr, port));
1339 case '0': case '1': case '2': case '3': case '4':
1340 case '5': case '6': case '7': case '8': case '9':
1341 case '@':
7337d56d
LC
1342 return (scm_i_read_array (port, chr));
1343
1344 case 'i':
1345 case 'e':
7337d56d
LC
1346 case 'b':
1347 case 'B':
1348 case 'o':
1349 case 'O':
1350 case 'd':
1351 case 'D':
1352 case 'x':
1353 case 'X':
1354 case 'I':
1355 case 'E':
1356 return (scm_read_number_and_radix (chr, port));
1357 case '{':
1358 return (scm_read_extended_symbol (chr, port));
1359 case '!':
911b03b2 1360 return (scm_read_shebang (chr, port));
34f3d47d
AW
1361 case ';':
1362 return (scm_read_commented_expression (chr, port));
1363 case '`':
1364 case '\'':
1365 case ',':
1366 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1367 case 'n':
1368 return (scm_read_nil (chr, port));
7337d56d
LC
1369 default:
1370 result = scm_read_sharp_extension (chr, port);
1371 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1372 {
1373 /* To remain compatible with 1.8 and earlier, the following
1374 characters have lower precedence than `read-hash-extend'
1375 characters. */
1376 switch (chr)
1377 {
1378 case '|':
1379 return scm_read_r6rs_block_comment (chr, port);
1380 default:
1381 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1382 scm_list_1 (SCM_MAKE_CHAR (chr)));
1383 }
1384 }
7337d56d
LC
1385 else
1386 return result;
1387 }
1388
1389 return SCM_UNSPECIFIED;
1390}
1391#undef FUNC_NAME
1392
1393static SCM
1394scm_read_expression (SCM port)
1395#define FUNC_NAME "scm_read_expression"
1396{
1397 while (1)
1398 {
889975e5 1399 register scm_t_wchar chr;
7337d56d 1400
be632904 1401 chr = scm_getc_unlocked (port);
7337d56d
LC
1402
1403 switch (chr)
1404 {
1405 case SCM_WHITE_SPACES:
1406 case SCM_LINE_INCREMENTORS:
1407 break;
1408 case ';':
1409 (void) scm_read_semicolon_comment (chr, port);
1410 break;
5afa815c
AW
1411 case '[':
1412 if (!SCM_SQUARE_BRACKETS_P)
1413 return (scm_read_mixed_case_symbol (chr, port));
1414 /* otherwise fall through */
7337d56d
LC
1415 case '(':
1416 return (scm_read_sexp (chr, port));
1417 case '"':
1418 return (scm_read_string (chr, port));
1419 case '\'':
1420 case '`':
1421 case ',':
1422 return (scm_read_quote (chr, port));
1423 case '#':
1424 {
1425 SCM result;
1426 result = scm_read_sharp (chr, port);
1427 if (scm_is_eq (result, SCM_UNSPECIFIED))
1428 /* We read a comment or some such. */
1429 break;
1430 else
1431 return result;
1432 }
1433 case ')':
1434 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1435 break;
a4e47229
MG
1436 case ']':
1437 if (SCM_SQUARE_BRACKETS_P)
1438 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1439 /* otherwise fall through */
7337d56d
LC
1440 case EOF:
1441 return SCM_EOF_VAL;
1442 case ':':
1443 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1444 return scm_symbol_to_keyword (scm_read_expression (port));
1445 /* Fall through. */
1446
1447 default:
1448 {
1449 if (((chr >= '0') && (chr <= '9'))
1450 || (strchr ("+-.", chr)))
1451 return (scm_read_number (chr, port));
1452 else
1453 return (scm_read_mixed_case_symbol (chr, port));
1454 }
1455 }
1456 }
1457}
1458#undef FUNC_NAME
1459
1460\f
1461/* Actual reader. */
1462
1463SCM_DEFINE (scm_read, "read", 0, 1, 0,
1464 (SCM port),
1465 "Read an s-expression from the input port @var{port}, or from\n"
1466 "the current input port if @var{port} is not specified.\n"
1467 "Any whitespace before the next token is discarded.")
1468#define FUNC_NAME s_scm_read
1469{
1470 int c;
1471
1472 if (SCM_UNBNDP (port))
1473 port = scm_current_input_port ();
1474 SCM_VALIDATE_OPINPORT (1, port);
1475
1476 c = flush_ws (port, (char *) NULL);
1477 if (EOF == c)
1478 return SCM_EOF_VAL;
c932ce0b 1479 scm_ungetc_unlocked (c, port);
7337d56d
LC
1480
1481 return (scm_read_expression (port));
09a4f039 1482}
db4b4ca6 1483#undef FUNC_NAME
09a4f039 1484
0f2d19dd
JB
1485
1486\f
1487
14de3b42
GH
1488/* Manipulate the read-hash-procedures alist. This could be written in
1489 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1490SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1491 (SCM chr, SCM proc),
dc7fa443
MG
1492 "Install the procedure @var{proc} for reading expressions\n"
1493 "starting with the character sequence @code{#} and @var{chr}.\n"
1494 "@var{proc} will be called with two arguments: the character\n"
1495 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1496 "returned will be the return value of @code{read}. \n"
1497 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1498 )
1bbd0b84 1499#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1500{
fed9c9a2
GH
1501 SCM this;
1502 SCM prev;
1503
36284627 1504 SCM_VALIDATE_CHAR (1, chr);
7888309b 1505 SCM_ASSERT (scm_is_false (proc)
bc36d050 1506 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1507 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1508
14de3b42 1509 /* Check if chr is already in the alist. */
d458073b 1510 this = scm_i_read_hash_procedures_ref ();
14de3b42 1511 prev = SCM_BOOL_F;
fed9c9a2
GH
1512 while (1)
1513 {
d2e53ed6 1514 if (scm_is_null (this))
fed9c9a2
GH
1515 {
1516 /* not found, so add it to the beginning. */
7888309b 1517 if (scm_is_true (proc))
fed9c9a2 1518 {
d458073b
AR
1519 SCM new = scm_cons (scm_cons (chr, proc),
1520 scm_i_read_hash_procedures_ref ());
1521 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1522 }
1523 break;
1524 }
bc36d050 1525 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1526 {
1527 /* already in the alist. */
7888309b 1528 if (scm_is_false (proc))
14de3b42
GH
1529 {
1530 /* remove it. */
7888309b 1531 if (scm_is_false (prev))
14de3b42 1532 {
d458073b
AR
1533 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1534 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1535 }
1536 else
1537 scm_set_cdr_x (prev, SCM_CDR (this));
1538 }
fed9c9a2 1539 else
14de3b42
GH
1540 {
1541 /* replace it. */
1542 scm_set_cdr_x (SCM_CAR (this), proc);
1543 }
fed9c9a2
GH
1544 break;
1545 }
1546 prev = this;
1547 this = SCM_CDR (this);
1548 }
deca31e1 1549
deca31e1
GH
1550 return SCM_UNSPECIFIED;
1551}
1bbd0b84 1552#undef FUNC_NAME
0f2d19dd 1553
deca31e1
GH
1554/* Recover the read-hash procedure corresponding to char c. */
1555static SCM
6e8d25a6 1556scm_get_hash_procedure (int c)
deca31e1 1557{
d458073b 1558 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1559
deca31e1
GH
1560 while (1)
1561 {
d2e53ed6 1562 if (scm_is_null (rest))
deca31e1
GH
1563 return SCM_BOOL_F;
1564
7866a09b 1565 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1566 return SCM_CDAR (rest);
1567
1568 rest = SCM_CDR (rest);
1569 }
1570}
1cc91f1b 1571
889975e5
MG
1572#define SCM_ENCODING_SEARCH_SIZE (500)
1573
f8a1c9a8
LC
1574/* Search the first few hundred characters of a file for an Emacs-like coding
1575 declaration. Returns either NULL or a string whose storage has been
1576 allocated with `scm_gc_malloc ()'. */
889975e5 1577char *
f8a1c9a8 1578scm_i_scan_for_encoding (SCM port)
889975e5 1579{
d900843c 1580 scm_t_port *pt;
889975e5 1581 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1582 size_t bytes_read, encoding_length, i;
889975e5
MG
1583 char *encoding = NULL;
1584 int utf8_bom = 0;
daedbca7 1585 char *pos, *encoding_start;
889975e5
MG
1586 int in_comment;
1587
d900843c 1588 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1589
d900843c
AW
1590 if (pt->rw_active == SCM_PORT_WRITE)
1591 scm_flush (port);
49bb5bd3 1592
d900843c
AW
1593 if (pt->rw_random)
1594 pt->rw_active = SCM_PORT_READ;
1595
1596 if (pt->read_pos == pt->read_end)
1597 {
1598 /* We can use the read buffer, and thus avoid a seek. */
1599 if (scm_fill_input (port) == EOF)
1600 return NULL;
1601
1602 bytes_read = pt->read_end - pt->read_pos;
1603 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1604 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1605
1606 if (bytes_read <= 1)
1607 /* An unbuffered port -- don't scan. */
1608 return NULL;
1609
1610 memcpy (header, pt->read_pos, bytes_read);
1611 header[bytes_read] = '\0';
1612 }
1613 else
1614 {
1615 /* Try to read some bytes and then seek back. Not all ports
1616 support seeking back; and indeed some file ports (like
1617 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1618 check performed by SCM_FPORT_FDES---but fail to seek
1619 backwards. Hence this block comes second. We prefer to use
1620 the read buffer in-place. */
1621 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1622 return NULL;
1623
be632904 1624 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
d900843c
AW
1625 header[bytes_read] = '\0';
1626 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1627 }
889975e5
MG
1628
1629 if (bytes_read > 3
1630 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1631 utf8_bom = 1;
1632
1633 /* search past "coding[:=]" */
1634 pos = header;
1635 while (1)
1636 {
1637 if ((pos = strstr(pos, "coding")) == NULL)
1638 return NULL;
1639
1640 pos += strlen("coding");
1641 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1642 (*pos == ':' || *pos == '='))
1643 {
1644 pos ++;
1645 break;
1646 }
1647 }
1648
1649 /* skip spaces */
1650 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1651 (*pos == ' ' || *pos == '\t'))
1652 pos ++;
1653
1654 /* grab the next token */
daedbca7 1655 encoding_start = pos;
889975e5 1656 i = 0;
daedbca7
MG
1657 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1658 && encoding_start + i - header < bytes_read
1659 && (isalnum ((int) encoding_start[i])
1660 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1661 i++;
1662
daedbca7
MG
1663 encoding_length = i;
1664 if (encoding_length == 0)
889975e5
MG
1665 return NULL;
1666
daedbca7
MG
1667 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1668 for (i = 0; i < encoding_length; i++)
026ed239 1669 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1670
1671 /* push backwards to make sure we were in a comment */
1672 in_comment = 0;
daedbca7
MG
1673 pos = encoding_start;
1674 while (pos >= header)
889975e5 1675 {
8a12aeb9
AW
1676 if (*pos == ';')
1677 {
1678 in_comment = 1;
1679 break;
1680 }
1681 else if (*pos == '\n' || pos == header)
889975e5
MG
1682 {
1683 /* This wasn't in a semicolon comment. Check for a
1684 hash-bang comment. */
1685 char *beg = strstr (header, "#!");
1686 char *end = strstr (header, "!#");
8a12aeb9 1687 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
1688 in_comment = 1;
1689 break;
1690 }
8a12aeb9
AW
1691 else
1692 {
1693 pos --;
1694 continue;
1695 }
889975e5
MG
1696 }
1697 if (!in_comment)
f8a1c9a8
LC
1698 /* This wasn't in a comment */
1699 return NULL;
1700
889975e5 1701 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1702 scm_misc_error (NULL,
889975e5
MG
1703 "the port input declares the encoding ~s but is encoded as UTF-8",
1704 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1705
889975e5
MG
1706 return encoding;
1707}
1708
1709SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1710 (SCM port),
a270e133 1711 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 1712 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
1713 "The coding declaration is of the form\n"
1714 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1715 "\n"
1716 "Returns a string containing the character encoding of the file\n"
1717 "if a declaration was found, or @code{#f} otherwise.\n")
1718#define FUNC_NAME s_scm_file_encoding
1719{
1720 char *enc;
1721 SCM s_enc;
f8a1c9a8 1722
d900843c
AW
1723 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1724
f8a1c9a8 1725 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1726 if (enc == NULL)
1727 return SCM_BOOL_F;
1728 else
1729 {
1730 s_enc = scm_from_locale_string (enc);
889975e5
MG
1731 return s_enc;
1732 }
f8a1c9a8 1733
889975e5
MG
1734 return SCM_BOOL_F;
1735}
1736#undef FUNC_NAME
1737
0f2d19dd
JB
1738void
1739scm_init_read ()
0f2d19dd 1740{
d458073b
AR
1741 SCM read_hash_procs;
1742
1743 read_hash_procs = scm_make_fluid ();
1744 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1745
1746 scm_i_read_hash_procedures =
1747 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 1748
62560650 1749 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1750#include "libguile/read.x"
0f2d19dd 1751}
89e00824
ML
1752
1753/*
1754 Local Variables:
1755 c-file-style: "gnu"
1756 End:
1757*/