fix a number of assuptions that a long could hold an inum
[bpt/guile.git] / libguile / read.c
CommitLineData
a8fc3852 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
7337d56d 32
a0599745 33#include "libguile/_scm.h"
0ba0b384 34#include "libguile/bytevectors.h"
a0599745
MD
35#include "libguile/chars.h"
36#include "libguile/eval.h"
2fa901a5 37#include "libguile/arrays.h"
cf396142 38#include "libguile/bitvectors.h"
a0599745
MD
39#include "libguile/keywords.h"
40#include "libguile/alist.h"
41#include "libguile/srcprop.h"
42#include "libguile/hashtab.h"
43#include "libguile/hash.h"
44#include "libguile/ports.h"
49bb5bd3 45#include "libguile/fports.h"
a0599745
MD
46#include "libguile/root.h"
47#include "libguile/strings.h"
ba1b2226 48#include "libguile/strports.h"
a0599745 49#include "libguile/vectors.h"
a0599745 50#include "libguile/validate.h"
a4022e69 51#include "libguile/srfi-4.h"
7337d56d 52#include "libguile/srfi-13.h"
ba1b2226 53
a0599745 54#include "libguile/read.h"
22fc179a
HWN
55#include "libguile/private-options.h"
56
0f2d19dd
JB
57
58\f
59
5bf6a6f0 60SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 61SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 62SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 63SCM_SYMBOL (sym_nil, "nil");
c7733771 64
92c2555f 65scm_t_option scm_read_opts[] = {
b7ff98dd
MD
66 { SCM_OPTION_BOOLEAN, "copy", 0,
67 "Copy source code expressions." },
ac74fc22 68 { SCM_OPTION_BOOLEAN, "positions", 0,
deca31e1
GH
69 "Record positions of source code expressions." },
70 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 71 "Convert symbols to lower case."},
3d27ef4b 72 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F,
904fabb6 73 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
74 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
75 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
76 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
77 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
62560650 78 { 0, },
a16f6fe7
MD
79};
80
39e8f371
HWN
81/*
82 Give meaningful error messages for errors
83
84 We use the format
85
ba1b2226 86 FILE:LINE:COL: MESSAGE
39e8f371
HWN
87 This happened in ....
88
89 This is not standard GNU format, but the test-suite likes the real
90 message to be in front.
91
39e8f371
HWN
92 */
93
94
a4022e69
MV
95void
96scm_i_input_error (char const *function,
97 SCM port, const char *message, SCM arg)
ba1b2226 98{
29a837fd
MV
99 SCM fn = (scm_is_string (SCM_FILENAME(port))
100 ? SCM_FILENAME(port)
101 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 102
29a837fd 103 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
104 SCM string = SCM_EOL;
105 scm_simple_format (string_port,
272632a6 106 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 107 scm_list_4 (fn,
b3aa4626 108 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 109 scm_from_int (SCM_COL (port) + 1),
272632a6 110 scm_from_locale_string (message)));
ba1b2226
HWN
111
112 string = scm_get_output_string (string_port);
113 scm_close_output_port (string_port);
272632a6 114 scm_error_scm (scm_from_locale_symbol ("read-error"),
a4022e69 115 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 116 string,
dd72382c 117 arg,
ba1b2226
HWN
118 SCM_BOOL_F);
119}
39e8f371
HWN
120
121
a1ec6916 122SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 123 (SCM setting),
dc7fa443
MG
124 "Option interface for the read options. Instead of using\n"
125 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 126 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 127#define FUNC_NAME s_scm_read_options
a16f6fe7 128{
b7ff98dd
MD
129 SCM ans = scm_options (setting,
130 scm_read_opts,
1bbd0b84 131 FUNC_NAME);
b7ff98dd
MD
132 if (SCM_COPY_SOURCE_P)
133 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
134 return ans;
135}
1bbd0b84 136#undef FUNC_NAME
a16f6fe7 137
d458073b
AR
138/* A fluid referring to an association list mapping extra hash
139 characters to procedures. */
140static SCM *scm_i_read_hash_procedures;
deca31e1 141
d458073b
AR
142static inline SCM
143scm_i_read_hash_procedures_ref (void)
144{
145 return scm_fluid_ref (*scm_i_read_hash_procedures);
146}
147
148static inline void
149scm_i_read_hash_procedures_set_x (SCM value)
150{
151 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
152}
0f2d19dd 153
7337d56d
LC
154\f
155/* Token readers. */
0f2d19dd 156
0f2d19dd 157
7337d56d
LC
158/* Size of the C buffer used to read symbols and numbers. */
159#define READER_BUFFER_SIZE 128
0f2d19dd 160
7337d56d
LC
161/* Size of the C buffer used to read strings. */
162#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 163
7337d56d
LC
164/* The maximum size of Scheme character names. */
165#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 166
94115ae3 167
7337d56d
LC
168/* `isblank' is only in C99. */
169#define CHAR_IS_BLANK_(_chr) \
170 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 171 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
172
173#ifdef MSDOS
174# define CHAR_IS_BLANK(_chr) \
175 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
176#else
177# define CHAR_IS_BLANK CHAR_IS_BLANK_
178#endif
179
180
181/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
182 structure''). */
183#define CHAR_IS_R5RS_DELIMITER(c) \
184 (CHAR_IS_BLANK (c) \
5afa815c
AW
185 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
186 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
187
188#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
189
190/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
191 Structure''. */
192#define CHAR_IS_EXPONENT_MARKER(_chr) \
193 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
194 || ((_chr) == 'd') || ((_chr) == 'l'))
195
454866e0 196/* Read an SCSH block comment. */
620c8965
LC
197static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
198static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
199static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 200static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 201static SCM scm_get_hash_procedure (int);
454866e0 202
69f90b0b
MG
203/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
204 result in the pre-allocated buffer BUF. Return zero if the whole token has
205 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
206 bytes actually read. */
7337d56d 207static inline int
69f90b0b
MG
208read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
209 {
210 *read = 0;
0520c320 211
69f90b0b
MG
212 while (*read < buf_size)
213 {
214 int chr;
7337d56d 215
69f90b0b 216 chr = scm_get_byte_or_eof (port);
889975e5 217
69f90b0b
MG
218 if (chr == EOF)
219 return 0;
220 else if (CHAR_IS_DELIMITER (chr))
221 {
222 scm_unget_byte (chr, port);
223 return 0;
224 }
225 else
226 {
227 *buf = (char) chr;
228 buf++, (*read)++;
229 }
230 }
889975e5 231
69f90b0b
MG
232 return 1;
233 }
889975e5 234
69f90b0b
MG
235/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
236 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
237 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
238 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
239 will be set the number of bytes actually read. */
240static int
241read_complete_token (SCM port, char *buffer, const size_t buffer_size,
242 char **overflow_buffer, size_t *read)
243{
244 int overflow = 0;
245 size_t bytes_read, overflow_size;
7337d56d 246
69f90b0b
MG
247 *overflow_buffer = NULL;
248 overflow_size = 0;
1cc91f1b 249
69f90b0b
MG
250 do
251 {
252 overflow = read_token (port, buffer, buffer_size, &bytes_read);
253 if (bytes_read == 0)
254 break;
255 if (overflow || overflow_size != 0)
256 {
257 if (overflow_size == 0)
258 {
259 *overflow_buffer = scm_malloc (bytes_read);
260 memcpy (*overflow_buffer, buffer, bytes_read);
261 overflow_size = bytes_read;
262 }
263 else
264 {
265 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
266 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
267 overflow_size += bytes_read;
268 }
269 }
889975e5 270 }
69f90b0b 271 while (overflow);
889975e5 272
69f90b0b
MG
273 if (overflow_size)
274 *read = overflow_size;
b521d265 275 else
69f90b0b
MG
276 *read = bytes_read;
277
278 return (overflow_size != 0);
889975e5 279}
7337d56d
LC
280
281/* Skip whitespace from PORT and return the first non-whitespace character
282 read. Raise an error on end-of-file. */
283static int
284flush_ws (SCM port, const char *eoferr)
0f2d19dd 285{
889975e5 286 register scm_t_wchar c;
0f2d19dd 287 while (1)
b7f3516f 288 switch (c = scm_getc (port))
0f2d19dd
JB
289 {
290 case EOF:
291 goteof:
292 if (eoferr)
d156d3b7 293 {
a4022e69
MV
294 scm_i_input_error (eoferr,
295 port,
296 "end of file",
297 SCM_EOL);
d156d3b7 298 }
0f2d19dd 299 return c;
7337d56d 300
0f2d19dd
JB
301 case ';':
302 lp:
b7f3516f 303 switch (c = scm_getc (port))
0f2d19dd
JB
304 {
305 case EOF:
306 goto goteof;
307 default:
308 goto lp;
309 case SCM_LINE_INCREMENTORS:
310 break;
311 }
312 break;
7337d56d 313
454866e0
LC
314 case '#':
315 switch (c = scm_getc (port))
316 {
317 case EOF:
318 eoferr = "read_sharp";
319 goto goteof;
320 case '!':
911b03b2 321 scm_read_shebang (c, port);
454866e0 322 break;
34f3d47d
AW
323 case ';':
324 scm_read_commented_expression (c, port);
325 break;
620c8965
LC
326 case '|':
327 if (scm_is_false (scm_get_hash_procedure (c)))
328 {
329 scm_read_r6rs_block_comment (c, port);
330 break;
331 }
332 /* fall through */
454866e0
LC
333 default:
334 scm_ungetc (c, port);
335 return '#';
336 }
337 break;
338
0f2d19dd 339 case SCM_LINE_INCREMENTORS:
0f2d19dd 340 case SCM_SINGLE_SPACES:
0f2d19dd 341 case '\t':
0f2d19dd 342 break;
7337d56d 343
0f2d19dd
JB
344 default:
345 return c;
346 }
7337d56d
LC
347
348 return 0;
0f2d19dd
JB
349}
350
351
7337d56d
LC
352\f
353/* Token readers. */
1cc91f1b 354
7337d56d
LC
355static SCM scm_read_expression (SCM port);
356static SCM scm_read_sharp (int chr, SCM port);
7337d56d 357static SCM recsexpr (SCM obj, long line, int column, SCM filename);
0f2d19dd
JB
358
359
09a4f039 360static SCM
889975e5 361scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 362#define FUNC_NAME "scm_i_lreadparen"
09a4f039 363{
7337d56d
LC
364 register int c;
365 register SCM tmp;
366 register SCM tl, ans = SCM_EOL;
bd22f1c7 367 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
5afa815c 368 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
369
370 /* Need to capture line and column numbers here. */
371 long line = SCM_LINUM (port);
372 int column = SCM_COL (port) - 1;
f9c68a47 373
f9c68a47 374
7337d56d
LC
375 c = flush_ws (port, FUNC_NAME);
376 if (terminating_char == c)
377 return SCM_EOL;
f9c68a47 378
7337d56d
LC
379 scm_ungetc (c, port);
380 if (scm_is_eq (scm_sym_dot,
381 (tmp = scm_read_expression (port))))
382 {
383 ans = scm_read_expression (port);
384 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
385 scm_i_input_error (FUNC_NAME, port, "missing close paren",
386 SCM_EOL);
387 return ans;
388 }
1cc91f1b 389
7337d56d
LC
390 /* Build the head of the list structure. */
391 ans = tl = scm_cons (tmp, SCM_EOL);
392
393 if (SCM_COPY_SOURCE_P)
394 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
395 ? copy
396 : tmp,
397 SCM_EOL);
398
399 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 400 {
7337d56d 401 SCM new_tail;
0f2d19dd 402
5b69315e
AW
403 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
404 scm_i_input_error (FUNC_NAME, port,
405 "in pair: mismatched close paren: ~A",
406 scm_list_1 (SCM_MAKE_CHAR (c)));
407
7337d56d 408 scm_ungetc (c, port);
5b69315e
AW
409 tmp = scm_read_expression (port);
410
411 if (scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 412 {
7337d56d
LC
413 SCM_SETCDR (tl, tmp = scm_read_expression (port));
414
415 if (SCM_COPY_SOURCE_P)
416 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
417 SCM_EOL));
418
419 c = flush_ws (port, FUNC_NAME);
420 if (terminating_char != c)
421 scm_i_input_error (FUNC_NAME, port,
422 "in pair: missing close paren", SCM_EOL);
423 goto exit;
0f2d19dd 424 }
b858464a 425
7337d56d
LC
426 new_tail = scm_cons (tmp, SCM_EOL);
427 SCM_SETCDR (tl, new_tail);
428 tl = new_tail;
429
430 if (SCM_COPY_SOURCE_P)
0f2d19dd 431 {
7337d56d
LC
432 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
433 ? copy
434 : tmp, SCM_EOL);
435 SCM_SETCDR (tl2, new_tail2);
436 tl2 = new_tail2;
437 }
438 }
0f2d19dd 439
7337d56d
LC
440 exit:
441 if (SCM_RECORD_POSITIONS_P)
442 scm_whash_insert (scm_source_whash,
443 ans,
444 scm_make_srcprops (line, column,
445 SCM_FILENAME (port),
446 SCM_COPY_SOURCE_P
447 ? ans2
448 : SCM_UNDEFINED,
449 SCM_EOL));
450 return ans;
451}
452#undef FUNC_NAME
a4022e69 453
c5661d28
MG
454
455/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
456 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
457 found. */
458#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
459 do \
460 { \
461 scm_t_wchar a; \
462 size_t i = 0; \
463 c = 0; \
464 while (i < ndigits) \
465 { \
466 a = scm_getc (port); \
467 if (a == EOF) \
468 goto str_eof; \
469 if (terminator \
470 && (a == (scm_t_wchar) terminator) \
471 && (i > 0)) \
472 break; \
473 if ('0' <= a && a <= '9') \
474 a -= '0'; \
475 else if ('A' <= a && a <= 'F') \
476 a = a - 'A' + 10; \
477 else if ('a' <= a && a <= 'f') \
478 a = a - 'a' + 10; \
479 else \
480 { \
481 c = a; \
482 goto bad_escaped; \
483 } \
484 c = c * 16 + a; \
485 i ++; \
486 } \
c5661d28
MG
487 } while (0)
488
7337d56d
LC
489static SCM
490scm_read_string (int chr, SCM port)
491#define FUNC_NAME "scm_lreadr"
492{
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
0f2d19dd 495
7337d56d 496 SCM str = SCM_BOOL_F;
7337d56d 497 unsigned c_str_len = 0;
9c44cd45 498 scm_t_wchar c;
eb42ff25 499
9c44cd45 500 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
7337d56d
LC
501 while ('"' != (c = scm_getc (port)))
502 {
503 if (c == EOF)
9c44cd45
MG
504 {
505 str_eof:
506 scm_i_input_error (FUNC_NAME, port,
507 "end of file in string constant", SCM_EOL);
508 }
0f2d19dd 509
9c44cd45
MG
510 if (c_str_len + 1 >= scm_i_string_length (str))
511 {
512 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
0f2d19dd 513
9c44cd45
MG
514 str = scm_string_append (scm_list_2 (str, addy));
515 }
0f2d19dd 516
7337d56d 517 if (c == '\\')
9c44cd45
MG
518 {
519 switch (c = scm_getc (port))
520 {
521 case EOF:
522 goto str_eof;
523 case '"':
524 case '\\':
525 break;
9c44cd45
MG
526 case '\n':
527 continue;
528 case '0':
529 c = '\0';
530 break;
531 case 'f':
532 c = '\f';
533 break;
534 case 'n':
535 c = '\n';
536 break;
537 case 'r':
538 c = '\r';
539 break;
540 case 't':
541 c = '\t';
542 break;
543 case 'a':
544 c = '\007';
545 break;
546 case 'v':
547 c = '\v';
548 break;
67a4a16d
MG
549 case 'b':
550 c = '\010';
551 break;
9c44cd45 552 case 'x':
dea901d6
MG
553 if (SCM_R6RS_ESCAPES_P)
554 SCM_READ_HEX_ESCAPE (10, ';');
555 else
556 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 557 break;
9c44cd45 558 case 'u':
898a0b5a
MG
559 if (!SCM_R6RS_ESCAPES_P)
560 {
561 SCM_READ_HEX_ESCAPE (4, '\0');
562 break;
563 }
9c44cd45 564 case 'U':
898a0b5a
MG
565 if (!SCM_R6RS_ESCAPES_P)
566 {
567 SCM_READ_HEX_ESCAPE (6, '\0');
568 break;
569 }
9c44cd45
MG
570 default:
571 bad_escaped:
572 scm_i_input_error (FUNC_NAME, port,
573 "illegal character in escape sequence: ~S",
574 scm_list_1 (SCM_MAKE_CHAR (c)));
575 }
576 }
577 str = scm_i_string_start_writing (str);
578 scm_i_string_set_x (str, c_str_len++, c);
579 scm_i_string_stop_writing ();
7337d56d 580 }
f13b4400 581
7337d56d
LC
582 if (c_str_len > 0)
583 {
9c44cd45 584 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 585 }
69f90b0b 586
9c44cd45 587 return scm_nullstr;
0f2d19dd 588}
db4b4ca6
DH
589#undef FUNC_NAME
590
0f2d19dd 591
7337d56d 592static SCM
889975e5 593scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 594{
69f90b0b
MG
595 SCM result, str = SCM_EOL;
596 char buffer[READER_BUFFER_SIZE];
597 char *overflow_buffer = NULL;
598 size_t bytes_read;
599 int overflow;
600 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 601
7337d56d 602 scm_ungetc (chr, port);
69f90b0b
MG
603 overflow = read_complete_token (port, buffer, sizeof (buffer),
604 &overflow_buffer, &bytes_read);
605
606 if (!overflow)
607 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
608 else
609 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
610 pt->ilseq_handler);
611
612 result = scm_string_to_number (str, SCM_UNDEFINED);
889975e5 613 if (!scm_is_true (result))
69f90b0b
MG
614 {
615 /* Return a symbol instead of a number */
616 if (SCM_CASE_INSENSITIVE_P)
617 str = scm_string_downcase_x (str);
618 result = scm_string_to_symbol (str);
619 }
0f2d19dd 620
69f90b0b
MG
621 if (overflow)
622 free (overflow_buffer);
623 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
624 return result;
625}
0f2d19dd 626
7337d56d 627static SCM
889975e5 628scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 629{
889975e5
MG
630 SCM result;
631 int ends_with_colon = 0;
69f90b0b 632 size_t bytes_read;
ef4cbc08 633 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
69f90b0b
MG
634 int overflow;
635 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
636 scm_t_port *pt = SCM_PTAB_ENTRY (port);
637 SCM str;
7337d56d
LC
638
639 scm_ungetc (chr, port);
69f90b0b
MG
640 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
641 &overflow_buffer, &bytes_read);
642 if (bytes_read > 0)
643 {
644 if (!overflow)
645 ends_with_colon = buffer[bytes_read - 1] == ':';
646 else
647 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
648 }
ef4cbc08 649
69f90b0b
MG
650 if (postfix && ends_with_colon && (bytes_read > 1))
651 {
652 if (!overflow)
653 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
654 else
655 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
656 pt->ilseq_handler);
657
658 if (SCM_CASE_INSENSITIVE_P)
659 str = scm_string_downcase_x (str);
660 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
661 }
7337d56d 662 else
69f90b0b
MG
663 {
664 if (!overflow)
665 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
666 else
667 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
668 pt->ilseq_handler);
669
670 if (SCM_CASE_INSENSITIVE_P)
671 str = scm_string_downcase_x (str);
672 result = scm_string_to_symbol (str);
673 }
7337d56d 674
69f90b0b
MG
675 if (overflow)
676 free (overflow_buffer);
677 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
678 return result;
679}
680
681static SCM
889975e5 682scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
683#define FUNC_NAME "scm_lreadr"
684{
889975e5 685 SCM result;
7337d56d 686 size_t read;
69f90b0b
MG
687 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
688 int overflow;
7337d56d 689 unsigned int radix;
69f90b0b
MG
690 SCM str;
691 scm_t_port *pt;
7337d56d
LC
692
693 switch (chr)
694 {
695 case 'B':
696 case 'b':
697 radix = 2;
698 break;
699
700 case 'o':
701 case 'O':
702 radix = 8;
703 break;
704
705 case 'd':
706 case 'D':
707 radix = 10;
708 break;
709
710 case 'x':
711 case 'X':
712 radix = 16;
713 break;
714
715 default:
716 scm_ungetc (chr, port);
717 scm_ungetc ('#', port);
718 radix = 10;
719 }
720
69f90b0b
MG
721 overflow = read_complete_token (port, buffer, sizeof (buffer),
722 &overflow_buffer, &read);
723
724 pt = SCM_PTAB_ENTRY (port);
725 if (!overflow)
726 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
727 else
728 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
729 pt->ilseq_handler);
730
731 result = scm_string_to_number (str, scm_from_uint (radix));
732
733 if (overflow)
734 free (overflow_buffer);
735
736 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
737
738 if (scm_is_true (result))
739 return result;
740
741 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
742
743 return SCM_BOOL_F;
744}
745#undef FUNC_NAME
746
747static SCM
748scm_read_quote (int chr, SCM port)
749{
750 SCM p;
492faee1
LC
751 long line = SCM_LINUM (port);
752 int column = SCM_COL (port) - 1;
7337d56d
LC
753
754 switch (chr)
755 {
756 case '`':
757 p = scm_sym_quasiquote;
758 break;
759
760 case '\'':
761 p = scm_sym_quote;
762 break;
763
764 case ',':
765 {
889975e5 766 scm_t_wchar c;
7337d56d
LC
767
768 c = scm_getc (port);
769 if ('@' == c)
770 p = scm_sym_uq_splicing;
771 else
0f2d19dd 772 {
7337d56d
LC
773 scm_ungetc (c, port);
774 p = scm_sym_unquote;
0f2d19dd 775 }
7337d56d
LC
776 break;
777 }
0f2d19dd 778
7337d56d
LC
779 default:
780 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 781 "scm_read_quote", chr);
7337d56d 782 abort ();
0f2d19dd 783 }
1cc91f1b 784
7337d56d 785 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1
LC
786 if (SCM_RECORD_POSITIONS_P)
787 scm_whash_insert (scm_source_whash, p,
788 scm_make_srcprops (line, column,
789 SCM_FILENAME (port),
790 SCM_COPY_SOURCE_P
791 ? (scm_cons2 (SCM_CAR (p),
792 SCM_CAR (SCM_CDR (p)),
793 SCM_EOL))
794 : SCM_UNDEFINED,
795 SCM_EOL));
796
0f2d19dd 797
7337d56d
LC
798 return p;
799}
800
34f3d47d
AW
801SCM_SYMBOL (sym_syntax, "syntax");
802SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
803SCM_SYMBOL (sym_unsyntax, "unsyntax");
804SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
805
806static SCM
807scm_read_syntax (int chr, SCM port)
808{
809 SCM p;
810 long line = SCM_LINUM (port);
811 int column = SCM_COL (port) - 1;
812
813 switch (chr)
814 {
815 case '`':
816 p = sym_quasisyntax;
817 break;
818
819 case '\'':
820 p = sym_syntax;
821 break;
822
823 case ',':
824 {
825 int c;
826
827 c = scm_getc (port);
828 if ('@' == c)
829 p = sym_unsyntax_splicing;
830 else
831 {
832 scm_ungetc (c, port);
833 p = sym_unsyntax;
834 }
835 break;
836 }
837
838 default:
839 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
840 "scm_read_syntax", chr);
841 abort ();
842 }
843
844 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
845 if (SCM_RECORD_POSITIONS_P)
846 scm_whash_insert (scm_source_whash, p,
847 scm_make_srcprops (line, column,
848 SCM_FILENAME (port),
849 SCM_COPY_SOURCE_P
850 ? (scm_cons2 (SCM_CAR (p),
851 SCM_CAR (SCM_CDR (p)),
852 SCM_EOL))
853 : SCM_UNDEFINED,
854 SCM_EOL));
855
856
857 return p;
858}
859
7c4aad9c
AW
860static inline SCM
861scm_read_nil (int chr, SCM port)
862{
863 SCM id = scm_read_mixed_case_symbol (chr, port);
864
865 if (!scm_is_eq (id, sym_nil))
866 scm_i_input_error ("scm_read_nil", port,
867 "unexpected input while reading #nil: ~a",
868 scm_list_1 (id));
869
870 return SCM_ELISP_NIL;
871}
872
7337d56d
LC
873static inline SCM
874scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 875{
0f2d19dd
JB
876 int c;
877
889975e5
MG
878 /* We use the get_byte here because there is no need to get the
879 locale correct with comment input. This presumes that newline
880 always represents itself no matter what the encoding is. */
881 for (c = scm_get_byte_or_eof (port);
7337d56d 882 (c != EOF) && (c != '\n');
8a8da78d 883 c = scm_get_byte_or_eof (port));
7337d56d
LC
884
885 return SCM_UNSPECIFIED;
886}
887
888\f
889/* Sharp readers, i.e. readers called after a `#' sign has been read. */
890
891static SCM
892scm_read_boolean (int chr, SCM port)
893{
894 switch (chr)
0f2d19dd 895 {
7337d56d
LC
896 case 't':
897 case 'T':
898 return SCM_BOOL_T;
899
900 case 'f':
901 case 'F':
902 return SCM_BOOL_F;
0f2d19dd 903 }
7337d56d
LC
904
905 return SCM_UNSPECIFIED;
906}
907
908static SCM
889975e5 909scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
910#define FUNC_NAME "scm_lreadr"
911{
69f90b0b
MG
912 char buffer[READER_CHAR_NAME_MAX_SIZE];
913 SCM charname;
914 size_t charname_len, bytes_read;
889975e5
MG
915 scm_t_wchar cp;
916 int overflow;
69f90b0b 917 scm_t_port *pt;
889975e5 918
69f90b0b 919 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 920 if (overflow)
c1a0ba1c 921 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 922
69f90b0b 923 if (bytes_read == 0)
0f2d19dd 924 {
7337d56d
LC
925 chr = scm_getc (port);
926 if (chr == EOF)
927 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
928 "while reading character", SCM_EOL);
929
930 /* CHR must be a token delimiter, like a whitespace. */
931 return (SCM_MAKE_CHAR (chr));
0f2d19dd 932 }
7337d56d 933
69f90b0b 934 pt = SCM_PTAB_ENTRY (port);
7337d56d 935
69f90b0b
MG
936 /* Simple ASCII characters can be processed immediately. Also, simple
937 ISO-8859-1 characters can be processed immediately if the encoding for this
938 port is ISO-8859-1. */
939 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
940 {
941 SCM_COL (port) += 1;
942 return SCM_MAKE_CHAR (buffer[0]);
943 }
944
945 /* Otherwise, convert the buffer into a proper scheme string for
946 processing. */
947 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
948 pt->ilseq_handler);
949 charname_len = scm_i_string_length (charname);
950 SCM_COL (port) += charname_len;
889975e5 951 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
952 if (charname_len == 1)
953 return SCM_MAKE_CHAR (cp);
954
955 /* Ignore dotted circles, which may be used to keep combining characters from
956 combining with the backslash in #\charname. */
0dcd7e61
MG
957 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
958 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
959
889975e5 960 if (cp >= '0' && cp < '8')
7337d56d
LC
961 {
962 /* Dirk:FIXME:: This type of character syntax is not R5RS
963 * compliant. Further, it should be verified that the constant
0ffc78e3 964 * does only consist of octal digits. */
889975e5 965 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 966 if (SCM_I_INUMP (p))
0ffc78e3 967 {
e25f3727 968 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
969 if (SCM_IS_UNICODE_CHAR (c))
970 return SCM_MAKE_CHAR (c);
971 else
0f3a70cf 972 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
973 "out-of-range octal character escape: ~a",
974 scm_list_1 (charname));
975 }
7337d56d
LC
976 }
977
0f3a70cf 978 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
979 {
980 SCM p;
0f3a70cf 981
dea901d6
MG
982 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
983 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
984 scm_from_uint (16));
985 if (SCM_I_INUMP (p))
986 {
e25f3727 987 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
988 if (SCM_IS_UNICODE_CHAR (c))
989 return SCM_MAKE_CHAR (c);
990 else
991 scm_i_input_error (FUNC_NAME, port,
992 "out-of-range hex character escape: ~a",
993 scm_list_1 (charname));
994 }
995 }
996
889975e5
MG
997 /* The names of characters should never have non-Latin1
998 characters. */
999 if (scm_i_is_narrow_string (charname)
1000 || scm_i_try_narrow_string (charname))
4769c9db
AW
1001 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1002 charname_len);
1003 if (scm_is_true (ch))
1004 return ch;
1005 }
7337d56d 1006
7337d56d 1007 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 1008 scm_list_1 (charname));
7337d56d
LC
1009
1010 return SCM_UNSPECIFIED;
0f2d19dd 1011}
db4b4ca6 1012#undef FUNC_NAME
0f2d19dd 1013
7337d56d
LC
1014static inline SCM
1015scm_read_keyword (int chr, SCM port)
1016{
1017 SCM symbol;
1018
1019 /* Read the symbol that comprises the keyword. Doing this instead of
1020 invoking a specific symbol reader function allows `scm_read_keyword ()'
1021 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1022
7337d56d
LC
1023 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1024 symbol = scm_read_expression (port);
1025 if (!scm_is_symbol (symbol))
7f74cf9a 1026 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1027 "keyword prefix `~a' not followed by a symbol: ~s",
1028 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1029
1030 return (scm_symbol_to_keyword (symbol));
1031}
1032
1033static inline SCM
1034scm_read_vector (int chr, SCM port)
09a4f039 1035{
7337d56d
LC
1036 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1037 guarantee that it's going to do what we want. After all, this is an
1038 implementation detail of `scm_read_vector ()', not a desirable
1039 property. */
1040 return (scm_vector (scm_read_sexp (chr, port)));
1041}
09a4f039 1042
7337d56d
LC
1043static inline SCM
1044scm_read_srfi4_vector (int chr, SCM port)
1045{
1046 return scm_i_read_array (port, chr);
1047}
1048
0ba0b384 1049static SCM
889975e5 1050scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
1051{
1052 chr = scm_getc (port);
1053 if (chr != 'u')
1054 goto syntax;
1055
1056 chr = scm_getc (port);
1057 if (chr != '8')
1058 goto syntax;
1059
1060 chr = scm_getc (port);
1061 if (chr != '(')
1062 goto syntax;
1063
1064 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1065
1066 syntax:
1067 scm_i_input_error ("read_bytevector", port,
1068 "invalid bytevector prefix",
1069 SCM_MAKE_CHAR (chr));
1070 return SCM_UNSPECIFIED;
1071}
1072
7337d56d 1073static SCM
889975e5 1074scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
1075{
1076 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1077 terribly inefficient but who cares? */
1078 SCM s_bits = SCM_EOL;
1079
1080 for (chr = scm_getc (port);
1081 (chr != EOF) && ((chr == '0') || (chr == '1'));
1082 chr = scm_getc (port))
09a4f039 1083 {
7337d56d 1084 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1085 }
7337d56d
LC
1086
1087 if (chr != EOF)
1088 scm_ungetc (chr, port);
1089
1090 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1091}
1092
1093static inline SCM
889975e5 1094scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1095{
1096 int bang_seen = 0;
1097
889975e5
MG
1098 /* We can use the get_byte here because there is no need to get the
1099 locale correct when reading comments. This presumes that
1100 hash and exclamation points always represent themselves no
1101 matter what the source encoding is.*/
7337d56d 1102 for (;;)
09a4f039 1103 {
889975e5 1104 int c = scm_get_byte_or_eof (port);
62850ef3 1105
7337d56d
LC
1106 if (c == EOF)
1107 scm_i_input_error ("skip_block_comment", port,
1108 "unterminated `#! ... !#' comment", SCM_EOL);
1109
1110 if (c == '!')
1111 bang_seen = 1;
1112 else if (c == '#' && bang_seen)
1113 break;
1114 else
1115 bang_seen = 0;
1116 }
1117
1118 return SCM_UNSPECIFIED;
1119}
1120
911b03b2
JG
1121static inline SCM
1122scm_read_shebang (scm_t_wchar chr, SCM port)
1123{
1124 int c = 0;
1125 if ((c = scm_get_byte_or_eof (port)) != 'r')
1126 {
1127 scm_ungetc (c, port);
1128 return scm_read_scsh_block_comment (chr, port);
1129 }
1130 if ((c = scm_get_byte_or_eof (port)) != '6')
1131 {
1132 scm_ungetc (c, port);
1133 scm_ungetc ('r', port);
1134 return scm_read_scsh_block_comment (chr, port);
1135 }
1136 if ((c = scm_get_byte_or_eof (port)) != 'r')
1137 {
1138 scm_ungetc (c, port);
1139 scm_ungetc ('6', port);
1140 scm_ungetc ('r', port);
1141 return scm_read_scsh_block_comment (chr, port);
1142 }
1143 if ((c = scm_get_byte_or_eof (port)) != 's')
1144 {
1145 scm_ungetc (c, port);
1146 scm_ungetc ('r', port);
1147 scm_ungetc ('6', port);
1148 scm_ungetc ('r', port);
1149 return scm_read_scsh_block_comment (chr, port);
1150 }
1151
1152 return SCM_UNSPECIFIED;
1153}
1154
620c8965
LC
1155static SCM
1156scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1157{
1158 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1159 nested. So care must be taken. */
1160 int nesting_level = 1;
1161 int opening_seen = 0, closing_seen = 0;
1162
1163 while (nesting_level > 0)
1164 {
1165 int c = scm_getc (port);
1166
1167 if (c == EOF)
cd169c5a 1168 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1169 "unterminated `#| ... |#' comment", SCM_EOL);
1170
1171 if (opening_seen)
1172 {
1173 if (c == '|')
1174 nesting_level++;
1175 opening_seen = 0;
1176 }
1177 else if (closing_seen)
1178 {
1179 if (c == '#')
1180 nesting_level--;
1181 closing_seen = 0;
1182 }
1183 else if (c == '|')
1184 closing_seen = 1;
1185 else if (c == '#')
1186 opening_seen = 1;
1187 else
1188 opening_seen = closing_seen = 0;
1189 }
1190
1191 return SCM_UNSPECIFIED;
1192}
1193
34f3d47d 1194static SCM
889975e5 1195scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1196{
889975e5 1197 scm_t_wchar c;
34f3d47d
AW
1198
1199 c = flush_ws (port, (char *) NULL);
1200 if (EOF == c)
1201 scm_i_input_error ("read_commented_expression", port,
1202 "no expression after #; comment", SCM_EOL);
1203 scm_ungetc (c, port);
1204 scm_read_expression (port);
1205 return SCM_UNSPECIFIED;
1206}
1207
7337d56d 1208static SCM
889975e5 1209scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1210{
1211 /* Guile's extended symbol read syntax looks like this:
1212
1213 #{This is all a symbol name}#
1214
1215 So here, CHR is expected to be `{'. */
7337d56d
LC
1216 int saw_brace = 0, finished = 0;
1217 size_t len = 0;
889975e5 1218 SCM buf = scm_i_make_string (1024, NULL);
7337d56d 1219
889975e5 1220 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1221
1222 while ((chr = scm_getc (port)) != EOF)
1223 {
1224 if (saw_brace)
09a4f039 1225 {
7337d56d
LC
1226 if (chr == '#')
1227 {
1228 finished = 1;
1229 break;
1230 }
1231 else
1232 {
1233 saw_brace = 0;
889975e5
MG
1234 scm_i_string_set_x (buf, len++, '}');
1235 scm_i_string_set_x (buf, len++, chr);
7337d56d 1236 }
09a4f039 1237 }
7337d56d
LC
1238 else if (chr == '}')
1239 saw_brace = 1;
1240 else
889975e5 1241 scm_i_string_set_x (buf, len++, chr);
62850ef3 1242
889975e5 1243 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1244 {
7f991c7d
LC
1245 SCM addy;
1246
889975e5 1247 scm_i_string_stop_writing ();
7f991c7d 1248 addy = scm_i_make_string (1024, NULL);
889975e5 1249 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1250 len = 0;
889975e5 1251 buf = scm_i_string_start_writing (buf);
7337d56d 1252 }
62850ef3 1253
7337d56d
LC
1254 if (finished)
1255 break;
1256 }
889975e5 1257 scm_i_string_stop_writing ();
7337d56d 1258
889975e5 1259 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1260}
1261
1262
1263\f
1264/* Top-level token readers, i.e., dispatchers. */
1265
1266static SCM
1267scm_read_sharp_extension (int chr, SCM port)
1268{
1269 SCM proc;
1270
1271 proc = scm_get_hash_procedure (chr);
1272 if (scm_is_true (scm_procedure_p (proc)))
1273 {
1274 long line = SCM_LINUM (port);
1275 int column = SCM_COL (port) - 2;
1276 SCM got;
1277
1278 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1279 if (!scm_is_eq (got, SCM_UNSPECIFIED))
62850ef3 1280 {
7337d56d
LC
1281 if (SCM_RECORD_POSITIONS_P)
1282 return (recsexpr (got, line, column,
1283 SCM_FILENAME (port)));
1284 else
1285 return got;
62850ef3 1286 }
09a4f039 1287 }
7337d56d
LC
1288
1289 return SCM_UNSPECIFIED;
1290}
1291
1292/* The reader for the sharp `#' character. It basically dispatches reads
1293 among the above token readers. */
1294static SCM
889975e5 1295scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1296#define FUNC_NAME "scm_lreadr"
1297{
1298 SCM result;
1299
1300 chr = scm_getc (port);
1301
1302 result = scm_read_sharp_extension (chr, port);
1303 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1304 return result;
1305
1306 switch (chr)
1307 {
1308 case '\\':
1309 return (scm_read_character (chr, port));
1310 case '(':
1311 return (scm_read_vector (chr, port));
1312 case 's':
1313 case 'u':
1314 case 'f':
1315 /* This one may return either a boolean or an SRFI-4 vector. */
1316 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1317 case 'v':
1318 return (scm_read_bytevector (chr, port));
7337d56d
LC
1319 case '*':
1320 return (scm_read_guile_bit_vector (chr, port));
1321 case 't':
1322 case 'T':
1323 case 'F':
1324 /* This one may return either a boolean or an SRFI-4 vector. */
1325 return (scm_read_boolean (chr, port));
1326 case ':':
1327 return (scm_read_keyword (chr, port));
1328 case '0': case '1': case '2': case '3': case '4':
1329 case '5': case '6': case '7': case '8': case '9':
1330 case '@':
1331#if SCM_ENABLE_DEPRECATED
1332 /* See below for 'i' and 'e'. */
1333 case 'a':
1334 case 'c':
1335 case 'y':
1336 case 'h':
1337 case 'l':
1338#endif
1339 return (scm_i_read_array (port, chr));
1340
1341 case 'i':
1342 case 'e':
1343#if SCM_ENABLE_DEPRECATED
1344 {
1345 /* When next char is '(', it really is an old-style
1346 uniform array. */
889975e5 1347 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1348 if (next_c != EOF)
1349 scm_ungetc (next_c, port);
1350 if (next_c == '(')
1351 return scm_i_read_array (port, chr);
1352 /* Fall through. */
1353 }
1354#endif
1355 case 'b':
1356 case 'B':
1357 case 'o':
1358 case 'O':
1359 case 'd':
1360 case 'D':
1361 case 'x':
1362 case 'X':
1363 case 'I':
1364 case 'E':
1365 return (scm_read_number_and_radix (chr, port));
1366 case '{':
1367 return (scm_read_extended_symbol (chr, port));
1368 case '!':
911b03b2 1369 return (scm_read_shebang (chr, port));
34f3d47d
AW
1370 case ';':
1371 return (scm_read_commented_expression (chr, port));
1372 case '`':
1373 case '\'':
1374 case ',':
1375 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1376 case 'n':
1377 return (scm_read_nil (chr, port));
7337d56d
LC
1378 default:
1379 result = scm_read_sharp_extension (chr, port);
1380 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1381 {
1382 /* To remain compatible with 1.8 and earlier, the following
1383 characters have lower precedence than `read-hash-extend'
1384 characters. */
1385 switch (chr)
1386 {
1387 case '|':
1388 return scm_read_r6rs_block_comment (chr, port);
1389 default:
1390 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1391 scm_list_1 (SCM_MAKE_CHAR (chr)));
1392 }
1393 }
7337d56d
LC
1394 else
1395 return result;
1396 }
1397
1398 return SCM_UNSPECIFIED;
1399}
1400#undef FUNC_NAME
1401
1402static SCM
1403scm_read_expression (SCM port)
1404#define FUNC_NAME "scm_read_expression"
1405{
1406 while (1)
1407 {
889975e5 1408 register scm_t_wchar chr;
7337d56d
LC
1409
1410 chr = scm_getc (port);
1411
1412 switch (chr)
1413 {
1414 case SCM_WHITE_SPACES:
1415 case SCM_LINE_INCREMENTORS:
1416 break;
1417 case ';':
1418 (void) scm_read_semicolon_comment (chr, port);
1419 break;
5afa815c
AW
1420 case '[':
1421 if (!SCM_SQUARE_BRACKETS_P)
1422 return (scm_read_mixed_case_symbol (chr, port));
1423 /* otherwise fall through */
7337d56d
LC
1424 case '(':
1425 return (scm_read_sexp (chr, port));
1426 case '"':
1427 return (scm_read_string (chr, port));
1428 case '\'':
1429 case '`':
1430 case ',':
1431 return (scm_read_quote (chr, port));
1432 case '#':
1433 {
1434 SCM result;
1435 result = scm_read_sharp (chr, port);
1436 if (scm_is_eq (result, SCM_UNSPECIFIED))
1437 /* We read a comment or some such. */
1438 break;
1439 else
1440 return result;
1441 }
1442 case ')':
1443 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1444 break;
a4e47229
MG
1445 case ']':
1446 if (SCM_SQUARE_BRACKETS_P)
1447 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1448 /* otherwise fall through */
7337d56d
LC
1449 case EOF:
1450 return SCM_EOF_VAL;
1451 case ':':
1452 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1453 return scm_symbol_to_keyword (scm_read_expression (port));
1454 /* Fall through. */
1455
1456 default:
1457 {
1458 if (((chr >= '0') && (chr <= '9'))
1459 || (strchr ("+-.", chr)))
1460 return (scm_read_number (chr, port));
1461 else
1462 return (scm_read_mixed_case_symbol (chr, port));
1463 }
1464 }
1465 }
1466}
1467#undef FUNC_NAME
1468
1469\f
1470/* Actual reader. */
1471
1472SCM_DEFINE (scm_read, "read", 0, 1, 0,
1473 (SCM port),
1474 "Read an s-expression from the input port @var{port}, or from\n"
1475 "the current input port if @var{port} is not specified.\n"
1476 "Any whitespace before the next token is discarded.")
1477#define FUNC_NAME s_scm_read
1478{
1479 int c;
1480
1481 if (SCM_UNBNDP (port))
1482 port = scm_current_input_port ();
1483 SCM_VALIDATE_OPINPORT (1, port);
1484
1485 c = flush_ws (port, (char *) NULL);
1486 if (EOF == c)
1487 return SCM_EOF_VAL;
1488 scm_ungetc (c, port);
1489
1490 return (scm_read_expression (port));
09a4f039 1491}
db4b4ca6 1492#undef FUNC_NAME
09a4f039 1493
0f2d19dd
JB
1494
1495\f
1496
7337d56d
LC
1497/* Used when recording expressions constructed by `scm_read_sharp ()'. */
1498static SCM
1499recsexpr (SCM obj, long line, int column, SCM filename)
1500{
1501 if (!scm_is_pair(obj)) {
1502 return obj;
1503 } else {
1504 SCM tmp = obj, copy;
1505 /* If this sexpr is visible in the read:sharp source, we want to
1506 keep that information, so only record non-constant cons cells
1507 which haven't previously been read by the reader. */
1508 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1509 {
1510 if (SCM_COPY_SOURCE_P)
1511 {
1512 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1513 SCM_UNDEFINED);
1514 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1515 {
1516 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1517 line,
1518 column,
1519 filename),
1520 SCM_UNDEFINED));
1521 copy = SCM_CDR (copy);
1522 }
1523 SCM_SETCDR (copy, tmp);
1524 }
1525 else
1526 {
1527 recsexpr (SCM_CAR (obj), line, column, filename);
1528 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1529 recsexpr (SCM_CAR (tmp), line, column, filename);
1530 copy = SCM_UNDEFINED;
1531 }
1532 scm_whash_insert (scm_source_whash,
1533 obj,
1534 scm_make_srcprops (line,
1535 column,
1536 filename,
1537 copy,
1538 SCM_EOL));
1539 }
1540 return obj;
1541 }
1542}
1543
14de3b42
GH
1544/* Manipulate the read-hash-procedures alist. This could be written in
1545 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1546SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1547 (SCM chr, SCM proc),
dc7fa443
MG
1548 "Install the procedure @var{proc} for reading expressions\n"
1549 "starting with the character sequence @code{#} and @var{chr}.\n"
1550 "@var{proc} will be called with two arguments: the character\n"
1551 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1552 "returned will be the return value of @code{read}. \n"
1553 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1554 )
1bbd0b84 1555#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1556{
fed9c9a2
GH
1557 SCM this;
1558 SCM prev;
1559
36284627 1560 SCM_VALIDATE_CHAR (1, chr);
7888309b 1561 SCM_ASSERT (scm_is_false (proc)
bc36d050 1562 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1563 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1564
14de3b42 1565 /* Check if chr is already in the alist. */
d458073b 1566 this = scm_i_read_hash_procedures_ref ();
14de3b42 1567 prev = SCM_BOOL_F;
fed9c9a2
GH
1568 while (1)
1569 {
d2e53ed6 1570 if (scm_is_null (this))
fed9c9a2
GH
1571 {
1572 /* not found, so add it to the beginning. */
7888309b 1573 if (scm_is_true (proc))
fed9c9a2 1574 {
d458073b
AR
1575 SCM new = scm_cons (scm_cons (chr, proc),
1576 scm_i_read_hash_procedures_ref ());
1577 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1578 }
1579 break;
1580 }
bc36d050 1581 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1582 {
1583 /* already in the alist. */
7888309b 1584 if (scm_is_false (proc))
14de3b42
GH
1585 {
1586 /* remove it. */
7888309b 1587 if (scm_is_false (prev))
14de3b42 1588 {
d458073b
AR
1589 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1590 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1591 }
1592 else
1593 scm_set_cdr_x (prev, SCM_CDR (this));
1594 }
fed9c9a2 1595 else
14de3b42
GH
1596 {
1597 /* replace it. */
1598 scm_set_cdr_x (SCM_CAR (this), proc);
1599 }
fed9c9a2
GH
1600 break;
1601 }
1602 prev = this;
1603 this = SCM_CDR (this);
1604 }
deca31e1 1605
deca31e1
GH
1606 return SCM_UNSPECIFIED;
1607}
1bbd0b84 1608#undef FUNC_NAME
0f2d19dd 1609
deca31e1
GH
1610/* Recover the read-hash procedure corresponding to char c. */
1611static SCM
6e8d25a6 1612scm_get_hash_procedure (int c)
deca31e1 1613{
d458073b 1614 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1615
deca31e1
GH
1616 while (1)
1617 {
d2e53ed6 1618 if (scm_is_null (rest))
deca31e1
GH
1619 return SCM_BOOL_F;
1620
7866a09b 1621 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1622 return SCM_CDAR (rest);
1623
1624 rest = SCM_CDR (rest);
1625 }
1626}
1cc91f1b 1627
889975e5
MG
1628#define SCM_ENCODING_SEARCH_SIZE (500)
1629
f8a1c9a8
LC
1630/* Search the first few hundred characters of a file for an Emacs-like coding
1631 declaration. Returns either NULL or a string whose storage has been
1632 allocated with `scm_gc_malloc ()'. */
889975e5 1633char *
f8a1c9a8 1634scm_i_scan_for_encoding (SCM port)
889975e5
MG
1635{
1636 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1637 size_t bytes_read, encoding_length, i;
889975e5
MG
1638 char *encoding = NULL;
1639 int utf8_bom = 0;
daedbca7 1640 char *pos, *encoding_start;
889975e5
MG
1641 int in_comment;
1642
49bb5bd3
LC
1643 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1644 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1645 "guile <(echo '(display "hello")')") so bail out. */
1646 return NULL;
1647
1648 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
190fa72a 1649 header[bytes_read] = '\0';
49bb5bd3 1650
889975e5
MG
1651 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1652
1653 if (bytes_read > 3
1654 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1655 utf8_bom = 1;
1656
1657 /* search past "coding[:=]" */
1658 pos = header;
1659 while (1)
1660 {
1661 if ((pos = strstr(pos, "coding")) == NULL)
1662 return NULL;
1663
1664 pos += strlen("coding");
1665 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1666 (*pos == ':' || *pos == '='))
1667 {
1668 pos ++;
1669 break;
1670 }
1671 }
1672
1673 /* skip spaces */
1674 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1675 (*pos == ' ' || *pos == '\t'))
1676 pos ++;
1677
1678 /* grab the next token */
daedbca7 1679 encoding_start = pos;
889975e5 1680 i = 0;
daedbca7
MG
1681 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1682 && encoding_start + i - header < bytes_read
1683 && (isalnum ((int) encoding_start[i])
1684 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1685 i++;
1686
daedbca7
MG
1687 encoding_length = i;
1688 if (encoding_length == 0)
889975e5
MG
1689 return NULL;
1690
daedbca7
MG
1691 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1692 for (i = 0; i < encoding_length; i++)
026ed239 1693 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1694
1695 /* push backwards to make sure we were in a comment */
1696 in_comment = 0;
daedbca7
MG
1697 pos = encoding_start;
1698 while (pos >= header)
889975e5 1699 {
daedbca7 1700 if (*pos == '\n')
889975e5
MG
1701 {
1702 /* This wasn't in a semicolon comment. Check for a
1703 hash-bang comment. */
1704 char *beg = strstr (header, "#!");
1705 char *end = strstr (header, "!#");
daedbca7 1706 if (beg < encoding_start && encoding_start + encoding_length < end)
889975e5
MG
1707 in_comment = 1;
1708 break;
1709 }
daedbca7 1710 if (*pos == ';')
889975e5
MG
1711 {
1712 in_comment = 1;
1713 break;
1714 }
daedbca7 1715 pos --;
889975e5
MG
1716 }
1717 if (!in_comment)
f8a1c9a8
LC
1718 /* This wasn't in a comment */
1719 return NULL;
1720
889975e5 1721 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1722 scm_misc_error (NULL,
889975e5
MG
1723 "the port input declares the encoding ~s but is encoded as UTF-8",
1724 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1725
889975e5
MG
1726 return encoding;
1727}
1728
1729SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1730 (SCM port),
a270e133 1731 "Scans the port for an Emacs-like character coding declaration\n"
889975e5
MG
1732 "near the top of the contents of a port with random-acessible contents.\n"
1733 "The coding declaration is of the form\n"
1734 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1735 "\n"
1736 "Returns a string containing the character encoding of the file\n"
1737 "if a declaration was found, or @code{#f} otherwise.\n")
1738#define FUNC_NAME s_scm_file_encoding
1739{
1740 char *enc;
1741 SCM s_enc;
f8a1c9a8
LC
1742
1743 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1744 if (enc == NULL)
1745 return SCM_BOOL_F;
1746 else
1747 {
1748 s_enc = scm_from_locale_string (enc);
889975e5
MG
1749 return s_enc;
1750 }
f8a1c9a8 1751
889975e5
MG
1752 return SCM_BOOL_F;
1753}
1754#undef FUNC_NAME
1755
0f2d19dd
JB
1756void
1757scm_init_read ()
0f2d19dd 1758{
d458073b
AR
1759 SCM read_hash_procs;
1760
1761 read_hash_procs = scm_make_fluid ();
1762 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1763
1764 scm_i_read_hash_procedures =
1765 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 1766
62560650 1767 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1768#include "libguile/read.x"
0f2d19dd 1769}
89e00824
ML
1770
1771/*
1772 Local Variables:
1773 c-file-style: "gnu"
1774 End:
1775*/