Fix typo in scheme-using.texi
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37
MW
1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
49bb5bd3 46#include "libguile/fports.h"
a0599745
MD
47#include "libguile/root.h"
48#include "libguile/strings.h"
ba1b2226 49#include "libguile/strports.h"
a0599745 50#include "libguile/vectors.h"
a0599745 51#include "libguile/validate.h"
a4022e69 52#include "libguile/srfi-4.h"
7337d56d 53#include "libguile/srfi-13.h"
ba1b2226 54
a0599745 55#include "libguile/read.h"
22fc179a
HWN
56#include "libguile/private-options.h"
57
0f2d19dd
JB
58
59\f
60
5bf6a6f0 61SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 62SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 63SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 64SCM_SYMBOL (sym_nil, "nil");
c7733771 65
92c2555f 66scm_t_option scm_read_opts[] = {
b7ff98dd
MD
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
6c51a40a 69 { SCM_OPTION_BOOLEAN, "positions", 1,
deca31e1
GH
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 72 "Convert symbols to lower case."},
210c0325 73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
904fabb6 74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
684d664e
AW
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
62560650 81 { 0, },
a16f6fe7
MD
82};
83
39e8f371
HWN
84/*
85 Give meaningful error messages for errors
86
87 We use the format
88
ba1b2226 89 FILE:LINE:COL: MESSAGE
39e8f371
HWN
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
39e8f371
HWN
95 */
96
97
a4022e69
MV
98void
99scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
ba1b2226 101{
29a837fd
MV
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 105
29a837fd 106 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
272632a6 109 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 110 scm_list_4 (fn,
b3aa4626 111 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 112 scm_from_int (SCM_COL (port) + 1),
272632a6 113 scm_from_locale_string (message)));
ba1b2226
HWN
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
4a655e50 117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 118 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 119 string,
dd72382c 120 arg,
ba1b2226
HWN
121 SCM_BOOL_F);
122}
39e8f371
HWN
123
124
a1ec6916 125SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 126 (SCM setting),
dc7fa443
MG
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 130#define FUNC_NAME s_scm_read_options
a16f6fe7 131{
b7ff98dd
MD
132 SCM ans = scm_options (setting,
133 scm_read_opts,
1bbd0b84 134 FUNC_NAME);
b7ff98dd
MD
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
137 return ans;
138}
1bbd0b84 139#undef FUNC_NAME
a16f6fe7 140
d458073b
AR
141/* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143static SCM *scm_i_read_hash_procedures;
deca31e1 144
cfd15439 145static SCM
d458073b
AR
146scm_i_read_hash_procedures_ref (void)
147{
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149}
150
cfd15439 151static void
d458073b
AR
152scm_i_read_hash_procedures_set_x (SCM value)
153{
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155}
0f2d19dd 156
7337d56d
LC
157\f
158/* Token readers. */
0f2d19dd 159
0f2d19dd 160
7337d56d
LC
161/* Size of the C buffer used to read symbols and numbers. */
162#define READER_BUFFER_SIZE 128
0f2d19dd 163
ff4d3672
LC
164/* Number of 32-bit codepoints in the buffer used to read strings. */
165#define READER_STRING_BUFFER_SIZE 128
0f2d19dd 166
7337d56d
LC
167/* The maximum size of Scheme character names. */
168#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 169
94115ae3 170
7337d56d
LC
171/* `isblank' is only in C99. */
172#define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 174 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
175
176#ifdef MSDOS
177# define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179#else
180# define CHAR_IS_BLANK CHAR_IS_BLANK_
181#endif
182
183
184/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186#define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
5afa815c
AW
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
190
191#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195#define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
454866e0 199/* Read an SCSH block comment. */
cfd15439 200static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965
LC
201static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 203static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 204static SCM scm_get_hash_procedure (int);
454866e0 205
69f90b0b
MG
206/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
cfd15439 210static int
b662b7e9
LC
211read_token (SCM port, char *buf, size_t buf_size, size_t *read)
212{
69f90b0b 213 *read = 0;
0520c320 214
69f90b0b
MG
215 while (*read < buf_size)
216 {
217 int chr;
7337d56d 218
69f90b0b 219 chr = scm_get_byte_or_eof (port);
889975e5 220
69f90b0b
MG
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
889975e5 234
69f90b0b
MG
235 return 1;
236 }
889975e5 237
b662b7e9
LC
238/* Like `read_token', but return either BUFFER, or a GC-allocated buffer
239 if the token doesn't fit in BUFFER_SIZE bytes. */
240static char *
241read_complete_token (SCM port, char *buffer, size_t buffer_size,
242 size_t *read)
69f90b0b
MG
243{
244 int overflow = 0;
b662b7e9
LC
245 size_t bytes_read, overflow_size = 0;
246 char *overflow_buffer = NULL;
1cc91f1b 247
69f90b0b
MG
248 do
249 {
250 overflow = read_token (port, buffer, buffer_size, &bytes_read);
251 if (bytes_read == 0)
252 break;
253 if (overflow || overflow_size != 0)
254 {
255 if (overflow_size == 0)
256 {
b662b7e9
LC
257 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
258 memcpy (overflow_buffer, buffer, bytes_read);
69f90b0b
MG
259 overflow_size = bytes_read;
260 }
261 else
262 {
7be3c2fc 263 char *new_buf =
b662b7e9
LC
264 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
265
266 memcpy (new_buf, overflow_buffer, overflow_size);
267 memcpy (new_buf + overflow_size, buffer, bytes_read);
268
269 overflow_buffer = new_buf;
69f90b0b
MG
270 overflow_size += bytes_read;
271 }
272 }
889975e5 273 }
69f90b0b 274 while (overflow);
889975e5 275
69f90b0b
MG
276 if (overflow_size)
277 *read = overflow_size;
b521d265 278 else
69f90b0b
MG
279 *read = bytes_read;
280
b662b7e9 281 return (overflow_size > 0 ? overflow_buffer : buffer);
889975e5 282}
7337d56d
LC
283
284/* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286static int
287flush_ws (SCM port, const char *eoferr)
0f2d19dd 288{
cfd15439 289 scm_t_wchar c;
0f2d19dd 290 while (1)
b7f3516f 291 switch (c = scm_getc (port))
0f2d19dd
JB
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
d156d3b7 296 {
a4022e69
MV
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
d156d3b7 301 }
0f2d19dd 302 return c;
7337d56d 303
0f2d19dd
JB
304 case ';':
305 lp:
b7f3516f 306 switch (c = scm_getc (port))
0f2d19dd
JB
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
7337d56d 316
454866e0
LC
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
911b03b2 324 scm_read_shebang (c, port);
454866e0 325 break;
34f3d47d
AW
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
620c8965
LC
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
454866e0
LC
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
0f2d19dd 342 case SCM_LINE_INCREMENTORS:
0f2d19dd 343 case SCM_SINGLE_SPACES:
0f2d19dd 344 case '\t':
0f2d19dd 345 break;
7337d56d 346
0f2d19dd
JB
347 default:
348 return c;
349 }
7337d56d
LC
350
351 return 0;
0f2d19dd
JB
352}
353
354
7337d56d
LC
355\f
356/* Token readers. */
1cc91f1b 357
7337d56d 358static SCM scm_read_expression (SCM port);
b131b233 359static SCM scm_read_sharp (int chr, SCM port, long line, int column);
0f2d19dd
JB
360
361
d6cb0203
MW
362static SCM
363maybe_annotate_source (SCM x, SCM port, long line, int column)
364{
365 if (SCM_RECORD_POSITIONS_P)
366 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
367 return x;
368}
369
09a4f039 370static SCM
889975e5 371scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 372#define FUNC_NAME "scm_i_lreadparen"
09a4f039 373{
26c8cc14
AW
374 int c;
375 SCM tmp, tl, ans = SCM_EOL;
5afa815c 376 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
377
378 /* Need to capture line and column numbers here. */
379 long line = SCM_LINUM (port);
380 int column = SCM_COL (port) - 1;
f9c68a47 381
7337d56d
LC
382 c = flush_ws (port, FUNC_NAME);
383 if (terminating_char == c)
384 return SCM_EOL;
f9c68a47 385
7337d56d 386 scm_ungetc (c, port);
1f7945a7
AW
387 tmp = scm_read_expression (port);
388
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d
LC
393 {
394 ans = scm_read_expression (port);
395 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
396 scm_i_input_error (FUNC_NAME, port, "missing close paren",
397 SCM_EOL);
398 return ans;
399 }
1cc91f1b 400
7337d56d
LC
401 /* Build the head of the list structure. */
402 ans = tl = scm_cons (tmp, SCM_EOL);
403
7337d56d 404 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 405 {
7337d56d 406 SCM new_tail;
0f2d19dd 407
5b69315e
AW
408 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
409 scm_i_input_error (FUNC_NAME, port,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c)));
412
7337d56d 413 scm_ungetc (c, port);
5b69315e
AW
414 tmp = scm_read_expression (port);
415
1f7945a7
AW
416 /* See above note about scm_sym_dot. */
417 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 418 {
89f88612 419 SCM_SETCDR (tl, scm_read_expression (port));
7337d56d 420
7337d56d
LC
421 c = flush_ws (port, FUNC_NAME);
422 if (terminating_char != c)
423 scm_i_input_error (FUNC_NAME, port,
424 "in pair: missing close paren", SCM_EOL);
425 goto exit;
0f2d19dd 426 }
b858464a 427
7337d56d
LC
428 new_tail = scm_cons (tmp, SCM_EOL);
429 SCM_SETCDR (tl, new_tail);
430 tl = new_tail;
7337d56d 431 }
0f2d19dd 432
7337d56d 433 exit:
d6cb0203 434 return maybe_annotate_source (ans, port, line, column);
7337d56d
LC
435}
436#undef FUNC_NAME
a4022e69 437
c5661d28
MG
438
439/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
441 found. */
442#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 do \
444 { \
445 scm_t_wchar a; \
446 size_t i = 0; \
447 c = 0; \
448 while (i < ndigits) \
449 { \
450 a = scm_getc (port); \
451 if (a == EOF) \
452 goto str_eof; \
453 if (terminator \
454 && (a == (scm_t_wchar) terminator) \
455 && (i > 0)) \
456 break; \
457 if ('0' <= a && a <= '9') \
458 a -= '0'; \
459 else if ('A' <= a && a <= 'F') \
460 a = a - 'A' + 10; \
461 else if ('a' <= a && a <= 'f') \
462 a = a - 'a' + 10; \
463 else \
464 { \
465 c = a; \
466 goto bad_escaped; \
467 } \
468 c = c * 16 + a; \
469 i ++; \
470 } \
c5661d28
MG
471 } while (0)
472
684d664e
AW
473static void
474skip_intraline_whitespace (SCM port)
475{
476 scm_t_wchar c;
477
478 do
479 {
480 c = scm_getc (port);
481 if (c == EOF)
482 return;
483 }
484 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
485
486 scm_ungetc (c, port);
487}
488
7337d56d
LC
489static SCM
490scm_read_string (int chr, SCM port)
491#define FUNC_NAME "scm_lreadr"
492{
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
0f2d19dd 495
ff4d3672
LC
496 SCM str = SCM_EOL;
497 size_t c_str_len = 0;
498 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
eb42ff25 499
b131b233
MW
500 /* Need to capture line and column numbers here. */
501 long line = SCM_LINUM (port);
502 int column = SCM_COL (port) - 1;
503
7337d56d
LC
504 while ('"' != (c = scm_getc (port)))
505 {
506 if (c == EOF)
9c44cd45
MG
507 {
508 str_eof:
509 scm_i_input_error (FUNC_NAME, port,
510 "end of file in string constant", SCM_EOL);
511 }
0f2d19dd 512
ff4d3672
LC
513 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
514 {
515 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
516 c_str_len = 0;
517 }
0f2d19dd 518
7337d56d 519 if (c == '\\')
9c44cd45
MG
520 {
521 switch (c = scm_getc (port))
522 {
523 case EOF:
524 goto str_eof;
525 case '"':
526 case '\\':
527 break;
9c44cd45 528 case '\n':
684d664e
AW
529 if (SCM_HUNGRY_EOL_ESCAPES_P)
530 skip_intraline_whitespace (port);
9c44cd45
MG
531 continue;
532 case '0':
533 c = '\0';
534 break;
535 case 'f':
536 c = '\f';
537 break;
538 case 'n':
539 c = '\n';
540 break;
541 case 'r':
542 c = '\r';
543 break;
544 case 't':
545 c = '\t';
546 break;
547 case 'a':
548 c = '\007';
549 break;
550 case 'v':
551 c = '\v';
552 break;
67a4a16d
MG
553 case 'b':
554 c = '\010';
555 break;
9c44cd45 556 case 'x':
dea901d6
MG
557 if (SCM_R6RS_ESCAPES_P)
558 SCM_READ_HEX_ESCAPE (10, ';');
559 else
560 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 561 break;
9c44cd45 562 case 'u':
898a0b5a
MG
563 if (!SCM_R6RS_ESCAPES_P)
564 {
565 SCM_READ_HEX_ESCAPE (4, '\0');
566 break;
567 }
9c44cd45 568 case 'U':
898a0b5a
MG
569 if (!SCM_R6RS_ESCAPES_P)
570 {
571 SCM_READ_HEX_ESCAPE (6, '\0');
572 break;
573 }
9c44cd45
MG
574 default:
575 bad_escaped:
576 scm_i_input_error (FUNC_NAME, port,
577 "illegal character in escape sequence: ~S",
578 scm_list_1 (SCM_MAKE_CHAR (c)));
579 }
580 }
ff4d3672
LC
581
582 c_str[c_str_len++] = c;
7337d56d 583 }
ff4d3672
LC
584
585 if (scm_is_null (str))
586 /* Fast path: we got a string that fits in C_STR. */
587 str = scm_from_utf32_stringn (c_str, c_str_len);
588 else
589 {
590 if (c_str_len > 0)
591 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
592
593 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
594 }
595
596 return maybe_annotate_source (str, port, line, column);
0f2d19dd 597}
db4b4ca6
DH
598#undef FUNC_NAME
599
0f2d19dd 600
7337d56d 601static SCM
889975e5 602scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 603{
69f90b0b 604 SCM result, str = SCM_EOL;
b662b7e9 605 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b 606 size_t bytes_read;
69f90b0b 607 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 608
38f19074
MW
609 /* Need to capture line and column numbers here. */
610 long line = SCM_LINUM (port);
611 int column = SCM_COL (port) - 1;
612
7337d56d 613 scm_ungetc (chr, port);
b662b7e9
LC
614 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
615 &bytes_read);
69f90b0b 616
b662b7e9 617 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
618
619 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 620 if (scm_is_false (result))
69f90b0b
MG
621 {
622 /* Return a symbol instead of a number */
623 if (SCM_CASE_INSENSITIVE_P)
624 str = scm_string_downcase_x (str);
625 result = scm_string_to_symbol (str);
626 }
38f19074
MW
627 else if (SCM_NIMP (result))
628 result = maybe_annotate_source (result, port, line, column);
0f2d19dd 629
69f90b0b 630 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
631 return result;
632}
0f2d19dd 633
7337d56d 634static SCM
889975e5 635scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 636{
889975e5
MG
637 SCM result;
638 int ends_with_colon = 0;
69f90b0b 639 size_t bytes_read;
ef4cbc08 640 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
b662b7e9 641 char local_buffer[READER_BUFFER_SIZE], *buffer;
69f90b0b
MG
642 scm_t_port *pt = SCM_PTAB_ENTRY (port);
643 SCM str;
7337d56d
LC
644
645 scm_ungetc (chr, port);
b662b7e9
LC
646 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
647 &bytes_read);
69f90b0b 648 if (bytes_read > 0)
b662b7e9 649 ends_with_colon = buffer[bytes_read - 1] == ':';
ef4cbc08 650
69f90b0b
MG
651 if (postfix && ends_with_colon && (bytes_read > 1))
652 {
b662b7e9
LC
653 str = scm_from_stringn (buffer, bytes_read - 1,
654 pt->encoding, pt->ilseq_handler);
69f90b0b
MG
655
656 if (SCM_CASE_INSENSITIVE_P)
657 str = scm_string_downcase_x (str);
658 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
659 }
7337d56d 660 else
69f90b0b 661 {
b662b7e9
LC
662 str = scm_from_stringn (buffer, bytes_read,
663 pt->encoding, pt->ilseq_handler);
69f90b0b
MG
664
665 if (SCM_CASE_INSENSITIVE_P)
666 str = scm_string_downcase_x (str);
667 result = scm_string_to_symbol (str);
668 }
7337d56d 669
69f90b0b 670 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
671 return result;
672}
673
674static SCM
889975e5 675scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
676#define FUNC_NAME "scm_lreadr"
677{
889975e5 678 SCM result;
7337d56d 679 size_t read;
b662b7e9 680 char local_buffer[READER_BUFFER_SIZE], *buffer;
7337d56d 681 unsigned int radix;
69f90b0b
MG
682 SCM str;
683 scm_t_port *pt;
7337d56d
LC
684
685 switch (chr)
686 {
687 case 'B':
688 case 'b':
689 radix = 2;
690 break;
691
692 case 'o':
693 case 'O':
694 radix = 8;
695 break;
696
697 case 'd':
698 case 'D':
699 radix = 10;
700 break;
701
702 case 'x':
703 case 'X':
704 radix = 16;
705 break;
706
707 default:
708 scm_ungetc (chr, port);
709 scm_ungetc ('#', port);
710 radix = 10;
711 }
712
b662b7e9
LC
713 buffer = read_complete_token (port, local_buffer, sizeof local_buffer,
714 &read);
69f90b0b
MG
715
716 pt = SCM_PTAB_ENTRY (port);
b662b7e9 717 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
69f90b0b
MG
718
719 result = scm_string_to_number (str, scm_from_uint (radix));
720
69f90b0b 721 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
722
723 if (scm_is_true (result))
724 return result;
725
726 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
727
728 return SCM_BOOL_F;
729}
730#undef FUNC_NAME
731
732static SCM
733scm_read_quote (int chr, SCM port)
734{
735 SCM p;
492faee1
LC
736 long line = SCM_LINUM (port);
737 int column = SCM_COL (port) - 1;
7337d56d
LC
738
739 switch (chr)
740 {
741 case '`':
742 p = scm_sym_quasiquote;
743 break;
744
745 case '\'':
746 p = scm_sym_quote;
747 break;
748
749 case ',':
750 {
889975e5 751 scm_t_wchar c;
7337d56d
LC
752
753 c = scm_getc (port);
754 if ('@' == c)
755 p = scm_sym_uq_splicing;
756 else
0f2d19dd 757 {
7337d56d
LC
758 scm_ungetc (c, port);
759 p = scm_sym_unquote;
0f2d19dd 760 }
7337d56d
LC
761 break;
762 }
0f2d19dd 763
7337d56d
LC
764 default:
765 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 766 "scm_read_quote", chr);
7337d56d 767 abort ();
0f2d19dd 768 }
1cc91f1b 769
7337d56d 770 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
d6cb0203 771 return maybe_annotate_source (p, port, line, column);
7337d56d
LC
772}
773
34f3d47d
AW
774SCM_SYMBOL (sym_syntax, "syntax");
775SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
776SCM_SYMBOL (sym_unsyntax, "unsyntax");
777SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
778
779static SCM
780scm_read_syntax (int chr, SCM port)
781{
782 SCM p;
783 long line = SCM_LINUM (port);
784 int column = SCM_COL (port) - 1;
785
786 switch (chr)
787 {
788 case '`':
789 p = sym_quasisyntax;
790 break;
791
792 case '\'':
793 p = sym_syntax;
794 break;
795
796 case ',':
797 {
798 int c;
799
800 c = scm_getc (port);
801 if ('@' == c)
802 p = sym_unsyntax_splicing;
803 else
804 {
805 scm_ungetc (c, port);
806 p = sym_unsyntax;
807 }
808 break;
809 }
810
811 default:
812 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
813 "scm_read_syntax", chr);
814 abort ();
815 }
816
817 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
d6cb0203 818 return maybe_annotate_source (p, port, line, column);
34f3d47d
AW
819}
820
cfd15439 821static SCM
7c4aad9c
AW
822scm_read_nil (int chr, SCM port)
823{
824 SCM id = scm_read_mixed_case_symbol (chr, port);
825
826 if (!scm_is_eq (id, sym_nil))
827 scm_i_input_error ("scm_read_nil", port,
828 "unexpected input while reading #nil: ~a",
829 scm_list_1 (id));
830
831 return SCM_ELISP_NIL;
832}
833
cfd15439 834static SCM
7337d56d 835scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 836{
0f2d19dd
JB
837 int c;
838
889975e5
MG
839 /* We use the get_byte here because there is no need to get the
840 locale correct with comment input. This presumes that newline
841 always represents itself no matter what the encoding is. */
842 for (c = scm_get_byte_or_eof (port);
7337d56d 843 (c != EOF) && (c != '\n');
8a8da78d 844 c = scm_get_byte_or_eof (port));
7337d56d
LC
845
846 return SCM_UNSPECIFIED;
847}
848
849\f
850/* Sharp readers, i.e. readers called after a `#' sign has been read. */
851
852static SCM
853scm_read_boolean (int chr, SCM port)
854{
855 switch (chr)
0f2d19dd 856 {
7337d56d
LC
857 case 't':
858 case 'T':
859 return SCM_BOOL_T;
860
861 case 'f':
862 case 'F':
863 return SCM_BOOL_F;
0f2d19dd 864 }
7337d56d
LC
865
866 return SCM_UNSPECIFIED;
867}
868
869static SCM
889975e5 870scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
871#define FUNC_NAME "scm_lreadr"
872{
69f90b0b
MG
873 char buffer[READER_CHAR_NAME_MAX_SIZE];
874 SCM charname;
875 size_t charname_len, bytes_read;
889975e5
MG
876 scm_t_wchar cp;
877 int overflow;
69f90b0b 878 scm_t_port *pt;
889975e5 879
69f90b0b 880 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 881 if (overflow)
c1a0ba1c 882 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 883
69f90b0b 884 if (bytes_read == 0)
0f2d19dd 885 {
7337d56d
LC
886 chr = scm_getc (port);
887 if (chr == EOF)
888 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
889 "while reading character", SCM_EOL);
890
891 /* CHR must be a token delimiter, like a whitespace. */
892 return (SCM_MAKE_CHAR (chr));
0f2d19dd 893 }
7337d56d 894
69f90b0b 895 pt = SCM_PTAB_ENTRY (port);
7337d56d 896
69f90b0b
MG
897 /* Simple ASCII characters can be processed immediately. Also, simple
898 ISO-8859-1 characters can be processed immediately if the encoding for this
899 port is ISO-8859-1. */
900 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
901 {
902 SCM_COL (port) += 1;
903 return SCM_MAKE_CHAR (buffer[0]);
904 }
905
906 /* Otherwise, convert the buffer into a proper scheme string for
907 processing. */
908 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
909 pt->ilseq_handler);
910 charname_len = scm_i_string_length (charname);
911 SCM_COL (port) += charname_len;
889975e5 912 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
913 if (charname_len == 1)
914 return SCM_MAKE_CHAR (cp);
915
916 /* Ignore dotted circles, which may be used to keep combining characters from
917 combining with the backslash in #\charname. */
0dcd7e61
MG
918 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
919 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
920
889975e5 921 if (cp >= '0' && cp < '8')
7337d56d
LC
922 {
923 /* Dirk:FIXME:: This type of character syntax is not R5RS
924 * compliant. Further, it should be verified that the constant
0ffc78e3 925 * does only consist of octal digits. */
889975e5 926 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 927 if (SCM_I_INUMP (p))
0ffc78e3 928 {
e25f3727 929 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
930 if (SCM_IS_UNICODE_CHAR (c))
931 return SCM_MAKE_CHAR (c);
932 else
0f3a70cf 933 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
934 "out-of-range octal character escape: ~a",
935 scm_list_1 (charname));
936 }
7337d56d
LC
937 }
938
0f3a70cf 939 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
940 {
941 SCM p;
0f3a70cf 942
dea901d6
MG
943 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
944 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
945 scm_from_uint (16));
946 if (SCM_I_INUMP (p))
947 {
e25f3727 948 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
949 if (SCM_IS_UNICODE_CHAR (c))
950 return SCM_MAKE_CHAR (c);
951 else
952 scm_i_input_error (FUNC_NAME, port,
953 "out-of-range hex character escape: ~a",
954 scm_list_1 (charname));
955 }
956 }
957
889975e5
MG
958 /* The names of characters should never have non-Latin1
959 characters. */
960 if (scm_i_is_narrow_string (charname)
961 || scm_i_try_narrow_string (charname))
4769c9db
AW
962 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
963 charname_len);
964 if (scm_is_true (ch))
965 return ch;
966 }
7337d56d 967
7337d56d 968 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 969 scm_list_1 (charname));
7337d56d
LC
970
971 return SCM_UNSPECIFIED;
0f2d19dd 972}
db4b4ca6 973#undef FUNC_NAME
0f2d19dd 974
cfd15439 975static SCM
7337d56d
LC
976scm_read_keyword (int chr, SCM port)
977{
978 SCM symbol;
979
980 /* Read the symbol that comprises the keyword. Doing this instead of
981 invoking a specific symbol reader function allows `scm_read_keyword ()'
982 to adapt to the delimiters currently valid of symbols.
1cc91f1b 983
7337d56d
LC
984 XXX: This implementation allows sloppy syntaxes like `#: key'. */
985 symbol = scm_read_expression (port);
986 if (!scm_is_symbol (symbol))
7f74cf9a 987 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
988 "keyword prefix `~a' not followed by a symbol: ~s",
989 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
990
991 return (scm_symbol_to_keyword (symbol));
992}
993
cfd15439 994static SCM
b131b233 995scm_read_vector (int chr, SCM port, long line, int column)
09a4f039 996{
7337d56d
LC
997 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
998 guarantee that it's going to do what we want. After all, this is an
999 implementation detail of `scm_read_vector ()', not a desirable
1000 property. */
b131b233
MW
1001 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
1002 port, line, column);
1003}
1004
1005static SCM
1006scm_read_array (int chr, SCM port, long line, int column)
1007{
1008 SCM result = scm_i_read_array (port, chr);
1009 if (scm_is_false (result))
1010 return result;
1011 else
1012 return maybe_annotate_source (result, port, line, column);
7337d56d 1013}
09a4f039 1014
cfd15439 1015static SCM
b131b233 1016scm_read_srfi4_vector (int chr, SCM port, long line, int column)
7337d56d 1017{
b131b233 1018 return scm_read_array (chr, port, line, column);
7337d56d
LC
1019}
1020
0ba0b384 1021static SCM
b131b233 1022scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
0ba0b384
LC
1023{
1024 chr = scm_getc (port);
1025 if (chr != 'u')
1026 goto syntax;
1027
1028 chr = scm_getc (port);
1029 if (chr != '8')
1030 goto syntax;
1031
1032 chr = scm_getc (port);
1033 if (chr != '(')
1034 goto syntax;
1035
b131b233
MW
1036 return maybe_annotate_source
1037 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1038 port, line, column);
0ba0b384
LC
1039
1040 syntax:
1041 scm_i_input_error ("read_bytevector", port,
1042 "invalid bytevector prefix",
1043 SCM_MAKE_CHAR (chr));
1044 return SCM_UNSPECIFIED;
1045}
1046
7337d56d 1047static SCM
b131b233 1048scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
7337d56d
LC
1049{
1050 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1051 terribly inefficient but who cares? */
1052 SCM s_bits = SCM_EOL;
1053
1054 for (chr = scm_getc (port);
1055 (chr != EOF) && ((chr == '0') || (chr == '1'));
1056 chr = scm_getc (port))
09a4f039 1057 {
7337d56d 1058 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1059 }
7337d56d
LC
1060
1061 if (chr != EOF)
1062 scm_ungetc (chr, port);
1063
b131b233
MW
1064 return maybe_annotate_source
1065 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1066 port, line, column);
7337d56d
LC
1067}
1068
cfd15439 1069static SCM
889975e5 1070scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1071{
1072 int bang_seen = 0;
1073
1074 for (;;)
09a4f039 1075 {
58b1db5f 1076 int c = scm_getc (port);
62850ef3 1077
7337d56d
LC
1078 if (c == EOF)
1079 scm_i_input_error ("skip_block_comment", port,
1080 "unterminated `#! ... !#' comment", SCM_EOL);
1081
1082 if (c == '!')
1083 bang_seen = 1;
1084 else if (c == '#' && bang_seen)
1085 break;
1086 else
1087 bang_seen = 0;
1088 }
1089
1090 return SCM_UNSPECIFIED;
1091}
1092
d7fcaec3 1093static SCM
911b03b2
JG
1094scm_read_shebang (scm_t_wchar chr, SCM port)
1095{
1096 int c = 0;
1097 if ((c = scm_get_byte_or_eof (port)) != 'r')
1098 {
1099 scm_ungetc (c, port);
1100 return scm_read_scsh_block_comment (chr, port);
1101 }
1102 if ((c = scm_get_byte_or_eof (port)) != '6')
1103 {
1104 scm_ungetc (c, port);
1105 scm_ungetc ('r', port);
1106 return scm_read_scsh_block_comment (chr, port);
1107 }
1108 if ((c = scm_get_byte_or_eof (port)) != 'r')
1109 {
1110 scm_ungetc (c, port);
1111 scm_ungetc ('6', port);
1112 scm_ungetc ('r', port);
1113 return scm_read_scsh_block_comment (chr, port);
1114 }
1115 if ((c = scm_get_byte_or_eof (port)) != 's')
1116 {
1117 scm_ungetc (c, port);
1118 scm_ungetc ('r', port);
1119 scm_ungetc ('6', port);
1120 scm_ungetc ('r', port);
1121 return scm_read_scsh_block_comment (chr, port);
1122 }
1123
1124 return SCM_UNSPECIFIED;
1125}
1126
620c8965
LC
1127static SCM
1128scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1129{
1130 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1131 nested. So care must be taken. */
1132 int nesting_level = 1;
6d5f8c32
AW
1133
1134 int a = scm_getc (port);
1135
1136 if (a == EOF)
1137 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1138 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1139
1140 while (nesting_level > 0)
1141 {
6d5f8c32 1142 int b = scm_getc (port);
620c8965 1143
6d5f8c32 1144 if (b == EOF)
cd169c5a 1145 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1146 "unterminated `#| ... |#' comment", SCM_EOL);
1147
6d5f8c32
AW
1148 if (a == '|' && b == '#')
1149 {
1150 nesting_level--;
1151 b = EOF;
1152 }
1153 else if (a == '#' && b == '|')
1154 {
1155 nesting_level++;
1156 b = EOF;
1157 }
1158
1159 a = b;
620c8965
LC
1160 }
1161
1162 return SCM_UNSPECIFIED;
1163}
1164
34f3d47d 1165static SCM
889975e5 1166scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1167{
889975e5 1168 scm_t_wchar c;
34f3d47d
AW
1169
1170 c = flush_ws (port, (char *) NULL);
1171 if (EOF == c)
1172 scm_i_input_error ("read_commented_expression", port,
1173 "no expression after #; comment", SCM_EOL);
1174 scm_ungetc (c, port);
1175 scm_read_expression (port);
1176 return SCM_UNSPECIFIED;
1177}
1178
7337d56d 1179static SCM
889975e5 1180scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1181{
1182 /* Guile's extended symbol read syntax looks like this:
1183
1184 #{This is all a symbol name}#
1185
1186 So here, CHR is expected to be `{'. */
d9527cfa 1187 int saw_brace = 0;
7337d56d 1188 size_t len = 0;
190d4b0d 1189 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1190
889975e5 1191 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1192
1193 while ((chr = scm_getc (port)) != EOF)
1194 {
1195 if (saw_brace)
09a4f039 1196 {
7337d56d
LC
1197 if (chr == '#')
1198 {
7337d56d
LC
1199 break;
1200 }
1201 else
1202 {
1203 saw_brace = 0;
889975e5 1204 scm_i_string_set_x (buf, len++, '}');
7337d56d 1205 }
09a4f039 1206 }
d9527cfa
AW
1207
1208 if (chr == '}')
7337d56d 1209 saw_brace = 1;
d9527cfa
AW
1210 else if (chr == '\\')
1211 {
1212 /* It used to be that print.c would print extended-read-syntax
1213 symbols with backslashes before "non-standard" chars, but
1214 this routine wouldn't do anything with those escapes.
1215 Bummer. What we've done is to change print.c to output
1216 R6RS hex escapes for those characters, relying on the fact
1217 that the extended read syntax would never put a `\' before
1218 an `x'. For now, we just ignore other instances of
1219 backslash in the string. */
1220 switch ((chr = scm_getc (port)))
1221 {
1222 case EOF:
1223 goto done;
1224 case 'x':
1225 {
1226 scm_t_wchar c;
1227
1228 SCM_READ_HEX_ESCAPE (10, ';');
1229 scm_i_string_set_x (buf, len++, c);
1230 break;
1231
1232 str_eof:
1233 chr = EOF;
1234 goto done;
1235
1236 bad_escaped:
1237 scm_i_string_stop_writing ();
1238 scm_i_input_error ("scm_read_extended_symbol", port,
1239 "illegal character in escape sequence: ~S",
1240 scm_list_1 (SCM_MAKE_CHAR (c)));
1241 break;
1242 }
1243 default:
1244 scm_i_string_set_x (buf, len++, chr);
1245 break;
1246 }
1247 }
7337d56d 1248 else
d9527cfa 1249 scm_i_string_set_x (buf, len++, chr);
62850ef3 1250
889975e5 1251 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1252 {
7f991c7d
LC
1253 SCM addy;
1254
889975e5 1255 scm_i_string_stop_writing ();
190d4b0d 1256 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1257 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1258 len = 0;
889975e5 1259 buf = scm_i_string_start_writing (buf);
7337d56d 1260 }
7337d56d 1261 }
d9527cfa
AW
1262
1263 done:
889975e5 1264 scm_i_string_stop_writing ();
d9527cfa
AW
1265 if (chr == EOF)
1266 scm_i_input_error ("scm_read_extended_symbol", port,
1267 "end of file while reading symbol", SCM_EOL);
7337d56d 1268
889975e5 1269 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1270}
1271
1272
1273\f
1274/* Top-level token readers, i.e., dispatchers. */
1275
1276static SCM
1277scm_read_sharp_extension (int chr, SCM port)
1278{
1279 SCM proc;
1280
1281 proc = scm_get_hash_procedure (chr);
1282 if (scm_is_true (scm_procedure_p (proc)))
1283 {
1284 long line = SCM_LINUM (port);
1285 int column = SCM_COL (port) - 2;
1286 SCM got;
1287
1288 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14
AW
1289
1290 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1291 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1292
1293 return got;
09a4f039 1294 }
7337d56d
LC
1295
1296 return SCM_UNSPECIFIED;
1297}
1298
1299/* The reader for the sharp `#' character. It basically dispatches reads
1300 among the above token readers. */
1301static SCM
b131b233 1302scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
7337d56d
LC
1303#define FUNC_NAME "scm_lreadr"
1304{
1305 SCM result;
1306
1307 chr = scm_getc (port);
1308
1309 result = scm_read_sharp_extension (chr, port);
1310 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1311 return result;
1312
1313 switch (chr)
1314 {
1315 case '\\':
1316 return (scm_read_character (chr, port));
1317 case '(':
b131b233 1318 return (scm_read_vector (chr, port, line, column));
7337d56d
LC
1319 case 's':
1320 case 'u':
1321 case 'f':
df941b5b 1322 case 'c':
7337d56d 1323 /* This one may return either a boolean or an SRFI-4 vector. */
b131b233 1324 return (scm_read_srfi4_vector (chr, port, line, column));
0ba0b384 1325 case 'v':
b131b233 1326 return (scm_read_bytevector (chr, port, line, column));
7337d56d 1327 case '*':
b131b233 1328 return (scm_read_guile_bit_vector (chr, port, line, column));
7337d56d
LC
1329 case 't':
1330 case 'T':
1331 case 'F':
7337d56d
LC
1332 return (scm_read_boolean (chr, port));
1333 case ':':
1334 return (scm_read_keyword (chr, port));
1335 case '0': case '1': case '2': case '3': case '4':
1336 case '5': case '6': case '7': case '8': case '9':
1337 case '@':
1338#if SCM_ENABLE_DEPRECATED
1339 /* See below for 'i' and 'e'. */
1340 case 'a':
7337d56d
LC
1341 case 'y':
1342 case 'h':
1343 case 'l':
1344#endif
b131b233 1345 return (scm_read_array (chr, port, line, column));
7337d56d
LC
1346
1347 case 'i':
1348 case 'e':
1349#if SCM_ENABLE_DEPRECATED
1350 {
1351 /* When next char is '(', it really is an old-style
1352 uniform array. */
889975e5 1353 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1354 if (next_c != EOF)
1355 scm_ungetc (next_c, port);
1356 if (next_c == '(')
b131b233 1357 return scm_read_array (chr, port, line, column);
7337d56d
LC
1358 /* Fall through. */
1359 }
1360#endif
1361 case 'b':
1362 case 'B':
1363 case 'o':
1364 case 'O':
1365 case 'd':
1366 case 'D':
1367 case 'x':
1368 case 'X':
1369 case 'I':
1370 case 'E':
1371 return (scm_read_number_and_radix (chr, port));
1372 case '{':
1373 return (scm_read_extended_symbol (chr, port));
1374 case '!':
911b03b2 1375 return (scm_read_shebang (chr, port));
34f3d47d
AW
1376 case ';':
1377 return (scm_read_commented_expression (chr, port));
1378 case '`':
1379 case '\'':
1380 case ',':
1381 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1382 case 'n':
1383 return (scm_read_nil (chr, port));
7337d56d
LC
1384 default:
1385 result = scm_read_sharp_extension (chr, port);
1386 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1387 {
1388 /* To remain compatible with 1.8 and earlier, the following
1389 characters have lower precedence than `read-hash-extend'
1390 characters. */
1391 switch (chr)
1392 {
1393 case '|':
1394 return scm_read_r6rs_block_comment (chr, port);
1395 default:
1396 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1397 scm_list_1 (SCM_MAKE_CHAR (chr)));
1398 }
1399 }
7337d56d
LC
1400 else
1401 return result;
1402 }
1403
1404 return SCM_UNSPECIFIED;
1405}
1406#undef FUNC_NAME
1407
1408static SCM
1409scm_read_expression (SCM port)
1410#define FUNC_NAME "scm_read_expression"
1411{
1412 while (1)
1413 {
cfd15439 1414 scm_t_wchar chr;
7337d56d
LC
1415
1416 chr = scm_getc (port);
1417
1418 switch (chr)
1419 {
1420 case SCM_WHITE_SPACES:
1421 case SCM_LINE_INCREMENTORS:
1422 break;
1423 case ';':
1424 (void) scm_read_semicolon_comment (chr, port);
1425 break;
5afa815c
AW
1426 case '[':
1427 if (!SCM_SQUARE_BRACKETS_P)
1428 return (scm_read_mixed_case_symbol (chr, port));
1429 /* otherwise fall through */
7337d56d
LC
1430 case '(':
1431 return (scm_read_sexp (chr, port));
1432 case '"':
1433 return (scm_read_string (chr, port));
1434 case '\'':
1435 case '`':
1436 case ',':
1437 return (scm_read_quote (chr, port));
1438 case '#':
1439 {
b131b233
MW
1440 long line = SCM_LINUM (port);
1441 int column = SCM_COL (port) - 1;
1442 SCM result = scm_read_sharp (chr, port, line, column);
7337d56d
LC
1443 if (scm_is_eq (result, SCM_UNSPECIFIED))
1444 /* We read a comment or some such. */
1445 break;
1446 else
1447 return result;
1448 }
1449 case ')':
1450 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1451 break;
a4e47229
MG
1452 case ']':
1453 if (SCM_SQUARE_BRACKETS_P)
1454 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1455 /* otherwise fall through */
7337d56d
LC
1456 case EOF:
1457 return SCM_EOF_VAL;
1458 case ':':
1459 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1460 return scm_symbol_to_keyword (scm_read_expression (port));
1461 /* Fall through. */
1462
1463 default:
1464 {
1465 if (((chr >= '0') && (chr <= '9'))
1466 || (strchr ("+-.", chr)))
1467 return (scm_read_number (chr, port));
1468 else
1469 return (scm_read_mixed_case_symbol (chr, port));
1470 }
1471 }
1472 }
1473}
1474#undef FUNC_NAME
1475
1476\f
1477/* Actual reader. */
1478
1479SCM_DEFINE (scm_read, "read", 0, 1, 0,
1480 (SCM port),
1481 "Read an s-expression from the input port @var{port}, or from\n"
1482 "the current input port if @var{port} is not specified.\n"
1483 "Any whitespace before the next token is discarded.")
1484#define FUNC_NAME s_scm_read
1485{
1486 int c;
1487
1488 if (SCM_UNBNDP (port))
1489 port = scm_current_input_port ();
1490 SCM_VALIDATE_OPINPORT (1, port);
1491
1492 c = flush_ws (port, (char *) NULL);
1493 if (EOF == c)
1494 return SCM_EOF_VAL;
1495 scm_ungetc (c, port);
1496
1497 return (scm_read_expression (port));
09a4f039 1498}
db4b4ca6 1499#undef FUNC_NAME
09a4f039 1500
0f2d19dd
JB
1501
1502\f
1503
14de3b42
GH
1504/* Manipulate the read-hash-procedures alist. This could be written in
1505 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1506SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1507 (SCM chr, SCM proc),
dc7fa443
MG
1508 "Install the procedure @var{proc} for reading expressions\n"
1509 "starting with the character sequence @code{#} and @var{chr}.\n"
1510 "@var{proc} will be called with two arguments: the character\n"
1511 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1512 "returned will be the return value of @code{read}. \n"
1513 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1514 )
1bbd0b84 1515#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1516{
fed9c9a2
GH
1517 SCM this;
1518 SCM prev;
1519
36284627 1520 SCM_VALIDATE_CHAR (1, chr);
7888309b 1521 SCM_ASSERT (scm_is_false (proc)
bc36d050 1522 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1523 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1524
14de3b42 1525 /* Check if chr is already in the alist. */
d458073b 1526 this = scm_i_read_hash_procedures_ref ();
14de3b42 1527 prev = SCM_BOOL_F;
fed9c9a2
GH
1528 while (1)
1529 {
d2e53ed6 1530 if (scm_is_null (this))
fed9c9a2
GH
1531 {
1532 /* not found, so add it to the beginning. */
7888309b 1533 if (scm_is_true (proc))
fed9c9a2 1534 {
d458073b
AR
1535 SCM new = scm_cons (scm_cons (chr, proc),
1536 scm_i_read_hash_procedures_ref ());
1537 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1538 }
1539 break;
1540 }
bc36d050 1541 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1542 {
1543 /* already in the alist. */
7888309b 1544 if (scm_is_false (proc))
14de3b42
GH
1545 {
1546 /* remove it. */
7888309b 1547 if (scm_is_false (prev))
14de3b42 1548 {
d458073b
AR
1549 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1550 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1551 }
1552 else
1553 scm_set_cdr_x (prev, SCM_CDR (this));
1554 }
fed9c9a2 1555 else
14de3b42
GH
1556 {
1557 /* replace it. */
1558 scm_set_cdr_x (SCM_CAR (this), proc);
1559 }
fed9c9a2
GH
1560 break;
1561 }
1562 prev = this;
1563 this = SCM_CDR (this);
1564 }
deca31e1 1565
deca31e1
GH
1566 return SCM_UNSPECIFIED;
1567}
1bbd0b84 1568#undef FUNC_NAME
0f2d19dd 1569
deca31e1
GH
1570/* Recover the read-hash procedure corresponding to char c. */
1571static SCM
6e8d25a6 1572scm_get_hash_procedure (int c)
deca31e1 1573{
d458073b 1574 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1575
deca31e1
GH
1576 while (1)
1577 {
d2e53ed6 1578 if (scm_is_null (rest))
deca31e1
GH
1579 return SCM_BOOL_F;
1580
7866a09b 1581 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1582 return SCM_CDAR (rest);
1583
1584 rest = SCM_CDR (rest);
1585 }
1586}
1cc91f1b 1587
889975e5
MG
1588#define SCM_ENCODING_SEARCH_SIZE (500)
1589
f8a1c9a8
LC
1590/* Search the first few hundred characters of a file for an Emacs-like coding
1591 declaration. Returns either NULL or a string whose storage has been
1592 allocated with `scm_gc_malloc ()'. */
889975e5 1593char *
f8a1c9a8 1594scm_i_scan_for_encoding (SCM port)
889975e5 1595{
d900843c 1596 scm_t_port *pt;
889975e5 1597 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1598 size_t bytes_read, encoding_length, i;
889975e5
MG
1599 char *encoding = NULL;
1600 int utf8_bom = 0;
daedbca7 1601 char *pos, *encoding_start;
889975e5
MG
1602 int in_comment;
1603
d900843c 1604 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1605
d900843c
AW
1606 if (pt->rw_active == SCM_PORT_WRITE)
1607 scm_flush (port);
49bb5bd3 1608
d900843c
AW
1609 if (pt->rw_random)
1610 pt->rw_active = SCM_PORT_READ;
1611
1612 if (pt->read_pos == pt->read_end)
1613 {
1614 /* We can use the read buffer, and thus avoid a seek. */
1615 if (scm_fill_input (port) == EOF)
1616 return NULL;
1617
1618 bytes_read = pt->read_end - pt->read_pos;
1619 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1620 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1621
1622 if (bytes_read <= 1)
1623 /* An unbuffered port -- don't scan. */
1624 return NULL;
1625
1626 memcpy (header, pt->read_pos, bytes_read);
1627 header[bytes_read] = '\0';
1628 }
1629 else
1630 {
1631 /* Try to read some bytes and then seek back. Not all ports
1632 support seeking back; and indeed some file ports (like
1633 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1634 check performed by SCM_FPORT_FDES---but fail to seek
1635 backwards. Hence this block comes second. We prefer to use
1636 the read buffer in-place. */
1637 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1638 return NULL;
1639
1640 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1641 header[bytes_read] = '\0';
1642 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1643 }
889975e5
MG
1644
1645 if (bytes_read > 3
1646 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1647 utf8_bom = 1;
1648
1649 /* search past "coding[:=]" */
1650 pos = header;
1651 while (1)
1652 {
1653 if ((pos = strstr(pos, "coding")) == NULL)
1654 return NULL;
1655
1656 pos += strlen("coding");
1657 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1658 (*pos == ':' || *pos == '='))
1659 {
1660 pos ++;
1661 break;
1662 }
1663 }
1664
1665 /* skip spaces */
1666 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1667 (*pos == ' ' || *pos == '\t'))
1668 pos ++;
1669
1670 /* grab the next token */
daedbca7 1671 encoding_start = pos;
889975e5 1672 i = 0;
daedbca7
MG
1673 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1674 && encoding_start + i - header < bytes_read
1675 && (isalnum ((int) encoding_start[i])
1676 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1677 i++;
1678
daedbca7
MG
1679 encoding_length = i;
1680 if (encoding_length == 0)
889975e5
MG
1681 return NULL;
1682
daedbca7
MG
1683 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1684 for (i = 0; i < encoding_length; i++)
026ed239 1685 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1686
1687 /* push backwards to make sure we were in a comment */
1688 in_comment = 0;
daedbca7
MG
1689 pos = encoding_start;
1690 while (pos >= header)
889975e5 1691 {
8a12aeb9
AW
1692 if (*pos == ';')
1693 {
1694 in_comment = 1;
1695 break;
1696 }
1697 else if (*pos == '\n' || pos == header)
889975e5
MG
1698 {
1699 /* This wasn't in a semicolon comment. Check for a
1700 hash-bang comment. */
1701 char *beg = strstr (header, "#!");
1702 char *end = strstr (header, "!#");
8a12aeb9 1703 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
1704 in_comment = 1;
1705 break;
1706 }
8a12aeb9
AW
1707 else
1708 {
1709 pos --;
1710 continue;
1711 }
889975e5
MG
1712 }
1713 if (!in_comment)
f8a1c9a8
LC
1714 /* This wasn't in a comment */
1715 return NULL;
1716
889975e5 1717 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1718 scm_misc_error (NULL,
889975e5
MG
1719 "the port input declares the encoding ~s but is encoded as UTF-8",
1720 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1721
889975e5
MG
1722 return encoding;
1723}
1724
1725SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1726 (SCM port),
a270e133 1727 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 1728 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
1729 "The coding declaration is of the form\n"
1730 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1731 "\n"
1732 "Returns a string containing the character encoding of the file\n"
1733 "if a declaration was found, or @code{#f} otherwise.\n")
1734#define FUNC_NAME s_scm_file_encoding
1735{
1736 char *enc;
1737 SCM s_enc;
f8a1c9a8 1738
d900843c
AW
1739 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1740
f8a1c9a8 1741 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1742 if (enc == NULL)
1743 return SCM_BOOL_F;
1744 else
1745 {
1746 s_enc = scm_from_locale_string (enc);
889975e5
MG
1747 return s_enc;
1748 }
f8a1c9a8 1749
889975e5
MG
1750 return SCM_BOOL_F;
1751}
1752#undef FUNC_NAME
1753
0f2d19dd
JB
1754void
1755scm_init_read ()
0f2d19dd 1756{
d458073b
AR
1757 SCM read_hash_procs;
1758
c81c2ad3 1759 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
1760
1761 scm_i_read_hash_procedures =
1762 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 1763
62560650 1764 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1765#include "libguile/read.x"
0f2d19dd 1766}
89e00824
ML
1767
1768/*
1769 Local Variables:
1770 c-file-style: "gnu"
1771 End:
1772*/