Bump version number for 1.9.9.
[bpt/guile.git] / libguile / read.c
CommitLineData
a8fc3852 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
7337d56d 32
a0599745 33#include "libguile/_scm.h"
0ba0b384 34#include "libguile/bytevectors.h"
a0599745
MD
35#include "libguile/chars.h"
36#include "libguile/eval.h"
2fa901a5 37#include "libguile/arrays.h"
cf396142 38#include "libguile/bitvectors.h"
a0599745
MD
39#include "libguile/keywords.h"
40#include "libguile/alist.h"
41#include "libguile/srcprop.h"
42#include "libguile/hashtab.h"
43#include "libguile/hash.h"
44#include "libguile/ports.h"
49bb5bd3 45#include "libguile/fports.h"
a0599745
MD
46#include "libguile/root.h"
47#include "libguile/strings.h"
ba1b2226 48#include "libguile/strports.h"
a0599745 49#include "libguile/vectors.h"
a0599745 50#include "libguile/validate.h"
a4022e69 51#include "libguile/srfi-4.h"
7337d56d 52#include "libguile/srfi-13.h"
ba1b2226 53
a0599745 54#include "libguile/read.h"
22fc179a
HWN
55#include "libguile/private-options.h"
56
0f2d19dd
JB
57
58\f
59
5bf6a6f0 60SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 61SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 62SCM_SYMBOL (scm_keyword_postfix, "postfix");
c7733771 63
92c2555f 64scm_t_option scm_read_opts[] = {
b7ff98dd
MD
65 { SCM_OPTION_BOOLEAN, "copy", 0,
66 "Copy source code expressions." },
ac74fc22 67 { SCM_OPTION_BOOLEAN, "positions", 0,
deca31e1
GH
68 "Record positions of source code expressions." },
69 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 70 "Convert symbols to lower case."},
62316c7f 71 { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
904fabb6 72 "Style of keyword recognition: #f, 'prefix or 'postfix."},
16353acc 73#if SCM_ENABLE_ELISP
16353acc
NJ
74 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
cd21f5eb 76 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
62560650 77 "Support `\\(' and `\\)' in strings."},
16353acc 78#endif
dea901d6
MG
79 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
80 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
81 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
82 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
62560650 83 { 0, },
a16f6fe7
MD
84};
85
39e8f371
HWN
86/*
87 Give meaningful error messages for errors
88
89 We use the format
90
ba1b2226 91 FILE:LINE:COL: MESSAGE
39e8f371
HWN
92 This happened in ....
93
94 This is not standard GNU format, but the test-suite likes the real
95 message to be in front.
96
39e8f371
HWN
97 */
98
99
a4022e69
MV
100void
101scm_i_input_error (char const *function,
102 SCM port, const char *message, SCM arg)
ba1b2226 103{
29a837fd
MV
104 SCM fn = (scm_is_string (SCM_FILENAME(port))
105 ? SCM_FILENAME(port)
106 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 107
29a837fd 108 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
109 SCM string = SCM_EOL;
110 scm_simple_format (string_port,
272632a6 111 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 112 scm_list_4 (fn,
b3aa4626 113 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 114 scm_from_int (SCM_COL (port) + 1),
272632a6 115 scm_from_locale_string (message)));
ba1b2226
HWN
116
117 string = scm_get_output_string (string_port);
118 scm_close_output_port (string_port);
272632a6 119 scm_error_scm (scm_from_locale_symbol ("read-error"),
a4022e69 120 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 121 string,
dd72382c 122 arg,
ba1b2226
HWN
123 SCM_BOOL_F);
124}
39e8f371
HWN
125
126
a1ec6916 127SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 128 (SCM setting),
dc7fa443
MG
129 "Option interface for the read options. Instead of using\n"
130 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 131 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 132#define FUNC_NAME s_scm_read_options
a16f6fe7 133{
b7ff98dd
MD
134 SCM ans = scm_options (setting,
135 scm_read_opts,
1bbd0b84 136 FUNC_NAME);
b7ff98dd
MD
137 if (SCM_COPY_SOURCE_P)
138 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
139 return ans;
140}
1bbd0b84 141#undef FUNC_NAME
a16f6fe7 142
14de3b42
GH
143/* An association list mapping extra hash characters to procedures. */
144static SCM *scm_read_hash_procedures;
deca31e1 145
0f2d19dd 146
7337d56d
LC
147\f
148/* Token readers. */
0f2d19dd 149
0f2d19dd 150
7337d56d
LC
151/* Size of the C buffer used to read symbols and numbers. */
152#define READER_BUFFER_SIZE 128
0f2d19dd 153
7337d56d
LC
154/* Size of the C buffer used to read strings. */
155#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 156
7337d56d
LC
157/* The maximum size of Scheme character names. */
158#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 159
94115ae3 160
7337d56d
LC
161/* `isblank' is only in C99. */
162#define CHAR_IS_BLANK_(_chr) \
163 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 164 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
165
166#ifdef MSDOS
167# define CHAR_IS_BLANK(_chr) \
168 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
169#else
170# define CHAR_IS_BLANK CHAR_IS_BLANK_
171#endif
172
173
174/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
175 structure''). */
176#define CHAR_IS_R5RS_DELIMITER(c) \
177 (CHAR_IS_BLANK (c) \
5afa815c
AW
178 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
179 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
180
181#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
182
183/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
184 Structure''. */
185#define CHAR_IS_EXPONENT_MARKER(_chr) \
186 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
187 || ((_chr) == 'd') || ((_chr) == 'l'))
188
454866e0 189/* Read an SCSH block comment. */
620c8965
LC
190static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
191static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
192static SCM scm_read_commented_expression (scm_t_wchar, SCM);
193static SCM scm_get_hash_procedure (int);
454866e0 194
69f90b0b
MG
195/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
196 result in the pre-allocated buffer BUF. Return zero if the whole token has
197 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
198 bytes actually read. */
7337d56d 199static inline int
69f90b0b
MG
200read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
201 {
202 *read = 0;
0520c320 203
69f90b0b
MG
204 while (*read < buf_size)
205 {
206 int chr;
7337d56d 207
69f90b0b 208 chr = scm_get_byte_or_eof (port);
889975e5 209
69f90b0b
MG
210 if (chr == EOF)
211 return 0;
212 else if (CHAR_IS_DELIMITER (chr))
213 {
214 scm_unget_byte (chr, port);
215 return 0;
216 }
217 else
218 {
219 *buf = (char) chr;
220 buf++, (*read)++;
221 }
222 }
889975e5 223
69f90b0b
MG
224 return 1;
225 }
889975e5 226
69f90b0b
MG
227/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
228 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
229 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
230 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
231 will be set the number of bytes actually read. */
232static int
233read_complete_token (SCM port, char *buffer, const size_t buffer_size,
234 char **overflow_buffer, size_t *read)
235{
236 int overflow = 0;
237 size_t bytes_read, overflow_size;
7337d56d 238
69f90b0b
MG
239 *overflow_buffer = NULL;
240 overflow_size = 0;
1cc91f1b 241
69f90b0b
MG
242 do
243 {
244 overflow = read_token (port, buffer, buffer_size, &bytes_read);
245 if (bytes_read == 0)
246 break;
247 if (overflow || overflow_size != 0)
248 {
249 if (overflow_size == 0)
250 {
251 *overflow_buffer = scm_malloc (bytes_read);
252 memcpy (*overflow_buffer, buffer, bytes_read);
253 overflow_size = bytes_read;
254 }
255 else
256 {
257 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
258 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
259 overflow_size += bytes_read;
260 }
261 }
889975e5 262 }
69f90b0b 263 while (overflow);
889975e5 264
69f90b0b
MG
265 if (overflow_size)
266 *read = overflow_size;
b521d265 267 else
69f90b0b
MG
268 *read = bytes_read;
269
270 return (overflow_size != 0);
889975e5 271}
7337d56d
LC
272
273/* Skip whitespace from PORT and return the first non-whitespace character
274 read. Raise an error on end-of-file. */
275static int
276flush_ws (SCM port, const char *eoferr)
0f2d19dd 277{
889975e5 278 register scm_t_wchar c;
0f2d19dd 279 while (1)
b7f3516f 280 switch (c = scm_getc (port))
0f2d19dd
JB
281 {
282 case EOF:
283 goteof:
284 if (eoferr)
d156d3b7 285 {
a4022e69
MV
286 scm_i_input_error (eoferr,
287 port,
288 "end of file",
289 SCM_EOL);
d156d3b7 290 }
0f2d19dd 291 return c;
7337d56d 292
0f2d19dd
JB
293 case ';':
294 lp:
b7f3516f 295 switch (c = scm_getc (port))
0f2d19dd
JB
296 {
297 case EOF:
298 goto goteof;
299 default:
300 goto lp;
301 case SCM_LINE_INCREMENTORS:
302 break;
303 }
304 break;
7337d56d 305
454866e0
LC
306 case '#':
307 switch (c = scm_getc (port))
308 {
309 case EOF:
310 eoferr = "read_sharp";
311 goto goteof;
312 case '!':
313 scm_read_scsh_block_comment (c, port);
314 break;
34f3d47d
AW
315 case ';':
316 scm_read_commented_expression (c, port);
317 break;
620c8965
LC
318 case '|':
319 if (scm_is_false (scm_get_hash_procedure (c)))
320 {
321 scm_read_r6rs_block_comment (c, port);
322 break;
323 }
324 /* fall through */
454866e0
LC
325 default:
326 scm_ungetc (c, port);
327 return '#';
328 }
329 break;
330
0f2d19dd 331 case SCM_LINE_INCREMENTORS:
0f2d19dd 332 case SCM_SINGLE_SPACES:
0f2d19dd 333 case '\t':
0f2d19dd 334 break;
7337d56d 335
0f2d19dd
JB
336 default:
337 return c;
338 }
7337d56d
LC
339
340 return 0;
0f2d19dd
JB
341}
342
343
7337d56d
LC
344\f
345/* Token readers. */
1cc91f1b 346
7337d56d
LC
347static SCM scm_read_expression (SCM port);
348static SCM scm_read_sharp (int chr, SCM port);
7337d56d 349static SCM recsexpr (SCM obj, long line, int column, SCM filename);
0f2d19dd
JB
350
351
09a4f039 352static SCM
889975e5 353scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 354#define FUNC_NAME "scm_i_lreadparen"
09a4f039 355{
7337d56d
LC
356 register int c;
357 register SCM tmp;
358 register SCM tl, ans = SCM_EOL;
bd22f1c7 359 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
5afa815c 360 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
361
362 /* Need to capture line and column numbers here. */
363 long line = SCM_LINUM (port);
364 int column = SCM_COL (port) - 1;
f9c68a47 365
f9c68a47 366
7337d56d
LC
367 c = flush_ws (port, FUNC_NAME);
368 if (terminating_char == c)
369 return SCM_EOL;
f9c68a47 370
7337d56d
LC
371 scm_ungetc (c, port);
372 if (scm_is_eq (scm_sym_dot,
373 (tmp = scm_read_expression (port))))
374 {
375 ans = scm_read_expression (port);
376 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
377 scm_i_input_error (FUNC_NAME, port, "missing close paren",
378 SCM_EOL);
379 return ans;
380 }
1cc91f1b 381
7337d56d
LC
382 /* Build the head of the list structure. */
383 ans = tl = scm_cons (tmp, SCM_EOL);
384
385 if (SCM_COPY_SOURCE_P)
386 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
387 ? copy
388 : tmp,
389 SCM_EOL);
390
391 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 392 {
7337d56d 393 SCM new_tail;
0f2d19dd 394
7337d56d
LC
395 scm_ungetc (c, port);
396 if (scm_is_eq (scm_sym_dot,
397 (tmp = scm_read_expression (port))))
0f2d19dd 398 {
7337d56d
LC
399 SCM_SETCDR (tl, tmp = scm_read_expression (port));
400
401 if (SCM_COPY_SOURCE_P)
402 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
403 SCM_EOL));
404
405 c = flush_ws (port, FUNC_NAME);
406 if (terminating_char != c)
407 scm_i_input_error (FUNC_NAME, port,
408 "in pair: missing close paren", SCM_EOL);
409 goto exit;
0f2d19dd 410 }
b858464a 411
7337d56d
LC
412 new_tail = scm_cons (tmp, SCM_EOL);
413 SCM_SETCDR (tl, new_tail);
414 tl = new_tail;
415
416 if (SCM_COPY_SOURCE_P)
0f2d19dd 417 {
7337d56d
LC
418 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
419 ? copy
420 : tmp, SCM_EOL);
421 SCM_SETCDR (tl2, new_tail2);
422 tl2 = new_tail2;
423 }
424 }
0f2d19dd 425
7337d56d
LC
426 exit:
427 if (SCM_RECORD_POSITIONS_P)
428 scm_whash_insert (scm_source_whash,
429 ans,
430 scm_make_srcprops (line, column,
431 SCM_FILENAME (port),
432 SCM_COPY_SOURCE_P
433 ? ans2
434 : SCM_UNDEFINED,
435 SCM_EOL));
436 return ans;
437}
438#undef FUNC_NAME
a4022e69 439
c5661d28
MG
440
441/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
442 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
443 found. */
444#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
445 do \
446 { \
447 scm_t_wchar a; \
448 size_t i = 0; \
449 c = 0; \
450 while (i < ndigits) \
451 { \
452 a = scm_getc (port); \
453 if (a == EOF) \
454 goto str_eof; \
455 if (terminator \
456 && (a == (scm_t_wchar) terminator) \
457 && (i > 0)) \
458 break; \
459 if ('0' <= a && a <= '9') \
460 a -= '0'; \
461 else if ('A' <= a && a <= 'F') \
462 a = a - 'A' + 10; \
463 else if ('a' <= a && a <= 'f') \
464 a = a - 'a' + 10; \
465 else \
466 { \
467 c = a; \
468 goto bad_escaped; \
469 } \
470 c = c * 16 + a; \
471 i ++; \
472 } \
c5661d28
MG
473 } while (0)
474
7337d56d
LC
475static SCM
476scm_read_string (int chr, SCM port)
477#define FUNC_NAME "scm_lreadr"
478{
479 /* For strings smaller than C_STR, this function creates only one Scheme
480 object (the string returned). */
0f2d19dd 481
7337d56d 482 SCM str = SCM_BOOL_F;
7337d56d 483 unsigned c_str_len = 0;
9c44cd45 484 scm_t_wchar c;
eb42ff25 485
9c44cd45 486 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
7337d56d
LC
487 while ('"' != (c = scm_getc (port)))
488 {
489 if (c == EOF)
9c44cd45
MG
490 {
491 str_eof:
492 scm_i_input_error (FUNC_NAME, port,
493 "end of file in string constant", SCM_EOL);
494 }
0f2d19dd 495
9c44cd45
MG
496 if (c_str_len + 1 >= scm_i_string_length (str))
497 {
498 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
0f2d19dd 499
9c44cd45
MG
500 str = scm_string_append (scm_list_2 (str, addy));
501 }
0f2d19dd 502
7337d56d 503 if (c == '\\')
9c44cd45
MG
504 {
505 switch (c = scm_getc (port))
506 {
507 case EOF:
508 goto str_eof;
509 case '"':
510 case '\\':
511 break;
16353acc 512#if SCM_ENABLE_ELISP
9c44cd45
MG
513 case '(':
514 case ')':
515 if (SCM_ESCAPED_PARENS_P)
516 break;
517 goto bad_escaped;
16353acc 518#endif
9c44cd45
MG
519 case '\n':
520 continue;
521 case '0':
522 c = '\0';
523 break;
524 case 'f':
525 c = '\f';
526 break;
527 case 'n':
528 c = '\n';
529 break;
530 case 'r':
531 c = '\r';
532 break;
533 case 't':
534 c = '\t';
535 break;
536 case 'a':
537 c = '\007';
538 break;
539 case 'v':
540 c = '\v';
541 break;
67a4a16d
MG
542 case 'b':
543 c = '\010';
544 break;
9c44cd45 545 case 'x':
dea901d6
MG
546 if (SCM_R6RS_ESCAPES_P)
547 SCM_READ_HEX_ESCAPE (10, ';');
548 else
549 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 550 break;
9c44cd45 551 case 'u':
898a0b5a
MG
552 if (!SCM_R6RS_ESCAPES_P)
553 {
554 SCM_READ_HEX_ESCAPE (4, '\0');
555 break;
556 }
9c44cd45 557 case 'U':
898a0b5a
MG
558 if (!SCM_R6RS_ESCAPES_P)
559 {
560 SCM_READ_HEX_ESCAPE (6, '\0');
561 break;
562 }
9c44cd45
MG
563 default:
564 bad_escaped:
565 scm_i_input_error (FUNC_NAME, port,
566 "illegal character in escape sequence: ~S",
567 scm_list_1 (SCM_MAKE_CHAR (c)));
568 }
569 }
570 str = scm_i_string_start_writing (str);
571 scm_i_string_set_x (str, c_str_len++, c);
572 scm_i_string_stop_writing ();
7337d56d 573 }
f13b4400 574
7337d56d
LC
575 if (c_str_len > 0)
576 {
9c44cd45 577 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 578 }
69f90b0b 579
9c44cd45 580 return scm_nullstr;
0f2d19dd 581}
db4b4ca6
DH
582#undef FUNC_NAME
583
0f2d19dd 584
7337d56d 585static SCM
889975e5 586scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 587{
69f90b0b
MG
588 SCM result, str = SCM_EOL;
589 char buffer[READER_BUFFER_SIZE];
590 char *overflow_buffer = NULL;
591 size_t bytes_read;
592 int overflow;
593 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 594
7337d56d 595 scm_ungetc (chr, port);
69f90b0b
MG
596 overflow = read_complete_token (port, buffer, sizeof (buffer),
597 &overflow_buffer, &bytes_read);
598
599 if (!overflow)
600 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
601 else
602 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
603 pt->ilseq_handler);
604
605 result = scm_string_to_number (str, SCM_UNDEFINED);
889975e5 606 if (!scm_is_true (result))
69f90b0b
MG
607 {
608 /* Return a symbol instead of a number */
609 if (SCM_CASE_INSENSITIVE_P)
610 str = scm_string_downcase_x (str);
611 result = scm_string_to_symbol (str);
612 }
0f2d19dd 613
69f90b0b
MG
614 if (overflow)
615 free (overflow_buffer);
616 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
617 return result;
618}
0f2d19dd 619
7337d56d 620static SCM
889975e5 621scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 622{
889975e5
MG
623 SCM result;
624 int ends_with_colon = 0;
69f90b0b 625 size_t bytes_read;
ef4cbc08 626 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
69f90b0b
MG
627 int overflow;
628 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
629 scm_t_port *pt = SCM_PTAB_ENTRY (port);
630 SCM str;
7337d56d
LC
631
632 scm_ungetc (chr, port);
69f90b0b
MG
633 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
634 &overflow_buffer, &bytes_read);
635 if (bytes_read > 0)
636 {
637 if (!overflow)
638 ends_with_colon = buffer[bytes_read - 1] == ':';
639 else
640 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
641 }
ef4cbc08 642
69f90b0b
MG
643 if (postfix && ends_with_colon && (bytes_read > 1))
644 {
645 if (!overflow)
646 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
647 else
648 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
649 pt->ilseq_handler);
650
651 if (SCM_CASE_INSENSITIVE_P)
652 str = scm_string_downcase_x (str);
653 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
654 }
7337d56d 655 else
69f90b0b
MG
656 {
657 if (!overflow)
658 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
659 else
660 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
661 pt->ilseq_handler);
662
663 if (SCM_CASE_INSENSITIVE_P)
664 str = scm_string_downcase_x (str);
665 result = scm_string_to_symbol (str);
666 }
7337d56d 667
69f90b0b
MG
668 if (overflow)
669 free (overflow_buffer);
670 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
671 return result;
672}
673
674static SCM
889975e5 675scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
676#define FUNC_NAME "scm_lreadr"
677{
889975e5 678 SCM result;
7337d56d 679 size_t read;
69f90b0b
MG
680 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
681 int overflow;
7337d56d 682 unsigned int radix;
69f90b0b
MG
683 SCM str;
684 scm_t_port *pt;
7337d56d
LC
685
686 switch (chr)
687 {
688 case 'B':
689 case 'b':
690 radix = 2;
691 break;
692
693 case 'o':
694 case 'O':
695 radix = 8;
696 break;
697
698 case 'd':
699 case 'D':
700 radix = 10;
701 break;
702
703 case 'x':
704 case 'X':
705 radix = 16;
706 break;
707
708 default:
709 scm_ungetc (chr, port);
710 scm_ungetc ('#', port);
711 radix = 10;
712 }
713
69f90b0b
MG
714 overflow = read_complete_token (port, buffer, sizeof (buffer),
715 &overflow_buffer, &read);
716
717 pt = SCM_PTAB_ENTRY (port);
718 if (!overflow)
719 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
720 else
721 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
722 pt->ilseq_handler);
723
724 result = scm_string_to_number (str, scm_from_uint (radix));
725
726 if (overflow)
727 free (overflow_buffer);
728
729 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
730
731 if (scm_is_true (result))
732 return result;
733
734 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
735
736 return SCM_BOOL_F;
737}
738#undef FUNC_NAME
739
740static SCM
741scm_read_quote (int chr, SCM port)
742{
743 SCM p;
492faee1
LC
744 long line = SCM_LINUM (port);
745 int column = SCM_COL (port) - 1;
7337d56d
LC
746
747 switch (chr)
748 {
749 case '`':
750 p = scm_sym_quasiquote;
751 break;
752
753 case '\'':
754 p = scm_sym_quote;
755 break;
756
757 case ',':
758 {
889975e5 759 scm_t_wchar c;
7337d56d
LC
760
761 c = scm_getc (port);
762 if ('@' == c)
763 p = scm_sym_uq_splicing;
764 else
0f2d19dd 765 {
7337d56d
LC
766 scm_ungetc (c, port);
767 p = scm_sym_unquote;
0f2d19dd 768 }
7337d56d
LC
769 break;
770 }
0f2d19dd 771
7337d56d
LC
772 default:
773 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 774 "scm_read_quote", chr);
7337d56d 775 abort ();
0f2d19dd 776 }
1cc91f1b 777
7337d56d 778 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1
LC
779 if (SCM_RECORD_POSITIONS_P)
780 scm_whash_insert (scm_source_whash, p,
781 scm_make_srcprops (line, column,
782 SCM_FILENAME (port),
783 SCM_COPY_SOURCE_P
784 ? (scm_cons2 (SCM_CAR (p),
785 SCM_CAR (SCM_CDR (p)),
786 SCM_EOL))
787 : SCM_UNDEFINED,
788 SCM_EOL));
789
0f2d19dd 790
7337d56d
LC
791 return p;
792}
793
34f3d47d
AW
794SCM_SYMBOL (sym_syntax, "syntax");
795SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
796SCM_SYMBOL (sym_unsyntax, "unsyntax");
797SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
798
799static SCM
800scm_read_syntax (int chr, SCM port)
801{
802 SCM p;
803 long line = SCM_LINUM (port);
804 int column = SCM_COL (port) - 1;
805
806 switch (chr)
807 {
808 case '`':
809 p = sym_quasisyntax;
810 break;
811
812 case '\'':
813 p = sym_syntax;
814 break;
815
816 case ',':
817 {
818 int c;
819
820 c = scm_getc (port);
821 if ('@' == c)
822 p = sym_unsyntax_splicing;
823 else
824 {
825 scm_ungetc (c, port);
826 p = sym_unsyntax;
827 }
828 break;
829 }
830
831 default:
832 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
833 "scm_read_syntax", chr);
834 abort ();
835 }
836
837 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
838 if (SCM_RECORD_POSITIONS_P)
839 scm_whash_insert (scm_source_whash, p,
840 scm_make_srcprops (line, column,
841 SCM_FILENAME (port),
842 SCM_COPY_SOURCE_P
843 ? (scm_cons2 (SCM_CAR (p),
844 SCM_CAR (SCM_CDR (p)),
845 SCM_EOL))
846 : SCM_UNDEFINED,
847 SCM_EOL));
848
849
850 return p;
851}
852
7337d56d
LC
853static inline SCM
854scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 855{
0f2d19dd
JB
856 int c;
857
889975e5
MG
858 /* We use the get_byte here because there is no need to get the
859 locale correct with comment input. This presumes that newline
860 always represents itself no matter what the encoding is. */
861 for (c = scm_get_byte_or_eof (port);
7337d56d 862 (c != EOF) && (c != '\n');
8a8da78d 863 c = scm_get_byte_or_eof (port));
7337d56d
LC
864
865 return SCM_UNSPECIFIED;
866}
867
868\f
869/* Sharp readers, i.e. readers called after a `#' sign has been read. */
870
871static SCM
872scm_read_boolean (int chr, SCM port)
873{
874 switch (chr)
0f2d19dd 875 {
7337d56d
LC
876 case 't':
877 case 'T':
878 return SCM_BOOL_T;
879
880 case 'f':
881 case 'F':
882 return SCM_BOOL_F;
0f2d19dd 883 }
7337d56d
LC
884
885 return SCM_UNSPECIFIED;
886}
887
888static SCM
889975e5 889scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
890#define FUNC_NAME "scm_lreadr"
891{
69f90b0b
MG
892 char buffer[READER_CHAR_NAME_MAX_SIZE];
893 SCM charname;
894 size_t charname_len, bytes_read;
889975e5
MG
895 scm_t_wchar cp;
896 int overflow;
69f90b0b 897 scm_t_port *pt;
889975e5 898
69f90b0b 899 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 900 if (overflow)
7337d56d
LC
901 goto char_error;
902
69f90b0b 903 if (bytes_read == 0)
0f2d19dd 904 {
7337d56d
LC
905 chr = scm_getc (port);
906 if (chr == EOF)
907 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
908 "while reading character", SCM_EOL);
909
910 /* CHR must be a token delimiter, like a whitespace. */
911 return (SCM_MAKE_CHAR (chr));
0f2d19dd 912 }
7337d56d 913
69f90b0b 914 pt = SCM_PTAB_ENTRY (port);
7337d56d 915
69f90b0b
MG
916 /* Simple ASCII characters can be processed immediately. Also, simple
917 ISO-8859-1 characters can be processed immediately if the encoding for this
918 port is ISO-8859-1. */
919 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
920 {
921 SCM_COL (port) += 1;
922 return SCM_MAKE_CHAR (buffer[0]);
923 }
924
925 /* Otherwise, convert the buffer into a proper scheme string for
926 processing. */
927 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
928 pt->ilseq_handler);
929 charname_len = scm_i_string_length (charname);
930 SCM_COL (port) += charname_len;
889975e5 931 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
932 if (charname_len == 1)
933 return SCM_MAKE_CHAR (cp);
934
935 /* Ignore dotted circles, which may be used to keep combining characters from
936 combining with the backslash in #\charname. */
0dcd7e61
MG
937 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
938 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
939
889975e5 940 if (cp >= '0' && cp < '8')
7337d56d
LC
941 {
942 /* Dirk:FIXME:: This type of character syntax is not R5RS
943 * compliant. Further, it should be verified that the constant
0ffc78e3 944 * does only consist of octal digits. */
889975e5 945 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 946 if (SCM_I_INUMP (p))
0ffc78e3
MG
947 {
948 scm_t_wchar c = SCM_I_INUM (p);
949 if (SCM_IS_UNICODE_CHAR (c))
950 return SCM_MAKE_CHAR (c);
951 else
952 scm_i_input_error (FUNC_NAME, port,
953 "out-of-range octal character escape: ~a",
954 scm_list_1 (charname));
955 }
7337d56d
LC
956 }
957
dea901d6
MG
958 if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
959 {
960 SCM p;
dea901d6
MG
961
962 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
963 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
964 scm_from_uint (16));
965 if (SCM_I_INUMP (p))
966 {
967 scm_t_wchar c = SCM_I_INUM (p);
968 if (SCM_IS_UNICODE_CHAR (c))
969 return SCM_MAKE_CHAR (c);
970 else
971 scm_i_input_error (FUNC_NAME, port,
972 "out-of-range hex character escape: ~a",
973 scm_list_1 (charname));
974 }
975 }
976
889975e5
MG
977 /* The names of characters should never have non-Latin1
978 characters. */
979 if (scm_i_is_narrow_string (charname)
980 || scm_i_try_narrow_string (charname))
4769c9db
AW
981 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
982 charname_len);
983 if (scm_is_true (ch))
984 return ch;
985 }
7337d56d
LC
986
987 char_error:
988 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 989 scm_list_1 (charname));
7337d56d
LC
990
991 return SCM_UNSPECIFIED;
0f2d19dd 992}
db4b4ca6 993#undef FUNC_NAME
0f2d19dd 994
7337d56d
LC
995static inline SCM
996scm_read_keyword (int chr, SCM port)
997{
998 SCM symbol;
999
1000 /* Read the symbol that comprises the keyword. Doing this instead of
1001 invoking a specific symbol reader function allows `scm_read_keyword ()'
1002 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1003
7337d56d
LC
1004 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1005 symbol = scm_read_expression (port);
1006 if (!scm_is_symbol (symbol))
7f74cf9a 1007 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1008 "keyword prefix `~a' not followed by a symbol: ~s",
1009 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1010
1011 return (scm_symbol_to_keyword (symbol));
1012}
1013
1014static inline SCM
1015scm_read_vector (int chr, SCM port)
09a4f039 1016{
7337d56d
LC
1017 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1018 guarantee that it's going to do what we want. After all, this is an
1019 implementation detail of `scm_read_vector ()', not a desirable
1020 property. */
1021 return (scm_vector (scm_read_sexp (chr, port)));
1022}
09a4f039 1023
7337d56d
LC
1024static inline SCM
1025scm_read_srfi4_vector (int chr, SCM port)
1026{
1027 return scm_i_read_array (port, chr);
1028}
1029
0ba0b384 1030static SCM
889975e5 1031scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
1032{
1033 chr = scm_getc (port);
1034 if (chr != 'u')
1035 goto syntax;
1036
1037 chr = scm_getc (port);
1038 if (chr != '8')
1039 goto syntax;
1040
1041 chr = scm_getc (port);
1042 if (chr != '(')
1043 goto syntax;
1044
1045 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1046
1047 syntax:
1048 scm_i_input_error ("read_bytevector", port,
1049 "invalid bytevector prefix",
1050 SCM_MAKE_CHAR (chr));
1051 return SCM_UNSPECIFIED;
1052}
1053
7337d56d 1054static SCM
889975e5 1055scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
1056{
1057 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1058 terribly inefficient but who cares? */
1059 SCM s_bits = SCM_EOL;
1060
1061 for (chr = scm_getc (port);
1062 (chr != EOF) && ((chr == '0') || (chr == '1'));
1063 chr = scm_getc (port))
09a4f039 1064 {
7337d56d 1065 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1066 }
7337d56d
LC
1067
1068 if (chr != EOF)
1069 scm_ungetc (chr, port);
1070
1071 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1072}
1073
1074static inline SCM
889975e5 1075scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1076{
1077 int bang_seen = 0;
1078
889975e5
MG
1079 /* We can use the get_byte here because there is no need to get the
1080 locale correct when reading comments. This presumes that
1081 hash and exclamation points always represent themselves no
1082 matter what the source encoding is.*/
7337d56d 1083 for (;;)
09a4f039 1084 {
889975e5 1085 int c = scm_get_byte_or_eof (port);
62850ef3 1086
7337d56d
LC
1087 if (c == EOF)
1088 scm_i_input_error ("skip_block_comment", port,
1089 "unterminated `#! ... !#' comment", SCM_EOL);
1090
1091 if (c == '!')
1092 bang_seen = 1;
1093 else if (c == '#' && bang_seen)
1094 break;
1095 else
1096 bang_seen = 0;
1097 }
1098
1099 return SCM_UNSPECIFIED;
1100}
1101
620c8965
LC
1102static SCM
1103scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1104{
1105 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1106 nested. So care must be taken. */
1107 int nesting_level = 1;
1108 int opening_seen = 0, closing_seen = 0;
1109
1110 while (nesting_level > 0)
1111 {
1112 int c = scm_getc (port);
1113
1114 if (c == EOF)
cd169c5a 1115 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1116 "unterminated `#| ... |#' comment", SCM_EOL);
1117
1118 if (opening_seen)
1119 {
1120 if (c == '|')
1121 nesting_level++;
1122 opening_seen = 0;
1123 }
1124 else if (closing_seen)
1125 {
1126 if (c == '#')
1127 nesting_level--;
1128 closing_seen = 0;
1129 }
1130 else if (c == '|')
1131 closing_seen = 1;
1132 else if (c == '#')
1133 opening_seen = 1;
1134 else
1135 opening_seen = closing_seen = 0;
1136 }
1137
1138 return SCM_UNSPECIFIED;
1139}
1140
34f3d47d 1141static SCM
889975e5 1142scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1143{
889975e5 1144 scm_t_wchar c;
34f3d47d
AW
1145
1146 c = flush_ws (port, (char *) NULL);
1147 if (EOF == c)
1148 scm_i_input_error ("read_commented_expression", port,
1149 "no expression after #; comment", SCM_EOL);
1150 scm_ungetc (c, port);
1151 scm_read_expression (port);
1152 return SCM_UNSPECIFIED;
1153}
1154
7337d56d 1155static SCM
889975e5 1156scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1157{
1158 /* Guile's extended symbol read syntax looks like this:
1159
1160 #{This is all a symbol name}#
1161
1162 So here, CHR is expected to be `{'. */
7337d56d
LC
1163 int saw_brace = 0, finished = 0;
1164 size_t len = 0;
889975e5 1165 SCM buf = scm_i_make_string (1024, NULL);
7337d56d 1166
889975e5 1167 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1168
1169 while ((chr = scm_getc (port)) != EOF)
1170 {
1171 if (saw_brace)
09a4f039 1172 {
7337d56d
LC
1173 if (chr == '#')
1174 {
1175 finished = 1;
1176 break;
1177 }
1178 else
1179 {
1180 saw_brace = 0;
889975e5
MG
1181 scm_i_string_set_x (buf, len++, '}');
1182 scm_i_string_set_x (buf, len++, chr);
7337d56d 1183 }
09a4f039 1184 }
7337d56d
LC
1185 else if (chr == '}')
1186 saw_brace = 1;
1187 else
889975e5 1188 scm_i_string_set_x (buf, len++, chr);
62850ef3 1189
889975e5 1190 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1191 {
7f991c7d
LC
1192 SCM addy;
1193
889975e5 1194 scm_i_string_stop_writing ();
7f991c7d 1195 addy = scm_i_make_string (1024, NULL);
889975e5 1196 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1197 len = 0;
889975e5 1198 buf = scm_i_string_start_writing (buf);
7337d56d 1199 }
62850ef3 1200
7337d56d
LC
1201 if (finished)
1202 break;
1203 }
889975e5 1204 scm_i_string_stop_writing ();
7337d56d 1205
889975e5 1206 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1207}
1208
1209
1210\f
1211/* Top-level token readers, i.e., dispatchers. */
1212
1213static SCM
1214scm_read_sharp_extension (int chr, SCM port)
1215{
1216 SCM proc;
1217
1218 proc = scm_get_hash_procedure (chr);
1219 if (scm_is_true (scm_procedure_p (proc)))
1220 {
1221 long line = SCM_LINUM (port);
1222 int column = SCM_COL (port) - 2;
1223 SCM got;
1224
1225 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1226 if (!scm_is_eq (got, SCM_UNSPECIFIED))
62850ef3 1227 {
7337d56d
LC
1228 if (SCM_RECORD_POSITIONS_P)
1229 return (recsexpr (got, line, column,
1230 SCM_FILENAME (port)));
1231 else
1232 return got;
62850ef3 1233 }
09a4f039 1234 }
7337d56d
LC
1235
1236 return SCM_UNSPECIFIED;
1237}
1238
1239/* The reader for the sharp `#' character. It basically dispatches reads
1240 among the above token readers. */
1241static SCM
889975e5 1242scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1243#define FUNC_NAME "scm_lreadr"
1244{
1245 SCM result;
1246
1247 chr = scm_getc (port);
1248
1249 result = scm_read_sharp_extension (chr, port);
1250 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1251 return result;
1252
1253 switch (chr)
1254 {
1255 case '\\':
1256 return (scm_read_character (chr, port));
1257 case '(':
1258 return (scm_read_vector (chr, port));
1259 case 's':
1260 case 'u':
1261 case 'f':
1262 /* This one may return either a boolean or an SRFI-4 vector. */
1263 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1264 case 'v':
1265 return (scm_read_bytevector (chr, port));
7337d56d
LC
1266 case '*':
1267 return (scm_read_guile_bit_vector (chr, port));
1268 case 't':
1269 case 'T':
1270 case 'F':
1271 /* This one may return either a boolean or an SRFI-4 vector. */
1272 return (scm_read_boolean (chr, port));
1273 case ':':
1274 return (scm_read_keyword (chr, port));
1275 case '0': case '1': case '2': case '3': case '4':
1276 case '5': case '6': case '7': case '8': case '9':
1277 case '@':
1278#if SCM_ENABLE_DEPRECATED
1279 /* See below for 'i' and 'e'. */
1280 case 'a':
1281 case 'c':
1282 case 'y':
1283 case 'h':
1284 case 'l':
1285#endif
1286 return (scm_i_read_array (port, chr));
1287
1288 case 'i':
1289 case 'e':
1290#if SCM_ENABLE_DEPRECATED
1291 {
1292 /* When next char is '(', it really is an old-style
1293 uniform array. */
889975e5 1294 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1295 if (next_c != EOF)
1296 scm_ungetc (next_c, port);
1297 if (next_c == '(')
1298 return scm_i_read_array (port, chr);
1299 /* Fall through. */
1300 }
1301#endif
1302 case 'b':
1303 case 'B':
1304 case 'o':
1305 case 'O':
1306 case 'd':
1307 case 'D':
1308 case 'x':
1309 case 'X':
1310 case 'I':
1311 case 'E':
1312 return (scm_read_number_and_radix (chr, port));
1313 case '{':
1314 return (scm_read_extended_symbol (chr, port));
1315 case '!':
1316 return (scm_read_scsh_block_comment (chr, port));
34f3d47d
AW
1317 case ';':
1318 return (scm_read_commented_expression (chr, port));
1319 case '`':
1320 case '\'':
1321 case ',':
1322 return (scm_read_syntax (chr, port));
7337d56d
LC
1323 default:
1324 result = scm_read_sharp_extension (chr, port);
1325 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1326 {
1327 /* To remain compatible with 1.8 and earlier, the following
1328 characters have lower precedence than `read-hash-extend'
1329 characters. */
1330 switch (chr)
1331 {
1332 case '|':
1333 return scm_read_r6rs_block_comment (chr, port);
1334 default:
1335 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1336 scm_list_1 (SCM_MAKE_CHAR (chr)));
1337 }
1338 }
7337d56d
LC
1339 else
1340 return result;
1341 }
1342
1343 return SCM_UNSPECIFIED;
1344}
1345#undef FUNC_NAME
1346
1347static SCM
1348scm_read_expression (SCM port)
1349#define FUNC_NAME "scm_read_expression"
1350{
1351 while (1)
1352 {
889975e5 1353 register scm_t_wchar chr;
7337d56d
LC
1354
1355 chr = scm_getc (port);
1356
1357 switch (chr)
1358 {
1359 case SCM_WHITE_SPACES:
1360 case SCM_LINE_INCREMENTORS:
1361 break;
1362 case ';':
1363 (void) scm_read_semicolon_comment (chr, port);
1364 break;
5afa815c
AW
1365 case '[':
1366 if (!SCM_SQUARE_BRACKETS_P)
1367 return (scm_read_mixed_case_symbol (chr, port));
1368 /* otherwise fall through */
7337d56d
LC
1369 case '(':
1370 return (scm_read_sexp (chr, port));
1371 case '"':
1372 return (scm_read_string (chr, port));
1373 case '\'':
1374 case '`':
1375 case ',':
1376 return (scm_read_quote (chr, port));
1377 case '#':
1378 {
1379 SCM result;
1380 result = scm_read_sharp (chr, port);
1381 if (scm_is_eq (result, SCM_UNSPECIFIED))
1382 /* We read a comment or some such. */
1383 break;
1384 else
1385 return result;
1386 }
1387 case ')':
1388 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1389 break;
1390 case EOF:
1391 return SCM_EOF_VAL;
1392 case ':':
1393 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1394 return scm_symbol_to_keyword (scm_read_expression (port));
1395 /* Fall through. */
1396
1397 default:
1398 {
1399 if (((chr >= '0') && (chr <= '9'))
1400 || (strchr ("+-.", chr)))
1401 return (scm_read_number (chr, port));
1402 else
1403 return (scm_read_mixed_case_symbol (chr, port));
1404 }
1405 }
1406 }
1407}
1408#undef FUNC_NAME
1409
1410\f
1411/* Actual reader. */
1412
1413SCM_DEFINE (scm_read, "read", 0, 1, 0,
1414 (SCM port),
1415 "Read an s-expression from the input port @var{port}, or from\n"
1416 "the current input port if @var{port} is not specified.\n"
1417 "Any whitespace before the next token is discarded.")
1418#define FUNC_NAME s_scm_read
1419{
1420 int c;
1421
1422 if (SCM_UNBNDP (port))
1423 port = scm_current_input_port ();
1424 SCM_VALIDATE_OPINPORT (1, port);
1425
1426 c = flush_ws (port, (char *) NULL);
1427 if (EOF == c)
1428 return SCM_EOF_VAL;
1429 scm_ungetc (c, port);
1430
1431 return (scm_read_expression (port));
09a4f039 1432}
db4b4ca6 1433#undef FUNC_NAME
09a4f039 1434
0f2d19dd
JB
1435
1436\f
1437
7337d56d
LC
1438/* Used when recording expressions constructed by `scm_read_sharp ()'. */
1439static SCM
1440recsexpr (SCM obj, long line, int column, SCM filename)
1441{
1442 if (!scm_is_pair(obj)) {
1443 return obj;
1444 } else {
1445 SCM tmp = obj, copy;
1446 /* If this sexpr is visible in the read:sharp source, we want to
1447 keep that information, so only record non-constant cons cells
1448 which haven't previously been read by the reader. */
1449 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1450 {
1451 if (SCM_COPY_SOURCE_P)
1452 {
1453 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1454 SCM_UNDEFINED);
1455 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1456 {
1457 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1458 line,
1459 column,
1460 filename),
1461 SCM_UNDEFINED));
1462 copy = SCM_CDR (copy);
1463 }
1464 SCM_SETCDR (copy, tmp);
1465 }
1466 else
1467 {
1468 recsexpr (SCM_CAR (obj), line, column, filename);
1469 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1470 recsexpr (SCM_CAR (tmp), line, column, filename);
1471 copy = SCM_UNDEFINED;
1472 }
1473 scm_whash_insert (scm_source_whash,
1474 obj,
1475 scm_make_srcprops (line,
1476 column,
1477 filename,
1478 copy,
1479 SCM_EOL));
1480 }
1481 return obj;
1482 }
1483}
1484
14de3b42
GH
1485/* Manipulate the read-hash-procedures alist. This could be written in
1486 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1487SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1488 (SCM chr, SCM proc),
dc7fa443
MG
1489 "Install the procedure @var{proc} for reading expressions\n"
1490 "starting with the character sequence @code{#} and @var{chr}.\n"
1491 "@var{proc} will be called with two arguments: the character\n"
1492 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1493 "returned will be the return value of @code{read}. \n"
1494 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1495 )
1bbd0b84 1496#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1497{
fed9c9a2
GH
1498 SCM this;
1499 SCM prev;
1500
36284627 1501 SCM_VALIDATE_CHAR (1, chr);
7888309b 1502 SCM_ASSERT (scm_is_false (proc)
bc36d050 1503 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1504 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1505
14de3b42
GH
1506 /* Check if chr is already in the alist. */
1507 this = *scm_read_hash_procedures;
1508 prev = SCM_BOOL_F;
fed9c9a2
GH
1509 while (1)
1510 {
d2e53ed6 1511 if (scm_is_null (this))
fed9c9a2
GH
1512 {
1513 /* not found, so add it to the beginning. */
7888309b 1514 if (scm_is_true (proc))
fed9c9a2 1515 {
14de3b42
GH
1516 *scm_read_hash_procedures =
1517 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
fed9c9a2
GH
1518 }
1519 break;
1520 }
bc36d050 1521 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1522 {
1523 /* already in the alist. */
7888309b 1524 if (scm_is_false (proc))
14de3b42
GH
1525 {
1526 /* remove it. */
7888309b 1527 if (scm_is_false (prev))
14de3b42
GH
1528 {
1529 *scm_read_hash_procedures =
1530 SCM_CDR (*scm_read_hash_procedures);
1531 }
1532 else
1533 scm_set_cdr_x (prev, SCM_CDR (this));
1534 }
fed9c9a2 1535 else
14de3b42
GH
1536 {
1537 /* replace it. */
1538 scm_set_cdr_x (SCM_CAR (this), proc);
1539 }
fed9c9a2
GH
1540 break;
1541 }
1542 prev = this;
1543 this = SCM_CDR (this);
1544 }
deca31e1 1545
deca31e1
GH
1546 return SCM_UNSPECIFIED;
1547}
1bbd0b84 1548#undef FUNC_NAME
0f2d19dd 1549
deca31e1
GH
1550/* Recover the read-hash procedure corresponding to char c. */
1551static SCM
6e8d25a6 1552scm_get_hash_procedure (int c)
deca31e1 1553{
14de3b42 1554 SCM rest = *scm_read_hash_procedures;
fed9c9a2 1555
deca31e1
GH
1556 while (1)
1557 {
d2e53ed6 1558 if (scm_is_null (rest))
deca31e1
GH
1559 return SCM_BOOL_F;
1560
7866a09b 1561 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1562 return SCM_CDAR (rest);
1563
1564 rest = SCM_CDR (rest);
1565 }
1566}
1cc91f1b 1567
889975e5
MG
1568#define SCM_ENCODING_SEARCH_SIZE (500)
1569
f8a1c9a8
LC
1570/* Search the first few hundred characters of a file for an Emacs-like coding
1571 declaration. Returns either NULL or a string whose storage has been
1572 allocated with `scm_gc_malloc ()'. */
889975e5 1573char *
f8a1c9a8 1574scm_i_scan_for_encoding (SCM port)
889975e5
MG
1575{
1576 char header[SCM_ENCODING_SEARCH_SIZE+1];
1577 size_t bytes_read;
1578 char *encoding = NULL;
1579 int utf8_bom = 0;
1580 char *pos;
1581 int i;
1582 int in_comment;
1583
49bb5bd3
LC
1584 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1585 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1586 "guile <(echo '(display "hello")')") so bail out. */
1587 return NULL;
1588
1589 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1590
889975e5
MG
1591 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1592
1593 if (bytes_read > 3
1594 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1595 utf8_bom = 1;
1596
1597 /* search past "coding[:=]" */
1598 pos = header;
1599 while (1)
1600 {
1601 if ((pos = strstr(pos, "coding")) == NULL)
1602 return NULL;
1603
1604 pos += strlen("coding");
1605 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1606 (*pos == ':' || *pos == '='))
1607 {
1608 pos ++;
1609 break;
1610 }
1611 }
1612
1613 /* skip spaces */
1614 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1615 (*pos == ' ' || *pos == '\t'))
1616 pos ++;
1617
1618 /* grab the next token */
1619 i = 0;
1620 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
060e305a 1621 && pos + i - header < bytes_read
6751d6db 1622 && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
889975e5
MG
1623 i++;
1624
1625 if (i == 0)
1626 return NULL;
1627
b9fd657a 1628 encoding = scm_gc_strndup (pos, i, "encoding");
026ed239
MG
1629 for (i = 0; i < strlen (encoding); i++)
1630 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1631
1632 /* push backwards to make sure we were in a comment */
1633 in_comment = 0;
1634 while (pos - i - header > 0)
1635 {
1636 if (*(pos - i) == '\n')
1637 {
1638 /* This wasn't in a semicolon comment. Check for a
1639 hash-bang comment. */
1640 char *beg = strstr (header, "#!");
1641 char *end = strstr (header, "!#");
1642 if (beg < pos && pos < end)
1643 in_comment = 1;
1644 break;
1645 }
1646 if (*(pos - i) == ';')
1647 {
1648 in_comment = 1;
1649 break;
1650 }
1651 i ++;
1652 }
1653 if (!in_comment)
f8a1c9a8
LC
1654 /* This wasn't in a comment */
1655 return NULL;
1656
889975e5
MG
1657 if (utf8_bom && strcmp(encoding, "UTF-8"))
1658 scm_misc_error (NULL,
1659 "the port input declares the encoding ~s but is encoded as UTF-8",
1660 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1661
889975e5
MG
1662 return encoding;
1663}
1664
1665SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1666 (SCM port),
a270e133 1667 "Scans the port for an Emacs-like character coding declaration\n"
889975e5
MG
1668 "near the top of the contents of a port with random-acessible contents.\n"
1669 "The coding declaration is of the form\n"
1670 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1671 "\n"
1672 "Returns a string containing the character encoding of the file\n"
1673 "if a declaration was found, or @code{#f} otherwise.\n")
1674#define FUNC_NAME s_scm_file_encoding
1675{
1676 char *enc;
1677 SCM s_enc;
f8a1c9a8
LC
1678
1679 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1680 if (enc == NULL)
1681 return SCM_BOOL_F;
1682 else
1683 {
1684 s_enc = scm_from_locale_string (enc);
889975e5
MG
1685 return s_enc;
1686 }
f8a1c9a8 1687
889975e5
MG
1688 return SCM_BOOL_F;
1689}
1690#undef FUNC_NAME
1691
0f2d19dd
JB
1692void
1693scm_init_read ()
0f2d19dd 1694{
14de3b42 1695 scm_read_hash_procedures =
86d31dfe 1696 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
fed9c9a2 1697
62560650 1698 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1699#include "libguile/read.x"
0f2d19dd 1700}
89e00824
ML
1701
1702/*
1703 Local Variables:
1704 c-file-style: "gnu"
1705 End:
1706*/