simplify inline function infrastructure
[bpt/guile.git] / libguile / read.c
CommitLineData
58996e37
MW
1/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
49bb5bd3 46#include "libguile/fports.h"
a0599745
MD
47#include "libguile/root.h"
48#include "libguile/strings.h"
ba1b2226 49#include "libguile/strports.h"
a0599745 50#include "libguile/vectors.h"
a0599745 51#include "libguile/validate.h"
a4022e69 52#include "libguile/srfi-4.h"
7337d56d 53#include "libguile/srfi-13.h"
ba1b2226 54
a0599745 55#include "libguile/read.h"
22fc179a
HWN
56#include "libguile/private-options.h"
57
0f2d19dd
JB
58
59\f
60
5bf6a6f0 61SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 62SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 63SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 64SCM_SYMBOL (sym_nil, "nil");
c7733771 65
92c2555f 66scm_t_option scm_read_opts[] = {
b7ff98dd
MD
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
6c51a40a 69 { SCM_OPTION_BOOLEAN, "positions", 1,
deca31e1
GH
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 72 "Convert symbols to lower case."},
210c0325 73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
904fabb6 74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
684d664e
AW
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
62560650 81 { 0, },
a16f6fe7
MD
82};
83
39e8f371
HWN
84/*
85 Give meaningful error messages for errors
86
87 We use the format
88
ba1b2226 89 FILE:LINE:COL: MESSAGE
39e8f371
HWN
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
39e8f371
HWN
95 */
96
97
a4022e69
MV
98void
99scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
ba1b2226 101{
29a837fd
MV
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 105
29a837fd 106 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
272632a6 109 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 110 scm_list_4 (fn,
b3aa4626 111 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 112 scm_from_int (SCM_COL (port) + 1),
272632a6 113 scm_from_locale_string (message)));
ba1b2226
HWN
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
4a655e50 117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 118 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 119 string,
dd72382c 120 arg,
ba1b2226
HWN
121 SCM_BOOL_F);
122}
39e8f371
HWN
123
124
a1ec6916 125SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 126 (SCM setting),
dc7fa443
MG
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 130#define FUNC_NAME s_scm_read_options
a16f6fe7 131{
b7ff98dd
MD
132 SCM ans = scm_options (setting,
133 scm_read_opts,
1bbd0b84 134 FUNC_NAME);
b7ff98dd
MD
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
137 return ans;
138}
1bbd0b84 139#undef FUNC_NAME
a16f6fe7 140
d458073b
AR
141/* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143static SCM *scm_i_read_hash_procedures;
deca31e1 144
cfd15439 145static SCM
d458073b
AR
146scm_i_read_hash_procedures_ref (void)
147{
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149}
150
cfd15439 151static void
d458073b
AR
152scm_i_read_hash_procedures_set_x (SCM value)
153{
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155}
0f2d19dd 156
7337d56d
LC
157\f
158/* Token readers. */
0f2d19dd 159
0f2d19dd 160
7337d56d
LC
161/* Size of the C buffer used to read symbols and numbers. */
162#define READER_BUFFER_SIZE 128
0f2d19dd 163
7337d56d
LC
164/* Size of the C buffer used to read strings. */
165#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 166
7337d56d
LC
167/* The maximum size of Scheme character names. */
168#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 169
94115ae3 170
7337d56d
LC
171/* `isblank' is only in C99. */
172#define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 174 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
175
176#ifdef MSDOS
177# define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179#else
180# define CHAR_IS_BLANK CHAR_IS_BLANK_
181#endif
182
183
184/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186#define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
5afa815c
AW
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
190
191#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195#define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
454866e0 199/* Read an SCSH block comment. */
cfd15439 200static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
620c8965
LC
201static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 203static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 204static SCM scm_get_hash_procedure (int);
454866e0 205
69f90b0b
MG
206/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
cfd15439 210static int
69f90b0b
MG
211read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
0520c320 214
69f90b0b
MG
215 while (*read < buf_size)
216 {
217 int chr;
7337d56d 218
69f90b0b 219 chr = scm_get_byte_or_eof (port);
889975e5 220
69f90b0b
MG
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
889975e5 234
69f90b0b
MG
235 return 1;
236 }
889975e5 237
69f90b0b
MG
238/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243static int
244read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246{
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
7337d56d 249
69f90b0b
MG
250 *overflow_buffer = NULL;
251 overflow_size = 0;
1cc91f1b 252
69f90b0b
MG
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
889975e5 273 }
69f90b0b 274 while (overflow);
889975e5 275
69f90b0b
MG
276 if (overflow_size)
277 *read = overflow_size;
b521d265 278 else
69f90b0b
MG
279 *read = bytes_read;
280
281 return (overflow_size != 0);
889975e5 282}
7337d56d
LC
283
284/* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286static int
287flush_ws (SCM port, const char *eoferr)
0f2d19dd 288{
cfd15439 289 scm_t_wchar c;
0f2d19dd 290 while (1)
b7f3516f 291 switch (c = scm_getc (port))
0f2d19dd
JB
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
d156d3b7 296 {
a4022e69
MV
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
d156d3b7 301 }
0f2d19dd 302 return c;
7337d56d 303
0f2d19dd
JB
304 case ';':
305 lp:
b7f3516f 306 switch (c = scm_getc (port))
0f2d19dd
JB
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
7337d56d 316
454866e0
LC
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
911b03b2 324 scm_read_shebang (c, port);
454866e0 325 break;
34f3d47d
AW
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
620c8965
LC
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
454866e0
LC
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
0f2d19dd 342 case SCM_LINE_INCREMENTORS:
0f2d19dd 343 case SCM_SINGLE_SPACES:
0f2d19dd 344 case '\t':
0f2d19dd 345 break;
7337d56d 346
0f2d19dd
JB
347 default:
348 return c;
349 }
7337d56d
LC
350
351 return 0;
0f2d19dd
JB
352}
353
354
7337d56d
LC
355\f
356/* Token readers. */
1cc91f1b 357
7337d56d 358static SCM scm_read_expression (SCM port);
b131b233 359static SCM scm_read_sharp (int chr, SCM port, long line, int column);
0f2d19dd
JB
360
361
d6cb0203
MW
362static SCM
363maybe_annotate_source (SCM x, SCM port, long line, int column)
364{
365 if (SCM_RECORD_POSITIONS_P)
366 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
367 return x;
368}
369
09a4f039 370static SCM
889975e5 371scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 372#define FUNC_NAME "scm_i_lreadparen"
09a4f039 373{
26c8cc14
AW
374 int c;
375 SCM tmp, tl, ans = SCM_EOL;
5afa815c 376 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
377
378 /* Need to capture line and column numbers here. */
379 long line = SCM_LINUM (port);
380 int column = SCM_COL (port) - 1;
f9c68a47 381
7337d56d
LC
382 c = flush_ws (port, FUNC_NAME);
383 if (terminating_char == c)
384 return SCM_EOL;
f9c68a47 385
7337d56d 386 scm_ungetc (c, port);
1f7945a7
AW
387 tmp = scm_read_expression (port);
388
389 /* Note that it is possible for scm_read_expression to return
390 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
391 check that it's a real dot by checking `c'. */
392 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d
LC
393 {
394 ans = scm_read_expression (port);
395 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
396 scm_i_input_error (FUNC_NAME, port, "missing close paren",
397 SCM_EOL);
398 return ans;
399 }
1cc91f1b 400
7337d56d
LC
401 /* Build the head of the list structure. */
402 ans = tl = scm_cons (tmp, SCM_EOL);
403
7337d56d 404 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 405 {
7337d56d 406 SCM new_tail;
0f2d19dd 407
5b69315e
AW
408 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
409 scm_i_input_error (FUNC_NAME, port,
410 "in pair: mismatched close paren: ~A",
411 scm_list_1 (SCM_MAKE_CHAR (c)));
412
7337d56d 413 scm_ungetc (c, port);
5b69315e
AW
414 tmp = scm_read_expression (port);
415
1f7945a7
AW
416 /* See above note about scm_sym_dot. */
417 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 418 {
89f88612 419 SCM_SETCDR (tl, scm_read_expression (port));
7337d56d 420
7337d56d
LC
421 c = flush_ws (port, FUNC_NAME);
422 if (terminating_char != c)
423 scm_i_input_error (FUNC_NAME, port,
424 "in pair: missing close paren", SCM_EOL);
425 goto exit;
0f2d19dd 426 }
b858464a 427
7337d56d
LC
428 new_tail = scm_cons (tmp, SCM_EOL);
429 SCM_SETCDR (tl, new_tail);
430 tl = new_tail;
7337d56d 431 }
0f2d19dd 432
7337d56d 433 exit:
d6cb0203 434 return maybe_annotate_source (ans, port, line, column);
7337d56d
LC
435}
436#undef FUNC_NAME
a4022e69 437
c5661d28
MG
438
439/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
440 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
441 found. */
442#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
443 do \
444 { \
445 scm_t_wchar a; \
446 size_t i = 0; \
447 c = 0; \
448 while (i < ndigits) \
449 { \
450 a = scm_getc (port); \
451 if (a == EOF) \
452 goto str_eof; \
453 if (terminator \
454 && (a == (scm_t_wchar) terminator) \
455 && (i > 0)) \
456 break; \
457 if ('0' <= a && a <= '9') \
458 a -= '0'; \
459 else if ('A' <= a && a <= 'F') \
460 a = a - 'A' + 10; \
461 else if ('a' <= a && a <= 'f') \
462 a = a - 'a' + 10; \
463 else \
464 { \
465 c = a; \
466 goto bad_escaped; \
467 } \
468 c = c * 16 + a; \
469 i ++; \
470 } \
c5661d28
MG
471 } while (0)
472
684d664e
AW
473static void
474skip_intraline_whitespace (SCM port)
475{
476 scm_t_wchar c;
477
478 do
479 {
480 c = scm_getc (port);
481 if (c == EOF)
482 return;
483 }
484 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
485
486 scm_ungetc (c, port);
487}
488
7337d56d
LC
489static SCM
490scm_read_string (int chr, SCM port)
491#define FUNC_NAME "scm_lreadr"
492{
493 /* For strings smaller than C_STR, this function creates only one Scheme
494 object (the string returned). */
0f2d19dd 495
7337d56d 496 SCM str = SCM_BOOL_F;
7337d56d 497 unsigned c_str_len = 0;
9c44cd45 498 scm_t_wchar c;
eb42ff25 499
b131b233
MW
500 /* Need to capture line and column numbers here. */
501 long line = SCM_LINUM (port);
502 int column = SCM_COL (port) - 1;
503
190d4b0d 504 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
7337d56d
LC
505 while ('"' != (c = scm_getc (port)))
506 {
507 if (c == EOF)
9c44cd45
MG
508 {
509 str_eof:
510 scm_i_input_error (FUNC_NAME, port,
511 "end of file in string constant", SCM_EOL);
512 }
0f2d19dd 513
9c44cd45
MG
514 if (c_str_len + 1 >= scm_i_string_length (str))
515 {
190d4b0d 516 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
0f2d19dd 517
9c44cd45
MG
518 str = scm_string_append (scm_list_2 (str, addy));
519 }
0f2d19dd 520
7337d56d 521 if (c == '\\')
9c44cd45
MG
522 {
523 switch (c = scm_getc (port))
524 {
525 case EOF:
526 goto str_eof;
527 case '"':
528 case '\\':
529 break;
9c44cd45 530 case '\n':
684d664e
AW
531 if (SCM_HUNGRY_EOL_ESCAPES_P)
532 skip_intraline_whitespace (port);
9c44cd45
MG
533 continue;
534 case '0':
535 c = '\0';
536 break;
537 case 'f':
538 c = '\f';
539 break;
540 case 'n':
541 c = '\n';
542 break;
543 case 'r':
544 c = '\r';
545 break;
546 case 't':
547 c = '\t';
548 break;
549 case 'a':
550 c = '\007';
551 break;
552 case 'v':
553 c = '\v';
554 break;
67a4a16d
MG
555 case 'b':
556 c = '\010';
557 break;
9c44cd45 558 case 'x':
dea901d6
MG
559 if (SCM_R6RS_ESCAPES_P)
560 SCM_READ_HEX_ESCAPE (10, ';');
561 else
562 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 563 break;
9c44cd45 564 case 'u':
898a0b5a
MG
565 if (!SCM_R6RS_ESCAPES_P)
566 {
567 SCM_READ_HEX_ESCAPE (4, '\0');
568 break;
569 }
9c44cd45 570 case 'U':
898a0b5a
MG
571 if (!SCM_R6RS_ESCAPES_P)
572 {
573 SCM_READ_HEX_ESCAPE (6, '\0');
574 break;
575 }
9c44cd45
MG
576 default:
577 bad_escaped:
578 scm_i_input_error (FUNC_NAME, port,
579 "illegal character in escape sequence: ~S",
580 scm_list_1 (SCM_MAKE_CHAR (c)));
581 }
582 }
583 str = scm_i_string_start_writing (str);
584 scm_i_string_set_x (str, c_str_len++, c);
585 scm_i_string_stop_writing ();
7337d56d 586 }
b131b233
MW
587 return maybe_annotate_source (scm_i_substring_copy (str, 0, c_str_len),
588 port, line, column);
0f2d19dd 589}
db4b4ca6
DH
590#undef FUNC_NAME
591
0f2d19dd 592
7337d56d 593static SCM
889975e5 594scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 595{
69f90b0b
MG
596 SCM result, str = SCM_EOL;
597 char buffer[READER_BUFFER_SIZE];
598 char *overflow_buffer = NULL;
599 size_t bytes_read;
600 int overflow;
601 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 602
38f19074
MW
603 /* Need to capture line and column numbers here. */
604 long line = SCM_LINUM (port);
605 int column = SCM_COL (port) - 1;
606
7337d56d 607 scm_ungetc (chr, port);
69f90b0b
MG
608 overflow = read_complete_token (port, buffer, sizeof (buffer),
609 &overflow_buffer, &bytes_read);
610
611 if (!overflow)
612 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
613 else
614 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
615 pt->ilseq_handler);
616
617 result = scm_string_to_number (str, SCM_UNDEFINED);
38f19074 618 if (scm_is_false (result))
69f90b0b
MG
619 {
620 /* Return a symbol instead of a number */
621 if (SCM_CASE_INSENSITIVE_P)
622 str = scm_string_downcase_x (str);
623 result = scm_string_to_symbol (str);
624 }
38f19074
MW
625 else if (SCM_NIMP (result))
626 result = maybe_annotate_source (result, port, line, column);
0f2d19dd 627
69f90b0b
MG
628 if (overflow)
629 free (overflow_buffer);
630 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
631 return result;
632}
0f2d19dd 633
7337d56d 634static SCM
889975e5 635scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 636{
889975e5
MG
637 SCM result;
638 int ends_with_colon = 0;
69f90b0b 639 size_t bytes_read;
ef4cbc08 640 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
69f90b0b
MG
641 int overflow;
642 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
643 scm_t_port *pt = SCM_PTAB_ENTRY (port);
644 SCM str;
7337d56d
LC
645
646 scm_ungetc (chr, port);
69f90b0b
MG
647 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
648 &overflow_buffer, &bytes_read);
649 if (bytes_read > 0)
650 {
651 if (!overflow)
652 ends_with_colon = buffer[bytes_read - 1] == ':';
653 else
654 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
655 }
ef4cbc08 656
69f90b0b
MG
657 if (postfix && ends_with_colon && (bytes_read > 1))
658 {
659 if (!overflow)
660 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
661 else
662 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
663 pt->ilseq_handler);
664
665 if (SCM_CASE_INSENSITIVE_P)
666 str = scm_string_downcase_x (str);
667 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
668 }
7337d56d 669 else
69f90b0b
MG
670 {
671 if (!overflow)
672 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
673 else
674 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
675 pt->ilseq_handler);
676
677 if (SCM_CASE_INSENSITIVE_P)
678 str = scm_string_downcase_x (str);
679 result = scm_string_to_symbol (str);
680 }
7337d56d 681
69f90b0b
MG
682 if (overflow)
683 free (overflow_buffer);
684 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
685 return result;
686}
687
688static SCM
889975e5 689scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
690#define FUNC_NAME "scm_lreadr"
691{
889975e5 692 SCM result;
7337d56d 693 size_t read;
69f90b0b
MG
694 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
695 int overflow;
7337d56d 696 unsigned int radix;
69f90b0b
MG
697 SCM str;
698 scm_t_port *pt;
7337d56d
LC
699
700 switch (chr)
701 {
702 case 'B':
703 case 'b':
704 radix = 2;
705 break;
706
707 case 'o':
708 case 'O':
709 radix = 8;
710 break;
711
712 case 'd':
713 case 'D':
714 radix = 10;
715 break;
716
717 case 'x':
718 case 'X':
719 radix = 16;
720 break;
721
722 default:
723 scm_ungetc (chr, port);
724 scm_ungetc ('#', port);
725 radix = 10;
726 }
727
69f90b0b
MG
728 overflow = read_complete_token (port, buffer, sizeof (buffer),
729 &overflow_buffer, &read);
730
731 pt = SCM_PTAB_ENTRY (port);
732 if (!overflow)
733 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
734 else
735 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
736 pt->ilseq_handler);
737
738 result = scm_string_to_number (str, scm_from_uint (radix));
739
740 if (overflow)
741 free (overflow_buffer);
742
743 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
744
745 if (scm_is_true (result))
746 return result;
747
748 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
749
750 return SCM_BOOL_F;
751}
752#undef FUNC_NAME
753
754static SCM
755scm_read_quote (int chr, SCM port)
756{
757 SCM p;
492faee1
LC
758 long line = SCM_LINUM (port);
759 int column = SCM_COL (port) - 1;
7337d56d
LC
760
761 switch (chr)
762 {
763 case '`':
764 p = scm_sym_quasiquote;
765 break;
766
767 case '\'':
768 p = scm_sym_quote;
769 break;
770
771 case ',':
772 {
889975e5 773 scm_t_wchar c;
7337d56d
LC
774
775 c = scm_getc (port);
776 if ('@' == c)
777 p = scm_sym_uq_splicing;
778 else
0f2d19dd 779 {
7337d56d
LC
780 scm_ungetc (c, port);
781 p = scm_sym_unquote;
0f2d19dd 782 }
7337d56d
LC
783 break;
784 }
0f2d19dd 785
7337d56d
LC
786 default:
787 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 788 "scm_read_quote", chr);
7337d56d 789 abort ();
0f2d19dd 790 }
1cc91f1b 791
7337d56d 792 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
d6cb0203 793 return maybe_annotate_source (p, port, line, column);
7337d56d
LC
794}
795
34f3d47d
AW
796SCM_SYMBOL (sym_syntax, "syntax");
797SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
798SCM_SYMBOL (sym_unsyntax, "unsyntax");
799SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
800
801static SCM
802scm_read_syntax (int chr, SCM port)
803{
804 SCM p;
805 long line = SCM_LINUM (port);
806 int column = SCM_COL (port) - 1;
807
808 switch (chr)
809 {
810 case '`':
811 p = sym_quasisyntax;
812 break;
813
814 case '\'':
815 p = sym_syntax;
816 break;
817
818 case ',':
819 {
820 int c;
821
822 c = scm_getc (port);
823 if ('@' == c)
824 p = sym_unsyntax_splicing;
825 else
826 {
827 scm_ungetc (c, port);
828 p = sym_unsyntax;
829 }
830 break;
831 }
832
833 default:
834 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
835 "scm_read_syntax", chr);
836 abort ();
837 }
838
839 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
d6cb0203 840 return maybe_annotate_source (p, port, line, column);
34f3d47d
AW
841}
842
cfd15439 843static SCM
7c4aad9c
AW
844scm_read_nil (int chr, SCM port)
845{
846 SCM id = scm_read_mixed_case_symbol (chr, port);
847
848 if (!scm_is_eq (id, sym_nil))
849 scm_i_input_error ("scm_read_nil", port,
850 "unexpected input while reading #nil: ~a",
851 scm_list_1 (id));
852
853 return SCM_ELISP_NIL;
854}
855
cfd15439 856static SCM
7337d56d 857scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 858{
0f2d19dd
JB
859 int c;
860
889975e5
MG
861 /* We use the get_byte here because there is no need to get the
862 locale correct with comment input. This presumes that newline
863 always represents itself no matter what the encoding is. */
864 for (c = scm_get_byte_or_eof (port);
7337d56d 865 (c != EOF) && (c != '\n');
8a8da78d 866 c = scm_get_byte_or_eof (port));
7337d56d
LC
867
868 return SCM_UNSPECIFIED;
869}
870
871\f
872/* Sharp readers, i.e. readers called after a `#' sign has been read. */
873
874static SCM
875scm_read_boolean (int chr, SCM port)
876{
877 switch (chr)
0f2d19dd 878 {
7337d56d
LC
879 case 't':
880 case 'T':
881 return SCM_BOOL_T;
882
883 case 'f':
884 case 'F':
885 return SCM_BOOL_F;
0f2d19dd 886 }
7337d56d
LC
887
888 return SCM_UNSPECIFIED;
889}
890
891static SCM
889975e5 892scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
893#define FUNC_NAME "scm_lreadr"
894{
69f90b0b
MG
895 char buffer[READER_CHAR_NAME_MAX_SIZE];
896 SCM charname;
897 size_t charname_len, bytes_read;
889975e5
MG
898 scm_t_wchar cp;
899 int overflow;
69f90b0b 900 scm_t_port *pt;
889975e5 901
69f90b0b 902 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 903 if (overflow)
c1a0ba1c 904 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 905
69f90b0b 906 if (bytes_read == 0)
0f2d19dd 907 {
7337d56d
LC
908 chr = scm_getc (port);
909 if (chr == EOF)
910 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
911 "while reading character", SCM_EOL);
912
913 /* CHR must be a token delimiter, like a whitespace. */
914 return (SCM_MAKE_CHAR (chr));
0f2d19dd 915 }
7337d56d 916
69f90b0b 917 pt = SCM_PTAB_ENTRY (port);
7337d56d 918
69f90b0b
MG
919 /* Simple ASCII characters can be processed immediately. Also, simple
920 ISO-8859-1 characters can be processed immediately if the encoding for this
921 port is ISO-8859-1. */
922 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
923 {
924 SCM_COL (port) += 1;
925 return SCM_MAKE_CHAR (buffer[0]);
926 }
927
928 /* Otherwise, convert the buffer into a proper scheme string for
929 processing. */
930 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
931 pt->ilseq_handler);
932 charname_len = scm_i_string_length (charname);
933 SCM_COL (port) += charname_len;
889975e5 934 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
935 if (charname_len == 1)
936 return SCM_MAKE_CHAR (cp);
937
938 /* Ignore dotted circles, which may be used to keep combining characters from
939 combining with the backslash in #\charname. */
0dcd7e61
MG
940 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
941 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
942
889975e5 943 if (cp >= '0' && cp < '8')
7337d56d
LC
944 {
945 /* Dirk:FIXME:: This type of character syntax is not R5RS
946 * compliant. Further, it should be verified that the constant
0ffc78e3 947 * does only consist of octal digits. */
889975e5 948 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 949 if (SCM_I_INUMP (p))
0ffc78e3 950 {
e25f3727 951 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
952 if (SCM_IS_UNICODE_CHAR (c))
953 return SCM_MAKE_CHAR (c);
954 else
0f3a70cf 955 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
956 "out-of-range octal character escape: ~a",
957 scm_list_1 (charname));
958 }
7337d56d
LC
959 }
960
0f3a70cf 961 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
962 {
963 SCM p;
0f3a70cf 964
dea901d6
MG
965 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
966 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
967 scm_from_uint (16));
968 if (SCM_I_INUMP (p))
969 {
e25f3727 970 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
971 if (SCM_IS_UNICODE_CHAR (c))
972 return SCM_MAKE_CHAR (c);
973 else
974 scm_i_input_error (FUNC_NAME, port,
975 "out-of-range hex character escape: ~a",
976 scm_list_1 (charname));
977 }
978 }
979
889975e5
MG
980 /* The names of characters should never have non-Latin1
981 characters. */
982 if (scm_i_is_narrow_string (charname)
983 || scm_i_try_narrow_string (charname))
4769c9db
AW
984 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
985 charname_len);
986 if (scm_is_true (ch))
987 return ch;
988 }
7337d56d 989
7337d56d 990 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 991 scm_list_1 (charname));
7337d56d
LC
992
993 return SCM_UNSPECIFIED;
0f2d19dd 994}
db4b4ca6 995#undef FUNC_NAME
0f2d19dd 996
cfd15439 997static SCM
7337d56d
LC
998scm_read_keyword (int chr, SCM port)
999{
1000 SCM symbol;
1001
1002 /* Read the symbol that comprises the keyword. Doing this instead of
1003 invoking a specific symbol reader function allows `scm_read_keyword ()'
1004 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1005
7337d56d
LC
1006 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1007 symbol = scm_read_expression (port);
1008 if (!scm_is_symbol (symbol))
7f74cf9a 1009 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1010 "keyword prefix `~a' not followed by a symbol: ~s",
1011 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1012
1013 return (scm_symbol_to_keyword (symbol));
1014}
1015
cfd15439 1016static SCM
b131b233 1017scm_read_vector (int chr, SCM port, long line, int column)
09a4f039 1018{
7337d56d
LC
1019 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1020 guarantee that it's going to do what we want. After all, this is an
1021 implementation detail of `scm_read_vector ()', not a desirable
1022 property. */
b131b233
MW
1023 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port)),
1024 port, line, column);
1025}
1026
1027static SCM
1028scm_read_array (int chr, SCM port, long line, int column)
1029{
1030 SCM result = scm_i_read_array (port, chr);
1031 if (scm_is_false (result))
1032 return result;
1033 else
1034 return maybe_annotate_source (result, port, line, column);
7337d56d 1035}
09a4f039 1036
cfd15439 1037static SCM
b131b233 1038scm_read_srfi4_vector (int chr, SCM port, long line, int column)
7337d56d 1039{
b131b233 1040 return scm_read_array (chr, port, line, column);
7337d56d
LC
1041}
1042
0ba0b384 1043static SCM
b131b233 1044scm_read_bytevector (scm_t_wchar chr, SCM port, long line, int column)
0ba0b384
LC
1045{
1046 chr = scm_getc (port);
1047 if (chr != 'u')
1048 goto syntax;
1049
1050 chr = scm_getc (port);
1051 if (chr != '8')
1052 goto syntax;
1053
1054 chr = scm_getc (port);
1055 if (chr != '(')
1056 goto syntax;
1057
b131b233
MW
1058 return maybe_annotate_source
1059 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port)),
1060 port, line, column);
0ba0b384
LC
1061
1062 syntax:
1063 scm_i_input_error ("read_bytevector", port,
1064 "invalid bytevector prefix",
1065 SCM_MAKE_CHAR (chr));
1066 return SCM_UNSPECIFIED;
1067}
1068
7337d56d 1069static SCM
b131b233 1070scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, long line, int column)
7337d56d
LC
1071{
1072 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1073 terribly inefficient but who cares? */
1074 SCM s_bits = SCM_EOL;
1075
1076 for (chr = scm_getc (port);
1077 (chr != EOF) && ((chr == '0') || (chr == '1'));
1078 chr = scm_getc (port))
09a4f039 1079 {
7337d56d 1080 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1081 }
7337d56d
LC
1082
1083 if (chr != EOF)
1084 scm_ungetc (chr, port);
1085
b131b233
MW
1086 return maybe_annotate_source
1087 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1088 port, line, column);
7337d56d
LC
1089}
1090
cfd15439 1091static SCM
889975e5 1092scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1093{
1094 int bang_seen = 0;
1095
1096 for (;;)
09a4f039 1097 {
58b1db5f 1098 int c = scm_getc (port);
62850ef3 1099
7337d56d
LC
1100 if (c == EOF)
1101 scm_i_input_error ("skip_block_comment", port,
1102 "unterminated `#! ... !#' comment", SCM_EOL);
1103
1104 if (c == '!')
1105 bang_seen = 1;
1106 else if (c == '#' && bang_seen)
1107 break;
1108 else
1109 bang_seen = 0;
1110 }
1111
1112 return SCM_UNSPECIFIED;
1113}
1114
d7fcaec3 1115static SCM
911b03b2
JG
1116scm_read_shebang (scm_t_wchar chr, SCM port)
1117{
1118 int c = 0;
1119 if ((c = scm_get_byte_or_eof (port)) != 'r')
1120 {
1121 scm_ungetc (c, port);
1122 return scm_read_scsh_block_comment (chr, port);
1123 }
1124 if ((c = scm_get_byte_or_eof (port)) != '6')
1125 {
1126 scm_ungetc (c, port);
1127 scm_ungetc ('r', port);
1128 return scm_read_scsh_block_comment (chr, port);
1129 }
1130 if ((c = scm_get_byte_or_eof (port)) != 'r')
1131 {
1132 scm_ungetc (c, port);
1133 scm_ungetc ('6', port);
1134 scm_ungetc ('r', port);
1135 return scm_read_scsh_block_comment (chr, port);
1136 }
1137 if ((c = scm_get_byte_or_eof (port)) != 's')
1138 {
1139 scm_ungetc (c, port);
1140 scm_ungetc ('r', port);
1141 scm_ungetc ('6', port);
1142 scm_ungetc ('r', port);
1143 return scm_read_scsh_block_comment (chr, port);
1144 }
1145
1146 return SCM_UNSPECIFIED;
1147}
1148
620c8965
LC
1149static SCM
1150scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1151{
1152 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1153 nested. So care must be taken. */
1154 int nesting_level = 1;
6d5f8c32
AW
1155
1156 int a = scm_getc (port);
1157
1158 if (a == EOF)
1159 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1160 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1161
1162 while (nesting_level > 0)
1163 {
6d5f8c32 1164 int b = scm_getc (port);
620c8965 1165
6d5f8c32 1166 if (b == EOF)
cd169c5a 1167 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1168 "unterminated `#| ... |#' comment", SCM_EOL);
1169
6d5f8c32
AW
1170 if (a == '|' && b == '#')
1171 {
1172 nesting_level--;
1173 b = EOF;
1174 }
1175 else if (a == '#' && b == '|')
1176 {
1177 nesting_level++;
1178 b = EOF;
1179 }
1180
1181 a = b;
620c8965
LC
1182 }
1183
1184 return SCM_UNSPECIFIED;
1185}
1186
34f3d47d 1187static SCM
889975e5 1188scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1189{
889975e5 1190 scm_t_wchar c;
34f3d47d
AW
1191
1192 c = flush_ws (port, (char *) NULL);
1193 if (EOF == c)
1194 scm_i_input_error ("read_commented_expression", port,
1195 "no expression after #; comment", SCM_EOL);
1196 scm_ungetc (c, port);
1197 scm_read_expression (port);
1198 return SCM_UNSPECIFIED;
1199}
1200
7337d56d 1201static SCM
889975e5 1202scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1203{
1204 /* Guile's extended symbol read syntax looks like this:
1205
1206 #{This is all a symbol name}#
1207
1208 So here, CHR is expected to be `{'. */
d9527cfa 1209 int saw_brace = 0;
7337d56d 1210 size_t len = 0;
190d4b0d 1211 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1212
889975e5 1213 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1214
1215 while ((chr = scm_getc (port)) != EOF)
1216 {
1217 if (saw_brace)
09a4f039 1218 {
7337d56d
LC
1219 if (chr == '#')
1220 {
7337d56d
LC
1221 break;
1222 }
1223 else
1224 {
1225 saw_brace = 0;
889975e5 1226 scm_i_string_set_x (buf, len++, '}');
7337d56d 1227 }
09a4f039 1228 }
d9527cfa
AW
1229
1230 if (chr == '}')
7337d56d 1231 saw_brace = 1;
d9527cfa
AW
1232 else if (chr == '\\')
1233 {
1234 /* It used to be that print.c would print extended-read-syntax
1235 symbols with backslashes before "non-standard" chars, but
1236 this routine wouldn't do anything with those escapes.
1237 Bummer. What we've done is to change print.c to output
1238 R6RS hex escapes for those characters, relying on the fact
1239 that the extended read syntax would never put a `\' before
1240 an `x'. For now, we just ignore other instances of
1241 backslash in the string. */
1242 switch ((chr = scm_getc (port)))
1243 {
1244 case EOF:
1245 goto done;
1246 case 'x':
1247 {
1248 scm_t_wchar c;
1249
1250 SCM_READ_HEX_ESCAPE (10, ';');
1251 scm_i_string_set_x (buf, len++, c);
1252 break;
1253
1254 str_eof:
1255 chr = EOF;
1256 goto done;
1257
1258 bad_escaped:
1259 scm_i_string_stop_writing ();
1260 scm_i_input_error ("scm_read_extended_symbol", port,
1261 "illegal character in escape sequence: ~S",
1262 scm_list_1 (SCM_MAKE_CHAR (c)));
1263 break;
1264 }
1265 default:
1266 scm_i_string_set_x (buf, len++, chr);
1267 break;
1268 }
1269 }
7337d56d 1270 else
d9527cfa 1271 scm_i_string_set_x (buf, len++, chr);
62850ef3 1272
889975e5 1273 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1274 {
7f991c7d
LC
1275 SCM addy;
1276
889975e5 1277 scm_i_string_stop_writing ();
190d4b0d 1278 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1279 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1280 len = 0;
889975e5 1281 buf = scm_i_string_start_writing (buf);
7337d56d 1282 }
7337d56d 1283 }
d9527cfa
AW
1284
1285 done:
889975e5 1286 scm_i_string_stop_writing ();
d9527cfa
AW
1287 if (chr == EOF)
1288 scm_i_input_error ("scm_read_extended_symbol", port,
1289 "end of file while reading symbol", SCM_EOL);
7337d56d 1290
889975e5 1291 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1292}
1293
1294
1295\f
1296/* Top-level token readers, i.e., dispatchers. */
1297
1298static SCM
1299scm_read_sharp_extension (int chr, SCM port)
1300{
1301 SCM proc;
1302
1303 proc = scm_get_hash_procedure (chr);
1304 if (scm_is_true (scm_procedure_p (proc)))
1305 {
1306 long line = SCM_LINUM (port);
1307 int column = SCM_COL (port) - 2;
1308 SCM got;
1309
1310 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14
AW
1311
1312 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1313 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1314
1315 return got;
09a4f039 1316 }
7337d56d
LC
1317
1318 return SCM_UNSPECIFIED;
1319}
1320
1321/* The reader for the sharp `#' character. It basically dispatches reads
1322 among the above token readers. */
1323static SCM
b131b233 1324scm_read_sharp (scm_t_wchar chr, SCM port, long line, int column)
7337d56d
LC
1325#define FUNC_NAME "scm_lreadr"
1326{
1327 SCM result;
1328
1329 chr = scm_getc (port);
1330
1331 result = scm_read_sharp_extension (chr, port);
1332 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1333 return result;
1334
1335 switch (chr)
1336 {
1337 case '\\':
1338 return (scm_read_character (chr, port));
1339 case '(':
b131b233 1340 return (scm_read_vector (chr, port, line, column));
7337d56d
LC
1341 case 's':
1342 case 'u':
1343 case 'f':
df941b5b 1344 case 'c':
7337d56d 1345 /* This one may return either a boolean or an SRFI-4 vector. */
b131b233 1346 return (scm_read_srfi4_vector (chr, port, line, column));
0ba0b384 1347 case 'v':
b131b233 1348 return (scm_read_bytevector (chr, port, line, column));
7337d56d 1349 case '*':
b131b233 1350 return (scm_read_guile_bit_vector (chr, port, line, column));
7337d56d
LC
1351 case 't':
1352 case 'T':
1353 case 'F':
7337d56d
LC
1354 return (scm_read_boolean (chr, port));
1355 case ':':
1356 return (scm_read_keyword (chr, port));
1357 case '0': case '1': case '2': case '3': case '4':
1358 case '5': case '6': case '7': case '8': case '9':
1359 case '@':
1360#if SCM_ENABLE_DEPRECATED
1361 /* See below for 'i' and 'e'. */
1362 case 'a':
7337d56d
LC
1363 case 'y':
1364 case 'h':
1365 case 'l':
1366#endif
b131b233 1367 return (scm_read_array (chr, port, line, column));
7337d56d
LC
1368
1369 case 'i':
1370 case 'e':
1371#if SCM_ENABLE_DEPRECATED
1372 {
1373 /* When next char is '(', it really is an old-style
1374 uniform array. */
889975e5 1375 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1376 if (next_c != EOF)
1377 scm_ungetc (next_c, port);
1378 if (next_c == '(')
b131b233 1379 return scm_read_array (chr, port, line, column);
7337d56d
LC
1380 /* Fall through. */
1381 }
1382#endif
1383 case 'b':
1384 case 'B':
1385 case 'o':
1386 case 'O':
1387 case 'd':
1388 case 'D':
1389 case 'x':
1390 case 'X':
1391 case 'I':
1392 case 'E':
1393 return (scm_read_number_and_radix (chr, port));
1394 case '{':
1395 return (scm_read_extended_symbol (chr, port));
1396 case '!':
911b03b2 1397 return (scm_read_shebang (chr, port));
34f3d47d
AW
1398 case ';':
1399 return (scm_read_commented_expression (chr, port));
1400 case '`':
1401 case '\'':
1402 case ',':
1403 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1404 case 'n':
1405 return (scm_read_nil (chr, port));
7337d56d
LC
1406 default:
1407 result = scm_read_sharp_extension (chr, port);
1408 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1409 {
1410 /* To remain compatible with 1.8 and earlier, the following
1411 characters have lower precedence than `read-hash-extend'
1412 characters. */
1413 switch (chr)
1414 {
1415 case '|':
1416 return scm_read_r6rs_block_comment (chr, port);
1417 default:
1418 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1419 scm_list_1 (SCM_MAKE_CHAR (chr)));
1420 }
1421 }
7337d56d
LC
1422 else
1423 return result;
1424 }
1425
1426 return SCM_UNSPECIFIED;
1427}
1428#undef FUNC_NAME
1429
1430static SCM
1431scm_read_expression (SCM port)
1432#define FUNC_NAME "scm_read_expression"
1433{
1434 while (1)
1435 {
cfd15439 1436 scm_t_wchar chr;
7337d56d
LC
1437
1438 chr = scm_getc (port);
1439
1440 switch (chr)
1441 {
1442 case SCM_WHITE_SPACES:
1443 case SCM_LINE_INCREMENTORS:
1444 break;
1445 case ';':
1446 (void) scm_read_semicolon_comment (chr, port);
1447 break;
5afa815c
AW
1448 case '[':
1449 if (!SCM_SQUARE_BRACKETS_P)
1450 return (scm_read_mixed_case_symbol (chr, port));
1451 /* otherwise fall through */
7337d56d
LC
1452 case '(':
1453 return (scm_read_sexp (chr, port));
1454 case '"':
1455 return (scm_read_string (chr, port));
1456 case '\'':
1457 case '`':
1458 case ',':
1459 return (scm_read_quote (chr, port));
1460 case '#':
1461 {
b131b233
MW
1462 long line = SCM_LINUM (port);
1463 int column = SCM_COL (port) - 1;
1464 SCM result = scm_read_sharp (chr, port, line, column);
7337d56d
LC
1465 if (scm_is_eq (result, SCM_UNSPECIFIED))
1466 /* We read a comment or some such. */
1467 break;
1468 else
1469 return result;
1470 }
1471 case ')':
1472 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1473 break;
a4e47229
MG
1474 case ']':
1475 if (SCM_SQUARE_BRACKETS_P)
1476 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1477 /* otherwise fall through */
7337d56d
LC
1478 case EOF:
1479 return SCM_EOF_VAL;
1480 case ':':
1481 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1482 return scm_symbol_to_keyword (scm_read_expression (port));
1483 /* Fall through. */
1484
1485 default:
1486 {
1487 if (((chr >= '0') && (chr <= '9'))
1488 || (strchr ("+-.", chr)))
1489 return (scm_read_number (chr, port));
1490 else
1491 return (scm_read_mixed_case_symbol (chr, port));
1492 }
1493 }
1494 }
1495}
1496#undef FUNC_NAME
1497
1498\f
1499/* Actual reader. */
1500
1501SCM_DEFINE (scm_read, "read", 0, 1, 0,
1502 (SCM port),
1503 "Read an s-expression from the input port @var{port}, or from\n"
1504 "the current input port if @var{port} is not specified.\n"
1505 "Any whitespace before the next token is discarded.")
1506#define FUNC_NAME s_scm_read
1507{
1508 int c;
1509
1510 if (SCM_UNBNDP (port))
1511 port = scm_current_input_port ();
1512 SCM_VALIDATE_OPINPORT (1, port);
1513
1514 c = flush_ws (port, (char *) NULL);
1515 if (EOF == c)
1516 return SCM_EOF_VAL;
1517 scm_ungetc (c, port);
1518
1519 return (scm_read_expression (port));
09a4f039 1520}
db4b4ca6 1521#undef FUNC_NAME
09a4f039 1522
0f2d19dd
JB
1523
1524\f
1525
14de3b42
GH
1526/* Manipulate the read-hash-procedures alist. This could be written in
1527 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1528SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1529 (SCM chr, SCM proc),
dc7fa443
MG
1530 "Install the procedure @var{proc} for reading expressions\n"
1531 "starting with the character sequence @code{#} and @var{chr}.\n"
1532 "@var{proc} will be called with two arguments: the character\n"
1533 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1534 "returned will be the return value of @code{read}. \n"
1535 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1536 )
1bbd0b84 1537#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1538{
fed9c9a2
GH
1539 SCM this;
1540 SCM prev;
1541
36284627 1542 SCM_VALIDATE_CHAR (1, chr);
7888309b 1543 SCM_ASSERT (scm_is_false (proc)
bc36d050 1544 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1545 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1546
14de3b42 1547 /* Check if chr is already in the alist. */
d458073b 1548 this = scm_i_read_hash_procedures_ref ();
14de3b42 1549 prev = SCM_BOOL_F;
fed9c9a2
GH
1550 while (1)
1551 {
d2e53ed6 1552 if (scm_is_null (this))
fed9c9a2
GH
1553 {
1554 /* not found, so add it to the beginning. */
7888309b 1555 if (scm_is_true (proc))
fed9c9a2 1556 {
d458073b
AR
1557 SCM new = scm_cons (scm_cons (chr, proc),
1558 scm_i_read_hash_procedures_ref ());
1559 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1560 }
1561 break;
1562 }
bc36d050 1563 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1564 {
1565 /* already in the alist. */
7888309b 1566 if (scm_is_false (proc))
14de3b42
GH
1567 {
1568 /* remove it. */
7888309b 1569 if (scm_is_false (prev))
14de3b42 1570 {
d458073b
AR
1571 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1572 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1573 }
1574 else
1575 scm_set_cdr_x (prev, SCM_CDR (this));
1576 }
fed9c9a2 1577 else
14de3b42
GH
1578 {
1579 /* replace it. */
1580 scm_set_cdr_x (SCM_CAR (this), proc);
1581 }
fed9c9a2
GH
1582 break;
1583 }
1584 prev = this;
1585 this = SCM_CDR (this);
1586 }
deca31e1 1587
deca31e1
GH
1588 return SCM_UNSPECIFIED;
1589}
1bbd0b84 1590#undef FUNC_NAME
0f2d19dd 1591
deca31e1
GH
1592/* Recover the read-hash procedure corresponding to char c. */
1593static SCM
6e8d25a6 1594scm_get_hash_procedure (int c)
deca31e1 1595{
d458073b 1596 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1597
deca31e1
GH
1598 while (1)
1599 {
d2e53ed6 1600 if (scm_is_null (rest))
deca31e1
GH
1601 return SCM_BOOL_F;
1602
7866a09b 1603 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1604 return SCM_CDAR (rest);
1605
1606 rest = SCM_CDR (rest);
1607 }
1608}
1cc91f1b 1609
889975e5
MG
1610#define SCM_ENCODING_SEARCH_SIZE (500)
1611
f8a1c9a8
LC
1612/* Search the first few hundred characters of a file for an Emacs-like coding
1613 declaration. Returns either NULL or a string whose storage has been
1614 allocated with `scm_gc_malloc ()'. */
889975e5 1615char *
f8a1c9a8 1616scm_i_scan_for_encoding (SCM port)
889975e5 1617{
d900843c 1618 scm_t_port *pt;
889975e5 1619 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1620 size_t bytes_read, encoding_length, i;
889975e5
MG
1621 char *encoding = NULL;
1622 int utf8_bom = 0;
daedbca7 1623 char *pos, *encoding_start;
889975e5
MG
1624 int in_comment;
1625
d900843c 1626 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1627
d900843c
AW
1628 if (pt->rw_active == SCM_PORT_WRITE)
1629 scm_flush (port);
49bb5bd3 1630
d900843c
AW
1631 if (pt->rw_random)
1632 pt->rw_active = SCM_PORT_READ;
1633
1634 if (pt->read_pos == pt->read_end)
1635 {
1636 /* We can use the read buffer, and thus avoid a seek. */
1637 if (scm_fill_input (port) == EOF)
1638 return NULL;
1639
1640 bytes_read = pt->read_end - pt->read_pos;
1641 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1642 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1643
1644 if (bytes_read <= 1)
1645 /* An unbuffered port -- don't scan. */
1646 return NULL;
1647
1648 memcpy (header, pt->read_pos, bytes_read);
1649 header[bytes_read] = '\0';
1650 }
1651 else
1652 {
1653 /* Try to read some bytes and then seek back. Not all ports
1654 support seeking back; and indeed some file ports (like
1655 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1656 check performed by SCM_FPORT_FDES---but fail to seek
1657 backwards. Hence this block comes second. We prefer to use
1658 the read buffer in-place. */
1659 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1660 return NULL;
1661
1662 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1663 header[bytes_read] = '\0';
1664 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1665 }
889975e5
MG
1666
1667 if (bytes_read > 3
1668 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1669 utf8_bom = 1;
1670
1671 /* search past "coding[:=]" */
1672 pos = header;
1673 while (1)
1674 {
1675 if ((pos = strstr(pos, "coding")) == NULL)
1676 return NULL;
1677
1678 pos += strlen("coding");
1679 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1680 (*pos == ':' || *pos == '='))
1681 {
1682 pos ++;
1683 break;
1684 }
1685 }
1686
1687 /* skip spaces */
1688 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1689 (*pos == ' ' || *pos == '\t'))
1690 pos ++;
1691
1692 /* grab the next token */
daedbca7 1693 encoding_start = pos;
889975e5 1694 i = 0;
daedbca7
MG
1695 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1696 && encoding_start + i - header < bytes_read
1697 && (isalnum ((int) encoding_start[i])
1698 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1699 i++;
1700
daedbca7
MG
1701 encoding_length = i;
1702 if (encoding_length == 0)
889975e5
MG
1703 return NULL;
1704
daedbca7
MG
1705 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1706 for (i = 0; i < encoding_length; i++)
026ed239 1707 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1708
1709 /* push backwards to make sure we were in a comment */
1710 in_comment = 0;
daedbca7
MG
1711 pos = encoding_start;
1712 while (pos >= header)
889975e5 1713 {
8a12aeb9
AW
1714 if (*pos == ';')
1715 {
1716 in_comment = 1;
1717 break;
1718 }
1719 else if (*pos == '\n' || pos == header)
889975e5
MG
1720 {
1721 /* This wasn't in a semicolon comment. Check for a
1722 hash-bang comment. */
1723 char *beg = strstr (header, "#!");
1724 char *end = strstr (header, "!#");
8a12aeb9 1725 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
1726 in_comment = 1;
1727 break;
1728 }
8a12aeb9
AW
1729 else
1730 {
1731 pos --;
1732 continue;
1733 }
889975e5
MG
1734 }
1735 if (!in_comment)
f8a1c9a8
LC
1736 /* This wasn't in a comment */
1737 return NULL;
1738
889975e5 1739 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1740 scm_misc_error (NULL,
889975e5
MG
1741 "the port input declares the encoding ~s but is encoded as UTF-8",
1742 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1743
889975e5
MG
1744 return encoding;
1745}
1746
1747SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1748 (SCM port),
a270e133 1749 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 1750 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
1751 "The coding declaration is of the form\n"
1752 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1753 "\n"
1754 "Returns a string containing the character encoding of the file\n"
1755 "if a declaration was found, or @code{#f} otherwise.\n")
1756#define FUNC_NAME s_scm_file_encoding
1757{
1758 char *enc;
1759 SCM s_enc;
f8a1c9a8 1760
d900843c
AW
1761 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1762
f8a1c9a8 1763 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1764 if (enc == NULL)
1765 return SCM_BOOL_F;
1766 else
1767 {
1768 s_enc = scm_from_locale_string (enc);
889975e5
MG
1769 return s_enc;
1770 }
f8a1c9a8 1771
889975e5
MG
1772 return SCM_BOOL_F;
1773}
1774#undef FUNC_NAME
1775
0f2d19dd
JB
1776void
1777scm_init_read ()
0f2d19dd 1778{
d458073b
AR
1779 SCM read_hash_procs;
1780
c81c2ad3 1781 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
1782
1783 scm_i_read_hash_procedures =
1784 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 1785
62560650 1786 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1787#include "libguile/read.x"
0f2d19dd 1788}
89e00824
ML
1789
1790/*
1791 Local Variables:
1792 c-file-style: "gnu"
1793 End:
1794*/