minor style fix in bytevectors.c
[bpt/guile.git] / libguile / read.c
CommitLineData
4a655e50 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
49bb5bd3 46#include "libguile/fports.h"
a0599745
MD
47#include "libguile/root.h"
48#include "libguile/strings.h"
ba1b2226 49#include "libguile/strports.h"
a0599745 50#include "libguile/vectors.h"
a0599745 51#include "libguile/validate.h"
a4022e69 52#include "libguile/srfi-4.h"
7337d56d 53#include "libguile/srfi-13.h"
ba1b2226 54
a0599745 55#include "libguile/read.h"
22fc179a
HWN
56#include "libguile/private-options.h"
57
0f2d19dd
JB
58
59\f
60
5bf6a6f0 61SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 62SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 63SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 64SCM_SYMBOL (sym_nil, "nil");
c7733771 65
92c2555f 66scm_t_option scm_read_opts[] = {
b7ff98dd
MD
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
6c51a40a 69 { SCM_OPTION_BOOLEAN, "positions", 1,
deca31e1
GH
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 72 "Convert symbols to lower case."},
210c0325 73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
904fabb6 74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
684d664e
AW
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
62560650 81 { 0, },
a16f6fe7
MD
82};
83
39e8f371
HWN
84/*
85 Give meaningful error messages for errors
86
87 We use the format
88
ba1b2226 89 FILE:LINE:COL: MESSAGE
39e8f371
HWN
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
39e8f371
HWN
95 */
96
97
a4022e69
MV
98void
99scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
ba1b2226 101{
29a837fd
MV
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 105
29a837fd 106 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
272632a6 109 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 110 scm_list_4 (fn,
b3aa4626 111 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 112 scm_from_int (SCM_COL (port) + 1),
272632a6 113 scm_from_locale_string (message)));
ba1b2226
HWN
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
4a655e50 117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 118 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 119 string,
dd72382c 120 arg,
ba1b2226
HWN
121 SCM_BOOL_F);
122}
39e8f371
HWN
123
124
a1ec6916 125SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 126 (SCM setting),
dc7fa443
MG
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 130#define FUNC_NAME s_scm_read_options
a16f6fe7 131{
b7ff98dd
MD
132 SCM ans = scm_options (setting,
133 scm_read_opts,
1bbd0b84 134 FUNC_NAME);
b7ff98dd
MD
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
137 return ans;
138}
1bbd0b84 139#undef FUNC_NAME
a16f6fe7 140
d458073b
AR
141/* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143static SCM *scm_i_read_hash_procedures;
deca31e1 144
d458073b
AR
145static inline SCM
146scm_i_read_hash_procedures_ref (void)
147{
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149}
150
151static inline void
152scm_i_read_hash_procedures_set_x (SCM value)
153{
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155}
0f2d19dd 156
7337d56d
LC
157\f
158/* Token readers. */
0f2d19dd 159
0f2d19dd 160
7337d56d
LC
161/* Size of the C buffer used to read symbols and numbers. */
162#define READER_BUFFER_SIZE 128
0f2d19dd 163
7337d56d
LC
164/* Size of the C buffer used to read strings. */
165#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 166
7337d56d
LC
167/* The maximum size of Scheme character names. */
168#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 169
94115ae3 170
7337d56d
LC
171/* `isblank' is only in C99. */
172#define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 174 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
175
176#ifdef MSDOS
177# define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179#else
180# define CHAR_IS_BLANK CHAR_IS_BLANK_
181#endif
182
183
184/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186#define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
5afa815c
AW
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
190
191#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195#define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
454866e0 199/* Read an SCSH block comment. */
620c8965
LC
200static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 203static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 204static SCM scm_get_hash_procedure (int);
454866e0 205
69f90b0b
MG
206/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
7337d56d 210static inline int
69f90b0b
MG
211read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
0520c320 214
69f90b0b
MG
215 while (*read < buf_size)
216 {
217 int chr;
7337d56d 218
69f90b0b 219 chr = scm_get_byte_or_eof (port);
889975e5 220
69f90b0b
MG
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
889975e5 234
69f90b0b
MG
235 return 1;
236 }
889975e5 237
69f90b0b
MG
238/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243static int
244read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246{
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
7337d56d 249
69f90b0b
MG
250 *overflow_buffer = NULL;
251 overflow_size = 0;
1cc91f1b 252
69f90b0b
MG
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
889975e5 273 }
69f90b0b 274 while (overflow);
889975e5 275
69f90b0b
MG
276 if (overflow_size)
277 *read = overflow_size;
b521d265 278 else
69f90b0b
MG
279 *read = bytes_read;
280
281 return (overflow_size != 0);
889975e5 282}
7337d56d
LC
283
284/* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286static int
287flush_ws (SCM port, const char *eoferr)
0f2d19dd 288{
889975e5 289 register scm_t_wchar c;
0f2d19dd 290 while (1)
b7f3516f 291 switch (c = scm_getc (port))
0f2d19dd
JB
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
d156d3b7 296 {
a4022e69
MV
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
d156d3b7 301 }
0f2d19dd 302 return c;
7337d56d 303
0f2d19dd
JB
304 case ';':
305 lp:
b7f3516f 306 switch (c = scm_getc (port))
0f2d19dd
JB
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
7337d56d 316
454866e0
LC
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
911b03b2 324 scm_read_shebang (c, port);
454866e0 325 break;
34f3d47d
AW
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
620c8965
LC
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
454866e0
LC
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
0f2d19dd 342 case SCM_LINE_INCREMENTORS:
0f2d19dd 343 case SCM_SINGLE_SPACES:
0f2d19dd 344 case '\t':
0f2d19dd 345 break;
7337d56d 346
0f2d19dd
JB
347 default:
348 return c;
349 }
7337d56d
LC
350
351 return 0;
0f2d19dd
JB
352}
353
354
7337d56d
LC
355\f
356/* Token readers. */
1cc91f1b 357
7337d56d
LC
358static SCM scm_read_expression (SCM port);
359static SCM scm_read_sharp (int chr, SCM port);
0f2d19dd
JB
360
361
09a4f039 362static SCM
889975e5 363scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 364#define FUNC_NAME "scm_i_lreadparen"
09a4f039 365{
26c8cc14
AW
366 int c;
367 SCM tmp, tl, ans = SCM_EOL;
5afa815c 368 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
369
370 /* Need to capture line and column numbers here. */
371 long line = SCM_LINUM (port);
372 int column = SCM_COL (port) - 1;
f9c68a47 373
7337d56d
LC
374 c = flush_ws (port, FUNC_NAME);
375 if (terminating_char == c)
376 return SCM_EOL;
f9c68a47 377
7337d56d 378 scm_ungetc (c, port);
1f7945a7
AW
379 tmp = scm_read_expression (port);
380
381 /* Note that it is possible for scm_read_expression to return
382 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
383 check that it's a real dot by checking `c'. */
384 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d
LC
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
1cc91f1b 392
7337d56d
LC
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
7337d56d 396 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 397 {
7337d56d 398 SCM new_tail;
0f2d19dd 399
5b69315e
AW
400 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
401 scm_i_input_error (FUNC_NAME, port,
402 "in pair: mismatched close paren: ~A",
403 scm_list_1 (SCM_MAKE_CHAR (c)));
404
7337d56d 405 scm_ungetc (c, port);
5b69315e
AW
406 tmp = scm_read_expression (port);
407
1f7945a7
AW
408 /* See above note about scm_sym_dot. */
409 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 410 {
7337d56d
LC
411 SCM_SETCDR (tl, tmp = scm_read_expression (port));
412
7337d56d
LC
413 c = flush_ws (port, FUNC_NAME);
414 if (terminating_char != c)
415 scm_i_input_error (FUNC_NAME, port,
416 "in pair: missing close paren", SCM_EOL);
417 goto exit;
0f2d19dd 418 }
b858464a 419
7337d56d
LC
420 new_tail = scm_cons (tmp, SCM_EOL);
421 SCM_SETCDR (tl, new_tail);
422 tl = new_tail;
7337d56d 423 }
0f2d19dd 424
7337d56d
LC
425 exit:
426 if (SCM_RECORD_POSITIONS_P)
26c8cc14
AW
427 scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
428
7337d56d
LC
429 return ans;
430}
431#undef FUNC_NAME
a4022e69 432
c5661d28
MG
433
434/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
435 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
436 found. */
437#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
438 do \
439 { \
440 scm_t_wchar a; \
441 size_t i = 0; \
442 c = 0; \
443 while (i < ndigits) \
444 { \
445 a = scm_getc (port); \
446 if (a == EOF) \
447 goto str_eof; \
448 if (terminator \
449 && (a == (scm_t_wchar) terminator) \
450 && (i > 0)) \
451 break; \
452 if ('0' <= a && a <= '9') \
453 a -= '0'; \
454 else if ('A' <= a && a <= 'F') \
455 a = a - 'A' + 10; \
456 else if ('a' <= a && a <= 'f') \
457 a = a - 'a' + 10; \
458 else \
459 { \
460 c = a; \
461 goto bad_escaped; \
462 } \
463 c = c * 16 + a; \
464 i ++; \
465 } \
c5661d28
MG
466 } while (0)
467
684d664e
AW
468static void
469skip_intraline_whitespace (SCM port)
470{
471 scm_t_wchar c;
472
473 do
474 {
475 c = scm_getc (port);
476 if (c == EOF)
477 return;
478 }
479 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
480
481 scm_ungetc (c, port);
482}
483
7337d56d
LC
484static SCM
485scm_read_string (int chr, SCM port)
486#define FUNC_NAME "scm_lreadr"
487{
488 /* For strings smaller than C_STR, this function creates only one Scheme
489 object (the string returned). */
0f2d19dd 490
7337d56d 491 SCM str = SCM_BOOL_F;
7337d56d 492 unsigned c_str_len = 0;
9c44cd45 493 scm_t_wchar c;
eb42ff25 494
190d4b0d 495 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
7337d56d
LC
496 while ('"' != (c = scm_getc (port)))
497 {
498 if (c == EOF)
9c44cd45
MG
499 {
500 str_eof:
501 scm_i_input_error (FUNC_NAME, port,
502 "end of file in string constant", SCM_EOL);
503 }
0f2d19dd 504
9c44cd45
MG
505 if (c_str_len + 1 >= scm_i_string_length (str))
506 {
190d4b0d 507 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
0f2d19dd 508
9c44cd45
MG
509 str = scm_string_append (scm_list_2 (str, addy));
510 }
0f2d19dd 511
7337d56d 512 if (c == '\\')
9c44cd45
MG
513 {
514 switch (c = scm_getc (port))
515 {
516 case EOF:
517 goto str_eof;
518 case '"':
519 case '\\':
520 break;
9c44cd45 521 case '\n':
684d664e
AW
522 if (SCM_HUNGRY_EOL_ESCAPES_P)
523 skip_intraline_whitespace (port);
9c44cd45
MG
524 continue;
525 case '0':
526 c = '\0';
527 break;
528 case 'f':
529 c = '\f';
530 break;
531 case 'n':
532 c = '\n';
533 break;
534 case 'r':
535 c = '\r';
536 break;
537 case 't':
538 c = '\t';
539 break;
540 case 'a':
541 c = '\007';
542 break;
543 case 'v':
544 c = '\v';
545 break;
67a4a16d
MG
546 case 'b':
547 c = '\010';
548 break;
9c44cd45 549 case 'x':
dea901d6
MG
550 if (SCM_R6RS_ESCAPES_P)
551 SCM_READ_HEX_ESCAPE (10, ';');
552 else
553 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 554 break;
9c44cd45 555 case 'u':
898a0b5a
MG
556 if (!SCM_R6RS_ESCAPES_P)
557 {
558 SCM_READ_HEX_ESCAPE (4, '\0');
559 break;
560 }
9c44cd45 561 case 'U':
898a0b5a
MG
562 if (!SCM_R6RS_ESCAPES_P)
563 {
564 SCM_READ_HEX_ESCAPE (6, '\0');
565 break;
566 }
9c44cd45
MG
567 default:
568 bad_escaped:
569 scm_i_input_error (FUNC_NAME, port,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c)));
572 }
573 }
574 str = scm_i_string_start_writing (str);
575 scm_i_string_set_x (str, c_str_len++, c);
576 scm_i_string_stop_writing ();
7337d56d 577 }
f13b4400 578
7337d56d
LC
579 if (c_str_len > 0)
580 {
9c44cd45 581 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 582 }
69f90b0b 583
9c44cd45 584 return scm_nullstr;
0f2d19dd 585}
db4b4ca6
DH
586#undef FUNC_NAME
587
0f2d19dd 588
7337d56d 589static SCM
889975e5 590scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 591{
69f90b0b
MG
592 SCM result, str = SCM_EOL;
593 char buffer[READER_BUFFER_SIZE];
594 char *overflow_buffer = NULL;
595 size_t bytes_read;
596 int overflow;
597 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 598
7337d56d 599 scm_ungetc (chr, port);
69f90b0b
MG
600 overflow = read_complete_token (port, buffer, sizeof (buffer),
601 &overflow_buffer, &bytes_read);
602
603 if (!overflow)
604 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
605 else
606 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
607 pt->ilseq_handler);
608
609 result = scm_string_to_number (str, SCM_UNDEFINED);
889975e5 610 if (!scm_is_true (result))
69f90b0b
MG
611 {
612 /* Return a symbol instead of a number */
613 if (SCM_CASE_INSENSITIVE_P)
614 str = scm_string_downcase_x (str);
615 result = scm_string_to_symbol (str);
616 }
0f2d19dd 617
69f90b0b
MG
618 if (overflow)
619 free (overflow_buffer);
620 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
621 return result;
622}
0f2d19dd 623
7337d56d 624static SCM
889975e5 625scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 626{
889975e5
MG
627 SCM result;
628 int ends_with_colon = 0;
69f90b0b 629 size_t bytes_read;
ef4cbc08 630 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
69f90b0b
MG
631 int overflow;
632 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
633 scm_t_port *pt = SCM_PTAB_ENTRY (port);
634 SCM str;
7337d56d
LC
635
636 scm_ungetc (chr, port);
69f90b0b
MG
637 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
638 &overflow_buffer, &bytes_read);
639 if (bytes_read > 0)
640 {
641 if (!overflow)
642 ends_with_colon = buffer[bytes_read - 1] == ':';
643 else
644 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
645 }
ef4cbc08 646
69f90b0b
MG
647 if (postfix && ends_with_colon && (bytes_read > 1))
648 {
649 if (!overflow)
650 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
651 else
652 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
653 pt->ilseq_handler);
654
655 if (SCM_CASE_INSENSITIVE_P)
656 str = scm_string_downcase_x (str);
657 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
658 }
7337d56d 659 else
69f90b0b
MG
660 {
661 if (!overflow)
662 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
663 else
664 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
665 pt->ilseq_handler);
666
667 if (SCM_CASE_INSENSITIVE_P)
668 str = scm_string_downcase_x (str);
669 result = scm_string_to_symbol (str);
670 }
7337d56d 671
69f90b0b
MG
672 if (overflow)
673 free (overflow_buffer);
674 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
675 return result;
676}
677
678static SCM
889975e5 679scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
680#define FUNC_NAME "scm_lreadr"
681{
889975e5 682 SCM result;
7337d56d 683 size_t read;
69f90b0b
MG
684 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
685 int overflow;
7337d56d 686 unsigned int radix;
69f90b0b
MG
687 SCM str;
688 scm_t_port *pt;
7337d56d
LC
689
690 switch (chr)
691 {
692 case 'B':
693 case 'b':
694 radix = 2;
695 break;
696
697 case 'o':
698 case 'O':
699 radix = 8;
700 break;
701
702 case 'd':
703 case 'D':
704 radix = 10;
705 break;
706
707 case 'x':
708 case 'X':
709 radix = 16;
710 break;
711
712 default:
713 scm_ungetc (chr, port);
714 scm_ungetc ('#', port);
715 radix = 10;
716 }
717
69f90b0b
MG
718 overflow = read_complete_token (port, buffer, sizeof (buffer),
719 &overflow_buffer, &read);
720
721 pt = SCM_PTAB_ENTRY (port);
722 if (!overflow)
723 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
724 else
725 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
726 pt->ilseq_handler);
727
728 result = scm_string_to_number (str, scm_from_uint (radix));
729
730 if (overflow)
731 free (overflow_buffer);
732
733 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
734
735 if (scm_is_true (result))
736 return result;
737
738 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
739
740 return SCM_BOOL_F;
741}
742#undef FUNC_NAME
743
744static SCM
745scm_read_quote (int chr, SCM port)
746{
747 SCM p;
492faee1
LC
748 long line = SCM_LINUM (port);
749 int column = SCM_COL (port) - 1;
7337d56d
LC
750
751 switch (chr)
752 {
753 case '`':
754 p = scm_sym_quasiquote;
755 break;
756
757 case '\'':
758 p = scm_sym_quote;
759 break;
760
761 case ',':
762 {
889975e5 763 scm_t_wchar c;
7337d56d
LC
764
765 c = scm_getc (port);
766 if ('@' == c)
767 p = scm_sym_uq_splicing;
768 else
0f2d19dd 769 {
7337d56d
LC
770 scm_ungetc (c, port);
771 p = scm_sym_unquote;
0f2d19dd 772 }
7337d56d
LC
773 break;
774 }
0f2d19dd 775
7337d56d
LC
776 default:
777 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 778 "scm_read_quote", chr);
7337d56d 779 abort ();
0f2d19dd 780 }
1cc91f1b 781
7337d56d 782 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1 783 if (SCM_RECORD_POSITIONS_P)
26c8cc14 784 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
0f2d19dd 785
7337d56d
LC
786 return p;
787}
788
34f3d47d
AW
789SCM_SYMBOL (sym_syntax, "syntax");
790SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
791SCM_SYMBOL (sym_unsyntax, "unsyntax");
792SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
793
794static SCM
795scm_read_syntax (int chr, SCM port)
796{
797 SCM p;
798 long line = SCM_LINUM (port);
799 int column = SCM_COL (port) - 1;
800
801 switch (chr)
802 {
803 case '`':
804 p = sym_quasisyntax;
805 break;
806
807 case '\'':
808 p = sym_syntax;
809 break;
810
811 case ',':
812 {
813 int c;
814
815 c = scm_getc (port);
816 if ('@' == c)
817 p = sym_unsyntax_splicing;
818 else
819 {
820 scm_ungetc (c, port);
821 p = sym_unsyntax;
822 }
823 break;
824 }
825
826 default:
827 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
828 "scm_read_syntax", chr);
829 abort ();
830 }
831
832 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
833 if (SCM_RECORD_POSITIONS_P)
26c8cc14 834 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
34f3d47d
AW
835
836 return p;
837}
838
7c4aad9c
AW
839static inline SCM
840scm_read_nil (int chr, SCM port)
841{
842 SCM id = scm_read_mixed_case_symbol (chr, port);
843
844 if (!scm_is_eq (id, sym_nil))
845 scm_i_input_error ("scm_read_nil", port,
846 "unexpected input while reading #nil: ~a",
847 scm_list_1 (id));
848
849 return SCM_ELISP_NIL;
850}
851
7337d56d
LC
852static inline SCM
853scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 854{
0f2d19dd
JB
855 int c;
856
889975e5
MG
857 /* We use the get_byte here because there is no need to get the
858 locale correct with comment input. This presumes that newline
859 always represents itself no matter what the encoding is. */
860 for (c = scm_get_byte_or_eof (port);
7337d56d 861 (c != EOF) && (c != '\n');
8a8da78d 862 c = scm_get_byte_or_eof (port));
7337d56d
LC
863
864 return SCM_UNSPECIFIED;
865}
866
867\f
868/* Sharp readers, i.e. readers called after a `#' sign has been read. */
869
870static SCM
871scm_read_boolean (int chr, SCM port)
872{
873 switch (chr)
0f2d19dd 874 {
7337d56d
LC
875 case 't':
876 case 'T':
877 return SCM_BOOL_T;
878
879 case 'f':
880 case 'F':
881 return SCM_BOOL_F;
0f2d19dd 882 }
7337d56d
LC
883
884 return SCM_UNSPECIFIED;
885}
886
887static SCM
889975e5 888scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
889#define FUNC_NAME "scm_lreadr"
890{
69f90b0b
MG
891 char buffer[READER_CHAR_NAME_MAX_SIZE];
892 SCM charname;
893 size_t charname_len, bytes_read;
889975e5
MG
894 scm_t_wchar cp;
895 int overflow;
69f90b0b 896 scm_t_port *pt;
889975e5 897
69f90b0b 898 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 899 if (overflow)
c1a0ba1c 900 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 901
69f90b0b 902 if (bytes_read == 0)
0f2d19dd 903 {
7337d56d
LC
904 chr = scm_getc (port);
905 if (chr == EOF)
906 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
907 "while reading character", SCM_EOL);
908
909 /* CHR must be a token delimiter, like a whitespace. */
910 return (SCM_MAKE_CHAR (chr));
0f2d19dd 911 }
7337d56d 912
69f90b0b 913 pt = SCM_PTAB_ENTRY (port);
7337d56d 914
69f90b0b
MG
915 /* Simple ASCII characters can be processed immediately. Also, simple
916 ISO-8859-1 characters can be processed immediately if the encoding for this
917 port is ISO-8859-1. */
918 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
919 {
920 SCM_COL (port) += 1;
921 return SCM_MAKE_CHAR (buffer[0]);
922 }
923
924 /* Otherwise, convert the buffer into a proper scheme string for
925 processing. */
926 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
927 pt->ilseq_handler);
928 charname_len = scm_i_string_length (charname);
929 SCM_COL (port) += charname_len;
889975e5 930 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
931 if (charname_len == 1)
932 return SCM_MAKE_CHAR (cp);
933
934 /* Ignore dotted circles, which may be used to keep combining characters from
935 combining with the backslash in #\charname. */
0dcd7e61
MG
936 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
937 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
938
889975e5 939 if (cp >= '0' && cp < '8')
7337d56d
LC
940 {
941 /* Dirk:FIXME:: This type of character syntax is not R5RS
942 * compliant. Further, it should be verified that the constant
0ffc78e3 943 * does only consist of octal digits. */
889975e5 944 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 945 if (SCM_I_INUMP (p))
0ffc78e3 946 {
e25f3727 947 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
948 if (SCM_IS_UNICODE_CHAR (c))
949 return SCM_MAKE_CHAR (c);
950 else
0f3a70cf 951 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
952 "out-of-range octal character escape: ~a",
953 scm_list_1 (charname));
954 }
7337d56d
LC
955 }
956
0f3a70cf 957 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
958 {
959 SCM p;
0f3a70cf 960
dea901d6
MG
961 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
962 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
963 scm_from_uint (16));
964 if (SCM_I_INUMP (p))
965 {
e25f3727 966 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
967 if (SCM_IS_UNICODE_CHAR (c))
968 return SCM_MAKE_CHAR (c);
969 else
970 scm_i_input_error (FUNC_NAME, port,
971 "out-of-range hex character escape: ~a",
972 scm_list_1 (charname));
973 }
974 }
975
889975e5
MG
976 /* The names of characters should never have non-Latin1
977 characters. */
978 if (scm_i_is_narrow_string (charname)
979 || scm_i_try_narrow_string (charname))
4769c9db
AW
980 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
981 charname_len);
982 if (scm_is_true (ch))
983 return ch;
984 }
7337d56d 985
7337d56d 986 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 987 scm_list_1 (charname));
7337d56d
LC
988
989 return SCM_UNSPECIFIED;
0f2d19dd 990}
db4b4ca6 991#undef FUNC_NAME
0f2d19dd 992
7337d56d
LC
993static inline SCM
994scm_read_keyword (int chr, SCM port)
995{
996 SCM symbol;
997
998 /* Read the symbol that comprises the keyword. Doing this instead of
999 invoking a specific symbol reader function allows `scm_read_keyword ()'
1000 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1001
7337d56d
LC
1002 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1003 symbol = scm_read_expression (port);
1004 if (!scm_is_symbol (symbol))
7f74cf9a 1005 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1006 "keyword prefix `~a' not followed by a symbol: ~s",
1007 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1008
1009 return (scm_symbol_to_keyword (symbol));
1010}
1011
1012static inline SCM
1013scm_read_vector (int chr, SCM port)
09a4f039 1014{
7337d56d
LC
1015 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016 guarantee that it's going to do what we want. After all, this is an
1017 implementation detail of `scm_read_vector ()', not a desirable
1018 property. */
1019 return (scm_vector (scm_read_sexp (chr, port)));
1020}
09a4f039 1021
7337d56d
LC
1022static inline SCM
1023scm_read_srfi4_vector (int chr, SCM port)
1024{
1025 return scm_i_read_array (port, chr);
1026}
1027
0ba0b384 1028static SCM
889975e5 1029scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
1030{
1031 chr = scm_getc (port);
1032 if (chr != 'u')
1033 goto syntax;
1034
1035 chr = scm_getc (port);
1036 if (chr != '8')
1037 goto syntax;
1038
1039 chr = scm_getc (port);
1040 if (chr != '(')
1041 goto syntax;
1042
1043 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1044
1045 syntax:
1046 scm_i_input_error ("read_bytevector", port,
1047 "invalid bytevector prefix",
1048 SCM_MAKE_CHAR (chr));
1049 return SCM_UNSPECIFIED;
1050}
1051
7337d56d 1052static SCM
889975e5 1053scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
1054{
1055 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1056 terribly inefficient but who cares? */
1057 SCM s_bits = SCM_EOL;
1058
1059 for (chr = scm_getc (port);
1060 (chr != EOF) && ((chr == '0') || (chr == '1'));
1061 chr = scm_getc (port))
09a4f039 1062 {
7337d56d 1063 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1064 }
7337d56d
LC
1065
1066 if (chr != EOF)
1067 scm_ungetc (chr, port);
1068
1069 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1070}
1071
1072static inline SCM
889975e5 1073scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1074{
1075 int bang_seen = 0;
1076
1077 for (;;)
09a4f039 1078 {
58b1db5f 1079 int c = scm_getc (port);
62850ef3 1080
7337d56d
LC
1081 if (c == EOF)
1082 scm_i_input_error ("skip_block_comment", port,
1083 "unterminated `#! ... !#' comment", SCM_EOL);
1084
1085 if (c == '!')
1086 bang_seen = 1;
1087 else if (c == '#' && bang_seen)
1088 break;
1089 else
1090 bang_seen = 0;
1091 }
1092
1093 return SCM_UNSPECIFIED;
1094}
1095
d7fcaec3 1096static SCM
911b03b2
JG
1097scm_read_shebang (scm_t_wchar chr, SCM port)
1098{
1099 int c = 0;
1100 if ((c = scm_get_byte_or_eof (port)) != 'r')
1101 {
1102 scm_ungetc (c, port);
1103 return scm_read_scsh_block_comment (chr, port);
1104 }
1105 if ((c = scm_get_byte_or_eof (port)) != '6')
1106 {
1107 scm_ungetc (c, port);
1108 scm_ungetc ('r', port);
1109 return scm_read_scsh_block_comment (chr, port);
1110 }
1111 if ((c = scm_get_byte_or_eof (port)) != 'r')
1112 {
1113 scm_ungetc (c, port);
1114 scm_ungetc ('6', port);
1115 scm_ungetc ('r', port);
1116 return scm_read_scsh_block_comment (chr, port);
1117 }
1118 if ((c = scm_get_byte_or_eof (port)) != 's')
1119 {
1120 scm_ungetc (c, port);
1121 scm_ungetc ('r', port);
1122 scm_ungetc ('6', port);
1123 scm_ungetc ('r', port);
1124 return scm_read_scsh_block_comment (chr, port);
1125 }
1126
1127 return SCM_UNSPECIFIED;
1128}
1129
620c8965
LC
1130static SCM
1131scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1132{
1133 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134 nested. So care must be taken. */
1135 int nesting_level = 1;
1136 int opening_seen = 0, closing_seen = 0;
1137
1138 while (nesting_level > 0)
1139 {
1140 int c = scm_getc (port);
1141
1142 if (c == EOF)
cd169c5a 1143 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1144 "unterminated `#| ... |#' comment", SCM_EOL);
1145
1146 if (opening_seen)
1147 {
1148 if (c == '|')
1149 nesting_level++;
1150 opening_seen = 0;
1151 }
1152 else if (closing_seen)
1153 {
1154 if (c == '#')
1155 nesting_level--;
1156 closing_seen = 0;
1157 }
1158 else if (c == '|')
1159 closing_seen = 1;
1160 else if (c == '#')
1161 opening_seen = 1;
1162 else
1163 opening_seen = closing_seen = 0;
1164 }
1165
1166 return SCM_UNSPECIFIED;
1167}
1168
34f3d47d 1169static SCM
889975e5 1170scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1171{
889975e5 1172 scm_t_wchar c;
34f3d47d
AW
1173
1174 c = flush_ws (port, (char *) NULL);
1175 if (EOF == c)
1176 scm_i_input_error ("read_commented_expression", port,
1177 "no expression after #; comment", SCM_EOL);
1178 scm_ungetc (c, port);
1179 scm_read_expression (port);
1180 return SCM_UNSPECIFIED;
1181}
1182
7337d56d 1183static SCM
889975e5 1184scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1185{
1186 /* Guile's extended symbol read syntax looks like this:
1187
1188 #{This is all a symbol name}#
1189
1190 So here, CHR is expected to be `{'. */
d9527cfa 1191 int saw_brace = 0;
7337d56d 1192 size_t len = 0;
190d4b0d 1193 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1194
889975e5 1195 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1196
1197 while ((chr = scm_getc (port)) != EOF)
1198 {
1199 if (saw_brace)
09a4f039 1200 {
7337d56d
LC
1201 if (chr == '#')
1202 {
7337d56d
LC
1203 break;
1204 }
1205 else
1206 {
1207 saw_brace = 0;
889975e5 1208 scm_i_string_set_x (buf, len++, '}');
7337d56d 1209 }
09a4f039 1210 }
d9527cfa
AW
1211
1212 if (chr == '}')
7337d56d 1213 saw_brace = 1;
d9527cfa
AW
1214 else if (chr == '\\')
1215 {
1216 /* It used to be that print.c would print extended-read-syntax
1217 symbols with backslashes before "non-standard" chars, but
1218 this routine wouldn't do anything with those escapes.
1219 Bummer. What we've done is to change print.c to output
1220 R6RS hex escapes for those characters, relying on the fact
1221 that the extended read syntax would never put a `\' before
1222 an `x'. For now, we just ignore other instances of
1223 backslash in the string. */
1224 switch ((chr = scm_getc (port)))
1225 {
1226 case EOF:
1227 goto done;
1228 case 'x':
1229 {
1230 scm_t_wchar c;
1231
1232 SCM_READ_HEX_ESCAPE (10, ';');
1233 scm_i_string_set_x (buf, len++, c);
1234 break;
1235
1236 str_eof:
1237 chr = EOF;
1238 goto done;
1239
1240 bad_escaped:
1241 scm_i_string_stop_writing ();
1242 scm_i_input_error ("scm_read_extended_symbol", port,
1243 "illegal character in escape sequence: ~S",
1244 scm_list_1 (SCM_MAKE_CHAR (c)));
1245 break;
1246 }
1247 default:
1248 scm_i_string_set_x (buf, len++, chr);
1249 break;
1250 }
1251 }
7337d56d 1252 else
d9527cfa 1253 scm_i_string_set_x (buf, len++, chr);
62850ef3 1254
889975e5 1255 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1256 {
7f991c7d
LC
1257 SCM addy;
1258
889975e5 1259 scm_i_string_stop_writing ();
190d4b0d 1260 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1261 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1262 len = 0;
889975e5 1263 buf = scm_i_string_start_writing (buf);
7337d56d 1264 }
7337d56d 1265 }
d9527cfa
AW
1266
1267 done:
889975e5 1268 scm_i_string_stop_writing ();
d9527cfa
AW
1269 if (chr == EOF)
1270 scm_i_input_error ("scm_read_extended_symbol", port,
1271 "end of file while reading symbol", SCM_EOL);
7337d56d 1272
889975e5 1273 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1274}
1275
1276
1277\f
1278/* Top-level token readers, i.e., dispatchers. */
1279
1280static SCM
1281scm_read_sharp_extension (int chr, SCM port)
1282{
1283 SCM proc;
1284
1285 proc = scm_get_hash_procedure (chr);
1286 if (scm_is_true (scm_procedure_p (proc)))
1287 {
1288 long line = SCM_LINUM (port);
1289 int column = SCM_COL (port) - 2;
1290 SCM got;
1291
1292 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14
AW
1293
1294 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1295 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1296
1297 return got;
09a4f039 1298 }
7337d56d
LC
1299
1300 return SCM_UNSPECIFIED;
1301}
1302
1303/* The reader for the sharp `#' character. It basically dispatches reads
1304 among the above token readers. */
1305static SCM
889975e5 1306scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1307#define FUNC_NAME "scm_lreadr"
1308{
1309 SCM result;
1310
1311 chr = scm_getc (port);
1312
1313 result = scm_read_sharp_extension (chr, port);
1314 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1315 return result;
1316
1317 switch (chr)
1318 {
1319 case '\\':
1320 return (scm_read_character (chr, port));
1321 case '(':
1322 return (scm_read_vector (chr, port));
1323 case 's':
1324 case 'u':
1325 case 'f':
df941b5b 1326 case 'c':
7337d56d
LC
1327 /* This one may return either a boolean or an SRFI-4 vector. */
1328 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1329 case 'v':
1330 return (scm_read_bytevector (chr, port));
7337d56d
LC
1331 case '*':
1332 return (scm_read_guile_bit_vector (chr, port));
1333 case 't':
1334 case 'T':
1335 case 'F':
1336 /* This one may return either a boolean or an SRFI-4 vector. */
1337 return (scm_read_boolean (chr, port));
1338 case ':':
1339 return (scm_read_keyword (chr, port));
1340 case '0': case '1': case '2': case '3': case '4':
1341 case '5': case '6': case '7': case '8': case '9':
1342 case '@':
1343#if SCM_ENABLE_DEPRECATED
1344 /* See below for 'i' and 'e'. */
1345 case 'a':
7337d56d
LC
1346 case 'y':
1347 case 'h':
1348 case 'l':
1349#endif
1350 return (scm_i_read_array (port, chr));
1351
1352 case 'i':
1353 case 'e':
1354#if SCM_ENABLE_DEPRECATED
1355 {
1356 /* When next char is '(', it really is an old-style
1357 uniform array. */
889975e5 1358 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1359 if (next_c != EOF)
1360 scm_ungetc (next_c, port);
1361 if (next_c == '(')
1362 return scm_i_read_array (port, chr);
1363 /* Fall through. */
1364 }
1365#endif
1366 case 'b':
1367 case 'B':
1368 case 'o':
1369 case 'O':
1370 case 'd':
1371 case 'D':
1372 case 'x':
1373 case 'X':
1374 case 'I':
1375 case 'E':
1376 return (scm_read_number_and_radix (chr, port));
1377 case '{':
1378 return (scm_read_extended_symbol (chr, port));
1379 case '!':
911b03b2 1380 return (scm_read_shebang (chr, port));
34f3d47d
AW
1381 case ';':
1382 return (scm_read_commented_expression (chr, port));
1383 case '`':
1384 case '\'':
1385 case ',':
1386 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1387 case 'n':
1388 return (scm_read_nil (chr, port));
7337d56d
LC
1389 default:
1390 result = scm_read_sharp_extension (chr, port);
1391 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1392 {
1393 /* To remain compatible with 1.8 and earlier, the following
1394 characters have lower precedence than `read-hash-extend'
1395 characters. */
1396 switch (chr)
1397 {
1398 case '|':
1399 return scm_read_r6rs_block_comment (chr, port);
1400 default:
1401 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1402 scm_list_1 (SCM_MAKE_CHAR (chr)));
1403 }
1404 }
7337d56d
LC
1405 else
1406 return result;
1407 }
1408
1409 return SCM_UNSPECIFIED;
1410}
1411#undef FUNC_NAME
1412
1413static SCM
1414scm_read_expression (SCM port)
1415#define FUNC_NAME "scm_read_expression"
1416{
1417 while (1)
1418 {
889975e5 1419 register scm_t_wchar chr;
7337d56d
LC
1420
1421 chr = scm_getc (port);
1422
1423 switch (chr)
1424 {
1425 case SCM_WHITE_SPACES:
1426 case SCM_LINE_INCREMENTORS:
1427 break;
1428 case ';':
1429 (void) scm_read_semicolon_comment (chr, port);
1430 break;
5afa815c
AW
1431 case '[':
1432 if (!SCM_SQUARE_BRACKETS_P)
1433 return (scm_read_mixed_case_symbol (chr, port));
1434 /* otherwise fall through */
7337d56d
LC
1435 case '(':
1436 return (scm_read_sexp (chr, port));
1437 case '"':
1438 return (scm_read_string (chr, port));
1439 case '\'':
1440 case '`':
1441 case ',':
1442 return (scm_read_quote (chr, port));
1443 case '#':
1444 {
1445 SCM result;
1446 result = scm_read_sharp (chr, port);
1447 if (scm_is_eq (result, SCM_UNSPECIFIED))
1448 /* We read a comment or some such. */
1449 break;
1450 else
1451 return result;
1452 }
1453 case ')':
1454 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1455 break;
a4e47229
MG
1456 case ']':
1457 if (SCM_SQUARE_BRACKETS_P)
1458 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1459 /* otherwise fall through */
7337d56d
LC
1460 case EOF:
1461 return SCM_EOF_VAL;
1462 case ':':
1463 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1464 return scm_symbol_to_keyword (scm_read_expression (port));
1465 /* Fall through. */
1466
1467 default:
1468 {
1469 if (((chr >= '0') && (chr <= '9'))
1470 || (strchr ("+-.", chr)))
1471 return (scm_read_number (chr, port));
1472 else
1473 return (scm_read_mixed_case_symbol (chr, port));
1474 }
1475 }
1476 }
1477}
1478#undef FUNC_NAME
1479
1480\f
1481/* Actual reader. */
1482
1483SCM_DEFINE (scm_read, "read", 0, 1, 0,
1484 (SCM port),
1485 "Read an s-expression from the input port @var{port}, or from\n"
1486 "the current input port if @var{port} is not specified.\n"
1487 "Any whitespace before the next token is discarded.")
1488#define FUNC_NAME s_scm_read
1489{
1490 int c;
1491
1492 if (SCM_UNBNDP (port))
1493 port = scm_current_input_port ();
1494 SCM_VALIDATE_OPINPORT (1, port);
1495
1496 c = flush_ws (port, (char *) NULL);
1497 if (EOF == c)
1498 return SCM_EOF_VAL;
1499 scm_ungetc (c, port);
1500
1501 return (scm_read_expression (port));
09a4f039 1502}
db4b4ca6 1503#undef FUNC_NAME
09a4f039 1504
0f2d19dd
JB
1505
1506\f
1507
14de3b42
GH
1508/* Manipulate the read-hash-procedures alist. This could be written in
1509 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1510SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1511 (SCM chr, SCM proc),
dc7fa443
MG
1512 "Install the procedure @var{proc} for reading expressions\n"
1513 "starting with the character sequence @code{#} and @var{chr}.\n"
1514 "@var{proc} will be called with two arguments: the character\n"
1515 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1516 "returned will be the return value of @code{read}. \n"
1517 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1518 )
1bbd0b84 1519#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1520{
fed9c9a2
GH
1521 SCM this;
1522 SCM prev;
1523
36284627 1524 SCM_VALIDATE_CHAR (1, chr);
7888309b 1525 SCM_ASSERT (scm_is_false (proc)
bc36d050 1526 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1527 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1528
14de3b42 1529 /* Check if chr is already in the alist. */
d458073b 1530 this = scm_i_read_hash_procedures_ref ();
14de3b42 1531 prev = SCM_BOOL_F;
fed9c9a2
GH
1532 while (1)
1533 {
d2e53ed6 1534 if (scm_is_null (this))
fed9c9a2
GH
1535 {
1536 /* not found, so add it to the beginning. */
7888309b 1537 if (scm_is_true (proc))
fed9c9a2 1538 {
d458073b
AR
1539 SCM new = scm_cons (scm_cons (chr, proc),
1540 scm_i_read_hash_procedures_ref ());
1541 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1542 }
1543 break;
1544 }
bc36d050 1545 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1546 {
1547 /* already in the alist. */
7888309b 1548 if (scm_is_false (proc))
14de3b42
GH
1549 {
1550 /* remove it. */
7888309b 1551 if (scm_is_false (prev))
14de3b42 1552 {
d458073b
AR
1553 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1554 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1555 }
1556 else
1557 scm_set_cdr_x (prev, SCM_CDR (this));
1558 }
fed9c9a2 1559 else
14de3b42
GH
1560 {
1561 /* replace it. */
1562 scm_set_cdr_x (SCM_CAR (this), proc);
1563 }
fed9c9a2
GH
1564 break;
1565 }
1566 prev = this;
1567 this = SCM_CDR (this);
1568 }
deca31e1 1569
deca31e1
GH
1570 return SCM_UNSPECIFIED;
1571}
1bbd0b84 1572#undef FUNC_NAME
0f2d19dd 1573
deca31e1
GH
1574/* Recover the read-hash procedure corresponding to char c. */
1575static SCM
6e8d25a6 1576scm_get_hash_procedure (int c)
deca31e1 1577{
d458073b 1578 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1579
deca31e1
GH
1580 while (1)
1581 {
d2e53ed6 1582 if (scm_is_null (rest))
deca31e1
GH
1583 return SCM_BOOL_F;
1584
7866a09b 1585 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1586 return SCM_CDAR (rest);
1587
1588 rest = SCM_CDR (rest);
1589 }
1590}
1cc91f1b 1591
889975e5
MG
1592#define SCM_ENCODING_SEARCH_SIZE (500)
1593
f8a1c9a8
LC
1594/* Search the first few hundred characters of a file for an Emacs-like coding
1595 declaration. Returns either NULL or a string whose storage has been
1596 allocated with `scm_gc_malloc ()'. */
889975e5 1597char *
f8a1c9a8 1598scm_i_scan_for_encoding (SCM port)
889975e5 1599{
d900843c 1600 scm_t_port *pt;
889975e5 1601 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1602 size_t bytes_read, encoding_length, i;
889975e5
MG
1603 char *encoding = NULL;
1604 int utf8_bom = 0;
daedbca7 1605 char *pos, *encoding_start;
889975e5
MG
1606 int in_comment;
1607
d900843c 1608 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1609
d900843c
AW
1610 if (pt->rw_active == SCM_PORT_WRITE)
1611 scm_flush (port);
49bb5bd3 1612
d900843c
AW
1613 if (pt->rw_random)
1614 pt->rw_active = SCM_PORT_READ;
1615
1616 if (pt->read_pos == pt->read_end)
1617 {
1618 /* We can use the read buffer, and thus avoid a seek. */
1619 if (scm_fill_input (port) == EOF)
1620 return NULL;
1621
1622 bytes_read = pt->read_end - pt->read_pos;
1623 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1624 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1625
1626 if (bytes_read <= 1)
1627 /* An unbuffered port -- don't scan. */
1628 return NULL;
1629
1630 memcpy (header, pt->read_pos, bytes_read);
1631 header[bytes_read] = '\0';
1632 }
1633 else
1634 {
1635 /* Try to read some bytes and then seek back. Not all ports
1636 support seeking back; and indeed some file ports (like
1637 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1638 check performed by SCM_FPORT_FDES---but fail to seek
1639 backwards. Hence this block comes second. We prefer to use
1640 the read buffer in-place. */
1641 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1642 return NULL;
1643
1644 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1645 header[bytes_read] = '\0';
1646 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1647 }
889975e5
MG
1648
1649 if (bytes_read > 3
1650 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1651 utf8_bom = 1;
1652
1653 /* search past "coding[:=]" */
1654 pos = header;
1655 while (1)
1656 {
1657 if ((pos = strstr(pos, "coding")) == NULL)
1658 return NULL;
1659
1660 pos += strlen("coding");
1661 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1662 (*pos == ':' || *pos == '='))
1663 {
1664 pos ++;
1665 break;
1666 }
1667 }
1668
1669 /* skip spaces */
1670 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1671 (*pos == ' ' || *pos == '\t'))
1672 pos ++;
1673
1674 /* grab the next token */
daedbca7 1675 encoding_start = pos;
889975e5 1676 i = 0;
daedbca7
MG
1677 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1678 && encoding_start + i - header < bytes_read
1679 && (isalnum ((int) encoding_start[i])
1680 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1681 i++;
1682
daedbca7
MG
1683 encoding_length = i;
1684 if (encoding_length == 0)
889975e5
MG
1685 return NULL;
1686
daedbca7
MG
1687 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1688 for (i = 0; i < encoding_length; i++)
026ed239 1689 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1690
1691 /* push backwards to make sure we were in a comment */
1692 in_comment = 0;
daedbca7
MG
1693 pos = encoding_start;
1694 while (pos >= header)
889975e5 1695 {
8a12aeb9
AW
1696 if (*pos == ';')
1697 {
1698 in_comment = 1;
1699 break;
1700 }
1701 else if (*pos == '\n' || pos == header)
889975e5
MG
1702 {
1703 /* This wasn't in a semicolon comment. Check for a
1704 hash-bang comment. */
1705 char *beg = strstr (header, "#!");
1706 char *end = strstr (header, "!#");
8a12aeb9 1707 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
1708 in_comment = 1;
1709 break;
1710 }
8a12aeb9
AW
1711 else
1712 {
1713 pos --;
1714 continue;
1715 }
889975e5
MG
1716 }
1717 if (!in_comment)
f8a1c9a8
LC
1718 /* This wasn't in a comment */
1719 return NULL;
1720
889975e5 1721 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1722 scm_misc_error (NULL,
889975e5
MG
1723 "the port input declares the encoding ~s but is encoded as UTF-8",
1724 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1725
889975e5
MG
1726 return encoding;
1727}
1728
1729SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1730 (SCM port),
a270e133 1731 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 1732 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
1733 "The coding declaration is of the form\n"
1734 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1735 "\n"
1736 "Returns a string containing the character encoding of the file\n"
1737 "if a declaration was found, or @code{#f} otherwise.\n")
1738#define FUNC_NAME s_scm_file_encoding
1739{
1740 char *enc;
1741 SCM s_enc;
f8a1c9a8 1742
d900843c
AW
1743 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1744
f8a1c9a8 1745 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1746 if (enc == NULL)
1747 return SCM_BOOL_F;
1748 else
1749 {
1750 s_enc = scm_from_locale_string (enc);
889975e5
MG
1751 return s_enc;
1752 }
f8a1c9a8 1753
889975e5
MG
1754 return SCM_BOOL_F;
1755}
1756#undef FUNC_NAME
1757
0f2d19dd
JB
1758void
1759scm_init_read ()
0f2d19dd 1760{
d458073b
AR
1761 SCM read_hash_procs;
1762
1763 read_hash_procs = scm_make_fluid ();
1764 scm_fluid_set_x (read_hash_procs, SCM_EOL);
1765
1766 scm_i_read_hash_procedures =
1767 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 1768
62560650 1769 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1770#include "libguile/read.x"
0f2d19dd 1771}
89e00824
ML
1772
1773/*
1774 Local Variables:
1775 c-file-style: "gnu"
1776 End:
1777*/