i18: avoid freelocale (NULL)
[bpt/guile.git] / libguile / read.c
CommitLineData
4a655e50 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
684d664e 32#include <unictype.h>
7337d56d 33
a0599745 34#include "libguile/_scm.h"
0ba0b384 35#include "libguile/bytevectors.h"
a0599745
MD
36#include "libguile/chars.h"
37#include "libguile/eval.h"
2fa901a5 38#include "libguile/arrays.h"
cf396142 39#include "libguile/bitvectors.h"
a0599745
MD
40#include "libguile/keywords.h"
41#include "libguile/alist.h"
42#include "libguile/srcprop.h"
43#include "libguile/hashtab.h"
44#include "libguile/hash.h"
45#include "libguile/ports.h"
49bb5bd3 46#include "libguile/fports.h"
a0599745
MD
47#include "libguile/root.h"
48#include "libguile/strings.h"
ba1b2226 49#include "libguile/strports.h"
a0599745 50#include "libguile/vectors.h"
a0599745 51#include "libguile/validate.h"
a4022e69 52#include "libguile/srfi-4.h"
7337d56d 53#include "libguile/srfi-13.h"
ba1b2226 54
a0599745 55#include "libguile/read.h"
22fc179a
HWN
56#include "libguile/private-options.h"
57
0f2d19dd
JB
58
59\f
60
5bf6a6f0 61SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 62SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 63SCM_SYMBOL (scm_keyword_postfix, "postfix");
7c4aad9c 64SCM_SYMBOL (sym_nil, "nil");
c7733771 65
92c2555f 66scm_t_option scm_read_opts[] = {
b7ff98dd
MD
67 { SCM_OPTION_BOOLEAN, "copy", 0,
68 "Copy source code expressions." },
6c51a40a 69 { SCM_OPTION_BOOLEAN, "positions", 1,
deca31e1
GH
70 "Record positions of source code expressions." },
71 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 72 "Convert symbols to lower case."},
210c0325 73 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
904fabb6 74 "Style of keyword recognition: #f, 'prefix or 'postfix."},
dea901d6
MG
75 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
76 "Use R6RS variable-length character and string hex escapes."},
5afa815c
AW
77 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
78 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
684d664e
AW
79 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
80 "In strings, consume leading whitespace after an escaped end-of-line."},
62560650 81 { 0, },
a16f6fe7
MD
82};
83
39e8f371
HWN
84/*
85 Give meaningful error messages for errors
86
87 We use the format
88
ba1b2226 89 FILE:LINE:COL: MESSAGE
39e8f371
HWN
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
39e8f371
HWN
95 */
96
97
a4022e69
MV
98void
99scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
ba1b2226 101{
29a837fd
MV
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 105
29a837fd 106 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
272632a6 109 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 110 scm_list_4 (fn,
b3aa4626 111 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 112 scm_from_int (SCM_COL (port) + 1),
272632a6 113 scm_from_locale_string (message)));
ba1b2226
HWN
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
4a655e50 117 scm_error_scm (scm_from_latin1_symbol ("read-error"),
a4022e69 118 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 119 string,
dd72382c 120 arg,
ba1b2226
HWN
121 SCM_BOOL_F);
122}
39e8f371
HWN
123
124
a1ec6916 125SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 126 (SCM setting),
dc7fa443
MG
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 130#define FUNC_NAME s_scm_read_options
a16f6fe7 131{
b7ff98dd
MD
132 SCM ans = scm_options (setting,
133 scm_read_opts,
1bbd0b84 134 FUNC_NAME);
b7ff98dd
MD
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
137 return ans;
138}
1bbd0b84 139#undef FUNC_NAME
a16f6fe7 140
d458073b
AR
141/* A fluid referring to an association list mapping extra hash
142 characters to procedures. */
143static SCM *scm_i_read_hash_procedures;
deca31e1 144
d458073b
AR
145static inline SCM
146scm_i_read_hash_procedures_ref (void)
147{
148 return scm_fluid_ref (*scm_i_read_hash_procedures);
149}
150
151static inline void
152scm_i_read_hash_procedures_set_x (SCM value)
153{
154 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
155}
0f2d19dd 156
7337d56d
LC
157\f
158/* Token readers. */
0f2d19dd 159
0f2d19dd 160
7337d56d
LC
161/* Size of the C buffer used to read symbols and numbers. */
162#define READER_BUFFER_SIZE 128
0f2d19dd 163
7337d56d
LC
164/* Size of the C buffer used to read strings. */
165#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 166
7337d56d
LC
167/* The maximum size of Scheme character names. */
168#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 169
94115ae3 170
7337d56d
LC
171/* `isblank' is only in C99. */
172#define CHAR_IS_BLANK_(_chr) \
173 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 174 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
175
176#ifdef MSDOS
177# define CHAR_IS_BLANK(_chr) \
178 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
179#else
180# define CHAR_IS_BLANK CHAR_IS_BLANK_
181#endif
182
183
184/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
185 structure''). */
186#define CHAR_IS_R5RS_DELIMITER(c) \
187 (CHAR_IS_BLANK (c) \
5afa815c
AW
188 || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
189 || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
7337d56d
LC
190
191#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
192
193/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
194 Structure''. */
195#define CHAR_IS_EXPONENT_MARKER(_chr) \
196 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
197 || ((_chr) == 'd') || ((_chr) == 'l'))
198
454866e0 199/* Read an SCSH block comment. */
620c8965
LC
200static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
201static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
202static SCM scm_read_commented_expression (scm_t_wchar, SCM);
911b03b2 203static SCM scm_read_shebang (scm_t_wchar, SCM);
620c8965 204static SCM scm_get_hash_procedure (int);
454866e0 205
69f90b0b
MG
206/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
207 result in the pre-allocated buffer BUF. Return zero if the whole token has
208 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
209 bytes actually read. */
7337d56d 210static inline int
69f90b0b
MG
211read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
212 {
213 *read = 0;
0520c320 214
69f90b0b
MG
215 while (*read < buf_size)
216 {
217 int chr;
7337d56d 218
69f90b0b 219 chr = scm_get_byte_or_eof (port);
889975e5 220
69f90b0b
MG
221 if (chr == EOF)
222 return 0;
223 else if (CHAR_IS_DELIMITER (chr))
224 {
225 scm_unget_byte (chr, port);
226 return 0;
227 }
228 else
229 {
230 *buf = (char) chr;
231 buf++, (*read)++;
232 }
233 }
889975e5 234
69f90b0b
MG
235 return 1;
236 }
889975e5 237
69f90b0b
MG
238/* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
239 result in the pre-allocated buffer BUFFER, if the whole token has fewer than
240 BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
241 caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
242 will be set the number of bytes actually read. */
243static int
244read_complete_token (SCM port, char *buffer, const size_t buffer_size,
245 char **overflow_buffer, size_t *read)
246{
247 int overflow = 0;
248 size_t bytes_read, overflow_size;
7337d56d 249
69f90b0b
MG
250 *overflow_buffer = NULL;
251 overflow_size = 0;
1cc91f1b 252
69f90b0b
MG
253 do
254 {
255 overflow = read_token (port, buffer, buffer_size, &bytes_read);
256 if (bytes_read == 0)
257 break;
258 if (overflow || overflow_size != 0)
259 {
260 if (overflow_size == 0)
261 {
262 *overflow_buffer = scm_malloc (bytes_read);
263 memcpy (*overflow_buffer, buffer, bytes_read);
264 overflow_size = bytes_read;
265 }
266 else
267 {
268 *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
269 memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
270 overflow_size += bytes_read;
271 }
272 }
889975e5 273 }
69f90b0b 274 while (overflow);
889975e5 275
69f90b0b
MG
276 if (overflow_size)
277 *read = overflow_size;
b521d265 278 else
69f90b0b
MG
279 *read = bytes_read;
280
281 return (overflow_size != 0);
889975e5 282}
7337d56d
LC
283
284/* Skip whitespace from PORT and return the first non-whitespace character
285 read. Raise an error on end-of-file. */
286static int
287flush_ws (SCM port, const char *eoferr)
0f2d19dd 288{
889975e5 289 register scm_t_wchar c;
0f2d19dd 290 while (1)
b7f3516f 291 switch (c = scm_getc (port))
0f2d19dd
JB
292 {
293 case EOF:
294 goteof:
295 if (eoferr)
d156d3b7 296 {
a4022e69
MV
297 scm_i_input_error (eoferr,
298 port,
299 "end of file",
300 SCM_EOL);
d156d3b7 301 }
0f2d19dd 302 return c;
7337d56d 303
0f2d19dd
JB
304 case ';':
305 lp:
b7f3516f 306 switch (c = scm_getc (port))
0f2d19dd
JB
307 {
308 case EOF:
309 goto goteof;
310 default:
311 goto lp;
312 case SCM_LINE_INCREMENTORS:
313 break;
314 }
315 break;
7337d56d 316
454866e0
LC
317 case '#':
318 switch (c = scm_getc (port))
319 {
320 case EOF:
321 eoferr = "read_sharp";
322 goto goteof;
323 case '!':
911b03b2 324 scm_read_shebang (c, port);
454866e0 325 break;
34f3d47d
AW
326 case ';':
327 scm_read_commented_expression (c, port);
328 break;
620c8965
LC
329 case '|':
330 if (scm_is_false (scm_get_hash_procedure (c)))
331 {
332 scm_read_r6rs_block_comment (c, port);
333 break;
334 }
335 /* fall through */
454866e0
LC
336 default:
337 scm_ungetc (c, port);
338 return '#';
339 }
340 break;
341
0f2d19dd 342 case SCM_LINE_INCREMENTORS:
0f2d19dd 343 case SCM_SINGLE_SPACES:
0f2d19dd 344 case '\t':
0f2d19dd 345 break;
7337d56d 346
0f2d19dd
JB
347 default:
348 return c;
349 }
7337d56d
LC
350
351 return 0;
0f2d19dd
JB
352}
353
354
7337d56d
LC
355\f
356/* Token readers. */
1cc91f1b 357
7337d56d
LC
358static SCM scm_read_expression (SCM port);
359static SCM scm_read_sharp (int chr, SCM port);
0f2d19dd
JB
360
361
09a4f039 362static SCM
889975e5 363scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 364#define FUNC_NAME "scm_i_lreadparen"
09a4f039 365{
26c8cc14
AW
366 int c;
367 SCM tmp, tl, ans = SCM_EOL;
5afa815c 368 const int terminating_char = ((chr == '[') ? ']' : ')');
7337d56d
LC
369
370 /* Need to capture line and column numbers here. */
371 long line = SCM_LINUM (port);
372 int column = SCM_COL (port) - 1;
f9c68a47 373
7337d56d
LC
374 c = flush_ws (port, FUNC_NAME);
375 if (terminating_char == c)
376 return SCM_EOL;
f9c68a47 377
7337d56d 378 scm_ungetc (c, port);
1f7945a7
AW
379 tmp = scm_read_expression (port);
380
381 /* Note that it is possible for scm_read_expression to return
382 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
383 check that it's a real dot by checking `c'. */
384 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
7337d56d
LC
385 {
386 ans = scm_read_expression (port);
387 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
388 scm_i_input_error (FUNC_NAME, port, "missing close paren",
389 SCM_EOL);
390 return ans;
391 }
1cc91f1b 392
7337d56d
LC
393 /* Build the head of the list structure. */
394 ans = tl = scm_cons (tmp, SCM_EOL);
395
7337d56d 396 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 397 {
7337d56d 398 SCM new_tail;
0f2d19dd 399
5b69315e
AW
400 if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
401 scm_i_input_error (FUNC_NAME, port,
402 "in pair: mismatched close paren: ~A",
403 scm_list_1 (SCM_MAKE_CHAR (c)));
404
7337d56d 405 scm_ungetc (c, port);
5b69315e
AW
406 tmp = scm_read_expression (port);
407
1f7945a7
AW
408 /* See above note about scm_sym_dot. */
409 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
0f2d19dd 410 {
89f88612 411 SCM_SETCDR (tl, scm_read_expression (port));
7337d56d 412
7337d56d
LC
413 c = flush_ws (port, FUNC_NAME);
414 if (terminating_char != c)
415 scm_i_input_error (FUNC_NAME, port,
416 "in pair: missing close paren", SCM_EOL);
417 goto exit;
0f2d19dd 418 }
b858464a 419
7337d56d
LC
420 new_tail = scm_cons (tmp, SCM_EOL);
421 SCM_SETCDR (tl, new_tail);
422 tl = new_tail;
7337d56d 423 }
0f2d19dd 424
7337d56d
LC
425 exit:
426 if (SCM_RECORD_POSITIONS_P)
26c8cc14
AW
427 scm_i_set_source_properties_x (ans, line, column, SCM_FILENAME (port));
428
7337d56d
LC
429 return ans;
430}
431#undef FUNC_NAME
a4022e69 432
c5661d28
MG
433
434/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
435 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
436 found. */
437#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
438 do \
439 { \
440 scm_t_wchar a; \
441 size_t i = 0; \
442 c = 0; \
443 while (i < ndigits) \
444 { \
445 a = scm_getc (port); \
446 if (a == EOF) \
447 goto str_eof; \
448 if (terminator \
449 && (a == (scm_t_wchar) terminator) \
450 && (i > 0)) \
451 break; \
452 if ('0' <= a && a <= '9') \
453 a -= '0'; \
454 else if ('A' <= a && a <= 'F') \
455 a = a - 'A' + 10; \
456 else if ('a' <= a && a <= 'f') \
457 a = a - 'a' + 10; \
458 else \
459 { \
460 c = a; \
461 goto bad_escaped; \
462 } \
463 c = c * 16 + a; \
464 i ++; \
465 } \
c5661d28
MG
466 } while (0)
467
684d664e
AW
468static void
469skip_intraline_whitespace (SCM port)
470{
471 scm_t_wchar c;
472
473 do
474 {
475 c = scm_getc (port);
476 if (c == EOF)
477 return;
478 }
479 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
480
481 scm_ungetc (c, port);
482}
483
7337d56d
LC
484static SCM
485scm_read_string (int chr, SCM port)
486#define FUNC_NAME "scm_lreadr"
487{
488 /* For strings smaller than C_STR, this function creates only one Scheme
489 object (the string returned). */
0f2d19dd 490
7337d56d 491 SCM str = SCM_BOOL_F;
7337d56d 492 unsigned c_str_len = 0;
9c44cd45 493 scm_t_wchar c;
eb42ff25 494
190d4b0d 495 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
7337d56d
LC
496 while ('"' != (c = scm_getc (port)))
497 {
498 if (c == EOF)
9c44cd45
MG
499 {
500 str_eof:
501 scm_i_input_error (FUNC_NAME, port,
502 "end of file in string constant", SCM_EOL);
503 }
0f2d19dd 504
9c44cd45
MG
505 if (c_str_len + 1 >= scm_i_string_length (str))
506 {
190d4b0d 507 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
0f2d19dd 508
9c44cd45
MG
509 str = scm_string_append (scm_list_2 (str, addy));
510 }
0f2d19dd 511
7337d56d 512 if (c == '\\')
9c44cd45
MG
513 {
514 switch (c = scm_getc (port))
515 {
516 case EOF:
517 goto str_eof;
518 case '"':
519 case '\\':
520 break;
9c44cd45 521 case '\n':
684d664e
AW
522 if (SCM_HUNGRY_EOL_ESCAPES_P)
523 skip_intraline_whitespace (port);
9c44cd45
MG
524 continue;
525 case '0':
526 c = '\0';
527 break;
528 case 'f':
529 c = '\f';
530 break;
531 case 'n':
532 c = '\n';
533 break;
534 case 'r':
535 c = '\r';
536 break;
537 case 't':
538 c = '\t';
539 break;
540 case 'a':
541 c = '\007';
542 break;
543 case 'v':
544 c = '\v';
545 break;
67a4a16d
MG
546 case 'b':
547 c = '\010';
548 break;
9c44cd45 549 case 'x':
dea901d6
MG
550 if (SCM_R6RS_ESCAPES_P)
551 SCM_READ_HEX_ESCAPE (10, ';');
552 else
553 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 554 break;
9c44cd45 555 case 'u':
898a0b5a
MG
556 if (!SCM_R6RS_ESCAPES_P)
557 {
558 SCM_READ_HEX_ESCAPE (4, '\0');
559 break;
560 }
9c44cd45 561 case 'U':
898a0b5a
MG
562 if (!SCM_R6RS_ESCAPES_P)
563 {
564 SCM_READ_HEX_ESCAPE (6, '\0');
565 break;
566 }
9c44cd45
MG
567 default:
568 bad_escaped:
569 scm_i_input_error (FUNC_NAME, port,
570 "illegal character in escape sequence: ~S",
571 scm_list_1 (SCM_MAKE_CHAR (c)));
572 }
573 }
574 str = scm_i_string_start_writing (str);
575 scm_i_string_set_x (str, c_str_len++, c);
576 scm_i_string_stop_writing ();
7337d56d 577 }
f13b4400 578
7337d56d
LC
579 if (c_str_len > 0)
580 {
9c44cd45 581 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 582 }
69f90b0b 583
9c44cd45 584 return scm_nullstr;
0f2d19dd 585}
db4b4ca6
DH
586#undef FUNC_NAME
587
0f2d19dd 588
7337d56d 589static SCM
889975e5 590scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 591{
69f90b0b
MG
592 SCM result, str = SCM_EOL;
593 char buffer[READER_BUFFER_SIZE];
594 char *overflow_buffer = NULL;
595 size_t bytes_read;
596 int overflow;
597 scm_t_port *pt = SCM_PTAB_ENTRY (port);
0f2d19dd 598
7337d56d 599 scm_ungetc (chr, port);
69f90b0b
MG
600 overflow = read_complete_token (port, buffer, sizeof (buffer),
601 &overflow_buffer, &bytes_read);
602
603 if (!overflow)
604 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
605 else
606 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
607 pt->ilseq_handler);
608
609 result = scm_string_to_number (str, SCM_UNDEFINED);
889975e5 610 if (!scm_is_true (result))
69f90b0b
MG
611 {
612 /* Return a symbol instead of a number */
613 if (SCM_CASE_INSENSITIVE_P)
614 str = scm_string_downcase_x (str);
615 result = scm_string_to_symbol (str);
616 }
0f2d19dd 617
69f90b0b
MG
618 if (overflow)
619 free (overflow_buffer);
620 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
621 return result;
622}
0f2d19dd 623
7337d56d 624static SCM
889975e5 625scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 626{
889975e5
MG
627 SCM result;
628 int ends_with_colon = 0;
69f90b0b 629 size_t bytes_read;
ef4cbc08 630 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
69f90b0b
MG
631 int overflow;
632 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
633 scm_t_port *pt = SCM_PTAB_ENTRY (port);
634 SCM str;
7337d56d
LC
635
636 scm_ungetc (chr, port);
69f90b0b
MG
637 overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
638 &overflow_buffer, &bytes_read);
639 if (bytes_read > 0)
640 {
641 if (!overflow)
642 ends_with_colon = buffer[bytes_read - 1] == ':';
643 else
644 ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
645 }
ef4cbc08 646
69f90b0b
MG
647 if (postfix && ends_with_colon && (bytes_read > 1))
648 {
649 if (!overflow)
650 str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
651 else
652 str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
653 pt->ilseq_handler);
654
655 if (SCM_CASE_INSENSITIVE_P)
656 str = scm_string_downcase_x (str);
657 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
658 }
7337d56d 659 else
69f90b0b
MG
660 {
661 if (!overflow)
662 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
663 else
664 str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
665 pt->ilseq_handler);
666
667 if (SCM_CASE_INSENSITIVE_P)
668 str = scm_string_downcase_x (str);
669 result = scm_string_to_symbol (str);
670 }
7337d56d 671
69f90b0b
MG
672 if (overflow)
673 free (overflow_buffer);
674 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
675 return result;
676}
677
678static SCM
889975e5 679scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
680#define FUNC_NAME "scm_lreadr"
681{
889975e5 682 SCM result;
7337d56d 683 size_t read;
69f90b0b
MG
684 char buffer[READER_BUFFER_SIZE], *overflow_buffer;
685 int overflow;
7337d56d 686 unsigned int radix;
69f90b0b
MG
687 SCM str;
688 scm_t_port *pt;
7337d56d
LC
689
690 switch (chr)
691 {
692 case 'B':
693 case 'b':
694 radix = 2;
695 break;
696
697 case 'o':
698 case 'O':
699 radix = 8;
700 break;
701
702 case 'd':
703 case 'D':
704 radix = 10;
705 break;
706
707 case 'x':
708 case 'X':
709 radix = 16;
710 break;
711
712 default:
713 scm_ungetc (chr, port);
714 scm_ungetc ('#', port);
715 radix = 10;
716 }
717
69f90b0b
MG
718 overflow = read_complete_token (port, buffer, sizeof (buffer),
719 &overflow_buffer, &read);
720
721 pt = SCM_PTAB_ENTRY (port);
722 if (!overflow)
723 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
724 else
725 str = scm_from_stringn (overflow_buffer, read, pt->encoding,
726 pt->ilseq_handler);
727
728 result = scm_string_to_number (str, scm_from_uint (radix));
729
730 if (overflow)
731 free (overflow_buffer);
732
733 SCM_COL (port) += scm_i_string_length (str);
7337d56d
LC
734
735 if (scm_is_true (result))
736 return result;
737
738 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
739
740 return SCM_BOOL_F;
741}
742#undef FUNC_NAME
743
744static SCM
745scm_read_quote (int chr, SCM port)
746{
747 SCM p;
492faee1
LC
748 long line = SCM_LINUM (port);
749 int column = SCM_COL (port) - 1;
7337d56d
LC
750
751 switch (chr)
752 {
753 case '`':
754 p = scm_sym_quasiquote;
755 break;
756
757 case '\'':
758 p = scm_sym_quote;
759 break;
760
761 case ',':
762 {
889975e5 763 scm_t_wchar c;
7337d56d
LC
764
765 c = scm_getc (port);
766 if ('@' == c)
767 p = scm_sym_uq_splicing;
768 else
0f2d19dd 769 {
7337d56d
LC
770 scm_ungetc (c, port);
771 p = scm_sym_unquote;
0f2d19dd 772 }
7337d56d
LC
773 break;
774 }
0f2d19dd 775
7337d56d
LC
776 default:
777 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 778 "scm_read_quote", chr);
7337d56d 779 abort ();
0f2d19dd 780 }
1cc91f1b 781
7337d56d 782 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1 783 if (SCM_RECORD_POSITIONS_P)
26c8cc14 784 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
0f2d19dd 785
7337d56d
LC
786 return p;
787}
788
34f3d47d
AW
789SCM_SYMBOL (sym_syntax, "syntax");
790SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
791SCM_SYMBOL (sym_unsyntax, "unsyntax");
792SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
793
794static SCM
795scm_read_syntax (int chr, SCM port)
796{
797 SCM p;
798 long line = SCM_LINUM (port);
799 int column = SCM_COL (port) - 1;
800
801 switch (chr)
802 {
803 case '`':
804 p = sym_quasisyntax;
805 break;
806
807 case '\'':
808 p = sym_syntax;
809 break;
810
811 case ',':
812 {
813 int c;
814
815 c = scm_getc (port);
816 if ('@' == c)
817 p = sym_unsyntax_splicing;
818 else
819 {
820 scm_ungetc (c, port);
821 p = sym_unsyntax;
822 }
823 break;
824 }
825
826 default:
827 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
828 "scm_read_syntax", chr);
829 abort ();
830 }
831
832 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
833 if (SCM_RECORD_POSITIONS_P)
26c8cc14 834 scm_i_set_source_properties_x (p, line, column, SCM_FILENAME (port));
34f3d47d
AW
835
836 return p;
837}
838
7c4aad9c
AW
839static inline SCM
840scm_read_nil (int chr, SCM port)
841{
842 SCM id = scm_read_mixed_case_symbol (chr, port);
843
844 if (!scm_is_eq (id, sym_nil))
845 scm_i_input_error ("scm_read_nil", port,
846 "unexpected input while reading #nil: ~a",
847 scm_list_1 (id));
848
849 return SCM_ELISP_NIL;
850}
851
7337d56d
LC
852static inline SCM
853scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 854{
0f2d19dd
JB
855 int c;
856
889975e5
MG
857 /* We use the get_byte here because there is no need to get the
858 locale correct with comment input. This presumes that newline
859 always represents itself no matter what the encoding is. */
860 for (c = scm_get_byte_or_eof (port);
7337d56d 861 (c != EOF) && (c != '\n');
8a8da78d 862 c = scm_get_byte_or_eof (port));
7337d56d
LC
863
864 return SCM_UNSPECIFIED;
865}
866
867\f
868/* Sharp readers, i.e. readers called after a `#' sign has been read. */
869
870static SCM
871scm_read_boolean (int chr, SCM port)
872{
873 switch (chr)
0f2d19dd 874 {
7337d56d
LC
875 case 't':
876 case 'T':
877 return SCM_BOOL_T;
878
879 case 'f':
880 case 'F':
881 return SCM_BOOL_F;
0f2d19dd 882 }
7337d56d
LC
883
884 return SCM_UNSPECIFIED;
885}
886
887static SCM
889975e5 888scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
889#define FUNC_NAME "scm_lreadr"
890{
69f90b0b
MG
891 char buffer[READER_CHAR_NAME_MAX_SIZE];
892 SCM charname;
893 size_t charname_len, bytes_read;
889975e5
MG
894 scm_t_wchar cp;
895 int overflow;
69f90b0b 896 scm_t_port *pt;
889975e5 897
69f90b0b 898 overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
889975e5 899 if (overflow)
c1a0ba1c 900 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
7337d56d 901
69f90b0b 902 if (bytes_read == 0)
0f2d19dd 903 {
7337d56d
LC
904 chr = scm_getc (port);
905 if (chr == EOF)
906 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
907 "while reading character", SCM_EOL);
908
909 /* CHR must be a token delimiter, like a whitespace. */
910 return (SCM_MAKE_CHAR (chr));
0f2d19dd 911 }
7337d56d 912
69f90b0b 913 pt = SCM_PTAB_ENTRY (port);
7337d56d 914
69f90b0b
MG
915 /* Simple ASCII characters can be processed immediately. Also, simple
916 ISO-8859-1 characters can be processed immediately if the encoding for this
917 port is ISO-8859-1. */
918 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
919 {
920 SCM_COL (port) += 1;
921 return SCM_MAKE_CHAR (buffer[0]);
922 }
923
924 /* Otherwise, convert the buffer into a proper scheme string for
925 processing. */
926 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
927 pt->ilseq_handler);
928 charname_len = scm_i_string_length (charname);
929 SCM_COL (port) += charname_len;
889975e5 930 cp = scm_i_string_ref (charname, 0);
69f90b0b
MG
931 if (charname_len == 1)
932 return SCM_MAKE_CHAR (cp);
933
934 /* Ignore dotted circles, which may be used to keep combining characters from
935 combining with the backslash in #\charname. */
0dcd7e61
MG
936 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
937 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
938
889975e5 939 if (cp >= '0' && cp < '8')
7337d56d
LC
940 {
941 /* Dirk:FIXME:: This type of character syntax is not R5RS
942 * compliant. Further, it should be verified that the constant
0ffc78e3 943 * does only consist of octal digits. */
889975e5 944 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 945 if (SCM_I_INUMP (p))
0ffc78e3 946 {
e25f3727 947 scm_t_wchar c = scm_to_uint32 (p);
0ffc78e3
MG
948 if (SCM_IS_UNICODE_CHAR (c))
949 return SCM_MAKE_CHAR (c);
950 else
0f3a70cf 951 scm_i_input_error (FUNC_NAME, port,
0ffc78e3
MG
952 "out-of-range octal character escape: ~a",
953 scm_list_1 (charname));
954 }
7337d56d
LC
955 }
956
0f3a70cf 957 if (cp == 'x' && (charname_len > 1))
dea901d6
MG
958 {
959 SCM p;
0f3a70cf 960
dea901d6
MG
961 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
962 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
963 scm_from_uint (16));
964 if (SCM_I_INUMP (p))
965 {
e25f3727 966 scm_t_wchar c = scm_to_uint32 (p);
dea901d6
MG
967 if (SCM_IS_UNICODE_CHAR (c))
968 return SCM_MAKE_CHAR (c);
969 else
970 scm_i_input_error (FUNC_NAME, port,
971 "out-of-range hex character escape: ~a",
972 scm_list_1 (charname));
973 }
974 }
975
889975e5
MG
976 /* The names of characters should never have non-Latin1
977 characters. */
978 if (scm_i_is_narrow_string (charname)
979 || scm_i_try_narrow_string (charname))
4769c9db
AW
980 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
981 charname_len);
982 if (scm_is_true (ch))
983 return ch;
984 }
7337d56d 985
7337d56d 986 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 987 scm_list_1 (charname));
7337d56d
LC
988
989 return SCM_UNSPECIFIED;
0f2d19dd 990}
db4b4ca6 991#undef FUNC_NAME
0f2d19dd 992
7337d56d
LC
993static inline SCM
994scm_read_keyword (int chr, SCM port)
995{
996 SCM symbol;
997
998 /* Read the symbol that comprises the keyword. Doing this instead of
999 invoking a specific symbol reader function allows `scm_read_keyword ()'
1000 to adapt to the delimiters currently valid of symbols.
1cc91f1b 1001
7337d56d
LC
1002 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1003 symbol = scm_read_expression (port);
1004 if (!scm_is_symbol (symbol))
7f74cf9a 1005 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
1006 "keyword prefix `~a' not followed by a symbol: ~s",
1007 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1008
1009 return (scm_symbol_to_keyword (symbol));
1010}
1011
1012static inline SCM
1013scm_read_vector (int chr, SCM port)
09a4f039 1014{
7337d56d
LC
1015 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1016 guarantee that it's going to do what we want. After all, this is an
1017 implementation detail of `scm_read_vector ()', not a desirable
1018 property. */
1019 return (scm_vector (scm_read_sexp (chr, port)));
1020}
09a4f039 1021
7337d56d
LC
1022static inline SCM
1023scm_read_srfi4_vector (int chr, SCM port)
1024{
1025 return scm_i_read_array (port, chr);
1026}
1027
0ba0b384 1028static SCM
889975e5 1029scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
1030{
1031 chr = scm_getc (port);
1032 if (chr != 'u')
1033 goto syntax;
1034
1035 chr = scm_getc (port);
1036 if (chr != '8')
1037 goto syntax;
1038
1039 chr = scm_getc (port);
1040 if (chr != '(')
1041 goto syntax;
1042
1043 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
1044
1045 syntax:
1046 scm_i_input_error ("read_bytevector", port,
1047 "invalid bytevector prefix",
1048 SCM_MAKE_CHAR (chr));
1049 return SCM_UNSPECIFIED;
1050}
1051
7337d56d 1052static SCM
889975e5 1053scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
1054{
1055 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1056 terribly inefficient but who cares? */
1057 SCM s_bits = SCM_EOL;
1058
1059 for (chr = scm_getc (port);
1060 (chr != EOF) && ((chr == '0') || (chr == '1'));
1061 chr = scm_getc (port))
09a4f039 1062 {
7337d56d 1063 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 1064 }
7337d56d
LC
1065
1066 if (chr != EOF)
1067 scm_ungetc (chr, port);
1068
1069 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
1070}
1071
1072static inline SCM
889975e5 1073scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
1074{
1075 int bang_seen = 0;
1076
1077 for (;;)
09a4f039 1078 {
58b1db5f 1079 int c = scm_getc (port);
62850ef3 1080
7337d56d
LC
1081 if (c == EOF)
1082 scm_i_input_error ("skip_block_comment", port,
1083 "unterminated `#! ... !#' comment", SCM_EOL);
1084
1085 if (c == '!')
1086 bang_seen = 1;
1087 else if (c == '#' && bang_seen)
1088 break;
1089 else
1090 bang_seen = 0;
1091 }
1092
1093 return SCM_UNSPECIFIED;
1094}
1095
d7fcaec3 1096static SCM
911b03b2
JG
1097scm_read_shebang (scm_t_wchar chr, SCM port)
1098{
1099 int c = 0;
1100 if ((c = scm_get_byte_or_eof (port)) != 'r')
1101 {
1102 scm_ungetc (c, port);
1103 return scm_read_scsh_block_comment (chr, port);
1104 }
1105 if ((c = scm_get_byte_or_eof (port)) != '6')
1106 {
1107 scm_ungetc (c, port);
1108 scm_ungetc ('r', port);
1109 return scm_read_scsh_block_comment (chr, port);
1110 }
1111 if ((c = scm_get_byte_or_eof (port)) != 'r')
1112 {
1113 scm_ungetc (c, port);
1114 scm_ungetc ('6', port);
1115 scm_ungetc ('r', port);
1116 return scm_read_scsh_block_comment (chr, port);
1117 }
1118 if ((c = scm_get_byte_or_eof (port)) != 's')
1119 {
1120 scm_ungetc (c, port);
1121 scm_ungetc ('r', port);
1122 scm_ungetc ('6', port);
1123 scm_ungetc ('r', port);
1124 return scm_read_scsh_block_comment (chr, port);
1125 }
1126
1127 return SCM_UNSPECIFIED;
1128}
1129
620c8965
LC
1130static SCM
1131scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1132{
1133 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1134 nested. So care must be taken. */
1135 int nesting_level = 1;
6d5f8c32
AW
1136
1137 int a = scm_getc (port);
1138
1139 if (a == EOF)
1140 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1141 "unterminated `#| ... |#' comment", SCM_EOL);
620c8965
LC
1142
1143 while (nesting_level > 0)
1144 {
6d5f8c32 1145 int b = scm_getc (port);
620c8965 1146
6d5f8c32 1147 if (b == EOF)
cd169c5a 1148 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1149 "unterminated `#| ... |#' comment", SCM_EOL);
1150
6d5f8c32
AW
1151 if (a == '|' && b == '#')
1152 {
1153 nesting_level--;
1154 b = EOF;
1155 }
1156 else if (a == '#' && b == '|')
1157 {
1158 nesting_level++;
1159 b = EOF;
1160 }
1161
1162 a = b;
620c8965
LC
1163 }
1164
1165 return SCM_UNSPECIFIED;
1166}
1167
34f3d47d 1168static SCM
889975e5 1169scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1170{
889975e5 1171 scm_t_wchar c;
34f3d47d
AW
1172
1173 c = flush_ws (port, (char *) NULL);
1174 if (EOF == c)
1175 scm_i_input_error ("read_commented_expression", port,
1176 "no expression after #; comment", SCM_EOL);
1177 scm_ungetc (c, port);
1178 scm_read_expression (port);
1179 return SCM_UNSPECIFIED;
1180}
1181
7337d56d 1182static SCM
889975e5 1183scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1184{
1185 /* Guile's extended symbol read syntax looks like this:
1186
1187 #{This is all a symbol name}#
1188
1189 So here, CHR is expected to be `{'. */
d9527cfa 1190 int saw_brace = 0;
7337d56d 1191 size_t len = 0;
190d4b0d 1192 SCM buf = scm_i_make_string (1024, NULL, 0);
7337d56d 1193
889975e5 1194 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1195
1196 while ((chr = scm_getc (port)) != EOF)
1197 {
1198 if (saw_brace)
09a4f039 1199 {
7337d56d
LC
1200 if (chr == '#')
1201 {
7337d56d
LC
1202 break;
1203 }
1204 else
1205 {
1206 saw_brace = 0;
889975e5 1207 scm_i_string_set_x (buf, len++, '}');
7337d56d 1208 }
09a4f039 1209 }
d9527cfa
AW
1210
1211 if (chr == '}')
7337d56d 1212 saw_brace = 1;
d9527cfa
AW
1213 else if (chr == '\\')
1214 {
1215 /* It used to be that print.c would print extended-read-syntax
1216 symbols with backslashes before "non-standard" chars, but
1217 this routine wouldn't do anything with those escapes.
1218 Bummer. What we've done is to change print.c to output
1219 R6RS hex escapes for those characters, relying on the fact
1220 that the extended read syntax would never put a `\' before
1221 an `x'. For now, we just ignore other instances of
1222 backslash in the string. */
1223 switch ((chr = scm_getc (port)))
1224 {
1225 case EOF:
1226 goto done;
1227 case 'x':
1228 {
1229 scm_t_wchar c;
1230
1231 SCM_READ_HEX_ESCAPE (10, ';');
1232 scm_i_string_set_x (buf, len++, c);
1233 break;
1234
1235 str_eof:
1236 chr = EOF;
1237 goto done;
1238
1239 bad_escaped:
1240 scm_i_string_stop_writing ();
1241 scm_i_input_error ("scm_read_extended_symbol", port,
1242 "illegal character in escape sequence: ~S",
1243 scm_list_1 (SCM_MAKE_CHAR (c)));
1244 break;
1245 }
1246 default:
1247 scm_i_string_set_x (buf, len++, chr);
1248 break;
1249 }
1250 }
7337d56d 1251 else
d9527cfa 1252 scm_i_string_set_x (buf, len++, chr);
62850ef3 1253
889975e5 1254 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1255 {
7f991c7d
LC
1256 SCM addy;
1257
889975e5 1258 scm_i_string_stop_writing ();
190d4b0d 1259 addy = scm_i_make_string (1024, NULL, 0);
889975e5 1260 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1261 len = 0;
889975e5 1262 buf = scm_i_string_start_writing (buf);
7337d56d 1263 }
7337d56d 1264 }
d9527cfa
AW
1265
1266 done:
889975e5 1267 scm_i_string_stop_writing ();
d9527cfa
AW
1268 if (chr == EOF)
1269 scm_i_input_error ("scm_read_extended_symbol", port,
1270 "end of file while reading symbol", SCM_EOL);
7337d56d 1271
889975e5 1272 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1273}
1274
1275
1276\f
1277/* Top-level token readers, i.e., dispatchers. */
1278
1279static SCM
1280scm_read_sharp_extension (int chr, SCM port)
1281{
1282 SCM proc;
1283
1284 proc = scm_get_hash_procedure (chr);
1285 if (scm_is_true (scm_procedure_p (proc)))
1286 {
1287 long line = SCM_LINUM (port);
1288 int column = SCM_COL (port) - 2;
1289 SCM got;
1290
1291 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
26c8cc14
AW
1292
1293 if (scm_is_pair (got) && !scm_i_has_source_properties (got))
1294 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1295
1296 return got;
09a4f039 1297 }
7337d56d
LC
1298
1299 return SCM_UNSPECIFIED;
1300}
1301
1302/* The reader for the sharp `#' character. It basically dispatches reads
1303 among the above token readers. */
1304static SCM
889975e5 1305scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1306#define FUNC_NAME "scm_lreadr"
1307{
1308 SCM result;
1309
1310 chr = scm_getc (port);
1311
1312 result = scm_read_sharp_extension (chr, port);
1313 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1314 return result;
1315
1316 switch (chr)
1317 {
1318 case '\\':
1319 return (scm_read_character (chr, port));
1320 case '(':
1321 return (scm_read_vector (chr, port));
1322 case 's':
1323 case 'u':
1324 case 'f':
df941b5b 1325 case 'c':
7337d56d
LC
1326 /* This one may return either a boolean or an SRFI-4 vector. */
1327 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1328 case 'v':
1329 return (scm_read_bytevector (chr, port));
7337d56d
LC
1330 case '*':
1331 return (scm_read_guile_bit_vector (chr, port));
1332 case 't':
1333 case 'T':
1334 case 'F':
1335 /* This one may return either a boolean or an SRFI-4 vector. */
1336 return (scm_read_boolean (chr, port));
1337 case ':':
1338 return (scm_read_keyword (chr, port));
1339 case '0': case '1': case '2': case '3': case '4':
1340 case '5': case '6': case '7': case '8': case '9':
1341 case '@':
1342#if SCM_ENABLE_DEPRECATED
1343 /* See below for 'i' and 'e'. */
1344 case 'a':
7337d56d
LC
1345 case 'y':
1346 case 'h':
1347 case 'l':
1348#endif
1349 return (scm_i_read_array (port, chr));
1350
1351 case 'i':
1352 case 'e':
1353#if SCM_ENABLE_DEPRECATED
1354 {
1355 /* When next char is '(', it really is an old-style
1356 uniform array. */
889975e5 1357 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1358 if (next_c != EOF)
1359 scm_ungetc (next_c, port);
1360 if (next_c == '(')
1361 return scm_i_read_array (port, chr);
1362 /* Fall through. */
1363 }
1364#endif
1365 case 'b':
1366 case 'B':
1367 case 'o':
1368 case 'O':
1369 case 'd':
1370 case 'D':
1371 case 'x':
1372 case 'X':
1373 case 'I':
1374 case 'E':
1375 return (scm_read_number_and_radix (chr, port));
1376 case '{':
1377 return (scm_read_extended_symbol (chr, port));
1378 case '!':
911b03b2 1379 return (scm_read_shebang (chr, port));
34f3d47d
AW
1380 case ';':
1381 return (scm_read_commented_expression (chr, port));
1382 case '`':
1383 case '\'':
1384 case ',':
1385 return (scm_read_syntax (chr, port));
7c4aad9c
AW
1386 case 'n':
1387 return (scm_read_nil (chr, port));
7337d56d
LC
1388 default:
1389 result = scm_read_sharp_extension (chr, port);
1390 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1391 {
1392 /* To remain compatible with 1.8 and earlier, the following
1393 characters have lower precedence than `read-hash-extend'
1394 characters. */
1395 switch (chr)
1396 {
1397 case '|':
1398 return scm_read_r6rs_block_comment (chr, port);
1399 default:
1400 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1401 scm_list_1 (SCM_MAKE_CHAR (chr)));
1402 }
1403 }
7337d56d
LC
1404 else
1405 return result;
1406 }
1407
1408 return SCM_UNSPECIFIED;
1409}
1410#undef FUNC_NAME
1411
1412static SCM
1413scm_read_expression (SCM port)
1414#define FUNC_NAME "scm_read_expression"
1415{
1416 while (1)
1417 {
889975e5 1418 register scm_t_wchar chr;
7337d56d
LC
1419
1420 chr = scm_getc (port);
1421
1422 switch (chr)
1423 {
1424 case SCM_WHITE_SPACES:
1425 case SCM_LINE_INCREMENTORS:
1426 break;
1427 case ';':
1428 (void) scm_read_semicolon_comment (chr, port);
1429 break;
5afa815c
AW
1430 case '[':
1431 if (!SCM_SQUARE_BRACKETS_P)
1432 return (scm_read_mixed_case_symbol (chr, port));
1433 /* otherwise fall through */
7337d56d
LC
1434 case '(':
1435 return (scm_read_sexp (chr, port));
1436 case '"':
1437 return (scm_read_string (chr, port));
1438 case '\'':
1439 case '`':
1440 case ',':
1441 return (scm_read_quote (chr, port));
1442 case '#':
1443 {
1444 SCM result;
1445 result = scm_read_sharp (chr, port);
1446 if (scm_is_eq (result, SCM_UNSPECIFIED))
1447 /* We read a comment or some such. */
1448 break;
1449 else
1450 return result;
1451 }
1452 case ')':
1453 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1454 break;
a4e47229
MG
1455 case ']':
1456 if (SCM_SQUARE_BRACKETS_P)
1457 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1458 /* otherwise fall through */
7337d56d
LC
1459 case EOF:
1460 return SCM_EOF_VAL;
1461 case ':':
1462 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1463 return scm_symbol_to_keyword (scm_read_expression (port));
1464 /* Fall through. */
1465
1466 default:
1467 {
1468 if (((chr >= '0') && (chr <= '9'))
1469 || (strchr ("+-.", chr)))
1470 return (scm_read_number (chr, port));
1471 else
1472 return (scm_read_mixed_case_symbol (chr, port));
1473 }
1474 }
1475 }
1476}
1477#undef FUNC_NAME
1478
1479\f
1480/* Actual reader. */
1481
1482SCM_DEFINE (scm_read, "read", 0, 1, 0,
1483 (SCM port),
1484 "Read an s-expression from the input port @var{port}, or from\n"
1485 "the current input port if @var{port} is not specified.\n"
1486 "Any whitespace before the next token is discarded.")
1487#define FUNC_NAME s_scm_read
1488{
1489 int c;
1490
1491 if (SCM_UNBNDP (port))
1492 port = scm_current_input_port ();
1493 SCM_VALIDATE_OPINPORT (1, port);
1494
1495 c = flush_ws (port, (char *) NULL);
1496 if (EOF == c)
1497 return SCM_EOF_VAL;
1498 scm_ungetc (c, port);
1499
1500 return (scm_read_expression (port));
09a4f039 1501}
db4b4ca6 1502#undef FUNC_NAME
09a4f039 1503
0f2d19dd
JB
1504
1505\f
1506
14de3b42
GH
1507/* Manipulate the read-hash-procedures alist. This could be written in
1508 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1509SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1510 (SCM chr, SCM proc),
dc7fa443
MG
1511 "Install the procedure @var{proc} for reading expressions\n"
1512 "starting with the character sequence @code{#} and @var{chr}.\n"
1513 "@var{proc} will be called with two arguments: the character\n"
1514 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1515 "returned will be the return value of @code{read}. \n"
1516 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1517 )
1bbd0b84 1518#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1519{
fed9c9a2
GH
1520 SCM this;
1521 SCM prev;
1522
36284627 1523 SCM_VALIDATE_CHAR (1, chr);
7888309b 1524 SCM_ASSERT (scm_is_false (proc)
bc36d050 1525 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1526 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1527
14de3b42 1528 /* Check if chr is already in the alist. */
d458073b 1529 this = scm_i_read_hash_procedures_ref ();
14de3b42 1530 prev = SCM_BOOL_F;
fed9c9a2
GH
1531 while (1)
1532 {
d2e53ed6 1533 if (scm_is_null (this))
fed9c9a2
GH
1534 {
1535 /* not found, so add it to the beginning. */
7888309b 1536 if (scm_is_true (proc))
fed9c9a2 1537 {
d458073b
AR
1538 SCM new = scm_cons (scm_cons (chr, proc),
1539 scm_i_read_hash_procedures_ref ());
1540 scm_i_read_hash_procedures_set_x (new);
fed9c9a2
GH
1541 }
1542 break;
1543 }
bc36d050 1544 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1545 {
1546 /* already in the alist. */
7888309b 1547 if (scm_is_false (proc))
14de3b42
GH
1548 {
1549 /* remove it. */
7888309b 1550 if (scm_is_false (prev))
14de3b42 1551 {
d458073b
AR
1552 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1553 scm_i_read_hash_procedures_set_x (rest);
14de3b42
GH
1554 }
1555 else
1556 scm_set_cdr_x (prev, SCM_CDR (this));
1557 }
fed9c9a2 1558 else
14de3b42
GH
1559 {
1560 /* replace it. */
1561 scm_set_cdr_x (SCM_CAR (this), proc);
1562 }
fed9c9a2
GH
1563 break;
1564 }
1565 prev = this;
1566 this = SCM_CDR (this);
1567 }
deca31e1 1568
deca31e1
GH
1569 return SCM_UNSPECIFIED;
1570}
1bbd0b84 1571#undef FUNC_NAME
0f2d19dd 1572
deca31e1
GH
1573/* Recover the read-hash procedure corresponding to char c. */
1574static SCM
6e8d25a6 1575scm_get_hash_procedure (int c)
deca31e1 1576{
d458073b 1577 SCM rest = scm_i_read_hash_procedures_ref ();
fed9c9a2 1578
deca31e1
GH
1579 while (1)
1580 {
d2e53ed6 1581 if (scm_is_null (rest))
deca31e1
GH
1582 return SCM_BOOL_F;
1583
7866a09b 1584 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1585 return SCM_CDAR (rest);
1586
1587 rest = SCM_CDR (rest);
1588 }
1589}
1cc91f1b 1590
889975e5
MG
1591#define SCM_ENCODING_SEARCH_SIZE (500)
1592
f8a1c9a8
LC
1593/* Search the first few hundred characters of a file for an Emacs-like coding
1594 declaration. Returns either NULL or a string whose storage has been
1595 allocated with `scm_gc_malloc ()'. */
889975e5 1596char *
f8a1c9a8 1597scm_i_scan_for_encoding (SCM port)
889975e5 1598{
d900843c 1599 scm_t_port *pt;
889975e5 1600 char header[SCM_ENCODING_SEARCH_SIZE+1];
daedbca7 1601 size_t bytes_read, encoding_length, i;
889975e5
MG
1602 char *encoding = NULL;
1603 int utf8_bom = 0;
daedbca7 1604 char *pos, *encoding_start;
889975e5
MG
1605 int in_comment;
1606
d900843c 1607 pt = SCM_PTAB_ENTRY (port);
49bb5bd3 1608
d900843c
AW
1609 if (pt->rw_active == SCM_PORT_WRITE)
1610 scm_flush (port);
49bb5bd3 1611
d900843c
AW
1612 if (pt->rw_random)
1613 pt->rw_active = SCM_PORT_READ;
1614
1615 if (pt->read_pos == pt->read_end)
1616 {
1617 /* We can use the read buffer, and thus avoid a seek. */
1618 if (scm_fill_input (port) == EOF)
1619 return NULL;
1620
1621 bytes_read = pt->read_end - pt->read_pos;
1622 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
1623 bytes_read = SCM_ENCODING_SEARCH_SIZE;
1624
1625 if (bytes_read <= 1)
1626 /* An unbuffered port -- don't scan. */
1627 return NULL;
1628
1629 memcpy (header, pt->read_pos, bytes_read);
1630 header[bytes_read] = '\0';
1631 }
1632 else
1633 {
1634 /* Try to read some bytes and then seek back. Not all ports
1635 support seeking back; and indeed some file ports (like
1636 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
1637 check performed by SCM_FPORT_FDES---but fail to seek
1638 backwards. Hence this block comes second. We prefer to use
1639 the read buffer in-place. */
1640 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1641 return NULL;
1642
1643 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1644 header[bytes_read] = '\0';
1645 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1646 }
889975e5
MG
1647
1648 if (bytes_read > 3
1649 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1650 utf8_bom = 1;
1651
1652 /* search past "coding[:=]" */
1653 pos = header;
1654 while (1)
1655 {
1656 if ((pos = strstr(pos, "coding")) == NULL)
1657 return NULL;
1658
1659 pos += strlen("coding");
1660 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1661 (*pos == ':' || *pos == '='))
1662 {
1663 pos ++;
1664 break;
1665 }
1666 }
1667
1668 /* skip spaces */
1669 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1670 (*pos == ' ' || *pos == '\t'))
1671 pos ++;
1672
1673 /* grab the next token */
daedbca7 1674 encoding_start = pos;
889975e5 1675 i = 0;
daedbca7
MG
1676 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
1677 && encoding_start + i - header < bytes_read
1678 && (isalnum ((int) encoding_start[i])
1679 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
889975e5
MG
1680 i++;
1681
daedbca7
MG
1682 encoding_length = i;
1683 if (encoding_length == 0)
889975e5
MG
1684 return NULL;
1685
daedbca7
MG
1686 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
1687 for (i = 0; i < encoding_length; i++)
026ed239 1688 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1689
1690 /* push backwards to make sure we were in a comment */
1691 in_comment = 0;
daedbca7
MG
1692 pos = encoding_start;
1693 while (pos >= header)
889975e5 1694 {
8a12aeb9
AW
1695 if (*pos == ';')
1696 {
1697 in_comment = 1;
1698 break;
1699 }
1700 else if (*pos == '\n' || pos == header)
889975e5
MG
1701 {
1702 /* This wasn't in a semicolon comment. Check for a
1703 hash-bang comment. */
1704 char *beg = strstr (header, "#!");
1705 char *end = strstr (header, "!#");
8a12aeb9 1706 if (beg < encoding_start && encoding_start + encoding_length <= end)
889975e5
MG
1707 in_comment = 1;
1708 break;
1709 }
8a12aeb9
AW
1710 else
1711 {
1712 pos --;
1713 continue;
1714 }
889975e5
MG
1715 }
1716 if (!in_comment)
f8a1c9a8
LC
1717 /* This wasn't in a comment */
1718 return NULL;
1719
889975e5 1720 if (utf8_bom && strcmp(encoding, "UTF-8"))
daedbca7 1721 scm_misc_error (NULL,
889975e5
MG
1722 "the port input declares the encoding ~s but is encoded as UTF-8",
1723 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1724
889975e5
MG
1725 return encoding;
1726}
1727
1728SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1729 (SCM port),
a270e133 1730 "Scans the port for an Emacs-like character coding declaration\n"
ffb62a43 1731 "near the top of the contents of a port with random-accessible contents.\n"
889975e5
MG
1732 "The coding declaration is of the form\n"
1733 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1734 "\n"
1735 "Returns a string containing the character encoding of the file\n"
1736 "if a declaration was found, or @code{#f} otherwise.\n")
1737#define FUNC_NAME s_scm_file_encoding
1738{
1739 char *enc;
1740 SCM s_enc;
f8a1c9a8 1741
d900843c
AW
1742 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
1743
f8a1c9a8 1744 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1745 if (enc == NULL)
1746 return SCM_BOOL_F;
1747 else
1748 {
1749 s_enc = scm_from_locale_string (enc);
889975e5
MG
1750 return s_enc;
1751 }
f8a1c9a8 1752
889975e5
MG
1753 return SCM_BOOL_F;
1754}
1755#undef FUNC_NAME
1756
0f2d19dd
JB
1757void
1758scm_init_read ()
0f2d19dd 1759{
d458073b
AR
1760 SCM read_hash_procs;
1761
c81c2ad3 1762 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
d458073b
AR
1763
1764 scm_i_read_hash_procedures =
1765 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
fed9c9a2 1766
62560650 1767 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1768#include "libguile/read.x"
0f2d19dd 1769}
89e00824
ML
1770
1771/*
1772 Local Variables:
1773 c-file-style: "gnu"
1774 End:
1775*/