Use `mkstemp' instead of `mktemp'.
[bpt/guile.git] / libguile / read.c
CommitLineData
0ba0b384 1/* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009 Free Software
dd72382c 2 * Foundation, Inc.
0f2d19dd 3 *
73be1d9e 4 * This library is free software; you can redistribute it and/or
53befeb7
NJ
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
0f2d19dd 8 *
53befeb7
NJ
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
0f2d19dd 13 *
73be1d9e
MV
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
53befeb7
NJ
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
73be1d9e 18 */
1bbd0b84 19
1bbd0b84 20
0f2d19dd
JB
21\f
22
7337d56d
LC
23#ifdef HAVE_CONFIG_H
24# include <config.h>
25#endif
26
0f2d19dd 27#include <stdio.h>
7337d56d
LC
28#include <ctype.h>
29#include <string.h>
889975e5
MG
30#include <unistd.h>
31#include <unicase.h>
7337d56d 32
a0599745 33#include "libguile/_scm.h"
0ba0b384 34#include "libguile/bytevectors.h"
a0599745
MD
35#include "libguile/chars.h"
36#include "libguile/eval.h"
2fa901a5 37#include "libguile/arrays.h"
cf396142 38#include "libguile/bitvectors.h"
a0599745
MD
39#include "libguile/keywords.h"
40#include "libguile/alist.h"
41#include "libguile/srcprop.h"
42#include "libguile/hashtab.h"
43#include "libguile/hash.h"
44#include "libguile/ports.h"
49bb5bd3 45#include "libguile/fports.h"
a0599745
MD
46#include "libguile/root.h"
47#include "libguile/strings.h"
ba1b2226 48#include "libguile/strports.h"
a0599745 49#include "libguile/vectors.h"
a0599745 50#include "libguile/validate.h"
a4022e69 51#include "libguile/srfi-4.h"
7337d56d 52#include "libguile/srfi-13.h"
ba1b2226 53
a0599745 54#include "libguile/read.h"
22fc179a
HWN
55#include "libguile/private-options.h"
56
0f2d19dd
JB
57
58\f
59
5bf6a6f0 60SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
c7733771 61SCM_SYMBOL (scm_keyword_prefix, "prefix");
ef4cbc08 62SCM_SYMBOL (scm_keyword_postfix, "postfix");
c7733771 63
92c2555f 64scm_t_option scm_read_opts[] = {
b7ff98dd
MD
65 { SCM_OPTION_BOOLEAN, "copy", 0,
66 "Copy source code expressions." },
ac74fc22 67 { SCM_OPTION_BOOLEAN, "positions", 0,
deca31e1
GH
68 "Record positions of source code expressions." },
69 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
c7733771 70 "Convert symbols to lower case."},
62316c7f 71 { SCM_OPTION_SCM, "keywords", (unsigned long) SCM_BOOL_F,
904fabb6 72 "Style of keyword recognition: #f, 'prefix or 'postfix."},
16353acc 73#if SCM_ENABLE_ELISP
16353acc
NJ
74 { SCM_OPTION_BOOLEAN, "elisp-vectors", 0,
75 "Support Elisp vector syntax, namely `[...]'."},
cd21f5eb 76 { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
62560650 77 "Support `\\(' and `\\)' in strings."},
16353acc 78#endif
dea901d6
MG
79 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
80 "Use R6RS variable-length character and string hex escapes."},
62560650 81 { 0, },
a16f6fe7
MD
82};
83
39e8f371
HWN
84/*
85 Give meaningful error messages for errors
86
87 We use the format
88
ba1b2226 89 FILE:LINE:COL: MESSAGE
39e8f371
HWN
90 This happened in ....
91
92 This is not standard GNU format, but the test-suite likes the real
93 message to be in front.
94
39e8f371
HWN
95 */
96
97
a4022e69
MV
98void
99scm_i_input_error (char const *function,
100 SCM port, const char *message, SCM arg)
ba1b2226 101{
29a837fd
MV
102 SCM fn = (scm_is_string (SCM_FILENAME(port))
103 ? SCM_FILENAME(port)
104 : scm_from_locale_string ("#<unknown port>"));
ba1b2226 105
29a837fd 106 SCM string_port = scm_open_output_string ();
ba1b2226
HWN
107 SCM string = SCM_EOL;
108 scm_simple_format (string_port,
272632a6 109 scm_from_locale_string ("~A:~S:~S: ~A"),
29a837fd 110 scm_list_4 (fn,
b3aa4626 111 scm_from_long (SCM_LINUM (port) + 1),
b9bd8526 112 scm_from_int (SCM_COL (port) + 1),
272632a6 113 scm_from_locale_string (message)));
ba1b2226
HWN
114
115 string = scm_get_output_string (string_port);
116 scm_close_output_port (string_port);
272632a6 117 scm_error_scm (scm_from_locale_symbol ("read-error"),
a4022e69 118 function? scm_from_locale_string (function) : SCM_BOOL_F,
ba1b2226 119 string,
dd72382c 120 arg,
ba1b2226
HWN
121 SCM_BOOL_F);
122}
39e8f371
HWN
123
124
a1ec6916 125SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
1bbd0b84 126 (SCM setting),
dc7fa443
MG
127 "Option interface for the read options. Instead of using\n"
128 "this procedure directly, use the procedures @code{read-enable},\n"
3939e9df 129 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
1bbd0b84 130#define FUNC_NAME s_scm_read_options
a16f6fe7 131{
b7ff98dd
MD
132 SCM ans = scm_options (setting,
133 scm_read_opts,
1bbd0b84 134 FUNC_NAME);
b7ff98dd
MD
135 if (SCM_COPY_SOURCE_P)
136 SCM_RECORD_POSITIONS_P = 1;
a16f6fe7
MD
137 return ans;
138}
1bbd0b84 139#undef FUNC_NAME
a16f6fe7 140
14de3b42
GH
141/* An association list mapping extra hash characters to procedures. */
142static SCM *scm_read_hash_procedures;
deca31e1 143
0f2d19dd 144
7337d56d
LC
145\f
146/* Token readers. */
0f2d19dd 147
0f2d19dd 148
7337d56d
LC
149/* Size of the C buffer used to read symbols and numbers. */
150#define READER_BUFFER_SIZE 128
0f2d19dd 151
7337d56d
LC
152/* Size of the C buffer used to read strings. */
153#define READER_STRING_BUFFER_SIZE 512
0f2d19dd 154
7337d56d
LC
155/* The maximum size of Scheme character names. */
156#define READER_CHAR_NAME_MAX_SIZE 50
1cc91f1b 157
94115ae3 158
7337d56d
LC
159/* `isblank' is only in C99. */
160#define CHAR_IS_BLANK_(_chr) \
161 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
d41668fa 162 || ((_chr) == '\f') || ((_chr) == '\r'))
7337d56d
LC
163
164#ifdef MSDOS
165# define CHAR_IS_BLANK(_chr) \
166 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
167#else
168# define CHAR_IS_BLANK CHAR_IS_BLANK_
169#endif
170
171
172/* R5RS one-character delimiters (see section 7.1.1, ``Lexical
173 structure''). */
174#define CHAR_IS_R5RS_DELIMITER(c) \
175 (CHAR_IS_BLANK (c) \
176 || (c == ')') || (c == '(') || (c == ';') || (c == '"'))
177
178#define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
179
180/* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
181 Structure''. */
182#define CHAR_IS_EXPONENT_MARKER(_chr) \
183 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
184 || ((_chr) == 'd') || ((_chr) == 'l'))
185
454866e0 186/* Read an SCSH block comment. */
620c8965
LC
187static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
188static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
189static SCM scm_read_commented_expression (scm_t_wchar, SCM);
190static SCM scm_get_hash_procedure (int);
454866e0 191
d41668fa
LC
192/* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
193 zero if the whole token fits in BUF, non-zero otherwise. */
7337d56d 194static inline int
889975e5 195read_token (SCM port, SCM buf, size_t *read)
0520c320 196{
889975e5 197 scm_t_wchar chr;
7337d56d 198 *read = 0;
0520c320 199
889975e5 200 while (*read < scm_i_string_length (buf))
0520c320 201 {
7337d56d 202 chr = scm_getc (port);
7337d56d
LC
203
204 if (chr == EOF)
b521d265 205 return 0;
889975e5
MG
206
207 chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
208
209 if (CHAR_IS_DELIMITER (chr))
7337d56d 210 {
889975e5
MG
211 scm_ungetc (chr, port);
212 return 0;
7337d56d 213 }
889975e5
MG
214
215 scm_i_string_set_x (buf, *read, chr);
216 (*read)++;
0520c320 217 }
7337d56d
LC
218
219 return 1;
0520c320 220}
1cc91f1b 221
889975e5
MG
222static SCM
223read_complete_token (SCM port, size_t *read)
224{
b521d265 225 SCM buffer;
889975e5 226 int overflow;
b521d265
AW
227 size_t overflow_read;
228 SCM tail = SCM_EOL;
889975e5
MG
229
230 buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
231 overflow = read_token (port, buffer, read);
b521d265
AW
232 while (overflow)
233 {
234 tail = scm_cons (buffer, tail);
235 buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
236 overflow = read_token (port, buffer, &overflow_read);
237 *read += overflow_read;
889975e5 238 }
889975e5 239
b521d265
AW
240 if (scm_is_null (tail))
241 return scm_i_substring (buffer, 0, *read);
242 else
243 return scm_string_append
244 (scm_reverse (scm_cons (scm_i_substring (buffer, 0, overflow_read),
245 tail)));
889975e5 246}
7337d56d
LC
247
248/* Skip whitespace from PORT and return the first non-whitespace character
249 read. Raise an error on end-of-file. */
250static int
251flush_ws (SCM port, const char *eoferr)
0f2d19dd 252{
889975e5 253 register scm_t_wchar c;
0f2d19dd 254 while (1)
b7f3516f 255 switch (c = scm_getc (port))
0f2d19dd
JB
256 {
257 case EOF:
258 goteof:
259 if (eoferr)
d156d3b7 260 {
a4022e69
MV
261 scm_i_input_error (eoferr,
262 port,
263 "end of file",
264 SCM_EOL);
d156d3b7 265 }
0f2d19dd 266 return c;
7337d56d 267
0f2d19dd
JB
268 case ';':
269 lp:
b7f3516f 270 switch (c = scm_getc (port))
0f2d19dd
JB
271 {
272 case EOF:
273 goto goteof;
274 default:
275 goto lp;
276 case SCM_LINE_INCREMENTORS:
277 break;
278 }
279 break;
7337d56d 280
454866e0
LC
281 case '#':
282 switch (c = scm_getc (port))
283 {
284 case EOF:
285 eoferr = "read_sharp";
286 goto goteof;
287 case '!':
288 scm_read_scsh_block_comment (c, port);
289 break;
34f3d47d
AW
290 case ';':
291 scm_read_commented_expression (c, port);
292 break;
620c8965
LC
293 case '|':
294 if (scm_is_false (scm_get_hash_procedure (c)))
295 {
296 scm_read_r6rs_block_comment (c, port);
297 break;
298 }
299 /* fall through */
454866e0
LC
300 default:
301 scm_ungetc (c, port);
302 return '#';
303 }
304 break;
305
0f2d19dd 306 case SCM_LINE_INCREMENTORS:
0f2d19dd 307 case SCM_SINGLE_SPACES:
0f2d19dd 308 case '\t':
0f2d19dd 309 break;
7337d56d 310
0f2d19dd
JB
311 default:
312 return c;
313 }
7337d56d
LC
314
315 return 0;
0f2d19dd
JB
316}
317
318
7337d56d
LC
319\f
320/* Token readers. */
1cc91f1b 321
7337d56d
LC
322static SCM scm_read_expression (SCM port);
323static SCM scm_read_sharp (int chr, SCM port);
7337d56d 324static SCM recsexpr (SCM obj, long line, int column, SCM filename);
0f2d19dd
JB
325
326
09a4f039 327static SCM
889975e5 328scm_read_sexp (scm_t_wchar chr, SCM port)
7337d56d 329#define FUNC_NAME "scm_i_lreadparen"
09a4f039 330{
7337d56d
LC
331 register int c;
332 register SCM tmp;
333 register SCM tl, ans = SCM_EOL;
bd22f1c7 334 SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
7337d56d
LC
335 static const int terminating_char = ')';
336
337 /* Need to capture line and column numbers here. */
338 long line = SCM_LINUM (port);
339 int column = SCM_COL (port) - 1;
f9c68a47 340
f9c68a47 341
7337d56d
LC
342 c = flush_ws (port, FUNC_NAME);
343 if (terminating_char == c)
344 return SCM_EOL;
f9c68a47 345
7337d56d
LC
346 scm_ungetc (c, port);
347 if (scm_is_eq (scm_sym_dot,
348 (tmp = scm_read_expression (port))))
349 {
350 ans = scm_read_expression (port);
351 if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
352 scm_i_input_error (FUNC_NAME, port, "missing close paren",
353 SCM_EOL);
354 return ans;
355 }
1cc91f1b 356
7337d56d
LC
357 /* Build the head of the list structure. */
358 ans = tl = scm_cons (tmp, SCM_EOL);
359
360 if (SCM_COPY_SOURCE_P)
361 ans2 = tl2 = scm_cons (scm_is_pair (tmp)
362 ? copy
363 : tmp,
364 SCM_EOL);
365
366 while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
0f2d19dd 367 {
7337d56d 368 SCM new_tail;
0f2d19dd 369
7337d56d
LC
370 scm_ungetc (c, port);
371 if (scm_is_eq (scm_sym_dot,
372 (tmp = scm_read_expression (port))))
0f2d19dd 373 {
7337d56d
LC
374 SCM_SETCDR (tl, tmp = scm_read_expression (port));
375
376 if (SCM_COPY_SOURCE_P)
377 SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
378 SCM_EOL));
379
380 c = flush_ws (port, FUNC_NAME);
381 if (terminating_char != c)
382 scm_i_input_error (FUNC_NAME, port,
383 "in pair: missing close paren", SCM_EOL);
384 goto exit;
0f2d19dd 385 }
b858464a 386
7337d56d
LC
387 new_tail = scm_cons (tmp, SCM_EOL);
388 SCM_SETCDR (tl, new_tail);
389 tl = new_tail;
390
391 if (SCM_COPY_SOURCE_P)
0f2d19dd 392 {
7337d56d
LC
393 SCM new_tail2 = scm_cons (scm_is_pair (tmp)
394 ? copy
395 : tmp, SCM_EOL);
396 SCM_SETCDR (tl2, new_tail2);
397 tl2 = new_tail2;
398 }
399 }
0f2d19dd 400
7337d56d
LC
401 exit:
402 if (SCM_RECORD_POSITIONS_P)
403 scm_whash_insert (scm_source_whash,
404 ans,
405 scm_make_srcprops (line, column,
406 SCM_FILENAME (port),
407 SCM_COPY_SOURCE_P
408 ? ans2
409 : SCM_UNDEFINED,
410 SCM_EOL));
411 return ans;
412}
413#undef FUNC_NAME
a4022e69 414
c5661d28
MG
415
416/* Read a hexadecimal number NDIGITS in length. Put its value into the variable
dea901d6
MG
417 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
418 found. */
419#define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
420 do \
421 { \
422 scm_t_wchar a; \
423 size_t i = 0; \
424 c = 0; \
425 while (i < ndigits) \
426 { \
427 a = scm_getc (port); \
428 if (a == EOF) \
429 goto str_eof; \
430 if (terminator \
431 && (a == (scm_t_wchar) terminator) \
432 && (i > 0)) \
433 break; \
434 if ('0' <= a && a <= '9') \
435 a -= '0'; \
436 else if ('A' <= a && a <= 'F') \
437 a = a - 'A' + 10; \
438 else if ('a' <= a && a <= 'f') \
439 a = a - 'a' + 10; \
440 else \
441 { \
442 c = a; \
443 goto bad_escaped; \
444 } \
445 c = c * 16 + a; \
446 i ++; \
447 } \
c5661d28
MG
448 } while (0)
449
7337d56d
LC
450static SCM
451scm_read_string (int chr, SCM port)
452#define FUNC_NAME "scm_lreadr"
453{
454 /* For strings smaller than C_STR, this function creates only one Scheme
455 object (the string returned). */
0f2d19dd 456
7337d56d 457 SCM str = SCM_BOOL_F;
7337d56d 458 unsigned c_str_len = 0;
9c44cd45 459 scm_t_wchar c;
eb42ff25 460
9c44cd45 461 str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
7337d56d
LC
462 while ('"' != (c = scm_getc (port)))
463 {
464 if (c == EOF)
9c44cd45
MG
465 {
466 str_eof:
467 scm_i_input_error (FUNC_NAME, port,
468 "end of file in string constant", SCM_EOL);
469 }
0f2d19dd 470
9c44cd45
MG
471 if (c_str_len + 1 >= scm_i_string_length (str))
472 {
473 SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL);
0f2d19dd 474
9c44cd45
MG
475 str = scm_string_append (scm_list_2 (str, addy));
476 }
0f2d19dd 477
7337d56d 478 if (c == '\\')
9c44cd45
MG
479 {
480 switch (c = scm_getc (port))
481 {
482 case EOF:
483 goto str_eof;
484 case '"':
485 case '\\':
486 break;
16353acc 487#if SCM_ENABLE_ELISP
9c44cd45
MG
488 case '(':
489 case ')':
490 if (SCM_ESCAPED_PARENS_P)
491 break;
492 goto bad_escaped;
16353acc 493#endif
9c44cd45
MG
494 case '\n':
495 continue;
496 case '0':
497 c = '\0';
498 break;
499 case 'f':
500 c = '\f';
501 break;
502 case 'n':
503 c = '\n';
504 break;
505 case 'r':
506 c = '\r';
507 break;
508 case 't':
509 c = '\t';
510 break;
511 case 'a':
512 c = '\007';
513 break;
514 case 'v':
515 c = '\v';
516 break;
67a4a16d
MG
517 case 'b':
518 c = '\010';
519 break;
9c44cd45 520 case 'x':
dea901d6
MG
521 if (SCM_R6RS_ESCAPES_P)
522 SCM_READ_HEX_ESCAPE (10, ';');
523 else
524 SCM_READ_HEX_ESCAPE (2, '\0');
c5661d28 525 break;
9c44cd45 526 case 'u':
898a0b5a
MG
527 if (!SCM_R6RS_ESCAPES_P)
528 {
529 SCM_READ_HEX_ESCAPE (4, '\0');
530 break;
531 }
9c44cd45 532 case 'U':
898a0b5a
MG
533 if (!SCM_R6RS_ESCAPES_P)
534 {
535 SCM_READ_HEX_ESCAPE (6, '\0');
536 break;
537 }
9c44cd45
MG
538 default:
539 bad_escaped:
540 scm_i_input_error (FUNC_NAME, port,
541 "illegal character in escape sequence: ~S",
542 scm_list_1 (SCM_MAKE_CHAR (c)));
543 }
544 }
545 str = scm_i_string_start_writing (str);
546 scm_i_string_set_x (str, c_str_len++, c);
547 scm_i_string_stop_writing ();
7337d56d 548 }
f13b4400 549
7337d56d
LC
550 if (c_str_len > 0)
551 {
9c44cd45 552 return scm_i_substring_copy (str, 0, c_str_len);
0f2d19dd 553 }
9c44cd45
MG
554
555 return scm_nullstr;
0f2d19dd 556}
db4b4ca6
DH
557#undef FUNC_NAME
558
0f2d19dd 559
7337d56d 560static SCM
889975e5 561scm_read_number (scm_t_wchar chr, SCM port)
0f2d19dd 562{
889975e5
MG
563 SCM result;
564 SCM buffer;
7337d56d 565 size_t read;
0f2d19dd 566
7337d56d 567 scm_ungetc (chr, port);
889975e5
MG
568 buffer = read_complete_token (port, &read);
569 result = scm_string_to_number (buffer, SCM_UNDEFINED);
570 if (!scm_is_true (result))
571 /* Return a symbol instead of a number. */
572 result = scm_string_to_symbol (buffer);
0f2d19dd 573
7337d56d
LC
574 return result;
575}
0f2d19dd 576
7337d56d 577static SCM
889975e5 578scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
7337d56d 579{
889975e5
MG
580 SCM result;
581 int ends_with_colon = 0;
582 SCM buffer;
7337d56d 583 size_t read = 0;
ef4cbc08 584 int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
7337d56d
LC
585
586 scm_ungetc (chr, port);
889975e5
MG
587 buffer = read_complete_token (port, &read);
588 if (read > 0)
589 ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
ef4cbc08 590
889975e5
MG
591 if (postfix && ends_with_colon && (read > 1))
592 result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
7337d56d 593 else
889975e5 594 result = scm_string_to_symbol (buffer);
7337d56d
LC
595
596 return result;
597}
598
599static SCM
889975e5 600scm_read_number_and_radix (scm_t_wchar chr, SCM port)
7337d56d
LC
601#define FUNC_NAME "scm_lreadr"
602{
889975e5 603 SCM result;
7337d56d 604 size_t read;
889975e5 605 SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
7337d56d 606 unsigned int radix;
7337d56d
LC
607
608 switch (chr)
609 {
610 case 'B':
611 case 'b':
612 radix = 2;
613 break;
614
615 case 'o':
616 case 'O':
617 radix = 8;
618 break;
619
620 case 'd':
621 case 'D':
622 radix = 10;
623 break;
624
625 case 'x':
626 case 'X':
627 radix = 16;
628 break;
629
630 default:
631 scm_ungetc (chr, port);
632 scm_ungetc ('#', port);
633 radix = 10;
634 }
635
889975e5
MG
636 buffer = read_complete_token (port, &read);
637 result = scm_string_to_number (buffer, scm_from_uint (radix));
7337d56d
LC
638
639 if (scm_is_true (result))
640 return result;
641
642 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
643
644 return SCM_BOOL_F;
645}
646#undef FUNC_NAME
647
648static SCM
649scm_read_quote (int chr, SCM port)
650{
651 SCM p;
492faee1
LC
652 long line = SCM_LINUM (port);
653 int column = SCM_COL (port) - 1;
7337d56d
LC
654
655 switch (chr)
656 {
657 case '`':
658 p = scm_sym_quasiquote;
659 break;
660
661 case '\'':
662 p = scm_sym_quote;
663 break;
664
665 case ',':
666 {
889975e5 667 scm_t_wchar c;
7337d56d
LC
668
669 c = scm_getc (port);
670 if ('@' == c)
671 p = scm_sym_uq_splicing;
672 else
0f2d19dd 673 {
7337d56d
LC
674 scm_ungetc (c, port);
675 p = scm_sym_unquote;
0f2d19dd 676 }
7337d56d
LC
677 break;
678 }
0f2d19dd 679
7337d56d
LC
680 default:
681 fprintf (stderr, "%s: unhandled quote character (%i)\n",
7f74cf9a 682 "scm_read_quote", chr);
7337d56d 683 abort ();
0f2d19dd 684 }
1cc91f1b 685
7337d56d 686 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
492faee1
LC
687 if (SCM_RECORD_POSITIONS_P)
688 scm_whash_insert (scm_source_whash, p,
689 scm_make_srcprops (line, column,
690 SCM_FILENAME (port),
691 SCM_COPY_SOURCE_P
692 ? (scm_cons2 (SCM_CAR (p),
693 SCM_CAR (SCM_CDR (p)),
694 SCM_EOL))
695 : SCM_UNDEFINED,
696 SCM_EOL));
697
0f2d19dd 698
7337d56d
LC
699 return p;
700}
701
34f3d47d
AW
702SCM_SYMBOL (sym_syntax, "syntax");
703SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
704SCM_SYMBOL (sym_unsyntax, "unsyntax");
705SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
706
707static SCM
708scm_read_syntax (int chr, SCM port)
709{
710 SCM p;
711 long line = SCM_LINUM (port);
712 int column = SCM_COL (port) - 1;
713
714 switch (chr)
715 {
716 case '`':
717 p = sym_quasisyntax;
718 break;
719
720 case '\'':
721 p = sym_syntax;
722 break;
723
724 case ',':
725 {
726 int c;
727
728 c = scm_getc (port);
729 if ('@' == c)
730 p = sym_unsyntax_splicing;
731 else
732 {
733 scm_ungetc (c, port);
734 p = sym_unsyntax;
735 }
736 break;
737 }
738
739 default:
740 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
741 "scm_read_syntax", chr);
742 abort ();
743 }
744
745 p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
746 if (SCM_RECORD_POSITIONS_P)
747 scm_whash_insert (scm_source_whash, p,
748 scm_make_srcprops (line, column,
749 SCM_FILENAME (port),
750 SCM_COPY_SOURCE_P
751 ? (scm_cons2 (SCM_CAR (p),
752 SCM_CAR (SCM_CDR (p)),
753 SCM_EOL))
754 : SCM_UNDEFINED,
755 SCM_EOL));
756
757
758 return p;
759}
760
7337d56d
LC
761static inline SCM
762scm_read_semicolon_comment (int chr, SCM port)
0f2d19dd 763{
0f2d19dd
JB
764 int c;
765
889975e5
MG
766 /* We use the get_byte here because there is no need to get the
767 locale correct with comment input. This presumes that newline
768 always represents itself no matter what the encoding is. */
769 for (c = scm_get_byte_or_eof (port);
7337d56d
LC
770 (c != EOF) && (c != '\n');
771 c = scm_getc (port));
772
773 return SCM_UNSPECIFIED;
774}
775
776\f
777/* Sharp readers, i.e. readers called after a `#' sign has been read. */
778
779static SCM
780scm_read_boolean (int chr, SCM port)
781{
782 switch (chr)
0f2d19dd 783 {
7337d56d
LC
784 case 't':
785 case 'T':
786 return SCM_BOOL_T;
787
788 case 'f':
789 case 'F':
790 return SCM_BOOL_F;
0f2d19dd 791 }
7337d56d
LC
792
793 return SCM_UNSPECIFIED;
794}
795
796static SCM
889975e5 797scm_read_character (scm_t_wchar chr, SCM port)
7337d56d
LC
798#define FUNC_NAME "scm_lreadr"
799{
889975e5 800 SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
7337d56d 801 size_t charname_len;
889975e5
MG
802 scm_t_wchar cp;
803 int overflow;
804
805 overflow = read_token (port, charname, &charname_len);
806 charname = scm_c_substring (charname, 0, charname_len);
7337d56d 807
889975e5 808 if (overflow)
7337d56d
LC
809 goto char_error;
810
811 if (charname_len == 0)
0f2d19dd 812 {
7337d56d
LC
813 chr = scm_getc (port);
814 if (chr == EOF)
815 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
816 "while reading character", SCM_EOL);
817
818 /* CHR must be a token delimiter, like a whitespace. */
819 return (SCM_MAKE_CHAR (chr));
0f2d19dd 820 }
7337d56d
LC
821
822 if (charname_len == 1)
889975e5 823 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
7337d56d 824
889975e5 825 cp = scm_i_string_ref (charname, 0);
0dcd7e61
MG
826 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
827 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
828
889975e5 829 if (cp >= '0' && cp < '8')
7337d56d
LC
830 {
831 /* Dirk:FIXME:: This type of character syntax is not R5RS
832 * compliant. Further, it should be verified that the constant
0ffc78e3 833 * does only consist of octal digits. */
889975e5 834 SCM p = scm_string_to_number (charname, scm_from_uint (8));
7337d56d 835 if (SCM_I_INUMP (p))
0ffc78e3
MG
836 {
837 scm_t_wchar c = SCM_I_INUM (p);
838 if (SCM_IS_UNICODE_CHAR (c))
839 return SCM_MAKE_CHAR (c);
840 else
841 scm_i_input_error (FUNC_NAME, port,
842 "out-of-range octal character escape: ~a",
843 scm_list_1 (charname));
844 }
7337d56d
LC
845 }
846
dea901d6
MG
847 if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
848 {
849 SCM p;
850 scm_t_wchar chr;
851
852 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
853 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
854 scm_from_uint (16));
855 if (SCM_I_INUMP (p))
856 {
857 scm_t_wchar c = SCM_I_INUM (p);
858 if (SCM_IS_UNICODE_CHAR (c))
859 return SCM_MAKE_CHAR (c);
860 else
861 scm_i_input_error (FUNC_NAME, port,
862 "out-of-range hex character escape: ~a",
863 scm_list_1 (charname));
864 }
865 }
866
889975e5
MG
867 /* The names of characters should never have non-Latin1
868 characters. */
869 if (scm_i_is_narrow_string (charname)
870 || scm_i_try_narrow_string (charname))
4769c9db
AW
871 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
872 charname_len);
873 if (scm_is_true (ch))
874 return ch;
875 }
7337d56d
LC
876
877 char_error:
878 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
889975e5 879 scm_list_1 (charname));
7337d56d
LC
880
881 return SCM_UNSPECIFIED;
0f2d19dd 882}
db4b4ca6 883#undef FUNC_NAME
0f2d19dd 884
7337d56d
LC
885static inline SCM
886scm_read_keyword (int chr, SCM port)
887{
888 SCM symbol;
889
890 /* Read the symbol that comprises the keyword. Doing this instead of
891 invoking a specific symbol reader function allows `scm_read_keyword ()'
892 to adapt to the delimiters currently valid of symbols.
1cc91f1b 893
7337d56d
LC
894 XXX: This implementation allows sloppy syntaxes like `#: key'. */
895 symbol = scm_read_expression (port);
896 if (!scm_is_symbol (symbol))
7f74cf9a 897 scm_i_input_error ("scm_read_keyword", port,
7337d56d
LC
898 "keyword prefix `~a' not followed by a symbol: ~s",
899 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
900
901 return (scm_symbol_to_keyword (symbol));
902}
903
904static inline SCM
905scm_read_vector (int chr, SCM port)
09a4f039 906{
7337d56d
LC
907 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
908 guarantee that it's going to do what we want. After all, this is an
909 implementation detail of `scm_read_vector ()', not a desirable
910 property. */
911 return (scm_vector (scm_read_sexp (chr, port)));
912}
09a4f039 913
7337d56d
LC
914static inline SCM
915scm_read_srfi4_vector (int chr, SCM port)
916{
917 return scm_i_read_array (port, chr);
918}
919
0ba0b384 920static SCM
889975e5 921scm_read_bytevector (scm_t_wchar chr, SCM port)
0ba0b384
LC
922{
923 chr = scm_getc (port);
924 if (chr != 'u')
925 goto syntax;
926
927 chr = scm_getc (port);
928 if (chr != '8')
929 goto syntax;
930
931 chr = scm_getc (port);
932 if (chr != '(')
933 goto syntax;
934
935 return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
936
937 syntax:
938 scm_i_input_error ("read_bytevector", port,
939 "invalid bytevector prefix",
940 SCM_MAKE_CHAR (chr));
941 return SCM_UNSPECIFIED;
942}
943
7337d56d 944static SCM
889975e5 945scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
7337d56d
LC
946{
947 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
948 terribly inefficient but who cares? */
949 SCM s_bits = SCM_EOL;
950
951 for (chr = scm_getc (port);
952 (chr != EOF) && ((chr == '0') || (chr == '1'));
953 chr = scm_getc (port))
09a4f039 954 {
7337d56d 955 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
09a4f039 956 }
7337d56d
LC
957
958 if (chr != EOF)
959 scm_ungetc (chr, port);
960
961 return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
962}
963
964static inline SCM
889975e5 965scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
7337d56d
LC
966{
967 int bang_seen = 0;
968
889975e5
MG
969 /* We can use the get_byte here because there is no need to get the
970 locale correct when reading comments. This presumes that
971 hash and exclamation points always represent themselves no
972 matter what the source encoding is.*/
7337d56d 973 for (;;)
09a4f039 974 {
889975e5 975 int c = scm_get_byte_or_eof (port);
62850ef3 976
7337d56d
LC
977 if (c == EOF)
978 scm_i_input_error ("skip_block_comment", port,
979 "unterminated `#! ... !#' comment", SCM_EOL);
980
981 if (c == '!')
982 bang_seen = 1;
983 else if (c == '#' && bang_seen)
984 break;
985 else
986 bang_seen = 0;
987 }
988
989 return SCM_UNSPECIFIED;
990}
991
620c8965
LC
992static SCM
993scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
994{
995 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
996 nested. So care must be taken. */
997 int nesting_level = 1;
998 int opening_seen = 0, closing_seen = 0;
999
1000 while (nesting_level > 0)
1001 {
1002 int c = scm_getc (port);
1003
1004 if (c == EOF)
cd169c5a 1005 scm_i_input_error ("scm_read_r6rs_block_comment", port,
620c8965
LC
1006 "unterminated `#| ... |#' comment", SCM_EOL);
1007
1008 if (opening_seen)
1009 {
1010 if (c == '|')
1011 nesting_level++;
1012 opening_seen = 0;
1013 }
1014 else if (closing_seen)
1015 {
1016 if (c == '#')
1017 nesting_level--;
1018 closing_seen = 0;
1019 }
1020 else if (c == '|')
1021 closing_seen = 1;
1022 else if (c == '#')
1023 opening_seen = 1;
1024 else
1025 opening_seen = closing_seen = 0;
1026 }
1027
1028 return SCM_UNSPECIFIED;
1029}
1030
34f3d47d 1031static SCM
889975e5 1032scm_read_commented_expression (scm_t_wchar chr, SCM port)
34f3d47d 1033{
889975e5 1034 scm_t_wchar c;
34f3d47d
AW
1035
1036 c = flush_ws (port, (char *) NULL);
1037 if (EOF == c)
1038 scm_i_input_error ("read_commented_expression", port,
1039 "no expression after #; comment", SCM_EOL);
1040 scm_ungetc (c, port);
1041 scm_read_expression (port);
1042 return SCM_UNSPECIFIED;
1043}
1044
7337d56d 1045static SCM
889975e5 1046scm_read_extended_symbol (scm_t_wchar chr, SCM port)
7337d56d
LC
1047{
1048 /* Guile's extended symbol read syntax looks like this:
1049
1050 #{This is all a symbol name}#
1051
1052 So here, CHR is expected to be `{'. */
7337d56d
LC
1053 int saw_brace = 0, finished = 0;
1054 size_t len = 0;
889975e5 1055 SCM buf = scm_i_make_string (1024, NULL);
7337d56d 1056
889975e5 1057 buf = scm_i_string_start_writing (buf);
7337d56d
LC
1058
1059 while ((chr = scm_getc (port)) != EOF)
1060 {
1061 if (saw_brace)
09a4f039 1062 {
7337d56d
LC
1063 if (chr == '#')
1064 {
1065 finished = 1;
1066 break;
1067 }
1068 else
1069 {
1070 saw_brace = 0;
889975e5
MG
1071 scm_i_string_set_x (buf, len++, '}');
1072 scm_i_string_set_x (buf, len++, chr);
7337d56d 1073 }
09a4f039 1074 }
7337d56d
LC
1075 else if (chr == '}')
1076 saw_brace = 1;
1077 else
889975e5 1078 scm_i_string_set_x (buf, len++, chr);
62850ef3 1079
889975e5 1080 if (len >= scm_i_string_length (buf) - 2)
7337d56d 1081 {
7f991c7d
LC
1082 SCM addy;
1083
889975e5 1084 scm_i_string_stop_writing ();
7f991c7d 1085 addy = scm_i_make_string (1024, NULL);
889975e5 1086 buf = scm_string_append (scm_list_2 (buf, addy));
7337d56d 1087 len = 0;
889975e5 1088 buf = scm_i_string_start_writing (buf);
7337d56d 1089 }
62850ef3 1090
7337d56d
LC
1091 if (finished)
1092 break;
1093 }
889975e5 1094 scm_i_string_stop_writing ();
7337d56d 1095
889975e5 1096 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
7337d56d
LC
1097}
1098
1099
1100\f
1101/* Top-level token readers, i.e., dispatchers. */
1102
1103static SCM
1104scm_read_sharp_extension (int chr, SCM port)
1105{
1106 SCM proc;
1107
1108 proc = scm_get_hash_procedure (chr);
1109 if (scm_is_true (scm_procedure_p (proc)))
1110 {
1111 long line = SCM_LINUM (port);
1112 int column = SCM_COL (port) - 2;
1113 SCM got;
1114
1115 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1116 if (!scm_is_eq (got, SCM_UNSPECIFIED))
62850ef3 1117 {
7337d56d
LC
1118 if (SCM_RECORD_POSITIONS_P)
1119 return (recsexpr (got, line, column,
1120 SCM_FILENAME (port)));
1121 else
1122 return got;
62850ef3 1123 }
09a4f039 1124 }
7337d56d
LC
1125
1126 return SCM_UNSPECIFIED;
1127}
1128
1129/* The reader for the sharp `#' character. It basically dispatches reads
1130 among the above token readers. */
1131static SCM
889975e5 1132scm_read_sharp (scm_t_wchar chr, SCM port)
7337d56d
LC
1133#define FUNC_NAME "scm_lreadr"
1134{
1135 SCM result;
1136
1137 chr = scm_getc (port);
1138
1139 result = scm_read_sharp_extension (chr, port);
1140 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1141 return result;
1142
1143 switch (chr)
1144 {
1145 case '\\':
1146 return (scm_read_character (chr, port));
1147 case '(':
1148 return (scm_read_vector (chr, port));
1149 case 's':
1150 case 'u':
1151 case 'f':
1152 /* This one may return either a boolean or an SRFI-4 vector. */
1153 return (scm_read_srfi4_vector (chr, port));
0ba0b384
LC
1154 case 'v':
1155 return (scm_read_bytevector (chr, port));
7337d56d
LC
1156 case '*':
1157 return (scm_read_guile_bit_vector (chr, port));
1158 case 't':
1159 case 'T':
1160 case 'F':
1161 /* This one may return either a boolean or an SRFI-4 vector. */
1162 return (scm_read_boolean (chr, port));
1163 case ':':
1164 return (scm_read_keyword (chr, port));
1165 case '0': case '1': case '2': case '3': case '4':
1166 case '5': case '6': case '7': case '8': case '9':
1167 case '@':
1168#if SCM_ENABLE_DEPRECATED
1169 /* See below for 'i' and 'e'. */
1170 case 'a':
1171 case 'c':
1172 case 'y':
1173 case 'h':
1174 case 'l':
1175#endif
1176 return (scm_i_read_array (port, chr));
1177
1178 case 'i':
1179 case 'e':
1180#if SCM_ENABLE_DEPRECATED
1181 {
1182 /* When next char is '(', it really is an old-style
1183 uniform array. */
889975e5 1184 scm_t_wchar next_c = scm_getc (port);
7337d56d
LC
1185 if (next_c != EOF)
1186 scm_ungetc (next_c, port);
1187 if (next_c == '(')
1188 return scm_i_read_array (port, chr);
1189 /* Fall through. */
1190 }
1191#endif
1192 case 'b':
1193 case 'B':
1194 case 'o':
1195 case 'O':
1196 case 'd':
1197 case 'D':
1198 case 'x':
1199 case 'X':
1200 case 'I':
1201 case 'E':
1202 return (scm_read_number_and_radix (chr, port));
1203 case '{':
1204 return (scm_read_extended_symbol (chr, port));
1205 case '!':
1206 return (scm_read_scsh_block_comment (chr, port));
34f3d47d
AW
1207 case ';':
1208 return (scm_read_commented_expression (chr, port));
1209 case '`':
1210 case '\'':
1211 case ',':
1212 return (scm_read_syntax (chr, port));
7337d56d
LC
1213 default:
1214 result = scm_read_sharp_extension (chr, port);
1215 if (scm_is_eq (result, SCM_UNSPECIFIED))
620c8965
LC
1216 {
1217 /* To remain compatible with 1.8 and earlier, the following
1218 characters have lower precedence than `read-hash-extend'
1219 characters. */
1220 switch (chr)
1221 {
1222 case '|':
1223 return scm_read_r6rs_block_comment (chr, port);
1224 default:
1225 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1226 scm_list_1 (SCM_MAKE_CHAR (chr)));
1227 }
1228 }
7337d56d
LC
1229 else
1230 return result;
1231 }
1232
1233 return SCM_UNSPECIFIED;
1234}
1235#undef FUNC_NAME
1236
1237static SCM
1238scm_read_expression (SCM port)
1239#define FUNC_NAME "scm_read_expression"
1240{
1241 while (1)
1242 {
889975e5 1243 register scm_t_wchar chr;
7337d56d
LC
1244
1245 chr = scm_getc (port);
1246
1247 switch (chr)
1248 {
1249 case SCM_WHITE_SPACES:
1250 case SCM_LINE_INCREMENTORS:
1251 break;
1252 case ';':
1253 (void) scm_read_semicolon_comment (chr, port);
1254 break;
1255 case '(':
1256 return (scm_read_sexp (chr, port));
1257 case '"':
1258 return (scm_read_string (chr, port));
1259 case '\'':
1260 case '`':
1261 case ',':
1262 return (scm_read_quote (chr, port));
1263 case '#':
1264 {
1265 SCM result;
1266 result = scm_read_sharp (chr, port);
1267 if (scm_is_eq (result, SCM_UNSPECIFIED))
1268 /* We read a comment or some such. */
1269 break;
1270 else
1271 return result;
1272 }
1273 case ')':
1274 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1275 break;
1276 case EOF:
1277 return SCM_EOF_VAL;
1278 case ':':
1279 if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
1280 return scm_symbol_to_keyword (scm_read_expression (port));
1281 /* Fall through. */
1282
1283 default:
1284 {
1285 if (((chr >= '0') && (chr <= '9'))
1286 || (strchr ("+-.", chr)))
1287 return (scm_read_number (chr, port));
1288 else
1289 return (scm_read_mixed_case_symbol (chr, port));
1290 }
1291 }
1292 }
1293}
1294#undef FUNC_NAME
1295
1296\f
1297/* Actual reader. */
1298
1299SCM_DEFINE (scm_read, "read", 0, 1, 0,
1300 (SCM port),
1301 "Read an s-expression from the input port @var{port}, or from\n"
1302 "the current input port if @var{port} is not specified.\n"
1303 "Any whitespace before the next token is discarded.")
1304#define FUNC_NAME s_scm_read
1305{
1306 int c;
1307
1308 if (SCM_UNBNDP (port))
1309 port = scm_current_input_port ();
1310 SCM_VALIDATE_OPINPORT (1, port);
1311
1312 c = flush_ws (port, (char *) NULL);
1313 if (EOF == c)
1314 return SCM_EOF_VAL;
1315 scm_ungetc (c, port);
1316
1317 return (scm_read_expression (port));
09a4f039 1318}
db4b4ca6 1319#undef FUNC_NAME
09a4f039 1320
0f2d19dd
JB
1321
1322\f
1323
7337d56d
LC
1324/* Used when recording expressions constructed by `scm_read_sharp ()'. */
1325static SCM
1326recsexpr (SCM obj, long line, int column, SCM filename)
1327{
1328 if (!scm_is_pair(obj)) {
1329 return obj;
1330 } else {
1331 SCM tmp = obj, copy;
1332 /* If this sexpr is visible in the read:sharp source, we want to
1333 keep that information, so only record non-constant cons cells
1334 which haven't previously been read by the reader. */
1335 if (scm_is_false (scm_whash_lookup (scm_source_whash, obj)))
1336 {
1337 if (SCM_COPY_SOURCE_P)
1338 {
1339 copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
1340 SCM_UNDEFINED);
1341 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1342 {
1343 SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
1344 line,
1345 column,
1346 filename),
1347 SCM_UNDEFINED));
1348 copy = SCM_CDR (copy);
1349 }
1350 SCM_SETCDR (copy, tmp);
1351 }
1352 else
1353 {
1354 recsexpr (SCM_CAR (obj), line, column, filename);
1355 while ((tmp = SCM_CDR (tmp)) && scm_is_pair (tmp))
1356 recsexpr (SCM_CAR (tmp), line, column, filename);
1357 copy = SCM_UNDEFINED;
1358 }
1359 scm_whash_insert (scm_source_whash,
1360 obj,
1361 scm_make_srcprops (line,
1362 column,
1363 filename,
1364 copy,
1365 SCM_EOL));
1366 }
1367 return obj;
1368 }
1369}
1370
14de3b42
GH
1371/* Manipulate the read-hash-procedures alist. This could be written in
1372 Scheme, but maybe it will also be used by C code during initialisation. */
a1ec6916 1373SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1bbd0b84 1374 (SCM chr, SCM proc),
dc7fa443
MG
1375 "Install the procedure @var{proc} for reading expressions\n"
1376 "starting with the character sequence @code{#} and @var{chr}.\n"
1377 "@var{proc} will be called with two arguments: the character\n"
1378 "@var{chr} and the port to read further data from. The object\n"
391f57e6
HWN
1379 "returned will be the return value of @code{read}. \n"
1380 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1381 )
1bbd0b84 1382#define FUNC_NAME s_scm_read_hash_extend
deca31e1 1383{
fed9c9a2
GH
1384 SCM this;
1385 SCM prev;
1386
36284627 1387 SCM_VALIDATE_CHAR (1, chr);
7888309b 1388 SCM_ASSERT (scm_is_false (proc)
bc36d050 1389 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
36284627 1390 proc, SCM_ARG2, FUNC_NAME);
fed9c9a2 1391
14de3b42
GH
1392 /* Check if chr is already in the alist. */
1393 this = *scm_read_hash_procedures;
1394 prev = SCM_BOOL_F;
fed9c9a2
GH
1395 while (1)
1396 {
d2e53ed6 1397 if (scm_is_null (this))
fed9c9a2
GH
1398 {
1399 /* not found, so add it to the beginning. */
7888309b 1400 if (scm_is_true (proc))
fed9c9a2 1401 {
14de3b42
GH
1402 *scm_read_hash_procedures =
1403 scm_cons (scm_cons (chr, proc), *scm_read_hash_procedures);
fed9c9a2
GH
1404 }
1405 break;
1406 }
bc36d050 1407 if (scm_is_eq (chr, SCM_CAAR (this)))
fed9c9a2
GH
1408 {
1409 /* already in the alist. */
7888309b 1410 if (scm_is_false (proc))
14de3b42
GH
1411 {
1412 /* remove it. */
7888309b 1413 if (scm_is_false (prev))
14de3b42
GH
1414 {
1415 *scm_read_hash_procedures =
1416 SCM_CDR (*scm_read_hash_procedures);
1417 }
1418 else
1419 scm_set_cdr_x (prev, SCM_CDR (this));
1420 }
fed9c9a2 1421 else
14de3b42
GH
1422 {
1423 /* replace it. */
1424 scm_set_cdr_x (SCM_CAR (this), proc);
1425 }
fed9c9a2
GH
1426 break;
1427 }
1428 prev = this;
1429 this = SCM_CDR (this);
1430 }
deca31e1 1431
deca31e1
GH
1432 return SCM_UNSPECIFIED;
1433}
1bbd0b84 1434#undef FUNC_NAME
0f2d19dd 1435
deca31e1
GH
1436/* Recover the read-hash procedure corresponding to char c. */
1437static SCM
6e8d25a6 1438scm_get_hash_procedure (int c)
deca31e1 1439{
14de3b42 1440 SCM rest = *scm_read_hash_procedures;
fed9c9a2 1441
deca31e1
GH
1442 while (1)
1443 {
d2e53ed6 1444 if (scm_is_null (rest))
deca31e1
GH
1445 return SCM_BOOL_F;
1446
7866a09b 1447 if (SCM_CHAR (SCM_CAAR (rest)) == c)
deca31e1
GH
1448 return SCM_CDAR (rest);
1449
1450 rest = SCM_CDR (rest);
1451 }
1452}
1cc91f1b 1453
889975e5
MG
1454#define SCM_ENCODING_SEARCH_SIZE (500)
1455
f8a1c9a8
LC
1456/* Search the first few hundred characters of a file for an Emacs-like coding
1457 declaration. Returns either NULL or a string whose storage has been
1458 allocated with `scm_gc_malloc ()'. */
889975e5 1459char *
f8a1c9a8 1460scm_i_scan_for_encoding (SCM port)
889975e5
MG
1461{
1462 char header[SCM_ENCODING_SEARCH_SIZE+1];
1463 size_t bytes_read;
1464 char *encoding = NULL;
1465 int utf8_bom = 0;
1466 char *pos;
1467 int i;
1468 int in_comment;
1469
49bb5bd3
LC
1470 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
1471 /* PORT is a non-seekable file port (e.g., as created by Bash when using
1472 "guile <(echo '(display "hello")')") so bail out. */
1473 return NULL;
1474
1475 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
1476
889975e5
MG
1477 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
1478
1479 if (bytes_read > 3
1480 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
1481 utf8_bom = 1;
1482
1483 /* search past "coding[:=]" */
1484 pos = header;
1485 while (1)
1486 {
1487 if ((pos = strstr(pos, "coding")) == NULL)
1488 return NULL;
1489
1490 pos += strlen("coding");
1491 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
1492 (*pos == ':' || *pos == '='))
1493 {
1494 pos ++;
1495 break;
1496 }
1497 }
1498
1499 /* skip spaces */
1500 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
1501 (*pos == ' ' || *pos == '\t'))
1502 pos ++;
1503
1504 /* grab the next token */
1505 i = 0;
1506 while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
060e305a 1507 && pos + i - header < bytes_read
6751d6db 1508 && (isalnum ((int) pos[i]) || strchr ("_-.:/,+=()", pos[i]) != NULL))
889975e5
MG
1509 i++;
1510
1511 if (i == 0)
1512 return NULL;
1513
b9fd657a 1514 encoding = scm_gc_strndup (pos, i, "encoding");
026ed239
MG
1515 for (i = 0; i < strlen (encoding); i++)
1516 encoding[i] = toupper ((int) encoding[i]);
889975e5
MG
1517
1518 /* push backwards to make sure we were in a comment */
1519 in_comment = 0;
1520 while (pos - i - header > 0)
1521 {
1522 if (*(pos - i) == '\n')
1523 {
1524 /* This wasn't in a semicolon comment. Check for a
1525 hash-bang comment. */
1526 char *beg = strstr (header, "#!");
1527 char *end = strstr (header, "!#");
1528 if (beg < pos && pos < end)
1529 in_comment = 1;
1530 break;
1531 }
1532 if (*(pos - i) == ';')
1533 {
1534 in_comment = 1;
1535 break;
1536 }
1537 i ++;
1538 }
1539 if (!in_comment)
f8a1c9a8
LC
1540 /* This wasn't in a comment */
1541 return NULL;
1542
889975e5
MG
1543 if (utf8_bom && strcmp(encoding, "UTF-8"))
1544 scm_misc_error (NULL,
1545 "the port input declares the encoding ~s but is encoded as UTF-8",
1546 scm_list_1 (scm_from_locale_string (encoding)));
f8a1c9a8 1547
889975e5
MG
1548 return encoding;
1549}
1550
1551SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
1552 (SCM port),
a270e133 1553 "Scans the port for an Emacs-like character coding declaration\n"
889975e5
MG
1554 "near the top of the contents of a port with random-acessible contents.\n"
1555 "The coding declaration is of the form\n"
1556 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
1557 "\n"
1558 "Returns a string containing the character encoding of the file\n"
1559 "if a declaration was found, or @code{#f} otherwise.\n")
1560#define FUNC_NAME s_scm_file_encoding
1561{
1562 char *enc;
1563 SCM s_enc;
f8a1c9a8
LC
1564
1565 enc = scm_i_scan_for_encoding (port);
889975e5
MG
1566 if (enc == NULL)
1567 return SCM_BOOL_F;
1568 else
1569 {
1570 s_enc = scm_from_locale_string (enc);
889975e5
MG
1571 return s_enc;
1572 }
f8a1c9a8 1573
889975e5
MG
1574 return SCM_BOOL_F;
1575}
1576#undef FUNC_NAME
1577
0f2d19dd
JB
1578void
1579scm_init_read ()
0f2d19dd 1580{
14de3b42 1581 scm_read_hash_procedures =
86d31dfe 1582 SCM_VARIABLE_LOC (scm_c_define ("read-hash-procedures", SCM_EOL));
fed9c9a2 1583
62560650 1584 scm_init_opts (scm_read_options, scm_read_opts);
a0599745 1585#include "libguile/read.x"
0f2d19dd 1586}
89e00824
ML
1587
1588/*
1589 Local Variables:
1590 c-file-style: "gnu"
1591 End:
1592*/