read: Support R7RS |...| symbol notation.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33 #include <c-strcase.h>
34 #include <c-ctype.h>
35
36 #include "libguile/_scm.h"
37 #include "libguile/bytevectors.h"
38 #include "libguile/chars.h"
39 #include "libguile/eval.h"
40 #include "libguile/arrays.h"
41 #include "libguile/bitvectors.h"
42 #include "libguile/keywords.h"
43 #include "libguile/alist.h"
44 #include "libguile/srcprop.h"
45 #include "libguile/hashtab.h"
46 #include "libguile/hash.h"
47 #include "libguile/ports.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
56
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
59
60
61 \f
62
63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
66 SCM_SYMBOL (sym_nil, "nil");
67
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
72
73 scm_t_option scm_read_opts[] =
74 {
75 { SCM_OPTION_BOOLEAN, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
93 { 0, },
94 };
95
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
99
100 enum t_keyword_style
101 {
102 KEYWORD_STYLE_HASH_PREFIX,
103 KEYWORD_STYLE_PREFIX,
104 KEYWORD_STYLE_POSTFIX
105 };
106
107 struct t_read_opts
108 {
109 enum t_keyword_style keyword_style;
110 unsigned int copy_source_p : 1;
111 unsigned int record_positions_p : 1;
112 unsigned int case_insensitive_p : 1;
113 unsigned int r6rs_escapes_p : 1;
114 unsigned int square_brackets_p : 1;
115 unsigned int hungry_eol_escapes_p : 1;
116 unsigned int curly_infix_p : 1;
117 unsigned int neoteric_p : 1;
118 unsigned int r7rs_symbols_p : 1;
119 };
120
121 typedef struct t_read_opts scm_t_read_opts;
122
123
124 /*
125 Give meaningful error messages for errors
126
127 We use the format
128
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
131
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
134
135 */
136
137
138 void
139 scm_i_input_error (char const *function,
140 SCM port, const char *message, SCM arg)
141 {
142 SCM fn = (scm_is_string (SCM_FILENAME(port))
143 ? SCM_FILENAME(port)
144 : scm_from_locale_string ("#<unknown port>"));
145
146 SCM string_port = scm_open_output_string ();
147 SCM string = SCM_EOL;
148 scm_simple_format (string_port,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
150 scm_list_4 (fn,
151 scm_from_long (SCM_LINUM (port) + 1),
152 scm_from_int (SCM_COL (port) + 1),
153 scm_from_locale_string (message)));
154
155 string = scm_get_output_string (string_port);
156 scm_close_output_port (string_port);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function? scm_from_locale_string (function) : SCM_BOOL_F,
159 string,
160 arg,
161 SCM_BOOL_F);
162 }
163
164
165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
166 (SCM setting),
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
171 {
172 SCM ans = scm_options (setting,
173 scm_read_opts,
174 FUNC_NAME);
175 if (SCM_COPY_SOURCE_P)
176 SCM_RECORD_POSITIONS_P = 1;
177 return ans;
178 }
179 #undef FUNC_NAME
180
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM *scm_i_read_hash_procedures;
184
185 static SCM
186 scm_i_read_hash_procedures_ref (void)
187 {
188 return scm_fluid_ref (*scm_i_read_hash_procedures);
189 }
190
191 static void
192 scm_i_read_hash_procedures_set_x (SCM value)
193 {
194 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
195 }
196
197 \f
198 /* Token readers. */
199
200
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
203
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
206
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
209
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
212
213
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
218
219 #ifdef MSDOS
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
222 #else
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
224 #endif
225
226
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
228 structure''). */
229 #define CHAR_IS_R5RS_DELIMITER(c) \
230 (CHAR_IS_BLANK (c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
232
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
238
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
240 Structure''. */
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
244
245 /* Read an SCSH block comment. */
246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
250 static SCM scm_get_hash_procedure (int);
251
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
256 static int
257 read_token (SCM port, scm_t_read_opts *opts,
258 char *buf, size_t buf_size, size_t *read)
259 {
260 *read = 0;
261
262 while (*read < buf_size)
263 {
264 int chr;
265
266 chr = scm_get_byte_or_eof (port);
267
268 if (chr == EOF)
269 return 0;
270 else if (CHAR_IS_DELIMITER (chr))
271 {
272 scm_unget_byte (chr, port);
273 return 0;
274 }
275 else
276 {
277 *buf = (char) chr;
278 buf++, (*read)++;
279 }
280 }
281
282 return 1;
283 }
284
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
287 static char *
288 read_complete_token (SCM port, scm_t_read_opts *opts,
289 char *buffer, size_t buffer_size, size_t *read)
290 {
291 int overflow = 0;
292 size_t bytes_read, overflow_size = 0;
293 char *overflow_buffer = NULL;
294
295 do
296 {
297 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
298 if (bytes_read == 0)
299 break;
300 if (overflow || overflow_size != 0)
301 {
302 if (overflow_size == 0)
303 {
304 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
305 memcpy (overflow_buffer, buffer, bytes_read);
306 overflow_size = bytes_read;
307 }
308 else
309 {
310 char *new_buf =
311 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
312
313 memcpy (new_buf, overflow_buffer, overflow_size);
314 memcpy (new_buf + overflow_size, buffer, bytes_read);
315
316 overflow_buffer = new_buf;
317 overflow_size += bytes_read;
318 }
319 }
320 }
321 while (overflow);
322
323 if (overflow_size)
324 *read = overflow_size;
325 else
326 *read = bytes_read;
327
328 return (overflow_size > 0 ? overflow_buffer : buffer);
329 }
330
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
333 static int
334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
335 {
336 scm_t_wchar c;
337 while (1)
338 switch (c = scm_getc (port))
339 {
340 case EOF:
341 goteof:
342 if (eoferr)
343 {
344 scm_i_input_error (eoferr,
345 port,
346 "end of file",
347 SCM_EOL);
348 }
349 return c;
350
351 case ';':
352 lp:
353 switch (c = scm_getc (port))
354 {
355 case EOF:
356 goto goteof;
357 default:
358 goto lp;
359 case SCM_LINE_INCREMENTORS:
360 break;
361 }
362 break;
363
364 case '#':
365 switch (c = scm_getc (port))
366 {
367 case EOF:
368 eoferr = "read_sharp";
369 goto goteof;
370 case '!':
371 scm_read_shebang (c, port, opts);
372 break;
373 case ';':
374 scm_read_commented_expression (c, port, opts);
375 break;
376 case '|':
377 if (scm_is_false (scm_get_hash_procedure (c)))
378 {
379 scm_read_r6rs_block_comment (c, port);
380 break;
381 }
382 /* fall through */
383 default:
384 scm_ungetc (c, port);
385 return '#';
386 }
387 break;
388
389 case SCM_LINE_INCREMENTORS:
390 case SCM_SINGLE_SPACES:
391 case '\t':
392 break;
393
394 default:
395 return c;
396 }
397
398 return 0;
399 }
400
401
402 \f
403 /* Token readers. */
404
405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
407 long line, int column);
408
409
410 static SCM
411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
412 long line, int column)
413 {
414 if (opts->record_positions_p)
415 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
416 return x;
417 }
418
419 static SCM
420 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
421 #define FUNC_NAME "scm_i_lreadparen"
422 {
423 int c;
424 SCM tmp, tl, ans = SCM_EOL;
425 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
426 const int terminating_char = ((chr == '{') ? '}'
427 : ((chr == '[') ? ']'
428 : ')'));
429
430 /* Need to capture line and column numbers here. */
431 long line = SCM_LINUM (port);
432 int column = SCM_COL (port) - 1;
433
434 c = flush_ws (port, opts, FUNC_NAME);
435 if (terminating_char == c)
436 return SCM_EOL;
437
438 scm_ungetc (c, port);
439 tmp = scm_read_expression (port, opts);
440
441 /* Note that it is possible for scm_read_expression to return
442 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
443 check that it's a real dot by checking `c'. */
444 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
445 {
446 ans = scm_read_expression (port, opts);
447 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
448 scm_i_input_error (FUNC_NAME, port, "missing close paren",
449 SCM_EOL);
450 return ans;
451 }
452
453 /* Build the head of the list structure. */
454 ans = tl = scm_cons (tmp, SCM_EOL);
455
456 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
457 {
458 SCM new_tail;
459
460 if (c == ')' || (c == ']' && opts->square_brackets_p)
461 || ((c == '}' || c == ']') && opts->curly_infix_p))
462 scm_i_input_error (FUNC_NAME, port,
463 "in pair: mismatched close paren: ~A",
464 scm_list_1 (SCM_MAKE_CHAR (c)));
465
466 scm_ungetc (c, port);
467 tmp = scm_read_expression (port, opts);
468
469 /* See above note about scm_sym_dot. */
470 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
471 {
472 SCM_SETCDR (tl, scm_read_expression (port, opts));
473
474 c = flush_ws (port, opts, FUNC_NAME);
475 if (terminating_char != c)
476 scm_i_input_error (FUNC_NAME, port,
477 "in pair: missing close paren", SCM_EOL);
478 break;
479 }
480
481 new_tail = scm_cons (tmp, SCM_EOL);
482 SCM_SETCDR (tl, new_tail);
483 tl = new_tail;
484 }
485
486 if (curly_list_p)
487 {
488 /* In addition to finding the length, 'scm_ilength' checks for
489 improper or circular lists, in which case it returns -1. */
490 int len = scm_ilength (ans);
491
492 /* The (len == 0) case is handled above */
493 if (len == 1)
494 /* Return directly to avoid re-annotating the element's source
495 location with the position of the outer brace. Also, it
496 might not be possible to annotate the element. */
497 return scm_car (ans); /* {e} => e */
498 else if (len == 2)
499 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
500 else if (len >= 3 && (len & 1))
501 {
502 /* It's a proper list whose length is odd and at least 3. If
503 the elements at odd indices (the infix operator positions)
504 are all 'equal?', then it's a simple curly-infix list.
505 Otherwise it's a mixed curly-infix list. */
506 SCM op = scm_cadr (ans);
507
508 /* Check to see if the elements at odd indices are 'equal?' */
509 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
510 {
511 if (scm_is_null (tl))
512 {
513 /* Convert simple curly-infix list to prefix:
514 {a <op> b <op> ...} => (<op> a b ...) */
515 tl = ans;
516 while (scm_is_pair (scm_cdr (tl)))
517 {
518 tmp = scm_cddr (tl);
519 SCM_SETCDR (tl, tmp);
520 tl = tmp;
521 }
522 ans = scm_cons (op, ans);
523 break;
524 }
525 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
526 {
527 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
528 ans = scm_cons (sym_nfx, ans);
529 break;
530 }
531 }
532 }
533 else
534 /* Mixed curly-infix (possibly improper) list:
535 {e . tail} => ($nfx$ e . tail) */
536 ans = scm_cons (sym_nfx, ans);
537 }
538
539 return maybe_annotate_source (ans, port, opts, line, column);
540 }
541 #undef FUNC_NAME
542
543
544 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
545 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
546 found. */
547 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
548 do \
549 { \
550 scm_t_wchar a; \
551 size_t i = 0; \
552 c = 0; \
553 while (i < ndigits) \
554 { \
555 a = scm_getc (port); \
556 if (a == EOF) \
557 goto str_eof; \
558 if (terminator \
559 && (a == (scm_t_wchar) terminator) \
560 && (i > 0)) \
561 break; \
562 if ('0' <= a && a <= '9') \
563 a -= '0'; \
564 else if ('A' <= a && a <= 'F') \
565 a = a - 'A' + 10; \
566 else if ('a' <= a && a <= 'f') \
567 a = a - 'a' + 10; \
568 else \
569 { \
570 c = a; \
571 goto bad_escaped; \
572 } \
573 c = c * 16 + a; \
574 i ++; \
575 } \
576 } while (0)
577
578 static void
579 skip_intraline_whitespace (SCM port)
580 {
581 scm_t_wchar c;
582
583 do
584 {
585 c = scm_getc (port);
586 if (c == EOF)
587 return;
588 }
589 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
590
591 scm_ungetc (c, port);
592 }
593
594 /* Read either a double-quoted string or an R7RS-style symbol delimited
595 by vertical lines, depending on the value of 'chr' ('"' or '|').
596 Regardless, the result is always returned as a string. */
597 static SCM
598 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
599 #define FUNC_NAME "scm_lreadr"
600 {
601 /* For strings smaller than C_STR, this function creates only one Scheme
602 object (the string returned). */
603
604 SCM str = SCM_EOL;
605 size_t c_str_len = 0;
606 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
607
608 /* Need to capture line and column numbers here. */
609 long line = SCM_LINUM (port);
610 int column = SCM_COL (port) - 1;
611
612 while (chr != (c = scm_getc (port)))
613 {
614 if (c == EOF)
615 {
616 str_eof:
617 scm_i_input_error (FUNC_NAME, port,
618 (chr == '|'
619 ? "end of file in symbol"
620 : "end of file in string constant"),
621 SCM_EOL);
622 }
623
624 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
625 {
626 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
627 c_str_len = 0;
628 }
629
630 if (c == '\\')
631 {
632 switch (c = scm_getc (port))
633 {
634 case EOF:
635 goto str_eof;
636 case '|':
637 case '\\':
638 break;
639 case '\n':
640 if (opts->hungry_eol_escapes_p)
641 skip_intraline_whitespace (port);
642 continue;
643 case '0':
644 c = '\0';
645 break;
646 case 'f':
647 c = '\f';
648 break;
649 case 'n':
650 c = '\n';
651 break;
652 case 'r':
653 c = '\r';
654 break;
655 case 't':
656 c = '\t';
657 break;
658 case 'a':
659 c = '\007';
660 break;
661 case 'v':
662 c = '\v';
663 break;
664 case 'b':
665 c = '\010';
666 break;
667 case 'x':
668 if (opts->r6rs_escapes_p || chr == '|')
669 SCM_READ_HEX_ESCAPE (10, ';');
670 else
671 SCM_READ_HEX_ESCAPE (2, '\0');
672 break;
673 case 'u':
674 if (!opts->r6rs_escapes_p)
675 {
676 SCM_READ_HEX_ESCAPE (4, '\0');
677 break;
678 }
679 case 'U':
680 if (!opts->r6rs_escapes_p)
681 {
682 SCM_READ_HEX_ESCAPE (6, '\0');
683 break;
684 }
685 default:
686 if (c == chr)
687 break;
688 bad_escaped:
689 scm_i_input_error (FUNC_NAME, port,
690 "illegal character in escape sequence: ~S",
691 scm_list_1 (SCM_MAKE_CHAR (c)));
692 }
693 }
694
695 c_str[c_str_len++] = c;
696 }
697
698 if (scm_is_null (str))
699 /* Fast path: we got a string that fits in C_STR. */
700 str = scm_from_utf32_stringn (c_str, c_str_len);
701 else
702 {
703 if (c_str_len > 0)
704 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
705
706 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
707 }
708
709 return maybe_annotate_source (str, port, opts, line, column);
710 }
711 #undef FUNC_NAME
712
713 static SCM
714 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
715 {
716 return scm_read_string_like_syntax (chr, port, opts);
717 }
718
719 static SCM
720 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
721 {
722 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
723 }
724
725 static SCM
726 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
727 {
728 SCM result, str = SCM_EOL;
729 char local_buffer[READER_BUFFER_SIZE], *buffer;
730 size_t bytes_read;
731 scm_t_port *pt = SCM_PTAB_ENTRY (port);
732
733 /* Need to capture line and column numbers here. */
734 long line = SCM_LINUM (port);
735 int column = SCM_COL (port) - 1;
736
737 scm_ungetc (chr, port);
738 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
739 &bytes_read);
740
741 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
742
743 result = scm_string_to_number (str, SCM_UNDEFINED);
744 if (scm_is_false (result))
745 {
746 /* Return a symbol instead of a number */
747 if (opts->case_insensitive_p)
748 str = scm_string_downcase_x (str);
749 result = scm_string_to_symbol (str);
750 }
751 else if (SCM_NIMP (result))
752 result = maybe_annotate_source (result, port, opts, line, column);
753
754 SCM_COL (port) += scm_i_string_length (str);
755 return result;
756 }
757
758 static SCM
759 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
760 {
761 SCM result;
762 int ends_with_colon = 0;
763 size_t bytes_read;
764 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
765 char local_buffer[READER_BUFFER_SIZE], *buffer;
766 scm_t_port *pt = SCM_PTAB_ENTRY (port);
767 SCM str;
768
769 scm_ungetc (chr, port);
770 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
771 &bytes_read);
772 if (bytes_read > 0)
773 ends_with_colon = buffer[bytes_read - 1] == ':';
774
775 if (postfix && ends_with_colon && (bytes_read > 1))
776 {
777 str = scm_from_stringn (buffer, bytes_read - 1,
778 pt->encoding, pt->ilseq_handler);
779
780 if (opts->case_insensitive_p)
781 str = scm_string_downcase_x (str);
782 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
783 }
784 else
785 {
786 str = scm_from_stringn (buffer, bytes_read,
787 pt->encoding, pt->ilseq_handler);
788
789 if (opts->case_insensitive_p)
790 str = scm_string_downcase_x (str);
791 result = scm_string_to_symbol (str);
792 }
793
794 SCM_COL (port) += scm_i_string_length (str);
795 return result;
796 }
797
798 static SCM
799 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
800 #define FUNC_NAME "scm_lreadr"
801 {
802 SCM result;
803 size_t read;
804 char local_buffer[READER_BUFFER_SIZE], *buffer;
805 unsigned int radix;
806 SCM str;
807 scm_t_port *pt;
808
809 switch (chr)
810 {
811 case 'B':
812 case 'b':
813 radix = 2;
814 break;
815
816 case 'o':
817 case 'O':
818 radix = 8;
819 break;
820
821 case 'd':
822 case 'D':
823 radix = 10;
824 break;
825
826 case 'x':
827 case 'X':
828 radix = 16;
829 break;
830
831 default:
832 scm_ungetc (chr, port);
833 scm_ungetc ('#', port);
834 radix = 10;
835 }
836
837 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
838 &read);
839
840 pt = SCM_PTAB_ENTRY (port);
841 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
842
843 result = scm_string_to_number (str, scm_from_uint (radix));
844
845 SCM_COL (port) += scm_i_string_length (str);
846
847 if (scm_is_true (result))
848 return result;
849
850 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
851
852 return SCM_BOOL_F;
853 }
854 #undef FUNC_NAME
855
856 static SCM
857 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
858 {
859 SCM p;
860 long line = SCM_LINUM (port);
861 int column = SCM_COL (port) - 1;
862
863 switch (chr)
864 {
865 case '`':
866 p = scm_sym_quasiquote;
867 break;
868
869 case '\'':
870 p = scm_sym_quote;
871 break;
872
873 case ',':
874 {
875 scm_t_wchar c;
876
877 c = scm_getc (port);
878 if ('@' == c)
879 p = scm_sym_uq_splicing;
880 else
881 {
882 scm_ungetc (c, port);
883 p = scm_sym_unquote;
884 }
885 break;
886 }
887
888 default:
889 fprintf (stderr, "%s: unhandled quote character (%i)\n",
890 "scm_read_quote", chr);
891 abort ();
892 }
893
894 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
895 return maybe_annotate_source (p, port, opts, line, column);
896 }
897
898 SCM_SYMBOL (sym_syntax, "syntax");
899 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
900 SCM_SYMBOL (sym_unsyntax, "unsyntax");
901 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
902
903 static SCM
904 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
905 {
906 SCM p;
907 long line = SCM_LINUM (port);
908 int column = SCM_COL (port) - 1;
909
910 switch (chr)
911 {
912 case '`':
913 p = sym_quasisyntax;
914 break;
915
916 case '\'':
917 p = sym_syntax;
918 break;
919
920 case ',':
921 {
922 int c;
923
924 c = scm_getc (port);
925 if ('@' == c)
926 p = sym_unsyntax_splicing;
927 else
928 {
929 scm_ungetc (c, port);
930 p = sym_unsyntax;
931 }
932 break;
933 }
934
935 default:
936 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
937 "scm_read_syntax", chr);
938 abort ();
939 }
940
941 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
942 return maybe_annotate_source (p, port, opts, line, column);
943 }
944
945 static SCM
946 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
947 {
948 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
949
950 if (!scm_is_eq (id, sym_nil))
951 scm_i_input_error ("scm_read_nil", port,
952 "unexpected input while reading #nil: ~a",
953 scm_list_1 (id));
954
955 return SCM_ELISP_NIL;
956 }
957
958 static SCM
959 scm_read_semicolon_comment (int chr, SCM port)
960 {
961 int c;
962
963 /* We use the get_byte here because there is no need to get the
964 locale correct with comment input. This presumes that newline
965 always represents itself no matter what the encoding is. */
966 for (c = scm_get_byte_or_eof (port);
967 (c != EOF) && (c != '\n');
968 c = scm_get_byte_or_eof (port));
969
970 return SCM_UNSPECIFIED;
971 }
972
973 /* If the EXPECTED_CHARS are the next ones available from PORT, then
974 consume them and return 1. Otherwise leave the port position where
975 it was and return 0. EXPECTED_CHARS should be all lowercase, and
976 will be matched case-insensitively against the characters read from
977 PORT. */
978 static int
979 try_read_ci_chars (SCM port, const char *expected_chars)
980 {
981 int num_chars_wanted = strlen (expected_chars);
982 int num_chars_read = 0;
983 char *chars_read = alloca (num_chars_wanted);
984 int c;
985
986 while (num_chars_read < num_chars_wanted)
987 {
988 c = scm_getc (port);
989 if (c == EOF)
990 break;
991 else if (c_tolower (c) != expected_chars[num_chars_read])
992 {
993 scm_ungetc (c, port);
994 break;
995 }
996 else
997 chars_read[num_chars_read++] = c;
998 }
999
1000 if (num_chars_read == num_chars_wanted)
1001 return 1;
1002 else
1003 {
1004 while (num_chars_read > 0)
1005 scm_ungetc (chars_read[--num_chars_read], port);
1006 return 0;
1007 }
1008 }
1009
1010 \f
1011 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1012
1013 static SCM
1014 scm_read_boolean (int chr, SCM port)
1015 {
1016 switch (chr)
1017 {
1018 case 't':
1019 case 'T':
1020 try_read_ci_chars (port, "rue");
1021 return SCM_BOOL_T;
1022
1023 case 'f':
1024 case 'F':
1025 try_read_ci_chars (port, "alse");
1026 return SCM_BOOL_F;
1027 }
1028
1029 return SCM_UNSPECIFIED;
1030 }
1031
1032 static SCM
1033 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1034 #define FUNC_NAME "scm_lreadr"
1035 {
1036 char buffer[READER_CHAR_NAME_MAX_SIZE];
1037 SCM charname;
1038 size_t charname_len, bytes_read;
1039 scm_t_wchar cp;
1040 int overflow;
1041 scm_t_port *pt;
1042
1043 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1044 &bytes_read);
1045 if (overflow)
1046 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1047
1048 if (bytes_read == 0)
1049 {
1050 chr = scm_getc (port);
1051 if (chr == EOF)
1052 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1053 "while reading character", SCM_EOL);
1054
1055 /* CHR must be a token delimiter, like a whitespace. */
1056 return (SCM_MAKE_CHAR (chr));
1057 }
1058
1059 pt = SCM_PTAB_ENTRY (port);
1060
1061 /* Simple ASCII characters can be processed immediately. Also, simple
1062 ISO-8859-1 characters can be processed immediately if the encoding for this
1063 port is ISO-8859-1. */
1064 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1065 {
1066 SCM_COL (port) += 1;
1067 return SCM_MAKE_CHAR (buffer[0]);
1068 }
1069
1070 /* Otherwise, convert the buffer into a proper scheme string for
1071 processing. */
1072 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1073 pt->ilseq_handler);
1074 charname_len = scm_i_string_length (charname);
1075 SCM_COL (port) += charname_len;
1076 cp = scm_i_string_ref (charname, 0);
1077 if (charname_len == 1)
1078 return SCM_MAKE_CHAR (cp);
1079
1080 /* Ignore dotted circles, which may be used to keep combining characters from
1081 combining with the backslash in #\charname. */
1082 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1083 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1084
1085 if (cp >= '0' && cp < '8')
1086 {
1087 /* Dirk:FIXME:: This type of character syntax is not R5RS
1088 * compliant. Further, it should be verified that the constant
1089 * does only consist of octal digits. */
1090 SCM p = scm_string_to_number (charname, scm_from_uint (8));
1091 if (SCM_I_INUMP (p))
1092 {
1093 scm_t_wchar c = scm_to_uint32 (p);
1094 if (SCM_IS_UNICODE_CHAR (c))
1095 return SCM_MAKE_CHAR (c);
1096 else
1097 scm_i_input_error (FUNC_NAME, port,
1098 "out-of-range octal character escape: ~a",
1099 scm_list_1 (charname));
1100 }
1101 }
1102
1103 if (cp == 'x' && (charname_len > 1))
1104 {
1105 SCM p;
1106
1107 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1108 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1109 scm_from_uint (16));
1110 if (SCM_I_INUMP (p))
1111 {
1112 scm_t_wchar c = scm_to_uint32 (p);
1113 if (SCM_IS_UNICODE_CHAR (c))
1114 return SCM_MAKE_CHAR (c);
1115 else
1116 scm_i_input_error (FUNC_NAME, port,
1117 "out-of-range hex character escape: ~a",
1118 scm_list_1 (charname));
1119 }
1120 }
1121
1122 /* The names of characters should never have non-Latin1
1123 characters. */
1124 if (scm_i_is_narrow_string (charname)
1125 || scm_i_try_narrow_string (charname))
1126 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1127 charname_len);
1128 if (scm_is_true (ch))
1129 return ch;
1130 }
1131
1132 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1133 scm_list_1 (charname));
1134
1135 return SCM_UNSPECIFIED;
1136 }
1137 #undef FUNC_NAME
1138
1139 static SCM
1140 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1141 {
1142 SCM symbol;
1143
1144 /* Read the symbol that comprises the keyword. Doing this instead of
1145 invoking a specific symbol reader function allows `scm_read_keyword ()'
1146 to adapt to the delimiters currently valid of symbols.
1147
1148 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1149 symbol = scm_read_expression (port, opts);
1150 if (!scm_is_symbol (symbol))
1151 scm_i_input_error ("scm_read_keyword", port,
1152 "keyword prefix `~a' not followed by a symbol: ~s",
1153 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1154
1155 return (scm_symbol_to_keyword (symbol));
1156 }
1157
1158 static SCM
1159 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1160 long line, int column)
1161 {
1162 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1163 guarantee that it's going to do what we want. After all, this is an
1164 implementation detail of `scm_read_vector ()', not a desirable
1165 property. */
1166 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1167 port, opts, line, column);
1168 }
1169
1170 /* Helper used by scm_read_array */
1171 static int
1172 read_decimal_integer (SCM port, int c, ssize_t *resp)
1173 {
1174 ssize_t sign = 1;
1175 ssize_t res = 0;
1176 int got_it = 0;
1177
1178 if (c == '-')
1179 {
1180 sign = -1;
1181 c = scm_getc (port);
1182 }
1183
1184 while ('0' <= c && c <= '9')
1185 {
1186 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1187 scm_i_input_error ("read_decimal_integer", port,
1188 "number too large", SCM_EOL);
1189 res = 10*res + c-'0';
1190 got_it = 1;
1191 c = scm_getc (port);
1192 }
1193
1194 if (got_it)
1195 *resp = sign * res;
1196 return c;
1197 }
1198
1199 /* Read an array. This function can also read vectors and uniform
1200 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1201 handled here.
1202
1203 C is the first character read after the '#'. */
1204 static SCM
1205 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1206 {
1207 ssize_t rank;
1208 scm_t_wchar tag_buf[8];
1209 int tag_len;
1210
1211 SCM tag, shape = SCM_BOOL_F, elements, array;
1212
1213 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1214 the array code can not deal with zero-length dimensions yet, and
1215 we want to allow zero-length vectors, of course. */
1216 if (c == '(')
1217 return scm_read_vector (c, port, opts, line, column);
1218
1219 /* Disambiguate between '#f' and uniform floating point vectors. */
1220 if (c == 'f')
1221 {
1222 c = scm_getc (port);
1223 if (c != '3' && c != '6')
1224 {
1225 if (c == 'a' && try_read_ci_chars (port, "lse"))
1226 return SCM_BOOL_F;
1227 else if (c != EOF)
1228 scm_ungetc (c, port);
1229 return SCM_BOOL_F;
1230 }
1231 rank = 1;
1232 tag_buf[0] = 'f';
1233 tag_len = 1;
1234 goto continue_reading_tag;
1235 }
1236
1237 /* Read rank. */
1238 rank = 1;
1239 c = read_decimal_integer (port, c, &rank);
1240 if (rank < 0)
1241 scm_i_input_error (NULL, port, "array rank must be non-negative",
1242 SCM_EOL);
1243
1244 /* Read tag. */
1245 tag_len = 0;
1246 continue_reading_tag:
1247 while (c != EOF && c != '(' && c != '@' && c != ':'
1248 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1249 {
1250 tag_buf[tag_len++] = c;
1251 c = scm_getc (port);
1252 }
1253 if (tag_len == 0)
1254 tag = SCM_BOOL_T;
1255 else
1256 {
1257 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1258 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1259 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1260 scm_list_1 (tag));
1261 }
1262
1263 /* Read shape. */
1264 if (c == '@' || c == ':')
1265 {
1266 shape = SCM_EOL;
1267
1268 do
1269 {
1270 ssize_t lbnd = 0, len = 0;
1271 SCM s;
1272
1273 if (c == '@')
1274 {
1275 c = scm_getc (port);
1276 c = read_decimal_integer (port, c, &lbnd);
1277 }
1278
1279 s = scm_from_ssize_t (lbnd);
1280
1281 if (c == ':')
1282 {
1283 c = scm_getc (port);
1284 c = read_decimal_integer (port, c, &len);
1285 if (len < 0)
1286 scm_i_input_error (NULL, port,
1287 "array length must be non-negative",
1288 SCM_EOL);
1289
1290 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1291 }
1292
1293 shape = scm_cons (s, shape);
1294 } while (c == '@' || c == ':');
1295
1296 shape = scm_reverse_x (shape, SCM_EOL);
1297 }
1298
1299 /* Read nested lists of elements. */
1300 if (c != '(')
1301 scm_i_input_error (NULL, port,
1302 "missing '(' in vector or array literal",
1303 SCM_EOL);
1304 elements = scm_read_sexp (c, port, opts);
1305
1306 if (scm_is_false (shape))
1307 shape = scm_from_ssize_t (rank);
1308 else if (scm_ilength (shape) != rank)
1309 scm_i_input_error
1310 (NULL, port,
1311 "the number of shape specifications must match the array rank",
1312 SCM_EOL);
1313
1314 /* Handle special print syntax of rank zero arrays; see
1315 scm_i_print_array for a rationale. */
1316 if (rank == 0)
1317 {
1318 if (!scm_is_pair (elements))
1319 scm_i_input_error (NULL, port,
1320 "too few elements in array literal, need 1",
1321 SCM_EOL);
1322 if (!scm_is_null (SCM_CDR (elements)))
1323 scm_i_input_error (NULL, port,
1324 "too many elements in array literal, want 1",
1325 SCM_EOL);
1326 elements = SCM_CAR (elements);
1327 }
1328
1329 /* Construct array, annotate with source location, and return. */
1330 array = scm_list_to_typed_array (tag, shape, elements);
1331 return maybe_annotate_source (array, port, opts, line, column);
1332 }
1333
1334 static SCM
1335 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1336 long line, int column)
1337 {
1338 return scm_read_array (chr, port, opts, line, column);
1339 }
1340
1341 static SCM
1342 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1343 long line, int column)
1344 {
1345 chr = scm_getc (port);
1346 if (chr != 'u')
1347 goto syntax;
1348
1349 chr = scm_getc (port);
1350 if (chr != '8')
1351 goto syntax;
1352
1353 chr = scm_getc (port);
1354 if (chr != '(')
1355 goto syntax;
1356
1357 return maybe_annotate_source
1358 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1359 port, opts, line, column);
1360
1361 syntax:
1362 scm_i_input_error ("read_bytevector", port,
1363 "invalid bytevector prefix",
1364 SCM_MAKE_CHAR (chr));
1365 return SCM_UNSPECIFIED;
1366 }
1367
1368 static SCM
1369 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1370 long line, int column)
1371 {
1372 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1373 terribly inefficient but who cares? */
1374 SCM s_bits = SCM_EOL;
1375
1376 for (chr = scm_getc (port);
1377 (chr != EOF) && ((chr == '0') || (chr == '1'));
1378 chr = scm_getc (port))
1379 {
1380 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1381 }
1382
1383 if (chr != EOF)
1384 scm_ungetc (chr, port);
1385
1386 return maybe_annotate_source
1387 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1388 port, opts, line, column);
1389 }
1390
1391 static SCM
1392 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1393 {
1394 int bang_seen = 0;
1395
1396 for (;;)
1397 {
1398 int c = scm_getc (port);
1399
1400 if (c == EOF)
1401 scm_i_input_error ("skip_block_comment", port,
1402 "unterminated `#! ... !#' comment", SCM_EOL);
1403
1404 if (c == '!')
1405 bang_seen = 1;
1406 else if (c == '#' && bang_seen)
1407 break;
1408 else
1409 bang_seen = 0;
1410 }
1411
1412 return SCM_UNSPECIFIED;
1413 }
1414
1415 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1416 int value);
1417 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1418 int value);
1419 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1420 int value);
1421
1422 static SCM
1423 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1424 {
1425 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1426 int c;
1427 int i = 0;
1428
1429 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1430 {
1431 c = scm_getc (port);
1432 if (c == EOF)
1433 scm_i_input_error ("skip_block_comment", port,
1434 "unterminated `#! ... !#' comment", SCM_EOL);
1435 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1436 name[i++] = c;
1437 else if (CHAR_IS_DELIMITER (c))
1438 {
1439 scm_ungetc (c, port);
1440 name[i] = '\0';
1441 if (0 == strcmp ("r6rs", name))
1442 ; /* Silently ignore */
1443 else if (0 == strcmp ("fold-case", name))
1444 set_port_case_insensitive_p (port, opts, 1);
1445 else if (0 == strcmp ("no-fold-case", name))
1446 set_port_case_insensitive_p (port, opts, 0);
1447 else if (0 == strcmp ("curly-infix", name))
1448 set_port_curly_infix_p (port, opts, 1);
1449 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1450 {
1451 set_port_curly_infix_p (port, opts, 1);
1452 set_port_square_brackets_p (port, opts, 0);
1453 }
1454 else
1455 break;
1456
1457 return SCM_UNSPECIFIED;
1458 }
1459 else
1460 {
1461 scm_ungetc (c, port);
1462 break;
1463 }
1464 }
1465 while (i > 0)
1466 scm_ungetc (name[--i], port);
1467 return scm_read_scsh_block_comment (chr, port);
1468 }
1469
1470 static SCM
1471 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1472 {
1473 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1474 nested. So care must be taken. */
1475 int nesting_level = 1;
1476
1477 int a = scm_getc (port);
1478
1479 if (a == EOF)
1480 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1481 "unterminated `#| ... |#' comment", SCM_EOL);
1482
1483 while (nesting_level > 0)
1484 {
1485 int b = scm_getc (port);
1486
1487 if (b == EOF)
1488 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1489 "unterminated `#| ... |#' comment", SCM_EOL);
1490
1491 if (a == '|' && b == '#')
1492 {
1493 nesting_level--;
1494 b = EOF;
1495 }
1496 else if (a == '#' && b == '|')
1497 {
1498 nesting_level++;
1499 b = EOF;
1500 }
1501
1502 a = b;
1503 }
1504
1505 return SCM_UNSPECIFIED;
1506 }
1507
1508 static SCM
1509 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1510 scm_t_read_opts *opts)
1511 {
1512 scm_t_wchar c;
1513
1514 c = flush_ws (port, opts, (char *) NULL);
1515 if (EOF == c)
1516 scm_i_input_error ("read_commented_expression", port,
1517 "no expression after #; comment", SCM_EOL);
1518 scm_ungetc (c, port);
1519 scm_read_expression (port, opts);
1520 return SCM_UNSPECIFIED;
1521 }
1522
1523 static SCM
1524 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1525 {
1526 /* Guile's extended symbol read syntax looks like this:
1527
1528 #{This is all a symbol name}#
1529
1530 So here, CHR is expected to be `{'. */
1531 int saw_brace = 0;
1532 size_t len = 0;
1533 SCM buf = scm_i_make_string (1024, NULL, 0);
1534
1535 buf = scm_i_string_start_writing (buf);
1536
1537 while ((chr = scm_getc (port)) != EOF)
1538 {
1539 if (saw_brace)
1540 {
1541 if (chr == '#')
1542 {
1543 break;
1544 }
1545 else
1546 {
1547 saw_brace = 0;
1548 scm_i_string_set_x (buf, len++, '}');
1549 }
1550 }
1551
1552 if (chr == '}')
1553 saw_brace = 1;
1554 else if (chr == '\\')
1555 {
1556 /* It used to be that print.c would print extended-read-syntax
1557 symbols with backslashes before "non-standard" chars, but
1558 this routine wouldn't do anything with those escapes.
1559 Bummer. What we've done is to change print.c to output
1560 R6RS hex escapes for those characters, relying on the fact
1561 that the extended read syntax would never put a `\' before
1562 an `x'. For now, we just ignore other instances of
1563 backslash in the string. */
1564 switch ((chr = scm_getc (port)))
1565 {
1566 case EOF:
1567 goto done;
1568 case 'x':
1569 {
1570 scm_t_wchar c;
1571
1572 SCM_READ_HEX_ESCAPE (10, ';');
1573 scm_i_string_set_x (buf, len++, c);
1574 break;
1575
1576 str_eof:
1577 chr = EOF;
1578 goto done;
1579
1580 bad_escaped:
1581 scm_i_string_stop_writing ();
1582 scm_i_input_error ("scm_read_extended_symbol", port,
1583 "illegal character in escape sequence: ~S",
1584 scm_list_1 (SCM_MAKE_CHAR (c)));
1585 break;
1586 }
1587 default:
1588 scm_i_string_set_x (buf, len++, chr);
1589 break;
1590 }
1591 }
1592 else
1593 scm_i_string_set_x (buf, len++, chr);
1594
1595 if (len >= scm_i_string_length (buf) - 2)
1596 {
1597 SCM addy;
1598
1599 scm_i_string_stop_writing ();
1600 addy = scm_i_make_string (1024, NULL, 0);
1601 buf = scm_string_append (scm_list_2 (buf, addy));
1602 len = 0;
1603 buf = scm_i_string_start_writing (buf);
1604 }
1605 }
1606
1607 done:
1608 scm_i_string_stop_writing ();
1609 if (chr == EOF)
1610 scm_i_input_error ("scm_read_extended_symbol", port,
1611 "end of file while reading symbol", SCM_EOL);
1612
1613 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1614 }
1615
1616
1617 \f
1618 /* Top-level token readers, i.e., dispatchers. */
1619
1620 static SCM
1621 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1622 {
1623 SCM proc;
1624
1625 proc = scm_get_hash_procedure (chr);
1626 if (scm_is_true (scm_procedure_p (proc)))
1627 {
1628 long line = SCM_LINUM (port);
1629 int column = SCM_COL (port) - 2;
1630 SCM got;
1631
1632 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1633
1634 if (opts->record_positions_p && SCM_NIMP (got)
1635 && !scm_i_has_source_properties (got))
1636 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1637
1638 return got;
1639 }
1640
1641 return SCM_UNSPECIFIED;
1642 }
1643
1644 /* The reader for the sharp `#' character. It basically dispatches reads
1645 among the above token readers. */
1646 static SCM
1647 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1648 long line, int column)
1649 #define FUNC_NAME "scm_lreadr"
1650 {
1651 SCM result;
1652
1653 chr = scm_getc (port);
1654
1655 result = scm_read_sharp_extension (chr, port, opts);
1656 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1657 return result;
1658
1659 switch (chr)
1660 {
1661 case '\\':
1662 return (scm_read_character (chr, port, opts));
1663 case '(':
1664 return (scm_read_vector (chr, port, opts, line, column));
1665 case 's':
1666 case 'u':
1667 case 'f':
1668 case 'c':
1669 /* This one may return either a boolean or an SRFI-4 vector. */
1670 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1671 case 'v':
1672 return (scm_read_bytevector (chr, port, opts, line, column));
1673 case '*':
1674 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1675 case 't':
1676 case 'T':
1677 case 'F':
1678 return (scm_read_boolean (chr, port));
1679 case ':':
1680 return (scm_read_keyword (chr, port, opts));
1681 case '0': case '1': case '2': case '3': case '4':
1682 case '5': case '6': case '7': case '8': case '9':
1683 case '@':
1684 #if SCM_ENABLE_DEPRECATED
1685 /* See below for 'i' and 'e'. */
1686 case 'a':
1687 case 'y':
1688 case 'h':
1689 case 'l':
1690 #endif
1691 return (scm_read_array (chr, port, opts, line, column));
1692
1693 case 'i':
1694 case 'e':
1695 #if SCM_ENABLE_DEPRECATED
1696 {
1697 /* When next char is '(', it really is an old-style
1698 uniform array. */
1699 scm_t_wchar next_c = scm_getc (port);
1700 if (next_c != EOF)
1701 scm_ungetc (next_c, port);
1702 if (next_c == '(')
1703 return scm_read_array (chr, port, opts, line, column);
1704 /* Fall through. */
1705 }
1706 #endif
1707 case 'b':
1708 case 'B':
1709 case 'o':
1710 case 'O':
1711 case 'd':
1712 case 'D':
1713 case 'x':
1714 case 'X':
1715 case 'I':
1716 case 'E':
1717 return (scm_read_number_and_radix (chr, port, opts));
1718 case '{':
1719 return (scm_read_extended_symbol (chr, port));
1720 case '!':
1721 return (scm_read_shebang (chr, port, opts));
1722 case ';':
1723 return (scm_read_commented_expression (chr, port, opts));
1724 case '`':
1725 case '\'':
1726 case ',':
1727 return (scm_read_syntax (chr, port, opts));
1728 case 'n':
1729 return (scm_read_nil (chr, port, opts));
1730 default:
1731 result = scm_read_sharp_extension (chr, port, opts);
1732 if (scm_is_eq (result, SCM_UNSPECIFIED))
1733 {
1734 /* To remain compatible with 1.8 and earlier, the following
1735 characters have lower precedence than `read-hash-extend'
1736 characters. */
1737 switch (chr)
1738 {
1739 case '|':
1740 return scm_read_r6rs_block_comment (chr, port);
1741 default:
1742 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1743 scm_list_1 (SCM_MAKE_CHAR (chr)));
1744 }
1745 }
1746 else
1747 return result;
1748 }
1749
1750 return SCM_UNSPECIFIED;
1751 }
1752 #undef FUNC_NAME
1753
1754 static SCM
1755 read_inner_expression (SCM port, scm_t_read_opts *opts)
1756 #define FUNC_NAME "read_inner_expression"
1757 {
1758 while (1)
1759 {
1760 scm_t_wchar chr;
1761
1762 chr = scm_getc (port);
1763
1764 switch (chr)
1765 {
1766 case SCM_WHITE_SPACES:
1767 case SCM_LINE_INCREMENTORS:
1768 break;
1769 case ';':
1770 (void) scm_read_semicolon_comment (chr, port);
1771 break;
1772 case '{':
1773 if (opts->curly_infix_p)
1774 {
1775 if (opts->neoteric_p)
1776 return scm_read_sexp (chr, port, opts);
1777 else
1778 {
1779 SCM expr;
1780
1781 /* Enable neoteric expressions within curly braces */
1782 opts->neoteric_p = 1;
1783 expr = scm_read_sexp (chr, port, opts);
1784 opts->neoteric_p = 0;
1785 return expr;
1786 }
1787 }
1788 else
1789 return scm_read_mixed_case_symbol (chr, port, opts);
1790 case '[':
1791 if (opts->square_brackets_p)
1792 return scm_read_sexp (chr, port, opts);
1793 else if (opts->curly_infix_p)
1794 {
1795 /* The syntax of neoteric expressions requires that '[' be
1796 a delimiter when curly-infix is enabled, so it cannot
1797 be part of an unescaped symbol. We might as well do
1798 something useful with it, so we adopt Kawa's convention:
1799 [...] => ($bracket-list$ ...) */
1800 long line = SCM_LINUM (port);
1801 int column = SCM_COL (port) - 1;
1802 return maybe_annotate_source
1803 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1804 port, opts, line, column);
1805 }
1806 else
1807 return scm_read_mixed_case_symbol (chr, port, opts);
1808 case '(':
1809 return (scm_read_sexp (chr, port, opts));
1810 case '"':
1811 return (scm_read_string (chr, port, opts));
1812 case '|':
1813 if (opts->r7rs_symbols_p)
1814 return scm_read_r7rs_symbol (chr, port, opts);
1815 else
1816 return scm_read_mixed_case_symbol (chr, port, opts);
1817 case '\'':
1818 case '`':
1819 case ',':
1820 return (scm_read_quote (chr, port, opts));
1821 case '#':
1822 {
1823 long line = SCM_LINUM (port);
1824 int column = SCM_COL (port) - 1;
1825 SCM result = scm_read_sharp (chr, port, opts, line, column);
1826 if (scm_is_eq (result, SCM_UNSPECIFIED))
1827 /* We read a comment or some such. */
1828 break;
1829 else
1830 return result;
1831 }
1832 case ')':
1833 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1834 break;
1835 case '}':
1836 if (opts->curly_infix_p)
1837 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1838 else
1839 return scm_read_mixed_case_symbol (chr, port, opts);
1840 case ']':
1841 if (opts->square_brackets_p)
1842 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1843 /* otherwise fall through */
1844 case EOF:
1845 return SCM_EOF_VAL;
1846 case ':':
1847 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1848 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1849 /* Fall through. */
1850
1851 default:
1852 {
1853 if (((chr >= '0') && (chr <= '9'))
1854 || (strchr ("+-.", chr)))
1855 return (scm_read_number (chr, port, opts));
1856 else
1857 return (scm_read_mixed_case_symbol (chr, port, opts));
1858 }
1859 }
1860 }
1861 }
1862 #undef FUNC_NAME
1863
1864 static SCM
1865 scm_read_expression (SCM port, scm_t_read_opts *opts)
1866 #define FUNC_NAME "scm_read_expression"
1867 {
1868 if (!opts->neoteric_p)
1869 return read_inner_expression (port, opts);
1870 else
1871 {
1872 long line = 0;
1873 int column = 0;
1874 SCM expr;
1875
1876 if (opts->record_positions_p)
1877 {
1878 /* We need to get the position of the first non-whitespace
1879 character in order to correctly annotate neoteric
1880 expressions. For example, for the expression 'f(x)', the
1881 first call to 'read_inner_expression' reads the 'f' (which
1882 cannot be annotated), and then we later read the '(x)' and
1883 use it to construct the new list (f x). */
1884 int c = flush_ws (port, opts, (char *) NULL);
1885 if (c == EOF)
1886 return SCM_EOF_VAL;
1887 scm_ungetc (c, port);
1888 line = SCM_LINUM (port);
1889 column = SCM_COL (port);
1890 }
1891
1892 expr = read_inner_expression (port, opts);
1893
1894 /* 'expr' is the first component of the neoteric expression. Now
1895 we loop, and as long as the next character is '(', '[', or '{',
1896 (without any intervening whitespace), we use it to construct a
1897 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1898 for (;;)
1899 {
1900 int chr = scm_getc (port);
1901
1902 if (chr == '(')
1903 /* e(...) => (e ...) */
1904 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1905 else if (chr == '[')
1906 /* e[...] => ($bracket-apply$ e ...) */
1907 expr = scm_cons (sym_bracket_apply,
1908 scm_cons (expr,
1909 scm_read_sexp (chr, port, opts)));
1910 else if (chr == '{')
1911 {
1912 SCM arg = scm_read_sexp (chr, port, opts);
1913
1914 if (scm_is_null (arg))
1915 expr = scm_list_1 (expr); /* e{} => (e) */
1916 else
1917 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1918 }
1919 else
1920 {
1921 if (chr != EOF)
1922 scm_ungetc (chr, port);
1923 break;
1924 }
1925 maybe_annotate_source (expr, port, opts, line, column);
1926 }
1927 return expr;
1928 }
1929 }
1930 #undef FUNC_NAME
1931
1932 \f
1933 /* Actual reader. */
1934
1935 static void init_read_options (SCM port, scm_t_read_opts *opts);
1936
1937 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1938 (SCM port),
1939 "Read an s-expression from the input port @var{port}, or from\n"
1940 "the current input port if @var{port} is not specified.\n"
1941 "Any whitespace before the next token is discarded.")
1942 #define FUNC_NAME s_scm_read
1943 {
1944 scm_t_read_opts opts;
1945 int c;
1946
1947 if (SCM_UNBNDP (port))
1948 port = scm_current_input_port ();
1949 SCM_VALIDATE_OPINPORT (1, port);
1950
1951 init_read_options (port, &opts);
1952
1953 c = flush_ws (port, &opts, (char *) NULL);
1954 if (EOF == c)
1955 return SCM_EOF_VAL;
1956 scm_ungetc (c, port);
1957
1958 return (scm_read_expression (port, &opts));
1959 }
1960 #undef FUNC_NAME
1961
1962
1963 \f
1964
1965 /* Manipulate the read-hash-procedures alist. This could be written in
1966 Scheme, but maybe it will also be used by C code during initialisation. */
1967 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1968 (SCM chr, SCM proc),
1969 "Install the procedure @var{proc} for reading expressions\n"
1970 "starting with the character sequence @code{#} and @var{chr}.\n"
1971 "@var{proc} will be called with two arguments: the character\n"
1972 "@var{chr} and the port to read further data from. The object\n"
1973 "returned will be the return value of @code{read}. \n"
1974 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1975 )
1976 #define FUNC_NAME s_scm_read_hash_extend
1977 {
1978 SCM this;
1979 SCM prev;
1980
1981 SCM_VALIDATE_CHAR (1, chr);
1982 SCM_ASSERT (scm_is_false (proc)
1983 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1984 proc, SCM_ARG2, FUNC_NAME);
1985
1986 /* Check if chr is already in the alist. */
1987 this = scm_i_read_hash_procedures_ref ();
1988 prev = SCM_BOOL_F;
1989 while (1)
1990 {
1991 if (scm_is_null (this))
1992 {
1993 /* not found, so add it to the beginning. */
1994 if (scm_is_true (proc))
1995 {
1996 SCM new = scm_cons (scm_cons (chr, proc),
1997 scm_i_read_hash_procedures_ref ());
1998 scm_i_read_hash_procedures_set_x (new);
1999 }
2000 break;
2001 }
2002 if (scm_is_eq (chr, SCM_CAAR (this)))
2003 {
2004 /* already in the alist. */
2005 if (scm_is_false (proc))
2006 {
2007 /* remove it. */
2008 if (scm_is_false (prev))
2009 {
2010 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
2011 scm_i_read_hash_procedures_set_x (rest);
2012 }
2013 else
2014 scm_set_cdr_x (prev, SCM_CDR (this));
2015 }
2016 else
2017 {
2018 /* replace it. */
2019 scm_set_cdr_x (SCM_CAR (this), proc);
2020 }
2021 break;
2022 }
2023 prev = this;
2024 this = SCM_CDR (this);
2025 }
2026
2027 return SCM_UNSPECIFIED;
2028 }
2029 #undef FUNC_NAME
2030
2031 /* Recover the read-hash procedure corresponding to char c. */
2032 static SCM
2033 scm_get_hash_procedure (int c)
2034 {
2035 SCM rest = scm_i_read_hash_procedures_ref ();
2036
2037 while (1)
2038 {
2039 if (scm_is_null (rest))
2040 return SCM_BOOL_F;
2041
2042 if (SCM_CHAR (SCM_CAAR (rest)) == c)
2043 return SCM_CDAR (rest);
2044
2045 rest = SCM_CDR (rest);
2046 }
2047 }
2048
2049 #define SCM_ENCODING_SEARCH_SIZE (500)
2050
2051 /* Search the first few hundred characters of a file for an Emacs-like coding
2052 declaration. Returns either NULL or a string whose storage has been
2053 allocated with `scm_gc_malloc ()'. */
2054 char *
2055 scm_i_scan_for_encoding (SCM port)
2056 {
2057 scm_t_port *pt;
2058 char header[SCM_ENCODING_SEARCH_SIZE+1];
2059 size_t bytes_read, encoding_length, i;
2060 char *encoding = NULL;
2061 char *pos, *encoding_start;
2062 int in_comment;
2063
2064 pt = SCM_PTAB_ENTRY (port);
2065
2066 if (pt->rw_active == SCM_PORT_WRITE)
2067 scm_flush (port);
2068
2069 if (pt->rw_random)
2070 pt->rw_active = SCM_PORT_READ;
2071
2072 if (pt->read_pos == pt->read_end)
2073 {
2074 /* We can use the read buffer, and thus avoid a seek. */
2075 if (scm_fill_input (port) == EOF)
2076 return NULL;
2077
2078 bytes_read = pt->read_end - pt->read_pos;
2079 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2080 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2081
2082 if (bytes_read <= 1)
2083 /* An unbuffered port -- don't scan. */
2084 return NULL;
2085
2086 memcpy (header, pt->read_pos, bytes_read);
2087 header[bytes_read] = '\0';
2088 }
2089 else
2090 {
2091 /* Try to read some bytes and then seek back. Not all ports
2092 support seeking back; and indeed some file ports (like
2093 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2094 check performed by SCM_FPORT_FDES---but fail to seek
2095 backwards. Hence this block comes second. We prefer to use
2096 the read buffer in-place. */
2097 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2098 return NULL;
2099
2100 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2101 header[bytes_read] = '\0';
2102 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2103 }
2104
2105 /* search past "coding[:=]" */
2106 pos = header;
2107 while (1)
2108 {
2109 if ((pos = strstr(pos, "coding")) == NULL)
2110 return NULL;
2111
2112 pos += strlen("coding");
2113 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2114 (*pos == ':' || *pos == '='))
2115 {
2116 pos ++;
2117 break;
2118 }
2119 }
2120
2121 /* skip spaces */
2122 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2123 (*pos == ' ' || *pos == '\t'))
2124 pos ++;
2125
2126 /* grab the next token */
2127 encoding_start = pos;
2128 i = 0;
2129 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2130 && encoding_start + i - header < bytes_read
2131 && (isalnum ((int) encoding_start[i])
2132 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2133 i++;
2134
2135 encoding_length = i;
2136 if (encoding_length == 0)
2137 return NULL;
2138
2139 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2140 for (i = 0; i < encoding_length; i++)
2141 encoding[i] = toupper ((int) encoding[i]);
2142
2143 /* push backwards to make sure we were in a comment */
2144 in_comment = 0;
2145 pos = encoding_start;
2146 while (pos >= header)
2147 {
2148 if (*pos == ';')
2149 {
2150 in_comment = 1;
2151 break;
2152 }
2153 else if (*pos == '\n' || pos == header)
2154 {
2155 /* This wasn't in a semicolon comment. Check for a
2156 hash-bang comment. */
2157 char *beg = strstr (header, "#!");
2158 char *end = strstr (header, "!#");
2159 if (beg < encoding_start && encoding_start + encoding_length <= end)
2160 in_comment = 1;
2161 break;
2162 }
2163 else
2164 {
2165 pos --;
2166 continue;
2167 }
2168 }
2169 if (!in_comment)
2170 /* This wasn't in a comment */
2171 return NULL;
2172
2173 return encoding;
2174 }
2175
2176 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2177 (SCM port),
2178 "Scans the port for an Emacs-like character coding declaration\n"
2179 "near the top of the contents of a port with random-accessible contents.\n"
2180 "The coding declaration is of the form\n"
2181 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2182 "\n"
2183 "Returns a string containing the character encoding of the file\n"
2184 "if a declaration was found, or @code{#f} otherwise.\n")
2185 #define FUNC_NAME s_scm_file_encoding
2186 {
2187 char *enc;
2188 SCM s_enc;
2189
2190 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2191
2192 enc = scm_i_scan_for_encoding (port);
2193 if (enc == NULL)
2194 return SCM_BOOL_F;
2195 else
2196 {
2197 s_enc = scm_from_locale_string (enc);
2198 return s_enc;
2199 }
2200
2201 return SCM_BOOL_F;
2202 }
2203 #undef FUNC_NAME
2204
2205 \f
2206 /* Per-port read options.
2207
2208 We store per-port read options in the 'port-read-options' port
2209 property, which is stored in the internal port structure. The value
2210 stored is a single integer that contains a two-bit field for each
2211 read option.
2212
2213 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2214 the applicable value should be inherited from the corresponding
2215 global read option. Otherwise, the bit field contains the value of
2216 the read option. For boolean read options that have been set
2217 per-port, the possible values are 0 or 1. If the 'keyword_style'
2218 read option has been set per-port, its possible values are those in
2219 'enum t_keyword_style'. */
2220
2221 /* Key to read options in port properties. */
2222 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2223
2224 /* Offsets of bit fields for each per-port override */
2225 #define READ_OPTION_COPY_SOURCE_P 0
2226 #define READ_OPTION_RECORD_POSITIONS_P 2
2227 #define READ_OPTION_CASE_INSENSITIVE_P 4
2228 #define READ_OPTION_KEYWORD_STYLE 6
2229 #define READ_OPTION_R6RS_ESCAPES_P 8
2230 #define READ_OPTION_SQUARE_BRACKETS_P 10
2231 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2232 #define READ_OPTION_CURLY_INFIX_P 14
2233 #define READ_OPTION_R7RS_SYMBOLS_P 16
2234
2235 /* The total width in bits of the per-port overrides */
2236 #define READ_OPTIONS_NUM_BITS 18
2237
2238 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2239 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2240
2241 #define READ_OPTION_MASK 3
2242 #define READ_OPTION_INHERIT 3
2243
2244 static void
2245 set_port_read_option (SCM port, int option, int new_value)
2246 {
2247 SCM scm_read_options;
2248 unsigned int read_options;
2249
2250 new_value &= READ_OPTION_MASK;
2251 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2252 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2253 read_options = scm_to_uint (scm_read_options);
2254 else
2255 read_options = READ_OPTIONS_INHERIT_ALL;
2256 read_options &= ~(READ_OPTION_MASK << option);
2257 read_options |= new_value << option;
2258 scm_read_options = scm_from_uint (read_options);
2259 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2260 }
2261
2262 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2263 static void
2264 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2265 {
2266 value = !!value;
2267 opts->case_insensitive_p = value;
2268 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2269 }
2270
2271 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2272 static void
2273 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2274 {
2275 value = !!value;
2276 opts->square_brackets_p = value;
2277 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2278 }
2279
2280 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2281 static void
2282 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2283 {
2284 value = !!value;
2285 opts->curly_infix_p = value;
2286 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2287 }
2288
2289 /* Initialize OPTS based on PORT's read options and the global read
2290 options. */
2291 static void
2292 init_read_options (SCM port, scm_t_read_opts *opts)
2293 {
2294 SCM val, scm_read_options;
2295 unsigned int read_options, x;
2296
2297 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2298
2299 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2300 read_options = scm_to_uint (scm_read_options);
2301 else
2302 read_options = READ_OPTIONS_INHERIT_ALL;
2303
2304 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2305 if (x == READ_OPTION_INHERIT)
2306 {
2307 val = SCM_PACK (SCM_KEYWORD_STYLE);
2308 if (scm_is_eq (val, scm_keyword_prefix))
2309 x = KEYWORD_STYLE_PREFIX;
2310 else if (scm_is_eq (val, scm_keyword_postfix))
2311 x = KEYWORD_STYLE_POSTFIX;
2312 else
2313 x = KEYWORD_STYLE_HASH_PREFIX;
2314 }
2315 opts->keyword_style = x;
2316
2317 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2318 do \
2319 { \
2320 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2321 if (x == READ_OPTION_INHERIT) \
2322 x = !!SCM_ ## NAME; \
2323 opts->name = x; \
2324 } \
2325 while (0)
2326
2327 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2328 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2329 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2330 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2331 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2332 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2333 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
2334 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
2335
2336 #undef RESOLVE_BOOLEAN_OPTION
2337
2338 opts->neoteric_p = 0;
2339 }
2340
2341 void
2342 scm_init_read ()
2343 {
2344 SCM read_hash_procs;
2345
2346 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2347
2348 scm_i_read_hash_procedures =
2349 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2350
2351 scm_init_opts (scm_read_options, scm_read_opts);
2352 #include "libguile/read.x"
2353 }
2354
2355 /*
2356 Local Variables:
2357 c-file-style: "gnu"
2358 End:
2359 */