Simplify the interpreter for trivial inits and no letrec
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <unicase.h>
31 #include <unictype.h>
32 #include <c-strcase.h>
33 #include <c-ctype.h>
34
35 #include "libguile/_scm.h"
36 #include "libguile/bytevectors.h"
37 #include "libguile/chars.h"
38 #include "libguile/eval.h"
39 #include "libguile/arrays.h"
40 #include "libguile/bitvectors.h"
41 #include "libguile/keywords.h"
42 #include "libguile/alist.h"
43 #include "libguile/srcprop.h"
44 #include "libguile/hashtab.h"
45 #include "libguile/hash.h"
46 #include "libguile/ports.h"
47 #include "libguile/ports-internal.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
56
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
59
60
61 \f
62
63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
66 SCM_SYMBOL (sym_nil, "nil");
67
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
72
73 scm_t_option scm_read_opts[] =
74 {
75 { SCM_OPTION_BOOLEAN, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
93 { 0, },
94 };
95
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
99
100 enum t_keyword_style
101 {
102 KEYWORD_STYLE_HASH_PREFIX,
103 KEYWORD_STYLE_PREFIX,
104 KEYWORD_STYLE_POSTFIX
105 };
106
107 struct t_read_opts
108 {
109 enum t_keyword_style keyword_style;
110 unsigned int copy_source_p : 1;
111 unsigned int record_positions_p : 1;
112 unsigned int case_insensitive_p : 1;
113 unsigned int r6rs_escapes_p : 1;
114 unsigned int square_brackets_p : 1;
115 unsigned int hungry_eol_escapes_p : 1;
116 unsigned int curly_infix_p : 1;
117 unsigned int neoteric_p : 1;
118 unsigned int r7rs_symbols_p : 1;
119 };
120
121 typedef struct t_read_opts scm_t_read_opts;
122
123
124 /*
125 Give meaningful error messages for errors
126
127 We use the format
128
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
131
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
134
135 */
136
137
138 void
139 scm_i_input_error (char const *function,
140 SCM port, const char *message, SCM arg)
141 {
142 SCM fn = (scm_is_string (SCM_FILENAME(port))
143 ? SCM_FILENAME(port)
144 : scm_from_locale_string ("#<unknown port>"));
145
146 SCM string_port = scm_open_output_string ();
147 SCM string = SCM_EOL;
148 scm_simple_format (string_port,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
150 scm_list_4 (fn,
151 scm_from_long (SCM_LINUM (port) + 1),
152 scm_from_int (SCM_COL (port) + 1),
153 scm_from_locale_string (message)));
154
155 string = scm_get_output_string (string_port);
156 scm_close_output_port (string_port);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function? scm_from_locale_string (function) : SCM_BOOL_F,
159 string,
160 arg,
161 SCM_BOOL_F);
162 }
163
164
165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
166 (SCM setting),
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
171 {
172 SCM ans = scm_options (setting,
173 scm_read_opts,
174 FUNC_NAME);
175 if (SCM_COPY_SOURCE_P)
176 SCM_RECORD_POSITIONS_P = 1;
177 return ans;
178 }
179 #undef FUNC_NAME
180
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM *scm_i_read_hash_procedures;
184
185 static SCM
186 scm_i_read_hash_procedures_ref (void)
187 {
188 return scm_fluid_ref (*scm_i_read_hash_procedures);
189 }
190
191 static void
192 scm_i_read_hash_procedures_set_x (SCM value)
193 {
194 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
195 }
196
197 \f
198 /* Token readers. */
199
200
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
203
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
206
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
209
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
212
213
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
218
219 #ifdef MSDOS
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
222 #else
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
224 #endif
225
226
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
228 structure''). */
229 #define CHAR_IS_R5RS_DELIMITER(c) \
230 (CHAR_IS_BLANK (c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
232
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
238
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
240 Structure''. */
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
244
245 /* Read an SCSH block comment. */
246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
250 static SCM scm_get_hash_procedure (int);
251
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
256 static int
257 read_token (SCM port, scm_t_read_opts *opts,
258 char *buf, size_t buf_size, size_t *read)
259 {
260 *read = 0;
261
262 while (*read < buf_size)
263 {
264 int chr;
265
266 chr = scm_get_byte_or_eof_unlocked (port);
267
268 if (chr == EOF)
269 return 0;
270 else if (CHAR_IS_DELIMITER (chr))
271 {
272 scm_unget_byte_unlocked (chr, port);
273 return 0;
274 }
275 else
276 {
277 *buf = (char) chr;
278 buf++, (*read)++;
279 }
280 }
281
282 return 1;
283 }
284
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
287 static char *
288 read_complete_token (SCM port, scm_t_read_opts *opts,
289 char *buffer, size_t buffer_size, size_t *read)
290 {
291 int overflow = 0;
292 size_t bytes_read, overflow_size = 0;
293 char *overflow_buffer = NULL;
294
295 do
296 {
297 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
298 if (bytes_read == 0)
299 break;
300 if (overflow || overflow_size != 0)
301 {
302 if (overflow_size == 0)
303 {
304 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
305 memcpy (overflow_buffer, buffer, bytes_read);
306 overflow_size = bytes_read;
307 }
308 else
309 {
310 char *new_buf =
311 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
312
313 memcpy (new_buf, overflow_buffer, overflow_size);
314 memcpy (new_buf + overflow_size, buffer, bytes_read);
315
316 overflow_buffer = new_buf;
317 overflow_size += bytes_read;
318 }
319 }
320 }
321 while (overflow);
322
323 if (overflow_size)
324 *read = overflow_size;
325 else
326 *read = bytes_read;
327
328 return (overflow_size > 0 ? overflow_buffer : buffer);
329 }
330
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
333 static int
334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
335 {
336 scm_t_wchar c;
337 while (1)
338 switch (c = scm_getc_unlocked (port))
339 {
340 case EOF:
341 goteof:
342 if (eoferr)
343 {
344 scm_i_input_error (eoferr,
345 port,
346 "end of file",
347 SCM_EOL);
348 }
349 return c;
350
351 case ';':
352 lp:
353 switch (c = scm_getc_unlocked (port))
354 {
355 case EOF:
356 goto goteof;
357 default:
358 goto lp;
359 case SCM_LINE_INCREMENTORS:
360 break;
361 }
362 break;
363
364 case '#':
365 switch (c = scm_getc_unlocked (port))
366 {
367 case EOF:
368 eoferr = "read_sharp";
369 goto goteof;
370 case '!':
371 scm_read_shebang (c, port, opts);
372 break;
373 case ';':
374 scm_read_commented_expression (c, port, opts);
375 break;
376 case '|':
377 if (scm_is_false (scm_get_hash_procedure (c)))
378 {
379 scm_read_r6rs_block_comment (c, port);
380 break;
381 }
382 /* fall through */
383 default:
384 scm_ungetc_unlocked (c, port);
385 return '#';
386 }
387 break;
388
389 case SCM_LINE_INCREMENTORS:
390 case SCM_SINGLE_SPACES:
391 case '\t':
392 break;
393
394 default:
395 return c;
396 }
397
398 return 0;
399 }
400
401
402 \f
403 /* Token readers. */
404
405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
407 long line, int column);
408
409
410 static SCM
411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
412 long line, int column)
413 {
414 /* This condition can be caused by a user calling
415 set-port-column!. */
416 if (line < 0 || column < 0)
417 return x;
418
419 if (opts->record_positions_p)
420 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
421 return x;
422 }
423
424 static SCM
425 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
426 #define FUNC_NAME "scm_i_lreadparen"
427 {
428 int c;
429 SCM tmp, tl, ans = SCM_EOL;
430 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
431 const int terminating_char = ((chr == '{') ? '}'
432 : ((chr == '[') ? ']'
433 : ')'));
434
435 /* Need to capture line and column numbers here. */
436 long line = SCM_LINUM (port);
437 int column = SCM_COL (port) - 1;
438
439 c = flush_ws (port, opts, FUNC_NAME);
440 if (terminating_char == c)
441 return SCM_EOL;
442
443 scm_ungetc_unlocked (c, port);
444 tmp = scm_read_expression (port, opts);
445
446 /* Note that it is possible for scm_read_expression to return
447 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
448 check that it's a real dot by checking `c'. */
449 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
450 {
451 ans = scm_read_expression (port, opts);
452 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
453 scm_i_input_error (FUNC_NAME, port, "missing close paren",
454 SCM_EOL);
455 return ans;
456 }
457
458 /* Build the head of the list structure. */
459 ans = tl = scm_cons (tmp, SCM_EOL);
460
461 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
462 {
463 SCM new_tail;
464
465 if (c == ')' || (c == ']' && opts->square_brackets_p)
466 || ((c == '}' || c == ']') && opts->curly_infix_p))
467 scm_i_input_error (FUNC_NAME, port,
468 "in pair: mismatched close paren: ~A",
469 scm_list_1 (SCM_MAKE_CHAR (c)));
470
471 scm_ungetc_unlocked (c, port);
472 tmp = scm_read_expression (port, opts);
473
474 /* See above note about scm_sym_dot. */
475 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
476 {
477 SCM_SETCDR (tl, scm_read_expression (port, opts));
478
479 c = flush_ws (port, opts, FUNC_NAME);
480 if (terminating_char != c)
481 scm_i_input_error (FUNC_NAME, port,
482 "in pair: missing close paren", SCM_EOL);
483 break;
484 }
485
486 new_tail = scm_cons (tmp, SCM_EOL);
487 SCM_SETCDR (tl, new_tail);
488 tl = new_tail;
489 }
490
491 if (curly_list_p)
492 {
493 /* In addition to finding the length, 'scm_ilength' checks for
494 improper or circular lists, in which case it returns -1. */
495 int len = scm_ilength (ans);
496
497 /* The (len == 0) case is handled above */
498 if (len == 1)
499 /* Return directly to avoid re-annotating the element's source
500 location with the position of the outer brace. Also, it
501 might not be possible to annotate the element. */
502 return scm_car (ans); /* {e} => e */
503 else if (len == 2)
504 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
505 else if (len >= 3 && (len & 1))
506 {
507 /* It's a proper list whose length is odd and at least 3. If
508 the elements at odd indices (the infix operator positions)
509 are all 'equal?', then it's a simple curly-infix list.
510 Otherwise it's a mixed curly-infix list. */
511 SCM op = scm_cadr (ans);
512
513 /* Check to see if the elements at odd indices are 'equal?' */
514 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
515 {
516 if (scm_is_null (tl))
517 {
518 /* Convert simple curly-infix list to prefix:
519 {a <op> b <op> ...} => (<op> a b ...) */
520 tl = ans;
521 while (scm_is_pair (scm_cdr (tl)))
522 {
523 tmp = scm_cddr (tl);
524 SCM_SETCDR (tl, tmp);
525 tl = tmp;
526 }
527 ans = scm_cons (op, ans);
528 break;
529 }
530 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
531 {
532 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
533 ans = scm_cons (sym_nfx, ans);
534 break;
535 }
536 }
537 }
538 else
539 /* Mixed curly-infix (possibly improper) list:
540 {e . tail} => ($nfx$ e . tail) */
541 ans = scm_cons (sym_nfx, ans);
542 }
543
544 return maybe_annotate_source (ans, port, opts, line, column);
545 }
546 #undef FUNC_NAME
547
548
549 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
550 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
551 found. */
552 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
553 do \
554 { \
555 scm_t_wchar a; \
556 size_t i = 0; \
557 c = 0; \
558 while (i < ndigits) \
559 { \
560 a = scm_getc_unlocked (port); \
561 if (a == EOF) \
562 goto str_eof; \
563 if (terminator \
564 && (a == (scm_t_wchar) terminator) \
565 && (i > 0)) \
566 break; \
567 if ('0' <= a && a <= '9') \
568 a -= '0'; \
569 else if ('A' <= a && a <= 'F') \
570 a = a - 'A' + 10; \
571 else if ('a' <= a && a <= 'f') \
572 a = a - 'a' + 10; \
573 else \
574 { \
575 c = a; \
576 goto bad_escaped; \
577 } \
578 c = c * 16 + a; \
579 i ++; \
580 } \
581 } while (0)
582
583 static void
584 skip_intraline_whitespace (SCM port)
585 {
586 scm_t_wchar c;
587
588 do
589 {
590 c = scm_getc_unlocked (port);
591 if (c == EOF)
592 return;
593 }
594 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
595
596 scm_ungetc_unlocked (c, port);
597 }
598
599 /* Read either a double-quoted string or an R7RS-style symbol delimited
600 by vertical lines, depending on the value of 'chr' ('"' or '|').
601 Regardless, the result is always returned as a string. */
602 static SCM
603 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
604 #define FUNC_NAME "scm_lreadr"
605 {
606 /* For strings smaller than C_STR, this function creates only one Scheme
607 object (the string returned). */
608
609 SCM str = SCM_EOL;
610 size_t c_str_len = 0;
611 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
612
613 /* Need to capture line and column numbers here. */
614 long line = SCM_LINUM (port);
615 int column = SCM_COL (port) - 1;
616
617 while (chr != (c = scm_getc_unlocked (port)))
618 {
619 if (c == EOF)
620 {
621 str_eof:
622 scm_i_input_error (FUNC_NAME, port,
623 (chr == '|'
624 ? "end of file in symbol"
625 : "end of file in string constant"),
626 SCM_EOL);
627 }
628
629 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
630 {
631 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
632 c_str_len = 0;
633 }
634
635 if (c == '\\')
636 {
637 switch (c = scm_getc_unlocked (port))
638 {
639 case EOF:
640 goto str_eof;
641 case '|':
642 case '\\':
643 break;
644 case '\n':
645 if (opts->hungry_eol_escapes_p)
646 skip_intraline_whitespace (port);
647 continue;
648 case '0':
649 c = '\0';
650 break;
651 case 'f':
652 c = '\f';
653 break;
654 case 'n':
655 c = '\n';
656 break;
657 case 'r':
658 c = '\r';
659 break;
660 case 't':
661 c = '\t';
662 break;
663 case 'a':
664 c = '\007';
665 break;
666 case 'v':
667 c = '\v';
668 break;
669 case 'b':
670 c = '\010';
671 break;
672 case 'x':
673 if (opts->r6rs_escapes_p || chr == '|')
674 SCM_READ_HEX_ESCAPE (10, ';');
675 else
676 SCM_READ_HEX_ESCAPE (2, '\0');
677 break;
678 case 'u':
679 if (!opts->r6rs_escapes_p)
680 {
681 SCM_READ_HEX_ESCAPE (4, '\0');
682 break;
683 }
684 case 'U':
685 if (!opts->r6rs_escapes_p)
686 {
687 SCM_READ_HEX_ESCAPE (6, '\0');
688 break;
689 }
690 default:
691 if (c == chr)
692 break;
693 bad_escaped:
694 scm_i_input_error (FUNC_NAME, port,
695 "illegal character in escape sequence: ~S",
696 scm_list_1 (SCM_MAKE_CHAR (c)));
697 }
698 }
699
700 c_str[c_str_len++] = c;
701 }
702
703 if (scm_is_null (str))
704 /* Fast path: we got a string that fits in C_STR. */
705 str = scm_from_utf32_stringn (c_str, c_str_len);
706 else
707 {
708 if (c_str_len > 0)
709 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
710
711 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
712 }
713
714 return maybe_annotate_source (str, port, opts, line, column);
715 }
716 #undef FUNC_NAME
717
718 static SCM
719 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
720 {
721 return scm_read_string_like_syntax (chr, port, opts);
722 }
723
724 static SCM
725 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
726 {
727 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
728 }
729
730 static SCM
731 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
732 {
733 SCM result, str = SCM_EOL;
734 char local_buffer[READER_BUFFER_SIZE], *buffer;
735 size_t bytes_read;
736
737 /* Need to capture line and column numbers here. */
738 long line = SCM_LINUM (port);
739 int column = SCM_COL (port) - 1;
740
741 scm_ungetc_unlocked (chr, port);
742 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
743 &bytes_read);
744
745 str = scm_from_port_stringn (buffer, bytes_read, port);
746
747 result = scm_string_to_number (str, SCM_UNDEFINED);
748 if (scm_is_false (result))
749 {
750 /* Return a symbol instead of a number */
751 if (opts->case_insensitive_p)
752 str = scm_string_downcase_x (str);
753 result = scm_string_to_symbol (str);
754 }
755 else if (SCM_NIMP (result))
756 result = maybe_annotate_source (result, port, opts, line, column);
757
758 SCM_COL (port) += scm_i_string_length (str);
759 return result;
760 }
761
762 static SCM
763 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
764 {
765 SCM result;
766 int ends_with_colon = 0;
767 size_t bytes_read;
768 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
769 char local_buffer[READER_BUFFER_SIZE], *buffer;
770 SCM str;
771
772 scm_ungetc_unlocked (chr, port);
773 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
774 &bytes_read);
775 if (bytes_read > 0)
776 ends_with_colon = buffer[bytes_read - 1] == ':';
777
778 if (postfix && ends_with_colon && (bytes_read > 1))
779 {
780 str = scm_from_port_stringn (buffer, bytes_read - 1, port);
781
782 if (opts->case_insensitive_p)
783 str = scm_string_downcase_x (str);
784 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
785 }
786 else
787 {
788 str = scm_from_port_stringn (buffer, bytes_read, port);
789
790 if (opts->case_insensitive_p)
791 str = scm_string_downcase_x (str);
792 result = scm_string_to_symbol (str);
793 }
794
795 SCM_COL (port) += scm_i_string_length (str);
796 return result;
797 }
798
799 static SCM
800 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
801 #define FUNC_NAME "scm_lreadr"
802 {
803 SCM result;
804 size_t read;
805 char local_buffer[READER_BUFFER_SIZE], *buffer;
806 unsigned int radix;
807 SCM str;
808
809 switch (chr)
810 {
811 case 'B':
812 case 'b':
813 radix = 2;
814 break;
815
816 case 'o':
817 case 'O':
818 radix = 8;
819 break;
820
821 case 'd':
822 case 'D':
823 radix = 10;
824 break;
825
826 case 'x':
827 case 'X':
828 radix = 16;
829 break;
830
831 default:
832 scm_ungetc_unlocked (chr, port);
833 scm_ungetc_unlocked ('#', port);
834 radix = 10;
835 }
836
837 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
838 &read);
839
840 str = scm_from_port_stringn (buffer, read, port);
841
842 result = scm_string_to_number (str, scm_from_uint (radix));
843
844 SCM_COL (port) += scm_i_string_length (str);
845
846 if (scm_is_true (result))
847 return result;
848
849 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
850
851 return SCM_BOOL_F;
852 }
853 #undef FUNC_NAME
854
855 static SCM
856 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
857 {
858 SCM p;
859 long line = SCM_LINUM (port);
860 int column = SCM_COL (port) - 1;
861
862 switch (chr)
863 {
864 case '`':
865 p = scm_sym_quasiquote;
866 break;
867
868 case '\'':
869 p = scm_sym_quote;
870 break;
871
872 case ',':
873 {
874 scm_t_wchar c;
875
876 c = scm_getc_unlocked (port);
877 if ('@' == c)
878 p = scm_sym_uq_splicing;
879 else
880 {
881 scm_ungetc_unlocked (c, port);
882 p = scm_sym_unquote;
883 }
884 break;
885 }
886
887 default:
888 fprintf (stderr, "%s: unhandled quote character (%i)\n",
889 "scm_read_quote", chr);
890 abort ();
891 }
892
893 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
894 return maybe_annotate_source (p, port, opts, line, column);
895 }
896
897 SCM_SYMBOL (sym_syntax, "syntax");
898 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
899 SCM_SYMBOL (sym_unsyntax, "unsyntax");
900 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
901
902 static SCM
903 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
904 {
905 SCM p;
906 long line = SCM_LINUM (port);
907 int column = SCM_COL (port) - 1;
908
909 switch (chr)
910 {
911 case '`':
912 p = sym_quasisyntax;
913 break;
914
915 case '\'':
916 p = sym_syntax;
917 break;
918
919 case ',':
920 {
921 int c;
922
923 c = scm_getc_unlocked (port);
924 if ('@' == c)
925 p = sym_unsyntax_splicing;
926 else
927 {
928 scm_ungetc_unlocked (c, port);
929 p = sym_unsyntax;
930 }
931 break;
932 }
933
934 default:
935 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
936 "scm_read_syntax", chr);
937 abort ();
938 }
939
940 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
941 return maybe_annotate_source (p, port, opts, line, column);
942 }
943
944 static SCM
945 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
946 {
947 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
948
949 if (!scm_is_eq (id, sym_nil))
950 scm_i_input_error ("scm_read_nil", port,
951 "unexpected input while reading #nil: ~a",
952 scm_list_1 (id));
953
954 return SCM_ELISP_NIL;
955 }
956
957 static SCM
958 scm_read_semicolon_comment (int chr, SCM port)
959 {
960 int c;
961
962 /* We use the get_byte here because there is no need to get the
963 locale correct with comment input. This presumes that newline
964 always represents itself no matter what the encoding is. */
965 for (c = scm_get_byte_or_eof_unlocked (port);
966 (c != EOF) && (c != '\n');
967 c = scm_get_byte_or_eof_unlocked (port));
968
969 return SCM_UNSPECIFIED;
970 }
971
972 /* If the EXPECTED_CHARS are the next ones available from PORT, then
973 consume them and return 1. Otherwise leave the port position where
974 it was and return 0. EXPECTED_CHARS should be all lowercase, and
975 will be matched case-insensitively against the characters read from
976 PORT. */
977 static int
978 try_read_ci_chars (SCM port, const char *expected_chars)
979 {
980 int num_chars_wanted = strlen (expected_chars);
981 int num_chars_read = 0;
982 char *chars_read = alloca (num_chars_wanted);
983 int c;
984
985 while (num_chars_read < num_chars_wanted)
986 {
987 c = scm_getc_unlocked (port);
988 if (c == EOF)
989 break;
990 else if (c_tolower (c) != expected_chars[num_chars_read])
991 {
992 scm_ungetc_unlocked (c, port);
993 break;
994 }
995 else
996 chars_read[num_chars_read++] = c;
997 }
998
999 if (num_chars_read == num_chars_wanted)
1000 return 1;
1001 else
1002 {
1003 while (num_chars_read > 0)
1004 scm_ungetc_unlocked (chars_read[--num_chars_read], port);
1005 return 0;
1006 }
1007 }
1008
1009 \f
1010 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1011
1012 static SCM
1013 scm_read_boolean (int chr, SCM port)
1014 {
1015 switch (chr)
1016 {
1017 case 't':
1018 case 'T':
1019 try_read_ci_chars (port, "rue");
1020 return SCM_BOOL_T;
1021
1022 case 'f':
1023 case 'F':
1024 try_read_ci_chars (port, "alse");
1025 return SCM_BOOL_F;
1026 }
1027
1028 return SCM_UNSPECIFIED;
1029 }
1030
1031 static SCM
1032 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1033 #define FUNC_NAME "scm_lreadr"
1034 {
1035 char buffer[READER_CHAR_NAME_MAX_SIZE];
1036 SCM charname;
1037 size_t charname_len, bytes_read;
1038 scm_t_wchar cp;
1039 int overflow;
1040 scm_t_port_internal *pti;
1041
1042 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1043 &bytes_read);
1044 if (overflow)
1045 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1046
1047 if (bytes_read == 0)
1048 {
1049 chr = scm_getc_unlocked (port);
1050 if (chr == EOF)
1051 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1052 "while reading character", SCM_EOL);
1053
1054 /* CHR must be a token delimiter, like a whitespace. */
1055 return (SCM_MAKE_CHAR (chr));
1056 }
1057
1058 pti = SCM_PORT_GET_INTERNAL (port);
1059
1060 /* Simple ASCII characters can be processed immediately. Also, simple
1061 ISO-8859-1 characters can be processed immediately if the encoding for this
1062 port is ISO-8859-1. */
1063 if (bytes_read == 1 &&
1064 ((unsigned char) buffer[0] <= 127
1065 || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
1066 {
1067 SCM_COL (port) += 1;
1068 return SCM_MAKE_CHAR (buffer[0]);
1069 }
1070
1071 /* Otherwise, convert the buffer into a proper scheme string for
1072 processing. */
1073 charname = scm_from_port_stringn (buffer, bytes_read, port);
1074 charname_len = scm_i_string_length (charname);
1075 SCM_COL (port) += charname_len;
1076 cp = scm_i_string_ref (charname, 0);
1077 if (charname_len == 1)
1078 return SCM_MAKE_CHAR (cp);
1079
1080 /* Ignore dotted circles, which may be used to keep combining characters from
1081 combining with the backslash in #\charname. */
1082 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1083 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1084
1085 if (cp >= '0' && cp < '8')
1086 {
1087 /* Dirk:FIXME:: This type of character syntax is not R5RS
1088 * compliant. Further, it should be verified that the constant
1089 * does only consist of octal digits. */
1090 SCM p = scm_string_to_number (charname, scm_from_uint (8));
1091 if (SCM_I_INUMP (p))
1092 {
1093 scm_t_wchar c = scm_to_uint32 (p);
1094 if (SCM_IS_UNICODE_CHAR (c))
1095 return SCM_MAKE_CHAR (c);
1096 else
1097 scm_i_input_error (FUNC_NAME, port,
1098 "out-of-range octal character escape: ~a",
1099 scm_list_1 (charname));
1100 }
1101 }
1102
1103 if (cp == 'x' && (charname_len > 1))
1104 {
1105 SCM p;
1106
1107 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1108 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1109 scm_from_uint (16));
1110 if (SCM_I_INUMP (p))
1111 {
1112 scm_t_wchar c = scm_to_uint32 (p);
1113 if (SCM_IS_UNICODE_CHAR (c))
1114 return SCM_MAKE_CHAR (c);
1115 else
1116 scm_i_input_error (FUNC_NAME, port,
1117 "out-of-range hex character escape: ~a",
1118 scm_list_1 (charname));
1119 }
1120 }
1121
1122 /* The names of characters should never have non-Latin1
1123 characters. */
1124 if (scm_i_is_narrow_string (charname)
1125 || scm_i_try_narrow_string (charname))
1126 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1127 charname_len);
1128 if (scm_is_true (ch))
1129 return ch;
1130 }
1131
1132 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1133 scm_list_1 (charname));
1134
1135 return SCM_UNSPECIFIED;
1136 }
1137 #undef FUNC_NAME
1138
1139 static SCM
1140 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1141 {
1142 SCM symbol;
1143
1144 /* Read the symbol that comprises the keyword. Doing this instead of
1145 invoking a specific symbol reader function allows `scm_read_keyword ()'
1146 to adapt to the delimiters currently valid of symbols.
1147
1148 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1149 symbol = scm_read_expression (port, opts);
1150 if (!scm_is_symbol (symbol))
1151 scm_i_input_error ("scm_read_keyword", port,
1152 "keyword prefix `~a' not followed by a symbol: ~s",
1153 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1154
1155 return (scm_symbol_to_keyword (symbol));
1156 }
1157
1158 static SCM
1159 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1160 long line, int column)
1161 {
1162 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1163 guarantee that it's going to do what we want. After all, this is an
1164 implementation detail of `scm_read_vector ()', not a desirable
1165 property. */
1166 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1167 port, opts, line, column);
1168 }
1169
1170 /* Helper used by scm_read_array */
1171 static int
1172 read_decimal_integer (SCM port, int c, ssize_t *resp)
1173 {
1174 ssize_t sign = 1;
1175 ssize_t res = 0;
1176 int got_it = 0;
1177
1178 if (c == '-')
1179 {
1180 sign = -1;
1181 c = scm_getc_unlocked (port);
1182 }
1183
1184 while ('0' <= c && c <= '9')
1185 {
1186 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1187 scm_i_input_error ("read_decimal_integer", port,
1188 "number too large", SCM_EOL);
1189 res = 10*res + c-'0';
1190 got_it = 1;
1191 c = scm_getc_unlocked (port);
1192 }
1193
1194 if (got_it)
1195 *resp = sign * res;
1196 return c;
1197 }
1198
1199 /* Read an array. This function can also read vectors and uniform
1200 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1201 handled here.
1202
1203 C is the first character read after the '#'. */
1204 static SCM
1205 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1206 {
1207 ssize_t rank;
1208 scm_t_wchar tag_buf[8];
1209 int tag_len;
1210
1211 SCM tag, shape = SCM_BOOL_F, elements, array;
1212
1213 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1214 the array code can not deal with zero-length dimensions yet, and
1215 we want to allow zero-length vectors, of course. */
1216 if (c == '(')
1217 return scm_read_vector (c, port, opts, line, column);
1218
1219 /* Disambiguate between '#f' and uniform floating point vectors. */
1220 if (c == 'f')
1221 {
1222 c = scm_getc_unlocked (port);
1223 if (c != '3' && c != '6')
1224 {
1225 if (c == 'a' && try_read_ci_chars (port, "lse"))
1226 return SCM_BOOL_F;
1227 else if (c != EOF)
1228 scm_ungetc_unlocked (c, port);
1229 return SCM_BOOL_F;
1230 }
1231 rank = 1;
1232 tag_buf[0] = 'f';
1233 tag_len = 1;
1234 goto continue_reading_tag;
1235 }
1236
1237 /* Read rank. */
1238 rank = 1;
1239 c = read_decimal_integer (port, c, &rank);
1240 if (rank < 0)
1241 scm_i_input_error (NULL, port, "array rank must be non-negative",
1242 SCM_EOL);
1243
1244 /* Read tag. */
1245 tag_len = 0;
1246 continue_reading_tag:
1247 while (c != EOF && c != '(' && c != '@' && c != ':'
1248 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1249 {
1250 tag_buf[tag_len++] = c;
1251 c = scm_getc_unlocked (port);
1252 }
1253 if (tag_len == 0)
1254 tag = SCM_BOOL_T;
1255 else
1256 {
1257 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1258 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1259 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1260 scm_list_1 (tag));
1261 }
1262
1263 /* Read shape. */
1264 if (c == '@' || c == ':')
1265 {
1266 shape = SCM_EOL;
1267
1268 do
1269 {
1270 ssize_t lbnd = 0, len = 0;
1271 SCM s;
1272
1273 if (c == '@')
1274 {
1275 c = scm_getc_unlocked (port);
1276 c = read_decimal_integer (port, c, &lbnd);
1277 }
1278
1279 s = scm_from_ssize_t (lbnd);
1280
1281 if (c == ':')
1282 {
1283 c = scm_getc_unlocked (port);
1284 c = read_decimal_integer (port, c, &len);
1285 if (len < 0)
1286 scm_i_input_error (NULL, port,
1287 "array length must be non-negative",
1288 SCM_EOL);
1289
1290 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1291 }
1292
1293 shape = scm_cons (s, shape);
1294 } while (c == '@' || c == ':');
1295
1296 shape = scm_reverse_x (shape, SCM_EOL);
1297 }
1298
1299 /* Read nested lists of elements. */
1300 if (c != '(')
1301 scm_i_input_error (NULL, port,
1302 "missing '(' in vector or array literal",
1303 SCM_EOL);
1304 elements = scm_read_sexp (c, port, opts);
1305
1306 if (scm_is_false (shape))
1307 shape = scm_from_ssize_t (rank);
1308 else if (scm_ilength (shape) != rank)
1309 scm_i_input_error
1310 (NULL, port,
1311 "the number of shape specifications must match the array rank",
1312 SCM_EOL);
1313
1314 /* Handle special print syntax of rank zero arrays; see
1315 scm_i_print_array for a rationale. */
1316 if (rank == 0)
1317 {
1318 if (!scm_is_pair (elements))
1319 scm_i_input_error (NULL, port,
1320 "too few elements in array literal, need 1",
1321 SCM_EOL);
1322 if (!scm_is_null (SCM_CDR (elements)))
1323 scm_i_input_error (NULL, port,
1324 "too many elements in array literal, want 1",
1325 SCM_EOL);
1326 elements = SCM_CAR (elements);
1327 }
1328
1329 /* Construct array, annotate with source location, and return. */
1330 array = scm_list_to_typed_array (tag, shape, elements);
1331 return maybe_annotate_source (array, port, opts, line, column);
1332 }
1333
1334 static SCM
1335 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1336 long line, int column)
1337 {
1338 return scm_read_array (chr, port, opts, line, column);
1339 }
1340
1341 static SCM
1342 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1343 long line, int column)
1344 {
1345 chr = scm_getc_unlocked (port);
1346 if (chr != 'u')
1347 goto syntax;
1348
1349 chr = scm_getc_unlocked (port);
1350 if (chr != '8')
1351 goto syntax;
1352
1353 chr = scm_getc_unlocked (port);
1354 if (chr != '(')
1355 goto syntax;
1356
1357 return maybe_annotate_source
1358 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1359 port, opts, line, column);
1360
1361 syntax:
1362 scm_i_input_error ("read_bytevector", port,
1363 "invalid bytevector prefix",
1364 SCM_MAKE_CHAR (chr));
1365 return SCM_UNSPECIFIED;
1366 }
1367
1368 static SCM
1369 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1370 long line, int column)
1371 {
1372 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1373 terribly inefficient but who cares? */
1374 SCM s_bits = SCM_EOL;
1375
1376 for (chr = scm_getc_unlocked (port);
1377 (chr != EOF) && ((chr == '0') || (chr == '1'));
1378 chr = scm_getc_unlocked (port))
1379 {
1380 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1381 }
1382
1383 if (chr != EOF)
1384 scm_ungetc_unlocked (chr, port);
1385
1386 return maybe_annotate_source
1387 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1388 port, opts, line, column);
1389 }
1390
1391 static SCM
1392 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1393 {
1394 int bang_seen = 0;
1395
1396 for (;;)
1397 {
1398 int c = scm_getc_unlocked (port);
1399
1400 if (c == EOF)
1401 scm_i_input_error ("skip_block_comment", port,
1402 "unterminated `#! ... !#' comment", SCM_EOL);
1403
1404 if (c == '!')
1405 bang_seen = 1;
1406 else if (c == '#' && bang_seen)
1407 break;
1408 else
1409 bang_seen = 0;
1410 }
1411
1412 return SCM_UNSPECIFIED;
1413 }
1414
1415 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1416 int value);
1417 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1418 int value);
1419 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1420 int value);
1421
1422 static SCM
1423 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1424 {
1425 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1426 int c;
1427 int i = 0;
1428
1429 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1430 {
1431 c = scm_getc_unlocked (port);
1432 if (c == EOF)
1433 scm_i_input_error ("skip_block_comment", port,
1434 "unterminated `#! ... !#' comment", SCM_EOL);
1435 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1436 name[i++] = c;
1437 else if (CHAR_IS_DELIMITER (c))
1438 {
1439 scm_ungetc_unlocked (c, port);
1440 name[i] = '\0';
1441 if (0 == strcmp ("r6rs", name))
1442 ; /* Silently ignore */
1443 else if (0 == strcmp ("fold-case", name))
1444 set_port_case_insensitive_p (port, opts, 1);
1445 else if (0 == strcmp ("no-fold-case", name))
1446 set_port_case_insensitive_p (port, opts, 0);
1447 else if (0 == strcmp ("curly-infix", name))
1448 set_port_curly_infix_p (port, opts, 1);
1449 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1450 {
1451 set_port_curly_infix_p (port, opts, 1);
1452 set_port_square_brackets_p (port, opts, 0);
1453 }
1454 else
1455 break;
1456
1457 return SCM_UNSPECIFIED;
1458 }
1459 else
1460 {
1461 scm_ungetc_unlocked (c, port);
1462 break;
1463 }
1464 }
1465 while (i > 0)
1466 scm_ungetc_unlocked (name[--i], port);
1467 return scm_read_scsh_block_comment (chr, port);
1468 }
1469
1470 static SCM
1471 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1472 {
1473 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1474 nested. So care must be taken. */
1475 int nesting_level = 1;
1476
1477 int a = scm_getc_unlocked (port);
1478
1479 if (a == EOF)
1480 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1481 "unterminated `#| ... |#' comment", SCM_EOL);
1482
1483 while (nesting_level > 0)
1484 {
1485 int b = scm_getc_unlocked (port);
1486
1487 if (b == EOF)
1488 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1489 "unterminated `#| ... |#' comment", SCM_EOL);
1490
1491 if (a == '|' && b == '#')
1492 {
1493 nesting_level--;
1494 b = EOF;
1495 }
1496 else if (a == '#' && b == '|')
1497 {
1498 nesting_level++;
1499 b = EOF;
1500 }
1501
1502 a = b;
1503 }
1504
1505 return SCM_UNSPECIFIED;
1506 }
1507
1508 static SCM
1509 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1510 scm_t_read_opts *opts)
1511 {
1512 scm_t_wchar c;
1513
1514 c = flush_ws (port, opts, (char *) NULL);
1515 if (EOF == c)
1516 scm_i_input_error ("read_commented_expression", port,
1517 "no expression after #; comment", SCM_EOL);
1518 scm_ungetc_unlocked (c, port);
1519 scm_read_expression (port, opts);
1520 return SCM_UNSPECIFIED;
1521 }
1522
1523 static SCM
1524 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1525 {
1526 /* Guile's extended symbol read syntax looks like this:
1527
1528 #{This is all a symbol name}#
1529
1530 So here, CHR is expected to be `{'. */
1531 int saw_brace = 0;
1532 size_t len = 0;
1533 SCM buf = scm_i_make_string (1024, NULL, 0);
1534
1535 buf = scm_i_string_start_writing (buf);
1536
1537 while ((chr = scm_getc_unlocked (port)) != EOF)
1538 {
1539 if (saw_brace)
1540 {
1541 if (chr == '#')
1542 {
1543 break;
1544 }
1545 else
1546 {
1547 saw_brace = 0;
1548 scm_i_string_set_x (buf, len++, '}');
1549 }
1550 }
1551
1552 if (chr == '}')
1553 saw_brace = 1;
1554 else if (chr == '\\')
1555 {
1556 /* It used to be that print.c would print extended-read-syntax
1557 symbols with backslashes before "non-standard" chars, but
1558 this routine wouldn't do anything with those escapes.
1559 Bummer. What we've done is to change print.c to output
1560 R6RS hex escapes for those characters, relying on the fact
1561 that the extended read syntax would never put a `\' before
1562 an `x'. For now, we just ignore other instances of
1563 backslash in the string. */
1564 switch ((chr = scm_getc_unlocked (port)))
1565 {
1566 case EOF:
1567 goto done;
1568 case 'x':
1569 {
1570 scm_t_wchar c;
1571
1572 SCM_READ_HEX_ESCAPE (10, ';');
1573 scm_i_string_set_x (buf, len++, c);
1574 break;
1575
1576 str_eof:
1577 chr = EOF;
1578 goto done;
1579
1580 bad_escaped:
1581 scm_i_string_stop_writing ();
1582 scm_i_input_error ("scm_read_extended_symbol", port,
1583 "illegal character in escape sequence: ~S",
1584 scm_list_1 (SCM_MAKE_CHAR (c)));
1585 break;
1586 }
1587 default:
1588 scm_i_string_set_x (buf, len++, chr);
1589 break;
1590 }
1591 }
1592 else
1593 scm_i_string_set_x (buf, len++, chr);
1594
1595 if (len >= scm_i_string_length (buf) - 2)
1596 {
1597 SCM addy;
1598
1599 scm_i_string_stop_writing ();
1600 addy = scm_i_make_string (1024, NULL, 0);
1601 buf = scm_string_append (scm_list_2 (buf, addy));
1602 len = 0;
1603 buf = scm_i_string_start_writing (buf);
1604 }
1605 }
1606
1607 done:
1608 scm_i_string_stop_writing ();
1609 if (chr == EOF)
1610 scm_i_input_error ("scm_read_extended_symbol", port,
1611 "end of file while reading symbol", SCM_EOL);
1612
1613 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1614 }
1615
1616
1617 \f
1618 /* Top-level token readers, i.e., dispatchers. */
1619
1620 static SCM
1621 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1622 {
1623 SCM proc;
1624
1625 proc = scm_get_hash_procedure (chr);
1626 if (scm_is_true (scm_procedure_p (proc)))
1627 {
1628 long line = SCM_LINUM (port);
1629 int column = SCM_COL (port) - 2;
1630 SCM got;
1631
1632 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1633
1634 if (opts->record_positions_p && SCM_NIMP (got)
1635 && !scm_i_has_source_properties (got))
1636 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1637
1638 return got;
1639 }
1640
1641 return SCM_UNSPECIFIED;
1642 }
1643
1644 /* The reader for the sharp `#' character. It basically dispatches reads
1645 among the above token readers. */
1646 static SCM
1647 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1648 long line, int column)
1649 #define FUNC_NAME "scm_lreadr"
1650 {
1651 SCM result;
1652
1653 chr = scm_getc_unlocked (port);
1654
1655 result = scm_read_sharp_extension (chr, port, opts);
1656 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1657 return result;
1658
1659 switch (chr)
1660 {
1661 case '\\':
1662 return (scm_read_character (chr, port, opts));
1663 case '(':
1664 return (scm_read_vector (chr, port, opts, line, column));
1665 case 's':
1666 case 'u':
1667 case 'f':
1668 case 'c':
1669 /* This one may return either a boolean or an SRFI-4 vector. */
1670 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1671 case 'v':
1672 return (scm_read_bytevector (chr, port, opts, line, column));
1673 case '*':
1674 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1675 case 't':
1676 case 'T':
1677 case 'F':
1678 return (scm_read_boolean (chr, port));
1679 case ':':
1680 return (scm_read_keyword (chr, port, opts));
1681 case '0': case '1': case '2': case '3': case '4':
1682 case '5': case '6': case '7': case '8': case '9':
1683 case '@':
1684 return (scm_read_array (chr, port, opts, line, column));
1685
1686 case 'i':
1687 case 'e':
1688 case 'b':
1689 case 'B':
1690 case 'o':
1691 case 'O':
1692 case 'd':
1693 case 'D':
1694 case 'x':
1695 case 'X':
1696 case 'I':
1697 case 'E':
1698 return (scm_read_number_and_radix (chr, port, opts));
1699 case '{':
1700 return (scm_read_extended_symbol (chr, port));
1701 case '!':
1702 return (scm_read_shebang (chr, port, opts));
1703 case ';':
1704 return (scm_read_commented_expression (chr, port, opts));
1705 case '`':
1706 case '\'':
1707 case ',':
1708 return (scm_read_syntax (chr, port, opts));
1709 case 'n':
1710 return (scm_read_nil (chr, port, opts));
1711 default:
1712 result = scm_read_sharp_extension (chr, port, opts);
1713 if (scm_is_eq (result, SCM_UNSPECIFIED))
1714 {
1715 /* To remain compatible with 1.8 and earlier, the following
1716 characters have lower precedence than `read-hash-extend'
1717 characters. */
1718 switch (chr)
1719 {
1720 case '|':
1721 return scm_read_r6rs_block_comment (chr, port);
1722 default:
1723 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1724 scm_list_1 (SCM_MAKE_CHAR (chr)));
1725 }
1726 }
1727 else
1728 return result;
1729 }
1730
1731 return SCM_UNSPECIFIED;
1732 }
1733 #undef FUNC_NAME
1734
1735 static SCM
1736 read_inner_expression (SCM port, scm_t_read_opts *opts)
1737 #define FUNC_NAME "read_inner_expression"
1738 {
1739 while (1)
1740 {
1741 scm_t_wchar chr;
1742
1743 chr = scm_getc_unlocked (port);
1744
1745 switch (chr)
1746 {
1747 case SCM_WHITE_SPACES:
1748 case SCM_LINE_INCREMENTORS:
1749 break;
1750 case ';':
1751 (void) scm_read_semicolon_comment (chr, port);
1752 break;
1753 case '{':
1754 if (opts->curly_infix_p)
1755 {
1756 if (opts->neoteric_p)
1757 return scm_read_sexp (chr, port, opts);
1758 else
1759 {
1760 SCM expr;
1761
1762 /* Enable neoteric expressions within curly braces */
1763 opts->neoteric_p = 1;
1764 expr = scm_read_sexp (chr, port, opts);
1765 opts->neoteric_p = 0;
1766 return expr;
1767 }
1768 }
1769 else
1770 return scm_read_mixed_case_symbol (chr, port, opts);
1771 case '[':
1772 if (opts->square_brackets_p)
1773 return scm_read_sexp (chr, port, opts);
1774 else if (opts->curly_infix_p)
1775 {
1776 /* The syntax of neoteric expressions requires that '[' be
1777 a delimiter when curly-infix is enabled, so it cannot
1778 be part of an unescaped symbol. We might as well do
1779 something useful with it, so we adopt Kawa's convention:
1780 [...] => ($bracket-list$ ...) */
1781 long line = SCM_LINUM (port);
1782 int column = SCM_COL (port) - 1;
1783 return maybe_annotate_source
1784 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1785 port, opts, line, column);
1786 }
1787 else
1788 return scm_read_mixed_case_symbol (chr, port, opts);
1789 case '(':
1790 return (scm_read_sexp (chr, port, opts));
1791 case '"':
1792 return (scm_read_string (chr, port, opts));
1793 case '|':
1794 if (opts->r7rs_symbols_p)
1795 return scm_read_r7rs_symbol (chr, port, opts);
1796 else
1797 return scm_read_mixed_case_symbol (chr, port, opts);
1798 case '\'':
1799 case '`':
1800 case ',':
1801 return (scm_read_quote (chr, port, opts));
1802 case '#':
1803 {
1804 long line = SCM_LINUM (port);
1805 int column = SCM_COL (port) - 1;
1806 SCM result = scm_read_sharp (chr, port, opts, line, column);
1807 if (scm_is_eq (result, SCM_UNSPECIFIED))
1808 /* We read a comment or some such. */
1809 break;
1810 else
1811 return result;
1812 }
1813 case ')':
1814 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1815 break;
1816 case '}':
1817 if (opts->curly_infix_p)
1818 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1819 else
1820 return scm_read_mixed_case_symbol (chr, port, opts);
1821 case ']':
1822 if (opts->square_brackets_p)
1823 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1824 /* otherwise fall through */
1825 case EOF:
1826 return SCM_EOF_VAL;
1827 case ':':
1828 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1829 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1830 /* Fall through. */
1831
1832 default:
1833 {
1834 if (((chr >= '0') && (chr <= '9'))
1835 || (strchr ("+-.", chr)))
1836 return (scm_read_number (chr, port, opts));
1837 else
1838 return (scm_read_mixed_case_symbol (chr, port, opts));
1839 }
1840 }
1841 }
1842 }
1843 #undef FUNC_NAME
1844
1845 static SCM
1846 scm_read_expression (SCM port, scm_t_read_opts *opts)
1847 #define FUNC_NAME "scm_read_expression"
1848 {
1849 if (!opts->neoteric_p)
1850 return read_inner_expression (port, opts);
1851 else
1852 {
1853 long line = 0;
1854 int column = 0;
1855 SCM expr;
1856
1857 if (opts->record_positions_p)
1858 {
1859 /* We need to get the position of the first non-whitespace
1860 character in order to correctly annotate neoteric
1861 expressions. For example, for the expression 'f(x)', the
1862 first call to 'read_inner_expression' reads the 'f' (which
1863 cannot be annotated), and then we later read the '(x)' and
1864 use it to construct the new list (f x). */
1865 int c = flush_ws (port, opts, (char *) NULL);
1866 if (c == EOF)
1867 return SCM_EOF_VAL;
1868 scm_ungetc_unlocked (c, port);
1869 line = SCM_LINUM (port);
1870 column = SCM_COL (port);
1871 }
1872
1873 expr = read_inner_expression (port, opts);
1874
1875 /* 'expr' is the first component of the neoteric expression. Now
1876 we loop, and as long as the next character is '(', '[', or '{',
1877 (without any intervening whitespace), we use it to construct a
1878 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1879 for (;;)
1880 {
1881 int chr = scm_getc_unlocked (port);
1882
1883 if (chr == '(')
1884 /* e(...) => (e ...) */
1885 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1886 else if (chr == '[')
1887 /* e[...] => ($bracket-apply$ e ...) */
1888 expr = scm_cons (sym_bracket_apply,
1889 scm_cons (expr,
1890 scm_read_sexp (chr, port, opts)));
1891 else if (chr == '{')
1892 {
1893 SCM arg = scm_read_sexp (chr, port, opts);
1894
1895 if (scm_is_null (arg))
1896 expr = scm_list_1 (expr); /* e{} => (e) */
1897 else
1898 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1899 }
1900 else
1901 {
1902 if (chr != EOF)
1903 scm_ungetc_unlocked (chr, port);
1904 break;
1905 }
1906 maybe_annotate_source (expr, port, opts, line, column);
1907 }
1908 return expr;
1909 }
1910 }
1911 #undef FUNC_NAME
1912
1913 \f
1914 /* Actual reader. */
1915
1916 static void init_read_options (SCM port, scm_t_read_opts *opts);
1917
1918 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1919 (SCM port),
1920 "Read an s-expression from the input port @var{port}, or from\n"
1921 "the current input port if @var{port} is not specified.\n"
1922 "Any whitespace before the next token is discarded.")
1923 #define FUNC_NAME s_scm_read
1924 {
1925 scm_t_read_opts opts;
1926 int c;
1927
1928 if (SCM_UNBNDP (port))
1929 port = scm_current_input_port ();
1930 SCM_VALIDATE_OPINPORT (1, port);
1931
1932 init_read_options (port, &opts);
1933
1934 c = flush_ws (port, &opts, (char *) NULL);
1935 if (EOF == c)
1936 return SCM_EOF_VAL;
1937 scm_ungetc_unlocked (c, port);
1938
1939 return (scm_read_expression (port, &opts));
1940 }
1941 #undef FUNC_NAME
1942
1943
1944 \f
1945
1946 /* Manipulate the read-hash-procedures alist. This could be written in
1947 Scheme, but maybe it will also be used by C code during initialisation. */
1948 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1949 (SCM chr, SCM proc),
1950 "Install the procedure @var{proc} for reading expressions\n"
1951 "starting with the character sequence @code{#} and @var{chr}.\n"
1952 "@var{proc} will be called with two arguments: the character\n"
1953 "@var{chr} and the port to read further data from. The object\n"
1954 "returned will be the return value of @code{read}. \n"
1955 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1956 )
1957 #define FUNC_NAME s_scm_read_hash_extend
1958 {
1959 SCM this;
1960 SCM prev;
1961
1962 SCM_VALIDATE_CHAR (1, chr);
1963 SCM_ASSERT (scm_is_false (proc)
1964 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1965 proc, SCM_ARG2, FUNC_NAME);
1966
1967 /* Check if chr is already in the alist. */
1968 this = scm_i_read_hash_procedures_ref ();
1969 prev = SCM_BOOL_F;
1970 while (1)
1971 {
1972 if (scm_is_null (this))
1973 {
1974 /* not found, so add it to the beginning. */
1975 if (scm_is_true (proc))
1976 {
1977 SCM new = scm_cons (scm_cons (chr, proc),
1978 scm_i_read_hash_procedures_ref ());
1979 scm_i_read_hash_procedures_set_x (new);
1980 }
1981 break;
1982 }
1983 if (scm_is_eq (chr, SCM_CAAR (this)))
1984 {
1985 /* already in the alist. */
1986 if (scm_is_false (proc))
1987 {
1988 /* remove it. */
1989 if (scm_is_false (prev))
1990 {
1991 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1992 scm_i_read_hash_procedures_set_x (rest);
1993 }
1994 else
1995 scm_set_cdr_x (prev, SCM_CDR (this));
1996 }
1997 else
1998 {
1999 /* replace it. */
2000 scm_set_cdr_x (SCM_CAR (this), proc);
2001 }
2002 break;
2003 }
2004 prev = this;
2005 this = SCM_CDR (this);
2006 }
2007
2008 return SCM_UNSPECIFIED;
2009 }
2010 #undef FUNC_NAME
2011
2012 /* Recover the read-hash procedure corresponding to char c. */
2013 static SCM
2014 scm_get_hash_procedure (int c)
2015 {
2016 SCM rest = scm_i_read_hash_procedures_ref ();
2017
2018 while (1)
2019 {
2020 if (scm_is_null (rest))
2021 return SCM_BOOL_F;
2022
2023 if (SCM_CHAR (SCM_CAAR (rest)) == c)
2024 return SCM_CDAR (rest);
2025
2026 rest = SCM_CDR (rest);
2027 }
2028 }
2029
2030 static int
2031 is_encoding_char (char c)
2032 {
2033 if (c >= 'a' && c <= 'z') return 1;
2034 if (c >= 'A' && c <= 'Z') return 1;
2035 if (c >= '0' && c <= '9') return 1;
2036 return strchr ("_-.:/,+=()", c) != NULL;
2037 }
2038
2039 /* Maximum size of an encoding name. This is a bit more than the
2040 longest name listed at
2041 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2042 characters.) */
2043 #define ENCODING_NAME_MAX_SIZE 20
2044
2045 /* Number of bytes at the beginning or end of a file that are scanned
2046 for a "coding:" declaration. */
2047 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2048
2049
2050 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2051 coding declaration. Returns either NULL or a string whose storage
2052 has been allocated with `scm_gc_malloc'. */
2053 char *
2054 scm_i_scan_for_encoding (SCM port)
2055 {
2056 scm_t_port *pt;
2057 char header[SCM_ENCODING_SEARCH_SIZE+1];
2058 size_t bytes_read, encoding_length, i;
2059 char *encoding = NULL;
2060 char *pos, *encoding_start;
2061 int in_comment;
2062
2063 pt = SCM_PTAB_ENTRY (port);
2064
2065 if (pt->rw_active == SCM_PORT_WRITE)
2066 scm_flush_unlocked (port);
2067
2068 if (pt->rw_random)
2069 pt->rw_active = SCM_PORT_READ;
2070
2071 if (pt->read_pos == pt->read_end)
2072 {
2073 /* We can use the read buffer, and thus avoid a seek. */
2074 if (scm_fill_input_unlocked (port) == EOF)
2075 return NULL;
2076
2077 bytes_read = pt->read_end - pt->read_pos;
2078 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2079 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2080
2081 if (bytes_read <= 1)
2082 /* An unbuffered port -- don't scan. */
2083 return NULL;
2084
2085 memcpy (header, pt->read_pos, bytes_read);
2086 header[bytes_read] = '\0';
2087 }
2088 else
2089 {
2090 /* Try to read some bytes and then seek back. Not all ports
2091 support seeking back; and indeed some file ports (like
2092 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2093 check performed by SCM_FPORT_FDES---but fail to seek
2094 backwards. Hence this block comes second. We prefer to use
2095 the read buffer in-place. */
2096 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2097 return NULL;
2098
2099 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2100 header[bytes_read] = '\0';
2101 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2102 }
2103
2104 /* search past "coding[:=]" */
2105 pos = header;
2106 while (1)
2107 {
2108 if ((pos = strstr(pos, "coding")) == NULL)
2109 return NULL;
2110
2111 pos += strlen ("coding");
2112 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2113 (*pos == ':' || *pos == '='))
2114 {
2115 pos ++;
2116 break;
2117 }
2118 }
2119
2120 /* skip spaces */
2121 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2122 (*pos == ' ' || *pos == '\t'))
2123 pos ++;
2124
2125 if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2126 /* We found the "coding:" string, but there is probably not enough
2127 room to store an encoding name in its entirety, so ignore it.
2128 This makes sure we do not end up returning a truncated encoding
2129 name. */
2130 return NULL;
2131
2132 /* grab the next token */
2133 encoding_start = pos;
2134 i = 0;
2135 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2136 && encoding_start + i - header < bytes_read
2137 && is_encoding_char (encoding_start[i]))
2138 i++;
2139
2140 encoding_length = i;
2141 if (encoding_length == 0)
2142 return NULL;
2143
2144 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2145
2146 /* push backwards to make sure we were in a comment */
2147 in_comment = 0;
2148 pos = encoding_start;
2149 while (pos >= header)
2150 {
2151 if (*pos == ';')
2152 {
2153 in_comment = 1;
2154 break;
2155 }
2156 else if (*pos == '\n' || pos == header)
2157 {
2158 /* This wasn't in a semicolon comment. Check for a
2159 hash-bang comment. */
2160 char *beg = strstr (header, "#!");
2161 char *end = strstr (header, "!#");
2162 if (beg < encoding_start && encoding_start + encoding_length <= end)
2163 in_comment = 1;
2164 break;
2165 }
2166 else
2167 {
2168 pos --;
2169 continue;
2170 }
2171 }
2172 if (!in_comment)
2173 /* This wasn't in a comment */
2174 return NULL;
2175
2176 return encoding;
2177 }
2178
2179 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2180 (SCM port),
2181 "Scans the port for an Emacs-like character coding declaration\n"
2182 "near the top of the contents of a port with random-accessible contents.\n"
2183 "The coding declaration is of the form\n"
2184 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2185 "\n"
2186 "Returns a string containing the character encoding of the file\n"
2187 "if a declaration was found, or @code{#f} otherwise.\n")
2188 #define FUNC_NAME s_scm_file_encoding
2189 {
2190 char *enc;
2191 SCM s_enc;
2192
2193 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2194
2195 enc = scm_i_scan_for_encoding (port);
2196 if (enc == NULL)
2197 return SCM_BOOL_F;
2198 else
2199 {
2200 s_enc = scm_string_upcase (scm_from_locale_string (enc));
2201 return s_enc;
2202 }
2203
2204 return SCM_BOOL_F;
2205 }
2206 #undef FUNC_NAME
2207
2208 \f
2209 /* Per-port read options.
2210
2211 We store per-port read options in the 'port-read-options' port
2212 property, which is stored in the internal port structure. The value
2213 stored is a single integer that contains a two-bit field for each
2214 read option.
2215
2216 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2217 the applicable value should be inherited from the corresponding
2218 global read option. Otherwise, the bit field contains the value of
2219 the read option. For boolean read options that have been set
2220 per-port, the possible values are 0 or 1. If the 'keyword_style'
2221 read option has been set per-port, its possible values are those in
2222 'enum t_keyword_style'. */
2223
2224 /* Key to read options in port properties. */
2225 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2226
2227 /* Offsets of bit fields for each per-port override */
2228 #define READ_OPTION_COPY_SOURCE_P 0
2229 #define READ_OPTION_RECORD_POSITIONS_P 2
2230 #define READ_OPTION_CASE_INSENSITIVE_P 4
2231 #define READ_OPTION_KEYWORD_STYLE 6
2232 #define READ_OPTION_R6RS_ESCAPES_P 8
2233 #define READ_OPTION_SQUARE_BRACKETS_P 10
2234 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2235 #define READ_OPTION_CURLY_INFIX_P 14
2236 #define READ_OPTION_R7RS_SYMBOLS_P 16
2237
2238 /* The total width in bits of the per-port overrides */
2239 #define READ_OPTIONS_NUM_BITS 18
2240
2241 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2242 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2243
2244 #define READ_OPTION_MASK 3
2245 #define READ_OPTION_INHERIT 3
2246
2247 static void
2248 set_port_read_option (SCM port, int option, int new_value)
2249 {
2250 SCM scm_read_options;
2251 unsigned int read_options;
2252
2253 new_value &= READ_OPTION_MASK;
2254
2255 scm_dynwind_begin (0);
2256 scm_dynwind_lock_port (port);
2257
2258 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2259 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2260 read_options = scm_to_uint (scm_read_options);
2261 else
2262 read_options = READ_OPTIONS_INHERIT_ALL;
2263 read_options &= ~(READ_OPTION_MASK << option);
2264 read_options |= new_value << option;
2265 scm_read_options = scm_from_uint (read_options);
2266 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2267
2268 scm_dynwind_end ();
2269 }
2270
2271 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2272 static void
2273 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2274 {
2275 value = !!value;
2276 opts->case_insensitive_p = value;
2277 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2278 }
2279
2280 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2281 static void
2282 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2283 {
2284 value = !!value;
2285 opts->square_brackets_p = value;
2286 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2287 }
2288
2289 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2290 static void
2291 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2292 {
2293 value = !!value;
2294 opts->curly_infix_p = value;
2295 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2296 }
2297
2298 /* Initialize OPTS based on PORT's read options and the global read
2299 options. */
2300 static void
2301 init_read_options (SCM port, scm_t_read_opts *opts)
2302 {
2303 SCM val, scm_read_options;
2304 unsigned int read_options, x;
2305
2306 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2307
2308 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2309 read_options = scm_to_uint (scm_read_options);
2310 else
2311 read_options = READ_OPTIONS_INHERIT_ALL;
2312
2313 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2314 if (x == READ_OPTION_INHERIT)
2315 {
2316 val = SCM_PACK (SCM_KEYWORD_STYLE);
2317 if (scm_is_eq (val, scm_keyword_prefix))
2318 x = KEYWORD_STYLE_PREFIX;
2319 else if (scm_is_eq (val, scm_keyword_postfix))
2320 x = KEYWORD_STYLE_POSTFIX;
2321 else
2322 x = KEYWORD_STYLE_HASH_PREFIX;
2323 }
2324 opts->keyword_style = x;
2325
2326 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2327 do \
2328 { \
2329 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2330 if (x == READ_OPTION_INHERIT) \
2331 x = !!SCM_ ## NAME; \
2332 opts->name = x; \
2333 } \
2334 while (0)
2335
2336 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2337 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2338 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2339 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2340 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2341 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2342 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
2343 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
2344
2345 #undef RESOLVE_BOOLEAN_OPTION
2346
2347 opts->neoteric_p = 0;
2348 }
2349
2350 void
2351 scm_init_read ()
2352 {
2353 SCM read_hash_procs;
2354
2355 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2356
2357 scm_i_read_hash_procedures =
2358 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2359
2360 scm_init_opts (scm_read_options, scm_read_opts);
2361 #include "libguile/read.x"
2362 }
2363
2364 /*
2365 Local Variables:
2366 c-file-style: "gnu"
2367 End:
2368 */