Merge commit '495cea0c931de23f074892b3f32808e676712a18'
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <unicase.h>
31 #include <unictype.h>
32 #include <c-strcase.h>
33 #include <c-ctype.h>
34
35 #include "libguile/_scm.h"
36 #include "libguile/bytevectors.h"
37 #include "libguile/chars.h"
38 #include "libguile/eval.h"
39 #include "libguile/arrays.h"
40 #include "libguile/bitvectors.h"
41 #include "libguile/keywords.h"
42 #include "libguile/alist.h"
43 #include "libguile/srcprop.h"
44 #include "libguile/hashtab.h"
45 #include "libguile/hash.h"
46 #include "libguile/ports.h"
47 #include "libguile/ports-internal.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
56
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
59
60
61 \f
62
63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
66 SCM_SYMBOL (sym_nil, "nil");
67
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
72
73 scm_t_option scm_read_opts[] =
74 {
75 { SCM_OPTION_BOOLEAN, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
93 { 0, },
94 };
95
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
99
100 enum t_keyword_style
101 {
102 KEYWORD_STYLE_HASH_PREFIX,
103 KEYWORD_STYLE_PREFIX,
104 KEYWORD_STYLE_POSTFIX
105 };
106
107 struct t_read_opts
108 {
109 enum t_keyword_style keyword_style;
110 unsigned int copy_source_p : 1;
111 unsigned int record_positions_p : 1;
112 unsigned int case_insensitive_p : 1;
113 unsigned int r6rs_escapes_p : 1;
114 unsigned int square_brackets_p : 1;
115 unsigned int hungry_eol_escapes_p : 1;
116 unsigned int curly_infix_p : 1;
117 unsigned int neoteric_p : 1;
118 unsigned int r7rs_symbols_p : 1;
119 };
120
121 typedef struct t_read_opts scm_t_read_opts;
122
123
124 /*
125 Give meaningful error messages for errors
126
127 We use the format
128
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
131
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
134
135 */
136
137
138 void
139 scm_i_input_error (char const *function,
140 SCM port, const char *message, SCM arg)
141 {
142 SCM fn = (scm_is_string (SCM_FILENAME(port))
143 ? SCM_FILENAME(port)
144 : scm_from_locale_string ("#<unknown port>"));
145
146 SCM string_port = scm_open_output_string ();
147 SCM string = SCM_EOL;
148 scm_simple_format (string_port,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
150 scm_list_4 (fn,
151 scm_from_long (SCM_LINUM (port) + 1),
152 scm_from_int (SCM_COL (port) + 1),
153 scm_from_locale_string (message)));
154
155 string = scm_get_output_string (string_port);
156 scm_close_output_port (string_port);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function? scm_from_locale_string (function) : SCM_BOOL_F,
159 string,
160 arg,
161 SCM_BOOL_F);
162 }
163
164
165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
166 (SCM setting),
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
171 {
172 SCM ans = scm_options (setting,
173 scm_read_opts,
174 FUNC_NAME);
175 if (SCM_COPY_SOURCE_P)
176 SCM_RECORD_POSITIONS_P = 1;
177 return ans;
178 }
179 #undef FUNC_NAME
180
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM *scm_i_read_hash_procedures;
184
185 static SCM
186 scm_i_read_hash_procedures_ref (void)
187 {
188 return scm_fluid_ref (*scm_i_read_hash_procedures);
189 }
190
191 static void
192 scm_i_read_hash_procedures_set_x (SCM value)
193 {
194 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
195 }
196
197 \f
198 /* Token readers. */
199
200
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
203
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
206
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
209
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
212
213
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
218
219 #ifdef MSDOS
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
222 #else
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
224 #endif
225
226
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
228 structure''). */
229 #define CHAR_IS_R5RS_DELIMITER(c) \
230 (CHAR_IS_BLANK (c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
232
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
238
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
240 Structure''. */
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
244
245 /* Read an SCSH block comment. */
246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
250 static SCM scm_get_hash_procedure (int);
251
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
256 static int
257 read_token (SCM port, scm_t_read_opts *opts,
258 char *buf, size_t buf_size, size_t *read)
259 {
260 *read = 0;
261
262 while (*read < buf_size)
263 {
264 int chr;
265
266 chr = scm_get_byte_or_eof_unlocked (port);
267
268 if (chr == EOF)
269 return 0;
270 else if (CHAR_IS_DELIMITER (chr))
271 {
272 scm_unget_byte_unlocked (chr, port);
273 return 0;
274 }
275 else
276 {
277 *buf = (char) chr;
278 buf++, (*read)++;
279 }
280 }
281
282 return 1;
283 }
284
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
287 static char *
288 read_complete_token (SCM port, scm_t_read_opts *opts,
289 char *buffer, size_t buffer_size, size_t *read)
290 {
291 int overflow = 0;
292 size_t bytes_read, overflow_size = 0;
293 char *overflow_buffer = NULL;
294
295 do
296 {
297 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
298 if (bytes_read == 0)
299 break;
300 if (overflow || overflow_size != 0)
301 {
302 if (overflow_size == 0)
303 {
304 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
305 memcpy (overflow_buffer, buffer, bytes_read);
306 overflow_size = bytes_read;
307 }
308 else
309 {
310 char *new_buf =
311 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
312
313 memcpy (new_buf, overflow_buffer, overflow_size);
314 memcpy (new_buf + overflow_size, buffer, bytes_read);
315
316 overflow_buffer = new_buf;
317 overflow_size += bytes_read;
318 }
319 }
320 }
321 while (overflow);
322
323 if (overflow_size)
324 *read = overflow_size;
325 else
326 *read = bytes_read;
327
328 return (overflow_size > 0 ? overflow_buffer : buffer);
329 }
330
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
333 static int
334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
335 {
336 scm_t_wchar c;
337 while (1)
338 switch (c = scm_getc_unlocked (port))
339 {
340 case EOF:
341 goteof:
342 if (eoferr)
343 {
344 scm_i_input_error (eoferr,
345 port,
346 "end of file",
347 SCM_EOL);
348 }
349 return c;
350
351 case ';':
352 lp:
353 switch (c = scm_getc_unlocked (port))
354 {
355 case EOF:
356 goto goteof;
357 default:
358 goto lp;
359 case SCM_LINE_INCREMENTORS:
360 break;
361 }
362 break;
363
364 case '#':
365 switch (c = scm_getc_unlocked (port))
366 {
367 case EOF:
368 eoferr = "read_sharp";
369 goto goteof;
370 case '!':
371 scm_read_shebang (c, port, opts);
372 break;
373 case ';':
374 scm_read_commented_expression (c, port, opts);
375 break;
376 case '|':
377 if (scm_is_false (scm_get_hash_procedure (c)))
378 {
379 scm_read_r6rs_block_comment (c, port);
380 break;
381 }
382 /* fall through */
383 default:
384 scm_ungetc_unlocked (c, port);
385 return '#';
386 }
387 break;
388
389 case SCM_LINE_INCREMENTORS:
390 case SCM_SINGLE_SPACES:
391 case '\t':
392 break;
393
394 default:
395 return c;
396 }
397
398 return 0;
399 }
400
401
402 \f
403 /* Token readers. */
404
405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
407 long line, int column);
408
409
410 static SCM
411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
412 long line, int column)
413 {
414 if (opts->record_positions_p)
415 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
416 return x;
417 }
418
419 static SCM
420 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
421 #define FUNC_NAME "scm_i_lreadparen"
422 {
423 int c;
424 SCM tmp, tl, ans = SCM_EOL;
425 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
426 const int terminating_char = ((chr == '{') ? '}'
427 : ((chr == '[') ? ']'
428 : ')'));
429
430 /* Need to capture line and column numbers here. */
431 long line = SCM_LINUM (port);
432 int column = SCM_COL (port) - 1;
433
434 c = flush_ws (port, opts, FUNC_NAME);
435 if (terminating_char == c)
436 return SCM_EOL;
437
438 scm_ungetc_unlocked (c, port);
439 tmp = scm_read_expression (port, opts);
440
441 /* Note that it is possible for scm_read_expression to return
442 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
443 check that it's a real dot by checking `c'. */
444 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
445 {
446 ans = scm_read_expression (port, opts);
447 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
448 scm_i_input_error (FUNC_NAME, port, "missing close paren",
449 SCM_EOL);
450 return ans;
451 }
452
453 /* Build the head of the list structure. */
454 ans = tl = scm_cons (tmp, SCM_EOL);
455
456 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
457 {
458 SCM new_tail;
459
460 if (c == ')' || (c == ']' && opts->square_brackets_p)
461 || ((c == '}' || c == ']') && opts->curly_infix_p))
462 scm_i_input_error (FUNC_NAME, port,
463 "in pair: mismatched close paren: ~A",
464 scm_list_1 (SCM_MAKE_CHAR (c)));
465
466 scm_ungetc_unlocked (c, port);
467 tmp = scm_read_expression (port, opts);
468
469 /* See above note about scm_sym_dot. */
470 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
471 {
472 SCM_SETCDR (tl, scm_read_expression (port, opts));
473
474 c = flush_ws (port, opts, FUNC_NAME);
475 if (terminating_char != c)
476 scm_i_input_error (FUNC_NAME, port,
477 "in pair: missing close paren", SCM_EOL);
478 break;
479 }
480
481 new_tail = scm_cons (tmp, SCM_EOL);
482 SCM_SETCDR (tl, new_tail);
483 tl = new_tail;
484 }
485
486 if (curly_list_p)
487 {
488 /* In addition to finding the length, 'scm_ilength' checks for
489 improper or circular lists, in which case it returns -1. */
490 int len = scm_ilength (ans);
491
492 /* The (len == 0) case is handled above */
493 if (len == 1)
494 /* Return directly to avoid re-annotating the element's source
495 location with the position of the outer brace. Also, it
496 might not be possible to annotate the element. */
497 return scm_car (ans); /* {e} => e */
498 else if (len == 2)
499 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
500 else if (len >= 3 && (len & 1))
501 {
502 /* It's a proper list whose length is odd and at least 3. If
503 the elements at odd indices (the infix operator positions)
504 are all 'equal?', then it's a simple curly-infix list.
505 Otherwise it's a mixed curly-infix list. */
506 SCM op = scm_cadr (ans);
507
508 /* Check to see if the elements at odd indices are 'equal?' */
509 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
510 {
511 if (scm_is_null (tl))
512 {
513 /* Convert simple curly-infix list to prefix:
514 {a <op> b <op> ...} => (<op> a b ...) */
515 tl = ans;
516 while (scm_is_pair (scm_cdr (tl)))
517 {
518 tmp = scm_cddr (tl);
519 SCM_SETCDR (tl, tmp);
520 tl = tmp;
521 }
522 ans = scm_cons (op, ans);
523 break;
524 }
525 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
526 {
527 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
528 ans = scm_cons (sym_nfx, ans);
529 break;
530 }
531 }
532 }
533 else
534 /* Mixed curly-infix (possibly improper) list:
535 {e . tail} => ($nfx$ e . tail) */
536 ans = scm_cons (sym_nfx, ans);
537 }
538
539 return maybe_annotate_source (ans, port, opts, line, column);
540 }
541 #undef FUNC_NAME
542
543
544 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
545 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
546 found. */
547 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
548 do \
549 { \
550 scm_t_wchar a; \
551 size_t i = 0; \
552 c = 0; \
553 while (i < ndigits) \
554 { \
555 a = scm_getc_unlocked (port); \
556 if (a == EOF) \
557 goto str_eof; \
558 if (terminator \
559 && (a == (scm_t_wchar) terminator) \
560 && (i > 0)) \
561 break; \
562 if ('0' <= a && a <= '9') \
563 a -= '0'; \
564 else if ('A' <= a && a <= 'F') \
565 a = a - 'A' + 10; \
566 else if ('a' <= a && a <= 'f') \
567 a = a - 'a' + 10; \
568 else \
569 { \
570 c = a; \
571 goto bad_escaped; \
572 } \
573 c = c * 16 + a; \
574 i ++; \
575 } \
576 } while (0)
577
578 static void
579 skip_intraline_whitespace (SCM port)
580 {
581 scm_t_wchar c;
582
583 do
584 {
585 c = scm_getc_unlocked (port);
586 if (c == EOF)
587 return;
588 }
589 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
590
591 scm_ungetc_unlocked (c, port);
592 }
593
594 /* Read either a double-quoted string or an R7RS-style symbol delimited
595 by vertical lines, depending on the value of 'chr' ('"' or '|').
596 Regardless, the result is always returned as a string. */
597 static SCM
598 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
599 #define FUNC_NAME "scm_lreadr"
600 {
601 /* For strings smaller than C_STR, this function creates only one Scheme
602 object (the string returned). */
603
604 SCM str = SCM_EOL;
605 size_t c_str_len = 0;
606 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
607
608 /* Need to capture line and column numbers here. */
609 long line = SCM_LINUM (port);
610 int column = SCM_COL (port) - 1;
611
612 while (chr != (c = scm_getc_unlocked (port)))
613 {
614 if (c == EOF)
615 {
616 str_eof:
617 scm_i_input_error (FUNC_NAME, port,
618 (chr == '|'
619 ? "end of file in symbol"
620 : "end of file in string constant"),
621 SCM_EOL);
622 }
623
624 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
625 {
626 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
627 c_str_len = 0;
628 }
629
630 if (c == '\\')
631 {
632 switch (c = scm_getc_unlocked (port))
633 {
634 case EOF:
635 goto str_eof;
636 case '|':
637 case '\\':
638 break;
639 case '\n':
640 if (opts->hungry_eol_escapes_p)
641 skip_intraline_whitespace (port);
642 continue;
643 case '0':
644 c = '\0';
645 break;
646 case 'f':
647 c = '\f';
648 break;
649 case 'n':
650 c = '\n';
651 break;
652 case 'r':
653 c = '\r';
654 break;
655 case 't':
656 c = '\t';
657 break;
658 case 'a':
659 c = '\007';
660 break;
661 case 'v':
662 c = '\v';
663 break;
664 case 'b':
665 c = '\010';
666 break;
667 case 'x':
668 if (opts->r6rs_escapes_p || chr == '|')
669 SCM_READ_HEX_ESCAPE (10, ';');
670 else
671 SCM_READ_HEX_ESCAPE (2, '\0');
672 break;
673 case 'u':
674 if (!opts->r6rs_escapes_p)
675 {
676 SCM_READ_HEX_ESCAPE (4, '\0');
677 break;
678 }
679 case 'U':
680 if (!opts->r6rs_escapes_p)
681 {
682 SCM_READ_HEX_ESCAPE (6, '\0');
683 break;
684 }
685 default:
686 if (c == chr)
687 break;
688 bad_escaped:
689 scm_i_input_error (FUNC_NAME, port,
690 "illegal character in escape sequence: ~S",
691 scm_list_1 (SCM_MAKE_CHAR (c)));
692 }
693 }
694
695 c_str[c_str_len++] = c;
696 }
697
698 if (scm_is_null (str))
699 /* Fast path: we got a string that fits in C_STR. */
700 str = scm_from_utf32_stringn (c_str, c_str_len);
701 else
702 {
703 if (c_str_len > 0)
704 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
705
706 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
707 }
708
709 return maybe_annotate_source (str, port, opts, line, column);
710 }
711 #undef FUNC_NAME
712
713 static SCM
714 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
715 {
716 return scm_read_string_like_syntax (chr, port, opts);
717 }
718
719 static SCM
720 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
721 {
722 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
723 }
724
725 static SCM
726 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
727 {
728 SCM result, str = SCM_EOL;
729 char local_buffer[READER_BUFFER_SIZE], *buffer;
730 size_t bytes_read;
731
732 /* Need to capture line and column numbers here. */
733 long line = SCM_LINUM (port);
734 int column = SCM_COL (port) - 1;
735
736 scm_ungetc_unlocked (chr, port);
737 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
738 &bytes_read);
739
740 str = scm_from_port_stringn (buffer, bytes_read, port);
741
742 result = scm_string_to_number (str, SCM_UNDEFINED);
743 if (scm_is_false (result))
744 {
745 /* Return a symbol instead of a number */
746 if (opts->case_insensitive_p)
747 str = scm_string_downcase_x (str);
748 result = scm_string_to_symbol (str);
749 }
750 else if (SCM_NIMP (result))
751 result = maybe_annotate_source (result, port, opts, line, column);
752
753 SCM_COL (port) += scm_i_string_length (str);
754 return result;
755 }
756
757 static SCM
758 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
759 {
760 SCM result;
761 int ends_with_colon = 0;
762 size_t bytes_read;
763 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
764 char local_buffer[READER_BUFFER_SIZE], *buffer;
765 SCM str;
766
767 scm_ungetc_unlocked (chr, port);
768 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
769 &bytes_read);
770 if (bytes_read > 0)
771 ends_with_colon = buffer[bytes_read - 1] == ':';
772
773 if (postfix && ends_with_colon && (bytes_read > 1))
774 {
775 str = scm_from_port_stringn (buffer, bytes_read - 1, port);
776
777 if (opts->case_insensitive_p)
778 str = scm_string_downcase_x (str);
779 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
780 }
781 else
782 {
783 str = scm_from_port_stringn (buffer, bytes_read, port);
784
785 if (opts->case_insensitive_p)
786 str = scm_string_downcase_x (str);
787 result = scm_string_to_symbol (str);
788 }
789
790 SCM_COL (port) += scm_i_string_length (str);
791 return result;
792 }
793
794 static SCM
795 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
796 #define FUNC_NAME "scm_lreadr"
797 {
798 SCM result;
799 size_t read;
800 char local_buffer[READER_BUFFER_SIZE], *buffer;
801 unsigned int radix;
802 SCM str;
803
804 switch (chr)
805 {
806 case 'B':
807 case 'b':
808 radix = 2;
809 break;
810
811 case 'o':
812 case 'O':
813 radix = 8;
814 break;
815
816 case 'd':
817 case 'D':
818 radix = 10;
819 break;
820
821 case 'x':
822 case 'X':
823 radix = 16;
824 break;
825
826 default:
827 scm_ungetc_unlocked (chr, port);
828 scm_ungetc_unlocked ('#', port);
829 radix = 10;
830 }
831
832 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
833 &read);
834
835 str = scm_from_port_stringn (buffer, read, port);
836
837 result = scm_string_to_number (str, scm_from_uint (radix));
838
839 SCM_COL (port) += scm_i_string_length (str);
840
841 if (scm_is_true (result))
842 return result;
843
844 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
845
846 return SCM_BOOL_F;
847 }
848 #undef FUNC_NAME
849
850 static SCM
851 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
852 {
853 SCM p;
854 long line = SCM_LINUM (port);
855 int column = SCM_COL (port) - 1;
856
857 switch (chr)
858 {
859 case '`':
860 p = scm_sym_quasiquote;
861 break;
862
863 case '\'':
864 p = scm_sym_quote;
865 break;
866
867 case ',':
868 {
869 scm_t_wchar c;
870
871 c = scm_getc_unlocked (port);
872 if ('@' == c)
873 p = scm_sym_uq_splicing;
874 else
875 {
876 scm_ungetc_unlocked (c, port);
877 p = scm_sym_unquote;
878 }
879 break;
880 }
881
882 default:
883 fprintf (stderr, "%s: unhandled quote character (%i)\n",
884 "scm_read_quote", chr);
885 abort ();
886 }
887
888 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
889 return maybe_annotate_source (p, port, opts, line, column);
890 }
891
892 SCM_SYMBOL (sym_syntax, "syntax");
893 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
894 SCM_SYMBOL (sym_unsyntax, "unsyntax");
895 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
896
897 static SCM
898 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
899 {
900 SCM p;
901 long line = SCM_LINUM (port);
902 int column = SCM_COL (port) - 1;
903
904 switch (chr)
905 {
906 case '`':
907 p = sym_quasisyntax;
908 break;
909
910 case '\'':
911 p = sym_syntax;
912 break;
913
914 case ',':
915 {
916 int c;
917
918 c = scm_getc_unlocked (port);
919 if ('@' == c)
920 p = sym_unsyntax_splicing;
921 else
922 {
923 scm_ungetc_unlocked (c, port);
924 p = sym_unsyntax;
925 }
926 break;
927 }
928
929 default:
930 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
931 "scm_read_syntax", chr);
932 abort ();
933 }
934
935 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
936 return maybe_annotate_source (p, port, opts, line, column);
937 }
938
939 static SCM
940 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
941 {
942 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
943
944 if (!scm_is_eq (id, sym_nil))
945 scm_i_input_error ("scm_read_nil", port,
946 "unexpected input while reading #nil: ~a",
947 scm_list_1 (id));
948
949 return SCM_ELISP_NIL;
950 }
951
952 static SCM
953 scm_read_semicolon_comment (int chr, SCM port)
954 {
955 int c;
956
957 /* We use the get_byte here because there is no need to get the
958 locale correct with comment input. This presumes that newline
959 always represents itself no matter what the encoding is. */
960 for (c = scm_get_byte_or_eof_unlocked (port);
961 (c != EOF) && (c != '\n');
962 c = scm_get_byte_or_eof_unlocked (port));
963
964 return SCM_UNSPECIFIED;
965 }
966
967 /* If the EXPECTED_CHARS are the next ones available from PORT, then
968 consume them and return 1. Otherwise leave the port position where
969 it was and return 0. EXPECTED_CHARS should be all lowercase, and
970 will be matched case-insensitively against the characters read from
971 PORT. */
972 static int
973 try_read_ci_chars (SCM port, const char *expected_chars)
974 {
975 int num_chars_wanted = strlen (expected_chars);
976 int num_chars_read = 0;
977 char *chars_read = alloca (num_chars_wanted);
978 int c;
979
980 while (num_chars_read < num_chars_wanted)
981 {
982 c = scm_getc_unlocked (port);
983 if (c == EOF)
984 break;
985 else if (c_tolower (c) != expected_chars[num_chars_read])
986 {
987 scm_ungetc_unlocked (c, port);
988 break;
989 }
990 else
991 chars_read[num_chars_read++] = c;
992 }
993
994 if (num_chars_read == num_chars_wanted)
995 return 1;
996 else
997 {
998 while (num_chars_read > 0)
999 scm_ungetc_unlocked (chars_read[--num_chars_read], port);
1000 return 0;
1001 }
1002 }
1003
1004 \f
1005 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1006
1007 static SCM
1008 scm_read_boolean (int chr, SCM port)
1009 {
1010 switch (chr)
1011 {
1012 case 't':
1013 case 'T':
1014 try_read_ci_chars (port, "rue");
1015 return SCM_BOOL_T;
1016
1017 case 'f':
1018 case 'F':
1019 try_read_ci_chars (port, "alse");
1020 return SCM_BOOL_F;
1021 }
1022
1023 return SCM_UNSPECIFIED;
1024 }
1025
1026 static SCM
1027 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1028 #define FUNC_NAME "scm_lreadr"
1029 {
1030 char buffer[READER_CHAR_NAME_MAX_SIZE];
1031 SCM charname;
1032 size_t charname_len, bytes_read;
1033 scm_t_wchar cp;
1034 int overflow;
1035 scm_t_port_internal *pti;
1036
1037 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1038 &bytes_read);
1039 if (overflow)
1040 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1041
1042 if (bytes_read == 0)
1043 {
1044 chr = scm_getc_unlocked (port);
1045 if (chr == EOF)
1046 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1047 "while reading character", SCM_EOL);
1048
1049 /* CHR must be a token delimiter, like a whitespace. */
1050 return (SCM_MAKE_CHAR (chr));
1051 }
1052
1053 pti = SCM_PORT_GET_INTERNAL (port);
1054
1055 /* Simple ASCII characters can be processed immediately. Also, simple
1056 ISO-8859-1 characters can be processed immediately if the encoding for this
1057 port is ISO-8859-1. */
1058 if (bytes_read == 1 &&
1059 ((unsigned char) buffer[0] <= 127
1060 || pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1))
1061 {
1062 SCM_COL (port) += 1;
1063 return SCM_MAKE_CHAR (buffer[0]);
1064 }
1065
1066 /* Otherwise, convert the buffer into a proper scheme string for
1067 processing. */
1068 charname = scm_from_port_stringn (buffer, bytes_read, port);
1069 charname_len = scm_i_string_length (charname);
1070 SCM_COL (port) += charname_len;
1071 cp = scm_i_string_ref (charname, 0);
1072 if (charname_len == 1)
1073 return SCM_MAKE_CHAR (cp);
1074
1075 /* Ignore dotted circles, which may be used to keep combining characters from
1076 combining with the backslash in #\charname. */
1077 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1078 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1079
1080 if (cp >= '0' && cp < '8')
1081 {
1082 /* Dirk:FIXME:: This type of character syntax is not R5RS
1083 * compliant. Further, it should be verified that the constant
1084 * does only consist of octal digits. */
1085 SCM p = scm_string_to_number (charname, scm_from_uint (8));
1086 if (SCM_I_INUMP (p))
1087 {
1088 scm_t_wchar c = scm_to_uint32 (p);
1089 if (SCM_IS_UNICODE_CHAR (c))
1090 return SCM_MAKE_CHAR (c);
1091 else
1092 scm_i_input_error (FUNC_NAME, port,
1093 "out-of-range octal character escape: ~a",
1094 scm_list_1 (charname));
1095 }
1096 }
1097
1098 if (cp == 'x' && (charname_len > 1))
1099 {
1100 SCM p;
1101
1102 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1103 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1104 scm_from_uint (16));
1105 if (SCM_I_INUMP (p))
1106 {
1107 scm_t_wchar c = scm_to_uint32 (p);
1108 if (SCM_IS_UNICODE_CHAR (c))
1109 return SCM_MAKE_CHAR (c);
1110 else
1111 scm_i_input_error (FUNC_NAME, port,
1112 "out-of-range hex character escape: ~a",
1113 scm_list_1 (charname));
1114 }
1115 }
1116
1117 /* The names of characters should never have non-Latin1
1118 characters. */
1119 if (scm_i_is_narrow_string (charname)
1120 || scm_i_try_narrow_string (charname))
1121 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1122 charname_len);
1123 if (scm_is_true (ch))
1124 return ch;
1125 }
1126
1127 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1128 scm_list_1 (charname));
1129
1130 return SCM_UNSPECIFIED;
1131 }
1132 #undef FUNC_NAME
1133
1134 static SCM
1135 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1136 {
1137 SCM symbol;
1138
1139 /* Read the symbol that comprises the keyword. Doing this instead of
1140 invoking a specific symbol reader function allows `scm_read_keyword ()'
1141 to adapt to the delimiters currently valid of symbols.
1142
1143 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1144 symbol = scm_read_expression (port, opts);
1145 if (!scm_is_symbol (symbol))
1146 scm_i_input_error ("scm_read_keyword", port,
1147 "keyword prefix `~a' not followed by a symbol: ~s",
1148 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1149
1150 return (scm_symbol_to_keyword (symbol));
1151 }
1152
1153 static SCM
1154 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1155 long line, int column)
1156 {
1157 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1158 guarantee that it's going to do what we want. After all, this is an
1159 implementation detail of `scm_read_vector ()', not a desirable
1160 property. */
1161 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1162 port, opts, line, column);
1163 }
1164
1165 /* Helper used by scm_read_array */
1166 static int
1167 read_decimal_integer (SCM port, int c, ssize_t *resp)
1168 {
1169 ssize_t sign = 1;
1170 ssize_t res = 0;
1171 int got_it = 0;
1172
1173 if (c == '-')
1174 {
1175 sign = -1;
1176 c = scm_getc_unlocked (port);
1177 }
1178
1179 while ('0' <= c && c <= '9')
1180 {
1181 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1182 scm_i_input_error ("read_decimal_integer", port,
1183 "number too large", SCM_EOL);
1184 res = 10*res + c-'0';
1185 got_it = 1;
1186 c = scm_getc_unlocked (port);
1187 }
1188
1189 if (got_it)
1190 *resp = sign * res;
1191 return c;
1192 }
1193
1194 /* Read an array. This function can also read vectors and uniform
1195 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1196 handled here.
1197
1198 C is the first character read after the '#'. */
1199 static SCM
1200 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1201 {
1202 ssize_t rank;
1203 scm_t_wchar tag_buf[8];
1204 int tag_len;
1205
1206 SCM tag, shape = SCM_BOOL_F, elements, array;
1207
1208 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1209 the array code can not deal with zero-length dimensions yet, and
1210 we want to allow zero-length vectors, of course. */
1211 if (c == '(')
1212 return scm_read_vector (c, port, opts, line, column);
1213
1214 /* Disambiguate between '#f' and uniform floating point vectors. */
1215 if (c == 'f')
1216 {
1217 c = scm_getc_unlocked (port);
1218 if (c != '3' && c != '6')
1219 {
1220 if (c == 'a' && try_read_ci_chars (port, "lse"))
1221 return SCM_BOOL_F;
1222 else if (c != EOF)
1223 scm_ungetc_unlocked (c, port);
1224 return SCM_BOOL_F;
1225 }
1226 rank = 1;
1227 tag_buf[0] = 'f';
1228 tag_len = 1;
1229 goto continue_reading_tag;
1230 }
1231
1232 /* Read rank. */
1233 rank = 1;
1234 c = read_decimal_integer (port, c, &rank);
1235 if (rank < 0)
1236 scm_i_input_error (NULL, port, "array rank must be non-negative",
1237 SCM_EOL);
1238
1239 /* Read tag. */
1240 tag_len = 0;
1241 continue_reading_tag:
1242 while (c != EOF && c != '(' && c != '@' && c != ':'
1243 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1244 {
1245 tag_buf[tag_len++] = c;
1246 c = scm_getc_unlocked (port);
1247 }
1248 if (tag_len == 0)
1249 tag = SCM_BOOL_T;
1250 else
1251 {
1252 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1253 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1254 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1255 scm_list_1 (tag));
1256 }
1257
1258 /* Read shape. */
1259 if (c == '@' || c == ':')
1260 {
1261 shape = SCM_EOL;
1262
1263 do
1264 {
1265 ssize_t lbnd = 0, len = 0;
1266 SCM s;
1267
1268 if (c == '@')
1269 {
1270 c = scm_getc_unlocked (port);
1271 c = read_decimal_integer (port, c, &lbnd);
1272 }
1273
1274 s = scm_from_ssize_t (lbnd);
1275
1276 if (c == ':')
1277 {
1278 c = scm_getc_unlocked (port);
1279 c = read_decimal_integer (port, c, &len);
1280 if (len < 0)
1281 scm_i_input_error (NULL, port,
1282 "array length must be non-negative",
1283 SCM_EOL);
1284
1285 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1286 }
1287
1288 shape = scm_cons (s, shape);
1289 } while (c == '@' || c == ':');
1290
1291 shape = scm_reverse_x (shape, SCM_EOL);
1292 }
1293
1294 /* Read nested lists of elements. */
1295 if (c != '(')
1296 scm_i_input_error (NULL, port,
1297 "missing '(' in vector or array literal",
1298 SCM_EOL);
1299 elements = scm_read_sexp (c, port, opts);
1300
1301 if (scm_is_false (shape))
1302 shape = scm_from_ssize_t (rank);
1303 else if (scm_ilength (shape) != rank)
1304 scm_i_input_error
1305 (NULL, port,
1306 "the number of shape specifications must match the array rank",
1307 SCM_EOL);
1308
1309 /* Handle special print syntax of rank zero arrays; see
1310 scm_i_print_array for a rationale. */
1311 if (rank == 0)
1312 {
1313 if (!scm_is_pair (elements))
1314 scm_i_input_error (NULL, port,
1315 "too few elements in array literal, need 1",
1316 SCM_EOL);
1317 if (!scm_is_null (SCM_CDR (elements)))
1318 scm_i_input_error (NULL, port,
1319 "too many elements in array literal, want 1",
1320 SCM_EOL);
1321 elements = SCM_CAR (elements);
1322 }
1323
1324 /* Construct array, annotate with source location, and return. */
1325 array = scm_list_to_typed_array (tag, shape, elements);
1326 return maybe_annotate_source (array, port, opts, line, column);
1327 }
1328
1329 static SCM
1330 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1331 long line, int column)
1332 {
1333 return scm_read_array (chr, port, opts, line, column);
1334 }
1335
1336 static SCM
1337 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1338 long line, int column)
1339 {
1340 chr = scm_getc_unlocked (port);
1341 if (chr != 'u')
1342 goto syntax;
1343
1344 chr = scm_getc_unlocked (port);
1345 if (chr != '8')
1346 goto syntax;
1347
1348 chr = scm_getc_unlocked (port);
1349 if (chr != '(')
1350 goto syntax;
1351
1352 return maybe_annotate_source
1353 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1354 port, opts, line, column);
1355
1356 syntax:
1357 scm_i_input_error ("read_bytevector", port,
1358 "invalid bytevector prefix",
1359 SCM_MAKE_CHAR (chr));
1360 return SCM_UNSPECIFIED;
1361 }
1362
1363 static SCM
1364 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1365 long line, int column)
1366 {
1367 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1368 terribly inefficient but who cares? */
1369 SCM s_bits = SCM_EOL;
1370
1371 for (chr = scm_getc_unlocked (port);
1372 (chr != EOF) && ((chr == '0') || (chr == '1'));
1373 chr = scm_getc_unlocked (port))
1374 {
1375 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1376 }
1377
1378 if (chr != EOF)
1379 scm_ungetc_unlocked (chr, port);
1380
1381 return maybe_annotate_source
1382 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1383 port, opts, line, column);
1384 }
1385
1386 static SCM
1387 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1388 {
1389 int bang_seen = 0;
1390
1391 for (;;)
1392 {
1393 int c = scm_getc_unlocked (port);
1394
1395 if (c == EOF)
1396 scm_i_input_error ("skip_block_comment", port,
1397 "unterminated `#! ... !#' comment", SCM_EOL);
1398
1399 if (c == '!')
1400 bang_seen = 1;
1401 else if (c == '#' && bang_seen)
1402 break;
1403 else
1404 bang_seen = 0;
1405 }
1406
1407 return SCM_UNSPECIFIED;
1408 }
1409
1410 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1411 int value);
1412 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1413 int value);
1414 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1415 int value);
1416
1417 static SCM
1418 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1419 {
1420 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1421 int c;
1422 int i = 0;
1423
1424 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1425 {
1426 c = scm_getc_unlocked (port);
1427 if (c == EOF)
1428 scm_i_input_error ("skip_block_comment", port,
1429 "unterminated `#! ... !#' comment", SCM_EOL);
1430 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1431 name[i++] = c;
1432 else if (CHAR_IS_DELIMITER (c))
1433 {
1434 scm_ungetc_unlocked (c, port);
1435 name[i] = '\0';
1436 if (0 == strcmp ("r6rs", name))
1437 ; /* Silently ignore */
1438 else if (0 == strcmp ("fold-case", name))
1439 set_port_case_insensitive_p (port, opts, 1);
1440 else if (0 == strcmp ("no-fold-case", name))
1441 set_port_case_insensitive_p (port, opts, 0);
1442 else if (0 == strcmp ("curly-infix", name))
1443 set_port_curly_infix_p (port, opts, 1);
1444 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1445 {
1446 set_port_curly_infix_p (port, opts, 1);
1447 set_port_square_brackets_p (port, opts, 0);
1448 }
1449 else
1450 break;
1451
1452 return SCM_UNSPECIFIED;
1453 }
1454 else
1455 {
1456 scm_ungetc_unlocked (c, port);
1457 break;
1458 }
1459 }
1460 while (i > 0)
1461 scm_ungetc_unlocked (name[--i], port);
1462 return scm_read_scsh_block_comment (chr, port);
1463 }
1464
1465 static SCM
1466 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1467 {
1468 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1469 nested. So care must be taken. */
1470 int nesting_level = 1;
1471
1472 int a = scm_getc_unlocked (port);
1473
1474 if (a == EOF)
1475 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1476 "unterminated `#| ... |#' comment", SCM_EOL);
1477
1478 while (nesting_level > 0)
1479 {
1480 int b = scm_getc_unlocked (port);
1481
1482 if (b == EOF)
1483 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1484 "unterminated `#| ... |#' comment", SCM_EOL);
1485
1486 if (a == '|' && b == '#')
1487 {
1488 nesting_level--;
1489 b = EOF;
1490 }
1491 else if (a == '#' && b == '|')
1492 {
1493 nesting_level++;
1494 b = EOF;
1495 }
1496
1497 a = b;
1498 }
1499
1500 return SCM_UNSPECIFIED;
1501 }
1502
1503 static SCM
1504 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1505 scm_t_read_opts *opts)
1506 {
1507 scm_t_wchar c;
1508
1509 c = flush_ws (port, opts, (char *) NULL);
1510 if (EOF == c)
1511 scm_i_input_error ("read_commented_expression", port,
1512 "no expression after #; comment", SCM_EOL);
1513 scm_ungetc_unlocked (c, port);
1514 scm_read_expression (port, opts);
1515 return SCM_UNSPECIFIED;
1516 }
1517
1518 static SCM
1519 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1520 {
1521 /* Guile's extended symbol read syntax looks like this:
1522
1523 #{This is all a symbol name}#
1524
1525 So here, CHR is expected to be `{'. */
1526 int saw_brace = 0;
1527 size_t len = 0;
1528 SCM buf = scm_i_make_string (1024, NULL, 0);
1529
1530 buf = scm_i_string_start_writing (buf);
1531
1532 while ((chr = scm_getc_unlocked (port)) != EOF)
1533 {
1534 if (saw_brace)
1535 {
1536 if (chr == '#')
1537 {
1538 break;
1539 }
1540 else
1541 {
1542 saw_brace = 0;
1543 scm_i_string_set_x (buf, len++, '}');
1544 }
1545 }
1546
1547 if (chr == '}')
1548 saw_brace = 1;
1549 else if (chr == '\\')
1550 {
1551 /* It used to be that print.c would print extended-read-syntax
1552 symbols with backslashes before "non-standard" chars, but
1553 this routine wouldn't do anything with those escapes.
1554 Bummer. What we've done is to change print.c to output
1555 R6RS hex escapes for those characters, relying on the fact
1556 that the extended read syntax would never put a `\' before
1557 an `x'. For now, we just ignore other instances of
1558 backslash in the string. */
1559 switch ((chr = scm_getc_unlocked (port)))
1560 {
1561 case EOF:
1562 goto done;
1563 case 'x':
1564 {
1565 scm_t_wchar c;
1566
1567 SCM_READ_HEX_ESCAPE (10, ';');
1568 scm_i_string_set_x (buf, len++, c);
1569 break;
1570
1571 str_eof:
1572 chr = EOF;
1573 goto done;
1574
1575 bad_escaped:
1576 scm_i_string_stop_writing ();
1577 scm_i_input_error ("scm_read_extended_symbol", port,
1578 "illegal character in escape sequence: ~S",
1579 scm_list_1 (SCM_MAKE_CHAR (c)));
1580 break;
1581 }
1582 default:
1583 scm_i_string_set_x (buf, len++, chr);
1584 break;
1585 }
1586 }
1587 else
1588 scm_i_string_set_x (buf, len++, chr);
1589
1590 if (len >= scm_i_string_length (buf) - 2)
1591 {
1592 SCM addy;
1593
1594 scm_i_string_stop_writing ();
1595 addy = scm_i_make_string (1024, NULL, 0);
1596 buf = scm_string_append (scm_list_2 (buf, addy));
1597 len = 0;
1598 buf = scm_i_string_start_writing (buf);
1599 }
1600 }
1601
1602 done:
1603 scm_i_string_stop_writing ();
1604 if (chr == EOF)
1605 scm_i_input_error ("scm_read_extended_symbol", port,
1606 "end of file while reading symbol", SCM_EOL);
1607
1608 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1609 }
1610
1611
1612 \f
1613 /* Top-level token readers, i.e., dispatchers. */
1614
1615 static SCM
1616 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1617 {
1618 SCM proc;
1619
1620 proc = scm_get_hash_procedure (chr);
1621 if (scm_is_true (scm_procedure_p (proc)))
1622 {
1623 long line = SCM_LINUM (port);
1624 int column = SCM_COL (port) - 2;
1625 SCM got;
1626
1627 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1628
1629 if (opts->record_positions_p && SCM_NIMP (got)
1630 && !scm_i_has_source_properties (got))
1631 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1632
1633 return got;
1634 }
1635
1636 return SCM_UNSPECIFIED;
1637 }
1638
1639 /* The reader for the sharp `#' character. It basically dispatches reads
1640 among the above token readers. */
1641 static SCM
1642 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1643 long line, int column)
1644 #define FUNC_NAME "scm_lreadr"
1645 {
1646 SCM result;
1647
1648 chr = scm_getc_unlocked (port);
1649
1650 result = scm_read_sharp_extension (chr, port, opts);
1651 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1652 return result;
1653
1654 switch (chr)
1655 {
1656 case '\\':
1657 return (scm_read_character (chr, port, opts));
1658 case '(':
1659 return (scm_read_vector (chr, port, opts, line, column));
1660 case 's':
1661 case 'u':
1662 case 'f':
1663 case 'c':
1664 /* This one may return either a boolean or an SRFI-4 vector. */
1665 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1666 case 'v':
1667 return (scm_read_bytevector (chr, port, opts, line, column));
1668 case '*':
1669 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1670 case 't':
1671 case 'T':
1672 case 'F':
1673 return (scm_read_boolean (chr, port));
1674 case ':':
1675 return (scm_read_keyword (chr, port, opts));
1676 case '0': case '1': case '2': case '3': case '4':
1677 case '5': case '6': case '7': case '8': case '9':
1678 case '@':
1679 return (scm_read_array (chr, port, opts, line, column));
1680
1681 case 'i':
1682 case 'e':
1683 case 'b':
1684 case 'B':
1685 case 'o':
1686 case 'O':
1687 case 'd':
1688 case 'D':
1689 case 'x':
1690 case 'X':
1691 case 'I':
1692 case 'E':
1693 return (scm_read_number_and_radix (chr, port, opts));
1694 case '{':
1695 return (scm_read_extended_symbol (chr, port));
1696 case '!':
1697 return (scm_read_shebang (chr, port, opts));
1698 case ';':
1699 return (scm_read_commented_expression (chr, port, opts));
1700 case '`':
1701 case '\'':
1702 case ',':
1703 return (scm_read_syntax (chr, port, opts));
1704 case 'n':
1705 return (scm_read_nil (chr, port, opts));
1706 default:
1707 result = scm_read_sharp_extension (chr, port, opts);
1708 if (scm_is_eq (result, SCM_UNSPECIFIED))
1709 {
1710 /* To remain compatible with 1.8 and earlier, the following
1711 characters have lower precedence than `read-hash-extend'
1712 characters. */
1713 switch (chr)
1714 {
1715 case '|':
1716 return scm_read_r6rs_block_comment (chr, port);
1717 default:
1718 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1719 scm_list_1 (SCM_MAKE_CHAR (chr)));
1720 }
1721 }
1722 else
1723 return result;
1724 }
1725
1726 return SCM_UNSPECIFIED;
1727 }
1728 #undef FUNC_NAME
1729
1730 static SCM
1731 read_inner_expression (SCM port, scm_t_read_opts *opts)
1732 #define FUNC_NAME "read_inner_expression"
1733 {
1734 while (1)
1735 {
1736 scm_t_wchar chr;
1737
1738 chr = scm_getc_unlocked (port);
1739
1740 switch (chr)
1741 {
1742 case SCM_WHITE_SPACES:
1743 case SCM_LINE_INCREMENTORS:
1744 break;
1745 case ';':
1746 (void) scm_read_semicolon_comment (chr, port);
1747 break;
1748 case '{':
1749 if (opts->curly_infix_p)
1750 {
1751 if (opts->neoteric_p)
1752 return scm_read_sexp (chr, port, opts);
1753 else
1754 {
1755 SCM expr;
1756
1757 /* Enable neoteric expressions within curly braces */
1758 opts->neoteric_p = 1;
1759 expr = scm_read_sexp (chr, port, opts);
1760 opts->neoteric_p = 0;
1761 return expr;
1762 }
1763 }
1764 else
1765 return scm_read_mixed_case_symbol (chr, port, opts);
1766 case '[':
1767 if (opts->square_brackets_p)
1768 return scm_read_sexp (chr, port, opts);
1769 else if (opts->curly_infix_p)
1770 {
1771 /* The syntax of neoteric expressions requires that '[' be
1772 a delimiter when curly-infix is enabled, so it cannot
1773 be part of an unescaped symbol. We might as well do
1774 something useful with it, so we adopt Kawa's convention:
1775 [...] => ($bracket-list$ ...) */
1776 long line = SCM_LINUM (port);
1777 int column = SCM_COL (port) - 1;
1778 return maybe_annotate_source
1779 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1780 port, opts, line, column);
1781 }
1782 else
1783 return scm_read_mixed_case_symbol (chr, port, opts);
1784 case '(':
1785 return (scm_read_sexp (chr, port, opts));
1786 case '"':
1787 return (scm_read_string (chr, port, opts));
1788 case '|':
1789 if (opts->r7rs_symbols_p)
1790 return scm_read_r7rs_symbol (chr, port, opts);
1791 else
1792 return scm_read_mixed_case_symbol (chr, port, opts);
1793 case '\'':
1794 case '`':
1795 case ',':
1796 return (scm_read_quote (chr, port, opts));
1797 case '#':
1798 {
1799 long line = SCM_LINUM (port);
1800 int column = SCM_COL (port) - 1;
1801 SCM result = scm_read_sharp (chr, port, opts, line, column);
1802 if (scm_is_eq (result, SCM_UNSPECIFIED))
1803 /* We read a comment or some such. */
1804 break;
1805 else
1806 return result;
1807 }
1808 case ')':
1809 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1810 break;
1811 case '}':
1812 if (opts->curly_infix_p)
1813 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1814 else
1815 return scm_read_mixed_case_symbol (chr, port, opts);
1816 case ']':
1817 if (opts->square_brackets_p)
1818 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1819 /* otherwise fall through */
1820 case EOF:
1821 return SCM_EOF_VAL;
1822 case ':':
1823 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1824 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1825 /* Fall through. */
1826
1827 default:
1828 {
1829 if (((chr >= '0') && (chr <= '9'))
1830 || (strchr ("+-.", chr)))
1831 return (scm_read_number (chr, port, opts));
1832 else
1833 return (scm_read_mixed_case_symbol (chr, port, opts));
1834 }
1835 }
1836 }
1837 }
1838 #undef FUNC_NAME
1839
1840 static SCM
1841 scm_read_expression (SCM port, scm_t_read_opts *opts)
1842 #define FUNC_NAME "scm_read_expression"
1843 {
1844 if (!opts->neoteric_p)
1845 return read_inner_expression (port, opts);
1846 else
1847 {
1848 long line = 0;
1849 int column = 0;
1850 SCM expr;
1851
1852 if (opts->record_positions_p)
1853 {
1854 /* We need to get the position of the first non-whitespace
1855 character in order to correctly annotate neoteric
1856 expressions. For example, for the expression 'f(x)', the
1857 first call to 'read_inner_expression' reads the 'f' (which
1858 cannot be annotated), and then we later read the '(x)' and
1859 use it to construct the new list (f x). */
1860 int c = flush_ws (port, opts, (char *) NULL);
1861 if (c == EOF)
1862 return SCM_EOF_VAL;
1863 scm_ungetc_unlocked (c, port);
1864 line = SCM_LINUM (port);
1865 column = SCM_COL (port);
1866 }
1867
1868 expr = read_inner_expression (port, opts);
1869
1870 /* 'expr' is the first component of the neoteric expression. Now
1871 we loop, and as long as the next character is '(', '[', or '{',
1872 (without any intervening whitespace), we use it to construct a
1873 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1874 for (;;)
1875 {
1876 int chr = scm_getc_unlocked (port);
1877
1878 if (chr == '(')
1879 /* e(...) => (e ...) */
1880 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1881 else if (chr == '[')
1882 /* e[...] => ($bracket-apply$ e ...) */
1883 expr = scm_cons (sym_bracket_apply,
1884 scm_cons (expr,
1885 scm_read_sexp (chr, port, opts)));
1886 else if (chr == '{')
1887 {
1888 SCM arg = scm_read_sexp (chr, port, opts);
1889
1890 if (scm_is_null (arg))
1891 expr = scm_list_1 (expr); /* e{} => (e) */
1892 else
1893 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1894 }
1895 else
1896 {
1897 if (chr != EOF)
1898 scm_ungetc_unlocked (chr, port);
1899 break;
1900 }
1901 maybe_annotate_source (expr, port, opts, line, column);
1902 }
1903 return expr;
1904 }
1905 }
1906 #undef FUNC_NAME
1907
1908 \f
1909 /* Actual reader. */
1910
1911 static void init_read_options (SCM port, scm_t_read_opts *opts);
1912
1913 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1914 (SCM port),
1915 "Read an s-expression from the input port @var{port}, or from\n"
1916 "the current input port if @var{port} is not specified.\n"
1917 "Any whitespace before the next token is discarded.")
1918 #define FUNC_NAME s_scm_read
1919 {
1920 scm_t_read_opts opts;
1921 int c;
1922
1923 if (SCM_UNBNDP (port))
1924 port = scm_current_input_port ();
1925 SCM_VALIDATE_OPINPORT (1, port);
1926
1927 init_read_options (port, &opts);
1928
1929 c = flush_ws (port, &opts, (char *) NULL);
1930 if (EOF == c)
1931 return SCM_EOF_VAL;
1932 scm_ungetc_unlocked (c, port);
1933
1934 return (scm_read_expression (port, &opts));
1935 }
1936 #undef FUNC_NAME
1937
1938
1939 \f
1940
1941 /* Manipulate the read-hash-procedures alist. This could be written in
1942 Scheme, but maybe it will also be used by C code during initialisation. */
1943 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1944 (SCM chr, SCM proc),
1945 "Install the procedure @var{proc} for reading expressions\n"
1946 "starting with the character sequence @code{#} and @var{chr}.\n"
1947 "@var{proc} will be called with two arguments: the character\n"
1948 "@var{chr} and the port to read further data from. The object\n"
1949 "returned will be the return value of @code{read}. \n"
1950 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1951 )
1952 #define FUNC_NAME s_scm_read_hash_extend
1953 {
1954 SCM this;
1955 SCM prev;
1956
1957 SCM_VALIDATE_CHAR (1, chr);
1958 SCM_ASSERT (scm_is_false (proc)
1959 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1960 proc, SCM_ARG2, FUNC_NAME);
1961
1962 /* Check if chr is already in the alist. */
1963 this = scm_i_read_hash_procedures_ref ();
1964 prev = SCM_BOOL_F;
1965 while (1)
1966 {
1967 if (scm_is_null (this))
1968 {
1969 /* not found, so add it to the beginning. */
1970 if (scm_is_true (proc))
1971 {
1972 SCM new = scm_cons (scm_cons (chr, proc),
1973 scm_i_read_hash_procedures_ref ());
1974 scm_i_read_hash_procedures_set_x (new);
1975 }
1976 break;
1977 }
1978 if (scm_is_eq (chr, SCM_CAAR (this)))
1979 {
1980 /* already in the alist. */
1981 if (scm_is_false (proc))
1982 {
1983 /* remove it. */
1984 if (scm_is_false (prev))
1985 {
1986 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
1987 scm_i_read_hash_procedures_set_x (rest);
1988 }
1989 else
1990 scm_set_cdr_x (prev, SCM_CDR (this));
1991 }
1992 else
1993 {
1994 /* replace it. */
1995 scm_set_cdr_x (SCM_CAR (this), proc);
1996 }
1997 break;
1998 }
1999 prev = this;
2000 this = SCM_CDR (this);
2001 }
2002
2003 return SCM_UNSPECIFIED;
2004 }
2005 #undef FUNC_NAME
2006
2007 /* Recover the read-hash procedure corresponding to char c. */
2008 static SCM
2009 scm_get_hash_procedure (int c)
2010 {
2011 SCM rest = scm_i_read_hash_procedures_ref ();
2012
2013 while (1)
2014 {
2015 if (scm_is_null (rest))
2016 return SCM_BOOL_F;
2017
2018 if (SCM_CHAR (SCM_CAAR (rest)) == c)
2019 return SCM_CDAR (rest);
2020
2021 rest = SCM_CDR (rest);
2022 }
2023 }
2024
2025 static int
2026 is_encoding_char (char c)
2027 {
2028 if (c >= 'a' && c <= 'z') return 1;
2029 if (c >= 'A' && c <= 'Z') return 1;
2030 if (c >= '0' && c <= '9') return 1;
2031 return strchr ("_-.:/,+=()", c) != NULL;
2032 }
2033
2034 /* Maximum size of an encoding name. This is a bit more than the
2035 longest name listed at
2036 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2037 characters.) */
2038 #define ENCODING_NAME_MAX_SIZE 20
2039
2040 /* Number of bytes at the beginning or end of a file that are scanned
2041 for a "coding:" declaration. */
2042 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2043
2044
2045 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2046 coding declaration. Returns either NULL or a string whose storage
2047 has been allocated with `scm_gc_malloc'. */
2048 char *
2049 scm_i_scan_for_encoding (SCM port)
2050 {
2051 scm_t_port *pt;
2052 char header[SCM_ENCODING_SEARCH_SIZE+1];
2053 size_t bytes_read, encoding_length, i;
2054 char *encoding = NULL;
2055 char *pos, *encoding_start;
2056 int in_comment;
2057
2058 pt = SCM_PTAB_ENTRY (port);
2059
2060 if (pt->rw_active == SCM_PORT_WRITE)
2061 scm_flush_unlocked (port);
2062
2063 if (pt->rw_random)
2064 pt->rw_active = SCM_PORT_READ;
2065
2066 if (pt->read_pos == pt->read_end)
2067 {
2068 /* We can use the read buffer, and thus avoid a seek. */
2069 if (scm_fill_input_unlocked (port) == EOF)
2070 return NULL;
2071
2072 bytes_read = pt->read_end - pt->read_pos;
2073 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2074 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2075
2076 if (bytes_read <= 1)
2077 /* An unbuffered port -- don't scan. */
2078 return NULL;
2079
2080 memcpy (header, pt->read_pos, bytes_read);
2081 header[bytes_read] = '\0';
2082 }
2083 else
2084 {
2085 /* Try to read some bytes and then seek back. Not all ports
2086 support seeking back; and indeed some file ports (like
2087 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2088 check performed by SCM_FPORT_FDES---but fail to seek
2089 backwards. Hence this block comes second. We prefer to use
2090 the read buffer in-place. */
2091 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2092 return NULL;
2093
2094 bytes_read = scm_c_read_unlocked (port, header, SCM_ENCODING_SEARCH_SIZE);
2095 header[bytes_read] = '\0';
2096 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2097 }
2098
2099 /* search past "coding[:=]" */
2100 pos = header;
2101 while (1)
2102 {
2103 if ((pos = strstr(pos, "coding")) == NULL)
2104 return NULL;
2105
2106 pos += strlen ("coding");
2107 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2108 (*pos == ':' || *pos == '='))
2109 {
2110 pos ++;
2111 break;
2112 }
2113 }
2114
2115 /* skip spaces */
2116 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2117 (*pos == ' ' || *pos == '\t'))
2118 pos ++;
2119
2120 if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2121 /* We found the "coding:" string, but there is probably not enough
2122 room to store an encoding name in its entirety, so ignore it.
2123 This makes sure we do not end up returning a truncated encoding
2124 name. */
2125 return NULL;
2126
2127 /* grab the next token */
2128 encoding_start = pos;
2129 i = 0;
2130 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2131 && encoding_start + i - header < bytes_read
2132 && is_encoding_char (encoding_start[i]))
2133 i++;
2134
2135 encoding_length = i;
2136 if (encoding_length == 0)
2137 return NULL;
2138
2139 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2140
2141 /* push backwards to make sure we were in a comment */
2142 in_comment = 0;
2143 pos = encoding_start;
2144 while (pos >= header)
2145 {
2146 if (*pos == ';')
2147 {
2148 in_comment = 1;
2149 break;
2150 }
2151 else if (*pos == '\n' || pos == header)
2152 {
2153 /* This wasn't in a semicolon comment. Check for a
2154 hash-bang comment. */
2155 char *beg = strstr (header, "#!");
2156 char *end = strstr (header, "!#");
2157 if (beg < encoding_start && encoding_start + encoding_length <= end)
2158 in_comment = 1;
2159 break;
2160 }
2161 else
2162 {
2163 pos --;
2164 continue;
2165 }
2166 }
2167 if (!in_comment)
2168 /* This wasn't in a comment */
2169 return NULL;
2170
2171 return encoding;
2172 }
2173
2174 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2175 (SCM port),
2176 "Scans the port for an Emacs-like character coding declaration\n"
2177 "near the top of the contents of a port with random-accessible contents.\n"
2178 "The coding declaration is of the form\n"
2179 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2180 "\n"
2181 "Returns a string containing the character encoding of the file\n"
2182 "if a declaration was found, or @code{#f} otherwise.\n")
2183 #define FUNC_NAME s_scm_file_encoding
2184 {
2185 char *enc;
2186 SCM s_enc;
2187
2188 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2189
2190 enc = scm_i_scan_for_encoding (port);
2191 if (enc == NULL)
2192 return SCM_BOOL_F;
2193 else
2194 {
2195 s_enc = scm_string_upcase (scm_from_locale_string (enc));
2196 return s_enc;
2197 }
2198
2199 return SCM_BOOL_F;
2200 }
2201 #undef FUNC_NAME
2202
2203 \f
2204 /* Per-port read options.
2205
2206 We store per-port read options in the 'port-read-options' port
2207 property, which is stored in the internal port structure. The value
2208 stored is a single integer that contains a two-bit field for each
2209 read option.
2210
2211 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2212 the applicable value should be inherited from the corresponding
2213 global read option. Otherwise, the bit field contains the value of
2214 the read option. For boolean read options that have been set
2215 per-port, the possible values are 0 or 1. If the 'keyword_style'
2216 read option has been set per-port, its possible values are those in
2217 'enum t_keyword_style'. */
2218
2219 /* Key to read options in port properties. */
2220 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2221
2222 /* Offsets of bit fields for each per-port override */
2223 #define READ_OPTION_COPY_SOURCE_P 0
2224 #define READ_OPTION_RECORD_POSITIONS_P 2
2225 #define READ_OPTION_CASE_INSENSITIVE_P 4
2226 #define READ_OPTION_KEYWORD_STYLE 6
2227 #define READ_OPTION_R6RS_ESCAPES_P 8
2228 #define READ_OPTION_SQUARE_BRACKETS_P 10
2229 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2230 #define READ_OPTION_CURLY_INFIX_P 14
2231 #define READ_OPTION_R7RS_SYMBOLS_P 16
2232
2233 /* The total width in bits of the per-port overrides */
2234 #define READ_OPTIONS_NUM_BITS 18
2235
2236 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2237 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2238
2239 #define READ_OPTION_MASK 3
2240 #define READ_OPTION_INHERIT 3
2241
2242 static void
2243 set_port_read_option (SCM port, int option, int new_value)
2244 {
2245 SCM scm_read_options;
2246 unsigned int read_options;
2247
2248 new_value &= READ_OPTION_MASK;
2249
2250 scm_dynwind_begin (0);
2251 scm_dynwind_lock_port (port);
2252
2253 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2254 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2255 read_options = scm_to_uint (scm_read_options);
2256 else
2257 read_options = READ_OPTIONS_INHERIT_ALL;
2258 read_options &= ~(READ_OPTION_MASK << option);
2259 read_options |= new_value << option;
2260 scm_read_options = scm_from_uint (read_options);
2261 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2262
2263 scm_dynwind_end ();
2264 }
2265
2266 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2267 static void
2268 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2269 {
2270 value = !!value;
2271 opts->case_insensitive_p = value;
2272 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2273 }
2274
2275 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2276 static void
2277 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2278 {
2279 value = !!value;
2280 opts->square_brackets_p = value;
2281 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2282 }
2283
2284 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2285 static void
2286 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2287 {
2288 value = !!value;
2289 opts->curly_infix_p = value;
2290 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2291 }
2292
2293 /* Initialize OPTS based on PORT's read options and the global read
2294 options. */
2295 static void
2296 init_read_options (SCM port, scm_t_read_opts *opts)
2297 {
2298 SCM val, scm_read_options;
2299 unsigned int read_options, x;
2300
2301 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2302
2303 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2304 read_options = scm_to_uint (scm_read_options);
2305 else
2306 read_options = READ_OPTIONS_INHERIT_ALL;
2307
2308 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2309 if (x == READ_OPTION_INHERIT)
2310 {
2311 val = SCM_PACK (SCM_KEYWORD_STYLE);
2312 if (scm_is_eq (val, scm_keyword_prefix))
2313 x = KEYWORD_STYLE_PREFIX;
2314 else if (scm_is_eq (val, scm_keyword_postfix))
2315 x = KEYWORD_STYLE_POSTFIX;
2316 else
2317 x = KEYWORD_STYLE_HASH_PREFIX;
2318 }
2319 opts->keyword_style = x;
2320
2321 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2322 do \
2323 { \
2324 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2325 if (x == READ_OPTION_INHERIT) \
2326 x = !!SCM_ ## NAME; \
2327 opts->name = x; \
2328 } \
2329 while (0)
2330
2331 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2332 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2333 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2334 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2335 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2336 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2337 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
2338 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
2339
2340 #undef RESOLVE_BOOLEAN_OPTION
2341
2342 opts->neoteric_p = 0;
2343 }
2344
2345 void
2346 scm_init_read ()
2347 {
2348 SCM read_hash_procs;
2349
2350 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2351
2352 scm_i_read_hash_procedures =
2353 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2354
2355 scm_init_opts (scm_read_options, scm_read_opts);
2356 #include "libguile/read.x"
2357 }
2358
2359 /*
2360 Local Variables:
2361 c-file-style: "gnu"
2362 End:
2363 */