REPL Server: Don't establish a SIGINT handler.
[bpt/guile.git] / libguile / read.c
1 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
2 * 2007, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unicase.h>
32 #include <unictype.h>
33 #include <c-strcase.h>
34 #include <c-ctype.h>
35
36 #include "libguile/_scm.h"
37 #include "libguile/bytevectors.h"
38 #include "libguile/chars.h"
39 #include "libguile/eval.h"
40 #include "libguile/arrays.h"
41 #include "libguile/bitvectors.h"
42 #include "libguile/keywords.h"
43 #include "libguile/alist.h"
44 #include "libguile/srcprop.h"
45 #include "libguile/hashtab.h"
46 #include "libguile/hash.h"
47 #include "libguile/ports.h"
48 #include "libguile/fports.h"
49 #include "libguile/root.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
56
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
59
60
61 \f
62
63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
66 SCM_SYMBOL (sym_nil, "nil");
67
68 /* SRFI-105 curly infix expression support */
69 SCM_SYMBOL (sym_nfx, "$nfx$");
70 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
71 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
72
73 scm_t_option scm_read_opts[] =
74 {
75 { SCM_OPTION_BOOLEAN, "copy", 0,
76 "Copy source code expressions." },
77 { SCM_OPTION_BOOLEAN, "positions", 1,
78 "Record positions of source code expressions." },
79 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
80 "Convert symbols to lower case."},
81 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
82 "Style of keyword recognition: #f, 'prefix or 'postfix."},
83 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
84 "Use R6RS variable-length character and string hex escapes."},
85 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
86 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
87 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
88 "In strings, consume leading whitespace after an escaped end-of-line."},
89 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
90 "Support SRFI-105 curly infix expressions."},
91 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
92 "Support R7RS |...| symbol notation."},
93 { 0, },
94 };
95
96 /* Internal read options structure. This is initialized by 'scm_read'
97 from the global and per-port read options, and a pointer is passed
98 down to all helper functions. */
99
100 enum t_keyword_style
101 {
102 KEYWORD_STYLE_HASH_PREFIX,
103 KEYWORD_STYLE_PREFIX,
104 KEYWORD_STYLE_POSTFIX
105 };
106
107 struct t_read_opts
108 {
109 enum t_keyword_style keyword_style;
110 unsigned int copy_source_p : 1;
111 unsigned int record_positions_p : 1;
112 unsigned int case_insensitive_p : 1;
113 unsigned int r6rs_escapes_p : 1;
114 unsigned int square_brackets_p : 1;
115 unsigned int hungry_eol_escapes_p : 1;
116 unsigned int curly_infix_p : 1;
117 unsigned int neoteric_p : 1;
118 unsigned int r7rs_symbols_p : 1;
119 };
120
121 typedef struct t_read_opts scm_t_read_opts;
122
123
124 /*
125 Give meaningful error messages for errors
126
127 We use the format
128
129 FILE:LINE:COL: MESSAGE
130 This happened in ....
131
132 This is not standard GNU format, but the test-suite likes the real
133 message to be in front.
134
135 */
136
137
138 void
139 scm_i_input_error (char const *function,
140 SCM port, const char *message, SCM arg)
141 {
142 SCM fn = (scm_is_string (SCM_FILENAME(port))
143 ? SCM_FILENAME(port)
144 : scm_from_locale_string ("#<unknown port>"));
145
146 SCM string_port = scm_open_output_string ();
147 SCM string = SCM_EOL;
148 scm_simple_format (string_port,
149 scm_from_locale_string ("~A:~S:~S: ~A"),
150 scm_list_4 (fn,
151 scm_from_long (SCM_LINUM (port) + 1),
152 scm_from_int (SCM_COL (port) + 1),
153 scm_from_locale_string (message)));
154
155 string = scm_get_output_string (string_port);
156 scm_close_output_port (string_port);
157 scm_error_scm (scm_from_latin1_symbol ("read-error"),
158 function? scm_from_locale_string (function) : SCM_BOOL_F,
159 string,
160 arg,
161 SCM_BOOL_F);
162 }
163
164
165 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
166 (SCM setting),
167 "Option interface for the read options. Instead of using\n"
168 "this procedure directly, use the procedures @code{read-enable},\n"
169 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
170 #define FUNC_NAME s_scm_read_options
171 {
172 SCM ans = scm_options (setting,
173 scm_read_opts,
174 FUNC_NAME);
175 if (SCM_COPY_SOURCE_P)
176 SCM_RECORD_POSITIONS_P = 1;
177 return ans;
178 }
179 #undef FUNC_NAME
180
181 /* A fluid referring to an association list mapping extra hash
182 characters to procedures. */
183 static SCM *scm_i_read_hash_procedures;
184
185 static SCM
186 scm_i_read_hash_procedures_ref (void)
187 {
188 return scm_fluid_ref (*scm_i_read_hash_procedures);
189 }
190
191 static void
192 scm_i_read_hash_procedures_set_x (SCM value)
193 {
194 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
195 }
196
197 \f
198 /* Token readers. */
199
200
201 /* Size of the C buffer used to read symbols and numbers. */
202 #define READER_BUFFER_SIZE 128
203
204 /* Number of 32-bit codepoints in the buffer used to read strings. */
205 #define READER_STRING_BUFFER_SIZE 128
206
207 /* The maximum size of Scheme character names. */
208 #define READER_CHAR_NAME_MAX_SIZE 50
209
210 /* The maximum size of reader directive names. */
211 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
212
213
214 /* `isblank' is only in C99. */
215 #define CHAR_IS_BLANK_(_chr) \
216 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
217 || ((_chr) == '\f') || ((_chr) == '\r'))
218
219 #ifdef MSDOS
220 # define CHAR_IS_BLANK(_chr) \
221 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
222 #else
223 # define CHAR_IS_BLANK CHAR_IS_BLANK_
224 #endif
225
226
227 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
228 structure''). */
229 #define CHAR_IS_R5RS_DELIMITER(c) \
230 (CHAR_IS_BLANK (c) \
231 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
232
233 #define CHAR_IS_DELIMITER(c) \
234 (CHAR_IS_R5RS_DELIMITER (c) \
235 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
236 || opts->curly_infix_p)) \
237 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
238
239 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
240 Structure''. */
241 #define CHAR_IS_EXPONENT_MARKER(_chr) \
242 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
243 || ((_chr) == 'd') || ((_chr) == 'l'))
244
245 /* Read an SCSH block comment. */
246 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
247 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
248 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
249 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
250 static SCM scm_get_hash_procedure (int);
251
252 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
253 result in the pre-allocated buffer BUF. Return zero if the whole token has
254 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
255 bytes actually read. */
256 static int
257 read_token (SCM port, scm_t_read_opts *opts,
258 char *buf, size_t buf_size, size_t *read)
259 {
260 *read = 0;
261
262 while (*read < buf_size)
263 {
264 int chr;
265
266 chr = scm_get_byte_or_eof (port);
267
268 if (chr == EOF)
269 return 0;
270 else if (CHAR_IS_DELIMITER (chr))
271 {
272 scm_unget_byte (chr, port);
273 return 0;
274 }
275 else
276 {
277 *buf = (char) chr;
278 buf++, (*read)++;
279 }
280 }
281
282 return 1;
283 }
284
285 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
286 if the token doesn't fit in BUFFER_SIZE bytes. */
287 static char *
288 read_complete_token (SCM port, scm_t_read_opts *opts,
289 char *buffer, size_t buffer_size, size_t *read)
290 {
291 int overflow = 0;
292 size_t bytes_read, overflow_size = 0;
293 char *overflow_buffer = NULL;
294
295 do
296 {
297 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
298 if (bytes_read == 0)
299 break;
300 if (overflow || overflow_size != 0)
301 {
302 if (overflow_size == 0)
303 {
304 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
305 memcpy (overflow_buffer, buffer, bytes_read);
306 overflow_size = bytes_read;
307 }
308 else
309 {
310 char *new_buf =
311 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
312
313 memcpy (new_buf, overflow_buffer, overflow_size);
314 memcpy (new_buf + overflow_size, buffer, bytes_read);
315
316 overflow_buffer = new_buf;
317 overflow_size += bytes_read;
318 }
319 }
320 }
321 while (overflow);
322
323 if (overflow_size)
324 *read = overflow_size;
325 else
326 *read = bytes_read;
327
328 return (overflow_size > 0 ? overflow_buffer : buffer);
329 }
330
331 /* Skip whitespace from PORT and return the first non-whitespace character
332 read. Raise an error on end-of-file. */
333 static int
334 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
335 {
336 scm_t_wchar c;
337 while (1)
338 switch (c = scm_getc (port))
339 {
340 case EOF:
341 goteof:
342 if (eoferr)
343 {
344 scm_i_input_error (eoferr,
345 port,
346 "end of file",
347 SCM_EOL);
348 }
349 return c;
350
351 case ';':
352 lp:
353 switch (c = scm_getc (port))
354 {
355 case EOF:
356 goto goteof;
357 default:
358 goto lp;
359 case SCM_LINE_INCREMENTORS:
360 break;
361 }
362 break;
363
364 case '#':
365 switch (c = scm_getc (port))
366 {
367 case EOF:
368 eoferr = "read_sharp";
369 goto goteof;
370 case '!':
371 scm_read_shebang (c, port, opts);
372 break;
373 case ';':
374 scm_read_commented_expression (c, port, opts);
375 break;
376 case '|':
377 if (scm_is_false (scm_get_hash_procedure (c)))
378 {
379 scm_read_r6rs_block_comment (c, port);
380 break;
381 }
382 /* fall through */
383 default:
384 scm_ungetc (c, port);
385 return '#';
386 }
387 break;
388
389 case SCM_LINE_INCREMENTORS:
390 case SCM_SINGLE_SPACES:
391 case '\t':
392 break;
393
394 default:
395 return c;
396 }
397
398 return 0;
399 }
400
401
402 \f
403 /* Token readers. */
404
405 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
406 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
407 long line, int column);
408
409
410 static SCM
411 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
412 long line, int column)
413 {
414 if (opts->record_positions_p)
415 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
416 return x;
417 }
418
419 static SCM
420 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
421 #define FUNC_NAME "scm_i_lreadparen"
422 {
423 int c;
424 SCM tmp, tl, ans = SCM_EOL;
425 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
426 const int terminating_char = ((chr == '{') ? '}'
427 : ((chr == '[') ? ']'
428 : ')'));
429
430 /* Need to capture line and column numbers here. */
431 long line = SCM_LINUM (port);
432 int column = SCM_COL (port) - 1;
433
434 c = flush_ws (port, opts, FUNC_NAME);
435 if (terminating_char == c)
436 return SCM_EOL;
437
438 scm_ungetc (c, port);
439 tmp = scm_read_expression (port, opts);
440
441 /* Note that it is possible for scm_read_expression to return
442 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
443 check that it's a real dot by checking `c'. */
444 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
445 {
446 ans = scm_read_expression (port, opts);
447 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
448 scm_i_input_error (FUNC_NAME, port, "missing close paren",
449 SCM_EOL);
450 return ans;
451 }
452
453 /* Build the head of the list structure. */
454 ans = tl = scm_cons (tmp, SCM_EOL);
455
456 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
457 {
458 SCM new_tail;
459
460 if (c == ')' || (c == ']' && opts->square_brackets_p)
461 || ((c == '}' || c == ']') && opts->curly_infix_p))
462 scm_i_input_error (FUNC_NAME, port,
463 "in pair: mismatched close paren: ~A",
464 scm_list_1 (SCM_MAKE_CHAR (c)));
465
466 scm_ungetc (c, port);
467 tmp = scm_read_expression (port, opts);
468
469 /* See above note about scm_sym_dot. */
470 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
471 {
472 SCM_SETCDR (tl, scm_read_expression (port, opts));
473
474 c = flush_ws (port, opts, FUNC_NAME);
475 if (terminating_char != c)
476 scm_i_input_error (FUNC_NAME, port,
477 "in pair: missing close paren", SCM_EOL);
478 break;
479 }
480
481 new_tail = scm_cons (tmp, SCM_EOL);
482 SCM_SETCDR (tl, new_tail);
483 tl = new_tail;
484 }
485
486 if (curly_list_p)
487 {
488 /* In addition to finding the length, 'scm_ilength' checks for
489 improper or circular lists, in which case it returns -1. */
490 int len = scm_ilength (ans);
491
492 /* The (len == 0) case is handled above */
493 if (len == 1)
494 /* Return directly to avoid re-annotating the element's source
495 location with the position of the outer brace. Also, it
496 might not be possible to annotate the element. */
497 return scm_car (ans); /* {e} => e */
498 else if (len == 2)
499 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
500 else if (len >= 3 && (len & 1))
501 {
502 /* It's a proper list whose length is odd and at least 3. If
503 the elements at odd indices (the infix operator positions)
504 are all 'equal?', then it's a simple curly-infix list.
505 Otherwise it's a mixed curly-infix list. */
506 SCM op = scm_cadr (ans);
507
508 /* Check to see if the elements at odd indices are 'equal?' */
509 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
510 {
511 if (scm_is_null (tl))
512 {
513 /* Convert simple curly-infix list to prefix:
514 {a <op> b <op> ...} => (<op> a b ...) */
515 tl = ans;
516 while (scm_is_pair (scm_cdr (tl)))
517 {
518 tmp = scm_cddr (tl);
519 SCM_SETCDR (tl, tmp);
520 tl = tmp;
521 }
522 ans = scm_cons (op, ans);
523 break;
524 }
525 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
526 {
527 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
528 ans = scm_cons (sym_nfx, ans);
529 break;
530 }
531 }
532 }
533 else
534 /* Mixed curly-infix (possibly improper) list:
535 {e . tail} => ($nfx$ e . tail) */
536 ans = scm_cons (sym_nfx, ans);
537 }
538
539 return maybe_annotate_source (ans, port, opts, line, column);
540 }
541 #undef FUNC_NAME
542
543
544 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
545 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
546 found. */
547 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
548 do \
549 { \
550 scm_t_wchar a; \
551 size_t i = 0; \
552 c = 0; \
553 while (i < ndigits) \
554 { \
555 a = scm_getc (port); \
556 if (a == EOF) \
557 goto str_eof; \
558 if (terminator \
559 && (a == (scm_t_wchar) terminator) \
560 && (i > 0)) \
561 break; \
562 if ('0' <= a && a <= '9') \
563 a -= '0'; \
564 else if ('A' <= a && a <= 'F') \
565 a = a - 'A' + 10; \
566 else if ('a' <= a && a <= 'f') \
567 a = a - 'a' + 10; \
568 else \
569 { \
570 c = a; \
571 goto bad_escaped; \
572 } \
573 c = c * 16 + a; \
574 i ++; \
575 } \
576 } while (0)
577
578 static void
579 skip_intraline_whitespace (SCM port)
580 {
581 scm_t_wchar c;
582
583 do
584 {
585 c = scm_getc (port);
586 if (c == EOF)
587 return;
588 }
589 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
590
591 scm_ungetc (c, port);
592 }
593
594 /* Read either a double-quoted string or an R7RS-style symbol delimited
595 by vertical lines, depending on the value of 'chr' ('"' or '|').
596 Regardless, the result is always returned as a string. */
597 static SCM
598 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
599 #define FUNC_NAME "scm_lreadr"
600 {
601 /* For strings smaller than C_STR, this function creates only one Scheme
602 object (the string returned). */
603
604 SCM str = SCM_EOL;
605 size_t c_str_len = 0;
606 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
607
608 /* Need to capture line and column numbers here. */
609 long line = SCM_LINUM (port);
610 int column = SCM_COL (port) - 1;
611
612 while (chr != (c = scm_getc (port)))
613 {
614 if (c == EOF)
615 {
616 str_eof:
617 scm_i_input_error (FUNC_NAME, port,
618 (chr == '|'
619 ? "end of file in symbol"
620 : "end of file in string constant"),
621 SCM_EOL);
622 }
623
624 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
625 {
626 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
627 c_str_len = 0;
628 }
629
630 if (c == '\\')
631 {
632 switch (c = scm_getc (port))
633 {
634 case EOF:
635 goto str_eof;
636 case '|':
637 case '\\':
638 break;
639 case '\n':
640 if (opts->hungry_eol_escapes_p)
641 skip_intraline_whitespace (port);
642 continue;
643 case '0':
644 c = '\0';
645 break;
646 case 'f':
647 c = '\f';
648 break;
649 case 'n':
650 c = '\n';
651 break;
652 case 'r':
653 c = '\r';
654 break;
655 case 't':
656 c = '\t';
657 break;
658 case 'a':
659 c = '\007';
660 break;
661 case 'v':
662 c = '\v';
663 break;
664 case 'b':
665 c = '\010';
666 break;
667 case 'x':
668 if (opts->r6rs_escapes_p || chr == '|')
669 SCM_READ_HEX_ESCAPE (10, ';');
670 else
671 SCM_READ_HEX_ESCAPE (2, '\0');
672 break;
673 case 'u':
674 if (!opts->r6rs_escapes_p)
675 {
676 SCM_READ_HEX_ESCAPE (4, '\0');
677 break;
678 }
679 case 'U':
680 if (!opts->r6rs_escapes_p)
681 {
682 SCM_READ_HEX_ESCAPE (6, '\0');
683 break;
684 }
685 default:
686 if (c == chr)
687 break;
688 bad_escaped:
689 scm_i_input_error (FUNC_NAME, port,
690 "illegal character in escape sequence: ~S",
691 scm_list_1 (SCM_MAKE_CHAR (c)));
692 }
693 }
694
695 c_str[c_str_len++] = c;
696 }
697
698 if (scm_is_null (str))
699 /* Fast path: we got a string that fits in C_STR. */
700 str = scm_from_utf32_stringn (c_str, c_str_len);
701 else
702 {
703 if (c_str_len > 0)
704 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
705
706 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
707 }
708
709 return maybe_annotate_source (str, port, opts, line, column);
710 }
711 #undef FUNC_NAME
712
713 static SCM
714 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
715 {
716 return scm_read_string_like_syntax (chr, port, opts);
717 }
718
719 static SCM
720 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
721 {
722 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
723 }
724
725 static SCM
726 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
727 {
728 SCM result, str = SCM_EOL;
729 char local_buffer[READER_BUFFER_SIZE], *buffer;
730 size_t bytes_read;
731 scm_t_port *pt = SCM_PTAB_ENTRY (port);
732
733 /* Need to capture line and column numbers here. */
734 long line = SCM_LINUM (port);
735 int column = SCM_COL (port) - 1;
736
737 scm_ungetc (chr, port);
738 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
739 &bytes_read);
740
741 str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
742
743 result = scm_string_to_number (str, SCM_UNDEFINED);
744 if (scm_is_false (result))
745 {
746 /* Return a symbol instead of a number */
747 if (opts->case_insensitive_p)
748 str = scm_string_downcase_x (str);
749 result = scm_string_to_symbol (str);
750 }
751 else if (SCM_NIMP (result))
752 result = maybe_annotate_source (result, port, opts, line, column);
753
754 SCM_COL (port) += scm_i_string_length (str);
755 return result;
756 }
757
758 static SCM
759 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
760 {
761 SCM result;
762 int ends_with_colon = 0;
763 size_t bytes_read;
764 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
765 char local_buffer[READER_BUFFER_SIZE], *buffer;
766 scm_t_port *pt = SCM_PTAB_ENTRY (port);
767 SCM str;
768
769 scm_ungetc (chr, port);
770 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
771 &bytes_read);
772 if (bytes_read > 0)
773 ends_with_colon = buffer[bytes_read - 1] == ':';
774
775 if (postfix && ends_with_colon && (bytes_read > 1))
776 {
777 str = scm_from_stringn (buffer, bytes_read - 1,
778 pt->encoding, pt->ilseq_handler);
779
780 if (opts->case_insensitive_p)
781 str = scm_string_downcase_x (str);
782 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
783 }
784 else
785 {
786 str = scm_from_stringn (buffer, bytes_read,
787 pt->encoding, pt->ilseq_handler);
788
789 if (opts->case_insensitive_p)
790 str = scm_string_downcase_x (str);
791 result = scm_string_to_symbol (str);
792 }
793
794 SCM_COL (port) += scm_i_string_length (str);
795 return result;
796 }
797
798 static SCM
799 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
800 #define FUNC_NAME "scm_lreadr"
801 {
802 SCM result;
803 size_t read;
804 char local_buffer[READER_BUFFER_SIZE], *buffer;
805 unsigned int radix;
806 SCM str;
807 scm_t_port *pt;
808
809 switch (chr)
810 {
811 case 'B':
812 case 'b':
813 radix = 2;
814 break;
815
816 case 'o':
817 case 'O':
818 radix = 8;
819 break;
820
821 case 'd':
822 case 'D':
823 radix = 10;
824 break;
825
826 case 'x':
827 case 'X':
828 radix = 16;
829 break;
830
831 default:
832 scm_ungetc (chr, port);
833 scm_ungetc ('#', port);
834 radix = 10;
835 }
836
837 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
838 &read);
839
840 pt = SCM_PTAB_ENTRY (port);
841 str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
842
843 result = scm_string_to_number (str, scm_from_uint (radix));
844
845 SCM_COL (port) += scm_i_string_length (str);
846
847 if (scm_is_true (result))
848 return result;
849
850 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
851
852 return SCM_BOOL_F;
853 }
854 #undef FUNC_NAME
855
856 static SCM
857 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
858 {
859 SCM p;
860 long line = SCM_LINUM (port);
861 int column = SCM_COL (port) - 1;
862
863 switch (chr)
864 {
865 case '`':
866 p = scm_sym_quasiquote;
867 break;
868
869 case '\'':
870 p = scm_sym_quote;
871 break;
872
873 case ',':
874 {
875 scm_t_wchar c;
876
877 c = scm_getc (port);
878 if ('@' == c)
879 p = scm_sym_uq_splicing;
880 else
881 {
882 scm_ungetc (c, port);
883 p = scm_sym_unquote;
884 }
885 break;
886 }
887
888 default:
889 fprintf (stderr, "%s: unhandled quote character (%i)\n",
890 "scm_read_quote", chr);
891 abort ();
892 }
893
894 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
895 return maybe_annotate_source (p, port, opts, line, column);
896 }
897
898 SCM_SYMBOL (sym_syntax, "syntax");
899 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
900 SCM_SYMBOL (sym_unsyntax, "unsyntax");
901 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
902
903 static SCM
904 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
905 {
906 SCM p;
907 long line = SCM_LINUM (port);
908 int column = SCM_COL (port) - 1;
909
910 switch (chr)
911 {
912 case '`':
913 p = sym_quasisyntax;
914 break;
915
916 case '\'':
917 p = sym_syntax;
918 break;
919
920 case ',':
921 {
922 int c;
923
924 c = scm_getc (port);
925 if ('@' == c)
926 p = sym_unsyntax_splicing;
927 else
928 {
929 scm_ungetc (c, port);
930 p = sym_unsyntax;
931 }
932 break;
933 }
934
935 default:
936 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
937 "scm_read_syntax", chr);
938 abort ();
939 }
940
941 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
942 return maybe_annotate_source (p, port, opts, line, column);
943 }
944
945 static SCM
946 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
947 {
948 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
949
950 if (!scm_is_eq (id, sym_nil))
951 scm_i_input_error ("scm_read_nil", port,
952 "unexpected input while reading #nil: ~a",
953 scm_list_1 (id));
954
955 return SCM_ELISP_NIL;
956 }
957
958 static SCM
959 scm_read_semicolon_comment (int chr, SCM port)
960 {
961 int c;
962
963 /* We use the get_byte here because there is no need to get the
964 locale correct with comment input. This presumes that newline
965 always represents itself no matter what the encoding is. */
966 for (c = scm_get_byte_or_eof (port);
967 (c != EOF) && (c != '\n');
968 c = scm_get_byte_or_eof (port));
969
970 return SCM_UNSPECIFIED;
971 }
972
973 /* If the EXPECTED_CHARS are the next ones available from PORT, then
974 consume them and return 1. Otherwise leave the port position where
975 it was and return 0. EXPECTED_CHARS should be all lowercase, and
976 will be matched case-insensitively against the characters read from
977 PORT. */
978 static int
979 try_read_ci_chars (SCM port, const char *expected_chars)
980 {
981 int num_chars_wanted = strlen (expected_chars);
982 int num_chars_read = 0;
983 char *chars_read = alloca (num_chars_wanted);
984 int c;
985
986 while (num_chars_read < num_chars_wanted)
987 {
988 c = scm_getc (port);
989 if (c == EOF)
990 break;
991 else if (c_tolower (c) != expected_chars[num_chars_read])
992 {
993 scm_ungetc (c, port);
994 break;
995 }
996 else
997 chars_read[num_chars_read++] = c;
998 }
999
1000 if (num_chars_read == num_chars_wanted)
1001 return 1;
1002 else
1003 {
1004 while (num_chars_read > 0)
1005 scm_ungetc (chars_read[--num_chars_read], port);
1006 return 0;
1007 }
1008 }
1009
1010 \f
1011 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1012
1013 static SCM
1014 scm_read_boolean (int chr, SCM port)
1015 {
1016 switch (chr)
1017 {
1018 case 't':
1019 case 'T':
1020 try_read_ci_chars (port, "rue");
1021 return SCM_BOOL_T;
1022
1023 case 'f':
1024 case 'F':
1025 try_read_ci_chars (port, "alse");
1026 return SCM_BOOL_F;
1027 }
1028
1029 return SCM_UNSPECIFIED;
1030 }
1031
1032 static SCM
1033 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1034 #define FUNC_NAME "scm_lreadr"
1035 {
1036 char buffer[READER_CHAR_NAME_MAX_SIZE];
1037 SCM charname;
1038 size_t charname_len, bytes_read;
1039 scm_t_wchar cp;
1040 int overflow;
1041 scm_t_port *pt;
1042
1043 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1044 &bytes_read);
1045 if (overflow)
1046 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1047
1048 if (bytes_read == 0)
1049 {
1050 chr = scm_getc (port);
1051 if (chr == EOF)
1052 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1053 "while reading character", SCM_EOL);
1054
1055 /* CHR must be a token delimiter, like a whitespace. */
1056 return (SCM_MAKE_CHAR (chr));
1057 }
1058
1059 pt = SCM_PTAB_ENTRY (port);
1060
1061 /* Simple ASCII characters can be processed immediately. Also, simple
1062 ISO-8859-1 characters can be processed immediately if the encoding for this
1063 port is ISO-8859-1. */
1064 if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
1065 {
1066 SCM_COL (port) += 1;
1067 return SCM_MAKE_CHAR (buffer[0]);
1068 }
1069
1070 /* Otherwise, convert the buffer into a proper scheme string for
1071 processing. */
1072 charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
1073 pt->ilseq_handler);
1074 charname_len = scm_i_string_length (charname);
1075 SCM_COL (port) += charname_len;
1076 cp = scm_i_string_ref (charname, 0);
1077 if (charname_len == 1)
1078 return SCM_MAKE_CHAR (cp);
1079
1080 /* Ignore dotted circles, which may be used to keep combining characters from
1081 combining with the backslash in #\charname. */
1082 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1083 return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
1084
1085 if (cp >= '0' && cp < '8')
1086 {
1087 /* Dirk:FIXME:: This type of character syntax is not R5RS
1088 * compliant. Further, it should be verified that the constant
1089 * does only consist of octal digits. */
1090 SCM p = scm_string_to_number (charname, scm_from_uint (8));
1091 if (SCM_I_INUMP (p))
1092 {
1093 scm_t_wchar c = scm_to_uint32 (p);
1094 if (SCM_IS_UNICODE_CHAR (c))
1095 return SCM_MAKE_CHAR (c);
1096 else
1097 scm_i_input_error (FUNC_NAME, port,
1098 "out-of-range octal character escape: ~a",
1099 scm_list_1 (charname));
1100 }
1101 }
1102
1103 if (cp == 'x' && (charname_len > 1))
1104 {
1105 SCM p;
1106
1107 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1108 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1109 scm_from_uint (16));
1110 if (SCM_I_INUMP (p))
1111 {
1112 scm_t_wchar c = scm_to_uint32 (p);
1113 if (SCM_IS_UNICODE_CHAR (c))
1114 return SCM_MAKE_CHAR (c);
1115 else
1116 scm_i_input_error (FUNC_NAME, port,
1117 "out-of-range hex character escape: ~a",
1118 scm_list_1 (charname));
1119 }
1120 }
1121
1122 /* The names of characters should never have non-Latin1
1123 characters. */
1124 if (scm_i_is_narrow_string (charname)
1125 || scm_i_try_narrow_string (charname))
1126 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1127 charname_len);
1128 if (scm_is_true (ch))
1129 return ch;
1130 }
1131
1132 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1133 scm_list_1 (charname));
1134
1135 return SCM_UNSPECIFIED;
1136 }
1137 #undef FUNC_NAME
1138
1139 static SCM
1140 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1141 {
1142 SCM symbol;
1143
1144 /* Read the symbol that comprises the keyword. Doing this instead of
1145 invoking a specific symbol reader function allows `scm_read_keyword ()'
1146 to adapt to the delimiters currently valid of symbols.
1147
1148 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1149 symbol = scm_read_expression (port, opts);
1150 if (!scm_is_symbol (symbol))
1151 scm_i_input_error ("scm_read_keyword", port,
1152 "keyword prefix `~a' not followed by a symbol: ~s",
1153 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1154
1155 return (scm_symbol_to_keyword (symbol));
1156 }
1157
1158 static SCM
1159 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1160 long line, int column)
1161 {
1162 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1163 guarantee that it's going to do what we want. After all, this is an
1164 implementation detail of `scm_read_vector ()', not a desirable
1165 property. */
1166 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1167 port, opts, line, column);
1168 }
1169
1170 /* Helper used by scm_read_array */
1171 static int
1172 read_decimal_integer (SCM port, int c, ssize_t *resp)
1173 {
1174 ssize_t sign = 1;
1175 ssize_t res = 0;
1176 int got_it = 0;
1177
1178 if (c == '-')
1179 {
1180 sign = -1;
1181 c = scm_getc (port);
1182 }
1183
1184 while ('0' <= c && c <= '9')
1185 {
1186 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1187 scm_i_input_error ("read_decimal_integer", port,
1188 "number too large", SCM_EOL);
1189 res = 10*res + c-'0';
1190 got_it = 1;
1191 c = scm_getc (port);
1192 }
1193
1194 if (got_it)
1195 *resp = sign * res;
1196 return c;
1197 }
1198
1199 /* Read an array. This function can also read vectors and uniform
1200 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1201 handled here.
1202
1203 C is the first character read after the '#'. */
1204 static SCM
1205 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1206 {
1207 ssize_t rank;
1208 scm_t_wchar tag_buf[8];
1209 int tag_len;
1210
1211 SCM tag, shape = SCM_BOOL_F, elements, array;
1212
1213 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1214 the array code can not deal with zero-length dimensions yet, and
1215 we want to allow zero-length vectors, of course. */
1216 if (c == '(')
1217 return scm_read_vector (c, port, opts, line, column);
1218
1219 /* Disambiguate between '#f' and uniform floating point vectors. */
1220 if (c == 'f')
1221 {
1222 c = scm_getc (port);
1223 if (c != '3' && c != '6')
1224 {
1225 if (c == 'a' && try_read_ci_chars (port, "lse"))
1226 return SCM_BOOL_F;
1227 else if (c != EOF)
1228 scm_ungetc (c, port);
1229 return SCM_BOOL_F;
1230 }
1231 rank = 1;
1232 tag_buf[0] = 'f';
1233 tag_len = 1;
1234 goto continue_reading_tag;
1235 }
1236
1237 /* Read rank. */
1238 rank = 1;
1239 c = read_decimal_integer (port, c, &rank);
1240 if (rank < 0)
1241 scm_i_input_error (NULL, port, "array rank must be non-negative",
1242 SCM_EOL);
1243
1244 /* Read tag. */
1245 tag_len = 0;
1246 continue_reading_tag:
1247 while (c != EOF && c != '(' && c != '@' && c != ':'
1248 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1249 {
1250 tag_buf[tag_len++] = c;
1251 c = scm_getc (port);
1252 }
1253 if (tag_len == 0)
1254 tag = SCM_BOOL_T;
1255 else
1256 {
1257 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1258 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1259 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1260 scm_list_1 (tag));
1261 }
1262
1263 /* Read shape. */
1264 if (c == '@' || c == ':')
1265 {
1266 shape = SCM_EOL;
1267
1268 do
1269 {
1270 ssize_t lbnd = 0, len = 0;
1271 SCM s;
1272
1273 if (c == '@')
1274 {
1275 c = scm_getc (port);
1276 c = read_decimal_integer (port, c, &lbnd);
1277 }
1278
1279 s = scm_from_ssize_t (lbnd);
1280
1281 if (c == ':')
1282 {
1283 c = scm_getc (port);
1284 c = read_decimal_integer (port, c, &len);
1285 if (len < 0)
1286 scm_i_input_error (NULL, port,
1287 "array length must be non-negative",
1288 SCM_EOL);
1289
1290 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1291 }
1292
1293 shape = scm_cons (s, shape);
1294 } while (c == '@' || c == ':');
1295
1296 shape = scm_reverse_x (shape, SCM_EOL);
1297 }
1298
1299 /* Read nested lists of elements. */
1300 if (c != '(')
1301 scm_i_input_error (NULL, port,
1302 "missing '(' in vector or array literal",
1303 SCM_EOL);
1304 elements = scm_read_sexp (c, port, opts);
1305
1306 if (scm_is_false (shape))
1307 shape = scm_from_ssize_t (rank);
1308 else if (scm_ilength (shape) != rank)
1309 scm_i_input_error
1310 (NULL, port,
1311 "the number of shape specifications must match the array rank",
1312 SCM_EOL);
1313
1314 /* Handle special print syntax of rank zero arrays; see
1315 scm_i_print_array for a rationale. */
1316 if (rank == 0)
1317 {
1318 if (!scm_is_pair (elements))
1319 scm_i_input_error (NULL, port,
1320 "too few elements in array literal, need 1",
1321 SCM_EOL);
1322 if (!scm_is_null (SCM_CDR (elements)))
1323 scm_i_input_error (NULL, port,
1324 "too many elements in array literal, want 1",
1325 SCM_EOL);
1326 elements = SCM_CAR (elements);
1327 }
1328
1329 /* Construct array, annotate with source location, and return. */
1330 array = scm_list_to_typed_array (tag, shape, elements);
1331 return maybe_annotate_source (array, port, opts, line, column);
1332 }
1333
1334 static SCM
1335 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1336 long line, int column)
1337 {
1338 return scm_read_array (chr, port, opts, line, column);
1339 }
1340
1341 static SCM
1342 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1343 long line, int column)
1344 {
1345 chr = scm_getc (port);
1346 if (chr != 'u')
1347 goto syntax;
1348
1349 chr = scm_getc (port);
1350 if (chr != '8')
1351 goto syntax;
1352
1353 chr = scm_getc (port);
1354 if (chr != '(')
1355 goto syntax;
1356
1357 return maybe_annotate_source
1358 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1359 port, opts, line, column);
1360
1361 syntax:
1362 scm_i_input_error ("read_bytevector", port,
1363 "invalid bytevector prefix",
1364 SCM_MAKE_CHAR (chr));
1365 return SCM_UNSPECIFIED;
1366 }
1367
1368 static SCM
1369 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1370 long line, int column)
1371 {
1372 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1373 terribly inefficient but who cares? */
1374 SCM s_bits = SCM_EOL;
1375
1376 for (chr = scm_getc (port);
1377 (chr != EOF) && ((chr == '0') || (chr == '1'));
1378 chr = scm_getc (port))
1379 {
1380 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1381 }
1382
1383 if (chr != EOF)
1384 scm_ungetc (chr, port);
1385
1386 return maybe_annotate_source
1387 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1388 port, opts, line, column);
1389 }
1390
1391 static SCM
1392 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1393 {
1394 int bang_seen = 0;
1395
1396 for (;;)
1397 {
1398 int c = scm_getc (port);
1399
1400 if (c == EOF)
1401 scm_i_input_error ("skip_block_comment", port,
1402 "unterminated `#! ... !#' comment", SCM_EOL);
1403
1404 if (c == '!')
1405 bang_seen = 1;
1406 else if (c == '#' && bang_seen)
1407 break;
1408 else
1409 bang_seen = 0;
1410 }
1411
1412 return SCM_UNSPECIFIED;
1413 }
1414
1415 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1416 int value);
1417 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1418 int value);
1419 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1420 int value);
1421
1422 static SCM
1423 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1424 {
1425 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1426 int c;
1427 int i = 0;
1428
1429 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1430 {
1431 c = scm_getc (port);
1432 if (c == EOF)
1433 scm_i_input_error ("skip_block_comment", port,
1434 "unterminated `#! ... !#' comment", SCM_EOL);
1435 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1436 name[i++] = c;
1437 else if (CHAR_IS_DELIMITER (c))
1438 {
1439 scm_ungetc (c, port);
1440 name[i] = '\0';
1441 if (0 == strcmp ("r6rs", name))
1442 ; /* Silently ignore */
1443 else if (0 == strcmp ("fold-case", name))
1444 set_port_case_insensitive_p (port, opts, 1);
1445 else if (0 == strcmp ("no-fold-case", name))
1446 set_port_case_insensitive_p (port, opts, 0);
1447 else if (0 == strcmp ("curly-infix", name))
1448 set_port_curly_infix_p (port, opts, 1);
1449 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1450 {
1451 set_port_curly_infix_p (port, opts, 1);
1452 set_port_square_brackets_p (port, opts, 0);
1453 }
1454 else
1455 break;
1456
1457 return SCM_UNSPECIFIED;
1458 }
1459 else
1460 {
1461 scm_ungetc (c, port);
1462 break;
1463 }
1464 }
1465 while (i > 0)
1466 scm_ungetc (name[--i], port);
1467 return scm_read_scsh_block_comment (chr, port);
1468 }
1469
1470 static SCM
1471 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1472 {
1473 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1474 nested. So care must be taken. */
1475 int nesting_level = 1;
1476
1477 int a = scm_getc (port);
1478
1479 if (a == EOF)
1480 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1481 "unterminated `#| ... |#' comment", SCM_EOL);
1482
1483 while (nesting_level > 0)
1484 {
1485 int b = scm_getc (port);
1486
1487 if (b == EOF)
1488 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1489 "unterminated `#| ... |#' comment", SCM_EOL);
1490
1491 if (a == '|' && b == '#')
1492 {
1493 nesting_level--;
1494 b = EOF;
1495 }
1496 else if (a == '#' && b == '|')
1497 {
1498 nesting_level++;
1499 b = EOF;
1500 }
1501
1502 a = b;
1503 }
1504
1505 return SCM_UNSPECIFIED;
1506 }
1507
1508 static SCM
1509 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1510 scm_t_read_opts *opts)
1511 {
1512 scm_t_wchar c;
1513
1514 c = flush_ws (port, opts, (char *) NULL);
1515 if (EOF == c)
1516 scm_i_input_error ("read_commented_expression", port,
1517 "no expression after #; comment", SCM_EOL);
1518 scm_ungetc (c, port);
1519 scm_read_expression (port, opts);
1520 return SCM_UNSPECIFIED;
1521 }
1522
1523 static SCM
1524 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1525 {
1526 /* Guile's extended symbol read syntax looks like this:
1527
1528 #{This is all a symbol name}#
1529
1530 So here, CHR is expected to be `{'. */
1531 int saw_brace = 0;
1532 size_t len = 0;
1533 SCM buf = scm_i_make_string (1024, NULL, 0);
1534
1535 buf = scm_i_string_start_writing (buf);
1536
1537 while ((chr = scm_getc (port)) != EOF)
1538 {
1539 if (saw_brace)
1540 {
1541 if (chr == '#')
1542 {
1543 break;
1544 }
1545 else
1546 {
1547 saw_brace = 0;
1548 scm_i_string_set_x (buf, len++, '}');
1549 }
1550 }
1551
1552 if (chr == '}')
1553 saw_brace = 1;
1554 else if (chr == '\\')
1555 {
1556 /* It used to be that print.c would print extended-read-syntax
1557 symbols with backslashes before "non-standard" chars, but
1558 this routine wouldn't do anything with those escapes.
1559 Bummer. What we've done is to change print.c to output
1560 R6RS hex escapes for those characters, relying on the fact
1561 that the extended read syntax would never put a `\' before
1562 an `x'. For now, we just ignore other instances of
1563 backslash in the string. */
1564 switch ((chr = scm_getc (port)))
1565 {
1566 case EOF:
1567 goto done;
1568 case 'x':
1569 {
1570 scm_t_wchar c;
1571
1572 SCM_READ_HEX_ESCAPE (10, ';');
1573 scm_i_string_set_x (buf, len++, c);
1574 break;
1575
1576 str_eof:
1577 chr = EOF;
1578 goto done;
1579
1580 bad_escaped:
1581 scm_i_string_stop_writing ();
1582 scm_i_input_error ("scm_read_extended_symbol", port,
1583 "illegal character in escape sequence: ~S",
1584 scm_list_1 (SCM_MAKE_CHAR (c)));
1585 break;
1586 }
1587 default:
1588 scm_i_string_set_x (buf, len++, chr);
1589 break;
1590 }
1591 }
1592 else
1593 scm_i_string_set_x (buf, len++, chr);
1594
1595 if (len >= scm_i_string_length (buf) - 2)
1596 {
1597 SCM addy;
1598
1599 scm_i_string_stop_writing ();
1600 addy = scm_i_make_string (1024, NULL, 0);
1601 buf = scm_string_append (scm_list_2 (buf, addy));
1602 len = 0;
1603 buf = scm_i_string_start_writing (buf);
1604 }
1605 }
1606
1607 done:
1608 scm_i_string_stop_writing ();
1609 if (chr == EOF)
1610 scm_i_input_error ("scm_read_extended_symbol", port,
1611 "end of file while reading symbol", SCM_EOL);
1612
1613 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1614 }
1615
1616
1617 \f
1618 /* Top-level token readers, i.e., dispatchers. */
1619
1620 static SCM
1621 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1622 {
1623 SCM proc;
1624
1625 proc = scm_get_hash_procedure (chr);
1626 if (scm_is_true (scm_procedure_p (proc)))
1627 {
1628 long line = SCM_LINUM (port);
1629 int column = SCM_COL (port) - 2;
1630 SCM got;
1631
1632 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1633
1634 if (opts->record_positions_p && SCM_NIMP (got)
1635 && !scm_i_has_source_properties (got))
1636 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1637
1638 return got;
1639 }
1640
1641 return SCM_UNSPECIFIED;
1642 }
1643
1644 /* The reader for the sharp `#' character. It basically dispatches reads
1645 among the above token readers. */
1646 static SCM
1647 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1648 long line, int column)
1649 #define FUNC_NAME "scm_lreadr"
1650 {
1651 SCM result;
1652
1653 chr = scm_getc (port);
1654
1655 result = scm_read_sharp_extension (chr, port, opts);
1656 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1657 return result;
1658
1659 switch (chr)
1660 {
1661 case '\\':
1662 return (scm_read_character (chr, port, opts));
1663 case '(':
1664 return (scm_read_vector (chr, port, opts, line, column));
1665 case 's':
1666 case 'u':
1667 case 'f':
1668 case 'c':
1669 /* This one may return either a boolean or an SRFI-4 vector. */
1670 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1671 case 'v':
1672 return (scm_read_bytevector (chr, port, opts, line, column));
1673 case '*':
1674 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1675 case 't':
1676 case 'T':
1677 case 'F':
1678 return (scm_read_boolean (chr, port));
1679 case ':':
1680 return (scm_read_keyword (chr, port, opts));
1681 case '0': case '1': case '2': case '3': case '4':
1682 case '5': case '6': case '7': case '8': case '9':
1683 case '@':
1684 #if SCM_ENABLE_DEPRECATED
1685 /* See below for 'i' and 'e'. */
1686 case 'a':
1687 case 'y':
1688 case 'h':
1689 case 'l':
1690 #endif
1691 return (scm_read_array (chr, port, opts, line, column));
1692
1693 case 'i':
1694 case 'e':
1695 #if SCM_ENABLE_DEPRECATED
1696 {
1697 /* When next char is '(', it really is an old-style
1698 uniform array. */
1699 scm_t_wchar next_c = scm_getc (port);
1700 if (next_c != EOF)
1701 scm_ungetc (next_c, port);
1702 if (next_c == '(')
1703 return scm_read_array (chr, port, opts, line, column);
1704 /* Fall through. */
1705 }
1706 #endif
1707 case 'b':
1708 case 'B':
1709 case 'o':
1710 case 'O':
1711 case 'd':
1712 case 'D':
1713 case 'x':
1714 case 'X':
1715 case 'I':
1716 case 'E':
1717 return (scm_read_number_and_radix (chr, port, opts));
1718 case '{':
1719 return (scm_read_extended_symbol (chr, port));
1720 case '!':
1721 return (scm_read_shebang (chr, port, opts));
1722 case ';':
1723 return (scm_read_commented_expression (chr, port, opts));
1724 case '`':
1725 case '\'':
1726 case ',':
1727 return (scm_read_syntax (chr, port, opts));
1728 case 'n':
1729 return (scm_read_nil (chr, port, opts));
1730 default:
1731 result = scm_read_sharp_extension (chr, port, opts);
1732 if (scm_is_eq (result, SCM_UNSPECIFIED))
1733 {
1734 /* To remain compatible with 1.8 and earlier, the following
1735 characters have lower precedence than `read-hash-extend'
1736 characters. */
1737 switch (chr)
1738 {
1739 case '|':
1740 return scm_read_r6rs_block_comment (chr, port);
1741 default:
1742 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1743 scm_list_1 (SCM_MAKE_CHAR (chr)));
1744 }
1745 }
1746 else
1747 return result;
1748 }
1749
1750 return SCM_UNSPECIFIED;
1751 }
1752 #undef FUNC_NAME
1753
1754 static SCM
1755 read_inner_expression (SCM port, scm_t_read_opts *opts)
1756 #define FUNC_NAME "read_inner_expression"
1757 {
1758 while (1)
1759 {
1760 scm_t_wchar chr;
1761
1762 chr = scm_getc (port);
1763
1764 switch (chr)
1765 {
1766 case SCM_WHITE_SPACES:
1767 case SCM_LINE_INCREMENTORS:
1768 break;
1769 case ';':
1770 (void) scm_read_semicolon_comment (chr, port);
1771 break;
1772 case '{':
1773 if (opts->curly_infix_p)
1774 {
1775 if (opts->neoteric_p)
1776 return scm_read_sexp (chr, port, opts);
1777 else
1778 {
1779 SCM expr;
1780
1781 /* Enable neoteric expressions within curly braces */
1782 opts->neoteric_p = 1;
1783 expr = scm_read_sexp (chr, port, opts);
1784 opts->neoteric_p = 0;
1785 return expr;
1786 }
1787 }
1788 else
1789 return scm_read_mixed_case_symbol (chr, port, opts);
1790 case '[':
1791 if (opts->square_brackets_p)
1792 return scm_read_sexp (chr, port, opts);
1793 else if (opts->curly_infix_p)
1794 {
1795 /* The syntax of neoteric expressions requires that '[' be
1796 a delimiter when curly-infix is enabled, so it cannot
1797 be part of an unescaped symbol. We might as well do
1798 something useful with it, so we adopt Kawa's convention:
1799 [...] => ($bracket-list$ ...) */
1800 long line = SCM_LINUM (port);
1801 int column = SCM_COL (port) - 1;
1802 return maybe_annotate_source
1803 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1804 port, opts, line, column);
1805 }
1806 else
1807 return scm_read_mixed_case_symbol (chr, port, opts);
1808 case '(':
1809 return (scm_read_sexp (chr, port, opts));
1810 case '"':
1811 return (scm_read_string (chr, port, opts));
1812 case '|':
1813 if (opts->r7rs_symbols_p)
1814 return scm_read_r7rs_symbol (chr, port, opts);
1815 else
1816 return scm_read_mixed_case_symbol (chr, port, opts);
1817 case '\'':
1818 case '`':
1819 case ',':
1820 return (scm_read_quote (chr, port, opts));
1821 case '#':
1822 {
1823 long line = SCM_LINUM (port);
1824 int column = SCM_COL (port) - 1;
1825 SCM result = scm_read_sharp (chr, port, opts, line, column);
1826 if (scm_is_eq (result, SCM_UNSPECIFIED))
1827 /* We read a comment or some such. */
1828 break;
1829 else
1830 return result;
1831 }
1832 case ')':
1833 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1834 break;
1835 case '}':
1836 if (opts->curly_infix_p)
1837 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1838 else
1839 return scm_read_mixed_case_symbol (chr, port, opts);
1840 case ']':
1841 if (opts->square_brackets_p)
1842 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1843 /* otherwise fall through */
1844 case EOF:
1845 return SCM_EOF_VAL;
1846 case ':':
1847 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1848 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1849 /* Fall through. */
1850
1851 default:
1852 {
1853 if (((chr >= '0') && (chr <= '9'))
1854 || (strchr ("+-.", chr)))
1855 return (scm_read_number (chr, port, opts));
1856 else
1857 return (scm_read_mixed_case_symbol (chr, port, opts));
1858 }
1859 }
1860 }
1861 }
1862 #undef FUNC_NAME
1863
1864 static SCM
1865 scm_read_expression (SCM port, scm_t_read_opts *opts)
1866 #define FUNC_NAME "scm_read_expression"
1867 {
1868 if (!opts->neoteric_p)
1869 return read_inner_expression (port, opts);
1870 else
1871 {
1872 long line = 0;
1873 int column = 0;
1874 SCM expr;
1875
1876 if (opts->record_positions_p)
1877 {
1878 /* We need to get the position of the first non-whitespace
1879 character in order to correctly annotate neoteric
1880 expressions. For example, for the expression 'f(x)', the
1881 first call to 'read_inner_expression' reads the 'f' (which
1882 cannot be annotated), and then we later read the '(x)' and
1883 use it to construct the new list (f x). */
1884 int c = flush_ws (port, opts, (char *) NULL);
1885 if (c == EOF)
1886 return SCM_EOF_VAL;
1887 scm_ungetc (c, port);
1888 line = SCM_LINUM (port);
1889 column = SCM_COL (port);
1890 }
1891
1892 expr = read_inner_expression (port, opts);
1893
1894 /* 'expr' is the first component of the neoteric expression. Now
1895 we loop, and as long as the next character is '(', '[', or '{',
1896 (without any intervening whitespace), we use it to construct a
1897 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1898 for (;;)
1899 {
1900 int chr = scm_getc (port);
1901
1902 if (chr == '(')
1903 /* e(...) => (e ...) */
1904 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1905 else if (chr == '[')
1906 /* e[...] => ($bracket-apply$ e ...) */
1907 expr = scm_cons (sym_bracket_apply,
1908 scm_cons (expr,
1909 scm_read_sexp (chr, port, opts)));
1910 else if (chr == '{')
1911 {
1912 SCM arg = scm_read_sexp (chr, port, opts);
1913
1914 if (scm_is_null (arg))
1915 expr = scm_list_1 (expr); /* e{} => (e) */
1916 else
1917 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1918 }
1919 else
1920 {
1921 if (chr != EOF)
1922 scm_ungetc (chr, port);
1923 break;
1924 }
1925 maybe_annotate_source (expr, port, opts, line, column);
1926 }
1927 return expr;
1928 }
1929 }
1930 #undef FUNC_NAME
1931
1932 \f
1933 /* Actual reader. */
1934
1935 static void init_read_options (SCM port, scm_t_read_opts *opts);
1936
1937 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1938 (SCM port),
1939 "Read an s-expression from the input port @var{port}, or from\n"
1940 "the current input port if @var{port} is not specified.\n"
1941 "Any whitespace before the next token is discarded.")
1942 #define FUNC_NAME s_scm_read
1943 {
1944 scm_t_read_opts opts;
1945 int c;
1946
1947 if (SCM_UNBNDP (port))
1948 port = scm_current_input_port ();
1949 SCM_VALIDATE_OPINPORT (1, port);
1950
1951 init_read_options (port, &opts);
1952
1953 c = flush_ws (port, &opts, (char *) NULL);
1954 if (EOF == c)
1955 return SCM_EOF_VAL;
1956 scm_ungetc (c, port);
1957
1958 return (scm_read_expression (port, &opts));
1959 }
1960 #undef FUNC_NAME
1961
1962
1963 \f
1964
1965 /* Manipulate the read-hash-procedures alist. This could be written in
1966 Scheme, but maybe it will also be used by C code during initialisation. */
1967 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1968 (SCM chr, SCM proc),
1969 "Install the procedure @var{proc} for reading expressions\n"
1970 "starting with the character sequence @code{#} and @var{chr}.\n"
1971 "@var{proc} will be called with two arguments: the character\n"
1972 "@var{chr} and the port to read further data from. The object\n"
1973 "returned will be the return value of @code{read}. \n"
1974 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1975 )
1976 #define FUNC_NAME s_scm_read_hash_extend
1977 {
1978 SCM this;
1979 SCM prev;
1980
1981 SCM_VALIDATE_CHAR (1, chr);
1982 SCM_ASSERT (scm_is_false (proc)
1983 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1984 proc, SCM_ARG2, FUNC_NAME);
1985
1986 /* Check if chr is already in the alist. */
1987 this = scm_i_read_hash_procedures_ref ();
1988 prev = SCM_BOOL_F;
1989 while (1)
1990 {
1991 if (scm_is_null (this))
1992 {
1993 /* not found, so add it to the beginning. */
1994 if (scm_is_true (proc))
1995 {
1996 SCM new = scm_cons (scm_cons (chr, proc),
1997 scm_i_read_hash_procedures_ref ());
1998 scm_i_read_hash_procedures_set_x (new);
1999 }
2000 break;
2001 }
2002 if (scm_is_eq (chr, SCM_CAAR (this)))
2003 {
2004 /* already in the alist. */
2005 if (scm_is_false (proc))
2006 {
2007 /* remove it. */
2008 if (scm_is_false (prev))
2009 {
2010 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
2011 scm_i_read_hash_procedures_set_x (rest);
2012 }
2013 else
2014 scm_set_cdr_x (prev, SCM_CDR (this));
2015 }
2016 else
2017 {
2018 /* replace it. */
2019 scm_set_cdr_x (SCM_CAR (this), proc);
2020 }
2021 break;
2022 }
2023 prev = this;
2024 this = SCM_CDR (this);
2025 }
2026
2027 return SCM_UNSPECIFIED;
2028 }
2029 #undef FUNC_NAME
2030
2031 /* Recover the read-hash procedure corresponding to char c. */
2032 static SCM
2033 scm_get_hash_procedure (int c)
2034 {
2035 SCM rest = scm_i_read_hash_procedures_ref ();
2036
2037 while (1)
2038 {
2039 if (scm_is_null (rest))
2040 return SCM_BOOL_F;
2041
2042 if (SCM_CHAR (SCM_CAAR (rest)) == c)
2043 return SCM_CDAR (rest);
2044
2045 rest = SCM_CDR (rest);
2046 }
2047 }
2048
2049 /* Maximum size of an encoding name. This is a bit more than the
2050 longest name listed at
2051 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2052 characters.) */
2053 #define ENCODING_NAME_MAX_SIZE 20
2054
2055 /* Number of bytes at the beginning or end of a file that are scanned
2056 for a "coding:" declaration. */
2057 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2058
2059
2060 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2061 coding declaration. Returns either NULL or a string whose storage
2062 has been allocated with `scm_gc_malloc'. */
2063 char *
2064 scm_i_scan_for_encoding (SCM port)
2065 {
2066 scm_t_port *pt;
2067 char header[SCM_ENCODING_SEARCH_SIZE+1];
2068 size_t bytes_read, encoding_length, i;
2069 char *encoding = NULL;
2070 char *pos, *encoding_start;
2071 int in_comment;
2072
2073 pt = SCM_PTAB_ENTRY (port);
2074
2075 if (pt->rw_active == SCM_PORT_WRITE)
2076 scm_flush (port);
2077
2078 if (pt->rw_random)
2079 pt->rw_active = SCM_PORT_READ;
2080
2081 if (pt->read_pos == pt->read_end)
2082 {
2083 /* We can use the read buffer, and thus avoid a seek. */
2084 if (scm_fill_input (port) == EOF)
2085 return NULL;
2086
2087 bytes_read = pt->read_end - pt->read_pos;
2088 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2089 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2090
2091 if (bytes_read <= 1)
2092 /* An unbuffered port -- don't scan. */
2093 return NULL;
2094
2095 memcpy (header, pt->read_pos, bytes_read);
2096 header[bytes_read] = '\0';
2097 }
2098 else
2099 {
2100 /* Try to read some bytes and then seek back. Not all ports
2101 support seeking back; and indeed some file ports (like
2102 /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
2103 check performed by SCM_FPORT_FDES---but fail to seek
2104 backwards. Hence this block comes second. We prefer to use
2105 the read buffer in-place. */
2106 if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
2107 return NULL;
2108
2109 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2110 header[bytes_read] = '\0';
2111 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2112 }
2113
2114 /* search past "coding[:=]" */
2115 pos = header;
2116 while (1)
2117 {
2118 if ((pos = strstr(pos, "coding")) == NULL)
2119 return NULL;
2120
2121 pos += strlen ("coding");
2122 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2123 (*pos == ':' || *pos == '='))
2124 {
2125 pos ++;
2126 break;
2127 }
2128 }
2129
2130 /* skip spaces */
2131 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2132 (*pos == ' ' || *pos == '\t'))
2133 pos ++;
2134
2135 if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2136 /* We found the "coding:" string, but there is probably not enough
2137 room to store an encoding name in its entirety, so ignore it.
2138 This makes sure we do not end up returning a truncated encoding
2139 name. */
2140 return NULL;
2141
2142 /* grab the next token */
2143 encoding_start = pos;
2144 i = 0;
2145 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2146 && encoding_start + i - header < bytes_read
2147 && (isalnum ((int) encoding_start[i])
2148 || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
2149 i++;
2150
2151 encoding_length = i;
2152 if (encoding_length == 0)
2153 return NULL;
2154
2155 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2156 for (i = 0; i < encoding_length; i++)
2157 encoding[i] = toupper ((int) encoding[i]);
2158
2159 /* push backwards to make sure we were in a comment */
2160 in_comment = 0;
2161 pos = encoding_start;
2162 while (pos >= header)
2163 {
2164 if (*pos == ';')
2165 {
2166 in_comment = 1;
2167 break;
2168 }
2169 else if (*pos == '\n' || pos == header)
2170 {
2171 /* This wasn't in a semicolon comment. Check for a
2172 hash-bang comment. */
2173 char *beg = strstr (header, "#!");
2174 char *end = strstr (header, "!#");
2175 if (beg < encoding_start && encoding_start + encoding_length <= end)
2176 in_comment = 1;
2177 break;
2178 }
2179 else
2180 {
2181 pos --;
2182 continue;
2183 }
2184 }
2185 if (!in_comment)
2186 /* This wasn't in a comment */
2187 return NULL;
2188
2189 return encoding;
2190 }
2191
2192 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2193 (SCM port),
2194 "Scans the port for an Emacs-like character coding declaration\n"
2195 "near the top of the contents of a port with random-accessible contents.\n"
2196 "The coding declaration is of the form\n"
2197 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2198 "\n"
2199 "Returns a string containing the character encoding of the file\n"
2200 "if a declaration was found, or @code{#f} otherwise.\n")
2201 #define FUNC_NAME s_scm_file_encoding
2202 {
2203 char *enc;
2204 SCM s_enc;
2205
2206 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2207
2208 enc = scm_i_scan_for_encoding (port);
2209 if (enc == NULL)
2210 return SCM_BOOL_F;
2211 else
2212 {
2213 s_enc = scm_from_locale_string (enc);
2214 return s_enc;
2215 }
2216
2217 return SCM_BOOL_F;
2218 }
2219 #undef FUNC_NAME
2220
2221 \f
2222 /* Per-port read options.
2223
2224 We store per-port read options in the 'port-read-options' port
2225 property, which is stored in the internal port structure. The value
2226 stored is a single integer that contains a two-bit field for each
2227 read option.
2228
2229 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2230 the applicable value should be inherited from the corresponding
2231 global read option. Otherwise, the bit field contains the value of
2232 the read option. For boolean read options that have been set
2233 per-port, the possible values are 0 or 1. If the 'keyword_style'
2234 read option has been set per-port, its possible values are those in
2235 'enum t_keyword_style'. */
2236
2237 /* Key to read options in port properties. */
2238 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2239
2240 /* Offsets of bit fields for each per-port override */
2241 #define READ_OPTION_COPY_SOURCE_P 0
2242 #define READ_OPTION_RECORD_POSITIONS_P 2
2243 #define READ_OPTION_CASE_INSENSITIVE_P 4
2244 #define READ_OPTION_KEYWORD_STYLE 6
2245 #define READ_OPTION_R6RS_ESCAPES_P 8
2246 #define READ_OPTION_SQUARE_BRACKETS_P 10
2247 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2248 #define READ_OPTION_CURLY_INFIX_P 14
2249 #define READ_OPTION_R7RS_SYMBOLS_P 16
2250
2251 /* The total width in bits of the per-port overrides */
2252 #define READ_OPTIONS_NUM_BITS 18
2253
2254 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2255 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2256
2257 #define READ_OPTION_MASK 3
2258 #define READ_OPTION_INHERIT 3
2259
2260 static void
2261 set_port_read_option (SCM port, int option, int new_value)
2262 {
2263 SCM scm_read_options;
2264 unsigned int read_options;
2265
2266 new_value &= READ_OPTION_MASK;
2267 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2268 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2269 read_options = scm_to_uint (scm_read_options);
2270 else
2271 read_options = READ_OPTIONS_INHERIT_ALL;
2272 read_options &= ~(READ_OPTION_MASK << option);
2273 read_options |= new_value << option;
2274 scm_read_options = scm_from_uint (read_options);
2275 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2276 }
2277
2278 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2279 static void
2280 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2281 {
2282 value = !!value;
2283 opts->case_insensitive_p = value;
2284 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2285 }
2286
2287 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2288 static void
2289 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2290 {
2291 value = !!value;
2292 opts->square_brackets_p = value;
2293 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2294 }
2295
2296 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2297 static void
2298 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2299 {
2300 value = !!value;
2301 opts->curly_infix_p = value;
2302 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2303 }
2304
2305 /* Initialize OPTS based on PORT's read options and the global read
2306 options. */
2307 static void
2308 init_read_options (SCM port, scm_t_read_opts *opts)
2309 {
2310 SCM val, scm_read_options;
2311 unsigned int read_options, x;
2312
2313 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2314
2315 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2316 read_options = scm_to_uint (scm_read_options);
2317 else
2318 read_options = READ_OPTIONS_INHERIT_ALL;
2319
2320 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2321 if (x == READ_OPTION_INHERIT)
2322 {
2323 val = SCM_PACK (SCM_KEYWORD_STYLE);
2324 if (scm_is_eq (val, scm_keyword_prefix))
2325 x = KEYWORD_STYLE_PREFIX;
2326 else if (scm_is_eq (val, scm_keyword_postfix))
2327 x = KEYWORD_STYLE_POSTFIX;
2328 else
2329 x = KEYWORD_STYLE_HASH_PREFIX;
2330 }
2331 opts->keyword_style = x;
2332
2333 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2334 do \
2335 { \
2336 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2337 if (x == READ_OPTION_INHERIT) \
2338 x = !!SCM_ ## NAME; \
2339 opts->name = x; \
2340 } \
2341 while (0)
2342
2343 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2344 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2345 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2346 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2347 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2348 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2349 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
2350 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
2351
2352 #undef RESOLVE_BOOLEAN_OPTION
2353
2354 opts->neoteric_p = 0;
2355 }
2356
2357 void
2358 scm_init_read ()
2359 {
2360 SCM read_hash_procs;
2361
2362 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2363
2364 scm_i_read_hash_procedures =
2365 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2366
2367 scm_init_opts (scm_read_options, scm_read_opts);
2368 #include "libguile/read.x"
2369 }
2370
2371 /*
2372 Local Variables:
2373 c-file-style: "gnu"
2374 End:
2375 */